slake-school-service/app/Services/Crawl/Adapters/GenericNewsHtmlAdapter.php

<?php

namespace App\Services\Crawl\Adapters;

use App\Models\CrawlSource;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use App\Services\Crawl\HtmlCrawlSupport;
use App\Services\Crawl\HtmlPagination;
use App\Services\Crawl\NewsCategoryMatcher;
use App\Services\Crawl\NewsContentHtml;
use App\Services\Crawl\NewsHtmlImageLocalizer;
use Illuminate\Support\Str;

/**
 * Crawler adapter for generic, server-rendered HTML news sites.
 *
 * The request URL is first treated as a list page: article links are harvested
 * from it (and from any further pages HtmlPagination yields), then each article
 * is fetched to enrich its title, summary, body and publish date. When the
 * request URL looks like a single article rather than a list, that article is
 * returned on its own.
 */
class GenericNewsHtmlAdapter implements CrawlerAdapterInterface
{
    /**
     * Matches an <a> element: group 1 is the href value, group 2 the inner HTML.
     *
     * `<a\s` keeps <area>/<abbr>/<article> out, and the whitespace required in
     * front of `href` keeps attributes such as `data-href` from being picked up.
     */
    private const ANCHOR_PATTERN = '#<a\s(?:[^>]*?\s)?href\s*=\s*["\']([^"\']+)["\'][^>]*>(.*?)</a>#isu';

    /** Bytes of markup after a loose anchor that are searched for its date. */
    private const DATE_CONTEXT_BYTES = 200;

    public function __construct(
        protected NewsCategoryMatcher $categoryMatcher,
        protected NewsHtmlImageLocalizer $imageLocalizer,
    ) {}

    /**
     * Crawl $requestUrl and return the enriched articles that match the keywords.
     *
     * Recognised $params keys:
     *  - keyword:     raw keyword string, parsed by CrawlKeywordParser
     *  - max_results: number of items to return, clamped to 1..50 (default 30)
     *  - max_pages:   number of list pages to request, clamped to 1..50 (default 1)
     *
     * @return list<CrawlItemDto>
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        $maxResults = min(50, max(1, (int) ($params['max_results'] ?? 30)));
        $maxPages = min(50, max(1, (int) ($params['max_pages'] ?? 1)));

        $pageHost = HtmlCrawlSupport::hostKey($requestUrl);
        $items = [];
        $firstHtml = null;

        foreach (HtmlPagination::fetchPagesHtml($requestUrl, $maxPages) as $html) {
            // Remember the first page so the single-article check below does
            // not have to download it a second time.
            $firstHtml ??= $html;

            foreach ($this->parseListHtml($html, $requestUrl, $pageHost) as $item) {
                // Keyed by external id: the first occurrence across pages wins.
                $items[$item->externalId] ??= $item;
            }
        }
        $items = array_values($items);

        // With at most one link found, the URL may be an article, not a list.
        if (
            count($items) <= 1
            && is_string($firstHtml)
            && $firstHtml !== ''
            && $this->looksLikeArticlePage($firstHtml)
        ) {
            $single = $this->parseArticleFromHtml($firstHtml, $requestUrl);
            if ($single !== null) {
                $items = [$single];
            }
        }

        // Cheap pre-filter on list data, before any article page is fetched.
        $items = array_values(array_filter(
            $items,
            static fn (CrawlItemDto $item) => CrawlKeywordParser::matchesAny($item->title, $item->summary, $keywords)
        ));

        $enriched = [];
        foreach ($items as $item) {
            if (count($enriched) >= $maxResults) {
                break;
            }

            $articleUrl = $item->canonicalUrl ?? '';
            $detail = $this->fetchArticleDetail($articleUrl);

            // Prefer the article page's own title, then the link text.
            $title = HtmlCrawlSupport::cleanArticleTitle($detail['title'] ?? null)
                ?: HtmlCrawlSupport::cleanArticleTitle($item->title)
                ?: $item->title;
            if (HtmlCrawlSupport::isWeakLinkTitle($title)) {
                continue;
            }

            $contentHtml = $detail['content_html'] ?? $item->contentHtml;
            if ($contentHtml) {
                $contentHtml = $this->imageLocalizer->localize($contentHtml, $articleUrl);
            }
            $plainBody = $contentHtml ? trim(strip_tags($contentHtml)) : '';
            $bodyForMatch = $plainBody !== '' ? $plainBody : null;

            // Second keyword pass, now against the final title and full body.
            if (! CrawlKeywordParser::matchesAny($title, $bodyForMatch, $keywords)) {
                continue;
            }

            $categoryId = $this->categoryMatcher->resolveCategoryId($title, $bodyForMatch, $keywords);

            $enriched[] = new CrawlItemDto(
                externalId: $item->externalId,
                title: $title,
                canonicalUrl: $item->canonicalUrl,
                summary: $detail['summary'] ?? $item->summary,
                publishedAt: $detail['published_at'] ?? $item->publishedAt,
                contentHtml: $contentHtml,
                extra: [
                    'platform' => 'generic_html',
                    'keywords' => $keywords,
                    'category_dict_item_id' => $categoryId,
                    'category_label' => $this->categoryMatcher->labelForId($categoryId),
                ],
            );
        }

        return $enriched;
    }

    /**
     * Collect candidate article links from one list page, de-duplicated by
     * external id and kept in document order.
     *
     * Two passes are made: first one link per <li> (so the date can be taken
     * from that list item), then every remaining anchor on the page.
     *
     * @return list<CrawlItemDto>
     */
    protected function parseListHtml(string $html, string $baseUrl, ?string $pageHost): array
    {
        $items = [];

        // `\b` stops the pattern from also matching <link ...> in the head.
        if (preg_match_all('#<li\b[^>]*>(.*?)</li>#isu', $html, $blocks, PREG_SET_ORDER)) {
            foreach ($blocks as $block) {
                $item = $this->extractLinkFromFragment($block[1], $baseUrl, $pageHost);
                if ($item !== null) {
                    $items[$item->externalId] ??= $item;
                }
            }
        }

        if (preg_match_all(self::ANCHOR_PATTERN, $html, $anchors, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
            foreach ($anchors as $anchor) {
                [$tag, $offset] = $anchor[0];

                // Look for the date in the anchor and the markup right after it
                // instead of the whole page, which would give every link the
                // first date that appears anywhere in the document. mb_strcut
                // takes byte offsets but never splits a UTF-8 character.
                $context = mb_strcut($html, $offset, strlen($tag) + self::DATE_CONTEXT_BYTES, 'UTF-8');

                $item = $this->buildListItem($anchor[1][0], $anchor[2][0], $context, $baseUrl, $pageHost);
                if ($item !== null) {
                    $items[$item->externalId] ??= $item;
                }
            }
        }

        return array_values($items);
    }

    /**
     * Build a list item from the first anchor inside $fragment, using the
     * fragment itself as the text searched for a publish date.
     */
    protected function extractLinkFromFragment(string $fragment, string $baseUrl, ?string $pageHost): ?CrawlItemDto
    {
        if (! preg_match(self::ANCHOR_PATTERN, $fragment, $m)) {
            return null;
        }

        return $this->buildListItem($m[1], $m[2], $fragment, $baseUrl, $pageHost);
    }

    /**
     * Turn one anchor into a list item, or null when it is not an article link.
     *
     * @param string  $href      raw href attribute value
     * @param string  $linkInner inner HTML of the anchor, used as the title
     * @param string  $context   markup searched for a publish date
     * @param string  $baseUrl   list page URL, used to resolve relative hrefs
     * @param ?string $pageHost  host key of the list page; when set, links to
     *                           other hosts are rejected
     */
    protected function buildListItem(
        string $href,
        string $linkInner,
        string $context,
        string $baseUrl,
        ?string $pageHost,
    ): ?CrawlItemDto {
        $title = $this->plainText($linkInner);
        if (HtmlCrawlSupport::isSkippableLinkTitle($title) || HtmlCrawlSupport::isWeakLinkTitle($title)) {
            return null;
        }

        $url = HtmlCrawlSupport::absoluteUrl($href, $baseUrl);
        if (! $url || HtmlCrawlSupport::isAssetPath($url)) {
            return null;
        }
        if ($pageHost && ! HtmlCrawlSupport::sameHost($pageHost, HtmlCrawlSupport::hostKey($url))) {
            return null;
        }
        if (! $this->looksLikeNewsArticleUrl($url, $baseUrl)) {
            return null;
        }

        return new CrawlItemDto(
            externalId: 'news:'.md5($url),
            title: $title,
            canonicalUrl: $url,
            publishedAt: HtmlCrawlSupport::extractDateFromText($context),
            extra: ['platform' => 'generic_html'],
        );
    }

    /**
     * Heuristic: does the path of $url look like an article rather than a
     * list, index or home page? A link back to $listUrl never qualifies.
     */
    protected function looksLikeNewsArticleUrl(string $url, string $listUrl): bool
    {
        if (rtrim($url, '/') === rtrim($listUrl, '/')) {
            return false;
        }

        $path = strtolower((string) parse_url($url, PHP_URL_PATH));
        if ($path === '' || $path === '/') {
            return false;
        }

        // An article-like directory segment, or a /yyyy/mm/dd/ style segment.
        $hasArticleSegment = preg_match('#/(news|article|content|detail|post|story|infor|view|show|archives?)/#i', $path) === 1;
        $hasDateSegment = preg_match('#/\d{4}[/\-]\d{1,2}[/\-]\d{1,2}/#', $path) === 1;
        if ($hasArticleSegment || $hasDateSegment) {
            return true;
        }

        // A page-like extension, unless the path names an index/list page.
        $hasPageExtension = preg_match('#\.(s?html?|php|aspx?)(\?|$)#i', $path) === 1;
        $isIndexLike = preg_match('#/(index|list|category|tag|search)(\.|/|$)#i', $path) === 1;
        if ($hasPageExtension && ! $isIndexLike) {
            return true;
        }

        // Last resort: a long slug ending in .html/.htm/.shtml.
        return preg_match('#/[a-z0-9\-]{8,}\.s?html?$#i', $path) === 1;
    }

    /**
     * True when the page has an <h1> of at least 8 characters and
     * NewsContentHtml can extract a body from it.
     */
    protected function looksLikeArticlePage(string $html): bool
    {
        $heading = $this->firstTagText($html, 'h1');

        return $heading !== null
            && Str::length($heading) >= 8
            && NewsContentHtml::extractBody($html) !== null;
    }

    /**
     * Build an item from an article page that was requested directly.
     * Returns null when no usable title can be found.
     */
    protected function parseArticleFromHtml(string $html, string $url): ?CrawlItemDto
    {
        $detail = $this->parseArticleDetailFromHtml($html);
        $title = HtmlCrawlSupport::cleanArticleTitle($detail['title'] ?? null);
        if ($title === null || $title === '') {
            return null;
        }

        return new CrawlItemDto(
            externalId: 'news:'.md5($url),
            title: $title,
            canonicalUrl: $url,
            summary: $detail['summary'],
            publishedAt: $detail['published_at'],
            contentHtml: $detail['content_html'],
            extra: ['platform' => 'generic_html'],
        );
    }

    /**
     * Download an article page and extract its details.
     *
     * Best effort by design: an empty URL or any failure while fetching yields
     * an all-null result so that one broken article does not abort the crawl.
     *
     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}
     */
    protected function fetchArticleDetail(string $url): array
    {
        $empty = [
            'title' => null,
            'summary' => null,
            'content_html' => null,
            'published_at' => null,
        ];

        if ($url === '') {
            return $empty;
        }

        try {
            $html = HtmlCrawlSupport::fetchHtml($url, 20);
        } catch (\Throwable) {
            // Deliberately swallowed; the caller falls back to list-page data.
            return $empty;
        }

        return $this->parseArticleDetailFromHtml($html);
    }

    /**
     * Extract title, summary, body and publish date from an article page.
     *
     * The title is the first non-empty value among og:title, twitter:title,
     * <h1> and <title>; the summary comes from the description meta tag.
     *
     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}
     */
    protected function parseArticleDetailFromHtml(string $html): array
    {
        $rawTitle = $this->metaContent($html, 'property', 'og:title')
            ?? $this->metaContent($html, 'name', 'twitter:title')
            ?? $this->firstTagText($html, 'h1')
            ?? $this->firstTagText($html, 'title');

        return [
            'title' => HtmlCrawlSupport::cleanArticleTitle($rawTitle),
            'summary' => $this->metaContent($html, 'name', 'description'),
            'content_html' => NewsContentHtml::extractBody($html),
            'published_at' => $this->extractPublishedAt($html),
        ];
    }

    /**
     * Find the publish date of an article page.
     *
     * Candidates are tried in priority order and the first one that
     * HtmlCrawlSupport::normalizeDate() accepts wins; if none does, the whole
     * page is handed to HtmlCrawlSupport::extractDateFromText().
     */
    protected function extractPublishedAt(string $html): ?string
    {
        foreach ($this->publishedAtCandidates($html) as $candidate) {
            $normalized = HtmlCrawlSupport::normalizeDate($candidate);
            if ($normalized) {
                return $normalized;
            }
        }

        return HtmlCrawlSupport::extractDateFromText($html);
    }

    /**
     * Lazily yield raw date strings found in the page, most trustworthy first:
     * publish-time meta tags, <time datetime>, <time> text, then date-like spans.
     *
     * @return iterable<string>
     */
    private function publishedAtCandidates(string $html): iterable
    {
        $metaSources = [
            ['property', 'article:published_time'],
            ['property', 'og:published_time'],
            ['name', 'publishdate|pubdate|date'],
        ];
        foreach ($metaSources as [$attribute, $names]) {
            $value = $this->metaContent($html, $attribute, $names);
            if ($value !== null) {
                yield $value;
            }
        }

        if (preg_match_all('#<time[^>]+datetime=["\']([^"\']+)["\'][^>]*>#i', $html, $m)) {
            foreach ($m[1] as $datetime) {
                yield $datetime;
            }
        }
        if (preg_match('#<time[^>]*>([^<]{6,40})</time>#i', $html, $m)) {
            yield $m[1];
        }
        if (preg_match('#<span[^>]*class=["\'][^"\']*(?:date|time|publish)[^"\']*["\'][^>]*>([^<]{6,40})</span>#i', $html, $m)) {
            yield $m[1];
        }
    }

    /**
     * Return the decoded, trimmed `content` of the first <meta> tag whose
     * $attribute equals one of $names, or null when there is none.
     *
     * Attribute order inside the tag does not matter, and the value is read up
     * to its own closing quote, so an apostrophe inside a double-quoted value
     * no longer truncates it.
     *
     * @param string $attribute attribute holding the tag's name ("name" or "property")
     * @param string $names     regex alternation of accepted names; internal
     *                          constants only, never user input
     */
    private function metaContent(string $html, string $attribute, string $names): ?string
    {
        if (! preg_match_all('#<meta\b[^>]*>#i', $html, $tags)) {
            return null;
        }

        $namePattern = '#\s'.$attribute.'\s*=\s*(["\'])(?:'.$names.')\1#i';
        foreach ($tags[0] as $tag) {
            if (! preg_match($namePattern, $tag)) {
                continue;
            }
            if (! preg_match('#\scontent\s*=\s*(["\'])(.*?)\1#is', $tag, $m)) {
                continue;
            }

            $value = trim(html_entity_decode($m[2], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
            if ($value !== '') {
                return $value;
            }
        }

        return null;
    }

    /**
     * Plain text of the first <$tag> element in $html, or null when the
     * element is missing or contains no text.
     */
    private function firstTagText(string $html, string $tag): ?string
    {
        if (! preg_match('#<'.$tag.'\b[^>]*>(.*?)</'.$tag.'>#is', $html, $m)) {
            return null;
        }

        $text = $this->plainText($m[1]);

        return $text !== '' ? $text : null;
    }

    /**
     * Reduce an HTML fragment to trimmed, single-spaced plain text.
     *
     * Tags are stripped before entities are decoded; the reverse order would
     * turn escaped text such as "1&lt;2 and 3&gt;2" into markup and delete it.
     */
    private function plainText(string $fragment): string
    {
        $text = html_entity_decode(strip_tags($fragment), ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // preg_replace() returns null on a PCRE error (e.g. invalid UTF-8);
        // fall back to the uncollapsed text instead of passing null to trim().
        return trim(preg_replace('/\s+/u', ' ', $text) ?? $text);
    }
}
更改 1 week ago			`<?php`

			`namespace App\Services\Crawl\Adapters;`

			`use App\Models\CrawlSource;`
			`use App\Services\Crawl\Contracts\CrawlerAdapterInterface;`
			`use App\Services\Crawl\CrawlItemDto;`
			`use App\Services\Crawl\CrawlKeywordParser;`
			`use App\Services\Crawl\HtmlCrawlSupport;`
			`use App\Services\Crawl\HtmlPagination;`
			`use App\Services\Crawl\NewsCategoryMatcher;`
			`use App\Services\Crawl\NewsContentHtml;`
			`use App\Services\Crawl\NewsHtmlImageLocalizer;`
			`use Illuminate\Support\Str;`

			`class GenericNewsHtmlAdapter implements CrawlerAdapterInterface`
			`{`
			`public function __construct(`
			`protected NewsCategoryMatcher $categoryMatcher,`
			`protected NewsHtmlImageLocalizer $imageLocalizer,`
			`) {}`

			`public function fetch(string $requestUrl, CrawlSource $source, array $params): array`
			`{`
			`$keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));`
			`$maxResults = min(50, max(1, (int) ($params['max_results'] ?? 30)));`
			`$maxPages = min(50, max(1, (int) ($params['max_pages'] ?? 1)));`

			`$pageHost = HtmlCrawlSupport::hostKey($requestUrl);`
			`$items = [];`
			`$seen = [];`

			`foreach (HtmlPagination::fetchPagesHtml($requestUrl, $maxPages) as $html) {`
			`foreach ($this->parseListHtml($html, $requestUrl, $pageHost) as $item) {`
			`if (isset($seen[$item->externalId])) {`
			`continue;`
			`}`
			`$seen[$item->externalId] = true;`
			`$items[] = $item;`
			`}`
			`}`

			`if (count($items) <= 1) {`
			`$firstHtml = HtmlPagination::fetchPagesHtml($requestUrl, 1)[0] ?? '';`
			`if ($firstHtml !== '' && $this->looksLikeArticlePage($firstHtml)) {`
			`$single = $this->parseArticleFromHtml($firstHtml, $requestUrl);`
			`if ($single !== null) {`
			`$items = [$single];`
			`}`
			`}`
			`}`

			`$items = array_values(array_filter(`
			`$items,`
			`fn (CrawlItemDto $item) => CrawlKeywordParser::matchesAny($item->title, $item->summary, $keywords)`
			`));`

			`$enriched = [];`
			`foreach ($items as $item) {`
			`if (count($enriched) >= $maxResults) {`
			`break;`
			`}`
			`$detail = $this->fetchArticleDetail($item->canonicalUrl ?? '');`
			`$title = HtmlCrawlSupport::cleanArticleTitle($detail['title'] ?? null)`
			`?: HtmlCrawlSupport::cleanArticleTitle($item->title)`
			`?: $item->title;`
			`if (HtmlCrawlSupport::isWeakLinkTitle($title)) {`
			`continue;`
			`}`

			`$articleUrl = $item->canonicalUrl ?? '';`
			`$contentHtml = $detail['content_html'] ?? $item->contentHtml;`
			`if ($contentHtml) {`
			`$contentHtml = $this->imageLocalizer->localize($contentHtml, $articleUrl);`
			`}`
			`$plainForMatch = $contentHtml ? trim(strip_tags($contentHtml)) : '';`
			`$publishedAt = $detail['published_at'] ?? $item->publishedAt;`

			`if (! CrawlKeywordParser::matchesAny($title, $plainForMatch !== '' ? $plainForMatch : null, $keywords)) {`
			`continue;`
			`}`

			`$categoryId = $this->categoryMatcher->resolveCategoryId(`
			`$title,`
			`$plainForMatch !== '' ? $plainForMatch : null,`
			`$keywords`
			`);`

			`$enriched[] = new CrawlItemDto(`
			`externalId: $item->externalId,`
			`title: $title,`
			`canonicalUrl: $item->canonicalUrl,`
			`summary: $detail['summary'] ?? $item->summary,`
			`publishedAt: $publishedAt,`
			`contentHtml: $contentHtml,`
			`extra: [`
			`'platform' => 'generic_html',`
			`'keywords' => $keywords,`
			`'category_dict_item_id' => $categoryId,`
			`'category_label' => $this->categoryMatcher->labelForId($categoryId),`
			`],`
			`);`
			`}`

			`return $enriched;`
			`}`

			`/**`
			`* @return list<CrawlItemDto>`
			`*/`
			`protected function parseListHtml(string $html, string $baseUrl, ?string $pageHost): array`
			`{`
			`$items = [];`
			`$seen = [];`

			`if (preg_match_all('#<li[^>]>(.?)</li>#isu', $html, $blocks, PREG_SET_ORDER)) {`
			`foreach ($blocks as $block) {`
			`$item = $this->extractLinkFromFragment($block[1], $baseUrl, $pageHost);`
			`if ($item && ! isset($seen[$item->externalId])) {`
			`$seen[$item->externalId] = true;`
			`$items[] = $item;`
			`}`
			`}`
			`}`

			`if (! preg_match_all(`
			`'#<a[^>]+href=["\']([^"\']+)["\'][^>]>(.?)</a>#isu',`
			`$html,`
			`$matches,`
			`PREG_SET_ORDER`
			`)) {`
			`return $items;`
			`}`

			`foreach ($matches as $m) {`
			`$item = $this->buildListItem($m[1], $m[2], $html, $baseUrl, $pageHost);`
			`if ($item && ! isset($seen[$item->externalId])) {`
			`$seen[$item->externalId] = true;`
			`$items[] = $item;`
			`}`
			`}`

			`return $items;`
			`}`

			`protected function extractLinkFromFragment(string $fragment, string $baseUrl, ?string $pageHost): ?CrawlItemDto`
			`{`
			`if (! preg_match('#<a[^>]+href=["\']([^"\']+)["\'][^>]>(.?)</a>#isu', $fragment, $m)) {`
			`return null;`
			`}`

			`return $this->buildListItem($m[1], $m[2], $fragment, $baseUrl, $pageHost);`
			`}`

			`protected function buildListItem(`
			`string $href,`
			`string $linkInner,`
			`string $context,`
			`string $baseUrl,`
			`?string $pageHost,`
			`): ?CrawlItemDto {`
			`$title = trim(preg_replace('/\s+/u', ' ', strip_tags(html_entity_decode($linkInner))));`
			`if (HtmlCrawlSupport::isSkippableLinkTitle($title) \|\| HtmlCrawlSupport::isWeakLinkTitle($title)) {`
			`return null;`
			`}`

			`$url = HtmlCrawlSupport::absoluteUrl($href, $baseUrl);`
			`if (! $url \|\| HtmlCrawlSupport::isAssetPath($url)) {`
			`return null;`
			`}`
			`if ($pageHost && ! HtmlCrawlSupport::sameHost($pageHost, HtmlCrawlSupport::hostKey($url))) {`
			`return null;`
			`}`
			`if (! $this->looksLikeNewsArticleUrl($url, $baseUrl)) {`
			`return null;`
			`}`

			`$publishedAt = HtmlCrawlSupport::extractDateFromText($context);`

			`return new CrawlItemDto(`
			`externalId: 'news:'.md5($url),`
			`title: $title,`
			`canonicalUrl: $url,`
			`publishedAt: $publishedAt,`
			`extra: ['platform' => 'generic_html'],`
			`);`
			`}`

			`protected function looksLikeNewsArticleUrl(string $url, string $listUrl): bool`
			`{`
			`if (rtrim($url, '/') === rtrim($listUrl, '/')) {`
			`return false;`
			`}`

			`$path = strtolower((string) parse_url($url, PHP_URL_PATH));`
			`if ($path === '' \|\| $path === '/') {`
			`return false;`
			`}`

			`if (preg_match('#/(news\|article\|content\|detail\|post\|story\|infor\|view\|show\|archives?)/#i', $path)) {`
			`return true;`
			`}`
			`if (preg_match('#/\d{4}[/\-]\d{1,2}[/\-]\d{1,2}/#', $path)) {`
			`return true;`
			`}`
			`if (preg_match('#\.(s?html?\|php\|aspx?)(\?\|$)#i', $path) && ! preg_match('#/(index\|list\|category\|tag\|search)(\.\|/\|$)#i', $path)) {`
			`return true;`
			`}`

			`return preg_match('#/[a-z0-9\-]{8,}\.s?html?$#i', $path) === 1;`
			`}`

			`protected function looksLikeArticlePage(string $html): bool`
			`{`
			`if (! preg_match('#<h1[^>]>(.?)</h1>#is', $html, $m)) {`
			`return false;`
			`}`
			`$h1 = trim(strip_tags(html_entity_decode($m[1])));`

			`return Str::length($h1) >= 8 && NewsContentHtml::extractBody($html) !== null;`
			`}`

			`protected function parseArticleFromHtml(string $html, string $url): ?CrawlItemDto`
			`{`
			`$detail = $this->parseArticleDetailFromHtml($html);`
			`$title = HtmlCrawlSupport::cleanArticleTitle($detail['title'] ?? null);`
			`if ($title === null \|\| $title === '') {`
			`return null;`
			`}`

			`return new CrawlItemDto(`
			`externalId: 'news:'.md5($url),`
			`title: $title,`
			`canonicalUrl: $url,`
			`summary: $detail['summary'],`
			`publishedAt: $detail['published_at'],`
			`contentHtml: $detail['content_html'],`
			`extra: ['platform' => 'generic_html'],`
			`);`
			`}`

			`/**`
			`* @return array{title:?string, summary:?string, content_html:?string, published_at:?string}`
			`*/`
			`protected function fetchArticleDetail(string $url): array`
			`{`
			`$empty = [`
			`'title' => null,`
			`'summary' => null,`
			`'content_html' => null,`
			`'published_at' => null,`
			`];`

			`if ($url === '') {`
			`return $empty;`
			`}`

			`try {`
			`$html = HtmlCrawlSupport::fetchHtml($url, 20);`
			`} catch (\Throwable) {`
			`return $empty;`
			`}`

			`return $this->parseArticleDetailFromHtml($html);`
			`}`

			`/**`
			`* @return array{title:?string, summary:?string, content_html:?string, published_at:?string}`
			`*/`
			`protected function parseArticleDetailFromHtml(string $html): array`
			`{`
			`$title = null;`
			`if (preg_match('#<meta[^>]+property=["\']og:title["\'][^>]+content=["\']([^"\']+)#i', $html, $m)) {`
			`$title = trim(html_entity_decode($m[1]));`
			`} elseif (preg_match('#<meta[^>]+name=["\']twitter:title["\'][^>]+content=["\']([^"\']+)#i', $html, $m)) {`
			`$title = trim(html_entity_decode($m[1]));`
			`} elseif (preg_match('#<h1[^>]>(.?)</h1>#is', $html, $m)) {`
			`$title = trim(strip_tags(html_entity_decode($m[1])));`
			`} elseif (preg_match('#<title[^>]>(.?)</title>#is', $html, $m)) {`
			`$title = trim(strip_tags(html_entity_decode($m[1])));`
			`}`
			`$title = HtmlCrawlSupport::cleanArticleTitle($title);`

			`$publishedAt = $this->extractPublishedAt($html);`

			`$contentHtml = NewsContentHtml::extractBody($html);`

			`$summary = null;`
			`if (preg_match('#<meta[^>]+name=["\']description["\'][^>]+content=["\']([^"\']+)#i', $html, $m)) {`
			`$summary = trim(html_entity_decode($m[1]));`
			`}`

			`return [`
			`'title' => $title,`
			`'summary' => $summary,`
			`'content_html' => $contentHtml,`
			`'published_at' => $publishedAt,`
			`];`
			`}`

			`protected function extractPublishedAt(string $html): ?string`
			`{`
			`if (preg_match('#<meta[^>]+property=["\']article:published_time["\'][^>]+content=["\']([^"\']+)#i', $html, $m)) {`
			`return HtmlCrawlSupport::normalizeDate($m[1]);`
			`}`
			`if (preg_match('#<meta[^>]+property=["\']og:published_time["\'][^>]+content=["\']([^"\']+)#i', $html, $m)) {`
			`return HtmlCrawlSupport::normalizeDate($m[1]);`
			`}`
			`if (preg_match('#<meta[^>]+name=["\'](?:publishdate\|pubdate\|date)["\'][^>]+content=["\']([^"\']+)#i', $html, $m)) {`
			`return HtmlCrawlSupport::normalizeDate($m[1]);`
			`}`
			`if (preg_match_all('#<time[^>]+datetime=["\']([^"\']+)["\'][^>]*>#i', $html, $m)) {`
			`foreach ($m[1] as $dt) {`
			`$parsed = HtmlCrawlSupport::normalizeDate($dt);`
			`if ($parsed) {`
			`return $parsed;`
			`}`
			`}`
			`}`
			`if (preg_match('#<time[^>]*>([^<]{6,40})</time>#i', $html, $m)) {`
			`return HtmlCrawlSupport::normalizeDate($m[1]);`
			`}`
			`if (preg_match('#<span[^>]class=["\'][^"\'](?:date\|time\|publish)[^"\']["\'][^>]>([^<]{6,40})</span>#i', $html, $m)) {`
			`return HtmlCrawlSupport::normalizeDate($m[1]);`
			`}`

			`return HtmlCrawlSupport::extractDateFromText($html);`
			`}`

			`}`