parseListHtml($html, $requestUrl, $pageHost) as $item) {
                // Keep only the first occurrence of each external id.
                if (isset($seen[$item->externalId])) {
                    continue;
                }
                $seen[$item->externalId] = true;
                $items[] = $item;
            }
        }

        // A "listing" that yields at most one link may itself be an article page:
        // fetch the first page and, if it looks like an article, use it as the only item.
        if (count($items) <= 1) {
            $firstHtml = HtmlPagination::fetchPagesHtml($requestUrl, 1)[0] ?? '';
            if ($firstHtml !== '' && $this->looksLikeArticlePage($firstHtml)) {
                $single = $this->parseArticleFromHtml($firstHtml, $requestUrl);
                if ($single !== null) {
                    $items = [$single];
                }
            }
        }

        // Pre-filter on list-level title/summary before any detail page is fetched.
        $items = array_values(array_filter(
            $items,
            fn (CrawlItemDto $item) => CrawlKeywordParser::matchesAny($item->title, $item->summary, $keywords)
        ));

        $enriched = [];
        foreach ($items as $item) {
            if (count($enriched) >= $maxResults) {
                break;
            }

            $detail = $this->fetchArticleDetail($item->canonicalUrl ?? '');

            // Prefer the detail-page title, then the cleaned list title, then the raw list title.
            $title = HtmlCrawlSupport::cleanArticleTitle($detail['title'] ?? null)
                ?: HtmlCrawlSupport::cleanArticleTitle($item->title)
                ?: $item->title;
            if (HtmlCrawlSupport::isWeakLinkTitle($title)) {
                continue;
            }

            $articleUrl = $item->canonicalUrl ?? '';
            $contentHtml = $detail['content_html'] ?? $item->contentHtml;
            if ($contentHtml) {
                $contentHtml = $this->imageLocalizer->localize($contentHtml, $articleUrl);
            }

            $plainForMatch = $contentHtml ? trim(strip_tags($contentHtml)) : '';
            $publishedAt = $detail['published_at'] ?? $item->publishedAt;

            // Second keyword check, this time against the final title and the body text.
            if (! CrawlKeywordParser::matchesAny($title, $plainForMatch !== '' ? $plainForMatch : null, $keywords)) {
                continue;
            }

            $categoryId = $this->categoryMatcher->resolveCategoryId(
                $title,
                $plainForMatch !== '' ? $plainForMatch : null,
                $keywords
            );

            $enriched[] = new CrawlItemDto(
                externalId: $item->externalId,
                title: $title,
                canonicalUrl: $item->canonicalUrl,
                summary: $detail['summary'] ?? $item->summary,
                publishedAt: $publishedAt,
                contentHtml: $contentHtml,
                extra: [
                    'platform' => 'generic_html',
                    'keywords' => $keywords,
                    'category_dict_item_id' => $categoryId,
                    'category_label' => $this->categoryMatcher->labelForId($categoryId),
                ],
            );
        }

        return $enriched;
    }

    /**
     * Collects candidate article links from a listing page.
     *
     * Two passes are made and their results merged, de-duplicated by external id:
     * first one link per <li> block (the block is used as the date context), then
     * every anchor in the document.
     *
     * @param string      $html     Listing page markup.
     * @param string      $baseUrl  URL the markup was fetched from; used to resolve relative links.
     * @param string|null $pageHost When set, links on a different host are dropped.
     *
     * @return list<CrawlItemDto>
     */
    protected function parseListHtml(string $html, string $baseUrl, ?string $pageHost): array
    {
        $items = [];
        $seen = [];

        if (preg_match_all('#<li\b[^>]*>(.*?)</li>#isu', $html, $blocks, PREG_SET_ORDER)) {
            foreach ($blocks as $block) {
                $item = $this->extractLinkFromFragment($block[1], $baseUrl, $pageHost);
                if ($item && ! isset($seen[$item->externalId])) {
                    $seen[$item->externalId] = true;
                    $items[] = $item;
                }
            }
        }

        if (! preg_match_all(
            '#<a\b[^>]+href=["\']([^"\']+)["\'][^>]*>(.*?)</a>#isu',
            $html,
            $matches,
            PREG_SET_ORDER
        )) {
            return $items;
        }

        foreach ($matches as $m) {
            // NOTE(review): the whole document is passed as the date context here, so the
            // date lookup sees the full page rather than text near the link — confirm intended.
            $item = $this->buildListItem($m[1], $m[2], $html, $baseUrl, $pageHost);
            if ($item && ! isset($seen[$item->externalId])) {
                $seen[$item->externalId] = true;
                $items[] = $item;
            }
        }

        return $items;
    }

    /**
     * Builds a list item from the first anchor found inside an HTML fragment.
     *
     * The fragment itself is handed to buildListItem() as the date context.
     *
     * @return CrawlItemDto|null Null when the fragment has no anchor or the anchor is rejected.
     */
    protected function extractLinkFromFragment(string $fragment, string $baseUrl, ?string $pageHost): ?CrawlItemDto
    {
        if (! preg_match('#<a\b[^>]+href=["\']([^"\']+)["\'][^>]*>(.*?)</a>#isu', $fragment, $m)) {
            return null;
        }

        return $this->buildListItem($m[1], $m[2], $fragment, $baseUrl, $pageHost);
    }

    /**
     * Turns one anchor (href + inner markup) into a CrawlItemDto, or rejects it.
     *
     * Rejected (null) when the link text is skippable or weak, the URL cannot be made
     * absolute or points at an asset, the host differs from $pageHost, or the URL does
     * not pass looksLikeNewsArticleUrl().
     *
     * @param string      $href      Raw href attribute value.
     * @param string      $linkInner Inner HTML of the anchor; tags are stripped to form the title.
     * @param string      $context   Text searched for a publication date.
     * @param string      $baseUrl   Base for resolving relative hrefs; also the listing URL.
     * @param string|null $pageHost  Host the link must belong to, when set.
     */
    protected function buildListItem(
        string $href,
        string $linkInner,
        string $context,
        string $baseUrl,
        ?string $pageHost,
    ): ?CrawlItemDto {
        $text = strip_tags(html_entity_decode($linkInner));
        // preg_replace() yields null on failure (e.g. invalid UTF-8 with /u); never pass that to trim().
        $title = trim(preg_replace('/\s+/u', ' ', $text) ?? '');
        if (HtmlCrawlSupport::isSkippableLinkTitle($title) || HtmlCrawlSupport::isWeakLinkTitle($title)) {
            return null;
        }

        $url = HtmlCrawlSupport::absoluteUrl($href, $baseUrl);
        if (! $url || HtmlCrawlSupport::isAssetPath($url)) {
            return null;
        }

        if ($pageHost && ! HtmlCrawlSupport::sameHost($pageHost, HtmlCrawlSupport::hostKey($url))) {
            return null;
        }

        if (! $this->looksLikeNewsArticleUrl($url, $baseUrl)) {
            return null;
        }

        $publishedAt = HtmlCrawlSupport::extractDateFromText($context);

        return new CrawlItemDto(
            externalId: 'news:'.md5($url),
            title: $title,
            canonicalUrl: $url,
            publishedAt: $publishedAt,
            extra: ['platform' => 'generic_html'],
        );
    }

    /**
     * Heuristic: does $url look like an individual article rather than a listing?
     *
     * @param string $url     Absolute candidate URL.
     * @param string $listUrl URL of the listing page; a link back to it is never an article.
     */
    protected function looksLikeNewsArticleUrl(string $url, string $listUrl): bool
    {
        if (rtrim($url, '/') === rtrim($listUrl, '/')) {
            return false;
        }

        $path = strtolower((string) parse_url($url, PHP_URL_PATH));
        if ($path === '' || $path === '/') {
            return false;
        }

        // A typical "article" directory segment anywhere in the path.
        if (preg_match('#/(news|article|content|detail|post|story|infor|view|show|archives?)/#i', $path)) {
            return true;
        }

        // A date-shaped path segment such as /2024/01/31/ or /2024-1-31/.
        if (preg_match('#/\d{4}[/\-]\d{1,2}[/\-]\d{1,2}/#', $path)) {
            return true;
        }

        // A page-like extension, unless the path names an index/list/category/tag/search page.
        if (preg_match('#\.(s?html?|php|aspx?)(\?|$)#i', $path)
            && ! preg_match('#/(index|list|category|tag|search)(\.|/|$)#i', $path)) {
            return true;
        }

        // Last resort: a file name of 8+ slug characters ending in .html/.htm/.shtml.
        return preg_match('#/[a-z0-9\-]{8,}\.s?html?$#i', $path) === 1;
    }

    /**
     * Heuristic: does this document look like a single article page?
     *
     * True when the first <h1> holds at least 8 characters of text and
     * NewsContentHtml::extractBody() finds a body.
     */
    protected function looksLikeArticlePage(string $html): bool
    {
        if (! preg_match('#<h1\b[^>]*>(.*?)</h1>#is', $html, $m)) {
            return false;
        }

        $h1 = trim(strip_tags(html_entity_decode($m[1])));

        return Str::length($h1) >= 8 && NewsContentHtml::extractBody($html) !== null;
    }

    /**
     * Builds a CrawlItemDto directly from an article page's markup.
     *
     * @return CrawlItemDto|null Null when no usable title can be extracted.
     */
    protected function parseArticleFromHtml(string $html, string $url): ?CrawlItemDto
    {
        $detail = $this->parseArticleDetailFromHtml($html);
        $title = HtmlCrawlSupport::cleanArticleTitle($detail['title'] ?? null);
        if ($title === null || $title === '') {
            return null;
        }

        return new CrawlItemDto(
            externalId: 'news:'.md5($url),
            title: $title,
            canonicalUrl: $url,
            summary: $detail['summary'],
            publishedAt: $detail['published_at'],
            contentHtml: $detail['content_html'],
            extra: ['platform' => 'generic_html'],
        );
    }

    /**
     * Fetches an article page and extracts its details.
     *
     * Best-effort: an empty URL or any failure while fetching yields the all-null shape
     * instead of an exception, so one bad article does not abort the caller's loop.
     *
     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}
     */
    protected function fetchArticleDetail(string $url): array
    {
        $empty = [
            'title' => null,
            'summary' => null,
            'content_html' => null,
            'published_at' => null,
        ];

        if ($url === '') {
            return $empty;
        }

        try {
            $html = HtmlCrawlSupport::fetchHtml($url, 20);
        } catch (\Throwable) {
            return $empty;
        }

        return $this->parseArticleDetailFromHtml($html);
    }

    /**
     * Extracts title, summary, body HTML and publication date from article markup.
     *
     * Title sources, first non-empty wins: og:title meta, twitter:title meta, first <h1>,
     * <title>. The summary comes from the description meta tag.
     *
     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}
     */
    protected function parseArticleDetailFromHtml(string $html): array
    {
        $title = $this->metaContent($html, 'property', 'og:title')
            ?? $this->metaContent($html, 'name', 'twitter:title');

        if ($title === null) {
            foreach (['h1', 'title'] as $tag) {
                if (! preg_match('#<'.$tag.'\b[^>]*>(.*?)</'.$tag.'>#is', $html, $m)) {
                    continue;
                }
                $text = trim(strip_tags(html_entity_decode($m[1])));
                if ($text !== '') {
                    $title = $text;
                    break;
                }
            }
        }

        return [
            'title' => HtmlCrawlSupport::cleanArticleTitle($title),
            'summary' => $this->metaContent($html, 'name', 'description'),
            'content_html' => NewsContentHtml::extractBody($html),
            'published_at' => $this->extractPublishedAt($html),
        ];
    }

    /**
     * Finds a publication date in article markup.
     *
     * Strategies are tried in order and the first value that normalizeDate() accepts wins:
     * published-time meta tags, <time datetime="…"> attributes, <time> text, the text of an
     * element whose class mentions date/time/publish, and finally a scan of the whole document.
     * A candidate that fails to normalize no longer ends the search.
     */
    protected function extractPublishedAt(string $html): ?string
    {
        $metaLookups = [
            ['property', 'article:published_time'],
            ['property', 'og:published_time'],
            ['name', 'publishdate', 'pubdate', 'date'],
        ];
        foreach ($metaLookups as $lookup) {
            $raw = $this->metaContent($html, ...$lookup);
            if ($raw === null) {
                continue;
            }
            $parsed = HtmlCrawlSupport::normalizeDate($raw);
            if ($parsed) {
                return $parsed;
            }
        }

        if (preg_match_all('#<time\b[^>]+datetime=["\']([^"\']+)["\'][^>]*>#i', $html, $m)) {
            foreach ($m[1] as $dt) {
                $parsed = HtmlCrawlSupport::normalizeDate($dt);
                if ($parsed) {
                    return $parsed;
                }
            }
        }

        $textPatterns = [
            '#<time\b[^>]*>([^<]{6,40})</time>#i',
            '#<[a-z][a-z0-9]*\b[^>]*class=["\'][^"\']*(?:date|time|publish)[^"\']*["\'][^>]*>([^<]{6,40})<#i',
        ];
        foreach ($textPatterns as $pattern) {
            if (preg_match($pattern, $html, $m)) {
                $parsed = HtmlCrawlSupport::normalizeDate($m[1]);
                if ($parsed) {
                    return $parsed;
                }
            }
        }

        return HtmlCrawlSupport::extractDateFromText($html);
    }

    /**
     * Returns the decoded, trimmed `content` of the first <meta> tag whose $attribute
     * equals one of $names (case-insensitive) and whose content is non-empty.
     *
     * The two attributes may appear in either order, and the content value is read up to
     * its own closing quote, so an apostrophe inside a double-quoted value is preserved.
     *
     * @param string $attribute Key attribute to test, e.g. "property" or "name".
     * @param string ...$names  Accepted literal values of that attribute.
     */
    private function metaContent(string $html, string $attribute, string ...$names): ?string
    {
        if ($names === [] || ! preg_match_all('#<meta\b[^>]*>#i', $html, $tags)) {
            return null;
        }

        $alternatives = implode('|', array_map(
            static fn (string $name): string => preg_quote($name, '#'),
            $names
        ));
        // The lookbehind keeps e.g. data-property="…" from being read as property="…".
        $keyPattern = '#(?<![\w\-])'.preg_quote($attribute, '#').'\s*=\s*(["\'])(?:'.$alternatives.')\1#i';

        foreach ($tags[0] as $tag) {
            if (preg_match($keyPattern, $tag) !== 1) {
                continue;
            }
            if (preg_match('#(?<![\w\-])content\s*=\s*(["\'])(.*?)\1#is', $tag, $m) === 1) {
                $value = trim(html_entity_decode($m[2]));
                if ($value !== '') {
                    return $value;
                }
            }
        }

        return null;
    }
}