entry_url ?: 'https://www.pedaily.cn/all/';
        }

        // Collect list items from every fetched page, de-duplicated by external id.
        $items = [];
        $seen = [];
        foreach (HtmlPagination::fetchPagesHtml($fetchUrl, $maxPages) as $html) {
            foreach ($this->parseListHtml($html, $fetchUrl) as $item) {
                if (isset($seen[$item->externalId])) {
                    continue;
                }
                $seen[$item->externalId] = true;
                $items[] = $item;
            }
        }

        // First (cheap) keyword pass on the list title/summary, before any detail page is fetched.
        $items = array_values(array_filter(
            $items,
            fn (CrawlItemDto $item) => CrawlKeywordParser::matchesAny($item->title, $item->summary, $keywords)
        ));

        $enriched = [];
        foreach ($items as $item) {
            // Stop as soon as enough items have been accepted; remaining candidates are not fetched.
            if (count($enriched) >= $maxResults) {
                break;
            }

            $detail = $this->fetchArticleDetail($item->canonicalUrl ?? '');
            // Prefer the detail-page title, then the cleaned list title, then the raw list title.
            $title = HtmlCrawlSupport::cleanArticleTitle($detail['title'] ?? null)
                ?: HtmlCrawlSupport::cleanArticleTitle($item->title)
                ?: $item->title;
            $articleUrl = $item->canonicalUrl ?? '';
            $rawHtml = $detail['content_html'] ?? $item->contentHtml;
            // NOTE(review): images are localized before the second keyword check below, so
            // work is done even for articles that end up being skipped - confirm this is intended.
            $contentHtml = $this->imageLocalizer->localize($rawHtml, $articleUrl);
            $plainForMatch = $contentHtml ? trim(strip_tags($contentHtml)) : '';
            $publishedAt = $detail['published_at'] ?? $item->publishedAt;

            // Second keyword pass, this time against the final title and the article's plain text.
            if (! CrawlKeywordParser::matchesAny($title, $plainForMatch !== '' ? $plainForMatch : null, $keywords)) {
                continue;
            }

            $categoryId = $this->categoryMatcher->resolveCategoryId(
                $title,
                $plainForMatch !== '' ? $plainForMatch : null,
                $keywords
            );
            $categoryLabel = $this->categoryMatcher->labelForId($categoryId);

            $enriched[] = new CrawlItemDto(
                externalId: $item->externalId,
                title: $title,
                canonicalUrl: $item->canonicalUrl,
                summary: null,
                publishedAt: $publishedAt,
                contentHtml: $contentHtml,
                extra: [
                    'platform' => 'pedaily',
                    'keywords' => $keywords,
                    'category_dict_item_id' => $categoryId,
                    'category_label' => $categoryLabel,
                ],
            );
        }

        return $enriched;
    }

    /**
     * Extract candidate article links from the HTML of one listing page.
     *
     * A link is kept only when its text passes the title filters, its resolved URL is
     * hosted on pedaily.cn (or a subdomain), and the URL looks like an article
     * (known section path or an .html/.shtml resource). Links are de-duplicated by URL.
     *
     * @param  string  $html     Raw HTML of the listing page.
     * @param  string  $baseUrl  URL used to resolve relative hrefs.
     * @return list<CrawlItemDto>
     */
    protected function parseListHtml(string $html, string $baseUrl): array
    {
        // NOTE(review): this pattern starts with `]+href=` and has no closing tag. It looks as if
        // a leading `<a[^>` and a trailing `</a>` were lost (markup stripping?). It is kept
        // byte-for-byte here - confirm against version control before relying on it.
        $found = preg_match_all(
            '#]+href=["\']([^"\']+)["\'][^>]*>([^<]{8,200})#iu',
            $html,
            $matches,
            PREG_SET_ORDER
        );
        if (! $found) {
            return [];
        }

        // The date is derived from the whole page, so it is identical for every link on it;
        // compute it once. The caller overrides it with the detail-page date when one is found.
        $pageDate = HtmlCrawlSupport::extractDateFromText($html);

        $items = [];
        $seen = [];
        foreach ($matches as $m) {
            $href = html_entity_decode(trim($m[1]));
            $title = trim(strip_tags(html_entity_decode($m[2])));

            if (HtmlCrawlSupport::isSkippableLinkTitle($title) || HtmlCrawlSupport::isWeakLinkTitle($title)) {
                continue;
            }
            // Navigation / account links (login, register, more, next page, previous page).
            if (Str::contains($title, ['登录', '注册', '更多', '下一页', '上一页'])) {
                continue;
            }

            $url = $this->absoluteUrl($href, $baseUrl);
            // Compare the parsed host instead of searching the whole URL: a substring test
            // would also accept e.g. "https://evil.example/?ref=pedaily.cn".
            if (! $url || ! $this->isPedailyUrl($url)) {
                continue;
            }
            if (! preg_match('#/(article|news|inners|vcpe|company)/#i', $url) && ! preg_match('#\.s?html#i', $url)) {
                continue;
            }

            // md5 is used purely as a compact de-duplication key / stable id, not for security.
            $key = md5($url);
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;

            $items[] = new CrawlItemDto(
                externalId: 'pedaily:'.$key,
                title: $title,
                canonicalUrl: $url,
                publishedAt: $pageDate,
                extra: ['platform' => 'pedaily'],
            );
        }

        return $items;
    }

    /**
     * Fetch an article page and extract its title, publish date and body HTML.
     *
     * Best effort: an empty URL, a URL that is not hosted on pedaily.cn, or any failure
     * while fetching yields an array whose values are all null.
     *
     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}
     */
    protected function fetchArticleDetail(string $url): array
    {
        $empty = [
            'title' => null,
            'summary' => null,
            'content_html' => null,
            'published_at' => null,
        ];

        // Only fetch pages that are really hosted on pedaily.cn (host check, not substring check).
        if ($url === '' || ! $this->isPedailyUrl($url)) {
            return $empty;
        }

        try {
            $html = HtmlCrawlSupport::fetchHtml($url, 20);
        } catch (\Throwable) {
            // Deliberate best effort: a failed detail fetch must not abort the whole crawl.
            return $empty;
        }

        // NOTE(review): every pattern below starts with `]` and lacks a closing tag, which suggests
        // the tag names (`<h1[^>` ... `</h1>` or similar) were stripped from this file. The literals
        // are kept byte-for-byte - restore them from version control.
        $title = null;
        if (preg_match('#]*id=["\']newstitle["\'][^>]*>(.*?)#is', $html, $m)) {
            $title = trim(strip_tags(html_entity_decode($m[1])));
        } elseif (preg_match('#]*>(.*?)#is', $html, $m)) {
            $title = trim(strip_tags(html_entity_decode($m[1])));
        }
        $title = HtmlCrawlSupport::cleanArticleTitle($title);

        $publishedAt = null;
        if (preg_match('#]+datetime=["\']([^"\']+)["\']#i', $html, $m)) {
            $publishedAt = HtmlCrawlSupport::normalizeDate($m[1]);
        } elseif (preg_match('#]*>\s*]*>([^<]+)#is', $html, $m)) {
            $publishedAt = HtmlCrawlSupport::normalizeDate(trim($m[1]));
        }

        $contentHtml = null;
        // NOTE(review): this literal mentions both "news-content" and "article-body" in one pattern
        // but only group 1 is read; part of the original expression may be missing - TODO confirm.
        if (preg_match('#]+id=["\']news-content["\'][^>]*>(.*?)\s*]+id=["\']article-body["\'][^>]*>(.*?)#is', $html, $m)) {
            $contentHtml = trim($m[1]);
        }
        // Fall back to the generic body extractor when no dedicated container matched.
        if ($contentHtml === null || $contentHtml === '') {
            $contentHtml = NewsContentHtml::extractBody($html);
        }

        return [
            'title' => $title,
            'summary' => null,
            'content_html' => $contentHtml,
            'published_at' => $publishedAt,
        ];
    }

    /**
     * Normalize a date that starts with "Y-m-d" or "Y/m/d" to zero-padded "YYYY-MM-DD".
     *
     * Anything after the day (e.g. a time part) is ignored.
     *
     * @return string|null The normalized date, or null when the input does not start with a
     *                     date or the date does not exist in the calendar (e.g. month 13).
     */
    protected function normalizeDate(string $raw): ?string
    {
        $raw = str_replace('/', '-', trim($raw));
        if (preg_match('#^(\d{4})-(\d{1,2})-(\d{1,2})#', $raw, $m) !== 1) {
            return null;
        }

        [$year, $month, $day] = [(int) $m[1], (int) $m[2], (int) $m[3]];
        // Reject values that merely look like a date, such as 2024-13-45.
        if (! checkdate($month, $day, $year)) {
            return null;
        }

        return sprintf('%04d-%02d-%02d', $year, $month, $day);
    }

    /**
     * Resolve a link href against a base URL.
     *
     * Absolute http(s) hrefs are returned unchanged, protocol-relative hrefs inherit the base
     * scheme, and every other href is appended to the base origin (scheme, host and port).
     *
     * NOTE(review): path-relative hrefs ("foo.html") are resolved against the origin, not
     * against the directory of $base - kept as in the original; confirm this is intended.
     *
     * @return string|null The absolute URL, or null when $base has no scheme/host or the href
     *                     uses a non-http scheme such as "javascript:" or "mailto:".
     */
    protected function absoluteUrl(string $href, string $base): ?string
    {
        // Require a real "http://" / "https://" prefix; a bare "http" prefix would also
        // match relative paths such as "httpfoo.html".
        if (preg_match('#^https?://#i', $href) === 1) {
            return $href;
        }

        $parts = parse_url($base);
        if (! $parts || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        if (str_starts_with($href, '//')) {
            return $parts['scheme'].':'.$href;
        }

        // Any other explicit scheme cannot be turned into a crawlable page URL.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href) === 1) {
            return null;
        }

        // Keep an explicit port so links stay on the same origin as the base URL.
        $origin = $parts['scheme'].'://'.$parts['host'].(isset($parts['port']) ? ':'.$parts['port'] : '');

        return $origin.'/'.ltrim($href, '/');
    }

    /**
     * Tell whether a URL is hosted on pedaily.cn or one of its subdomains.
     *
     * The decision is made on the parsed host, so "pedaily.cn" appearing in the path,
     * the query string, or inside another domain name does not count.
     */
    private function isPedailyUrl(string $url): bool
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (! is_string($host) || $host === '') {
            return false;
        }

        $host = strtolower($host);

        return $host === 'pedaily.cn' || str_ends_with($host, '.pedaily.cn');
    }
}