slake-school-service/app/Services/Crawl/Adapters/PedailyHtmlAdapter.php

<?php

namespace App\Services\Crawl\Adapters;

use App\Models\CrawlSource;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use App\Services\Crawl\NewsCategoryMatcher;
use App\Services\Crawl\NewsHtmlImageLocalizer;
use App\Services\Crawl\HtmlCrawlSupport;
use App\Services\Crawl\HtmlPagination;
use App\Services\Crawl\NewsContentHtml;
use Illuminate\Support\Str;

/**
 * Crawler adapter that scrapes article listings and article pages from pedaily.cn.
 *
 * Flow of {@see fetch()}:
 *  1. download up to `max_pages` listing pages and collect candidate article links;
 *  2. pre-filter the candidates by keyword using the link title;
 *  3. download each remaining article, localize its images, re-check the keywords
 *     against the full text and resolve a news category;
 *  4. stop once `max_results` enriched items have been produced.
 */
class PedailyHtmlAdapter implements CrawlerAdapterInterface
{
    /** Domain (including any sub-domain) this adapter is allowed to request. */
    private const ALLOWED_DOMAIN = 'pedaily.cn';

    public function __construct(
        protected NewsCategoryMatcher $categoryMatcher,
        protected NewsHtmlImageLocalizer $imageLocalizer,
    ) {}

    /**
     * Crawl the listing at $requestUrl and return keyword-matching, enriched articles.
     *
     * Recognised $params keys:
     *  - `keyword`     raw keyword string handed to CrawlKeywordParser::parse();
     *  - `max_results` upper bound of returned items, clamped to 1..50 (default 30);
     *  - `max_pages`   number of listing pages to download, clamped to 1..50 (default 1).
     *
     * When $requestUrl does not point at pedaily.cn the source's `entry_url` is used,
     * falling back to the site's "all" listing.
     *
     * @param  array<string, mixed>  $params
     * @return list<CrawlItemDto>
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        $maxResults = min(50, max(1, (int) ($params['max_results'] ?? 30)));
        $maxPages = min(50, max(1, (int) ($params['max_pages'] ?? 1)));

        $fetchUrl = $requestUrl;
        if (! $this->isPedailyUrl($fetchUrl)) {
            $fetchUrl = $source->entry_url ?: 'https://www.pedaily.cn/all/';
        }

        // Collect candidates from every listing page, de-duplicated across pages.
        $items = [];
        $seen = [];
        foreach (HtmlPagination::fetchPagesHtml($fetchUrl, $maxPages) as $html) {
            foreach ($this->parseListHtml($html, $fetchUrl) as $item) {
                if (isset($seen[$item->externalId])) {
                    continue;
                }
                $seen[$item->externalId] = true;
                $items[] = $item;
            }
        }

        // Cheap pre-filter on the listing data so that only plausible candidates
        // cost an extra HTTP request for the article page.
        $items = array_values(array_filter(
            $items,
            static fn (CrawlItemDto $item) => CrawlKeywordParser::matchesAny($item->title, $item->summary, $keywords)
        ));

        $enriched = [];
        foreach ($items as $item) {
            if (count($enriched) >= $maxResults) {
                break;
            }

            $articleUrl = $item->canonicalUrl ?? '';
            $detail = $this->fetchArticleDetail($articleUrl);

            // Prefer the article page's title, then the cleaned listing title, then the raw one.
            $title = HtmlCrawlSupport::cleanArticleTitle($detail['title'] ?? null)
                ?: HtmlCrawlSupport::cleanArticleTitle($item->title)
                ?: $item->title;

            $rawHtml = $detail['content_html'] ?? $item->contentHtml;
            $contentHtml = $this->imageLocalizer->localize($rawHtml, $articleUrl);
            $plainText = $contentHtml ? trim(strip_tags($contentHtml)) : '';
            $plainForMatch = $plainText !== '' ? $plainText : null;

            // The article's own date wins over the coarse page-level date from the listing.
            $publishedAt = $detail['published_at'] ?? $item->publishedAt;

            // Second keyword check, this time against the final title and the full text.
            if (! CrawlKeywordParser::matchesAny($title, $plainForMatch, $keywords)) {
                continue;
            }

            $categoryId = $this->categoryMatcher->resolveCategoryId($title, $plainForMatch, $keywords);
            $categoryLabel = $this->categoryMatcher->labelForId($categoryId);

            $enriched[] = new CrawlItemDto(
                externalId: $item->externalId,
                title: $title,
                canonicalUrl: $item->canonicalUrl,
                summary: null,
                publishedAt: $publishedAt,
                contentHtml: $contentHtml,
                extra: [
                    'platform' => 'pedaily',
                    'keywords' => $keywords,
                    'category_dict_item_id' => $categoryId,
                    'category_label' => $categoryLabel,
                ],
            );
        }

        return $enriched;
    }

    /**
     * Extract candidate article links from one listing page.
     *
     * Only anchors whose text is a plain 8-200 character string (no nested tags)
     * are considered. Navigation links, links leaving pedaily.cn and links that
     * do not look like article URLs are dropped. Items are de-duplicated by URL.
     *
     * Note: the pattern uses the `u` modifier, so a page that is not valid UTF-8
     * yields no matches and therefore an empty list.
     *
     * @return list<CrawlItemDto>
     */
    protected function parseListHtml(string $html, string $baseUrl): array
    {
        $items = [];
        $seen = [];

        if (! preg_match_all(
            '#<a[^>]+href=["\']([^"\']+)["\'][^>]*>([^<]{8,200})</a>#iu',
            $html,
            $matches,
            PREG_SET_ORDER
        )) {
            return $items;
        }

        // The date is derived from the whole page, so it is identical for every
        // link: compute it once instead of once per item. It is only a fallback;
        // fetch() prefers the date found on the article page itself.
        // Assumes extractDateFromText() is deterministic for a given input.
        $pageDate = HtmlCrawlSupport::extractDateFromText($html);

        foreach ($matches as $m) {
            $href = html_entity_decode(trim($m[1]));
            $title = trim(strip_tags(html_entity_decode($m[2])));
            if (HtmlCrawlSupport::isSkippableLinkTitle($title) || HtmlCrawlSupport::isWeakLinkTitle($title)) {
                continue;
            }
            // Login / register / "more" / pagination links.
            if (Str::contains($title, ['登录', '注册', '更多', '下一页', '上一页'])) {
                continue;
            }

            $url = $this->absoluteUrl($href, $baseUrl);
            if (! $url || ! $this->isPedailyUrl($url)) {
                continue;
            }
            // Keep only URLs in a known article section or ending up on an (s)html page.
            if (! preg_match('#/(article|news|inners|vcpe|company)/#i', $url) && ! preg_match('#\.s?html#i', $url)) {
                continue;
            }

            $key = md5($url);
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;

            $items[] = new CrawlItemDto(
                externalId: 'pedaily:'.$key,
                title: $title,
                canonicalUrl: $url,
                publishedAt: $pageDate,
                extra: ['platform' => 'pedaily'],
            );
        }

        return $items;
    }

    /**
     * Download one article page and extract its title, publication date and body.
     *
     * This is best-effort: a URL outside pedaily.cn or any failure while
     * downloading yields an array with every field set to null, so the caller
     * can fall back to the data from the listing page.
     *
     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}
     */
    protected function fetchArticleDetail(string $url): array
    {
        $empty = [
            'title' => null,
            'summary' => null,
            'content_html' => null,
            'published_at' => null,
        ];

        // Never request hosts other than pedaily.cn: the URL originates from crawled HTML.
        if ($url === '' || ! $this->isPedailyUrl($url)) {
            return $empty;
        }

        try {
            $html = HtmlCrawlSupport::fetchHtml($url, 20);
        } catch (\Throwable) {
            // Deliberately swallowed: one unreachable article must not abort the whole crawl.
            return $empty;
        }

        // Title: the dedicated headline element first, any <h1> otherwise.
        $title = null;
        if (preg_match('#<h1[^>]*id=["\']newstitle["\'][^>]*>(.*?)</h1>#is', $html, $m)) {
            $title = trim(strip_tags(html_entity_decode($m[1])));
        } elseif (preg_match('#<h1[^>]*>(.*?)</h1>#is', $html, $m)) {
            $title = trim(strip_tags(html_entity_decode($m[1])));
        }
        $title = HtmlCrawlSupport::cleanArticleTitle($title);

        // Date: machine-readable `datetime` attribute first, visible <time> text otherwise.
        $publishedAt = null;
        if (preg_match('#<time[^>]+datetime=["\']([^"\']+)["\']#i', $html, $m)) {
            $publishedAt = HtmlCrawlSupport::normalizeDate($m[1]);
        } elseif (preg_match('#<span class="date"[^>]*>\s*<time[^>]*>([^<]+)</time>#is', $html, $m)) {
            $publishedAt = HtmlCrawlSupport::normalizeDate(trim($m[1]));
        }

        // Body: known content containers first, generic extraction as a last resort.
        // The non-greedy patterns stop at the first closing </div>, so a container
        // with nested <div> elements is captured only up to that point.
        $contentHtml = null;
        if (preg_match('#<div[^>]+id=["\']news-content["\'][^>]*>(.*?)</div>\s*<div#is', $html, $m)) {
            $contentHtml = trim($m[1]);
        } elseif (preg_match('#<div[^>]+id=["\']article-body["\'][^>]*>(.*?)</div>#is', $html, $m)) {
            $contentHtml = trim($m[1]);
        }
        if ($contentHtml === null || $contentHtml === '') {
            $contentHtml = NewsContentHtml::extractBody($html);
        }

        return [
            'title' => $title,
            'summary' => null,
            'content_html' => $contentHtml,
            'published_at' => $publishedAt,
        ];
    }

    /**
     * Normalize a date that starts with `Y-m-d` or `Y/m/d` to zero-padded `YYYY-MM-DD`.
     *
     * Anything after the day (e.g. a time of day) is ignored. Returns null when
     * the text does not start with such a date or when the date does not exist
     * in the calendar (e.g. month 13 or February 30).
     *
     * Not called by any method of this class, which relies on
     * HtmlCrawlSupport::normalizeDate() instead; kept for extending classes.
     */
    protected function normalizeDate(string $raw): ?string
    {
        $raw = str_replace('/', '-', trim($raw));
        if (preg_match('#^(\d{4})-(\d{1,2})-(\d{1,2})#', $raw, $m) !== 1) {
            return null;
        }

        $year = (int) $m[1];
        $month = (int) $m[2];
        $day = (int) $m[3];
        if (! checkdate($month, $day, $year)) {
            return null;
        }

        return sprintf('%04d-%02d-%02d', $year, $month, $day);
    }

    /**
     * Resolve a link found on a page against that page's URL.
     *
     * Handles absolute http(s) URLs, protocol-relative (`//host/path`),
     * root-relative (`/path`), query-only (`?page=2`) and document-relative
     * (`item.shtml`, `../news/1.shtml`) references.
     *
     * Returns null for empty or fragment-only references, for non-http schemes
     * (`javascript:`, `mailto:`, ...) and when $base is not an absolute URL.
     */
    protected function absoluteUrl(string $href, string $base): ?string
    {
        $href = trim($href);
        if ($href === '' || $href[0] === '#') {
            return null;
        }
        if (preg_match('#^https?://#i', $href) === 1) {
            return $href;
        }
        // Any other scheme cannot be crawled over HTTP.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href) === 1) {
            return null;
        }

        $parts = parse_url($base);
        if (! $parts || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }
        if (str_starts_with($href, '//')) {
            return $parts['scheme'].':'.$href;
        }

        $origin = $parts['scheme'].'://'.$parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':'.$parts['port'];
        }
        if (str_starts_with($href, '/')) {
            return $origin.$href;
        }

        $basePath = $parts['path'] ?? '';
        if ($basePath === '' || $basePath[0] !== '/') {
            $basePath = '/';
        }
        // A query-only reference keeps the base document and replaces its query.
        if ($href[0] === '?') {
            return $origin.$basePath.$href;
        }

        // Document-relative: resolve against the directory of the base document.
        $baseDir = substr($basePath, 0, (int) strrpos($basePath, '/') + 1);
        $pathLength = strcspn($href, '?#');
        $relativePath = substr($href, 0, $pathLength);
        $suffix = substr($href, $pathLength);

        return $origin.$this->removeDotSegments($baseDir.$relativePath).$suffix;
    }

    /**
     * Tell whether $url is an absolute URL whose host is pedaily.cn or one of its sub-domains.
     *
     * The host is parsed out of the URL, so the domain name merely appearing in
     * the path, the query string or as a prefix of another domain does not count.
     */
    private function isPedailyUrl(string $url): bool
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (! is_string($host) || $host === '') {
            return false;
        }
        // Host names are case-insensitive and may carry a trailing root dot.
        $host = strtolower(rtrim($host, '.'));

        return $host === self::ALLOWED_DOMAIN || str_ends_with($host, '.'.self::ALLOWED_DOMAIN);
    }

    /**
     * Collapse `.` and `..` segments of an absolute URL path (one starting with `/`).
     *
     * `..` never climbs above the root, and a trailing `.` or `..` leaves a
     * trailing slash, e.g. `/all/..` becomes `/`.
     */
    private function removeDotSegments(string $path): string
    {
        $segments = explode('/', $path);
        $lastIndex = count($segments) - 1;

        $resolved = [];
        foreach ($segments as $index => $segment) {
            if ($segment !== '.' && $segment !== '..') {
                $resolved[] = $segment;

                continue;
            }
            // Index 0 holds the empty segment in front of the leading slash; keep it.
            if ($segment === '..' && count($resolved) > 1) {
                array_pop($resolved);
            }
            if ($index === $lastIndex) {
                $resolved[] = '';
            }
        }

        return implode('/', $resolved);
    }
}