slake-school-service/app/Services/Crawl/Adapters/HuxiuHtmlAdapter.php

<?php

namespace App\Services\Crawl\Adapters;

use App\Models\CrawlSource;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use App\Services\Crawl\HtmlCrawlSupport;
use Illuminate\Support\Facades\Http;

/**
 * Crawl adapter for huxiu.com.
 *
 * Channel/list URLs are served through the site's JSON list endpoint
 * (API_URL) instead of being scraped from HTML; article bodies are pulled
 * from the JSON detail endpoint (ARTICLE_DETAIL_URL) when the article id can
 * be read from the URL. Anything that does not match falls back to the
 * generic HTML behaviour inherited from GenericNewsHtmlAdapter.
 */
class HuxiuHtmlAdapter extends GenericNewsHtmlAdapter
{
    protected const API_URL = 'https://api-web-article.huxiu.com/web/channel/articleListV1';

    protected const ARTICLE_DETAIL_URL = 'https://api-web-article.huxiu.com/web/article/detail';

    protected const PAGE_SIZE = 20;

    /**
     * Entry point: use the JSON list API when the URL identifies a channel
     * listing, otherwise defer to the parent HTML crawler.
     *
     * @param  array<string, mixed>  $params
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $channelId = $this->resolveChannelId($requestUrl);
        if ($channelId === null) {
            return parent::fetch($requestUrl, $source, $params);
        }

        return $this->fetchChannelViaApi($requestUrl, $params, $channelId);
    }

    /**
     * Page through the channel list API, collecting de-duplicated items until
     * `max_results` items or `max_pages` pages are reached, then hand the list
     * to the parent's enrichment step.
     *
     * Reads `keyword`, `max_results` (clamped to 1..50, default 30) and
     * `max_pages` (clamped to 1..50, default 1) from $params.
     *
     * @param  array<string, mixed>  $params
     * @return list<CrawlItemDto>
     *
     * @throws \RuntimeException when the list endpoint fails (see requestChannelArticleList)
     */
    protected function fetchChannelViaApi(string $requestUrl, array $params, int $channelId): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        $maxResults = min(50, max(1, (int) ($params['max_results'] ?? 30)));
        $maxPages = min(50, max(1, (int) ($params['max_pages'] ?? 1)));

        $listItems = [];
        $seen = [];
        $lastId = null;

        for ($page = 1; $page <= $maxPages && count($listItems) < $maxResults; $page++) {
            $payload = $this->requestChannelArticleList($channelId, self::PAGE_SIZE, $lastId);

            // The payload is remote data: treat a missing or malformed list as "no more rows".
            $rows = $payload['datalist'] ?? null;
            if (! is_array($rows) || $rows === []) {
                break;
            }

            foreach ($rows as $row) {
                if (count($listItems) >= $maxResults) {
                    break 2;
                }

                // Skip malformed entries instead of letting the array type-hint abort the crawl.
                if (! is_array($row)) {
                    continue;
                }

                $item = $this->mapApiRowToDto($row);
                if ($item === null || isset($seen[$item->externalId])) {
                    continue;
                }

                $seen[$item->externalId] = true;
                $listItems[] = $item;
            }

            // `last_id` is the cursor for the next page; stop when it is absent,
            // non-positive, or did not advance (which would loop on the same page).
            $rawLastId = $payload['last_id'] ?? null;
            $nextLastId = is_numeric($rawLastId) ? (int) $rawLastId : 0;
            if ($nextLastId <= 0 || $nextLastId === $lastId) {
                break;
            }
            $lastId = $nextLastId;
        }

        $applyKeywordFilter = HtmlCrawlSupport::shouldApplyKeywordFilter($requestUrl, '', $keywords);

        return $this->enrichNewsItems($listItems, $keywords, $applyKeywordFilter, $maxResults, 'huxiu');
    }

    /**
     * Derive the list-API channel id from a listing URL.
     *
     * Returns the numeric id for `/channel/<id>.html|.htm|.shtml` paths, 0 for
     * a bare `/article` listing path, and null when the URL is not a listing
     * this adapter handles through the API.
     * NOTE(review): 0 is sent to the API as `channel_id`; presumably it selects
     * the site-wide article feed — confirm against the API.
     */
    protected function resolveChannelId(string $url): ?int
    {
        $path = (string) parse_url($url, PHP_URL_PATH);
        if (preg_match('#/channel/(\d+)\.(?:html?|shtml)$#i', $path, $match)) {
            return (int) $match[1];
        }

        if (preg_match('#/article/?(?:\.html?|\.shtml)?$#i', $path)) {
            return 0;
        }

        return null;
    }

    /**
     * Request one page of the channel article list and return the `data`
     * member of the JSON envelope.
     *
     * $pageSize is clamped to 1..30 before being sent. $lastId is the cursor
     * returned by the previous page and is omitted for the first page.
     *
     * @return array{name?:string, datalist?:list<array<string, mixed>>, last_id?:int|string}
     *
     * @throws \RuntimeException on a non-2xx response or when the envelope's `success` flag is falsy
     */
    protected function requestChannelArticleList(int $channelId, int $pageSize, ?int $lastId): array
    {
        $form = [
            'platform' => 'www',
            'channel_id' => (string) $channelId,
            'pagesize' => (string) max(1, min(30, $pageSize)),
        ];
        if ($lastId) {
            $form['last_id'] = (string) $lastId;
        }

        $response = $this->postHuxiuForm(self::API_URL, $form);

        if (! $response->successful()) {
            throw new \RuntimeException('虎嗅列表接口请求失败：HTTP '.$response->status());
        }

        $json = $response->json();
        if (! is_array($json) || empty($json['success'])) {
            $message = is_array($json) ? (string) ($json['message'] ?? '未知错误') : '响应格式异常';

            throw new \RuntimeException('虎嗅列表接口返回失败：'.$message);
        }

        $data = $json['data'] ?? [];

        return is_array($data) ? $data : [];
    }

    /**
     * Convert one list-API row into a CrawlItemDto, or null when the row has
     * no usable title or no way to build an article URL.
     *
     * @param  array<string, mixed>  $row
     */
    protected function mapApiRowToDto(array $row): ?CrawlItemDto
    {
        $title = trim((string) ($row['title'] ?? ''));
        if ($title === '' || HtmlCrawlSupport::isSkippableLinkTitle($title) || HtmlCrawlSupport::isWeakLinkTitle($title)) {
            return null;
        }

        $url = HtmlCrawlSupport::normalizeNewsUrl((string) ($row['url'] ?? ''));
        if ($url === null || $url === '') {
            // No usable URL in the row: rebuild the canonical article URL from the article id.
            $aid = trim((string) ($row['aid'] ?? ''));
            if ($aid === '') {
                return null;
            }
            $url = 'https://www.huxiu.com/article/'.$aid.'.html';
        }

        $publishedAt = $this->unixTimestampToDate($row['dateline'] ?? null);

        // Prefer `summary`, but fall through to `short_content` when `summary`
        // is present-but-blank; `??` alone would only fall back on null.
        $summary = '';
        foreach (['summary', 'short_content'] as $field) {
            $candidate = $row[$field] ?? null;
            if (! is_scalar($candidate)) {
                continue;
            }
            $candidate = trim((string) $candidate);
            if ($candidate !== '') {
                $summary = $candidate;
                break;
            }
        }

        return new CrawlItemDto(
            externalId: 'news:'.md5($url),
            title: $title,
            canonicalUrl: $url,
            summary: $summary !== '' ? $summary : null,
            publishedAt: $publishedAt,
            extra: ['platform' => 'huxiu'],
        );
    }

    /**
     * Fetch article detail, preferring the JSON detail API and falling back
     * to the parent's implementation when the id cannot be resolved or the
     * API yields nothing usable.
     *
     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}
     */
    protected function fetchArticleDetail(string $url): array
    {
        $aid = $this->resolveArticleId($url);
        if ($aid !== null) {
            $apiDetail = $this->requestArticleDetail($aid, $url);
            if ($apiDetail !== null) {
                return $apiDetail;
            }
        }

        return parent::fetchArticleDetail($url);
    }

    /**
     * Extract the numeric article id from an `/article/<id>.html` URL path,
     * or null when the path does not match.
     */
    protected function resolveArticleId(string $url): ?int
    {
        $path = (string) parse_url($url, PHP_URL_PATH);
        if (preg_match('#/article/(\d+)\.html#i', $path, $match)) {
            return (int) $match[1];
        }

        return null;
    }

    /**
     * Best-effort lookup of an article through the JSON detail API.
     *
     * Returns null — never throws for transport or payload problems — when
     * the connection fails, the response is not successful, the envelope is
     * malformed or unsuccessful, or the content has fewer than 30 characters
     * of text once tags are stripped. The caller treats null as "use the
     * fallback".
     *
     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}|null
     */
    protected function requestArticleDetail(int $aid, string $articleUrl): ?array
    {
        try {
            $response = $this->postHuxiuForm(self::ARTICLE_DETAIL_URL, [
                'platform' => 'www',
                'aid' => (string) $aid,
            ]);
        } catch (\Illuminate\Http\Client\ConnectionException) {
            // Connection errors/timeouts are just another "API unavailable"
            // outcome here; returning null keeps the caller's fallback reachable.
            return null;
        }

        if (! $response->successful()) {
            return null;
        }

        $json = $response->json();
        if (! is_array($json) || empty($json['success'])) {
            return null;
        }

        $data = $json['data'] ?? [];
        if (! is_array($data)) {
            return null;
        }

        $content = trim((string) ($data['content'] ?? ''));
        if ($content === '' || mb_strlen(strip_tags($content)) < 30) {
            return null;
        }

        $title = HtmlCrawlSupport::cleanArticleTitle(trim((string) ($data['title'] ?? '')));
        $summary = trim((string) ($data['summary'] ?? ''));

        // Prefer the pre-formatted `fdateline`; only consult the raw
        // `dateline` timestamp when `fdateline` is absent or empty.
        $publishedAt = null;
        if (! empty($data['fdateline'])) {
            $publishedAt = HtmlCrawlSupport::normalizeDate((string) $data['fdateline']);
        } elseif (! empty($data['dateline'])) {
            $publishedAt = $this->unixTimestampToDate($data['dateline']);
        }

        return [
            'title' => $title,
            'summary' => $summary !== '' ? $summary : null,
            'content_html' => \App\Services\Crawl\NewsContentHtml::normalize($content, $articleUrl),
            'published_at' => $publishedAt,
        ];
    }

    /**
     * POST a form-encoded request to one of the JSON endpoints with the
     * headers both endpoints share (30 second timeout).
     *
     * @param  array<string, string>  $form
     *
     * @throws \Illuminate\Http\Client\ConnectionException when the connection fails or times out
     */
    private function postHuxiuForm(string $endpoint, array $form): \Illuminate\Http\Client\Response
    {
        return Http::timeout(30)
            ->withHeaders([
                'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
                'Accept' => 'application/json',
                'Origin' => 'https://www.huxiu.com',
                'Referer' => 'https://www.huxiu.com/',
            ])
            ->asForm()
            ->post($endpoint, $form);
    }

    /**
     * Format a positive Unix timestamp as `Y-m-d`; returns null for missing,
     * non-numeric or non-positive values.
     *
     * NOTE(review): gmdate() formats in UTC. If published dates are meant to
     * be in the site's local timezone, timestamps shortly after local midnight
     * will map to the previous day — confirm the intended timezone.
     */
    private function unixTimestampToDate(mixed $value): ?string
    {
        if (! is_numeric($value)) {
            return null;
        }

        $timestamp = (int) $value;

        return $timestamp > 0 ? gmdate('Y-m-d', $timestamp) : null;
    }
}