From 780a7a24e2d5d50c2586bf6957647222482b2b85 Mon Sep 17 00:00:00 2001 From: lion <120344285@qq.com> Date: Mon, 22 Jun 2026 17:58:45 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Crawl/Adapters/HuxiuHtmlAdapter.php | 85 +++++++++++++++++++ app/Services/Crawl/CrawlSourceResolver.php | 8 ++ app/Services/Crawl/NewsContentHtml.php | 11 ++- tests/Unit/HuxiuHtmlAdapterTest.php | 38 +++++++++ tests/Unit/NewsContentHtmlTest.php | 18 ++++ 5 files changed, 158 insertions(+), 2 deletions(-) create mode 100644 tests/Unit/HuxiuHtmlAdapterTest.php diff --git a/app/Services/Crawl/Adapters/HuxiuHtmlAdapter.php b/app/Services/Crawl/Adapters/HuxiuHtmlAdapter.php index 456a828..89b6a42 100644 --- a/app/Services/Crawl/Adapters/HuxiuHtmlAdapter.php +++ b/app/Services/Crawl/Adapters/HuxiuHtmlAdapter.php @@ -12,6 +12,8 @@ class HuxiuHtmlAdapter extends GenericNewsHtmlAdapter { protected const API_URL = 'https://api-web-article.huxiu.com/web/channel/articleListV1'; + protected const ARTICLE_DETAIL_URL = 'https://api-web-article.huxiu.com/web/article/detail'; + protected const PAGE_SIZE = 20; public function fetch(string $requestUrl, CrawlSource $source, array $params): array @@ -163,4 +165,87 @@ class HuxiuHtmlAdapter extends GenericNewsHtmlAdapter extra: ['platform' => 'huxiu'], ); } + + /** + * @return array{title:?string, summary:?string, content_html:?string, published_at:?string} + */ + protected function fetchArticleDetail(string $url): array + { + $aid = $this->resolveArticleId($url); + if ($aid !== null) { + $apiDetail = $this->requestArticleDetail($aid, $url); + if ($apiDetail !== null) { + return $apiDetail; + } + } + + return parent::fetchArticleDetail($url); + } + + protected function resolveArticleId(string $url): ?int + { + $path = (string) parse_url($url, PHP_URL_PATH); + if (preg_match('#/article/(\d+)\.html#i', $path, $match)) { + return (int) $match[1]; + } + + return null; + } + + /** + * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}|null + */ + protected function requestArticleDetail(int $aid, string $articleUrl): ?array + { + $response = Http::timeout(30) + ->withHeaders([ + 'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)', + 'Accept' => 'application/json', + 'Origin' => 'https://www.huxiu.com', + 'Referer' => 'https://www.huxiu.com/', + ]) + ->asForm() + ->post(self::ARTICLE_DETAIL_URL, [ + 'platform' => 'www', + 'aid' => (string) $aid, + ]); + + if (! $response->successful()) { + return null; + } + + $json = $response->json(); + if (! is_array($json) || empty($json['success'])) { + return null; + } + + $data = $json['data'] ?? []; + if (! is_array($data)) { + return null; + } + + $content = trim((string) ($data['content'] ?? '')); + if ($content === '' || mb_strlen(strip_tags($content)) < 30) { + return null; + } + + $title = HtmlCrawlSupport::cleanArticleTitle(trim((string) ($data['title'] ?? ''))); + $summary = trim((string) ($data['summary'] ?? '')); + $publishedAt = null; + if (! empty($data['fdateline'])) { + $publishedAt = HtmlCrawlSupport::normalizeDate((string) $data['fdateline']); + } elseif (! empty($data['dateline'])) { + $timestamp = (int) $data['dateline']; + if ($timestamp > 0) { + $publishedAt = gmdate('Y-m-d', $timestamp); + } + } + + return [ + 'title' => $title, + 'summary' => $summary !== '' ? $summary : null, + 'content_html' => \App\Services\Crawl\NewsContentHtml::normalize($content, $articleUrl), + 'published_at' => $publishedAt, + ]; + } } diff --git a/app/Services/Crawl/CrawlSourceResolver.php b/app/Services/Crawl/CrawlSourceResolver.php index 6a811b6..cf5e564 100644 --- a/app/Services/Crawl/CrawlSourceResolver.php +++ b/app/Services/Crawl/CrawlSourceResolver.php @@ -85,6 +85,14 @@ class CrawlSourceResolver return $sources->firstWhere('adapter_code', 'pedaily_html'); } + if ($targetType === 'industry_news' && str_contains($lower, 'huxiu.com')) { + return $sources->firstWhere('adapter_code', 'huxiu_html'); + } + + if ($targetType === 'industry_news' && str_contains($lower, 'pedata.cn')) { + return $sources->firstWhere('adapter_code', 'generic_news_html'); + } + if ($targetType === 'paper' && (str_contains($lower, 'arxiv.org') || str_contains($lower, 'arxiv'))) { return $sources->firstWhere('adapter_code', 'arxiv_api'); } diff --git a/app/Services/Crawl/NewsContentHtml.php b/app/Services/Crawl/NewsContentHtml.php index 2f09002..817dd65 100644 --- a/app/Services/Crawl/NewsContentHtml.php +++ b/app/Services/Crawl/NewsContentHtml.php @@ -15,6 +15,9 @@ class NewsContentHtml // 上海交大等:Article_content + Article-source '#
清科研究中心正文段落一,包含足够长的文字用于通过正文提取阈值校验。
+第二段正文内容继续补充长度,确保 strip_tags 后超过三十个字符。
+