From 780a7a24e2d5d50c2586bf6957647222482b2b85 Mon Sep 17 00:00:00 2001 From: lion <120344285@qq.com> Date: Mon, 22 Jun 2026 17:58:45 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Crawl/Adapters/HuxiuHtmlAdapter.php | 85 +++++++++++++++++++ app/Services/Crawl/CrawlSourceResolver.php | 8 ++ app/Services/Crawl/NewsContentHtml.php | 11 ++- tests/Unit/HuxiuHtmlAdapterTest.php | 38 +++++++++ tests/Unit/NewsContentHtmlTest.php | 18 ++++ 5 files changed, 158 insertions(+), 2 deletions(-) create mode 100644 tests/Unit/HuxiuHtmlAdapterTest.php diff --git a/app/Services/Crawl/Adapters/HuxiuHtmlAdapter.php b/app/Services/Crawl/Adapters/HuxiuHtmlAdapter.php index 456a828..89b6a42 100644 --- a/app/Services/Crawl/Adapters/HuxiuHtmlAdapter.php +++ b/app/Services/Crawl/Adapters/HuxiuHtmlAdapter.php @@ -12,6 +12,8 @@ class HuxiuHtmlAdapter extends GenericNewsHtmlAdapter { protected const API_URL = 'https://api-web-article.huxiu.com/web/channel/articleListV1'; + protected const ARTICLE_DETAIL_URL = 'https://api-web-article.huxiu.com/web/article/detail'; + protected const PAGE_SIZE = 20; public function fetch(string $requestUrl, CrawlSource $source, array $params): array @@ -163,4 +165,87 @@ class HuxiuHtmlAdapter extends GenericNewsHtmlAdapter extra: ['platform' => 'huxiu'], ); } + + /** + * @return array{title:?string, summary:?string, content_html:?string, published_at:?string} + */ + protected function fetchArticleDetail(string $url): array + { + $aid = $this->resolveArticleId($url); + if ($aid !== null) { + $apiDetail = $this->requestArticleDetail($aid, $url); + if ($apiDetail !== null) { + return $apiDetail; + } + } + + return parent::fetchArticleDetail($url); + } + + protected function resolveArticleId(string $url): ?int + { + $path = (string) parse_url($url, PHP_URL_PATH); + if (preg_match('#/article/(\d+)\.html#i', $path, $match)) { + return (int) $match[1]; + } + + return null; + } + + /** + * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}|null + */ + protected function requestArticleDetail(int $aid, string $articleUrl): ?array + { + $response = Http::timeout(30) + ->withHeaders([ + 'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)', + 'Accept' => 'application/json', + 'Origin' => 'https://www.huxiu.com', + 'Referer' => 'https://www.huxiu.com/', + ]) + ->asForm() + ->post(self::ARTICLE_DETAIL_URL, [ + 'platform' => 'www', + 'aid' => (string) $aid, + ]); + + if (! $response->successful()) { + return null; + } + + $json = $response->json(); + if (! is_array($json) || empty($json['success'])) { + return null; + } + + $data = $json['data'] ?? []; + if (! is_array($data)) { + return null; + } + + $content = trim((string) ($data['content'] ?? '')); + if ($content === '' || mb_strlen(strip_tags($content)) < 30) { + return null; + } + + $title = HtmlCrawlSupport::cleanArticleTitle(trim((string) ($data['title'] ?? ''))); + $summary = trim((string) ($data['summary'] ?? '')); + $publishedAt = null; + if (! empty($data['fdateline'])) { + $publishedAt = HtmlCrawlSupport::normalizeDate((string) $data['fdateline']); + } elseif (! empty($data['dateline'])) { + $timestamp = (int) $data['dateline']; + if ($timestamp > 0) { + $publishedAt = gmdate('Y-m-d', $timestamp); + } + } + + return [ + 'title' => $title, + 'summary' => $summary !== '' ? $summary : null, + 'content_html' => \App\Services\Crawl\NewsContentHtml::normalize($content, $articleUrl), + 'published_at' => $publishedAt, + ]; + } } diff --git a/app/Services/Crawl/CrawlSourceResolver.php b/app/Services/Crawl/CrawlSourceResolver.php index 6a811b6..cf5e564 100644 --- a/app/Services/Crawl/CrawlSourceResolver.php +++ b/app/Services/Crawl/CrawlSourceResolver.php @@ -85,6 +85,14 @@ class CrawlSourceResolver return $sources->firstWhere('adapter_code', 'pedaily_html'); } + if ($targetType === 'industry_news' && str_contains($lower, 'huxiu.com')) { + return $sources->firstWhere('adapter_code', 'huxiu_html'); + } + + if ($targetType === 'industry_news' && str_contains($lower, 'pedata.cn')) { + return $sources->firstWhere('adapter_code', 'generic_news_html'); + } + if ($targetType === 'paper' && (str_contains($lower, 'arxiv.org') || str_contains($lower, 'arxiv'))) { return $sources->firstWhere('adapter_code', 'arxiv_api'); } diff --git a/app/Services/Crawl/NewsContentHtml.php b/app/Services/Crawl/NewsContentHtml.php index 2f09002..817dd65 100644 --- a/app/Services/Crawl/NewsContentHtml.php +++ b/app/Services/Crawl/NewsContentHtml.php @@ -15,6 +15,9 @@ class NewsContentHtml // 上海交大等:Article_content + Article-source '#]+class=["\'][^"\']*Article_content[^"\']*["\'][^>]*>(.*?)\s*]+class=["\'][^"\']*Article-source#is', '#]+class=["\'][^"\']*Article_content[^"\']*["\'][^>]*>(.*?)#is', + '#]+class=["\'][^"\']*article_main[^"\']*["\'][^>]*>(.*?)\s*\s*
]*>(.*?)
#is', + '#]+class=["\'][^"\']*article_con[^"\']*["\'][^>]*>(.*?)\s*
]*>(.*?)
#is', '#]+id=["\'](?:vsb_content|v_news_content|news_content|zoom)["\'][^>]*>(.*?)#is', '#]+class=["\'][^"\']*v_news_content[^"\']*["\'][^>]*>(.*?)#is', @@ -25,8 +28,12 @@ class NewsContentHtml ]; foreach ($patterns as $pattern) { - if (preg_match($pattern, $html, $m)) { - $body = trim($m[1]); + if (! preg_match_all($pattern, $html, $matches)) { + continue; + } + + foreach ($matches[1] as $body) { + $body = trim((string) $body); $len = mb_strlen(strip_tags($body)); if ($len >= 30) { $candidates[$len] = $body; diff --git a/tests/Unit/HuxiuHtmlAdapterTest.php b/tests/Unit/HuxiuHtmlAdapterTest.php new file mode 100644 index 0000000..f7e4ceb --- /dev/null +++ b/tests/Unit/HuxiuHtmlAdapterTest.php @@ -0,0 +1,38 @@ +setAccessible(true); + + $this->assertSame(115, $method->invoke($adapter, 'https://www.huxiu.com/channel/115.html')); + $this->assertNull($method->invoke($adapter, 'https://www.huxiu.com/article/123.html')); + } + + public function test_fetches_article_detail_via_api(): void + { + $adapter = new HuxiuHtmlAdapter( + app(\App\Services\Crawl\NewsCategoryMatcher::class), + app(\App\Services\Crawl\NewsHtmlImageLocalizer::class), + ); + $method = new \ReflectionMethod($adapter, 'fetchArticleDetail'); + $method->setAccessible(true); + + $detail = $method->invoke($adapter, 'https://www.huxiu.com/article/4869203.html'); + + $this->assertNotEmpty($detail['title']); + $this->assertNotEmpty($detail['content_html']); + $this->assertGreaterThan(200, mb_strlen(strip_tags((string) $detail['content_html']))); + } +} diff --git a/tests/Unit/NewsContentHtmlTest.php b/tests/Unit/NewsContentHtmlTest.php index 43ff27e..4e1af5f 100644 --- a/tests/Unit/NewsContentHtmlTest.php +++ b/tests/Unit/NewsContentHtmlTest.php @@ -33,4 +33,22 @@ HTML; $this->assertStringContainsString('https://news.sjtu.edu.cn/resource/upload/a.png', $normalized); } + + public function test_extracts_pedata_article_main(): void + { + $html = <<<'HTML' +
+
+

清科研究中心正文段落一,包含足够长的文字用于通过正文提取阈值校验。

+

第二段正文内容继续补充长度,确保 strip_tags 后超过三十个字符。

+
+
+
版权声明
+HTML; + + $body = NewsContentHtml::extractBody($html); + + $this->assertNotNull($body); + $this->assertStringContainsString('清科研究中心正文段落一', $body); + } }