From 780a7a24e2d5d50c2586bf6957647222482b2b85 Mon Sep 17 00:00:00 2001
From: lion <120344285@qq.com>
Date: Mon, 22 Jun 2026 17:58:45 +0800
Subject: [PATCH] Add Huxiu article detail API fetch and pedata.cn body extraction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../Crawl/Adapters/HuxiuHtmlAdapter.php       | 85 +++++++++++++++++++
 app/Services/Crawl/CrawlSourceResolver.php    |  8 ++
 app/Services/Crawl/NewsContentHtml.php        | 11 ++-
 tests/Unit/HuxiuHtmlAdapterTest.php           | 38 +++++++++
 tests/Unit/NewsContentHtmlTest.php            | 18 ++++
 5 files changed, 158 insertions(+), 2 deletions(-)
 create mode 100644 tests/Unit/HuxiuHtmlAdapterTest.php

diff --git a/app/Services/Crawl/Adapters/HuxiuHtmlAdapter.php b/app/Services/Crawl/Adapters/HuxiuHtmlAdapter.php
index 456a828..89b6a42 100644
--- a/app/Services/Crawl/Adapters/HuxiuHtmlAdapter.php
+++ b/app/Services/Crawl/Adapters/HuxiuHtmlAdapter.php
@@ -12,6 +12,8 @@ class HuxiuHtmlAdapter extends GenericNewsHtmlAdapter
 {
     protected const API_URL = 'https://api-web-article.huxiu.com/web/channel/articleListV1';
 
+    protected const ARTICLE_DETAIL_URL = 'https://api-web-article.huxiu.com/web/article/detail';
+
     protected const PAGE_SIZE = 20;
 
     public function fetch(string $requestUrl, CrawlSource $source, array $params): array
@@ -163,4 +165,119 @@ class HuxiuHtmlAdapter extends GenericNewsHtmlAdapter
             extra: ['platform' => 'huxiu'],
         );
     }
+
+    /**
+     * Fetch one article's detail, preferring the JSON detail API over the parent's implementation.
+     *
+     * Falls back to parent::fetchArticleDetail() when the URL carries no article id or
+     * when the API attempt returns null.
+     *
+     * @param string $url Article page URL.
+     *
+     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}
+     */
+    protected function fetchArticleDetail(string $url): array
+    {
+        $aid = $this->resolveArticleId($url);
+        if ($aid !== null) {
+            $apiDetail = $this->requestArticleDetail($aid, $url);
+            if ($apiDetail !== null) {
+                return $apiDetail;
+            }
+        }
+
+        return parent::fetchArticleDetail($url);
+    }
+
+    /**
+     * Extract the numeric article id from a URL whose path contains "/article/<digits>.html".
+     *
+     * @param string $url Article page URL.
+     *
+     * @return int|null The id, or null when the path does not match.
+     */
+    protected function resolveArticleId(string $url): ?int
+    {
+        $path = (string) parse_url($url, PHP_URL_PATH);
+        if (preg_match('#/article/(\d+)\.html#i', $path, $match)) {
+            return (int) $match[1];
+        }
+
+        return null;
+    }
+
+    /**
+     * Request article detail from the JSON API and map it to the adapter's detail shape.
+     *
+     * Returns null, so the caller can fall back, when the request cannot be completed,
+     * the HTTP status is not successful, the payload has no truthy "success" flag or no
+     * array "data", or the body holds fewer than 30 characters once tags are stripped.
+     *
+     * @param int    $aid        Article id, sent as the "aid" form field.
+     * @param string $articleUrl Original article URL, handed to NewsContentHtml::normalize().
+     *
+     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}|null
+     */
+    protected function requestArticleDetail(int $aid, string $articleUrl): ?array
+    {
+        try {
+            $response = Http::timeout(30)
+                ->withHeaders([
+                    'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
+                    'Accept' => 'application/json',
+                    'Origin' => 'https://www.huxiu.com',
+                    'Referer' => 'https://www.huxiu.com/',
+                ])
+                ->asForm()
+                ->post(self::ARTICLE_DETAIL_URL, [
+                    'platform' => 'www',
+                    'aid' => (string) $aid,
+                ]);
+        } catch (\Illuminate\Http\Client\ConnectionException) {
+            // A timeout or connection failure is just another API miss: returning null
+            // lets fetchArticleDetail() fall back instead of aborting on the exception.
+            return null;
+        }
+
+        if (! $response->successful()) {
+            return null;
+        }
+
+        $json = $response->json();
+        if (! is_array($json) || empty($json['success'])) {
+            return null;
+        }
+
+        $data = $json['data'] ?? [];
+        if (! is_array($data)) {
+            return null;
+        }
+
+        $content = trim((string) ($data['content'] ?? ''));
+        if ($content === '' || mb_strlen(strip_tags($content)) < 30) {
+            return null;
+        }
+
+        $title = HtmlCrawlSupport::cleanArticleTitle(trim((string) ($data['title'] ?? '')));
+        $summary = trim((string) ($data['summary'] ?? ''));
+
+        // Prefer the "fdateline" string; otherwise derive a date from the numeric "dateline".
+        $publishedAt = null;
+        if (! empty($data['fdateline'])) {
+            $publishedAt = HtmlCrawlSupport::normalizeDate((string) $data['fdateline']);
+        } elseif (! empty($data['dateline'])) {
+            $timestamp = (int) $data['dateline'];
+            if ($timestamp > 0) {
+                // NOTE(review): gmdate() formats in UTC; confirm that is the intended zone.
+                $publishedAt = gmdate('Y-m-d', $timestamp);
+            }
+        }
+
+        return [
+            'title' => $title,
+            'summary' => $summary !== '' ? $summary : null,
+            'content_html' => \App\Services\Crawl\NewsContentHtml::normalize($content, $articleUrl),
+            'published_at' => $publishedAt,
+        ];
+    }
 }
diff --git a/app/Services/Crawl/CrawlSourceResolver.php b/app/Services/Crawl/CrawlSourceResolver.php
index 6a811b6..cf5e564 100644
--- a/app/Services/Crawl/CrawlSourceResolver.php
+++ b/app/Services/Crawl/CrawlSourceResolver.php
@@ -85,6 +85,14 @@ class CrawlSourceResolver
             return $sources->firstWhere('adapter_code', 'pedaily_html');
         }
 
+        if ($targetType === 'industry_news' && str_contains($lower, 'huxiu.com')) {
+            return $sources->firstWhere('adapter_code', 'huxiu_html');
+        }
+
+        if ($targetType === 'industry_news' && str_contains($lower, 'pedata.cn')) {
+            return $sources->firstWhere('adapter_code', 'generic_news_html');
+        }
+
         if ($targetType === 'paper' && (str_contains($lower, 'arxiv.org') || str_contains($lower, 'arxiv'))) {
             return $sources->firstWhere('adapter_code', 'arxiv_api');
         }
diff --git a/app/Services/Crawl/NewsContentHtml.php b/app/Services/Crawl/NewsContentHtml.php
index 2f09002..817dd65 100644
--- a/app/Services/Crawl/NewsContentHtml.php
+++ b/app/Services/Crawl/NewsContentHtml.php
@@ -15,6 +15,9 @@ class NewsContentHtml
             // 上海交大等：Article_content + Article-source
             '#<div[^>]+class=["\'][^"\']*Article_content[^"\']*["\'][^>]*>(.*?)</div>\s*<div[^>]+class=["\'][^"\']*Article-source#is',
             '#<div[^>]+class=["\'][^"\']*Article_content[^"\']*["\'][^>]*>(.*?)</div>#is',
+            '#<div[^>]+class=["\'][^"\']*article_main[^"\']*["\'][^>]*>(.*?)</div>\s*</div>\s*<div class="news_all_text#is',
+            '#<div[^>]+class=["\'][^"\']*article_main[^"\']*["\'][^>]*>(.*?)</div>#is',
+            '#<div[^>]+class=["\'][^"\']*article_con[^"\']*["\'][^>]*>(.*?)</div>\s*<div class="news_all_text#is',
             '#<div[^>]+id=["\']ivs_content["\'][^>]*>(.*?)</div>#is',
             '#<div[^>]+id=["\'](?:vsb_content|v_news_content|news_content|zoom)["\'][^>]*>(.*?)</div>#is',
             '#<div[^>]+class=["\'][^"\']*v_news_content[^"\']*["\'][^>]*>(.*?)</div>#is',
@@ -25,8 +28,12 @@ class NewsContentHtml
         ];
 
         foreach ($patterns as $pattern) {
-            if (preg_match($pattern, $html, $m)) {
-                $body = trim($m[1]);
+            if (! preg_match_all($pattern, $html, $matches)) {
+                continue;
+            }
+
+            foreach ($matches[1] as $body) {
+                $body = trim((string) $body);
                 $len = mb_strlen(strip_tags($body));
                 if ($len >= 30) {
                     $candidates[$len] = $body;
diff --git a/tests/Unit/HuxiuHtmlAdapterTest.php b/tests/Unit/HuxiuHtmlAdapterTest.php
new file mode 100644
index 0000000..f7e4ceb
--- /dev/null
+++ b/tests/Unit/HuxiuHtmlAdapterTest.php
@@ -0,0 +1,82 @@
+<?php
+
+namespace Tests\Unit;
+
+use App\Services\Crawl\Adapters\HuxiuHtmlAdapter;
+use Illuminate\Http\Client\Request;
+use Illuminate\Support\Facades\Http;
+use Tests\TestCase;
+
+/**
+ * Tests for HuxiuHtmlAdapter URL parsing and its API-backed article detail fetch.
+ */
+class HuxiuHtmlAdapterTest extends TestCase
+{
+    public function test_resolves_channel_id_from_url(): void
+    {
+        $adapter = $this->makeAdapter();
+
+        $this->assertSame(115, $this->callProtected($adapter, 'resolveChannelId', 'https://www.huxiu.com/channel/115.html'));
+        $this->assertNull($this->callProtected($adapter, 'resolveChannelId', 'https://www.huxiu.com/article/123.html'));
+    }
+
+    public function test_resolves_article_id_from_url(): void
+    {
+        $adapter = $this->makeAdapter();
+
+        $this->assertSame(4869203, $this->callProtected($adapter, 'resolveArticleId', 'https://www.huxiu.com/article/4869203.html'));
+        $this->assertNull($this->callProtected($adapter, 'resolveArticleId', 'https://www.huxiu.com/channel/115.html'));
+    }
+
+    public function test_fetches_article_detail_via_api(): void
+    {
+        // Stub the detail endpoint so the test is deterministic and never calls the live API.
+        Http::fake([
+            'api-web-article.huxiu.com/web/article/detail' => Http::response([
+                'success' => true,
+                'data' => [
+                    'title' => 'Sample Huxiu article',
+                    'summary' => 'Sample summary',
+                    'content' => '<p>' . str_repeat('Huxiu article body sentence. ', 10) . '</p>',
+                    'fdateline' => '2026-06-22',
+                ],
+            ]),
+            // Any other request fails fast instead of reaching the network.
+            '*' => Http::response('', 500),
+        ]);
+
+        $detail = $this->callProtected($this->makeAdapter(), 'fetchArticleDetail', 'https://www.huxiu.com/article/4869203.html');
+
+        $this->assertNotEmpty($detail['title']);
+        $this->assertNotEmpty($detail['content_html']);
+        $this->assertGreaterThan(200, mb_strlen(strip_tags((string) $detail['content_html'])));
+
+        // The adapter must have posted the article id to the detail endpoint.
+        Http::assertSent(static function (Request $request): bool {
+            return $request->url() === 'https://api-web-article.huxiu.com/web/article/detail'
+                && ($request->data()['aid'] ?? null) === '4869203';
+        });
+    }
+
+    /**
+     * Build the adapter with its collaborators resolved from the container.
+     */
+    private function makeAdapter(): HuxiuHtmlAdapter
+    {
+        return new HuxiuHtmlAdapter(
+            app(\App\Services\Crawl\NewsCategoryMatcher::class),
+            app(\App\Services\Crawl\NewsHtmlImageLocalizer::class),
+        );
+    }
+
+    /**
+     * Invoke a protected adapter method that takes a single URL argument.
+     */
+    private function callProtected(HuxiuHtmlAdapter $adapter, string $name, string $url): mixed
+    {
+        $method = new \ReflectionMethod($adapter, $name);
+        $method->setAccessible(true);
+
+        return $method->invoke($adapter, $url);
+    }
+}
diff --git a/tests/Unit/NewsContentHtmlTest.php b/tests/Unit/NewsContentHtmlTest.php
index 43ff27e..4e1af5f 100644
--- a/tests/Unit/NewsContentHtmlTest.php
+++ b/tests/Unit/NewsContentHtmlTest.php
@@ -33,4 +33,22 @@ HTML;
 
         $this->assertStringContainsString('https://news.sjtu.edu.cn/resource/upload/a.png', $normalized);
     }
+
+    public function test_extracts_pedata_article_main(): void
+    {
+        $html = <<<'HTML'
+<div class="article_con">
+<div class="article_main">
+<p>清科研究中心正文段落一，包含足够长的文字用于通过正文提取阈值校验。</p>
+<p>第二段正文内容继续补充长度，确保 strip_tags 后超过三十个字符。</p>
+</div>
+</div>
+<div class="news_all_text">版权声明</div>
+HTML;
+        // Run the extractor over the nested article_con / article_main fixture above.
+        $body = NewsContentHtml::extractBody($html);
+        // Extraction must succeed and the result must keep the first body paragraph.
+        $this->assertNotNull($body);
+        $this->assertStringContainsString('清科研究中心正文段落一', $body);
+    }
 }