修改

2 days ago · 780a7a24e2
parent bd8527fc55
commit 780a7a24e2
5 changed files with 158 additions and 2 deletions
--- a/app/Services/Crawl/Adapters/HuxiuHtmlAdapter.php
+++ b/app/Services/Crawl/Adapters/HuxiuHtmlAdapter.php
@ -12,6 +12,8 @@ class HuxiuHtmlAdapter extends GenericNewsHtmlAdapter
 {
    protected const API_URL = 'https://api-web-article.huxiu.com/web/channel/articleListV1';

+    /** Endpoint that requestArticleDetail() POSTs to in order to load a single article. */
+    protected const ARTICLE_DETAIL_URL = 'https://api-web-article.huxiu.com/web/article/detail';
+
    protected const PAGE_SIZE = 20;

    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
@ -163,4 +165,87 @@ class HuxiuHtmlAdapter extends GenericNewsHtmlAdapter
            extra: ['platform' => 'huxiu'],
        );
    }
+
+    /**
+     * Load an article's detail, preferring the Huxiu detail API over the inherited fetch.
+     *
+     * The API is consulted only when an article id can be parsed out of $url. When no id
+     * is found, or the API lookup yields null, the parent implementation handles the URL.
+     *
+     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}
+     */
+    protected function fetchArticleDetail(string $url): array
+    {
+        $articleId = $this->resolveArticleId($url);
+        $fromApi = $articleId === null
+            ? null
+            : $this->requestArticleDetail($articleId, $url);
+
+        // The parent fetch runs only when the API path produced nothing.
+        return $fromApi ?? parent::fetchArticleDetail($url);
+    }
+
+    /**
+     * Extract the numeric article id from a URL whose path contains "/article/<digits>.html".
+     *
+     * @return int|null The matched digits cast to int, or null when the path does not match.
+     */
+    protected function resolveArticleId(string $url): ?int
+    {
+        // parse_url() may not return a string for URLs without a path; the cast turns that into ''.
+        $urlPath = (string) parse_url($url, PHP_URL_PATH);
+        $found = preg_match('#/article/(\d+)\.html#i', $urlPath, $groups);
+
+        return $found ? (int) $groups[1] : null;
+    }
+
+    /**
+     * Request one article from the Huxiu article-detail API and map it to the adapter's detail shape.
+     *
+     * Returns null, so the caller can fall back to another strategy, when:
+     *  - the request cannot be completed (connection failure or timeout),
+     *  - the response status is not successful,
+     *  - the JSON payload lacks a truthy "success" flag or an array "data" field,
+     *  - the returned content has fewer than 30 characters of text once tags are stripped.
+     *
+     * @param int    $aid        Article id, sent as the "aid" form field.
+     * @param string $articleUrl Article page URL, handed to NewsContentHtml::normalize() together with the content.
+     *
+     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}|null
+     */
+    protected function requestArticleDetail(int $aid, string $articleUrl): ?array
+    {
+        try {
+            $response = Http::timeout(30)
+                ->withHeaders([
+                    'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
+                    'Accept' => 'application/json',
+                    'Origin' => 'https://www.huxiu.com',
+                    'Referer' => 'https://www.huxiu.com/',
+                ])
+                ->asForm()
+                ->post(self::ARTICLE_DETAIL_URL, [
+                    'platform' => 'www',
+                    'aid' => (string) $aid,
+                ]);
+        } catch (\Illuminate\Http\Client\ConnectionException) {
+            // The HTTP client throws on timeouts and connection failures. Treat that like
+            // any other API miss so fetchArticleDetail() can still fall back to the parent fetch.
+            return null;
+        }
+
+        if (! $response->successful()) {
+            return null;
+        }
+
+        $json = $response->json();
+        if (! is_array($json) || empty($json['success'])) {
+            return null;
+        }
+
+        $data = $json['data'] ?? [];
+        if (! is_array($data)) {
+            return null;
+        }
+
+        // Reject payloads whose body is missing or too short to be a real article.
+        $content = trim((string) ($data['content'] ?? ''));
+        if ($content === '' || mb_strlen(strip_tags($content)) < 30) {
+            return null;
+        }
+
+        $title = HtmlCrawlSupport::cleanArticleTitle(trim((string) ($data['title'] ?? '')));
+        $summary = trim((string) ($data['summary'] ?? ''));
+
+        // Prefer the "fdateline" field; fall back to the numeric "dateline" timestamp.
+        // NOTE(review): the fallback formats the timestamp as a UTC calendar date, while the
+        // first branch relies on HtmlCrawlSupport::normalizeDate(). Confirm both yield the
+        // same day for articles published shortly after midnight local time.
+        $publishedAt = null;
+        if (! empty($data['fdateline'])) {
+            $publishedAt = HtmlCrawlSupport::normalizeDate((string) $data['fdateline']);
+        } elseif (! empty($data['dateline'])) {
+            $timestamp = (int) $data['dateline'];
+            if ($timestamp > 0) {
+                $publishedAt = gmdate('Y-m-d', $timestamp);
+            }
+        }
+
+        return [
+            'title' => $title,
+            'summary' => $summary !== '' ? $summary : null,
+            'content_html' => \App\Services\Crawl\NewsContentHtml::normalize($content, $articleUrl),
+            'published_at' => $publishedAt,
+        ];
+    }
 }
--- a/app/Services/Crawl/CrawlSourceResolver.php
+++ b/app/Services/Crawl/CrawlSourceResolver.php
@ -85,6 +85,14 @@ class CrawlSourceResolver
            return $sources->firstWhere('adapter_code', 'pedaily_html');
        }

+        if ($targetType === 'industry_news' && str_contains($lower, 'huxiu.com')) {
+            return $sources->firstWhere('adapter_code', 'huxiu_html');
+        }
+
+        if ($targetType === 'industry_news' && str_contains($lower, 'pedata.cn')) {
+            return $sources->firstWhere('adapter_code', 'generic_news_html');
+        }
+
        if ($targetType === 'paper' && (str_contains($lower, 'arxiv.org') || str_contains($lower, 'arxiv'))) {
            return $sources->firstWhere('adapter_code', 'arxiv_api');
        }
--- a/app/Services/Crawl/NewsContentHtml.php
+++ b/app/Services/Crawl/NewsContentHtml.php
@ -15,6 +15,9 @@ class NewsContentHtml
            // 上海交大等：Article_content + Article-source
            '#<div[^>]+class=["\'][^"\']*Article_content[^"\']*["\'][^>]*>(.*?)</div>\s*<div[^>]+class=["\'][^"\']*Article-source#is',
            '#<div[^>]+class=["\'][^"\']*Article_content[^"\']*["\'][^>]*>(.*?)</div>#is',
+            '#<div[^>]+class=["\'][^"\']*article_main[^"\']*["\'][^>]*>(.*?)</div>\s*</div>\s*<div class="news_all_text#is',
+            '#<div[^>]+class=["\'][^"\']*article_main[^"\']*["\'][^>]*>(.*?)</div>#is',
+            '#<div[^>]+class=["\'][^"\']*article_con[^"\']*["\'][^>]*>(.*?)</div>\s*<div class="news_all_text#is',
            '#<div[^>]+id=["\']ivs_content["\'][^>]*>(.*?)</div>#is',
            '#<div[^>]+id=["\'](?:vsb_content|v_news_content|news_content|zoom)["\'][^>]*>(.*?)</div>#is',
            '#<div[^>]+class=["\'][^"\']*v_news_content[^"\']*["\'][^>]*>(.*?)</div>#is',
@ -25,8 +28,12 @@ class NewsContentHtml
        ];

        foreach ($patterns as $pattern) {
-            if (preg_match($pattern, $html, $m)) {
-                $body = trim($m[1]);
+            if (! preg_match_all($pattern, $html, $matches)) {
+                continue;
+            }
+
+            foreach ($matches[1] as $body) {
+                $body = trim((string) $body);
                $len = mb_strlen(strip_tags($body));
                if ($len >= 30) {
                    $candidates[$len] = $body;
--- a/tests/Unit/HuxiuHtmlAdapterTest.php
+++ b/tests/Unit/HuxiuHtmlAdapterTest.php
@ -0,0 +1,38 @@
+<?php
+
+namespace Tests\Unit;
+
+use App\Services\Crawl\Adapters\HuxiuHtmlAdapter;
+use Tests\TestCase;
+
+/**
+ * Unit tests for HuxiuHtmlAdapter's protected URL-parsing and article-detail helpers.
+ *
+ * Protected methods are invoked through reflection. HTTP traffic to the Huxiu API is
+ * stubbed with Http::fake() so the suite does not depend on the live service.
+ */
+class HuxiuHtmlAdapterTest extends TestCase
+{
+    public function test_resolves_channel_id_from_url(): void
+    {
+        $adapter = $this->makeAdapter();
+        $method = new \ReflectionMethod($adapter, 'resolveChannelId');
+        $method->setAccessible(true);
+
+        $this->assertSame(115, $method->invoke($adapter, 'https://www.huxiu.com/channel/115.html'));
+        $this->assertNull($method->invoke($adapter, 'https://www.huxiu.com/article/123.html'));
+    }
+
+    public function test_fetches_article_detail_via_api(): void
+    {
+        // 15 characters x 20 repetitions = 300 characters of text, above the 200-character assertion below.
+        $content = '<p>' . str_repeat('虎嗅文章正文用于测试接口解析。', 20) . '</p>';
+
+        \Illuminate\Support\Facades\Http::fake([
+            'api-web-article.huxiu.com/*' => \Illuminate\Support\Facades\Http::response([
+                'success' => true,
+                'data' => [
+                    'title' => '测试文章标题',
+                    'summary' => '测试摘要',
+                    'content' => $content,
+                    // 2023-11-14 22:13:20 UTC
+                    'dateline' => 1700000000,
+                ],
+            ]),
+        ]);
+
+        $adapter = $this->makeAdapter();
+        $method = new \ReflectionMethod($adapter, 'fetchArticleDetail');
+        $method->setAccessible(true);
+
+        $detail = $method->invoke($adapter, 'https://www.huxiu.com/article/4869203.html');
+
+        // The article id parsed from the URL must be what is posted to the detail endpoint.
+        \Illuminate\Support\Facades\Http::assertSent(
+            static function (\Illuminate\Http\Client\Request $request): bool {
+                return $request->url() === 'https://api-web-article.huxiu.com/web/article/detail'
+                    && $request['aid'] === '4869203';
+            }
+        );
+
+        $this->assertNotEmpty($detail['title']);
+        $this->assertSame('测试摘要', $detail['summary']);
+        $this->assertSame('2023-11-14', $detail['published_at']);
+        $this->assertNotEmpty($detail['content_html']);
+        $this->assertGreaterThan(200, mb_strlen(strip_tags((string) $detail['content_html'])));
+    }
+
+    /**
+     * Build the adapter under test with its collaborators resolved from the container.
+     */
+    private function makeAdapter(): HuxiuHtmlAdapter
+    {
+        return new HuxiuHtmlAdapter(
+            app(\App\Services\Crawl\NewsCategoryMatcher::class),
+            app(\App\Services\Crawl\NewsHtmlImageLocalizer::class),
+        );
+    }
+}
--- a/tests/Unit/NewsContentHtmlTest.php
+++ b/tests/Unit/NewsContentHtmlTest.php
@ -33,4 +33,22 @@ HTML;

        $this->assertStringContainsString('https://news.sjtu.edu.cn/resource/upload/a.png', $normalized);
    }
+
+    /**
+     * extractBody() must return a non-null body containing the first paragraph of an
+     * article_main block that is nested in article_con and followed by a news_all_text div.
+     */
+    public function test_extracts_pedata_article_main(): void
+    {
+        $markup = <<<'HTML'
+<div class="article_con">
+<div class="article_main">
+<p>清科研究中心正文段落一，包含足够长的文字用于通过正文提取阈值校验。</p>
+<p>第二段正文内容继续补充长度，确保 strip_tags 后超过三十个字符。</p>
+</div>
+</div>
+<div class="news_all_text">版权声明</div>
+HTML;
+
+        $extracted = NewsContentHtml::extractBody($markup);
+
+        $this->assertNotNull($extracted);
+        $this->assertStringContainsString('清科研究中心正文段落一', $extracted);
+    }
 }