master
lion 2 days ago
parent bd8527fc55
commit 780a7a24e2

@ -12,6 +12,8 @@ class HuxiuHtmlAdapter extends GenericNewsHtmlAdapter
{
protected const API_URL = 'https://api-web-article.huxiu.com/web/channel/articleListV1';
protected const ARTICLE_DETAIL_URL = 'https://api-web-article.huxiu.com/web/article/detail';
protected const PAGE_SIZE = 20;
public function fetch(string $requestUrl, CrawlSource $source, array $params): array
@ -163,4 +165,87 @@ class HuxiuHtmlAdapter extends GenericNewsHtmlAdapter
extra: ['platform' => 'huxiu'],
);
}
/**
 * Load the detail of a single article, trying the article-detail API first.
 *
 * The API result is used when the URL yields an article id and the API call
 * produces a payload; in every other case the parent implementation handles
 * the URL.
 *
 * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}
 */
protected function fetchArticleDetail(string $url): array
{
    $articleId = $this->resolveArticleId($url);

    // No recognisable article id in the URL: nothing to ask the API for.
    if ($articleId === null) {
        return parent::fetchArticleDetail($url);
    }

    // A null API result means "unusable", so defer to the parent in that case too.
    return $this->requestArticleDetail($articleId, $url)
        ?? parent::fetchArticleDetail($url);
}
/**
 * Extract the numeric article id from a URL whose path contains
 * "/article/<digits>.html" (matched case-insensitively).
 *
 * @return int|null The id, or null when the path does not match.
 */
protected function resolveArticleId(string $url): ?int
{
    // parse_url() may yield null/false for a missing or malformed path; the cast turns that into ''.
    $urlPath = (string) parse_url($url, PHP_URL_PATH);

    return preg_match('#/article/(\d+)\.html#i', $urlPath, $captured) === 1
        ? (int) $captured[1]
        : null;
}
/**
 * Fetch one article through the article-detail API endpoint.
 *
 * Returns null when the API cannot supply a usable article, so the caller can
 * fall back to another strategy. That covers: the request could not be
 * completed (connection failure or timeout), a non-successful HTTP status, a
 * payload that is not an array or lacks a truthy "success" flag, a non-array
 * "data" member, or body text shorter than 30 characters once tags are stripped.
 *
 * @param int    $aid        Article id, sent as the "aid" form field.
 * @param string $articleUrl Article page URL, handed to NewsContentHtml::normalize().
 *
 * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}|null
 */
protected function requestArticleDetail(int $aid, string $articleUrl): ?array
{
    try {
        $response = Http::timeout(30)
            ->withHeaders([
                'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
                'Accept' => 'application/json',
                'Origin' => 'https://www.huxiu.com',
                'Referer' => 'https://www.huxiu.com/',
            ])
            ->asForm()
            ->post(self::ARTICLE_DETAIL_URL, [
                'platform' => 'www',
                'aid' => (string) $aid,
            ]);
    } catch (\Illuminate\Http\Client\ConnectionException) {
        // The HTTP client throws on connect failures and timeouts. Treat that
        // like any other unusable API answer so the caller's fallback still runs.
        return null;
    }

    if (! $response->successful()) {
        return null;
    }

    $json = $response->json();
    if (! is_array($json) || empty($json['success'])) {
        return null;
    }

    $data = $json['data'] ?? [];
    if (! is_array($data)) {
        return null;
    }

    // Reject empty or near-empty bodies: fewer than 30 visible characters is not an article.
    $content = trim((string) ($data['content'] ?? ''));
    if ($content === '' || mb_strlen(strip_tags($content)) < 30) {
        return null;
    }

    $title = HtmlCrawlSupport::cleanArticleTitle(trim((string) ($data['title'] ?? '')));
    $summary = trim((string) ($data['summary'] ?? ''));

    // Prefer the "fdateline" field; fall back to the "dateline" timestamp.
    $publishedAt = null;
    if (! empty($data['fdateline'])) {
        $publishedAt = HtmlCrawlSupport::normalizeDate((string) $data['fdateline']);
    } elseif (! empty($data['dateline'])) {
        $timestamp = (int) $data['dateline'];
        if ($timestamp > 0) {
            // NOTE(review): gmdate() formats the day in UTC — confirm that is the
            // intended calendar day for this source.
            $publishedAt = gmdate('Y-m-d', $timestamp);
        }
    }

    return [
        'title' => $title,
        'summary' => $summary !== '' ? $summary : null,
        'content_html' => \App\Services\Crawl\NewsContentHtml::normalize($content, $articleUrl),
        'published_at' => $publishedAt,
    ];
}
}

@ -85,6 +85,14 @@ class CrawlSourceResolver
return $sources->firstWhere('adapter_code', 'pedaily_html');
}
if ($targetType === 'industry_news' && str_contains($lower, 'huxiu.com')) {
return $sources->firstWhere('adapter_code', 'huxiu_html');
}
if ($targetType === 'industry_news' && str_contains($lower, 'pedata.cn')) {
return $sources->firstWhere('adapter_code', 'generic_news_html');
}
if ($targetType === 'paper' && (str_contains($lower, 'arxiv.org') || str_contains($lower, 'arxiv'))) {
return $sources->firstWhere('adapter_code', 'arxiv_api');
}

@ -15,6 +15,9 @@ class NewsContentHtml
// 上海交大等Article_content + Article-source
'#<div[^>]+class=["\'][^"\']*Article_content[^"\']*["\'][^>]*>(.*?)</div>\s*<div[^>]+class=["\'][^"\']*Article-source#is',
'#<div[^>]+class=["\'][^"\']*Article_content[^"\']*["\'][^>]*>(.*?)</div>#is',
'#<div[^>]+class=["\'][^"\']*article_main[^"\']*["\'][^>]*>(.*?)</div>\s*</div>\s*<div class="news_all_text#is',
'#<div[^>]+class=["\'][^"\']*article_main[^"\']*["\'][^>]*>(.*?)</div>#is',
'#<div[^>]+class=["\'][^"\']*article_con[^"\']*["\'][^>]*>(.*?)</div>\s*<div class="news_all_text#is',
'#<div[^>]+id=["\']ivs_content["\'][^>]*>(.*?)</div>#is',
'#<div[^>]+id=["\'](?:vsb_content|v_news_content|news_content|zoom)["\'][^>]*>(.*?)</div>#is',
'#<div[^>]+class=["\'][^"\']*v_news_content[^"\']*["\'][^>]*>(.*?)</div>#is',
@ -25,8 +28,12 @@ class NewsContentHtml
];
foreach ($patterns as $pattern) {
if (preg_match($pattern, $html, $m)) {
$body = trim($m[1]);
if (! preg_match_all($pattern, $html, $matches)) {
continue;
}
foreach ($matches[1] as $body) {
$body = trim((string) $body);
$len = mb_strlen(strip_tags($body));
if ($len >= 30) {
$candidates[$len] = $body;

@ -0,0 +1,38 @@
<?php
namespace Tests\Unit;
use App\Services\Crawl\Adapters\HuxiuHtmlAdapter;
use Tests\TestCase;
/**
 * Unit tests for HuxiuHtmlAdapter.
 *
 * All HTTP traffic is stubbed with Http::fake(), so the suite never depends on
 * the live Huxiu API or on a particular remote article staying available.
 */
class HuxiuHtmlAdapterTest extends TestCase
{
    public function test_resolves_channel_id_from_url(): void
    {
        $adapter = $this->makeAdapter();

        $this->assertSame(
            115,
            $this->invokeProtected($adapter, 'resolveChannelId', 'https://www.huxiu.com/channel/115.html'),
        );
        $this->assertNull(
            $this->invokeProtected($adapter, 'resolveChannelId', 'https://www.huxiu.com/article/123.html'),
        );
    }

    public function test_resolves_article_id_from_url(): void
    {
        $adapter = $this->makeAdapter();

        $this->assertSame(
            4869203,
            $this->invokeProtected($adapter, 'resolveArticleId', 'https://www.huxiu.com/article/4869203.html'),
        );
        $this->assertNull(
            $this->invokeProtected($adapter, 'resolveArticleId', 'https://www.huxiu.com/channel/115.html'),
        );
    }

    public function test_fetches_article_detail_via_api(): void
    {
        // Ten 30-character paragraphs: comfortably above the 200-character assertion below.
        $content = str_repeat('<p>虎嗅深度文章的正文段落,用于验证接口返回的内容能够被完整保留。</p>', 10);

        \Illuminate\Support\Facades\Http::fake([
            'api-web-article.huxiu.com/*' => \Illuminate\Support\Facades\Http::response([
                'success' => true,
                'data' => [
                    'title' => '虎嗅测试文章标题',
                    'summary' => '测试摘要',
                    'content' => $content,
                    // 2024-01-02 10:00:00 UTC
                    'dateline' => 1704189600,
                ],
            ]),
            // Any other request fails fast instead of reaching the network.
            '*' => \Illuminate\Support\Facades\Http::response('', 500),
        ]);

        $detail = $this->invokeProtected(
            $this->makeAdapter(),
            'fetchArticleDetail',
            'https://www.huxiu.com/article/4869203.html',
        );

        $this->assertNotEmpty($detail['title']);
        $this->assertSame('测试摘要', $detail['summary']);
        $this->assertSame('2024-01-02', $detail['published_at']);
        $this->assertNotEmpty($detail['content_html']);
        $this->assertGreaterThan(200, mb_strlen(strip_tags((string) $detail['content_html'])));

        // The adapter must have asked the detail endpoint for exactly this article.
        \Illuminate\Support\Facades\Http::assertSent(static function ($request): bool {
            return $request->method() === 'POST'
                && $request->url() === 'https://api-web-article.huxiu.com/web/article/detail'
                && $request['aid'] === '4869203';
        });
    }

    /**
     * Build the adapter under test with its collaborators resolved from the container.
     */
    private function makeAdapter(): HuxiuHtmlAdapter
    {
        return new HuxiuHtmlAdapter(
            app(\App\Services\Crawl\NewsCategoryMatcher::class),
            app(\App\Services\Crawl\NewsHtmlImageLocalizer::class),
        );
    }

    /**
     * Call a non-public adapter method through reflection and return its result.
     */
    private function invokeProtected(HuxiuHtmlAdapter $adapter, string $method, mixed ...$arguments): mixed
    {
        $reflection = new \ReflectionMethod($adapter, $method);
        $reflection->setAccessible(true);

        return $reflection->invoke($adapter, ...$arguments);
    }
}

@ -33,4 +33,22 @@ HTML;
$this->assertStringContainsString('https://news.sjtu.edu.cn/resource/upload/a.png', $normalized);
}
/**
 * extractBody() must return the article text from markup where an
 * "article_main" div is nested in "article_con" and followed by a
 * "news_all_text" block.
 */
public function test_extracts_pedata_article_main(): void
{
    $page = <<<'HTML'
<div class="article_con">
<div class="article_main">
<p>清科研究中心正文段落一,包含足够长的文字用于通过正文提取阈值校验。</p>
<p>第二段正文内容继续补充长度,确保 strip_tags 后超过三十个字符。</p>
</div>
</div>
<div class="news_all_text">版权声明</div>
HTML;

    $extracted = NewsContentHtml::extractBody($page);

    self::assertNotNull($extracted);
    self::assertStringContainsString('清科研究中心正文段落一', $extracted);
}
}

Loading…
Cancel
Save