|
|
<?php
|
|
|
|
|
|
namespace App\Services\Crawl\Adapters;
|
|
|
|
|
|
use App\Models\CrawlSource;
|
|
|
use App\Services\Crawl\CrawlItemDto;
|
|
|
use App\Services\Crawl\CrawlKeywordParser;
|
|
|
use App\Services\Crawl\HtmlCrawlSupport;
|
|
|
use Illuminate\Support\Facades\Http;
|
|
|
|
|
|
/**
 * Crawl adapter for huxiu.com.
 *
 * Channel / article listing URLs are served from Huxiu's JSON web API instead
 * of scraping HTML; any URL this adapter does not recognise is delegated to
 * the generic HTML adapter it extends. Article bodies are likewise read from
 * the JSON detail API first, with the parent implementation as a fallback.
 */
class HuxiuHtmlAdapter extends GenericNewsHtmlAdapter
{
    /** Endpoint returning one page of a channel's article list. */
    protected const API_URL = 'https://api-web-article.huxiu.com/web/channel/articleListV1';

    /** Endpoint returning the detail (title, content, dates) of one article. */
    protected const ARTICLE_DETAIL_URL = 'https://api-web-article.huxiu.com/web/article/detail';

    /** Number of rows requested per list page. */
    protected const PAGE_SIZE = 20;

    /**
     * Entry point: uses the JSON list API when the URL maps to a channel,
     * otherwise defers to the parent adapter.
     *
     * @param array<string, mixed> $params
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $channelId = $this->resolveChannelId($requestUrl);

        // null means "not a channel/listing URL"; 0 is a valid id (see resolveChannelId()).
        if ($channelId === null) {
            return parent::fetch($requestUrl, $source, $params);
        }

        return $this->fetchChannelViaApi($requestUrl, $params, $channelId);
    }

    /**
     * Pages through the channel list API, de-duplicates the rows and hands
     * them to enrichNewsItems() (defined outside this class, presumably in
     * the parent adapter).
     *
     * Recognised $params keys:
     *  - 'keyword'     raw keyword string, parsed by CrawlKeywordParser
     *  - 'max_results' default 30, clamped to 1..50
     *  - 'max_pages'   default 1, clamped to 1..50
     *
     * @param array<string, mixed> $params
     * @return list<CrawlItemDto>
     *
     * @throws \RuntimeException when the list API reports a failure
     */
    protected function fetchChannelViaApi(string $requestUrl, array $params, int $channelId): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        $maxResults = min(50, max(1, (int) ($params['max_results'] ?? 30)));
        $maxPages = min(50, max(1, (int) ($params['max_pages'] ?? 1)));

        $listItems = [];
        $seen = [];
        $lastId = null;

        for ($page = 1; $page <= $maxPages && count($listItems) < $maxResults; $page++) {
            $payload = $this->requestChannelArticleList($channelId, self::PAGE_SIZE, $lastId);

            // Treat a missing, empty or malformed (non-array) list as "no more data".
            $rows = $payload['datalist'] ?? [];
            if (! is_array($rows) || $rows === []) {
                break;
            }

            foreach ($rows as $row) {
                if (count($listItems) >= $maxResults) {
                    break 2;
                }

                // Skip malformed rows instead of letting the array type-hint throw.
                if (! is_array($row)) {
                    continue;
                }

                $item = $this->mapApiRowToDto($row);
                if ($item === null || isset($seen[$item->externalId])) {
                    continue;
                }

                $seen[$item->externalId] = true;
                $listItems[] = $item;
            }

            // The API paginates with a cursor; stop when it is missing or did not advance.
            $nextLastId = isset($payload['last_id']) ? (int) $payload['last_id'] : 0;
            if ($nextLastId <= 0 || $nextLastId === $lastId) {
                break;
            }

            $lastId = $nextLastId;
        }

        $applyKeywordFilter = HtmlCrawlSupport::shouldApplyKeywordFilter($requestUrl, '', $keywords);

        return $this->enrichNewsItems($listItems, $keywords, $applyKeywordFilter, $maxResults, 'huxiu');
    }

    /**
     * Extracts the channel id from a listing URL.
     *
     * @return int|null the numeric id for ".../channel/<id>.html|.htm|.shtml",
     *                  0 for the generic ".../article" listing path,
     *                  null when the URL is neither
     */
    protected function resolveChannelId(string $url): ?int
    {
        $path = (string) parse_url($url, PHP_URL_PATH);

        if (preg_match('#/channel/(\d+)\.(?:html?|shtml)$#i', $path, $match)) {
            return (int) $match[1];
        }

        if (preg_match('#/article/?(?:\.html?|\.shtml)?$#i', $path)) {
            return 0;
        }

        return null;
    }

    /**
     * Builds the pre-configured form-encoded HTTP request shared by every
     * Huxiu API call (30 second timeout, crawler User-Agent, site Origin/Referer).
     */
    protected function huxiuApiRequest(): \Illuminate\Http\Client\PendingRequest
    {
        return Http::timeout(30)
            ->withHeaders([
                'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
                'Accept' => 'application/json',
                'Origin' => 'https://www.huxiu.com',
                'Referer' => 'https://www.huxiu.com/',
            ])
            ->asForm();
    }

    /**
     * Requests one page of a channel's article list.
     *
     * @param int      $pageSize clamped to 1..30 before being sent
     * @param int|null $lastId   pagination cursor from the previous page; omitted when null or 0
     * @return array{name?:string, datalist?:list<array<string, mixed>>, last_id?:int|string}
     *
     * @throws \RuntimeException on a non-2xx status or when the body does not report success
     */
    protected function requestChannelArticleList(int $channelId, int $pageSize, ?int $lastId): array
    {
        $form = [
            'platform' => 'www',
            'channel_id' => (string) $channelId,
            'pagesize' => (string) max(1, min(30, $pageSize)),
        ];

        if ($lastId !== null && $lastId !== 0) {
            $form['last_id'] = (string) $lastId;
        }

        $response = $this->huxiuApiRequest()->post(self::API_URL, $form);

        if (! $response->successful()) {
            throw new \RuntimeException('虎嗅列表接口请求失败:HTTP '.$response->status());
        }

        $json = $response->json();
        if (! is_array($json) || empty($json['success'])) {
            $message = is_array($json) ? (string) ($json['message'] ?? '未知错误') : '响应格式异常';

            throw new \RuntimeException('虎嗅列表接口返回失败:'.$message);
        }

        $data = $json['data'] ?? [];

        return is_array($data) ? $data : [];
    }

    /**
     * Converts one list-API row into a DTO.
     *
     * Returns null when the row has no usable title, or when neither a
     * normalisable 'url' nor a numeric 'aid' is available to build the
     * article URL from.
     *
     * @param array<string, mixed> $row
     */
    protected function mapApiRowToDto(array $row): ?CrawlItemDto
    {
        $title = trim((string) ($row['title'] ?? ''));
        if (
            $title === ''
            || HtmlCrawlSupport::isSkippableLinkTitle($title)
            || HtmlCrawlSupport::isWeakLinkTitle($title)
        ) {
            return null;
        }

        $url = HtmlCrawlSupport::normalizeNewsUrl((string) ($row['url'] ?? ''));
        if ($url === null || $url === '') {
            // Rebuild the canonical article URL from the id; require digits so a
            // malformed value cannot produce a junk URL (resolveArticleId() only
            // understands numeric ids anyway).
            $aid = trim((string) ($row['aid'] ?? ''));
            if ($aid === '' || preg_match('/^\d+$/', $aid) !== 1) {
                return null;
            }

            $url = 'https://www.huxiu.com/article/'.$aid.'.html';
        }

        // Take the first non-empty summary field. "??" alone would stop at an
        // empty-string 'summary' and never look at 'short_content'.
        $summary = '';
        foreach (['summary', 'short_content'] as $field) {
            $candidate = trim((string) ($row[$field] ?? ''));
            if ($candidate !== '') {
                $summary = $candidate;
                break;
            }
        }

        return new CrawlItemDto(
            externalId: 'news:'.md5($url),
            title: $title,
            canonicalUrl: $url,
            summary: $summary !== '' ? $summary : null,
            publishedAt: $this->dateFromTimestamp($row['dateline'] ?? null),
            extra: ['platform' => 'huxiu'],
        );
    }

    /**
     * Formats a Unix timestamp as a "Y-m-d" date in UTC.
     *
     * Non-numeric or non-positive values yield null, so a formatted date
     * string such as "2024-01-01" is not mis-read as the integer 2024.
     *
     * NOTE(review): gmdate() formats in UTC; for a China-based site this can be
     * one day behind local publication time — confirm the intended timezone.
     */
    protected function dateFromTimestamp(mixed $value): ?string
    {
        if (! is_numeric($value)) {
            return null;
        }

        $timestamp = (int) $value;

        return $timestamp > 0 ? gmdate('Y-m-d', $timestamp) : null;
    }

    /**
     * Loads article detail from the JSON API when the URL carries an article
     * id and the API yields usable content; otherwise uses the parent
     * implementation.
     *
     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}
     */
    protected function fetchArticleDetail(string $url): array
    {
        $aid = $this->resolveArticleId($url);
        $apiDetail = $aid !== null ? $this->requestArticleDetail($aid, $url) : null;

        return $apiDetail ?? parent::fetchArticleDetail($url);
    }

    /**
     * Extracts the numeric article id from a ".../article/<id>.html" URL path,
     * or returns null when the path does not match.
     */
    protected function resolveArticleId(string $url): ?int
    {
        $path = (string) parse_url($url, PHP_URL_PATH);

        if (preg_match('#/article/(\d+)\.html#i', $path, $match)) {
            return (int) $match[1];
        }

        return null;
    }

    /**
     * Fetches one article from the detail API.
     *
     * This is best-effort: every failure mode (connection error, non-2xx
     * status, unsuccessful or malformed body, content shorter than 30 visible
     * characters) returns null so the caller can fall back to another source.
     *
     * @param string $articleUrl passed to NewsContentHtml::normalize() together with the content
     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}|null
     */
    protected function requestArticleDetail(int $aid, string $articleUrl): ?array
    {
        try {
            $response = $this->huxiuApiRequest()->post(self::ARTICLE_DETAIL_URL, [
                'platform' => 'www',
                'aid' => (string) $aid,
            ]);
        } catch (\Illuminate\Http\Client\ConnectionException) {
            // Timeouts / connection failures must not abort the crawl; they are
            // handled like any other API failure.
            return null;
        }

        if (! $response->successful()) {
            return null;
        }

        $json = $response->json();
        if (! is_array($json) || empty($json['success'])) {
            return null;
        }

        $data = $json['data'] ?? [];
        if (! is_array($data)) {
            return null;
        }

        $content = trim((string) ($data['content'] ?? ''));
        if ($content === '' || mb_strlen(strip_tags($content)) < 30) {
            return null;
        }

        $title = HtmlCrawlSupport::cleanArticleTitle(trim((string) ($data['title'] ?? '')));
        $summary = trim((string) ($data['summary'] ?? ''));

        // Prefer the pre-formatted date; if it is absent or could not be
        // normalised (assumes normalizeDate() then returns null or '' — TODO
        // confirm), fall back to the raw timestamp.
        $publishedAt = null;
        if (! empty($data['fdateline'])) {
            $publishedAt = HtmlCrawlSupport::normalizeDate((string) $data['fdateline']);
        }
        if ($publishedAt === null || $publishedAt === '') {
            $publishedAt = $this->dateFromTimestamp($data['dateline'] ?? null) ?? $publishedAt;
        }

        return [
            'title' => $title,
            'summary' => $summary !== '' ? $summary : null,
            'content_html' => \App\Services\Crawl\NewsContentHtml::normalize($content, $articleUrl),
            'published_at' => $publishedAt,
        ];
    }
}
|