You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

240 lines
8.1 KiB

<?php
namespace App\Services\Crawl\Adapters;
use App\Models\CrawlSource;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use App\Services\Crawl\NewsCategoryMatcher;
use App\Services\Crawl\NewsHtmlImageLocalizer;
use App\Services\Crawl\PedailyContentNormalizer;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;
/**
 * Crawler adapter that scrapes pedaily.cn (投资界) HTML pages.
 *
 * It downloads one listing page, keeps the links whose title matches the requested
 * keywords, then downloads each remaining article page to extract its title, publish
 * date and body HTML before building the final CrawlItemDto list.
 */
class PedailyHtmlAdapter implements CrawlerAdapterInterface
{
    /** Headers sent with every request issued by this adapter. */
    private const REQUEST_HEADERS = [
        'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
        'Accept' => 'text/html,application/xhtml+xml',
        'Accept-Language' => 'zh-CN,zh;q=0.9',
    ];

    public function __construct(
        protected NewsCategoryMatcher $categoryMatcher,
        protected NewsHtmlImageLocalizer $imageLocalizer,
    ) {}

    /**
     * Fetch the listing page and return the keyword-matching articles, enriched with detail-page data.
     *
     * @param string               $requestUrl Listing URL; used only when its host is pedaily.cn or a subdomain,
     *                                         otherwise the source's entry_url (or the default listing) is used.
     * @param array<string, mixed> $params     Reads `keyword` (required) and `max_results` (clamped to 1..50, default 30).
     *
     * @return list<CrawlItemDto>
     *
     * @throws \InvalidArgumentException When no keyword could be parsed from $params['keyword'].
     * @throws \RuntimeException         When the listing page responds with a non-2xx status.
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        if ($keywords === []) {
            throw new \InvalidArgumentException('请填写至少一个搜索关键词');
        }

        $maxResults = min(50, max(1, (int) ($params['max_results'] ?? 30)));

        // Validate the parsed host rather than a substring, so a URL such as
        // "http://evil.example/?pedaily.cn" cannot redirect the crawl elsewhere.
        $fetchUrl = $requestUrl;
        if (! $this->isPedailyUrl($fetchUrl)) {
            $fetchUrl = $source->entry_url ?: 'https://www.pedaily.cn/all/';
        }

        $response = $this->httpGet($fetchUrl, 45);
        if (! $response->successful()) {
            throw new \RuntimeException('投资界页面请求失败:'.$response->status());
        }

        // First pass: cheap filter on the listing data before any detail page is downloaded.
        $candidates = array_values(array_filter(
            $this->parseListHtml($response->body(), $fetchUrl),
            static fn (CrawlItemDto $item) => CrawlKeywordParser::matchesAny($item->title, $item->summary, $keywords)
        ));

        $enriched = [];
        foreach ($candidates as $item) {
            if (count($enriched) >= $maxResults) {
                break;
            }

            $detail = $this->fetchArticleDetail($item->canonicalUrl ?? '');

            // Fall back to the listing title when the detail page yielded none.
            $detailTitle = $detail['title'] ?? '';
            $title = $detailTitle !== '' ? $detailTitle : $item->title;

            // NOTE(review): fetchArticleDetail() already normalizes content_html; the second call is
            // kept because it also covers the $item->contentHtml fallback.
            $contentHtml = PedailyContentNormalizer::normalize($detail['content_html'] ?? $item->contentHtml);
            $contentHtml = $this->imageLocalizer->localize($contentHtml);

            $plainText = $contentHtml ? trim(strip_tags($contentHtml)) : '';
            $plainOrNull = $plainText !== '' ? $plainText : null;
            $publishedAt = $detail['published_at'] ?? $item->publishedAt;

            // Second pass: re-check the keywords against the final title and body text.
            if (! CrawlKeywordParser::matchesAny($title, $plainOrNull, $keywords)) {
                continue;
            }

            $categoryId = $this->categoryMatcher->resolveCategoryId($title, $plainOrNull, $keywords);
            $categoryLabel = $this->categoryMatcher->labelForId($categoryId);

            $enriched[] = new CrawlItemDto(
                externalId: $item->externalId,
                title: $title,
                canonicalUrl: $item->canonicalUrl,
                summary: null,
                publishedAt: $publishedAt,
                contentHtml: $contentHtml,
                extra: [
                    'platform' => 'pedaily',
                    'keywords' => $keywords,
                    'category_dict_item_id' => $categoryId,
                    'category_label' => $categoryLabel,
                ],
            );
        }

        return $enriched;
    }

    /**
     * Extract article links from a listing page.
     *
     * Only anchors with 8-200 characters of plain text are considered; navigation links,
     * off-site links, non-article paths and duplicate URLs are skipped.
     *
     * @return list<CrawlItemDto>
     */
    protected function parseListHtml(string $html, string $baseUrl): array
    {
        $items = [];
        $seen = [];

        $found = preg_match_all(
            '#<a[^>]+href=["\']([^"\']+)["\'][^>]*>([^<]{8,200})</a>#iu',
            $html,
            $matches,
            PREG_SET_ORDER
        );
        if (! $found) {
            return $items;
        }

        foreach ($matches as $m) {
            $href = html_entity_decode(trim($m[1]));
            $title = $this->cleanText($m[2]);

            if ($title === '' || Str::length($title) < 8) {
                continue;
            }
            // Skip login / register / "more" / pagination links.
            if (Str::contains($title, ['登录', '注册', '更多', '下一页', '上一页'])) {
                continue;
            }

            $url = $this->absoluteUrl($href, $baseUrl);
            if (! $url || ! $this->isPedailyUrl($url)) {
                continue;
            }
            // Keep only URLs that look like article pages.
            if (! preg_match('#/(article|news|inners|vcpe|company)/#i', $url) && ! preg_match('#\.s?html#i', $url)) {
                continue;
            }

            // md5 is used purely as a stable de-duplication / external-id key.
            $key = md5($url);
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;

            $items[] = new CrawlItemDto(
                externalId: 'pedaily:'.$key,
                title: $title,
                canonicalUrl: $url,
                extra: ['platform' => 'pedaily'],
            );
        }

        return $items;
    }

    /**
     * Download one article page and extract its title, publish date and body HTML.
     *
     * This is best-effort: any transport error, non-2xx response or non-pedaily URL
     * yields an all-null result instead of an exception, so one bad article does not
     * abort the whole crawl.
     *
     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}
     */
    protected function fetchArticleDetail(string $url): array
    {
        $empty = [
            'title' => null,
            'summary' => null,
            'content_html' => null,
            'published_at' => null,
        ];

        if ($url === '' || ! $this->isPedailyUrl($url)) {
            return $empty;
        }

        try {
            $response = $this->httpGet($url, 20);
        } catch (\Throwable) {
            // Deliberately swallowed: the caller falls back to the listing data.
            return $empty;
        }
        if (! $response->successful()) {
            return $empty;
        }

        $html = $response->body();

        $title = null;
        if (preg_match('#<h1[^>]*id=["\']newstitle["\'][^>]*>(.*?)</h1>#is', $html, $m)) {
            $title = $this->cleanText($m[1]);
        } elseif (preg_match('#<h1[^>]*>(.*?)</h1>#is', $html, $m)) {
            $title = $this->cleanText($m[1]);
        }

        // Prefer the machine-readable datetime attribute; if it is missing or not a valid
        // Y-m-d date, try the visible <time> text inside <span class="date">.
        $publishedAt = null;
        if (preg_match('#<time[^>]+datetime=["\']([^"\']+)["\']#i', $html, $m)) {
            $publishedAt = $this->normalizeDate($m[1]);
        }
        if ($publishedAt === null
            && preg_match('#<span class="date"[^>]*>\s*<time[^>]*>([^<]+)</time>#is', $html, $m)) {
            $publishedAt = $this->normalizeDate($m[1]);
        }

        $contentHtml = null;
        if (preg_match('#<div[^>]+id=["\']news-content["\'][^>]*>(.*?)</div>\s*<div#is', $html, $m)) {
            $contentHtml = trim($m[1]);
        } elseif (preg_match('#<div[^>]+id=["\']article-body["\'][^>]*>(.*?)</div>#is', $html, $m)) {
            $contentHtml = trim($m[1]);
        }

        return [
            'title' => $title,
            'summary' => null,
            'content_html' => PedailyContentNormalizer::normalize($contentHtml),
            'published_at' => $publishedAt,
        ];
    }

    /**
     * Convert a date string that starts with year-month-day into zero-padded `YYYY-MM-DD`.
     *
     * Accepts `-`, `/`, `.` and the Chinese 年/月 markers as separators; anything after the
     * day (time of day, 日, timezone) is ignored. Returns null when the string does not start
     * with such a date or the date does not exist on the calendar.
     */
    protected function normalizeDate(string $raw): ?string
    {
        $raw = str_replace(['/', '.', '年', '月'], '-', trim($raw));
        if (preg_match('#^(\d{4})-(\d{1,2})-(\d{1,2})#', $raw, $m) !== 1) {
            return null;
        }

        [$year, $month, $day] = [(int) $m[1], (int) $m[2], (int) $m[3]];
        if (! checkdate($month, $day, $year)) {
            return null;
        }

        return sprintf('%04d-%02d-%02d', $year, $month, $day);
    }

    /**
     * Resolve an href found in a page against that page's URL.
     *
     * Returns null for empty or fragment-only hrefs, for hrefs with a non-http(s) scheme
     * (javascript:, mailto:, ...) and when $base has no scheme or host.
     */
    protected function absoluteUrl(string $href, string $base): ?string
    {
        $href = trim($href);
        if ($href === '' || $href[0] === '#') {
            return null;
        }
        if (preg_match('#^https?://#i', $href) === 1) {
            return $href;
        }
        // Any other explicit scheme is not something the crawler can fetch.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href) === 1) {
            return null;
        }

        $parts = parse_url($base);
        if (! $parts || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        // Protocol-relative: inherit only the scheme.
        if (str_starts_with($href, '//')) {
            return $parts['scheme'].':'.$href;
        }

        $origin = $parts['scheme'].'://'.$parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':'.$parts['port'];
        }

        // Root-relative path.
        if ($href[0] === '/') {
            return $origin.$href;
        }

        $basePath = $parts['path'] ?? '/';

        // Query-only reference keeps the base document's path.
        if ($href[0] === '?') {
            return $origin.$basePath.$href;
        }

        // Document-relative path: resolve against the base document's directory.
        $lastSlash = strrpos($basePath, '/');
        $directory = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);

        // Leave the query/fragment untouched while normalising "." and ".." in the path.
        $pathLength = strcspn($href, '?#');
        $path = $this->removeDotSegments($directory.substr($href, 0, $pathLength));

        return $origin.$path.substr($href, $pathLength);
    }

    /** True when $url is an http(s) URL whose host is pedaily.cn or one of its subdomains. */
    private function isPedailyUrl(string $url): bool
    {
        $parts = parse_url($url);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return false;
        }
        if (! in_array(strtolower($parts['scheme']), ['http', 'https'], true)) {
            return false;
        }

        $host = strtolower($parts['host']);

        return $host === 'pedaily.cn' || str_ends_with($host, '.pedaily.cn');
    }

    /** Issue a GET request with the adapter's shared headers and the given timeout in seconds. */
    private function httpGet(string $url, int $timeoutSeconds): \Illuminate\Http\Client\Response
    {
        return Http::timeout($timeoutSeconds)
            ->withHeaders(self::REQUEST_HEADERS)
            ->get($url);
    }

    /**
     * Reduce an HTML fragment to trimmed plain text.
     *
     * Tags are stripped before entities are decoded so that escaped text such as
     * "&lt;B&gt;" survives instead of being removed as if it were a tag.
     */
    private function cleanText(string $fragment): string
    {
        return trim(html_entity_decode(strip_tags($fragment), ENT_QUOTES | ENT_HTML5, 'UTF-8'));
    }

    /** Collapse "." and ".." segments of an absolute URL path without climbing above the root. */
    private function removeDotSegments(string $path): string
    {
        $segments = explode('/', $path);
        $lastIndex = count($segments) - 1;
        $resolved = [];

        foreach ($segments as $index => $segment) {
            if ($segment !== '.' && $segment !== '..') {
                $resolved[] = $segment;

                continue;
            }
            // Index 0 of $resolved is the empty segment before the leading "/": never pop it.
            if ($segment === '..' && count($resolved) > 1) {
                array_pop($resolved);
            }
            // A trailing dot segment refers to a directory, so keep the trailing slash.
            if ($index === $lastIndex) {
                $resolved[] = '';
            }
        }

        return implode('/', $resolved);
    }
}