You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
240 lines
8.1 KiB
240 lines
8.1 KiB
<?php
|
|
|
|
namespace App\Services\Crawl\Adapters;
|
|
|
|
use App\Models\CrawlSource;
|
|
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
|
|
use App\Services\Crawl\CrawlItemDto;
|
|
use App\Services\Crawl\CrawlKeywordParser;
|
|
use App\Services\Crawl\NewsCategoryMatcher;
|
|
use App\Services\Crawl\NewsHtmlImageLocalizer;
|
|
use App\Services\Crawl\PedailyContentNormalizer;
|
|
use Illuminate\Support\Facades\Http;
|
|
use Illuminate\Support\Str;
|
|
|
|
/**
 * Crawler adapter that scrapes article links from a pedaily.cn HTML listing page.
 *
 * Flow: download the listing page, collect candidate article links from its
 * anchors, keep those matching the requested keywords, then download each
 * article page to obtain its title, publication date and body HTML.
 */
class PedailyHtmlAdapter implements CrawlerAdapterInterface
{
    /** Domain (including its subdomains) that this adapter is allowed to fetch from. */
    private const ALLOWED_DOMAIN = 'pedaily.cn';

    /** Listing page used when neither the request URL nor the source entry URL is usable. */
    private const DEFAULT_LIST_URL = 'https://www.pedaily.cn/all/';

    /** Headers sent with both the listing request and every article request. */
    private const REQUEST_HEADERS = [
        'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
        'Accept' => 'text/html,application/xhtml+xml',
        'Accept-Language' => 'zh-CN,zh;q=0.9',
    ];

    public function __construct(
        protected NewsCategoryMatcher $categoryMatcher,
        protected NewsHtmlImageLocalizer $imageLocalizer,
    ) {}

    /**
     * Fetch the listing page and return keyword-matching articles with their details.
     *
     * @param string      $requestUrl Listing URL; ignored in favour of the source entry URL
     *                                (or the built-in default) when its host is not on pedaily.cn.
     * @param CrawlSource $source     Crawl source whose `entry_url` is the fallback listing URL.
     * @param array       $params     Reads `keyword` (required, parsed by CrawlKeywordParser) and
     *                                `max_results` (default 30, clamped to 1..50).
     *
     * @return list<CrawlItemDto>
     *
     * @throws \InvalidArgumentException When no keyword is supplied.
     * @throws \RuntimeException         When the listing page responds with a non-success status.
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        if ($keywords === []) {
            throw new \InvalidArgumentException('请填写至少一个搜索关键词');
        }

        $maxResults = min(50, max(1, (int) ($params['max_results'] ?? 30)));

        // Validate the parsed host, not a substring: "https://evil.example/pedaily.cn/"
        // must not be accepted as a pedaily.cn listing URL.
        $fetchUrl = $requestUrl;
        if (! $this->isPedailyUrl($fetchUrl)) {
            $fetchUrl = $source->entry_url ?: self::DEFAULT_LIST_URL;
        }

        $response = Http::timeout(45)
            ->withHeaders(self::REQUEST_HEADERS)
            ->get($fetchUrl);

        if (! $response->successful()) {
            throw new \RuntimeException('投资界页面请求失败：'.$response->status());
        }

        // First pass: cheap filter on the listing title before any article is downloaded.
        $candidates = array_values(array_filter(
            $this->parseListHtml($response->body(), $fetchUrl),
            fn (CrawlItemDto $item) => CrawlKeywordParser::matchesAny($item->title, $item->summary, $keywords)
        ));

        $enriched = [];
        foreach ($candidates as $candidate) {
            if (count($enriched) >= $maxResults) {
                break;
            }

            $item = $this->enrichItem($candidate, $keywords);
            if ($item !== null) {
                $enriched[] = $item;
            }
        }

        return $enriched;
    }

    /**
     * Download one article, re-check the keywords against its real title and body,
     * and build the final DTO. Returns null when the article no longer matches.
     *
     * @param list<string> $keywords
     */
    private function enrichItem(CrawlItemDto $item, array $keywords): ?CrawlItemDto
    {
        $detail = $this->fetchArticleDetail($item->canonicalUrl ?? '');

        // Fall back to the listing data whenever the detail page yielded nothing.
        $title = $detail['title'] ?: $item->title;
        $contentHtml = PedailyContentNormalizer::normalize($detail['content_html'] ?? $item->contentHtml);
        $contentHtml = $this->imageLocalizer->localize($contentHtml);
        $publishedAt = $detail['published_at'] ?? $item->publishedAt;

        $plainText = $contentHtml ? trim(strip_tags($contentHtml)) : '';
        $matchText = $plainText !== '' ? $plainText : null;

        // Second pass: the detail title/body may differ from the listing anchor text.
        if (! CrawlKeywordParser::matchesAny($title, $matchText, $keywords)) {
            return null;
        }

        $categoryId = $this->categoryMatcher->resolveCategoryId($title, $matchText, $keywords);
        $categoryLabel = $this->categoryMatcher->labelForId($categoryId);

        return new CrawlItemDto(
            externalId: $item->externalId,
            title: $title,
            canonicalUrl: $item->canonicalUrl,
            summary: null,
            publishedAt: $publishedAt,
            contentHtml: $contentHtml,
            extra: [
                'platform' => 'pedaily',
                'keywords' => $keywords,
                'category_dict_item_id' => $categoryId,
                'category_label' => $categoryLabel,
            ],
        );
    }

    /**
     * Extract candidate article links from a listing page.
     *
     * Only anchors with 8-200 characters of plain text are considered; navigation
     * links, off-site links and URLs that do not look like articles are dropped,
     * and each URL is returned at most once.
     *
     * @return list<CrawlItemDto>
     */
    protected function parseListHtml(string $html, string $baseUrl): array
    {
        $found = preg_match_all(
            '#<a[^>]+href=["\']([^"\']+)["\'][^>]*>([^<]{8,200})</a>#iu',
            $html,
            $matches,
            PREG_SET_ORDER
        );
        if (! $found) {
            return [];
        }

        $items = [];
        $seen = [];

        foreach ($matches as $match) {
            $href = html_entity_decode(trim($match[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8');
            $title = $this->plainText($match[2]);

            if ($title === '' || Str::length($title) < 8) {
                continue;
            }
            // Skip navigation / pagination anchors.
            if (Str::contains($title, ['登录', '注册', '更多', '下一页', '上一页'])) {
                continue;
            }

            $url = $this->absoluteUrl($href, $baseUrl);
            if ($url === null || ! $this->isPedailyUrl($url)) {
                continue;
            }

            $looksLikeArticle = preg_match('#/(article|news|inners|vcpe|company)/#i', $url)
                || preg_match('#\.s?html#i', $url);
            if (! $looksLikeArticle) {
                continue;
            }

            $key = md5($url);
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;

            $items[] = new CrawlItemDto(
                externalId: 'pedaily:'.$key,
                title: $title,
                canonicalUrl: $url,
                extra: ['platform' => 'pedaily'],
            );
        }

        return $items;
    }

    /**
     * Download an article page and pull out its title, date and body HTML.
     *
     * This is best-effort: an off-site URL, a transport error or a non-success
     * response yields an all-null result instead of an exception, so one broken
     * article does not abort the whole crawl.
     *
     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}
     */
    protected function fetchArticleDetail(string $url): array
    {
        $empty = [
            'title' => null,
            'summary' => null,
            'content_html' => null,
            'published_at' => null,
        ];

        // Also rejects the empty string, since it has no host.
        if (! $this->isPedailyUrl($url)) {
            return $empty;
        }

        try {
            $response = Http::timeout(20)
                ->withHeaders(self::REQUEST_HEADERS)
                ->get($url);
        } catch (\Throwable) {
            // Deliberately swallowed: see the best-effort note in the docblock.
            return $empty;
        }

        if (! $response->successful()) {
            return $empty;
        }

        $html = $response->body();

        // Prefer the dedicated headline element, then any <h1>.
        $title = null;
        if (preg_match('#<h1[^>]*id=["\']newstitle["\'][^>]*>(.*?)</h1>#is', $html, $m)) {
            $title = $this->plainText($m[1]);
        } elseif (preg_match('#<h1[^>]*>(.*?)</h1>#is', $html, $m)) {
            $title = $this->plainText($m[1]);
        }

        // Both date sources go through normalizeDate() so the result is either a
        // valid YYYY-MM-DD string or null; the second pattern is tried when the
        // datetime attribute is missing or unparseable.
        $publishedAt = null;
        if (preg_match('#<time[^>]+datetime=["\']([^"\']+)["\']#i', $html, $m)) {
            $publishedAt = $this->normalizeDate($m[1]);
        }
        if ($publishedAt === null
            && preg_match('#<span class="date"[^>]*>\s*<time[^>]*>([^<]+)</time>#is', $html, $m)) {
            $publishedAt = $this->normalizeDate(trim($m[1]));
        }

        // NOTE(review): these lazy patterns stop at the first closing </div>, so a
        // body containing nested <div> elements is truncated; a DOM parser would be
        // needed to handle that reliably.
        $contentHtml = null;
        if (preg_match('#<div[^>]+id=["\']news-content["\'][^>]*>(.*?)</div>\s*<div#is', $html, $m)) {
            $contentHtml = trim($m[1]);
        } elseif (preg_match('#<div[^>]+id=["\']article-body["\'][^>]*>(.*?)</div>#is', $html, $m)) {
            $contentHtml = trim($m[1]);
        }

        return [
            'title' => $title,
            'summary' => null,
            'content_html' => PedailyContentNormalizer::normalize($contentHtml),
            'published_at' => $publishedAt,
        ];
    }

    /**
     * Convert a date-like string starting with year, month and day into YYYY-MM-DD.
     *
     * Accepts `-`, `/`, `.` and the Chinese `年`/`月` markers as separators and
     * ignores anything after the day (time of day, timezone, `日`).
     *
     * @return string|null The zero-padded date, or null when the input does not start
     *                     with a date or names an impossible calendar day.
     */
    protected function normalizeDate(string $raw): ?string
    {
        $raw = str_replace(['/', '.', '年', '月'], '-', trim($raw));
        if (preg_match('#^(\d{4})-(\d{1,2})-(\d{1,2})#', $raw, $m) !== 1) {
            return null;
        }

        [$year, $month, $day] = [(int) $m[1], (int) $m[2], (int) $m[3]];
        if (! checkdate($month, $day, $year)) {
            return null;
        }

        return sprintf('%04d-%02d-%02d', $year, $month, $day);
    }

    /**
     * Resolve an href found on a page against that page's URL.
     *
     * Absolute http(s) URLs are returned unchanged. Protocol-relative, root-relative,
     * query-only and document-relative references are resolved against the base,
     * keeping the base port and directory.
     *
     * @return string|null The absolute URL, or null for empty, fragment-only or
     *                     non-http references (`javascript:`, `mailto:`, ...) and
     *                     when the base URL has no scheme or host.
     */
    protected function absoluteUrl(string $href, string $base): ?string
    {
        $href = trim($href);
        if ($href === '' || $href[0] === '#') {
            return null;
        }

        if (preg_match('#^https?://#i', $href) === 1) {
            return $href;
        }
        // Any other explicit scheme is not something the crawler can fetch.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href) === 1) {
            return null;
        }

        $parts = parse_url($base);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        if (str_starts_with($href, '//')) {
            return $parts['scheme'].':'.$href;
        }

        $origin = $parts['scheme'].'://'.$parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':'.$parts['port'];
        }

        if ($href[0] === '/') {
            return $origin.$href;
        }

        $basePath = $parts['path'] ?? '/';
        if ($href[0] === '?') {
            return $origin.$basePath.$href;
        }

        // Document-relative reference: resolve against the directory of the base path.
        $lastSlash = strrpos($basePath, '/');
        $directory = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);

        return $origin.$this->removeDotSegments($directory.$href);
    }

    /**
     * Collapse `.` and `..` segments in an absolute path, leaving any trailing
     * query string or fragment untouched.
     */
    private function removeDotSegments(string $pathAndSuffix): string
    {
        $pathLength = strcspn($pathAndSuffix, '?#');
        $path = substr($pathAndSuffix, 0, $pathLength);
        $suffix = substr($pathAndSuffix, $pathLength);

        $segments = explode('/', $path);
        $lastIndex = count($segments) - 1;
        $resolved = [];

        foreach ($segments as $index => $segment) {
            if ($segment !== '.' && $segment !== '..') {
                $resolved[] = $segment;

                continue;
            }

            // Never pop the leading empty segment that represents the root "/".
            if ($segment === '..' && count($resolved) > 1) {
                array_pop($resolved);
            }
            // A trailing dot segment refers to a directory, so keep the final slash.
            if ($index === $lastIndex) {
                $resolved[] = '';
            }
        }

        return implode('/', $resolved).$suffix;
    }

    /**
     * Tell whether a URL is an http(s) URL whose host is pedaily.cn or one of its subdomains.
     */
    private function isPedailyUrl(string $url): bool
    {
        $scheme = parse_url($url, PHP_URL_SCHEME);
        $host = parse_url($url, PHP_URL_HOST);
        if (! is_string($scheme) || ! is_string($host)) {
            return false;
        }
        if (! in_array(strtolower($scheme), ['http', 'https'], true)) {
            return false;
        }

        $host = strtolower($host);

        return $host === self::ALLOWED_DOMAIN || str_ends_with($host, '.'.self::ALLOWED_DOMAIN);
    }

    /**
     * Reduce an HTML fragment to trimmed plain text.
     *
     * Tags are stripped before entities are decoded so that escaped text such as
     * `&lt;b&gt;` survives as literal characters instead of being removed as a tag.
     */
    private function plainText(string $html): string
    {
        return trim(html_entity_decode(strip_tags($html), ENT_QUOTES | ENT_HTML5, 'UTF-8'));
    }
}
|