You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
232 lines
8.0 KiB
232 lines
8.0 KiB
<?php
|
|
|
|
namespace App\Services\Crawl\Adapters;
|
|
|
|
use App\Models\CrawlSource;
|
|
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
|
|
use App\Services\Crawl\CrawlItemDto;
|
|
use App\Services\Crawl\CrawlKeywordParser;
|
|
use App\Services\Crawl\NewsCategoryMatcher;
|
|
use App\Services\Crawl\NewsHtmlImageLocalizer;
|
|
use App\Services\Crawl\HtmlCrawlSupport;
|
|
use App\Services\Crawl\HtmlPagination;
|
|
use App\Services\Crawl\NewsContentHtml;
|
|
use Illuminate\Support\Str;
|
|
|
|
/**
 * Crawler adapter for pedaily.cn listing and article pages.
 *
 * Flow implemented by fetch():
 *  1. download up to `max_pages` listing pages,
 *  2. extract candidate article links from each page,
 *  3. pre-filter candidates by keyword on the link title,
 *  4. download each remaining article, re-check keywords against the full text,
 *     resolve a category and localize images,
 *  5. return at most `max_results` enriched items.
 */
class PedailyHtmlAdapter implements CrawlerAdapterInterface
{
    /** Domain (and its subdomains) this adapter is allowed to request. */
    private const SITE_DOMAIN = 'pedaily.cn';

    /** Listing page used when neither the request URL nor the source entry URL is usable. */
    private const DEFAULT_LIST_URL = 'https://www.pedaily.cn/all/';

    public function __construct(
        protected NewsCategoryMatcher $categoryMatcher,
        protected NewsHtmlImageLocalizer $imageLocalizer,
    ) {}

    /**
     * Crawl listing pages and return keyword-matching articles with full content.
     *
     * Recognised $params keys:
     *  - `keyword`     raw keyword string, parsed by CrawlKeywordParser::parse()
     *  - `max_results` clamped to 1..50, default 30
     *  - `max_pages`   clamped to 1..50, default 1
     *
     * @param  string       $requestUrl listing URL; ignored unless its host belongs to pedaily.cn
     * @param  CrawlSource  $source     its `entry_url` is the fallback listing URL
     * @param  array        $params     see above
     * @return list<CrawlItemDto>
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        $maxResults = min(50, max(1, (int) ($params['max_results'] ?? 30)));
        $maxPages = min(50, max(1, (int) ($params['max_pages'] ?? 1)));

        // The host is parsed rather than substring-matched, so a foreign URL that merely
        // mentions "pedaily.cn" in its path or query is not crawled.
        $fetchUrl = $this->isSiteUrl($requestUrl)
            ? $requestUrl
            : ($source->entry_url ?: self::DEFAULT_LIST_URL);

        // Collect list items across pages, de-duplicated by external id.
        $items = [];
        $seen = [];
        foreach (HtmlPagination::fetchPagesHtml($fetchUrl, $maxPages) as $html) {
            foreach ($this->parseListHtml($html, $fetchUrl) as $item) {
                if (isset($seen[$item->externalId])) {
                    continue;
                }
                $seen[$item->externalId] = true;
                $items[] = $item;
            }
        }

        // Cheap pre-filter on list data so article pages are only downloaded for plausible hits.
        $items = array_values(array_filter(
            $items,
            static fn (CrawlItemDto $item) => CrawlKeywordParser::matchesAny($item->title, $item->summary, $keywords)
        ));

        $enriched = [];
        foreach ($items as $item) {
            if (count($enriched) >= $maxResults) {
                break;
            }

            $articleUrl = $item->canonicalUrl ?? '';
            $detail = $this->fetchArticleDetail($articleUrl);

            // Prefer the article page's title, then the cleaned list title, then the raw list title.
            $title = HtmlCrawlSupport::cleanArticleTitle($detail['title'] ?? null)
                ?: HtmlCrawlSupport::cleanArticleTitle($item->title)
                ?: $item->title;

            $rawHtml = $detail['content_html'] ?? $item->contentHtml;
            $contentHtml = $this->imageLocalizer->localize($rawHtml, $articleUrl);
            $plainText = $contentHtml ? trim(strip_tags($contentHtml)) : '';
            $matchText = $plainText !== '' ? $plainText : null;
            $publishedAt = $detail['published_at'] ?? $item->publishedAt;

            // Second keyword check, now against the final title and the article body.
            if (! CrawlKeywordParser::matchesAny($title, $matchText, $keywords)) {
                continue;
            }

            $categoryId = $this->categoryMatcher->resolveCategoryId($title, $matchText, $keywords);
            $categoryLabel = $this->categoryMatcher->labelForId($categoryId);

            $enriched[] = new CrawlItemDto(
                externalId: $item->externalId,
                title: $title,
                canonicalUrl: $item->canonicalUrl,
                summary: null,
                publishedAt: $publishedAt,
                contentHtml: $contentHtml,
                extra: [
                    'platform' => 'pedaily',
                    'keywords' => $keywords,
                    'category_dict_item_id' => $categoryId,
                    'category_label' => $categoryLabel,
                ],
            );
        }

        return $enriched;
    }

    /**
     * Extract candidate article links from one listing page.
     *
     * A link is kept when its text is 8-200 characters with no nested tags, is not
     * navigation boilerplate, resolves to a pedaily.cn host, and its URL looks like an
     * article (known section segment or an .html/.shtml suffix).
     *
     * @param  string  $html     listing page markup
     * @param  string  $baseUrl  URL the markup was fetched from, used to resolve relative links
     * @return list<CrawlItemDto>
     */
    protected function parseListHtml(string $html, string $baseUrl): array
    {
        $found = preg_match_all(
            '#<a[^>]+href=["\']([^"\']+)["\'][^>]*>([^<]{8,200})</a>#iu',
            $html,
            $matches,
            PREG_SET_ORDER
        );
        if (! $found) {
            return [];
        }

        // The date is extracted from the whole page, so it is identical for every link;
        // compute it once instead of once per link. It is only a page-level fallback:
        // fetch() prefers the date found on the article page itself.
        $pageDate = HtmlCrawlSupport::extractDateFromText($html);

        $items = [];
        $seen = [];
        foreach ($matches as $m) {
            $href = html_entity_decode(trim($m[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8');
            $title = $this->plainText($m[2]);

            if (HtmlCrawlSupport::isSkippableLinkTitle($title) || HtmlCrawlSupport::isWeakLinkTitle($title)) {
                continue;
            }
            // Login / register / more / next page / previous page navigation links.
            if (Str::contains($title, ['登录', '注册', '更多', '下一页', '上一页'])) {
                continue;
            }

            $url = $this->absoluteUrl($href, $baseUrl);
            if ($url === null || ! $this->isSiteUrl($url)) {
                continue;
            }
            if (! preg_match('#/(article|news|inners|vcpe|company)/#i', $url) && ! preg_match('#\.s?html#i', $url)) {
                continue;
            }

            $key = md5($url);
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;

            $items[] = new CrawlItemDto(
                externalId: 'pedaily:'.$key,
                title: $title,
                canonicalUrl: $url,
                publishedAt: $pageDate,
                extra: ['platform' => 'pedaily'],
            );
        }

        return $items;
    }

    /**
     * Download an article page and pull out its title, publish date and body markup.
     *
     * This is best-effort: a non-pedaily URL or any failure while downloading yields the
     * all-null result so the caller can fall back to the data from the listing page.
     *
     * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}
     */
    protected function fetchArticleDetail(string $url): array
    {
        $empty = [
            'title' => null,
            'summary' => null,
            'content_html' => null,
            'published_at' => null,
        ];

        // Never request hosts outside the site, even if a listing page links to them.
        if ($url === '' || ! $this->isSiteUrl($url)) {
            return $empty;
        }

        try {
            $html = HtmlCrawlSupport::fetchHtml($url, 20);
        } catch (\Throwable) {
            // Deliberately swallowed: one unreachable article must not abort the whole crawl.
            return $empty;
        }

        // Title: the dedicated headline element first, otherwise the first <h1>.
        $title = null;
        if (preg_match('#<h1[^>]*id=["\']newstitle["\'][^>]*>(.*?)</h1>#is', $html, $m)) {
            $title = $this->plainText($m[1]);
        } elseif (preg_match('#<h1[^>]*>(.*?)</h1>#is', $html, $m)) {
            $title = $this->plainText($m[1]);
        }
        $title = HtmlCrawlSupport::cleanArticleTitle($title);

        // Publish date: machine-readable datetime attribute first, then the visible <time> text.
        $publishedAt = null;
        if (preg_match('#<time[^>]+datetime=["\']([^"\']+)["\']#i', $html, $m)) {
            $publishedAt = HtmlCrawlSupport::normalizeDate($m[1]);
        } elseif (preg_match('#<span class="date"[^>]*>\s*<time[^>]*>([^<]+)</time>#is', $html, $m)) {
            $publishedAt = HtmlCrawlSupport::normalizeDate(trim($m[1]));
        }

        // Body: known content containers first, generic extraction as the last resort.
        // NOTE(review): these non-greedy patterns stop at the first closing </div>, so a
        // body containing nested <div> elements is truncated — consider a DOM parser.
        $contentHtml = null;
        if (preg_match('#<div[^>]+id=["\']news-content["\'][^>]*>(.*?)</div>\s*<div#is', $html, $m)) {
            $contentHtml = trim($m[1]);
        } elseif (preg_match('#<div[^>]+id=["\']article-body["\'][^>]*>(.*?)</div>#is', $html, $m)) {
            $contentHtml = trim($m[1]);
        }
        if ($contentHtml === null || $contentHtml === '') {
            $contentHtml = NewsContentHtml::extractBody($html);
        }

        return [
            'title' => $title,
            'summary' => null,
            'content_html' => $contentHtml,
            'published_at' => $publishedAt,
        ];
    }

    /**
     * Normalize a date that starts with `Y-m-d` or `Y/m/d` to zero-padded `YYYY-MM-DD`.
     *
     * Anything after the day (for example a time part) is ignored.
     *
     * @return string|null the normalized date, or null when the input does not start with
     *                     such a date or the date does not exist on the calendar
     */
    protected function normalizeDate(string $raw): ?string
    {
        $raw = str_replace('/', '-', trim($raw));
        if (! preg_match('#^(\d{4})-(\d{1,2})-(\d{1,2})#', $raw, $m)) {
            return null;
        }

        $year = (int) $m[1];
        $month = (int) $m[2];
        $day = (int) $m[3];

        // Reject well-formed but impossible dates such as 2024-13-45.
        if (! checkdate($month, $day, $year)) {
            return null;
        }

        return sprintf('%04d-%02d-%02d', $year, $month, $day);
    }

    /**
     * Resolve a link target against the URL of the page it was found on.
     *
     * Handles absolute http(s) URLs, protocol-relative (`//host/x`), root-relative (`/x`),
     * query-only (`?a=b`) and document-relative (`x.html`, `../x.html`) references.
     *
     * @param  string  $href  raw href value
     * @param  string  $base  absolute URL of the containing page
     * @return string|null absolute URL, or null when $href is empty, a pure fragment, uses a
     *                     non-http(s) scheme (javascript:, mailto:, ...) or $base is not absolute
     */
    protected function absoluteUrl(string $href, string $base): ?string
    {
        $href = trim($href);
        if ($href === '' || $href[0] === '#') {
            return null;
        }

        // Require the full "http://" / "https://" prefix: a bare "http" prefix would also
        // match relative paths such as "http-guide.html".
        if (preg_match('#^https?://#i', $href)) {
            return $href;
        }
        // Any other explicit scheme is not a crawlable page.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href)) {
            return null;
        }

        $parts = parse_url($base);
        if (! $parts || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        $origin = $parts['scheme'].'://'.$parts['host'].(isset($parts['port']) ? ':'.$parts['port'] : '');

        if (str_starts_with($href, '//')) {
            return $parts['scheme'].':'.$href;
        }
        if ($href[0] === '/') {
            return $origin.$href;
        }

        $basePath = $parts['path'] ?? '/';
        if ($href[0] === '?') {
            return $origin.$basePath.$href;
        }

        // Document-relative: resolve against the directory of the base path, not the site root.
        $lastSlash = strrpos($basePath, '/');
        $directory = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);

        return $origin.$this->removeDotSegments($directory.$href);
    }

    /**
     * Tell whether $url is an http(s) URL whose host is pedaily.cn or one of its subdomains.
     */
    private function isSiteUrl(string $url): bool
    {
        $parts = parse_url($url);
        if (! is_array($parts) || empty($parts['host']) || empty($parts['scheme'])) {
            return false;
        }
        if (! in_array(strtolower($parts['scheme']), ['http', 'https'], true)) {
            return false;
        }

        $host = strtolower($parts['host']);

        return $host === self::SITE_DOMAIN || str_ends_with($host, '.'.self::SITE_DOMAIN);
    }

    /**
     * Convert an HTML fragment to trimmed plain text.
     *
     * Tags are stripped BEFORE entities are decoded; the reverse order would turn encoded
     * text such as "&lt;b&gt;" into a real tag and then delete it.
     */
    private function plainText(string $markup): string
    {
        return trim(html_entity_decode(strip_tags($markup), ENT_QUOTES | ENT_HTML5, 'UTF-8'));
    }

    /**
     * Collapse "." and ".." segments of an absolute path, leaving any query/fragment untouched.
     */
    private function removeDotSegments(string $path): string
    {
        $suffixAt = strcspn($path, '?#');
        $suffix = substr($path, $suffixAt);
        $segments = explode('/', ltrim(substr($path, 0, $suffixAt), '/'));
        $last = $segments[count($segments) - 1];

        $kept = [];
        foreach ($segments as $segment) {
            if ($segment === '.') {
                continue;
            }
            if ($segment === '..') {
                array_pop($kept);

                continue;
            }
            $kept[] = $segment;
        }
        // A trailing "." or ".." refers to a directory, so keep the trailing slash.
        if ($last === '.' || $last === '..') {
            $kept[] = '';
        }

        return '/'.implode('/', $kept).$suffix;
    }
}
|