<?php
|
||
|
|
|
||
|
|
namespace App\Services\Crawl\Adapters;
|
||
|
|
|
||
|
|
use App\Models\CrawlSource;
|
||
|
|
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
|
||
|
|
use App\Services\Crawl\CrawlItemDto;
|
||
|
|
use App\Services\Crawl\CrawlKeywordParser;
|
||
|
|
use App\Services\Crawl\HtmlCrawlSupport;
|
||
|
|
use App\Services\Crawl\HtmlPagination;
|
||
|
|
use App\Services\Crawl\NewsCategoryMatcher;
|
||
|
|
use App\Services\Crawl\NewsContentHtml;
|
||
|
|
use App\Services\Crawl\NewsHtmlImageLocalizer;
|
||
|
|
use Illuminate\Support\Str;
|
||
|
|
|
||
|
|
class GenericNewsHtmlAdapter implements CrawlerAdapterInterface
|
||
|
|
{
|
||
|
|
/**
 * @param  NewsCategoryMatcher  $categoryMatcher  used by fetch() to resolve a category id and its label
 * @param  NewsHtmlImageLocalizer  $imageLocalizer  used by fetch() to post-process article body HTML
 */
public function __construct(
    protected NewsCategoryMatcher $categoryMatcher,
    protected NewsHtmlImageLocalizer $imageLocalizer,
) {
}
|
||
|
|
|
||
|
|
/**
 * Crawl a generic HTML news site and return enriched, keyword-matching items.
 *
 * Flow: collect article links from up to `max_pages` list pages; when at most
 * one link was found, try to treat the request URL itself as a single article;
 * pre-filter on the list-level title/summary; then fetch each remaining
 * article for its full title, body, summary and publication date.
 *
 * Recognised $params keys:
 *  - 'keyword':     raw keyword string handed to CrawlKeywordParser::parse()
 *  - 'max_results': clamped to 1..50, default 30
 *  - 'max_pages':   clamped to 1..50, default 1
 *
 * $source is required by the interface signature but is not read here.
 *
 * @param  array<string, mixed>  $params
 * @return list<CrawlItemDto>
 */
public function fetch(string $requestUrl, CrawlSource $source, array $params): array
{
    $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
    $maxResults = min(50, max(1, (int) ($params['max_results'] ?? 30)));
    $maxPages = min(50, max(1, (int) ($params['max_pages'] ?? 1)));

    $pageHost = HtmlCrawlSupport::hostKey($requestUrl);
    $items = [];
    $seen = [];
    $firstHtml = null;

    foreach (HtmlPagination::fetchPagesHtml($requestUrl, $maxPages) as $html) {
        // Keep the first page so the single-article fallback below can reuse
        // it instead of downloading the same URL a second time.
        $firstHtml ??= $html;

        foreach ($this->parseListHtml($html, $requestUrl, $pageHost) as $item) {
            if (isset($seen[$item->externalId])) {
                continue;
            }
            $seen[$item->externalId] = true;
            $items[] = $item;
        }
    }

    // Zero or one link found: the request URL may be an article, not a list.
    if (count($items) <= 1) {
        $firstHtml ??= '';
        if ($firstHtml !== '' && $this->looksLikeArticlePage($firstHtml)) {
            $single = $this->parseArticleFromHtml($firstHtml, $requestUrl);
            if ($single !== null) {
                $items = [$single];
            }
        }
    }

    // Cheap pre-filter on list-level data before any per-article request.
    $items = array_values(array_filter(
        $items,
        fn (CrawlItemDto $item) => CrawlKeywordParser::matchesAny($item->title, $item->summary, $keywords)
    ));

    $enriched = [];
    foreach ($items as $item) {
        if (count($enriched) >= $maxResults) {
            break;
        }

        $detail = $this->fetchArticleDetail($item->canonicalUrl ?? '');

        // Prefer the article page's own title, then the cleaned link text,
        // then the raw link text.
        $title = HtmlCrawlSupport::cleanArticleTitle($detail['title'] ?? null)
            ?: HtmlCrawlSupport::cleanArticleTitle($item->title)
            ?: $item->title;
        if (HtmlCrawlSupport::isWeakLinkTitle($title)) {
            continue;
        }

        $articleUrl = $item->canonicalUrl ?? '';
        $contentHtml = $detail['content_html'] ?? $item->contentHtml;
        if ($contentHtml) {
            // NOTE(review): this runs before the keyword check below, so
            // articles that end up rejected are still passed through the
            // localizer — consider reordering if localize() is expensive.
            $contentHtml = $this->imageLocalizer->localize($contentHtml, $articleUrl);
        }
        $plainForMatch = $contentHtml ? trim(strip_tags($contentHtml)) : '';
        $publishedAt = $detail['published_at'] ?? $item->publishedAt;

        // Second keyword check, this time against the full article text.
        if (! CrawlKeywordParser::matchesAny($title, $plainForMatch !== '' ? $plainForMatch : null, $keywords)) {
            continue;
        }

        $categoryId = $this->categoryMatcher->resolveCategoryId(
            $title,
            $plainForMatch !== '' ? $plainForMatch : null,
            $keywords
        );

        $enriched[] = new CrawlItemDto(
            externalId: $item->externalId,
            title: $title,
            canonicalUrl: $item->canonicalUrl,
            summary: $detail['summary'] ?? $item->summary,
            publishedAt: $publishedAt,
            contentHtml: $contentHtml,
            extra: [
                'platform' => 'generic_html',
                'keywords' => $keywords,
                'category_dict_item_id' => $categoryId,
                'category_label' => $this->categoryMatcher->labelForId($categoryId),
            ],
        );
    }

    return $enriched;
}
|
||
|
|
|
||
|
|
/**
 * Collect candidate article links from a list page.
 *
 * Two passes share one de-duplication map keyed by externalId: first the
 * first anchor inside every <li>, then every anchor anywhere in the page.
 * Items are returned in discovery order.
 *
 * @return list<CrawlItemDto>
 */
protected function parseListHtml(string $html, string $baseUrl, ?string $pageHost): array
{
    // externalId => item; insertion order is the discovery order.
    $collected = [];
    $remember = static function (?CrawlItemDto $candidate) use (&$collected): void {
        if ($candidate !== null && ! isset($collected[$candidate->externalId])) {
            $collected[$candidate->externalId] = $candidate;
        }
    };

    // Pass 1: <li> blocks, where the block content doubles as date context.
    if (preg_match_all('#<li[^>]*>(.*?)</li>#isu', $html, $listBlocks, PREG_SET_ORDER)) {
        foreach ($listBlocks as $listBlock) {
            $remember($this->extractLinkFromFragment($listBlock[1], $baseUrl, $pageHost));
        }
    }

    // Pass 2: every anchor in the page.
    // NOTE(review): the whole page is passed as the date context here, so
    // each of these anchors is dated from the full document rather than from
    // its own surroundings — confirm this is intended.
    $anchorPattern = '#<a[^>]+href=["\']([^"\']+)["\'][^>]*>(.*?)</a>#isu';
    if (preg_match_all($anchorPattern, $html, $anchors, PREG_SET_ORDER)) {
        foreach ($anchors as $anchor) {
            $remember($this->buildListItem($anchor[1], $anchor[2], $html, $baseUrl, $pageHost));
        }
    }

    return array_values($collected);
}
|
||
|
|
|
||
|
|
/**
 * Build a list item from the first anchor found in an HTML fragment.
 *
 * The fragment itself is passed on as the date context. Returns null when
 * the fragment contains no anchor or buildListItem() rejects the link.
 */
protected function extractLinkFromFragment(string $fragment, string $baseUrl, ?string $pageHost): ?CrawlItemDto
{
    $hasAnchor = preg_match('#<a[^>]+href=["\']([^"\']+)["\'][^>]*>(.*?)</a>#isu', $fragment, $anchor);

    return $hasAnchor
        ? $this->buildListItem($anchor[1], $anchor[2], $fragment, $baseUrl, $pageHost)
        : null;
}
|
||
|
|
|
||
|
|
/**
 * Turn one anchor (href + inner HTML) into a list-level CrawlItemDto.
 *
 * Returns null when the link text is skippable or weak, the URL cannot be
 * made absolute or points at an asset, the link leaves $pageHost (when one
 * is given), or the URL does not look like a news article.
 *
 * @param  string  $context  HTML searched for a publication date
 */
protected function buildListItem(
    string $href,
    string $linkInner,
    string $context,
    string $baseUrl,
    ?string $pageHost,
): ?CrawlItemDto {
    // Link text: decode entities, drop markup, collapse whitespace runs.
    $linkText = strip_tags(html_entity_decode($linkInner));
    $title = trim(preg_replace('/\s+/u', ' ', $linkText));

    $titleIsUsable = ! HtmlCrawlSupport::isSkippableLinkTitle($title)
        && ! HtmlCrawlSupport::isWeakLinkTitle($title);
    if (! $titleIsUsable) {
        return null;
    }

    $url = HtmlCrawlSupport::absoluteUrl($href, $baseUrl);
    if (! $url || HtmlCrawlSupport::isAssetPath($url)) {
        return null;
    }

    $leavesSite = $pageHost && ! HtmlCrawlSupport::sameHost($pageHost, HtmlCrawlSupport::hostKey($url));
    if ($leavesSite || ! $this->looksLikeNewsArticleUrl($url, $baseUrl)) {
        return null;
    }

    return new CrawlItemDto(
        externalId: 'news:'.md5($url),
        title: $title,
        canonicalUrl: $url,
        publishedAt: HtmlCrawlSupport::extractDateFromText($context),
        extra: ['platform' => 'generic_html'],
    );
}
|
||
|
|
|
||
|
|
/**
 * Heuristic: does $url look like an individual article rather than a list page?
 *
 * Rejects the list URL itself (ignoring trailing slashes) and URLs with an
 * empty or root path. Accepts, in order: a path with a news-like section
 * segment, a path with a /YYYY/MM/DD/-style date, a page-extension path that
 * is not an index/list/category/tag/search page, and finally an .html/.shtml
 * file whose name is at least 8 characters of letters, digits or dashes.
 */
protected function looksLikeNewsArticleUrl(string $url, string $listUrl): bool
{
    $isListPageItself = rtrim($url, '/') === rtrim($listUrl, '/');
    if ($isListPageItself) {
        return false;
    }

    $path = strtolower((string) parse_url($url, PHP_URL_PATH));
    if (in_array($path, ['', '/'], true)) {
        return false;
    }

    // Section segment or dated path.
    if (
        preg_match('#/(news|article|content|detail|post|story|infor|view|show|archives?)/#i', $path)
        || preg_match('#/\d{4}[/\-]\d{1,2}[/\-]\d{1,2}/#', $path)
    ) {
        return true;
    }

    // Page-like extension, unless the page is an index/listing.
    if (
        preg_match('#\.(s?html?|php|aspx?)(\?|$)#i', $path)
        && ! preg_match('#/(index|list|category|tag|search)(\.|/|$)#i', $path)
    ) {
        return true;
    }

    // Last resort: a long slug-like .html/.shtml file name.
    return preg_match('#/[a-z0-9\-]{8,}\.s?html?$#i', $path) === 1;
}
|
||
|
|
|
||
|
|
/**
 * Heuristic: is this HTML a single article page?
 *
 * True when the first <h1> holds at least 8 characters of text and
 * NewsContentHtml::extractBody() finds a body. The body extraction is only
 * attempted once the heading check has passed.
 */
protected function looksLikeArticlePage(string $html): bool
{
    $hasHeading = preg_match('#<h1[^>]*>(.*?)</h1>#is', $html, $heading);
    if (! $hasHeading) {
        return false;
    }

    $headingText = trim(strip_tags(html_entity_decode($heading[1])));
    if (Str::length($headingText) < 8) {
        return false;
    }

    return NewsContentHtml::extractBody($html) !== null;
}
|
||
|
|
|
||
|
|
/**
 * Build a CrawlItemDto from the HTML of a single article page.
 *
 * Returns null when no usable title can be extracted. The external id is
 * derived from the md5 of $url.
 */
protected function parseArticleFromHtml(string $html, string $url): ?CrawlItemDto
{
    $parsed = $this->parseArticleDetailFromHtml($html);

    $title = HtmlCrawlSupport::cleanArticleTitle($parsed['title'] ?? null);
    if (in_array($title, [null, ''], true)) {
        return null;
    }

    return new CrawlItemDto(
        externalId: 'news:'.md5($url),
        title: $title,
        canonicalUrl: $url,
        summary: $parsed['summary'],
        publishedAt: $parsed['published_at'],
        contentHtml: $parsed['content_html'],
        extra: ['platform' => 'generic_html'],
    );
}
|
||
|
|
|
||
|
|
/**
 * Download an article page and parse its details.
 *
 * Best-effort: an empty URL or any error thrown while fetching yields an
 * array with every field set to null. Errors thrown while parsing are not
 * caught here.
 *
 * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}
 */
protected function fetchArticleDetail(string $url): array
{
    $blank = array_fill_keys(['title', 'summary', 'content_html', 'published_at'], null);

    if ($url === '') {
        return $blank;
    }

    try {
        $pageHtml = HtmlCrawlSupport::fetchHtml($url, 20);
    } catch (\Throwable) {
        // Deliberately swallowed: the caller falls back to list-level data.
        return $blank;
    }

    return $this->parseArticleDetailFromHtml($pageHtml);
}
|
||
|
|
|
||
|
|
/**
 * Extract title, summary, body HTML and publication date from an article page.
 *
 * Title sources, first non-empty wins: og:title meta, twitter:title meta,
 * first <h1>, <title>. The result is passed through
 * HtmlCrawlSupport::cleanArticleTitle(). The summary comes from the
 * description meta tag only.
 *
 * @return array{title:?string, summary:?string, content_html:?string, published_at:?string}
 */
protected function parseArticleDetailFromHtml(string $html): array
{
    $title = $this->extractMetaContent($html, 'property', 'og:title')
        ?? $this->extractMetaContent($html, 'name', 'twitter:title');

    if ($title === null) {
        if (preg_match('#<h1[^>]*>(.*?)</h1>#is', $html, $m)) {
            $title = trim(strip_tags(html_entity_decode($m[1])));
        } elseif (preg_match('#<title[^>]*>(.*?)</title>#is', $html, $m)) {
            $title = trim(strip_tags(html_entity_decode($m[1])));
        }
    }
    $title = HtmlCrawlSupport::cleanArticleTitle($title);

    $publishedAt = $this->extractPublishedAt($html);

    $contentHtml = NewsContentHtml::extractBody($html);

    $summary = $this->extractMetaContent($html, 'name', 'description');

    return [
        'title' => $title,
        'summary' => $summary,
        'content_html' => $contentHtml,
        'published_at' => $publishedAt,
    ];
}

/**
 * Return the entity-decoded, trimmed content value of the first <meta> tag
 * whose $attribute equals $key, or null when no such tag has a non-empty value.
 *
 * The value is delimited by its own opening quote, so an apostrophe inside a
 * double-quoted value (content="Editor's pick") is kept rather than ending
 * the capture. Both attribute orders are accepted, because HTML does not
 * define an order for attributes.
 */
private function extractMetaContent(string $html, string $attribute, string $key): ?string
{
    $keyPart = '\s'.preg_quote($attribute, '#').'\s*=\s*(?<kq>["\'])'.preg_quote($key, '#').'\k<kq>';
    // Tempered token: consume characters until the quote that opened the value.
    $valuePart = '\scontent\s*=\s*(?<vq>["\'])(?<value>(?:(?!\k<vq>).)+)\k<vq>';

    $patterns = [
        '#<meta(?=\s)[^>]*?'.$keyPart.'[^>]*?'.$valuePart.'#is',
        '#<meta(?=\s)[^>]*?'.$valuePart.'[^>]*?'.$keyPart.'#is',
    ];

    foreach ($patterns as $pattern) {
        if (preg_match($pattern, $html, $m) === 1) {
            $value = trim(html_entity_decode($m['value']));
            if ($value !== '') {
                return $value;
            }
        }
    }

    return null;
}
|
||
|
|
|
||
|
|
/**
 * Find the article's publication date in its HTML.
 *
 * Strategies, tried in order until one yields a value that
 * HtmlCrawlSupport::normalizeDate() accepts:
 *  1. article:published_time / og:published_time / publishdate|pubdate|date meta tags
 *  2. any <time datetime="..."> attribute
 *  3. the text of a <time> element
 *  4. the text of a <span> whose class mentions date, time or publish
 *  5. HtmlCrawlSupport::extractDateFromText() over the whole document
 *
 * A candidate that fails to normalise no longer ends the search; the next
 * strategy is tried instead.
 */
protected function extractPublishedAt(string $html): ?string
{
    $metaPatterns = [
        '#<meta[^>]+property=["\']article:published_time["\'][^>]+content=["\']([^"\']+)#i',
        '#<meta[^>]+property=["\']og:published_time["\'][^>]+content=["\']([^"\']+)#i',
        '#<meta[^>]+name=["\'](?:publishdate|pubdate|date)["\'][^>]+content=["\']([^"\']+)#i',
    ];
    foreach ($metaPatterns as $pattern) {
        if (preg_match($pattern, $html, $m)) {
            $parsed = HtmlCrawlSupport::normalizeDate($m[1]);
            if ($parsed) {
                return $parsed;
            }
        }
    }

    if (preg_match_all('#<time[^>]+datetime=["\']([^"\']+)["\'][^>]*>#i', $html, $m)) {
        foreach ($m[1] as $dt) {
            $parsed = HtmlCrawlSupport::normalizeDate($dt);
            if ($parsed) {
                return $parsed;
            }
        }
    }

    $textPatterns = [
        '#<time[^>]*>([^<]{6,40})</time>#i',
        '#<span[^>]*class=["\'][^"\']*(?:date|time|publish)[^"\']*["\'][^>]*>([^<]{6,40})</span>#i',
    ];
    foreach ($textPatterns as $pattern) {
        if (preg_match($pattern, $html, $m)) {
            $parsed = HtmlCrawlSupport::normalizeDate($m[1]);
            if ($parsed) {
                return $parsed;
            }
        }
    }

    return HtmlCrawlSupport::extractDateFromText($html);
}
|
||
|
|
|
||
|
|
}
|