You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
266 lines
8.7 KiB
266 lines
8.7 KiB
<?php
|
|
|
|
namespace App\Services\Crawl\Adapters;
|
|
|
|
use App\Models\CrawlSource;
|
|
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
|
|
use App\Services\Crawl\CrawlAuthorParser;
|
|
use App\Services\Crawl\CrawlItemDto;
|
|
use App\Services\Crawl\CrawlKeywordParser;
|
|
use App\Services\Crawl\HtmlCrawlSupport;
|
|
use Illuminate\Support\Str;
|
|
|
|
/**
 * Crawler adapter for arbitrary HTML pages that list, or are, academic papers.
 *
 * A page is first scanned for same-host links whose URL looks like a paper;
 * when at most one such link is found and the page itself looks like a paper
 * page, the page is parsed as a single paper instead. Metadata is read from
 * Highwire-style `citation_*` meta tags, with `<h1>` and an "abstract" `<div>`
 * used as fallbacks.
 */
class GenericPaperHtmlAdapter implements CrawlerAdapterInterface
{
    /**
     * Crawl $requestUrl and return keyword-matching papers enriched from their detail pages.
     *
     * Recognised $params keys:
     *  - 'keyword':     raw keyword string, handed to CrawlKeywordParser::parse().
     *  - 'max_results': result cap; defaults to 20 and is clamped to the range 1..50.
     *
     * $source is not read by this method.
     *
     * Exceptions raised by HtmlCrawlSupport::fetchHtml() for $requestUrl are not caught here;
     * failures while fetching individual detail pages are tolerated (see fetchPaperDetail()).
     *
     * @param  array<string, mixed>  $params
     * @return list<CrawlItemDto>
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        $maxResults = min(50, max(1, (int) ($params['max_results'] ?? 20)));

        $html = HtmlCrawlSupport::fetchHtml($requestUrl);
        $pageHost = HtmlCrawlSupport::hostKey($requestUrl);

        $items = $this->parseListHtml($html, $requestUrl, $pageHost);

        // Zero or one candidate link plus paper-like markup: treat the page itself as the paper.
        if (count($items) <= 1 && $this->looksLikePaperPage($html)) {
            $single = $this->parsePaperFromHtml($html, $requestUrl);
            if ($single !== null) {
                $items = [$single];
            }
        }

        // Cheap pre-filter on what is already known, before any detail page is requested.
        $items = array_values(array_filter(
            $items,
            static fn (CrawlItemDto $item) => CrawlKeywordParser::matchesAny(
                $item->title,
                $item->summary,
                $keywords
            )
        ));

        $enriched = [];
        foreach ($items as $item) {
            if (count($enriched) >= $maxResults) {
                break;
            }

            $detailUrl = $item->canonicalUrl ?? '';

            // The requested page has already been downloaded; reuse its HTML instead of
            // issuing a second HTTP request for the very same URL.
            $detail = $detailUrl === $requestUrl
                ? $this->parsePaperDetailFromHtml($html)
                : $this->fetchPaperDetail($detailUrl);

            // `?:` on purpose: the parsed title may be '' (not only null) when nothing was found.
            $title = $detail['title'] ?: $item->title;
            $summary = $detail['summary'] ?? $item->summary;
            $authors = $detail['authors'] ?? $item->authors;
            $authorsParsed = $detail['authors_parsed'] ?? [];
            $publishedAt = $detail['published_at'] ?? $item->publishedAt;

            // Re-check with the enriched title/summary; the detail page may differ from the link text.
            if (! CrawlKeywordParser::matchesAny($title, $summary, $keywords)) {
                continue;
            }

            $lead = CrawlAuthorParser::leadAuthor($authors, $authorsParsed);

            $enriched[] = new CrawlItemDto(
                externalId: $item->externalId,
                title: $title,
                canonicalUrl: $item->canonicalUrl,
                authors: $authors,
                summary: $summary,
                publishedAt: $publishedAt,
                schoolName: $lead['university_name'] ?? null,
                extra: [
                    'platform' => 'generic_html',
                    'keyword' => implode(' ', $keywords),
                    'source' => 'html',
                    'authors_parsed' => $authorsParsed,
                    'lead_author' => $lead,
                ],
                authorsParsed: $authorsParsed,
            );
        }

        return $enriched;
    }

    /**
     * Extract candidate paper links from a listing page.
     *
     * Every `<a href>` is resolved against $baseUrl and kept only if its title is not
     * skippable, it is not an asset path, it is on the same host as the page (when
     * $pageHost is given) and its URL passes looksLikePaperUrl(). Duplicate URLs are
     * dropped; the external id is 'paper:' followed by the md5 of the resolved URL.
     *
     * @return list<CrawlItemDto>
     */
    protected function parseListHtml(string $html, string $baseUrl, ?string $pageHost): array
    {
        if (! preg_match_all(
            '#<a[^>]+href=["\']([^"\']+)["\'][^>]*>(.*?)</a>#isu',
            $html,
            $matches,
            PREG_SET_ORDER
        )) {
            return [];
        }

        $items = [];
        $seen = [];

        foreach ($matches as $m) {
            // Attribute values are entity-encoded in HTML (`&amp;` inside query strings).
            $href = $this->decodeEntities($m[1]);

            // preg_replace() yields null on invalid UTF-8; fall back to the unnormalised text.
            $text = $this->plainText($m[2]);
            $title = trim(preg_replace('/\s+/u', ' ', $text) ?? $text);
            if (HtmlCrawlSupport::isSkippableLinkTitle($title)) {
                continue;
            }

            $url = HtmlCrawlSupport::absoluteUrl($href, $baseUrl);
            if (! $url || HtmlCrawlSupport::isAssetPath($url)) {
                continue;
            }
            if ($pageHost && ! HtmlCrawlSupport::sameHost($pageHost, HtmlCrawlSupport::hostKey($url))) {
                continue;
            }
            if (! $this->looksLikePaperUrl($url, $baseUrl)) {
                continue;
            }

            $key = md5($url);
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;

            $items[] = new CrawlItemDto(
                externalId: 'paper:'.$key,
                title: $title,
                canonicalUrl: $url,
                extra: ['platform' => 'generic_html'],
            );
        }

        return $items;
    }

    /**
     * Heuristic: does $url point at an individual paper rather than at the listing itself?
     *
     * Returns false for the listing URL (ignoring trailing slashes and `#fragment`s) and
     * for URLs without a meaningful path; returns true for anything containing 'doi.org/',
     * for paths with a paper-like segment (e.g. '/paper/', '/abs/', '/thesis/') and for
     * paths ending in '.pdf'.
     */
    protected function looksLikePaperUrl(string $url, string $listUrl): bool
    {
        // An in-page anchor such as "list/#top" is still the listing page.
        $normalize = static fn (string $value): string => rtrim(explode('#', $value, 2)[0], '/');
        if ($normalize($url) === $normalize($listUrl)) {
            return false;
        }

        if (str_contains(strtolower($url), 'doi.org/')) {
            return true;
        }

        // parse_url() may return null/false; the cast turns both into ''.
        $path = strtolower((string) parse_url($url, PHP_URL_PATH));
        if ($path === '' || $path === '/') {
            return false;
        }

        $hasPaperSegment = preg_match(
            '#/(paper|papers|publication|publications|preprint|abs|arxiv|scholar|thesis|dissertation|doc|detail|view)/#i',
            $path
        ) === 1;

        // $path never contains the query string, so a plain suffix check is sufficient.
        return $hasPaperSegment || str_ends_with($path, '.pdf');
    }

    /**
     * Heuristic: does this HTML look like a single paper page?
     *
     * True when the first `citation_title` meta tag has non-blank content. If no such
     * tag exists, true when the first `<h1>` holds at least 8 characters of text.
     */
    protected function looksLikePaperPage(string $html): bool
    {
        $titles = $this->metaContents($html, 'citation_title');
        if ($titles !== []) {
            return trim($titles[0]) !== '';
        }

        if (preg_match('#<h1[^>]*>(.*?)</h1>#is', $html, $m)) {
            return Str::length(trim(strip_tags($m[1]))) >= 8;
        }

        return false;
    }

    /**
     * Build a DTO for a page that is itself a paper, or null when no title can be found.
     */
    protected function parsePaperFromHtml(string $html, string $url): ?CrawlItemDto
    {
        $detail = $this->parsePaperDetailFromHtml($html);
        if (($detail['title'] ?? '') === '') {
            return null;
        }

        return new CrawlItemDto(
            externalId: 'paper:'.md5($url),
            title: $detail['title'],
            canonicalUrl: $url,
            authors: $detail['authors'],
            summary: $detail['summary'],
            publishedAt: $detail['published_at'],
            extra: ['platform' => 'generic_html'],
            authorsParsed: $detail['authors_parsed'],
        );
    }

    /**
     * Download a paper's detail page and parse its metadata.
     *
     * Best effort: an empty URL or any failure while fetching yields an all-null result
     * (with an empty 'authors_parsed' list) so the caller can fall back to list data.
     *
     * @return array{title:?string, summary:?string, authors:?string, published_at:?string, authors_parsed:list<array<string,mixed>>}
     */
    protected function fetchPaperDetail(string $url): array
    {
        $empty = [
            'title' => null,
            'summary' => null,
            'authors' => null,
            'published_at' => null,
            'authors_parsed' => [],
        ];

        if ($url === '') {
            return $empty;
        }

        try {
            // NOTE(review): the second argument is presumably a timeout in seconds - confirm
            // against HtmlCrawlSupport::fetchHtml().
            $html = HtmlCrawlSupport::fetchHtml($url, 20);
        } catch (\Throwable) {
            // Deliberately swallowed: one unreachable detail page must not abort the whole crawl.
            return $empty;
        }

        return $this->parsePaperDetailFromHtml($html);
    }

    /**
     * Parse paper metadata out of a detail page.
     *
     *  - title:          last `citation_title` meta tag, else the first `<h1>`; '' when neither exists.
     *  - authors_parsed: one entry per non-blank `citation_author` meta tag, in document order.
     *  - authors:        the author names joined with '; ', or null when there are none.
     *  - summary:        first `citation_abstract` meta tag, else the first `<div>` whose class
     *                    contains "abstract"; null when neither exists.
     *  - published_at:   `citation_publication_date`, else `citation_date`, passed through
     *                    HtmlCrawlSupport::normalizeDate(); null when neither exists.
     *
     * @return array{title:string, summary:?string, authors:?string, published_at:?string, authors_parsed:list<array<string,mixed>>}
     */
    protected function parsePaperDetailFromHtml(string $html): array
    {
        $title = '';
        $titles = $this->metaContents($html, 'citation_title');
        if ($titles !== []) {
            $title = trim($this->decodeEntities($titles[array_key_last($titles)]));
        } elseif (preg_match('#<h1[^>]*>(.*?)</h1>#is', $html, $m)) {
            $title = trim($this->plainText($m[1]));
        }

        $authorsParsed = [];
        foreach ($this->metaContents($html, 'citation_author') as $rawName) {
            $name = trim($this->decodeEntities($rawName));
            if ($name === '') {
                continue;
            }
            $authorsParsed[] = [
                'name' => $name,
                'email' => null,
                'affiliation' => null,
                'university_name' => null,
            ];
        }

        $summary = null;
        $abstracts = $this->metaContents($html, 'citation_abstract');
        if ($abstracts !== []) {
            $summary = trim($this->decodeEntities($abstracts[0]));
        } elseif (preg_match('#<div[^>]+class=["\'][^"\']*abstract[^"\']*["\'][^>]*>(.*?)</div>#is', $html, $m)) {
            $summary = trim($this->plainText($m[1]));
        }

        $publishedAt = null;
        $dates = $this->metaContents($html, 'citation_publication_date');
        if ($dates === []) {
            $dates = $this->metaContents($html, 'citation_date');
        }
        if ($dates !== []) {
            $publishedAt = HtmlCrawlSupport::normalizeDate($dates[0]);
        }

        $authorNames = array_column($authorsParsed, 'name');

        return [
            'title' => $title,
            'summary' => $summary,
            'authors' => $authorNames !== [] ? implode('; ', $authorNames) : null,
            'published_at' => $publishedAt,
            'authors_parsed' => $authorsParsed,
        ];
    }

    /**
     * Collect the raw (still entity-encoded) `content` values of every
     * `<meta name="$name" content="...">` tag, in document order.
     *
     * The value is delimited by its own opening quote character, so an apostrophe inside
     * a double-quoted value (content="Alzheimer's disease") no longer truncates it, and
     * the `name` and `content` attributes may appear in either order. Tags whose content
     * is the empty string are skipped. The name comparison is case-insensitive.
     *
     * @return list<string>
     */
    private function metaContents(string $html, string $name): array
    {
        // Quoted attribute values are consumed as a unit so a '>' inside one does not end the tag.
        if (! preg_match_all('#<meta\b(?:[^>"\']++|"[^"]*+"|\'[^\']*+\')*+>#i', $html, $tags)) {
            return [];
        }

        // The lookbehind keeps attributes such as data-name / http-content from matching.
        $namePattern = '#(?<![\w-])name\s*=\s*(["\'])'.preg_quote($name, '#').'\1#i';

        $values = [];
        foreach ($tags[0] as $tag) {
            if (preg_match($namePattern, $tag) !== 1) {
                continue;
            }
            if (preg_match('#(?<![\w-])content\s*=\s*(["\'])(.*?)\1#is', $tag, $content) !== 1) {
                continue;
            }
            if ($content[2] !== '') {
                $values[] = $content[2];
            }
        }

        return $values;
    }

    /**
     * Decode HTML entities (both quote styles, HTML5 entity table) as UTF-8.
     */
    private function decodeEntities(string $text): string
    {
        return html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    }

    /**
     * Reduce an HTML fragment to text: strip tags first, then decode entities.
     *
     * The order matters: decoding first would turn escaped text such as "x&lt;y" into
     * "x<y", and strip_tags() would then discard everything from the '<' onwards.
     */
    private function plainText(string $fragment): string
    {
        return $this->decodeEntities(strip_tags($fragment));
    }
}
|