You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
257 lines
8.0 KiB
257 lines
8.0 KiB
<?php
|
|
|
|
namespace App\Services\Crawl;
|
|
|
|
use Illuminate\Support\Facades\Cache;
|
|
use Illuminate\Support\Facades\Http;
|
|
|
|
/**
 * Fills in missing publication dates / affiliations from the arXiv abstract page or HTML version
 * (on demand, single request, cached).
 */
|
|
class ArxivAbsEnricher
|
|
{
|
|
/**
 * @param ArxivRequestGate $gate Gate whose run() wraps each outgoing arXiv HTTP request made by this class.
 */
public function __construct(protected ArxivRequestGate $gate)
{
}
|
|
|
|
/**
 * Enriches a batch of crawl items while respecting the per-batch enrichment budget.
 *
 * When `crawl.arxiv.abs_enrich_enabled` is falsy, every item is only passed through
 * ensureLeadAuthor(). Otherwise at most `crawl.arxiv.abs_enrich_max` items (default 8,
 * never negative) are handed to enrichOne(); all remaining items go through
 * ensureLeadAuthor() only.
 *
 * Items that enrichOne() would reject up front (no arXiv id or no canonical URL) never
 * trigger a page fetch, so they are not counted against the budget.
 *
 * @param list<CrawlItemDto> $items
 * @return list<CrawlItemDto>
 */
public function enrichMany(array $items): array
{
    if (! config('crawl.arxiv.abs_enrich_enabled', true)) {
        return array_map(fn (CrawlItemDto $d) => $this->ensureLeadAuthor($d), $items);
    }

    $max = max(0, (int) config('crawl.arxiv.abs_enrich_max', 8));
    $enriched = 0;
    $out = [];

    foreach ($items as $dto) {
        // Same precondition enrichOne() applies before fetching anything: without both
        // values it returns early, so spending budget on such an item would be wasted.
        $fetchable = ! empty($dto->extra['arxiv_id']) && ! empty($dto->canonicalUrl);

        if (! $fetchable || $enriched >= $max || ! $this->shouldEnrich($dto)) {
            $out[] = $this->ensureLeadAuthor($dto);

            continue;
        }

        $out[] = $this->enrichOne($dto);
        $enriched++;
    }

    return $out;
}
|
|
|
|
/**
 * Enriches a single item from an arXiv page and returns a new DTO.
 *
 * The item is returned via ensureLeadAuthor() without any fetch when it has no arXiv id,
 * no canonical URL, or shouldEnrich() declines it. Otherwise the HTML version is tried
 * first (when preferred and `crawl.arxiv.try_html_version` is on), falling back to the
 * abstract page. A fetched page may replace the published date and the parsed authors;
 * the lead author, school name and `extra` entries are then rebuilt from the result.
 *
 * @param CrawlItemDto $dto Item to enrich; it is not modified.
 * @return CrawlItemDto Fresh DTO carrying the (possibly) enriched data.
 */
public function enrichOne(CrawlItemDto $dto): CrawlItemDto
{
    $arxivId = $dto->extra['arxiv_id'] ?? null;

    $eligible = $arxivId && $dto->canonicalUrl && $this->shouldEnrich($dto);
    if (! $eligible) {
        return $this->ensureLeadAuthor($dto);
    }

    $id = (string) $arxivId;
    $html = '';
    $source = null;
    $useHtmlParser = $this->shouldPreferHtmlEnrich($dto);

    // First choice: the rendered HTML version of the paper.
    if ($useHtmlParser && (bool) config('crawl.arxiv.try_html_version', true)) {
        $html = $this->fetchHtmlVersion($id);
        $source = $html !== '' ? 'arxiv_html' : null;
    }

    // Fallback: the abstract page, which is parsed with the abs-page parser only.
    if ($html === '') {
        $html = $this->fetchAbsHtml($id);
        if ($html !== '') {
            $source = 'abs_html';
            $useHtmlParser = false;
        }
    }

    $publishedAt = $dto->publishedAt;
    $authorsParsed = $dto->authorsParsed;

    if ($html !== '') {
        $publishedAt = ArxivMetadataParser::parsePublishedDate($html) ?? $publishedAt;

        if ($useHtmlParser) {
            $candidates = ArxivMetadataParser::parseAuthorsFromHtmlVersion($html);
            if ($candidates === []) {
                // HTML-version parser found nothing: retry the same markup with the abs parser.
                $candidates = ArxivMetadataParser::parseAuthorsFromAbsHtml($html);
            }
        } else {
            $candidates = ArxivMetadataParser::parseAuthorsFromAbsHtml($html);
        }

        // Keep the authors already on the DTO when the page yielded none.
        if ($candidates !== []) {
            $authorsParsed = $candidates;
        }
    }

    $lead = CrawlAuthorParser::leadAuthor($dto->authors, $authorsParsed);

    $extra = array_merge($dto->extra, [
        'authors_parsed' => $authorsParsed,
        'lead_author' => $lead,
    ]);
    if ($source !== null) {
        $extra['enriched_from'] = $source;
    }
    // Only fill the URLs when the item does not already carry them.
    $extra['pdf_url'] ??= ArxivMetadataParser::extractPdfUrl($html, $id);
    $extra['html_url'] ??= ArxivMetadataParser::extractHtmlUrl($html, $id);

    return new CrawlItemDto(
        externalId: $dto->externalId,
        title: $dto->title,
        canonicalUrl: $dto->canonicalUrl,
        authors: $dto->authors,
        summary: $dto->summary,
        publishedAt: $publishedAt,
        schoolName: $lead['university_name'] ?? $dto->schoolName,
        section: $dto->section,
        contentHtml: $dto->contentHtml,
        extra: $extra,
        authorsParsed: $authorsParsed,
    );
}
|
|
|
|
/**
 * Decides whether an item needs a page fetch to complete its metadata.
 *
 * `crawl.arxiv.abs_enrich_mode` set to 'never' or 'always' short-circuits the decision.
 * In any other mode (default 'auto') enrichment is skipped only when the item already
 * has both a published date and some school/affiliation information.
 *
 * Blank strings are treated as missing for the school/affiliation fields, matching how
 * the published date is checked; previously an empty-string school counted as present
 * and suppressed enrichment.
 *
 * @param CrawlItemDto $dto Item under consideration.
 * @return bool True when the item should be enriched.
 */
protected function shouldEnrich(CrawlItemDto $dto): bool
{
    $mode = (string) config('crawl.arxiv.abs_enrich_mode', 'auto');
    if ($mode === 'never') {
        return false;
    }
    if ($mode === 'always') {
        return true;
    }

    $lead = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);

    // Strings must contain something other than whitespace; any other non-null value
    // keeps counting as present, exactly as before.
    $isFilled = static fn (mixed $value): bool => is_string($value)
        ? trim($value) !== ''
        : $value !== null;

    $hasDate = ($dto->publishedAt ?? '') !== '';
    $hasSchool = $isFilled($dto->schoolName ?? null)
        || $isFilled($lead['university_name'] ?? null)
        || $isFilled($lead['affiliation'] ?? null);

    return ! ($hasDate && $hasSchool);
}
|
|
|
|
/**
 * Tells whether the HTML version of the paper should be tried before the abstract page.
 *
 * True when `crawl.arxiv.enrich_prefer_html` is enabled (the default), when the item's
 * `extra.source` is 'html_search', or when the item already has a non-empty `extra.html_url`.
 *
 * @param CrawlItemDto $dto Item being enriched.
 * @return bool
 */
protected function shouldPreferHtmlEnrich(CrawlItemDto $dto): bool
{
    $preferredByConfig = (bool) config('crawl.arxiv.enrich_prefer_html', true);
    $cameFromHtmlSearch = ($dto->extra['source'] ?? '') === 'html_search';
    $hasHtmlUrl = ! empty($dto->extra['html_url']);

    return $preferredByConfig || $cameFromHtmlSearch || $hasHtmlUrl;
}
|
|
|
|
/**
 * Guarantees that the item's `extra` carries a `lead_author` entry, without any network access.
 *
 * An item that already has a non-empty `extra.lead_author` is returned untouched. Otherwise
 * the lead author is derived from the DTO's authors and a new DTO is built; the school name
 * falls back to the lead author's `university_name` only when the DTO has none.
 *
 * @param CrawlItemDto $dto Item to complete; it is not modified.
 * @return CrawlItemDto The same instance, or a new DTO with `lead_author` / `authors_parsed` set.
 */
protected function ensureLeadAuthor(CrawlItemDto $dto): CrawlItemDto
{
    if (! empty($dto->extra['lead_author'])) {
        return $dto;
    }

    $lead = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);

    // `extra.authors_parsed` prefers the DTO's own list and otherwise keeps whatever
    // `extra` already held; the DTO's authorsParsed field itself is passed on unchanged.
    $parsedForExtra = $dto->authorsParsed;
    if ($parsedForExtra === []) {
        $parsedForExtra = $dto->extra['authors_parsed'] ?? [];
    }

    $extra = array_merge($dto->extra, [
        'lead_author' => $lead,
        'authors_parsed' => $parsedForExtra,
    ]);

    $schoolName = $dto->schoolName ?? $lead['university_name'] ?? null;

    return new CrawlItemDto(
        externalId: $dto->externalId,
        title: $dto->title,
        canonicalUrl: $dto->canonicalUrl,
        authors: $dto->authors,
        summary: $dto->summary,
        publishedAt: $dto->publishedAt,
        schoolName: $schoolName,
        section: $dto->section,
        contentHtml: $dto->contentHtml,
        extra: $extra,
        authorsParsed: $dto->authorsParsed,
    );
}
|
|
|
|
/**
 * Returns the markup of the arXiv abstract page for the given id, using the page cache.
 *
 * A response is accepted only when it is non-empty and contains the 'abs-outer' marker;
 * otherwise an empty string is returned.
 *
 * @param string $arxivId arXiv identifier, with or without a version suffix.
 * @return string Page markup, or '' when no acceptable page was obtained.
 */
protected function fetchAbsHtml(string $arxivId): string
{
    $loader = function () use ($arxivId): string {
        foreach ($this->versionIdCandidates($arxivId) as $candidate) {
            $body = $this->fetchPage('https://arxiv.org/abs/'.$candidate);

            if ($body === '' || ! str_contains($body, 'abs-outer')) {
                continue;
            }

            return $body;
        }

        return '';
    };

    return $this->fetchCachedPage('abs', $arxivId, $loader);
}
|
|
|
|
/**
 * Returns the markup of the arXiv HTML rendering for the given id, using the page cache.
 *
 * A response is accepted only when it is non-empty and contains either the 'ltx_document'
 * or the 'ltx_authors' marker; otherwise an empty string is returned.
 *
 * @param string $arxivId arXiv identifier, with or without a version suffix.
 * @return string Page markup, or '' when no acceptable page was obtained.
 */
protected function fetchHtmlVersion(string $arxivId): string
{
    $loader = function () use ($arxivId): string {
        foreach ($this->versionIdCandidates($arxivId) as $candidate) {
            $body = $this->fetchPage('https://arxiv.org/html/'.$candidate);
            if ($body === '') {
                continue;
            }

            $looksRendered = str_contains($body, 'ltx_document') || str_contains($body, 'ltx_authors');
            if ($looksRendered) {
                return $body;
            }
        }

        return '';
    };

    return $this->fetchCachedPage('html', $arxivId, $loader);
}
|
|
|
|
/**
 * Lists the versioned ids to request for an arXiv identifier.
 *
 * An id that already ends in a version suffix ("v" followed by digits, case-insensitive)
 * is used as-is; any other id gets "v1" appended. Exactly one candidate is returned.
 *
 * @return list<string>
 */
protected function versionIdCandidates(string $arxivId): array
{
    $alreadyVersioned = preg_match('/v\d+$/i', $arxivId);

    return $alreadyVersioned ? [$arxivId] : [$arxivId.'v1'];
}
|
|
|
|
/**
 * Runs a page fetcher behind the cache, keyed by page kind and arXiv id.
 *
 * Successful (non-empty) pages are kept for `crawl.arxiv.page_cache_seconds` (default
 * 86400, minimum 60). An empty result means the fetch failed or the page was rejected;
 * it is cached only for `crawl.arxiv.page_cache_failure_seconds` (default 600, minimum
 * 60, never longer than the page TTL). Previously an empty result was stored for the
 * full page TTL, so one rate-limit response or timeout blocked enrichment of that
 * paper for a day.
 *
 * @param string $kind Page kind used in the cache key (this class passes 'abs' or 'html').
 * @param string $arxivId arXiv identifier; characters outside [a-zA-Z0-9._-] become '_' in the key.
 * @param callable $fetcher Zero-argument callable returning the page markup, or '' on failure.
 * @return string Cached or freshly fetched markup, '' when unavailable.
 */
protected function fetchCachedPage(string $kind, string $arxivId, callable $fetcher): string
{
    $ttl = max(60, (int) config('crawl.arxiv.page_cache_seconds', 86400));
    $failureTtl = min($ttl, max(60, (int) config('crawl.arxiv.page_cache_failure_seconds', 600)));
    $key = 'arxiv_'.$kind.':'.preg_replace('/[^a-zA-Z0-9._-]/', '_', $arxivId);

    // Any stored value, including a recent '' failure marker, is served as-is.
    $cached = Cache::get($key);
    if ($cached !== null) {
        return (string) $cached;
    }

    $html = (string) $fetcher();

    // Failures are remembered only briefly so they are retried soon without hammering arXiv.
    Cache::put($key, $html, $html !== '' ? $ttl : $failureTtl);

    return $html;
}
|
|
|
|
/**
 * Performs one GET request through the request gate and returns the response body.
 *
 * Best effort by design: an HTTP 429, any other non-successful status, or any thrown
 * error all result in an empty string rather than an exception.
 *
 * @param string $url Absolute URL to fetch.
 * @return string Response body, or '' on any failure.
 */
protected function fetchPage(string $url): string
{
    try {
        $timeout = (int) config('crawl.arxiv.enrich_http_timeout_seconds', 25);
        $connectTimeout = (int) config('crawl.arxiv.connect_timeout_seconds', 15);
        $headers = [
            'User-Agent' => 'SlakeSchool-Crawler/1.0 (https://arxiv.org)',
            'Accept' => 'text/html',
        ];

        // The gate decides when the request actually runs; its policy is defined elsewhere.
        $response = $this->gate->run(
            fn () => Http::timeout($timeout)
                ->connectTimeout($connectTimeout)
                ->withHeaders($headers)
                ->get($url)
        );

        if ($response->status() === 429 || ! $response->successful()) {
            return '';
        }

        return (string) $response->body();
    } catch (\Throwable) {
        return '';
    }
}
|
|
}
|