You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

257 lines
8.0 KiB

<?php
namespace App\Services\Crawl;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Http;
/**
 * Fills in missing publication dates and author affiliations for arXiv items by
 * fetching the paper's HTML version and/or abstract page.
 *
 * Pages are fetched on demand only, every HTTP call goes through the injected
 * request gate, and fetched pages are cached per arXiv id.
 */
class ArxivAbsEnricher
{
    public function __construct(
        protected ArxivRequestGate $gate,
    ) {}

    /**
     * Enriches up to `crawl.arxiv.abs_enrich_max` items and guarantees that every
     * returned item carries `extra['lead_author']`.
     *
     * Only items that can actually be enriched (see isEnrichable()) count against
     * the budget; items that would be skipped anyway no longer use it up.
     *
     * @param list<CrawlItemDto> $items
     * @return list<CrawlItemDto>
     */
    public function enrichMany(array $items): array
    {
        if (! config('crawl.arxiv.abs_enrich_enabled', true)) {
            // array_values keeps the result a list, matching the enabled path below.
            return array_values(array_map(
                fn (CrawlItemDto $d) => $this->ensureLeadAuthor($d),
                $items,
            ));
        }

        $max = max(0, (int) config('crawl.arxiv.abs_enrich_max', 8));
        $enriched = 0;
        $out = [];

        foreach ($items as $dto) {
            if ($enriched < $max && $this->isEnrichable($dto)) {
                $out[] = $this->enrichOne($dto);
                $enriched++;

                continue;
            }

            $out[] = $this->ensureLeadAuthor($dto);
        }

        return $out;
    }

    /**
     * Enriches a single item from its arXiv HTML version or abstract page.
     *
     * Items that are not enrichable are returned with only the lead author
     * ensured. When no page could be fetched the original date and authors are
     * kept, but `lead_author`, `authors_parsed`, `pdf_url` and `html_url` are
     * still populated in `extra`.
     */
    public function enrichOne(CrawlItemDto $dto): CrawlItemDto
    {
        if (! $this->isEnrichable($dto)) {
            return $this->ensureLeadAuthor($dto);
        }

        $arxivId = (string) $dto->extra['arxiv_id'];
        [$pageHtml, $enrichedFrom] = $this->fetchEnrichmentPage($dto, $arxivId);
        $fromHtmlVersion = $enrichedFrom === 'arxiv_html';

        $publishedAt = $dto->publishedAt;
        $authorsParsed = $dto->authorsParsed;

        if ($pageHtml !== '') {
            $publishedAt = ArxivMetadataParser::parsePublishedDate($pageHtml) ?? $publishedAt;

            $parsed = $fromHtmlVersion
                ? ArxivMetadataParser::parseAuthorsFromHtmlVersion($pageHtml)
                : ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml);

            // Last resort for HTML-version pages: try the abstract-page parser on the same markup.
            if ($parsed === [] && $fromHtmlVersion) {
                $parsed = ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml);
            }

            // Never replace existing author data with an empty parse result.
            if ($parsed !== []) {
                $authorsParsed = $parsed;
            }
        }

        $lead = CrawlAuthorParser::leadAuthor($dto->authors, $authorsParsed);

        $extra = array_merge($dto->extra, [
            'authors_parsed' => $authorsParsed,
            'lead_author' => $lead,
        ]);
        if ($enrichedFrom !== null) {
            $extra['enriched_from'] = $enrichedFrom;
        }
        // Existing non-null URLs win; otherwise derive them from the page (possibly '') and the id.
        $extra['pdf_url'] ??= ArxivMetadataParser::extractPdfUrl($pageHtml, $arxivId);
        $extra['html_url'] ??= ArxivMetadataParser::extractHtmlUrl($pageHtml, $arxivId);

        return $this->rebuildDto(
            $dto,
            $publishedAt,
            // Here the lead author's university takes precedence over the item's own school name.
            $lead['university_name'] ?? $dto->schoolName,
            $extra,
            $authorsParsed,
        );
    }

    /**
     * Decides, based on `crawl.arxiv.abs_enrich_mode`, whether an item needs enrichment.
     *
     * `never` and `always` force the answer; any other mode enriches unless the
     * item already has both a publication date and some school/affiliation.
     */
    protected function shouldEnrich(CrawlItemDto $dto): bool
    {
        return match ((string) config('crawl.arxiv.abs_enrich_mode', 'auto')) {
            'never' => false,
            'always' => true,
            default => ! $this->hasDateAndSchool($dto),
        };
    }

    /**
     * Whether the HTML version should be tried before the abstract page.
     *
     * True when `crawl.arxiv.enrich_prefer_html` is on, or when the item came
     * from the HTML search source or already carries an `html_url`.
     */
    protected function shouldPreferHtmlEnrich(CrawlItemDto $dto): bool
    {
        if ((bool) config('crawl.arxiv.enrich_prefer_html', true)) {
            return true;
        }

        return ($dto->extra['source'] ?? '') === 'html_search'
            || ! empty($dto->extra['html_url']);
    }

    /**
     * Returns the item unchanged when `extra['lead_author']` is already set;
     * otherwise returns a copy with `lead_author` and `authors_parsed` filled in
     * and the school name defaulted from the lead author's university.
     */
    protected function ensureLeadAuthor(CrawlItemDto $dto): CrawlItemDto
    {
        if (! empty($dto->extra['lead_author'])) {
            return $dto;
        }

        $lead = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);

        $extra = array_merge($dto->extra, [
            'lead_author' => $lead,
            'authors_parsed' => $dto->authorsParsed !== []
                ? $dto->authorsParsed
                : ($dto->extra['authors_parsed'] ?? []),
        ]);

        return $this->rebuildDto(
            $dto,
            $dto->publishedAt,
            // Unlike enrichOne(), an existing school name is kept here.
            $dto->schoolName ?? $lead['university_name'] ?? null,
            $extra,
            $dto->authorsParsed,
        );
    }

    /**
     * Fetches (with caching) the arXiv abstract page for the given id.
     *
     * @return string Page HTML, or '' when no candidate URL returned a page containing the `abs-outer` marker.
     */
    protected function fetchAbsHtml(string $arxivId): string
    {
        return $this->fetchCachedPage('abs', $arxivId, function () use ($arxivId) {
            foreach ($this->versionIdCandidates($arxivId) as $id) {
                $html = $this->fetchPage('https://arxiv.org/abs/'.$id);
                if ($html !== '' && str_contains($html, 'abs-outer')) {
                    return $html;
                }
            }

            return '';
        });
    }

    /**
     * Fetches (with caching) the arXiv HTML rendering of the paper for the given id.
     *
     * @return string Page HTML, or '' when no candidate URL returned a page containing `ltx_document` or `ltx_authors`.
     */
    protected function fetchHtmlVersion(string $arxivId): string
    {
        return $this->fetchCachedPage('html', $arxivId, function () use ($arxivId) {
            foreach ($this->versionIdCandidates($arxivId) as $id) {
                $html = $this->fetchPage('https://arxiv.org/html/'.$id);
                if ($html !== '' && (str_contains($html, 'ltx_document') || str_contains($html, 'ltx_authors'))) {
                    return $html;
                }
            }

            return '';
        });
    }

    /**
     * Versioned ids to request for an arXiv id.
     *
     * An id that already ends in a version suffix (`v<digits>`, case-insensitive)
     * is used as-is; otherwise version 1 is requested. Exactly one candidate is
     * returned, so each page kind costs at most one HTTP request.
     *
     * @return list<string>
     */
    protected function versionIdCandidates(string $arxivId): array
    {
        if (preg_match('/v\d+$/i', $arxivId)) {
            return [$arxivId];
        }

        return [$arxivId.'v1'];
    }

    /**
     * Returns the page produced by $fetcher, cached per page kind and arXiv id.
     *
     * Successful (non-empty) pages are cached for `crawl.arxiv.page_cache_seconds`
     * (minimum 60). Failed fetches (empty string) are cached only for
     * `crawl.arxiv.page_cache_miss_seconds` (default 900, never longer than the
     * page TTL, 0 disables miss caching), so a transient failure such as a
     * rate-limit response or timeout does not block enrichment of that paper for
     * the whole page TTL.
     *
     * @param string $kind Page kind used in the cache key (e.g. 'abs', 'html').
     * @param callable(): string $fetcher Produces the page HTML, or '' on failure.
     * @return string Page HTML, or '' when the fetch failed or a recent failure is still cached.
     */
    protected function fetchCachedPage(string $kind, string $arxivId, callable $fetcher): string
    {
        $ttl = max(60, (int) config('crawl.arxiv.page_cache_seconds', 86400));
        $missTtl = min($ttl, max(0, (int) config('crawl.arxiv.page_cache_miss_seconds', 900)));
        $key = 'arxiv_'.$kind.':'.preg_replace('/[^a-zA-Z0-9._-]/', '_', $arxivId);

        $cached = Cache::get($key);
        if ($cached !== null) {
            return (string) $cached;
        }

        $html = (string) $fetcher();

        if ($html !== '') {
            Cache::put($key, $html, $ttl);
        } elseif ($missTtl > 0) {
            // Short-lived negative entry: avoids hammering arXiv without pinning the failure.
            Cache::put($key, '', $missTtl);
        }

        return $html;
    }

    /**
     * Performs one gated GET request and returns the response body.
     *
     * Best-effort by design: any non-2xx status (including 429) and any thrown
     * error yields '' instead of propagating.
     */
    protected function fetchPage(string $url): string
    {
        try {
            $timeout = (int) config('crawl.arxiv.enrich_http_timeout_seconds', 25);
            $connectTimeout = (int) config('crawl.arxiv.connect_timeout_seconds', 15);

            $response = $this->gate->run(fn () => Http::timeout($timeout)
                ->connectTimeout($connectTimeout)
                ->withHeaders([
                    'User-Agent' => 'SlakeSchool-Crawler/1.0 (https://arxiv.org)',
                    'Accept' => 'text/html',
                ])
                ->get($url));

            return $response->successful() ? (string) $response->body() : '';
        } catch (\Throwable) {
            return '';
        }
    }

    /**
     * True when the item has an arXiv id and canonical URL and shouldEnrich() agrees.
     */
    private function isEnrichable(CrawlItemDto $dto): bool
    {
        return ($dto->extra['arxiv_id'] ?? null)
            && $dto->canonicalUrl
            && $this->shouldEnrich($dto);
    }

    /**
     * True when the item already has a non-empty date and a school or lead-author affiliation.
     */
    private function hasDateAndSchool(CrawlItemDto $dto): bool
    {
        if (($dto->publishedAt ?? '') === '') {
            return false;
        }

        $lead = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);

        return ($dto->schoolName ?? null) !== null
            || ($lead['university_name'] ?? null) !== null
            || ($lead['affiliation'] ?? null) !== null;
    }

    /**
     * Fetches the page to enrich from: the HTML version first when preferred and
     * enabled via `crawl.arxiv.try_html_version`, otherwise (or on failure) the
     * abstract page.
     *
     * @return array{0: string, 1: string|null} Page HTML ('' on failure) and its source label
     *                                          ('arxiv_html', 'abs_html', or null when nothing was fetched).
     */
    private function fetchEnrichmentPage(CrawlItemDto $dto, string $arxivId): array
    {
        if ($this->shouldPreferHtmlEnrich($dto) && (bool) config('crawl.arxiv.try_html_version', true)) {
            $html = $this->fetchHtmlVersion($arxivId);
            if ($html !== '') {
                return [$html, 'arxiv_html'];
            }
        }

        $html = $this->fetchAbsHtml($arxivId);

        return $html !== '' ? [$html, 'abs_html'] : ['', null];
    }

    /**
     * Copies $dto, replacing only the fields that enrichment may change.
     */
    private function rebuildDto(
        CrawlItemDto $dto,
        mixed $publishedAt,
        mixed $schoolName,
        array $extra,
        mixed $authorsParsed,
    ): CrawlItemDto {
        return new CrawlItemDto(
            externalId: $dto->externalId,
            title: $dto->title,
            canonicalUrl: $dto->canonicalUrl,
            authors: $dto->authors,
            summary: $dto->summary,
            publishedAt: $publishedAt,
            schoolName: $schoolName,
            section: $dto->section,
            contentHtml: $dto->contentHtml,
            extra: $extra,
            authorsParsed: $authorsParsed,
        );
    }
}