slake-school-service/app/Services/Crawl/ArxivAbsEnricher.php

<?php

namespace App\Services\Crawl;

use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Http;

/**
 * Fills in missing publication dates and author affiliations for arXiv items
 * by fetching the paper's arXiv HTML version or its abstract ("abs") page.
 *
 * Enrichment is on demand (see shouldEnrich()), capped per batch by
 * `crawl.arxiv.abs_enrich_max`, and fetched pages are cached.
 */
class ArxivAbsEnricher
{
    /**
     * @param  ArxivRequestGate  $gate  Every outbound arXiv request is executed through this gate
     *                                  (presumably for throttling — confirm in ArxivRequestGate).
     */
    public function __construct(
        protected ArxivRequestGate $gate,
    ) {}

    /**
     * Enriches a batch of items, spending at most `crawl.arxiv.abs_enrich_max`
     * enrichment attempts. Every returned item carries `extra['lead_author']`.
     *
     * @param  list<CrawlItemDto>  $items
     * @return list<CrawlItemDto>
     */
    public function enrichMany(array $items): array
    {
        if (! config('crawl.arxiv.abs_enrich_enabled', true)) {
            return array_map(fn (CrawlItemDto $d) => $this->ensureLeadAuthor($d), $items);
        }

        $max = max(0, (int) config('crawl.arxiv.abs_enrich_max', 8));
        $enriched = 0;
        $out = [];

        foreach ($items as $dto) {
            // Only items that can actually be fetched may consume the budget;
            // otherwise items without an arXiv id would starve later ones.
            $eligible = $enriched < $max
                && $this->hasArxivIdentity($dto)
                && $this->shouldEnrich($dto);

            if (! $eligible) {
                $out[] = $this->ensureLeadAuthor($dto);
                continue;
            }

            $out[] = $this->enrichOne($dto);
            $enriched++;
        }

        return $out;
    }

    /**
     * Enriches a single item from the arXiv HTML version (when preferred and
     * enabled) or, failing that, from the abs page.
     *
     * Items without an arXiv id / canonical URL, or that shouldEnrich() rejects,
     * are returned with only the lead author ensured. When no page can be
     * fetched the original date and parsed authors are kept.
     */
    public function enrichOne(CrawlItemDto $dto): CrawlItemDto
    {
        if (! $this->hasArxivIdentity($dto) || ! $this->shouldEnrich($dto)) {
            return $this->ensureLeadAuthor($dto);
        }

        $arxivId = (string) $dto->extra['arxiv_id'];
        $publishedAt = $dto->publishedAt;
        $authorsParsed = $dto->authorsParsed;
        $enrichedFrom = null;
        $pageHtml = '';

        $preferHtml = $this->shouldPreferHtmlEnrich($dto);

        if ($preferHtml && (bool) config('crawl.arxiv.try_html_version', true)) {
            $pageHtml = $this->fetchHtmlVersion($arxivId);
            if ($pageHtml !== '') {
                $enrichedFrom = 'arxiv_html';
            }
        }

        if ($pageHtml === '') {
            $pageHtml = $this->fetchAbsHtml($arxivId);
            if ($pageHtml !== '') {
                $enrichedFrom = 'abs_html';
                // The page in hand is an abs page, so use the abs parser below.
                $preferHtml = false;
            }
        }

        if ($pageHtml !== '') {
            $publishedAt = ArxivMetadataParser::parsePublishedDate($pageHtml) ?? $publishedAt;

            $parsed = $preferHtml
                ? ArxivMetadataParser::parseAuthorsFromHtmlVersion($pageHtml)
                : ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml);

            // Second chance on the same page with the abs parser (no extra request).
            if ($parsed === [] && $preferHtml) {
                $parsed = ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml);
            }

            if ($parsed !== []) {
                $authorsParsed = $parsed;
            }
        }

        $lead = CrawlAuthorParser::leadAuthor($dto->authors, $authorsParsed);
        $schoolName = $lead['university_name'] ?? $dto->schoolName;

        $extra = array_merge($dto->extra, [
            'authors_parsed' => $authorsParsed,
            'lead_author' => $lead,
        ]);
        if ($enrichedFrom !== null) {
            $extra['enriched_from'] = $enrichedFrom;
        }
        if (! isset($extra['pdf_url'])) {
            $extra['pdf_url'] = ArxivMetadataParser::extractPdfUrl($pageHtml, $arxivId);
        }
        if (! isset($extra['html_url'])) {
            $extra['html_url'] = ArxivMetadataParser::extractHtmlUrl($pageHtml, $arxivId);
        }

        return $this->rebuild($dto, $publishedAt, $schoolName, $extra, $authorsParsed);
    }

    /**
     * Decides whether an item needs enrichment.
     *
     * `crawl.arxiv.abs_enrich_mode`: 'never' => false, 'always' => true, anything
     * else (default 'auto') => true unless the item already has both a
     * publication date and some school/affiliation information.
     */
    protected function shouldEnrich(CrawlItemDto $dto): bool
    {
        $mode = (string) config('crawl.arxiv.abs_enrich_mode', 'auto');
        if ($mode === 'never') {
            return false;
        }
        if ($mode === 'always') {
            return true;
        }

        $lead = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);
        $hasDate = ($dto->publishedAt ?? '') !== '';
        $hasSchool = ($dto->schoolName ?? null) !== null
            || ($lead['university_name'] ?? null) !== null
            || ($lead['affiliation'] ?? null) !== null;

        return ! ($hasDate && $hasSchool);
    }

    /**
     * Whether the HTML version should be tried before the abs page: always when
     * `crawl.arxiv.enrich_prefer_html` is on, otherwise only for items that came
     * from the 'html_search' source or already carry an `html_url`.
     */
    protected function shouldPreferHtmlEnrich(CrawlItemDto $dto): bool
    {
        if ((bool) config('crawl.arxiv.enrich_prefer_html', true)) {
            return true;
        }

        return ($dto->extra['source'] ?? '') === 'html_search'
            || ! empty($dto->extra['html_url']);
    }

    /**
     * Returns the item with `extra['lead_author']` and `extra['authors_parsed']`
     * populated (no network access). Items that already have a non-empty lead
     * author are returned unchanged.
     */
    protected function ensureLeadAuthor(CrawlItemDto $dto): CrawlItemDto
    {
        if (! empty($dto->extra['lead_author'])) {
            return $dto;
        }

        $lead = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);
        $extra = array_merge($dto->extra, [
            'lead_author' => $lead,
            'authors_parsed' => $dto->authorsParsed !== [] ? $dto->authorsParsed : ($dto->extra['authors_parsed'] ?? []),
        ]);

        return $this->rebuild(
            $dto,
            $dto->publishedAt,
            $dto->schoolName ?? $lead['university_name'] ?? null,
            $extra,
            $dto->authorsParsed,
        );
    }

    /**
     * Fetches (with caching) the arXiv abs page; returns '' when no response
     * containing the 'abs-outer' marker could be obtained.
     */
    protected function fetchAbsHtml(string $arxivId): string
    {
        return $this->fetchCachedPage('abs', $arxivId, function () use ($arxivId) {
            foreach ($this->versionIdCandidates($arxivId) as $id) {
                $html = $this->fetchPage('https://arxiv.org/abs/'.$id);
                if ($html !== '' && str_contains($html, 'abs-outer')) {
                    return $html;
                }
            }

            return '';
        });
    }

    /**
     * Fetches (with caching) the arXiv HTML version of the paper; returns ''
     * when no response containing an 'ltx_document' / 'ltx_authors' marker
     * could be obtained.
     */
    protected function fetchHtmlVersion(string $arxivId): string
    {
        return $this->fetchCachedPage('html', $arxivId, function () use ($arxivId) {
            foreach ($this->versionIdCandidates($arxivId) as $id) {
                $html = $this->fetchPage('https://arxiv.org/html/'.$id);
                if ($html !== '' && (str_contains($html, 'ltx_document') || str_contains($html, 'ltx_authors'))) {
                    return $html;
                }
            }

            return '';
        });
    }

    /**
     * Identifiers to try for a paper: the id itself when it already ends in a
     * version suffix (e.g. "v2"), otherwise the id with "v1" appended.
     *
     * @return list<string>
     */
    protected function versionIdCandidates(string $arxivId): array
    {
        if (preg_match('/v\d+$/i', $arxivId)) {
            return [$arxivId];
        }

        return [$arxivId.'v1'];
    }

    /**
     * Returns the cached page for ($kind, $arxivId) or runs $fetcher and caches
     * its result.
     *
     * Non-empty pages are kept for `crawl.arxiv.page_cache_seconds` (min 60s).
     * Empty results (rate limit, timeout, missing page) are kept only for
     * `crawl.arxiv.page_cache_failure_seconds` (default 600s, never longer than
     * the page TTL) so a transient failure does not block enrichment for a day.
     *
     * @param  callable(): string  $fetcher
     */
    protected function fetchCachedPage(string $kind, string $arxivId, callable $fetcher): string
    {
        $ttl = max(60, (int) config('crawl.arxiv.page_cache_seconds', 86400));
        $failureTtl = min($ttl, max(1, (int) config('crawl.arxiv.page_cache_failure_seconds', 600)));
        $key = 'arxiv_'.$kind.':'.preg_replace('/[^a-zA-Z0-9._-]/', '_', $arxivId);

        $cached = Cache::get($key);
        if ($cached !== null) {
            return (string) $cached;
        }

        $page = (string) $fetcher();
        Cache::put($key, $page, $page !== '' ? $ttl : $failureTtl);

        return $page;
    }

    /**
     * Performs one GET through the request gate and returns the body of a
     * successful (2xx) response. Any other status (including 429) and any
     * thrown error yield '' — enrichment is best-effort and must not abort
     * the crawl.
     */
    protected function fetchPage(string $url): string
    {
        try {
            $timeout = (int) config('crawl.arxiv.enrich_http_timeout_seconds', 25);
            $connectTimeout = (int) config('crawl.arxiv.connect_timeout_seconds', 15);

            $response = $this->gate->run(fn () => Http::timeout($timeout)
                ->connectTimeout($connectTimeout)
                ->withHeaders([
                    'User-Agent' => 'SlakeSchool-Crawler/1.0 (https://arxiv.org)',
                    'Accept' => 'text/html',
                ])
                ->get($url));

            return $response->successful() ? (string) $response->body() : '';
        } catch (\Throwable) {
            return '';
        }
    }

    /**
     * Whether the item has what is required to fetch an arXiv page: a truthy
     * `extra['arxiv_id']` and a truthy canonical URL.
     */
    private function hasArxivIdentity(CrawlItemDto $dto): bool
    {
        $arxivId = $dto->extra['arxiv_id'] ?? null;

        return (bool) $arxivId && (bool) $dto->canonicalUrl;
    }

    /**
     * Builds a new DTO from $dto, replacing only the fields enrichment may change.
     *
     * @param  mixed  $publishedAt    passed through to CrawlItemDto unchanged
     * @param  mixed  $schoolName     passed through to CrawlItemDto unchanged
     * @param  array<string, mixed>  $extra
     * @param  mixed  $authorsParsed  passed through to CrawlItemDto unchanged
     */
    private function rebuild(CrawlItemDto $dto, $publishedAt, $schoolName, array $extra, $authorsParsed): CrawlItemDto
    {
        return new CrawlItemDto(
            externalId: $dto->externalId,
            title: $dto->title,
            canonicalUrl: $dto->canonicalUrl,
            authors: $dto->authors,
            summary: $dto->summary,
            publishedAt: $publishedAt,
            schoolName: $schoolName,
            section: $dto->section,
            contentHtml: $dto->contentHtml,
            extra: $extra,
            authorsParsed: $authorsParsed,
        );
    }
}