slake-school-service/app/Services/Crawl/Adapters/ArxivApiAdapter.php

<?php

namespace App\Services\Crawl\Adapters;

use App\Models\CrawlSource;
use App\Models\Paper;
use App\Services\Crawl\ArxivAbsEnricher;
use App\Services\Crawl\ArxivMetadataParser;
use App\Services\Crawl\ArxivRequestGate;
use App\Services\Crawl\ArxivTextNormalizer;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlKeywordParser;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlItemDto;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\RequestException;
use Illuminate\Http\Client\Response;
use Illuminate\Support\Facades\Http;
use SimpleXMLElement;

/**
 * arXiv paper crawler: queries the export API first and falls back to parsing
 * the search-page HTML when the API is rate-limited or answers HTTP 429.
 *
 * @see https://info.arxiv.org/help/api/user-manual.html
 * @see https://info.arxiv.org/help/api/tou.html
 */
class ArxivApiAdapter implements CrawlerAdapterInterface
{
    /** Endpoint of the arXiv export query API; its responses are parsed as Atom feeds. */
    private const API_URL = 'https://export.arxiv.org/api/query';

    /** Endpoint of the arXiv website search page, requested by the HTML-search path. */
    private const SEARCH_URL = 'https://arxiv.org/search/';

    /**
     * @param  ArxivRequestGate  $gate  Every outgoing arXiv request in this class is executed through `$gate->run()`.
     * @param  ArxivAbsEnricher  $absEnricher  Collected items are passed through `enrichMany()` before being returned.
     */
    public function __construct(
        protected ArxivRequestGate $gate,
        protected ArxivAbsEnricher $absEnricher,
    ) {}

    /**
     * Collect arXiv papers for a keyword, preferring the export API.
     *
     * Flow:
     *  1. When `crawl.arxiv.prefer_html_search` is enabled, only the search page is used.
     *  2. Otherwise API pages of 50 entries are scanned until enough new items are
     *     collected, a page comes back empty or short, or the scan budget is used up.
     *  3. When the API produced nothing and a keyword was given, the search page is
     *     tried once as a fallback (capped at one page worth of items).
     *
     * Recognised `$params` keys:
     *  - `keyword`        string, trimmed; defaults to ''.
     *  - `max_results`    int, clamped to 1..200; defaults to 50.
     *  - `max_pages`      int, clamped to 1..20; defaults to 1.
     *  - `skip_imported`  any value other than boolean `false` enables skipping of
     *                     already imported papers; defaults to enabled.
     *
     * @param  string  $requestUrl  Not read by this adapter.
     * @param  CrawlSource  $source  Not read by this adapter.
     * @param  array<string, mixed>  $params
     * @return list<CrawlItemDto>
     *
     * @throws \RuntimeException when no item could be collected from either path
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $keywordRaw = trim((string) ($params['keyword'] ?? ''));
        $maxResults = min(200, max(1, (int) ($params['max_results'] ?? 50)));

        // Take the HTML-only path before doing any API bookkeeping: it uses neither
        // the imported-ID set (a database query) nor the scan budget.
        if ((bool) config('crawl.arxiv.prefer_html_search', false)) {
            return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults));
        }

        $maxPages = min(20, max(1, (int) ($params['max_pages'] ?? 1)));
        $pageSize = 50;
        $skipImported = ($params['skip_imported'] ?? true) !== false;
        $importedIds = $skipImported ? $this->loadImportedExternalIds() : [];
        $maxScanPages = $this->resolveMaxScanPages($maxPages, $maxResults, $skipImported);

        $items = [];
        $seen = [];

        for ($page = 0; $page < $maxScanPages && count($items) < $maxResults; $page++) {
            $batch = $this->fetchApiPage($keywordRaw, $page * $pageSize, $pageSize);
            if ($batch === []) {
                // Either the API failed or there are no more results; stop scanning.
                break;
            }

            foreach ($batch as $item) {
                // De-duplicate within this run first, then against stored papers.
                if (isset($seen[$item->externalId])) {
                    continue;
                }
                $seen[$item->externalId] = true;

                if ($skipImported && isset($importedIds[$item->externalId])) {
                    continue;
                }

                $items[] = $item;
                if (count($items) >= $maxResults) {
                    break 2;
                }
            }

            // A short page means the result set is exhausted.
            if (count($batch) < $pageSize) {
                break;
            }
        }

        if ($items !== []) {
            return $this->finalizeItems($items);
        }

        // The search page needs a query, so without a keyword there is no fallback.
        if ($keywordRaw === '') {
            throw new \RuntimeException('arXiv API 未返回结果，请稍后重试');
        }

        return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, min($maxResults, $pageSize)));
    }

    /**
     * Request one page of the export API and turn it into items.
     *
     * Every failure mode handled here (HTTP client exception, non-2xx status, an
     * error document in the body) is reported as an empty list.
     *
     * @param  int  $maxResults  Page size; clamped to 1..50 before it is sent.
     * @return list<CrawlItemDto>
     */
    protected function fetchApiPage(string $keywordRaw, int $start, int $maxResults): array
    {
        try {
            $query = [
                'search_query' => CrawlKeywordParser::buildArxivSearchQuery($keywordRaw),
                'start' => $start,
                'max_results' => min(50, max(1, $maxResults)),
                'sortBy' => 'submittedDate',
                'sortOrder' => 'descending',
            ];
            $response = $this->requestApiOnce($query);
        } catch (ConnectionException|RequestException) {
            return [];
        }

        if (! $response->successful()) {
            return [];
        }

        $payload = $response->body();

        // Bodies carrying one of these markers are treated as an error document, not results.
        foreach (['arxiv.org/api/errors', '<title>Error</title>'] as $errorMarker) {
            if (str_contains($payload, $errorMarker)) {
                return [];
            }
        }

        return $this->parseAtomFeed($payload, $keywordRaw);
    }

    /**
     * Send one API query through the request gate, repeating it a single time
     * (after a 3 second pause) if the first attempt ends in a ConnectionException.
     * A ConnectionException from the second attempt is not caught here.
     *
     * @param  array<string, mixed>  $queryParams
     */
    protected function requestApiOnce(array $queryParams): Response
    {
        $attempt = fn () => $this->gate->run(
            fn () => $this->sendRequest(self::API_URL, $queryParams)
        );

        try {
            return $attempt();
        } catch (ConnectionException) {
            sleep(3);
        }

        return $attempt();
    }

    /**
     * Run the HTML search and insist on a non-empty result.
     *
     * @param  \Throwable|null  $previous  Earlier failure, if any; chained onto the thrown
     *                                      exception and used to pick the HTTP 429 message.
     * @return list<CrawlItemDto>
     *
     * @throws \RuntimeException when the search page yields no items
     */
    protected function requireHtmlSearchItems(string $keyword, int $maxResults, ?\Throwable $previous = null): array
    {
        $found = $this->fetchViaHtmlSearch($keyword, $maxResults);

        if ($found === []) {
            $rateLimited = $previous instanceof RequestException
                && $previous->response?->status() === 429;

            if ($rateLimited) {
                $message = 'arXiv 访问过于频繁（HTTP 429），请等待 1～2 分钟后再试';
            } else {
                $message = 'arXiv 搜索页抓取失败，请检查网络或稍后重试';
            }

            throw new \RuntimeException($message, 0, $previous);
        }

        return $found;
    }

    /**
     * Convert an Atom feed returned by the export API into crawl items.
     *
     * An empty or malformed body is treated as "no results" and yields an empty
     * list instead of raising, in line with how fetchApiPage() reports every
     * other API failure. Entries whose <id> does not contain an arXiv identifier
     * are skipped.
     *
     * @param  string  $body  Raw response body.
     * @param  string  $keyword  Search keyword, recorded in each item's `extra`.
     * @return CrawlItemDto[]
     */
    protected function parseAtomFeed(string $body, string $keyword): array
    {
        if (trim($body) === '') {
            return [];
        }

        // Collect libxml errors internally so a bad payload neither emits warnings
        // nor throws; the caller's previous setting is restored afterwards.
        $previousErrorMode = libxml_use_internal_errors(true);
        try {
            $xml = simplexml_load_string($body, SimpleXMLElement::class, LIBXML_NONET);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }

        if (! $xml instanceof SimpleXMLElement) {
            return [];
        }

        $xml->registerXPathNamespace('atom', 'http://www.w3.org/2005/Atom');
        $entries = $xml->xpath('//atom:entry') ?: [];

        $items = [];
        foreach ($entries as $entry) {
            $entry->registerXPathNamespace('atom', 'http://www.w3.org/2005/Atom');
            $idUrl = (string) ($entry->id ?? '');
            $arxivId = $this->extractArxivId($idUrl);
            if (! $arxivId) {
                continue;
            }

            $authorsParsed = [];
            foreach ($entry->author as $author) {
                // Affiliations live in the arXiv extension namespace, not in Atom.
                $author->registerXPathNamespace('arxiv', 'http://arxiv.org/schemas/atom');
                $name = trim((string) ($author->name ?? ''));
                $affNodes = $author->xpath('arxiv:affiliation') ?: [];
                $affiliation = trim((string) ($affNodes[0] ?? ''));
                if ($name !== '') {
                    $authorsParsed[] = [
                        'name' => $name,
                        'email' => null,
                        'affiliation' => $affiliation !== '' ? $affiliation : null,
                        'university_name' => CrawlAuthorParser::universityFromAffiliation($affiliation),
                    ];
                }
            }
            $authorNames = array_column($authorsParsed, 'name');

            // Only the first 10 characters of <published> are kept.
            $published = (string) ($entry->published ?? '');
            $publishedAt = $published ? substr($published, 0, 10) : null;
            $lead = CrawlAuthorParser::leadAuthor(implode('; ', $authorNames), $authorsParsed);

            $items[] = new CrawlItemDto(
                externalId: 'arxiv:'.$arxivId,
                title: ArxivTextNormalizer::normalize(trim((string) ($entry->title ?? ''))) ?? '',
                canonicalUrl: 'https://arxiv.org/abs/'.$arxivId,
                authors: implode('; ', $authorNames),
                summary: ArxivTextNormalizer::normalize(trim((string) ($entry->summary ?? ''))),
                publishedAt: $publishedAt,
                schoolName: $lead['university_name'] ?? null,
                extra: [
                    'platform' => 'arxiv',
                    'arxiv_id' => $arxivId,
                    'keyword' => $keyword,
                    'source' => 'api',
                    'authors_parsed' => $authorsParsed,
                    'lead_author' => $lead,
                ],
                authorsParsed: $authorsParsed,
            );
        }

        return $items;
    }

    /**
     * Search-page fallback (for when the export API answers 429).
     * Do not send a `size` parameter: the search page rejects it with HTTP 400.
     *
     * A non-successful response is reported as an empty list.
     *
     * @return CrawlItemDto[]
     */
    protected function fetchViaHtmlSearch(string $keyword, int $maxResults): array
    {
        $query = [
            'query' => $keyword,
            'searchtype' => 'all',
        ];

        $response = $this->gate->run(fn () => $this->sendRequest(self::SEARCH_URL, $query));

        return $response->successful()
            ? $this->parseSearchHtml($response->body(), $keyword, $maxResults)
            : [];
    }

    /**
     * Extract crawl items from the markup of an arXiv search result page.
     *
     * Each `<li class="arxiv-result">` block is scanned with regular expressions
     * for the identifier, title, author links and abstract. Blocks without an
     * `arxiv.org/abs/…` link are skipped. Author names are entity-decoded (like
     * title and abstract) and the structured author list is built directly from
     * the author links rather than by re-splitting the joined string, so a name
     * containing ';' (e.g. inside an undecoded `&#39;`) is no longer torn apart.
     *
     * @param  string  $html  Full page markup.
     * @param  string  $keyword  Search keyword, recorded in each item's `extra`.
     * @param  int  $maxResults  Upper bound on the number of returned items.
     * @return CrawlItemDto[]
     */
    protected function parseSearchHtml(string $html, string $keyword, int $maxResults): array
    {
        if (! preg_match_all('#<li class="arxiv-result">(.*?)</li>#s', $html, $blocks)) {
            return [];
        }

        // Strip tags first and decode afterwards so encoded text such as "&lt;" survives.
        $toText = static fn (string $fragment): string => trim(
            html_entity_decode(strip_tags($fragment), ENT_QUOTES | ENT_HTML5, 'UTF-8')
        );

        $items = [];
        foreach ($blocks[1] as $block) {
            if (count($items) >= $maxResults) {
                break;
            }

            if (! preg_match('#arxiv\.org/abs/([^"?\s]+)#', $block, $idMatch)) {
                continue;
            }
            $arxivId = $idMatch[1];

            $title = '';
            if (preg_match('#<p class="title is-5 mathjax">\s*(.*?)\s*</p>#s', $block, $titleMatch)) {
                $title = ArxivTextNormalizer::normalize($toText($titleMatch[1])) ?? '';
            }

            // One structured entry per author link; empty names are dropped.
            $authorsParsed = [];
            if (preg_match('#<p class="authors">(.*?)</p>#s', $block, $authorMatch)
                && preg_match_all('#<a[^>]*>([^<]+)</a>#', $authorMatch[1], $anchorMatches)
            ) {
                foreach ($anchorMatches[1] as $rawName) {
                    $name = $toText($rawName);
                    if ($name === '') {
                        continue;
                    }
                    $authorsParsed[] = [
                        'name' => $name,
                        'email' => null,
                        'affiliation' => null,
                        'university_name' => null,
                    ];
                }
            }
            $authors = implode('; ', array_column($authorsParsed, 'name'));

            // Prefer the full abstract; fall back to the shortened one.
            $summary = '';
            if (preg_match('#<span class="abstract-full[^"]*"[^>]*>(.*?)</span>#s', $block, $abstractMatch)) {
                $summary = ArxivTextNormalizer::normalize($toText($abstractMatch[1]));
            } elseif (preg_match('#<span class="abstract-short[^"]*"[^>]*>(.*?)</span>#s', $block, $abstractShort)) {
                $summary = ArxivTextNormalizer::normalize($toText($abstractShort[1]));
            }

            $publishedAt = ArxivMetadataParser::parsePublishedDate($block);
            $lead = CrawlAuthorParser::leadAuthor($authors, $authorsParsed);

            $items[] = new CrawlItemDto(
                externalId: 'arxiv:'.$arxivId,
                title: $title,
                canonicalUrl: 'https://arxiv.org/abs/'.$arxivId,
                authors: $authors,
                summary: $summary,
                publishedAt: $publishedAt,
                schoolName: $lead['university_name'] ?? null,
                extra: [
                    'platform' => 'arxiv',
                    'arxiv_id' => $arxivId,
                    'keyword' => $keyword,
                    'source' => 'html_search',
                    'pdf_url' => ArxivMetadataParser::extractPdfUrl($block, $arxivId),
                    'html_url' => ArxivMetadataParser::extractHtmlUrl($block, $arxivId),
                    'authors_parsed' => $authorsParsed,
                    'lead_author' => $lead,
                ],
                authorsParsed: $authorsParsed,
            );
        }

        return $items;
    }

    /**
     * Issue a GET request with the crawler's identifying headers.
     *
     * Timeouts and the contact e-mail come from the `crawl.arxiv.*` configuration.
     * The retry callback only accepts ConnectionException, and the retry is
     * configured with `throw: false`.
     *
     * @param  array<string, mixed>  $queryParams
     */
    protected function sendRequest(string $url, array $queryParams): Response
    {
        $contact = (string) config('crawl.arxiv.contact_email', 'support@example.com');
        $totalTimeout = (int) config('crawl.arxiv.http_timeout_seconds', 60);
        $dialTimeout = (int) config('crawl.arxiv.connect_timeout_seconds', 30);

        $headers = [
            'User-Agent' => 'SlakeSchool-Crawler/1.0 (https://arxiv.org; mailto:'.$contact.')',
            'Accept' => 'application/atom+xml, text/html;q=0.9',
        ];

        $isConnectionFailure = fn ($exception) => $exception instanceof ConnectionException;

        $request = Http::timeout($totalTimeout)
            ->connectTimeout($dialTimeout)
            ->retry(2, 2000, $isConnectionFailure, throw: false)
            ->withHeaders($headers);

        return $request->get($url, $queryParams);
    }

    /**
     * Extract the arXiv identifier from an abstract URL such as an Atom entry <id>.
     *
     * Both identifier schemes are recognised:
     *  - new style, e.g. `2401.01234v1`
     *  - old style, `archive[.SC]/NNNNNNN[vN]`, e.g. `hep-th/9901001v1` or
     *    `math.GT/0309136`. These contain a slash and must be captured whole,
     *    otherwise every such paper would collapse onto the bare archive name.
     *
     * @param  string  $idUrl  URL (or path) containing `/abs/<identifier>`.
     * @return string|null The identifier, including any version suffix, or null when none is found.
     */
    protected function extractArxivId(string $idUrl): ?string
    {
        // Try the old-style form first; otherwise take everything up to the next
        // slash, question mark or whitespace (the new-style identifier).
        $identifier = '(?:[A-Za-z-]+(?:\.[A-Za-z]{2})?/\d{7}(?:v\d+)?|[^\s/?]+)';

        $patterns = [
            '~arxiv\.org/abs/('.$identifier.')~i',
            // Host-agnostic fallback for URLs that do not mention arxiv.org.
            '~/abs/('.$identifier.')~',
        ];

        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $idUrl, $m)) {
                return $m[1];
            }
        }

        return null;
    }

    /**
     * Last step before items leave the adapter: hand them to the abs enricher
     * unless the list is empty or enrichment was switched off.
     *
     * @param  list<CrawlItemDto>  $items
     * @param  bool  $enrichAbs  When false the items are returned untouched.
     * @return list<CrawlItemDto>
     */
    protected function finalizeItems(array $items, bool $enrichAbs = true): array
    {
        $shouldEnrich = $enrichAbs && $items !== [];

        if (! $shouldEnrich) {
            return $items;
        }

        return $this->absEnricher->enrichMany($items);
    }

    /**
     * Load the external IDs of already stored crawled arXiv papers as a lookup set.
     *
     * Only IDs carrying the `arxiv:` prefix are fetched: every externalId this
     * adapter produces starts with that prefix, so IDs of other platforms could
     * never match and would only cost memory and transfer time.
     *
     * @return array<string, true> Map keyed by external ID for isset() membership checks.
     */
    protected function loadImportedExternalIds(): array
    {
        $ids = Paper::query()
            ->where('source', 'crawl')
            // LIKE never matches NULL, so rows without an external ID stay excluded.
            ->where('external_id', 'like', 'arxiv:%')
            ->pluck('external_id')
            ->all();

        return array_fill_keys($ids, true);
    }

    /**
     * Decide how many API pages may be scanned in one run.
     *
     * Without skipping, the budget is simply `$maxPages` clamped to 1..20. With
     * skipping enabled the budget is raised to at least ten times the number of
     * 50-entry pages needed for `$maxResults`, capped at 200.
     */
    protected function resolveMaxScanPages(int $maxPages, int $maxResults, bool $skipImported): int
    {
        $budget = max(1, $maxPages);
        $budget = min(20, $budget);

        if ($skipImported) {
            $pagesForTarget = (int) ceil($maxResults / 50);
            $budget = min(200, max($budget, 10 * $pagesForTarget));
        }

        return $budget;
    }
}