<?php
namespace App\Services\Crawl\Adapters;
use App\Models\CrawlSource;
use App\Models\Paper;
use App\Services\Crawl\ArxivAbsEnricher;
use App\Services\Crawl\ArxivMetadataParser;
use App\Services\Crawl\ArxivRequestGate;
use App\Services\Crawl\ArxivTextNormalizer;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlKeywordParser;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlItemDto;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\RequestException;
use Illuminate\Http\Client\Response;
use Illuminate\Support\Facades\Http;
use SimpleXMLElement;
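/**
 * arXiv paper crawler: queries the export API first and, when the API yields
 * nothing usable (for example because of rate limiting / HTTP 429), falls back
 * to parsing the HTML search page.
 *
 * @see https://info.arxiv.org/help/api/user-manual.html
 * @see https://info.arxiv.org/help/api/tou.html
 */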
class ArxivApiAdapter implements CrawlerAdapterInterface
{
/** Endpoint of the arXiv export query API; responses are parsed as Atom feeds by parseAtomFeed(). */
private const API_URL = 'https://export.arxiv.org/api/query';
/** Endpoint of the arXiv HTML search page used by fetchViaHtmlSearch(). */
private const SEARCH_URL = 'https://arxiv.org/search/';
/**
 * @param ArxivRequestGate $gate        Every outgoing arXiv request in this class is executed through $gate->run().
 * @param ArxivAbsEnricher $absEnricher Collected items are passed through enrichMany() before being returned.
 */
public function __construct(
protected ArxivRequestGate $gate,
protected ArxivAbsEnricher $absEnricher,
) {}
/**
 * Collect arXiv papers for a keyword.
 *
 * Flow:
 *  1. If `crawl.arxiv.prefer_html_search` is enabled, only the HTML search page is used.
 *  2. Otherwise the export API is paged through (50 entries per page) until
 *     `max_results` new items are collected, a page comes back empty/short, or the
 *     scan budget from resolveMaxScanPages() is exhausted.
 *  3. If the API produced no entries at all, the HTML search page is tried as a fallback
 *     (only possible with a non-empty keyword).
 *
 * Recognised $params keys: `keyword` (string), `max_results` (clamped to 1..200, default 50),
 * `max_pages` (clamped to 1..20, default 1), `skip_imported` (anything other than a literal
 * `false` enables skipping of already-imported papers; default enabled).
 *
 * Note: `skip_imported` is only applied to API results; the HTML search paths return
 * whatever the search page lists.
 *
 * @param string               $requestUrl Not used by this adapter.
 * @param CrawlSource          $source     Not used by this adapter.
 * @param array<string, mixed> $params
 *
 * @return list<CrawlItemDto>
 *
 * @throws \RuntimeException When neither the API nor the HTML search yields any result.
 */
public function fetch(string $requestUrl, CrawlSource $source, array $params): array
{
    $keywordRaw = trim((string) ($params['keyword'] ?? ''));
    $maxResults = min(200, max(1, (int) ($params['max_results'] ?? 50)));

    // Decide on HTML-only mode first: it never consults the imported-id set,
    // so there is no reason to pay for the database query below.
    if ((bool) config('crawl.arxiv.prefer_html_search', false)) {
        return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults));
    }

    $maxPages = min(20, max(1, (int) ($params['max_pages'] ?? 1)));
    $pageSize = 50;
    $skipImported = ($params['skip_imported'] ?? true) !== false;
    $importedIds = $skipImported ? $this->loadImportedExternalIds() : [];
    $maxScanPages = $this->resolveMaxScanPages($maxPages, $maxResults, $skipImported);

    $items = [];
    $seen = [];
    // Distinguishes "API gave us nothing" from "API answered, but everything was filtered out".
    $apiReturnedEntries = false;

    for ($page = 0; $page < $maxScanPages && count($items) < $maxResults; $page++) {
        $batch = $this->fetchApiPage($keywordRaw, $page * $pageSize, $pageSize);
        if ($batch === []) {
            break;
        }
        $apiReturnedEntries = true;

        foreach ($batch as $item) {
            // The same entry can show up on two pages when the feed shifts between requests.
            if (isset($seen[$item->externalId])) {
                continue;
            }
            $seen[$item->externalId] = true;

            if ($skipImported && isset($importedIds[$item->externalId])) {
                continue;
            }

            $items[] = $item;
            if (count($items) >= $maxResults) {
                break 2;
            }
        }

        // A short page means the result set is exhausted.
        if (count($batch) < $pageSize) {
            break;
        }
    }

    if ($items !== []) {
        return $this->finalizeItems($items);
    }

    // The API worked but every entry was already imported: there is simply nothing new.
    // Falling back to the HTML search here would hand back the very papers the caller
    // asked to skip (or raise a misleading "no results" error for an empty keyword).
    if ($apiReturnedEntries) {
        return [];
    }

    if ($keywordRaw === '') {
        throw new \RuntimeException('arXiv API 未返回结果,请稍后重试');
    }

    return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, min($maxResults, $pageSize)));
}
/**
 * Fetch and parse a single page of export-API results, newest submissions first.
 *
 * Any failure (connection/request exception, non-2xx status, or an error document
 * in the body) is reported as an empty list rather than an exception.
 *
 * @param int $start      Zero-based offset of the first entry to request.
 * @param int $maxResults Page size; clamped to 1..50.
 *
 * @return list<CrawlItemDto>
 */
protected function fetchApiPage(string $keywordRaw, int $start, int $maxResults): array
{
    $limit = max(1, $maxResults);
    if ($limit > 50) {
        $limit = 50;
    }

    try {
        $query = [
            'search_query' => CrawlKeywordParser::buildArxivSearchQuery($keywordRaw),
            'start' => $start,
            'max_results' => $limit,
            'sortBy' => 'submittedDate',
            'sortOrder' => 'descending',
        ];
        $response = $this->requestApiOnce($query);
    } catch (ConnectionException|RequestException) {
        return [];
    }

    if (! $response->successful()) {
        return [];
    }

    $payload = $response->body();

    // A successful status can still carry an error document instead of results.
    foreach (['arxiv.org/api/errors', '<title>Error</title>'] as $errorMarker) {
        if (str_contains($payload, $errorMarker)) {
            return [];
        }
    }

    return $this->parseAtomFeed($payload, $keywordRaw);
}
/**
 * Send one query to the export API through the request gate, retrying a single
 * time after a 3-second pause if the first attempt ends in a ConnectionException.
 * A ConnectionException from the second attempt propagates to the caller.
 *
 * @param array<string, mixed> $queryParams
 */
protected function requestApiOnce(array $queryParams): Response
{
    $dispatch = fn () => $this->gate->run(
        fn () => $this->sendRequest(self::API_URL, $queryParams)
    );

    try {
        return $dispatch();
    } catch (ConnectionException) {
        sleep(3);
    }

    return $dispatch();
}
/**
 * Run the HTML search and insist on a non-empty result.
 *
 * @param \Throwable|null $previous Earlier failure to chain onto the thrown exception; when it is
 *                                  a RequestException whose response status is 429, the
 *                                  rate-limit message is used instead of the generic one.
 *
 * @return list<CrawlItemDto>
 *
 * @throws \RuntimeException When the HTML search yields no items.
 */
protected function requireHtmlSearchItems(string $keyword, int $maxResults, ?\Throwable $previous = null): array
{
    $found = $this->fetchViaHtmlSearch($keyword, $maxResults);

    if ($found === []) {
        $rateLimited = $previous instanceof RequestException
            && $previous->response?->status() === 429;

        if ($rateLimited) {
            throw new \RuntimeException('arXiv 访问过于频繁HTTP 429请等待 12 分钟后再试', 0, $previous);
        }

        throw new \RuntimeException('arXiv 搜索页抓取失败,请检查网络或稍后重试', 0, $previous);
    }

    return $found;
}
/**
 * Convert an export-API Atom feed into crawl items.
 *
 * An empty or malformed body yields an empty list instead of an exception, in line
 * with how fetchApiPage() reports every other API failure. Entries whose `<id>`
 * does not contain a recognisable arXiv identifier are skipped.
 *
 * @param string $body    Raw response body of the API call.
 * @param string $keyword Search keyword, recorded in each item's `extra` data.
 *
 * @return CrawlItemDto[]
 */
protected function parseAtomFeed(string $body, string $keyword): array
{
    if (trim($body) === '') {
        return [];
    }

    // Collect libxml parse errors internally so a broken document does not emit warnings,
    // and restore the previous setting afterwards because it is process-wide.
    $previousErrorMode = libxml_use_internal_errors(true);
    try {
        $xml = new SimpleXMLElement($body);
    } catch (\Exception) {
        // SimpleXMLElement throws when the string cannot be parsed as XML.
        return [];
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }

    $xml->registerXPathNamespace('atom', 'http://www.w3.org/2005/Atom');
    $entries = $xml->xpath('//atom:entry') ?: [];

    $items = [];
    foreach ($entries as $entry) {
        $arxivId = $this->extractArxivId((string) ($entry->id ?? ''));
        if (! $arxivId) {
            continue;
        }

        $authorsParsed = [];
        foreach ($entry->author as $author) {
            // Affiliations live in the arXiv extension namespace, not in Atom itself.
            $author->registerXPathNamespace('arxiv', 'http://arxiv.org/schemas/atom');
            $name = trim((string) ($author->name ?? ''));
            $affNodes = $author->xpath('arxiv:affiliation') ?: [];
            $affiliation = trim((string) ($affNodes[0] ?? ''));
            if ($name !== '') {
                $authorsParsed[] = [
                    'name' => $name,
                    'email' => null,
                    'affiliation' => $affiliation !== '' ? $affiliation : null,
                    'university_name' => CrawlAuthorParser::universityFromAffiliation($affiliation),
                ];
            }
        }

        $authorNames = array_column($authorsParsed, 'name');
        $authors = implode('; ', $authorNames);

        // Keep only the leading YYYY-MM-DD part of the published timestamp.
        $published = (string) ($entry->published ?? '');
        $publishedAt = $published ? substr($published, 0, 10) : null;

        $lead = CrawlAuthorParser::leadAuthor($authors, $authorsParsed);

        $items[] = new CrawlItemDto(
            externalId: 'arxiv:'.$arxivId,
            title: ArxivTextNormalizer::normalize(trim((string) ($entry->title ?? ''))) ?? '',
            canonicalUrl: 'https://arxiv.org/abs/'.$arxivId,
            authors: $authors,
            summary: ArxivTextNormalizer::normalize(trim((string) ($entry->summary ?? ''))),
            publishedAt: $publishedAt,
            schoolName: $lead['university_name'] ?? null,
            extra: [
                'platform' => 'arxiv',
                'arxiv_id' => $arxivId,
                'keyword' => $keyword,
                'source' => 'api',
                'authors_parsed' => $authorsParsed,
                'lead_author' => $lead,
            ],
            authorsParsed: $authorsParsed,
        );
    }

    return $items;
}
/**
 * HTML search-page fallback (for when the export API is answering 429).
 * Do not send a `size` parameter: the search endpoint responds with 400 if it is present.
 *
 * A non-successful response is reported as an empty list.
 *
 * @return CrawlItemDto[]
 */
protected function fetchViaHtmlSearch(string $keyword, int $maxResults): array
{
    $query = [
        'query' => $keyword,
        'searchtype' => 'all',
    ];

    $response = $this->gate->run(fn () => $this->sendRequest(self::SEARCH_URL, $query));

    return $response->successful()
        ? $this->parseSearchHtml($response->body(), $keyword, $maxResults)
        : [];
}
/**
 * Extract crawl items from the arXiv search results page.
 *
 * Each `<li class="arxiv-result">` block is scanned with regular expressions for the
 * abs link (identifier), title, author links and abstract. Blocks without an abs link
 * are skipped. Text fragments, including author names, have their tags stripped and
 * HTML entities decoded, so a name such as `O&#39;Brien` is stored as `O'Brien`
 * rather than being kept encoded and later split on the entity's semicolon.
 *
 * @param string $html       Raw HTML of the search page.
 * @param string $keyword    Search keyword, recorded in each item's `extra` data.
 * @param int    $maxResults Upper bound on the number of items returned.
 *
 * @return CrawlItemDto[]
 */
protected function parseSearchHtml(string $html, string $keyword, int $maxResults): array
{
    if (! preg_match_all('#<li class="arxiv-result">(.*?)</li>#s', $html, $blocks)) {
        return [];
    }

    // Markup fragment -> trimmed plain text with entities decoded.
    $plainText = static fn (string $fragment): string => trim(
        html_entity_decode(strip_tags($fragment), ENT_QUOTES | ENT_HTML5, 'UTF-8')
    );

    $items = [];
    foreach ($blocks[1] as $block) {
        if (count($items) >= $maxResults) {
            break;
        }
        if (! preg_match('#arxiv\.org/abs/([^"?\s]+)#', $block, $idMatch)) {
            continue;
        }
        $arxivId = $idMatch[1];

        $title = '';
        if (preg_match('#<p class="title is-5 mathjax">\s*(.*?)\s*</p>#s', $block, $titleMatch)) {
            $title = ArxivTextNormalizer::normalize($plainText($titleMatch[1])) ?? '';
        }

        // Author names are the text of the links inside the authors paragraph.
        $authorNames = [];
        if (preg_match('#<p class="authors">(.*?)</p>#s', $block, $authorMatch)
            && preg_match_all('#<a[^>]*>([^<]+)</a>#', $authorMatch[1], $anchorTexts)
        ) {
            foreach ($anchorTexts[1] as $anchorText) {
                $name = $plainText($anchorText);
                if ($name !== '') {
                    $authorNames[] = $name;
                }
            }
        }
        $authors = implode('; ', $authorNames);

        // Prefer the full abstract; fall back to the truncated one.
        $summary = '';
        if (preg_match('#<span class="abstract-full[^"]*"[^>]*>(.*?)</span>#s', $block, $abstractMatch)) {
            $summary = ArxivTextNormalizer::normalize($plainText($abstractMatch[1]));
        } elseif (preg_match('#<span class="abstract-short[^"]*"[^>]*>(.*?)</span>#s', $block, $abstractShort)) {
            $summary = ArxivTextNormalizer::normalize($plainText($abstractShort[1]));
        }

        $publishedAt = ArxivMetadataParser::parsePublishedDate($block);

        // The search page exposes names only, so the remaining author fields stay null.
        $authorsParsed = array_map(
            static fn (string $name): array => [
                'name' => $name,
                'email' => null,
                'affiliation' => null,
                'university_name' => null,
            ],
            $authorNames
        );

        $lead = CrawlAuthorParser::leadAuthor($authors, $authorsParsed);

        $items[] = new CrawlItemDto(
            externalId: 'arxiv:'.$arxivId,
            title: $title,
            canonicalUrl: 'https://arxiv.org/abs/'.$arxivId,
            authors: $authors,
            summary: $summary,
            publishedAt: $publishedAt,
            schoolName: $lead['university_name'] ?? null,
            extra: [
                'platform' => 'arxiv',
                'arxiv_id' => $arxivId,
                'keyword' => $keyword,
                'source' => 'html_search',
                'pdf_url' => ArxivMetadataParser::extractPdfUrl($block, $arxivId),
                'html_url' => ArxivMetadataParser::extractHtmlUrl($block, $arxivId),
                'authors_parsed' => $authorsParsed,
                'lead_author' => $lead,
            ],
            authorsParsed: $authorsParsed,
        );
    }

    return $items;
}
/**
 * Issue a GET request to arXiv with the crawler's identifying headers.
 *
 * Timeouts and the contact e-mail come from the `crawl.arxiv.*` configuration.
 * The HTTP client retries only on ConnectionException (2 attempts, 2000 ms apart)
 * and is told not to throw for failed responses.
 *
 * @param array<string, mixed> $queryParams
 */
protected function sendRequest(string $url, array $queryParams): Response
{
    $contactEmail = (string) config('crawl.arxiv.contact_email', 'support@example.com');
    $requestTimeout = (int) config('crawl.arxiv.http_timeout_seconds', 60);
    $handshakeTimeout = (int) config('crawl.arxiv.connect_timeout_seconds', 30);

    $headers = [
        'User-Agent' => 'SlakeSchool-Crawler/1.0 (https://arxiv.org; mailto:'.$contactEmail.')',
        'Accept' => 'application/atom+xml, text/html;q=0.9',
    ];

    $retryOnConnectionFailure = fn ($exception) => $exception instanceof ConnectionException;

    $request = Http::timeout($requestTimeout)
        ->connectTimeout($handshakeTimeout)
        ->retry(2, 2000, $retryOnConnectionFailure, throw: false)
        ->withHeaders($headers);

    return $request->get($url, $queryParams);
}
/**
 * Pull the arXiv identifier out of an abs URL.
 *
 * Handles both identifier schemes:
 *  - old style, `archive[.SC]/YYMMNNN[vN]` (e.g. `hep-th/9901001v1`, `math.GT/0309136`),
 *    which contains a slash and must be captured as a whole;
 *  - everything else (e.g. `2101.00001v2`), captured up to the next slash, `?` or whitespace.
 *
 * @return string|null The identifier (including any version suffix), or null when the
 *                     string contains no `/abs/` segment followed by an identifier.
 */
protected function extractArxivId(string $idUrl): ?string
{
    // Old-style ids first: the generic pattern below would cut them at the slash,
    // collapsing every paper of an archive onto the same id.
    if (preg_match('~/abs/([a-z][a-z\-]*(?:\.[a-z]{2})?/\d{7}(?:v\d+)?)~i', $idUrl, $m) === 1) {
        return $m[1];
    }

    if (preg_match('~/abs/([^\s/?]+)~i', $idUrl, $m) === 1) {
        return $m[1];
    }

    return null;
}
/**
 * Last step before items leave the adapter: optionally run them through the abs enricher.
 * An empty list, or a call with enrichment disabled, is returned untouched.
 *
 * @param list<CrawlItemDto> $items
 * @return list<CrawlItemDto>
 */
protected function finalizeItems(array $items, bool $enrichAbs = true): array
{
    if (! $enrichAbs || $items === []) {
        return $items;
    }

    return $this->absEnricher->enrichMany($items);
}
/**
 * Build a lookup set of the external ids of all papers whose `source` is `crawl`
 * and whose `external_id` is not null.
 *
 * @return array<string, true> External id => true, for isset() membership checks.
 */
protected function loadImportedExternalIds(): array
{
    $crawledPapers = Paper::query()
        ->where('source', 'crawl')
        ->whereNotNull('external_id');

    return array_fill_keys($crawledPapers->pluck('external_id')->all(), true);
}
/**
 * Decide how many API pages may be scanned.
 *
 * Without skipping, this is the requested page count clamped to 1..20. With skipping
 * enabled the budget is widened to ten times the number of 50-entry pages needed to
 * hold `$maxResults` (never below the clamped request, never above 200).
 */
protected function resolveMaxScanPages(int $maxPages, int $maxResults, bool $skipImported): int
{
    $requestedPages = max(1, min(20, $maxPages));

    if ($skipImported === false) {
        return $requestedPages;
    }

    $pagesForTarget = (int) ceil($maxResults / 50);
    $scanBudget = max($requestedPages, $pagesForTarget * 10);

    return $scanBudget > 200 ? 200 : $scanBudget;
}
}