You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

312 lines
11 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
namespace App\Services\Crawl\Adapters;
use App\Models\CrawlSource;
use App\Services\Crawl\ArxivAbsEnricher;
use App\Services\Crawl\ArxivMetadataParser;
use App\Services\Crawl\ArxivRequestGate;
use App\Services\Crawl\ArxivTextNormalizer;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlKeywordParser;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlItemDto;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\RequestException;
use Illuminate\Http\Client\Response;
use Illuminate\Support\Facades\Http;
use SimpleXMLElement;
/**
* arXiv paper crawler: prefers the export API and falls back to parsing the search-page HTML when rate-limited or on HTTP 429.
*
* @see https://info.arxiv.org/help/api/user-manual.html
* @see https://info.arxiv.org/help/api/tou.html
*/
class ArxivApiAdapter implements CrawlerAdapterInterface
{
private const API_URL = 'https://export.arxiv.org/api/query';
private const SEARCH_URL = 'https://arxiv.org/search/';
/**
 * Wire up the adapter's collaborators.
 *
 * @param ArxivRequestGate $gate        Every outbound request in this class is executed through $gate->run().
 * @param ArxivAbsEnricher $absEnricher Post-processes item lists via enrichMany() in finalizeItems().
 */
public function __construct(
protected ArxivRequestGate $gate,
protected ArxivAbsEnricher $absEnricher,
) {}
/**
 * Fetch arXiv items for a keyword, preferring the export API and falling
 * back to the HTML search page when the API cannot be used.
 *
 * $requestUrl and $source are not read by this adapter.
 *
 * @param array<string, mixed> $params Reads 'keyword' (trimmed string) and
 *                                     'max_results' (default 20, clamped to 1..50).
 * @return list<CrawlItemDto> Empty when the API answered successfully with no usable entries.
 *
 * @throws \RuntimeException When the HTML search fallback is used and yields no items.
 */
public function fetch(string $requestUrl, CrawlSource $source, array $params): array
{
    $keywordRaw = trim((string) ($params['keyword'] ?? ''));
    $maxResults = min(50, max(1, (int) ($params['max_results'] ?? 20)));

    // Configuration can force the HTML search page and skip the API entirely.
    if ((bool) config('crawl.arxiv.prefer_html_search', false)) {
        return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults));
    }

    try {
        $response = $this->requestApiOnce([
            'search_query' => CrawlKeywordParser::buildArxivSearchQuery($keywordRaw),
            'start' => 0,
            'max_results' => $maxResults,
            'sortBy' => 'submittedDate',
            'sortOrder' => 'descending',
        ]);
    } catch (ConnectionException|RequestException $e) {
        return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults, $e));
    }

    if (! $response->successful()) {
        // Hand the failed response over as a RequestException so that
        // requireHtmlSearchItems() can tell a 429 apart from other failures
        // and report the rate-limit hint instead of the generic one.
        // toException() yields null for non-error statuses (e.g. 3xx).
        return $this->finalizeItems(
            $this->requireHtmlSearchItems($keywordRaw, $maxResults, $response->toException())
        );
    }

    $items = $this->parseAtomFeed($response->body(), $keywordRaw);
    if ($items === []) {
        // A successful but empty feed is a legitimate "no results" answer.
        return [];
    }

    return $this->finalizeItems(array_slice($items, 0, $maxResults));
}
/**
 * Send one query to the export API through the request gate.
 *
 * Despite the name, a ConnectionException triggers exactly one further
 * attempt after a 3 second pause; a failure of that second attempt propagates.
 *
 * @param array<string, mixed> $queryParams
 */
protected function requestApiOnce(array $queryParams): Response
{
    $attempt = fn () => $this->gate->run(
        fn () => $this->sendRequest(self::API_URL, $queryParams)
    );

    try {
        return $attempt();
    } catch (ConnectionException) {
        // Give the connection a moment before the single follow-up attempt.
        sleep(3);
    }

    return $attempt();
}
/**
 * Run the HTML search fallback and insist on a non-empty result.
 *
 * @param \Throwable|null $previous Failure that led to the fallback; attached as the
 *                                  cause of the thrown exception and used to pick the message.
 * @return list<CrawlItemDto>
 *
 * @throws \RuntimeException When the search page yields no items.
 */
protected function requireHtmlSearchItems(string $keyword, int $maxResults, ?\Throwable $previous = null): array
{
    $found = $this->fetchViaHtmlSearch($keyword, $maxResults);

    if ($found === []) {
        // Only a RequestException carrying a 429 response gets the rate-limit message.
        $rateLimited = $previous instanceof RequestException
            && $previous->response?->status() === 429;

        if ($rateLimited) {
            $message = 'arXiv 访问过于频繁HTTP 429请等待 12 分钟后再试';
        } else {
            $message = 'arXiv 搜索页抓取失败,请检查网络或稍后重试';
        }

        throw new \RuntimeException($message, 0, $previous);
    }

    return $found;
}
/**
 * Convert an Atom feed body from the export API into crawl items.
 *
 * Entries whose <id> does not yield an arXiv identifier are skipped, as are
 * authors without a name. The SimpleXMLElement constructor throws when $body
 * is not well-formed XML; that exception is not caught here.
 *
 * @return CrawlItemDto[]
 */
protected function parseAtomFeed(string $body, string $keyword): array
{
    $atomNs = 'http://www.w3.org/2005/Atom';

    $feed = new SimpleXMLElement($body);
    $feed->registerXPathNamespace('atom', $atomNs);

    $result = [];
    foreach ($feed->xpath('//atom:entry') ?: [] as $entry) {
        $entry->registerXPathNamespace('atom', $atomNs);

        $arxivId = $this->extractArxivId((string) ($entry->id ?? ''));
        if (! $arxivId) {
            continue;
        }

        // Collect one structured record per named author.
        $authorsParsed = [];
        foreach ($entry->author as $authorNode) {
            $authorNode->registerXPathNamespace('arxiv', 'http://arxiv.org/schemas/atom');
            $authorName = trim((string) ($authorNode->name ?? ''));
            $affiliationNodes = $authorNode->xpath('arxiv:affiliation') ?: [];
            $affiliation = trim((string) ($affiliationNodes[0] ?? ''));

            if ($authorName === '') {
                continue;
            }

            $authorsParsed[] = [
                'name' => $authorName,
                'email' => null,
                'affiliation' => $affiliation !== '' ? $affiliation : null,
                'university_name' => CrawlAuthorParser::universityFromAffiliation($affiliation),
            ];
        }

        $authorLine = implode('; ', array_column($authorsParsed, 'name'));

        // Keep only the leading YYYY-MM-DD portion of the timestamp.
        $publishedAt = null;
        $publishedRaw = (string) ($entry->published ?? '');
        if ($publishedRaw) {
            $publishedAt = substr($publishedRaw, 0, 10);
        }

        $lead = CrawlAuthorParser::leadAuthor($authorLine, $authorsParsed);

        $result[] = new CrawlItemDto(
            externalId: 'arxiv:'.$arxivId,
            title: ArxivTextNormalizer::normalize(trim((string) ($entry->title ?? ''))) ?? '',
            canonicalUrl: 'https://arxiv.org/abs/'.$arxivId,
            authors: $authorLine,
            summary: ArxivTextNormalizer::normalize(trim((string) ($entry->summary ?? ''))),
            publishedAt: $publishedAt,
            schoolName: $lead['university_name'] ?? null,
            extra: [
                'platform' => 'arxiv',
                'arxiv_id' => $arxivId,
                'keyword' => $keyword,
                'source' => 'api',
                'authors_parsed' => $authorsParsed,
                'lead_author' => $lead,
            ],
            authorsParsed: $authorsParsed,
        );
    }

    return $result;
}
/**
 * Search-page fallback (used when the export API answers with 429).
 * Do not send a `size` parameter: the search endpoint rejects it with HTTP 400.
 *
 * @return CrawlItemDto[] Empty when the search page response is not successful.
 */
protected function fetchViaHtmlSearch(string $keyword, int $maxResults): array
{
    $query = [
        'query' => $keyword,
        'searchtype' => 'all',
    ];

    $searchPage = $this->gate->run(fn () => $this->sendRequest(self::SEARCH_URL, $query));

    return $searchPage->successful()
        ? $this->parseSearchHtml($searchPage->body(), $keyword, $maxResults)
        : [];
}
/**
 * Parse an arXiv search-results page into crawl items.
 *
 * Each `<li class="arxiv-result">` block yields at most one item; blocks
 * without an abs link are skipped and parsing stops after $maxResults items.
 * Affiliation data is not read from the search page, so every parsed author
 * has null affiliation and university.
 *
 * @return CrawlItemDto[]
 */
protected function parseSearchHtml(string $html, string $keyword, int $maxResults): array
{
    if (! preg_match_all('#<li class="arxiv-result">(.*?)</li>#s', $html, $blocks)) {
        return [];
    }

    $items = [];
    foreach ($blocks[1] as $block) {
        if (count($items) >= $maxResults) {
            break;
        }
        if (! preg_match('#arxiv\.org/abs/([^"?\s]+)#', $block, $idMatch)) {
            continue;
        }
        $arxivId = $idMatch[1];

        $title = '';
        if (preg_match('#<p class="title is-5 mathjax">\s*(.*?)\s*</p>#s', $block, $titleMatch)) {
            $title = ArxivTextNormalizer::normalize($this->htmlFragmentToText($titleMatch[1])) ?? '';
        }

        // Decode entities in author names and keep them as a list. Splitting the
        // joined string on ';' would break an escaped name such as "O&#39;Brien"
        // into two bogus authors.
        $authorNames = [];
        if (preg_match('#<p class="authors">(.*?)</p>#s', $block, $authorMatch)
            && preg_match_all('#<a[^>]*>([^<]+)</a>#', $authorMatch[1], $anchorTexts)) {
            foreach ($anchorTexts[1] as $rawName) {
                $name = trim(html_entity_decode($rawName, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
                if ($name !== '') {
                    $authorNames[] = $name;
                }
            }
        }
        $authors = implode('; ', $authorNames);
        $authorsParsed = array_map(
            static fn (string $name): array => [
                'name' => $name,
                'email' => null,
                'affiliation' => null,
                'university_name' => null,
            ],
            $authorNames,
        );

        // Prefer the full abstract; fall back to the shortened one.
        $summary = '';
        $abstractHtml = $this->extractSpanInnerHtml($block, 'abstract-full')
            ?? $this->extractSpanInnerHtml($block, 'abstract-short');
        if ($abstractHtml !== null) {
            $summary = ArxivTextNormalizer::normalize($this->htmlFragmentToText($abstractHtml));
        }

        $publishedAt = ArxivMetadataParser::parsePublishedDate($block);
        $lead = CrawlAuthorParser::leadAuthor($authors, $authorsParsed);

        $items[] = new CrawlItemDto(
            externalId: 'arxiv:'.$arxivId,
            title: $title,
            canonicalUrl: 'https://arxiv.org/abs/'.$arxivId,
            authors: $authors,
            summary: $summary,
            publishedAt: $publishedAt,
            schoolName: $lead['university_name'] ?? null,
            extra: [
                'platform' => 'arxiv',
                'arxiv_id' => $arxivId,
                'keyword' => $keyword,
                'source' => 'html_search',
                'pdf_url' => ArxivMetadataParser::extractPdfUrl($block, $arxivId),
                'html_url' => ArxivMetadataParser::extractHtmlUrl($block, $arxivId),
                'authors_parsed' => $authorsParsed,
                'lead_author' => $lead,
            ],
            authorsParsed: $authorsParsed,
        );
    }

    return $items;
}

/**
 * Reduce an HTML fragment to trimmed plain text with entities decoded.
 */
private function htmlFragmentToText(string $fragment): string
{
    return trim(html_entity_decode(strip_tags($fragment), ENT_QUOTES | ENT_HTML5, 'UTF-8'));
}

/**
 * Return the inner HTML of the first <span> whose class attribute starts with
 * $classPrefix, honouring nested <span> elements.
 *
 * A non-greedy match up to the first `</span>` would cut the content at the
 * first nested span (such as highlight markup inside an abstract). If the
 * spans never balance, the content up to the first closing tag is returned.
 *
 * @return string|null Null when no such span, or no closing tag, is present.
 */
private function extractSpanInnerHtml(string $html, string $classPrefix): ?string
{
    $openPattern = '#<span class="'.preg_quote($classPrefix, '#').'[^"]*"[^>]*>#';
    if (! preg_match($openPattern, $html, $open, PREG_OFFSET_CAPTURE)) {
        return null;
    }

    $contentStart = $open[0][1] + strlen($open[0][0]);
    $cursor = $contentStart;
    $depth = 1;
    $firstClose = null;

    while (preg_match('#<(/?)span\b[^>]*>#', $html, $tag, PREG_OFFSET_CAPTURE, $cursor)) {
        [$tagText, $tagPos] = $tag[0];

        if ($tag[1][0] === '/') {
            $firstClose ??= $tagPos;
            $depth--;
            if ($depth === 0) {
                return substr($html, $contentStart, $tagPos - $contentStart);
            }
        } else {
            $depth++;
        }

        $cursor = $tagPos + strlen($tagText);
    }

    return $firstClose === null
        ? null
        : substr($html, $contentStart, $firstClose - $contentStart);
}
/**
 * Issue a GET request with this crawler's identifying headers.
 *
 * Timeouts and the contact e-mail come from the `crawl.arxiv.*` configuration.
 * The client retries on ConnectionException only (2 attempts, 2000 ms apart)
 * and, because of `throw: false`, returns failed responses instead of throwing.
 *
 * @param array<string, mixed> $queryParams
 */
protected function sendRequest(string $url, array $queryParams): Response
{
    $contactEmail = (string) config('crawl.arxiv.contact_email', 'support@example.com');
    $totalTimeout = (int) config('crawl.arxiv.http_timeout_seconds', 60);
    $dialTimeout = (int) config('crawl.arxiv.connect_timeout_seconds', 30);

    $headers = [
        'User-Agent' => 'SlakeSchool-Crawler/1.0 (https://arxiv.org; mailto:'.$contactEmail.')',
        'Accept' => 'application/atom+xml, text/html;q=0.9',
    ];
    $retryOnConnectionError = fn ($exception) => $exception instanceof ConnectionException;

    $pending = Http::timeout($totalTimeout)
        ->connectTimeout($dialTimeout)
        ->retry(2, 2000, $retryOnConnectionError, throw: false)
        ->withHeaders($headers);

    return $pending->get($url, $queryParams);
}
/**
 * Extract the arXiv identifier from an abs URL.
 *
 * Handles both identifier schemes: new-style ("2301.01234v1") and old-style
 * identifiers that contain a slash ("hep-th/9901001v1", "math.GT/0309136").
 * Any version suffix present in the URL is kept.
 *
 * @return string|null Null when the URL has no recognisable /abs/ segment.
 */
protected function extractArxivId(string $idUrl): ?string
{
    // Old-style identifiers must be tried first: the generic patterns below stop
    // at the first slash and would reduce "hep-th/9901001" to just "hep-th".
    if (preg_match('~/abs/([a-z][a-z\-]*(?:\.[a-z]{2})?/\d{7}(?:v\d+)?)~i', $idUrl, $m)) {
        return $m[1];
    }
    if (preg_match('~arxiv\.org/abs/([^\s/?]+)~i', $idUrl, $m)) {
        return $m[1];
    }
    if (preg_match('~/abs/([^\s/?]+)~', $idUrl, $m)) {
        return $m[1];
    }

    return null;
}
/**
 * Apply the optional abs-page enrichment step to a list of items.
 *
 * @param list<CrawlItemDto> $items
 * @param bool $enrichAbs When false, or when $items is empty, the list is returned untouched.
 * @return list<CrawlItemDto>
 */
protected function finalizeItems(array $items, bool $enrichAbs = true): array
{
    if ($items === [] || ! $enrichAbs) {
        return $items;
    }

    return $this->absEnricher->enrichMany($items);
}
}