You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

312 lines
11 KiB

4 weeks ago
<?php
namespace App\Services\Crawl\Adapters;
use App\Models\CrawlSource;
use App\Services\Crawl\ArxivAbsEnricher;
use App\Services\Crawl\ArxivMetadataParser;
use App\Services\Crawl\ArxivRequestGate;
use App\Services\Crawl\ArxivTextNormalizer;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlKeywordParser;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlItemDto;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\RequestException;
use Illuminate\Http\Client\Response;
use Illuminate\Support\Facades\Http;
use SimpleXMLElement;
/**
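* arXiv paper crawler: prefers the export API and falls back to parsing the search-page HTML when the API is rate limited (HTTP 429) or otherwise fails.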
*
* @see https://info.arxiv.org/help/api/user-manual.html
* @see https://info.arxiv.org/help/api/tou.html
*/
class ArxivApiAdapter implements CrawlerAdapterInterface
{
private const API_URL = 'https://export.arxiv.org/api/query';
private const SEARCH_URL = 'https://arxiv.org/search/';
/**
 * @param ArxivRequestGate $gate        Every outgoing arXiv request in this class is executed through $gate->run().
 * @param ArxivAbsEnricher $absEnricher Used by finalizeItems() to enrich the collected items via enrichMany().
 */
public function __construct(
protected ArxivRequestGate $gate,
protected ArxivAbsEnricher $absEnricher,
) {}
/**
 * Collect arXiv items for a keyword, preferring the export API and falling
 * back to the HTML search page when the API cannot be used.
 *
 * $requestUrl and $source are not read here; they are part of the
 * CrawlerAdapterInterface signature.
 *
 * @param array<string, mixed> $params Reads 'keyword' (trimmed) and 'max_results'
 *                                     (default 20, clamped to the range 1..50).
 *
 * @return list<CrawlItemDto> Empty when the API answered successfully but the feed held no usable entry.
 *
 * @throws \RuntimeException When the HTML fallback was needed and produced no items.
 */
public function fetch(string $requestUrl, CrawlSource $source, array $params): array
{
    $keywordRaw = trim((string) ($params['keyword'] ?? ''));
    $maxResults = min(50, max(1, (int) ($params['max_results'] ?? 20)));

    if ((bool) config('crawl.arxiv.prefer_html_search', false)) {
        return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults));
    }

    try {
        $response = $this->requestApiOnce([
            'search_query' => CrawlKeywordParser::buildArxivSearchQuery($keywordRaw),
            'start' => 0,
            'max_results' => $maxResults,
            'sortBy' => 'submittedDate',
            'sortOrder' => 'descending',
        ]);
    } catch (ConnectionException|RequestException $e) {
        return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults, $e));
    }

    if (! $response->successful()) {
        // sendRequest() does not throw on HTTP errors, so wrap the failed response
        // ourselves: requireHtmlSearchItems() inspects the previous exception to
        // pick its 429-specific message, which was otherwise unreachable.
        return $this->finalizeItems(
            $this->requireHtmlSearchItems($keywordRaw, $maxResults, new RequestException($response)),
        );
    }

    try {
        $items = $this->parseAtomFeed($response->body(), $keywordRaw);
    } catch (\Exception $e) {
        // A 2xx answer whose body is not parseable XML (SimpleXMLElement raises a
        // plain \Exception) is treated like an API failure rather than aborting
        // the crawl; the cause is kept as the previous exception.
        return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults, $e));
    }

    if ($items === []) {
        return [];
    }

    return $this->finalizeItems(array_slice($items, 0, $maxResults));
}
/**
 * Send one query to the export API through the request gate.
 *
 * Despite the name, a ConnectionException on the first pass triggers exactly
 * one more pass after a 3-second pause; a failure of that second pass propagates.
 *
 * @param array<string, mixed> $queryParams Query-string parameters for the API URL.
 */
protected function requestApiOnce(array $queryParams): Response
{
    $call = fn () => $this->sendRequest(self::API_URL, $queryParams);

    try {
        $result = $this->gate->run($call);
    } catch (ConnectionException $e) {
        sleep(3);
        $result = $this->gate->run($call);
    }

    return $result;
}
/**
 * Run the HTML search fallback and insist on a non-empty result.
 *
 * @param \Throwable|null $previous Failure that led to the fallback, if any. It is chained
 *                                  onto the thrown exception and, when it is a RequestException
 *                                  whose response status is 429, selects the rate-limit message.
 *
 * @return list<CrawlItemDto>
 *
 * @throws \RuntimeException When the HTML search yields no items.
 */
protected function requireHtmlSearchItems(string $keyword, int $maxResults, ?\Throwable $previous = null): array
{
    $found = $this->fetchViaHtmlSearch($keyword, $maxResults);

    if ($found === []) {
        $rateLimited = $previous instanceof RequestException
            && $previous->response?->status() === 429;

        if ($rateLimited) {
            $message = 'arXiv 访问过于频繁HTTP 429请等待 12 分钟后再试';
        } else {
            $message = 'arXiv 搜索页抓取失败,请检查网络或稍后重试';
        }

        throw new \RuntimeException($message, 0, $previous);
    }

    return $found;
}
/**
 * Turn an export-API Atom document into crawl items.
 *
 * Entries whose <id> does not yield an arXiv identifier are skipped, as are
 * authors without a name. Only the first arxiv:affiliation of an author is used.
 *
 * @param string $body    Raw Atom XML.
 * @param string $keyword Search keyword, recorded in each item's extra data.
 *
 * @return CrawlItemDto[]
 *
 * @throws \Exception When $body cannot be parsed as XML (raised by SimpleXMLElement).
 */
protected function parseAtomFeed(string $body, string $keyword): array
{
    $atomNs = 'http://www.w3.org/2005/Atom';

    $feed = new SimpleXMLElement($body);
    $feed->registerXPathNamespace('atom', $atomNs);

    $result = [];
    foreach ($feed->xpath('//atom:entry') ?: [] as $node) {
        $node->registerXPathNamespace('atom', $atomNs);

        $arxivId = $this->extractArxivId((string) ($node->id ?? ''));
        if (! $arxivId) {
            continue;
        }

        $people = [];
        foreach ($node->author as $person) {
            $person->registerXPathNamespace('arxiv', 'http://arxiv.org/schemas/atom');
            $fullName = trim((string) ($person->name ?? ''));
            $affiliationNodes = $person->xpath('arxiv:affiliation') ?: [];
            $organisation = trim((string) ($affiliationNodes[0] ?? ''));

            if ($fullName === '') {
                continue;
            }

            $people[] = [
                'name' => $fullName,
                'email' => null,
                'affiliation' => $organisation === '' ? null : $organisation,
                'university_name' => CrawlAuthorParser::universityFromAffiliation($organisation),
            ];
        }

        $joinedNames = implode('; ', array_column($people, 'name'));

        // Keep only the leading YYYY-MM-DD part of the published timestamp.
        $publishedAt = null;
        $publishedRaw = (string) ($node->published ?? '');
        if ($publishedRaw) {
            $publishedAt = substr($publishedRaw, 0, 10);
        }

        $lead = CrawlAuthorParser::leadAuthor($joinedNames, $people);
        $title = ArxivTextNormalizer::normalize(trim((string) ($node->title ?? ''))) ?? '';
        $summary = ArxivTextNormalizer::normalize(trim((string) ($node->summary ?? '')));

        $result[] = new CrawlItemDto(
            externalId: 'arxiv:'.$arxivId,
            title: $title,
            canonicalUrl: 'https://arxiv.org/abs/'.$arxivId,
            authors: $joinedNames,
            summary: $summary,
            publishedAt: $publishedAt,
            schoolName: $lead['university_name'] ?? null,
            extra: [
                'platform' => 'arxiv',
                'arxiv_id' => $arxivId,
                'keyword' => $keyword,
                'source' => 'api',
                'authors_parsed' => $people,
                'lead_author' => $lead,
            ],
            authorsParsed: $people,
        );
    }

    return $result;
}
/**
 * Search-page fallback (used, for example, when the export API answers 429).
 *
 * Do not add a `size` query parameter: per the original author's note the
 * search endpoint then responds with HTTP 400.
 *
 * @return CrawlItemDto[] Empty when the search page did not answer successfully.
 */
protected function fetchViaHtmlSearch(string $keyword, int $maxResults): array
{
    $query = [
        'query' => $keyword,
        'searchtype' => 'all',
    ];

    $page = $this->gate->run(fn () => $this->sendRequest(self::SEARCH_URL, $query));

    return $page->successful()
        ? $this->parseSearchHtml($page->body(), $keyword, $maxResults)
        : [];
}
/**
 * Parse an arXiv search-result page into crawl items.
 *
 * Each `<li class="arxiv-result">` block becomes one item; blocks without an
 * abs link are skipped and parsing stops once $maxResults items were built.
 * The summary comes from the `abstract-full` span, or from `abstract-short`
 * when the full one is absent.
 *
 * @param string $html       Search page markup.
 * @param string $keyword    Search keyword, recorded in each item's extra data.
 * @param int    $maxResults Upper bound on the number of returned items.
 *
 * @return CrawlItemDto[]
 */
protected function parseSearchHtml(string $html, string $keyword, int $maxResults): array
{
    if (! preg_match_all('#<li class="arxiv-result">(.*?)</li>#s', $html, $blocks)) {
        return [];
    }

    $items = [];
    foreach ($blocks[1] as $block) {
        if (count($items) >= $maxResults) {
            break;
        }
        if (! preg_match('#arxiv\.org/abs/([^"?\s]+)#', $block, $idMatch)) {
            continue;
        }
        $arxivId = $idMatch[1];

        $title = '';
        if (preg_match('#<p class="title is-5 mathjax">\s*(.*?)\s*</p>#s', $block, $titleMatch)) {
            $title = ArxivTextNormalizer::normalize($this->htmlFragmentToText($titleMatch[1])) ?? '';
        }

        // Collect names as a list. Joining and re-splitting on ';' would cut a
        // name apart at every HTML entity (e.g. "O&#39;Brien"), so entities are
        // decoded here and the list itself feeds $authorsParsed.
        $authorNames = [];
        if (preg_match('#<p class="authors">(.*?)</p>#s', $block, $authorMatch)
            && preg_match_all('#<a[^>]*>([^<]+)</a>#', $authorMatch[1], $anchorTexts)
        ) {
            foreach ($anchorTexts[1] as $rawName) {
                $name = trim(html_entity_decode($rawName, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
                if ($name !== '') {
                    $authorNames[] = $name;
                }
            }
        }
        $authors = implode('; ', $authorNames);

        // A lazy "(.*?)</span>" would stop at the first nested </span> inside the
        // abstract, so the span is extracted with nesting taken into account.
        $summary = '';
        $abstractHtml = $this->extractSpanInnerHtml($block, 'abstract-full')
            ?? $this->extractSpanInnerHtml($block, 'abstract-short');
        if ($abstractHtml !== null) {
            $summary = ArxivTextNormalizer::normalize($this->htmlFragmentToText($abstractHtml));
        }

        $publishedAt = ArxivMetadataParser::parsePublishedDate($block);

        $authorsParsed = array_map(
            static fn (string $name): array => [
                'name' => $name,
                'email' => null,
                'affiliation' => null,
                'university_name' => null,
            ],
            $authorNames,
        );

        $lead = CrawlAuthorParser::leadAuthor($authors, $authorsParsed);

        $items[] = new CrawlItemDto(
            externalId: 'arxiv:'.$arxivId,
            title: $title,
            canonicalUrl: 'https://arxiv.org/abs/'.$arxivId,
            authors: $authors,
            summary: $summary,
            publishedAt: $publishedAt,
            schoolName: $lead['university_name'] ?? null,
            extra: [
                'platform' => 'arxiv',
                'arxiv_id' => $arxivId,
                'keyword' => $keyword,
                'source' => 'html_search',
                'pdf_url' => ArxivMetadataParser::extractPdfUrl($block, $arxivId),
                'html_url' => ArxivMetadataParser::extractHtmlUrl($block, $arxivId),
                'authors_parsed' => $authorsParsed,
                'lead_author' => $lead,
            ],
            authorsParsed: $authorsParsed,
        );
    }

    return $items;
}

/**
 * Reduce an HTML fragment to trimmed plain text: tags removed, entities decoded.
 */
private function htmlFragmentToText(string $fragment): string
{
    return trim(html_entity_decode(strip_tags($fragment), ENT_QUOTES | ENT_HTML5, 'UTF-8'));
}

/**
 * Return the inner HTML of the first <span> whose class attribute starts with
 * $classPrefix, honouring nested spans; null when absent or never closed.
 */
private function extractSpanInnerHtml(string $html, string $classPrefix): ?string
{
    $openPattern = '#<span class="'.preg_quote($classPrefix, '#').'[^"]*"[^>]*>#';
    if (! preg_match($openPattern, $html, $open, PREG_OFFSET_CAPTURE)) {
        return null;
    }

    // Offsets are byte offsets, which is consistent with strlen()/substr().
    $start = $open[0][1] + strlen($open[0][0]);
    $offset = $start;
    $depth = 1;

    while (preg_match('#<(/?)span\b[^>]*>#i', $html, $tag, PREG_OFFSET_CAPTURE, $offset)) {
        $depth += $tag[1][0] === '/' ? -1 : 1;
        if ($depth === 0) {
            return substr($html, $start, $tag[0][1] - $start);
        }
        $offset = $tag[0][1] + strlen($tag[0][0]);
    }

    return null;
}
/**
 * Issue a GET request to arXiv with the crawler's identifying headers.
 *
 * Timeouts and the contact e-mail come from the crawl.arxiv.* configuration.
 * The client is set up with retry(2, 2000, ...) restricted to
 * ConnectionException and with throw: false.
 *
 * @param array<string, mixed> $queryParams Query-string parameters.
 */
protected function sendRequest(string $url, array $queryParams): Response
{
    $contact = (string) config('crawl.arxiv.contact_email', 'support@example.com');
    $totalTimeout = (int) config('crawl.arxiv.http_timeout_seconds', 60);
    $dialTimeout = (int) config('crawl.arxiv.connect_timeout_seconds', 30);

    $headers = [
        'User-Agent' => 'SlakeSchool-Crawler/1.0 (https://arxiv.org; mailto:'.$contact.')',
        'Accept' => 'application/atom+xml, text/html;q=0.9',
    ];

    $onlyConnectionErrors = fn ($exception) => $exception instanceof ConnectionException;

    $client = Http::timeout($totalTimeout)
        ->connectTimeout($dialTimeout)
        ->retry(2, 2000, $onlyConnectionErrors, throw: false)
        ->withHeaders($headers);

    return $client->get($url, $queryParams);
}
/**
 * Extract the arXiv identifier from an abs URL such as an Atom entry <id>.
 *
 * Handles both identifier schemes: new-style ("2301.12345v2") and old-style
 * ("hep-th/9901001v1"), where the identifier itself contains a slash. An
 * "arxiv.org/abs/" URL is tried first (case-insensitively), then any "/abs/" path.
 *
 * @return string|null The identifier, including a version suffix when present; null if none is found.
 */
protected function extractArxivId(string $idUrl): ?string
{
    // First path segment after /abs/, optionally followed by the 7-digit number
    // (plus version) of an old-style identifier. Without the optional part the
    // capture stopped at the slash and returned only the archive name.
    $id = '([^\s/?]+(?:/\d{7}(?:v\d+)?(?=[\s/?#]|$))?)';

    foreach (['~arxiv\.org/abs/'.$id.'~i', '~/abs/'.$id.'~'] as $pattern) {
        if (preg_match($pattern, $idUrl, $m)) {
            return $m[1];
        }
    }

    return null;
}
/**
 * Last step before items are returned: optionally enrich them through the
 * abs enricher. An empty list, or $enrichAbs === false, is returned untouched.
 *
 * @param list<CrawlItemDto> $items
 * @param bool               $enrichAbs Whether to pass the items through enrichMany().
 *
 * @return list<CrawlItemDto>
 */
protected function finalizeItems(array $items, bool $enrichAbs = true): array
{
    if ($items === [] || ! $enrichAbs) {
        return $items;
    }

    return $this->absEnricher->enrichMany($items);
}
}