|
|
<?php
|
|
|
|
|
|
namespace App\Services\Crawl\Adapters;
|
|
|
|
|
|
use App\Models\CrawlSource;
|
|
|
use App\Services\Crawl\ArxivAbsEnricher;
|
|
|
use App\Services\Crawl\ArxivMetadataParser;
|
|
|
use App\Services\Crawl\ArxivRequestGate;
|
|
|
use App\Services\Crawl\ArxivTextNormalizer;
|
|
|
use App\Services\Crawl\CrawlAuthorParser;
|
|
|
use App\Services\Crawl\CrawlKeywordParser;
|
|
|
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
|
|
|
use App\Services\Crawl\CrawlItemDto;
|
|
|
use Illuminate\Http\Client\ConnectionException;
|
|
|
use Illuminate\Http\Client\RequestException;
|
|
|
use Illuminate\Http\Client\Response;
|
|
|
use Illuminate\Support\Facades\Http;
|
|
|
use SimpleXMLElement;
|
|
|
|
|
|
/**
|
|
|
* arXiv 论文采集:优先 export API,限流或 429 时降级为搜索页 HTML 解析。
|
|
|
*
|
|
|
* @see https://info.arxiv.org/help/api/user-manual.html
|
|
|
* @see https://info.arxiv.org/help/api/tou.html
|
|
|
*/
|
|
|
class ArxivApiAdapter implements CrawlerAdapterInterface
|
|
|
{
|
|
|
private const API_URL = 'https://export.arxiv.org/api/query';
|
|
|
|
|
|
private const SEARCH_URL = 'https://arxiv.org/search/';
|
|
|
|
|
|
/**
 * Wire up the adapter's two collaborators via constructor promotion.
 *
 * @param ArxivRequestGate $gate        Every outbound arXiv request in this class is executed through $gate->run().
 * @param ArxivAbsEnricher $absEnricher Its enrichMany() is applied to non-empty result lists in finalizeItems().
 */
public function __construct(
    protected ArxivRequestGate $gate,
    protected ArxivAbsEnricher $absEnricher,
) {
}
|
|
|
|
|
|
/**
 * Fetch arXiv papers for a keyword, preferring the export API and degrading to the HTML search page.
 *
 * Flow:
 *  1. When config `crawl.arxiv.prefer_html_search` is truthy, only the HTML search page is used.
 *  2. Otherwise the export API is queried (newest submissions first).
 *  3. A connection/request exception or any non-2xx API response triggers the HTML fallback.
 *  4. A 2xx response with no parsable entries yields an empty list.
 *
 * @param string               $requestUrl Required by the adapter interface; not read by this adapter.
 * @param CrawlSource          $source     Required by the adapter interface; not read by this adapter.
 * @param array<string, mixed> $params     Reads `keyword` (string) and `max_results` (clamped to 1..50, default 20).
 *
 * @return CrawlItemDto[]
 *
 * @throws \RuntimeException When the HTML fallback is needed but returns no items.
 */
public function fetch(string $requestUrl, CrawlSource $source, array $params): array
{
    $keywordRaw = trim((string) ($params['keyword'] ?? ''));

    // Clamp the requested page size into the 1..50 range.
    $maxResults = min(50, max(1, (int) ($params['max_results'] ?? 20)));

    if ((bool) config('crawl.arxiv.prefer_html_search', false)) {
        return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults));
    }

    try {
        $response = $this->requestApiOnce([
            'search_query' => CrawlKeywordParser::buildArxivSearchQuery($keywordRaw),
            'start' => 0,
            'max_results' => $maxResults,
            'sortBy' => 'submittedDate',
            'sortOrder' => 'descending',
        ]);
    } catch (ConnectionException|RequestException $e) {
        return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults, $e));
    }

    if (! $response->successful()) {
        // HTTP error statuses (429 included) can arrive here as a regular response rather than
        // as an exception. Convert the response into its RequestException so the fallback can
        // tell a rate limit apart from other failures when it builds the error hint.
        // toException() returns null for non-error statuses, which the fallback accepts.
        return $this->finalizeItems(
            $this->requireHtmlSearchItems($keywordRaw, $maxResults, $response->toException())
        );
    }

    $items = $this->parseAtomFeed($response->body(), $keywordRaw);
    if ($items === []) {
        return [];
    }

    return $this->finalizeItems(array_slice($items, 0, $maxResults));
}
|
|
|
|
|
|
/**
 * Query the export API through the request gate, with one extra attempt on connection failure.
 *
 * If the first gated call throws a ConnectionException, the method sleeps 3 seconds and
 * issues the same call once more; an exception from that second call propagates.
 *
 * @param array<string, mixed> $queryParams Query-string parameters sent to the API endpoint.
 */
protected function requestApiOnce(array $queryParams): Response
{
    $callApi = fn () => $this->gate->run(
        fn () => $this->sendRequest(self::API_URL, $queryParams)
    );

    try {
        return $callApi();
    } catch (ConnectionException) {
        // Brief pause before the single follow-up attempt below.
        sleep(3);
    }

    return $callApi();
}
|
|
|
|
|
|
/**
 * Run the HTML search fallback and insist on a non-empty result.
 *
 * @param string          $keyword    Search keyword forwarded to the HTML search.
 * @param int             $maxResults Upper bound on returned items.
 * @param \Throwable|null $previous   The failure that caused the fallback, if any; it selects the
 *                                    error hint and is chained as the previous exception.
 *
 * @return list<CrawlItemDto>
 *
 * @throws \RuntimeException When the HTML search produced no items.
 */
protected function requireHtmlSearchItems(string $keyword, int $maxResults, ?\Throwable $previous = null): array
{
    $found = $this->fetchViaHtmlSearch($keyword, $maxResults);

    if ($found === []) {
        // Only a RequestException carrying a 429 response gets the rate-limit wording.
        $rateLimited = $previous instanceof RequestException
            && $previous->response?->status() === 429;

        if ($rateLimited) {
            $hint = 'arXiv 访问过于频繁(HTTP 429),请等待 1~2 分钟后再试';
        } else {
            $hint = 'arXiv 搜索页抓取失败,请检查网络或稍后重试';
        }

        throw new \RuntimeException($hint, 0, $previous);
    }

    return $found;
}
|
|
|
|
|
|
/**
 * Convert an export-API Atom feed into crawl items.
 *
 * Entries whose <id> does not yield an arXiv identifier are skipped. The constructor of
 * SimpleXMLElement is called directly on $body, so an unparsable body raises an exception.
 *
 * @param string $body    Raw Atom XML returned by the API.
 * @param string $keyword Keyword recorded in each item's `extra` data.
 *
 * @return CrawlItemDto[]
 */
protected function parseAtomFeed(string $body, string $keyword): array
{
    $feed = new SimpleXMLElement($body);
    $feed->registerXPathNamespace('atom', 'http://www.w3.org/2005/Atom');

    $items = [];
    foreach ($feed->xpath('//atom:entry') ?: [] as $entry) {
        $entry->registerXPathNamespace('atom', 'http://www.w3.org/2005/Atom');

        $arxivId = $this->extractArxivId((string) ($entry->id ?? ''));
        if (! $arxivId) {
            continue;
        }

        $authorsParsed = $this->parseAtomAuthors($entry);
        $authorLine = implode('; ', array_column($authorsParsed, 'name'));

        // Keep only the leading YYYY-MM-DD portion of the published timestamp.
        $publishedAt = null;
        $published = (string) ($entry->published ?? '');
        if ($published) {
            $publishedAt = substr($published, 0, 10);
        }

        $lead = CrawlAuthorParser::leadAuthor($authorLine, $authorsParsed);

        $title = ArxivTextNormalizer::normalize(trim((string) ($entry->title ?? ''))) ?? '';
        $summary = ArxivTextNormalizer::normalize(trim((string) ($entry->summary ?? '')));

        $items[] = new CrawlItemDto(
            externalId: 'arxiv:'.$arxivId,
            title: $title,
            canonicalUrl: 'https://arxiv.org/abs/'.$arxivId,
            authors: $authorLine,
            summary: $summary,
            publishedAt: $publishedAt,
            schoolName: $lead['university_name'] ?? null,
            extra: [
                'platform' => 'arxiv',
                'arxiv_id' => $arxivId,
                'keyword' => $keyword,
                'source' => 'api',
                'authors_parsed' => $authorsParsed,
                'lead_author' => $lead,
            ],
            authorsParsed: $authorsParsed,
        );
    }

    return $items;
}

/**
 * Collect the named authors of one Atom entry, with the first arxiv:affiliation of each.
 *
 * Authors with an empty name are dropped.
 *
 * @return list<array{name: string, email: null, affiliation: ?string, university_name: mixed}>
 */
private function parseAtomAuthors(SimpleXMLElement $entry): array
{
    $parsed = [];
    foreach ($entry->author as $author) {
        $name = trim((string) ($author->name ?? ''));
        if ($name === '') {
            continue;
        }

        $author->registerXPathNamespace('arxiv', 'http://arxiv.org/schemas/atom');
        $affiliationNodes = $author->xpath('arxiv:affiliation') ?: [];
        $affiliation = trim((string) ($affiliationNodes[0] ?? ''));

        $parsed[] = [
            'name' => $name,
            'email' => null,
            'affiliation' => $affiliation === '' ? null : $affiliation,
            'university_name' => CrawlAuthorParser::universityFromAffiliation($affiliation),
        ];
    }

    return $parsed;
}
|
|
|
|
|
|
/**
 * Search-page fallback (used when the export API answers 429).
 *
 * Do not pass a `size` parameter: the request fails with HTTP 400 if it is present.
 * A non-2xx response is reported as an empty list rather than an exception.
 *
 * @return CrawlItemDto[]
 */
protected function fetchViaHtmlSearch(string $keyword, int $maxResults): array
{
    $query = [
        'query' => $keyword,
        'searchtype' => 'all',
    ];

    $response = $this->gate->run(fn () => $this->sendRequest(self::SEARCH_URL, $query));

    return $response->successful()
        ? $this->parseSearchHtml($response->body(), $keyword, $maxResults)
        : [];
}
|
|
|
|
|
|
/**
 * Parse an arXiv search-results page into crawl items.
 *
 * Each `<li class="arxiv-result">` block contributes one item, up to $maxResults; blocks
 * without an `arxiv.org/abs/...` link are skipped.
 *
 * Differences from a naive regex scrape:
 *  - The abstract is extracted with nesting-aware span matching, so a nested `<span>` inside
 *    the abstract no longer cuts the text short at its closing tag.
 *  - Author names are entity-decoded like the title and abstract, and the parsed author list
 *    is built directly from the anchor texts. It is not rebuilt by splitting a `;`-joined
 *    string, which broke names containing an entity such as `&#39;`.
 *
 * @param string $html       Full HTML of the search page.
 * @param string $keyword    Keyword recorded in each item's `extra` data.
 * @param int    $maxResults Maximum number of items to return.
 *
 * @return CrawlItemDto[]
 */
protected function parseSearchHtml(string $html, string $keyword, int $maxResults): array
{
    if (! preg_match_all('#<li class="arxiv-result">(.*?)</li>#s', $html, $blocks)) {
        return [];
    }

    $items = [];
    foreach ($blocks[1] as $block) {
        if (count($items) >= $maxResults) {
            break;
        }

        if (! preg_match('#arxiv\.org/abs/([^"?\s]+)#', $block, $idMatch)) {
            continue;
        }
        $arxivId = $idMatch[1];

        $title = '';
        if (preg_match('#<p class="title is-5 mathjax">\s*(.*?)\s*</p>#s', $block, $titleMatch)) {
            $title = ArxivTextNormalizer::normalize($this->htmlFragmentToText($titleMatch[1])) ?? '';
        }

        // One entry per author anchor; the search page carries no e-mail or affiliation data.
        $authorsParsed = [];
        if (preg_match('#<p class="authors">(.*?)</p>#s', $block, $authorMatch)
            && preg_match_all('#<a[^>]*>([^<]+)</a>#', $authorMatch[1], $authorLinks)
        ) {
            foreach ($authorLinks[1] as $rawName) {
                $name = trim(html_entity_decode($rawName, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
                if ($name === '') {
                    continue;
                }
                $authorsParsed[] = [
                    'name' => $name,
                    'email' => null,
                    'affiliation' => null,
                    'university_name' => null,
                ];
            }
        }
        $authors = implode('; ', array_column($authorsParsed, 'name'));

        // Prefer the full abstract; fall back to the shortened one when the full span is absent.
        $summary = '';
        $abstractHtml = $this->extractSpanInnerHtml($block, 'abstract-full')
            ?? $this->extractSpanInnerHtml($block, 'abstract-short');
        if ($abstractHtml !== null) {
            $summary = ArxivTextNormalizer::normalize($this->htmlFragmentToText($abstractHtml));
        }

        $publishedAt = ArxivMetadataParser::parsePublishedDate($block);
        $lead = CrawlAuthorParser::leadAuthor($authors, $authorsParsed);

        $items[] = new CrawlItemDto(
            externalId: 'arxiv:'.$arxivId,
            title: $title,
            canonicalUrl: 'https://arxiv.org/abs/'.$arxivId,
            authors: $authors,
            summary: $summary,
            publishedAt: $publishedAt,
            schoolName: $lead['university_name'] ?? null,
            extra: [
                'platform' => 'arxiv',
                'arxiv_id' => $arxivId,
                'keyword' => $keyword,
                'source' => 'html_search',
                'pdf_url' => ArxivMetadataParser::extractPdfUrl($block, $arxivId),
                'html_url' => ArxivMetadataParser::extractHtmlUrl($block, $arxivId),
                'authors_parsed' => $authorsParsed,
                'lead_author' => $lead,
            ],
            authorsParsed: $authorsParsed,
        );
    }

    return $items;
}

/**
 * Strip tags from an HTML fragment, decode its entities and trim the result.
 */
private function htmlFragmentToText(string $fragment): string
{
    return trim(html_entity_decode(strip_tags($fragment), ENT_QUOTES | ENT_HTML5, 'UTF-8'));
}

/**
 * Return the inner HTML of the first `<span>` whose class attribute starts with $classPrefix.
 *
 * Nested `<span>` elements are balanced by counting opening and closing tags, so the result
 * ends at the span's own closing tag. Returns null when no such span exists or it is never closed.
 */
private function extractSpanInnerHtml(string $html, string $classPrefix): ?string
{
    $openPattern = '#<span class="'.preg_quote($classPrefix, '#').'[^"]*"[^>]*>#';
    if (! preg_match($openPattern, $html, $open, PREG_OFFSET_CAPTURE)) {
        return null;
    }

    // Byte offset of the first character after the opening tag.
    $start = $open[0][1] + strlen($open[0][0]);

    if (! preg_match_all('#<span\b[^>]*>|</span>#', $html, $tags, PREG_OFFSET_CAPTURE, $start)) {
        return null;
    }

    $depth = 1;
    foreach ($tags[0] as [$tag, $offset]) {
        $depth += $tag === '</span>' ? -1 : 1;
        if ($depth === 0) {
            return substr($html, $start, $offset - $start);
        }
    }

    return null;
}
|
|
|
|
|
|
/**
 * Send a GET request to arXiv with this crawler's timeouts and identifying headers.
 *
 * The contact e-mail and both timeouts come from the `crawl.arxiv.*` config keys. The HTTP
 * client is configured with retry(2, 2000) limited to ConnectionException and `throw: false`.
 *
 * @param string               $url         Endpoint to call.
 * @param array<string, mixed> $queryParams Query-string parameters.
 */
protected function sendRequest(string $url, array $queryParams): Response
{
    $contactEmail = (string) config('crawl.arxiv.contact_email', 'support@example.com');
    $totalTimeout = (int) config('crawl.arxiv.http_timeout_seconds', 60);
    $connectTimeout = (int) config('crawl.arxiv.connect_timeout_seconds', 30);

    $headers = [
        'User-Agent' => 'SlakeSchool-Crawler/1.0 (https://arxiv.org; mailto:'.$contactEmail.')',
        'Accept' => 'application/atom+xml, text/html;q=0.9',
    ];

    // Only connection-level failures qualify for a retry.
    $retryWhen = fn ($exception) => $exception instanceof ConnectionException;

    $pending = Http::timeout($totalTimeout)
        ->connectTimeout($connectTimeout)
        ->retry(2, 2000, $retryWhen, throw: false)
        ->withHeaders($headers);

    return $pending->get($url, $queryParams);
}
|
|
|
|
|
|
/**
 * Extract the arXiv identifier from an abstract-page URL (the Atom entry's <id>).
 *
 * Both identifier schemes are recognised:
 *  - new style, e.g. `2101.00001v1`;
 *  - old style `archive/NNNNNNN[vN]`, e.g. `hep-th/9901001v1` or `math.GT/0309136v2`.
 *    This form contains a slash, so it needs its own alternative to avoid being
 *    truncated to the archive name.
 *
 * The arxiv.org form is matched case-insensitively first; a bare `/abs/` path is the fallback.
 *
 * @param string $idUrl URL expected to contain `/abs/<identifier>`.
 *
 * @return string|null The identifier, or null when the URL has no `/abs/` segment.
 */
protected function extractArxivId(string $idUrl): ?string
{
    // Old-style identifier first; otherwise everything up to whitespace, "/" or "?".
    $identifier = '((?:[A-Za-z-]+(?:\.[A-Za-z-]+)?/\d{7}(?:v\d+)?\b)|[^\s/?]+)';

    $patterns = [
        '~arxiv\.org/abs/'.$identifier.'~i',
        '~/abs/'.$identifier.'~',
    ];

    foreach ($patterns as $pattern) {
        if (preg_match($pattern, $idUrl, $m)) {
            return $m[1];
        }
    }

    return null;
}
|
|
|
|
|
|
/**
 * Apply the final enrichment step to a parsed result list.
 *
 * An empty list, or a call with $enrichAbs = false, is returned unchanged; otherwise the
 * list is handed to the enricher's enrichMany() and its result is returned.
 *
 * @param list<CrawlItemDto> $items
 * @param bool               $enrichAbs Whether to run the enricher at all.
 *
 * @return list<CrawlItemDto>
 */
protected function finalizeItems(array $items, bool $enrichAbs = true): array
{
    if ($enrichAbs && $items !== []) {
        return $this->absEnricher->enrichMany($items);
    }

    return $items;
}
|
|
|
}
|