|
|
|
|
@ -3,6 +3,7 @@
|
|
|
|
|
namespace App\Services\Crawl\Adapters;
|
|
|
|
|
|
|
|
|
|
use App\Models\CrawlSource;
|
|
|
|
|
use App\Models\Paper;
|
|
|
|
|
use App\Services\Crawl\ArxivAbsEnricher;
|
|
|
|
|
use App\Services\Crawl\ArxivMetadataParser;
|
|
|
|
|
use App\Services\Crawl\ArxivRequestGate;
|
|
|
|
|
@ -41,6 +42,9 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
|
|
|
|
|
$maxResults = min(200, max(1, (int) ($params['max_results'] ?? 50)));
|
|
|
|
|
$maxPages = min(20, max(1, (int) ($params['max_pages'] ?? 1)));
|
|
|
|
|
$pageSize = 50;
|
|
|
|
|
$skipImported = ($params['skip_imported'] ?? true) !== false;
|
|
|
|
|
$importedIds = $skipImported ? $this->loadImportedExternalIds() : [];
|
|
|
|
|
$maxScanPages = $this->resolveMaxScanPages($maxPages, $maxResults, $skipImported);
|
|
|
|
|
|
|
|
|
|
if ((bool) config('crawl.arxiv.prefer_html_search', false)) {
|
|
|
|
|
return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults));
|
|
|
|
|
@ -49,10 +53,9 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
|
|
|
|
|
$items = [];
|
|
|
|
|
$seen = [];
|
|
|
|
|
|
|
|
|
|
for ($page = 0; $page < $maxPages && count($items) < $maxResults; $page++) {
|
|
|
|
|
for ($page = 0; $page < $maxScanPages && count($items) < $maxResults; $page++) {
|
|
|
|
|
$start = $page * $pageSize;
|
|
|
|
|
$batchSize = min($pageSize, $maxResults - count($items));
|
|
|
|
|
$batch = $this->fetchApiPage($keywordRaw, $start, $batchSize);
|
|
|
|
|
$batch = $this->fetchApiPage($keywordRaw, $start, $pageSize);
|
|
|
|
|
if ($batch === []) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
@ -62,10 +65,18 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
$seen[$item->externalId] = true;
|
|
|
|
|
|
|
|
|
|
if ($skipImported && isset($importedIds[$item->externalId])) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$items[] = $item;
|
|
|
|
|
if (count($items) >= $maxResults) {
|
|
|
|
|
break 2;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (count($batch) < $batchSize) {
|
|
|
|
|
if (count($batch) < $pageSize) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
@ -74,6 +85,10 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
|
|
|
|
|
return $this->finalizeItems($items);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ($keywordRaw === '') {
|
|
|
|
|
throw new \RuntimeException('arXiv API 未返回结果,请稍后重试');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, min($maxResults, $pageSize)));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@ -100,7 +115,12 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
|
|
|
|
|
return [];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $this->parseAtomFeed($response->body(), $keywordRaw);
|
|
|
|
|
$body = $response->body();
|
|
|
|
|
if (str_contains($body, 'arxiv.org/api/errors') || str_contains($body, '<title>Error</title>')) {
|
|
|
|
|
return [];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $this->parseAtomFeed($body, $keywordRaw);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
@ -340,4 +360,30 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
|
|
|
|
|
|
|
|
|
|
return $enrichAbs ? $this->absEnricher->enrichMany($items) : $items;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Build a lookup set of external IDs belonging to papers stored with source "crawl".
 *
 * Rows whose external_id is null are filtered out by the query, and each remaining
 * ID becomes an array key so callers can test membership with isset().
 *
 * @return array<string, true> external ID => true
 */
protected function loadImportedExternalIds(): array
{
    // Restrict the query to crawled papers that actually carry an external ID.
    $crawledPapers = Paper::query()
        ->where('source', 'crawl')
        ->whereNotNull('external_id');

    // Fetch just the external_id column as a plain PHP array.
    $externalIds = $crawledPapers->pluck('external_id')->all();

    // Turn the list into a key set: every ID maps to true.
    return array_fill_keys($externalIds, true);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Decide how many result pages may be scanned for one crawl run.
 *
 * The requested page count is first clamped to the range 1..20. When imported
 * papers are not being skipped, that clamped value is returned unchanged. When
 * they are skipped, the budget is widened to ten times the number of 50-item
 * pages needed to cover $maxResults (never below the clamped request), and
 * capped at 200 pages.
 *
 * @param int  $maxPages     requested page count; clamped to 1..20 here
 * @param int  $maxResults   number of items the caller wants to collect
 * @param bool $skipImported whether already-imported papers are filtered out
 *
 * @return int number of pages the caller may scan
 */
protected function resolveMaxScanPages(int $maxPages, int $maxResults, bool $skipImported): int
{
    $requestedPages = min(20, max(1, $maxPages));

    if ($skipImported) {
        // Pages needed to reach the target if nothing were filtered out (50 items per page).
        $pagesForTarget = (int) ceil($maxResults / 50);

        // Allow a tenfold scan budget, but never fewer pages than requested.
        $scanBudget = max($requestedPages, $pagesForTarget * 10);

        return min(200, $scanBudget);
    }

    return $requestedPages;
}
|
|
|
|
|
}
|
|
|
|
|
|