finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults)); } $response = null; try { $response = $this->requestApiOnce([ 'search_query' => CrawlKeywordParser::buildArxivSearchQuery($keywordRaw), 'start' => 0, 'max_results' => $maxResults, 'sortBy' => 'submittedDate', 'sortOrder' => 'descending', ]); } catch (ConnectionException|RequestException $e) { return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults, $e)); } if ($response->successful()) { $items = $this->parseAtomFeed($response->body(), $keywordRaw); if ($items !== []) { return $this->finalizeItems(array_slice($items, 0, $maxResults)); } } if ($response->status() === 429 || ! $response->successful()) { return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults)); } return []; } /** * Sends a single GET to self::API_URL through $this->gate. If that attempt throws a ConnectionException, waits 3 seconds and sends the same request one more time; an exception from the second attempt is not caught here. * * @param array<string, mixed> $queryParams Query-string parameters passed unchanged to sendRequest(). */ protected function requestApiOnce(array $queryParams): Response { try { return $this->gate->run(fn () => $this->sendRequest(self::API_URL, $queryParams)); } catch (ConnectionException $e) { /* brief pause before the single retry */ sleep(3); return $this->gate->run(fn () => $this->sendRequest(self::API_URL, $queryParams)); } } /** * Runs the HTML search fallback and insists on a non-empty result. * * @param \Throwable|null $previous Failure that triggered the fallback, if any; it is chained onto the thrown exception and decides which hint is used (the HTTP 429 wording is chosen only when it is a RequestException whose response status is 429). * @return list<CrawlItemDto> Items returned by fetchViaHtmlSearch(). * @throws \RuntimeException When fetchViaHtmlSearch() returns an empty array. */ protected function requireHtmlSearchItems(string $keyword, int $maxResults, ?\Throwable $previous = null): array { $items = $this->fetchViaHtmlSearch($keyword, $maxResults); if ($items !== []) { return $items; } $hint = $previous instanceof RequestException && $previous->response?->status() === 429 ? 'arXiv 访问过于频繁(HTTP 429),请等待 1~2 分钟后再试' : 'arXiv 搜索页抓取失败,请检查网络或稍后重试'; throw new \RuntimeException($hint, 0, $previous); } /** * Converts an Atom feed body into CrawlItemDto objects, one per atom:entry. Entries whose id yields no arXiv id via extractArxivId() are skipped, authors with an empty name are left out, and publishedAt is the first 10 characters of the published element (null when that element is empty). * NOTE(review): new SimpleXMLElement() throws when the body cannot be parsed as XML and nothing here catches it - confirm callers are prepared for that. * * @return CrawlItemDto[] */ protected function parseAtomFeed(string $body, string $keyword): array { $xml = new SimpleXMLElement($body); $xml->registerXPathNamespace('atom', 'http://www.w3.org/2005/Atom'); $entries = $xml->xpath('//atom:entry') ?: []; $items = []; foreach ($entries as $entry) { $entry->registerXPathNamespace('atom', 'http://www.w3.org/2005/Atom'); $idUrl = (string) ($entry->id ?? 
''); $arxivId = $this->extractArxivId($idUrl); if (! $arxivId) { continue; } $authorsParsed = []; foreach ($entry->author as $author) { $author->registerXPathNamespace('arxiv', 'http://arxiv.org/schemas/atom'); $name = trim((string) ($author->name ?? '')); $affNodes = $author->xpath('arxiv:affiliation') ?: []; $affiliation = trim((string) ($affNodes[0] ?? '')); if ($name !== '') { $authorsParsed[] = [ 'name' => $name, 'email' => null, 'affiliation' => $affiliation !== '' ? $affiliation : null, 'university_name' => CrawlAuthorParser::universityFromAffiliation($affiliation), ]; } } $authorNames = array_column($authorsParsed, 'name'); $published = (string) ($entry->published ?? ''); $publishedAt = $published ? substr($published, 0, 10) : null; $lead = CrawlAuthorParser::leadAuthor(implode('; ', $authorNames), $authorsParsed); $items[] = new CrawlItemDto( externalId: 'arxiv:'.$arxivId, title: ArxivTextNormalizer::normalize(trim((string) ($entry->title ?? ''))) ?? '', canonicalUrl: 'https://arxiv.org/abs/'.$arxivId, authors: implode('; ', $authorNames), summary: ArxivTextNormalizer::normalize(trim((string) ($entry->summary ?? ''))), publishedAt: $publishedAt, schoolName: $lead['university_name'] ?? null, extra: [ 'platform' => 'arxiv', 'arxiv_id' => $arxivId, 'keyword' => $keyword, 'source' => 'api', 'authors_parsed' => $authorsParsed, 'lead_author' => $lead, ], authorsParsed: $authorsParsed, ); } return $items; } /** * Search-page fallback (used when the export API answers 429). Do not send a size parameter, otherwise the request fails with a 400. Returns an empty array when the response is not successful. * * @return CrawlItemDto[] */ protected function fetchViaHtmlSearch(string $keyword, int $maxResults): array { $response = $this->gate->run(fn () => $this->sendRequest(self::SEARCH_URL, [ 'query' => $keyword, 'searchtype' => 'all', ])); if (! $response->successful()) { return []; } return $this->parseSearchHtml($response->body(), $keyword, $maxResults); } /** * Pulls at most $maxResults items out of search-result HTML with regular expressions. Blocks without an arxiv.org/abs/ link are skipped; title, authors and summary default to an empty string when their pattern does not match. Author names are joined with "; " and each per-author entry carries only a name (email, affiliation and university_name are null). * * @return CrawlItemDto[] */ protected function parseSearchHtml(string $html, string $keyword, int $maxResults): array { if (! preg_match_all('#
  • (.*?)
  • #s', $html, $blocks)) { return []; } $items = []; foreach ($blocks[1] as $block) { if (count($items) >= $maxResults) { break; } if (! preg_match('#arxiv\.org/abs/([^"?\s]+)#', $block, $idMatch)) { continue; } $arxivId = $idMatch[1]; $title = ''; if (preg_match('#

    \s*(.*?)\s*

    #s', $block, $titleMatch)) { $title = ArxivTextNormalizer::normalize(trim(html_entity_decode(strip_tags($titleMatch[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8'))) ?? ''; } $authors = ''; if (preg_match('#

    (.*?)

    #s', $block, $authorMatch)) { if (preg_match_all('#]*>([^<]+)#', $authorMatch[1], $authorNames)) { $authors = implode('; ', array_map('trim', $authorNames[1])); } } $summary = ''; if (preg_match('#]*>(.*?)#s', $block, $abstractMatch)) { $summary = ArxivTextNormalizer::normalize(trim(html_entity_decode(strip_tags($abstractMatch[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8'))); } elseif (preg_match('#]*>(.*?)#s', $block, $abstractShort)) { $summary = ArxivTextNormalizer::normalize(trim(html_entity_decode(strip_tags($abstractShort[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8'))); } $publishedAt = ArxivMetadataParser::parsePublishedDate($block); $authorsParsed = []; if ($authors !== '') { foreach (preg_split('/\s*;\s*/', $authors) ?: [] as $name) { $name = trim($name); if ($name !== '') { $authorsParsed[] = [ 'name' => $name, 'email' => null, 'affiliation' => null, 'university_name' => null, ]; } } } $lead = CrawlAuthorParser::leadAuthor($authors, $authorsParsed); $items[] = new CrawlItemDto( externalId: 'arxiv:'.$arxivId, title: $title, canonicalUrl: 'https://arxiv.org/abs/'.$arxivId, authors: $authors, summary: $summary, publishedAt: $publishedAt, schoolName: $lead['university_name'] ?? 
null, extra: [ 'platform' => 'arxiv', 'arxiv_id' => $arxivId, 'keyword' => $keyword, 'source' => 'html_search', 'pdf_url' => ArxivMetadataParser::extractPdfUrl($block, $arxivId), 'html_url' => ArxivMetadataParser::extractHtmlUrl($block, $arxivId), 'authors_parsed' => $authorsParsed, 'lead_author' => $lead, ], authorsParsed: $authorsParsed, ); } return $items; } /** * Performs the GET request. The contact e-mail and both timeouts are read from the crawl.arxiv.* config keys (defaults: support@example.com, 60 for http_timeout_seconds, 30 for connect_timeout_seconds). The retry callback only permits a retry when the exception is a ConnectionException, and throw: false is passed to retry(). * * @param array<string, mixed> $queryParams Query-string parameters for the request. */ protected function sendRequest(string $url, array $queryParams): Response { $email = (string) config('crawl.arxiv.contact_email', 'support@example.com'); $timeout = (int) config('crawl.arxiv.http_timeout_seconds', 60); $connectTimeout = (int) config('crawl.arxiv.connect_timeout_seconds', 30); return Http::timeout($timeout) ->connectTimeout($connectTimeout) ->retry(2, 2000, fn ($exception) => $exception instanceof ConnectionException, throw: false) ->withHeaders([ 'User-Agent' => 'SlakeSchool-Crawler/1.0 (https://arxiv.org; mailto:'.$email.')', 'Accept' => 'application/atom+xml, text/html;q=0.9', ]) ->get($url, $queryParams); } /** * Extracts the identifier that follows "arxiv.org/abs/" (matched case-insensitively) or, failing that, any "/abs/" in the given URL; the match stops at the first whitespace, "/" or "?". Returns null when neither pattern matches. * NOTE(review): because the match stops at "/", an old-style id such as hep-th/9901001 would be cut to "hep-th" - confirm whether such ids can reach this method. */ protected function extractArxivId(string $idUrl): ?string { if (preg_match('~arxiv\.org/abs/([^\s/?]+)~i', $idUrl, $m)) { return $m[1]; } if (preg_match('~/abs/([^\s/?]+)~', $idUrl, $m)) { return $m[1]; } return null; } /** * Last step before results are returned: an empty list is returned as is; otherwise the items go through $this->absEnricher->enrichMany() unless $enrichAbs is false. * * @param list<CrawlItemDto> $items * @return list The input list, or whatever enrichMany() returns for it. */ protected function finalizeItems(array $items, bool $enrichAbs = true): array { if ($items === []) { return $items; } return $enrichAbs ? $this->absEnricher->enrichMany($items) : $items; } }