$items * @return list */
    public function enrichMany(array $items): array
    {
        // Enrichment switched off globally: only guarantee lead-author metadata.
        if (! config('crawl.arxiv.abs_enrich_enabled', true)) {
            return array_map(fn (CrawlItemDto $d) => $this->ensureLeadAuthor($d), $items);
        }

        $max = max(0, (int) config('crawl.arxiv.abs_enrich_max', 8));
        $enriched = 0;
        $out = [];

        foreach ($items as $dto) {
            // Only items enrichOne() would really attempt to fetch may consume
            // the per-batch budget; items lacking an arXiv id or canonical URL
            // must not use up a slot.
            if ($enriched >= $max || ! $this->isEnrichable($dto)) {
                $out[] = $this->ensureLeadAuthor($dto);

                continue;
            }

            $out[] = $this->enrichOne($dto);
            $enriched++;
        }

        return $out;
    }

    /**
     * Enrich a single item with metadata parsed from its arXiv pages.
     *
     * Items that are not enrichable (see isEnrichable()) are returned with only
     * their lead-author metadata ensured. Otherwise the HTML version is tried
     * first (when preferred and enabled), then the abstract page. A published
     * date or a non-empty author list found on the page replaces the DTO's own
     * values; `pdf_url` / `html_url` are added to `extra` when not already set.
     */
    public function enrichOne(CrawlItemDto $dto): CrawlItemDto
    {
        if (! $this->isEnrichable($dto)) {
            return $this->ensureLeadAuthor($dto);
        }

        $arxivId = (string) $dto->extra['arxiv_id'];
        $publishedAt = $dto->publishedAt;
        $authorsParsed = $dto->authorsParsed;
        $enrichedFrom = null;
        $pageHtml = '';
        $preferHtml = $this->shouldPreferHtmlEnrich($dto);

        if ($preferHtml && (bool) config('crawl.arxiv.try_html_version', true)) {
            $pageHtml = $this->fetchHtmlVersion($arxivId);
            if ($pageHtml !== '') {
                $enrichedFrom = 'arxiv_html';
            }
        }

        // Fall back to the abstract page when the HTML version was skipped or unavailable.
        if ($pageHtml === '') {
            $pageHtml = $this->fetchAbsHtml($arxivId);
            if ($pageHtml !== '') {
                $enrichedFrom = 'abs_html';
                // The markup is now an abstract page, so the abstract-page parser must be used.
                $preferHtml = false;
            }
        }

        if ($pageHtml !== '') {
            $publishedAt = ArxivMetadataParser::parsePublishedDate($pageHtml) ?? $publishedAt;

            $parsed = $preferHtml
                ? ArxivMetadataParser::parseAuthorsFromHtmlVersion($pageHtml)
                : ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml);

            // Second attempt with the abstract-page parser on the same markup.
            if ($parsed === [] && $preferHtml) {
                $parsed = ArxivMetadataParser::parseAuthorsFromAbsHtml($pageHtml);
            }

            // Keep the authors already on the DTO when nothing could be parsed.
            if ($parsed !== []) {
                $authorsParsed = $parsed;
            }
        }

        $lead = CrawlAuthorParser::leadAuthor($dto->authors, $authorsParsed);
        $schoolName = $lead['university_name'] ?? $dto->schoolName;

        $extra = array_merge($dto->extra, [
            'authors_parsed' => $authorsParsed,
            'lead_author' => $lead,
        ]);

        if ($enrichedFrom !== null) {
            $extra['enriched_from'] = $enrichedFrom;
        }

        if (! isset($extra['pdf_url'])) {
            $extra['pdf_url'] = ArxivMetadataParser::extractPdfUrl($pageHtml, $arxivId);
        }

        if (! isset($extra['html_url'])) {
            $extra['html_url'] = ArxivMetadataParser::extractHtmlUrl($pageHtml, $arxivId);
        }

        return new CrawlItemDto(
            externalId: $dto->externalId,
            title: $dto->title,
            canonicalUrl: $dto->canonicalUrl,
            authors: $dto->authors,
            summary: $dto->summary,
            publishedAt: $publishedAt,
            schoolName: $schoolName,
            section: $dto->section,
            contentHtml: $dto->contentHtml,
            extra: $extra,
            authorsParsed: $authorsParsed,
        );
    }

    /**
     * Whether enrichOne() would attempt a page fetch for this item: it needs a
     * truthy arXiv id, a truthy canonical URL, and shouldEnrich() must agree.
     */
    private function isEnrichable(CrawlItemDto $dto): bool
    {
        $arxivId = $dto->extra['arxiv_id'] ?? null;

        return $arxivId && $dto->canonicalUrl && $this->shouldEnrich($dto);
    }

    /**
     * Decide from `crawl.arxiv.abs_enrich_mode` whether the item needs enrichment.
     *
     * `never` always returns false and `always` always returns true. Any other
     * mode (default `auto`) enriches unless the item already has both a
     * published date and a school, university, or affiliation.
     */
    protected function shouldEnrich(CrawlItemDto $dto): bool
    {
        $mode = (string) config('crawl.arxiv.abs_enrich_mode', 'auto');

        if ($mode === 'never') {
            return false;
        }

        if ($mode === 'always') {
            return true;
        }

        $lead = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);

        $hasDate = ($dto->publishedAt ?? '') !== '';
        $hasSchool = ($dto->schoolName ?? null) !== null
            || ($lead['university_name'] ?? null) !== null
            || ($lead['affiliation'] ?? null) !== null;

        if ($hasDate && $hasSchool) {
            return false;
        }

        return true;
    }

    /**
     * Whether the HTML version should be tried before the abstract page.
     *
     * True when `crawl.arxiv.enrich_prefer_html` is enabled; otherwise only for
     * items whose `extra.source` is `html_search` or that carry a non-empty
     * `extra.html_url`.
     */
    protected function shouldPreferHtmlEnrich(CrawlItemDto $dto): bool
    {
        if ((bool) config('crawl.arxiv.enrich_prefer_html', true)) {
            return true;
        }

        return ($dto->extra['source'] ?? '') === 'html_search'
            || ! empty($dto->extra['html_url']);
    }

    /**
     * Guarantee that `extra.lead_author` and `extra.authors_parsed` are present.
     *
     * The DTO is returned untouched when `extra.lead_author` is already
     * non-empty. Otherwise a copy is built in which the lead author is derived
     * from the DTO's authors and the school name falls back to the lead
     * author's `university_name`.
     */
    protected function ensureLeadAuthor(CrawlItemDto $dto): CrawlItemDto
    {
        if (! empty($dto->extra['lead_author'])) {
            return $dto;
        }

        $lead = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);

        $extra = array_merge($dto->extra, [
            'lead_author' => $lead,
            // Prefer the DTO's parsed authors; otherwise keep whatever extra already holds.
            'authors_parsed' => $dto->authorsParsed !== []
                ? $dto->authorsParsed
                : ($dto->extra['authors_parsed'] ?? []),
        ]);

        return new CrawlItemDto(
            externalId: $dto->externalId,
            title: $dto->title,
            canonicalUrl: $dto->canonicalUrl,
            authors: $dto->authors,
            summary: $dto->summary,
            publishedAt: $dto->publishedAt,
            schoolName: $dto->schoolName ?? $lead['university_name'] ?? null,
            section: $dto->section,
            contentHtml: $dto->contentHtml,
            extra: $extra,
            authorsParsed: $dto->authorsParsed,
        );
    }

    /**
     * Fetch (through the page cache) the abstract page for the given arXiv id.
     *
     * A response is accepted only when it contains the `abs-outer` marker.
     *
     * @return string the page markup, or '' when no candidate URL produced a usable page
     */
    protected function fetchAbsHtml(string $arxivId): string
    {
        return $this->fetchCachedPage('abs', $arxivId, function () use ($arxivId) {
            foreach ($this->versionIdCandidates($arxivId) as $id) {
                $html = $this->fetchPage('https://arxiv.org/abs/'.$id);
                if ($html !== '' && str_contains($html, 'abs-outer')) {
                    return $html;
                }
            }

            return '';
        });
    }

    /**
     * Fetch (through the page cache) the HTML rendering for the given arXiv id.
     *
     * A response is accepted only when it contains the `ltx_document` or
     * `ltx_authors` marker.
     *
     * @return string the page markup, or '' when no candidate URL produced a usable page
     */
    protected function fetchHtmlVersion(string $arxivId): string
    {
        return $this->fetchCachedPage('html', $arxivId, function () use ($arxivId) {
            foreach ($this->versionIdCandidates($arxivId) as $id) {
                $html = $this->fetchPage('https://arxiv.org/html/'.$id);
                if ($html !== '' && (str_contains($html, 'ltx_document') || str_contains($html, 'ltx_authors'))) {
                    return $html;
                }
            }

            return '';
        });
    }

    /**
     * Ids to try when building page URLs: an id that already ends in a version
     * suffix (`v<digits>`) is used as-is, any other id is pinned to `v1`.
     *
     * @return list<string>
     */
    protected function versionIdCandidates(string $arxivId): array
    {
        if (preg_match('/v\d+$/i', $arxivId)) {
            return [$arxivId];
        }

        return [$arxivId.'v1'];
    }

    /**
     * Return the cached page for ($kind, $arxivId), invoking $fetcher on a cache miss.
     *
     * Successful fetches are kept for `crawl.arxiv.page_cache_seconds` (at
     * least 60 s). An empty result is what the fetchers return for every
     * failure, including HTTP 429 and network errors, so it is cached only for
     * the shorter `crawl.arxiv.page_cache_miss_seconds` (default 900 s, never
     * longer than the page TTL). A transient failure therefore no longer blocks
     * enrichment of that paper for the whole page TTL.
     *
     * @param  callable(): string  $fetcher
     */
    protected function fetchCachedPage(string $kind, string $arxivId, callable $fetcher): string
    {
        $ttl = max(60, (int) config('crawl.arxiv.page_cache_seconds', 86400));
        $key = 'arxiv_'.$kind.':'.preg_replace('/[^a-zA-Z0-9._-]/', '_', $arxivId);

        // A cached '' is a deliberate (short-lived) negative entry and is honoured as a hit.
        $cached = Cache::get($key);
        if ($cached !== null) {
            return (string) $cached;
        }

        $html = (string) $fetcher();

        $missTtl = min($ttl, max(60, (int) config('crawl.arxiv.page_cache_miss_seconds', 900)));
        Cache::put($key, $html, $html !== '' ? $ttl : $missTtl);

        return $html;
    }

    /**
     * GET a URL through the request gate and return the response body.
     *
     * Best-effort by design: HTTP 429, any non-successful status, and any
     * thrown error all yield ''.
     */
    protected function fetchPage(string $url): string
    {
        try {
            $timeout = (int) config('crawl.arxiv.enrich_http_timeout_seconds', 25);
            $connectTimeout = (int) config('crawl.arxiv.connect_timeout_seconds', 15);

            $response = $this->gate->run(fn () => Http::timeout($timeout)
                ->connectTimeout($connectTimeout)
                ->withHeaders([
                    'User-Agent' => 'SlakeSchool-Crawler/1.0 (https://arxiv.org)',
                    'Accept' => 'text/html',
                ])
                ->get($url));

            if ($response->status() === 429) {
                return '';
            }

            return $response->successful() ? (string) $response->body() : '';
        } catch (\Throwable) {
            // Enrichment is optional; a failed fetch must never abort the crawl.
            return '';
        }
    }
}