diff --git a/app/Http/Controllers/Admin/CrawlJobController.php b/app/Http/Controllers/Admin/CrawlJobController.php index 0ee987c..e1a7f12 100644 --- a/app/Http/Controllers/Admin/CrawlJobController.php +++ b/app/Http/Controllers/Admin/CrawlJobController.php @@ -314,6 +314,12 @@ class CrawlJobController extends Controller ->where('status', 'imported') ->count(); + $duplicateTeacherCount = CrawlJobItem::query() + ->where('crawl_job_id', $job->id) + ->where('target_type', 'teacher') + ->where('status', 'duplicate') + ->count(); + return [ 'id' => $job->id, 'target_type' => $job->target_type, @@ -333,6 +339,7 @@ class CrawlJobController extends Controller 'preview_teacher_count' => $previewTeacherCount, 'papers_imported' => $importedPaperCount, 'teacher_leads_imported' => $importedTeacherLeadCount, + 'teacher_duplicates_skipped' => $duplicateTeacherCount, 'result_summary' => $job->result_summary, 'completed_at' => $job->completed_at?->toIso8601String(), ]; @@ -425,12 +432,21 @@ class CrawlJobController extends Controller } if ($job->target_type === 'teacher') { + $imported = (int) ($importResult['teachers_imported'] ?? 0); + $duplicateCount = $this->countTeacherDuplicateItems($job); $summary = sprintf( '已从 %s 抓取 %d 位老师,已入库 %d 位老师', $sourceName, $fetched, - (int) ($importResult['teachers_imported'] ?? 0), + $imported, ); + if ($duplicateCount > 0) { + $summary .= sprintf(',跳过 %d 位(老师库中已存在)', $duplicateCount); + } + $failedCount = (int) ($importResult['failed'] ?? 0); + if ($failedCount > 0) { + $summary .= sprintf(',失败 %d 位', $failedCount); + } $skippedProfiles = $this->countProfileEnrichSkipped($job); if ($skippedProfiles > 0) { $summary .= sprintf('(%d 位未访问主页补邮箱,避免超时)', $skippedProfiles); @@ -461,6 +477,15 @@ class CrawlJobController extends Controller ->count(); } + protected function countTeacherDuplicateItems(CrawlJob $job): int + { + return (int) CrawlJobItem::query() + ->where('crawl_job_id', $job->id) + ->where('target_type', 'teacher') + ->where('status', 'duplicate') + ->count(); + } + /** * @param array{ * imported:int, @@ -484,6 +509,10 @@ class CrawlJobController extends Controller if ($job->target_type === 'teacher') { $teachers = (int) ($importResult['teachers_imported'] ?? 0); + $duplicateCount = $this->countTeacherDuplicateItems($job); + if ($duplicateCount > 0) { + return "抓取完成,已入库 {$teachers} 位老师,跳过 {$duplicateCount} 位"; + } return "抓取完成,已入库 {$teachers} 位老师"; } diff --git a/app/Services/Crawl/Adapters/ArxivApiAdapter.php b/app/Services/Crawl/Adapters/ArxivApiAdapter.php index a8081ba..87fe2f1 100644 --- a/app/Services/Crawl/Adapters/ArxivApiAdapter.php +++ b/app/Services/Crawl/Adapters/ArxivApiAdapter.php @@ -5,6 +5,7 @@ namespace App\Services\Crawl\Adapters; use App\Models\CrawlSource; use App\Models\Paper; use App\Services\Crawl\ArxivAbsEnricher; +use App\Services\Crawl\ArxivListUrlParser; use App\Services\Crawl\ArxivMetadataParser; use App\Services\Crawl\ArxivRequestGate; use App\Services\Crawl\ArxivTextNormalizer; @@ -45,6 +46,9 @@ class ArxivApiAdapter implements CrawlerAdapterInterface $skipImported = ($params['skip_imported'] ?? true) !== false; $importedIds = $skipImported ? $this->loadImportedExternalIds() : []; $maxScanPages = $this->resolveMaxScanPages($maxPages, $maxResults, $skipImported); + $isListUrl = ArxivListUrlParser::isListUrl($requestUrl); + $searchQuery = $this->resolveSearchQuery($requestUrl, $keywordRaw); + $sort = ArxivListUrlParser::sortParams($requestUrl); if ((bool) config('crawl.arxiv.prefer_html_search', false)) { return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults)); @@ -55,7 +59,7 @@ class ArxivApiAdapter implements CrawlerAdapterInterface for ($page = 0; $page < $maxScanPages && count($items) < $maxResults; $page++) { $start = $page * $pageSize; - $batch = $this->fetchApiPage($keywordRaw, $start, $pageSize); + $batch = $this->fetchApiPage($keywordRaw, $start, $pageSize, $searchQuery, $sort); if ($batch === []) { break; } @@ -85,27 +89,58 @@ class ArxivApiAdapter implements CrawlerAdapterInterface return $this->finalizeItems($items); } - if ($keywordRaw === '') { - throw new \RuntimeException('arXiv API 未返回结果,请稍后重试'); + if ($isListUrl) { + $listItems = $this->fetchViaListHtml( + $requestUrl, + $keywordRaw, + $maxResults, + $maxScanPages, + $skipImported, + $importedIds, + ); + if ($listItems !== []) { + return $this->finalizeItems($listItems); + } + } + + if ($keywordRaw !== '') { + return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, min($maxResults, $pageSize))); } - return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, min($maxResults, $pageSize))); + throw new \RuntimeException( + $isListUrl + ? 'arXiv 分类列表未返回结果,请稍后重试或检查 URL' + : 'arXiv API 未返回结果,请稍后重试' + ); + } + + protected function resolveSearchQuery(string $requestUrl, string $keywordRaw): string + { + $categoryQuery = ArxivListUrlParser::buildCategorySearchQuery($requestUrl); + + return CrawlKeywordParser::combineArxivSearchQueries($categoryQuery, $keywordRaw); } /** + * @param array{sortBy: string, sortOrder: string} $sort * @return list */ - protected function fetchApiPage(string $keywordRaw, int $start, int $maxResults): array - { + protected function fetchApiPage( + string $keywordRaw, + int $start, + int $maxResults, + ?string $searchQuery = null, + array $sort = ['sortBy' => 'submittedDate', 'sortOrder' => 'descending'], + ): array { $maxResults = min(50, max(1, $maxResults)); try { $response = $this->requestApiOnce([ - 'search_query' => CrawlKeywordParser::buildArxivSearchQuery($keywordRaw), + 'search_query' => $searchQuery ?? CrawlKeywordParser::buildArxivSearchQuery($keywordRaw), 'start' => $start, 'max_results' => $maxResults, - 'sortBy' => 'submittedDate', - 'sortOrder' => 'descending', + 'sortBy' => $sort['sortBy'] ?? 'submittedDate', + 'sortOrder' => $sort['sortOrder'] ?? 'descending', ]); } catch (ConnectionException|RequestException) { return []; @@ -116,7 +151,7 @@ class ArxivApiAdapter implements CrawlerAdapterInterface } $body = $response->body(); - if (str_contains($body, 'arxiv.org/api/errors') || str_contains($body, 'Error')) { + if (str_contains($body, 'arxiv.org/api/errors') || str_contains($body, 'Error') || str_contains($body, 'Rate exceeded')) { return []; } @@ -316,6 +351,161 @@ class ArxivApiAdapter implements CrawlerAdapterInterface return $items; } + /** + * @param array $importedIds + * @return list + */ + protected function fetchViaListHtml( + string $requestUrl, + string $keywordRaw, + int $maxResults, + int $maxScanPages, + bool $skipImported, + array $importedIds, + ): array { + $keywords = CrawlKeywordParser::parse($keywordRaw); + $pageSize = min(200, max(50, $maxResults)); + $items = []; + $seen = []; + + for ($page = 0; $page < $maxScanPages && count($items) < $maxResults; $page++) { + $pageUrl = ArxivListUrlParser::buildListPageUrl($requestUrl, $page * $pageSize, $pageSize); + try { + $response = $this->gate->run(fn () => $this->sendRequest($pageUrl, [])); + } catch (ConnectionException|RequestException) { + break; + } + + if (! $response->successful()) { + break; + } + + $batch = $this->parseListHtml($response->body(), $keywordRaw, $keywords, $maxResults - count($items)); + if ($batch === []) { + break; + } + + foreach ($batch as $item) { + if (isset($seen[$item->externalId])) { + continue; + } + $seen[$item->externalId] = true; + + if ($skipImported && isset($importedIds[$item->externalId])) { + continue; + } + + $items[] = $item; + if (count($items) >= $maxResults) { + break 2; + } + } + + if (count($batch) < $pageSize) { + break; + } + } + + return $items; + } + + /** + * @param list $keywords + * @return list + */ + protected function parseListHtml(string $html, string $keyword, array $keywords, int $maxResults): array + { + if (! preg_match_all('#
(.*?)
\s*
(.*?)
#s', $html, $blocks, PREG_SET_ORDER)) { + return []; + } + + $items = []; + foreach ($blocks as $block) { + if (count($items) >= $maxResults) { + break; + } + + $head = $block[1]; + $body = $block[2]; + if (! preg_match('#/abs/([^"\s?]+)#', $head, $idMatch)) { + continue; + } + $arxivId = trim($idMatch[1], '/'); + if ($arxivId === '') { + continue; + } + + $title = ''; + if (preg_match('#
]*>.*?Title:\s*(.*?)\s*
#s', $body, $titleMatch)) { + $title = ArxivTextNormalizer::normalize(trim(html_entity_decode(strip_tags($titleMatch[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8'))) ?? ''; + } + if ($title === '') { + continue; + } + + if (! CrawlKeywordParser::matchesAny($title, null, $keywords)) { + $summaryProbe = ''; + if (preg_match('#

(.*?)

#s', $body, $abstractMatch)) { + $summaryProbe = trim(html_entity_decode(strip_tags($abstractMatch[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8')); + } + if (! CrawlKeywordParser::matchesAny($title, $summaryProbe, $keywords)) { + continue; + } + } + + $authors = ''; + if (preg_match('#
(.*?)
#s', $body, $authorMatch)) { + if (preg_match_all('#>([^<]+)#', $authorMatch[1], $authorNames)) { + $authors = implode('; ', array_map('trim', $authorNames[1])); + } + } + + $summary = null; + if (preg_match('#

(.*?)

#s', $body, $abstractMatch)) { + $summary = ArxivTextNormalizer::normalize(trim(html_entity_decode(strip_tags($abstractMatch[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8'))); + } + + $authorsParsed = []; + if ($authors !== '') { + foreach (preg_split('/\s*;\s*/', $authors) ?: [] as $name) { + $name = trim($name); + if ($name !== '') { + $authorsParsed[] = [ + 'name' => $name, + 'email' => null, + 'affiliation' => null, + 'university_name' => null, + ]; + } + } + } + $lead = CrawlAuthorParser::leadAuthor($authors, $authorsParsed); + + $items[] = new CrawlItemDto( + externalId: 'arxiv:'.$arxivId, + title: $title, + canonicalUrl: 'https://arxiv.org/abs/'.$arxivId, + authors: $authors, + summary: $summary, + publishedAt: ArxivMetadataParser::parsePublishedDate($body), + schoolName: $lead['university_name'] ?? null, + extra: [ + 'platform' => 'arxiv', + 'arxiv_id' => $arxivId, + 'keyword' => $keyword, + 'source' => 'list_html', + 'pdf_url' => ArxivMetadataParser::extractPdfUrl($body, $arxivId), + 'html_url' => ArxivMetadataParser::extractHtmlUrl($body, $arxivId), + 'authors_parsed' => $authorsParsed, + 'lead_author' => $lead, + ], + authorsParsed: $authorsParsed, + ); + } + + return $items; + } + /** * @param array $queryParams */ diff --git a/app/Services/Crawl/ArxivListUrlParser.php b/app/Services/Crawl/ArxivListUrlParser.php new file mode 100644 index 0000000..959c99d --- /dev/null +++ b/app/Services/Crawl/ArxivListUrlParser.php @@ -0,0 +1,90 @@ + $category, + 'slice' => $slice, + ]; + } + + public static function isListUrl(?string $url): bool + { + return self::parse($url) !== null; + } + + public static function buildCategorySearchQuery(?string $url): ?string + { + $parsed = self::parse($url); + if ($parsed === null) { + return null; + } + + $category = $parsed['category']; + if (str_contains($category, '.')) { + return 'cat:'.$category; + } + + return 'cat:'.$category.'*'; + } + + /** + * @return array{sortBy: string, sortOrder: string} + */ + public static function sortParams(?string $url): array + { + $parsed = self::parse($url); + $slice = strtolower($parsed['slice'] ?? 'new'); + + if ($slice === 'recent') { + return [ + 'sortBy' => 'lastUpdatedDate', + 'sortOrder' => 'descending', + ]; + } + + return [ + 'sortBy' => 'submittedDate', + 'sortOrder' => 'descending', + ]; + } + + public static function buildListPageUrl(string $requestUrl, int $skip, int $show): string + { + $parts = parse_url($requestUrl); + $scheme = $parts['scheme'] ?? 'https'; + $host = $parts['host'] ?? 'arxiv.org'; + $path = $parts['path'] ?? '/'; + $query = []; + if (! empty($parts['query'])) { + parse_str((string) $parts['query'], $query); + } + $query['skip'] = max(0, $skip); + $query['show'] = min(2000, max(1, $show)); + + return $scheme.'://'.$host.$path.'?'.http_build_query($query); + } + +} diff --git a/app/Services/Crawl/CrawlImportService.php b/app/Services/Crawl/CrawlImportService.php index 0a71204..7e40bcf 100644 --- a/app/Services/Crawl/CrawlImportService.php +++ b/app/Services/Crawl/CrawlImportService.php @@ -138,6 +138,10 @@ class CrawlImportService { $payload = $item->payload ?? []; $externalId = $item->external_id; + $title = trim((string) $item->title); + if ($title === '') { + return null; + } $existing = Paper::query() ->where('external_id', $externalId) @@ -150,7 +154,7 @@ class CrawlImportService } $paper = Paper::query()->create([ - 'title' => $item->title, + 'title' => $title, 'authors' => $payload['authors'] ?? null, 'school_name' => $payload['school_name'] ?? null, 'published_at' => $this->normalizePaperPublishedAt($payload['published_at'] ?? null), diff --git a/app/Services/Crawl/CrawlKeywordParser.php b/app/Services/Crawl/CrawlKeywordParser.php index 442b87a..eb94138 100644 --- a/app/Services/Crawl/CrawlKeywordParser.php +++ b/app/Services/Crawl/CrawlKeywordParser.php @@ -83,6 +83,30 @@ class CrawlKeywordParser * 构建 arXiv search_query:短语内空格 AND,多短语之间 OR。 */ public static function buildArxivSearchQuery(?string $raw): string + { + return self::combineArxivSearchQueries(null, $raw); + } + + /** + * 结合列表页分类与关键词构建 search_query。 + */ + public static function combineArxivSearchQueries(?string $categoryQuery, ?string $keywordRaw): string + { + $categoryQuery = trim((string) $categoryQuery); + $keywordQuery = self::buildKeywordOnlyArxivSearchQuery($keywordRaw); + + if ($categoryQuery !== '' && $keywordQuery !== 'cat:*') { + return '('.$categoryQuery.') AND ('.$keywordQuery.')'; + } + + if ($categoryQuery !== '') { + return $categoryQuery; + } + + return $keywordQuery; + } + + protected static function buildKeywordOnlyArxivSearchQuery(?string $raw): string { $phrases = self::parsePhrases($raw); if ($phrases === []) { diff --git a/tests/Unit/ArxivApiAdapterTest.php b/tests/Unit/ArxivApiAdapterTest.php new file mode 100644 index 0000000..b0c7c35 --- /dev/null +++ b/tests/Unit/ArxivApiAdapterTest.php @@ -0,0 +1,76 @@ + Http::response('Rate exceeded.', 200), + 'arxiv.org/list/cs/new*' => Http::response($this->sampleListHtml(), 200), + ]); + + $enricher = $this->createMock(ArxivAbsEnricher::class); + $enricher->method('enrichMany')->willReturnCallback(fn (array $items) => $items); + + $adapter = new ArxivApiAdapter( + new ArxivRequestGate(0), + $enricher, + ); + $source = new CrawlSource([ + 'adapter_code' => 'arxiv_api', + 'target_type' => 'paper', + ]); + + $items = $adapter->fetch('https://arxiv.org/list/cs/new', $source, [ + 'max_results' => 5, + 'max_pages' => 1, + 'skip_imported' => false, + ]); + + $this->assertCount(2, $items); + $this->assertSame('arxiv:2606.23690', $items[0]->externalId); + $this->assertSame('Sample CS Paper One', $items[0]->title); + $this->assertSame('list_html', $items[0]->extra['source']); + } + + protected function sampleListHtml(): string + { + return <<<'HTML' +
+
+ arXiv:2606.23690 +
+
+
+
Title: + Sample CS Paper One +
+ +

Abstract one.

+
+
+
+ arXiv:2606.23691 +
+
+
+
Title: + Sample CS Paper Two +
+ +

Abstract two.

+
+
+
+HTML; + } +} diff --git a/tests/Unit/ArxivListUrlParserTest.php b/tests/Unit/ArxivListUrlParserTest.php new file mode 100644 index 0000000..c9f6f29 --- /dev/null +++ b/tests/Unit/ArxivListUrlParserTest.php @@ -0,0 +1,32 @@ +assertSame(['category' => 'cs', 'slice' => 'new'], $parsed); + $this->assertSame('cat:cs*', ArxivListUrlParser::buildCategorySearchQuery('https://arxiv.org/list/cs/new')); + $this->assertSame('cat:cs.AI', ArxivListUrlParser::buildCategorySearchQuery('https://arxiv.org/list/cs.AI/new')); + $this->assertSame('cat:astro-ph*', ArxivListUrlParser::buildCategorySearchQuery('https://arxiv.org/list/astro-ph/new')); + } + + public function test_builds_list_page_url_with_paging(): void + { + $url = ArxivListUrlParser::buildListPageUrl('https://arxiv.org/list/math/new', 50, 100); + $this->assertStringContainsString('skip=50', $url); + $this->assertStringContainsString('show=100', $url); + } + + public function test_combines_category_and_keyword_queries(): void + { + $query = CrawlKeywordParser::combineArxivSearchQueries('cat:math*', 'graph neural'); + $this->assertSame('(cat:math*) AND ((all:graph AND all:neural))', $query); + } +}