交大智能研究院

master
lion 3 days ago
parent d5529b2b22
commit 322baf9bfa

@ -314,6 +314,12 @@ class CrawlJobController extends Controller
->where('status', 'imported')
->count();
$duplicateTeacherCount = CrawlJobItem::query()
->where('crawl_job_id', $job->id)
->where('target_type', 'teacher')
->where('status', 'duplicate')
->count();
return [
'id' => $job->id,
'target_type' => $job->target_type,
@ -333,6 +339,7 @@ class CrawlJobController extends Controller
'preview_teacher_count' => $previewTeacherCount,
'papers_imported' => $importedPaperCount,
'teacher_leads_imported' => $importedTeacherLeadCount,
'teacher_duplicates_skipped' => $duplicateTeacherCount,
'result_summary' => $job->result_summary,
'completed_at' => $job->completed_at?->toIso8601String(),
];
@ -425,12 +432,21 @@ class CrawlJobController extends Controller
}
if ($job->target_type === 'teacher') {
$imported = (int) ($importResult['teachers_imported'] ?? 0);
$duplicateCount = $this->countTeacherDuplicateItems($job);
$summary = sprintf(
'已从 %s 抓取 %d 位老师,已入库 %d 位老师',
$sourceName,
$fetched,
(int) ($importResult['teachers_imported'] ?? 0),
$imported,
);
if ($duplicateCount > 0) {
$summary .= sprintf(',跳过 %d 位(老师库中已存在)', $duplicateCount);
}
$failedCount = (int) ($importResult['failed'] ?? 0);
if ($failedCount > 0) {
$summary .= sprintf(',失败 %d 位', $failedCount);
}
$skippedProfiles = $this->countProfileEnrichSkipped($job);
if ($skippedProfiles > 0) {
$summary .= sprintf('%d 位未访问主页补邮箱,避免超时)', $skippedProfiles);
@ -461,6 +477,15 @@ class CrawlJobController extends Controller
->count();
}
/**
 * Count the items of a crawl job that were flagged as duplicate teachers.
 *
 * @param CrawlJob $job Crawl job whose items are counted.
 * @return int Number of items with target_type "teacher" and status "duplicate".
 */
protected function countTeacherDuplicateItems(CrawlJob $job): int
{
    $duplicateTeachers = CrawlJobItem::query()
        ->where('crawl_job_id', $job->id)
        ->where('target_type', 'teacher')
        ->where('status', 'duplicate');

    return (int) $duplicateTeachers->count();
}
/**
* @param array{
* imported:int,
@ -484,6 +509,10 @@ class CrawlJobController extends Controller
if ($job->target_type === 'teacher') {
$teachers = (int) ($importResult['teachers_imported'] ?? 0);
$duplicateCount = $this->countTeacherDuplicateItems($job);
if ($duplicateCount > 0) {
return "抓取完成,已入库 {$teachers} 位老师,跳过 {$duplicateCount} 位";
}
return "抓取完成,已入库 {$teachers} 位老师";
}

@ -5,6 +5,7 @@ namespace App\Services\Crawl\Adapters;
use App\Models\CrawlSource;
use App\Models\Paper;
use App\Services\Crawl\ArxivAbsEnricher;
use App\Services\Crawl\ArxivListUrlParser;
use App\Services\Crawl\ArxivMetadataParser;
use App\Services\Crawl\ArxivRequestGate;
use App\Services\Crawl\ArxivTextNormalizer;
@ -45,6 +46,9 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
$skipImported = ($params['skip_imported'] ?? true) !== false;
$importedIds = $skipImported ? $this->loadImportedExternalIds() : [];
$maxScanPages = $this->resolveMaxScanPages($maxPages, $maxResults, $skipImported);
$isListUrl = ArxivListUrlParser::isListUrl($requestUrl);
$searchQuery = $this->resolveSearchQuery($requestUrl, $keywordRaw);
$sort = ArxivListUrlParser::sortParams($requestUrl);
if ((bool) config('crawl.arxiv.prefer_html_search', false)) {
return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, $maxResults));
@ -55,7 +59,7 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
for ($page = 0; $page < $maxScanPages && count($items) < $maxResults; $page++) {
$start = $page * $pageSize;
$batch = $this->fetchApiPage($keywordRaw, $start, $pageSize);
$batch = $this->fetchApiPage($keywordRaw, $start, $pageSize, $searchQuery, $sort);
if ($batch === []) {
break;
}
@ -85,27 +89,58 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
return $this->finalizeItems($items);
}
if ($keywordRaw === '') {
throw new \RuntimeException('arXiv API 未返回结果,请稍后重试');
if ($isListUrl) {
$listItems = $this->fetchViaListHtml(
$requestUrl,
$keywordRaw,
$maxResults,
$maxScanPages,
$skipImported,
$importedIds,
);
if ($listItems !== []) {
return $this->finalizeItems($listItems);
}
}
if ($keywordRaw !== '') {
return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, min($maxResults, $pageSize)));
}
return $this->finalizeItems($this->requireHtmlSearchItems($keywordRaw, min($maxResults, $pageSize)));
throw new \RuntimeException(
$isListUrl
? 'arXiv 分类列表未返回结果,请稍后重试或检查 URL'
: 'arXiv API 未返回结果,请稍后重试'
);
}
/**
 * Build the arXiv API search_query for a request.
 *
 * The category clause derived from the request URL (null when the URL is not a
 * category listing) is merged with the raw keywords by
 * CrawlKeywordParser::combineArxivSearchQueries().
 */
protected function resolveSearchQuery(string $requestUrl, string $keywordRaw): string
{
    return CrawlKeywordParser::combineArxivSearchQueries(
        ArxivListUrlParser::buildCategorySearchQuery($requestUrl),
        $keywordRaw,
    );
}
/**
* @param array{sortBy: string, sortOrder: string} $sort
* @return list<CrawlItemDto>
*/
protected function fetchApiPage(string $keywordRaw, int $start, int $maxResults): array
{
protected function fetchApiPage(
string $keywordRaw,
int $start,
int $maxResults,
?string $searchQuery = null,
array $sort = ['sortBy' => 'submittedDate', 'sortOrder' => 'descending'],
): array {
$maxResults = min(50, max(1, $maxResults));
try {
$response = $this->requestApiOnce([
'search_query' => CrawlKeywordParser::buildArxivSearchQuery($keywordRaw),
'search_query' => $searchQuery ?? CrawlKeywordParser::buildArxivSearchQuery($keywordRaw),
'start' => $start,
'max_results' => $maxResults,
'sortBy' => 'submittedDate',
'sortOrder' => 'descending',
'sortBy' => $sort['sortBy'] ?? 'submittedDate',
'sortOrder' => $sort['sortOrder'] ?? 'descending',
]);
} catch (ConnectionException|RequestException) {
return [];
@ -116,7 +151,7 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
}
$body = $response->body();
if (str_contains($body, 'arxiv.org/api/errors') || str_contains($body, '<title>Error</title>')) {
if (str_contains($body, 'arxiv.org/api/errors') || str_contains($body, '<title>Error</title>') || str_contains($body, 'Rate exceeded')) {
return [];
}
@ -316,6 +351,161 @@ class ArxivApiAdapter implements CrawlerAdapterInterface
return $items;
}
/**
 * Crawl an arXiv category listing page by page and collect matching papers.
 *
 * Pages are requested through the request gate with skip/show paging. Scanning stops
 * when enough items were collected, when $maxScanPages pages were read, when a request
 * fails, or when a page holds fewer raw entries than the page size (end of listing).
 *
 * @param string $requestUrl Listing URL the paging parameters are applied to.
 * @param string $keywordRaw Raw keyword string used for filtering and recorded on each item.
 * @param int $maxResults Maximum number of items to return.
 * @param int $maxScanPages Maximum number of listing pages to request.
 * @param bool $skipImported Whether items whose external id is in $importedIds are dropped.
 * @param array<string, true> $importedIds External ids that were already imported.
 * @return list<CrawlItemDto>
 */
protected function fetchViaListHtml(
    string $requestUrl,
    string $keywordRaw,
    int $maxResults,
    int $maxScanPages,
    bool $skipImported,
    array $importedIds,
): array {
    $keywords = CrawlKeywordParser::parse($keywordRaw);
    $pageSize = min(200, max(50, $maxResults));
    $items = [];
    $seen = [];

    for ($page = 0; $page < $maxScanPages && count($items) < $maxResults; $page++) {
        $pageUrl = ArxivListUrlParser::buildListPageUrl($requestUrl, $page * $pageSize, $pageSize);

        try {
            $response = $this->gate->run(fn () => $this->sendRequest($pageUrl, []));
        } catch (ConnectionException|RequestException) {
            break;
        }

        if (! $response->successful()) {
            break;
        }

        $html = $response->body();

        // Count the raw entries on the page. The end-of-listing decision must not depend
        // on how many entries survive keyword filtering, de-duplication or the result cap,
        // otherwise scanning stops early whenever a full page is partly filtered out.
        $rawEntryCount = preg_match_all('#<dt>.*?</dt>\s*<dd>.*?</dd>#s', $html);
        if ($rawEntryCount === false || $rawEntryCount === 0) {
            break;
        }

        // Parse the whole page; the loop below enforces $maxResults after filtering.
        $batch = $this->parseListHtml($html, $keywordRaw, $keywords, PHP_INT_MAX);

        foreach ($batch as $item) {
            if (isset($seen[$item->externalId])) {
                continue;
            }
            $seen[$item->externalId] = true;

            if ($skipImported && isset($importedIds[$item->externalId])) {
                continue;
            }

            $items[] = $item;
            if (count($items) >= $maxResults) {
                break 2;
            }
        }

        // A short page means the listing has no further pages.
        if ($rawEntryCount < $pageSize) {
            break;
        }
    }

    return $items;
}
/**
 * Parse the HTML of an arXiv category listing page into crawl items.
 *
 * Every entry is a <dt> (carrying the /abs/ link) followed by a <dd> (title, authors and,
 * when present, the abstract). Entries without an arXiv id or title are skipped, and so
 * are entries that CrawlKeywordParser::matchesAny() rejects for the title alone and for
 * title plus abstract.
 *
 * @param string $html Raw HTML of the listing page.
 * @param string $keyword Raw keyword string, stored in each item's extra data.
 * @param list<string> $keywords Parsed keywords used for filtering.
 * @param int $maxResults Maximum number of items to return.
 * @return list<CrawlItemDto>
 */
protected function parseListHtml(string $html, string $keyword, array $keywords, int $maxResults): array
{
    if (! preg_match_all('#<dt>(.*?)</dt>\s*<dd>(.*?)</dd>#s', $html, $blocks, PREG_SET_ORDER)) {
        return [];
    }

    // Markup fragment -> plain text: strip tags, decode entities, trim.
    $toText = static fn (string $fragment): string => trim(
        html_entity_decode(strip_tags($fragment), ENT_QUOTES | ENT_HTML5, 'UTF-8')
    );

    $items = [];
    foreach ($blocks as $block) {
        if (count($items) >= $maxResults) {
            break;
        }

        $head = $block[1];
        $body = $block[2];

        if (! preg_match('#/abs/([^"\s?]+)#', $head, $idMatch)) {
            continue;
        }
        $arxivId = trim($idMatch[1], '/');
        if ($arxivId === '') {
            continue;
        }

        $title = '';
        if (preg_match('#<div class=[\'"]list-title mathjax[\'"][^>]*>.*?<span class=[\'"]descriptor[\'"]>Title:</span>\s*(.*?)\s*</div>#s', $body, $titleMatch)) {
            $title = ArxivTextNormalizer::normalize($toText($titleMatch[1])) ?? '';
        }
        if ($title === '') {
            continue;
        }

        // Extract the abstract once; it serves both the keyword probe and the stored summary.
        $abstractText = null;
        if (preg_match('#<p class=[\'"]mathjax[\'"]>(.*?)</p>#s', $body, $abstractMatch)) {
            $abstractText = $toText($abstractMatch[1]);
        }

        // Title-only match first; fall back to title + abstract.
        if (
            ! CrawlKeywordParser::matchesAny($title, null, $keywords)
            && ! CrawlKeywordParser::matchesAny($title, $abstractText ?? '', $keywords)
        ) {
            continue;
        }

        // Decode author names like the title and abstract. Building the parsed list straight
        // from the anchors (instead of re-splitting the joined string on ';') keeps names that
        // contain HTML entities such as "M&uuml;ller" from being cut at the entity's semicolon.
        $authorNames = [];
        if (preg_match('#<div class=[\'"]list-authors[\'"]>(.*?)</div>#s', $body, $authorMatch)) {
            if (preg_match_all('#>([^<]+)</a>#', $authorMatch[1], $anchorTexts)) {
                foreach ($anchorTexts[1] as $rawName) {
                    $name = $toText($rawName);
                    if ($name !== '') {
                        $authorNames[] = $name;
                    }
                }
            }
        }

        $authors = implode('; ', $authorNames);
        $authorsParsed = array_map(
            static fn (string $name): array => [
                'name' => $name,
                'email' => null,
                'affiliation' => null,
                'university_name' => null,
            ],
            $authorNames,
        );

        $summary = $abstractText === null ? null : ArxivTextNormalizer::normalize($abstractText);
        $lead = CrawlAuthorParser::leadAuthor($authors, $authorsParsed);

        $items[] = new CrawlItemDto(
            externalId: 'arxiv:'.$arxivId,
            title: $title,
            canonicalUrl: 'https://arxiv.org/abs/'.$arxivId,
            authors: $authors,
            summary: $summary,
            publishedAt: ArxivMetadataParser::parsePublishedDate($body),
            schoolName: $lead['university_name'] ?? null,
            extra: [
                'platform' => 'arxiv',
                'arxiv_id' => $arxivId,
                'keyword' => $keyword,
                'source' => 'list_html',
                'pdf_url' => ArxivMetadataParser::extractPdfUrl($body, $arxivId),
                'html_url' => ArxivMetadataParser::extractHtmlUrl($body, $arxivId),
                'authors_parsed' => $authorsParsed,
                'lead_author' => $lead,
            ],
            authorsParsed: $authorsParsed,
        );
    }

    return $items;
}
/**
* @param array<string, mixed> $queryParams
*/

@ -0,0 +1,90 @@
<?php
namespace App\Services\Crawl;
/**
 * Helpers for arXiv category listing URLs such as https://arxiv.org/list/cs.AI/new.
 *
 * Only URLs whose host is arxiv.org or one of its subdomains are recognised, so a foreign
 * URL that merely contains "arxiv.org/list/" in its path or query is not treated as a listing.
 */
class ArxivListUrlParser
{
    /**
     * Extract the category and slice from an arXiv listing URL.
     *
     * A missing scheme is tolerated ("arxiv.org/list/cs/new").
     *
     * @return array{category: string, slice: string}|null Null when the URL is empty, is not
     *     hosted on arxiv.org, has no /list/{category}/{slice} path, or has an invalid category.
     */
    public static function parse(?string $url): ?array
    {
        $parts = self::splitUrl(trim((string) $url));
        if ($parts === null || ! self::isArxivHost((string) ($parts['host'] ?? ''))) {
            return null;
        }

        if (! preg_match('#^/list/([^/?\#]+)/([^/?\#]+)#i', (string) ($parts['path'] ?? ''), $matches)) {
            return null;
        }

        $category = trim($matches[1]);
        $slice = trim($matches[2]);
        if ($category === '' || $slice === '' || ! preg_match('/^[a-z0-9\-_.]+$/i', $category)) {
            return null;
        }

        return [
            'category' => $category,
            'slice' => $slice,
        ];
    }

    /**
     * Whether the URL is an arXiv category listing URL.
     */
    public static function isListUrl(?string $url): bool
    {
        return self::parse($url) !== null;
    }

    /**
     * Build the "cat:" search clause for the category of a listing URL.
     *
     * A category containing a dot ("cs.AI") is used as-is; a category without a dot
     * ("cs", "astro-ph") gets a trailing wildcard.
     *
     * @return string|null Null when the URL is not a listing URL.
     */
    public static function buildCategorySearchQuery(?string $url): ?string
    {
        $parsed = self::parse($url);
        if ($parsed === null) {
            return null;
        }

        $category = $parsed['category'];

        return str_contains($category, '.')
            ? 'cat:'.$category
            : 'cat:'.$category.'*';
    }

    /**
     * Map the slice of a listing URL to API sort parameters.
     *
     * The "recent" slice sorts by last update; every other slice, and any URL that is not
     * a listing URL, sorts by submission date. The order is always descending.
     *
     * @return array{sortBy: string, sortOrder: string}
     */
    public static function sortParams(?string $url): array
    {
        $parsed = self::parse($url);
        $slice = strtolower($parsed['slice'] ?? 'new');

        return [
            'sortBy' => $slice === 'recent' ? 'lastUpdatedDate' : 'submittedDate',
            'sortOrder' => 'descending',
        ];
    }

    /**
     * Return the request URL with its paging parameters replaced.
     *
     * Existing query parameters are kept; "skip" is clamped to >= 0 and "show" to 1..2000.
     * A missing scheme defaults to https, a missing host to arxiv.org, and an explicit
     * port is preserved.
     */
    public static function buildListPageUrl(string $requestUrl, int $skip, int $show): string
    {
        $parts = self::splitUrl(trim($requestUrl)) ?? [];

        $scheme = $parts['scheme'] ?? 'https';
        $host = $parts['host'] ?? 'arxiv.org';
        $port = isset($parts['port']) ? ':'.$parts['port'] : '';
        $path = $parts['path'] ?? '/';

        $query = [];
        if (isset($parts['query']) && $parts['query'] !== '') {
            parse_str((string) $parts['query'], $query);
        }
        $query['skip'] = max(0, $skip);
        $query['show'] = min(2000, max(1, $show));

        return $scheme.'://'.$host.$port.$path.'?'.http_build_query($query);
    }

    /**
     * Split a URL into its components, assuming https when no scheme is given.
     *
     * Without the scheme default, parse_url() would report "arxiv.org/list/cs/new" as a
     * bare path with no host.
     *
     * @return array<string, int|string>|null Null for an empty or unparseable URL.
     */
    private static function splitUrl(string $url): ?array
    {
        if ($url === '') {
            return null;
        }

        if (! preg_match('#^[a-z][a-z0-9+.\-]*://#i', $url)) {
            $url = 'https://'.ltrim($url, '/');
        }

        $parts = parse_url($url);

        return is_array($parts) ? $parts : null;
    }

    /**
     * Whether the host is arxiv.org or one of its subdomains (e.g. export.arxiv.org).
     */
    private static function isArxivHost(string $host): bool
    {
        $host = strtolower($host);

        return $host === 'arxiv.org' || str_ends_with($host, '.arxiv.org');
    }
}

@ -138,6 +138,10 @@ class CrawlImportService
{
$payload = $item->payload ?? [];
$externalId = $item->external_id;
$title = trim((string) $item->title);
if ($title === '') {
return null;
}
$existing = Paper::query()
->where('external_id', $externalId)
@ -150,7 +154,7 @@ class CrawlImportService
}
$paper = Paper::query()->create([
'title' => $item->title,
'title' => $title,
'authors' => $payload['authors'] ?? null,
'school_name' => $payload['school_name'] ?? null,
'published_at' => $this->normalizePaperPublishedAt($payload['published_at'] ?? null),

@ -83,6 +83,30 @@ class CrawlKeywordParser
* Build an arXiv search_query: words inside a phrase are joined with AND, and multiple phrases are joined with OR.
*/
public static function buildArxivSearchQuery(?string $raw): string
{
    // Keyword-only query: delegate to the combiner with no category clause.
    return self::combineArxivSearchQueries(categoryQuery: null, keywordRaw: $raw);
}
/**
 * Combine a listing-page category clause with the keyword query into one search_query.
 *
 * - No category clause: the keyword query is returned unchanged.
 * - Category clause and a keyword query equal to 'cat:*': only the category clause is returned.
 * - Otherwise both are parenthesised and joined with AND.
 */
public static function combineArxivSearchQueries(?string $categoryQuery, ?string $keywordRaw): string
{
    $category = trim((string) $categoryQuery);
    $keywords = self::buildKeywordOnlyArxivSearchQuery($keywordRaw);

    if ($category === '') {
        return $keywords;
    }

    // NOTE(review): 'cat:*' is presumably what buildKeywordOnlyArxivSearchQuery() yields
    // when there are no keyword phrases — confirm against that method.
    if ($keywords === 'cat:*') {
        return $category;
    }

    return '('.$category.') AND ('.$keywords.')';
}
protected static function buildKeywordOnlyArxivSearchQuery(?string $raw): string
{
$phrases = self::parsePhrases($raw);
if ($phrases === []) {

@ -0,0 +1,76 @@
<?php
namespace Tests\Unit;
use App\Models\CrawlSource;
use App\Services\Crawl\Adapters\ArxivApiAdapter;
use App\Services\Crawl\ArxivAbsEnricher;
use App\Services\Crawl\ArxivRequestGate;
use Illuminate\Support\Facades\Http;
use Tests\TestCase;
/**
 * Covers the adapter's fallback from the arXiv API to the category listing HTML.
 */
class ArxivApiAdapterTest extends TestCase
{
    /**
     * The API endpoint is faked to answer "Rate exceeded."; the adapter is then expected to
     * return the two papers contained in the faked listing page, marked as list_html items.
     */
    public function test_fetches_papers_from_category_list_html_when_api_empty(): void
    {
        Http::fake([
            'export.arxiv.org/*' => Http::response('Rate exceeded.', 200),
            'arxiv.org/list/cs/new*' => Http::response($this->sampleListHtml(), 200),
        ]);

        // Enrichment is stubbed to hand the items back untouched.
        $passthroughEnricher = $this->createMock(ArxivAbsEnricher::class);
        $passthroughEnricher->method('enrichMany')->willReturnCallback(fn (array $items) => $items);

        $adapter = new ArxivApiAdapter(new ArxivRequestGate(0), $passthroughEnricher);

        $source = new CrawlSource([
            'adapter_code' => 'arxiv_api',
            'target_type' => 'paper',
        ]);
        $params = [
            'max_results' => 5,
            'max_pages' => 1,
            'skip_imported' => false,
        ];

        $papers = $adapter->fetch('https://arxiv.org/list/cs/new', $source, $params);

        self::assertCount(2, $papers);
        self::assertSame('arxiv:2606.23690', $papers[0]->externalId);
        self::assertSame('Sample CS Paper One', $papers[0]->title);
        self::assertSame('list_html', $papers[0]->extra['source']);
    }

    /**
     * Minimal listing page with two <dt>/<dd> entries.
     */
    protected function sampleListHtml(): string
    {
        return <<<'HTML'
<dl>
<dt>
<a href ="/abs/2606.23690" title="Abstract" id="2606.23690">arXiv:2606.23690</a>
</dt>
<dd>
<div class='meta'>
<div class='list-title mathjax'><span class='descriptor'>Title:</span>
Sample CS Paper One
</div>
<div class='list-authors'><a href="#">Alice Author</a></div>
<p class='mathjax'>Abstract one.</p>
</div>
</dd>
<dt>
<a href ="/abs/2606.23691" title="Abstract" id="2606.23691">arXiv:2606.23691</a>
</dt>
<dd>
<div class='meta'>
<div class='list-title mathjax'><span class='descriptor'>Title:</span>
Sample CS Paper Two
</div>
<div class='list-authors'><a href="#">Bob Author</a></div>
<p class='mathjax'>Abstract two.</p>
</div>
</dd>
</dl>
HTML;
    }
}

@ -0,0 +1,32 @@
<?php
namespace Tests\Unit;
use App\Services\Crawl\ArxivListUrlParser;
use App\Services\Crawl\CrawlKeywordParser;
use PHPUnit\Framework\TestCase;
/**
 * Unit tests for listing-URL parsing, paging URLs and category/keyword query combination.
 */
class ArxivListUrlParserTest extends TestCase
{
    /**
     * A listing URL yields its category and slice, and the matching "cat:" clause.
     */
    public function test_parses_category_list_urls(): void
    {
        self::assertSame(
            ['category' => 'cs', 'slice' => 'new'],
            ArxivListUrlParser::parse('https://arxiv.org/list/cs/new'),
        );

        self::assertSame('cat:cs*', ArxivListUrlParser::buildCategorySearchQuery('https://arxiv.org/list/cs/new'));
        self::assertSame('cat:cs.AI', ArxivListUrlParser::buildCategorySearchQuery('https://arxiv.org/list/cs.AI/new'));
        self::assertSame('cat:astro-ph*', ArxivListUrlParser::buildCategorySearchQuery('https://arxiv.org/list/astro-ph/new'));
    }

    /**
     * The paging URL carries the requested skip and show values.
     */
    public function test_builds_list_page_url_with_paging(): void
    {
        $pageUrl = ArxivListUrlParser::buildListPageUrl('https://arxiv.org/list/math/new', 50, 100);

        self::assertStringContainsString('skip=50', $pageUrl);
        self::assertStringContainsString('show=100', $pageUrl);
    }

    /**
     * A category clause and a keyword phrase are both parenthesised and joined with AND.
     */
    public function test_combines_category_and_keyword_queries(): void
    {
        $combined = CrawlKeywordParser::combineArxivSearchQueries('cat:math*', 'graph neural');

        self::assertSame('(cat:math*) AND ((all:graph AND all:neural))', $combined);
    }
}
Loading…
Cancel
Save