parent
d5529b2b22
commit
322baf9bfa
@ -0,0 +1,90 @@
|
||||
<?php
|
||||
|
||||
namespace App\Services\Crawl;
|
||||
|
||||
/**
 * Static helpers for working with arXiv category listing URLs of the form
 * "arxiv.org/list/{category}/{slice}" (for example "https://arxiv.org/list/cs.AI/new").
 */
class ArxivListUrlParser
{
    /** Upper bound applied to the "show" (page size) query parameter. */
    private const MAX_SHOW = 2000;

    /**
     * Extract the category and slice path segments from a listing URL.
     *
     * The URL may carry a scheme and a sub-domain ("https://export.arxiv.org/list/...")
     * or be scheme-less ("arxiv.org/list/..."). Look-alike hosts such as
     * "notarxiv.org" are rejected. Note that this is still a substring match: it does
     * not prove that "arxiv.org" is the host component of the URL.
     *
     * @return array{category: string, slice: string}|null Null when the URL is empty,
     *         is not a listing URL, or the category contains characters other than
     *         letters, digits, "-", "_" and ".".
     */
    public static function parse(?string $url): ?array
    {
        $url = trim((string) $url);
        if ($url === '') {
            return null;
        }

        // The lookbehind forbids a letter, digit or hyphen directly before "arxiv.org",
        // so "notarxiv.org" and "my-arxiv.org" fail while "www.arxiv.org" and
        // "//arxiv.org" still match.
        $pattern = '#(?<![a-z0-9\-])arxiv\.org/list/([^/?\#]+)/([^/?\#]+)#i';
        if (preg_match($pattern, $url, $matches) !== 1) {
            return null;
        }

        // Segments may contain surrounding whitespace because the pattern only
        // excludes "/", "?" and "#".
        $category = trim($matches[1]);
        $slice = trim($matches[2]);

        if ($category === '' || $slice === '') {
            return null;
        }

        if (preg_match('/^[a-z0-9\-_.]+$/i', $category) !== 1) {
            return null;
        }

        return [
            'category' => $category,
            'slice' => $slice,
        ];
    }

    /**
     * Tell whether the given URL is recognised by parse().
     */
    public static function isListUrl(?string $url): bool
    {
        return self::parse($url) !== null;
    }

    /**
     * Build a "cat:" search expression for the category named in a listing URL.
     *
     * A category containing a dot (e.g. "cs.AI") is used verbatim; a category
     * without a dot (e.g. "cs", "astro-ph") gets a trailing "*" wildcard.
     *
     * @return string|null Null when the URL is not a listing URL.
     */
    public static function buildCategorySearchQuery(?string $url): ?string
    {
        $parsed = self::parse($url);
        if ($parsed === null) {
            return null;
        }

        $category = $parsed['category'];

        return str_contains($category, '.')
            ? 'cat:'.$category
            : 'cat:'.$category.'*';
    }

    /**
     * Choose sort parameters from the slice segment of a listing URL.
     *
     * The "recent" slice (case-insensitive) sorts by "lastUpdatedDate"; every other
     * slice, and any URL that cannot be parsed, sorts by "submittedDate". The order
     * is always "descending".
     *
     * @return array{sortBy: string, sortOrder: string}
     */
    public static function sortParams(?string $url): array
    {
        $parsed = self::parse($url);
        $slice = strtolower($parsed['slice'] ?? 'new');

        return [
            'sortBy' => $slice === 'recent' ? 'lastUpdatedDate' : 'submittedDate',
            'sortOrder' => 'descending',
        ];
    }

    /**
     * Rebuild a listing URL with "skip" and "show" paging parameters set.
     *
     * Existing query parameters are kept; "skip" and "show" are overwritten.
     * "skip" is clamped to >= 0 and "show" to the range 1..2000. A missing scheme
     * defaults to "https" and a missing host to "arxiv.org". An explicit port is
     * preserved; any fragment is dropped.
     *
     * @param string $requestUrl URL of the listing page (absolute or scheme-less).
     * @param int    $skip       Number of entries to skip.
     * @param int    $show       Number of entries per page.
     */
    public static function buildListPageUrl(string $requestUrl, int $skip, int $show): string
    {
        $parts = parse_url($requestUrl);
        if (! is_array($parts)) {
            // parse_url() returns false for seriously malformed URLs; fall back to defaults.
            $parts = [];
        }

        // For a scheme-less URL such as "arxiv.org/list/cs/new", parse_url() reports the
        // host as part of the path. Re-parse with a scheme so the host is not duplicated.
        $hostInPath = ! isset($parts['scheme'])
            && ! isset($parts['host'])
            && isset($parts['path'])
            && ! str_starts_with($parts['path'], '/');
        if ($hostInPath) {
            $reparsed = parse_url('https://'.$requestUrl);
            if (is_array($reparsed) && isset($reparsed['host'])) {
                $parts = $reparsed;
            }
        }

        $scheme = $parts['scheme'] ?? 'https';
        $host = $parts['host'] ?? 'arxiv.org';
        $port = isset($parts['port']) ? ':'.$parts['port'] : '';
        $path = $parts['path'] ?? '/';

        $query = [];
        if (! empty($parts['query'])) {
            parse_str((string) $parts['query'], $query);
        }

        $query['skip'] = max(0, $skip);
        $query['show'] = min(self::MAX_SHOW, max(1, $show));

        return $scheme.'://'.$host.$port.$path.'?'.http_build_query($query);
    }
}
|
||||
@ -0,0 +1,76 @@
|
||||
<?php
|
||||
|
||||
namespace Tests\Unit;
|
||||
|
||||
use App\Models\CrawlSource;
|
||||
use App\Services\Crawl\Adapters\ArxivApiAdapter;
|
||||
use App\Services\Crawl\ArxivAbsEnricher;
|
||||
use App\Services\Crawl\ArxivRequestGate;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use Tests\TestCase;
|
||||
|
||||
/**
 * Exercises ArxivApiAdapter::fetch() against faked HTTP responses.
 */
class ArxivApiAdapterTest extends TestCase
{
    /**
     * The export endpoint is faked to answer "Rate exceeded." and the category list
     * page is faked to return two entries; fetch() is expected to return both entries,
     * tagged with the "list_html" source.
     */
    public function test_fetches_papers_from_category_list_html_when_api_empty(): void
    {
        Http::fake([
            'export.arxiv.org/*' => Http::response('Rate exceeded.', 200),
            'arxiv.org/list/cs/new*' => Http::response($this->sampleListHtml(), 200),
        ]);

        // The enricher is stubbed to hand the items back untouched.
        $passThroughEnricher = $this->createMock(ArxivAbsEnricher::class);
        $passThroughEnricher
            ->method('enrichMany')
            ->willReturnCallback(fn (array $items) => $items);

        $crawlSource = new CrawlSource([
            'adapter_code' => 'arxiv_api',
            'target_type' => 'paper',
        ]);
        $fetchOptions = [
            'max_results' => 5,
            'max_pages' => 1,
            'skip_imported' => false,
        ];

        $adapter = new ArxivApiAdapter(new ArxivRequestGate(0), $passThroughEnricher);
        $items = $adapter->fetch('https://arxiv.org/list/cs/new', $crawlSource, $fetchOptions);

        $this->assertCount(2, $items);

        $firstItem = $items[0];
        $this->assertSame('arxiv:2606.23690', $firstItem->externalId);
        $this->assertSame('Sample CS Paper One', $firstItem->title);
        $this->assertSame('list_html', $firstItem->extra['source']);
    }

    /**
     * Minimal listing-page markup containing two <dt>/<dd> entries.
     */
    protected function sampleListHtml(): string
    {
        return <<<'HTML'
        <dl>
        <dt>
        <a href ="/abs/2606.23690" title="Abstract" id="2606.23690">arXiv:2606.23690</a>
        </dt>
        <dd>
        <div class='meta'>
        <div class='list-title mathjax'><span class='descriptor'>Title:</span>
        Sample CS Paper One
        </div>
        <div class='list-authors'><a href="#">Alice Author</a></div>
        <p class='mathjax'>Abstract one.</p>
        </div>
        </dd>
        <dt>
        <a href ="/abs/2606.23691" title="Abstract" id="2606.23691">arXiv:2606.23691</a>
        </dt>
        <dd>
        <div class='meta'>
        <div class='list-title mathjax'><span class='descriptor'>Title:</span>
        Sample CS Paper Two
        </div>
        <div class='list-authors'><a href="#">Bob Author</a></div>
        <p class='mathjax'>Abstract two.</p>
        </div>
        </dd>
        </dl>
        HTML;
    }
}
|
||||
@ -0,0 +1,32 @@
|
||||
<?php
|
||||
|
||||
namespace Tests\Unit;
|
||||
|
||||
use App\Services\Crawl\ArxivListUrlParser;
|
||||
use App\Services\Crawl\CrawlKeywordParser;
|
||||
use PHPUnit\Framework\TestCase;
|
||||
|
||||
/**
 * Unit tests for ArxivListUrlParser, plus one check of
 * CrawlKeywordParser::combineArxivSearchQueries().
 */
class ArxivListUrlParserTest extends TestCase
{
    /**
     * parse() returns the category/slice pair, and buildCategorySearchQuery() adds a
     * wildcard only for categories without a dot.
     */
    public function test_parses_category_list_urls(): void
    {
        self::assertSame(
            ['category' => 'cs', 'slice' => 'new'],
            ArxivListUrlParser::parse('https://arxiv.org/list/cs/new'),
        );

        $expectedQueryByUrl = [
            'https://arxiv.org/list/cs/new' => 'cat:cs*',
            'https://arxiv.org/list/cs.AI/new' => 'cat:cs.AI',
            'https://arxiv.org/list/astro-ph/new' => 'cat:astro-ph*',
        ];

        foreach ($expectedQueryByUrl as $listUrl => $expectedQuery) {
            self::assertSame($expectedQuery, ArxivListUrlParser::buildCategorySearchQuery($listUrl));
        }
    }

    /**
     * buildListPageUrl() writes the requested skip/show values into the query string.
     */
    public function test_builds_list_page_url_with_paging(): void
    {
        $pagedUrl = ArxivListUrlParser::buildListPageUrl('https://arxiv.org/list/math/new', 50, 100);

        foreach (['skip=50', 'show=100'] as $expectedFragment) {
            self::assertStringContainsString($expectedFragment, $pagedUrl);
        }
    }

    /**
     * A category query and a keyword phrase are joined with AND.
     */
    public function test_combines_category_and_keyword_queries(): void
    {
        self::assertSame(
            '(cat:math*) AND ((all:graph AND all:neural))',
            CrawlKeywordParser::combineArxivSearchQueries('cat:math*', 'graph neural'),
        );
    }
}
|
||||
Loading…
Reference in new issue