You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

91 lines
2.3 KiB

<?php
namespace App\Services\Crawl;
/**
 * Helpers for working with arXiv listing URLs of the form
 * "arxiv.org/list/<category>/<slice>" (for example ".../list/cs.AI/recent").
 *
 * All methods are static and side-effect free: they only inspect or rebuild
 * the URL string they are given.
 */
class ArxivListUrlParser
{
    /** Largest value ever emitted for the "show" (page size) query parameter. */
    private const MAX_SHOW = 2000;

    /**
     * Extracts the category and slice segments from an arXiv list URL.
     *
     * The host must be "arxiv.org" or a subdomain of it (the match may not be
     * preceded by a hostname character, so look-alike hosts such as
     * "notarxiv.org" are rejected). The scheme is optional.
     *
     * @param string|null $url URL to inspect; null and blank strings yield null.
     *
     * @return array{category: string, slice: string}|null Null when the URL is
     *     not a list URL or the category contains characters other than
     *     letters, digits, '-', '_' and '.'.
     */
    public static function parse(?string $url): ?array
    {
        $url = trim((string) $url);
        if ($url === '') {
            return null;
        }

        // The negative look-behind keeps "export.arxiv.org" working while
        // refusing hosts that merely end in the text "arxiv.org".
        if (! preg_match('#(?<![a-z0-9\-])arxiv\.org/list/([^/?\#]+)/([^/?\#]+)#i', $url, $matches)) {
            return null;
        }

        $category = trim($matches[1]);
        $slice = trim($matches[2]);

        if ($category === '' || $slice === '' || ! preg_match('/^[a-z0-9\-_.]+$/i', $category)) {
            return null;
        }

        return [
            'category' => $category,
            'slice' => $slice,
        ];
    }

    /**
     * Tells whether the given URL is recognised by {@see self::parse()}.
     */
    public static function isListUrl(?string $url): bool
    {
        return self::parse($url) !== null;
    }

    /**
     * Builds a "cat:" search expression from the category in a list URL.
     *
     * A category that contains a dot is used verbatim ("cat:cs.AI"); any
     * other category gets a trailing wildcard ("cat:cs*"), presumably so an
     * archive name also matches its sub-categories.
     *
     * @return string|null Null when the URL is not a list URL.
     */
    public static function buildCategorySearchQuery(?string $url): ?string
    {
        $parsed = self::parse($url);
        if ($parsed === null) {
            return null;
        }

        $category = $parsed['category'];

        return str_contains($category, '.')
            ? 'cat:'.$category
            : 'cat:'.$category.'*';
    }

    /**
     * Chooses sort parameters from the slice segment of a list URL.
     *
     * The slice "recent" (case-insensitive) sorts by last update; every other
     * slice, and any URL that cannot be parsed, sorts by submission date.
     * The order is always descending.
     *
     * @return array{sortBy: string, sortOrder: string}
     */
    public static function sortParams(?string $url): array
    {
        $parsed = self::parse($url);
        $slice = strtolower($parsed['slice'] ?? 'new');

        return [
            'sortBy' => $slice === 'recent' ? 'lastUpdatedDate' : 'submittedDate',
            'sortOrder' => 'descending',
        ];
    }

    /**
     * Rebuilds a list page URL with explicit "skip" and "show" parameters.
     *
     * Scheme, host, port and path are taken from $requestUrl (defaulting to
     * "https", "arxiv.org", no port and "/"); existing query parameters are
     * kept, except that "skip" and "show" are overwritten. Any fragment or
     * user-info part of the input is not carried over.
     *
     * @param string $requestUrl URL of the list page being paginated.
     * @param int    $skip       Offset of the first entry; negatives become 0.
     * @param int    $show       Page size; clamped to the range 1..2000.
     */
    public static function buildListPageUrl(string $requestUrl, int $skip, int $show): string
    {
        // parse_url() returns false for seriously malformed input; fall back
        // to the defaults below instead of indexing into a boolean.
        $parts = parse_url($requestUrl);
        if (! is_array($parts)) {
            $parts = [];
        }

        $scheme = $parts['scheme'] ?? 'https';
        $host = $parts['host'] ?? 'arxiv.org';
        // Keep a non-default port so mirrors and local test servers still resolve.
        $port = isset($parts['port']) ? ':'.$parts['port'] : '';
        $path = $parts['path'] ?? '/';

        $query = [];
        if (! empty($parts['query'])) {
            parse_str((string) $parts['query'], $query);
        }

        $query['skip'] = max(0, $skip);
        $query['show'] = min(self::MAX_SHOW, max(1, $show));

        // Pass '&' explicitly: the default separator comes from the
        // arg_separator.output ini setting and may be "&amp;".
        return $scheme.'://'.$host.$port.$path.'?'.http_build_query($query, '', '&');
    }
}