You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
91 lines
2.3 KiB
91 lines
2.3 KiB
<?php
|
|
|
|
namespace App\Services\Crawl;
|
|
|
|
/**
 * Static helpers for working with arXiv listing URLs of the form
 * "arxiv.org/list/{category}/{slice}" (for example ".../list/cs.AI/recent").
 *
 * The class holds no state; every method is a pure function of its arguments.
 */
class ArxivListUrlParser
{
    /**
     * Extract the category and slice path segments from an arXiv listing URL.
     *
     * The URL may omit the scheme and may carry a query string or fragment;
     * host and path are matched case-insensitively. Sub-domains of arxiv.org
     * are accepted, look-alike hosts such as "notarxiv.org" are not.
     *
     * @param  string|null  $url  Candidate URL; null and blank strings yield null.
     * @return array{category: string, slice: string}|null Null when the URL is not a listing URL
     *                                                     or the category contains characters other
     *                                                     than letters, digits, "-", "_" and ".".
     */
    public static function parse(?string $url): ?array
    {
        $url = trim((string) $url);
        if ($url === '') {
            return null;
        }

        // The lookbehind makes sure "arxiv.org" starts a host label, so that
        // "notarxiv.org" or "my-arxiv.org" are rejected while "export.arxiv.org",
        // "//arxiv.org" and a bare "arxiv.org/..." are still accepted.
        if (! preg_match('#(?<![a-z0-9\-])arxiv\.org/list/([^/?\#]+)/([^/?\#]+)#i', $url, $matches)) {
            return null;
        }

        $category = trim($matches[1]);
        $slice = trim($matches[2]);

        // A segment consisting only of whitespace passes the pattern above but is empty after trimming.
        if ($category === '' || $slice === '') {
            return null;
        }

        // Only the category is restricted to a safe character set; the slice is returned as found.
        if (! preg_match('/^[a-z0-9\-_.]+$/i', $category)) {
            return null;
        }

        return [
            'category' => $category,
            'slice' => $slice,
        ];
    }

    /**
     * Tell whether the given URL is recognised by parse() as an arXiv listing URL.
     */
    public static function isListUrl(?string $url): bool
    {
        return self::parse($url) !== null;
    }

    /**
     * Build a "cat:" search expression for the category named in a listing URL.
     *
     * A category containing a dot (e.g. "cs.AI") is used verbatim; a category
     * without a dot (e.g. "cs") gets a trailing "*" wildcard appended.
     * NOTE(review): the "cat:" syntax presumably targets the arXiv API search_query — confirm against the caller.
     *
     * @return string|null Null when the URL is not a listing URL.
     */
    public static function buildCategorySearchQuery(?string $url): ?string
    {
        $parsed = self::parse($url);
        if ($parsed === null) {
            return null;
        }

        $category = $parsed['category'];

        return str_contains($category, '.')
            ? 'cat:'.$category
            : 'cat:'.$category.'*';
    }

    /**
     * Choose sort parameters matching the slice of a listing URL.
     *
     * The "recent" slice (case-insensitive) sorts by last-updated date; every
     * other slice, and any URL that cannot be parsed, sorts by submission date.
     * The order is always descending.
     *
     * @return array{sortBy: string, sortOrder: string}
     */
    public static function sortParams(?string $url): array
    {
        // An unparsable URL is treated like the "new" slice.
        $slice = strtolower(self::parse($url)['slice'] ?? 'new');

        return [
            'sortBy' => $slice === 'recent' ? 'lastUpdatedDate' : 'submittedDate',
            'sortOrder' => 'descending',
        ];
    }

    /**
     * Rebuild a listing URL with explicit "skip" and "show" pagination parameters.
     *
     * Existing query parameters are kept; "skip" and "show" are overwritten.
     * Missing URL parts default to scheme "https", host "arxiv.org" and path "/".
     * Any fragment of the request URL is not carried over.
     *
     * @param  string  $requestUrl  Listing URL; a scheme-less form such as "arxiv.org/list/cs.AI/recent" is accepted.
     * @param  int  $skip  Offset of the first entry; negative values are raised to 0.
     * @param  int  $show  Page size; clamped to the range 1..2000.
     */
    public static function buildListPageUrl(string $requestUrl, int $skip, int $show): string
    {
        $requestUrl = trim($requestUrl);

        // parse() accepts URLs without a scheme, but parse_url() would read
        // "arxiv.org/list/..." entirely as a path and the host would end up
        // duplicated. When the text before the first "/", "?" or "#" looks like
        // a host name (contains a dot) and no scheme is present, add one.
        if (preg_match('#^(?![a-z][a-z0-9+.\-]*://)[^/?\#]*\.#i', $requestUrl)) {
            $requestUrl = 'https://'.$requestUrl;
        }

        $parts = parse_url($requestUrl);
        if ($parts === false) {
            // Seriously malformed input: fall back to the defaults below.
            $parts = [];
        }

        $scheme = $parts['scheme'] ?? 'https';
        $host = $parts['host'] ?? 'arxiv.org';
        // Keep an explicit port so that non-default endpoints keep working.
        $port = isset($parts['port']) ? ':'.$parts['port'] : '';
        $path = $parts['path'] ?? '/';

        // A relative path must not be glued directly onto the host name.
        if ($path !== '' && $path[0] !== '/') {
            $path = '/'.$path;
        }

        $query = [];
        if (! empty($parts['query'])) {
            parse_str((string) $parts['query'], $query);
        }

        $query['skip'] = max(0, $skip);
        $query['show'] = min(2000, max(1, $show));

        return $scheme.'://'.$host.$port.$path.'?'.http_build_query($query);
    }
}
|