You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

117 lines
3.1 KiB

2 weeks ago
<?php
namespace App\Services\Crawl;
/**
 * Parses user-supplied crawl keywords and turns them into title/summary
 * filters and arXiv `search_query` strings.
 */
class CrawlKeywordParser
{
    /**
     * Keyword separators: any whitespace (line breaks included) plus ASCII and
     * full-width (Chinese) commas and semicolons.
     */
    private const KEYWORD_SPLIT_PATTERN = '/[\s,;\x{FF0C}\x{FF1B}]+/u';

    /**
     * Phrase separators: ASCII and full-width (Chinese) commas and semicolons,
     * plus line breaks. Spaces are deliberately NOT separators here so that a
     * phrase can contain several words.
     */
    private const PHRASE_SPLIT_PATTERN = '/[,;\x{FF0C}\x{FF1B}\n\r]+/u';

    /** Query returned when the input yields no usable search term. */
    private const ARXIV_MATCH_ALL = 'all:*';

    /**
     * Split a raw string into individual keywords.
     *
     * Keywords may be separated by whitespace, ASCII or full-width commas,
     * semicolons, or line breaks. Duplicates (exact, case-sensitive) are
     * dropped and first-seen order is preserved.
     *
     * @param string|null $raw Raw user input; null or blank yields an empty list.
     *
     * @return list<string>
     */
    public static function parse(?string $raw): array
    {
        $raw = trim((string) $raw);
        if ($raw === '') {
            return [];
        }

        // preg_split() returns false on a PCRE error (e.g. invalid UTF-8);
        // treat that as "no keywords".
        $parts = preg_split(self::KEYWORD_SPLIT_PATTERN, $raw, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        return self::uniqueTrimmed($parts);
    }

    /**
     * Tell whether the title or the summary contains at least one keyword
     * (case-insensitive substring match).
     *
     * An empty keyword list means "no filter" and therefore always matches.
     *
     * @param string       $title    Title to search.
     * @param string|null  $summary  Optional summary to search.
     * @param list<string> $keywords Keywords to look for; empty strings are ignored.
     */
    public static function matchesAny(string $title, ?string $summary, array $keywords): bool
    {
        if ($keywords === []) {
            return true;
        }

        // Search each field on its own so that a keyword cannot match across
        // the title/summary boundary (e.g. "deep" + "learning" vs "deeplearning").
        $fields = $summary === null ? [$title] : [$title, $summary];

        foreach ($keywords as $keyword) {
            if ($keyword === '') {
                continue;
            }
            foreach ($fields as $field) {
                if (mb_stripos($field, $keyword) !== false) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Split a raw string into search phrases on commas, semicolons (ASCII or
     * full-width) and line breaks.
     *
     * When no separator is present the whole trimmed input is one phrase.
     * Spaces inside a phrase are kept; buildArxivSearchQuery() later combines
     * the words of a phrase with AND. Duplicates (exact, case-sensitive) are
     * dropped and first-seen order is preserved.
     *
     * @param string|null $raw Raw user input; null or blank yields an empty list.
     *
     * @return list<string>
     */
    public static function parsePhrases(?string $raw): array
    {
        $raw = trim((string) $raw);
        if ($raw === '') {
            return [];
        }

        $parts = preg_split(self::PHRASE_SPLIT_PATTERN, $raw, -1, PREG_SPLIT_NO_EMPTY);
        if ($parts === false) {
            // PCRE could not process the input (e.g. invalid UTF-8): keep the
            // whole input as a single phrase rather than dropping it.
            $parts = [$raw];
        }

        return self::uniqueTrimmed($parts);
    }

    /**
     * Build an arXiv `search_query` string: words inside one phrase are joined
     * with AND, separate phrases are joined with OR.
     *
     * Every word is reduced to letters, digits, '-' and '_' and prefixed with
     * `all:`. Words that contain no letter or digit after cleaning are skipped.
     * When nothing usable remains the match-all query `all:*` is returned.
     *
     * @param string|null $raw Raw user input, in the format accepted by parsePhrases().
     */
    public static function buildArxivSearchQuery(?string $raw): string
    {
        $clauses = [];

        foreach (self::parsePhrases($raw) as $phrase) {
            $terms = preg_split('/\s+/u', $phrase, -1, PREG_SPLIT_NO_EMPTY) ?: [];

            $ands = [];
            foreach ($terms as $term) {
                // preg_replace() returns null on a PCRE error; the cast turns
                // that into '' so no bare "all:" term is ever emitted.
                $clean = (string) preg_replace('/[^\p{L}\p{N}\s\-_]/u', '', $term);

                // Require at least one letter or digit: a term such as "-" or
                // "__" would otherwise produce a meaningless "all:-" clause.
                if (preg_match('/[\p{L}\p{N}]/u', $clean) === 1) {
                    $ands[] = 'all:'.$clean;
                }
            }

            if ($ands === []) {
                continue;
            }

            $clauses[] = count($ands) === 1 ? $ands[0] : '('.implode(' AND ', $ands).')';
        }

        if ($clauses === []) {
            return self::ARXIV_MATCH_ALL;
        }

        return count($clauses) === 1
            ? $clauses[0]
            : '('.implode(' OR ', $clauses).')';
    }

    /**
     * Trim every part, drop empty ones and remove exact duplicates while
     * keeping first-seen order.
     *
     * @param array<int, string> $parts
     *
     * @return list<string>
     */
    private static function uniqueTrimmed(array $parts): array
    {
        $unique = [];
        foreach ($parts as $part) {
            $value = trim($part);
            if ($value !== '' && ! in_array($value, $unique, true)) {
                $unique[] = $value;
            }
        }

        return $unique;
    }
}