|
|
<?php
|
|
|
|
|
|
namespace App\Services\Crawl;
|
|
|
|
|
|
/**
 * Stateless helpers that turn a free-form keyword string into keyword lists,
 * test text against those keywords, and build an arXiv `search_query` value.
 */
class CrawlKeywordParser
{
    /**
     * Split raw input into individual keywords.
     *
     * Whitespace, ASCII and full-width commas, ASCII and full-width
     * semicolons, and line breaks all act as separators. Duplicates are
     * removed (case-sensitively) and first-seen order is kept.
     *
     * @param  string|null  $raw  User-supplied keyword string.
     * @return list<string> Empty when $raw is null/blank, or when the split
     *                      fails (e.g. $raw is not valid UTF-8).
     */
    public static function parse(?string $raw): array
    {
        $raw = trim((string) $raw);
        if ($raw === '') {
            return [];
        }

        $parts = preg_split('/[\s,,;;\n\r]+/u', $raw, -1, PREG_SPLIT_NO_EMPTY);

        return $parts === false ? [] : self::uniqueNonEmpty($parts);
    }

    /**
     * Tell whether the title or the summary contains at least one keyword.
     *
     * Matching is a case-insensitive, multibyte-aware substring search.
     * An empty keyword list means "no filter" and therefore always matches.
     *
     * @param  string        $title
     * @param  string|null   $summary
     * @param  list<string>  $keywords
     */
    public static function matchesAny(string $title, ?string $summary, array $keywords): bool
    {
        if ($keywords === []) {
            return true;
        }

        // Join with a newline rather than nothing: otherwise a keyword could
        // match text that straddles the end of the title and the start of the
        // summary (title "Deep" + summary "Learning" matching "deeplearning").
        // Keywords produced by parse()/parsePhrases() never contain a newline.
        $haystack = $summary === null ? $title : $title."\n".$summary;

        foreach ($keywords as $keyword) {
            // An empty needle is skipped so it can never count as a hit.
            if ($keyword !== '' && mb_stripos($haystack, $keyword) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Split raw input into search phrases.
     *
     * Only commas, semicolons (ASCII or full-width) and line breaks separate
     * phrases; spaces stay inside a phrase. Input without any separator is
     * returned as a single phrase. Duplicates are removed and first-seen
     * order is kept.
     *
     * @param  string|null  $raw  User-supplied keyword string.
     * @return list<string>
     */
    public static function parsePhrases(?string $raw): array
    {
        $raw = trim((string) $raw);
        if ($raw === '') {
            return [];
        }

        // With no separator present preg_split() already yields [$raw], so a
        // separate "does it contain a separator" probe is unnecessary.
        $parts = preg_split('/[,,;;\n\r]+/u', $raw, -1, PREG_SPLIT_NO_EMPTY);
        if ($parts === false) {
            // The pattern could not be applied (e.g. invalid UTF-8): keep the
            // whole input as one phrase.
            $parts = [$raw];
        }

        return self::uniqueNonEmpty($parts);
    }

    /**
     * Build the arXiv `search_query` string for the given raw keywords.
     *
     * Terms inside one phrase are combined with AND; separate phrases are
     * combined with OR. Every term is prefixed with `all:` after stripping
     * every character that is not a letter, digit, whitespace, hyphen or
     * underscore. When no usable term remains the result is `all:*`.
     *
     * @param  string|null  $raw  User-supplied keyword string.
     */
    public static function buildArxivSearchQuery(?string $raw): string
    {
        $clauses = [];

        foreach (self::parsePhrases($raw) as $phrase) {
            $terms = preg_split('/\s+/u', $phrase, -1, PREG_SPLIT_NO_EMPTY) ?: [];

            $ands = [];
            foreach ($terms as $term) {
                $clean = preg_replace('/[^\p{L}\p{N}\s\-_]/u', '', $term);
                // preg_replace() returns null on a PCRE error; without the
                // is_string() check that would emit a bare "all:" clause.
                if (is_string($clean) && $clean !== '') {
                    $ands[] = 'all:'.$clean;
                }
            }

            if ($ands === []) {
                continue;
            }

            // Parentheses are only needed when a phrase has several terms.
            $clauses[] = count($ands) === 1 ? $ands[0] : '('.implode(' AND ', $ands).')';
        }

        if ($clauses === []) {
            return 'all:*';
        }

        return count($clauses) === 1
            ? $clauses[0]
            : '('.implode(' OR ', $clauses).')';
    }

    /**
     * Trim every entry, drop empty ones and remove duplicates, keeping the
     * first occurrence of each value and returning a re-indexed list.
     *
     * @param  array<int, string>  $parts
     * @return list<string>
     */
    private static function uniqueNonEmpty(array $parts): array
    {
        $trimmed = array_map('trim', $parts);

        // Explicit callback: the default array_filter() would also drop "0".
        $nonEmpty = array_filter($trimmed, static function (string $part): bool {
            return $part !== '';
        });

        return array_values(array_unique($nonEmpty));
    }
}
|