You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

117 lines
3.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
namespace App\Services\Crawl;
class CrawlKeywordParser
{
    /**
     * Delimiters between individual keywords: whitespace (including line
     * breaks), ASCII comma/semicolon, and the full-width comma (U+FF0C) and
     * semicolon (U+FF1B). The full-width characters are written as escapes so
     * the source file contains no visually ambiguous Unicode punctuation.
     */
    private const KEYWORD_DELIMITERS = '/[\s,;\x{FF0C}\x{FF1B}]+/u';

    /**
     * Delimiters between search phrases: same punctuation and line breaks as
     * above, but NOT plain spaces, so a phrase may contain several words.
     */
    private const PHRASE_DELIMITERS = '/[,;\x{FF0C}\x{FF1B}\n\r]+/u';

    /**
     * Split a raw string into individual keywords.
     *
     * Keywords may be separated by whitespace, ASCII or full-width commas,
     * ASCII or full-width semicolons, or line breaks. Each keyword is trimmed;
     * empty entries and exact duplicates are dropped, first occurrence wins.
     *
     * @return list<string>
     */
    public static function parse(?string $raw): array
    {
        $raw = trim((string) $raw);
        if ($raw === '') {
            return [];
        }

        $parts = preg_split(self::KEYWORD_DELIMITERS, $raw, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on a PCRE error (e.g. malformed UTF-8);
        // treat that as "no keywords".
        return self::trimAndDedupe($parts === false ? [] : $parts);
    }

    /**
     * Whether the title or the summary contains at least one of the keywords
     * (case-insensitive, multibyte-aware substring match).
     *
     * An empty keyword list means "no filter" and always matches.
     *
     * @param list<string> $keywords
     */
    public static function matchesAny(string $title, ?string $summary, array $keywords): bool
    {
        if ($keywords === []) {
            return true;
        }

        // Join with a line break so a keyword cannot match across the
        // title/summary boundary (e.g. "deep" + "learning" must not hit "plea").
        $haystack = $summary === null ? $title : $title."\n".$summary;

        foreach ($keywords as $keyword) {
            if ($keyword !== '' && mb_stripos($haystack, $keyword) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Split a raw string into search phrases.
     *
     * Phrases are separated by ASCII or full-width commas, ASCII or full-width
     * semicolons, or line breaks. When no delimiter is present the whole
     * trimmed input is a single phrase. Spaces inside a phrase are kept;
     * buildArxivSearchQuery() later turns them into AND.
     *
     * @return list<string>
     */
    public static function parsePhrases(?string $raw): array
    {
        $raw = trim((string) $raw);
        if ($raw === '') {
            return [];
        }

        $parts = preg_split(self::PHRASE_DELIMITERS, $raw, -1, PREG_SPLIT_NO_EMPTY);
        if ($parts === false) {
            // PCRE error (e.g. malformed UTF-8): fall back to the whole input
            // as one phrase.
            $parts = [$raw];
        }

        return self::trimAndDedupe($parts);
    }

    /**
     * Build the arXiv `search_query` string.
     *
     * Words inside one phrase are combined with AND, separate phrases with OR.
     * Every word is stripped of characters other than letters, digits,
     * whitespace, hyphen and underscore, then prefixed with `all:`.
     * Returns `all:*` when no usable term remains.
     */
    public static function buildArxivSearchQuery(?string $raw): string
    {
        $clauses = [];

        foreach (self::parsePhrases($raw) as $phrase) {
            $terms = preg_split('/\s+/u', $phrase, -1, PREG_SPLIT_NO_EMPTY) ?: [];

            $ands = [];
            foreach ($terms as $term) {
                $clean = preg_replace('/[^\p{L}\p{N}\s\-_]/u', '', $term);
                // preg_replace() returns null on a PCRE error; never emit a
                // bare "all:" in that case.
                if ($clean === null || $clean === '') {
                    continue;
                }
                $ands[] = 'all:'.$clean;
            }

            if ($ands === []) {
                continue;
            }

            // Only parenthesise when there is more than one AND-ed term.
            $clauses[] = count($ands) === 1 ? $ands[0] : '('.implode(' AND ', $ands).')';
        }

        if ($clauses === []) {
            return 'all:*';
        }

        return count($clauses) === 1
            ? $clauses[0]
            : '('.implode(' OR ', $clauses).')';
    }

    /**
     * Trim every entry, drop empty strings and exact duplicates, keeping the
     * order of first occurrence.
     *
     * @param  array<int, string>  $parts
     * @return list<string>
     */
    private static function trimAndDedupe(array $parts): array
    {
        $unique = [];

        foreach ($parts as $part) {
            $item = trim($part);
            if ($item !== '' && ! in_array($item, $unique, true)) {
                $unique[] = $item;
            }
        }

        return $unique;
    }
}