You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

95 lines
2.6 KiB

<?php
namespace App\Services\Crawl;
use App\Models\CrawlSource;
/**
 * Picks the CrawlSource that should handle a requested URL.
 *
 * Resolution order:
 *  1. the first source (in `sort` order) that lists a domain matching the URL host;
 *  2. a source selected by a URL hint (see resolveByUrlHint());
 *  3. the first source (in `sort` order) that lists a catch-all domain ("*" or "any").
 */
class CrawlSourceResolver
{
    /** Domain entries that mean "accept any host" instead of naming a real domain. */
    private const WILDCARD_DOMAINS = ['*', 'any'];

    /**
     * Resolve the crawl source for a URL and target type.
     *
     * @param string $requestUrl URL to crawl; it must contain a host that parse_url() can extract.
     * @param string $targetType Value compared against the `target_type` column.
     *
     * @return CrawlSource|null The matching source, or null when the URL has no host
     *                          or nothing matches.
     */
    public function resolve(string $requestUrl, string $targetType): ?CrawlSource
    {
        // parse_url() yields null/false when no host can be extracted; both cast to ''.
        $host = strtolower((string) parse_url($requestUrl, PHP_URL_HOST));
        if ($host === '') {
            return null;
        }

        // NOTE(review): status = 1 presumably means "enabled" - confirm against the model.
        $sources = CrawlSource::query()
            ->where('status', 1)
            ->where('target_type', $targetType)
            ->orderBy('sort')
            ->get();

        $wildcard = null;
        foreach ($sources as $source) {
            foreach ($source->match_domains ?? [] as $domain) {
                $domain = strtolower(trim((string) $domain));

                if (in_array($domain, self::WILDCARD_DOMAINS, true)) {
                    // Remember only the first catch-all; a concrete domain match on a
                    // later source still takes precedence over it.
                    $wildcard ??= $source;
                    continue;
                }

                if ($this->hostMatchesDomain($host, $domain)) {
                    return $source;
                }
            }
        }

        return $this->resolveByUrlHint($requestUrl, $targetType, $sources) ?? $wildcard;
    }

    /**
     * Tell whether a host belongs to a configured domain entry.
     *
     * Supported entry forms:
     *  - "example.com"   matches example.com and any of its subdomains;
     *  - "*.example.com" matches the same set (the bare domain is included).
     * A leading "www." and a trailing root dot are ignored on the host. Catch-all
     * entries ("*", "any") and entries that are empty after normalization never match.
     *
     * @param string $host   Host taken from the request URL.
     * @param string $domain Configured domain entry.
     */
    protected function hostMatchesDomain(string $host, string $domain): bool
    {
        $host = $this->normalizeHost($host);
        $domain = strtolower(trim($domain));

        if ($host === '' || $domain === '' || in_array($domain, self::WILDCARD_DOMAINS, true)) {
            return false;
        }

        if (str_starts_with($domain, '*.')) {
            // Drop the "*." prefix and a possible trailing root dot.
            $base = rtrim(substr($domain, 2), '.');
            if ($base === '') {
                return false;
            }

            return $host === $base || str_ends_with($host, '.' . $base);
        }

        $domain = $this->normalizeHost($domain);
        if ($domain === '') {
            // Without this guard the suffix test below would degrade to
            // "host ends with '.'" and match unrelated hosts.
            return false;
        }

        // The '.' prefix keeps "notexample.com" from matching "example.com".
        return $host === $domain || str_ends_with($host, '.' . $domain);
    }

    /**
     * Canonicalize a host name for comparison: trim whitespace, lower-case,
     * strip the trailing root dot of a fully-qualified name ("example.com.")
     * and drop a leading "www.".
     */
    protected function normalizeHost(string $host): string
    {
        $host = rtrim(strtolower(trim($host)), '.');

        return str_starts_with($host, 'www.') ? substr($host, 4) : $host;
    }

    /**
     * Fallback used when no configured domain matched: choose a source from
     * markers found in the URL (e.g. pedaily subdomains, arxiv paths).
     *
     * The marker is searched for in the whole lower-cased URL, so it may also
     * match inside the path or the query string.
     *
     * @param \Illuminate\Support\Collection<int, CrawlSource> $sources Candidates already filtered by target type.
     *
     * @return CrawlSource|null First candidate whose `adapter_code` equals the hinted adapter, or null.
     */
    protected function resolveByUrlHint(string $requestUrl, string $targetType, $sources): ?CrawlSource
    {
        $lower = strtolower($requestUrl);

        $adapterCode = match (true) {
            $targetType === 'industry_news' && str_contains($lower, 'pedaily') => 'pedaily_html',
            // 'arxiv' already covers 'arxiv.org', so a single test is enough.
            $targetType === 'paper' && str_contains($lower, 'arxiv') => 'arxiv_api',
            default => null,
        };

        return $adapterCode === null ? null : $sources->firstWhere('adapter_code', $adapterCode);
    }
}