You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
95 lines
2.6 KiB
95 lines
2.6 KiB
<?php
|
|
|
|
namespace App\Services\Crawl;
|
|
|
|
use App\Models\CrawlSource;
|
|
|
|
/**
 * Picks the CrawlSource that should handle a given request URL.
 *
 * Resolution order, as implemented in resolve():
 *   1. the first source (in `sort` order) that lists a domain matching the URL host;
 *   2. a source chosen from URL substring hints (see resolveByUrlHint());
 *   3. the first source that lists a wildcard entry (`*` or `any`);
 *   4. null when nothing applies.
 */
class CrawlSourceResolver
{
    /**
     * Resolve the crawl source for a URL and target type.
     *
     * @param string $requestUrl URL about to be crawled; its host is extracted with parse_url().
     * @param string $targetType Value compared against the `target_type` column.
     *
     * @return CrawlSource|null The matching source, or null when the URL has no
     *                          parseable host or no source applies.
     */
    public function resolve(string $requestUrl, string $targetType): ?CrawlSource
    {
        // parse_url() yields null/false for URLs without a host; the cast turns both into ''.
        $host = strtolower((string) parse_url($requestUrl, PHP_URL_HOST));
        if ($host === '') {
            return null;
        }

        // NOTE(review): status = 1 presumably means "enabled" — confirm against the model.
        $sources = CrawlSource::query()
            ->where('status', 1)
            ->where('target_type', $targetType)
            ->orderBy('sort')
            ->get();

        $wildcard = null;

        foreach ($sources as $source) {
            foreach ($source->match_domains ?? [] as $domain) {
                $domain = strtolower(trim((string) $domain));

                if ($domain === '*' || $domain === 'any') {
                    // Remember only the first wildcard source; it is used as the last resort.
                    $wildcard ??= $source;

                    continue;
                }

                if ($this->hostMatchesDomain($host, $domain)) {
                    return $source;
                }
            }
        }

        // URL hints take precedence over the wildcard fallback.
        return $this->resolveByUrlHint($requestUrl, $targetType, $sources) ?? $wildcard;
    }

    /**
     * Tell whether a host belongs to a configured domain entry.
     *
     * A plain entry (`example.com`) matches the domain itself and any subdomain.
     * A `*.example.com` entry matches the same set; its base is compared as written
     * (a leading `www.` in the base is not stripped). Comparison is case-insensitive
     * and ignores a leading `www.` on the host as well as a trailing root dot on
     * either side. Wildcard-only entries (`*`, `any`) and empty entries never match
     * here; the caller handles them separately.
     *
     * @param string $host   Host name taken from the request URL.
     * @param string $domain Configured domain entry.
     */
    protected function hostMatchesDomain(string $host, string $domain): bool
    {
        $host = $this->normalizeHost($host);
        // Drop a trailing root dot so "example.com." and "*.example.com." behave like their dot-less forms.
        $domain = rtrim(strtolower(trim($domain)), '.');

        if ($host === '' || $domain === '' || $domain === '*' || $domain === 'any') {
            return false;
        }

        if (str_starts_with($domain, '*.')) {
            $base = substr($domain, 2);

            // An empty base would turn the suffix into a bare "." and match unrelated hosts.
            return $base !== '' && ($host === $base || str_ends_with($host, '.' . $base));
        }

        $domain = $this->normalizeHost($domain);

        // The leading dot keeps "badexample.com" from matching "example.com".
        return $domain !== '' && ($host === $domain || str_ends_with($host, '.' . $domain));
    }

    /**
     * Canonicalize a host name for comparison: trim, lower-case, remove a
     * trailing root dot and strip one leading `www.` label.
     *
     * @param string $host Raw host name.
     *
     * @return string Normalized host; may be empty when the input held nothing else.
     */
    protected function normalizeHost(string $host): string
    {
        $host = rtrim(strtolower(trim($host)), '.');

        if (str_starts_with($host, 'www.')) {
            return substr($host, 4);
        }

        return $host;
    }

    /**
     * Fall back on URL features when no configured domain matched
     * (e.g. a pedaily subdomain or an arxiv path).
     *
     * The checks are plain substring tests against the whole lower-cased URL,
     * not only its host.
     *
     * @param \Illuminate\Support\Collection<int, CrawlSource> $sources Candidate sources already
     *                                                                  filtered by resolve().
     *
     * @return CrawlSource|null The source whose `adapter_code` corresponds to the hint,
     *                          or null when no hint applies or no such source is present.
     */
    protected function resolveByUrlHint(string $requestUrl, string $targetType, $sources): ?CrawlSource
    {
        $lower = strtolower($requestUrl);

        if ($targetType === 'industry_news' && str_contains($lower, 'pedaily')) {
            return $sources->firstWhere('adapter_code', 'pedaily_html');
        }

        // "arxiv" already covers "arxiv.org", so a single substring test is enough.
        if ($targetType === 'paper' && str_contains($lower, 'arxiv')) {
            return $sources->firstWhere('adapter_code', 'arxiv_api');
        }

        return null;
    }
}
|