where('status', 1)
            ->where('target_type', $targetType)
            ->orderBy('sort')
            ->get();

        // First source that declares a catch-all entry ("*" or "any"); used only as the last resort.
        $wildcard = null;

        foreach ($sources as $source) {
            foreach ($source->match_domains ?? [] as $domain) {
                $domain = strtolower(trim((string) $domain));
                if ($domain === '*' || $domain === 'any') {
                    // Remember only the first catch-all source (in query order) and keep scanning.
                    $wildcard ??= $source;
                    continue;
                }
                if ($this->hostMatchesDomain($host, $domain)) {
                    // An explicit domain match wins immediately.
                    return $source;
                }
            }
        }

        // No explicit domain matched: try URL-based hints before falling back to the catch-all.
        $hint = $this->resolveByUrlHint($requestUrl, $targetType, $sources);
        if ($hint) {
            return $hint;
        }

        return $wildcard;
    }

    /**
     * Decide whether a host is covered by a single configured domain pattern.
     *
     * The host is normalized first (trimmed, lower-cased, one leading "www." removed).
     * Pattern handling:
     *  - "", "*" and "any" never match here;
     *  - "*.example.com" matches "example.com" itself and any host ending in ".example.com";
     *  - any other pattern is normalized like the host and matches the host itself
     *    or any host ending in "." followed by the pattern.
     *
     * @param string $host   Host name to test.
     * @param string $domain Configured domain pattern.
     *
     * @return bool True when the host is covered by the pattern.
     */
    protected function hostMatchesDomain(string $host, string $domain): bool
    {
        $host = $this->normalizeHost($host);
        $domain = strtolower(trim($domain));

        if ($domain === '' || $domain === '*' || $domain === 'any') {
            return false;
        }

        if (str_starts_with($domain, '*.')) {
            $base = substr($domain, 2);

            // A bare "*." carries no domain part. Without this guard it would match
            // an empty host and every host that merely ends with a dot.
            if ($base === '') {
                return false;
            }

            return $host === $base || str_ends_with($host, '.'.$base);
        }

        $domain = $this->normalizeHost($domain);

        return $host === $domain || str_ends_with($host, '.'.$domain);
    }

    /**
     * Normalize a host name for comparison: trim surrounding whitespace,
     * lower-case it and remove a single leading "www." prefix.
     *
     * @param string $host Raw host name.
     *
     * @return string Normalized host name.
     */
    protected function normalizeHost(string $host): string
    {
        $host = strtolower(trim($host));

        if (str_starts_with($host, 'www.')) {
            return substr($host, 4);
        }

        return $host;
    }

    /**
     * Fallback used when no domain matched exactly: pick a source from
     * characteristics of the URL (e.g. pedaily sub-domains, arxiv paths).
     *
     * The check is a case-insensitive substring test on the whole request URL,
     * so the keyword may appear in the host, the path or the query string.
     *
     * @param string                         $requestUrl URL being resolved.
     * @param string                         $targetType Target type the sources were selected for.
     * @param \Illuminate\Support\Collection $sources    Candidate sources to pick from.
     *
     * @return CrawlSource|null The source whose adapter_code matches the hint, or null
     *                          when no hint applies or no such source is in the collection.
     */
    protected function resolveByUrlHint(string $requestUrl, string $targetType, $sources): ?CrawlSource
    {
        $lower = strtolower($requestUrl);

        if ($targetType === 'industry_news' && str_contains($lower, 'pedaily')) {
            return $sources->firstWhere('adapter_code', 'pedaily_html');
        }

        // "arxiv" already covers "arxiv.org", so a single substring test is sufficient.
        if ($targetType === 'paper' && str_contains($lower, 'arxiv')) {
            return $sources->firstWhere('adapter_code', 'arxiv_api');
        }

        return null;
    }
}