You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

138 lines
3.9 KiB

1 day ago
<?php
namespace App\Services\Crawl;
use App\Models\CrawlAddress;
use Illuminate\Support\Collection;
/**
 * Resolves the name of the configured crawl address that a crawled item belongs to.
 *
 * Two strategies are offered: an exact match on the (normalised) request URL of the
 * crawl job, and a host + list-path-prefix match on the URL of the crawled article.
 * Addresses are loaded once per target type and memoised on the instance.
 */
class CrawlAddressSourceResolver
{
    /**
     * Crawl addresses already loaded, keyed by target type.
     *
     * @var array<string, Collection<int, CrawlAddress>>
     */
    protected array $cache = [];

    /**
     * Find the address whose request URL equals the given URL after normalisation.
     *
     * @param  string|null  $url         Request URL of the crawl job; null/blank yields null.
     * @param  string       $targetType  Value matched against the address `target_type` column.
     * @return string|null  The `name` of the first matching address, or null when none matches.
     */
    public function resolveByRequestUrl(?string $url, string $targetType = 'industry_news'): ?string
    {
        $wanted = $this->normalizeUrl($url);
        if ($wanted === '') {
            return null;
        }

        return $this->addresses($targetType)
            ->first(fn (CrawlAddress $address): bool => $this->normalizeUrl($address->request_url) === $wanted)
            ?->name;
    }

    /**
     * Find the address an article URL most plausibly belongs to.
     *
     * Matching rules:
     *  - only addresses whose request URL has the same host (case-insensitive) are considered;
     *  - exactly one host match: its name is returned regardless of the path;
     *  - several host matches: the address whose list-path prefix (see listPathPrefix()) is the
     *    longest prefix of the article path wins; on equal length the earliest address in the
     *    addresses() ordering is kept; null is returned when no prefix matches.
     *
     * Stored request URLs and the article URL may omit the scheme; "https://" is assumed in
     * that case, mirroring what normalizeUrl() does for request-URL matching.
     *
     * @param  string|null  $sourceUrl   URL of the crawled article.
     * @param  string       $targetType  Value matched against the address `target_type` column.
     * @return string|null  The matching address name, or null when it cannot be determined.
     */
    public function resolveBySourceUrl(?string $sourceUrl, string $targetType = 'industry_news'): ?string
    {
        $source = $this->withDefaultScheme($sourceUrl);
        $host = $this->hostOf($source);
        if ($host === '') {
            return null;
        }

        $candidates = $this->addresses($targetType)
            ->filter(fn (CrawlAddress $address): bool => $this->hostOf($address->request_url) === $host)
            ->values();

        if ($candidates->isEmpty()) {
            return null;
        }

        if ($candidates->count() === 1) {
            return $candidates->first()->name;
        }

        $articlePath = (string) (parse_url($source, PHP_URL_PATH) ?: '/');
        $bestName = null;
        $bestLength = 0;

        foreach ($candidates as $address) {
            // Give listPathPrefix() a URL with a scheme, otherwise the host would be parsed as path.
            $prefix = $this->listPathPrefix($this->withDefaultScheme($address->request_url));
            $length = strlen($prefix);

            // Strictly longer only: an empty prefix never wins and ties keep the earlier address.
            if ($length > $bestLength && str_starts_with($articlePath, $prefix)) {
                $bestName = $address->name;
                $bestLength = $length;
            }
        }

        return $bestName;
    }

    /**
     * Resolve a source name for a news item: the job's request URL is tried first,
     * the article's own URL is the fallback.
     *
     * @return string|null  The resolved address name, or null when both strategies fail.
     */
    public function resolveForNews(?string $jobRequestUrl, ?string $articleSourceUrl, string $targetType = 'industry_news'): ?string
    {
        return $this->resolveByRequestUrl($jobRequestUrl, $targetType)
            ?? $this->resolveBySourceUrl($articleSourceUrl, $targetType);
    }

    /**
     * Fixed list of source names.
     *
     * NOTE(review): judging by the method name these are the placeholder names written by
     * generic adapters - confirm against the callers.
     *
     * @return list<string>
     */
    public function genericAdapterSourceNames(): array
    {
        return [
            '通用资讯 HTML',
            '爬虫采集',
        ];
    }

    /**
     * Load (once per target type) the addresses with the given target type and status = 1,
     * ordered by `sort` and then `name`.
     *
     * @return Collection<int, CrawlAddress>
     */
    protected function addresses(string $targetType): Collection
    {
        return $this->cache[$targetType] ??= CrawlAddress::query()
            ->where('target_type', $targetType)
            ->where('status', 1)
            ->orderBy('sort')
            ->orderBy('name')
            ->get();
    }

    /**
     * Reduce a URL to a canonical "scheme://host/path" string for equality comparison.
     *
     * Scheme and host are lower-cased, a missing http(s) scheme defaults to https, trailing
     * slashes are dropped from the path (a bare host gets "/"), and port, query string and
     * fragment are discarded. Null or blank input yields ''. When the URL cannot be parsed
     * or has no host, the trimmed input (scheme-prefixed if needed) is returned without
     * trailing slashes.
     */
    protected function normalizeUrl(?string $url): string
    {
        $candidate = trim((string) $url);
        if ($candidate === '') {
            return '';
        }

        if (preg_match('#^https?://#i', $candidate) !== 1) {
            $candidate = 'https://'.$candidate;
        }

        $parts = parse_url($candidate);
        if (! is_array($parts) || empty($parts['host'])) {
            return rtrim($candidate, '/');
        }

        $scheme = strtolower((string) ($parts['scheme'] ?? 'https'));
        $host = strtolower((string) $parts['host']);
        // A missing, empty or slash-only path collapses to "/".
        $path = rtrim($parts['path'] ?? '/', '/') ?: '/';

        return $scheme.'://'.$host.$path;
    }

    /**
     * Directory part of a list URL's path, including the trailing slash.
     *
     * Trailing slashes are removed before the last segment is cut off, so
     * "https://example.com/news/list.html" gives "/news/" while both
     * "https://example.com/news/" and "https://example.com/news" give "/".
     * A URL without a path gives "/".
     */
    protected function listPathPrefix(string $url): string
    {
        $path = (string) (parse_url($url, PHP_URL_PATH) ?: '/');
        if ($path === '/') {
            return '/';
        }

        $path = rtrim($path, '/');
        $lastSlash = strrpos($path, '/');

        return $lastSlash === false ? $path : substr($path, 0, $lastSlash + 1);
    }

    /**
     * Lower-cased host of a URL, or '' when the URL is null, blank, malformed or has no host.
     * A URL without a scheme is parsed as if it started with "https://".
     */
    protected function hostOf(?string $url): string
    {
        // parse_url() yields null (component missing) or false (malformed); both cast to ''.
        return strtolower((string) parse_url($this->withDefaultScheme($url), PHP_URL_HOST));
    }

    /**
     * Trim a URL and prepend "https://" when it has no scheme, so parse_url() can see the host.
     *
     * Left untouched: blank input, anything starting with "/" (relative and protocol-relative
     * URLs) and anything that already starts with "<scheme>://".
     */
    protected function withDefaultScheme(?string $url): string
    {
        $trimmed = trim((string) $url);

        if (
            $trimmed === ''
            || str_starts_with($trimmed, '/')
            || preg_match('#^[a-z][a-z0-9+.\-]*://#i', $trimmed) === 1
        ) {
            return $trimmed;
        }

        return 'https://'.$trimmed;
    }
}