You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
138 lines
3.9 KiB
138 lines
3.9 KiB
|
1 day ago
|
<?php
|
||
|
|
|
||
|
|
namespace App\Services\Crawl;
|
||
|
|
|
||
|
|
use App\Models\CrawlAddress;
|
||
|
|
use Illuminate\Support\Collection;
|
||
|
|
|
||
|
|
/**
 * Resolves the `name` of the configured CrawlAddress that a crawled item belongs to.
 *
 * Two strategies are offered, plus a combined entry point:
 *  - resolveByRequestUrl(): exact match on the normalised request URL of the crawl job;
 *  - resolveBySourceUrl(): match on the host of the article URL, disambiguated by path
 *    prefix when several addresses share that host;
 *  - resolveForNews(): the first strategy, falling back to the second.
 *
 * Address lists are loaded once per target type and memoised on the instance.
 */
class CrawlAddressSourceResolver
{
    /** @var array<string, Collection<int, CrawlAddress>> Addresses already loaded, keyed by target type. */
    protected array $cache = [];

    /**
     * Find the address whose request URL equals the given URL after normalisation.
     *
     * @param  string|null  $url         Request URL of the crawl job; null or blank yields null.
     * @param  string       $targetType  Value matched against the `target_type` column.
     * @return string|null  The matching address name, or null when nothing matches.
     */
    public function resolveByRequestUrl(?string $url, string $targetType = 'industry_news'): ?string
    {
        $normalized = $this->normalizeUrl($url);
        if ($normalized === '') {
            return null;
        }

        return $this->addresses($targetType)
            ->first(fn (CrawlAddress $address) => $this->normalizeUrl($address->request_url) === $normalized)
            ?->name;
    }

    /**
     * Find the address that shares its host with the given article URL.
     *
     * A single address on that host wins outright. With several, the one whose list-path
     * prefix (see listPathPrefix()) is the longest prefix of the article path wins; on equal
     * length the earlier address in the list keeps its place. If none of the candidates
     * prefixes the article path, the result is null.
     *
     * Both the article URL and each stored request URL are trimmed and given a default
     * scheme before parsing, so scheme-less values such as "example.com/news/" are matched
     * here the same way resolveByRequestUrl() already accepts them.
     *
     * @param  string|null  $sourceUrl   URL of the article; null, blank or host-less yields null.
     * @param  string       $targetType  Value matched against the `target_type` column.
     * @return string|null  The matching address name, or null when it cannot be determined.
     */
    public function resolveBySourceUrl(?string $sourceUrl, string $targetType = 'industry_news'): ?string
    {
        $source = $this->urlParts($sourceUrl);
        $host = strtolower((string) ($source['host'] ?? ''));
        if ($host === '') {
            return null;
        }

        // $host is non-empty here, so equality also rules out addresses without a host.
        $matches = $this->addresses($targetType)
            ->filter(fn (CrawlAddress $address) => $this->hostOf($address->request_url) === $host)
            ->values();

        if ($matches->isEmpty()) {
            return null;
        }

        if ($matches->count() === 1) {
            return $matches->first()->name;
        }

        $path = (string) (($source['path'] ?? null) ?: '/');
        $bestName = null;
        $bestLength = 0;

        foreach ($matches as $address) {
            $prefix = $this->listPathPrefix((string) $address->request_url);
            $length = strlen($prefix);

            // Strictly longer only: ties are left with the address that came first.
            if ($prefix !== '' && $length > $bestLength && str_starts_with($path, $prefix)) {
                $bestName = $address->name;
                $bestLength = $length;
            }
        }

        return $bestName;
    }

    /**
     * Resolve a source name for a news item: exact request-URL match first, host/path match second.
     *
     * @param  string|null  $jobRequestUrl     Request URL of the crawl job.
     * @param  string|null  $articleSourceUrl  URL of the crawled article.
     * @param  string       $targetType        Value matched against the `target_type` column.
     * @return string|null  The resolved address name, or null when both strategies fail.
     */
    public function resolveForNews(?string $jobRequestUrl, ?string $articleSourceUrl, string $targetType = 'industry_news'): ?string
    {
        return $this->resolveByRequestUrl($jobRequestUrl, $targetType)
            ?? $this->resolveBySourceUrl($articleSourceUrl, $targetType);
    }

    /**
     * Fixed list of source names.
     *
     * NOTE(review): judging by the method name these are the placeholder names written by
     * generic adapters - confirm against the callers.
     *
     * @return list<string>
     */
    public function genericAdapterSourceNames(): array
    {
        return [
            '通用资讯 HTML',
            '爬虫采集',
        ];
    }

    /**
     * Load (once per target type) the addresses with the given target type and status = 1,
     * ordered by `sort` and then `name`.
     *
     * NOTE(review): status = 1 presumably means "enabled" - confirm against the model.
     *
     * @return Collection<int, CrawlAddress>
     */
    protected function addresses(string $targetType): Collection
    {
        return $this->cache[$targetType] ??= CrawlAddress::query()
            ->where('target_type', $targetType)
            ->where('status', 1)
            ->orderBy('sort')
            ->orderBy('name')
            ->get();
    }

    /**
     * Reduce a URL to a comparable "scheme://host/path" form.
     *
     * Surrounding whitespace is trimmed, "https://" is prepended when the value does not
     * start with http:// or https://, scheme and host are lower-cased, and trailing slashes
     * are removed from the path (an empty path becomes "/"). Port, query string and fragment
     * are not part of the result. Values that cannot be parsed into a host are returned
     * trimmed, scheme-prefixed and without trailing slashes.
     *
     * @return string The normalised URL, or '' for null/blank input.
     */
    protected function normalizeUrl(?string $url): string
    {
        $trimmed = trim((string) $url);
        if ($trimmed === '') {
            return '';
        }

        if (! preg_match('#^https?://#i', $trimmed)) {
            $trimmed = 'https://'.$trimmed;
        }

        $parts = parse_url($trimmed);
        if (! is_array($parts) || empty($parts['host'])) {
            return rtrim($trimmed, '/');
        }

        $scheme = strtolower((string) ($parts['scheme'] ?? 'https'));
        $host = strtolower((string) $parts['host']);
        $path = rtrim($parts['path'] ?? '/', '/') ?: '/';

        return $scheme.'://'.$host.$path;
    }

    /**
     * Directory part of a list-page URL's path, used as the prefix its articles should share.
     *
     * Trailing slashes are stripped first and everything after the last remaining slash is
     * dropped, so "/news/list.html" and "/news/list/" both give "/news/", while "/news/" and
     * "/news" both give "/". A missing path gives "/". The result can be '' for paths made up
     * only of slashes (e.g. "//"); the caller skips empty prefixes.
     *
     * @param  string  $url  Full URL, scheme-less "host/path" value, or bare "/path".
     */
    protected function listPathPrefix(string $url): string
    {
        $path = (string) (($this->urlParts($url)['path'] ?? null) ?: '/');
        if ($path === '/') {
            return '/';
        }

        $path = rtrim($path, '/');
        $lastSlash = strrpos($path, '/');
        if ($lastSlash === false) {
            return $path;
        }

        return substr($path, 0, $lastSlash + 1);
    }

    /**
     * Lower-cased host of a URL, or '' when it has none.
     */
    private function hostOf(?string $url): string
    {
        return strtolower((string) ($this->urlParts($url)['host'] ?? ''));
    }

    /**
     * parse_url() with the same leniency normalizeUrl() applies: the value is trimmed and a
     * value without a scheme is treated as "https://<value>".
     *
     * Inputs that parse_url() already understands are passed through untouched: anything
     * with a "scheme://" prefix, protocol-relative URLs ("//host/...") and bare paths ("/...").
     *
     * @return array<string, int|string> Components as returned by parse_url(); [] when blank or unparseable.
     */
    private function urlParts(?string $url): array
    {
        $candidate = trim((string) $url);
        if ($candidate === '') {
            return [];
        }

        $parseableAsIs = str_starts_with($candidate, '/')
            || preg_match('#^[a-z][a-z0-9+.\-]*://#i', $candidate) === 1;
        if (! $parseableAsIs) {
            $candidate = 'https://'.$candidate;
        }

        $parts = parse_url($candidate);

        return is_array($parts) ? $parts : [];
    }
}
|