withHeaders([ 'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)', 'Accept' => 'text/html,application/xhtml+xml', 'Accept-Language' => 'zh-CN,zh;q=0.9,en;q=0.8', ]) ->get($url); if (! $response->successful()) { throw new \RuntimeException('页面请求失败:HTTP '.$response->status()); } return $response->body(); }

    /**
     * Reduce a URL to a comparable host key: the lower-cased host with a
     * leading "www." removed.
     *
     * @param  string|null  $url  Any URL; null or unparsable input is tolerated.
     * @return string|null  The host key, or null when no host can be extracted.
     */
    public static function hostKey(?string $url): ?string
    {
        // parse_url() yields null/false when there is no host; both cast to ''.
        $host = strtolower((string) parse_url((string) $url, PHP_URL_HOST));
        if ($host === '') {
            return null;
        }

        return str_starts_with($host, 'www.') ? substr($host, 4) : $host;
    }

    /**
     * Tell whether two host keys are both present and strictly identical.
     *
     * @param  string|null  $a  First host key.
     * @param  string|null  $b  Second host key.
     * @return bool  False whenever either side is null.
     */
    public static function sameHost(?string $a, ?string $b): bool
    {
        return $a !== null && $b !== null && $a === $b;
    }

    /**
     * Resolve an href found in a page against that page's URL.
     *
     * @param  string  $href  Raw attribute value; HTML entities are decoded.
     * @param  string  $base  URL of the document the href was found in.
     * @return string|null  An absolute URL, or null when the href is empty,
     *                      a pure fragment, uses a scheme other than
     *                      http/https, or the base URL has no scheme/host.
     */
    public static function absoluteUrl(string $href, string $base): ?string
    {
        $href = html_entity_decode(trim($href));
        if ($href === '' || $href[0] === '#') {
            return null;
        }

        // Anything of the form "scheme:" is already absolute. Only http(s)
        // links are kept; javascript:, mailto:, tel:, data:, ftp: etc. are
        // dropped. Requiring the colon keeps relative files such as
        // "http-guide.html" from being mistaken for absolute URLs.
        if (preg_match('#^([a-z][a-z0-9+.\-]*):#i', $href, $scheme) === 1) {
            return in_array(strtolower($scheme[1]), ['http', 'https'], true) ? $href : null;
        }

        $parts = parse_url($base);
        if (! $parts || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        // Protocol-relative reference: only the scheme comes from the base.
        if (str_starts_with($href, '//')) {
            return $parts['scheme'].':'.$href;
        }

        // Keep a non-default port so links stay on the same origin.
        $origin = $parts['scheme'].'://'.$parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':'.$parts['port'];
        }

        $basePath = $parts['path'] ?? '/';

        // Query-only reference: same document, different query string.
        if ($href[0] === '?') {
            return $origin.$basePath.$href;
        }

        // Split off query/fragment so dot-segment handling only sees the path.
        $cut = strcspn($href, '?#');
        $path = substr($href, 0, $cut);
        $suffix = substr($href, $cut);

        if ($path[0] !== '/') {
            // Relative path: resolve against the directory of the base document.
            $dir = str_contains($basePath, '/')
                ? substr($basePath, 0, (int) strrpos($basePath, '/') + 1)
                : '/';
            $path = $dir.$path;
        }

        return $origin.self::collapseDotSegments($path).$suffix;
    }

    /**
     * Remove "." and ".." segments from an absolute URL path.
     *
     * @param  string  $path  Path beginning with "/".
     * @return string  The path with dot segments resolved; ".." never climbs
     *                 above the root.
     */
    private static function collapseDotSegments(string $path): string
    {
        $segments = explode('/', $path);
        $kept = [];

        foreach ($segments as $segment) {
            if ($segment === '.') {
                continue;
            }
            if ($segment === '..') {
                // Index 0 is the empty segment before the leading slash; keep it.
                if (count($kept) > 1) {
                    array_pop($kept);
                }

                continue;
            }
            $kept[] = $segment;
        }

        // A trailing "." or ".." denotes a directory, so keep the final slash.
        $last = end($segments);
        if ($last === '.' || $last === '..') {
            $kept[] = '';
        }

        return implode('/', $kept);
    }

    /**
     * Normalise a raw date snippet (possibly containing markup) to Y-m-d.
     *
     * @param  string|null  $raw  Raw text or HTML fragment.
     * @return string|null  The date as YYYY-MM-DD, or null when the input is
     *                      blank or holds no valid calendar date.
     */
    public static function normalizeDate(?string $raw): ?string
    {
        if ($raw === null || trim($raw) === '') {
            return null;
        }

        // Same recognition rules as free-text extraction, applied after the
        // markup and entities have been stripped.
        return self::extractDateFromText(trim(html_entity_decode(strip_tags($raw))));
    }

    /**
     * Decide whether a link's anchor text is too generic to serve as a title:
     * empty, a bare date, or a "read more"-style label.
     *
     * @param  string  $title  Anchor text.
     * @return bool  True when the text carries no usable title.
     */
    public static function isWeakLinkTitle(string $title): bool
    {
        $title = trim($title);
        if ($title === '') {
            return true;
        }

        // A bare date such as "2024-05-01", "2024/5" or "2024年5月1".
        if (preg_match('#^\d{4}[-/年]\d{1,2}([-/月日]\d{1,2})?$#u', $title)) {
            return true;
        }

        // A prefix match also covers the exact-equality case.
        foreach (['阅读全文', '查看更多', '查看详情', '详情', '点击进入', '更多>>', 'More'] as $noise) {
            if (str_starts_with($title, $noise)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Strip markup from an article title and drop a trailing
     * " - Site name"-style suffix.
     *
     * @param  string|null  $title  Raw title, possibly containing HTML.
     * @return string|null  The cleaned title, or null when nothing remains.
     */
    public static function cleanArticleTitle(?string $title): ?string
    {
        if ($title === null) {
            return null;
        }

        $title = trim(html_entity_decode(strip_tags($title)));
        if ($title === '') {
            return null;
        }

        // Only cut at the first separator when the part before it is at least
        // 8 characters long, so short hyphenated titles stay intact.
        if (preg_match('/^(.+?)\s*[-_|–—]\s*.+$/u', $title, $m) && mb_strlen($m[1]) >= 8) {
            $title = trim($m[1]);
        }

        return $title;
    }

    /**
     * Find the first valid calendar date in free text.
     *
     * Dates written as "YYYY年M月D日" take precedence over numeric dates
     * separated by "-", "/" or ".".
     *
     * @param  string  $text  Text to scan.
     * @return string|null  The date as YYYY-MM-DD, or null when none is found.
     */
    public static function extractDateFromText(string $text): ?string
    {
        return self::firstValidDate('#(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日#u', $text)
            ?? self::firstValidDate('#(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})#', $text);
    }

    /**
     * Return the first match of a year/month/day pattern that is a real
     * calendar date.
     *
     * @param  string  $pattern  Regex whose groups 1-3 capture year, month, day.
     * @param  string  $text     Text to scan.
     * @return string|null  The date as YYYY-MM-DD, or null when no match passes
     *                      checkdate().
     */
    private static function firstValidDate(string $pattern, string $text): ?string
    {
        // preg_match_all() returns 0 for no match and false on a PCRE error;
        // both are treated as "nothing found".
        if (! preg_match_all($pattern, $text, $matches, PREG_SET_ORDER)) {
            return null;
        }

        foreach ($matches as [, $year, $month, $day]) {
            // Skip look-alikes such as "2024-13-45" instead of returning them.
            if (checkdate((int) $month, (int) $day, (int) $year)) {
                return sprintf('%04d-%02d-%02d', (int) $year, (int) $month, (int) $day);
            }
        }

        return null;
    }

    /**
     * Decide whether a link should be skipped based on its anchor text:
     * shorter than 8 characters, or containing a navigation/boilerplate word.
     *
     * @param  string  $title  Anchor text.
     * @return bool  True when the link should be skipped.
     */
    public static function isSkippableLinkTitle(string $title): bool
    {
        if (Str::length($title) < 8) {
            return true;
        }

        return Str::contains(
            $title,
            ['登录', '注册', '更多', '下一页', '上一页', '首页', '关于我们', '联系我们', '隐私', '版权'],
        );
    }

    /**
     * Tell whether a URL points at a static asset or download rather than a page.
     *
     * @param  string  $url  URL or path to inspect.
     * @return bool  True when a known asset extension is followed by the end
     *               of the string, a query string, or a fragment.
     */
    public static function isAssetPath(string $url): bool
    {
        return preg_match('~\.(?:css|js|png|jpe?g|gif|svg|ico|woff2?|pdf|zip)(?:[?#]|$)~i', $url) === 1;
    }
}