You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

156 lines
4.7 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
namespace App\Services\Crawl;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;
/**
 * Stateless helpers for HTML crawling: page fetching, URL resolution,
 * date extraction and link/title heuristics.
 *
 * Every method is static and keeps no state between calls.
 */
class HtmlCrawlSupport
{
    /**
     * Date patterns tried in order: the Chinese "Y年M月D日" form first, then
     * numeric dates separated by '-', '/' or '.'.
     */
    private const DATE_PATTERNS = [
        '#(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日#u',
        '#(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})#',
    ];

    /**
     * Download a page body with a GET request.
     *
     * @param string $url     Address to request.
     * @param int    $timeout Request timeout handed to the HTTP client.
     *
     * @return string The raw response body.
     *
     * @throws \RuntimeException When the response is not reported as successful.
     */
    public static function fetchHtml(string $url, int $timeout = 45): string
    {
        $response = Http::timeout($timeout)
            ->withHeaders([
                'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
                'Accept' => 'text/html,application/xhtml+xml',
                'Accept-Language' => 'zh-CN,zh;q=0.9,en;q=0.8',
            ])
            ->get($url);

        if (! $response->successful()) {
            throw new \RuntimeException('页面请求失败HTTP '.$response->status());
        }

        return $response->body();
    }

    /**
     * Reduce a URL to a comparable host key: lower-cased host without a
     * leading "www.".
     *
     * @return string|null Null when no host can be parsed out of $url.
     */
    public static function hostKey(?string $url): ?string
    {
        $host = strtolower((string) parse_url((string) $url, PHP_URL_HOST));
        if ($host === '') {
            return null;
        }

        return str_starts_with($host, 'www.') ? substr($host, 4) : $host;
    }

    /**
     * Tell whether two values are both non-null and strictly equal.
     *
     * The values are compared as-is; no normalisation is applied here.
     */
    public static function sameHost(?string $a, ?string $b): bool
    {
        if ($a === null || $b === null) {
            return false;
        }

        return $a === $b;
    }

    /**
     * Resolve an href found in a page against the page's own URL.
     *
     * @param string $href Raw attribute value (HTML entities are decoded).
     * @param string $base Absolute URL of the document containing the link.
     *
     * @return string|null The absolute http(s) URL, or null when the href is
     *                     empty, fragment-only, uses a non-http(s) scheme, or
     *                     $base has no scheme/host to resolve against.
     */
    public static function absoluteUrl(string $href, string $base): ?string
    {
        $href = trim(html_entity_decode(trim($href)));
        if ($href === '' || $href[0] === '#') {
            return null;
        }

        // Anything that starts with "<scheme>:" is already absolute. Only real
        // http(s) URLs are crawlable; javascript:, mailto:, tel:, data: and
        // friends are rejected regardless of letter case. Checking for "://"
        // (instead of the bare prefix "http") keeps relative names such as
        // "http-guide.html" out of this branch.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href) === 1) {
            return preg_match('#^https?://#i', $href) === 1 ? $href : null;
        }

        $parts = parse_url($base);
        if (! $parts || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        // Protocol-relative link: borrow only the scheme from the base.
        if (str_starts_with($href, '//')) {
            return $parts['scheme'].':'.$href;
        }

        // Keep a non-default port so links on e.g. host:8080 stay reachable.
        $origin = $parts['scheme'].'://'.$parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':'.$parts['port'];
        }

        if (str_starts_with($href, '/')) {
            return $origin.$href;
        }

        $basePath = $parts['path'] ?? '/';

        // A query-only reference replaces the query of the base document and
        // keeps its path intact.
        if ($href[0] === '?') {
            return $origin.$basePath.$href;
        }

        // Path-relative link: resolve against the directory of the base path
        // and collapse "./" and "../" so equivalent links yield equal URLs.
        $dir = str_contains($basePath, '/')
            ? substr($basePath, 0, (int) strrpos($basePath, '/') + 1)
            : '/';
        $cut = strcspn($href, '?#');
        $relativePath = substr($href, 0, $cut);
        $suffix = substr($href, $cut);

        return $origin.self::removeDotSegments($dir.$relativePath).$suffix;
    }

    /**
     * Normalise a raw date fragment (possibly containing markup or HTML
     * entities) to "YYYY-MM-DD".
     *
     * @return string|null Null when the input is null/blank or holds no valid
     *                     calendar date.
     */
    public static function normalizeDate(?string $raw): ?string
    {
        if ($raw === null) {
            return null;
        }

        $text = trim(html_entity_decode(strip_tags($raw)));

        return $text === '' ? null : self::extractDateFromText($text);
    }

    /**
     * Decide whether an anchor text is too generic to serve as a title:
     * empty, a bare date, or a "read more"-style label.
     */
    public static function isWeakLinkTitle(string $title): bool
    {
        $title = trim($title);
        if ($title === '') {
            return true;
        }

        // Bare dates such as "2023-05", "2023/5/12", "2023年5月" or
        // "2023年5月12日"; the optional trailing 月/日/号 lets complete
        // Chinese dates match.
        $datePattern = '#^\d{4}\s*[-/.年]\s*\d{1,2}(?:\s*[-/.月日]\s*\d{1,2})?\s*[月日号]?$#u';
        if (preg_match($datePattern, $title) === 1) {
            return true;
        }

        foreach (['阅读全文', '查看更多', '查看详情', '详情', '点击进入', '更多>>', 'More'] as $noise) {
            // Prefix match also covers exact equality.
            if (str_starts_with($title, $noise)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Strip markup from a page title and drop a trailing " - site name" style
     * suffix.
     *
     * The text before the first separator (-, _, |, en/em dash) replaces the
     * title only when it is at least 8 characters long; otherwise the cleaned
     * title is returned whole.
     *
     * @return string|null Null when the input is null or empty after cleaning.
     */
    public static function cleanArticleTitle(?string $title): ?string
    {
        if ($title === null) {
            return null;
        }

        $clean = trim(html_entity_decode(strip_tags($title)));
        if ($clean === '') {
            return null;
        }

        $hasSuffix = preg_match('/^(.+?)\s*[-_|–—]\s*.+$/u', $clean, $m) === 1;
        if ($hasSuffix && mb_strlen($m[1]) >= 8) {
            return trim($m[1]);
        }

        return $clean;
    }

    /**
     * Find the first valid calendar date in free text.
     *
     * Chinese-style dates take priority over numeric ones. Candidates that are
     * not real dates (month 13, February 30, ...) are skipped.
     *
     * @return string|null The date as "YYYY-MM-DD", or null when none is found.
     */
    public static function extractDateFromText(string $text): ?string
    {
        foreach (self::DATE_PATTERNS as $pattern) {
            if (! preg_match_all($pattern, $text, $matches, PREG_SET_ORDER)) {
                continue;
            }

            foreach ($matches as [, $year, $month, $day]) {
                if (checkdate((int) $month, (int) $day, (int) $year)) {
                    return sprintf('%04d-%02d-%02d', (int) $year, (int) $month, (int) $day);
                }
            }
        }

        return null;
    }

    /**
     * Decide whether a link title should be skipped: shorter than 8
     * characters, or containing a navigation/boilerplate keyword.
     */
    public static function isSkippableLinkTitle(string $title): bool
    {
        if (mb_strlen($title) < 8) {
            return true;
        }

        foreach (['登录', '注册', '更多', '下一页', '上一页', '首页', '关于我们', '联系我们', '隐私', '版权'] as $noise) {
            if (str_contains($title, $noise)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tell whether a URL points at a static asset or download, judged by its
     * file extension (before any query string or fragment).
     */
    public static function isAssetPath(string $url): bool
    {
        return preg_match('~\.(css|js|png|jpe?g|gif|svg|ico|woff2?|pdf|zip)(?:[?#]|$)~i', $url) === 1;
    }

    /**
     * Collapse "." and ".." segments of an absolute path without climbing
     * above the root.
     */
    private static function removeDotSegments(string $path): string
    {
        $segments = explode('/', $path);
        $last = count($segments) - 1;
        $resolved = [];

        foreach ($segments as $index => $segment) {
            if ($segment === '.' || $segment === '..') {
                // Index 0 is the empty segment before the leading slash; never pop it.
                if ($segment === '..' && count($resolved) > 1) {
                    array_pop($resolved);
                }
                // A trailing dot segment still denotes a directory.
                if ($index === $last) {
                    $resolved[] = '';
                }

                continue;
            }

            $resolved[] = $segment;
        }

        return implode('/', $resolved);
    }
}