slake-school-service/app/Services/Crawl/HtmlCrawlSupport.php

<?php

namespace App\Services\Crawl;

use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;

/**
 * Stateless helpers shared by HTML crawling code: page download, URL
 * resolution, host comparison, date extraction and link/title heuristics.
 */
class HtmlCrawlSupport
{
    /**
     * Date shapes recognised in free text, tried in order. Each pattern
     * captures year, month and day as groups 1-3.
     */
    private const DATE_PATTERNS = [
        '#(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日#u',
        '#(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})#',
    ];

    /** Anchor texts that say nothing about the target page when a title starts with them. */
    private const WEAK_TITLE_PREFIXES = ['阅读全文', '查看更多', '查看详情', '详情', '点击进入', '更多>>', 'More'];

    /** Keywords marking navigation / boilerplate links wherever they appear in a title. */
    private const SKIPPABLE_TITLE_KEYWORDS = ['登录', '注册', '更多', '下一页', '上一页', '首页', '关于我们', '联系我们', '隐私', '版权'];

    /**
     * Download a page and return the raw response body.
     *
     * @param string $url     Address to request with GET.
     * @param int    $timeout Request timeout in seconds.
     *
     * @throws \RuntimeException When the response status is not successful.
     */
    public static function fetchHtml(string $url, int $timeout = 45): string
    {
        $response = Http::timeout($timeout)
            ->withHeaders([
                'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
                'Accept' => 'text/html,application/xhtml+xml',
                'Accept-Language' => 'zh-CN,zh;q=0.9,en;q=0.8',
            ])
            ->get($url);

        if (! $response->successful()) {
            throw new \RuntimeException('页面请求失败：HTTP '.$response->status());
        }

        return $response->body();
    }

    /**
     * Reduce a URL to a comparable host key: lower-cased host without a
     * leading "www.".
     *
     * @return string|null Null when the URL is null or has no parsable host.
     */
    public static function hostKey(?string $url): ?string
    {
        $host = strtolower((string) parse_url((string) $url, PHP_URL_HOST));
        if ($host === '') {
            return null;
        }

        return str_starts_with($host, 'www.') ? substr($host, 4) : $host;
    }

    /**
     * Tell whether two values are both non-null and strictly identical.
     *
     * The arguments are compared verbatim (presumably they are host keys
     * produced by hostKey() - confirm against callers).
     */
    public static function sameHost(?string $a, ?string $b): bool
    {
        return $a !== null && $b !== null && $a === $b;
    }

    /**
     * Resolve a link found in a page against the URL of that page.
     *
     * - http(s) links are returned unchanged (scheme matched case-insensitively);
     * - fragment-only links and links with any other scheme (javascript:,
     *   mailto:, tel:, data:, ...) yield null;
     * - protocol-relative, root-relative, query-only and path-relative links
     *   are resolved against $base, keeping the base port and collapsing
     *   "." / ".." path segments.
     *
     * @param string $href Raw attribute value; HTML entities are decoded.
     * @param string $base Absolute URL of the document containing the link.
     *
     * @return string|null Null when the link is not crawlable or $base has no scheme/host.
     */
    public static function absoluteUrl(string $href, string $base): ?string
    {
        $href = html_entity_decode(trim($href));
        if ($href === '' || str_starts_with($href, '#')) {
            return null;
        }

        // Require the full "http(s)://" prefix: a bare "http" prefix would also
        // catch relative names such as "http-guide.html".
        if (preg_match('#^https?://#i', $href) === 1) {
            return $href;
        }

        // Any other explicit scheme is not something an HTML crawler can fetch.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href) === 1) {
            return null;
        }

        $parts = parse_url($base);
        if (! $parts || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        if (str_starts_with($href, '//')) {
            return $parts['scheme'].':'.$href;
        }

        $origin = $parts['scheme'].'://'.$parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':'.$parts['port'];
        }

        $basePath = $parts['path'] ?? '/';

        // A query-only link targets the same document as the base.
        if (str_starts_with($href, '?')) {
            return $origin.$basePath.$href;
        }

        // Separate the path from any query/fragment so only the path is normalised.
        $pathLength = strcspn($href, '?#');
        $hrefPath = substr($href, 0, $pathLength);
        $suffix = substr($href, $pathLength);

        if (str_starts_with($hrefPath, '/')) {
            $path = $hrefPath;
        } else {
            $lastSlash = strrpos($basePath, '/');
            $dir = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);
            $path = $dir.$hrefPath;
        }

        return $origin.self::removeDotSegments($path).$suffix;
    }

    /**
     * Parse a date out of an HTML/text snippet and return it as YYYY-MM-DD.
     *
     * Tags are stripped and entities decoded before the text is scanned with
     * the same rules as extractDateFromText().
     *
     * @return string|null Null for null/blank input or when no valid calendar date is found.
     */
    public static function normalizeDate(?string $raw): ?string
    {
        if ($raw === null || trim($raw) === '') {
            return null;
        }

        return self::extractDateFromText(trim(html_entity_decode(strip_tags($raw))));
    }

    /**
     * Tell whether a link text is too generic to serve as an article title:
     * empty, nothing but a date, or starting with a "read more"-style phrase.
     */
    public static function isWeakLinkTitle(string $title): bool
    {
        $title = trim($title);
        if ($title === '') {
            return true;
        }

        // Date-only titles: "2024-01", "2024/01/15", "2024.01.15", "2024年1月", "2024年1月15日".
        if (preg_match('#^\d{4}\s*[-/.年]\s*\d{1,2}\s*(?:[-/.月日]\s*\d{1,2}\s*[日号]?|月)?$#u', $title) === 1) {
            return true;
        }

        foreach (self::WEAK_TITLE_PREFIXES as $prefix) {
            if (str_starts_with($title, $prefix)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Strip markup from a page title and drop a trailing " - Site name" style
     * suffix.
     *
     * The suffix is cut at the first separator (-, _, |, en/em dash). Because
     * the match is lazy, only the text before that first separator is
     * considered; when it is shorter than 8 characters the title is returned
     * without cutting.
     *
     * @return string|null Null for null input or when nothing remains after cleaning.
     */
    public static function cleanArticleTitle(?string $title): ?string
    {
        if ($title === null) {
            return null;
        }

        $title = trim(html_entity_decode(strip_tags($title)));
        if ($title === '') {
            return null;
        }

        if (preg_match('/^(.+?)\s*[-_|–—]\s*.+$/u', $title, $m) && mb_strlen($m[1]) >= 8) {
            $title = trim($m[1]);
        }

        return $title;
    }

    /**
     * Find the first valid calendar date in free text and return it as
     * YYYY-MM-DD.
     *
     * Chinese "YYYY年M月D日" dates take precedence over numeric dates written
     * with "-", "/" or "." separators. Candidates that are not real dates
     * (e.g. month 13) are skipped so a later valid date can still be found.
     */
    public static function extractDateFromText(string $text): ?string
    {
        foreach (self::DATE_PATTERNS as $pattern) {
            if (! preg_match_all($pattern, $text, $candidates, PREG_SET_ORDER)) {
                continue;
            }

            foreach ($candidates as $candidate) {
                $year = (int) $candidate[1];
                $month = (int) $candidate[2];
                $day = (int) $candidate[3];

                if (checkdate($month, $day, $year)) {
                    return sprintf('%04d-%02d-%02d', $year, $month, $day);
                }
            }
        }

        return null;
    }

    /**
     * Tell whether a link text should be ignored when collecting article
     * links: shorter than 8 characters, or containing a navigation/boilerplate
     * keyword.
     */
    public static function isSkippableLinkTitle(string $title): bool
    {
        if (mb_strlen($title) < 8) {
            return true;
        }

        foreach (self::SKIPPABLE_TITLE_KEYWORDS as $keyword) {
            if (str_contains($title, $keyword)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tell whether a URL points at a static asset or download, judged by the
     * file extension that ends the URL or precedes its query string/fragment.
     */
    public static function isAssetPath(string $url): bool
    {
        return preg_match('~\.(css|js|png|jpe?g|gif|svg|ico|woff2?|pdf|zip)(?:[?#]|$)~i', $url) === 1;
    }

    /**
     * Collapse "." and ".." segments of an absolute path without ever climbing
     * above the root. Paths without dot segments are returned untouched.
     */
    private static function removeDotSegments(string $path): string
    {
        if (! str_contains($path, '/.')) {
            return $path;
        }

        $segments = explode('/', $path);
        $lastIndex = count($segments) - 1;
        $resolved = [];

        foreach ($segments as $index => $segment) {
            if ($segment !== '.' && $segment !== '..') {
                $resolved[] = $segment;

                continue;
            }

            // Keep the leading empty segment: it represents the root slash.
            if ($segment === '..' && count($resolved) > 1) {
                array_pop($resolved);
            }

            // A trailing dot segment refers to a directory, so keep its slash.
            if ($index === $lastIndex) {
                $resolved[] = '';
            }
        }

        return implode('/', $resolved);
    }
}