You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

156 lines
4.7 KiB

7 days ago
<?php
namespace App\Services\Crawl;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;
/**
 * Stateless helpers shared by HTML crawlers: page fetching, URL resolution,
 * date extraction and link/title heuristics.
 */
class HtmlCrawlSupport
{
    /** Anchor-text prefixes that mark a "read more"-style link instead of a real title. */
    private const WEAK_TITLE_PREFIXES = ['阅读全文', '查看更多', '查看详情', '详情', '点击进入', '更多>>', 'More'];

    /** Substrings that mark navigation / boilerplate links. */
    private const SKIPPABLE_TITLE_WORDS = ['登录', '注册', '更多', '下一页', '上一页', '首页', '关于我们', '联系我们', '隐私', '版权'];

    /** Date patterns tried in order; each captures year, month, day. */
    private const DATE_PATTERNS = [
        '#(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日#u',
        '#(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})#',
    ];

    /**
     * Fetches a page with a GET request and returns the response body.
     *
     * @param string $url     Page to request.
     * @param int    $timeout Request timeout passed to the HTTP client.
     *
     * @throws \RuntimeException When the HTTP client reports the response as not successful.
     */
    public static function fetchHtml(string $url, int $timeout = 45): string
    {
        $response = Http::timeout($timeout)
            ->withHeaders([
                'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
                'Accept' => 'text/html,application/xhtml+xml',
                'Accept-Language' => 'zh-CN,zh;q=0.9,en;q=0.8',
            ])
            ->get($url);

        if (! $response->successful()) {
            throw new \RuntimeException('页面请求失败HTTP '.$response->status());
        }

        return $response->body();
    }

    /**
     * Returns the lower-cased host of a URL with a leading "www." removed,
     * or null when the URL has no parsable host.
     */
    public static function hostKey(?string $url): ?string
    {
        // parse_url() yields null/false for missing or malformed hosts; both cast to ''.
        $host = strtolower((string) parse_url((string) $url, PHP_URL_HOST));
        if ($host === '') {
            return null;
        }

        return str_starts_with($host, 'www.') ? substr($host, 4) : $host;
    }

    /**
     * Tells whether two host keys are both present and identical.
     */
    public static function sameHost(?string $a, ?string $b): bool
    {
        return $a !== null && $b !== null && $a === $b;
    }

    /**
     * Resolves a link target against the URL of the page it was found on.
     *
     * Returns null for empty hrefs, in-page fragments, javascript:/mailto:/tel:
     * links, and when $base lacks a scheme or host. Absolute http(s) URLs are
     * returned as they are; everything else is resolved against $base, keeping
     * its port and collapsing "." / ".." path segments.
     *
     * @param string $href Raw attribute value (HTML entities are decoded).
     * @param string $base Absolute URL of the containing document.
     */
    public static function absoluteUrl(string $href, string $base): ?string
    {
        $href = html_entity_decode(trim($href));
        if (
            $href === ''
            || str_starts_with($href, '#')
            || preg_match('~^(?:javascript|mailto|tel):~i', $href) === 1
        ) {
            return null;
        }

        // Require the full "http(s)://" prefix so a relative path that merely
        // begins with "http" (e.g. "http-notes/1.html") is still resolved.
        if (preg_match('~^https?://~i', $href) === 1) {
            return $href;
        }

        $parts = parse_url($base);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        // Protocol-relative link: only the scheme comes from the base.
        if (str_starts_with($href, '//')) {
            return $parts['scheme'].':'.$href;
        }

        $origin = $parts['scheme'].'://'.$parts['host'];
        if (isset($parts['port'])) {
            // Keep a non-default port; dropping it points at a different server.
            $origin .= ':'.$parts['port'];
        }

        $basePath = $parts['path'] ?? '/';

        // Split the href into its path and the trailing "?query#fragment" part
        // so dot-segment removal never touches the query string.
        $cut = strcspn($href, '?#');
        $hrefPath = substr($href, 0, $cut);
        $suffix = substr($href, $cut);

        if ($hrefPath === '') {
            // Query-only reference ("?page=2") keeps the base document's path.
            return $origin.$basePath.$suffix;
        }

        if ($hrefPath[0] !== '/') {
            // Relative path: resolve against the directory of the base document.
            $dir = str_contains($basePath, '/')
                ? substr($basePath, 0, (int) strrpos($basePath, '/') + 1)
                : '/';
            $hrefPath = $dir.$hrefPath;
        }

        return $origin.self::removeDotSegments($hrefPath).$suffix;
    }

    /**
     * Parses a date out of a raw (possibly HTML) fragment.
     *
     * Tags are stripped and entities decoded before delegating to
     * extractDateFromText(); returns "YYYY-MM-DD" or null.
     */
    public static function normalizeDate(?string $raw): ?string
    {
        if ($raw === null || trim($raw) === '') {
            return null;
        }

        return self::extractDateFromText(trim(html_entity_decode(strip_tags($raw))));
    }

    /**
     * Tells whether anchor text is too generic to serve as an article title:
     * empty, a bare date, or a "read more"-style phrase.
     */
    public static function isWeakLinkTitle(string $title): bool
    {
        $title = trim($title);
        if ($title === '') {
            return true;
        }

        // Bare dates: "2024-01", "2024/1/15", "2024.01.15", "2024年1月", "2024年1月15日".
        if (preg_match('#^\d{4}\s*[-/.年]\s*\d{1,2}\s*(?:月|[-/.月日]\s*\d{1,2}\s*日?)?$#u', $title) === 1) {
            return true;
        }

        foreach (self::WEAK_TITLE_PREFIXES as $prefix) {
            // A prefix match also covers exact equality.
            if (str_starts_with($title, $prefix)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Strips markup from a page title and drops a trailing " - Site name"
     * style suffix when the leading part is at least 8 characters long.
     *
     * Returns null when the input is null or empty after cleaning.
     */
    public static function cleanArticleTitle(?string $title): ?string
    {
        if ($title === null) {
            return null;
        }

        $title = trim(html_entity_decode(strip_tags($title)));
        if ($title === '') {
            return null;
        }

        // Split at the first separator; a very short head is more likely part
        // of the title itself than the whole title, so it is left untouched.
        if (preg_match('/^(.+?)\s*[-_|–—]\s*.+$/u', $title, $m) && mb_strlen($m[1]) >= 8) {
            $title = trim($m[1]);
        }

        return $title;
    }

    /**
     * Finds the first calendar date in free text.
     *
     * Recognises "YYYY年M月D日" and "YYYY-M-D" with "-", "/" or "." separators.
     * Matches that are not real calendar dates (month 13, 30 February, ...)
     * are rejected.
     *
     * @return string|null Date as "YYYY-MM-DD", or null when none is found.
     */
    public static function extractDateFromText(string $text): ?string
    {
        foreach (self::DATE_PATTERNS as $pattern) {
            if (preg_match($pattern, $text, $m) !== 1) {
                continue;
            }

            [$year, $month, $day] = [(int) $m[1], (int) $m[2], (int) $m[3]];
            if (checkdate($month, $day, $year)) {
                return sprintf('%04d-%02d-%02d', $year, $month, $day);
            }
        }

        return null;
    }

    /**
     * Tells whether anchor text looks like navigation or boilerplate:
     * shorter than 8 characters or containing a known noise word.
     */
    public static function isSkippableLinkTitle(string $title): bool
    {
        if (mb_strlen($title) < 8) {
            return true;
        }

        foreach (self::SKIPPABLE_TITLE_WORDS as $word) {
            if (str_contains($title, $word)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tells whether a URL points at a static asset or download (by file
     * extension), ignoring any query string or fragment.
     */
    public static function isAssetPath(string $url): bool
    {
        return preg_match('~\.(css|js|png|jpe?g|gif|svg|ico|woff2?|pdf|zip)(?:[?#]|$)~i', $url) === 1;
    }

    /**
     * Collapses "." and ".." segments of an absolute path (leading "/").
     * ".." never climbs above the root.
     */
    private static function removeDotSegments(string $path): string
    {
        $segments = explode('/', $path);
        $last = count($segments) - 1;
        $out = [];

        foreach ($segments as $i => $segment) {
            if ($segment === '.' || $segment === '..') {
                // $out[0] is the empty segment before the leading slash; keep it.
                if ($segment === '..' && count($out) > 1) {
                    array_pop($out);
                }
                // A trailing dot segment denotes a directory: keep the final slash.
                if ($i === $last) {
                    $out[] = '';
                }

                continue;
            }

            $out[] = $segment;
        }

        return implode('/', $out);
    }
}