<?php

namespace App\Services\Crawl;

use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;

/**
 * Stateless helpers shared by HTML crawlers: fetching a page, resolving and
 * comparing URLs, normalising dates, and filtering out navigation-style link
 * titles.
 *
 * All methods are static and keep no state between calls.
 */
class HtmlCrawlSupport
{
    /**
     * Flags for html_entity_decode(). Spelled out explicitly because the
     * built-in default changed in PHP 8.1; these are the 8.1+ defaults.
     */
    private const ENTITY_FLAGS = ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401;

    /** "2024年5月1日"-style date. The look-behind rejects matches that start inside a longer number. */
    private const CJK_DATE_PATTERN = '#(?<!\d)(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日#u';

    /** "2024-05-01", "2024/5/1" or "2024.05.01". The look-arounds reject matches inside longer numbers. */
    private const NUMERIC_DATE_PATTERN = '#(?<!\d)(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})(?!\d)#';

    /** Link texts that carry no information about the target article (matched as prefixes). */
    private const WEAK_TITLE_PREFIXES = ['阅读全文', '查看更多', '查看详情', '详情', '点击进入', '更多>>', 'More'];

    /** Fragments that mark a link as site navigation / boilerplate (matched anywhere). */
    private const SKIPPABLE_TITLE_FRAGMENTS = ['登录', '注册', '更多', '下一页', '上一页', '首页', '关于我们', '联系我们', '隐私', '版权'];

    /**
     * Download a page and return the raw response body.
     *
     * @param  string  $url      Absolute URL to request.
     * @param  int     $timeout  Request timeout in seconds.
     * @return string            Response body exactly as delivered by the HTTP client.
     *
     * @throws \RuntimeException When the HTTP client reports a non-successful status.
     */
    public static function fetchHtml(string $url, int $timeout = 45): string
    {
        $response = Http::timeout($timeout)
            ->withHeaders([
                'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
                'Accept' => 'text/html,application/xhtml+xml',
                'Accept-Language' => 'zh-CN,zh;q=0.9,en;q=0.8',
            ])
            ->get($url);

        if (! $response->successful()) {
            throw new \RuntimeException('页面请求失败:HTTP '.$response->status());
        }

        return $response->body();
    }

    /**
     * Reduce a URL to a comparable host key: lower-cased host without a
     * leading "www.".
     *
     * @param  string|null  $url  Any URL; null and unparsable values are accepted.
     * @return string|null        The host key, or null when no host can be extracted.
     */
    public static function hostKey(?string $url): ?string
    {
        if ($url === null) {
            return null;
        }

        $host = parse_url(trim($url), PHP_URL_HOST);
        if (! is_string($host) || $host === '') {
            return null;
        }

        $host = strtolower($host);
        if (str_starts_with($host, 'www.')) {
            $host = substr($host, 4);
        }

        // A host of exactly "www." would otherwise leak out as an empty key.
        return $host === '' ? null : $host;
    }

    /**
     * Tell whether two host keys refer to the same host.
     *
     * Both arguments are compared as given (typically values produced by
     * hostKey()); a null on either side never matches.
     */
    public static function sameHost(?string $a, ?string $b): bool
    {
        return $a !== null && $b !== null && $a === $b;
    }

    /**
     * Resolve an href found in a page against the page's own URL.
     *
     * @param  string  $href  Raw attribute value; HTML entities are decoded.
     * @param  string  $base  Absolute URL of the document containing the link.
     * @return string|null    Absolute http(s) URL, or null for empty hrefs,
     *                        pure fragments, non-http schemes (javascript:,
     *                        mailto:, tel:, data:, ...) and unusable bases.
     */
    public static function absoluteUrl(string $href, string $base): ?string
    {
        $href = trim(html_entity_decode(trim($href), self::ENTITY_FLAGS, 'UTF-8'));
        if ($href === '' || $href[0] === '#') {
            return null;
        }

        // An explicit scheme means the reference is already absolute. Only
        // http(s) targets are crawlable; everything else is dropped. Checking
        // for "scheme:" (not just an "http" prefix) keeps relative links such
        // as "http-guide.html" on the relative path below.
        if (preg_match('~^([a-z][a-z0-9+.\-]*):~i', $href, $scheme)) {
            return in_array(strtolower($scheme[1]), ['http', 'https'], true) ? $href : null;
        }

        $parts = parse_url($base);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        // Protocol-relative reference: inherit only the scheme.
        if (str_starts_with($href, '//')) {
            return $parts['scheme'].':'.$href;
        }

        // Keep a non-default port; dropping it would point at another server.
        $origin = $parts['scheme'].'://'.$parts['host'].(isset($parts['port']) ? ':'.$parts['port'] : '');
        $basePath = $parts['path'] ?? '/';

        // Query-only reference: same document, different query string.
        if ($href[0] === '?') {
            return $origin.$basePath.$href;
        }

        // Split off "?query#fragment" so dot-segment removal touches the path only.
        $pathLength = strcspn($href, '?#');
        $relativePath = substr($href, 0, $pathLength);
        $suffix = substr($href, $pathLength);

        if ($href[0] === '/') {
            return $origin.self::removeDotSegments($relativePath).$suffix;
        }

        $lastSlash = strrpos($basePath, '/');
        $directory = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);

        return $origin.self::removeDotSegments($directory.$relativePath).$suffix;
    }

    /**
     * Extract a date from a raw HTML snippet and return it as YYYY-MM-DD.
     *
     * Tags are stripped and entities decoded before parsing; see
     * extractDateFromText() for the accepted formats.
     *
     * @return string|null Normalised date, or null when no valid date is present.
     */
    public static function normalizeDate(?string $raw): ?string
    {
        if ($raw === null) {
            return null;
        }

        $text = self::plainText($raw);

        return $text === '' ? null : self::extractDateFromText($text);
    }

    /**
     * Tell whether a link text is too generic to serve as an article title:
     * empty, a bare date, or a "read more"-style label.
     */
    public static function isWeakLinkTitle(string $title): bool
    {
        $title = trim($title);
        if ($title === '') {
            return true;
        }

        // Bare date such as "2024-05", "2024/5/1", "2024.05.01", "2024年5月" or "2024年5月1日".
        if (preg_match('#^\d{4}[-/.年]\d{1,2}(?:月|[-/.月]\d{1,2}日?)?$#u', $title)) {
            return true;
        }

        foreach (self::WEAK_TITLE_PREFIXES as $noise) {
            if (str_starts_with($title, $noise)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Turn a raw <title>/heading value into a clean article title.
     *
     * Markup and entities are removed, then a trailing " - Site name" style
     * suffix is cut off, provided at least 8 characters of title remain in
     * front of the separator.
     *
     * @return string|null Cleaned title, or null when nothing is left.
     */
    public static function cleanArticleTitle(?string $title): ?string
    {
        if ($title === null) {
            return null;
        }

        $title = self::plainText($title);
        if ($title === '') {
            return null;
        }

        // The 8-character minimum lives inside the pattern so that an early
        // separator (e.g. the hyphen in "AI-...") is skipped instead of
        // disabling suffix removal for the whole title.
        if (preg_match('/^(.{7,}?\S)\s*[-_|–—]\s*.+$/u', $title, $m)) {
            $title = trim($m[1]);
        }

        return $title;
    }

    /**
     * Find the first valid calendar date in free text and return it as YYYY-MM-DD.
     *
     * "YYYY年M月D日" dates are preferred over numeric "YYYY-M-D" / "YYYY/M/D" /
     * "YYYY.M.D" dates. Candidates that are not real calendar dates
     * (e.g. month 13) are skipped.
     */
    public static function extractDateFromText(string $text): ?string
    {
        foreach ([self::CJK_DATE_PATTERN, self::NUMERIC_DATE_PATTERN] as $pattern) {
            if (! preg_match_all($pattern, $text, $candidates, PREG_SET_ORDER)) {
                continue;
            }

            foreach ($candidates as $candidate) {
                $year = (int) $candidate[1];
                $month = (int) $candidate[2];
                $day = (int) $candidate[3];

                if (checkdate($month, $day, $year)) {
                    return sprintf('%04d-%02d-%02d', $year, $month, $day);
                }
            }
        }

        return null;
    }

    /**
     * Tell whether a link should be ignored based on its text: shorter than
     * 8 characters, or containing a navigation / boilerplate keyword.
     */
    public static function isSkippableLinkTitle(string $title): bool
    {
        // Surrounding whitespace must not help a short label pass the length test.
        if (mb_strlen(trim($title)) < 8) {
            return true;
        }

        foreach (self::SKIPPABLE_TITLE_FRAGMENTS as $noise) {
            if (str_contains($title, $noise)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tell whether a URL points at a static asset or download rather than an
     * HTML page, judged by a known file extension that is followed by the end
     * of the URL, a query string or a fragment.
     */
    public static function isAssetPath(string $url): bool
    {
        return (bool) preg_match(
            '/\.(?:css|js|png|jpe?g|gif|svg|ico|woff2?|pdf|zip|webp|avif|bmp|ttf|otf|eot)(?:[?#]|$)/i',
            $url
        );
    }

    /** Strip tags, decode HTML entities and trim surrounding whitespace. */
    private static function plainText(string $html): string
    {
        // Tags are stripped before decoding so entity-encoded "<" survives as text.
        return trim(html_entity_decode(strip_tags($html), self::ENTITY_FLAGS, 'UTF-8'));
    }

    /**
     * Collapse "." and ".." segments of an absolute path (RFC 3986, 5.2.4).
     *
     * @param  string  $path  Path starting with "/", without query or fragment.
     */
    private static function removeDotSegments(string $path): string
    {
        $kept = [];
        $last = '';

        foreach (explode('/', substr($path, 1)) as $segment) {
            $last = $segment;
            if ($segment === '.') {
                continue;
            }
            if ($segment === '..') {
                // Popping an empty stack is a no-op, so ".." never escapes the root.
                array_pop($kept);

                continue;
            }
            $kept[] = $segment;
        }

        $normalized = '/'.implode('/', $kept);

        // A trailing "." or ".." denotes a directory, so keep the trailing slash.
        if (($last === '.' || $last === '..') && ! str_ends_with($normalized, '/')) {
            $normalized .= '/';
        }

        return $normalized;
    }
}