|
|
<?php
|
|
|
|
|
|
namespace App\Services\Crawl;
|
|
|
|
|
|
/**
 * News article body HTML helpers: extracts the main content region from a
 * page, rewrites image URLs to absolute form, and promotes lazy-load
 * attributes (data-src and friends) to a real src attribute.
 */
class NewsContentHtml
{
    /** Minimum number of visible (tag-stripped) characters for a candidate body. */
    private const MIN_BODY_TEXT_LENGTH = 30;

    /** Attributes that lazy-loading scripts use to carry the real image URL. */
    private const LAZY_SRC_ATTRIBUTES = ['data-src', 'data-original', 'data-url', 'original-src'];

    /** Matches a quoted src attribute; group 2 is the attribute value. */
    private const SRC_PATTERN = '/\ssrc=(["\'])([^"\']*)\1/i';

    /**
     * Container patterns tried by extractBody(); group 1 is the inner HTML.
     *
     * The capture is non-greedy, so for the <div> patterns it stops at the
     * first closing </div>: content containing nested <div> elements is cut
     * short at that point.
     */
    private const BODY_PATTERNS = [
        // Shanghai Jiao Tong University and similar sites: Article_content followed by Article-source
        '#<div[^>]+class=["\'][^"\']*Article_content[^"\']*["\'][^>]*>(.*?)</div>\s*<div[^>]+class=["\'][^"\']*Article-source#is',
        '#<div[^>]+class=["\'][^"\']*Article_content[^"\']*["\'][^>]*>(.*?)</div>#is',
        '#<div[^>]+id=["\']ivs_content["\'][^>]*>(.*?)</div>#is',
        '#<div[^>]+id=["\'](?:vsb_content|v_news_content|news_content|zoom)["\'][^>]*>(.*?)</div>#is',
        '#<div[^>]+class=["\'][^"\']*v_news_content[^"\']*["\'][^>]*>(.*?)</div>#is',
        '#<article[^>]*>(.*?)</article>#is',
        '#<div[^>]+id=["\'](?:news-content|article-body|article-content|main-content|post-content)["\'][^>]*>(.*?)</div>#is',
        '#<div[^>]+class=["\'][^"\']*(?:article-body|article-content|news-content|post-content|entry-content|content-detail|detail-content)[^"\']*["\'][^>]*>(.*?)</div>#is',
    ];

    /**
     * Extracts the article body from a full page.
     *
     * Every pattern in BODY_PATTERNS is tried; among the matches whose
     * tag-stripped text is at least MIN_BODY_TEXT_LENGTH characters long, the
     * one with the most text is returned.
     *
     * @param string $html Full page HTML.
     *
     * @return string|null Trimmed inner HTML of the best match, or null when
     *                     no pattern yields enough text.
     */
    public static function extractBody(string $html): ?string
    {
        $best = null;
        $bestLength = 0;

        foreach (self::BODY_PATTERNS as $pattern) {
            if (preg_match($pattern, $html, $match) !== 1) {
                continue;
            }

            $body = trim($match[1]);
            $length = mb_strlen(strip_tags($body));

            // ">=" so that, on equal text length, the later pattern wins.
            if ($length >= self::MIN_BODY_TEXT_LENGTH && $length >= $bestLength) {
                $best = $body;
                $bestLength = $length;
            }
        }

        return $best;
    }

    /**
     * Normalizes a body fragment for storage or display.
     *
     * Entities are decoded across the whole fragment, then every <img> tag is
     * rewritten: lazy-load attributes are promoted to src, src is resolved
     * against the page URL, and referrerpolicy="no-referrer" is added when
     * the tag has no referrerpolicy attribute.
     *
     * @param string|null $html    Body fragment; null or blank yields null.
     * @param string|null $pageUrl URL of the page the fragment came from,
     *                             used as the base for relative image URLs.
     *
     * @return string|null The trimmed, rewritten fragment.
     */
    public static function normalize(?string $html, ?string $pageUrl = null): ?string
    {
        if ($html === null || trim($html) === '') {
            return null;
        }

        // NOTE(review): decoding the whole fragment also turns escaped text
        // such as "&lt;" into live markup. Kept as-is because callers may
        // depend on it; confirm that this is intended.
        $html = html_entity_decode($html, ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $origin = self::pageOrigin($pageUrl);

        $rewritten = preg_replace_callback(
            '/<img\b([^>]*)>/iu',
            static fn (array $m): string => self::rewriteImgTag($m[1], $pageUrl, $origin),
            $html
        );

        // preg_replace_callback() yields null on a PCRE error (for example
        // invalid UTF-8 under the "u" flag); fall back to the decoded input.
        return trim($rewritten ?? $html);
    }

    /**
     * Resolves an asset URL against the page it was found on.
     *
     * @param string      $url     Raw attribute value; entities are decoded and
     *                             surrounding whitespace is trimmed.
     * @param string|null $pageUrl URL of the containing page, if known.
     * @param string|null $origin  Precomputed scheme://host[:port] of the page;
     *                             derived from $pageUrl when omitted.
     *
     * @return string The absolute URL, or the cleaned input when it is empty,
     *                already absolute, a data: URI, or no origin is known.
     */
    public static function resolveAssetUrl(string $url, ?string $pageUrl, ?string $origin = null): string
    {
        $url = trim(html_entity_decode($url));
        if ($url === '') {
            return $url;
        }

        // Already absolute, or an inline data: URI, which has no base to
        // resolve against and must not get the origin prefixed.
        if (preg_match('#^(?:https?://|data:)#i', $url) === 1) {
            return $url;
        }

        $origin ??= self::pageOrigin($pageUrl);
        if ($origin === null) {
            return $url;
        }

        // Protocol-relative: reuse the scheme of the page.
        if (str_starts_with($url, '//')) {
            $scheme = parse_url($pageUrl ?? $origin, PHP_URL_SCHEME) ?: 'https';

            return $scheme.':'.$url;
        }

        // Root-relative.
        if (str_starts_with($url, '/')) {
            return $origin.$url;
        }

        // Path-relative: delegate to HtmlCrawlSupport::absoluteUrl() (defined
        // elsewhere); a falsy result falls through to the origin-based join.
        if ($pageUrl) {
            $absolute = HtmlCrawlSupport::absoluteUrl($url, $pageUrl);
            if ($absolute) {
                return $absolute;
            }
        }

        return rtrim($origin, '/').'/'.ltrim($url, '/');
    }

    /**
     * Builds scheme://host[:port] from a page URL.
     *
     * @return string|null Null when the URL is empty or lacks a scheme or host.
     */
    protected static function pageOrigin(?string $pageUrl): ?string
    {
        if (! $pageUrl) {
            return null;
        }

        $parts = parse_url($pageUrl);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        $port = empty($parts['port']) ? '' : ':'.$parts['port'];

        return $parts['scheme'].'://'.$parts['host'].$port;
    }

    /**
     * Rewrites the attribute string of a single <img> tag.
     *
     * @param string      $attrs   Everything between "<img" and ">".
     * @param string|null $pageUrl Base URL for relative sources.
     * @param string|null $origin  Precomputed origin of $pageUrl.
     *
     * @return string The complete rewritten tag.
     */
    private static function rewriteImgTag(string $attrs, ?string $pageUrl, ?string $origin): string
    {
        // Detach the slash of a self-closing tag so that appended attributes
        // land before it instead of after it.
        $closing = '';
        $withoutSlash = preg_replace('#(?:^|(?<=[\s"\']))/\s*$#', '', $attrs, 1, $stripped);
        if ($withoutSlash !== null && $stripped === 1) {
            $attrs = rtrim($withoutSlash);
            $closing = ' /';
        }

        // Each lazy-load attribute that is present overwrites src, so when
        // several are present the last one in the list wins.
        foreach (self::LAZY_SRC_ATTRIBUTES as $name) {
            $pattern = '/\s'.preg_quote($name, '/').'=(["\'])([^"\']+)\1/i';
            if (preg_match($pattern, $attrs, $lazy) === 1) {
                $attrs = self::withSrc($attrs, $lazy[2]);
            }
        }

        if (preg_match(self::SRC_PATTERN, $attrs, $src) === 1) {
            $attrs = self::withSrc($attrs, self::resolveAssetUrl($src[2], $pageUrl, $origin));
        }

        if (preg_match('/\sreferrerpolicy=/i', $attrs) !== 1) {
            $attrs .= ' referrerpolicy="no-referrer"';
        }

        return '<img'.$attrs.$closing.'>';
    }

    /**
     * Sets the src attribute inside an attribute string, replacing every
     * existing quoted src or appending one when none exists.
     */
    private static function withSrc(string $attrs, string $url): string
    {
        // The URL may hold quotes or angle brackets after entity decoding;
        // percent-encode them so the value cannot break out of src="...".
        $safeUrl = strtr($url, ['"' => '%22', "'" => '%27', '<' => '%3C', '>' => '%3E']);
        $attribute = ' src="'.$safeUrl.'"';

        if (preg_match(self::SRC_PATTERN, $attrs) !== 1) {
            return $attrs.$attribute;
        }

        // A callback is used so that "$1" or "\1" inside the URL is inserted
        // literally rather than expanded as a backreference.
        return preg_replace_callback(
            self::SRC_PATTERN,
            static fn (array $m): string => $attribute,
            $attrs
        ) ?? $attrs;
    }
}
|