You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

145 lines
4.7 KiB

1 week ago
<?php
namespace App\Services\Crawl;
/**
 * Helpers for crawled news-article HTML: locate the article body region,
 * make image URLs absolute, and promote lazy-load attributes to a real `src`.
 */
class NewsContentHtml
{
    /** Minimum visible-text length (in characters) for a fragment to count as a body. */
    private const MIN_BODY_TEXT_LENGTH = 30;

    /**
     * Shanghai Jiao Tong University and similar sites: the body is an
     * `Article_content` div directly followed by an `Article-source` sibling,
     * so the sibling is used as the end anchor.
     */
    private const BODY_BEFORE_SOURCE_SIBLING = '#<div[^>]+class=["\'][^"\']*Article_content[^"\']*["\'][^>]*>(.*?)</div>\s*<div[^>]+class=["\'][^"\']*Article-source#is';

    /**
     * Opening tags of known body containers, as [opener regex, tag name].
     * The matching closing tag is found by a balanced scan, not by regex.
     */
    private const BODY_OPENERS = [
        ['#<div[^>]+class=["\'][^"\']*Article_content[^"\']*["\'][^>]*>#is', 'div'],
        ['#<div[^>]+id=["\']ivs_content["\'][^>]*>#is', 'div'],
        ['#<div[^>]+id=["\'](?:vsb_content|v_news_content|news_content|zoom)["\'][^>]*>#is', 'div'],
        ['#<div[^>]+class=["\'][^"\']*v_news_content[^"\']*["\'][^>]*>#is', 'div'],
        ['#<article[^>]*>#is', 'article'],
        ['#<div[^>]+id=["\'](?:news-content|article-body|article-content|main-content|post-content)["\'][^>]*>#is', 'div'],
        ['#<div[^>]+class=["\'][^"\']*(?:article-body|article-content|news-content|post-content|entry-content|content-detail|detail-content)[^"\']*["\'][^>]*>#is', 'div'],
    ];

    /** Attributes that lazy-loading scripts use to hold the real image URL. */
    private const LAZY_SRC_ATTRIBUTES = ['data-src', 'data-original', 'data-url', 'original-src'];

    /**
     * Extracts the inner HTML of the most likely article-body container.
     *
     * Every known container pattern is tried; among the fragments whose
     * tag-stripped text is at least MIN_BODY_TEXT_LENGTH characters long, the
     * one with the longest text wins (on a tie the later pattern wins).
     *
     * @param string $html Full page HTML.
     *
     * @return string|null Trimmed inner HTML of the chosen container, or null
     *                     when no container yields enough text.
     */
    public static function extractBody(string $html): ?string
    {
        $fragments = [];

        if (preg_match(self::BODY_BEFORE_SOURCE_SIBLING, $html, $m)) {
            $fragments[] = $m[1];
        }

        foreach (self::BODY_OPENERS as [$opener, $tag]) {
            if (! preg_match($opener, $html, $m, PREG_OFFSET_CAPTURE)) {
                continue;
            }
            // Offsets are byte offsets, so byte-wise strlen/substr are correct here.
            $contentStart = $m[0][1] + strlen($m[0][0]);
            $inner = self::innerHtmlFrom($html, $contentStart, $tag);
            if ($inner !== null) {
                $fragments[] = $inner;
            }
        }

        $best = null;
        $bestLength = 0;
        foreach ($fragments as $fragment) {
            $body = trim($fragment);
            $length = mb_strlen(strip_tags($body));
            if ($length >= self::MIN_BODY_TEXT_LENGTH && $length >= $bestLength) {
                $best = $body;
                $bestLength = $length;
            }
        }

        return $best;
    }

    /**
     * Rewrites every `<img>` tag in an HTML fragment so that it displays
     * outside its original page.
     *
     * For each image: a lazy-load attribute (data-src, data-original, ...)
     * overrides `src`, the resulting `src` is made absolute against $pageUrl,
     * and `referrerpolicy="no-referrer"` is added when no referrerpolicy
     * attribute is present.
     *
     * @param string|null $html    HTML fragment; null or blank yields null.
     * @param string|null $pageUrl URL of the page the fragment came from, used
     *                             to resolve relative image URLs.
     *
     * @return string|null The rewritten, trimmed HTML, or null for empty input.
     */
    public static function normalize(?string $html, ?string $pageUrl = null): ?string
    {
        if ($html === null || trim($html) === '') {
            return null;
        }

        // NOTE(review): decoding the whole document also turns escaped markup
        // in text (e.g. &lt;b&gt;) into real tags. Kept as-is because callers
        // may rely on it - confirm whether input is expected to be entity-encoded.
        $html = html_entity_decode($html, ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $origin = self::pageOrigin($pageUrl);

        // preg_replace_callback returns null on PCRE failure (e.g. invalid
        // UTF-8 with the /u flag); in that case the decoded HTML is kept.
        $html = preg_replace_callback(
            '/<img\b([^>]*)>/iu',
            static fn (array $m): string => self::rewriteImgTag($m[1], $pageUrl, $origin),
            $html
        ) ?? $html;

        return trim($html);
    }

    /**
     * Turns an asset URL found in a page into an absolute URL.
     *
     * The URL is entity-decoded and trimmed first. Empty URLs and URLs that
     * already carry a scheme (http:, https:, data:, blob:, ...) are returned
     * unchanged, as is everything when no origin can be determined.
     * Protocol-relative URLs take the page's scheme, root-relative URLs are
     * prefixed with the origin, and other relative URLs are delegated to
     * HtmlCrawlSupport::absoluteUrl() with an origin-based join as fallback.
     *
     * @param string      $url     Raw URL as it appears in the attribute.
     * @param string|null $pageUrl URL of the containing page.
     * @param string|null $origin  Precomputed scheme://host[:port] of the page;
     *                             derived from $pageUrl when null.
     *
     * @return string The resolved URL, or the decoded input when it cannot be resolved.
     */
    public static function resolveAssetUrl(string $url, ?string $pageUrl, ?string $origin = null): string
    {
        $url = trim(html_entity_decode($url));
        if ($url === '') {
            return $url;
        }
        // Anything that starts with a URI scheme is already absolute. Matching
        // only http(s) would treat data:/blob: URIs as relative paths.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $url)) {
            return $url;
        }

        $origin ??= self::pageOrigin($pageUrl);
        if ($origin === null) {
            return $url;
        }
        if (str_starts_with($url, '//')) {
            $scheme = parse_url($pageUrl ?? $origin, PHP_URL_SCHEME) ?: 'https';

            return $scheme.':'.$url;
        }
        if (str_starts_with($url, '/')) {
            return $origin.$url;
        }
        if ($pageUrl) {
            $absolute = HtmlCrawlSupport::absoluteUrl($url, $pageUrl);
            if ($absolute) {
                return $absolute;
            }
        }

        return rtrim($origin, '/').'/'.ltrim($url, '/');
    }

    /**
     * Builds `scheme://host[:port]` from a page URL.
     *
     * @return string|null Null when the URL is empty or lacks a scheme or host.
     */
    protected static function pageOrigin(?string $pageUrl): ?string
    {
        if (! $pageUrl) {
            return null;
        }
        $parts = parse_url($pageUrl);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }
        $origin = $parts['scheme'].'://'.$parts['host'];
        if (! empty($parts['port'])) {
            $origin .= ':'.$parts['port'];
        }

        return $origin;
    }

    /**
     * Returns the HTML between byte offset $start (just after an opening
     * `<$tag>`) and the closing tag that balances it.
     *
     * Nested elements of the same tag are skipped, so a body that contains
     * inner `<div>` wrappers is not cut off at the first `</div>`. When the
     * markup never balances, the content up to the first closing tag is
     * returned instead; null when there is no closing tag at all.
     */
    private static function innerHtmlFrom(string $html, int $start, string $tag): ?string
    {
        $found = preg_match_all(
            '#<(/?)'.$tag.'\b[^>]*>#i',
            $html,
            $tags,
            PREG_SET_ORDER | PREG_OFFSET_CAPTURE,
            $start
        );
        if (! $found) {
            return null;
        }

        $depth = 1;
        $firstClose = null;
        foreach ($tags as $match) {
            $position = $match[0][1];
            if ($match[1][0] !== '/') {
                $depth++;
                continue;
            }
            $firstClose ??= $position;
            if (--$depth === 0) {
                return substr($html, $start, $position - $start);
            }
        }

        return $firstClose === null ? null : substr($html, $start, $firstClose - $start);
    }

    /**
     * Rebuilds one `<img>` tag from its raw attribute string (everything
     * between `<img` and `>`).
     */
    private static function rewriteImgTag(string $attrs, ?string $pageUrl, ?string $origin): string
    {
        // Detach the self-closing slash so new attributes land before it. The
        // slash only counts when it follows whitespace or a quote; otherwise
        // it belongs to an unquoted attribute value (e.g. src=a/b/).
        $closing = '';
        if (preg_match('#(?:^|(?<=[\s"\']))/\s*$#', $attrs, $slash, PREG_OFFSET_CAPTURE)) {
            $attrs = rtrim(substr($attrs, 0, $slash[0][1]));
            $closing = ' /';
        }

        // A lazy-load attribute holds the real image; when several are
        // present, the last one in LAZY_SRC_ATTRIBUTES wins.
        foreach (self::LAZY_SRC_ATTRIBUTES as $attr) {
            if (preg_match('/\s'.preg_quote($attr, '/').'=(["\'])([^"\']+)\1/i', $attrs, $data)) {
                $attrs = self::withSrc($attrs, $data[2]);
            }
        }

        if (preg_match('/\ssrc=(["\'])([^"\']*)\1/i', $attrs, $srcMatch)) {
            $attrs = self::withSrc($attrs, self::resolveAssetUrl($srcMatch[2], $pageUrl, $origin));
        }

        if (! preg_match('/\sreferrerpolicy=/i', $attrs)) {
            $attrs .= ' referrerpolicy="no-referrer"';
        }

        return '<img'.$attrs.$closing.'>';
    }

    /**
     * Sets `src` in an attribute string, replacing existing quoted `src`
     * attributes or appending one when none exists.
     */
    private static function withSrc(string $attrs, string $value): string
    {
        // The value is written inside double quotes, so a literal quote (which
        // resolveAssetUrl's entity decoding can reintroduce) must be escaped.
        $replacement = ' src="'.str_replace('"', '&quot;', $value).'"';

        // A callback is used instead of a replacement string so that "$1" or
        // "\1" inside the URL is not expanded as a back-reference.
        $count = 0;
        $updated = preg_replace_callback(
            '/\ssrc=(["\'])[^"\']*\1/i',
            static fn (array $m): string => $replacement,
            $attrs,
            -1,
            $count
        );
        if ($updated === null) {
            return $attrs;
        }

        return $count > 0 ? $updated : $attrs.$replacement;
    }
}