]+class=["\'][^"\']*Article_content[^"\']*["\'][^>]*>(.*?)\s*]+class=["\'][^"\']*Article-source#is', '#]+class=["\'][^"\']*Article_content[^"\']*["\'][^>]*>(.*?)#is', '#]+id=["\']ivs_content["\'][^>]*>(.*?)#is', '#]+id=["\'](?:vsb_content|v_news_content|news_content|zoom)["\'][^>]*>(.*?)#is', '#]+class=["\'][^"\']*v_news_content[^"\']*["\'][^>]*>(.*?)#is', '#]*>(.*?)#is', '#]+id=["\'](?:news-content|article-body|article-content|main-content|post-content)["\'][^>]*>(.*?)#is', '#]+class=["\'][^"\']*(?:article-body|article-content|news-content|post-content|entry-content|content-detail|detail-content)[^"\']*["\'][^>]*>(.*?)#is', ]; foreach ($patterns as $pattern) { if (preg_match($pattern, $html, $m)) { $body = trim($m[1]); $len = mb_strlen(strip_tags($body)); if ($len >= 30) { $candidates[$len] = $body; } } } if ($candidates === []) { return null; } ksort($candidates); return (string) array_pop($candidates); }

    /**
     * Normalises an HTML fragment so that its images can be displayed outside the source page.
     *
     * Steps, in order:
     *  1. HTML entities are decoded across the whole fragment.
     *  2. For every <img> tag, a lazy-loading attribute (data-src, data-original, data-url,
     *     original-src) overrides or supplies the src attribute; when several are present
     *     the last one in that list wins.
     *  3. The resulting src is made absolute via resolveAssetUrl().
     *  4. referrerpolicy="no-referrer" is appended unless the tag already declares one.
     *
     * @param string|null $html    HTML fragment; null or whitespace-only input yields null.
     * @param string|null $pageUrl URL of the page the fragment came from, used to resolve relative URLs.
     *
     * @return string|null The trimmed, rewritten fragment, or null for empty input. If the tag
     *                     rewrite fails (for example on invalid UTF-8, because the pattern uses
     *                     the `u` modifier) the entity-decoded fragment is returned untouched.
     */
    public static function normalize(?string $html, ?string $pageUrl = null): ?string
    {
        if ($html === null || trim($html) === '') {
            return null;
        }

        $html = html_entity_decode($html, ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $origin = self::pageOrigin($pageUrl);
        $srcPattern = '/\ssrc=(["\'])[^"\']*\1/i';

        // Writes $value into the src attribute of an attribute string, replacing every existing
        // src or appending one when none exists.
        $setSrc = static function (string $attrs, string $value) use ($srcPattern): string {
            // A double quote would terminate the attribute early; %22 is its URL-encoded form.
            $attribute = ' src="'.str_replace('"', '%22', $value).'"';
            $count = 0;

            // A callback is used instead of a replacement string because preg_replace()
            // interprets "$1" / "\1" sequences inside the replacement, which corrupts URLs
            // that happen to contain them.
            $replaced = preg_replace_callback(
                $srcPattern,
                static fn (array $match): string => $attribute,
                $attrs,
                -1,
                $count
            );

            if ($replaced === null) {
                return $attrs;
            }

            return $count > 0 ? $replaced : $attrs.$attribute;
        };

        $rewritten = preg_replace_callback(
            '/<img([^>]*)>/iu',
            static function (array $m) use ($origin, $pageUrl, $setSrc): string {
                $attrs = $m[1];

                // Lazy-loading attributes carry the real image location.
                foreach (['data-src', 'data-original', 'data-url', 'original-src'] as $attr) {
                    if (preg_match('/\s'.preg_quote($attr, '/').'=(["\'])([^"\']+)\1/i', $attrs, $data)) {
                        $attrs = $setSrc($attrs, $data[2]);
                    }
                }

                if (preg_match('/\ssrc=(["\'])([^"\']*)\1/i', $attrs, $srcMatch)) {
                    $attrs = $setSrc($attrs, self::resolveAssetUrl($srcMatch[2], $pageUrl, $origin));
                }

                if (! preg_match('/\sreferrerpolicy=/i', $attrs)) {
                    $attrs .= ' referrerpolicy="no-referrer"';
                }

                return '<img'.$attrs.'>';
            },
            $html
        );

        // preg_replace_callback() returns null on failure; keep the decoded HTML in that case.
        return trim($rewritten ?? $html);
    }

    /**
     * Converts an asset reference into an absolute URL where enough context is available.
     *
     * Resolution order:
     *  - empty input, http(s) URLs and data: URIs are returned as they are (after trimming
     *    and entity decoding);
     *  - without a usable origin the reference is returned unchanged;
     *  - "//host/path" receives the scheme of the page URL (or of the origin when no page
     *    URL is given), falling back to https;
     *  - "/path" is appended to the origin;
     *  - anything else is handed to HtmlCrawlSupport::absoluteUrl() when a page URL is known
     *    (NOTE(review): presumably resolves relative to the page URL; verify against that
     *    helper), and a falsy result falls through to "origin/reference".
     *
     * @param string      $url     Reference as found in the markup; may still contain HTML entities.
     * @param string|null $pageUrl URL of the page containing the reference.
     * @param string|null $origin  Pre-computed "scheme://host[:port]"; derived from $pageUrl when null.
     *
     * @return string The resolved URL, or the cleaned-up input when it cannot be resolved.
     */
    public static function resolveAssetUrl(string $url, ?string $pageUrl, ?string $origin = null): string
    {
        $url = trim(html_entity_decode($url));

        if ($url === '') {
            return $url;
        }

        // Already absolute. data: URIs (typical lazy-load placeholders) carry their own payload
        // and must never be prefixed with an origin.
        if (preg_match('#^(?:https?://|data:)#i', $url)) {
            return $url;
        }

        $origin ??= self::pageOrigin($pageUrl);

        if ($origin === null) {
            return $url;
        }

        // Protocol-relative reference: borrow the scheme of the page.
        if (str_starts_with($url, '//')) {
            $scheme = parse_url($pageUrl ?? $origin, PHP_URL_SCHEME) ?: 'https';

            return $scheme.':'.$url;
        }

        // Root-relative reference.
        if (str_starts_with($url, '/')) {
            return $origin.$url;
        }

        if ($pageUrl) {
            $absolute = HtmlCrawlSupport::absoluteUrl($url, $pageUrl);

            if ($absolute) {
                return $absolute;
            }
        }

        // Last resort: treat the reference as relative to the site root.
        return rtrim($origin, '/').'/'.ltrim($url, '/');
    }

    /**
     * Extracts "scheme://host[:port]" from a page URL.
     *
     * @param string|null $pageUrl URL to inspect.
     *
     * @return string|null The origin, or null when the URL is empty, cannot be parsed,
     *                     or lacks a scheme or host.
     */
    protected static function pageOrigin(?string $pageUrl): ?string
    {
        if (! $pageUrl) {
            return null;
        }

        $parts = parse_url($pageUrl);

        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        $port = empty($parts['port']) ? '' : ':'.$parts['port'];

        return $parts['scheme'].'://'.$parts['host'].$port;
    }
}