You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

46 lines
1.4 KiB

2 weeks ago
<?php
namespace App\Services\Crawl;
/**
 * Normalizes article-body HTML crawled from pedaily.cn (投资界).
 *
 * Fills in usable, absolute image addresses so the markup renders both in the
 * back-office editor and on the public site.
 */
class PedailyContentNormalizer
{
    /**
     * Normalize a crawled HTML fragment.
     *
     * Steps, in order:
     *  1. HTML entities in the whole fragment are decoded.
     *  2. Every <img> tag gets its `src` taken from `data-src` (when present)
     *     and a `referrerpolicy="no-referrer"` attribute (when missing).
     *  3. Quoted root-relative `src` values ("/x") are prefixed with
     *     https://www.pedaily.cn/ and protocol-relative ones ("//x") with https:.
     *
     * @param string|null $html Raw HTML fragment.
     *
     * @return string|null The normalized, trimmed HTML, or null when the input
     *                     is null or contains only whitespace.
     */
    public static function normalize(?string $html): ?string
    {
        if ($html === null || trim($html) === '') {
            return null;
        }

        // NOTE(review): this decodes entities in text nodes as well (e.g. "&lt;"
        // becomes "<"); kept as-is because callers may rely on it.
        $html = html_entity_decode($html, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // On a PCRE failure (e.g. invalid UTF-8 with the /u flag) keep the input.
        $html = preg_replace_callback(
            '/<img\b([^>]*)>/iu',
            static function (array $m): string {
                return self::rewriteImgTag($m[1]);
            },
            $html
        ) ?? $html;

        // Case-insensitive so that `SRC="/x"` is handled like `src="/x"`,
        // matching the case-insensitive attribute handling of the <img> rewrite.
        $html = preg_replace('#\ssrc=(["\'])/(?!/)#i', ' src=$1https://www.pedaily.cn/', $html) ?? $html;
        $html = preg_replace('#\ssrc=(["\'])//#i', ' src=$1https://', $html) ?? $html;

        return trim($html);
    }

    /**
     * Rebuild a single <img> tag from its raw attribute string.
     *
     * @param string $attrs Everything between "<img" and ">" of the original tag.
     *
     * @return string The rewritten tag, always starting with lowercase "<img".
     */
    private static function rewriteImgTag(string $attrs): string
    {
        // Detach an XHTML-style self-closing tail (" /" or "/") so that new
        // attributes are inserted before it rather than after the slash. A slash
        // glued to an unquoted value (src=a/) belongs to the value and is kept.
        $closingTail = '';
        if (preg_match('~(?:\s+|(?<=["\'])|^)/\s*$~', $attrs, $tail, PREG_OFFSET_CAPTURE) === 1) {
            $closingTail = $tail[0][0];
            $attrs = substr($attrs, 0, $tail[0][1]);
        }

        if (preg_match('/\sdata-src=(["\'])([^"\']+)\1/i', $attrs, $data)) {
            $srcAttr = ' src="' . $data[2] . '"';

            // The URL is inserted through a callback so that "$1" / "\1"
            // sequences inside it are never interpreted as backreferences.
            $replacements = 0;
            $rewritten = preg_replace_callback(
                '/\ssrc=(["\'])[^"\']*\1/i',
                static function () use ($srcAttr): string {
                    return $srcAttr;
                },
                $attrs,
                -1,
                $replacements
            );

            if ($rewritten !== null && $replacements > 0) {
                $attrs = $rewritten;
            } else {
                // No quoted src attribute to overwrite: add one.
                $attrs .= $srcAttr;
            }
        }

        if (! preg_match('/\sreferrerpolicy=/i', $attrs)) {
            $attrs .= ' referrerpolicy="no-referrer"';
        }

        return '<img' . $attrs . $closingTail . '>';
    }
}