You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
|
|
<?php
|
|
|
|
|
|
|
|
|
|
|
|
namespace App\Services\Crawl;
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Normalizes PEdaily (pedaily.cn) article body HTML: fills in usable image
 * addresses so the content works in the admin editor and on the public site.
 */
class PedailyContentNormalizer
{
    /**
     * Normalize a crawled article body.
     *
     * Steps, in order:
     *  1. Decode HTML entities in the whole document.
     *  2. For every <img> tag: copy a quoted data-src value into src
     *     (replacing an existing quoted src, or appending one) and add
     *     referrerpolicy="no-referrer" unless a referrerpolicy is present.
     *  3. Turn quoted root-relative src values ("/x") into
     *     https://www.pedaily.cn/x and protocol-relative ones ("//host/x")
     *     into https://host/x. This step runs on every quoted src attribute
     *     in the document, not only on <img>.
     *
     * @param string|null $html Raw article HTML.
     *
     * @return string|null Trimmed, normalized HTML; null when the input is
     *                     null or contains only whitespace.
     */
    public static function normalize(?string $html): ?string
    {
        if ($html === null || trim($html) === '') {
            return null;
        }

        // NOTE(review): decoding the whole document also turns escaped text
        // such as &lt; or &quot; into live markup. Presumably the crawled
        // source is entity-encoded as a whole - confirm against the crawler.
        $html = html_entity_decode($html, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // preg_replace_callback() returns null on PCRE failure (for example
        // invalid UTF-8 under the /u flag); keep the decoded HTML in that case.
        $html = preg_replace_callback(
            '/<img\b([^>]*)>/iu',
            static function (array $m): string {
                return self::rebuildImgTag($m[1]);
            },
            $html
        ) ?? $html;

        // Root-relative first ("/x" but not "//x"), then protocol-relative.
        $html = preg_replace('#\ssrc=(["\'])/(?!/)#', ' src=$1https://www.pedaily.cn/', $html) ?? $html;
        $html = preg_replace('#\ssrc=(["\'])//#', ' src=$1https://', $html) ?? $html;

        return trim($html);
    }

    /**
     * Rebuild one <img> tag from its raw attribute text.
     *
     * @param string $attrs Everything between "<img" and ">" of the tag.
     *
     * @return string The complete rewritten tag.
     */
    private static function rebuildImgTag(string $attrs): string
    {
        // Peel off an XHTML-style closing slash so appended attributes land
        // before it. The slash only counts when it follows whitespace, a quote
        // or nothing at all, so an unquoted value such as src=/img/ is kept.
        $closing = '';
        if (preg_match('~\s*(?<![^"\'\s])/\s*$~', $attrs, $tail) === 1) {
            $closing = $tail[0];
            $attrs = substr($attrs, 0, -strlen($closing));
        }

        if (preg_match('/\sdata-src=(["\'])([^"\']+)\1/i', $attrs, $data)) {
            $realSrc = $data[2];
            $srcAttr = ' src="'.$realSrc.'"';

            if (preg_match('/\ssrc=(["\'])([^"\']*)\1/i', $attrs)) {
                // A callback is used so that "$1" or "\1" inside the URL is
                // inserted literally instead of being expanded as a
                // backreference by preg_replace().
                $attrs = preg_replace_callback(
                    '/\ssrc=(["\'])[^"\']*\1/i',
                    static function () use ($srcAttr): string {
                        return $srcAttr;
                    },
                    $attrs
                ) ?? $attrs;
            } else {
                $attrs .= $srcAttr;
            }
        }

        if (! preg_match('/\sreferrerpolicy=/i', $attrs)) {
            $attrs .= ' referrerpolicy="no-referrer"';
        }

        return '<img'.$attrs.$closing.'>';
    }
}
|