|
|
|
|
<?php
|
|
|
|
|
|
|
|
|
|
namespace App\Services\Crawl;
|
|
|
|
|
|
|
|
|
|
use App\Models\CourseMedia;
|
|
|
|
|
use Illuminate\Support\Facades\Http;
|
|
|
|
|
use Illuminate\Support\Facades\Storage;
|
|
|
|
|
use Illuminate\Support\Str;
|
|
|
|
|
|
|
|
|
|
/**
 * Downloads externally hosted images referenced in article body HTML to the
 * local "public" disk, so that CDN hotlink protection cannot stop them from
 * being displayed.
 */
class NewsHtmlImageLocalizer
{
    /**
     * Rewrite every <img> tag in the given HTML so that its src points at a
     * locally stored copy of the image.
     *
     * The HTML is first passed through NewsContentHtml::normalize(). For each
     * <img> tag with a quoted src attribute, the image is downloaded; when the
     * download fails (or is skipped) the src is still rewritten to the URL
     * returned by NewsContentHtml::resolveAssetUrl(). Tags without a quoted
     * src attribute are left untouched.
     *
     * @param  string|null  $html  HTML fragment; null or blank input is returned unchanged.
     * @param  string|null  $pageUrl  URL of the page the HTML came from, used to
     *                                resolve asset URLs and as the Referer header.
     * @return string|null  The rewritten HTML, or the normalized HTML if the regex engine fails.
     */
    public function localize(?string $html, ?string $pageUrl = null): ?string
    {
        if ($html === null || trim($html) === '') {
            return $html;
        }

        $html = NewsContentHtml::normalize($html, $pageUrl);

        return preg_replace_callback(
            '/<img\b([^>]*)>/iu',
            fn (array $m): string => '<img'.$this->localizeImgAttributes($m[1], $pageUrl).'>',
            $html
        ) ?? $html;
    }

    /**
     * Return the attribute string of a single <img> tag with its src swapped
     * for the local copy, or for the resolved remote URL when no local copy
     * could be produced.
     */
    private function localizeImgAttributes(string $attrs, ?string $pageUrl): string
    {
        if (! preg_match('/\ssrc=(["\'])([^"\']+)\1/i', $attrs, $srcMatch)) {
            return $attrs;
        }

        $src = NewsContentHtml::resolveAssetUrl($srcMatch[2], $pageUrl);
        $target = $this->downloadToPublic($src, $pageUrl) ?? $src;

        return $this->replaceSrc($attrs, (string) $target);
    }

    /**
     * Replace every quoted src attribute in an attribute string with the given URL.
     */
    private function replaceSrc(string $attrs, string $url): string
    {
        // Escape only the characters that could break out of the attribute or
        // the tag, so ordinary URLs are emitted exactly as before.
        $attribute = ' src="'.str_replace(['"', '<', '>'], ['&quot;', '&lt;', '&gt;'], $url).'"';

        // A callback is used instead of a replacement string because
        // preg_replace() would interpret "$1" or "\1" inside the URL as
        // backreferences and corrupt it.
        return preg_replace_callback(
            '/\ssrc=(["\'])[^"\']*\1/i',
            static fn (array $m): string => $attribute,
            $attrs
        ) ?? $attrs;
    }

    /**
     * Download a remote image and store it on the "public" disk.
     *
     * @param  string  $url  Image URL; resolved again against $pageUrl before use.
     * @param  string|null  $pageUrl  Sent as the Referer header when it is an http(s) URL.
     * @return string|null  Public URL of the stored file, or null when the URL is
     *                      not http(s), looks local, the request fails, the body
     *                      is empty, or the file could not be written.
     */
    protected function downloadToPublic(string $url, ?string $pageUrl = null): ?string
    {
        $url = NewsContentHtml::resolveAssetUrl(trim($url), $pageUrl);
        if ($url === '' || ! preg_match('#^https?://#i', $url)) {
            return null;
        }

        // Skip anything that already looks local.
        // NOTE(review): this is a plain substring test on the whole URL, so it
        // is neither a complete SSRF guard nor immune to false positives on
        // remote URLs that merely contain "/storage/".
        if (Str::contains($url, ['/storage/', '127.0.0.1', 'localhost'])) {
            return null;
        }

        $headers = [
            'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
            'Accept' => 'image/*,*/*',
        ];
        if ($pageUrl && preg_match('#^https?://#i', $pageUrl)) {
            $headers['Referer'] = $pageUrl;
        }

        try {
            $response = Http::timeout(25)
                ->withHeaders($headers)
                ->get($url);
        } catch (\Throwable) {
            // Best effort: a failed download just keeps the remote URL.
            return null;
        }

        if (! $response->successful()) {
            return null;
        }

        $body = $response->body();
        if ($body === '') {
            return null;
        }

        $ext = $this->guessExtension($url, $response->header('Content-Type'));
        $name = 'news/content/'.date('Ym').'/'.Str::random(16).'.'.$ext;

        // Never hand out a URL for a file that was not actually written.
        if (! Storage::disk('public')->put($name, $body)) {
            return null;
        }

        return CourseMedia::publicAssetUrl($name);
    }

    /**
     * Pick a file extension for a downloaded image.
     *
     * The Content-Type header takes precedence; otherwise the extension at
     * the end of the URL path (before any query string or fragment) is used.
     * Both checks are case-insensitive.
     *
     * @return string  One of "png", "jpg", "gif" or "webp"; "jpg" when nothing matches.
     */
    protected function guessExtension(string $url, ?string $contentType): string
    {
        $type = strtolower((string) $contentType);
        $byContentType = [
            'png' => 'png',
            'jpeg' => 'jpg',
            'jpg' => 'jpg',
            'gif' => 'gif',
            'webp' => 'webp',
        ];
        foreach ($byContentType as $needle => $ext) {
            if (str_contains($type, $needle)) {
                return $ext;
            }
        }

        if (preg_match('~\.(png|jpe?g|gif|webp)(?:[?#]|$)~i', $url, $m)) {
            // Lowercase before comparing so ".JPEG" is folded to "jpg" too.
            $ext = strtolower($m[1]);

            return $ext === 'jpeg' ? 'jpg' : $ext;
        }

        return 'jpg';
    }
}
|