You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

123 lines
3.5 KiB

2 weeks ago
<?php
namespace App\Services\Crawl;
use App\Models\CourseMedia;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Facades\Storage;
use Illuminate\Support\Str;
/**
 * Copies externally hosted images referenced by article HTML onto the local "public" disk,
 * so that CDN hotlink protection cannot stop them from being displayed.
 */
class NewsHtmlImageLocalizer
{
    /**
     * Rewrite the src of every <img> tag in the given HTML.
     *
     * The HTML is first passed through NewsContentHtml::normalize(). Each src is then
     * resolved with NewsContentHtml::resolveAssetUrl() and replaced by the local copy's URL
     * when the download succeeds, or by the resolved remote URL when it does not.
     *
     * @param string|null $html    HTML fragment to process; null or blank input is returned unchanged.
     * @param string|null $pageUrl URL of the page the HTML came from; used to resolve image URLs
     *                             and sent as the Referer header when downloading.
     *
     * @return string|null The rewritten HTML. If the regex engine fails (for example on invalid
     *                     UTF-8 with the /u modifier), the normalized HTML is returned as is.
     */
    public function localize(?string $html, ?string $pageUrl = null): ?string
    {
        if ($html === null || trim($html) === '') {
            return $html;
        }

        $html = NewsContentHtml::normalize($html, $pageUrl);

        return preg_replace_callback(
            '/<img\b([^>]*)>/iu',
            fn (array $m): string => '<img'.$this->rewriteSrcAttribute($m[1], $pageUrl).'>',
            $html
        ) ?? $html;
    }

    /**
     * Replace the src attribute inside one <img> tag's attribute string.
     *
     * @param string      $attrs   Everything between "<img" and ">".
     * @param string|null $pageUrl Page URL forwarded to URL resolution and to the download.
     *
     * @return string The attribute string, unchanged when it has no quoted, non-empty src.
     */
    private function rewriteSrcAttribute(string $attrs, ?string $pageUrl): string
    {
        if (! preg_match('/\ssrc=(["\'])([^"\']+)\1/i', $attrs, $srcMatch)) {
            return $attrs;
        }

        // Attribute values are HTML-encoded ("&amp;" stands for "&"); decode before treating
        // the value as a URL so the request is not sent with literal entities in it.
        $rawSrc = html_entity_decode($srcMatch[2], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $src = NewsContentHtml::resolveAssetUrl($rawSrc, $pageUrl);

        // Fall back to the resolved remote URL when the image could not be stored locally.
        $finalSrc = $this->downloadToPublic($src, $pageUrl) ?? $src;
        $replacement = ' src="'.htmlspecialchars($finalSrc, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8').'"';

        // A callback is used instead of a replacement string so that "$1" or "\1" sequences
        // occurring in the URL are not expanded as backreferences.
        return preg_replace_callback(
            '/\ssrc=(["\'])[^"\']*\1/i',
            static fn (array $unused): string => $replacement,
            $attrs
        ) ?? $attrs;
    }

    /**
     * Download a remote image and store it on the "public" disk.
     *
     * @param string      $url     Image URL; it is trimmed and resolved against $pageUrl.
     * @param string|null $pageUrl Page URL, sent as Referer when it is an http(s) URL.
     *
     * @return string|null URL of the stored copy as built by CourseMedia::publicAssetUrl(), or
     *                     null when the URL is not http(s), matches the skip list, the request
     *                     fails, the response is empty or HTML, or the file cannot be written.
     */
    protected function downloadToPublic(string $url, ?string $pageUrl = null): ?string
    {
        $url = NewsContentHtml::resolveAssetUrl(trim($url), $pageUrl);
        if ($url === '' || ! preg_match('#^https?://#i', $url)) {
            return null;
        }

        // Skip URLs containing a storage path or a loopback host name.
        if (Str::contains($url, ['/storage/', '127.0.0.1', 'localhost'])) {
            return null;
        }

        $headers = [
            'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
            'Accept' => 'image/*,*/*',
        ];
        if ($pageUrl && preg_match('#^https?://#i', $pageUrl)) {
            $headers['Referer'] = $pageUrl;
        }

        try {
            $response = Http::timeout(25)
                ->withHeaders($headers)
                ->get($url);
        } catch (\Throwable) {
            // Best effort: a failed download leaves the remote URL in place.
            return null;
        }

        if (! $response->successful()) {
            return null;
        }

        $body = $response->body();
        if ($body === '') {
            return null;
        }

        $contentType = (string) $response->header('Content-Type');

        // A successful response labelled as an HTML page is not an image (typically a
        // hotlink-protection or error page); storing it would produce a broken local file.
        if (preg_match('#^\s*text/html\b#i', $contentType)) {
            return null;
        }

        $ext = $this->guessExtension($url, $contentType);
        $name = 'news/content/'.date('Ym').'/'.Str::random(16).'.'.$ext;

        try {
            $stored = Storage::disk('public')->put($name, $body);
        } catch (\Throwable) {
            return null;
        }

        // put() reports failure with false; never hand out a URL for a file that was not written.
        if (! $stored) {
            return null;
        }

        return CourseMedia::publicAssetUrl($name);
    }

    /**
     * Choose a file extension for a downloaded image.
     *
     * The Content-Type header is checked first, then the extension in the URL (which may be
     * followed by a query string or fragment); matching is case-insensitive in both cases.
     *
     * @param string      $url         URL the image was downloaded from.
     * @param string|null $contentType Value of the Content-Type response header, if any.
     *
     * @return string One of "png", "jpg", "gif" or "webp"; "jpg" when nothing matches.
     */
    protected function guessExtension(string $url, ?string $contentType): string
    {
        $type = strtolower((string) $contentType);
        if ($type !== '') {
            $byType = [
                'png' => 'png',
                'jpeg' => 'jpg',
                'jpg' => 'jpg',
                'gif' => 'gif',
                'webp' => 'webp',
            ];
            foreach ($byType as $needle => $ext) {
                if (str_contains($type, $needle)) {
                    return $ext;
                }
            }
        }

        if (preg_match('~\.(png|jpe?g|gif|webp)(?:[?#]|$)~i', $url, $m)) {
            // Lowercase before comparing so that ".JPEG" is normalized to "jpg" as well.
            $ext = strtolower($m[1]);

            return $ext === 'jpeg' ? 'jpg' : $ext;
        }

        return 'jpg';
    }
}