You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
109 lines
3.0 KiB
109 lines
3.0 KiB
|
2 weeks ago
|
<?php
|
||
|
|
|
||
|
|
namespace App\Services\Crawl;
|
||
|
|
|
||
|
|
use App\Models\CourseMedia;
|
||
|
|
use Illuminate\Support\Facades\Http;
|
||
|
|
use Illuminate\Support\Facades\Storage;
|
||
|
|
use Illuminate\Support\Str;
|
||
|
|
|
||
|
|
/**
 * Downloads external images referenced by article HTML onto the local "public"
 * disk and rewrites the <img> tags to point at the local copies, so that CDN
 * hotlink protection cannot prevent the images from being displayed.
 */
class NewsHtmlImageLocalizer
{
    /**
     * Rewrites every <img> tag whose quoted src can be downloaded.
     *
     * Tags without a quoted src attribute, and tags whose image cannot be
     * downloaded, keep their original attributes. Each distinct URL is fetched
     * at most once per call, even when it appears in several tags.
     *
     * @param string|null $html HTML fragment; null or blank input is returned as-is.
     *
     * @return string|null The rewritten HTML, or the untouched input when the
     *                     regex engine fails (for example on invalid UTF-8).
     */
    public function localize(?string $html): ?string
    {
        if ($html === null || trim($html) === '') {
            return $html;
        }

        // Per-call cache: decoded remote URL => local URL (null when the download failed).
        $resolved = [];

        return preg_replace_callback(
            '/<img\b([^>]*)>/iu',
            function (array $m) use (&$resolved): string {
                $attrs = $m[1];
                if (! preg_match('/\ssrc=(["\'])([^"\']+)\1/i', $attrs, $srcMatch)) {
                    return '<img'.$attrs.'>';
                }

                // Attribute values are entity-encoded in HTML ("&amp;" in query
                // strings); the HTTP client needs the decoded URL.
                $remote = html_entity_decode($srcMatch[2], ENT_QUOTES | ENT_HTML5, 'UTF-8');

                if (! array_key_exists($remote, $resolved)) {
                    $resolved[$remote] = $this->downloadToPublic($remote);
                }

                $local = $resolved[$remote];
                if ($local === null) {
                    return '<img'.$attrs.'>';
                }

                $srcAttr = ' src="'.htmlspecialchars($local, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8').'"';

                // A callback is used so that "$1" or "\1" sequences inside the
                // local URL are never interpreted as regex backreferences.
                $attrs = preg_replace_callback(
                    '/\ssrc=(["\'])[^"\']*\1/i',
                    static fn (array $unused): string => $srcAttr,
                    $attrs
                ) ?? $attrs;

                return '<img'.$attrs.'>';
            },
            $html
        ) ?? $html;
    }

    /**
     * Fetches a remote image and stores it on the "public" disk.
     *
     * @param string $url Absolute http(s) URL of the image.
     *
     * @return string|null Public URL of the stored copy, or null when the URL is
     *                     not eligible, the request fails, the body is empty, or
     *                     the file could not be written.
     */
    protected function downloadToPublic(string $url): ?string
    {
        $url = trim($url);
        if ($url === '' || ! preg_match('#^https?://#i', $url)) {
            return null;
        }

        // Skip URLs that look local already.
        // NOTE(review): this is a substring check on the whole URL, not a host
        // check, so it is not a complete guard against requests to internal hosts.
        if (Str::contains($url, ['/storage/', '127.0.0.1', 'localhost'])) {
            return null;
        }

        try {
            $response = Http::timeout(25)
                ->withHeaders([
                    'User-Agent' => 'Mozilla/5.0 (compatible; SlakeCrawler/1.0)',
                    'Accept' => 'image/*,*/*',
                ])
                ->get($url);
        } catch (\Throwable) {
            // Best effort: the caller keeps the original remote src.
            return null;
        }

        if (! $response->successful()) {
            return null;
        }

        $body = $response->body();
        if ($body === '') {
            return null;
        }

        $ext = $this->guessExtension($url, $response->header('Content-Type'));
        $name = 'news/content/'.date('Ym').'/'.Str::random(16).'.'.$ext;

        // Do not hand out a URL for a file that was never written.
        if (Storage::disk('public')->put($name, $body) === false) {
            return null;
        }

        return CourseMedia::publicAssetUrl($name);
    }

    /**
     * Picks a file extension for a downloaded image.
     *
     * The Content-Type header is checked first, then the extension in the URL
     * path (before any query string or fragment); both checks ignore case.
     *
     * @param string      $url         URL the image was fetched from.
     * @param string|null $contentType Value of the Content-Type response header, if any.
     *
     * @return string One of "png", "jpg", "gif" or "webp"; "jpg" when nothing matches.
     */
    protected function guessExtension(string $url, ?string $contentType): string
    {
        $type = strtolower((string) $contentType);
        if ($type !== '') {
            // Order matters: the first needle found in the header wins.
            $byType = [
                'png' => 'png',
                'jpeg' => 'jpg',
                'jpg' => 'jpg',
                'gif' => 'gif',
                'webp' => 'webp',
            ];
            foreach ($byType as $needle => $ext) {
                if (str_contains($type, $needle)) {
                    return $ext;
                }
            }
        }

        if (preg_match('~\.(png|jpe?g|gif|webp)(?:[?#]|$)~i', $url, $m)) {
            // Lower-case before comparing so that ".JPEG" is normalised to "jpg" too.
            $ext = strtolower($m[1]);

            return $ext === 'jpeg' ? 'jpg' : $ext;
        }

        return 'jpg';
    }
}
|