|
|
<?php
|
|
|
|
|
|
namespace App\Services\Crawl\Adapters;
|
|
|
|
|
|
use App\Models\CrawlSource;
|
|
|
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
|
|
|
use App\Services\Crawl\CrawlAuthorParser;
|
|
|
use App\Services\Crawl\CrawlItemDto;
|
|
|
use App\Services\Crawl\CrawlKeywordParser;
|
|
|
use Illuminate\Support\Facades\Http;
|
|
|
use Illuminate\Support\Str;
|
|
|
|
|
|
/**
 * Generic department / faculty listing page adapter.
 *
 * Scans the page for e-mail addresses, inspects the text surrounding each
 * address, optionally filters by keyword, and guesses a name and an
 * affiliation from that surrounding text.
 */
class FacultyListHtmlAdapter implements CrawlerAdapterInterface
{
    /** Number of bytes of HTML inspected on each side of an e-mail match. */
    private const CONTEXT_RADIUS = 400;

    /**
     * Download the page and return at most `max_results` extracted items.
     *
     * Reads two entries from $params:
     *  - `keyword`     raw keyword string, handed to CrawlKeywordParser::parse()
     *  - `max_results` clamped to the range 1..100, defaults to 30
     *
     * @return list<CrawlItemDto>
     *
     * @throws \RuntimeException when the HTTP response is not successful
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        $maxResults = min(100, max(1, (int) ($params['max_results'] ?? 30)));

        $response = Http::timeout(30)
            ->withHeaders(['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'])
            ->get($requestUrl);

        if (! $response->successful()) {
            throw new \RuntimeException('页面请求失败(HTTP '.$response->status().')');
        }

        $items = $this->extractFromHtml((string) $response->body(), $keywords, $requestUrl);

        return array_slice($items, 0, $maxResults);
    }

    /**
     * Build one item per distinct e-mail address found in the HTML.
     *
     * An address is skipped when it does not normalise, was already emitted,
     * has no keyword hit in its surrounding text (only when keywords are
     * given), or no name can be guessed for it.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromHtml(string $html, array $keywords, string $sourceUrl): array
    {
        if (! preg_match_all(
            '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
            $html,
            $emailMatches,
            PREG_OFFSET_CAPTURE
        )) {
            return [];
        }

        $items = [];
        $seen = [];
        // Loop-invariant: the same keyword label is attached to every item.
        $keywordLabel = implode(' ', $keywords);

        // The pattern has no `u` modifier, so captured offsets are byte offsets.
        foreach ($emailMatches[1] as [$rawEmail, $offset]) {
            $email = CrawlAuthorParser::normalizeEmail($rawEmail);
            if (! $email || isset($seen[$email])) {
                continue;
            }

            $plain = $this->plainTextAround($html, (int) $offset);

            if ($keywords !== [] && ! $this->containsAnyKeyword($plain, $keywords)) {
                continue;
            }

            $name = $this->guessName($plain, $email);
            if ($name === '') {
                continue;
            }

            $affiliation = $this->guessAffiliation($plain);
            // Resolved once and reused below; previously computed three times per item.
            // NOTE(review): assumes universityFromAffiliation() is side-effect free — confirm.
            $university = CrawlAuthorParser::universityFromAffiliation($affiliation);

            // Marked as seen only once an item is actually produced, so a later
            // occurrence of the same address can still qualify if this one did not.
            $seen[$email] = true;

            $author = [
                'name' => $name,
                'email' => $email,
                'affiliation' => $affiliation,
                'university_name' => $university,
            ];

            $items[] = new CrawlItemDto(
                externalId: 'faculty:'.md5($email),
                title: $name,
                canonicalUrl: $sourceUrl,
                authors: $name,
                summary: Str::limit($plain, 300),
                schoolName: $university,
                extra: [
                    'platform' => 'faculty_html',
                    'lead_author' => $author,
                    'keyword' => $keywordLabel,
                ],
                authorsParsed: [$author],
            );
        }

        return $items;
    }

    /**
     * Return the tag-stripped, entity-decoded, whitespace-collapsed text of
     * the HTML window centred on the given byte offset.
     *
     * NOTE(review): input that is not valid UTF-8 still yields '' because the
     * `u`-modified pattern rejects it — confirm whether non-UTF-8 pages occur.
     */
    private function plainTextAround(string $html, int $offset): string
    {
        $start = max(0, $offset - self::CONTEXT_RADIUS);

        // mb_strcut() works on byte offsets but never splits a multibyte
        // character. A plain substr() could cut one in half, which makes the
        // `u`-modified preg_replace() below return null and silently empties
        // the whole context.
        $window = mb_strcut($html, $start, self::CONTEXT_RADIUS * 2, 'UTF-8');

        $text = html_entity_decode(strip_tags($window), ENT_QUOTES | ENT_HTML5, 'UTF-8');

        return preg_replace('/\s+/u', ' ', $text) ?? '';
    }

    /**
     * Case-insensitive check whether any non-empty keyword occurs in the text.
     *
     * @param list<string> $keywords
     */
    private function containsAnyKeyword(string $plain, array $keywords): bool
    {
        foreach ($keywords as $keyword) {
            // mb_stripos() folds case for non-ASCII letters too, unlike stripos().
            if ($keyword !== '' && mb_stripos($plain, $keyword, 0, 'UTF-8') !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Guess a person's name from the context text.
     *
     * Uses the first run of 2-4 Han characters. When a 2-4 character prefix of
     * that run is followed (optionally after whitespace) by an academic title,
     * only the prefix is returned, so the title is not reported as part of the
     * name. Without any Han run, the name is derived from the e-mail local
     * part with '.', '_' and '-' turned into spaces and title-cased.
     */
    protected function guessName(string $plain, string $email): string
    {
        // Branch reset (?|...) keeps the name in group 1 for both alternatives.
        // The lazy, title-terminated alternative is tried first at each
        // position; the greedy alternative reproduces the plain "first Han
        // run" behaviour when no title follows.
        $pattern = '/(?|([\x{4e00}-\x{9fff}]{2,4}?)\s*(?:教授|副教授|讲师|研究员|博士|老师)'
            .'|([\x{4e00}-\x{9fff}]{2,4}))/u';

        if (preg_match($pattern, $plain, $m)) {
            return trim($m[1]);
        }

        $local = strstr($email, '@', true) ?: '';
        $local = str_replace(['.', '_', '-'], ' ', $local);

        return Str::title(trim($local));
    }

    /**
     * Guess an institution name from the context text.
     *
     * Returns the first run of 2-40 Han characters, ASCII letters or
     * whitespace that ends in an institution suffix, passed through
     * CrawlAuthorParser::cleanText(); null when nothing matches.
     */
    protected function guessAffiliation(string $plain): ?string
    {
        $pattern = '/((?:[\x{4e00}-\x{9fff}A-Za-z\s]{2,40}(?:大学|学院|研究院|研究所|University|College)))/u';

        if (! preg_match($pattern, $plain, $m)) {
            return null;
        }

        return CrawlAuthorParser::cleanText($m[1]);
    }
}
|