You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

644 lines
21 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
namespace App\Services\Crawl\Adapters;
use App\Models\CrawlSource;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;
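/**
* Generic adapter for department / faculty list pages: entries found via e-mail addresses are preferred;
* when a page has no e-mails, list cards such as those on tsites are parsed (name, affiliation, academic title, homepage).
*/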
class FacultyListHtmlAdapter implements CrawlerAdapterInterface
{
/**
 * Crawl a faculty list page, plus its follow-up pages, into a de-duplicated item list.
 *
 * Keys read from $params: 'keyword' (passed to CrawlKeywordParser::parse), 'max_results'
 * (clamped to 1..500, default 30) and 'max_pages' (clamped to 1..50, default 1).
 * $source is part of the adapter signature but is not read by this method.
 *
 * @param array<string, mixed> $params
 * @return list<CrawlItemDto>
 *
 * @throws \RuntimeException when a list page answers with a non-successful status
 */
public function fetch(string $requestUrl, CrawlSource $source, array $params): array
{
    $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
    $resultCap = min(500, max(1, (int) ($params['max_results'] ?? 30)));
    $pageCap = min(50, max(1, (int) ($params['max_pages'] ?? 1)));

    $listUrl = $this->normalizeRequestUrl($requestUrl);
    $firstPageHtml = $this->fetchHtml($listUrl);
    $lastPage = min($pageCap, $this->detectTotalPages($firstPageHtml));

    $collected = [];
    $knownIds = [];
    $pageNo = 1;

    // Stop before requesting another page once the result cap has been reached.
    while ($pageNo <= $lastPage && count($collected) < $resultCap) {
        // Page 1 was already downloaded for page-count detection; reuse it.
        $pageHtml = $pageNo === 1
            ? $firstPageHtml
            : $this->fetchHtml($this->buildPageUrl($listUrl, $pageNo, $firstPageHtml));

        foreach ($this->extractFromHtml($pageHtml, $keywords, $requestUrl) as $candidate) {
            if (isset($knownIds[$candidate->externalId])) {
                continue;
            }
            $knownIds[$candidate->externalId] = true;
            $collected[] = $candidate;
            if (count($collected) >= $resultCap) {
                break;
            }
        }

        $pageNo++;
    }

    return $this->enrichEmailsFromProfilePages($collected);
}
/**
 * Fill in missing e-mail addresses by downloading each item's profile page.
 *
 * Items that already carry an address, or that have no canonical URL, pass through
 * untouched. The remaining profile pages are requested concurrently, in batches of
 * `crawl.faculty.profile_enrich_pool_size` (clamped to 1..12). A profile request that
 * fails for any reason is not fatal: the item is returned unchanged.
 *
 * @param list<CrawlItemDto> $items
 * @return list<CrawlItemDto> the same items, in the same order as $items
 */
protected function enrichEmailsFromProfilePages(array $items): array
{
    if (! config('crawl.faculty.profile_email_enrich_enabled', true)) {
        return $items;
    }

    $poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 6)));
    $timeout = max(8, (int) config('crawl.faculty.profile_http_timeout_seconds', 20));
    $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

    $enriched = [];
    foreach (array_chunk($items, $poolSize) as $chunk) {
        // Keyed by position inside the chunk, so two items can never collide on a pool alias.
        $pending = [];
        foreach ($chunk as $index => $item) {
            if (! $this->itemHasEmail($item) && $item->canonicalUrl) {
                $pending[$index] = $item;
            }
        }

        $responses = [];
        if ($pending !== []) {
            $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($pending, $headers, $timeout) {
                foreach ($pending as $index => $item) {
                    $pool->as((string) $index)
                        ->timeout($timeout)
                        ->withHeaders($headers)
                        ->get($item->canonicalUrl);
                }
            });
        }

        // Walk the chunk itself (not $pending) so the output keeps the input order.
        foreach ($chunk as $index => $item) {
            $response = $responses[$index] ?? null;
            // A pooled request that failed (timeout, DNS, refused connection, ...) is handed back
            // as an exception object instead of a Response, so check the type before using it.
            if ($response instanceof \Illuminate\Http\Client\Response && $response->successful()) {
                $email = $this->extractEmailFromProfileHtml((string) $response->body());
                if ($email) {
                    $item = $this->applyEmailToItem($item, $email);
                }
            }
            $enriched[] = $item;
        }
    }

    return $enriched;
}
/**
 * Tell whether an item already carries a usable e-mail address.
 *
 * `extra['lead_author']['email']` is checked first, then each entry of `authorsParsed`.
 * An address counts when CrawlAuthorParser::normalizeEmail() returns a truthy value for it.
 */
protected function itemHasEmail(CrawlItemDto $item): bool
{
    $leadAuthor = $item->extra['lead_author'] ?? null;
    if (is_array($leadAuthor)) {
        $leadEmail = CrawlAuthorParser::normalizeEmail($leadAuthor['email'] ?? null);
        if ($leadEmail) {
            return true;
        }
    }

    foreach ($item->authorsParsed as $parsedAuthor) {
        $authorEmail = CrawlAuthorParser::normalizeEmail($parsedAuthor['email'] ?? null);
        if (! $authorEmail) {
            continue;
        }

        return true;
    }

    return false;
}
/**
 * Return a copy of $item with $email recorded on it.
 *
 * The address (normalized when CrawlAuthorParser::normalizeEmail() yields a value, raw
 * otherwise) is written to `extra['lead_author']['email']` and to the first entry of
 * `authorsParsed`. When `authorsParsed` is empty, a single author entry is created from
 * the item title and the lead-author data. All other fields are copied unchanged.
 */
protected function applyEmailToItem(CrawlItemDto $item, string $email): CrawlItemDto
{
    $address = CrawlAuthorParser::normalizeEmail($email) ?? $email;

    $extra = $item->extra;
    $leadAuthor = $extra['lead_author'] ?? null;
    if (! is_array($leadAuthor)) {
        $leadAuthor = [];
    }
    $leadAuthor['email'] = $address;
    $extra['lead_author'] = $leadAuthor;

    $parsedAuthors = $item->authorsParsed;
    if ($parsedAuthors !== []) {
        $parsedAuthors[0]['email'] = $address;
    } else {
        $parsedAuthors = [
            [
                'name' => $item->title,
                'email' => $address,
                'affiliation' => $leadAuthor['affiliation'] ?? $leadAuthor['college'] ?? null,
                'university_name' => $leadAuthor['university_name'] ?? $item->schoolName,
            ],
        ];
    }

    return new CrawlItemDto(
        externalId: $item->externalId,
        title: $item->title,
        canonicalUrl: $item->canonicalUrl,
        authors: $item->authors,
        summary: $item->summary,
        publishedAt: $item->publishedAt,
        schoolName: $item->schoolName,
        section: $item->section,
        contentHtml: $item->contentHtml,
        extra: $extra,
        authorsParsed: $parsedAuthors,
    );
}
/**
 * Pull the most plausible personal e-mail address out of a profile page.
 *
 * Strategy, in order:
 *  1. an address that follows one of the labels 电子邮箱 / 电子信箱 / E-mail / 邮箱;
 *  2. otherwise, among all addresses on the page, the first on an .edu.cn or .edu domain;
 *  3. otherwise the first address found.
 * Addresses rejected by CrawlAuthorParser::normalizeEmail() or isNoiseEmail() are skipped.
 *
 * @return string|null the normalized address, or null when the page has no usable one
 */
protected function extractEmailFromProfileHtml(string $html): ?string
{
    $address = '([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})';

    // What may sit between a label and its address: an optional ASCII or full-width (U+FF1A)
    // colon, then any run of whitespace, &nbsp; entities or tags (</strong>, </td><td>, <span>...).
    // Possessive quantifiers keep a failed match from backtracking; none of the skipped
    // tokens can begin an address, so nothing is lost by not backtracking.
    $gap = '\s*+[:\x{FF1A}]?(?:\s++|&nbsp;|<[^>]*+>)*+';

    // Most specific labels first; 'E-?mail' is a regex fragment covering "Email" and "E-mail".
    $labels = ['电子邮箱', '电子信箱', 'E-?mail', '邮箱'];
    foreach ($labels as $label) {
        if (preg_match('/'.$label.$gap.$address.'/iu', $html, $match)) {
            $email = CrawlAuthorParser::normalizeEmail($match[1]);
            if ($email && ! $this->isNoiseEmail($email)) {
                return $email;
            }
        }
    }

    // No labelled address: fall back to every address-looking token on the page.
    $candidates = [];
    if (preg_match_all('#'.$address.'#', $html, $emailMatches)) {
        foreach ($emailMatches[1] as $raw) {
            $email = CrawlAuthorParser::normalizeEmail($raw);
            if ($email && ! $this->isNoiseEmail($email)) {
                $candidates[] = $email;
            }
        }
    }
    if ($candidates === []) {
        return null;
    }

    $candidates = array_values(array_unique($candidates));
    // Prefer an academic domain over whatever happens to appear first.
    foreach ($candidates as $email) {
        if (str_ends_with($email, '.edu.cn') || str_ends_with($email, '.edu')) {
            return $email;
        }
    }

    return $candidates[0];
}
/**
 * Decide whether an address-looking string found on a page should be ignored.
 *
 * Two kinds of noise are rejected:
 *  - role mailboxes (noreply@, admin@, webmaster@, ...), which belong to a site rather than a person;
 *  - asset file names that merely look like addresses, e.g. the retina image "logo@2x.png",
 *    which a generic "local@domain.tld" pattern matches inside HTML.
 */
protected function isNoiseEmail(string $email): bool
{
    $isRoleMailbox = preg_match(
        '/^(noreply|no-reply|admin|webmaster|postmaster|root|support|service|info|contact)@/i',
        $email,
    ) === 1;
    if ($isRoleMailbox) {
        return true;
    }

    // None of these endings is a real top-level domain, so rejecting them cannot drop a valid address.
    return preg_match('/\.(?:png|jpe?g|gif|svg|webp|bmp|ico|css|js)$/i', $email) === 1;
}
/**
 * Download one HTML page with the crawler's User-Agent and a 30 second timeout.
 *
 * @return string the response body
 *
 * @throws \RuntimeException when the response status is not successful
 */
protected function fetchHtml(string $url): string
{
    $response = Http::timeout(30)
        ->withHeaders(['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'])
        ->get($url);

    if (! $response->successful()) {
        // Separate status and URL explicitly; they were previously fused ("HTTP 404https://...").
        throw new \RuntimeException('页面请求失败: HTTP '.$response->status().', '.$url);
    }

    return (string) $response->body();
}
/**
 * Work out how many list pages exist, judging from the first page's HTML.
 *
 * An explicit `totalpage=N` marker wins. Otherwise the "共 N 条" record count is divided
 * by the number of `<div class="name">` cards inside the `div.list > ul` block.
 * Returns 1 whenever neither source yields a usable number.
 */
protected function detectTotalPages(string $html): int
{
    if (preg_match('/totalpage=(\d+)/i', $html, $explicit) === 1) {
        return max(1, (int) $explicit[1]);
    }

    if (preg_match('/共\s*(\d+)\s*条/u', $html, $recordCount) !== 1) {
        return 1;
    }

    if (preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $listBlock) !== 1) {
        return 1;
    }

    // preg_match_all() yields false on error; the cast turns that into 0 cards.
    $cardsPerPage = (int) preg_match_all('/<div\s+class="name">/u', $listBlock[1]);
    if ($cardsPerPage <= 0) {
        return 1;
    }

    $totalRecords = (int) $recordCount[1];

    return max(1, (int) ceil($totalRecords / $cardsPerPage));
}
/**
 * Build the URL of list page number $page from the first page's URL.
 *
 * The existing query string is kept, `PAGENUM` is set to $page, and `totalpage` is copied
 * from the first page's HTML when a `totalpage=N` marker is present there. User info and
 * fragment are not carried over. A URL without scheme or host is returned unchanged.
 */
protected function buildPageUrl(string $baseUrl, int $page, string $firstPageHtml): string
{
    $components = parse_url($baseUrl);
    $usable = is_array($components) && ! empty($components['scheme']) && ! empty($components['host']);
    if (! $usable) {
        return $baseUrl;
    }

    $params = [];
    parse_str((string) ($components['query'] ?? ''), $params);
    $params['PAGENUM'] = (string) $page;
    if (preg_match('/totalpage=(\d+)/i', $firstPageHtml, $found)) {
        $params['totalpage'] = $found[1];
    }

    $port = empty($components['port']) ? '' : ':'.$components['port'];
    $path = $components['path'] ?? '/';
    $pageUrl = $components['scheme'].'://'.$components['host'].$port.$path;

    return $params === [] ? $pageUrl : $pageUrl.'?'.http_build_query($params);
}
/**
 * Rebuild a URL from scheme, host, optional port, path and query only.
 *
 * User info and fragment are dropped, and a missing path becomes "/".
 * A URL that lacks a scheme or a host is returned unchanged.
 */
protected function normalizeRequestUrl(string $url): string
{
    $pieces = parse_url($url);
    if (! is_array($pieces)) {
        return $url;
    }
    if (empty($pieces['scheme']) || empty($pieces['host'])) {
        return $url;
    }

    $port = empty($pieces['port']) ? '' : ':'.$pieces['port'];
    $query = empty($pieces['query']) ? '' : '?'.$pieces['query'];

    return $pieces['scheme'].'://'.$pieces['host'].$port.($pieces['path'] ?? '/').$query;
}
/**
 * Extract faculty items from one list page.
 *
 * E-mail based extraction is tried first; the structured card parser is used only
 * when the e-mail pass produced nothing.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromHtml(string $html, array $keywords, string $sourceUrl): array
{
    $byEmail = $this->extractFromEmailBlocks($html, $keywords, $sourceUrl);

    return $byEmail === []
        ? $this->extractFromStructuredFacultyList($html, $keywords, $sourceUrl)
        : $byEmail;
}
/**
 * Build one item per distinct e-mail address found on the page.
 *
 * For every address, a context window of up to 800 bytes around it (starting up to 400 bytes
 * before it) is converted to plain text. That text is matched against the keywords and used to
 * guess the person's name and affiliation. Addresses whose context does not match the keywords,
 * or for which no name can be guessed, are skipped.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromEmailBlocks(string $html, array $keywords, string $sourceUrl): array
{
    if (! preg_match_all(
        '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
        $html,
        $emailMatches,
        PREG_OFFSET_CAPTURE
    )) {
        return [];
    }

    $items = [];
    $seen = [];

    // The page-level university guess scans the whole document, so resolve it lazily and only once.
    $pageUniversity = null;
    $pageUniversityResolved = false;

    foreach ($emailMatches[1] as [$rawEmail, $offset]) {
        $email = CrawlAuthorParser::normalizeEmail($rawEmail);
        if (! $email || isset($seen[$email])) {
            continue;
        }

        // $offset is a BYTE offset. A plain substr() window can begin or end in the middle of a
        // multi-byte character; the /u regexes downstream then fail on the malformed text and the
        // whole context collapses to ''. mb_strcut() takes byte offsets and snaps both ends to
        // character boundaries.
        $window = mb_strcut($html, max(0, (int) $offset - 400), 800, 'UTF-8');
        $plain = $this->htmlToPlain($window);
        if (! $this->matchesKeywords($plain, $keywords)) {
            continue;
        }

        $name = $this->guessName($plain, $email);
        if ($name === '') {
            continue;
        }

        $affiliation = $this->guessAffiliation($plain);

        $universityName = CrawlAuthorParser::universityFromAffiliation($affiliation);
        if ($universityName === null) {
            if (! $pageUniversityResolved) {
                $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
                $pageUniversityResolved = true;
            }
            $universityName = $pageUniversity;
        }

        $seen[$email] = true;
        $items[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($email),
            name: $name,
            profileUrl: $sourceUrl,
            email: $email,
            affiliation: $affiliation,
            universityName: $universityName,
            summary: Str::limit($plain, 300),
            keywords: $keywords,
            academicTitle: null,
            platform: 'faculty_html',
        );
    }

    return $items;
}
/**
 * Parse the card layout used by tsites "CollegeTeacherList" pages (e.g. Shanghai Jiao Tong
 * University): div.list > ul > li, one card per person.
 *
 * Each card must contain a `<div class="name">` that looks like a person name. The profile
 * link comes from the card's first `<a href>`, or, failing that, from the page scripts.
 * Cards are de-duplicated by profile URL (or by name when no URL is known).
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromStructuredFacultyList(string $html, array $keywords, string $sourceUrl): array
{
    $items = [];
    $seen = [];

    $collegeName = null;
    if (preg_match('/<div\s+class="title">\s*([^<]+?)\s*<\/div>/u', $html, $collegeMatch)) {
        $collegeName = CrawlAuthorParser::cleanText($collegeMatch[1]);
    }

    // Restrict the search to the list container when it can be found; otherwise scan the page.
    $listHtml = $html;
    if (preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $listMatch)) {
        $listHtml = $listMatch[1];
    }

    // Accept <li> elements that carry attributes (class, style, ...), not only a bare <li>.
    if (! preg_match_all('#<li\b[^>]*>(.*?)</li>#su', $listHtml, $liBlocks)) {
        return [];
    }

    $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);

    foreach ($liBlocks[1] as $inner) {
        $inner = (string) $inner;
        if (! preg_match('/<div\s+class="name">\s*([^<]+?)\s*<\/div>/u', $inner, $nameMatch)) {
            continue;
        }
        $name = CrawlAuthorParser::cleanText($nameMatch[1]);
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }

        $href = '';
        if (preg_match('/<a\s+[^>]*href="([^"]*)"/u', $inner, $hrefMatch)) {
            $href = (string) $hrefMatch[1];
        }
        $profileUrl = $this->resolveUrl($href, $sourceUrl)
            ?? $this->inferProfileUrlFromPageScripts($html, $name, $sourceUrl);

        $plain = $this->htmlToPlain($inner);
        if (! $this->matchesKeywords($plain.' '.$name, $keywords)) {
            continue;
        }

        $dedupeKey = $profileUrl ?: ('name:'.md5($name));
        if (isset($seen[$dedupeKey])) {
            continue;
        }
        $seen[$dedupeKey] = true;

        $affiliation = $this->parseLabeledField($inner, '所在单位')
            ?? $collegeName;
        $academicTitle = $this->parseLabeledField($inner, '职称');

        // On list pages the 所在单位 field is usually the college, so the university name is
        // inferred from the site / page header first.
        $universityName = $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation);

        $summaryParts = array_filter([
            $academicTitle ? '职称:'.$academicTitle : null,
            $affiliation ? '单位:'.$affiliation : null,
            $this->parseLabeledField($inner, '简介'),
        ]);

        $items[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($dedupeKey),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $affiliation,
            universityName: $universityName,
            // Join with a visible separator; with '' the parts ran together into one unreadable string.
            summary: Str::limit(implode(';', $summaryParts), 300),
            keywords: $keywords,
            academicTitle: $academicTitle,
            platform: 'faculty_html_tsites',
        );
    }

    return $items;
}
/**
 * Assemble the CrawlItemDto for one faculty member.
 *
 * The person's name doubles as the item title and as its `authors` string. The affiliation
 * is stored both as `affiliation` and as `college` / `college_name`. The same person data is
 * written to `extra['lead_author']` and to the single `authorsParsed` entry.
 *
 * @param list<string> $keywords
 */
protected function makeFacultyItem(
    string $externalKey,
    string $name,
    ?string $profileUrl,
    ?string $email,
    ?string $affiliation,
    ?string $universityName,
    ?string $summary,
    array $keywords,
    ?string $academicTitle,
    string $platform,
): CrawlItemDto {
    $leadAuthor = [
        'name' => $name,
        'email' => $email,
        'affiliation' => $affiliation,
        'college' => $affiliation,
        'university_name' => $universityName,
        'academic_title' => $academicTitle,
        'profile_url' => $profileUrl,
    ];

    $parsedAuthor = [
        'name' => $name,
        'email' => $email,
        'affiliation' => $affiliation,
        'university_name' => $universityName,
        'academic_title' => $academicTitle,
    ];

    $extra = [
        'platform' => $platform,
        'academic_title' => $academicTitle,
        'college_name' => $affiliation,
        'profile_url' => $profileUrl,
        'lead_author' => $leadAuthor,
        'keyword' => implode(' ', $keywords),
    ];

    return new CrawlItemDto(
        externalId: $externalKey,
        title: $name,
        canonicalUrl: $profileUrl,
        authors: $name,
        summary: $summary,
        schoolName: $universityName,
        extra: $extra,
        authorsParsed: [$parsedAuthor],
    );
}
/**
 * Check whether the text contains at least one of the keywords (byte-wise, ASCII case-insensitive).
 *
 * An empty keyword list matches everything; empty-string keywords are ignored.
 *
 * @param list<string> $keywords
 */
protected function matchesKeywords(string $plain, array $keywords): bool
{
    if ($keywords === []) {
        return true;
    }

    foreach ($keywords as $needle) {
        if ($needle === '') {
            continue;
        }
        if (stripos($plain, $needle) === false) {
            continue;
        }

        return true;
    }

    return false;
}
/**
 * Reduce an HTML fragment to plain text: tags removed, entities decoded, and every run of
 * whitespace collapsed to one space. Yields '' when the whitespace regex fails.
 */
protected function htmlToPlain(string $html): string
{
    $withoutTags = strip_tags($html);
    $decoded = html_entity_decode($withoutTags, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    $collapsed = preg_replace('/\s+/u', ' ', $decoded);

    // preg_replace() returns null on failure (e.g. malformed UTF-8 under the /u modifier).
    return $collapsed === null ? '' : $collapsed;
}
/**
 * Read the text that follows "<label>:" inside an HTML fragment, up to the next tag.
 *
 * Both the ASCII colon and the full-width colon (U+FF1A) are accepted after the label.
 *
 * @return string|null the cleaned value, or null when the label is absent or its value is blank
 */
protected function parseLabeledField(string $html, string $label): ?string
{
    $pattern = '/'.preg_quote($label, '/').'[:\x{FF1A}]\s*([^<]+)/u';
    if (preg_match($pattern, $html, $match) !== 1) {
        return null;
    }

    $value = CrawlAuthorParser::cleanText($match[1]);

    // Report a blank value as "absent" so that callers' `??` fallbacks take effect
    // instead of being short-circuited by an empty string.
    return ($value === null || trim((string) $value) === '') ? null : $value;
}
/**
 * Heuristic: does this string look like a person's name rather than page chrome?
 *
 * Rejects strings starting with a navigation word (首页, 登录, 联系我们, 下页, 尾页, 转到).
 * Accepts 2-4 CJK characters, optionally followed by "·" and 2-4 more, or a Latin name
 * made of letters, whitespace, dots and hyphens (2-31 characters, starting with a letter).
 */
protected function looksLikePersonName(string $name): bool
{
    $startsWithNavigationWord = preg_match('/^(首页|登录|联系我们|下页|尾页|转到)/u', $name);
    if ($startsWithNavigationWord) {
        return false;
    }

    if (preg_match('/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u', $name)) {
        return true;
    }

    return (bool) preg_match('/^[A-Za-z][A-Za-z\s\.\-]{1,30}$/', $name);
}
/**
 * Turn an href taken from a list page into an absolute URL.
 *
 * Handles absolute http(s) URLs, protocol-relative ("//host/..."), root-relative ("/...")
 * and document-relative hrefs. Dot segments ("../") are not collapsed.
 *
 * @return string|null the absolute URL, or null when the href cannot point to a page:
 *                     empty, a bare fragment ("#..."), or a javascript:/mailto:/tel:/data: URI.
 *                     When $baseUrl has no scheme or host, a relative href is returned as given.
 */
protected function resolveUrl(string $href, string $baseUrl): ?string
{
    $href = trim(html_entity_decode($href, ENT_QUOTES | ENT_HTML5, 'UTF-8'));

    // Placeholders such as href="#" or href="JavaScript:void(0)" must not be resolved: every
    // card would end up with the same bogus URL, and callers' `??` fallbacks would never run.
    if (
        $href === ''
        || str_starts_with($href, '#')
        || preg_match('/^(?:javascript|mailto|tel|data):/i', $href) === 1
    ) {
        return null;
    }

    if (preg_match('#^https?://#i', $href)) {
        return $href;
    }

    $base = parse_url($baseUrl);
    if (! is_array($base) || empty($base['scheme']) || empty($base['host'])) {
        return $href;
    }

    $origin = $base['scheme'].'://'.$base['host'];
    if (! empty($base['port'])) {
        $origin .= ':'.$base['port'];
    }

    if (str_starts_with($href, '//')) {
        return $base['scheme'].':'.$href;
    }
    if (str_starts_with($href, '/')) {
        return $origin.$href;
    }

    // Document-relative: append to the directory part of the base path.
    $path = $base['path'] ?? '/';
    $dir = str_contains($path, '/') ? substr($path, 0, (int) strrpos($path, '/') + 1) : '/';

    return $origin.$dir.$href;
}
/**
 * Look for a profile link in the page's inline scripts.
 *
 * Searches for a call shaped like addimg(<quoted string>, "/…index.htm", "<name>") and
 * resolves its second argument against $sourceUrl.
 *
 * @return string|null the resolved URL, or null when no such call mentions $name
 */
protected function inferProfileUrlFromPageScripts(string $html, string $name, string $sourceUrl): ?string
{
    $quotedName = preg_quote($name, '/');
    $pattern = '/addimg\(\s*(?:"[^"]*"|\'[^\']*\')\s*,\s*"(\\/[^"]+index\.htm)"\s*,\s*"'.$quotedName.'"/u';

    if (preg_match($pattern, $html, $hit)) {
        return $this->resolveUrl($hit[1], $sourceUrl);
    }

    return null;
}
/**
 * Guess the university a page belongs to.
 *
 * A handful of well-known host names are mapped directly. For any other host, the first
 * "…大学" phrase found in the page's plain text is used.
 *
 * @return string|null the university name, or null when neither the host nor the text gives one
 */
protected function inferUniversityFromSource(string $sourceUrl, string $html): ?string
{
    // Checked in this order; the first host fragment contained in the URL's host wins.
    $knownHosts = [
        'sjtu.edu.cn' => '上海交通大学',
        'tsinghua.edu.cn' => '清华大学',
        'pku.edu.cn' => '北京大学',
        'zju.edu.cn' => '浙江大学',
        'fudan.edu.cn' => '复旦大学',
    ];

    $host = parse_url($sourceUrl, PHP_URL_HOST);
    if (is_string($host)) {
        $host = strtolower($host);
        foreach ($knownHosts as $fragment => $university) {
            if (str_contains($host, $fragment)) {
                return $university;
            }
        }
    }

    $pageText = $this->htmlToPlain($html);
    if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,20}大学)/u', $pageText, $hit)) {
        return CrawlAuthorParser::cleanText($hit[1]);
    }

    return null;
}
/**
 * Guess a person's name from the text around their e-mail address.
 *
 * Returns the first run of 2-4 CJK characters in the text. The optional title suffix in
 * the pattern does not influence which run is captured. When the text has no such run,
 * the name is derived from the e-mail's local part: ".", "_" and "-" become spaces and
 * the result is title-cased.
 */
protected function guessName(string $plain, string $email): string
{
    $found = preg_match('/([\x{4e00}-\x{9fff}]{2,4})\s*(?:教授|副教授|讲师|研究员|博士|老师)?/u', $plain, $hit);
    if ($found) {
        return trim($hit[1]);
    }

    $localPart = strstr($email, '@', true) ?: '';
    // Single-byte, one-to-one character mapping: each separator becomes a space.
    $spaced = strtr($localPart, '._-', '   ');

    return Str::title(trim($spaced));
}
/**
 * Guess an affiliation from plain text: the first run of 2-40 CJK characters, Latin letters
 * or whitespace that ends in 大学 / 学院 / 研究院 / 研究所 / University / College.
 *
 * @return string|null the cleaned phrase, or null when the text contains none
 */
protected function guessAffiliation(string $plain): ?string
{
    $pattern = '/((?:[\x{4e00}-\x{9fff}A-Za-z\s]{2,40}(?:大学|学院|研究院|研究所|University|College)))/u';

    if (! preg_match($pattern, $plain, $hit)) {
        return null;
    }

    return CrawlAuthorParser::cleanText($hit[1]);
}
}