You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

644 lines
21 KiB

3 weeks ago
<?php
namespace App\Services\Crawl\Adapters;
use App\Models\CrawlSource;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;
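/**
 * Generic department / faculty list page adapter: e-mail based entries take priority;
 * when a page exposes no e-mails, tsites-style list cards are parsed instead
 * (name, affiliation, academic title, profile page).
 */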
class FacultyListHtmlAdapter implements CrawlerAdapterInterface
{
/**
 * Crawl a faculty list page, follow its pagination and return de-duplicated items.
 *
 * Options read from $params:
 *  - keyword:     raw keyword string, parsed by CrawlKeywordParser
 *  - max_results: clamped to 1..500, default 30
 *  - max_pages:   clamped to 1..50, default 1
 *
 * The first page is always fetched; the number of further pages is the smaller of
 * max_pages and the page count detected in the first page's HTML. Items are
 * de-duplicated on their externalId across pages, and collection stops as soon as
 * max_results items have been gathered. The result is finally passed through
 * profile-page e-mail enrichment.
 *
 * @param array<string, mixed> $params
 * @return list<CrawlItemDto>
 *
 * @throws \RuntimeException when a list page answers with a non-successful HTTP status
 */
public function fetch(string $requestUrl, CrawlSource $source, array $params): array
{
    $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
    $maxResults = min(500, max(1, (int) ($params['max_results'] ?? 30)));
    $maxPages = min(50, max(1, (int) ($params['max_pages'] ?? 1)));

    $baseUrl = $this->normalizeRequestUrl($requestUrl);
    $firstHtml = $this->fetchHtml($baseUrl);
    $totalPages = $this->detectTotalPages($firstHtml);
    $pagesToFetch = min($maxPages, $totalPages);

    $merged = [];
    $seen = [];
    for ($page = 1; $page <= $pagesToFetch; $page++) {
        // Page 1 was already downloaded for page-count detection; reuse it.
        $html = $page === 1
            ? $firstHtml
            : $this->fetchHtml($this->buildPageUrl($baseUrl, $page, $firstHtml));

        foreach ($this->extractFromHtml($html, $keywords, $requestUrl) as $item) {
            if (isset($seen[$item->externalId])) {
                continue;
            }
            $seen[$item->externalId] = true;
            $merged[] = $item;
            if (count($merged) >= $maxResults) {
                // Leave both the item loop and the page loop.
                break 2;
            }
        }
    }

    return $this->enrichEmailsFromProfilePages($merged);
}
/**
 * Fill in missing e-mail addresses by downloading each item's profile page.
 *
 * Only items that have no e-mail yet and do have a canonical URL are requested;
 * everything else is returned untouched. Requests are issued concurrently in
 * batches of `crawl.faculty.profile_enrich_pool_size` (clamped to 1..12, default 6)
 * with a per-request timeout of `crawl.faculty.profile_http_timeout_seconds`
 * (at least 8, default 20). Setting `crawl.faculty.profile_email_enrich_enabled`
 * to a falsy value disables the whole step.
 *
 * The returned list keeps the order of $items. A failed request (non-successful
 * status, or a pool entry that is not a Response at all) leaves the item as is.
 *
 * @param list<CrawlItemDto> $items
 * @return list<CrawlItemDto>
 */
protected function enrichEmailsFromProfilePages(array $items): array
{
    if (! config('crawl.faculty.profile_email_enrich_enabled', true)) {
        return $items;
    }

    $poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 6)));
    $timeout = max(8, (int) config('crawl.faculty.profile_http_timeout_seconds', 20));
    $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

    // Work on positions so enriched items can be written back in place.
    $items = array_values($items);
    $pendingIndexes = [];
    foreach ($items as $index => $item) {
        if (! $this->itemHasEmail($item) && $item->canonicalUrl) {
            $pendingIndexes[] = $index;
        }
    }

    foreach (array_chunk($pendingIndexes, $poolSize) as $batch) {
        $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($batch, $items, $headers, $timeout) {
            foreach ($batch as $index) {
                $pool->as('item'.$index)
                    ->timeout($timeout)
                    ->withHeaders($headers)
                    ->get($items[$index]->canonicalUrl);
            }
        });

        foreach ($batch as $index) {
            $response = $responses['item'.$index] ?? null;
            // A pool entry may be an exception object instead of a Response
            // (e.g. on connection failure); calling successful() on it would be fatal.
            if (! $response instanceof \Illuminate\Http\Client\Response || ! $response->successful()) {
                continue;
            }
            $email = $this->extractEmailFromProfileHtml((string) $response->body());
            if ($email) {
                $items[$index] = $this->applyEmailToItem($items[$index], $email);
            }
        }
    }

    return $items;
}
/**
 * Tell whether the item already carries an e-mail that survives normalization,
 * looking first at extra['lead_author'] and then at every parsed author.
 */
protected function itemHasEmail(CrawlItemDto $item): bool
{
    // Gather the raw values in the order they must be checked.
    $rawEmails = [];

    $leadAuthor = $item->extra['lead_author'] ?? null;
    if (is_array($leadAuthor)) {
        $rawEmails[] = $leadAuthor['email'] ?? null;
    }
    foreach ($item->authorsParsed as $parsedAuthor) {
        $rawEmails[] = $parsedAuthor['email'] ?? null;
    }

    foreach ($rawEmails as $rawEmail) {
        if (CrawlAuthorParser::normalizeEmail($rawEmail)) {
            return true;
        }
    }

    return false;
}
/**
 * Return a copy of the item with the e-mail written to extra['lead_author']
 * and to the first parsed author.
 *
 * When the item has no parsed authors, a single author entry is created from the
 * item title and the lead-author data. The e-mail is stored in normalized form
 * when normalization yields a value, otherwise as given.
 */
protected function applyEmailToItem(CrawlItemDto $item, string $email): CrawlItemDto
{
    $address = CrawlAuthorParser::normalizeEmail($email) ?? $email;

    $leadAuthor = $item->extra['lead_author'] ?? null;
    if (! is_array($leadAuthor)) {
        $leadAuthor = [];
    }
    $leadAuthor['email'] = $address;

    $authors = $item->authorsParsed;
    if ($authors !== []) {
        $authors[0]['email'] = $address;
    } else {
        $authors = [
            [
                'name' => $item->title,
                'email' => $address,
                'affiliation' => $leadAuthor['affiliation'] ?? $leadAuthor['college'] ?? null,
                'university_name' => $leadAuthor['university_name'] ?? $item->schoolName,
            ],
        ];
    }

    $updatedExtra = $item->extra;
    $updatedExtra['lead_author'] = $leadAuthor;

    return new CrawlItemDto(
        externalId: $item->externalId,
        title: $item->title,
        canonicalUrl: $item->canonicalUrl,
        authors: $item->authors,
        summary: $item->summary,
        publishedAt: $item->publishedAt,
        schoolName: $item->schoolName,
        section: $item->section,
        contentHtml: $item->contentHtml,
        extra: $updatedExtra,
        authorsParsed: $authors,
    );
}
/**
 * Pick the most plausible personal e-mail address out of a profile page.
 *
 * Strategy, in order:
 *  1. an address that directly follows a known label ("电子邮箱", "电子信箱",
 *     "E-mail"/"Email", "邮箱") and a colon; both the ASCII colon and the
 *     full-width colon (U+FF1A) are accepted;
 *  2. otherwise any address on the page, preferring one ending in ".edu.cn" or
 *     ".edu", else the first one found.
 * Addresses rejected by normalization or recognised as noise mailboxes are skipped.
 *
 * @return string|null the normalized address, or null when the page offers none
 */
protected function extractEmailFromProfileHtml(string $html): ?string
{
    $address = '([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})';
    // ASCII ":" or full-width colon; \x{FF1A} is resolved by PCRE under the /u flag.
    $colon = '[:\x{FF1A}]';

    $labeledPatterns = [
        '/电子邮箱'.$colon.'\s*<\/strong>\s*'.$address.'/u',
        '/电子邮箱'.$colon.'\s*'.$address.'/u',
        '/电子信箱'.$colon.'\s*'.$address.'/u',
        '/E-?mail'.$colon.'\s*'.$address.'/iu',
        '/邮箱'.$colon.'\s*'.$address.'/u',
    ];
    foreach ($labeledPatterns as $pattern) {
        if (preg_match($pattern, $html, $match)) {
            $email = CrawlAuthorParser::normalizeEmail($match[1]);
            if ($email && ! $this->isNoiseEmail($email)) {
                return $email;
            }
        }
    }

    // No usable labelled address: fall back to every address on the page.
    $candidates = [];
    if (preg_match_all('#'.$address.'#', $html, $emailMatches)) {
        foreach ($emailMatches[1] as $raw) {
            $email = CrawlAuthorParser::normalizeEmail($raw);
            if ($email && ! $this->isNoiseEmail($email)) {
                $candidates[] = $email;
            }
        }
    }
    if ($candidates === []) {
        return null;
    }

    $candidates = array_values(array_unique($candidates));
    foreach ($candidates as $email) {
        if (str_ends_with($email, '.edu.cn') || str_ends_with($email, '.edu')) {
            return $email;
        }
    }

    return $candidates[0];
}
/**
 * Tell whether the address belongs to a generic role mailbox (noreply, admin,
 * webmaster, ...) rather than to a person. The local part is compared
 * case-insensitively and must equal one of the role names exactly.
 */
protected function isNoiseEmail(string $email): bool
{
    $atPosition = strpos($email, '@');
    if ($atPosition === false) {
        return false;
    }

    $roleMailboxes = [
        'noreply',
        'no-reply',
        'admin',
        'webmaster',
        'postmaster',
        'root',
        'support',
        'service',
        'info',
        'contact',
    ];
    $localPart = strtolower(substr($email, 0, $atPosition));

    return in_array($localPart, $roleMailboxes, true);
}
/**
 * Download a page and return its body as a string.
 *
 * Uses a 30 second timeout and the crawler's User-Agent / Accept headers.
 *
 * @throws \RuntimeException when the response status is not successful; the
 *                           message carries the status code and the URL
 */
protected function fetchHtml(string $url): string
{
    $response = Http::timeout(30)
        ->withHeaders(['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'])
        ->get($url);

    if (! $response->successful()) {
        // Keep the status code and the URL apart so the message stays readable.
        throw new \RuntimeException('页面请求失败HTTP '.$response->status().' '.$url);
    }

    return (string) $response->body();
}
/**
 * Work out how many list pages exist, judging from the first page's HTML.
 *
 * An explicit "totalpage=N" marker wins. Otherwise the count is derived from a
 * "共 N 条" total divided by the number of name cards found inside the
 * div.list > ul block of this page. Returns 1 whenever nothing can be derived.
 */
protected function detectTotalPages(string $html): int
{
    $explicit = [];
    if (preg_match('/totalpage=(\d+)/i', $html, $explicit) === 1) {
        return max(1, (int) $explicit[1]);
    }

    $total = [];
    if (preg_match('/共\s*(\d+)\s*条/u', $html, $total) !== 1) {
        return 1;
    }

    $list = [];
    if (preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $list) !== 1) {
        return 1;
    }

    // Cards on this page are taken as the page size.
    $cardsPerPage = (int) preg_match_all('/<div\s+class="name">/u', $list[1]);
    if ($cardsPerPage <= 0) {
        return 1;
    }

    return max(1, (int) ceil(((int) $total[1]) / $cardsPerPage));
}
/**
 * Build the URL of list page $page from the first page's URL.
 *
 * The existing query string is kept, PAGENUM is set to the page number and,
 * when the first page's HTML contains "totalpage=N", that value is passed along
 * as well. A URL without scheme or host is returned unchanged.
 */
protected function buildPageUrl(string $baseUrl, int $page, string $firstPageHtml): string
{
    $components = parse_url($baseUrl);
    $hasOrigin = is_array($components)
        && ! empty($components['scheme'])
        && ! empty($components['host']);
    if (! $hasOrigin) {
        return $baseUrl;
    }

    $queryParams = [];
    parse_str((string) ($components['query'] ?? ''), $queryParams);
    $queryParams['PAGENUM'] = (string) $page;

    $total = [];
    if (preg_match('/totalpage=(\d+)/i', $firstPageHtml, $total) === 1) {
        $queryParams['totalpage'] = $total[1];
    }

    $authority = $components['host'];
    if (! empty($components['port'])) {
        $authority .= ':'.$components['port'];
    }

    // PAGENUM is always present, so the query string is never empty here.
    return sprintf(
        '%s://%s%s?%s',
        $components['scheme'],
        $authority,
        $components['path'] ?? '/',
        http_build_query($queryParams),
    );
}
/**
 * Rebuild the request URL from scheme, host, optional port, path and query.
 *
 * Surrounding whitespace is trimmed first. Components that are not re-added
 * (fragment, user info) are thereby dropped, and a missing path becomes "/".
 * Input without scheme or host is returned as is (trimmed).
 */
protected function normalizeRequestUrl(string $url): string
{
    $url = trim($url);

    $parts = parse_url($url);
    if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
        return $url;
    }

    $normalized = $parts['scheme'].'://'.$parts['host'];
    if (! empty($parts['port'])) {
        $normalized .= ':'.$parts['port'];
    }
    $normalized .= $parts['path'] ?? '/';
    if (! empty($parts['query'])) {
        $normalized .= '?'.$parts['query'];
    }

    return $normalized;
}
/**
 * Extract faculty items from one list page.
 *
 * E-mail based extraction is tried first; only when it yields nothing is the
 * page parsed as a structured list of faculty cards.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromHtml(string $html, array $keywords, string $sourceUrl): array
{
    $items = $this->extractFromEmailBlocks($html, $keywords, $sourceUrl);
    if ($items !== []) {
        return $items;
    }

    return $this->extractFromStructuredFacultyList($html, $keywords, $sourceUrl);
}
/**
 * Build one item per distinct personal e-mail address found in the page.
 *
 * For every address a window of surrounding HTML (400 bytes before the match,
 * 800 bytes in total) is converted to plain text and used to filter by keyword
 * and to guess the person's name and affiliation. Addresses that fail
 * normalization, repeat, or belong to a role mailbox (webmaster@, admin@, ...)
 * are skipped, so a page whose only address is a site contact yields no items.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromEmailBlocks(string $html, array $keywords, string $sourceUrl): array
{
    if (! preg_match_all(
        '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
        $html,
        $emailMatches,
        PREG_OFFSET_CAPTURE
    )) {
        return [];
    }

    $items = [];
    $seen = [];
    // The page-level university does not depend on the item; resolve it at most once.
    $pageUniversity = null;
    $pageUniversityResolved = false;

    foreach ($emailMatches[1] as $match) {
        $email = CrawlAuthorParser::normalizeEmail($match[0]);
        if (! $email || isset($seen[$email]) || $this->isNoiseEmail($email)) {
            continue;
        }

        $pos = (int) $match[1];
        // mb_strcut works on byte offsets but snaps to character boundaries, so the
        // window never starts or ends in the middle of a multi-byte character.
        $window = mb_strcut($html, max(0, $pos - 400), 800, 'UTF-8');
        $plain = $this->htmlToPlain($window);
        if (! $this->matchesKeywords($plain, $keywords)) {
            continue;
        }

        $name = $this->guessName($plain, $email);
        if ($name === '') {
            continue;
        }

        $affiliation = $this->guessAffiliation($plain);
        $universityName = CrawlAuthorParser::universityFromAffiliation($affiliation);
        if ($universityName === null) {
            if (! $pageUniversityResolved) {
                $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
                $pageUniversityResolved = true;
            }
            $universityName = $pageUniversity;
        }

        $seen[$email] = true;
        $items[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($email),
            name: $name,
            profileUrl: $sourceUrl,
            email: $email,
            affiliation: $affiliation,
            universityName: $universityName,
            summary: Str::limit($plain, 300),
            keywords: $keywords,
            academicTitle: null,
            platform: 'faculty_html',
        );
    }

    return $items;
}
/**
 * Parse tsites "CollegeTeacherList" style pages (e.g. Shanghai Jiao Tong
 * University), where every teacher is an <li> card inside div.list > ul.
 *
 * Each card must contain a div.name holding something that looks like a person
 * name. The profile link is taken from the card's first <a href>, or, failing
 * that, inferred from the page's scripts. Cards are de-duplicated on the profile
 * URL (or on the name when no URL is known) and filtered by keyword.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromStructuredFacultyList(string $html, array $keywords, string $sourceUrl): array
{
    $items = [];
    $seen = [];

    $collegeName = null;
    if (preg_match('/<div\s+class="title">\s*([^<]+?)\s*<\/div>/u', $html, $collegeMatch)) {
        $collegeName = CrawlAuthorParser::cleanText($collegeMatch[1]);
    }

    // Restrict the search to the list container when the page has one.
    $listHtml = $html;
    if (preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $listMatch)) {
        $listHtml = $listMatch[1];
    }

    // Accept <li> with or without attributes (class, style, ...).
    if (! preg_match_all('#<li\b[^>]*>(.*?)</li>#su', $listHtml, $liBlocks)) {
        return [];
    }

    $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);

    foreach ($liBlocks[1] as $inner) {
        $inner = (string) $inner;
        if (! preg_match('/<div\s+class="name">\s*([^<]+?)\s*<\/div>/u', $inner, $nameMatch)) {
            continue;
        }
        $name = CrawlAuthorParser::cleanText($nameMatch[1]);
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }

        // href may be quoted with either double or single quotes.
        $href = '';
        if (preg_match('/<a\s+[^>]*href=(["\'])(.*?)\1/su', $inner, $hrefMatch)) {
            $href = (string) $hrefMatch[2];
        }
        $profileUrl = $this->resolveUrl($href, $sourceUrl)
            ?? $this->inferProfileUrlFromPageScripts($html, $name, $sourceUrl);

        $plain = $this->htmlToPlain($inner);
        if (! $this->matchesKeywords($plain.' '.$name, $keywords)) {
            continue;
        }

        $dedupeKey = $profileUrl ?: ('name:'.md5($name));
        if (isset($seen[$dedupeKey])) {
            continue;
        }
        $seen[$dedupeKey] = true;

        $affiliation = $this->parseLabeledField($inner, '所在单位')
            ?? $collegeName;
        $academicTitle = $this->parseLabeledField($inner, '职称');
        // On list pages the "affiliation" field is mostly the college, so the
        // university name is taken from the site / page header first.
        $universityName = $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation);

        $summaryParts = array_filter([
            $academicTitle ? '职称:'.$academicTitle : null,
            $affiliation ? '单位:'.$affiliation : null,
            $this->parseLabeledField($inner, '简介'),
        ]);

        $items[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($dedupeKey),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $affiliation,
            universityName: $universityName,
            // Separate the parts so they do not run into each other.
            summary: Str::limit(implode('; ', $summaryParts), 300),
            keywords: $keywords,
            academicTitle: $academicTitle,
            platform: 'faculty_html_tsites',
        );
    }

    return $items;
}
/**
 * Assemble the CrawlItemDto for one faculty member.
 *
 * The person's name doubles as item title and author string, the profile URL
 * becomes the canonical URL, and the affiliation is stored both as affiliation
 * and as college. The same person data is written to extra['lead_author'] and as
 * the single entry of authorsParsed; the keywords are stored space-joined in
 * extra['keyword'].
 *
 * @param list<string> $keywords
 */
protected function makeFacultyItem(
    string $externalKey,
    string $name,
    ?string $profileUrl,
    ?string $email,
    ?string $affiliation,
    ?string $universityName,
    ?string $summary,
    array $keywords,
    ?string $academicTitle,
    string $platform,
): CrawlItemDto {
    $college = $affiliation;
    $lead = [
        'name' => $name,
        'email' => $email,
        'affiliation' => $college,
        'college' => $college,
        'university_name' => $universityName,
        'academic_title' => $academicTitle,
        'profile_url' => $profileUrl,
    ];

    return new CrawlItemDto(
        externalId: $externalKey,
        title: $name,
        canonicalUrl: $profileUrl,
        authors: $name,
        summary: $summary,
        schoolName: $universityName,
        extra: [
            'platform' => $platform,
            'academic_title' => $academicTitle,
            'college_name' => $college,
            'profile_url' => $profileUrl,
            'lead_author' => $lead,
            'keyword' => implode(' ', $keywords),
        ],
        authorsParsed: [[
            'name' => $name,
            'email' => $email,
            'affiliation' => $college,
            'university_name' => $universityName,
            'academic_title' => $academicTitle,
        ]],
    );
}
/**
 * Tell whether the text passes the keyword filter.
 *
 * An empty keyword list accepts everything; otherwise at least one non-empty
 * keyword must occur in the text (ASCII case-insensitive substring match).
 *
 * @param list<string> $keywords
 */
protected function matchesKeywords(string $plain, array $keywords): bool
{
    if (count($keywords) === 0) {
        return true;
    }

    $needles = array_filter($keywords, static fn ($keyword) => $keyword !== '');
    foreach ($needles as $needle) {
        if (stripos($plain, $needle) !== false) {
            return true;
        }
    }

    return false;
}
/**
 * Reduce an HTML fragment to plain text: tags stripped, entities decoded and
 * every run of whitespace collapsed to a single space.
 *
 * Fragments cut out of a page by byte offset may start or end inside a
 * multi-byte character. Such invalid UTF-8 would make the /u regex fail and the
 * whole text be lost, so invalid sequences are scrubbed first.
 */
protected function htmlToPlain(string $html): string
{
    $plain = html_entity_decode(strip_tags($html), ENT_QUOTES | ENT_HTML5, 'UTF-8');
    if (! mb_check_encoding($plain, 'UTF-8')) {
        $plain = mb_scrub($plain, 'UTF-8');
    }

    return preg_replace('/\s+/u', ' ', $plain) ?? '';
}
/**
 * Read the value that follows "<label>:" inside an HTML fragment.
 *
 * The colon may be the ASCII colon or the full-width colon (U+FF1A). Tags that
 * sit between the colon and the value (e.g. a closing </span> followed by an
 * opening <span>) are skipped; the value itself runs up to the next tag.
 *
 * @return string|null the cleaned value, or null when the label is absent or
 *                     its value is empty
 */
protected function parseLabeledField(string $html, string $label): ?string
{
    $pattern = '/'.preg_quote($label, '/').'[:\x{FF1A}]\s*(?:<[^>]+>\s*)*([^<]+)/u';
    if (! preg_match($pattern, $html, $match)) {
        return null;
    }

    $value = CrawlAuthorParser::cleanText($match[1]);

    // A whitespace-only value is treated as missing so callers' ?? fallbacks apply.
    return is_string($value) && $value !== '' ? $value : null;
}
/**
 * Heuristic check that a card title is a person's name and not page chrome.
 *
 * Rejects text starting with a navigation word; accepts 2-4 CJK characters
 * (optionally followed by a middle dot and 2-4 more), or a Latin name of letters,
 * spaces, dots and hyphens that starts with a letter.
 */
protected function looksLikePersonName(string $name): bool
{
    $startsWithNavigationWord = preg_match('/^(首页|登录|联系我们|下页|尾页|转到)/u', $name);
    if ($startsWithNavigationWord) {
        return false;
    }

    if (preg_match('/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u', $name)) {
        return true;
    }

    return (bool) preg_match('/^[A-Za-z][A-Za-z\s\.\-]{1,30}$/', $name);
}
/**
 * Turn an href found on a page into an absolute http(s) URL.
 *
 * Returns null for hrefs that do not point to a fetchable page: empty values,
 * pure fragments ("#..."), and any non-http scheme such as javascript:, mailto:
 * or tel: (matched case-insensitively). Absolute http(s) URLs are returned as is.
 * Protocol-relative, root-relative, query-only and path-relative hrefs are
 * resolved against $baseUrl; "." and ".." segments of path-relative hrefs are
 * collapsed. When $baseUrl has no scheme or host the href is returned unresolved.
 */
protected function resolveUrl(string $href, string $baseUrl): ?string
{
    $href = trim(html_entity_decode($href, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
    if ($href === '' || $href[0] === '#') {
        return null;
    }
    if (preg_match('#^https?://#i', $href)) {
        return $href;
    }
    // Any other explicit scheme (javascript:, mailto:, tel:, data:, ...) is not a page.
    if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href)) {
        return null;
    }

    $base = parse_url($baseUrl);
    if (! is_array($base) || empty($base['scheme']) || empty($base['host'])) {
        return $href;
    }

    $origin = $base['scheme'].'://'.$base['host'];
    if (! empty($base['port'])) {
        $origin .= ':'.$base['port'];
    }

    if (str_starts_with($href, '//')) {
        return $base['scheme'].':'.$href;
    }
    if ($href[0] === '/') {
        return $origin.$href;
    }

    $basePath = $base['path'] ?? '/';
    if ($href[0] === '?') {
        // A query-only reference keeps the base document and replaces its query.
        return $origin.$basePath.$href;
    }

    $dir = str_contains($basePath, '/')
        ? substr($basePath, 0, (int) strrpos($basePath, '/') + 1)
        : '/';

    // Only the path part takes part in dot-segment removal; query/fragment are kept verbatim.
    $pathLength = strcspn($href, '?#');
    $relativePath = substr($href, 0, $pathLength);
    $suffix = substr($href, $pathLength);

    $segments = explode('/', $dir.$relativePath);
    $resolved = [];
    foreach ($segments as $segment) {
        if ($segment === '.') {
            continue;
        }
        if ($segment === '..') {
            // Never pop the leading empty segment that stands for the root slash.
            if (count($resolved) > 1) {
                array_pop($resolved);
            }
            continue;
        }
        $resolved[] = $segment;
    }
    $lastSegment = end($segments);
    if ($lastSegment === '.' || $lastSegment === '..') {
        // A trailing dot segment refers to a directory: keep the trailing slash.
        $resolved[] = '';
    }

    return $origin.implode('/', $resolved).$suffix;
}
/**
 * Look for an addimg(<image>, "<path ending in index.htm>", "<name>") call in
 * the page whose third argument is the given name, and resolve its second
 * argument against the source URL.
 *
 * @return string|null the resolved profile URL, or null when no such call exists
 */
protected function inferProfileUrlFromPageScripts(string $html, string $name, string $sourceUrl): ?string
{
    $quotedName = preg_quote($name, '/');
    $pattern = '/addimg\(\s*(?:"[^"]*"|\'[^\']*\')\s*,\s*"(\\/[^"]+index\.htm)"\s*,\s*"'.$quotedName.'"/u';

    $found = [];
    if (preg_match($pattern, $html, $found) === 1) {
        return $this->resolveUrl($found[1], $sourceUrl);
    }

    return null;
}
/**
 * Determine the university a page belongs to.
 *
 * The source URL's host is checked against a table of known domains first
 * (in table order); failing that, the first "...大学" phrase in the page's
 * plain text is used.
 *
 * @return string|null the university name, or null when neither source helps
 */
protected function inferUniversityFromSource(string $sourceUrl, string $html): ?string
{
    $knownDomains = [
        'sjtu.edu.cn' => '上海交通大学',
        'tsinghua.edu.cn' => '清华大学',
        'pku.edu.cn' => '北京大学',
        'zju.edu.cn' => '浙江大学',
        'fudan.edu.cn' => '复旦大学',
    ];

    $host = parse_url($sourceUrl, PHP_URL_HOST);
    if (is_string($host)) {
        $lowerHost = strtolower($host);
        foreach ($knownDomains as $domain => $university) {
            if (str_contains($lowerHost, $domain)) {
                return $university;
            }
        }
    }

    $pageText = $this->htmlToPlain($html);
    $found = [];
    if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,20}大学)/u', $pageText, $found) === 1) {
        return CrawlAuthorParser::cleanText($found[1]);
    }

    return null;
}
/**
 * Guess a person's name from the text surrounding an e-mail address.
 *
 * Returns the first run of 2-4 CJK characters in the text. The academic-title
 * suffix in the pattern is optional and sits after the capture, so it does not
 * influence which run is chosen. When the text holds no such run, the name is
 * derived from the e-mail's local part: ".", "_" and "-" become spaces and the
 * result is title-cased.
 */
protected function guessName(string $plain, string $email): string
{
    if (preg_match('/([\x{4e00}-\x{9fff}]{2,4})\s*(?:教授|副教授|讲师|研究员|博士|老师)?/u', $plain, $m)) {
        return trim($m[1]);
    }

    $local = strstr($email, '@', true) ?: '';
    $local = str_replace(['.', '_', '-'], ' ', $local);

    return Str::title(trim($local));
}
/**
 * Guess an affiliation from plain text: the first phrase of 2-40 CJK / Latin /
 * whitespace characters that ends in 大学, 学院, 研究院, 研究所, "University"
 * or "College".
 *
 * @return string|null the cleaned phrase, or null when the text holds none
 */
protected function guessAffiliation(string $plain): ?string
{
    $pattern = '/((?:[\x{4e00}-\x{9fff}A-Za-z\s]{2,40}(?:大学|学院|研究院|研究所|University|College)))/u';

    $found = [];
    if (preg_match($pattern, $plain, $found) !== 1) {
        return null;
    }

    return CrawlAuthorParser::cleanText($found[1]);
}
}