You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1238 lines
40 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
namespace App\Services\Crawl\Adapters;
use App\Models\CrawlSource;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use Illuminate\Http\Client\Response;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;
/**
* 通用院系/师资列表页:优先邮箱条目;无邮箱时解析 tsites 等列表卡片(姓名、单位、职称、主页)。
*/
class FacultyListHtmlAdapter implements CrawlerAdapterInterface
{
/**
 * Crawls a faculty list page (plus follow-up pages) and returns the people found on it.
 *
 * Reads `keyword`, `max_results` (clamped to 1..500, default 30) and `max_pages`
 * (clamped to 1..50, default 1) from $params. Pages recognised by
 * isSaisAjaxFacultyPage() are handled through the AJAX endpoint; every other page
 * goes through the generic HTML extractors. Results are finally passed through
 * enrichEmailsFromProfilePages().
 *
 * @param string               $requestUrl list page URL as supplied by the caller
 * @param CrawlSource          $source     not read by this method
 * @param array<string, mixed> $params     crawl options, see above
 * @return list<CrawlItemDto>
 *
 * @throws \RuntimeException when the first page (or the AJAX endpoint) cannot be fetched
 */
public function fetch(string $requestUrl, CrawlSource $source, array $params): array
{
    $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
    $resultCap = min(500, max(1, (int) ($params['max_results'] ?? 30)));
    $pageCap = min(50, max(1, (int) ($params['max_pages'] ?? 1)));

    $listUrl = $this->normalizeRequestUrl($requestUrl);
    $landingHtml = $this->fetchHtml($listUrl);

    if ($this->isSaisAjaxFacultyPage($landingHtml, $requestUrl)) {
        return $this->enrichEmailsFromProfilePages(
            $this->fetchSaisFacultyItems($requestUrl, $landingHtml, $keywords, $resultCap),
            $params,
        );
    }

    $pagesToFetch = min($pageCap, $this->detectTotalPages($landingHtml));

    // Page 1 is already in hand; collect its unique items first.
    $collected = [];
    $knownIds = [];
    foreach ($this->extractFromHtml($landingHtml, $keywords, $requestUrl) as $candidate) {
        if (isset($knownIds[$candidate->externalId])) {
            continue;
        }
        $knownIds[$candidate->externalId] = true;
        $collected[] = $candidate;
        if (count($collected) >= $resultCap) {
            break;
        }
    }

    // Only go back to the network when more pages exist and the cap is not reached yet.
    $needsMorePages = $pagesToFetch > 1 && count($collected) < $resultCap;
    if ($needsMorePages) {
        $collected = $this->fetchRemainingListPages(
            $listUrl,
            $landingHtml,
            $pagesToFetch,
            $keywords,
            $requestUrl,
            $collected,
            $knownIds,
            $resultCap,
        );
    }

    return $this->enrichEmailsFromProfilePages($collected, $params);
}
/**
 * Fetches list pages 2..$pagesToFetch in concurrent batches and appends their unseen items.
 *
 * Batch size comes from `crawl.faculty.list_fetch_pool_size` (clamped to 1..10, default 5).
 * Within a batch, pages are processed in ascending page order; pages whose fetch
 * produced no body are simply absent. Returns as soon as $maxResults items are collected.
 *
 * @param list<CrawlItemDto>  $merged   items gathered so far
 * @param array<string, true> $seen     external ids already present in $merged
 * @param list<string>        $keywords
 * @return list<CrawlItemDto>
 */
protected function fetchRemainingListPages(
    string $baseUrl,
    string $firstHtml,
    int $pagesToFetch,
    array $keywords,
    string $requestUrl,
    array $merged,
    array $seen,
    int $maxResults,
): array {
    $batchSize = max(1, min(10, (int) config('crawl.faculty.list_fetch_pool_size', 5)));

    $urlsByPage = [];
    $page = 2;
    while ($page <= $pagesToFetch) {
        $urlsByPage[$page] = $this->buildPageUrl($baseUrl, $page, $firstHtml);
        $page++;
    }

    foreach (array_chunk($urlsByPage, $batchSize, true) as $batch) {
        $bodies = $this->fetchHtmlPool($batch);
        ksort($bodies);

        foreach ($bodies as $pageHtml) {
            foreach ($this->extractFromHtml($pageHtml, $keywords, $requestUrl) as $candidate) {
                $id = $candidate->externalId;
                if (! isset($seen[$id])) {
                    $seen[$id] = true;
                    $merged[] = $candidate;
                    if (count($merged) >= $maxResults) {
                        return $merged;
                    }
                }
            }
        }
    }

    return $merged;
}
/**
 * Requests several list pages concurrently and returns the non-empty bodies.
 *
 * The timeout comes from `crawl.faculty.list_http_timeout_seconds` (minimum 5, default 20).
 * Entries whose pool result is not a successful response, or whose body is empty,
 * are omitted from the returned map.
 *
 * @param array<int, string> $pageUrls page number => URL
 * @return array<int, string> page number => HTML body
 */
protected function fetchHtmlPool(array $pageUrls): array
{
    if (count($pageUrls) === 0) {
        return [];
    }

    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
    $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

    $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($pageUrls, $headers, $timeout) {
        foreach ($pageUrls as $pageNumber => $pageUrl) {
            $pool->as((string) $pageNumber)
                ->timeout($timeout)
                ->connectTimeout(min(8, $timeout))
                ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
                ->withHeaders($headers)
                ->get($pageUrl);
        }
    });

    $bodies = [];
    foreach (array_keys($pageUrls) as $pageNumber) {
        $body = $this->responseBodyFromPoolResult($responses[(string) $pageNumber] ?? null);
        if ($body === null || $body === '') {
            continue;
        }
        $bodies[$pageNumber] = $body;
    }

    return $bodies;
}
/**
 * Visits profile pages of items that lack an email and merges what is found there.
 *
 * Does nothing when `crawl.faculty.profile_email_enrich_enabled` is falsy. The number
 * of profile fetches is limited by resolveProfileEnrichMax(); when that budget is zero
 * every item is passed through markProfileEnrichSkipped(). Items that were eligible
 * (no email, has a canonical URL) but fell outside the budget are flagged via
 * markItemProfileEnrichSkipped(). Profile pages are fetched in concurrent batches
 * sized by `crawl.faculty.profile_enrich_pool_size` (clamped to 1..12, default 8).
 *
 * @param list<CrawlItemDto>   $items
 * @param array<string, mixed> $params
 * @return list<CrawlItemDto>
 */
protected function enrichEmailsFromProfilePages(array $items, array $params = []): array
{
    if (! config('crawl.faculty.profile_email_enrich_enabled', true)) {
        return $items;
    }

    $budget = $this->resolveProfileEnrichMax($params, count($items));
    if ($budget <= 0) {
        return $this->markProfileEnrichSkipped($items);
    }

    $batchSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 8)));
    $timeout = max(5, (int) config('crawl.faculty.profile_http_timeout_seconds', 10));
    $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

    // Pick the first $budget items that still need an email and have somewhere to look.
    $pending = [];
    foreach ($items as $position => $candidate) {
        if ($budget <= 0) {
            break;
        }
        $eligible = ! $this->itemHasEmail($candidate) && $candidate->canonicalUrl;
        if (! $eligible) {
            continue;
        }
        $pending[$position] = $candidate;
        $budget--;
    }

    if ($pending === []) {
        return $items;
    }

    $enriched = [];
    foreach (array_chunk($pending, $batchSize, true) as $batch) {
        $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($batch, $headers, $timeout) {
            foreach ($batch as $position => $candidate) {
                $pool->as((string) $position)
                    ->timeout($timeout)
                    ->connectTimeout(min(8, $timeout))
                    ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
                    ->withHeaders($headers)
                    ->get($candidate->canonicalUrl);
            }
        });

        foreach ($batch as $position => $candidate) {
            $body = $this->responseBodyFromPoolResult($responses[(string) $position] ?? null);
            if ($body === null) {
                // Failed fetch: keep the item exactly as it was.
                $enriched[$position] = $candidate;
                continue;
            }

            $email = $this->extractEmailFromProfileHtml($body);
            if ($email) {
                $candidate = $this->applyEmailToItem($candidate, $email);
            }
            $enriched[$position] = $this->applyProfileMetadataToItem($candidate, $body);
        }
    }

    $output = [];
    foreach ($items as $position => $original) {
        if (isset($enriched[$position])) {
            $output[] = $enriched[$position];
            continue;
        }

        $wasEligible = ! $this->itemHasEmail($original) && $original->canonicalUrl;
        $output[] = $wasEligible
            ? $this->markItemProfileEnrichSkipped($original)
            : $original;
    }

    return $output;
}
/**
 * Works out how many profile pages may be fetched for this crawl.
 *
 * Returns 0 when `skip_profile_enrich` is strictly `true`. Otherwise takes
 * `profile_enrich_max` from $params (falling back to the
 * `crawl.faculty.profile_enrich_max` config value, default 32) and limits it to
 * the range 0..min($itemCount, 200).
 *
 * @param array<string, mixed> $params
 */
protected function resolveProfileEnrichMax(array $params, int $itemCount): int
{
    $skipRequested = ($params['skip_profile_enrich'] ?? false) === true;
    if ($skipRequested) {
        return 0;
    }

    $requested = (int) ($params['profile_enrich_max'] ?? config('crawl.faculty.profile_enrich_max', 32));
    $capped = min($itemCount, 200, $requested);

    return max(0, $capped);
}
/**
 * Applies markItemProfileEnrichSkipped() to every item, keeping the array keys.
 *
 * @param list<CrawlItemDto> $items
 * @return list<CrawlItemDto>
 */
protected function markProfileEnrichSkipped(array $items): array
{
    $flagged = [];
    foreach ($items as $key => $item) {
        $flagged[$key] = $this->markItemProfileEnrichSkipped($item);
    }

    return $flagged;
}
/**
 * Returns a copy of the item with `extra['profile_enrich_skipped'] = true`.
 *
 * Items that already carry an email (per itemHasEmail()) are returned untouched.
 */
protected function markItemProfileEnrichSkipped(CrawlItemDto $item): CrawlItemDto
{
    if ($this->itemHasEmail($item)) {
        return $item;
    }

    return new CrawlItemDto(
        externalId: $item->externalId,
        title: $item->title,
        canonicalUrl: $item->canonicalUrl,
        authors: $item->authors,
        summary: $item->summary,
        publishedAt: $item->publishedAt,
        schoolName: $item->schoolName,
        section: $item->section,
        contentHtml: $item->contentHtml,
        extra: array_replace($item->extra, ['profile_enrich_skipped' => true]),
        authorsParsed: $item->authorsParsed,
    );
}
/**
 * Extracts the body from one pool result.
 *
 * @param mixed $result whatever the HTTP pool stored for a request
 * @return string|null the body for a successful Response; null for anything else
 */
protected function responseBodyFromPoolResult(mixed $result): ?string
{
    $usable = $result instanceof Response && $result->successful();

    return $usable ? (string) $result->body() : null;
}
/**
 * Tells whether the item already carries a usable email address.
 *
 * Checks `extra['lead_author']['email']` first, then each entry of `authorsParsed`;
 * an address counts when CrawlAuthorParser::normalizeEmail() returns a truthy value.
 */
protected function itemHasEmail(CrawlItemDto $item): bool
{
    $leadAuthor = $item->extra['lead_author'] ?? null;
    if (is_array($leadAuthor)) {
        if (CrawlAuthorParser::normalizeEmail($leadAuthor['email'] ?? null)) {
            return true;
        }
    }

    foreach ($item->authorsParsed as $parsedAuthor) {
        $normalized = CrawlAuthorParser::normalizeEmail($parsedAuthor['email'] ?? null);
        if ($normalized) {
            return true;
        }
    }

    return false;
}
/**
 * Returns a copy of the item with the given email written to its lead author
 * and to the first parsed author.
 *
 * The address is normalised through CrawlAuthorParser::normalizeEmail(); when that
 * yields null the raw value is kept. If the item has no parsed authors, a single
 * author entry is synthesised from the item title and the lead-author data.
 */
protected function applyEmailToItem(CrawlItemDto $item, string $email): CrawlItemDto
{
    $normalized = CrawlAuthorParser::normalizeEmail($email) ?? $email;

    $existingLead = $item->extra['lead_author'] ?? null;
    $lead = is_array($existingLead) ? $existingLead : [];
    $lead['email'] = $normalized;

    $authors = $item->authorsParsed;
    if ($authors !== []) {
        $authors[0]['email'] = $normalized;
    } else {
        $authors = [[
            'name' => $item->title,
            'email' => $normalized,
            'affiliation' => $lead['affiliation'] ?? $lead['college'] ?? null,
            'university_name' => $lead['university_name'] ?? $item->schoolName,
        ]];
    }

    return new CrawlItemDto(
        externalId: $item->externalId,
        title: $item->title,
        canonicalUrl: $item->canonicalUrl,
        authors: $item->authors,
        summary: $item->summary,
        publishedAt: $item->publishedAt,
        schoolName: $item->schoolName,
        section: $item->section,
        contentHtml: $item->contentHtml,
        extra: array_replace($item->extra, ['lead_author' => $lead]),
        authorsParsed: $authors,
    );
}
/**
 * Picks the most plausible personal email address from a profile page.
 *
 * Strategy:
 *  1. Look for an address directly after a known label (电子邮箱 / 电子信箱 /
 *     E-mail / 邮箱 / 电子邮件). The label may be followed by either an ASCII
 *     colon or the full-width colon U+FF1A; previously only the ASCII colon
 *     matched, so labels written with the full-width colon were missed.
 *  2. Otherwise collect every address on the page, drop generic mailboxes
 *     (isNoiseEmail()), and prefer one ending in `.edu.cn` or `.edu`.
 *
 * @param string $html raw profile page HTML
 * @return string|null normalised address, or null when nothing usable is found
 */
protected function extractEmailFromProfileHtml(string $html): ?string
{
    $address = '([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})';
    // ASCII ":" or full-width ":"; written as an escape so the pattern survives
    // tooling that mangles ambiguous Unicode punctuation.
    $colon = '[:\x{FF1A}]';

    $labeledPatterns = [
        '/电子邮箱'.$colon.'\s*<\/strong>\s*'.$address.'/u',
        '/电子邮箱'.$colon.'\s*'.$address.'/u',
        '/电子信箱'.$colon.'\s*'.$address.'/u',
        '/E-?mail'.$colon.'\s*'.$address.'/iu',
        '/邮箱'.$colon.'\s*'.$address.'/u',
        '/电子邮件'.$colon.'\s*'.$address.'/u',
    ];
    foreach ($labeledPatterns as $pattern) {
        if (preg_match($pattern, $html, $match)) {
            $email = CrawlAuthorParser::normalizeEmail($match[1]);
            if ($email && ! $this->isNoiseEmail($email)) {
                return $email;
            }
        }
    }

    // No labelled hit: fall back to every address that appears anywhere on the page.
    $candidates = [];
    if (preg_match_all('#'.$address.'#', $html, $emailMatches)) {
        foreach ($emailMatches[1] as $raw) {
            $email = CrawlAuthorParser::normalizeEmail($raw);
            if ($email && ! $this->isNoiseEmail($email)) {
                $candidates[] = $email;
            }
        }
    }
    if ($candidates === []) {
        return null;
    }

    $candidates = array_values(array_unique($candidates));
    foreach ($candidates as $email) {
        if (str_ends_with($email, '.edu.cn') || str_ends_with($email, '.edu')) {
            return $email;
        }
    }

    return $candidates[0];
}
/**
 * Tells whether the address belongs to a generic mailbox rather than a person.
 *
 * The part before the first "@" is compared case-insensitively against a fixed
 * list (noreply, no-reply, admin, webmaster, postmaster, root, support, service,
 * info, contact).
 */
protected function isNoiseEmail(string $email): bool
{
    $atPosition = strpos($email, '@');
    if ($atPosition === false) {
        return false;
    }

    $genericMailboxes = [
        'noreply',
        'no-reply',
        'admin',
        'webmaster',
        'postmaster',
        'root',
        'support',
        'service',
        'info',
        'contact',
    ];

    return in_array(strtolower(substr($email, 0, $atPosition)), $genericMailboxes, true);
}
/**
 * Downloads one HTML page and returns its body.
 *
 * The timeout comes from `crawl.faculty.list_http_timeout_seconds` (minimum 5,
 * default 20); the connect timeout is at most 8 seconds.
 *
 * @param string $url absolute page URL
 * @return string response body
 *
 * @throws \RuntimeException when the response status is not successful; the message
 *                           keeps its original prefix and now separates the status
 *                           code from the URL (they used to run together as
 *                           "HTTP 404https://...")
 */
protected function fetchHtml(string $url): string
{
    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));

    $response = Http::timeout($timeout)
        ->connectTimeout(min(8, $timeout))
        ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
        ->withHeaders(['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'])
        ->get($url);

    if (! $response->successful()) {
        throw new \RuntimeException('页面请求失败HTTP '.$response->status().': '.$url);
    }

    return (string) $response->body();
}
/**
 * Estimates how many list pages exist, based on the first page's HTML.
 *
 * Uses an explicit `totalpage=N` marker when present. Otherwise divides the
 * "共 N 条" total by the number of `<div class="name">` entries found inside the
 * `<div class="list"><ul>…</ul></div>` block. Falls back to 1.
 *
 * @return int always at least 1
 */
protected function detectTotalPages(string $html): int
{
    if (preg_match('/totalpage=(\d+)/i', $html, $explicit) === 1) {
        return max(1, (int) $explicit[1]);
    }

    if (preg_match('/共\s*(\d+)\s*条/u', $html, $totalRows) !== 1) {
        return 1;
    }

    if (preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $listBlock) !== 1) {
        return 1;
    }

    $rowsPerPage = (int) preg_match_all('/<div\s+class="name">/u', $listBlock[1]);
    if ($rowsPerPage <= 0) {
        return 1;
    }

    return max(1, (int) ceil(((int) $totalRows[1]) / $rowsPerPage));
}
/**
 * Builds the URL of list page $page from the first page's URL.
 *
 * Sets the `PAGENUM` query parameter and, when the first page's HTML contains a
 * `totalpage=N` marker, the `totalpage` parameter too. Only scheme, host, port,
 * path and query of $baseUrl are carried over. A URL without scheme or host is
 * returned unchanged.
 */
protected function buildPageUrl(string $baseUrl, int $page, string $firstPageHtml): string
{
    $parts = parse_url($baseUrl);
    $usable = is_array($parts) && ! empty($parts['scheme']) && ! empty($parts['host']);
    if (! $usable) {
        return $baseUrl;
    }

    $query = [];
    parse_str((string) ($parts['query'] ?? ''), $query);
    $query['PAGENUM'] = (string) $page;
    if (preg_match('/totalpage=(\d+)/i', $firstPageHtml, $totalMatch)) {
        $query['totalpage'] = $totalMatch[1];
    }

    $authority = $parts['host'].(empty($parts['port']) ? '' : ':'.$parts['port']);

    // $query always holds at least PAGENUM, so a query string is always appended.
    return $parts['scheme'].'://'.$authority.($parts['path'] ?? '/').'?'.http_build_query($query);
}
/**
 * Rebuilds the URL from scheme, host, port, path and query only.
 *
 * A missing path becomes "/". Anything else parse_url() reports (for example a
 * fragment) is not copied. A URL without scheme or host is returned unchanged.
 */
protected function normalizeRequestUrl(string $url): string
{
    $parts = parse_url($url);
    $usable = is_array($parts) && ! empty($parts['scheme']) && ! empty($parts['host']);
    if (! $usable) {
        return $url;
    }

    $port = empty($parts['port']) ? '' : ':'.$parts['port'];
    $query = empty($parts['query']) ? '' : '?'.$parts['query'];

    return $parts['scheme'].'://'.$parts['host'].$port.($parts['path'] ?? '/').$query;
}
/**
 * Runs the generic extractors in order and returns the first non-empty result:
 * email blocks, then the structured faculty list, then the staff-panel list.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromHtml(string $html, array $keywords, string $sourceUrl): array
{
    $found = $this->extractFromEmailBlocks($html, $keywords, $sourceUrl);

    if ($found === []) {
        $found = $this->extractFromStructuredFacultyList($html, $keywords, $sourceUrl);
    }

    if ($found === []) {
        $found = $this->extractFromStaffPanelList($html, $keywords, $sourceUrl);
    }

    return $found;
}
/**
 * Decides whether the page must be crawled through the `ajax_teacher_list` endpoint.
 *
 * True when the URL (case-insensitively) contains `ajax_teacher_list`, when the HTML
 * mentions `ajax_teacher_list.html`, or when the host contains `sais.sjtu.edu.cn`
 * and the URL contains `faculty`.
 */
protected function isSaisAjaxFacultyPage(string $html, string $sourceUrl): bool
{
    $lowerUrl = strtolower($sourceUrl);

    if (str_contains($lowerUrl, 'ajax_teacher_list') || str_contains($html, 'ajax_teacher_list.html')) {
        return true;
    }

    $host = strtolower((string) parse_url($sourceUrl, PHP_URL_HOST));
    if (! str_contains($host, 'sais.sjtu.edu.cn')) {
        return false;
    }

    return str_contains($lowerUrl, 'faculty');
}
/**
 * Loads the teacher list through the AJAX endpoint described by the page and
 * turns the returned HTML fragment into items.
 *
 * Endpoint and category values come from parseSaisAjaxConfig(). The keywords are
 * joined with spaces and sent as `search`; with a search term `type` is "2" and
 * `zm` is empty, without one `type` is "1" and `zm` is "All". The `content` field
 * of the JSON reply is appended to the page HTML before extraction.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto> at most $maxResults items
 *
 * @throws \RuntimeException when the endpoint replies unsuccessfully or not with a JSON array/object
 */
protected function fetchSaisFacultyItems(
    string $requestUrl,
    string $pageHtml,
    array $keywords,
    int $maxResults,
): array {
    $endpoint = $this->parseSaisAjaxConfig($pageHtml, $requestUrl);
    $searchTerm = implode(' ', $keywords);
    $hasSearch = $searchTerm !== '';

    $formFields = [
        'cat_id' => $endpoint['cat_id'],
        'cat_code' => $endpoint['cat_code'],
        'type' => $hasSearch ? '2' : '1',
        'zm' => $hasSearch ? '' : 'All',
        'search' => $searchTerm,
    ];

    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
    $response = Http::timeout($timeout)
        ->connectTimeout(min(8, $timeout))
        ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
        ->withHeaders([
            'User-Agent' => 'SlakeSchool-Crawler/1.0',
            'Accept' => 'application/json, text/html',
        ])
        ->asForm()
        ->post($endpoint['api_url'], $formFields);

    if (! $response->successful()) {
        throw new \RuntimeException('SAIS 教师列表接口请求失败HTTP '.$response->status());
    }

    $payload = $response->json();
    if (! is_array($payload)) {
        throw new \RuntimeException('SAIS 教师列表接口返回格式异常');
    }

    $fragment = (string) ($payload['content'] ?? '');
    if ($fragment === '') {
        return [];
    }

    $found = $this->extractFromSaisJsList($pageHtml.$fragment, $keywords, $requestUrl);

    return count($found) > $maxResults
        ? array_slice($found, 0, $maxResults)
        : $found;
}
/**
 * Reads the AJAX call settings (`cat_id`, `cat_code`, endpoint URL) out of the
 * page's inline script, falling back to built-in defaults for anything not found.
 *
 * A relative endpoint is resolved against $sourceUrl; if the result still starts
 * with "/" it is prefixed with the request origin.
 *
 * @return array{cat_id:string,cat_code:string,api_url:string}
 */
protected function parseSaisAjaxConfig(string $html, string $sourceUrl): array
{
    $config = [
        'cat_id' => '18',
        'cat_code' => 'faculty',
        'api_url' => 'https://sais.sjtu.edu.cn/active/ajax_teacher_list.html',
    ];

    if (preg_match("/cat_id\s*:\s*'(\d+)'/i", $html, $idMatch)) {
        $config['cat_id'] = $idMatch[1];
    }

    if (preg_match("/cat_code\s*:\s*'([^']+)'/i", $html, $codeMatch)) {
        $config['cat_code'] = $codeMatch[1];
    }

    if (preg_match("#url\s*:\s*'([^']*ajax_teacher_list[^']*)'#i", $html, $urlMatch)) {
        $declared = html_entity_decode($urlMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $config['api_url'] = $this->resolveUrl($declared, $sourceUrl) ?? $config['api_url'];
    }

    $origin = $this->requestOrigin($sourceUrl);
    if ($origin !== null && str_starts_with($config['api_url'], '/')) {
        $config['api_url'] = $origin.$config['api_url'];
    }

    return $config;
}
/**
 * Extracts people from `<a class="… name …" href="…/faculty/…">Name</a>` links.
 *
 * A link is kept when its text looks like a person name, its href contains
 * "/faculty/", it has not been seen before (keyed by resolved URL, or by name when
 * the URL cannot be resolved) and "name + college" matches the keywords. The
 * college is taken from the page title, the university from the source URL/page.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromSaisJsList(string $html, array $keywords, string $sourceUrl): array
{
    $university = $this->inferUniversityFromSource($sourceUrl, $html);
    $college = $this->inferCollegeFromPageTitle($html);

    if (! preg_match_all('#<a\b([^>]*?)>([^<]+)</a>#su', $html, $anchors, PREG_SET_ORDER)) {
        return [];
    }

    $people = [];
    $known = [];
    foreach ($anchors as $anchor) {
        $attributes = (string) $anchor[1];

        $hasNameClass = preg_match('/\bclass="[^"]*\bname\b[^"]*"/u', $attributes);
        if (! $hasNameClass || ! preg_match('#\bhref="([^"]+)"#u', $attributes, $hrefMatch)) {
            continue;
        }

        $name = CrawlAuthorParser::cleanText($anchor[2]) ?? '';
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }

        $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        if (! str_contains(strtolower($href), '/faculty/')) {
            continue;
        }

        $profileUrl = $this->resolveUrl($href, $sourceUrl);
        $identity = $profileUrl ?: ('name:'.md5($name));
        if (isset($known[$identity])) {
            continue;
        }

        $searchable = trim($name.' '.($college ?? ''));
        if (! $this->matchesKeywords($searchable, $keywords)) {
            continue;
        }

        $known[$identity] = true;
        $people[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($identity),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $college,
            universityName: $university ?? CrawlAuthorParser::universityFromAffiliation($college),
            summary: $college ? '单位:'.$college : null,
            keywords: $keywords,
            academicTitle: null,
            platform: 'faculty_html_sais',
            bio: null,
        );
    }

    return $people;
}
/**
 * Returns "scheme://host[:port]" for the URL, or null when scheme or host is missing.
 */
protected function requestOrigin(string $sourceUrl): ?string
{
    $parts = parse_url($sourceUrl);
    $usable = is_array($parts) && ! empty($parts['scheme']) && ! empty($parts['host']);
    if (! $usable) {
        return null;
    }

    $port = empty($parts['port']) ? '' : ':'.$parts['port'];

    return $parts['scheme'].'://'.$parts['host'].$port;
}
/**
 * Builds one item per distinct email address found in the page, using the text
 * around each address to guess the person's name and affiliation.
 *
 * The context window is 800 bytes starting 400 bytes before the address. It is
 * cut with mb_strcut() so it never begins or ends inside a multibyte UTF-8
 * character: a plain substr() could split a character, which makes the Unicode
 * regex in htmlToPlain() fail and wipes out the whole context.
 *
 * Addresses are skipped when normalisation fails, when already seen, when the
 * context does not match the keywords, or when no name can be guessed.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromEmailBlocks(string $html, array $keywords, string $sourceUrl): array
{
    $items = [];
    $seen = [];
    if (! preg_match_all(
        '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
        $html,
        $emailMatches,
        PREG_OFFSET_CAPTURE
    )) {
        return [];
    }

    foreach ($emailMatches[1] as $match) {
        $email = CrawlAuthorParser::normalizeEmail($match[0]);
        if (! $email || isset($seen[$email])) {
            continue;
        }

        // PREG_OFFSET_CAPTURE reports byte offsets, so the window is measured in bytes;
        // mb_strcut() snaps both edges to character boundaries.
        $pos = (int) $match[1];
        $window = mb_strcut($html, max(0, $pos - 400), 800, 'UTF-8');
        $plain = $this->htmlToPlain($window);
        if (! $this->matchesKeywords($plain, $keywords)) {
            continue;
        }

        $name = $this->guessName($plain, $email);
        if ($name === '') {
            continue;
        }

        $affiliation = $this->guessAffiliation($plain);
        $seen[$email] = true;
        $items[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($email),
            name: $name,
            profileUrl: $sourceUrl,
            email: $email,
            affiliation: $affiliation,
            universityName: CrawlAuthorParser::universityFromAffiliation($affiliation)
                ?? $this->inferUniversityFromSource($sourceUrl, $html),
            summary: Str::limit($plain, 300),
            keywords: $keywords,
            academicTitle: null,
            platform: 'faculty_html',
            bio: null,
        );
    }

    return $items;
}
/**
 * Extracts people from "panel" style staff pages: sections introduced by
 * `<div class="panel-head">` with a `<div class="title">` department heading,
 * containing `staff-item` links.
 *
 * Each section's links are attributed to that section's department (or to the
 * college from the page title when the heading is empty). If the panel pass yields
 * nothing, every `staff-item` link on the page is used with the page-title college.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromStaffPanelList(string $html, array $keywords, string $sourceUrl): array
{
    $university = $this->inferUniversityFromSource($sourceUrl, $html);
    $fallbackCollege = $this->inferCollegeFromPageTitle($html);

    // Turns [links, department] groups into de-duplicated items.
    $buildItems = function (array $groups) use ($university, $keywords, $sourceUrl): array {
        $built = [];
        $known = [];
        foreach ($groups as [$links, $department]) {
            foreach ($links as $link) {
                $person = $this->makeStaffPanelItem($link, $department, $university, $keywords, $sourceUrl);
                if ($person === null || isset($known[$person->externalId])) {
                    continue;
                }
                $known[$person->externalId] = true;
                $built[] = $person;
            }
        }

        return $built;
    };

    $panelGroups = [];
    $sections = preg_split('#<div\s+class="panel-head">#u', $html) ?: [];
    if (count($sections) > 1) {
        // The first piece is whatever precedes the first panel head.
        foreach (array_slice($sections, 1) as $section) {
            if (! preg_match('#<div\s+class="title">\s*([^<]+?)\s*</div>#u', $section, $headingMatch)) {
                continue;
            }
            $department = CrawlAuthorParser::cleanText($headingMatch[1]);
            $panelGroups[] = [$this->extractStaffItemLinks($section), $department ?: $fallbackCollege];
        }
    }

    $fromPanels = $buildItems($panelGroups);
    if ($fromPanels !== []) {
        return $fromPanels;
    }

    return $buildItems([[$this->extractStaffItemLinks($html), $fallbackCollege]]);
}
/**
 * Collects `<a … staff-item … href="…">Name</a>` links whose text looks like a
 * person name, de-duplicated on "href|name".
 *
 * @return list<array{href:string,name:string}>
 */
protected function extractStaffItemLinks(string $html): array
{
    if (! preg_match_all('#<a\b([^>]*?)>([^<]+)</a>#su', $html, $anchors, PREG_SET_ORDER)) {
        return [];
    }

    $unique = [];
    foreach ($anchors as $anchor) {
        $attributes = (string) $anchor[1];
        $isStaffLink = str_contains($attributes, 'staff-item');
        if (! $isStaffLink || ! preg_match('#\bhref="([^"]+)"#u', $attributes, $hrefMatch)) {
            continue;
        }

        $name = CrawlAuthorParser::cleanText($anchor[2]) ?? '';
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }

        $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $identity = $href.'|'.$name;
        if (! isset($unique[$identity])) {
            $unique[$identity] = ['href' => $href, 'name' => $name];
        }
    }

    return array_values($unique);
}
/**
 * Turns one staff link into an item, or returns null when "name + department"
 * does not match the keywords.
 *
 * The external id is derived from the resolved profile URL, or from the name when
 * the URL cannot be resolved.
 *
 * @param array{href:string,name:string} $link
 * @param list<string>                   $keywords
 */
protected function makeStaffPanelItem(
    array $link,
    ?string $department,
    ?string $pageUniversity,
    array $keywords,
    string $sourceUrl,
): ?CrawlItemDto {
    $personName = $link['name'];
    $profileUrl = $this->resolveUrl($link['href'], $sourceUrl);

    $searchable = trim($personName.' '.($department ?? ''));
    if (! $this->matchesKeywords($searchable, $keywords)) {
        return null;
    }

    $identity = $profileUrl ?: ('name:'.md5($personName));

    return $this->makeFacultyItem(
        externalKey: 'faculty:'.md5($identity),
        name: $personName,
        profileUrl: $profileUrl,
        email: null,
        affiliation: $department,
        universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($department),
        summary: $department ? '单位:'.$department : null,
        keywords: $keywords,
        academicTitle: null,
        platform: 'faculty_html_smse',
        bio: null,
    );
}
/**
 * Derives a college name from the part of `<title>` that follows the first dash.
 *
 * If that part contains a run ending in 学院 / 研究院 / 学部 / 系, that run is
 * returned; otherwise the whole cleaned part is. Returns null when the title has
 * no "left - right" shape or the right part is empty.
 */
protected function inferCollegeFromPageTitle(string $html): ?string
{
    $hasDashedTitle = preg_match('/<title>\s*[^<\-\–—]+[\-–—]\s*([^<]+?)\s*<\/title>/u', $html, $titleMatch);
    if (! $hasDashedTitle) {
        return null;
    }

    $suffix = CrawlAuthorParser::cleanText($titleMatch[1]);
    if ($suffix === null || $suffix === '') {
        return null;
    }

    $hasCollegeWord = preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $suffix, $collegeMatch);

    return $hasCollegeWord
        ? CrawlAuthorParser::cleanText($collegeMatch[1])
        : $suffix;
}
/**
 * Fills in the lead author's academic title and department from a profile page.
 *
 * The title is the text of the first `<em>` element and is only used when the lead
 * author has none. The department is the value labelled 所属二级机构 and is only
 * used when the lead author has neither `college` nor `affiliation`. When nothing
 * new is found the original item is returned; otherwise a copy with updated
 * `extra` and first parsed author.
 */
protected function applyProfileMetadataToItem(CrawlItemDto $item, string $html): CrawlItemDto
{
    $existingLead = $item->extra['lead_author'] ?? null;
    $lead = is_array($existingLead) ? $existingLead : [];
    $touched = false;

    if (empty($lead['academic_title']) && preg_match('/<em>\s*([^<]+?)\s*<\/em>/u', $html, $emMatch)) {
        $foundTitle = CrawlAuthorParser::cleanText($emMatch[1]);
        if ($foundTitle !== null && $foundTitle !== '') {
            $lead['academic_title'] = $foundTitle;
            $touched = true;
        }
    }

    if (empty($lead['college']) && empty($lead['affiliation'])) {
        $foundDepartment = $this->parseLabeledField($html, '所属二级机构');
        if ($foundDepartment !== null && $foundDepartment !== '') {
            $lead['affiliation'] = $foundDepartment;
            $lead['college'] = $foundDepartment;
            $touched = true;
        }
    }

    if (! $touched) {
        return $item;
    }

    $hasTitle = ! empty($lead['academic_title']);
    $hasCollege = ! empty($lead['college']);

    $extra = $item->extra;
    $extra['lead_author'] = $lead;
    if ($hasTitle) {
        $extra['academic_title'] = $lead['academic_title'];
    }
    if ($hasCollege) {
        $extra['college_name'] = $lead['college'];
    }

    $authors = $item->authorsParsed;
    if ($authors !== []) {
        if ($hasTitle) {
            $authors[0]['academic_title'] = $lead['academic_title'];
        }
        if ($hasCollege) {
            $authors[0]['affiliation'] = $lead['college'];
        }
    }

    return new CrawlItemDto(
        externalId: $item->externalId,
        title: $item->title,
        canonicalUrl: $item->canonicalUrl,
        authors: $item->authors,
        summary: $item->summary,
        publishedAt: $item->publishedAt,
        schoolName: $item->schoolName,
        section: $item->section,
        contentHtml: $item->contentHtml,
        extra: $extra,
        authorsParsed: $authors,
    );
}
/**
 * Extracts people from card-style lists: `<li>` blocks containing
 * `<div class="name">`, optionally with labelled 所在单位 / 职称 / 简介 fields.
 *
 * The search is limited to the `<div class="list"><ul>…</ul></div>` block when
 * one exists. The profile URL is the first link in the card, or failing that is
 * looked up in the page scripts. Items are de-duplicated on profile URL (or name).
 *
 * Changes from the previous version:
 *  - a null result from cleanText() is treated as an empty name instead of being
 *    passed to looksLikePersonName(), which only accepts strings;
 *  - summary parts are joined with a full-width semicolon instead of being
 *    concatenated with nothing between them.
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromStructuredFacultyList(string $html, array $keywords, string $sourceUrl): array
{
    $items = [];
    $seen = [];

    $collegeName = null;
    if (preg_match('/<div\s+class="title">\s*([^<]+?)\s*<\/div>/u', $html, $collegeMatch)) {
        $collegeName = CrawlAuthorParser::cleanText($collegeMatch[1]);
    }

    $listHtml = $html;
    if (preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $listMatch)) {
        $listHtml = $listMatch[1];
    }
    if (! preg_match_all('#<li>(.*?)</li>#su', $listHtml, $liBlocks)) {
        return [];
    }

    $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);

    foreach ($liBlocks[1] as $inner) {
        $inner = (string) $inner;
        if (! preg_match('/<div\s+class="name">\s*([^<]+?)\s*<\/div>/u', $inner, $nameMatch)) {
            continue;
        }

        // cleanText() may return null; normalise to '' as the sibling extractors do.
        $name = CrawlAuthorParser::cleanText($nameMatch[1]) ?? '';
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }

        $href = '';
        if (preg_match('/<a\s+[^>]*href="([^"]*)"/u', $inner, $hrefMatch)) {
            $href = (string) $hrefMatch[1];
        }
        $profileUrl = $this->resolveUrl($href, $sourceUrl)
            ?? $this->inferProfileUrlFromPageScripts($html, $name, $sourceUrl);

        $plain = $this->htmlToPlain($inner);
        if (! $this->matchesKeywords($plain.' '.$name, $keywords)) {
            continue;
        }

        $dedupeKey = $profileUrl ?: ('name:'.md5($name));
        if (isset($seen[$dedupeKey])) {
            continue;
        }
        $seen[$dedupeKey] = true;

        $affiliation = $this->parseLabeledField($inner, '所在单位')
            ?? $collegeName;
        $academicTitle = $this->parseLabeledField($inner, '职称');
        // The card's unit field is usually the college, so the university name is
        // taken from the site/page header first.
        $universityName = $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation);
        $bio = $this->parseLabeledField($inner, '简介');

        $summaryParts = array_filter([
            $academicTitle ? '职称:'.$academicTitle : null,
            $affiliation ? '单位:'.$affiliation : null,
            $bio,
        ]);

        $items[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($dedupeKey),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $affiliation,
            universityName: $universityName,
            // "\u{FF1B}" is the full-width semicolon.
            summary: Str::limit(implode("\u{FF1B}", $summaryParts), 300),
            keywords: $keywords,
            academicTitle: $academicTitle,
            platform: 'faculty_html_tsites',
            bio: $bio,
        );
    }

    return $items;
}
/**
 * Assembles the CrawlItemDto for one faculty member.
 *
 * The person's name is used as both title and authors string; the affiliation is
 * stored as both affiliation and college. The same details are written to
 * `extra['lead_author']` and to a single `authorsParsed` entry.
 *
 * @param list<string> $keywords stored space-joined under `extra['keyword']`
 */
protected function makeFacultyItem(
    string $externalKey,
    string $name,
    ?string $profileUrl,
    ?string $email,
    ?string $affiliation,
    ?string $universityName,
    ?string $summary,
    array $keywords,
    ?string $academicTitle,
    string $platform,
    ?string $bio = null,
): CrawlItemDto {
    $leadAuthor = [
        'name' => $name,
        'email' => $email,
        'affiliation' => $affiliation,
        'college' => $affiliation,
        'university_name' => $universityName,
        'academic_title' => $academicTitle,
        'bio' => $bio,
        'profile_url' => $profileUrl,
    ];

    $parsedAuthor = [
        'name' => $name,
        'email' => $email,
        'affiliation' => $affiliation,
        'university_name' => $universityName,
        'academic_title' => $academicTitle,
    ];

    $extra = [
        'platform' => $platform,
        'academic_title' => $academicTitle,
        'college_name' => $affiliation,
        'bio' => $bio,
        'profile_url' => $profileUrl,
        'lead_author' => $leadAuthor,
        'keyword' => implode(' ', $keywords),
    ];

    return new CrawlItemDto(
        externalId: $externalKey,
        title: $name,
        canonicalUrl: $profileUrl,
        authors: $name,
        summary: $summary,
        schoolName: $universityName,
        extra: $extra,
        authorsParsed: [$parsedAuthor],
    );
}
/**
 * Tells whether the text passes the keyword filter.
 *
 * An empty keyword list accepts everything. Otherwise at least one non-empty
 * keyword must occur in the text (case-insensitive via stripos()).
 *
 * @param list<string> $keywords
 */
protected function matchesKeywords(string $plain, array $keywords): bool
{
    if (count($keywords) === 0) {
        return true;
    }

    foreach ($keywords as $keyword) {
        if ($keyword === '') {
            continue;
        }
        if (stripos($plain, $keyword) !== false) {
            return true;
        }
    }

    return false;
}
/**
 * Converts an HTML fragment to single-line plain text: tags removed, entities
 * decoded, every whitespace run collapsed to one space.
 *
 * Invalid UTF-8 bytes (for example from a fragment cut in the middle of a
 * multibyte character) are replaced before the Unicode regex runs. Previously a
 * single bad byte made preg_replace() fail and the whole text came back as ''.
 * Valid input is processed exactly as before.
 */
protected function htmlToPlain(string $html): string
{
    $decoded = html_entity_decode(strip_tags($html), ENT_QUOTES | ENT_HTML5, 'UTF-8');

    if (! mb_check_encoding($decoded, 'UTF-8')) {
        $decoded = mb_scrub($decoded, 'UTF-8');
    }

    return preg_replace('/\s+/u', ' ', $decoded) ?? '';
}
/**
 * Reads the value that follows "<label>:" in an HTML fragment, up to the next tag.
 *
 * The label may be followed by an ASCII colon or the full-width colon U+FF1A.
 * Previously only the ASCII colon was accepted, so fields written with the
 * full-width colon were never found.
 *
 * @param string $label literal label text (regex-escaped here)
 * @return string|null cleaned value, or null when the label is absent
 */
protected function parseLabeledField(string $html, string $label): ?string
{
    // The full-width colon is written as an escape so the pattern survives tooling
    // that mangles ambiguous Unicode punctuation.
    $pattern = '/'.preg_quote($label, '/').'[:\x{FF1A}]\s*([^<]+)/u';
    if (! preg_match($pattern, $html, $match)) {
        return null;
    }

    return CrawlAuthorParser::cleanText($match[1]);
}
/**
 * Heuristic: does this link/card text look like a person's name?
 *
 * Rejects text starting with a navigation label (首页, 登录, 联系我们, 下页, 尾页,
 * 转到). Accepts 2–4 CJK characters, optionally followed by "·" and 2–4 more, or a
 * Latin name of 2–31 characters made of letters, whitespace, dots and hyphens.
 */
protected function looksLikePersonName(string $name): bool
{
    foreach (['首页', '登录', '联系我们', '下页', '尾页', '转到'] as $navigationLabel) {
        if (str_starts_with($name, $navigationLabel)) {
            return false;
        }
    }

    if (preg_match('/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u', $name) === 1) {
        return true;
    }

    return preg_match('/^[A-Za-z][A-Za-z\s\.\-]{1,30}$/', $name) === 1;
}
/**
 * Resolves an href found on a page into an absolute http(s) URL.
 *
 * Returns null for hrefs that do not point to a fetchable page: empty values,
 * fragment-only links ("#…") and any non-http(s) scheme (javascript:, mailto:,
 * tel:, data:, …, matched case-insensitively). Previously only a lowercase
 * "javascript:" was rejected and the others were glued onto the base directory,
 * producing bogus profile URLs.
 *
 * Absolute http(s) URLs are returned as they are. Otherwise the href is resolved
 * against $baseUrl: protocol-relative ("//host/…"), root-relative ("/…"),
 * query-only ("?…", resolved against the base document) or relative to the base
 * path's directory. Dot segments ("../") are not collapsed. When $baseUrl has no
 * scheme or host the href is returned unresolved.
 *
 * @param string $href    raw attribute value; HTML entities are decoded here
 * @param string $baseUrl URL of the page the href was found on
 */
protected function resolveUrl(string $href, string $baseUrl): ?string
{
    $href = trim(html_entity_decode($href, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
    if ($href === '' || $href[0] === '#') {
        return null;
    }

    if (preg_match('#^https?://#i', $href)) {
        return $href;
    }

    // Any other explicit scheme cannot be fetched as a profile page.
    if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href)) {
        return null;
    }

    $base = parse_url($baseUrl);
    if (! is_array($base) || empty($base['scheme']) || empty($base['host'])) {
        return $href;
    }

    $origin = $base['scheme'].'://'.$base['host'];
    if (! empty($base['port'])) {
        $origin .= ':'.$base['port'];
    }

    if (str_starts_with($href, '//')) {
        return $base['scheme'].':'.$href;
    }
    if (str_starts_with($href, '/')) {
        return $origin.$href;
    }

    $path = $base['path'] ?? '/';
    if (str_starts_with($href, '?')) {
        // Query-only reference: same document, different query.
        return $origin.$path.$href;
    }

    $dir = str_contains($path, '/') ? substr($path, 0, (int) strrpos($path, '/') + 1) : '/';

    return $origin.$dir.$href;
}
/**
 * Looks for an inline `addimg("…", "/…index.htm", "<name>")` call for the given
 * person and resolves its second argument against the source URL.
 *
 * @return string|null resolved profile URL, or null when no such call exists
 */
protected function inferProfileUrlFromPageScripts(string $html, string $name, string $sourceUrl): ?string
{
    $quotedName = preg_quote($name, '/');
    $pattern = '/addimg\(\s*(?:"[^"]*"|\'[^\']*\')\s*,\s*"(\\/[^"]+index\.htm)"\s*,\s*"'.$quotedName.'"/u';

    if (preg_match($pattern, $html, $call)) {
        return $this->resolveUrl($call[1], $sourceUrl);
    }

    return null;
}
/**
 * Guesses the university a page belongs to.
 *
 * Known host fragments map to a fixed university name. Otherwise the first
 * "…大学" run in the page's plain text is used. Returns null when neither applies.
 */
protected function inferUniversityFromSource(string $sourceUrl, string $html): ?string
{
    $universitiesByDomain = [
        'sjtu.edu.cn' => '上海交通大学',
        'tsinghua.edu.cn' => '清华大学',
        'pku.edu.cn' => '北京大学',
        'zju.edu.cn' => '浙江大学',
        'fudan.edu.cn' => '复旦大学',
    ];

    $host = parse_url($sourceUrl, PHP_URL_HOST);
    if (is_string($host)) {
        $host = strtolower($host);
        foreach ($universitiesByDomain as $domain => $university) {
            if (str_contains($host, $domain)) {
                return $university;
            }
        }
    }

    $pageText = $this->htmlToPlain($html);
    if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,20}大学)/u', $pageText, $named)) {
        return CrawlAuthorParser::cleanText($named[1]);
    }

    return null;
}
/**
 * Guesses a person's name from the text surrounding their email address.
 *
 * Order of attempts:
 *  1. 2–4 CJK characters immediately followed (optionally after whitespace) by an
 *     academic title; only the name part is returned. The name is matched lazily
 *     because titles are CJK too: the previous greedy match with an optional
 *     title returned "张三教授" for "张三教授".
 *  2. The first run of 2–4 CJK characters anywhere in the text (the previous
 *     behaviour when no title follows).
 *  3. The email's local part with ".", "_" and "-" turned into spaces, title-cased.
 *
 * @param string $plain plain-text context around the address
 * @param string $email the address the name is being guessed for
 * @return string guessed name; may be empty when nothing usable is found
 */
protected function guessName(string $plain, string $email): string
{
    // Longer titles come first so the shorter ones they contain never win.
    $titles = '副教授|助理教授|教授|副研究员|助理研究员|研究员|讲师|博士|老师';

    if (preg_match('/([\x{4e00}-\x{9fff}]{2,4}?)\s*(?:'.$titles.')/u', $plain, $m)) {
        return trim($m[1]);
    }

    if (preg_match('/([\x{4e00}-\x{9fff}]{2,4})/u', $plain, $m)) {
        return trim($m[1]);
    }

    $local = strstr($email, '@', true) ?: '';
    $local = str_replace(['.', '_', '-'], ' ', $local);

    return Str::title(trim($local));
}
/**
 * Guesses an affiliation from plain text: the first run of CJK characters,
 * Latin letters and whitespace that ends in 大学 / 学院 / 研究院 / 研究所 /
 * University / College.
 *
 * @return string|null cleaned affiliation, or null when no such run exists
 */
protected function guessAffiliation(string $plain): ?string
{
    $pattern = '/((?:[\x{4e00}-\x{9fff}A-Za-z\s]{2,40}(?:大学|学院|研究院|研究所|University|College)))/u';

    if (! preg_match($pattern, $plain, $found)) {
        return null;
    }

    return CrawlAuthorParser::cleanText($found[1]);
}
}