\s*[^<\-\–—]+[\-–—]\s*([^<]+?)\s*<\/title>/u', $html, $match)) { $title = CrawlAuthorParser::cleanText($match[1]); if ($title !== null && $title !== '') { if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $title, $college)) { return CrawlAuthorParser::cleanText($college[1]); } return $title; } } if (preg_match('/<meta\s+name="description"\s+content="([^"]+)"/u', $html, $match)) { $desc = CrawlAuthorParser::cleanText(html_entity_decode($match[1], ENT_QUOTES | ENT_HTML5, 'UTF-8')); if ($desc !== null && $desc !== '') { if (preg_match('/大学([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $desc, $college)) { return CrawlAuthorParser::cleanText($college[1]); } if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $desc, $college)) { return CrawlAuthorParser::cleanText($college[1]); } } } if (preg_match('/<meta\s+name=[\'"]SiteName[\'"]\s+content=[\'"]([^\'"]+)[\'"]/u', $html, $match)) { $siteName = CrawlAuthorParser::cleanText(html_entity_decode($match[1], ENT_QUOTES | ENT_HTML5, 'UTF-8')); if ($siteName !== null && $siteName !== '') { if (preg_match('/大学([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $siteName, $college)) { return CrawlAuthorParser::cleanText($college[1]); } } } return null; }

    /**
     * Fill in the lead author's academic title and college from a fetched profile page.
     *
     * Only fields that are still empty are touched: the title is taken from the first
     * `<em>…</em>` element, the college/affiliation from the "所属二级机构" labeled field.
     * When nothing new is found the original DTO is returned as-is; otherwise a new DTO
     * is built whose `extra` and first `authorsParsed` entry carry the added values.
     */
    protected function applyProfileMetadataToItem(CrawlItemDto $item, string $html): CrawlItemDto
    {
        $lead = is_array($item->extra['lead_author'] ?? null) ? $item->extra['lead_author'] : [];
        $changed = false;

        if (empty($lead['academic_title']) && preg_match('/<em>\s*([^<]+?)\s*<\/em>/u', $html, $titleMatch)) {
            $title = CrawlAuthorParser::cleanText($titleMatch[1]);
            if ($title !== null && $title !== '') {
                $lead['academic_title'] = $title;
                $changed = true;
            }
        }

        if (empty($lead['college']) && empty($lead['affiliation'])) {
            $dept = $this->parseLabeledField($html, '所属二级机构');
            if ($dept !== null && $dept !== '') {
                $lead['affiliation'] = $dept;
                $lead['college'] = $dept;
                $changed = true;
            }
        }

        if (! $changed) {
            return $item;
        }

        $extra = $item->extra;
        $extra['lead_author'] = $lead;
        if (! empty($lead['academic_title'])) {
            $extra['academic_title'] = $lead['academic_title'];
        }
        if (! empty($lead['college'])) {
            $extra['college_name'] = $lead['college'];
        }

        // Mirror the same values onto the first parsed author, when there is one.
        $authorsParsed = $item->authorsParsed;
        if ($authorsParsed !== []) {
            if (! empty($lead['academic_title'])) {
                $authorsParsed[0]['academic_title'] = $lead['academic_title'];
            }
            if (! empty($lead['college'])) {
                $authorsParsed[0]['affiliation'] = $lead['college'];
            }
        }

        return new CrawlItemDto(
            externalId: $item->externalId,
            title: $item->title,
            canonicalUrl: $item->canonicalUrl,
            authors: $item->authors,
            summary: $item->summary,
            publishedAt: $item->publishedAt,
            schoolName: $item->schoolName,
            section: $item->section,
            contentHtml: $item->contentHtml,
            extra: $extra,
            authorsParsed: $authorsParsed,
        );
    }

    /**
     * Extract faculty entries from a `<li>`-based list whose items carry a `div.name`.
     *
     * The page-level college is read from `div.title`; the list is narrowed to the
     * `<ul>` inside `div.list` when that wrapper exists. Each `<li>` yields one item
     * if its name passes looksLikePersonName() and the entry matches the keywords.
     * Entries are de-duplicated by profile URL, or by name hash when no URL is known.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromStructuredFacultyList(string $html, array $keywords, string $sourceUrl): array
    {
        $items = [];
        $seen = [];

        $collegeName = null;
        if (preg_match('/<div\s+class="title">\s*([^<]+?)\s*<\/div>/u', $html, $collegeMatch)) {
            $collegeName = CrawlAuthorParser::cleanText($collegeMatch[1]);
        }

        $listHtml = $html;
        if (preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $listMatch)) {
            $listHtml = $listMatch[1];
        }

        if (! preg_match_all('#<li>(.*?)</li>#su', $listHtml, $liBlocks)) {
            return [];
        }

        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);

        foreach ($liBlocks[1] as $inner) {
            $inner = (string) $inner;

            if (! preg_match('/<div\s+class="name">\s*([^<]+?)\s*<\/div>/u', $inner, $nameMatch)) {
                continue;
            }

            // cleanText() may return null; coalesce so the string-typed name check below
            // skips the entry instead of raising a TypeError.
            $name = CrawlAuthorParser::cleanText($nameMatch[1]) ?? '';
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }

            $href = '';
            if (preg_match('/<a\s+[^>]*href="([^"]*)"/u', $inner, $hrefMatch)) {
                $href = (string) $hrefMatch[1];
            }

            // Fall back to the page's inline scripts when the entry has no usable link.
            $profileUrl = $this->resolveUrl($href, $sourceUrl)
                ?? $this->inferProfileUrlFromPageScripts($html, $name, $sourceUrl);

            $plain = $this->htmlToPlain($inner);
            if (! $this->matchesKeywords($plain.' '.$name, $keywords)) {
                continue;
            }

            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
            if (isset($seen[$dedupeKey])) {
                continue;
            }
            $seen[$dedupeKey] = true;

            $affiliation = $this->parseLabeledField($inner, '所在单位') ?? $collegeName;
            $academicTitle = $this->parseLabeledField($inner, '职称');
            // On list pages the "所在单位" field is usually the college, so the university
            // name is inferred from the site host / page text first.
            $universityName = $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation);
            $bio = $this->parseLabeledField($inner, '简介');

            $summaryParts = array_filter([
                $academicTitle ? '职称：'.$academicTitle : null,
                $affiliation ? '单位：'.$affiliation : null,
                $bio,
            ]);

            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($dedupeKey),
                name: $name,
                profileUrl: $profileUrl,
                email: null,
                affiliation: $affiliation,
                universityName: $universityName,
                // No parts means no summary: pass null rather than an empty string.
                summary: $summaryParts !== [] ? Str::limit(implode('；', $summaryParts), 300) : null,
                keywords: $keywords,
                academicTitle: $academicTitle,
                platform: 'faculty_html_tsites',
                bio: $bio,
            );
        }

        return $items;
    }

    /**
     * Build the CrawlItemDto for one faculty member.
     *
     * The affiliation is stored both as `affiliation` and as `college`; the person's
     * name doubles as the item title and the `authors` string; the keywords are
     * joined with single spaces into `extra['keyword']`.
     *
     * @param list<string> $keywords
     */
    protected function makeFacultyItem(
        string $externalKey,
        string $name,
        ?string $profileUrl,
        ?string $email,
        ?string $affiliation,
        ?string $universityName,
        ?string $summary,
        array $keywords,
        ?string $academicTitle,
        string $platform,
        ?string $bio = null,
    ): CrawlItemDto {
        $college = $affiliation;

        $lead = [
            'name' => $name,
            'email' => $email,
            'affiliation' => $college,
            'college' => $college,
            'university_name' => $universityName,
            'academic_title' => $academicTitle,
            'bio' => $bio,
            'profile_url' => $profileUrl,
        ];

        return new CrawlItemDto(
            externalId: $externalKey,
            title: $name,
            canonicalUrl: $profileUrl,
            authors: $name,
            summary: $summary,
            schoolName: $universityName,
            extra: [
                'platform' => $platform,
                'academic_title' => $academicTitle,
                'college_name' => $college,
                'bio' => $bio,
                'profile_url' => $profileUrl,
                'lead_author' => $lead,
                'keyword' => implode(' ', $keywords),
            ],
            authorsParsed: [[
                'name' => $name,
                'email' => $email,
                'affiliation' => $college,
                'university_name' => $universityName,
                'academic_title' => $academicTitle,
            ]],
        );
    }

    /**
     * Tell whether the text contains at least one of the keywords (case-insensitive for ASCII).
     *
     * An empty keyword list matches everything. Empty-string keywords are ignored, so a
     * list made up only of empty strings matches nothing.
     *
     * @param list<string> $keywords
     */
    protected function matchesKeywords(string $plain, array $keywords): bool
    {
        if ($keywords === []) {
            return true;
        }

        foreach ($keywords as $kw) {
            if ($kw !== '' && stripos($plain, $kw) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Strip tags, decode HTML entities and collapse every whitespace run to one space.
     */
    protected function htmlToPlain(string $html): string
    {
        $plain = html_entity_decode(strip_tags($html), ENT_QUOTES | ENT_HTML5, 'UTF-8');

        return preg_replace('/\s+/u', ' ', $plain) ?? '';
    }

    /**
     * Read the text that follows "<label>：" (full- or half-width colon) up to the next `<`.
     *
     * @return string|null the cleaned value, or null when the label is not present
     */
    protected function parseLabeledField(string $html, string $label): ?string
    {
        $pattern = '/'.preg_quote($label, '/').'[：:]\s*([^<]+)/u';
        if (! preg_match($pattern, $html, $match)) {
            return null;
        }

        return CrawlAuthorParser::cleanText($match[1]);
    }

    /**
     * Heuristic check that a string is a person's name rather than navigation text.
     *
     * Accepts 2–4 CJK ideographs, optionally followed by "·" and 2–4 more, or a Latin
     * name of letters, whitespace, dots and hyphens. Strings starting with common
     * navigation/pager labels are rejected up front.
     */
    protected function looksLikePersonName(string $name): bool
    {
        if (preg_match('/^(首页|登录|联系我们|下页|尾页|转到)/u', $name)) {
            return false;
        }

        return (bool) preg_match('/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u', $name)
            || (bool) preg_match('/^[A-Za-z][A-Za-z\s\.\-]{1,30}$/', $name);
    }

    /**
     * Resolve an href found on a page against that page's URL.
     *
     * Returns null when the href cannot point at a fetchable page: empty, fragment-only
     * ("#…"), or carrying a scheme other than http/https (javascript:, mailto:, tel:, …,
     * matched case-insensitively). Absolute http(s) URLs are returned unchanged;
     * protocol-relative, root-relative and directory-relative hrefs are joined with
     * the base URL. When the base URL has no scheme or host the decoded href is
     * returned as-is.
     */
    protected function resolveUrl(string $href, string $baseUrl): ?string
    {
        $href = trim(html_entity_decode($href, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
        if ($href === '' || str_starts_with($href, '#')) {
            return null;
        }

        if (preg_match('#^https?://#i', $href)) {
            return $href;
        }

        // Any other explicit scheme is not a page URL; gluing it onto the base directory
        // would produce a bogus link that later gets fetched as a profile page.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href)) {
            return null;
        }

        $base = parse_url($baseUrl);
        if (! is_array($base) || empty($base['scheme']) || empty($base['host'])) {
            return $href;
        }

        $origin = $base['scheme'].'://'.$base['host'];
        if (! empty($base['port'])) {
            $origin .= ':'.$base['port'];
        }

        if (str_starts_with($href, '//')) {
            return $base['scheme'].':'.$href;
        }

        if (str_starts_with($href, '/')) {
            return $origin.$href;
        }

        // Relative href: resolve against the directory part of the base path.
        $path = $base['path'] ?? '/';
        $dir = str_contains($path, '/')
            ? substr($path, 0, (int) strrpos($path, '/') + 1)
            : '/';

        return $origin.$dir.$href;
    }

    /**
     * Look for an `addimg(<quoted>, "/…index.htm", "<name>"` call in the page's scripts
     * and resolve its second argument as the profile URL of the named person.
     *
     * @return string|null the resolved URL, or null when no such call mentions the name
     */
    protected function inferProfileUrlFromPageScripts(string $html, string $name, string $sourceUrl): ?string
    {
        $escaped = preg_quote($name, '/');

        if (! preg_match(
            '/addimg\(\s*(?:"[^"]*"|\'[^\']*\')\s*,\s*"(\\/[^"]+index\.htm)"\s*,\s*"'.$escaped.'"/u',
            $html,
            $match,
        )) {
            return null;
        }

        return $this->resolveUrl($match[1], $sourceUrl);
    }

    /**
     * Infer the university name from the source URL's host, falling back to page text.
     *
     * The host must equal a known domain or be one of its subdomains; a plain substring
     * test would also accept unrelated hosts that merely contain the domain. When the
     * host is unknown, the first "…大学" phrase in the page's plain text is used.
     */
    protected function inferUniversityFromSource(string $sourceUrl, string $html): ?string
    {
        $universitiesByDomain = [
            'sjtu.edu.cn' => '上海交通大学',
            'tsinghua.edu.cn' => '清华大学',
            'pku.edu.cn' => '北京大学',
            'zju.edu.cn' => '浙江大学',
            'fudan.edu.cn' => '复旦大学',
            'nju.edu.cn' => '南京大学',
        ];

        $host = parse_url($sourceUrl, PHP_URL_HOST);
        if (is_string($host)) {
            $host = strtolower($host);
            foreach ($universitiesByDomain as $domain => $university) {
                if ($host === $domain || str_ends_with($host, '.'.$domain)) {
                    return $university;
                }
            }
        }

        if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,20}大学)/u', $this->htmlToPlain($html), $match)) {
            return CrawlAuthorParser::cleanText($match[1]);
        }

        return null;
    }

    /**
     * Guess a display name for an e-mail block.
     *
     * Uses the first run of 2–4 CJK ideographs found in the text; otherwise title-cases
     * the e-mail's local part with ".", "_" and "-" turned into spaces.
     * NOTE(review): the title suffix in the pattern is optional, so it does not restrict
     * which run is picked — confirm that is intended.
     */
    protected function guessName(string $plain, string $email): string
    {
        if (preg_match('/([\x{4e00}-\x{9fff}]{2,4})\s*(?:教授|副教授|讲师|研究员|博士|老师)?/u', $plain, $m)) {
            return trim($m[1]);
        }

        $local = strstr($email, '@', true) ?: '';
        $local = str_replace(['.', '_', '-'], ' ', $local);

        return Str::title(trim($local));
    }

    /**
     * Guess an affiliation: the first phrase of CJK/Latin characters and whitespace that
     * ends in an institution word (大学, 学院, 研究院, 研究所, University, College).
     */
    protected function guessAffiliation(string $plain): ?string
    {
        if (preg_match('/((?:[\x{4e00}-\x{9fff}A-Za-z\s]{2,40}(?:大学|学院|研究院|研究所|University|College)))/u', $plain, $m)) {
            return CrawlAuthorParser::cleanText($m[1]);
        }

        return null;
    }
}

normalizeRequestUrl($requestUrl); $firstHtml = $this->fetchHtml($baseUrl); if ($this->isAjaxTeacherListPage($firstHtml, $requestUrl)) { $items = $this->fetchAjaxTeacherItems($requestUrl, $firstHtml, $keywords, $maxResults, $maxPages); return $this->enrichEmailsFromProfilePages($items, $params); } if ($this->isNjuTeacherHomePage($firstHtml)) { $items = $this->fetchNjuTeacherHomeItems($requestUrl, $firstHtml, $keywords, $maxResults, $maxPages); return $this->enrichEmailsFromProfilePages($items, $params); } $totalPages = $this->detectTotalPages($firstHtml); $pagesToFetch = min($maxPages, $totalPages); $merged = []; $seen = []; for ($page = 1; $page <= $pagesToFetch && count($merged) < $maxResults; $page++) { $html = $page === 1 ? $firstHtml : null; if ($html === null) { break; } foreach ($this->extractFromHtml($html, $keywords, $requestUrl) as $item) { if (isset($seen[$item->externalId])) { continue; } $seen[$item->externalId] = true; $merged[] = $item; if (count($merged) >= $maxResults) { break 2; } } } if ($pagesToFetch > 1 && count($merged) < $maxResults) { $merged = $this->fetchRemainingListPages( $baseUrl, $firstHtml, $pagesToFetch, $keywords, $requestUrl, $merged, $seen, $maxResults, ); } return $this->enrichEmailsFromProfilePages($merged, $params); } /** * @param list $merged * @param array $seen * @param list $keywords * @return list */ protected function fetchRemainingListPages( string $baseUrl, string $firstHtml, int $pagesToFetch, array $keywords, string $requestUrl, array $merged, array $seen, int $maxResults, ): array { $poolSize = max(1, min(10, (int) config('crawl.faculty.list_fetch_pool_size', 5))); $pageUrls = []; for ($page = 2; $page <= $pagesToFetch; $page++) { $pageUrls[$page] = $this->buildPageUrl($baseUrl, $page, $firstHtml); } foreach (array_chunk($pageUrls, $poolSize, true) as $chunk) { $htmlByPage = $this->fetchHtmlPool($chunk); ksort($htmlByPage); foreach ($htmlByPage as $html) { foreach ($this->extractFromHtml($html, $keywords, $requestUrl) 
as $item) { if (isset($seen[$item->externalId])) { continue; } $seen[$item->externalId] = true; $merged[] = $item; if (count($merged) >= $maxResults) { return $merged; } } } } return $merged; } /** * @param array $pageUrls * @return array */ protected function fetchHtmlPool(array $pageUrls): array { if ($pageUrls === []) { return []; } $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20)); $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html']; $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($pageUrls, $headers, $timeout) { foreach ($pageUrls as $page => $url) { $pool->as((string) $page) ->timeout($timeout) ->connectTimeout(min(8, $timeout)) ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false) ->withHeaders($headers) ->get($url); } }); $htmlByPage = []; foreach ($pageUrls as $page => $url) { $body = $this->responseBodyFromPoolResult($responses[(string) $page] ?? null); if ($body !== null && $body !== '') { $htmlByPage[$page] = $body; } } return $htmlByPage; } /** * @param list $items * @param array $params * @return list */ protected function enrichEmailsFromProfilePages(array $items, array $params = []): array { if (! config('crawl.faculty.profile_email_enrich_enabled', true)) { return $items; } $maxEnrich = $this->resolveProfileEnrichMax($params, count($items)); if ($maxEnrich <= 0) { return $this->markProfileEnrichSkipped($items); } $poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 8))); $timeout = max(5, (int) config('crawl.faculty.profile_http_timeout_seconds', 10)); $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html']; $fetchMap = []; $enrichBudget = $maxEnrich; foreach ($items as $index => $item) { if ($enrichBudget <= 0) { break; } if ($this->itemHasEmail($item) || ! 
$item->canonicalUrl) { continue; } $fetchMap[$index] = $item; $enrichBudget--; } if ($fetchMap === []) { return $items; } $fetchedBodies = []; foreach (array_chunk($fetchMap, $poolSize, true) as $chunk) { $batchPending = []; foreach ($chunk as $index => $item) { $batchPending[$index] = $item; } $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($batchPending, $headers, $timeout) { foreach ($batchPending as $index => $item) { $pool->as((string) $index) ->timeout($timeout) ->connectTimeout(min(8, $timeout)) ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false) ->withHeaders($headers) ->get($item->canonicalUrl); } }); foreach ($batchPending as $index => $item) { $body = $this->responseBodyFromPoolResult($responses[(string) $index] ?? null); if ($body !== null) { $email = $this->extractEmailFromProfileHtml($body); if ($email) { $item = $this->applyEmailToItem($item, $email); } $item = $this->applyProfileMetadataToItem($item, $body); } $fetchedBodies[$index] = $item; } } $result = []; foreach ($items as $index => $item) { if (isset($fetchedBodies[$index])) { $result[] = $fetchedBodies[$index]; } elseif (! $this->itemHasEmail($item) && $item->canonicalUrl) { $result[] = $this->markItemProfileEnrichSkipped($item); } else { $result[] = $item; } } return $result; } /** * @param array $params */ protected function resolveProfileEnrichMax(array $params, int $itemCount): int { if (($params['skip_profile_enrich'] ?? false) === true) { return 0; } $configured = (int) ($params['profile_enrich_max'] ?? 
config('crawl.faculty.profile_enrich_max', 32)); return max(0, min($itemCount, min(200, $configured))); } /** * @param list $items * @return list */ protected function markProfileEnrichSkipped(array $items): array { return array_map(fn (CrawlItemDto $item) => $this->markItemProfileEnrichSkipped($item), $items); } protected function markItemProfileEnrichSkipped(CrawlItemDto $item): CrawlItemDto { if ($this->itemHasEmail($item)) { return $item; } $extra = $item->extra; $extra['profile_enrich_skipped'] = true; return new CrawlItemDto( externalId: $item->externalId, title: $item->title, canonicalUrl: $item->canonicalUrl, authors: $item->authors, summary: $item->summary, publishedAt: $item->publishedAt, schoolName: $item->schoolName, section: $item->section, contentHtml: $item->contentHtml, extra: $extra, authorsParsed: $item->authorsParsed, ); } protected function responseBodyFromPoolResult(mixed $result): ?string { if ($result instanceof Response && $result->successful()) { return (string) $result->body(); } return null; } protected function itemHasEmail(CrawlItemDto $item): bool { $lead = $item->extra['lead_author'] ?? null; if (is_array($lead) && CrawlAuthorParser::normalizeEmail($lead['email'] ?? null)) { return true; } foreach ($item->authorsParsed as $author) { if (CrawlAuthorParser::normalizeEmail($author['email'] ?? null)) { return true; } } return false; } protected function applyEmailToItem(CrawlItemDto $item, string $email): CrawlItemDto { $email = CrawlAuthorParser::normalizeEmail($email) ?? $email; $lead = is_array($item->extra['lead_author'] ?? null) ? $item->extra['lead_author'] : []; $lead['email'] = $email; $authorsParsed = $item->authorsParsed; if ($authorsParsed === []) { $authorsParsed = [[ 'name' => $item->title, 'email' => $email, 'affiliation' => $lead['affiliation'] ?? $lead['college'] ?? null, 'university_name' => $lead['university_name'] ?? 
$item->schoolName, ]]; } else { $authorsParsed[0]['email'] = $email; } $extra = $item->extra; $extra['lead_author'] = $lead; return new CrawlItemDto( externalId: $item->externalId, title: $item->title, canonicalUrl: $item->canonicalUrl, authors: $item->authors, summary: $item->summary, publishedAt: $item->publishedAt, schoolName: $item->schoolName, section: $item->section, contentHtml: $item->contentHtml, extra: $extra, authorsParsed: $authorsParsed, ); } protected function extractEmailFromProfileHtml(string $html): ?string { $labeledPatterns = [ '/电子邮箱[：:]\s*<\/strong>\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u', '/电子邮箱[：:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u', '/电子信箱[：:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u', '/E-?mail[：:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/iu', '/邮箱[：:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u', '/电子邮件[：:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u', ]; foreach ($labeledPatterns as $pattern) { if (preg_match($pattern, $html, $match)) { $email = CrawlAuthorParser::normalizeEmail($match[1]); if ($email && ! $this->isNoiseEmail($email)) { return $email; } } } $candidates = []; if (preg_match_all( '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#', $html, $emailMatches, )) { foreach ($emailMatches[1] as $raw) { $email = CrawlAuthorParser::normalizeEmail($raw); if ($email && ! 
$this->isNoiseEmail($email)) { $candidates[] = $email; } } } if ($candidates === []) { return null; } $candidates = array_values(array_unique($candidates)); foreach ($candidates as $email) { if (str_ends_with($email, '.edu.cn') || str_ends_with($email, '.edu')) { return $email; } } return $candidates[0]; } protected function isNoiseEmail(string $email): bool { return (bool) preg_match( '/^(noreply|no-reply|admin|webmaster|postmaster|root|support|service|info|contact)@/i', $email, ); } protected function fetchHtml(string $url): string { $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20)); $response = Http::timeout($timeout) ->connectTimeout(min(8, $timeout)) ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false) ->withHeaders(['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html']) ->get($url); if (! $response->successful()) { throw new \RuntimeException('页面请求失败（HTTP '.$response->status().'）：'.$url); } return (string) $response->body(); } protected function detectTotalPages(string $html): int { if (preg_match('/totalpage=(\d+)/i', $html, $match)) { return max(1, (int) $match[1]); } if (preg_match('/共\s*(\d+)\s*条/u', $html, $countMatch)) { $perPage = 0; if (preg_match('/]*>\s*

/u', $listMatch[1]) ?: 0; } if ($perPage > 0) { return max(1, (int) ceil(((int) $countMatch[1]) / $perPage)); } } return 1; } protected function buildPageUrl(string $baseUrl, int $page, string $firstPageHtml): string { $parts = parse_url($baseUrl); if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) { return $baseUrl; } parse_str((string) ($parts['query'] ?? ''), $query); $query['PAGENUM'] = (string) $page; if (preg_match('/totalpage=(\d+)/i', $firstPageHtml, $totalMatch)) { $query['totalpage'] = $totalMatch[1]; } $url = $parts['scheme'].'://'.$parts['host']; if (! empty($parts['port'])) { $url .= ':'.$parts['port']; } $url .= $parts['path'] ?? '/'; if ($query !== []) { $url .= '?'.http_build_query($query); } return $url; } protected function normalizeRequestUrl(string $url): string { $parts = parse_url($url); if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) { return $url; } $normalized = $parts['scheme'].'://'.$parts['host']; if (! empty($parts['port'])) { $normalized .= ':'.$parts['port']; } $normalized .= $parts['path'] ?? '/'; if (! 
empty($parts['query'])) { $normalized .= '?'.$parts['query']; } return $normalized; } /** * @param list $keywords * @return list */ protected function extractFromHtml(string $html, array $keywords, string $sourceUrl): array { $items = $this->extractFromSudyNewsFacultyList($html, $keywords, $sourceUrl); if ($items !== []) { return $items; } $items = $this->extractFromRaTeacherList($html, $keywords, $sourceUrl); if ($items !== []) { return $items; } $items = $this->extractFromVsbFacultyTable($html, $keywords, $sourceUrl); if ($items !== []) { return $items; } $items = $this->extractFromEmailBlocks($html, $keywords, $sourceUrl); if ($items !== []) { return $items; } $items = $this->extractFromStructuredFacultyList($html, $keywords, $sourceUrl); if ($items !== []) { return $items; } return $this->extractFromStaffPanelList($html, $keywords, $sourceUrl); } /** * 南大 Sudy CMS：ul.news_list 内 news_title / news_title1 链接（frontier、ic 等）。 * * @param list $keywords * @return list */ protected function extractFromSudyNewsFacultyList(string $html, array $keywords, string $sourceUrl): array { if (! preg_match('/class="news_list/u', $html)) { return []; } $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html); $defaultCollege = $this->inferCollegeFromPageTitle($html); if ($defaultCollege === null && preg_match('#

([^<]+)

#u', $html, $titleMatch)) { $defaultCollege = CrawlAuthorParser::cleanText($titleMatch[1]); } $items = []; $seen = []; $chunks = preg_split('#

([^<]+)#u', $chunk, $deptMatch)) { $department = CrawlAuthorParser::cleanText($deptMatch[1]); } foreach ($this->extractSudyNewsLinksFromChunk( $chunk, $department, $keywords, $sourceUrl, $pageUniversity, $seen, ) as $item) { $items[] = $item; } } return $items; } /** * @param array $seen * @param list $keywords * @return list */ protected function extractSudyNewsLinksFromChunk( string $chunk, ?string $department, array $keywords, string $sourceUrl, ?string $pageUniversity, array &$seen, ): array { $items = []; if (! preg_match_all( '#<(?:div|span)\s+class="news_title1?">\s*]*?)>([^<]+)#su', $chunk, $matches, PREG_SET_ORDER, )) { return []; } foreach ($matches as $match) { $attrs = (string) $match[1]; $name = CrawlAuthorParser::cleanText($match[2]) ?? ''; if ($name === '' || ! $this->looksLikePersonName($name)) { continue; } if (! preg_match('#\bhref=[\'"]([^\'"]+)[\'"]#u', $attrs, $hrefMatch)) { continue; } $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'); if (! $this->looksLikeTeacherProfileUrl($href, null)) { continue; } $profileUrl = $this->resolveUrl($href, $sourceUrl); $dedupeKey = $profileUrl ?: ('name:'.md5($name)); if (isset($seen[$dedupeKey])) { continue; } $plain = trim($name.' '.($department ?? '')); if (! $this->matchesKeywords($plain, $keywords)) { continue; } $seen[$dedupeKey] = true; $items[] = $this->makeFacultyItem( externalKey: 'faculty:'.md5($dedupeKey), name: $name, profileUrl: $profileUrl, email: null, affiliation: $department, universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($department), summary: $department ? '单位：'.$department : null, keywords: $keywords, academicTitle: null, platform: 'faculty_html_sudy_news', bio: null, ); } return $items; } /** * 南大机器人学院等博山 CMS：ul.teacher 卡片（div.xm 姓名）。 * * @param list $keywords * @return list */ protected function extractFromRaTeacherList(string $html, array $keywords, string $sourceUrl): array { if (! preg_match('/

]*?)>.*?

([^<]+)

(.*?)#su', $html, $matches, PREG_SET_ORDER, )) { return []; } $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html); $defaultCollege = $this->inferCollegeFromPageTitle($html); $items = []; $seen = []; foreach ($matches as $match) { $attrs = (string) $match[1]; $name = CrawlAuthorParser::cleanText($match[2]) ?? ''; $tail = (string) $match[3]; if ($name === '' || ! $this->looksLikePersonName($name)) { continue; } if (! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) { continue; } $profileUrl = $this->resolveUrl(html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'), $sourceUrl); $dedupeKey = $profileUrl ?: ('name:'.md5($name)); if (isset($seen[$dedupeKey])) { continue; } $academicTitle = null; if (preg_match('#职称：\s*([^<]+)#u', $tail, $titleMatch)) { $academicTitle = CrawlAuthorParser::cleanText($titleMatch[1]); } $researchField = null; if (preg_match('#研究方向：\s*([^<]+)#u', $tail, $fieldMatch)) { $researchField = CrawlAuthorParser::cleanText($fieldMatch[1]); } $plain = trim($name.' '.($researchField ?? '').' '.($academicTitle ?? '').' '.($defaultCollege ?? '')); if (! $this->matchesKeywords($plain, $keywords)) { continue; } $summaryParts = array_filter([ $defaultCollege ? '单位：'.$defaultCollege : null, $academicTitle ? '职称：'.$academicTitle : null, $researchField ? '研究方向：'.$researchField : null, ]); $seen[$dedupeKey] = true; $items[] = $this->makeFacultyItem( externalKey: 'faculty:'.md5($dedupeKey), name: $name, profileUrl: $profileUrl, email: null, affiliation: $defaultCollege, universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege), summary: $summaryParts !== [] ? 
implode('；', $summaryParts) : null, keywords: $keywords, academicTitle: $academicTitle, platform: 'faculty_html_ra', bio: $researchField, ); } return $items; } /** * 南大/清华 WebPlus(VSB) 师资表格页（ise zjzjs 等）。 * * @param list $keywords * @return list */ protected function extractFromVsbFacultyTable(string $html, array $keywords, string $sourceUrl): array { $scope = null; if (preg_match('#

(.*?)

#su', $html, $match)) { $scope = (string) $match[1]; } elseif (preg_match('#

(.*?)

\s*\s*#su', $html, $match)) { $scope = (string) $match[1]; } elseif (preg_match('#

(.*?)#su', $html, $match) && trim(strip_tags($match[1])) !== '') { $scope = (string) $match[1]; } if ($scope === null || trim(strip_tags($scope)) === '') { return []; } $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html); $defaultCollege = $this->inferCollegeFromPageTitle($html); $items = []; $seen = []; $sectionTitles = []; if (preg_match_all('#]*>(.*?)#su', $scope, $sectionMatches, PREG_OFFSET_CAPTURE)) { foreach ($sectionMatches[1] as $sectionMatch) { $title = CrawlAuthorParser::cleanText(strip_tags($sectionMatch[0])); if ($title !== null && $title !== '') { $sectionTitles[] = [ 'offset' => $sectionMatch[1], 'title' => $title, ]; } } } $resolveSectionTitle = function (int $offset) use ($sectionTitles): ?string { $title = null; foreach ($sectionTitles as $section) { if ($section['offset'] <= $offset) { $title = $section['title']; } else { break; } } return $title; }; $addItem = function ( string $name, ?string $profileUrl, ?string $sectionTitle, ) use ( $keywords, $defaultCollege, $pageUniversity, &$items, &$seen, ): void { if ($name === '' || ! $this->looksLikePersonName($name)) { return; } $dedupeKey = $profileUrl ?: ('name:'.md5($name)); if (isset($seen[$dedupeKey])) { return; } $academicTitle = $this->inferAcademicTitleFromSection($sectionTitle); $plain = trim($name.' '.($academicTitle ?? '').' '.($defaultCollege ?? '')); if (! $this->matchesKeywords($plain, $keywords)) { return; } $seen[$dedupeKey] = true; $items[] = $this->makeFacultyItem( externalKey: 'faculty:'.md5($dedupeKey), name: $name, profileUrl: $profileUrl, email: null, affiliation: $defaultCollege, universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege), summary: $defaultCollege ? 
'单位：'.$defaultCollege : null, keywords: $keywords, academicTitle: $academicTitle, platform: 'faculty_html_vsb', bio: null, ); }; if (preg_match_all('#]*?)>(.*?)#su', $scope, $linkMatches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) { foreach ($linkMatches as $linkMatch) { $attrs = (string) $linkMatch[1][0]; $offset = (int) $linkMatch[0][1]; $name = CrawlAuthorParser::cleanText(strip_tags($linkMatch[2][0])) ?? ''; if (! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) { continue; } $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'); if (! $this->looksLikeTeacherProfileUrl($href, null)) { continue; } $addItem($name, $this->resolveUrl($href, $sourceUrl), $resolveSectionTitle($offset)); } } if (preg_match_all('#]*>(.*?)#su', $scope, $cellMatches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) { foreach ($cellMatches as $cellMatch) { $cellHtml = (string) $cellMatch[1][0]; $offset = (int) $cellMatch[0][1]; if (str_contains($cellHtml, ']*class="[^"]*\bfaculty\b/u', $html); } /** * @param list $keywords * @return list */ protected function fetchNjuTeacherHomeItems( string $requestUrl, string $pageHtml, array $keywords, int $maxResults, int $maxPages = 1, ): array { $siteId = $this->parseNjuSiteId($pageHtml); $filters = $this->parseNjuTeacherHomeFilters($pageHtml); $conditions = $this->buildNjuTeacherHomeConditions($filters['career'], $filters['sub_career']); $origin = $this->requestOrigin($requestUrl) ?? 
'https://is.nju.edu.cn'; $apiUrl = $origin.'/_wp3services/generalQuery?queryObj=teacherHome'; $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20)); $maxPages = max(1, min(50, $maxPages)); $rows = 50; $pageUniversity = $this->inferUniversityFromSource($requestUrl, $pageHtml); $defaultCollege = $this->inferCollegeFromPageTitle($pageHtml); $items = []; $seen = []; $pageIndex = 1; $pageCount = null; while ($pageIndex <= $maxPages && count($items) < $maxResults) { $body = $this->requestNjuTeacherHomePage($apiUrl, $siteId, $pageIndex, $rows, $conditions, $timeout); if ($pageCount === null) { $pageCount = max(1, (int) ($body['pageCount'] ?? 1)); } $data = $body['data'] ?? []; if (! is_array($data) || $data === []) { break; } foreach ($data as $art) { if (! is_array($art)) { continue; } $name = CrawlAuthorParser::cleanText((string) ($art['title'] ?? '')) ?? ''; if ($name === '' || ! $this->looksLikePersonName($name)) { continue; } $profileUrl = $this->resolveUrl((string) ($art['cnUrl'] ?? ''), $requestUrl); $dedupeKey = $profileUrl ?: ('name:'.md5($name)); if (isset($seen[$dedupeKey])) { continue; } $academicTitle = CrawlAuthorParser::cleanText((string) ($art['exField2'] ?? '')); $researchField = CrawlAuthorParser::cleanText((string) ($art['exField1'] ?? '')); $plain = trim($name.' '.($researchField ?? '').' '.($academicTitle ?? '').' '.($defaultCollege ?? '')); if (! $this->matchesKeywords($plain, $keywords)) { continue; } $summaryParts = array_filter([ $defaultCollege ? '单位：'.$defaultCollege : null, $academicTitle ? '职称：'.$academicTitle : null, $researchField ? '研究领域：'.$researchField : null, ]); $seen[$dedupeKey] = true; $items[] = $this->makeFacultyItem( externalKey: 'faculty:'.md5($dedupeKey), name: $name, profileUrl: $profileUrl, email: null, affiliation: $defaultCollege, universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege), summary: $summaryParts !== [] ? 
implode('；', $summaryParts) : null,
                    keywords: $keywords,
                    academicTitle: $academicTitle,
                    platform: 'faculty_html_nju_wp',
                    bio: $researchField,
                );

                if (count($items) >= $maxResults) {
                    break 2;
                }
            }

            if ($pageIndex >= $pageCount) {
                break;
            }

            $pageIndex++;
        }

        return $items;
    }

    /**
     * Reads the numeric site ID from the `sudy-wp-siteId="…"` attribute in the page HTML.
     *
     * @throws \RuntimeException when the attribute is not present in $html
     */
    protected function parseNjuSiteId(string $html): int
    {
        if (! preg_match('/sudy-wp-siteId="(\d+)"/', $html, $found)) {
            throw new \RuntimeException('无法解析教师列表站点 ID（siteId）');
        }

        return (int) $found[1];
    }

    /**
     * Detects which career / sub-career filter is marked "selected" in the list page markup.
     *
     * For each filter the candidate patterns are tried in order; the first one
     * that matches decides the value (even when cleaning leaves it null), and
     * later patterns are not consulted.
     *
     * @return array{career:?string,sub_career:?string}
     */
    protected function parseNjuTeacherHomeFilters(string $html): array
    {
        $firstCapture = static function (array $patterns) use ($html): ?string {
            foreach ($patterns as $pattern) {
                if (preg_match($pattern, $html, $hit)) {
                    return CrawlAuthorParser::cleanText($hit[1]);
                }
            }

            return null;
        };

        return [
            'career' => $firstCapture([
                '#class="col_item_link\s+selected"[^>]*title="([^"]+)"#u',
                '#class="col_item_link\s+selected"[^>]*>.*?class="column-name">([^<]+)#su',
            ]),
            'sub_career' => $firstCapture([
                '#class="sub-item[^"]*\sselected"[^>]*>.*?class="column-name">([^<]+)#su',
                '#class="sub-link[^"]*\sselected"[^>]*title="([^"]+)"#u',
            ]),
        ];
    }

    /**
     * Builds the `conditions` filter list sent to the teacherHome query endpoint.
     *
     * The list always starts with `published = 1`. A recognised sub-career adds a
     * one-entry `orConditions` group on `exField2`. A career then adds either a
     * plain `exField2` equality (for the four stand-alone careers) or, for the
     * two umbrella careers and only when no sub-career was given, an
     * `orConditions` group listing every title under that umbrella.
     *
     * @return list<array<string, mixed>>
     */
    protected function buildNjuTeacherHomeConditions(?string $career, ?string $subCareer): array
    {
        // Key order (field, value, judge) is kept stable because the list is JSON-encoded by the caller.
        $titleEquals = static fn (string $value): array => [
            'field' => 'exField2',
            'value' => $value,
            'judge' => '=',
        ];

        $conditions = [
            ['field' => 'published', 'value' => '1', 'judge' => '='],
        ];

        $knownSubCareers = ['长聘副教授', '准聘副教授', '准聘助理教授', '专职科研', '博士后'];
        if (in_array($subCareer, $knownSubCareers, true)) {
            $conditions[] = ['orConditions' => [$titleEquals($subCareer)]];
        }

        if ($career === null || $career === '') {
            return $conditions;
        }

        $standaloneCareers = ['教授', '副教授', '兼职教授', '行政管理人员'];
        if (in_array($career, $standaloneCareers, true)) {
            $conditions[] = $titleEquals($career);

            return $conditions;
        }

        // Umbrella careers expand to all of their titles, but only when no sub-career narrowed the query.
        $umbrellaCareers = [
            '准长聘' => ['长聘副教授', '准聘副教授', '准聘助理教授'],
            '专职科研及博士后' => ['专职科研', '博士后'],
        ];
        $hasSubCareer = $subCareer !== null && $subCareer !== '';
        if (! $hasSubCareer && isset($umbrellaCareers[$career])) {
            $conditions[] = ['orConditions' => array_map($titleEquals, $umbrellaCareers[$career])];
        }

        return $conditions;
    }

    /**
     * POSTs one page request to the teacherHome query endpoint and returns the decoded JSON body.
     *
     * @param list<array<string, mixed>> $conditions
     * @return array<array-key, mixed>
     *
     * @throws \RuntimeException when the response is not successful or the body is not a JSON array/object
     */
    protected function requestNjuTeacherHomePage(
        string $apiUrl,
        int $siteId,
        int $pageIndex,
        int $rows,
        array $conditions,
        int $timeout,
    ): array {
        $returnInfos = [
            ['field' => 'headerPic', 'name' => 'headerPic'],
            ['field' => 'exField1', 'name' => 'exField1'],
            ['field' => 'exField2', 'name' => 'exField2'],
            ['field' => 'cnUrl', 'name' => 'cnUrl'],
            ['field' => 'title', 'name' => 'title'],
            ['field' => 'phone', 'name' => 'phone'],
        ];

        $response = Http::timeout($timeout)
            ->connectTimeout(min(8, $timeout))
            ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
            ->withHeaders([
                'User-Agent' => 'SlakeSchool-Crawler/1.0',
                'Accept' =>
'application/json',
            ])
            ->asForm()
            ->post($apiUrl, [
                'siteId' => $siteId,
                'pageIndex' => $pageIndex,
                'rows' => $rows,
                'orders' => json_encode([['field' => 'siteSort', 'type' => 'asc']], JSON_UNESCAPED_UNICODE),
                'returnInfos' => json_encode($returnInfos, JSON_UNESCAPED_UNICODE),
                'conditions' => json_encode($conditions, JSON_UNESCAPED_UNICODE),
                'articleType' => 1,
                'level' => 1,
            ]);

        if (! $response->successful()) {
            throw new \RuntimeException('教师列表接口请求失败（HTTP '.$response->status().'）');
        }

        $body = $response->json();
        if (! is_array($body)) {
            throw new \RuntimeException('教师列表接口返回格式异常');
        }

        return $body;
    }

    /**
     * Collects faculty items from an AJAX-backed teacher list, paging until a stop condition is hit.
     *
     * The endpoint configuration is parsed from $pageHtml; the keywords are joined
     * with spaces and sent as the search term. Each response's `content` HTML is
     * appended to $pageHtml before extraction, and items are de-duplicated by
     * their externalId across pages.
     *
     * @param list<string> $keywords
     * @param int $maxPages clamped to the range 1..50
     * @return list<CrawlItemDto>
     *
     * @throws \RuntimeException propagated from config parsing or the page request
     */
    protected function fetchAjaxTeacherItems(
        string $requestUrl,
        string $pageHtml,
        array $keywords,
        int $maxResults,
        int $maxPages = 1,
    ): array {
        $config = $this->parseAjaxTeacherConfig($pageHtml, $requestUrl);
        $search = implode(' ', $keywords);
        // Configured timeout, never below 5 seconds.
        $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
        $maxPages = max(1, min(50, $maxPages));

        $items = [];
        $seen = [];
        $page = 1;
        $totalCount = null;

        while ($page <= $maxPages && count($items) < $maxResults) {
            $body = $this->requestAjaxTeacherPage($config, $page, $search, $timeout);

            // Remember the total from the first response that reports one.
            if ($totalCount === null && isset($body['count'])) {
                $totalCount = max(0, (int) $body['count']);
            }

            $content = (string) ($body['content'] ?? '');
            if ($content === '') {
                break;
            }

            $before = count($items);
            foreach ($this->extractFromAjaxTeacherContent(
                $pageHtml.$content,
                $keywords,
                $requestUrl,
                $config['cat_code'],
            ) as $item) {
                if (isset($seen[$item->externalId])) {
                    continue;
                }
                $seen[$item->externalId] = true;
                $items[] = $item;
                if (count($items) >= $maxResults) {
                    break 2;
                }
            }

            // The "standard" variant is only ever requested once.
            // NOTE(review): this holds even when uses_page is true for that variant — confirm it is intended.
            if ($config['variant'] === 'standard') {
                break;
            }
            // A page that contributed nothing new ends the crawl.
            if (count($items) === $before) {
                break;
            }
            // Stop once the reported total (capped by $maxResults) has been collected.
            if ($totalCount !== null && count($items) >= min($totalCount, $maxResults)) {
                break;
            }
            $page++;
        }

        return $items;
    }

    /**
     * POSTs one form request to the AJAX teacher-list endpoint and returns the decoded JSON body.
     *
     * The "simple" variant always sends `page`; any other variant sends `cat_id`
     * and only adds `page` when `uses_page` is set. An empty search term requests
     * the `All` letter filter (`zm`).
     *
     * @param array{variant:string,cat_id:?string,cat_code:string,api_url:string,uses_page:bool} $config
     * @return array<array-key, mixed>
     *
     * @throws \RuntimeException when the response is not successful or the body is not a JSON array/object
     */
    protected function requestAjaxTeacherPage(array $config, int $page, string $search, int $timeout): array
    {
        if ($config['variant'] === 'simple') {
            $payload = [
                'page' => (string) $page,
                'cat_code' => $config['cat_code'],
                'yjszxfl' => '全部',
                'name' => $search,
                'zm' => $search === '' ? 'All' : '',
            ];
        } else {
            $payload = [
                'cat_id' => $config['cat_id'],
                'cat_code' => $config['cat_code'],
                'type' => $search !== '' ? '2' : '1',
                'zm' => $search === '' ? 'All' : '',
                'zc' => '',
                'search' => $search,
            ];
            if ($config['uses_page']) {
                $payload['page'] = (string) $page;
            }
        }

        $response = Http::timeout($timeout)
            ->connectTimeout(min(8, $timeout))
            ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
            ->withHeaders([
                'User-Agent' => 'SlakeSchool-Crawler/1.0',
                'Accept' => 'application/json, text/html',
            ])
            ->asForm()
            ->post($config['api_url'], $payload);

        if (! $response->successful()) {
            throw new \RuntimeException('教师列表接口请求失败（HTTP '.$response->status().'）');
        }

        $body = $response->json();
        if (! is_array($body)) {
            throw new \RuntimeException('教师列表接口返回格式异常');
        }

        return $body;
    }

    /**
     * Extracts the AJAX teacher-list endpoint settings from the page HTML.
     *
     * `cat_id`, `cat_code` and the endpoint `url` are read from `key: '…'` pairs
     * found anywhere in $html. The endpoint defaults to
     * `<origin>/active/ajax_teacher_list.html` when no `url` entry is found.
     * The variant is "standard" when a `cat_id` was found and "simple" otherwise;
     * the simple variant always pages.
     *
     * @return array{variant:string,cat_id:?string,cat_code:string,api_url:string,uses_page:bool}
     *
     * @throws \RuntimeException when `cat_code` or the endpoint URL cannot be determined
     */
    protected function parseAjaxTeacherConfig(string $html, string $sourceUrl): array
    {
        $catId = null;
        $catCode = null;
        // The literal text "page:page" in the page is taken as the sign that the endpoint accepts a page parameter.
        $usesPage = str_contains($html, 'page:page');
        $origin = $this->requestOrigin($sourceUrl);
        $apiUrl = $origin !== null ? $origin.'/active/ajax_teacher_list.html' : '';

        if (preg_match("/cat_id\s*:\s*'(\d+)'/i", $html, $match)) {
            $catId = $match[1];
        }
        if (preg_match("/cat_code\s*:\s*'([^']+)'/i", $html, $match)) {
            $catCode = $match[1];
        }
        if (preg_match("#url\s*:\s*'([^']*ajax_teacher_list[^']*)'#i", $html, $match)) {
            $apiUrl = $this->resolveUrl(html_entity_decode($match[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'), $sourceUrl) ?? $apiUrl;
        }
        // resolveUrl() can hand back a root-relative path unchanged; anchor it to the origin.
        if ($origin !== null && str_starts_with($apiUrl, '/')) {
            $apiUrl = $origin.$apiUrl;
        }

        if ($catCode === null || $apiUrl === '') {
            throw new \RuntimeException('无法解析教师列表接口参数（cat_code）');
        }

        $variant = $catId !== null ?
'standard' : 'simple';
        // The simple variant always sends a page number.
        if ($variant === 'simple') {
            $usesPage = true;
        }

        return [
            'variant' => $variant,
            'cat_id' => $catId,
            'cat_code' => $catCode,
            'api_url' => $apiUrl,
            'uses_page' => $usesPage,
        ];
    }

    /**
     * Extracts faculty items from AJAX list HTML, trying three strategies in order:
     * card markup, per-section link blocks, then every link in the whole document.
     * The first strategy that yields at least one item wins.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromAjaxTeacherContent(
        string $html,
        array $keywords,
        string $sourceUrl,
        ?string $catCode = null,
    ): array {
        $items = [];
        $seen = [];
        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
        $defaultCollege = $this->inferCollegeFromPageTitle($html);

        $cardItems = $this->extractFromAjaxTeacherCards(
            $html,
            $keywords,
            $sourceUrl,
            $pageUniversity,
            $defaultCollege,
            $catCode,
        );
        if ($cardItems !== []) {
            return $cardItems;
        }

        // NOTE(review): the split pattern below is an empty regex and the section-title
        // pattern has no literal anchor; both look like they lost markup text that used
        // to be inside the literal. Confirm against version control.
        $parts = preg_split('##u', $html) ?: [];
        if (count($parts) > 1) {
            // Drop whatever precedes the first delimiter.
            array_shift($parts);
            foreach ($parts as $block) {
                $department = $defaultCollege;
                if (preg_match('#.*?([^<]+)#su', $block, $deptMatch)) {
                    $sectionTitle = CrawlAuthorParser::cleanText($deptMatch[1]);
                    // A heading that itself looks like a person's name is not used as a department.
                    if ($sectionTitle !== null && $sectionTitle !== '' && ! $this->looksLikePersonName($sectionTitle)) {
                        $department = $sectionTitle;
                    }
                }

                foreach ($this->extractTeacherLinksFromHtmlBlock(
                    $block,
                    $keywords,
                    $sourceUrl,
                    $pageUniversity,
                    $department,
                    $catCode,
                ) as $item) {
                    if (isset($seen[$item->externalId])) {
                        continue;
                    }
                    $seen[$item->externalId] = true;
                    $items[] = $item;
                }
            }

            if ($items !== []) {
                return $items;
            }
        }

        return $this->extractTeacherLinksFromHtmlBlock(
            $html,
            $keywords,
            $sourceUrl,
            $pageUniversity,
            $defaultCollege,
            $catCode,
        );
    }

    /**
     * Card-style teacher list returned via AJAX by sites such as ICISEE
     * (name inside div.name, academic title inside a span).
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromAjaxTeacherCards(
        string $html,
        array $keywords,
        string $sourceUrl,
        ?string $pageUniversity,
        ?string $affiliation,
        ?string $catCode,
    ): array {
        // NOTE(review): this pattern has a closing ")" with no opening "(" although the code
        // below reads capture groups 1 (link attributes) and 2 (name block); the literal
        // appears to have lost its markup text. Confirm against version control.
        if (! preg_match_all(
            '#]*?)>\s*(?:.*?\s*)?(.*?)#su',
            $html,
            $matches,
            PREG_SET_ORDER,
        )) {
            return [];
        }

        $items = [];
        $seen = [];

        foreach ($matches as $match) {
            $attrs = (string) $match[1];
            $nameBlock = (string) $match[2];

            // The name is the text that precedes the first tag inside the name block.
            if (! preg_match('/^([^<]+)/u', $nameBlock, $nameMatch)) {
                continue;
            }

            $name = CrawlAuthorParser::cleanText(trim($nameMatch[1])) ?? '';
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }

            if (! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) {
                continue;
            }

            $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            if (! $this->looksLikeTeacherProfileUrl($href, $catCode)) {
                continue;
            }

            $profileUrl = $this->resolveUrl($href, $sourceUrl);
            // Fall back to a name-based key when no profile URL could be resolved.
            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
            if (isset($seen[$dedupeKey])) {
                continue;
            }

            $plain = trim($name.' '.($affiliation ?? ''));
            if (! $this->matchesKeywords($plain, $keywords)) {
                continue;
            }

            $academicTitle = null;
            // NOTE(review): per the method description the title sits inside a span, but this
            // pattern has no tag text left and matches any run of non-"<" characters — confirm.
            if (preg_match('#([^<]+)#u', $nameBlock, $titleMatch)) {
                $academicTitle = CrawlAuthorParser::cleanText($titleMatch[1]);
            }

            $seen[$dedupeKey] = true;
            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($dedupeKey),
                name: $name,
                profileUrl: $profileUrl,
                email: null,
                affiliation: $affiliation,
                universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation),
                summary: $affiliation ? '单位：'.$affiliation : null,
                keywords: $keywords,
                academicTitle: $academicTitle,
                platform: 'faculty_html_ajax',
                bio: null,
            );
        }

        return $items;
    }

    /**
     * Extracts faculty items from the links in an HTML fragment: the link text must look
     * like a person's name and the href must look like a teacher profile URL.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractTeacherLinksFromHtmlBlock(
        string $html,
        array $keywords,
        string $sourceUrl,
        ?string $pageUniversity,
        ?string $affiliation,
        ?string $catCode,
    ): array {
        $items = [];
        $seen = [];

        // NOTE(review): unbalanced ")" in this pattern while groups 1 (attributes) and 2
        // (link content) are read below; the literal appears to have lost its markup
        // text. Confirm against version control.
        if (! preg_match_all('#]*?)>(.*?)#su', $html, $matches, PREG_SET_ORDER)) {
            return [];
        }

        foreach ($matches as $match) {
            $attrs = (string) $match[1];
            // Strip nested tags and all whitespace from the link content before cleaning.
            $rawName = preg_replace('/\s+/u', '', strip_tags($match[2])) ?? '';
            $name = CrawlAuthorParser::cleanText($rawName) ?? '';
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }

            if (! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) {
                continue;
            }

            $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            if (! $this->looksLikeTeacherProfileUrl($href, $catCode)) {
                continue;
            }

            $profileUrl = $this->resolveUrl($href, $sourceUrl);
            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
            if (isset($seen[$dedupeKey])) {
                continue;
            }

            $plain = trim($name.' '.($affiliation ?? ''));
            if (! $this->matchesKeywords($plain, $keywords)) {
                continue;
            }

            $seen[$dedupeKey] = true;
            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($dedupeKey),
                name: $name,
                profileUrl: $profileUrl,
                email: null,
                affiliation: $affiliation,
                universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation),
                summary: $affiliation ? '单位：'.$affiliation : null,
                keywords: $keywords,
                academicTitle: null,
                platform: 'faculty_html_ajax',
                bio: null,
            );
        }

        return $items;
    }

    /**
     * Tells whether the path of $href matches one of the known teacher-profile URL shapes.
     *
     * The path is lower-cased before matching. When $catCode is given, a path of
     * the form `/<cat_code>/<file>.html` is accepted as well.
     */
    protected function looksLikeTeacherProfileUrl(string $href, ?string $catCode): bool
    {
        $path = strtolower((string) parse_url($href, PHP_URL_PATH));
        if ($path === '') {
            return false;
        }

        if (preg_match('#/(faculty|jiaoshiml|people/detail_new)/[^/]+\.html$#', $path)) {
            return true;
        }

        if (preg_match('#/c\d+a\d+/page\.htm$#', $path)) {
            return true;
        }

        if (preg_match('#/(?:szll|zjzjs)/[^/]+\.(?:htm|html)$#', $path)) {
            return true;
        }

        // Same directories as above, but for relative hrefs without a leading slash.
        if (preg_match('#^(?:szll|zjzjs)/[^/]+\.(?:htm|html)$#', $path)) {
            return true;
        }

        if (preg_match('#/info/\d+/\d+\.htm$#', $path)) {
            return true;
        }

        if ($catCode !== null && $catCode !== '') {
            $code = preg_quote(strtolower($catCode), '#');

            return (bool) preg_match('#/'.$code.'/[^/]+\.html$#', $path);
        }

        return false;
    }

    /**
     * Returns `scheme://host[:port]` for $sourceUrl, or null when scheme or host is missing.
     */
    protected function requestOrigin(string $sourceUrl): ?string
    {
        $parts = parse_url($sourceUrl);
        if (!
is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        $origin = $parts['scheme'].'://'.$parts['host'];
        if (! empty($parts['port'])) {
            $origin .= ':'.$parts['port'];
        }

        return $origin;
    }

    /**
     * Builds faculty items from e-mail addresses found in the page, using the text
     * around each address to guess the person's name and affiliation.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromEmailBlocks(string $html, array $keywords, string $sourceUrl): array
    {
        $items = [];
        $seen = [];

        if (! preg_match_all(
            '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
            $html,
            $emailMatches,
            PREG_OFFSET_CAPTURE,
        )) {
            return [];
        }

        foreach ($emailMatches[1] as $match) {
            $email = CrawlAuthorParser::normalizeEmail($match[0]);
            if (! $email || isset($seen[$email])) {
                continue;
            }

            $pos = (int) $match[1];
            // PREG_OFFSET_CAPTURE reports byte offsets. Cutting the window with substr() can
            // split a multi-byte UTF-8 character, after which the /u regex in htmlToPlain()
            // fails and the whole context collapses to ''. mb_strcut() takes the same byte
            // window but snaps both ends to character boundaries.
            $window = mb_strcut($html, max(0, $pos - 400), 800, 'UTF-8');
            $plain = $this->htmlToPlain($window);
            if (! $this->matchesKeywords($plain, $keywords)) {
                continue;
            }

            $name = $this->guessName($plain, $email);
            if ($name === '') {
                continue;
            }

            $affiliation = $this->guessAffiliation($plain);
            $seen[$email] = true;

            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($email),
                name: $name,
                profileUrl: $sourceUrl,
                email: $email,
                affiliation: $affiliation,
                universityName: CrawlAuthorParser::universityFromAffiliation($affiliation)
                    ?? $this->inferUniversityFromSource($sourceUrl, $html),
                summary: Str::limit($plain, 300),
                keywords: $keywords,
                academicTitle: null,
                platform: 'faculty_html',
                bio: null,
            );
        }

        return $items;
    }

    /**
     * SJTU School of Materials Science and similar sites: panel-item sections that
     * contain a.staff-item links (/people/detail_new/{id}).
     *
     * Links are first collected per section so each person gets the section title
     * as department; when that yields nothing, all staff links in the document are
     * used with the college inferred from the page.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromStaffPanelList(string $html, array $keywords, string $sourceUrl): array
    {
        $items = [];
        $seen = [];
        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
        $defaultCollege = $this->inferCollegeFromPageTitle($html);

        // NOTE(review): the split pattern is an empty regex and the title pattern below has no
        // literal anchor; both look like they lost markup text that used to be inside the
        // literal. Left untouched here — confirm against version control.
        $panelChunks = preg_split('##u', $html) ?: [];
        if (count($panelChunks) > 1) {
            // Drop whatever precedes the first panel delimiter.
            array_shift($panelChunks);
            foreach ($panelChunks as $chunk) {
                if (! preg_match('#\s*([^<]+?)\s*#u', $chunk, $titleMatch)) {
                    continue;
                }

                $department = CrawlAuthorParser::cleanText($titleMatch[1]);
                foreach ($this->extractStaffItemLinks($chunk) as $link) {
                    $item = $this->makeStaffPanelItem(
                        $link,
                        $department ?: $defaultCollege,
                        $pageUniversity,
                        $keywords,
                        $sourceUrl,
                    );
                    if ($item === null || isset($seen[$item->externalId])) {
                        continue;
                    }
                    $seen[$item->externalId] = true;
                    $items[] = $item;
                }
            }
        }

        if ($items !== []) {
            return $items;
        }

        foreach ($this->extractStaffItemLinks($html) as $link) {
            $item = $this->makeStaffPanelItem(
                $link,
                $defaultCollege,
                $pageUniversity,
                $keywords,
                $sourceUrl,
            );
            if ($item === null || isset($seen[$item->externalId])) {
                continue;
            }
            $seen[$item->externalId] = true;
            $items[] = $item;
        }

        return $items;
    }

    /**
     * Collects (href, name) pairs from links whose attributes mention `staff-item`
     * and whose text looks like a person's name. Pairs are unique per href + name.
     *
     * @return list<array{href:string,name:string}>
     */
    protected function extractStaffItemLinks(string $html): array
    {
        $links = [];
        $seen = [];

        // NOTE(review): unbalanced ")" in this pattern while groups 1 (attributes) and 2
        // (link text) are read below; the literal appears to have lost its markup text.
        // Left untouched here — confirm against version control.
        if (! preg_match_all('#]*?)>([^<]+)#su', $html, $matches, PREG_SET_ORDER)) {
            return [];
        }

        foreach ($matches as $match) {
            $attrs = (string) $match[1];
            if (! str_contains($attrs, 'staff-item')) {
                continue;
            }

            if (! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) {
                continue;
            }

            $name = CrawlAuthorParser::cleanText($match[2]) ?? '';
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }

            $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            $key = $href.'|'.$name;
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;
            $links[] = ['href' => $href, 'name' => $name];
        }

        return $links;
    }

    /**
     * Turns one staff link into a faculty item, or null when name + department do not
     * match the keywords.
     *
     * @param array{href:string,name:string} $link
     * @param list<string> $keywords
     */
    protected function makeStaffPanelItem(
        array $link,
        ?string $department,
        ?string $pageUniversity,
        array $keywords,
        string $sourceUrl,
    ): ?CrawlItemDto {
        $name = $link['name'];
        $profileUrl = $this->resolveUrl($link['href'], $sourceUrl);
        $plain = trim($name.' '.($department ?? ''));
        if (! $this->matchesKeywords($plain, $keywords)) {
            return null;
        }

        // Fall back to a name-based key when no profile URL could be resolved.
        $dedupeKey = $profileUrl ?: ('name:'.md5($name));
        $affiliation = $department;
        $universityName = $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation);

        return $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($dedupeKey),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $affiliation,
            universityName: $universityName,
            summary: $department ? '单位：'.$department : null,
            keywords: $keywords,
            academicTitle: null,
            platform: 'faculty_html_smse',
            bio: null,
        );
    }

    /**
     * Guesses the college / department name from page metadata.
     *
     * Sources are tried in order: the part of the title after a dash, the
     * `description` meta tag, then the `SiteName` meta tag. Within each text a
     * name ending in 学院 / 研究院 / 学部 / 系 is looked for; the title falls back
     * to its whole post-dash part when no such name is found.
     */
    protected function inferCollegeFromPageTitle(string $html): ?string
    {
        // NOTE(review): this pattern ends at the closing title tag but has no opening tag
        // text; the start of the literal may have been lost. Left untouched — confirm.
        if (preg_match('/\s*[^<\-\–—]+[\-–—]\s*([^<]+?)\s*<\/title>/u', $html, $match)) {
            $title = CrawlAuthorParser::cleanText($match[1]);
            if ($title !== null && $title !== '') {
                if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $title, $college)) {
                    return CrawlAuthorParser::cleanText($college[1]);
                }

                return $title;
            }
        }

        if (preg_match('/<meta\s+name="description"\s+content="([^"]+)"/u', $html, $match)) {
            $desc = CrawlAuthorParser::cleanText(html_entity_decode($match[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
            if ($desc !== null && $desc !== '') {
                // Prefer the name that directly follows "大学" so the university prefix is not captured.
                if (preg_match('/大学([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $desc, $college)) {
                    return CrawlAuthorParser::cleanText($college[1]);
                }
                if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $desc, $college)) {
                    return CrawlAuthorParser::cleanText($college[1]);
                }
            }
        }

        if (preg_match('/<meta\s+name=[\'"]SiteName[\'"]\s+content=[\'"]([^\'"]+)[\'"]/u', $html, $match)) {
            $siteName = CrawlAuthorParser::cleanText(html_entity_decode($match[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
            if ($siteName !== null && $siteName !== '') {
                if (preg_match('/大学([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $siteName, $college)) {
                    return CrawlAuthorParser::cleanText($college[1]);
                }
            }
        }

        return null;
    }

    /**
     * Fills a missing academic title and college on the item's lead author from a
     * profile page. The title comes from the first `<em>` element, the college from
     * the labelled field "所属二级机构". Returns the same item when nothing was added,
     * otherwise a new DTO carrying the updated metadata.
     */
    protected function applyProfileMetadataToItem(CrawlItemDto $item, string $html): CrawlItemDto
    {
        $lead =
is_array($item->extra['lead_author'] ?? null) ? $item->extra['lead_author'] : [];
        $changed = false;

        if (empty($lead['academic_title']) && preg_match('/<em>\s*([^<]+?)\s*<\/em>/u', $html, $titleMatch)) {
            $title = CrawlAuthorParser::cleanText($titleMatch[1]);
            if ($title !== null && $title !== '') {
                $lead['academic_title'] = $title;
                $changed = true;
            }
        }

        if (empty($lead['college']) && empty($lead['affiliation'])) {
            $dept = $this->parseLabeledField($html, '所属二级机构');
            if ($dept !== null && $dept !== '') {
                $lead['affiliation'] = $dept;
                $lead['college'] = $dept;
                $changed = true;
            }
        }

        if (! $changed) {
            return $item;
        }

        $extra = $item->extra;
        $extra['lead_author'] = $lead;
        if (! empty($lead['academic_title'])) {
            $extra['academic_title'] = $lead['academic_title'];
        }
        if (! empty($lead['college'])) {
            $extra['college_name'] = $lead['college'];
        }

        $authorsParsed = $item->authorsParsed;
        if ($authorsParsed !== []) {
            if (! empty($lead['academic_title'])) {
                $authorsParsed[0]['academic_title'] = $lead['academic_title'];
            }
            if (! empty($lead['college'])) {
                $authorsParsed[0]['affiliation'] = $lead['college'];
            }
        }

        return new CrawlItemDto(
            externalId: $item->externalId,
            title: $item->title,
            canonicalUrl: $item->canonicalUrl,
            authors: $item->authors,
            summary: $item->summary,
            publishedAt: $item->publishedAt,
            schoolName: $item->schoolName,
            section: $item->section,
            contentHtml: $item->contentHtml,
            extra: $extra,
            authorsParsed: $authorsParsed,
        );
    }

    /**
     * Extracts faculty items from a structured list page: a `div.title` heading for the
     * college and `<li>` entries that each hold a `div.name` plus labelled fields
     * (所在单位 / 职称 / 简介).
     *
     * Entries are filtered by keywords against their plain text and de-duplicated
     * by profile URL, or by name when no URL can be determined.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromStructuredFacultyList(string $html, array $keywords, string $sourceUrl): array
    {
        $items = [];
        $seen = [];

        $collegeName = null;
        if (preg_match('/<div\s+class="title">\s*([^<]+?)\s*<\/div>/u', $html, $collegeMatch)) {
            $collegeName = CrawlAuthorParser::cleanText($collegeMatch[1]);
        }

        // Restrict the search to the list container when it can be found.
        $listHtml = $html;
        if (preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $listMatch)) {
            $listHtml = $listMatch[1];
        }

        if (! preg_match_all('#<li>(.*?)</li>#su', $listHtml, $liBlocks)) {
            return [];
        }

        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);

        foreach ($liBlocks[1] as $inner) {
            $inner = (string) $inner;
            if (! preg_match('/<div\s+class="name">\s*([^<]+?)\s*<\/div>/u', $inner, $nameMatch)) {
                continue;
            }

            // cleanText() may return null; normalise to '' before the string-typed name check,
            // as the other extractors in this class do.
            $name = CrawlAuthorParser::cleanText($nameMatch[1]) ?? '';
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }

            $href = '';
            if (preg_match('/<a\s+[^>]*href="([^"]*)"/u', $inner, $hrefMatch)) {
                $href = (string) $hrefMatch[1];
            }

            // Prefer the entry's own link; otherwise look for the name in the page's inline scripts.
            $profileUrl = $this->resolveUrl($href, $sourceUrl)
                ?? $this->inferProfileUrlFromPageScripts($html, $name, $sourceUrl);

            $plain = $this->htmlToPlain($inner);
            if (! $this->matchesKeywords($plain.' '.$name, $keywords)) {
                continue;
            }

            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
            if (isset($seen[$dedupeKey])) {
                continue;
            }
            $seen[$dedupeKey] = true;

            $affiliation = $this->parseLabeledField($inner, '所在单位') ?? $collegeName;
            $academicTitle = $this->parseLabeledField($inner, '职称');
            // On list pages the "所在单位" field is usually the college, so the university
            // name is inferred from the site / page header first.
            $universityName = $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation);
            $bio = $this->parseLabeledField($inner, '简介');

            $summaryParts = array_filter([
                $academicTitle ? '职称：'.$academicTitle : null,
                $affiliation ? '单位：'.$affiliation : null,
                $bio,
            ]);

            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($dedupeKey),
                name: $name,
                profileUrl: $profileUrl,
                email: null,
                affiliation: $affiliation,
                universityName: $universityName,
                // No parts means no summary (null), matching the other extractors, rather than ''.
                summary: $summaryParts !== [] ? Str::limit(implode('；', $summaryParts), 300) : null,
                keywords: $keywords,
                academicTitle: $academicTitle,
                platform: 'faculty_html_tsites',
                bio: $bio,
            );
        }

        return $items;
    }

    /**
     * Assembles the CrawlItemDto for one faculty member.
     *
     * The affiliation is stored both as `affiliation` and as `college`; the
     * person's name is used as the item's title and authors string.
     *
     * @param list<string> $keywords
     */
    protected function makeFacultyItem(
        string $externalKey,
        string $name,
        ?string $profileUrl,
        ?string $email,
        ?string $affiliation,
        ?string $universityName,
        ?string $summary,
        array $keywords,
        ?string $academicTitle,
        string $platform,
        ?string $bio = null,
    ): CrawlItemDto {
        $college = $affiliation;
        $lead = [
            'name' => $name,
            'email' => $email,
            'affiliation' => $college,
            'college' => $college,
            'university_name' => $universityName,
            'academic_title' => $academicTitle,
            'bio' => $bio,
            'profile_url' => $profileUrl,
        ];

        return new CrawlItemDto(
            externalId: $externalKey,
            title: $name,
            canonicalUrl: $profileUrl,
            authors: $name,
            summary: $summary,
            schoolName: $universityName,
            extra: [
                'platform' => $platform,
                'academic_title' => $academicTitle,
                'college_name' => $college,
                'bio' => $bio,
                'profile_url' => $profileUrl,
                'lead_author' => $lead,
                'keyword' => implode(' ', $keywords),
            ],
            authorsParsed: [[
                'name' => $name,
                'email' => $email,
                'affiliation' => $college,
                'university_name' => $universityName,
                'academic_title' => $academicTitle,
            ]],
        );
    }

    /**
     * Returns true when no keywords are given, or when at least one non-empty keyword
     * occurs in $plain (case-insensitive).
     *
     * @param list<string> $keywords
     */
    protected function matchesKeywords(string $plain, array $keywords): bool
    {
        if ($keywords === []) {
            return true;
        }

        foreach ($keywords as $kw) {
            if ($kw !== '' && stripos($plain, $kw) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Strips tags, decodes HTML entities and collapses whitespace runs to single spaces.
     * Yields '' when the whitespace regex fails (e.g. on invalid UTF-8 input).
     */
    protected function htmlToPlain(string $html): string
    {
        $plain = html_entity_decode(strip_tags($html), ENT_QUOTES | ENT_HTML5, 'UTF-8');

        return preg_replace('/\s+/u', ' ', $plain) ??
'';
    }

    /**
     * Returns the cleaned text that follows "<label>：" (full-width or ASCII colon) up to
     * the next tag, or null when the label is not present.
     */
    protected function parseLabeledField(string $html, string $label): ?string
    {
        $pattern = '/'.preg_quote($label, '/').'[：:]\s*([^<]+)/u';
        if (! preg_match($pattern, $html, $match)) {
            return null;
        }

        return CrawlAuthorParser::cleanText($match[1]);
    }

    /**
     * Heuristic name check: 2–4 CJK ideographs (optionally followed by "·" and 2–4 more),
     * or a Latin name of 2–31 letters, spaces, dots and hyphens starting with a letter.
     * Common navigation labels are rejected up front.
     */
    protected function looksLikePersonName(string $name): bool
    {
        if (preg_match('/^(首页|登录|联系我们|下页|尾页|转到)/u', $name)) {
            return false;
        }

        return (bool) preg_match('/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u', $name)
            || (bool) preg_match('/^[A-Za-z][A-Za-z\s\.\-]{1,30}$/', $name);
    }

    /**
     * Resolves $href against $baseUrl.
     *
     * Returns null for hrefs that do not point at a page: empty values,
     * fragment-only links ("#…") and `javascript:` / `mailto:` / `tel:` links
     * (scheme matched case-insensitively). Absolute http(s) URLs are returned
     * unchanged. When $baseUrl has no scheme or host the href is returned as-is.
     * Dot segments ("./", "../") in relative hrefs are not normalised.
     */
    protected function resolveUrl(string $href, string $baseUrl): ?string
    {
        $href = trim(html_entity_decode($href, ENT_QUOTES | ENT_HTML5, 'UTF-8'));

        // A fragment-only link would otherwise resolve to the list page itself, giving every
        // such entry the same URL (and therefore the same dedupe key in the extractors).
        if ($href === '' || str_starts_with($href, '#')) {
            return null;
        }
        if (preg_match('#^(?:javascript|mailto|tel):#i', $href)) {
            return null;
        }
        if (preg_match('#^https?://#i', $href)) {
            return $href;
        }

        $base = parse_url($baseUrl);
        if (! is_array($base) || empty($base['scheme']) || empty($base['host'])) {
            return $href;
        }

        $origin = $base['scheme'].'://'.$base['host'];
        if (! empty($base['port'])) {
            $origin .= ':'.$base['port'];
        }

        // Protocol-relative URL: reuse the base scheme.
        if (str_starts_with($href, '//')) {
            return $base['scheme'].':'.$href;
        }
        if (str_starts_with($href, '/')) {
            return $origin.$href;
        }

        // Relative path: resolve against the directory of the base path.
        $path = $base['path'] ?? '/';
        $dir = str_contains($path, '/') ? substr($path, 0, (int) strrpos($path, '/') + 1) : '/';

        return $origin.$dir.$href;
    }

    /**
     * Looks for an inline `addimg("…", "/…index.htm", "<name>")` call in the page and
     * resolves its second argument as the profile URL for $name.
     */
    protected function inferProfileUrlFromPageScripts(string $html, string $name, string $sourceUrl): ?string
    {
        $escaped = preg_quote($name, '/');
        if (! preg_match(
            '/addimg\(\s*(?:"[^"]*"|\'[^\']*\')\s*,\s*"(\\/[^"]+index\.htm)"\s*,\s*"'.$escaped.'"/u',
            $html,
            $match,
        )) {
            return null;
        }

        return $this->resolveUrl($match[1], $sourceUrl);
    }

    /**
     * Infers the university name, first from the host of $sourceUrl and then from
     * the first "…大学" phrase in the page text.
     *
     * A host matches a known domain when it equals the domain or is a subdomain
     * of it; unrelated hosts that merely contain the domain text do not match.
     */
    protected function inferUniversityFromSource(string $sourceUrl, string $html): ?string
    {
        $knownDomains = [
            'sjtu.edu.cn' => '上海交通大学',
            'tsinghua.edu.cn' => '清华大学',
            'pku.edu.cn' => '北京大学',
            'zju.edu.cn' => '浙江大学',
            'fudan.edu.cn' => '复旦大学',
            'nju.edu.cn' => '南京大学',
        ];

        $host = parse_url($sourceUrl, PHP_URL_HOST);
        if (is_string($host)) {
            // Lower-case and drop a trailing root dot so "WWW.NJU.EDU.CN." still matches.
            $host = rtrim(strtolower($host), '.');
            foreach ($knownDomains as $domain => $university) {
                if ($host === $domain || str_ends_with($host, '.'.$domain)) {
                    return $university;
                }
            }
        }

        if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,20}大学)/u', $this->htmlToPlain($html), $match)) {
            return CrawlAuthorParser::cleanText($match[1]);
        }

        return null;
    }

    /**
     * Guesses a person's name: the first run of 2–4 CJK ideographs in $plain (a following
     * title word such as 教授 is allowed but not captured); otherwise the e-mail local
     * part with ".", "_" and "-" turned into spaces and title-cased.
     */
    protected function guessName(string $plain, string $email): string
    {
        if (preg_match('/([\x{4e00}-\x{9fff}]{2,4})\s*(?:教授|副教授|讲师|研究员|博士|老师)?/u', $plain, $m)) {
            return trim($m[1]);
        }

        $local = strstr($email, '@', true) ?: '';
        $local = str_replace(['.', '_', '-'], ' ', $local);

        return Str::title(trim($local));
    }

    /**
     * Guesses an affiliation: the first phrase in $plain that ends in
     * 大学 / 学院 / 研究院 / 研究所 / University / College, or null when there is none.
     */
    protected function guessAffiliation(string $plain): ?string
    {
        if (preg_match('/((?:[\x{4e00}-\x{9fff}A-Za-z\s]{2,40}(?:大学|学院|研究院|研究所|University|College)))/u', $plain, $m)) {
            return CrawlAuthorParser::cleanText($m[1]);
        }

        return null;
    }
}