normalizeRequestUrl($requestUrl);
        $firstHtml = $this->fetchHtml($baseUrl);
        if ($this->isAjaxTeacherListPage($firstHtml, $requestUrl)) {
            $items = $this->fetchAjaxTeacherItems($requestUrl, $firstHtml, $keywords, $maxResults);

            return $this->enrichEmailsFromProfilePages($items, $params);
        }
        $totalPages = $this->detectTotalPages($firstHtml);
        $pagesToFetch = min($maxPages, $totalPages);
        $merged = [];
        $seen = [];
        // Only page 1 has HTML at this point; the loop stops at the first page without a body.
        for ($page = 1; $page <= $pagesToFetch && count($merged) < $maxResults; $page++) {
            $html = $page === 1 ? $firstHtml : null;
            if ($html === null) {
                break;
            }
            foreach ($this->extractFromHtml($html, $keywords, $requestUrl) as $item) {
                if (isset($seen[$item->externalId])) {
                    continue;
                }
                $seen[$item->externalId] = true;
                $merged[] = $item;
                if (count($merged) >= $maxResults) {
                    break 2;
                }
            }
        }
        if ($pagesToFetch > 1 && count($merged) < $maxResults) {
            $merged = $this->fetchRemainingListPages(
                $baseUrl,
                $firstHtml,
                $pagesToFetch,
                $keywords,
                $requestUrl,
                $merged,
                $seen,
                $maxResults,
            );
        }

        return $this->enrichEmailsFromProfilePages($merged, $params);
    }

    /**
     * Fetch list pages 2..$pagesToFetch in concurrent batches and append their
     * not-yet-seen items to $merged, stopping as soon as $maxResults is reached.
     *
     * The batch size comes from config `crawl.faculty.list_fetch_pool_size`
     * (default 5) and is clamped to the range 1..10.
     *
     * @param list<CrawlItemDto> $merged items already collected
     * @param array<string, true> $seen external ids already collected
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function fetchRemainingListPages(
        string $baseUrl,
        string $firstHtml,
        int $pagesToFetch,
        array $keywords,
        string $requestUrl,
        array $merged,
        array $seen,
        int $maxResults,
    ): array {
        $configuredPool = (int) config('crawl.faculty.list_fetch_pool_size', 5);
        $concurrency = max(1, min(10, $configuredPool));

        $urlsByPage = [];
        $pageNumber = 2;
        while ($pageNumber <= $pagesToFetch) {
            $urlsByPage[$pageNumber] = $this->buildPageUrl($baseUrl, $pageNumber, $firstHtml);
            $pageNumber++;
        }

        // Keys (page numbers) are preserved so each batch can be processed in page order.
        foreach (array_chunk($urlsByPage, $concurrency, true) as $batch) {
            $bodies = $this->fetchHtmlPool($batch);
            ksort($bodies);

            foreach ($bodies as $body) {
                foreach ($this->extractFromHtml($body, $keywords, $requestUrl) as $candidate) {
                    $id = $candidate->externalId;
                    if (isset($seen[$id])) {
                        continue;
                    }

                    $seen[$id] = true;
                    $merged[] = $candidate;

                    if (count($merged) >= $maxResults) {
                        return $merged;
                    }
                }
            }
        }

        return $merged;
    }

    /**
     * @param array<int, string> $pageUrls page number => URL
* @return array<int, string> page number => response body (failed or empty responses are omitted)
     */
    protected function fetchHtmlPool(array $pageUrls): array
    {
        if ($pageUrls === []) {
            return [];
        }

        $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
        $connectTimeout = min(8, $timeout);
        $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];
        // Retry once, and only for connection-level failures.
        $retryOnConnectionError = fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException;

        $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use (
            $pageUrls,
            $headers,
            $timeout,
            $connectTimeout,
            $retryOnConnectionError,
        ) {
            foreach ($pageUrls as $page => $url) {
                $request = $pool->as((string) $page)
                    ->timeout($timeout)
                    ->connectTimeout($connectTimeout)
                    ->retry(1, 300, $retryOnConnectionError, throw: false)
                    ->withHeaders($headers);
                $request->get($url);
            }
        });

        $htmlByPage = [];
        foreach (array_keys($pageUrls) as $page) {
            $body = $this->responseBodyFromPoolResult($responses[(string) $page] ?? null);
            if ($body === null || $body === '') {
                continue;
            }
            $htmlByPage[$page] = $body;
        }

        return $htmlByPage;
    }

    /**
     * Fill in missing e-mail addresses (and profile metadata) by fetching the
     * profile page of items that have a canonical URL but no e-mail yet.
     *
     * At most resolveProfileEnrichMax() items are fetched; eligible items beyond
     * that budget are flagged via markItemProfileEnrichSkipped(). The result is
     * re-indexed but keeps the order of $items.
     *
     * @param list<CrawlItemDto> $items
     * @param array<string, mixed> $params
     * @return list<CrawlItemDto>
     */
    protected function enrichEmailsFromProfilePages(array $items, array $params = []): array
    {
        if (! config('crawl.faculty.profile_email_enrich_enabled', true)) {
            return $items;
        }

        $maxEnrich = $this->resolveProfileEnrichMax($params, count($items));
        if ($maxEnrich <= 0) {
            return $this->markProfileEnrichSkipped($items);
        }

        $poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 8)));
        $timeout = max(5, (int) config('crawl.faculty.profile_http_timeout_seconds', 10));
        $connectTimeout = min(8, $timeout);
        $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];
        $retryOnConnectionError = fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException;

        // Select, in input order, the items to fetch until the budget is used up.
        $targets = [];
        foreach ($items as $index => $item) {
            if (count($targets) >= $maxEnrich) {
                break;
            }
            if (! $this->itemHasEmail($item) && $item->canonicalUrl) {
                $targets[$index] = $item;
            }
        }

        if ($targets === []) {
            return $items;
        }

        $enriched = [];
        foreach (array_chunk($targets, $poolSize, true) as $batch) {
            $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use (
                $batch,
                $headers,
                $timeout,
                $connectTimeout,
                $retryOnConnectionError,
            ) {
                foreach ($batch as $index => $item) {
                    $request = $pool->as((string) $index)
                        ->timeout($timeout)
                        ->connectTimeout($connectTimeout)
                        ->retry(1, 300, $retryOnConnectionError, throw: false)
                        ->withHeaders($headers);
                    $request->get($item->canonicalUrl);
                }
            });

            foreach ($batch as $index => $item) {
                $body = $this->responseBodyFromPoolResult($responses[(string) $index] ?? null);
                if ($body !== null) {
                    $email = $this->extractEmailFromProfileHtml($body);
                    if ($email) {
                        $item = $this->applyEmailToItem($item, $email);
                    }
                    $item = $this->applyProfileMetadataToItem($item, $body);
                }
                // Recorded even when the fetch failed, so the item is not flagged as skipped below.
                $enriched[$index] = $item;
            }
        }

        $result = [];
        foreach ($items as $index => $item) {
            $result[] = match (true) {
                isset($enriched[$index]) => $enriched[$index],
                ! $this->itemHasEmail($item) && $item->canonicalUrl => $this->markItemProfileEnrichSkipped($item),
                default => $item,
            };
        }

        return $result;
    }

    /**
     * Resolve how many profile pages may be fetched for this run.
     *
     * @param array<string, mixed> $params
     */
    protected function resolveProfileEnrichMax(array $params, int $itemCount): int
    {
        if (($params['skip_profile_enrich'] ?? false) === true) {
            return 0;
        }
        $configured = (int) ($params['profile_enrich_max'] ??
config('crawl.faculty.profile_enrich_max', 32));

        // Never more than the number of items, and hard-capped at 200.
        return max(0, min($itemCount, min(200, $configured)));
    }

    /**
     * Flag every item without an e-mail as "profile enrichment skipped".
     *
     * @param list<CrawlItemDto> $items
     * @return list<CrawlItemDto>
     */
    protected function markProfileEnrichSkipped(array $items): array
    {
        $marked = [];
        foreach ($items as $key => $item) {
            $marked[$key] = $this->markItemProfileEnrichSkipped($item);
        }

        return $marked;
    }

    /**
     * Return a copy of the item with `extra['profile_enrich_skipped'] = true`;
     * items that already carry an e-mail are returned unchanged.
     */
    protected function markItemProfileEnrichSkipped(CrawlItemDto $item): CrawlItemDto
    {
        if (! $this->itemHasEmail($item)) {
            $extra = $item->extra;
            $extra['profile_enrich_skipped'] = true;

            return new CrawlItemDto(
                externalId: $item->externalId,
                title: $item->title,
                canonicalUrl: $item->canonicalUrl,
                authors: $item->authors,
                summary: $item->summary,
                publishedAt: $item->publishedAt,
                schoolName: $item->schoolName,
                section: $item->section,
                contentHtml: $item->contentHtml,
                extra: $extra,
                authorsParsed: $item->authorsParsed,
            );
        }

        return $item;
    }

    /**
     * Body of a pooled response, or null when the pool entry is not a
     * successful Response (for example an exception object).
     */
    protected function responseBodyFromPoolResult(mixed $result): ?string
    {
        $usable = $result instanceof Response && $result->successful();

        return $usable ? (string) $result->body() : null;
    }

    /**
     * Whether the lead author or any parsed author has an e-mail that
     * CrawlAuthorParser::normalizeEmail() accepts.
     */
    protected function itemHasEmail(CrawlItemDto $item): bool
    {
        $leadAuthor = $item->extra['lead_author'] ?? null;
        if (is_array($leadAuthor) && CrawlAuthorParser::normalizeEmail($leadAuthor['email'] ?? null)) {
            return true;
        }

        foreach ($item->authorsParsed as $parsedAuthor) {
            if (! CrawlAuthorParser::normalizeEmail($parsedAuthor['email'] ?? null)) {
                continue;
            }

            return true;
        }

        return false;
    }

    /**
     * Return a copy of the item with $email set on the lead author and on the
     * first parsed author (a parsed author is created when there is none).
     */
    protected function applyEmailToItem(CrawlItemDto $item, string $email): CrawlItemDto
    {
        $email = CrawlAuthorParser::normalizeEmail($email) ?? $email;
        $lead = is_array($item->extra['lead_author'] ?? null) ? $item->extra['lead_author'] : [];
        $lead['email'] = $email;
        $authorsParsed = $item->authorsParsed;
        if ($authorsParsed === []) {
            $authorsParsed = [[
                'name' => $item->title,
                'email' => $email,
                'affiliation' => $lead['affiliation'] ?? $lead['college'] ?? null,
                'university_name' => $lead['university_name'] ??
$item->schoolName,
            ]];
        } else {
            $authorsParsed[0]['email'] = $email;
        }
        $extra = $item->extra;
        $extra['lead_author'] = $lead;

        return new CrawlItemDto(
            externalId: $item->externalId,
            title: $item->title,
            canonicalUrl: $item->canonicalUrl,
            authors: $item->authors,
            summary: $item->summary,
            publishedAt: $item->publishedAt,
            schoolName: $item->schoolName,
            section: $item->section,
            contentHtml: $item->contentHtml,
            extra: $extra,
            authorsParsed: $authorsParsed,
        );
    }

    /**
     * Pick the most plausible contact e-mail from a profile page.
     *
     * Labelled addresses ("电子邮箱:", "E-mail:", ...) are tried first, in the
     * order listed. Otherwise every address found in the page is collected,
     * noise addresses (see isNoiseEmail()) are dropped, and an address ending in
     * ".edu.cn" or ".edu" is preferred over the first remaining candidate.
     *
     * @return string|null normalized e-mail, or null when none was found
     */
    protected function extractEmailFromProfileHtml(string $html): ?string
    {
        // The separator class accepts both the fullwidth colon ":" (U+FF1A), which is
        // the usual punctuation after Chinese labels, and the ASCII colon.
        $labeledPatterns = [
            '/电子邮箱[::]\s*<\/strong>\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/电子邮箱[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/电子信箱[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/E-?mail[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/iu',
            '/邮箱[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/电子邮件[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
        ];
        foreach ($labeledPatterns as $pattern) {
            if (preg_match($pattern, $html, $match)) {
                $email = CrawlAuthorParser::normalizeEmail($match[1]);
                if ($email && ! $this->isNoiseEmail($email)) {
                    return $email;
                }
            }
        }

        // Fallback: scan the whole page for anything that looks like an address.
        $candidates = [];
        if (preg_match_all(
            '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
            $html,
            $emailMatches,
        )) {
            foreach ($emailMatches[1] as $raw) {
                $email = CrawlAuthorParser::normalizeEmail($raw);
                if ($email && !
$this->isNoiseEmail($email)) {
                    $candidates[] = $email;
                }
            }
        }
        if ($candidates === []) {
            return null;
        }
        $candidates = array_values(array_unique($candidates));
        // Prefer an academic address over whatever happened to come first.
        foreach ($candidates as $email) {
            if (str_ends_with($email, '.edu.cn') || str_ends_with($email, '.edu')) {
                return $email;
            }
        }

        return $candidates[0];
    }

    /**
     * Whether the address is a generic role mailbox (noreply, admin, webmaster, ...).
     */
    protected function isNoiseEmail(string $email): bool
    {
        return (bool) preg_match(
            '/^(noreply|no-reply|admin|webmaster|postmaster|root|support|service|info|contact)@/i',
            $email,
        );
    }

    /**
     * GET a page and return its body.
     *
     * The timeout comes from config `crawl.faculty.list_http_timeout_seconds`
     * (default 20, minimum 5); connection failures are retried once.
     *
     * @throws \RuntimeException when the response status is not successful
     */
    protected function fetchHtml(string $url): string
    {
        $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
        $response = Http::timeout($timeout)
            ->connectTimeout(min(8, $timeout))
            ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
            ->withHeaders(['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'])
            ->get($url);
        if (! $response->successful()) {
            throw new \RuntimeException('页面请求失败(HTTP '.$response->status().'):'.$url);
        }

        return (string) $response->body();
    }

    /**
     * Work out how many list pages exist, always returning at least 1.
     *
     * Uses an explicit `totalpage=N` marker when present. Otherwise divides the
     * "共 N 条" (N records in total) counter by the number of <li> entries in the
     * first page's `<div class="list"><ul>` container.
     */
    protected function detectTotalPages(string $html): int
    {
        if (preg_match('/totalpage=(\d+)/i', $html, $match)) {
            return max(1, (int) $match[1]);
        }
        if (preg_match('/共\s*(\d+)\s*条/u', $html, $countMatch)) {
            $perPage = 0;
            // Same list container and item markup that extractFromStructuredFacultyList() parses.
            if (preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $listMatch)) {
                $perPage = preg_match_all('/<li>/u', $listMatch[1]) ?: 0;
            }
            if ($perPage > 0) {
                return max(1, (int) ceil(((int) $countMatch[1]) / $perPage));
            }
        }

        return 1;
    }

    /**
     * Build the URL of list page $page from the first page's URL.
     *
     * Sets the `PAGENUM` query parameter, and `totalpage` when the first page's
     * HTML exposes one. A URL without scheme or host is returned unchanged.
     */
    protected function buildPageUrl(string $baseUrl, int $page, string $firstPageHtml): string
    {
        $parts = parse_url($baseUrl);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return $baseUrl;
        }
        parse_str((string) ($parts['query'] ?? ''), $query);
        $query['PAGENUM'] = (string) $page;
        if (preg_match('/totalpage=(\d+)/i', $firstPageHtml, $totalMatch)) {
            $query['totalpage'] = $totalMatch[1];
        }
        $url = $parts['scheme'].'://'.$parts['host'];
        if (! empty($parts['port'])) {
            $url .= ':'.$parts['port'];
        }
        $url .= $parts['path'] ?? '/';
        if ($query !== []) {
            $url .= '?'.http_build_query($query);
        }

        return $url;
    }

    /**
     * Rebuild the URL from scheme, host, port, path and query, dropping any
     * other component. A URL without scheme or host is returned unchanged.
     */
    protected function normalizeRequestUrl(string $url): string
    {
        $parts = parse_url($url);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return $url;
        }
        $normalized = $parts['scheme'].'://'.$parts['host'];
        if (! empty($parts['port'])) {
            $normalized .= ':'.$parts['port'];
        }
        $normalized .= $parts['path'] ?? '/';
        if (!
empty($parts['query'])) {
            $normalized .= '?'.$parts['query'];
        }

        return $normalized;
    }

    /**
     * Run the list-page extractors in priority order and return the result of
     * the first one that yields any item: e-mail blocks, then the structured
     * faculty list, then the staff-panel list.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromHtml(string $html, array $keywords, string $sourceUrl): array
    {
        $found = $this->extractFromEmailBlocks($html, $keywords, $sourceUrl);

        if ($found === []) {
            $found = $this->extractFromStructuredFacultyList($html, $keywords, $sourceUrl);
        }

        if ($found === []) {
            $found = $this->extractFromStaffPanelList($html, $keywords, $sourceUrl);
        }

        return $found;
    }

    /**
     * Whether the page loads its teacher list through the `ajax_teacher_list`
     * endpoint, judged from the URL or from the page HTML.
     */
    protected function isAjaxTeacherListPage(string $html, string $sourceUrl): bool
    {
        return str_contains(strtolower($sourceUrl), 'ajax_teacher_list')
            || str_contains($html, 'ajax_teacher_list.html');
    }

    /**
     * Query the AJAX teacher-list endpoint once and extract at most
     * $maxResults items from its `content` field.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     *
     * @throws \RuntimeException when the endpoint fails or does not return a JSON array
     */
    protected function fetchAjaxTeacherItems(
        string $requestUrl,
        string $pageHtml,
        array $keywords,
        int $maxResults,
    ): array {
        $config = $this->parseAjaxTeacherConfig($pageHtml, $requestUrl);
        $search = implode(' ', $keywords);
        $type = $search !== '' ? '2' : '1';
        $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
        $payload = [
            'cat_id' => $config['cat_id'],
            'cat_code' => $config['cat_code'],
            'type' => $type,
            'zm' => $search === '' ? 'All' : '',
            'zc' => '',
            'search' => $search,
        ];
        if ($config['uses_page']) {
            $payload['page'] = '1';
        }
        $response = Http::timeout($timeout)
            ->connectTimeout(min(8, $timeout))
            ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
            ->withHeaders([
                'User-Agent' => 'SlakeSchool-Crawler/1.0',
                'Accept' => 'application/json, text/html',
            ])
            ->asForm()
            ->post($config['api_url'], $payload);
        if (! $response->successful()) {
            throw new \RuntimeException('教师列表接口请求失败(HTTP '.$response->status().')');
        }
        $body = $response->json();
        if (! is_array($body)) {
            throw new \RuntimeException('教师列表接口返回格式异常');
        }
        $content = (string) ($body['content'] ??
'');
        if ($content === '') {
            return [];
        }
        // The page HTML is prepended before extraction.
        $items = $this->extractFromAjaxTeacherContent(
            $pageHtml.$content,
            $keywords,
            $requestUrl,
            $config['cat_code'],
        );
        if (count($items) > $maxResults) {
            $items = array_slice($items, 0, $maxResults);
        }

        return $items;
    }

    /**
     * Read the AJAX endpoint parameters from the page's inline script.
     *
     * The endpoint defaults to `{origin}/active/ajax_teacher_list.html` and is
     * overridden by a `url: '...ajax_teacher_list...'` entry when one is found.
     *
     * @return array{cat_id:string,cat_code:string,api_url:string,uses_page:bool}
     *
     * @throws \RuntimeException when cat_id, cat_code or the endpoint URL cannot be determined
     */
    protected function parseAjaxTeacherConfig(string $html, string $sourceUrl): array
    {
        $origin = $this->requestOrigin($sourceUrl);
        $endpoint = $origin === null ? '' : $origin.'/active/ajax_teacher_list.html';

        $categoryId = preg_match("/cat_id\s*:\s*'(\d+)'/i", $html, $idMatch) ? $idMatch[1] : null;
        $categoryCode = preg_match("/cat_code\s*:\s*'([^']+)'/i", $html, $codeMatch) ? $codeMatch[1] : null;

        if (preg_match("#url\s*:\s*'([^']*ajax_teacher_list[^']*)'#i", $html, $urlMatch)) {
            $declared = html_entity_decode($urlMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            $endpoint = $this->resolveUrl($declared, $sourceUrl) ?? $endpoint;
        }

        // resolveUrl() can hand back a root-relative path; anchor it to the origin.
        if ($origin !== null && str_starts_with($endpoint, '/')) {
            $endpoint = $origin.$endpoint;
        }

        if ($categoryId === null || $categoryCode === null || $endpoint === '') {
            throw new \RuntimeException('无法解析教师列表接口参数(cat_id / cat_code)');
        }

        return [
            'cat_id' => $categoryId,
            'cat_code' => $categoryCode,
            'api_url' => $endpoint,
            'uses_page' => str_contains($html, 'page:page'),
        ];
    }

    /**
     * Extract teacher items from AJAX list content, grouped per department
     * section when sections can be told apart; otherwise the whole HTML is
     * treated as a single block.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromAjaxTeacherContent(
        string $html,
        array $keywords,
        string $sourceUrl,
        ?string $catCode = null,
    ): array {
        $items = [];
        $seen = [];
        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
        $defaultCollege = $this->inferCollegeFromPageTitle($html);
        // NOTE(review): the split pattern below is empty and the section-title pattern has no
        // markup anchor; both look truncated — confirm the intended section delimiter markup.
        $parts = preg_split('##u', $html) ?: [];
        if (count($parts) > 1) {
            array_shift($parts);
            foreach ($parts as $block) {
                $department = $defaultCollege;
                if (preg_match('#.*?([^<]+)#su', $block, $deptMatch)) {
                    $sectionTitle = CrawlAuthorParser::cleanText($deptMatch[1]);
                    if ($sectionTitle !== null && $sectionTitle !== '' && !
$this->looksLikePersonName($sectionTitle)) {
                        $department = $sectionTitle;
                    }
                }
                foreach ($this->extractTeacherLinksFromHtmlBlock(
                    $block,
                    $keywords,
                    $sourceUrl,
                    $pageUniversity,
                    $department,
                    $catCode,
                ) as $item) {
                    if (isset($seen[$item->externalId])) {
                        continue;
                    }
                    $seen[$item->externalId] = true;
                    $items[] = $item;
                }
            }
            if ($items !== []) {
                return $items;
            }
        }

        // No per-section result: treat the whole HTML as one block.
        return $this->extractTeacherLinksFromHtmlBlock(
            $html,
            $keywords,
            $sourceUrl,
            $pageUniversity,
            $defaultCollege,
            $catCode,
        );
    }

    /**
     * Turn every anchor in $html that looks like a teacher profile link into a
     * faculty item.
     *
     * An anchor qualifies when its text looks like a person name, its href
     * passes looksLikeTeacherProfileUrl() and "name + affiliation" matches the
     * keywords. Items are de-duplicated by resolved profile URL, falling back
     * to the name.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractTeacherLinksFromHtmlBlock(
        string $html,
        array $keywords,
        string $sourceUrl,
        ?string $pageUniversity,
        ?string $affiliation,
        ?string $catCode,
    ): array {
        $items = [];
        $seen = [];
        // Group 1 is the anchor's attribute string, group 2 its inner HTML.
        // The previous pattern had an unmatched ")" and could not compile.
        if (! preg_match_all('#<a\b([^>]*?)>(.*?)</a>#su', $html, $matches, PREG_SET_ORDER)) {
            return [];
        }
        foreach ($matches as $match) {
            $attrs = (string) $match[1];
            $rawName = preg_replace('/\s+/u', '', strip_tags($match[2])) ?? '';
            $name = CrawlAuthorParser::cleanText($rawName) ?? '';
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }
            if (! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) {
                continue;
            }
            $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            if (! $this->looksLikeTeacherProfileUrl($href, $catCode)) {
                continue;
            }
            $profileUrl = $this->resolveUrl($href, $sourceUrl);
            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
            if (isset($seen[$dedupeKey])) {
                continue;
            }
            $plain = trim($name.' '.($affiliation ?? ''));
            if (! $this->matchesKeywords($plain, $keywords)) {
                continue;
            }
            $seen[$dedupeKey] = true;
            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($dedupeKey),
                name: $name,
                profileUrl: $profileUrl,
                email: null,
                affiliation: $affiliation,
                universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation),
                summary: $affiliation ?
'单位:'.$affiliation : null,
                keywords: $keywords,
                academicTitle: null,
                platform: 'faculty_html_ajax',
                bio: null,
            );
        }

        return $items;
    }

    /**
     * Whether $href points at a teacher profile page: a `.html` file directly
     * under `/faculty/`, `/jiaoshiml/`, `/people/detail_new/` or, when a
     * category code is given, under `/{catCode}/`.
     */
    protected function looksLikeTeacherProfileUrl(string $href, ?string $catCode): bool
    {
        $path = strtolower((string) parse_url($href, PHP_URL_PATH));
        if ($path === '') {
            return false;
        }
        if (preg_match('#/(faculty|jiaoshiml|people/detail_new)/[^/]+\.html$#', $path)) {
            return true;
        }
        if ($catCode !== null && $catCode !== '') {
            $code = preg_quote(strtolower($catCode), '#');

            return (bool) preg_match('#/'.$code.'/[^/]+\.html$#', $path);
        }

        return false;
    }

    /**
     * `scheme://host[:port]` of the URL, or null when scheme or host is missing.
     */
    protected function requestOrigin(string $sourceUrl): ?string
    {
        $parts = parse_url($sourceUrl);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }
        $origin = $parts['scheme'].'://'.$parts['host'];
        if (! empty($parts['port'])) {
            $origin .= ':'.$parts['port'];
        }

        return $origin;
    }

    /**
     * Build one faculty item per distinct e-mail address found in the page,
     * guessing name and affiliation from the text surrounding the address.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromEmailBlocks(string $html, array $keywords, string $sourceUrl): array
    {
        $items = [];
        $seen = [];
        if (! preg_match_all(
            '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
            $html,
            $emailMatches,
            PREG_OFFSET_CAPTURE
        )) {
            return [];
        }
        foreach ($emailMatches[1] as $match) {
            $email = CrawlAuthorParser::normalizeEmail($match[0]);
            if (! $email || isset($seen[$email])) {
                continue;
            }
            $pos = (int) $match[1];
            // Up to 800 bytes of context starting 400 bytes before the address. mb_strcut()
            // takes byte offsets like substr() but never splits a multibyte character; a
            // split character makes the /u regex in htmlToPlain() fail and empties the text.
            $window = mb_strcut($html, max(0, $pos - 400), 800, 'UTF-8');
            $plain = $this->htmlToPlain($window);
            if (! $this->matchesKeywords($plain, $keywords)) {
                continue;
            }
            $name = $this->guessName($plain, $email);
            if ($name === '') {
                continue;
            }
            $affiliation = $this->guessAffiliation($plain);
            $seen[$email] = true;
            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($email),
                name: $name,
                profileUrl: $sourceUrl,
                email: $email,
                affiliation: $affiliation,
                universityName: CrawlAuthorParser::universityFromAffiliation($affiliation) ??
$this->inferUniversityFromSource($sourceUrl, $html),
                summary: Str::limit($plain, 300),
                keywords: $keywords,
                academicTitle: null,
                platform: 'faculty_html',
                bio: null,
            );
        }

        return $items;
    }

    /**
     * Staff lists built from "panel-item" sections containing "a.staff-item"
     * links to /people/detail_new/{id} (e.g. the SJTU School of Materials
     * Science and Engineering).
     *
     * Links are first collected per panel so each person gets the panel title
     * as department; if that yields nothing, all staff links in the page are
     * used with the college inferred from the page title.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromStaffPanelList(string $html, array $keywords, string $sourceUrl): array
    {
        $items = [];
        $seen = [];
        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
        $defaultCollege = $this->inferCollegeFromPageTitle($html);
        // NOTE(review): the split pattern below is empty and the panel-title pattern has no
        // markup anchor; both look truncated — confirm the intended panel/title markup.
        $panelChunks = preg_split('##u', $html) ?: [];
        if (count($panelChunks) > 1) {
            array_shift($panelChunks);
            foreach ($panelChunks as $chunk) {
                if (! preg_match('#\s*([^<]+?)\s*#u', $chunk, $titleMatch)) {
                    continue;
                }
                $department = CrawlAuthorParser::cleanText($titleMatch[1]);
                foreach ($this->extractStaffItemLinks($chunk) as $link) {
                    $item = $this->makeStaffPanelItem(
                        $link,
                        $department ?: $defaultCollege,
                        $pageUniversity,
                        $keywords,
                        $sourceUrl,
                    );
                    if ($item === null || isset($seen[$item->externalId])) {
                        continue;
                    }
                    $seen[$item->externalId] = true;
                    $items[] = $item;
                }
            }
        }
        if ($items !== []) {
            return $items;
        }
        // Fallback: every staff link in the page, without per-panel departments.
        foreach ($this->extractStaffItemLinks($html) as $link) {
            $item = $this->makeStaffPanelItem(
                $link,
                $defaultCollege,
                $pageUniversity,
                $keywords,
                $sourceUrl,
            );
            if ($item === null || isset($seen[$item->externalId])) {
                continue;
            }
            $seen[$item->externalId] = true;
            $items[] = $item;
        }

        return $items;
    }

    /**
     * Collect the distinct (href, name) pairs of anchors whose attributes
     * mention "staff-item" and whose text looks like a person name.
     *
     * @return list<array{href:string,name:string}>
     */
    protected function extractStaffItemLinks(string $html): array
    {
        $links = [];
        $seen = [];
        // Group 1 is the anchor's attribute string, group 2 its plain-text content.
        // The previous pattern had an unmatched ")" and could not compile.
        if (! preg_match_all('#<a\b([^>]*?)>([^<]+)</a>#su', $html, $matches, PREG_SET_ORDER)) {
            return [];
        }
        foreach ($matches as $match) {
            $attrs = (string) $match[1];
            if (! str_contains($attrs, 'staff-item')) {
                continue;
            }
            if (! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) {
                continue;
            }
            $name = CrawlAuthorParser::cleanText($match[2]) ?? '';
            if ($name === '' || !
$this->looksLikePersonName($name)) {
                continue;
            }
            $href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            $key = $href.'|'.$name;
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;
            $links[] = ['href' => $href, 'name' => $name];
        }

        return $links;
    }

    /**
     * Build a faculty item for one staff-panel link, or null when
     * "name + department" does not match the keywords.
     *
     * @param array{href:string,name:string} $link
     * @param list<string> $keywords
     */
    protected function makeStaffPanelItem(
        array $link,
        ?string $department,
        ?string $pageUniversity,
        array $keywords,
        string $sourceUrl,
    ): ?CrawlItemDto {
        $name = $link['name'];
        $profileUrl = $this->resolveUrl($link['href'], $sourceUrl);
        $plain = trim($name.' '.($department ?? ''));
        if (! $this->matchesKeywords($plain, $keywords)) {
            return null;
        }
        // Identity is the profile URL when there is one, otherwise the name.
        $dedupeKey = $profileUrl ?: ('name:'.md5($name));
        $affiliation = $department;
        $universityName = $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation);

        return $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($dedupeKey),
            name: $name,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $affiliation,
            universityName: $universityName,
            summary: $department ? '单位:'.$department : null,
            keywords: $keywords,
            academicTitle: null,
            platform: 'faculty_html_smse',
            bio: null,
        );
    }

    /**
     * Infer the college name from a "<title>Section - Site name</title>" tag.
     *
     * Takes the text after the first dash separator; when it contains a name
     * ending in 学院 / 研究院 / 学部 / 系 that name is returned, otherwise the
     * whole remainder.
     *
     * @return string|null null when the page has no title of that shape
     */
    protected function inferCollegeFromPageTitle(string $html): ?string
    {
        // Anchored on the opening <title> tag so the match cannot start in
        // text that precedes the tag.
        if (! preg_match('/<title>\s*[^<\-\–—]+[\-–—]\s*([^<]+?)\s*<\/title>/u', $html, $match)) {
            return null;
        }
        $title = CrawlAuthorParser::cleanText($match[1]);
        if ($title === null || $title === '') {
            return null;
        }
        if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $title, $college)) {
            return CrawlAuthorParser::cleanText($college[1]);
        }

        return $title;
    }

    /**
     * Fill in the lead author's academic title and department from a profile
     * page when the item does not have them yet. Returns the item unchanged
     * when nothing new was found.
     */
    protected function applyProfileMetadataToItem(CrawlItemDto $item, string $html): CrawlItemDto
    {
        $lead = is_array($item->extra['lead_author'] ?? null) ?
$item->extra['lead_author'] : [];
        $changed = false;
        // The first <em> element of the profile page is taken as the academic title.
        if (empty($lead['academic_title']) && preg_match('/<em>\s*([^<]+?)\s*<\/em>/u', $html, $titleMatch)) {
            $title = CrawlAuthorParser::cleanText($titleMatch[1]);
            if ($title !== null && $title !== '') {
                $lead['academic_title'] = $title;
                $changed = true;
            }
        }
        if (empty($lead['college']) && empty($lead['affiliation'])) {
            $dept = $this->parseLabeledField($html, '所属二级机构');
            if ($dept !== null && $dept !== '') {
                $lead['affiliation'] = $dept;
                $lead['college'] = $dept;
                $changed = true;
            }
        }
        if (! $changed) {
            return $item;
        }
        $extra = $item->extra;
        $extra['lead_author'] = $lead;
        if (! empty($lead['academic_title'])) {
            $extra['academic_title'] = $lead['academic_title'];
        }
        if (! empty($lead['college'])) {
            $extra['college_name'] = $lead['college'];
        }
        $authorsParsed = $item->authorsParsed;
        if ($authorsParsed !== []) {
            if (! empty($lead['academic_title'])) {
                $authorsParsed[0]['academic_title'] = $lead['academic_title'];
            }
            if (! empty($lead['college'])) {
                $authorsParsed[0]['affiliation'] = $lead['college'];
            }
        }

        return new CrawlItemDto(
            externalId: $item->externalId,
            title: $item->title,
            canonicalUrl: $item->canonicalUrl,
            authors: $item->authors,
            summary: $item->summary,
            publishedAt: $item->publishedAt,
            schoolName: $item->schoolName,
            section: $item->section,
            contentHtml: $item->contentHtml,
            extra: $extra,
            authorsParsed: $authorsParsed,
        );
    }

    /**
     * Extract faculty items from a structured list: `<li>` entries (inside
     * `<div class="list"><ul>` when present) that each contain a
     * `<div class="name">` plus labelled fields such as 所在单位 / 职称 / 简介.
     *
     * Items are de-duplicated by profile URL, falling back to the name.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromStructuredFacultyList(string $html, array $keywords, string $sourceUrl): array
    {
        $items = [];
        $seen = [];
        $collegeName = null;
        if (preg_match('/<div\s+class="title">\s*([^<]+?)\s*<\/div>/u', $html, $collegeMatch)) {
            $collegeName = CrawlAuthorParser::cleanText($collegeMatch[1]);
        }
        // Restrict the search to the list container when there is one.
        $listHtml = $html;
        if (preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $listMatch)) {
            $listHtml = $listMatch[1];
        }
        if (! preg_match_all('#<li>(.*?)</li>#su', $listHtml, $liBlocks)) {
            return [];
        }
        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
        foreach ($liBlocks[1] as $inner) {
            $inner = (string) $inner;
            if (! preg_match('/<div\s+class="name">\s*([^<]+?)\s*<\/div>/u', $inner, $nameMatch)) {
                continue;
            }
            // cleanText() may return null; normalise to '' so the string-typed
            // looksLikePersonName() is never handed null.
            $name = CrawlAuthorParser::cleanText($nameMatch[1]) ?? '';
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }
            $href = '';
            if (preg_match('/<a\s+[^>]*href="([^"]*)"/u', $inner, $hrefMatch)) {
                $href = (string) $hrefMatch[1];
            }
            // No usable href in the entry: look the profile URL up in the page scripts.
            $profileUrl = $this->resolveUrl($href, $sourceUrl)
                ?? $this->inferProfileUrlFromPageScripts($html, $name, $sourceUrl);
            $plain = $this->htmlToPlain($inner);
            if (! $this->matchesKeywords($plain.' '.$name, $keywords)) {
                continue;
            }
            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
            if (isset($seen[$dedupeKey])) {
                continue;
            }
            $seen[$dedupeKey] = true;
            $affiliation = $this->parseLabeledField($inner, '所在单位') ?? $collegeName;
            $academicTitle = $this->parseLabeledField($inner, '职称');
            // On list pages the "所在单位" (affiliation) field is usually the college,
            // so the university name is inferred from the site / page header instead.
            $universityName = $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation);
            $bio = $this->parseLabeledField($inner, '简介');
            $summaryParts = array_filter([
                $academicTitle ? '职称:'.$academicTitle : null,
                $affiliation ? '单位:'.$affiliation : null,
                $bio,
            ]);
            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($dedupeKey),
                name: $name,
                profileUrl: $profileUrl,
                email: null,
                affiliation: $affiliation,
                universityName: $universityName,
                summary: Str::limit(implode(';', $summaryParts), 300),
                keywords: $keywords,
                academicTitle: $academicTitle,
                platform: 'faculty_html_tsites',
                bio: $bio,
            );
        }

        return $items;
    }

    /**
     * Assemble the CrawlItemDto for one faculty member. The same person data
     * is written to `extra['lead_author']` and to the single `authorsParsed`
     * entry; $affiliation is used as both affiliation and college.
     *
     * @param list<string> $keywords
     */
    protected function makeFacultyItem(
        string $externalKey,
        string $name,
        ?string $profileUrl,
        ?string $email,
        ?string $affiliation,
        ?string $universityName,
        ?string $summary,
        array $keywords,
        ?string $academicTitle,
        string $platform,
        ?string $bio = null,
    ): CrawlItemDto {
        $college = $affiliation;
        $lead = [
            'name' => $name,
            'email' => $email,
            'affiliation' => $college,
            'college' => $college,
            'university_name' => $universityName,
            'academic_title' => $academicTitle,
            'bio' => $bio,
            'profile_url' => $profileUrl,
        ];

        return new CrawlItemDto(
            externalId: $externalKey,
            title: $name,
            canonicalUrl: $profileUrl,
            authors: $name,
            summary: $summary,
            schoolName: $universityName,
            extra: [
                'platform' => $platform,
                'academic_title' => $academicTitle,
                'college_name' => $college,
                'bio' => $bio,
                'profile_url' => $profileUrl,
                'lead_author' => $lead,
                'keyword' => implode(' ', $keywords),
            ],
            authorsParsed: [[
                'name' => $name,
                'email' => $email,
                'affiliation' => $college,
                'university_name' => $universityName,
                'academic_title' => $academicTitle,
            ]],
        );
    }

    /**
     * Whether $plain contains at least one non-empty keyword
     * (case-insensitive). An empty keyword list matches everything.
     *
     * @param list<string> $keywords
     */
    protected function matchesKeywords(string $plain, array $keywords): bool
    {
        if ($keywords === []) {
            return true;
        }
        foreach ($keywords as $kw) {
            if ($kw !== '' && stripos($plain, $kw) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Strip tags, decode entities and collapse whitespace runs to single
     * spaces. Yields '' when the whitespace regex fails.
     */
    protected function htmlToPlain(string $html): string
    {
        $plain = html_entity_decode(strip_tags($html), ENT_QUOTES | ENT_HTML5, 'UTF-8');

        return preg_replace('/\s+/u', ' ', $plain) ??
'';
    }

    /**
     * Read the text that follows "<label>:" up to the next tag.
     *
     * @return string|null the cleaned value, or null when the label is absent
     */
    protected function parseLabeledField(string $html, string $label): ?string
    {
        // Accept both the fullwidth colon ":" (U+FF1A), the usual punctuation after
        // Chinese labels, and the ASCII colon.
        $pattern = '/'.preg_quote($label, '/').'[::]\s*([^<]+)/u';
        if (! preg_match($pattern, $html, $match)) {
            return null;
        }

        return CrawlAuthorParser::cleanText($match[1]);
    }

    /**
     * Heuristic person-name check: 2-4 CJK characters (optionally followed by
     * "·" and 2-4 more), or a Latin name of 2-31 letters, spaces, dots and
     * hyphens. Navigation labels such as 首页 / 登录 / 下页 are rejected.
     */
    protected function looksLikePersonName(string $name): bool
    {
        if (preg_match('/^(首页|登录|联系我们|下页|尾页|转到)/u', $name)) {
            return false;
        }

        return (bool) preg_match('/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u', $name)
            || (bool) preg_match('/^[A-Za-z][A-Za-z\s\.\-]{1,30}$/', $name);
    }

    /**
     * Resolve $href against $baseUrl.
     *
     * Handles absolute, protocol-relative, root-relative and directory-relative
     * hrefs; "./" and "../" segments are not collapsed. When $baseUrl has no
     * scheme or host the href is returned as-is.
     *
     * @return string|null null for an empty href or a `javascript:` link
     */
    protected function resolveUrl(string $href, string $baseUrl): ?string
    {
        $href = trim(html_entity_decode($href, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
        if ($href === '' || str_starts_with($href, 'javascript:')) {
            return null;
        }
        if (preg_match('#^https?://#i', $href)) {
            return $href;
        }
        $base = parse_url($baseUrl);
        if (! is_array($base) || empty($base['scheme']) || empty($base['host'])) {
            return $href;
        }
        $origin = $base['scheme'].'://'.$base['host'];
        if (! empty($base['port'])) {
            $origin .= ':'.$base['port'];
        }
        if (str_starts_with($href, '//')) {
            return $base['scheme'].':'.$href;
        }
        if (str_starts_with($href, '/')) {
            return $origin.$href;
        }
        // Relative href: append to the directory part of the base path.
        $path = $base['path'] ?? '/';
        $dir = str_contains($path, '/') ? substr($path, 0, (int) strrpos($path, '/') + 1) : '/';

        return $origin.$dir.$href;
    }

    /**
     * Look up a person's profile URL in an inline `addimg("...", "/.../index.htm", "<name>")`
     * script call of the page.
     *
     * @return string|null the resolved URL, or null when no such call names $name
     */
    protected function inferProfileUrlFromPageScripts(string $html, string $name, string $sourceUrl): ?string
    {
        $escaped = preg_quote($name, '/');
        if (!
preg_match(
            '/addimg\(\s*(?:"[^"]*"|\'[^\']*\')\s*,\s*"(\\/[^"]+index\.htm)"\s*,\s*"'.$escaped.'"/u',
            $html,
            $match,
        )) {
            return null;
        }

        return $this->resolveUrl($match[1], $sourceUrl);
    }

    /**
     * Infer the university name: first from a table of known host names, then
     * from the first "...大学" phrase in the page text.
     *
     * @return string|null null when neither source yields a name
     */
    protected function inferUniversityFromSource(string $sourceUrl, string $html): ?string
    {
        $knownHosts = [
            'sjtu.edu.cn' => '上海交通大学',
            'tsinghua.edu.cn' => '清华大学',
            'pku.edu.cn' => '北京大学',
            'zju.edu.cn' => '浙江大学',
            'fudan.edu.cn' => '复旦大学',
        ];

        $host = parse_url($sourceUrl, PHP_URL_HOST);
        if (is_string($host)) {
            $lowerHost = strtolower($host);
            foreach ($knownHosts as $domain => $university) {
                if (str_contains($lowerHost, $domain)) {
                    return $university;
                }
            }
        }

        $pageText = $this->htmlToPlain($html);
        if (! preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,20}大学)/u', $pageText, $found)) {
            return null;
        }

        return CrawlAuthorParser::cleanText($found[1]);
    }

    /**
     * Guess a person's name: the first run of 2-4 CJK characters in the text,
     * otherwise the e-mail's local part with ".", "_" and "-" turned into
     * spaces and title-cased.
     */
    protected function guessName(string $plain, string $email): string
    {
        $cjkName = '/([\x{4e00}-\x{9fff}]{2,4})\s*(?:教授|副教授|讲师|研究员|博士|老师)?/u';
        if (preg_match($cjkName, $plain, $hit)) {
            return trim($hit[1]);
        }

        $localPart = strstr($email, '@', true) ?: '';
        $spaced = strtr($localPart, '._-', '   ');

        return Str::title(trim($spaced));
    }

    /**
     * Guess an affiliation: the first phrase ending in 大学 / 学院 / 研究院 /
     * 研究所 / University / College.
     *
     * @return string|null null when no such phrase is present
     */
    protected function guessAffiliation(string $plain): ?string
    {
        $institution = '/((?:[\x{4e00}-\x{9fff}A-Za-z\s]{2,40}(?:大学|学院|研究院|研究所|University|College)))/u';
        if (! preg_match($institution, $plain, $hit)) {
            return null;
        }

        return CrawlAuthorParser::cleanText($hit[1]);
    }
}