normalizeRequestUrl($requestUrl);
        $firstHtml = $this->fetchHtml($baseUrl);
        $totalPages = $this->detectTotalPages($firstHtml);
        $pagesToFetch = min($maxPages, $totalPages);
        $merged = [];
        $seen = [];
        for ($page = 1; $page <= $pagesToFetch; $page++) {
            $html = $page === 1 ? $firstHtml : $this->fetchHtml($this->buildPageUrl($baseUrl, $page, $firstHtml));
            foreach ($this->extractFromHtml($html, $keywords, $requestUrl) as $item) {
                if (isset($seen[$item->externalId])) {
                    continue;
                }
                $seen[$item->externalId] = true;
                $merged[] = $item;
                if (count($merged) >= $maxResults) {
                    break 2;
                }
            }
        }

        return $this->enrichEmailsFromProfilePages($merged);
    }

    /**
     * Fills in missing e-mail addresses by fetching each item's profile page.
     *
     * Items that already carry an e-mail, or that have no canonical URL, are
     * passed through untouched. The remaining items are fetched concurrently in
     * chunks of `crawl.faculty.profile_enrich_pool_size` (clamped to 1..12) and
     * the address extracted from the profile HTML, if any, is applied.
     *
     * Within a chunk, pass-through items are emitted before fetched ones, so the
     * output order can differ slightly from the input order.
     *
     * @param list<CrawlItemDto> $items
     * @return list<CrawlItemDto>
     */
    protected function enrichEmailsFromProfilePages(array $items): array
    {
        if (! config('crawl.faculty.profile_email_enrich_enabled', true)) {
            return $items;
        }

        $poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 6)));
        $timeout = max(8, (int) config('crawl.faculty.profile_http_timeout_seconds', 20));
        $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

        $enriched = [];
        foreach (array_chunk($items, $poolSize) as $chunk) {
            $pending = [];
            foreach ($chunk as $item) {
                if ($this->itemHasEmail($item) || ! $item->canonicalUrl) {
                    $enriched[] = $item;

                    continue;
                }
                $pending[$item->externalId] = $item;
            }

            if ($pending === []) {
                continue;
            }

            $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($pending, $headers, $timeout) {
                foreach ($pending as $externalId => $item) {
                    // Array keys that look numeric come back as ints; the pool key is a string.
                    $pool->as((string) $externalId)
                        ->timeout($timeout)
                        ->withHeaders($headers)
                        ->get($item->canonicalUrl);
                }
            });

            foreach ($pending as $externalId => $item) {
                $response = $responses[$externalId] ?? null;

                // A pooled request that fails at the connection level is reported as an
                // exception object in the result array, not as a Response. Enrichment is
                // best-effort, so anything that is not a real Response is simply skipped.
                if ($response instanceof \Illuminate\Http\Client\Response && $response->successful()) {
                    $email = $this->extractEmailFromProfileHtml((string) $response->body());
                    if ($email) {
                        $item = $this->applyEmailToItem($item, $email);
                    }
                }

                $enriched[] = $item;
            }
        }

        return $enriched;
    }

    /**
     * Tells whether the item already has a usable e-mail, either on
     * `extra['lead_author']` or on any entry of `authorsParsed`.
     */
    protected function itemHasEmail(CrawlItemDto $item): bool
    {
        $lead = $item->extra['lead_author'] ?? null;
        if (is_array($lead) && CrawlAuthorParser::normalizeEmail($lead['email'] ?? null)) {
            return true;
        }

        foreach ($item->authorsParsed as $author) {
            if (CrawlAuthorParser::normalizeEmail($author['email'] ?? null)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Returns a copy of the item with the e-mail written to
     * `extra['lead_author']['email']` and to the first parsed author.
     *
     * When the item has no parsed authors, a single author entry is synthesised
     * from the item title and the lead-author data.
     */
    protected function applyEmailToItem(CrawlItemDto $item, string $email): CrawlItemDto
    {
        // Fall back to the raw value when the normaliser rejects it.
        $email = CrawlAuthorParser::normalizeEmail($email) ?? $email;

        $lead = is_array($item->extra['lead_author'] ?? null) ? $item->extra['lead_author'] : [];
        $lead['email'] = $email;

        $authorsParsed = $item->authorsParsed;
        if ($authorsParsed === []) {
            $authorsParsed = [[
                'name' => $item->title,
                'email' => $email,
                'affiliation' => $lead['affiliation'] ?? $lead['college'] ?? null,
                'university_name' => $lead['university_name'] ?? $item->schoolName,
            ]];
        } else {
            $authorsParsed[0]['email'] = $email;
        }

        $extra = $item->extra;
        $extra['lead_author'] = $lead;

        return new CrawlItemDto(
            externalId: $item->externalId,
            title: $item->title,
            canonicalUrl: $item->canonicalUrl,
            authors: $item->authors,
            summary: $item->summary,
            publishedAt: $item->publishedAt,
            schoolName: $item->schoolName,
            section: $item->section,
            contentHtml: $item->contentHtml,
            extra: $extra,
            authorsParsed: $authorsParsed,
        );
    }

    /**
     * Picks the most plausible contact e-mail from a profile page.
     *
     * Addresses that directly follow an e-mail label are preferred. Otherwise
     * every address on the page is collected, generic mailboxes are dropped, and
     * an academic (.edu.cn / .edu) address wins over the first remaining one.
     *
     * @return string|null The chosen address, or null when none qualifies.
     */
    protected function extractEmailFromProfileHtml(string $html): ?string
    {
        // The label may be followed by an ASCII or a full-width colon.
        $labeledPatterns = [
            '/电子邮箱[::]\s*<\/strong>\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/电子邮箱[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/电子信箱[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/E-?mail[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/iu',
            '/邮箱[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
        ];

        foreach ($labeledPatterns as $pattern) {
            if (preg_match($pattern, $html, $match)) {
                $email = CrawlAuthorParser::normalizeEmail($match[1]);
                if ($email && ! $this->isNoiseEmail($email)) {
                    return $email;
                }
            }
        }

        $candidates = [];
        if (preg_match_all(
            '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
            $html,
            $emailMatches,
        )) {
            foreach ($emailMatches[1] as $raw) {
                $email = CrawlAuthorParser::normalizeEmail($raw);
                if ($email && ! $this->isNoiseEmail($email)) {
                    $candidates[] = $email;
                }
            }
        }

        if ($candidates === []) {
            return null;
        }

        $candidates = array_values(array_unique($candidates));
        foreach ($candidates as $email) {
            if (str_ends_with($email, '.edu.cn') || str_ends_with($email, '.edu')) {
                return $email;
            }
        }

        return $candidates[0];
    }

    /**
     * Tells whether the address belongs to a generic mailbox (noreply, admin,
     * webmaster, ...) rather than to a person.
     */
    protected function isNoiseEmail(string $email): bool
    {
        return (bool) preg_match(
            '/^(noreply|no-reply|admin|webmaster|postmaster|root|support|service|info|contact)@/i',
            $email,
        );
    }

    /**
     * Downloads a page and returns its body.
     *
     * @throws \RuntimeException When the response status is not successful.
     */
    protected function fetchHtml(string $url): string
    {
        $response = Http::timeout(30)
            ->withHeaders(['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'])
            ->get($url);

        if (! $response->successful()) {
            throw new \RuntimeException('页面请求失败(HTTP '.$response->status().'):'.$url);
        }

        return (string) $response->body();
    }

    /**
     * Works out how many result pages the listing has.
     *
     * An explicit `totalpage=N` marker is preferred. Failing that, the total
     * record count is divided by the number of entries found on the first page.
     * Returns 1 when neither can be determined.
     */
    protected function detectTotalPages(string $html): int
    {
        if (preg_match('/totalpage=(\d+)/i', $html, $match)) {
            return max(1, (int) $match[1]);
        }

        if (preg_match('/共\s*(\d+)\s*条/u', $html, $countMatch)) {
            $perPage = 0;
            // NOTE(review): the two patterns below look damaged. The first starts with a
            // stray "]*>" and contains a literal line break, and the second is empty, so it
            // matches at every character position instead of counting list entries.
            // Presumably HTML tag literals (a list container and an "li" tag) were stripped
            // from this file at some point; restore them from the upstream source.
            if (preg_match('/]*>\s*
    (.*?)<\/ul>\s*<\/div>/su', $html, $listMatch)) {
                $perPage = preg_match_all('//u', $listMatch[1]) ?: 0;
            }

            if ($perPage > 0) {
                return max(1, (int) ceil(((int) $countMatch[1]) / $perPage));
            }
        }

        return 1;
    }

    /**
     * Builds the URL of a given result page by setting the `PAGENUM` query
     * parameter (and `totalpage`, when the first page advertises one).
     *
     * Returns the base URL unchanged when it has no scheme or host.
     */
    protected function buildPageUrl(string $baseUrl, int $page, string $firstPageHtml): string
    {
        $parts = parse_url($baseUrl);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return $baseUrl;
        }

        parse_str((string) ($parts['query'] ?? ''), $query);
        $query['PAGENUM'] = (string) $page;
        if (preg_match('/totalpage=(\d+)/i', $firstPageHtml, $totalMatch)) {
            $query['totalpage'] = $totalMatch[1];
        }

        $url = $parts['scheme'].'://'.$parts['host'];
        if (! empty($parts['port'])) {
            $url .= ':'.$parts['port'];
        }
        $url .= $parts['path'] ?? '/';
        if ($query !== []) {
            $url .= '?'.http_build_query($query);
        }

        return $url;
    }

    /**
     * Rebuilds the URL from scheme, host, port, path and query only; any other
     * component (such as a fragment) is dropped.
     *
     * Returns the input unchanged when it has no scheme or host.
     */
    protected function normalizeRequestUrl(string $url): string
    {
        $parts = parse_url($url);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return $url;
        }

        $normalized = $parts['scheme'].'://'.$parts['host'];
        if (! empty($parts['port'])) {
            $normalized .= ':'.$parts['port'];
        }
        $normalized .= $parts['path'] ?? '/';
        if (! empty($parts['query'])) {
            $normalized .= '?'.$parts['query'];
        }

        return $normalized;
    }

    /**
     * Extracts faculty items from a listing page, trying the e-mail based
     * strategy first and the structured-list strategy only when it finds nothing.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromHtml(string $html, array $keywords, string $sourceUrl): array
    {
        $items = $this->extractFromEmailBlocks($html, $keywords, $sourceUrl);
        if ($items !== []) {
            return $items;
        }

        return $this->extractFromStructuredFacultyList($html, $keywords, $sourceUrl);
    }

    /**
     * Builds one item per distinct e-mail address found in the page, using the
     * text surrounding the address to guess the person's name and affiliation.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromEmailBlocks(string $html, array $keywords, string $sourceUrl): array
    {
        $items = [];
        $seen = [];

        if (! preg_match_all(
            '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
            $html,
            $emailMatches,
            PREG_OFFSET_CAPTURE
        )) {
            return [];
        }

        foreach ($emailMatches[1] as $match) {
            $email = CrawlAuthorParser::normalizeEmail($match[0]);
            if (! $email || isset($seen[$email])) {
                continue;
            }

            // Context window: up to 400 bytes before the address, 800 bytes in total.
            // mb_strcut() works on byte offsets like substr() but never splits a
            // multi-byte character. A split character would make the window invalid
            // UTF-8, and htmlToPlain() would then return an empty string.
            $pos = (int) $match[1];
            $window = mb_strcut($html, max(0, $pos - 400), 800, 'UTF-8');
            $plain = $this->htmlToPlain($window);

            if (! $this->matchesKeywords($plain, $keywords)) {
                continue;
            }

            $name = $this->guessName($plain, $email);
            if ($name === '') {
                continue;
            }

            $affiliation = $this->guessAffiliation($plain);
            $seen[$email] = true;

            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($email),
                name: $name,
                profileUrl: $sourceUrl,
                email: $email,
                affiliation: $affiliation,
                universityName: CrawlAuthorParser::universityFromAffiliation($affiliation)
                    ?? $this->inferUniversityFromSource($sourceUrl, $html),
                summary: Str::limit($plain, 300),
                keywords: $keywords,
                academicTitle: null,
                platform: 'faculty_html',
            );
        }

        return $items;
    }

    /**
     * Handles the tsites.CollegeTeacherList layout used by Shanghai Jiao Tong
     * University and similar sites: div.list > ul > li cards.
     *
     * Items produced here carry no e-mail; the profile URL is kept so that the
     * address can be looked up on the profile page later.
     *
     * @param list<string> $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromStructuredFacultyList(string $html, array $keywords, string $sourceUrl): array
    {
        $items = [];
        $seen = [];

        // NOTE(review): several patterns in this method look damaged. They start with a
        // stray "]*" or "\s*" where an opening tag would be expected, or contain literal
        // line breaks and bullet characters. Presumably HTML tag literals were stripped
        // from this file at some point. They are kept byte-for-byte here because the
        // missing tag and class names cannot be recovered from this file alone; restore
        // them from the upstream source.
        $collegeName = null;
        if (preg_match('/\s*([^<]+?)\s*<\/div>/u', $html, $collegeMatch)) {
            $collegeName = CrawlAuthorParser::cleanText($collegeMatch[1]);
        }

        $listHtml = $html;
        if (preg_match('/]*>\s*
      (.*?)<\/ul>\s*<\/div>/su', $html, $listMatch)) {
            $listHtml = $listMatch[1];
        }

        if (! preg_match_all('#
    • (.*?)
    • #su', $listHtml, $liBlocks)) {
            return [];
        }

        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);

        foreach ($liBlocks[1] as $inner) {
            $inner = (string) $inner;

            if (! preg_match('/\s*([^<]+?)\s*<\/div>/u', $inner, $nameMatch)) {
                continue;
            }

            $name = CrawlAuthorParser::cleanText($nameMatch[1]);
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }

            $href = '';
            if (preg_match('/]*href="([^"]*)"/u', $inner, $hrefMatch)) {
                $href = (string) $hrefMatch[1];
            }

            // Prefer the card's own link; fall back to the URL registered in page scripts.
            $profileUrl = $this->resolveUrl($href, $sourceUrl)
                ?? $this->inferProfileUrlFromPageScripts($html, $name, $sourceUrl);

            $plain = $this->htmlToPlain($inner);
            if (! $this->matchesKeywords($plain.' '.$name, $keywords)) {
                continue;
            }

            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
            if (isset($seen[$dedupeKey])) {
                continue;
            }
            $seen[$dedupeKey] = true;

            $affiliation = $this->parseLabeledField($inner, '所在单位') ?? $collegeName;
            $academicTitle = $this->parseLabeledField($inner, '职称');

            // On list pages the affiliation field is mostly the college, so the
            // university name is taken from the site / page header first.
            $universityName = $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation);

            $summaryParts = array_filter([
                $academicTitle ? '职称:'.$academicTitle : null,
                $affiliation ? '单位:'.$affiliation : null,
                $this->parseLabeledField($inner, '简介'),
            ]);

            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($dedupeKey),
                name: $name,
                profileUrl: $profileUrl,
                email: null,
                affiliation: $affiliation,
                universityName: $universityName,
                summary: Str::limit(implode(';', $summaryParts), 300),
                keywords: $keywords,
                academicTitle: $academicTitle,
                platform: 'faculty_html_tsites',
            );
        }

        return $items;
    }

    /**
     * Assembles a CrawlItemDto for one faculty member. The same person data is
     * written to `extra['lead_author']` and to the single `authorsParsed` entry.
     *
     * @param list<string> $keywords
     */
    protected function makeFacultyItem(
        string $externalKey,
        string $name,
        ?string $profileUrl,
        ?string $email,
        ?string $affiliation,
        ?string $universityName,
        ?string $summary,
        array $keywords,
        ?string $academicTitle,
        string $platform,
    ): CrawlItemDto {
        // The affiliation doubles as the college name in every output field.
        $college = $affiliation;

        $lead = [
            'name' => $name,
            'email' => $email,
            'affiliation' => $college,
            'college' => $college,
            'university_name' => $universityName,
            'academic_title' => $academicTitle,
            'profile_url' => $profileUrl,
        ];

        return new CrawlItemDto(
            externalId: $externalKey,
            title: $name,
            canonicalUrl: $profileUrl,
            authors: $name,
            summary: $summary,
            schoolName: $universityName,
            extra: [
                'platform' => $platform,
                'academic_title' => $academicTitle,
                'college_name' => $college,
                'profile_url' => $profileUrl,
                'lead_author' => $lead,
                'keyword' => implode(' ', $keywords),
            ],
            authorsParsed: [[
                'name' => $name,
                'email' => $email,
                'affiliation' => $college,
                'university_name' => $universityName,
                'academic_title' => $academicTitle,
            ]],
        );
    }

    /**
     * Tells whether the text contains at least one non-empty keyword
     * (case-insensitive). An empty keyword list matches everything.
     *
     * @param list<string> $keywords
     */
    protected function matchesKeywords(string $plain, array $keywords): bool
    {
        if ($keywords === []) {
            return true;
        }

        foreach ($keywords as $kw) {
            if ($kw !== '' && stripos($plain, $kw) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Strips tags, decodes HTML entities and collapses whitespace runs to a
     * single space. Returns an empty string when the whitespace regex fails
     * (for example on invalid UTF-8).
     */
    protected function htmlToPlain(string $html): string
    {
        $plain = html_entity_decode(strip_tags($html), ENT_QUOTES | ENT_HTML5, 'UTF-8');

        return preg_replace('/\s+/u', ' ', $plain) ?? '';
    }

    /**
     * Reads the text that follows "<label>:" up to the next tag.
     *
     * @return string|null The cleaned value, or null when the label is absent.
     */
    protected function parseLabeledField(string $html, string $label): ?string
    {
        // The label may be followed by an ASCII or a full-width colon.
        $pattern = '/'.preg_quote($label, '/').'[::]\s*([^<]+)/u';
        if (! preg_match($pattern, $html, $match)) {
            return null;
        }

        return CrawlAuthorParser::cleanText($match[1]);
    }

    /**
     * Heuristic filter for person names: 2-4 CJK characters (optionally followed
     * by a middle dot and 2-4 more), or a short Latin-letter name. Navigation
     * labels such as "首页" or "登录" are rejected first.
     */
    protected function looksLikePersonName(string $name): bool
    {
        if (preg_match('/^(首页|登录|联系我们|下页|尾页|转到)/u', $name)) {
            return false;
        }

        return (bool) preg_match('/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u', $name)
            || (bool) preg_match('/^[A-Za-z][A-Za-z\s\.\-]{1,30}$/', $name);
    }

    /**
     * Resolves an href against the page URL.
     *
     * Handles absolute, protocol-relative, root-relative and directory-relative
     * links; dot segments are not collapsed. Returns null for hrefs that do not
     * point to another page (empty, fragment-only, javascript:, mailto:, tel:,
     * data:). When the base URL has no scheme or host, the decoded href is
     * returned as is.
     */
    protected function resolveUrl(string $href, string $baseUrl): ?string
    {
        $href = trim(html_entity_decode($href, ENT_QUOTES | ENT_HTML5, 'UTF-8'));

        // Scheme names are case-insensitive, and none of these lead to a profile page.
        if ($href === '' || $href[0] === '#' || preg_match('#^(?:javascript|mailto|tel|data):#i', $href)) {
            return null;
        }

        if (preg_match('#^https?://#i', $href)) {
            return $href;
        }

        $base = parse_url($baseUrl);
        if (! is_array($base) || empty($base['scheme']) || empty($base['host'])) {
            return $href;
        }

        $origin = $base['scheme'].'://'.$base['host'];
        if (! empty($base['port'])) {
            $origin .= ':'.$base['port'];
        }

        if (str_starts_with($href, '//')) {
            return $base['scheme'].':'.$href;
        }

        if (str_starts_with($href, '/')) {
            return $origin.$href;
        }

        // Relative link: resolve against the directory of the base path.
        $path = $base['path'] ?? '/';
        $dir = str_contains($path, '/') ? substr($path, 0, (int) strrpos($path, '/') + 1) : '/';

        return $origin.$dir.$href;
    }

    /**
     * Looks for an `addimg(<quoted string>, "/.../index.htm", "<name>"` call in
     * the page and resolves its second argument as the profile URL for that name.
     *
     * @return string|null The resolved URL, or null when no such call is found.
     */
    protected function inferProfileUrlFromPageScripts(string $html, string $name, string $sourceUrl): ?string
    {
        $escaped = preg_quote($name, '/');
        if (! preg_match(
            '/addimg\(\s*(?:"[^"]*"|\'[^\']*\')\s*,\s*"(\\/[^"]+index\.htm)"\s*,\s*"'.$escaped.'"/u',
            $html,
            $match,
        )) {
            return null;
        }

        return $this->resolveUrl($match[1], $sourceUrl);
    }

    /**
     * Guesses the university name, first from a table of known host names and
     * then from the first "...大学" phrase in the page text.
     */
    protected function inferUniversityFromSource(string $sourceUrl, string $html): ?string
    {
        $host = parse_url($sourceUrl, PHP_URL_HOST);
        if (is_string($host)) {
            $host = strtolower($host);

            // Checked in order; the first host fragment contained in $host wins.
            $knownHosts = [
                'sjtu.edu.cn' => '上海交通大学',
                'tsinghua.edu.cn' => '清华大学',
                'pku.edu.cn' => '北京大学',
                'zju.edu.cn' => '浙江大学',
                'fudan.edu.cn' => '复旦大学',
            ];
            foreach ($knownHosts as $fragment => $university) {
                if (str_contains($host, $fragment)) {
                    return $university;
                }
            }
        }

        if (preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,20}大学)/u', $this->htmlToPlain($html), $match)) {
            return CrawlAuthorParser::cleanText($match[1]);
        }

        return null;
    }

    /**
     * Guesses a person's name: the first run of 2-4 CJK characters in the text,
     * otherwise the e-mail local part with separators turned into spaces and
     * title-cased.
     */
    protected function guessName(string $plain, string $email): string
    {
        if (preg_match('/([\x{4e00}-\x{9fff}]{2,4})\s*(?:教授|副教授|讲师|研究员|博士|老师)?/u', $plain, $m)) {
            return trim($m[1]);
        }

        $local = strstr($email, '@', true) ?: '';
        $local = str_replace(['.', '_', '-'], ' ', $local);

        return Str::title(trim($local));
    }

    /**
     * Guesses an affiliation: the first phrase ending in 大学, 学院, 研究院, 研究所,
     * University or College.
     */
    protected function guessAffiliation(string $plain): ?string
    {
        if (preg_match('/((?:[\x{4e00}-\x{9fff}A-Za-z\s]{2,40}(?:大学|学院|研究院|研究所|University|College)))/u', $plain, $m)) {
            return CrawlAuthorParser::cleanText($m[1]);
        }

        return null;
    }
}