normalizeRequestUrl($requestUrl);
        $firstHtml = $this->fetchHtml($baseUrl);

        if ($this->isAjaxTeacherListPage($firstHtml, $requestUrl)) {
            $items = $this->fetchAjaxTeacherItems($requestUrl, $firstHtml, $keywords, $maxResults, $maxPages);

            return $this->enrichEmailsFromProfilePages($items, $params);
        }

        if ($this->isNjuTeacherHomePage($firstHtml)) {
            $items = $this->fetchNjuTeacherHomeItems($requestUrl, $firstHtml, $keywords, $maxResults, $maxPages);

            return $this->enrichEmailsFromProfilePages($items, $params);
        }

        $totalPages = $this->detectTotalPages($firstHtml);
        $pagesToFetch = min($maxPages, $totalPages);
        $merged = [];
        $seen = [];

        for ($page = 1; $page <= $pagesToFetch && count($merged) < $maxResults; $page++) {
            $html = $page === 1 ? $firstHtml : null;
            if ($html === null) {
                break;
            }

            foreach ($this->extractFromHtml($html, $keywords, $requestUrl) as $item) {
                if (isset($seen[$item->externalId])) {
                    continue;
                }
                $seen[$item->externalId] = true;
                $merged[] = $item;
                if (count($merged) >= $maxResults) {
                    break 2;
                }
            }
        }

        if ($pagesToFetch > 1 && count($merged) < $maxResults) {
            $merged = $this->fetchRemainingListPages(
                $baseUrl,
                $firstHtml,
                $pagesToFetch,
                $keywords,
                $requestUrl,
                $merged,
                $seen,
                $maxResults,
            );
        }

        return $this->enrichEmailsFromProfilePages($merged, $params);
    }

    /**
     * Fetches list pages 2..$pagesToFetch in batches and appends every item
     * whose externalId has not been seen yet to $merged.
     *
     * The batch size is read from `crawl.faculty.list_fetch_pool_size`
     * (default 5) and clamped to the range 1..10. Pages that fetchHtmlPool()
     * does not return a body for are skipped without an error. The method
     * returns as soon as $merged holds $maxResults items.
     *
     * @param string $baseUrl URL handed to buildPageUrl() for every page
     * @param string $firstHtml HTML of page 1, also handed to buildPageUrl()
     * @param int $pagesToFetch highest page number to request
     * @param list<mixed> $keywords forwarded unchanged to extractFromHtml()
     * @param string $requestUrl forwarded unchanged to extractFromHtml()
     * @param list<CrawlItemDto> $merged items collected so far
     * @param array<array-key, true> $seen externalIds already present in $merged;
     *                                     passed by value, so the caller's copy is not updated
     * @param int $maxResults upper bound on the size of the returned list
     * @return list<CrawlItemDto> $merged extended with the newly found items
     */
    protected function fetchRemainingListPages(
        string $baseUrl,
        string $firstHtml,
        int $pagesToFetch,
        array $keywords,
        string $requestUrl,
        array $merged,
        array $seen,
        int $maxResults,
    ): array {
        $poolSize = max(1, min(10, (int) config('crawl.faculty.list_fetch_pool_size', 5)));
        $pageUrls = [];
        for ($page = 2; $page <= $pagesToFetch; $page++) {
            $pageUrls[$page] = $this->buildPageUrl($baseUrl, $page, $firstHtml);
        }

        // preserve_keys = true keeps the page numbers as array keys inside each chunk.
        foreach (array_chunk($pageUrls, $poolSize, true) as $chunk) {
            $htmlByPage = $this->fetchHtmlPool($chunk);
            // Process the pages of a batch in ascending page order.
            ksort($htmlByPage);

            foreach ($htmlByPage as $html) {
                foreach ($this->extractFromHtml($html, $keywords, $requestUrl) as $item) {
                    if (isset($seen[$item->externalId])) {
                        continue;
                    }
                    $seen[$item->externalId] = true;
                    $merged[] = $item;
                    if (count($merged) >= $maxResults) {
                        return $merged;
                    }
                }
            }
        }

        return $merged;
    }

    /**
     * Requests all given page URLs through a single Http::pool() call and
     * returns the bodies of the successful, non-empty responses.
     *
     * The timeout comes from `crawl.faculty.list_http_timeout_seconds`
     * (default 20, never below 5); the connect timeout is capped at 8 seconds.
     *
     * @param array<int, string> $pageUrls page number => URL
     * @return array<int, string> page number => HTML body; pages without a
     *                            usable body are omitted
     */
    protected function fetchHtmlPool(array $pageUrls): array
    {
        if ($pageUrls === []) {
            return [];
        }

        $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
        $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

        $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($pageUrls, $headers, $timeout) {
            foreach ($pageUrls as $page => $url) {
                // Each request is registered under its page number so the response can be looked up below.
                // NOTE(review): Laravel's retry() appears to take the total number of attempts, so
                // retry(1, ...) would mean a single attempt with no retry — confirm whether 2 was intended.
                $pool->as((string) $page)
                    ->timeout($timeout)
                    ->connectTimeout(min(8, $timeout))
                    ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
                    ->withHeaders($headers)
                    ->get($url);
            }
        });

        $htmlByPage = [];
        foreach ($pageUrls as $page => $url) {
            $body = $this->responseBodyFromPoolResult($responses[(string) $page] ?? null);
            if ($body !== null && $body !== '') {
                $htmlByPage[$page] = $body;
            }
        }

        return $htmlByPage;
    }

    /**
     * Fills in missing e-mail addresses by fetching the profile page
     * (canonicalUrl) of items that do not have one yet.
     *
     * Behaviour, in order:
     * - returns $items untouched when `crawl.faculty.profile_email_enrich_enabled` is falsy;
     * - marks every item as skipped when resolveProfileEnrichMax() yields 0 or less;
     * - otherwise selects, in input order, up to that many items that have no
     *   e-mail but do have a canonicalUrl, and fetches them in pools whose size
     *   comes from `crawl.faculty.profile_enrich_pool_size` (default 8, clamped to 1..12);
     * - for every fetched body, applies the extracted e-mail (if any) and then
     *   applyProfileMetadataToItem();
     * - eligible items that were not selected are marked as skipped; all other
     *   items are returned as they were.
     *
     * The early-return paths hand back $items with its original keys, while the
     * final path builds a freshly indexed list.
     *
     * @param list<CrawlItemDto> $items
     * @param array<string, mixed> $params read by resolveProfileEnrichMax()
     * @return list<CrawlItemDto>
     */
    protected function enrichEmailsFromProfilePages(array $items, array $params = []): array
    {
        if (! config('crawl.faculty.profile_email_enrich_enabled', true)) {
            return $items;
        }

        $maxEnrich = $this->resolveProfileEnrichMax($params, count($items));
        if ($maxEnrich <= 0) {
            return $this->markProfileEnrichSkipped($items);
        }

        $poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 8)));
        $timeout = max(5, (int) config('crawl.faculty.profile_http_timeout_seconds', 10));
        $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

        // Original item index => item that will be fetched. Items that already have
        // an e-mail or lack a canonicalUrl do not consume any of the budget.
        $fetchMap = [];
        $enrichBudget = $maxEnrich;
        foreach ($items as $index => $item) {
            if ($enrichBudget <= 0) {
                break;
            }
            if ($this->itemHasEmail($item) || ! $item->canonicalUrl) {
                continue;
            }
            $fetchMap[$index] = $item;
            $enrichBudget--;
        }

        if ($fetchMap === []) {
            return $items;
        }

        // Despite its name this holds the processed items (not the raw bodies), keyed by original index.
        $fetchedBodies = [];
        foreach (array_chunk($fetchMap, $poolSize, true) as $chunk) {
            $batchPending = [];
            foreach ($chunk as $index => $item) {
                $batchPending[$index] = $item;
            }

            $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($batchPending, $headers, $timeout) {
                foreach ($batchPending as $index => $item) {
                    // NOTE(review): retry(1, ...) appears to mean one attempt in total — confirm the intent.
                    $pool->as((string) $index)
                        ->timeout($timeout)
                        ->connectTimeout(min(8, $timeout))
                        ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
                        ->withHeaders($headers)
                        ->get($item->canonicalUrl);
                }
            });

            foreach ($batchPending as $index => $item) {
                $body = $this->responseBodyFromPoolResult($responses[(string) $index] ?? null);
                if ($body !== null) {
                    $email = $this->extractEmailFromProfileHtml($body);
                    if ($email) {
                        $item = $this->applyEmailToItem($item, $email);
                    }
                    $item = $this->applyProfileMetadataToItem($item, $body);
                }
                // Recorded even when the fetch failed, so the item is not flagged as skipped below.
                $fetchedBodies[$index] = $item;
            }
        }

        $result = [];
        foreach ($items as $index => $item) {
            if (isset($fetchedBodies[$index])) {
                $result[] = $fetchedBodies[$index];
            } elseif (! $this->itemHasEmail($item) && $item->canonicalUrl) {
                // Eligible for enrichment but never selected for fetching.
                $result[] = $this->markItemProfileEnrichSkipped($item);
            } else {
                $result[] = $item;
            }
        }

        return $result;
    }

    /**
     * Determines how many profile pages may be fetched for one crawl.
     *
     * Returns 0 when $params['skip_profile_enrich'] is strictly `true`.
     * Otherwise takes $params['profile_enrich_max'], falling back to
     * `crawl.faculty.profile_enrich_max` (default 32), casts it to int and
     * clamps it to the range 0..min($itemCount, 200).
     *
     * @param array<string, mixed> $params
     * @param int $itemCount number of items in the current result set
     */
    protected function resolveProfileEnrichMax(array $params, int $itemCount): int
    {
        if (($params['skip_profile_enrich'] ?? false) === true) {
            return 0;
        }

        $configured = (int) ($params['profile_enrich_max'] ?? config('crawl.faculty.profile_enrich_max', 32));

        return max(0, min($itemCount, min(200, $configured)));
    }

    /**
     * Applies markItemProfileEnrichSkipped() to every item.
     *
     * @param list<CrawlItemDto> $items
     * @return list<CrawlItemDto>
     */
    protected function markProfileEnrichSkipped(array $items): array
    {
        return array_map(fn (CrawlItemDto $item) => $this->markItemProfileEnrichSkipped($item), $items);
    }

    /**
     * Returns a copy of the item with `extra['profile_enrich_skipped']` set to
     * true. Items that already have an e-mail are returned unchanged.
     */
    protected function markItemProfileEnrichSkipped(CrawlItemDto $item): CrawlItemDto
    {
        if ($this->itemHasEmail($item)) {
            return $item;
        }

        $extra = $item->extra;
        $extra['profile_enrich_skipped'] = true;

        // A new DTO is built instead of modifying the given one.
        return new CrawlItemDto(
            externalId: $item->externalId,
            title: $item->title,
            canonicalUrl: $item->canonicalUrl,
            authors: $item->authors,
            summary: $item->summary,
            publishedAt: $item->publishedAt,
            schoolName: $item->schoolName,
            section: $item->section,
            contentHtml: $item->contentHtml,
            extra: $extra,
            authorsParsed: $item->authorsParsed,
        );
    }

    /**
     * Returns the body of a pool result when it is a successful Response, and
     * null for anything else (a missing entry, a non-Response value or an
     * unsuccessful response).
     */
    protected function responseBodyFromPoolResult(mixed $result): ?string
    {
        if ($result instanceof Response && $result->successful()) {
            return (string) $result->body();
        }

        return null;
    }

    /**
     * Tells whether the item already carries an e-mail address, either in
     * `extra['lead_author']['email']` or in any entry of `authorsParsed`.
     * A value counts only if CrawlAuthorParser::normalizeEmail() returns a
     * truthy result for it.
     */
    protected function itemHasEmail(CrawlItemDto $item): bool
    {
        $lead = $item->extra['lead_author'] ?? null;
        if (is_array($lead) && CrawlAuthorParser::normalizeEmail($lead['email'] ?? null)) {
            return true;
        }

        foreach ($item->authorsParsed as $author) {
            if (CrawlAuthorParser::normalizeEmail($author['email'] ?? null)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Returns a copy of the item with the given e-mail stored in
     * `extra['lead_author']['email']` and in the first `authorsParsed` entry.
     *
     * When `authorsParsed` is empty, a single author entry is created from the
     * item title, the e-mail and the lead author's affiliation/university
     * (falling back to the item's schoolName).
     */
    protected function applyEmailToItem(CrawlItemDto $item, string $email): CrawlItemDto
    {
        // Keep the raw value when normalizeEmail() returns null.
        $email = CrawlAuthorParser::normalizeEmail($email) ?? $email;
        $lead = is_array($item->extra['lead_author'] ?? null) ? $item->extra['lead_author'] : [];
        $lead['email'] = $email;

        $authorsParsed = $item->authorsParsed;
        if ($authorsParsed === []) {
            $authorsParsed = [[
                'name' => $item->title,
                'email' => $email,
                'affiliation' => $lead['affiliation'] ?? $lead['college'] ?? null,
                'university_name' => $lead['university_name'] ?? $item->schoolName,
            ]];
        } else {
            $authorsParsed[0]['email'] = $email;
        }

        $extra = $item->extra;
        $extra['lead_author'] = $lead;

        return new CrawlItemDto(
            externalId: $item->externalId,
            title: $item->title,
            canonicalUrl: $item->canonicalUrl,
            authors: $item->authors,
            summary: $item->summary,
            publishedAt: $item->publishedAt,
            schoolName: $item->schoolName,
            section: $item->section,
            contentHtml: $item->contentHtml,
            extra: $extra,
            authorsParsed: $authorsParsed,
        );
    }

    /**
     * Picks the most plausible e-mail address out of a profile page.
     *
     * 1. Tries the labelled patterns in the listed order; each pattern looks at
     *    its first match only, and the first address that normalises and is
     *    not a noise address wins.
     * 2. Otherwise collects every address-like string in the HTML, drops
     *    noise addresses and duplicates, and prefers the first one ending in
     *    `.edu.cn` or `.edu`.
     * 3. Otherwise returns the first remaining candidate, or null if there is none.
     */
    protected function extractEmailFromProfileHtml(string $html): ?string
    {
        $labeledPatterns = [
            '/电子邮箱[::]\s*<\/strong>\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/电子邮箱[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/电子信箱[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/E-?mail[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/iu',
            '/邮箱[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/电子邮件[::]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
        ];

        foreach ($labeledPatterns as $pattern) {
            if (preg_match($pattern, $html, $match)) {
                $email = CrawlAuthorParser::normalizeEmail($match[1]);
                if ($email && ! $this->isNoiseEmail($email)) {
                    return $email;
                }
            }
        }

        // No labelled address found: fall back to every address-like string on the page.
        $candidates = [];
        if (preg_match_all(
            '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
            $html,
            $emailMatches,
        )) {
            foreach ($emailMatches[1] as $raw) {
                $email = CrawlAuthorParser::normalizeEmail($raw);
                if ($email && ! $this->isNoiseEmail($email)) {
                    $candidates[] = $email;
                }
            }
        }

        if ($candidates === []) {
            return null;
        }

        $candidates = array_values(array_unique($candidates));
        foreach ($candidates as $email) {
            if (str_ends_with($email, '.edu.cn') || str_ends_with($email, '.edu')) {
                return $email;
            }
        }

        return $candidates[0];
    }

    /**
     * Tells whether the local part of the address is exactly one of the listed
     * generic mailbox names (noreply, admin, webmaster, ...), ignoring case.
     */
    protected function isNoiseEmail(string $email): bool
    {
        return (bool) preg_match(
            '/^(noreply|no-reply|admin|webmaster|postmaster|root|support|service|info|contact)@/i',
            $email,
        );
    }

    /**
     * Fetches one page with a GET request and returns its body.
     *
     * The timeout comes from `crawl.faculty.list_http_timeout_seconds`
     * (default 20, never below 5); the connect timeout is capped at 8 seconds.
     *
     * @throws \RuntimeException when the response is not successful
     */
    protected function fetchHtml(string $url): string
    {
        $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));

        // NOTE(review): retry(1, ...) appears to mean one attempt in total — confirm the intent.
        $response = Http::timeout($timeout)
            ->connectTimeout(min(8, $timeout))
            ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
            ->withHeaders(['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'])
            ->get($url);

        if (! $response->successful()) {
            throw new \RuntimeException('页面请求失败(HTTP '.$response->status().'):'.$url);
        }

        return (string) $response->body();
    }

    protected function detectTotalPages(string $html): int
    {
        if (preg_match('/totalpage=(\d+)/i', $html, $match)) {
            return max(1, (int) $match[1]);
        }

        if (preg_match('/共\s*(\d+)\s*条/u', $html, $countMatch)) {
            $perPage = 0;
            if (preg_match('/]*>\s*