master
lion 1 day ago
parent f9256f17bb
commit d3418d2d3c

@ -24,6 +24,13 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
$baseUrl = $this->normalizeRequestUrl($requestUrl);
$firstHtml = $this->fetchHtml($baseUrl);
if ($this->isSaisAjaxFacultyPage($firstHtml, $requestUrl)) {
$items = $this->fetchSaisFacultyItems($requestUrl, $firstHtml, $keywords, $maxResults);
return $this->enrichEmailsFromProfilePages($items, $params);
}
$totalPages = $this->detectTotalPages($firstHtml);
$pagesToFetch = min($maxPages, $totalPages);
@ -342,6 +349,7 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
'/电子信箱[:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
'/E-?mail[:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/iu',
'/邮箱[:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
'/电子邮件[:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
];
foreach ($labeledPatterns as $pattern) {
@ -490,6 +498,186 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
return $this->extractFromStaffPanelList($html, $keywords, $sourceUrl);
}
/**
 * Decide whether a faculty listing is served through the SAIS AJAX teacher-list endpoint.
 *
 * The page qualifies when the request URL (case-insensitively) or the page
 * markup mentions the AJAX endpoint, or when the host contains
 * "sais.sjtu.edu.cn" and the URL contains "faculty".
 *
 * @param string $html      Markup of the first fetched page.
 * @param string $sourceUrl URL the page was requested from.
 */
protected function isSaisAjaxFacultyPage(string $html, string $sourceUrl): bool
{
    $normalizedUrl = strtolower($sourceUrl);

    // Either the URL itself or the markup points at the AJAX endpoint.
    $mentionsAjaxEndpoint = str_contains($normalizedUrl, 'ajax_teacher_list')
        || str_contains($html, 'ajax_teacher_list.html');
    if ($mentionsAjaxEndpoint) {
        return true;
    }

    // Otherwise fall back to recognising the SAIS host together with a faculty path.
    $hostName = strtolower((string) parse_url($sourceUrl, PHP_URL_HOST));
    if (! str_contains($hostName, 'sais.sjtu.edu.cn')) {
        return false;
    }

    return str_contains($normalizedUrl, 'faculty');
}
/**
 * Fetch faculty entries from the SAIS AJAX teacher-list endpoint.
 *
 * The endpoint URL and category parameters are read from the listing page,
 * a form-encoded POST is sent to the endpoint, and the HTML fragment found
 * under the "content" key of the JSON response is parsed together with the
 * page markup.
 *
 * @param string       $requestUrl URL of the listing page; also used to resolve relative links.
 * @param string       $pageHtml   Markup of the listing page.
 * @param list<string> $keywords   Search terms; joined with spaces and sent as the "search" field.
 * @param int          $maxResults Upper bound on the number of returned items.
 * @return list<CrawlItemDto>
 *
 * @throws \RuntimeException When the endpoint answers with a non-successful status
 *                           or the body does not decode to a JSON array/object.
 */
protected function fetchSaisFacultyItems(
    string $requestUrl,
    string $pageHtml,
    array $keywords,
    int $maxResults,
): array {
    $config = $this->parseSaisAjaxConfig($pageHtml, $requestUrl);
    $search = implode(' ', $keywords);
    // The request sends type "2" together with a search term and type "1" otherwise.
    $type = $search !== '' ? '2' : '1';
    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));

    $response = Http::timeout($timeout)
        ->connectTimeout(min(8, $timeout))
        // retry() takes the TOTAL number of attempts, so 2 means "one retry after
        // a connection failure"; a value of 1 would never retry at all.
        ->retry(
            2,
            300,
            static fn ($exception): bool => $exception instanceof \Illuminate\Http\Client\ConnectionException,
            throw: false,
        )
        ->withHeaders([
            'User-Agent' => 'SlakeSchool-Crawler/1.0',
            'Accept' => 'application/json, text/html',
        ])
        ->asForm()
        ->post($config['api_url'], [
            'cat_id' => $config['cat_id'],
            'cat_code' => $config['cat_code'],
            'type' => $type,
            'zm' => $search === '' ? 'All' : '',
            'search' => $search,
        ]);

    if (! $response->successful()) {
        throw new \RuntimeException('SAIS 教师列表接口请求失败HTTP '.$response->status());
    }

    $payload = $response->json();
    if (! is_array($payload)) {
        throw new \RuntimeException('SAIS 教师列表接口返回格式异常');
    }

    // A nested array/object under "content" cannot be cast to string without a
    // warning; treat it the same as a missing fragment.
    $rawContent = $payload['content'] ?? '';
    if (! is_scalar($rawContent)) {
        return [];
    }
    $content = (string) $rawContent;
    if ($content === '') {
        return [];
    }

    // The page markup is prepended so title-based inference still sees the <title>.
    $items = $this->extractFromSaisJsList($pageHtml.$content, $keywords, $requestUrl);
    if (count($items) > $maxResults) {
        $items = array_slice($items, 0, $maxResults);
    }

    return $items;
}
/**
 * Read the AJAX request settings embedded in the listing page's script.
 *
 * Each value falls back to a built-in default when the page does not expose
 * it. A discovered endpoint URL is entity-decoded and resolved against the
 * source URL; a result that still starts with "/" is prefixed with the
 * source URL's origin.
 *
 * @return array{cat_id:string,cat_code:string,api_url:string}
 */
protected function parseSaisAjaxConfig(string $html, string $sourceUrl): array
{
    // Defaults used whenever the markup does not declare a value.
    $settings = [
        'cat_id' => '18',
        'cat_code' => 'faculty',
        'api_url' => 'https://sais.sjtu.edu.cn/active/ajax_teacher_list.html',
    ];

    // Simple single-quoted "key: 'value'" captures.
    $capturePatterns = [
        'cat_id' => "/cat_id\s*:\s*'(\d+)'/i",
        'cat_code' => "/cat_code\s*:\s*'([^']+)'/i",
    ];
    foreach ($capturePatterns as $field => $regex) {
        if (preg_match($regex, $html, $hit)) {
            $settings[$field] = $hit[1];
        }
    }

    if (preg_match("#url\s*:\s*'([^']*ajax_teacher_list[^']*)'#i", $html, $hit)) {
        $declaredUrl = html_entity_decode($hit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $settings['api_url'] = $this->resolveUrl($declaredUrl, $sourceUrl) ?? $settings['api_url'];
    }

    // A root-relative endpoint needs the scheme/host/port of the page it came from.
    $origin = $this->requestOrigin($sourceUrl);
    if ($origin !== null && str_starts_with($settings['api_url'], '/')) {
        $settings['api_url'] = $origin.$settings['api_url'];
    }

    return $settings;
}
/**
 * Build faculty items from anchors in a SAIS teacher list.
 *
 * Only anchors that carry a double-quoted class attribute containing the
 * word "name", a double-quoted href containing "/faculty/", and link text
 * accepted by looksLikePersonName() are kept. Entries are de-duplicated by
 * resolved profile URL (or by a hash of the name when no URL resolves) and
 * filtered through matchesKeywords().
 *
 * @param list<string> $keywords
 * @return list<CrawlItemDto>
 */
protected function extractFromSaisJsList(string $html, array $keywords, string $sourceUrl): array
{
    $university = $this->inferUniversityFromSource($sourceUrl, $html);
    $college = $this->inferCollegeFromPageTitle($html);

    // Anchors whose content is plain text (no nested tags).
    $anchorCount = preg_match_all('#<a\b([^>]*?)>([^<]+)</a>#su', $html, $anchors, PREG_SET_ORDER);
    if (! $anchorCount) {
        return [];
    }

    $collected = [];
    $visited = [];

    foreach ($anchors as $anchor) {
        $attributeText = (string) $anchor[1];

        $hasNameClass = preg_match('/\bclass="[^"]*\bname\b[^"]*"/u', $attributeText);
        if (! $hasNameClass || ! preg_match('#\bhref="([^"]+)"#u', $attributeText, $hrefHit)) {
            continue;
        }

        $personName = CrawlAuthorParser::cleanText($anchor[2]) ?? '';
        if ($personName === '' || ! $this->looksLikePersonName($personName)) {
            continue;
        }

        $link = html_entity_decode($hrefHit[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        if (! str_contains(strtolower($link), '/faculty/')) {
            continue;
        }

        $profileUrl = $this->resolveUrl($link, $sourceUrl);
        // Fall back to a name hash when the link cannot be resolved.
        $identity = $profileUrl ?: ('name:'.md5($personName));
        if (isset($visited[$identity])) {
            continue;
        }

        $searchable = trim($personName.' '.($college ?? ''));
        if (! $this->matchesKeywords($searchable, $keywords)) {
            continue;
        }

        // Mark as seen only once the entry has passed the keyword filter.
        $visited[$identity] = true;
        $collected[] = $this->makeFacultyItem(
            externalKey: 'faculty:'.md5($identity),
            name: $personName,
            profileUrl: $profileUrl,
            email: null,
            affiliation: $college,
            universityName: $university ?? CrawlAuthorParser::universityFromAffiliation($college),
            summary: $college ? '单位:'.$college : null,
            keywords: $keywords,
            academicTitle: null,
            platform: 'faculty_html_sais',
            bio: null,
        );
    }

    return $collected;
}
/**
 * Return the "scheme://host[:port]" prefix of a URL.
 *
 * @return string|null Null when the URL cannot be parsed or has no scheme or host.
 */
protected function requestOrigin(string $sourceUrl): ?string
{
    $parsed = parse_url($sourceUrl);
    if (! is_array($parsed)) {
        return null;
    }

    $scheme = $parsed['scheme'] ?? null;
    $host = $parsed['host'] ?? null;
    if (! $scheme || ! $host) {
        return null;
    }

    // Only append the port when the URL names one explicitly.
    $portSuffix = empty($parsed['port']) ? '' : ':'.$parsed['port'];

    return $scheme.'://'.$host.$portSuffix;
}
/**
* @param list<string> $keywords
* @return list<CrawlItemDto>

@ -210,6 +210,34 @@ HTML;
$this->assertSame('塑性成形技术与装备研究院', $item->extra['college_name']);
}
/**
 * extractFromSaisJsList() turns "name" anchors from an AJAX fragment into
 * faculty items carrying the SAIS platform tag and the inferred school name.
 */
public function test_extracts_sais_js_list_from_ajax_content(): void
{
    $fixture = <<<'HTML'
<title>教师名录-上海交通大学自动化与感知学院</title>
<div class="js-list">
<li><a href="https://sais.sjtu.edu.cn/faculty/baiyang.html" class="name">白洋</a></li>
<li><a href="https://sais.sjtu.edu.cn/faculty/chenxin.html" class="name">陈新</a></li>
</div>
HTML;

    $adapter = new FacultyListHtmlAdapter;

    // The extractor is protected, so it is reached through reflection.
    $extract = new \ReflectionMethod($adapter, 'extractFromSaisJsList');
    $extract->setAccessible(true);
    $items = $extract->invokeArgs($adapter, [
        $fixture,
        [],
        'https://sais.sjtu.edu.cn/faculty.html',
    ]);

    $this->assertCount(2, $items);

    $first = $items[0];
    $this->assertSame('白洋', $first->title);
    $this->assertSame('https://sais.sjtu.edu.cn/faculty/baiyang.html', $first->canonicalUrl);
    $this->assertSame('faculty_html_sais', $first->extra['platform']);
    $this->assertSame('上海交通大学', $first->schoolName);
}
public function test_resolve_profile_enrich_max_caps_large_batches(): void
{
$adapter = new FacultyListHtmlAdapter;

Loading…
Cancel
Save