master
lion 1 day ago
parent d3418d2d3c
commit 68f30c05d6

@ -25,8 +25,8 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
$baseUrl = $this->normalizeRequestUrl($requestUrl);
$firstHtml = $this->fetchHtml($baseUrl);
if ($this->isSaisAjaxFacultyPage($firstHtml, $requestUrl)) {
$items = $this->fetchSaisFacultyItems($requestUrl, $firstHtml, $keywords, $maxResults);
if ($this->isAjaxTeacherListPage($firstHtml, $requestUrl)) {
$items = $this->fetchAjaxTeacherItems($requestUrl, $firstHtml, $keywords, $maxResults);
return $this->enrichEmailsFromProfilePages($items, $params);
}
@ -498,37 +498,42 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
return $this->extractFromStaffPanelList($html, $keywords, $sourceUrl);
}
protected function isSaisAjaxFacultyPage(string $html, string $sourceUrl): bool
protected function isAjaxTeacherListPage(string $html, string $sourceUrl): bool
{
if (str_contains(strtolower($sourceUrl), 'ajax_teacher_list')) {
return true;
}
if (str_contains($html, 'ajax_teacher_list.html')) {
return true;
}
$host = strtolower((string) parse_url($sourceUrl, PHP_URL_HOST));
return str_contains($host, 'sais.sjtu.edu.cn')
&& str_contains(strtolower($sourceUrl), 'faculty');
return str_contains($html, 'ajax_teacher_list.html');
}
/**
* @param list<string> $keywords
* @return list<CrawlItemDto>
*/
protected function fetchSaisFacultyItems(
protected function fetchAjaxTeacherItems(
string $requestUrl,
string $pageHtml,
array $keywords,
int $maxResults,
): array {
$config = $this->parseSaisAjaxConfig($pageHtml, $requestUrl);
$config = $this->parseAjaxTeacherConfig($pageHtml, $requestUrl);
$search = implode(' ', $keywords);
$type = $search !== '' ? '2' : '1';
$timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
$payload = [
'cat_id' => $config['cat_id'],
'cat_code' => $config['cat_code'],
'type' => $type,
'zm' => $search === '' ? 'All' : '',
'zc' => '',
'search' => $search,
];
if ($config['uses_page']) {
$payload['page'] = '1';
}
$response = Http::timeout($timeout)
->connectTimeout(min(8, $timeout))
->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
@ -537,29 +542,28 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
'Accept' => 'application/json, text/html',
])
->asForm()
->post($config['api_url'], [
'cat_id' => $config['cat_id'],
'cat_code' => $config['cat_code'],
'type' => $type,
'zm' => $search === '' ? 'All' : '',
'search' => $search,
]);
->post($config['api_url'], $payload);
if (! $response->successful()) {
throw new \RuntimeException('SAIS 教师列表接口请求失败HTTP '.$response->status().'');
throw new \RuntimeException('教师列表接口请求失败HTTP '.$response->status().'');
}
$payload = $response->json();
if (! is_array($payload)) {
throw new \RuntimeException('SAIS 教师列表接口返回格式异常');
$body = $response->json();
if (! is_array($body)) {
throw new \RuntimeException('教师列表接口返回格式异常');
}
$content = (string) ($payload['content'] ?? '');
$content = (string) ($body['content'] ?? '');
if ($content === '') {
return [];
}
$items = $this->extractFromSaisJsList($pageHtml.$content, $keywords, $requestUrl);
$items = $this->extractFromAjaxTeacherContent(
$pageHtml.$content,
$keywords,
$requestUrl,
$config['cat_code'],
);
if (count($items) > $maxResults) {
$items = array_slice($items, 0, $maxResults);
}
@ -568,13 +572,15 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
}
/**
* @return array{cat_id:string,cat_code:string,api_url:string}
* @return array{cat_id:string,cat_code:string,api_url:string,uses_page:bool}
*/
protected function parseSaisAjaxConfig(string $html, string $sourceUrl): array
protected function parseAjaxTeacherConfig(string $html, string $sourceUrl): array
{
$catId = '18';
$catCode = 'faculty';
$apiUrl = 'https://sais.sjtu.edu.cn/active/ajax_teacher_list.html';
$catId = null;
$catCode = null;
$usesPage = str_contains($html, 'page:page');
$origin = $this->requestOrigin($sourceUrl);
$apiUrl = $origin !== null ? $origin.'/active/ajax_teacher_list.html' : '';
if (preg_match("/cat_id\s*:\s*'(\d+)'/i", $html, $match)) {
$catId = $match[1];
@ -587,15 +593,19 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
?? $apiUrl;
}
$origin = $this->requestOrigin($sourceUrl);
if ($origin !== null && str_starts_with($apiUrl, '/')) {
$apiUrl = $origin.$apiUrl;
}
if ($catId === null || $catCode === null || $apiUrl === '') {
throw new \RuntimeException('无法解析教师列表接口参数cat_id / cat_code');
}
return [
'cat_id' => $catId,
'cat_code' => $catCode,
'api_url' => $apiUrl,
'uses_page' => $usesPage,
];
}
@ -603,33 +613,92 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
* @param list<string> $keywords
* @return list<CrawlItemDto>
*/
protected function extractFromSaisJsList(string $html, array $keywords, string $sourceUrl): array
{
protected function extractFromAjaxTeacherContent(
string $html,
array $keywords,
string $sourceUrl,
?string $catCode = null,
): array {
$items = [];
$seen = [];
$pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
$defaultCollege = $this->inferCollegeFromPageTitle($html);
if (! preg_match_all('#<a\b([^>]*?)>([^<]+)</a>#su', $html, $matches, PREG_SET_ORDER)) {
$parts = preg_split('#<div\s+class="rc-item">#u', $html) ?: [];
if (count($parts) > 1) {
array_shift($parts);
foreach ($parts as $block) {
$department = $defaultCollege;
if (preg_match('#<div\s+class="tit">.*?<div\s+class="name">([^<]+)</div>#su', $block, $deptMatch)) {
$sectionTitle = CrawlAuthorParser::cleanText($deptMatch[1]);
if ($sectionTitle !== null && $sectionTitle !== '' && ! $this->looksLikePersonName($sectionTitle)) {
$department = $sectionTitle;
}
}
foreach ($this->extractTeacherLinksFromHtmlBlock(
$block,
$keywords,
$sourceUrl,
$pageUniversity,
$department,
$catCode,
) as $item) {
if (isset($seen[$item->externalId])) {
continue;
}
$seen[$item->externalId] = true;
$items[] = $item;
}
}
if ($items !== []) {
return $items;
}
}
return $this->extractTeacherLinksFromHtmlBlock(
$html,
$keywords,
$sourceUrl,
$pageUniversity,
$defaultCollege,
$catCode,
);
}
/**
* @param list<string> $keywords
* @return list<CrawlItemDto>
*/
protected function extractTeacherLinksFromHtmlBlock(
string $html,
array $keywords,
string $sourceUrl,
?string $pageUniversity,
?string $affiliation,
?string $catCode,
): array {
$items = [];
$seen = [];
if (! preg_match_all('#<a\b([^>]*?)>(.*?)</a>#su', $html, $matches, PREG_SET_ORDER)) {
return [];
}
foreach ($matches as $match) {
$attrs = (string) $match[1];
if (! preg_match('/\bclass="[^"]*\bname\b[^"]*"/u', $attrs)) {
$rawName = preg_replace('/\s+/u', '', strip_tags($match[2])) ?? '';
$name = CrawlAuthorParser::cleanText($rawName) ?? '';
if ($name === '' || ! $this->looksLikePersonName($name)) {
continue;
}
if (! preg_match('#\bhref="([^"]+)"#u', $attrs, $hrefMatch)) {
continue;
}
$name = CrawlAuthorParser::cleanText($match[2]) ?? '';
if ($name === '' || ! $this->looksLikePersonName($name)) {
continue;
}
$href = html_entity_decode($hrefMatch[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
if (! str_contains(strtolower($href), '/faculty/')) {
if (! $this->looksLikeTeacherProfileUrl($href, $catCode)) {
continue;
}
@ -639,7 +708,7 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
continue;
}
$plain = trim($name.' '.($defaultCollege ?? ''));
$plain = trim($name.' '.($affiliation ?? ''));
if (! $this->matchesKeywords($plain, $keywords)) {
continue;
}
@ -650,12 +719,12 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
name: $name,
profileUrl: $profileUrl,
email: null,
affiliation: $defaultCollege,
universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($defaultCollege),
summary: $defaultCollege ? '单位:'.$defaultCollege : null,
affiliation: $affiliation,
universityName: $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation),
summary: $affiliation ? '单位:'.$affiliation : null,
keywords: $keywords,
academicTitle: null,
platform: 'faculty_html_sais',
platform: 'faculty_html_ajax',
bio: null,
);
}
@ -663,6 +732,26 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
return $items;
}
/**
 * Decide whether a link target looks like an individual teacher profile page.
 *
 * Only the URL path is inspected (query string and fragment are ignored) and
 * the comparison is case-insensitive. A path qualifies when it ends with a
 * known profile directory, or the list page's own cat_code, followed by a
 * single "*.html" file name.
 *
 * @param string      $href    Entity-decoded href taken from an anchor; rooted, absolute and
 *                             relative ("faculty/x.html") forms are all accepted.
 * @param string|null $catCode Category code of the list page; when non-empty it is accepted
 *                             as an additional profile directory.
 */
protected function looksLikeTeacherProfileUrl(string $href, ?string $catCode): bool
{
    // parse_url() yields false/null for malformed or path-less URLs; both collapse to ''.
    $path = strtolower((string) parse_url($href, PHP_URL_PATH));
    if ($path === '') {
        return false;
    }

    // Built-in profile directories plus, when available, the page's own category code.
    $directories = ['faculty', 'jiaoshiml', 'people/detail_new'];
    if ($catCode !== null && $catCode !== '') {
        $directories[] = preg_quote(strtolower($catCode), '#');
    }

    // "(?:^|/)" keeps the directory on a segment boundary while also letting
    // relative hrefs without a leading slash match; demanding a literal "/"
    // in front of the directory silently dropped those links.
    $pattern = '#(?:^|/)(?:'.implode('|', $directories).')/[^/]+\.html$#';

    return preg_match($pattern, $path) === 1;
}
protected function requestOrigin(string $sourceUrl): ?string
{
$parts = parse_url($sourceUrl);

@ -221,7 +221,7 @@ HTML;
HTML;
$adapter = new FacultyListHtmlAdapter;
$method = new \ReflectionMethod($adapter, 'extractFromSaisJsList');
$method = new \ReflectionMethod($adapter, 'extractFromAjaxTeacherContent');
$method->setAccessible(true);
$items = $method->invoke(
@ -229,15 +229,47 @@ HTML;
$html,
[],
'https://sais.sjtu.edu.cn/faculty.html',
'faculty',
);
$this->assertCount(2, $items);
$this->assertSame('白洋', $items[0]->title);
$this->assertSame('https://sais.sjtu.edu.cn/faculty/baiyang.html', $items[0]->canonicalUrl);
$this->assertSame('faculty_html_sais', $items[0]->extra['platform']);
$this->assertSame('faculty_html_ajax', $items[0]->extra['platform']);
$this->assertSame('上海交通大学', $items[0]->schoolName);
}
/**
 * A list built from "rc-item" sections yields one item per linked teacher,
 * and the section heading is reported as that teacher's college name.
 */
public function test_extracts_cs_rc_item_teacher_list(): void
{
    $fixture = <<<'HTML'
<title>教师名录-上海交通大学计算机学院(网络空间安全学院、密码学院)</title>
<div class="rc-item">
<div class="tit"><div class="name">并行与分布式系统研究所</div></div>
<div class="dt">
<p>所长:<a href="https://www.cs.sjtu.edu.cn/jiaoshiml/zangbinyu.html" target="_blank">臧斌宇</a></p>
<p><a href="https://www.cs.sjtu.edu.cn/jiaoshiml/chenhaibo.html" target="_blank">陈海波</a></p>
</div>
</div>
HTML;

    // The extractor is protected, so it is reached through reflection.
    $subject = new FacultyListHtmlAdapter;
    $extract = new \ReflectionMethod($subject, 'extractFromAjaxTeacherContent');
    $extract->setAccessible(true);

    $found = $extract->invokeArgs($subject, [
        $fixture,
        [],
        'https://www.cs.sjtu.edu.cn/jiaoshiml.html',
        'jiaoshiml',
    ]);

    $this->assertCount(2, $found);

    [$first, $second] = $found;
    $this->assertSame('臧斌宇', $first->title);
    $this->assertSame('并行与分布式系统研究所', $first->extra['college_name']);
    $this->assertSame('https://www.cs.sjtu.edu.cn/jiaoshiml/chenhaibo.html', $second->canonicalUrl);
}
public function test_resolve_profile_enrich_max_caps_large_batches(): void
{
$adapter = new FacultyListHtmlAdapter;

Loading…
Cancel
Save