|
|
|
|
@ -91,10 +91,12 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
foreach ($pending as $externalId => $item) {
|
|
|
|
|
$response = $responses[$externalId] ?? null;
|
|
|
|
|
if ($response && $response->successful()) {
|
|
|
|
|
$email = $this->extractEmailFromProfileHtml((string) $response->body());
|
|
|
|
|
$body = (string) $response->body();
|
|
|
|
|
$email = $this->extractEmailFromProfileHtml($body);
|
|
|
|
|
if ($email) {
|
|
|
|
|
$item = $this->applyEmailToItem($item, $email);
|
|
|
|
|
}
|
|
|
|
|
$item = $this->applyProfileMetadataToItem($item, $body);
|
|
|
|
|
}
|
|
|
|
|
$enriched[] = $item;
|
|
|
|
|
}
|
|
|
|
|
@ -299,7 +301,12 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
return $items;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $this->extractFromStructuredFacultyList($html, $keywords, $sourceUrl);
|
|
|
|
|
$items = $this->extractFromStructuredFacultyList($html, $keywords, $sourceUrl);
|
|
|
|
|
if ($items !== []) {
|
|
|
|
|
return $items;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $this->extractFromStaffPanelList($html, $keywords, $sourceUrl);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
@ -362,11 +369,223 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 上海交通大学等 tsites.CollegeTeacherList:div.list > ul > li 卡片。
|
|
|
|
|
* 上海交大材料学院等:panel-item + a.staff-item(/people/detail_new/{id})。
|
|
|
|
|
*
|
|
|
|
|
* @param list<string> $keywords
|
|
|
|
|
* @return list<CrawlItemDto>
|
|
|
|
|
*/
|
|
|
|
|
protected function extractFromStaffPanelList(string $html, array $keywords, string $sourceUrl): array
{
    // Strategy: split the page on every `<div class="panel-head">` marker, read each
    // panel's `<div class="title">` as the department, and turn the staff-item anchors
    // inside that panel into items. If no panel yields anything, scan the whole page
    // and attribute every anchor to the college inferred from the page <title>.
    $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);
    $defaultCollege = $this->inferCollegeFromPageTitle($html);

    $items = [];
    $seen = [];

    // Appends one item per link, skipping links rejected by makeStaffPanelItem()
    // (null result) and items whose externalId has already been collected.
    $collect = function (array $links, ?string $department) use (&$items, &$seen, $pageUniversity, $keywords, $sourceUrl): void {
        foreach ($links as $link) {
            $candidate = $this->makeStaffPanelItem(
                $link,
                $department,
                $pageUniversity,
                $keywords,
                $sourceUrl,
            );

            if ($candidate === null) {
                continue;
            }

            if (isset($seen[$candidate->externalId])) {
                continue;
            }

            $seen[$candidate->externalId] = true;
            $items[] = $candidate;
        }
    };

    $segments = preg_split('#<div\s+class="panel-head">#u', $html);
    if (! $segments) {
        $segments = [];
    }

    // The first segment is whatever precedes the first panel head, so it is dropped.
    foreach (array_slice($segments, 1) as $segment) {
        // Panels without a plain-text title are ignored entirely.
        if (preg_match('#<div\s+class="title">\s*([^<]+?)\s*</div>#u', $segment, $heading) !== 1) {
            continue;
        }

        $department = CrawlAuthorParser::cleanText($heading[1]);
        $collect($this->extractStaffItemLinks($segment), $department ?: $defaultCollege);
    }

    // Fallback for pages without usable panels: treat the whole document as one list.
    if ($items === []) {
        $collect($this->extractStaffItemLinks($html), $defaultCollege);
    }

    return $items;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Collect the anchors that look like staff cards from an HTML fragment.
 *
 * An anchor is kept when its attribute string contains "staff-item", it carries a
 * quoted href attribute (double or single quotes), and its text content - which
 * must not contain nested tags - is accepted by looksLikePersonName().
 * Each href/name pair is returned once, in document order; hrefs are entity-decoded.
 *
 * @return list<array{href:string,name:string}>
 */
protected function extractStaffItemLinks(string $html): array
{
    if (! preg_match_all('#<a\b([^>]*?)>([^<]+)</a>#su', $html, $anchors, PREG_SET_ORDER)) {
        return [];
    }

    $links = [];
    $seen = [];

    foreach ($anchors as $anchor) {
        $attrs = (string) $anchor[1];
        if (! str_contains($attrs, 'staff-item')) {
            continue;
        }

        // HTML allows attribute values in either quote style, with optional whitespace
        // around "=". Group 1 holds a double-quoted value, group 2 a single-quoted one.
        if (! preg_match('#\bhref\s*=\s*(?:"([^"]+)"|\'([^\']+)\')#u', $attrs, $hrefMatch)) {
            continue;
        }

        // preg_match omits trailing unmatched groups, so index 2 exists only for the
        // single-quoted alternative.
        $rawHref = $hrefMatch[2] ?? $hrefMatch[1];

        $name = CrawlAuthorParser::cleanText($anchor[2]) ?? '';
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            continue;
        }

        $href = html_entity_decode($rawHref, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // The same person may be linked several times; keep one entry per href/name pair.
        $key = $href.'|'.$name;
        if (isset($seen[$key])) {
            continue;
        }

        $seen[$key] = true;
        $links[] = ['href' => $href, 'name' => $name];
    }

    return $links;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Build the crawl item for a single staff-panel link.
 *
 * Returns null when the person's name combined with the department text is
 * rejected by matchesKeywords(). The external key is derived from the resolved
 * profile URL, or from the name when no URL could be resolved.
 *
 * @param array{href:string,name:string} $link
 * @param list<string> $keywords
 */
protected function makeStaffPanelItem(
    array $link,
    ?string $department,
    ?string $pageUniversity,
    array $keywords,
    string $sourceUrl,
): ?CrawlItemDto {
    $personName = $link['name'];
    $resolvedUrl = $this->resolveUrl($link['href'], $sourceUrl);

    // Keyword filtering looks at "<name> <department>" as one string.
    $searchText = trim("{$personName} {$department}");
    if (! $this->matchesKeywords($searchText, $keywords)) {
        return null;
    }

    if ($resolvedUrl) {
        $identity = $resolvedUrl;
    } else {
        $identity = 'name:'.md5($personName);
    }

    // Prefer the university inferred for the whole page; otherwise derive it
    // from the department text.
    if ($pageUniversity !== null) {
        $university = $pageUniversity;
    } else {
        $university = CrawlAuthorParser::universityFromAffiliation($department);
    }

    return $this->makeFacultyItem(
        externalKey: 'faculty:'.md5($identity),
        name: $personName,
        profileUrl: $resolvedUrl,
        email: null,
        affiliation: $department,
        universityName: $university,
        summary: $department ? '单位:'.$department : null,
        keywords: $keywords,
        academicTitle: null,
        platform: 'faculty_html_smse',
        bio: null,
    );
}
|
|
|
|
|
|
|
|
|
|
/**
 * Derive a college name from the document's <title> element.
 *
 * Only titles of the form "<prefix> - <rest>" (hyphen, en dash or em dash) are
 * considered; the text after the first separator is cleaned and, when it contains
 * a run of 2-40 CJK/Latin letters ending in 学院, 研究院, 学部 or 系, that run is
 * returned. Otherwise the cleaned text itself is returned. Returns null when the
 * title does not match or cleans to nothing.
 */
protected function inferCollegeFromPageTitle(string $html): ?string
{
    $found = preg_match('/<title>\s*[^<\-\–—]+[\-–—]\s*([^<]+?)\s*<\/title>/u', $html, $titleParts);
    if (! $found) {
        return null;
    }

    $tail = CrawlAuthorParser::cleanText($titleParts[1]);
    if (in_array($tail, [null, ''], true)) {
        return null;
    }

    $hasCollege = preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,40}(?:学院|研究院|学部|系))/u', $tail, $collegeParts);

    return $hasCollege
        ? CrawlAuthorParser::cleanText($collegeParts[1])
        : $tail;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Enrich an item with metadata scraped from the person's profile page.
 *
 * Fills the lead author's academic title from the first plain-text <em> element
 * when no title is set, and fills affiliation/college from the field labelled
 * "所属二级机构" when neither is set. When nothing new was found the original
 * item is returned as-is; otherwise a new CrawlItemDto is built with the updated
 * `extra` data and the first parsed author brought in line with the lead author.
 */
protected function applyProfileMetadataToItem(CrawlItemDto $item, string $html): CrawlItemDto
{
    $existingLead = $item->extra['lead_author'] ?? null;
    $lead = is_array($existingLead) ? $existingLead : [];
    $changed = false;

    if (empty($lead['academic_title']) && preg_match('/<em>\s*([^<]+?)\s*<\/em>/u', $html, $emMatch)) {
        $rank = CrawlAuthorParser::cleanText($emMatch[1]);
        if (! in_array($rank, [null, ''], true)) {
            $lead['academic_title'] = $rank;
            $changed = true;
        }
    }

    // Only look up the organisational unit when the lead author has none at all.
    $hasUnit = ! empty($lead['college']) || ! empty($lead['affiliation']);
    if (! $hasUnit) {
        $unit = $this->parseLabeledField($html, '所属二级机构');
        if (! in_array($unit, [null, ''], true)) {
            $lead['affiliation'] = $unit;
            $lead['college'] = $unit;
            $changed = true;
        }
    }

    if (! $changed) {
        return $item;
    }

    $hasTitle = ! empty($lead['academic_title']);
    $hasCollege = ! empty($lead['college']);

    $extra = $item->extra;
    $extra['lead_author'] = $lead;

    $authorsParsed = $item->authorsParsed;
    $hasAuthors = $authorsParsed !== [];

    // Mirror the lead author's values onto the item-level extras and, when parsed
    // authors exist, onto the first parsed author.
    if ($hasTitle) {
        $extra['academic_title'] = $lead['academic_title'];
        if ($hasAuthors) {
            $authorsParsed[0]['academic_title'] = $lead['academic_title'];
        }
    }

    if ($hasCollege) {
        $extra['college_name'] = $lead['college'];
        if ($hasAuthors) {
            $authorsParsed[0]['affiliation'] = $lead['college'];
        }
    }

    return new CrawlItemDto(
        externalId: $item->externalId,
        title: $item->title,
        canonicalUrl: $item->canonicalUrl,
        authors: $item->authors,
        summary: $item->summary,
        publishedAt: $item->publishedAt,
        schoolName: $item->schoolName,
        section: $item->section,
        contentHtml: $item->contentHtml,
        extra: $extra,
        authorsParsed: $authorsParsed,
    );
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param list<string> $keywords
|
|
|
|
|
* @return list<CrawlItemDto>
|
|
|
|
|
*/
|
|
|
|
|
protected function extractFromStructuredFacultyList(string $html, array $keywords, string $sourceUrl): array
|
|
|
|
|
{
|
|
|
|
|
$items = [];
|
|
|
|
|
|