You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

220 lines
7.1 KiB

<?php
namespace App\Services\Crawl\Adapters;
use App\Models\CrawlSource;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use Illuminate\Support\Facades\Http;
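/**
 * Crawler adapter for the research-center page (a Vue SPA) of the
 * Shanghai Jiao Tong University Artificial Intelligence Institute.
 *
 * Data comes from the list endpoint GET /api/researchCenter; each center's
 * "研究团队" (research team) tab corresponds to the `teams` field.
 */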
class AiSjtuResearchCenterAdapter implements CrawlerAdapterInterface
{
    protected const API_BASE = 'https://ai.sjtu.edu.cn/api';

    protected const LIST_PATH = '/researchCenter';

    protected const UNIVERSITY_NAME = '上海交通大学';

    /**
     * Fetch all research centers in a single request and flatten the members
     * of each center's `teams` list into crawl items.
     *
     * `$requestUrl` and `$source` are required by the adapter contract but are
     * not consulted for the request itself: the endpoint is fixed by
     * API_BASE and LIST_PATH.
     *
     * @param array<string, mixed> $params Recognised keys: `keyword` (handed to
     *     CrawlKeywordParser::parse) and `max_results` (clamped to 1..500,
     *     default 200).
     * @return list<CrawlItemDto> At most `max_results` items, de-duplicated by external id.
     *
     * @throws \RuntimeException When the API answers with a non-successful
     *     status or a payload that is not the expected JSON structure.
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        $maxResults = min(500, max(1, (int) ($params['max_results'] ?? 200)));

        $items = [];
        $seen = [];

        foreach ($this->requestCenters() as $center) {
            if (! is_array($center)) {
                continue;
            }

            $centerId = (int) ($center['id'] ?? 0);
            $centerName = $this->stringField($center, 'name');
            if ($centerId <= 0 || $centerName === '') {
                continue;
            }

            $teams = $center['teams'] ?? [];
            if (! is_array($teams)) {
                continue;
            }

            foreach ($teams as $member) {
                if (! is_array($member)) {
                    continue;
                }

                $item = $this->memberToItem($member, $centerId, $centerName, $keywords, $requestUrl);
                if ($item === null || isset($seen[$item->externalId])) {
                    continue;
                }

                $seen[$item->externalId] = true;
                $items[] = $item;

                // Stop as soon as the caller's cap is reached.
                if (count($items) >= $maxResults) {
                    return $items;
                }
            }
        }

        return $items;
    }

    /**
     * Call the list endpoint and return the raw `researchCenters` array.
     *
     * A missing `researchCenters` key yields an empty array; a present but
     * non-array value is treated as a malformed payload.
     *
     * NOTE(review): a connection failure that persists after the retry is
     * expected to surface as the HTTP client's own exception rather than the
     * RuntimeException below — confirm against the framework version in use.
     *
     * @return array<int|string, mixed>
     *
     * @throws \RuntimeException On a non-successful status or unexpected payload shape.
     */
    private function requestCenters(): array
    {
        $response = Http::timeout(30)
            ->connectTimeout(10)
            // retry() takes the TOTAL number of attempts, so 2 means one
            // initial try plus one retry; only connection failures are retried.
            ->retry(
                2,
                300,
                static fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException,
                throw: false,
            )
            ->withHeaders([
                'User-Agent' => 'SlakeSchool-Crawler/1.0',
                'Accept' => 'application/json',
            ])
            ->get(self::API_BASE.self::LIST_PATH, [
                'page' => 1,
                // Oversized page size so the whole list arrives in one response.
                'limit' => 99999,
            ]);

        if (! $response->successful()) {
            throw new \RuntimeException('无法访问人工智能研究院研究中心 API(HTTP '.$response->status().')');
        }

        $json = $response->json();
        if (! is_array($json)) {
            throw new \RuntimeException('研究中心 API 返回格式异常');
        }

        $centers = $json['researchCenters'] ?? [];
        if (! is_array($centers)) {
            throw new \RuntimeException('研究中心 API 缺少 researchCenters 字段');
        }

        return $centers;
    }

    /**
     * Convert one `teams` entry into a crawl item.
     *
     * Returns null when the entry has no usable person name or when it does
     * not match the keyword filter.
     *
     * @param array<string, mixed> $member Raw member record; the keys read are
     *     `id`, `name`, `email`, `phone`, `title` and `direction`.
     * @param list<string> $keywords
     * @param string $requestUrl Currently unused; kept so the signature stays
     *     compatible with existing callers and overrides.
     */
    protected function memberToItem(
        array $member,
        int $centerId,
        string $centerName,
        array $keywords,
        string $requestUrl,
    ): ?CrawlItemDto {
        $name = $this->stringField($member, 'name');
        if ($name === '' || ! $this->looksLikePersonName($name)) {
            return null;
        }

        $email = CrawlAuthorParser::normalizeEmail($this->stringField($member, 'email'));
        $phone = $this->normalizePhone($this->stringField($member, 'phone'));
        $title = $this->stringField($member, 'title');
        $direction = $this->stringField($member, 'direction');

        // Filter on every textual field so a keyword may hit the center name,
        // title, research direction or contact details as well as the name.
        $plain = implode(' ', array_filter([$name, $centerName, $title, $direction, $email, $phone]));
        if (! $this->matchesKeywords($plain, $keywords)) {
            return null;
        }

        // An empty or non-scalar id must not be used as the key: every such
        // member would share one external id and be dropped as a duplicate.
        // md5 serves purely as a stable fingerprint here, not for security.
        $memberId = $this->stringField($member, 'id');
        $memberKey = $memberId !== '' ? $memberId : md5($name.$email);

        $profileUrl = 'https://ai.sjtu.edu.cn/center?centerId='.$centerId;
        $externalId = 'ai_sjtu_center_'.$centerId.'_team_'.$memberKey;
        $researchDirectionNames = $this->parseResearchDirectionNames($direction);

        $academicTitle = $title !== '' ? $title : null;
        $phoneOrNull = $phone !== '' ? $phone : null;

        $summaryParts = array_filter([
            $title !== '' ? '职称:'.$title : null,
            $phone !== '' ? '电话:'.$phone : null,
            $direction !== '' ? '研究方向:'.$direction : null,
            '所属中心:'.$centerName,
        ]);

        $lead = [
            'name' => $name,
            'email' => $email,
            'phone' => $phoneOrNull,
            'affiliation' => $centerName,
            'college' => $centerName,
            'university_name' => self::UNIVERSITY_NAME,
            'academic_title' => $academicTitle,
            'research_direction_names' => $researchDirectionNames,
        ];

        return new CrawlItemDto(
            externalId: $externalId,
            title: $name,
            canonicalUrl: $profileUrl,
            authors: $name,
            // Separate the labelled fragments so the summary stays readable.
            summary: implode(';', $summaryParts),
            schoolName: self::UNIVERSITY_NAME,
            section: $centerName,
            extra: [
                'platform' => 'ai_sjtu_research_center',
                'academic_title' => $academicTitle,
                'college_name' => $centerName,
                'profile_url' => $profileUrl,
                'phone' => $phoneOrNull,
                'research_direction_names' => $researchDirectionNames,
                'lead_author' => $lead,
            ],
            authorsParsed: [[
                'name' => $name,
                'email' => $email,
                'affiliation' => $centerName,
                'university_name' => self::UNIVERSITY_NAME,
                'academic_title' => $academicTitle,
            ]],
        );
    }

    /**
     * Tell whether the text contains at least one keyword (case-insensitive
     * substring match). An empty keyword list matches everything.
     *
     * @param list<string> $keywords
     */
    protected function matchesKeywords(string $plain, array $keywords): bool
    {
        if ($keywords === []) {
            return true;
        }

        foreach ($keywords as $keyword) {
            if ($keyword !== '' && mb_stripos($plain, $keyword) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Split a free-text research direction into distinct, trimmed names.
     *
     * Delimiters are the enumeration comma, ASCII and full-width commas and
     * semicolons, and the forward slash; runs of delimiters count as one.
     *
     * @return list<string>
     */
    protected function parseResearchDirectionNames(string $direction): array
    {
        $direction = trim($direction);
        if ($direction === '') {
            return [];
        }

        // preg_split returns false on a PCRE error; treat that as "no parts".
        $parts = preg_split('/[、,,;;\/]+/u', $direction) ?: [];

        return array_values(array_unique(array_filter(array_map(
            static fn (string $part): string => trim($part),
            $parts,
        ))));
    }

    /**
     * Collapse every run of whitespace in the phone string to a single space
     * and trim the ends. Returns an empty string if the regex call fails.
     */
    protected function normalizePhone(string $phone): string
    {
        return trim(preg_replace('/\s+/u', ' ', $phone) ?? '');
    }

    /**
     * Heuristic check that the string is a person's name.
     *
     * Accepts either 2-4 CJK ideographs, optionally followed by a middle dot
     * and 2-4 more ideographs, or a Latin letter followed by 1-40 letters,
     * whitespace characters, dots or hyphens.
     */
    protected function looksLikePersonName(string $name): bool
    {
        return preg_match('/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u', $name) === 1
            || preg_match('/^[A-Za-z][A-Za-z\s\.\-]{1,40}$/', $name) === 1;
    }

    /**
     * Read a field from decoded API data as a trimmed string.
     *
     * Missing, null and non-scalar values (e.g. nested arrays) yield an empty
     * string instead of triggering an "Array to string conversion" warning.
     *
     * @param array<string, mixed> $data
     */
    private function stringField(array $data, string $key): string
    {
        $value = $data[$key] ?? '';

        return is_scalar($value) ? trim((string) $value) : '';
    }
}