You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
381 lines
14 KiB
381 lines
14 KiB
<?php
|
|
|
|
namespace App\Services\Crawl;
|
|
|
|
/**
 * Parses publication dates and author metadata from the text of arXiv
 * search-result blocks, abs pages and HTML pages.
 */
|
|
class ArxivMetadataParser
|
|
{
|
|
/**
 * Lower-case English month names and abbreviations mapped to their
 * two-digit month number.
 *
 * @var array<string, string>
 */
private const MONTHS = [
    // Full names.
    'january' => '01',
    'february' => '02',
    'march' => '03',
    'april' => '04',
    'may' => '05',
    'june' => '06',
    'july' => '07',
    'august' => '08',
    'september' => '09',
    'october' => '10',
    'november' => '11',
    'december' => '12',
    // Abbreviations ("may" is already listed above; "sep" and "sept" are both accepted).
    'jan' => '01',
    'feb' => '02',
    'mar' => '03',
    'apr' => '04',
    'jun' => '06',
    'jul' => '07',
    'aug' => '08',
    'sep' => '09',
    'sept' => '09',
    'oct' => '10',
    'nov' => '11',
    'dec' => '12',
];
|
|
|
|
/**
 * Extract a publication date from arXiv page text.
 *
 * Sources are tried in this order, and the first one that yields a date wins:
 *  1. a "Generated on <weekday> <month> <day> <h:m:s> <year>" stamp in the raw input;
 *  2. the first calendar-valid ISO "YYYY-MM-DD" date in the tag-stripped,
 *     whitespace-collapsed text (an ISO timestamp such as "2024-01-15T10:30"
 *     is covered by this step as well);
 *  3. "Generated on ...", "[vN] <weekday>, <day> <month> <year>",
 *     "Submitted on ..." and "... UTC" phrasings in the tag-stripped text.
 *
 * @param string|null $text Raw HTML or plain text.
 *
 * @return string|null The date as "YYYY-MM-DD", or null when the input is
 *                     null/blank or no date could be recognised.
 */
public static function parsePublishedDate(?string $text): ?string
{
    if ($text === null || trim($text) === '') {
        return null;
    }

    // Month-first stamp, e.g. "Generated on Thu May 28 12:34:56 2026".
    $generatedOn = '/Generated on\s+(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s+([A-Za-z]+)\s+(\d{1,2})\s+\d{1,2}:\d{2}:\d{2}\s+(\d{4})/i';

    if (preg_match($generatedOn, $text, $gen)) {
        $date = self::toYmd($gen[3], $gen[1], $gen[2]);
        if ($date !== null) {
            return $date;
        }
    }

    $text = html_entity_decode(strip_tags($text), ENT_QUOTES | ENT_HTML5, 'UTF-8');
    // preg_replace() returns null on failure (e.g. malformed UTF-8); treat that as empty text.
    $text = preg_replace('/\s+/u', ' ', $text) ?? '';

    // The digit look-arounds stop the pattern from matching inside a longer digit run,
    // and checkdate() skips impossible dates such as 2024-13-45.
    if (preg_match_all('/(?<!\d)(\d{4})-(\d{2})-(\d{2})(?!\d)/', $text, $isoMatches, PREG_SET_ORDER)) {
        foreach ($isoMatches as $iso) {
            if (checkdate((int) $iso[2], (int) $iso[3], (int) $iso[1])) {
                return sprintf('%s-%s-%s', $iso[1], $iso[2], $iso[3]);
            }
        }
    }

    // Each entry is [regex, monthFirst]. When monthFirst is true the captures are
    // (month, day, year); otherwise they are (day, month, year).
    $patterns = [
        [$generatedOn, true],
        ['/\[v\d+\]\s*(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),?\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})/i', false],
        ['/\[Submitted on\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\]/i', false],
        ['/Submitted\s+on\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})/i', false],
        ['/Submitted\s+(\d{1,2})\s+([A-Za-z]+),\s+(\d{4})/i', false],
        ['/(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),?\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\s+\d{1,2}:\d{2}:\d{2}\s+UTC/i', false],
        ['/(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\s+\d{1,2}:\d{2}:\d{2}\s+UTC/i', false],
    ];

    foreach ($patterns as [$pattern, $monthFirst]) {
        if (! preg_match($pattern, $text, $m)) {
            continue;
        }

        $date = $monthFirst
            ? self::toYmd($m[3], $m[1], $m[2])
            : self::toYmd($m[3], $m[2], $m[1]);

        // A null result means the month word was not recognised; try the next phrasing.
        if ($date !== null) {
            return $date;
        }
    }

    return null;
}
|
|
|
|
/**
 * Parse the author list of an arXiv abstract ("abs") page.
 *
 * Names are taken from the first <div class="...authors..."> block, trying in
 * order: mailto links, author-search links, "link-author" anchors, "descriptor"
 * spans, and finally any anchor whose text does not mention ORCID. When the page
 * has no such div, every citation_author <meta> tag is used instead. Emails and
 * affiliations found in the markup are then attached to the rows by position,
 * and rows without a name are dropped.
 *
 * @param string $html Full HTML of the abs page.
 *
 * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
 */
public static function parseAuthorsFromAbsHtml(string $html): array
{
    $block = '';
    if (preg_match('#<div[^>]*class="[^"]*authors[^"]*"[^>]*>(.*?)</div>#is', $html, $div)) {
        $block = $div[1];
    } elseif (preg_match_all('#<meta[^>]*name="citation_author"[^>]*content="([^"]+)"#i', $html, $meta)) {
        // Collect every citation_author tag, not just the first one.
        return self::rowsFromNames($meta[1], $html);
    }

    if ($block === '') {
        return [];
    }

    $rows = [];
    if (preg_match_all('#<a[^>]*href="mailto:([^"]+)"[^>]*>([^<]*)</a>#i', $block, $mailto, PREG_SET_ORDER)) {
        foreach ($mailto as $link) {
            $rows[] = [
                // Fall back to the address itself when the link has no visible text.
                'name' => CrawlAuthorParser::cleanText($link[2]) ?: CrawlAuthorParser::cleanText($link[1]),
                'email' => CrawlAuthorParser::normalizeEmail($link[1]),
                'affiliation' => null,
                'university_name' => null,
            ];
        }
    }

    if ($rows === []) {
        // Tried in order; the first pattern that produces at least one row wins.
        $namePatterns = [
            '#<a[^>]*href="[^"]*searchtype=author[^"]*"[^>]*>([^<]+)</a>#i',
            '#<a[^>]*class="[^"]*link-author[^"]*"[^>]*>([^<]+)</a>#i',
            '#<span class="descriptor">([^<]*)</span>#',
        ];
        foreach ($namePatterns as $namePattern) {
            if (! preg_match_all($namePattern, $block, $found)) {
                continue;
            }
            $rows = self::rowsFromNames($found[1], $html);
            if ($rows !== []) {
                break;
            }
        }
    }

    if ($rows === [] && preg_match_all('#<a[^>]*>([^<]+)</a>#', $block, $anchors)) {
        $names = [];
        foreach ($anchors[1] as $anchorText) {
            $anchorText = trim($anchorText);
            if ($anchorText !== '' && ! str_contains(strtolower($anchorText), 'orcid')) {
                $names[] = $anchorText;
            }
        }
        $rows = self::rowsFromNames($names, $html);
    }

    // Attach plain-text emails by position, but never overwrite an email that was
    // already taken from a mailto link: the address usually appears again in the
    // href (and sometimes in the link text), which would shift the positions.
    if (preg_match_all('#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#', $block, $emails)) {
        foreach ($emails[1] as $i => $email) {
            if (isset($rows[$i]) && ($rows[$i]['email'] ?? null) === null) {
                $rows[$i]['email'] = CrawlAuthorParser::normalizeEmail($email);
            }
        }
    }

    $affiliations = [];
    if (preg_match_all('#<span class="affiliation">([^<]+)</span>#', $html, $affs)) {
        $affiliations = array_map(
            fn ($a) => CrawlAuthorParser::cleanText(html_entity_decode($a, ENT_QUOTES | ENT_HTML5, 'UTF-8')),
            $affs[1],
        );
    }

    // No dedicated affiliation spans: look for institution-like phrases inside the authors block.
    if ($affiliations === [] && preg_match_all(
        '/((?:[\x{4e00}-\x{9fff}A-Za-z\s,&.-]{2,60}(?:大学|学院|研究院|研究所|University|College|Institute|School)[^.;]{0,80}))/u',
        $block,
        $inlineAff
    )) {
        $affiliations = array_map(fn ($a) => CrawlAuthorParser::cleanText($a), $inlineAff[1]);
    }

    // Affiliations are matched to authors purely by position.
    foreach ($affiliations as $i => $aff) {
        if (! isset($rows[$i])) {
            continue;
        }
        $rows[$i]['affiliation'] = $aff;
        $rows[$i]['university_name'] = CrawlAuthorParser::universityFromAffiliation($aff);
    }

    // Last resort for the first author: the first university-like phrase anywhere on the page.
    if ($rows !== [] && ($rows[0]['university_name'] ?? null) === null) {
        $uni = self::firstUniversityInText($block.' '.$html);
        if ($uni !== null) {
            $rows[0]['affiliation'] ??= $uni;
            $rows[0]['university_name'] = CrawlAuthorParser::universityFromAffiliation($uni);
        }
    }

    return array_values(array_filter($rows, fn ($r) => ($r['name'] ?? '') !== ''));
}
|
|
|
|
/**
 * Parse authors from the HTML rendering of an arXiv paper.
 *
 * The ltx_authors / ltx_personname block is tried first via
 * parseLtxPersonnameBlock(). If that yields nothing, each
 * <span class="...ltx_author..."> fragment is inspected for a name, a mailto
 * address and an ltx_author_affiliation span. If there are still no rows, the
 * first email address on the page is returned as a nameless row. Rows that have
 * neither a name nor an email are removed.
 *
 * @param string $html Full HTML of the paper's HTML version.
 *
 * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
 */
public static function parseAuthorsFromHtmlVersion(string $html): array
{
    $authors = self::parseLtxPersonnameBlock($html);
    if ($authors !== []) {
        return $authors;
    }

    $hasAuthorSpans = preg_match_all(
        '#<span[^>]*class="[^"]*ltx_author[^"]*"[^>]*>(.*?)</span>#is',
        $html,
        $spans
    );

    foreach (($hasAuthorSpans ? $spans[1] : []) as $fragment) {
        // NOTE(review): the lazy capture above ends at the first "</span>", so a
        // fragment should never contain a closing span and this branch looks
        // unreachable — confirm before relying on it.
        if (preg_match('#<span[^>]*class="[^"]*ltx_personname[^"]*"[^>]*>(.*?)</span>#is', $fragment, $person)) {
            $authors = [...$authors, ...self::parseLtxPersonnameInner($person[1])];

            continue;
        }

        $name = preg_match('#<span[^>]*class="[^"]*ltx_personname[^"]*"[^>]*>([^<]+)#i', $fragment, $person)
            ? CrawlAuthorParser::cleanText($person[1])
            : '';
        if ($name === '') {
            continue;
        }

        $email = preg_match('#mailto:([^"\'>\s]+)#i', $fragment, $mail)
            ? CrawlAuthorParser::normalizeEmail($mail[1])
            : null;

        $affiliation = preg_match('#<span[^>]*class="[^"]*ltx_author_affiliation[^"]*"[^>]*>([^<]+)#i', $fragment, $affMatch)
            ? CrawlAuthorParser::cleanText($affMatch[1])
            : null;

        $authors[] = [
            'name' => $name,
            'email' => $email,
            'affiliation' => $affiliation,
            'university_name' => CrawlAuthorParser::universityFromAffiliation($affiliation),
        ];
    }

    if ($authors === [] && preg_match_all(
        '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
        $html,
        $emails
    )) {
        // No author markup at all: keep the first address on the page as a nameless row.
        $university = self::firstUniversityInText($html);
        $authors[] = [
            'name' => '',
            'email' => CrawlAuthorParser::normalizeEmail($emails[1][0]),
            'affiliation' => $university,
            'university_name' => CrawlAuthorParser::universityFromAffiliation($university),
        ];
    }

    $identified = array_filter(
        $authors,
        static function ($row) {
            return ($row['name'] ?? '') !== '' || ($row['email'] ?? '') !== '';
        }
    );

    return array_values($identified);
}
|
|
|
|
/**
 * Build the PDF URL for a paper.
 *
 * An "arxiv.org/pdf/..." link found in the given markup is preferred. The
 * captured path stops at quotes, angle brackets, "?", "#" and whitespace, and
 * trailing sentence punctuation is trimmed, so single-quoted attributes and
 * prose mentions do not leak stray characters into the URL. Without a usable
 * link the URL is built from the arXiv id with any trailing "vN" version removed.
 *
 * @param string $htmlOrBlock HTML (or a fragment of it) that may contain a PDF link.
 * @param string $arxivId     arXiv identifier, with or without a version suffix.
 *
 * @return string|null Absolute https URL (the current implementation never returns null).
 */
public static function extractPdfUrl(string $htmlOrBlock, string $arxivId): ?string
{
    if (preg_match('~arxiv\.org/pdf/([^"\'<>?#\s]+)~i', $htmlOrBlock, $m)) {
        $path = rtrim($m[1], '.,;:)]');
        if ($path !== '') {
            return 'https://arxiv.org/pdf/'.$path;
        }
    }

    // Drop a trailing version marker; keep the id unchanged if nothing is left.
    $base = preg_replace('/v\d+$/i', '', $arxivId) ?: $arxivId;

    return 'https://arxiv.org/pdf/'.$base;
}
|
|
|
|
/**
 * Build the URL of a paper's HTML rendering.
 *
 * An "arxiv.org/html/..." link found in the given markup is preferred. The
 * captured path stops at quotes, angle brackets, "?", "#" and whitespace, and
 * trailing sentence punctuation is trimmed. Without a usable link the URL is
 * built from the arXiv id: an id that already ends in "vN" is used as is,
 * otherwise "v1" is appended.
 *
 * @param string $htmlOrBlock HTML (or a fragment of it) that may contain an HTML-version link.
 * @param string $arxivId     arXiv identifier, with or without a version suffix.
 *
 * @return string|null Absolute https URL (the current implementation never returns null).
 */
public static function extractHtmlUrl(string $htmlOrBlock, string $arxivId): ?string
{
    if (preg_match('~arxiv\.org/html/([^"\'<>?#\s]+)~i', $htmlOrBlock, $m)) {
        $path = rtrim($m[1], '.,;:)]');
        if ($path !== '') {
            return 'https://arxiv.org/html/'.$path;
        }
    }

    $hasVersion = preg_match('/v\d+$/i', $arxivId) === 1;

    return 'https://arxiv.org/html/'.($hasVersion ? $arxivId : $arxivId.'v1');
}
|
|
|
|
/**
 * Find the first ltx_personname span that follows a <div class="...ltx_authors...">
 * opening tag and hand its inner HTML to parseLtxPersonnameInner().
 *
 * @param string $html Full HTML of the paper's HTML version.
 *
 * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
 *         Empty when no such span is found.
 */
protected static function parseLtxPersonnameBlock(string $html): array
{
    $pattern = '#<div[^>]*class="[^"]*ltx_authors[^"]*"[^>]*>.*?<span[^>]*class="[^"]*ltx_personname[^"]*"[^>]*>(.*?)</span>#is';

    $matched = preg_match($pattern, $html, $personname);

    return $matched ? self::parseLtxPersonnameInner($personname[1]) : [];
}
|
|
|
|
/**
 * Turn the inner HTML of an ltx_personname span into author rows.
 *
 * The markup is split on <br> tags. The first segment holds the name(s); the
 * first later segment accepted by looksLikeAffiliation() becomes the affiliation
 * shared by every returned row. Names are separated on em/en/thin spaces or on
 * runs of two or more whitespace characters. Every row has a null email.
 *
 * @param string $innerHtml Markup found inside the personname span.
 *
 * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
 */
protected static function parseLtxPersonnameInner(string $innerHtml): array
{
    $segments = preg_split('#<br[^>]*>#i', $innerHtml);
    if (! $segments) {
        $segments = [];
    }

    $rawNames = strip_tags($segments[0] ?? '');
    $cleanNames = CrawlAuthorParser::cleanText($rawNames) ?? '';

    // The first affiliation-looking line after the names applies to all authors.
    $sharedAffiliation = null;
    foreach (array_slice($segments, 1) as $segment) {
        $candidate = CrawlAuthorParser::cleanText(strip_tags($segment));
        if ($candidate === null || $candidate === '' || ! self::looksLikeAffiliation($candidate)) {
            continue;
        }
        $sharedAffiliation = $candidate;
        break;
    }

    if ($cleanNames === '') {
        return [];
    }

    // Split on U+2003 / U+2002 / U+2009 or on two or more whitespace characters.
    $pieces = preg_split('/[\x{2003}\x{2002}\x{2009}]|\s{2,}/u', $rawNames) ?: [];
    $cleanedPieces = array_map(
        fn ($piece) => CrawlAuthorParser::cleanText($piece) ?? '',
        $pieces
    );
    $nameList = array_values(array_filter($cleanedPieces));

    // Heuristic: when no separator was found and the text ends in two capitalised
    // words, treat those two words as a second author.
    // NOTE(review): this also splits a three-word single name ("Mary Jane Smith")
    // into "Mary" and "Jane Smith" — confirm that this is intended.
    $endsWithTwoWords = preg_match('/\s+[A-Z][a-z]+\s+[A-Z][a-z]+\s*$/u', $cleanNames);
    if (count($nameList) <= 1 && $endsWithTwoWords) {
        $nameList = preg_split('/\s+(?=[A-Z][a-z]+\s+[A-Z][a-z]+\s*$)/u', $cleanNames, 2) ?: [$cleanNames];
    }

    if ($nameList === []) {
        $nameList = [$cleanNames];
    }

    $authors = [];
    foreach ($nameList as $entry) {
        $entry = CrawlAuthorParser::cleanText($entry) ?? '';
        if ($entry !== '') {
            $authors[] = [
                'name' => $entry,
                'email' => null,
                'affiliation' => $sharedAffiliation,
                'university_name' => CrawlAuthorParser::universityFromAffiliation($sharedAffiliation),
            ];
        }
    }

    return $authors;
}
|
|
|
|
/**
 * Decide whether a line of text names an institution.
 *
 * The keywords (大学, 学院, 研究院, 研究所, University, College, Institute, School,
 * Jerusalem, Laborato...) are matched case-insensitively anywhere in the text.
 * "Lab" is stricter, because a case-insensitive substring match also hits
 * ordinary words such as "Collaboration" or "available". It counts only when
 * written with a capital L ("MediaLab", "Lab") or as the standalone word
 * "lab"/"labs" in any case.
 *
 * @param string $text Plain text of a single line.
 *
 * @return bool True when the text contains one of the institution keywords.
 */
protected static function looksLikeAffiliation(string $text): bool
{
    return (bool) preg_match(
        '/(?:大学|学院|研究院|研究所|University|College|Institute|School|Jerusalem|Laborato|(?-i:Lab)|\blabs?\b)/iu',
        $text
    );
}
|
|
|
|
/**
 * Build author rows (name only) from a list of raw name strings.
 *
 * Each name is HTML-entity-decoded and passed through
 * CrawlAuthorParser::cleanText(). Names that end up empty, or that end with a
 * colon (label text such as "Authors:"), are skipped. A null result from
 * cleanText() is treated like an empty name, as the other call sites in this
 * class do, so it never reaches str_ends_with() and never produces a row with
 * a null name.
 *
 * @param list<string> $names    Raw names, possibly containing HTML entities.
 * @param string       $fullHtml Not used by this method; kept for signature compatibility.
 *
 * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
 */
protected static function rowsFromNames(array $names, string $fullHtml): array
{
    $rows = [];
    foreach ($names as $rawName) {
        $name = CrawlAuthorParser::cleanText(
            html_entity_decode($rawName, ENT_QUOTES | ENT_HTML5, 'UTF-8')
        ) ?? '';

        if ($name === '' || str_ends_with($name, ':')) {
            continue;
        }

        $rows[] = [
            'name' => $name,
            'email' => null,
            'affiliation' => null,
            'university_name' => null,
        ];
    }

    return $rows;
}
|
|
|
|
/**
 * Return the first university-like phrase found in the text.
 *
 * Two shapes are recognised: 2-50 CJK/Latin characters ending in 大学, 学院,
 * 研究院 or 研究所; or the word University, College or Institute followed by up
 * to 60 word, space, comma, dot or dash characters. The match is passed through
 * CrawlAuthorParser::cleanText() before it is returned.
 *
 * @param string $text Text or HTML to search.
 *
 * @return string|null The cleaned phrase, or null when nothing matches.
 */
protected static function firstUniversityInText(string $text): ?string
{
    $pattern = '/((?:[\x{4e00}-\x{9fff}A-Za-z\s,&.-]{2,50}(?:大学|学院|研究院|研究所))|(?:University|College|Institute)[\s\w,.-]{0,60})/u';

    if (! preg_match($pattern, $text, $hit)) {
        return null;
    }

    return CrawlAuthorParser::cleanText($hit[1]);
}
|
|
|
|
/**
 * Combine a year, an English month name and a day into "YYYY-MM-DD".
 *
 * The month is looked up in self::MONTHS after lower-casing and removing a
 * trailing dot. The result is checked against the calendar with checkdate(),
 * so inputs such as day 45 or "30 February" yield null instead of an
 * impossible date.
 *
 * @param string $year      Year digits, as captured from the text.
 * @param string $monthName Full or abbreviated English month name.
 * @param string $day       Day-of-month digits.
 *
 * @return string|null The formatted date, or null for an unknown month or an
 *                     invalid calendar date.
 */
protected static function toYmd(string $year, string $monthName, string $day): ?string
{
    $month = self::MONTHS[strtolower(rtrim($monthName, '.'))] ?? null;
    if ($month === null) {
        return null;
    }

    if (! checkdate((int) $month, (int) $day, (int) $year)) {
        return null;
    }

    return sprintf('%s-%s-%02d', $year, $month, (int) $day);
}
|
|
}
|