You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

381 lines
14 KiB

<?php
namespace App\Services\Crawl;
/**
 * Parses publication dates and author metadata from arXiv search-result blocks
 * and from abs / html page text.
 */
class ArxivMetadataParser
{
    /** @var array<string, string> Lower-case English month name or abbreviation => two-digit month number. */
    private const MONTHS = [
        'january' => '01', 'february' => '02', 'march' => '03', 'april' => '04',
        'may' => '05', 'june' => '06', 'july' => '07', 'august' => '08',
        'september' => '09', 'october' => '10', 'november' => '11', 'december' => '12',
        'jan' => '01', 'feb' => '02', 'mar' => '03', 'apr' => '04',
        'jun' => '06', 'jul' => '07', 'aug' => '08', 'sep' => '09', 'sept' => '09',
        'oct' => '10', 'nov' => '11', 'dec' => '12',
    ];

    /**
     * Matches text such as "Generated on Thu May 28 12:34:56 2026".
     * Capture groups: 1 = month name, 2 = day, 3 = year.
     */
    private const GENERATED_ON_PATTERN =
        '/Generated on\s+(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s+([A-Za-z]+)\s+(\d{1,2})\s+\d{1,2}:\d{2}:\d{2}\s+(\d{4})/i';

    /**
     * Textual date patterns, tried in this order.
     * Capture groups for every entry: 1 = day, 2 = month name, 3 = year.
     *
     * @var list<string>
     */
    private const DAY_MONTH_YEAR_PATTERNS = [
        '/\[v\d+\]\s*(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),?\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})/i',
        '/\[Submitted on\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\]/i',
        '/Submitted\s+on\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})/i',
        '/Submitted\s+(\d{1,2})\s+([A-Za-z]+),\s+(\d{4})/i',
        '/(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),?\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\s+\d{1,2}:\d{2}:\d{2}\s+UTC/i',
        '/(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\s+\d{1,2}:\d{2}:\d{2}\s+UTC/i',
    ];

    /**
     * Extracts a publication date from free text or HTML.
     *
     * Attempts, in order:
     *  1. a "Generated on <weekday> <month> <day> <time> <year>" stamp in the raw input;
     *  2. the first calendar-valid ISO "YYYY-MM-DD" date in the tag-stripped,
     *     entity-decoded, whitespace-collapsed text (this also covers
     *     "YYYY-MM-DDTHH:MM" timestamps, whose date part is the same match);
     *  3. the "Generated on" stamp again, now on the normalised text;
     *  4. the textual patterns in DAY_MONTH_YEAR_PATTERNS.
     *
     * @param  string|null  $text  Raw text or HTML fragment.
     * @return string|null  Date formatted as "YYYY-MM-DD", or null when nothing valid is found.
     */
    public static function parsePublishedDate(?string $text): ?string
    {
        if ($text === null || trim($text) === '') {
            return null;
        }

        $generated = self::matchGeneratedOn($text);
        if ($generated !== null) {
            return $generated;
        }

        $text = html_entity_decode(strip_tags($text), ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $text = preg_replace('/\s+/u', ' ', $text) ?? '';

        // Digit look-arounds keep the pattern from biting into longer digit runs;
        // checkdate() skips candidates such as "2024-13-45".
        if (preg_match_all('/(?<!\d)(\d{4})-(\d{2})-(\d{2})(?!\d)/', $text, $isoCandidates, PREG_SET_ORDER)) {
            foreach ($isoCandidates as $iso) {
                if (checkdate((int) $iso[2], (int) $iso[3], (int) $iso[1])) {
                    return sprintf('%s-%s-%s', $iso[1], $iso[2], $iso[3]);
                }
            }
        }

        $generated = self::matchGeneratedOn($text);
        if ($generated !== null) {
            return $generated;
        }

        foreach (self::DAY_MONTH_YEAR_PATTERNS as $pattern) {
            if (! preg_match($pattern, $text, $m)) {
                continue;
            }
            $date = self::toYmd($m[3], $m[2], $m[1]);
            if ($date !== null) {
                return $date;
            }
        }

        return null;
    }

    /**
     * Extracts author rows from an arXiv abs page.
     *
     * Names are taken from the first <div> whose class contains "authors"; when no such
     * div exists, every <meta name="citation_author"> tag is used instead. Inside the div
     * the name sources are tried in order: mailto links, author-search links, "link-author"
     * anchors, "descriptor" spans, then any anchor whose text does not mention ORCID.
     * E-mail addresses and affiliations are attached to rows by position.
     *
     * @param  string  $html  Full page HTML.
     * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
     */
    public static function parseAuthorsFromAbsHtml(string $html): array
    {
        $block = '';
        if (preg_match('#<div[^>]*class="[^"]*authors[^"]*"[^>]*>(.*?)</div>#is', $html, $m)) {
            $block = $m[1];
        } elseif (preg_match_all('#<meta[^>]*name="citation_author"[^>]*content="([^"]+)"#i', $html, $meta)) {
            // One meta tag per author: collect all of them, not just the first.
            return self::rowsFromNames($meta[1], $html);
        }
        if ($block === '') {
            return [];
        }

        $rows = [];
        if (preg_match_all('#<a[^>]*href="mailto:([^"]+)"[^>]*>([^<]*)</a>#i', $block, $mailto, PREG_SET_ORDER)) {
            foreach ($mailto as $link) {
                $rows[] = [
                    // Fall back to the address itself when the link has no visible text.
                    'name' => CrawlAuthorParser::cleanText($link[2]) ?: CrawlAuthorParser::cleanText($link[1]),
                    'email' => CrawlAuthorParser::normalizeEmail($link[1]),
                    'affiliation' => null,
                    'university_name' => null,
                ];
            }
        }
        if ($rows === [] && preg_match_all(
            '#<a[^>]*href="[^"]*searchtype=author[^"]*"[^>]*>([^<]+)</a>#i',
            $block,
            $links
        )) {
            $rows = self::rowsFromNames($links[1], $html);
        }
        if ($rows === [] && preg_match_all('#<a[^>]*class="[^"]*link-author[^"]*"[^>]*>([^<]+)</a>#i', $block, $links)) {
            $rows = self::rowsFromNames($links[1], $html);
        }
        if ($rows === [] && preg_match_all('#<span class="descriptor">([^<]*)</span>#', $block, $names)) {
            // rowsFromNames() drops entries ending in ":" (label-style text).
            $rows = self::rowsFromNames($names[1], $html);
        }
        if ($rows === [] && preg_match_all('#<a[^>]*>([^<]+)</a>#', $block, $links)) {
            $candidates = [];
            foreach ($links[1] as $label) {
                $label = trim($label);
                if ($label !== '' && ! str_contains(strtolower($label), 'orcid')) {
                    $candidates[] = $label;
                }
            }
            $rows = self::rowsFromNames($candidates, $html);
        }

        // The n-th address found in the block is assigned to the n-th row.
        if (preg_match_all('#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#', $block, $emails)) {
            foreach ($emails[1] as $i => $email) {
                if (isset($rows[$i])) {
                    $rows[$i]['email'] = CrawlAuthorParser::normalizeEmail($email);
                }
            }
        }

        $affiliations = [];
        if (preg_match_all('#<span class="affiliation">([^<]+)</span>#', $html, $affs)) {
            $affiliations = array_map(
                static fn ($a) => CrawlAuthorParser::cleanText(html_entity_decode($a, ENT_QUOTES | ENT_HTML5, 'UTF-8')),
                $affs[1],
            );
        }
        if ($affiliations === [] && preg_match_all(
            '/((?:[\x{4e00}-\x{9fff}A-Za-z\s,&.-]{2,60}(?:大学|学院|研究院|研究所|University|College|Institute|School)[^.;]{0,80}))/u',
            $block,
            $inlineAff
        )) {
            $affiliations = array_map(static fn ($a) => CrawlAuthorParser::cleanText($a), $inlineAff[1]);
        }
        // Affiliations are paired with rows by position as well.
        foreach ($affiliations as $i => $aff) {
            if (! isset($rows[$i])) {
                continue;
            }
            $rows[$i]['affiliation'] = $aff;
            $rows[$i]['university_name'] = CrawlAuthorParser::universityFromAffiliation($aff);
        }

        // Last resort for the first author: any institution-looking phrase on the page.
        if ($rows !== [] && ($rows[0]['university_name'] ?? null) === null) {
            $uni = self::firstUniversityInText($block.' '.$html);
            if ($uni !== null) {
                $rows[0]['affiliation'] = $rows[0]['affiliation'] ?? $uni;
                $rows[0]['university_name'] = CrawlAuthorParser::universityFromAffiliation($uni);
            }
        }

        return array_values(array_filter($rows, static fn ($r) => ($r['name'] ?? '') !== ''));
    }

    /**
     * Extracts author rows from an HTML-version page that uses "ltx_*" class names.
     *
     * Tries the first "ltx_personname" span inside the "ltx_authors" div, then each
     * "ltx_author" span, and finally falls back to a single nameless row built from the
     * first e-mail address found anywhere in the document.
     *
     * @param  string  $html  Full page HTML.
     * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
     *         Rows that have neither a name nor an e-mail are removed.
     */
    public static function parseAuthorsFromHtmlVersion(string $html): array
    {
        $rows = self::parseLtxPersonnameBlock($html);
        if ($rows !== []) {
            return $rows;
        }

        if (preg_match_all(
            '#<span[^>]*class="[^"]*ltx_author[^"]*"[^>]*>(.*?)</span>#is',
            $html,
            $blocks
        )) {
            foreach ($blocks[1] as $chunk) {
                // NOTE(review): $chunk is captured lazily up to the first "</span>", so it
                // cannot itself contain a closing span; this branch therefore looks
                // unreachable. Kept unchanged pending a proper nested-span parser.
                if (preg_match('#<span[^>]*class="[^"]*ltx_personname[^"]*"[^>]*>(.*?)</span>#is', $chunk, $n)) {
                    $rows = array_merge($rows, self::parseLtxPersonnameInner($n[1]));
                    continue;
                }

                $name = '';
                if (preg_match('#<span[^>]*class="[^"]*ltx_personname[^"]*"[^>]*>([^<]+)#i', $chunk, $n)) {
                    // cleanText() is treated as nullable elsewhere in this class; normalise here too.
                    $name = CrawlAuthorParser::cleanText($n[1]) ?? '';
                }
                if ($name === '') {
                    continue;
                }

                $email = null;
                if (preg_match('#mailto:([^"\'>\s]+)#i', $chunk, $em)) {
                    $email = CrawlAuthorParser::normalizeEmail($em[1]);
                }

                $aff = null;
                if (preg_match('#<span[^>]*class="[^"]*ltx_author_affiliation[^"]*"[^>]*>([^<]+)#i', $chunk, $af)) {
                    $aff = CrawlAuthorParser::cleanText($af[1]);
                }

                $rows[] = [
                    'name' => $name,
                    'email' => $email,
                    'affiliation' => $aff,
                    'university_name' => CrawlAuthorParser::universityFromAffiliation($aff),
                ];
            }
        }

        if ($rows === [] && preg_match_all(
            '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
            $html,
            $emails
        )) {
            $university = self::firstUniversityInText($html);
            $rows[] = [
                'name' => '',
                'email' => CrawlAuthorParser::normalizeEmail($emails[1][0]),
                'affiliation' => $university,
                'university_name' => CrawlAuthorParser::universityFromAffiliation($university),
            ];
        }

        return array_values(array_filter(
            $rows,
            static fn ($r) => ($r['name'] ?? '') !== '' || ($r['email'] ?? '') !== ''
        ));
    }

    /**
     * Returns the PDF URL for a paper.
     *
     * Uses the first "arxiv.org/pdf/..." link found in the markup; otherwise builds
     * the URL from $arxivId with any trailing "vN" version suffix removed.
     *
     * @param  string  $htmlOrBlock  Markup that may contain a PDF link.
     * @param  string  $arxivId      arXiv identifier, with or without a version suffix.
     */
    public static function extractPdfUrl(string $htmlOrBlock, string $arxivId): ?string
    {
        // Stop at quotes, "?", "#", angle brackets and whitespace so that trailing
        // markup (e.g. "</a>" or a closing single quote) is not swallowed into the URL.
        if (preg_match('#arxiv\.org/pdf/([^"\'?\#\s<>]+)#i', $htmlOrBlock, $m)) {
            return 'https://arxiv.org/pdf/'.$m[1];
        }

        $base = preg_replace('/v\d+$/i', '', $arxivId) ?: $arxivId;

        return 'https://arxiv.org/pdf/'.$base;
    }

    /**
     * Returns the HTML-version URL for a paper.
     *
     * Uses the first "arxiv.org/html/..." link found in the markup; otherwise builds the
     * URL from $arxivId, keeping an existing "vN" suffix or appending "v1" when none is present.
     *
     * @param  string  $htmlOrBlock  Markup that may contain an HTML-version link.
     * @param  string  $arxivId      arXiv identifier, with or without a version suffix.
     */
    public static function extractHtmlUrl(string $htmlOrBlock, string $arxivId): ?string
    {
        // Same terminator set as extractPdfUrl(): never capture trailing markup or a fragment.
        if (preg_match('#arxiv\.org/html/([^"\'?\#\s<>]+)#i', $htmlOrBlock, $m)) {
            return 'https://arxiv.org/html/'.$m[1];
        }

        if (preg_match('/v\d+$/i', $arxivId)) {
            return 'https://arxiv.org/html/'.$arxivId;
        }

        return 'https://arxiv.org/html/'.$arxivId.'v1';
    }

    /**
     * Parses the first "ltx_personname" span that follows a "ltx_authors" div.
     *
     * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
     */
    protected static function parseLtxPersonnameBlock(string $html): array
    {
        $found = preg_match(
            '#<div[^>]*class="[^"]*ltx_authors[^"]*"[^>]*>.*?<span[^>]*class="[^"]*ltx_personname[^"]*"[^>]*>(.*?)</span>#is',
            $html,
            $m
        );

        return $found ? self::parseLtxPersonnameInner($m[1]) : [];
    }

    /**
     * Splits the inner HTML of a personname span into author rows.
     *
     * The text before the first <br> is treated as the name list; the first later
     * segment that looks like an affiliation is shared by every returned row.
     *
     * @param  string  $innerHtml  Markup found inside the personname span.
     * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
     */
    protected static function parseLtxPersonnameInner(string $innerHtml): array
    {
        $parts = preg_split('#<br[^>]*>#i', $innerHtml) ?: [];
        $namesRaw = strip_tags($parts[0] ?? '');
        $namesPart = CrawlAuthorParser::cleanText($namesRaw) ?? '';
        if ($namesPart === '') {
            return [];
        }

        $affiliation = null;
        foreach (array_slice($parts, 1) as $part) {
            $text = CrawlAuthorParser::cleanText(strip_tags($part));
            if ($text !== null && $text !== '' && self::looksLikeAffiliation($text)) {
                $affiliation = $text;
                break;
            }
        }

        // Names are separated by an em/en/thin space or by a run of two or more whitespace characters.
        $names = preg_split('/[\x{2003}\x{2002}\x{2009}]|\s{2,}/u', $namesRaw) ?: [];
        $names = array_values(array_filter(array_map(
            static fn ($n) => CrawlAuthorParser::cleanText($n) ?? '',
            $names
        )));

        // Heuristic for two authors separated by a single space ("John Smith Jane Doe"):
        // split off a trailing "Xxxx Yyyy" pair, but only when at least two tokens precede
        // it, so a single three-word name ("Mary Ann Smith") stays intact.
        if (count($names) <= 1
            && preg_match('/^(\S+(?:\s+\S+)+?)\s+([A-Z][a-z]+\s+[A-Z][a-z]+)\s*$/u', $namesPart, $pair)
        ) {
            $names = [$pair[1], $pair[2]];
        }
        if ($names === []) {
            $names = [$namesPart];
        }

        $rows = [];
        foreach ($names as $name) {
            $name = CrawlAuthorParser::cleanText($name) ?? '';
            if ($name === '') {
                continue;
            }
            $rows[] = [
                'name' => $name,
                'email' => null,
                'affiliation' => $affiliation,
                'university_name' => CrawlAuthorParser::universityFromAffiliation($affiliation),
            ];
        }

        return $rows;
    }

    /**
     * Reports whether the text contains one of the institution keywords
     * (Chinese or English, case-insensitive, matched anywhere in the string).
     */
    protected static function looksLikeAffiliation(string $text): bool
    {
        return (bool) preg_match(
            '/(?:大学|学院|研究院|研究所|University|College|Institute|School|Jerusalem|Laboratory|Lab)/iu',
            $text
        );
    }

    /**
     * Builds rows that carry only a name; e-mail and affiliation fields are null.
     *
     * Names are entity-decoded and cleaned. Empty names and names ending in ":" are skipped.
     *
     * @param  list<string>  $names
     * @param  string  $fullHtml  Not used by this method; kept so existing callers keep working.
     * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
     */
    protected static function rowsFromNames(array $names, string $fullHtml): array
    {
        $rows = [];
        foreach ($names as $name) {
            // cleanText() is treated as nullable elsewhere in this class; normalise so that
            // null never reaches str_ends_with() or ends up as a row name.
            $name = CrawlAuthorParser::cleanText(html_entity_decode($name, ENT_QUOTES | ENT_HTML5, 'UTF-8')) ?? '';
            if ($name === '' || str_ends_with($name, ':')) {
                continue;
            }
            $rows[] = [
                'name' => $name,
                'email' => null,
                'affiliation' => null,
                'university_name' => null,
            ];
        }

        return $rows;
    }

    /**
     * Returns the first institution-looking phrase in the text, or null when none is found.
     *
     * Matches either text ending in a Chinese institution suffix, or text starting
     * with "University", "College" or "Institute".
     */
    protected static function firstUniversityInText(string $text): ?string
    {
        $found = preg_match(
            '/((?:[\x{4e00}-\x{9fff}A-Za-z\s,&.-]{2,50}(?:大学|学院|研究院|研究所))|(?:University|College|Institute)[\s\w,.-]{0,60})/u',
            $text,
            $m
        );

        return $found ? CrawlAuthorParser::cleanText($m[1]) : null;
    }

    /**
     * Formats a year / month-name / day triple as "YYYY-MM-DD".
     *
     * @param  string  $year       Year digits.
     * @param  string  $monthName  English month name or abbreviation; case and a trailing "." are ignored.
     * @param  string  $day        Day of month, with or without a leading zero.
     * @return string|null  Null when the month name is unknown or the date does not exist in the calendar.
     */
    protected static function toYmd(string $year, string $monthName, string $day): ?string
    {
        $month = self::MONTHS[strtolower(rtrim($monthName, '.'))] ?? null;
        if ($month === null) {
            return null;
        }

        // Reject impossible dates such as 31 February instead of emitting them.
        if (! checkdate((int) $month, (int) $day, (int) $year)) {
            return null;
        }

        return sprintf('%s-%s-%02d', $year, $month, (int) $day);
    }

    /**
     * Applies GENERATED_ON_PATTERN to the text and converts the match to "YYYY-MM-DD".
     *
     * @return string|null  Null when the stamp is absent or does not form a valid date.
     */
    private static function matchGeneratedOn(string $text): ?string
    {
        if (! preg_match(self::GENERATED_ON_PATTERN, $text, $m)) {
            return null;
        }

        return self::toYmd($m[3], $m[1], $m[2]);
    }
}