*/
    private const MONTHS = [
        'january' => '01',
        'february' => '02',
        'march' => '03',
        'april' => '04',
        'may' => '05',
        'june' => '06',
        'july' => '07',
        'august' => '08',
        'september' => '09',
        'october' => '10',
        'november' => '11',
        'december' => '12',
        'jan' => '01',
        'feb' => '02',
        'mar' => '03',
        'apr' => '04',
        'jun' => '06',
        'jul' => '07',
        'aug' => '08',
        'sep' => '09',
        'sept' => '09',
        'oct' => '10',
        'nov' => '11',
        'dec' => '12',
    ];

    /**
     * Extract a publication date from page text and return it as `YYYY-MM-DD`.
     *
     * Order of attempts:
     *  1. a "Generated on <weekday> <month> <day> <time> <year>" stamp in the raw input;
     *  2. after stripping tags, decoding entities and collapsing whitespace, the first
     *     numeric `YYYY-MM-DD` that is a real calendar date;
     *  3. the "Generated on" stamp again on the cleaned text, then the day-first
     *     textual layouts ("[v1] Mon, 1 Jan 2024", "Submitted on 1 Jan 2024", ...).
     *
     * @param  string|null  $text  Raw HTML or plain text; null or blank yields null.
     * @return string|null  The date, or null when no valid date is found.
     */
    public static function parsePublishedDate(?string $text): ?string
    {
        if ($text === null || trim($text) === '') {
            return null;
        }

        // "Generated on Thu May 28 ... 2026" is the only layout captured in month, day, year order.
        $generatedOn = '/Generated on\s+(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s+([A-Za-z]+)\s+(\d{1,2})\s+\d{1,2}:\d{2}:\d{2}\s+(\d{4})/i';

        // First attempt runs on the untouched input, before tags are stripped.
        if (preg_match($generatedOn, $text, $gen)) {
            $date = self::toYmd($gen[3], $gen[1], $gen[2]);
            if ($date !== null) {
                return $date;
            }
        }

        $text = html_entity_decode(strip_tags($text), ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $text = preg_replace('/\s+/u', ' ', $text) ?? '';

        // A "YYYY-MM-DDTHH:MM" timestamp is covered by this pattern as well, so no
        // separate check is needed. Candidates that are not calendar dates are skipped.
        if (preg_match_all('/(\d{4})-(\d{2})-(\d{2})/', $text, $isoCandidates, PREG_SET_ORDER)) {
            foreach ($isoCandidates as $iso) {
                if (checkdate((int) $iso[2], (int) $iso[3], (int) $iso[1])) {
                    return sprintf('%s-%s-%s', $iso[1], $iso[2], $iso[3]);
                }
            }
        }

        // Month-first stamp again, this time on the cleaned text.
        if (preg_match($generatedOn, $text, $gen)) {
            $date = self::toYmd($gen[3], $gen[1], $gen[2]);
            if ($date !== null) {
                return $date;
            }
        }

        // Every remaining layout captures day, month name, year in that order.
        $dayFirstPatterns = [
            '/\[v\d+\]\s*(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),?\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})/i',
            '/\[Submitted on\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\]/i',
            '/Submitted\s+on\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})/i',
            '/Submitted\s+(\d{1,2})\s+([A-Za-z]+),\s+(\d{4})/i',
            '/(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),?\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\s+\d{1,2}:\d{2}:\d{2}\s+UTC/i',
            '/(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\s+\d{1,2}:\d{2}:\d{2}\s+UTC/i',
        ];

        foreach ($dayFirstPatterns as $pattern) {
            if (! preg_match($pattern, $text, $m)) {
                continue;
            }

            $date = self::toYmd($m[3], $m[2], $m[1]);
            if ($date !== null) {
                return $date;
            }
        }

        return null;
    }

    /**
     * Build author rows from an abstract page.
     *
     * Names are taken from the first source that yields any: mailto links, links whose
     * href contains `searchtype=author`, elements with a `link-author` class, then two
     * looser fallbacks. Emails and affiliations are afterwards attached to rows by
     * position. When no authors block is found, a single `citation_author` value is
     * used instead.
     *
     * NOTE(review): several patterns below begin with `]*` or consist only of a capture
     * group, which looks like the tag names were lost somewhere upstream — confirm
     * against the canonical file before relying on them.
     *
     * @param  string  $html  Full page HTML.
     * @return list<array<string, mixed>>  Rows with keys name, email, affiliation,
     *                                     university_name; rows without a name are dropped.
     */
    public static function parseAuthorsFromAbsHtml(string $html): array
    {
        $block = '';
        if (preg_match('#]*class="[^"]*authors[^"]*"[^>]*>(.*?)#is', $html, $m)) {
            $block = $m[1];
        } elseif (preg_match('#]*name="citation_author"[^>]*content="([^"]+)"#i', $html, $meta)) {
            return self::rowsFromNames([$meta[1]], $html);
        }

        if ($block === '') {
            return [];
        }

        $rows = [];

        // mailto links carry both the display name and the address.
        if (preg_match_all('#]*href="mailto:([^"]+)"[^>]*>([^<]*)#i', $block, $mailto, PREG_SET_ORDER)) {
            foreach ($mailto as $link) {
                $rows[] = [
                    // Fall back to the address itself when the link text is empty.
                    'name' => CrawlAuthorParser::cleanText($link[2]) ?: CrawlAuthorParser::cleanText($link[1]),
                    'email' => CrawlAuthorParser::normalizeEmail($link[1]),
                    'affiliation' => null,
                    'university_name' => null,
                ];
            }
        }

        if ($rows === [] && preg_match_all(
            '#]*href="[^"]*searchtype=author[^"]*"[^>]*>([^<]+)#i',
            $block,
            $links
        )) {
            $rows = self::rowsFromNames($links[1], $html);
        }

        if ($rows === [] && preg_match_all('#]*class="[^"]*link-author[^"]*"[^>]*>([^<]+)#i', $block, $links)) {
            $rows = self::rowsFromNames($links[1], $html);
        }

        if ($rows === [] && preg_match_all('#([^<]*)#', $block, $names)) {
            $rows = self::rowsFromNames($names[1], $html);
        }

        if ($rows === [] && preg_match_all('#]*>([^<]+)#', $block, $links)) {
            $names = [];
            foreach ($links[1] as $name) {
                $name = trim($name);
                // Skip blanks and anything mentioning ORCID.
                if ($name !== '' && ! str_contains(strtolower($name), 'orcid')) {
                    $names[] = $name;
                }
            }
            $rows = self::rowsFromNames($names, $html);
        }

        // The n-th address found in the block is assigned to the n-th row.
        if (preg_match_all('#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#', $block, $emails)) {
            foreach ($emails[1] as $i => $email) {
                if (isset($rows[$i])) {
                    $rows[$i]['email'] = CrawlAuthorParser::normalizeEmail($email);
                }
            }
        }

        $affiliations = [];
        if (preg_match_all('#([^<]+)#', $html, $affs)) {
            $affiliations = array_map(
                fn ($a) => CrawlAuthorParser::cleanText(html_entity_decode($a, ENT_QUOTES | ENT_HTML5, 'UTF-8')),
                $affs[1],
            );
        }

        // Otherwise look inside the authors block for text ending in an institution keyword.
        if ($affiliations === [] && preg_match_all(
            '/((?:[\x{4e00}-\x{9fff}A-Za-z\s,&.-]{2,60}(?:大学|学院|研究院|研究所|University|College|Institute|School)[^.;]{0,80}))/u',
            $block,
            $inlineAff
        )) {
            $affiliations = array_map(fn ($a) => CrawlAuthorParser::cleanText($a), $inlineAff[1]);
        }

        // Affiliations are paired with rows by position as well.
        foreach ($affiliations as $i => $aff) {
            if (! isset($rows[$i])) {
                continue;
            }
            $rows[$i]['affiliation'] = $aff;
            $rows[$i]['university_name'] = CrawlAuthorParser::universityFromAffiliation($aff);
        }

        // Last resort for the first author: any institution-like text in block or page.
        if ($rows !== [] && ($rows[0]['university_name'] ?? null) === null) {
            $uni = self::firstUniversityInText($block.' '.$html);
            if ($uni !== null) {
                $rows[0]['affiliation'] ??= $uni;
                $rows[0]['university_name'] = CrawlAuthorParser::universityFromAffiliation($uni);
            }
        }

        return array_values(array_filter($rows, fn ($r) => ($r['name'] ?? '') !== ''));
    }

    /**
     * Build author rows from the HTML rendering of a paper (`ltx_*` class markup).
     *
     * Tries the combined personname block first, then each `ltx_author` chunk, and
     * finally falls back to a single nameless row carrying the first email address
     * found anywhere in the page.
     *
     * @param  string  $html  Full page HTML.
     * @return list<array<string, mixed>>  Rows with keys name, email, affiliation,
     *                                     university_name; rows lacking both name and
     *                                     email are dropped.
     */
    public static function parseAuthorsFromHtmlVersion(string $html): array
    {
        $rows = self::parseLtxPersonnameBlock($html);
        if ($rows !== []) {
            return $rows;
        }

        if (preg_match_all(
            '#]*class="[^"]*ltx_author[^"]*"[^>]*>(.*?)#is',
            $html,
            $blocks
        )) {
            foreach ($blocks[1] as $chunk) {
                $name = '';

                // A personname element with inner markup is handed to the multi-name parser.
                if (preg_match('#]*class="[^"]*ltx_personname[^"]*"[^>]*>(.*?)#is', $chunk, $n)) {
                    $parsed = self::parseLtxPersonnameInner($n[1]);
                    if ($parsed !== []) {
                        $rows = array_merge($rows, $parsed);
                    }

                    continue;
                }

                if (preg_match('#]*class="[^"]*ltx_personname[^"]*"[^>]*>([^<]+)#i', $chunk, $n)) {
                    $name = CrawlAuthorParser::cleanText($n[1]);
                }
                if ($name === '') {
                    continue;
                }

                $email = null;
                if (preg_match('#mailto:([^"\'>\s]+)#i', $chunk, $em)) {
                    $email = CrawlAuthorParser::normalizeEmail($em[1]);
                }

                $aff = null;
                if (preg_match('#]*class="[^"]*ltx_author_affiliation[^"]*"[^>]*>([^<]+)#i', $chunk, $af)) {
                    $aff = CrawlAuthorParser::cleanText($af[1]);
                }

                $rows[] = [
                    'name' => $name,
                    'email' => $email,
                    'affiliation' => $aff,
                    'university_name' => CrawlAuthorParser::universityFromAffiliation($aff),
                ];
            }
        }

        // Nothing structured was found: keep at least the first email on the page.
        if ($rows === [] && preg_match(
            '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
            $html,
            $firstEmail
        )) {
            $university = self::firstUniversityInText($html);
            $rows[] = [
                'name' => '',
                'email' => CrawlAuthorParser::normalizeEmail($firstEmail[1]),
                'affiliation' => $university,
                'university_name' => CrawlAuthorParser::universityFromAffiliation($university),
            ];
        }

        return array_values(array_filter(
            $rows,
            fn ($r) => ($r['name'] ?? '') !== '' || ($r['email'] ?? '') !== ''
        ));
    }

    /**
     * Return the PDF URL for a paper.
     *
     * Prefers an `arxiv.org/pdf/...` link present in the given markup; otherwise builds
     * one from the id with any trailing `vN` version suffix removed.
     *
     * @param  string  $htmlOrBlock  Markup to search for an existing PDF link.
     * @param  string  $arxivId      Paper id, optionally ending in a version suffix.
     */
    public static function extractPdfUrl(string $htmlOrBlock, string $arxivId): ?string
    {
        if (preg_match('#arxiv\.org/pdf/([^"?\s]+)#i', $htmlOrBlock, $m)) {
            return 'https://arxiv.org/pdf/'.$m[1];
        }

        // Keep the original id if stripping the suffix leaves nothing usable.
        $base = preg_replace('/v\d+$/i', '', $arxivId) ?: $arxivId;

        return 'https://arxiv.org/pdf/'.$base;
    }

    /**
     * Return the HTML-rendering URL for a paper.
     *
     * Prefers an `arxiv.org/html/...` link present in the given markup. Otherwise the
     * id is used as-is when it already ends in `vN`, and `v1` is appended when it does not.
     *
     * @param  string  $htmlOrBlock  Markup to search for an existing HTML link.
     * @param  string  $arxivId      Paper id, optionally ending in a version suffix.
     */
    public static function extractHtmlUrl(string $htmlOrBlock, string $arxivId): ?string
    {
        if (preg_match('#arxiv\.org/html/([^"?\s]+)#i', $htmlOrBlock, $m)) {
            return 'https://arxiv.org/html/'.$m[1];
        }

        if (preg_match('/v\d+$/i', $arxivId)) {
            return 'https://arxiv.org/html/'.$arxivId;
        }

        return 'https://arxiv.org/html/'.$arxivId.'v1';
    }

    /**
     * Locate the first `ltx_personname` content inside an `ltx_authors` container and
     * parse it into author rows.
     *
     * @return list<array<string, mixed>>  Empty when the container is not found.
     */
    protected static function parseLtxPersonnameBlock(string $html): array
    {
        if (! preg_match(
            '#]*class="[^"]*ltx_authors[^"]*"[^>]*>.*?]*class="[^"]*ltx_personname[^"]*"[^>]*>(.*?)#is',
            $html,
            $m
        )) {
            return [];
        }

        return self::parseLtxPersonnameInner($m[1]);
    }

    /**
     * Split the inner markup of a personname element into author rows.
     *
     * The first segment holds the names; the first later segment that looks like an
     * affiliation is applied to every returned row.
     *
     * NOTE(review): the segment separator pattern `#]*>#i` appears truncated (possibly
     * a line-break tag originally) — confirm against the canonical file.
     *
     * @return list<array<string, mixed>>  Rows with email always null.
     */
    protected static function parseLtxPersonnameInner(string $innerHtml): array
    {
        $parts = preg_split('#]*>#i', $innerHtml) ?: [];
        $namesRaw = strip_tags($parts[0] ?? '');
        $namesPart = CrawlAuthorParser::cleanText($namesRaw) ?? '';

        $affiliation = null;
        foreach (array_slice($parts, 1) as $part) {
            $text = CrawlAuthorParser::cleanText(strip_tags($part));
            if ($text !== null && $text !== '' && self::looksLikeAffiliation($text)) {
                $affiliation = $text;
                break;
            }
        }

        if ($namesPart === '') {
            return [];
        }

        // Names are separated by em/en/thin spaces or by runs of two or more whitespace characters.
        $names = preg_split('/[\x{2003}\x{2002}\x{2009}]|\s{2,}/u', $namesRaw) ?: [];
        $names = array_values(array_filter(array_map(
            fn ($n) => CrawlAuthorParser::cleanText($n) ?? '',
            $names
        )));

        // Still one name but the text ends in "Capitalised Capitalised": split that
        // trailing pair off as a second author.
        if (count($names) <= 1 && preg_match('/\s+[A-Z][a-z]+\s+[A-Z][a-z]+\s*$/u', $namesPart)) {
            $names = preg_split('/\s+(?=[A-Z][a-z]+\s+[A-Z][a-z]+\s*$)/u', $namesPart, 2) ?: [$namesPart];
        }

        if ($names === []) {
            $names = [$namesPart];
        }

        $rows = [];
        foreach ($names as $name) {
            $name = CrawlAuthorParser::cleanText($name) ?? '';
            if ($name === '') {
                continue;
            }

            $rows[] = [
                'name' => $name,
                'email' => null,
                'affiliation' => $affiliation,
                'university_name' => CrawlAuthorParser::universityFromAffiliation($affiliation),
            ];
        }

        return $rows;
    }

    /**
     * Tell whether the text contains one of the institution keywords
     * (case-insensitive, English or Chinese).
     */
    protected static function looksLikeAffiliation(string $text): bool
    {
        return (bool) preg_match(
            '/(?:大学|学院|研究院|研究所|University|College|Institute|School|Jerusalem|Laboratory|Lab)/iu',
            $text
        );
    }

    /**
     * Turn a list of raw names into author rows with every other field null.
     *
     * Names are entity-decoded and cleaned; empty names and names ending in ':' are skipped.
     *
     * @param  list<string>  $names
     * @param  string  $fullHtml  Not used by the current implementation.
     * @return list<array<string, mixed>>
     */
    protected static function rowsFromNames(array $names, string $fullHtml): array
    {
        $rows = [];
        foreach ($names as $name) {
            // cleanText() is treated as nullable elsewhere in this class; coalesce so a
            // null result is skipped here rather than handed to str_ends_with().
            $name = CrawlAuthorParser::cleanText(html_entity_decode($name, ENT_QUOTES | ENT_HTML5, 'UTF-8')) ?? '';
            if ($name === '' || str_ends_with($name, ':')) {
                continue;
            }

            $rows[] = [
                'name' => $name,
                'email' => null,
                'affiliation' => null,
                'university_name' => null,
            ];
        }

        return $rows;
    }

    /**
     * Return the first institution-looking phrase in the text, cleaned, or null.
     *
     * Matches either text ending in a Chinese institution keyword, or text starting
     * with "University", "College" or "Institute".
     */
    protected static function firstUniversityInText(string $text): ?string
    {
        if (preg_match(
            '/((?:[\x{4e00}-\x{9fff}A-Za-z\s,&.-]{2,50}(?:大学|学院|研究院|研究所))|(?:University|College|Institute)[\s\w,.-]{0,60})/u',
            $text,
            $m
        )) {
            return CrawlAuthorParser::cleanText($m[1]);
        }

        return null;
    }

    /**
     * Format year, English month name and day as `YYYY-MM-DD`.
     *
     * @param  string  $year       Four-digit year.
     * @param  string  $monthName  Full or abbreviated month name, any case, trailing dot allowed.
     * @param  string  $day        Day of month, one or two digits.
     * @return string|null  Null when the month name is unknown or the combination is
     *                      not a real calendar date (e.g. 31 February, day 0).
     */
    protected static function toYmd(string $year, string $monthName, string $day): ?string
    {
        $month = self::MONTHS[strtolower(rtrim($monthName, '.'))] ?? null;
        if ($month === null) {
            return null;
        }

        if (! checkdate((int) $month, (int) $day, (int) $year)) {
            return null;
        }

        return sprintf('%s-%s-%02d', $year, $month, (int) $day);
    }
}