You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

96 lines
2.8 KiB

2 weeks ago
<?php
namespace App\Services\Crawl;
/**
 * Stateless helpers that normalise author metadata (names, e-mail addresses,
 * affiliations) gathered while crawling.
 *
 * Every method is static and free of side effects. Missing or unusable values
 * are reported as null (or an empty list) rather than by throwing.
 */
class CrawlAuthorParser
{
    /** Splits a flat author string into authors: ASCII and full-width semicolons. */
    private const AUTHOR_LIST_PATTERN = '/[;；]/u';

    /** Cuts the first name out of a flat author string: semicolons and commas, ASCII and full-width. */
    private const LEAD_AUTHOR_PATTERN = '/[;,；，]/u';

    /** Splits an affiliation into its segments (department, university, city, ...). */
    private const AFFILIATION_SEGMENT_PATTERN = '/[,;，；]/u';

    /**
     * Keyword patterns that mark a segment as an institution name, strongest first.
     * University-level names are preferred over faculty/institute-level ones so that
     * "College of Engineering, Peking University" resolves to the university.
     */
    private const INSTITUTION_PATTERNS = [
        '/大学|University/iu',
        '/学院|研究院|研究所|College/iu',
    ];

    /**
     * Determine the lead (first) author.
     *
     * The first entry of $authorsParsed wins when it carries a non-empty name;
     * otherwise the text before the first separator of $authorsString is used
     * and only the name is filled in. In the structured case the university is
     * derived from the affiliation first, with the entry's own "university_name"
     * as fallback.
     *
     * @param ?string $authorsString Flat author list, e.g. "Jane Doe; John Roe".
     * @param list<array{name:string,email?:?string,affiliation?:?string,university_name?:?string}> $authorsParsed
     * @return array{name:string,email:?string,affiliation:?string,university_name:?string}|null
     *         Null when no usable name can be found in either input.
     */
    public static function leadAuthor(?string $authorsString, array $authorsParsed = []): ?array
    {
        // reset() rather than [0]: a filtered array may no longer start at key 0,
        // and indexing it directly would raise a warning and lose the author.
        $first = reset($authorsParsed);
        if (is_array($first)) {
            $name = self::cleanText((string) ($first['name'] ?? ''));
            if ($name !== null) {
                return [
                    'name' => $name,
                    'email' => self::normalizeEmail($first['email'] ?? null),
                    'affiliation' => self::cleanText($first['affiliation'] ?? null),
                    'university_name' => self::universityFromAffiliation($first['affiliation'] ?? null)
                        ?? self::cleanText($first['university_name'] ?? null),
                ];
            }
        }

        if ($authorsString === null || $authorsString === '') {
            return null;
        }

        // preg_split() yields false on a PCRE error (e.g. invalid UTF-8); treat that as "no name".
        $parts = preg_split(self::LEAD_AUTHOR_PATTERN, $authorsString) ?: [];
        $name = self::cleanText($parts[0] ?? null);
        if ($name === null) {
            return null;
        }

        return [
            'name' => $name,
            'email' => null,
            'affiliation' => null,
            'university_name' => null,
        ];
    }

    /**
     * Split a semicolon-separated author string into one row per author.
     *
     * Commas are deliberately NOT separators here, so "Doe, Jane; Roe, John"
     * yields two authors. Blank chunks are skipped.
     *
     * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
     */
    public static function splitAuthorsString(?string $authors): array
    {
        if ($authors === null || $authors === '') {
            return [];
        }

        $rows = [];
        foreach (preg_split(self::AUTHOR_LIST_PATTERN, $authors) ?: [] as $chunk) {
            $name = self::cleanText($chunk);
            if ($name !== null) {
                $rows[] = ['name' => $name, 'email' => null, 'affiliation' => null, 'university_name' => null];
            }
        }

        return $rows;
    }

    /**
     * Trim, validate and lower-case an e-mail address.
     *
     * @return ?string The lower-cased address, or null when the input is empty
     *                 or rejected by FILTER_VALIDATE_EMAIL.
     */
    public static function normalizeEmail(?string $email): ?string
    {
        $email = trim((string) $email);
        if ($email === '' || filter_var($email, FILTER_VALIDATE_EMAIL) === false) {
            return null;
        }

        return strtolower($email);
    }

    /**
     * Collapse every run of whitespace into a single space and trim the result.
     *
     * @return ?string The cleaned text, or null when nothing but whitespace is
     *                 left (or the whitespace pattern cannot be applied).
     */
    public static function cleanText(?string $text): ?string
    {
        if ($text === null) {
            return null;
        }

        // preg_replace() returns null on a PCRE error; fall back to '' so the result is null.
        $collapsed = trim(preg_replace('/\s+/u', ' ', $text) ?? '');

        return $collapsed === '' ? null : $collapsed;
    }

    /**
     * Extract the institution name from a free-text affiliation.
     *
     * The affiliation is split on commas/semicolons (ASCII or full-width) and the
     * first segment containing a university-level keyword is returned; if there is
     * none, the first segment containing a college/institute-level keyword is used.
     * The keyword may appear anywhere in the segment, including at its very start
     * ("University of Oxford").
     *
     * @return ?string The trimmed segment, or null when no segment matches.
     */
    public static function universityFromAffiliation(?string $affiliation): ?string
    {
        $affiliation = self::cleanText($affiliation);
        if ($affiliation === null) {
            return null;
        }

        $segments = preg_split(self::AFFILIATION_SEGMENT_PATTERN, $affiliation) ?: [];
        foreach (self::INSTITUTION_PATTERNS as $pattern) {
            foreach ($segments as $segment) {
                $segment = trim($segment);
                if ($segment !== '' && preg_match($pattern, $segment) === 1) {
                    return $segment;
                }
            }
        }

        return null;
    }
}