|
|
<?php
|
|
|
|
|
|
namespace App\Services\Crawl;
|
|
|
|
|
|
class CrawlAuthorParser
{
    /**
     * Character-class body for the separators between author entries:
     * ASCII ';' and FULLWIDTH SEMICOLON (U+FF1B).
     *
     * The full-width form is written as a \x{...} escape (valid because every
     * pattern built from it uses the /u modifier) so it cannot be folded into
     * its ASCII look-alike by an encoding-normalisation step.
     */
    private const ENTRY_SEPARATORS = ';\x{FF1B}';

    /**
     * Character-class body for every list separator: the entry separators plus
     * ASCII ',' and FULLWIDTH COMMA (U+FF0C).
     */
    private const LIST_SEPARATORS = ';\x{FF1B},\x{FF0C}';

    /**
     * Picks the lead (first) author.
     *
     * The first element of $authorsParsed wins when it has a non-blank name;
     * its email, affiliation and university are normalised through the helpers
     * of this class. The university is derived from the affiliation first and
     * falls back to the entry's own 'university_name'.
     *
     * Otherwise the text of $authorsString before the first comma or semicolon
     * (ASCII or full-width) is used as the name and all other fields are null.
     *
     * @param list<array{name:string,email?:?string,affiliation?:?string,university_name?:?string}> $authorsParsed
     * @return array{name:string,email:?string,affiliation:?string,university_name:?string}|null
     *         null when neither source yields a non-blank name
     */
    public static function leadAuthor(?string $authorsString, array $authorsParsed = []): ?array
    {
        if ($authorsParsed !== []) {
            // reset() rather than [0]: a non-empty array whose keys do not start
            // at 0 (e.g. after array_filter) must not trigger an undefined-key warning.
            $first = reset($authorsParsed);
            $name = is_array($first) ? trim((string) ($first['name'] ?? '')) : '';

            if ($name !== '') {
                $affiliation = $first['affiliation'] ?? null;

                return [
                    'name' => $name,
                    'email' => self::normalizeEmail($first['email'] ?? null),
                    'affiliation' => self::cleanText($affiliation),
                    'university_name' => self::universityFromAffiliation($affiliation)
                        ?? self::cleanText($first['university_name'] ?? null),
                ];
            }
        }

        if (! $authorsString) {
            return null;
        }

        // Only the first segment is needed, so stop splitting after one separator.
        $parts = preg_split('/[' . self::LIST_SEPARATORS . ']/u', $authorsString, 2) ?: [];
        $name = trim((string) ($parts[0] ?? ''));

        return $name === '' ? null : self::blankAuthor($name);
    }

    /**
     * Splits a semicolon-separated author string (ASCII or full-width
     * semicolon) into author rows. Commas are NOT treated as separators here.
     * Each chunk is trimmed, blank chunks are skipped, and every field other
     * than 'name' is null.
     *
     * @return list<array{name:string,email:?string,affiliation:?string,university_name:?string}>
     */
    public static function splitAuthorsString(?string $authors): array
    {
        if (! $authors) {
            return [];
        }

        $rows = [];
        foreach (preg_split('/[' . self::ENTRY_SEPARATORS . ']/u', $authors) ?: [] as $chunk) {
            $name = trim($chunk);
            if ($name !== '') {
                $rows[] = self::blankAuthor($name);
            }
        }

        return $rows;
    }

    /**
     * Trims and lower-cases an email address.
     *
     * @return string|null null when the trimmed value is empty or is rejected
     *                     by filter_var(FILTER_VALIDATE_EMAIL)
     */
    public static function normalizeEmail(?string $email): ?string
    {
        $email = trim((string) $email);

        if ($email === '' || ! filter_var($email, FILTER_VALIDATE_EMAIL)) {
            return null;
        }

        return strtolower($email);
    }

    /**
     * Collapses every run of whitespace into a single space and trims the result.
     *
     * @return string|null null when nothing is left; also null when preg_replace
     *                     fails (it returns null on input that is not valid UTF-8)
     */
    public static function cleanText(?string $text): ?string
    {
        $text = trim(preg_replace('/\s+/u', ' ', (string) $text) ?? '');

        return $text === '' ? null : $text;
    }

    /**
     * Extracts the institution from an affiliation string.
     *
     * Returns the leading segment of the cleaned affiliation (everything up to
     * the first comma or semicolon, ASCII or full-width) when that segment
     * contains one of the institution keywords, matched case-insensitively.
     * The keyword may sit at the very start of the segment, as in
     * "University of Oxford".
     *
     * @return string|null null when the affiliation is blank or its leading
     *                     segment contains no institution keyword
     */
    public static function universityFromAffiliation(?string $affiliation): ?string
    {
        $affiliation = self::cleanText($affiliation);
        if (! $affiliation) {
            return null;
        }

        // '*' (not '+') before the keyword group so a segment may begin with the keyword.
        $pattern = '/^([^' . self::LIST_SEPARATORS . ']*'
            . '(?:大学|学院|研究院|研究所|University|College)'
            . '[^' . self::LIST_SEPARATORS . ']*)/iu';

        if (preg_match($pattern, $affiliation, $m)) {
            return trim($m[1]);
        }

        return null;
    }

    /**
     * Builds an author row that carries only a name; every other field is null.
     *
     * @return array{name:string,email:?string,affiliation:?string,university_name:?string}
     */
    private static function blankAuthor(string $name): array
    {
        return [
            'name' => $name,
            'email' => null,
            'affiliation' => null,
            'university_name' => null,
        ];
    }
}
|