|
|
|
|
|
<?php
|
|
|
|
|
|
|
|
|
|
|
|
namespace App\Services\Crawl;
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Converts LaTeX inline formulas found in arXiv Atom/API abstracts into plain
 * text that approximates how the arXiv website displays them.
 */
class ArxivTextNormalizer
{
    /**
     * Regex alternation of the LaTeX font/text commands whose argument is kept
     * while the command itself is dropped (e.g. `\textbf{x}` becomes `x`).
     *
     * Shared by the top-level pass and the math-fragment pass so both unwrap
     * exactly the same set of commands.
     */
    private const TEXT_COMMANDS = 'mathrm|mathbf|mathit|textbf|textit|text';

    /**
     * Normalizes an abstract/title string to plain text.
     *
     * Steps, in order: decode HTML entities, collapse whitespace, replace the
     * "…" marker with a space, convert `$$...$$` and `$...$` fragments to plain
     * text, strip remaining LaTeX markup outside math, collapse whitespace
     * again and trim.
     *
     * @param string|null $text Raw text; may be null or empty.
     *
     * @return string|null The normalized text. Null and the empty string are
     *                     returned unchanged.
     */
    public static function normalize(?string $text): ?string
    {
        if ($text === null || $text === '') {
            return $text;
        }

        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
        // preg_replace() returns null on failure (e.g. invalid UTF-8 with the
        // `u` modifier); in that case keep the text as it was.
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

        // Remove search-highlight markers that may be left over from the HTML fallback.
        $text = preg_replace('/\s*…\s*/u', ' ', $text) ?? $text;

        // Display math ($$...$$) is handled before inline math ($...$) so the
        // inline pattern does not split a display-math block in two.
        $text = preg_replace_callback(
            '/\$\$([^$]+)\$\$/s',
            static fn (array $m): string => self::latexFragmentToPlain($m[1]),
            $text
        ) ?? $text;

        $text = preg_replace_callback(
            '/\$([^$]+)\$/',
            static fn (array $m): string => self::latexFragmentToPlain($m[1]),
            $text
        ) ?? $text;

        // Clean up LaTeX markup that appears outside of math delimiters.
        $text = self::stripLatexMarkup($text);
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

        return trim($text);
    }

    /**
     * Converts the contents of one math fragment (without its `$` delimiters)
     * to plain text.
     *
     * Superscripts keep their content (a leading ASCII minus on an all-digit
     * exponent becomes U+2212), subscripts keep their content, and the
     * remaining LaTeX markup is stripped.
     *
     * @param string $latex Fragment body captured between the delimiters.
     *
     * @return string Plain-text rendering of the fragment.
     */
    protected static function latexFragmentToPlain(string $latex): string
    {
        $s = trim($latex);

        // Braced superscripts: ^{...}
        $s = preg_replace_callback(
            '/\^\{([^}]*)\}/',
            static fn (array $m): string => self::formatExponent($m[1]),
            $s
        ) ?? $s;

        // Bare numeric superscripts: ^2, ^-1
        $s = preg_replace_callback(
            '/\^(-?\d+)/',
            static fn (array $m): string => self::formatExponent($m[1]),
            $s
        ) ?? $s;

        // Subscripts, braced and bare: drop the underscore, keep the content.
        $s = preg_replace('/_\{([^}]+)\}/', '$1', $s) ?? $s;
        $s = preg_replace('/_([a-zA-Z0-9]+)/', '$1', $s) ?? $s;

        return self::stripLatexMarkup($s);
    }

    /**
     * Unwraps the commands listed in TEXT_COMMANDS (keeping their argument) and
     * then deletes every remaining backslash and curly brace.
     *
     * The unwrap must run before the character strip: once the backslash is
     * gone, `\textbf{x}` would otherwise degrade to `textbfx`.
     *
     * @param string $s Text that may contain LaTeX commands.
     *
     * @return string The text without backslashes and braces.
     */
    protected static function stripLatexMarkup(string $s): string
    {
        $s = preg_replace('/\\\\(' . self::TEXT_COMMANDS . ')\{([^}]*)\}/', '$2', $s) ?? $s;

        return str_replace(['\\', '{', '}'], '', $s);
    }

    /**
     * Formats the content of a superscript.
     *
     * @param string $exp Exponent text without the caret or braces.
     *
     * @return string For a negative integer such as `-3`, the digits prefixed
     *                with the Unicode minus sign (U+2212); otherwise the input
     *                unchanged.
     */
    protected static function formatExponent(string $exp): string
    {
        if (preg_match('/^-\d+$/', $exp)) {
            // substr() is byte-safe here: the first character is an ASCII '-'.
            return '−' . substr($exp, 1);
        }

        return $exp;
    }
}
|