You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

75 lines
2.1 KiB

2 weeks ago
<?php
namespace App\Services\Crawl;
/**
 * Converts LaTeX inline formulas found in arXiv Atom/API abstracts into plain
 * text that approximates how the arXiv website displays them.
 */
class ArxivTextNormalizer
{
    /**
     * Matches a font/text wrapper command such as \mathrm{...} or \textbf{...}
     * and captures its argument. Prose and math fragments share this single
     * pattern so that both unwrap exactly the same set of commands.
     */
    private const WRAPPER_COMMAND_PATTERN = '/\\\\(?:mathrm|mathbf|mathit|textbf|textit|text)\{([^}]*)\}/';

    /**
     * Unicode MINUS SIGN (U+2212) placed in front of negative integer exponents.
     * Written as an escape sequence so the character survives re-encoding.
     */
    private const MINUS_SIGN = "\u{2212}";

    /**
     * Normalizes an abstract/title string to plain text.
     *
     * Steps, in order: decode HTML entities, collapse whitespace, replace "…"
     * markers with a space, flatten $$...$$ and $...$ math fragments, unwrap
     * font/text commands, strip remaining backslashes and braces, collapse
     * whitespace again and trim.
     *
     * @param string|null $text Raw text; null and '' are returned unchanged.
     *
     * @return string|null The normalized text, or the input itself when it is null or ''.
     */
    public static function normalize(?string $text): ?string
    {
        if ($text === null || $text === '') {
            return $text;
        }

        // Every preg_* call below falls back to the previous value when the
        // function returns null (its failure result), so a regex error never
        // wipes out the text.
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

        // Replace each "…" (with any surrounding whitespace) by a single space.
        // NOTE(review): the original comment describes these as search-highlight
        // markers left over from an HTML fallback — confirm against the crawler.
        $text = preg_replace('/\s*…\s*/u', ' ', $text) ?? $text;

        $fragmentToPlain = static fn (array $m): string => self::latexFragmentToPlain($m[1]);

        // Display math ($$...$$) is handled before inline math ($...$) so that
        // its doubled delimiters are consumed as one unit.
        $text = preg_replace_callback('/\$\$([^$]+)\$\$/s', $fragmentToPlain, $text) ?? $text;
        $text = preg_replace_callback('/\$([^$]+)\$/', $fragmentToPlain, $text) ?? $text;

        // Clean up LaTeX that appears outside of math delimiters.
        $text = self::unwrapWrapperCommands($text);
        $text = str_replace(['\\', '{', '}'], '', $text);
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

        return trim($text);
    }

    /**
     * Flattens the inside of one math fragment (delimiters already removed).
     *
     * Exponent markers (^{...}, ^N) and subscript markers (_{...}, _x) are
     * dropped while their content is kept; negative integer exponents get a
     * Unicode minus sign. Wrapper commands are unwrapped and any remaining
     * backslashes and braces are removed.
     *
     * @param string $latex Fragment content between the dollar delimiters.
     *
     * @return string Plain-text rendering of the fragment.
     */
    protected static function latexFragmentToPlain(string $latex): string
    {
        $s = trim($latex);

        $exponentToPlain = static fn (array $m): string => self::formatExponent($m[1]);
        $s = preg_replace_callback('/\^\{([^}]*)\}/', $exponentToPlain, $s) ?? $s;
        $s = preg_replace_callback('/\^(-?\d+)/', $exponentToPlain, $s) ?? $s;

        $s = preg_replace('/_\{([^}]+)\}/', '$1', $s) ?? $s;
        $s = preg_replace('/_([a-zA-Z0-9]+)/', '$1', $s) ?? $s;

        $s = self::unwrapWrapperCommands($s);

        return str_replace(['\\', '{', '}'], '', $s);
    }

    /**
     * Renders the content of an exponent.
     *
     * A negative integer keeps its sign, written with U+2212 instead of the
     * ASCII hyphen; without the sign, 10^{-3} and 10^{3} would both collapse
     * to "103". Any other exponent is returned unchanged.
     *
     * @param string $exp Exponent content without the caret or braces.
     *
     * @return string Plain-text exponent.
     */
    protected static function formatExponent(string $exp): string
    {
        if (preg_match('/^-\d+$/', $exp) === 1) {
            return self::MINUS_SIGN . substr($exp, 1);
        }

        return $exp;
    }

    /**
     * Replaces wrapper commands (see WRAPPER_COMMAND_PATTERN) by their argument.
     *
     * The replacement is repeated until nothing changes so that nested
     * wrappers such as \textbf{\textit{x}} are fully unwrapped instead of
     * leaking the inner command name. Each replacement shortens the string,
     * so the loop always terminates.
     *
     * @param string $text Text possibly containing wrapper commands.
     *
     * @return string Text with wrapper commands removed and their arguments kept.
     */
    private static function unwrapWrapperCommands(string $text): string
    {
        do {
            $replaced = preg_replace(self::WRAPPER_COMMAND_PATTERN, '$1', $text, -1, $count);
            if ($replaced === null) {
                // Regex failure: keep whatever has been produced so far.
                return $text;
            }
            $text = $replaced;
        } while ($count > 0);

        return $text;
    }
}