|
|
<?php
|
|
|
|
|
|
namespace App\Services\Crawl;
|
|
|
|
|
|
/**
 * Heuristics for discovering and walking the pages of a paginated HTML listing.
 *
 * All methods are static and work purely on the listing URL and the HTML of
 * its first page; only fetchPagesHtml() performs network access (through
 * HtmlCrawlSupport::fetchHtml()).
 */
class HtmlPagination
{
    /** Upper bound for page numbers taken from loose sources (anchor text, index_N file names). */
    protected const MAX_PLAUSIBLE_PAGE = 500;

    /** Hard limit on the number of pages fetchPagesHtml() will ever download. */
    protected const MAX_FETCH_PAGES = 50;

    /**
     * Estimates how many pages a listing has from the HTML of its first page.
     *
     * Sources are tried in this order; the first two groups return immediately:
     *  1. index_N.html / list_N.html style links (see detectIndexUnderscoreMaxPage());
     *  2. an explicit total: "totalpage=N" or the text "共 N 页" in the HTML;
     *  3. the largest number found in page query parameters, "/page/N" paths,
     *     numeric anchors (only when a pagination-like CSS class is present)
     *     and a "totalpage" parameter in $baseUrl.
     *
     * @param string $html    HTML of the first page.
     * @param string $baseUrl URL the HTML was loaded from.
     *
     * @return int Detected page count, never less than 1.
     */
    public static function detectTotalPages(string $html, string $baseUrl): int
    {
        $indexMax = self::detectIndexUnderscoreMaxPage($html, $baseUrl);
        if ($indexMax > 1) {
            return $indexMax;
        }

        // An explicitly stated total wins over anything inferred from single links.
        if (preg_match('/totalpage=(\d+)/i', $html, $match)
            || preg_match('/共\s*(\d+)\s*页/u', $html, $match)) {
            return max(1, (int) $match[1]);
        }

        $max = 1;

        // Query-string links. Inside HTML attributes "&" is normally written
        // "&amp;", so both spellings of the separator are accepted.
        if (preg_match_all('#(?:[?&]|&amp;)(?:page|p|pageNum|pn)=(\d+)#i', $html, $matches)) {
            $max = self::largestPageNumber($matches[1], $max);
        }

        // Path-style links such as /page/3 or /pages/3.
        if (preg_match_all('#/pages?/(\d+)(?:/|[\?"\'\s>])#i', $html, $matches)) {
            $max = self::largestPageNumber($matches[1], $max);
        }

        // Bare numeric anchors are only trusted when the document contains a
        // pagination-like CSS class, and only up to MAX_PLAUSIBLE_PAGE.
        if (preg_match('#class=["\'][^"\']*(?:pagination|page-nav|pages|pager)[^"\']*["\']#i', $html)
            && preg_match_all('#>(\d{1,3})</a>#', $html, $matches)) {
            $max = self::largestPageNumber($matches[1], $max, self::MAX_PLAUSIBLE_PAGE);
        }

        parse_str((string) parse_url($baseUrl, PHP_URL_QUERY), $query);
        $declaredTotal = $query['totalpage'] ?? null;
        // parse_str() yields an array for "totalpage[]=…"; only a scalar string is meaningful.
        if (is_string($declaredTotal)) {
            $max = max($max, (int) $declaredTotal);
        }

        return max(1, $max);
    }

    /**
     * Builds the URL of page $page of the listing that starts at $baseUrl.
     *
     * Strategy, in order:
     *  - page <= 1: the normalised first-page URL;
     *  - index.html / list.html style paths: "<stem>_<page>.<ext>";
     *  - "totalpage=N" seen in the HTML or URL: sets PAGENUM and totalpage;
     *  - a known page parameter (PAGENUM/pageNum, page, p) present in the URL
     *    or in the first page's links: that parameter is set;
     *  - a "/page/N" or "/pages/N" segment in the path: the number is replaced
     *    and the query string is dropped;
     *  - otherwise "page=<page>" is appended.
     *
     * @param string $baseUrl       URL of the listing.
     * @param int    $page          1-based page number.
     * @param string $firstPageHtml HTML of the first page, used to detect the parameter in use.
     *
     * @return string The page URL; $baseUrl unchanged when it has no scheme or host.
     */
    public static function buildPageUrl(string $baseUrl, int $page, string $firstPageHtml): string
    {
        if ($page <= 1) {
            return self::normalizeIndexFirstPageUrl($baseUrl);
        }

        $indexUrl = self::buildIndexUnderscorePageUrl($baseUrl, $page);
        if ($indexUrl !== null) {
            return $indexUrl;
        }

        $parts = parse_url($baseUrl);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return $baseUrl;
        }

        parse_str((string) ($parts['query'] ?? ''), $query);
        $path = $parts['path'] ?? '/';
        $pageValue = (string) $page;

        if (preg_match('/totalpage=(\d+)/i', $firstPageHtml, $totalMatch)
            || preg_match('/totalpage=(\d+)/i', $baseUrl, $totalMatch)) {
            $query['PAGENUM'] = $pageValue;
            $query['totalpage'] = $totalMatch[1];
        } elseif (($parameter = self::detectPageParameter($query, $firstPageHtml)) !== null) {
            $query[$parameter] = $pageValue;
        } elseif (preg_match('#/pages?/\d+#i', $path)) {
            // Keep the segment name the site uses ("page" vs "pages"); only the number changes.
            $path = (string) preg_replace('#/(pages?)/\d+#i', '/$1/'.$page, $path);
            $query = [];
        } else {
            $query['page'] = $pageValue;
        }

        // Explicit separator so the result does not depend on the arg_separator.output ini setting.
        return self::assembleUrl($parts, $path, http_build_query($query, '', '&'));
    }

    /**
     * Downloads the first page and as many follow-up pages as can be determined.
     *
     * The first page is fetched outside any try/catch, so a failure there
     * propagates to the caller. Failures on later pages are tolerated.
     *
     * @param string $baseUrl  URL of the first page.
     * @param int    $maxPages Requested page limit; clamped to 1..MAX_FETCH_PAGES.
     *
     * @return list<string> HTML of each page that was fetched, first page first.
     */
    public static function fetchPagesHtml(string $baseUrl, int $maxPages): array
    {
        $maxPages = max(1, min(self::MAX_FETCH_PAGES, $maxPages));

        $first = HtmlCrawlSupport::fetchHtml($baseUrl);
        $pages = [$first];

        $detected = self::detectTotalPages($first, $baseUrl);
        $usesIndexStyle = self::usesIndexUnderscoreStyle($baseUrl, $first);

        // index_N style listings without a detectable total are probed page by
        // page up to the limit; the loop below stops at the first failed fetch.
        $total = ($usesIndexStyle && $detected <= 1)
            ? $maxPages
            : min($maxPages, max($detected, 1));

        $firstPageUrl = self::normalizeIndexFirstPageUrl($baseUrl);

        for ($page = 2; $page <= $total; $page++) {
            $url = self::buildPageUrl($baseUrl, $page, $first);

            // No distinct URL could be built for this page; further pages would be no different.
            if ($url === $firstPageUrl) {
                break;
            }

            try {
                // NOTE(review): the second argument is presumably a timeout in
                // seconds — confirm against HtmlCrawlSupport::fetchHtml().
                $pages[] = HtmlCrawlSupport::fetchHtml($url, 30);
            } catch (\Throwable) {
                // Deliberate best effort: a failed page is skipped. For index_N
                // style listings a failure is taken as the end of the listing.
                if ($usesIndexStyle) {
                    break;
                }
            }
        }

        return $pages;
    }

    /**
     * Common on university news sites: index.html (page 1), index_2.html, index_3.html …
     *
     * @return bool True when the path of $baseUrl ends in index/list[_N].html|htm|shtml,
     *              or when $html contains an index_N / list_N file name.
     */
    protected static function usesIndexUnderscoreStyle(string $baseUrl, string $html): bool
    {
        $path = (string) parse_url($baseUrl, PHP_URL_PATH);
        if (preg_match('#/(?:index|list)(?:_\d+)?\.(?:html?|shtml)$#i', $path)) {
            return true;
        }

        return (bool) preg_match('#(?:index|list)_\d+\.(?:html?|shtml)#i', $html);
    }

    /**
     * Finds the highest N among index_N / list_N file names in the HTML and the URL.
     *
     * @return int The highest N in 2..MAX_PLAUSIBLE_PAGE, or 1 when there is none.
     */
    protected static function detectIndexUnderscoreMaxPage(string $html, string $baseUrl): int
    {
        // The URL is searched too, so a listing opened at index_7.html counts as having 7 pages.
        $haystack = $html.' '.$baseUrl;

        if (! preg_match_all('#(?:index|list)_(\d+)\.(?:html?|shtml)#i', $haystack, $matches)) {
            return 1;
        }

        return self::largestPageNumber($matches[1], 1, self::MAX_PLAUSIBLE_PAGE);
    }

    /**
     * Maps an index_N / list_N URL back to its first page (index.html / list.html).
     *
     * The file-name stem is preserved, so "list_3.html" becomes "list.html",
     * matching what buildIndexUnderscorePageUrl() produces for later pages.
     * URLs that are not in this style are reassembled from scheme, host, port,
     * path and query only.
     *
     * @return string The first-page URL; $baseUrl unchanged when it has no scheme or host.
     */
    protected static function normalizeIndexFirstPageUrl(string $baseUrl): string
    {
        $parts = parse_url($baseUrl);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return $baseUrl;
        }

        $path = $parts['path'] ?? '/';
        if (preg_match('#^(.*?/)(index|list)_\d+(\.(?:html?|shtml))$#i', $path, $m)) {
            $path = $m[1].$m[2].$m[3];
        }

        return self::assembleUrl($parts, $path, (string) ($parts['query'] ?? ''));
    }

    /**
     * Builds "<dir>/<stem>_<page>.<ext>" for index/list style listing URLs.
     *
     * @return string|null The page URL, or null when $baseUrl has no scheme/host
     *                     or its path is not an index/list[_N].html|htm|shtml file.
     */
    protected static function buildIndexUnderscorePageUrl(string $baseUrl, int $page): ?string
    {
        if ($page <= 1) {
            return self::normalizeIndexFirstPageUrl($baseUrl);
        }

        $parts = parse_url($baseUrl);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        $path = $parts['path'] ?? '/';
        if (! preg_match('#^(.*?/)(index|list)(?:_\d+)?(\.(?:html?|shtml))$#i', $path, $m)) {
            return null;
        }

        return self::assembleUrl(
            $parts,
            $m[1].$m[2].'_'.$page.$m[3],
            (string) ($parts['query'] ?? '')
        );
    }

    /**
     * Picks the query parameter that carries the page number, if one can be recognised.
     *
     * A key already present in the URL wins; otherwise the spelling used in the
     * first page's links is reused, because servers may treat parameter names
     * case-sensitively ("pageNum" is not "PAGENUM").
     *
     * @param array<array-key, mixed> $query Parsed query string of the listing URL.
     * @param string                  $html  HTML of the first page.
     *
     * @return string|null The parameter name to set, or null when none was recognised.
     */
    protected static function detectPageParameter(array $query, string $html): ?string
    {
        foreach (['PAGENUM', 'pageNum'] as $key) {
            if (array_key_exists($key, $query)) {
                return $key;
            }
        }

        // A properly delimited link parameter: reuse the exact spelling found.
        if (preg_match('/(?:[?&]|&amp;)(pagenum)=/i', $html, $m)) {
            return $m[1];
        }

        // Loose mention without a delimiter (for example inside a script): fall back to upper case.
        if (preg_match('/PAGENUM=/i', $html)) {
            return 'PAGENUM';
        }

        foreach (['page', 'p'] as $key) {
            if (array_key_exists($key, $query)
                || preg_match('/(?:[?&]|&amp;)'.$key.'=\d+/i', $html)) {
                return $key;
            }
        }

        return null;
    }

    /**
     * Returns the largest of $floor and every number in $numbers that does not exceed $ceiling.
     *
     * @param array<array-key, string> $numbers Digit strings captured by a regular expression.
     */
    protected static function largestPageNumber(array $numbers, int $floor, int $ceiling = PHP_INT_MAX): int
    {
        $largest = $floor;
        foreach ($numbers as $number) {
            $value = (int) $number;
            if ($value <= $ceiling) {
                $largest = max($largest, $value);
            }
        }

        return $largest;
    }

    /**
     * Reassembles a URL from parse_url() parts with the given path and query string.
     *
     * Only scheme, host and port are taken from $parts; user info and fragment
     * are not carried over.
     *
     * @param array<string, mixed> $parts Result of parse_url(); must contain 'scheme' and 'host'.
     * @param string               $path  Path component, including the leading slash.
     * @param string               $query Query string without the leading "?"; empty for none.
     */
    protected static function assembleUrl(array $parts, string $path, string $query): string
    {
        $url = $parts['scheme'].'://'.$parts['host'];
        if (! empty($parts['port'])) {
            $url .= ':'.$parts['port'];
        }
        $url .= $path;
        if ($query !== '') {
            $url .= '?'.$query;
        }

        return $url;
    }
}
|