1) {
return $indexMax;
}
if (preg_match('/totalpage=(\d+)/i', $html, $match)) {
return max(1, (int) $match[1]);
}
if (preg_match('/共\s*(\d+)\s*页/u', $html, $match)) {
return max(1, (int) $match[1]);
}
$max = 1;
if (preg_match_all('#[?&](?:page|p|pageNum|PAGENUM|pn)=(\d+)#i', $html, $matches)) {
foreach ($matches[1] as $num) {
$max = max($max, (int) $num);
}
}
if (preg_match_all('#/(?:page|pages?)/(\d+)(?:/|[\?"\'\s>])#i', $html, $matches)) {
foreach ($matches[1] as $num) {
$max = max($max, (int) $num);
}
}
if (preg_match('#class=["\'][^"\']*(?:pagination|page-nav|pages|pager)[^"\']*["\']#i', $html)) {
if (preg_match_all('#>(\d{1,3})#', $html, $matches)) {
foreach ($matches[1] as $num) {
$n = (int) $num;
if ($n > 0 && $n <= 500) {
$max = max($max, $n);
}
}
}
}
parse_str((string) parse_url($baseUrl, PHP_URL_QUERY), $query);
if (isset($query['totalpage'])) {
$max = max($max, (int) $query['totalpage']);
}
return max(1, $max);
}
/**
 * Build the URL of page $page of a paginated listing.
 *
 * Resolution order:
 *  1. Page 1 (or lower) resolves to the normalized first-page URL.
 *  2. `index_N.html` / `list_N.html` style paths are delegated to
 *     buildIndexUnderscorePageUrl().
 *  3. A `totalpage` marker in the first page's HTML or in $baseUrl selects
 *     the `PAGENUM` + `totalpage` query pair.
 *  4. A paging parameter already present in $baseUrl's query, or linked from
 *     the first page's HTML (`PAGENUM`/`pageNum`, `page`, `p`), is reused.
 *  5. A `/page/N` or `/pages/N` path segment is rewritten in place.
 *  6. Otherwise a `page` query parameter is appended.
 *
 * @param string $baseUrl       Listing URL the crawl started from.
 * @param int    $page          1-based page number to build.
 * @param string $firstPageHtml HTML of the first page, sniffed for paging parameter names.
 *
 * @return string URL of the requested page; $baseUrl unchanged when it has no scheme or host.
 */
public static function buildPageUrl(string $baseUrl, int $page, string $firstPageHtml): string
{
    if ($page <= 1) {
        return self::normalizeIndexFirstPageUrl($baseUrl);
    }

    $indexUrl = self::buildIndexUnderscorePageUrl($baseUrl, $page);
    if ($indexUrl !== null) {
        return $indexUrl;
    }

    $parts = parse_url($baseUrl);
    if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
        return $baseUrl;
    }

    parse_str((string) ($parts['query'] ?? ''), $query);
    $path = $parts['path'] ?? '/';
    $pageValue = (string) $page;

    if (preg_match('/totalpage=(\d+)/i', $firstPageHtml, $totalMatch)
        || preg_match('/totalpage=(\d+)/i', $baseUrl, $totalMatch)) {
        $query['PAGENUM'] = $pageValue;
        $query['totalpage'] = $totalMatch[1];
    } elseif (array_key_exists('PAGENUM', $query)) {
        $query['PAGENUM'] = $pageValue;
    } elseif (array_key_exists('pageNum', $query)) {
        $query['pageNum'] = $pageValue;
    } elseif (preg_match('/(PAGENUM)=/i', $firstPageHtml, $nameMatch)) {
        // Reuse the exact spelling the site links with (PAGENUM, pageNum, ...):
        // parameter names may be case-sensitive on the server, so forcing one
        // fixed casing could request a parameter the site does not read.
        $query[$nameMatch[1]] = $pageValue;
    } elseif (array_key_exists('page', $query) || preg_match('/[?&]page=\d+/i', $firstPageHtml)) {
        $query['page'] = $pageValue;
    } elseif (array_key_exists('p', $query) || preg_match('/[?&]p=\d+/i', $firstPageHtml)) {
        $query['p'] = $pageValue;
    } elseif (preg_match('#/pages?/\d+#i', $path)) {
        // Keep the segment name the URL already uses (`page` vs `pages`) and
        // only swap the number. The check runs against the path itself so a
        // `/page/N` fragment inside the query string cannot trigger it.
        $path = preg_replace('#/(pages?)/\d+#i', '/${1}/'.$page, $path) ?? $path;
        // Path-style pagination: the query string is dropped, as before.
        $query = [];
    } else {
        $query['page'] = $pageValue;
    }

    $url = $parts['scheme'].'://'.$parts['host'];
    if (! empty($parts['port'])) {
        $url .= ':'.$parts['port'];
    }
    $url .= $path;
    if ($query !== []) {
        $url .= '?'.http_build_query($query);
    }

    return $url;
}
/**
 * Fetch the first listing page and as many follow-up pages as allowed.
 *
 * The requested page budget is clamped to the range 1..50. When the listing
 * looks like `index_N.html` pagination and no page total could be detected,
 * pages are probed up to the budget and the first failed fetch ends the crawl.
 * For every other style a failed page is skipped and the crawl continues.
 *
 * @param string $baseUrl  URL of the first listing page.
 * @param int    $maxPages Upper bound on the number of pages to fetch.
 *
 * @return list<string> HTML of each fetched page, first page first.
 */
public static function fetchPagesHtml(string $baseUrl, int $maxPages): array
{
    $limit = max(1, min(50, $maxPages));

    $firstHtml = HtmlCrawlSupport::fetchHtml($baseUrl);
    $collected = [$firstHtml];

    $detectedTotal = self::detectTotalPages($firstHtml, $baseUrl);
    $indexStyle = self::usesIndexUnderscoreStyle($baseUrl, $firstHtml);

    // Unknown total on an index_N listing: probe the whole budget.
    if ($indexStyle && $detectedTotal <= 1) {
        $lastPage = $limit;
    } else {
        $lastPage = min($limit, max($detectedTotal, 1));
    }

    $firstPageUrl = self::normalizeIndexFirstPageUrl($baseUrl);

    for ($pageNo = 2; $pageNo <= $lastPage; ++$pageNo) {
        $pageUrl = self::buildPageUrl($baseUrl, $pageNo, $firstHtml);

        // A builder that hands back the first page again cannot paginate this URL.
        if ($pageUrl === $firstPageUrl) {
            break;
        }

        try {
            $collected[] = HtmlCrawlSupport::fetchHtml($pageUrl, 30);
        } catch (\Throwable) {
            // Index-style probing stops at the first missing page; other
            // styles just skip the page that failed.
            if ($indexStyle) {
                break;
            }
        }
    }

    return $collected;
}
/**
 * Tell whether a listing paginates as index.html (page 1), index_2.html,
 * index_3.html, ... - a scheme common on university news sites.
 *
 * The answer is true when the path of $baseUrl ends in an `index` or `list`
 * document (optionally already carrying an `_N` suffix), or when $html
 * mentions an `index_N` / `list_N` document anywhere.
 *
 * @param string $baseUrl Listing URL whose path is inspected.
 * @param string $html    HTML of the listing page, scanned for `_N` links.
 */
protected static function usesIndexUnderscoreStyle(string $baseUrl, string $html): bool
{
    $urlPath = (string) parse_url($baseUrl, PHP_URL_PATH);

    return preg_match('#/(?:index|list)(?:_\d+)?\.(?:html?|shtml)$#i', $urlPath) === 1
        || preg_match('#(?:index|list)_\d+\.(?:html?|shtml)#i', $html) === 1;
}
/**
 * Find the highest page number referenced through `index_N` / `list_N`
 * document names in the page HTML or in the base URL.
 *
 * Only numbers from 2 to 500 are considered; when none is found the
 * result is 1.
 *
 * @param string $html    HTML to scan for paginated document names.
 * @param string $baseUrl Listing URL, scanned the same way as the HTML.
 *
 * @return int Highest page number seen, at least 1.
 */
protected static function detectIndexUnderscoreMaxPage(string $html, string $baseUrl): int
{
    $found = preg_match_all('#(?:index|list)_(\d+)\.(?:html?|shtml)#i', $html.' '.$baseUrl, $hits);
    if (! $found) {
        return 1;
    }

    $candidates = array_filter(
        array_map('intval', $hits[1]),
        static fn (int $n): bool => $n >= 2 && $n <= 500,
    );

    return $candidates === [] ? 1 : max($candidates);
}
/**
 * Map an `index_N` / `list_N` page URL back to the first page of its listing.
 *
 * The `_N` suffix is stripped from the final path segment while the document
 * stem (`index` or `list`) and its extension are kept, so `list_3.html`
 * becomes `list.html` - the same stem buildIndexUnderscorePageUrl() uses when
 * it builds later pages. URLs without such a suffix are rebuilt unchanged
 * from scheme, host, port, path and query.
 *
 * @param string $baseUrl URL of any page of the listing.
 *
 * @return string First-page URL; $baseUrl unchanged when it has no scheme or host.
 */
protected static function normalizeIndexFirstPageUrl(string $baseUrl): string
{
    $parts = parse_url($baseUrl);
    if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
        return $baseUrl;
    }

    $path = $parts['path'] ?? '/';
    if (preg_match('#^(.*?/)(index|list)_\d+(\.(?:html?|shtml))$#i', $path, $m)) {
        // Keep the matched stem instead of forcing "index": a `list_N` listing
        // has its first page at `list.*`, not `index.*`.
        $path = $m[1].$m[2].$m[3];
    }

    $url = $parts['scheme'].'://'.$parts['host'];
    if (! empty($parts['port'])) {
        $url .= ':'.$parts['port'];
    }
    $url .= $path;
    if (! empty($parts['query'])) {
        $url .= '?'.$parts['query'];
    }

    return $url;
}
/**
 * Build the URL of page $page for `index_N` / `list_N` style pagination.
 *
 * Page 1 (or lower) resolves through normalizeIndexFirstPageUrl(). For later
 * pages the final path segment must be an `index` or `list` document
 * (`.htm`, `.html` or `.shtml`, optionally already carrying an `_N` suffix);
 * its suffix is then set to `_$page`. Scheme, host, port and query string
 * are carried over.
 *
 * @param string $baseUrl URL of any page of the listing.
 * @param int    $page    1-based page number to build.
 *
 * @return string|null Page URL, or null when $baseUrl has no scheme/host or
 *                     its path is not an index/list document.
 */
protected static function buildIndexUnderscorePageUrl(string $baseUrl, int $page): ?string
{
    if ($page < 2) {
        return self::normalizeIndexFirstPageUrl($baseUrl);
    }

    $components = parse_url($baseUrl);
    $hasOrigin = is_array($components)
        && ! empty($components['scheme'])
        && ! empty($components['host']);
    if (! $hasOrigin) {
        return null;
    }

    $matched = preg_match(
        '#^(.*?/)(index|list)(?:_\d+)?(\.(?:html?|shtml))$#i',
        $components['path'] ?? '/',
        $segments,
    );
    if (! $matched) {
        return null;
    }

    [, $directory, $stem, $extension] = $segments;
    $port = empty($components['port']) ? '' : ':'.$components['port'];
    $queryString = empty($components['query']) ? '' : '?'.$components['query'];

    return $components['scheme'].'://'.$components['host'].$port
        .$directory.$stem.'_'.$page.$extension
        .$queryString;
}
}