1) { return $indexMax; } if (preg_match('/totalpage=(\d+)/i', $html, $match)) { return max(1, (int) $match[1]); } if (preg_match('/共\s*(\d+)\s*页/u', $html, $match)) { return max(1, (int) $match[1]); } $max = 1; if (preg_match_all('#[?&](?:page|p|pageNum|PAGENUM|pn)=(\d+)#i', $html, $matches)) { foreach ($matches[1] as $num) { $max = max($max, (int) $num); } } if (preg_match_all('#/(?:page|pages?)/(\d+)(?:/|[\?"\'\s>])#i', $html, $matches)) { foreach ($matches[1] as $num) { $max = max($max, (int) $num); } } if (preg_match('#class=["\'][^"\']*(?:pagination|page-nav|pages|pager)[^"\']*["\']#i', $html)) { if (preg_match_all('#>(\d{1,3})#', $html, $matches)) { foreach ($matches[1] as $num) { $n = (int) $num; if ($n > 0 && $n <= 500) { $max = max($max, $n); } } } } parse_str((string) parse_url($baseUrl, PHP_URL_QUERY), $query); if (isset($query['totalpage'])) { $max = max($max, (int) $query['totalpage']); } return max(1, $max); }

    /**
     * Build the URL of page $page of the listing that starts at $baseUrl.
     *
     * Strategies, in priority order:
     *  1. $page <= 1: the normalized first-page URL.
     *  2. File-name pagination (index_N / list_N) when the path ends in such a file.
     *  3. A query parameter: PAGENUM together with totalpage, then PAGENUM / pageNum,
     *     then page, then p. The name is taken from the URL's own query string or
     *     from links found in $firstPageHtml.
     *  4. A /page/N or /pages/N path segment (the existing query string is dropped).
     *  5. Fallback: add page=N to the query string.
     *
     * The result is rebuilt from scheme, host, port, path and query only; user info
     * and fragment of $baseUrl are not carried over.
     *
     * @param string $baseUrl       URL of the listing.
     * @param int    $page          1-based page number.
     * @param string $firstPageHtml HTML of the first page, scanned for pagination links.
     *
     * @return string The page URL, or $baseUrl unchanged when it has no scheme or host.
     */
    public static function buildPageUrl(string $baseUrl, int $page, string $firstPageHtml): string
    {
        if ($page <= 1) {
            return self::normalizeIndexFirstPageUrl($baseUrl);
        }

        $indexUrl = self::buildIndexUnderscorePageUrl($baseUrl, $page);
        if ($indexUrl !== null) {
            return $indexUrl;
        }

        $parts = parse_url($baseUrl);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return $baseUrl;
        }

        $path = $parts['path'] ?? '/';
        parse_str((string) ($parts['query'] ?? ''), $query);
        $pageValue = (string) $page;

        if (preg_match('/totalpage=(\d+)/i', $firstPageHtml, $totalMatch)
            || preg_match('/totalpage=(\d+)/i', $baseUrl, $totalMatch)
        ) {
            // Listings that advertise "totalpage" are paged with PAGENUM + totalpage.
            $query['PAGENUM'] = $pageValue;
            $query['totalpage'] = $totalMatch[1];
        } elseif (array_key_exists('PAGENUM', $query)) {
            $query['PAGENUM'] = $pageValue;
        } elseif (array_key_exists('pageNum', $query)) {
            $query['pageNum'] = $pageValue;
        } elseif (preg_match('/(pagenum)=/i', $firstPageHtml, $nameMatch)) {
            // Re-use the spelling the site itself links to (PAGENUM, pageNum, ...):
            // a case-insensitive probe must not force one fixed casing onto the URL.
            $query[$nameMatch[1]] = $pageValue;
        } elseif (array_key_exists('page', $query) || preg_match('/[?&]page=\d+/i', $firstPageHtml)) {
            $query['page'] = $pageValue;
        } elseif (array_key_exists('p', $query) || preg_match('/[?&]p=\d+/i', $firstPageHtml)) {
            $query['p'] = $pageValue;
        } elseif (preg_match('#/pages?/\d+#i', $path)) {
            // Path-style pagination: swap only the number and keep the site's own
            // segment name ("page" vs "pages"). The query string is dropped here.
            $path = preg_replace('#/(pages?)/\d+#i', '/${1}/'.$page, $path) ?? $path;
            $query = [];
        } else {
            $query['page'] = $pageValue;
        }

        $url = $parts['scheme'].'://'.$parts['host'];
        if (! empty($parts['port'])) {
            $url .= ':'.$parts['port'];
        }
        $url .= $path;
        if ($query !== []) {
            $url .= '?'.http_build_query($query);
        }

        return $url;
    }

    /**
     * Fetch the first page at $baseUrl and its follow-up pages, $maxPages at most.
     *
     * $maxPages is clamped to the range 1..50. When the listing uses index_N / list_N
     * file names and no page count could be detected, pages are probed up to the
     * limit and probing stops at the first failed fetch. For every other style a
     * failed page is skipped and the remaining pages are still attempted.
     *
     * An error while fetching the first page is not caught here.
     *
     * @param string $baseUrl  URL of the first listing page.
     * @param int    $maxPages Upper bound on the number of pages to fetch.
     *
     * @return list<string> Fetched pages, first page first (assumes
     *                      HtmlCrawlSupport::fetchHtml() returns the HTML as a string — confirm).
     */
    public static function fetchPagesHtml(string $baseUrl, int $maxPages): array
    {
        $maxPages = max(1, min(50, $maxPages));

        $first = HtmlCrawlSupport::fetchHtml($baseUrl);
        $pages = [$first];

        $detected = self::detectTotalPages($first, $baseUrl);
        $usesIndexStyle = self::usesIndexUnderscoreStyle($baseUrl, $first);

        // Index-style listings often do not expose a page count; probe up to the limit.
        $total = ($usesIndexStyle && $detected <= 1)
            ? $maxPages
            : min($maxPages, max($detected, 1));

        $firstPageUrl = self::normalizeIndexFirstPageUrl($baseUrl);

        for ($page = 2; $page <= $total; $page++) {
            $url = self::buildPageUrl($baseUrl, $page, $first);
            if ($url === $firstPageUrl) {
                // No distinct URL could be built for this page; going on would only
                // re-fetch the first page.
                break;
            }

            try {
                // Second argument presumably is a timeout in seconds — confirm against fetchHtml().
                $pages[] = HtmlCrawlSupport::fetchHtml($url, 30);
            } catch (\Throwable) {
                // Best effort: with index-style probing a failure is taken as the end
                // of the listing; otherwise the page is skipped and the loop goes on.
                if ($usesIndexStyle) {
                    break;
                }
            }
        }

        return $pages;
    }

    /**
     * Tell whether the listing is paged by file name, as is common on university
     * news sites: index.html (page 1), index_2.html, index_3.html, ...
     *
     * True when the path of $baseUrl ends in index / list (optionally with a _N
     * suffix) plus .htm, .html or .shtml, or when $html mentions an index_N / list_N
     * file of that kind.
     */
    protected static function usesIndexUnderscoreStyle(string $baseUrl, string $html): bool
    {
        $path = (string) parse_url($baseUrl, PHP_URL_PATH);
        if (preg_match('#/(?:index|list)(?:_\d+)?\.(?:html?|shtml)$#i', $path)) {
            return true;
        }

        return (bool) preg_match('#(?:index|list)_\d+\.(?:html?|shtml)#i', $html);
    }

    /**
     * Return the highest N (2..500) among the index_N / list_N file names that
     * appear in $html or $baseUrl, or 1 when there is none.
     */
    protected static function detectIndexUnderscoreMaxPage(string $html, string $baseUrl): int
    {
        $max = 1;
        $haystack = $html.' '.$baseUrl;

        if (preg_match_all('#(?:index|list)_(\d+)\.(?:html?|shtml)#i', $haystack, $matches)) {
            foreach ($matches[1] as $num) {
                $n = (int) $num;
                // Numbers outside 2..500 are ignored.
                if ($n >= 2 && $n <= 500) {
                    $max = max($max, $n);
                }
            }
        }

        return $max;
    }

    /**
     * Map an index_N / list_N URL back to its first page by replacing the file
     * name with "index" (extension kept). Other URLs are rebuilt from scheme,
     * host, port, path and query; URLs without scheme or host are returned as is.
     *
     * NOTE(review): "list_N" is also rewritten to "index", whereas
     * buildIndexUnderscorePageUrl() keeps the "list" stem — confirm this is intended.
     */
    protected static function normalizeIndexFirstPageUrl(string $baseUrl): string
    {
        $parts = parse_url($baseUrl);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return $baseUrl;
        }

        $path = $parts['path'] ?? '/';
        if (preg_match('#^(.*?/)(?:index|list)_\d+(\.(?:html?|shtml))$#i', $path, $m)) {
            $path = $m[1].'index'.$m[2];
        }

        $url = $parts['scheme'].'://'.$parts['host'];
        if (! empty($parts['port'])) {
            $url .= ':'.$parts['port'];
        }
        $url .= $path;
        if (! empty($parts['query'])) {
            $url .= '?'.$parts['query'];
        }

        return $url;
    }

    /**
     * Build the index_N / list_N URL for page $page.
     *
     * For $page <= 1 the normalized first-page URL is returned. Returns null when
     * $baseUrl has no scheme or host, or when its path does not end in an index /
     * list file (optionally already carrying a _N suffix). The original query
     * string is kept verbatim.
     */
    protected static function buildIndexUnderscorePageUrl(string $baseUrl, int $page): ?string
    {
        if ($page <= 1) {
            return self::normalizeIndexFirstPageUrl($baseUrl);
        }

        $parts = parse_url($baseUrl);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        $path = $parts['path'] ?? '/';
        if (! preg_match('#^(.*?/)(index|list)(?:_\d+)?(\.(?:html?|shtml))$#i', $path, $m)) {
            return null;
        }

        // directory + stem + "_N" + extension
        $path = $m[1].$m[2].'_'.$page.$m[3];

        $url = $parts['scheme'].'://'.$parts['host'];
        if (! empty($parts['port'])) {
            $url .= ':'.$parts['port'];
        }
        $url .= $path;
        if (! empty($parts['query'])) {
            $url .= '?'.$parts['query'];
        }

        return $url;
    }
}