slake-school-service/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php

<?php

namespace App\Services\Crawl\Adapters;

use App\Models\CrawlSource;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;

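/**
 * Generic department / faculty list page adapter: entries that carry an e-mail address take priority;
 * when the page has no e-mails, list cards (e.g. "tsites" pages) are parsed for name, unit, academic title and homepage.
 */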
class FacultyListHtmlAdapter implements CrawlerAdapterInterface
{
    /**
     * Crawl a faculty list page, follow its pagination and return the de-duplicated people found.
     *
     * Options read from $params: `keyword` (handed to CrawlKeywordParser::parse), `max_results`
     * (default 30, clamped to 1..500) and `max_pages` (default 1, clamped to 1..50).
     * $source is part of the interface signature and is not used by this method.
     *
     * @param  array<string, mixed>  $params
     * @return list<CrawlItemDto>
     *
     * @throws \RuntimeException when a page request is not successful (raised by fetchHtml)
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        $resultLimit = min(500, max(1, (int) ($params['max_results'] ?? 30)));
        $pageLimit = min(50, max(1, (int) ($params['max_pages'] ?? 1)));

        $baseUrl = $this->normalizeRequestUrl($requestUrl);
        $firstHtml = $this->fetchHtml($baseUrl);
        $lastPage = min($pageLimit, $this->detectTotalPages($firstHtml));

        // Keyed by externalId so the same person appearing on several pages is kept once.
        $collected = [];

        for ($page = 1; $page <= $lastPage && count($collected) < $resultLimit; $page++) {
            // The first page was already downloaded to detect the page count.
            $html = $page === 1
                ? $firstHtml
                : $this->fetchHtml($this->buildPageUrl($baseUrl, $page, $firstHtml));

            foreach ($this->extractFromHtml($html, $keywords, $requestUrl) as $item) {
                if (isset($collected[$item->externalId])) {
                    continue;
                }

                $collected[$item->externalId] = $item;

                if (count($collected) >= $resultLimit) {
                    break;
                }
            }
        }

        return $this->enrichEmailsFromProfilePages(array_values($collected));
    }

    /**
     * Fill in missing e-mail addresses by downloading each item's profile page.
     *
     * Does nothing when `crawl.faculty.profile_email_enrich_enabled` is falsy. Items that already
     * have an e-mail (see itemHasEmail) or have no canonicalUrl are passed through untouched.
     * The remaining profile pages are requested concurrently through Http::pool, in chunks of
     * `crawl.faculty.profile_enrich_pool_size` (clamped to 1..12), with a per-request timeout of
     * `crawl.faculty.profile_http_timeout_seconds` (at least 8).
     *
     * The returned list has the same length and order as $items.
     *
     * @param  list<CrawlItemDto>  $items
     * @return list<CrawlItemDto>
     */
    protected function enrichEmailsFromProfilePages(array $items): array
    {
        if (! config('crawl.faculty.profile_email_enrich_enabled', true)) {
            return $items;
        }

        $poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 6)));
        $timeout = max(8, (int) config('crawl.faculty.profile_http_timeout_seconds', 20));
        $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

        $enriched = [];
        foreach (array_chunk($items, $poolSize) as $chunk) {
            // Key pending requests by their position in the chunk: positions are always unique,
            // so no item can be dropped even if two items share an externalId.
            $pending = [];
            foreach ($chunk as $index => $item) {
                if (! $this->itemHasEmail($item) && $item->canonicalUrl) {
                    $pending['item_'.$index] = $item;
                }
            }

            $responses = [];
            if ($pending !== []) {
                $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($pending, $headers, $timeout) {
                    foreach ($pending as $key => $item) {
                        $pool->as($key)
                            ->timeout($timeout)
                            ->withHeaders($headers)
                            ->get($item->canonicalUrl);
                    }
                });
            }

            // Walk the chunk itself (not $pending) so the output keeps the input order.
            foreach ($chunk as $index => $item) {
                $response = $responses['item_'.$index] ?? null;

                // A pooled request that could not be completed (timeout, DNS failure, ...) may be
                // reported as a Throwable instead of a Response; calling successful() on it would
                // be a fatal error, so only real responses are inspected.
                if ($response instanceof \Illuminate\Http\Client\Response && $response->successful()) {
                    $email = $this->extractEmailFromProfileHtml((string) $response->body());
                    if ($email) {
                        $item = $this->applyEmailToItem($item, $email);
                    }
                }

                $enriched[] = $item;
            }
        }

        return $enriched;
    }

    /**
     * Tell whether an item already carries a usable e-mail address.
     *
     * Looks at `extra['lead_author']['email']` first and then at the `email` of every entry in
     * `authorsParsed`; an address counts when CrawlAuthorParser::normalizeEmail returns a truthy value.
     */
    protected function itemHasEmail(CrawlItemDto $item): bool
    {
        $rawEmails = [];

        $leadAuthor = $item->extra['lead_author'] ?? null;
        if (is_array($leadAuthor)) {
            $rawEmails[] = $leadAuthor['email'] ?? null;
        }

        foreach ($item->authorsParsed as $parsedAuthor) {
            $rawEmails[] = $parsedAuthor['email'] ?? null;
        }

        foreach ($rawEmails as $rawEmail) {
            if (CrawlAuthorParser::normalizeEmail($rawEmail)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Return a copy of the item with the given e-mail written into its author data.
     *
     * The e-mail (normalized when CrawlAuthorParser::normalizeEmail yields a value, otherwise used
     * as given) is stored in `extra['lead_author']['email']` and in the first `authorsParsed`
     * entry. When `authorsParsed` is empty a single author entry is created from the item title
     * and the lead-author / school data. All other DTO fields are copied unchanged.
     */
    protected function applyEmailToItem(CrawlItemDto $item, string $email): CrawlItemDto
    {
        $address = CrawlAuthorParser::normalizeEmail($email) ?? $email;

        $extra = $item->extra;
        $leadAuthor = $extra['lead_author'] ?? null;
        if (! is_array($leadAuthor)) {
            $leadAuthor = [];
        }
        $leadAuthor['email'] = $address;
        $extra['lead_author'] = $leadAuthor;

        $parsedAuthors = $item->authorsParsed;
        if ($parsedAuthors !== []) {
            $parsedAuthors[0]['email'] = $address;
        } else {
            // No parsed author yet: synthesize one from what the item already knows.
            $parsedAuthors[] = [
                'name' => $item->title,
                'email' => $address,
                'affiliation' => $leadAuthor['affiliation'] ?? $leadAuthor['college'] ?? null,
                'university_name' => $leadAuthor['university_name'] ?? $item->schoolName,
            ];
        }

        return new CrawlItemDto(
            externalId: $item->externalId,
            title: $item->title,
            canonicalUrl: $item->canonicalUrl,
            authors: $item->authors,
            summary: $item->summary,
            publishedAt: $item->publishedAt,
            schoolName: $item->schoolName,
            section: $item->section,
            contentHtml: $item->contentHtml,
            extra: $extra,
            authorsParsed: $parsedAuthors,
        );
    }

    /**
     * Pick the most plausible personal e-mail address out of a profile page.
     *
     * Strategy, in order:
     *  1. an address directly following an e-mail label (电子邮箱 / 电子信箱 / E-mail / 邮箱);
     *  2. otherwise the first address anywhere in the HTML ending in `.edu.cn` or `.edu`;
     *  3. otherwise the first address found.
     * Addresses rejected by normalizeEmail or flagged by isNoiseEmail are ignored throughout.
     *
     * @return string|null the chosen address, or null when the page has no usable one
     */
    protected function extractEmailFromProfileHtml(string $html): ?string
    {
        // Normalizes a raw match and drops it when it is empty or a noise mailbox.
        $usable = function (string $raw): ?string {
            $email = CrawlAuthorParser::normalizeEmail($raw);

            return ($email && ! $this->isNoiseEmail($email)) ? $email : null;
        };

        $labeledPatterns = [
            '/电子邮箱[：:]\s*<\/strong>\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/电子邮箱[：:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/电子信箱[：:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
            '/E-?mail[：:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/iu',
            '/邮箱[：:]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/u',
        ];

        foreach ($labeledPatterns as $labeledPattern) {
            if (! preg_match($labeledPattern, $html, $hit)) {
                continue;
            }

            $labeled = $usable($hit[1]);
            if ($labeled !== null) {
                return $labeled;
            }
        }

        if (! preg_match_all(
            '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
            $html,
            $found,
        )) {
            return null;
        }

        // Array keys give first-occurrence de-duplication while keeping page order.
        $distinct = [];
        foreach ($found[1] as $raw) {
            $candidate = $usable($raw);
            if ($candidate !== null) {
                $distinct[$candidate] = true;
            }
        }

        $ordered = array_keys($distinct);
        if ($ordered === []) {
            return null;
        }

        foreach ($ordered as $candidate) {
            if (str_ends_with($candidate, '.edu.cn') || str_ends_with($candidate, '.edu')) {
                return $candidate;
            }
        }

        return $ordered[0];
    }

    /**
     * Tell whether an address is almost certainly not a person's mailbox.
     *
     * Two kinds of noise are rejected:
     *  - role mailboxes whose local part is exactly noreply, no-reply, admin, webmaster,
     *    postmaster, root, support, service, info or contact (case-insensitive);
     *  - static asset file names that merely look like an address to the e-mail regex used
     *    in this class, e.g. the retina image name "logo@2x.png".
     */
    protected function isNoiseEmail(string $email): bool
    {
        if (preg_match(
            '/^(noreply|no-reply|admin|webmaster|postmaster|root|support|service|info|contact)@/i',
            $email,
        )) {
            return true;
        }

        // None of these endings is a real top-level domain, so a match means a file name.
        return (bool) preg_match('/\.(png|jpe?g|gif|svg|webp|bmp|ico|css|js)$/i', $email);
    }

    /**
     * Download a page and return its body.
     *
     * The request uses a 30 second timeout and the crawler's User-Agent / Accept headers.
     *
     * @throws \RuntimeException when the response is not successful
     */
    protected function fetchHtml(string $url): string
    {
        $requestHeaders = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

        $request = Http::timeout(30)->withHeaders($requestHeaders);
        $response = $request->get($url);

        if ($response->successful()) {
            return (string) $response->body();
        }

        throw new \RuntimeException('页面请求失败（HTTP '.$response->status().'）：'.$url);
    }

    /**
     * Work out how many result pages the list has, judging from the first page's HTML.
     *
     * An explicit `totalpage=N` anywhere in the markup wins. Otherwise the "共 N 条" total is
     * divided by the number of `<div class="name">` cards inside the `div.list > ul` block of
     * this page. Falls back to 1 whenever neither approach yields a number.
     *
     * @return int page count, always >= 1
     */
    protected function detectTotalPages(string $html): int
    {
        if (preg_match('/totalpage=(\d+)/i', $html, $explicit) === 1) {
            return max(1, (int) $explicit[1]);
        }

        if (preg_match('/共\s*(\d+)\s*条/u', $html, $total) !== 1) {
            return 1;
        }

        if (preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $list) !== 1) {
            return 1;
        }

        // Cards on this page = page size.
        $cardsPerPage = preg_match_all('/<div\s+class="name">/u', $list[1]) ?: 0;
        if ($cardsPerPage <= 0) {
            return 1;
        }

        return max(1, (int) ceil(((int) $total[1]) / $cardsPerPage));
    }

    /**
     * Build the URL of result page $page from the first page's URL.
     *
     * The existing query string is kept, `PAGENUM` is set to the page number and, when the first
     * page's HTML mentions `totalpage=N`, that value is passed along as `totalpage`. User info
     * and fragment of $baseUrl are not carried over. A $baseUrl without scheme or host is
     * returned unchanged.
     */
    protected function buildPageUrl(string $baseUrl, int $page, string $firstPageHtml): string
    {
        $parts = parse_url($baseUrl);
        $isAbsolute = is_array($parts) && ! empty($parts['scheme']) && ! empty($parts['host']);
        if (! $isAbsolute) {
            return $baseUrl;
        }

        $query = [];
        parse_str((string) ($parts['query'] ?? ''), $query);
        $query['PAGENUM'] = (string) $page;

        if (preg_match('/totalpage=(\d+)/i', $firstPageHtml, $totalHit)) {
            $query['totalpage'] = $totalHit[1];
        }

        $authority = $parts['host'].(empty($parts['port']) ? '' : ':'.$parts['port']);
        $path = $parts['path'] ?? '/';

        // $query always contains at least PAGENUM, so a query string is always appended.
        return $parts['scheme'].'://'.$authority.$path.'?'.http_build_query($query);
    }

    /**
     * Reduce a request URL to scheme, host, optional port, path and query.
     *
     * User info and fragment are dropped and a missing path becomes "/". A URL without scheme
     * or host cannot be rebuilt and is returned unchanged.
     */
    protected function normalizeRequestUrl(string $url): string
    {
        $parts = parse_url($url);
        $isAbsolute = is_array($parts) && ! empty($parts['scheme']) && ! empty($parts['host']);
        if (! $isAbsolute) {
            return $url;
        }

        $port = empty($parts['port']) ? '' : ':'.$parts['port'];
        $path = $parts['path'] ?? '/';
        $query = empty($parts['query']) ? '' : '?'.$parts['query'];

        return $parts['scheme'].'://'.$parts['host'].$port.$path.$query;
    }

    /**
     * Extract faculty items from one page of HTML.
     *
     * E-mail based extraction is tried first; the structured list-card parser is used only when
     * the e-mail pass produced nothing.
     *
     * @param  list<string>  $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromHtml(string $html, array $keywords, string $sourceUrl): array
    {
        $byEmail = $this->extractFromEmailBlocks($html, $keywords, $sourceUrl);

        return $byEmail === []
            ? $this->extractFromStructuredFacultyList($html, $keywords, $sourceUrl)
            : $byEmail;
    }

    /**
     * Build one faculty item per distinct e-mail address found in the page.
     *
     * For every address, roughly 400 bytes of markup on either side are converted to plain text
     * and used for keyword matching and for guessing the person's name and affiliation. Addresses
     * that fail normalization, were already used, or are flagged by isNoiseEmail (role mailboxes
     * such as a footer "webmaster@...") are skipped, so a page that only contains such addresses
     * yields an empty list and the caller can fall back to the structured list parser.
     *
     * @param  list<string>  $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromEmailBlocks(string $html, array $keywords, string $sourceUrl): array
    {
        $items = [];
        $seen = [];

        if (! preg_match_all(
            '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
            $html,
            $emailMatches,
            PREG_OFFSET_CAPTURE
        )) {
            return [];
        }

        foreach ($emailMatches[1] as $match) {
            $email = CrawlAuthorParser::normalizeEmail($match[0]);
            if (! $email || isset($seen[$email]) || $this->isNoiseEmail($email)) {
                continue;
            }

            // PREG_OFFSET_CAPTURE offsets are byte offsets. mb_strcut also cuts by bytes but snaps
            // both ends to character boundaries, so a multi-byte (e.g. Chinese) character is never
            // split in half — a split character would make the window invalid UTF-8 and the /u
            // regexes applied to it would fail.
            $pos = (int) $match[1];
            $window = mb_strcut($html, max(0, $pos - 400), 800, 'UTF-8');
            $plain = $this->htmlToPlain($window);

            if (! $this->matchesKeywords($plain, $keywords)) {
                continue;
            }

            $name = $this->guessName($plain, $email);
            if ($name === '') {
                continue;
            }

            $affiliation = $this->guessAffiliation($plain);
            // Marked as seen only once an item is produced, so a later occurrence of the same
            // address (with different surrounding text) still gets a chance.
            $seen[$email] = true;

            $items[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($email),
                name: $name,
                profileUrl: $sourceUrl,
                email: $email,
                affiliation: $affiliation,
                universityName: CrawlAuthorParser::universityFromAffiliation($affiliation)
                    ?? $this->inferUniversityFromSource($sourceUrl, $html),
                summary: Str::limit($plain, 300),
                keywords: $keywords,
                academicTitle: null,
                platform: 'faculty_html',
            );
        }

        return $items;
    }

    /**
     * Parse "tsites" style teacher lists (e.g. Shanghai Jiao Tong University's
     * CollegeTeacherList): cards laid out as div.list > ul > li.
     *
     * Each card must contain a `<div class="name">` holding something that looks like a person
     * name. The profile link comes from the card's first `<a href="...">` or, failing that, from
     * the page's inline scripts. Cards are de-duplicated by profile URL (or by name when no URL
     * is known). No e-mail is set on the resulting items.
     *
     * @param  list<string>  $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromStructuredFacultyList(string $html, array $keywords, string $sourceUrl): array
    {
        // Page heading, used as the affiliation when a card does not state one.
        $collegeName = preg_match('/<div\s+class="title">\s*([^<]+?)\s*<\/div>/u', $html, $titleHit)
            ? CrawlAuthorParser::cleanText($titleHit[1])
            : null;

        // Restrict the search to the list container when it can be located; otherwise scan the whole page.
        $scope = preg_match('/<div\s+class="list"[^>]*>\s*<ul>(.*?)<\/ul>\s*<\/div>/su', $html, $listHit)
            ? $listHit[1]
            : $html;

        if (! preg_match_all('#<li>(.*?)</li>#su', $scope, $cards)) {
            return [];
        }

        $pageUniversity = $this->inferUniversityFromSource($sourceUrl, $html);

        $people = [];
        $usedKeys = [];

        foreach ($cards[1] as $card) {
            $card = (string) $card;

            if (! preg_match('/<div\s+class="name">\s*([^<]+?)\s*<\/div>/u', $card, $nameHit)) {
                continue;
            }

            $name = CrawlAuthorParser::cleanText($nameHit[1]);
            if ($name === '' || ! $this->looksLikePersonName($name)) {
                continue;
            }

            $href = preg_match('/<a\s+[^>]*href="([^"]*)"/u', $card, $linkHit)
                ? (string) $linkHit[1]
                : '';

            $profileUrl = $this->resolveUrl($href, $sourceUrl)
                ?? $this->inferProfileUrlFromPageScripts($html, $name, $sourceUrl);

            $cardText = $this->htmlToPlain($card);
            if (! $this->matchesKeywords($cardText.' '.$name, $keywords)) {
                continue;
            }

            $dedupeKey = $profileUrl ?: ('name:'.md5($name));
            if (isset($usedKeys[$dedupeKey])) {
                continue;
            }
            $usedKeys[$dedupeKey] = true;

            $affiliation = $this->parseLabeledField($card, '所在单位') ?? $collegeName;
            $academicTitle = $this->parseLabeledField($card, '职称');
            // On list pages the "unit" field is usually the college, so the university name is
            // taken from the site / page header first and from the affiliation only as a fallback.
            $universityName = $pageUniversity ?? CrawlAuthorParser::universityFromAffiliation($affiliation);

            $summaryParts = [];
            if ($academicTitle) {
                $summaryParts[] = '职称：'.$academicTitle;
            }
            if ($affiliation) {
                $summaryParts[] = '单位：'.$affiliation;
            }
            $introduction = $this->parseLabeledField($card, '简介');
            if ($introduction) {
                $summaryParts[] = $introduction;
            }

            $people[] = $this->makeFacultyItem(
                externalKey: 'faculty:'.md5($dedupeKey),
                name: $name,
                profileUrl: $profileUrl,
                email: null,
                affiliation: $affiliation,
                universityName: $universityName,
                summary: Str::limit(implode('；', $summaryParts), 300),
                keywords: $keywords,
                academicTitle: $academicTitle,
                platform: 'faculty_html_tsites',
            );
        }

        return $people;
    }

    /**
     * Assemble the CrawlItemDto for one faculty member.
     *
     * The person's name doubles as item title and author string; the affiliation is stored both
     * as `affiliation` and as `college` / `college_name`. The same person data is written to
     * `extra['lead_author']` and to the single `authorsParsed` entry.
     *
     * @param  list<string>  $keywords
     */
    protected function makeFacultyItem(
        string $externalKey,
        string $name,
        ?string $profileUrl,
        ?string $email,
        ?string $affiliation,
        ?string $universityName,
        ?string $summary,
        array $keywords,
        ?string $academicTitle,
        string $platform,
    ): CrawlItemDto {
        $leadAuthor = [
            'name' => $name,
            'email' => $email,
            'affiliation' => $affiliation,
            'college' => $affiliation,
            'university_name' => $universityName,
            'academic_title' => $academicTitle,
            'profile_url' => $profileUrl,
        ];

        $parsedAuthor = [
            'name' => $name,
            'email' => $email,
            'affiliation' => $affiliation,
            'university_name' => $universityName,
            'academic_title' => $academicTitle,
        ];

        $extra = [
            'platform' => $platform,
            'academic_title' => $academicTitle,
            'college_name' => $affiliation,
            'profile_url' => $profileUrl,
            'lead_author' => $leadAuthor,
            'keyword' => implode(' ', $keywords),
        ];

        return new CrawlItemDto(
            externalId: $externalKey,
            title: $name,
            canonicalUrl: $profileUrl,
            authors: $name,
            summary: $summary,
            schoolName: $universityName,
            extra: $extra,
            authorsParsed: [$parsedAuthor],
        );
    }

    /**
     * Tell whether the text satisfies the keyword filter.
     *
     * An empty keyword list accepts everything. Otherwise at least one non-empty keyword must
     * occur in the text (stripos, i.e. ASCII case-insensitive substring match).
     *
     * @param  list<string>  $keywords
     */
    protected function matchesKeywords(string $plain, array $keywords): bool
    {
        if ($keywords === []) {
            return true;
        }

        foreach ($keywords as $keyword) {
            if ($keyword === '') {
                continue;
            }

            if (stripos($plain, $keyword) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Convert an HTML fragment to single-line plain text.
     *
     * Tags are stripped, HTML entities decoded and every run of whitespace collapsed to one space.
     * Bytes that are not valid UTF-8 (for instance a multi-byte character cut in half at the edge
     * of a byte-sliced fragment) are replaced by mb_scrub's substitute character first: the
     * whitespace regex runs in /u mode and would otherwise fail on such input, discarding the
     * entire text.
     */
    protected function htmlToPlain(string $html): string
    {
        // Scrub before decoding so entities (always valid once decoded) are unaffected.
        $text = mb_scrub(strip_tags($html), 'UTF-8');
        $plain = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        return preg_replace('/\s+/u', ' ', $plain) ?? '';
    }

    /**
     * Read the value of a "label: value" field from an HTML fragment.
     *
     * Matches the label followed by a full-width or ASCII colon and captures the text up to the
     * next tag; the capture is passed through CrawlAuthorParser::cleanText.
     *
     * @return string|null the cleaned value, or null when the label is not present
     */
    protected function parseLabeledField(string $html, string $label): ?string
    {
        $found = preg_match('/'.preg_quote($label, '/').'[：:]\s*([^<]+)/u', $html, $hit);

        return $found ? CrawlAuthorParser::cleanText($hit[1]) : null;
    }

    /**
     * Heuristic check that a string is a person's name rather than page chrome.
     *
     * Rejects strings starting with common navigation labels (首页, 登录, 联系我们, 下页, 尾页, 转到).
     * Accepts 2-4 Chinese characters, optionally followed by "·" and another 2-4 characters, or
     * a Latin name: a letter followed by 1-30 letters, spaces, dots or hyphens.
     */
    protected function looksLikePersonName(string $name): bool
    {
        $isNavigationLabel = preg_match('/^(首页|登录|联系我们|下页|尾页|转到)/u', $name);
        if ($isNavigationLabel) {
            return false;
        }

        $namePatterns = [
            '/^[\x{4e00}-\x{9fff}]{2,4}(?:·[\x{4e00}-\x{9fff}]{2,4})?$/u',
            '/^[A-Za-z][A-Za-z\s\.\-]{1,30}$/',
        ];

        foreach ($namePatterns as $namePattern) {
            if (preg_match($namePattern, $name)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Resolve an href taken from the page against the URL the page was fetched from.
     *
     * Returns null for hrefs that cannot point to a profile page: empty values, fragment-only
     * links such as the "#" placeholder, and any explicit scheme other than http/https
     * (javascript:, mailto:, tel:, ... in any letter case). Absolute http(s) URLs are returned
     * as they are. Everything else is resolved against $baseUrl:
     *  - "//host/path"  takes the base scheme;
     *  - "/path"        takes the base origin;
     *  - "?query"       keeps the base path and replaces the query;
     *  - "path"         is appended to the base directory, with "." and ".." segments removed.
     * When $baseUrl has no scheme or host the (decoded, trimmed) href is returned unresolved.
     */
    protected function resolveUrl(string $href, string $baseUrl): ?string
    {
        $href = trim(html_entity_decode($href, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
        if ($href === '' || str_starts_with($href, '#')) {
            return null;
        }

        if (preg_match('#^https?://#i', $href)) {
            return $href;
        }

        // Any other "scheme:" prefix (javascript:, mailto:, tel:, data:, ...) is not a page link.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href)) {
            return null;
        }

        $base = parse_url($baseUrl);
        if (! is_array($base) || empty($base['scheme']) || empty($base['host'])) {
            return $href;
        }

        $origin = $base['scheme'].'://'.$base['host'];
        if (! empty($base['port'])) {
            $origin .= ':'.$base['port'];
        }

        if (str_starts_with($href, '//')) {
            return $base['scheme'].':'.$href;
        }

        if (str_starts_with($href, '/')) {
            return $origin.$href;
        }

        $path = $base['path'] ?? '/';

        // A query-only reference addresses the same document, so the file name must be kept.
        if (str_starts_with($href, '?')) {
            return $origin.$path.$href;
        }

        $dir = str_contains($path, '/') ? substr($path, 0, (int) strrpos($path, '/') + 1) : '/';

        // Only the path part takes part in dot-segment removal; query/fragment are re-attached as-is.
        $cut = strcspn($href, '?#');
        $relativePath = substr($href, 0, $cut);
        $suffix = substr($href, $cut);

        $resolved = [];
        $segments = explode('/', $dir.$relativePath);
        $lastIndex = count($segments) - 1;
        foreach ($segments as $index => $segment) {
            if ($segment === '.' || $segment === '..') {
                // Never pop the leading empty segment: ".." cannot climb above the root.
                if ($segment === '..' && count($resolved) > 1) {
                    array_pop($resolved);
                }
                // A trailing "." or ".." denotes a directory, so keep the trailing slash.
                if ($index === $lastIndex) {
                    $resolved[] = '';
                }

                continue;
            }

            $resolved[] = $segment;
        }

        return $origin.implode('/', $resolved).$suffix;
    }

    /**
     * Look for a person's profile link inside the page's inline scripts.
     *
     * Searches for an `addimg("...", "/…index.htm", "<name>"` call whose third argument is the
     * given name and resolves its second argument against $sourceUrl.
     *
     * @return string|null the resolved profile URL, or null when no such call exists for $name
     */
    protected function inferProfileUrlFromPageScripts(string $html, string $name, string $sourceUrl): ?string
    {
        $quotedName = preg_quote($name, '/');
        $found = preg_match(
            '/addimg\(\s*(?:"[^"]*"|\'[^\']*\')\s*,\s*"(\\/[^"]+index\.htm)"\s*,\s*"'.$quotedName.'"/u',
            $html,
            $hit,
        );

        return $found ? $this->resolveUrl($hit[1], $sourceUrl) : null;
    }

    /**
     * Guess the university a page belongs to.
     *
     * The host of $sourceUrl is checked against a short table of known domains first (substring
     * match, first hit wins). Failing that, the first "…大学" phrase in the page's plain text is
     * used.
     *
     * @return string|null the university name, or null when nothing could be inferred
     */
    protected function inferUniversityFromSource(string $sourceUrl, string $html): ?string
    {
        $knownDomains = [
            'sjtu.edu.cn' => '上海交通大学',
            'tsinghua.edu.cn' => '清华大学',
            'pku.edu.cn' => '北京大学',
            'zju.edu.cn' => '浙江大学',
            'fudan.edu.cn' => '复旦大学',
        ];

        $host = parse_url($sourceUrl, PHP_URL_HOST);
        if (is_string($host)) {
            $host = strtolower($host);
            foreach ($knownDomains as $domain => $university) {
                if (str_contains($host, $domain)) {
                    return $university;
                }
            }
        }

        $pageText = $this->htmlToPlain($html);
        $found = preg_match('/([\x{4e00}-\x{9fff}A-Za-z]{2,20}大学)/u', $pageText, $hit);

        return $found ? CrawlAuthorParser::cleanText($hit[1]) : null;
    }

    /**
     * Guess a person's name from the plain text surrounding their e-mail address.
     *
     * Order of preference:
     *  1. 2-4 Chinese characters immediately followed (optionally after whitespace) by an
     *     academic title (教授, 副教授, 讲师, 研究员, 博士, 老师). The name is matched lazily so the
     *     title itself is not swallowed: "张三教授" yields "张三", not "张三教授".
     *  2. the first run of 2-4 Chinese characters in the text;
     *  3. the e-mail's local part with ".", "_" and "-" turned into spaces, title-cased.
     *
     * @return string the guessed name; may be empty when the e-mail has no usable local part
     */
    protected function guessName(string $plain, string $email): string
    {
        $han = '[\x{4e00}-\x{9fff}]';

        // A name anchored by a title is the most reliable signal in the window.
        if (preg_match('/('.$han.'{2,4}?)\s*(?:教授|副教授|讲师|研究员|博士|老师)/u', $plain, $m)) {
            return trim($m[1]);
        }

        if (preg_match('/('.$han.'{2,4})/u', $plain, $m)) {
            return trim($m[1]);
        }

        $local = strstr($email, '@', true) ?: '';
        $local = str_replace(['.', '_', '-'], ' ', $local);

        return Str::title(trim($local));
    }

    /**
     * Guess an affiliation from plain text.
     *
     * Takes the first phrase of 2-40 Chinese characters, Latin letters or whitespace that ends in
     * 大学, 学院, 研究院, 研究所, "University" or "College", cleaned with CrawlAuthorParser::cleanText.
     *
     * @return string|null the affiliation, or null when no such phrase exists
     */
    protected function guessAffiliation(string $plain): ?string
    {
        $found = preg_match(
            '/((?:[\x{4e00}-\x{9fff}A-Za-z\s]{2,40}(?:大学|学院|研究院|研究所|University|College)))/u',
            $plain,
            $hit,
        );

        return $found ? CrawlAuthorParser::cleanText($hit[1]) : null;
    }
}