slake-school-service/app/Services/Crawl/Adapters/FacultyListHtmlAdapter.php

<?php

namespace App\Services\Crawl\Adapters;

use App\Models\CrawlSource;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;

/**
 * Generic adapter for department / faculty listing pages.
 *
 * Every e-mail address found in the page is treated as one potential person.
 * The text surrounding the address is filtered by the requested keywords and
 * then mined for a display name and an affiliation.
 */
class FacultyListHtmlAdapter implements CrawlerAdapterInterface
{
    /** Bytes of raw HTML kept in front of an e-mail match when cutting its context window. */
    private const CONTEXT_BYTES_BEFORE = 400;

    /** Total size, in bytes, of the context window cut around an e-mail match. */
    private const CONTEXT_BYTES_TOTAL = 800;

    /**
     * Download the listing page and return at most `max_results` extracted people.
     *
     * Recognised `$params` keys:
     *  - `keyword`     raw keyword string, split by CrawlKeywordParser::parse()
     *  - `max_results` clamped to the range 1..100, defaults to 30
     *
     * @param  array<string, mixed>  $params
     * @return list<CrawlItemDto>
     *
     * @throws \RuntimeException when the HTTP response is not successful
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));

        $maxResults = min(100, max(1, (int) ($params['max_results'] ?? 30)));

        $response = Http::timeout(30)
            ->withHeaders(['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'])
            ->get($requestUrl);

        if (! $response->successful()) {
            throw new \RuntimeException('页面请求失败（HTTP '.$response->status().'）');
        }

        $html = (string) $response->body();
        $items = $this->extractFromHtml($html, $keywords, $requestUrl);

        return array_slice($items, 0, $maxResults);
    }

    /**
     * Build one CrawlItemDto per distinct e-mail address found in the HTML.
     *
     * An address is skipped when it cannot be normalised, when it has already
     * produced an item, when keywords were given and none of them occurs in
     * the surrounding text, or when no name can be derived for it.
     *
     * @param  list<string>  $keywords
     * @return list<CrawlItemDto>
     */
    protected function extractFromHtml(string $html, array $keywords, string $sourceUrl): array
    {
        $items = [];
        $seen = [];

        if (! preg_match_all(
            '#([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})#',
            $html,
            $emailMatches,
            PREG_OFFSET_CAPTURE
        )) {
            return [];
        }

        // Identical for every item, so build it once.
        $keywordLabel = implode(' ', $keywords);

        foreach ($emailMatches[1] as $match) {
            $email = CrawlAuthorParser::normalizeEmail($match[0]);
            if (! $email || isset($seen[$email])) {
                continue;
            }

            $plain = $this->contextAround($html, (int) $match[1]);

            if (! $this->matchesAnyKeyword($plain, $keywords)) {
                continue;
            }

            $name = $this->guessName($plain, $email);
            if ($name === '') {
                continue;
            }

            $affiliation = $this->guessAffiliation($plain);
            $university = CrawlAuthorParser::universityFromAffiliation($affiliation);

            // Marked as seen only once an item is really emitted, so a later
            // occurrence of the same address with a better context still gets a chance.
            $seen[$email] = true;

            $person = [
                'name' => $name,
                'email' => $email,
                'affiliation' => $affiliation,
                'university_name' => $university,
            ];

            $items[] = new CrawlItemDto(
                externalId: 'faculty:'.md5($email),
                title: $name,
                canonicalUrl: $sourceUrl,
                authors: $name,
                summary: Str::limit($plain, 300),
                schoolName: $university,
                extra: [
                    'platform' => 'faculty_html',
                    'lead_author' => $person,
                    'keyword' => $keywordLabel,
                ],
                authorsParsed: [$person],
            );
        }

        return $items;
    }

    /**
     * Guess a person's display name from the context text.
     *
     * The first run of 2-4 CJK ideographs in the text wins; when there is
     * none, the name is derived from the local part of the e-mail address
     * (separators become spaces, words are title-cased).
     *
     * NOTE(review): the title suffix in the pattern is optional, so it does
     * not constrain which run is picked — the first CJK run always wins.
     */
    protected function guessName(string $plain, string $email): string
    {
        if (preg_match('/([\x{4e00}-\x{9fff}]{2,4})\s*(?:教授|副教授|讲师|研究员|博士|老师)?/u', $plain, $m)) {
            return trim($m[1]);
        }

        $local = strstr($email, '@', true) ?: '';
        $local = str_replace(['.', '_', '-'], ' ', $local);

        return Str::title(trim($local));
    }

    /**
     * Guess an affiliation: the first stretch of CJK/Latin text that ends in
     * an institution word (大学, 学院, 研究院, 研究所, University, College).
     *
     * @return string|null cleaned affiliation text, or null when nothing matches
     */
    protected function guessAffiliation(string $plain): ?string
    {
        if (preg_match('/((?:[\x{4e00}-\x{9fff}A-Za-z\s]{2,40}(?:大学|学院|研究院|研究所|University|College)))/u', $plain, $m)) {
            return CrawlAuthorParser::cleanText($m[1]);
        }

        return null;
    }

    /**
     * Return the plain-text context surrounding a byte offset in the HTML.
     *
     * The window is cut on UTF-8 character boundaries. Slicing by raw bytes
     * can split a multi-byte character, which makes the `/u` whitespace
     * normalisation below fail and leaves an empty context.
     */
    private function contextAround(string $html, int $offset): string
    {
        $start = max(0, $offset - self::CONTEXT_BYTES_BEFORE);

        // mb_strcut() takes byte positions but snaps both ends to character boundaries.
        $window = mb_strcut($html, $start, self::CONTEXT_BYTES_TOTAL, 'UTF-8');

        // Guard against pages that are not valid UTF-8 to begin with.
        $window = mb_scrub($window, 'UTF-8');

        $plain = html_entity_decode(strip_tags($window), ENT_QUOTES | ENT_HTML5, 'UTF-8');

        return preg_replace('/\s+/u', ' ', $plain) ?? '';
    }

    /**
     * Tell whether the context passes the keyword filter.
     *
     * An empty keyword list disables the filter. Otherwise at least one
     * non-empty keyword must occur in the text, compared case-insensitively.
     *
     * @param  list<string>  $keywords
     */
    private function matchesAnyKeyword(string $plain, array $keywords): bool
    {
        if ($keywords === []) {
            return true;
        }

        foreach ($keywords as $keyword) {
            // Multibyte-aware search so case folding also works beyond ASCII.
            if ($keyword !== '' && mb_stripos($plain, $keyword, 0, 'UTF-8') !== false) {
                return true;
            }
        }

        return false;
    }
}