master
lion 1 day ago
parent 099c609328
commit f9256f17bb

@ -137,6 +137,8 @@ class CrawlJobController extends Controller
'status' => 'pending',
]);
@set_time_limit(300);
try {
$job = $runner->run($job, $source, $params);
$this->applyCrawlDefaultsToPreviewItems(
@ -412,12 +414,18 @@ class CrawlJobController extends Controller
}
if ($job->target_type === 'teacher') {
return sprintf(
$summary = sprintf(
'已从 %s 抓取 %d 位老师,已入库 %d 位老师',
$sourceName,
$fetched,
(int) ($importResult['teachers_imported'] ?? 0),
);
$skippedProfiles = $this->countProfileEnrichSkipped($job);
if ($skippedProfiles > 0) {
$summary .= sprintf('%d 位未访问主页补邮箱,避免超时)', $skippedProfiles);
}
return $summary;
}
return sprintf(
@ -428,6 +436,20 @@ class CrawlJobController extends Controller
);
}
/**
 * Count the teacher items of a crawl job that were flagged as "profile enrich skipped".
 *
 * Only the `payload` column is loaded; an item is counted when
 * `payload['extra']['profile_enrich_skipped']` is strictly `true`.
 */
protected function countProfileEnrichSkipped(CrawlJob $job): int
{
    $teacherItems = CrawlJobItem::query()
        ->where('crawl_job_id', $job->id)
        ->where('target_type', 'teacher')
        ->get(['payload']);

    $skippedCount = 0;
    foreach ($teacherItems as $teacherItem) {
        $extra = $teacherItem->payload['extra'] ?? [];
        if (($extra['profile_enrich_skipped'] ?? false) === true) {
            $skippedCount++;
        }
    }

    return $skippedCount;
}
/**
* @param array{
* imported:int,

@ -30,10 +30,14 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
$merged = [];
$seen = [];
for ($page = 1; $page <= $pagesToFetch; $page++) {
for ($page = 1; $page <= $pagesToFetch && count($merged) < $maxResults; $page++) {
$html = $page === 1
? $firstHtml
: $this->fetchHtml($this->buildPageUrl($baseUrl, $page, $firstHtml));
: null;
if ($html === null) {
break;
}
foreach ($this->extractFromHtml($html, $keywords, $requestUrl) as $item) {
if (isset($seen[$item->externalId])) {
@ -47,52 +51,156 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
}
}
return $this->enrichEmailsFromProfilePages($merged);
if ($pagesToFetch > 1 && count($merged) < $maxResults) {
$merged = $this->fetchRemainingListPages(
$baseUrl,
$firstHtml,
$pagesToFetch,
$keywords,
$requestUrl,
$merged,
$seen,
$maxResults,
);
}
return $this->enrichEmailsFromProfilePages($merged, $params);
}
/**
 * Fetch list pages 2..$pagesToFetch in concurrent batches and append their new items to $merged.
 *
 * Page URLs are built with buildPageUrl() and requested through fetchHtmlPool() in chunks of
 * `crawl.faculty.list_fetch_pool_size` (clamped to 1..10). Within a chunk the pages are
 * processed in ascending page order. Items whose externalId is already present in $seen are
 * dropped. Collection stops as soon as $merged holds $maxResults items.
 *
 * @param list<CrawlItemDto> $merged items collected so far (returned extended, never shrunk)
 * @param array<string, true> $seen externalIds already collected
 * @param list<string> $keywords forwarded to extractFromHtml()
 * @return list<CrawlItemDto>
 */
protected function fetchRemainingListPages(
    string $baseUrl,
    string $firstHtml,
    int $pagesToFetch,
    array $keywords,
    string $requestUrl,
    array $merged,
    array $seen,
    int $maxResults,
): array {
    // Already at the limit: return before issuing any HTTP request. Without this guard a
    // full $merged would still trigger a fetch and grow by one item before the limit check.
    if (count($merged) >= $maxResults) {
        return $merged;
    }

    $poolSize = max(1, min(10, (int) config('crawl.faculty.list_fetch_pool_size', 5)));

    $pageUrls = [];
    for ($page = 2; $page <= $pagesToFetch; $page++) {
        $pageUrls[$page] = $this->buildPageUrl($baseUrl, $page, $firstHtml);
    }

    // preserve_keys = true keeps the page numbers as keys inside every chunk.
    foreach (array_chunk($pageUrls, $poolSize, true) as $chunk) {
        $htmlByPage = $this->fetchHtmlPool($chunk);
        // Process pages in ascending page order regardless of how the pool returned them.
        ksort($htmlByPage);

        foreach ($htmlByPage as $html) {
            foreach ($this->extractFromHtml($html, $keywords, $requestUrl) as $item) {
                if (isset($seen[$item->externalId])) {
                    continue;
                }

                $seen[$item->externalId] = true;
                $merged[] = $item;

                if (count($merged) >= $maxResults) {
                    return $merged;
                }
            }
        }
    }

    return $merged;
}
/**
 * Download several list pages concurrently through Http::pool().
 *
 * Every request is registered under its page number and uses the
 * `crawl.faculty.list_http_timeout_seconds` timeout (never below 5 seconds) with a connect
 * timeout of at most 8 seconds. Pages whose body resolves to null or an empty string
 * (as reported by responseBodyFromPoolResult()) are omitted from the result.
 *
 * NOTE(review): Laravel documents the first argument of retry() as the maximum number of
 * attempts, so retry(1, ...) would mean a single attempt — confirm that is intended.
 *
 * @param array<int, string> $pageUrls page number => URL
 * @return array<int, string> page number => HTML body
 */
protected function fetchHtmlPool(array $pageUrls): array
{
    if ($pageUrls === []) {
        return [];
    }

    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
    $connectTimeout = min(8, $timeout);
    $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];
    $retryOnConnectionFailure = fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException;

    $responses = Http::pool(
        function (\Illuminate\Http\Client\Pool $pool) use ($pageUrls, $headers, $timeout, $connectTimeout, $retryOnConnectionFailure) {
            foreach ($pageUrls as $pageNumber => $pageUrl) {
                $pool->as((string) $pageNumber)
                    ->timeout($timeout)
                    ->connectTimeout($connectTimeout)
                    ->retry(1, 300, $retryOnConnectionFailure, throw: false)
                    ->withHeaders($headers)
                    ->get($pageUrl);
            }
        }
    );

    $htmlByPage = [];
    foreach (array_keys($pageUrls) as $pageNumber) {
        $body = $this->responseBodyFromPoolResult($responses[(string) $pageNumber] ?? null);
        if ($body === null || $body === '') {
            continue;
        }
        $htmlByPage[$pageNumber] = $body;
    }

    return $htmlByPage;
}
/**
* @param list<CrawlItemDto> $items
* @param array<string, mixed> $params
* @return list<CrawlItemDto>
*/
protected function enrichEmailsFromProfilePages(array $items): array
protected function enrichEmailsFromProfilePages(array $items, array $params = []): array
{
if (! config('crawl.faculty.profile_email_enrich_enabled', true)) {
return $items;
}
$poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 6)));
$timeout = max(8, (int) config('crawl.faculty.profile_http_timeout_seconds', 20));
$headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];
$maxEnrich = $this->resolveProfileEnrichMax($params, count($items));
if ($maxEnrich <= 0) {
return $this->markProfileEnrichSkipped($items);
}
$enriched = [];
foreach (array_chunk($items, $poolSize) as $chunk) {
$pending = [];
foreach ($chunk as $item) {
if ($this->itemHasEmail($item) || ! $item->canonicalUrl) {
$enriched[] = $item;
$poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 8)));
$timeout = max(5, (int) config('crawl.faculty.profile_http_timeout_seconds', 10));
$headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];
continue;
}
$pending[$item->externalId] = $item;
$fetchMap = [];
$enrichBudget = $maxEnrich;
foreach ($items as $index => $item) {
if ($enrichBudget <= 0) {
break;
}
if ($pending === []) {
if ($this->itemHasEmail($item) || ! $item->canonicalUrl) {
continue;
}
$fetchMap[$index] = $item;
$enrichBudget--;
}
$responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($pending, $headers, $timeout) {
foreach ($pending as $externalId => $item) {
$pool->as($externalId)
if ($fetchMap === []) {
return $items;
}
$fetchedBodies = [];
foreach (array_chunk($fetchMap, $poolSize, true) as $chunk) {
$batchPending = [];
foreach ($chunk as $index => $item) {
$batchPending[$index] = $item;
}
$responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($batchPending, $headers, $timeout) {
foreach ($batchPending as $index => $item) {
$pool->as((string) $index)
->timeout($timeout)
->connectTimeout(min(10, $timeout))
->retry(1, 500, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
->connectTimeout(min(8, $timeout))
->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
->withHeaders($headers)
->get($item->canonicalUrl);
}
});
foreach ($pending as $externalId => $item) {
$body = $this->responseBodyFromPoolResult($responses[$externalId] ?? null);
foreach ($batchPending as $index => $item) {
$body = $this->responseBodyFromPoolResult($responses[(string) $index] ?? null);
if ($body !== null) {
$email = $this->extractEmailFromProfileHtml($body);
if ($email) {
@ -100,11 +208,69 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
}
$item = $this->applyProfileMetadataToItem($item, $body);
}
$enriched[] = $item;
$fetchedBodies[$index] = $item;
}
}
return $enriched;
$result = [];
foreach ($items as $index => $item) {
if (isset($fetchedBodies[$index])) {
$result[] = $fetchedBodies[$index];
} elseif (! $this->itemHasEmail($item) && $item->canonicalUrl) {
$result[] = $this->markItemProfileEnrichSkipped($item);
} else {
$result[] = $item;
}
}
return $result;
}
/**
 * Decide how many profile pages may be fetched for email enrichment in one run.
 *
 * Returns 0 when `$params['skip_profile_enrich']` is strictly `true`. Otherwise the limit is
 * `$params['profile_enrich_max']`, falling back to the `crawl.faculty.profile_enrich_max`
 * config value (default 32), cast to int and clamped to the range 0..min(200, $itemCount).
 *
 * @param array<string, mixed> $params
 */
protected function resolveProfileEnrichMax(array $params, int $itemCount): int
{
    $skipRequested = $params['skip_profile_enrich'] ?? false;
    if ($skipRequested === true) {
        return 0;
    }

    $requestedMax = (int) ($params['profile_enrich_max'] ?? config('crawl.faculty.profile_enrich_max', 32));
    $upperBound = min($itemCount, 200, $requestedMax);

    return max(0, $upperBound);
}
/**
 * Pass every item through markItemProfileEnrichSkipped() and return the results
 * under the same keys and in the same order.
 *
 * @param list<CrawlItemDto> $items
 * @return list<CrawlItemDto>
 */
protected function markProfileEnrichSkipped(array $items): array
{
    $marked = [];
    foreach ($items as $position => $item) {
        $marked[$position] = $this->markItemProfileEnrichSkipped($item);
    }

    return $marked;
}
/**
 * Return a copy of $item whose `extra` carries `profile_enrich_skipped => true`.
 *
 * The item is returned untouched when it already has an email or has no canonicalUrl:
 * in both cases there is no profile page that could have been visited, which is the same
 * eligibility test enrichEmailsFromProfilePages() applies before fetching a profile.
 * All other DTO fields are copied over unchanged.
 */
protected function markItemProfileEnrichSkipped(CrawlItemDto $item): CrawlItemDto
{
    if ($this->itemHasEmail($item) || ! $item->canonicalUrl) {
        return $item;
    }

    $extra = $item->extra;
    $extra['profile_enrich_skipped'] = true;

    return new CrawlItemDto(
        externalId: $item->externalId,
        title: $item->title,
        canonicalUrl: $item->canonicalUrl,
        authors: $item->authors,
        summary: $item->summary,
        publishedAt: $item->publishedAt,
        schoolName: $item->schoolName,
        section: $item->section,
        contentHtml: $item->contentHtml,
        extra: $extra,
        authorsParsed: $item->authorsParsed,
    );
}
protected function responseBodyFromPoolResult(mixed $result): ?string
@ -226,7 +392,11 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
protected function fetchHtml(string $url): string
{
$response = Http::timeout(30)
$timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
$response = Http::timeout($timeout)
->connectTimeout(min(8, $timeout))
->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
->withHeaders(['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'])
->get($url);

@ -31,8 +31,13 @@ return [
'faculty' => [
/** 列表项无邮箱时,是否请求教师主页补全邮箱 */
'profile_email_enrich_enabled' => (bool) env('FACULTY_PROFILE_EMAIL_ENRICH', true),
'profile_http_timeout_seconds' => (int) env('FACULTY_PROFILE_HTTP_TIMEOUT', 20),
/** 单次任务最多补全主页数(其余仍入库,仅无邮箱) */
'profile_enrich_max' => (int) env('FACULTY_PROFILE_ENRICH_MAX', 32),
'profile_http_timeout_seconds' => (int) env('FACULTY_PROFILE_HTTP_TIMEOUT', 10),
/** 并发请求教师主页数 */
'profile_enrich_pool_size' => (int) env('FACULTY_PROFILE_ENRICH_POOL', 6),
'profile_enrich_pool_size' => (int) env('FACULTY_PROFILE_ENRICH_POOL', 8),
'list_http_timeout_seconds' => (int) env('FACULTY_LIST_HTTP_TIMEOUT', 20),
/** 师资列表分页并发抓取数 */
'list_fetch_pool_size' => (int) env('FACULTY_LIST_FETCH_POOL', 5),
],
];

@ -210,6 +210,17 @@ HTML;
$this->assertSame('塑性成形技术与装备研究院', $item->extra['college_name']);
}
/**
 * resolveProfileEnrichMax() honours the config default, the per-call override and the
 * skip flag, and clamps the result to 0..min(200, item count).
 */
public function test_resolve_profile_enrich_max_caps_large_batches(): void
{
    // Pin the default so the first assertion does not depend on FACULTY_PROFILE_ENRICH_MAX
    // being unset in the environment running the suite.
    config(['crawl.faculty.profile_enrich_max' => 32]);

    $adapter = new FacultyListHtmlAdapter;
    $method = new \ReflectionMethod($adapter, 'resolveProfileEnrichMax');
    $method->setAccessible(true);

    $this->assertSame(32, $method->invoke($adapter, [], 500));
    $this->assertSame(10, $method->invoke($adapter, ['profile_enrich_max' => 10], 500));
    $this->assertSame(0, $method->invoke($adapter, ['skip_profile_enrich' => true], 500));

    // Hard ceiling of 200, ceiling of the item count, and negative values clamped to 0.
    $this->assertSame(200, $method->invoke($adapter, ['profile_enrich_max' => 999], 500));
    $this->assertSame(5, $method->invoke($adapter, ['profile_enrich_max' => 10], 5));
    $this->assertSame(0, $method->invoke($adapter, ['profile_enrich_max' => -1], 500));
}
public function test_response_body_from_pool_result_ignores_connection_exception(): void
{
$adapter = new FacultyListHtmlAdapter;

Loading…
Cancel
Save