master
lion 3 days ago
parent 099c609328
commit f9256f17bb

@ -137,6 +137,8 @@ class CrawlJobController extends Controller
'status' => 'pending', 'status' => 'pending',
]); ]);
@set_time_limit(300);
try { try {
$job = $runner->run($job, $source, $params); $job = $runner->run($job, $source, $params);
$this->applyCrawlDefaultsToPreviewItems( $this->applyCrawlDefaultsToPreviewItems(
@ -412,12 +414,18 @@ class CrawlJobController extends Controller
} }
if ($job->target_type === 'teacher') { if ($job->target_type === 'teacher') {
return sprintf( $summary = sprintf(
'已从 %s 抓取 %d 位老师,已入库 %d 位老师', '已从 %s 抓取 %d 位老师,已入库 %d 位老师',
$sourceName, $sourceName,
$fetched, $fetched,
(int) ($importResult['teachers_imported'] ?? 0), (int) ($importResult['teachers_imported'] ?? 0),
); );
$skippedProfiles = $this->countProfileEnrichSkipped($job);
if ($skippedProfiles > 0) {
$summary .= sprintf('%d 位未访问主页补邮箱,避免超时)', $skippedProfiles);
}
return $summary;
} }
return sprintf( return sprintf(
@ -428,6 +436,20 @@ class CrawlJobController extends Controller
); );
} }
/**
 * Count the teacher items of a crawl job that are flagged as having had
 * their profile-page enrichment skipped.
 *
 * Only the `payload` column is loaded for the job's teacher items; an item
 * counts when payload['extra']['profile_enrich_skipped'] is strictly true.
 *
 * @param CrawlJob $job job whose items are inspected
 * @return int number of flagged teacher items
 */
protected function countProfileEnrichSkipped(CrawlJob $job): int
{
    $teacherItems = CrawlJobItem::query()
        ->where('crawl_job_id', $job->id)
        ->where('target_type', 'teacher')
        ->get(['payload']);

    // The flag is evaluated in PHP on the loaded payloads, not in SQL.
    $flagged = $teacherItems->filter(
        fn (CrawlJobItem $item) => (($item->payload['extra'] ?? [])['profile_enrich_skipped'] ?? false) === true
    );

    return (int) $flagged->count();
}
/** /**
* @param array{ * @param array{
* imported:int, * imported:int,

@ -30,10 +30,14 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
$merged = []; $merged = [];
$seen = []; $seen = [];
for ($page = 1; $page <= $pagesToFetch; $page++) { for ($page = 1; $page <= $pagesToFetch && count($merged) < $maxResults; $page++) {
$html = $page === 1 $html = $page === 1
? $firstHtml ? $firstHtml
: $this->fetchHtml($this->buildPageUrl($baseUrl, $page, $firstHtml)); : null;
if ($html === null) {
break;
}
foreach ($this->extractFromHtml($html, $keywords, $requestUrl) as $item) { foreach ($this->extractFromHtml($html, $keywords, $requestUrl) as $item) {
if (isset($seen[$item->externalId])) { if (isset($seen[$item->externalId])) {
@ -47,52 +51,156 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
} }
} }
return $this->enrichEmailsFromProfilePages($merged); if ($pagesToFetch > 1 && count($merged) < $maxResults) {
$merged = $this->fetchRemainingListPages(
$baseUrl,
$firstHtml,
$pagesToFetch,
$keywords,
$requestUrl,
$merged,
$seen,
$maxResults,
);
}
return $this->enrichEmailsFromProfilePages($merged, $params);
}
/**
 * Fetch list pages 2..$pagesToFetch in concurrent batches and append their
 * not-yet-seen items to $merged until $maxResults items have been collected.
 *
 * Pages are requested in chunks of `crawl.faculty.list_fetch_pool_size`
 * (clamped to 1..10, default 5). Within a chunk, bodies are processed in
 * ascending page order so the merged list keeps list-page ordering. Items are
 * de-duplicated by externalId against $seen.
 *
 * @param list<CrawlItemDto> $merged items collected so far
 * @param array<string, true> $seen externalIds already present in $merged
 * @param list<string> $keywords
 * @return list<CrawlItemDto> $merged extended with newly found items
 */
protected function fetchRemainingListPages(
    string $baseUrl,
    string $firstHtml,
    int $pagesToFetch,
    array $keywords,
    string $requestUrl,
    array $merged,
    array $seen,
    int $maxResults,
): array {
    // Guard: without this, a caller passing an already-full list would still
    // trigger HTTP requests and append one extra item before the limit check
    // inside the loop fires.
    if ($pagesToFetch < 2 || count($merged) >= $maxResults) {
        return $merged;
    }

    $poolSize = max(1, min(10, (int) config('crawl.faculty.list_fetch_pool_size', 5)));

    $pageUrls = [];
    for ($page = 2; $page <= $pagesToFetch; $page++) {
        $pageUrls[$page] = $this->buildPageUrl($baseUrl, $page, $firstHtml);
    }

    // preserve_keys = true keeps the page numbers as keys inside each chunk.
    foreach (array_chunk($pageUrls, $poolSize, true) as $chunk) {
        $htmlByPage = $this->fetchHtmlPool($chunk);
        ksort($htmlByPage);

        foreach ($htmlByPage as $html) {
            foreach ($this->extractFromHtml($html, $keywords, $requestUrl) as $item) {
                if (isset($seen[$item->externalId])) {
                    continue;
                }

                $seen[$item->externalId] = true;
                $merged[] = $item;

                // Stop as soon as the cap is hit so later chunks are never fetched.
                if (count($merged) >= $maxResults) {
                    return $merged;
                }
            }
        }
    }

    return $merged;
}
/**
 * Request several list pages concurrently and return their bodies keyed by
 * the same keys as the input.
 *
 * Entries for which responseBodyFromPoolResult() yields null or an empty
 * string are left out of the result.
 *
 * @param array<int, string> $pageUrls
 * @return array<int, string>
 */
protected function fetchHtmlPool(array $pageUrls): array
{
    if ($pageUrls === []) {
        return [];
    }

    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
    $connectTimeout = min(8, $timeout);
    $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];

    // NOTE(review): Laravel's retry() takes the maximum number of attempts, so
    // retry(1, ...) presumably performs no actual retry — confirm the intent.
    $retryOnConnectionError = fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException;

    $responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($pageUrls, $headers, $timeout, $connectTimeout, $retryOnConnectionError) {
        foreach ($pageUrls as $page => $url) {
            // Name each request after its page key so the result can be looked up below.
            $pool->as((string) $page)
                ->timeout($timeout)
                ->connectTimeout($connectTimeout)
                ->retry(1, 300, $retryOnConnectionError, throw: false)
                ->withHeaders($headers)
                ->get($url);
        }
    });

    $bodies = [];
    foreach (array_keys($pageUrls) as $page) {
        $body = $this->responseBodyFromPoolResult($responses[(string) $page] ?? null);
        if ($body === null || $body === '') {
            continue;
        }
        $bodies[$page] = $body;
    }

    return $bodies;
}
/** /**
* @param list<CrawlItemDto> $items * @param list<CrawlItemDto> $items
* @param array<string, mixed> $params
* @return list<CrawlItemDto> * @return list<CrawlItemDto>
*/ */
protected function enrichEmailsFromProfilePages(array $items): array protected function enrichEmailsFromProfilePages(array $items, array $params = []): array
{ {
if (! config('crawl.faculty.profile_email_enrich_enabled', true)) { if (! config('crawl.faculty.profile_email_enrich_enabled', true)) {
return $items; return $items;
} }
$poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 6))); $maxEnrich = $this->resolveProfileEnrichMax($params, count($items));
$timeout = max(8, (int) config('crawl.faculty.profile_http_timeout_seconds', 20)); if ($maxEnrich <= 0) {
$headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html']; return $this->markProfileEnrichSkipped($items);
}
$enriched = []; $poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 8)));
foreach (array_chunk($items, $poolSize) as $chunk) { $timeout = max(5, (int) config('crawl.faculty.profile_http_timeout_seconds', 10));
$pending = []; $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];
foreach ($chunk as $item) {
if ($this->itemHasEmail($item) || ! $item->canonicalUrl) {
$enriched[] = $item;
continue; $fetchMap = [];
} $enrichBudget = $maxEnrich;
$pending[$item->externalId] = $item; foreach ($items as $index => $item) {
if ($enrichBudget <= 0) {
break;
} }
if ($this->itemHasEmail($item) || ! $item->canonicalUrl) {
if ($pending === []) {
continue; continue;
} }
$fetchMap[$index] = $item;
$enrichBudget--;
}
$responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($pending, $headers, $timeout) { if ($fetchMap === []) {
foreach ($pending as $externalId => $item) { return $items;
$pool->as($externalId) }
$fetchedBodies = [];
foreach (array_chunk($fetchMap, $poolSize, true) as $chunk) {
$batchPending = [];
foreach ($chunk as $index => $item) {
$batchPending[$index] = $item;
}
$responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($batchPending, $headers, $timeout) {
foreach ($batchPending as $index => $item) {
$pool->as((string) $index)
->timeout($timeout) ->timeout($timeout)
->connectTimeout(min(10, $timeout)) ->connectTimeout(min(8, $timeout))
->retry(1, 500, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false) ->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
->withHeaders($headers) ->withHeaders($headers)
->get($item->canonicalUrl); ->get($item->canonicalUrl);
} }
}); });
foreach ($pending as $externalId => $item) { foreach ($batchPending as $index => $item) {
$body = $this->responseBodyFromPoolResult($responses[$externalId] ?? null); $body = $this->responseBodyFromPoolResult($responses[(string) $index] ?? null);
if ($body !== null) { if ($body !== null) {
$email = $this->extractEmailFromProfileHtml($body); $email = $this->extractEmailFromProfileHtml($body);
if ($email) { if ($email) {
@ -100,11 +208,69 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
} }
$item = $this->applyProfileMetadataToItem($item, $body); $item = $this->applyProfileMetadataToItem($item, $body);
} }
$enriched[] = $item; $fetchedBodies[$index] = $item;
} }
} }
return $enriched; $result = [];
foreach ($items as $index => $item) {
if (isset($fetchedBodies[$index])) {
$result[] = $fetchedBodies[$index];
} elseif (! $this->itemHasEmail($item) && $item->canonicalUrl) {
$result[] = $this->markItemProfileEnrichSkipped($item);
} else {
$result[] = $item;
}
}
return $result;
}
/**
 * Resolve how many profile pages may be fetched for email enrichment.
 *
 * Returns 0 when the caller asked to skip enrichment. Otherwise the limit is
 * taken from $params['profile_enrich_max'] (falling back to
 * `crawl.faculty.profile_enrich_max`, default 32) and clamped to the range
 * 0..min($itemCount, 200).
 *
 * @param array<string, mixed> $params crawl parameters; reads
 *        `skip_profile_enrich` and `profile_enrich_max`
 * @param int $itemCount number of items available for enrichment
 * @return int maximum number of profile pages to fetch
 */
protected function resolveProfileEnrichMax(array $params, int $itemCount): int
{
    // Accept request-style truthy values ("1", 1, "true", "on", "yes") as well
    // as a real boolean; a strict === true check silently ignored those.
    if (filter_var($params['skip_profile_enrich'] ?? false, FILTER_VALIDATE_BOOLEAN)) {
        return 0;
    }

    // A non-numeric override (e.g. an empty form field) would cast to 0 and
    // silently disable enrichment, so fall back to the configured default.
    $override = $params['profile_enrich_max'] ?? null;
    $configured = is_numeric($override)
        ? (int) $override
        : (int) config('crawl.faculty.profile_enrich_max', 32);

    return max(0, min($itemCount, min(200, $configured)));
}
/**
 * Apply markItemProfileEnrichSkipped() to every item, keeping the input keys.
 *
 * @param list<CrawlItemDto> $items
 * @return list<CrawlItemDto>
 */
protected function markProfileEnrichSkipped(array $items): array
{
    $marked = [];
    foreach ($items as $key => $item) {
        $marked[$key] = $this->markItemProfileEnrichSkipped($item);
    }

    return $marked;
}
/**
 * Return a copy of the item with extra['profile_enrich_skipped'] set to true.
 *
 * Items for which itemHasEmail() is true are returned as-is. The
 * DTO is rebuilt rather than mutated; all other fields are copied over.
 */
protected function markItemProfileEnrichSkipped(CrawlItemDto $item): CrawlItemDto
{
    if (! $this->itemHasEmail($item)) {
        $flaggedExtra = $item->extra;
        $flaggedExtra['profile_enrich_skipped'] = true;

        return new CrawlItemDto(
            externalId: $item->externalId,
            title: $item->title,
            canonicalUrl: $item->canonicalUrl,
            authors: $item->authors,
            summary: $item->summary,
            publishedAt: $item->publishedAt,
            schoolName: $item->schoolName,
            section: $item->section,
            contentHtml: $item->contentHtml,
            extra: $flaggedExtra,
            authorsParsed: $item->authorsParsed,
        );
    }

    return $item;
}
protected function responseBodyFromPoolResult(mixed $result): ?string protected function responseBodyFromPoolResult(mixed $result): ?string
@ -226,7 +392,11 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
protected function fetchHtml(string $url): string protected function fetchHtml(string $url): string
{ {
$response = Http::timeout(30) $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
$response = Http::timeout($timeout)
->connectTimeout(min(8, $timeout))
->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
->withHeaders(['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html']) ->withHeaders(['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'])
->get($url); ->get($url);

@ -31,8 +31,13 @@ return [
'faculty' => [ 'faculty' => [
/** 列表项无邮箱时,是否请求教师主页补全邮箱 */ /** 列表项无邮箱时,是否请求教师主页补全邮箱 */
'profile_email_enrich_enabled' => (bool) env('FACULTY_PROFILE_EMAIL_ENRICH', true), 'profile_email_enrich_enabled' => (bool) env('FACULTY_PROFILE_EMAIL_ENRICH', true),
'profile_http_timeout_seconds' => (int) env('FACULTY_PROFILE_HTTP_TIMEOUT', 20), /** 单次任务最多补全主页数(其余仍入库,仅无邮箱) */
'profile_enrich_max' => (int) env('FACULTY_PROFILE_ENRICH_MAX', 32),
'profile_http_timeout_seconds' => (int) env('FACULTY_PROFILE_HTTP_TIMEOUT', 10),
/** 并发请求教师主页数 */ /** 并发请求教师主页数 */
'profile_enrich_pool_size' => (int) env('FACULTY_PROFILE_ENRICH_POOL', 6), 'profile_enrich_pool_size' => (int) env('FACULTY_PROFILE_ENRICH_POOL', 8),
'list_http_timeout_seconds' => (int) env('FACULTY_LIST_HTTP_TIMEOUT', 20),
/** 师资列表分页并发抓取数 */
'list_fetch_pool_size' => (int) env('FACULTY_LIST_FETCH_POOL', 5),
], ],
]; ];

@ -210,6 +210,17 @@ HTML;
$this->assertSame('塑性成形技术与装备研究院', $item->extra['college_name']); $this->assertSame('塑性成形技术与装备研究院', $item->extra['college_name']);
} }
/**
 * resolveProfileEnrichMax() caps a 500-item batch at the default of 32,
 * honours an explicit `profile_enrich_max`, and returns 0 when
 * `skip_profile_enrich` is true.
 */
public function test_resolve_profile_enrich_max_caps_large_batches(): void
{
    $adapter = new FacultyListHtmlAdapter;

    // The method is protected, so it is invoked through reflection.
    $resolver = new \ReflectionMethod($adapter, 'resolveProfileEnrichMax');
    $resolver->setAccessible(true);

    $resolveFor500 = fn (array $params) => $resolver->invoke($adapter, $params, 500);

    $this->assertSame(32, $resolveFor500([]));
    $this->assertSame(10, $resolveFor500(['profile_enrich_max' => 10]));
    $this->assertSame(0, $resolveFor500(['skip_profile_enrich' => true]));
}
public function test_response_body_from_pool_result_ignores_connection_exception(): void public function test_response_body_from_pool_result_ignores_connection_exception(): void
{ {
$adapter = new FacultyListHtmlAdapter; $adapter = new FacultyListHtmlAdapter;

Loading…
Cancel
Save