|
|
|
|
@ -30,10 +30,14 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
$merged = [];
|
|
|
|
|
$seen = [];
|
|
|
|
|
|
|
|
|
|
for ($page = 1; $page <= $pagesToFetch; $page++) {
|
|
|
|
|
for ($page = 1; $page <= $pagesToFetch && count($merged) < $maxResults; $page++) {
|
|
|
|
|
$html = $page === 1
|
|
|
|
|
? $firstHtml
|
|
|
|
|
: $this->fetchHtml($this->buildPageUrl($baseUrl, $page, $firstHtml));
|
|
|
|
|
: null;
|
|
|
|
|
|
|
|
|
|
if ($html === null) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
foreach ($this->extractFromHtml($html, $keywords, $requestUrl) as $item) {
|
|
|
|
|
if (isset($seen[$item->externalId])) {
|
|
|
|
|
@ -47,52 +51,156 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $this->enrichEmailsFromProfilePages($merged);
|
|
|
|
|
if ($pagesToFetch > 1 && count($merged) < $maxResults) {
|
|
|
|
|
$merged = $this->fetchRemainingListPages(
|
|
|
|
|
$baseUrl,
|
|
|
|
|
$firstHtml,
|
|
|
|
|
$pagesToFetch,
|
|
|
|
|
$keywords,
|
|
|
|
|
$requestUrl,
|
|
|
|
|
$merged,
|
|
|
|
|
$seen,
|
|
|
|
|
$maxResults,
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $this->enrichEmailsFromProfilePages($merged, $params);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Fetch list pages 2..$pagesToFetch in concurrent batches and append their new items to $merged.
 *
 * Page URLs are built with buildPageUrl() and requested through fetchHtmlPool() in chunks of
 * `crawl.faculty.list_fetch_pool_size` (clamped to 1..10, default 5). Within a chunk the returned
 * bodies are processed in ascending page-number order. Pages for which fetchHtmlPool() returns no
 * body are simply absent and therefore skipped. Items whose externalId is already present in
 * $seen are ignored.
 *
 * $merged and $seen are received by value: the caller gets the updated list back through the
 * return value, while its own $seen map is left untouched.
 *
 * @param string               $baseUrl      Forwarded to buildPageUrl() for every page.
 * @param string               $firstHtml    HTML of page 1, forwarded to buildPageUrl().
 * @param int                  $pagesToFetch Highest page number to request (pages start at 2 here).
 * @param list<string>         $keywords     Forwarded to extractFromHtml().
 * @param string               $requestUrl   Forwarded to extractFromHtml().
 * @param list<CrawlItemDto>   $merged       Items collected so far.
 * @param array<string, true>  $seen         externalIds already collected.
 * @param int                  $maxResults   Upper bound on the size of the returned list.
 * @return list<CrawlItemDto> $merged extended with unseen items, never grown beyond $maxResults.
 */
protected function fetchRemainingListPages(
    string $baseUrl,
    string $firstHtml,
    int $pagesToFetch,
    array $keywords,
    string $requestUrl,
    array $merged,
    array $seen,
    int $maxResults,
): array {
    // Bail out before any HTTP work when there is nothing left to do. Without this guard a
    // caller passing an already-full $merged would trigger requests and receive one item more
    // than $maxResults, because the cap was only checked after appending.
    if ($pagesToFetch < 2 || count($merged) >= $maxResults) {
        return $merged;
    }

    $poolSize = max(1, min(10, (int) config('crawl.faculty.list_fetch_pool_size', 5)));

    $pageUrls = [];
    for ($page = 2; $page <= $pagesToFetch; $page++) {
        $pageUrls[$page] = $this->buildPageUrl($baseUrl, $page, $firstHtml);
    }

    // preserve_keys = true keeps the page numbers as keys so results can be re-ordered below.
    foreach (array_chunk($pageUrls, $poolSize, true) as $chunk) {
        $htmlByPage = $this->fetchHtmlPool($chunk);
        ksort($htmlByPage);

        foreach ($htmlByPage as $html) {
            foreach ($this->extractFromHtml($html, $keywords, $requestUrl) as $item) {
                if (isset($seen[$item->externalId])) {
                    continue;
                }

                $seen[$item->externalId] = true;
                $merged[] = $item;

                // Stop immediately once the cap is hit; later chunks are never requested.
                if (count($merged) >= $maxResults) {
                    return $merged;
                }
            }
        }
    }

    return $merged;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Request several list pages concurrently and return the bodies that came back usable.
 *
 * Every request uses the `crawl.faculty.list_http_timeout_seconds` timeout (minimum 5, default 20),
 * a connect timeout of at most 8 seconds, and the crawler's User-Agent / Accept headers.
 * A page is left out of the result when responseBodyFromPoolResult() yields null or an empty string.
 *
 * @param array<int, string> $pageUrls Page number => URL to request.
 * @return array<int, string> Page number => response body, in the iteration order of $pageUrls.
 */
protected function fetchHtmlPool(array $pageUrls): array
{
    if ($pageUrls === []) {
        return [];
    }

    $timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
    $connectTimeout = min(8, $timeout);
    $headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];
    $isConnectionFailure = static fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException;

    $responses = Http::pool(
        function (\Illuminate\Http\Client\Pool $pool) use ($pageUrls, $headers, $timeout, $connectTimeout, $isConnectionFailure) {
            foreach ($pageUrls as $pageNumber => $pageUrl) {
                // Each request is registered under its page number so it can be looked up afterwards.
                // NOTE(review): the first retry() argument appears to be the total attempt count in
                // Laravel's HTTP client, so 1 may mean "no second attempt" - confirm this is intended.
                $pool->as((string) $pageNumber)
                    ->timeout($timeout)
                    ->connectTimeout($connectTimeout)
                    ->retry(1, 300, $isConnectionFailure, throw: false)
                    ->withHeaders($headers)
                    ->get($pageUrl);
            }
        }
    );

    $bodies = [];
    foreach (array_keys($pageUrls) as $pageNumber) {
        $body = $this->responseBodyFromPoolResult($responses[(string) $pageNumber] ?? null);

        // Drop failed or blank pages instead of handing empty HTML to the extractor.
        if ($body === null || $body === '') {
            continue;
        }

        $bodies[$pageNumber] = $body;
    }

    return $bodies;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param list<CrawlItemDto> $items
|
|
|
|
|
* @param array<string, mixed> $params
|
|
|
|
|
* @return list<CrawlItemDto>
|
|
|
|
|
*/
|
|
|
|
|
protected function enrichEmailsFromProfilePages(array $items): array
|
|
|
|
|
protected function enrichEmailsFromProfilePages(array $items, array $params = []): array
|
|
|
|
|
{
|
|
|
|
|
if (! config('crawl.faculty.profile_email_enrich_enabled', true)) {
|
|
|
|
|
return $items;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 6)));
|
|
|
|
|
$timeout = max(8, (int) config('crawl.faculty.profile_http_timeout_seconds', 20));
|
|
|
|
|
$headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];
|
|
|
|
|
$maxEnrich = $this->resolveProfileEnrichMax($params, count($items));
|
|
|
|
|
if ($maxEnrich <= 0) {
|
|
|
|
|
return $this->markProfileEnrichSkipped($items);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$enriched = [];
|
|
|
|
|
foreach (array_chunk($items, $poolSize) as $chunk) {
|
|
|
|
|
$pending = [];
|
|
|
|
|
foreach ($chunk as $item) {
|
|
|
|
|
if ($this->itemHasEmail($item) || ! $item->canonicalUrl) {
|
|
|
|
|
$enriched[] = $item;
|
|
|
|
|
$poolSize = max(1, min(12, (int) config('crawl.faculty.profile_enrich_pool_size', 8)));
|
|
|
|
|
$timeout = max(5, (int) config('crawl.faculty.profile_http_timeout_seconds', 10));
|
|
|
|
|
$headers = ['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'];
|
|
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
$pending[$item->externalId] = $item;
|
|
|
|
|
$fetchMap = [];
|
|
|
|
|
$enrichBudget = $maxEnrich;
|
|
|
|
|
foreach ($items as $index => $item) {
|
|
|
|
|
if ($enrichBudget <= 0) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ($pending === []) {
|
|
|
|
|
if ($this->itemHasEmail($item) || ! $item->canonicalUrl) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
$fetchMap[$index] = $item;
|
|
|
|
|
$enrichBudget--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($pending, $headers, $timeout) {
|
|
|
|
|
foreach ($pending as $externalId => $item) {
|
|
|
|
|
$pool->as($externalId)
|
|
|
|
|
if ($fetchMap === []) {
|
|
|
|
|
return $items;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$fetchedBodies = [];
|
|
|
|
|
foreach (array_chunk($fetchMap, $poolSize, true) as $chunk) {
|
|
|
|
|
$batchPending = [];
|
|
|
|
|
foreach ($chunk as $index => $item) {
|
|
|
|
|
$batchPending[$index] = $item;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$responses = Http::pool(function (\Illuminate\Http\Client\Pool $pool) use ($batchPending, $headers, $timeout) {
|
|
|
|
|
foreach ($batchPending as $index => $item) {
|
|
|
|
|
$pool->as((string) $index)
|
|
|
|
|
->timeout($timeout)
|
|
|
|
|
->connectTimeout(min(10, $timeout))
|
|
|
|
|
->retry(1, 500, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
|
|
|
|
|
->connectTimeout(min(8, $timeout))
|
|
|
|
|
->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
|
|
|
|
|
->withHeaders($headers)
|
|
|
|
|
->get($item->canonicalUrl);
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
foreach ($pending as $externalId => $item) {
|
|
|
|
|
$body = $this->responseBodyFromPoolResult($responses[$externalId] ?? null);
|
|
|
|
|
foreach ($batchPending as $index => $item) {
|
|
|
|
|
$body = $this->responseBodyFromPoolResult($responses[(string) $index] ?? null);
|
|
|
|
|
if ($body !== null) {
|
|
|
|
|
$email = $this->extractEmailFromProfileHtml($body);
|
|
|
|
|
if ($email) {
|
|
|
|
|
@ -100,11 +208,69 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
}
|
|
|
|
|
$item = $this->applyProfileMetadataToItem($item, $body);
|
|
|
|
|
}
|
|
|
|
|
$enriched[] = $item;
|
|
|
|
|
$fetchedBodies[$index] = $item;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $enriched;
|
|
|
|
|
$result = [];
|
|
|
|
|
foreach ($items as $index => $item) {
|
|
|
|
|
if (isset($fetchedBodies[$index])) {
|
|
|
|
|
$result[] = $fetchedBodies[$index];
|
|
|
|
|
} elseif (! $this->itemHasEmail($item) && $item->canonicalUrl) {
|
|
|
|
|
$result[] = $this->markItemProfileEnrichSkipped($item);
|
|
|
|
|
} else {
|
|
|
|
|
$result[] = $item;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $result;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Work out how many items may have their profile page fetched for e-mail enrichment.
 *
 * Returns 0 when `$params['skip_profile_enrich']` is exactly boolean true. Otherwise the limit is
 * taken from `$params['profile_enrich_max']`, falling back to the `crawl.faculty.profile_enrich_max`
 * config value (default 32), and is then clamped to the range 0..min($itemCount, 200).
 *
 * @param array<string, mixed> $params    Crawl parameters; only the two keys above are read.
 * @param int                  $itemCount Number of items available for enrichment.
 * @return int Non-negative enrichment budget.
 */
protected function resolveProfileEnrichMax(array $params, int $itemCount): int
{
    $skipRequested = ($params['skip_profile_enrich'] ?? false) === true;
    if ($skipRequested) {
        return 0;
    }

    $requested = (int) ($params['profile_enrich_max'] ?? config('crawl.faculty.profile_enrich_max', 32));

    // Never exceed the number of items or the hard ceiling of 200.
    $ceiling = min($itemCount, 200, $requested);

    return max(0, $ceiling);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Apply markItemProfileEnrichSkipped() to every item, keeping the original keys and order.
 *
 * @param list<CrawlItemDto> $items
 * @return list<CrawlItemDto>
 */
protected function markProfileEnrichSkipped(array $items): array
{
    $marked = [];

    foreach ($items as $position => $item) {
        $marked[$position] = $this->markItemProfileEnrichSkipped($item);
    }

    return $marked;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Flag an item as not having gone through profile-page enrichment.
 *
 * Items for which itemHasEmail() is true are returned as-is. Any other item is copied into a new
 * CrawlItemDto whose `extra` array additionally carries `profile_enrich_skipped => true`; all
 * other fields are passed through unchanged.
 */
protected function markItemProfileEnrichSkipped(CrawlItemDto $item): CrawlItemDto
{
    if (! $this->itemHasEmail($item)) {
        // Work on a copy of the extra data so the source DTO is not modified.
        $extraWithFlag = $item->extra;
        $extraWithFlag['profile_enrich_skipped'] = true;

        return new CrawlItemDto(
            externalId: $item->externalId,
            title: $item->title,
            canonicalUrl: $item->canonicalUrl,
            authors: $item->authors,
            summary: $item->summary,
            publishedAt: $item->publishedAt,
            schoolName: $item->schoolName,
            section: $item->section,
            contentHtml: $item->contentHtml,
            extra: $extraWithFlag,
            authorsParsed: $item->authorsParsed,
        );
    }

    return $item;
}
|
|
|
|
|
|
|
|
|
|
protected function responseBodyFromPoolResult(mixed $result): ?string
|
|
|
|
|
@ -226,7 +392,11 @@ class FacultyListHtmlAdapter implements CrawlerAdapterInterface
|
|
|
|
|
|
|
|
|
|
protected function fetchHtml(string $url): string
|
|
|
|
|
{
|
|
|
|
|
$response = Http::timeout(30)
|
|
|
|
|
$timeout = max(5, (int) config('crawl.faculty.list_http_timeout_seconds', 20));
|
|
|
|
|
|
|
|
|
|
$response = Http::timeout($timeout)
|
|
|
|
|
->connectTimeout(min(8, $timeout))
|
|
|
|
|
->retry(1, 300, fn ($exception) => $exception instanceof \Illuminate\Http\Client\ConnectionException, throw: false)
|
|
|
|
|
->withHeaders(['User-Agent' => 'SlakeSchool-Crawler/1.0', 'Accept' => 'text/html'])
|
|
|
|
|
->get($url);
|
|
|
|
|
|
|
|
|
|
|