You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

222 lines
7.8 KiB

<?php
namespace App\Services\Crawl;
use App\Models\CrawlJob;
use App\Models\CrawlJobItem;
use App\Models\CrawlSource;
use App\Models\News;
use App\Models\Paper;
use App\Models\Teacher;
/**
 * Runs a crawl job: fetches items through the dispatcher and stages each one
 * as a CrawlJobItem row, marking rows that already appear to exist in the
 * Paper / News / Teacher tables as 'duplicate' and everything else as 'preview'.
 *
 * Nothing is written to the Paper, News or Teacher tables here; they are only
 * queried for duplicate detection.
 */
class CrawlJobRunnerService
{
    public function __construct(
        protected CrawlJobDispatcher $dispatcher,
    ) {}

    /**
     * Execute the job and stage every fetched item.
     *
     * Teacher sources stage one 'teacher' item per DTO. Paper and news sources
     * stage one 'paper' / 'news' item per DTO; paper sources additionally stage
     * a 'teacher_lead' item for the paper's lead author (not included in the count).
     *
     * @param array<string, mixed> $params Passed through to the dispatcher unchanged.
     *
     * @return CrawlJob The job reloaded from the database, or the in-memory
     *                  instance when the row can no longer be reloaded.
     */
    public function run(CrawlJob $job, CrawlSource $source, array $params): CrawlJob
    {
        $job->update(['status' => 'running']);

        // NOTE(review): nothing in this method catches a failure of fetch() or of the
        // item writes below, so the job is left in 'running' if an exception is thrown.
        // Confirm the caller handles that case.
        $dtos = $this->dispatcher->fetch(
            $job->request_url ?? $job->platform_url,
            $source,
            $params
        );

        // The source type does not change while iterating, so read it once.
        $targetType = $source->target_type;
        $count = 0;

        foreach ($dtos as $dto) {
            if ($targetType === 'teacher') {
                $count += $this->persistTeacherItem($job, $source, $dto) ? 1 : 0;

                continue;
            }

            $status = $this->previewStatusForPaperOrNews($source, $dto);
            $payload = $this->buildPayload($dto);

            // Keyed on (job, external id) so re-running the same job overwrites
            // the staged row instead of adding a second one.
            CrawlJobItem::query()->updateOrCreate(
                [
                    'crawl_job_id' => $job->id,
                    'external_id' => $dto->externalId,
                ],
                [
                    'canonical_url' => $dto->canonicalUrl,
                    'title' => $dto->title,
                    'payload' => $payload,
                    'status' => $status,
                    // Any non-paper, non-teacher source is staged as news.
                    'target_type' => $targetType === 'paper' ? 'paper' : 'news',
                    'source_name' => $source->name,
                    'target_id' => null,
                ]
            );
            $count++;

            if ($targetType === 'paper') {
                $this->persistTeacherLeadItem($job, $source, $dto, $payload, $status);
            }
        }

        $job->update([
            'status' => 'completed',
            'items_fetched' => $count,
            // Counts every staged paper item, including those flagged 'duplicate'.
            'papers_created' => $targetType === 'paper' ? $count : 0,
            'result_summary' => sprintf('已从 %s 抓取 %d 条(暂存预览,请勾选入库)', $source->name, $count),
            'completed_at' => now(),
        ]);

        // fresh() yields null when the row cannot be reloaded (e.g. it was deleted
        // meanwhile); fall back to the in-memory model so the declared return type holds.
        return $job->fresh() ?? $job;
    }

    /**
     * Decide whether a paper/news item should be staged as 'duplicate' or 'preview'.
     *
     * Papers are duplicates when a Paper with the same external id and source 'crawl'
     * exists. Any other source type is treated as news and is a duplicate when the DTO
     * has a canonical URL that already appears as a News source_url.
     */
    protected function previewStatusForPaperOrNews(CrawlSource $source, CrawlItemDto $dto): string
    {
        if ($source->target_type === 'paper') {
            $paperExists = Paper::query()
                ->where('external_id', $dto->externalId)
                ->where('source', 'crawl')
                ->exists();

            return $paperExists ? 'duplicate' : 'preview';
        }

        $url = $dto->canonicalUrl;
        // Items without a URL cannot be matched against News and stay 'preview'.
        if ($url && News::query()->where('source_url', $url)->exists()) {
            return 'duplicate';
        }

        return 'preview';
    }

    /**
     * Flatten a DTO into the array stored in CrawlJobItem.payload.
     *
     * The DTO's `extra` data is copied and enriched: `authors_parsed` is added when the
     * DTO has parsed authors, and `lead_author` is derived via CrawlAuthorParser when
     * `extra` does not already carry a non-null one. `authors_parsed` and `lead_author`
     * are also exposed at the top level of the payload.
     *
     * @return array<string, mixed>
     */
    protected function buildPayload(CrawlItemDto $dto): array
    {
        $extra = $dto->extra;

        if ($dto->authorsParsed !== []) {
            $extra['authors_parsed'] = $dto->authorsParsed;
        }

        if (! isset($extra['lead_author'])) {
            $extra['lead_author'] = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);
        }

        return [
            'authors' => $dto->authors,
            'summary' => $dto->summary,
            'published_at' => $dto->publishedAt,
            'school_name' => $dto->schoolName,
            'section' => $dto->section,
            'content_html' => $dto->contentHtml,
            'extra' => $extra,
            'authors_parsed' => $dto->authorsParsed,
            'lead_author' => $extra['lead_author'] ?? null,
        ];
    }

    /**
     * Stage a 'teacher_lead' item for the lead author of a paper.
     *
     * Does nothing when the payload has no lead author array or the lead author has a
     * blank name. The item is keyed as "lead:<paper external id>" within the job and is
     * flagged 'duplicate' only when the lead author's email matches an existing Teacher.
     *
     * @param array<string, mixed> $paperPayload Payload produced for the paper item.
     * @param string               $paperStatus  Status of the paper item. Currently not
     *                                           consulted: the lead is staged as 'preview'
     *                                           whether or not the paper is a duplicate.
     *                                           Kept so the signature stays compatible.
     */
    protected function persistTeacherLeadItem(
        CrawlJob $job,
        CrawlSource $source,
        CrawlItemDto $dto,
        array $paperPayload,
        string $paperStatus,
    ): void {
        $lead = $paperPayload['lead_author'] ?? null;
        if (! is_array($lead) || trim((string) ($lead['name'] ?? '')) === '') {
            return;
        }

        $isDuplicate = ! empty($lead['email'])
            && Teacher::query()->where('email', $lead['email'])->exists();

        CrawlJobItem::query()->updateOrCreate(
            [
                'crawl_job_id' => $job->id,
                // Prefixed so the lead row never collides with the paper row of the same job.
                'external_id' => 'lead:'.$dto->externalId,
            ],
            [
                'canonical_url' => $dto->canonicalUrl,
                'title' => (string) $lead['name'],
                'payload' => [
                    'lead_author' => $lead,
                    'paper_external_id' => $dto->externalId,
                    'paper_title' => $dto->title,
                    'school_name' => $lead['university_name'] ?? $dto->schoolName,
                ],
                'status' => $isDuplicate ? 'duplicate' : 'preview',
                'target_type' => 'teacher_lead',
                'source_name' => $source->name,
                'target_id' => null,
            ]
        );
    }

    /**
     * Stage a 'teacher' item for a DTO coming from a teacher source.
     *
     * Duplicate detection, in order:
     *  1. the lead author's email matches an existing Teacher;
     *  2. otherwise, when there is a name but no email: a Teacher whose remark contains
     *     the profile URL, or a Teacher with the same name (narrowed by department and
     *     university when those are known).
     *
     * @return bool Always true; the item is staged in every case.
     */
    protected function persistTeacherItem(CrawlJob $job, CrawlSource $source, CrawlItemDto $dto): bool
    {
        $lead = $dto->extra['lead_author'] ?? CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);
        $email = is_array($lead) ? ($lead['email'] ?? null) : null;
        $status = 'preview';

        if ($email && Teacher::query()->where('email', $email)->exists()) {
            $status = 'duplicate';
        } elseif (is_array($lead)) {
            $leadName = trim((string) ($lead['name'] ?? ''));
            $leadUniversity = trim((string) ($lead['university_name'] ?? ''));
            $profileUrl = trim((string) ($lead['profile_url'] ?? $dto->canonicalUrl ?? ''));
            $collegeName = trim((string) ($lead['college'] ?? $lead['affiliation'] ?? $dto->extra['college_name'] ?? ''));

            // The name-based fallback only applies when no email is available; a lead
            // with an unmatched email is staged as 'preview' without further checks.
            if ($leadName !== '' && ! $email) {
                // NOTE(review): $profileUrl and $leadUniversity are placed into LIKE patterns
                // unescaped, so any '%' or '_' they contain acts as a wildcard. Confirm this
                // is acceptable or escape them for the database in use.
                if ($profileUrl !== '' && Teacher::query()->where('remark', 'like', '%'.$profileUrl.'%')->exists()) {
                    $status = 'duplicate';
                } else {
                    $dup = Teacher::query()->where('name', $leadName);
                    if ($collegeName !== '') {
                        $dup->where('department', $collegeName);
                    }
                    if ($leadUniversity !== '') {
                        // Exact match or prefix match on the free-text university name.
                        $dup->where(function ($q) use ($leadUniversity) {
                            $q->where('university_text', $leadUniversity)
                                ->orWhere('university_text', 'like', $leadUniversity.'%');
                        });
                    }
                    if ($dup->exists()) {
                        $status = 'duplicate';
                    }
                }
            }
        }

        CrawlJobItem::query()->updateOrCreate(
            [
                'crawl_job_id' => $job->id,
                'external_id' => $dto->externalId,
            ],
            [
                'canonical_url' => $dto->canonicalUrl,
                'title' => $dto->title,
                'payload' => [
                    'lead_author' => $lead,
                    'school_name' => $dto->schoolName,
                    'summary' => $dto->summary,
                    // Values in the DTO's extra data win over those on the lead author.
                    'academic_title' => $dto->extra['academic_title'] ?? (is_array($lead) ? ($lead['academic_title'] ?? null) : null),
                    'college_name' => $dto->extra['college_name'] ?? (is_array($lead) ? ($lead['college'] ?? $lead['affiliation'] ?? null) : null),
                    'profile_url' => $dto->extra['profile_url'] ?? $dto->canonicalUrl,
                ],
                'status' => $status,
                'target_type' => 'teacher',
                'source_name' => $source->name,
                'target_id' => null,
            ]
        );

        return true;
    }
}