$params */
    public function run(CrawlJob $job, CrawlSource $source, array $params): CrawlJob
    {
        // NOTE(review): nothing in this method catches exceptions. If fetch() or a
        // persistence call throws, the job keeps the 'running' status set here —
        // confirm that a caller is responsible for marking failed jobs.
        $job->update(['status' => 'running']);

        $dtos = $this->dispatcher->fetch(
            $job->request_url ?? $job->platform_url,
            $source,
            $params
        );

        $count = 0;

        foreach ($dtos as $dto) {
            // Teacher sources stage exactly one row per DTO and skip the
            // paper/news handling below.
            if ($source->target_type === 'teacher') {
                $count += $this->persistTeacherItem($job, $source, $dto) ? 1 : 0;

                continue;
            }

            $status = $this->previewStatusForPaperOrNews($source, $dto);
            $payload = $this->buildPayload($dto);

            // Keyed on (job, external id), so an item seen twice within the same
            // job overwrites its staged row instead of adding a second one.
            CrawlJobItem::query()->updateOrCreate(
                [
                    'crawl_job_id' => $job->id,
                    'external_id' => $dto->externalId,
                ],
                [
                    'canonical_url' => $dto->canonicalUrl,
                    'title' => $dto->title,
                    'payload' => $payload,
                    'status' => $status,
                    // Every non-paper source that reaches this point is staged as news.
                    'target_type' => $source->target_type === 'paper' ? 'paper' : 'news',
                    'source_name' => $source->name,
                    'target_id' => null,
                ]
            );
            $count++;

            // Papers additionally stage their lead author as a separate row.
            // That row is not added to $count.
            if ($source->target_type === 'paper') {
                $this->persistTeacherLeadItem($job, $source, $dto, $payload, $status);
            }
        }

        $job->update([
            'status' => 'completed',
            'items_fetched' => $count,
            // NOTE(review): this is the number of staged rows (including rows
            // flagged 'duplicate'); no Paper record is created in this method.
            'papers_created' => $source->target_type === 'paper' ? $count : 0,
            'result_summary' => sprintf('已从 %s 抓取 %d 条(暂存预览,请勾选入库)', $source->name, $count),
            'completed_at' => now(),
        ]);

        return $job->fresh();
    }

    /**
     * Decide the staging status of a paper or news item.
     *
     * Paper sources are matched against Paper rows with the same external_id
     * and source 'crawl'. Every other source is matched against News rows by
     * source_url, and only when the DTO carries a truthy canonical URL.
     *
     * @return string 'duplicate' when a matching record exists, otherwise 'preview'
     */
    protected function previewStatusForPaperOrNews(CrawlSource $source, CrawlItemDto $dto): string
    {
        if ($source->target_type === 'paper') {
            $paperExists = Paper::query()
                ->where('external_id', $dto->externalId)
                ->where('source', 'crawl')
                ->exists();

            return $paperExists ? 'duplicate' : 'preview';
        }

        $url = $dto->canonicalUrl;
        if ($url && News::query()->where('source_url', $url)->exists()) {
            return 'duplicate';
        }

        return 'preview';
    }

    /**
     * Build the payload array stored on a staged paper/news row.
     *
     * The DTO's extra data is copied and enriched: parsed authors are added
     * when the DTO has any, and a lead author is derived through
     * CrawlAuthorParser::leadAuthor() when extra does not already provide one.
     * The lead author is exposed both inside 'extra' and at the top level.
     *
     * @return array<string, mixed>
     */
    protected function buildPayload(CrawlItemDto $dto): array
    {
        $extra = $dto->extra;

        if ($dto->authorsParsed !== []) {
            $extra['authors_parsed'] = $dto->authorsParsed;
        }

        // isset() is false for a null value too, so an explicit null lead author
        // in extra is also replaced by the parsed one.
        if (! isset($extra['lead_author'])) {
            $extra['lead_author'] = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);
        }

        return [
            'authors' => $dto->authors,
            'summary' => $dto->summary,
            'published_at' => $dto->publishedAt,
            'school_name' => $dto->schoolName,
            'section' => $dto->section,
            'content_html' => $dto->contentHtml,
            'extra' => $extra,
            'authors_parsed' => $dto->authorsParsed,
            'lead_author' => $extra['lead_author'] ?? null,
        ];
    }

    /**
     * Stage the lead author of a paper as a separate 'teacher_lead' row.
     *
     * Nothing is written when the payload has no lead author array or the lead
     * author's name is blank. The row uses the external id 'lead:' followed by
     * the paper's external id, and is flagged 'duplicate' when a Teacher with
     * the lead author's email already exists.
     *
     * @param array<string, mixed> $paperPayload payload array holding the paper's 'lead_author' entry
     * @param string $paperStatus staging status of the paper itself; not used
     *                            when computing the lead row's status and kept
     *                            only for signature compatibility
     */
    protected function persistTeacherLeadItem(
        CrawlJob $job,
        CrawlSource $source,
        CrawlItemDto $dto,
        array $paperPayload,
        string $paperStatus,
    ): void {
        $lead = $paperPayload['lead_author'] ?? null;
        if (! is_array($lead) || trim((string) ($lead['name'] ?? '')) === '') {
            return;
        }

        // A lead without an email can never be matched, so it is always 'preview'.
        $isDuplicate = ! empty($lead['email'])
            && Teacher::query()->where('email', $lead['email'])->exists();

        CrawlJobItem::query()->updateOrCreate(
            [
                'crawl_job_id' => $job->id,
                'external_id' => 'lead:'.$dto->externalId,
            ],
            [
                'canonical_url' => $dto->canonicalUrl,
                'title' => (string) $lead['name'],
                'payload' => [
                    'lead_author' => $lead,
                    'paper_external_id' => $dto->externalId,
                    'paper_title' => $dto->title,
                    'school_name' => $lead['university_name'] ?? $dto->schoolName,
                ],
                // The status depends only on the teacher email lookup; the paper's
                // own status has no effect on it.
                'status' => $isDuplicate ? 'duplicate' : 'preview',
                'target_type' => 'teacher_lead',
                'source_name' => $source->name,
                'target_id' => null,
            ]
        );
    }

    /**
     * Stage one item from a teacher source as a 'teacher' row.
     *
     * The lead author comes from the DTO's extra data when present, otherwise
     * from CrawlAuthorParser::leadAuthor(). The row is flagged 'duplicate' when
     * a Teacher with the lead author's email already exists.
     *
     * @return bool always true in this implementation; run() counts the item when true
     */
    protected function persistTeacherItem(CrawlJob $job, CrawlSource $source, CrawlItemDto $dto): bool
    {
        $lead = $dto->extra['lead_author']
            ?? CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);

        // Only an array-shaped lead author can supply an email to match on.
        $email = is_array($lead) ? ($lead['email'] ?? null) : null;

        $status = 'preview';
        if ($email && Teacher::query()->where('email', $email)->exists()) {
            $status = 'duplicate';
        }

        CrawlJobItem::query()->updateOrCreate(
            [
                'crawl_job_id' => $job->id,
                'external_id' => $dto->externalId,
            ],
            [
                'canonical_url' => $dto->canonicalUrl,
                'title' => $dto->title,
                'payload' => [
                    'lead_author' => $lead,
                    'school_name' => $dto->schoolName,
                    'summary' => $dto->summary,
                ],
                'status' => $status,
                'target_type' => 'teacher',
                'source_name' => $source->name,
                'target_id' => null,
            ]
        );

        return true;
    }
}