$params */
    public function run(CrawlJob $job, CrawlSource $source, array $params): CrawlJob
    {
        // Flow: mark the job running, fetch DTOs through the dispatcher, stage every
        // DTO as a CrawlJobItem preview row, then mark the job completed with counts.
        //
        // NOTE(review): if fetch() or any persist call throws, the job is left in
        // 'running'. No failure status value is visible from this class, so failure
        // handling is left to the caller — confirm that it exists.
        $job->update(['status' => 'running']);

        $dtos = $this->dispatcher->fetch(
            $job->request_url ?? $job->platform_url,
            $source,
            $params
        );

        // The source's target type does not change while iterating; read it once.
        $targetType = $source->target_type;
        $isTeacherSource = $targetType === 'teacher';
        $isPaperSource = $targetType === 'paper';

        $count = 0;
        foreach ($dtos as $dto) {
            if ($isTeacherSource) {
                $count += $this->persistTeacherItem($job, $source, $dto) ? 1 : 0;

                continue;
            }

            $status = $this->previewStatusForPaperOrNews($source, $dto);
            $payload = $this->buildPayload($dto);

            // Keyed on (job, external id) so the same item seen twice in one job
            // overwrites its staged row instead of creating a second one.
            CrawlJobItem::query()->updateOrCreate(
                [
                    'crawl_job_id' => $job->id,
                    'external_id' => $dto->externalId,
                ],
                [
                    'canonical_url' => $dto->canonicalUrl,
                    'title' => $dto->title,
                    'payload' => $payload,
                    'status' => $status,
                    // Anything that is neither 'teacher' nor 'paper' is staged as news.
                    'target_type' => $isPaperSource ? 'paper' : 'news',
                    'source_name' => $source->name,
                    'target_id' => null,
                ]
            );
            $count++;

            if ($isPaperSource) {
                // Papers additionally stage a row for their lead author.
                // That extra row is not added to $count.
                $this->persistTeacherLeadItem($job, $source, $dto, $payload, $status);
            }
        }

        $job->update([
            'status' => 'completed',
            'items_fetched' => $count,
            'papers_created' => $isPaperSource ? $count : 0,
            'result_summary' => sprintf('已从 %s 抓取 %d 条(暂存预览,请勾选入库)', $source->name, $count),
            'completed_at' => now(),
        ]);

        // fresh() returns null when the row no longer exists; fall back to the
        // in-memory model so the non-nullable return type is always honoured.
        return $job->fresh() ?? $job;
    }

    /**
     * Decide whether a fetched paper/news item is already stored.
     *
     * Papers are matched on external_id among rows whose source is 'crawl';
     * every other target type is matched against News by canonical URL.
     *
     * @return string 'duplicate' when a matching record exists, otherwise 'preview'
     */
    protected function previewStatusForPaperOrNews(CrawlSource $source, CrawlItemDto $dto): string
    {
        if ($source->target_type === 'paper') {
            $paperExists = Paper::query()
                ->where('external_id', $dto->externalId)
                ->where('source', 'crawl')
                ->exists();

            return $paperExists ? 'duplicate' : 'preview';
        }

        $url = $dto->canonicalUrl;
        // An empty/missing URL cannot be matched, so it is never a duplicate.
        if ($url && News::query()->where('source_url', $url)->exists()) {
            return 'duplicate';
        }

        return 'preview';
    }

    /**
     * Build the payload array stored on a staged paper/news item.
     *
     * The DTO's extra data is copied and enriched: the parsed author list is
     * added when non-empty, and a lead author is derived through
     * CrawlAuthorParser::leadAuthor() when the extra data does not already
     * carry one.
     *
     * @return array string-keyed payload for CrawlJobItem
     */
    protected function buildPayload(CrawlItemDto $dto): array
    {
        $extra = $dto->extra;

        if ($dto->authorsParsed !== []) {
            $extra['authors_parsed'] = $dto->authorsParsed;
        }

        if (! isset($extra['lead_author'])) {
            $extra['lead_author'] = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);
        }

        return [
            'authors' => $dto->authors,
            'summary' => $dto->summary,
            'published_at' => $dto->publishedAt,
            'school_name' => $dto->schoolName,
            'section' => $dto->section,
            'content_html' => $dto->contentHtml,
            'extra' => $extra,
            'authors_parsed' => $dto->authorsParsed,
            // The parser result may itself be null (isset() is false for null).
            'lead_author' => $extra['lead_author'] ?? null,
        ];
    }

    /**
     * Stage a "teacher_lead" item for the lead author of a paper.
     *
     * Nothing is written when the payload has no lead author array or the lead
     * author's name is blank. The row is keyed on 'lead:' + the paper's
     * external id and is marked 'duplicate' only when a Teacher with the lead
     * author's email already exists.
     *
     * @param array $paperPayload payload produced for the paper item; only 'lead_author' is read
     * @param string $paperStatus preview status of the paper; kept for signature
     *                            compatibility, it does not affect the stored status
     */
    protected function persistTeacherLeadItem(
        CrawlJob $job,
        CrawlSource $source,
        CrawlItemDto $dto,
        array $paperPayload,
        string $paperStatus,
    ): void {
        $lead = $paperPayload['lead_author'] ?? null;
        if (! is_array($lead) || trim((string) ($lead['name'] ?? '')) === '') {
            return;
        }

        $leadExternalId = 'lead:'.$dto->externalId;

        $teacherExists = ! empty($lead['email'])
            && Teacher::query()->where('email', $lead['email'])->exists();

        CrawlJobItem::query()->updateOrCreate(
            [
                'crawl_job_id' => $job->id,
                'external_id' => $leadExternalId,
            ],
            [
                'canonical_url' => $dto->canonicalUrl,
                'title' => (string) $lead['name'],
                'payload' => [
                    'lead_author' => $lead,
                    'paper_external_id' => $dto->externalId,
                    'paper_title' => $dto->title,
                    'school_name' => $lead['university_name'] ?? $dto->schoolName,
                ],
                // The previous expression branched on $paperStatus but produced
                // 'preview' on both sides, so only the teacher lookup matters.
                'status' => $teacherExists ? 'duplicate' : 'preview',
                'target_type' => 'teacher_lead',
                'source_name' => $source->name,
                'target_id' => null,
            ]
        );
    }

    /**
     * Stage a crawled teacher profile as a "teacher" item.
     *
     * Duplicate detection, in order:
     *  1. a Teacher with the lead author's email exists;
     *  2. otherwise, only when there is a name and no email: a Teacher whose
     *     remark contains the profile URL exists;
     *  3. otherwise: a Teacher with the same name exists, narrowed by department
     *     and by university text (exact or prefix match) when those are known.
     *
     * @return bool always true; the caller uses it to count the item
     */
    protected function persistTeacherItem(CrawlJob $job, CrawlSource $source, CrawlItemDto $dto): bool
    {
        $lead = $dto->extra['lead_author']
            ?? CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);
        $email = is_array($lead) ? ($lead['email'] ?? null) : null;

        $status = 'preview';

        if ($email && Teacher::query()->where('email', $email)->exists()) {
            $status = 'duplicate';
        } elseif (is_array($lead)) {
            $leadName = trim((string) ($lead['name'] ?? ''));
            $leadUniversity = trim((string) ($lead['university_name'] ?? ''));
            $profileUrl = trim((string) ($lead['profile_url'] ?? $dto->canonicalUrl ?? ''));
            $collegeName = trim((string) (
                $lead['college']
                ?? $lead['affiliation']
                ?? $dto->extra['college_name']
                ?? ''
            ));

            // Name-based matching is only attempted when no email is available.
            if ($leadName !== '' && ! $email) {
                // NOTE(review): $profileUrl and $leadUniversity are crawled values
                // placed into LIKE patterns unescaped, so '%' and '_' inside them
                // act as wildcards. Escaping depends on the database driver's
                // escape character — confirm and escape accordingly.
                $matchedByUrl = $profileUrl !== ''
                    && Teacher::query()->where('remark', 'like', '%'.$profileUrl.'%')->exists();

                if ($matchedByUrl) {
                    $status = 'duplicate';
                } else {
                    $candidates = Teacher::query()->where('name', $leadName);

                    if ($collegeName !== '') {
                        $candidates->where('department', $collegeName);
                    }

                    if ($leadUniversity !== '') {
                        // Grouped so the OR does not escape the surrounding AND conditions.
                        $candidates->where(static function ($q) use ($leadUniversity) {
                            $q->where('university_text', $leadUniversity)
                                ->orWhere('university_text', 'like', $leadUniversity.'%');
                        });
                    }

                    if ($candidates->exists()) {
                        $status = 'duplicate';
                    }
                }
            }
        }

        $leadIsArray = is_array($lead);

        CrawlJobItem::query()->updateOrCreate(
            [
                'crawl_job_id' => $job->id,
                'external_id' => $dto->externalId,
            ],
            [
                'canonical_url' => $dto->canonicalUrl,
                'title' => $dto->title,
                'payload' => [
                    'lead_author' => $lead,
                    'school_name' => $dto->schoolName,
                    'summary' => $dto->summary,
                    // DTO extra data wins; the lead author array is the fallback.
                    'academic_title' => $dto->extra['academic_title']
                        ?? ($leadIsArray ? ($lead['academic_title'] ?? null) : null),
                    'college_name' => $dto->extra['college_name']
                        ?? ($leadIsArray ? ($lead['college'] ?? $lead['affiliation'] ?? null) : null),
                    'profile_url' => $dto->extra['profile_url'] ?? $dto->canonicalUrl,
                ],
                'status' => $status,
                'target_type' => 'teacher',
                'source_name' => $source->name,
                'target_id' => null,
            ]
        );

        return true;
    }
}