|null $itemIds
     * @param array{university_id?:int,city?:string,research_direction_ids?:int[]} $teacherDefaults
     * @param array{source?:string, category_dict_item_id?:int} $newsDefaults
     * @return array{imported:int, skipped:int, failed:int}
     *
     * Only items of the job that are still in status "preview" are considered.
     * Every item is processed inside its own nested transaction (a savepoint), so an
     * item that throws rolls back just its own partial writes and is counted as failed,
     * while the remaining items continue to be imported.
     */
    public function import(
        CrawlJob $job,
        ?array $itemIds = null,
        bool $selectAll = false,
        array $teacherDefaults = [],
        array $newsDefaults = [],
    ): array {
        $query = CrawlJobItem::query()
            ->where('crawl_job_id', $job->id)
            ->whereIn('status', ['preview']);

        // Only an explicit, non-empty ID list narrows the selection; null, [] and
        // $selectAll all mean "every preview item of this job".
        if (! $selectAll && $itemIds !== null && $itemIds !== []) {
            $query->whereIn('id', $itemIds);
        }

        $items = $query->get();

        $imported = 0;
        $skipped = 0;
        $failed = 0;

        DB::transaction(function () use ($items, $job, $teacherDefaults, $newsDefaults, &$imported, &$skipped, &$failed) {
            foreach ($items as $item) {
                try {
                    // Nested transaction => savepoint. Creating the target record and
                    // flagging the item are atomic; a failure undoes both.
                    $wasImported = DB::transaction(function () use ($job, $item, $teacherDefaults, $newsDefaults): bool {
                        $id = match ($item->target_type) {
                            'paper' => $this->importPaper($job, $item),
                            'teacher_lead', 'teacher' => $this->importTeacher($job, $item, $teacherDefaults),
                            default => $this->importNews($job, $item, $newsDefaults),
                        };

                        if ($id) {
                            $item->update(['status' => 'imported', 'target_id' => $id]);

                            return true;
                        }

                        // The importer may already have flagged the item (e.g. "duplicate");
                        // only untouched items are marked as skipped.
                        if ($item->status === 'preview') {
                            $item->update(['status' => 'skipped']);
                        }

                        return false;
                    });

                    if ($wasImported) {
                        $imported++;
                    } else {
                        $skipped++;
                    }
                } catch (\Throwable $e) {
                    // Best effort: one broken item must not abort the whole batch.
                    $failed++;
                    Log::warning('crawl_import_item_failed', [
                        'crawl_job_id' => $job->id,
                        'item_id' => $item->id,
                        'target_type' => $item->target_type,
                        'message' => $e->getMessage(),
                    ]);
                }
            }

            // Recount from the database so the job total also reflects earlier runs.
            $job->update([
                'items_imported' => CrawlJobItem::query()
                    ->where('crawl_job_id', $job->id)
                    ->where('status', 'imported')
                    ->count(),
            ]);
        });

        return compact('imported', 'skipped', 'failed');
    }

    /**
     * Create a Paper from a crawl item.
     *
     * Duplicates are detected among crawl-sourced papers by external ID; when the
     * item has no external ID the canonical URL is used instead. A duplicate item
     * is flagged "duplicate" and nothing is created.
     *
     * @return int|null ID of the created paper, or null when the item is a duplicate
     */
    protected function importPaper(CrawlJob $job, CrawlJobItem $item): ?int
    {
        $payload = $item->payload ?? [];
        $externalId = $item->external_id;
        $url = trim((string) ($item->canonical_url ?? ''));

        // where('external_id', null) would compile to "IS NULL" and match every
        // crawl paper lacking an ID, so the ID is only compared when it is present.
        $hasExternalId = $externalId !== null && trim((string) $externalId) !== '';

        $isDuplicate = match (true) {
            $hasExternalId => Paper::query()
                ->where('source', 'crawl')
                ->where('external_id', $externalId)
                ->exists(),
            $url !== '' => Paper::query()
                ->where('source', 'crawl')
                ->where('url', $url)
                ->exists(),
            default => false,
        };

        if ($isDuplicate) {
            $item->update(['status' => 'duplicate']);

            return null;
        }

        $paper = Paper::query()->create([
            'title' => $item->title,
            'authors' => $payload['authors'] ?? null,
            'school_name' => $payload['school_name'] ?? null,
            'published_at' => $payload['published_at'] ?? null,
            'url' => $item->canonical_url,
            'summary' => $payload['summary'] ?? null,
            'source' => 'crawl',
            'external_id' => $externalId,
            'source_site' => $item->source_name,
            'crawl_job_id' => $job->id,
        ]);

        return $paper->id;
    }

    /**
     * Create a Teacher from a crawl item (paper lead author or faculty-list entry).
     *
     * Returns null without creating anything when the name is empty or the teacher
     * is considered a duplicate (the item is then flagged "duplicate").
     *
     * @param array{university_id?:int,city?:string,research_direction_ids?:int[]} $defaults
     * @return int|null ID of the created teacher, or null when nothing was created
     *
     * @throws \RuntimeException when the teacher source/status dictionary entries are missing
     */
    protected function importTeacher(CrawlJob $job, CrawlJobItem $item, array $defaults = []): ?int
    {
        $payload = $item->payload ?? [];

        // Without a structured lead author, fall back to the item's own title/school.
        $lead = $payload['lead_author'] ?? null;
        if (! is_array($lead)) {
            $lead = [
                'name' => $item->title,
                'email' => null,
                'affiliation' => null,
                'university_name' => $payload['school_name'] ?? null,
            ];
        }

        $name = trim((string) ($lead['name'] ?? ''));
        if ($name === '') {
            return null;
        }

        // Cheap early exit on a known e-mail before any university lookups run.
        $email = CrawlAuthorParser::normalizeEmail($lead['email'] ?? null);
        if ($email && Teacher::query()->where('email', $email)->exists()) {
            $item->update(['status' => 'duplicate']);

            return null;
        }

        $academicTitle = trim((string) ($lead['academic_title'] ?? $payload['academic_title'] ?? ''));
        $collegeName = trim((string) ($lead['college'] ?? $lead['affiliation'] ?? $payload['college_name'] ?? ''));
        $profileUrl = trim((string) ($payload['profile_url'] ?? $item->canonical_url ?? ''));

        // Caller-supplied defaults win; the city is derived from the university when absent.
        $universityId = isset($defaults['university_id']) ? (int) $defaults['university_id'] : null;
        $city = isset($defaults['city']) ? trim((string) $defaults['city']) : null;
        if ($universityId && ! $city) {
            $city = University::query()->whereKey($universityId)->value('city');
        }

        // No default university: try to resolve one from the crawled university name.
        $leadUniversityName = trim((string) ($lead['university_name'] ?? ''));
        if (! $universityId && $leadUniversityName !== '') {
            $universityId = $this->resolveUniversityId($leadUniversityName);
            if ($universityId) {
                $city = $city ?: University::query()->whereKey($universityId)->value('city');
            }
        }

        if ($this->teacherAlreadyExists($name, $email, $profileUrl, $universityId, $leadUniversityName, $collegeName)) {
            $item->update(['status' => 'duplicate']);

            return null;
        }

        $sourceId = $this->resolveTeacherSourceId($item->target_type);
        $statusId = $this->defaultTeacherStatusId();
        if (! $sourceId || ! $statusId) {
            throw new \RuntimeException('老师库字典未配置');
        }

        $remarkParts = [
            match ($item->target_type) {
                'teacher_lead' => '论文库入库',
                'teacher' => '高校抓取入库',
                default => '爬虫入库',
            },
        ];
        // The profile URL stored here is what teacherAlreadyExists() later searches for.
        if ($profileUrl !== '') {
            $remarkParts[] = '主页:'.$profileUrl;
        }

        // Free-text university is only kept when no university record could be linked.
        $universityText = null;
        if (! $universityId) {
            if ($leadUniversityName !== '' && $collegeName !== '') {
                $universityText = $leadUniversityName.' · '.$collegeName;
            } elseif ($leadUniversityName !== '') {
                $universityText = $leadUniversityName;
            } elseif ($collegeName !== '') {
                $universityText = $collegeName;
            }
        }

        $teacher = Teacher::query()->create([
            'name' => $name,
            'university_id' => $universityId,
            'university_text' => $universityText,
            'department' => $collegeName !== '' ? $collegeName : null,
            'city' => $city ?: '待补充',
            'title' => $academicTitle !== '' ? $academicTitle : '待补充',
            'email' => $email,
            'source_dict_item_id' => $sourceId,
            'status_dict_item_id' => $statusId,
            'remark' => implode(';', $remarkParts),
        ]);

        if (! empty($defaults['research_direction_ids']) && is_array($defaults['research_direction_ids'])) {
            $teacher->researchDirections()->sync($defaults['research_direction_ids']);
        }

        // Link the teacher to the crawled paper they were extracted from, if it was imported.
        $paperExternalId = $payload['paper_external_id'] ?? null;
        if ($paperExternalId) {
            $paper = Paper::query()
                ->where('external_id', $paperExternalId)
                ->where('source', 'crawl')
                ->first();
            if ($paper) {
                $teacher->papers()->syncWithoutDetaching([$paper->id]);
            }
        }

        return $teacher->id;
    }

    /**
     * Create a draft News record (status 0) from a crawl item.
     *
     * The category is taken from the payload's "extra" data, then from the caller's
     * defaults, then from NewsCategoryMatcher; it stays null when none resolves.
     *
     * @param array{source?:string, category_dict_item_id?:int} $newsDefaults
     * @return int|null ID of the created news record, or null when the URL already exists
     *
     * @throws \RuntimeException when the item has no title
     */
    protected function importNews(CrawlJob $job, CrawlJobItem $item, array $newsDefaults = []): ?int
    {
        $url = $item->canonical_url;
        if ($url && News::query()->where('source_url', $url)->exists()) {
            $item->update(['status' => 'duplicate']);

            return null;
        }

        $title = trim((string) $item->title);
        if ($title === '') {
            throw new \RuntimeException('标题为空,无法入库');
        }

        $payload = $item->payload ?? [];
        $extra = $payload['extra'] ?? [];

        $categoryId = isset($extra['category_dict_item_id']) ? (int) $extra['category_dict_item_id'] : 0;
        if ($categoryId <= 0 && ! empty($newsDefaults['category_dict_item_id'])) {
            $categoryId = (int) $newsDefaults['category_dict_item_id'];
        }
        if ($categoryId <= 0) {
            $categoryId = (int) (app(NewsCategoryMatcher::class)->resolveCategoryId(
                $item->title,
                $payload['summary'] ?? null,
                $extra['keywords'] ?? CrawlKeywordParser::parse((string) ($job->keyword ?? '')),
            ) ?? 0);
        }
        if ($categoryId <= 0) {
            $categoryId = null;
        }

        $content = app(NewsHtmlImageLocalizer::class)->localize(
            $payload['content_html'] ?? '',
            $url
        ) ?? '';
        if ($content === '') {
            // Placeholder body. The literal spans three lines on purpose; its bytes
            // (including the line breaks) are kept exactly, so it must not be indented.
            $content = '
(爬虫抓取,请编辑正文)
';
        }

        $importSource = trim((string) ($extra['import_source'] ?? $newsDefaults['source'] ?? ''));
        if ($importSource === '') {
            $importSource = $item->source_name ?: '爬虫采集';
        }

        $news = News::query()->create([
            'title' => $title,
            'category_dict_item_id' => $categoryId,
            'source' => $importSource,
            'source_url' => $url,
            'source_site' => $importSource,
            'crawl_job_id' => $job->id,
            'summary' => $payload['summary'] ?? null,
            'content_html' => $content,
            'status' => 0,
            'published_at' => $this->resolvePublishedAt($payload['published_at'] ?? null),
        ]);

        return $news->id;
    }

    /**
     * Turn a crawled publication date into a Carbon instance.
     *
     * Falls back to HtmlCrawlSupport::normalizeDate() when Carbon cannot parse the
     * raw value, and to the current time when the value is empty, of an unusable
     * type, or cannot be parsed at all. Never throws.
     */
    protected function resolvePublishedAt(mixed $value): Carbon
    {
        if ($value === null || $value === '') {
            return now();
        }

        if ($value instanceof \DateTimeInterface) {
            try {
                return Carbon::parse($value);
            } catch (\Throwable) {
                return now();
            }
        }

        // Arrays and non-stringable objects can neither be parsed nor cast to string.
        if (! is_scalar($value) && ! $value instanceof \Stringable) {
            return now();
        }

        try {
            return Carbon::parse($value);
        } catch (\Throwable) {
            // Fall through to the crawler's own date normaliser.
        }

        $normalized = HtmlCrawlSupport::normalizeDate((string) $value);
        if (! $normalized) {
            return now();
        }

        try {
            return Carbon::parse($normalized.' 00:00:00');
        } catch (\Throwable) {
            return now();
        }
    }

    /**
     * Decide whether a crawled teacher is already present.
     *
     * Checks are exclusive and ordered: e-mail (when known), then profile URL
     * (searched inside the remark column), then name combined with department and
     * university.
     *
     * NOTE(review): the URL check is a LIKE substring match, so "_" / "%" inside the
     * URL act as wildcards and a URL that is a prefix of another one also matches.
     */
    protected function teacherAlreadyExists(
        string $name,
        ?string $email,
        ?string $profileUrl,
        ?int $universityId,
        string $leadUniversityName,
        ?string $collegeName,
    ): bool {
        // The signature allows null; treat it as "not provided" so a null URL cannot
        // turn into the pattern '%%', which would match every teacher with a remark.
        $profileUrl = (string) $profileUrl;
        $collegeName = (string) $collegeName;

        // A known e-mail is authoritative: no other criteria are consulted.
        if ($email) {
            return Teacher::query()->where('email', $email)->exists();
        }

        if ($profileUrl !== '') {
            return Teacher::query()->where('remark', 'like', '%'.$profileUrl.'%')->exists();
        }

        $dupQuery = Teacher::query()->where('name', $name);
        if ($collegeName !== '') {
            $dupQuery->where('department', $collegeName);
        }

        if ($universityId) {
            $dupQuery->where('university_id', $universityId);
        } elseif ($leadUniversityName !== '') {
            // university_text may be "University · College", hence the prefix match.
            $dupQuery->where(function ($q) use ($leadUniversityName) {
                $q->where('university_text', $leadUniversityName)
                    ->orWhere('university_text', 'like', $leadUniversityName.'%');
            });
        }

        return $dupQuery->exists();
    }

    /**
     * Look up an enabled (status 1) university by its exact name.
     *
     * @return int|null the university ID, or null when the name is blank or unknown
     */
    protected function resolveUniversityId(string $name): ?int
    {
        $name = trim($name);
        if ($name === '') {
            return null;
        }

        $id = University::query()
            ->where('name', $name)
            ->where('status', 1)
            ->value('id');

        return $id ? (int) $id : null;
    }

    /**
     * Map the item target type to a "teacher_source" dictionary item:
     * paper lead author → "paper"; faculty list item → "faculty_crawl";
     * anything else → "manual".
     *
     * @return int|null the dictionary item ID, or null when it is not configured
     */
    protected function resolveTeacherSourceId(string $targetType): ?int
    {
        $value = match ($targetType) {
            'teacher_lead' => 'paper',
            'teacher' => 'faculty_crawl',
            default => 'manual',
        };

        return $this->findActiveDictItemId('teacher_source', $value);
    }

    /**
     * ID of the "active" item of the "teacher_status" dictionary.
     *
     * @return int|null the dictionary item ID, or null when it is not configured
     */
    protected function defaultTeacherStatusId(): ?int
    {
        return $this->findActiveDictItemId('teacher_status', 'active');
    }

    /**
     * Find an enabled (status 1) dictionary item by its type code and value.
     *
     * @return int|null the item ID, or null when the type or the item is missing/disabled
     */
    private function findActiveDictItemId(string $typeCode, string $itemValue): ?int
    {
        $typeId = DictType::query()
            ->where('code', $typeCode)
            ->where('status', 1)
            ->value('id');
        if (! $typeId) {
            return null;
        }

        $itemId = DictItem::query()
            ->where('dict_type_id', $typeId)
            ->where('value', $itemValue)
            ->where('status', 1)
            ->value('id');

        // Some drivers return numeric strings; cast so the ?int contract always holds.
        return $itemId !== null ? (int) $itemId : null;
    }
}