mergeNormalizedRequestUrl($request);

        $data = $request->validate([
            'request_url' => ['required', 'url', 'max:512'],
            'target_type' => ['required', 'in:paper,industry_news,teacher'],
        ]);

        $source = $resolver->resolve($data['request_url'], $data['target_type']);
        if (! $source) {
            return $this->fail('无法识别该地址,请确认 URL 可访问且入库类型正确', 422);
        }

        return $this->ok([
            'crawl_source_id' => $source->id,
            'source_name' => $source->name,
            'adapter_code' => $source->adapter_code,
            'target_type' => $source->target_type,
            'param_schema' => $source->param_schema,
            'entry_url' => $source->entry_url,
        ]);
    }

    /**
     * Return one crawl job (with its crawl source relation loaded) in serialized form.
     *
     * The lookup uses findOrFail(), so an unknown id does not reach the serializer.
     */
    public function show(int $crawlJob): JsonResponse
    {
        $job = CrawlJob::query()->with('crawlSource')->findOrFail($crawlJob);

        return $this->ok($this->serializeJob($job));
    }

    /**
     * List the items of a crawl job, newest id first, as a paginated response.
     *
     * Query parameters read here:
     *  - item_kind: one of paper / teacher_lead / teacher; any other value applies no filter.
     *  - keyword:   non-empty string matched with LIKE against title and canonical_url.
     *               `%` and `_` inside the keyword are not escaped.
     *  - page_size: clamped to the range 1..500, default 100.
     *
     * For item_kind=teacher_lead the rows of the current page are re-ordered so
     * that rows with a university/school come first, then by lead author name.
     * That ordering is applied after pagination, i.e. within a page only.
     */
    public function items(Request $request, int $crawlJob): JsonResponse
    {
        $job = CrawlJob::query()->findOrFail($crawlJob);

        $query = CrawlJobItem::query()
            ->where('crawl_job_id', $job->id)
            ->orderByDesc('id');

        // Strict allow-list: array or unknown values simply leave the query unfiltered.
        $kind = $request->query('item_kind');
        if (in_array($kind, ['paper', 'teacher_lead', 'teacher'], true)) {
            $query->where('target_type', $kind);
        }

        // Only a non-empty string is usable as a keyword. A truthiness check would
        // drop the keyword "0", and an array value cannot be interpolated.
        $keyword = $request->query('keyword');
        if (is_string($keyword) && $keyword !== '') {
            $pattern = "%{$keyword}%";
            $query->where(static function ($q) use ($pattern): void {
                $q->where('title', 'like', $pattern)
                    ->orWhere('canonical_url', 'like', $pattern);
            });
        }

        $pageSize = min(500, max(1, (int) $request->query('page_size', 100)));

        $paginator = $query->paginate($pageSize)->withQueryString();
        $paginator->getCollection()->transform(
            fn (CrawlJobItem $item): array => $this->serializeItem($item)
        );

        if ($kind === 'teacher_lead') {
            $hasSchool = static fn (array $row): bool => ($row['lead_author_university'] ?? '') !== ''
                || ($row['school_name'] ?? '') !== '';

            $sorted = $paginator->getCollection()
                ->sort(static function (array $a, array $b) use ($hasSchool): int {
                    // Rows that carry a school sort before rows that do not.
                    $bySchool = $hasSchool($b) <=> $hasSchool($a);
                    if ($bySchool !== 0) {
                        return $bySchool;
                    }

                    return strcmp(
                        (string) ($a['lead_author_name'] ?? ''),
                        (string) ($b['lead_author_name'] ?? ''),
                    );
                })
                ->values();

            $paginator->setCollection($sorted);
        }

        return $this->paginated($paginator);
    }

    /**
     * Create a crawl job for the given URL and run it within this request.
     *
     * The request URL is normalized, validated and resolved to a crawl source;
     * an unresolvable URL yields a 422. The job is stored with status "pending"
     * and handed to the runner. If the runner throws, the job is marked
     * "failed" with the exception message and a 500 carrying the job id is
     * returned.
     */
    public function store(
        Request $request,
        CrawlSourceResolver $resolver,
        CrawlJobRunnerService $runner,
    ): JsonResponse {
        $this->mergeNormalizedRequestUrl($request);

        $data = $request->validate([
            'target_type' => ['required', 'in:paper,industry_news,teacher'],
            'request_url' => ['required', 'url', 'max:512'],
            'params' => ['nullable', 'array'],
        ]);
        $params = $data['params'] ?? [];

        $source = $resolver->resolve($data['request_url'], $data['target_type']);
        if (! $source) {
            return $this->fail('无法识别该地址对应的采集源', 422);
        }

        $job = CrawlJob::query()->create([
            'target_type' => $data['target_type'],
            'request_url' => $data['request_url'],
            'platform_url' => $data['request_url'],
            'keyword' => (string) ($params['keyword'] ?? ''),
            'params' => $params,
            'crawl_source_id' => $source->id,
            'adapter_code' => $source->adapter_code,
            'admin_user_id' => $request->user()?->id,
            'status' => 'pending',
        ]);

        try {
            $job = $runner->run($job, $source, $params);
        } catch (\Throwable $e) {
            // Persist the failure on the job row so it stays visible afterwards.
            $job->update([
                'status' => 'failed',
                'result_summary' => '抓取失败:'.$e->getMessage(),
                'completed_at' => now(),
            ]);

            return $this->fail('抓取任务失败:'.$e->getMessage(), 500, ['id' => $job->id]);
        }

        return $this->ok(
            $this->serializeJob($job->fresh(['crawlSource'])),
            '抓取完成,请预览并勾选入库',
        );
    }

    /**
     * Import selected (or all) items of a crawl job through the import service.
     *
     * Validates the selection and the optional teacher/news defaults, passes
     * them to the service, and returns the imported / skipped / failed figures
     * reported by the service plus the job's refreshed items_imported value.
     */
    public function import(Request $request, int $crawlJob, CrawlImportService $importService): JsonResponse
    {
        $job = CrawlJob::query()->findOrFail($crawlJob);

        $data = $request->validate([
            'item_ids' => ['nullable', 'array'],
            'item_ids.*' => ['integer'],
            'select_all' => ['nullable', 'boolean'],
            'teacher_defaults' => ['nullable', 'array'],
            'teacher_defaults.university_id' => ['nullable', 'integer', 'exists:universities,id'],
            'teacher_defaults.city' => ['nullable', 'string', 'max:64'],
            'teacher_defaults.research_direction_ids' => ['nullable', 'array'],
            'teacher_defaults.research_direction_ids.*' => ['integer', 'exists:research_directions,id'],
            'news_defaults' => ['nullable', 'array'],
            'news_defaults.source' => ['nullable', 'string', 'max:128'],
            'news_defaults.category_dict_item_id' => ['nullable', 'integer'],
        ]);

        $result = $importService->import(
            $job,
            $data['item_ids'] ?? null,
            (bool) ($data['select_all'] ?? false),
            $data['teacher_defaults'] ?? [],
            $data['news_defaults'] ?? [],
        );

        return $this->ok([
            'imported' => $result['imported'],
            'skipped' => $result['skipped'],
            'failed' => $result['failed'],
            'items_imported' => $job->fresh()->items_imported,
        ], "已入库 {$result['imported']} 条");
    }

    /**
     * Update the category and/or import source stored in a news item's payload.
     *
     * Only items of the given job whose target_type is "news" can be edited.
     * The category must be an active dict item of the active "news_category"
     * dict type; if that type is missing a 422 is returned. A null or absent
     * category leaves the stored category untouched.
     */
    public function updateItem(Request $request, int $crawlJob, int $item): JsonResponse
    {
        // Existence check only; the job itself is not used further.
        CrawlJob::query()->findOrFail($crawlJob);

        $row = CrawlJobItem::query()
            ->where('crawl_job_id', $crawlJob)
            ->where('target_type', 'news')
            ->findOrFail($item);

        $typeId = DictType::query()
            ->where('code', 'news_category')
            ->where('status', 1)
            ->value('id');
        if (! $typeId) {
            return $this->fail('资讯分类字典未配置', 422);
        }

        $data = $request->validate([
            'category_dict_item_id' => [
                'nullable',
                'integer',
                \Illuminate\Validation\Rule::exists('dict_items', 'id')->where(
                    fn ($q) => $q->where('dict_type_id', $typeId)->where('status', 1)
                ),
            ],
            'import_source' => ['nullable', 'string', 'max:128'],
        ]);

        $payload = $row->payload ?? [];
        $extra = $payload['extra'] ?? [];

        if (array_key_exists('category_dict_item_id', $data) && $data['category_dict_item_id']) {
            $dictItem = DictItem::query()->findOrFail($data['category_dict_item_id']);
            $extra['category_dict_item_id'] = (int) $dictItem->id;
            $extra['category_label'] = $dictItem->label;
        }

        if (array_key_exists('import_source', $data)) {
            $extra['import_source'] = trim((string) $data['import_source']);
        }

        $payload['extra'] = $extra;
        $row->update(['payload' => $payload]);

        return $this->ok($this->serializeItem($row->fresh()), '已更新');
    }

    /**
     * Build the API representation of a crawl job, including preview counters.
     *
     * The counters cover items of the job whose status is "preview": the
     * overall total plus the totals for the paper, teacher_lead and teacher
     * target types. They are fetched with one grouped query.
     *
     * @return array<string, mixed>
     */
    protected function serializeJob(CrawlJob $job): array
    {
        $previewByType = CrawlJobItem::query()
            ->where('crawl_job_id', $job->id)
            ->where('status', 'preview')
            ->selectRaw('target_type, COUNT(*) AS preview_total')
            ->groupBy('target_type')
            ->pluck('preview_total', 'target_type')
            // Some drivers return aggregates as strings; normalize to int.
            ->map(static fn ($total): int => (int) $total);

        return [
            'id' => $job->id,
            'target_type' => $job->target_type,
            'request_url' => $job->request_url ?? $job->platform_url,
            'platform_url' => $job->platform_url,
            'keyword' => $job->keyword,
            'params' => $job->params,
            'status' => $job->status,
            'source_name' => $job->crawlSource?->name,
            'adapter_code' => $job->adapter_code,
            'items_fetched' => $job->items_fetched ?? $job->papers_created,
            'items_imported' => $job->items_imported ?? 0,
            'papers_created' => $job->papers_created,
            'preview_count' => (int) $previewByType->sum(),
            'preview_paper_count' => $previewByType->get('paper', 0),
            'preview_teacher_lead_count' => $previewByType->get('teacher_lead', 0),
            'preview_teacher_count' => $previewByType->get('teacher', 0),
            'result_summary' => $job->result_summary,
            'completed_at' => $job->completed_at?->toIso8601String(),
        ];
    }

    /**
     * Build the API representation of a crawl job item.
     *
     * Most fields are read from the item's payload. Lead-author fields come
     * from payload['lead_author'] when that entry is an array; affiliation and
     * academic title fall back to payload-level college_name / academic_title.
     * An item is "selectable" while its status is "preview" and flagged
     * "is_duplicate" when its status is "duplicate".
     *
     * @return array<string, mixed>
     */
    protected function serializeItem(CrawlJobItem $item): array
    {
        $payload = $item->payload ?? [];

        // Treat a missing or non-array lead author as "no lead author data".
        $lead = $payload['lead_author'] ?? null;
        $lead = is_array($lead) ? $lead : [];

        $extra = $payload['extra'] ?? [];

        return [
            'id' => $item->id,
            'external_id' => $item->external_id,
            'title' => $item->title,
            'authors' => $payload['authors'] ?? null,
            'school_name' => $payload['school_name'] ?? ($lead['university_name'] ?? null),
            'published_at' => $this->formatPublishedAt($payload['published_at'] ?? null),
            'lead_author_name' => $lead['name'] ?? null,
            'lead_author_email' => $lead['email'] ?? null,
            'lead_author_affiliation' => $lead['college']
                ?? $lead['affiliation']
                ?? $payload['college_name']
                ?? null,
            'lead_author_university' => $lead['university_name'] ?? null,
            'lead_author_academic_title' => $lead['academic_title']
                ?? $payload['academic_title']
                ?? null,
            'paper_title' => $payload['paper_title'] ?? null,
            'paper_external_id' => $payload['paper_external_id'] ?? null,
            'url' => $item->canonical_url,
            'summary' => $payload['summary'] ?? null,
            'content_html' => $payload['content_html'] ?? null,
            'section' => $payload['section'] ?? null,
            'category_dict_item_id' => isset($extra['category_dict_item_id'])
                ? (int) $extra['category_dict_item_id']
                : null,
            'category_label' => $extra['category_label'] ?? null,
            'import_source' => $extra['import_source'] ?? null,
            'status' => $item->status,
            'target_type' => $item->target_type,
            'source_name' => $item->source_name,
            'selectable' => $item->status === 'preview',
            'is_duplicate' => $item->status === 'duplicate',
        ];
    }

    /**
     * Reduce a published-at value to its leading YYYY-MM-DD part when it has one.
     *
     * Returns null for empty values and for values that cannot be converted to
     * a string (arrays, objects without __toString). Strings that do not start
     * with a YYYY-MM-DD date are returned unchanged.
     */
    protected function formatPublishedAt(mixed $value): ?string
    {
        if (! $value) {
            return null;
        }

        // Casting an array or a plain object to string would raise; treat as "no date".
        if (! is_scalar($value) && ! $value instanceof \Stringable) {
            return null;
        }

        $str = (string) $value;
        if (preg_match('/^\d{4}-\d{2}-\d{2}/', $str, $m) === 1) {
            return $m[0];
        }

        return $str;
    }

    /**
     * Pasted addresses often lack a scheme; prepend https:// so the value can
     * pass the `url` validation rule and domain matching.
     *
     * Values that already start with http:// or https:// (case-insensitive),
     * empty values and non-scalar values are left untouched. A protocol-relative
     * value ("//host/path") receives only the "https:" prefix.
     */
    protected function mergeNormalizedRequestUrl(Request $request): void
    {
        $input = $request->input('request_url', '');
        if (! is_scalar($input)) {
            // null or array: nothing to normalize, the validator reports the problem.
            return;
        }

        $raw = trim((string) $input);
        if ($raw === '' || preg_match('#^https?://#i', $raw) === 1) {
            return;
        }

        $normalized = str_starts_with($raw, '//')
            ? 'https:'.$raw
            : 'https://'.$raw;

        $request->merge(['request_url' => $normalized]);
    }
}