You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

309 lines
12 KiB

<?php
namespace App\Http\Controllers\Admin;
use App\Http\Controllers\Controller;
use App\Models\CrawlJob;
use App\Models\CrawlJobItem;
use App\Models\DictItem;
use App\Models\DictType;
use App\Services\Crawl\CrawlImportService;
use App\Services\Crawl\CrawlJobRunnerService;
use App\Services\Crawl\CrawlKeywordParser;
use App\Services\Crawl\CrawlSourceResolver;
use App\Support\ApiResponse;
use Illuminate\Http\JsonResponse;
use Illuminate\Http\Request;
class CrawlJobController extends Controller
{
use ApiResponse;
/**
 * Resolve a user-supplied URL to a crawl source for the given target type.
 *
 * Validates `request_url` and `target_type`, asks the resolver for a matching
 * source, and returns that source's descriptive fields. When the resolver
 * returns nothing, the request is answered through fail() with code 422.
 */
public function resolveUrl(Request $request, CrawlSourceResolver $resolver): JsonResponse
{
    $rules = [
        'request_url' => ['required', 'url', 'max:512'],
        'target_type' => ['required', 'in:paper,industry_news,teacher'],
    ];
    $input = $request->validate($rules);

    $matched = $resolver->resolve($input['request_url'], $input['target_type']);

    if ($matched) {
        return $this->ok([
            'crawl_source_id' => $matched->id,
            'source_name' => $matched->name,
            'adapter_code' => $matched->adapter_code,
            'target_type' => $matched->target_type,
            'param_schema' => $matched->param_schema,
            'entry_url' => $matched->entry_url,
        ]);
    }

    return $this->fail('无法识别该地址,请确认 URL 或在采集源中配置域名匹配', 422);
}
/**
 * Return the serialized form of a single crawl job.
 *
 * The job is loaded together with its `crawlSource` relation; findOrFail()
 * aborts the request when no job has the given id.
 */
public function show(int $crawlJob): JsonResponse
{
    $record = CrawlJob::query()
        ->with('crawlSource')
        ->findOrFail($crawlJob);

    $body = $this->serializeJob($record);

    return $this->ok($body);
}
/**
 * List the items collected by a crawl job, newest first, paginated.
 *
 * Query-string parameters read here:
 *  - item_kind: `paper`, `teacher_lead` or `teacher` restricts `target_type`;
 *    any other value is ignored.
 *  - keyword:   substring match against `title` or `canonical_url`.
 *  - page_size: rows per page; defaults to 100, non-positive or non-numeric
 *    values fall back to the default, and the value is capped at 1000.
 *
 * For `item_kind=teacher_lead` the rows of the current page are re-ordered so
 * that entries with a known university/school come first, then by lead author
 * name. This re-ordering applies to the fetched page only, not to the whole
 * result set.
 */
public function items(Request $request, int $crawlJob): JsonResponse
{
    $job = CrawlJob::query()->findOrFail($crawlJob);
    $query = CrawlJobItem::query()->where('crawl_job_id', $job->id)->orderByDesc('id');

    // Query parameters can arrive as arrays (?item_kind[]=x); only accept strings.
    $kind = $request->query('item_kind');
    if (is_string($kind) && in_array($kind, ['paper', 'teacher_lead', 'teacher'], true)) {
        $query->where('target_type', $kind);
    }

    // Interpolating a non-string keyword into the LIKE pattern would raise
    // "Array to string conversion", so anything but a non-empty string is ignored.
    // NOTE(review): `%` and `_` inside the keyword still act as LIKE wildcards.
    $kw = $request->query('keyword');
    if (is_string($kw) && $kw !== '') {
        $query->where(function ($q) use ($kw) {
            $q->where('title', 'like', "%{$kw}%")
                ->orWhere('canonical_url', 'like', "%{$kw}%");
        });
    }

    // page_size is untrusted input: keep it inside a sane window so a request
    // cannot ask for an unbounded (or meaningless non-positive) page.
    $defaultPageSize = 100;
    $maxPageSize = 1000;
    $pageSize = (int) $request->query('page_size', $defaultPageSize);
    if ($pageSize < 1) {
        $pageSize = $defaultPageSize;
    }
    $pageSize = min($pageSize, $maxPageSize);

    $paginator = $query->paginate($pageSize)->withQueryString();
    $paginator->getCollection()->transform(fn (CrawlJobItem $i) => $this->serializeItem($i));

    if ($kind === 'teacher_lead') {
        $sorted = $paginator->getCollection()
            ->sort(function (array $a, array $b) {
                $aHas = ($a['lead_author_university'] ?? '') !== '' || ($a['school_name'] ?? '') !== '';
                $bHas = ($b['lead_author_university'] ?? '') !== '' || ($b['school_name'] ?? '') !== '';
                if ($aHas !== $bHas) {
                    // Rows that carry a university/school sort ahead of those that do not.
                    return $aHas ? -1 : 1;
                }

                return strcmp((string) ($a['lead_author_name'] ?? ''), (string) ($b['lead_author_name'] ?? ''));
            })
            ->values();
        $paginator->setCollection($sorted);
    }

    return $this->paginated($paginator);
}
/**
 * Create a crawl job for the submitted URL and run it within this request.
 *
 * Flow: validate input -> require at least one parsed keyword unless the
 * target type is `teacher` -> resolve the crawl source -> persist the job as
 * `pending` -> hand it to the runner. If the runner throws, the job is marked
 * `failed` with the exception message and the request is answered through
 * fail() with code 500 and the job id.
 */
public function store(
    Request $request,
    CrawlSourceResolver $resolver,
    CrawlJobRunnerService $runner,
): JsonResponse {
    $data = $request->validate([
        'target_type' => ['required', 'in:paper,industry_news,teacher'],
        'request_url' => ['required', 'url', 'max:512'],
        'params' => ['nullable', 'array'],
    ]);
    $params = $data['params'] ?? [];

    // `params.keyword` is not validated on its own, so it may be any JSON type.
    // Casting an array to string raises a conversion error; treat non-scalars as empty.
    $rawKeyword = $params['keyword'] ?? '';
    $keyword = is_scalar($rawKeyword) ? (string) $rawKeyword : '';

    if (
        $data['target_type'] !== 'teacher'
        && CrawlKeywordParser::parse($keyword) === []
    ) {
        return $this->fail('请填写至少一个搜索关键词', 422);
    }

    $source = $resolver->resolve($data['request_url'], $data['target_type']);
    if (! $source) {
        return $this->fail('无法识别该地址对应的采集源', 422);
    }

    $job = CrawlJob::query()->create([
        'target_type' => $data['target_type'],
        'request_url' => $data['request_url'],
        'platform_url' => $data['request_url'],
        'keyword' => $keyword,
        'params' => $params,
        'crawl_source_id' => $source->id,
        'adapter_code' => $source->adapter_code,
        'admin_user_id' => $request->user()?->id,
        'status' => 'pending',
    ]);

    try {
        $job = $runner->run($job, $source, $params);
    } catch (\Throwable $e) {
        // Hand the exception to the application's handler so the stack trace
        // is not lost; only the message is stored on the job and returned.
        report($e);

        $job->update([
            'status' => 'failed',
            'result_summary' => '抓取失败:'.$e->getMessage(),
            'completed_at' => now(),
        ]);

        return $this->fail('抓取任务失败:'.$e->getMessage(), 500, ['id' => $job->id]);
    }

    return $this->ok($this->serializeJob($job->fresh(['crawlSource'])), '抓取完成,请预览并勾选入库');
}
/**
 * Import selected (or all) items of a crawl job through the import service.
 *
 * Accepts an optional list of item ids, an optional `select_all` flag and
 * optional teacher default values; responds with the imported / skipped /
 * failed figures reported by the service plus the job's reloaded
 * `items_imported` value.
 */
public function import(Request $request, int $crawlJob, CrawlImportService $importService): JsonResponse
{
    $job = CrawlJob::query()->findOrFail($crawlJob);

    $rules = [
        'item_ids' => ['nullable', 'array'],
        'item_ids.*' => ['integer'],
        'select_all' => ['nullable', 'boolean'],
        'teacher_defaults' => ['nullable', 'array'],
        'teacher_defaults.university_id' => ['nullable', 'integer', 'exists:universities,id'],
        'teacher_defaults.city' => ['nullable', 'string', 'max:64'],
        'teacher_defaults.research_direction_ids' => ['nullable', 'array'],
        'teacher_defaults.research_direction_ids.*' => ['integer', 'exists:research_directions,id'],
    ];
    $input = $request->validate($rules);

    $itemIds = $input['item_ids'] ?? null;
    $selectAll = (bool) ($input['select_all'] ?? false);
    $teacherDefaults = $input['teacher_defaults'] ?? [];

    $outcome = $importService->import($job, $itemIds, $selectAll, $teacherDefaults);

    $body = [
        'imported' => $outcome['imported'],
        'skipped' => $outcome['skipped'],
        'failed' => $outcome['failed'],
        // Reload the job so the figure reflects what the import just wrote.
        'items_imported' => $job->fresh()->items_imported,
    ];

    return $this->ok($body, "已入库 {$outcome['imported']}");
}
/**
 * Assign a news category to one `news` item of a crawl job.
 *
 * The category must be an active (`status = 1`) dict item that belongs to the
 * active dict type with code `news_category`. The chosen id and label are
 * written into the item's `payload.extra`.
 */
public function updateItem(Request $request, int $crawlJob, int $item): JsonResponse
{
    // Existence check only: findOrFail() aborts when the job id is unknown.
    CrawlJob::query()->findOrFail($crawlJob);

    $entry = CrawlJobItem::query()
        ->where('crawl_job_id', $crawlJob)
        ->where('target_type', 'news')
        ->findOrFail($item);

    $categoryTypeId = DictType::query()
        ->where('code', 'news_category')
        ->where('status', 1)
        ->value('id');

    if (! $categoryTypeId) {
        return $this->fail('资讯分类字典未配置', 422);
    }

    // Restrict the exists-check to active items of the news-category dict type.
    $activeCategoryRule = \Illuminate\Validation\Rule::exists('dict_items', 'id')->where(
        function ($q) use ($categoryTypeId) {
            return $q->where('dict_type_id', $categoryTypeId)->where('status', 1);
        }
    );

    $input = $request->validate([
        'category_dict_item_id' => ['required', 'integer', $activeCategoryRule],
    ]);

    $category = DictItem::query()->findOrFail($input['category_dict_item_id']);

    $payload = $entry->payload ?? [];
    $payload['extra'] = array_replace($payload['extra'] ?? [], [
        'category_dict_item_id' => (int) $category->id,
        'category_label' => $category->label,
    ]);

    $entry->update(['payload' => $payload]);

    return $this->ok($this->serializeItem($entry->fresh()), '已更新分类');
}
/**
 * Build the API representation of a crawl job.
 *
 * Besides the job's own columns, the result carries the number of items that
 * are still in `preview` status: the overall total plus a per-type figure for
 * `paper`, `teacher_lead` and `teacher`.
 *
 * @return array<string, mixed>
 */
protected function serializeJob(CrawlJob $job): array
{
    // One grouped query instead of four separate COUNT queries that share the
    // same job/status filter. toBase() yields plain rows, so no model
    // casts or accessors are applied to the aggregate columns.
    $rows = CrawlJobItem::query()
        ->where('crawl_job_id', $job->id)
        ->where('status', 'preview')
        ->selectRaw('target_type, COUNT(*) AS preview_total')
        ->groupBy('target_type')
        ->toBase()
        ->get();

    $previewCount = 0;
    $previewByType = [];
    foreach ($rows as $row) {
        // Some drivers return aggregates as strings; normalise to int.
        $count = (int) $row->preview_total;
        $previewCount += $count;
        $previewByType[(string) $row->target_type] = $count;
    }

    return [
        'id' => $job->id,
        'target_type' => $job->target_type,
        'request_url' => $job->request_url ?? $job->platform_url,
        'platform_url' => $job->platform_url,
        'keyword' => $job->keyword,
        'params' => $job->params,
        'status' => $job->status,
        'source_name' => $job->crawlSource?->name,
        'adapter_code' => $job->adapter_code,
        'items_fetched' => $job->items_fetched ?? $job->papers_created,
        'items_imported' => $job->items_imported ?? 0,
        'papers_created' => $job->papers_created,
        'preview_count' => $previewCount,
        'preview_paper_count' => $previewByType['paper'] ?? 0,
        'preview_teacher_lead_count' => $previewByType['teacher_lead'] ?? 0,
        'preview_teacher_count' => $previewByType['teacher'] ?? 0,
        'result_summary' => $job->result_summary,
        'completed_at' => $job->completed_at?->toIso8601String(),
    ];
}
/**
 * Flatten a crawl job item and its payload into the shape returned by the API.
 *
 * Lead-author fields are only filled when `payload.lead_author` is an array;
 * `school_name` falls back to the lead author's `university_name`. An item is
 * `selectable` while its status is `preview` and `is_duplicate` when its
 * status is `duplicate`.
 *
 * @return array<string, mixed>
 */
protected function serializeItem(CrawlJobItem $item): array
{
    $payload = $item->payload ?? [];

    // Normalise the optional sub-structures once instead of re-checking per field.
    $leadRaw = $payload['lead_author'] ?? null;
    $leadFields = is_array($leadRaw) ? $leadRaw : [];
    $extra = $payload['extra'] ?? [];

    $leadUniversity = $leadFields['university_name'] ?? null;
    $categoryId = $extra['category_dict_item_id'] ?? null;
    $status = $item->status;

    return [
        'id' => $item->id,
        'external_id' => $item->external_id,
        'title' => $item->title,
        'authors' => $payload['authors'] ?? null,
        'school_name' => $payload['school_name'] ?? $leadUniversity,
        'published_at' => $this->formatPublishedAt($payload['published_at'] ?? null),
        'lead_author_name' => $leadFields['name'] ?? null,
        'lead_author_email' => $leadFields['email'] ?? null,
        'lead_author_affiliation' => $leadFields['affiliation'] ?? null,
        'lead_author_university' => $leadUniversity,
        'paper_title' => $payload['paper_title'] ?? null,
        'paper_external_id' => $payload['paper_external_id'] ?? null,
        'url' => $item->canonical_url,
        'summary' => $payload['summary'] ?? null,
        'content_html' => $payload['content_html'] ?? null,
        'section' => $payload['section'] ?? null,
        'category_dict_item_id' => $categoryId === null ? null : (int) $categoryId,
        'category_label' => $extra['category_label'] ?? null,
        'status' => $status,
        'target_type' => $item->target_type,
        'source_name' => $item->source_name,
        'selectable' => $status === 'preview',
        'is_duplicate' => $status === 'duplicate',
    ];
}
/**
 * Normalise a payload `published_at` value for output.
 *
 * - Falsy values (null, '', 0, '0', []) yield null.
 * - Date objects are formatted as `Y-m-d`.
 * - Strings starting with `YYYY-MM-DD` are cut down to that date part.
 * - Any other scalar / stringable value is returned as a string unchanged.
 * - Values that cannot be represented as a string (e.g. arrays) yield null
 *   instead of triggering a conversion error.
 */
protected function formatPublishedAt(mixed $value): ?string
{
    if (! $value) {
        return null;
    }

    if ($value instanceof \DateTimeInterface) {
        return $value->format('Y-m-d');
    }

    // A string cast on an array or a non-stringable object raises an error;
    // such payload values carry no usable date, so report "unknown" instead.
    if (! is_scalar($value) && ! $value instanceof \Stringable) {
        return null;
    }

    $str = (string) $value;

    // The pattern matches exactly ten characters, so the match is the date itself.
    if (preg_match('/^\d{4}-\d{2}-\d{2}/', $str, $m) === 1) {
        return $m[0];
    }

    return $str;
}
}