You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
309 lines
12 KiB
309 lines
12 KiB
<?php
|
|
|
|
namespace App\Http\Controllers\Admin;
|
|
|
|
use App\Http\Controllers\Controller;
|
|
use App\Models\CrawlJob;
|
|
use App\Models\CrawlJobItem;
|
|
use App\Models\DictItem;
|
|
use App\Models\DictType;
|
|
use App\Services\Crawl\CrawlImportService;
|
|
use App\Services\Crawl\CrawlJobRunnerService;
|
|
use App\Services\Crawl\CrawlKeywordParser;
|
|
use App\Services\Crawl\CrawlSourceResolver;
|
|
use App\Support\ApiResponse;
|
|
use Illuminate\Http\JsonResponse;
|
|
use Illuminate\Http\Request;
|
|
|
|
class CrawlJobController extends Controller
|
|
{
|
|
use ApiResponse;
|
|
|
|
/**
 * Resolve a request URL to a crawl source and return that source's descriptor.
 *
 * Validates `request_url` (required URL, max 512 chars) and `target_type`
 * (paper / industry_news / teacher), then asks the resolver for a match.
 * Responds with a 422 failure when the resolver returns nothing.
 */
public function resolveUrl(Request $request, CrawlSourceResolver $resolver): JsonResponse
{
    $validated = $request->validate([
        'request_url' => ['required', 'url', 'max:512'],
        'target_type' => ['required', 'in:paper,industry_news,teacher'],
    ]);

    $matched = $resolver->resolve($validated['request_url'], $validated['target_type']);

    if ($matched) {
        // Expose only the fields the caller needs to pre-fill a crawl job.
        $descriptor = [
            'crawl_source_id' => $matched->id,
            'source_name' => $matched->name,
            'adapter_code' => $matched->adapter_code,
            'target_type' => $matched->target_type,
            'param_schema' => $matched->param_schema,
            'entry_url' => $matched->entry_url,
        ];

        return $this->ok($descriptor);
    }

    return $this->fail('无法识别该地址,请确认 URL 或在采集源中配置域名匹配', 422);
}
|
|
|
|
/**
 * Return one crawl job in serialized form.
 *
 * The job is loaded with its `crawlSource` relation via findOrFail, so an
 * unknown id does not reach the serializer.
 */
public function show(int $crawlJob): JsonResponse
{
    $found = CrawlJob::query()
        ->with('crawlSource')
        ->findOrFail($crawlJob);

    $body = $this->serializeJob($found);

    return $this->ok($body);
}
|
|
|
|
/**
 * List the items of a crawl job, newest first, paginated and optionally filtered.
 *
 * Query parameters:
 *  - item_kind: paper / teacher_lead / teacher; any other value is ignored.
 *  - keyword:   substring matched against `title` or `canonical_url`.
 *  - page_size: rows per page; defaults to 100, values below 1 fall back to
 *               the default and values above 1000 are capped.
 *
 * Non-string `item_kind` / `keyword` values (e.g. `keyword[]=x`) are ignored
 * instead of being interpolated into the LIKE pattern.
 */
public function items(Request $request, int $crawlJob): JsonResponse
{
    $defaultPageSize = 100;
    $maxPageSize = 1000;

    $job = CrawlJob::query()->findOrFail($crawlJob);
    $query = CrawlJobItem::query()->where('crawl_job_id', $job->id)->orderByDesc('id');

    $kind = $request->query('item_kind');
    if (is_string($kind) && in_array($kind, ['paper', 'teacher_lead', 'teacher'], true)) {
        $query->where('target_type', $kind);
    }

    $kw = $request->query('keyword');
    if (is_string($kw) && $kw !== '') {
        // NOTE(review): `%` and `_` inside the keyword still act as LIKE wildcards;
        // escaping them depends on the database's LIKE escape character.
        $query->where(function ($q) use ($kw) {
            $q->where('title', 'like', "%{$kw}%")
                ->orWhere('canonical_url', 'like', "%{$kw}%");
        });
    }

    // Never hand an unbounded or non-positive page size to the paginator.
    $requestedSize = (int) $request->query('page_size', $defaultPageSize);
    $pageSize = $requestedSize >= 1 ? min($requestedSize, $maxPageSize) : $defaultPageSize;

    $paginator = $query->paginate($pageSize)->withQueryString();
    $paginator->getCollection()->transform(fn (CrawlJobItem $i) => $this->serializeItem($i));

    if ($kind === 'teacher_lead') {
        // Re-order the current page only: rows that carry a university/school
        // come first, then rows are ordered by lead author name.
        $sorted = $paginator->getCollection()
            ->sort(static function (array $a, array $b): int {
                $aHas = ($a['lead_author_university'] ?? '') !== '' || ($a['school_name'] ?? '') !== '';
                $bHas = ($b['lead_author_university'] ?? '') !== '' || ($b['school_name'] ?? '') !== '';
                if ($aHas !== $bHas) {
                    return $aHas ? -1 : 1;
                }

                return strcmp((string) ($a['lead_author_name'] ?? ''), (string) ($b['lead_author_name'] ?? ''));
            })
            ->values();
        $paginator->setCollection($sorted);
    }

    return $this->paginated($paginator);
}
|
|
|
|
/**
 * Create a crawl job for the given URL and run it synchronously.
 *
 * Flow: validate input, require at least one parsed keyword unless the target
 * type is `teacher`, resolve the crawl source (422 when none matches), persist
 * a `pending` job, then hand it to the runner. If the runner throws, the job
 * is marked `failed`, the exception is reported, and a 500 failure carrying
 * the job id is returned.
 */
public function store(
    Request $request,
    CrawlSourceResolver $resolver,
    CrawlJobRunnerService $runner,
): JsonResponse {
    $data = $request->validate([
        'target_type' => ['required', 'in:paper,industry_news,teacher'],
        'request_url' => ['required', 'url', 'max:512'],
        'params' => ['nullable', 'array'],
    ]);

    $params = $data['params'] ?? [];

    // `params.*` is not validated, so `keyword` may be an array or another
    // non-scalar; treat that as "no keyword" instead of casting it to string.
    $rawKeyword = $params['keyword'] ?? '';
    $keyword = is_scalar($rawKeyword) ? (string) $rawKeyword : '';

    if ($data['target_type'] !== 'teacher' && CrawlKeywordParser::parse($keyword) === []) {
        return $this->fail('请填写至少一个搜索关键词', 422);
    }

    $source = $resolver->resolve($data['request_url'], $data['target_type']);
    if (! $source) {
        return $this->fail('无法识别该地址对应的采集源', 422);
    }

    $job = CrawlJob::query()->create([
        'target_type' => $data['target_type'],
        'request_url' => $data['request_url'],
        'platform_url' => $data['request_url'],
        'keyword' => $keyword,
        'params' => $params,
        'crawl_source_id' => $source->id,
        'adapter_code' => $source->adapter_code,
        'admin_user_id' => $request->user()?->id,
        'status' => 'pending',
    ]);

    try {
        $job = $runner->run($job, $source, $params);
    } catch (\Throwable $e) {
        // Keep the full exception (with stack trace) in the application's
        // error reporting; the job row only stores the message.
        report($e);

        $job->update([
            'status' => 'failed',
            'result_summary' => '抓取失败:'.$e->getMessage(),
            'completed_at' => now(),
        ]);

        return $this->fail('抓取任务失败:'.$e->getMessage(), 500, ['id' => $job->id]);
    }

    return $this->ok($this->serializeJob($job->fresh(['crawlSource'])), '抓取完成,请预览并勾选入库');
}
|
|
|
|
/**
 * Import previewed items of a crawl job through the import service.
 *
 * Accepts an optional list of item ids, a `select_all` flag, and optional
 * `teacher_defaults` (university, city, research directions). Responds with
 * the imported / skipped / failed figures reported by the service plus the
 * job's refreshed `items_imported` value.
 */
public function import(Request $request, int $crawlJob, CrawlImportService $importService): JsonResponse
{
    $job = CrawlJob::query()->findOrFail($crawlJob);

    $rules = [
        'item_ids' => ['nullable', 'array'],
        'item_ids.*' => ['integer'],
        'select_all' => ['nullable', 'boolean'],
        'teacher_defaults' => ['nullable', 'array'],
        'teacher_defaults.university_id' => ['nullable', 'integer', 'exists:universities,id'],
        'teacher_defaults.city' => ['nullable', 'string', 'max:64'],
        'teacher_defaults.research_direction_ids' => ['nullable', 'array'],
        'teacher_defaults.research_direction_ids.*' => ['integer', 'exists:research_directions,id'],
    ];
    $data = $request->validate($rules);

    $itemIds = $data['item_ids'] ?? null;
    $selectAll = (bool) ($data['select_all'] ?? false);
    $teacherDefaults = $data['teacher_defaults'] ?? [];

    $result = $importService->import($job, $itemIds, $selectAll, $teacherDefaults);

    $summary = [
        'imported' => $result['imported'],
        'skipped' => $result['skipped'],
        'failed' => $result['failed'],
        // Re-read the job so the counter reflects what the service just wrote.
        'items_imported' => $job->fresh()->items_imported,
    ];

    return $this->ok($summary, "已入库 {$result['imported']} 条");
}
|
|
|
|
/**
 * Set the news category of a single crawled item.
 *
 * Only items of the given job whose `target_type` is `news` can be updated.
 * The category must be an enabled (`status` = 1) dict item belonging to the
 * enabled `news_category` dict type; a 422 failure is returned when that dict
 * type is not configured. The chosen id and label are written under
 * `payload.extra`.
 */
public function updateItem(Request $request, int $crawlJob, int $item): JsonResponse
{
    // Existence check only: an unknown job id stops here.
    CrawlJob::query()->findOrFail($crawlJob);

    $target = CrawlJobItem::query()
        ->where('crawl_job_id', $crawlJob)
        ->where('target_type', 'news')
        ->findOrFail($item);

    $categoryTypeId = DictType::query()
        ->where('code', 'news_category')
        ->where('status', 1)
        ->value('id');

    if (! $categoryTypeId) {
        return $this->fail('资讯分类字典未配置', 422);
    }

    $categoryRule = \Illuminate\Validation\Rule::exists('dict_items', 'id')->where(
        fn ($q) => $q->where('dict_type_id', $categoryTypeId)->where('status', 1)
    );

    $data = $request->validate([
        'category_dict_item_id' => ['required', 'integer', $categoryRule],
    ]);

    $category = DictItem::query()->findOrFail($data['category_dict_item_id']);

    // Merge the category into payload.extra without disturbing other keys.
    $payload = $target->payload ?? [];
    $payload['extra'] ??= [];
    $payload['extra']['category_dict_item_id'] = (int) $category->id;
    $payload['extra']['category_label'] = $category->label;

    $target->update(['payload' => $payload]);

    return $this->ok($this->serializeItem($target->fresh()), '已更新分类');
}
|
|
|
|
/**
 * Build the API representation of a crawl job, including preview counters.
 *
 * The preview counters (total and per target type) come from a single
 * grouped COUNT over the job's items in `preview` status rather than one
 * query per counter.
 *
 * @return array<string, mixed>
 */
protected function serializeJob(CrawlJob $job): array
{
    // target_type => number of preview items; types with no rows are absent.
    $previewByType = CrawlJobItem::query()
        ->where('crawl_job_id', $job->id)
        ->where('status', 'preview')
        ->selectRaw('target_type, COUNT(*) as aggregate')
        ->groupBy('target_type')
        ->pluck('aggregate', 'target_type')
        // Some drivers return COUNT(*) as a numeric string.
        ->map(static fn ($n): int => (int) $n);

    return [
        'id' => $job->id,
        'target_type' => $job->target_type,
        'request_url' => $job->request_url ?? $job->platform_url,
        'platform_url' => $job->platform_url,
        'keyword' => $job->keyword,
        'params' => $job->params,
        'status' => $job->status,
        'source_name' => $job->crawlSource?->name,
        'adapter_code' => $job->adapter_code,
        'items_fetched' => $job->items_fetched ?? $job->papers_created,
        'items_imported' => $job->items_imported ?? 0,
        'papers_created' => $job->papers_created,
        'preview_count' => (int) $previewByType->sum(),
        'preview_paper_count' => $previewByType->get('paper', 0),
        'preview_teacher_lead_count' => $previewByType->get('teacher_lead', 0),
        'preview_teacher_count' => $previewByType->get('teacher', 0),
        'result_summary' => $job->result_summary,
        'completed_at' => $job->completed_at?->toIso8601String(),
    ];
}
|
|
|
|
/**
 * Flatten a crawl job item and its payload into the API representation.
 *
 * Lead-author fields are only filled when `payload.lead_author` is an array;
 * `school_name` falls back to the lead author's university name. Category
 * fields are read from `payload.extra`. `selectable` and `is_duplicate` are
 * derived from the item's status.
 *
 * @return array<string, mixed>
 */
protected function serializeItem(CrawlJobItem $item): array
{
    $payload = $item->payload ?? [];

    // Normalise once so every lead-author lookup below is a plain array read.
    $rawLead = $payload['lead_author'] ?? null;
    $leadInfo = is_array($rawLead) ? $rawLead : [];

    $extra = $payload['extra'] ?? [];
    $categoryId = $extra['category_dict_item_id'] ?? null;

    $status = $item->status;

    return [
        'id' => $item->id,
        'external_id' => $item->external_id,
        'title' => $item->title,
        'authors' => $payload['authors'] ?? null,
        'school_name' => $payload['school_name'] ?? $leadInfo['university_name'] ?? null,
        'published_at' => $this->formatPublishedAt($payload['published_at'] ?? null),
        'lead_author_name' => $leadInfo['name'] ?? null,
        'lead_author_email' => $leadInfo['email'] ?? null,
        'lead_author_affiliation' => $leadInfo['affiliation'] ?? null,
        'lead_author_university' => $leadInfo['university_name'] ?? null,
        'paper_title' => $payload['paper_title'] ?? null,
        'paper_external_id' => $payload['paper_external_id'] ?? null,
        'url' => $item->canonical_url,
        'summary' => $payload['summary'] ?? null,
        'content_html' => $payload['content_html'] ?? null,
        'section' => $payload['section'] ?? null,
        'category_dict_item_id' => $categoryId === null ? null : (int) $categoryId,
        'category_label' => $extra['category_label'] ?? null,
        'status' => $status,
        'target_type' => $item->target_type,
        'source_name' => $item->source_name,
        'selectable' => $status === 'preview',
        'is_duplicate' => $status === 'duplicate',
    ];
}
|
|
|
|
/**
 * Normalise a crawled "published at" value for display.
 *
 * - Falsy input (null, '', 0, '0', []) yields null.
 * - A DateTimeInterface is formatted as `Y-m-d`.
 * - A value whose string form starts with `YYYY-MM-DD` is cut to that date.
 * - Any other scalar or Stringable value is returned as its string form.
 * - Values that cannot be converted to a string (e.g. arrays) yield null
 *   instead of triggering a conversion warning or error.
 */
protected function formatPublishedAt(mixed $value): ?string
{
    if (! $value) {
        return null;
    }

    if ($value instanceof \DateTimeInterface) {
        return $value->format('Y-m-d');
    }

    if (! is_scalar($value) && ! $value instanceof \Stringable) {
        return null;
    }

    $str = (string) $value;

    // The pattern matches exactly ten characters, so $m[0] is already the date.
    if (preg_match('/^\d{4}-\d{2}-\d{2}/', $str, $m) === 1) {
        return $m[0];
    }

    return $str;
}
|
|
}
|