You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

269 lines
9.0 KiB

2 weeks ago
<?php
namespace App\Services\Crawl;
use App\Models\CrawlJob;
use App\Models\CrawlJobItem;
use App\Models\DictItem;
use App\Models\DictType;
use App\Models\News;
use App\Models\Paper;
use App\Models\Teacher;
use App\Models\University;
use Illuminate\Support\Facades\DB;
/**
 * Imports previewed crawl items (papers, teachers, news) into their target tables
 * and records the outcome on each crawl item.
 */
class CrawlImportService
{
    /**
     * Import the previewed items of a crawl job.
     *
     * Only items whose status is `preview` are considered. When `$selectAll` is false and
     * `$itemIds` is a non-empty list, the selection is narrowed to those ids.
     *
     * NOTE(review): when `$selectAll` is false and `$itemIds` is null or empty, every
     * previewed item of the job is imported — confirm this is the intended default.
     *
     * Items are processed best-effort: a failing item is reported, counted as failed and
     * left untouched, and the remaining items are still processed. Items detected as
     * duplicates keep the `duplicate` status and are counted under `skipped`.
     *
     * @param array<int>|null $itemIds
     * @param array{university_id?:int,city?:string,research_direction_ids?:int[]} $teacherDefaults
     * @return array{imported:int, skipped:int, failed:int}
     */
    public function import(
        CrawlJob $job,
        ?array $itemIds = null,
        bool $selectAll = false,
        array $teacherDefaults = [],
    ): array {
        $query = CrawlJobItem::query()
            ->where('crawl_job_id', $job->id)
            ->whereIn('status', ['preview']);

        if (! $selectAll && $itemIds !== null && $itemIds !== []) {
            $query->whereIn('id', $itemIds);
        }

        $items = $query->get();
        $imported = 0;
        $skipped = 0;
        $failed = 0;

        DB::transaction(function () use ($items, $job, $teacherDefaults, &$imported, &$skipped, &$failed) {
            foreach ($items as $item) {
                try {
                    // A nested transaction per item: if the item fails half-way, only its own
                    // partial writes are rolled back and the surrounding transaction stays
                    // usable for the remaining items.
                    $id = DB::transaction(
                        fn (): ?int => $this->importItem($job, $item, $teacherDefaults),
                    );
                } catch (\Throwable $e) {
                    // Still best-effort, but no longer silent: hand the error to the
                    // application's exception handler so it ends up in the logs.
                    report($e);
                    $failed++;

                    continue;
                }

                if ($id) {
                    $imported++;
                } else {
                    $skipped++;
                }
            }

            $job->update([
                'items_imported' => CrawlJobItem::query()
                    ->where('crawl_job_id', $job->id)
                    ->where('status', 'imported')
                    ->count(),
            ]);
        });

        return compact('imported', 'skipped', 'failed');
    }

    /**
     * Import a single item through the handler for its target type and persist the
     * resulting status on the item.
     *
     * @param array{university_id?:int,city?:string,research_direction_ids?:int[]} $teacherDefaults
     * @return int|null id of the created record, or null when nothing was created
     */
    protected function importItem(CrawlJob $job, CrawlJobItem $item, array $teacherDefaults): ?int
    {
        $id = match ($item->target_type) {
            'paper' => $this->importPaper($job, $item),
            'teacher_lead', 'teacher' => $this->importTeacher($job, $item, $teacherDefaults),
            default => $this->importNews($job, $item),
        };

        if ($id) {
            $item->update(['status' => 'imported', 'target_id' => $id]);

            return $id;
        }

        // Handlers mark duplicates themselves; only fall back to "skipped" when the handler
        // left the status alone, so a `duplicate` status is not overwritten.
        if (! $item->wasChanged('status')) {
            $item->update(['status' => 'skipped']);
        }

        return null;
    }

    /**
     * Create a Paper from a crawl item.
     *
     * When the item carries an external id and a crawled paper with that id already
     * exists, the item is marked `duplicate` and nothing is created.
     *
     * @return int|null id of the created paper, or null for a duplicate
     */
    protected function importPaper(CrawlJob $job, CrawlJobItem $item): ?int
    {
        $payload = $item->payload ?? [];
        $externalId = $item->external_id;

        // Without this guard a null id becomes an `IS NULL` comparison, which would match
        // every other crawled paper lacking an external id.
        $hasExternalId = $externalId !== null && $externalId !== '';

        $alreadyImported = $hasExternalId && Paper::query()
            ->where('external_id', $externalId)
            ->where('source', 'crawl')
            ->exists();

        if ($alreadyImported) {
            $item->update(['status' => 'duplicate']);

            return null;
        }

        $paper = Paper::query()->create([
            'title' => $item->title,
            'authors' => $payload['authors'] ?? null,
            'school_name' => $payload['school_name'] ?? null,
            'published_at' => $payload['published_at'] ?? null,
            'url' => $item->canonical_url,
            'summary' => $payload['summary'] ?? null,
            'source' => 'crawl',
            'external_id' => $externalId,
            'source_site' => $item->source_name,
            'crawl_job_id' => $job->id,
        ]);

        return $paper->id;
    }

    /**
     * Create a Teacher from a crawl item.
     *
     * The teacher data comes from the payload's `lead_author` entry; when that entry is
     * not an array, the item title is used as the name and the payload's `school_name`
     * as the university name. Returns null when the name is empty, or when a teacher
     * with the same normalized e-mail already exists (the item is then marked `duplicate`).
     *
     * @param array{university_id?:int,city?:string,research_direction_ids?:int[]} $defaults
     * @return int|null id of the created teacher, or null when nothing was created
     *
     * @throws \RuntimeException when the teacher source/status dictionary entries are missing
     */
    protected function importTeacher(CrawlJob $job, CrawlJobItem $item, array $defaults = []): ?int
    {
        $payload = $item->payload ?? [];
        $lead = $payload['lead_author'] ?? null;
        if (! is_array($lead)) {
            $lead = [
                'name' => $item->title,
                'email' => null,
                'affiliation' => null,
                'university_name' => $payload['school_name'] ?? null,
            ];
        }

        $name = trim((string) ($lead['name'] ?? ''));
        if ($name === '') {
            return null;
        }

        $email = CrawlAuthorParser::normalizeEmail($lead['email'] ?? null);
        if ($email && Teacher::query()->where('email', $email)->exists()) {
            $item->update(['status' => 'duplicate']);

            return null;
        }

        // Caller-supplied defaults take precedence over values derived from the payload.
        $universityId = isset($defaults['university_id']) ? (int) $defaults['university_id'] : null;
        $city = isset($defaults['city']) ? trim((string) $defaults['city']) : null;
        if ($universityId && ! $city) {
            $city = University::query()->whereKey($universityId)->value('city');
        }

        $leadUniversityName = trim((string) ($lead['university_name'] ?? ''));
        if (! $universityId && $leadUniversityName !== '') {
            $universityId = $this->resolveUniversityId($leadUniversityName);
            if ($universityId) {
                $city = $city ?: University::query()->whereKey($universityId)->value('city');
            }
        }

        $sourceId = $this->resolveTeacherSourceId($item->target_type);
        $statusId = $this->defaultTeacherStatusId();
        if (! $sourceId || ! $statusId) {
            throw new \RuntimeException('老师库字典未配置');
        }

        $teacher = Teacher::query()->create([
            'name' => $name,
            'university_id' => $universityId,
            // Keep the raw university name only when it could not be resolved to a record.
            'university_text' => $universityId ? null : ($leadUniversityName !== '' ? $leadUniversityName : null),
            'city' => $city ?: '待补充',
            'title' => '待补充',
            'email' => $email,
            'source_dict_item_id' => $sourceId,
            'status_dict_item_id' => $statusId,
            'remark' => $item->target_type === 'teacher_lead' ? '论文库入库' : '爬虫入库',
        ]);

        if (! empty($defaults['research_direction_ids']) && is_array($defaults['research_direction_ids'])) {
            $teacher->researchDirections()->sync($defaults['research_direction_ids']);
        }

        // Link the teacher to the crawled paper it was extracted from, if that paper exists.
        $paperExternalId = $payload['paper_external_id'] ?? null;
        if ($paperExternalId) {
            $paper = Paper::query()
                ->where('external_id', $paperExternalId)
                ->where('source', 'crawl')
                ->first();
            if ($paper) {
                $teacher->papers()->syncWithoutDetaching([$paper->id]);
            }
        }

        return $teacher->id;
    }

    /**
     * Create a News record from a crawl item.
     *
     * When a news record with the item's canonical URL already exists, the item is marked
     * `duplicate` and nothing is created. The category comes from the payload's
     * `extra.category_dict_item_id` when present, otherwise from NewsCategoryMatcher.
     *
     * @return int|null id of the created news record, or null for a duplicate
     */
    protected function importNews(CrawlJob $job, CrawlJobItem $item): ?int
    {
        $url = $item->canonical_url;
        if ($url && News::query()->where('source_url', $url)->exists()) {
            $item->update(['status' => 'duplicate']);

            return null;
        }

        $payload = $item->payload ?? [];
        $extra = $payload['extra'] ?? [];
        if (! is_array($extra)) {
            // Treat a malformed `extra` entry as absent.
            $extra = [];
        }

        $categoryId = isset($extra['category_dict_item_id'])
            ? (int) $extra['category_dict_item_id']
            : app(NewsCategoryMatcher::class)->resolveCategoryId(
                $item->title,
                $payload['summary'] ?? null,
                $extra['keywords'] ?? CrawlKeywordParser::parse((string) ($job->keyword ?? '')),
            );

        $content = PedailyContentNormalizer::normalize($payload['content_html'] ?? '') ?? '';
        $content = app(NewsHtmlImageLocalizer::class)->localize($content) ?? $content;
        if ($content === '') {
            // Placeholder body so the record can be saved and edited later.
            $content = '<p>(爬虫抓取,请编辑正文)</p>';
        }

        $news = News::query()->create([
            'title' => $item->title,
            'category_dict_item_id' => $categoryId,
            'source' => $item->source_name ?: '投资界',
            'source_url' => $url,
            'source_site' => $item->source_name,
            'crawl_job_id' => $job->id,
            // NOTE(review): the payload summary is used for category matching above but is
            // not stored here — confirm that leaving the summary empty is intended.
            'summary' => null,
            'content_html' => $content,
            'status' => 0,
            'published_at' => $payload['published_at'] ?? now(),
        ]);

        return $news->id;
    }

    /**
     * Find the id of the university with status 1 whose name matches exactly
     * (after trimming).
     *
     * @return int|null the university id, or null when the name is empty or unknown
     */
    protected function resolveUniversityId(string $name): ?int
    {
        $name = trim($name);
        if ($name === '') {
            return null;
        }

        $id = University::query()
            ->where('name', $name)
            ->where('status', 1)
            ->value('id');

        return $id ? (int) $id : null;
    }

    /**
     * Resolve the `teacher_source` dictionary item for a target type:
     * `teacher_lead` maps to the `paper` source value, any other type to `manual`.
     *
     * @return int|null the dictionary item id, or null when it is not configured
     */
    protected function resolveTeacherSourceId(string $targetType): ?int
    {
        $value = $targetType === 'teacher_lead' ? 'paper' : 'manual';

        return $this->findDictItemId('teacher_source', $value);
    }

    /**
     * Resolve the `active` item of the `teacher_status` dictionary.
     *
     * @return int|null the dictionary item id, or null when it is not configured
     */
    protected function defaultTeacherStatusId(): ?int
    {
        return $this->findDictItemId('teacher_status', 'active');
    }

    /**
     * Look up a dictionary item with status 1 by its dictionary type code and value.
     *
     * @return int|null the item id, or null when the type or the item is missing
     */
    protected function findDictItemId(string $typeCode, string $value): ?int
    {
        $typeId = DictType::query()->where('code', $typeCode)->where('status', 1)->value('id');
        if (! $typeId) {
            return null;
        }

        $itemId = DictItem::query()
            ->where('dict_type_id', $typeId)
            ->where('value', $value)
            ->where('status', 1)
            ->value('id');

        return $itemId !== null ? (int) $itemId : null;
    }
}