You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

390 lines
13 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
namespace App\Services\Crawl;
use App\Models\CrawlJob;
use App\Models\CrawlJobItem;
use App\Models\DictItem;
use App\Models\DictType;
use App\Models\News;
use App\Models\Paper;
use App\Models\Teacher;
use App\Models\University;
use Carbon\Carbon;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Log;
class CrawlImportService
{
/**
 * Import the previewed items of a crawl job into their target tables.
 *
 * Only items belonging to $job whose status is "preview" are considered. The
 * selection is narrowed to $itemIds only when $selectAll is false AND $itemIds
 * is a non-empty array; in every other case (including null or []) all preview
 * items of the job are processed.
 *
 * Each item is dispatched on its target_type: "paper" to importPaper(),
 * "teacher_lead" / "teacher" to importTeacher(), anything else to importNews().
 * An importer returning an id marks the item "imported"; a falsy return counts
 * as skipped (the item is set to "skipped" unless the importer already moved it
 * away from "preview"). An exception counts the item as failed and is logged.
 *
 * Every item runs in its own nested transaction, so a failure rolls back only
 * that item's partial writes and cannot leave the surrounding transaction in an
 * aborted state for the items that follow.
 *
 * @param array<int>|null $itemIds
 * @param array{university_id?:int,city?:string,research_direction_ids?:int[]} $teacherDefaults
 * @param array{source?:string, category_dict_item_id?:int} $newsDefaults
 * @return array{imported:int, skipped:int, failed:int}
 */
public function import(
    CrawlJob $job,
    ?array $itemIds = null,
    bool $selectAll = false,
    array $teacherDefaults = [],
    array $newsDefaults = [],
): array {
    $query = CrawlJobItem::query()
        ->where('crawl_job_id', $job->id)
        ->where('status', 'preview');
    if (! $selectAll && $itemIds !== null && $itemIds !== []) {
        $query->whereIn('id', $itemIds);
    }
    $items = $query->get();

    $imported = 0;
    $skipped = 0;
    $failed = 0;

    DB::transaction(function () use ($items, $job, $teacherDefaults, $newsDefaults, &$imported, &$skipped, &$failed) {
        foreach ($items as $item) {
            try {
                // Nested transaction: isolates this item's writes so they are
                // undone together if anything inside throws.
                $wasImported = DB::transaction(function () use ($job, $item, $teacherDefaults, $newsDefaults): bool {
                    $id = match ($item->target_type) {
                        'paper' => $this->importPaper($job, $item),
                        'teacher_lead', 'teacher' => $this->importTeacher($job, $item, $teacherDefaults),
                        default => $this->importNews($job, $item, $newsDefaults),
                    };

                    if ($id) {
                        $item->update(['status' => 'imported', 'target_id' => $id]);

                        return true;
                    }

                    // Importers flag duplicates themselves; only untouched items become "skipped".
                    if ($item->status === 'preview') {
                        $item->update(['status' => 'skipped']);
                    }

                    return false;
                });

                if ($wasImported) {
                    $imported++;
                } else {
                    $skipped++;
                }
            } catch (\Throwable $e) {
                $failed++;
                Log::warning('crawl_import_item_failed', [
                    'crawl_job_id' => $job->id,
                    'item_id' => $item->id,
                    'target_type' => $item->target_type,
                    'message' => $e->getMessage(),
                ]);
            }
        }

        // Recount from the table so the figure covers earlier import runs as well.
        $job->update([
            'items_imported' => CrawlJobItem::query()
                ->where('crawl_job_id', $job->id)
                ->where('status', 'imported')
                ->count(),
        ]);
    });

    return compact('imported', 'skipped', 'failed');
}
/**
 * Create a Paper from a crawl item unless the same crawled paper already exists.
 *
 * Duplicate detection is limited to papers with source "crawl". It matches on
 * the item's external id when one is present and otherwise falls back to the
 * canonical URL. An item with neither is never treated as a duplicate: comparing
 * against a null external id would match every crawled paper that lacks one.
 *
 * @return int|null id of the created paper, or null when the item was marked "duplicate"
 */
protected function importPaper(CrawlJob $job, CrawlJobItem $item): ?int
{
    $payload = $item->payload ?? [];
    $externalId = $item->external_id;
    $url = $item->canonical_url;

    $duplicateQuery = null;
    if ($externalId !== null && $externalId !== '') {
        $duplicateQuery = Paper::query()
            ->where('source', 'crawl')
            ->where('external_id', $externalId);
    } elseif ($url !== null && $url !== '') {
        $duplicateQuery = Paper::query()
            ->where('source', 'crawl')
            ->where('url', $url);
    }

    if ($duplicateQuery !== null && $duplicateQuery->exists()) {
        $item->update(['status' => 'duplicate']);

        return null;
    }

    $paper = Paper::query()->create([
        'title' => $item->title,
        'authors' => $payload['authors'] ?? null,
        'school_name' => $payload['school_name'] ?? null,
        'published_at' => $payload['published_at'] ?? null,
        'url' => $url,
        'summary' => $payload['summary'] ?? null,
        'source' => 'crawl',
        'external_id' => $externalId,
        'source_site' => $item->source_name,
        'crawl_job_id' => $job->id,
    ]);

    return $paper->id;
}
/**
 * Create a Teacher from a crawl item, skipping blank names and duplicates.
 *
 * The person is read from payload "lead_author"; when that is not an array the
 * item title is used as the name and payload "school_name" as the university
 * name. Returns null when the name is blank (item left untouched) or when the
 * teacher is judged to exist already (item marked "duplicate").
 *
 * @param array{university_id?:int,city?:string,research_direction_ids?:int[]} $defaults
 * @return int|null id of the created teacher, or null when nothing was created
 * @throws \RuntimeException when the teacher source or status dictionary entry cannot be resolved
 */
protected function importTeacher(CrawlJob $job, CrawlJobItem $item, array $defaults = []): ?int
{
    $data = $item->payload ?? [];

    $author = $data['lead_author'] ?? null;
    if (! is_array($author)) {
        // No structured author: fall back to the item itself.
        $author = [
            'name' => $item->title,
            'email' => null,
            'affiliation' => null,
            'university_name' => $data['school_name'] ?? null,
        ];
    }

    $name = trim((string) ($author['name'] ?? ''));
    if ($name === '') {
        return null;
    }

    $email = CrawlAuthorParser::normalizeEmail($author['email'] ?? null);
    if ($email && Teacher::query()->where('email', $email)->exists()) {
        $item->update(['status' => 'duplicate']);

        return null;
    }

    $academicTitle = trim((string) ($author['academic_title'] ?? $data['academic_title'] ?? ''));
    $collegeName = trim((string) ($author['college'] ?? $author['affiliation'] ?? $data['college_name'] ?? ''));
    $profileUrl = trim((string) ($data['profile_url'] ?? $item->canonical_url ?? ''));
    $leadUniversityName = trim((string) ($author['university_name'] ?? ''));

    // Caller-supplied defaults win; the crawled university name is only a fallback.
    $universityId = isset($defaults['university_id']) ? (int) $defaults['university_id'] : null;
    $city = isset($defaults['city']) ? trim((string) $defaults['city']) : null;
    if (! $universityId && $leadUniversityName !== '') {
        $universityId = $this->resolveUniversityId($leadUniversityName);
    }
    if ($universityId && ! $city) {
        $city = University::query()->whereKey($universityId)->value('city');
    }

    $isDuplicate = $this->teacherAlreadyExists(
        $name,
        $email,
        $profileUrl,
        $universityId,
        $leadUniversityName,
        $collegeName,
    );
    if ($isDuplicate) {
        $item->update(['status' => 'duplicate']);

        return null;
    }

    $sourceId = $this->resolveTeacherSourceId($item->target_type);
    $statusId = $this->defaultTeacherStatusId();
    if (! ($sourceId && $statusId)) {
        throw new \RuntimeException('老师库字典未配置');
    }

    // The remark is the origin label, directly followed by the profile URL when there is one.
    $remark = match ($item->target_type) {
        'teacher_lead' => '论文库入库',
        'teacher' => '高校抓取入库',
        default => '爬虫入库',
    };
    if ($profileUrl !== '') {
        $remark .= '主页:'.$profileUrl;
    }

    // Free-text university is only stored when no University record was matched.
    $universityText = match (true) {
        (bool) $universityId => null,
        $leadUniversityName !== '' && $collegeName !== '' => $leadUniversityName.' · '.$collegeName,
        $leadUniversityName !== '' => $leadUniversityName,
        $collegeName !== '' => $collegeName,
        default => null,
    };

    $teacher = Teacher::query()->create([
        'name' => $name,
        'university_id' => $universityId,
        'university_text' => $universityText,
        'department' => $collegeName !== '' ? $collegeName : null,
        'city' => $city ?: '待补充',
        'title' => $academicTitle !== '' ? $academicTitle : '待补充',
        'email' => $email,
        'source_dict_item_id' => $sourceId,
        'status_dict_item_id' => $statusId,
        'remark' => $remark,
    ]);

    $directionIds = $defaults['research_direction_ids'] ?? null;
    if (is_array($directionIds) && $directionIds !== []) {
        $teacher->researchDirections()->sync($directionIds);
    }

    // Link the teacher to the crawled paper they were extracted from, if it was imported.
    $paperExternalId = $data['paper_external_id'] ?? null;
    if ($paperExternalId) {
        $linkedPaper = Paper::query()
            ->where('external_id', $paperExternalId)
            ->where('source', 'crawl')
            ->first();
        if ($linkedPaper) {
            $teacher->papers()->syncWithoutDetaching([$linkedPaper->id]);
        }
    }

    return $teacher->id;
}
/**
 * Create a News record (status 0) from a crawl item.
 *
 * An item whose canonical URL already exists as a news source_url is marked
 * "duplicate" and null is returned.
 *
 * Category is resolved in order: payload extra "category_dict_item_id", then
 * $newsDefaults, then NewsCategoryMatcher; it is stored as null when none yields
 * a positive id. The source label is the first non-blank value among payload
 * extra "import_source", $newsDefaults "source" and the item's source_name, with
 * a fixed fallback label last. Blank values are skipped so that an empty
 * per-item value does not hide the caller-supplied default.
 *
 * @param array{source?:string, category_dict_item_id?:int} $newsDefaults
 * @return int|null id of the created news record, or null for a duplicate
 * @throws \RuntimeException when the item title is blank
 */
protected function importNews(CrawlJob $job, CrawlJobItem $item, array $newsDefaults = []): ?int
{
    $url = $item->canonical_url;
    if ($url && News::query()->where('source_url', $url)->exists()) {
        $item->update(['status' => 'duplicate']);

        return null;
    }

    $title = trim((string) $item->title);
    if ($title === '') {
        throw new \RuntimeException('标题为空,无法入库');
    }

    $payload = $item->payload ?? [];
    $extra = $payload['extra'] ?? [];

    $categoryId = isset($extra['category_dict_item_id']) ? (int) $extra['category_dict_item_id'] : 0;
    if ($categoryId <= 0 && ! empty($newsDefaults['category_dict_item_id'])) {
        $categoryId = (int) $newsDefaults['category_dict_item_id'];
    }
    if ($categoryId <= 0) {
        $categoryId = (int) (app(NewsCategoryMatcher::class)->resolveCategoryId(
            $item->title,
            $payload['summary'] ?? null,
            $extra['keywords'] ?? CrawlKeywordParser::parse((string) ($job->keyword ?? '')),
        ) ?? 0);
    }
    if ($categoryId <= 0) {
        $categoryId = null;
    }

    $content = app(NewsHtmlImageLocalizer::class)->localize(
        $payload['content_html'] ?? '',
        $url
    ) ?? '';
    if ($content === '') {
        // Placeholder body so editors can see the article still needs content.
        $content = '<p>(爬虫抓取,请编辑正文)</p>';
    }

    // First non-blank candidate wins; `??` alone would stop at an empty string.
    $importSource = '';
    foreach ([$extra['import_source'] ?? null, $newsDefaults['source'] ?? null] as $candidate) {
        $candidate = trim((string) ($candidate ?? ''));
        if ($candidate !== '') {
            $importSource = $candidate;
            break;
        }
    }
    if ($importSource === '') {
        $importSource = $item->source_name ?: '爬虫采集';
    }

    $news = News::query()->create([
        'title' => $title,
        'category_dict_item_id' => $categoryId,
        'source' => $importSource,
        'source_url' => $url,
        'source_site' => $importSource,
        'crawl_job_id' => $job->id,
        'summary' => $payload['summary'] ?? null,
        'content_html' => $content,
        'status' => 0,
        'published_at' => $this->resolvePublishedAt($payload['published_at'] ?? null),
    ]);

    return $news->id;
}
/**
 * Turn a crawled publication date into a Carbon instance, never throwing.
 *
 * Resolution order: Carbon::parse() on the raw value, then
 * HtmlCrawlSupport::normalizeDate() with midnight appended. The current time
 * is returned when the value is missing, blank, not a scalar or date object,
 * or cannot be parsed by either step.
 *
 * @param mixed $value raw "published_at" value taken from the crawl payload
 */
protected function resolvePublishedAt(mixed $value): Carbon
{
    if (! $value instanceof \DateTimeInterface) {
        // Arrays, objects and booleans cannot be parsed and would raise on the string cast below.
        if (! is_scalar($value) || is_bool($value) || trim((string) $value) === '') {
            return now();
        }
    }

    try {
        return Carbon::parse($value);
    } catch (\Throwable) {
        // Not directly parseable; try the crawler's own normaliser next.
    }

    try {
        $normalized = HtmlCrawlSupport::normalizeDate((string) $value);

        return $normalized ? Carbon::parse($normalized.' 00:00:00') : now();
    } catch (\Throwable) {
        // A bad date must not fail the whole item.
        return now();
    }
}
/**
 * Decide whether a crawled teacher is already present in the teacher table.
 *
 * Exactly one criterion is used, in priority order:
 *  1. email, when given: exact match on the email column;
 *  2. profile URL, when non-empty: the URL appears inside a teacher's remark;
 *  3. otherwise name, narrowed by department (when given) and by university id
 *     or, failing that, by a university_text equal to or starting with the
 *     crawled university name.
 *
 * LIKE patterns are escaped so "%" and "_" in crawled text (common in
 * percent-encoded URLs) match literally instead of acting as wildcards. Null
 * $profileUrl / $collegeName are treated as empty.
 */
protected function teacherAlreadyExists(
    string $name,
    ?string $email,
    ?string $profileUrl,
    ?int $universityId,
    string $leadUniversityName,
    ?string $collegeName,
): bool {
    if ($email) {
        // With an email present it is the only criterion.
        return Teacher::query()->where('email', $email)->exists();
    }

    $profileUrl = (string) $profileUrl;
    $collegeName = (string) $collegeName;

    // "!" is used as the LIKE escape character because, unlike backslash, it
    // needs no extra quoting inside the SQL string literal.
    $escapeLike = static fn (string $text): string => strtr($text, [
        '!' => '!!',
        '%' => '!%',
        '_' => '!_',
    ]);

    if ($profileUrl !== '') {
        return Teacher::query()
            ->whereRaw("remark like ? escape '!'", ['%'.$escapeLike($profileUrl).'%'])
            ->exists();
    }

    $dupQuery = Teacher::query()->where('name', $name);
    if ($collegeName !== '') {
        $dupQuery->where('department', $collegeName);
    }
    if ($universityId) {
        $dupQuery->where('university_id', $universityId);
    } elseif ($leadUniversityName !== '') {
        $dupQuery->where(function ($q) use ($leadUniversityName, $escapeLike) {
            $q->where('university_text', $leadUniversityName)
                ->orWhereRaw("university_text like ? escape '!'", [$escapeLike($leadUniversityName).'%']);
        });
    }

    return $dupQuery->exists();
}
/**
 * Find the id of the University with status 1 whose name equals the given
 * name after trimming.
 *
 * @return int|null the matching id, or null for a blank name or no match
 */
protected function resolveUniversityId(string $name): ?int
{
    $needle = trim($name);
    if ($needle === '') {
        return null;
    }

    $matchedId = University::query()
        ->where('name', $needle)
        ->where('status', 1)
        ->value('id');

    if (! $matchedId) {
        return null;
    }

    return (int) $matchedId;
}
/**
 * Resolve the "teacher_source" dictionary item id for a crawl target type.
 *
 * Lead authors of paper jobs ("teacher_lead") map to the "paper" source,
 * faculty-list jobs ("teacher") to "faculty_crawl", everything else to "manual".
 *
 * @return int|null null when the dictionary type or the item is not found with status 1
 */
protected function resolveTeacherSourceId(string $targetType): ?int
{
    $sourceValueByTarget = [
        'teacher_lead' => 'paper',
        'teacher' => 'faculty_crawl',
    ];
    $sourceValue = $sourceValueByTarget[$targetType] ?? 'manual';

    $dictTypeId = DictType::query()
        ->where('code', 'teacher_source')
        ->where('status', 1)
        ->value('id');

    return $dictTypeId
        ? DictItem::query()
            ->where('dict_type_id', $dictTypeId)
            ->where('value', $sourceValue)
            ->where('status', 1)
            ->value('id')
        : null;
}
/**
 * Resolve the id of the "active" item in the "teacher_status" dictionary.
 *
 * @return int|null null when the dictionary type or the item is not found with status 1
 */
protected function defaultTeacherStatusId(): ?int
{
    $dictTypeId = DictType::query()
        ->where('code', 'teacher_status')
        ->where('status', 1)
        ->value('id');

    return $dictTypeId
        ? DictItem::query()
            ->where('dict_type_id', $dictTypeId)
            ->where('value', 'active')
            ->where('status', 1)
            ->value('id')
        : null;
}
}