|
|
<?php
|
|
|
|
|
|
namespace App\Services\Crawl;
|
|
|
|
|
|
use App\Models\CrawlJob;
|
|
|
use App\Models\CrawlJobItem;
|
|
|
use App\Models\DictItem;
|
|
|
use App\Models\DictType;
|
|
|
use App\Models\News;
|
|
|
use App\Models\Paper;
|
|
|
use App\Models\Teacher;
|
|
|
use App\Models\University;
|
|
|
use Carbon\Carbon;
|
|
|
use Illuminate\Support\Facades\DB;
|
|
|
use Illuminate\Support\Facades\Log;
|
|
|
|
|
|
class CrawlImportService
|
|
|
{
|
|
|
/**
 * Import the previewed items of a crawl job into their target tables.
 *
 * Only items belonging to $job whose status is "preview" are considered.
 * When $selectAll is false and $itemIds is a non-empty list, the selection is
 * narrowed to those ids; in every other case all preview items are processed.
 *
 * Each item is dispatched on its target_type: "paper" goes to importPaper(),
 * "teacher_lead" / "teacher" go to importTeacher(), anything else goes to
 * importNews(). An importer returning an id marks the item "imported"; a
 * falsy result counts as skipped (the item is set to "skipped" unless the
 * importer already changed its status, e.g. to "duplicate").
 *
 * Every item runs inside its own nested transaction. An exception therefore
 * rolls back only that item's partial writes and cannot leave the surrounding
 * transaction in a failed state; the item is counted as failed, logged, and
 * its status is left untouched.
 *
 * @param array<int>|null $itemIds
 * @param array{university_id?:int,city?:string,research_direction_ids?:int[]} $teacherDefaults
 * @param array{source?:string, category_dict_item_id?:int} $newsDefaults
 * @return array{imported:int, skipped:int, failed:int}
 */
public function import(
    CrawlJob $job,
    ?array $itemIds = null,
    bool $selectAll = false,
    array $teacherDefaults = [],
    array $newsDefaults = [],
): array {
    $query = CrawlJobItem::query()
        ->where('crawl_job_id', $job->id)
        ->whereIn('status', ['preview']);

    if (! $selectAll && $itemIds !== null && $itemIds !== []) {
        $query->whereIn('id', $itemIds);
    }

    $items = $query->get();

    return DB::transaction(function () use ($items, $job, $teacherDefaults, $newsDefaults): array {
        $imported = 0;
        $skipped = 0;
        $failed = 0;

        foreach ($items as $item) {
            try {
                // Nested transaction: the framework wraps it in a savepoint, so a
                // failure undoes just this item instead of poisoning the whole batch.
                $wasImported = DB::transaction(function () use ($job, $item, $teacherDefaults, $newsDefaults): bool {
                    $id = match ($item->target_type) {
                        'paper' => $this->importPaper($job, $item),
                        'teacher_lead', 'teacher' => $this->importTeacher($job, $item, $teacherDefaults),
                        default => $this->importNews($job, $item, $newsDefaults),
                    };

                    if ($id) {
                        $item->update(['status' => 'imported', 'target_id' => $id]);

                        return true;
                    }

                    // Importers may already have flagged the item (e.g. "duplicate");
                    // only items still in preview are downgraded to "skipped".
                    if ($item->status === 'preview') {
                        $item->update(['status' => 'skipped']);
                    }

                    return false;
                });

                if ($wasImported) {
                    $imported++;
                } else {
                    $skipped++;
                }
            } catch (\Throwable $e) {
                $failed++;
                Log::warning('crawl_import_item_failed', [
                    'crawl_job_id' => $job->id,
                    'item_id' => $item->id,
                    'target_type' => $item->target_type,
                    'message' => $e->getMessage(),
                ]);
            }
        }

        // Recount from the table so the figure also covers earlier import runs.
        $job->update([
            'items_imported' => CrawlJobItem::query()
                ->where('crawl_job_id', $job->id)
                ->where('status', 'imported')
                ->count(),
        ]);

        return compact('imported', 'skipped', 'failed');
    });
}
|
|
|
|
|
|
/**
 * Create a Paper row from a crawl item unless the same crawled paper exists.
 *
 * Duplicate detection is keyed on external_id among papers whose source is
 * "crawl". It is skipped when the item carries no external id: comparing
 * against a null id would match every crawled paper that also lacks one and
 * wrongly flag unrelated papers as duplicates.
 *
 * @return int|null id of the created paper, or null when the item was marked "duplicate"
 */
protected function importPaper(CrawlJob $job, CrawlJobItem $item): ?int
{
    $payload = $item->payload ?? [];
    $externalId = $item->external_id;

    $hasExternalId = $externalId !== null && $externalId !== '';

    if ($hasExternalId) {
        $alreadyImported = Paper::query()
            ->where('external_id', $externalId)
            ->where('source', 'crawl')
            ->exists();

        if ($alreadyImported) {
            $item->update(['status' => 'duplicate']);

            return null;
        }
    }

    $paper = Paper::query()->create([
        'title' => $item->title,
        'authors' => $payload['authors'] ?? null,
        'school_name' => $payload['school_name'] ?? null,
        'published_at' => $payload['published_at'] ?? null,
        'url' => $item->canonical_url,
        'summary' => $payload['summary'] ?? null,
        'source' => 'crawl',
        'external_id' => $externalId,
        'source_site' => $item->source_name,
        'crawl_job_id' => $job->id,
    ]);

    return $paper->id;
}
|
|
|
|
|
|
/**
 * Create a Teacher from a crawl item unless an equivalent teacher already exists.
 *
 * Author data is read from payload "lead_author" when that entry is an array;
 * otherwise a minimal record is built from the item title and payload
 * "school_name". The university comes from $defaults['university_id'] when
 * given, else it is resolved from the author's university name. Missing city
 * and academic title are stored as the placeholder "待补充".
 *
 * After creation the teacher is attached to $defaults['research_direction_ids']
 * (when non-empty) and linked to the crawled paper named by payload
 * "paper_external_id" (when such a paper exists).
 *
 * @param array{university_id?:int,city?:string,research_direction_ids?:int[]} $defaults
 * @return int|null id of the created teacher; null when the name is empty or
 *                  the item was marked "duplicate"
 * @throws \RuntimeException when the teacher source or status dictionary item cannot be resolved
 */
protected function importTeacher(CrawlJob $job, CrawlJobItem $item, array $defaults = []): ?int
{
    $payload = $item->payload ?? [];

    $author = $payload['lead_author'] ?? null;
    if (! is_array($author)) {
        // No structured author: fall back to what the item itself carries.
        $author = [
            'name' => $item->title,
            'email' => null,
            'affiliation' => null,
            'university_name' => $payload['school_name'] ?? null,
        ];
    }

    $name = trim((string) ($author['name'] ?? ''));
    if ($name === '') {
        return null;
    }

    // Shared exit for every duplicate path: flag the item and report "nothing created".
    $flagDuplicate = static function () use ($item): ?int {
        $item->update(['status' => 'duplicate']);

        return null;
    };

    $email = CrawlAuthorParser::normalizeEmail($author['email'] ?? null);
    if ($email && Teacher::query()->where('email', $email)->exists()) {
        return $flagDuplicate();
    }

    $academicTitle = trim((string) ($author['academic_title'] ?? $payload['academic_title'] ?? ''));
    $collegeName = trim((string) ($author['college'] ?? $author['affiliation'] ?? $payload['college_name'] ?? ''));
    $profileUrl = trim((string) ($payload['profile_url'] ?? $item->canonical_url ?? ''));
    $leadUniversityName = trim((string) ($author['university_name'] ?? ''));

    $universityId = null;
    if (isset($defaults['university_id'])) {
        $universityId = (int) $defaults['university_id'];
    }

    $city = null;
    if (isset($defaults['city'])) {
        $city = trim((string) $defaults['city']);
    }

    $cityOf = static fn (int $id) => University::query()->whereKey($id)->value('city');

    if ($universityId) {
        if (! $city) {
            $city = $cityOf($universityId);
        }
    } elseif ($leadUniversityName !== '') {
        // No default university: try to resolve the one named by the crawler.
        $universityId = $this->resolveUniversityId($leadUniversityName);
        if ($universityId && ! $city) {
            $city = $cityOf($universityId);
        }
    }

    if ($this->teacherAlreadyExists($name, $email, $profileUrl, $universityId, $leadUniversityName, $collegeName)) {
        return $flagDuplicate();
    }

    $sourceId = $this->resolveTeacherSourceId($item->target_type);
    $statusId = $this->defaultTeacherStatusId();
    if (! ($sourceId && $statusId)) {
        throw new \RuntimeException('老师库字典未配置');
    }

    // The remark records where the teacher came from and, when known, the profile URL.
    $remark = match ($item->target_type) {
        'teacher_lead' => '论文库入库',
        'teacher' => '高校抓取入库',
        default => '爬虫入库',
    };
    if ($profileUrl !== '') {
        $remark .= ';'.'主页:'.$profileUrl;
    }

    // Free-text university is only kept when no university record could be linked.
    $universityText = null;
    if (! $universityId) {
        $labels = array_filter(
            [$leadUniversityName, $collegeName],
            static fn (string $label): bool => $label !== '',
        );
        $universityText = $labels === [] ? null : implode(' · ', $labels);
    }

    $teacher = Teacher::query()->create([
        'name' => $name,
        'university_id' => $universityId,
        'university_text' => $universityText,
        'department' => $collegeName === '' ? null : $collegeName,
        'city' => $city ?: '待补充',
        'title' => $academicTitle === '' ? '待补充' : $academicTitle,
        'email' => $email,
        'source_dict_item_id' => $sourceId,
        'status_dict_item_id' => $statusId,
        'remark' => $remark,
    ]);

    $directionIds = $defaults['research_direction_ids'] ?? null;
    if (is_array($directionIds) && $directionIds !== []) {
        $teacher->researchDirections()->sync($defaults['research_direction_ids']);
    }

    $paperExternalId = $payload['paper_external_id'] ?? null;
    $paper = $paperExternalId
        ? Paper::query()
            ->where('external_id', $paperExternalId)
            ->where('source', 'crawl')
            ->first()
        : null;
    if ($paper) {
        $teacher->papers()->syncWithoutDetaching([$paper->id]);
    }

    return $teacher->id;
}
|
|
|
|
|
|
/**
 * Create a News row from a crawl item.
 *
 * An item whose canonical URL already exists as a news source_url is marked
 * "duplicate". The category is taken, in order of preference, from payload
 * extra "category_dict_item_id", from $newsDefaults, and finally from
 * NewsCategoryMatcher; when none yields a positive id the category is null.
 * The body HTML is passed through NewsHtmlImageLocalizer, and an empty result
 * is replaced by a placeholder paragraph. The row is created with status 0.
 *
 * @param array{source?:string, category_dict_item_id?:int} $newsDefaults
 * @return int|null id of the created news row, or null when the item was marked "duplicate"
 * @throws \RuntimeException when the item title is empty
 */
protected function importNews(CrawlJob $job, CrawlJobItem $item, array $newsDefaults = []): ?int
{
    $url = $item->canonical_url;

    $isDuplicate = $url && News::query()->where('source_url', $url)->exists();
    if ($isDuplicate) {
        $item->update(['status' => 'duplicate']);

        return null;
    }

    $title = trim((string) $item->title);
    if ($title === '') {
        throw new \RuntimeException('标题为空,无法入库');
    }

    $payload = $item->payload ?? [];
    $extra = $payload['extra'] ?? [];

    // 1) category supplied by the crawler itself
    $categoryId = 0;
    if (isset($extra['category_dict_item_id'])) {
        $categoryId = (int) $extra['category_dict_item_id'];
    }

    // 2) category supplied by the caller as a default
    if ($categoryId <= 0 && ! empty($newsDefaults['category_dict_item_id'])) {
        $categoryId = (int) $newsDefaults['category_dict_item_id'];
    }

    // 3) category guessed from title / summary / keywords
    if ($categoryId <= 0) {
        $keywords = $extra['keywords'] ?? CrawlKeywordParser::parse((string) ($job->keyword ?? ''));
        $guessed = app(NewsCategoryMatcher::class)->resolveCategoryId(
            $item->title,
            $payload['summary'] ?? null,
            $keywords,
        );
        $categoryId = (int) ($guessed ?? 0);
    }

    $localized = app(NewsHtmlImageLocalizer::class)->localize(
        $payload['content_html'] ?? '',
        $url
    );
    $content = ($localized === null || $localized === '')
        ? '<p>(爬虫抓取,请编辑正文)</p>'
        : $localized;

    $importSource = trim((string) ($extra['import_source'] ?? $newsDefaults['source'] ?? ''));
    if ($importSource === '') {
        $importSource = $item->source_name ?: '爬虫采集';
    }

    $attributes = [
        'title' => $title,
        'category_dict_item_id' => $categoryId > 0 ? $categoryId : null,
        'source' => $importSource,
        'source_url' => $url,
        'source_site' => $importSource,
        'crawl_job_id' => $job->id,
        'summary' => $payload['summary'] ?? null,
        'content_html' => $content,
        'status' => 0,
        'published_at' => $this->resolvePublishedAt($payload['published_at'] ?? null),
    ];

    return News::query()->create($attributes)->id;
}
|
|
|
|
|
|
/**
 * Turn a crawled publication date into a Carbon instance, never failing.
 *
 * Null and empty string yield the current time. Otherwise the value is handed
 * to Carbon::parse(); when that throws, the value is normalised with
 * HtmlCrawlSupport::normalizeDate() and parsed as midnight of that date.
 * Any value that still cannot be interpreted yields the current time,
 * including values that cannot be converted to a string at all (for example
 * an array found in the crawler payload).
 */
protected function resolvePublishedAt(mixed $value): Carbon
{
    if ($value === null || $value === '') {
        return now();
    }

    try {
        return Carbon::parse($value);
    } catch (\Throwable) {
        // Not directly parseable; try the crawler-specific normalisation below.
    }

    // A string cast on arrays / plain objects would raise inside the fallback itself.
    if (! is_scalar($value) && ! $value instanceof \Stringable) {
        return now();
    }

    $normalized = HtmlCrawlSupport::normalizeDate((string) $value);
    if (! $normalized) {
        return now();
    }

    try {
        return Carbon::parse($normalized.' 00:00:00');
    } catch (\Throwable) {
        return now();
    }
}
|
|
|
|
|
|
/**
 * Decide whether a teacher equivalent to the given data is already stored.
 *
 * The first identifier that is available decides the outcome on its own:
 *  1. email: a teacher with the same email exists;
 *  2. profile URL: some teacher's remark contains the URL;
 *  3. otherwise: same name, narrowed by department (when known) and by
 *     university id or, failing that, a university_text equal to or starting
 *     with the crawled university name.
 *
 * Null or empty $profileUrl / $collegeName are treated as "not available".
 *
 * NOTE(review): the URL check is a substring match on remark, so a URL that is
 * a prefix of another stored URL still counts as a match.
 */
protected function teacherAlreadyExists(
    string $name,
    ?string $email,
    ?string $profileUrl,
    ?int $universityId,
    string $leadUniversityName,
    ?string $collegeName,
): bool {
    if ($email) {
        // An email, when present, is the sole criterion.
        return Teacher::query()->where('email', $email)->exists();
    }

    if ($profileUrl !== null && $profileUrl !== '') {
        // URLs routinely contain "%" (percent-encoding) and "_", both LIKE
        // wildcards; escape them so the URL is matched literally. "!" is used as
        // the escape character because it is accepted unchanged by every
        // SQL dialect's ESCAPE clause.
        $literalUrl = strtr($profileUrl, ['!' => '!!', '%' => '!%', '_' => '!_']);

        return Teacher::query()
            ->whereRaw("remark like ? escape '!'", ['%'.$literalUrl.'%'])
            ->exists();
    }

    $dupQuery = Teacher::query()->where('name', $name);

    if ($collegeName !== null && $collegeName !== '') {
        $dupQuery->where('department', $collegeName);
    }

    if ($universityId) {
        $dupQuery->where('university_id', $universityId);
    } elseif ($leadUniversityName !== '') {
        $dupQuery->where(function ($q) use ($leadUniversityName) {
            $q->where('university_text', $leadUniversityName)
                ->orWhere('university_text', 'like', $leadUniversityName.'%');
        });
    }

    return $dupQuery->exists();
}
|
|
|
|
|
|
/**
 * Look up the id of the university with status 1 whose name equals the given
 * name after trimming.
 *
 * @return int|null the university id, or null when the name is blank or nothing matches
 */
protected function resolveUniversityId(string $name): ?int
{
    $needle = trim($name);

    if ($needle === '') {
        return null;
    }

    $found = University::query()
        ->where('name', $needle)
        ->where('status', 1)
        ->value('id');

    if (! $found) {
        return null;
    }

    return (int) $found;
}
|
|
|
|
|
|
/**
 * Resolve the "teacher_source" dictionary item for a crawl target type.
 *
 * Lead author of a paper task ("teacher_lead") maps to "paper"; a faculty-list
 * task ("teacher") maps to "faculty_crawl"; everything else maps to "manual".
 *
 * @return int|null id of the dictionary item with status 1, or null when the
 *                  dictionary type or the item is not found
 */
protected function resolveTeacherSourceId(string $targetType): ?int
{
    $valueByTargetType = [
        'teacher_lead' => 'paper',
        'teacher' => 'faculty_crawl',
    ];
    $value = $valueByTargetType[$targetType] ?? 'manual';

    $typeId = DictType::query()
        ->where('code', 'teacher_source')
        ->where('status', 1)
        ->value('id');

    return $typeId
        ? DictItem::query()
            ->where('dict_type_id', $typeId)
            ->where('value', $value)
            ->where('status', 1)
            ->value('id')
        : null;
}
|
|
|
|
|
|
/**
 * Resolve the dictionary item used as the status of newly imported teachers:
 * the item with value "active" under the "teacher_status" dictionary type.
 *
 * @return int|null id of the dictionary item with status 1, or null when the
 *                  dictionary type or the item is not found
 */
protected function defaultTeacherStatusId(): ?int
{
    $statusTypeId = DictType::query()
        ->where('code', 'teacher_status')
        ->where('status', 1)
        ->value('id');

    if ($statusTypeId) {
        return DictItem::query()
            ->where('dict_type_id', $statusTypeId)
            ->where('value', 'active')
            ->where('status', 1)
            ->value('id');
    }

    return null;
}
|
|
|
}
|