You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
222 lines
7.8 KiB
222 lines
7.8 KiB
<?php
|
|
|
|
namespace App\Services\Crawl;
|
|
|
|
use App\Models\CrawlJob;
|
|
use App\Models\CrawlJobItem;
|
|
use App\Models\CrawlSource;
|
|
use App\Models\News;
|
|
use App\Models\Paper;
|
|
use App\Models\Teacher;
|
|
|
|
/**
 * Executes a crawl job: fetches items through the dispatcher and stages each
 * one as a CrawlJobItem row marked either 'preview' or 'duplicate'.
 *
 * Every staged row is written with target_id = null; nothing in this class
 * creates Paper, News or Teacher records.
 */
class CrawlJobRunnerService
{
    public function __construct(
        protected CrawlJobDispatcher $dispatcher,
    ) {}

    /**
     * Run the job against the given source and stage the fetched items.
     *
     * Marks the job 'running', fetches the DTOs, upserts one CrawlJobItem per
     * DTO (keyed by crawl_job_id + external_id), then marks the job
     * 'completed' with the item counters and a summary message.
     *
     * @param array<string, mixed> $params Passed through to the dispatcher unchanged.
     *
     * @return CrawlJob The job as returned by fresh() after the final update.
     */
    public function run(CrawlJob $job, CrawlSource $source, array $params): CrawlJob
    {
        $job->update(['status' => 'running']);

        // NOTE(review): if fetch() or any persistence call below throws, the
        // job is left in 'running'. Confirm the caller handles that case.
        $dtos = $this->dispatcher->fetch(
            $job->request_url ?? $job->platform_url,
            $source,
            $params
        );

        $count = 0;
        foreach ($dtos as $dto) {
            // Teacher sources are staged by a dedicated routine with its own
            // duplicate detection.
            if ($source->target_type === 'teacher') {
                $count += $this->persistTeacherItem($job, $source, $dto) ? 1 : 0;

                continue;
            }

            $status = $this->previewStatusForPaperOrNews($source, $dto);
            $payload = $this->buildPayload($dto);

            CrawlJobItem::query()->updateOrCreate(
                [
                    'crawl_job_id' => $job->id,
                    'external_id' => $dto->externalId,
                ],
                [
                    'canonical_url' => $dto->canonicalUrl,
                    'title' => $dto->title,
                    'payload' => $payload,
                    'status' => $status,
                    // Any non-paper, non-teacher source is staged as news.
                    'target_type' => $source->target_type === 'paper' ? 'paper' : 'news',
                    'source_name' => $source->name,
                    'target_id' => null,
                ]
            );
            $count++;

            // Papers additionally stage their lead author as a separate
            // 'teacher_lead' item; that item is not included in $count.
            if ($source->target_type === 'paper') {
                $this->persistTeacherLeadItem($job, $source, $dto, $payload, $status);
            }
        }

        $job->update([
            'status' => 'completed',
            'items_fetched' => $count,
            'papers_created' => $source->target_type === 'paper' ? $count : 0,
            'result_summary' => sprintf('已从 %s 抓取 %d 条(暂存预览,请勾选入库)', $source->name, $count),
            'completed_at' => now(),
        ]);

        return $job->fresh();
    }

    /**
     * Decide whether a paper or news item should be staged as 'duplicate' or 'preview'.
     *
     * Paper sources: 'duplicate' when a Paper with the same external_id and
     * source = 'crawl' exists. Other sources: 'duplicate' when the DTO has a
     * non-empty canonical URL and a News row with that source_url exists.
     *
     * @return string Either 'duplicate' or 'preview'.
     */
    protected function previewStatusForPaperOrNews(CrawlSource $source, CrawlItemDto $dto): string
    {
        if ($source->target_type === 'paper') {
            if (Paper::query()
                ->where('external_id', $dto->externalId)
                ->where('source', 'crawl')
                ->exists()) {
                return 'duplicate';
            }
        } else {
            $url = $dto->canonicalUrl;
            if ($url && News::query()->where('source_url', $url)->exists()) {
                return 'duplicate';
            }
        }

        return 'preview';
    }

    /**
     * Build the payload array stored on a staged paper/news item.
     *
     * The DTO's extra data is enriched with 'authors_parsed' (when the DTO has
     * parsed authors) and with 'lead_author' (computed via CrawlAuthorParser
     * when not already set); both are also exposed as top-level payload keys.
     *
     * @return array<string, mixed>
     */
    protected function buildPayload(CrawlItemDto $dto): array
    {
        $extra = $dto->extra;
        if ($dto->authorsParsed !== []) {
            $extra['authors_parsed'] = $dto->authorsParsed;
        }
        if (! isset($extra['lead_author'])) {
            $extra['lead_author'] = CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);
        }

        return [
            'authors' => $dto->authors,
            'summary' => $dto->summary,
            'published_at' => $dto->publishedAt,
            'school_name' => $dto->schoolName,
            'section' => $dto->section,
            'content_html' => $dto->contentHtml,
            'extra' => $extra,
            'authors_parsed' => $dto->authorsParsed,
            'lead_author' => $extra['lead_author'] ?? null,
        ];
    }

    /**
     * Stage the lead author of a paper as a separate 'teacher_lead' item.
     *
     * Does nothing when the payload has no lead author array or the lead's
     * name is blank. The item is keyed by 'lead:' + the paper's external id
     * and is marked 'duplicate' only when the lead has an email that already
     * belongs to a Teacher.
     *
     * @param array<string, mixed> $paperPayload Payload built for the paper item; only 'lead_author' is read.
     * @param string $paperStatus Status of the paper item. Currently unused: the
     *                            lead's status does not depend on it. Kept for
     *                            signature compatibility.
     */
    protected function persistTeacherLeadItem(
        CrawlJob $job,
        CrawlSource $source,
        CrawlItemDto $dto,
        array $paperPayload,
        string $paperStatus,
    ): void {
        $lead = $paperPayload['lead_author'] ?? null;
        if (! is_array($lead) || trim((string) ($lead['name'] ?? '')) === '') {
            return;
        }

        $leadExternalId = 'lead:'.$dto->externalId;
        $dup = ! empty($lead['email'])
            && Teacher::query()->where('email', $lead['email'])->exists();

        CrawlJobItem::query()->updateOrCreate(
            [
                'crawl_job_id' => $job->id,
                'external_id' => $leadExternalId,
            ],
            [
                'canonical_url' => $dto->canonicalUrl,
                'title' => (string) $lead['name'],
                'payload' => [
                    'lead_author' => $lead,
                    'paper_external_id' => $dto->externalId,
                    'paper_title' => $dto->title,
                    'school_name' => $lead['university_name'] ?? $dto->schoolName,
                ],
                // A duplicate paper does not make its lead author a duplicate,
                // so only the email check decides the status.
                'status' => $dup ? 'duplicate' : 'preview',
                'target_type' => 'teacher_lead',
                'source_name' => $source->name,
                'target_id' => null,
            ]
        );
    }

    /**
     * Stage a crawled teacher as a 'teacher' item with duplicate detection.
     *
     * Duplicate rules, in order:
     *  1. The lead has an email and a Teacher with that email exists.
     *  2. Otherwise, when the lead has a name but no email:
     *     a. a Teacher whose remark contains the profile URL exists, or
     *     b. a Teacher with the same name exists, additionally constrained by
     *        department (when a college name is known) and by university_text
     *        equal to or starting with the lead's university (when known).
     *
     * @return bool Always true.
     */
    protected function persistTeacherItem(CrawlJob $job, CrawlSource $source, CrawlItemDto $dto): bool
    {
        $lead = $dto->extra['lead_author'] ?? CrawlAuthorParser::leadAuthor($dto->authors, $dto->authorsParsed);
        $email = is_array($lead) ? ($lead['email'] ?? null) : null;
        $status = 'preview';
        if ($email && Teacher::query()->where('email', $email)->exists()) {
            $status = 'duplicate';
        } elseif (is_array($lead)) {
            $leadName = trim((string) ($lead['name'] ?? ''));
            $leadUniversity = trim((string) ($lead['university_name'] ?? ''));
            $profileUrl = trim((string) ($lead['profile_url'] ?? $dto->canonicalUrl ?? ''));
            $collegeName = trim((string) ($lead['college'] ?? $lead['affiliation'] ?? $dto->extra['college_name'] ?? ''));
            if ($leadName !== '' && ! $email) {
                // Crawled values are escaped before being embedded in LIKE
                // patterns so that '%' and '_' (both common in URLs) match
                // literally. The explicit ESCAPE clause avoids relying on any
                // driver's default escape character.
                $remarkPattern = '%'.self::escapeLike($profileUrl).'%';
                if ($profileUrl !== '' && Teacher::query()->whereRaw("remark LIKE ? ESCAPE '!'", [$remarkPattern])->exists()) {
                    $status = 'duplicate';
                } else {
                    $dup = Teacher::query()->where('name', $leadName);
                    if ($collegeName !== '') {
                        $dup->where('department', $collegeName);
                    }
                    if ($leadUniversity !== '') {
                        $universityPrefix = self::escapeLike($leadUniversity).'%';
                        $dup->where(function ($q) use ($leadUniversity, $universityPrefix) {
                            $q->where('university_text', $leadUniversity)
                                ->orWhereRaw("university_text LIKE ? ESCAPE '!'", [$universityPrefix]);
                        });
                    }
                    if ($dup->exists()) {
                        $status = 'duplicate';
                    }
                }
            }
        }

        CrawlJobItem::query()->updateOrCreate(
            [
                'crawl_job_id' => $job->id,
                'external_id' => $dto->externalId,
            ],
            [
                'canonical_url' => $dto->canonicalUrl,
                'title' => $dto->title,
                'payload' => [
                    'lead_author' => $lead,
                    'school_name' => $dto->schoolName,
                    'summary' => $dto->summary,
                    'academic_title' => $dto->extra['academic_title'] ?? (is_array($lead) ? ($lead['academic_title'] ?? null) : null),
                    'college_name' => $dto->extra['college_name'] ?? (is_array($lead) ? ($lead['college'] ?? $lead['affiliation'] ?? null) : null),
                    'profile_url' => $dto->extra['profile_url'] ?? $dto->canonicalUrl,
                ],
                'status' => $status,
                'target_type' => 'teacher',
                'source_name' => $source->name,
                'target_id' => null,
            ]
        );

        return true;
    }

    /**
     * Escape a value for literal use inside a LIKE pattern with ESCAPE '!'.
     *
     * The escape character itself is listed first so that the '!' characters
     * inserted for '%' and '_' are not escaped a second time.
     */
    private static function escapeLike(string $value): string
    {
        return str_replace(['!', '%', '_'], ['!!', '!%', '!_'], $value);
    }
}
|