parseListHtml($html, $requestUrl, $pageHost);

        // When the listing produced at most one link and the fetched page itself
        // looks like a paper page, treat the page as a single paper instead.
        if (count($items) <= 1 && $this->looksLikePaperPage($html)) {
            $single = $this->parsePaperFromHtml($html, $requestUrl);
            if ($single !== null) {
                $items = [$single];
            }
        }

        // Pre-filter on the list-level title/summary so detail pages are only
        // fetched for candidates that already match a keyword.
        $items = array_values(array_filter(
            $items,
            fn (CrawlItemDto $item) => CrawlKeywordParser::matchesAny(
                $item->title,
                $item->summary,
                $keywords
            )
        ));

        $enriched = [];
        foreach ($items as $item) {
            if (count($enriched) >= $maxResults) {
                break;
            }

            // Detail values win when present; otherwise fall back to the list item.
            $detail = $this->fetchPaperDetail($item->canonicalUrl ?? '');
            $title = $detail['title'] ?: $item->title;
            $summary = $detail['summary'] ?? $item->summary;
            $authors = $detail['authors'] ?? $item->authors;
            $authorsParsed = $detail['authors_parsed'] ?? [];
            $publishedAt = $detail['published_at'] ?? $item->publishedAt;

            // Re-check the keywords against the enriched title/summary.
            if (! CrawlKeywordParser::matchesAny($title, $summary, $keywords)) {
                continue;
            }

            $lead = CrawlAuthorParser::leadAuthor($authors, $authorsParsed);

            $enriched[] = new CrawlItemDto(
                externalId: $item->externalId,
                title: $title,
                canonicalUrl: $item->canonicalUrl,
                authors: $authors,
                summary: $summary,
                publishedAt: $publishedAt,
                schoolName: $lead['university_name'] ?? null,
                extra: [
                    'platform' => 'generic_html',
                    'keyword' => implode(' ', $keywords),
                    'source' => 'html',
                    'authors_parsed' => $authorsParsed,
                    'lead_author' => $lead,
                ],
                authorsParsed: $authorsParsed,
            );
        }

        return $enriched;
    }

    /**
     * Extract candidate paper links from a listing page.
     *
     * Each regex match contributes a link target (group 1) and its text
     * (group 2). A link is kept only if its title is not skippable, it resolves
     * to an absolute non-asset URL, it is on the same host as the page (when
     * $pageHost is given), and it passes looksLikePaperUrl(). Links are
     * de-duplicated by the md5 of their absolute URL, which also forms the
     * external id ("paper:<md5>").
     *
     * @param string      $html     Raw HTML of the listing page.
     * @param string      $baseUrl  URL the HTML was fetched from; used to resolve relative links.
     * @param string|null $pageHost Host key to restrict links to, or null/empty for no restriction.
     *
     * @return list<CrawlItemDto>
     */
    protected function parseListHtml(string $html, string $baseUrl, ?string $pageHost): array
    {
        $items = [];
        $seen = [];

        // NOTE(review): as written this pattern starts with `]+`, i.e. it needs
        // one or more literal `]` right before `href=`, and it ends on a lazy
        // group that always captures ''. It looks like the opening and closing
        // tag parts of the pattern were lost - confirm against version control.
        if (! preg_match_all(
            '#]+href=["\']([^"\']+)["\'][^>]*>(.*?)#isu',
            $html,
            $matches,
            PREG_SET_ORDER
        )) {
            return [];
        }

        foreach ($matches as $m) {
            $href = $m[1];

            // Strip markup first and decode entities afterwards. Decoding first
            // turns escaped text such as "k&lt;n" into "k<n", which strip_tags()
            // then treats as the start of a tag and cuts away.
            $title = trim(preg_replace('/\s+/u', ' ', html_entity_decode(strip_tags($m[2]))));
            if (HtmlCrawlSupport::isSkippableLinkTitle($title)) {
                continue;
            }

            $url = HtmlCrawlSupport::absoluteUrl($href, $baseUrl);
            if (! $url || HtmlCrawlSupport::isAssetPath($url)) {
                continue;
            }

            if ($pageHost && ! HtmlCrawlSupport::sameHost($pageHost, HtmlCrawlSupport::hostKey($url))) {
                continue;
            }

            if (! $this->looksLikePaperUrl($url, $baseUrl)) {
                continue;
            }

            // De-duplicate on the absolute URL.
            $key = md5($url);
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;

            $items[] = new CrawlItemDto(
                externalId: 'paper:'.$key,
                title: $title,
                canonicalUrl: $url,
                extra: ['platform' => 'generic_html'],
            );
        }

        return $items;
    }

    /**
     * Heuristic: does $url look like a link to a single paper?
     *
     * Returns false for the listing URL itself (ignoring trailing slashes) and
     * for URLs with an empty or root path. Returns true for any URL containing
     * "doi.org/", for paths containing one of the known path segments
     * (/paper/, /abs/, /thesis/, ...), and for paths ending in ".pdf".
     *
     * @param string $url     Absolute candidate URL.
     * @param string $listUrl URL of the listing page the link was found on.
     */
    protected function looksLikePaperUrl(string $url, string $listUrl): bool
    {
        if (rtrim($url, '/') === rtrim($listUrl, '/')) {
            return false;
        }

        $lower = strtolower($url);
        if (str_contains($lower, 'doi.org/')) {
            return true;
        }

        $path = strtolower((string) parse_url($url, PHP_URL_PATH));
        if ($path === '' || $path === '/') {
            return false;
        }

        return (bool) preg_match(
            '#/(paper|papers|publication|publications|preprint|abs|arxiv|scholar|thesis|dissertation|doc|detail|view)/#i',
            $path
        ) || preg_match('#\.pdf(\?|$)#i', $path);
    }

    /**
     * Heuristic: does the HTML look like a single paper's page?
     *
     * True when the citation_title pattern matches with a non-blank value, or
     * otherwise when the fallback pattern captures at least 8 characters of
     * tag-free text.
     *
     * NOTE(review): both patterns start with `]+` / `]*` as written, so they do
     * not anchor on any tag name - the leading tag part appears to be missing.
     * Confirm against version control.
     */
    protected function looksLikePaperPage(string $html): bool
    {
        if (preg_match('#]+name=["\']citation_title["\'][^>]+content=["\']([^"\']+)#i', $html, $m)) {
            return trim($m[1]) !== '';
        }

        if (preg_match('#]*>(.*?)#is', $html, $m)) {
            return Str::length(trim(strip_tags($m[1]))) >= 8;
        }

        return false;
    }

    /**
     * Build a CrawlItemDto from a page that is itself a paper.
     *
     * @param string $html Raw HTML of the page.
     * @param string $url  URL of the page; used as canonical URL and hashed into the external id.
     *
     * @return CrawlItemDto|null Null when no title could be extracted.
     */
    protected function parsePaperFromHtml(string $html, string $url): ?CrawlItemDto
    {
        $detail = $this->parsePaperDetailFromHtml($html);
        if (($detail['title'] ?? '') === '') {
            return null;
        }

        return new CrawlItemDto(
            externalId: 'paper:'.md5($url),
            title: $detail['title'],
            canonicalUrl: $url,
            authors: $detail['authors'],
            summary: $detail['summary'],
            publishedAt: $detail['published_at'],
            extra: ['platform' => 'generic_html'],
            authorsParsed: $detail['authors_parsed'],
        );
    }

    /**
     * Fetch a paper's detail page and extract its metadata.
     *
     * Best effort: an empty URL or any failure while fetching yields the
     * all-null result, so the caller falls back to list-level values.
     *
     * @return array{title:?string, summary:?string, authors:?string, published_at:?string, authors_parsed:list<array{name:string, email:null, affiliation:null, university_name:null}>}
     */
    protected function fetchPaperDetail(string $url): array
    {
        $empty = [
            'title' => null,
            'summary' => null,
            'authors' => null,
            'published_at' => null,
            'authors_parsed' => [],
        ];

        if ($url === '') {
            return $empty;
        }

        try {
            $html = HtmlCrawlSupport::fetchHtml($url, 20);
        } catch (\Throwable) {
            // Deliberately swallowed: a failed detail fetch must not abort the crawl.
            return $empty;
        }

        return $this->parsePaperDetailFromHtml($html);
    }

    /**
     * Extract title, authors, abstract and publication date from a paper page.
     *
     * The citation_* patterns are tried first; the title and the abstract each
     * have a markup-based fallback. The title is '' when nothing matched, and
     * 'authors' is the author names joined with "; " (null when there are none).
     *
     * NOTE(review): every pattern below starts with `]+` / `]*` as written, so
     * none of them anchors on a tag name, and the lazy `(.*?)` fallbacks end
     * the pattern and therefore always capture ''. The tag parts of these
     * patterns appear to be missing - confirm against version control.
     *
     * @return array{title:string, summary:?string, authors:?string, published_at:?string, authors_parsed:list<array{name:string, email:null, affiliation:null, university_name:null}>}
     */
    protected function parsePaperDetailFromHtml(string $html): array
    {
        $title = '';
        if (preg_match_all('#]+name=["\']citation_title["\'][^>]+content=["\']([^"\']+)#i', $html, $m)) {
            // When several matches exist, the last one is used.
            $title = trim(html_entity_decode(end($m[1])));
        } elseif (preg_match('#]*>(.*?)#is', $html, $m)) {
            // Strip tags before decoding so escaped "<" / ">" in the text are
            // not mistaken for markup and removed.
            $title = trim(html_entity_decode(strip_tags($m[1])));
        }

        $authorsParsed = [];
        if (preg_match_all('#]+name=["\']citation_author["\'][^>]+content=["\']([^"\']+)#i', $html, $m)) {
            foreach ($m[1] as $name) {
                $name = trim(html_entity_decode($name));
                if ($name === '') {
                    continue;
                }

                // Only the name is available here; the other fields stay null.
                $authorsParsed[] = [
                    'name' => $name,
                    'email' => null,
                    'affiliation' => null,
                    'university_name' => null,
                ];
            }
        }

        $summary = null;
        if (preg_match('#]+name=["\']citation_abstract["\'][^>]+content=["\']([^"\']+)#i', $html, $m)) {
            $summary = trim(html_entity_decode($m[1]));
        } elseif (preg_match('#]+class=["\'][^"\']*abstract[^"\']*["\'][^>]*>(.*?)#is', $html, $m)) {
            // Same ordering as for the title: strip markup, then decode entities.
            $summary = trim(html_entity_decode(strip_tags($m[1])));
        }

        $publishedAt = null;
        if (preg_match('#]+name=["\']citation_publication_date["\'][^>]+content=["\']([^"\']+)#i', $html, $m)) {
            $publishedAt = HtmlCrawlSupport::normalizeDate($m[1]);
        } elseif (preg_match('#]+name=["\']citation_date["\'][^>]+content=["\']([^"\']+)#i', $html, $m)) {
            $publishedAt = HtmlCrawlSupport::normalizeDate($m[1]);
        }

        $authorNames = array_column($authorsParsed, 'name');

        return [
            'title' => $title,
            'summary' => $summary,
            'authors' => $authorNames !== [] ? implode('; ', $authorNames) : null,
            'published_at' => $publishedAt,
            'authors_parsed' => $authorsParsed,
        ];
    }
}