slake-school-service/app/Services/Crawl/Adapters/GenericPaperHtmlAdapter.php

<?php

namespace App\Services\Crawl\Adapters;

use App\Models\CrawlSource;
use App\Services\Crawl\Contracts\CrawlerAdapterInterface;
use App\Services\Crawl\CrawlAuthorParser;
use App\Services\Crawl\CrawlItemDto;
use App\Services\Crawl\CrawlKeywordParser;
use App\Services\Crawl\HtmlCrawlSupport;
use Illuminate\Support\Str;

/**
 * Crawler adapter for arbitrary HTML pages that either list links to papers
 * or are themselves a single paper page.
 *
 * Candidate links are collected from the requested page, filtered by keyword,
 * and then enriched from each linked page's `citation_*` meta tags (with
 * `<h1>` / `.abstract` fallbacks).
 */
class GenericPaperHtmlAdapter implements CrawlerAdapterInterface
{
    /** Hard upper bound on the number of items one fetch() call returns. */
    private const MAX_RESULTS_CAP = 50;

    /** Result count used when the caller supplies no "max_results" parameter. */
    private const DEFAULT_MAX_RESULTS = 20;

    /**
     * Flags for every html_entity_decode() call, spelled out so that decoding
     * does not depend on the PHP version's default flag set.
     */
    private const ENTITY_FLAGS = ENT_QUOTES | ENT_HTML5;

    /**
     * Crawls $requestUrl and returns the keyword-matching papers found there.
     *
     * Recognised $params keys: "keyword" (passed to CrawlKeywordParser::parse)
     * and "max_results" (clamped to 1..50, default 20). $source is required by
     * the interface but not used by this adapter.
     *
     * An exception thrown by HtmlCrawlSupport::fetchHtml() for $requestUrl is
     * not caught here; failures while fetching individual detail pages are
     * tolerated (see fetchPaperDetail()).
     *
     * @param  array<string, mixed>  $params
     * @return list<CrawlItemDto>
     */
    public function fetch(string $requestUrl, CrawlSource $source, array $params): array
    {
        $keywords = CrawlKeywordParser::parse((string) ($params['keyword'] ?? ''));
        $maxResults = min(
            self::MAX_RESULTS_CAP,
            max(1, (int) ($params['max_results'] ?? self::DEFAULT_MAX_RESULTS))
        );

        $html = HtmlCrawlSupport::fetchHtml($requestUrl);
        $pageHost = HtmlCrawlSupport::hostKey($requestUrl);

        $items = $this->parseListHtml($html, $requestUrl, $pageHost);

        // With at most one candidate link, the requested page may itself be the paper.
        $requestIsPaperPage = false;
        if (count($items) <= 1 && $this->looksLikePaperPage($html)) {
            $single = $this->parsePaperFromHtml($html, $requestUrl);
            if ($single !== null) {
                $items = [$single];
                $requestIsPaperPage = true;
            }
        }

        // Cheap pre-filter on the data already at hand, before any detail page is fetched.
        $items = array_values(array_filter(
            $items,
            static fn (CrawlItemDto $item) => CrawlKeywordParser::matchesAny(
                $item->title,
                $item->summary,
                $keywords
            )
        ));

        $enriched = [];
        foreach ($items as $item) {
            if (count($enriched) >= $maxResults) {
                break;
            }

            $url = $item->canonicalUrl ?? '';

            // The request page is already downloaded: reuse it rather than
            // fetching it again, so a failed second request cannot discard the
            // parsed author list.
            $detail = ($requestIsPaperPage && $url === $requestUrl)
                ? $this->parsePaperDetailFromHtml($html)
                : $this->fetchPaperDetail($url);

            $title = ($detail['title'] ?? '') !== '' ? $detail['title'] : $item->title;
            $summary = $detail['summary'] ?? $item->summary;
            $authors = $detail['authors'] ?? $item->authors;
            $authorsParsed = $detail['authors_parsed'] ?? [];
            $publishedAt = $detail['published_at'] ?? $item->publishedAt;

            // Re-check: the detail page may have replaced the title/summary used by the pre-filter.
            if (! CrawlKeywordParser::matchesAny($title, $summary, $keywords)) {
                continue;
            }

            $lead = CrawlAuthorParser::leadAuthor($authors, $authorsParsed);

            $enriched[] = new CrawlItemDto(
                externalId: $item->externalId,
                title: $title,
                canonicalUrl: $item->canonicalUrl,
                authors: $authors,
                summary: $summary,
                publishedAt: $publishedAt,
                schoolName: $lead['university_name'] ?? null,
                extra: [
                    'platform' => 'generic_html',
                    'keyword' => implode(' ', $keywords),
                    'source' => 'html',
                    'authors_parsed' => $authorsParsed,
                    'lead_author' => $lead,
                ],
                authorsParsed: $authorsParsed,
            );
        }

        return $enriched;
    }

    /**
     * Extracts candidate paper links from the anchors of a listing page.
     *
     * A link is kept when its text is not skippable, it resolves to a
     * non-asset URL on the same host as the page (when $pageHost is known),
     * and looksLikePaperUrl() accepts it. Duplicate URLs are reported once.
     * Returns an empty list when no anchor matches or the pattern fails
     * (for example on input that is not valid UTF-8).
     *
     * @return list<CrawlItemDto>
     */
    protected function parseListHtml(string $html, string $baseUrl, ?string $pageHost): array
    {
        // The closing quote is back-referenced so an href may contain the other quote character.
        $found = preg_match_all(
            '#<a[^>]+href\s*=\s*(["\'])([^<>]+?)\1[^>]*>(.*?)</a>#isu',
            $html,
            $matches,
            PREG_SET_ORDER
        );
        if (! $found) {
            return [];
        }

        $items = [];
        $seen = [];

        foreach ($matches as $m) {
            // Attribute values are entity-encoded in HTML ("&amp;" separates query arguments).
            $href = html_entity_decode(trim($m[2]), self::ENTITY_FLAGS, 'UTF-8');
            $title = $this->collapseWhitespace($this->plainText($m[3]));
            if (HtmlCrawlSupport::isSkippableLinkTitle($title)) {
                continue;
            }

            $url = HtmlCrawlSupport::absoluteUrl($href, $baseUrl);
            if (! $url || HtmlCrawlSupport::isAssetPath($url)) {
                continue;
            }
            if ($pageHost && ! HtmlCrawlSupport::sameHost($pageHost, HtmlCrawlSupport::hostKey($url))) {
                continue;
            }
            if (! $this->looksLikePaperUrl($url, $baseUrl)) {
                continue;
            }

            $key = md5($url);
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;

            $items[] = new CrawlItemDto(
                externalId: 'paper:'.$key,
                title: $title,
                canonicalUrl: $url,
                extra: ['platform' => 'generic_html'],
            );
        }

        return $items;
    }

    /**
     * Heuristically decides whether $url points at a paper rather than at
     * navigation, the listing page itself, or the site root.
     *
     * Accepted: any URL containing "doi.org/", a path ending in ".pdf", or a
     * path containing one of the known paper-like segments followed by "/".
     */
    protected function looksLikePaperUrl(string $url, string $listUrl): bool
    {
        // Ignore fragments: "#section" anchors on the listing page are not separate papers.
        if (rtrim($this->stripFragment($url), '/') === rtrim($this->stripFragment($listUrl), '/')) {
            return false;
        }

        if (str_contains(strtolower($url), 'doi.org/')) {
            return true;
        }

        // parse_url() yields null/false when there is no usable path; both cast to ''.
        $path = strtolower((string) parse_url($url, PHP_URL_PATH));
        if ($path === '' || $path === '/') {
            return false;
        }

        if (str_ends_with($path, '.pdf')) {
            return true;
        }

        return preg_match(
            '#/(?:paper|papers|publication|publications|preprint|abs|arxiv|scholar|thesis|dissertation|doc|detail|view)/#',
            $path
        ) === 1;
    }

    /**
     * Tells whether $html looks like a single paper page: it either carries a
     * non-empty citation_title meta tag or its first <h1> holds at least
     * eight characters of text.
     */
    protected function looksLikePaperPage(string $html): bool
    {
        if ($this->metaContents($html, 'citation_title') !== []) {
            return true;
        }

        if (preg_match('#<h1[^>]*>(.*?)</h1>#is', $html, $m)) {
            return Str::length(trim(strip_tags($m[1]))) >= 8;
        }

        return false;
    }

    /**
     * Builds an item for a page that is itself a paper, or returns null when
     * no title can be extracted from it.
     */
    protected function parsePaperFromHtml(string $html, string $url): ?CrawlItemDto
    {
        $detail = $this->parsePaperDetailFromHtml($html);
        if (($detail['title'] ?? '') === '') {
            return null;
        }

        return new CrawlItemDto(
            externalId: 'paper:'.md5($url),
            title: $detail['title'],
            canonicalUrl: $url,
            authors: $detail['authors'],
            summary: $detail['summary'],
            publishedAt: $detail['published_at'],
            extra: ['platform' => 'generic_html'],
            authorsParsed: $detail['authors_parsed'],
        );
    }

    /**
     * Downloads a paper page and extracts its metadata.
     *
     * This is deliberately best-effort: an empty URL or any failure while
     * fetching yields an all-empty result, so one unreachable detail page does
     * not abort the whole crawl.
     *
     * @return array{title:?string, summary:?string, authors:?string, published_at:?string, authors_parsed:list<array<string,mixed>>}
     */
    protected function fetchPaperDetail(string $url): array
    {
        $empty = [
            'title' => null,
            'summary' => null,
            'authors' => null,
            'published_at' => null,
            'authors_parsed' => [],
        ];

        if ($url === '') {
            return $empty;
        }

        try {
            // NOTE(review): the second argument is presumably a timeout in seconds; confirm in HtmlCrawlSupport.
            $html = HtmlCrawlSupport::fetchHtml($url, 20);
        } catch (\Throwable) {
            return $empty;
        }

        return $this->parsePaperDetailFromHtml($html);
    }

    /**
     * Extracts title, authors, abstract and publication date from a paper page.
     *
     * Sources, in order of preference:
     *  - title: last non-empty citation_title meta tag, else the first <h1>;
     *  - authors: every non-empty citation_author meta tag, in document order;
     *  - summary: citation_abstract meta tag, else the first <div> whose class
     *    contains "abstract" (up to the first closing </div>, so nested
     *    <div>s truncate it);
     *  - published_at: citation_publication_date, else citation_date, passed
     *    through HtmlCrawlSupport::normalizeDate().
     *
     * Fields that cannot be found are '' for the title and null otherwise.
     *
     * @return array{title:string, summary:?string, authors:?string, published_at:?string, authors_parsed:list<array<string,mixed>>}
     */
    protected function parsePaperDetailFromHtml(string $html): array
    {
        $title = '';
        $metaTitles = $this->metaContents($html, 'citation_title');
        if ($metaTitles !== []) {
            $title = end($metaTitles);
        } elseif (preg_match('#<h1[^>]*>(.*?)</h1>#is', $html, $m)) {
            $title = $this->collapseWhitespace($this->plainText($m[1]));
        }

        $authorsParsed = [];
        foreach ($this->metaContents($html, 'citation_author') as $name) {
            $authorsParsed[] = [
                'name' => $name,
                'email' => null,
                'affiliation' => null,
                'university_name' => null,
            ];
        }

        $summary = null;
        $metaAbstracts = $this->metaContents($html, 'citation_abstract');
        if ($metaAbstracts !== []) {
            $summary = $metaAbstracts[0];
        } elseif (preg_match('#<div[^>]+class=["\'][^"\']*abstract[^"\']*["\'][^>]*>(.*?)</div>#is', $html, $m)) {
            $text = $this->plainText($m[1]);
            // Treat an empty abstract container as "no summary" so callers can fall back.
            $summary = $text !== '' ? $text : null;
        }

        $publishedAt = null;
        foreach (['citation_publication_date', 'citation_date'] as $metaName) {
            $dates = $this->metaContents($html, $metaName);
            if ($dates !== []) {
                $publishedAt = HtmlCrawlSupport::normalizeDate($dates[0]);
                break;
            }
        }

        $authorNames = array_column($authorsParsed, 'name');

        return [
            'title' => $title,
            'summary' => $summary,
            'authors' => $authorNames !== [] ? implode('; ', $authorNames) : null,
            'published_at' => $publishedAt,
            'authors_parsed' => $authorsParsed,
        ];
    }

    /**
     * Returns the decoded, trimmed, non-empty content values of every
     * <meta name="$name" content="..."> tag, in document order.
     *
     * The two attributes may appear in either order, and a value may contain
     * the quote character that does not delimit it (e.g. content="Newton's laws").
     *
     * @return list<string>
     */
    private function metaContents(string $html, string $name): array
    {
        // Quote-aware tag match so that a ">" inside an attribute value does not end the tag.
        // Possessive quantifiers keep matching linear on malformed markup.
        if (! preg_match_all('#<meta\b(?:[^>"\']++|"[^"]*+"|\'[^\']*+\')*+>#i', $html, $tags)) {
            return [];
        }

        $namePattern = '#(?<![\w-])name\s*=\s*(["\'])'.preg_quote($name, '#').'\1#i';
        $contentPattern = '#(?<![\w-])content\s*=\s*(["\'])(.*?)\1#is';

        $values = [];
        foreach ($tags[0] as $tag) {
            if (! preg_match($namePattern, $tag) || ! preg_match($contentPattern, $tag, $m)) {
                continue;
            }
            $value = trim(html_entity_decode($m[2], self::ENTITY_FLAGS, 'UTF-8'));
            if ($value !== '') {
                $values[] = $value;
            }
        }

        return $values;
    }

    /**
     * Converts an HTML fragment to trimmed plain text.
     *
     * Tags are stripped before entities are decoded; the reverse order would
     * turn escaped text such as "a&lt;b" into markup and strip it.
     */
    private function plainText(string $fragment): string
    {
        return trim(html_entity_decode(strip_tags($fragment), self::ENTITY_FLAGS, 'UTF-8'));
    }

    /**
     * Collapses every run of whitespace into a single space and trims the result.
     */
    private function collapseWhitespace(string $text): string
    {
        // preg_replace() returns null on error (e.g. invalid UTF-8); keep the text in that case.
        return trim(preg_replace('/\s+/u', ' ', $text) ?? $text);
    }

    /**
     * Returns $url without its "#fragment" part, if any.
     */
    private function stripFragment(string $url): string
    {
        return explode('#', $url, 2)[0];
    }
}