You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
147 lines
5.4 KiB
147 lines
5.4 KiB
<?php
|
|
|
|
namespace Tests\Unit;
|
|
|
|
use App\Services\Crawl\Adapters\FacultyListHtmlAdapter;
|
|
use Tests\TestCase;
|
|
|
|
/**
 * Unit tests for the non-public parsing helpers of FacultyListHtmlAdapter.
 *
 * The adapter methods under test are reached through reflection (see
 * invokeAdapterMethod()), so every test works on inline HTML fixtures only.
 */
class FacultyListHtmlAdapterTest extends TestCase
{
    /**
     * A list page with two teacher cards yields two items, and the first item
     * carries the name, profile URL, school, platform, academic title and
     * college that the assertions below spell out.
     */
    public function test_extracts_sjtu_college_teacher_list_without_email(): void
    {
        $html = <<<'HTML'
            <div class="jssy-b">
                <div class="title">电子信息与电气工程学院</div>
                <div class="list">
                    <ul>
                        <li>
                            <a href="http://faculty.sjtu.edu.cn/bjshen/zh_CN/index.htm" target="_blank">
                                <div class="name">沈备军</div>
                                <p>所在单位:电子信息与电气工程学院</p>
                                <p>职称:副教授</p>
                            </a>
                        </li>
                        <li>
                            <a href="http://faculty.sjtu.edu.cn/xiabin/zh_CN/index.htm" target="_blank">
                                <div class="name">夏斌</div>
                                <p>所在单位:电子信息与电气工程学院</p>
                                <p>职称:教授</p>
                            </a>
                        </li>
                    </ul>
                </div>
            </div>
            HTML;

        $items = $this->invokeAdapterMethod(
            new FacultyListHtmlAdapter,
            'extractFromHtml',
            $html,
            [],
            'https://faculty.sjtu.edu.cn/xyjs_list.jsp?urltype=tsites.CollegeTeacherList&wbtreeid=1001&st=0&id=1701&lang=zh_CN',
        );

        $this->assertCount(2, $items);

        $first = $items[0];
        $this->assertSame('沈备军', $first->title);
        $this->assertSame('http://faculty.sjtu.edu.cn/bjshen/zh_CN/index.htm', $first->canonicalUrl);
        $this->assertSame('上海交通大学', $first->schoolName);
        $this->assertSame('faculty_html_tsites', $first->extra['platform']);
        $this->assertSame('副教授', $first->extra['academic_title']);
        $this->assertSame('电子信息与电气工程学院', $first->extra['college_name']);
        $this->assertSame('副教授', $first->extra['lead_author']['academic_title']);

        $this->assertSame('夏斌', $items[1]->title);
    }

    /**
     * A teacher card whose anchor has an empty href must still produce an item.
     *
     * The fixture also contains an `addimg(...)` script call naming the same
     * teacher together with a "/shaohaibin/..." path; the test only requires
     * that the resulting canonical URL contains "shaohaibin".
     * NOTE(review): presumably the adapter recovers the URL from that script
     * call - confirm against the adapter implementation.
     */
    public function test_extracts_teacher_when_profile_href_is_empty(): void
    {
        $html = <<<'HTML'
            <script>u_u11_pic.addimg("/__local/x.png","/shaohaibin/zh_CN/index.htm","邵海滨","1");</script>
            <div class="list"><ul>
            <li><a href="" target="_blank">
            <div class="name">邵海滨</div>
            <p>所在单位:电子信息与电气工程学院</p>
            <p>职称:副研究员</p>
            </a></li>
            </ul></div>
            HTML;

        $items = $this->invokeAdapterMethod(
            new FacultyListHtmlAdapter,
            'extractFromHtml',
            $html,
            [],
            'https://faculty.sjtu.edu.cn/xyjs_list.jsp?id=1701',
        );

        $this->assertCount(1, $items);

        $teacher = $items[0];
        $this->assertSame('邵海滨', $teacher->title);
        $this->assertStringContainsString('shaohaibin', (string) $teacher->canonicalUrl);
        $this->assertSame('副研究员', $teacher->extra['academic_title']);
        $this->assertSame('电子信息与电气工程学院', $teacher->extra['college_name']);
    }

    /**
     * The e-mail address following the "电子邮箱:" label in a profile snippet
     * is returned verbatim.
     */
    public function test_extracts_email_from_teacher_profile_html(): void
    {
        $html = '<li><strong>电子邮箱:</strong>bjshen@sjtu.edu.cn</li>';

        $email = $this->invokeAdapterMethod(
            new FacultyListHtmlAdapter,
            'extractEmailFromProfileHtml',
            $html,
        );

        $this->assertSame('bjshen@sjtu.edu.cn', $email);
    }

    /**
     * Applying an e-mail to an item whose lead author has a null e-mail
     * returns an item with that e-mail set on `extra.lead_author.email`.
     */
    public function test_apply_email_to_item_updates_lead_author(): void
    {
        $original = new \App\Services\Crawl\CrawlItemDto(
            externalId: 'faculty:test',
            title: '沈备军',
            canonicalUrl: 'http://faculty.sjtu.edu.cn/bjshen/zh_CN/index.htm',
            extra: [
                'lead_author' => [
                    'name' => '沈备军',
                    'email' => null,
                    'university_name' => '上海交通大学',
                ],
            ],
        );

        $item = $this->invokeAdapterMethod(
            new FacultyListHtmlAdapter,
            'applyEmailToItem',
            $original,
            'bjshen@sjtu.edu.cn',
        );

        $this->assertSame('bjshen@sjtu.edu.cn', $item->extra['lead_author']['email']);
    }

    /**
     * The total page count is read from the pager link's `totalpage` query
     * parameter, and the URL built for page 3 contains both `PAGENUM=3` and
     * `totalpage=20`.
     */
    public function test_detects_total_pages_and_builds_pagenum_url(): void
    {
        $html = '<a href="?totalpage=20&PAGENUM=2&urltype=tsites.CollegeTeacherList&id=1701">下页</a>';

        // Both helpers are exercised on the same adapter instance.
        $adapter = new FacultyListHtmlAdapter;

        $this->assertSame(20, $this->invokeAdapterMethod($adapter, 'detectTotalPages', $html));

        $url = $this->invokeAdapterMethod(
            $adapter,
            'buildPageUrl',
            'https://faculty.sjtu.edu.cn/xyjs_list.jsp?urltype=tsites.CollegeTeacherList&wbtreeid=1001&st=0&id=1701&lang=zh_CN',
            3,
            $html,
        );

        $this->assertStringContainsString('PAGENUM=3', $url);
        $this->assertStringContainsString('totalpage=20', $url);
    }

    /**
     * Calls a non-public method of the adapter via reflection and returns its result.
     *
     * @param FacultyListHtmlAdapter $adapter      Instance the method is invoked on.
     * @param string                 $name         Name of the adapter method to call.
     * @param mixed                  ...$arguments Positional arguments forwarded unchanged.
     *
     * @return mixed Whatever the adapter method returns.
     *
     * @throws \ReflectionException If the adapter has no method called $name.
     */
    private function invokeAdapterMethod(
        FacultyListHtmlAdapter $adapter,
        string $name,
        mixed ...$arguments
    ): mixed {
        $method = new \ReflectionMethod($adapter, $name);

        // setAccessible() has had no effect since PHP 8.1 and raises a
        // deprecation notice on recent PHP releases, so it is only called on
        // the older runtimes that still need it.
        if (\PHP_VERSION_ID < 80100) {
            $method->setAccessible(true);
        }

        return $method->invoke($adapter, ...$arguments);
    }
}
|