电子信息与电气工程学院
HTML; $adapter = new FacultyListHtmlAdapter; $method = new \ReflectionMethod($adapter, 'extractFromHtml'); $method->setAccessible(true); $items = $method->invoke( $adapter, $html, [], 'https://faculty.sjtu.edu.cn/xyjs_list.jsp?urltype=tsites.CollegeTeacherList&wbtreeid=1001&st=0&id=1701&lang=zh_CN', ); $this->assertCount(2, $items); $this->assertSame('沈备军', $items[0]->title); $this->assertSame('http://faculty.sjtu.edu.cn/bjshen/zh_CN/index.htm', $items[0]->canonicalUrl); $this->assertSame('上海交通大学', $items[0]->schoolName); $this->assertSame('faculty_html_tsites', $items[0]->extra['platform']); $this->assertSame('副教授', $items[0]->extra['academic_title']); $this->assertSame('电子信息与电气工程学院', $items[0]->extra['college_name']); $this->assertSame('副教授', $items[0]->extra['lead_author']['academic_title']); $this->assertSame('夏斌', $items[1]->title); }

    /**
     * Calls the non-public extractFromHtml() via reflection with the fixture,
     * an empty array and a list-page URL, and expects exactly one item whose
     * title, canonical URL, academic title and college name match the
     * asserted values.
     *
     * NOTE(review): the nowdoc fixture is empty in this copy of the file, yet
     * the assertions expect one extracted teacher — the markup was presumably
     * lost; confirm against the original fixture.
     */
    public function test_extracts_teacher_when_profile_href_is_empty(): void
    {
        $html = <<<'HTML'
HTML;

        $listUrl = 'https://faculty.sjtu.edu.cn/xyjs_list.jsp?id=1701';

        $subject = new FacultyListHtmlAdapter;
        $extract = new \ReflectionMethod($subject, 'extractFromHtml');
        $extract->setAccessible(true);

        $teachers = $extract->invoke($subject, $html, [], $listUrl);

        $this->assertCount(1, $teachers);
        $this->assertSame('邵海滨', $teachers[0]->title);
        $this->assertStringContainsString('shaohaibin', (string) $teachers[0]->canonicalUrl);
        $this->assertSame('副研究员', $teachers[0]->extra['academic_title']);
        $this->assertSame('电子信息与电气工程学院', $teachers[0]->extra['college_name']);
    }

    /**
     * Calls the non-public extractEmailFromProfileHtml() via reflection on a
     * small profile snippet and expects the e-mail address embedded in it.
     */
    public function test_extracts_email_from_teacher_profile_html(): void
    {
        // The literal spans three physical lines; its whitespace is part of the fixture.
        $html = '
  • 电子邮箱:bjshen@sjtu.edu.cn
  • ';

        $subject = new FacultyListHtmlAdapter;
        $extractEmail = new \ReflectionMethod($subject, 'extractEmailFromProfileHtml');
        $extractEmail->setAccessible(true);

        $found = $extractEmail->invoke($subject, $html);

        $this->assertSame('bjshen@sjtu.edu.cn', $found);
    }

    /**
     * Calls the non-public applyEmailToItem() via reflection with a DTO whose
     * lead_author e-mail is null and expects the returned item to carry the
     * supplied address under extra['lead_author']['email'].
     */
    public function test_apply_email_to_item_updates_lead_author(): void
    {
        $subject = new FacultyListHtmlAdapter;
        $applyEmail = new \ReflectionMethod($subject, 'applyEmailToItem');
        $applyEmail->setAccessible(true);

        $withoutEmail = new \App\Services\Crawl\CrawlItemDto(
            externalId: 'faculty:test',
            title: '沈备军',
            canonicalUrl: 'http://faculty.sjtu.edu.cn/bjshen/zh_CN/index.htm',
            extra: [
                'lead_author' => [
                    'name' => '沈备军',
                    'email' => null,
                    'university_name' => '上海交通大学',
                ],
            ],
        );

        $withEmail = $applyEmail->invoke($subject, $withoutEmail, 'bjshen@sjtu.edu.cn');

        $this->assertSame('bjshen@sjtu.edu.cn', $withEmail->extra['lead_author']['email']);
    }

    /**
     * Calls the non-public detectTotalPages() and buildPageUrl() via
     * reflection: the first must report 20 pages for the fixture, the second
     * must produce a URL for page 3 containing both PAGENUM=3 and totalpage=20.
     *
     * NOTE(review): the fixture visible here is only the link text; the
     * pagination markup that would yield 20 pages was presumably lost —
     * confirm against the original fixture.
     */
    public function test_detects_total_pages_and_builds_pagenum_url(): void
    {
        $html = '下页';
        $firstPageUrl = 'https://faculty.sjtu.edu.cn/xyjs_list.jsp?urltype=tsites.CollegeTeacherList&wbtreeid=1001&st=0&id=1701&lang=zh_CN';

        $subject = new FacultyListHtmlAdapter;

        $detect = new \ReflectionMethod($subject, 'detectTotalPages');
        $detect->setAccessible(true);
        $totalPages = $detect->invoke($subject, $html);
        $this->assertSame(20, $totalPages);

        $build = new \ReflectionMethod($subject, 'buildPageUrl');
        $build->setAccessible(true);
        $pageUrl = $build->invoke($subject, $firstPageUrl, 3, $html);

        $this->assertStringContainsString('PAGENUM=3', $pageUrl);
        $this->assertStringContainsString('totalpage=20', $pageUrl);
    }
}