#!/usr/bin/env python3
"""Smoke test for scraping Harvard University, limited to two schools.

Builds a reduced scraper configuration (static list of two schools), runs
the scraper, saves the result as JSON, and prints a short summary of the
schools, programs, and faculty that were found.
"""

import asyncio
import sys
from pathlib import Path

# Make the project's "src" directory importable when this script is run
# directly from its own folder.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from university_scraper.config import ScraperConfig  # noqa: E402
from university_scraper.scraper import UniversityScraper  # noqa: E402

# Simplified test configuration - only two schools are scraped.
TEST_CONFIG = {
    "university": {
        "name": "Harvard University",
        "url": "https://www.harvard.edu/",
        "country": "USA",
    },
    "schools": {
        "discovery_method": "static_list",
        "static_list": [
            {
                "name": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
                "url": "https://seas.harvard.edu/",
            },
            {
                "name": "Graduate School of Design (GSD)",
                "url": "https://www.gsd.harvard.edu/",
            },
        ],
    },
    "programs": {
        "paths_to_try": [
            "/academics/graduate-programs",
            "/programs",
            "/academics/programs",
            "/graduate",
        ],
        "link_patterns": [
            {"text_contains": ["program", "degree"], "href_contains": ["/program", "/degree"]},
            {"text_contains": ["master", "graduate"], "href_contains": ["/master", "/graduate"]},
        ],
        "selectors": {
            "program_item": "div.program-item, li.program, a[href*='/program']",
            "program_name": "h3, .title",
            "program_url": "a[href]",
            "degree_type": ".degree",
        },
        "pagination": {"type": "none"},
    },
    "faculty": {
        "discovery_strategies": [
            {
                "type": "link_in_page",
                "patterns": [
                    {"text_contains": ["faculty", "people"], "href_contains": ["/faculty", "/people"]},
                ],
            },
            {
                "type": "url_pattern",
                "patterns": [
                    "{school_url}/faculty",
                    "{school_url}/people",
                ],
            },
        ],
        "selectors": {
            "faculty_item": "div.faculty, li.person",
            "faculty_name": "h3, .name",
            "faculty_url": "a[href*='/people/'], a[href*='/faculty/']",
        },
    },
    "filters": {
        "program_degree_types": {
            "include": ["Master", "M.S.", "M.A.", "MBA", "M.Eng", "S.M."],
            "exclude": ["Ph.D.", "Doctor", "Bachelor"],
        },
        "exclude_schools": [],
    },
}


async def test_harvard():
    """Run the reduced Harvard scrape, save the result, and print a summary.

    The scraper is started with ``headless=False``. Results are written to
    ``output/harvard_test_result.json`` relative to the current working
    directory. For each school at most five programs are printed, and for
    each program at most three faculty members.
    """
    print("=" * 60)
    print("测试Harvard大学爬取(简化版 - 2个学院)")
    print("=" * 60)

    config = ScraperConfig.from_dict(TEST_CONFIG)
    result_file = "output/harvard_test_result.json"

    async with UniversityScraper(config, headless=False) as scraper:
        university = await scraper.scrape()
        # Whether save_results creates missing parent directories is not
        # visible from this script, so make sure the folder exists first.
        Path(result_file).parent.mkdir(parents=True, exist_ok=True)
        scraper.save_results(result_file)

    # Print detailed results.
    print("\n" + "=" * 60)
    print("详细结果:")
    print("=" * 60)

    for school in university.schools:
        print(f"\n学院: {school.name}")
        print(f" URL: {school.url}")
        print(f" 项目数: {len(school.programs)}")

        for prog in school.programs[:5]:
            print(f"\n 项目: {prog.name}")
            print(f" URL: {prog.url}")
            print(f" 学位: {prog.degree_type}")
            print(f" 导师数: {len(prog.faculty)}")

            if prog.faculty:
                print(" 导师示例:")
                for f in prog.faculty[:3]:
                    print(f" - {f.name}: {f.url}")

        # Tell the reader how many programs were left out of the listing.
        if len(school.programs) > 5:
            print(f"\n ... 还有 {len(school.programs) - 5} 个项目")


if __name__ == "__main__":
    asyncio.run(test_harvard())