- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker Compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
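For orientation, here is a minimal sketch of the kind of endpoint the FastAPI backend mentioned above might expose. This is a hypothetical illustration: the route path, response model, and field names are assumptions rather than the service's actual API; only the FastAPI/pydantic usage itself reflects the real libraries.

```python
# Hypothetical backend sketch; route and model names are illustrative assumptions.
from typing import List

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class UniversityOut(BaseModel):
    """Illustrative response shape; the real database models live in the backend service."""
    name: str
    url: str
    country: str


@app.get("/universities", response_model=List[UniversityOut])
async def list_universities() -> List[UniversityOut]:
    # Placeholder data; a real endpoint would query the backend's database models.
    return [
        UniversityOut(name="Harvard University", url="https://www.harvard.edu/", country="USA"),
    ]
```

The test script below exercises the `src/university_scraper` module directly (without going through the backend), against two Harvard schools.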
#!/usr/bin/env python3
"""
Test scraping Harvard University - only 2 schools.
"""

import asyncio
import sys
from pathlib import Path

# Add the project's src directory to the import path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from university_scraper.config import ScraperConfig
from university_scraper.scraper import UniversityScraper


# Simplified test configuration - only 2 schools
TEST_CONFIG = {
    "university": {
        "name": "Harvard University",
        "url": "https://www.harvard.edu/",
        "country": "USA"
    },
    "schools": {
        "discovery_method": "static_list",
        "static_list": [
            {
                "name": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
                "url": "https://seas.harvard.edu/"
            },
            {
                "name": "Graduate School of Design (GSD)",
                "url": "https://www.gsd.harvard.edu/"
            }
        ]
    },
    "programs": {
        # Candidate paths appended to each school URL when looking for program listings
        "paths_to_try": [
            "/academics/graduate-programs",
            "/programs",
            "/academics/programs",
            "/graduate"
        ],
        # Link heuristics: match anchors whose text and href look like program pages
        "link_patterns": [
            {"text_contains": ["program", "degree"], "href_contains": ["/program", "/degree"]},
            {"text_contains": ["master", "graduate"], "href_contains": ["/master", "/graduate"]}
        ],
        # CSS selectors used to extract program fields from listing pages
        "selectors": {
            "program_item": "div.program-item, li.program, a[href*='/program']",
            "program_name": "h3, .title",
            "program_url": "a[href]",
            "degree_type": ".degree"
        },
        "pagination": {"type": "none"}
    },
    "faculty": {
        # Try in-page faculty/people links first, then fall back to common URL patterns
        "discovery_strategies": [
            {
                "type": "link_in_page",
                "patterns": [
                    {"text_contains": ["faculty", "people"], "href_contains": ["/faculty", "/people"]}
                ]
            },
            {
                "type": "url_pattern",
                "patterns": [
                    "{school_url}/faculty",
                    "{school_url}/people"
                ]
            }
        ],
        "selectors": {
            "faculty_item": "div.faculty, li.person",
            "faculty_name": "h3, .name",
            "faculty_url": "a[href*='/people/'], a[href*='/faculty/']"
        }
    },
    "filters": {
        # Keep master's-level programs, drop doctoral and undergraduate degrees
        "program_degree_types": {
            "include": ["Master", "M.S.", "M.A.", "MBA", "M.Eng", "S.M."],
            "exclude": ["Ph.D.", "Doctor", "Bachelor"]
        },
        "exclude_schools": []
    }
}


async def test_harvard():
    """Test scraping Harvard."""
    print("=" * 60)
    print("Testing Harvard University scraping (simplified - 2 schools)")
    print("=" * 60)

    config = ScraperConfig.from_dict(TEST_CONFIG)

    # headless=False keeps the browser window visible while the test runs
    async with UniversityScraper(config, headless=False) as scraper:
        university = await scraper.scrape()
        scraper.save_results("output/harvard_test_result.json")

        # Print detailed results
        print("\n" + "=" * 60)
        print("Detailed results:")
        print("=" * 60)

        for school in university.schools:
            print(f"\nSchool: {school.name}")
            print(f"  URL: {school.url}")
            print(f"  Programs: {len(school.programs)}")

            for prog in school.programs[:5]:
                print(f"\n  Program: {prog.name}")
                print(f"    URL: {prog.url}")
                print(f"    Degree: {prog.degree_type}")
                print(f"    Faculty: {len(prog.faculty)}")

                if prog.faculty:
                    print("    Sample faculty:")
                    for f in prog.faculty[:3]:
                        print(f"      - {f.name}: {f.url}")

            if len(school.programs) > 5:
                print(f"\n  ... {len(school.programs) - 5} more programs")


if __name__ == "__main__":
    asyncio.run(test_harvard())