Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
126
scripts/test_harvard.py
Normal file
126
scripts/test_harvard.py
Normal file
@@ -0,0 +1,126 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
测试Harvard大学爬取 - 只测试2个学院
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# 添加项目路径
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from university_scraper.config import ScraperConfig
|
||||
from university_scraper.scraper import UniversityScraper
|
||||
|
||||
|
||||
# Simplified test configuration: only two schools are listed.
# It is passed to ScraperConfig.from_dict() by test_harvard().
# NOTE(review): the per-key notes below describe what the values look like;
# how the scraper interprets each key should be confirmed in
# university_scraper.config / university_scraper.scraper.
TEST_CONFIG = {
    # The university being scraped.
    "university": {
        "name": "Harvard University",
        "url": "https://www.harvard.edu/",
        "country": "USA",
    },
    # Schools are given as a fixed list rather than discovered from the site.
    "schools": {
        "discovery_method": "static_list",
        "static_list": [
            {
                "name": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
                "url": "https://seas.harvard.edu/",
            },
            {
                "name": "Graduate School of Design (GSD)",
                "url": "https://www.gsd.harvard.edu/",
            },
        ],
    },
    "programs": {
        # URL path suffixes; presumably tried against each school URL to
        # locate the program listing page -- TODO confirm.
        "paths_to_try": [
            "/academics/graduate-programs",
            "/programs",
            "/academics/programs",
            "/graduate",
        ],
        # Keyword pairs for link text / href; presumably used to recognise
        # program links -- TODO confirm matching rules (any vs. all).
        "link_patterns": [
            {"text_contains": ["program", "degree"], "href_contains": ["/program", "/degree"]},
            {"text_contains": ["master", "graduate"], "href_contains": ["/master", "/graduate"]},
        ],
        # CSS selectors for program entries and their fields.
        "selectors": {
            "program_item": "div.program-item, li.program, a[href*='/program']",
            "program_name": "h3, .title",
            "program_url": "a[href]",
            "degree_type": ".degree",
        },
        "pagination": {"type": "none"},
    },
    "faculty": {
        # Two strategies are listed: links found in a page, then URL templates
        # containing a "{school_url}" placeholder.
        "discovery_strategies": [
            {
                "type": "link_in_page",
                "patterns": [
                    {"text_contains": ["faculty", "people"], "href_contains": ["/faculty", "/people"]},
                ],
            },
            {
                "type": "url_pattern",
                "patterns": [
                    "{school_url}/faculty",
                    "{school_url}/people",
                ],
            },
        ],
        # CSS selectors for faculty entries and their fields.
        "selectors": {
            "faculty_item": "div.faculty, li.person",
            "faculty_name": "h3, .name",
            "faculty_url": "a[href*='/people/'], a[href*='/faculty/']",
        },
    },
    "filters": {
        # Degree-type keywords: master's-level names are included,
        # doctoral and bachelor names are excluded.
        "program_degree_types": {
            "include": ["Master", "M.S.", "M.A.", "MBA", "M.Eng", "S.M."],
            "exclude": ["Ph.D.", "Doctor", "Bachelor"],
        },
        "exclude_schools": [],
    },
}
|
||||
|
||||
|
||||
async def test_harvard():
    """Scrape the schools listed in ``TEST_CONFIG`` and print a summary.

    Builds a ``ScraperConfig`` from ``TEST_CONFIG``, runs
    ``UniversityScraper`` with ``headless=False``, saves the results to
    ``output/harvard_test_result.json`` and prints, for every school, at most
    five programs and at most three faculty entries per program.

    NOTE(review): the extracted source had lost its indentation, so the
    nesting here is reconstructed from the statements. The summary is printed
    inside the ``async with`` block -- confirm against the original file.
    """
    print("=" * 60)
    print("测试Harvard大学爬取(简化版 - 2个学院)")
    print("=" * 60)

    config = ScraperConfig.from_dict(TEST_CONFIG)

    result_file = "output/harvard_test_result.json"
    # Create the target directory before the scrape starts, so a missing
    # directory cannot make the save step fail after the scrape has run.
    # NOTE(review): harmless if save_results() already creates it.
    Path(result_file).parent.mkdir(parents=True, exist_ok=True)

    async with UniversityScraper(config, headless=False) as scraper:
        university = await scraper.scrape()
        scraper.save_results(result_file)

        # Print the detailed results.
        print("\n" + "=" * 60)
        print("详细结果:")
        print("=" * 60)

        for school in university.schools:
            print(f"\n学院: {school.name}")
            print(f" URL: {school.url}")
            print(f" 项目数: {len(school.programs)}")

            # Only the first five programs are shown in detail.
            for prog in school.programs[:5]:
                print(f"\n 项目: {prog.name}")
                print(f" URL: {prog.url}")
                print(f" 学位: {prog.degree_type}")
                print(f" 导师数: {len(prog.faculty)}")

                if prog.faculty:
                    print(" 导师示例:")
                    # Show at most three faculty entries as a sample.
                    for f in prog.faculty[:3]:
                        print(f" - {f.name}: {f.url}")

            # Report how many programs were left out of the listing above.
            if len(school.programs) > 5:
                print(f"\n ... 还有 {len(school.programs) - 5} 个项目")
|
||||
|
||||
|
||||
# Run the async test when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(test_harvard())
|
||||
Reference in New Issue
Block a user