Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
yangxiaoyu-crypto
2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions

126
scripts/test_harvard.py Normal file
View File

@ -0,0 +1,126 @@
#!/usr/bin/env python3
"""
Test scraping Harvard University — exercises only 2 schools.
"""
import asyncio
import sys
from pathlib import Path

# Make the project's src/ directory importable when running this script directly.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from university_scraper.config import ScraperConfig
from university_scraper.scraper import UniversityScraper
# Simplified test configuration — only 2 schools are scraped.
TEST_CONFIG = {
    # Top-level university identity used by the scraper.
    "university": {
        "name": "Harvard University",
        "url": "https://www.harvard.edu/",
        "country": "USA"
    },
    # Schools are given as a fixed list rather than discovered dynamically.
    "schools": {
        "discovery_method": "static_list",
        "static_list": [
            {
                "name": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
                "url": "https://seas.harvard.edu/"
            },
            {
                "name": "Graduate School of Design (GSD)",
                "url": "https://www.gsd.harvard.edu/"
            }
        ]
    },
    # How to locate degree-program pages under each school site.
    "programs": {
        # Candidate URL paths appended to a school URL, tried in order.
        "paths_to_try": [
            "/academics/graduate-programs",
            "/programs",
            "/academics/programs",
            "/graduate"
        ],
        # Heuristics for recognizing program links by anchor text and href.
        "link_patterns": [
            {"text_contains": ["program", "degree"], "href_contains": ["/program", "/degree"]},
            {"text_contains": ["master", "graduate"], "href_contains": ["/master", "/graduate"]}
        ],
        # CSS selectors for extracting program fields from a listing page.
        "selectors": {
            "program_item": "div.program-item, li.program, a[href*='/program']",
            "program_name": "h3, .title",
            "program_url": "a[href]",
            "degree_type": ".degree"
        },
        "pagination": {"type": "none"}
    },
    # How to locate faculty/people pages for each school or program.
    "faculty": {
        # Strategies are presumably tried in order until one succeeds — TODO confirm
        # against university_scraper.scraper.
        "discovery_strategies": [
            {
                "type": "link_in_page",
                "patterns": [
                    {"text_contains": ["faculty", "people"], "href_contains": ["/faculty", "/people"]}
                ]
            },
            {
                "type": "url_pattern",
                "patterns": [
                    "{school_url}/faculty",
                    "{school_url}/people"
                ]
            }
        ],
        "selectors": {
            "faculty_item": "div.faculty, li.person",
            "faculty_name": "h3, .name",
            "faculty_url": "a[href*='/people/'], a[href*='/faculty/']"
        }
    },
    # Post-scrape filtering: keep master's-level degrees, drop PhD/bachelor's.
    "filters": {
        "program_degree_types": {
            "include": ["Master", "M.S.", "M.A.", "MBA", "M.Eng", "S.M."],
            "exclude": ["Ph.D.", "Doctor", "Bachelor"]
        },
        "exclude_schools": []
    }
}
async def test_harvard():
    """Run a small end-to-end scrape of Harvard (2 schools) and print a summary."""
    banner = "=" * 60
    print(banner)
    print("测试Harvard大学爬取简化版 - 2个学院")
    print(banner)

    config = ScraperConfig.from_dict(TEST_CONFIG)
    # headless=False keeps the browser window visible during the test run.
    async with UniversityScraper(config, headless=False) as scraper:
        university = await scraper.scrape()
        scraper.save_results("output/harvard_test_result.json")
        _print_details(university)


def _print_details(university):
    """Pretty-print scraped schools, their first few programs, and sample faculty."""
    banner = "=" * 60
    print("\n" + banner)
    print("详细结果:")
    print(banner)
    for school in university.schools:
        print(f"\n学院: {school.name}")
        print(f" URL: {school.url}")
        print(f" 项目数: {len(school.programs)}")
        # Show at most the first five programs per school.
        for prog in school.programs[:5]:
            print(f"\n 项目: {prog.name}")
            print(f" URL: {prog.url}")
            print(f" 学位: {prog.degree_type}")
            print(f" 导师数: {len(prog.faculty)}")
            if prog.faculty:
                print(" 导师示例:")
                # Show at most three sample faculty members per program.
                for f in prog.faculty[:3]:
                    print(f" - {f.name}: {f.url}")
        if len(school.programs) > 5:
            print(f"\n ... 还有 {len(school.programs) - 5} 个项目")
# Script entry point: drive the async test on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(test_harvard())