Files
yangxiaoyu-crypto 426cf4d2cd Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 15:25:08 +08:00

127 lines
3.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Test scraping Harvard University - only tests 2 schools.
"""
import asyncio
import sys
from pathlib import Path
# Add the project's src/ directory to the import path so this script can be
# run directly (without installing the package).
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from university_scraper.config import ScraperConfig
from university_scraper.scraper import UniversityScraper
# Reduced test configuration: statically lists just two Harvard schools so a
# full run finishes quickly. Mirrors the structure of the production configs.
TEST_CONFIG = {
    "university": {
        "name": "Harvard University",
        "url": "https://www.harvard.edu/",
        "country": "USA",
    },
    "schools": {
        # Skip school discovery entirely; scrape only the two schools below.
        "discovery_method": "static_list",
        "static_list": [
            {
                "name": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
                "url": "https://seas.harvard.edu/",
            },
            {
                "name": "Graduate School of Design (GSD)",
                "url": "https://www.gsd.harvard.edu/",
            },
        ],
    },
    "programs": {
        # Candidate URL suffixes tried on each school site to locate programs.
        "paths_to_try": [
            "/academics/graduate-programs",
            "/programs",
            "/academics/programs",
            "/graduate",
        ],
        # Heuristics matching anchor text and href substrings.
        "link_patterns": [
            {"text_contains": ["program", "degree"], "href_contains": ["/program", "/degree"]},
            {"text_contains": ["master", "graduate"], "href_contains": ["/master", "/graduate"]},
        ],
        # CSS selectors for extracting program fields from listing pages.
        "selectors": {
            "program_item": "div.program-item, li.program, a[href*='/program']",
            "program_name": "h3, .title",
            "program_url": "a[href]",
            "degree_type": ".degree",
        },
        "pagination": {"type": "none"},
    },
    "faculty": {
        # Strategies are tried in order: follow in-page links first, then
        # probe conventional URL patterns derived from the school URL.
        "discovery_strategies": [
            {
                "type": "link_in_page",
                "patterns": [
                    {"text_contains": ["faculty", "people"], "href_contains": ["/faculty", "/people"]},
                ],
            },
            {
                "type": "url_pattern",
                "patterns": [
                    "{school_url}/faculty",
                    "{school_url}/people",
                ],
            },
        ],
        "selectors": {
            "faculty_item": "div.faculty, li.person",
            "faculty_name": "h3, .name",
            "faculty_url": "a[href*='/people/'], a[href*='/faculty/']",
        },
    },
    "filters": {
        # Keep master's-level programs only; drop doctoral and undergraduate.
        "program_degree_types": {
            "include": ["Master", "M.S.", "M.A.", "MBA", "M.Eng", "S.M."],
            "exclude": ["Ph.D.", "Doctor", "Bachelor"],
        },
        "exclude_schools": [],
    },
}
async def test_harvard():
    """Run a reduced Harvard scrape (2 schools) and print a result summary.

    Launches the scraper with a visible browser window (headless=False),
    saves the raw results to ``output/harvard_test_result.json``, then
    prints each school with up to 5 of its programs and, for each program,
    up to 3 sample faculty entries.
    """
    banner = "=" * 60
    print(banner)
    print("测试Harvard大学爬取简化版 - 2个学院")
    print(banner)

    config = ScraperConfig.from_dict(TEST_CONFIG)
    async with UniversityScraper(config, headless=False) as scraper:
        university = await scraper.scrape()
        scraper.save_results("output/harvard_test_result.json")

        # Detailed per-school / per-program report.
        print("\n" + banner)
        print("详细结果:")
        print(banner)
        for school in university.schools:
            print(f"\n学院: {school.name}")
            print(f" URL: {school.url}")
            print(f" 项目数: {len(school.programs)}")
            # Show at most the first 5 programs per school.
            for prog in school.programs[:5]:
                print(f"\n 项目: {prog.name}")
                print(f" URL: {prog.url}")
                print(f" 学位: {prog.degree_type}")
                print(f" 导师数: {len(prog.faculty)}")
                if prog.faculty:
                    print(" 导师示例:")
                    # Show at most 3 sample faculty per program.
                    for f in prog.faculty[:3]:
                        print(f" - {f.name}: {f.url}")
            remaining = len(school.programs) - 5
            if remaining > 0:
                print(f"\n ... 还有 {remaining} 个项目")
# Script entry point: drive the async test with a fresh event loop.
if __name__ == "__main__":
    asyncio.run(test_harvard())