Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
yangxiaoyu-crypto
2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions

View File

@@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""
将已爬取的Harvard数据按学院重新组织
读取原始扁平数据,按 学院 → 项目 → 导师 层级重新组织输出
"""
import json
from pathlib import Path
from datetime import datetime, timezone
from urllib.parse import urlparse
from collections import defaultdict
# Harvard school mapping — resolve the owning school from a URL's host name.
SCHOOL_MAPPING = {
    "gsas.harvard.edu": "Graduate School of Arts and Sciences (GSAS)",
    "seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
    "hbs.edu": "Harvard Business School (HBS)",
    "www.hbs.edu": "Harvard Business School (HBS)",
    "gsd.harvard.edu": "Graduate School of Design (GSD)",
    "www.gsd.harvard.edu": "Graduate School of Design (GSD)",
    "gse.harvard.edu": "Graduate School of Education (HGSE)",
    "www.gse.harvard.edu": "Graduate School of Education (HGSE)",
    "hks.harvard.edu": "Harvard Kennedy School (HKS)",
    "www.hks.harvard.edu": "Harvard Kennedy School (HKS)",
    "hls.harvard.edu": "Harvard Law School (HLS)",
    "hms.harvard.edu": "Harvard Medical School (HMS)",
    "hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
    "www.hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
    "hds.harvard.edu": "Harvard Divinity School (HDS)",
    "hsdm.harvard.edu": "Harvard School of Dental Medicine (HSDM)",
    "fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
    "aaas.fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
    "dce.harvard.edu": "Division of Continuing Education (DCE)",
    "extension.harvard.edu": "Harvard Extension School",
    "cs.seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
}
# Canonical homepage URL for each school.
SCHOOL_URLS = {
    "Graduate School of Arts and Sciences (GSAS)": "https://gsas.harvard.edu/",
    "John A. Paulson School of Engineering and Applied Sciences (SEAS)": "https://seas.harvard.edu/",
    "Harvard Business School (HBS)": "https://www.hbs.edu/",
    "Graduate School of Design (GSD)": "https://www.gsd.harvard.edu/",
    "Graduate School of Education (HGSE)": "https://www.gse.harvard.edu/",
    "Harvard Kennedy School (HKS)": "https://www.hks.harvard.edu/",
    "Harvard Law School (HLS)": "https://hls.harvard.edu/",
    "Harvard Medical School (HMS)": "https://hms.harvard.edu/",
    "T.H. Chan School of Public Health (HSPH)": "https://www.hsph.harvard.edu/",
    "Harvard Divinity School (HDS)": "https://hds.harvard.edu/",
    "Harvard School of Dental Medicine (HSDM)": "https://hsdm.harvard.edu/",
    "Faculty of Arts and Sciences (FAS)": "https://fas.harvard.edu/",
    "Division of Continuing Education (DCE)": "https://dce.harvard.edu/",
    "Harvard Extension School": "https://extension.harvard.edu/",
    "Other": "https://www.harvard.edu/",
}


def determine_school_from_url(url: str) -> str:
    """Determine which Harvard school a URL belongs to from its host name.

    Args:
        url: Any URL; may be empty/falsy.

    Returns:
        The mapped school display name, or ``"Other"`` when the URL is
        empty or its host matches no known school domain.
    """
    if not url:
        return "Other"
    domain = urlparse(url).netloc.lower()
    # Exact host match first (most specific entry wins,
    # e.g. "cs.seas.harvard.edu" before the generic SEAS domain).
    school = SCHOOL_MAPPING.get(domain)
    if school is not None:
        return school
    # Fall back to dot-prefixed suffix matching so unlisted sub-domains
    # inherit their parent school (e.g. "www2.gsas.harvard.edu" -> GSAS).
    # A bare substring test would produce false positives such as
    # "nothbs.edu" matching the "hbs.edu" pattern.
    for pattern, school_name in SCHOOL_MAPPING.items():
        if domain.endswith("." + pattern):
            return school_name
    return "Other"
def reorganize_data(input_path: str, output_path: str):
    """Regroup the flat Harvard scrape into school -> program -> faculty.

    Reads the flat JSON at *input_path*, buckets every program under the
    school inferred from its faculty-page (or program) URL, prints summary
    statistics, writes the hierarchical result to *output_path*, and
    returns the result dict.

    Args:
        input_path: Path to the flat scrape JSON (expects keys
            ``total_programs``, ``total_faculty_found`` and ``programs``).
        output_path: Destination file for the hierarchical JSON; parent
            directories are created as needed.

    Returns:
        The hierarchical result dictionary that was written to disk.
    """
    # Load the flat scrape produced by the original crawler.
    with open(input_path, 'r', encoding='utf-8') as fh:
        raw = json.load(fh)
    print(f"读取原始数据: {raw['total_programs']} 个项目, {raw['total_faculty_found']} 位导师")

    # Bucket programs by school, creating each school entry lazily.
    buckets = {}
    for entry in raw['programs']:
        # Prefer the faculty-page URL for school attribution; fall back to
        # the program URL when that yields no match.
        faculty_url = entry.get('faculty_page_url', '')
        school = determine_school_from_url(faculty_url)
        if school == "Other" and entry.get('url'):
            school = determine_school_from_url(entry['url'])

        record = {
            "name": entry['name'],
            "url": entry.get('url', ''),
            "degree_type": entry.get('degrees', ''),
            "faculty_page_url": faculty_url,
            "faculty": entry.get('faculty', []),
        }

        bucket = buckets.setdefault(school, {
            "name": school,
            "url": SCHOOL_URLS.get(school, ""),
            "programs": [],
        })
        bucket["programs"].append(record)

    # Deterministic output order: schools sorted alphabetically by name.
    schools = sorted(buckets.values(), key=lambda item: item["name"])

    result = {
        "name": "Harvard University",
        "url": "https://www.harvard.edu/",
        "country": "USA",
        "scraped_at": datetime.now(timezone.utc).isoformat(),
        "schools": schools,
    }

    # Summary statistics.
    rule = "=" * 60
    print("\n" + rule)
    print("按学院重新组织完成!")
    print(rule)
    print(f"大学: {result['name']}")
    print(f"学院数: {len(schools)}")
    program_total = sum(len(s['programs']) for s in schools)
    faculty_total = sum(len(p['faculty']) for s in schools for p in s['programs'])
    print(f"项目数: {program_total}")
    print(f"导师数: {faculty_total}")
    print("\n各学院统计:")
    for school_entry in schools:
        n_programs = len(school_entry['programs'])
        n_faculty = sum(len(p['faculty']) for p in school_entry['programs'])
        print(f"  {school_entry['name']}: {n_programs}个项目, {n_faculty}位导师")

    # Persist the hierarchical result, creating parent directories first.
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'w', encoding='utf-8') as fh:
        json.dump(result, fh, ensure_ascii=False, indent=2)
    print(f"\n结果已保存到: {output_path}")
    return result
if __name__ == "__main__":
    # Default input/output locations when run as a script.
    reorganize_data(
        "artifacts/harvard_programs_with_faculty.json",
        "output/harvard_hierarchical_result.json",
    )