Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
164
scripts/reorganize_by_school.py
Normal file
164
scripts/reorganize_by_school.py
Normal file
@ -0,0 +1,164 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Reorganize previously scraped Harvard data by school.

Reads the original flat data and rewrites it as a
school -> program -> faculty hierarchy.
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from urllib.parse import urlparse
|
||||
from collections import defaultdict
|
||||
|
||||
# Harvard school lookup table: URL hostname -> display name of the owning school.
# Keys are lower-case hostnames. determine_school_from_url() consults this
# table, trying an exact hostname match before falling back to a looser match.
# Several hosts may point at the same school: "www." variants, and sub-hosts
# such as aaas.fas.harvard.edu or cs.seas.harvard.edu that map to their
# parent school.
SCHOOL_MAPPING = {
    "gsas.harvard.edu": "Graduate School of Arts and Sciences (GSAS)",
    "seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
    "hbs.edu": "Harvard Business School (HBS)",
    "www.hbs.edu": "Harvard Business School (HBS)",
    "gsd.harvard.edu": "Graduate School of Design (GSD)",
    "www.gsd.harvard.edu": "Graduate School of Design (GSD)",
    "gse.harvard.edu": "Graduate School of Education (HGSE)",
    "www.gse.harvard.edu": "Graduate School of Education (HGSE)",
    "hks.harvard.edu": "Harvard Kennedy School (HKS)",
    "www.hks.harvard.edu": "Harvard Kennedy School (HKS)",
    "hls.harvard.edu": "Harvard Law School (HLS)",
    "hms.harvard.edu": "Harvard Medical School (HMS)",
    "hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
    "www.hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
    "hds.harvard.edu": "Harvard Divinity School (HDS)",
    "hsdm.harvard.edu": "Harvard School of Dental Medicine (HSDM)",
    "fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
    "aaas.fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
    "dce.harvard.edu": "Division of Continuing Education (DCE)",
    "extension.harvard.edu": "Harvard Extension School",
    "cs.seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
}
|
||||
|
||||
# Home page URL for each school display name (the values of SCHOOL_MAPPING),
# plus the "Other" bucket, which points at the university home page.
# reorganize_data() reads this table with .get(), so a school name missing
# here ends up with an empty URL rather than raising.
SCHOOL_URLS = {
    "Graduate School of Arts and Sciences (GSAS)": "https://gsas.harvard.edu/",
    "John A. Paulson School of Engineering and Applied Sciences (SEAS)": "https://seas.harvard.edu/",
    "Harvard Business School (HBS)": "https://www.hbs.edu/",
    "Graduate School of Design (GSD)": "https://www.gsd.harvard.edu/",
    "Graduate School of Education (HGSE)": "https://www.gse.harvard.edu/",
    "Harvard Kennedy School (HKS)": "https://www.hks.harvard.edu/",
    "Harvard Law School (HLS)": "https://hls.harvard.edu/",
    "Harvard Medical School (HMS)": "https://hms.harvard.edu/",
    "T.H. Chan School of Public Health (HSPH)": "https://www.hsph.harvard.edu/",
    "Harvard Divinity School (HDS)": "https://hds.harvard.edu/",
    "Harvard School of Dental Medicine (HSDM)": "https://hsdm.harvard.edu/",
    "Faculty of Arts and Sciences (FAS)": "https://fas.harvard.edu/",
    "Division of Continuing Education (DCE)": "https://dce.harvard.edu/",
    "Harvard Extension School": "https://extension.harvard.edu/",
    "Other": "https://www.harvard.edu/",
}
|
||||
|
||||
|
||||
def determine_school_from_url(url: str, mapping=None) -> str:
    """Return the display name of the school that owns ``url``.

    The host part of the URL is compared against the keys of ``mapping``.
    A key matches when it equals the host, or when the host is a sub-domain
    of it (``www.seas.harvard.edu`` matches ``seas.harvard.edu``). Matching
    is done on whole domain labels, so ``overseas.harvard.edu`` does NOT
    match ``seas.harvard.edu``. When several keys match, the longest (most
    specific) one wins, independent of the table's ordering.

    Args:
        url: Absolute URL, or a scheme-less ``host/path`` string. May be
            empty or ``None``.
        mapping: Optional ``{hostname: school name}`` table. Defaults to the
            module-level ``SCHOOL_MAPPING``.

    Returns:
        The matching school name, or ``"Other"`` when the URL is empty,
        cannot be parsed, or belongs to no known host.
    """
    if not url:
        return "Other"
    if mapping is None:
        mapping = SCHOOL_MAPPING

    url = url.strip()
    try:
        parsed = urlparse(url)
        if not parsed.scheme and not parsed.netloc:
            # Scheme-less input such as "www.hbs.edu/faculty" is parsed as a
            # bare path; re-parse it as a network location instead.
            parsed = urlparse("//" + url.lstrip("/"))
        # .hostname drops credentials and port and lower-cases the host,
        # unlike .netloc, which would defeat the exact comparison below.
        host = (parsed.hostname or "").rstrip(".")
    except ValueError:
        # urlparse rejects malformed authorities (e.g. broken IPv6 literals).
        return "Other"

    if not host:
        return "Other"

    best_pattern = ""
    best_school = "Other"
    for pattern, school_name in mapping.items():
        pattern = pattern.lower()
        if host == pattern:
            # An exact match is necessarily the most specific one.
            return school_name
        # Require a label boundary (".") so that unrelated hosts which merely
        # end with the same characters are not misclassified.
        if host.endswith("." + pattern) and len(pattern) > len(best_pattern):
            best_pattern, best_school = pattern, school_name

    return best_school
|
||||
|
||||
|
||||
def _group_programs_by_school(programs):
    """Bucket raw program records by school and return the buckets sorted by name."""
    schools = {}
    for prog in programs:
        # Classify by the faculty page first; it is the most specific URL.
        faculty_url = prog.get('faculty_page_url', '')
        school_name = determine_school_from_url(faculty_url)

        # Fall back to the program's own URL when the faculty page is
        # missing or does not belong to a known school.
        if school_name == "Other" and prog.get('url'):
            school_name = determine_school_from_url(prog['url'])

        if school_name not in schools:
            schools[school_name] = {
                "name": school_name,
                "url": SCHOOL_URLS.get(school_name, ""),
                "programs": [],
            }

        schools[school_name]["programs"].append({
            "name": prog['name'],
            "url": prog.get('url', ''),
            "degree_type": prog.get('degrees', ''),
            "faculty_page_url": faculty_url,
            # A null/missing faculty list is normalised to [] so that the
            # statistics below (and downstream consumers) can always len() it.
            "faculty": prog.get('faculty') or [],
        })

    return sorted(schools.values(), key=lambda s: s["name"])


def reorganize_data(input_path: str, output_path: str) -> dict:
    """Regroup a flat scraped-program file into a school -> program -> faculty tree.

    Reads the JSON document at ``input_path``, assigns every entry of its
    ``programs`` list to a school via ``determine_school_from_url`` (faculty
    page URL first, program URL as fallback), prints a summary to stdout and
    writes the hierarchical result to ``output_path`` as UTF-8 JSON, creating
    the parent directory when needed.

    Args:
        input_path: Path of the flat input JSON. It must contain a
            ``programs`` list whose items have at least a ``name``. The
            ``total_programs`` / ``total_faculty_found`` counters are only
            used for the log line and are derived from ``programs`` when
            absent.
        output_path: Path of the JSON file to write.

    Returns:
        The hierarchical result dictionary that was written to disk.

    Raises:
        KeyError: If the input has no ``programs`` key or a program has no
            ``name``.
    """
    # Load the original flat data.
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    programs = data['programs']

    # The summary counters are informational only; do not fail on inputs
    # that omit them.
    reported_programs = data.get('total_programs')
    if reported_programs is None:
        reported_programs = len(programs)
    reported_faculty = data.get('total_faculty_found')
    if reported_faculty is None:
        reported_faculty = sum(len(p.get('faculty') or []) for p in programs)

    print(f"读取原始数据: {reported_programs} 个项目, {reported_faculty} 位导师")

    schools_list = _group_programs_by_school(programs)

    # Build the output structure.
    result = {
        "name": "Harvard University",
        "url": "https://www.harvard.edu/",
        "country": "USA",
        "scraped_at": datetime.now(timezone.utc).isoformat(),
        "schools": schools_list
    }

    # Per-school (name, program count, faculty count), computed once and
    # reused for both the totals and the per-school listing.
    school_stats = [
        (
            school['name'],
            len(school['programs']),
            sum(len(p['faculty']) for p in school['programs']),
        )
        for school in schools_list
    ]
    total_programs = sum(prog_count for _, prog_count, _ in school_stats)
    total_faculty = sum(fac_count for _, _, fac_count in school_stats)

    # Print the statistics.
    print("\n" + "=" * 60)
    print("按学院重新组织完成!")
    print("=" * 60)
    print(f"大学: {result['name']}")
    print(f"学院数: {len(schools_list)}")
    print(f"项目数: {total_programs}")
    print(f"导师数: {total_faculty}")

    print("\n各学院统计:")
    for school_name, prog_count, fac_count in school_stats:
        print(f" {school_name}: {prog_count}个项目, {fac_count}位导师")

    # Save the result.
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n结果已保存到: {output_path}")

    return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: convert the flat Harvard scrape into the
    # hierarchical result file. Both paths are relative to the current
    # working directory.
    reorganize_data(
        "artifacts/harvard_programs_with_faculty.json",
        "output/harvard_hierarchical_result.json",
    )
|
||||
Reference in New Issue
Block a user