University-Playwright-Codeg…/scripts/reorganize_by_school.py

#!/usr/bin/env python3
"""
Reorganize the scraped Harvard data by school.

Reads the original flat data and rewrites it as a school → program → faculty hierarchy.
"""
import json
from pathlib import Path
from datetime import datetime, timezone
from urllib.parse import urlparse
from collections import defaultdict

# Harvard school mapping - determine the owning school from the URL subdomain
SCHOOL_MAPPING = {
    "gsas.harvard.edu": "Graduate School of Arts and Sciences (GSAS)",
    "seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
    "hbs.edu": "Harvard Business School (HBS)",
    "www.hbs.edu": "Harvard Business School (HBS)",
    "gsd.harvard.edu": "Graduate School of Design (GSD)",
    "www.gsd.harvard.edu": "Graduate School of Design (GSD)",
    "gse.harvard.edu": "Graduate School of Education (HGSE)",
    "www.gse.harvard.edu": "Graduate School of Education (HGSE)",
    "hks.harvard.edu": "Harvard Kennedy School (HKS)",
    "www.hks.harvard.edu": "Harvard Kennedy School (HKS)",
    "hls.harvard.edu": "Harvard Law School (HLS)",
    "hms.harvard.edu": "Harvard Medical School (HMS)",
    "hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
    "www.hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
    "hds.harvard.edu": "Harvard Divinity School (HDS)",
    "hsdm.harvard.edu": "Harvard School of Dental Medicine (HSDM)",
    "fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
    "aaas.fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
    "dce.harvard.edu": "Division of Continuing Education (DCE)",
    "extension.harvard.edu": "Harvard Extension School",
    "cs.seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
}

# School homepage URLs
SCHOOL_URLS = {
    "Graduate School of Arts and Sciences (GSAS)": "https://gsas.harvard.edu/",
    "John A. Paulson School of Engineering and Applied Sciences (SEAS)": "https://seas.harvard.edu/",
    "Harvard Business School (HBS)": "https://www.hbs.edu/",
    "Graduate School of Design (GSD)": "https://www.gsd.harvard.edu/",
    "Graduate School of Education (HGSE)": "https://www.gse.harvard.edu/",
    "Harvard Kennedy School (HKS)": "https://www.hks.harvard.edu/",
    "Harvard Law School (HLS)": "https://hls.harvard.edu/",
    "Harvard Medical School (HMS)": "https://hms.harvard.edu/",
    "T.H. Chan School of Public Health (HSPH)": "https://www.hsph.harvard.edu/",
    "Harvard Divinity School (HDS)": "https://hds.harvard.edu/",
    "Harvard School of Dental Medicine (HSDM)": "https://hsdm.harvard.edu/",
    "Faculty of Arts and Sciences (FAS)": "https://fas.harvard.edu/",
    "Division of Continuing Education (DCE)": "https://dce.harvard.edu/",
    "Harvard Extension School": "https://extension.harvard.edu/",
    "Other": "https://www.harvard.edu/",
}


def determine_school_from_url(url: str) -> str:
    """Determine the owning school from a URL."""
    if not url:
        return "Other"
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    # Try an exact domain match first
    for pattern, school_name in SCHOOL_MAPPING.items():
        if domain == pattern:
            return school_name
    # Then fall back to a substring match
    for pattern, school_name in SCHOOL_MAPPING.items():
        if pattern in domain:
            return school_name
    return "Other"


def reorganize_data(input_path: str, output_path: str):
    """Reorganize the data into a per-school hierarchy."""
    # Read the original flat data
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"Loaded original data: {data['total_programs']} programs, {data['total_faculty_found']} faculty")

    # Group programs by school
    schools_dict = defaultdict(lambda: {"name": "", "url": "", "programs": []})
    for prog in data['programs']:
        # Determine the school from faculty_page_url
        faculty_url = prog.get('faculty_page_url', '')
        school_name = determine_school_from_url(faculty_url)
        # If there is no faculty_page_url, try to infer the school from the program URL
        if school_name == "Other" and prog.get('url'):
            school_name = determine_school_from_url(prog['url'])
        # Build the program object
        program = {
            "name": prog['name'],
            "url": prog.get('url', ''),
            "degree_type": prog.get('degrees', ''),
            "faculty_page_url": faculty_url,
            "faculty": prog.get('faculty', [])
        }
        # Attach it to its school
        if not schools_dict[school_name]["name"]:
            schools_dict[school_name]["name"] = school_name
            schools_dict[school_name]["url"] = SCHOOL_URLS.get(school_name, "")
        schools_dict[school_name]["programs"].append(program)

    # Convert to a list sorted by school name
    schools_list = sorted(schools_dict.values(), key=lambda s: s["name"])

    # Build the output structure
    result = {
        "name": "Harvard University",
        "url": "https://www.harvard.edu/",
        "country": "USA",
        "scraped_at": datetime.now(timezone.utc).isoformat(),
        "schools": schools_list
    }

    # Print statistics
    print("\n" + "=" * 60)
    print("Reorganization by school complete!")
    print("=" * 60)
    print(f"University: {result['name']}")
    print(f"Schools: {len(schools_list)}")
    total_programs = sum(len(s['programs']) for s in schools_list)
    total_faculty = sum(len(p['faculty']) for s in schools_list for p in s['programs'])
    print(f"Programs: {total_programs}")
    print(f"Faculty: {total_faculty}")
    print("\nPer-school breakdown:")
    for school in schools_list:
        prog_count = len(school['programs'])
        fac_count = sum(len(p['faculty']) for p in school['programs'])
        print(f"  {school['name']}: {prog_count} programs, {fac_count} faculty")

    # Save the result
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"\nResult saved to: {output_path}")
    return result
if __name__ == "__main__":
input_file = "artifacts/harvard_programs_with_faculty.json"
output_file = "output/harvard_hierarchical_result.json"
reorganize_data(input_file, output_file)
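
# Usage sketch: the input/output paths are the defaults hard-coded above and are
# assumed to be relative to the repository root; adjust them as needed.
#   python scripts/reorganize_by_school.py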