#!/usr/bin/env python3
"""
Reorganize previously scraped Harvard data by school.

Reads the original flat program list and rewrites it as a
school -> program -> faculty hierarchy.
"""

import json
from pathlib import Path
from datetime import datetime, timezone
from urllib.parse import urlparse
from collections import defaultdict

# Harvard school mapping: host name of a page -> school that owns it.
SCHOOL_MAPPING = {
    "gsas.harvard.edu": "Graduate School of Arts and Sciences (GSAS)",
    "seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
    "hbs.edu": "Harvard Business School (HBS)",
    "www.hbs.edu": "Harvard Business School (HBS)",
    "gsd.harvard.edu": "Graduate School of Design (GSD)",
    "www.gsd.harvard.edu": "Graduate School of Design (GSD)",
    "gse.harvard.edu": "Graduate School of Education (HGSE)",
    "www.gse.harvard.edu": "Graduate School of Education (HGSE)",
    "hks.harvard.edu": "Harvard Kennedy School (HKS)",
    "www.hks.harvard.edu": "Harvard Kennedy School (HKS)",
    "hls.harvard.edu": "Harvard Law School (HLS)",
    "hms.harvard.edu": "Harvard Medical School (HMS)",
    "hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
    "www.hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
    "hds.harvard.edu": "Harvard Divinity School (HDS)",
    "hsdm.harvard.edu": "Harvard School of Dental Medicine (HSDM)",
    "fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
    "aaas.fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
    "dce.harvard.edu": "Division of Continuing Education (DCE)",
    "extension.harvard.edu": "Harvard Extension School",
    "cs.seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
}

# School name -> home page URL of that school.
SCHOOL_URLS = {
    "Graduate School of Arts and Sciences (GSAS)": "https://gsas.harvard.edu/",
    "John A. Paulson School of Engineering and Applied Sciences (SEAS)": "https://seas.harvard.edu/",
    "Harvard Business School (HBS)": "https://www.hbs.edu/",
    "Graduate School of Design (GSD)": "https://www.gsd.harvard.edu/",
    "Graduate School of Education (HGSE)": "https://www.gse.harvard.edu/",
    "Harvard Kennedy School (HKS)": "https://www.hks.harvard.edu/",
    "Harvard Law School (HLS)": "https://hls.harvard.edu/",
    "Harvard Medical School (HMS)": "https://hms.harvard.edu/",
    "T.H. Chan School of Public Health (HSPH)": "https://www.hsph.harvard.edu/",
    "Harvard Divinity School (HDS)": "https://hds.harvard.edu/",
    "Harvard School of Dental Medicine (HSDM)": "https://hsdm.harvard.edu/",
    "Faculty of Arts and Sciences (FAS)": "https://fas.harvard.edu/",
    "Division of Continuing Education (DCE)": "https://dce.harvard.edu/",
    "Harvard Extension School": "https://extension.harvard.edu/",
    "Other": "https://www.harvard.edu/",
}


def determine_school_from_url(url: str) -> str:
    """Return the school that owns *url*, judged by its host name.

    The host is looked up in ``SCHOOL_MAPPING`` from most specific to least
    specific: first the full host, then the host with its left-most label
    removed, and so on.  A mapping entry therefore matches the host itself
    and any of its sub-domains, but never an unrelated host that merely
    contains the entry as a substring.

    Args:
        url: Absolute URL of a program or faculty page.  May be empty or
            ``None``.

    Returns:
        The school name from ``SCHOOL_MAPPING``, or ``"Other"`` when the URL
        is empty, has no host, or belongs to no known school.
    """
    if not url:
        return "Other"

    # ``hostname`` (unlike ``netloc``) excludes any port and userinfo and is
    # already lower-cased; a trailing root dot is stripped so that
    # "gsas.harvard.edu." is treated like "gsas.harvard.edu".
    host = (urlparse(url).hostname or "").rstrip(".")
    if not host:
        return "Other"

    labels = host.split(".")
    for start in range(len(labels)):
        school_name = SCHOOL_MAPPING.get(".".join(labels[start:]))
        if school_name is not None:
            return school_name
    return "Other"


def _print_summary(university_name: str, schools_list: list) -> None:
    """Print overall and per-school program/faculty counts to stdout."""
    per_school = [
        (
            school["name"],
            len(school["programs"]),
            sum(len(p["faculty"]) for p in school["programs"]),
        )
        for school in schools_list
    ]

    print("\n" + "=" * 60)
    print("按学院重新组织完成!")
    print("=" * 60)
    print(f"大学: {university_name}")
    print(f"学院数: {len(schools_list)}")
    print(f"项目数: {sum(prog_count for _, prog_count, _ in per_school)}")
    print(f"导师数: {sum(fac_count for _, _, fac_count in per_school)}")

    print("\n各学院统计:")
    for name, prog_count, fac_count in per_school:
        print(f" {name}: {prog_count}个项目, {fac_count}位导师")


def reorganize_data(input_path: str, output_path: str) -> dict:
    """Regroup the flat scraped program list into a per-school hierarchy.

    Reads the JSON file at *input_path*, assigns every program to a school
    (by its ``faculty_page_url`` first, falling back to its own ``url``),
    prints summary statistics, and writes the hierarchical result as JSON to
    *output_path*, creating parent directories as needed.

    Args:
        input_path: Path of the JSON file holding a ``"programs"`` list.
        output_path: Path of the JSON file to write.

    Returns:
        The hierarchical result dict that was written to *output_path*.

    Raises:
        KeyError: If the input has no ``"programs"`` key or a program has no
            ``"name"`` key.
    """
    # Load the original flat data.
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    raw_programs = data['programs']

    # The pre-computed counters are only used for this log line; fall back to
    # counting the list when the input file does not carry them.
    reported_programs = data.get('total_programs', len(raw_programs))
    reported_faculty = data.get(
        'total_faculty_found',
        sum(len(prog.get('faculty') or []) for prog in raw_programs),
    )
    print(f"读取原始数据: {reported_programs} 个项目, {reported_faculty} 位导师")

    # Group programs by school name.
    programs_by_school = defaultdict(list)
    for prog in raw_programs:
        # Decide the school from the faculty page URL ...
        faculty_url = prog.get('faculty_page_url', '')
        school_name = determine_school_from_url(faculty_url)

        # ... and fall back to the program's own URL when that is inconclusive.
        if school_name == "Other" and prog.get('url'):
            school_name = determine_school_from_url(prog['url'])

        programs_by_school[school_name].append({
            "name": prog['name'],
            "url": prog.get('url', ''),
            "degree_type": prog.get('degrees', ''),
            "faculty_page_url": faculty_url,
            # A JSON null would otherwise break the len() calls in the
            # statistics, so normalise a missing/null list to [].
            "faculty": prog.get('faculty') or [],
        })

    # One entry per school, sorted by school name.
    schools_list = [
        {
            "name": school_name,
            "url": SCHOOL_URLS.get(school_name, ""),
            "programs": programs,
        }
        for school_name, programs in sorted(programs_by_school.items())
    ]

    # Build the output structure.
    result = {
        "name": "Harvard University",
        "url": "https://www.harvard.edu/",
        "country": "USA",
        "scraped_at": datetime.now(timezone.utc).isoformat(),
        "schools": schools_list
    }

    _print_summary(result['name'], schools_list)

    # Save the result.
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n结果已保存到: {output_path}")
    return result


if __name__ == "__main__":
    input_file = "artifacts/harvard_programs_with_faculty.json"
    output_file = "output/harvard_hierarchical_result.json"
    reorganize_data(input_file, output_file)