Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
yangxiaoyu-crypto
2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions

View File

@@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""
将已爬取的Harvard数据按学院重新组织
读取原始扁平数据,按 学院 → 项目 → 导师 层级重新组织输出
"""
import json
from pathlib import Path
from datetime import datetime, timezone
from urllib.parse import urlparse
from collections import defaultdict
# Harvard school mapping — resolve the owning school from a URL's host name.
SCHOOL_MAPPING = {
    "gsas.harvard.edu": "Graduate School of Arts and Sciences (GSAS)",
    "seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
    "hbs.edu": "Harvard Business School (HBS)",
    "www.hbs.edu": "Harvard Business School (HBS)",
    "gsd.harvard.edu": "Graduate School of Design (GSD)",
    "www.gsd.harvard.edu": "Graduate School of Design (GSD)",
    "gse.harvard.edu": "Graduate School of Education (HGSE)",
    "www.gse.harvard.edu": "Graduate School of Education (HGSE)",
    "hks.harvard.edu": "Harvard Kennedy School (HKS)",
    "www.hks.harvard.edu": "Harvard Kennedy School (HKS)",
    "hls.harvard.edu": "Harvard Law School (HLS)",
    "hms.harvard.edu": "Harvard Medical School (HMS)",
    "hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
    "www.hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
    "hds.harvard.edu": "Harvard Divinity School (HDS)",
    "hsdm.harvard.edu": "Harvard School of Dental Medicine (HSDM)",
    "fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
    "aaas.fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
    "dce.harvard.edu": "Division of Continuing Education (DCE)",
    "extension.harvard.edu": "Harvard Extension School",
    "cs.seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
}
# Canonical homepage URL for each school.
SCHOOL_URLS = {
    "Graduate School of Arts and Sciences (GSAS)": "https://gsas.harvard.edu/",
    "John A. Paulson School of Engineering and Applied Sciences (SEAS)": "https://seas.harvard.edu/",
    "Harvard Business School (HBS)": "https://www.hbs.edu/",
    "Graduate School of Design (GSD)": "https://www.gsd.harvard.edu/",
    "Graduate School of Education (HGSE)": "https://www.gse.harvard.edu/",
    "Harvard Kennedy School (HKS)": "https://www.hks.harvard.edu/",
    "Harvard Law School (HLS)": "https://hls.harvard.edu/",
    "Harvard Medical School (HMS)": "https://hms.harvard.edu/",
    "T.H. Chan School of Public Health (HSPH)": "https://www.hsph.harvard.edu/",
    "Harvard Divinity School (HDS)": "https://hds.harvard.edu/",
    "Harvard School of Dental Medicine (HSDM)": "https://hsdm.harvard.edu/",
    "Faculty of Arts and Sciences (FAS)": "https://fas.harvard.edu/",
    "Division of Continuing Education (DCE)": "https://dce.harvard.edu/",
    "Harvard Extension School": "https://extension.harvard.edu/",
    "Other": "https://www.harvard.edu/",
}


def determine_school_from_url(url: str) -> str:
    """Determine which Harvard school a URL belongs to from its host name.

    Args:
        url: Any URL; may be empty/falsy.

    Returns:
        The mapped school display name, or ``"Other"`` when the URL is
        empty or its host matches no known school domain.
    """
    if not url:
        return "Other"
    domain = urlparse(url).netloc.lower()
    # Exact host match first (most specific entry wins,
    # e.g. "cs.seas.harvard.edu" before the generic SEAS domain).
    school = SCHOOL_MAPPING.get(domain)
    if school is not None:
        return school
    # Fall back to dot-prefixed suffix matching so unlisted sub-domains
    # inherit their parent school (e.g. "www2.gsas.harvard.edu" -> GSAS).
    # A bare substring test would produce false positives such as
    # "nothbs.edu" matching the "hbs.edu" pattern.
    for pattern, school_name in SCHOOL_MAPPING.items():
        if domain.endswith("." + pattern):
            return school_name
    return "Other"
def reorganize_data(input_path: str, output_path: str):
    """Regroup the flat Harvard scrape into school -> program -> faculty.

    Reads the flat JSON at *input_path*, buckets every program under the
    school inferred from its faculty-page (or program) URL, prints summary
    statistics, writes the hierarchical result to *output_path*, and
    returns the result dict.

    Args:
        input_path: Path to the flat scrape JSON (expects keys
            ``total_programs``, ``total_faculty_found`` and ``programs``).
        output_path: Destination file for the hierarchical JSON; parent
            directories are created as needed.

    Returns:
        The hierarchical result dictionary that was written to disk.
    """
    # Load the flat scrape produced by the original crawler.
    with open(input_path, 'r', encoding='utf-8') as fh:
        raw = json.load(fh)
    print(f"读取原始数据: {raw['total_programs']} 个项目, {raw['total_faculty_found']} 位导师")

    # Bucket programs by school, creating each school entry lazily.
    buckets = {}
    for entry in raw['programs']:
        # Prefer the faculty-page URL for school attribution; fall back to
        # the program URL when that yields no match.
        faculty_url = entry.get('faculty_page_url', '')
        school = determine_school_from_url(faculty_url)
        if school == "Other" and entry.get('url'):
            school = determine_school_from_url(entry['url'])

        record = {
            "name": entry['name'],
            "url": entry.get('url', ''),
            "degree_type": entry.get('degrees', ''),
            "faculty_page_url": faculty_url,
            "faculty": entry.get('faculty', []),
        }

        bucket = buckets.setdefault(school, {
            "name": school,
            "url": SCHOOL_URLS.get(school, ""),
            "programs": [],
        })
        bucket["programs"].append(record)

    # Deterministic output order: schools sorted alphabetically by name.
    schools = sorted(buckets.values(), key=lambda item: item["name"])

    result = {
        "name": "Harvard University",
        "url": "https://www.harvard.edu/",
        "country": "USA",
        "scraped_at": datetime.now(timezone.utc).isoformat(),
        "schools": schools,
    }

    # Summary statistics.
    rule = "=" * 60
    print("\n" + rule)
    print("按学院重新组织完成!")
    print(rule)
    print(f"大学: {result['name']}")
    print(f"学院数: {len(schools)}")
    program_total = sum(len(s['programs']) for s in schools)
    faculty_total = sum(len(p['faculty']) for s in schools for p in s['programs'])
    print(f"项目数: {program_total}")
    print(f"导师数: {faculty_total}")
    print("\n各学院统计:")
    for school_entry in schools:
        n_programs = len(school_entry['programs'])
        n_faculty = sum(len(p['faculty']) for p in school_entry['programs'])
        print(f"  {school_entry['name']}: {n_programs}个项目, {n_faculty}位导师")

    # Persist the hierarchical result, creating parent directories first.
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'w', encoding='utf-8') as fh:
        json.dump(result, fh, ensure_ascii=False, indent=2)
    print(f"\n结果已保存到: {output_path}")
    return result
if __name__ == "__main__":
    # Default input/output locations when run as a script.
    reorganize_data(
        "artifacts/harvard_programs_with_faculty.json",
        "output/harvard_hierarchical_result.json",
    )