Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: yangxiaoyu-crypto
Date: 2025-12-22 15:25:08 +08:00
Parent: 2714c8ad5c
Commit: 426cf4d2cd
75 changed files with 13,527 additions and 2 deletions


@@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""
将已爬取的Harvard数据按学院重新组织
读取原始扁平数据,按 学院 → 项目 → 导师 层级重新组织输出
"""
import json
from pathlib import Path
from datetime import datetime, timezone
from urllib.parse import urlparse
from collections import defaultdict
# Harvard school mapping - determine the school from the URL subdomain
SCHOOL_MAPPING = {
"gsas.harvard.edu": "Graduate School of Arts and Sciences (GSAS)",
"seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
"hbs.edu": "Harvard Business School (HBS)",
"www.hbs.edu": "Harvard Business School (HBS)",
"gsd.harvard.edu": "Graduate School of Design (GSD)",
"www.gsd.harvard.edu": "Graduate School of Design (GSD)",
"gse.harvard.edu": "Graduate School of Education (HGSE)",
"www.gse.harvard.edu": "Graduate School of Education (HGSE)",
"hks.harvard.edu": "Harvard Kennedy School (HKS)",
"www.hks.harvard.edu": "Harvard Kennedy School (HKS)",
"hls.harvard.edu": "Harvard Law School (HLS)",
"hms.harvard.edu": "Harvard Medical School (HMS)",
"hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
"www.hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
"hds.harvard.edu": "Harvard Divinity School (HDS)",
"hsdm.harvard.edu": "Harvard School of Dental Medicine (HSDM)",
"fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
"aaas.fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
"dce.harvard.edu": "Division of Continuing Education (DCE)",
"extension.harvard.edu": "Harvard Extension School",
"cs.seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
}
# School URL mapping
SCHOOL_URLS = {
"Graduate School of Arts and Sciences (GSAS)": "https://gsas.harvard.edu/",
"John A. Paulson School of Engineering and Applied Sciences (SEAS)": "https://seas.harvard.edu/",
"Harvard Business School (HBS)": "https://www.hbs.edu/",
"Graduate School of Design (GSD)": "https://www.gsd.harvard.edu/",
"Graduate School of Education (HGSE)": "https://www.gse.harvard.edu/",
"Harvard Kennedy School (HKS)": "https://www.hks.harvard.edu/",
"Harvard Law School (HLS)": "https://hls.harvard.edu/",
"Harvard Medical School (HMS)": "https://hms.harvard.edu/",
"T.H. Chan School of Public Health (HSPH)": "https://www.hsph.harvard.edu/",
"Harvard Divinity School (HDS)": "https://hds.harvard.edu/",
"Harvard School of Dental Medicine (HSDM)": "https://hsdm.harvard.edu/",
"Faculty of Arts and Sciences (FAS)": "https://fas.harvard.edu/",
"Division of Continuing Education (DCE)": "https://dce.harvard.edu/",
"Harvard Extension School": "https://extension.harvard.edu/",
"Other": "https://www.harvard.edu/",
}

def determine_school_from_url(url: str) -> str:
    """Determine the school from a URL."""
    if not url:
        return "Other"
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    # Try an exact match first
    for pattern, school_name in SCHOOL_MAPPING.items():
        if domain == pattern:
            return school_name
    # Then fall back to a partial match
    for pattern, school_name in SCHOOL_MAPPING.items():
        if pattern in domain:
            return school_name
    return "Other"

def reorganize_data(input_path: str, output_path: str):
    """Reorganize the data into a per-school hierarchy."""
    # Read the original data
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"Loaded original data: {data['total_programs']} programs, {data['total_faculty_found']} faculty")
    # Group by school
    schools_dict = defaultdict(lambda: {"name": "", "url": "", "programs": []})
    for prog in data['programs']:
        # Determine the school from faculty_page_url
        faculty_url = prog.get('faculty_page_url', '')
        school_name = determine_school_from_url(faculty_url)
        # If there is no faculty_page_url, try to infer it from the program URL
        if school_name == "Other" and prog.get('url'):
            school_name = determine_school_from_url(prog['url'])
        # Build the program object
        program = {
            "name": prog['name'],
            "url": prog.get('url', ''),
            "degree_type": prog.get('degrees', ''),
            "faculty_page_url": faculty_url,
            "faculty": prog.get('faculty', [])
        }
        # Attach it to its school
        if not schools_dict[school_name]["name"]:
            schools_dict[school_name]["name"] = school_name
            schools_dict[school_name]["url"] = SCHOOL_URLS.get(school_name, "")
        schools_dict[school_name]["programs"].append(program)
    # Convert to a sorted list
    schools_list = sorted(schools_dict.values(), key=lambda s: s["name"])
    # Build the output structure
    result = {
        "name": "Harvard University",
        "url": "https://www.harvard.edu/",
        "country": "USA",
        "scraped_at": datetime.now(timezone.utc).isoformat(),
        "schools": schools_list
    }
    # Print statistics
    print("\n" + "=" * 60)
    print("Reorganization by school complete!")
    print("=" * 60)
    print(f"University: {result['name']}")
    print(f"Schools: {len(schools_list)}")
    total_programs = sum(len(s['programs']) for s in schools_list)
    total_faculty = sum(len(p['faculty']) for s in schools_list for p in s['programs'])
    print(f"Programs: {total_programs}")
    print(f"Faculty: {total_faculty}")
    print("\nPer-school statistics:")
    for school in schools_list:
        prog_count = len(school['programs'])
        fac_count = sum(len(p['faculty']) for p in school['programs'])
        print(f"  {school['name']}: {prog_count} programs, {fac_count} faculty")
    # Save the result
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"\nResult saved to: {output_path}")
    return result

if __name__ == "__main__":
    input_file = "artifacts/harvard_programs_with_faculty.json"
    output_file = "output/harvard_hierarchical_result.json"
    reorganize_data(input_file, output_file)
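
For reference, the domain resolution above behaves as follows (illustrative checks derived from SCHOOL_MAPPING, not from a real run):

# Illustrative behavior of determine_school_from_url; expected values are
# derived from SCHOOL_MAPPING above, not from actual scraper output.
assert determine_school_from_url("https://seas.harvard.edu/faculty") == \
    "John A. Paulson School of Engineering and Applied Sciences (SEAS)"
# "www.seas.harvard.edu" is not an exact key, so the partial match catches it:
assert determine_school_from_url("https://www.seas.harvard.edu/programs") == \
    "John A. Paulson School of Engineering and Applied Sciences (SEAS)"
assert determine_school_from_url("") == "Other"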

scripts/start_backend.py Normal file

@@ -0,0 +1,45 @@
#!/usr/bin/env python3
"""
启动后端API服务 (本地开发)
"""
import subprocess
import sys
import os
# 切换到项目根目录
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
os.chdir(project_root)
# 添加backend到Python路径
backend_path = os.path.join(project_root, "backend")
sys.path.insert(0, backend_path)
print("=" * 60)
print("启动大学爬虫 Web API 服务")
print("=" * 60)
print(f"项目目录: {project_root}")
print(f"后端目录: {backend_path}")
print()
# 检查是否安装了依赖
try:
import fastapi
import uvicorn
except ImportError:
print("正在安装后端依赖...")
subprocess.run([sys.executable, "-m", "pip", "install", "-r", "backend/requirements.txt"])
# 初始化数据库
print("初始化数据库...")
os.chdir(backend_path)
# 启动服务
print()
print("启动 FastAPI 服务...")
print("API文档: http://localhost:8000/docs")
print("Swagger UI: http://localhost:8000/redoc")
print()
import uvicorn
uvicorn.run("app.main:app", host="0.0.0.0", port=8000, reload=True)
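
Once the service is up, a quick smoke test (a sketch; it assumes the `requests` package is available and uses only the /docs route the script itself advertises):

import requests

# Hit the FastAPI docs page to confirm the server is serving requests.
resp = requests.get("http://localhost:8000/docs", timeout=5)
print(resp.status_code)  # expect 200 once uvicorn is running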

scripts/start_dev.bat Normal file

@@ -0,0 +1,42 @@
@echo off
echo ============================================================
echo University Scraper Web System - Local Development Startup
echo ============================================================
echo.
echo Starting backend API service...
cd /d "%~dp0..\backend"
REM Install backend dependencies
pip install -r requirements.txt -q
REM Start the backend
start cmd /k "cd /d %~dp0..\backend && uvicorn app.main:app --reload --port 8000"
echo Backend started: http://localhost:8000
echo API docs: http://localhost:8000/docs
echo.
echo Starting frontend service...
cd /d "%~dp0..\frontend"
REM Install frontend dependencies
if not exist node_modules (
    echo Installing frontend dependencies...
    npm install
)
REM Start the frontend
start cmd /k "cd /d %~dp0..\frontend && npm run dev"
echo Frontend started: http://localhost:3000
echo.
echo ============================================================
echo System startup complete!
echo.
echo Backend API: http://localhost:8000/docs
echo Frontend: http://localhost:3000
echo ============================================================
pause
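
For non-Windows development, a rough cross-platform equivalent of the batch script (a sketch only; it assumes the same backend/ and frontend/ layout and that dependencies are already installed; this script does not exist in the commit):

import subprocess
import sys
from pathlib import Path

root = Path(__file__).resolve().parent.parent
# Start uvicorn and the frontend dev server as child processes
backend = subprocess.Popen(
    [sys.executable, "-m", "uvicorn", "app.main:app", "--reload", "--port", "8000"],
    cwd=root / "backend",
)
frontend = subprocess.Popen(["npm", "run", "dev"], cwd=root / "frontend")
try:
    backend.wait()
finally:
    frontend.terminate()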

scripts/test_harvard.py Normal file

@ -0,0 +1,126 @@
#!/usr/bin/env python3
"""
测试Harvard大学爬取 - 只测试2个学院
"""
import asyncio
import sys
from pathlib import Path
# Add the project path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from university_scraper.config import ScraperConfig
from university_scraper.scraper import UniversityScraper
# Simplified test config - only 2 schools
TEST_CONFIG = {
"university": {
"name": "Harvard University",
"url": "https://www.harvard.edu/",
"country": "USA"
},
"schools": {
"discovery_method": "static_list",
"static_list": [
{
"name": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
"url": "https://seas.harvard.edu/"
},
{
"name": "Graduate School of Design (GSD)",
"url": "https://www.gsd.harvard.edu/"
}
]
},
"programs": {
"paths_to_try": [
"/academics/graduate-programs",
"/programs",
"/academics/programs",
"/graduate"
],
"link_patterns": [
{"text_contains": ["program", "degree"], "href_contains": ["/program", "/degree"]},
{"text_contains": ["master", "graduate"], "href_contains": ["/master", "/graduate"]}
],
"selectors": {
"program_item": "div.program-item, li.program, a[href*='/program']",
"program_name": "h3, .title",
"program_url": "a[href]",
"degree_type": ".degree"
},
"pagination": {"type": "none"}
},
"faculty": {
"discovery_strategies": [
{
"type": "link_in_page",
"patterns": [
{"text_contains": ["faculty", "people"], "href_contains": ["/faculty", "/people"]}
]
},
{
"type": "url_pattern",
"patterns": [
"{school_url}/faculty",
"{school_url}/people"
]
}
],
"selectors": {
"faculty_item": "div.faculty, li.person",
"faculty_name": "h3, .name",
"faculty_url": "a[href*='/people/'], a[href*='/faculty/']"
}
},
"filters": {
"program_degree_types": {
"include": ["Master", "M.S.", "M.A.", "MBA", "M.Eng", "S.M."],
"exclude": ["Ph.D.", "Doctor", "Bachelor"]
},
"exclude_schools": []
}
}

async def test_harvard():
    """Test the Harvard scrape."""
    print("=" * 60)
    print("Testing Harvard scrape (simplified: 2 schools)")
    print("=" * 60)
    config = ScraperConfig.from_dict(TEST_CONFIG)
    async with UniversityScraper(config, headless=False) as scraper:
        university = await scraper.scrape()
        scraper.save_results("output/harvard_test_result.json")
        # Print detailed results
        print("\n" + "=" * 60)
        print("Detailed results:")
        print("=" * 60)
        for school in university.schools:
            print(f"\nSchool: {school.name}")
            print(f"  URL: {school.url}")
            print(f"  Programs: {len(school.programs)}")
            for prog in school.programs[:5]:
                print(f"\n  Program: {prog.name}")
                print(f"    URL: {prog.url}")
                print(f"    Degree: {prog.degree_type}")
                print(f"    Faculty: {len(prog.faculty)}")
                if prog.faculty:
                    print("    Faculty examples:")
                    for f in prog.faculty[:3]:
                        print(f"      - {f.name}: {f.url}")
            if len(school.programs) > 5:
                print(f"\n  ... {len(school.programs) - 5} more programs")
if __name__ == "__main__":
asyncio.run(test_harvard())
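
To point the same test at another school, only the static_list entry needs to change (a sketch; the GSAS name and URL are taken from the mapping in the reorganization script above, while the output path is illustrative):

# Swap the target school in the test config and re-run headless.
TEST_CONFIG["schools"]["static_list"] = [
    {"name": "Graduate School of Arts and Sciences (GSAS)",
     "url": "https://gsas.harvard.edu/"},
]

async def run_gsas():
    config = ScraperConfig.from_dict(TEST_CONFIG)
    # headless=True avoids opening a visible browser window
    async with UniversityScraper(config, headless=True) as scraper:
        await scraper.scrape()
        scraper.save_results("output/harvard_gsas_test.json")

asyncio.run(run_gsas())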