Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: yangxiaoyu-crypto
Date: 2025-12-22 15:25:08 +08:00
Parent: 2714c8ad5c
Commit: 426cf4d2cd
75 changed files with 13,527 additions and 2 deletions


@@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""
将已爬取的Harvard数据按学院重新组织
读取原始扁平数据,按 学院 → 项目 → 导师 层级重新组织输出
"""
import json
from pathlib import Path
from datetime import datetime, timezone
from urllib.parse import urlparse
from collections import defaultdict
# Harvard school mapping - determine the school from the URL subdomain
SCHOOL_MAPPING = {
"gsas.harvard.edu": "Graduate School of Arts and Sciences (GSAS)",
"seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
"hbs.edu": "Harvard Business School (HBS)",
"www.hbs.edu": "Harvard Business School (HBS)",
"gsd.harvard.edu": "Graduate School of Design (GSD)",
"www.gsd.harvard.edu": "Graduate School of Design (GSD)",
"gse.harvard.edu": "Graduate School of Education (HGSE)",
"www.gse.harvard.edu": "Graduate School of Education (HGSE)",
"hks.harvard.edu": "Harvard Kennedy School (HKS)",
"www.hks.harvard.edu": "Harvard Kennedy School (HKS)",
"hls.harvard.edu": "Harvard Law School (HLS)",
"hms.harvard.edu": "Harvard Medical School (HMS)",
"hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
"www.hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
"hds.harvard.edu": "Harvard Divinity School (HDS)",
"hsdm.harvard.edu": "Harvard School of Dental Medicine (HSDM)",
"fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
"aaas.fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
"dce.harvard.edu": "Division of Continuing Education (DCE)",
"extension.harvard.edu": "Harvard Extension School",
"cs.seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
}
# School URL mapping
SCHOOL_URLS = {
"Graduate School of Arts and Sciences (GSAS)": "https://gsas.harvard.edu/",
"John A. Paulson School of Engineering and Applied Sciences (SEAS)": "https://seas.harvard.edu/",
"Harvard Business School (HBS)": "https://www.hbs.edu/",
"Graduate School of Design (GSD)": "https://www.gsd.harvard.edu/",
"Graduate School of Education (HGSE)": "https://www.gse.harvard.edu/",
"Harvard Kennedy School (HKS)": "https://www.hks.harvard.edu/",
"Harvard Law School (HLS)": "https://hls.harvard.edu/",
"Harvard Medical School (HMS)": "https://hms.harvard.edu/",
"T.H. Chan School of Public Health (HSPH)": "https://www.hsph.harvard.edu/",
"Harvard Divinity School (HDS)": "https://hds.harvard.edu/",
"Harvard School of Dental Medicine (HSDM)": "https://hsdm.harvard.edu/",
"Faculty of Arts and Sciences (FAS)": "https://fas.harvard.edu/",
"Division of Continuing Education (DCE)": "https://dce.harvard.edu/",
"Harvard Extension School": "https://extension.harvard.edu/",
"Other": "https://www.harvard.edu/",
}

def determine_school_from_url(url: str) -> str:
    """Determine the school from a URL."""
    if not url:
        return "Other"
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    # Try an exact match first
    for pattern, school_name in SCHOOL_MAPPING.items():
        if domain == pattern:
            return school_name
    # Then fall back to a partial match
    for pattern, school_name in SCHOOL_MAPPING.items():
        if pattern in domain:
            return school_name
    return "Other"

def reorganize_data(input_path: str, output_path: str):
    """Reorganize the data into a per-school hierarchy."""
    # Read the original data
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"Loaded original data: {data['total_programs']} programs, {data['total_faculty_found']} faculty")
    # Group by school
    schools_dict = defaultdict(lambda: {"name": "", "url": "", "programs": []})
    for prog in data['programs']:
        # Determine the school from faculty_page_url
        faculty_url = prog.get('faculty_page_url', '')
        school_name = determine_school_from_url(faculty_url)
        # If there is no faculty_page_url, try to infer it from the program URL
        if school_name == "Other" and prog.get('url'):
            school_name = determine_school_from_url(prog['url'])
        # Build the program object
        program = {
            "name": prog['name'],
            "url": prog.get('url', ''),
            "degree_type": prog.get('degrees', ''),
            "faculty_page_url": faculty_url,
            "faculty": prog.get('faculty', [])
        }
        # Attach it to its school
        if not schools_dict[school_name]["name"]:
            schools_dict[school_name]["name"] = school_name
            schools_dict[school_name]["url"] = SCHOOL_URLS.get(school_name, "")
        schools_dict[school_name]["programs"].append(program)
    # Convert to a sorted list
    schools_list = sorted(schools_dict.values(), key=lambda s: s["name"])
    # Build the output structure
    result = {
        "name": "Harvard University",
        "url": "https://www.harvard.edu/",
        "country": "USA",
        "scraped_at": datetime.now(timezone.utc).isoformat(),
        "schools": schools_list
    }
    # Print statistics
    print("\n" + "=" * 60)
    print("Reorganization by school complete!")
    print("=" * 60)
    print(f"University: {result['name']}")
    print(f"Schools: {len(schools_list)}")
    total_programs = sum(len(s['programs']) for s in schools_list)
    total_faculty = sum(len(p['faculty']) for s in schools_list for p in s['programs'])
    print(f"Programs: {total_programs}")
    print(f"Faculty: {total_faculty}")
    print("\nPer-school statistics:")
    for school in schools_list:
        prog_count = len(school['programs'])
        fac_count = sum(len(p['faculty']) for p in school['programs'])
        print(f"  {school['name']}: {prog_count} programs, {fac_count} faculty")
    # Save the result
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"\nResult saved to: {output_path}")
    return result

if __name__ == "__main__":
    input_file = "artifacts/harvard_programs_with_faculty.json"
    output_file = "output/harvard_hierarchical_result.json"
    reorganize_data(input_file, output_file)
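
For reference, the domain resolution above behaves as follows (illustrative checks derived from SCHOOL_MAPPING, not from a real run):

# Illustrative behavior of determine_school_from_url; expected values are
# derived from SCHOOL_MAPPING above, not from actual scraper output.
assert determine_school_from_url("https://seas.harvard.edu/faculty") == \
    "John A. Paulson School of Engineering and Applied Sciences (SEAS)"
# "www.seas.harvard.edu" is not an exact key, so the partial match catches it:
assert determine_school_from_url("https://www.seas.harvard.edu/programs") == \
    "John A. Paulson School of Engineering and Applied Sciences (SEAS)"
assert determine_school_from_url("") == "Other"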

scripts/start_backend.py Normal file

@@ -0,0 +1,45 @@
#!/usr/bin/env python3
"""
启动后端API服务 (本地开发)
"""
import subprocess
import sys
import os
# 切换到项目根目录
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
os.chdir(project_root)
# 添加backend到Python路径
backend_path = os.path.join(project_root, "backend")
sys.path.insert(0, backend_path)
print("=" * 60)
print("启动大学爬虫 Web API 服务")
print("=" * 60)
print(f"项目目录: {project_root}")
print(f"后端目录: {backend_path}")
print()
# 检查是否安装了依赖
try:
import fastapi
import uvicorn
except ImportError:
print("正在安装后端依赖...")
subprocess.run([sys.executable, "-m", "pip", "install", "-r", "backend/requirements.txt"])
# 初始化数据库
print("初始化数据库...")
os.chdir(backend_path)
# 启动服务
print()
print("启动 FastAPI 服务...")
print("API文档: http://localhost:8000/docs")
print("Swagger UI: http://localhost:8000/redoc")
print()
import uvicorn
uvicorn.run("app.main:app", host="0.0.0.0", port=8000, reload=True)
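
Once the service is up, a quick smoke test (a sketch; it assumes the `requests` package is available and uses only the /docs route the script itself advertises):

import requests

# Hit the FastAPI docs page to confirm the server is serving requests.
resp = requests.get("http://localhost:8000/docs", timeout=5)
print(resp.status_code)  # expect 200 once uvicorn is running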

scripts/start_dev.bat Normal file

@@ -0,0 +1,42 @@
@echo off
echo ============================================================
echo University Scraper Web System - Local Development Startup
echo ============================================================
echo.
echo Starting backend API service...
cd /d "%~dp0..\backend"
REM Install backend dependencies
pip install -r requirements.txt -q
REM Start the backend
start cmd /k "cd /d %~dp0..\backend && uvicorn app.main:app --reload --port 8000"
echo Backend started: http://localhost:8000
echo API docs: http://localhost:8000/docs
echo.
echo Starting frontend service...
cd /d "%~dp0..\frontend"
REM Install frontend dependencies
if not exist node_modules (
    echo Installing frontend dependencies...
    npm install
)
REM Start the frontend
start cmd /k "cd /d %~dp0..\frontend && npm run dev"
echo Frontend started: http://localhost:3000
echo.
echo ============================================================
echo System startup complete!
echo.
echo Backend API: http://localhost:8000/docs
echo Frontend: http://localhost:3000
echo ============================================================
pause
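
For non-Windows development, a rough cross-platform equivalent of the batch script (a sketch only; it assumes the same backend/ and frontend/ layout and that dependencies are already installed; this script does not exist in the commit):

import subprocess
import sys
from pathlib import Path

root = Path(__file__).resolve().parent.parent
# Start uvicorn and the frontend dev server as child processes
backend = subprocess.Popen(
    [sys.executable, "-m", "uvicorn", "app.main:app", "--reload", "--port", "8000"],
    cwd=root / "backend",
)
frontend = subprocess.Popen(["npm", "run", "dev"], cwd=root / "frontend")
try:
    backend.wait()
finally:
    frontend.terminate()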

scripts/test_harvard.py Normal file

@ -0,0 +1,126 @@
#!/usr/bin/env python3
"""
测试Harvard大学爬取 - 只测试2个学院
"""
import asyncio
import sys
from pathlib import Path
# Add the project path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from university_scraper.config import ScraperConfig
from university_scraper.scraper import UniversityScraper
# Simplified test config - only 2 schools
TEST_CONFIG = {
"university": {
"name": "Harvard University",
"url": "https://www.harvard.edu/",
"country": "USA"
},
"schools": {
"discovery_method": "static_list",
"static_list": [
{
"name": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
"url": "https://seas.harvard.edu/"
},
{
"name": "Graduate School of Design (GSD)",
"url": "https://www.gsd.harvard.edu/"
}
]
},
"programs": {
"paths_to_try": [
"/academics/graduate-programs",
"/programs",
"/academics/programs",
"/graduate"
],
"link_patterns": [
{"text_contains": ["program", "degree"], "href_contains": ["/program", "/degree"]},
{"text_contains": ["master", "graduate"], "href_contains": ["/master", "/graduate"]}
],
"selectors": {
"program_item": "div.program-item, li.program, a[href*='/program']",
"program_name": "h3, .title",
"program_url": "a[href]",
"degree_type": ".degree"
},
"pagination": {"type": "none"}
},
"faculty": {
"discovery_strategies": [
{
"type": "link_in_page",
"patterns": [
{"text_contains": ["faculty", "people"], "href_contains": ["/faculty", "/people"]}
]
},
{
"type": "url_pattern",
"patterns": [
"{school_url}/faculty",
"{school_url}/people"
]
}
],
"selectors": {
"faculty_item": "div.faculty, li.person",
"faculty_name": "h3, .name",
"faculty_url": "a[href*='/people/'], a[href*='/faculty/']"
}
},
"filters": {
"program_degree_types": {
"include": ["Master", "M.S.", "M.A.", "MBA", "M.Eng", "S.M."],
"exclude": ["Ph.D.", "Doctor", "Bachelor"]
},
"exclude_schools": []
}
}

async def test_harvard():
    """Test the Harvard scrape."""
    print("=" * 60)
    print("Testing Harvard scrape (simplified: 2 schools)")
    print("=" * 60)
    config = ScraperConfig.from_dict(TEST_CONFIG)
    async with UniversityScraper(config, headless=False) as scraper:
        university = await scraper.scrape()
        scraper.save_results("output/harvard_test_result.json")
        # Print detailed results
        print("\n" + "=" * 60)
        print("Detailed results:")
        print("=" * 60)
        for school in university.schools:
            print(f"\nSchool: {school.name}")
            print(f"  URL: {school.url}")
            print(f"  Programs: {len(school.programs)}")
            for prog in school.programs[:5]:
                print(f"\n  Program: {prog.name}")
                print(f"    URL: {prog.url}")
                print(f"    Degree: {prog.degree_type}")
                print(f"    Faculty: {len(prog.faculty)}")
                if prog.faculty:
                    print("    Faculty examples:")
                    for f in prog.faculty[:3]:
                        print(f"      - {f.name}: {f.url}")
            if len(school.programs) > 5:
                print(f"\n  ... {len(school.programs) - 5} more programs")
if __name__ == "__main__":
asyncio.run(test_harvard())
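
To point the same test at another school, only the static_list entry needs to change (a sketch; the GSAS name and URL are taken from the mapping in the reorganization script above, while the output path is illustrative):

# Swap the target school in the test config and re-run headless.
TEST_CONFIG["schools"]["static_list"] = [
    {"name": "Graduate School of Arts and Sciences (GSAS)",
     "url": "https://gsas.harvard.edu/"},
]

async def run_gsas():
    config = ScraperConfig.from_dict(TEST_CONFIG)
    # headless=True avoids opening a visible browser window
    async with UniversityScraper(config, headless=True) as scraper:
        await scraper.scrape()
        scraper.save_results("output/harvard_gsas_test.json")

asyncio.run(run_gsas())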