Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI - Add backend FastAPI service with API endpoints and database models - Add frontend React app with university management pages - Add configs for Harvard, Manchester, and UCL universities - Add artifacts with various scraper implementations - Add Docker compose configuration for deployment - Update .gitignore to exclude generated files 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
164
scripts/reorganize_by_school.py
Normal file
164
scripts/reorganize_by_school.py
Normal file
@ -0,0 +1,164 @@
|
||||
#!/usr/bin/env python3
"""
Reorganize already-scraped Harvard data by school.

Reads the original flat data and re-emits it organized hierarchically as
school -> program -> faculty.
"""

import json
from pathlib import Path
from datetime import datetime, timezone
from urllib.parse import urlparse
from collections import defaultdict
|
||||
|
||||
# Harvard school mapping - the URL subdomain determines which school a page belongs to.
SCHOOL_MAPPING = {
    "gsas.harvard.edu": "Graduate School of Arts and Sciences (GSAS)",
    "seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
    "hbs.edu": "Harvard Business School (HBS)",
    "www.hbs.edu": "Harvard Business School (HBS)",
    "gsd.harvard.edu": "Graduate School of Design (GSD)",
    "www.gsd.harvard.edu": "Graduate School of Design (GSD)",
    "gse.harvard.edu": "Graduate School of Education (HGSE)",
    "www.gse.harvard.edu": "Graduate School of Education (HGSE)",
    "hks.harvard.edu": "Harvard Kennedy School (HKS)",
    "www.hks.harvard.edu": "Harvard Kennedy School (HKS)",
    "hls.harvard.edu": "Harvard Law School (HLS)",
    "hms.harvard.edu": "Harvard Medical School (HMS)",
    "hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
    "www.hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
    "hds.harvard.edu": "Harvard Divinity School (HDS)",
    "hsdm.harvard.edu": "Harvard School of Dental Medicine (HSDM)",
    "fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
    "aaas.fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
    "dce.harvard.edu": "Division of Continuing Education (DCE)",
    "extension.harvard.edu": "Harvard Extension School",
    "cs.seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
}

# Canonical homepage URL for each school (keys match SCHOOL_MAPPING values).
SCHOOL_URLS = {
    "Graduate School of Arts and Sciences (GSAS)": "https://gsas.harvard.edu/",
    "John A. Paulson School of Engineering and Applied Sciences (SEAS)": "https://seas.harvard.edu/",
    "Harvard Business School (HBS)": "https://www.hbs.edu/",
    "Graduate School of Design (GSD)": "https://www.gsd.harvard.edu/",
    "Graduate School of Education (HGSE)": "https://www.gse.harvard.edu/",
    "Harvard Kennedy School (HKS)": "https://www.hks.harvard.edu/",
    "Harvard Law School (HLS)": "https://hls.harvard.edu/",
    "Harvard Medical School (HMS)": "https://hms.harvard.edu/",
    "T.H. Chan School of Public Health (HSPH)": "https://www.hsph.harvard.edu/",
    "Harvard Divinity School (HDS)": "https://hds.harvard.edu/",
    "Harvard School of Dental Medicine (HSDM)": "https://hsdm.harvard.edu/",
    "Faculty of Arts and Sciences (FAS)": "https://fas.harvard.edu/",
    "Division of Continuing Education (DCE)": "https://dce.harvard.edu/",
    "Harvard Extension School": "https://extension.harvard.edu/",
    "Other": "https://www.harvard.edu/",
}
|
||||
|
||||
|
||||
def determine_school_from_url(url: str) -> str:
    """Return the Harvard school name for *url*, based on its hostname.

    Matching is two-phase: an exact hostname lookup in SCHOOL_MAPPING, then a
    dot-anchored subdomain suffix match (so "x.seas.harvard.edu" matches the
    "seas.harvard.edu" entry).  Empty or unrecognized URLs map to "Other".
    """
    if not url:
        return "Other"

    domain = urlparse(url).netloc.lower()

    # Exact hostname match first (O(1) dict lookup instead of scanning).
    school = SCHOOL_MAPPING.get(domain)
    if school:
        return school

    # Subdomain fallback.  Anchoring on the "." boundary fixes the old
    # substring check ("pattern in domain"), which could misfire on an
    # unrelated host that merely contains a pattern (e.g. "notahbs.edu.cn").
    for pattern, school_name in SCHOOL_MAPPING.items():
        if domain.endswith("." + pattern):
            return school_name

    return "Other"
|
||||
|
||||
|
||||
def _group_programs_by_school(programs: list) -> list:
    """Group flat program records by school; return school dicts sorted by name."""
    schools = defaultdict(lambda: {"name": "", "url": "", "programs": []})

    for prog in programs:
        # Infer the school from the faculty page URL, falling back to the
        # program's own URL when the faculty page yields no match.
        faculty_url = prog.get('faculty_page_url', '')
        school_name = determine_school_from_url(faculty_url)
        if school_name == "Other" and prog.get('url'):
            school_name = determine_school_from_url(prog['url'])

        program = {
            "name": prog['name'],
            "url": prog.get('url', ''),
            "degree_type": prog.get('degrees', ''),
            "faculty_page_url": faculty_url,
            "faculty": prog.get('faculty', []),
        }

        entry = schools[school_name]
        if not entry["name"]:
            entry["name"] = school_name
            entry["url"] = SCHOOL_URLS.get(school_name, "")
        entry["programs"].append(program)

    return sorted(schools.values(), key=lambda s: s["name"])


def _print_summary(result: dict, schools_list: list) -> None:
    """Print overall and per-school program/faculty counts."""
    print("\n" + "=" * 60)
    print("按学院重新组织完成!")
    print("=" * 60)
    print(f"大学: {result['name']}")
    print(f"学院数: {len(schools_list)}")

    total_programs = sum(len(s['programs']) for s in schools_list)
    total_faculty = sum(len(p['faculty']) for s in schools_list for p in s['programs'])

    print(f"项目数: {total_programs}")
    print(f"导师数: {total_faculty}")

    print("\n各学院统计:")
    for school in schools_list:
        prog_count = len(school['programs'])
        fac_count = sum(len(p['faculty']) for p in school['programs'])
        print(f" {school['name']}: {prog_count}个项目, {fac_count}位导师")


def _save_result(result: dict, output_path: str) -> None:
    """Write *result* as UTF-8 JSON to *output_path*, creating parent dirs."""
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)


def reorganize_data(input_path: str, output_path: str):
    """Reorganize flat scraped data into a school -> program -> faculty tree.

    Reads JSON from *input_path* (expects 'programs', 'total_programs' and
    'total_faculty_found' keys), groups programs by school, prints a summary,
    writes the hierarchical result to *output_path*, and returns it.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"读取原始数据: {data['total_programs']} 个项目, {data['total_faculty_found']} 位导师")

    schools_list = _group_programs_by_school(data['programs'])

    result = {
        "name": "Harvard University",
        "url": "https://www.harvard.edu/",
        "country": "USA",
        # Timezone-aware UTC timestamp of this reorganization run.
        "scraped_at": datetime.now(timezone.utc).isoformat(),
        "schools": schools_list
    }

    _print_summary(result, schools_list)
    _save_result(result, output_path)

    print(f"\n结果已保存到: {output_path}")

    return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Default input/output locations for the Harvard reorganization run.
    source_path = "artifacts/harvard_programs_with_faculty.json"
    target_path = "output/harvard_hierarchical_result.json"
    reorganize_data(source_path, target_path)
|
||||
45
scripts/start_backend.py
Normal file
45
scripts/start_backend.py
Normal file
@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env python3
"""
Start the backend API service (local development).

Installs backend requirements on first run, then launches the FastAPI app
via uvicorn in auto-reload mode on port 8000.
"""

import subprocess
import sys
import os

# Run from the project root so relative paths (backend/requirements.txt) resolve.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
os.chdir(project_root)

# Make the backend package importable so uvicorn can load "app.main:app".
backend_path = os.path.join(project_root, "backend")
sys.path.insert(0, backend_path)

print("=" * 60)
print("启动大学爬虫 Web API 服务")
print("=" * 60)
print(f"项目目录: {project_root}")
print(f"后端目录: {backend_path}")
print()

# Install backend dependencies only when they are missing.
try:
    import fastapi
    import uvicorn
except ImportError:
    print("正在安装后端依赖...")
    # check=True: fail loudly here if the install fails, instead of crashing
    # later on the uvicorn import with a confusing ImportError.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-r", "backend/requirements.txt"],
        check=True,
    )

# Switch into the backend directory before starting the app.
# NOTE(review): nothing here explicitly creates DB tables — presumably the
# app does that on startup; confirm against backend/app/main.py.
print("初始化数据库...")
os.chdir(backend_path)

# Start the service.
print()
print("启动 FastAPI 服务...")
print("API文档: http://localhost:8000/docs")
# Fixed label: /redoc serves ReDoc; Swagger UI is the /docs page above.
print("ReDoc: http://localhost:8000/redoc")
print()

import uvicorn
uvicorn.run("app.main:app", host="0.0.0.0", port=8000, reload=True)
|
||||
42
scripts/start_dev.bat
Normal file
42
scripts/start_dev.bat
Normal file
@ -0,0 +1,42 @@
|
||||
@echo off
REM Local development launcher: starts the backend (FastAPI/uvicorn) and the
REM frontend (npm dev server) in separate console windows.
echo ============================================================
echo 大学爬虫 Web 系统 - 本地开发启动
echo ============================================================

echo.
echo 启动后端API服务...
cd /d "%~dp0..\backend"

REM Install backend dependencies (quiet)
pip install -r requirements.txt -q

REM Start the backend in its own console window
start cmd /k "cd /d %~dp0..\backend && uvicorn app.main:app --reload --port 8000"

echo 后端已启动: http://localhost:8000
echo API文档: http://localhost:8000/docs

echo.
echo 启动前端服务...
cd /d "%~dp0..\frontend"

REM Install frontend dependencies on first run only
if not exist node_modules (
    echo 安装前端依赖...
    npm install
)

REM Start the frontend in its own console window
start cmd /k "cd /d %~dp0..\frontend && npm run dev"

echo 前端已启动: http://localhost:3000

echo.
echo ============================================================
echo 系统启动完成!
echo.
echo 后端API: http://localhost:8000/docs
echo 前端页面: http://localhost:3000
echo ============================================================

pause
|
||||
126
scripts/test_harvard.py
Normal file
126
scripts/test_harvard.py
Normal file
@ -0,0 +1,126 @@
|
||||
#!/usr/bin/env python3
"""
Harvard scraping smoke test - exercises only 2 schools.
"""

import asyncio
import sys
from pathlib import Path

# Make the in-repo package under src/ importable without installation.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from university_scraper.config import ScraperConfig
from university_scraper.scraper import UniversityScraper
|
||||
|
||||
|
||||
# Simplified test configuration - covers only 2 schools.
TEST_CONFIG = {
    "university": {
        "name": "Harvard University",
        "url": "https://www.harvard.edu/",
        "country": "USA"
    },
    # Schools come from a hard-coded list rather than page discovery.
    "schools": {
        "discovery_method": "static_list",
        "static_list": [
            {
                "name": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
                "url": "https://seas.harvard.edu/"
            },
            {
                "name": "Graduate School of Design (GSD)",
                "url": "https://www.gsd.harvard.edu/"
            }
        ]
    },
    # How program listing pages are located and parsed on each school site.
    "programs": {
        "paths_to_try": [
            "/academics/graduate-programs",
            "/programs",
            "/academics/programs",
            "/graduate"
        ],
        "link_patterns": [
            {"text_contains": ["program", "degree"], "href_contains": ["/program", "/degree"]},
            {"text_contains": ["master", "graduate"], "href_contains": ["/master", "/graduate"]}
        ],
        "selectors": {
            "program_item": "div.program-item, li.program, a[href*='/program']",
            "program_name": "h3, .title",
            "program_url": "a[href]",
            "degree_type": ".degree"
        },
        "pagination": {"type": "none"}
    },
    # Faculty pages: try in-page links first, then guessed URL patterns.
    "faculty": {
        "discovery_strategies": [
            {
                "type": "link_in_page",
                "patterns": [
                    {"text_contains": ["faculty", "people"], "href_contains": ["/faculty", "/people"]}
                ]
            },
            {
                "type": "url_pattern",
                "patterns": [
                    "{school_url}/faculty",
                    "{school_url}/people"
                ]
            }
        ],
        "selectors": {
            "faculty_item": "div.faculty, li.person",
            "faculty_name": "h3, .name",
            "faculty_url": "a[href*='/people/'], a[href*='/faculty/']"
        }
    },
    # Keep master's-level programs; drop doctoral and undergraduate ones.
    "filters": {
        "program_degree_types": {
            "include": ["Master", "M.S.", "M.A.", "MBA", "M.Eng", "S.M."],
            "exclude": ["Ph.D.", "Doctor", "Bachelor"]
        },
        "exclude_schools": []
    }
}
|
||||
|
||||
|
||||
async def test_harvard():
    """Scrape Harvard with the two-school TEST_CONFIG and print the results."""
    banner = "=" * 60
    print(banner)
    print("测试Harvard大学爬取(简化版 - 2个学院)")
    print(banner)

    config = ScraperConfig.from_dict(TEST_CONFIG)

    # headless=False keeps the browser window visible for debugging.
    async with UniversityScraper(config, headless=False) as scraper:
        university = await scraper.scrape()
        scraper.save_results("output/harvard_test_result.json")

    print("\n" + banner)
    print("详细结果:")
    print(banner)

    for school in university.schools:
        print(f"\n学院: {school.name}")
        print(f" URL: {school.url}")
        print(f" 项目数: {len(school.programs)}")

        # Show at most the first five programs per school.
        for prog in school.programs[:5]:
            print(f"\n 项目: {prog.name}")
            print(f" URL: {prog.url}")
            print(f" 学位: {prog.degree_type}")
            print(f" 导师数: {len(prog.faculty)}")

            if prog.faculty:
                print(" 导师示例:")
                for member in prog.faculty[:3]:
                    print(f" - {member.name}: {member.url}")

        hidden = len(school.programs) - 5
        if hidden > 0:
            print(f"\n ... 还有 {hidden} 个项目")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: run the async smoke test to completion.
    asyncio.run(test_harvard())
|
||||
Reference in New Issue
Block a user