Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaoyu-crypto committed 2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions

@@ -0,0 +1,15 @@
"""API路由"""
from fastapi import APIRouter
from .universities import router as universities_router
from .scripts import router as scripts_router
from .jobs import router as jobs_router
from .results import router as results_router
api_router = APIRouter()
api_router.include_router(universities_router, prefix="/universities", tags=["大学管理"])
api_router.include_router(scripts_router, prefix="/scripts", tags=["爬虫脚本"])
api_router.include_router(jobs_router, prefix="/jobs", tags=["爬取任务"])
api_router.include_router(results_router, prefix="/results", tags=["爬取结果"])
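Note: the application entry point is not part of this excerpt, so how api_router is mounted is not shown. A minimal sketch of the likely wiring, where the module path, app title, and /api prefix are all assumptions:

# Hypothetical backend/app/main.py -- not part of this diff.
from fastapi import FastAPI

from app.api import api_router  # assumed import path

app = FastAPI(title="University Scraper")  # assumed title
app.include_router(api_router, prefix="/api")  # /api prefix is an assumption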

backend/app/api/jobs.py

@@ -0,0 +1,144 @@
"""爬取任务API"""
from typing import List
from datetime import datetime
from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
from sqlalchemy.orm import Session
from ..database import get_db
from ..models import University, ScraperScript, ScrapeJob, ScrapeLog
from ..schemas.job import JobResponse, JobStatusResponse, LogResponse
from ..services.scraper_runner import run_scraper
router = APIRouter()
@router.post("/start/{university_id}", response_model=JobResponse)
async def start_scrape_job(
university_id: int,
background_tasks: BackgroundTasks,
db: Session = Depends(get_db)
):
"""
一键运行爬虫
启动爬取任务,抓取大学项目和导师数据
"""
# 检查大学是否存在
university = db.query(University).filter(University.id == university_id).first()
if not university:
raise HTTPException(status_code=404, detail="大学不存在")
# 检查是否有活跃的脚本
script = db.query(ScraperScript).filter(
ScraperScript.university_id == university_id,
ScraperScript.status == "active"
).first()
if not script:
raise HTTPException(status_code=400, detail="没有可用的爬虫脚本,请先生成脚本")
# 检查是否有正在运行的任务
running_job = db.query(ScrapeJob).filter(
ScrapeJob.university_id == university_id,
ScrapeJob.status == "running"
).first()
if running_job:
raise HTTPException(status_code=400, detail="已有正在运行的任务")
# 创建任务
job = ScrapeJob(
university_id=university_id,
script_id=script.id,
status="pending",
progress=0,
current_step="准备中..."
)
db.add(job)
db.commit()
db.refresh(job)
# 在后台执行爬虫
background_tasks.add_task(
run_scraper,
job_id=job.id,
script_id=script.id
)
return job
@router.get("/{job_id}", response_model=JobResponse)
def get_job(
job_id: int,
db: Session = Depends(get_db)
):
"""获取任务详情"""
job = db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
if not job:
raise HTTPException(status_code=404, detail="任务不存在")
return job
@router.get("/{job_id}/status", response_model=JobStatusResponse)
def get_job_status(
job_id: int,
db: Session = Depends(get_db)
):
"""获取任务状态和日志"""
job = db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
if not job:
raise HTTPException(status_code=404, detail="任务不存在")
# 获取最近的日志
logs = db.query(ScrapeLog).filter(
ScrapeLog.job_id == job_id
).order_by(ScrapeLog.created_at.desc()).limit(50).all()
return JobStatusResponse(
id=job.id,
status=job.status,
progress=job.progress,
current_step=job.current_step,
logs=[LogResponse(
id=log.id,
level=log.level,
message=log.message,
created_at=log.created_at
) for log in reversed(logs)]
)
@router.get("/university/{university_id}", response_model=List[JobResponse])
def get_university_jobs(
university_id: int,
db: Session = Depends(get_db)
):
"""获取大学的所有任务"""
jobs = db.query(ScrapeJob).filter(
ScrapeJob.university_id == university_id
).order_by(ScrapeJob.created_at.desc()).limit(20).all()
return jobs
@router.post("/{job_id}/cancel")
def cancel_job(
job_id: int,
db: Session = Depends(get_db)
):
"""取消任务"""
job = db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
if not job:
raise HTTPException(status_code=404, detail="任务不存在")
if job.status not in ["pending", "running"]:
raise HTTPException(status_code=400, detail="任务已结束,无法取消")
job.status = "cancelled"
job.completed_at = datetime.utcnow()
db.commit()
return {"message": "任务已取消"}

backend/app/api/results.py

@@ -0,0 +1,175 @@
"""爬取结果API"""
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import JSONResponse
from sqlalchemy.orm import Session
from ..database import get_db
from ..models import ScrapeResult
from ..schemas.result import ResultResponse
router = APIRouter()
@router.get("/university/{university_id}", response_model=ResultResponse)
def get_university_result(
university_id: int,
db: Session = Depends(get_db)
):
"""获取大学最新的爬取结果"""
result = db.query(ScrapeResult).filter(
ScrapeResult.university_id == university_id
).order_by(ScrapeResult.created_at.desc()).first()
if not result:
raise HTTPException(status_code=404, detail="没有爬取结果")
return result
@router.get("/university/{university_id}/schools")
def get_schools(
university_id: int,
db: Session = Depends(get_db)
):
"""获取学院列表"""
result = db.query(ScrapeResult).filter(
ScrapeResult.university_id == university_id
).order_by(ScrapeResult.created_at.desc()).first()
if not result:
raise HTTPException(status_code=404, detail="没有爬取结果")
schools = result.result_data.get("schools", [])
# 返回简化的学院列表
return {
"total": len(schools),
"schools": [
{
"name": s.get("name"),
"url": s.get("url"),
"program_count": len(s.get("programs", []))
}
for s in schools
]
}
@router.get("/university/{university_id}/programs")
def get_programs(
university_id: int,
school_name: Optional[str] = Query(None, description="按学院筛选"),
search: Optional[str] = Query(None, description="搜索项目名称"),
db: Session = Depends(get_db)
):
"""获取项目列表"""
result = db.query(ScrapeResult).filter(
ScrapeResult.university_id == university_id
).order_by(ScrapeResult.created_at.desc()).first()
if not result:
raise HTTPException(status_code=404, detail="没有爬取结果")
schools = result.result_data.get("schools", [])
programs = []
for school in schools:
if school_name and school.get("name") != school_name:
continue
for prog in school.get("programs", []):
if search and search.lower() not in prog.get("name", "").lower():
continue
programs.append({
"name": prog.get("name"),
"url": prog.get("url"),
"degree_type": prog.get("degree_type"),
"school": school.get("name"),
"faculty_count": len(prog.get("faculty", []))
})
return {
"total": len(programs),
"programs": programs
}
@router.get("/university/{university_id}/faculty")
def get_faculty(
university_id: int,
school_name: Optional[str] = Query(None, description="按学院筛选"),
program_name: Optional[str] = Query(None, description="按项目筛选"),
search: Optional[str] = Query(None, description="搜索导师姓名"),
skip: int = Query(0, ge=0),
limit: int = Query(50, ge=1, le=200),
db: Session = Depends(get_db)
):
"""获取导师列表"""
result = db.query(ScrapeResult).filter(
ScrapeResult.university_id == university_id
).order_by(ScrapeResult.created_at.desc()).first()
if not result:
raise HTTPException(status_code=404, detail="没有爬取结果")
schools = result.result_data.get("schools", [])
faculty_list = []
for school in schools:
if school_name and school.get("name") != school_name:
continue
for prog in school.get("programs", []):
if program_name and prog.get("name") != program_name:
continue
for fac in prog.get("faculty", []):
if search and search.lower() not in fac.get("name", "").lower():
continue
faculty_list.append({
"name": fac.get("name"),
"url": fac.get("url"),
"title": fac.get("title"),
"email": fac.get("email"),
"program": prog.get("name"),
"school": school.get("name")
})
total = len(faculty_list)
faculty_list = faculty_list[skip:skip + limit]
return {
"total": total,
"skip": skip,
"limit": limit,
"faculty": faculty_list
}
@router.get("/university/{university_id}/export")
def export_result(
university_id: int,
format: str = Query("json", enum=["json"]),
db: Session = Depends(get_db)
):
"""导出爬取结果"""
result = db.query(ScrapeResult).filter(
ScrapeResult.university_id == university_id
).order_by(ScrapeResult.created_at.desc()).first()
if not result:
raise HTTPException(status_code=404, detail="没有爬取结果")
if format == "json":
return JSONResponse(
content=result.result_data,
headers={
"Content-Disposition": f"attachment; filename=university_{university_id}_result.json"
}
)
raise HTTPException(status_code=400, detail="不支持的格式")
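
Since the faculty endpoint filters and slices the stored result in memory, large result sets are retrieved page by page with skip/limit. A hedged client sketch under the same base-URL assumption as above:

# Sketch: page through all faculty records for university 1.
import httpx

BASE = "http://localhost:8000/api"  # assumed host and mount prefix

skip, faculty = 0, []
while True:
    page = httpx.get(
        f"{BASE}/results/university/1/faculty",
        params={"skip": skip, "limit": 200},
    ).json()
    faculty.extend(page["faculty"])
    skip += page["limit"]
    if skip >= page["total"]:
        break
print(len(faculty), "faculty records")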

backend/app/api/scripts.py

@@ -0,0 +1,167 @@
"""爬虫脚本API"""
from typing import List
from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
from sqlalchemy.orm import Session
from ..database import get_db
from ..models import University, ScraperScript
from ..schemas.script import (
ScriptCreate,
ScriptResponse,
GenerateScriptRequest,
GenerateScriptResponse
)
from ..services.script_generator import generate_scraper_script
router = APIRouter()
@router.post("/generate", response_model=GenerateScriptResponse)
async def generate_script(
data: GenerateScriptRequest,
background_tasks: BackgroundTasks,
db: Session = Depends(get_db)
):
"""
一键生成爬虫脚本
分析大学网站结构,自动生成爬虫脚本
"""
# 检查或创建大学记录
university = db.query(University).filter(University.url == data.university_url).first()
if not university:
# 从URL提取大学名称
name = data.university_name
if not name:
from urllib.parse import urlparse
parsed = urlparse(data.university_url)
name = parsed.netloc.replace("www.", "").split(".")[0].title()
university = University(
name=name,
url=data.university_url,
status="analyzing"
)
db.add(university)
db.commit()
db.refresh(university)
else:
# 更新状态
university.status = "analyzing"
db.commit()
# 在后台执行脚本生成
background_tasks.add_task(
generate_scraper_script,
university_id=university.id,
university_url=data.university_url
)
return GenerateScriptResponse(
success=True,
university_id=university.id,
script_id=None,
message="正在分析网站结构并生成爬虫脚本...",
status="analyzing"
)
@router.get("/university/{university_id}", response_model=List[ScriptResponse])
def get_university_scripts(
university_id: int,
db: Session = Depends(get_db)
):
"""获取大学的所有爬虫脚本"""
scripts = db.query(ScraperScript).filter(
ScraperScript.university_id == university_id
).order_by(ScraperScript.version.desc()).all()
return scripts
@router.get("/{script_id}", response_model=ScriptResponse)
def get_script(
script_id: int,
db: Session = Depends(get_db)
):
"""获取脚本详情"""
script = db.query(ScraperScript).filter(ScraperScript.id == script_id).first()
if not script:
raise HTTPException(status_code=404, detail="脚本不存在")
return script
@router.post("", response_model=ScriptResponse)
def create_script(
data: ScriptCreate,
db: Session = Depends(get_db)
):
"""手动创建脚本"""
# 检查大学是否存在
university = db.query(University).filter(University.id == data.university_id).first()
if not university:
raise HTTPException(status_code=404, detail="大学不存在")
# 获取当前最高版本
max_version = db.query(ScraperScript).filter(
ScraperScript.university_id == data.university_id
).count()
script = ScraperScript(
university_id=data.university_id,
script_name=data.script_name,
script_content=data.script_content,
config_content=data.config_content,
version=max_version + 1,
status="active"
)
db.add(script)
db.commit()
db.refresh(script)
# 更新大学状态
university.status = "ready"
db.commit()
return script
@router.put("/{script_id}", response_model=ScriptResponse)
def update_script(
script_id: int,
data: ScriptCreate,
db: Session = Depends(get_db)
):
"""更新脚本"""
script = db.query(ScraperScript).filter(ScraperScript.id == script_id).first()
if not script:
raise HTTPException(status_code=404, detail="脚本不存在")
script.script_content = data.script_content
if data.config_content:
script.config_content = data.config_content
db.commit()
db.refresh(script)
return script
@router.delete("/{script_id}")
def delete_script(
script_id: int,
db: Session = Depends(get_db)
):
"""删除脚本"""
script = db.query(ScraperScript).filter(ScraperScript.id == script_id).first()
if not script:
raise HTTPException(status_code=404, detail="脚本不存在")
db.delete(script)
db.commit()
return {"message": "删除成功"}

@@ -0,0 +1,165 @@
"""大学管理API"""
from typing import List, Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.orm import Session
from ..database import get_db
from ..models import University, ScrapeResult
from ..schemas.university import (
UniversityCreate,
UniversityUpdate,
UniversityResponse,
UniversityListResponse
)
router = APIRouter()
@router.get("", response_model=UniversityListResponse)
def list_universities(
skip: int = Query(0, ge=0),
limit: int = Query(20, ge=1, le=100),
search: Optional[str] = None,
db: Session = Depends(get_db)
):
"""获取大学列表"""
query = db.query(University)
if search:
query = query.filter(University.name.ilike(f"%{search}%"))
total = query.count()
universities = query.order_by(University.created_at.desc()).offset(skip).limit(limit).all()
# 添加统计信息
items = []
for uni in universities:
# 获取最新结果
latest_result = db.query(ScrapeResult).filter(
ScrapeResult.university_id == uni.id
).order_by(ScrapeResult.created_at.desc()).first()
items.append(UniversityResponse(
id=uni.id,
name=uni.name,
url=uni.url,
country=uni.country,
description=uni.description,
status=uni.status,
created_at=uni.created_at,
updated_at=uni.updated_at,
scripts_count=len(uni.scripts),
jobs_count=len(uni.jobs),
latest_result={
"schools_count": latest_result.schools_count,
"programs_count": latest_result.programs_count,
"faculty_count": latest_result.faculty_count,
"created_at": latest_result.created_at.isoformat()
} if latest_result else None
))
return UniversityListResponse(total=total, items=items)
@router.post("", response_model=UniversityResponse)
def create_university(
data: UniversityCreate,
db: Session = Depends(get_db)
):
"""创建大学"""
# 检查是否已存在
existing = db.query(University).filter(University.url == data.url).first()
if existing:
raise HTTPException(status_code=400, detail="该大学URL已存在")
university = University(**data.model_dump())
db.add(university)
db.commit()
db.refresh(university)
return UniversityResponse(
id=university.id,
name=university.name,
url=university.url,
country=university.country,
description=university.description,
status=university.status,
created_at=university.created_at,
updated_at=university.updated_at,
scripts_count=0,
jobs_count=0,
latest_result=None
)
@router.get("/{university_id}", response_model=UniversityResponse)
def get_university(
university_id: int,
db: Session = Depends(get_db)
):
"""获取大学详情"""
university = db.query(University).filter(University.id == university_id).first()
if not university:
raise HTTPException(status_code=404, detail="大学不存在")
# 获取最新结果
latest_result = db.query(ScrapeResult).filter(
ScrapeResult.university_id == university.id
).order_by(ScrapeResult.created_at.desc()).first()
return UniversityResponse(
id=university.id,
name=university.name,
url=university.url,
country=university.country,
description=university.description,
status=university.status,
created_at=university.created_at,
updated_at=university.updated_at,
scripts_count=len(university.scripts),
jobs_count=len(university.jobs),
latest_result={
"schools_count": latest_result.schools_count,
"programs_count": latest_result.programs_count,
"faculty_count": latest_result.faculty_count,
"created_at": latest_result.created_at.isoformat()
} if latest_result else None
)
@router.put("/{university_id}", response_model=UniversityResponse)
def update_university(
university_id: int,
data: UniversityUpdate,
db: Session = Depends(get_db)
):
"""更新大学信息"""
university = db.query(University).filter(University.id == university_id).first()
if not university:
raise HTTPException(status_code=404, detail="大学不存在")
update_data = data.model_dump(exclude_unset=True)
for field, value in update_data.items():
setattr(university, field, value)
db.commit()
db.refresh(university)
return get_university(university_id, db)
@router.delete("/{university_id}")
def delete_university(
university_id: int,
db: Session = Depends(get_db)
):
"""删除大学"""
university = db.query(University).filter(University.id == university_id).first()
if not university:
raise HTTPException(status_code=404, detail="大学不存在")
db.delete(university)
db.commit()
return {"message": "删除成功"}