Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
backend/app/api/jobs.py (new file, 144 lines)
@@ -0,0 +1,144 @@
"""Scrape job API"""

from typing import List
from datetime import datetime
from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
from sqlalchemy.orm import Session

from ..database import get_db
from ..models import University, ScraperScript, ScrapeJob, ScrapeLog
from ..schemas.job import JobResponse, JobStatusResponse, LogResponse
from ..services.scraper_runner import run_scraper

router = APIRouter()


@router.post("/start/{university_id}", response_model=JobResponse)
async def start_scrape_job(
    university_id: int,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db)
):
    """
    Run the scraper in one click.

    Starts a scrape job that collects the university's programme and supervisor data.
    """
    # Check that the university exists
    university = db.query(University).filter(University.id == university_id).first()
    if not university:
        raise HTTPException(status_code=404, detail="University not found")

    # Check for an active scraper script
    script = db.query(ScraperScript).filter(
        ScraperScript.university_id == university_id,
        ScraperScript.status == "active"
    ).first()

    if not script:
        raise HTTPException(status_code=400, detail="No scraper script available; generate one first")

    # Check whether a job is already running for this university
    running_job = db.query(ScrapeJob).filter(
        ScrapeJob.university_id == university_id,
        ScrapeJob.status == "running"
    ).first()

    if running_job:
        raise HTTPException(status_code=400, detail="A job is already running")

    # Create the job record
    job = ScrapeJob(
        university_id=university_id,
        script_id=script.id,
        status="pending",
        progress=0,
        current_step="Preparing..."
    )
    db.add(job)
    db.commit()
    db.refresh(job)

    # Run the scraper in the background
    background_tasks.add_task(
        run_scraper,
        job_id=job.id,
        script_id=script.id
    )

    return job


@router.get("/{job_id}", response_model=JobResponse)
def get_job(
    job_id: int,
    db: Session = Depends(get_db)
):
    """Get job details"""
    job = db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    return job


@router.get("/{job_id}/status", response_model=JobStatusResponse)
def get_job_status(
    job_id: int,
    db: Session = Depends(get_db)
):
    """Get job status and recent logs"""
    job = db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    # Fetch the most recent logs (newest first, then re-ordered chronologically below)
    logs = db.query(ScrapeLog).filter(
        ScrapeLog.job_id == job_id
    ).order_by(ScrapeLog.created_at.desc()).limit(50).all()

    return JobStatusResponse(
        id=job.id,
        status=job.status,
        progress=job.progress,
        current_step=job.current_step,
        logs=[LogResponse(
            id=log.id,
            level=log.level,
            message=log.message,
            created_at=log.created_at
        ) for log in reversed(logs)]
    )


@router.get("/university/{university_id}", response_model=List[JobResponse])
def get_university_jobs(
    university_id: int,
    db: Session = Depends(get_db)
):
    """List the university's jobs"""
    jobs = db.query(ScrapeJob).filter(
        ScrapeJob.university_id == university_id
    ).order_by(ScrapeJob.created_at.desc()).limit(20).all()

    return jobs


@router.post("/{job_id}/cancel")
def cancel_job(
    job_id: int,
    db: Session = Depends(get_db)
):
    """Cancel a job"""
    job = db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    if job.status not in ["pending", "running"]:
        raise HTTPException(status_code=400, detail="Job has already finished and cannot be cancelled")

    job.status = "cancelled"
    job.completed_at = datetime.utcnow()
    db.commit()

    return {"message": "Job cancelled"}
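For reference, a minimal client-side sketch of driving these endpoints: start a job, poll its status, then list recent jobs. The base URL and the /api/jobs mount prefix are assumptions not confirmed by this diff; check the FastAPI app setup in backend/app for the actual prefix.

# Minimal usage sketch (assumptions: service at http://localhost:8000,
# router mounted under /api/jobs).
import time
import requests

BASE = "http://localhost:8000/api/jobs"  # assumed mount point

# Kick off a scrape job for university id 1
job = requests.post(f"{BASE}/start/1").json()
job_id = job["id"]

# Poll status until the job leaves the pending/running states
while True:
    status = requests.get(f"{BASE}/{job_id}/status").json()
    print(status["progress"], status["current_step"])
    if status["status"] not in ("pending", "running"):
        break
    time.sleep(2)

# List recent jobs for the same university
print(requests.get(f"{BASE}/university/1").json())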