University-Playwright-Codeg…/backend/app/api/jobs.py
Commit 426cf4d2cd (yangxiaoyu-crypto, 2025-12-22 15:25:08 +08:00): Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files


"""爬取任务API"""
from typing import List
from datetime import datetime
from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
from sqlalchemy.orm import Session
from ..database import get_db
from ..models import University, ScraperScript, ScrapeJob, ScrapeLog
from ..schemas.job import JobResponse, JobStatusResponse, LogResponse
from ..services.scraper_runner import run_scraper
router = APIRouter()
@router.post("/start/{university_id}", response_model=JobResponse)
async def start_scrape_job(
university_id: int,
background_tasks: BackgroundTasks,
db: Session = Depends(get_db)
):
"""
一键运行爬虫
启动爬取任务,抓取大学项目和导师数据
"""
# 检查大学是否存在
university = db.query(University).filter(University.id == university_id).first()
if not university:
raise HTTPException(status_code=404, detail="大学不存在")
# 检查是否有活跃的脚本
script = db.query(ScraperScript).filter(
ScraperScript.university_id == university_id,
ScraperScript.status == "active"
).first()
if not script:
raise HTTPException(status_code=400, detail="没有可用的爬虫脚本,请先生成脚本")
# 检查是否有正在运行的任务
running_job = db.query(ScrapeJob).filter(
ScrapeJob.university_id == university_id,
ScrapeJob.status == "running"
).first()
if running_job:
raise HTTPException(status_code=400, detail="已有正在运行的任务")
# 创建任务
job = ScrapeJob(
university_id=university_id,
script_id=script.id,
status="pending",
progress=0,
current_step="准备中..."
)
db.add(job)
db.commit()
db.refresh(job)
# 在后台执行爬虫
background_tasks.add_task(
run_scraper,
job_id=job.id,
script_id=script.id
)
return job
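
The heavy lifting is delegated to run_scraper, imported from ..services.scraper_runner but not shown in this file. Below is a minimal sketch of the contract this endpoint appears to rely on, assuming the runner opens its own session, moves the job from pending through running to completed or failed, and appends ScrapeLog rows along the way; SessionLocal and all field values in the sketch are assumptions, not the actual implementation.

# app/services/scraper_runner.py (illustrative sketch, not the real module)
from datetime import datetime

from ..database import SessionLocal  # assumed session factory
from ..models import ScrapeJob, ScrapeLog


def run_scraper(job_id: int, script_id: int) -> None:
    """Assumed shape of the background task started by start_scrape_job."""
    db = SessionLocal()
    job = db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
    try:
        job.status = "running"
        job.current_step = "Running scraper script"
        db.commit()

        # ... the real runner would load the ScraperScript, execute it,
        # and update job.progress / write ScrapeLog rows as it goes ...
        db.add(ScrapeLog(job_id=job_id, level="info", message="Scrape finished"))

        job.status = "completed"
        job.progress = 100
    except Exception as exc:
        db.add(ScrapeLog(job_id=job_id, level="error", message=str(exc)))
        job.status = "failed"
    finally:
        job.completed_at = datetime.utcnow()
        db.commit()
        db.close()

Passing job_id and script_id rather than ORM objects keeps the background task independent of the request-scoped session provided by Depends(get_db).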
@router.get("/{job_id}", response_model=JobResponse)
def get_job(
job_id: int,
db: Session = Depends(get_db)
):
"""获取任务详情"""
job = db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
if not job:
raise HTTPException(status_code=404, detail="任务不存在")
return job
@router.get("/{job_id}/status", response_model=JobStatusResponse)
def get_job_status(
job_id: int,
db: Session = Depends(get_db)
):
"""获取任务状态和日志"""
job = db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
if not job:
raise HTTPException(status_code=404, detail="任务不存在")
# 获取最近的日志
logs = db.query(ScrapeLog).filter(
ScrapeLog.job_id == job_id
).order_by(ScrapeLog.created_at.desc()).limit(50).all()
return JobStatusResponse(
id=job.id,
status=job.status,
progress=job.progress,
current_step=job.current_step,
logs=[LogResponse(
id=log.id,
level=log.level,
message=log.message,
created_at=log.created_at
) for log in reversed(logs)]
)
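
The response models are imported from ..schemas.job, which is also not part of this file. Since get_job and start_scrape_job return ScrapeJob ORM objects directly while get_job_status builds its payload by hand, the schemas presumably look roughly like the Pydantic models below; this is a sketch assuming Pydantic v2, with field names taken from how they are used in this module and everything else an assumption.

# app/schemas/job.py (illustrative sketch)
from datetime import datetime
from typing import List, Optional

from pydantic import BaseModel, ConfigDict


class LogResponse(BaseModel):
    id: int
    level: str
    message: str
    created_at: datetime


class JobResponse(BaseModel):
    # from_attributes lets FastAPI serialise the ScrapeJob ORM object directly
    model_config = ConfigDict(from_attributes=True)

    id: int
    university_id: int
    script_id: int
    status: str
    progress: int
    current_step: Optional[str] = None
    created_at: datetime
    completed_at: Optional[datetime] = None


class JobStatusResponse(BaseModel):
    id: int
    status: str
    progress: int
    current_step: Optional[str] = None
    logs: List[LogResponse] = []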
@router.get("/university/{university_id}", response_model=List[JobResponse])
def get_university_jobs(
university_id: int,
db: Session = Depends(get_db)
):
"""获取大学的所有任务"""
jobs = db.query(ScrapeJob).filter(
ScrapeJob.university_id == university_id
).order_by(ScrapeJob.created_at.desc()).limit(20).all()
return jobs
@router.post("/{job_id}/cancel")
def cancel_job(
job_id: int,
db: Session = Depends(get_db)
):
"""取消任务"""
job = db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
if not job:
raise HTTPException(status_code=404, detail="任务不存在")
if job.status not in ["pending", "running"]:
raise HTTPException(status_code=400, detail="任务已结束,无法取消")
job.status = "cancelled"
job.completed_at = datetime.utcnow()
db.commit()
return {"message": "任务已取消"}