"""爬虫脚本API"""
|
|
|
|
from typing import List
|
|
from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
|
|
from sqlalchemy.orm import Session
|
|
|
|
from ..database import get_db
|
|
from ..models import University, ScraperScript
|
|
from ..schemas.script import (
|
|
ScriptCreate,
|
|
ScriptResponse,
|
|
GenerateScriptRequest,
|
|
GenerateScriptResponse
|
|
)
|
|
from ..services.script_generator import generate_scraper_script
|
|
|
|
router = APIRouter()
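
# The schemas above live in ..schemas.script. A minimal sketch of the fields
# this router relies on -- inferred from usage below, not the actual
# definitions:
#
#   class GenerateScriptRequest(BaseModel):
#       university_url: str
#       university_name: Optional[str] = None
#
#   class GenerateScriptResponse(BaseModel):
#       success: bool
#       university_id: int
#       script_id: Optional[int]
#       message: str
#       status: str
#
#   class ScriptCreate(BaseModel):
#       university_id: int
#       script_name: str
#       script_content: str
#       config_content: Optional[str] = None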


@router.post("/generate", response_model=GenerateScriptResponse)
async def generate_script(
    data: GenerateScriptRequest,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db)
):
    """
    One-click scraper script generation.

    Analyzes the university website's structure and automatically generates
    a scraper script for it in a background task.
    """
    # Find the university record, or create it if it does not exist yet
    university = db.query(University).filter(University.url == data.university_url).first()

    if not university:
        # Derive the university name from the URL when none was provided,
        # e.g. "https://www.example.edu" -> "Example"
        name = data.university_name
        if not name:
            parsed = urlparse(data.university_url)
            name = parsed.netloc.replace("www.", "").split(".")[0].title()

        university = University(
            name=name,
            url=data.university_url,
            status="analyzing"
        )
        db.add(university)
        db.commit()
        db.refresh(university)
    else:
        # Reset the status so clients can see a new analysis run has started
        university.status = "analyzing"
        db.commit()

    # Run script generation in the background; the client polls for the result
    background_tasks.add_task(
        generate_scraper_script,
        university_id=university.id,
        university_url=data.university_url
    )

    return GenerateScriptResponse(
        success=True,
        university_id=university.id,
        script_id=None,
        message="Analyzing site structure and generating scraper script...",
        status="analyzing"
    )
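
# Example call (hypothetical -- the host and the prefix this router is
# mounted under are configured elsewhere in the app):
#
#   import httpx
#   resp = httpx.post(
#       "http://localhost:8000/scripts/generate",
#       json={"university_url": "https://www.example.edu"},
#   )
#   resp.json()  # {"success": true, "university_id": 1, "status": "analyzing", ...}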


@router.get("/university/{university_id}", response_model=List[ScriptResponse])
def get_university_scripts(
    university_id: int,
    db: Session = Depends(get_db)
):
    """List all scraper scripts for a university, newest version first."""
    scripts = db.query(ScraperScript).filter(
        ScraperScript.university_id == university_id
    ).order_by(ScraperScript.version.desc()).all()

    return scripts


@router.get("/{script_id}", response_model=ScriptResponse)
def get_script(
    script_id: int,
    db: Session = Depends(get_db)
):
    """Fetch a single script by ID."""
    script = db.query(ScraperScript).filter(ScraperScript.id == script_id).first()
    if not script:
        raise HTTPException(status_code=404, detail="Script not found")

    return script


@router.post("", response_model=ScriptResponse)
def create_script(
    data: ScriptCreate,
    db: Session = Depends(get_db)
):
    """Create a script manually."""
    # Make sure the university exists
    university = db.query(University).filter(University.id == data.university_id).first()
    if not university:
        raise HTTPException(status_code=404, detail="University not found")

    # Get the current highest version; max() rather than count() keeps
    # version numbers unique even after scripts have been deleted
    max_version = db.query(func.max(ScraperScript.version)).filter(
        ScraperScript.university_id == data.university_id
    ).scalar() or 0

    script = ScraperScript(
        university_id=data.university_id,
        script_name=data.script_name,
        script_content=data.script_content,
        config_content=data.config_content,
        version=max_version + 1,
        status="active"
    )

    db.add(script)
    db.commit()
    db.refresh(script)

    # Mark the university as ready now that it has a usable script
    university.status = "ready"
    db.commit()

    return script
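
# Example manual creation (hypothetical host/prefix, as above):
#
#   httpx.post("http://localhost:8000/scripts", json={
#       "university_id": 1,
#       "script_name": "example_scraper",
#       "script_content": "print('hello')",
#   })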


@router.put("/{script_id}", response_model=ScriptResponse)
def update_script(
    script_id: int,
    data: ScriptCreate,
    db: Session = Depends(get_db)
):
    """Update a script's content (and its config, when provided)."""
    script = db.query(ScraperScript).filter(ScraperScript.id == script_id).first()
    if not script:
        raise HTTPException(status_code=404, detail="Script not found")

    script.script_content = data.script_content
    if data.config_content:
        script.config_content = data.config_content

    db.commit()
    db.refresh(script)

    return script


@router.delete("/{script_id}")
def delete_script(
    script_id: int,
    db: Session = Depends(get_db)
):
    """Delete a script."""
    script = db.query(ScraperScript).filter(ScraperScript.id == script_id).first()
    if not script:
        raise HTTPException(status_code=404, detail="Script not found")

    db.delete(script)
    db.commit()

    return {"message": "Deleted successfully"}
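
# This router is mounted by the application elsewhere; a typical inclusion
# (an assumption -- the real prefix and tags live in the app setup) would be:
#
#   from fastapi import FastAPI
#   app = FastAPI()
#   app.include_router(router, prefix="/scripts", tags=["scripts"])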