Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: yangxiaoyu-crypto
Date: 2025-12-22 15:25:08 +08:00
Parent: 2714c8ad5c
Commit: 426cf4d2cd
75 changed files with 13527 additions and 2 deletions

backend/app/api/scripts.py (new file, 167 lines)

@@ -0,0 +1,167 @@
"""爬虫脚本API"""
from typing import List
from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
from sqlalchemy.orm import Session
from ..database import get_db
from ..models import University, ScraperScript
from ..schemas.script import (
ScriptCreate,
ScriptResponse,
GenerateScriptRequest,
GenerateScriptResponse
)
from ..services.script_generator import generate_scraper_script
router = APIRouter()
@router.post("/generate", response_model=GenerateScriptResponse)
async def generate_script(
data: GenerateScriptRequest,
background_tasks: BackgroundTasks,
db: Session = Depends(get_db)
):
"""
一键生成爬虫脚本
分析大学网站结构,自动生成爬虫脚本
"""
# 检查或创建大学记录
university = db.query(University).filter(University.url == data.university_url).first()
if not university:
# 从URL提取大学名称
name = data.university_name
if not name:
from urllib.parse import urlparse
parsed = urlparse(data.university_url)
name = parsed.netloc.replace("www.", "").split(".")[0].title()
university = University(
name=name,
url=data.university_url,
status="analyzing"
)
db.add(university)
db.commit()
db.refresh(university)
else:
# 更新状态
university.status = "analyzing"
db.commit()
# 在后台执行脚本生成
background_tasks.add_task(
generate_scraper_script,
university_id=university.id,
university_url=data.university_url
)
return GenerateScriptResponse(
success=True,
university_id=university.id,
script_id=None,
message="正在分析网站结构并生成爬虫脚本...",
status="analyzing"
)
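
# A hypothetical client call for the endpoint above. The /api/scripts mount
# prefix and the localhost address are assumptions; neither appears in this
# diff.
#
#   import httpx
#
#   resp = httpx.post(
#       "http://localhost:8000/api/scripts/generate",
#       json={
#           "university_url": "https://www.harvard.edu",
#           "university_name": "Harvard",  # optional; derived from the URL if omitted
#       },
#   )
#   print(resp.json())  # {"success": true, "university_id": 1, "script_id": null, ...}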
@router.get("/university/{university_id}", response_model=List[ScriptResponse])
def get_university_scripts(
university_id: int,
db: Session = Depends(get_db)
):
"""获取大学的所有爬虫脚本"""
scripts = db.query(ScraperScript).filter(
ScraperScript.university_id == university_id
).order_by(ScraperScript.version.desc()).all()
return scripts
@router.get("/{script_id}", response_model=ScriptResponse)
def get_script(
script_id: int,
db: Session = Depends(get_db)
):
"""获取脚本详情"""
script = db.query(ScraperScript).filter(ScraperScript.id == script_id).first()
if not script:
raise HTTPException(status_code=404, detail="脚本不存在")
return script
@router.post("", response_model=ScriptResponse)
def create_script(
data: ScriptCreate,
db: Session = Depends(get_db)
):
"""手动创建脚本"""
# 检查大学是否存在
university = db.query(University).filter(University.id == data.university_id).first()
if not university:
raise HTTPException(status_code=404, detail="大学不存在")
# 获取当前最高版本
max_version = db.query(ScraperScript).filter(
ScraperScript.university_id == data.university_id
).count()
script = ScraperScript(
university_id=data.university_id,
script_name=data.script_name,
script_content=data.script_content,
config_content=data.config_content,
version=max_version + 1,
status="active"
)
db.add(script)
db.commit()
db.refresh(script)
# 更新大学状态
university.status = "ready"
db.commit()
return script
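
# For reference, a plausible sketch of the ScriptCreate payload used by the
# endpoints above and below. The real definition lives in
# backend/app/schemas/script.py, which this diff does not include, so the
# field types here are assumptions:
#
#   class ScriptCreate(BaseModel):
#       university_id: int
#       script_name: str
#       script_content: str
#       config_content: Optional[str] = None  # update_script treats it as optional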

@router.put("/{script_id}", response_model=ScriptResponse)
def update_script(
    script_id: int,
    data: ScriptCreate,
    db: Session = Depends(get_db)
):
    """Update a script's content."""
    script = db.query(ScraperScript).filter(ScraperScript.id == script_id).first()
    if not script:
        raise HTTPException(status_code=404, detail="Script not found")

    script.script_content = data.script_content
    if data.config_content:
        script.config_content = data.config_content
    db.commit()
    db.refresh(script)
    return script

@router.delete("/{script_id}")
def delete_script(
    script_id: int,
    db: Session = Depends(get_db)
):
    """Delete a script."""
    script = db.query(ScraperScript).filter(ScraperScript.id == script_id).first()
    if not script:
        raise HTTPException(status_code=404, detail="Script not found")
    db.delete(script)
    db.commit()
    return {"message": "Deleted successfully"}