Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaoyu-crypto committed 2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions

@@ -0,0 +1,15 @@
"""API路由"""
from fastapi import APIRouter
from .universities import router as universities_router
from .scripts import router as scripts_router
from .jobs import router as jobs_router
from .results import router as results_router
api_router = APIRouter()
api_router.include_router(universities_router, prefix="/universities", tags=["大学管理"])
api_router.include_router(scripts_router, prefix="/scripts", tags=["爬虫脚本"])
api_router.include_router(jobs_router, prefix="/jobs", tags=["爬取任务"])
api_router.include_router(results_router, prefix="/results", tags=["爬取结果"])
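Note: the application entry point is not part of this excerpt, so how api_router is mounted is not shown. A minimal sketch of the likely wiring, where the module path, app title, and /api prefix are all assumptions:

# Hypothetical backend/app/main.py -- not part of this diff.
from fastapi import FastAPI

from app.api import api_router  # assumed import path

app = FastAPI(title="University Scraper")  # assumed title
app.include_router(api_router, prefix="/api")  # /api prefix is an assumption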

backend/app/api/jobs.py

@@ -0,0 +1,144 @@
"""爬取任务API"""
from typing import List
from datetime import datetime
from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
from sqlalchemy.orm import Session
from ..database import get_db
from ..models import University, ScraperScript, ScrapeJob, ScrapeLog
from ..schemas.job import JobResponse, JobStatusResponse, LogResponse
from ..services.scraper_runner import run_scraper
router = APIRouter()
@router.post("/start/{university_id}", response_model=JobResponse)
async def start_scrape_job(
university_id: int,
background_tasks: BackgroundTasks,
db: Session = Depends(get_db)
):
"""
一键运行爬虫
启动爬取任务,抓取大学项目和导师数据
"""
# 检查大学是否存在
university = db.query(University).filter(University.id == university_id).first()
if not university:
raise HTTPException(status_code=404, detail="大学不存在")
# 检查是否有活跃的脚本
script = db.query(ScraperScript).filter(
ScraperScript.university_id == university_id,
ScraperScript.status == "active"
).first()
if not script:
raise HTTPException(status_code=400, detail="没有可用的爬虫脚本,请先生成脚本")
# 检查是否有正在运行的任务
running_job = db.query(ScrapeJob).filter(
ScrapeJob.university_id == university_id,
ScrapeJob.status == "running"
).first()
if running_job:
raise HTTPException(status_code=400, detail="已有正在运行的任务")
# 创建任务
job = ScrapeJob(
university_id=university_id,
script_id=script.id,
status="pending",
progress=0,
current_step="准备中..."
)
db.add(job)
db.commit()
db.refresh(job)
# 在后台执行爬虫
background_tasks.add_task(
run_scraper,
job_id=job.id,
script_id=script.id
)
return job
@router.get("/{job_id}", response_model=JobResponse)
def get_job(
job_id: int,
db: Session = Depends(get_db)
):
"""获取任务详情"""
job = db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
if not job:
raise HTTPException(status_code=404, detail="任务不存在")
return job
@router.get("/{job_id}/status", response_model=JobStatusResponse)
def get_job_status(
job_id: int,
db: Session = Depends(get_db)
):
"""获取任务状态和日志"""
job = db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
if not job:
raise HTTPException(status_code=404, detail="任务不存在")
# 获取最近的日志
logs = db.query(ScrapeLog).filter(
ScrapeLog.job_id == job_id
).order_by(ScrapeLog.created_at.desc()).limit(50).all()
return JobStatusResponse(
id=job.id,
status=job.status,
progress=job.progress,
current_step=job.current_step,
logs=[LogResponse(
id=log.id,
level=log.level,
message=log.message,
created_at=log.created_at
) for log in reversed(logs)]
)
@router.get("/university/{university_id}", response_model=List[JobResponse])
def get_university_jobs(
university_id: int,
db: Session = Depends(get_db)
):
"""获取大学的所有任务"""
jobs = db.query(ScrapeJob).filter(
ScrapeJob.university_id == university_id
).order_by(ScrapeJob.created_at.desc()).limit(20).all()
return jobs
@router.post("/{job_id}/cancel")
def cancel_job(
job_id: int,
db: Session = Depends(get_db)
):
"""取消任务"""
job = db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
if not job:
raise HTTPException(status_code=404, detail="任务不存在")
if job.status not in ["pending", "running"]:
raise HTTPException(status_code=400, detail="任务已结束,无法取消")
job.status = "cancelled"
job.completed_at = datetime.utcnow()
db.commit()
return {"message": "任务已取消"}

backend/app/api/results.py

@@ -0,0 +1,175 @@
"""爬取结果API"""
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import JSONResponse
from sqlalchemy.orm import Session
from ..database import get_db
from ..models import ScrapeResult
from ..schemas.result import ResultResponse
router = APIRouter()
@router.get("/university/{university_id}", response_model=ResultResponse)
def get_university_result(
university_id: int,
db: Session = Depends(get_db)
):
"""获取大学最新的爬取结果"""
result = db.query(ScrapeResult).filter(
ScrapeResult.university_id == university_id
).order_by(ScrapeResult.created_at.desc()).first()
if not result:
raise HTTPException(status_code=404, detail="没有爬取结果")
return result
@router.get("/university/{university_id}/schools")
def get_schools(
university_id: int,
db: Session = Depends(get_db)
):
"""获取学院列表"""
result = db.query(ScrapeResult).filter(
ScrapeResult.university_id == university_id
).order_by(ScrapeResult.created_at.desc()).first()
if not result:
raise HTTPException(status_code=404, detail="没有爬取结果")
schools = result.result_data.get("schools", [])
# 返回简化的学院列表
return {
"total": len(schools),
"schools": [
{
"name": s.get("name"),
"url": s.get("url"),
"program_count": len(s.get("programs", []))
}
for s in schools
]
}
@router.get("/university/{university_id}/programs")
def get_programs(
university_id: int,
school_name: Optional[str] = Query(None, description="按学院筛选"),
search: Optional[str] = Query(None, description="搜索项目名称"),
db: Session = Depends(get_db)
):
"""获取项目列表"""
result = db.query(ScrapeResult).filter(
ScrapeResult.university_id == university_id
).order_by(ScrapeResult.created_at.desc()).first()
if not result:
raise HTTPException(status_code=404, detail="没有爬取结果")
schools = result.result_data.get("schools", [])
programs = []
for school in schools:
if school_name and school.get("name") != school_name:
continue
for prog in school.get("programs", []):
if search and search.lower() not in prog.get("name", "").lower():
continue
programs.append({
"name": prog.get("name"),
"url": prog.get("url"),
"degree_type": prog.get("degree_type"),
"school": school.get("name"),
"faculty_count": len(prog.get("faculty", []))
})
return {
"total": len(programs),
"programs": programs
}
@router.get("/university/{university_id}/faculty")
def get_faculty(
university_id: int,
school_name: Optional[str] = Query(None, description="按学院筛选"),
program_name: Optional[str] = Query(None, description="按项目筛选"),
search: Optional[str] = Query(None, description="搜索导师姓名"),
skip: int = Query(0, ge=0),
limit: int = Query(50, ge=1, le=200),
db: Session = Depends(get_db)
):
"""获取导师列表"""
result = db.query(ScrapeResult).filter(
ScrapeResult.university_id == university_id
).order_by(ScrapeResult.created_at.desc()).first()
if not result:
raise HTTPException(status_code=404, detail="没有爬取结果")
schools = result.result_data.get("schools", [])
faculty_list = []
for school in schools:
if school_name and school.get("name") != school_name:
continue
for prog in school.get("programs", []):
if program_name and prog.get("name") != program_name:
continue
for fac in prog.get("faculty", []):
if search and search.lower() not in fac.get("name", "").lower():
continue
faculty_list.append({
"name": fac.get("name"),
"url": fac.get("url"),
"title": fac.get("title"),
"email": fac.get("email"),
"program": prog.get("name"),
"school": school.get("name")
})
total = len(faculty_list)
faculty_list = faculty_list[skip:skip + limit]
return {
"total": total,
"skip": skip,
"limit": limit,
"faculty": faculty_list
}
@router.get("/university/{university_id}/export")
def export_result(
university_id: int,
format: str = Query("json", enum=["json"]),
db: Session = Depends(get_db)
):
"""导出爬取结果"""
result = db.query(ScrapeResult).filter(
ScrapeResult.university_id == university_id
).order_by(ScrapeResult.created_at.desc()).first()
if not result:
raise HTTPException(status_code=404, detail="没有爬取结果")
if format == "json":
return JSONResponse(
content=result.result_data,
headers={
"Content-Disposition": f"attachment; filename=university_{university_id}_result.json"
}
)
raise HTTPException(status_code=400, detail="不支持的格式")
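
Since the faculty endpoint filters and slices the stored result in memory, large result sets are retrieved page by page with skip/limit. A hedged client sketch under the same base-URL assumption as above:

# Sketch: page through all faculty records for university 1.
import httpx

BASE = "http://localhost:8000/api"  # assumed host and mount prefix

skip, faculty = 0, []
while True:
    page = httpx.get(
        f"{BASE}/results/university/1/faculty",
        params={"skip": skip, "limit": 200},
    ).json()
    faculty.extend(page["faculty"])
    skip += page["limit"]
    if skip >= page["total"]:
        break
print(len(faculty), "faculty records")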

backend/app/api/scripts.py

@@ -0,0 +1,167 @@
"""爬虫脚本API"""
from typing import List
from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
from sqlalchemy.orm import Session
from ..database import get_db
from ..models import University, ScraperScript
from ..schemas.script import (
ScriptCreate,
ScriptResponse,
GenerateScriptRequest,
GenerateScriptResponse
)
from ..services.script_generator import generate_scraper_script
router = APIRouter()
@router.post("/generate", response_model=GenerateScriptResponse)
async def generate_script(
data: GenerateScriptRequest,
background_tasks: BackgroundTasks,
db: Session = Depends(get_db)
):
"""
一键生成爬虫脚本
分析大学网站结构,自动生成爬虫脚本
"""
# 检查或创建大学记录
university = db.query(University).filter(University.url == data.university_url).first()
if not university:
# 从URL提取大学名称
name = data.university_name
if not name:
from urllib.parse import urlparse
parsed = urlparse(data.university_url)
name = parsed.netloc.replace("www.", "").split(".")[0].title()
university = University(
name=name,
url=data.university_url,
status="analyzing"
)
db.add(university)
db.commit()
db.refresh(university)
else:
# 更新状态
university.status = "analyzing"
db.commit()
# 在后台执行脚本生成
background_tasks.add_task(
generate_scraper_script,
university_id=university.id,
university_url=data.university_url
)
return GenerateScriptResponse(
success=True,
university_id=university.id,
script_id=None,
message="正在分析网站结构并生成爬虫脚本...",
status="analyzing"
)
@router.get("/university/{university_id}", response_model=List[ScriptResponse])
def get_university_scripts(
university_id: int,
db: Session = Depends(get_db)
):
"""获取大学的所有爬虫脚本"""
scripts = db.query(ScraperScript).filter(
ScraperScript.university_id == university_id
).order_by(ScraperScript.version.desc()).all()
return scripts
@router.get("/{script_id}", response_model=ScriptResponse)
def get_script(
script_id: int,
db: Session = Depends(get_db)
):
"""获取脚本详情"""
script = db.query(ScraperScript).filter(ScraperScript.id == script_id).first()
if not script:
raise HTTPException(status_code=404, detail="脚本不存在")
return script
@router.post("", response_model=ScriptResponse)
def create_script(
data: ScriptCreate,
db: Session = Depends(get_db)
):
"""手动创建脚本"""
# 检查大学是否存在
university = db.query(University).filter(University.id == data.university_id).first()
if not university:
raise HTTPException(status_code=404, detail="大学不存在")
# 获取当前最高版本
max_version = db.query(ScraperScript).filter(
ScraperScript.university_id == data.university_id
).count()
script = ScraperScript(
university_id=data.university_id,
script_name=data.script_name,
script_content=data.script_content,
config_content=data.config_content,
version=max_version + 1,
status="active"
)
db.add(script)
db.commit()
db.refresh(script)
# 更新大学状态
university.status = "ready"
db.commit()
return script
@router.put("/{script_id}", response_model=ScriptResponse)
def update_script(
script_id: int,
data: ScriptCreate,
db: Session = Depends(get_db)
):
"""更新脚本"""
script = db.query(ScraperScript).filter(ScraperScript.id == script_id).first()
if not script:
raise HTTPException(status_code=404, detail="脚本不存在")
script.script_content = data.script_content
if data.config_content:
script.config_content = data.config_content
db.commit()
db.refresh(script)
return script
@router.delete("/{script_id}")
def delete_script(
script_id: int,
db: Session = Depends(get_db)
):
"""删除脚本"""
script = db.query(ScraperScript).filter(ScraperScript.id == script_id).first()
if not script:
raise HTTPException(status_code=404, detail="脚本不存在")
db.delete(script)
db.commit()
return {"message": "删除成功"}

@@ -0,0 +1,165 @@
"""大学管理API"""
from typing import List, Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.orm import Session
from ..database import get_db
from ..models import University, ScrapeResult
from ..schemas.university import (
UniversityCreate,
UniversityUpdate,
UniversityResponse,
UniversityListResponse
)
router = APIRouter()
@router.get("", response_model=UniversityListResponse)
def list_universities(
skip: int = Query(0, ge=0),
limit: int = Query(20, ge=1, le=100),
search: Optional[str] = None,
db: Session = Depends(get_db)
):
"""获取大学列表"""
query = db.query(University)
if search:
query = query.filter(University.name.ilike(f"%{search}%"))
total = query.count()
universities = query.order_by(University.created_at.desc()).offset(skip).limit(limit).all()
# 添加统计信息
items = []
for uni in universities:
# 获取最新结果
latest_result = db.query(ScrapeResult).filter(
ScrapeResult.university_id == uni.id
).order_by(ScrapeResult.created_at.desc()).first()
items.append(UniversityResponse(
id=uni.id,
name=uni.name,
url=uni.url,
country=uni.country,
description=uni.description,
status=uni.status,
created_at=uni.created_at,
updated_at=uni.updated_at,
scripts_count=len(uni.scripts),
jobs_count=len(uni.jobs),
latest_result={
"schools_count": latest_result.schools_count,
"programs_count": latest_result.programs_count,
"faculty_count": latest_result.faculty_count,
"created_at": latest_result.created_at.isoformat()
} if latest_result else None
))
return UniversityListResponse(total=total, items=items)
@router.post("", response_model=UniversityResponse)
def create_university(
data: UniversityCreate,
db: Session = Depends(get_db)
):
"""创建大学"""
# 检查是否已存在
existing = db.query(University).filter(University.url == data.url).first()
if existing:
raise HTTPException(status_code=400, detail="该大学URL已存在")
university = University(**data.model_dump())
db.add(university)
db.commit()
db.refresh(university)
return UniversityResponse(
id=university.id,
name=university.name,
url=university.url,
country=university.country,
description=university.description,
status=university.status,
created_at=university.created_at,
updated_at=university.updated_at,
scripts_count=0,
jobs_count=0,
latest_result=None
)
@router.get("/{university_id}", response_model=UniversityResponse)
def get_university(
university_id: int,
db: Session = Depends(get_db)
):
"""获取大学详情"""
university = db.query(University).filter(University.id == university_id).first()
if not university:
raise HTTPException(status_code=404, detail="大学不存在")
# 获取最新结果
latest_result = db.query(ScrapeResult).filter(
ScrapeResult.university_id == university.id
).order_by(ScrapeResult.created_at.desc()).first()
return UniversityResponse(
id=university.id,
name=university.name,
url=university.url,
country=university.country,
description=university.description,
status=university.status,
created_at=university.created_at,
updated_at=university.updated_at,
scripts_count=len(university.scripts),
jobs_count=len(university.jobs),
latest_result={
"schools_count": latest_result.schools_count,
"programs_count": latest_result.programs_count,
"faculty_count": latest_result.faculty_count,
"created_at": latest_result.created_at.isoformat()
} if latest_result else None
)
@router.put("/{university_id}", response_model=UniversityResponse)
def update_university(
university_id: int,
data: UniversityUpdate,
db: Session = Depends(get_db)
):
"""更新大学信息"""
university = db.query(University).filter(University.id == university_id).first()
if not university:
raise HTTPException(status_code=404, detail="大学不存在")
update_data = data.model_dump(exclude_unset=True)
for field, value in update_data.items():
setattr(university, field, value)
db.commit()
db.refresh(university)
return get_university(university_id, db)
@router.delete("/{university_id}")
def delete_university(
university_id: int,
db: Session = Depends(get_db)
):
"""删除大学"""
university = db.query(University).filter(University.id == university_id).first()
if not university:
raise HTTPException(status_code=404, detail="大学不存在")
db.delete(university)
db.commit()
return {"message": "删除成功"}