Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
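For context when reading the endpoints below: they all walk a nested `result_data` JSON document stored on `ScrapeResult`. A minimal sketch of the shape they assume, with field names taken from the `.get()` calls in the code and every value invented for illustration:

# Illustrative only: the nested structure results.py expects in
# ScrapeResult.result_data. Field names mirror the .get() calls in the
# endpoints below; all values here are made up.
example_result_data = {
    "schools": [
        {
            "name": "School of Engineering",
            "url": "https://example.edu/engineering",
            "programs": [
                {
                    "name": "MSc Computer Science",
                    "url": "https://example.edu/msc-cs",
                    "degree_type": "MSc",
                    "faculty": [
                        {
                            "name": "Jane Doe",
                            "url": "https://example.edu/people/jane-doe",
                            "title": "Professor",
                            "email": "jane.doe@example.edu",
                        }
                    ],
                }
            ],
        }
    ]
}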
backend/app/api/results.py (new file, 175 lines)
@@ -0,0 +1,175 @@
"""Scrape results API"""

from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import JSONResponse
from sqlalchemy.orm import Session

from ..database import get_db
from ..models import ScrapeResult
from ..schemas.result import ResultResponse

router = APIRouter()


@router.get("/university/{university_id}", response_model=ResultResponse)
|
||||
def get_university_result(
|
||||
university_id: int,
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
"""获取大学最新的爬取结果"""
|
||||
result = db.query(ScrapeResult).filter(
|
||||
ScrapeResult.university_id == university_id
|
||||
).order_by(ScrapeResult.created_at.desc()).first()
|
||||
|
||||
if not result:
|
||||
raise HTTPException(status_code=404, detail="没有爬取结果")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@router.get("/university/{university_id}/schools")
|
||||
def get_schools(
|
||||
university_id: int,
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
"""获取学院列表"""
|
||||
result = db.query(ScrapeResult).filter(
|
||||
ScrapeResult.university_id == university_id
|
||||
).order_by(ScrapeResult.created_at.desc()).first()
|
||||
|
||||
if not result:
|
||||
raise HTTPException(status_code=404, detail="没有爬取结果")
|
||||
|
||||
schools = result.result_data.get("schools", [])
|
||||
|
||||
# 返回简化的学院列表
|
||||
return {
|
||||
"total": len(schools),
|
||||
"schools": [
|
||||
{
|
||||
"name": s.get("name"),
|
||||
"url": s.get("url"),
|
||||
"program_count": len(s.get("programs", []))
|
||||
}
|
||||
for s in schools
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
@router.get("/university/{university_id}/programs")
|
||||
def get_programs(
|
||||
university_id: int,
|
||||
school_name: Optional[str] = Query(None, description="按学院筛选"),
|
||||
search: Optional[str] = Query(None, description="搜索项目名称"),
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
"""获取项目列表"""
|
||||
result = db.query(ScrapeResult).filter(
|
||||
ScrapeResult.university_id == university_id
|
||||
).order_by(ScrapeResult.created_at.desc()).first()
|
||||
|
||||
if not result:
|
||||
raise HTTPException(status_code=404, detail="没有爬取结果")
|
||||
|
||||
schools = result.result_data.get("schools", [])
|
||||
programs = []
|
||||
|
||||
for school in schools:
|
||||
if school_name and school.get("name") != school_name:
|
||||
continue
|
||||
|
||||
for prog in school.get("programs", []):
|
||||
if search and search.lower() not in prog.get("name", "").lower():
|
||||
continue
|
||||
|
||||
programs.append({
|
||||
"name": prog.get("name"),
|
||||
"url": prog.get("url"),
|
||||
"degree_type": prog.get("degree_type"),
|
||||
"school": school.get("name"),
|
||||
"faculty_count": len(prog.get("faculty", []))
|
||||
})
|
||||
|
||||
return {
|
||||
"total": len(programs),
|
||||
"programs": programs
|
||||
}
|
||||
|
||||
|
||||
@router.get("/university/{university_id}/faculty")
|
||||
def get_faculty(
|
||||
university_id: int,
|
||||
school_name: Optional[str] = Query(None, description="按学院筛选"),
|
||||
program_name: Optional[str] = Query(None, description="按项目筛选"),
|
||||
search: Optional[str] = Query(None, description="搜索导师姓名"),
|
||||
skip: int = Query(0, ge=0),
|
||||
limit: int = Query(50, ge=1, le=200),
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
"""获取导师列表"""
|
||||
result = db.query(ScrapeResult).filter(
|
||||
ScrapeResult.university_id == university_id
|
||||
).order_by(ScrapeResult.created_at.desc()).first()
|
||||
|
||||
if not result:
|
||||
raise HTTPException(status_code=404, detail="没有爬取结果")
|
||||
|
||||
schools = result.result_data.get("schools", [])
|
||||
faculty_list = []
|
||||
|
||||
for school in schools:
|
||||
if school_name and school.get("name") != school_name:
|
||||
continue
|
||||
|
||||
for prog in school.get("programs", []):
|
||||
if program_name and prog.get("name") != program_name:
|
||||
continue
|
||||
|
||||
for fac in prog.get("faculty", []):
|
||||
if search and search.lower() not in fac.get("name", "").lower():
|
||||
continue
|
||||
|
||||
faculty_list.append({
|
||||
"name": fac.get("name"),
|
||||
"url": fac.get("url"),
|
||||
"title": fac.get("title"),
|
||||
"email": fac.get("email"),
|
||||
"program": prog.get("name"),
|
||||
"school": school.get("name")
|
||||
})
|
||||
|
||||
total = len(faculty_list)
|
||||
faculty_list = faculty_list[skip:skip + limit]
|
||||
|
||||
return {
|
||||
"total": total,
|
||||
"skip": skip,
|
||||
"limit": limit,
|
||||
"faculty": faculty_list
|
||||
}
|
||||
|
||||
|
||||
@router.get("/university/{university_id}/export")
|
||||
def export_result(
|
||||
university_id: int,
|
||||
format: str = Query("json", enum=["json"]),
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
"""导出爬取结果"""
|
||||
result = db.query(ScrapeResult).filter(
|
||||
ScrapeResult.university_id == university_id
|
||||
).order_by(ScrapeResult.created_at.desc()).first()
|
||||
|
||||
if not result:
|
||||
raise HTTPException(status_code=404, detail="没有爬取结果")
|
||||
|
||||
if format == "json":
|
||||
return JSONResponse(
|
||||
content=result.result_data,
|
||||
headers={
|
||||
"Content-Disposition": f"attachment; filename=university_{university_id}_result.json"
|
||||
}
|
||||
)
|
||||
|
||||
raise HTTPException(status_code=400, detail="不支持的格式")
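The `ScrapeResult` model and `ResultResponse` schema are imported from elsewhere in this commit and are not shown in this file. A minimal sketch consistent with how this module uses the model; only the three attributes queried above are grounded in the code, and everything else (table name, primary key, defaults) is assumed:

# Hypothetical sketch of the model this file queries; the real definition
# lives in backend/app/models. Only the attributes used above are grounded.
from sqlalchemy import JSON, Column, DateTime, Integer, func
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class ScrapeResult(Base):
    __tablename__ = "scrape_results"  # assumed table name

    id = Column(Integer, primary_key=True)                     # assumed
    university_id = Column(Integer, index=True)                # filtered on above
    result_data = Column(JSON)                                 # nested schools/programs/faculty
    created_at = Column(DateTime, server_default=func.now())   # ordered on above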
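A usage sketch, assuming the service runs locally on port 8000 and `router` is mounted under an `/api/results` prefix (the prefix is set wherever the router is included, not in this file):

# Assumes localhost:8000 and an /api/results mount prefix -- both are guesses;
# check where backend/app/main.py includes this router for the real paths.
import httpx

BASE = "http://localhost:8000/api/results"

latest = httpx.get(f"{BASE}/university/1").json()
schools = httpx.get(f"{BASE}/university/1/schools").json()
programs = httpx.get(f"{BASE}/university/1/programs", params={"search": "physics"}).json()
faculty = httpx.get(f"{BASE}/university/1/faculty", params={"skip": 0, "limit": 50}).json()
print(schools["total"], programs["total"], faculty["total"])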