Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
yangxiaoyu-crypto
2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions

View File

@ -0,0 +1,33 @@
"""Pydantic schemas for API"""
from .university import (
UniversityCreate,
UniversityUpdate,
UniversityResponse,
UniversityListResponse
)
from .script import (
ScriptCreate,
ScriptResponse,
GenerateScriptRequest,
GenerateScriptResponse
)
from .job import (
JobCreate,
JobResponse,
JobStatusResponse,
LogResponse
)
from .result import (
ResultResponse,
SchoolData,
ProgramData,
FacultyData
)
# Names re-exported by this package, grouped by the submodule they come from.
__all__ = [
    # .university
    "UniversityCreate",
    "UniversityUpdate",
    "UniversityResponse",
    "UniversityListResponse",
    # .script
    "ScriptCreate",
    "ScriptResponse",
    "GenerateScriptRequest",
    "GenerateScriptResponse",
    # .job
    "JobCreate",
    "JobResponse",
    "JobStatusResponse",
    "LogResponse",
    # .result
    "ResultResponse",
    "SchoolData",
    "ProgramData",
    "FacultyData",
]

View File

@ -0,0 +1,52 @@
"""爬取任务相关的Pydantic模型"""
from datetime import datetime
from typing import Optional, List
from pydantic import BaseModel
class JobCreate(BaseModel):
    """Request body for creating a job."""

    # Required: identifier of the university the job belongs to.
    university_id: int

    # Optional script identifier; None when the caller does not supply one.
    script_id: Optional[int] = None
class JobResponse(BaseModel):
    """Job response."""

    # Pydantic v2 configuration. ``from_attributes`` lets the model be built
    # from an object's attributes rather than only from a mapping. This
    # replaces the class-based ``Config``, which is deprecated in Pydantic v2
    # and triggers a deprecation warning at import time.
    model_config = {"from_attributes": True}

    id: int
    university_id: int
    script_id: Optional[int] = None

    # Required status and progress values; their allowed ranges are not
    # constrained by this schema.
    status: str
    progress: int
    current_step: Optional[str] = None

    # Timestamps and error text are None until a value is provided.
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    error_message: Optional[str] = None

    created_at: datetime
class JobStatusResponse(BaseModel):
    """Job status response."""

    id: int
    status: str
    progress: int

    # None when no current step is provided.
    current_step: Optional[str] = None

    # "LogResponse" is a string forward reference because that class is
    # defined later in this module; defaults to an empty list.
    logs: List["LogResponse"] = []
class LogResponse(BaseModel):
    """Log entry response."""

    # Pydantic v2 configuration. ``from_attributes`` lets the model be built
    # from an object's attributes rather than only from a mapping. This
    # replaces the class-based ``Config``, which is deprecated in Pydantic v2
    # and triggers a deprecation warning at import time.
    model_config = {"from_attributes": True}

    id: int
    level: str
    message: str
    created_at: datetime
# Resolve the "LogResponse" forward reference used by JobStatusResponse.logs,
# now that LogResponse has been defined.
JobStatusResponse.model_rebuild()

View File

@ -0,0 +1,67 @@
"""爬取结果相关的Pydantic模型"""
from datetime import datetime
from typing import Optional, List, Dict, Any
from pydantic import BaseModel
class FacultyData(BaseModel):
    """Faculty member (advisor) data."""

    # Required fields.
    name: str
    url: str

    # Optional details; each is None when not provided.
    title: Optional[str] = None
    email: Optional[str] = None
    department: Optional[str] = None
class ProgramData(BaseModel):
    """Program data."""

    # Required fields.
    name: str
    url: str

    # Optional details; each is None when not provided.
    degree_type: Optional[str] = None
    description: Optional[str] = None
    faculty_page_url: Optional[str] = None

    # Defaults to 0; the schema does not derive it from ``faculty``.
    faculty_count: int = 0

    # Nested faculty entries; defaults to an empty list.
    faculty: List[FacultyData] = []
class SchoolData(BaseModel):
    """School (college) data."""

    # Required fields.
    name: str
    url: str

    # None when no description is provided.
    description: Optional[str] = None

    # Defaults to 0; the schema does not derive it from ``programs``.
    program_count: int = 0

    # Nested program entries; defaults to an empty list.
    programs: List[ProgramData] = []
class ResultResponse(BaseModel):
    """Full result response."""

    # Pydantic v2 configuration. ``from_attributes`` lets the model be built
    # from an object's attributes rather than only from a mapping. This
    # replaces the class-based ``Config``, which is deprecated in Pydantic v2
    # and triggers a deprecation warning at import time.
    model_config = {"from_attributes": True}

    id: int
    university_id: int
    job_id: Optional[int] = None

    # Statistics
    schools_count: int
    programs_count: int
    faculty_count: int

    # Full data: an arbitrary string-keyed mapping; its inner structure is not
    # validated by this schema.
    result_data: Dict[str, Any]

    created_at: datetime
class ResultSummary(BaseModel):
    """Result summary."""

    # Pydantic v2 configuration. ``from_attributes`` lets the model be built
    # from an object's attributes rather than only from a mapping. This
    # replaces the class-based ``Config``, which is deprecated in Pydantic v2
    # and triggers a deprecation warning at import time.
    model_config = {"from_attributes": True}

    id: int
    university_id: int

    # Count fields only; this schema has no ``result_data`` payload field.
    schools_count: int
    programs_count: int
    faculty_count: int

    created_at: datetime

View File

@ -0,0 +1,46 @@
"""爬虫脚本相关的Pydantic模型"""
from datetime import datetime
from typing import Optional, Dict, Any
from pydantic import BaseModel
class ScriptBase(BaseModel):
    """Base fields of a script."""

    # Required name and content of the script.
    script_name: str
    script_content: str

    # Optional string-keyed configuration mapping; None when not provided.
    config_content: Optional[Dict[str, Any]] = None
class ScriptCreate(ScriptBase):
    """Request body for creating a script."""

    # Required, in addition to the fields inherited from ScriptBase.
    university_id: int
class ScriptResponse(ScriptBase):
    """Script response."""

    # Pydantic v2 configuration. ``from_attributes`` lets the model be built
    # from an object's attributes rather than only from a mapping. This
    # replaces the class-based ``Config``, which is deprecated in Pydantic v2
    # and triggers a deprecation warning at import time.
    model_config = {"from_attributes": True}

    id: int
    university_id: int
    version: int
    status: str

    # None when no error message is provided.
    error_message: Optional[str] = None

    created_at: datetime
    updated_at: datetime
class GenerateScriptRequest(BaseModel):
    """Request body for generating a script."""

    # Required; typed as a plain string, so no URL validation happens here.
    university_url: str

    # Optional name; None when the caller does not supply one.
    university_name: Optional[str] = None
class GenerateScriptResponse(BaseModel):
    """Response for a script generation request."""

    success: bool
    university_id: int

    # None when no script identifier is provided.
    script_id: Optional[int] = None

    message: str

    # analyzing, completed, failed
    status: str

View File

@ -0,0 +1,48 @@
"""大学相关的Pydantic模型"""
from datetime import datetime
from typing import Optional, List
from pydantic import BaseModel, HttpUrl
class UniversityBase(BaseModel):
    """Base fields of a university."""

    # Required fields. ``url`` is typed as a plain string, so this schema
    # performs no URL validation on it.
    name: str
    url: str

    # Optional details; each is None when not provided.
    country: Optional[str] = None
    description: Optional[str] = None
class UniversityCreate(UniversityBase):
    """Request body for creating a university."""
class UniversityUpdate(BaseModel):
    """Request body for updating a university."""

    # Every field is optional and defaults to None.
    name: Optional[str] = None
    url: Optional[str] = None
    country: Optional[str] = None
    description: Optional[str] = None
class UniversityResponse(UniversityBase):
    """University response."""

    # Pydantic v2 configuration. ``from_attributes`` lets the model be built
    # from an object's attributes rather than only from a mapping. This
    # replaces the class-based ``Config``, which is deprecated in Pydantic v2
    # and triggers a deprecation warning at import time.
    model_config = {"from_attributes": True}

    id: int
    status: str
    created_at: datetime
    updated_at: datetime

    # Statistics; the counts default to 0 and the latest result to None when
    # not supplied.
    scripts_count: int = 0
    jobs_count: int = 0
    latest_result: Optional[dict] = None
class UniversityListResponse(BaseModel):
    """University list response."""

    # Required total; the schema does not tie it to ``len(items)``.
    total: int

    # The university entries included in this response.
    items: List[UniversityResponse]