Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
8
backend/app/models/__init__.py
Normal file
8
backend/app/models/__init__.py
Normal file
@ -0,0 +1,8 @@
|
||||
"""数据库模型"""
|
||||
|
||||
from .university import University
|
||||
from .script import ScraperScript
|
||||
from .job import ScrapeJob, ScrapeLog
|
||||
from .result import ScrapeResult
|
||||
|
||||
__all__ = ["University", "ScraperScript", "ScrapeJob", "ScrapeLog", "ScrapeResult"]
|
||||
56
backend/app/models/job.py
Normal file
56
backend/app/models/job.py
Normal file
@ -0,0 +1,56 @@
|
||||
"""爬取任务模型"""
|
||||
|
||||
from datetime import datetime
|
||||
from sqlalchemy import Column, Integer, String, DateTime, Text, ForeignKey
|
||||
from sqlalchemy.orm import relationship
|
||||
|
||||
from ..database import Base
|
||||
|
||||
|
||||
class ScrapeJob(Base):
    """Scrape job table: one row per scraping run against a university.

    Tracks lifecycle status, progress, timing, and an error message, and
    links the job to its university, the script that ran it, and the
    logs/results it produced.
    """

    __tablename__ = "scrape_jobs"

    id = Column(Integer, primary_key=True, index=True)
    # Owning university; required.
    university_id = Column(Integer, ForeignKey("universities.id"), nullable=False)
    # Script used for this run; optional (job may exist before a script is chosen).
    script_id = Column(Integer, ForeignKey("scraper_scripts.id"))

    status = Column(String(50), default="pending")  # pending, running, completed, failed, cancelled
    progress = Column(Integer, default=0)  # progress percentage, 0-100
    current_step = Column(String(255))  # human-readable description of the current step

    started_at = Column(DateTime)
    completed_at = Column(DateTime)
    error_message = Column(Text)

    # NOTE(review): datetime.utcnow is naive and deprecated since Python 3.12;
    # consider a timezone-aware default — confirm downstream tz handling first.
    created_at = Column(DateTime, default=datetime.utcnow)

    # Relationships. Deleting a job cascades to its logs and results.
    university = relationship("University", back_populates="jobs")
    script = relationship("ScraperScript", back_populates="jobs")
    logs = relationship("ScrapeLog", back_populates="job", cascade="all, delete-orphan")
    results = relationship("ScrapeResult", back_populates="job", cascade="all, delete-orphan")

    def __repr__(self):
        return f"<ScrapeJob(id={self.id}, status='{self.status}')>"
|
||||
|
||||
|
||||
class ScrapeLog(Base):
    """Scrape log table: one row per log line emitted while a job runs."""

    __tablename__ = "scrape_logs"

    id = Column(Integer, primary_key=True, index=True)
    # Job that produced this line; required.
    job_id = Column(Integer, ForeignKey("scrape_jobs.id"), nullable=False)

    level = Column(String(20), default="info")  # debug, info, warning, error
    message = Column(Text, nullable=False)

    # NOTE(review): datetime.utcnow is naive and deprecated since Python 3.12.
    created_at = Column(DateTime, default=datetime.utcnow)

    # Relationship back to the owning job.
    job = relationship("ScrapeJob", back_populates="logs")

    def __repr__(self):
        return f"<ScrapeLog(id={self.id}, level='{self.level}')>"
|
||||
34
backend/app/models/result.py
Normal file
34
backend/app/models/result.py
Normal file
@ -0,0 +1,34 @@
|
||||
"""爬取结果模型"""
|
||||
|
||||
from datetime import datetime
|
||||
from sqlalchemy import Column, Integer, DateTime, ForeignKey, JSON
|
||||
from sqlalchemy.orm import relationship
|
||||
|
||||
from ..database import Base
|
||||
|
||||
|
||||
class ScrapeResult(Base):
    """Scrape result table: the JSON payload and summary counts from one run."""

    __tablename__ = "scrape_results"

    id = Column(Integer, primary_key=True, index=True)
    # Producing job; optional (a result can be kept after its job is gone).
    job_id = Column(Integer, ForeignKey("scrape_jobs.id"))
    # University the data describes; required.
    university_id = Column(Integer, ForeignKey("universities.id"), nullable=False)

    # JSON payload: school -> program -> faculty hierarchy.
    result_data = Column(JSON, nullable=False)

    # Summary statistics over result_data.
    schools_count = Column(Integer, default=0)
    programs_count = Column(Integer, default=0)
    faculty_count = Column(Integer, default=0)

    # NOTE(review): datetime.utcnow is naive and deprecated since Python 3.12.
    created_at = Column(DateTime, default=datetime.utcnow)

    # Relationships back to the producing job and the university.
    job = relationship("ScrapeJob", back_populates="results")
    university = relationship("University", back_populates="results")

    def __repr__(self):
        return f"<ScrapeResult(id={self.id}, programs={self.programs_count}, faculty={self.faculty_count})>"
|
||||
34
backend/app/models/script.py
Normal file
34
backend/app/models/script.py
Normal file
@ -0,0 +1,34 @@
|
||||
"""爬虫脚本模型"""
|
||||
|
||||
from datetime import datetime
|
||||
from sqlalchemy import Column, Integer, String, DateTime, Text, ForeignKey, JSON
|
||||
from sqlalchemy.orm import relationship
|
||||
|
||||
from ..database import Base
|
||||
|
||||
|
||||
class ScraperScript(Base):
    """Scraper script table: a versioned scraper (code + config) for a university."""

    __tablename__ = "scraper_scripts"

    id = Column(Integer, primary_key=True, index=True)
    # University this script scrapes; required.
    university_id = Column(Integer, ForeignKey("universities.id"), nullable=False)

    script_name = Column(String(255), nullable=False)
    script_content = Column(Text, nullable=False)  # Python scraper source code
    config_content = Column(JSON)  # YAML config stored as JSON

    version = Column(Integer, default=1)
    status = Column(String(50), default="draft")  # draft, active, deprecated, error
    error_message = Column(Text)

    # NOTE(review): datetime.utcnow is naive and deprecated since Python 3.12.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Relationships: owning university and the jobs that ran this script.
    university = relationship("University", back_populates="scripts")
    jobs = relationship("ScrapeJob", back_populates="script")

    def __repr__(self):
        return f"<ScraperScript(id={self.id}, name='{self.script_name}')>"
|
||||
31
backend/app/models/university.py
Normal file
31
backend/app/models/university.py
Normal file
@ -0,0 +1,31 @@
|
||||
"""大学模型"""
|
||||
|
||||
from datetime import datetime
|
||||
from sqlalchemy import Column, Integer, String, DateTime, Text
|
||||
from sqlalchemy.orm import relationship
|
||||
|
||||
from ..database import Base
|
||||
|
||||
|
||||
class University(Base):
    """University table: one row per target university being scraped."""

    __tablename__ = "universities"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String(255), nullable=False, index=True)
    url = Column(String(500), nullable=False)  # university homepage / entry point
    country = Column(String(100))
    description = Column(Text)
    status = Column(String(50), default="pending")  # pending, analyzing, ready, error

    # NOTE(review): datetime.utcnow is naive and deprecated since Python 3.12.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Relationships. Deleting a university cascades to its scripts, jobs, and results.
    scripts = relationship("ScraperScript", back_populates="university", cascade="all, delete-orphan")
    jobs = relationship("ScrapeJob", back_populates="university", cascade="all, delete-orphan")
    results = relationship("ScrapeResult", back_populates="university", cascade="all, delete-orphan")

    def __repr__(self):
        return f"<University(id={self.id}, name='{self.name}')>"
|
||||
Reference in New Issue
Block a user