Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaoyu-crypto
2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions

View File

@@ -0,0 +1,6 @@
"""业务服务"""
from .script_generator import generate_scraper_script
from .scraper_runner import run_scraper
__all__ = ["generate_scraper_script", "run_scraper"]
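
For orientation, here is a minimal sketch of how these two services might be wired into the FastAPI backend mentioned in the commit message. The endpoint paths, parameters, and router layout below are illustrative assumptions, not code from this commit; both functions are plain synchronous callables that open their own database sessions, so they can be handed to BackgroundTasks directly.

from fastapi import APIRouter, BackgroundTasks

from ..services import generate_scraper_script, run_scraper

router = APIRouter()

@router.post("/universities/{university_id}/generate-script")
def generate_script(university_id: int, university_url: str, background_tasks: BackgroundTasks):
    # Hypothetical endpoint: queue script generation and return immediately
    background_tasks.add_task(generate_scraper_script, university_id, university_url)
    return {"status": "queued"}

@router.post("/jobs/{job_id}/run")
def run_job(job_id: int, script_id: int, background_tasks: BackgroundTasks):
    # Hypothetical endpoint: queue the scraper run; progress and logs go to the job tables
    background_tasks.add_task(run_scraper, job_id, script_id)
    return {"status": "queued"}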

View File

@@ -0,0 +1,177 @@
"""
Scraper execution service.
Runs a scraper script and saves its results.
"""
import asyncio
import json
import re
import sys
import traceback
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse

from sqlalchemy.orm import Session

# On Windows, Playwright requires the Proactor event loop policy
if sys.platform == "win32":
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

# Import Playwright for use by the generated scripts
try:
    from playwright.async_api import async_playwright
    PLAYWRIGHT_AVAILABLE = True
except ImportError:
    PLAYWRIGHT_AVAILABLE = False
    async_playwright = None

from ..database import SessionLocal
from ..models import ScraperScript, ScrapeJob, ScrapeLog, ScrapeResult


def run_scraper(job_id: int, script_id: int):
    """Background task that executes a scraper job."""
    db = SessionLocal()
    try:
        job = db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
        script = db.query(ScraperScript).filter(ScraperScript.id == script_id).first()
        if not job or not script:
            return

        # Update job status
        job.status = "running"
        job.started_at = datetime.utcnow()
        job.current_step = "Initializing..."
        job.progress = 5
        db.commit()

        _add_log(db, job_id, "info", "Starting scraper script")

        # Log callback passed into the executed script
        def log_callback(level: str, message: str):
            _add_log(db, job_id, level, message)

        # Execute the script
        job.current_step = "Scraping data..."
        job.progress = 20
        db.commit()

        result_data = _execute_script(script.script_content, log_callback)

        if result_data:
            job.progress = 80
            job.current_step = "Saving results..."
            db.commit()
            _add_log(db, job_id, "info", "Scrape finished, saving results...")

            # Compute summary statistics
            schools = result_data.get("schools", [])
            schools_count = len(schools)
            programs_count = sum(len(s.get("programs", [])) for s in schools)
            faculty_count = sum(
                len(p.get("faculty", []))
                for s in schools
                for p in s.get("programs", [])
            )

            # Save the result
            result = ScrapeResult(
                job_id=job_id,
                university_id=job.university_id,
                result_data=result_data,
                schools_count=schools_count,
                programs_count=programs_count,
                faculty_count=faculty_count
            )
            db.add(result)

            job.status = "completed"
            job.progress = 100
            job.current_step = "Done"
            job.completed_at = datetime.utcnow()
            _add_log(
                db, job_id, "info",
                f"Scrape succeeded: {schools_count} schools, {programs_count} programs, {faculty_count} faculty"
            )
        else:
            job.status = "failed"
            job.error_message = "Script returned no result"
            job.completed_at = datetime.utcnow()
            _add_log(db, job_id, "error", "Script execution failed: no result returned")

        db.commit()
    except Exception as e:
        error_msg = f"Execution error: {str(e)}\n{traceback.format_exc()}"
        _add_log(db, job_id, "error", error_msg)
        job = db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
        if job:
            job.status = "failed"
            job.error_message = str(e)
            job.completed_at = datetime.utcnow()
            db.commit()
    finally:
        db.close()


def _execute_script(script_content: str, log_callback) -> dict:
    """
    Execute the Python script content in an isolated namespace.
    """
    if not PLAYWRIGHT_AVAILABLE:
        log_callback("error", "Playwright is not installed; run: pip install playwright && playwright install")
        return None

    # Build the execution environment with every module the script needs.
    # Note: the same dict is used for globals and locals so that functions
    # defined by the script can see each other.
    exec_namespace = {
        "__builtins__": __builtins__,
        "asyncio": asyncio,
        "json": json,
        "re": re,
        "datetime": datetime,
        "timezone": timezone,
        "urljoin": urljoin,
        "urlparse": urlparse,
        "async_playwright": async_playwright,
    }

    try:
        # Compile and run the script in a single namespace so its functions can call each other
        exec(script_content, exec_namespace, exec_namespace)

        # Look up the scrape entry point
        scrape_func = exec_namespace.get("scrape")
        if not scrape_func:
            log_callback("error", "No scrape function found in the script")
            return None

        # Run the async scraper function
        result = asyncio.run(scrape_func(output_callback=log_callback))
        return result
    except Exception as e:
        log_callback("error", f"Script execution exception: {str(e)}")
        raise


def _add_log(db: Session, job_id: int, level: str, message: str):
    """Append a log entry for the job."""
    log = ScrapeLog(
        job_id=job_id,
        level=level,
        message=message
    )
    db.add(log)
    db.commit()
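
For reference, the contract that _execute_script expects from a generated script, shown as a minimal illustrative example (not part of this commit): the source must define a module-level async scrape(output_callback=None) and return a dict whose "schools" entries carry "programs" and "faculty" lists, since run_scraper computes its counts from that shape.

EXAMPLE_SCRIPT = '''
async def scrape(output_callback=None):
    if output_callback:
        output_callback("info", "demo script running")
    return {
        "name": "Demo University",
        "schools": [
            {"name": "Demo School", "programs": [{"name": "Demo MSc", "url": "", "faculty": []}]}
        ],
    }
'''

# _execute_script would exec this source, look up `scrape`, and run it, e.g.:
# result = _execute_script(EXAMPLE_SCRIPT, lambda level, msg: print(level, msg))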

View File

@@ -0,0 +1,558 @@
"""
Scraper script generation service.
Analyzes a university website's structure and automatically generates a scraper script.
"""
import re
from datetime import datetime
from urllib.parse import urlparse

from sqlalchemy.orm import Session

from ..database import SessionLocal
from ..models import University, ScraperScript

# Pre-built scraper script templates keyed by university domain
SCRAPER_TEMPLATES = {
    "harvard.edu": "harvard_scraper",
    "mit.edu": "generic_scraper",
    "stanford.edu": "generic_scraper",
}


def generate_scraper_script(university_id: int, university_url: str):
    """
    Background task that generates a scraper script.

    1. Parse the university website's domain.
    2. Use a pre-built template if one exists for that domain.
    3. Otherwise, generate a generic scraper script.
    """
    db = SessionLocal()
    university = None
    try:
        university = db.query(University).filter(University.id == university_id).first()
        if not university:
            return

        # Parse the URL to get the domain
        parsed = urlparse(university_url)
        domain = parsed.netloc.replace("www.", "")

        # Check for a pre-built template
        template_name = None
        for pattern, template in SCRAPER_TEMPLATES.items():
            if pattern in domain:
                template_name = template
                break

        # Generate the script and its config
        script_content = _generate_script_content(domain, template_name)
        config_content = _generate_config_content(university.name, university_url, domain)

        # Compute the version number
        existing_count = db.query(ScraperScript).filter(
            ScraperScript.university_id == university_id
        ).count()

        # Save the script
        script = ScraperScript(
            university_id=university_id,
            script_name=f"{domain.replace('.', '_')}_scraper",
            script_content=script_content,
            config_content=config_content,
            version=existing_count + 1,
            status="active"
        )
        db.add(script)

        # Update the university's status
        university.status = "ready"
        db.commit()
    except Exception:
        # Record the error state; `university` may still be None if the lookup itself failed
        if university:
            university.status = "error"
            db.commit()
        raise
    finally:
        db.close()

def _generate_script_content(domain: str, template_name: str = None) -> str:
    """Generate the Python scraper script content."""
    if template_name == "harvard_scraper":
        return '''"""
Harvard University scraper script.
Auto-generated.
"""
import asyncio
import json
from datetime import datetime, timezone
from playwright.async_api import async_playwright

# School URL mapping
SCHOOL_MAPPING = {
    "gsas.harvard.edu": "Graduate School of Arts and Sciences (GSAS)",
    "seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
    "hbs.edu": "Harvard Business School (HBS)",
    "gsd.harvard.edu": "Graduate School of Design (GSD)",
    "gse.harvard.edu": "Graduate School of Education (HGSE)",
    "hks.harvard.edu": "Harvard Kennedy School (HKS)",
    "hls.harvard.edu": "Harvard Law School (HLS)",
    "hms.harvard.edu": "Harvard Medical School (HMS)",
    "hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
    "hds.harvard.edu": "Harvard Divinity School (HDS)",
    "fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
}


async def scrape(output_callback=None):
    """Run the scrape."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        result = {
            "name": "Harvard University",
            "url": "https://www.harvard.edu/",
            "country": "USA",
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "schools": []
        }

        # Visit the program listing page
        if output_callback:
            output_callback("info", "Visiting Harvard program listing...")
        await page.goto("https://www.harvard.edu/programs/?degree_levels=graduate")
        await page.wait_for_timeout(3000)

        # Extract program data
        programs = await page.evaluate("""() => {
            const items = document.querySelectorAll('[class*="records__record"]');
            const programs = [];
            items.forEach(item => {
                const btn = item.querySelector('button[class*="title-link"]');
                if (btn) {
                    programs.push({
                        name: btn.innerText.trim(),
                        url: ''
                    });
                }
            });
            return programs;
        }""")

        if output_callback:
            output_callback("info", f"Found {len(programs)} programs")

        # Simplified output
        result["schools"] = [{
            "name": "Graduate Programs",
            "url": "https://www.harvard.edu/programs/",
            "programs": [{"name": p["name"], "url": p["url"], "faculty": []} for p in programs[:50]]
        }]

        await browser.close()
        return result


if __name__ == "__main__":
    result = asyncio.run(scrape())
    print(json.dumps(result, indent=2, ensure_ascii=False))
'''

    # Generic scraper template for deep-crawling master's programs.
    # Assembled in a helper to avoid quoting conflicts between the f-string
    # and the embedded JavaScript.
    return _build_generic_scraper_template(domain)

def _build_generic_scraper_template(domain: str) -> str:
    """Build the generic scraper template."""
    # JavaScript code blocks (raw strings to avoid escaping issues)
    js_check_courses = r'''() => {
    const links = document.querySelectorAll('a[href]');
    let courseCount = 0;
    for (const a of links) {
        const href = a.href.toLowerCase();
        if (/\/\d{4,}\//.test(href) ||
            /\/(msc|ma|mba|mres|llm|med|meng)-/.test(href) ||
            /\/course\/[a-z]/.test(href)) {
            courseCount++;
        }
    }
    return courseCount;
}'''

    js_find_list_url = r'''() => {
    const links = document.querySelectorAll('a[href]');
    for (const a of links) {
        const text = a.innerText.toLowerCase();
        const href = a.href.toLowerCase();
        if ((text.includes('a-z') || text.includes('all course') ||
             text.includes('full list') || text.includes('browse all') ||
             href.includes('/list')) &&
            (href.includes('master') || href.includes('course') || href.includes('postgrad'))) {
            return a.href;
        }
    }
    return null;
}'''

    js_find_courses_from_home = r'''() => {
    const links = document.querySelectorAll('a[href]');
    for (const a of links) {
        const href = a.href.toLowerCase();
        const text = a.innerText.toLowerCase();
        if ((href.includes('master') || href.includes('postgraduate') || href.includes('graduate')) &&
            (href.includes('course') || href.includes('program') || href.includes('degree'))) {
            return a.href;
        }
    }
    return null;
}'''

    js_extract_programs = r'''() => {
    const programs = [];
    const seen = new Set();
    const currentHost = window.location.hostname;
    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href;
        const text = a.innerText.trim().replace(/\s+/g, ' ');
        if (!href || seen.has(href)) return;
        if (text.length < 5 || text.length > 200) return;
        if (href.includes('#') || href.includes('javascript:') || href.includes('mailto:')) return;
        try {
            const linkHost = new URL(href).hostname;
            if (!linkHost.includes(currentHost.replace('www.', '')) &&
                !currentHost.includes(linkHost.replace('www.', ''))) return;
        } catch {
            return;
        }
        const hrefLower = href.toLowerCase();
        const textLower = text.toLowerCase();
        const isNavigation = textLower === 'courses' ||
            textLower === 'programmes' ||
            textLower === 'undergraduate' ||
            textLower === 'postgraduate' ||
            textLower === 'masters' ||
            textLower === "master's" ||
            textLower.includes('skip to') ||
            textLower.includes('share') ||
            textLower === 'home' ||
            textLower === 'study' ||
            textLower.startsWith('a-z') ||
            textLower.includes('admission') ||
            textLower.includes('fees and funding') ||
            textLower.includes('why should') ||
            textLower.includes('why manchester') ||
            textLower.includes('teaching and learning') ||
            textLower.includes('meet us') ||
            textLower.includes('student support') ||
            textLower.includes('contact us') ||
            textLower.includes('how to apply') ||
            hrefLower.includes('/admissions/') ||
            hrefLower.includes('/fees-and-funding/') ||
            hrefLower.includes('/why-') ||
            hrefLower.includes('/meet-us/') ||
            hrefLower.includes('/contact-us/') ||
            hrefLower.includes('/student-support/') ||
            hrefLower.includes('/teaching-and-learning/') ||
            hrefLower.endsWith('/courses/') ||
            hrefLower.endsWith('/masters/') ||
            hrefLower.endsWith('/postgraduate/');
        if (isNavigation) return;
        const isExcluded = hrefLower.includes('/undergraduate') ||
            hrefLower.includes('/bachelor') ||
            hrefLower.includes('/phd/') ||
            hrefLower.includes('/doctoral') ||
            hrefLower.includes('/research-degree') ||
            textLower.includes('bachelor') ||
            textLower.includes('undergraduate') ||
            (textLower.includes('phd') && !textLower.includes('mphil'));
        if (isExcluded) return;
        const hasNumericId = /\/\d{4,}\//.test(href);
        const hasDegreeSlug = /\/(msc|ma|mba|mres|llm|med|meng|mpa|mph|mphil)-[a-z]/.test(hrefLower);
        const isCoursePage = (hrefLower.includes('/course/') ||
            hrefLower.includes('/courses/list/') ||
            hrefLower.includes('/programme/')) &&
            href.split('/').filter(p => p).length > 4;
        const textHasDegree = /\b(msc|ma|mba|mres|llm|med|meng|pgcert|pgdip)\b/i.test(text) ||
            textLower.includes('master');
        if (hasNumericId || hasDegreeSlug || isCoursePage || textHasDegree) {
            seen.add(href);
            programs.push({
                name: text,
                url: href
            });
        }
    });
    return programs;
}'''

    js_extract_faculty = r'''() => {
    const faculty = [];
    const seen = new Set();
    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href.toLowerCase();
        const text = a.innerText.trim();
        if (seen.has(href)) return;
        if (text.length < 3 || text.length > 100) return;
        const isStaff = href.includes('/people/') ||
            href.includes('/staff/') ||
            href.includes('/faculty/') ||
            href.includes('/profile/') ||
            href.includes('/academics/') ||
            href.includes('/researcher/');
        if (isStaff) {
            seen.add(href);
            faculty.push({
                name: text.replace(/\s+/g, ' '),
                url: a.href
            });
        }
    });
    return faculty.slice(0, 20);
}'''

    university_name = domain.split('.')[0].title()

    template = f'''"""
Generic university scraper script.
Target: {domain}
Auto-generated - deep-crawls master's programs and faculty information.
"""
import asyncio
import json
import re
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse
from playwright.async_api import async_playwright

MASTERS_PATHS = [
    "/study/masters/courses/list/",
    "/study/masters/courses/",
    "/postgraduate/taught/courses/",
    "/postgraduate/courses/list/",
    "/postgraduate/courses/",
    "/graduate/programs/",
    "/academics/graduate/programs/",
    "/programmes/masters/",
    "/masters/programmes/",
    "/admissions/graduate/programs/",
]

JS_CHECK_COURSES = """{js_check_courses}"""
JS_FIND_LIST_URL = """{js_find_list_url}"""
JS_FIND_COURSES_FROM_HOME = """{js_find_courses_from_home}"""
JS_EXTRACT_PROGRAMS = """{js_extract_programs}"""
JS_EXTRACT_FACULTY = """{js_extract_faculty}"""


async def find_course_list_page(page, base_url, output_callback):
    for path in MASTERS_PATHS:
        test_url = base_url.rstrip('/') + path
        try:
            response = await page.goto(test_url, wait_until="domcontentloaded", timeout=15000)
            if response and response.status == 200:
                title = await page.title()
                if '404' not in title.lower() and 'not found' not in title.lower():
                    has_courses = await page.evaluate(JS_CHECK_COURSES)
                    if has_courses > 5:
                        if output_callback:
                            output_callback("info", f"Found course list: {{path}} ({{has_courses}} courses)")
                        return test_url
                    list_url = await page.evaluate(JS_FIND_LIST_URL)
                    if list_url:
                        if output_callback:
                            output_callback("info", f"Found full course list: {{list_url}}")
                        return list_url
        except Exception:
            continue
    try:
        await page.goto(base_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)
        courses_url = await page.evaluate(JS_FIND_COURSES_FROM_HOME)
        if courses_url:
            return courses_url
    except Exception:
        pass
    return None


async def extract_course_links(page, output_callback):
    return await page.evaluate(JS_EXTRACT_PROGRAMS)


async def scrape(output_callback=None):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = await context.new_page()

        base_url = "https://www.{domain}/"
        result = {{
            "name": "{university_name} University",
            "url": base_url,
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "schools": []
        }}
        all_programs = []

        try:
            if output_callback:
                output_callback("info", "Searching for masters course list...")
            courses_url = await find_course_list_page(page, base_url, output_callback)
            if not courses_url:
                if output_callback:
                    output_callback("warning", "Course list not found, using homepage")
                courses_url = base_url

            if output_callback:
                output_callback("info", "Extracting masters programs...")
            await page.goto(courses_url, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(3000)

            for _ in range(3):
                try:
                    load_more = page.locator('button:has-text("Load more"), button:has-text("Show more"), button:has-text("View more"), a:has-text("Load more")')
                    if await load_more.count() > 0:
                        await load_more.first.click()
                        await page.wait_for_timeout(2000)
                    else:
                        break
                except Exception:
                    break

            programs_data = await extract_course_links(page, output_callback)
            if output_callback:
                output_callback("info", f"Found {{len(programs_data)}} masters programs")

            max_detail_pages = min(len(programs_data), 30)
            for i, prog in enumerate(programs_data[:max_detail_pages]):
                try:
                    if output_callback and i % 10 == 0:
                        output_callback("info", f"Processing {{i+1}}/{{max_detail_pages}}: {{prog['name'][:50]}}")
                    await page.goto(prog['url'], wait_until="domcontentloaded", timeout=15000)
                    await page.wait_for_timeout(800)
                    faculty_data = await page.evaluate(JS_EXTRACT_FACULTY)
                    all_programs.append({{
                        "name": prog['name'],
                        "url": prog['url'],
                        "faculty": faculty_data
                    }})
                except Exception:
                    all_programs.append({{
                        "name": prog['name'],
                        "url": prog['url'],
                        "faculty": []
                    }})

            for prog in programs_data[max_detail_pages:]:
                all_programs.append({{
                    "name": prog['name'],
                    "url": prog['url'],
                    "faculty": []
                }})

            result["schools"] = [{{
                "name": "Masters Programs",
                "url": courses_url,
                "programs": all_programs
            }}]

            if output_callback:
                total_faculty = sum(len(p.get('faculty', [])) for p in all_programs)
                output_callback("info", f"Done! {{len(all_programs)}} programs, {{total_faculty}} faculty")
        except Exception as e:
            if output_callback:
                output_callback("error", f"Scraping error: {{str(e)}}")
        finally:
            await browser.close()

        return result


if __name__ == "__main__":
    result = asyncio.run(scrape())
    print(json.dumps(result, indent=2, ensure_ascii=False))
'''
    return template

def _generate_config_content(name: str, url: str, domain: str) -> dict:
    """Generate the scraper config content."""
    return {
        "university": {
            "name": name,
            "url": url,
            "domain": domain
        },
        "scraper": {
            "headless": True,
            "timeout": 30000,
            "wait_time": 2000
        },
        "paths_to_try": [
            "/programs",
            "/academics/programs",
            "/graduate",
            "/degrees",
            "/admissions/graduate"
        ],
        "selectors": {
            "program_item": "div.program, li.program, article.program, a[href*='/program']",
            "faculty_item": "div.faculty, li.person, .profile-card"
        },
        "generated_at": datetime.utcnow().isoformat()
    }
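
As a closing note, one cheap sanity check (an illustrative sketch, not part of this commit) is to confirm that a generated generic script at least parses as Python, and that the paired config carries the expected domain, before persisting them as a ScraperScript. The Manchester values below are examples based on the configs this commit mentions.

# Hypothetical self-check for a generated scraper and its config
script_src = _build_generic_scraper_template("manchester.ac.uk")
compile(script_src, "<generated_scraper>", "exec")  # raises SyntaxError if the template is malformed

config = _generate_config_content("University of Manchester", "https://www.manchester.ac.uk/", "manchester.ac.uk")
assert config["university"]["domain"] == "manchester.ac.uk"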