Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
backend/app/services/script_generator.py (new file, 558 lines)
@@ -0,0 +1,558 @@
"""
Scraper script generation service.

Analyzes a university website's structure and automatically generates a
scraper script for it.
"""

from datetime import datetime, timezone
from urllib.parse import urlparse

from ..database import SessionLocal
from ..models import University, ScraperScript


# Preset scraper script templates for known universities
SCRAPER_TEMPLATES = {
    "harvard.edu": "harvard_scraper",
    "mit.edu": "generic_scraper",
    "stanford.edu": "generic_scraper",
}
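# NOTE: matching in generate_scraper_script below is by substring, so
# subdomains match too: "gsas.harvard.edu" contains "harvard.edu" and
# therefore selects "harvard_scraper".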


def generate_scraper_script(university_id: int, university_url: str):
    """
    Background task that generates a scraper script.

    1. Parse the university's domain from its URL
    2. Use a preset template if one matches the domain
    3. Otherwise generate a generic scraper script
    """
    db = SessionLocal()
    university = None

    try:
        university = db.query(University).filter(University.id == university_id).first()
        if not university:
            return

        # Parse the URL to get the domain
        parsed = urlparse(university_url)
        domain = parsed.netloc.replace("www.", "")

        # Check for a preset template
        template_name = None
        for pattern, template in SCRAPER_TEMPLATES.items():
            if pattern in domain:
                template_name = template
                break

        # Generate the script and its config
        script_content = _generate_script_content(domain, template_name)
        config_content = _generate_config_content(university.name, university_url, domain)

        # Compute the version number
        existing_count = db.query(ScraperScript).filter(
            ScraperScript.university_id == university_id
        ).count()

        # Save the script
        script = ScraperScript(
            university_id=university_id,
            script_name=f"{domain.replace('.', '_')}_scraper",
            script_content=script_content,
            config_content=config_content,
            version=existing_count + 1,
            status="active"
        )

        db.add(script)

        # Update the university's status
        university.status = "ready"

        db.commit()

    except Exception:
        # Mark the university as failed; roll back first so the session is
        # usable again after the failed flush
        db.rollback()
        if university:
            university.status = "error"
            db.commit()
        raise

    finally:
        db.close()
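
# Usage sketch (assumed caller, not part of this module): a FastAPI route can
# queue this function with BackgroundTasks so the request returns immediately
# while the script is generated asynchronously. The route path and response
# shape below are hypothetical.
#
#     from fastapi import APIRouter, BackgroundTasks
#
#     router = APIRouter()
#
#     @router.post("/universities/{university_id}/scripts")
#     def create_script(university_id: int, url: str, background_tasks: BackgroundTasks):
#         background_tasks.add_task(generate_scraper_script, university_id, url)
#         return {"status": "generating"}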


def _generate_script_content(domain: str, template_name: str | None = None) -> str:
    """Generate the Python scraper script content."""

    if template_name == "harvard_scraper":
        return '''"""
Harvard University dedicated scraper script
Auto-generated
"""

import asyncio
import json
from datetime import datetime, timezone
from playwright.async_api import async_playwright

# School URL mapping
SCHOOL_MAPPING = {
    "gsas.harvard.edu": "Graduate School of Arts and Sciences (GSAS)",
    "seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
    "hbs.edu": "Harvard Business School (HBS)",
    "gsd.harvard.edu": "Graduate School of Design (GSD)",
    "gse.harvard.edu": "Graduate School of Education (HGSE)",
    "hks.harvard.edu": "Harvard Kennedy School (HKS)",
    "hls.harvard.edu": "Harvard Law School (HLS)",
    "hms.harvard.edu": "Harvard Medical School (HMS)",
    "hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)",
    "hds.harvard.edu": "Harvard Divinity School (HDS)",
    "fas.harvard.edu": "Faculty of Arts and Sciences (FAS)",
}
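
# NOTE: SCHOOL_MAPPING is defined for attributing programs to their schools
# but is not yet referenced by scrape() below.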


async def scrape(output_callback=None):
    """Run the scrape."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        result = {
            "name": "Harvard University",
            "url": "https://www.harvard.edu/",
            "country": "USA",
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "schools": []
        }

        # Visit the program listing page
        if output_callback:
            output_callback("info", "Visiting the Harvard program listing...")

        await page.goto("https://www.harvard.edu/programs/?degree_levels=graduate")
        await page.wait_for_timeout(3000)

        # Extract program data
        programs = await page.evaluate("""() => {
            const items = document.querySelectorAll('[class*="records__record"]');
            const programs = [];
            items.forEach(item => {
                const btn = item.querySelector('button[class*="title-link"]');
                if (btn) {
                    programs.push({
                        name: btn.innerText.trim(),
                        url: ''
                    });
                }
            });
            return programs;
        }""")

        if output_callback:
            output_callback("info", f"Found {len(programs)} programs")

        # Simplified output: one pseudo-school holding the first 50 programs
        result["schools"] = [{
            "name": "Graduate Programs",
            "url": "https://www.harvard.edu/programs/",
            "programs": [{"name": p["name"], "url": p["url"], "faculty": []} for p in programs[:50]]
        }]

        await browser.close()

    return result


if __name__ == "__main__":
    result = asyncio.run(scrape())
    print(json.dumps(result, indent=2, ensure_ascii=False))
'''

    # Generic scraper template: deep-crawls masters programs. The JS blocks
    # live in raw strings and are interpolated into an f-string, which avoids
    # quote and brace conflicts between Python and JavaScript.
    return _build_generic_scraper_template(domain)


def _build_generic_scraper_template(domain: str) -> str:
    """Build the generic scraper template."""

    # JavaScript code blocks (raw strings to avoid escaping issues)
    js_check_courses = r'''() => {
        const links = document.querySelectorAll('a[href]');
        let courseCount = 0;
        for (const a of links) {
            const href = a.href.toLowerCase();
            if (/\/\d{4,}\//.test(href) ||
                /\/(msc|ma|mba|mres|llm|med|meng)-/.test(href) ||
                /\/course\/[a-z]/.test(href)) {
                courseCount++;
            }
        }
        return courseCount;
    }'''
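
    # The checks above count a link as a course page when its URL contains a
    # long numeric id segment (e.g. "/09612/"), a degree-prefixed slug
    # (e.g. "/msc-data-science/"), or a "/course/<name>" path.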

    js_find_list_url = r'''() => {
        const links = document.querySelectorAll('a[href]');
        for (const a of links) {
            const text = a.innerText.toLowerCase();
            const href = a.href.toLowerCase();
            if ((text.includes('a-z') || text.includes('all course') ||
                 text.includes('full list') || text.includes('browse all') ||
                 href.includes('/list')) &&
                (href.includes('master') || href.includes('course') || href.includes('postgrad'))) {
                return a.href;
            }
        }
        return null;
    }'''

    js_find_courses_from_home = r'''() => {
        const links = document.querySelectorAll('a[href]');
        for (const a of links) {
            const href = a.href.toLowerCase();
            const text = a.innerText.toLowerCase();
            if ((href.includes('master') || href.includes('postgraduate') || href.includes('graduate')) &&
                (href.includes('course') || href.includes('program') || href.includes('degree'))) {
                return a.href;
            }
        }
        return null;
    }'''

    js_extract_programs = r'''() => {
        const programs = [];
        const seen = new Set();
        const currentHost = window.location.hostname;

        document.querySelectorAll('a[href]').forEach(a => {
            const href = a.href;
            const text = a.innerText.trim().replace(/\s+/g, ' ');

            if (!href || seen.has(href)) return;
            if (text.length < 5 || text.length > 200) return;
            if (href.includes('#') || href.includes('javascript:') || href.includes('mailto:')) return;

            try {
                const linkHost = new URL(href).hostname;
                if (!linkHost.includes(currentHost.replace('www.', '')) &&
                    !currentHost.includes(linkHost.replace('www.', ''))) return;
            } catch {
                return;
            }

            const hrefLower = href.toLowerCase();
            const textLower = text.toLowerCase();

            const isNavigation = textLower === 'courses' ||
                textLower === 'programmes' ||
                textLower === 'undergraduate' ||
                textLower === 'postgraduate' ||
                textLower === 'masters' ||
                textLower === "master's" ||
                textLower.includes('skip to') ||
                textLower.includes('share') ||
                textLower === 'home' ||
                textLower === 'study' ||
                textLower.startsWith('a-z') ||
                textLower.includes('admission') ||
                textLower.includes('fees and funding') ||
                textLower.includes('why should') ||
                textLower.includes('why manchester') ||
                textLower.includes('teaching and learning') ||
                textLower.includes('meet us') ||
                textLower.includes('student support') ||
                textLower.includes('contact us') ||
                textLower.includes('how to apply') ||
                hrefLower.includes('/admissions/') ||
                hrefLower.includes('/fees-and-funding/') ||
                hrefLower.includes('/why-') ||
                hrefLower.includes('/meet-us/') ||
                hrefLower.includes('/contact-us/') ||
                hrefLower.includes('/student-support/') ||
                hrefLower.includes('/teaching-and-learning/') ||
                hrefLower.endsWith('/courses/') ||
                hrefLower.endsWith('/masters/') ||
                hrefLower.endsWith('/postgraduate/');

            if (isNavigation) return;

            const isExcluded = hrefLower.includes('/undergraduate') ||
                hrefLower.includes('/bachelor') ||
                hrefLower.includes('/phd/') ||
                hrefLower.includes('/doctoral') ||
                hrefLower.includes('/research-degree') ||
                textLower.includes('bachelor') ||
                textLower.includes('undergraduate') ||
                (textLower.includes('phd') && !textLower.includes('mphil'));

            if (isExcluded) return;

            const hasNumericId = /\/\d{4,}\//.test(href);
            const hasDegreeSlug = /\/(msc|ma|mba|mres|llm|med|meng|mpa|mph|mphil)-[a-z]/.test(hrefLower);
            const isCoursePage = (hrefLower.includes('/course/') ||
                                  hrefLower.includes('/courses/list/') ||
                                  hrefLower.includes('/programme/')) &&
                                 href.split('/').filter(p => p).length > 4;
            const textHasDegree = /\b(msc|ma|mba|mres|llm|med|meng|pgcert|pgdip)\b/i.test(text) ||
                                  textLower.includes('master');

            if (hasNumericId || hasDegreeSlug || isCoursePage || textHasDegree) {
                seen.add(href);
                programs.push({
                    name: text,
                    url: href
                });
            }
        });

        return programs;
    }'''
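
    # Taken together, js_extract_programs keeps same-site links whose URL or
    # link text signals a taught masters course, while filtering out site
    # navigation and undergraduate/PhD pages; the hard-coded 'why manchester'
    # entry suggests the filter list was tuned against manchester.ac.uk.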

    js_extract_faculty = r'''() => {
        const faculty = [];
        const seen = new Set();

        document.querySelectorAll('a[href]').forEach(a => {
            const href = a.href.toLowerCase();
            const text = a.innerText.trim();

            if (seen.has(href)) return;
            if (text.length < 3 || text.length > 100) return;

            const isStaff = href.includes('/people/') ||
                href.includes('/staff/') ||
                href.includes('/faculty/') ||
                href.includes('/profile/') ||
                href.includes('/academics/') ||
                href.includes('/researcher/');

            if (isStaff) {
                seen.add(href);
                faculty.push({
                    name: text.replace(/\s+/g, ' '),
                    url: a.href
                });
            }
        });

        return faculty.slice(0, 20);
    }'''

    university_name = domain.split('.')[0].title()

    template = f'''"""
Generic university scraper script
Target: {domain}
Auto-generated; deep-crawls masters programs and faculty information
"""

import asyncio
import json
import re
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse
from playwright.async_api import async_playwright


MASTERS_PATHS = [
    "/study/masters/courses/list/",
    "/study/masters/courses/",
    "/postgraduate/taught/courses/",
    "/postgraduate/courses/list/",
    "/postgraduate/courses/",
    "/graduate/programs/",
    "/academics/graduate/programs/",
    "/programmes/masters/",
    "/masters/programmes/",
    "/admissions/graduate/programs/",
]

JS_CHECK_COURSES = """{js_check_courses}"""

JS_FIND_LIST_URL = """{js_find_list_url}"""

JS_FIND_COURSES_FROM_HOME = """{js_find_courses_from_home}"""

JS_EXTRACT_PROGRAMS = """{js_extract_programs}"""

JS_EXTRACT_FACULTY = """{js_extract_faculty}"""


async def find_course_list_page(page, base_url, output_callback):
    for path in MASTERS_PATHS:
        test_url = base_url.rstrip('/') + path
        try:
            response = await page.goto(test_url, wait_until="domcontentloaded", timeout=15000)
            if response and response.status == 200:
                title = await page.title()
                if '404' not in title.lower() and 'not found' not in title.lower():
                    has_courses = await page.evaluate(JS_CHECK_COURSES)
                    if has_courses > 5:
                        if output_callback:
                            output_callback("info", f"Found course list: {{path}} ({{has_courses}} courses)")
                        return test_url

                    list_url = await page.evaluate(JS_FIND_LIST_URL)
                    if list_url:
                        if output_callback:
                            output_callback("info", f"Found full course list: {{list_url}}")
                        return list_url
        except Exception:
            continue

    try:
        await page.goto(base_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)
        courses_url = await page.evaluate(JS_FIND_COURSES_FROM_HOME)
        if courses_url:
            return courses_url
    except Exception:
        pass

    return None


async def extract_course_links(page, output_callback):
    return await page.evaluate(JS_EXTRACT_PROGRAMS)


async def scrape(output_callback=None):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = await context.new_page()

        base_url = "https://www.{domain}/"

        result = {{
            "name": "{university_name} University",
            "url": base_url,
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "schools": []
        }}

        all_programs = []

        try:
            if output_callback:
                output_callback("info", "Searching for masters course list...")

            courses_url = await find_course_list_page(page, base_url, output_callback)

            if not courses_url:
                if output_callback:
                    output_callback("warning", "Course list not found, using homepage")
                courses_url = base_url

            if output_callback:
                output_callback("info", "Extracting masters programs...")

            await page.goto(courses_url, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(3000)

            # Click through up to three "Load more" style buttons
            for _ in range(3):
                try:
                    load_more = page.locator('button:has-text("Load more"), button:has-text("Show more"), button:has-text("View more"), a:has-text("Load more")')
                    if await load_more.count() > 0:
                        await load_more.first.click()
                        await page.wait_for_timeout(2000)
                    else:
                        break
                except Exception:
                    break

            programs_data = await extract_course_links(page, output_callback)

            if output_callback:
                output_callback("info", f"Found {{len(programs_data)}} masters programs")

            # Visit at most 30 detail pages to collect faculty links
            max_detail_pages = min(len(programs_data), 30)

            for i, prog in enumerate(programs_data[:max_detail_pages]):
                try:
                    if output_callback and i % 10 == 0:
                        output_callback("info", f"Processing {{i+1}}/{{max_detail_pages}}: {{prog['name'][:50]}}")

                    await page.goto(prog['url'], wait_until="domcontentloaded", timeout=15000)
                    await page.wait_for_timeout(800)

                    faculty_data = await page.evaluate(JS_EXTRACT_FACULTY)

                    all_programs.append({{
                        "name": prog['name'],
                        "url": prog['url'],
                        "faculty": faculty_data
                    }})

                except Exception:
                    all_programs.append({{
                        "name": prog['name'],
                        "url": prog['url'],
                        "faculty": []
                    }})

            # Remaining programs are recorded without faculty details
            for prog in programs_data[max_detail_pages:]:
                all_programs.append({{
                    "name": prog['name'],
                    "url": prog['url'],
                    "faculty": []
                }})

            result["schools"] = [{{
                "name": "Masters Programs",
                "url": courses_url,
                "programs": all_programs
            }}]

            if output_callback:
                total_faculty = sum(len(p.get('faculty', [])) for p in all_programs)
                output_callback("info", f"Done! {{len(all_programs)}} programs, {{total_faculty}} faculty")

        except Exception as e:
            if output_callback:
                output_callback("error", f"Scraping error: {{str(e)}}")

        finally:
            await browser.close()

    return result


if __name__ == "__main__":
    result = asyncio.run(scrape())
    print(json.dumps(result, indent=2, ensure_ascii=False))
'''
    return template
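
# One possible sanity check before persisting a generated script (an assumed
# extra step, not wired in anywhere in this module): compile() raises
# SyntaxError if the f-string interpolation ever produced invalid Python.
#
#     def _validate_script(script_content: str) -> None:
#         compile(script_content, "<generated_scraper>", "exec")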


def _generate_config_content(name: str, url: str, domain: str) -> dict:
    """Generate the config content."""
    return {
        "university": {
            "name": name,
            "url": url,
            "domain": domain
        },
        "scraper": {
            "headless": True,
            "timeout": 30000,
            "wait_time": 2000
        },
        "paths_to_try": [
            "/programs",
            "/academics/programs",
            "/graduate",
            "/degrees",
            "/admissions/graduate"
        ],
        "selectors": {
            "program_item": "div.program, li.program, article.program, a[href*='/program']",
            "faculty_item": "div.faculty, li.person, .profile-card"
        },
        "generated_at": datetime.now(timezone.utc).isoformat()
    }
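
# For reference, a hypothetical call such as
# _generate_config_content("Harvard University", "https://www.harvard.edu/", "harvard.edu")
# returns those three values echoed under "university", fixed scraper defaults
# (headless, 30s timeout, 2s wait), the candidate paths and CSS selectors
# above, and a UTC ISO-8601 "generated_at" timestamp.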