Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI - Add backend FastAPI service with API endpoints and database models - Add frontend React app with university management pages - Add configs for Harvard, Manchester, and UCL universities - Add artifacts with various scraper implementations - Add Docker compose configuration for deployment - Update .gitignore to exclude generated files 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
229
artifacts/manchester_improved_scraper.py
Normal file
229
artifacts/manchester_improved_scraper.py
Normal file
@ -0,0 +1,229 @@
|
||||
"""
|
||||
曼彻斯特大学专用爬虫脚本
|
||||
改进版 - 从学院Staff页面提取导师信息
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
|
||||
# 曼彻斯特大学学院Staff页面映射
|
||||
# 项目关键词 -> 学院Staff页面URL
|
||||
SCHOOL_STAFF_MAPPING = {
|
||||
# Alliance Manchester Business School (AMBS)
|
||||
"accounting": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
|
||||
"finance": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
|
||||
"business": "https://www.alliancembs.manchester.ac.uk/about/our-people/",
|
||||
"management": "https://www.alliancembs.manchester.ac.uk/about/our-people/",
|
||||
"marketing": "https://www.alliancembs.manchester.ac.uk/research/management-sciences-and-marketing/",
|
||||
"mba": "https://www.alliancembs.manchester.ac.uk/about/our-people/",
|
||||
|
||||
# 其他学院可以继续添加...
|
||||
# "computer": "...",
|
||||
# "engineering": "...",
|
||||
}
|
||||
|
||||
# 通用学院Staff页面列表(如果没有匹配的关键词)
|
||||
GENERAL_STAFF_PAGES = [
|
||||
"https://www.alliancembs.manchester.ac.uk/about/our-people/",
|
||||
]
|
||||
|
||||
|
||||
async def scrape(output_callback=None):
    """Scrape University of Manchester masters programmes and AMBS staff.

    Launches headless Chromium via Playwright, then:
      1. collects the masters programme list from the courses index page,
      2. scrapes the AMBS Accounting & Finance staff table,
      3. groups programmes into schools by keyword and attaches the
         school-level faculty list.

    Args:
        output_callback: optional callable ``(level: str, msg: str)`` used
            for progress and error reporting; ignored when ``None``.

    Returns:
        dict with keys ``name``, ``url``, ``scraped_at`` (UTC ISO-8601) and
        ``schools``. On a scraping error a partial result is returned — the
        exception is reported through ``output_callback``, never raised.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )
        page = await context.new_page()

        base_url = "https://www.manchester.ac.uk/"

        result = {
            "name": "The University of Manchester",
            "url": base_url,
            # Timezone-aware UTC timestamp of when this run started.
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "schools": []
        }

        try:
            # Step 1: scrape the masters programme list.
            if output_callback:
                output_callback("info", "Step 1: Scraping masters programs list...")

            courses_url = "https://www.manchester.ac.uk/study/masters/courses/list/"
            await page.goto(courses_url, wait_until="domcontentloaded", timeout=30000)
            # Fixed wait to let client-side rendering settle before extraction.
            await page.wait_for_timeout(3000)

            # Extract every masters programme link from the page.
            # In-page JS: keeps only de-duplicated course links that contain a
            # 5-digit course id; navigation/admin links are filtered out.
            programs_data = await page.evaluate('''() => {
                const programs = [];
                const seen = new Set();

                document.querySelectorAll('a[href]').forEach(a => {
                    const href = a.href;
                    const text = a.innerText.trim().replace(/\\s+/g, ' ');

                    if (!href || seen.has(href)) return;
                    if (text.length < 10 || text.length > 200) return;

                    const hrefLower = href.toLowerCase();
                    const textLower = text.toLowerCase();

                    // 排除导航链接
                    if (textLower === 'courses' || textLower === 'masters' ||
                        textLower.includes('admission') || textLower.includes('fees') ||
                        textLower.includes('skip to') || textLower.includes('skip navigation') ||
                        textLower === 'home' || textLower === 'search' ||
                        textLower.includes('contact') || textLower.includes('footer') ||
                        hrefLower.endsWith('/courses/') || hrefLower.endsWith('/masters/') ||
                        hrefLower.includes('#')) {
                        return;
                    }

                    // 检查是否是课程链接 - 必须包含课程ID
                    const hasNumericId = /\\/\\d{5}\\//.test(href); // 5位数字ID
                    const isCoursePage = hrefLower.includes('/courses/list/') &&
                        hasNumericId;

                    if (isCoursePage) {
                        seen.add(href);
                        programs.push({
                            name: text,
                            url: href
                        });
                    }
                });

                return programs;
            }''')

            if output_callback:
                output_callback("info", f"Found {len(programs_data)} masters programs")

            # Step 2: scrape supervisor info from school Staff pages.
            if output_callback:
                output_callback("info", "Step 2: Scraping faculty from school staff pages...")

            all_faculty = {}  # school_url -> faculty list

            # Scrape the AMBS Accounting & Finance staff page.
            staff_url = "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/"
            if output_callback:
                output_callback("info", f"Scraping staff from: {staff_url}")

            await page.goto(staff_url, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(3000)

            # Extract staff members from the page's HTML table.
            # In-page JS: assumes column 1 holds the profile link and column 2
            # the job title — TODO confirm against the live page layout.
            faculty_data = await page.evaluate('''() => {
                const faculty = [];
                const rows = document.querySelectorAll('table tr');

                rows.forEach(row => {
                    const cells = row.querySelectorAll('td');
                    if (cells.length >= 2) {
                        const link = cells[1]?.querySelector('a[href]');
                        const titleCell = cells[2];

                        if (link) {
                            const name = link.innerText.trim();
                            const url = link.href;
                            const title = titleCell ? titleCell.innerText.trim() : '';

                            if (name.length > 2 && !name.toLowerCase().includes('skip')) {
                                faculty.push({
                                    name: name,
                                    url: url,
                                    title: title
                                });
                            }
                        }
                    }
                });

                return faculty;
            }''')

            if output_callback:
                output_callback("info", f"Found {len(faculty_data)} faculty members from AMBS")

            all_faculty["AMBS - Accounting and Finance"] = faculty_data

            # Step 3: assemble the result.
            # Assign programmes to schools by keyword match.
            schools_data = {}

            for prog in programs_data:
                prog_name_lower = prog['name'].lower()

                # Determine the owning school; fall back to a catch-all bucket.
                school_name = "Other Programs"
                matched_faculty = []

                for keyword, staff_url in SCHOOL_STAFF_MAPPING.items():
                    if keyword in prog_name_lower:
                        if "accounting" in keyword or "finance" in keyword:
                            school_name = "Alliance Manchester Business School"
                            matched_faculty = all_faculty.get("AMBS - Accounting and Finance", [])
                        elif "business" in keyword or "management" in keyword or "mba" in keyword:
                            school_name = "Alliance Manchester Business School"
                            matched_faculty = all_faculty.get("AMBS - Accounting and Finance", [])
                        # First matching keyword wins.
                        break

                if school_name not in schools_data:
                    schools_data[school_name] = {
                        "name": school_name,
                        "url": "",
                        "programs": [],
                        "faculty": matched_faculty  # school-level supervisors
                    }

                schools_data[school_name]["programs"].append({
                    "name": prog['name'],
                    "url": prog['url'],
                    "faculty": []  # programme-level faculty not populated yet
                })

            result["schools"] = list(schools_data.values())

            # Summary statistics for the progress log.
            total_programs = sum(len(s['programs']) for s in result['schools'])
            total_faculty = sum(len(s.get('faculty', [])) for s in result['schools'])

            if output_callback:
                output_callback("info", f"Done! {len(result['schools'])} schools, {total_programs} programs, {total_faculty} faculty")

        except Exception as e:
            # Best-effort scrape: report the error, keep any partial result.
            if output_callback:
                output_callback("error", f"Scraping error: {str(e)}")

        finally:
            await browser.close()

        return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
if sys.platform == "win32":
|
||||
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
|
||||
|
||||
def print_callback(level, msg):
|
||||
print(f"[{level}] {msg}")
|
||||
|
||||
result = asyncio.run(scrape(output_callback=print_callback))
|
||||
|
||||
# 保存结果
|
||||
with open("output/manchester_improved_result.json", "w", encoding="utf-8") as f:
|
||||
json.dump(result, f, indent=2, ensure_ascii=False)
|
||||
|
||||
print(f"\nResult saved to output/manchester_improved_result.json")
|
||||
print(f"Schools: {len(result['schools'])}")
|
||||
for school in result['schools']:
|
||||
print(f" - {school['name']}: {len(school['programs'])} programs, {len(school.get('faculty', []))} faculty")
|
||||
Reference in New Issue
Block a user