"""
曼彻斯特大学完整采集脚本
新增特性:
- Research Explorer API 优先拉取 JSON / XML失败再回落 DOM
- 每个学院独立页面、并行抓取(默认 3 并发)
- 细粒度超时/重试/滚动/Load more 控制
- 多 URL / 备用 Staff 页面配置
- 导师目录缓存,可按学院关键词映射到项目
- 诊断信息记录(失败学院、超时学院、批次信息)
"""
import asyncio
import json
import re
from copy import deepcopy
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlencode, urljoin
from xml.etree import ElementTree as ET
from playwright.async_api import (
TimeoutError as PlaywrightTimeoutError,
async_playwright,
)
# =========================
# Configuration
# =========================
DEFAULT_REQUEST = {
"timeout_ms": 60000,
"post_wait_ms": 2500,
"wait_until": "domcontentloaded",
"max_retries": 3,
"retry_backoff_ms": 2000,
}
STAFF_CONCURRENCY = 3
SCHOOL_CONFIG: List[Dict[str, Any]] = [
{
"name": "Alliance Manchester Business School",
"keywords": [
"accounting",
"finance",
"business",
"management",
"marketing",
"mba",
"economics",
"entrepreneurship",
],
"attach_faculty_to_programs": True,
"staff_pages": [
{
"url": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
"extract_method": "table",
"request": {"timeout_ms": 60000, "wait_until": "networkidle"},
}
],
},
{
"name": "Department of Computer Science",
"keywords": [
"computer",
"software",
"data science",
"artificial intelligence",
"ai ",
"machine learning",
"cyber",
"computing",
],
"attach_faculty_to_programs": True,
"staff_pages": [
{
"url": "https://www.cs.manchester.ac.uk/about/people/academic-and-research-staff/",
"extract_method": "links",
"requires_scroll": True,
},
{
"url": "https://www.cs.manchester.ac.uk/about/people/",
"extract_method": "links",
"load_more_selector": "button.load-more",
"max_load_more": 6,
},
],
},
{
"name": "Department of Physics and Astronomy",
"keywords": [
"physics",
"astronomy",
"astrophysics",
"nuclear",
"particle",
],
"attach_faculty_to_programs": True,
"staff_pages": [
{
"url": "https://www.physics.manchester.ac.uk/about/people/academic-and-research-staff/",
"extract_method": "links",
"requires_scroll": True,
}
],
},
{
"name": "Department of Electrical and Electronic Engineering",
"keywords": [
"electrical",
"electronic",
"eee",
"power systems",
"microelectronics",
],
"attach_faculty_to_programs": True,
"staff_pages": [
{
"url": "https://www.eee.manchester.ac.uk/about/people/academic-and-research-staff/",
"extract_method": "links",
"requires_scroll": True,
}
],
},
{
"name": "Department of Chemistry",
"keywords": ["chemistry", "chemical"],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 200},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/department-of-chemistry/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
"request": {
"timeout_ms": 120000,
"wait_until": "networkidle",
"post_wait_ms": 5000,
},
}
],
},
{
"name": "Department of Mathematics",
"keywords": [
"mathematics",
"mathematical",
"applied math",
"statistics",
"actuarial",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 200},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/department-of-mathematics/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "School of Engineering",
"keywords": [
"engineering",
"mechanical",
"aerospace",
"civil",
"structural",
"materials",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 400},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/school-of-engineering/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "Faculty of Biology, Medicine and Health",
"keywords": [
"medicine",
"medical",
"health",
"nursing",
"pharmacy",
"clinical",
"dental",
"optometry",
"biology",
"biomedical",
"anatomical",
"physiotherapy",
"midwifery",
"mental health",
"psychology",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 400},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/faculty-of-biology-medicine-and-health/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "School of Social Sciences",
"keywords": [
"sociology",
"politics",
"international",
"social",
"criminology",
"anthropology",
"philosophy",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 200},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/school-of-social-sciences/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "School of Law",
"keywords": ["law", "legal", "llm"],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 200},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/school-of-law/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "School of Arts, Languages and Cultures",
"keywords": [
"arts",
"languages",
"culture",
"music",
"drama",
"theatre",
"history",
"linguistics",
"literature",
"translation",
"classics",
"archaeology",
"religion",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 400},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/school-of-arts-languages-and-cultures/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "School of Environment, Education and Development",
"keywords": [
"environment",
"education",
"development",
"planning",
"architecture",
"urban",
"geography",
"sustainability",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 300},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/school-of-environment-education-and-development/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
]
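# Illustrative template for adding another school (hypothetical values, not
# part of the original configuration; only fields used by the code below):
#
# {
#     "name": "Example School",
#     "keywords": ["example keyword"],
#     "attach_faculty_to_programs": True,
#     "extract_method": "research_explorer",      # optional school-wide default
#     "research_explorer": {"page_size": 200},    # only used by the API path
#     "staff_pages": [
#         {"url": "https://example.manchester.ac.uk/people/",
#          "extract_method": "links", "requires_scroll": True},
#     ],
# }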
SCHOOL_LOOKUP = {cfg["name"]: cfg for cfg in SCHOOL_CONFIG}
# =========================
# JS extraction functions
# =========================
JS_EXTRACT_TABLE_STAFF = """() => {
const staff = [];
const seen = new Set();
document.querySelectorAll('table tr').forEach(row => {
const cells = row.querySelectorAll('td');
if (cells.length >= 2) {
const link = cells[1]?.querySelector('a[href]') || cells[0]?.querySelector('a[href]');
const titleCell = cells[2] || cells[1];
if (link) {
const name = link.innerText.trim();
const url = link.href;
const title = titleCell ? titleCell.innerText.trim() : '';
if (name.length > 2 && !name.toLowerCase().includes('skip') && !seen.has(url)) {
seen.add(url);
staff.push({
name,
url,
title
});
}
}
}
});
return staff;
}"""
JS_EXTRACT_LINK_STAFF = """() => {
const staff = [];
const seen = new Set();
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href;
const text = a.innerText.trim();
if (seen.has(href)) return;
if (text.length < 5 || text.length > 80) return;
const lowerText = text.toLowerCase();
if (lowerText.includes('skip') ||
lowerText.includes('staff') ||
lowerText.includes('people') ||
lowerText.includes('academic') ||
lowerText.includes('research profiles')) return;
if (href.includes('/persons/') ||
href.includes('/portal/en/researchers/') ||
href.includes('/profile/') ||
href.includes('/people/')) {
seen.add(href);
staff.push({
name: text,
url: href,
title: ''
});
}
});
return staff;
}"""
JS_EXTRACT_RESEARCH_EXPLORER = """() => {
const staff = [];
const seen = new Set();
document.querySelectorAll('a.link.person').forEach(a => {
const href = a.href;
const text = a.innerText.trim();
if (!seen.has(href) && text.length > 3 && text.length < 80) {
seen.add(href);
staff.push({
name: text,
url: href,
title: ''
});
}
});
if (staff.length === 0) {
document.querySelectorAll('a[href*="/persons/"]').forEach(a => {
const href = a.href;
const text = a.innerText.trim();
const lower = text.toLowerCase();
if (seen.has(href)) return;
if (text.length < 3 || text.length > 80) return;
if (lower.includes('person') || lower.includes('next') || lower.includes('previous')) return;
seen.add(href);
staff.push({
name: text,
url: href,
title: ''
});
});
}
return staff;
}"""
JS_EXTRACT_PROGRAMS = """() => {
const programs = [];
const seen = new Set();
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href;
const text = a.innerText.trim().replace(/\\s+/g, ' ');
if (!href || seen.has(href)) return;
if (text.length < 10 || text.length > 200) return;
const hrefLower = href.toLowerCase();
const textLower = text.toLowerCase();
const isNav = textLower === 'courses' ||
textLower === 'masters' ||
textLower.includes('admission') ||
textLower.includes('fees') ||
textLower.includes('skip to') ||
textLower.includes('search') ||
textLower.includes('contact') ||
hrefLower.includes('#');
if (isNav) return;
const hasNumericId = /\\/\\d{5}\\//.test(href);
const isCoursePage = hrefLower.includes('/courses/list/') && hasNumericId;
if (isCoursePage) {
seen.add(href);
programs.push({
name: text,
url: href
});
}
});
return programs;
}"""
# =========================
# Data matching
# =========================
def match_program_to_school(program_name: str) -> str:
lower = program_name.lower()
for school in SCHOOL_CONFIG:
for keyword in school["keywords"]:
if keyword in lower:
return school["name"]
return "Other Programs"
# =========================
# Request and parsing helpers
# =========================
def _merge_request_settings(*layers: Optional[Dict[str, Any]]) -> Dict[str, Any]:
settings = dict(DEFAULT_REQUEST)
for layer in layers:
if not layer:
continue
for key, value in layer.items():
if value is not None:
settings[key] = value
settings["max_retries"] = max(1, int(settings.get("max_retries", 1)))
settings["retry_backoff_ms"] = settings.get("retry_backoff_ms", 2000)
return settings
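# Illustrative example (values taken from the Chemistry staff page config above):
# _merge_request_settings(None, {"timeout_ms": 120000, "wait_until": "networkidle",
# "post_wait_ms": 5000}) starts from DEFAULT_REQUEST and overrides those three
# keys, while "max_retries" (3) and "retry_backoff_ms" (2000 ms) keep their defaults.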
async def _goto_with_retry(page, url: str, settings: Dict[str, Any], label: str) -> Tuple[bool, Optional[str]]:
last_error = None
for attempt in range(settings["max_retries"]):
try:
await page.goto(url, wait_until=settings["wait_until"], timeout=settings["timeout_ms"])
if settings.get("wait_for_selector"):
await page.wait_for_selector(settings["wait_for_selector"], timeout=settings["timeout_ms"])
if settings.get("post_wait_ms"):
await page.wait_for_timeout(settings["post_wait_ms"])
return True, None
except PlaywrightTimeoutError as exc:
last_error = f"Timeout: {exc}"
except Exception as exc: # noqa: BLE001
last_error = str(exc)
if attempt < settings["max_retries"] - 1:
await page.wait_for_timeout(settings["retry_backoff_ms"] * (attempt + 1))
return False, last_error
async def _perform_scroll(page, repetitions: int = 5, delay_ms: int = 800):
repetitions = max(1, repetitions)
for i in range(repetitions):
await page.evaluate("(y) => window.scrollTo(0, y)", 2000 * (i + 1))
await page.wait_for_timeout(delay_ms)
async def _load_more(page, selector: str, max_clicks: int = 5, wait_ms: int = 1500):
for _ in range(max_clicks):
button = await page.query_selector(selector)
if not button:
break
try:
await button.click()
await page.wait_for_timeout(wait_ms)
except Exception:
break
def _deduplicate_staff(staff: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
seen = set()
cleaned = []
for item in staff:
name = (item.get("name") or "").strip()
if not name:
continue
url = (item.get("url") or "").strip()
key = url or name.lower()
if key in seen:
continue
seen.add(key)
cleaned.append({"name": name, "url": url, "title": (item.get("title") or "").strip()})
return cleaned
def _append_query(url: str, params: Dict[str, Any]) -> str:
delimiter = "&" if "?" in url else "?"
return f"{url}{delimiter}{urlencode(params)}"
def _guess_research_slug(staff_url: Optional[str]) -> Optional[str]:
    if not staff_url:
        return None
    parts = [part for part in staff_url.rstrip("/").split("/") if part]
    # Staff URLs look like ".../organisations/<org-slug>/persons/"; the
    # organisation slug is the segment before the trailing "persons".
    if parts and parts[-1].lower() == "persons":
        parts = parts[:-1]
    return parts[-1] if parts else None
def _parse_research_explorer_json(data: Any, base_url: str) -> List[Dict[str, str]]:
items: List[Dict[str, Any]] = []
if isinstance(data, list):
items = data
elif isinstance(data, dict):
for key in ("results", "items", "persons", "data", "entities"):
if isinstance(data.get(key), list):
items = data[key]
break
if not items and isinstance(data.get("rows"), list):
items = data["rows"]
staff = []
for item in items:
if not isinstance(item, dict):
continue
name = item.get("name") or item.get("title") or item.get("fullName")
profile_url = item.get("url") or item.get("href") or item.get("link") or item.get("primaryURL")
if not name:
continue
if profile_url:
profile_url = urljoin(base_url, profile_url)
staff.append(
{
"name": name.strip(),
"url": (profile_url or "").strip(),
"title": (item.get("jobTitle") or item.get("position") or "").strip(),
}
)
return staff
def _parse_research_explorer_xml(text: str, base_url: str) -> List[Dict[str, str]]:
staff: List[Dict[str, str]] = []
try:
root = ET.fromstring(text)
except ET.ParseError:
return staff
for entry in root.findall(".//{http://www.w3.org/2005/Atom}entry"):
title = entry.findtext("{http://www.w3.org/2005/Atom}title", default="")
link = entry.find("{http://www.w3.org/2005/Atom}link")
href = link.attrib.get("href") if link is not None else ""
if title:
staff.append(
{
"name": title.strip(),
"url": urljoin(base_url, href) if href else "",
"title": "",
}
)
return staff
async def fetch_research_explorer_api(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]:
config = school_config.get("research_explorer") or {}
if not config and school_config.get("extract_method") != "research_explorer":
return []
base_staff_url = ""
if school_config.get("staff_pages"):
base_staff_url = school_config["staff_pages"][0].get("url", "")
page_size = config.get("page_size", 200)
timeout_ms = config.get("timeout_ms", 70000)
candidates: List[str] = []
slug = config.get("org_slug") or _guess_research_slug(base_staff_url)
base_api = config.get("api_base", "https://research.manchester.ac.uk/ws/portalapi.aspx")
if config.get("api_url"):
candidates.append(config["api_url"])
if slug:
params = {
"action": "search",
"language": "en",
"format": "json",
"site": "default",
"showall": "true",
"pageSize": page_size,
"organisations": slug,
}
candidates.append(f"{base_api}?{urlencode(params)}")
if base_staff_url:
candidates.append(_append_query(base_staff_url, {"format": "json", "limit": page_size}))
candidates.append(_append_query(base_staff_url, {"format": "xml", "limit": page_size}))
for url in candidates:
try:
resp = await context.request.get(url, timeout=timeout_ms)
if resp.status != 200:
continue
ctype = resp.headers.get("content-type", "")
if "json" in ctype:
data = await resp.json()
parsed = _parse_research_explorer_json(data, base_staff_url)
else:
text = await resp.text()
parsed = _parse_research_explorer_xml(text, base_staff_url)
parsed = _deduplicate_staff(parsed)
if parsed:
if output_callback:
output_callback("info", f" {school_config['name']}: {len(parsed)} staff via API")
return parsed
except Exception as exc: # noqa: BLE001
if output_callback:
output_callback(
"warning", f" {school_config['name']}: API fetch failed ({str(exc)[:60]})"
)
return []
async def scrape_staff_via_browser(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]:
staff_collected: List[Dict[str, str]] = []
staff_pages = school_config.get("staff_pages") or []
if not staff_pages and school_config.get("staff_url"):
staff_pages = [{"url": school_config["staff_url"], "extract_method": school_config.get("extract_method")}]
page = await context.new_page()
blocked_types = school_config.get("blocked_resources", ["image", "font", "media"])
if blocked_types:
async def _route_handler(route):
if route.request.resource_type in blocked_types:
await route.abort()
else:
await route.continue_()
await page.route("**/*", _route_handler)
for page_cfg in staff_pages:
target_url = page_cfg.get("url")
if not target_url:
continue
settings = _merge_request_settings(school_config.get("request"), page_cfg.get("request"))
success, error = await _goto_with_retry(page, target_url, settings, school_config["name"])
if not success:
if output_callback:
output_callback("warning", f" {school_config['name']}: failed to load {target_url} ({error})")
continue
if page_cfg.get("requires_scroll"):
await _perform_scroll(page, page_cfg.get("scroll_times", 6), page_cfg.get("scroll_delay_ms", 700))
if page_cfg.get("load_from_selector"):
await _load_more(page, page_cfg["load_from_selector"], page_cfg.get("max_load_more", 5))
elif page_cfg.get("load_more_selector"):
await _load_more(page, page_cfg["load_more_selector"], page_cfg.get("max_load_more", 5))
method = page_cfg.get("extract_method") or school_config.get("extract_method") or "links"
if method == "table":
extracted = await page.evaluate(JS_EXTRACT_TABLE_STAFF)
elif method == "research_explorer":
extracted = await page.evaluate(JS_EXTRACT_RESEARCH_EXPLORER)
else:
extracted = await page.evaluate(JS_EXTRACT_LINK_STAFF)
staff_collected.extend(extracted)
await page.close()
return _deduplicate_staff(staff_collected)
# =========================
# Scrape school staff concurrently
# =========================
async def scrape_school_staff(context, school_config: Dict[str, Any], semaphore, output_callback):
async with semaphore:
staff_list: List[Dict[str, str]] = []
status = "success"
error: Optional[str] = None
try:
if school_config.get("extract_method") == "research_explorer":
staff_list = await fetch_research_explorer_api(context, school_config, output_callback)
if not staff_list:
staff_list = await scrape_staff_via_browser(context, school_config, output_callback)
if output_callback:
output_callback("info", f" {school_config['name']}: total {len(staff_list)} staff")
except Exception as exc: # noqa: BLE001
status = "error"
error = str(exc)
if output_callback:
output_callback("error", f" {school_config['name']}: {error}")
return {
"name": school_config["name"],
"staff": staff_list,
"status": status,
"error": error,
}
async def scrape_all_school_staff(context, output_callback):
semaphore = asyncio.Semaphore(STAFF_CONCURRENCY)
tasks = [
asyncio.create_task(scrape_school_staff(context, cfg, semaphore, output_callback))
for cfg in SCHOOL_CONFIG
]
results = await asyncio.gather(*tasks)
staff_map = {}
diagnostics = {"failed": [], "success": [], "total": len(results)}
for res in results:
if res["staff"]:
staff_map[res["name"]] = res["staff"]
diagnostics["success"].append(res["name"])
else:
diagnostics["failed"].append(
{
"name": res["name"],
"status": res["status"],
"error": res.get("error"),
}
)
return staff_map, diagnostics
# =========================
# Main workflow
# =========================
async def scrape(output_callback=None):
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
)
base_url = "https://www.manchester.ac.uk/"
result = {
"name": "The University of Manchester",
"url": base_url,
"scraped_at": datetime.now(timezone.utc).isoformat(),
"schools": [],
"diagnostics": {},
}
try:
            # Step 1: masters programme list
if output_callback:
output_callback("info", "Step 1: Scraping masters programs list...")
page = await context.new_page()
courses_url = "https://www.manchester.ac.uk/study/masters/courses/list/"
await page.goto(courses_url, wait_until="domcontentloaded", timeout=40000)
await page.wait_for_timeout(3000)
programs_data = await page.evaluate(JS_EXTRACT_PROGRAMS)
await page.close()
if output_callback:
output_callback("info", f"Found {len(programs_data)} masters programs")
            # Step 2: scrape school staff in parallel
if output_callback:
output_callback("info", "Step 2: Scraping faculty from staff pages (parallel)...")
school_staff, diagnostics = await scrape_all_school_staff(context, output_callback)
            # Step 3: organise the data
schools_dict: Dict[str, Dict[str, Any]] = {}
for prog in programs_data:
school_name = match_program_to_school(prog["name"])
if school_name not in schools_dict:
schools_dict[school_name] = {
"name": school_name,
"url": "",
"programs": [],
"faculty": school_staff.get(school_name, []),
"faculty_source": "school_directory" if school_staff.get(school_name) else "",
}
schools_dict[school_name]["programs"].append(
{
"name": prog["name"],
"url": prog["url"],
"faculty": [],
}
)
for cfg in SCHOOL_CONFIG:
if cfg["name"] in schools_dict:
first_page = (cfg.get("staff_pages") or [{}])[0]
schools_dict[cfg["name"]]["url"] = first_page.get("url") or cfg.get("staff_url", "")
_attach_faculty_to_programs(schools_dict, school_staff)
result["schools"] = list(schools_dict.values())
total_programs = sum(len(s["programs"]) for s in result["schools"])
total_faculty = sum(len(s.get("faculty", [])) for s in result["schools"])
result["diagnostics"] = {
"total_programs": total_programs,
"total_faculty_records": total_faculty,
"school_staff_success": diagnostics.get("success", []),
"school_staff_failed": diagnostics.get("failed", []),
}
if output_callback:
output_callback(
"info",
f"Done! {len(result['schools'])} schools, {total_programs} programs, {total_faculty} faculty",
)
except Exception as exc: # noqa: BLE001
if output_callback:
output_callback("error", f"Scraping error: {str(exc)}")
finally:
await browser.close()
return result
def _attach_faculty_to_programs(schools_dict: Dict[str, Dict[str, Any]], staff_map: Dict[str, List[Dict[str, str]]]):
for school_name, school_data in schools_dict.items():
staff = staff_map.get(school_name, [])
cfg = SCHOOL_LOOKUP.get(school_name, {})
if not staff or not cfg.get("attach_faculty_to_programs"):
continue
limit = cfg.get("faculty_per_program")
for program in school_data["programs"]:
sliced = deepcopy(staff[:limit] if limit else staff)
program["faculty"] = sliced
# =========================
# CLI
# =========================
if __name__ == "__main__":
    import os
    import sys
if sys.platform == "win32":
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
def print_callback(level, msg):
print(f"[{level}] {msg}")
scrape_result = asyncio.run(scrape(output_callback=print_callback))
    output_path = "output/manchester_complete_result.json"
    os.makedirs("output", exist_ok=True)  # ensure the output directory exists
    with open(output_path, "w", encoding="utf-8") as f:
json.dump(scrape_result, f, ensure_ascii=False, indent=2)
print("\nResult saved to", output_path)
print("\n=== Summary ===")
for school in sorted(scrape_result["schools"], key=lambda s: -len(s.get("faculty", []))):
print(
f" {school['name']}: "
f"{len(school['programs'])} programs, "
f"{len(school.get('faculty', []))} faculty"
)