- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
"""
|
||
曼彻斯特大学完整采集脚本
|
||
新增特性:
|
||
- Research Explorer API 优先拉取 JSON / XML,失败再回落 DOM
|
||
- 每个学院独立页面、并行抓取(默认 3 并发)
|
||
- 细粒度超时/重试/滚动/Load more 控制
|
||
- 多 URL / 备用 Staff 页面配置
|
||
- 导师目录缓存,可按学院关键词映射到项目
|
||
- 诊断信息记录(失败学院、超时学院、批次信息)
|
||
"""
|
||
|
||
import asyncio
import json
import re
from copy import deepcopy
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlencode, urljoin
from xml.etree import ElementTree as ET

from playwright.async_api import (
    TimeoutError as PlaywrightTimeoutError,
    async_playwright,
)

# =========================
# Configuration
# =========================

DEFAULT_REQUEST = {
    "timeout_ms": 60000,
    "post_wait_ms": 2500,
    "wait_until": "domcontentloaded",
    "max_retries": 3,
    "retry_backoff_ms": 2000,
}

STAFF_CONCURRENCY = 3

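# Note: the values above are only defaults. Each school config below (and each entry
# in its "staff_pages" list) may carry a "request" dict whose non-None keys override
# these defaults via _merge_request_settings(), e.g. (illustrative values):
#   {"request": {"timeout_ms": 120000, "wait_until": "networkidle"}}
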
SCHOOL_CONFIG: List[Dict[str, Any]] = [
    {
        "name": "Alliance Manchester Business School",
        "keywords": [
            "accounting",
            "finance",
            "business",
            "management",
            "marketing",
            "mba",
            "economics",
            "entrepreneurship",
        ],
        "attach_faculty_to_programs": True,
        "staff_pages": [
            {
                "url": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
                "extract_method": "table",
                "request": {"timeout_ms": 60000, "wait_until": "networkidle"},
            }
        ],
    },
    {
        "name": "Department of Computer Science",
        "keywords": [
            "computer",
            "software",
            "data science",
            "artificial intelligence",
            "ai ",
            "machine learning",
            "cyber",
            "computing",
        ],
        "attach_faculty_to_programs": True,
        "staff_pages": [
            {
                "url": "https://www.cs.manchester.ac.uk/about/people/academic-and-research-staff/",
                "extract_method": "links",
                "requires_scroll": True,
            },
            {
                "url": "https://www.cs.manchester.ac.uk/about/people/",
                "extract_method": "links",
                "load_more_selector": "button.load-more",
                "max_load_more": 6,
            },
        ],
    },
    {
        "name": "Department of Physics and Astronomy",
        "keywords": [
            "physics",
            "astronomy",
            "astrophysics",
            "nuclear",
            "particle",
        ],
        "attach_faculty_to_programs": True,
        "staff_pages": [
            {
                "url": "https://www.physics.manchester.ac.uk/about/people/academic-and-research-staff/",
                "extract_method": "links",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "Department of Electrical and Electronic Engineering",
        "keywords": [
            "electrical",
            "electronic",
            "eee",
            "power systems",
            "microelectronics",
        ],
        "attach_faculty_to_programs": True,
        "staff_pages": [
            {
                "url": "https://www.eee.manchester.ac.uk/about/people/academic-and-research-staff/",
                "extract_method": "links",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "Department of Chemistry",
        "keywords": ["chemistry", "chemical"],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 200},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/department-of-chemistry/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
                "request": {
                    "timeout_ms": 120000,
                    "wait_until": "networkidle",
                    "post_wait_ms": 5000,
                },
            }
        ],
    },
    {
        "name": "Department of Mathematics",
        "keywords": [
            "mathematics",
            "mathematical",
            "applied math",
            "statistics",
            "actuarial",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 200},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/department-of-mathematics/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "School of Engineering",
        "keywords": [
            "engineering",
            "mechanical",
            "aerospace",
            "civil",
            "structural",
            "materials",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 400},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/school-of-engineering/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "Faculty of Biology, Medicine and Health",
        "keywords": [
            "medicine",
            "medical",
            "health",
            "nursing",
            "pharmacy",
            "clinical",
            "dental",
            "optometry",
            "biology",
            "biomedical",
            "anatomical",
            "physiotherapy",
            "midwifery",
            "mental health",
            "psychology",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 400},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/faculty-of-biology-medicine-and-health/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "School of Social Sciences",
        "keywords": [
            "sociology",
            "politics",
            "international",
            "social",
            "criminology",
            "anthropology",
            "philosophy",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 200},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/school-of-social-sciences/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "School of Law",
        "keywords": ["law", "legal", "llm"],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 200},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/school-of-law/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "School of Arts, Languages and Cultures",
        "keywords": [
            "arts",
            "languages",
            "culture",
            "music",
            "drama",
            "theatre",
            "history",
            "linguistics",
            "literature",
            "translation",
            "classics",
            "archaeology",
            "religion",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 400},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/school-of-arts-languages-and-cultures/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "School of Environment, Education and Development",
        "keywords": [
            "environment",
            "education",
            "development",
            "planning",
            "architecture",
            "urban",
            "geography",
            "sustainability",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 300},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/school-of-environment-education-and-development/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
]

SCHOOL_LOOKUP = {cfg["name"]: cfg for cfg in SCHOOL_CONFIG}

# =========================
# JS extraction functions
# =========================

JS_EXTRACT_TABLE_STAFF = """() => {
    const staff = [];
    const seen = new Set();

    document.querySelectorAll('table tr').forEach(row => {
        const cells = row.querySelectorAll('td');
        if (cells.length >= 2) {
            const link = cells[1]?.querySelector('a[href]') || cells[0]?.querySelector('a[href]');
            const titleCell = cells[2] || cells[1];

            if (link) {
                const name = link.innerText.trim();
                const url = link.href;
                const title = titleCell ? titleCell.innerText.trim() : '';

                if (name.length > 2 && !name.toLowerCase().includes('skip') && !seen.has(url)) {
                    seen.add(url);
                    staff.push({
                        name,
                        url,
                        title
                    });
                }
            }
        }
    });

    return staff;
}"""

JS_EXTRACT_LINK_STAFF = """() => {
    const staff = [];
    const seen = new Set();

    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href;
        const text = a.innerText.trim();

        if (seen.has(href)) return;
        if (text.length < 5 || text.length > 80) return;

        const lowerText = text.toLowerCase();
        if (lowerText.includes('skip') ||
            lowerText.includes('staff') ||
            lowerText.includes('people') ||
            lowerText.includes('academic') ||
            lowerText.includes('research profiles')) return;

        if (href.includes('/persons/') ||
            href.includes('/portal/en/researchers/') ||
            href.includes('/profile/') ||
            href.includes('/people/')) {
            seen.add(href);
            staff.push({
                name: text,
                url: href,
                title: ''
            });
        }
    });

    return staff;
}"""

JS_EXTRACT_RESEARCH_EXPLORER = """() => {
    const staff = [];
    const seen = new Set();

    document.querySelectorAll('a.link.person').forEach(a => {
        const href = a.href;
        const text = a.innerText.trim();

        if (!seen.has(href) && text.length > 3 && text.length < 80) {
            seen.add(href);
            staff.push({
                name: text,
                url: href,
                title: ''
            });
        }
    });

    if (staff.length === 0) {
        document.querySelectorAll('a[href*="/persons/"]').forEach(a => {
            const href = a.href;
            const text = a.innerText.trim();
            const lower = text.toLowerCase();

            if (seen.has(href)) return;
            if (text.length < 3 || text.length > 80) return;
            if (lower.includes('person') || lower.includes('next') || lower.includes('previous')) return;

            seen.add(href);
            staff.push({
                name: text,
                url: href,
                title: ''
            });
        });
    }

    return staff;
}"""

JS_EXTRACT_PROGRAMS = """() => {
    const programs = [];
    const seen = new Set();

    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href;
        const text = a.innerText.trim().replace(/\\s+/g, ' ');

        if (!href || seen.has(href)) return;
        if (text.length < 10 || text.length > 200) return;

        const hrefLower = href.toLowerCase();
        const textLower = text.toLowerCase();

        const isNav = textLower === 'courses' ||
            textLower === 'masters' ||
            textLower.includes('admission') ||
            textLower.includes('fees') ||
            textLower.includes('skip to') ||
            textLower.includes('search') ||
            textLower.includes('contact') ||
            hrefLower.includes('#');
        if (isNav) return;

        const hasNumericId = /\\/\\d{5}\\//.test(href);
        const isCoursePage = hrefLower.includes('/courses/list/') && hasNumericId;

        if (isCoursePage) {
            seen.add(href);
            programs.push({
                name: text,
                url: href
            });
        }
    });

    return programs;
}"""


# =========================
# Data matching
# =========================

def match_program_to_school(program_name: str) -> str:
    lower = program_name.lower()
    for school in SCHOOL_CONFIG:
        for keyword in school["keywords"]:
            if keyword in lower:
                return school["name"]
    return "Other Programs"

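# Illustrative mappings (hypothetical programme names), given the SCHOOL_CONFIG above.
# Matching is first-match-wins over the config order, so broad keywords in earlier
# entries can shadow later ones:
#   match_program_to_school("MSc Advanced Computer Science")  -> "Department of Computer Science"
#   match_program_to_school("MSc Quantitative Finance")       -> "Alliance Manchester Business School"
#   match_program_to_school("MA Creative Writing")            -> "Other Programs"
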
# =========================
# Request and parsing helpers
# =========================

def _merge_request_settings(*layers: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    settings = dict(DEFAULT_REQUEST)
    for layer in layers:
        if not layer:
            continue
        for key, value in layer.items():
            if value is not None:
                settings[key] = value
    settings["max_retries"] = max(1, int(settings.get("max_retries", 1)))
    settings["retry_backoff_ms"] = settings.get("retry_backoff_ms", 2000)
    return settings

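# Example (illustrative values): later layers win, so a page-level request dict
# overrides a school-level one, which in turn overrides DEFAULT_REQUEST:
#   _merge_request_settings({"timeout_ms": 90000}, {"wait_until": "networkidle"})
#   -> the defaults, plus timeout_ms=90000 and wait_until="networkidle"
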
async def _goto_with_retry(page, url: str, settings: Dict[str, Any], label: str) -> Tuple[bool, Optional[str]]:
    last_error = None
    for attempt in range(settings["max_retries"]):
        try:
            await page.goto(url, wait_until=settings["wait_until"], timeout=settings["timeout_ms"])
            if settings.get("wait_for_selector"):
                await page.wait_for_selector(settings["wait_for_selector"], timeout=settings["timeout_ms"])
            if settings.get("post_wait_ms"):
                await page.wait_for_timeout(settings["post_wait_ms"])
            return True, None
        except PlaywrightTimeoutError as exc:
            last_error = f"Timeout: {exc}"
        except Exception as exc:  # noqa: BLE001
            last_error = str(exc)

        if attempt < settings["max_retries"] - 1:
            await page.wait_for_timeout(settings["retry_backoff_ms"] * (attempt + 1))

    return False, last_error

async def _perform_scroll(page, repetitions: int = 5, delay_ms: int = 800):
    repetitions = max(1, repetitions)
    for i in range(repetitions):
        await page.evaluate("(y) => window.scrollTo(0, y)", 2000 * (i + 1))
        await page.wait_for_timeout(delay_ms)

async def _load_more(page, selector: str, max_clicks: int = 5, wait_ms: int = 1500):
    for _ in range(max_clicks):
        button = await page.query_selector(selector)
        if not button:
            break
        try:
            await button.click()
            await page.wait_for_timeout(wait_ms)
        except Exception:
            break

def _deduplicate_staff(staff: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    seen = set()
    cleaned = []
    for item in staff:
        name = (item.get("name") or "").strip()
        if not name:
            continue
        url = (item.get("url") or "").strip()
        key = url or name.lower()
        if key in seen:
            continue
        seen.add(key)
        cleaned.append({"name": name, "url": url, "title": (item.get("title") or "").strip()})
    return cleaned

def _append_query(url: str, params: Dict[str, Any]) -> str:
    delimiter = "&" if "?" in url else "?"
    return f"{url}{delimiter}{urlencode(params)}"

def _guess_research_slug(staff_url: Optional[str]) -> Optional[str]:
    # Staff URLs look like .../organisations/<slug>/persons/, so take the segment
    # before a trailing "persons" rather than the last path segment itself.
    if not staff_url:
        return None
    parts = [p for p in staff_url.rstrip("/").split("/") if p]
    if parts and parts[-1] == "persons":
        parts.pop()
    return parts[-1] if parts else None

def _parse_research_explorer_json(data: Any, base_url: str) -> List[Dict[str, str]]:
    items: List[Dict[str, Any]] = []
    if isinstance(data, list):
        items = data
    elif isinstance(data, dict):
        for key in ("results", "items", "persons", "data", "entities"):
            if isinstance(data.get(key), list):
                items = data[key]
                break
        if not items and isinstance(data.get("rows"), list):
            items = data["rows"]

    staff = []
    for item in items:
        if not isinstance(item, dict):
            continue
        name = item.get("name") or item.get("title") or item.get("fullName")
        profile_url = item.get("url") or item.get("href") or item.get("link") or item.get("primaryURL")
        if not name:
            continue
        if profile_url:
            profile_url = urljoin(base_url, profile_url)
        staff.append(
            {
                "name": name.strip(),
                "url": (profile_url or "").strip(),
                "title": (item.get("jobTitle") or item.get("position") or "").strip(),
            }
        )
    return staff

def _parse_research_explorer_xml(text: str, base_url: str) -> List[Dict[str, str]]:
    staff: List[Dict[str, str]] = []
    try:
        root = ET.fromstring(text)
    except ET.ParseError:
        return staff

    for entry in root.findall(".//{http://www.w3.org/2005/Atom}entry"):
        title = entry.findtext("{http://www.w3.org/2005/Atom}title", default="")
        link = entry.find("{http://www.w3.org/2005/Atom}link")
        href = link.attrib.get("href") if link is not None else ""
        if title:
            staff.append(
                {
                    "name": title.strip(),
                    "url": urljoin(base_url, href) if href else "",
                    "title": "",
                }
            )
    return staff

async def fetch_research_explorer_api(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]:
    config = school_config.get("research_explorer") or {}
    if not config and school_config.get("extract_method") != "research_explorer":
        return []

    base_staff_url = ""
    if school_config.get("staff_pages"):
        base_staff_url = school_config["staff_pages"][0].get("url", "")

    page_size = config.get("page_size", 200)
    timeout_ms = config.get("timeout_ms", 70000)

    candidates: List[str] = []
    slug = config.get("org_slug") or _guess_research_slug(base_staff_url)
    base_api = config.get("api_base", "https://research.manchester.ac.uk/ws/portalapi.aspx")

    if config.get("api_url"):
        candidates.append(config["api_url"])

    if slug:
        params = {
            "action": "search",
            "language": "en",
            "format": "json",
            "site": "default",
            "showall": "true",
            "pageSize": page_size,
            "organisations": slug,
        }
        candidates.append(f"{base_api}?{urlencode(params)}")

    if base_staff_url:
        candidates.append(_append_query(base_staff_url, {"format": "json", "limit": page_size}))
        candidates.append(_append_query(base_staff_url, {"format": "xml", "limit": page_size}))

    for url in candidates:
        try:
            resp = await context.request.get(url, timeout=timeout_ms)
            if resp.status != 200:
                continue
            ctype = resp.headers.get("content-type", "")
            if "json" in ctype:
                data = await resp.json()
                parsed = _parse_research_explorer_json(data, base_staff_url)
            else:
                text = await resp.text()
                parsed = _parse_research_explorer_xml(text, base_staff_url)
            parsed = _deduplicate_staff(parsed)
            if parsed:
                if output_callback:
                    output_callback("info", f" {school_config['name']}: {len(parsed)} staff via API")
                return parsed
        except Exception as exc:  # noqa: BLE001
            if output_callback:
                output_callback(
                    "warning", f" {school_config['name']}: API fetch failed ({str(exc)[:60]})"
                )
    return []

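# Candidate API URLs are tried in order: an explicit "api_url" from the config, then the
# portal search endpoint with the guessed organisation slug, then the staff page itself
# with format=json / format=xml appended. The first candidate that parses into any staff
# wins; if none do, the caller falls back to DOM scraping via scrape_staff_via_browser().
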
async def scrape_staff_via_browser(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]:
    staff_collected: List[Dict[str, str]] = []
    staff_pages = school_config.get("staff_pages") or []
    if not staff_pages and school_config.get("staff_url"):
        staff_pages = [{"url": school_config["staff_url"], "extract_method": school_config.get("extract_method")}]

    page = await context.new_page()
    blocked_types = school_config.get("blocked_resources", ["image", "font", "media"])
    if blocked_types:
        async def _route_handler(route):
            if route.request.resource_type in blocked_types:
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", _route_handler)

    for page_cfg in staff_pages:
        target_url = page_cfg.get("url")
        if not target_url:
            continue

        settings = _merge_request_settings(school_config.get("request"), page_cfg.get("request"))
        success, error = await _goto_with_retry(page, target_url, settings, school_config["name"])
        if not success:
            if output_callback:
                output_callback("warning", f" {school_config['name']}: failed to load {target_url} ({error})")
            continue

        if page_cfg.get("requires_scroll"):
            await _perform_scroll(page, page_cfg.get("scroll_times", 6), page_cfg.get("scroll_delay_ms", 700))

        if page_cfg.get("load_from_selector"):
            await _load_more(page, page_cfg["load_from_selector"], page_cfg.get("max_load_more", 5))
        elif page_cfg.get("load_more_selector"):
            await _load_more(page, page_cfg["load_more_selector"], page_cfg.get("max_load_more", 5))

        method = page_cfg.get("extract_method") or school_config.get("extract_method") or "links"
        if method == "table":
            extracted = await page.evaluate(JS_EXTRACT_TABLE_STAFF)
        elif method == "research_explorer":
            extracted = await page.evaluate(JS_EXTRACT_RESEARCH_EXPLORER)
        else:
            extracted = await page.evaluate(JS_EXTRACT_LINK_STAFF)

        staff_collected.extend(extracted)

    await page.close()
    return _deduplicate_staff(staff_collected)

# =========================
# Concurrent per-school staff scraping
# =========================

async def scrape_school_staff(context, school_config: Dict[str, Any], semaphore, output_callback):
    async with semaphore:
        staff_list: List[Dict[str, str]] = []
        status = "success"
        error: Optional[str] = None

        try:
            if school_config.get("extract_method") == "research_explorer":
                staff_list = await fetch_research_explorer_api(context, school_config, output_callback)
            if not staff_list:
                staff_list = await scrape_staff_via_browser(context, school_config, output_callback)

            if output_callback:
                output_callback("info", f" {school_config['name']}: total {len(staff_list)} staff")

        except Exception as exc:  # noqa: BLE001
            status = "error"
            error = str(exc)
            if output_callback:
                output_callback("error", f" {school_config['name']}: {error}")

        return {
            "name": school_config["name"],
            "staff": staff_list,
            "status": status,
            "error": error,
        }

async def scrape_all_school_staff(context, output_callback):
    semaphore = asyncio.Semaphore(STAFF_CONCURRENCY)
    tasks = [
        asyncio.create_task(scrape_school_staff(context, cfg, semaphore, output_callback))
        for cfg in SCHOOL_CONFIG
    ]
    results = await asyncio.gather(*tasks)

    staff_map = {}
    diagnostics = {"failed": [], "success": [], "total": len(results)}
    for res in results:
        if res["staff"]:
            staff_map[res["name"]] = res["staff"]
            diagnostics["success"].append(res["name"])
        else:
            diagnostics["failed"].append(
                {
                    "name": res["name"],
                    "status": res["status"],
                    "error": res.get("error"),
                }
            )
    return staff_map, diagnostics

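# scrape_all_school_staff returns (staff_map, diagnostics). The diagnostics dict has
# this shape (values illustrative):
#   {"failed": [{"name": "...", "status": "error", "error": "..."}],
#    "success": ["Department of Computer Science", ...],
#    "total": len(SCHOOL_CONFIG)}
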
# =========================
# Main flow
# =========================

async def scrape(output_callback=None):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )

        base_url = "https://www.manchester.ac.uk/"
        result = {
            "name": "The University of Manchester",
            "url": base_url,
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "schools": [],
            "diagnostics": {},
        }

        try:
            # Step 1: Masters programme list
            if output_callback:
                output_callback("info", "Step 1: Scraping masters programs list...")

            page = await context.new_page()
            courses_url = "https://www.manchester.ac.uk/study/masters/courses/list/"
            await page.goto(courses_url, wait_until="domcontentloaded", timeout=40000)
            await page.wait_for_timeout(3000)
            programs_data = await page.evaluate(JS_EXTRACT_PROGRAMS)
            await page.close()

            if output_callback:
                output_callback("info", f"Found {len(programs_data)} masters programs")

            # Step 2: Scrape school staff in parallel
            if output_callback:
                output_callback("info", "Step 2: Scraping faculty from staff pages (parallel)...")
            school_staff, diagnostics = await scrape_all_school_staff(context, output_callback)

            # Step 3: Organise the data
            schools_dict: Dict[str, Dict[str, Any]] = {}
            for prog in programs_data:
                school_name = match_program_to_school(prog["name"])
                if school_name not in schools_dict:
                    schools_dict[school_name] = {
                        "name": school_name,
                        "url": "",
                        "programs": [],
                        "faculty": school_staff.get(school_name, []),
                        "faculty_source": "school_directory" if school_staff.get(school_name) else "",
                    }

                schools_dict[school_name]["programs"].append(
                    {
                        "name": prog["name"],
                        "url": prog["url"],
                        "faculty": [],
                    }
                )

            for cfg in SCHOOL_CONFIG:
                if cfg["name"] in schools_dict:
                    first_page = (cfg.get("staff_pages") or [{}])[0]
                    schools_dict[cfg["name"]]["url"] = first_page.get("url") or cfg.get("staff_url", "")

            _attach_faculty_to_programs(schools_dict, school_staff)

            result["schools"] = list(schools_dict.values())

            total_programs = sum(len(s["programs"]) for s in result["schools"])
            total_faculty = sum(len(s.get("faculty", [])) for s in result["schools"])

            result["diagnostics"] = {
                "total_programs": total_programs,
                "total_faculty_records": total_faculty,
                "school_staff_success": diagnostics.get("success", []),
                "school_staff_failed": diagnostics.get("failed", []),
            }

            if output_callback:
                output_callback(
                    "info",
                    f"Done! {len(result['schools'])} schools, {total_programs} programs, {total_faculty} faculty",
                )

        except Exception as exc:  # noqa: BLE001
            if output_callback:
                output_callback("error", f"Scraping error: {str(exc)}")
        finally:
            await browser.close()

        return result

def _attach_faculty_to_programs(schools_dict: Dict[str, Dict[str, Any]], staff_map: Dict[str, List[Dict[str, str]]]):
    for school_name, school_data in schools_dict.items():
        staff = staff_map.get(school_name, [])
        cfg = SCHOOL_LOOKUP.get(school_name, {})
        if not staff or not cfg.get("attach_faculty_to_programs"):
            continue

        limit = cfg.get("faculty_per_program")
        for program in school_data["programs"]:
            sliced = deepcopy(staff[:limit] if limit else staff)
            program["faculty"] = sliced

# =========================
# CLI
# =========================

if __name__ == "__main__":
    import os
    import sys

    if sys.platform == "win32":
        asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

    def print_callback(level, msg):
        print(f"[{level}] {msg}")

    scrape_result = asyncio.run(scrape(output_callback=print_callback))

    output_path = "output/manchester_complete_result.json"
    os.makedirs("output", exist_ok=True)  # avoid FileNotFoundError on a fresh checkout
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(scrape_result, f, ensure_ascii=False, indent=2)

    print("\nResult saved to", output_path)
    print("\n=== Summary ===")
    for school in sorted(scrape_result["schools"], key=lambda s: -len(s.get("faculty", []))):
        print(
            f" {school['name']}: "
            f"{len(school['programs'])} programs, "
            f"{len(school.get('faculty', []))} faculty"
        )