Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

artifacts/manchester_complete_scraper.py (new file, 910 lines)
@@ -0,0 +1,910 @@
"""
|
||||
曼彻斯特大学完整采集脚本
|
||||
新增特性:
|
||||
- Research Explorer API 优先拉取 JSON / XML,失败再回落 DOM
|
||||
- 每个学院独立页面、并行抓取(默认 3 并发)
|
||||
- 细粒度超时/重试/滚动/Load more 控制
|
||||
- 多 URL / 备用 Staff 页面配置
|
||||
- 导师目录缓存,可按学院关键词映射到项目
|
||||
- 诊断信息记录(失败学院、超时学院、批次信息)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from urllib.parse import urlencode, urljoin
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
from playwright.async_api import (
|
||||
TimeoutError as PlaywrightTimeoutError,
|
||||
async_playwright,
|
||||
)
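
# Example invocation (a sketch; assumes Playwright and its Chromium build are
# already installed, e.g. `pip install playwright && playwright install chromium`):
#
#   python artifacts/manchester_complete_scraper.py
#
# This runs scrape() with a print callback and writes
# output/manchester_complete_result.json.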

# =========================
# Configuration
# =========================

DEFAULT_REQUEST = {
    "timeout_ms": 60000,
    "post_wait_ms": 2500,
    "wait_until": "domcontentloaded",
    "max_retries": 3,
    "retry_backoff_ms": 2000,
}

STAFF_CONCURRENCY = 3

SCHOOL_CONFIG: List[Dict[str, Any]] = [
    {
        "name": "Alliance Manchester Business School",
        "keywords": [
            "accounting",
            "finance",
            "business",
            "management",
            "marketing",
            "mba",
            "economics",
            "entrepreneurship",
        ],
        "attach_faculty_to_programs": True,
        "staff_pages": [
            {
                "url": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
                "extract_method": "table",
                "request": {"timeout_ms": 60000, "wait_until": "networkidle"},
            }
        ],
    },
    {
        "name": "Department of Computer Science",
        "keywords": [
            "computer",
            "software",
            "data science",
            "artificial intelligence",
            "ai ",
            "machine learning",
            "cyber",
            "computing",
        ],
        "attach_faculty_to_programs": True,
        "staff_pages": [
            {
                "url": "https://www.cs.manchester.ac.uk/about/people/academic-and-research-staff/",
                "extract_method": "links",
                "requires_scroll": True,
            },
            {
                "url": "https://www.cs.manchester.ac.uk/about/people/",
                "extract_method": "links",
                "load_more_selector": "button.load-more",
                "max_load_more": 6,
            },
        ],
    },
    {
        "name": "Department of Physics and Astronomy",
        "keywords": [
            "physics",
            "astronomy",
            "astrophysics",
            "nuclear",
            "particle",
        ],
        "attach_faculty_to_programs": True,
        "staff_pages": [
            {
                "url": "https://www.physics.manchester.ac.uk/about/people/academic-and-research-staff/",
                "extract_method": "links",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "Department of Electrical and Electronic Engineering",
        "keywords": [
            "electrical",
            "electronic",
            "eee",
            "power systems",
            "microelectronics",
        ],
        "attach_faculty_to_programs": True,
        "staff_pages": [
            {
                "url": "https://www.eee.manchester.ac.uk/about/people/academic-and-research-staff/",
                "extract_method": "links",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "Department of Chemistry",
        "keywords": ["chemistry", "chemical"],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 200},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/department-of-chemistry/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
                "request": {
                    "timeout_ms": 120000,
                    "wait_until": "networkidle",
                    "post_wait_ms": 5000,
                },
            }
        ],
    },
    {
        "name": "Department of Mathematics",
        "keywords": [
            "mathematics",
            "mathematical",
            "applied math",
            "statistics",
            "actuarial",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 200},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/department-of-mathematics/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "School of Engineering",
        "keywords": [
            "engineering",
            "mechanical",
            "aerospace",
            "civil",
            "structural",
            "materials",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 400},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/school-of-engineering/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "Faculty of Biology, Medicine and Health",
        "keywords": [
            "medicine",
            "medical",
            "health",
            "nursing",
            "pharmacy",
            "clinical",
            "dental",
            "optometry",
            "biology",
            "biomedical",
            "anatomical",
            "physiotherapy",
            "midwifery",
            "mental health",
            "psychology",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 400},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/faculty-of-biology-medicine-and-health/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "School of Social Sciences",
        "keywords": [
            "sociology",
            "politics",
            "international",
            "social",
            "criminology",
            "anthropology",
            "philosophy",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 200},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/school-of-social-sciences/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "School of Law",
        "keywords": ["law", "legal", "llm"],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 200},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/school-of-law/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "School of Arts, Languages and Cultures",
        "keywords": [
            "arts",
            "languages",
            "culture",
            "music",
            "drama",
            "theatre",
            "history",
            "linguistics",
            "literature",
            "translation",
            "classics",
            "archaeology",
            "religion",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 400},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/school-of-arts-languages-and-cultures/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "School of Environment, Education and Development",
        "keywords": [
            "environment",
            "education",
            "development",
            "planning",
            "architecture",
            "urban",
            "geography",
            "sustainability",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 300},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/school-of-environment-education-and-development/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
]

SCHOOL_LOOKUP = {cfg["name"]: cfg for cfg in SCHOOL_CONFIG}
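
# Adding a school is a config-only change. A hypothetical entry (the slug and
# keywords below are illustrative, not part of the real configuration):
#
#   {
#       "name": "School of Materials",
#       "keywords": ["materials science", "polymer"],
#       "attach_faculty_to_programs": True,
#       "extract_method": "research_explorer",
#       "research_explorer": {"page_size": 200},
#       "staff_pages": [
#           {
#               "url": "https://research.manchester.ac.uk/en/organisations/school-of-materials/persons/",
#               "extract_method": "research_explorer",
#               "requires_scroll": True,
#           }
#       ],
#   }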

# =========================
# JS extraction functions
# =========================

JS_EXTRACT_TABLE_STAFF = """() => {
    const staff = [];
    const seen = new Set();

    document.querySelectorAll('table tr').forEach(row => {
        const cells = row.querySelectorAll('td');
        if (cells.length >= 2) {
            const link = cells[1]?.querySelector('a[href]') || cells[0]?.querySelector('a[href]');
            const titleCell = cells[2] || cells[1];

            if (link) {
                const name = link.innerText.trim();
                const url = link.href;
                const title = titleCell ? titleCell.innerText.trim() : '';

                if (name.length > 2 && !name.toLowerCase().includes('skip') && !seen.has(url)) {
                    seen.add(url);
                    staff.push({
                        name,
                        url,
                        title
                    });
                }
            }
        }
    });

    return staff;
}"""

JS_EXTRACT_LINK_STAFF = """() => {
    const staff = [];
    const seen = new Set();

    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href;
        const text = a.innerText.trim();

        if (seen.has(href)) return;
        if (text.length < 5 || text.length > 80) return;

        const lowerText = text.toLowerCase();
        if (lowerText.includes('skip') ||
            lowerText.includes('staff') ||
            lowerText.includes('people') ||
            lowerText.includes('academic') ||
            lowerText.includes('research profiles')) return;

        if (href.includes('/persons/') ||
            href.includes('/portal/en/researchers/') ||
            href.includes('/profile/') ||
            href.includes('/people/')) {
            seen.add(href);
            staff.push({
                name: text,
                url: href,
                title: ''
            });
        }
    });

    return staff;
}"""

JS_EXTRACT_RESEARCH_EXPLORER = """() => {
    const staff = [];
    const seen = new Set();

    document.querySelectorAll('a.link.person').forEach(a => {
        const href = a.href;
        const text = a.innerText.trim();

        if (!seen.has(href) && text.length > 3 && text.length < 80) {
            seen.add(href);
            staff.push({
                name: text,
                url: href,
                title: ''
            });
        }
    });

    if (staff.length === 0) {
        document.querySelectorAll('a[href*="/persons/"]').forEach(a => {
            const href = a.href;
            const text = a.innerText.trim();
            const lower = text.toLowerCase();

            if (seen.has(href)) return;
            if (text.length < 3 || text.length > 80) return;
            if (lower.includes('person') || lower.includes('next') || lower.includes('previous')) return;

            seen.add(href);
            staff.push({
                name: text,
                url: href,
                title: ''
            });
        });
    }

    return staff;
}"""

JS_EXTRACT_PROGRAMS = """() => {
    const programs = [];
    const seen = new Set();

    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href;
        const text = a.innerText.trim().replace(/\\s+/g, ' ');

        if (!href || seen.has(href)) return;
        if (text.length < 10 || text.length > 200) return;

        const hrefLower = href.toLowerCase();
        const textLower = text.toLowerCase();

        const isNav = textLower === 'courses' ||
            textLower === 'masters' ||
            textLower.includes('admission') ||
            textLower.includes('fees') ||
            textLower.includes('skip to') ||
            textLower.includes('search') ||
            textLower.includes('contact') ||
            hrefLower.includes('#');
        if (isNav) return;

        const hasNumericId = /\\/\\d{5}\\//.test(href);
        const isCoursePage = hrefLower.includes('/courses/list/') && hasNumericId;

        if (isCoursePage) {
            seen.add(href);
            programs.push({
                name: text,
                url: href
            });
        }
    });

    return programs;
}"""


# =========================
# Data matching
# =========================


def match_program_to_school(program_name: str) -> str:
    """Map a programme name to a school via the first keyword match."""
    lower = program_name.lower()
    for school in SCHOOL_CONFIG:
        for keyword in school["keywords"]:
            if keyword in lower:
                return school["name"]
    return "Other Programs"
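
# A quick illustration against the config above (hypothetical programme names);
# the first keyword hit wins, in SCHOOL_CONFIG order:
#
#   match_program_to_school("MSc Advanced Computer Science")
#   -> "Department of Computer Science"      (matches "computer")
#   match_program_to_school("MSc Finance")
#   -> "Alliance Manchester Business School" (matches "finance")
#   match_program_to_school("MA Egyptology")
#   -> "Other Programs"                      (no keyword matches)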


# =========================
# Request and parsing helpers
# =========================


def _merge_request_settings(*layers: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    settings = dict(DEFAULT_REQUEST)
    for layer in layers:
        if not layer:
            continue
        for key, value in layer.items():
            if value is not None:
                settings[key] = value
    settings["max_retries"] = max(1, int(settings.get("max_retries", 1)))
    settings["retry_backoff_ms"] = settings.get("retry_backoff_ms", 2000)
    return settings
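
# Layering example (illustrative values): later layers override earlier ones,
# everything sits on top of DEFAULT_REQUEST, and None values never win:
#
#   _merge_request_settings({"timeout_ms": 120000}, {"wait_until": "networkidle"})
#   -> {"timeout_ms": 120000, "wait_until": "networkidle",
#       "post_wait_ms": 2500, "max_retries": 3, "retry_backoff_ms": 2000}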


async def _goto_with_retry(page, url: str, settings: Dict[str, Any], label: str) -> Tuple[bool, Optional[str]]:
    last_error = None
    for attempt in range(settings["max_retries"]):
        try:
            await page.goto(url, wait_until=settings["wait_until"], timeout=settings["timeout_ms"])
            if settings.get("wait_for_selector"):
                await page.wait_for_selector(settings["wait_for_selector"], timeout=settings["timeout_ms"])
            if settings.get("post_wait_ms"):
                await page.wait_for_timeout(settings["post_wait_ms"])
            return True, None
        except PlaywrightTimeoutError as exc:
            last_error = f"Timeout: {exc}"
        except Exception as exc:  # noqa: BLE001
            last_error = str(exc)

        if attempt < settings["max_retries"] - 1:
            await page.wait_for_timeout(settings["retry_backoff_ms"] * (attempt + 1))

    return False, last_error


async def _perform_scroll(page, repetitions: int = 5, delay_ms: int = 800):
    repetitions = max(1, repetitions)
    for i in range(repetitions):
        await page.evaluate("(y) => window.scrollTo(0, y)", 2000 * (i + 1))
        await page.wait_for_timeout(delay_ms)


async def _load_more(page, selector: str, max_clicks: int = 5, wait_ms: int = 1500):
    for _ in range(max_clicks):
        button = await page.query_selector(selector)
        if not button:
            break
        try:
            await button.click()
            await page.wait_for_timeout(wait_ms)
        except Exception:
            break


def _deduplicate_staff(staff: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    seen = set()
    cleaned = []
    for item in staff:
        name = (item.get("name") or "").strip()
        if not name:
            continue
        url = (item.get("url") or "").strip()
        key = url or name.lower()
        if key in seen:
            continue
        seen.add(key)
        cleaned.append({"name": name, "url": url, "title": (item.get("title") or "").strip()})
    return cleaned


def _append_query(url: str, params: Dict[str, Any]) -> str:
    delimiter = "&" if "?" in url else "?"
    return f"{url}{delimiter}{urlencode(params)}"


def _guess_research_slug(staff_url: Optional[str]) -> Optional[str]:
    if not staff_url:
        return None
    parts = [part for part in staff_url.rstrip("/").split("/") if part]
    if not parts:
        return None
    # Staff listings look like .../organisations/<slug>/persons/, so the slug is
    # the segment before a trailing "persons", not the last segment itself.
    if parts[-1] == "persons" and len(parts) >= 2:
        return parts[-2]
    return parts[-1]


def _parse_research_explorer_json(data: Any, base_url: str) -> List[Dict[str, str]]:
    items: List[Dict[str, Any]] = []
    if isinstance(data, list):
        items = data
    elif isinstance(data, dict):
        for key in ("results", "items", "persons", "data", "entities"):
            if isinstance(data.get(key), list):
                items = data[key]
                break
        if not items and isinstance(data.get("rows"), list):
            items = data["rows"]

    staff = []
    for item in items:
        if not isinstance(item, dict):
            continue
        name = item.get("name") or item.get("title") or item.get("fullName")
        profile_url = item.get("url") or item.get("href") or item.get("link") or item.get("primaryURL")
        if not name:
            continue
        if profile_url:
            profile_url = urljoin(base_url, profile_url)
        staff.append(
            {
                "name": name.strip(),
                "url": (profile_url or "").strip(),
                "title": (item.get("jobTitle") or item.get("position") or "").strip(),
            }
        )
    return staff


def _parse_research_explorer_xml(text: str, base_url: str) -> List[Dict[str, str]]:
    staff: List[Dict[str, str]] = []
    try:
        root = ET.fromstring(text)
    except ET.ParseError:
        return staff

    for entry in root.findall(".//{http://www.w3.org/2005/Atom}entry"):
        title = entry.findtext("{http://www.w3.org/2005/Atom}title", default="")
        link = entry.find("{http://www.w3.org/2005/Atom}link")
        href = link.attrib.get("href") if link is not None else ""
        if title:
            staff.append(
                {
                    "name": title.strip(),
                    "url": urljoin(base_url, href) if href else "",
                    "title": "",
                }
            )
    return staff


async def fetch_research_explorer_api(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]:
    config = school_config.get("research_explorer") or {}
    if not config and school_config.get("extract_method") != "research_explorer":
        return []

    base_staff_url = ""
    if school_config.get("staff_pages"):
        base_staff_url = school_config["staff_pages"][0].get("url", "")

    page_size = config.get("page_size", 200)
    timeout_ms = config.get("timeout_ms", 70000)

    candidates: List[str] = []
    slug = config.get("org_slug") or _guess_research_slug(base_staff_url)
    base_api = config.get("api_base", "https://research.manchester.ac.uk/ws/portalapi.aspx")

    if config.get("api_url"):
        candidates.append(config["api_url"])

    if slug:
        params = {
            "action": "search",
            "language": "en",
            "format": "json",
            "site": "default",
            "showall": "true",
            "pageSize": page_size,
            "organisations": slug,
        }
        candidates.append(f"{base_api}?{urlencode(params)}")

    if base_staff_url:
        candidates.append(_append_query(base_staff_url, {"format": "json", "limit": page_size}))
        candidates.append(_append_query(base_staff_url, {"format": "xml", "limit": page_size}))

    for url in candidates:
        try:
            resp = await context.request.get(url, timeout=timeout_ms)
            if resp.status != 200:
                continue
            ctype = resp.headers.get("content-type", "")
            if "json" in ctype:
                data = await resp.json()
                parsed = _parse_research_explorer_json(data, base_staff_url)
            else:
                text = await resp.text()
                parsed = _parse_research_explorer_xml(text, base_staff_url)
            parsed = _deduplicate_staff(parsed)
            if parsed:
                if output_callback:
                    output_callback("info", f" {school_config['name']}: {len(parsed)} staff via API")
                return parsed
        except Exception as exc:  # noqa: BLE001
            if output_callback:
                output_callback(
                    "warning", f" {school_config['name']}: API fetch failed ({str(exc)[:60]})"
                )
    return []
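
# The API candidates above are tried in order (a sketch; query values are
# illustrative):
#   1. config["api_url"], if set explicitly
#   2. https://research.manchester.ac.uk/ws/portalapi.aspx?action=search&...&organisations=<slug>
#   3. <staff page URL>?format=json&limit=<page_size>
#   4. <staff page URL>?format=xml&limit=<page_size>
# The first candidate that returns HTTP 200 with parseable people wins; if none
# does, the caller falls back to browser-based DOM scraping.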


async def scrape_staff_via_browser(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]:
    staff_collected: List[Dict[str, str]] = []
    staff_pages = school_config.get("staff_pages") or []
    if not staff_pages and school_config.get("staff_url"):
        staff_pages = [{"url": school_config["staff_url"], "extract_method": school_config.get("extract_method")}]

    page = await context.new_page()
    blocked_types = school_config.get("blocked_resources", ["image", "font", "media"])
    if blocked_types:

        async def _route_handler(route):
            if route.request.resource_type in blocked_types:
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", _route_handler)

    try:
        for page_cfg in staff_pages:
            target_url = page_cfg.get("url")
            if not target_url:
                continue

            settings = _merge_request_settings(school_config.get("request"), page_cfg.get("request"))
            success, error = await _goto_with_retry(page, target_url, settings, school_config["name"])
            if not success:
                if output_callback:
                    output_callback("warning", f" {school_config['name']}: failed to load {target_url} ({error})")
                continue

            if page_cfg.get("requires_scroll"):
                await _perform_scroll(page, page_cfg.get("scroll_times", 6), page_cfg.get("scroll_delay_ms", 700))

            # Accept either spelling of the "load more" selector key.
            load_selector = page_cfg.get("load_more_selector") or page_cfg.get("load_from_selector")
            if load_selector:
                await _load_more(page, load_selector, page_cfg.get("max_load_more", 5))

            method = page_cfg.get("extract_method") or school_config.get("extract_method") or "links"
            if method == "table":
                extracted = await page.evaluate(JS_EXTRACT_TABLE_STAFF)
            elif method == "research_explorer":
                extracted = await page.evaluate(JS_EXTRACT_RESEARCH_EXPLORER)
            else:
                extracted = await page.evaluate(JS_EXTRACT_LINK_STAFF)

            staff_collected.extend(extracted)
    finally:
        # Close the page even if navigation or extraction raised.
        await page.close()

    return _deduplicate_staff(staff_collected)


# =========================
# Concurrent per-school staff scraping
# =========================


async def scrape_school_staff(context, school_config: Dict[str, Any], semaphore, output_callback):
    async with semaphore:
        staff_list: List[Dict[str, str]] = []
        status = "success"
        error: Optional[str] = None

        try:
            if school_config.get("extract_method") == "research_explorer":
                staff_list = await fetch_research_explorer_api(context, school_config, output_callback)
            if not staff_list:
                staff_list = await scrape_staff_via_browser(context, school_config, output_callback)

            if output_callback:
                output_callback("info", f" {school_config['name']}: total {len(staff_list)} staff")

        except Exception as exc:  # noqa: BLE001
            status = "error"
            error = str(exc)
            if output_callback:
                output_callback("error", f" {school_config['name']}: {error}")

        return {
            "name": school_config["name"],
            "staff": staff_list,
            "status": status,
            "error": error,
        }


async def scrape_all_school_staff(context, output_callback):
    semaphore = asyncio.Semaphore(STAFF_CONCURRENCY)
    tasks = [
        asyncio.create_task(scrape_school_staff(context, cfg, semaphore, output_callback))
        for cfg in SCHOOL_CONFIG
    ]
    results = await asyncio.gather(*tasks)

    staff_map = {}
    diagnostics = {"failed": [], "success": [], "total": len(results)}
    for res in results:
        if res["staff"]:
            staff_map[res["name"]] = res["staff"]
            diagnostics["success"].append(res["name"])
        else:
            diagnostics["failed"].append(
                {
                    "name": res["name"],
                    "status": res["status"],
                    "error": res.get("error"),
                }
            )
    return staff_map, diagnostics


# =========================
# Main flow
# =========================


async def scrape(output_callback=None):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )

        base_url = "https://www.manchester.ac.uk/"
        result = {
            "name": "The University of Manchester",
            "url": base_url,
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "schools": [],
            "diagnostics": {},
        }

        try:
            # Step 1: masters programme list
            if output_callback:
                output_callback("info", "Step 1: Scraping masters programs list...")

            page = await context.new_page()
            courses_url = "https://www.manchester.ac.uk/study/masters/courses/list/"
            await page.goto(courses_url, wait_until="domcontentloaded", timeout=40000)
            await page.wait_for_timeout(3000)
            programs_data = await page.evaluate(JS_EXTRACT_PROGRAMS)
            await page.close()

            if output_callback:
                output_callback("info", f"Found {len(programs_data)} masters programs")

            # Step 2: scrape school staff in parallel
            if output_callback:
                output_callback("info", "Step 2: Scraping faculty from staff pages (parallel)...")
            school_staff, diagnostics = await scrape_all_school_staff(context, output_callback)

            # Step 3: organise the data
            schools_dict: Dict[str, Dict[str, Any]] = {}
            for prog in programs_data:
                school_name = match_program_to_school(prog["name"])
                if school_name not in schools_dict:
                    schools_dict[school_name] = {
                        "name": school_name,
                        "url": "",
                        "programs": [],
                        "faculty": school_staff.get(school_name, []),
                        "faculty_source": "school_directory" if school_staff.get(school_name) else "",
                    }

                schools_dict[school_name]["programs"].append(
                    {
                        "name": prog["name"],
                        "url": prog["url"],
                        "faculty": [],
                    }
                )

            for cfg in SCHOOL_CONFIG:
                if cfg["name"] in schools_dict:
                    first_page = (cfg.get("staff_pages") or [{}])[0]
                    schools_dict[cfg["name"]]["url"] = first_page.get("url") or cfg.get("staff_url", "")

            _attach_faculty_to_programs(schools_dict, school_staff)

            result["schools"] = list(schools_dict.values())

            total_programs = sum(len(s["programs"]) for s in result["schools"])
            total_faculty = sum(len(s.get("faculty", [])) for s in result["schools"])

            result["diagnostics"] = {
                "total_programs": total_programs,
                "total_faculty_records": total_faculty,
                "school_staff_success": diagnostics.get("success", []),
                "school_staff_failed": diagnostics.get("failed", []),
            }

            if output_callback:
                output_callback(
                    "info",
                    f"Done! {len(result['schools'])} schools, {total_programs} programs, {total_faculty} faculty",
                )

        except Exception as exc:  # noqa: BLE001
            if output_callback:
                output_callback("error", f"Scraping error: {str(exc)}")
        finally:
            await browser.close()

        return result


def _attach_faculty_to_programs(schools_dict: Dict[str, Dict[str, Any]], staff_map: Dict[str, List[Dict[str, str]]]):
    for school_name, school_data in schools_dict.items():
        staff = staff_map.get(school_name, [])
        cfg = SCHOOL_LOOKUP.get(school_name, {})
        if not staff or not cfg.get("attach_faculty_to_programs"):
            continue

        limit = cfg.get("faculty_per_program")
        for program in school_data["programs"]:
            sliced = deepcopy(staff[:limit] if limit else staff)
            program["faculty"] = sliced
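
# "faculty_per_program" is an optional per-school cap that is read here but not
# set anywhere in SCHOOL_CONFIG above; a hypothetical use would be adding, e.g.,
#   "faculty_per_program": 25,
# to a school entry so each programme carries at most 25 attached staff records.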


# =========================
# CLI
# =========================

if __name__ == "__main__":
    import os
    import sys

    if sys.platform == "win32":
        asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

    def print_callback(level, msg):
        print(f"[{level}] {msg}")

    scrape_result = asyncio.run(scrape(output_callback=print_callback))

    # Make sure the output directory exists before writing the result file.
    os.makedirs("output", exist_ok=True)
    output_path = "output/manchester_complete_result.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(scrape_result, f, ensure_ascii=False, indent=2)

    print("\nResult saved to", output_path)
    print("\n=== Summary ===")
    for school in sorted(scrape_result["schools"], key=lambda s: -len(s.get("faculty", []))):
        print(
            f" {school['name']}: "
            f"{len(school['programs'])} programs, "
            f"{len(school.get('faculty', []))} faculty"
        )