Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

artifacts/manchester_complete_scraper.py (new file, 910 lines)
@@ -0,0 +1,910 @@
"""
|
||||
曼彻斯特大学完整采集脚本
|
||||
新增特性:
|
||||
- Research Explorer API 优先拉取 JSON / XML,失败再回落 DOM
|
||||
- 每个学院独立页面、并行抓取(默认 3 并发)
|
||||
- 细粒度超时/重试/滚动/Load more 控制
|
||||
- 多 URL / 备用 Staff 页面配置
|
||||
- 导师目录缓存,可按学院关键词映射到项目
|
||||
- 诊断信息记录(失败学院、超时学院、批次信息)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from urllib.parse import urlencode, urljoin
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
from playwright.async_api import (
|
||||
TimeoutError as PlaywrightTimeoutError,
|
||||
async_playwright,
|
||||
)
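
# Example invocation (a sketch; assumes Playwright and its Chromium build are
# already installed, e.g. `pip install playwright && playwright install chromium`):
#
#   python artifacts/manchester_complete_scraper.py
#
# This runs scrape() with a print callback and writes
# output/manchester_complete_result.json.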

# =========================
# Configuration
# =========================

DEFAULT_REQUEST = {
    "timeout_ms": 60000,
    "post_wait_ms": 2500,
    "wait_until": "domcontentloaded",
    "max_retries": 3,
    "retry_backoff_ms": 2000,
}

STAFF_CONCURRENCY = 3

SCHOOL_CONFIG: List[Dict[str, Any]] = [
    {
        "name": "Alliance Manchester Business School",
        "keywords": [
            "accounting",
            "finance",
            "business",
            "management",
            "marketing",
            "mba",
            "economics",
            "entrepreneurship",
        ],
        "attach_faculty_to_programs": True,
        "staff_pages": [
            {
                "url": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
                "extract_method": "table",
                "request": {"timeout_ms": 60000, "wait_until": "networkidle"},
            }
        ],
    },
    {
        "name": "Department of Computer Science",
        "keywords": [
            "computer",
            "software",
            "data science",
            "artificial intelligence",
            "ai ",
            "machine learning",
            "cyber",
            "computing",
        ],
        "attach_faculty_to_programs": True,
        "staff_pages": [
            {
                "url": "https://www.cs.manchester.ac.uk/about/people/academic-and-research-staff/",
                "extract_method": "links",
                "requires_scroll": True,
            },
            {
                "url": "https://www.cs.manchester.ac.uk/about/people/",
                "extract_method": "links",
                "load_more_selector": "button.load-more",
                "max_load_more": 6,
            },
        ],
    },
    {
        "name": "Department of Physics and Astronomy",
        "keywords": [
            "physics",
            "astronomy",
            "astrophysics",
            "nuclear",
            "particle",
        ],
        "attach_faculty_to_programs": True,
        "staff_pages": [
            {
                "url": "https://www.physics.manchester.ac.uk/about/people/academic-and-research-staff/",
                "extract_method": "links",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "Department of Electrical and Electronic Engineering",
        "keywords": [
            "electrical",
            "electronic",
            "eee",
            "power systems",
            "microelectronics",
        ],
        "attach_faculty_to_programs": True,
        "staff_pages": [
            {
                "url": "https://www.eee.manchester.ac.uk/about/people/academic-and-research-staff/",
                "extract_method": "links",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "Department of Chemistry",
        "keywords": ["chemistry", "chemical"],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 200},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/department-of-chemistry/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
                "request": {
                    "timeout_ms": 120000,
                    "wait_until": "networkidle",
                    "post_wait_ms": 5000,
                },
            }
        ],
    },
    {
        "name": "Department of Mathematics",
        "keywords": [
            "mathematics",
            "mathematical",
            "applied math",
            "statistics",
            "actuarial",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 200},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/department-of-mathematics/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "School of Engineering",
        "keywords": [
            "engineering",
            "mechanical",
            "aerospace",
            "civil",
            "structural",
            "materials",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 400},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/school-of-engineering/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "Faculty of Biology, Medicine and Health",
        "keywords": [
            "medicine",
            "medical",
            "health",
            "nursing",
            "pharmacy",
            "clinical",
            "dental",
            "optometry",
            "biology",
            "biomedical",
            "anatomical",
            "physiotherapy",
            "midwifery",
            "mental health",
            "psychology",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 400},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/faculty-of-biology-medicine-and-health/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "School of Social Sciences",
        "keywords": [
            "sociology",
            "politics",
            "international",
            "social",
            "criminology",
            "anthropology",
            "philosophy",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 200},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/school-of-social-sciences/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "School of Law",
        "keywords": ["law", "legal", "llm"],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 200},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/school-of-law/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "School of Arts, Languages and Cultures",
        "keywords": [
            "arts",
            "languages",
            "culture",
            "music",
            "drama",
            "theatre",
            "history",
            "linguistics",
            "literature",
            "translation",
            "classics",
            "archaeology",
            "religion",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 400},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/school-of-arts-languages-and-cultures/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
    {
        "name": "School of Environment, Education and Development",
        "keywords": [
            "environment",
            "education",
            "development",
            "planning",
            "architecture",
            "urban",
            "geography",
            "sustainability",
        ],
        "attach_faculty_to_programs": True,
        "extract_method": "research_explorer",
        "research_explorer": {"page_size": 300},
        "staff_pages": [
            {
                "url": "https://research.manchester.ac.uk/en/organisations/school-of-environment-education-and-development/persons/",
                "extract_method": "research_explorer",
                "requires_scroll": True,
            }
        ],
    },
]

SCHOOL_LOOKUP = {cfg["name"]: cfg for cfg in SCHOOL_CONFIG}
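
# Adding a school is a config-only change. A hypothetical entry (the slug and
# keywords below are illustrative, not part of the real configuration):
#
#   {
#       "name": "School of Materials",
#       "keywords": ["materials science", "polymer"],
#       "attach_faculty_to_programs": True,
#       "extract_method": "research_explorer",
#       "research_explorer": {"page_size": 200},
#       "staff_pages": [
#           {
#               "url": "https://research.manchester.ac.uk/en/organisations/school-of-materials/persons/",
#               "extract_method": "research_explorer",
#               "requires_scroll": True,
#           }
#       ],
#   }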

# =========================
# JS extraction functions
# =========================

JS_EXTRACT_TABLE_STAFF = """() => {
    const staff = [];
    const seen = new Set();

    document.querySelectorAll('table tr').forEach(row => {
        const cells = row.querySelectorAll('td');
        if (cells.length >= 2) {
            const link = cells[1]?.querySelector('a[href]') || cells[0]?.querySelector('a[href]');
            const titleCell = cells[2] || cells[1];

            if (link) {
                const name = link.innerText.trim();
                const url = link.href;
                const title = titleCell ? titleCell.innerText.trim() : '';

                if (name.length > 2 && !name.toLowerCase().includes('skip') && !seen.has(url)) {
                    seen.add(url);
                    staff.push({
                        name,
                        url,
                        title
                    });
                }
            }
        }
    });

    return staff;
}"""

JS_EXTRACT_LINK_STAFF = """() => {
    const staff = [];
    const seen = new Set();

    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href;
        const text = a.innerText.trim();

        if (seen.has(href)) return;
        if (text.length < 5 || text.length > 80) return;

        const lowerText = text.toLowerCase();
        if (lowerText.includes('skip') ||
            lowerText.includes('staff') ||
            lowerText.includes('people') ||
            lowerText.includes('academic') ||
            lowerText.includes('research profiles')) return;

        if (href.includes('/persons/') ||
            href.includes('/portal/en/researchers/') ||
            href.includes('/profile/') ||
            href.includes('/people/')) {
            seen.add(href);
            staff.push({
                name: text,
                url: href,
                title: ''
            });
        }
    });

    return staff;
}"""

JS_EXTRACT_RESEARCH_EXPLORER = """() => {
    const staff = [];
    const seen = new Set();

    document.querySelectorAll('a.link.person').forEach(a => {
        const href = a.href;
        const text = a.innerText.trim();

        if (!seen.has(href) && text.length > 3 && text.length < 80) {
            seen.add(href);
            staff.push({
                name: text,
                url: href,
                title: ''
            });
        }
    });

    if (staff.length === 0) {
        document.querySelectorAll('a[href*="/persons/"]').forEach(a => {
            const href = a.href;
            const text = a.innerText.trim();
            const lower = text.toLowerCase();

            if (seen.has(href)) return;
            if (text.length < 3 || text.length > 80) return;
            if (lower.includes('person') || lower.includes('next') || lower.includes('previous')) return;

            seen.add(href);
            staff.push({
                name: text,
                url: href,
                title: ''
            });
        });
    }

    return staff;
}"""

JS_EXTRACT_PROGRAMS = """() => {
    const programs = [];
    const seen = new Set();

    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href;
        const text = a.innerText.trim().replace(/\\s+/g, ' ');

        if (!href || seen.has(href)) return;
        if (text.length < 10 || text.length > 200) return;

        const hrefLower = href.toLowerCase();
        const textLower = text.toLowerCase();

        const isNav = textLower === 'courses' ||
            textLower === 'masters' ||
            textLower.includes('admission') ||
            textLower.includes('fees') ||
            textLower.includes('skip to') ||
            textLower.includes('search') ||
            textLower.includes('contact') ||
            hrefLower.includes('#');
        if (isNav) return;

        const hasNumericId = /\\/\\d{5}\\//.test(href);
        const isCoursePage = hrefLower.includes('/courses/list/') && hasNumericId;

        if (isCoursePage) {
            seen.add(href);
            programs.push({
                name: text,
                url: href
            });
        }
    });

    return programs;
}"""


# =========================
# Data matching
# =========================


def match_program_to_school(program_name: str) -> str:
    """Map a programme name to a school via the first keyword match."""
    lower = program_name.lower()
    for school in SCHOOL_CONFIG:
        for keyword in school["keywords"]:
            if keyword in lower:
                return school["name"]
    return "Other Programs"
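
# A quick illustration against the config above (hypothetical programme names);
# the first keyword hit wins, in SCHOOL_CONFIG order:
#
#   match_program_to_school("MSc Advanced Computer Science")
#   -> "Department of Computer Science"      (matches "computer")
#   match_program_to_school("MSc Finance")
#   -> "Alliance Manchester Business School" (matches "finance")
#   match_program_to_school("MA Egyptology")
#   -> "Other Programs"                      (no keyword matches)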


# =========================
# Request and parsing helpers
# =========================


def _merge_request_settings(*layers: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    settings = dict(DEFAULT_REQUEST)
    for layer in layers:
        if not layer:
            continue
        for key, value in layer.items():
            if value is not None:
                settings[key] = value
    settings["max_retries"] = max(1, int(settings.get("max_retries", 1)))
    settings["retry_backoff_ms"] = settings.get("retry_backoff_ms", 2000)
    return settings
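
# Layering example (illustrative values): later layers override earlier ones,
# everything sits on top of DEFAULT_REQUEST, and None values never win:
#
#   _merge_request_settings({"timeout_ms": 120000}, {"wait_until": "networkidle"})
#   -> {"timeout_ms": 120000, "wait_until": "networkidle",
#       "post_wait_ms": 2500, "max_retries": 3, "retry_backoff_ms": 2000}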


async def _goto_with_retry(page, url: str, settings: Dict[str, Any], label: str) -> Tuple[bool, Optional[str]]:
    last_error = None
    for attempt in range(settings["max_retries"]):
        try:
            await page.goto(url, wait_until=settings["wait_until"], timeout=settings["timeout_ms"])
            if settings.get("wait_for_selector"):
                await page.wait_for_selector(settings["wait_for_selector"], timeout=settings["timeout_ms"])
            if settings.get("post_wait_ms"):
                await page.wait_for_timeout(settings["post_wait_ms"])
            return True, None
        except PlaywrightTimeoutError as exc:
            last_error = f"Timeout: {exc}"
        except Exception as exc:  # noqa: BLE001
            last_error = str(exc)

        if attempt < settings["max_retries"] - 1:
            await page.wait_for_timeout(settings["retry_backoff_ms"] * (attempt + 1))

    return False, last_error


async def _perform_scroll(page, repetitions: int = 5, delay_ms: int = 800):
    repetitions = max(1, repetitions)
    for i in range(repetitions):
        await page.evaluate("(y) => window.scrollTo(0, y)", 2000 * (i + 1))
        await page.wait_for_timeout(delay_ms)


async def _load_more(page, selector: str, max_clicks: int = 5, wait_ms: int = 1500):
    for _ in range(max_clicks):
        button = await page.query_selector(selector)
        if not button:
            break
        try:
            await button.click()
            await page.wait_for_timeout(wait_ms)
        except Exception:
            break


def _deduplicate_staff(staff: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    seen = set()
    cleaned = []
    for item in staff:
        name = (item.get("name") or "").strip()
        if not name:
            continue
        url = (item.get("url") or "").strip()
        key = url or name.lower()
        if key in seen:
            continue
        seen.add(key)
        cleaned.append({"name": name, "url": url, "title": (item.get("title") or "").strip()})
    return cleaned


def _append_query(url: str, params: Dict[str, Any]) -> str:
    delimiter = "&" if "?" in url else "?"
    return f"{url}{delimiter}{urlencode(params)}"


def _guess_research_slug(staff_url: Optional[str]) -> Optional[str]:
    if not staff_url:
        return None
    parts = [part for part in staff_url.rstrip("/").split("/") if part]
    if not parts:
        return None
    # Staff listings look like .../organisations/<slug>/persons/, so the slug is
    # the segment before a trailing "persons", not the last segment itself.
    if parts[-1] == "persons" and len(parts) >= 2:
        return parts[-2]
    return parts[-1]


def _parse_research_explorer_json(data: Any, base_url: str) -> List[Dict[str, str]]:
    items: List[Dict[str, Any]] = []
    if isinstance(data, list):
        items = data
    elif isinstance(data, dict):
        for key in ("results", "items", "persons", "data", "entities"):
            if isinstance(data.get(key), list):
                items = data[key]
                break
        if not items and isinstance(data.get("rows"), list):
            items = data["rows"]

    staff = []
    for item in items:
        if not isinstance(item, dict):
            continue
        name = item.get("name") or item.get("title") or item.get("fullName")
        profile_url = item.get("url") or item.get("href") or item.get("link") or item.get("primaryURL")
        if not name:
            continue
        if profile_url:
            profile_url = urljoin(base_url, profile_url)
        staff.append(
            {
                "name": name.strip(),
                "url": (profile_url or "").strip(),
                "title": (item.get("jobTitle") or item.get("position") or "").strip(),
            }
        )
    return staff


def _parse_research_explorer_xml(text: str, base_url: str) -> List[Dict[str, str]]:
    staff: List[Dict[str, str]] = []
    try:
        root = ET.fromstring(text)
    except ET.ParseError:
        return staff

    for entry in root.findall(".//{http://www.w3.org/2005/Atom}entry"):
        title = entry.findtext("{http://www.w3.org/2005/Atom}title", default="")
        link = entry.find("{http://www.w3.org/2005/Atom}link")
        href = link.attrib.get("href") if link is not None else ""
        if title:
            staff.append(
                {
                    "name": title.strip(),
                    "url": urljoin(base_url, href) if href else "",
                    "title": "",
                }
            )
    return staff


async def fetch_research_explorer_api(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]:
    config = school_config.get("research_explorer") or {}
    if not config and school_config.get("extract_method") != "research_explorer":
        return []

    base_staff_url = ""
    if school_config.get("staff_pages"):
        base_staff_url = school_config["staff_pages"][0].get("url", "")

    page_size = config.get("page_size", 200)
    timeout_ms = config.get("timeout_ms", 70000)

    candidates: List[str] = []
    slug = config.get("org_slug") or _guess_research_slug(base_staff_url)
    base_api = config.get("api_base", "https://research.manchester.ac.uk/ws/portalapi.aspx")

    if config.get("api_url"):
        candidates.append(config["api_url"])

    if slug:
        params = {
            "action": "search",
            "language": "en",
            "format": "json",
            "site": "default",
            "showall": "true",
            "pageSize": page_size,
            "organisations": slug,
        }
        candidates.append(f"{base_api}?{urlencode(params)}")

    if base_staff_url:
        candidates.append(_append_query(base_staff_url, {"format": "json", "limit": page_size}))
        candidates.append(_append_query(base_staff_url, {"format": "xml", "limit": page_size}))

    for url in candidates:
        try:
            resp = await context.request.get(url, timeout=timeout_ms)
            if resp.status != 200:
                continue
            ctype = resp.headers.get("content-type", "")
            if "json" in ctype:
                data = await resp.json()
                parsed = _parse_research_explorer_json(data, base_staff_url)
            else:
                text = await resp.text()
                parsed = _parse_research_explorer_xml(text, base_staff_url)
            parsed = _deduplicate_staff(parsed)
            if parsed:
                if output_callback:
                    output_callback("info", f" {school_config['name']}: {len(parsed)} staff via API")
                return parsed
        except Exception as exc:  # noqa: BLE001
            if output_callback:
                output_callback(
                    "warning", f" {school_config['name']}: API fetch failed ({str(exc)[:60]})"
                )
    return []
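
# The API candidates above are tried in order (a sketch; query values are
# illustrative):
#   1. config["api_url"], if set explicitly
#   2. https://research.manchester.ac.uk/ws/portalapi.aspx?action=search&...&organisations=<slug>
#   3. <staff page URL>?format=json&limit=<page_size>
#   4. <staff page URL>?format=xml&limit=<page_size>
# The first candidate that returns HTTP 200 with parseable people wins; if none
# does, the caller falls back to browser-based DOM scraping.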


async def scrape_staff_via_browser(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]:
    staff_collected: List[Dict[str, str]] = []
    staff_pages = school_config.get("staff_pages") or []
    if not staff_pages and school_config.get("staff_url"):
        staff_pages = [{"url": school_config["staff_url"], "extract_method": school_config.get("extract_method")}]

    page = await context.new_page()
    blocked_types = school_config.get("blocked_resources", ["image", "font", "media"])
    if blocked_types:

        async def _route_handler(route):
            if route.request.resource_type in blocked_types:
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", _route_handler)

    try:
        for page_cfg in staff_pages:
            target_url = page_cfg.get("url")
            if not target_url:
                continue

            settings = _merge_request_settings(school_config.get("request"), page_cfg.get("request"))
            success, error = await _goto_with_retry(page, target_url, settings, school_config["name"])
            if not success:
                if output_callback:
                    output_callback("warning", f" {school_config['name']}: failed to load {target_url} ({error})")
                continue

            if page_cfg.get("requires_scroll"):
                await _perform_scroll(page, page_cfg.get("scroll_times", 6), page_cfg.get("scroll_delay_ms", 700))

            # Accept either spelling of the "load more" selector key.
            load_selector = page_cfg.get("load_more_selector") or page_cfg.get("load_from_selector")
            if load_selector:
                await _load_more(page, load_selector, page_cfg.get("max_load_more", 5))

            method = page_cfg.get("extract_method") or school_config.get("extract_method") or "links"
            if method == "table":
                extracted = await page.evaluate(JS_EXTRACT_TABLE_STAFF)
            elif method == "research_explorer":
                extracted = await page.evaluate(JS_EXTRACT_RESEARCH_EXPLORER)
            else:
                extracted = await page.evaluate(JS_EXTRACT_LINK_STAFF)

            staff_collected.extend(extracted)
    finally:
        # Close the page even if navigation or extraction raised.
        await page.close()

    return _deduplicate_staff(staff_collected)


# =========================
# Concurrent per-school staff scraping
# =========================


async def scrape_school_staff(context, school_config: Dict[str, Any], semaphore, output_callback):
    async with semaphore:
        staff_list: List[Dict[str, str]] = []
        status = "success"
        error: Optional[str] = None

        try:
            if school_config.get("extract_method") == "research_explorer":
                staff_list = await fetch_research_explorer_api(context, school_config, output_callback)
            if not staff_list:
                staff_list = await scrape_staff_via_browser(context, school_config, output_callback)

            if output_callback:
                output_callback("info", f" {school_config['name']}: total {len(staff_list)} staff")

        except Exception as exc:  # noqa: BLE001
            status = "error"
            error = str(exc)
            if output_callback:
                output_callback("error", f" {school_config['name']}: {error}")

        return {
            "name": school_config["name"],
            "staff": staff_list,
            "status": status,
            "error": error,
        }


async def scrape_all_school_staff(context, output_callback):
    semaphore = asyncio.Semaphore(STAFF_CONCURRENCY)
    tasks = [
        asyncio.create_task(scrape_school_staff(context, cfg, semaphore, output_callback))
        for cfg in SCHOOL_CONFIG
    ]
    results = await asyncio.gather(*tasks)

    staff_map = {}
    diagnostics = {"failed": [], "success": [], "total": len(results)}
    for res in results:
        if res["staff"]:
            staff_map[res["name"]] = res["staff"]
            diagnostics["success"].append(res["name"])
        else:
            diagnostics["failed"].append(
                {
                    "name": res["name"],
                    "status": res["status"],
                    "error": res.get("error"),
                }
            )
    return staff_map, diagnostics


# =========================
# Main flow
# =========================


async def scrape(output_callback=None):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )

        base_url = "https://www.manchester.ac.uk/"
        result = {
            "name": "The University of Manchester",
            "url": base_url,
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "schools": [],
            "diagnostics": {},
        }

        try:
            # Step 1: masters programme list
            if output_callback:
                output_callback("info", "Step 1: Scraping masters programs list...")

            page = await context.new_page()
            courses_url = "https://www.manchester.ac.uk/study/masters/courses/list/"
            await page.goto(courses_url, wait_until="domcontentloaded", timeout=40000)
            await page.wait_for_timeout(3000)
            programs_data = await page.evaluate(JS_EXTRACT_PROGRAMS)
            await page.close()

            if output_callback:
                output_callback("info", f"Found {len(programs_data)} masters programs")

            # Step 2: scrape school staff in parallel
            if output_callback:
                output_callback("info", "Step 2: Scraping faculty from staff pages (parallel)...")
            school_staff, diagnostics = await scrape_all_school_staff(context, output_callback)

            # Step 3: organise the data
            schools_dict: Dict[str, Dict[str, Any]] = {}
            for prog in programs_data:
                school_name = match_program_to_school(prog["name"])
                if school_name not in schools_dict:
                    schools_dict[school_name] = {
                        "name": school_name,
                        "url": "",
                        "programs": [],
                        "faculty": school_staff.get(school_name, []),
                        "faculty_source": "school_directory" if school_staff.get(school_name) else "",
                    }

                schools_dict[school_name]["programs"].append(
                    {
                        "name": prog["name"],
                        "url": prog["url"],
                        "faculty": [],
                    }
                )

            for cfg in SCHOOL_CONFIG:
                if cfg["name"] in schools_dict:
                    first_page = (cfg.get("staff_pages") or [{}])[0]
                    schools_dict[cfg["name"]]["url"] = first_page.get("url") or cfg.get("staff_url", "")

            _attach_faculty_to_programs(schools_dict, school_staff)

            result["schools"] = list(schools_dict.values())

            total_programs = sum(len(s["programs"]) for s in result["schools"])
            total_faculty = sum(len(s.get("faculty", [])) for s in result["schools"])

            result["diagnostics"] = {
                "total_programs": total_programs,
                "total_faculty_records": total_faculty,
                "school_staff_success": diagnostics.get("success", []),
                "school_staff_failed": diagnostics.get("failed", []),
            }

            if output_callback:
                output_callback(
                    "info",
                    f"Done! {len(result['schools'])} schools, {total_programs} programs, {total_faculty} faculty",
                )

        except Exception as exc:  # noqa: BLE001
            if output_callback:
                output_callback("error", f"Scraping error: {str(exc)}")
        finally:
            await browser.close()

        return result


def _attach_faculty_to_programs(schools_dict: Dict[str, Dict[str, Any]], staff_map: Dict[str, List[Dict[str, str]]]):
    for school_name, school_data in schools_dict.items():
        staff = staff_map.get(school_name, [])
        cfg = SCHOOL_LOOKUP.get(school_name, {})
        if not staff or not cfg.get("attach_faculty_to_programs"):
            continue

        limit = cfg.get("faculty_per_program")
        for program in school_data["programs"]:
            sliced = deepcopy(staff[:limit] if limit else staff)
            program["faculty"] = sliced
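
# "faculty_per_program" is an optional per-school cap that is read here but not
# set anywhere in SCHOOL_CONFIG above; a hypothetical use would be adding, e.g.,
#   "faculty_per_program": 25,
# to a school entry so each programme carries at most 25 attached staff records.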


# =========================
# CLI
# =========================

if __name__ == "__main__":
    import os
    import sys

    if sys.platform == "win32":
        asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

    def print_callback(level, msg):
        print(f"[{level}] {msg}")

    scrape_result = asyncio.run(scrape(output_callback=print_callback))

    # Make sure the output directory exists before writing the result file.
    os.makedirs("output", exist_ok=True)
    output_path = "output/manchester_complete_result.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(scrape_result, f, ensure_ascii=False, indent=2)

    print("\nResult saved to", output_path)
    print("\n=== Summary ===")
    for school in sorted(scrape_result["schools"], key=lambda s: -len(s.get("faculty", []))):
        print(
            f" {school['name']}: "
            f"{len(school['programs'])} programs, "
            f"{len(school.get('faculty', []))} faculty"
        )