"""
曼彻斯特大学完整采集脚本
新增特性:
- Research Explorer API 优先拉取 JSON / XML失败再回落 DOM
- 每个学院独立页面、并行抓取(默认 3 并发)
- 细粒度超时/重试/滚动/Load more 控制
- 多 URL / 备用 Staff 页面配置
- 导师目录缓存,可按学院关键词映射到项目
- 诊断信息记录(失败学院、超时学院、批次信息)
"""
import asyncio
import json
import re
from copy import deepcopy
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlencode, urljoin
from xml.etree import ElementTree as ET
from playwright.async_api import (
TimeoutError as PlaywrightTimeoutError,
async_playwright,
)
# =========================
# Configuration
# =========================
DEFAULT_REQUEST = {
"timeout_ms": 60000,
"post_wait_ms": 2500,
"wait_until": "domcontentloaded",
"max_retries": 3,
"retry_backoff_ms": 2000,
}
STAFF_CONCURRENCY = 3
SCHOOL_CONFIG: List[Dict[str, Any]] = [
{
"name": "Alliance Manchester Business School",
"keywords": [
"accounting",
"finance",
"business",
"management",
"marketing",
"mba",
"economics",
"entrepreneurship",
],
"attach_faculty_to_programs": True,
"staff_pages": [
{
"url": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
"extract_method": "table",
"request": {"timeout_ms": 60000, "wait_until": "networkidle"},
}
],
},
{
"name": "Department of Computer Science",
"keywords": [
"computer",
"software",
"data science",
"artificial intelligence",
"ai ",
"machine learning",
"cyber",
"computing",
],
"attach_faculty_to_programs": True,
"staff_pages": [
{
"url": "https://www.cs.manchester.ac.uk/about/people/academic-and-research-staff/",
"extract_method": "links",
"requires_scroll": True,
},
{
"url": "https://www.cs.manchester.ac.uk/about/people/",
"extract_method": "links",
"load_more_selector": "button.load-more",
"max_load_more": 6,
},
],
},
{
"name": "Department of Physics and Astronomy",
"keywords": [
"physics",
"astronomy",
"astrophysics",
"nuclear",
"particle",
],
"attach_faculty_to_programs": True,
"staff_pages": [
{
"url": "https://www.physics.manchester.ac.uk/about/people/academic-and-research-staff/",
"extract_method": "links",
"requires_scroll": True,
}
],
},
{
"name": "Department of Electrical and Electronic Engineering",
"keywords": [
"electrical",
"electronic",
"eee",
"power systems",
"microelectronics",
],
"attach_faculty_to_programs": True,
"staff_pages": [
{
"url": "https://www.eee.manchester.ac.uk/about/people/academic-and-research-staff/",
"extract_method": "links",
"requires_scroll": True,
}
],
},
{
"name": "Department of Chemistry",
"keywords": ["chemistry", "chemical"],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 200},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/department-of-chemistry/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
"request": {
"timeout_ms": 120000,
"wait_until": "networkidle",
"post_wait_ms": 5000,
},
}
],
},
{
"name": "Department of Mathematics",
"keywords": [
"mathematics",
"mathematical",
"applied math",
"statistics",
"actuarial",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 200},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/department-of-mathematics/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "School of Engineering",
"keywords": [
"engineering",
"mechanical",
"aerospace",
"civil",
"structural",
"materials",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 400},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/school-of-engineering/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "Faculty of Biology, Medicine and Health",
"keywords": [
"medicine",
"medical",
"health",
"nursing",
"pharmacy",
"clinical",
"dental",
"optometry",
"biology",
"biomedical",
"anatomical",
"physiotherapy",
"midwifery",
"mental health",
"psychology",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 400},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/faculty-of-biology-medicine-and-health/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "School of Social Sciences",
"keywords": [
"sociology",
"politics",
"international",
"social",
"criminology",
"anthropology",
"philosophy",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 200},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/school-of-social-sciences/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "School of Law",
"keywords": ["law", "legal", "llm"],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 200},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/school-of-law/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "School of Arts, Languages and Cultures",
"keywords": [
"arts",
"languages",
"culture",
"music",
"drama",
"theatre",
"history",
"linguistics",
"literature",
"translation",
"classics",
"archaeology",
"religion",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 400},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/school-of-arts-languages-and-cultures/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "School of Environment, Education and Development",
"keywords": [
"environment",
"education",
"development",
"planning",
"architecture",
"urban",
"geography",
"sustainability",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 300},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/school-of-environment-education-and-development/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
]
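# Illustrative template for adding another school (hypothetical values, not
# part of the original configuration; only fields used by the code below):
#
# {
#     "name": "Example School",
#     "keywords": ["example keyword"],
#     "attach_faculty_to_programs": True,
#     "extract_method": "research_explorer",      # optional school-wide default
#     "research_explorer": {"page_size": 200},    # only used by the API path
#     "staff_pages": [
#         {"url": "https://example.manchester.ac.uk/people/",
#          "extract_method": "links", "requires_scroll": True},
#     ],
# }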
SCHOOL_LOOKUP = {cfg["name"]: cfg for cfg in SCHOOL_CONFIG}
# =========================
# JS extraction functions
# =========================
JS_EXTRACT_TABLE_STAFF = """() => {
const staff = [];
const seen = new Set();
document.querySelectorAll('table tr').forEach(row => {
const cells = row.querySelectorAll('td');
if (cells.length >= 2) {
const link = cells[1]?.querySelector('a[href]') || cells[0]?.querySelector('a[href]');
const titleCell = cells[2] || cells[1];
if (link) {
const name = link.innerText.trim();
const url = link.href;
const title = titleCell ? titleCell.innerText.trim() : '';
if (name.length > 2 && !name.toLowerCase().includes('skip') && !seen.has(url)) {
seen.add(url);
staff.push({
name,
url,
title
});
}
}
}
});
return staff;
}"""
JS_EXTRACT_LINK_STAFF = """() => {
const staff = [];
const seen = new Set();
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href;
const text = a.innerText.trim();
if (seen.has(href)) return;
if (text.length < 5 || text.length > 80) return;
const lowerText = text.toLowerCase();
if (lowerText.includes('skip') ||
lowerText.includes('staff') ||
lowerText.includes('people') ||
lowerText.includes('academic') ||
lowerText.includes('research profiles')) return;
if (href.includes('/persons/') ||
href.includes('/portal/en/researchers/') ||
href.includes('/profile/') ||
href.includes('/people/')) {
seen.add(href);
staff.push({
name: text,
url: href,
title: ''
});
}
});
return staff;
}"""
JS_EXTRACT_RESEARCH_EXPLORER = """() => {
const staff = [];
const seen = new Set();
document.querySelectorAll('a.link.person').forEach(a => {
const href = a.href;
const text = a.innerText.trim();
if (!seen.has(href) && text.length > 3 && text.length < 80) {
seen.add(href);
staff.push({
name: text,
url: href,
title: ''
});
}
});
if (staff.length === 0) {
document.querySelectorAll('a[href*="/persons/"]').forEach(a => {
const href = a.href;
const text = a.innerText.trim();
const lower = text.toLowerCase();
if (seen.has(href)) return;
if (text.length < 3 || text.length > 80) return;
if (lower.includes('person') || lower.includes('next') || lower.includes('previous')) return;
seen.add(href);
staff.push({
name: text,
url: href,
title: ''
});
});
}
return staff;
}"""
JS_EXTRACT_PROGRAMS = """() => {
const programs = [];
const seen = new Set();
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href;
const text = a.innerText.trim().replace(/\\s+/g, ' ');
if (!href || seen.has(href)) return;
if (text.length < 10 || text.length > 200) return;
const hrefLower = href.toLowerCase();
const textLower = text.toLowerCase();
const isNav = textLower === 'courses' ||
textLower === 'masters' ||
textLower.includes('admission') ||
textLower.includes('fees') ||
textLower.includes('skip to') ||
textLower.includes('search') ||
textLower.includes('contact') ||
hrefLower.includes('#');
if (isNav) return;
const hasNumericId = /\\/\\d{5}\\//.test(href);
const isCoursePage = hrefLower.includes('/courses/list/') && hasNumericId;
if (isCoursePage) {
seen.add(href);
programs.push({
name: text,
url: href
});
}
});
return programs;
}"""
# =========================
# Data matching
# =========================
def match_program_to_school(program_name: str) -> str:
lower = program_name.lower()
for school in SCHOOL_CONFIG:
for keyword in school["keywords"]:
if keyword in lower:
return school["name"]
return "Other Programs"
# =========================
# Request and parsing helpers
# =========================
def _merge_request_settings(*layers: Optional[Dict[str, Any]]) -> Dict[str, Any]:
settings = dict(DEFAULT_REQUEST)
for layer in layers:
if not layer:
continue
for key, value in layer.items():
if value is not None:
settings[key] = value
settings["max_retries"] = max(1, int(settings.get("max_retries", 1)))
settings["retry_backoff_ms"] = settings.get("retry_backoff_ms", 2000)
return settings
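# Illustrative example (values taken from the Chemistry staff page config above):
# _merge_request_settings(None, {"timeout_ms": 120000, "wait_until": "networkidle",
# "post_wait_ms": 5000}) starts from DEFAULT_REQUEST and overrides those three
# keys, while "max_retries" (3) and "retry_backoff_ms" (2000 ms) keep their defaults.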
async def _goto_with_retry(page, url: str, settings: Dict[str, Any], label: str) -> Tuple[bool, Optional[str]]:
last_error = None
for attempt in range(settings["max_retries"]):
try:
await page.goto(url, wait_until=settings["wait_until"], timeout=settings["timeout_ms"])
if settings.get("wait_for_selector"):
await page.wait_for_selector(settings["wait_for_selector"], timeout=settings["timeout_ms"])
if settings.get("post_wait_ms"):
await page.wait_for_timeout(settings["post_wait_ms"])
return True, None
except PlaywrightTimeoutError as exc:
last_error = f"Timeout: {exc}"
except Exception as exc: # noqa: BLE001
last_error = str(exc)
if attempt < settings["max_retries"] - 1:
await page.wait_for_timeout(settings["retry_backoff_ms"] * (attempt + 1))
return False, last_error
async def _perform_scroll(page, repetitions: int = 5, delay_ms: int = 800):
repetitions = max(1, repetitions)
for i in range(repetitions):
await page.evaluate("(y) => window.scrollTo(0, y)", 2000 * (i + 1))
await page.wait_for_timeout(delay_ms)
async def _load_more(page, selector: str, max_clicks: int = 5, wait_ms: int = 1500):
for _ in range(max_clicks):
button = await page.query_selector(selector)
if not button:
break
try:
await button.click()
await page.wait_for_timeout(wait_ms)
except Exception:
break
def _deduplicate_staff(staff: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
seen = set()
cleaned = []
for item in staff:
name = (item.get("name") or "").strip()
if not name:
continue
url = (item.get("url") or "").strip()
key = url or name.lower()
if key in seen:
continue
seen.add(key)
cleaned.append({"name": name, "url": url, "title": (item.get("title") or "").strip()})
return cleaned
def _append_query(url: str, params: Dict[str, Any]) -> str:
delimiter = "&" if "?" in url else "?"
return f"{url}{delimiter}{urlencode(params)}"
def _guess_research_slug(staff_url: Optional[str]) -> Optional[str]:
    if not staff_url:
        return None
    parts = [part for part in staff_url.rstrip("/").split("/") if part]
    # Staff URLs look like ".../organisations/<org-slug>/persons/"; the
    # organisation slug is the segment before the trailing "persons".
    if parts and parts[-1].lower() == "persons":
        parts = parts[:-1]
    return parts[-1] if parts else None
def _parse_research_explorer_json(data: Any, base_url: str) -> List[Dict[str, str]]:
items: List[Dict[str, Any]] = []
if isinstance(data, list):
items = data
elif isinstance(data, dict):
for key in ("results", "items", "persons", "data", "entities"):
if isinstance(data.get(key), list):
items = data[key]
break
if not items and isinstance(data.get("rows"), list):
items = data["rows"]
staff = []
for item in items:
if not isinstance(item, dict):
continue
name = item.get("name") or item.get("title") or item.get("fullName")
profile_url = item.get("url") or item.get("href") or item.get("link") or item.get("primaryURL")
if not name:
continue
if profile_url:
profile_url = urljoin(base_url, profile_url)
staff.append(
{
"name": name.strip(),
"url": (profile_url or "").strip(),
"title": (item.get("jobTitle") or item.get("position") or "").strip(),
}
)
return staff
def _parse_research_explorer_xml(text: str, base_url: str) -> List[Dict[str, str]]:
staff: List[Dict[str, str]] = []
try:
root = ET.fromstring(text)
except ET.ParseError:
return staff
for entry in root.findall(".//{http://www.w3.org/2005/Atom}entry"):
title = entry.findtext("{http://www.w3.org/2005/Atom}title", default="")
link = entry.find("{http://www.w3.org/2005/Atom}link")
href = link.attrib.get("href") if link is not None else ""
if title:
staff.append(
{
"name": title.strip(),
"url": urljoin(base_url, href) if href else "",
"title": "",
}
)
return staff
async def fetch_research_explorer_api(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]:
config = school_config.get("research_explorer") or {}
if not config and school_config.get("extract_method") != "research_explorer":
return []
base_staff_url = ""
if school_config.get("staff_pages"):
base_staff_url = school_config["staff_pages"][0].get("url", "")
page_size = config.get("page_size", 200)
timeout_ms = config.get("timeout_ms", 70000)
candidates: List[str] = []
slug = config.get("org_slug") or _guess_research_slug(base_staff_url)
base_api = config.get("api_base", "https://research.manchester.ac.uk/ws/portalapi.aspx")
if config.get("api_url"):
candidates.append(config["api_url"])
if slug:
params = {
"action": "search",
"language": "en",
"format": "json",
"site": "default",
"showall": "true",
"pageSize": page_size,
"organisations": slug,
}
candidates.append(f"{base_api}?{urlencode(params)}")
if base_staff_url:
candidates.append(_append_query(base_staff_url, {"format": "json", "limit": page_size}))
candidates.append(_append_query(base_staff_url, {"format": "xml", "limit": page_size}))
for url in candidates:
try:
resp = await context.request.get(url, timeout=timeout_ms)
if resp.status != 200:
continue
ctype = resp.headers.get("content-type", "")
if "json" in ctype:
data = await resp.json()
parsed = _parse_research_explorer_json(data, base_staff_url)
else:
text = await resp.text()
parsed = _parse_research_explorer_xml(text, base_staff_url)
parsed = _deduplicate_staff(parsed)
if parsed:
if output_callback:
output_callback("info", f" {school_config['name']}: {len(parsed)} staff via API")
return parsed
except Exception as exc: # noqa: BLE001
if output_callback:
output_callback(
"warning", f" {school_config['name']}: API fetch failed ({str(exc)[:60]})"
)
return []
async def scrape_staff_via_browser(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]:
staff_collected: List[Dict[str, str]] = []
staff_pages = school_config.get("staff_pages") or []
if not staff_pages and school_config.get("staff_url"):
staff_pages = [{"url": school_config["staff_url"], "extract_method": school_config.get("extract_method")}]
page = await context.new_page()
blocked_types = school_config.get("blocked_resources", ["image", "font", "media"])
if blocked_types:
async def _route_handler(route):
if route.request.resource_type in blocked_types:
await route.abort()
else:
await route.continue_()
await page.route("**/*", _route_handler)
for page_cfg in staff_pages:
target_url = page_cfg.get("url")
if not target_url:
continue
settings = _merge_request_settings(school_config.get("request"), page_cfg.get("request"))
success, error = await _goto_with_retry(page, target_url, settings, school_config["name"])
if not success:
if output_callback:
output_callback("warning", f" {school_config['name']}: failed to load {target_url} ({error})")
continue
if page_cfg.get("requires_scroll"):
await _perform_scroll(page, page_cfg.get("scroll_times", 6), page_cfg.get("scroll_delay_ms", 700))
if page_cfg.get("load_from_selector"):
await _load_more(page, page_cfg["load_from_selector"], page_cfg.get("max_load_more", 5))
elif page_cfg.get("load_more_selector"):
await _load_more(page, page_cfg["load_more_selector"], page_cfg.get("max_load_more", 5))
method = page_cfg.get("extract_method") or school_config.get("extract_method") or "links"
if method == "table":
extracted = await page.evaluate(JS_EXTRACT_TABLE_STAFF)
elif method == "research_explorer":
extracted = await page.evaluate(JS_EXTRACT_RESEARCH_EXPLORER)
else:
extracted = await page.evaluate(JS_EXTRACT_LINK_STAFF)
staff_collected.extend(extracted)
await page.close()
return _deduplicate_staff(staff_collected)
# =========================
# Scrape school staff concurrently
# =========================
async def scrape_school_staff(context, school_config: Dict[str, Any], semaphore, output_callback):
async with semaphore:
staff_list: List[Dict[str, str]] = []
status = "success"
error: Optional[str] = None
try:
if school_config.get("extract_method") == "research_explorer":
staff_list = await fetch_research_explorer_api(context, school_config, output_callback)
if not staff_list:
staff_list = await scrape_staff_via_browser(context, school_config, output_callback)
if output_callback:
output_callback("info", f" {school_config['name']}: total {len(staff_list)} staff")
except Exception as exc: # noqa: BLE001
status = "error"
error = str(exc)
if output_callback:
output_callback("error", f" {school_config['name']}: {error}")
return {
"name": school_config["name"],
"staff": staff_list,
"status": status,
"error": error,
}
async def scrape_all_school_staff(context, output_callback):
semaphore = asyncio.Semaphore(STAFF_CONCURRENCY)
tasks = [
asyncio.create_task(scrape_school_staff(context, cfg, semaphore, output_callback))
for cfg in SCHOOL_CONFIG
]
results = await asyncio.gather(*tasks)
staff_map = {}
diagnostics = {"failed": [], "success": [], "total": len(results)}
for res in results:
if res["staff"]:
staff_map[res["name"]] = res["staff"]
diagnostics["success"].append(res["name"])
else:
diagnostics["failed"].append(
{
"name": res["name"],
"status": res["status"],
"error": res.get("error"),
}
)
return staff_map, diagnostics
# =========================
# Main workflow
# =========================
async def scrape(output_callback=None):
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
)
base_url = "https://www.manchester.ac.uk/"
result = {
"name": "The University of Manchester",
"url": base_url,
"scraped_at": datetime.now(timezone.utc).isoformat(),
"schools": [],
"diagnostics": {},
}
try:
            # Step 1: masters programme list
if output_callback:
output_callback("info", "Step 1: Scraping masters programs list...")
page = await context.new_page()
courses_url = "https://www.manchester.ac.uk/study/masters/courses/list/"
await page.goto(courses_url, wait_until="domcontentloaded", timeout=40000)
await page.wait_for_timeout(3000)
programs_data = await page.evaluate(JS_EXTRACT_PROGRAMS)
await page.close()
if output_callback:
output_callback("info", f"Found {len(programs_data)} masters programs")
            # Step 2: scrape school staff in parallel
if output_callback:
output_callback("info", "Step 2: Scraping faculty from staff pages (parallel)...")
school_staff, diagnostics = await scrape_all_school_staff(context, output_callback)
            # Step 3: organise the data
schools_dict: Dict[str, Dict[str, Any]] = {}
for prog in programs_data:
school_name = match_program_to_school(prog["name"])
if school_name not in schools_dict:
schools_dict[school_name] = {
"name": school_name,
"url": "",
"programs": [],
"faculty": school_staff.get(school_name, []),
"faculty_source": "school_directory" if school_staff.get(school_name) else "",
}
schools_dict[school_name]["programs"].append(
{
"name": prog["name"],
"url": prog["url"],
"faculty": [],
}
)
for cfg in SCHOOL_CONFIG:
if cfg["name"] in schools_dict:
first_page = (cfg.get("staff_pages") or [{}])[0]
schools_dict[cfg["name"]]["url"] = first_page.get("url") or cfg.get("staff_url", "")
_attach_faculty_to_programs(schools_dict, school_staff)
result["schools"] = list(schools_dict.values())
total_programs = sum(len(s["programs"]) for s in result["schools"])
total_faculty = sum(len(s.get("faculty", [])) for s in result["schools"])
result["diagnostics"] = {
"total_programs": total_programs,
"total_faculty_records": total_faculty,
"school_staff_success": diagnostics.get("success", []),
"school_staff_failed": diagnostics.get("failed", []),
}
if output_callback:
output_callback(
"info",
f"Done! {len(result['schools'])} schools, {total_programs} programs, {total_faculty} faculty",
)
except Exception as exc: # noqa: BLE001
if output_callback:
output_callback("error", f"Scraping error: {str(exc)}")
finally:
await browser.close()
return result
def _attach_faculty_to_programs(schools_dict: Dict[str, Dict[str, Any]], staff_map: Dict[str, List[Dict[str, str]]]):
for school_name, school_data in schools_dict.items():
staff = staff_map.get(school_name, [])
cfg = SCHOOL_LOOKUP.get(school_name, {})
if not staff or not cfg.get("attach_faculty_to_programs"):
continue
limit = cfg.get("faculty_per_program")
for program in school_data["programs"]:
sliced = deepcopy(staff[:limit] if limit else staff)
program["faculty"] = sliced
# =========================
# CLI
# =========================
if __name__ == "__main__":
    import os
    import sys
if sys.platform == "win32":
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
def print_callback(level, msg):
print(f"[{level}] {msg}")
scrape_result = asyncio.run(scrape(output_callback=print_callback))
    output_path = "output/manchester_complete_result.json"
    os.makedirs("output", exist_ok=True)  # ensure the output directory exists
    with open(output_path, "w", encoding="utf-8") as f:
json.dump(scrape_result, f, ensure_ascii=False, indent=2)
print("\nResult saved to", output_path)
print("\n=== Summary ===")
for school in sorted(scrape_result["schools"], key=lambda s: -len(s.get("faculty", []))):
print(
f" {school['name']}: "
f"{len(school['programs'])} programs, "
f"{len(school.get('faculty', []))} faculty"
)