""" 曼彻斯特大学完整采集脚本 新增特性: - Research Explorer API 优先拉取 JSON / XML,失败再回落 DOM - 每个学院独立页面、并行抓取(默认 3 并发) - 细粒度超时/重试/滚动/Load more 控制 - 多 URL / 备用 Staff 页面配置 - 导师目录缓存,可按学院关键词映射到项目 - 诊断信息记录(失败学院、超时学院、批次信息) """ import asyncio import json import re from copy import deepcopy from datetime import datetime, timezone from typing import Any, Dict, List, Optional, Tuple from urllib.parse import urlencode, urljoin from xml.etree import ElementTree as ET from playwright.async_api import ( TimeoutError as PlaywrightTimeoutError, async_playwright, ) # ========================= # 配置区 # ========================= DEFAULT_REQUEST = { "timeout_ms": 60000, "post_wait_ms": 2500, "wait_until": "domcontentloaded", "max_retries": 3, "retry_backoff_ms": 2000, } STAFF_CONCURRENCY = 3 SCHOOL_CONFIG: List[Dict[str, Any]] = [ { "name": "Alliance Manchester Business School", "keywords": [ "accounting", "finance", "business", "management", "marketing", "mba", "economics", "entrepreneurship", ], "attach_faculty_to_programs": True, "staff_pages": [ { "url": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/", "extract_method": "table", "request": {"timeout_ms": 60000, "wait_until": "networkidle"}, } ], }, { "name": "Department of Computer Science", "keywords": [ "computer", "software", "data science", "artificial intelligence", "ai ", "machine learning", "cyber", "computing", ], "attach_faculty_to_programs": True, "staff_pages": [ { "url": "https://www.cs.manchester.ac.uk/about/people/academic-and-research-staff/", "extract_method": "links", "requires_scroll": True, }, { "url": "https://www.cs.manchester.ac.uk/about/people/", "extract_method": "links", "load_more_selector": "button.load-more", "max_load_more": 6, }, ], }, { "name": "Department of Physics and Astronomy", "keywords": [ "physics", "astronomy", "astrophysics", "nuclear", "particle", ], "attach_faculty_to_programs": True, "staff_pages": [ { "url": "https://www.physics.manchester.ac.uk/about/people/academic-and-research-staff/", "extract_method": "links", "requires_scroll": True, } ], }, { "name": "Department of Electrical and Electronic Engineering", "keywords": [ "electrical", "electronic", "eee", "power systems", "microelectronics", ], "attach_faculty_to_programs": True, "staff_pages": [ { "url": "https://www.eee.manchester.ac.uk/about/people/academic-and-research-staff/", "extract_method": "links", "requires_scroll": True, } ], }, { "name": "Department of Chemistry", "keywords": ["chemistry", "chemical"], "attach_faculty_to_programs": True, "extract_method": "research_explorer", "research_explorer": {"page_size": 200}, "staff_pages": [ { "url": "https://research.manchester.ac.uk/en/organisations/department-of-chemistry/persons/", "extract_method": "research_explorer", "requires_scroll": True, "request": { "timeout_ms": 120000, "wait_until": "networkidle", "post_wait_ms": 5000, }, } ], }, { "name": "Department of Mathematics", "keywords": [ "mathematics", "mathematical", "applied math", "statistics", "actuarial", ], "attach_faculty_to_programs": True, "extract_method": "research_explorer", "research_explorer": {"page_size": 200}, "staff_pages": [ { "url": "https://research.manchester.ac.uk/en/organisations/department-of-mathematics/persons/", "extract_method": "research_explorer", "requires_scroll": True, } ], }, { "name": "School of Engineering", "keywords": [ "engineering", "mechanical", "aerospace", "civil", "structural", "materials", ], "attach_faculty_to_programs": True, "extract_method": "research_explorer", "research_explorer": 
{"page_size": 400}, "staff_pages": [ { "url": "https://research.manchester.ac.uk/en/organisations/school-of-engineering/persons/", "extract_method": "research_explorer", "requires_scroll": True, } ], }, { "name": "Faculty of Biology, Medicine and Health", "keywords": [ "medicine", "medical", "health", "nursing", "pharmacy", "clinical", "dental", "optometry", "biology", "biomedical", "anatomical", "physiotherapy", "midwifery", "mental health", "psychology", ], "attach_faculty_to_programs": True, "extract_method": "research_explorer", "research_explorer": {"page_size": 400}, "staff_pages": [ { "url": "https://research.manchester.ac.uk/en/organisations/faculty-of-biology-medicine-and-health/persons/", "extract_method": "research_explorer", "requires_scroll": True, } ], }, { "name": "School of Social Sciences", "keywords": [ "sociology", "politics", "international", "social", "criminology", "anthropology", "philosophy", ], "attach_faculty_to_programs": True, "extract_method": "research_explorer", "research_explorer": {"page_size": 200}, "staff_pages": [ { "url": "https://research.manchester.ac.uk/en/organisations/school-of-social-sciences/persons/", "extract_method": "research_explorer", "requires_scroll": True, } ], }, { "name": "School of Law", "keywords": ["law", "legal", "llm"], "attach_faculty_to_programs": True, "extract_method": "research_explorer", "research_explorer": {"page_size": 200}, "staff_pages": [ { "url": "https://research.manchester.ac.uk/en/organisations/school-of-law/persons/", "extract_method": "research_explorer", "requires_scroll": True, } ], }, { "name": "School of Arts, Languages and Cultures", "keywords": [ "arts", "languages", "culture", "music", "drama", "theatre", "history", "linguistics", "literature", "translation", "classics", "archaeology", "religion", ], "attach_faculty_to_programs": True, "extract_method": "research_explorer", "research_explorer": {"page_size": 400}, "staff_pages": [ { "url": "https://research.manchester.ac.uk/en/organisations/school-of-arts-languages-and-cultures/persons/", "extract_method": "research_explorer", "requires_scroll": True, } ], }, { "name": "School of Environment, Education and Development", "keywords": [ "environment", "education", "development", "planning", "architecture", "urban", "geography", "sustainability", ], "attach_faculty_to_programs": True, "extract_method": "research_explorer", "research_explorer": {"page_size": 300}, "staff_pages": [ { "url": "https://research.manchester.ac.uk/en/organisations/school-of-environment-education-and-development/persons/", "extract_method": "research_explorer", "requires_scroll": True, } ], }, ] SCHOOL_LOOKUP = {cfg["name"]: cfg for cfg in SCHOOL_CONFIG} # ========================= # JS 提取函数 # ========================= JS_EXTRACT_TABLE_STAFF = """() => { const staff = []; const seen = new Set(); document.querySelectorAll('table tr').forEach(row => { const cells = row.querySelectorAll('td'); if (cells.length >= 2) { const link = cells[1]?.querySelector('a[href]') || cells[0]?.querySelector('a[href]'); const titleCell = cells[2] || cells[1]; if (link) { const name = link.innerText.trim(); const url = link.href; const title = titleCell ? 

# =========================
# Request & parsing utilities
# =========================
def _merge_request_settings(*layers: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    settings = dict(DEFAULT_REQUEST)
    for layer in layers:
        if not layer:
            continue
        for key, value in layer.items():
            if value is not None:
                settings[key] = value
    settings["max_retries"] = max(1, int(settings.get("max_retries", 1)))
    settings["retry_backoff_ms"] = settings.get("retry_backoff_ms", 2000)
    return settings


async def _goto_with_retry(page, url: str, settings: Dict[str, Any], label: str) -> Tuple[bool, Optional[str]]:
    # label: the school name, kept for log context.
    last_error = None
    for attempt in range(settings["max_retries"]):
        try:
            await page.goto(url, wait_until=settings["wait_until"], timeout=settings["timeout_ms"])
            if settings.get("wait_for_selector"):
                await page.wait_for_selector(settings["wait_for_selector"], timeout=settings["timeout_ms"])
            if settings.get("post_wait_ms"):
                await page.wait_for_timeout(settings["post_wait_ms"])
            return True, None
        except PlaywrightTimeoutError as exc:
            last_error = f"Timeout: {exc}"
        except Exception as exc:  # noqa: BLE001
            last_error = str(exc)
        if attempt < settings["max_retries"] - 1:
            await page.wait_for_timeout(settings["retry_backoff_ms"] * (attempt + 1))
    return False, last_error


async def _perform_scroll(page, repetitions: int = 5, delay_ms: int = 800):
    repetitions = max(1, repetitions)
    for i in range(repetitions):
        await page.evaluate("(y) => window.scrollTo(0, y)", 2000 * (i + 1))
        await page.wait_for_timeout(delay_ms)


async def _load_more(page, selector: str, max_clicks: int = 5, wait_ms: int = 1500):
    for _ in range(max_clicks):
        button = await page.query_selector(selector)
        if not button:
            break
        try:
            await button.click()
            await page.wait_for_timeout(wait_ms)
        except Exception:
            break


def _deduplicate_staff(staff: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    seen = set()
    cleaned = []
    for item in staff:
        name = (item.get("name") or "").strip()
        if not name:
            continue
        url = (item.get("url") or "").strip()
        key = url or name.lower()
        if key in seen:
            continue
        seen.add(key)
        cleaned.append({"name": name, "url": url, "title": (item.get("title") or "").strip()})
    return cleaned


def _append_query(url: str, params: Dict[str, Any]) -> str:
    delimiter = "&" if "?" in url else "?"
    return f"{url}{delimiter}{urlencode(params)}"


def _guess_research_slug(staff_url: Optional[str]) -> Optional[str]:
    if not staff_url:
        return None
    # Staff URLs look like .../organisations/<slug>/persons/, so take the
    # segment right after "organisations" rather than the trailing "persons".
    parts = [p for p in staff_url.rstrip("/").split("/") if p]
    if "organisations" in parts:
        idx = parts.index("organisations")
        if idx + 1 < len(parts):
            return parts[idx + 1]
    return parts[-1] if parts else None
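
# Illustrative: for the staff URLs used in SCHOOL_CONFIG the helper above
# yields the organisation segment, e.g.
#
#   _guess_research_slug(
#       "https://research.manchester.ac.uk/en/organisations/school-of-law/persons/"
#   )  ->  "school-of-law"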
settings.get("wait_for_selector"): await page.wait_for_selector(settings["wait_for_selector"], timeout=settings["timeout_ms"]) if settings.get("post_wait_ms"): await page.wait_for_timeout(settings["post_wait_ms"]) return True, None except PlaywrightTimeoutError as exc: last_error = f"Timeout: {exc}" except Exception as exc: # noqa: BLE001 last_error = str(exc) if attempt < settings["max_retries"] - 1: await page.wait_for_timeout(settings["retry_backoff_ms"] * (attempt + 1)) return False, last_error async def _perform_scroll(page, repetitions: int = 5, delay_ms: int = 800): repetitions = max(1, repetitions) for i in range(repetitions): await page.evaluate("(y) => window.scrollTo(0, y)", 2000 * (i + 1)) await page.wait_for_timeout(delay_ms) async def _load_more(page, selector: str, max_clicks: int = 5, wait_ms: int = 1500): for _ in range(max_clicks): button = await page.query_selector(selector) if not button: break try: await button.click() await page.wait_for_timeout(wait_ms) except Exception: break def _deduplicate_staff(staff: List[Dict[str, Any]]) -> List[Dict[str, Any]]: seen = set() cleaned = [] for item in staff: name = (item.get("name") or "").strip() if not name: continue url = (item.get("url") or "").strip() key = url or name.lower() if key in seen: continue seen.add(key) cleaned.append({"name": name, "url": url, "title": (item.get("title") or "").strip()}) return cleaned def _append_query(url: str, params: Dict[str, Any]) -> str: delimiter = "&" if "?" in url else "?" return f"{url}{delimiter}{urlencode(params)}" def _guess_research_slug(staff_url: Optional[str]) -> Optional[str]: if not staff_url: return None path = staff_url.rstrip("/").split("/") return path[-1] if path else None def _parse_research_explorer_json(data: Any, base_url: str) -> List[Dict[str, str]]: items: List[Dict[str, Any]] = [] if isinstance(data, list): items = data elif isinstance(data, dict): for key in ("results", "items", "persons", "data", "entities"): if isinstance(data.get(key), list): items = data[key] break if not items and isinstance(data.get("rows"), list): items = data["rows"] staff = [] for item in items: if not isinstance(item, dict): continue name = item.get("name") or item.get("title") or item.get("fullName") profile_url = item.get("url") or item.get("href") or item.get("link") or item.get("primaryURL") if not name: continue if profile_url: profile_url = urljoin(base_url, profile_url) staff.append( { "name": name.strip(), "url": (profile_url or "").strip(), "title": (item.get("jobTitle") or item.get("position") or "").strip(), } ) return staff def _parse_research_explorer_xml(text: str, base_url: str) -> List[Dict[str, str]]: staff: List[Dict[str, str]] = [] try: root = ET.fromstring(text) except ET.ParseError: return staff for entry in root.findall(".//{http://www.w3.org/2005/Atom}entry"): title = entry.findtext("{http://www.w3.org/2005/Atom}title", default="") link = entry.find("{http://www.w3.org/2005/Atom}link") href = link.attrib.get("href") if link is not None else "" if title: staff.append( { "name": title.strip(), "url": urljoin(base_url, href) if href else "", "title": "", } ) return staff async def fetch_research_explorer_api(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]: config = school_config.get("research_explorer") or {} if not config and school_config.get("extract_method") != "research_explorer": return [] base_staff_url = "" if school_config.get("staff_pages"): base_staff_url = school_config["staff_pages"][0].get("url", "") page_size = 
config.get("page_size", 200) timeout_ms = config.get("timeout_ms", 70000) candidates: List[str] = [] slug = config.get("org_slug") or _guess_research_slug(base_staff_url) base_api = config.get("api_base", "https://research.manchester.ac.uk/ws/portalapi.aspx") if config.get("api_url"): candidates.append(config["api_url"]) if slug: params = { "action": "search", "language": "en", "format": "json", "site": "default", "showall": "true", "pageSize": page_size, "organisations": slug, } candidates.append(f"{base_api}?{urlencode(params)}") if base_staff_url: candidates.append(_append_query(base_staff_url, {"format": "json", "limit": page_size})) candidates.append(_append_query(base_staff_url, {"format": "xml", "limit": page_size})) for url in candidates: try: resp = await context.request.get(url, timeout=timeout_ms) if resp.status != 200: continue ctype = resp.headers.get("content-type", "") if "json" in ctype: data = await resp.json() parsed = _parse_research_explorer_json(data, base_staff_url) else: text = await resp.text() parsed = _parse_research_explorer_xml(text, base_staff_url) parsed = _deduplicate_staff(parsed) if parsed: if output_callback: output_callback("info", f" {school_config['name']}: {len(parsed)} staff via API") return parsed except Exception as exc: # noqa: BLE001 if output_callback: output_callback( "warning", f" {school_config['name']}: API fetch failed ({str(exc)[:60]})" ) return [] async def scrape_staff_via_browser(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]: staff_collected: List[Dict[str, str]] = [] staff_pages = school_config.get("staff_pages") or [] if not staff_pages and school_config.get("staff_url"): staff_pages = [{"url": school_config["staff_url"], "extract_method": school_config.get("extract_method")}] page = await context.new_page() blocked_types = school_config.get("blocked_resources", ["image", "font", "media"]) if blocked_types: async def _route_handler(route): if route.request.resource_type in blocked_types: await route.abort() else: await route.continue_() await page.route("**/*", _route_handler) for page_cfg in staff_pages: target_url = page_cfg.get("url") if not target_url: continue settings = _merge_request_settings(school_config.get("request"), page_cfg.get("request")) success, error = await _goto_with_retry(page, target_url, settings, school_config["name"]) if not success: if output_callback: output_callback("warning", f" {school_config['name']}: failed to load {target_url} ({error})") continue if page_cfg.get("requires_scroll"): await _perform_scroll(page, page_cfg.get("scroll_times", 6), page_cfg.get("scroll_delay_ms", 700)) if page_cfg.get("load_from_selector"): await _load_more(page, page_cfg["load_from_selector"], page_cfg.get("max_load_more", 5)) elif page_cfg.get("load_more_selector"): await _load_more(page, page_cfg["load_more_selector"], page_cfg.get("max_load_more", 5)) method = page_cfg.get("extract_method") or school_config.get("extract_method") or "links" if method == "table": extracted = await page.evaluate(JS_EXTRACT_TABLE_STAFF) elif method == "research_explorer": extracted = await page.evaluate(JS_EXTRACT_RESEARCH_EXPLORER) else: extracted = await page.evaluate(JS_EXTRACT_LINK_STAFF) staff_collected.extend(extracted) await page.close() return _deduplicate_staff(staff_collected) # ========================= # 并发抓取学院 Staff # ========================= async def scrape_school_staff(context, school_config: Dict[str, Any], semaphore, output_callback): async with semaphore: staff_list: List[Dict[str, str]] 
= [] status = "success" error: Optional[str] = None try: if school_config.get("extract_method") == "research_explorer": staff_list = await fetch_research_explorer_api(context, school_config, output_callback) if not staff_list: staff_list = await scrape_staff_via_browser(context, school_config, output_callback) if output_callback: output_callback("info", f" {school_config['name']}: total {len(staff_list)} staff") except Exception as exc: # noqa: BLE001 status = "error" error = str(exc) if output_callback: output_callback("error", f" {school_config['name']}: {error}") return { "name": school_config["name"], "staff": staff_list, "status": status, "error": error, } async def scrape_all_school_staff(context, output_callback): semaphore = asyncio.Semaphore(STAFF_CONCURRENCY) tasks = [ asyncio.create_task(scrape_school_staff(context, cfg, semaphore, output_callback)) for cfg in SCHOOL_CONFIG ] results = await asyncio.gather(*tasks) staff_map = {} diagnostics = {"failed": [], "success": [], "total": len(results)} for res in results: if res["staff"]: staff_map[res["name"]] = res["staff"] diagnostics["success"].append(res["name"]) else: diagnostics["failed"].append( { "name": res["name"], "status": res["status"], "error": res.get("error"), } ) return staff_map, diagnostics # ========================= # 主流程 # ========================= async def scrape(output_callback=None): async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context( user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" ) base_url = "https://www.manchester.ac.uk/" result = { "name": "The University of Manchester", "url": base_url, "scraped_at": datetime.now(timezone.utc).isoformat(), "schools": [], "diagnostics": {}, } try: # Step 1: Masters 列表 if output_callback: output_callback("info", "Step 1: Scraping masters programs list...") page = await context.new_page() courses_url = "https://www.manchester.ac.uk/study/masters/courses/list/" await page.goto(courses_url, wait_until="domcontentloaded", timeout=40000) await page.wait_for_timeout(3000) programs_data = await page.evaluate(JS_EXTRACT_PROGRAMS) await page.close() if output_callback: output_callback("info", f"Found {len(programs_data)} masters programs") # Step 2: 并发抓取学院 Staff if output_callback: output_callback("info", "Step 2: Scraping faculty from staff pages (parallel)...") school_staff, diagnostics = await scrape_all_school_staff(context, output_callback) # Step 3: 组织数据 schools_dict: Dict[str, Dict[str, Any]] = {} for prog in programs_data: school_name = match_program_to_school(prog["name"]) if school_name not in schools_dict: schools_dict[school_name] = { "name": school_name, "url": "", "programs": [], "faculty": school_staff.get(school_name, []), "faculty_source": "school_directory" if school_staff.get(school_name) else "", } schools_dict[school_name]["programs"].append( { "name": prog["name"], "url": prog["url"], "faculty": [], } ) for cfg in SCHOOL_CONFIG: if cfg["name"] in schools_dict: first_page = (cfg.get("staff_pages") or [{}])[0] schools_dict[cfg["name"]]["url"] = first_page.get("url") or cfg.get("staff_url", "") _attach_faculty_to_programs(schools_dict, school_staff) result["schools"] = list(schools_dict.values()) total_programs = sum(len(s["programs"]) for s in result["schools"]) total_faculty = sum(len(s.get("faculty", [])) for s in result["schools"]) result["diagnostics"] = { "total_programs": total_programs, "total_faculty_records": total_faculty, "school_staff_success": 
diagnostics.get("success", []), "school_staff_failed": diagnostics.get("failed", []), } if output_callback: output_callback( "info", f"Done! {len(result['schools'])} schools, {total_programs} programs, {total_faculty} faculty", ) except Exception as exc: # noqa: BLE001 if output_callback: output_callback("error", f"Scraping error: {str(exc)}") finally: await browser.close() return result def _attach_faculty_to_programs(schools_dict: Dict[str, Dict[str, Any]], staff_map: Dict[str, List[Dict[str, str]]]): for school_name, school_data in schools_dict.items(): staff = staff_map.get(school_name, []) cfg = SCHOOL_LOOKUP.get(school_name, {}) if not staff or not cfg.get("attach_faculty_to_programs"): continue limit = cfg.get("faculty_per_program") for program in school_data["programs"]: sliced = deepcopy(staff[:limit] if limit else staff) program["faculty"] = sliced # ========================= # CLI # ========================= if __name__ == "__main__": import sys if sys.platform == "win32": asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) def print_callback(level, msg): print(f"[{level}] {msg}") scrape_result = asyncio.run(scrape(output_callback=print_callback)) output_path = "output/manchester_complete_result.json" with open(output_path, "w", encoding="utf-8") as f: json.dump(scrape_result, f, ensure_ascii=False, indent=2) print("\nResult saved to", output_path) print("\n=== Summary ===") for school in sorted(scrape_result["schools"], key=lambda s: -len(s.get("faculty", []))): print( f" {school['name']}: " f"{len(school['programs'])} programs, " f"{len(school.get('faculty', []))} faculty" )