"""
Test Manchester University scraper - improved faculty mapping.

Locates the University of Manchester masters course list, extracts the
programme links from it, and attaches faculty to each programme.  Programmes
whose name mentions both "MSc" and "Accounting" take their faculty from the
Alliance MBS Accounting & Finance staff directory; other programmes take
staff-looking links from their own detail page (limited to the first 30
detail pages visited).
"""
import asyncio
import json
from datetime import datetime, timezone

from playwright.async_api import async_playwright

# Candidate paths, relative to the site root, probed in order when looking
# for a page that lists masters courses.
MASTERS_PATHS = [
    "/study/masters/courses/list/",
    "/study/masters/courses/",
    "/postgraduate/taught/courses/",
    "/postgraduate/courses/list/",
    "/postgraduate/courses/",
    "/graduate/programs/",
    "/academics/graduate/programs/",
    "/programmes/masters/",
    "/masters/programmes/",
    "/admissions/graduate/programs/",
]

ACCOUNTING_STAFF_URL = "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/"
# Filled on first use by load_accounting_staff(); None means "not loaded yet",
# an empty list means "loaded, but nothing found / loading failed".
ACCOUNTING_STAFF_CACHE = None

# Counts links on the current page whose URL looks like a course page
# (4+ digit numeric id segment, degree-prefixed slug, or /course/<letter>).
JS_CHECK_COURSES = r"""() => {
    const links = document.querySelectorAll('a[href]');
    let courseCount = 0;
    for (const a of links) {
        const href = a.href.toLowerCase();
        if (/\/\d{4,}\//.test(href) ||
            /\/(msc|ma|mba|mres|llm|med|meng)-/.test(href) ||
            /\/course\/[a-z]/.test(href)) {
            courseCount++;
        }
    }
    return courseCount;
}"""

# Returns the first link that looks like an "A-Z / full list" of courses,
# or null when the page has none.
JS_FIND_LIST_URL = """() => {
    const links = document.querySelectorAll('a[href]');
    for (const a of links) {
        const text = a.innerText.toLowerCase();
        const href = a.href.toLowerCase();
        if ((text.includes('a-z') || text.includes('all course') ||
             text.includes('full list') || text.includes('browse all') ||
             href.includes('/list')) &&
            (href.includes('master') || href.includes('course') ||
             href.includes('postgrad'))) {
            return a.href;
        }
    }
    return null;
}"""

# Returns the first link whose URL mentions both a postgraduate keyword and a
# course/program/degree keyword, or null.
JS_FIND_COURSES_FROM_HOME = """() => {
    const links = document.querySelectorAll('a[href]');
    for (const a of links) {
        const href = a.href.toLowerCase();
        if ((href.includes('master') || href.includes('postgraduate') ||
             href.includes('graduate')) &&
            (href.includes('course') || href.includes('program') ||
             href.includes('degree'))) {
            return a.href;
        }
    }
    return null;
}"""

# Collects {name, url} for same-site links that look like masters programmes,
# after dropping navigation links and undergraduate / PhD links.
JS_EXTRACT_PROGRAMS = r"""() => {
    const programs = [];
    const seen = new Set();
    const currentHost = window.location.hostname;

    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href;
        const text = a.innerText.trim().replace(/\s+/g, ' ');

        if (!href || seen.has(href)) return;
        if (text.length < 5 || text.length > 200) return;
        if (href.includes('#') || href.includes('javascript:') || href.includes('mailto:')) return;

        try {
            const linkHost = new URL(href).hostname;
            if (!linkHost.includes(currentHost.replace('www.', '')) &&
                !currentHost.includes(linkHost.replace('www.', ''))) return;
        } catch {
            return;
        }

        const hrefLower = href.toLowerCase();
        const textLower = text.toLowerCase();

        const isNavigation =
            textLower === 'courses' ||
            textLower === 'programmes' ||
            textLower === 'undergraduate' ||
            textLower === 'postgraduate' ||
            textLower === 'masters' ||
            textLower === "master's" ||
            textLower.includes('skip to') ||
            textLower.includes('share') ||
            textLower === 'home' ||
            textLower === 'study' ||
            textLower.startsWith('a-z') ||
            textLower.includes('admission') ||
            textLower.includes('fees and funding') ||
            textLower.includes('why should') ||
            textLower.includes('why manchester') ||
            textLower.includes('teaching and learning') ||
            textLower.includes('meet us') ||
            textLower.includes('student support') ||
            textLower.includes('contact us') ||
            textLower.includes('how to apply') ||
            hrefLower.includes('/admissions/') ||
            hrefLower.includes('/fees-and-funding/') ||
            hrefLower.includes('/why-') ||
            hrefLower.includes('/meet-us/') ||
            hrefLower.includes('/contact-us/') ||
            hrefLower.includes('/student-support/') ||
            hrefLower.includes('/teaching-and-learning/') ||
            hrefLower.endsWith('/courses/') ||
            hrefLower.endsWith('/masters/') ||
            hrefLower.endsWith('/postgraduate/');
        if (isNavigation) return;

        const isExcluded =
            hrefLower.includes('/undergraduate') ||
            hrefLower.includes('/bachelor') ||
            hrefLower.includes('/phd/') ||
            hrefLower.includes('/doctoral') ||
            hrefLower.includes('/research-degree') ||
            textLower.includes('bachelor') ||
            textLower.includes('undergraduate') ||
            (textLower.includes('phd') && !textLower.includes('mphil'));
        if (isExcluded) return;

        const hasNumericId = /\/\d{4,}\//.test(href);
        const hasDegreeSlug = /\/(msc|ma|mba|mres|llm|med|meng|mpa|mph|mphil)-[a-z]/.test(hrefLower);
        const isCoursePage =
            (hrefLower.includes('/course/') ||
             hrefLower.includes('/courses/list/') ||
             hrefLower.includes('/programme/')) &&
            href.split('/').filter(p => p).length > 4;
        // Word boundaries are required here: without them "ma" / "med" match
        // inside ordinary words ("Information", "Media") and nearly every
        // link is treated as a degree.
        const textHasDegree =
            /\b(msc|ma|mba|mres|llm|med|meng|pgcert|pgdip)\b/i.test(text) ||
            textLower.includes('master');

        if (hasNumericId || hasDegreeSlug || isCoursePage || textHasDegree) {
            seen.add(href);
            programs.push({ name: text, url: href });
        }
    });

    return programs;
}"""

# Collects up to 20 {name, url} entries for links whose URL contains a
# staff/people/profile-style path segment.
JS_EXTRACT_FACULTY = r"""() => {
    const faculty = [];
    const seen = new Set();

    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href.toLowerCase();
        const text = a.innerText.trim();

        if (seen.has(href)) return;
        if (text.length < 3 || text.length > 100) return;

        const isStaff =
            href.includes('/people/') ||
            href.includes('/staff/') ||
            href.includes('/faculty/') ||
            href.includes('/profile/') ||
            href.includes('/academics/') ||
            href.includes('/researcher/');

        if (isStaff) {
            seen.add(href);
            faculty.push({ name: text.replace(/\s+/g, ' '), url: a.href });
        }
    });

    return faculty.slice(0, 20);
}"""

# Reads the staff table row by row.  Column positions (name = 2nd cell,
# role = 3rd cell, email = 6th cell) are taken as-is from the page layout
# this script was written against.
JS_EXTRACT_ACCOUNTING_STAFF = r"""() => {
    const rows = Array.from(document.querySelectorAll('table tbody tr'));
    const staff = [];

    for (const row of rows) {
        const cells = row.querySelectorAll('td');
        if (!cells || cells.length < 2) {
            continue;
        }

        const nameCell = cells[1];
        const roleCell = cells[2];
        const emailCell = cells[5];

        let profileUrl = '';
        let displayName = nameCell ? nameCell.innerText.trim() : '';
        const link = nameCell ? nameCell.querySelector('a[href]') : null;
        if (link) {
            profileUrl = link.href;
            displayName = link.innerText.trim() || displayName;
        }
        if (!displayName) {
            continue;
        }

        let email = '';
        if (emailCell) {
            const emailLink = emailCell.querySelector('a[href^="mailto:"]');
            if (emailLink) {
                // Drop the scheme and any "?subject=..." query so only the
                // address itself is stored.
                email = emailLink.href.replace(/^mailto:/i, '').split('?')[0].trim();
            }
        }

        staff.push({
            name: displayName,
            title: roleCell ? roleCell.innerText.trim() : '',
            url: profileUrl,
            email: email
        });
    }

    return staff;
}"""


def _emit(output_callback, level, message):
    """Forward ``(level, message)`` to ``output_callback`` if one was given."""
    if output_callback:
        output_callback(level, message)


def should_use_accounting_staff(program_name: str) -> bool:
    """Return True when the programme name contains both "msc" and "accounting".

    The comparison is case-insensitive.
    """
    lower_name = program_name.lower()
    return "msc" in lower_name and "accounting" in lower_name


async def load_accounting_staff(context, output_callback=None):
    """Return the AMBS Accounting & Finance staff list, loading it at most once.

    Args:
        context: Playwright browser context used to open a temporary page.
        output_callback: Optional ``callback(level, message)`` for progress logs.

    Returns:
        A list of dicts with ``name``, ``title``, ``url`` and ``email`` keys.
        On failure the error is reported through ``output_callback`` and an
        empty list is cached, so the page is not retried for every programme.
    """
    global ACCOUNTING_STAFF_CACHE
    if ACCOUNTING_STAFF_CACHE is not None:
        return ACCOUNTING_STAFF_CACHE

    staff_page = await context.new_page()
    try:
        _emit(output_callback, "info", "Loading official AMBS Accounting & Finance staff page...")
        await staff_page.goto(ACCOUNTING_STAFF_URL, wait_until="domcontentloaded", timeout=30000)
        await staff_page.wait_for_timeout(2000)
        ACCOUNTING_STAFF_CACHE = await staff_page.evaluate(JS_EXTRACT_ACCOUNTING_STAFF)
        _emit(
            output_callback,
            "info",
            f"Captured {len(ACCOUNTING_STAFF_CACHE)} faculty from the official staff page",
        )
    except Exception as exc:
        _emit(output_callback, "error", f"Failed to load AMBS staff page: {exc}")
        ACCOUNTING_STAFF_CACHE = []
    finally:
        await staff_page.close()

    return ACCOUNTING_STAFF_CACHE


async def find_course_list_page(page, base_url, output_callback):
    """Locate a URL that lists masters courses.

    Each entry of ``MASTERS_PATHS`` is probed in order; a probe succeeds when
    the page answers 200, its title does not look like a 404 page, and it
    either contains more than five course-like links or links to a full
    course list.  If no probe succeeds, the homepage is searched for a
    postgraduate course link.

    Args:
        page: Playwright page used for navigation (it is left on whichever
            URL was visited last).
        base_url: Site root, with or without a trailing slash.
        output_callback: Optional ``callback(level, message)``.

    Returns:
        The course list URL, or None when nothing suitable was found.
    """
    site_root = base_url.rstrip('/')

    for path in MASTERS_PATHS:
        test_url = site_root + path
        try:
            response = await page.goto(test_url, wait_until="domcontentloaded", timeout=15000)
            if not (response and response.status == 200):
                continue

            title = (await page.title()).lower()
            if '404' in title or 'not found' in title:
                continue

            has_courses = await page.evaluate(JS_CHECK_COURSES)
            if has_courses > 5:
                _emit(output_callback, "info", f"Found course list: {path} ({has_courses} courses)")
                return test_url

            list_url = await page.evaluate(JS_FIND_LIST_URL)
            if list_url:
                _emit(output_callback, "info", f"Found full course list: {list_url}")
                return list_url
        except Exception:
            # A failing candidate path is expected while probing; try the
            # next one.  ``Exception`` (not a bare except) so task
            # cancellation and Ctrl-C still propagate.
            continue

    try:
        await page.goto(base_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)
        courses_url = await page.evaluate(JS_FIND_COURSES_FROM_HOME)
        if courses_url:
            return courses_url
    except Exception as exc:
        _emit(output_callback, "warning", f"Homepage course link lookup failed: {exc}")

    return None


async def extract_course_links(page, output_callback):
    """Return ``[{name, url}, ...]`` for programme links on the current page.

    ``output_callback`` is accepted for signature compatibility and is unused.
    """
    return await page.evaluate(JS_EXTRACT_PROGRAMS)


async def _click_load_more(page, max_clicks=3):
    """Click a "Load more"-style control up to ``max_clicks`` times, best effort."""
    for _ in range(max_clicks):
        try:
            load_more = page.locator(
                'button:has-text("Load more"), button:has-text("Show more"), '
                'button:has-text("View more"), a:has-text("Load more")'
            )
            if await load_more.count() == 0:
                break
            await load_more.first.click()
            await page.wait_for_timeout(2000)
        except Exception:
            # Pagination is optional; stop expanding on the first failure.
            break


def _official_staff_entries(staff_list):
    """Map raw staff directory rows to the faculty entry shape used in results."""
    return [
        {
            "name": person.get("name"),
            "url": person.get("url") or ACCOUNTING_STAFF_URL,
            "title": person.get("title"),
            "email": person.get("email"),
            "source": "Alliance Manchester Business School - Accounting & Finance staff"
        }
        for person in staff_list
    ]


async def scrape(output_callback=None):
    """Scrape Manchester masters programmes and their faculty.

    Args:
        output_callback: Optional ``callback(level, message)`` receiving
            "info" / "warning" / "error" progress messages.

    Returns:
        A dict with ``name``, ``url``, ``scraped_at`` (UTC ISO timestamp) and
        ``schools``.  ``schools`` holds a single "Masters Programs" entry on
        success and stays empty if scraping raised; the error is reported
        through ``output_callback`` rather than re-raised.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = await context.new_page()

        base_url = "https://www.manchester.ac.uk/"
        result = {
            "name": "Manchester University",
            "url": base_url,
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "schools": []
        }
        all_programs = []

        try:
            _emit(output_callback, "info", "Searching for masters course list...")
            courses_url = await find_course_list_page(page, base_url, output_callback)
            if not courses_url:
                _emit(output_callback, "warning", "Course list not found, using homepage")
                courses_url = base_url

            _emit(output_callback, "info", "Extracting masters programs...")
            await page.goto(courses_url, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(3000)
            await _click_load_more(page)

            programs_data = await extract_course_links(page, output_callback)
            _emit(output_callback, "info", f"Found {len(programs_data)} masters programs")

            print("\nTop 20 programs:")
            for i, prog in enumerate(programs_data[:20]):
                print(f" {i+1}. {prog['name'][:60]}")
                print(f" {prog['url']}")

            # Only the first 30 programmes that need it get a detail-page visit.
            max_detail_pages = min(len(programs_data), 30)
            detailed_processed = 0
            logged_official_staff = False

            for prog in programs_data:
                faculty_data = []
                used_official_staff = False

                staff_list = []
                if should_use_accounting_staff(prog['name']):
                    staff_list = await load_accounting_staff(context, output_callback)

                if staff_list:
                    used_official_staff = True
                    if not logged_official_staff:
                        _emit(
                            output_callback,
                            "info",
                            "Using Alliance MBS Accounting & Finance staff directory for accounting programmes",
                        )
                        logged_official_staff = True
                    faculty_data = _official_staff_entries(staff_list)
                elif detailed_processed < max_detail_pages:
                    # Also reached by accounting programmes when the official
                    # directory is unavailable, so they are not left without
                    # any faculty.
                    detailed_processed += 1
                    if detailed_processed % 10 == 0:
                        _emit(
                            output_callback,
                            "info",
                            f"Processing {detailed_processed}/{max_detail_pages}: {prog['name'][:50]}",
                        )
                    try:
                        await page.goto(prog['url'], wait_until="domcontentloaded", timeout=15000)
                        await page.wait_for_timeout(800)
                        faculty_data = await page.evaluate(JS_EXTRACT_FACULTY)
                    except Exception as e:
                        _emit(
                            output_callback,
                            "warning",
                            f"Failed to capture faculty for {prog['name'][:50]}: {e}",
                        )
                        faculty_data = []

                program_entry = {
                    "name": prog['name'],
                    "url": prog['url'],
                    "faculty": faculty_data
                }
                if used_official_staff:
                    program_entry["faculty_page_override"] = ACCOUNTING_STAFF_URL
                all_programs.append(program_entry)

            result["schools"] = [{
                "name": "Masters Programs",
                "url": courses_url,
                "programs": all_programs
            }]

            total_faculty = sum(len(p.get('faculty', [])) for p in all_programs)
            _emit(output_callback, "info", f"Done! {len(all_programs)} programs, {total_faculty} faculty")

        except Exception as e:
            _emit(output_callback, "error", f"Scraping error: {str(e)}")
        finally:
            await browser.close()

        return result


def log_callback(level, message):
    """Print a log line as ``[LEVEL] message``."""
    print(f"[{level.upper()}] {message}")


if __name__ == "__main__":
    result = asyncio.run(scrape(output_callback=log_callback))

    print("\n" + "="*60)
    print("Scrape summary:")
    print("="*60)

    if result.get("schools"):
        school = result["schools"][0]
        programs = school.get("programs", [])
        print(f"Course list URL: {school.get('url')}")
        print(f"Total programs: {len(programs)}")
        faculty_count = sum(len(p.get('faculty', [])) for p in programs)
        print(f"Faculty total: {faculty_count}")

        print("\nTop 10 programs:")
        for i, p in enumerate(programs[:10]):
            print(f" {i+1}. {p['name'][:60]}")
            if p.get("faculty"):
                print(f" Faculty entries: {len(p['faculty'])}")

    with open("manchester_test_result.json", "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
    print("\nSaved results to manchester_test_result.json")