Files
University-Playwright-Codeg…/artifacts/test_manchester_scraper.py
yangxiaoyu-crypto 426cf4d2cd Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 15:25:08 +08:00

465 lines
18 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Test Manchester University scraper - improved faculty mapping
"""
import asyncio
import json
from datetime import datetime, timezone
from playwright.async_api import async_playwright
# Candidate URL paths (appended to the university homepage) where a masters
# course listing is commonly published; tried in order by find_course_list_page.
MASTERS_PATHS = [
    "/study/masters/courses/list/",
    "/study/masters/courses/",
    "/postgraduate/taught/courses/",
    "/postgraduate/courses/list/",
    "/postgraduate/courses/",
    "/graduate/programs/",
    "/academics/graduate/programs/",
    "/programmes/masters/",
    "/masters/programmes/",
    "/admissions/graduate/programs/",
]

# Official Alliance Manchester Business School staff directory for the
# Accounting & Finance group; used as the faculty source for accounting MScs.
ACCOUNTING_STAFF_URL = "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/"

# Module-level memo for the scraped staff list. None means "not fetched yet";
# an empty list means a fetch was attempted and failed (no retry). Populated
# by load_accounting_staff().
ACCOUNTING_STAFF_CACHE = None
# JS snippet (run via page.evaluate): counts anchors whose href looks like a
# course page — a 4+ digit path segment, a degree-prefixed slug (msc-/mba-/...),
# or a /course/<slug> path. Used to decide whether a candidate URL is a real
# course listing.
JS_CHECK_COURSES = r"""() => {
const links = document.querySelectorAll('a[href]');
let courseCount = 0;
for (const a of links) {
const href = a.href.toLowerCase();
if (/\/\d{4,}\//.test(href) ||
/\/(msc|ma|mba|mres|llm|med|meng)-/.test(href) ||
/\/course\/[a-z]/.test(href)) {
courseCount++;
}
}
return courseCount;
}"""

# JS snippet: on a candidate page, finds a link to a fuller "A-Z" / "all
# courses" style listing (matched by link text or an "/list" href, combined
# with a master/course/postgrad hint in the href). Returns the URL or null.
JS_FIND_LIST_URL = """() => {
const links = document.querySelectorAll('a[href]');
for (const a of links) {
const text = a.innerText.toLowerCase();
const href = a.href.toLowerCase();
if ((text.includes('a-z') || text.includes('all course') ||
text.includes('full list') || text.includes('browse all') ||
href.includes('/list')) &&
(href.includes('master') || href.includes('course') || href.includes('postgrad'))) {
return a.href;
}
}
return null;
}"""

# JS snippet: homepage fallback — returns the first link whose href mentions
# both a postgraduate keyword and a course/program keyword, or null.
JS_FIND_COURSES_FROM_HOME = """() => {
const links = document.querySelectorAll('a[href]');
for (const a of links) {
const href = a.href.toLowerCase();
const text = a.innerText.toLowerCase();
if ((href.includes('master') || href.includes('postgraduate') || href.includes('graduate')) &&
(href.includes('course') || href.includes('program') || href.includes('degree'))) {
return a.href;
}
}
return null;
}"""
# JS snippet: extracts masters programme links from the current page as
# [{name, url}]. Filtering pipeline, in order:
#   1. de-duplicate by href; drop very short/long link text and
#      fragment/javascript/mailto links;
#   2. keep only same-site links (hostname containment check, www-insensitive);
#   3. drop pure navigation links (section headers, admissions/fees/contact
#      pages, bare /courses/ / /masters/ / /postgraduate/ index URLs);
#   4. drop undergraduate / PhD / research-degree links (MPhil is kept);
#   5. accept what remains if the href has a numeric course id, a degree slug
#      (msc-/mba-/...), a deep /course//programme/ path, or the link text
#      itself names a degree.
JS_EXTRACT_PROGRAMS = r"""() => {
const programs = [];
const seen = new Set();
const currentHost = window.location.hostname;
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href;
const text = a.innerText.trim().replace(/\s+/g, ' ');
if (!href || seen.has(href)) return;
if (text.length < 5 || text.length > 200) return;
if (href.includes('#') || href.includes('javascript:') || href.includes('mailto:')) return;
try {
const linkHost = new URL(href).hostname;
if (!linkHost.includes(currentHost.replace('www.', '')) &&
!currentHost.includes(linkHost.replace('www.', ''))) return;
} catch {
return;
}
const hrefLower = href.toLowerCase();
const textLower = text.toLowerCase();
const isNavigation = textLower === 'courses' ||
textLower === 'programmes' ||
textLower === 'undergraduate' ||
textLower === 'postgraduate' ||
textLower === 'masters' ||
textLower === "master's" ||
textLower.includes('skip to') ||
textLower.includes('share') ||
textLower === 'home' ||
textLower === 'study' ||
textLower.startsWith('a-z') ||
textLower.includes('admission') ||
textLower.includes('fees and funding') ||
textLower.includes('why should') ||
textLower.includes('why manchester') ||
textLower.includes('teaching and learning') ||
textLower.includes('meet us') ||
textLower.includes('student support') ||
textLower.includes('contact us') ||
textLower.includes('how to apply') ||
hrefLower.includes('/admissions/') ||
hrefLower.includes('/fees-and-funding/') ||
hrefLower.includes('/why-') ||
hrefLower.includes('/meet-us/') ||
hrefLower.includes('/contact-us/') ||
hrefLower.includes('/student-support/') ||
hrefLower.includes('/teaching-and-learning/') ||
hrefLower.endsWith('/courses/') ||
hrefLower.endsWith('/masters/') ||
hrefLower.endsWith('/postgraduate/');
if (isNavigation) return;
const isExcluded = hrefLower.includes('/undergraduate') ||
hrefLower.includes('/bachelor') ||
hrefLower.includes('/phd/') ||
hrefLower.includes('/doctoral') ||
hrefLower.includes('/research-degree') ||
textLower.includes('bachelor') ||
textLower.includes('undergraduate') ||
(textLower.includes('phd') && !textLower.includes('mphil'));
if (isExcluded) return;
const hasNumericId = /\/\d{4,}\//.test(href);
const hasDegreeSlug = /\/(msc|ma|mba|mres|llm|med|meng|mpa|mph|mphil)-[a-z]/.test(hrefLower);
const isCoursePage = (hrefLower.includes('/course/') ||
hrefLower.includes('/courses/list/') ||
hrefLower.includes('/programme/')) &&
href.split('/').filter(p => p).length > 4;
const textHasDegree = /(msc|ma|mba|mres|llm|med|meng|pgcert|pgdip)/i.test(text) ||
textLower.includes('master');
if (hasNumericId || hasDegreeSlug || isCoursePage || textHasDegree) {
seen.add(href);
programs.push({
name: text,
url: href
});
}
});
return programs;
}"""
# JS snippet: on a programme page, collects up to 20 staff-like links
# ({name, url}) — any anchor whose href contains a people/staff/faculty/
# profile/academics/researcher path segment. Heuristic: may pick up generic
# directory links as well as individual profiles.
JS_EXTRACT_FACULTY = r"""() => {
const faculty = [];
const seen = new Set();
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href.toLowerCase();
const text = a.innerText.trim();
if (seen.has(href)) return;
if (text.length < 3 || text.length > 100) return;
const isStaff = href.includes('/people/') ||
href.includes('/staff/') ||
href.includes('/faculty/') ||
href.includes('/profile/') ||
href.includes('/academics/') ||
href.includes('/researcher/');
if (isStaff) {
seen.add(href);
faculty.push({
name: text.replace(/\s+/g, ' '),
url: a.href
});
}
});
return faculty.slice(0, 20);
}"""
# JS snippet: parses the AMBS staff directory table into
# [{name, title, url, email}]. NOTE(review): hard-codes the column layout —
# cell 1 = name (optionally linked), cell 2 = role, cell 5 = mailto link.
# Assumes the page keeps this table structure; verify if the directory page
# is redesigned.
JS_EXTRACT_ACCOUNTING_STAFF = r"""() => {
const rows = Array.from(document.querySelectorAll('table tbody tr'));
const staff = [];
for (const row of rows) {
const cells = row.querySelectorAll('td');
if (!cells || cells.length < 2) {
continue;
}
const nameCell = cells[1];
const roleCell = cells[2];
const emailCell = cells[5];
let profileUrl = '';
let displayName = nameCell ? nameCell.innerText.trim() : '';
const link = nameCell ? nameCell.querySelector('a[href]') : null;
if (link) {
profileUrl = link.href;
displayName = link.innerText.trim() || displayName;
}
if (!displayName) {
continue;
}
let email = '';
if (emailCell) {
const emailLink = emailCell.querySelector('a[href^="mailto:"]');
if (emailLink) {
email = emailLink.href.replace('mailto:', '').trim();
}
}
staff.push({
name: displayName,
title: roleCell ? roleCell.innerText.trim() : '',
url: profileUrl,
email: email
});
}
return staff;
}"""
def should_use_accounting_staff(program_name: str) -> bool:
    """Return True when *program_name* looks like an MSc accounting programme.

    Matches case-insensitively: the name must contain both "msc" and
    "accounting" (e.g. "MSc Accounting and Finance").
    """
    needle = program_name.lower()
    return all(token in needle for token in ("msc", "accounting"))
async def load_accounting_staff(context, output_callback=None):
    """Fetch (at most once per process) the AMBS Accounting & Finance staff list.

    The result is memoized in the module-level ACCOUNTING_STAFF_CACHE; on a
    fetch failure an empty list is cached so the request is not retried.
    Returns the cached list of staff dicts.
    """
    global ACCOUNTING_STAFF_CACHE
    if ACCOUNTING_STAFF_CACHE is not None:
        return ACCOUNTING_STAFF_CACHE

    def _log(level, message):
        # Route progress messages through the optional callback only.
        if output_callback:
            output_callback(level, message)

    staff_page = await context.new_page()
    try:
        _log("info", "Loading official AMBS Accounting & Finance staff page...")
        await staff_page.goto(ACCOUNTING_STAFF_URL, wait_until="domcontentloaded", timeout=30000)
        await staff_page.wait_for_timeout(2000)
        ACCOUNTING_STAFF_CACHE = await staff_page.evaluate(JS_EXTRACT_ACCOUNTING_STAFF)
        _log("info", f"Captured {len(ACCOUNTING_STAFF_CACHE)} faculty from the official staff page")
    except Exception as exc:
        _log("error", f"Failed to load AMBS staff page: {exc}")
        ACCOUNTING_STAFF_CACHE = []
    finally:
        await staff_page.close()
    return ACCOUNTING_STAFF_CACHE
async def find_course_list_page(page, base_url, output_callback):
    """Locate the URL of the university's masters course listing.

    Tries each known path in MASTERS_PATHS in order; a candidate is accepted
    when it returns HTTP 200, does not look like a soft-404 (by page title),
    and contains more than 5 course-like links. If the candidate instead
    links to a fuller "A-Z" style list, that link is returned. As a last
    resort, the homepage is scanned for a postgraduate-courses link.

    Returns the discovered URL, or None when nothing suitable is found.
    """
    for path in MASTERS_PATHS:
        test_url = base_url.rstrip('/') + path
        try:
            response = await page.goto(test_url, wait_until="domcontentloaded", timeout=15000)
            if response and response.status == 200:
                title = await page.title()
                # Some sites serve 200 for missing pages; filter soft-404s
                # by title as well.
                if '404' not in title.lower() and 'not found' not in title.lower():
                    has_courses = await page.evaluate(JS_CHECK_COURSES)
                    if has_courses > 5:
                        if output_callback:
                            output_callback("info", f"Found course list: {path} ({has_courses} courses)")
                        return test_url
                    list_url = await page.evaluate(JS_FIND_LIST_URL)
                    if list_url:
                        if output_callback:
                            output_callback("info", f"Found full course list: {list_url}")
                        return list_url
        except Exception:
            # Timeouts/navigation errors on one candidate path are expected;
            # try the next one. (Fixed: was a bare `except:`, which also
            # swallowed KeyboardInterrupt/SystemExit.)
            continue
    # Fallback: load the homepage and look for a masters/postgraduate link.
    try:
        await page.goto(base_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)
        courses_url = await page.evaluate(JS_FIND_COURSES_FROM_HOME)
        if courses_url:
            return courses_url
    except Exception:
        # Best-effort fallback only — give up quietly. (Fixed bare `except:`.)
        pass
    return None
async def extract_course_links(page, output_callback):
    """Run the programme-extraction script on the current page.

    Returns the list of {name, url} dicts produced by JS_EXTRACT_PROGRAMS.
    `output_callback` is currently unused; kept for signature parity with
    the other scraping helpers.
    """
    return await page.evaluate(JS_EXTRACT_PROGRAMS)
async def scrape(output_callback=None):
    """Scrape University of Manchester masters programmes and their faculty.

    Launches headless Chromium, locates the masters course listing, extracts
    programme links, then collects faculty per programme: accounting MScs use
    the official AMBS staff directory; the first 30 other programmes have
    their own pages scanned for staff-like links; the remainder get an empty
    faculty list.

    Args:
        output_callback: optional callable(level, message) for progress logs.

    Returns:
        dict with keys name, url, scraped_at (UTC ISO timestamp), and
        schools -> [{name, url, programs: [{name, url, faculty, ...}]}].
        On a top-level scraping error the error is reported via the callback
        and the partial result is returned.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = await context.new_page()
        base_url = "https://www.manchester.ac.uk/"
        result = {
            "name": "Manchester University",
            "url": base_url,
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "schools": []
        }
        all_programs = []
        try:
            if output_callback:
                output_callback("info", "Searching for masters course list...")
            courses_url = await find_course_list_page(page, base_url, output_callback)
            if not courses_url:
                if output_callback:
                    output_callback("warning", "Course list not found, using homepage")
                courses_url = base_url
            if output_callback:
                output_callback("info", "Extracting masters programs...")
            await page.goto(courses_url, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(3000)
            # Expand paginated listings: click "Load more"-style buttons up to
            # three times so more course links land in the DOM.
            for _ in range(3):
                try:
                    load_more = page.locator('button:has-text("Load more"), button:has-text("Show more"), button:has-text("View more"), a:has-text("Load more")')
                    if await load_more.count() > 0:
                        await load_more.first.click()
                        await page.wait_for_timeout(2000)
                    else:
                        break
                except Exception:
                    # The click can fail if the button detaches or is covered;
                    # stop expanding. (Fixed: was a bare `except:`.)
                    break
            programs_data = await extract_course_links(page, output_callback)
            if output_callback:
                output_callback("info", f"Found {len(programs_data)} masters programs")
            print("\nTop 20 programs:")
            for i, prog in enumerate(programs_data[:20]):
                print(f" {i+1}. {prog['name'][:60]}")
                print(f" {prog['url']}")
            # Only visit up to 30 programme pages for per-page faculty
            # extraction; accounting programmes use the cached AMBS staff
            # list and do not count against this budget.
            max_detail_pages = min(len(programs_data), 30)
            detailed_processed = 0
            logged_official_staff = False
            for prog in programs_data:
                faculty_data = []
                used_official_staff = False
                if should_use_accounting_staff(prog['name']):
                    staff_list = await load_accounting_staff(context, output_callback)
                    if staff_list:
                        used_official_staff = True
                        if output_callback and not logged_official_staff:
                            output_callback("info", "Using Alliance MBS Accounting & Finance staff directory for accounting programmes")
                            logged_official_staff = True
                        faculty_data = [
                            {
                                "name": person.get("name"),
                                "url": person.get("url") or ACCOUNTING_STAFF_URL,
                                "title": person.get("title"),
                                "email": person.get("email"),
                                "source": "Alliance Manchester Business School - Accounting & Finance staff"
                            }
                            for person in staff_list
                        ]
                elif detailed_processed < max_detail_pages:
                    detailed_processed += 1
                    if output_callback and detailed_processed % 10 == 0:
                        output_callback("info", f"Processing {detailed_processed}/{max_detail_pages}: {prog['name'][:50]}")
                    try:
                        await page.goto(prog['url'], wait_until="domcontentloaded", timeout=15000)
                        await page.wait_for_timeout(800)
                        faculty_data = await page.evaluate(JS_EXTRACT_FACULTY)
                    except Exception as e:
                        if output_callback:
                            output_callback("warning", f"Failed to capture faculty for {prog['name'][:50]}: {e}")
                        faculty_data = []
                program_entry = {
                    "name": prog['name'],
                    "url": prog['url'],
                    "faculty": faculty_data
                }
                if used_official_staff:
                    # Record that faculty came from the directory, not the
                    # programme page itself.
                    program_entry["faculty_page_override"] = ACCOUNTING_STAFF_URL
                all_programs.append(program_entry)
            result["schools"] = [{
                "name": "Masters Programs",
                "url": courses_url,
                "programs": all_programs
            }]
            if output_callback:
                total_faculty = sum(len(p.get('faculty', [])) for p in all_programs)
                output_callback("info", f"Done! {len(all_programs)} programs, {total_faculty} faculty")
        except Exception as e:
            if output_callback:
                output_callback("error", f"Scraping error: {str(e)}")
        finally:
            await browser.close()
        return result
def log_callback(level, message):
    """Default output callback: echo a scraper message to stdout as "[LEVEL] text"."""
    line = "[{}] {}".format(level.upper(), message)
    print(line)
if __name__ == "__main__":
    # Run the scraper with console logging, print a human-readable summary,
    # then persist the full result as JSON for inspection.
    result = asyncio.run(scrape(output_callback=log_callback))
    print("\n" + "="*60)
    print("Scrape summary:")
    print("="*60)
    if result.get("schools"):
        # The scraper emits a single synthetic "Masters Programs" school.
        school = result["schools"][0]
        programs = school.get("programs", [])
        print(f"Course list URL: {school.get('url')}")
        print(f"Total programs: {len(programs)}")
        faculty_count = sum(len(p.get('faculty', [])) for p in programs)
        print(f"Faculty total: {faculty_count}")
        print("\nTop 10 programs:")
        for i, p in enumerate(programs[:10]):
            print(f" {i+1}. {p['name'][:60]}")
            if p.get("faculty"):
                print(f" Faculty entries: {len(p['faculty'])}")
    # Save the full result regardless of whether any schools were found.
    with open("manchester_test_result.json", "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
    print("\nSaved results to manchester_test_result.json")