Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions
--- a/artifacts/test_manchester_scraper.py
+++ b/artifacts/test_manchester_scraper.py
@ -0,0 +1,464 @@
+"""
+Test Manchester University scraper - improved faculty mapping
+"""
+
+import asyncio
+import json
+from datetime import datetime, timezone
+
+from playwright.async_api import async_playwright
+
+
+# Candidate URL paths for a masters course listing. find_course_list_page()
+# appends each one to the site's base URL and probes them in this order.
+MASTERS_PATHS = [
+    "/study/masters/courses/list/",
+    "/study/masters/courses/",
+    "/postgraduate/taught/courses/",
+    "/postgraduate/courses/list/",
+    "/postgraduate/courses/",
+    "/graduate/programs/",
+    "/academics/graduate/programs/",
+    "/programmes/masters/",
+    "/masters/programmes/",
+    "/admissions/graduate/programs/",
+]
+
+# Staff directory page that load_accounting_staff() scrapes; it is also used
+# as the fallback profile URL for staff rows that carry no link of their own.
+ACCOUNTING_STAFF_URL = "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/"
+# Module-level cache for the scraped staff list. None means "not loaded yet";
+# load_accounting_staff() stores a list here (empty if loading failed).
+ACCOUNTING_STAFF_CACHE = None
+
+
+# Browser-side script: counts the anchors on the current page whose lowercased
+# href looks like a course link, i.e. it contains a path segment of four or
+# more digits, a degree-prefixed slug such as "/msc-", or "/course/" followed
+# by a letter. Returns that count as a number.
+JS_CHECK_COURSES = r"""() => {
+        const links = document.querySelectorAll('a[href]');
+        let courseCount = 0;
+        for (const a of links) {
+            const href = a.href.toLowerCase();
+            if (/\/\d{4,}\//.test(href) ||
+                /\/(msc|ma|mba|mres|llm|med|meng)-/.test(href) ||
+                /\/course\/[a-z]/.test(href)) {
+                courseCount++;
+            }
+        }
+        return courseCount;
+    }"""
+
+# Browser-side script: returns the href of the first anchor that looks like a
+# link to a complete course listing, or null when none is found. An anchor
+# qualifies when its text mentions "a-z", "all course", "full list" or
+# "browse all" (or its href contains "/list"), and its href also contains
+# "master", "course" or "postgrad".
+JS_FIND_LIST_URL = """() => {
+        const links = document.querySelectorAll('a[href]');
+        for (const a of links) {
+            const text = a.innerText.toLowerCase();
+            const href = a.href.toLowerCase();
+            if ((text.includes('a-z') || text.includes('all course') ||
+                 text.includes('full list') || text.includes('browse all') ||
+                 href.includes('/list')) &&
+                (href.includes('master') || href.includes('course') || href.includes('postgrad'))) {
+                return a.href;
+            }
+        }
+        return null;
+    }"""
+
+# Browser-side script: returns the href of the first anchor whose URL contains
+# one of "master" / "postgraduate" / "graduate" and also one of "course" /
+# "program" / "degree"; returns null when no anchor matches.
+# NOTE(review): `text` is computed for every anchor but never used in the
+# condition; only the href is inspected.
+JS_FIND_COURSES_FROM_HOME = """() => {
+        const links = document.querySelectorAll('a[href]');
+        for (const a of links) {
+            const href = a.href.toLowerCase();
+            const text = a.innerText.toLowerCase();
+            if ((href.includes('master') || href.includes('postgraduate') || href.includes('graduate')) &&
+                (href.includes('course') || href.includes('program') || href.includes('degree'))) {
+                return a.href;
+            }
+        }
+        return null;
+    }"""
+
+# Browser-side script: collects {name, url} objects for anchors on the current
+# page that look like individual masters programmes.
+#
+# Filtering happens in this order:
+#   1. skip duplicate hrefs, link text shorter than 5 or longer than 200
+#      characters, and fragment / javascript: / mailto: links;
+#   2. skip links whose hostname is unrelated to the current page's hostname
+#      (ignoring a "www." prefix);
+#   3. skip navigation links (generic menu labels and section landing pages);
+#   4. skip undergraduate / bachelor / PhD / doctoral links;
+#   5. keep the link if its URL has a numeric id of four or more digits, a
+#      degree-prefixed slug, a deep "/course/"-style path, or if its text
+#      names a degree.
+#
+# The degree test on the link text is anchored with \b word boundaries.
+# Without them the short alternatives ("ma", "med", ...) match inside
+# ordinary words such as "Campus maps" or "Media centre", and almost every
+# link on the page would be accepted as a programme.
+JS_EXTRACT_PROGRAMS = r"""() => {
+        const programs = [];
+        const seen = new Set();
+        const currentHost = window.location.hostname;
+
+        document.querySelectorAll('a[href]').forEach(a => {
+            const href = a.href;
+            const text = a.innerText.trim().replace(/\s+/g, ' ');
+
+            if (!href || seen.has(href)) return;
+            if (text.length < 5 || text.length > 200) return;
+            if (href.includes('#') || href.includes('javascript:') || href.includes('mailto:')) return;
+
+            try {
+                const linkHost = new URL(href).hostname;
+                if (!linkHost.includes(currentHost.replace('www.', '')) &&
+                    !currentHost.includes(linkHost.replace('www.', ''))) return;
+            } catch {
+                return;
+            }
+
+            const hrefLower = href.toLowerCase();
+            const textLower = text.toLowerCase();
+
+            const isNavigation = textLower === 'courses' ||
+                                textLower === 'programmes' ||
+                                textLower === 'undergraduate' ||
+                                textLower === 'postgraduate' ||
+                                textLower === 'masters' ||
+                                textLower === "master's" ||
+                                textLower.includes('skip to') ||
+                                textLower.includes('share') ||
+                                textLower === 'home' ||
+                                textLower === 'study' ||
+                                textLower.startsWith('a-z') ||
+                                textLower.includes('admission') ||
+                                textLower.includes('fees and funding') ||
+                                textLower.includes('why should') ||
+                                textLower.includes('why manchester') ||
+                                textLower.includes('teaching and learning') ||
+                                textLower.includes('meet us') ||
+                                textLower.includes('student support') ||
+                                textLower.includes('contact us') ||
+                                textLower.includes('how to apply') ||
+                                hrefLower.includes('/admissions/') ||
+                                hrefLower.includes('/fees-and-funding/') ||
+                                hrefLower.includes('/why-') ||
+                                hrefLower.includes('/meet-us/') ||
+                                hrefLower.includes('/contact-us/') ||
+                                hrefLower.includes('/student-support/') ||
+                                hrefLower.includes('/teaching-and-learning/') ||
+                                hrefLower.endsWith('/courses/') ||
+                                hrefLower.endsWith('/masters/') ||
+                                hrefLower.endsWith('/postgraduate/');
+
+            if (isNavigation) return;
+
+            const isExcluded = hrefLower.includes('/undergraduate') ||
+                              hrefLower.includes('/bachelor') ||
+                              hrefLower.includes('/phd/') ||
+                              hrefLower.includes('/doctoral') ||
+                              hrefLower.includes('/research-degree') ||
+                              textLower.includes('bachelor') ||
+                              textLower.includes('undergraduate') ||
+                              (textLower.includes('phd') && !textLower.includes('mphil'));
+
+            if (isExcluded) return;
+
+            const hasNumericId = /\/\d{4,}\//.test(href);
+            const hasDegreeSlug = /\/(msc|ma|mba|mres|llm|med|meng|mpa|mph|mphil)-[a-z]/.test(hrefLower);
+            const isCoursePage = (hrefLower.includes('/course/') ||
+                                 hrefLower.includes('/courses/list/') ||
+                                 hrefLower.includes('/programme/')) &&
+                                 href.split('/').filter(p => p).length > 4;
+            const textHasDegree = /\b(msc|ma|mba|mres|llm|med|meng|pgcert|pgdip)\b/i.test(text) ||
+                                 textLower.includes('master');
+
+            if (hasNumericId || hasDegreeSlug || isCoursePage || textHasDegree) {
+                seen.add(href);
+                programs.push({
+                    name: text,
+                    url: href
+                });
+            }
+        });
+
+        return programs;
+    }"""
+
+# Browser-side script: collects {name, url} objects for anchors on the current
+# page whose lowercased href contains a staff-like path segment ("/people/",
+# "/staff/", "/faculty/", "/profile/", "/academics/" or "/researcher/").
+# Links are de-duplicated by lowercased href, link text must be 3 to 100
+# characters long, and at most the first 20 matches are returned.
+JS_EXTRACT_FACULTY = r"""() => {
+        const faculty = [];
+        const seen = new Set();
+
+        document.querySelectorAll('a[href]').forEach(a => {
+            const href = a.href.toLowerCase();
+            const text = a.innerText.trim();
+
+            if (seen.has(href)) return;
+            if (text.length < 3 || text.length > 100) return;
+
+            const isStaff = href.includes('/people/') ||
+                           href.includes('/staff/') ||
+                           href.includes('/faculty/') ||
+                           href.includes('/profile/') ||
+                           href.includes('/academics/') ||
+                           href.includes('/researcher/');
+
+            if (isStaff) {
+                seen.add(href);
+                faculty.push({
+                    name: text.replace(/\s+/g, ' '),
+                    url: a.href
+                });
+            }
+        });
+
+        return faculty.slice(0, 20);
+    }"""
+
+# Browser-side script: reads every `table tbody tr` row on the current page
+# and returns a list of {name, title, url, email} objects.
+# Rows with fewer than two cells, or with no resolvable name, are skipped.
+# Column positions are hard-coded: the name (and optional profile link) comes
+# from cell index 1, the title from index 2, and the email from a "mailto:"
+# link in index 5. Missing cells or links yield empty strings.
+# NOTE(review): presumably these indexes mirror the layout of the staff table
+# at ACCOUNTING_STAFF_URL - confirm against the live page.
+JS_EXTRACT_ACCOUNTING_STAFF = r"""() => {
+        const rows = Array.from(document.querySelectorAll('table tbody tr'));
+        const staff = [];
+
+        for (const row of rows) {
+            const cells = row.querySelectorAll('td');
+            if (!cells || cells.length < 2) {
+                continue;
+            }
+
+            const nameCell = cells[1];
+            const roleCell = cells[2];
+            const emailCell = cells[5];
+
+            let profileUrl = '';
+            let displayName = nameCell ? nameCell.innerText.trim() : '';
+            const link = nameCell ? nameCell.querySelector('a[href]') : null;
+            if (link) {
+                profileUrl = link.href;
+                displayName = link.innerText.trim() || displayName;
+            }
+
+            if (!displayName) {
+                continue;
+            }
+
+            let email = '';
+            if (emailCell) {
+                const emailLink = emailCell.querySelector('a[href^="mailto:"]');
+                if (emailLink) {
+                    email = emailLink.href.replace('mailto:', '').trim();
+                }
+            }
+
+            staff.push({
+                name: displayName,
+                title: roleCell ? roleCell.innerText.trim() : '',
+                url: profileUrl,
+                email: email
+            });
+        }
+
+        return staff;
+    }"""
+
+
+def should_use_accounting_staff(program_name: str) -> bool:
+    """Return True when *program_name* contains both "msc" and "accounting".
+
+    The comparison is case-insensitive and substring-based.
+    """
+    normalized = program_name.lower()
+    return all(keyword in normalized for keyword in ("msc", "accounting"))
+
+
+async def load_accounting_staff(context, output_callback=None):
+    """Return the staff list scraped from ``ACCOUNTING_STAFF_URL``, cached per process.
+
+    The first call opens a new page on *context*, navigates to the staff
+    directory, and stores the result of ``JS_EXTRACT_ACCOUNTING_STAFF`` in the
+    module-level ``ACCOUNTING_STAFF_CACHE``. Later calls return the cached
+    value without touching the browser. If loading fails, the error is
+    reported through *output_callback* (when given) and an empty list is
+    cached, so the page is not retried.
+
+    Args:
+        context: Browser context used to open the temporary page.
+        output_callback: Optional ``callable(level, message)`` for progress
+            and error reporting.
+
+    Returns:
+        The cached staff list (empty after a failed load).
+    """
+    global ACCOUNTING_STAFF_CACHE
+
+    if ACCOUNTING_STAFF_CACHE is None:
+        directory_page = await context.new_page()
+        try:
+            if output_callback:
+                output_callback("info", "Loading official AMBS Accounting & Finance staff page...")
+
+            await directory_page.goto(
+                ACCOUNTING_STAFF_URL,
+                wait_until="domcontentloaded",
+                timeout=30000,
+            )
+            # Fixed pause after navigation before the table is read.
+            await directory_page.wait_for_timeout(2000)
+
+            ACCOUNTING_STAFF_CACHE = await directory_page.evaluate(JS_EXTRACT_ACCOUNTING_STAFF)
+
+            if output_callback:
+                output_callback("info", f"Captured {len(ACCOUNTING_STAFF_CACHE)} faculty from the official staff page")
+        except Exception as exc:
+            if output_callback:
+                output_callback("error", f"Failed to load AMBS staff page: {exc}")
+            # Cache the failure as an empty list so later calls do not retry.
+            ACCOUNTING_STAFF_CACHE = []
+        finally:
+            await directory_page.close()
+
+    return ACCOUNTING_STAFF_CACHE
+
+
+async def find_course_list_page(page, base_url, output_callback):
+    """Locate a page that lists masters courses for the site at *base_url*.
+
+    Each entry of ``MASTERS_PATHS`` is appended to *base_url* and opened in
+    turn. A candidate is accepted when it answers with HTTP 200, its title
+    does not look like an error page, and either
+
+    * ``JS_CHECK_COURSES`` counts more than five course-like links on it
+      (the candidate URL itself is returned), or
+    * ``JS_FIND_LIST_URL`` finds a link to a fuller listing (that link is
+      returned).
+
+    If no candidate qualifies, the homepage is opened and
+    ``JS_FIND_COURSES_FROM_HOME`` is used as a last resort.
+
+    Args:
+        page: Browser page used for all navigation; it is left on whichever
+            URL was visited last.
+        base_url: Site root, with or without a trailing slash.
+        output_callback: ``callable(level, message)`` for progress reporting,
+            or a falsy value to stay silent.
+
+    Returns:
+        The URL of the course list, or ``None`` when nothing was found.
+    """
+    site_root = base_url.rstrip('/')
+
+    for path in MASTERS_PATHS:
+        test_url = site_root + path
+        try:
+            response = await page.goto(test_url, wait_until="domcontentloaded", timeout=15000)
+            if not response or response.status != 200:
+                continue
+
+            title = (await page.title()).lower()
+            if '404' in title or 'not found' in title:
+                continue
+
+            has_courses = await page.evaluate(JS_CHECK_COURSES)
+            if has_courses > 5:
+                if output_callback:
+                    output_callback("info", f"Found course list: {path} ({has_courses} courses)")
+                return test_url
+
+            list_url = await page.evaluate(JS_FIND_LIST_URL)
+            if list_url:
+                if output_callback:
+                    output_callback("info", f"Found full course list: {list_url}")
+                return list_url
+        except Exception:
+            # Most candidate paths will not exist on a given site, so timeouts
+            # and navigation errors are expected here: move on to the next
+            # one. `Exception` (not a bare except) keeps task cancellation
+            # and KeyboardInterrupt propagating.
+            continue
+
+    try:
+        await page.goto(base_url, wait_until="domcontentloaded", timeout=30000)
+        await page.wait_for_timeout(2000)
+        courses_url = await page.evaluate(JS_FIND_COURSES_FROM_HOME)
+        if courses_url:
+            return courses_url
+    except Exception as exc:
+        # Still best effort, but report the failure instead of hiding it.
+        if output_callback:
+            output_callback("warning", f"Could not inspect homepage for a course link: {exc}")
+
+    return None
+
+
+async def extract_course_links(page, output_callback):
+    """Run ``JS_EXTRACT_PROGRAMS`` on *page* and return its result.
+
+    *output_callback* is accepted for signature parity with the other
+    helpers but is not used.
+    """
+    program_links = await page.evaluate(JS_EXTRACT_PROGRAMS)
+    return program_links
+
+
+async def _click_load_more(page, max_clicks=3):
+    """Click a "load more"-style control on *page* up to *max_clicks* times.
+
+    Stops early when no such control is present or when a click fails; this
+    is best effort and never raises for ordinary page errors.
+    """
+    selector = 'button:has-text("Load more"), button:has-text("Show more"), button:has-text("View more"), a:has-text("Load more")'
+    for _ in range(max_clicks):
+        try:
+            load_more = page.locator(selector)
+            if await load_more.count() == 0:
+                break
+            await load_more.first.click()
+            await page.wait_for_timeout(2000)
+        except Exception:
+            # A failed click just ends the expansion; `Exception` (not a bare
+            # except) lets task cancellation and KeyboardInterrupt through.
+            break
+
+
+async def scrape(output_callback=None):
+    """Scrape masters programmes and their faculty from manchester.ac.uk.
+
+    Steps:
+      1. Launch headless Chromium and find the course list page
+         (falling back to the homepage when none is found).
+      2. Expand "load more" controls and extract programme links.
+      3. For each programme, collect faculty:
+         * MSc accounting programmes use the official staff directory at
+           ``ACCOUNTING_STAFF_URL``;
+         * everything else - and accounting programmes whose directory came
+           back empty - is scraped from the programme's own page, for at
+           most 30 programmes.
+
+    Errors in the overall flow are reported through *output_callback* and do
+    not propagate; the browser is always closed.
+
+    Args:
+        output_callback: Optional ``callable(level, message)`` receiving
+            "info" / "warning" / "error" progress messages.
+
+    Returns:
+        dict with keys ``name``, ``url``, ``scraped_at`` (UTC ISO-8601) and
+        ``schools``. ``schools`` holds one entry containing all programmes,
+        or stays empty when scraping failed before completion.
+    """
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context(
+            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+        )
+        page = await context.new_page()
+
+        base_url = "https://www.manchester.ac.uk/"
+
+        result = {
+            "name": "Manchester University",
+            "url": base_url,
+            "scraped_at": datetime.now(timezone.utc).isoformat(),
+            "schools": []
+        }
+
+        all_programs = []
+
+        try:
+            if output_callback:
+                output_callback("info", "Searching for masters course list...")
+
+            courses_url = await find_course_list_page(page, base_url, output_callback)
+
+            if not courses_url:
+                if output_callback:
+                    output_callback("warning", "Course list not found, using homepage")
+                courses_url = base_url
+
+            if output_callback:
+                output_callback("info", "Extracting masters programs...")
+
+            await page.goto(courses_url, wait_until="domcontentloaded", timeout=30000)
+            await page.wait_for_timeout(3000)
+
+            await _click_load_more(page)
+
+            programs_data = await extract_course_links(page, output_callback)
+
+            if output_callback:
+                output_callback("info", f"Found {len(programs_data)} masters programs")
+
+            print("\nTop 20 programs:")
+            for i, prog in enumerate(programs_data[:20]):
+                print(f"  {i+1}. {prog['name'][:60]}")
+                print(f"     {prog['url']}")
+
+            # Cap on how many programme detail pages are opened for faculty.
+            max_detail_pages = min(len(programs_data), 30)
+            detailed_processed = 0
+            logged_official_staff = False
+
+            for prog in programs_data:
+                faculty_data = []
+                used_official_staff = False
+
+                if should_use_accounting_staff(prog['name']):
+                    staff_list = await load_accounting_staff(context, output_callback)
+                    if staff_list:
+                        used_official_staff = True
+                        if output_callback and not logged_official_staff:
+                            output_callback("info", "Using Alliance MBS Accounting & Finance staff directory for accounting programmes")
+                            logged_official_staff = True
+                        faculty_data = [
+                            {
+                                "name": person.get("name"),
+                                "url": person.get("url") or ACCOUNTING_STAFF_URL,
+                                "title": person.get("title"),
+                                "email": person.get("email"),
+                                "source": "Alliance Manchester Business School - Accounting & Finance staff"
+                            }
+                            for person in staff_list
+                        ]
+
+                # A separate `if` (not `elif`) so that an accounting programme
+                # whose official directory was empty or failed to load still
+                # gets the generic per-page extraction.
+                if not used_official_staff and detailed_processed < max_detail_pages:
+                    detailed_processed += 1
+                    if output_callback and detailed_processed % 10 == 0:
+                        output_callback("info", f"Processing {detailed_processed}/{max_detail_pages}: {prog['name'][:50]}")
+                    try:
+                        await page.goto(prog['url'], wait_until="domcontentloaded", timeout=15000)
+                        await page.wait_for_timeout(800)
+
+                        faculty_data = await page.evaluate(JS_EXTRACT_FACULTY)
+                    except Exception as e:
+                        if output_callback:
+                            output_callback("warning", f"Failed to capture faculty for {prog['name'][:50]}: {e}")
+                        faculty_data = []
+
+                program_entry = {
+                    "name": prog['name'],
+                    "url": prog['url'],
+                    "faculty": faculty_data
+                }
+
+                if used_official_staff:
+                    program_entry["faculty_page_override"] = ACCOUNTING_STAFF_URL
+
+                all_programs.append(program_entry)
+
+            result["schools"] = [{
+                "name": "Masters Programs",
+                "url": courses_url,
+                "programs": all_programs
+            }]
+
+            if output_callback:
+                total_faculty = sum(len(p.get('faculty', [])) for p in all_programs)
+                output_callback("info", f"Done! {len(all_programs)} programs, {total_faculty} faculty")
+
+        except Exception as e:
+            if output_callback:
+                output_callback("error", f"Scraping error: {str(e)}")
+
+        finally:
+            await browser.close()
+
+        return result
+
+
+def log_callback(level, message):
+    """Print *message* to stdout, prefixed with the upper-cased *level* in brackets."""
+    print("[{}] {}".format(level.upper(), message))
+
+
+if __name__ == "__main__":
+    # Run the scraper with console logging, print a short summary, then dump
+    # the full result to a JSON file in the current working directory.
+    scrape_result = asyncio.run(scrape(output_callback=log_callback))
+
+    divider = "=" * 60
+    print("\n" + divider)
+    print("Scrape summary:")
+    print(divider)
+
+    schools = scrape_result.get("schools")
+    if schools:
+        first_school = schools[0]
+        program_entries = first_school.get("programs", [])
+        print(f"Course list URL: {first_school.get('url')}")
+        print(f"Total programs: {len(program_entries)}")
+
+        faculty_total = 0
+        for entry in program_entries:
+            faculty_total += len(entry.get('faculty', []))
+        print(f"Faculty total: {faculty_total}")
+
+        print("\nTop 10 programs:")
+        for rank, entry in enumerate(program_entries[:10], start=1):
+            print(f"  {rank}. {entry['name'][:60]}")
+            members = entry.get("faculty")
+            if members:
+                print(f"     Faculty entries: {len(members)}")
+
+    with open("manchester_test_result.json", "w", encoding="utf-8") as out_file:
+        json.dump(scrape_result, out_file, indent=2, ensure_ascii=False)
+    print("\nSaved results to manchester_test_result.json")