Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
artifacts/debug_cs_faculty.py (new file, 83 lines)
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""
Debug the Computer Science Faculty page
"""

import asyncio
from playwright.async_api import async_playwright


async def debug_cs():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        # Visit the Computer Science GSAS page
        gsas_url = "https://gsas.harvard.edu/program/computer-science"
        print(f"Visiting: {gsas_url}")

        await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(3000)

        await page.screenshot(path="cs_gsas_page.png", full_page=True)
        print("Screenshot saved: cs_gsas_page.png")

        # Collect all links on the page
        links = await page.evaluate('''() => {
            const links = [];
            document.querySelectorAll('a[href]').forEach(a => {
                const text = a.innerText.trim();
                const href = a.href;
                if (text && text.length > 2 && text.length < 100) {
                    links.push({text: text, href: href});
                }
            });
            return links;
        }''')

        print(f"\nAll links on the page ({len(links)} total):")
        for link in links:
            print(f"  - {link['text'][:60]} -> {link['href']}")

        # Look for likely Faculty or People links
        print("\n\nSearching for Faculty/People related links:")
        for link in links:
            text_lower = link['text'].lower()
            href_lower = link['href'].lower()
            if 'faculty' in text_lower or 'people' in href_lower or 'faculty' in href_lower or 'website' in text_lower:
                print(f"  * {link['text']} -> {link['href']}")

        # Try the SEAS (School of Engineering) page
        print("\n\nTrying the SEAS Computer Science page...")
        seas_url = "https://seas.harvard.edu/computer-science"
        await page.goto(seas_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)

        await page.screenshot(path="seas_cs_page.png", full_page=True)
        print("Screenshot saved: seas_cs_page.png")

        seas_links = await page.evaluate('''() => {
            const links = [];
            document.querySelectorAll('a[href]').forEach(a => {
                const text = a.innerText.trim();
                const href = a.href;
                const lowerText = text.toLowerCase();
                const lowerHref = href.toLowerCase();
                if ((lowerText.includes('faculty') || lowerText.includes('people') ||
                     lowerHref.includes('faculty') || lowerHref.includes('people')) &&
                    text.length > 2) {
                    links.push({text: text, href: href});
                }
            });
            return links;
        }''')

        print(f"\nFaculty/People links on the SEAS page:")
        for link in seas_links:
            print(f"  * {link['text']} -> {link['href']}")

        await browser.close()


if __name__ == "__main__":
    asyncio.run(debug_cs())
artifacts/explore_faculty_page.py (new file, 110 lines)
@@ -0,0 +1,110 @@
"""
Explore the structure of Harvard department People/Faculty pages to collect the supervisor list
"""
import asyncio
from playwright.async_api import async_playwright


async def explore_faculty_page():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        # Visit the AAAS department People page
        people_url = "https://aaas.fas.harvard.edu/aaas-people"
        print(f"Visiting department People page: {people_url}")

        await page.goto(people_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)

        # Save a screenshot
        await page.screenshot(path="aaas_people_page.png", full_page=True)
        print("Screenshot saved: aaas_people_page.png")

        # Collect all faculty links
        faculty_info = await page.evaluate('''() => {
            const faculty = [];

            // Find all links under the /people/ path
            document.querySelectorAll('a[href*="/people/"]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();

                // Filter out navigation links and keep only personal profile pages
                if (href.includes('/people/') && text.length > 3 &&
                    !text.toLowerCase().includes('people') &&
                    !href.endsWith('/people/') &&
                    !href.endsWith('/aaas-people')) {
                    faculty.push({
                        name: text,
                        url: href
                    });
                }
            });

            return faculty;
        }''')

        print(f"\nFound {len(faculty_info)} faculty members:")
        for f in faculty_info:
            print(f"  - {f['name']} -> {f['url']}")

        # Try the Economics department Faculty page
        print("\n\n========== Trying the Economics department Faculty page ==========")
        econ_faculty_url = "http://economics.harvard.edu/people/people-type/faculty"
        print(f"Visiting: {econ_faculty_url}")

        await page.goto(econ_faculty_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)

        await page.screenshot(path="econ_faculty_page.png", full_page=True)
        print("Screenshot saved: econ_faculty_page.png")

        econ_faculty = await page.evaluate('''() => {
            const faculty = [];

            // Find all likely faculty links
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                const lowerHref = href.toLowerCase();

                // Look for personal profile links
                if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
                     lowerHref.includes('/profile/')) &&
                    text.length > 3 && text.length < 100 &&
                    !text.toLowerCase().includes('faculty') &&
                    !text.toLowerCase().includes('people')) {
                    faculty.push({
                        name: text,
                        url: href
                    });
                }
            });

            return faculty;
        }''')

        print(f"\nFound {len(econ_faculty)} faculty members:")
        for f in econ_faculty[:30]:
            print(f"  - {f['name']} -> {f['url']}")

        # Dump all links on the page for debugging
        print("\n\nAll links on the page:")
        all_links = await page.evaluate('''() => {
            const links = [];
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                if (text && text.length > 2 && text.length < 100) {
                    links.push({text: text, href: href});
                }
            });
            return links;
        }''')
        for link in all_links[:40]:
            print(f"  - {link['text'][:50]} -> {link['href']}")

        await browser.close()


if __name__ == "__main__":
    asyncio.run(explore_faculty_page())
artifacts/explore_manchester.py (new file, 173 lines)
@@ -0,0 +1,173 @@
"""
Explore the structure of the University of Manchester masters course pages
"""

import asyncio
import json
from playwright.async_api import async_playwright


async def explore_manchester():
    """Explore the structure of the University of Manchester website"""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )
        page = await context.new_page()

        # Go straight to the masters courses A-Z list page
        print("Visiting the masters courses A-Z list page...")
        await page.goto("https://www.manchester.ac.uk/study/masters/courses/list/",
                        wait_until="domcontentloaded", timeout=60000)
        await page.wait_for_timeout(5000)

        # Screenshot
        await page.screenshot(path="manchester_masters_page.png", full_page=False)
        print("Screenshot saved: manchester_masters_page.png")

        # Analyze the page structure
        page_info = await page.evaluate("""() => {
            const info = {
                title: document.title,
                url: window.location.href,
                all_links: [],
                course_candidates: [],
                page_sections: []
            };

            // Collect all links
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href;
                const text = a.innerText.trim().substring(0, 100);
                if (href && text) {
                    info.all_links.push({href, text});
                }
            });

            // Find likely course links - containing /course/ or list-item
            document.querySelectorAll('a[href*="/course/"], .course-link, [class*="course"] a, .search-result a, .list-item a').forEach(a => {
                info.course_candidates.push({
                    href: a.href,
                    text: a.innerText.trim().substring(0, 100),
                    classes: a.className,
                    parent_classes: a.parentElement?.className || ''
                });
            });

            // Collect the main page sections
            document.querySelectorAll('main, [role="main"], .content, #content, .results, .course-list').forEach(el => {
                info.page_sections.push({
                    tag: el.tagName,
                    id: el.id,
                    classes: el.className,
                    children_count: el.children.length
                });
            });

            return info;
        }""")

        print(f"\nPage title: {page_info['title']}")
        print(f"Current URL: {page_info['url']}")
        print(f"\nTotal links: {len(page_info['all_links'])}")
        print(f"Course candidate links: {len(page_info['course_candidates'])}")

        # Find links containing masters/courses/
        masters_links = [l for l in page_info['all_links']
                         if 'masters/courses/' in l['href'].lower()
                         and l['href'] != page_info['url']]

        print(f"\nMasters course related links ({len(masters_links)}):")
        for link in masters_links[:20]:
            print(f"  - {link['text'][:50]}: {link['href']}")

        print(f"\nCourse candidate details:")
        for c in page_info['course_candidates'][:10]:
            print(f"  - {c['text'][:50]}")
            print(f"    URL: {c['href']}")
            print(f"    Classes: {c['classes']}")

        # Check for search/filter controls
        search_elements = await page.evaluate("""() => {
            const elements = [];
            document.querySelectorAll('input[type="search"], input[type="text"], select, .filter, .search').forEach(el => {
                elements.push({
                    tag: el.tagName,
                    type: el.type || '',
                    id: el.id,
                    name: el.name || '',
                    classes: el.className
                });
            });
            return elements;
        }""")

        print(f"\nSearch/filter elements: {len(search_elements)}")
        for el in search_elements[:5]:
            print(f"  - {el}")

        # Try to find the actual structure of the course list
        print("\n\nAnalyzing the course list structure on the page...")

        list_structures = await page.evaluate("""() => {
            const structures = [];

            // Try various likely list structures
            const selectors = [
                'ul li a[href*="course"]',
                'div[class*="result"] a',
                'div[class*="course"] a',
                'article a[href]',
                '.search-results a',
                '[data-course] a',
                'table tr td a'
            ];

            for (const selector of selectors) {
                const elements = document.querySelectorAll(selector);
                if (elements.length > 0) {
                    const samples = [];
                    elements.forEach((el, i) => {
                        if (i < 5) {
                            samples.push({
                                href: el.href,
                                text: el.innerText.trim().substring(0, 80)
                            });
                        }
                    });
                    structures.push({
                        selector: selector,
                        count: elements.length,
                        samples: samples
                    });
                }
            }

            return structures;
        }""")

        print("\nList structures found:")
        for s in list_structures:
            print(f"\n  Selector: {s['selector']} ({s['count']} matches)")
            for sample in s['samples']:
                print(f"    - {sample['text']}: {sample['href']}")

        # Save the full analysis
        with open("manchester_analysis.json", "w", encoding="utf-8") as f:
            json.dump(page_info, f, indent=2, ensure_ascii=False)

        print("\n\nFull analysis saved to manchester_analysis.json")

        # Wait so the user can inspect the browser
        print("\nPress Ctrl+C to close the browser...")
        try:
            await asyncio.sleep(30)
        except:
            pass

        await browser.close()


if __name__ == "__main__":
    asyncio.run(explore_manchester())
artifacts/explore_program_page.py (new file, 226 lines)
@@ -0,0 +1,226 @@
"""
Explore the structure of Harvard program pages to find supervisor information
"""
import asyncio
from playwright.async_api import async_playwright


async def explore_program_page():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        # Visit a graduate program page (GSAS)
        gsas_url = "https://gsas.harvard.edu/program/african-and-african-american-studies"
        print(f"Visiting graduate program page: {gsas_url}")

        await page.goto(gsas_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)

        # Save a screenshot
        await page.screenshot(path="gsas_program_page.png", full_page=True)
        print("Screenshot saved: gsas_program_page.png")

        # Analyze the page structure
        page_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                h1: document.querySelector('h1')?.innerText || '',
                allHeadings: [],
                facultyLinks: [],
                peopleLinks: [],
                allLinks: []
            };

            // Collect all headings
            document.querySelectorAll('h1, h2, h3, h4').forEach(h => {
                info.allHeadings.push({
                    tag: h.tagName,
                    text: h.innerText.trim().substring(0, 100)
                });
            });

            // Inspect all links
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();

                // Check whether the link relates to faculty
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();

                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                    lowerHref.includes('professor') || lowerHref.includes('staff') ||
                    lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }

                // Check whether the link is a personal profile page
                if (href.includes('/people/') || href.includes('/faculty/') ||
                    href.includes('/profile/') || href.includes('/person/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }

                // Keep all major links
                if (href && text.length > 2 && text.length < 150) {
                    info.allLinks.push({
                        text: text,
                        href: href
                    });
                }
            });

            return info;
        }''')

        print(f"\nPage title: {page_info['title']}")
        print(f"H1: {page_info['h1']}")

        print(f"\nAll headings ({len(page_info['allHeadings'])}):")
        for h in page_info['allHeadings']:
            print(f"  <{h['tag']}>: {h['text']}")

        print(f"\nFaculty related links ({len(page_info['facultyLinks'])}):")
        for f in page_info['facultyLinks']:
            print(f"  - {f['text']} -> {f['href']}")

        print(f"\nPersonal profile links ({len(page_info['peopleLinks'])}):")
        for p in page_info['peopleLinks']:
            print(f"  - {p['text']} -> {p['href']}")

        print(f"\nAll links ({len(page_info['allLinks'])}):")
        for link in page_info['allLinks'][:50]:
            print(f"  - {link['text'][:60]} -> {link['href']}")

        # Try another program page to see whether the structure differs
        print("\n\n========== Trying another program page ==========")
        economics_url = "https://gsas.harvard.edu/program/economics"
        print(f"Visiting: {economics_url}")

        await page.goto(economics_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)

        # Save a screenshot
        await page.screenshot(path="gsas_economics_page.png", full_page=True)
        print("Screenshot saved: gsas_economics_page.png")

        # Analyze
        econ_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                facultyLinks: [],
                peopleLinks: []
            };

            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();

                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                    lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }

                if (href.includes('/people/') || href.includes('/faculty/') ||
                    href.includes('/profile/') || href.includes('/person/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
            });

            return info;
        }''')

        print(f"\nFaculty related links ({len(econ_info['facultyLinks'])}):")
        for f in econ_info['facultyLinks']:
            print(f"  - {f['text']} -> {f['href']}")

        print(f"\nPersonal profile links ({len(econ_info['peopleLinks'])}):")
        for p in econ_info['peopleLinks']:
            print(f"  - {p['text']} -> {p['href']}")

        # Visit the department homepage to look for a Faculty page
        print("\n\n========== Trying the department homepage ==========")
        dept_url = "https://aaas.fas.harvard.edu/"
        print(f"Visiting department homepage: {dept_url}")

        await page.goto(dept_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)

        await page.screenshot(path="aaas_dept_page.png", full_page=True)
        print("Screenshot saved: aaas_dept_page.png")

        dept_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                navLinks: [],
                facultyLinks: [],
                peopleLinks: []
            };

            // Collect navigation links
            document.querySelectorAll('nav a, [class*="nav"] a, [class*="menu"] a').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                if (text && text.length > 1 && text.length < 50) {
                    info.navLinks.push({
                        text: text,
                        href: href
                    });
                }
            });

            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();

                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                    lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }

                if (href.includes('/people/') || href.includes('/faculty/') ||
                    href.includes('/profile/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
            });

            return info;
        }''')

        print(f"\nNavigation links ({len(dept_info['navLinks'])}):")
        for link in dept_info['navLinks'][:20]:
            print(f"  - {link['text']} -> {link['href']}")

        print(f"\nFaculty related links ({len(dept_info['facultyLinks'])}):")
        for f in dept_info['facultyLinks']:
            print(f"  - {f['text']} -> {f['href']}")

        print(f"\nPersonal profile links ({len(dept_info['peopleLinks'])}):")
        for p in dept_info['peopleLinks'][:30]:
            print(f"  - {p['text']} -> {p['href']}")

        await browser.close()


if __name__ == "__main__":
    asyncio.run(explore_program_page())
@@ -125,6 +125,7 @@ class ScrapeSettings:
     output: Path
     verify_links: bool = True
     request_delay: float = 1.0  # Polite crawling delay
+    timeout: int = 60000  # Navigation timeout in ms
 
 
 async def extract_links(page: Page) -> List[Tuple[str, str]]:
@@ -210,7 +211,7 @@ async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink]:
         page = await context.new_page()
         try:
             response = await page.goto(
-                normalized_url, wait_until="domcontentloaded", timeout=20000
+                normalized_url, wait_until="domcontentloaded", timeout=settings.timeout
             )
             if not response or response.status >= 400:
                 await page.close()
@@ -411,6 +412,12 @@ def parse_args() -> argparse.Namespace:
         default=1.0,
         help="Delay between requests in seconds (polite crawling).",
     )
+    parser.add_argument(
+        "--timeout",
+        type=int,
+        default=60000,
+        help="Navigation timeout in milliseconds (default: 60000 = 60s).",
+    )
     return parser.parse_args()
 
 
@@ -424,6 +431,7 @@ async def main_async() -> None:
         output=args.output,
         verify_links=not args.no_verify,
         request_delay=args.delay,
+        timeout=args.timeout,
     )
     links = await crawl(settings, browser_name=args.browser)
     serialize(links, settings.output, settings.root_url)
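A possible invocation of the updated CLI, as a minimal sketch: the script name and the positional URL argument are assumptions (they are not shown in the hunks above); only the --timeout, --delay, --output, --browser, and --no-verify options appear in the changed code.

    # Hypothetical usage: raise the navigation timeout to 120s for a slow site
    python scraper.py https://gsas.harvard.edu --timeout 120000 --delay 2.0 --output links.json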
artifacts/harvard_programs_scraper.py (new file, 466 lines)
@@ -0,0 +1,466 @@
#!/usr/bin/env python3
"""
Harvard Graduate Programs Scraper
Scrapes every graduate program listed on https://www.harvard.edu/programs/?degree_levels=graduate
by clicking through the pagination buttons.
"""

import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright


async def scrape_harvard_programs():
    """Scrape the Harvard graduate program list by clicking through the pagination buttons"""

    all_programs = []
    base_url = "https://www.harvard.edu/programs/?degree_levels=graduate"

    async with async_playwright() as p:
        # Run headless
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={'width': 1920, 'height': 1080}
        )
        page = await context.new_page()

        print(f"Visiting: {base_url}")
        # Use domcontentloaded rather than networkidle for faster loading
        await page.goto(base_url, wait_until="domcontentloaded", timeout=60000)
        # Wait for the page content to load
        await page.wait_for_timeout(5000)

        # Scroll to the bottom so the pagination buttons are loaded
        print("Scrolling to the bottom of the page...")
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(2000)

        current_page = 1
        max_pages = 15

        while current_page <= max_pages:
            print(f"\n========== Page {current_page} ==========")

            # Wait for content to load
            await page.wait_for_timeout(2000)

            # Extract the programs on the current page.
            # Debug output showed the program buttons use the class
            # 'records__record___PbPhG c-programs-item__title-link'.
            # Harvard uses JavaScript navigation, so URLs have to be found by clicking buttons.

            # First collect all program button information
            page_data = await page.evaluate('''() => {
                const programs = [];

                // Find all program rows/containers
                const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]');

                programItems.forEach((item, index) => {
                    // Get the program name button
                    const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]');
                    if (!nameBtn) return;

                    const name = nameBtn.innerText.trim();
                    if (!name || name.length < 3) return;

                    // Extract degree information
                    let degrees = '';
                    const allText = item.innerText;
                    const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g);
                    if (degreeMatch) {
                        degrees = degreeMatch.join(', ');
                    }

                    // Look for a link - check every likely location
                    let url = '';

                    // Method 1: look for an <a> tag
                    const link = item.querySelector('a[href]');
                    if (link && link.href) {
                        url = link.href;
                    }

                    // Method 2: check data attributes
                    if (!url) {
                        const dataUrl = nameBtn.getAttribute('data-url') ||
                                        nameBtn.getAttribute('data-href') ||
                                        item.getAttribute('data-url');
                        if (dataUrl) url = dataUrl;
                    }

                    // Method 3: check the onclick attribute
                    if (!url) {
                        const onclick = nameBtn.getAttribute('onclick') || '';
                        const urlMatch = onclick.match(/['"]([^'"]*\\/programs\\/[^'"]*)['"]/);
                        if (urlMatch) url = urlMatch[1];
                    }

                    programs.push({
                        name: name,
                        degrees: degrees,
                        url: url,
                        index: index
                    });
                });

                // If the first approach found nothing, use a fallback
                if (programs.length === 0) {
                    // Look for all program buttons
                    const buttons = document.querySelectorAll('button');
                    buttons.forEach((btn, index) => {
                        const className = btn.className || '';
                        if (className.includes('c-programs-item') || className.includes('title-link')) {
                            const name = btn.innerText.trim();
                            if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) {
                                programs.push({
                                    name: name,
                                    degrees: '',
                                    url: '',
                                    index: index
                                });
                            }
                        }
                    });
                }

                return {
                    programs: programs,
                    totalFound: programs.length
                };
            }''')

            # On the first page, dump the HTML structure if nothing was found
            if current_page == 1 and len(page_data['programs']) == 0:
                print("No programs found, dumping HTML structure for debugging...")
                html_debug = await page.evaluate('''() => {
                    const debug = {
                        allButtons: [],
                        allLinks: [],
                        sampleHTML: ''
                    };

                    // Collect all buttons
                    document.querySelectorAll('button').forEach(btn => {
                        const text = btn.innerText.trim().substring(0, 50);
                        if (text && text.length > 3) {
                            debug.allButtons.push({
                                text: text,
                                class: btn.className.substring(0, 80)
                            });
                        }
                    });

                    // Grab an HTML snippet of the main area
                    const main = document.querySelector('main') || document.body;
                    debug.sampleHTML = main.innerHTML.substring(0, 3000);

                    return debug;
                }''')
                print(f"Found {len(html_debug['allButtons'])} buttons:")
                for btn in html_debug['allButtons'][:20]:
                    print(f"  - {btn['text']} | class: {btn['class']}")
                print(f"\nHTML snippet:\n{html_debug['sampleHTML'][:1500]}")

            print(f"  Found {len(page_data['programs'])} programs on this page")

            # Print the programs found
            for prog in page_data['programs']:
                print(f"  - {prog['name']} ({prog['degrees']})")

            # Add to the overall list (deduplicated)
            for prog in page_data['programs']:
                name = prog['name'].strip()
                if name and not any(p['name'] == name for p in all_programs):
                    all_programs.append({
                        'name': name,
                        'degrees': prog.get('degrees', ''),
                        'url': prog.get('url', ''),
                        'page': current_page
                    })

            # Try to click the next-page button
            try:
                clicked = False

                # On the first page, dump all pagination-related elements for debugging
                if current_page == 1:
                    # Save a screenshot for debugging
                    await page.screenshot(path="harvard_debug_pagination.png", full_page=True)
                    print("Debug screenshot saved: harvard_debug_pagination.png")

                    pagination_info = await page.evaluate('''() => {
                        const result = {
                            links: [],
                            buttons: [],
                            allClickable: [],
                            pageNumbers: [],
                            allText: []
                        };

                        // Find all links
                        document.querySelectorAll('a').forEach(a => {
                            const text = a.innerText.trim();
                            if (text.match(/^[0-9]+$|Next|page|Prev/i)) {
                                result.links.push({
                                    text: text.substring(0, 50),
                                    href: a.href,
                                    visible: a.offsetParent !== null,
                                    className: a.className
                                });
                            }
                        });

                        // Find all buttons
                        document.querySelectorAll('button').forEach(b => {
                            const text = b.innerText.trim();
                            if (text.match(/^[0-9]+$|Next|page|Prev/i) || text.length < 20) {
                                result.buttons.push({
                                    text: text.substring(0, 50),
                                    visible: b.offsetParent !== null,
                                    className: b.className
                                });
                            }
                        });

                        // Find all clickable elements containing a digit (possible pagination)
                        document.querySelectorAll('a, button, span[role="button"], div[role="button"], li a, nav a').forEach(el => {
                            const text = el.innerText.trim();
                            if (text.match(/^[0-9]$/) || text === 'Next page' || text.includes('Next')) {
                                result.pageNumbers.push({
                                    tag: el.tagName,
                                    text: text,
                                    className: el.className,
                                    id: el.id,
                                    ariaLabel: el.getAttribute('aria-label'),
                                    visible: el.offsetParent !== null
                                });
                            }
                        });

                        // Find all clickable elements in the lower part of the page
                        const bodyRect = document.body.getBoundingClientRect();
                        document.querySelectorAll('*').forEach(el => {
                            const rect = el.getBoundingClientRect();
                            const text = el.innerText?.trim() || '';
                            // Only look at elements in the lower half of the page with short text
                            if (rect.top > bodyRect.height * 0.5 && text.length > 0 && text.length < 30) {
                                const style = window.getComputedStyle(el);
                                if (style.cursor === 'pointer' || el.tagName === 'A' || el.tagName === 'BUTTON') {
                                    result.allClickable.push({
                                        tag: el.tagName,
                                        text: text.substring(0, 30),
                                        top: Math.round(rect.top),
                                        className: el.className?.substring?.(0, 50) || ''
                                    });
                                }
                            }
                        });

                        // Dump page text at the bottom of the page for debugging
                        const bodyText = document.body.innerText;
                        const lines = bodyText.split('\\n').filter(l => l.trim());
                        // Find lines containing the digits 1-9
                        for (let i = 0; i < lines.length; i++) {
                            if (lines[i].match(/^[1-9]$|Next page|Previous/)) {
                                result.allText.push(lines[i]);
                            }
                        }

                        return result;
                    }''')
                    print(f"\nPagination related links ({len(pagination_info['links'])}):")
                    for link in pagination_info['links']:
                        print(f"  a: '{link['text']}' class='{link.get('className', '')}' (visible: {link['visible']})")
                    print(f"\nPagination related buttons ({len(pagination_info['buttons'])}):")
                    for btn in pagination_info['buttons']:
                        print(f"  button: '{btn['text']}' class='{btn.get('className', '')}' (visible: {btn['visible']})")
                    print(f"\nPage number elements ({len(pagination_info['pageNumbers'])}):")
                    for pn in pagination_info['pageNumbers']:
                        print(f"  {pn['tag']}: '{pn['text']}' aria-label='{pn.get('ariaLabel')}' visible={pn['visible']}")
                    print(f"\nClickable elements in the lower half of the page ({len(pagination_info['allClickable'])}):")
                    for el in pagination_info['allClickable'][:30]:
                        print(f"  {el['tag']}: '{el['text']}' (top: {el['top']})")
                    print(f"\nPagination text on the page ({len(pagination_info['allText'])}):")
                    for txt in pagination_info['allText'][:20]:
                        print(f"  '{txt}'")

                # Method 1: find the "Next page" button directly with a CSS selector (most reliable)
                # Debug output showed the pagination button is <button class="c-pagination__link c-pagination__link--next">
                next_page_num = str(current_page + 1)

                try:
                    next_btn = page.locator('button.c-pagination__link--next')
                    if await next_btn.count() > 0:
                        print(f"\nFound 'Next page' button (CSS selector), clicking...")
                        await next_btn.first.scroll_into_view_if_needed()
                        await next_btn.first.click()
                        await page.wait_for_timeout(3000)
                        current_page += 1
                        clicked = True
                except Exception as e:
                    print(f"Method 1 failed: {e}")

                if clicked:
                    continue

                # Method 2: find the button via get_by_role
                try:
                    next_btn = page.get_by_role("button", name="Next page")
                    if await next_btn.count() > 0:
                        print(f"\nFound 'Next page' button via role, clicking...")
                        await next_btn.first.scroll_into_view_if_needed()
                        await next_btn.first.click()
                        await page.wait_for_timeout(3000)
                        current_page += 1
                        clicked = True
                except Exception as e:
                    print(f"Method 2 failed: {e}")

                if clicked:
                    continue

                # Method 3: iterate over all pagination buttons and click "Next page"
                try:
                    pagination_buttons = await page.query_selector_all('button.c-pagination__link')
                    for btn in pagination_buttons:
                        text = await btn.inner_text()
                        if 'Next page' in text:
                            print(f"\nFound 'Next page' while iterating pagination buttons, clicking...")
                            await btn.scroll_into_view_if_needed()
                            await btn.click()
                            await page.wait_for_timeout(3000)
                            current_page += 1
                            clicked = True
                            break
                except Exception as e:
                    print(f"Method 3 failed: {e}")

                if clicked:
                    continue

                # Method 4: click the pagination button directly via JavaScript
                try:
                    js_clicked = await page.evaluate('''() => {
                        // Find the Next page button
                        const nextBtn = document.querySelector('button.c-pagination__link--next');
                        if (nextBtn) {
                            nextBtn.click();
                            return true;
                        }
                        // Fallback: check every pagination button
                        const buttons = document.querySelectorAll('button.c-pagination__link');
                        for (const btn of buttons) {
                            if (btn.innerText.includes('Next page')) {
                                btn.click();
                                return true;
                            }
                        }
                        return false;
                    }''')
                    if js_clicked:
                        print(f"\nClicked 'Next page' via JavaScript")
                        await page.wait_for_timeout(3000)
                        current_page += 1
                        clicked = True
                except Exception as e:
                    print(f"Method 4 failed: {e}")

                if clicked:
                    continue

                # Method 5: iterate over every button on the page
                try:
                    all_buttons = await page.query_selector_all('button')
                    for btn in all_buttons:
                        try:
                            text = await btn.inner_text()
                            if 'Next page' in text:
                                visible = await btn.is_visible()
                                if visible:
                                    print(f"\nFound 'Next page' while iterating all buttons, clicking...")
                                    await btn.scroll_into_view_if_needed()
                                    await btn.click()
                                    await page.wait_for_timeout(3000)
                                    current_page += 1
                                    clicked = True
                                    break
                        except:
                            continue
                except Exception as e:
                    print(f"Method 5 failed: {e}")

                if clicked:
                    continue

                print("No next-page button found, stopping")
                break

            except Exception as e:
                print(f"Error while clicking the next page: {e}")
                break

        # Build program URLs - Harvard program URLs have the form:
        # https://www.harvard.edu/programs/{program-name-slug}/
        # e.g. african-and-african-american-studies

        import re

        def name_to_slug(name):
            """Convert a program name into a URL slug"""
            # Lowercase
            slug = name.lower()
            # Strip special characters
            slug = re.sub(r'[^\w\s-]', '', slug)
            # Replace whitespace with hyphens
            slug = re.sub(r'[\s_]+', '-', slug)
            # Collapse repeated hyphens
            slug = re.sub(r'-+', '-', slug)
            # Trim leading/trailing hyphens
            slug = slug.strip('-')
            return slug

        print("\nBuilding program URLs...")
        for prog in all_programs:
            slug = name_to_slug(prog['name'])
            prog['url'] = f"https://www.harvard.edu/programs/{slug}/"
            print(f"  {prog['name']} -> {prog['url']}")

        await browser.close()

    # Sort
    programs = sorted(all_programs, key=lambda x: x['name'])

    # Save
    result = {
        'source_url': base_url,
        'scraped_at': datetime.now(timezone.utc).isoformat(),
        'total_pages_scraped': current_page,
        'total_programs': len(programs),
        'programs': programs
    }

    output_file = Path('harvard_programs_results.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n{'='*60}")
    print(f"Scrape finished!")
    print(f"Scraped {current_page} pages")
    print(f"Found {len(programs)} graduate programs")
    print(f"Results saved to: {output_file}")
    print(f"{'='*60}")

    # Print the full list
    print("\nFull list of graduate programs:")
    for i, prog in enumerate(programs, 1):
        print(f"{i:3}. {prog['name']} - {prog['degrees']}")

    return result


if __name__ == "__main__":
    asyncio.run(scrape_harvard_programs())
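A quick sanity check of the slug logic above, as a minimal sketch: the first expected value comes from the GSAS URL example quoted in the script's comments; the second case is purely illustrative.

    # Assumes name_to_slug from the script above is importable
    assert name_to_slug("African and African American Studies") == "african-and-african-american-studies"
    assert name_to_slug("Engineering & Applied Sciences") == "engineering-applied-sciences"  # hypothetical input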
artifacts/harvard_programs_with_faculty_scraper.py (new file, 356 lines)
@@ -0,0 +1,356 @@
#!/usr/bin/env python3
"""
Harvard Graduate Programs Scraper with Faculty Information
Scrapes every graduate program from https://www.harvard.edu/programs/?degree_levels=graduate
and collects the URLs of each program's faculty profile pages.
"""

import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright


def name_to_slug(name):
    """Convert a program name into a URL slug"""
    slug = name.lower()
    slug = re.sub(r'[^\w\s-]', '', slug)
    slug = re.sub(r'[\s_]+', '-', slug)
    slug = re.sub(r'-+', '-', slug)
    slug = slug.strip('-')
    return slug


async def extract_faculty_from_page(page):
    """Extract all faculty links from the current page"""
    faculty_list = await page.evaluate('''() => {
        const faculty = [];
        const seen = new Set();

        document.querySelectorAll('a[href]').forEach(a => {
            const href = a.href || '';
            const text = a.innerText.trim();
            const lowerHref = href.toLowerCase();
            const lowerText = text.toLowerCase();

            // Check whether the link is a personal profile page
            if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
                 lowerHref.includes('/profile/') || lowerHref.includes('/person/')) &&
                text.length > 3 && text.length < 100 &&
                !lowerText.includes('people') &&
                !lowerText.includes('faculty') &&
                !lowerText.includes('profile') &&
                !lowerText.includes('staff') &&
                !lowerHref.endsWith('/people/') &&
                !lowerHref.endsWith('/people') &&
                !lowerHref.endsWith('/faculty/') &&
                !lowerHref.endsWith('/faculty')) {

                if (!seen.has(href)) {
                    seen.add(href);
                    faculty.push({
                        name: text,
                        url: href
                    });
                }
            }
        });

        return faculty;
    }''')
    return faculty_list


async def get_faculty_from_gsas_page(page, gsas_url, program_name):
    """Get the Faculty link from a GSAS program page, then visit the department People page to collect the supervisor list"""
    faculty_list = []
    faculty_page_url = None

    try:
        print(f"  Visiting GSAS page: {gsas_url}")
        await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)

        # Strategy 1: look for a "See list of ... faculty" link
        faculty_link = await page.evaluate('''() => {
            const links = document.querySelectorAll('a[href]');
            for (const link of links) {
                const text = link.innerText.toLowerCase();
                const href = link.href;
                if (text.includes('faculty') && text.includes('see list')) {
                    return href;
                }
            }
            return null;
        }''')

        # Strategy 2: look for any link containing /people or /faculty
        if not faculty_link:
            faculty_link = await page.evaluate('''() => {
                const links = document.querySelectorAll('a[href]');
                for (const link of links) {
                    const text = link.innerText.toLowerCase();
                    const href = link.href.toLowerCase();
                    // Look for Faculty related links
                    if ((text.includes('faculty') || text.includes('people')) &&
                        (href.includes('/people') || href.includes('/faculty'))) {
                        return link.href;
                    }
                }
                return null;
            }''')

        # Strategy 3: find the department website link on the page, then try its People page
        if not faculty_link:
            dept_website = await page.evaluate('''() => {
                const links = document.querySelectorAll('a[href]');
                for (const link of links) {
                    const text = link.innerText.toLowerCase();
                    const href = link.href;
                    // Look for a Website link (usually points at the department homepage)
                    if (text.includes('website') && href.includes('harvard.edu') &&
                        !href.includes('gsas.harvard.edu')) {
                        return href;
                    }
                }
                return null;
            }''')

            if dept_website:
                print(f"  Found department website: {dept_website}")
                try:
                    await page.goto(dept_website, wait_until="domcontentloaded", timeout=30000)
                    await page.wait_for_timeout(2000)

                    # Look for a People/Faculty link on the department website
                    faculty_link = await page.evaluate('''() => {
                        const links = document.querySelectorAll('a[href]');
                        for (const link of links) {
                            const text = link.innerText.toLowerCase().trim();
                            const href = link.href;
                            if ((text === 'people' || text === 'faculty' ||
                                 text === 'faculty & research' || text.includes('our faculty')) &&
                                (href.includes('/people') || href.includes('/faculty'))) {
                                return href;
                            }
                        }
                        return null;
                    }''')
                except Exception as e:
                    print(f"  Failed to visit department website: {e}")

        if faculty_link:
            faculty_page_url = faculty_link
            print(f"  Found Faculty page: {faculty_link}")

            # Visit the Faculty/People page
            await page.goto(faculty_link, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(2000)

            # Extract all supervisor information
            faculty_list = await extract_faculty_from_page(page)

            # If nothing was found on the first pass, handle pagination or other layouts
            if len(faculty_list) == 0:
                # Some pages may need extra clicks or JavaScript-driven loading
                await page.wait_for_timeout(2000)
                faculty_list = await extract_faculty_from_page(page)

            print(f"  Found {len(faculty_list)} supervisors")
        else:
            print(f"  No Faculty page link found")

    except Exception as e:
        print(f"  Failed to get faculty information: {e}")

    return faculty_list, faculty_page_url


async def scrape_harvard_programs_with_faculty():
    """Scrape the Harvard graduate program list together with supervisor information"""

    all_programs = []
    base_url = "https://www.harvard.edu/programs/?degree_levels=graduate"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={'width': 1920, 'height': 1080}
        )
        page = await context.new_page()

        print(f"Visiting: {base_url}")
        await page.goto(base_url, wait_until="domcontentloaded", timeout=60000)
        await page.wait_for_timeout(5000)

        # Scroll to the bottom of the page
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(2000)

        current_page = 1
        max_pages = 15

        # Phase 1: collect basic information for every program
        print("\n========== Phase 1: collecting the program list ==========")
        while current_page <= max_pages:
            print(f"\n--- Page {current_page} ---")
            await page.wait_for_timeout(2000)

            # Extract the programs on the current page
            page_data = await page.evaluate('''() => {
                const programs = [];
                const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]');

                programItems.forEach((item, index) => {
                    const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]');
                    if (!nameBtn) return;

                    const name = nameBtn.innerText.trim();
                    if (!name || name.length < 3) return;

                    let degrees = '';
                    const allText = item.innerText;
                    const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g);
                    if (degreeMatch) {
                        degrees = degreeMatch.join(', ');
                    }

                    programs.push({
                        name: name,
                        degrees: degrees
                    });
                });

                if (programs.length === 0) {
                    const buttons = document.querySelectorAll('button');
                    buttons.forEach((btn) => {
                        const className = btn.className || '';
                        if (className.includes('c-programs-item') || className.includes('title-link')) {
                            const name = btn.innerText.trim();
                            if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) {
                                programs.push({
                                    name: name,
                                    degrees: ''
                                });
                            }
                        }
                    });
                }

                return programs;
            }''')

            print(f"  Found {len(page_data)} programs on this page")

            for prog in page_data:
                name = prog['name'].strip()
                if name and not any(p['name'] == name for p in all_programs):
                    all_programs.append({
                        'name': name,
                        'degrees': prog.get('degrees', ''),
                        'page': current_page
                    })

            # Try to click the next page
            try:
                next_btn = page.locator('button.c-pagination__link--next')
                if await next_btn.count() > 0:
                    await next_btn.first.scroll_into_view_if_needed()
                    await next_btn.first.click()
                    await page.wait_for_timeout(3000)
                    current_page += 1
                else:
                    print("No next-page button, stopping collection")
                    break
            except Exception as e:
                print(f"Pagination failed: {e}")
                break

        print(f"\nCollected {len(all_programs)} programs in total")

        # Phase 2: fetch supervisor information for every program
        print("\n========== Phase 2: collecting supervisor information ==========")
        print("Note: this visits the GSAS page of every program and may take a while...")

        for i, prog in enumerate(all_programs, 1):
            print(f"\n[{i}/{len(all_programs)}] {prog['name']}")

            # Build the program URL
            slug = name_to_slug(prog['name'])
            prog['url'] = f"https://www.harvard.edu/programs/{slug}/"

            # Build the GSAS URL
            gsas_url = f"https://gsas.harvard.edu/program/{slug}"

            # Fetch supervisor information
            faculty_list, faculty_page_url = await get_faculty_from_gsas_page(page, gsas_url, prog['name'])

            prog['faculty_page_url'] = faculty_page_url or ""
            prog['faculty'] = faculty_list
            prog['faculty_count'] = len(faculty_list)

            # Save progress every 10 programs
            if i % 10 == 0:
                temp_result = {
                    'source_url': base_url,
                    'scraped_at': datetime.now(timezone.utc).isoformat(),
                    'progress': f"{i}/{len(all_programs)}",
                    'programs': all_programs[:i]
                }
                with open('harvard_programs_progress.json', 'w', encoding='utf-8') as f:
                    json.dump(temp_result, f, ensure_ascii=False, indent=2)
                print(f"  [progress saved]")

            # Avoid hammering the server
            await page.wait_for_timeout(1500)

        await browser.close()

    # Sort
    programs = sorted(all_programs, key=lambda x: x['name'])

    # Statistics
    total_faculty = sum(p['faculty_count'] for p in programs)
    programs_with_faculty = sum(1 for p in programs if p['faculty_count'] > 0)

    # Save the final result
    result = {
        'source_url': base_url,
        'scraped_at': datetime.now(timezone.utc).isoformat(),
        'total_pages_scraped': current_page,
        'total_programs': len(programs),
        'programs_with_faculty': programs_with_faculty,
        'total_faculty_found': total_faculty,
        'programs': programs
    }

    output_file = Path('harvard_programs_with_faculty.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n{'='*60}")
    print(f"Scrape finished!")
    print(f"Scraped {current_page} pages")
    print(f"Found {len(programs)} graduate programs")
    print(f"{programs_with_faculty} of them have supervisor information")
    print(f"Found {total_faculty} supervisors in total")
    print(f"Results saved to: {output_file}")
    print(f"{'='*60}")

    # Print a summary
    print("\nProgram summary (first 30):")
    for i, prog in enumerate(programs[:30], 1):
        faculty_info = f"({prog['faculty_count']} supervisors)" if prog['faculty_count'] > 0 else "(no supervisor info)"
        print(f"{i:3}. {prog['name']} {faculty_info}")

    if len(programs) > 30:
        print(f"... and {len(programs) - 30} more programs")

    return result


if __name__ == "__main__":
    asyncio.run(scrape_harvard_programs_with_faculty())
artifacts/manchester_complete_scraper.py (new file, 910 lines)
@ -0,0 +1,910 @@
|
||||
"""
|
||||
曼彻斯特大学完整采集脚本
|
||||
新增特性:
|
||||
- Research Explorer API 优先拉取 JSON / XML,失败再回落 DOM
|
||||
- 每个学院独立页面、并行抓取(默认 3 并发)
|
||||
- 细粒度超时/重试/滚动/Load more 控制
|
||||
- 多 URL / 备用 Staff 页面配置
|
||||
- 导师目录缓存,可按学院关键词映射到项目
|
||||
- 诊断信息记录(失败学院、超时学院、批次信息)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from urllib.parse import urlencode, urljoin
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
from playwright.async_api import (
|
||||
TimeoutError as PlaywrightTimeoutError,
|
||||
async_playwright,
|
||||
)
|
||||
|
||||
# =========================
|
||||
# 配置区
|
||||
# =========================
|
||||
|
||||
DEFAULT_REQUEST = {
|
||||
"timeout_ms": 60000,
|
||||
"post_wait_ms": 2500,
|
||||
"wait_until": "domcontentloaded",
|
||||
"max_retries": 3,
|
||||
"retry_backoff_ms": 2000,
|
||||
}
|
||||
|
||||
STAFF_CONCURRENCY = 3
|
||||
|
||||
SCHOOL_CONFIG: List[Dict[str, Any]] = [
|
||||
{
|
||||
"name": "Alliance Manchester Business School",
|
||||
"keywords": [
|
||||
"accounting",
|
||||
"finance",
|
||||
"business",
|
||||
"management",
|
||||
"marketing",
|
||||
"mba",
|
||||
"economics",
|
||||
"entrepreneurship",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
|
||||
"extract_method": "table",
|
||||
"request": {"timeout_ms": 60000, "wait_until": "networkidle"},
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Department of Computer Science",
|
||||
"keywords": [
|
||||
"computer",
|
||||
"software",
|
||||
"data science",
|
||||
"artificial intelligence",
|
||||
"ai ",
|
||||
"machine learning",
|
||||
"cyber",
|
||||
"computing",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://www.cs.manchester.ac.uk/about/people/academic-and-research-staff/",
|
||||
"extract_method": "links",
|
||||
"requires_scroll": True,
|
||||
},
|
||||
{
|
||||
"url": "https://www.cs.manchester.ac.uk/about/people/",
|
||||
"extract_method": "links",
|
||||
"load_more_selector": "button.load-more",
|
||||
"max_load_more": 6,
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Department of Physics and Astronomy",
|
||||
"keywords": [
|
||||
"physics",
|
||||
"astronomy",
|
||||
"astrophysics",
|
||||
"nuclear",
|
||||
"particle",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://www.physics.manchester.ac.uk/about/people/academic-and-research-staff/",
|
||||
"extract_method": "links",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Department of Electrical and Electronic Engineering",
|
||||
"keywords": [
|
||||
"electrical",
|
||||
"electronic",
|
||||
"eee",
|
||||
"power systems",
|
||||
"microelectronics",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://www.eee.manchester.ac.uk/about/people/academic-and-research-staff/",
|
||||
"extract_method": "links",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Department of Chemistry",
|
||||
"keywords": ["chemistry", "chemical"],
|
||||
"attach_faculty_to_programs": True,
|
||||
"extract_method": "research_explorer",
|
||||
"research_explorer": {"page_size": 200},
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://research.manchester.ac.uk/en/organisations/department-of-chemistry/persons/",
|
||||
"extract_method": "research_explorer",
|
||||
"requires_scroll": True,
|
||||
"request": {
|
||||
"timeout_ms": 120000,
|
||||
"wait_until": "networkidle",
|
||||
"post_wait_ms": 5000,
|
||||
},
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Department of Mathematics",
|
||||
"keywords": [
|
||||
"mathematics",
|
||||
"mathematical",
|
||||
"applied math",
|
||||
"statistics",
|
||||
"actuarial",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"extract_method": "research_explorer",
|
||||
"research_explorer": {"page_size": 200},
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://research.manchester.ac.uk/en/organisations/department-of-mathematics/persons/",
|
||||
"extract_method": "research_explorer",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "School of Engineering",
|
||||
"keywords": [
|
||||
"engineering",
|
||||
"mechanical",
|
||||
"aerospace",
|
||||
"civil",
|
||||
"structural",
|
||||
"materials",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"extract_method": "research_explorer",
|
||||
"research_explorer": {"page_size": 400},
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://research.manchester.ac.uk/en/organisations/school-of-engineering/persons/",
|
||||
"extract_method": "research_explorer",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Faculty of Biology, Medicine and Health",
|
||||
"keywords": [
|
||||
"medicine",
|
||||
"medical",
|
||||
"health",
|
||||
"nursing",
|
||||
"pharmacy",
|
||||
"clinical",
|
||||
"dental",
|
||||
"optometry",
|
||||
"biology",
|
||||
"biomedical",
|
||||
"anatomical",
|
||||
"physiotherapy",
|
||||
"midwifery",
|
||||
"mental health",
|
||||
"psychology",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"extract_method": "research_explorer",
|
||||
"research_explorer": {"page_size": 400},
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://research.manchester.ac.uk/en/organisations/faculty-of-biology-medicine-and-health/persons/",
|
||||
"extract_method": "research_explorer",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "School of Social Sciences",
|
||||
"keywords": [
|
||||
"sociology",
|
||||
"politics",
|
||||
"international",
|
||||
"social",
|
||||
"criminology",
|
||||
"anthropology",
|
||||
"philosophy",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"extract_method": "research_explorer",
|
||||
"research_explorer": {"page_size": 200},
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://research.manchester.ac.uk/en/organisations/school-of-social-sciences/persons/",
|
||||
"extract_method": "research_explorer",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "School of Law",
|
||||
"keywords": ["law", "legal", "llm"],
|
||||
"attach_faculty_to_programs": True,
|
||||
"extract_method": "research_explorer",
|
||||
"research_explorer": {"page_size": 200},
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://research.manchester.ac.uk/en/organisations/school-of-law/persons/",
|
||||
"extract_method": "research_explorer",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "School of Arts, Languages and Cultures",
|
||||
"keywords": [
|
||||
"arts",
|
||||
"languages",
|
||||
"culture",
|
||||
"music",
|
||||
"drama",
|
||||
"theatre",
|
||||
"history",
|
||||
"linguistics",
|
||||
"literature",
|
||||
"translation",
|
||||
"classics",
|
||||
"archaeology",
|
||||
"religion",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"extract_method": "research_explorer",
|
||||
"research_explorer": {"page_size": 400},
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://research.manchester.ac.uk/en/organisations/school-of-arts-languages-and-cultures/persons/",
|
||||
"extract_method": "research_explorer",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "School of Environment, Education and Development",
|
||||
"keywords": [
|
||||
"environment",
|
||||
"education",
|
||||
"development",
|
||||
"planning",
|
||||
"architecture",
|
||||
"urban",
|
||||
"geography",
|
||||
"sustainability",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"extract_method": "research_explorer",
|
||||
"research_explorer": {"page_size": 300},
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://research.manchester.ac.uk/en/organisations/school-of-environment-education-and-development/persons/",
|
||||
"extract_method": "research_explorer",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
SCHOOL_LOOKUP = {cfg["name"]: cfg for cfg in SCHOOL_CONFIG}
|
||||
|
||||
# =========================
|
||||
# JS 提取函数
|
||||
# =========================
|
||||
|
||||
JS_EXTRACT_TABLE_STAFF = """() => {
|
||||
const staff = [];
|
||||
const seen = new Set();
|
||||
|
||||
document.querySelectorAll('table tr').forEach(row => {
|
||||
const cells = row.querySelectorAll('td');
|
||||
if (cells.length >= 2) {
|
||||
const link = cells[1]?.querySelector('a[href]') || cells[0]?.querySelector('a[href]');
|
||||
const titleCell = cells[2] || cells[1];
|
||||
|
||||
if (link) {
|
||||
const name = link.innerText.trim();
|
||||
const url = link.href;
|
||||
const title = titleCell ? titleCell.innerText.trim() : '';
|
||||
|
||||
if (name.length > 2 && !name.toLowerCase().includes('skip') && !seen.has(url)) {
|
||||
seen.add(url);
|
||||
staff.push({
|
||||
name,
|
||||
url,
|
||||
title
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return staff;
|
||||
}"""
|
||||
|
||||
JS_EXTRACT_LINK_STAFF = """() => {
|
||||
const staff = [];
|
||||
const seen = new Set();
|
||||
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href;
|
||||
const text = a.innerText.trim();
|
||||
|
||||
if (seen.has(href)) return;
|
||||
if (text.length < 5 || text.length > 80) return;
|
||||
|
||||
const lowerText = text.toLowerCase();
|
||||
if (lowerText.includes('skip') ||
|
||||
lowerText.includes('staff') ||
|
||||
lowerText.includes('people') ||
|
||||
lowerText.includes('academic') ||
|
||||
lowerText.includes('research profiles')) return;
|
||||
|
||||
if (href.includes('/persons/') ||
|
||||
href.includes('/portal/en/researchers/') ||
|
||||
href.includes('/profile/') ||
|
||||
href.includes('/people/')) {
|
||||
seen.add(href);
|
||||
staff.push({
|
||||
name: text,
|
||||
url: href,
|
||||
title: ''
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return staff;
|
||||
}"""
|
||||
|
||||
JS_EXTRACT_RESEARCH_EXPLORER = """() => {
|
||||
const staff = [];
|
||||
const seen = new Set();
|
||||
|
||||
document.querySelectorAll('a.link.person').forEach(a => {
|
||||
const href = a.href;
|
||||
const text = a.innerText.trim();
|
||||
|
||||
if (!seen.has(href) && text.length > 3 && text.length < 80) {
|
||||
seen.add(href);
|
||||
staff.push({
|
||||
name: text,
|
||||
url: href,
|
||||
title: ''
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
if (staff.length === 0) {
|
||||
document.querySelectorAll('a[href*="/persons/"]').forEach(a => {
|
||||
const href = a.href;
|
||||
const text = a.innerText.trim();
|
||||
const lower = text.toLowerCase();
|
||||
|
||||
if (seen.has(href)) return;
|
||||
if (text.length < 3 || text.length > 80) return;
|
||||
if (lower.includes('person') || lower.includes('next') || lower.includes('previous')) return;
|
||||
|
||||
seen.add(href);
|
||||
staff.push({
|
||||
name: text,
|
||||
url: href,
|
||||
title: ''
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
return staff;
|
||||
}"""
|
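# Note: the extractor above targets Research Explorer's "a.link.person" cards first
# and only falls back to generic '/persons/' profile links (filtering out pagination
# text such as "next"/"previous") when no card-style links are present.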
||||
|
||||
JS_EXTRACT_PROGRAMS = """() => {
|
||||
const programs = [];
|
||||
const seen = new Set();
|
||||
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href;
|
||||
const text = a.innerText.trim().replace(/\\s+/g, ' ');
|
||||
|
||||
if (!href || seen.has(href)) return;
|
||||
if (text.length < 10 || text.length > 200) return;
|
||||
|
||||
const hrefLower = href.toLowerCase();
|
||||
const textLower = text.toLowerCase();
|
||||
|
||||
const isNav = textLower === 'courses' ||
|
||||
textLower === 'masters' ||
|
||||
textLower.includes('admission') ||
|
||||
textLower.includes('fees') ||
|
||||
textLower.includes('skip to') ||
|
||||
textLower.includes('search') ||
|
||||
textLower.includes('contact') ||
|
||||
hrefLower.includes('#');
|
||||
if (isNav) return;
|
||||
|
||||
const hasNumericId = /\\/\\d{5}\\//.test(href);
|
||||
const isCoursePage = hrefLower.includes('/courses/list/') && hasNumericId;
|
||||
|
||||
if (isCoursePage) {
|
||||
seen.add(href);
|
||||
programs.push({
|
||||
name: text,
|
||||
url: href
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return programs;
|
||||
}"""
|
||||
|
||||
|
||||
# =========================
|
||||
# Data matching
|
||||
# =========================
|
||||
|
||||
def match_program_to_school(program_name: str) -> str:
|
||||
lower = program_name.lower()
|
||||
for school in SCHOOL_CONFIG:
|
||||
for keyword in school["keywords"]:
|
||||
if keyword in lower:
|
||||
return school["name"]
|
||||
return "Other Programs"
|
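# Illustrative behaviour (not part of the scraper): matching is a plain substring
# test over the keywords above, checked in SCHOOL_CONFIG order, so assuming no
# earlier school lists an overlapping keyword:
#   match_program_to_school("MSc Midwifery")      -> "Faculty of Biology, Medicine and Health"
#   match_program_to_school("MSc Basket Weaving") -> "Other Programs"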
||||
|
||||
|
||||
# =========================
|
||||
# Request and parsing helpers
|
||||
# =========================
|
||||
|
||||
def _merge_request_settings(*layers: Optional[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
settings = dict(DEFAULT_REQUEST)
|
||||
for layer in layers:
|
||||
if not layer:
|
||||
continue
|
||||
for key, value in layer.items():
|
||||
if value is not None:
|
||||
settings[key] = value
|
||||
settings["max_retries"] = max(1, int(settings.get("max_retries", 1)))
|
||||
settings["retry_backoff_ms"] = settings.get("retry_backoff_ms", 2000)
|
||||
return settings
|
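# Merge sketch (derived from the helper above): later layers win, but only for
# non-None values, e.g.
#   _merge_request_settings({"timeout_ms": 40000}, {"timeout_ms": None, "post_wait_ms": 1500})
# keeps timeout_ms at 40000 and adds post_wait_ms=1500 on top of DEFAULT_REQUEST.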
||||
|
||||
|
||||
async def _goto_with_retry(page, url: str, settings: Dict[str, Any], label: str) -> Tuple[bool, Optional[str]]:
|
||||
last_error = None
|
||||
for attempt in range(settings["max_retries"]):
|
||||
try:
|
||||
await page.goto(url, wait_until=settings["wait_until"], timeout=settings["timeout_ms"])
|
||||
if settings.get("wait_for_selector"):
|
||||
await page.wait_for_selector(settings["wait_for_selector"], timeout=settings["timeout_ms"])
|
||||
if settings.get("post_wait_ms"):
|
||||
await page.wait_for_timeout(settings["post_wait_ms"])
|
||||
return True, None
|
||||
except PlaywrightTimeoutError as exc:
|
||||
last_error = f"Timeout: {exc}"
|
||||
except Exception as exc: # noqa: BLE001
|
||||
last_error = str(exc)
|
||||
|
||||
if attempt < settings["max_retries"] - 1:
|
||||
await page.wait_for_timeout(settings["retry_backoff_ms"] * (attempt + 1))
|
||||
|
||||
return False, last_error
|
||||
|
||||
|
||||
async def _perform_scroll(page, repetitions: int = 5, delay_ms: int = 800):
|
||||
repetitions = max(1, repetitions)
|
||||
for i in range(repetitions):
|
||||
await page.evaluate("(y) => window.scrollTo(0, y)", 2000 * (i + 1))
|
||||
await page.wait_for_timeout(delay_ms)
|
||||
|
||||
|
||||
async def _load_more(page, selector: str, max_clicks: int = 5, wait_ms: int = 1500):
|
||||
for _ in range(max_clicks):
|
||||
button = await page.query_selector(selector)
|
||||
if not button:
|
||||
break
|
||||
try:
|
||||
await button.click()
|
||||
await page.wait_for_timeout(wait_ms)
|
||||
except Exception:
|
||||
break
|
||||
|
||||
|
||||
def _deduplicate_staff(staff: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
seen = set()
|
||||
cleaned = []
|
||||
for item in staff:
|
||||
name = (item.get("name") or "").strip()
|
||||
if not name:
|
||||
continue
|
||||
url = (item.get("url") or "").strip()
|
||||
key = url or name.lower()
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
cleaned.append({"name": name, "url": url, "title": (item.get("title") or "").strip()})
|
||||
return cleaned
|
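# Dedup note: the key is the profile URL when present, otherwise the lower-cased
# name, so repeated entries for the same person collapse into one cleaned record.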
||||
|
||||
|
||||
def _append_query(url: str, params: Dict[str, Any]) -> str:
|
||||
delimiter = "&" if "?" in url else "?"
|
||||
return f"{url}{delimiter}{urlencode(params)}"
|
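# Example (assumed behaviour, derived from the helper above):
#   _append_query("https://example.test/persons/", {"format": "json"})
#     -> "https://example.test/persons/?format=json"
#   _append_query("https://example.test/persons/?page=2", {"limit": 100})
#     -> "https://example.test/persons/?page=2&limit=100"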
||||
|
||||
|
||||
def _guess_research_slug(staff_url: Optional[str]) -> Optional[str]:
|
||||
if not staff_url:
|
||||
return None
|
||||
parts = [p for p in staff_url.rstrip("/").split("/") if p]
# Staff pages end in ".../organisations/<org-slug>/persons/", so drop the
# trailing "persons" segment when guessing the organisation slug.
if parts and parts[-1] == "persons":
    parts = parts[:-1]
|
||||
return parts[-1] if parts else None
|
||||
|
||||
|
||||
def _parse_research_explorer_json(data: Any, base_url: str) -> List[Dict[str, str]]:
|
||||
items: List[Dict[str, Any]] = []
|
||||
if isinstance(data, list):
|
||||
items = data
|
||||
elif isinstance(data, dict):
|
||||
for key in ("results", "items", "persons", "data", "entities"):
|
||||
if isinstance(data.get(key), list):
|
||||
items = data[key]
|
||||
break
|
||||
if not items and isinstance(data.get("rows"), list):
|
||||
items = data["rows"]
|
||||
|
||||
staff = []
|
||||
for item in items:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
name = item.get("name") or item.get("title") or item.get("fullName")
|
||||
profile_url = item.get("url") or item.get("href") or item.get("link") or item.get("primaryURL")
|
||||
if not name:
|
||||
continue
|
||||
if profile_url:
|
||||
profile_url = urljoin(base_url, profile_url)
|
||||
staff.append(
|
||||
{
|
||||
"name": name.strip(),
|
||||
"url": (profile_url or "").strip(),
|
||||
"title": (item.get("jobTitle") or item.get("position") or "").strip(),
|
||||
}
|
||||
)
|
||||
return staff
|
||||
|
||||
|
||||
def _parse_research_explorer_xml(text: str, base_url: str) -> List[Dict[str, str]]:
|
||||
staff: List[Dict[str, str]] = []
|
||||
try:
|
||||
root = ET.fromstring(text)
|
||||
except ET.ParseError:
|
||||
return staff
|
||||
|
||||
for entry in root.findall(".//{http://www.w3.org/2005/Atom}entry"):
|
||||
title = entry.findtext("{http://www.w3.org/2005/Atom}title", default="")
|
||||
link = entry.find("{http://www.w3.org/2005/Atom}link")
|
||||
href = link.attrib.get("href") if link is not None else ""
|
||||
if title:
|
||||
staff.append(
|
||||
{
|
||||
"name": title.strip(),
|
||||
"url": urljoin(base_url, href) if href else "",
|
||||
"title": "",
|
||||
}
|
||||
)
|
||||
return staff
|
||||
|
||||
|
||||
async def fetch_research_explorer_api(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]:
|
||||
config = school_config.get("research_explorer") or {}
|
||||
if not config and school_config.get("extract_method") != "research_explorer":
|
||||
return []
|
||||
|
||||
base_staff_url = ""
|
||||
if school_config.get("staff_pages"):
|
||||
base_staff_url = school_config["staff_pages"][0].get("url", "")
|
||||
|
||||
page_size = config.get("page_size", 200)
|
||||
timeout_ms = config.get("timeout_ms", 70000)
|
||||
|
||||
candidates: List[str] = []
|
||||
slug = config.get("org_slug") or _guess_research_slug(base_staff_url)
|
||||
base_api = config.get("api_base", "https://research.manchester.ac.uk/ws/portalapi.aspx")
|
||||
|
||||
if config.get("api_url"):
|
||||
candidates.append(config["api_url"])
|
||||
|
||||
if slug:
|
||||
params = {
|
||||
"action": "search",
|
||||
"language": "en",
|
||||
"format": "json",
|
||||
"site": "default",
|
||||
"showall": "true",
|
||||
"pageSize": page_size,
|
||||
"organisations": slug,
|
||||
}
|
||||
candidates.append(f"{base_api}?{urlencode(params)}")
|
||||
|
||||
if base_staff_url:
|
||||
candidates.append(_append_query(base_staff_url, {"format": "json", "limit": page_size}))
|
||||
candidates.append(_append_query(base_staff_url, {"format": "xml", "limit": page_size}))
|
||||
|
||||
for url in candidates:
|
||||
try:
|
||||
resp = await context.request.get(url, timeout=timeout_ms)
|
||||
if resp.status != 200:
|
||||
continue
|
||||
ctype = resp.headers.get("content-type", "")
|
||||
if "json" in ctype:
|
||||
data = await resp.json()
|
||||
parsed = _parse_research_explorer_json(data, base_staff_url)
|
||||
else:
|
||||
text = await resp.text()
|
||||
parsed = _parse_research_explorer_xml(text, base_staff_url)
|
||||
parsed = _deduplicate_staff(parsed)
|
||||
if parsed:
|
||||
if output_callback:
|
||||
output_callback("info", f" {school_config['name']}: {len(parsed)} staff via API")
|
||||
return parsed
|
||||
except Exception as exc: # noqa: BLE001
|
||||
if output_callback:
|
||||
output_callback(
|
||||
"warning", f" {school_config['name']}: API fetch failed ({str(exc)[:60]})"
|
||||
)
|
||||
return []
|
||||
|
||||
|
||||
async def scrape_staff_via_browser(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]:
|
||||
staff_collected: List[Dict[str, str]] = []
|
||||
staff_pages = school_config.get("staff_pages") or []
|
||||
if not staff_pages and school_config.get("staff_url"):
|
||||
staff_pages = [{"url": school_config["staff_url"], "extract_method": school_config.get("extract_method")}]
|
||||
|
||||
page = await context.new_page()
|
||||
blocked_types = school_config.get("blocked_resources", ["image", "font", "media"])
|
||||
if blocked_types:
|
||||
async def _route_handler(route):
|
||||
if route.request.resource_type in blocked_types:
|
||||
await route.abort()
|
||||
else:
|
||||
await route.continue_()
|
||||
|
||||
await page.route("**/*", _route_handler)
|
||||
|
||||
for page_cfg in staff_pages:
|
||||
target_url = page_cfg.get("url")
|
||||
if not target_url:
|
||||
continue
|
||||
|
||||
settings = _merge_request_settings(school_config.get("request"), page_cfg.get("request"))
|
||||
success, error = await _goto_with_retry(page, target_url, settings, school_config["name"])
|
||||
if not success:
|
||||
if output_callback:
|
||||
output_callback("warning", f" {school_config['name']}: failed to load {target_url} ({error})")
|
||||
continue
|
||||
|
||||
if page_cfg.get("requires_scroll"):
|
||||
await _perform_scroll(page, page_cfg.get("scroll_times", 6), page_cfg.get("scroll_delay_ms", 700))
|
||||
|
||||
if page_cfg.get("load_from_selector"):
|
||||
await _load_more(page, page_cfg["load_from_selector"], page_cfg.get("max_load_more", 5))
|
||||
elif page_cfg.get("load_more_selector"):
|
||||
await _load_more(page, page_cfg["load_more_selector"], page_cfg.get("max_load_more", 5))
|
||||
|
||||
method = page_cfg.get("extract_method") or school_config.get("extract_method") or "links"
|
||||
if method == "table":
|
||||
extracted = await page.evaluate(JS_EXTRACT_TABLE_STAFF)
|
||||
elif method == "research_explorer":
|
||||
extracted = await page.evaluate(JS_EXTRACT_RESEARCH_EXPLORER)
|
||||
else:
|
||||
extracted = await page.evaluate(JS_EXTRACT_LINK_STAFF)
|
||||
|
||||
staff_collected.extend(extracted)
|
||||
|
||||
await page.close()
|
||||
return _deduplicate_staff(staff_collected)
|
||||
|
||||
|
||||
# =========================
|
||||
# Scrape school staff concurrently
|
||||
# =========================
|
||||
|
||||
async def scrape_school_staff(context, school_config: Dict[str, Any], semaphore, output_callback):
|
||||
async with semaphore:
|
||||
staff_list: List[Dict[str, str]] = []
|
||||
status = "success"
|
||||
error: Optional[str] = None
|
||||
|
||||
try:
|
||||
if school_config.get("extract_method") == "research_explorer":
|
||||
staff_list = await fetch_research_explorer_api(context, school_config, output_callback)
|
||||
if not staff_list:
|
||||
staff_list = await scrape_staff_via_browser(context, school_config, output_callback)
|
||||
|
||||
if output_callback:
|
||||
output_callback("info", f" {school_config['name']}: total {len(staff_list)} staff")
|
||||
|
||||
except Exception as exc: # noqa: BLE001
|
||||
status = "error"
|
||||
error = str(exc)
|
||||
if output_callback:
|
||||
output_callback("error", f" {school_config['name']}: {error}")
|
||||
|
||||
return {
|
||||
"name": school_config["name"],
|
||||
"staff": staff_list,
|
||||
"status": status,
|
||||
"error": error,
|
||||
}
|
||||
|
||||
|
||||
async def scrape_all_school_staff(context, output_callback):
|
||||
semaphore = asyncio.Semaphore(STAFF_CONCURRENCY)
|
||||
tasks = [
|
||||
asyncio.create_task(scrape_school_staff(context, cfg, semaphore, output_callback))
|
||||
for cfg in SCHOOL_CONFIG
|
||||
]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
staff_map = {}
|
||||
diagnostics = {"failed": [], "success": [], "total": len(results)}
|
||||
for res in results:
|
||||
if res["staff"]:
|
||||
staff_map[res["name"]] = res["staff"]
|
||||
diagnostics["success"].append(res["name"])
|
||||
else:
|
||||
diagnostics["failed"].append(
|
||||
{
|
||||
"name": res["name"],
|
||||
"status": res["status"],
|
||||
"error": res.get("error"),
|
||||
}
|
||||
)
|
||||
return staff_map, diagnostics
|
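# Concurrency note: STAFF_CONCURRENCY caps how many school staff scrapes run at
# once; schools that return staff go into staff_map, the rest are recorded in
# diagnostics["failed"] with their status and error for later inspection.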
||||
|
||||
|
||||
# =========================
|
||||
# Main pipeline
|
||||
# =========================
|
||||
|
||||
async def scrape(output_callback=None):
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
context = await browser.new_context(
|
||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
)
|
||||
|
||||
base_url = "https://www.manchester.ac.uk/"
|
||||
result = {
|
||||
"name": "The University of Manchester",
|
||||
"url": base_url,
|
||||
"scraped_at": datetime.now(timezone.utc).isoformat(),
|
||||
"schools": [],
|
||||
"diagnostics": {},
|
||||
}
|
||||
|
||||
try:
|
||||
# Step 1: Masters program list
|
||||
if output_callback:
|
||||
output_callback("info", "Step 1: Scraping masters programs list...")
|
||||
|
||||
page = await context.new_page()
|
||||
courses_url = "https://www.manchester.ac.uk/study/masters/courses/list/"
|
||||
await page.goto(courses_url, wait_until="domcontentloaded", timeout=40000)
|
||||
await page.wait_for_timeout(3000)
|
||||
programs_data = await page.evaluate(JS_EXTRACT_PROGRAMS)
|
||||
await page.close()
|
||||
|
||||
if output_callback:
|
||||
output_callback("info", f"Found {len(programs_data)} masters programs")
|
||||
|
||||
# Step 2: Scrape school staff in parallel
|
||||
if output_callback:
|
||||
output_callback("info", "Step 2: Scraping faculty from staff pages (parallel)...")
|
||||
school_staff, diagnostics = await scrape_all_school_staff(context, output_callback)
|
||||
|
||||
# Step 3: Organize the data
|
||||
schools_dict: Dict[str, Dict[str, Any]] = {}
|
||||
for prog in programs_data:
|
||||
school_name = match_program_to_school(prog["name"])
|
||||
if school_name not in schools_dict:
|
||||
schools_dict[school_name] = {
|
||||
"name": school_name,
|
||||
"url": "",
|
||||
"programs": [],
|
||||
"faculty": school_staff.get(school_name, []),
|
||||
"faculty_source": "school_directory" if school_staff.get(school_name) else "",
|
||||
}
|
||||
|
||||
schools_dict[school_name]["programs"].append(
|
||||
{
|
||||
"name": prog["name"],
|
||||
"url": prog["url"],
|
||||
"faculty": [],
|
||||
}
|
||||
)
|
||||
|
||||
for cfg in SCHOOL_CONFIG:
|
||||
if cfg["name"] in schools_dict:
|
||||
first_page = (cfg.get("staff_pages") or [{}])[0]
|
||||
schools_dict[cfg["name"]]["url"] = first_page.get("url") or cfg.get("staff_url", "")
|
||||
|
||||
_attach_faculty_to_programs(schools_dict, school_staff)
|
||||
|
||||
result["schools"] = list(schools_dict.values())
|
||||
|
||||
total_programs = sum(len(s["programs"]) for s in result["schools"])
|
||||
total_faculty = sum(len(s.get("faculty", [])) for s in result["schools"])
|
||||
|
||||
result["diagnostics"] = {
|
||||
"total_programs": total_programs,
|
||||
"total_faculty_records": total_faculty,
|
||||
"school_staff_success": diagnostics.get("success", []),
|
||||
"school_staff_failed": diagnostics.get("failed", []),
|
||||
}
|
||||
|
||||
if output_callback:
|
||||
output_callback(
|
||||
"info",
|
||||
f"Done! {len(result['schools'])} schools, {total_programs} programs, {total_faculty} faculty",
|
||||
)
|
||||
|
||||
except Exception as exc: # noqa: BLE001
|
||||
if output_callback:
|
||||
output_callback("error", f"Scraping error: {str(exc)}")
|
||||
finally:
|
||||
await browser.close()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _attach_faculty_to_programs(schools_dict: Dict[str, Dict[str, Any]], staff_map: Dict[str, List[Dict[str, str]]]):
|
||||
for school_name, school_data in schools_dict.items():
|
||||
staff = staff_map.get(school_name, [])
|
||||
cfg = SCHOOL_LOOKUP.get(school_name, {})
|
||||
if not staff or not cfg.get("attach_faculty_to_programs"):
|
||||
continue
|
||||
|
||||
limit = cfg.get("faculty_per_program")
|
||||
for program in school_data["programs"]:
|
||||
sliced = deepcopy(staff[:limit] if limit else staff)
|
||||
program["faculty"] = sliced
|
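# Note: deepcopy gives each program its own copy of the staff slice, so editing
# one program's faculty list later cannot leak into sibling programs.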
||||
|
||||
|
||||
# =========================
|
||||
# CLI
|
||||
# =========================
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
if sys.platform == "win32":
|
||||
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
|
||||
|
||||
def print_callback(level, msg):
|
||||
print(f"[{level}] {msg}")
|
||||
|
||||
scrape_result = asyncio.run(scrape(output_callback=print_callback))
|
||||
|
||||
output_path = "output/manchester_complete_result.json"
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(scrape_result, f, ensure_ascii=False, indent=2)
|
||||
|
||||
print("\nResult saved to", output_path)
|
||||
print("\n=== Summary ===")
|
||||
for school in sorted(scrape_result["schools"], key=lambda s: -len(s.get("faculty", []))):
|
||||
print(
|
||||
f" {school['name']}: "
|
||||
f"{len(school['programs'])} programs, "
|
||||
f"{len(school.get('faculty', []))} faculty"
|
||||
)
|
||||
|
||||
229
artifacts/manchester_improved_scraper.py
Normal file
@ -0,0 +1,229 @@
|
||||
"""
|
||||
Dedicated scraper for The University of Manchester
|
||||
Improved version - extracts faculty information from school staff pages
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
|
||||
# Mapping of University of Manchester school staff pages
|
||||
# Program keyword -> school staff page URL
|
||||
SCHOOL_STAFF_MAPPING = {
|
||||
# Alliance Manchester Business School (AMBS)
|
||||
"accounting": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
|
||||
"finance": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
|
||||
"business": "https://www.alliancembs.manchester.ac.uk/about/our-people/",
|
||||
"management": "https://www.alliancembs.manchester.ac.uk/about/our-people/",
|
||||
"marketing": "https://www.alliancembs.manchester.ac.uk/research/management-sciences-and-marketing/",
|
||||
"mba": "https://www.alliancembs.manchester.ac.uk/about/our-people/",
|
||||
|
||||
# Additional schools can be added here...
|
||||
# "computer": "...",
|
||||
# "engineering": "...",
|
||||
}
|
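# Illustrative use of the mapping above: a programme named "MSc Accounting and
# Finance" contains the keyword "accounting", so it is routed to the AMBS
# Accounting & Finance staff page listed there.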
||||
|
||||
# Fallback school staff pages (used when no keyword matches)
|
||||
GENERAL_STAFF_PAGES = [
|
||||
"https://www.alliancembs.manchester.ac.uk/about/our-people/",
|
||||
]
|
||||
|
||||
|
||||
async def scrape(output_callback=None):
|
||||
"""执行爬取"""
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
context = await browser.new_context(
|
||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
)
|
||||
page = await context.new_page()
|
||||
|
||||
base_url = "https://www.manchester.ac.uk/"
|
||||
|
||||
result = {
|
||||
"name": "The University of Manchester",
|
||||
"url": base_url,
|
||||
"scraped_at": datetime.now(timezone.utc).isoformat(),
|
||||
"schools": []
|
||||
}
|
||||
|
||||
try:
|
||||
# Step 1: scrape the masters program list
|
||||
if output_callback:
|
||||
output_callback("info", "Step 1: Scraping masters programs list...")
|
||||
|
||||
courses_url = "https://www.manchester.ac.uk/study/masters/courses/list/"
|
||||
await page.goto(courses_url, wait_until="domcontentloaded", timeout=30000)
|
||||
await page.wait_for_timeout(3000)
|
||||
|
||||
# Extract all masters programs
|
||||
programs_data = await page.evaluate('''() => {
|
||||
const programs = [];
|
||||
const seen = new Set();
|
||||
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href;
|
||||
const text = a.innerText.trim().replace(/\\s+/g, ' ');
|
||||
|
||||
if (!href || seen.has(href)) return;
|
||||
if (text.length < 10 || text.length > 200) return;
|
||||
|
||||
const hrefLower = href.toLowerCase();
|
||||
const textLower = text.toLowerCase();
|
||||
|
||||
// Exclude navigation links
|
||||
if (textLower === 'courses' || textLower === 'masters' ||
|
||||
textLower.includes('admission') || textLower.includes('fees') ||
|
||||
textLower.includes('skip to') || textLower.includes('skip navigation') ||
|
||||
textLower === 'home' || textLower === 'search' ||
|
||||
textLower.includes('contact') || textLower.includes('footer') ||
|
||||
hrefLower.endsWith('/courses/') || hrefLower.endsWith('/masters/') ||
|
||||
hrefLower.includes('#')) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Check whether this is a course link - it must contain a course ID
|
||||
const hasNumericId = /\\/\\d{5}\\//.test(href); // 5-digit numeric ID
|
||||
const isCoursePage = hrefLower.includes('/courses/list/') &&
|
||||
hasNumericId;
|
||||
|
||||
if (isCoursePage) {
|
||||
seen.add(href);
|
||||
programs.push({
|
||||
name: text,
|
||||
url: href
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return programs;
|
||||
}''')
|
||||
|
||||
if output_callback:
|
||||
output_callback("info", f"Found {len(programs_data)} masters programs")
|
||||
|
||||
# Step 2: scrape faculty information from school staff pages
|
||||
if output_callback:
|
||||
output_callback("info", "Step 2: Scraping faculty from school staff pages...")
|
||||
|
||||
all_faculty = {} # school_url -> faculty list
|
||||
|
||||
# Scrape AMBS Accounting & Finance staff
|
||||
staff_url = "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/"
|
||||
if output_callback:
|
||||
output_callback("info", f"Scraping staff from: {staff_url}")
|
||||
|
||||
await page.goto(staff_url, wait_until="domcontentloaded", timeout=30000)
|
||||
await page.wait_for_timeout(3000)
|
||||
|
||||
# Extract staff members from the table
|
||||
faculty_data = await page.evaluate('''() => {
|
||||
const faculty = [];
|
||||
const rows = document.querySelectorAll('table tr');
|
||||
|
||||
rows.forEach(row => {
|
||||
const cells = row.querySelectorAll('td');
|
||||
if (cells.length >= 2) {
|
||||
const link = cells[1]?.querySelector('a[href]');
|
||||
const titleCell = cells[2];
|
||||
|
||||
if (link) {
|
||||
const name = link.innerText.trim();
|
||||
const url = link.href;
|
||||
const title = titleCell ? titleCell.innerText.trim() : '';
|
||||
|
||||
if (name.length > 2 && !name.toLowerCase().includes('skip')) {
|
||||
faculty.push({
|
||||
name: name,
|
||||
url: url,
|
||||
title: title
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return faculty;
|
||||
}''')
|
||||
|
||||
if output_callback:
|
||||
output_callback("info", f"Found {len(faculty_data)} faculty members from AMBS")
|
||||
|
||||
all_faculty["AMBS - Accounting and Finance"] = faculty_data
|
||||
|
||||
# Step 3: assemble the results
|
||||
# Assign programs to schools by keyword
|
||||
schools_data = {}
|
||||
|
||||
for prog in programs_data:
|
||||
prog_name_lower = prog['name'].lower()
|
||||
|
||||
# Determine which school the program belongs to
|
||||
school_name = "Other Programs"
|
||||
matched_faculty = []
|
||||
|
||||
for keyword, staff_url in SCHOOL_STAFF_MAPPING.items():
|
||||
if keyword in prog_name_lower:
|
||||
if "accounting" in keyword or "finance" in keyword:
|
||||
school_name = "Alliance Manchester Business School"
|
||||
matched_faculty = all_faculty.get("AMBS - Accounting and Finance", [])
|
||||
elif "business" in keyword or "management" in keyword or "mba" in keyword:
|
||||
school_name = "Alliance Manchester Business School"
|
||||
matched_faculty = all_faculty.get("AMBS - Accounting and Finance", [])
|
||||
break
|
||||
|
||||
if school_name not in schools_data:
|
||||
schools_data[school_name] = {
|
||||
"name": school_name,
|
||||
"url": "",
|
||||
"programs": [],
|
||||
"faculty": matched_faculty # 学院级别的导师
|
||||
}
|
||||
|
||||
schools_data[school_name]["programs"].append({
|
||||
"name": prog['name'],
|
||||
"url": prog['url'],
|
||||
"faculty": [] # 项目级别暂不填充
|
||||
})
|
||||
|
||||
result["schools"] = list(schools_data.values())
|
||||
|
||||
# Statistics
|
||||
total_programs = sum(len(s['programs']) for s in result['schools'])
|
||||
total_faculty = sum(len(s.get('faculty', [])) for s in result['schools'])
|
||||
|
||||
if output_callback:
|
||||
output_callback("info", f"Done! {len(result['schools'])} schools, {total_programs} programs, {total_faculty} faculty")
|
||||
|
||||
except Exception as e:
|
||||
if output_callback:
|
||||
output_callback("error", f"Scraping error: {str(e)}")
|
||||
|
||||
finally:
|
||||
await browser.close()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
if sys.platform == "win32":
|
||||
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
|
||||
|
||||
def print_callback(level, msg):
|
||||
print(f"[{level}] {msg}")
|
||||
|
||||
result = asyncio.run(scrape(output_callback=print_callback))
|
||||
|
||||
# Save the result
|
||||
with open("output/manchester_improved_result.json", "w", encoding="utf-8") as f:
|
||||
json.dump(result, f, indent=2, ensure_ascii=False)
|
||||
|
||||
print(f"\nResult saved to output/manchester_improved_result.json")
|
||||
print(f"Schools: {len(result['schools'])}")
|
||||
for school in result['schools']:
|
||||
print(f" - {school['name']}: {len(school['programs'])} programs, {len(school.get('faculty', []))} faculty")
|
||||
165
artifacts/test_faculty_scraper.py
Normal file
@ -0,0 +1,165 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test the faculty scraping logic - only 3 programs are tested
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
|
||||
def name_to_slug(name):
|
||||
"""将项目名称转换为URL slug"""
|
||||
slug = name.lower()
|
||||
slug = re.sub(r'[^\w\s-]', '', slug)
|
||||
slug = re.sub(r'[\s_]+', '-', slug)
|
||||
slug = re.sub(r'-+', '-', slug)
|
||||
slug = slug.strip('-')
|
||||
return slug
|
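# Slug sketch, mirroring the test programs below:
#   name_to_slug("African and African American Studies") -> "african-and-african-american-studies"
#   name_to_slug("Computer Science")                      -> "computer-science"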
||||
|
||||
|
||||
async def get_faculty_from_gsas_page(page, gsas_url):
|
||||
"""从GSAS项目页面获取Faculty链接,然后访问院系People页面获取导师列表"""
|
||||
faculty_list = []
|
||||
faculty_page_url = None
|
||||
|
||||
try:
|
||||
print(f" 访问GSAS页面: {gsas_url}")
|
||||
await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
|
||||
await page.wait_for_timeout(2000)
|
||||
|
||||
# Look for the Faculty section link
|
||||
faculty_link = await page.evaluate('''() => {
|
||||
const links = document.querySelectorAll('a[href]');
|
||||
for (const link of links) {
|
||||
const text = link.innerText.toLowerCase();
|
||||
const href = link.href;
|
||||
if (text.includes('faculty') && text.includes('see list')) {
|
||||
return href;
|
||||
}
|
||||
if (text.includes('faculty') && (href.includes('/people') || href.includes('/faculty'))) {
|
||||
return href;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}''')
|
||||
|
||||
if faculty_link:
|
||||
faculty_page_url = faculty_link
|
||||
print(f" 找到Faculty页面链接: {faculty_link}")
|
||||
|
||||
# Visit the Faculty/People page
|
||||
await page.goto(faculty_link, wait_until="domcontentloaded", timeout=30000)
|
||||
await page.wait_for_timeout(2000)
|
||||
|
||||
# Extract all faculty information
|
||||
faculty_list = await page.evaluate('''() => {
|
||||
const faculty = [];
|
||||
const seen = new Set();
|
||||
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href || '';
|
||||
const text = a.innerText.trim();
|
||||
const lowerHref = href.toLowerCase();
|
||||
|
||||
if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
|
||||
lowerHref.includes('/profile/')) &&
|
||||
text.length > 3 && text.length < 100 &&
|
||||
!text.toLowerCase().includes('people') &&
|
||||
!text.toLowerCase().includes('faculty') &&
|
||||
!lowerHref.endsWith('/people/') &&
|
||||
!lowerHref.endsWith('/faculty/')) {
|
||||
|
||||
if (!seen.has(href)) {
|
||||
seen.add(href);
|
||||
faculty.push({
|
||||
name: text,
|
||||
url: href
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return faculty;
|
||||
}''')
|
||||
|
||||
print(f" 找到 {len(faculty_list)} 位导师")
|
||||
for f in faculty_list[:5]:
|
||||
print(f" - {f['name']}: {f['url']}")
|
||||
if len(faculty_list) > 5:
|
||||
print(f" ... 还有 {len(faculty_list) - 5} 位")
|
||||
else:
|
||||
print(" 未找到Faculty页面链接")
|
||||
|
||||
except Exception as e:
|
||||
print(f" 获取Faculty信息失败: {e}")
|
||||
|
||||
return faculty_list, faculty_page_url
|
||||
|
||||
|
||||
async def test_faculty_scraper():
|
||||
"""测试导师爬取"""
|
||||
|
||||
# Test 3 programs
|
||||
test_programs = [
|
||||
"African and African American Studies",
|
||||
"Economics",
|
||||
"Computer Science"
|
||||
]
|
||||
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=False)
|
||||
context = await browser.new_context(
|
||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
||||
viewport={'width': 1920, 'height': 1080}
|
||||
)
|
||||
page = await context.new_page()
|
||||
|
||||
results = []
|
||||
|
||||
for i, name in enumerate(test_programs, 1):
|
||||
print(f"\n{'='*60}")
|
||||
print(f"[{i}/{len(test_programs)}] 测试: {name}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
slug = name_to_slug(name)
|
||||
program_url = f"https://www.harvard.edu/programs/{slug}/"
|
||||
gsas_url = f"https://gsas.harvard.edu/program/{slug}"
|
||||
|
||||
print(f"项目URL: {program_url}")
|
||||
print(f"GSAS URL: {gsas_url}")
|
||||
|
||||
faculty_list, faculty_page_url = await get_faculty_from_gsas_page(page, gsas_url)
|
||||
|
||||
results.append({
|
||||
'name': name,
|
||||
'url': program_url,
|
||||
'gsas_url': gsas_url,
|
||||
'faculty_page_url': faculty_page_url,
|
||||
'faculty': faculty_list,
|
||||
'faculty_count': len(faculty_list)
|
||||
})
|
||||
|
||||
await page.wait_for_timeout(1000)
|
||||
|
||||
await browser.close()
|
||||
|
||||
# Print results
|
||||
print(f"\n\n{'='*60}")
|
||||
print("测试结果汇总")
|
||||
print(f"{'='*60}")
|
||||
|
||||
for r in results:
|
||||
print(f"\n{r['name']}:")
|
||||
print(f" Faculty页面: {r['faculty_page_url'] or '未找到'}")
|
||||
print(f" 导师数量: {r['faculty_count']}")
|
||||
|
||||
# Save test results
|
||||
with open('test_faculty_results.json', 'w', encoding='utf-8') as f:
|
||||
json.dump(results, f, ensure_ascii=False, indent=2)
|
||||
print(f"\n测试结果已保存到: test_faculty_results.json")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_faculty_scraper())
|
||||
464
artifacts/test_manchester_scraper.py
Normal file
@ -0,0 +1,464 @@
|
||||
"""
|
||||
Test Manchester University scraper - improved faculty mapping
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
|
||||
MASTERS_PATHS = [
|
||||
"/study/masters/courses/list/",
|
||||
"/study/masters/courses/",
|
||||
"/postgraduate/taught/courses/",
|
||||
"/postgraduate/courses/list/",
|
||||
"/postgraduate/courses/",
|
||||
"/graduate/programs/",
|
||||
"/academics/graduate/programs/",
|
||||
"/programmes/masters/",
|
||||
"/masters/programmes/",
|
||||
"/admissions/graduate/programs/",
|
||||
]
|
||||
|
||||
ACCOUNTING_STAFF_URL = "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/"
|
||||
ACCOUNTING_STAFF_CACHE = None
|
||||
|
||||
|
||||
JS_CHECK_COURSES = r"""() => {
|
||||
const links = document.querySelectorAll('a[href]');
|
||||
let courseCount = 0;
|
||||
for (const a of links) {
|
||||
const href = a.href.toLowerCase();
|
||||
if (/\/\d{4,}\//.test(href) ||
|
||||
/\/(msc|ma|mba|mres|llm|med|meng)-/.test(href) ||
|
||||
/\/course\/[a-z]/.test(href)) {
|
||||
courseCount++;
|
||||
}
|
||||
}
|
||||
return courseCount;
|
||||
}"""
|
||||
|
||||
JS_FIND_LIST_URL = """() => {
|
||||
const links = document.querySelectorAll('a[href]');
|
||||
for (const a of links) {
|
||||
const text = a.innerText.toLowerCase();
|
||||
const href = a.href.toLowerCase();
|
||||
if ((text.includes('a-z') || text.includes('all course') ||
|
||||
text.includes('full list') || text.includes('browse all') ||
|
||||
href.includes('/list')) &&
|
||||
(href.includes('master') || href.includes('course') || href.includes('postgrad'))) {
|
||||
return a.href;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}"""
|
||||
|
||||
JS_FIND_COURSES_FROM_HOME = """() => {
|
||||
const links = document.querySelectorAll('a[href]');
|
||||
for (const a of links) {
|
||||
const href = a.href.toLowerCase();
|
||||
const text = a.innerText.toLowerCase();
|
||||
if ((href.includes('master') || href.includes('postgraduate') || href.includes('graduate')) &&
|
||||
(href.includes('course') || href.includes('program') || href.includes('degree'))) {
|
||||
return a.href;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}"""
|
||||
|
||||
JS_EXTRACT_PROGRAMS = r"""() => {
|
||||
const programs = [];
|
||||
const seen = new Set();
|
||||
const currentHost = window.location.hostname;
|
||||
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href;
|
||||
const text = a.innerText.trim().replace(/\s+/g, ' ');
|
||||
|
||||
if (!href || seen.has(href)) return;
|
||||
if (text.length < 5 || text.length > 200) return;
|
||||
if (href.includes('#') || href.includes('javascript:') || href.includes('mailto:')) return;
|
||||
|
||||
try {
|
||||
const linkHost = new URL(href).hostname;
|
||||
if (!linkHost.includes(currentHost.replace('www.', '')) &&
|
||||
!currentHost.includes(linkHost.replace('www.', ''))) return;
|
||||
} catch {
|
||||
return;
|
||||
}
|
||||
|
||||
const hrefLower = href.toLowerCase();
|
||||
const textLower = text.toLowerCase();
|
||||
|
||||
const isNavigation = textLower === 'courses' ||
|
||||
textLower === 'programmes' ||
|
||||
textLower === 'undergraduate' ||
|
||||
textLower === 'postgraduate' ||
|
||||
textLower === 'masters' ||
|
||||
textLower === "master's" ||
|
||||
textLower.includes('skip to') ||
|
||||
textLower.includes('share') ||
|
||||
textLower === 'home' ||
|
||||
textLower === 'study' ||
|
||||
textLower.startsWith('a-z') ||
|
||||
textLower.includes('admission') ||
|
||||
textLower.includes('fees and funding') ||
|
||||
textLower.includes('why should') ||
|
||||
textLower.includes('why manchester') ||
|
||||
textLower.includes('teaching and learning') ||
|
||||
textLower.includes('meet us') ||
|
||||
textLower.includes('student support') ||
|
||||
textLower.includes('contact us') ||
|
||||
textLower.includes('how to apply') ||
|
||||
hrefLower.includes('/admissions/') ||
|
||||
hrefLower.includes('/fees-and-funding/') ||
|
||||
hrefLower.includes('/why-') ||
|
||||
hrefLower.includes('/meet-us/') ||
|
||||
hrefLower.includes('/contact-us/') ||
|
||||
hrefLower.includes('/student-support/') ||
|
||||
hrefLower.includes('/teaching-and-learning/') ||
|
||||
hrefLower.endsWith('/courses/') ||
|
||||
hrefLower.endsWith('/masters/') ||
|
||||
hrefLower.endsWith('/postgraduate/');
|
||||
|
||||
if (isNavigation) return;
|
||||
|
||||
const isExcluded = hrefLower.includes('/undergraduate') ||
|
||||
hrefLower.includes('/bachelor') ||
|
||||
hrefLower.includes('/phd/') ||
|
||||
hrefLower.includes('/doctoral') ||
|
||||
hrefLower.includes('/research-degree') ||
|
||||
textLower.includes('bachelor') ||
|
||||
textLower.includes('undergraduate') ||
|
||||
(textLower.includes('phd') && !textLower.includes('mphil'));
|
||||
|
||||
if (isExcluded) return;
|
||||
|
||||
const hasNumericId = /\/\d{4,}\//.test(href);
|
||||
const hasDegreeSlug = /\/(msc|ma|mba|mres|llm|med|meng|mpa|mph|mphil)-[a-z]/.test(hrefLower);
|
||||
const isCoursePage = (hrefLower.includes('/course/') ||
|
||||
hrefLower.includes('/courses/list/') ||
|
||||
hrefLower.includes('/programme/')) &&
|
||||
href.split('/').filter(p => p).length > 4;
|
||||
const textHasDegree = /(msc|ma|mba|mres|llm|med|meng|pgcert|pgdip)/i.test(text) ||
|
||||
textLower.includes('master');
|
||||
|
||||
if (hasNumericId || hasDegreeSlug || isCoursePage || textHasDegree) {
|
||||
seen.add(href);
|
||||
programs.push({
|
||||
name: text,
|
||||
url: href
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return programs;
|
||||
}"""
|
||||
|
||||
JS_EXTRACT_FACULTY = r"""() => {
|
||||
const faculty = [];
|
||||
const seen = new Set();
|
||||
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href.toLowerCase();
|
||||
const text = a.innerText.trim();
|
||||
|
||||
if (seen.has(href)) return;
|
||||
if (text.length < 3 || text.length > 100) return;
|
||||
|
||||
const isStaff = href.includes('/people/') ||
|
||||
href.includes('/staff/') ||
|
||||
href.includes('/faculty/') ||
|
||||
href.includes('/profile/') ||
|
||||
href.includes('/academics/') ||
|
||||
href.includes('/researcher/');
|
||||
|
||||
if (isStaff) {
|
||||
seen.add(href);
|
||||
faculty.push({
|
||||
name: text.replace(/\s+/g, ' '),
|
||||
url: a.href
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return faculty.slice(0, 20);
|
||||
}"""
|
||||
|
||||
JS_EXTRACT_ACCOUNTING_STAFF = r"""() => {
|
||||
const rows = Array.from(document.querySelectorAll('table tbody tr'));
|
||||
const staff = [];
|
||||
|
||||
for (const row of rows) {
|
||||
const cells = row.querySelectorAll('td');
|
||||
if (!cells || cells.length < 2) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const nameCell = cells[1];
|
||||
const roleCell = cells[2];
|
||||
const emailCell = cells[5];
|
||||
|
||||
let profileUrl = '';
|
||||
let displayName = nameCell ? nameCell.innerText.trim() : '';
|
||||
const link = nameCell ? nameCell.querySelector('a[href]') : null;
|
||||
if (link) {
|
||||
profileUrl = link.href;
|
||||
displayName = link.innerText.trim() || displayName;
|
||||
}
|
||||
|
||||
if (!displayName) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let email = '';
|
||||
if (emailCell) {
|
||||
const emailLink = emailCell.querySelector('a[href^="mailto:"]');
|
||||
if (emailLink) {
|
||||
email = emailLink.href.replace('mailto:', '').trim();
|
||||
}
|
||||
}
|
||||
|
||||
staff.push({
|
||||
name: displayName,
|
||||
title: roleCell ? roleCell.innerText.trim() : '',
|
||||
url: profileUrl,
|
||||
email: email
|
||||
});
|
||||
}
|
||||
|
||||
return staff;
|
||||
}"""
|
||||
|
||||
|
||||
def should_use_accounting_staff(program_name: str) -> bool:
|
||||
lower_name = program_name.lower()
|
||||
return "msc" in lower_name and "accounting" in lower_name
|
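# Only programmes whose names contain both "msc" and "accounting" reuse the
# official AMBS staff directory (e.g. "MSc Accounting and Finance");
# "MSc Finance" on its own does not qualify.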
||||
|
||||
|
||||
async def load_accounting_staff(context, output_callback=None):
|
||||
global ACCOUNTING_STAFF_CACHE
|
||||
|
||||
if ACCOUNTING_STAFF_CACHE is not None:
|
||||
return ACCOUNTING_STAFF_CACHE
|
||||
|
||||
staff_page = await context.new_page()
|
||||
try:
|
||||
if output_callback:
|
||||
output_callback("info", "Loading official AMBS Accounting & Finance staff page...")
|
||||
|
||||
await staff_page.goto(ACCOUNTING_STAFF_URL, wait_until="domcontentloaded", timeout=30000)
|
||||
await staff_page.wait_for_timeout(2000)
|
||||
|
||||
ACCOUNTING_STAFF_CACHE = await staff_page.evaluate(JS_EXTRACT_ACCOUNTING_STAFF)
|
||||
|
||||
if output_callback:
|
||||
output_callback("info", f"Captured {len(ACCOUNTING_STAFF_CACHE)} faculty from the official staff page")
|
||||
|
||||
except Exception as exc:
|
||||
if output_callback:
|
||||
output_callback("error", f"Failed to load AMBS staff page: {exc}")
|
||||
ACCOUNTING_STAFF_CACHE = []
|
||||
finally:
|
||||
await staff_page.close()
|
||||
|
||||
return ACCOUNTING_STAFF_CACHE
|
||||
|
||||
|
||||
async def find_course_list_page(page, base_url, output_callback):
|
||||
for path in MASTERS_PATHS:
|
||||
test_url = base_url.rstrip('/') + path
|
||||
try:
|
||||
response = await page.goto(test_url, wait_until="domcontentloaded", timeout=15000)
|
||||
if response and response.status == 200:
|
||||
title = await page.title()
|
||||
if '404' not in title.lower() and 'not found' not in title.lower():
|
||||
has_courses = await page.evaluate(JS_CHECK_COURSES)
|
||||
if has_courses > 5:
|
||||
if output_callback:
|
||||
output_callback("info", f"Found course list: {path} ({has_courses} courses)")
|
||||
return test_url
|
||||
|
||||
list_url = await page.evaluate(JS_FIND_LIST_URL)
|
||||
if list_url:
|
||||
if output_callback:
|
||||
output_callback("info", f"Found full course list: {list_url}")
|
||||
return list_url
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
try:
|
||||
await page.goto(base_url, wait_until="domcontentloaded", timeout=30000)
|
||||
await page.wait_for_timeout(2000)
|
||||
courses_url = await page.evaluate(JS_FIND_COURSES_FROM_HOME)
|
||||
if courses_url:
|
||||
return courses_url
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
async def extract_course_links(page, output_callback):
|
||||
return await page.evaluate(JS_EXTRACT_PROGRAMS)
|
||||
|
||||
|
||||
async def scrape(output_callback=None):
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
context = await browser.new_context(
|
||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
)
|
||||
page = await context.new_page()
|
||||
|
||||
base_url = "https://www.manchester.ac.uk/"
|
||||
|
||||
result = {
|
||||
"name": "Manchester University",
|
||||
"url": base_url,
|
||||
"scraped_at": datetime.now(timezone.utc).isoformat(),
|
||||
"schools": []
|
||||
}
|
||||
|
||||
all_programs = []
|
||||
|
||||
try:
|
||||
if output_callback:
|
||||
output_callback("info", "Searching for masters course list...")
|
||||
|
||||
courses_url = await find_course_list_page(page, base_url, output_callback)
|
||||
|
||||
if not courses_url:
|
||||
if output_callback:
|
||||
output_callback("warning", "Course list not found, using homepage")
|
||||
courses_url = base_url
|
||||
|
||||
if output_callback:
|
||||
output_callback("info", "Extracting masters programs...")
|
||||
|
||||
await page.goto(courses_url, wait_until="domcontentloaded", timeout=30000)
|
||||
await page.wait_for_timeout(3000)
|
||||
|
||||
for _ in range(3):
|
||||
try:
|
||||
load_more = page.locator('button:has-text("Load more"), button:has-text("Show more"), button:has-text("View more"), a:has-text("Load more")')
|
||||
if await load_more.count() > 0:
|
||||
await load_more.first.click()
|
||||
await page.wait_for_timeout(2000)
|
||||
else:
|
||||
break
|
||||
except Exception:
|
||||
break
|
||||
|
||||
programs_data = await extract_course_links(page, output_callback)
|
||||
|
||||
if output_callback:
|
||||
output_callback("info", f"Found {len(programs_data)} masters programs")
|
||||
|
||||
print("\nTop 20 programs:")
|
||||
for i, prog in enumerate(programs_data[:20]):
|
||||
print(f" {i+1}. {prog['name'][:60]}")
|
||||
print(f" {prog['url']}")
|
||||
|
||||
max_detail_pages = min(len(programs_data), 30)
|
||||
detailed_processed = 0
|
||||
logged_official_staff = False
|
||||
|
||||
for prog in programs_data:
|
||||
faculty_data = []
|
||||
used_official_staff = False
|
||||
|
||||
if should_use_accounting_staff(prog['name']):
|
||||
staff_list = await load_accounting_staff(context, output_callback)
|
||||
if staff_list:
|
||||
used_official_staff = True
|
||||
if output_callback and not logged_official_staff:
|
||||
output_callback("info", "Using Alliance MBS Accounting & Finance staff directory for accounting programmes")
|
||||
logged_official_staff = True
|
||||
faculty_data = [
|
||||
{
|
||||
"name": person.get("name"),
|
||||
"url": person.get("url") or ACCOUNTING_STAFF_URL,
|
||||
"title": person.get("title"),
|
||||
"email": person.get("email"),
|
||||
"source": "Alliance Manchester Business School - Accounting & Finance staff"
|
||||
}
|
||||
for person in staff_list
|
||||
]
|
||||
|
||||
elif detailed_processed < max_detail_pages:
|
||||
detailed_processed += 1
|
||||
if output_callback and detailed_processed % 10 == 0:
|
||||
output_callback("info", f"Processing {detailed_processed}/{max_detail_pages}: {prog['name'][:50]}")
|
||||
try:
|
||||
await page.goto(prog['url'], wait_until="domcontentloaded", timeout=15000)
|
||||
await page.wait_for_timeout(800)
|
||||
|
||||
faculty_data = await page.evaluate(JS_EXTRACT_FACULTY)
|
||||
except Exception as e:
|
||||
if output_callback:
|
||||
output_callback("warning", f"Failed to capture faculty for {prog['name'][:50]}: {e}")
|
||||
faculty_data = []
|
||||
|
||||
program_entry = {
|
||||
"name": prog['name'],
|
||||
"url": prog['url'],
|
||||
"faculty": faculty_data
|
||||
}
|
||||
|
||||
if used_official_staff:
|
||||
program_entry["faculty_page_override"] = ACCOUNTING_STAFF_URL
|
||||
|
||||
all_programs.append(program_entry)
|
||||
|
||||
result["schools"] = [{
|
||||
"name": "Masters Programs",
|
||||
"url": courses_url,
|
||||
"programs": all_programs
|
||||
}]
|
||||
|
||||
if output_callback:
|
||||
total_faculty = sum(len(p.get('faculty', [])) for p in all_programs)
|
||||
output_callback("info", f"Done! {len(all_programs)} programs, {total_faculty} faculty")
|
||||
|
||||
except Exception as e:
|
||||
if output_callback:
|
||||
output_callback("error", f"Scraping error: {str(e)}")
|
||||
|
||||
finally:
|
||||
await browser.close()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def log_callback(level, message):
|
||||
print(f"[{level.upper()}] {message}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
result = asyncio.run(scrape(output_callback=log_callback))
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Scrape summary:")
|
||||
print("="*60)
|
||||
|
||||
if result.get("schools"):
|
||||
school = result["schools"][0]
|
||||
programs = school.get("programs", [])
|
||||
print(f"Course list URL: {school.get('url')}")
|
||||
print(f"Total programs: {len(programs)}")
|
||||
|
||||
faculty_count = sum(len(p.get('faculty', [])) for p in programs)
|
||||
print(f"Faculty total: {faculty_count}")
|
||||
|
||||
print("\nTop 10 programs:")
|
||||
for i, p in enumerate(programs[:10]):
|
||||
print(f" {i+1}. {p['name'][:60]}")
|
||||
if p.get("faculty"):
|
||||
print(f" Faculty entries: {len(p['faculty'])}")
|
||||
|
||||
with open("manchester_test_result.json", "w", encoding="utf-8") as f:
|
||||
json.dump(result, f, indent=2, ensure_ascii=False)
|
||||
print("\nSaved results to manchester_test_result.json")
|
||||