- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
#!/usr/bin/env python3
"""
Harvard Graduate Programs Scraper with Faculty Information

Scrapes all graduate programs listed at
https://www.harvard.edu/programs/?degree_levels=graduate
and collects the faculty profile page URL for each program.
"""

import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright

def name_to_slug(name):
    """Convert a program name to a URL slug."""
    slug = name.lower()
    slug = re.sub(r'[^\w\s-]', '', slug)
    slug = re.sub(r'[\s_]+', '-', slug)
    slug = re.sub(r'-+', '-', slug)
    slug = slug.strip('-')
    return slug
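
# Illustrative example (not from the original source):
#   name_to_slug("Art, Film, and Visual Studies")  ->  "art-film-and-visual-studies"
# Punctuation is dropped; runs of whitespace/underscores collapse into single hyphens.
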
async def extract_faculty_from_page(page):
    """Extract all faculty profile links from the current page."""
    faculty_list = await page.evaluate('''() => {
        const faculty = [];
        const seen = new Set();

        document.querySelectorAll('a[href]').forEach(a => {
            const href = a.href || '';
            const text = a.innerText.trim();
            const lowerHref = href.toLowerCase();
            const lowerText = text.toLowerCase();

            // Check whether this looks like an individual profile link
            if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
                 lowerHref.includes('/profile/') || lowerHref.includes('/person/')) &&
                text.length > 3 && text.length < 100 &&
                !lowerText.includes('people') &&
                !lowerText.includes('faculty') &&
                !lowerText.includes('profile') &&
                !lowerText.includes('staff') &&
                !lowerHref.endsWith('/people/') &&
                !lowerHref.endsWith('/people') &&
                !lowerHref.endsWith('/faculty/') &&
                !lowerHref.endsWith('/faculty')) {

                if (!seen.has(href)) {
                    seen.add(href);
                    faculty.push({
                        name: text,
                        url: href
                    });
                }
            }
        });

        return faculty;
    }''')
    return faculty_list
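
# Shape of the returned entries (hypothetical values for illustration):
#   {"name": "Jane Doe", "url": "https://example.harvard.edu/people/jane-doe"}
# The heuristic keeps links whose URL path looks like a profile page and whose
# anchor text looks like a person's name rather than a navigation label.
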
async def get_faculty_from_gsas_page(page, gsas_url, program_name):
    """Find the faculty link on a GSAS program page, then visit the
    department People page to collect the faculty list."""
    faculty_list = []
    faculty_page_url = None

    try:
        print(f"  Visiting GSAS page: {gsas_url}")
        await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)

        # Strategy 1: look for a "See list of ... faculty" link
        faculty_link = await page.evaluate('''() => {
            const links = document.querySelectorAll('a[href]');
            for (const link of links) {
                const text = link.innerText.toLowerCase();
                const href = link.href;
                if (text.includes('faculty') && text.includes('see list')) {
                    return href;
                }
            }
            return null;
        }''')

        # Strategy 2: look for any link containing /people or /faculty
        if not faculty_link:
            faculty_link = await page.evaluate('''() => {
                const links = document.querySelectorAll('a[href]');
                for (const link of links) {
                    const text = link.innerText.toLowerCase();
                    const href = link.href.toLowerCase();
                    // Look for faculty-related links
                    if ((text.includes('faculty') || text.includes('people')) &&
                        (href.includes('/people') || href.includes('/faculty'))) {
                        return link.href;
                    }
                }
                return null;
            }''')

        # Strategy 3: find the department website link on the page,
        # then try that site's People page
        if not faculty_link:
            dept_website = await page.evaluate('''() => {
                const links = document.querySelectorAll('a[href]');
                for (const link of links) {
                    const text = link.innerText.toLowerCase();
                    const href = link.href;
                    // Look for a "Website" link (usually the department homepage)
                    if (text.includes('website') && href.includes('harvard.edu') &&
                        !href.includes('gsas.harvard.edu')) {
                        return href;
                    }
                }
                return null;
            }''')

            if dept_website:
                print(f"  Found department website: {dept_website}")
                try:
                    await page.goto(dept_website, wait_until="domcontentloaded", timeout=30000)
                    await page.wait_for_timeout(2000)

                    # Look for a People/Faculty link on the department site
                    faculty_link = await page.evaluate('''() => {
                        const links = document.querySelectorAll('a[href]');
                        for (const link of links) {
                            const text = link.innerText.toLowerCase().trim();
                            const href = link.href;
                            if ((text === 'people' || text === 'faculty' ||
                                 text === 'faculty & research' || text.includes('our faculty')) &&
                                (href.includes('/people') || href.includes('/faculty'))) {
                                return href;
                            }
                        }
                        return null;
                    }''')
                except Exception as e:
                    print(f"  Failed to visit department website: {e}")

        if faculty_link:
            faculty_page_url = faculty_link
            print(f"  Found faculty page: {faculty_link}")

            # Visit the Faculty/People page
            await page.goto(faculty_link, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(2000)

            # Extract all faculty entries
            faculty_list = await extract_faculty_from_page(page)

            # If nothing was found on the first pass, wait and retry once to
            # handle pagination or JavaScript-loaded layouts
            if len(faculty_list) == 0:
                await page.wait_for_timeout(2000)
                faculty_list = await extract_faculty_from_page(page)

            print(f"  Found {len(faculty_list)} faculty members")
        else:
            print("  No faculty page link found")

    except Exception as e:
        print(f"  Failed to fetch faculty information: {e}")

    return faculty_list, faculty_page_url
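
# Illustrative call, assuming an open Playwright `page`; the "history" slug is
# a hypothetical example of a GSAS program URL:
#   faculty, url = await get_faculty_from_gsas_page(
#       page, "https://gsas.harvard.edu/program/history", "History")
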
async def scrape_harvard_programs_with_faculty():
    """Scrape the Harvard graduate program list along with faculty information."""

    all_programs = []
    base_url = "https://www.harvard.edu/programs/?degree_levels=graduate"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={'width': 1920, 'height': 1080}
        )
        page = await context.new_page()

        print(f"Visiting: {base_url}")
        await page.goto(base_url, wait_until="domcontentloaded", timeout=60000)
        await page.wait_for_timeout(5000)

        # Scroll to the bottom of the page
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(2000)

        current_page = 1
        max_pages = 15

        # Phase 1: collect basic information for every program
        print("\n========== Phase 1: collecting program list ==========")
        while current_page <= max_pages:
            print(f"\n--- Page {current_page} ---")
            await page.wait_for_timeout(2000)

            # Extract the programs on the current page
            page_data = await page.evaluate('''() => {
                const programs = [];
                const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]');

                programItems.forEach((item, index) => {
                    const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]');
                    if (!nameBtn) return;

                    const name = nameBtn.innerText.trim();
                    if (!name || name.length < 3) return;

                    // Match Harvard degree abbreviations anywhere in the item text
                    let degrees = '';
                    const allText = item.innerText;
                    const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g);
                    if (degreeMatch) {
                        degrees = degreeMatch.join(', ');
                    }

                    programs.push({
                        name: name,
                        degrees: degrees
                    });
                });

                // Fallback: scan raw buttons if the structured selectors matched nothing
                if (programs.length === 0) {
                    const buttons = document.querySelectorAll('button');
                    buttons.forEach((btn) => {
                        const className = btn.className || '';
                        if (className.includes('c-programs-item') || className.includes('title-link')) {
                            const name = btn.innerText.trim();
                            if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) {
                                programs.push({
                                    name: name,
                                    degrees: ''
                                });
                            }
                        }
                    });
                }

                return programs;
            }''')

            print(f"  Found {len(page_data)} programs on this page")

            # De-duplicate by name while accumulating across pages
            for prog in page_data:
                name = prog['name'].strip()
                if name and not any(p['name'] == name for p in all_programs):
                    all_programs.append({
                        'name': name,
                        'degrees': prog.get('degrees', ''),
                        'page': current_page
                    })

            # Try to advance to the next page
            try:
                next_btn = page.locator('button.c-pagination__link--next')
                if await next_btn.count() > 0:
                    await next_btn.first.scroll_into_view_if_needed()
                    await next_btn.first.click()
                    await page.wait_for_timeout(3000)
                    current_page += 1
                else:
                    print("No next-page button; finishing collection")
                    break
            except Exception as e:
                print(f"Pagination failed: {e}")
                break

        print(f"\nCollected {len(all_programs)} programs in total")

        # Phase 2: fetch faculty information for each program
        print("\n========== Phase 2: fetching faculty information ==========")
        print("Note: this visits each program's GSAS page and may take a while...")

        for i, prog in enumerate(all_programs, 1):
            print(f"\n[{i}/{len(all_programs)}] {prog['name']}")

            # Build the program URL
            slug = name_to_slug(prog['name'])
            prog['url'] = f"https://www.harvard.edu/programs/{slug}/"

            # Build the GSAS URL
            gsas_url = f"https://gsas.harvard.edu/program/{slug}"

            # Fetch the faculty information
            faculty_list, faculty_page_url = await get_faculty_from_gsas_page(page, gsas_url, prog['name'])

            prog['faculty_page_url'] = faculty_page_url or ""
            prog['faculty'] = faculty_list
            prog['faculty_count'] = len(faculty_list)

            # Save progress every 10 programs
            if i % 10 == 0:
                temp_result = {
                    'source_url': base_url,
                    'scraped_at': datetime.now(timezone.utc).isoformat(),
                    'progress': f"{i}/{len(all_programs)}",
                    'programs': all_programs[:i]
                }
                with open('harvard_programs_progress.json', 'w', encoding='utf-8') as f:
                    json.dump(temp_result, f, ensure_ascii=False, indent=2)
                print("  [progress saved]")

            # Throttle to avoid sending requests too quickly
            await page.wait_for_timeout(1500)

        await browser.close()

    # Sort programs by name
    programs = sorted(all_programs, key=lambda x: x['name'])

    # Summary statistics
    total_faculty = sum(p['faculty_count'] for p in programs)
    programs_with_faculty = sum(1 for p in programs if p['faculty_count'] > 0)

    # Save the final result
    result = {
        'source_url': base_url,
        'scraped_at': datetime.now(timezone.utc).isoformat(),
        'total_pages_scraped': current_page,
        'total_programs': len(programs),
        'programs_with_faculty': programs_with_faculty,
        'total_faculty_found': total_faculty,
        'programs': programs
    }

    output_file = Path('harvard_programs_with_faculty.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n{'='*60}")
    print("Scraping complete!")
    print(f"Scraped {current_page} pages")
    print(f"Found {len(programs)} graduate programs")
    print(f"{programs_with_faculty} of them have faculty information")
    print(f"Found {total_faculty} faculty members in total")
    print(f"Results saved to: {output_file}")
    print(f"{'='*60}")

    # Print a summary
    print("\nProgram summary (first 30):")
    for i, prog in enumerate(programs[:30], 1):
        faculty_info = f"({prog['faculty_count']} faculty)" if prog['faculty_count'] > 0 else "(no faculty info)"
        print(f"{i:3}. {prog['name']} {faculty_info}")

    if len(programs) > 30:
        print(f"... and {len(programs) - 30} more programs")

    return result

if __name__ == "__main__":
    asyncio.run(scrape_harvard_programs_with_faculty())
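
# How to run (a sketch, assuming Playwright and its Chromium build are installed):
#   pip install playwright
#   playwright install chromium
#   python harvard_scraper.py   # or whatever this file is named
# Final output goes to harvard_programs_with_faculty.json; progress snapshots
# are written to harvard_programs_progress.json every 10 programs.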