University-Playwright-Codeg…/artifacts/explore_faculty_page.py

"""
探索Harvard院系People/Faculty页面结构，获取导师列表
"""
import asyncio
from playwright.async_api import async_playwright

# JavaScript snippets evaluated inside the page. Their text (including the
# inner whitespace) is kept verbatim; only the Python around them is restructured.

# Collects anchors whose href contains "/people/", keeping those whose text is
# longer than 3 characters, does not contain "people" (case-insensitive), and
# whose href does not end with "/people/" or "/aaas-people".
# Returns a list of {name, url} objects.
_AAAS_PEOPLE_LINKS_JS = '''() => {
            const faculty = [];

            // 查找所有 /people/ 路径的链接
            document.querySelectorAll('a[href*="/people/"]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();

                // 过滤掉导航链接，只保留个人页面链接
                if (href.includes('/people/') && text.length > 3 &&
                    !text.toLowerCase().includes('people') &&
                    !href.endsWith('/people/') &&
                    !href.endsWith('/aaas-people')) {
                    faculty.push({
                        name: text,
                        url: href
                    });
                }
            });

            return faculty;
        }'''

# Scans every anchor with an href and keeps those whose lower-cased href
# contains "/people/", "/faculty/" or "/profile/", whose text is 4-99
# characters long and does not contain "faculty" or "people".
# Returns a list of {name, url} objects.
_ECON_FACULTY_LINKS_JS = '''() => {
            const faculty = [];

            // 查找所有可能的faculty链接
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                const lowerHref = href.toLowerCase();

                // 查找个人页面链接
                if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
                     lowerHref.includes('/profile/')) &&
                    text.length > 3 && text.length < 100 &&
                    !text.toLowerCase().includes('faculty') &&
                    !text.toLowerCase().includes('people')) {
                    faculty.push({
                        name: text,
                        url: href
                    });
                }
            });

            return faculty;
        }'''

# Dumps every anchor whose trimmed text is 3-99 characters long as
# {text, href} objects; used purely as a debugging aid.
_ALL_LINKS_JS = '''() => {
            const links = [];
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                if (text && text.length > 2 && text.length < 100) {
                    links.push({text: text, href: href});
                }
            });
            return links;
        }'''


async def _load_and_screenshot(page, url, screenshot_path):
    """Navigate *page* to *url*, let it settle, and save a full-page screenshot.

    Args:
        page: Playwright page object used for navigation.
        url: Address to open.
        screenshot_path: File path the full-page screenshot is written to.
    """
    await page.goto(url, wait_until='networkidle')
    # Extra fixed delay after network idle before capturing the page.
    await page.wait_for_timeout(3000)
    await page.screenshot(path=screenshot_path, full_page=True)
    print(f"已保存截图: {screenshot_path}")


def _print_faculty(entries, limit=None):
    """Print the total number of entries, then up to *limit* "name -> url" lines.

    Args:
        entries: List of dicts with ``name`` and ``url`` keys.
        limit: Maximum number of entries to list; ``None`` lists all of them.
    """
    print(f"\n找到 {len(entries)} 个教职员工:")
    for entry in entries[:limit]:
        print(f"  - {entry['name']} -> {entry['url']}")


async def explore_faculty_page():
    """Explore two Harvard department pages and print the faculty links found.

    Launches a headed Chromium browser, opens the AAAS people page and the
    Economics faculty page, saves a full-page screenshot of each
    (``aaas_people_page.png`` and ``econ_faculty_page.png``), and prints the
    candidate profile links extracted from each page. For the Economics page
    it also prints the first 40 links on the page as a debugging aid.

    The browser is closed in a ``finally`` block so it is shut down even when
    navigation, screenshotting, or script evaluation raises.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            page = await browser.new_page()

            # --- AAAS department people page ---
            people_url = "https://aaas.fas.harvard.edu/aaas-people"
            print(f"访问院系People页面: {people_url}")
            await _load_and_screenshot(page, people_url, "aaas_people_page.png")

            faculty_info = await page.evaluate(_AAAS_PEOPLE_LINKS_JS)
            _print_faculty(faculty_info)

            # --- Economics department faculty page ---
            print("\n\n========== 尝试经济学院系Faculty页面 ==========")
            econ_faculty_url = "http://economics.harvard.edu/people/people-type/faculty"
            print(f"访问: {econ_faculty_url}")
            await _load_and_screenshot(page, econ_faculty_url, "econ_faculty_page.png")

            econ_faculty = await page.evaluate(_ECON_FACULTY_LINKS_JS)
            # The count printed is the full total; only the first 30 are listed.
            _print_faculty(econ_faculty, limit=30)

            # Dump the first links on the page to help tune the filters above.
            print("\n\n页面上的所有链接:")
            all_links = await page.evaluate(_ALL_LINKS_JS)
            for link in all_links[:40]:
                print(f"  - {link['text'][:50]} -> {link['href']}")
        finally:
            # Close the browser on both the success and the error path.
            await browser.close()

# Run the exploration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(explore_faculty_page())