"""
探索Harvard项目页面结构寻找导师信息
"""
import asyncio
from playwright.async_api import async_playwright
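# Prerequisites (assumed, not pinned by this script): Playwright for Python
# plus a Chromium build, e.g. `pip install playwright` followed by
# `python -m playwright install chromium`.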


async def explore_program_page():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        # Visit a GSAS (Graduate School of Arts and Sciences) program page.
        gsas_url = "https://gsas.harvard.edu/program/african-and-african-american-studies"
        print(f"Visiting GSAS program page: {gsas_url}")
        # 'networkidle' waits until the page has had no network connections for
        # 500 ms; the extra timeout lets late-loading scripts finish rendering.
        await page.goto(gsas_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)

        # Save a full-page screenshot for manual inspection.
        await page.screenshot(path="gsas_program_page.png", full_page=True)
        print("Screenshot saved: gsas_program_page.png")
        # Analyze the page structure inside the browser context.
        page_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                h1: document.querySelector('h1')?.innerText || '',
                allHeadings: [],
                facultyLinks: [],
                peopleLinks: [],
                allLinks: []
            };
            // Collect every heading.
            document.querySelectorAll('h1, h2, h3, h4').forEach(h => {
                info.allHeadings.push({
                    tag: h.tagName,
                    text: h.innerText.trim().substring(0, 100)
                });
            });
            // Walk every link on the page.
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                // Does the link look faculty-related?
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();
                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                    lowerHref.includes('professor') || lowerHref.includes('staff') ||
                    lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
                // Does the link point at an individual profile page?
                if (href.includes('/people/') || href.includes('/faculty/') ||
                    href.includes('/profile/') || href.includes('/person/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
                // Keep any link with a plausible label.
                if (href && text.length > 2 && text.length < 150) {
                    info.allLinks.push({
                        text: text,
                        href: href
                    });
                }
            });
            return info;
        }''')
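        # NOTE: the same link-classification scan is repeated for each page
        # below. A possible refactor (a sketch only; LINK_SCAN_JS and
        # collect_links() are hypothetical names, not part of this repo) would
        # hoist the evaluate() body into a module-level constant:
        #
        #     async def collect_links(page):
        #         return await page.evaluate(LINK_SCAN_JS)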
print(f"\n页面标题: {page_info['title']}")
print(f"H1: {page_info['h1']}")
print(f"\n所有标题 ({len(page_info['allHeadings'])}):")
for h in page_info['allHeadings']:
print(f" <{h['tag']}>: {h['text']}")
print(f"\n教职员工相关链接 ({len(page_info['facultyLinks'])}):")
for f in page_info['facultyLinks']:
print(f" - {f['text']} -> {f['href']}")
print(f"\n个人页面链接 ({len(page_info['peopleLinks'])}):")
for p in page_info['peopleLinks']:
print(f" - {p['text']} -> {p['href']}")
print(f"\n所有链接 ({len(page_info['allLinks'])}):")
for link in page_info['allLinks'][:50]:
print(f" - {link['text'][:60]} -> {link['href']}")
# 尝试另一个项目页面看看是否有不同结构
print("\n\n========== 尝试另一个项目页面 ==========")
economics_url = "https://gsas.harvard.edu/program/economics"
print(f"访问: {economics_url}")
await page.goto(economics_url, wait_until='networkidle')
await page.wait_for_timeout(3000)
# 截图保存
await page.screenshot(path="gsas_economics_page.png", full_page=True)
print("已保存截图: gsas_economics_page.png")
        # Run the same link scan on the economics page.
        econ_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                facultyLinks: [],
                peopleLinks: []
            };
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();
                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                    lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
                if (href.includes('/people/') || href.includes('/faculty/') ||
                    href.includes('/profile/') || href.includes('/person/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
            });
            return info;
        }''')
print(f"\n教职员工相关链接 ({len(econ_info['facultyLinks'])}):")
for f in econ_info['facultyLinks']:
print(f" - {f['text']} -> {f['href']}")
print(f"\n个人页面链接 ({len(econ_info['peopleLinks'])}):")
for p in econ_info['peopleLinks']:
print(f" - {p['text']} -> {p['href']}")
# 访问院系主页看看有没有Faculty页面
print("\n\n========== 尝试访问院系主页 ==========")
dept_url = "https://aaas.fas.harvard.edu/"
print(f"访问院系主页: {dept_url}")
await page.goto(dept_url, wait_until='networkidle')
await page.wait_for_timeout(3000)
await page.screenshot(path="aaas_dept_page.png", full_page=True)
print("已保存截图: aaas_dept_page.png")
        dept_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                navLinks: [],
                facultyLinks: [],
                peopleLinks: []
            };
            // Collect navigation links.
            document.querySelectorAll('nav a, [class*="nav"] a, [class*="menu"] a').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                if (text && text.length > 1 && text.length < 50) {
                    info.navLinks.push({
                        text: text,
                        href: href
                    });
                }
            });
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();
                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                    lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
                if (href.includes('/people/') || href.includes('/faculty/') ||
                    href.includes('/profile/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
            });
            return info;
        }''')
print(f"\n导航链接 ({len(dept_info['navLinks'])}):")
for link in dept_info['navLinks'][:20]:
print(f" - {link['text']} -> {link['href']}")
print(f"\n教职员工相关链接 ({len(dept_info['facultyLinks'])}):")
for f in dept_info['facultyLinks']:
print(f" - {f['text']} -> {f['href']}")
print(f"\n个人页面链接 ({len(dept_info['peopleLinks'])}):")
for p in dept_info['peopleLinks'][:30]:
print(f" - {p['text']} -> {p['href']}")
await browser.close()
if __name__ == "__main__":
asyncio.run(explore_program_page())
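# Suggested invocation (assuming the prerequisites noted after the imports):
#   python explore_program_page.py
# The script opens a headed Chromium window, prints the link inventory for each
# page, and writes three full-page screenshots into the working directory:
# gsas_program_page.png, gsas_economics_page.png, and aaas_dept_page.png.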