""" 探索Harvard项目页面结构,寻找导师信息 """ import asyncio from playwright.async_api import async_playwright async def explore_program_page(): async with async_playwright() as p: browser = await p.chromium.launch(headless=False) page = await browser.new_page() # 访问研究生院系页面 (GSAS) gsas_url = "https://gsas.harvard.edu/program/african-and-african-american-studies" print(f"访问研究生院系页面: {gsas_url}") await page.goto(gsas_url, wait_until='networkidle') await page.wait_for_timeout(3000) # 截图保存 await page.screenshot(path="gsas_program_page.png", full_page=True) print("已保存截图: gsas_program_page.png") # 分析页面结构 page_info = await page.evaluate('''() => { const info = { title: document.title, h1: document.querySelector('h1')?.innerText || '', allHeadings: [], facultyLinks: [], peopleLinks: [], allLinks: [] }; // 获取所有标题 document.querySelectorAll('h1, h2, h3, h4').forEach(h => { info.allHeadings.push({ tag: h.tagName, text: h.innerText.trim().substring(0, 100) }); }); // 查找所有链接 document.querySelectorAll('a[href]').forEach(a => { const href = a.href || ''; const text = a.innerText.trim(); // 检查是否与教职员工相关 const lowerHref = href.toLowerCase(); const lowerText = text.toLowerCase(); if (lowerHref.includes('faculty') || lowerHref.includes('people') || lowerHref.includes('professor') || lowerHref.includes('staff') || lowerText.includes('faculty') || lowerText.includes('people')) { info.facultyLinks.push({ text: text.substring(0, 100), href: href }); } // 检查是否是个人页面链接 if (href.includes('/people/') || href.includes('/faculty/') || href.includes('/profile/') || href.includes('/person/')) { info.peopleLinks.push({ text: text.substring(0, 100), href: href }); } // 保存所有主要链接 if (href && text.length > 2 && text.length < 150) { info.allLinks.push({ text: text, href: href }); } }); return info; }''') print(f"\n页面标题: {page_info['title']}") print(f"H1: {page_info['h1']}") print(f"\n所有标题 ({len(page_info['allHeadings'])}):") for h in page_info['allHeadings']: print(f" <{h['tag']}>: {h['text']}") print(f"\n教职员工相关链接 ({len(page_info['facultyLinks'])}):") for f in page_info['facultyLinks']: print(f" - {f['text']} -> {f['href']}") print(f"\n个人页面链接 ({len(page_info['peopleLinks'])}):") for p in page_info['peopleLinks']: print(f" - {p['text']} -> {p['href']}") print(f"\n所有链接 ({len(page_info['allLinks'])}):") for link in page_info['allLinks'][:50]: print(f" - {link['text'][:60]} -> {link['href']}") # 尝试另一个项目页面看看是否有不同结构 print("\n\n========== 尝试另一个项目页面 ==========") economics_url = "https://gsas.harvard.edu/program/economics" print(f"访问: {economics_url}") await page.goto(economics_url, wait_until='networkidle') await page.wait_for_timeout(3000) # 截图保存 await page.screenshot(path="gsas_economics_page.png", full_page=True) print("已保存截图: gsas_economics_page.png") # 分析 econ_info = await page.evaluate('''() => { const info = { title: document.title, facultyLinks: [], peopleLinks: [] }; document.querySelectorAll('a[href]').forEach(a => { const href = a.href || ''; const text = a.innerText.trim(); const lowerHref = href.toLowerCase(); const lowerText = text.toLowerCase(); if (lowerHref.includes('faculty') || lowerHref.includes('people') || lowerText.includes('faculty') || lowerText.includes('people')) { info.facultyLinks.push({ text: text.substring(0, 100), href: href }); } if (href.includes('/people/') || href.includes('/faculty/') || href.includes('/profile/') || href.includes('/person/')) { info.peopleLinks.push({ text: text.substring(0, 100), href: href }); } }); return info; }''') print(f"\n教职员工相关链接 ({len(econ_info['facultyLinks'])}):") for f in econ_info['facultyLinks']: print(f" - {f['text']} -> {f['href']}") print(f"\n个人页面链接 ({len(econ_info['peopleLinks'])}):") for p in econ_info['peopleLinks']: print(f" - {p['text']} -> {p['href']}") # 访问院系主页看看有没有Faculty页面 print("\n\n========== 尝试访问院系主页 ==========") dept_url = "https://aaas.fas.harvard.edu/" print(f"访问院系主页: {dept_url}") await page.goto(dept_url, wait_until='networkidle') await page.wait_for_timeout(3000) await page.screenshot(path="aaas_dept_page.png", full_page=True) print("已保存截图: aaas_dept_page.png") dept_info = await page.evaluate('''() => { const info = { title: document.title, navLinks: [], facultyLinks: [], peopleLinks: [] }; // 获取导航链接 document.querySelectorAll('nav a, [class*="nav"] a, [class*="menu"] a').forEach(a => { const href = a.href || ''; const text = a.innerText.trim(); if (text && text.length > 1 && text.length < 50) { info.navLinks.push({ text: text, href: href }); } }); document.querySelectorAll('a[href]').forEach(a => { const href = a.href || ''; const text = a.innerText.trim(); const lowerHref = href.toLowerCase(); const lowerText = text.toLowerCase(); if (lowerHref.includes('faculty') || lowerHref.includes('people') || lowerText.includes('faculty') || lowerText.includes('people')) { info.facultyLinks.push({ text: text.substring(0, 100), href: href }); } if (href.includes('/people/') || href.includes('/faculty/') || href.includes('/profile/')) { info.peopleLinks.push({ text: text.substring(0, 100), href: href }); } }); return info; }''') print(f"\n导航链接 ({len(dept_info['navLinks'])}):") for link in dept_info['navLinks'][:20]: print(f" - {link['text']} -> {link['href']}") print(f"\n教职员工相关链接 ({len(dept_info['facultyLinks'])}):") for f in dept_info['facultyLinks']: print(f" - {f['text']} -> {f['href']}") print(f"\n个人页面链接 ({len(dept_info['peopleLinks'])}):") for p in dept_info['peopleLinks'][:30]: print(f" - {p['text']} -> {p['href']}") await browser.close() if __name__ == "__main__": asyncio.run(explore_program_page())