"""Explore Harvard department People/Faculty pages and list faculty links.

The script opens two department pages in a visible Chromium window, saves a
full-page screenshot of each, and prints the links that look like individual
faculty profile pages so the page structure can be inspected by hand.
"""
import asyncio

from playwright.async_api import async_playwright

# Extra settle time (milliseconds) waited after navigation before a page is
# screenshotted and queried.
_SETTLE_MS = 3000

# Runs in the browser: collects anchors whose href contains "/people/" and
# drops navigation links (short text, text mentioning "people", or hrefs that
# end at the listing page itself).
_AAAS_FACULTY_JS = '''() => {
    const faculty = [];
    // 查找所有 /people/ 路径的链接
    document.querySelectorAll('a[href*="/people/"]').forEach(a => {
        const href = a.href || '';
        const text = a.innerText.trim();
        // 过滤掉导航链接,只保留个人页面链接
        if (href.includes('/people/') &&
            text.length > 3 &&
            !text.toLowerCase().includes('people') &&
            !href.endsWith('/people/') &&
            !href.endsWith('/aaas-people')) {
            faculty.push({ name: text, url: href });
        }
    });
    return faculty;
}'''

# Runs in the browser: collects anchors whose href contains "/people/",
# "/faculty/" or "/profile/" and whose text is 4-99 characters long and does
# not mention "faculty" or "people".
_ECON_FACULTY_JS = '''() => {
    const faculty = [];
    // 查找所有可能的faculty链接
    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href || '';
        const text = a.innerText.trim();
        const lowerHref = href.toLowerCase();
        // 查找个人页面链接
        if ((lowerHref.includes('/people/') ||
             lowerHref.includes('/faculty/') ||
             lowerHref.includes('/profile/')) &&
            text.length > 3 &&
            text.length < 100 &&
            !text.toLowerCase().includes('faculty') &&
            !text.toLowerCase().includes('people')) {
            faculty.push({ name: text, url: href });
        }
    });
    return faculty;
}'''

# Runs in the browser: returns every anchor with 3-99 characters of visible
# text, used as a debugging dump of the page's links.
_ALL_LINKS_JS = '''() => {
    const links = [];
    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href || '';
        const text = a.innerText.trim();
        if (text && text.length > 2 && text.length < 100) {
            links.push({text: text, href: href});
        }
    });
    return links;
}'''


async def _load_and_screenshot(page, url: str, screenshot_path: str) -> None:
    """Navigate *page* to *url*, wait for it to settle, and save a screenshot.

    Args:
        page: The Playwright page object to drive.
        url: Address to open.
        screenshot_path: File the full-page screenshot is written to.
    """
    await page.goto(url, wait_until='networkidle')
    await page.wait_for_timeout(_SETTLE_MS)
    await page.screenshot(path=screenshot_path, full_page=True)
    print(f"已保存截图: {screenshot_path}")


async def explore_faculty_page() -> None:
    """Visit the AAAS and Economics people pages and print faculty links.

    For each page a full-page screenshot is saved to the working directory
    and the extracted links are printed to stdout. The Economics output is
    limited to the first 30 faculty links and the first 40 raw links.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        # Close the browser even when navigation or evaluation raises (for
        # example a networkidle timeout), instead of only on the success path.
        try:
            page = await browser.new_page()

            # Visit the AAAS department People page.
            people_url = "https://aaas.fas.harvard.edu/aaas-people"
            print(f"访问院系People页面: {people_url}")
            await _load_and_screenshot(page, people_url, "aaas_people_page.png")

            # Collect every faculty/staff link on the page.
            faculty_info = await page.evaluate(_AAAS_FACULTY_JS)

            print(f"\n找到 {len(faculty_info)} 个教职员工:")
            for f in faculty_info:
                print(f" - {f['name']} -> {f['url']}")

            # Try the Economics department Faculty page.
            print("\n\n========== 尝试经济学院系Faculty页面 ==========")
            econ_faculty_url = "http://economics.harvard.edu/people/people-type/faculty"
            print(f"访问: {econ_faculty_url}")
            await _load_and_screenshot(
                page, econ_faculty_url, "econ_faculty_page.png"
            )

            econ_faculty = await page.evaluate(_ECON_FACULTY_JS)

            print(f"\n找到 {len(econ_faculty)} 个教职员工:")
            for f in econ_faculty[:30]:
                print(f" - {f['name']} -> {f['url']}")

            # Dump the page's links for debugging.
            print("\n\n页面上的所有链接:")
            all_links = await page.evaluate(_ALL_LINKS_JS)

            for link in all_links[:40]:
                print(f" - {link['text'][:50]} -> {link['href']}")
        finally:
            await browser.close()


if __name__ == "__main__":
    asyncio.run(explore_faculty_page())