Files
University-Playwright-Codeg…/artifacts/explore_faculty_page.py
yangxiaoyu-crypto 426cf4d2cd Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 15:25:08 +08:00

111 lines
4.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Explore the structure of Harvard department People/Faculty pages in order to
collect the list of faculty members (potential advisors).
"""
import asyncio
from playwright.async_api import async_playwright
async def explore_faculty_page(*, headless: bool = False):
    """Explore two Harvard department pages and print the faculty links found.

    The coroutine launches Chromium, then:

    1. Opens the AAAS "People" page, saves a full-page screenshot to
       ``aaas_people_page.png`` and prints every link whose URL contains
       ``/people/`` and passes the in-page navigation-link filter.
    2. Opens the Economics "Faculty" page, saves a full-page screenshot to
       ``econ_faculty_page.png`` and prints up to 30 links that look like
       personal profile pages.
    3. Prints up to 40 of the links found on the Economics page (link text
       truncated to 50 characters) as a debugging aid.

    Args:
        headless: Whether to launch the browser without a visible window.
            Defaults to ``False`` (a visible window), which is convenient for
            interactive exploration but requires a display.

    Returns:
        None. All results are printed to stdout or written as screenshots.

    Raises:
        Any exception raised while navigating, taking screenshots or
        evaluating the in-page scripts propagates to the caller; the browser
        is closed first.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        # Close the browser even when a navigation or evaluation step fails,
        # so a failed run does not skip the explicit shutdown.
        try:
            page = await browser.new_page()

            # --- AAAS department "People" page ---
            people_url = "https://aaas.fas.harvard.edu/aaas-people"
            print(f"访问院系People页面: {people_url}")
            await page.goto(people_url, wait_until='networkidle')
            # Extra fixed delay after network idle before reading the DOM.
            await page.wait_for_timeout(3000)

            # Keep a screenshot for manual inspection of the page layout.
            await page.screenshot(path="aaas_people_page.png", full_page=True)
            print("已保存截图: aaas_people_page.png")

            # Collect links under /people/, dropping those whose text mentions
            # "people" or whose URL is the listing page itself.
            faculty_info = await page.evaluate('''() => {
                const faculty = [];
                // 查找所有 /people/ 路径的链接
                document.querySelectorAll('a[href*="/people/"]').forEach(a => {
                    const href = a.href || '';
                    const text = a.innerText.trim();
                    // 过滤掉导航链接,只保留个人页面链接
                    if (href.includes('/people/') && text.length > 3 &&
                        !text.toLowerCase().includes('people') &&
                        !href.endsWith('/people/') &&
                        !href.endsWith('/aaas-people')) {
                        faculty.push({
                            name: text,
                            url: href
                        });
                    }
                });
                return faculty;
            }''')

            print(f"\n找到 {len(faculty_info)} 个教职员工:")
            for member in faculty_info:
                print(f" - {member['name']} -> {member['url']}")

            # --- Economics department "Faculty" page ---
            print("\n\n========== 尝试经济学院系Faculty页面 ==========")
            econ_faculty_url = "http://economics.harvard.edu/people/people-type/faculty"
            print(f"访问: {econ_faculty_url}")
            await page.goto(econ_faculty_url, wait_until='networkidle')
            await page.wait_for_timeout(3000)

            await page.screenshot(path="econ_faculty_page.png", full_page=True)
            print("已保存截图: econ_faculty_page.png")

            # Broader heuristic: any link whose URL contains /people/,
            # /faculty/ or /profile/ and whose text is 4-99 characters long
            # and does not mention "faculty" or "people".
            econ_faculty = await page.evaluate('''() => {
                const faculty = [];
                // 查找所有可能的faculty链接
                document.querySelectorAll('a[href]').forEach(a => {
                    const href = a.href || '';
                    const text = a.innerText.trim();
                    const lowerHref = href.toLowerCase();
                    // 查找个人页面链接
                    if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
                         lowerHref.includes('/profile/')) &&
                        text.length > 3 && text.length < 100 &&
                        !text.toLowerCase().includes('faculty') &&
                        !text.toLowerCase().includes('people')) {
                        faculty.push({
                            name: text,
                            url: href
                        });
                    }
                });
                return faculty;
            }''')

            print(f"\n找到 {len(econ_faculty)} 个教职员工:")
            # Only the first 30 matches are printed to keep the output short.
            for member in econ_faculty[:30]:
                print(f" - {member['name']} -> {member['url']}")

            # Dump a sample of the page's links to help tune the filters above.
            print("\n\n页面上的所有链接:")
            all_links = await page.evaluate('''() => {
                const links = [];
                document.querySelectorAll('a[href]').forEach(a => {
                    const href = a.href || '';
                    const text = a.innerText.trim();
                    if (text && text.length > 2 && text.length < 100) {
                        links.push({text: text, href: href});
                    }
                });
                return links;
            }''')

            for link in all_links[:40]:
                print(f" - {link['text'][:50]} -> {link['href']}")
        finally:
            await browser.close()
if __name__ == "__main__":
    # Script entry point: run the exploration coroutine to completion.
    asyncio.run(explore_faculty_page())