"""
探索Harvard项目页面结构寻找导师信息
"""
import asyncio
from playwright.async_api import async_playwright
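# Prerequisites (assumed, not pinned by this script): Playwright for Python
# plus a Chromium build, e.g. `pip install playwright` followed by
# `python -m playwright install chromium`.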


async def explore_program_page():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        # Visit a GSAS (Graduate School of Arts and Sciences) program page.
        gsas_url = "https://gsas.harvard.edu/program/african-and-african-american-studies"
        print(f"Visiting GSAS program page: {gsas_url}")
        # 'networkidle' waits until the page has had no network connections for
        # 500 ms; the extra timeout lets late-loading scripts finish rendering.
        await page.goto(gsas_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)

        # Save a full-page screenshot for manual inspection.
        await page.screenshot(path="gsas_program_page.png", full_page=True)
        print("Screenshot saved: gsas_program_page.png")
        # Analyze the page structure inside the browser context.
        page_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                h1: document.querySelector('h1')?.innerText || '',
                allHeadings: [],
                facultyLinks: [],
                peopleLinks: [],
                allLinks: []
            };
            // Collect every heading.
            document.querySelectorAll('h1, h2, h3, h4').forEach(h => {
                info.allHeadings.push({
                    tag: h.tagName,
                    text: h.innerText.trim().substring(0, 100)
                });
            });
            // Walk every link on the page.
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                // Does the link look faculty-related?
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();
                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                    lowerHref.includes('professor') || lowerHref.includes('staff') ||
                    lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
                // Does the link point at an individual profile page?
                if (href.includes('/people/') || href.includes('/faculty/') ||
                    href.includes('/profile/') || href.includes('/person/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
                // Keep any link with a plausible label.
                if (href && text.length > 2 && text.length < 150) {
                    info.allLinks.push({
                        text: text,
                        href: href
                    });
                }
            });
            return info;
        }''')
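        # NOTE: the same link-classification scan is repeated for each page
        # below. A possible refactor (a sketch only; LINK_SCAN_JS and
        # collect_links() are hypothetical names, not part of this repo) would
        # hoist the evaluate() body into a module-level constant:
        #
        #     async def collect_links(page):
        #         return await page.evaluate(LINK_SCAN_JS)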
print(f"\n页面标题: {page_info['title']}")
print(f"H1: {page_info['h1']}")
print(f"\n所有标题 ({len(page_info['allHeadings'])}):")
for h in page_info['allHeadings']:
print(f" <{h['tag']}>: {h['text']}")
print(f"\n教职员工相关链接 ({len(page_info['facultyLinks'])}):")
for f in page_info['facultyLinks']:
print(f" - {f['text']} -> {f['href']}")
print(f"\n个人页面链接 ({len(page_info['peopleLinks'])}):")
for p in page_info['peopleLinks']:
print(f" - {p['text']} -> {p['href']}")
print(f"\n所有链接 ({len(page_info['allLinks'])}):")
for link in page_info['allLinks'][:50]:
print(f" - {link['text'][:60]} -> {link['href']}")
# 尝试另一个项目页面看看是否有不同结构
print("\n\n========== 尝试另一个项目页面 ==========")
economics_url = "https://gsas.harvard.edu/program/economics"
print(f"访问: {economics_url}")
await page.goto(economics_url, wait_until='networkidle')
await page.wait_for_timeout(3000)
# 截图保存
await page.screenshot(path="gsas_economics_page.png", full_page=True)
print("已保存截图: gsas_economics_page.png")
        # Run the same link scan on the economics page.
        econ_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                facultyLinks: [],
                peopleLinks: []
            };
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();
                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                    lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
                if (href.includes('/people/') || href.includes('/faculty/') ||
                    href.includes('/profile/') || href.includes('/person/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
            });
            return info;
        }''')
print(f"\n教职员工相关链接 ({len(econ_info['facultyLinks'])}):")
for f in econ_info['facultyLinks']:
print(f" - {f['text']} -> {f['href']}")
print(f"\n个人页面链接 ({len(econ_info['peopleLinks'])}):")
for p in econ_info['peopleLinks']:
print(f" - {p['text']} -> {p['href']}")
# 访问院系主页看看有没有Faculty页面
print("\n\n========== 尝试访问院系主页 ==========")
dept_url = "https://aaas.fas.harvard.edu/"
print(f"访问院系主页: {dept_url}")
await page.goto(dept_url, wait_until='networkidle')
await page.wait_for_timeout(3000)
await page.screenshot(path="aaas_dept_page.png", full_page=True)
print("已保存截图: aaas_dept_page.png")
        dept_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                navLinks: [],
                facultyLinks: [],
                peopleLinks: []
            };
            // Collect navigation links.
            document.querySelectorAll('nav a, [class*="nav"] a, [class*="menu"] a').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                if (text && text.length > 1 && text.length < 50) {
                    info.navLinks.push({
                        text: text,
                        href: href
                    });
                }
            });
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();
                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                    lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
                if (href.includes('/people/') || href.includes('/faculty/') ||
                    href.includes('/profile/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
            });
            return info;
        }''')
print(f"\n导航链接 ({len(dept_info['navLinks'])}):")
for link in dept_info['navLinks'][:20]:
print(f" - {link['text']} -> {link['href']}")
print(f"\n教职员工相关链接 ({len(dept_info['facultyLinks'])}):")
for f in dept_info['facultyLinks']:
print(f" - {f['text']} -> {f['href']}")
print(f"\n个人页面链接 ({len(dept_info['peopleLinks'])}):")
for p in dept_info['peopleLinks'][:30]:
print(f" - {p['text']} -> {p['href']}")
await browser.close()
if __name__ == "__main__":
asyncio.run(explore_program_page())
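# Suggested invocation (assuming the prerequisites noted after the imports):
#   python explore_program_page.py
# The script opens a headed Chromium window, prints the link inventory for each
# page, and writes three full-page screenshots into the working directory:
# gsas_program_page.png, gsas_economics_page.png, and aaas_dept_page.png.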