Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
110
artifacts/explore_faculty_page.py
Normal file
110
artifacts/explore_faculty_page.py
Normal file
@ -0,0 +1,110 @@
|
||||
"""
|
||||
探索Harvard院系People/Faculty页面结构,获取导师列表
|
||||
"""
|
||||
import asyncio
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
async def _load_and_capture(page, url, screenshot_path):
    """Navigate ``page`` to ``url`` and save a full-page screenshot.

    Args:
        page: Playwright page used for the navigation.
        url: Address to open.
        screenshot_path: File the full-page screenshot is written to.
    """
    await page.goto(url, wait_until='networkidle')
    # Fixed 3 s pause after the network goes idle, before capturing the page.
    await page.wait_for_timeout(3000)
    await page.screenshot(path=screenshot_path, full_page=True)
    print(f"已保存截图: {screenshot_path}")


async def explore_faculty_page():
    """Explore two Harvard department pages and print the faculty links found.

    Opens the AAAS people page and the Economics faculty page in a visible
    (non-headless) Chromium window, saves a full-page screenshot of each, and
    prints the links that look like individual profile pages. For the
    Economics page, the first 40 text links on the page are also printed to
    help with debugging the selectors.

    The browser is closed in a ``finally`` block so that a navigation timeout
    or a failing in-page script does not skip the explicit close.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            page = await browser.new_page()

            # Visit the AAAS department People page.
            people_url = "https://aaas.fas.harvard.edu/aaas-people"
            print(f"访问院系People页面: {people_url}")
            await _load_and_capture(page, people_url, "aaas_people_page.png")

            # Collect every link that looks like a staff/faculty profile.
            faculty_info = await page.evaluate('''() => {
                const faculty = [];

                // 查找所有 /people/ 路径的链接
                document.querySelectorAll('a[href*="/people/"]').forEach(a => {
                    const href = a.href || '';
                    const text = a.innerText.trim();

                    // 过滤掉导航链接,只保留个人页面链接
                    if (href.includes('/people/') && text.length > 3 &&
                        !text.toLowerCase().includes('people') &&
                        !href.endsWith('/people/') &&
                        !href.endsWith('/aaas-people')) {
                        faculty.push({
                            name: text,
                            url: href
                        });
                    }
                });

                return faculty;
            }''')

            print(f"\n找到 {len(faculty_info)} 个教职员工:")
            for f in faculty_info:
                print(f"  - {f['name']} -> {f['url']}")

            # Try the Economics department Faculty page.
            print("\n\n========== 尝试经济学院系Faculty页面 ==========")
            econ_faculty_url = "http://economics.harvard.edu/people/people-type/faculty"
            print(f"访问: {econ_faculty_url}")
            await _load_and_capture(page, econ_faculty_url, "econ_faculty_page.png")

            econ_faculty = await page.evaluate('''() => {
                const faculty = [];

                // 查找所有可能的faculty链接
                document.querySelectorAll('a[href]').forEach(a => {
                    const href = a.href || '';
                    const text = a.innerText.trim();
                    const lowerHref = href.toLowerCase();

                    // 查找个人页面链接
                    if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
                         lowerHref.includes('/profile/')) &&
                        text.length > 3 && text.length < 100 &&
                        !text.toLowerCase().includes('faculty') &&
                        !text.toLowerCase().includes('people')) {
                        faculty.push({
                            name: text,
                            url: href
                        });
                    }
                });

                return faculty;
            }''')

            print(f"\n找到 {len(econ_faculty)} 个教职员工:")
            # Only the first 30 matches are printed; the count above is the full total.
            for f in econ_faculty[:30]:
                print(f"  - {f['name']} -> {f['url']}")

            # Dump the page's links for debugging.
            print("\n\n页面上的所有链接:")
            all_links = await page.evaluate('''() => {
                const links = [];
                document.querySelectorAll('a[href]').forEach(a => {
                    const href = a.href || '';
                    const text = a.innerText.trim();
                    if (text && text.length > 2 && text.length < 100) {
                        links.push({text: text, href: href});
                    }
                });
                return links;
            }''')
            # Show at most 40 links, each label truncated to 50 characters.
            for link in all_links[:40]:
                print(f"  - {link['text'][:50]} -> {link['href']}")
        finally:
            # Runs on both success and failure so the browser is always closed.
            await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: build the coroutine, then run it to completion
    # on a fresh event loop.
    exploration = explore_faculty_page()
    asyncio.run(exploration)
|
||||
Reference in New Issue
Block a user