Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker Compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
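The per-university config files themselves are not shown in this diff, so the following is a hypothetical sketch only: field names and structure are assumptions, not the committed format. The URL and keyword values are taken from the exploration script below.

# Hypothetical config sketch -- field names are illustrative assumptions.
HARVARD = {
    "name": "Harvard University",
    "base_url": "https://gsas.harvard.edu",  # GSAS program pages, as probed below
    "profile_url_patterns": ["/people/", "/faculty/", "/profile/", "/person/"],
    "faculty_keywords": ["faculty", "people", "professor", "staff"],
}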
artifacts/explore_program_page.py (new file)
@@ -0,0 +1,226 @@
"""
|
||||
探索Harvard项目页面结构,寻找导师信息
|
||||
"""
|
||||
import asyncio
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
async def explore_program_page():
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=False)
|
||||
page = await browser.new_page()
|
||||
|
||||
# 访问研究生院系页面 (GSAS)
|
||||
gsas_url = "https://gsas.harvard.edu/program/african-and-african-american-studies"
|
||||
print(f"访问研究生院系页面: {gsas_url}")
|
||||
|
||||
await page.goto(gsas_url, wait_until='networkidle')
|
||||
await page.wait_for_timeout(3000)
|
||||
|
||||
# 截图保存
|
||||
await page.screenshot(path="gsas_program_page.png", full_page=True)
|
||||
print("已保存截图: gsas_program_page.png")
|
||||
|
||||
# 分析页面结构
|
||||
page_info = await page.evaluate('''() => {
|
||||
const info = {
|
||||
title: document.title,
|
||||
h1: document.querySelector('h1')?.innerText || '',
|
||||
allHeadings: [],
|
||||
facultyLinks: [],
|
||||
peopleLinks: [],
|
||||
allLinks: []
|
||||
};
|
||||
|
||||
// 获取所有标题
|
||||
document.querySelectorAll('h1, h2, h3, h4').forEach(h => {
|
||||
info.allHeadings.push({
|
||||
tag: h.tagName,
|
||||
text: h.innerText.trim().substring(0, 100)
|
||||
});
|
||||
});
|
||||
|
||||
// 查找所有链接
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href || '';
|
||||
const text = a.innerText.trim();
|
||||
|
||||
// 检查是否与教职员工相关
|
||||
const lowerHref = href.toLowerCase();
|
||||
const lowerText = text.toLowerCase();
|
||||
|
||||
if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
|
||||
lowerHref.includes('professor') || lowerHref.includes('staff') ||
|
||||
lowerText.includes('faculty') || lowerText.includes('people')) {
|
||||
info.facultyLinks.push({
|
||||
text: text.substring(0, 100),
|
||||
href: href
|
||||
});
|
||||
}
|
||||
|
||||
// 检查是否是个人页面链接
|
||||
if (href.includes('/people/') || href.includes('/faculty/') ||
|
||||
href.includes('/profile/') || href.includes('/person/')) {
|
||||
info.peopleLinks.push({
|
||||
text: text.substring(0, 100),
|
||||
href: href
|
||||
});
|
||||
}
|
||||
|
||||
// 保存所有主要链接
|
||||
if (href && text.length > 2 && text.length < 150) {
|
||||
info.allLinks.push({
|
||||
text: text,
|
||||
href: href
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return info;
|
||||
}''')
|
||||
|
||||
print(f"\n页面标题: {page_info['title']}")
|
||||
print(f"H1: {page_info['h1']}")
|
||||
|
||||
print(f"\n所有标题 ({len(page_info['allHeadings'])}):")
|
||||
for h in page_info['allHeadings']:
|
||||
print(f" <{h['tag']}>: {h['text']}")
|
||||
|
||||
print(f"\n教职员工相关链接 ({len(page_info['facultyLinks'])}):")
|
||||
for f in page_info['facultyLinks']:
|
||||
print(f" - {f['text']} -> {f['href']}")
|
||||
|
||||
print(f"\n个人页面链接 ({len(page_info['peopleLinks'])}):")
|
||||
for p in page_info['peopleLinks']:
|
||||
print(f" - {p['text']} -> {p['href']}")
|
||||
|
||||
print(f"\n所有链接 ({len(page_info['allLinks'])}):")
|
||||
for link in page_info['allLinks'][:50]:
|
||||
print(f" - {link['text'][:60]} -> {link['href']}")
|
||||
|
||||
# 尝试另一个项目页面看看是否有不同结构
|
||||
print("\n\n========== 尝试另一个项目页面 ==========")
|
||||
economics_url = "https://gsas.harvard.edu/program/economics"
|
||||
print(f"访问: {economics_url}")
|
||||
|
||||
await page.goto(economics_url, wait_until='networkidle')
|
||||
await page.wait_for_timeout(3000)
|
||||
|
||||
# 截图保存
|
||||
await page.screenshot(path="gsas_economics_page.png", full_page=True)
|
||||
print("已保存截图: gsas_economics_page.png")
|
||||
|
||||
# 分析
|
||||
econ_info = await page.evaluate('''() => {
|
||||
const info = {
|
||||
title: document.title,
|
||||
facultyLinks: [],
|
||||
peopleLinks: []
|
||||
};
|
||||
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href || '';
|
||||
const text = a.innerText.trim();
|
||||
const lowerHref = href.toLowerCase();
|
||||
const lowerText = text.toLowerCase();
|
||||
|
||||
if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
|
||||
lowerText.includes('faculty') || lowerText.includes('people')) {
|
||||
info.facultyLinks.push({
|
||||
text: text.substring(0, 100),
|
||||
href: href
|
||||
});
|
||||
}
|
||||
|
||||
if (href.includes('/people/') || href.includes('/faculty/') ||
|
||||
href.includes('/profile/') || href.includes('/person/')) {
|
||||
info.peopleLinks.push({
|
||||
text: text.substring(0, 100),
|
||||
href: href
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return info;
|
||||
}''')
|
||||
|
||||
print(f"\n教职员工相关链接 ({len(econ_info['facultyLinks'])}):")
|
||||
for f in econ_info['facultyLinks']:
|
||||
print(f" - {f['text']} -> {f['href']}")
|
||||
|
||||
print(f"\n个人页面链接 ({len(econ_info['peopleLinks'])}):")
|
||||
for p in econ_info['peopleLinks']:
|
||||
print(f" - {p['text']} -> {p['href']}")
|
||||
|
||||
# 访问院系主页看看有没有Faculty页面
|
||||
print("\n\n========== 尝试访问院系主页 ==========")
|
||||
dept_url = "https://aaas.fas.harvard.edu/"
|
||||
print(f"访问院系主页: {dept_url}")
|
||||
|
||||
await page.goto(dept_url, wait_until='networkidle')
|
||||
await page.wait_for_timeout(3000)
|
||||
|
||||
await page.screenshot(path="aaas_dept_page.png", full_page=True)
|
||||
print("已保存截图: aaas_dept_page.png")
|
||||
|
||||
dept_info = await page.evaluate('''() => {
|
||||
const info = {
|
||||
title: document.title,
|
||||
navLinks: [],
|
||||
facultyLinks: [],
|
||||
peopleLinks: []
|
||||
};
|
||||
|
||||
// 获取导航链接
|
||||
document.querySelectorAll('nav a, [class*="nav"] a, [class*="menu"] a').forEach(a => {
|
||||
const href = a.href || '';
|
||||
const text = a.innerText.trim();
|
||||
if (text && text.length > 1 && text.length < 50) {
|
||||
info.navLinks.push({
|
||||
text: text,
|
||||
href: href
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href || '';
|
||||
const text = a.innerText.trim();
|
||||
const lowerHref = href.toLowerCase();
|
||||
const lowerText = text.toLowerCase();
|
||||
|
||||
if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
|
||||
lowerText.includes('faculty') || lowerText.includes('people')) {
|
||||
info.facultyLinks.push({
|
||||
text: text.substring(0, 100),
|
||||
href: href
|
||||
});
|
||||
}
|
||||
|
||||
if (href.includes('/people/') || href.includes('/faculty/') ||
|
||||
href.includes('/profile/')) {
|
||||
info.peopleLinks.push({
|
||||
text: text.substring(0, 100),
|
||||
href: href
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return info;
|
||||
}''')
|
||||
|
||||
print(f"\n导航链接 ({len(dept_info['navLinks'])}):")
|
||||
for link in dept_info['navLinks'][:20]:
|
||||
print(f" - {link['text']} -> {link['href']}")
|
||||
|
||||
print(f"\n教职员工相关链接 ({len(dept_info['facultyLinks'])}):")
|
||||
for f in dept_info['facultyLinks']:
|
||||
print(f" - {f['text']} -> {f['href']}")
|
||||
|
||||
print(f"\n个人页面链接 ({len(dept_info['peopleLinks'])}):")
|
||||
for p in dept_info['peopleLinks'][:30]:
|
||||
print(f" - {p['text']} -> {p['href']}")
|
||||
|
||||
await browser.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(explore_program_page())
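To reproduce the exploration locally, the script needs Playwright plus a Chromium build, and, because the browser launches with headless=False, a display (or a virtual one such as Xvfb):

    pip install playwright
    playwright install chromium
    python artifacts/explore_program_page.py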