Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities (see the sketch after this list)
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files
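
None of the config files appear in the excerpt below, so the following is a purely hypothetical sketch (HARVARD_CONFIG and its fields are assumptions, not the committed format): a per-university config might pair identifying metadata with the pages the scraper visits.

    # Hypothetical sketch only; the committed config format is not shown in this diff.
    HARVARD_CONFIG = {
        "university": "Harvard University",
        "base_url": "https://gsas.harvard.edu",
        "program_pages": [
            "https://gsas.harvard.edu/program/african-and-african-american-studies",
            "https://gsas.harvard.edu/program/economics",
        ],
        # Department homepages sometimes expose Faculty pages that program pages lack
        "department_pages": ["https://aaas.fas.harvard.edu/"],
    }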

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaoyu-crypto
2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions


@@ -0,0 +1,226 @@
"""
探索Harvard项目页面结构寻找导师信息
"""
import asyncio
from playwright.async_api import async_playwright
async def explore_program_page():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        # Visit a Graduate School of Arts and Sciences (GSAS) program page
        gsas_url = "https://gsas.harvard.edu/program/african-and-african-american-studies"
        print(f"Visiting GSAS program page: {gsas_url}")
        await page.goto(gsas_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)

        # Save a full-page screenshot for manual inspection
        await page.screenshot(path="gsas_program_page.png", full_page=True)
        print("Screenshot saved: gsas_program_page.png")

        # Analyze the page structure in the browser context
        page_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                h1: document.querySelector('h1')?.innerText || '',
                allHeadings: [],
                facultyLinks: [],
                peopleLinks: [],
                allLinks: []
            };
            // Collect all headings
            document.querySelectorAll('h1, h2, h3, h4').forEach(h => {
                info.allHeadings.push({
                    tag: h.tagName,
                    text: h.innerText.trim().substring(0, 100)
                });
            });
            // Inspect every link on the page
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                // Check whether the link looks faculty-related
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();
                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                        lowerHref.includes('professor') || lowerHref.includes('staff') ||
                        lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
                // Check whether the link points to a personal profile page
                if (href.includes('/people/') || href.includes('/faculty/') ||
                        href.includes('/profile/') || href.includes('/person/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
                // Keep every link with a reasonably sized label
                if (href && text.length > 2 && text.length < 150) {
                    info.allLinks.push({
                        text: text,
                        href: href
                    });
                }
            });
            return info;
        }''')
print(f"\n页面标题: {page_info['title']}")
print(f"H1: {page_info['h1']}")
print(f"\n所有标题 ({len(page_info['allHeadings'])}):")
for h in page_info['allHeadings']:
print(f" <{h['tag']}>: {h['text']}")
print(f"\n教职员工相关链接 ({len(page_info['facultyLinks'])}):")
for f in page_info['facultyLinks']:
print(f" - {f['text']} -> {f['href']}")
print(f"\n个人页面链接 ({len(page_info['peopleLinks'])}):")
for p in page_info['peopleLinks']:
print(f" - {p['text']} -> {p['href']}")
print(f"\n所有链接 ({len(page_info['allLinks'])}):")
for link in page_info['allLinks'][:50]:
print(f" - {link['text'][:60]} -> {link['href']}")

        # Try another program page to see whether the structure differs
        print("\n\n========== Trying another program page ==========")
        economics_url = "https://gsas.harvard.edu/program/economics"
        print(f"Visiting: {economics_url}")
        await page.goto(economics_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)

        # Save a full-page screenshot
        await page.screenshot(path="gsas_economics_page.png", full_page=True)
        print("Screenshot saved: gsas_economics_page.png")

        # Analyze the links on this page
        econ_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                facultyLinks: [],
                peopleLinks: []
            };
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();
                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                        lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
                if (href.includes('/people/') || href.includes('/faculty/') ||
                        href.includes('/profile/') || href.includes('/person/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
            });
            return info;
        }''')

        print(f"\nFaculty-related links ({len(econ_info['facultyLinks'])}):")
        for f in econ_info['facultyLinks']:
            print(f"  - {f['text']} -> {f['href']}")
        print(f"\nPersonal profile links ({len(econ_info['peopleLinks'])}):")
        for person in econ_info['peopleLinks']:
            print(f"  - {person['text']} -> {person['href']}")

        # Visit the department homepage to look for a dedicated Faculty page
        print("\n\n========== Trying the department homepage ==========")
        dept_url = "https://aaas.fas.harvard.edu/"
        print(f"Visiting department homepage: {dept_url}")
        await page.goto(dept_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)
        await page.screenshot(path="aaas_dept_page.png", full_page=True)
        print("Screenshot saved: aaas_dept_page.png")

        dept_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                navLinks: [],
                facultyLinks: [],
                peopleLinks: []
            };
            // Collect navigation links
            document.querySelectorAll('nav a, [class*="nav"] a, [class*="menu"] a').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                if (text && text.length > 1 && text.length < 50) {
                    info.navLinks.push({
                        text: text,
                        href: href
                    });
                }
            });
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();
                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                        lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
                if (href.includes('/people/') || href.includes('/faculty/') ||
                        href.includes('/profile/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
            });
            return info;
        }''')

        print(f"\nNavigation links ({len(dept_info['navLinks'])}):")
        for link in dept_info['navLinks'][:20]:
            print(f"  - {link['text']} -> {link['href']}")
        print(f"\nFaculty-related links ({len(dept_info['facultyLinks'])}):")
        for f in dept_info['facultyLinks']:
            print(f"  - {f['text']} -> {f['href']}")
        print(f"\nPersonal profile links ({len(dept_info['peopleLinks'])}):")
        for person in dept_info['peopleLinks'][:30]:
            print(f"  - {person['text']} -> {person['href']}")

        await browser.close()


if __name__ == "__main__":
    asyncio.run(explore_program_page())
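
The three page.evaluate calls above inline nearly identical link-bucketing JavaScript. A minimal sketch of how that extraction could be hoisted into one reusable helper follows; it is an illustration under the same faculty/people heuristics, not code from this commit, and EXTRACT_LINKS_JS / extract_faculty_links are hypothetical names. Running headless also makes it usable without a display.

import asyncio
from playwright.async_api import async_playwright

# Sketch: the same faculty/people heuristics as explore_program_page(),
# factored into one JS snippet so every page visit reuses it.
EXTRACT_LINKS_JS = """() => {
    const buckets = { facultyLinks: [], peopleLinks: [] };
    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href || '';
        const text = a.innerText.trim();
        const l = (href + ' ' + text).toLowerCase();
        if (l.includes('faculty') || l.includes('people')) {
            buckets.facultyLinks.push({ text: text.substring(0, 100), href: href });
        }
        if (/\\/(people|faculty|profile|person)\\//.test(href)) {
            buckets.peopleLinks.push({ text: text.substring(0, 100), href: href });
        }
    });
    return buckets;
}"""

async def extract_faculty_links(page, url):
    """Visit url and return its facultyLinks/peopleLinks buckets."""
    await page.goto(url, wait_until='networkidle')
    return await page.evaluate(EXTRACT_LINKS_JS)

async def main():
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)  # headless for unattended runs
        page = await browser.new_page()
        for url in ("https://gsas.harvard.edu/program/economics",
                    "https://aaas.fas.harvard.edu/"):
            links = await extract_faculty_links(page, url)
            print(url, len(links['facultyLinks']), len(links['peopleLinks']))
        await browser.close()

if __name__ == "__main__":
    asyncio.run(main())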