Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: yangxiaoyu-crypto
Date: 2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd

75 changed files with 13527 additions and 2 deletions

@@ -0,0 +1,356 @@
#!/usr/bin/env python3
"""
Harvard Graduate Programs Scraper with Faculty Information
爬取 https://www.harvard.edu/programs/?degree_levels=graduate 页面的所有研究生项目
并获取每个项目的导师个人信息页面URL
"""
import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright


def name_to_slug(name):
"""将项目名称转换为URL slug"""
slug = name.lower()
slug = re.sub(r'[^\w\s-]', '', slug)
slug = re.sub(r'[\s_]+', '-', slug)
slug = re.sub(r'-+', '-', slug)
slug = slug.strip('-')
    return slug


async def extract_faculty_from_page(page):
    """Extract all faculty profile links from the current page as {name, url} dicts."""
faculty_list = await page.evaluate('''() => {
const faculty = [];
const seen = new Set();
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href || '';
const text = a.innerText.trim();
const lowerHref = href.toLowerCase();
const lowerText = text.toLowerCase();
            // Check whether this looks like a personal profile link
if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
lowerHref.includes('/profile/') || lowerHref.includes('/person/')) &&
text.length > 3 && text.length < 100 &&
!lowerText.includes('people') &&
!lowerText.includes('faculty') &&
!lowerText.includes('profile') &&
!lowerText.includes('staff') &&
!lowerHref.endsWith('/people/') &&
!lowerHref.endsWith('/people') &&
!lowerHref.endsWith('/faculty/') &&
!lowerHref.endsWith('/faculty')) {
if (!seen.has(href)) {
seen.add(href);
faculty.push({
name: text,
url: href
});
}
}
});
return faculty;
}''')
    return faculty_list


async def get_faculty_from_gsas_page(page, gsas_url, program_name):
    """Find the faculty link on a GSAS program page, then visit the
    department's People page and collect the faculty list."""
faculty_list = []
faculty_page_url = None
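    # Fallback chain: (1) an explicit "See list of ... faculty" link on the
    # GSAS page, (2) any faculty/people link there, (3) the department
    # website's own People/Faculty page.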
try:
print(f" 访问GSAS页面: {gsas_url}")
await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(2000)
        # Strategy 1: look for a "See list of ... faculty" link
faculty_link = await page.evaluate('''() => {
const links = document.querySelectorAll('a[href]');
for (const link of links) {
const text = link.innerText.toLowerCase();
const href = link.href;
if (text.includes('faculty') && text.includes('see list')) {
return href;
}
}
return null;
}''')
        # Strategy 2: look for any link containing /people or /faculty
if not faculty_link:
faculty_link = await page.evaluate('''() => {
const links = document.querySelectorAll('a[href]');
for (const link of links) {
const text = link.innerText.toLowerCase();
const href = link.href.toLowerCase();
                    // Look for faculty-related links
if ((text.includes('faculty') || text.includes('people')) &&
(href.includes('/people') || href.includes('/faculty'))) {
return link.href;
}
}
return null;
}''')
        # Strategy 3: find the department website link on the page, then try its People page
if not faculty_link:
dept_website = await page.evaluate('''() => {
const links = document.querySelectorAll('a[href]');
for (const link of links) {
const text = link.innerText.toLowerCase();
const href = link.href;
                    // Look for the Website link (usually the department homepage)
if (text.includes('website') && href.includes('harvard.edu') &&
!href.includes('gsas.harvard.edu')) {
return href;
}
}
return null;
}''')
if dept_website:
print(f" 找到院系网站: {dept_website}")
try:
await page.goto(dept_website, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(2000)
                    # Look for a People/Faculty link on the department site
faculty_link = await page.evaluate('''() => {
const links = document.querySelectorAll('a[href]');
for (const link of links) {
const text = link.innerText.toLowerCase().trim();
const href = link.href;
if ((text === 'people' || text === 'faculty' ||
text === 'faculty & research' || text.includes('our faculty')) &&
(href.includes('/people') || href.includes('/faculty'))) {
return href;
}
}
return null;
}''')
except Exception as e:
print(f" 访问院系网站失败: {e}")
if faculty_link:
faculty_page_url = faculty_link
print(f" 找到Faculty页面: {faculty_link}")
# 访问Faculty/People页面
await page.goto(faculty_link, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(2000)
            # Extract all faculty entries
faculty_list = await extract_faculty_from_page(page)
            # If nothing was found on the first pass, retry once to handle
            # paginated or JavaScript-rendered layouts
            if len(faculty_list) == 0:
                # Content may load late; wait and re-extract
await page.wait_for_timeout(2000)
faculty_list = await extract_faculty_from_page(page)
print(f" 找到 {len(faculty_list)} 位导师")
else:
print(f" 未找到Faculty页面链接")
except Exception as e:
print(f" 获取Faculty信息失败: {e}")
    return faculty_list, faculty_page_url


async def scrape_harvard_programs_with_faculty():
    """Scrape the Harvard graduate program list and each program's faculty info."""
all_programs = []
base_url = "https://www.harvard.edu/programs/?degree_levels=graduate"
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
viewport={'width': 1920, 'height': 1080}
)
page = await context.new_page()
print(f"正在访问: {base_url}")
await page.goto(base_url, wait_until="domcontentloaded", timeout=60000)
await page.wait_for_timeout(5000)
        # Scroll to the bottom of the page
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(2000)
current_page = 1
max_pages = 15
        # Phase 1: collect basic info for every program
        print("\n========== Phase 1: collecting the program list ==========")
while current_page <= max_pages:
print(f"\n--- 第 {current_page} 页 ---")
await page.wait_for_timeout(2000)
            # Extract the programs on the current page
page_data = await page.evaluate('''() => {
const programs = [];
const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]');
programItems.forEach((item, index) => {
const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]');
if (!nameBtn) return;
const name = nameBtn.innerText.trim();
if (!name || name.length < 3) return;
let degrees = '';
const allText = item.innerText;
const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g);
if (degreeMatch) {
degrees = degreeMatch.join(', ');
}
programs.push({
name: name,
degrees: degrees
});
});
if (programs.length === 0) {
const buttons = document.querySelectorAll('button');
buttons.forEach((btn) => {
const className = btn.className || '';
if (className.includes('c-programs-item') || className.includes('title-link')) {
const name = btn.innerText.trim();
if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) {
programs.push({
name: name,
degrees: ''
});
}
}
});
}
return programs;
}''')
print(f" 本页找到 {len(page_data)} 个项目")
for prog in page_data:
name = prog['name'].strip()
if name and not any(p['name'] == name for p in all_programs):
all_programs.append({
'name': name,
'degrees': prog.get('degrees', ''),
'page': current_page
})
            # Try to click the next-page button
try:
next_btn = page.locator('button.c-pagination__link--next')
if await next_btn.count() > 0:
await next_btn.first.scroll_into_view_if_needed()
await next_btn.first.click()
await page.wait_for_timeout(3000)
current_page += 1
else:
print("没有下一页按钮,结束收集")
break
except Exception as e:
print(f"分页失败: {e}")
break
print(f"\n共收集到 {len(all_programs)} 个项目")
# 第二阶段:为每个项目获取导师信息
print("\n========== 第二阶段:获取导师信息 ==========")
print("注意这将访问每个项目的GSAS页面可能需要较长时间...")
for i, prog in enumerate(all_programs, 1):
print(f"\n[{i}/{len(all_programs)}] {prog['name']}")
            # Build the program URL
            slug = name_to_slug(prog['name'])
            prog['url'] = f"https://www.harvard.edu/programs/{slug}/"
            # Build the corresponding GSAS URL
            gsas_url = f"https://gsas.harvard.edu/program/{slug}"
            # Fetch faculty info
faculty_list, faculty_page_url = await get_faculty_from_gsas_page(page, gsas_url, prog['name'])
prog['faculty_page_url'] = faculty_page_url or ""
prog['faculty'] = faculty_list
prog['faculty_count'] = len(faculty_list)
            # Save progress every 10 programs
if i % 10 == 0:
temp_result = {
'source_url': base_url,
'scraped_at': datetime.now(timezone.utc).isoformat(),
'progress': f"{i}/{len(all_programs)}",
'programs': all_programs[:i]
}
with open('harvard_programs_progress.json', 'w', encoding='utf-8') as f:
json.dump(temp_result, f, ensure_ascii=False, indent=2)
print(f" [进度已保存]")
            # Throttle to avoid hitting the site too fast
await page.wait_for_timeout(1500)
await browser.close()
    # Sort programs by name
    programs = sorted(all_programs, key=lambda x: x['name'])
    # Summary statistics
    total_faculty = sum(p['faculty_count'] for p in programs)
    programs_with_faculty = sum(1 for p in programs if p['faculty_count'] > 0)
    # Save the final result
result = {
'source_url': base_url,
'scraped_at': datetime.now(timezone.utc).isoformat(),
'total_pages_scraped': current_page,
'total_programs': len(programs),
'programs_with_faculty': programs_with_faculty,
'total_faculty_found': total_faculty,
'programs': programs
}
output_file = Path('harvard_programs_with_faculty.json')
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(result, f, ensure_ascii=False, indent=2)
print(f"\n{'='*60}")
print(f"爬取完成!")
print(f"共爬取 {current_page}")
print(f"共找到 {len(programs)} 个研究生项目")
print(f"其中 {programs_with_faculty} 个项目有导师信息")
print(f"共找到 {total_faculty} 位导师")
print(f"结果保存到: {output_file}")
print(f"{'='*60}")
    # Print a summary of the first 30 programs
    print("\nProgram summary (first 30):")
    for i, prog in enumerate(programs[:30], 1):
        faculty_info = f"({prog['faculty_count']} faculty)" if prog['faculty_count'] > 0 else "(no faculty info)"
        print(f"{i:3}. {prog['name']} {faculty_info}")
    if len(programs) > 30:
        print(f"... and {len(programs) - 30} more programs")
    return result


if __name__ == "__main__":
asyncio.run(scrape_harvard_programs_with_faculty())