- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
#!/usr/bin/env python3
"""
Harvard Graduate Programs Scraper with Faculty Information

Scrapes all graduate programs listed at
https://www.harvard.edu/programs/?degree_levels=graduate
and collects the faculty profile page URL for each program.
"""

import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright

def name_to_slug(name):
    """Convert a program name to a URL slug."""
    slug = name.lower()
    slug = re.sub(r'[^\w\s-]', '', slug)
    slug = re.sub(r'[\s_]+', '-', slug)
    slug = re.sub(r'-+', '-', slug)
    slug = slug.strip('-')
    return slug
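
# Illustrative example (not from the original source):
#   name_to_slug("Art, Film, and Visual Studies")  ->  "art-film-and-visual-studies"
# Punctuation is dropped; runs of whitespace/underscores collapse into single hyphens.
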
async def extract_faculty_from_page(page):
    """Extract all faculty profile links from the current page."""
    faculty_list = await page.evaluate('''() => {
        const faculty = [];
        const seen = new Set();

        document.querySelectorAll('a[href]').forEach(a => {
            const href = a.href || '';
            const text = a.innerText.trim();
            const lowerHref = href.toLowerCase();
            const lowerText = text.toLowerCase();

            // Check whether this looks like an individual profile link
            if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
                 lowerHref.includes('/profile/') || lowerHref.includes('/person/')) &&
                text.length > 3 && text.length < 100 &&
                !lowerText.includes('people') &&
                !lowerText.includes('faculty') &&
                !lowerText.includes('profile') &&
                !lowerText.includes('staff') &&
                !lowerHref.endsWith('/people/') &&
                !lowerHref.endsWith('/people') &&
                !lowerHref.endsWith('/faculty/') &&
                !lowerHref.endsWith('/faculty')) {

                if (!seen.has(href)) {
                    seen.add(href);
                    faculty.push({
                        name: text,
                        url: href
                    });
                }
            }
        });

        return faculty;
    }''')
    return faculty_list
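
# Shape of the returned entries (hypothetical values for illustration):
#   {"name": "Jane Doe", "url": "https://example.harvard.edu/people/jane-doe"}
# The heuristic keeps links whose URL path looks like a profile page and whose
# anchor text looks like a person's name rather than a navigation label.
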
async def get_faculty_from_gsas_page(page, gsas_url, program_name):
    """Find the faculty link on a GSAS program page, then visit the
    department People page to collect the faculty list."""
    faculty_list = []
    faculty_page_url = None

    try:
        print(f"  Visiting GSAS page: {gsas_url}")
        await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)

        # Strategy 1: look for a "See list of ... faculty" link
        faculty_link = await page.evaluate('''() => {
            const links = document.querySelectorAll('a[href]');
            for (const link of links) {
                const text = link.innerText.toLowerCase();
                const href = link.href;
                if (text.includes('faculty') && text.includes('see list')) {
                    return href;
                }
            }
            return null;
        }''')

        # Strategy 2: look for any link containing /people or /faculty
        if not faculty_link:
            faculty_link = await page.evaluate('''() => {
                const links = document.querySelectorAll('a[href]');
                for (const link of links) {
                    const text = link.innerText.toLowerCase();
                    const href = link.href.toLowerCase();
                    // Look for faculty-related links
                    if ((text.includes('faculty') || text.includes('people')) &&
                        (href.includes('/people') || href.includes('/faculty'))) {
                        return link.href;
                    }
                }
                return null;
            }''')

        # Strategy 3: find the department website link on the page,
        # then try that site's People page
        if not faculty_link:
            dept_website = await page.evaluate('''() => {
                const links = document.querySelectorAll('a[href]');
                for (const link of links) {
                    const text = link.innerText.toLowerCase();
                    const href = link.href;
                    // Look for a "Website" link (usually the department homepage)
                    if (text.includes('website') && href.includes('harvard.edu') &&
                        !href.includes('gsas.harvard.edu')) {
                        return href;
                    }
                }
                return null;
            }''')

            if dept_website:
                print(f"  Found department website: {dept_website}")
                try:
                    await page.goto(dept_website, wait_until="domcontentloaded", timeout=30000)
                    await page.wait_for_timeout(2000)

                    # Look for a People/Faculty link on the department site
                    faculty_link = await page.evaluate('''() => {
                        const links = document.querySelectorAll('a[href]');
                        for (const link of links) {
                            const text = link.innerText.toLowerCase().trim();
                            const href = link.href;
                            if ((text === 'people' || text === 'faculty' ||
                                 text === 'faculty & research' || text.includes('our faculty')) &&
                                (href.includes('/people') || href.includes('/faculty'))) {
                                return href;
                            }
                        }
                        return null;
                    }''')
                except Exception as e:
                    print(f"  Failed to visit department website: {e}")

        if faculty_link:
            faculty_page_url = faculty_link
            print(f"  Found faculty page: {faculty_link}")

            # Visit the Faculty/People page
            await page.goto(faculty_link, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(2000)

            # Extract all faculty entries
            faculty_list = await extract_faculty_from_page(page)

            # If nothing was found on the first pass, wait and retry once to
            # handle pagination or JavaScript-loaded layouts
            if len(faculty_list) == 0:
                await page.wait_for_timeout(2000)
                faculty_list = await extract_faculty_from_page(page)

            print(f"  Found {len(faculty_list)} faculty members")
        else:
            print("  No faculty page link found")

    except Exception as e:
        print(f"  Failed to fetch faculty information: {e}")

    return faculty_list, faculty_page_url
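
# Illustrative call, assuming an open Playwright `page`; the "history" slug is
# a hypothetical example of a GSAS program URL:
#   faculty, url = await get_faculty_from_gsas_page(
#       page, "https://gsas.harvard.edu/program/history", "History")
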
async def scrape_harvard_programs_with_faculty():
    """Scrape the Harvard graduate program list along with faculty information."""

    all_programs = []
    base_url = "https://www.harvard.edu/programs/?degree_levels=graduate"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={'width': 1920, 'height': 1080}
        )
        page = await context.new_page()

        print(f"Visiting: {base_url}")
        await page.goto(base_url, wait_until="domcontentloaded", timeout=60000)
        await page.wait_for_timeout(5000)

        # Scroll to the bottom of the page
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(2000)

        current_page = 1
        max_pages = 15

        # Phase 1: collect basic information for every program
        print("\n========== Phase 1: collecting program list ==========")
        while current_page <= max_pages:
            print(f"\n--- Page {current_page} ---")
            await page.wait_for_timeout(2000)

            # Extract the programs on the current page
            page_data = await page.evaluate('''() => {
                const programs = [];
                const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]');

                programItems.forEach((item, index) => {
                    const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]');
                    if (!nameBtn) return;

                    const name = nameBtn.innerText.trim();
                    if (!name || name.length < 3) return;

                    // Match Harvard degree abbreviations anywhere in the item text
                    let degrees = '';
                    const allText = item.innerText;
                    const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g);
                    if (degreeMatch) {
                        degrees = degreeMatch.join(', ');
                    }

                    programs.push({
                        name: name,
                        degrees: degrees
                    });
                });

                // Fallback: scan raw buttons if the structured selectors matched nothing
                if (programs.length === 0) {
                    const buttons = document.querySelectorAll('button');
                    buttons.forEach((btn) => {
                        const className = btn.className || '';
                        if (className.includes('c-programs-item') || className.includes('title-link')) {
                            const name = btn.innerText.trim();
                            if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) {
                                programs.push({
                                    name: name,
                                    degrees: ''
                                });
                            }
                        }
                    });
                }

                return programs;
            }''')

            print(f"  Found {len(page_data)} programs on this page")

            # De-duplicate by name while accumulating across pages
            for prog in page_data:
                name = prog['name'].strip()
                if name and not any(p['name'] == name for p in all_programs):
                    all_programs.append({
                        'name': name,
                        'degrees': prog.get('degrees', ''),
                        'page': current_page
                    })

            # Try to advance to the next page
            try:
                next_btn = page.locator('button.c-pagination__link--next')
                if await next_btn.count() > 0:
                    await next_btn.first.scroll_into_view_if_needed()
                    await next_btn.first.click()
                    await page.wait_for_timeout(3000)
                    current_page += 1
                else:
                    print("No next-page button; finishing collection")
                    break
            except Exception as e:
                print(f"Pagination failed: {e}")
                break

        print(f"\nCollected {len(all_programs)} programs in total")

        # Phase 2: fetch faculty information for each program
        print("\n========== Phase 2: fetching faculty information ==========")
        print("Note: this visits each program's GSAS page and may take a while...")

        for i, prog in enumerate(all_programs, 1):
            print(f"\n[{i}/{len(all_programs)}] {prog['name']}")

            # Build the program URL
            slug = name_to_slug(prog['name'])
            prog['url'] = f"https://www.harvard.edu/programs/{slug}/"

            # Build the GSAS URL
            gsas_url = f"https://gsas.harvard.edu/program/{slug}"

            # Fetch the faculty information
            faculty_list, faculty_page_url = await get_faculty_from_gsas_page(page, gsas_url, prog['name'])

            prog['faculty_page_url'] = faculty_page_url or ""
            prog['faculty'] = faculty_list
            prog['faculty_count'] = len(faculty_list)

            # Save progress every 10 programs
            if i % 10 == 0:
                temp_result = {
                    'source_url': base_url,
                    'scraped_at': datetime.now(timezone.utc).isoformat(),
                    'progress': f"{i}/{len(all_programs)}",
                    'programs': all_programs[:i]
                }
                with open('harvard_programs_progress.json', 'w', encoding='utf-8') as f:
                    json.dump(temp_result, f, ensure_ascii=False, indent=2)
                print("  [progress saved]")

            # Throttle to avoid sending requests too quickly
            await page.wait_for_timeout(1500)

        await browser.close()

    # Sort programs by name
    programs = sorted(all_programs, key=lambda x: x['name'])

    # Summary statistics
    total_faculty = sum(p['faculty_count'] for p in programs)
    programs_with_faculty = sum(1 for p in programs if p['faculty_count'] > 0)

    # Save the final result
    result = {
        'source_url': base_url,
        'scraped_at': datetime.now(timezone.utc).isoformat(),
        'total_pages_scraped': current_page,
        'total_programs': len(programs),
        'programs_with_faculty': programs_with_faculty,
        'total_faculty_found': total_faculty,
        'programs': programs
    }

    output_file = Path('harvard_programs_with_faculty.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n{'='*60}")
    print("Scraping complete!")
    print(f"Scraped {current_page} pages")
    print(f"Found {len(programs)} graduate programs")
    print(f"{programs_with_faculty} of them have faculty information")
    print(f"Found {total_faculty} faculty members in total")
    print(f"Results saved to: {output_file}")
    print(f"{'='*60}")

    # Print a summary
    print("\nProgram summary (first 30):")
    for i, prog in enumerate(programs[:30], 1):
        faculty_info = f"({prog['faculty_count']} faculty)" if prog['faculty_count'] > 0 else "(no faculty info)"
        print(f"{i:3}. {prog['name']} {faculty_info}")

    if len(programs) > 30:
        print(f"... and {len(programs) - 30} more programs")

    return result

if __name__ == "__main__":
    asyncio.run(scrape_harvard_programs_with_faculty())
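
# How to run (a sketch, assuming Playwright and its Chromium build are installed):
#   pip install playwright
#   playwright install chromium
#   python harvard_scraper.py   # or whatever this file is named
# Final output goes to harvard_programs_with_faculty.json; progress snapshots
# are written to harvard_programs_progress.json every 10 programs.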