Files
University-Playwright-Codeg…/artifacts/harvard_programs_with_faculty_scraper.py
yangxiaoyu-crypto 426cf4d2cd Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 15:25:08 +08:00

357 lines
14 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Harvard Graduate Programs Scraper with Faculty Information
爬取 https://www.harvard.edu/programs/?degree_levels=graduate 页面的所有研究生项目
并获取每个项目的导师个人信息页面URL
"""
import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright
def name_to_slug(name):
    """Convert a program name into a lowercase, hyphen-separated URL slug.

    Punctuation is dropped, runs of whitespace/underscores become single
    hyphens, repeated hyphens collapse, and edge hyphens are trimmed.
    """
    cleaned = re.sub(r'[^\w\s-]', '', name.lower())       # drop punctuation
    hyphenated = re.sub(r'[\s_]+', '-', cleaned)          # spaces/underscores -> '-'
    collapsed = re.sub(r'-+', '-', hyphenated)            # squeeze repeats
    return collapsed.strip('-')
async def extract_faculty_from_page(page):
    """Extract faculty/person profile links from the currently loaded page.

    Evaluates JavaScript in the page context: collects every anchor whose
    href contains /people/, /faculty/, /profile/ or /person/ and whose link
    text plausibly looks like an individual's name, de-duplicated by URL.

    Args:
        page: a Playwright Page with the target document already loaded.

    Returns:
        list[dict]: items of shape {'name': <link text>, 'url': <href>};
        empty if nothing matched.
    """
    # NOTE(review): the text-length window (4..99 chars) and the keyword
    # exclusions below are heuristics meant to drop listing/navigation links
    # ("People", "All Faculty", bare directory URLs) while keeping personal
    # profile links — individual names containing e.g. "Staff" would be
    # dropped too; tune if real profiles are missed.
    faculty_list = await page.evaluate('''() => {
        const faculty = [];
        const seen = new Set();
        document.querySelectorAll('a[href]').forEach(a => {
            const href = a.href || '';
            const text = a.innerText.trim();
            const lowerHref = href.toLowerCase();
            const lowerText = text.toLowerCase();
            // 检查是否是个人页面链接
            if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
                 lowerHref.includes('/profile/') || lowerHref.includes('/person/')) &&
                text.length > 3 && text.length < 100 &&
                !lowerText.includes('people') &&
                !lowerText.includes('faculty') &&
                !lowerText.includes('profile') &&
                !lowerText.includes('staff') &&
                !lowerHref.endsWith('/people/') &&
                !lowerHref.endsWith('/people') &&
                !lowerHref.endsWith('/faculty/') &&
                !lowerHref.endsWith('/faculty')) {
                if (!seen.has(href)) {
                    seen.add(href);
                    faculty.push({
                        name: text,
                        url: href
                    });
                }
            }
        });
        return faculty;
    }''')
    return faculty_list
async def get_faculty_from_gsas_page(page, gsas_url, program_name):
    """Find a program's faculty listing page via its GSAS page and scrape it.

    Navigates to ``gsas_url`` and tries three strategies, in order, to locate
    a faculty/people listing URL; if one is found, navigates there and
    extracts individual profile links via extract_faculty_from_page().

    Args:
        page: a Playwright Page, reused for all navigation (its location is
            clobbered by this function).
        gsas_url: guessed GSAS program page URL to start from.
        program_name: program display name.
            NOTE(review): currently unused — kept for interface stability;
            confirm whether it was meant for logging or slug fallback.

    Returns:
        tuple (faculty_list, faculty_page_url): the extracted profile dicts
        (possibly empty) and the faculty page URL that was scraped, or None
        if no faculty page was found. All navigation/extraction errors are
        caught and reported to stdout; this function never raises.
    """
    faculty_list = []
    faculty_page_url = None
    try:
        print(f" 访问GSAS页面: {gsas_url}")
        await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)
        # Strategy 1: an explicit "See list of ... faculty" link on the GSAS page
        faculty_link = await page.evaluate('''() => {
            const links = document.querySelectorAll('a[href]');
            for (const link of links) {
                const text = link.innerText.toLowerCase();
                const href = link.href;
                if (text.includes('faculty') && text.includes('see list')) {
                    return href;
                }
            }
            return null;
        }''')
        # Strategy 2: any link whose text AND href both look faculty/people related
        if not faculty_link:
            faculty_link = await page.evaluate('''() => {
                const links = document.querySelectorAll('a[href]');
                for (const link of links) {
                    const text = link.innerText.toLowerCase();
                    const href = link.href.toLowerCase();
                    // 查找Faculty相关链接
                    if ((text.includes('faculty') || text.includes('people')) &&
                        (href.includes('/people') || href.includes('/faculty'))) {
                        return link.href;
                    }
                }
                return null;
            }''')
        # Strategy 3: find the department's own website link (non-GSAS
        # harvard.edu), then look for a People/Faculty nav link over there
        if not faculty_link:
            dept_website = await page.evaluate('''() => {
                const links = document.querySelectorAll('a[href]');
                for (const link of links) {
                    const text = link.innerText.toLowerCase();
                    const href = link.href;
                    // 查找 Website 链接 (通常指向院系主页)
                    if (text.includes('website') && href.includes('harvard.edu') &&
                        !href.includes('gsas.harvard.edu')) {
                        return href;
                    }
                }
                return null;
            }''')
            if dept_website:
                print(f" 找到院系网站: {dept_website}")
                try:
                    await page.goto(dept_website, wait_until="domcontentloaded", timeout=30000)
                    await page.wait_for_timeout(2000)
                    # On the department site, look for a People/Faculty nav link
                    faculty_link = await page.evaluate('''() => {
                        const links = document.querySelectorAll('a[href]');
                        for (const link of links) {
                            const text = link.innerText.toLowerCase().trim();
                            const href = link.href;
                            if ((text === 'people' || text === 'faculty' ||
                                 text === 'faculty & research' || text.includes('our faculty')) &&
                                (href.includes('/people') || href.includes('/faculty'))) {
                                return href;
                            }
                        }
                        return null;
                    }''')
                except Exception as e:
                    print(f" 访问院系网站失败: {e}")
        if faculty_link:
            faculty_page_url = faculty_link
            print(f" 找到Faculty页面: {faculty_link}")
            # Navigate to the faculty/people listing page and scrape it
            await page.goto(faculty_link, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(2000)
            # Extract all profile links found on the listing page
            faculty_list = await extract_faculty_from_page(page)
            # If nothing was found, wait once more for late JS rendering
            # and retry (some listings load content asynchronously)
            if len(faculty_list) == 0:
                await page.wait_for_timeout(2000)
                faculty_list = await extract_faculty_from_page(page)
            print(f" 找到 {len(faculty_list)} 位导师")
        else:
            print(f" 未找到Faculty页面链接")
    except Exception as e:
        print(f" 获取Faculty信息失败: {e}")
    return faculty_list, faculty_page_url
async def scrape_harvard_programs_with_faculty():
    """Scrape Harvard graduate programs and each program's faculty links.

    Phase 1 pages through the harvard.edu graduate-programs listing (up to
    ``max_pages`` pages) collecting program names and degree abbreviations.
    Phase 2 derives each program's GSAS URL from its name slug and calls
    get_faculty_from_gsas_page() to collect faculty profile URLs, saving a
    progress checkpoint every 10 programs.

    Returns:
        dict: the final result object, also written to
        ``harvard_programs_with_faculty.json``.

    Fixes vs. original: duplicate-name detection now uses a set (the
    original rescanned the whole all_programs list per item, O(n^2)), and
    the final page-count message includes its missing "页" unit.
    """
    all_programs = []
    seen_names = set()  # O(1) duplicate check instead of scanning all_programs
    base_url = "https://www.harvard.edu/programs/?degree_levels=graduate"
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={'width': 1920, 'height': 1080}
        )
        page = await context.new_page()
        print(f"正在访问: {base_url}")
        await page.goto(base_url, wait_until="domcontentloaded", timeout=60000)
        await page.wait_for_timeout(5000)
        # Scroll to the bottom so lazily-rendered items appear
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(2000)
        current_page = 1
        max_pages = 15  # hard cap so a pagination bug cannot loop forever
        # Phase 1: collect basic info for every program
        print("\n========== 第一阶段:收集项目列表 ==========")
        while current_page <= max_pages:
            print(f"\n--- 第 {current_page} 页 ---")
            await page.wait_for_timeout(2000)
            # Extract the programs rendered on the current listing page
            page_data = await page.evaluate('''() => {
                const programs = [];
                const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]');
                programItems.forEach((item, index) => {
                    const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]');
                    if (!nameBtn) return;
                    const name = nameBtn.innerText.trim();
                    if (!name || name.length < 3) return;
                    let degrees = '';
                    const allText = item.innerText;
                    const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g);
                    if (degreeMatch) {
                        degrees = degreeMatch.join(', ');
                    }
                    programs.push({
                        name: name,
                        degrees: degrees
                    });
                });
                if (programs.length === 0) {
                    const buttons = document.querySelectorAll('button');
                    buttons.forEach((btn) => {
                        const className = btn.className || '';
                        if (className.includes('c-programs-item') || className.includes('title-link')) {
                            const name = btn.innerText.trim();
                            if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) {
                                programs.push({
                                    name: name,
                                    degrees: ''
                                });
                            }
                        }
                    });
                }
                return programs;
            }''')
            print(f" 本页找到 {len(page_data)} 个项目")
            for prog in page_data:
                name = prog['name'].strip()
                if name and name not in seen_names:
                    seen_names.add(name)
                    all_programs.append({
                        'name': name,
                        'degrees': prog.get('degrees', ''),
                        'page': current_page
                    })
            # Try to advance to the next page of results
            try:
                next_btn = page.locator('button.c-pagination__link--next')
                if await next_btn.count() > 0:
                    await next_btn.first.scroll_into_view_if_needed()
                    await next_btn.first.click()
                    await page.wait_for_timeout(3000)
                    current_page += 1
                else:
                    print("没有下一页按钮,结束收集")
                    break
            except Exception as e:
                print(f"分页失败: {e}")
                break
        print(f"\n共收集到 {len(all_programs)} 个项目")
        # Phase 2: fetch faculty info for each program
        print("\n========== 第二阶段:获取导师信息 ==========")
        print("注意这将访问每个项目的GSAS页面可能需要较长时间...")
        for i, prog in enumerate(all_programs, 1):
            print(f"\n[{i}/{len(all_programs)}] {prog['name']}")
            # Derive both URLs from the program name's slug
            slug = name_to_slug(prog['name'])
            prog['url'] = f"https://www.harvard.edu/programs/{slug}/"
            # NOTE(review): assumes gsas.harvard.edu mirrors the same slug
            # scheme — unmatched slugs simply yield no faculty.
            gsas_url = f"https://gsas.harvard.edu/program/{slug}"
            faculty_list, faculty_page_url = await get_faculty_from_gsas_page(page, gsas_url, prog['name'])
            prog['faculty_page_url'] = faculty_page_url or ""
            prog['faculty'] = faculty_list
            prog['faculty_count'] = len(faculty_list)
            # Checkpoint every 10 programs so a crash doesn't lose everything
            if i % 10 == 0:
                temp_result = {
                    'source_url': base_url,
                    'scraped_at': datetime.now(timezone.utc).isoformat(),
                    'progress': f"{i}/{len(all_programs)}",
                    'programs': all_programs[:i]
                }
                with open('harvard_programs_progress.json', 'w', encoding='utf-8') as f:
                    json.dump(temp_result, f, ensure_ascii=False, indent=2)
                print(f" [进度已保存]")
            # Throttle so we don't hammer the servers
            await page.wait_for_timeout(1500)
        await browser.close()
    # Sort programs alphabetically by name for the final output
    programs = sorted(all_programs, key=lambda x: x['name'])
    # Aggregate statistics
    total_faculty = sum(p['faculty_count'] for p in programs)
    programs_with_faculty = sum(1 for p in programs if p['faculty_count'] > 0)
    # Persist the final result
    result = {
        'source_url': base_url,
        'scraped_at': datetime.now(timezone.utc).isoformat(),
        'total_pages_scraped': current_page,
        'total_programs': len(programs),
        'programs_with_faculty': programs_with_faculty,
        'total_faculty_found': total_faculty,
        'programs': programs
    }
    output_file = Path('harvard_programs_with_faculty.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"\n{'='*60}")
    print(f"爬取完成!")
    print(f"共爬取 {current_page} 页")
    print(f"共找到 {len(programs)} 个研究生项目")
    print(f"其中 {programs_with_faculty} 个项目有导师信息")
    print(f"共找到 {total_faculty} 位导师")
    print(f"结果保存到: {output_file}")
    print(f"{'='*60}")
    # Print a short summary (first 30 programs)
    print("\n项目摘要 (前30个):")
    for i, prog in enumerate(programs[:30], 1):
        faculty_info = f"({prog['faculty_count']}位导师)" if prog['faculty_count'] > 0 else "(无导师信息)"
        print(f"{i:3}. {prog['name']} {faculty_info}")
    if len(programs) > 30:
        print(f"... 还有 {len(programs) - 30} 个项目")
    return result
if __name__ == "__main__":
    # Script entry point: run the full two-phase scrape to completion.
    asyncio.run(scrape_harvard_programs_with_faculty())