#!/usr/bin/env python3 """ Harvard Graduate Programs Scraper with Faculty Information 爬取 https://www.harvard.edu/programs/?degree_levels=graduate 页面的所有研究生项目 并获取每个项目的导师个人信息页面URL """ import asyncio import json import re from datetime import datetime, timezone from pathlib import Path from playwright.async_api import async_playwright def name_to_slug(name): """将项目名称转换为URL slug""" slug = name.lower() slug = re.sub(r'[^\w\s-]', '', slug) slug = re.sub(r'[\s_]+', '-', slug) slug = re.sub(r'-+', '-', slug) slug = slug.strip('-') return slug async def extract_faculty_from_page(page): """从当前页面提取所有教职员工链接""" faculty_list = await page.evaluate('''() => { const faculty = []; const seen = new Set(); document.querySelectorAll('a[href]').forEach(a => { const href = a.href || ''; const text = a.innerText.trim(); const lowerHref = href.toLowerCase(); const lowerText = text.toLowerCase(); // 检查是否是个人页面链接 if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') || lowerHref.includes('/profile/') || lowerHref.includes('/person/')) && text.length > 3 && text.length < 100 && !lowerText.includes('people') && !lowerText.includes('faculty') && !lowerText.includes('profile') && !lowerText.includes('staff') && !lowerHref.endsWith('/people/') && !lowerHref.endsWith('/people') && !lowerHref.endsWith('/faculty/') && !lowerHref.endsWith('/faculty')) { if (!seen.has(href)) { seen.add(href); faculty.push({ name: text, url: href }); } } }); return faculty; }''') return faculty_list async def get_faculty_from_gsas_page(page, gsas_url, program_name): """从GSAS项目页面获取Faculty链接,然后访问院系People页面获取导师列表""" faculty_list = [] faculty_page_url = None try: print(f" 访问GSAS页面: {gsas_url}") await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000) await page.wait_for_timeout(2000) # 策略1: 查找 "See list of ... faculty" 链接 faculty_link = await page.evaluate('''() => { const links = document.querySelectorAll('a[href]'); for (const link of links) { const text = link.innerText.toLowerCase(); const href = link.href; if (text.includes('faculty') && text.includes('see list')) { return href; } } return null; }''') # 策略2: 查找任何包含 /people 或 /faculty 的链接 if not faculty_link: faculty_link = await page.evaluate('''() => { const links = document.querySelectorAll('a[href]'); for (const link of links) { const text = link.innerText.toLowerCase(); const href = link.href.toLowerCase(); // 查找Faculty相关链接 if ((text.includes('faculty') || text.includes('people')) && (href.includes('/people') || href.includes('/faculty'))) { return link.href; } } return null; }''') # 策略3: 从页面中查找院系网站链接,然后尝试访问其People页面 if not faculty_link: dept_website = await page.evaluate('''() => { const links = document.querySelectorAll('a[href]'); for (const link of links) { const text = link.innerText.toLowerCase(); const href = link.href; // 查找 Website 链接 (通常指向院系主页) if (text.includes('website') && href.includes('harvard.edu') && !href.includes('gsas.harvard.edu')) { return href; } } return null; }''') if dept_website: print(f" 找到院系网站: {dept_website}") try: await page.goto(dept_website, wait_until="domcontentloaded", timeout=30000) await page.wait_for_timeout(2000) # 在院系网站上查找People/Faculty链接 faculty_link = await page.evaluate('''() => { const links = document.querySelectorAll('a[href]'); for (const link of links) { const text = link.innerText.toLowerCase().trim(); const href = link.href; if ((text === 'people' || text === 'faculty' || text === 'faculty & research' || text.includes('our faculty')) && (href.includes('/people') || href.includes('/faculty'))) { return href; } } return null; }''') except Exception as e: print(f" 访问院系网站失败: {e}") if faculty_link: faculty_page_url = faculty_link print(f" 找到Faculty页面: {faculty_link}") # 访问Faculty/People页面 await page.goto(faculty_link, wait_until="domcontentloaded", timeout=30000) await page.wait_for_timeout(2000) # 提取所有导师信息 faculty_list = await extract_faculty_from_page(page) # 如果第一页没找到,尝试处理分页或其他布局 if len(faculty_list) == 0: # 可能需要点击某些按钮或处理JavaScript加载 await page.wait_for_timeout(2000) faculty_list = await extract_faculty_from_page(page) print(f" 找到 {len(faculty_list)} 位导师") else: print(f" 未找到Faculty页面链接") except Exception as e: print(f" 获取Faculty信息失败: {e}") return faculty_list, faculty_page_url async def scrape_harvard_programs_with_faculty(): """爬取Harvard研究生项目列表及导师信息""" all_programs = [] base_url = "https://www.harvard.edu/programs/?degree_levels=graduate" async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context( user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", viewport={'width': 1920, 'height': 1080} ) page = await context.new_page() print(f"正在访问: {base_url}") await page.goto(base_url, wait_until="domcontentloaded", timeout=60000) await page.wait_for_timeout(5000) # 滚动到页面底部 await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") await page.wait_for_timeout(2000) current_page = 1 max_pages = 15 # 第一阶段:收集所有项目基本信息 print("\n========== 第一阶段:收集项目列表 ==========") while current_page <= max_pages: print(f"\n--- 第 {current_page} 页 ---") await page.wait_for_timeout(2000) # 提取当前页面的项目 page_data = await page.evaluate('''() => { const programs = []; const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]'); programItems.forEach((item, index) => { const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]'); if (!nameBtn) return; const name = nameBtn.innerText.trim(); if (!name || name.length < 3) return; let degrees = ''; const allText = item.innerText; const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g); if (degreeMatch) { degrees = degreeMatch.join(', '); } programs.push({ name: name, degrees: degrees }); }); if (programs.length === 0) { const buttons = document.querySelectorAll('button'); buttons.forEach((btn) => { const className = btn.className || ''; if (className.includes('c-programs-item') || className.includes('title-link')) { const name = btn.innerText.trim(); if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) { programs.push({ name: name, degrees: '' }); } } }); } return programs; }''') print(f" 本页找到 {len(page_data)} 个项目") for prog in page_data: name = prog['name'].strip() if name and not any(p['name'] == name for p in all_programs): all_programs.append({ 'name': name, 'degrees': prog.get('degrees', ''), 'page': current_page }) # 尝试点击下一页 try: next_btn = page.locator('button.c-pagination__link--next') if await next_btn.count() > 0: await next_btn.first.scroll_into_view_if_needed() await next_btn.first.click() await page.wait_for_timeout(3000) current_page += 1 else: print("没有下一页按钮,结束收集") break except Exception as e: print(f"分页失败: {e}") break print(f"\n共收集到 {len(all_programs)} 个项目") # 第二阶段:为每个项目获取导师信息 print("\n========== 第二阶段:获取导师信息 ==========") print("注意:这将访问每个项目的GSAS页面,可能需要较长时间...") for i, prog in enumerate(all_programs, 1): print(f"\n[{i}/{len(all_programs)}] {prog['name']}") # 生成项目URL slug = name_to_slug(prog['name']) prog['url'] = f"https://www.harvard.edu/programs/{slug}/" # 生成GSAS URL gsas_url = f"https://gsas.harvard.edu/program/{slug}" # 获取导师信息 faculty_list, faculty_page_url = await get_faculty_from_gsas_page(page, gsas_url, prog['name']) prog['faculty_page_url'] = faculty_page_url or "" prog['faculty'] = faculty_list prog['faculty_count'] = len(faculty_list) # 每10个项目保存一次进度 if i % 10 == 0: temp_result = { 'source_url': base_url, 'scraped_at': datetime.now(timezone.utc).isoformat(), 'progress': f"{i}/{len(all_programs)}", 'programs': all_programs[:i] } with open('harvard_programs_progress.json', 'w', encoding='utf-8') as f: json.dump(temp_result, f, ensure_ascii=False, indent=2) print(f" [进度已保存]") # 避免请求过快 await page.wait_for_timeout(1500) await browser.close() # 排序 programs = sorted(all_programs, key=lambda x: x['name']) # 统计 total_faculty = sum(p['faculty_count'] for p in programs) programs_with_faculty = sum(1 for p in programs if p['faculty_count'] > 0) # 保存最终结果 result = { 'source_url': base_url, 'scraped_at': datetime.now(timezone.utc).isoformat(), 'total_pages_scraped': current_page, 'total_programs': len(programs), 'programs_with_faculty': programs_with_faculty, 'total_faculty_found': total_faculty, 'programs': programs } output_file = Path('harvard_programs_with_faculty.json') with open(output_file, 'w', encoding='utf-8') as f: json.dump(result, f, ensure_ascii=False, indent=2) print(f"\n{'='*60}") print(f"爬取完成!") print(f"共爬取 {current_page} 页") print(f"共找到 {len(programs)} 个研究生项目") print(f"其中 {programs_with_faculty} 个项目有导师信息") print(f"共找到 {total_faculty} 位导师") print(f"结果保存到: {output_file}") print(f"{'='*60}") # 打印摘要 print("\n项目摘要 (前30个):") for i, prog in enumerate(programs[:30], 1): faculty_info = f"({prog['faculty_count']}位导师)" if prog['faculty_count'] > 0 else "(无导师信息)" print(f"{i:3}. {prog['name']} {faculty_info}") if len(programs) > 30: print(f"... 还有 {len(programs) - 30} 个项目") return result if __name__ == "__main__": asyncio.run(scrape_harvard_programs_with_faculty())