#!/usr/bin/env python3 """ Harvard Graduate Programs Scraper 专门爬取 https://www.harvard.edu/programs/?degree_levels=graduate 页面的所有研究生项目 通过点击分页按钮遍历所有页面 """ import asyncio import json import re from datetime import datetime, timezone from pathlib import Path from playwright.async_api import async_playwright async def scrape_harvard_programs(): """爬取Harvard研究生项目列表页面 - 通过点击分页按钮""" all_programs = [] base_url = "https://www.harvard.edu/programs/?degree_levels=graduate" async with async_playwright() as p: # 使用无头模式 browser = await p.chromium.launch(headless=True) context = await browser.new_context( user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", viewport={'width': 1920, 'height': 1080} ) page = await context.new_page() print(f"正在访问: {base_url}") # 使用 domcontentloaded 而非 networkidle,更快加载 await page.goto(base_url, wait_until="domcontentloaded", timeout=60000) # 等待页面内容加载 await page.wait_for_timeout(5000) # 滚动到页面底部以确保分页按钮加载 print("滚动到页面底部...") await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") await page.wait_for_timeout(2000) current_page = 1 max_pages = 15 while current_page <= max_pages: print(f"\n========== 第 {current_page} 页 ==========") # 等待内容加载 await page.wait_for_timeout(2000) # 提取当前页面的项目 # 从调试输出得知,项目按钮的class是 'records__record___PbPhG c-programs-item__title-link' # 需要点击按钮来获取URL,因为Harvard使用JavaScript导航 # 首先获取所有项目按钮信息 page_data = await page.evaluate('''() => { const programs = []; // 查找所有项目行/容器 const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]'); programItems.forEach((item, index) => { // 获取项目名称按钮 const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]'); if (!nameBtn) return; const name = nameBtn.innerText.trim(); if (!name || name.length < 3) return; // 获取学位信息 let degrees = ''; const allText = item.innerText; const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g); if (degreeMatch) { degrees = degreeMatch.join(', '); } // 查找链接 - 检查各种可能的位置 let url = ''; // 方法1: 查找 标签 const link = item.querySelector('a[href]'); if (link && link.href) { url = link.href; } // 方法2: 检查data属性 if (!url) { const dataUrl = nameBtn.getAttribute('data-url') || nameBtn.getAttribute('data-href') || item.getAttribute('data-url'); if (dataUrl) url = dataUrl; } // 方法3: 检查onclick属性 if (!url) { const onclick = nameBtn.getAttribute('onclick') || ''; const urlMatch = onclick.match(/['"]([^'"]*\\/programs\\/[^'"]*)['"]/); if (urlMatch) url = urlMatch[1]; } programs.push({ name: name, degrees: degrees, url: url, index: index }); }); // 如果方法1没找到项目,使用备选方法 if (programs.length === 0) { // 查找所有项目按钮 const buttons = document.querySelectorAll('button'); buttons.forEach((btn, index) => { const className = btn.className || ''; if (className.includes('c-programs-item') || className.includes('title-link')) { const name = btn.innerText.trim(); if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) { programs.push({ name: name, degrees: '', url: '', index: index }); } } }); } return { programs: programs, totalFound: programs.length }; }''') # 第一页时调试输出HTML结构 if current_page == 1 and len(page_data['programs']) == 0: print("未找到项目,调试HTML结构...") html_debug = await page.evaluate('''() => { const debug = { allButtons: [], allLinks: [], sampleHTML: '' }; // 获取所有按钮 document.querySelectorAll('button').forEach(btn => { const text = btn.innerText.trim().substring(0, 50); if (text && text.length > 3) { debug.allButtons.push({ text: text, class: btn.className.substring(0, 80) }); } }); // 获取main区域的HTML片段 const main = document.querySelector('main') || document.body; debug.sampleHTML = main.innerHTML.substring(0, 3000); return debug; }''') print(f"找到 {len(html_debug['allButtons'])} 个按钮:") for btn in html_debug['allButtons'][:20]: print(f" - {btn['text']} | class: {btn['class']}") print(f"\nHTML片段:\n{html_debug['sampleHTML'][:1500]}") print(f" 本页找到 {len(page_data['programs'])} 个项目") # 打印找到的项目 for prog in page_data['programs']: print(f" - {prog['name']} ({prog['degrees']})") # 添加到总列表(去重) for prog in page_data['programs']: name = prog['name'].strip() if name and not any(p['name'] == name for p in all_programs): all_programs.append({ 'name': name, 'degrees': prog.get('degrees', ''), 'url': prog.get('url', ''), 'page': current_page }) # 尝试点击下一页按钮 try: clicked = False # 首先打印所有分页相关元素用于调试 if current_page == 1: # 截图保存以便调试 await page.screenshot(path="harvard_debug_pagination.png", full_page=True) print("已保存调试截图: harvard_debug_pagination.png") pagination_info = await page.evaluate('''() => { const result = { links: [], buttons: [], allClickable: [], pageNumbers: [], allText: [] }; // 查找所有链接 document.querySelectorAll('a').forEach(a => { const text = a.innerText.trim(); if (text.match(/^[0-9]+$|Next|page|Prev/i)) { result.links.push({ text: text.substring(0, 50), href: a.href, visible: a.offsetParent !== null, className: a.className }); } }); // 查找所有按钮 document.querySelectorAll('button').forEach(b => { const text = b.innerText.trim(); if (text.match(/^[0-9]+$|Next|page|Prev/i) || text.length < 20) { result.buttons.push({ text: text.substring(0, 50), visible: b.offsetParent !== null, className: b.className }); } }); // 查找所有包含数字的可点击元素(可能是分页) document.querySelectorAll('a, button, span[role="button"], div[role="button"], li a, nav a').forEach(el => { const text = el.innerText.trim(); if (text.match(/^[0-9]$/) || text === 'Next page' || text.includes('Next')) { result.pageNumbers.push({ tag: el.tagName, text: text, className: el.className, id: el.id, ariaLabel: el.getAttribute('aria-label'), visible: el.offsetParent !== null }); } }); // 查找页面底部区域的所有可点击元素 const bodyRect = document.body.getBoundingClientRect(); document.querySelectorAll('*').forEach(el => { const rect = el.getBoundingClientRect(); const text = el.innerText?.trim() || ''; // 只看页面下半部分的元素且文本短 if (rect.top > bodyRect.height * 0.5 && text.length > 0 && text.length < 30) { const style = window.getComputedStyle(el); if (style.cursor === 'pointer' || el.tagName === 'A' || el.tagName === 'BUTTON') { result.allClickable.push({ tag: el.tagName, text: text.substring(0, 30), top: Math.round(rect.top), className: el.className?.substring?.(0, 50) || '' }); } } }); // 输出页面底部所有文本以便调试 const bodyText = document.body.innerText; const lines = bodyText.split('\\n').filter(l => l.trim()); // 找到包含数字1-9的行 for (let i = 0; i < lines.length; i++) { if (lines[i].match(/^[1-9]$|Next page|Previous/)) { result.allText.push(lines[i]); } } return result; }''') print(f"\n分页相关链接 ({len(pagination_info['links'])} 个):") for link in pagination_info['links']: print(f" a: '{link['text']}' class='{link.get('className', '')}' (visible: {link['visible']})") print(f"\n分页相关按钮 ({len(pagination_info['buttons'])} 个):") for btn in pagination_info['buttons']: print(f" button: '{btn['text']}' class='{btn.get('className', '')}' (visible: {btn['visible']})") print(f"\n页码元素 ({len(pagination_info['pageNumbers'])} 个):") for pn in pagination_info['pageNumbers']: print(f" {pn['tag']}: '{pn['text']}' aria-label='{pn.get('ariaLabel')}' visible={pn['visible']}") print(f"\n页面下半部分可点击元素 ({len(pagination_info['allClickable'])} 个):") for el in pagination_info['allClickable'][:30]: print(f" {el['tag']}: '{el['text']}' (top: {el['top']})") print(f"\n页面中的分页文本 ({len(pagination_info['allText'])} 个):") for txt in pagination_info['allText'][:20]: print(f" '{txt}'") # 方法1: 直接使用CSS选择器查找 "Next page" 按钮 (最可靠) # 从调试输出得知,分页按钮是