Files
University-Playwright-Codeg…/artifacts/harvard_programs_scraper.py
yangxiaoyu-crypto 426cf4d2cd Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 15:25:08 +08:00

467 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Harvard Graduate Programs Scraper
专门爬取 https://www.harvard.edu/programs/?degree_levels=graduate 页面的所有研究生项目
通过点击分页按钮遍历所有页面
"""
import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright
# In-page script: collects the program entries rendered on the current results
# page. Returns {programs: [{name, degrees, url, index}, ...], totalFound}.
# Executed verbatim by the browser, so its text is kept exactly as authored.
_EXTRACT_PROGRAMS_JS = '''() => {
const programs = [];
// 查找所有项目行/容器
const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]');
programItems.forEach((item, index) => {
// 获取项目名称按钮
const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]');
if (!nameBtn) return;
const name = nameBtn.innerText.trim();
if (!name || name.length < 3) return;
// 获取学位信息
let degrees = '';
const allText = item.innerText;
const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g);
if (degreeMatch) {
degrees = degreeMatch.join(', ');
}
// 查找链接 - 检查各种可能的位置
let url = '';
// 方法1: 查找 <a> 标签
const link = item.querySelector('a[href]');
if (link && link.href) {
url = link.href;
}
// 方法2: 检查data属性
if (!url) {
const dataUrl = nameBtn.getAttribute('data-url') ||
nameBtn.getAttribute('data-href') ||
item.getAttribute('data-url');
if (dataUrl) url = dataUrl;
}
// 方法3: 检查onclick属性
if (!url) {
const onclick = nameBtn.getAttribute('onclick') || '';
const urlMatch = onclick.match(/['"]([^'"]*\\/programs\\/[^'"]*)['"]/);
if (urlMatch) url = urlMatch[1];
}
programs.push({
name: name,
degrees: degrees,
url: url,
index: index
});
});
// 如果方法1没找到项目使用备选方法
if (programs.length === 0) {
// 查找所有项目按钮
const buttons = document.querySelectorAll('button');
buttons.forEach((btn, index) => {
const className = btn.className || '';
if (className.includes('c-programs-item') || className.includes('title-link')) {
const name = btn.innerText.trim();
if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) {
programs.push({
name: name,
degrees: '',
url: '',
index: index
});
}
}
});
}
return {
programs: programs,
totalFound: programs.length
};
}'''

# In-page script: dumps button labels/classes and the start of <main>'s HTML.
# Used only when the first page yields no programs.
_STRUCTURE_DEBUG_JS = '''() => {
const debug = {
allButtons: [],
allLinks: [],
sampleHTML: ''
};
// 获取所有按钮
document.querySelectorAll('button').forEach(btn => {
const text = btn.innerText.trim().substring(0, 50);
if (text && text.length > 3) {
debug.allButtons.push({
text: text,
class: btn.className.substring(0, 80)
});
}
});
// 获取main区域的HTML片段
const main = document.querySelector('main') || document.body;
debug.sampleHTML = main.innerHTML.substring(0, 3000);
return debug;
}'''

# In-page script: gathers everything that looks pagination-related (links,
# buttons, numbered elements, clickable elements in the lower half of the
# page, and matching text lines) for diagnostic printing.
_PAGINATION_DEBUG_JS = '''() => {
const result = {
links: [],
buttons: [],
allClickable: [],
pageNumbers: [],
allText: []
};
// 查找所有链接
document.querySelectorAll('a').forEach(a => {
const text = a.innerText.trim();
if (text.match(/^[0-9]+$|Next|page|Prev/i)) {
result.links.push({
text: text.substring(0, 50),
href: a.href,
visible: a.offsetParent !== null,
className: a.className
});
}
});
// 查找所有按钮
document.querySelectorAll('button').forEach(b => {
const text = b.innerText.trim();
if (text.match(/^[0-9]+$|Next|page|Prev/i) || text.length < 20) {
result.buttons.push({
text: text.substring(0, 50),
visible: b.offsetParent !== null,
className: b.className
});
}
});
// 查找所有包含数字的可点击元素(可能是分页)
document.querySelectorAll('a, button, span[role="button"], div[role="button"], li a, nav a').forEach(el => {
const text = el.innerText.trim();
if (text.match(/^[0-9]$/) || text === 'Next page' || text.includes('Next')) {
result.pageNumbers.push({
tag: el.tagName,
text: text,
className: el.className,
id: el.id,
ariaLabel: el.getAttribute('aria-label'),
visible: el.offsetParent !== null
});
}
});
// 查找页面底部区域的所有可点击元素
const bodyRect = document.body.getBoundingClientRect();
document.querySelectorAll('*').forEach(el => {
const rect = el.getBoundingClientRect();
const text = el.innerText?.trim() || '';
// 只看页面下半部分的元素且文本短
if (rect.top > bodyRect.height * 0.5 && text.length > 0 && text.length < 30) {
const style = window.getComputedStyle(el);
if (style.cursor === 'pointer' || el.tagName === 'A' || el.tagName === 'BUTTON') {
result.allClickable.push({
tag: el.tagName,
text: text.substring(0, 30),
top: Math.round(rect.top),
className: el.className?.substring?.(0, 50) || ''
});
}
}
});
// 输出页面底部所有文本以便调试
const bodyText = document.body.innerText;
const lines = bodyText.split('\\n').filter(l => l.trim());
// 找到包含数字1-9的行
for (let i = 0; i < lines.length; i++) {
if (lines[i].match(/^[1-9]$|Next page|Previous/)) {
result.allText.push(lines[i]);
}
}
return result;
}'''

# In-page script: clicks the "Next page" pagination button from inside the
# page and reports whether a matching button was found.
_CLICK_NEXT_JS = '''() => {
// 查找 Next page 按钮
const nextBtn = document.querySelector('button.c-pagination__link--next');
if (nextBtn) {
nextBtn.click();
return true;
}
// 备选:查找所有分页按钮
const buttons = document.querySelectorAll('button.c-pagination__link');
for (const btn of buttons) {
if (btn.innerText.includes('Next page')) {
btn.click();
return true;
}
}
return false;
}'''


def name_to_slug(name):
    """Convert a program name into the slug used to build its URL.

    Args:
        name: Program name as displayed on the listing page.

    Returns:
        The lower-cased name with every character that is not a word
        character, whitespace or hyphen removed, runs of whitespace and
        underscores turned into a single hyphen, repeated hyphens collapsed,
        and leading/trailing hyphens stripped.
    """
    slug = name.lower()
    # Punctuation is deleted outright (not replaced by a space), so
    # "Women's" becomes "womens".
    slug = re.sub(r'[^\w\s-]', '', slug)
    slug = re.sub(r'[\s_]+', '-', slug)
    slug = re.sub(r'-+', '-', slug)
    return slug.strip('-')


async def _print_structure_debug(page):
    """Print the page's buttons and an HTML excerpt when no programs were found."""
    print("未找到项目调试HTML结构...")
    html_debug = await page.evaluate(_STRUCTURE_DEBUG_JS)
    print(f"找到 {len(html_debug['allButtons'])} 个按钮:")
    for btn in html_debug['allButtons'][:20]:
        print(f" - {btn['text']} | class: {btn['class']}")
    print(f"\nHTML片段:\n{html_debug['sampleHTML'][:1500]}")


async def _print_pagination_debug(page):
    """Save a full-page screenshot and print every pagination-like element."""
    await page.screenshot(path="harvard_debug_pagination.png", full_page=True)
    print("已保存调试截图: harvard_debug_pagination.png")
    pagination_info = await page.evaluate(_PAGINATION_DEBUG_JS)
    print(f"\n分页相关链接 ({len(pagination_info['links'])} 个):")
    for link in pagination_info['links']:
        print(f" a: '{link['text']}' class='{link.get('className', '')}' (visible: {link['visible']})")
    print(f"\n分页相关按钮 ({len(pagination_info['buttons'])} 个):")
    for btn in pagination_info['buttons']:
        print(f" button: '{btn['text']}' class='{btn.get('className', '')}' (visible: {btn['visible']})")
    print(f"\n页码元素 ({len(pagination_info['pageNumbers'])} 个):")
    for pn in pagination_info['pageNumbers']:
        print(f" {pn['tag']}: '{pn['text']}' aria-label='{pn.get('ariaLabel')}' visible={pn['visible']}")
    print(f"\n页面下半部分可点击元素 ({len(pagination_info['allClickable'])} 个):")
    for el in pagination_info['allClickable'][:30]:
        print(f" {el['tag']}: '{el['text']}' (top: {el['top']})")
    print(f"\n页面中的分页文本 ({len(pagination_info['allText'])} 个):")
    for txt in pagination_info['allText'][:20]:
        print(f" '{txt}'")


async def _click_next_via_css(page):
    """Strategy 1: click the button matching the dedicated "next" CSS class."""
    next_btn = page.locator('button.c-pagination__link--next')
    if await next_btn.count() == 0:
        return False
    print("\n找到 'Next page' 按钮 (CSS选择器),尝试点击...")
    await next_btn.first.scroll_into_view_if_needed()
    await next_btn.first.click()
    return True


async def _click_next_via_role(page):
    """Strategy 2: click the button whose accessible name is "Next page"."""
    next_btn = page.get_by_role("button", name="Next page")
    if await next_btn.count() == 0:
        return False
    print("\n通过role找到 'Next page' 按钮,尝试点击...")
    await next_btn.first.scroll_into_view_if_needed()
    await next_btn.first.click()
    return True


async def _click_next_among_pagination_buttons(page):
    """Strategy 3: scan the pagination buttons for one labelled "Next page"."""
    for btn in await page.query_selector_all('button.c-pagination__link'):
        label = await btn.inner_text()
        if 'Next page' in label:
            print("\n通过遍历分页按钮找到 'Next page',点击...")
            await btn.scroll_into_view_if_needed()
            await btn.click()
            return True
    return False


async def _click_next_via_javascript(page):
    """Strategy 4: trigger the click from inside the page with JavaScript."""
    if await page.evaluate(_CLICK_NEXT_JS):
        print("\n通过JavaScript点击 'Next page' 成功")
        return True
    return False


async def _click_next_among_all_buttons(page):
    """Strategy 5: scan every button for a visible one labelled "Next page"."""
    for btn in await page.query_selector_all('button'):
        try:
            label = await btn.inner_text()
            if 'Next page' not in label:
                continue
            if not await btn.is_visible():
                continue
            print("\n遍历所有按钮找到 'Next page',点击...")
            await btn.scroll_into_view_if_needed()
            await btn.click()
            return True
        except Exception:
            # A single unusable button must not abort the scan. `Exception`
            # (not a bare except) lets task cancellation and Ctrl-C through.
            continue
    return False


# Tried in this order; the position (1-based) appears in the failure message.
_NEXT_PAGE_STRATEGIES = (
    _click_next_via_css,
    _click_next_via_role,
    _click_next_among_pagination_buttons,
    _click_next_via_javascript,
    _click_next_among_all_buttons,
)


async def _click_next_page(page):
    """Advance to the next results page using the first strategy that works.

    Returns:
        True once a strategy clicked a "Next page" control (after a fixed
        3 second wait for the new content), False if none of them did.
    """
    for number, strategy in enumerate(_NEXT_PAGE_STRATEGIES, start=1):
        try:
            if await strategy(page):
                await page.wait_for_timeout(3000)
                return True
        except Exception as e:
            # Best effort: report the failure and fall through to the next strategy.
            print(f"方法{number}失败: {e}")
    return False


async def scrape_harvard_programs(max_pages=15):
    """Scrape the Harvard graduate program listing by clicking through its pages.

    Opens the listing in headless Chromium, extracts the programs on each
    page, and clicks the "Next page" button until it can no longer be found
    or ``max_pages`` pages have been read. Programs are de-duplicated by
    name, given a URL derived from their name, sorted by name, written to
    ``harvard_programs_results.json`` in the current directory and printed.
    On the first page a debug screenshot ``harvard_debug_pagination.png`` is
    also written.

    Args:
        max_pages: Upper bound on the number of listing pages to read.

    Returns:
        dict with keys ``source_url``, ``scraped_at`` (UTC ISO-8601 string),
        ``total_pages_scraped``, ``total_programs`` and ``programs`` (list of
        dicts with ``name``, ``degrees``, ``url`` and ``page``).
    """
    all_programs = []
    seen_names = set()  # O(1) duplicate check instead of rescanning the list
    base_url = "https://www.harvard.edu/programs/?degree_levels=graduate"
    current_page = 1
    # Counted separately from current_page: when the loop stops on the
    # max_pages limit, current_page has already moved one past the last page
    # that was actually read.
    pages_scraped = 0

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                viewport={'width': 1920, 'height': 1080}
            )
            page = await context.new_page()
            print(f"正在访问: {base_url}")
            # domcontentloaded rather than networkidle, followed by a fixed
            # wait for the page content to load.
            await page.goto(base_url, wait_until="domcontentloaded", timeout=60000)
            await page.wait_for_timeout(5000)
            # Scroll to the bottom so the pagination controls get loaded.
            print("滚动到页面底部...")
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await page.wait_for_timeout(2000)

            while current_page <= max_pages:
                print(f"\n========== 第 {current_page} 页 ==========")
                await page.wait_for_timeout(2000)
                page_data = await page.evaluate(_EXTRACT_PROGRAMS_JS)
                pages_scraped += 1
                found = page_data['programs']

                if current_page == 1 and not found:
                    await _print_structure_debug(page)

                print(f" 本页找到 {len(found)} 个项目")
                for prog in found:
                    print(f" - {prog['name']} ({prog['degrees']})")

                # Keep only the first occurrence of each program name.
                for prog in found:
                    name = prog['name'].strip()
                    if name and name not in seen_names:
                        seen_names.add(name)
                        all_programs.append({
                            'name': name,
                            'degrees': prog.get('degrees', ''),
                            'url': prog.get('url', ''),
                            'page': current_page
                        })

                try:
                    if current_page == 1:
                        await _print_pagination_debug(page)
                    if not await _click_next_page(page):
                        print("没有找到下一页按钮,结束爬取")
                        break
                except Exception as e:
                    print(f"点击下一页时出错: {e}")
                    break
                current_page += 1
        finally:
            # Close the browser even when navigation or extraction raises.
            await browser.close()

    # Program pages are assumed to live at
    # https://www.harvard.edu/programs/{slug}/.
    # NOTE: this replaces any URL captured during extraction.
    print("\n正在生成项目URL...")
    for prog in all_programs:
        slug = name_to_slug(prog['name'])
        prog['url'] = f"https://www.harvard.edu/programs/{slug}/"
        print(f" {prog['name']} -> {prog['url']}")

    programs = sorted(all_programs, key=lambda x: x['name'])
    result = {
        'source_url': base_url,
        'scraped_at': datetime.now(timezone.utc).isoformat(),
        'total_pages_scraped': pages_scraped,
        'total_programs': len(programs),
        'programs': programs
    }
    output_file = Path('harvard_programs_results.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n{'='*60}")
    print("爬取完成!")
    print(f"共爬取 {pages_scraped}")
    print(f"共找到 {len(programs)} 个研究生项目")
    print(f"结果保存到: {output_file}")
    print(f"{'='*60}")
    # Print the complete, sorted list.
    print("\n研究生项目完整列表:")
    for i, prog in enumerate(programs, 1):
        print(f"{i:3}. {prog['name']} - {prog['degrees']}")
    return result
if __name__ == "__main__":
    # Executed as a script: run the scraper coroutine to completion.
    asyncio.run(scrape_harvard_programs())