Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
466
artifacts/harvard_programs_scraper.py
Normal file
466
artifacts/harvard_programs_scraper.py
Normal file
@ -0,0 +1,466 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Harvard Graduate Programs Scraper
|
||||
专门爬取 https://www.harvard.edu/programs/?degree_levels=graduate 页面的所有研究生项目
|
||||
通过点击分页按钮遍历所有页面
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
|
||||
# JavaScript evaluated in the page to collect the program entries currently
# rendered. Returns {programs: [{name, degrees, url, index}], totalFound}.
# In the degree regex the longer abbreviations (M.A.U.D., M.D.E.) are listed
# before their shorter prefixes (M.A., M.D.): JS alternation takes the first
# alternative that matches, so the short form would otherwise shadow them.
_EXTRACT_PROGRAMS_JS = '''() => {
    const programs = [];

    // Find every program row/container
    const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]');

    programItems.forEach((item, index) => {
        // Program-name button
        const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]');
        if (!nameBtn) return;

        const name = nameBtn.innerText.trim();
        if (!name || name.length < 3) return;

        // Degree abbreviations found anywhere in the item's text
        let degrees = '';
        const allText = item.innerText;
        const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.U\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.E\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.R\\.E\\.|M\\.R\\.P\\.L\\.)/g);
        if (degreeMatch) {
            degrees = degreeMatch.join(', ');
        }

        // Look for a link in the various places it might live
        let url = '';

        // 1: an <a> tag inside the item
        const link = item.querySelector('a[href]');
        if (link && link.href) {
            url = link.href;
        }

        // 2: data attributes
        if (!url) {
            const dataUrl = nameBtn.getAttribute('data-url') ||
                            nameBtn.getAttribute('data-href') ||
                            item.getAttribute('data-url');
            if (dataUrl) url = dataUrl;
        }

        // 3: an inline onclick handler
        if (!url) {
            const onclick = nameBtn.getAttribute('onclick') || '';
            const urlMatch = onclick.match(/['"]([^'"]*\\/programs\\/[^'"]*)['"]/);
            if (urlMatch) url = urlMatch[1];
        }

        programs.push({
            name: name,
            degrees: degrees,
            url: url,
            index: index
        });
    });

    // Fallback when the container scan found nothing: scan every button
    if (programs.length === 0) {
        const buttons = document.querySelectorAll('button');
        buttons.forEach((btn, index) => {
            const className = btn.className || '';
            if (className.includes('c-programs-item') || className.includes('title-link')) {
                const name = btn.innerText.trim();
                if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) {
                    programs.push({
                        name: name,
                        degrees: '',
                        url: '',
                        index: index
                    });
                }
            }
        });
    }

    return {
        programs: programs,
        totalFound: programs.length
    };
}'''

# JavaScript that summarises the buttons on the page and a slice of the main
# region's HTML; used only for diagnostics when no programs were found.
_STRUCTURE_DEBUG_JS = '''() => {
    const debug = {
        allButtons: [],
        allLinks: [],
        sampleHTML: ''
    };

    // Collect every button with some text
    document.querySelectorAll('button').forEach(btn => {
        const text = btn.innerText.trim().substring(0, 50);
        if (text && text.length > 3) {
            debug.allButtons.push({
                text: text,
                class: btn.className.substring(0, 80)
            });
        }
    });

    // HTML fragment of the main region
    const main = document.querySelector('main') || document.body;
    debug.sampleHTML = main.innerHTML.substring(0, 3000);

    return debug;
}'''

# JavaScript that gathers pagination-looking links, buttons and clickable
# elements; used only for diagnostics on the first page.
_PAGINATION_DEBUG_JS = '''() => {
    const result = {
        links: [],
        buttons: [],
        allClickable: [],
        pageNumbers: [],
        allText: []
    };

    // All links that look pagination related
    document.querySelectorAll('a').forEach(a => {
        const text = a.innerText.trim();
        if (text.match(/^[0-9]+$|Next|page|Prev/i)) {
            result.links.push({
                text: text.substring(0, 50),
                href: a.href,
                visible: a.offsetParent !== null,
                className: a.className
            });
        }
    });

    // All buttons that look pagination related (or are simply short)
    document.querySelectorAll('button').forEach(b => {
        const text = b.innerText.trim();
        if (text.match(/^[0-9]+$|Next|page|Prev/i) || text.length < 20) {
            result.buttons.push({
                text: text.substring(0, 50),
                visible: b.offsetParent !== null,
                className: b.className
            });
        }
    });

    // Clickable elements whose text is a digit or mentions "Next"
    document.querySelectorAll('a, button, span[role="button"], div[role="button"], li a, nav a').forEach(el => {
        const text = el.innerText.trim();
        if (text.match(/^[0-9]$/) || text === 'Next page' || text.includes('Next')) {
            result.pageNumbers.push({
                tag: el.tagName,
                text: text,
                className: el.className,
                id: el.id,
                ariaLabel: el.getAttribute('aria-label'),
                visible: el.offsetParent !== null
            });
        }
    });

    // Short-text clickable elements in the lower half of the page
    const bodyRect = document.body.getBoundingClientRect();
    document.querySelectorAll('*').forEach(el => {
        const rect = el.getBoundingClientRect();
        const text = el.innerText?.trim() || '';
        if (rect.top > bodyRect.height * 0.5 && text.length > 0 && text.length < 30) {
            const style = window.getComputedStyle(el);
            if (style.cursor === 'pointer' || el.tagName === 'A' || el.tagName === 'BUTTON') {
                result.allClickable.push({
                    tag: el.tagName,
                    text: text.substring(0, 30),
                    top: Math.round(rect.top),
                    className: el.className?.substring?.(0, 50) || ''
                });
            }
        }
    });

    // Body text lines that are a single digit or a pagination label
    const bodyText = document.body.innerText;
    const lines = bodyText.split('\\n').filter(l => l.trim());
    for (let i = 0; i < lines.length; i++) {
        if (lines[i].match(/^[1-9]$|Next page|Previous/)) {
            result.allText.push(lines[i]);
        }
    }

    return result;
}'''

# JavaScript fallback that clicks the "Next page" button from inside the page.
# It reports success only for a button that is not disabled, so a disabled
# control on the last page is not mistaken for a successful page change.
_CLICK_NEXT_JS = '''() => {
    const usable = (btn) => btn && !btn.disabled && btn.getAttribute('aria-disabled') !== 'true';

    // Dedicated "next" button
    const nextBtn = document.querySelector('button.c-pagination__link--next');
    if (usable(nextBtn)) {
        nextBtn.click();
        return true;
    }
    // Fallback: any pagination button labelled "Next page"
    const buttons = document.querySelectorAll('button.c-pagination__link');
    for (const btn of buttons) {
        if (btn.innerText.includes('Next page') && usable(btn)) {
            btn.click();
            return true;
        }
    }
    return false;
}'''


def _name_to_slug(name):
    """Convert a program name into the slug used to build its URL."""
    slug = name.lower()
    # Drop everything except word characters, whitespace and hyphens.
    slug = re.sub(r'[^\w\s-]', '', slug)
    # Whitespace/underscore runs become a single hyphen.
    slug = re.sub(r'[\s_]+', '-', slug)
    # Collapse repeated hyphens, then trim them from both ends.
    slug = re.sub(r'-+', '-', slug)
    return slug.strip('-')


async def _print_structure_debug(page):
    """Print the page's buttons and an HTML sample to help fix the selectors."""
    print("未找到项目,调试HTML结构...")
    html_debug = await page.evaluate(_STRUCTURE_DEBUG_JS)
    print(f"找到 {len(html_debug['allButtons'])} 个按钮:")
    for btn in html_debug['allButtons'][:20]:
        print(f" - {btn['text']} | class: {btn['class']}")
    print(f"\nHTML片段:\n{html_debug['sampleHTML'][:1500]}")


async def _print_pagination_debug(page):
    """Save a full-page screenshot and print every pagination-looking element."""
    await page.screenshot(path="harvard_debug_pagination.png", full_page=True)
    print("已保存调试截图: harvard_debug_pagination.png")

    pagination_info = await page.evaluate(_PAGINATION_DEBUG_JS)
    print(f"\n分页相关链接 ({len(pagination_info['links'])} 个):")
    for link in pagination_info['links']:
        print(f" a: '{link['text']}' class='{link.get('className', '')}' (visible: {link['visible']})")
    print(f"\n分页相关按钮 ({len(pagination_info['buttons'])} 个):")
    for btn in pagination_info['buttons']:
        print(f" button: '{btn['text']}' class='{btn.get('className', '')}' (visible: {btn['visible']})")
    print(f"\n页码元素 ({len(pagination_info['pageNumbers'])} 个):")
    for pn in pagination_info['pageNumbers']:
        print(f" {pn['tag']}: '{pn['text']}' aria-label='{pn.get('ariaLabel')}' visible={pn['visible']}")
    print(f"\n页面下半部分可点击元素 ({len(pagination_info['allClickable'])} 个):")
    for el in pagination_info['allClickable'][:30]:
        print(f" {el['tag']}: '{el['text']}' (top: {el['top']})")
    print(f"\n页面中的分页文本 ({len(pagination_info['allText'])} 个):")
    for txt in pagination_info['allText'][:20]:
        print(f" '{txt}'")


async def _click_first_enabled(locator, found_message):
    """Click the first element matched by *locator* if one exists and is enabled.

    Returns True when a click was issued, False otherwise.
    """
    if await locator.count() == 0:
        return False
    target = locator.first
    # A disabled button would make click() wait for its timeout and then raise.
    if not await target.is_enabled():
        return False
    print(found_message)
    await target.scroll_into_view_if_needed()
    await target.click()
    return True


async def _click_next_page(page):
    """Try each known way of finding the "Next page" button and click it.

    The strategies run in order and the first one that clicks an enabled
    button wins. A strategy that raises is reported and skipped.

    Returns True when a click was issued, False when no usable button exists.
    """
    # Strategy 1: the dedicated CSS class of the "next" button.
    try:
        if await _click_first_enabled(
            page.locator('button.c-pagination__link--next'),
            "\n找到 'Next page' 按钮 (CSS选择器),尝试点击...",
        ):
            return True
    except Exception as e:
        print(f"方法1失败: {e}")

    # Strategy 2: accessible role and name.
    try:
        if await _click_first_enabled(
            page.get_by_role("button", name="Next page"),
            "\n通过role找到 'Next page' 按钮,尝试点击...",
        ):
            return True
    except Exception as e:
        print(f"方法2失败: {e}")

    # Strategy 3: walk the pagination buttons and match on their text.
    try:
        for btn in await page.query_selector_all('button.c-pagination__link'):
            if 'Next page' in await btn.inner_text() and await btn.is_enabled():
                print("\n通过遍历分页按钮找到 'Next page',点击...")
                await btn.scroll_into_view_if_needed()
                await btn.click()
                return True
    except Exception as e:
        print(f"方法3失败: {e}")

    # Strategy 4: click from inside the page with JavaScript.
    try:
        if await page.evaluate(_CLICK_NEXT_JS):
            print("\n通过JavaScript点击 'Next page' 成功")
            return True
    except Exception as e:
        print(f"方法4失败: {e}")

    # Strategy 5: walk every button on the page.
    try:
        for btn in await page.query_selector_all('button'):
            try:
                if 'Next page' not in await btn.inner_text():
                    continue
                if await btn.is_visible() and await btn.is_enabled():
                    print("\n遍历所有按钮找到 'Next page',点击...")
                    await btn.scroll_into_view_if_needed()
                    await btn.click()
                    return True
            except Exception:
                # Best effort: a single button that cannot be inspected or
                # clicked should not stop the scan of the remaining ones.
                continue
    except Exception as e:
        print(f"方法5失败: {e}")

    return False


async def scrape_harvard_programs(max_pages: int = 15, debug: bool = True):
    """Scrape Harvard's graduate program listing by clicking through its pages.

    Opens the listing in headless Chromium, extracts the programs rendered on
    each page, clicks "Next page" until no usable button remains or
    ``max_pages`` pages were read, then writes the de-duplicated, name-sorted
    result to ``harvard_programs_results.json`` in the working directory.

    Args:
        max_pages: Upper bound on the number of listing pages to read.
        debug: When true, print selector diagnostics on the first page and
            save ``harvard_debug_pagination.png``.

    Returns:
        A dict with the keys ``source_url``, ``scraped_at``,
        ``total_pages_scraped``, ``total_programs`` and ``programs``; each
        program is a dict with ``name``, ``degrees``, ``url`` and ``page``.
    """
    base_url = "https://www.harvard.edu/programs/?degree_levels=graduate"
    all_programs = []
    seen_names = set()  # names already collected, for O(1) de-duplication
    pages_scraped = 0   # pages actually read; current_page may run one ahead

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                viewport={'width': 1920, 'height': 1080},
            )
            page = await context.new_page()

            print(f"正在访问: {base_url}")
            # domcontentloaded instead of networkidle: returns sooner; the
            # fixed wait below gives the client-side rendering time to finish.
            await page.goto(base_url, wait_until="domcontentloaded", timeout=60000)
            await page.wait_for_timeout(5000)

            # Scroll to the bottom so the pagination controls get rendered.
            print("滚动到页面底部...")
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await page.wait_for_timeout(2000)

            current_page = 1
            while current_page <= max_pages:
                print(f"\n========== 第 {current_page} 页 ==========")

                # Give the listing time to render.
                await page.wait_for_timeout(2000)

                page_data = await page.evaluate(_EXTRACT_PROGRAMS_JS)
                found = page_data['programs']
                pages_scraped += 1

                if debug and current_page == 1 and not found:
                    await _print_structure_debug(page)

                print(f" 本页找到 {len(found)} 个项目")
                for prog in found:
                    print(f" - {prog['name']} ({prog['degrees']})")

                # Merge into the overall list, skipping names already seen.
                for prog in found:
                    name = prog['name'].strip()
                    if name and name not in seen_names:
                        seen_names.add(name)
                        all_programs.append({
                            'name': name,
                            'degrees': prog.get('degrees', ''),
                            'url': prog.get('url', ''),
                            'page': current_page,
                        })

                try:
                    if debug and current_page == 1:
                        await _print_pagination_debug(page)
                    clicked = await _click_next_page(page)
                except Exception as e:
                    print(f"点击下一页时出错: {e}")
                    break

                if not clicked:
                    print("没有找到下一页按钮,结束爬取")
                    break

                # Let the next page render before reading it.
                await page.wait_for_timeout(3000)
                current_page += 1
        finally:
            # Close the browser even when navigation or extraction failed.
            await browser.close()

    # Program URLs are generated as https://www.harvard.edu/programs/{slug}/,
    # e.g. african-and-african-american-studies. As before, the generated URL
    # replaces whatever URL the page extraction found.
    print("\n正在生成项目URL...")
    for prog in all_programs:
        slug = _name_to_slug(prog['name'])
        prog['url'] = f"https://www.harvard.edu/programs/{slug}/"
        print(f" {prog['name']} -> {prog['url']}")

    programs = sorted(all_programs, key=lambda x: x['name'])

    result = {
        'source_url': base_url,
        'scraped_at': datetime.now(timezone.utc).isoformat(),
        'total_pages_scraped': pages_scraped,
        'total_programs': len(programs),
        'programs': programs,
    }

    output_file = Path('harvard_programs_results.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n{'='*60}")
    print("爬取完成!")
    print(f"共爬取 {pages_scraped} 页")
    print(f"共找到 {len(programs)} 个研究生项目")
    print(f"结果保存到: {output_file}")
    print(f"{'='*60}")

    # Print the complete list.
    print("\n研究生项目完整列表:")
    for i, prog in enumerate(programs, 1):
        print(f"{i:3}. {prog['name']} - {prog['degrees']}")

    return result
|
||||
|
||||
|
||||
# Script entry point: run the scraper once when the file is executed directly.
if __name__ == "__main__":
    asyncio.run(scrape_harvard_programs())
|
||||
Reference in New Issue
Block a user