Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaoyu-crypto
2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions


@@ -0,0 +1,466 @@
#!/usr/bin/env python3
"""
Harvard Graduate Programs Scraper
Scrapes every graduate program listed at https://www.harvard.edu/programs/?degree_levels=graduate
Walks through all result pages by clicking the pagination buttons
"""
import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright
async def scrape_harvard_programs():
"""爬取Harvard研究生项目列表页面 - 通过点击分页按钮"""
all_programs = []
base_url = "https://www.harvard.edu/programs/?degree_levels=graduate"
async with async_playwright() as p:
# Launch the browser in headless mode
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
viewport={'width': 1920, 'height': 1080}
)
page = await context.new_page()
print(f"正在访问: {base_url}")
# 使用 domcontentloaded 而非 networkidle更快加载
await page.goto(base_url, wait_until="domcontentloaded", timeout=60000)
# 等待页面内容加载
await page.wait_for_timeout(5000)
# 滚动到页面底部以确保分页按钮加载
print("滚动到页面底部...")
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(2000)
current_page = 1
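# Safety cap: stop paginating after max_pages even if a "Next page" button is still present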
max_pages = 15
while current_page <= max_pages:
print(f"\n========== 第 {current_page} 页 ==========")
# 等待内容加载
await page.wait_for_timeout(2000)
# 提取当前页面的项目
# 从调试输出得知项目按钮的class是 'records__record___PbPhG c-programs-item__title-link'
# 需要点击按钮来获取URL因为Harvard使用JavaScript导航
# 首先获取所有项目按钮信息
page_data = await page.evaluate('''() => {
const programs = [];
// Find every program row/container
const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]');
programItems.forEach((item, index) => {
// Get the program-name button
const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]');
if (!nameBtn) return;
const name = nameBtn.innerText.trim();
if (!name || name.length < 3) return;
// Extract the degree information
let degrees = '';
const allText = item.innerText;
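// Match any known Harvard degree abbreviation (A.B., Ph.D., M.A., S.M., ...) in the row text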
const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g);
if (degreeMatch) {
degrees = degreeMatch.join(', ');
}
// Look for a link - check the various places it might live
let url = '';
// Method 1: look for an <a> tag
const link = item.querySelector('a[href]');
if (link && link.href) {
url = link.href;
}
// Method 2: check data attributes
if (!url) {
const dataUrl = nameBtn.getAttribute('data-url') ||
nameBtn.getAttribute('data-href') ||
item.getAttribute('data-url');
if (dataUrl) url = dataUrl;
}
// Method 3: check the onclick attribute
if (!url) {
const onclick = nameBtn.getAttribute('onclick') || '';
const urlMatch = onclick.match(/['"]([^'"]*\\/programs\\/[^'"]*)['"]/);
if (urlMatch) url = urlMatch[1];
}
programs.push({
name: name,
degrees: degrees,
url: url,
index: index
});
});
// If nothing was found above, fall back to scanning every button
if (programs.length === 0) {
// Find all program buttons
const buttons = document.querySelectorAll('button');
buttons.forEach((btn, index) => {
const className = btn.className || '';
if (className.includes('c-programs-item') || className.includes('title-link')) {
const name = btn.innerText.trim();
if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) {
programs.push({
name: name,
degrees: '',
url: '',
index: index
});
}
}
});
}
return {
programs: programs,
totalFound: programs.length
};
}''')
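# page_data is a dict: {'programs': [{name, degrees, url, index}, ...], 'totalFound': N}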
# If the first page yields no programs, dump the HTML structure for debugging
if current_page == 1 and len(page_data['programs']) == 0:
print("No programs found; dumping the HTML structure for debugging...")
html_debug = await page.evaluate('''() => {
const debug = {
allButtons: [],
allLinks: [],
sampleHTML: ''
};
// Collect all buttons
document.querySelectorAll('button').forEach(btn => {
const text = btn.innerText.trim().substring(0, 50);
if (text && text.length > 3) {
debug.allButtons.push({
text: text,
class: btn.className.substring(0, 80)
});
}
});
// Grab a snippet of the <main> region's HTML
const main = document.querySelector('main') || document.body;
debug.sampleHTML = main.innerHTML.substring(0, 3000);
return debug;
}''')
print(f"找到 {len(html_debug['allButtons'])} 个按钮:")
for btn in html_debug['allButtons'][:20]:
print(f" - {btn['text']} | class: {btn['class']}")
print(f"\nHTML片段:\n{html_debug['sampleHTML'][:1500]}")
print(f" 本页找到 {len(page_data['programs'])} 个项目")
# 打印找到的项目
for prog in page_data['programs']:
print(f" - {prog['name']} ({prog['degrees']})")
# 添加到总列表(去重)
for prog in page_data['programs']:
name = prog['name'].strip()
if name and not any(p['name'] == name for p in all_programs):
all_programs.append({
'name': name,
'degrees': prog.get('degrees', ''),
'url': prog.get('url', ''),
'page': current_page
})
# Try to click the next-page button
try:
clicked = False
# On the first page, print every pagination-related element for debugging
if current_page == 1:
# Save a full-page screenshot for debugging
await page.screenshot(path="harvard_debug_pagination.png", full_page=True)
print("Saved debug screenshot: harvard_debug_pagination.png")
pagination_info = await page.evaluate('''() => {
const result = {
links: [],
buttons: [],
allClickable: [],
pageNumbers: [],
allText: []
};
// Find all links
document.querySelectorAll('a').forEach(a => {
const text = a.innerText.trim();
if (text.match(/^[0-9]+$|Next|page|Prev/i)) {
result.links.push({
text: text.substring(0, 50),
href: a.href,
visible: a.offsetParent !== null,
className: a.className
});
}
});
// Find all buttons
document.querySelectorAll('button').forEach(b => {
const text = b.innerText.trim();
if (text.match(/^[0-9]+$|Next|page|Prev/i) || text.length < 20) {
result.buttons.push({
text: text.substring(0, 50),
visible: b.offsetParent !== null,
className: b.className
});
}
});
// Find all clickable elements containing digits (likely pagination)
document.querySelectorAll('a, button, span[role="button"], div[role="button"], li a, nav a').forEach(el => {
const text = el.innerText.trim();
if (text.match(/^[0-9]$/) || text === 'Next page' || text.includes('Next')) {
result.pageNumbers.push({
tag: el.tagName,
text: text,
className: el.className,
id: el.id,
ariaLabel: el.getAttribute('aria-label'),
visible: el.offsetParent !== null
});
}
});
// Find every clickable element near the bottom of the page
const bodyRect = document.body.getBoundingClientRect();
document.querySelectorAll('*').forEach(el => {
const rect = el.getBoundingClientRect();
const text = el.innerText?.trim() || '';
// Only consider elements in the lower half of the page with short text
if (rect.top > bodyRect.height * 0.5 && text.length > 0 && text.length < 30) {
const style = window.getComputedStyle(el);
if (style.cursor === 'pointer' || el.tagName === 'A' || el.tagName === 'BUTTON') {
result.allClickable.push({
tag: el.tagName,
text: text.substring(0, 30),
top: Math.round(rect.top),
className: el.className?.substring?.(0, 50) || ''
});
}
}
});
// Dump the text at the bottom of the page for debugging
const bodyText = document.body.innerText;
const lines = bodyText.split('\\n').filter(l => l.trim());
// Find lines that are a single digit 1-9 or a pagination label
for (let i = 0; i < lines.length; i++) {
if (lines[i].match(/^[1-9]$|Next page|Previous/)) {
result.allText.push(lines[i]);
}
}
return result;
}''')
print(f"\n分页相关链接 ({len(pagination_info['links'])} 个):")
for link in pagination_info['links']:
print(f" a: '{link['text']}' class='{link.get('className', '')}' (visible: {link['visible']})")
print(f"\n分页相关按钮 ({len(pagination_info['buttons'])} 个):")
for btn in pagination_info['buttons']:
print(f" button: '{btn['text']}' class='{btn.get('className', '')}' (visible: {btn['visible']})")
print(f"\n页码元素 ({len(pagination_info['pageNumbers'])} 个):")
for pn in pagination_info['pageNumbers']:
print(f" {pn['tag']}: '{pn['text']}' aria-label='{pn.get('ariaLabel')}' visible={pn['visible']}")
print(f"\n页面下半部分可点击元素 ({len(pagination_info['allClickable'])} 个):")
for el in pagination_info['allClickable'][:30]:
print(f" {el['tag']}: '{el['text']}' (top: {el['top']})")
print(f"\n页面中的分页文本 ({len(pagination_info['allText'])} 个):")
for txt in pagination_info['allText'][:20]:
print(f" '{txt}'")
# Method 1: find the "Next page" button directly with a CSS selector (most reliable).
# Debug output showed the pagination button is <button class="c-pagination__link c-pagination__link--next">.
next_page_num = str(current_page + 1)
try:
next_btn = page.locator('button.c-pagination__link--next')
if await next_btn.count() > 0:
print(f"\n找到 'Next page' 按钮 (CSS选择器),尝试点击...")
await next_btn.first.scroll_into_view_if_needed()
await next_btn.first.click()
await page.wait_for_timeout(3000)
current_page += 1
clicked = True
except Exception as e:
print(f"方法1失败: {e}")
if clicked:
continue
# Method 2: look the button up with get_by_role
try:
next_btn = page.get_by_role("button", name="Next page")
if await next_btn.count() > 0:
print(f"\n通过role找到 'Next page' 按钮,尝试点击...")
await next_btn.first.scroll_into_view_if_needed()
await next_btn.first.click()
await page.wait_for_timeout(3000)
current_page += 1
clicked = True
except Exception as e:
print(f"方法2失败: {e}")
if clicked:
continue
# Method 3: iterate over all pagination buttons and click "Next page"
try:
pagination_buttons = await page.query_selector_all('button.c-pagination__link')
for btn in pagination_buttons:
text = await btn.inner_text()
if 'Next page' in text:
print(f"\n通过遍历分页按钮找到 'Next page',点击...")
await btn.scroll_into_view_if_needed()
await btn.click()
await page.wait_for_timeout(3000)
current_page += 1
clicked = True
break
except Exception as e:
print(f"方法3失败: {e}")
if clicked:
continue
# Method 4: click the pagination button directly via JavaScript
try:
js_clicked = await page.evaluate('''() => {
// Look for the Next page button
const nextBtn = document.querySelector('button.c-pagination__link--next');
if (nextBtn) {
nextBtn.click();
return true;
}
// Fallback: scan every pagination button
const buttons = document.querySelectorAll('button.c-pagination__link');
for (const btn of buttons) {
if (btn.innerText.includes('Next page')) {
btn.click();
return true;
}
}
return false;
}''')
if js_clicked:
print(f"\n通过JavaScript点击 'Next page' 成功")
await page.wait_for_timeout(3000)
current_page += 1
clicked = True
except Exception as e:
print(f"方法4失败: {e}")
if clicked:
continue
# Method 5: fall back to scanning every button on the page
try:
all_buttons = await page.query_selector_all('button')
for btn in all_buttons:
try:
text = await btn.inner_text()
if 'Next page' in text:
visible = await btn.is_visible()
if visible:
print(f"\n遍历所有按钮找到 'Next page',点击...")
await btn.scroll_into_view_if_needed()
await btn.click()
await page.wait_for_timeout(3000)
current_page += 1
clicked = True
break
except:
continue
except Exception as e:
print(f"方法5失败: {e}")
if clicked:
continue
print("没有找到下一页按钮,结束爬取")
break
except Exception as e:
print(f"点击下一页时出错: {e}")
break
# Generate the program URLs - Harvard's program pages follow the format
# https://www.harvard.edu/programs/{program-name-slug}/
# e.g. african-and-african-american-studies
def name_to_slug(name):
"""将项目名称转换为URL slug"""
# 转小写
slug = name.lower()
# 将特殊字符替换为空格
slug = re.sub(r'[^\w\s-]', '', slug)
# 替换空格为连字符
slug = re.sub(r'[\s_]+', '-', slug)
# 移除多余的连字符
slug = re.sub(r'-+', '-', slug)
# 移除首尾连字符
slug = slug.strip('-')
return slug
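# e.g. "African and African American Studies" -> "african-and-african-american-studies"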
print("\n正在生成项目URL...")
for prog in all_programs:
slug = name_to_slug(prog['name'])
prog['url'] = f"https://www.harvard.edu/programs/{slug}/"
print(f" {prog['name']} -> {prog['url']}")
await browser.close()
# Sort by program name
programs = sorted(all_programs, key=lambda x: x['name'])
# Save the results
result = {
'source_url': base_url,
'scraped_at': datetime.now(timezone.utc).isoformat(),
'total_pages_scraped': current_page,
'total_programs': len(programs),
'programs': programs
}
output_file = Path('harvard_programs_results.json')
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(result, f, ensure_ascii=False, indent=2)
print(f"\n{'='*60}")
print(f"爬取完成!")
print(f"共爬取 {current_page}")
print(f"共找到 {len(programs)} 个研究生项目")
print(f"结果保存到: {output_file}")
print(f"{'='*60}")
# Print the full list
print("\nComplete list of graduate programs:")
for i, prog in enumerate(programs, 1):
print(f"{i:3}. {prog['name']} - {prog['degrees']}")
return result
if __name__ == "__main__":
asyncio.run(scrape_harvard_programs())