Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: yangxiaoyu-crypto
Date: 2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions


@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""
Debug the Computer Science Faculty page
"""
import asyncio
from playwright.async_api import async_playwright
async def debug_cs():
async with async_playwright() as p:
browser = await p.chromium.launch(headless=False)
page = await browser.new_page()
# Visit the Computer Science GSAS page
gsas_url = "https://gsas.harvard.edu/program/computer-science"
print(f"Visiting: {gsas_url}")
await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(3000)
await page.screenshot(path="cs_gsas_page.png", full_page=True)
print("截图已保存: cs_gsas_page.png")
# 查找所有链接
links = await page.evaluate('''() => {
const links = [];
document.querySelectorAll('a[href]').forEach(a => {
const text = a.innerText.trim();
const href = a.href;
if (text && text.length > 2 && text.length < 100) {
links.push({text: text, href: href});
}
});
return links;
}''')
print(f"\n页面上的所有链接 ({len(links)} 个):")
for link in links:
print(f" - {link['text'][:60]} -> {link['href']}")
# Look for likely Faculty or People links
print("\n\nSearching for Faculty/People related links:")
for link in links:
text_lower = link['text'].lower()
href_lower = link['href'].lower()
if 'faculty' in text_lower or 'people' in href_lower or 'faculty' in href_lower or 'website' in text_lower:
print(f" * {link['text']} -> {link['href']}")
# Try SEAS (School of Engineering)
print("\n\nTrying the SEAS Computer Science page...")
seas_url = "https://seas.harvard.edu/computer-science"
await page.goto(seas_url, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(2000)
await page.screenshot(path="seas_cs_page.png", full_page=True)
print("截图已保存: seas_cs_page.png")
seas_links = await page.evaluate('''() => {
const links = [];
document.querySelectorAll('a[href]').forEach(a => {
const text = a.innerText.trim();
const href = a.href;
const lowerText = text.toLowerCase();
const lowerHref = href.toLowerCase();
if ((lowerText.includes('faculty') || lowerText.includes('people') ||
lowerHref.includes('faculty') || lowerHref.includes('people')) &&
text.length > 2) {
links.push({text: text, href: href});
}
});
return links;
}''')
print(f"\nSEAS页面上的Faculty/People链接:")
for link in seas_links:
print(f" * {link['text']} -> {link['href']}")
await browser.close()
if __name__ == "__main__":
asyncio.run(debug_cs())


@@ -0,0 +1,110 @@
"""
Explore Harvard department People/Faculty page structure to collect the faculty list
"""
import asyncio
from playwright.async_api import async_playwright
async def explore_faculty_page():
async with async_playwright() as p:
browser = await p.chromium.launch(headless=False)
page = await browser.new_page()
# Visit the AAAS department People page
people_url = "https://aaas.fas.harvard.edu/aaas-people"
print(f"Visiting department People page: {people_url}")
await page.goto(people_url, wait_until='networkidle')
await page.wait_for_timeout(3000)
# Save a screenshot
await page.screenshot(path="aaas_people_page.png", full_page=True)
print("Screenshot saved: aaas_people_page.png")
# Collect all faculty member links
faculty_info = await page.evaluate('''() => {
const faculty = [];
// Find all links whose path contains /people/
document.querySelectorAll('a[href*="/people/"]').forEach(a => {
const href = a.href || '';
const text = a.innerText.trim();
// Filter out navigation links and keep only personal profile links
if (href.includes('/people/') && text.length > 3 &&
!text.toLowerCase().includes('people') &&
!href.endsWith('/people/') &&
!href.endsWith('/aaas-people')) {
faculty.push({
name: text,
url: href
});
}
});
return faculty;
}''')
print(f"\n找到 {len(faculty_info)} 个教职员工:")
for f in faculty_info:
print(f" - {f['name']} -> {f['url']}")
# Try the Economics department Faculty page
print("\n\n========== Trying the Economics department Faculty page ==========")
econ_faculty_url = "http://economics.harvard.edu/people/people-type/faculty"
print(f"Visiting: {econ_faculty_url}")
await page.goto(econ_faculty_url, wait_until='networkidle')
await page.wait_for_timeout(3000)
await page.screenshot(path="econ_faculty_page.png", full_page=True)
print("已保存截图: econ_faculty_page.png")
econ_faculty = await page.evaluate('''() => {
const faculty = [];
// Find all possible faculty links
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href || '';
const text = a.innerText.trim();
const lowerHref = href.toLowerCase();
// Look for personal profile links
if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
lowerHref.includes('/profile/')) &&
text.length > 3 && text.length < 100 &&
!text.toLowerCase().includes('faculty') &&
!text.toLowerCase().includes('people')) {
faculty.push({
name: text,
url: href
});
}
});
return faculty;
}''')
print(f"\n找到 {len(econ_faculty)} 个教职员工:")
for f in econ_faculty[:30]:
print(f" - {f['name']} -> {f['url']}")
# Dump all links on the page for debugging
print("\n\nAll links on the page:")
all_links = await page.evaluate('''() => {
const links = [];
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href || '';
const text = a.innerText.trim();
if (text && text.length > 2 && text.length < 100) {
links.push({text: text, href: href});
}
});
return links;
}''')
for link in all_links[:40]:
print(f" - {link['text'][:50]} -> {link['href']}")
await browser.close()
if __name__ == "__main__":
asyncio.run(explore_faculty_page())


@@ -0,0 +1,173 @@
"""
Explore the structure of the University of Manchester master's course pages
"""
import asyncio
import json
from playwright.async_api import async_playwright
async def explore_manchester():
"""探索曼彻斯特大学网站结构"""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=False)
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
)
page = await context.new_page()
# Go straight to the master's courses A-Z list page
print("Visiting the master's courses A-Z list page...")
await page.goto("https://www.manchester.ac.uk/study/masters/courses/list/",
wait_until="domcontentloaded", timeout=60000)
await page.wait_for_timeout(5000)
# Screenshot
await page.screenshot(path="manchester_masters_page.png", full_page=False)
print("Screenshot saved: manchester_masters_page.png")
# Analyze the page structure
page_info = await page.evaluate("""() => {
const info = {
title: document.title,
url: window.location.href,
all_links: [],
course_candidates: [],
page_sections: []
};
// Collect all links
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href;
const text = a.innerText.trim().substring(0, 100);
if (href && text) {
info.all_links.push({href, text});
}
});
// Find likely course links - containing /course/ or list-item
document.querySelectorAll('a[href*="/course/"], .course-link, [class*="course"] a, .search-result a, .list-item a').forEach(a => {
info.course_candidates.push({
href: a.href,
text: a.innerText.trim().substring(0, 100),
classes: a.className,
parent_classes: a.parentElement?.className || ''
});
});
// Collect the page's main sections
document.querySelectorAll('main, [role="main"], .content, #content, .results, .course-list').forEach(el => {
info.page_sections.push({
tag: el.tagName,
id: el.id,
classes: el.className,
children_count: el.children.length
});
});
return info;
}""")
print(f"\n页面标题: {page_info['title']}")
print(f"当前URL: {page_info['url']}")
print(f"\n总链接数: {len(page_info['all_links'])}")
print(f"课程候选链接数: {len(page_info['course_candidates'])}")
# 查找包含 masters/courses/ 的链接
masters_links = [l for l in page_info['all_links']
if 'masters/courses/' in l['href'].lower()
and l['href'] != page_info['url']]
print(f"\n硕士课程相关链接 ({len(masters_links)}):")
for link in masters_links[:20]:
print(f" - {link['text'][:50]}: {link['href']}")
print(f"\n课程候选详情:")
for c in page_info['course_candidates'][:10]:
print(f" - {c['text'][:50]}")
print(f" URL: {c['href']}")
print(f" Classes: {c['classes']}")
# Check for search/filter functionality
search_elements = await page.evaluate("""() => {
const elements = [];
document.querySelectorAll('input[type="search"], input[type="text"], select, .filter, .search').forEach(el => {
elements.push({
tag: el.tagName,
type: el.type || '',
id: el.id,
name: el.name || '',
classes: el.className
});
});
return elements;
}""")
print(f"\n搜索/筛选元素: {len(search_elements)}")
for el in search_elements[:5]:
print(f" - {el}")
# Try to identify the actual structure of the course list
print("\n\nAnalyzing the course list structure on the page...")
list_structures = await page.evaluate("""() => {
const structures = [];
// Try various likely list structures
const selectors = [
'ul li a[href*="course"]',
'div[class*="result"] a',
'div[class*="course"] a',
'article a[href]',
'.search-results a',
'[data-course] a',
'table tr td a'
];
for (const selector of selectors) {
const elements = document.querySelectorAll(selector);
if (elements.length > 0) {
const samples = [];
elements.forEach((el, i) => {
if (i < 5) {
samples.push({
href: el.href,
text: el.innerText.trim().substring(0, 80)
});
}
});
structures.push({
selector: selector,
count: elements.length,
samples: samples
});
}
}
return structures;
}""")
print("\n找到的列表结构:")
for s in list_structures:
print(f"\n 选择器: {s['selector']} (共 {s['count']} 个)")
for sample in s['samples']:
print(f" - {sample['text']}: {sample['href']}")
# Save the full analysis results
with open("manchester_analysis.json", "w", encoding="utf-8") as f:
json.dump(page_info, f, indent=2, ensure_ascii=False)
print("\n\nFull analysis saved to manchester_analysis.json")
# Give the user time to inspect the open browser
print("\nPress Ctrl+C to close the browser...")
try:
await asyncio.sleep(30)
except:
pass
await browser.close()
if __name__ == "__main__":
asyncio.run(explore_manchester())


@@ -0,0 +1,226 @@
"""
Explore Harvard program page structure to find faculty information
"""
import asyncio
from playwright.async_api import async_playwright
async def explore_program_page():
async with async_playwright() as p:
browser = await p.chromium.launch(headless=False)
page = await browser.new_page()
# Visit the graduate school program page (GSAS)
gsas_url = "https://gsas.harvard.edu/program/african-and-african-american-studies"
print(f"Visiting the GSAS program page: {gsas_url}")
await page.goto(gsas_url, wait_until='networkidle')
await page.wait_for_timeout(3000)
# Save a screenshot
await page.screenshot(path="gsas_program_page.png", full_page=True)
print("Screenshot saved: gsas_program_page.png")
# Analyze the page structure
page_info = await page.evaluate('''() => {
const info = {
title: document.title,
h1: document.querySelector('h1')?.innerText || '',
allHeadings: [],
facultyLinks: [],
peopleLinks: [],
allLinks: []
};
// Collect all headings
document.querySelectorAll('h1, h2, h3, h4').forEach(h => {
info.allHeadings.push({
tag: h.tagName,
text: h.innerText.trim().substring(0, 100)
});
});
// Find all links
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href || '';
const text = a.innerText.trim();
// Check whether the link is faculty-related
const lowerHref = href.toLowerCase();
const lowerText = text.toLowerCase();
if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
lowerHref.includes('professor') || lowerHref.includes('staff') ||
lowerText.includes('faculty') || lowerText.includes('people')) {
info.facultyLinks.push({
text: text.substring(0, 100),
href: href
});
}
// Check whether it is a personal profile link
if (href.includes('/people/') || href.includes('/faculty/') ||
href.includes('/profile/') || href.includes('/person/')) {
info.peopleLinks.push({
text: text.substring(0, 100),
href: href
});
}
// Keep all main links
if (href && text.length > 2 && text.length < 150) {
info.allLinks.push({
text: text,
href: href
});
}
});
return info;
}''')
print(f"\n页面标题: {page_info['title']}")
print(f"H1: {page_info['h1']}")
print(f"\n所有标题 ({len(page_info['allHeadings'])}):")
for h in page_info['allHeadings']:
print(f" <{h['tag']}>: {h['text']}")
print(f"\n教职员工相关链接 ({len(page_info['facultyLinks'])}):")
for f in page_info['facultyLinks']:
print(f" - {f['text']} -> {f['href']}")
print(f"\n个人页面链接 ({len(page_info['peopleLinks'])}):")
for p in page_info['peopleLinks']:
print(f" - {p['text']} -> {p['href']}")
print(f"\n所有链接 ({len(page_info['allLinks'])}):")
for link in page_info['allLinks'][:50]:
print(f" - {link['text'][:60]} -> {link['href']}")
# Try another program page to see whether the structure differs
print("\n\n========== Trying another program page ==========")
economics_url = "https://gsas.harvard.edu/program/economics"
print(f"Visiting: {economics_url}")
await page.goto(economics_url, wait_until='networkidle')
await page.wait_for_timeout(3000)
# Save a screenshot
await page.screenshot(path="gsas_economics_page.png", full_page=True)
print("Screenshot saved: gsas_economics_page.png")
# Analyze
econ_info = await page.evaluate('''() => {
const info = {
title: document.title,
facultyLinks: [],
peopleLinks: []
};
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href || '';
const text = a.innerText.trim();
const lowerHref = href.toLowerCase();
const lowerText = text.toLowerCase();
if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
lowerText.includes('faculty') || lowerText.includes('people')) {
info.facultyLinks.push({
text: text.substring(0, 100),
href: href
});
}
if (href.includes('/people/') || href.includes('/faculty/') ||
href.includes('/profile/') || href.includes('/person/')) {
info.peopleLinks.push({
text: text.substring(0, 100),
href: href
});
}
});
return info;
}''')
print(f"\n教职员工相关链接 ({len(econ_info['facultyLinks'])}):")
for f in econ_info['facultyLinks']:
print(f" - {f['text']} -> {f['href']}")
print(f"\n个人页面链接 ({len(econ_info['peopleLinks'])}):")
for p in econ_info['peopleLinks']:
print(f" - {p['text']} -> {p['href']}")
# Visit the department homepage to check for a Faculty page
print("\n\n========== Trying the department homepage ==========")
dept_url = "https://aaas.fas.harvard.edu/"
print(f"Visiting department homepage: {dept_url}")
await page.goto(dept_url, wait_until='networkidle')
await page.wait_for_timeout(3000)
await page.screenshot(path="aaas_dept_page.png", full_page=True)
print("已保存截图: aaas_dept_page.png")
dept_info = await page.evaluate('''() => {
const info = {
title: document.title,
navLinks: [],
facultyLinks: [],
peopleLinks: []
};
// Collect navigation links
document.querySelectorAll('nav a, [class*="nav"] a, [class*="menu"] a').forEach(a => {
const href = a.href || '';
const text = a.innerText.trim();
if (text && text.length > 1 && text.length < 50) {
info.navLinks.push({
text: text,
href: href
});
}
});
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href || '';
const text = a.innerText.trim();
const lowerHref = href.toLowerCase();
const lowerText = text.toLowerCase();
if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
lowerText.includes('faculty') || lowerText.includes('people')) {
info.facultyLinks.push({
text: text.substring(0, 100),
href: href
});
}
if (href.includes('/people/') || href.includes('/faculty/') ||
href.includes('/profile/')) {
info.peopleLinks.push({
text: text.substring(0, 100),
href: href
});
}
});
return info;
}''')
print(f"\n导航链接 ({len(dept_info['navLinks'])}):")
for link in dept_info['navLinks'][:20]:
print(f" - {link['text']} -> {link['href']}")
print(f"\n教职员工相关链接 ({len(dept_info['facultyLinks'])}):")
for f in dept_info['facultyLinks']:
print(f" - {f['text']} -> {f['href']}")
print(f"\n个人页面链接 ({len(dept_info['peopleLinks'])}):")
for p in dept_info['peopleLinks'][:30]:
print(f" - {p['text']} -> {p['href']}")
await browser.close()
if __name__ == "__main__":
asyncio.run(explore_program_page())


@@ -125,6 +125,7 @@ class ScrapeSettings:
output: Path
verify_links: bool = True
request_delay: float = 1.0 # Polite crawling delay
timeout: int = 60000 # Navigation timeout in ms
async def extract_links(page: Page) -> List[Tuple[str, str]]:
@@ -210,7 +211,7 @@ async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink
page = await context.new_page()
try:
response = await page.goto(
normalized_url, wait_until="domcontentloaded", timeout=20000
normalized_url, wait_until="domcontentloaded", timeout=settings.timeout
)
if not response or response.status >= 400:
await page.close()
@@ -411,6 +412,12 @@ def parse_args() -> argparse.Namespace:
default=1.0,
help="Delay between requests in seconds (polite crawling).",
)
parser.add_argument(
"--timeout",
type=int,
default=60000,
help="Navigation timeout in milliseconds (default: 60000 = 60s).",
)
return parser.parse_args()
@@ -424,6 +431,7 @@ async def main_async() -> None:
output=args.output,
verify_links=not args.no_verify,
request_delay=args.delay,
timeout=args.timeout,
)
links = await crawl(settings, browser_name=args.browser)
serialize(links, settings.output, settings.root_url)


@@ -0,0 +1,466 @@
#!/usr/bin/env python3
"""
Harvard Graduate Programs Scraper
Scrapes all graduate programs from the https://www.harvard.edu/programs/?degree_levels=graduate page
Iterates over all result pages by clicking the pagination buttons
"""
import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright
async def scrape_harvard_programs():
"""爬取Harvard研究生项目列表页面 - 通过点击分页按钮"""
all_programs = []
base_url = "https://www.harvard.edu/programs/?degree_levels=graduate"
async with async_playwright() as p:
# Run in headless mode
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
viewport={'width': 1920, 'height': 1080}
)
page = await context.new_page()
print(f"正在访问: {base_url}")
# 使用 domcontentloaded 而非 networkidle更快加载
await page.goto(base_url, wait_until="domcontentloaded", timeout=60000)
# Wait for the page content to load
await page.wait_for_timeout(5000)
# Scroll to the bottom of the page so the pagination buttons load
print("Scrolling to the bottom of the page...")
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(2000)
current_page = 1
max_pages = 15
while current_page <= max_pages:
print(f"\n========== 第 {current_page} 页 ==========")
# 等待内容加载
await page.wait_for_timeout(2000)
# Extract the programs on the current page
# From the debug output, the program button class is 'records__record___PbPhG c-programs-item__title-link'
# The buttons must be clicked to obtain URLs because Harvard uses JavaScript navigation
# First, collect all program button information
page_data = await page.evaluate('''() => {
const programs = [];
// Find all program rows/containers
const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]');
programItems.forEach((item, index) => {
// Get the program name button
const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]');
if (!nameBtn) return;
const name = nameBtn.innerText.trim();
if (!name || name.length < 3) return;
// Extract degree information
let degrees = '';
const allText = item.innerText;
const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g);
if (degreeMatch) {
degrees = degreeMatch.join(', ');
}
// Find a link - check various possible locations
let url = '';
// Method 1: look for an <a> tag
const link = item.querySelector('a[href]');
if (link && link.href) {
url = link.href;
}
// Method 2: check data attributes
if (!url) {
const dataUrl = nameBtn.getAttribute('data-url') ||
nameBtn.getAttribute('data-href') ||
item.getAttribute('data-url');
if (dataUrl) url = dataUrl;
}
// Method 3: check the onclick attribute
if (!url) {
const onclick = nameBtn.getAttribute('onclick') || '';
const urlMatch = onclick.match(/['"]([^'"]*\\/programs\\/[^'"]*)['"]/);
if (urlMatch) url = urlMatch[1];
}
programs.push({
name: name,
degrees: degrees,
url: url,
index: index
});
});
// If the first approach found no programs, use the fallback approach
if (programs.length === 0) {
// Find all program buttons
const buttons = document.querySelectorAll('button');
buttons.forEach((btn, index) => {
const className = btn.className || '';
if (className.includes('c-programs-item') || className.includes('title-link')) {
const name = btn.innerText.trim();
if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) {
programs.push({
name: name,
degrees: '',
url: '',
index: index
});
}
}
});
}
return {
programs: programs,
totalFound: programs.length
};
}''')
# On the first page, dump the HTML structure for debugging if nothing was found
if current_page == 1 and len(page_data['programs']) == 0:
print("No programs found; dumping HTML structure for debugging...")
html_debug = await page.evaluate('''() => {
const debug = {
allButtons: [],
allLinks: [],
sampleHTML: ''
};
// Collect all buttons
document.querySelectorAll('button').forEach(btn => {
const text = btn.innerText.trim().substring(0, 50);
if (text && text.length > 3) {
debug.allButtons.push({
text: text,
class: btn.className.substring(0, 80)
});
}
});
// Grab an HTML snippet of the main area
const main = document.querySelector('main') || document.body;
debug.sampleHTML = main.innerHTML.substring(0, 3000);
return debug;
}''')
print(f"找到 {len(html_debug['allButtons'])} 个按钮:")
for btn in html_debug['allButtons'][:20]:
print(f" - {btn['text']} | class: {btn['class']}")
print(f"\nHTML片段:\n{html_debug['sampleHTML'][:1500]}")
print(f" 本页找到 {len(page_data['programs'])} 个项目")
# 打印找到的项目
for prog in page_data['programs']:
print(f" - {prog['name']} ({prog['degrees']})")
# Add to the overall list (deduplicated)
for prog in page_data['programs']:
name = prog['name'].strip()
if name and not any(p['name'] == name for p in all_programs):
all_programs.append({
'name': name,
'degrees': prog.get('degrees', ''),
'url': prog.get('url', ''),
'page': current_page
})
# Try to click the next-page button
try:
clicked = False
# First, print all pagination-related elements for debugging
if current_page == 1:
# Save a screenshot for debugging
await page.screenshot(path="harvard_debug_pagination.png", full_page=True)
print("Debug screenshot saved: harvard_debug_pagination.png")
pagination_info = await page.evaluate('''() => {
const result = {
links: [],
buttons: [],
allClickable: [],
pageNumbers: [],
allText: []
};
// Find all links
document.querySelectorAll('a').forEach(a => {
const text = a.innerText.trim();
if (text.match(/^[0-9]+$|Next|page|Prev/i)) {
result.links.push({
text: text.substring(0, 50),
href: a.href,
visible: a.offsetParent !== null,
className: a.className
});
}
});
// Find all buttons
document.querySelectorAll('button').forEach(b => {
const text = b.innerText.trim();
if (text.match(/^[0-9]+$|Next|page|Prev/i) || text.length < 20) {
result.buttons.push({
text: text.substring(0, 50),
visible: b.offsetParent !== null,
className: b.className
});
}
});
// Find all clickable elements containing a digit (likely pagination)
document.querySelectorAll('a, button, span[role="button"], div[role="button"], li a, nav a').forEach(el => {
const text = el.innerText.trim();
if (text.match(/^[0-9]$/) || text === 'Next page' || text.includes('Next')) {
result.pageNumbers.push({
tag: el.tagName,
text: text,
className: el.className,
id: el.id,
ariaLabel: el.getAttribute('aria-label'),
visible: el.offsetParent !== null
});
}
});
// Find all clickable elements in the lower part of the page
const bodyRect = document.body.getBoundingClientRect();
document.querySelectorAll('*').forEach(el => {
const rect = el.getBoundingClientRect();
const text = el.innerText?.trim() || '';
// Only look at elements in the lower half of the page with short text
if (rect.top > bodyRect.height * 0.5 && text.length > 0 && text.length < 30) {
const style = window.getComputedStyle(el);
if (style.cursor === 'pointer' || el.tagName === 'A' || el.tagName === 'BUTTON') {
result.allClickable.push({
tag: el.tagName,
text: text.substring(0, 30),
top: Math.round(rect.top),
className: el.className?.substring?.(0, 50) || ''
});
}
}
});
// Output all text at the bottom of the page for debugging
const bodyText = document.body.innerText;
const lines = bodyText.split('\\n').filter(l => l.trim());
// Find lines containing a single digit 1-9 or pagination text
for (let i = 0; i < lines.length; i++) {
if (lines[i].match(/^[1-9]$|Next page|Previous/)) {
result.allText.push(lines[i]);
}
}
return result;
}''')
print(f"\n分页相关链接 ({len(pagination_info['links'])} 个):")
for link in pagination_info['links']:
print(f" a: '{link['text']}' class='{link.get('className', '')}' (visible: {link['visible']})")
print(f"\n分页相关按钮 ({len(pagination_info['buttons'])} 个):")
for btn in pagination_info['buttons']:
print(f" button: '{btn['text']}' class='{btn.get('className', '')}' (visible: {btn['visible']})")
print(f"\n页码元素 ({len(pagination_info['pageNumbers'])} 个):")
for pn in pagination_info['pageNumbers']:
print(f" {pn['tag']}: '{pn['text']}' aria-label='{pn.get('ariaLabel')}' visible={pn['visible']}")
print(f"\n页面下半部分可点击元素 ({len(pagination_info['allClickable'])} 个):")
for el in pagination_info['allClickable'][:30]:
print(f" {el['tag']}: '{el['text']}' (top: {el['top']})")
print(f"\n页面中的分页文本 ({len(pagination_info['allText'])} 个):")
for txt in pagination_info['allText'][:20]:
print(f" '{txt}'")
# 方法1: 直接使用CSS选择器查找 "Next page" 按钮 (最可靠)
# 从调试输出得知,分页按钮是 <button class="c-pagination__link c-pagination__link--next">
next_page_num = str(current_page + 1)
try:
next_btn = page.locator('button.c-pagination__link--next')
if await next_btn.count() > 0:
print(f"\n找到 'Next page' 按钮 (CSS选择器),尝试点击...")
await next_btn.first.scroll_into_view_if_needed()
await next_btn.first.click()
await page.wait_for_timeout(3000)
current_page += 1
clicked = True
except Exception as e:
print(f"方法1失败: {e}")
if clicked:
continue
# 方法2: 使用 get_by_role 查找按钮
try:
next_btn = page.get_by_role("button", name="Next page")
if await next_btn.count() > 0:
print(f"\n通过role找到 'Next page' 按钮,尝试点击...")
await next_btn.first.scroll_into_view_if_needed()
await next_btn.first.click()
await page.wait_for_timeout(3000)
current_page += 1
clicked = True
except Exception as e:
print(f"方法2失败: {e}")
if clicked:
continue
# 方法3: 查找所有分页按钮并点击 "Next page"
try:
pagination_buttons = await page.query_selector_all('button.c-pagination__link')
for btn in pagination_buttons:
text = await btn.inner_text()
if 'Next page' in text:
print(f"\n通过遍历分页按钮找到 'Next page',点击...")
await btn.scroll_into_view_if_needed()
await btn.click()
await page.wait_for_timeout(3000)
current_page += 1
clicked = True
break
except Exception as e:
print(f"方法3失败: {e}")
if clicked:
continue
# 方法4: 通过JavaScript直接点击分页按钮
try:
js_clicked = await page.evaluate('''() => {
// Find the Next page button
const nextBtn = document.querySelector('button.c-pagination__link--next');
if (nextBtn) {
nextBtn.click();
return true;
}
// Fallback: iterate over all pagination buttons
const buttons = document.querySelectorAll('button.c-pagination__link');
for (const btn of buttons) {
if (btn.innerText.includes('Next page')) {
btn.click();
return true;
}
}
return false;
}''')
if js_clicked:
print(f"\n通过JavaScript点击 'Next page' 成功")
await page.wait_for_timeout(3000)
current_page += 1
clicked = True
except Exception as e:
print(f"方法4失败: {e}")
if clicked:
continue
# 方法5: 遍历所有按钮查找
try:
all_buttons = await page.query_selector_all('button')
for btn in all_buttons:
try:
text = await btn.inner_text()
if 'Next page' in text:
visible = await btn.is_visible()
if visible:
print(f"\n遍历所有按钮找到 'Next page',点击...")
await btn.scroll_into_view_if_needed()
await btn.click()
await page.wait_for_timeout(3000)
current_page += 1
clicked = True
break
except:
continue
except Exception as e:
print(f"方法5失败: {e}")
if clicked:
continue
print("没有找到下一页按钮,结束爬取")
break
except Exception as e:
print(f"点击下一页时出错: {e}")
break
# Generate program URLs - Harvard program URLs follow the format
# https://www.harvard.edu/programs/{program-name-slug}/
# e.g.: african-and-african-american-studies
import re
def name_to_slug(name):
"""将项目名称转换为URL slug"""
# 转小写
slug = name.lower()
# 将特殊字符替换为空格
slug = re.sub(r'[^\w\s-]', '', slug)
# 替换空格为连字符
slug = re.sub(r'[\s_]+', '-', slug)
# 移除多余的连字符
slug = re.sub(r'-+', '-', slug)
# 移除首尾连字符
slug = slug.strip('-')
return slug
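# Illustrative example of the slug conversion (based on the URL format noted above;
# the program name below is just one real entry used for illustration):
#   name_to_slug("African and African American Studies")
#   -> "african-and-african-american-studies"
#   -> https://www.harvard.edu/programs/african-and-african-american-studies/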
print("\n正在生成项目URL...")
for prog in all_programs:
slug = name_to_slug(prog['name'])
prog['url'] = f"https://www.harvard.edu/programs/{slug}/"
print(f" {prog['name']} -> {prog['url']}")
await browser.close()
# Sort by program name
programs = sorted(all_programs, key=lambda x: x['name'])
# Save results
result = {
'source_url': base_url,
'scraped_at': datetime.now(timezone.utc).isoformat(),
'total_pages_scraped': current_page,
'total_programs': len(programs),
'programs': programs
}
output_file = Path('harvard_programs_results.json')
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(result, f, ensure_ascii=False, indent=2)
print(f"\n{'='*60}")
print(f"爬取完成!")
print(f"共爬取 {current_page}")
print(f"共找到 {len(programs)} 个研究生项目")
print(f"结果保存到: {output_file}")
print(f"{'='*60}")
# Print the full list
print("\nFull list of graduate programs:")
for i, prog in enumerate(programs, 1):
print(f"{i:3}. {prog['name']} - {prog['degrees']}")
return result
if __name__ == "__main__":
asyncio.run(scrape_harvard_programs())


@@ -0,0 +1,356 @@
#!/usr/bin/env python3
"""
Harvard Graduate Programs Scraper with Faculty Information
Scrapes all graduate programs from the https://www.harvard.edu/programs/?degree_levels=graduate page
and collects the personal profile page URL of each program's faculty members
"""
import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright
def name_to_slug(name):
"""将项目名称转换为URL slug"""
slug = name.lower()
slug = re.sub(r'[^\w\s-]', '', slug)
slug = re.sub(r'[\s_]+', '-', slug)
slug = re.sub(r'-+', '-', slug)
slug = slug.strip('-')
return slug
async def extract_faculty_from_page(page):
"""从当前页面提取所有教职员工链接"""
faculty_list = await page.evaluate('''() => {
const faculty = [];
const seen = new Set();
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href || '';
const text = a.innerText.trim();
const lowerHref = href.toLowerCase();
const lowerText = text.toLowerCase();
// Check whether this is a personal profile link
if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
lowerHref.includes('/profile/') || lowerHref.includes('/person/')) &&
text.length > 3 && text.length < 100 &&
!lowerText.includes('people') &&
!lowerText.includes('faculty') &&
!lowerText.includes('profile') &&
!lowerText.includes('staff') &&
!lowerHref.endsWith('/people/') &&
!lowerHref.endsWith('/people') &&
!lowerHref.endsWith('/faculty/') &&
!lowerHref.endsWith('/faculty')) {
if (!seen.has(href)) {
seen.add(href);
faculty.push({
name: text,
url: href
});
}
}
});
return faculty;
}''')
return faculty_list
async def get_faculty_from_gsas_page(page, gsas_url, program_name):
"""从GSAS项目页面获取Faculty链接然后访问院系People页面获取导师列表"""
faculty_list = []
faculty_page_url = None
try:
print(f" 访问GSAS页面: {gsas_url}")
await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(2000)
# Strategy 1: look for a "See list of ... faculty" link
faculty_link = await page.evaluate('''() => {
const links = document.querySelectorAll('a[href]');
for (const link of links) {
const text = link.innerText.toLowerCase();
const href = link.href;
if (text.includes('faculty') && text.includes('see list')) {
return href;
}
}
return null;
}''')
# Strategy 2: look for any link containing /people or /faculty
if not faculty_link:
faculty_link = await page.evaluate('''() => {
const links = document.querySelectorAll('a[href]');
for (const link of links) {
const text = link.innerText.toLowerCase();
const href = link.href.toLowerCase();
// Look for faculty-related links
if ((text.includes('faculty') || text.includes('people')) &&
(href.includes('/people') || href.includes('/faculty'))) {
return link.href;
}
}
return null;
}''')
# Strategy 3: find the department website link on the page, then try its People page
if not faculty_link:
dept_website = await page.evaluate('''() => {
const links = document.querySelectorAll('a[href]');
for (const link of links) {
const text = link.innerText.toLowerCase();
const href = link.href;
// Look for a Website link (these usually point to the department homepage)
if (text.includes('website') && href.includes('harvard.edu') &&
!href.includes('gsas.harvard.edu')) {
return href;
}
}
return null;
}''')
if dept_website:
print(f" 找到院系网站: {dept_website}")
try:
await page.goto(dept_website, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(2000)
# Look for a People/Faculty link on the department website
faculty_link = await page.evaluate('''() => {
const links = document.querySelectorAll('a[href]');
for (const link of links) {
const text = link.innerText.toLowerCase().trim();
const href = link.href;
if ((text === 'people' || text === 'faculty' ||
text === 'faculty & research' || text.includes('our faculty')) &&
(href.includes('/people') || href.includes('/faculty'))) {
return href;
}
}
return null;
}''')
except Exception as e:
print(f" 访问院系网站失败: {e}")
if faculty_link:
faculty_page_url = faculty_link
print(f" 找到Faculty页面: {faculty_link}")
# 访问Faculty/People页面
await page.goto(faculty_link, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(2000)
# Extract all faculty information
faculty_list = await extract_faculty_from_page(page)
# If nothing was found on the first pass, allow for pagination or a different layout
await page.wait_for_timeout(2000)
# Some pages may require clicking buttons or waiting for JavaScript-loaded content
await page.wait_for_timeout(2000)
faculty_list = await extract_faculty_from_page(page)
print(f" 找到 {len(faculty_list)} 位导师")
else:
print(f" 未找到Faculty页面链接")
except Exception as e:
print(f" 获取Faculty信息失败: {e}")
return faculty_list, faculty_page_url
async def scrape_harvard_programs_with_faculty():
"""爬取Harvard研究生项目列表及导师信息"""
all_programs = []
base_url = "https://www.harvard.edu/programs/?degree_levels=graduate"
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
viewport={'width': 1920, 'height': 1080}
)
page = await context.new_page()
print(f"正在访问: {base_url}")
await page.goto(base_url, wait_until="domcontentloaded", timeout=60000)
await page.wait_for_timeout(5000)
# Scroll to the bottom of the page
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(2000)
current_page = 1
max_pages = 15
# Phase 1: collect basic information for all programs
print("\n========== Phase 1: collect the program list ==========")
while current_page <= max_pages:
print(f"\n--- 第 {current_page} 页 ---")
await page.wait_for_timeout(2000)
# 提取当前页面的项目
page_data = await page.evaluate('''() => {
const programs = [];
const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]');
programItems.forEach((item, index) => {
const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]');
if (!nameBtn) return;
const name = nameBtn.innerText.trim();
if (!name || name.length < 3) return;
let degrees = '';
const allText = item.innerText;
const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g);
if (degreeMatch) {
degrees = degreeMatch.join(', ');
}
programs.push({
name: name,
degrees: degrees
});
});
if (programs.length === 0) {
const buttons = document.querySelectorAll('button');
buttons.forEach((btn) => {
const className = btn.className || '';
if (className.includes('c-programs-item') || className.includes('title-link')) {
const name = btn.innerText.trim();
if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) {
programs.push({
name: name,
degrees: ''
});
}
}
});
}
return programs;
}''')
print(f" 本页找到 {len(page_data)} 个项目")
for prog in page_data:
name = prog['name'].strip()
if name and not any(p['name'] == name for p in all_programs):
all_programs.append({
'name': name,
'degrees': prog.get('degrees', ''),
'page': current_page
})
# Try to go to the next page
try:
next_btn = page.locator('button.c-pagination__link--next')
if await next_btn.count() > 0:
await next_btn.first.scroll_into_view_if_needed()
await next_btn.first.click()
await page.wait_for_timeout(3000)
current_page += 1
else:
print("没有下一页按钮,结束收集")
break
except Exception as e:
print(f"分页失败: {e}")
break
print(f"\n共收集到 {len(all_programs)} 个项目")
# 第二阶段:为每个项目获取导师信息
print("\n========== 第二阶段:获取导师信息 ==========")
print("注意这将访问每个项目的GSAS页面可能需要较长时间...")
for i, prog in enumerate(all_programs, 1):
print(f"\n[{i}/{len(all_programs)}] {prog['name']}")
# Generate the program URL
slug = name_to_slug(prog['name'])
prog['url'] = f"https://www.harvard.edu/programs/{slug}/"
# Generate the GSAS URL
gsas_url = f"https://gsas.harvard.edu/program/{slug}"
# Fetch faculty information
faculty_list, faculty_page_url = await get_faculty_from_gsas_page(page, gsas_url, prog['name'])
prog['faculty_page_url'] = faculty_page_url or ""
prog['faculty'] = faculty_list
prog['faculty_count'] = len(faculty_list)
# Save progress every 10 programs
if i % 10 == 0:
temp_result = {
'source_url': base_url,
'scraped_at': datetime.now(timezone.utc).isoformat(),
'progress': f"{i}/{len(all_programs)}",
'programs': all_programs[:i]
}
with open('harvard_programs_progress.json', 'w', encoding='utf-8') as f:
json.dump(temp_result, f, ensure_ascii=False, indent=2)
print(f" [进度已保存]")
# 避免请求过快
await page.wait_for_timeout(1500)
await browser.close()
# Sort by program name
programs = sorted(all_programs, key=lambda x: x['name'])
# Summary statistics
total_faculty = sum(p['faculty_count'] for p in programs)
programs_with_faculty = sum(1 for p in programs if p['faculty_count'] > 0)
# Save the final result
result = {
'source_url': base_url,
'scraped_at': datetime.now(timezone.utc).isoformat(),
'total_pages_scraped': current_page,
'total_programs': len(programs),
'programs_with_faculty': programs_with_faculty,
'total_faculty_found': total_faculty,
'programs': programs
}
output_file = Path('harvard_programs_with_faculty.json')
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(result, f, ensure_ascii=False, indent=2)
print(f"\n{'='*60}")
print(f"爬取完成!")
print(f"共爬取 {current_page}")
print(f"共找到 {len(programs)} 个研究生项目")
print(f"其中 {programs_with_faculty} 个项目有导师信息")
print(f"共找到 {total_faculty} 位导师")
print(f"结果保存到: {output_file}")
print(f"{'='*60}")
# Print a summary
print("\nProgram summary (first 30):")
for i, prog in enumerate(programs[:30], 1):
faculty_info = f"({prog['faculty_count']}位导师)" if prog['faculty_count'] > 0 else "(无导师信息)"
print(f"{i:3}. {prog['name']} {faculty_info}")
if len(programs) > 30:
print(f"... 还有 {len(programs) - 30} 个项目")
return result
if __name__ == "__main__":
asyncio.run(scrape_harvard_programs_with_faculty())


@@ -0,0 +1,910 @@
"""
University of Manchester full scraping script
New features:
- Pull JSON/XML from the Research Explorer API first; fall back to the DOM on failure
- One page per school, scraped in parallel (default 3 concurrent)
- Fine-grained timeout / retry / scroll / "Load more" controls
- Multiple URLs / backup Staff page configuration
- Faculty directory cache, mapped to programs by school keywords
- Diagnostics recording (failed schools, timed-out schools, batch info)
"""
import asyncio
import json
import re
from copy import deepcopy
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlencode, urljoin
from xml.etree import ElementTree as ET
from playwright.async_api import (
TimeoutError as PlaywrightTimeoutError,
async_playwright,
)
# =========================
# Configuration
# =========================
DEFAULT_REQUEST = {
"timeout_ms": 60000,
"post_wait_ms": 2500,
"wait_until": "domcontentloaded",
"max_retries": 3,
"retry_backoff_ms": 2000,
}
STAFF_CONCURRENCY = 3
SCHOOL_CONFIG: List[Dict[str, Any]] = [
{
"name": "Alliance Manchester Business School",
"keywords": [
"accounting",
"finance",
"business",
"management",
"marketing",
"mba",
"economics",
"entrepreneurship",
],
"attach_faculty_to_programs": True,
"staff_pages": [
{
"url": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
"extract_method": "table",
"request": {"timeout_ms": 60000, "wait_until": "networkidle"},
}
],
},
{
"name": "Department of Computer Science",
"keywords": [
"computer",
"software",
"data science",
"artificial intelligence",
"ai ",
"machine learning",
"cyber",
"computing",
],
"attach_faculty_to_programs": True,
"staff_pages": [
{
"url": "https://www.cs.manchester.ac.uk/about/people/academic-and-research-staff/",
"extract_method": "links",
"requires_scroll": True,
},
{
"url": "https://www.cs.manchester.ac.uk/about/people/",
"extract_method": "links",
"load_more_selector": "button.load-more",
"max_load_more": 6,
},
],
},
{
"name": "Department of Physics and Astronomy",
"keywords": [
"physics",
"astronomy",
"astrophysics",
"nuclear",
"particle",
],
"attach_faculty_to_programs": True,
"staff_pages": [
{
"url": "https://www.physics.manchester.ac.uk/about/people/academic-and-research-staff/",
"extract_method": "links",
"requires_scroll": True,
}
],
},
{
"name": "Department of Electrical and Electronic Engineering",
"keywords": [
"electrical",
"electronic",
"eee",
"power systems",
"microelectronics",
],
"attach_faculty_to_programs": True,
"staff_pages": [
{
"url": "https://www.eee.manchester.ac.uk/about/people/academic-and-research-staff/",
"extract_method": "links",
"requires_scroll": True,
}
],
},
{
"name": "Department of Chemistry",
"keywords": ["chemistry", "chemical"],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 200},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/department-of-chemistry/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
"request": {
"timeout_ms": 120000,
"wait_until": "networkidle",
"post_wait_ms": 5000,
},
}
],
},
{
"name": "Department of Mathematics",
"keywords": [
"mathematics",
"mathematical",
"applied math",
"statistics",
"actuarial",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 200},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/department-of-mathematics/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "School of Engineering",
"keywords": [
"engineering",
"mechanical",
"aerospace",
"civil",
"structural",
"materials",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 400},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/school-of-engineering/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "Faculty of Biology, Medicine and Health",
"keywords": [
"medicine",
"medical",
"health",
"nursing",
"pharmacy",
"clinical",
"dental",
"optometry",
"biology",
"biomedical",
"anatomical",
"physiotherapy",
"midwifery",
"mental health",
"psychology",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 400},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/faculty-of-biology-medicine-and-health/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "School of Social Sciences",
"keywords": [
"sociology",
"politics",
"international",
"social",
"criminology",
"anthropology",
"philosophy",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 200},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/school-of-social-sciences/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "School of Law",
"keywords": ["law", "legal", "llm"],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 200},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/school-of-law/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "School of Arts, Languages and Cultures",
"keywords": [
"arts",
"languages",
"culture",
"music",
"drama",
"theatre",
"history",
"linguistics",
"literature",
"translation",
"classics",
"archaeology",
"religion",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 400},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/school-of-arts-languages-and-cultures/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
{
"name": "School of Environment, Education and Development",
"keywords": [
"environment",
"education",
"development",
"planning",
"architecture",
"urban",
"geography",
"sustainability",
],
"attach_faculty_to_programs": True,
"extract_method": "research_explorer",
"research_explorer": {"page_size": 300},
"staff_pages": [
{
"url": "https://research.manchester.ac.uk/en/organisations/school-of-environment-education-and-development/persons/",
"extract_method": "research_explorer",
"requires_scroll": True,
}
],
},
]
SCHOOL_LOOKUP = {cfg["name"]: cfg for cfg in SCHOOL_CONFIG}
# =========================
# JS extraction functions
# =========================
JS_EXTRACT_TABLE_STAFF = """() => {
const staff = [];
const seen = new Set();
document.querySelectorAll('table tr').forEach(row => {
const cells = row.querySelectorAll('td');
if (cells.length >= 2) {
const link = cells[1]?.querySelector('a[href]') || cells[0]?.querySelector('a[href]');
const titleCell = cells[2] || cells[1];
if (link) {
const name = link.innerText.trim();
const url = link.href;
const title = titleCell ? titleCell.innerText.trim() : '';
if (name.length > 2 && !name.toLowerCase().includes('skip') && !seen.has(url)) {
seen.add(url);
staff.push({
name,
url,
title
});
}
}
}
});
return staff;
}"""
JS_EXTRACT_LINK_STAFF = """() => {
const staff = [];
const seen = new Set();
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href;
const text = a.innerText.trim();
if (seen.has(href)) return;
if (text.length < 5 || text.length > 80) return;
const lowerText = text.toLowerCase();
if (lowerText.includes('skip') ||
lowerText.includes('staff') ||
lowerText.includes('people') ||
lowerText.includes('academic') ||
lowerText.includes('research profiles')) return;
if (href.includes('/persons/') ||
href.includes('/portal/en/researchers/') ||
href.includes('/profile/') ||
href.includes('/people/')) {
seen.add(href);
staff.push({
name: text,
url: href,
title: ''
});
}
});
return staff;
}"""
JS_EXTRACT_RESEARCH_EXPLORER = """() => {
const staff = [];
const seen = new Set();
document.querySelectorAll('a.link.person').forEach(a => {
const href = a.href;
const text = a.innerText.trim();
if (!seen.has(href) && text.length > 3 && text.length < 80) {
seen.add(href);
staff.push({
name: text,
url: href,
title: ''
});
}
});
if (staff.length === 0) {
document.querySelectorAll('a[href*="/persons/"]').forEach(a => {
const href = a.href;
const text = a.innerText.trim();
const lower = text.toLowerCase();
if (seen.has(href)) return;
if (text.length < 3 || text.length > 80) return;
if (lower.includes('person') || lower.includes('next') || lower.includes('previous')) return;
seen.add(href);
staff.push({
name: text,
url: href,
title: ''
});
});
}
return staff;
}"""
JS_EXTRACT_PROGRAMS = """() => {
const programs = [];
const seen = new Set();
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href;
const text = a.innerText.trim().replace(/\\s+/g, ' ');
if (!href || seen.has(href)) return;
if (text.length < 10 || text.length > 200) return;
const hrefLower = href.toLowerCase();
const textLower = text.toLowerCase();
const isNav = textLower === 'courses' ||
textLower === 'masters' ||
textLower.includes('admission') ||
textLower.includes('fees') ||
textLower.includes('skip to') ||
textLower.includes('search') ||
textLower.includes('contact') ||
hrefLower.includes('#');
if (isNav) return;
const hasNumericId = /\\/\\d{5}\\//.test(href);
const isCoursePage = hrefLower.includes('/courses/list/') && hasNumericId;
if (isCoursePage) {
seen.add(href);
programs.push({
name: text,
url: href
});
}
});
return programs;
}"""
# =========================
# Data matching
# =========================
def match_program_to_school(program_name: str) -> str:
lower = program_name.lower()
for school in SCHOOL_CONFIG:
for keyword in school["keywords"]:
if keyword in lower:
return school["name"]
return "Other Programs"
# =========================
# Request and parsing helpers
# =========================
def _merge_request_settings(*layers: Optional[Dict[str, Any]]) -> Dict[str, Any]:
settings = dict(DEFAULT_REQUEST)
for layer in layers:
if not layer:
continue
for key, value in layer.items():
if value is not None:
settings[key] = value
settings["max_retries"] = max(1, int(settings.get("max_retries", 1)))
settings["retry_backoff_ms"] = settings.get("retry_backoff_ms", 2000)
return settings
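# Illustrative example: later layers override earlier ones, and None values are skipped.
#   _merge_request_settings({"timeout_ms": 90000}, {"wait_until": "networkidle", "post_wait_ms": None})
#   -> DEFAULT_REQUEST with timeout_ms=90000 and wait_until="networkidle"; post_wait_ms keeps its default.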
async def _goto_with_retry(page, url: str, settings: Dict[str, Any], label: str) -> Tuple[bool, Optional[str]]:
last_error = None
for attempt in range(settings["max_retries"]):
try:
await page.goto(url, wait_until=settings["wait_until"], timeout=settings["timeout_ms"])
if settings.get("wait_for_selector"):
await page.wait_for_selector(settings["wait_for_selector"], timeout=settings["timeout_ms"])
if settings.get("post_wait_ms"):
await page.wait_for_timeout(settings["post_wait_ms"])
return True, None
except PlaywrightTimeoutError as exc:
last_error = f"Timeout: {exc}"
except Exception as exc: # noqa: BLE001
last_error = str(exc)
if attempt < settings["max_retries"] - 1:
await page.wait_for_timeout(settings["retry_backoff_ms"] * (attempt + 1))
return False, last_error
async def _perform_scroll(page, repetitions: int = 5, delay_ms: int = 800):
repetitions = max(1, repetitions)
for i in range(repetitions):
await page.evaluate("(y) => window.scrollTo(0, y)", 2000 * (i + 1))
await page.wait_for_timeout(delay_ms)
async def _load_more(page, selector: str, max_clicks: int = 5, wait_ms: int = 1500):
for _ in range(max_clicks):
button = await page.query_selector(selector)
if not button:
break
try:
await button.click()
await page.wait_for_timeout(wait_ms)
except Exception:
break
def _deduplicate_staff(staff: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
seen = set()
cleaned = []
for item in staff:
name = (item.get("name") or "").strip()
if not name:
continue
url = (item.get("url") or "").strip()
key = url or name.lower()
if key in seen:
continue
seen.add(key)
cleaned.append({"name": name, "url": url, "title": (item.get("title") or "").strip()})
return cleaned
def _append_query(url: str, params: Dict[str, Any]) -> str:
delimiter = "&" if "?" in url else "?"
return f"{url}{delimiter}{urlencode(params)}"
def _guess_research_slug(staff_url: Optional[str]) -> Optional[str]:
if not staff_url:
return None
# Research Explorer staff URLs look like .../organisations/<org-slug>/persons/,
# so take the segment after "organisations" rather than the trailing "persons".
parts = [part for part in staff_url.rstrip("/").split("/") if part]
if "organisations" in parts and parts.index("organisations") + 1 < len(parts):
return parts[parts.index("organisations") + 1]
return parts[-1] if parts else None
def _parse_research_explorer_json(data: Any, base_url: str) -> List[Dict[str, str]]:
items: List[Dict[str, Any]] = []
if isinstance(data, list):
items = data
elif isinstance(data, dict):
for key in ("results", "items", "persons", "data", "entities"):
if isinstance(data.get(key), list):
items = data[key]
break
if not items and isinstance(data.get("rows"), list):
items = data["rows"]
staff = []
for item in items:
if not isinstance(item, dict):
continue
name = item.get("name") or item.get("title") or item.get("fullName")
profile_url = item.get("url") or item.get("href") or item.get("link") or item.get("primaryURL")
if not name:
continue
if profile_url:
profile_url = urljoin(base_url, profile_url)
staff.append(
{
"name": name.strip(),
"url": (profile_url or "").strip(),
"title": (item.get("jobTitle") or item.get("position") or "").strip(),
}
)
return staff
def _parse_research_explorer_xml(text: str, base_url: str) -> List[Dict[str, str]]:
staff: List[Dict[str, str]] = []
try:
root = ET.fromstring(text)
except ET.ParseError:
return staff
for entry in root.findall(".//{http://www.w3.org/2005/Atom}entry"):
title = entry.findtext("{http://www.w3.org/2005/Atom}title", default="")
link = entry.find("{http://www.w3.org/2005/Atom}link")
href = link.attrib.get("href") if link is not None else ""
if title:
staff.append(
{
"name": title.strip(),
"url": urljoin(base_url, href) if href else "",
"title": "",
}
)
return staff
async def fetch_research_explorer_api(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]:
config = school_config.get("research_explorer") or {}
if not config and school_config.get("extract_method") != "research_explorer":
return []
base_staff_url = ""
if school_config.get("staff_pages"):
base_staff_url = school_config["staff_pages"][0].get("url", "")
page_size = config.get("page_size", 200)
timeout_ms = config.get("timeout_ms", 70000)
candidates: List[str] = []
slug = config.get("org_slug") or _guess_research_slug(base_staff_url)
base_api = config.get("api_base", "https://research.manchester.ac.uk/ws/portalapi.aspx")
if config.get("api_url"):
candidates.append(config["api_url"])
if slug:
params = {
"action": "search",
"language": "en",
"format": "json",
"site": "default",
"showall": "true",
"pageSize": page_size,
"organisations": slug,
}
candidates.append(f"{base_api}?{urlencode(params)}")
if base_staff_url:
candidates.append(_append_query(base_staff_url, {"format": "json", "limit": page_size}))
candidates.append(_append_query(base_staff_url, {"format": "xml", "limit": page_size}))
for url in candidates:
try:
resp = await context.request.get(url, timeout=timeout_ms)
if resp.status != 200:
continue
ctype = resp.headers.get("content-type", "")
if "json" in ctype:
data = await resp.json()
parsed = _parse_research_explorer_json(data, base_staff_url)
else:
text = await resp.text()
parsed = _parse_research_explorer_xml(text, base_staff_url)
parsed = _deduplicate_staff(parsed)
if parsed:
if output_callback:
output_callback("info", f" {school_config['name']}: {len(parsed)} staff via API")
return parsed
except Exception as exc: # noqa: BLE001
if output_callback:
output_callback(
"warning", f" {school_config['name']}: API fetch failed ({str(exc)[:60]})"
)
return []
async def scrape_staff_via_browser(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]:
staff_collected: List[Dict[str, str]] = []
staff_pages = school_config.get("staff_pages") or []
if not staff_pages and school_config.get("staff_url"):
staff_pages = [{"url": school_config["staff_url"], "extract_method": school_config.get("extract_method")}]
page = await context.new_page()
blocked_types = school_config.get("blocked_resources", ["image", "font", "media"])
if blocked_types:
async def _route_handler(route):
if route.request.resource_type in blocked_types:
await route.abort()
else:
await route.continue_()
await page.route("**/*", _route_handler)
for page_cfg in staff_pages:
target_url = page_cfg.get("url")
if not target_url:
continue
settings = _merge_request_settings(school_config.get("request"), page_cfg.get("request"))
success, error = await _goto_with_retry(page, target_url, settings, school_config["name"])
if not success:
if output_callback:
output_callback("warning", f" {school_config['name']}: failed to load {target_url} ({error})")
continue
if page_cfg.get("requires_scroll"):
await _perform_scroll(page, page_cfg.get("scroll_times", 6), page_cfg.get("scroll_delay_ms", 700))
if page_cfg.get("load_from_selector"):
await _load_more(page, page_cfg["load_from_selector"], page_cfg.get("max_load_more", 5))
elif page_cfg.get("load_more_selector"):
await _load_more(page, page_cfg["load_more_selector"], page_cfg.get("max_load_more", 5))
method = page_cfg.get("extract_method") or school_config.get("extract_method") or "links"
if method == "table":
extracted = await page.evaluate(JS_EXTRACT_TABLE_STAFF)
elif method == "research_explorer":
extracted = await page.evaluate(JS_EXTRACT_RESEARCH_EXPLORER)
else:
extracted = await page.evaluate(JS_EXTRACT_LINK_STAFF)
staff_collected.extend(extracted)
await page.close()
return _deduplicate_staff(staff_collected)
# =========================
# Concurrent scraping of school staff
# =========================
async def scrape_school_staff(context, school_config: Dict[str, Any], semaphore, output_callback):
async with semaphore:
staff_list: List[Dict[str, str]] = []
status = "success"
error: Optional[str] = None
try:
if school_config.get("extract_method") == "research_explorer":
staff_list = await fetch_research_explorer_api(context, school_config, output_callback)
if not staff_list:
staff_list = await scrape_staff_via_browser(context, school_config, output_callback)
if output_callback:
output_callback("info", f" {school_config['name']}: total {len(staff_list)} staff")
except Exception as exc: # noqa: BLE001
status = "error"
error = str(exc)
if output_callback:
output_callback("error", f" {school_config['name']}: {error}")
return {
"name": school_config["name"],
"staff": staff_list,
"status": status,
"error": error,
}
async def scrape_all_school_staff(context, output_callback):
semaphore = asyncio.Semaphore(STAFF_CONCURRENCY)
tasks = [
asyncio.create_task(scrape_school_staff(context, cfg, semaphore, output_callback))
for cfg in SCHOOL_CONFIG
]
results = await asyncio.gather(*tasks)
staff_map = {}
diagnostics = {"failed": [], "success": [], "total": len(results)}
for res in results:
if res["staff"]:
staff_map[res["name"]] = res["staff"]
diagnostics["success"].append(res["name"])
else:
diagnostics["failed"].append(
{
"name": res["name"],
"status": res["status"],
"error": res.get("error"),
}
)
return staff_map, diagnostics
# =========================
# Main flow
# =========================
async def scrape(output_callback=None):
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
)
base_url = "https://www.manchester.ac.uk/"
result = {
"name": "The University of Manchester",
"url": base_url,
"scraped_at": datetime.now(timezone.utc).isoformat(),
"schools": [],
"diagnostics": {},
}
try:
# Step 1: masters course list
if output_callback:
output_callback("info", "Step 1: Scraping masters programs list...")
page = await context.new_page()
courses_url = "https://www.manchester.ac.uk/study/masters/courses/list/"
await page.goto(courses_url, wait_until="domcontentloaded", timeout=40000)
await page.wait_for_timeout(3000)
programs_data = await page.evaluate(JS_EXTRACT_PROGRAMS)
await page.close()
if output_callback:
output_callback("info", f"Found {len(programs_data)} masters programs")
# Step 2: Scrape school staff concurrently
if output_callback:
output_callback("info", "Step 2: Scraping faculty from staff pages (parallel)...")
school_staff, diagnostics = await scrape_all_school_staff(context, output_callback)
# Step 3: Organize the data
schools_dict: Dict[str, Dict[str, Any]] = {}
for prog in programs_data:
school_name = match_program_to_school(prog["name"])
if school_name not in schools_dict:
schools_dict[school_name] = {
"name": school_name,
"url": "",
"programs": [],
"faculty": school_staff.get(school_name, []),
"faculty_source": "school_directory" if school_staff.get(school_name) else "",
}
schools_dict[school_name]["programs"].append(
{
"name": prog["name"],
"url": prog["url"],
"faculty": [],
}
)
for cfg in SCHOOL_CONFIG:
if cfg["name"] in schools_dict:
first_page = (cfg.get("staff_pages") or [{}])[0]
schools_dict[cfg["name"]]["url"] = first_page.get("url") or cfg.get("staff_url", "")
_attach_faculty_to_programs(schools_dict, school_staff)
result["schools"] = list(schools_dict.values())
total_programs = sum(len(s["programs"]) for s in result["schools"])
total_faculty = sum(len(s.get("faculty", [])) for s in result["schools"])
result["diagnostics"] = {
"total_programs": total_programs,
"total_faculty_records": total_faculty,
"school_staff_success": diagnostics.get("success", []),
"school_staff_failed": diagnostics.get("failed", []),
}
if output_callback:
output_callback(
"info",
f"Done! {len(result['schools'])} schools, {total_programs} programs, {total_faculty} faculty",
)
except Exception as exc: # noqa: BLE001
if output_callback:
output_callback("error", f"Scraping error: {str(exc)}")
finally:
await browser.close()
return result
def _attach_faculty_to_programs(schools_dict: Dict[str, Dict[str, Any]], staff_map: Dict[str, List[Dict[str, str]]]):
for school_name, school_data in schools_dict.items():
staff = staff_map.get(school_name, [])
cfg = SCHOOL_LOOKUP.get(school_name, {})
if not staff or not cfg.get("attach_faculty_to_programs"):
continue
limit = cfg.get("faculty_per_program")
for program in school_data["programs"]:
sliced = deepcopy(staff[:limit] if limit else staff)
program["faculty"] = sliced
# =========================
# CLI
# =========================
if __name__ == "__main__":
import sys
if sys.platform == "win32":
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
def print_callback(level, msg):
print(f"[{level}] {msg}")
scrape_result = asyncio.run(scrape(output_callback=print_callback))
output_path = "output/manchester_complete_result.json"
with open(output_path, "w", encoding="utf-8") as f:
json.dump(scrape_result, f, ensure_ascii=False, indent=2)
print("\nResult saved to", output_path)
print("\n=== Summary ===")
for school in sorted(scrape_result["schools"], key=lambda s: -len(s.get("faculty", []))):
print(
f" {school['name']}: "
f"{len(school['programs'])} programs, "
f"{len(school.get('faculty', []))} faculty"
)

View File

@ -0,0 +1,229 @@
"""
University of Manchester dedicated scraper script
Improved version - extracts faculty information from school staff pages
"""
import asyncio
import json
import re
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse
from playwright.async_api import async_playwright
# University of Manchester school staff page mapping
# program keyword -> school staff page URL
SCHOOL_STAFF_MAPPING = {
# Alliance Manchester Business School (AMBS)
"accounting": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
"finance": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
"business": "https://www.alliancembs.manchester.ac.uk/about/our-people/",
"management": "https://www.alliancembs.manchester.ac.uk/about/our-people/",
"marketing": "https://www.alliancembs.manchester.ac.uk/research/management-sciences-and-marketing/",
"mba": "https://www.alliancembs.manchester.ac.uk/about/our-people/",
# More schools can be added here...
# "computer": "...",
# "engineering": "...",
}
# Generic school staff pages (used when no keyword matches)
GENERAL_STAFF_PAGES = [
"https://www.alliancembs.manchester.ac.uk/about/our-people/",
]
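# Example of how the mapping is applied further down: a program named
# "MSc Accounting and Finance" contains the keyword "accounting", so it is
# grouped under Alliance Manchester Business School and receives the
# Accounting & Finance staff list scraped in Step 2.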
async def scrape(output_callback=None):
"""执行爬取"""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
)
page = await context.new_page()
base_url = "https://www.manchester.ac.uk/"
result = {
"name": "The University of Manchester",
"url": base_url,
"scraped_at": datetime.now(timezone.utc).isoformat(),
"schools": []
}
try:
# Step 1: Scrape the masters program list
if output_callback:
output_callback("info", "Step 1: Scraping masters programs list...")
courses_url = "https://www.manchester.ac.uk/study/masters/courses/list/"
await page.goto(courses_url, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(3000)
# Extract all masters programs
programs_data = await page.evaluate('''() => {
const programs = [];
const seen = new Set();
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href;
const text = a.innerText.trim().replace(/\\s+/g, ' ');
if (!href || seen.has(href)) return;
if (text.length < 10 || text.length > 200) return;
const hrefLower = href.toLowerCase();
const textLower = text.toLowerCase();
// Exclude navigation links
if (textLower === 'courses' || textLower === 'masters' ||
textLower.includes('admission') || textLower.includes('fees') ||
textLower.includes('skip to') || textLower.includes('skip navigation') ||
textLower === 'home' || textLower === 'search' ||
textLower.includes('contact') || textLower.includes('footer') ||
hrefLower.endsWith('/courses/') || hrefLower.endsWith('/masters/') ||
hrefLower.includes('#')) {
return;
}
// Check whether this is a course link - it must contain a course ID
const hasNumericId = /\\/\\d{5}\\//.test(href); // five-digit numeric ID
const isCoursePage = hrefLower.includes('/courses/list/') &&
hasNumericId;
if (isCoursePage) {
seen.add(href);
programs.push({
name: text,
url: href
});
}
});
return programs;
}''')
if output_callback:
output_callback("info", f"Found {len(programs_data)} masters programs")
# Step 2: Scrape faculty information from school staff pages
if output_callback:
output_callback("info", "Step 2: Scraping faculty from school staff pages...")
all_faculty = {} # school_url -> faculty list
# Scrape the AMBS Accounting & Finance staff page
staff_url = "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/"
if output_callback:
output_callback("info", f"Scraping staff from: {staff_url}")
await page.goto(staff_url, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(3000)
# Extract staff members from the table
faculty_data = await page.evaluate('''() => {
const faculty = [];
const rows = document.querySelectorAll('table tr');
rows.forEach(row => {
const cells = row.querySelectorAll('td');
if (cells.length >= 2) {
const link = cells[1]?.querySelector('a[href]');
const titleCell = cells[2];
if (link) {
const name = link.innerText.trim();
const url = link.href;
const title = titleCell ? titleCell.innerText.trim() : '';
if (name.length > 2 && !name.toLowerCase().includes('skip')) {
faculty.push({
name: name,
url: url,
title: title
});
}
}
}
});
return faculty;
}''')
if output_callback:
output_callback("info", f"Found {len(faculty_data)} faculty members from AMBS")
all_faculty["AMBS - Accounting and Finance"] = faculty_data
# Step 3: Assemble the result
# Assign programs to schools by keyword
schools_data = {}
for prog in programs_data:
prog_name_lower = prog['name'].lower()
# Determine which school the program belongs to
school_name = "Other Programs"
matched_faculty = []
for keyword, staff_url in SCHOOL_STAFF_MAPPING.items():
if keyword in prog_name_lower:
if "accounting" in keyword or "finance" in keyword:
school_name = "Alliance Manchester Business School"
matched_faculty = all_faculty.get("AMBS - Accounting and Finance", [])
elif "business" in keyword or "management" in keyword or "mba" in keyword:
school_name = "Alliance Manchester Business School"
matched_faculty = all_faculty.get("AMBS - Accounting and Finance", [])
break
if school_name not in schools_data:
schools_data[school_name] = {
"name": school_name,
"url": "",
"programs": [],
"faculty": matched_faculty # 学院级别的导师
}
schools_data[school_name]["programs"].append({
"name": prog['name'],
"url": prog['url'],
"faculty": [] # 项目级别暂不填充
})
result["schools"] = list(schools_data.values())
# Summary statistics
total_programs = sum(len(s['programs']) for s in result['schools'])
total_faculty = sum(len(s.get('faculty', [])) for s in result['schools'])
if output_callback:
output_callback("info", f"Done! {len(result['schools'])} schools, {total_programs} programs, {total_faculty} faculty")
except Exception as e:
if output_callback:
output_callback("error", f"Scraping error: {str(e)}")
finally:
await browser.close()
return result
if __name__ == "__main__":
import sys
if sys.platform == "win32":
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
def print_callback(level, msg):
print(f"[{level}] {msg}")
result = asyncio.run(scrape(output_callback=print_callback))
# Save the result
with open("output/manchester_improved_result.json", "w", encoding="utf-8") as f:
json.dump(result, f, indent=2, ensure_ascii=False)
print(f"\nResult saved to output/manchester_improved_result.json")
print(f"Schools: {len(result['schools'])}")
for school in result['schools']:
print(f" - {school['name']}: {len(school['programs'])} programs, {len(school.get('faculty', []))} faculty")

View File

@ -0,0 +1,165 @@
#!/usr/bin/env python3
"""
Test the faculty scraping logic - only 3 programs
"""
import asyncio
import json
import re
from playwright.async_api import async_playwright
def name_to_slug(name):
"""将项目名称转换为URL slug"""
slug = name.lower()
slug = re.sub(r'[^\w\s-]', '', slug)
slug = re.sub(r'[\s_]+', '-', slug)
slug = re.sub(r'-+', '-', slug)
slug = slug.strip('-')
return slug
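# Quick sanity check of the slug rules above, using the same program names as
# test_programs below:
#   name_to_slug("African and African American Studies")
#       -> "african-and-african-american-studies"
#   name_to_slug("Computer Science") -> "computer-science"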
async def get_faculty_from_gsas_page(page, gsas_url):
"""从GSAS项目页面获取Faculty链接然后访问院系People页面获取导师列表"""
faculty_list = []
faculty_page_url = None
try:
print(f" 访问GSAS页面: {gsas_url}")
await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(2000)
# Find the Faculty section link
faculty_link = await page.evaluate('''() => {
const links = document.querySelectorAll('a[href]');
for (const link of links) {
const text = link.innerText.toLowerCase();
const href = link.href;
if (text.includes('faculty') && text.includes('see list')) {
return href;
}
if (text.includes('faculty') && (href.includes('/people') || href.includes('/faculty'))) {
return href;
}
}
return null;
}''')
if faculty_link:
faculty_page_url = faculty_link
print(f" 找到Faculty页面链接: {faculty_link}")
# 访问Faculty/People页面
await page.goto(faculty_link, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(2000)
# Extract all faculty information
faculty_list = await page.evaluate('''() => {
const faculty = [];
const seen = new Set();
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href || '';
const text = a.innerText.trim();
const lowerHref = href.toLowerCase();
if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
lowerHref.includes('/profile/')) &&
text.length > 3 && text.length < 100 &&
!text.toLowerCase().includes('people') &&
!text.toLowerCase().includes('faculty') &&
!lowerHref.endsWith('/people/') &&
!lowerHref.endsWith('/faculty/')) {
if (!seen.has(href)) {
seen.add(href);
faculty.push({
name: text,
url: href
});
}
}
});
return faculty;
}''')
print(f" 找到 {len(faculty_list)} 位导师")
for f in faculty_list[:5]:
print(f" - {f['name']}: {f['url']}")
if len(faculty_list) > 5:
print(f" ... 还有 {len(faculty_list) - 5}")
else:
print(" 未找到Faculty页面链接")
except Exception as e:
print(f" 获取Faculty信息失败: {e}")
return faculty_list, faculty_page_url
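# Each entry collected above is a small dict of the form
# {"name": "<link text>", "url": "<profile URL>"}; faculty_page_url stays None
# when no Faculty/People link is found on the GSAS page.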
async def test_faculty_scraper():
"""测试导师爬取"""
# 测试3个项目
test_programs = [
"African and African American Studies",
"Economics",
"Computer Science"
]
async with async_playwright() as p:
browser = await p.chromium.launch(headless=False)
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
viewport={'width': 1920, 'height': 1080}
)
page = await context.new_page()
results = []
for i, name in enumerate(test_programs, 1):
print(f"\n{'='*60}")
print(f"[{i}/{len(test_programs)}] 测试: {name}")
print(f"{'='*60}")
slug = name_to_slug(name)
program_url = f"https://www.harvard.edu/programs/{slug}/"
gsas_url = f"https://gsas.harvard.edu/program/{slug}"
print(f"项目URL: {program_url}")
print(f"GSAS URL: {gsas_url}")
faculty_list, faculty_page_url = await get_faculty_from_gsas_page(page, gsas_url)
results.append({
'name': name,
'url': program_url,
'gsas_url': gsas_url,
'faculty_page_url': faculty_page_url,
'faculty': faculty_list,
'faculty_count': len(faculty_list)
})
await page.wait_for_timeout(1000)
await browser.close()
# Print results
print(f"\n\n{'='*60}")
print("Test result summary")
print(f"{'='*60}")
for r in results:
print(f"\n{r['name']}:")
print(f" Faculty页面: {r['faculty_page_url'] or '未找到'}")
print(f" 导师数量: {r['faculty_count']}")
# Save the test results
with open('test_faculty_results.json', 'w', encoding='utf-8') as f:
json.dump(results, f, ensure_ascii=False, indent=2)
print(f"\n测试结果已保存到: test_faculty_results.json")
if __name__ == "__main__":
asyncio.run(test_faculty_scraper())

View File

@ -0,0 +1,464 @@
"""
Test Manchester University scraper - improved faculty mapping
"""
import asyncio
import json
from datetime import datetime, timezone
from playwright.async_api import async_playwright
MASTERS_PATHS = [
"/study/masters/courses/list/",
"/study/masters/courses/",
"/postgraduate/taught/courses/",
"/postgraduate/courses/list/",
"/postgraduate/courses/",
"/graduate/programs/",
"/academics/graduate/programs/",
"/programmes/masters/",
"/masters/programmes/",
"/admissions/graduate/programs/",
]
ACCOUNTING_STAFF_URL = "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/"
ACCOUNTING_STAFF_CACHE = None
JS_CHECK_COURSES = r"""() => {
const links = document.querySelectorAll('a[href]');
let courseCount = 0;
for (const a of links) {
const href = a.href.toLowerCase();
if (/\/\d{4,}\//.test(href) ||
/\/(msc|ma|mba|mres|llm|med|meng)-/.test(href) ||
/\/course\/[a-z]/.test(href)) {
courseCount++;
}
}
return courseCount;
}"""
JS_FIND_LIST_URL = """() => {
const links = document.querySelectorAll('a[href]');
for (const a of links) {
const text = a.innerText.toLowerCase();
const href = a.href.toLowerCase();
if ((text.includes('a-z') || text.includes('all course') ||
text.includes('full list') || text.includes('browse all') ||
href.includes('/list')) &&
(href.includes('master') || href.includes('course') || href.includes('postgrad'))) {
return a.href;
}
}
return null;
}"""
JS_FIND_COURSES_FROM_HOME = """() => {
const links = document.querySelectorAll('a[href]');
for (const a of links) {
const href = a.href.toLowerCase();
const text = a.innerText.toLowerCase();
if ((href.includes('master') || href.includes('postgraduate') || href.includes('graduate')) &&
(href.includes('course') || href.includes('program') || href.includes('degree'))) {
return a.href;
}
}
return null;
}"""
JS_EXTRACT_PROGRAMS = r"""() => {
const programs = [];
const seen = new Set();
const currentHost = window.location.hostname;
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href;
const text = a.innerText.trim().replace(/\s+/g, ' ');
if (!href || seen.has(href)) return;
if (text.length < 5 || text.length > 200) return;
if (href.includes('#') || href.includes('javascript:') || href.includes('mailto:')) return;
try {
const linkHost = new URL(href).hostname;
if (!linkHost.includes(currentHost.replace('www.', '')) &&
!currentHost.includes(linkHost.replace('www.', ''))) return;
} catch {
return;
}
const hrefLower = href.toLowerCase();
const textLower = text.toLowerCase();
const isNavigation = textLower === 'courses' ||
textLower === 'programmes' ||
textLower === 'undergraduate' ||
textLower === 'postgraduate' ||
textLower === 'masters' ||
textLower === "master's" ||
textLower.includes('skip to') ||
textLower.includes('share') ||
textLower === 'home' ||
textLower === 'study' ||
textLower.startsWith('a-z') ||
textLower.includes('admission') ||
textLower.includes('fees and funding') ||
textLower.includes('why should') ||
textLower.includes('why manchester') ||
textLower.includes('teaching and learning') ||
textLower.includes('meet us') ||
textLower.includes('student support') ||
textLower.includes('contact us') ||
textLower.includes('how to apply') ||
hrefLower.includes('/admissions/') ||
hrefLower.includes('/fees-and-funding/') ||
hrefLower.includes('/why-') ||
hrefLower.includes('/meet-us/') ||
hrefLower.includes('/contact-us/') ||
hrefLower.includes('/student-support/') ||
hrefLower.includes('/teaching-and-learning/') ||
hrefLower.endsWith('/courses/') ||
hrefLower.endsWith('/masters/') ||
hrefLower.endsWith('/postgraduate/');
if (isNavigation) return;
const isExcluded = hrefLower.includes('/undergraduate') ||
hrefLower.includes('/bachelor') ||
hrefLower.includes('/phd/') ||
hrefLower.includes('/doctoral') ||
hrefLower.includes('/research-degree') ||
textLower.includes('bachelor') ||
textLower.includes('undergraduate') ||
(textLower.includes('phd') && !textLower.includes('mphil'));
if (isExcluded) return;
const hasNumericId = /\/\d{4,}\//.test(href);
const hasDegreeSlug = /\/(msc|ma|mba|mres|llm|med|meng|mpa|mph|mphil)-[a-z]/.test(hrefLower);
const isCoursePage = (hrefLower.includes('/course/') ||
hrefLower.includes('/courses/list/') ||
hrefLower.includes('/programme/')) &&
href.split('/').filter(p => p).length > 4;
const textHasDegree = /(msc|ma|mba|mres|llm|med|meng|pgcert|pgdip)/i.test(text) ||
textLower.includes('master');
if (hasNumericId || hasDegreeSlug || isCoursePage || textHasDegree) {
seen.add(href);
programs.push({
name: text,
url: href
});
}
});
return programs;
}"""
JS_EXTRACT_FACULTY = r"""() => {
const faculty = [];
const seen = new Set();
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href.toLowerCase();
const text = a.innerText.trim();
if (seen.has(href)) return;
if (text.length < 3 || text.length > 100) return;
const isStaff = href.includes('/people/') ||
href.includes('/staff/') ||
href.includes('/faculty/') ||
href.includes('/profile/') ||
href.includes('/academics/') ||
href.includes('/researcher/');
if (isStaff) {
seen.add(href);
faculty.push({
name: text.replace(/\s+/g, ' '),
url: a.href
});
}
});
return faculty.slice(0, 20);
}"""
JS_EXTRACT_ACCOUNTING_STAFF = r"""() => {
const rows = Array.from(document.querySelectorAll('table tbody tr'));
const staff = [];
for (const row of rows) {
const cells = row.querySelectorAll('td');
if (!cells || cells.length < 2) {
continue;
}
const nameCell = cells[1];
const roleCell = cells[2];
const emailCell = cells[5];
let profileUrl = '';
let displayName = nameCell ? nameCell.innerText.trim() : '';
const link = nameCell ? nameCell.querySelector('a[href]') : null;
if (link) {
profileUrl = link.href;
displayName = link.innerText.trim() || displayName;
}
if (!displayName) {
continue;
}
let email = '';
if (emailCell) {
const emailLink = emailCell.querySelector('a[href^="mailto:"]');
if (emailLink) {
email = emailLink.href.replace('mailto:', '').trim();
}
}
staff.push({
name: displayName,
title: roleCell ? roleCell.innerText.trim() : '',
url: profileUrl,
email: email
});
}
return staff;
}"""
def should_use_accounting_staff(program_name: str) -> bool:
lower_name = program_name.lower()
return "msc" in lower_name and "accounting" in lower_name
async def load_accounting_staff(context, output_callback=None):
global ACCOUNTING_STAFF_CACHE
if ACCOUNTING_STAFF_CACHE is not None:
return ACCOUNTING_STAFF_CACHE
staff_page = await context.new_page()
try:
if output_callback:
output_callback("info", "Loading official AMBS Accounting & Finance staff page...")
await staff_page.goto(ACCOUNTING_STAFF_URL, wait_until="domcontentloaded", timeout=30000)
await staff_page.wait_for_timeout(2000)
ACCOUNTING_STAFF_CACHE = await staff_page.evaluate(JS_EXTRACT_ACCOUNTING_STAFF)
if output_callback:
output_callback("info", f"Captured {len(ACCOUNTING_STAFF_CACHE)} faculty from the official staff page")
except Exception as exc:
if output_callback:
output_callback("error", f"Failed to load AMBS staff page: {exc}")
ACCOUNTING_STAFF_CACHE = []
finally:
await staff_page.close()
return ACCOUNTING_STAFF_CACHE
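# The module-level ACCOUNTING_STAFF_CACHE means the staff page is fetched at
# most once per run; every later accounting programme reuses the cached list.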
async def find_course_list_page(page, base_url, output_callback):
for path in MASTERS_PATHS:
test_url = base_url.rstrip('/') + path
try:
response = await page.goto(test_url, wait_until="domcontentloaded", timeout=15000)
if response and response.status == 200:
title = await page.title()
if '404' not in title.lower() and 'not found' not in title.lower():
has_courses = await page.evaluate(JS_CHECK_COURSES)
if has_courses > 5:
if output_callback:
output_callback("info", f"Found course list: {path} ({has_courses} courses)")
return test_url
list_url = await page.evaluate(JS_FIND_LIST_URL)
if list_url:
if output_callback:
output_callback("info", f"Found full course list: {list_url}")
return list_url
except Exception:
continue
try:
await page.goto(base_url, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(2000)
courses_url = await page.evaluate(JS_FIND_COURSES_FROM_HOME)
if courses_url:
return courses_url
except Exception:
pass
return None
async def extract_course_links(page, output_callback):
return await page.evaluate(JS_EXTRACT_PROGRAMS)
async def scrape(output_callback=None):
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
page = await context.new_page()
base_url = "https://www.manchester.ac.uk/"
result = {
"name": "Manchester University",
"url": base_url,
"scraped_at": datetime.now(timezone.utc).isoformat(),
"schools": []
}
all_programs = []
try:
if output_callback:
output_callback("info", "Searching for masters course list...")
courses_url = await find_course_list_page(page, base_url, output_callback)
if not courses_url:
if output_callback:
output_callback("warning", "Course list not found, using homepage")
courses_url = base_url
if output_callback:
output_callback("info", "Extracting masters programs...")
await page.goto(courses_url, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(3000)
for _ in range(3):
try:
load_more = page.locator('button:has-text("Load more"), button:has-text("Show more"), button:has-text("View more"), a:has-text("Load more")')
if await load_more.count() > 0:
await load_more.first.click()
await page.wait_for_timeout(2000)
else:
break
except Exception:
break
programs_data = await extract_course_links(page, output_callback)
if output_callback:
output_callback("info", f"Found {len(programs_data)} masters programs")
print("\nTop 20 programs:")
for i, prog in enumerate(programs_data[:20]):
print(f" {i+1}. {prog['name'][:60]}")
print(f" {prog['url']}")
max_detail_pages = min(len(programs_data), 30)
detailed_processed = 0
logged_official_staff = False
for prog in programs_data:
faculty_data = []
used_official_staff = False
if should_use_accounting_staff(prog['name']):
staff_list = await load_accounting_staff(context, output_callback)
if staff_list:
used_official_staff = True
if output_callback and not logged_official_staff:
output_callback("info", "Using Alliance MBS Accounting & Finance staff directory for accounting programmes")
logged_official_staff = True
faculty_data = [
{
"name": person.get("name"),
"url": person.get("url") or ACCOUNTING_STAFF_URL,
"title": person.get("title"),
"email": person.get("email"),
"source": "Alliance Manchester Business School - Accounting & Finance staff"
}
for person in staff_list
]
elif detailed_processed < max_detail_pages:
detailed_processed += 1
if output_callback and detailed_processed % 10 == 0:
output_callback("info", f"Processing {detailed_processed}/{max_detail_pages}: {prog['name'][:50]}")
try:
await page.goto(prog['url'], wait_until="domcontentloaded", timeout=15000)
await page.wait_for_timeout(800)
faculty_data = await page.evaluate(JS_EXTRACT_FACULTY)
except Exception as e:
if output_callback:
output_callback("warning", f"Failed to capture faculty for {prog['name'][:50]}: {e}")
faculty_data = []
program_entry = {
"name": prog['name'],
"url": prog['url'],
"faculty": faculty_data
}
if used_official_staff:
program_entry["faculty_page_override"] = ACCOUNTING_STAFF_URL
all_programs.append(program_entry)
result["schools"] = [{
"name": "Masters Programs",
"url": courses_url,
"programs": all_programs
}]
if output_callback:
total_faculty = sum(len(p.get('faculty', [])) for p in all_programs)
output_callback("info", f"Done! {len(all_programs)} programs, {total_faculty} faculty")
except Exception as e:
if output_callback:
output_callback("error", f"Scraping error: {str(e)}")
finally:
await browser.close()
return result
def log_callback(level, message):
print(f"[{level.upper()}] {message}")
if __name__ == "__main__":
result = asyncio.run(scrape(output_callback=log_callback))
print("\n" + "="*60)
print("Scrape summary:")
print("="*60)
if result.get("schools"):
school = result["schools"][0]
programs = school.get("programs", [])
print(f"Course list URL: {school.get('url')}")
print(f"Total programs: {len(programs)}")
faculty_count = sum(len(p.get('faculty', [])) for p in programs)
print(f"Faculty total: {faculty_count}")
print("\nTop 10 programs:")
for i, p in enumerate(programs[:10]):
print(f" {i+1}. {p['name'][:60]}")
if p.get("faculty"):
print(f" Faculty entries: {len(p['faculty'])}")
with open("manchester_test_result.json", "w", encoding="utf-8") as f:
json.dump(result, f, indent=2, ensure_ascii=False)
print("\nSaved results to manchester_test_result.json")