Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
artifacts/debug_cs_faculty.py (new file, 83 lines)
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""
Debug the Computer Science Faculty page
"""

import asyncio
from playwright.async_api import async_playwright


async def debug_cs():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        # Visit the Computer Science GSAS page
        gsas_url = "https://gsas.harvard.edu/program/computer-science"
        print(f"Visiting: {gsas_url}")

        await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(3000)

        await page.screenshot(path="cs_gsas_page.png", full_page=True)
        print("Screenshot saved: cs_gsas_page.png")

        # Collect all links on the page
        links = await page.evaluate('''() => {
            const links = [];
            document.querySelectorAll('a[href]').forEach(a => {
                const text = a.innerText.trim();
                const href = a.href;
                if (text && text.length > 2 && text.length < 100) {
                    links.push({text: text, href: href});
                }
            });
            return links;
        }''')

        print(f"\nAll links on the page ({len(links)} total):")
        for link in links:
            print(f"  - {link['text'][:60]} -> {link['href']}")

        # Look for likely Faculty or People links
        print("\n\nSearching for Faculty/People related links:")
        for link in links:
            text_lower = link['text'].lower()
            href_lower = link['href'].lower()
            if 'faculty' in text_lower or 'people' in href_lower or 'faculty' in href_lower or 'website' in text_lower:
                print(f"  * {link['text']} -> {link['href']}")

        # Try the SEAS (School of Engineering) page
        print("\n\nTrying the SEAS Computer Science page...")
        seas_url = "https://seas.harvard.edu/computer-science"
        await page.goto(seas_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)

        await page.screenshot(path="seas_cs_page.png", full_page=True)
        print("Screenshot saved: seas_cs_page.png")

        seas_links = await page.evaluate('''() => {
            const links = [];
            document.querySelectorAll('a[href]').forEach(a => {
                const text = a.innerText.trim();
                const href = a.href;
                const lowerText = text.toLowerCase();
                const lowerHref = href.toLowerCase();
                if ((lowerText.includes('faculty') || lowerText.includes('people') ||
                     lowerHref.includes('faculty') || lowerHref.includes('people')) &&
                    text.length > 2) {
                    links.push({text: text, href: href});
                }
            });
            return links;
        }''')

        print(f"\nFaculty/People links on the SEAS page:")
        for link in seas_links:
            print(f"  * {link['text']} -> {link['href']}")

        await browser.close()


if __name__ == "__main__":
    asyncio.run(debug_cs())
artifacts/explore_faculty_page.py (new file, 110 lines)
@@ -0,0 +1,110 @@
"""
Explore the structure of Harvard department People/Faculty pages to collect the supervisor list
"""
import asyncio
from playwright.async_api import async_playwright


async def explore_faculty_page():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        # Visit the AAAS department People page
        people_url = "https://aaas.fas.harvard.edu/aaas-people"
        print(f"Visiting department People page: {people_url}")

        await page.goto(people_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)

        # Save a screenshot
        await page.screenshot(path="aaas_people_page.png", full_page=True)
        print("Screenshot saved: aaas_people_page.png")

        # Collect all faculty links
        faculty_info = await page.evaluate('''() => {
            const faculty = [];

            // Find all links under the /people/ path
            document.querySelectorAll('a[href*="/people/"]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();

                // Filter out navigation links and keep only personal profile pages
                if (href.includes('/people/') && text.length > 3 &&
                    !text.toLowerCase().includes('people') &&
                    !href.endsWith('/people/') &&
                    !href.endsWith('/aaas-people')) {
                    faculty.push({
                        name: text,
                        url: href
                    });
                }
            });

            return faculty;
        }''')

        print(f"\nFound {len(faculty_info)} faculty members:")
        for f in faculty_info:
            print(f"  - {f['name']} -> {f['url']}")

        # Try the Economics department Faculty page
        print("\n\n========== Trying the Economics department Faculty page ==========")
        econ_faculty_url = "http://economics.harvard.edu/people/people-type/faculty"
        print(f"Visiting: {econ_faculty_url}")

        await page.goto(econ_faculty_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)

        await page.screenshot(path="econ_faculty_page.png", full_page=True)
        print("Screenshot saved: econ_faculty_page.png")

        econ_faculty = await page.evaluate('''() => {
            const faculty = [];

            // Find all likely faculty links
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                const lowerHref = href.toLowerCase();

                // Look for personal profile links
                if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
                     lowerHref.includes('/profile/')) &&
                    text.length > 3 && text.length < 100 &&
                    !text.toLowerCase().includes('faculty') &&
                    !text.toLowerCase().includes('people')) {
                    faculty.push({
                        name: text,
                        url: href
                    });
                }
            });

            return faculty;
        }''')

        print(f"\nFound {len(econ_faculty)} faculty members:")
        for f in econ_faculty[:30]:
            print(f"  - {f['name']} -> {f['url']}")

        # Dump all links on the page for debugging
        print("\n\nAll links on the page:")
        all_links = await page.evaluate('''() => {
            const links = [];
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                if (text && text.length > 2 && text.length < 100) {
                    links.push({text: text, href: href});
                }
            });
            return links;
        }''')
        for link in all_links[:40]:
            print(f"  - {link['text'][:50]} -> {link['href']}")

        await browser.close()


if __name__ == "__main__":
    asyncio.run(explore_faculty_page())
artifacts/explore_manchester.py (new file, 173 lines)
@@ -0,0 +1,173 @@
"""
Explore the structure of the University of Manchester masters course pages
"""

import asyncio
import json
from playwright.async_api import async_playwright


async def explore_manchester():
    """Explore the structure of the University of Manchester website"""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )
        page = await context.new_page()

        # Go straight to the masters courses A-Z list page
        print("Visiting the masters courses A-Z list page...")
        await page.goto("https://www.manchester.ac.uk/study/masters/courses/list/",
                        wait_until="domcontentloaded", timeout=60000)
        await page.wait_for_timeout(5000)

        # Screenshot
        await page.screenshot(path="manchester_masters_page.png", full_page=False)
        print("Screenshot saved: manchester_masters_page.png")

        # Analyze the page structure
        page_info = await page.evaluate("""() => {
            const info = {
                title: document.title,
                url: window.location.href,
                all_links: [],
                course_candidates: [],
                page_sections: []
            };

            // Collect all links
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href;
                const text = a.innerText.trim().substring(0, 100);
                if (href && text) {
                    info.all_links.push({href, text});
                }
            });

            // Find likely course links - containing /course/ or list-item
            document.querySelectorAll('a[href*="/course/"], .course-link, [class*="course"] a, .search-result a, .list-item a').forEach(a => {
                info.course_candidates.push({
                    href: a.href,
                    text: a.innerText.trim().substring(0, 100),
                    classes: a.className,
                    parent_classes: a.parentElement?.className || ''
                });
            });

            // Collect the main page sections
            document.querySelectorAll('main, [role="main"], .content, #content, .results, .course-list').forEach(el => {
                info.page_sections.push({
                    tag: el.tagName,
                    id: el.id,
                    classes: el.className,
                    children_count: el.children.length
                });
            });

            return info;
        }""")

        print(f"\nPage title: {page_info['title']}")
        print(f"Current URL: {page_info['url']}")
        print(f"\nTotal links: {len(page_info['all_links'])}")
        print(f"Course candidate links: {len(page_info['course_candidates'])}")

        # Find links containing masters/courses/
        masters_links = [l for l in page_info['all_links']
                         if 'masters/courses/' in l['href'].lower()
                         and l['href'] != page_info['url']]

        print(f"\nMasters course related links ({len(masters_links)}):")
        for link in masters_links[:20]:
            print(f"  - {link['text'][:50]}: {link['href']}")

        print(f"\nCourse candidate details:")
        for c in page_info['course_candidates'][:10]:
            print(f"  - {c['text'][:50]}")
            print(f"    URL: {c['href']}")
            print(f"    Classes: {c['classes']}")

        # Check for search/filter controls
        search_elements = await page.evaluate("""() => {
            const elements = [];
            document.querySelectorAll('input[type="search"], input[type="text"], select, .filter, .search').forEach(el => {
                elements.push({
                    tag: el.tagName,
                    type: el.type || '',
                    id: el.id,
                    name: el.name || '',
                    classes: el.className
                });
            });
            return elements;
        }""")

        print(f"\nSearch/filter elements: {len(search_elements)}")
        for el in search_elements[:5]:
            print(f"  - {el}")

        # Try to find the actual structure of the course list
        print("\n\nAnalyzing the course list structure on the page...")

        list_structures = await page.evaluate("""() => {
            const structures = [];

            // Try various likely list structures
            const selectors = [
                'ul li a[href*="course"]',
                'div[class*="result"] a',
                'div[class*="course"] a',
                'article a[href]',
                '.search-results a',
                '[data-course] a',
                'table tr td a'
            ];

            for (const selector of selectors) {
                const elements = document.querySelectorAll(selector);
                if (elements.length > 0) {
                    const samples = [];
                    elements.forEach((el, i) => {
                        if (i < 5) {
                            samples.push({
                                href: el.href,
                                text: el.innerText.trim().substring(0, 80)
                            });
                        }
                    });
                    structures.push({
                        selector: selector,
                        count: elements.length,
                        samples: samples
                    });
                }
            }

            return structures;
        }""")

        print("\nList structures found:")
        for s in list_structures:
            print(f"\n  Selector: {s['selector']} ({s['count']} matches)")
            for sample in s['samples']:
                print(f"    - {sample['text']}: {sample['href']}")

        # Save the full analysis
        with open("manchester_analysis.json", "w", encoding="utf-8") as f:
            json.dump(page_info, f, indent=2, ensure_ascii=False)

        print("\n\nFull analysis saved to manchester_analysis.json")

        # Wait so the user can inspect the browser
        print("\nPress Ctrl+C to close the browser...")
        try:
            await asyncio.sleep(30)
        except:
            pass

        await browser.close()


if __name__ == "__main__":
    asyncio.run(explore_manchester())
artifacts/explore_program_page.py (new file, 226 lines)
@@ -0,0 +1,226 @@
"""
Explore the structure of Harvard program pages to find supervisor information
"""
import asyncio
from playwright.async_api import async_playwright


async def explore_program_page():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        # Visit a graduate program page (GSAS)
        gsas_url = "https://gsas.harvard.edu/program/african-and-african-american-studies"
        print(f"Visiting graduate program page: {gsas_url}")

        await page.goto(gsas_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)

        # Save a screenshot
        await page.screenshot(path="gsas_program_page.png", full_page=True)
        print("Screenshot saved: gsas_program_page.png")

        # Analyze the page structure
        page_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                h1: document.querySelector('h1')?.innerText || '',
                allHeadings: [],
                facultyLinks: [],
                peopleLinks: [],
                allLinks: []
            };

            // Collect all headings
            document.querySelectorAll('h1, h2, h3, h4').forEach(h => {
                info.allHeadings.push({
                    tag: h.tagName,
                    text: h.innerText.trim().substring(0, 100)
                });
            });

            // Inspect all links
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();

                // Check whether the link relates to faculty
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();

                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                    lowerHref.includes('professor') || lowerHref.includes('staff') ||
                    lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }

                // Check whether the link is a personal profile page
                if (href.includes('/people/') || href.includes('/faculty/') ||
                    href.includes('/profile/') || href.includes('/person/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }

                // Keep all major links
                if (href && text.length > 2 && text.length < 150) {
                    info.allLinks.push({
                        text: text,
                        href: href
                    });
                }
            });

            return info;
        }''')

        print(f"\nPage title: {page_info['title']}")
        print(f"H1: {page_info['h1']}")

        print(f"\nAll headings ({len(page_info['allHeadings'])}):")
        for h in page_info['allHeadings']:
            print(f"  <{h['tag']}>: {h['text']}")

        print(f"\nFaculty related links ({len(page_info['facultyLinks'])}):")
        for f in page_info['facultyLinks']:
            print(f"  - {f['text']} -> {f['href']}")

        print(f"\nPersonal profile links ({len(page_info['peopleLinks'])}):")
        for p in page_info['peopleLinks']:
            print(f"  - {p['text']} -> {p['href']}")

        print(f"\nAll links ({len(page_info['allLinks'])}):")
        for link in page_info['allLinks'][:50]:
            print(f"  - {link['text'][:60]} -> {link['href']}")

        # Try another program page to see whether the structure differs
        print("\n\n========== Trying another program page ==========")
        economics_url = "https://gsas.harvard.edu/program/economics"
        print(f"Visiting: {economics_url}")

        await page.goto(economics_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)

        # Save a screenshot
        await page.screenshot(path="gsas_economics_page.png", full_page=True)
        print("Screenshot saved: gsas_economics_page.png")

        # Analyze
        econ_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                facultyLinks: [],
                peopleLinks: []
            };

            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();

                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                    lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }

                if (href.includes('/people/') || href.includes('/faculty/') ||
                    href.includes('/profile/') || href.includes('/person/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
            });

            return info;
        }''')

        print(f"\nFaculty related links ({len(econ_info['facultyLinks'])}):")
        for f in econ_info['facultyLinks']:
            print(f"  - {f['text']} -> {f['href']}")

        print(f"\nPersonal profile links ({len(econ_info['peopleLinks'])}):")
        for p in econ_info['peopleLinks']:
            print(f"  - {p['text']} -> {p['href']}")

        # Visit the department homepage to look for a Faculty page
        print("\n\n========== Trying the department homepage ==========")
        dept_url = "https://aaas.fas.harvard.edu/"
        print(f"Visiting department homepage: {dept_url}")

        await page.goto(dept_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)

        await page.screenshot(path="aaas_dept_page.png", full_page=True)
        print("Screenshot saved: aaas_dept_page.png")

        dept_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                navLinks: [],
                facultyLinks: [],
                peopleLinks: []
            };

            // Collect navigation links
            document.querySelectorAll('nav a, [class*="nav"] a, [class*="menu"] a').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                if (text && text.length > 1 && text.length < 50) {
                    info.navLinks.push({
                        text: text,
                        href: href
                    });
                }
            });

            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();

                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                    lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }

                if (href.includes('/people/') || href.includes('/faculty/') ||
                    href.includes('/profile/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
            });

            return info;
        }''')

        print(f"\nNavigation links ({len(dept_info['navLinks'])}):")
        for link in dept_info['navLinks'][:20]:
            print(f"  - {link['text']} -> {link['href']}")

        print(f"\nFaculty related links ({len(dept_info['facultyLinks'])}):")
        for f in dept_info['facultyLinks']:
            print(f"  - {f['text']} -> {f['href']}")

        print(f"\nPersonal profile links ({len(dept_info['peopleLinks'])}):")
        for p in dept_info['peopleLinks'][:30]:
            print(f"  - {p['text']} -> {p['href']}")

        await browser.close()


if __name__ == "__main__":
    asyncio.run(explore_program_page())
@@ -125,6 +125,7 @@ class ScrapeSettings:
     output: Path
     verify_links: bool = True
     request_delay: float = 1.0  # Polite crawling delay
+    timeout: int = 60000  # Navigation timeout in ms
 
 
 async def extract_links(page: Page) -> List[Tuple[str, str]]:
@@ -210,7 +211,7 @@ async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink]:
         page = await context.new_page()
         try:
             response = await page.goto(
-                normalized_url, wait_until="domcontentloaded", timeout=20000
+                normalized_url, wait_until="domcontentloaded", timeout=settings.timeout
             )
             if not response or response.status >= 400:
                 await page.close()
@@ -411,6 +412,12 @@ def parse_args() -> argparse.Namespace:
         default=1.0,
         help="Delay between requests in seconds (polite crawling).",
     )
+    parser.add_argument(
+        "--timeout",
+        type=int,
+        default=60000,
+        help="Navigation timeout in milliseconds (default: 60000 = 60s).",
+    )
     return parser.parse_args()
 
 
@@ -424,6 +431,7 @@ async def main_async() -> None:
         output=args.output,
         verify_links=not args.no_verify,
         request_delay=args.delay,
+        timeout=args.timeout,
     )
     links = await crawl(settings, browser_name=args.browser)
     serialize(links, settings.output, settings.root_url)
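A possible invocation of the updated CLI, as a minimal sketch: the script name and the positional URL argument are assumptions (they are not shown in the hunks above); only the --timeout, --delay, --output, --browser, and --no-verify options appear in the changed code.

    # Hypothetical usage: raise the navigation timeout to 120s for a slow site
    python scraper.py https://gsas.harvard.edu --timeout 120000 --delay 2.0 --output links.json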
artifacts/harvard_programs_scraper.py (new file, 466 lines)
@@ -0,0 +1,466 @@
#!/usr/bin/env python3
"""
Harvard Graduate Programs Scraper
Scrapes every graduate program listed on https://www.harvard.edu/programs/?degree_levels=graduate
by clicking through the pagination buttons.
"""

import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright


async def scrape_harvard_programs():
    """Scrape the Harvard graduate program list by clicking through the pagination buttons"""

    all_programs = []
    base_url = "https://www.harvard.edu/programs/?degree_levels=graduate"

    async with async_playwright() as p:
        # Run headless
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={'width': 1920, 'height': 1080}
        )
        page = await context.new_page()

        print(f"Visiting: {base_url}")
        # Use domcontentloaded rather than networkidle for faster loading
        await page.goto(base_url, wait_until="domcontentloaded", timeout=60000)
        # Wait for the page content to load
        await page.wait_for_timeout(5000)

        # Scroll to the bottom so the pagination buttons are loaded
        print("Scrolling to the bottom of the page...")
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(2000)

        current_page = 1
        max_pages = 15

        while current_page <= max_pages:
            print(f"\n========== Page {current_page} ==========")

            # Wait for content to load
            await page.wait_for_timeout(2000)

            # Extract the programs on the current page.
            # Debug output showed the program buttons use the class
            # 'records__record___PbPhG c-programs-item__title-link'.
            # Harvard uses JavaScript navigation, so URLs have to be found by clicking buttons.

            # First collect all program button information
            page_data = await page.evaluate('''() => {
                const programs = [];

                // Find all program rows/containers
                const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]');

                programItems.forEach((item, index) => {
                    // Get the program name button
                    const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]');
                    if (!nameBtn) return;

                    const name = nameBtn.innerText.trim();
                    if (!name || name.length < 3) return;

                    // Extract degree information
                    let degrees = '';
                    const allText = item.innerText;
                    const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g);
                    if (degreeMatch) {
                        degrees = degreeMatch.join(', ');
                    }

                    // Look for a link - check every likely location
                    let url = '';

                    // Method 1: look for an <a> tag
                    const link = item.querySelector('a[href]');
                    if (link && link.href) {
                        url = link.href;
                    }

                    // Method 2: check data attributes
                    if (!url) {
                        const dataUrl = nameBtn.getAttribute('data-url') ||
                                        nameBtn.getAttribute('data-href') ||
                                        item.getAttribute('data-url');
                        if (dataUrl) url = dataUrl;
                    }

                    // Method 3: check the onclick attribute
                    if (!url) {
                        const onclick = nameBtn.getAttribute('onclick') || '';
                        const urlMatch = onclick.match(/['"]([^'"]*\\/programs\\/[^'"]*)['"]/);
                        if (urlMatch) url = urlMatch[1];
                    }

                    programs.push({
                        name: name,
                        degrees: degrees,
                        url: url,
                        index: index
                    });
                });

                // If the first approach found nothing, use a fallback
                if (programs.length === 0) {
                    // Look for all program buttons
                    const buttons = document.querySelectorAll('button');
                    buttons.forEach((btn, index) => {
                        const className = btn.className || '';
                        if (className.includes('c-programs-item') || className.includes('title-link')) {
                            const name = btn.innerText.trim();
                            if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) {
                                programs.push({
                                    name: name,
                                    degrees: '',
                                    url: '',
                                    index: index
                                });
                            }
                        }
                    });
                }

                return {
                    programs: programs,
                    totalFound: programs.length
                };
            }''')

            # On the first page, dump the HTML structure if nothing was found
            if current_page == 1 and len(page_data['programs']) == 0:
                print("No programs found, dumping HTML structure for debugging...")
                html_debug = await page.evaluate('''() => {
                    const debug = {
                        allButtons: [],
                        allLinks: [],
                        sampleHTML: ''
                    };

                    // Collect all buttons
                    document.querySelectorAll('button').forEach(btn => {
                        const text = btn.innerText.trim().substring(0, 50);
                        if (text && text.length > 3) {
                            debug.allButtons.push({
                                text: text,
                                class: btn.className.substring(0, 80)
                            });
                        }
                    });

                    // Grab an HTML snippet of the main area
                    const main = document.querySelector('main') || document.body;
                    debug.sampleHTML = main.innerHTML.substring(0, 3000);

                    return debug;
                }''')
                print(f"Found {len(html_debug['allButtons'])} buttons:")
                for btn in html_debug['allButtons'][:20]:
                    print(f"  - {btn['text']} | class: {btn['class']}")
                print(f"\nHTML snippet:\n{html_debug['sampleHTML'][:1500]}")

            print(f"  Found {len(page_data['programs'])} programs on this page")

            # Print the programs found
            for prog in page_data['programs']:
                print(f"  - {prog['name']} ({prog['degrees']})")

            # Add to the overall list (deduplicated)
            for prog in page_data['programs']:
                name = prog['name'].strip()
                if name and not any(p['name'] == name for p in all_programs):
                    all_programs.append({
                        'name': name,
                        'degrees': prog.get('degrees', ''),
                        'url': prog.get('url', ''),
                        'page': current_page
                    })

            # Try to click the next-page button
            try:
                clicked = False

                # On the first page, dump all pagination-related elements for debugging
                if current_page == 1:
                    # Save a screenshot for debugging
                    await page.screenshot(path="harvard_debug_pagination.png", full_page=True)
                    print("Debug screenshot saved: harvard_debug_pagination.png")

                    pagination_info = await page.evaluate('''() => {
                        const result = {
                            links: [],
                            buttons: [],
                            allClickable: [],
                            pageNumbers: [],
                            allText: []
                        };

                        // Find all links
                        document.querySelectorAll('a').forEach(a => {
                            const text = a.innerText.trim();
                            if (text.match(/^[0-9]+$|Next|page|Prev/i)) {
                                result.links.push({
                                    text: text.substring(0, 50),
                                    href: a.href,
                                    visible: a.offsetParent !== null,
                                    className: a.className
                                });
                            }
                        });

                        // Find all buttons
                        document.querySelectorAll('button').forEach(b => {
                            const text = b.innerText.trim();
                            if (text.match(/^[0-9]+$|Next|page|Prev/i) || text.length < 20) {
                                result.buttons.push({
                                    text: text.substring(0, 50),
                                    visible: b.offsetParent !== null,
                                    className: b.className
                                });
                            }
                        });

                        // Find all clickable elements containing a digit (possible pagination)
                        document.querySelectorAll('a, button, span[role="button"], div[role="button"], li a, nav a').forEach(el => {
                            const text = el.innerText.trim();
                            if (text.match(/^[0-9]$/) || text === 'Next page' || text.includes('Next')) {
                                result.pageNumbers.push({
                                    tag: el.tagName,
                                    text: text,
                                    className: el.className,
                                    id: el.id,
                                    ariaLabel: el.getAttribute('aria-label'),
                                    visible: el.offsetParent !== null
                                });
                            }
                        });

                        // Find all clickable elements in the lower part of the page
                        const bodyRect = document.body.getBoundingClientRect();
                        document.querySelectorAll('*').forEach(el => {
                            const rect = el.getBoundingClientRect();
                            const text = el.innerText?.trim() || '';
                            // Only look at elements in the lower half of the page with short text
                            if (rect.top > bodyRect.height * 0.5 && text.length > 0 && text.length < 30) {
                                const style = window.getComputedStyle(el);
                                if (style.cursor === 'pointer' || el.tagName === 'A' || el.tagName === 'BUTTON') {
                                    result.allClickable.push({
                                        tag: el.tagName,
                                        text: text.substring(0, 30),
                                        top: Math.round(rect.top),
                                        className: el.className?.substring?.(0, 50) || ''
                                    });
                                }
                            }
                        });

                        // Dump page text at the bottom of the page for debugging
                        const bodyText = document.body.innerText;
                        const lines = bodyText.split('\\n').filter(l => l.trim());
                        // Find lines containing the digits 1-9
                        for (let i = 0; i < lines.length; i++) {
                            if (lines[i].match(/^[1-9]$|Next page|Previous/)) {
                                result.allText.push(lines[i]);
                            }
                        }

                        return result;
                    }''')
                    print(f"\nPagination related links ({len(pagination_info['links'])}):")
                    for link in pagination_info['links']:
                        print(f"  a: '{link['text']}' class='{link.get('className', '')}' (visible: {link['visible']})")
                    print(f"\nPagination related buttons ({len(pagination_info['buttons'])}):")
                    for btn in pagination_info['buttons']:
                        print(f"  button: '{btn['text']}' class='{btn.get('className', '')}' (visible: {btn['visible']})")
                    print(f"\nPage number elements ({len(pagination_info['pageNumbers'])}):")
                    for pn in pagination_info['pageNumbers']:
                        print(f"  {pn['tag']}: '{pn['text']}' aria-label='{pn.get('ariaLabel')}' visible={pn['visible']}")
                    print(f"\nClickable elements in the lower half of the page ({len(pagination_info['allClickable'])}):")
                    for el in pagination_info['allClickable'][:30]:
                        print(f"  {el['tag']}: '{el['text']}' (top: {el['top']})")
                    print(f"\nPagination text on the page ({len(pagination_info['allText'])}):")
                    for txt in pagination_info['allText'][:20]:
                        print(f"  '{txt}'")

                # Method 1: find the "Next page" button directly with a CSS selector (most reliable)
                # Debug output showed the pagination button is <button class="c-pagination__link c-pagination__link--next">
                next_page_num = str(current_page + 1)

                try:
                    next_btn = page.locator('button.c-pagination__link--next')
                    if await next_btn.count() > 0:
                        print(f"\nFound 'Next page' button (CSS selector), clicking...")
                        await next_btn.first.scroll_into_view_if_needed()
                        await next_btn.first.click()
                        await page.wait_for_timeout(3000)
                        current_page += 1
                        clicked = True
                except Exception as e:
                    print(f"Method 1 failed: {e}")

                if clicked:
                    continue

                # Method 2: find the button via get_by_role
                try:
                    next_btn = page.get_by_role("button", name="Next page")
                    if await next_btn.count() > 0:
                        print(f"\nFound 'Next page' button via role, clicking...")
                        await next_btn.first.scroll_into_view_if_needed()
                        await next_btn.first.click()
                        await page.wait_for_timeout(3000)
                        current_page += 1
                        clicked = True
                except Exception as e:
                    print(f"Method 2 failed: {e}")

                if clicked:
                    continue

                # Method 3: iterate over all pagination buttons and click "Next page"
                try:
                    pagination_buttons = await page.query_selector_all('button.c-pagination__link')
                    for btn in pagination_buttons:
                        text = await btn.inner_text()
                        if 'Next page' in text:
                            print(f"\nFound 'Next page' while iterating pagination buttons, clicking...")
                            await btn.scroll_into_view_if_needed()
                            await btn.click()
                            await page.wait_for_timeout(3000)
                            current_page += 1
                            clicked = True
                            break
                except Exception as e:
                    print(f"Method 3 failed: {e}")

                if clicked:
                    continue

                # Method 4: click the pagination button directly via JavaScript
                try:
                    js_clicked = await page.evaluate('''() => {
                        // Find the Next page button
                        const nextBtn = document.querySelector('button.c-pagination__link--next');
                        if (nextBtn) {
                            nextBtn.click();
                            return true;
                        }
                        // Fallback: check every pagination button
                        const buttons = document.querySelectorAll('button.c-pagination__link');
                        for (const btn of buttons) {
                            if (btn.innerText.includes('Next page')) {
                                btn.click();
                                return true;
                            }
                        }
                        return false;
                    }''')
                    if js_clicked:
                        print(f"\nClicked 'Next page' via JavaScript")
                        await page.wait_for_timeout(3000)
                        current_page += 1
                        clicked = True
                except Exception as e:
                    print(f"Method 4 failed: {e}")

                if clicked:
                    continue

                # Method 5: iterate over every button on the page
                try:
                    all_buttons = await page.query_selector_all('button')
                    for btn in all_buttons:
                        try:
                            text = await btn.inner_text()
                            if 'Next page' in text:
                                visible = await btn.is_visible()
                                if visible:
                                    print(f"\nFound 'Next page' while iterating all buttons, clicking...")
                                    await btn.scroll_into_view_if_needed()
                                    await btn.click()
                                    await page.wait_for_timeout(3000)
                                    current_page += 1
                                    clicked = True
                                    break
                        except:
                            continue
                except Exception as e:
                    print(f"Method 5 failed: {e}")

                if clicked:
                    continue

                print("No next-page button found, stopping")
                break

            except Exception as e:
                print(f"Error while clicking the next page: {e}")
                break

        # Build program URLs - Harvard program URLs have the form:
        # https://www.harvard.edu/programs/{program-name-slug}/
        # e.g. african-and-african-american-studies

        import re

        def name_to_slug(name):
            """Convert a program name into a URL slug"""
            # Lowercase
            slug = name.lower()
            # Strip special characters
            slug = re.sub(r'[^\w\s-]', '', slug)
            # Replace whitespace with hyphens
            slug = re.sub(r'[\s_]+', '-', slug)
            # Collapse repeated hyphens
            slug = re.sub(r'-+', '-', slug)
            # Trim leading/trailing hyphens
            slug = slug.strip('-')
            return slug

        print("\nBuilding program URLs...")
        for prog in all_programs:
            slug = name_to_slug(prog['name'])
            prog['url'] = f"https://www.harvard.edu/programs/{slug}/"
            print(f"  {prog['name']} -> {prog['url']}")

        await browser.close()

    # Sort
    programs = sorted(all_programs, key=lambda x: x['name'])

    # Save
    result = {
        'source_url': base_url,
        'scraped_at': datetime.now(timezone.utc).isoformat(),
        'total_pages_scraped': current_page,
        'total_programs': len(programs),
        'programs': programs
    }

    output_file = Path('harvard_programs_results.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n{'='*60}")
    print(f"Scrape finished!")
    print(f"Scraped {current_page} pages")
    print(f"Found {len(programs)} graduate programs")
    print(f"Results saved to: {output_file}")
    print(f"{'='*60}")

    # Print the full list
    print("\nFull list of graduate programs:")
    for i, prog in enumerate(programs, 1):
        print(f"{i:3}. {prog['name']} - {prog['degrees']}")

    return result


if __name__ == "__main__":
    asyncio.run(scrape_harvard_programs())
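A quick sanity check of the slug logic above, as a minimal sketch: the first expected value comes from the GSAS URL example quoted in the script's comments; the second case is purely illustrative.

    # Assumes name_to_slug from the script above is importable
    assert name_to_slug("African and African American Studies") == "african-and-african-american-studies"
    assert name_to_slug("Engineering & Applied Sciences") == "engineering-applied-sciences"  # hypothetical input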
artifacts/harvard_programs_with_faculty_scraper.py (new file, 356 lines)
@@ -0,0 +1,356 @@
#!/usr/bin/env python3
"""
Harvard Graduate Programs Scraper with Faculty Information
Scrapes every graduate program from https://www.harvard.edu/programs/?degree_levels=graduate
and collects the URLs of each program's faculty profile pages.
"""

import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright


def name_to_slug(name):
    """Convert a program name into a URL slug"""
    slug = name.lower()
    slug = re.sub(r'[^\w\s-]', '', slug)
    slug = re.sub(r'[\s_]+', '-', slug)
    slug = re.sub(r'-+', '-', slug)
    slug = slug.strip('-')
    return slug


async def extract_faculty_from_page(page):
    """Extract all faculty links from the current page"""
    faculty_list = await page.evaluate('''() => {
        const faculty = [];
        const seen = new Set();

        document.querySelectorAll('a[href]').forEach(a => {
            const href = a.href || '';
            const text = a.innerText.trim();
            const lowerHref = href.toLowerCase();
            const lowerText = text.toLowerCase();

            // Check whether the link is a personal profile page
            if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
                 lowerHref.includes('/profile/') || lowerHref.includes('/person/')) &&
                text.length > 3 && text.length < 100 &&
                !lowerText.includes('people') &&
                !lowerText.includes('faculty') &&
                !lowerText.includes('profile') &&
                !lowerText.includes('staff') &&
                !lowerHref.endsWith('/people/') &&
                !lowerHref.endsWith('/people') &&
                !lowerHref.endsWith('/faculty/') &&
                !lowerHref.endsWith('/faculty')) {

                if (!seen.has(href)) {
                    seen.add(href);
                    faculty.push({
                        name: text,
                        url: href
                    });
                }
            }
        });

        return faculty;
    }''')
    return faculty_list


async def get_faculty_from_gsas_page(page, gsas_url, program_name):
    """Get the Faculty link from a GSAS program page, then visit the department People page to collect the supervisor list"""
    faculty_list = []
    faculty_page_url = None

    try:
        print(f"  Visiting GSAS page: {gsas_url}")
        await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)

        # Strategy 1: look for a "See list of ... faculty" link
        faculty_link = await page.evaluate('''() => {
            const links = document.querySelectorAll('a[href]');
            for (const link of links) {
                const text = link.innerText.toLowerCase();
                const href = link.href;
                if (text.includes('faculty') && text.includes('see list')) {
                    return href;
                }
            }
            return null;
        }''')

        # Strategy 2: look for any link containing /people or /faculty
        if not faculty_link:
            faculty_link = await page.evaluate('''() => {
                const links = document.querySelectorAll('a[href]');
                for (const link of links) {
                    const text = link.innerText.toLowerCase();
                    const href = link.href.toLowerCase();
                    // Look for Faculty related links
                    if ((text.includes('faculty') || text.includes('people')) &&
                        (href.includes('/people') || href.includes('/faculty'))) {
                        return link.href;
                    }
                }
                return null;
            }''')

        # Strategy 3: find the department website link on the page, then try its People page
        if not faculty_link:
            dept_website = await page.evaluate('''() => {
                const links = document.querySelectorAll('a[href]');
                for (const link of links) {
                    const text = link.innerText.toLowerCase();
                    const href = link.href;
                    // Look for a Website link (usually points at the department homepage)
                    if (text.includes('website') && href.includes('harvard.edu') &&
                        !href.includes('gsas.harvard.edu')) {
                        return href;
                    }
                }
                return null;
            }''')

            if dept_website:
                print(f"  Found department website: {dept_website}")
                try:
                    await page.goto(dept_website, wait_until="domcontentloaded", timeout=30000)
                    await page.wait_for_timeout(2000)

                    # Look for a People/Faculty link on the department website
                    faculty_link = await page.evaluate('''() => {
                        const links = document.querySelectorAll('a[href]');
                        for (const link of links) {
                            const text = link.innerText.toLowerCase().trim();
                            const href = link.href;
                            if ((text === 'people' || text === 'faculty' ||
                                 text === 'faculty & research' || text.includes('our faculty')) &&
                                (href.includes('/people') || href.includes('/faculty'))) {
                                return href;
                            }
                        }
                        return null;
                    }''')
                except Exception as e:
                    print(f"  Failed to visit department website: {e}")

        if faculty_link:
            faculty_page_url = faculty_link
            print(f"  Found Faculty page: {faculty_link}")

            # Visit the Faculty/People page
            await page.goto(faculty_link, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(2000)

            # Extract all supervisor information
            faculty_list = await extract_faculty_from_page(page)

            # If nothing was found on the first pass, handle pagination or other layouts
            if len(faculty_list) == 0:
                # Some pages may need extra clicks or JavaScript-driven loading
                await page.wait_for_timeout(2000)
                faculty_list = await extract_faculty_from_page(page)

            print(f"  Found {len(faculty_list)} supervisors")
        else:
            print(f"  No Faculty page link found")

    except Exception as e:
        print(f"  Failed to get faculty information: {e}")

    return faculty_list, faculty_page_url


async def scrape_harvard_programs_with_faculty():
    """Scrape the Harvard graduate program list together with supervisor information"""

    all_programs = []
    base_url = "https://www.harvard.edu/programs/?degree_levels=graduate"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={'width': 1920, 'height': 1080}
        )
        page = await context.new_page()

        print(f"Visiting: {base_url}")
        await page.goto(base_url, wait_until="domcontentloaded", timeout=60000)
        await page.wait_for_timeout(5000)

        # Scroll to the bottom of the page
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(2000)

        current_page = 1
        max_pages = 15

        # Phase 1: collect basic information for every program
        print("\n========== Phase 1: collecting the program list ==========")
        while current_page <= max_pages:
            print(f"\n--- Page {current_page} ---")
            await page.wait_for_timeout(2000)

            # Extract the programs on the current page
            page_data = await page.evaluate('''() => {
                const programs = [];
                const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]');

                programItems.forEach((item, index) => {
                    const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]');
                    if (!nameBtn) return;

                    const name = nameBtn.innerText.trim();
                    if (!name || name.length < 3) return;

                    let degrees = '';
                    const allText = item.innerText;
                    const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g);
                    if (degreeMatch) {
                        degrees = degreeMatch.join(', ');
                    }

                    programs.push({
                        name: name,
                        degrees: degrees
                    });
                });

                if (programs.length === 0) {
                    const buttons = document.querySelectorAll('button');
                    buttons.forEach((btn) => {
                        const className = btn.className || '';
                        if (className.includes('c-programs-item') || className.includes('title-link')) {
                            const name = btn.innerText.trim();
                            if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) {
                                programs.push({
                                    name: name,
                                    degrees: ''
                                });
                            }
                        }
                    });
                }

                return programs;
            }''')

            print(f"  Found {len(page_data)} programs on this page")

            for prog in page_data:
                name = prog['name'].strip()
                if name and not any(p['name'] == name for p in all_programs):
                    all_programs.append({
                        'name': name,
                        'degrees': prog.get('degrees', ''),
                        'page': current_page
                    })

            # Try to click the next page
            try:
                next_btn = page.locator('button.c-pagination__link--next')
                if await next_btn.count() > 0:
                    await next_btn.first.scroll_into_view_if_needed()
                    await next_btn.first.click()
                    await page.wait_for_timeout(3000)
                    current_page += 1
                else:
                    print("No next-page button, stopping collection")
                    break
            except Exception as e:
                print(f"Pagination failed: {e}")
                break

        print(f"\nCollected {len(all_programs)} programs in total")

        # Phase 2: fetch supervisor information for every program
        print("\n========== Phase 2: collecting supervisor information ==========")
        print("Note: this visits the GSAS page of every program and may take a while...")

        for i, prog in enumerate(all_programs, 1):
            print(f"\n[{i}/{len(all_programs)}] {prog['name']}")

            # Build the program URL
            slug = name_to_slug(prog['name'])
            prog['url'] = f"https://www.harvard.edu/programs/{slug}/"

            # Build the GSAS URL
            gsas_url = f"https://gsas.harvard.edu/program/{slug}"

            # Fetch supervisor information
            faculty_list, faculty_page_url = await get_faculty_from_gsas_page(page, gsas_url, prog['name'])

            prog['faculty_page_url'] = faculty_page_url or ""
            prog['faculty'] = faculty_list
            prog['faculty_count'] = len(faculty_list)

            # Save progress every 10 programs
            if i % 10 == 0:
                temp_result = {
                    'source_url': base_url,
                    'scraped_at': datetime.now(timezone.utc).isoformat(),
                    'progress': f"{i}/{len(all_programs)}",
                    'programs': all_programs[:i]
                }
                with open('harvard_programs_progress.json', 'w', encoding='utf-8') as f:
                    json.dump(temp_result, f, ensure_ascii=False, indent=2)
                print(f"  [progress saved]")

            # Avoid hammering the server
            await page.wait_for_timeout(1500)

        await browser.close()

    # Sort
    programs = sorted(all_programs, key=lambda x: x['name'])

    # Statistics
    total_faculty = sum(p['faculty_count'] for p in programs)
    programs_with_faculty = sum(1 for p in programs if p['faculty_count'] > 0)

    # Save the final result
    result = {
        'source_url': base_url,
        'scraped_at': datetime.now(timezone.utc).isoformat(),
        'total_pages_scraped': current_page,
        'total_programs': len(programs),
        'programs_with_faculty': programs_with_faculty,
        'total_faculty_found': total_faculty,
        'programs': programs
    }

    output_file = Path('harvard_programs_with_faculty.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n{'='*60}")
    print(f"Scrape finished!")
    print(f"Scraped {current_page} pages")
    print(f"Found {len(programs)} graduate programs")
    print(f"{programs_with_faculty} of them have supervisor information")
    print(f"Found {total_faculty} supervisors in total")
    print(f"Results saved to: {output_file}")
    print(f"{'='*60}")

    # Print a summary
    print("\nProgram summary (first 30):")
    for i, prog in enumerate(programs[:30], 1):
        faculty_info = f"({prog['faculty_count']} supervisors)" if prog['faculty_count'] > 0 else "(no supervisor info)"
        print(f"{i:3}. {prog['name']} {faculty_info}")

    if len(programs) > 30:
        print(f"... and {len(programs) - 30} more programs")

    return result


if __name__ == "__main__":
    asyncio.run(scrape_harvard_programs_with_faculty())
artifacts/manchester_complete_scraper.py (new file, 910 lines)
@ -0,0 +1,910 @@
|
||||
"""
|
||||
曼彻斯特大学完整采集脚本
|
||||
新增特性:
|
||||
- Research Explorer API 优先拉取 JSON / XML,失败再回落 DOM
|
||||
- 每个学院独立页面、并行抓取(默认 3 并发)
|
||||
- 细粒度超时/重试/滚动/Load more 控制
|
||||
- 多 URL / 备用 Staff 页面配置
|
||||
- 导师目录缓存,可按学院关键词映射到项目
|
||||
- 诊断信息记录(失败学院、超时学院、批次信息)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from urllib.parse import urlencode, urljoin
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
from playwright.async_api import (
|
||||
TimeoutError as PlaywrightTimeoutError,
|
||||
async_playwright,
|
||||
)
|
||||
|
||||
# =========================
|
||||
# 配置区
|
||||
# =========================
|
||||
|
||||
DEFAULT_REQUEST = {
|
||||
"timeout_ms": 60000,
|
||||
"post_wait_ms": 2500,
|
||||
"wait_until": "domcontentloaded",
|
||||
"max_retries": 3,
|
||||
"retry_backoff_ms": 2000,
|
||||
}
|
||||
|
||||
STAFF_CONCURRENCY = 3
|
||||
|
||||
SCHOOL_CONFIG: List[Dict[str, Any]] = [
|
||||
{
|
||||
"name": "Alliance Manchester Business School",
|
||||
"keywords": [
|
||||
"accounting",
|
||||
"finance",
|
||||
"business",
|
||||
"management",
|
||||
"marketing",
|
||||
"mba",
|
||||
"economics",
|
||||
"entrepreneurship",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
|
||||
"extract_method": "table",
|
||||
"request": {"timeout_ms": 60000, "wait_until": "networkidle"},
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Department of Computer Science",
|
||||
"keywords": [
|
||||
"computer",
|
||||
"software",
|
||||
"data science",
|
||||
"artificial intelligence",
|
||||
"ai ",
|
||||
"machine learning",
|
||||
"cyber",
|
||||
"computing",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://www.cs.manchester.ac.uk/about/people/academic-and-research-staff/",
|
||||
"extract_method": "links",
|
||||
"requires_scroll": True,
|
||||
},
|
||||
{
|
||||
"url": "https://www.cs.manchester.ac.uk/about/people/",
|
||||
"extract_method": "links",
|
||||
"load_more_selector": "button.load-more",
|
||||
"max_load_more": 6,
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Department of Physics and Astronomy",
|
||||
"keywords": [
|
||||
"physics",
|
||||
"astronomy",
|
||||
"astrophysics",
|
||||
"nuclear",
|
||||
"particle",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://www.physics.manchester.ac.uk/about/people/academic-and-research-staff/",
|
||||
"extract_method": "links",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Department of Electrical and Electronic Engineering",
|
||||
"keywords": [
|
||||
"electrical",
|
||||
"electronic",
|
||||
"eee",
|
||||
"power systems",
|
||||
"microelectronics",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://www.eee.manchester.ac.uk/about/people/academic-and-research-staff/",
|
||||
"extract_method": "links",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Department of Chemistry",
|
||||
"keywords": ["chemistry", "chemical"],
|
||||
"attach_faculty_to_programs": True,
|
||||
"extract_method": "research_explorer",
|
||||
"research_explorer": {"page_size": 200},
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://research.manchester.ac.uk/en/organisations/department-of-chemistry/persons/",
|
||||
"extract_method": "research_explorer",
|
||||
"requires_scroll": True,
|
||||
"request": {
|
||||
"timeout_ms": 120000,
|
||||
"wait_until": "networkidle",
|
||||
"post_wait_ms": 5000,
|
||||
},
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Department of Mathematics",
|
||||
"keywords": [
|
||||
"mathematics",
|
||||
"mathematical",
|
||||
"applied math",
|
||||
"statistics",
|
||||
"actuarial",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"extract_method": "research_explorer",
|
||||
"research_explorer": {"page_size": 200},
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://research.manchester.ac.uk/en/organisations/department-of-mathematics/persons/",
|
||||
"extract_method": "research_explorer",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "School of Engineering",
|
||||
"keywords": [
|
||||
"engineering",
|
||||
"mechanical",
|
||||
"aerospace",
|
||||
"civil",
|
||||
"structural",
|
||||
"materials",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"extract_method": "research_explorer",
|
||||
"research_explorer": {"page_size": 400},
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://research.manchester.ac.uk/en/organisations/school-of-engineering/persons/",
|
||||
"extract_method": "research_explorer",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Faculty of Biology, Medicine and Health",
|
||||
"keywords": [
|
||||
"medicine",
|
||||
"medical",
|
||||
"health",
|
||||
"nursing",
|
||||
"pharmacy",
|
||||
"clinical",
|
||||
"dental",
|
||||
"optometry",
|
||||
"biology",
|
||||
"biomedical",
|
||||
"anatomical",
|
||||
"physiotherapy",
|
||||
"midwifery",
|
||||
"mental health",
|
||||
"psychology",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"extract_method": "research_explorer",
|
||||
"research_explorer": {"page_size": 400},
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://research.manchester.ac.uk/en/organisations/faculty-of-biology-medicine-and-health/persons/",
|
||||
"extract_method": "research_explorer",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "School of Social Sciences",
|
||||
"keywords": [
|
||||
"sociology",
|
||||
"politics",
|
||||
"international",
|
||||
"social",
|
||||
"criminology",
|
||||
"anthropology",
|
||||
"philosophy",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"extract_method": "research_explorer",
|
||||
"research_explorer": {"page_size": 200},
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://research.manchester.ac.uk/en/organisations/school-of-social-sciences/persons/",
|
||||
"extract_method": "research_explorer",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "School of Law",
|
||||
"keywords": ["law", "legal", "llm"],
|
||||
"attach_faculty_to_programs": True,
|
||||
"extract_method": "research_explorer",
|
||||
"research_explorer": {"page_size": 200},
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://research.manchester.ac.uk/en/organisations/school-of-law/persons/",
|
||||
"extract_method": "research_explorer",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "School of Arts, Languages and Cultures",
|
||||
"keywords": [
|
||||
"arts",
|
||||
"languages",
|
||||
"culture",
|
||||
"music",
|
||||
"drama",
|
||||
"theatre",
|
||||
"history",
|
||||
"linguistics",
|
||||
"literature",
|
||||
"translation",
|
||||
"classics",
|
||||
"archaeology",
|
||||
"religion",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"extract_method": "research_explorer",
|
||||
"research_explorer": {"page_size": 400},
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://research.manchester.ac.uk/en/organisations/school-of-arts-languages-and-cultures/persons/",
|
||||
"extract_method": "research_explorer",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "School of Environment, Education and Development",
|
||||
"keywords": [
|
||||
"environment",
|
||||
"education",
|
||||
"development",
|
||||
"planning",
|
||||
"architecture",
|
||||
"urban",
|
||||
"geography",
|
||||
"sustainability",
|
||||
],
|
||||
"attach_faculty_to_programs": True,
|
||||
"extract_method": "research_explorer",
|
||||
"research_explorer": {"page_size": 300},
|
||||
"staff_pages": [
|
||||
{
|
||||
"url": "https://research.manchester.ac.uk/en/organisations/school-of-environment-education-and-development/persons/",
|
||||
"extract_method": "research_explorer",
|
||||
"requires_scroll": True,
|
||||
}
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
SCHOOL_LOOKUP = {cfg["name"]: cfg for cfg in SCHOOL_CONFIG}
|
||||
|
||||
# =========================
|
||||
# JS 提取函数
|
||||
# =========================
|
||||
|
||||
JS_EXTRACT_TABLE_STAFF = """() => {
|
||||
const staff = [];
|
||||
const seen = new Set();
|
||||
|
||||
document.querySelectorAll('table tr').forEach(row => {
|
||||
const cells = row.querySelectorAll('td');
|
||||
if (cells.length >= 2) {
|
||||
const link = cells[1]?.querySelector('a[href]') || cells[0]?.querySelector('a[href]');
|
||||
const titleCell = cells[2] || cells[1];
|
||||
|
||||
if (link) {
|
||||
const name = link.innerText.trim();
|
||||
const url = link.href;
|
||||
const title = titleCell ? titleCell.innerText.trim() : '';
|
||||
|
||||
if (name.length > 2 && !name.toLowerCase().includes('skip') && !seen.has(url)) {
|
||||
seen.add(url);
|
||||
staff.push({
|
||||
name,
|
||||
url,
|
||||
title
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return staff;
|
||||
}"""
|
||||
|
||||
JS_EXTRACT_LINK_STAFF = """() => {
|
||||
const staff = [];
|
||||
const seen = new Set();
|
||||
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href;
|
||||
const text = a.innerText.trim();
|
||||
|
||||
if (seen.has(href)) return;
|
||||
if (text.length < 5 || text.length > 80) return;
|
||||
|
||||
const lowerText = text.toLowerCase();
|
||||
if (lowerText.includes('skip') ||
|
||||
lowerText.includes('staff') ||
|
||||
lowerText.includes('people') ||
|
||||
lowerText.includes('academic') ||
|
||||
lowerText.includes('research profiles')) return;
|
||||
|
||||
if (href.includes('/persons/') ||
|
||||
href.includes('/portal/en/researchers/') ||
|
||||
href.includes('/profile/') ||
|
||||
href.includes('/people/')) {
|
||||
seen.add(href);
|
||||
staff.push({
|
||||
name: text,
|
||||
url: href,
|
||||
title: ''
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return staff;
|
||||
}"""
|
||||
|
||||
JS_EXTRACT_RESEARCH_EXPLORER = """() => {
|
||||
const staff = [];
|
||||
const seen = new Set();
|
||||
|
||||
document.querySelectorAll('a.link.person').forEach(a => {
|
||||
const href = a.href;
|
||||
const text = a.innerText.trim();
|
||||
|
||||
if (!seen.has(href) && text.length > 3 && text.length < 80) {
|
||||
seen.add(href);
|
||||
staff.push({
|
||||
name: text,
|
||||
url: href,
|
||||
title: ''
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
if (staff.length === 0) {
|
||||
document.querySelectorAll('a[href*="/persons/"]').forEach(a => {
|
||||
const href = a.href;
|
||||
const text = a.innerText.trim();
|
||||
const lower = text.toLowerCase();
|
||||
|
||||
if (seen.has(href)) return;
|
||||
if (text.length < 3 || text.length > 80) return;
|
||||
if (lower.includes('person') || lower.includes('next') || lower.includes('previous')) return;
|
||||
|
||||
seen.add(href);
|
||||
staff.push({
|
||||
name: text,
|
||||
url: href,
|
||||
title: ''
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
return staff;
|
||||
}"""
|
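# Note: the extractor above targets Research Explorer's "a.link.person" cards first
# and only falls back to generic '/persons/' profile links (filtering out pagination
# text such as "next"/"previous") when no card-style links are present.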
||||
|
||||
JS_EXTRACT_PROGRAMS = """() => {
|
||||
const programs = [];
|
||||
const seen = new Set();
|
||||
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href;
|
||||
const text = a.innerText.trim().replace(/\\s+/g, ' ');
|
||||
|
||||
if (!href || seen.has(href)) return;
|
||||
if (text.length < 10 || text.length > 200) return;
|
||||
|
||||
const hrefLower = href.toLowerCase();
|
||||
const textLower = text.toLowerCase();
|
||||
|
||||
const isNav = textLower === 'courses' ||
|
||||
textLower === 'masters' ||
|
||||
textLower.includes('admission') ||
|
||||
textLower.includes('fees') ||
|
||||
textLower.includes('skip to') ||
|
||||
textLower.includes('search') ||
|
||||
textLower.includes('contact') ||
|
||||
hrefLower.includes('#');
|
||||
if (isNav) return;
|
||||
|
||||
const hasNumericId = /\\/\\d{5}\\//.test(href);
|
||||
const isCoursePage = hrefLower.includes('/courses/list/') && hasNumericId;
|
||||
|
||||
if (isCoursePage) {
|
||||
seen.add(href);
|
||||
programs.push({
|
||||
name: text,
|
||||
url: href
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return programs;
|
||||
}"""
|
||||
|
||||
|
||||
# =========================
|
||||
# Data matching
|
||||
# =========================
|
||||
|
||||
def match_program_to_school(program_name: str) -> str:
|
||||
lower = program_name.lower()
|
||||
for school in SCHOOL_CONFIG:
|
||||
for keyword in school["keywords"]:
|
||||
if keyword in lower:
|
||||
return school["name"]
|
||||
return "Other Programs"
|
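# Illustrative behaviour (not part of the scraper): matching is a plain substring
# test over the keywords above, checked in SCHOOL_CONFIG order, so assuming no
# earlier school lists an overlapping keyword:
#   match_program_to_school("MSc Midwifery")      -> "Faculty of Biology, Medicine and Health"
#   match_program_to_school("MSc Basket Weaving") -> "Other Programs"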
||||
|
||||
|
||||
# =========================
|
||||
# Request and parsing helpers
|
||||
# =========================
|
||||
|
||||
def _merge_request_settings(*layers: Optional[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
settings = dict(DEFAULT_REQUEST)
|
||||
for layer in layers:
|
||||
if not layer:
|
||||
continue
|
||||
for key, value in layer.items():
|
||||
if value is not None:
|
||||
settings[key] = value
|
||||
settings["max_retries"] = max(1, int(settings.get("max_retries", 1)))
|
||||
settings["retry_backoff_ms"] = settings.get("retry_backoff_ms", 2000)
|
||||
return settings
|
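# Merge sketch (derived from the helper above): later layers win, but only for
# non-None values, e.g.
#   _merge_request_settings({"timeout_ms": 40000}, {"timeout_ms": None, "post_wait_ms": 1500})
# keeps timeout_ms at 40000 and adds post_wait_ms=1500 on top of DEFAULT_REQUEST.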
||||
|
||||
|
||||
async def _goto_with_retry(page, url: str, settings: Dict[str, Any], label: str) -> Tuple[bool, Optional[str]]:
|
||||
last_error = None
|
||||
for attempt in range(settings["max_retries"]):
|
||||
try:
|
||||
await page.goto(url, wait_until=settings["wait_until"], timeout=settings["timeout_ms"])
|
||||
if settings.get("wait_for_selector"):
|
||||
await page.wait_for_selector(settings["wait_for_selector"], timeout=settings["timeout_ms"])
|
||||
if settings.get("post_wait_ms"):
|
||||
await page.wait_for_timeout(settings["post_wait_ms"])
|
||||
return True, None
|
||||
except PlaywrightTimeoutError as exc:
|
||||
last_error = f"Timeout: {exc}"
|
||||
except Exception as exc: # noqa: BLE001
|
||||
last_error = str(exc)
|
||||
|
||||
if attempt < settings["max_retries"] - 1:
|
||||
await page.wait_for_timeout(settings["retry_backoff_ms"] * (attempt + 1))
|
||||
|
||||
return False, last_error
|
||||
|
||||
|
||||
async def _perform_scroll(page, repetitions: int = 5, delay_ms: int = 800):
|
||||
repetitions = max(1, repetitions)
|
||||
for i in range(repetitions):
|
||||
await page.evaluate("(y) => window.scrollTo(0, y)", 2000 * (i + 1))
|
||||
await page.wait_for_timeout(delay_ms)
|
||||
|
||||
|
||||
async def _load_more(page, selector: str, max_clicks: int = 5, wait_ms: int = 1500):
|
||||
for _ in range(max_clicks):
|
||||
button = await page.query_selector(selector)
|
||||
if not button:
|
||||
break
|
||||
try:
|
||||
await button.click()
|
||||
await page.wait_for_timeout(wait_ms)
|
||||
except Exception:
|
||||
break
|
||||
|
||||
|
||||
def _deduplicate_staff(staff: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
seen = set()
|
||||
cleaned = []
|
||||
for item in staff:
|
||||
name = (item.get("name") or "").strip()
|
||||
if not name:
|
||||
continue
|
||||
url = (item.get("url") or "").strip()
|
||||
key = url or name.lower()
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
cleaned.append({"name": name, "url": url, "title": (item.get("title") or "").strip()})
|
||||
return cleaned
|
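# Dedup note: the key is the profile URL when present, otherwise the lower-cased
# name, so repeated entries for the same person collapse into one cleaned record.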
||||
|
||||
|
||||
def _append_query(url: str, params: Dict[str, Any]) -> str:
|
||||
delimiter = "&" if "?" in url else "?"
|
||||
return f"{url}{delimiter}{urlencode(params)}"
|
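# Example (assumed behaviour, derived from the helper above):
#   _append_query("https://example.test/persons/", {"format": "json"})
#     -> "https://example.test/persons/?format=json"
#   _append_query("https://example.test/persons/?page=2", {"limit": 100})
#     -> "https://example.test/persons/?page=2&limit=100"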
||||
|
||||
|
||||
def _guess_research_slug(staff_url: Optional[str]) -> Optional[str]:
|
||||
if not staff_url:
|
||||
return None
|
||||
parts = [p for p in staff_url.rstrip("/").split("/") if p]
# Staff pages end in ".../organisations/<org-slug>/persons/", so drop the
# trailing "persons" segment when guessing the organisation slug.
if parts and parts[-1] == "persons":
    parts = parts[:-1]
|
||||
return parts[-1] if parts else None
|
||||
|
||||
|
||||
def _parse_research_explorer_json(data: Any, base_url: str) -> List[Dict[str, str]]:
|
||||
items: List[Dict[str, Any]] = []
|
||||
if isinstance(data, list):
|
||||
items = data
|
||||
elif isinstance(data, dict):
|
||||
for key in ("results", "items", "persons", "data", "entities"):
|
||||
if isinstance(data.get(key), list):
|
||||
items = data[key]
|
||||
break
|
||||
if not items and isinstance(data.get("rows"), list):
|
||||
items = data["rows"]
|
||||
|
||||
staff = []
|
||||
for item in items:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
name = item.get("name") or item.get("title") or item.get("fullName")
|
||||
profile_url = item.get("url") or item.get("href") or item.get("link") or item.get("primaryURL")
|
||||
if not name:
|
||||
continue
|
||||
if profile_url:
|
||||
profile_url = urljoin(base_url, profile_url)
|
||||
staff.append(
|
||||
{
|
||||
"name": name.strip(),
|
||||
"url": (profile_url or "").strip(),
|
||||
"title": (item.get("jobTitle") or item.get("position") or "").strip(),
|
||||
}
|
||||
)
|
||||
return staff
|
||||
|
||||
|
||||
def _parse_research_explorer_xml(text: str, base_url: str) -> List[Dict[str, str]]:
|
||||
staff: List[Dict[str, str]] = []
|
||||
try:
|
||||
root = ET.fromstring(text)
|
||||
except ET.ParseError:
|
||||
return staff
|
||||
|
||||
for entry in root.findall(".//{http://www.w3.org/2005/Atom}entry"):
|
||||
title = entry.findtext("{http://www.w3.org/2005/Atom}title", default="")
|
||||
link = entry.find("{http://www.w3.org/2005/Atom}link")
|
||||
href = link.attrib.get("href") if link is not None else ""
|
||||
if title:
|
||||
staff.append(
|
||||
{
|
||||
"name": title.strip(),
|
||||
"url": urljoin(base_url, href) if href else "",
|
||||
"title": "",
|
||||
}
|
||||
)
|
||||
return staff
|
||||
|
||||
|
||||
async def fetch_research_explorer_api(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]:
|
||||
config = school_config.get("research_explorer") or {}
|
||||
if not config and school_config.get("extract_method") != "research_explorer":
|
||||
return []
|
||||
|
||||
base_staff_url = ""
|
||||
if school_config.get("staff_pages"):
|
||||
base_staff_url = school_config["staff_pages"][0].get("url", "")
|
||||
|
||||
page_size = config.get("page_size", 200)
|
||||
timeout_ms = config.get("timeout_ms", 70000)
|
||||
|
||||
candidates: List[str] = []
|
||||
slug = config.get("org_slug") or _guess_research_slug(base_staff_url)
|
||||
base_api = config.get("api_base", "https://research.manchester.ac.uk/ws/portalapi.aspx")
|
||||
|
||||
if config.get("api_url"):
|
||||
candidates.append(config["api_url"])
|
||||
|
||||
if slug:
|
||||
params = {
|
||||
"action": "search",
|
||||
"language": "en",
|
||||
"format": "json",
|
||||
"site": "default",
|
||||
"showall": "true",
|
||||
"pageSize": page_size,
|
||||
"organisations": slug,
|
||||
}
|
||||
candidates.append(f"{base_api}?{urlencode(params)}")
|
||||
|
||||
if base_staff_url:
|
||||
candidates.append(_append_query(base_staff_url, {"format": "json", "limit": page_size}))
|
||||
candidates.append(_append_query(base_staff_url, {"format": "xml", "limit": page_size}))
|
||||
|
||||
for url in candidates:
|
||||
try:
|
||||
resp = await context.request.get(url, timeout=timeout_ms)
|
||||
if resp.status != 200:
|
||||
continue
|
||||
ctype = resp.headers.get("content-type", "")
|
||||
if "json" in ctype:
|
||||
data = await resp.json()
|
||||
parsed = _parse_research_explorer_json(data, base_staff_url)
|
||||
else:
|
||||
text = await resp.text()
|
||||
parsed = _parse_research_explorer_xml(text, base_staff_url)
|
||||
parsed = _deduplicate_staff(parsed)
|
||||
if parsed:
|
||||
if output_callback:
|
||||
output_callback("info", f" {school_config['name']}: {len(parsed)} staff via API")
|
||||
return parsed
|
||||
except Exception as exc: # noqa: BLE001
|
||||
if output_callback:
|
||||
output_callback(
|
||||
"warning", f" {school_config['name']}: API fetch failed ({str(exc)[:60]})"
|
||||
)
|
||||
return []
|
||||
|
||||
|
||||
async def scrape_staff_via_browser(context, school_config: Dict[str, Any], output_callback) -> List[Dict[str, str]]:
|
||||
staff_collected: List[Dict[str, str]] = []
|
||||
staff_pages = school_config.get("staff_pages") or []
|
||||
if not staff_pages and school_config.get("staff_url"):
|
||||
staff_pages = [{"url": school_config["staff_url"], "extract_method": school_config.get("extract_method")}]
|
||||
|
||||
page = await context.new_page()
|
||||
blocked_types = school_config.get("blocked_resources", ["image", "font", "media"])
|
||||
if blocked_types:
|
||||
async def _route_handler(route):
|
||||
if route.request.resource_type in blocked_types:
|
||||
await route.abort()
|
||||
else:
|
||||
await route.continue_()
|
||||
|
||||
await page.route("**/*", _route_handler)
|
||||
|
||||
for page_cfg in staff_pages:
|
||||
target_url = page_cfg.get("url")
|
||||
if not target_url:
|
||||
continue
|
||||
|
||||
settings = _merge_request_settings(school_config.get("request"), page_cfg.get("request"))
|
||||
success, error = await _goto_with_retry(page, target_url, settings, school_config["name"])
|
||||
if not success:
|
||||
if output_callback:
|
||||
output_callback("warning", f" {school_config['name']}: failed to load {target_url} ({error})")
|
||||
continue
|
||||
|
||||
if page_cfg.get("requires_scroll"):
|
||||
await _perform_scroll(page, page_cfg.get("scroll_times", 6), page_cfg.get("scroll_delay_ms", 700))
|
||||
|
||||
if page_cfg.get("load_from_selector"):
|
||||
await _load_more(page, page_cfg["load_from_selector"], page_cfg.get("max_load_more", 5))
|
||||
elif page_cfg.get("load_more_selector"):
|
||||
await _load_more(page, page_cfg["load_more_selector"], page_cfg.get("max_load_more", 5))
|
||||
|
||||
method = page_cfg.get("extract_method") or school_config.get("extract_method") or "links"
|
||||
if method == "table":
|
||||
extracted = await page.evaluate(JS_EXTRACT_TABLE_STAFF)
|
||||
elif method == "research_explorer":
|
||||
extracted = await page.evaluate(JS_EXTRACT_RESEARCH_EXPLORER)
|
||||
else:
|
||||
extracted = await page.evaluate(JS_EXTRACT_LINK_STAFF)
|
||||
|
||||
staff_collected.extend(extracted)
|
||||
|
||||
await page.close()
|
||||
return _deduplicate_staff(staff_collected)
|
||||
|
||||
|
||||
# =========================
|
||||
# Scrape school staff concurrently
|
||||
# =========================
|
||||
|
||||
async def scrape_school_staff(context, school_config: Dict[str, Any], semaphore, output_callback):
|
||||
async with semaphore:
|
||||
staff_list: List[Dict[str, str]] = []
|
||||
status = "success"
|
||||
error: Optional[str] = None
|
||||
|
||||
try:
|
||||
if school_config.get("extract_method") == "research_explorer":
|
||||
staff_list = await fetch_research_explorer_api(context, school_config, output_callback)
|
||||
if not staff_list:
|
||||
staff_list = await scrape_staff_via_browser(context, school_config, output_callback)
|
||||
|
||||
if output_callback:
|
||||
output_callback("info", f" {school_config['name']}: total {len(staff_list)} staff")
|
||||
|
||||
except Exception as exc: # noqa: BLE001
|
||||
status = "error"
|
||||
error = str(exc)
|
||||
if output_callback:
|
||||
output_callback("error", f" {school_config['name']}: {error}")
|
||||
|
||||
return {
|
||||
"name": school_config["name"],
|
||||
"staff": staff_list,
|
||||
"status": status,
|
||||
"error": error,
|
||||
}
|
||||
|
||||
|
||||
async def scrape_all_school_staff(context, output_callback):
|
||||
semaphore = asyncio.Semaphore(STAFF_CONCURRENCY)
|
||||
tasks = [
|
||||
asyncio.create_task(scrape_school_staff(context, cfg, semaphore, output_callback))
|
||||
for cfg in SCHOOL_CONFIG
|
||||
]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
staff_map = {}
|
||||
diagnostics = {"failed": [], "success": [], "total": len(results)}
|
||||
for res in results:
|
||||
if res["staff"]:
|
||||
staff_map[res["name"]] = res["staff"]
|
||||
diagnostics["success"].append(res["name"])
|
||||
else:
|
||||
diagnostics["failed"].append(
|
||||
{
|
||||
"name": res["name"],
|
||||
"status": res["status"],
|
||||
"error": res.get("error"),
|
||||
}
|
||||
)
|
||||
return staff_map, diagnostics
|
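# Concurrency note: STAFF_CONCURRENCY caps how many school staff scrapes run at
# once; schools that return staff go into staff_map, the rest are recorded in
# diagnostics["failed"] with their status and error for later inspection.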
||||
|
||||
|
||||
# =========================
|
||||
# Main pipeline
|
||||
# =========================
|
||||
|
||||
async def scrape(output_callback=None):
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
context = await browser.new_context(
|
||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
)
|
||||
|
||||
base_url = "https://www.manchester.ac.uk/"
|
||||
result = {
|
||||
"name": "The University of Manchester",
|
||||
"url": base_url,
|
||||
"scraped_at": datetime.now(timezone.utc).isoformat(),
|
||||
"schools": [],
|
||||
"diagnostics": {},
|
||||
}
|
||||
|
||||
try:
|
||||
# Step 1: Masters program list
|
||||
if output_callback:
|
||||
output_callback("info", "Step 1: Scraping masters programs list...")
|
||||
|
||||
page = await context.new_page()
|
||||
courses_url = "https://www.manchester.ac.uk/study/masters/courses/list/"
|
||||
await page.goto(courses_url, wait_until="domcontentloaded", timeout=40000)
|
||||
await page.wait_for_timeout(3000)
|
||||
programs_data = await page.evaluate(JS_EXTRACT_PROGRAMS)
|
||||
await page.close()
|
||||
|
||||
if output_callback:
|
||||
output_callback("info", f"Found {len(programs_data)} masters programs")
|
||||
|
||||
# Step 2: Scrape school staff in parallel
|
||||
if output_callback:
|
||||
output_callback("info", "Step 2: Scraping faculty from staff pages (parallel)...")
|
||||
school_staff, diagnostics = await scrape_all_school_staff(context, output_callback)
|
||||
|
||||
# Step 3: Organize the data
|
||||
schools_dict: Dict[str, Dict[str, Any]] = {}
|
||||
for prog in programs_data:
|
||||
school_name = match_program_to_school(prog["name"])
|
||||
if school_name not in schools_dict:
|
||||
schools_dict[school_name] = {
|
||||
"name": school_name,
|
||||
"url": "",
|
||||
"programs": [],
|
||||
"faculty": school_staff.get(school_name, []),
|
||||
"faculty_source": "school_directory" if school_staff.get(school_name) else "",
|
||||
}
|
||||
|
||||
schools_dict[school_name]["programs"].append(
|
||||
{
|
||||
"name": prog["name"],
|
||||
"url": prog["url"],
|
||||
"faculty": [],
|
||||
}
|
||||
)
|
||||
|
||||
for cfg in SCHOOL_CONFIG:
|
||||
if cfg["name"] in schools_dict:
|
||||
first_page = (cfg.get("staff_pages") or [{}])[0]
|
||||
schools_dict[cfg["name"]]["url"] = first_page.get("url") or cfg.get("staff_url", "")
|
||||
|
||||
_attach_faculty_to_programs(schools_dict, school_staff)
|
||||
|
||||
result["schools"] = list(schools_dict.values())
|
||||
|
||||
total_programs = sum(len(s["programs"]) for s in result["schools"])
|
||||
total_faculty = sum(len(s.get("faculty", [])) for s in result["schools"])
|
||||
|
||||
result["diagnostics"] = {
|
||||
"total_programs": total_programs,
|
||||
"total_faculty_records": total_faculty,
|
||||
"school_staff_success": diagnostics.get("success", []),
|
||||
"school_staff_failed": diagnostics.get("failed", []),
|
||||
}
|
||||
|
||||
if output_callback:
|
||||
output_callback(
|
||||
"info",
|
||||
f"Done! {len(result['schools'])} schools, {total_programs} programs, {total_faculty} faculty",
|
||||
)
|
||||
|
||||
except Exception as exc: # noqa: BLE001
|
||||
if output_callback:
|
||||
output_callback("error", f"Scraping error: {str(exc)}")
|
||||
finally:
|
||||
await browser.close()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _attach_faculty_to_programs(schools_dict: Dict[str, Dict[str, Any]], staff_map: Dict[str, List[Dict[str, str]]]):
|
||||
for school_name, school_data in schools_dict.items():
|
||||
staff = staff_map.get(school_name, [])
|
||||
cfg = SCHOOL_LOOKUP.get(school_name, {})
|
||||
if not staff or not cfg.get("attach_faculty_to_programs"):
|
||||
continue
|
||||
|
||||
limit = cfg.get("faculty_per_program")
|
||||
for program in school_data["programs"]:
|
||||
sliced = deepcopy(staff[:limit] if limit else staff)
|
||||
program["faculty"] = sliced
|
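# Note: deepcopy gives each program its own copy of the staff slice, so editing
# one program's faculty list later cannot leak into sibling programs.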
||||
|
||||
|
||||
# =========================
|
||||
# CLI
|
||||
# =========================
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
if sys.platform == "win32":
|
||||
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
|
||||
|
||||
def print_callback(level, msg):
|
||||
print(f"[{level}] {msg}")
|
||||
|
||||
scrape_result = asyncio.run(scrape(output_callback=print_callback))
|
||||
|
||||
output_path = "output/manchester_complete_result.json"
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(scrape_result, f, ensure_ascii=False, indent=2)
|
||||
|
||||
print("\nResult saved to", output_path)
|
||||
print("\n=== Summary ===")
|
||||
for school in sorted(scrape_result["schools"], key=lambda s: -len(s.get("faculty", []))):
|
||||
print(
|
||||
f" {school['name']}: "
|
||||
f"{len(school['programs'])} programs, "
|
||||
f"{len(school.get('faculty', []))} faculty"
|
||||
)
|
||||
|
||||
229
artifacts/manchester_improved_scraper.py
Normal file
@ -0,0 +1,229 @@
|
||||
"""
|
||||
Dedicated scraper for The University of Manchester
|
||||
Improved version - extracts faculty information from school staff pages
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
|
||||
# Mapping of University of Manchester school staff pages
|
||||
# Program keyword -> school staff page URL
|
||||
SCHOOL_STAFF_MAPPING = {
|
||||
# Alliance Manchester Business School (AMBS)
|
||||
"accounting": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
|
||||
"finance": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
|
||||
"business": "https://www.alliancembs.manchester.ac.uk/about/our-people/",
|
||||
"management": "https://www.alliancembs.manchester.ac.uk/about/our-people/",
|
||||
"marketing": "https://www.alliancembs.manchester.ac.uk/research/management-sciences-and-marketing/",
|
||||
"mba": "https://www.alliancembs.manchester.ac.uk/about/our-people/",
|
||||
|
||||
# Additional schools can be added here...
|
||||
# "computer": "...",
|
||||
# "engineering": "...",
|
||||
}
|
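# Illustrative use of the mapping above: a programme named "MSc Accounting and
# Finance" contains the keyword "accounting", so it is routed to the AMBS
# Accounting & Finance staff page listed there.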
||||
|
||||
# Fallback school staff pages (used when no keyword matches)
|
||||
GENERAL_STAFF_PAGES = [
|
||||
"https://www.alliancembs.manchester.ac.uk/about/our-people/",
|
||||
]
|
||||
|
||||
|
||||
async def scrape(output_callback=None):
|
||||
"""执行爬取"""
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
context = await browser.new_context(
|
||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
)
|
||||
page = await context.new_page()
|
||||
|
||||
base_url = "https://www.manchester.ac.uk/"
|
||||
|
||||
result = {
|
||||
"name": "The University of Manchester",
|
||||
"url": base_url,
|
||||
"scraped_at": datetime.now(timezone.utc).isoformat(),
|
||||
"schools": []
|
||||
}
|
||||
|
||||
try:
|
||||
# Step 1: scrape the masters program list
|
||||
if output_callback:
|
||||
output_callback("info", "Step 1: Scraping masters programs list...")
|
||||
|
||||
courses_url = "https://www.manchester.ac.uk/study/masters/courses/list/"
|
||||
await page.goto(courses_url, wait_until="domcontentloaded", timeout=30000)
|
||||
await page.wait_for_timeout(3000)
|
||||
|
||||
# Extract all masters programs
|
||||
programs_data = await page.evaluate('''() => {
|
||||
const programs = [];
|
||||
const seen = new Set();
|
||||
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href;
|
||||
const text = a.innerText.trim().replace(/\\s+/g, ' ');
|
||||
|
||||
if (!href || seen.has(href)) return;
|
||||
if (text.length < 10 || text.length > 200) return;
|
||||
|
||||
const hrefLower = href.toLowerCase();
|
||||
const textLower = text.toLowerCase();
|
||||
|
||||
// Exclude navigation links
|
||||
if (textLower === 'courses' || textLower === 'masters' ||
|
||||
textLower.includes('admission') || textLower.includes('fees') ||
|
||||
textLower.includes('skip to') || textLower.includes('skip navigation') ||
|
||||
textLower === 'home' || textLower === 'search' ||
|
||||
textLower.includes('contact') || textLower.includes('footer') ||
|
||||
hrefLower.endsWith('/courses/') || hrefLower.endsWith('/masters/') ||
|
||||
hrefLower.includes('#')) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Check whether this is a course link - it must contain a course ID
|
||||
const hasNumericId = /\\/\\d{5}\\//.test(href); // 5-digit numeric ID
|
||||
const isCoursePage = hrefLower.includes('/courses/list/') &&
|
||||
hasNumericId;
|
||||
|
||||
if (isCoursePage) {
|
||||
seen.add(href);
|
||||
programs.push({
|
||||
name: text,
|
||||
url: href
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return programs;
|
||||
}''')
|
||||
|
||||
if output_callback:
|
||||
output_callback("info", f"Found {len(programs_data)} masters programs")
|
||||
|
||||
# Step 2: scrape faculty information from school staff pages
|
||||
if output_callback:
|
||||
output_callback("info", "Step 2: Scraping faculty from school staff pages...")
|
||||
|
||||
all_faculty = {} # school_url -> faculty list
|
||||
|
||||
# Scrape AMBS Accounting & Finance staff
|
||||
staff_url = "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/"
|
||||
if output_callback:
|
||||
output_callback("info", f"Scraping staff from: {staff_url}")
|
||||
|
||||
await page.goto(staff_url, wait_until="domcontentloaded", timeout=30000)
|
||||
await page.wait_for_timeout(3000)
|
||||
|
||||
# Extract staff members from the table
|
||||
faculty_data = await page.evaluate('''() => {
|
||||
const faculty = [];
|
||||
const rows = document.querySelectorAll('table tr');
|
||||
|
||||
rows.forEach(row => {
|
||||
const cells = row.querySelectorAll('td');
|
||||
if (cells.length >= 2) {
|
||||
const link = cells[1]?.querySelector('a[href]');
|
||||
const titleCell = cells[2];
|
||||
|
||||
if (link) {
|
||||
const name = link.innerText.trim();
|
||||
const url = link.href;
|
||||
const title = titleCell ? titleCell.innerText.trim() : '';
|
||||
|
||||
if (name.length > 2 && !name.toLowerCase().includes('skip')) {
|
||||
faculty.push({
|
||||
name: name,
|
||||
url: url,
|
||||
title: title
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return faculty;
|
||||
}''')
|
||||
|
||||
if output_callback:
|
||||
output_callback("info", f"Found {len(faculty_data)} faculty members from AMBS")
|
||||
|
||||
all_faculty["AMBS - Accounting and Finance"] = faculty_data
|
||||
|
||||
# Step 3: assemble the results
|
||||
# Assign programs to schools by keyword
|
||||
schools_data = {}
|
||||
|
||||
for prog in programs_data:
|
||||
prog_name_lower = prog['name'].lower()
|
||||
|
||||
# Determine which school the program belongs to
|
||||
school_name = "Other Programs"
|
||||
matched_faculty = []
|
||||
|
||||
for keyword, staff_url in SCHOOL_STAFF_MAPPING.items():
|
||||
if keyword in prog_name_lower:
|
||||
if "accounting" in keyword or "finance" in keyword:
|
||||
school_name = "Alliance Manchester Business School"
|
||||
matched_faculty = all_faculty.get("AMBS - Accounting and Finance", [])
|
||||
elif "business" in keyword or "management" in keyword or "mba" in keyword:
|
||||
school_name = "Alliance Manchester Business School"
|
||||
matched_faculty = all_faculty.get("AMBS - Accounting and Finance", [])
|
||||
break
|
||||
|
||||
if school_name not in schools_data:
|
||||
schools_data[school_name] = {
|
||||
"name": school_name,
|
||||
"url": "",
|
||||
"programs": [],
|
||||
"faculty": matched_faculty # 学院级别的导师
|
||||
}
|
||||
|
||||
schools_data[school_name]["programs"].append({
|
||||
"name": prog['name'],
|
||||
"url": prog['url'],
|
||||
"faculty": [] # 项目级别暂不填充
|
||||
})
|
||||
|
||||
result["schools"] = list(schools_data.values())
|
||||
|
||||
# Statistics
|
||||
total_programs = sum(len(s['programs']) for s in result['schools'])
|
||||
total_faculty = sum(len(s.get('faculty', [])) for s in result['schools'])
|
||||
|
||||
if output_callback:
|
||||
output_callback("info", f"Done! {len(result['schools'])} schools, {total_programs} programs, {total_faculty} faculty")
|
||||
|
||||
except Exception as e:
|
||||
if output_callback:
|
||||
output_callback("error", f"Scraping error: {str(e)}")
|
||||
|
||||
finally:
|
||||
await browser.close()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
if sys.platform == "win32":
|
||||
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
|
||||
|
||||
def print_callback(level, msg):
|
||||
print(f"[{level}] {msg}")
|
||||
|
||||
result = asyncio.run(scrape(output_callback=print_callback))
|
||||
|
||||
# Save the result
|
||||
with open("output/manchester_improved_result.json", "w", encoding="utf-8") as f:
|
||||
json.dump(result, f, indent=2, ensure_ascii=False)
|
||||
|
||||
print(f"\nResult saved to output/manchester_improved_result.json")
|
||||
print(f"Schools: {len(result['schools'])}")
|
||||
for school in result['schools']:
|
||||
print(f" - {school['name']}: {len(school['programs'])} programs, {len(school.get('faculty', []))} faculty")
|
||||
165
artifacts/test_faculty_scraper.py
Normal file
@ -0,0 +1,165 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test the faculty scraping logic - only 3 programs are tested
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
|
||||
def name_to_slug(name):
|
||||
"""将项目名称转换为URL slug"""
|
||||
slug = name.lower()
|
||||
slug = re.sub(r'[^\w\s-]', '', slug)
|
||||
slug = re.sub(r'[\s_]+', '-', slug)
|
||||
slug = re.sub(r'-+', '-', slug)
|
||||
slug = slug.strip('-')
|
||||
return slug
|
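# Slug sketch, mirroring the test programs below:
#   name_to_slug("African and African American Studies") -> "african-and-african-american-studies"
#   name_to_slug("Computer Science")                      -> "computer-science"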
||||
|
||||
|
||||
async def get_faculty_from_gsas_page(page, gsas_url):
|
||||
"""从GSAS项目页面获取Faculty链接,然后访问院系People页面获取导师列表"""
|
||||
faculty_list = []
|
||||
faculty_page_url = None
|
||||
|
||||
try:
|
||||
print(f" 访问GSAS页面: {gsas_url}")
|
||||
await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
|
||||
await page.wait_for_timeout(2000)
|
||||
|
||||
# Look for the Faculty section link
|
||||
faculty_link = await page.evaluate('''() => {
|
||||
const links = document.querySelectorAll('a[href]');
|
||||
for (const link of links) {
|
||||
const text = link.innerText.toLowerCase();
|
||||
const href = link.href;
|
||||
if (text.includes('faculty') && text.includes('see list')) {
|
||||
return href;
|
||||
}
|
||||
if (text.includes('faculty') && (href.includes('/people') || href.includes('/faculty'))) {
|
||||
return href;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}''')
|
||||
|
||||
if faculty_link:
|
||||
faculty_page_url = faculty_link
|
||||
print(f" 找到Faculty页面链接: {faculty_link}")
|
||||
|
||||
# Visit the Faculty/People page
|
||||
await page.goto(faculty_link, wait_until="domcontentloaded", timeout=30000)
|
||||
await page.wait_for_timeout(2000)
|
||||
|
||||
# Extract all faculty information
|
||||
faculty_list = await page.evaluate('''() => {
|
||||
const faculty = [];
|
||||
const seen = new Set();
|
||||
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href || '';
|
||||
const text = a.innerText.trim();
|
||||
const lowerHref = href.toLowerCase();
|
||||
|
||||
if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
|
||||
lowerHref.includes('/profile/')) &&
|
||||
text.length > 3 && text.length < 100 &&
|
||||
!text.toLowerCase().includes('people') &&
|
||||
!text.toLowerCase().includes('faculty') &&
|
||||
!lowerHref.endsWith('/people/') &&
|
||||
!lowerHref.endsWith('/faculty/')) {
|
||||
|
||||
if (!seen.has(href)) {
|
||||
seen.add(href);
|
||||
faculty.push({
|
||||
name: text,
|
||||
url: href
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return faculty;
|
||||
}''')
|
||||
|
||||
print(f" 找到 {len(faculty_list)} 位导师")
|
||||
for f in faculty_list[:5]:
|
||||
print(f" - {f['name']}: {f['url']}")
|
||||
if len(faculty_list) > 5:
|
||||
print(f" ... 还有 {len(faculty_list) - 5} 位")
|
||||
else:
|
||||
print(" 未找到Faculty页面链接")
|
||||
|
||||
except Exception as e:
|
||||
print(f" 获取Faculty信息失败: {e}")
|
||||
|
||||
return faculty_list, faculty_page_url
|
||||
|
||||
|
||||
async def test_faculty_scraper():
|
||||
"""测试导师爬取"""
|
||||
|
||||
# Test 3 programs
|
||||
test_programs = [
|
||||
"African and African American Studies",
|
||||
"Economics",
|
||||
"Computer Science"
|
||||
]
|
||||
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=False)
|
||||
context = await browser.new_context(
|
||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
||||
viewport={'width': 1920, 'height': 1080}
|
||||
)
|
||||
page = await context.new_page()
|
||||
|
||||
results = []
|
||||
|
||||
for i, name in enumerate(test_programs, 1):
|
||||
print(f"\n{'='*60}")
|
||||
print(f"[{i}/{len(test_programs)}] 测试: {name}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
slug = name_to_slug(name)
|
||||
program_url = f"https://www.harvard.edu/programs/{slug}/"
|
||||
gsas_url = f"https://gsas.harvard.edu/program/{slug}"
|
||||
|
||||
print(f"项目URL: {program_url}")
|
||||
print(f"GSAS URL: {gsas_url}")
|
||||
|
||||
faculty_list, faculty_page_url = await get_faculty_from_gsas_page(page, gsas_url)
|
||||
|
||||
results.append({
|
||||
'name': name,
|
||||
'url': program_url,
|
||||
'gsas_url': gsas_url,
|
||||
'faculty_page_url': faculty_page_url,
|
||||
'faculty': faculty_list,
|
||||
'faculty_count': len(faculty_list)
|
||||
})
|
||||
|
||||
await page.wait_for_timeout(1000)
|
||||
|
||||
await browser.close()
|
||||
|
||||
# Print results
|
||||
print(f"\n\n{'='*60}")
|
||||
print("测试结果汇总")
|
||||
print(f"{'='*60}")
|
||||
|
||||
for r in results:
|
||||
print(f"\n{r['name']}:")
|
||||
print(f" Faculty页面: {r['faculty_page_url'] or '未找到'}")
|
||||
print(f" 导师数量: {r['faculty_count']}")
|
||||
|
||||
# Save test results
|
||||
with open('test_faculty_results.json', 'w', encoding='utf-8') as f:
|
||||
json.dump(results, f, ensure_ascii=False, indent=2)
|
||||
print(f"\n测试结果已保存到: test_faculty_results.json")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_faculty_scraper())
|
||||
464
artifacts/test_manchester_scraper.py
Normal file
@ -0,0 +1,464 @@
|
||||
"""
|
||||
Test Manchester University scraper - improved faculty mapping
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
|
||||
MASTERS_PATHS = [
|
||||
"/study/masters/courses/list/",
|
||||
"/study/masters/courses/",
|
||||
"/postgraduate/taught/courses/",
|
||||
"/postgraduate/courses/list/",
|
||||
"/postgraduate/courses/",
|
||||
"/graduate/programs/",
|
||||
"/academics/graduate/programs/",
|
||||
"/programmes/masters/",
|
||||
"/masters/programmes/",
|
||||
"/admissions/graduate/programs/",
|
||||
]
|
||||
|
||||
ACCOUNTING_STAFF_URL = "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/"
|
||||
ACCOUNTING_STAFF_CACHE = None
|
||||
|
||||
|
||||
JS_CHECK_COURSES = r"""() => {
|
||||
const links = document.querySelectorAll('a[href]');
|
||||
let courseCount = 0;
|
||||
for (const a of links) {
|
||||
const href = a.href.toLowerCase();
|
||||
if (/\/\d{4,}\//.test(href) ||
|
||||
/\/(msc|ma|mba|mres|llm|med|meng)-/.test(href) ||
|
||||
/\/course\/[a-z]/.test(href)) {
|
||||
courseCount++;
|
||||
}
|
||||
}
|
||||
return courseCount;
|
||||
}"""
|
||||
|
||||
JS_FIND_LIST_URL = """() => {
|
||||
const links = document.querySelectorAll('a[href]');
|
||||
for (const a of links) {
|
||||
const text = a.innerText.toLowerCase();
|
||||
const href = a.href.toLowerCase();
|
||||
if ((text.includes('a-z') || text.includes('all course') ||
|
||||
text.includes('full list') || text.includes('browse all') ||
|
||||
href.includes('/list')) &&
|
||||
(href.includes('master') || href.includes('course') || href.includes('postgrad'))) {
|
||||
return a.href;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}"""
|
||||
|
||||
JS_FIND_COURSES_FROM_HOME = """() => {
|
||||
const links = document.querySelectorAll('a[href]');
|
||||
for (const a of links) {
|
||||
const href = a.href.toLowerCase();
|
||||
const text = a.innerText.toLowerCase();
|
||||
if ((href.includes('master') || href.includes('postgraduate') || href.includes('graduate')) &&
|
||||
(href.includes('course') || href.includes('program') || href.includes('degree'))) {
|
||||
return a.href;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}"""
|
||||
|
||||
JS_EXTRACT_PROGRAMS = r"""() => {
|
||||
const programs = [];
|
||||
const seen = new Set();
|
||||
const currentHost = window.location.hostname;
|
||||
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href;
|
||||
const text = a.innerText.trim().replace(/\s+/g, ' ');
|
||||
|
||||
if (!href || seen.has(href)) return;
|
||||
if (text.length < 5 || text.length > 200) return;
|
||||
if (href.includes('#') || href.includes('javascript:') || href.includes('mailto:')) return;
|
||||
|
||||
try {
|
||||
const linkHost = new URL(href).hostname;
|
||||
if (!linkHost.includes(currentHost.replace('www.', '')) &&
|
||||
!currentHost.includes(linkHost.replace('www.', ''))) return;
|
||||
} catch {
|
||||
return;
|
||||
}
|
||||
|
||||
const hrefLower = href.toLowerCase();
|
||||
const textLower = text.toLowerCase();
|
||||
|
||||
const isNavigation = textLower === 'courses' ||
|
||||
textLower === 'programmes' ||
|
||||
textLower === 'undergraduate' ||
|
||||
textLower === 'postgraduate' ||
|
||||
textLower === 'masters' ||
|
||||
textLower === "master's" ||
|
||||
textLower.includes('skip to') ||
|
||||
textLower.includes('share') ||
|
||||
textLower === 'home' ||
|
||||
textLower === 'study' ||
|
||||
textLower.startsWith('a-z') ||
|
||||
textLower.includes('admission') ||
|
||||
textLower.includes('fees and funding') ||
|
||||
textLower.includes('why should') ||
|
||||
textLower.includes('why manchester') ||
|
||||
textLower.includes('teaching and learning') ||
|
||||
textLower.includes('meet us') ||
|
||||
textLower.includes('student support') ||
|
||||
textLower.includes('contact us') ||
|
||||
textLower.includes('how to apply') ||
|
||||
hrefLower.includes('/admissions/') ||
|
||||
hrefLower.includes('/fees-and-funding/') ||
|
||||
hrefLower.includes('/why-') ||
|
||||
hrefLower.includes('/meet-us/') ||
|
||||
hrefLower.includes('/contact-us/') ||
|
||||
hrefLower.includes('/student-support/') ||
|
||||
hrefLower.includes('/teaching-and-learning/') ||
|
||||
hrefLower.endsWith('/courses/') ||
|
||||
hrefLower.endsWith('/masters/') ||
|
||||
hrefLower.endsWith('/postgraduate/');
|
||||
|
||||
if (isNavigation) return;
|
||||
|
||||
const isExcluded = hrefLower.includes('/undergraduate') ||
|
||||
hrefLower.includes('/bachelor') ||
|
||||
hrefLower.includes('/phd/') ||
|
||||
hrefLower.includes('/doctoral') ||
|
||||
hrefLower.includes('/research-degree') ||
|
||||
textLower.includes('bachelor') ||
|
||||
textLower.includes('undergraduate') ||
|
||||
(textLower.includes('phd') && !textLower.includes('mphil'));
|
||||
|
||||
if (isExcluded) return;
|
||||
|
||||
const hasNumericId = /\/\d{4,}\//.test(href);
|
||||
const hasDegreeSlug = /\/(msc|ma|mba|mres|llm|med|meng|mpa|mph|mphil)-[a-z]/.test(hrefLower);
|
||||
const isCoursePage = (hrefLower.includes('/course/') ||
|
||||
hrefLower.includes('/courses/list/') ||
|
||||
hrefLower.includes('/programme/')) &&
|
||||
href.split('/').filter(p => p).length > 4;
|
||||
const textHasDegree = /(msc|ma|mba|mres|llm|med|meng|pgcert|pgdip)/i.test(text) ||
|
||||
textLower.includes('master');
|
||||
|
||||
if (hasNumericId || hasDegreeSlug || isCoursePage || textHasDegree) {
|
||||
seen.add(href);
|
||||
programs.push({
|
||||
name: text,
|
||||
url: href
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return programs;
|
||||
}"""
|
||||
|
||||
JS_EXTRACT_FACULTY = r"""() => {
|
||||
const faculty = [];
|
||||
const seen = new Set();
|
||||
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href.toLowerCase();
|
||||
const text = a.innerText.trim();
|
||||
|
||||
if (seen.has(href)) return;
|
||||
if (text.length < 3 || text.length > 100) return;
|
||||
|
||||
const isStaff = href.includes('/people/') ||
|
||||
href.includes('/staff/') ||
|
||||
href.includes('/faculty/') ||
|
||||
href.includes('/profile/') ||
|
||||
href.includes('/academics/') ||
|
||||
href.includes('/researcher/');
|
||||
|
||||
if (isStaff) {
|
||||
seen.add(href);
|
||||
faculty.push({
|
||||
name: text.replace(/\s+/g, ' '),
|
||||
url: a.href
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return faculty.slice(0, 20);
|
||||
}"""
|
||||
|
||||
JS_EXTRACT_ACCOUNTING_STAFF = r"""() => {
|
||||
const rows = Array.from(document.querySelectorAll('table tbody tr'));
|
||||
const staff = [];
|
||||
|
||||
for (const row of rows) {
|
||||
const cells = row.querySelectorAll('td');
|
||||
if (!cells || cells.length < 2) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const nameCell = cells[1];
|
||||
const roleCell = cells[2];
|
||||
const emailCell = cells[5];
|
||||
|
||||
let profileUrl = '';
|
||||
let displayName = nameCell ? nameCell.innerText.trim() : '';
|
||||
const link = nameCell ? nameCell.querySelector('a[href]') : null;
|
||||
if (link) {
|
||||
profileUrl = link.href;
|
||||
displayName = link.innerText.trim() || displayName;
|
||||
}
|
||||
|
||||
if (!displayName) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let email = '';
|
||||
if (emailCell) {
|
||||
const emailLink = emailCell.querySelector('a[href^="mailto:"]');
|
||||
if (emailLink) {
|
||||
email = emailLink.href.replace('mailto:', '').trim();
|
||||
}
|
||||
}
|
||||
|
||||
staff.push({
|
||||
name: displayName,
|
||||
title: roleCell ? roleCell.innerText.trim() : '',
|
||||
url: profileUrl,
|
||||
email: email
|
||||
});
|
||||
}
|
||||
|
||||
return staff;
|
||||
}"""
|
||||
|
||||
|
||||
def should_use_accounting_staff(program_name: str) -> bool:
|
||||
lower_name = program_name.lower()
|
||||
return "msc" in lower_name and "accounting" in lower_name
|
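# Only programmes whose names contain both "msc" and "accounting" reuse the
# official AMBS staff directory (e.g. "MSc Accounting and Finance");
# "MSc Finance" on its own does not qualify.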
||||
|
||||
|
||||
async def load_accounting_staff(context, output_callback=None):
|
||||
global ACCOUNTING_STAFF_CACHE
|
||||
|
||||
if ACCOUNTING_STAFF_CACHE is not None:
|
||||
return ACCOUNTING_STAFF_CACHE
|
||||
|
||||
staff_page = await context.new_page()
|
||||
try:
|
||||
if output_callback:
|
||||
output_callback("info", "Loading official AMBS Accounting & Finance staff page...")
|
||||
|
||||
await staff_page.goto(ACCOUNTING_STAFF_URL, wait_until="domcontentloaded", timeout=30000)
|
||||
await staff_page.wait_for_timeout(2000)
|
||||
|
||||
ACCOUNTING_STAFF_CACHE = await staff_page.evaluate(JS_EXTRACT_ACCOUNTING_STAFF)
|
||||
|
||||
if output_callback:
|
||||
output_callback("info", f"Captured {len(ACCOUNTING_STAFF_CACHE)} faculty from the official staff page")
|
||||
|
||||
except Exception as exc:
|
||||
if output_callback:
|
||||
output_callback("error", f"Failed to load AMBS staff page: {exc}")
|
||||
ACCOUNTING_STAFF_CACHE = []
|
||||
finally:
|
||||
await staff_page.close()
|
||||
|
||||
return ACCOUNTING_STAFF_CACHE
|
||||
|
||||
|
||||
async def find_course_list_page(page, base_url, output_callback):
|
||||
for path in MASTERS_PATHS:
|
||||
test_url = base_url.rstrip('/') + path
|
||||
try:
|
||||
response = await page.goto(test_url, wait_until="domcontentloaded", timeout=15000)
|
||||
if response and response.status == 200:
|
||||
title = await page.title()
|
||||
if '404' not in title.lower() and 'not found' not in title.lower():
|
||||
has_courses = await page.evaluate(JS_CHECK_COURSES)
|
||||
if has_courses > 5:
|
||||
if output_callback:
|
||||
output_callback("info", f"Found course list: {path} ({has_courses} courses)")
|
||||
return test_url
|
||||
|
||||
list_url = await page.evaluate(JS_FIND_LIST_URL)
|
||||
if list_url:
|
||||
if output_callback:
|
||||
output_callback("info", f"Found full course list: {list_url}")
|
||||
return list_url
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
try:
|
||||
await page.goto(base_url, wait_until="domcontentloaded", timeout=30000)
|
||||
await page.wait_for_timeout(2000)
|
||||
courses_url = await page.evaluate(JS_FIND_COURSES_FROM_HOME)
|
||||
if courses_url:
|
||||
return courses_url
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
async def extract_course_links(page, output_callback):
|
||||
return await page.evaluate(JS_EXTRACT_PROGRAMS)
|
||||
|
||||
|
||||
async def scrape(output_callback=None):
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
context = await browser.new_context(
|
||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
)
|
||||
page = await context.new_page()
|
||||
|
||||
base_url = "https://www.manchester.ac.uk/"
|
||||
|
||||
result = {
|
||||
"name": "Manchester University",
|
||||
"url": base_url,
|
||||
"scraped_at": datetime.now(timezone.utc).isoformat(),
|
||||
"schools": []
|
||||
}
|
||||
|
||||
all_programs = []
|
||||
|
||||
try:
|
||||
if output_callback:
|
||||
output_callback("info", "Searching for masters course list...")
|
||||
|
||||
courses_url = await find_course_list_page(page, base_url, output_callback)
|
||||
|
||||
if not courses_url:
|
||||
if output_callback:
|
||||
output_callback("warning", "Course list not found, using homepage")
|
||||
courses_url = base_url
|
||||
|
||||
if output_callback:
|
||||
output_callback("info", "Extracting masters programs...")
|
||||
|
||||
await page.goto(courses_url, wait_until="domcontentloaded", timeout=30000)
|
||||
await page.wait_for_timeout(3000)
|
||||
|
||||
for _ in range(3):
|
||||
try:
|
||||
load_more = page.locator('button:has-text("Load more"), button:has-text("Show more"), button:has-text("View more"), a:has-text("Load more")')
|
||||
if await load_more.count() > 0:
|
||||
await load_more.first.click()
|
||||
await page.wait_for_timeout(2000)
|
||||
else:
|
||||
break
|
||||
except Exception:
|
||||
break
|
||||
|
||||
programs_data = await extract_course_links(page, output_callback)
|
||||
|
||||
if output_callback:
|
||||
output_callback("info", f"Found {len(programs_data)} masters programs")
|
||||
|
||||
print("\nTop 20 programs:")
|
||||
for i, prog in enumerate(programs_data[:20]):
|
||||
print(f" {i+1}. {prog['name'][:60]}")
|
||||
print(f" {prog['url']}")
|
||||
|
||||
max_detail_pages = min(len(programs_data), 30)
|
||||
detailed_processed = 0
|
||||
logged_official_staff = False
|
||||
|
||||
for prog in programs_data:
|
||||
faculty_data = []
|
||||
used_official_staff = False
|
||||
|
||||
if should_use_accounting_staff(prog['name']):
|
||||
staff_list = await load_accounting_staff(context, output_callback)
|
||||
if staff_list:
|
||||
used_official_staff = True
|
||||
if output_callback and not logged_official_staff:
|
||||
output_callback("info", "Using Alliance MBS Accounting & Finance staff directory for accounting programmes")
|
||||
logged_official_staff = True
|
||||
faculty_data = [
|
||||
{
|
||||
"name": person.get("name"),
|
||||
"url": person.get("url") or ACCOUNTING_STAFF_URL,
|
||||
"title": person.get("title"),
|
||||
"email": person.get("email"),
|
||||
"source": "Alliance Manchester Business School - Accounting & Finance staff"
|
||||
}
|
||||
for person in staff_list
|
||||
]
|
||||
|
||||
elif detailed_processed < max_detail_pages:
|
||||
detailed_processed += 1
|
||||
if output_callback and detailed_processed % 10 == 0:
|
||||
output_callback("info", f"Processing {detailed_processed}/{max_detail_pages}: {prog['name'][:50]}")
|
||||
try:
|
||||
await page.goto(prog['url'], wait_until="domcontentloaded", timeout=15000)
|
||||
await page.wait_for_timeout(800)
|
||||
|
||||
faculty_data = await page.evaluate(JS_EXTRACT_FACULTY)
|
||||
except Exception as e:
|
||||
if output_callback:
|
||||
output_callback("warning", f"Failed to capture faculty for {prog['name'][:50]}: {e}")
|
||||
faculty_data = []
|
||||
|
||||
program_entry = {
|
||||
"name": prog['name'],
|
||||
"url": prog['url'],
|
||||
"faculty": faculty_data
|
||||
}
|
||||
|
||||
if used_official_staff:
|
||||
program_entry["faculty_page_override"] = ACCOUNTING_STAFF_URL
|
||||
|
||||
all_programs.append(program_entry)
|
||||
|
||||
result["schools"] = [{
|
||||
"name": "Masters Programs",
|
||||
"url": courses_url,
|
||||
"programs": all_programs
|
||||
}]
|
||||
|
||||
if output_callback:
|
||||
total_faculty = sum(len(p.get('faculty', [])) for p in all_programs)
|
||||
output_callback("info", f"Done! {len(all_programs)} programs, {total_faculty} faculty")
|
||||
|
||||
except Exception as e:
|
||||
if output_callback:
|
||||
output_callback("error", f"Scraping error: {str(e)}")
|
||||
|
||||
finally:
|
||||
await browser.close()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def log_callback(level, message):
|
||||
print(f"[{level.upper()}] {message}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
result = asyncio.run(scrape(output_callback=log_callback))
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Scrape summary:")
|
||||
print("="*60)
|
||||
|
||||
if result.get("schools"):
|
||||
school = result["schools"][0]
|
||||
programs = school.get("programs", [])
|
||||
print(f"Course list URL: {school.get('url')}")
|
||||
print(f"Total programs: {len(programs)}")
|
||||
|
||||
faculty_count = sum(len(p.get('faculty', [])) for p in programs)
|
||||
print(f"Faculty total: {faculty_count}")
|
||||
|
||||
print("\nTop 10 programs:")
|
||||
for i, p in enumerate(programs[:10]):
|
||||
print(f" {i+1}. {p['name'][:60]}")
|
||||
if p.get("faculty"):
|
||||
print(f" Faculty entries: {len(p['faculty'])}")
|
||||
|
||||
with open("manchester_test_result.json", "w", encoding="utf-8") as f:
|
||||
json.dump(result, f, indent=2, ensure_ascii=False)
|
||||
print("\nSaved results to manchester_test_result.json")
|
||||