Files
University-Playwright-Codeg…/artifacts/manchester_improved_scraper.py
yangxiaoyu-crypto 426cf4d2cd Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 15:25:08 +08:00

230 lines
9.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
曼彻斯特大学专用爬虫脚本
改进版 - 从学院Staff页面提取导师信息
"""
import asyncio
import json
import re
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse
from playwright.async_api import async_playwright
# University of Manchester school staff-page mapping:
# program-name keyword -> staff page URL of the owning school.
# Only Alliance Manchester Business School (AMBS) is covered so far;
# further schools (e.g. computer science, engineering) can be appended.
_AMBS_ROOT = "https://www.alliancembs.manchester.ac.uk"
_AMBS_ACC_FIN_STAFF = _AMBS_ROOT + "/research/accounting-and-finance/staff/"
_AMBS_OUR_PEOPLE = _AMBS_ROOT + "/about/our-people/"

SCHOOL_STAFF_MAPPING = {
    # Alliance Manchester Business School (AMBS)
    "accounting": _AMBS_ACC_FIN_STAFF,
    "finance": _AMBS_ACC_FIN_STAFF,
    "business": _AMBS_OUR_PEOPLE,
    "management": _AMBS_OUR_PEOPLE,
    "marketing": _AMBS_ROOT + "/research/management-sciences-and-marketing/",
    "mba": _AMBS_OUR_PEOPLE,
    # More schools can be added here...
    # "computer": "...",
    # "engineering": "...",
}

# Fallback staff pages, used when no keyword matches.
GENERAL_STAFF_PAGES = [
    _AMBS_OUR_PEOPLE,
]
async def _extract_programs(page):
    """Collect all masters programs from the central course-list page.

    Returns a list of ``{"name": ..., "url": ...}`` dicts.  Filtering is done
    in-page: navigation links are excluded and only URLs containing
    ``/courses/list/`` plus a 5-digit course id are kept.
    """
    courses_url = "https://www.manchester.ac.uk/study/masters/courses/list/"
    await page.goto(courses_url, wait_until="domcontentloaded", timeout=30000)
    await page.wait_for_timeout(3000)  # allow late-rendered links to appear
    return await page.evaluate('''() => {
        const programs = [];
        const seen = new Set();
        document.querySelectorAll('a[href]').forEach(a => {
            const href = a.href;
            const text = a.innerText.trim().replace(/\\s+/g, ' ');
            if (!href || seen.has(href)) return;
            if (text.length < 10 || text.length > 200) return;
            const hrefLower = href.toLowerCase();
            const textLower = text.toLowerCase();
            // Exclude navigation links
            if (textLower === 'courses' || textLower === 'masters' ||
                textLower.includes('admission') || textLower.includes('fees') ||
                textLower.includes('skip to') || textLower.includes('skip navigation') ||
                textLower === 'home' || textLower === 'search' ||
                textLower.includes('contact') || textLower.includes('footer') ||
                hrefLower.endsWith('/courses/') || hrefLower.endsWith('/masters/') ||
                hrefLower.includes('#')) {
                return;
            }
            // A course link must carry a course id - a 5-digit path segment
            const hasNumericId = /\\/\\d{5}\\//.test(href);
            const isCoursePage = hrefLower.includes('/courses/list/') && hasNumericId;
            if (isCoursePage) {
                seen.add(href);
                programs.push({
                    name: text,
                    url: href
                });
            }
        });
        return programs;
    }''')


async def _extract_staff(page, staff_url):
    """Scrape a table-based school staff page.

    Returns a list of ``{"name": ..., "url": ..., "title": ...}`` dicts,
    one per staff table row that carries a profile link.
    """
    await page.goto(staff_url, wait_until="domcontentloaded", timeout=30000)
    await page.wait_for_timeout(3000)
    return await page.evaluate('''() => {
        const faculty = [];
        const rows = document.querySelectorAll('table tr');
        rows.forEach(row => {
            const cells = row.querySelectorAll('td');
            if (cells.length >= 2) {
                const link = cells[1]?.querySelector('a[href]');
                const titleCell = cells[2];
                if (link) {
                    const name = link.innerText.trim();
                    const url = link.href;
                    const title = titleCell ? titleCell.innerText.trim() : '';
                    if (name.length > 2 && !name.toLowerCase().includes('skip')) {
                        faculty.push({
                            name: name,
                            url: url,
                            title: title
                        });
                    }
                }
            }
        });
        return faculty;
    }''')


def _build_schools(programs_data, all_faculty):
    """Group programs into schools by keyword and attach school-level faculty.

    A program whose lowercased name contains any ``SCHOOL_STAFF_MAPPING``
    keyword is assigned to Alliance Manchester Business School; everything
    else lands in "Other Programs".  Program-level faculty lists are left
    empty for now.
    """
    ambs_faculty = all_faculty.get("AMBS - Accounting and Finance", [])
    schools_data = {}
    for prog in programs_data:
        prog_name_lower = prog['name'].lower()
        school_name = "Other Programs"
        matched_faculty = []
        for keyword in SCHOOL_STAFF_MAPPING:
            if keyword in prog_name_lower:
                # Every mapped keyword currently belongs to AMBS.  (Bug fix:
                # "marketing" matched the mapping but was missing from the
                # old branch tests, so those programs fell into "Other
                # Programs" while still stopping the keyword search.)
                school_name = "Alliance Manchester Business School"
                matched_faculty = ambs_faculty
                break
        # Faculty is attached when the school entry is first created;
        # later programs of the same school reuse that entry unchanged.
        school = schools_data.setdefault(school_name, {
            "name": school_name,
            "url": "",
            "programs": [],
            "faculty": matched_faculty  # school-level supervisors
        })
        school["programs"].append({
            "name": prog['name'],
            "url": prog['url'],
            "faculty": []  # program-level faculty not populated yet
        })
    return list(schools_data.values())


async def scrape(output_callback=None):
    """Scrape University of Manchester masters programs and school faculty.

    Drives headless Chromium via Playwright to:
      1. collect all masters programs from the central course list,
      2. collect faculty from the AMBS Accounting & Finance staff page,
      3. group programs into schools by keyword and attach faculty.

    Args:
        output_callback: optional ``callable(level, message)`` for progress
            reporting; levels emitted here are ``"info"`` and ``"error"``.

    Returns:
        dict with keys ``name``, ``url``, ``scraped_at`` (UTC ISO timestamp)
        and ``schools``.  On a scraping error the partial result collected so
        far is returned and the error is reported via ``output_callback``.
    """
    def report(level, msg):
        # Progress reporting is best-effort; skipped when no callback given.
        if output_callback:
            output_callback(level, msg)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )
        page = await context.new_page()
        base_url = "https://www.manchester.ac.uk/"
        result = {
            "name": "The University of Manchester",
            "url": base_url,
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "schools": []
        }
        try:
            # Step 1: masters program list
            report("info", "Step 1: Scraping masters programs list...")
            programs_data = await _extract_programs(page)
            report("info", f"Found {len(programs_data)} masters programs")

            # Step 2: faculty from school staff pages (AMBS only for now)
            report("info", "Step 2: Scraping faculty from school staff pages...")
            all_faculty = {}  # school key -> faculty list
            staff_url = "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/"
            report("info", f"Scraping staff from: {staff_url}")
            faculty_data = await _extract_staff(page, staff_url)
            report("info", f"Found {len(faculty_data)} faculty members from AMBS")
            all_faculty["AMBS - Accounting and Finance"] = faculty_data

            # Step 3: assemble programs + faculty into the result structure
            result["schools"] = _build_schools(programs_data, all_faculty)
            total_programs = sum(len(s['programs']) for s in result['schools'])
            total_faculty = sum(len(s.get('faculty', [])) for s in result['schools'])
            report("info", f"Done! {len(result['schools'])} schools, {total_programs} programs, {total_faculty} faculty")
        except Exception as e:
            # Best-effort scraper: report the error and return partial data.
            report("error", f"Scraping error: {str(e)}")
        finally:
            await browser.close()
        return result
if __name__ == "__main__":
import sys
if sys.platform == "win32":
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
def print_callback(level, msg):
print(f"[{level}] {msg}")
result = asyncio.run(scrape(output_callback=print_callback))
# 保存结果
with open("output/manchester_improved_result.json", "w", encoding="utf-8") as f:
json.dump(result, f, indent=2, ensure_ascii=False)
print(f"\nResult saved to output/manchester_improved_result.json")
print(f"Schools: {len(result['schools'])}")
for school in result['schools']:
print(f" - {school['name']}: {len(school['programs'])} programs, {len(school.get('faculty', []))} faculty")