Files
University-Playwright-Codeg…/artifacts/test_manchester_scraper.py
yangxiaoyu-crypto 426cf4d2cd Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 15:25:08 +08:00

465 lines
18 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Test Manchester University scraper - improved faculty mapping
"""
import asyncio
import json
from datetime import datetime, timezone
from playwright.async_api import async_playwright
# Candidate URL paths (appended to the university homepage) where a masters
# course listing is commonly published; tried in order by find_course_list_page.
MASTERS_PATHS = [
    "/study/masters/courses/list/",
    "/study/masters/courses/",
    "/postgraduate/taught/courses/",
    "/postgraduate/courses/list/",
    "/postgraduate/courses/",
    "/graduate/programs/",
    "/academics/graduate/programs/",
    "/programmes/masters/",
    "/masters/programmes/",
    "/admissions/graduate/programs/",
]

# Official Alliance Manchester Business School staff directory for the
# Accounting & Finance group; used as the faculty source for accounting MScs.
ACCOUNTING_STAFF_URL = "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/"

# Module-level memo for the scraped staff list. None means "not fetched yet";
# an empty list means a fetch was attempted and failed (no retry). Populated
# by load_accounting_staff().
ACCOUNTING_STAFF_CACHE = None
# JS snippet (run via page.evaluate): counts anchors whose href looks like a
# course page — a 4+ digit path segment, a degree-prefixed slug (msc-/mba-/...),
# or a /course/<slug> path. Used to decide whether a candidate URL is a real
# course listing.
JS_CHECK_COURSES = r"""() => {
const links = document.querySelectorAll('a[href]');
let courseCount = 0;
for (const a of links) {
const href = a.href.toLowerCase();
if (/\/\d{4,}\//.test(href) ||
/\/(msc|ma|mba|mres|llm|med|meng)-/.test(href) ||
/\/course\/[a-z]/.test(href)) {
courseCount++;
}
}
return courseCount;
}"""

# JS snippet: on a candidate page, finds a link to a fuller "A-Z" / "all
# courses" style listing (matched by link text or an "/list" href, combined
# with a master/course/postgrad hint in the href). Returns the URL or null.
JS_FIND_LIST_URL = """() => {
const links = document.querySelectorAll('a[href]');
for (const a of links) {
const text = a.innerText.toLowerCase();
const href = a.href.toLowerCase();
if ((text.includes('a-z') || text.includes('all course') ||
text.includes('full list') || text.includes('browse all') ||
href.includes('/list')) &&
(href.includes('master') || href.includes('course') || href.includes('postgrad'))) {
return a.href;
}
}
return null;
}"""

# JS snippet: homepage fallback — returns the first link whose href mentions
# both a postgraduate keyword and a course/program keyword, or null.
JS_FIND_COURSES_FROM_HOME = """() => {
const links = document.querySelectorAll('a[href]');
for (const a of links) {
const href = a.href.toLowerCase();
const text = a.innerText.toLowerCase();
if ((href.includes('master') || href.includes('postgraduate') || href.includes('graduate')) &&
(href.includes('course') || href.includes('program') || href.includes('degree'))) {
return a.href;
}
}
return null;
}"""
# JS snippet: extracts masters programme links from the current page as
# [{name, url}]. Filtering pipeline, in order:
#   1. de-duplicate by href; drop very short/long link text and
#      fragment/javascript/mailto links;
#   2. keep only same-site links (hostname containment check, www-insensitive);
#   3. drop pure navigation links (section headers, admissions/fees/contact
#      pages, bare /courses/ / /masters/ / /postgraduate/ index URLs);
#   4. drop undergraduate / PhD / research-degree links (MPhil is kept);
#   5. accept what remains if the href has a numeric course id, a degree slug
#      (msc-/mba-/...), a deep /course//programme/ path, or the link text
#      itself names a degree.
JS_EXTRACT_PROGRAMS = r"""() => {
const programs = [];
const seen = new Set();
const currentHost = window.location.hostname;
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href;
const text = a.innerText.trim().replace(/\s+/g, ' ');
if (!href || seen.has(href)) return;
if (text.length < 5 || text.length > 200) return;
if (href.includes('#') || href.includes('javascript:') || href.includes('mailto:')) return;
try {
const linkHost = new URL(href).hostname;
if (!linkHost.includes(currentHost.replace('www.', '')) &&
!currentHost.includes(linkHost.replace('www.', ''))) return;
} catch {
return;
}
const hrefLower = href.toLowerCase();
const textLower = text.toLowerCase();
const isNavigation = textLower === 'courses' ||
textLower === 'programmes' ||
textLower === 'undergraduate' ||
textLower === 'postgraduate' ||
textLower === 'masters' ||
textLower === "master's" ||
textLower.includes('skip to') ||
textLower.includes('share') ||
textLower === 'home' ||
textLower === 'study' ||
textLower.startsWith('a-z') ||
textLower.includes('admission') ||
textLower.includes('fees and funding') ||
textLower.includes('why should') ||
textLower.includes('why manchester') ||
textLower.includes('teaching and learning') ||
textLower.includes('meet us') ||
textLower.includes('student support') ||
textLower.includes('contact us') ||
textLower.includes('how to apply') ||
hrefLower.includes('/admissions/') ||
hrefLower.includes('/fees-and-funding/') ||
hrefLower.includes('/why-') ||
hrefLower.includes('/meet-us/') ||
hrefLower.includes('/contact-us/') ||
hrefLower.includes('/student-support/') ||
hrefLower.includes('/teaching-and-learning/') ||
hrefLower.endsWith('/courses/') ||
hrefLower.endsWith('/masters/') ||
hrefLower.endsWith('/postgraduate/');
if (isNavigation) return;
const isExcluded = hrefLower.includes('/undergraduate') ||
hrefLower.includes('/bachelor') ||
hrefLower.includes('/phd/') ||
hrefLower.includes('/doctoral') ||
hrefLower.includes('/research-degree') ||
textLower.includes('bachelor') ||
textLower.includes('undergraduate') ||
(textLower.includes('phd') && !textLower.includes('mphil'));
if (isExcluded) return;
const hasNumericId = /\/\d{4,}\//.test(href);
const hasDegreeSlug = /\/(msc|ma|mba|mres|llm|med|meng|mpa|mph|mphil)-[a-z]/.test(hrefLower);
const isCoursePage = (hrefLower.includes('/course/') ||
hrefLower.includes('/courses/list/') ||
hrefLower.includes('/programme/')) &&
href.split('/').filter(p => p).length > 4;
const textHasDegree = /(msc|ma|mba|mres|llm|med|meng|pgcert|pgdip)/i.test(text) ||
textLower.includes('master');
if (hasNumericId || hasDegreeSlug || isCoursePage || textHasDegree) {
seen.add(href);
programs.push({
name: text,
url: href
});
}
});
return programs;
}"""
# JS snippet: on a programme page, collects up to 20 staff-like links
# ({name, url}) — any anchor whose href contains a people/staff/faculty/
# profile/academics/researcher path segment. Heuristic: may pick up generic
# directory links as well as individual profiles.
JS_EXTRACT_FACULTY = r"""() => {
const faculty = [];
const seen = new Set();
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href.toLowerCase();
const text = a.innerText.trim();
if (seen.has(href)) return;
if (text.length < 3 || text.length > 100) return;
const isStaff = href.includes('/people/') ||
href.includes('/staff/') ||
href.includes('/faculty/') ||
href.includes('/profile/') ||
href.includes('/academics/') ||
href.includes('/researcher/');
if (isStaff) {
seen.add(href);
faculty.push({
name: text.replace(/\s+/g, ' '),
url: a.href
});
}
});
return faculty.slice(0, 20);
}"""
# JS snippet: parses the AMBS staff directory table into
# [{name, title, url, email}]. NOTE(review): hard-codes the column layout —
# cell 1 = name (optionally linked), cell 2 = role, cell 5 = mailto link.
# Assumes the page keeps this table structure; verify if the directory page
# is redesigned.
JS_EXTRACT_ACCOUNTING_STAFF = r"""() => {
const rows = Array.from(document.querySelectorAll('table tbody tr'));
const staff = [];
for (const row of rows) {
const cells = row.querySelectorAll('td');
if (!cells || cells.length < 2) {
continue;
}
const nameCell = cells[1];
const roleCell = cells[2];
const emailCell = cells[5];
let profileUrl = '';
let displayName = nameCell ? nameCell.innerText.trim() : '';
const link = nameCell ? nameCell.querySelector('a[href]') : null;
if (link) {
profileUrl = link.href;
displayName = link.innerText.trim() || displayName;
}
if (!displayName) {
continue;
}
let email = '';
if (emailCell) {
const emailLink = emailCell.querySelector('a[href^="mailto:"]');
if (emailLink) {
email = emailLink.href.replace('mailto:', '').trim();
}
}
staff.push({
name: displayName,
title: roleCell ? roleCell.innerText.trim() : '',
url: profileUrl,
email: email
});
}
return staff;
}"""
def should_use_accounting_staff(program_name: str) -> bool:
    """Return True when *program_name* looks like an MSc accounting programme.

    Matches case-insensitively: the name must contain both "msc" and
    "accounting" (e.g. "MSc Accounting and Finance").
    """
    needle = program_name.lower()
    return all(token in needle for token in ("msc", "accounting"))
async def load_accounting_staff(context, output_callback=None):
    """Fetch (at most once per process) the AMBS Accounting & Finance staff list.

    The result is memoized in the module-level ACCOUNTING_STAFF_CACHE; on a
    fetch failure an empty list is cached so the request is not retried.
    Returns the cached list of staff dicts.
    """
    global ACCOUNTING_STAFF_CACHE
    if ACCOUNTING_STAFF_CACHE is not None:
        return ACCOUNTING_STAFF_CACHE

    def _log(level, message):
        # Route progress messages through the optional callback only.
        if output_callback:
            output_callback(level, message)

    staff_page = await context.new_page()
    try:
        _log("info", "Loading official AMBS Accounting & Finance staff page...")
        await staff_page.goto(ACCOUNTING_STAFF_URL, wait_until="domcontentloaded", timeout=30000)
        await staff_page.wait_for_timeout(2000)
        ACCOUNTING_STAFF_CACHE = await staff_page.evaluate(JS_EXTRACT_ACCOUNTING_STAFF)
        _log("info", f"Captured {len(ACCOUNTING_STAFF_CACHE)} faculty from the official staff page")
    except Exception as exc:
        _log("error", f"Failed to load AMBS staff page: {exc}")
        ACCOUNTING_STAFF_CACHE = []
    finally:
        await staff_page.close()
    return ACCOUNTING_STAFF_CACHE
async def find_course_list_page(page, base_url, output_callback):
    """Locate the URL of the university's masters course listing.

    Tries each known path in MASTERS_PATHS in order; a candidate is accepted
    when it returns HTTP 200, does not look like a soft-404 (by page title),
    and contains more than 5 course-like links. If the candidate instead
    links to a fuller "A-Z" style list, that link is returned. As a last
    resort, the homepage is scanned for a postgraduate-courses link.

    Returns the discovered URL, or None when nothing suitable is found.
    """
    for path in MASTERS_PATHS:
        test_url = base_url.rstrip('/') + path
        try:
            response = await page.goto(test_url, wait_until="domcontentloaded", timeout=15000)
            if response and response.status == 200:
                title = await page.title()
                # Some sites serve 200 for missing pages; filter soft-404s
                # by title as well.
                if '404' not in title.lower() and 'not found' not in title.lower():
                    has_courses = await page.evaluate(JS_CHECK_COURSES)
                    if has_courses > 5:
                        if output_callback:
                            output_callback("info", f"Found course list: {path} ({has_courses} courses)")
                        return test_url
                    list_url = await page.evaluate(JS_FIND_LIST_URL)
                    if list_url:
                        if output_callback:
                            output_callback("info", f"Found full course list: {list_url}")
                        return list_url
        except Exception:
            # Timeouts/navigation errors on one candidate path are expected;
            # try the next one. (Fixed: was a bare `except:`, which also
            # swallowed KeyboardInterrupt/SystemExit.)
            continue
    # Fallback: load the homepage and look for a masters/postgraduate link.
    try:
        await page.goto(base_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)
        courses_url = await page.evaluate(JS_FIND_COURSES_FROM_HOME)
        if courses_url:
            return courses_url
    except Exception:
        # Best-effort fallback only — give up quietly. (Fixed bare `except:`.)
        pass
    return None
async def extract_course_links(page, output_callback):
    """Run the programme-extraction script on the current page.

    Returns the list of {name, url} dicts produced by JS_EXTRACT_PROGRAMS.
    `output_callback` is currently unused; kept for signature parity with
    the other scraping helpers.
    """
    return await page.evaluate(JS_EXTRACT_PROGRAMS)
async def scrape(output_callback=None):
    """Scrape University of Manchester masters programmes and their faculty.

    Launches headless Chromium, locates the masters course listing, extracts
    programme links, then collects faculty per programme: accounting MScs use
    the official AMBS staff directory; the first 30 other programmes have
    their own pages scanned for staff-like links; the remainder get an empty
    faculty list.

    Args:
        output_callback: optional callable(level, message) for progress logs.

    Returns:
        dict with keys name, url, scraped_at (UTC ISO timestamp), and
        schools -> [{name, url, programs: [{name, url, faculty, ...}]}].
        On a top-level scraping error the error is reported via the callback
        and the partial result is returned.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = await context.new_page()
        base_url = "https://www.manchester.ac.uk/"
        result = {
            "name": "Manchester University",
            "url": base_url,
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "schools": []
        }
        all_programs = []
        try:
            if output_callback:
                output_callback("info", "Searching for masters course list...")
            courses_url = await find_course_list_page(page, base_url, output_callback)
            if not courses_url:
                if output_callback:
                    output_callback("warning", "Course list not found, using homepage")
                courses_url = base_url
            if output_callback:
                output_callback("info", "Extracting masters programs...")
            await page.goto(courses_url, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(3000)
            # Expand paginated listings: click "Load more"-style buttons up to
            # three times so more course links land in the DOM.
            for _ in range(3):
                try:
                    load_more = page.locator('button:has-text("Load more"), button:has-text("Show more"), button:has-text("View more"), a:has-text("Load more")')
                    if await load_more.count() > 0:
                        await load_more.first.click()
                        await page.wait_for_timeout(2000)
                    else:
                        break
                except Exception:
                    # The click can fail if the button detaches or is covered;
                    # stop expanding. (Fixed: was a bare `except:`.)
                    break
            programs_data = await extract_course_links(page, output_callback)
            if output_callback:
                output_callback("info", f"Found {len(programs_data)} masters programs")
            print("\nTop 20 programs:")
            for i, prog in enumerate(programs_data[:20]):
                print(f" {i+1}. {prog['name'][:60]}")
                print(f" {prog['url']}")
            # Only visit up to 30 programme pages for per-page faculty
            # extraction; accounting programmes use the cached AMBS staff
            # list and do not count against this budget.
            max_detail_pages = min(len(programs_data), 30)
            detailed_processed = 0
            logged_official_staff = False
            for prog in programs_data:
                faculty_data = []
                used_official_staff = False
                if should_use_accounting_staff(prog['name']):
                    staff_list = await load_accounting_staff(context, output_callback)
                    if staff_list:
                        used_official_staff = True
                        if output_callback and not logged_official_staff:
                            output_callback("info", "Using Alliance MBS Accounting & Finance staff directory for accounting programmes")
                            logged_official_staff = True
                        faculty_data = [
                            {
                                "name": person.get("name"),
                                "url": person.get("url") or ACCOUNTING_STAFF_URL,
                                "title": person.get("title"),
                                "email": person.get("email"),
                                "source": "Alliance Manchester Business School - Accounting & Finance staff"
                            }
                            for person in staff_list
                        ]
                elif detailed_processed < max_detail_pages:
                    detailed_processed += 1
                    if output_callback and detailed_processed % 10 == 0:
                        output_callback("info", f"Processing {detailed_processed}/{max_detail_pages}: {prog['name'][:50]}")
                    try:
                        await page.goto(prog['url'], wait_until="domcontentloaded", timeout=15000)
                        await page.wait_for_timeout(800)
                        faculty_data = await page.evaluate(JS_EXTRACT_FACULTY)
                    except Exception as e:
                        if output_callback:
                            output_callback("warning", f"Failed to capture faculty for {prog['name'][:50]}: {e}")
                        faculty_data = []
                program_entry = {
                    "name": prog['name'],
                    "url": prog['url'],
                    "faculty": faculty_data
                }
                if used_official_staff:
                    # Record that faculty came from the directory, not the
                    # programme page itself.
                    program_entry["faculty_page_override"] = ACCOUNTING_STAFF_URL
                all_programs.append(program_entry)
            result["schools"] = [{
                "name": "Masters Programs",
                "url": courses_url,
                "programs": all_programs
            }]
            if output_callback:
                total_faculty = sum(len(p.get('faculty', [])) for p in all_programs)
                output_callback("info", f"Done! {len(all_programs)} programs, {total_faculty} faculty")
        except Exception as e:
            if output_callback:
                output_callback("error", f"Scraping error: {str(e)}")
        finally:
            await browser.close()
        return result
def log_callback(level, message):
    """Default output callback: echo a scraper message to stdout as "[LEVEL] text"."""
    line = "[{}] {}".format(level.upper(), message)
    print(line)
if __name__ == "__main__":
    # Run the scraper with console logging, print a human-readable summary,
    # then persist the full result as JSON for inspection.
    result = asyncio.run(scrape(output_callback=log_callback))
    print("\n" + "="*60)
    print("Scrape summary:")
    print("="*60)
    if result.get("schools"):
        # The scraper emits a single synthetic "Masters Programs" school.
        school = result["schools"][0]
        programs = school.get("programs", [])
        print(f"Course list URL: {school.get('url')}")
        print(f"Total programs: {len(programs)}")
        faculty_count = sum(len(p.get('faculty', [])) for p in programs)
        print(f"Faculty total: {faculty_count}")
        print("\nTop 10 programs:")
        for i, p in enumerate(programs[:10]):
            print(f" {i+1}. {p['name'][:60]}")
            if p.get("faculty"):
                print(f" Faculty entries: {len(p['faculty'])}")
    # Save the full result regardless of whether any schools were found.
    with open("manchester_test_result.json", "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
    print("\nSaved results to manchester_test_result.json")