- Add src/university_scraper module with scraper, analyzer, and CLI - Add backend FastAPI service with API endpoints and database models - Add frontend React app with university management pages - Add configs for Harvard, Manchester, and UCL universities - Add artifacts with various scraper implementations - Add Docker compose configuration for deployment - Update .gitignore to exclude generated files 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
465 lines
18 KiB
Python
465 lines
18 KiB
Python
"""
|
||
Test Manchester University scraper - improved faculty mapping
|
||
"""
|
||
|
||
import asyncio
|
||
import json
|
||
from datetime import datetime, timezone
|
||
|
||
from playwright.async_api import async_playwright
|
||
|
||
|
||
# Candidate URL paths (appended to the university homepage) that commonly
# host a masters/postgraduate course listing; probed in this order until
# one responds with a plausible course index.
MASTERS_PATHS = [
    "/study/masters/courses/list/",
    "/study/masters/courses/",
    "/postgraduate/taught/courses/",
    "/postgraduate/courses/list/",
    "/postgraduate/courses/",
    "/graduate/programs/",
    "/academics/graduate/programs/",
    "/programmes/masters/",
    "/masters/programmes/",
    "/admissions/graduate/programs/",
]

# Official Alliance Manchester Business School Accounting & Finance staff
# directory — used as the authoritative faculty source for accounting MScs.
ACCOUNTING_STAFF_URL = "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/"
# Module-level cache for the scraped staff list; None means "not fetched yet"
# (an empty list is cached after a failed fetch — see load_accounting_staff).
ACCOUNTING_STAFF_CACHE = None
|
||
|
||
|
||
JS_CHECK_COURSES = r"""() => {
|
||
const links = document.querySelectorAll('a[href]');
|
||
let courseCount = 0;
|
||
for (const a of links) {
|
||
const href = a.href.toLowerCase();
|
||
if (/\/\d{4,}\//.test(href) ||
|
||
/\/(msc|ma|mba|mres|llm|med|meng)-/.test(href) ||
|
||
/\/course\/[a-z]/.test(href)) {
|
||
courseCount++;
|
||
}
|
||
}
|
||
return courseCount;
|
||
}"""
|
||
|
||
JS_FIND_LIST_URL = """() => {
|
||
const links = document.querySelectorAll('a[href]');
|
||
for (const a of links) {
|
||
const text = a.innerText.toLowerCase();
|
||
const href = a.href.toLowerCase();
|
||
if ((text.includes('a-z') || text.includes('all course') ||
|
||
text.includes('full list') || text.includes('browse all') ||
|
||
href.includes('/list')) &&
|
||
(href.includes('master') || href.includes('course') || href.includes('postgrad'))) {
|
||
return a.href;
|
||
}
|
||
}
|
||
return null;
|
||
}"""
|
||
|
||
# Browser-side fallback used from the homepage: return the first link whose
# URL combines an audience keyword (master/postgraduate/graduate) with a
# catalogue keyword (course/program/degree), or null if none matches.
# Fix: removed an unused `const text = a.innerText.toLowerCase();` local —
# the filter only ever inspected the href.
JS_FIND_COURSES_FROM_HOME = """() => {
    const links = document.querySelectorAll('a[href]');
    for (const a of links) {
        const href = a.href.toLowerCase();
        if ((href.includes('master') || href.includes('postgraduate') || href.includes('graduate')) &&
            (href.includes('course') || href.includes('program') || href.includes('degree'))) {
            return a.href;
        }
    }
    return null;
}"""
|
||
|
||
JS_EXTRACT_PROGRAMS = r"""() => {
|
||
const programs = [];
|
||
const seen = new Set();
|
||
const currentHost = window.location.hostname;
|
||
|
||
document.querySelectorAll('a[href]').forEach(a => {
|
||
const href = a.href;
|
||
const text = a.innerText.trim().replace(/\s+/g, ' ');
|
||
|
||
if (!href || seen.has(href)) return;
|
||
if (text.length < 5 || text.length > 200) return;
|
||
if (href.includes('#') || href.includes('javascript:') || href.includes('mailto:')) return;
|
||
|
||
try {
|
||
const linkHost = new URL(href).hostname;
|
||
if (!linkHost.includes(currentHost.replace('www.', '')) &&
|
||
!currentHost.includes(linkHost.replace('www.', ''))) return;
|
||
} catch {
|
||
return;
|
||
}
|
||
|
||
const hrefLower = href.toLowerCase();
|
||
const textLower = text.toLowerCase();
|
||
|
||
const isNavigation = textLower === 'courses' ||
|
||
textLower === 'programmes' ||
|
||
textLower === 'undergraduate' ||
|
||
textLower === 'postgraduate' ||
|
||
textLower === 'masters' ||
|
||
textLower === "master's" ||
|
||
textLower.includes('skip to') ||
|
||
textLower.includes('share') ||
|
||
textLower === 'home' ||
|
||
textLower === 'study' ||
|
||
textLower.startsWith('a-z') ||
|
||
textLower.includes('admission') ||
|
||
textLower.includes('fees and funding') ||
|
||
textLower.includes('why should') ||
|
||
textLower.includes('why manchester') ||
|
||
textLower.includes('teaching and learning') ||
|
||
textLower.includes('meet us') ||
|
||
textLower.includes('student support') ||
|
||
textLower.includes('contact us') ||
|
||
textLower.includes('how to apply') ||
|
||
hrefLower.includes('/admissions/') ||
|
||
hrefLower.includes('/fees-and-funding/') ||
|
||
hrefLower.includes('/why-') ||
|
||
hrefLower.includes('/meet-us/') ||
|
||
hrefLower.includes('/contact-us/') ||
|
||
hrefLower.includes('/student-support/') ||
|
||
hrefLower.includes('/teaching-and-learning/') ||
|
||
hrefLower.endsWith('/courses/') ||
|
||
hrefLower.endsWith('/masters/') ||
|
||
hrefLower.endsWith('/postgraduate/');
|
||
|
||
if (isNavigation) return;
|
||
|
||
const isExcluded = hrefLower.includes('/undergraduate') ||
|
||
hrefLower.includes('/bachelor') ||
|
||
hrefLower.includes('/phd/') ||
|
||
hrefLower.includes('/doctoral') ||
|
||
hrefLower.includes('/research-degree') ||
|
||
textLower.includes('bachelor') ||
|
||
textLower.includes('undergraduate') ||
|
||
(textLower.includes('phd') && !textLower.includes('mphil'));
|
||
|
||
if (isExcluded) return;
|
||
|
||
const hasNumericId = /\/\d{4,}\//.test(href);
|
||
const hasDegreeSlug = /\/(msc|ma|mba|mres|llm|med|meng|mpa|mph|mphil)-[a-z]/.test(hrefLower);
|
||
const isCoursePage = (hrefLower.includes('/course/') ||
|
||
hrefLower.includes('/courses/list/') ||
|
||
hrefLower.includes('/programme/')) &&
|
||
href.split('/').filter(p => p).length > 4;
|
||
const textHasDegree = /(msc|ma|mba|mres|llm|med|meng|pgcert|pgdip)/i.test(text) ||
|
||
textLower.includes('master');
|
||
|
||
if (hasNumericId || hasDegreeSlug || isCoursePage || textHasDegree) {
|
||
seen.add(href);
|
||
programs.push({
|
||
name: text,
|
||
url: href
|
||
});
|
||
}
|
||
});
|
||
|
||
return programs;
|
||
}"""
|
||
|
||
JS_EXTRACT_FACULTY = r"""() => {
|
||
const faculty = [];
|
||
const seen = new Set();
|
||
|
||
document.querySelectorAll('a[href]').forEach(a => {
|
||
const href = a.href.toLowerCase();
|
||
const text = a.innerText.trim();
|
||
|
||
if (seen.has(href)) return;
|
||
if (text.length < 3 || text.length > 100) return;
|
||
|
||
const isStaff = href.includes('/people/') ||
|
||
href.includes('/staff/') ||
|
||
href.includes('/faculty/') ||
|
||
href.includes('/profile/') ||
|
||
href.includes('/academics/') ||
|
||
href.includes('/researcher/');
|
||
|
||
if (isStaff) {
|
||
seen.add(href);
|
||
faculty.push({
|
||
name: text.replace(/\s+/g, ' '),
|
||
url: a.href
|
||
});
|
||
}
|
||
});
|
||
|
||
return faculty.slice(0, 20);
|
||
}"""
|
||
|
||
JS_EXTRACT_ACCOUNTING_STAFF = r"""() => {
|
||
const rows = Array.from(document.querySelectorAll('table tbody tr'));
|
||
const staff = [];
|
||
|
||
for (const row of rows) {
|
||
const cells = row.querySelectorAll('td');
|
||
if (!cells || cells.length < 2) {
|
||
continue;
|
||
}
|
||
|
||
const nameCell = cells[1];
|
||
const roleCell = cells[2];
|
||
const emailCell = cells[5];
|
||
|
||
let profileUrl = '';
|
||
let displayName = nameCell ? nameCell.innerText.trim() : '';
|
||
const link = nameCell ? nameCell.querySelector('a[href]') : null;
|
||
if (link) {
|
||
profileUrl = link.href;
|
||
displayName = link.innerText.trim() || displayName;
|
||
}
|
||
|
||
if (!displayName) {
|
||
continue;
|
||
}
|
||
|
||
let email = '';
|
||
if (emailCell) {
|
||
const emailLink = emailCell.querySelector('a[href^="mailto:"]');
|
||
if (emailLink) {
|
||
email = emailLink.href.replace('mailto:', '').trim();
|
||
}
|
||
}
|
||
|
||
staff.push({
|
||
name: displayName,
|
||
title: roleCell ? roleCell.innerText.trim() : '',
|
||
url: profileUrl,
|
||
email: email
|
||
});
|
||
}
|
||
|
||
return staff;
|
||
}"""
|
||
|
||
|
||
def should_use_accounting_staff(program_name: str) -> bool:
    """Return True when a programme name looks like an MSc accounting course.

    Matches case-insensitively: the name must contain both "msc" and
    "accounting" as substrings.
    """
    haystack = program_name.lower()
    return all(token in haystack for token in ("msc", "accounting"))
|
||
|
||
|
||
async def load_accounting_staff(context, output_callback=None):
    """Fetch (once) and cache the AMBS Accounting & Finance staff list.

    Opens a fresh page in ``context``, loads ACCOUNTING_STAFF_URL, and runs
    JS_EXTRACT_ACCOUNTING_STAFF.  The result is memoised in the module-level
    ACCOUNTING_STAFF_CACHE; a failed fetch caches an empty list so the page
    is not retried within this run.  Progress/errors are reported through
    ``output_callback(level, message)`` when provided.
    """
    global ACCOUNTING_STAFF_CACHE

    # Serve from the module cache when a previous call already resolved it.
    cached = ACCOUNTING_STAFF_CACHE
    if cached is not None:
        return cached

    directory_page = await context.new_page()
    try:
        if output_callback:
            output_callback("info", "Loading official AMBS Accounting & Finance staff page...")

        await directory_page.goto(ACCOUNTING_STAFF_URL, wait_until="domcontentloaded", timeout=30000)
        # Give client-side rendering a moment to populate the table.
        await directory_page.wait_for_timeout(2000)

        ACCOUNTING_STAFF_CACHE = await directory_page.evaluate(JS_EXTRACT_ACCOUNTING_STAFF)

        if output_callback:
            output_callback("info", f"Captured {len(ACCOUNTING_STAFF_CACHE)} faculty from the official staff page")

    except Exception as exc:
        if output_callback:
            output_callback("error", f"Failed to load AMBS staff page: {exc}")
        # Cache the failure as "no staff" so callers do not re-fetch this run.
        ACCOUNTING_STAFF_CACHE = []
    finally:
        # Always release the helper page, even on failure.
        await directory_page.close()

    return ACCOUNTING_STAFF_CACHE
|
||
|
||
|
||
async def find_course_list_page(page, base_url, output_callback):
    """Locate a masters course-listing URL for the site rooted at ``base_url``.

    Strategy, in order:
      1. Probe each path in MASTERS_PATHS; accept it directly when the page
         responds 200, is not a 404 page, and contains more than 5 course-like
         links (JS_CHECK_COURSES); otherwise follow an "all courses" link
         found on it (JS_FIND_LIST_URL).
      2. Fall back to scanning the homepage for a courses link
         (JS_FIND_COURSES_FROM_HOME).

    Returns the discovered URL, or None when nothing matched.  Progress is
    reported through ``output_callback(level, message)`` when provided.

    Fix: the original bare ``except:`` clauses also swallowed BaseException
    subclasses such as asyncio.CancelledError, which blocks task
    cancellation; narrowed to ``except Exception``.
    """
    for path in MASTERS_PATHS:
        test_url = base_url.rstrip('/') + path
        try:
            response = await page.goto(test_url, wait_until="domcontentloaded", timeout=15000)
            if response and response.status == 200:
                title = await page.title()
                # Some sites return 200 for missing pages; check the title too.
                if '404' not in title.lower() and 'not found' not in title.lower():
                    has_courses = await page.evaluate(JS_CHECK_COURSES)
                    if has_courses > 5:
                        if output_callback:
                            output_callback("info", f"Found course list: {path} ({has_courses} courses)")
                        return test_url

                    # Not a listing itself — maybe it links to the full list.
                    list_url = await page.evaluate(JS_FIND_LIST_URL)
                    if list_url:
                        if output_callback:
                            output_callback("info", f"Found full course list: {list_url}")
                        return list_url
        except Exception:
            # Probe failed (timeout, navigation error, ...); try the next path.
            continue

    # Last resort: look for a courses link on the homepage itself.
    try:
        await page.goto(base_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)
        courses_url = await page.evaluate(JS_FIND_COURSES_FROM_HOME)
        if courses_url:
            return courses_url
    except Exception:
        pass

    return None
|
||
|
||
|
||
async def extract_course_links(page, output_callback):
    """Run the programme-extraction script on the current page.

    Evaluates JS_EXTRACT_PROGRAMS in the page and returns its list of
    {name, url} dicts.  ``output_callback`` is accepted for interface
    symmetry with the other helpers but is not used here.
    """
    extracted = await page.evaluate(JS_EXTRACT_PROGRAMS)
    return extracted
|
||
|
||
|
||
async def scrape(output_callback=None):
    """Scrape Manchester University masters programmes and their faculty.

    Launches a headless Chromium, finds the masters course list (see
    find_course_list_page), extracts programme links, and for each programme
    collects faculty: accounting MScs get the official AMBS staff directory
    (load_accounting_staff); up to 30 other programmes get their course page
    scanned with JS_EXTRACT_FACULTY.

    Returns a dict: {name, url, scraped_at, schools: [{name, url, programs}]}.
    ``output_callback(level, message)``, when provided, receives progress
    and error messages; scraping errors are reported there rather than
    raised, and the browser is always closed.

    Fix: the load-more retry loop used a bare ``except:``, which also
    swallows asyncio.CancelledError and blocks task cancellation; narrowed
    to ``except Exception``.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # Use a desktop Chrome UA so the site serves its normal markup.
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = await context.new_page()

        base_url = "https://www.manchester.ac.uk/"

        result = {
            "name": "Manchester University",
            "url": base_url,
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "schools": []
        }

        all_programs = []

        try:
            if output_callback:
                output_callback("info", "Searching for masters course list...")

            courses_url = await find_course_list_page(page, base_url, output_callback)

            if not courses_url:
                if output_callback:
                    output_callback("warning", "Course list not found, using homepage")
                courses_url = base_url

            if output_callback:
                output_callback("info", "Extracting masters programs...")

            await page.goto(courses_url, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(3000)

            # Expand paginated listings: click "Load more"-style buttons up
            # to three times before extracting links.
            for _ in range(3):
                try:
                    load_more = page.locator('button:has-text("Load more"), button:has-text("Show more"), button:has-text("View more"), a:has-text("Load more")')
                    if await load_more.count() > 0:
                        await load_more.first.click()
                        await page.wait_for_timeout(2000)
                    else:
                        break
                except Exception:
                    # Button vanished or click failed — proceed with what we have.
                    break

            programs_data = await extract_course_links(page, output_callback)

            if output_callback:
                output_callback("info", f"Found {len(programs_data)} masters programs")

            print("\nTop 20 programs:")
            for i, prog in enumerate(programs_data[:20]):
                print(f"  {i+1}. {prog['name'][:60]}")
                print(f"     {prog['url']}")

            # Only visit detail pages for the first 30 non-accounting
            # programmes; accounting MScs use the cached staff directory.
            max_detail_pages = min(len(programs_data), 30)
            detailed_processed = 0
            logged_official_staff = False

            for prog in programs_data:
                faculty_data = []
                used_official_staff = False

                if should_use_accounting_staff(prog['name']):
                    staff_list = await load_accounting_staff(context, output_callback)
                    if staff_list:
                        used_official_staff = True
                        if output_callback and not logged_official_staff:
                            output_callback("info", "Using Alliance MBS Accounting & Finance staff directory for accounting programmes")
                            logged_official_staff = True
                        faculty_data = [
                            {
                                "name": person.get("name"),
                                "url": person.get("url") or ACCOUNTING_STAFF_URL,
                                "title": person.get("title"),
                                "email": person.get("email"),
                                "source": "Alliance Manchester Business School - Accounting & Finance staff"
                            }
                            for person in staff_list
                        ]

                elif detailed_processed < max_detail_pages:
                    detailed_processed += 1
                    if output_callback and detailed_processed % 10 == 0:
                        output_callback("info", f"Processing {detailed_processed}/{max_detail_pages}: {prog['name'][:50]}")
                    try:
                        await page.goto(prog['url'], wait_until="domcontentloaded", timeout=15000)
                        await page.wait_for_timeout(800)

                        faculty_data = await page.evaluate(JS_EXTRACT_FACULTY)
                    except Exception as e:
                        # A single bad course page shouldn't abort the run.
                        if output_callback:
                            output_callback("warning", f"Failed to capture faculty for {prog['name'][:50]}: {e}")
                        faculty_data = []

                program_entry = {
                    "name": prog['name'],
                    "url": prog['url'],
                    "faculty": faculty_data
                }

                if used_official_staff:
                    program_entry["faculty_page_override"] = ACCOUNTING_STAFF_URL

                all_programs.append(program_entry)

            result["schools"] = [{
                "name": "Masters Programs",
                "url": courses_url,
                "programs": all_programs
            }]

            if output_callback:
                total_faculty = sum(len(p.get('faculty', [])) for p in all_programs)
                output_callback("info", f"Done! {len(all_programs)} programs, {total_faculty} faculty")

        except Exception as e:
            if output_callback:
                output_callback("error", f"Scraping error: {str(e)}")

        finally:
            await browser.close()

    return result
|
||
|
||
|
||
def log_callback(level, message):
    """Print a scraper event to stdout as ``[LEVEL] message``."""
    line = f"[{level.upper()}] {message}"
    print(line)
|
||
|
||
|
||
if __name__ == "__main__":
    # Run the scraper end to end with console logging.
    result = asyncio.run(scrape(output_callback=log_callback))

    banner = "=" * 60
    print("\n" + banner)
    print("Scrape summary:")
    print(banner)

    schools = result.get("schools")
    if schools:
        school = schools[0]
        programs = school.get("programs", [])
        print(f"Course list URL: {school.get('url')}")
        print(f"Total programs: {len(programs)}")

        faculty_count = sum(len(p.get('faculty', [])) for p in programs)
        print(f"Faculty total: {faculty_count}")

        print("\nTop 10 programs:")
        for rank, p in enumerate(programs[:10], start=1):
            print(f"  {rank}. {p['name'][:60]}")
            if p.get("faculty"):
                print(f"     Faculty entries: {len(p['faculty'])}")

    # Persist the full result for later inspection.
    with open("manchester_test_result.json", "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
    print("\nSaved results to manchester_test_result.json")
|