- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
332 lines
11 KiB
YAML
332 lines
11 KiB
YAML
# Identity of the institution this configuration targets.
university:
  name: 'The University of Manchester'
  url: 'https://www.manchester.ac.uk/'
  country: 'United Kingdom'
# Schools / departments and the pages their staff lists are read from.
#
# NOTE(review): this section was recovered from a copy whose indentation was
# lost. Each `request` / `research_explorer` mapping is assumed to belong to
# the faculty page entry it follows -- confirm against the config loader.
schools:
  discovery_method: 'static_list'

  # Section-level request settings; individual faculty pages below carry
  # their own `request` mapping where they need different values.
  request:
    timeout_ms: 45000
    max_retries: 3
    retry_backoff_ms: 3000

  static_list:
    # ---- Departmental websites ('table' / 'links' extraction) ----

    # NOTE(review): the URL points at the accounting-and-finance staff page
    # only, while the keywords span the whole business school -- confirm
    # this is the intended coverage.
    - name: 'Alliance Manchester Business School'
      url: 'https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/'
      keywords:
        - 'accounting'
        - 'finance'
        - 'business'
        - 'management'
        - 'marketing'
        - 'mba'
        - 'economics'
        - 'entrepreneurship'
      faculty_pages:
        - url: 'https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/'
          extract_method: 'table'
          requires_scroll: true
          scroll_times: 6
          scroll_delay_ms: 700
          load_more_selector: 'button.load-more, button.show-more'
          max_load_more: 5
          request:
            timeout_ms: 60000
            wait_until: 'domcontentloaded'
            post_wait_ms: 2500

    - name: 'Department of Computer Science'
      url: 'https://www.cs.manchester.ac.uk/about/people/academic-and-research-staff/'
      keywords:
        - 'computer'
        - 'software'
        - 'data science'
        - 'artificial intelligence'
        # The trailing space is part of the value; keep it quoted.
        - 'ai '
        - 'machine learning'
        - 'cyber'
        - 'computing'
      faculty_pages:
        - url: 'https://www.cs.manchester.ac.uk/about/people/academic-and-research-staff/'
          extract_method: 'links'
          requires_scroll: true
          scroll_times: 6
          scroll_delay_ms: 700
          blocked_resources:
            - 'image'
            - 'font'
            - 'media'
        - url: 'https://www.cs.manchester.ac.uk/about/people/'
          extract_method: 'links'
          load_more_selector: 'button.load-more'
          max_load_more: 5
          request:
            timeout_ms: 45000
            wait_until: 'domcontentloaded'
            post_wait_ms: 2000

    - name: 'Department of Physics and Astronomy'
      url: 'https://www.physics.manchester.ac.uk/about/people/academic-and-research-staff/'
      keywords:
        - 'physics'
        - 'astronomy'
        - 'astrophysics'
        - 'nuclear'
        - 'particle'
      faculty_pages:
        - url: 'https://www.physics.manchester.ac.uk/about/people/academic-and-research-staff/'
          extract_method: 'links'
          requires_scroll: true
          scroll_times: 5
          scroll_delay_ms: 700

    - name: 'Department of Electrical and Electronic Engineering'
      url: 'https://www.eee.manchester.ac.uk/about/people/academic-and-research-staff/'
      keywords:
        - 'electrical'
        - 'electronic'
        - 'eee'
        - 'power systems'
        - 'microelectronics'
      faculty_pages:
        - url: 'https://www.eee.manchester.ac.uk/about/people/academic-and-research-staff/'
          extract_method: 'links'
          requires_scroll: true
          scroll_times: 6
          scroll_delay_ms: 700

    # ---- research.manchester.ac.uk listings ('research_explorer') ----
    # These entries share one shape: 'networkidle' wait, the
    # 'a.link.person' selector, and an org_slug equal to the URL's
    # organisation path segment. Timeouts, post-wait and page_size vary.

    - name: 'Department of Chemistry'
      url: 'https://research.manchester.ac.uk/en/organisations/department-of-chemistry/persons/'
      keywords:
        - 'chemistry'
        - 'chemical'
      faculty_pages:
        - url: 'https://research.manchester.ac.uk/en/organisations/department-of-chemistry/persons/'
          extract_method: 'research_explorer'
          requires_scroll: true
          request:
            timeout_ms: 120000
            wait_until: 'networkidle'
            wait_for_selector: 'a.link.person'
            post_wait_ms: 5000
          research_explorer:
            org_slug: 'department-of-chemistry'
            page_size: 200

    - name: 'Department of Mathematics'
      url: 'https://research.manchester.ac.uk/en/organisations/department-of-mathematics/persons/'
      keywords:
        - 'mathematics'
        - 'statistics'
        - 'applied math'
        - 'actuarial'
      faculty_pages:
        - url: 'https://research.manchester.ac.uk/en/organisations/department-of-mathematics/persons/'
          extract_method: 'research_explorer'
          requires_scroll: true
          request:
            timeout_ms: 120000
            wait_until: 'networkidle'
            wait_for_selector: 'a.link.person'
            post_wait_ms: 4500
          research_explorer:
            org_slug: 'department-of-mathematics'
            page_size: 200

    - name: 'School of Engineering'
      url: 'https://research.manchester.ac.uk/en/organisations/school-of-engineering/persons/'
      keywords:
        - 'engineering'
        - 'mechanical'
        - 'aerospace'
        - 'civil'
        - 'materials'
      faculty_pages:
        - url: 'https://research.manchester.ac.uk/en/organisations/school-of-engineering/persons/'
          extract_method: 'research_explorer'
          requires_scroll: true
          request:
            timeout_ms: 120000
            wait_until: 'networkidle'
            wait_for_selector: 'a.link.person'
            post_wait_ms: 4500
          research_explorer:
            org_slug: 'school-of-engineering'
            page_size: 400

    - name: 'Faculty of Biology, Medicine and Health'
      url: 'https://research.manchester.ac.uk/en/organisations/faculty-of-biology-medicine-and-health/persons/'
      keywords:
        - 'medicine'
        - 'medical'
        - 'health'
        - 'nursing'
        - 'pharmacy'
        - 'clinical'
        - 'dental'
        - 'optometry'
        - 'biology'
        - 'biomedical'
        - 'psychology'
      faculty_pages:
        - url: 'https://research.manchester.ac.uk/en/organisations/faculty-of-biology-medicine-and-health/persons/'
          extract_method: 'research_explorer'
          requires_scroll: true
          request:
            # Only entry with a 130000 ms timeout (the others use 120000).
            timeout_ms: 130000
            wait_until: 'networkidle'
            wait_for_selector: 'a.link.person'
            post_wait_ms: 4500
          research_explorer:
            org_slug: 'faculty-of-biology-medicine-and-health'
            page_size: 400

    - name: 'School of Social Sciences'
      url: 'https://research.manchester.ac.uk/en/organisations/school-of-social-sciences/persons/'
      keywords:
        - 'sociology'
        - 'politics'
        - 'international'
        - 'social'
        - 'criminology'
        - 'anthropology'
        - 'philosophy'
      faculty_pages:
        - url: 'https://research.manchester.ac.uk/en/organisations/school-of-social-sciences/persons/'
          extract_method: 'research_explorer'
          requires_scroll: true
          request:
            timeout_ms: 120000
            wait_until: 'networkidle'
            wait_for_selector: 'a.link.person'
            post_wait_ms: 4500
          research_explorer:
            org_slug: 'school-of-social-sciences'
            page_size: 200

    - name: 'School of Law'
      url: 'https://research.manchester.ac.uk/en/organisations/school-of-law/persons/'
      keywords:
        - 'law'
        - 'legal'
        - 'llm'
      faculty_pages:
        - url: 'https://research.manchester.ac.uk/en/organisations/school-of-law/persons/'
          extract_method: 'research_explorer'
          requires_scroll: true
          request:
            timeout_ms: 120000
            wait_until: 'networkidle'
            wait_for_selector: 'a.link.person'
            post_wait_ms: 4500
          research_explorer:
            org_slug: 'school-of-law'
            page_size: 200

    - name: 'School of Arts, Languages and Cultures'
      url: 'https://research.manchester.ac.uk/en/organisations/school-of-arts-languages-and-cultures/persons/'
      keywords:
        - 'arts'
        - 'languages'
        - 'culture'
        - 'music'
        - 'drama'
        - 'theatre'
        - 'history'
        - 'linguistics'
        - 'literature'
        - 'translation'
        - 'archaeology'
        - 'religion'
      faculty_pages:
        - url: 'https://research.manchester.ac.uk/en/organisations/school-of-arts-languages-and-cultures/persons/'
          extract_method: 'research_explorer'
          requires_scroll: true
          request:
            timeout_ms: 120000
            wait_until: 'networkidle'
            wait_for_selector: 'a.link.person'
            post_wait_ms: 4500
          research_explorer:
            org_slug: 'school-of-arts-languages-and-cultures'
            page_size: 300

    - name: 'School of Environment, Education and Development'
      url: 'https://research.manchester.ac.uk/en/organisations/school-of-environment-education-and-development/persons/'
      keywords:
        - 'environment'
        - 'education'
        - 'development'
        - 'planning'
        - 'architecture'
        - 'urban'
        - 'geography'
        - 'sustainability'
      faculty_pages:
        - url: 'https://research.manchester.ac.uk/en/organisations/school-of-environment-education-and-development/persons/'
          extract_method: 'research_explorer'
          requires_scroll: true
          request:
            timeout_ms: 120000
            wait_until: 'networkidle'
            wait_for_selector: 'a.link.person'
            post_wait_ms: 4500
          research_explorer:
            org_slug: 'school-of-environment-education-and-development'
            page_size: 300
# Where and how postgraduate programme listings are located and parsed.
#
# NOTE(review): nesting was reconstructed from a copy whose indentation was
# lost. The keys from `metadata_keyword_field` through
# `skip_program_faculty_lookup` are assumed to belong to `global_catalog`
# -- confirm against the config loader.
programs:
  paths_to_try:
    - '/study/masters/courses/list/'

  link_patterns:
    - text_contains:
        - 'masters'
        - 'postgraduate'
        - 'graduate'
      href_contains:
        - '/courses/list'
        - '/study/masters'
        - '/study/postgraduate'

  # CSS selectors; comma-separated alternatives are kept in one string.
  selectors:
    program_item: 'li.course-item, article.course, .course-listing a'
    program_name: '.course-title, h3, .title'
    program_url: 'a[href]'
    degree_type: '.course-award, .badge'

  request:
    timeout_ms: 40000
    wait_until: 'domcontentloaded'
    post_wait_ms: 2500

  global_catalog:
    url: 'https://www.manchester.ac.uk/study/masters/courses/list/'
    request:
      timeout_ms: 60000
      wait_until: 'networkidle'
      # NOTE(review): every other request mapping in this file names this
      # setting `post_wait_ms`; verify the consumer actually reads
      # `wait_after_ms` here.
      wait_after_ms: 3000
    metadata_keyword_field: 'keywords'
    assign_by_school_keywords: true
    assign_if_no_keywords: false
    allow_multiple_assignments: false
    per_school_limit: 200
    skip_program_faculty_lookup: true
# Strategies for locating faculty/staff pages, listed in file order.
#
# NOTE(review): nesting was reconstructed from a copy whose indentation was
# lost; each `request` mapping is assumed to belong to the strategy entry
# it follows -- confirm against the config loader.
faculty:
  discovery_strategies:
    - type: 'link_in_page'
      patterns:
        - text_contains:
            - 'people'
            - 'faculty'
            - 'staff'
            - 'directory'
          href_contains:
            - '/people'
            - '/faculty'
            - '/staff'
      request:
        timeout_ms: 30000
        wait_until: 'domcontentloaded'
        post_wait_ms: 1500

    - type: 'url_pattern'
      # Templates start with '{', so they must stay quoted to remain strings.
      patterns:
        - '{program_url}/people'
        - '{program_url}/faculty'
        - '{school_url}/people'
        - '{school_url}/staff'
      request:
        timeout_ms: 30000
        wait_until: 'domcontentloaded'
        post_wait_ms: 1500

    - type: 'school_directory'
      assign_to_all: false
      match_by_school_keywords: true
      metadata_keyword_field: 'keywords'
      # This request mapping sets no `wait_until`, unlike the two above.
      request:
        timeout_ms: 120000
        post_wait_ms: 3500
# Degree-type include/exclude lists plus a (currently empty) school
# exclusion list.
filters:
  program_degree_types:
    include:
      - 'MSc'
      - 'MA'
      - 'MBA'
      - 'MEng'
      - 'LLM'
      - 'MRes'
    exclude:
      - 'PhD'
      - 'Bachelor'
      - 'BSc'
      - 'BA'
      - 'PGCert'
  exclude_schools: []
# Browser-session settings.
playwright:
  stealth: true
  user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
  extra_headers:
    Accept-Language: 'en-US,en;q=0.9'
  # Explicit empty lists rather than bare keys, which would parse as null.
  cookies: []
  add_init_scripts: []