# Commit summary:
#   - Add src/university_scraper module with scraper, analyzer, and CLI
#   - Add backend FastAPI service with API endpoints and database models
#   - Add frontend React app with university management pages
#   - Add configs for Harvard, Manchester, and UCL universities
#   - Add artifacts with various scraper implementations
#   - Add Docker compose configuration for deployment
#   - Update .gitignore to exclude generated files
#
#   🤖 Generated with [Claude Code](https://claude.com/claude-code)
#   Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
#
# File listing: 96 lines, 2.9 KiB, YAML
# Identity of the institution this config describes.
# The name is a placeholder and the URL points at an example domain;
# replace both before using this template.
university:
  name: "REPLACE_UNIVERSITY_NAME"
  url: "https://www.example.ac.uk/"
  country: "United Kingdom"
# Schools/departments and the staff-listing pages attached to each one.
# discovery_method is "static_list", and the entries are enumerated by hand
# under static_list.
# NOTE(review): each `request` mapping is nested under the faculty page entry
# it follows; confirm against the loader's schema that it is per-page rather
# than per-school.
schools:
  discovery_method: "static_list"
  static_list:
    - name: "Department of Computer Science"
      url: "https://www.example.ac.uk/about/people/academic-and-research-staff/"
      keywords:
        - "computer"
        - "software"
        - "artificial intelligence"
        - "data science"
      faculty_pages:
        # Listing page configured with scrolling options.
        - url: "https://www.example.ac.uk/about/people/academic-and-research-staff/"
          extract_method: "links"
          requires_scroll: true
          scroll_times: 6
          scroll_delay_ms: 600
          blocked_resources: ["image", "font", "media"]
        # Listing page configured with a "load more" button selector.
        - url: "https://www.example.ac.uk/about/people/"
          extract_method: "links"
          load_more_selector: "button.load-more"
          max_load_more: 5
          request:
            timeout_ms: 45000
            wait_until: "domcontentloaded"
            post_wait_ms: 2000
    - name: "Department of Physics"
      url: "https://www.example.ac.uk/physics/about/people/"
      keywords:
        - "physics"
        - "astronomy"
        - "material science"
      faculty_pages:
        - url: "https://www.example.ac.uk/physics/about/people/academic-staff/"
          extract_method: "table"
          request:
            timeout_ms: 60000
            wait_until: "domcontentloaded"
            post_wait_ms: 2000
# Settings for locating and parsing the taught-programme listing.
# paths_to_try holds site-relative paths; link_patterns holds text/href
# substrings; selectors holds CSS selectors for each field of a listing item.
# NOTE(review): how the scraper combines paths_to_try with link_patterns is
# not visible in this file; confirm against the scraper implementation.
programs:
  paths_to_try:
    - "/study/masters/courses/a-to-z/"
    - "/study/masters/courses/list/"
  link_patterns:
    - text_contains: ["courses", "masters", "postgraduate"]
      href_contains: ["/study/", "/masters/", "/courses/"]
  selectors:
    program_item: ".course-card, li.course, article.course"
    program_name: ".course-title, h3, .title"
    program_url: "a[href]"
    degree_type: ".award, .badge"
  request:
    timeout_ms: 35000
    wait_until: "domcontentloaded"
    post_wait_ms: 2000
# Strategies for discovering staff pages. Each list entry is one strategy,
# identified by `type`, and carries its own `request` settings.
# NOTE(review): whether strategies are tried in order or all applied is not
# visible in this file; confirm against the scraper implementation.
faculty:
  discovery_strategies:
    # Matches links by their text and href substrings.
    - type: "link_in_page"
      patterns:
        - text_contains: ["people", "faculty", "team", "staff"]
          href_contains: ["/people", "/faculty", "/staff"]
      request:
        timeout_ms: 25000
        wait_until: "domcontentloaded"
        post_wait_ms: 1500
    # URL templates with {program_url} / {school_url} placeholders.
    - type: "url_pattern"
      patterns:
        - "{program_url}/people"
        - "{program_url}/staff"
        - "{school_url}/people"
        - "{school_url}/contact/staff"
      request:
        timeout_ms: 25000
        wait_until: "domcontentloaded"
        post_wait_ms: 1500
    # Presumably uses the per-school `keywords` lists for matching, given
    # metadata_keyword_field is "keywords"; verify against the scraper.
    - type: "school_directory"
      assign_to_all: false
      match_by_school_keywords: true
      metadata_keyword_field: "keywords"
      request:
        timeout_ms: 60000
        wait_for_selector: "a[href*='/people/'], table"
        post_wait_ms: 2000
# Result filters: degree types to include and exclude, plus a list of
# schools to exclude (empty in this template).
# NOTE(review): exclude_schools is placed as a sibling of
# program_degree_types; confirm against the loader's schema.
filters:
  program_degree_types:
    include: ["MSc", "MSci", "MA", "MBA", "MEng", "LLM"]
    exclude: ["PhD", "Bachelor", "BSc", "BA", "PGCert"]
  exclude_schools: []