Files
yangxiaoyu-crypto 426cf4d2cd Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 15:25:08 +08:00

170 lines
5.3 KiB
YAML

# Scraper configuration for University College London (UCL).
# Identity of the institution this file describes.
university:
  name: "University College London"
  url: "https://www.ucl.ac.uk/"
  country: "United Kingdom"
# Schools (UCL faculties) and the staff-directory pages listed for each.
schools:
  # Schools are enumerated explicitly under `static_list` below.
  discovery_method: "static_list"
  # Request settings at the schools level. The `_ms` suffix suggests
  # milliseconds — confirm against the scraper implementation.
  request:
    timeout_ms: 45000
    max_retries: 3
    retry_backoff_ms: 3000
  # Each entry carries: name, landing url, keywords, and one or more
  # faculty_pages describing how a staff listing should be extracted.
  # `keywords` is the field named by `metadata_keyword_field` in the
  # `faculty` section's school_directory strategy.
  static_list:
    - name: "Faculty of Engineering Sciences"
      url: "https://www.ucl.ac.uk/engineering/people"
      keywords:
        - "engineering"
        - "mechanical"
        - "civil"
        - "materials"
        - "electronic"
        - "computer"
      faculty_pages:
        - url: "https://www.ucl.ac.uk/engineering/people"
          extract_method: "links"
          requires_scroll: true
          scroll_times: 8
          scroll_delay_ms: 600
          blocked_resources: ["image", "font", "media"]
        - url: "https://www.ucl.ac.uk/electronic-electrical-engineering/people/academic-staff"
          extract_method: "table"
          # NOTE(review): treated as a per-page request override; it may
          # instead belong to the school entry — confirm with the loader.
          request:
            timeout_ms: 45000
            wait_until: "domcontentloaded"
            post_wait_ms: 2000

    - name: "Faculty of Mathematical & Physical Sciences"
      url: "https://www.ucl.ac.uk/mathematical-physical-sciences/people"
      keywords:
        - "mathematics"
        - "physics"
        - "chemistry"
        - "earth sciences"
        - "astronomy"
      faculty_pages:
        - url: "https://www.ucl.ac.uk/mathematical-physical-sciences/people"
          extract_method: "links"
          requires_scroll: true
          scroll_times: 6
          scroll_delay_ms: 600
        - url: "https://www.ucl.ac.uk/physics-astronomy/people/academic-staff"
          extract_method: "links"

    - name: "Faculty of Arts & Humanities"
      url: "https://www.ucl.ac.uk/arts-humanities/people/academic-staff"
      # NOTE(review): "history" is also a keyword of the Faculty of Social &
      # Historical Sciences; keyword-based matching may be ambiguous.
      keywords:
        - "arts"
        - "languages"
        - "culture"
        - "history"
        - "philosophy"
        - "translation"
      faculty_pages:
        - url: "https://www.ucl.ac.uk/arts-humanities/people/academic-staff"
          extract_method: "links"
          requires_scroll: true
          scroll_times: 6
          scroll_delay_ms: 600

    - name: "Faculty of Laws"
      url: "https://www.ucl.ac.uk/laws/people/academic-staff"
      keywords:
        - "law"
        - "legal"
        - "llm"
      faculty_pages:
        - url: "https://www.ucl.ac.uk/laws/people/academic-staff"
          extract_method: "links"
          requires_scroll: true
          scroll_times: 5
          scroll_delay_ms: 600

    - name: "Faculty of Social & Historical Sciences"
      url: "https://www.ucl.ac.uk/social-historical-sciences/people"
      # NOTE(review): "history" is also a keyword of the Faculty of Arts &
      # Humanities; keyword-based matching may be ambiguous.
      keywords:
        - "social"
        - "economics"
        - "geography"
        - "anthropology"
        - "politics"
        - "history"
      faculty_pages:
        - url: "https://www.ucl.ac.uk/social-historical-sciences/people"
          extract_method: "links"
          requires_scroll: true
          scroll_times: 6
          scroll_delay_ms: 600

    - name: "Faculty of Brain Sciences"
      url: "https://www.ucl.ac.uk/brain-sciences/people"
      keywords:
        - "neuroscience"
        - "psychology"
        - "cognitive"
        - "biomedical"
      faculty_pages:
        - url: "https://www.ucl.ac.uk/brain-sciences/people"
          extract_method: "links"
          requires_scroll: true
          scroll_times: 6
          scroll_delay_ms: 600

    - name: "Faculty of the Built Environment (The Bartlett)"
      url: "https://www.ucl.ac.uk/bartlett/people/all"
      keywords:
        - "architecture"
        - "planning"
        - "urban"
        - "built environment"
      faculty_pages:
        - url: "https://www.ucl.ac.uk/bartlett/people/all"
          extract_method: "links"
          requires_scroll: true
          scroll_times: 10
          scroll_delay_ms: 600
# How taught graduate programme listings are located and parsed.
programs:
  # Paths to probe for a programme listing (presumably resolved against
  # the school or university URL — confirm against the scraper).
  paths_to_try:
    - "/prospective-students/graduate/taught-degrees/"
  # Link text / href substrings used to recognise programme-listing links.
  link_patterns:
    - text_contains:
        - "graduate"
        - "taught"
        - "masters"
        - "postgraduate"
      href_contains:
        - "/prospective-students/graduate"
        - "/study/graduate"
        - "/courses/"
  # CSS selectors; comma-separated alternatives within each value.
  selectors:
    program_item: ".view-content .view-row, li.listing__item, article.prog-card"
    program_name: ".listing__title, h3, .title"
    program_url: "a[href]"
    degree_type: ".listing__award, .award"
  # Page-load settings for programme pages (`_ms` suggests milliseconds).
  request:
    timeout_ms: 40000
    wait_until: "domcontentloaded"
    post_wait_ms: 2500
# Strategies for locating staff/people pages. Each strategy carries its
# own `request` settings (`_ms` suggests milliseconds — confirm).
faculty:
  discovery_strategies:
    # Match links by visible text or href substring.
    - type: "link_in_page"
      patterns:
        - text_contains: ["people", "faculty", "staff", "team"]
          href_contains: ["/people", "/faculty", "/staff", "/team"]
      request:
        timeout_ms: 30000
        wait_until: "domcontentloaded"
        post_wait_ms: 1500

    # Candidate URLs built from templates; `{program_url}` and
    # `{school_url}` are placeholders presumably filled in by the scraper.
    - type: "url_pattern"
      patterns:
        - "{program_url}/people"
        - "{program_url}/staff"
        - "{school_url}/people"
        - "{school_url}/staff"
      request:
        timeout_ms: 30000
        wait_until: "domcontentloaded"
        post_wait_ms: 1500

    # Uses the school directory pages; `metadata_keyword_field` names the
    # `keywords` field defined on each `schools.static_list` entry.
    - type: "school_directory"
      assign_to_all: false
      match_by_school_keywords: true
      metadata_keyword_field: "keywords"
      request:
        timeout_ms: 60000
        wait_for_selector: "a[href*='/people/'], .person, .profile-card"
        post_wait_ms: 2500
# Filters applied to scraped results.
filters:
  # Degree-type labels to keep and to drop.
  program_degree_types:
    include: ["MSc", "MSci", "MA", "MBA", "MEng", "LLM", "MRes"]
    exclude: ["PhD", "Bachelor", "BSc", "BA", "PGCert"]
  # Schools to skip; empty means none are excluded.
  # NOTE(review): placed under `filters` — confirm it is not a top-level key.
  exclude_schools: []