Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
101
configs/templates/uk_research_explorer_template.yaml
Normal file
101
configs/templates/uk_research_explorer_template.yaml
Normal file
@ -0,0 +1,101 @@
|
||||
# Identity of the institution this template is instantiated for.
university:
  # Placeholder value: replace with the real university name.
  name: "REPLACE_UNIVERSITY_NAME"
  # Example domain: replace with the university's own site.
  url: "https://www.example.ac.uk/"
  country: "United Kingdom"

# School-level configuration.
# NOTE(review): the nesting below was reconstructed from the key order of a
# flattened copy of this file; confirm it against the config loader's schema.
schools:
  discovery_method: "static_list"
  # Values carry an `_ms` suffix, so they are presumably milliseconds.
  request:
    timeout_ms: 45000
    max_retries: 3
    retry_backoff_ms: 3000
  static_list:
    # Example schools based on Research Explorer (Pure Portal)
    - name: "School of Engineering"
      url: "https://research.example.ac.uk/en/organisations/school-of-engineering/persons/"
      keywords:
        - "engineering"
        - "mechanical"
        - "civil"
        - "materials"
      faculty_pages:
        - url: "https://research.example.ac.uk/en/organisations/school-of-engineering/persons/"
          extract_method: "research_explorer"
          requires_scroll: true
          request:
            timeout_ms: 120000
            wait_until: "networkidle"
            post_wait_ms: 5000
          # org_slug repeats the organisation segment of the URL above.
          research_explorer:
            org_slug: "school-of-engineering"
            page_size: 400
    - name: "Faculty of Humanities"
      url: "https://research.example.ac.uk/en/organisations/faculty-of-humanities/persons/"
      keywords:
        - "arts"
        - "languages"
        - "history"
        - "philosophy"
      faculty_pages:
        - url: "https://research.example.ac.uk/en/organisations/faculty-of-humanities/persons/"
          extract_method: "research_explorer"
          requires_scroll: true
          request:
            timeout_ms: 120000
            wait_until: "networkidle"
            post_wait_ms: 4500
          # org_slug repeats the organisation segment of the URL above.
          research_explorer:
            org_slug: "faculty-of-humanities"
            page_size: 300

# Programme (course) listing configuration.
# NOTE(review): nesting reconstructed from a flattened copy; confirm against
# the config loader's schema.
programs:
  # Site-relative paths; presumably tried in order against university.url.
  paths_to_try:
    - "/study/masters/courses/list/"
    - "/study/postgraduate/courses/list/"
  link_patterns:
    - text_contains: ["masters", "postgraduate", "graduate"]
      href_contains: ["/courses/", "/study/", "/programmes/"]
  # CSS selectors; comma-separated alternatives within each value.
  selectors:
    program_item: "li.course-item, article.course-card, a.course-link"
    program_name: ".course-title, h3, .title"
    program_url: "a[href]"
    degree_type: ".course-award, .badge"
  request:
    timeout_ms: 40000
    wait_until: "domcontentloaded"
    post_wait_ms: 2500

# Faculty discovery configuration: a list of strategies, each with its own
# request settings.
# NOTE(review): nesting reconstructed from a flattened copy; confirm against
# the config loader's schema.
faculty:
  discovery_strategies:
    - type: "link_in_page"
      patterns:
        - text_contains: ["faculty", "people", "staff", "directory"]
          href_contains: ["/faculty", "/people", "/staff"]
      request:
        timeout_ms: 30000
        wait_until: "domcontentloaded"
        post_wait_ms: 1500
    - type: "url_pattern"
      # `{program_url}` / `{school_url}` look like placeholders filled in by
      # the scraper; verify the supported names in the consumer.
      patterns:
        - "{program_url}/people"
        - "{program_url}/faculty"
        - "{school_url}/people"
        - "{school_url}/staff"
      request:
        timeout_ms: 30000
        wait_until: "domcontentloaded"
        post_wait_ms: 1500
    - type: "school_directory"
      assign_to_all: false
      match_by_school_keywords: true
      # "keywords" is also the key name used on each schools.static_list
      # entry; presumably this selects that field (TODO confirm).
      metadata_keyword_field: "keywords"
      request:
        timeout_ms: 120000
        wait_for_selector: "a.link.person"
        post_wait_ms: 4000

# Result filters.
# NOTE(review): nesting reconstructed from a flattened copy; confirm against
# the config loader's schema.
filters:
  program_degree_types:
    include: ["MSc", "MA", "MBA", "MEng", "LLM", "MRes"]
    exclude: ["PhD", "Bachelor", "BSc", "BA"]
  # Explicit empty list (a bare `exclude_schools:` would parse as null).
  exclude_schools: []
Reference in New Issue
Block a user