Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
143
configs/harvard.yaml
Normal file
143
configs/harvard.yaml
Normal file
@ -0,0 +1,143 @@
|
||||
# Harvard University 爬虫配置
|
||||
# 按照 学院 → 项目 → 导师 的层级结构组织
|
||||
#
|
||||
# Harvard的特殊情况:有一个集中的项目列表页面,可以从那里获取所有项目
|
||||
# 然后通过GSAS页面关联到各学院和导师信息
|
||||
|
||||
university:
|
||||
name: "Harvard University"
|
||||
url: "https://www.harvard.edu/"
|
||||
country: "USA"
|
||||
|
||||
# 第一层:学院列表
|
||||
schools:
|
||||
discovery_method: "static_list"
|
||||
|
||||
static_list:
|
||||
# 文理研究生院 - 最主要的研究生项目集中地
|
||||
- name: "Graduate School of Arts and Sciences (GSAS)"
|
||||
url: "https://gsas.harvard.edu/"
|
||||
|
||||
# 工程与应用科学学院
|
||||
- name: "John A. Paulson School of Engineering and Applied Sciences (SEAS)"
|
||||
url: "https://seas.harvard.edu/"
|
||||
|
||||
# 商学院
|
||||
- name: "Harvard Business School (HBS)"
|
||||
url: "https://www.hbs.edu/"
|
||||
|
||||
# 设计学院
|
||||
- name: "Graduate School of Design (GSD)"
|
||||
url: "https://www.gsd.harvard.edu/"
|
||||
|
||||
# 教育学院
|
||||
- name: "Graduate School of Education (HGSE)"
|
||||
url: "https://www.gse.harvard.edu/"
|
||||
|
||||
# 肯尼迪政府学院
|
||||
- name: "Harvard Kennedy School (HKS)"
|
||||
url: "https://www.hks.harvard.edu/"
|
||||
|
||||
# 法学院
|
||||
- name: "Harvard Law School (HLS)"
|
||||
url: "https://hls.harvard.edu/"
|
||||
|
||||
# 医学院
|
||||
- name: "Harvard Medical School (HMS)"
|
||||
url: "https://hms.harvard.edu/"
|
||||
|
||||
# 公共卫生学院
|
||||
- name: "T.H. Chan School of Public Health (HSPH)"
|
||||
url: "https://www.hsph.harvard.edu/"
|
||||
|
||||
# 神学院
|
||||
- name: "Harvard Divinity School (HDS)"
|
||||
url: "https://hds.harvard.edu/"
|
||||
|
||||
# 牙医学院
|
||||
- name: "Harvard School of Dental Medicine (HSDM)"
|
||||
url: "https://hsdm.harvard.edu/"
|
||||
|
||||
# 第二层:项目发现配置
|
||||
programs:
|
||||
# 在学院网站上尝试这些路径来查找项目列表
|
||||
paths_to_try:
|
||||
- "/programs"
|
||||
- "/academics/programs"
|
||||
- "/academics/graduate-programs"
|
||||
- "/academics/masters-programs"
|
||||
- "/graduate"
|
||||
- "/degrees"
|
||||
- "/academics"
|
||||
|
||||
# 从学院首页查找项目列表页面的链接模式
|
||||
link_patterns:
|
||||
- text_contains: ["program", "degree", "academics"]
|
||||
href_contains: ["/program", "/degree", "/academic"]
|
||||
- text_contains: ["master", "graduate"]
|
||||
href_contains: ["/master", "/graduate"]
|
||||
|
||||
# 项目列表页面的选择器
|
||||
selectors:
|
||||
program_item: "div.program-item, li.program, .degree-program, article.program, a[href*='/program']"
|
||||
program_name: "h3, h4, .title, .program-title, .name"
|
||||
program_url: "a[href]"
|
||||
degree_type: ".degree, .credential, .degree-type"
|
||||
|
||||
# 分页配置
|
||||
pagination:
|
||||
type: "none"
|
||||
|
||||
# 第三层:导师发现配置
|
||||
faculty:
|
||||
discovery_strategies:
|
||||
- type: "link_in_page"
|
||||
patterns:
|
||||
- text_contains: ["faculty", "people", "advisor"]
|
||||
href_contains: ["/faculty", "/people", "/advisor"]
|
||||
- text_contains: ["see list", "view all"]
|
||||
href_contains: ["/people", "/faculty"]
|
||||
|
||||
- type: "url_pattern"
|
||||
patterns:
|
||||
- "{program_url}/faculty"
|
||||
- "{program_url}/people"
|
||||
- "{school_url}/faculty"
|
||||
- "{school_url}/people"
|
||||
|
||||
selectors:
|
||||
faculty_item: "div.faculty, li.person, .profile-card, article.person"
|
||||
faculty_name: "h3, h4, .name, .title a"
|
||||
faculty_url: "a[href*='/people/'], a[href*='/faculty/'], a[href*='/profile/']"
|
||||
faculty_title: ".title, .position, .role, .job-title"
|
||||
|
||||
# 过滤规则
|
||||
filters:
|
||||
program_degree_types:
|
||||
include:
|
||||
- "Master"
|
||||
- "M.S."
|
||||
- "M.A."
|
||||
- "MBA"
|
||||
- "M.Eng"
|
||||
- "M.Ed"
|
||||
- "M.P.P"
|
||||
- "M.P.A"
|
||||
- "M.Arch"
|
||||
- "M.L.A"
|
||||
- "M.Div"
|
||||
- "M.T.S"
|
||||
- "LL.M"
|
||||
- "S.M."
|
||||
- "A.M."
|
||||
- "A.L.M."
|
||||
exclude:
|
||||
- "Ph.D."
|
||||
- "Doctor"
|
||||
- "Bachelor"
|
||||
- "B.S."
|
||||
- "B.A."
|
||||
- "Certificate"
|
||||
- "Undergraduate"
|
||||
|
||||
exclude_schools: []
|
||||
331
configs/manchester.yaml
Normal file
331
configs/manchester.yaml
Normal file
@ -0,0 +1,331 @@
|
||||
university:
|
||||
name: "The University of Manchester"
|
||||
url: "https://www.manchester.ac.uk/"
|
||||
country: "United Kingdom"
|
||||
|
||||
schools:
|
||||
discovery_method: "static_list"
|
||||
request:
|
||||
timeout_ms: 45000
|
||||
max_retries: 3
|
||||
retry_backoff_ms: 3000
|
||||
static_list:
|
||||
- name: "Alliance Manchester Business School"
|
||||
url: "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/"
|
||||
keywords:
|
||||
- "accounting"
|
||||
- "finance"
|
||||
- "business"
|
||||
- "management"
|
||||
- "marketing"
|
||||
- "mba"
|
||||
- "economics"
|
||||
- "entrepreneurship"
|
||||
faculty_pages:
|
||||
- url: "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/"
|
||||
extract_method: "table"
|
||||
requires_scroll: true
|
||||
scroll_times: 6
|
||||
scroll_delay_ms: 700
|
||||
load_more_selector: "button.load-more, button.show-more"
|
||||
max_load_more: 5
|
||||
request:
|
||||
timeout_ms: 60000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 2500
|
||||
- name: "Department of Computer Science"
|
||||
url: "https://www.cs.manchester.ac.uk/about/people/academic-and-research-staff/"
|
||||
# Keywords for the Department of Computer Science entry. Per the templates
# README in this change, school keywords are used to cluster programs under
# a school.
keywords:
|
||||
- "computer"
|
||||
- "software"
|
||||
- "data science"
|
||||
- "artificial intelligence"
|
||||
# NOTE(review): the trailing space in "ai " looks deliberate, presumably so
# the keyword does not match "ai" inside longer words. That only works if
# the matcher does not trim keywords - confirm against the loader.
- "ai "
|
||||
- "machine learning"
|
||||
- "cyber"
|
||||
- "computing"
|
||||
faculty_pages:
|
||||
- url: "https://www.cs.manchester.ac.uk/about/people/academic-and-research-staff/"
|
||||
extract_method: "links"
|
||||
requires_scroll: true
|
||||
scroll_times: 6
|
||||
scroll_delay_ms: 700
|
||||
blocked_resources: ["image", "font", "media"]
|
||||
- url: "https://www.cs.manchester.ac.uk/about/people/"
|
||||
extract_method: "links"
|
||||
load_more_selector: "button.load-more"
|
||||
max_load_more: 5
|
||||
request:
|
||||
timeout_ms: 45000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 2000
|
||||
- name: "Department of Physics and Astronomy"
|
||||
url: "https://www.physics.manchester.ac.uk/about/people/academic-and-research-staff/"
|
||||
keywords:
|
||||
- "physics"
|
||||
- "astronomy"
|
||||
- "astrophysics"
|
||||
- "nuclear"
|
||||
- "particle"
|
||||
faculty_pages:
|
||||
- url: "https://www.physics.manchester.ac.uk/about/people/academic-and-research-staff/"
|
||||
extract_method: "links"
|
||||
requires_scroll: true
|
||||
scroll_times: 5
|
||||
scroll_delay_ms: 700
|
||||
- name: "Department of Electrical and Electronic Engineering"
|
||||
url: "https://www.eee.manchester.ac.uk/about/people/academic-and-research-staff/"
|
||||
keywords:
|
||||
- "electrical"
|
||||
- "electronic"
|
||||
- "eee"
|
||||
- "power systems"
|
||||
- "microelectronics"
|
||||
faculty_pages:
|
||||
- url: "https://www.eee.manchester.ac.uk/about/people/academic-and-research-staff/"
|
||||
extract_method: "links"
|
||||
requires_scroll: true
|
||||
scroll_times: 6
|
||||
scroll_delay_ms: 700
|
||||
- name: "Department of Chemistry"
|
||||
url: "https://research.manchester.ac.uk/en/organisations/department-of-chemistry/persons/"
|
||||
keywords:
|
||||
- "chemistry"
|
||||
- "chemical"
|
||||
faculty_pages:
|
||||
- url: "https://research.manchester.ac.uk/en/organisations/department-of-chemistry/persons/"
|
||||
extract_method: "research_explorer"
|
||||
requires_scroll: true
|
||||
request:
|
||||
timeout_ms: 120000
|
||||
wait_until: "networkidle"
|
||||
wait_for_selector: "a.link.person"
|
||||
post_wait_ms: 5000
|
||||
research_explorer:
|
||||
org_slug: "department-of-chemistry"
|
||||
page_size: 200
|
||||
- name: "Department of Mathematics"
|
||||
url: "https://research.manchester.ac.uk/en/organisations/department-of-mathematics/persons/"
|
||||
keywords:
|
||||
- "mathematics"
|
||||
- "statistics"
|
||||
- "applied math"
|
||||
- "actuarial"
|
||||
faculty_pages:
|
||||
- url: "https://research.manchester.ac.uk/en/organisations/department-of-mathematics/persons/"
|
||||
extract_method: "research_explorer"
|
||||
requires_scroll: true
|
||||
request:
|
||||
timeout_ms: 120000
|
||||
wait_until: "networkidle"
|
||||
wait_for_selector: "a.link.person"
|
||||
post_wait_ms: 4500
|
||||
research_explorer:
|
||||
org_slug: "department-of-mathematics"
|
||||
page_size: 200
|
||||
- name: "School of Engineering"
|
||||
url: "https://research.manchester.ac.uk/en/organisations/school-of-engineering/persons/"
|
||||
keywords:
|
||||
- "engineering"
|
||||
- "mechanical"
|
||||
- "aerospace"
|
||||
- "civil"
|
||||
- "materials"
|
||||
faculty_pages:
|
||||
- url: "https://research.manchester.ac.uk/en/organisations/school-of-engineering/persons/"
|
||||
extract_method: "research_explorer"
|
||||
requires_scroll: true
|
||||
request:
|
||||
timeout_ms: 120000
|
||||
wait_until: "networkidle"
|
||||
wait_for_selector: "a.link.person"
|
||||
post_wait_ms: 4500
|
||||
research_explorer:
|
||||
org_slug: "school-of-engineering"
|
||||
page_size: 400
|
||||
- name: "Faculty of Biology, Medicine and Health"
|
||||
url: "https://research.manchester.ac.uk/en/organisations/faculty-of-biology-medicine-and-health/persons/"
|
||||
keywords:
|
||||
- "medicine"
|
||||
- "medical"
|
||||
- "health"
|
||||
- "nursing"
|
||||
- "pharmacy"
|
||||
- "clinical"
|
||||
- "dental"
|
||||
- "optometry"
|
||||
- "biology"
|
||||
- "biomedical"
|
||||
- "psychology"
|
||||
faculty_pages:
|
||||
- url: "https://research.manchester.ac.uk/en/organisations/faculty-of-biology-medicine-and-health/persons/"
|
||||
extract_method: "research_explorer"
|
||||
requires_scroll: true
|
||||
request:
|
||||
timeout_ms: 130000
|
||||
wait_until: "networkidle"
|
||||
wait_for_selector: "a.link.person"
|
||||
post_wait_ms: 4500
|
||||
research_explorer:
|
||||
org_slug: "faculty-of-biology-medicine-and-health"
|
||||
page_size: 400
|
||||
- name: "School of Social Sciences"
|
||||
url: "https://research.manchester.ac.uk/en/organisations/school-of-social-sciences/persons/"
|
||||
keywords:
|
||||
- "sociology"
|
||||
- "politics"
|
||||
- "international"
|
||||
- "social"
|
||||
- "criminology"
|
||||
- "anthropology"
|
||||
- "philosophy"
|
||||
faculty_pages:
|
||||
- url: "https://research.manchester.ac.uk/en/organisations/school-of-social-sciences/persons/"
|
||||
extract_method: "research_explorer"
|
||||
requires_scroll: true
|
||||
request:
|
||||
timeout_ms: 120000
|
||||
wait_until: "networkidle"
|
||||
wait_for_selector: "a.link.person"
|
||||
post_wait_ms: 4500
|
||||
research_explorer:
|
||||
org_slug: "school-of-social-sciences"
|
||||
page_size: 200
|
||||
- name: "School of Law"
|
||||
url: "https://research.manchester.ac.uk/en/organisations/school-of-law/persons/"
|
||||
keywords:
|
||||
- "law"
|
||||
- "legal"
|
||||
- "llm"
|
||||
faculty_pages:
|
||||
- url: "https://research.manchester.ac.uk/en/organisations/school-of-law/persons/"
|
||||
extract_method: "research_explorer"
|
||||
requires_scroll: true
|
||||
request:
|
||||
timeout_ms: 120000
|
||||
wait_until: "networkidle"
|
||||
wait_for_selector: "a.link.person"
|
||||
post_wait_ms: 4500
|
||||
research_explorer:
|
||||
org_slug: "school-of-law"
|
||||
page_size: 200
|
||||
- name: "School of Arts, Languages and Cultures"
|
||||
url: "https://research.manchester.ac.uk/en/organisations/school-of-arts-languages-and-cultures/persons/"
|
||||
keywords:
|
||||
- "arts"
|
||||
- "languages"
|
||||
- "culture"
|
||||
- "music"
|
||||
- "drama"
|
||||
- "theatre"
|
||||
- "history"
|
||||
- "linguistics"
|
||||
- "literature"
|
||||
- "translation"
|
||||
- "archaeology"
|
||||
- "religion"
|
||||
faculty_pages:
|
||||
- url: "https://research.manchester.ac.uk/en/organisations/school-of-arts-languages-and-cultures/persons/"
|
||||
extract_method: "research_explorer"
|
||||
requires_scroll: true
|
||||
request:
|
||||
timeout_ms: 120000
|
||||
wait_until: "networkidle"
|
||||
wait_for_selector: "a.link.person"
|
||||
post_wait_ms: 4500
|
||||
research_explorer:
|
||||
org_slug: "school-of-arts-languages-and-cultures"
|
||||
page_size: 300
|
||||
- name: "School of Environment, Education and Development"
|
||||
url: "https://research.manchester.ac.uk/en/organisations/school-of-environment-education-and-development/persons/"
|
||||
keywords:
|
||||
- "environment"
|
||||
- "education"
|
||||
- "development"
|
||||
- "planning"
|
||||
- "architecture"
|
||||
- "urban"
|
||||
- "geography"
|
||||
- "sustainability"
|
||||
faculty_pages:
|
||||
- url: "https://research.manchester.ac.uk/en/organisations/school-of-environment-education-and-development/persons/"
|
||||
extract_method: "research_explorer"
|
||||
requires_scroll: true
|
||||
request:
|
||||
timeout_ms: 120000
|
||||
wait_until: "networkidle"
|
||||
wait_for_selector: "a.link.person"
|
||||
post_wait_ms: 4500
|
||||
research_explorer:
|
||||
org_slug: "school-of-environment-education-and-development"
|
||||
page_size: 300
|
||||
|
||||
programs:
|
||||
paths_to_try:
|
||||
- "/study/masters/courses/list/"
|
||||
link_patterns:
|
||||
- text_contains: ["masters", "postgraduate", "graduate"]
|
||||
href_contains: ["/courses/list", "/study/masters", "/study/postgraduate"]
|
||||
selectors:
|
||||
program_item: "li.course-item, article.course, .course-listing a"
|
||||
program_name: ".course-title, h3, .title"
|
||||
program_url: "a[href]"
|
||||
degree_type: ".course-award, .badge"
|
||||
request:
|
||||
timeout_ms: 40000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 2500
|
||||
global_catalog:
|
||||
url: "https://www.manchester.ac.uk/study/masters/courses/list/"
|
||||
# Request settings for the global course catalog page.
request:
|
||||
timeout_ms: 60000
|
||||
wait_until: "networkidle"
|
||||
# NOTE(review): every other request mapping in these configs names its wait
# key "post_wait_ms"; this is the only "wait_after_ms". Confirm the catalog
# loader actually reads this key - if it expects "post_wait_ms", this value
# is not read under that name.
wait_after_ms: 3000
|
||||
metadata_keyword_field: "keywords"
|
||||
assign_by_school_keywords: true
|
||||
assign_if_no_keywords: false
|
||||
allow_multiple_assignments: false
|
||||
per_school_limit: 200
|
||||
skip_program_faculty_lookup: true
|
||||
|
||||
faculty:
|
||||
discovery_strategies:
|
||||
- type: "link_in_page"
|
||||
patterns:
|
||||
- text_contains: ["people", "faculty", "staff", "directory"]
|
||||
href_contains: ["/people", "/faculty", "/staff"]
|
||||
request:
|
||||
timeout_ms: 30000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 1500
|
||||
- type: "url_pattern"
|
||||
patterns:
|
||||
- "{program_url}/people"
|
||||
- "{program_url}/faculty"
|
||||
- "{school_url}/people"
|
||||
- "{school_url}/staff"
|
||||
request:
|
||||
timeout_ms: 30000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 1500
|
||||
- type: "school_directory"
|
||||
assign_to_all: false
|
||||
match_by_school_keywords: true
|
||||
metadata_keyword_field: "keywords"
|
||||
request:
|
||||
timeout_ms: 120000
|
||||
post_wait_ms: 3500
|
||||
|
||||
filters:
|
||||
program_degree_types:
|
||||
include: ["MSc", "MA", "MBA", "MEng", "LLM", "MRes"]
|
||||
exclude: ["PhD", "Bachelor", "BSc", "BA", "PGCert"]
|
||||
exclude_schools: []
|
||||
|
||||
playwright:
|
||||
stealth: true
|
||||
user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"
|
||||
extra_headers:
|
||||
Accept-Language: "en-US,en;q=0.9"
|
||||
cookies: []
|
||||
add_init_scripts: []
|
||||
24
configs/templates/README.md
Normal file
24
configs/templates/README.md
Normal file
@ -0,0 +1,24 @@
|
||||
# 英国高校模板库
|
||||
|
||||
该目录存放针对英国大学常见站点结构的 ScraperConfig 模板片段,目标是让生成/调度脚本能够快速套用成熟的学院、项目、导师配置,并保持与 `src/university_scraper` 中的最新能力同步。
|
||||
|
||||
## 使用方式
|
||||
1. 复制需要的模板文件到 `configs/<university>.yaml`,并根据该学校的实际信息替换占位符(域名、学院 URL、Research Explorer 组织 slug 等)。
|
||||
2. 调整 `schools.static_list` 中的学院列表:
|
||||
- `keywords`:用于自动将项目聚类到学院;
|
||||
- `faculty_pages`:定义学院级导师目录(支持 `extract_method: table|links|research_explorer`、滚动/点击更多、独立请求参数)。
|
||||
3. 根据学校的课程导航方式,补全 `programs.paths_to_try`、`link_patterns`、`selectors` 与请求设置。
|
||||
4. `faculty.discovery_strategies` 推荐至少包含:
|
||||
- `link_in_page`:从项目页寻找“People/Faculty”链接;
|
||||
- `url_pattern`:补充常见 URL 模式;
|
||||
- `school_directory`:复用 `faculty_pages` 中的导师目录,将其按关键词分发到项目层。
|
||||
5. 运行 `python -m src.university_scraper.cli run --config configs/<university>.yaml --output output/<name>.json`(或在 Web 端触发任务)验证,并将本地结果与旧版对比。
|
||||
|
||||
## 模板列表
|
||||
|
||||
| 文件 | 适用场景 |
|
||||
|------|----------|
|
||||
| `uk_research_explorer_template.yaml` | 大多数使用 Pure Portal / Research Explorer 的英国大学(如曼大、UCL、帝国理工的人文社科学院)。 |
|
||||
| `uk_department_directory_template.yaml` | 传统院系官网列出 HTML Staff Directory 的学院(如各理工学院官网、独立学院站点)。 |
|
||||
|
||||
后续若发现新的页面类型(例如 SharePoint 列表、嵌入式 API 等),请在此目录增加新的模板文件,并在本 README 中更新说明。
|
||||
95
configs/templates/uk_department_directory_template.yaml
Normal file
95
configs/templates/uk_department_directory_template.yaml
Normal file
@ -0,0 +1,95 @@
|
||||
university:
|
||||
name: "REPLACE_UNIVERSITY_NAME"
|
||||
url: "https://www.example.ac.uk/"
|
||||
country: "United Kingdom"
|
||||
|
||||
schools:
|
||||
discovery_method: "static_list"
|
||||
static_list:
|
||||
- name: "Department of Computer Science"
|
||||
url: "https://www.example.ac.uk/about/people/academic-and-research-staff/"
|
||||
keywords:
|
||||
- "computer"
|
||||
- "software"
|
||||
- "artificial intelligence"
|
||||
- "data science"
|
||||
faculty_pages:
|
||||
- url: "https://www.example.ac.uk/about/people/academic-and-research-staff/"
|
||||
extract_method: "links"
|
||||
requires_scroll: true
|
||||
scroll_times: 6
|
||||
scroll_delay_ms: 600
|
||||
blocked_resources: ["image", "font", "media"]
|
||||
- url: "https://www.example.ac.uk/about/people/"
|
||||
extract_method: "links"
|
||||
load_more_selector: "button.load-more"
|
||||
max_load_more: 5
|
||||
request:
|
||||
timeout_ms: 45000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 2000
|
||||
- name: "Department of Physics"
|
||||
url: "https://www.example.ac.uk/physics/about/people/"
|
||||
keywords:
|
||||
- "physics"
|
||||
- "astronomy"
|
||||
- "material science"
|
||||
faculty_pages:
|
||||
- url: "https://www.example.ac.uk/physics/about/people/academic-staff/"
|
||||
extract_method: "table"
|
||||
request:
|
||||
timeout_ms: 60000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 2000
|
||||
|
||||
programs:
|
||||
paths_to_try:
|
||||
- "/study/masters/courses/a-to-z/"
|
||||
- "/study/masters/courses/list/"
|
||||
link_patterns:
|
||||
- text_contains: ["courses", "masters", "postgraduate"]
|
||||
href_contains: ["/study/", "/masters/", "/courses/"]
|
||||
selectors:
|
||||
program_item: ".course-card, li.course, article.course"
|
||||
program_name: ".course-title, h3, .title"
|
||||
program_url: "a[href]"
|
||||
degree_type: ".award, .badge"
|
||||
request:
|
||||
timeout_ms: 35000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 2000
|
||||
|
||||
faculty:
|
||||
discovery_strategies:
|
||||
- type: "link_in_page"
|
||||
patterns:
|
||||
- text_contains: ["people", "faculty", "team", "staff"]
|
||||
href_contains: ["/people", "/faculty", "/staff"]
|
||||
request:
|
||||
timeout_ms: 25000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 1500
|
||||
- type: "url_pattern"
|
||||
patterns:
|
||||
- "{program_url}/people"
|
||||
- "{program_url}/staff"
|
||||
- "{school_url}/people"
|
||||
- "{school_url}/contact/staff"
|
||||
request:
|
||||
timeout_ms: 25000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 1500
|
||||
- type: "school_directory"
|
||||
assign_to_all: false
|
||||
match_by_school_keywords: true
|
||||
metadata_keyword_field: "keywords"
|
||||
request:
|
||||
timeout_ms: 60000
|
||||
wait_for_selector: "a[href*='/people/'], table"
|
||||
post_wait_ms: 2000
|
||||
|
||||
filters:
|
||||
program_degree_types:
|
||||
include: ["MSc", "MSci", "MA", "MBA", "MEng", "LLM"]
|
||||
exclude: ["PhD", "Bachelor", "BSc", "BA", "PGCert"]
|
||||
exclude_schools: []
|
||||
101
configs/templates/uk_research_explorer_template.yaml
Normal file
101
configs/templates/uk_research_explorer_template.yaml
Normal file
@ -0,0 +1,101 @@
|
||||
university:
|
||||
name: "REPLACE_UNIVERSITY_NAME"
|
||||
url: "https://www.example.ac.uk/"
|
||||
country: "United Kingdom"
|
||||
|
||||
schools:
|
||||
discovery_method: "static_list"
|
||||
request:
|
||||
timeout_ms: 45000
|
||||
max_retries: 3
|
||||
retry_backoff_ms: 3000
|
||||
static_list:
|
||||
# 基于 Research Explorer (Pure Portal) 的学院示例
|
||||
- name: "School of Engineering"
|
||||
url: "https://research.example.ac.uk/en/organisations/school-of-engineering/persons/"
|
||||
keywords:
|
||||
- "engineering"
|
||||
- "mechanical"
|
||||
- "civil"
|
||||
- "materials"
|
||||
faculty_pages:
|
||||
- url: "https://research.example.ac.uk/en/organisations/school-of-engineering/persons/"
|
||||
extract_method: "research_explorer"
|
||||
requires_scroll: true
|
||||
request:
|
||||
timeout_ms: 120000
|
||||
wait_until: "networkidle"
|
||||
post_wait_ms: 5000
|
||||
research_explorer:
|
||||
org_slug: "school-of-engineering"
|
||||
page_size: 400
|
||||
- name: "Faculty of Humanities"
|
||||
url: "https://research.example.ac.uk/en/organisations/faculty-of-humanities/persons/"
|
||||
keywords:
|
||||
- "arts"
|
||||
- "languages"
|
||||
- "history"
|
||||
- "philosophy"
|
||||
faculty_pages:
|
||||
- url: "https://research.example.ac.uk/en/organisations/faculty-of-humanities/persons/"
|
||||
extract_method: "research_explorer"
|
||||
requires_scroll: true
|
||||
request:
|
||||
timeout_ms: 120000
|
||||
wait_until: "networkidle"
|
||||
post_wait_ms: 4500
|
||||
research_explorer:
|
||||
org_slug: "faculty-of-humanities"
|
||||
page_size: 300
|
||||
|
||||
programs:
|
||||
paths_to_try:
|
||||
- "/study/masters/courses/list/"
|
||||
- "/study/postgraduate/courses/list/"
|
||||
link_patterns:
|
||||
- text_contains: ["masters", "postgraduate", "graduate"]
|
||||
href_contains: ["/courses/", "/study/", "/programmes/"]
|
||||
selectors:
|
||||
program_item: "li.course-item, article.course-card, a.course-link"
|
||||
program_name: ".course-title, h3, .title"
|
||||
program_url: "a[href]"
|
||||
degree_type: ".course-award, .badge"
|
||||
request:
|
||||
timeout_ms: 40000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 2500
|
||||
|
||||
faculty:
|
||||
discovery_strategies:
|
||||
- type: "link_in_page"
|
||||
patterns:
|
||||
- text_contains: ["faculty", "people", "staff", "directory"]
|
||||
href_contains: ["/faculty", "/people", "/staff"]
|
||||
request:
|
||||
timeout_ms: 30000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 1500
|
||||
- type: "url_pattern"
|
||||
patterns:
|
||||
- "{program_url}/people"
|
||||
- "{program_url}/faculty"
|
||||
- "{school_url}/people"
|
||||
- "{school_url}/staff"
|
||||
request:
|
||||
timeout_ms: 30000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 1500
|
||||
- type: "school_directory"
|
||||
assign_to_all: false
|
||||
match_by_school_keywords: true
|
||||
metadata_keyword_field: "keywords"
|
||||
request:
|
||||
timeout_ms: 120000
|
||||
wait_for_selector: "a.link.person"
|
||||
post_wait_ms: 4000
|
||||
|
||||
filters:
|
||||
# Degree-type filter for discovered programs: "include" and "exclude" are
# lists of degree labels.
program_degree_types:
|
||||
include: ["MSc", "MA", "MBA", "MEng", "LLM", "MRes"]
|
||||
# NOTE(review): the Manchester and UCL configs and the department-directory
# template also exclude "PGCert"; it is absent here. Confirm the omission is
# intentional before copying this template.
exclude: ["PhD", "Bachelor", "BSc", "BA"]
|
||||
exclude_schools: []
|
||||
169
configs/ucl.yaml
Normal file
169
configs/ucl.yaml
Normal file
@ -0,0 +1,169 @@
|
||||
university:
|
||||
name: "University College London"
|
||||
url: "https://www.ucl.ac.uk/"
|
||||
country: "United Kingdom"
|
||||
|
||||
schools:
|
||||
discovery_method: "static_list"
|
||||
request:
|
||||
timeout_ms: 45000
|
||||
max_retries: 3
|
||||
retry_backoff_ms: 3000
|
||||
static_list:
|
||||
- name: "Faculty of Engineering Sciences"
|
||||
url: "https://www.ucl.ac.uk/engineering/people"
|
||||
keywords:
|
||||
- "engineering"
|
||||
- "mechanical"
|
||||
- "civil"
|
||||
- "materials"
|
||||
- "electronic"
|
||||
- "computer"
|
||||
faculty_pages:
|
||||
- url: "https://www.ucl.ac.uk/engineering/people"
|
||||
extract_method: "links"
|
||||
requires_scroll: true
|
||||
scroll_times: 8
|
||||
scroll_delay_ms: 600
|
||||
blocked_resources: ["image", "font", "media"]
|
||||
- url: "https://www.ucl.ac.uk/electronic-electrical-engineering/people/academic-staff"
|
||||
extract_method: "table"
|
||||
request:
|
||||
timeout_ms: 45000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 2000
|
||||
- name: "Faculty of Mathematical & Physical Sciences"
|
||||
url: "https://www.ucl.ac.uk/mathematical-physical-sciences/people"
|
||||
keywords:
|
||||
- "mathematics"
|
||||
- "physics"
|
||||
- "chemistry"
|
||||
- "earth sciences"
|
||||
- "astronomy"
|
||||
faculty_pages:
|
||||
- url: "https://www.ucl.ac.uk/mathematical-physical-sciences/people"
|
||||
extract_method: "links"
|
||||
requires_scroll: true
|
||||
scroll_times: 6
|
||||
scroll_delay_ms: 600
|
||||
- url: "https://www.ucl.ac.uk/physics-astronomy/people/academic-staff"
|
||||
extract_method: "links"
|
||||
- name: "Faculty of Arts & Humanities"
|
||||
url: "https://www.ucl.ac.uk/arts-humanities/people/academic-staff"
|
||||
keywords:
|
||||
- "arts"
|
||||
- "languages"
|
||||
- "culture"
|
||||
- "history"
|
||||
- "philosophy"
|
||||
- "translation"
|
||||
faculty_pages:
|
||||
- url: "https://www.ucl.ac.uk/arts-humanities/people/academic-staff"
|
||||
extract_method: "links"
|
||||
requires_scroll: true
|
||||
scroll_times: 6
|
||||
scroll_delay_ms: 600
|
||||
- name: "Faculty of Laws"
|
||||
url: "https://www.ucl.ac.uk/laws/people/academic-staff"
|
||||
keywords:
|
||||
- "law"
|
||||
- "legal"
|
||||
- "llm"
|
||||
faculty_pages:
|
||||
- url: "https://www.ucl.ac.uk/laws/people/academic-staff"
|
||||
extract_method: "links"
|
||||
requires_scroll: true
|
||||
scroll_times: 5
|
||||
scroll_delay_ms: 600
|
||||
- name: "Faculty of Social & Historical Sciences"
|
||||
url: "https://www.ucl.ac.uk/social-historical-sciences/people"
|
||||
keywords:
|
||||
- "social"
|
||||
- "economics"
|
||||
- "geography"
|
||||
- "anthropology"
|
||||
- "politics"
|
||||
- "history"
|
||||
faculty_pages:
|
||||
- url: "https://www.ucl.ac.uk/social-historical-sciences/people"
|
||||
extract_method: "links"
|
||||
requires_scroll: true
|
||||
scroll_times: 6
|
||||
scroll_delay_ms: 600
|
||||
- name: "Faculty of Brain Sciences"
|
||||
url: "https://www.ucl.ac.uk/brain-sciences/people"
|
||||
keywords:
|
||||
- "neuroscience"
|
||||
- "psychology"
|
||||
- "cognitive"
|
||||
- "biomedical"
|
||||
faculty_pages:
|
||||
- url: "https://www.ucl.ac.uk/brain-sciences/people"
|
||||
extract_method: "links"
|
||||
requires_scroll: true
|
||||
scroll_times: 6
|
||||
scroll_delay_ms: 600
|
||||
- name: "Faculty of the Built Environment (The Bartlett)"
|
||||
url: "https://www.ucl.ac.uk/bartlett/people/all"
|
||||
keywords:
|
||||
- "architecture"
|
||||
- "planning"
|
||||
- "urban"
|
||||
- "built environment"
|
||||
faculty_pages:
|
||||
- url: "https://www.ucl.ac.uk/bartlett/people/all"
|
||||
extract_method: "links"
|
||||
requires_scroll: true
|
||||
scroll_times: 10
|
||||
scroll_delay_ms: 600
|
||||
|
||||
programs:
|
||||
paths_to_try:
|
||||
- "/prospective-students/graduate/taught-degrees/"
|
||||
link_patterns:
|
||||
- text_contains: ["graduate", "taught", "masters", "postgraduate"]
|
||||
href_contains: ["/prospective-students/graduate", "/study/graduate", "/courses/"]
|
||||
selectors:
|
||||
program_item: ".view-content .view-row, li.listing__item, article.prog-card"
|
||||
program_name: ".listing__title, h3, .title"
|
||||
program_url: "a[href]"
|
||||
degree_type: ".listing__award, .award"
|
||||
request:
|
||||
timeout_ms: 40000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 2500
|
||||
|
||||
faculty:
|
||||
discovery_strategies:
|
||||
- type: "link_in_page"
|
||||
patterns:
|
||||
- text_contains: ["people", "faculty", "staff", "team"]
|
||||
href_contains: ["/people", "/faculty", "/staff", "/team"]
|
||||
request:
|
||||
timeout_ms: 30000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 1500
|
||||
- type: "url_pattern"
|
||||
patterns:
|
||||
- "{program_url}/people"
|
||||
- "{program_url}/staff"
|
||||
- "{school_url}/people"
|
||||
- "{school_url}/staff"
|
||||
request:
|
||||
timeout_ms: 30000
|
||||
wait_until: "domcontentloaded"
|
||||
post_wait_ms: 1500
|
||||
- type: "school_directory"
|
||||
assign_to_all: false
|
||||
match_by_school_keywords: true
|
||||
metadata_keyword_field: "keywords"
|
||||
request:
|
||||
timeout_ms: 60000
|
||||
wait_for_selector: "a[href*='/people/'], .person, .profile-card"
|
||||
post_wait_ms: 2500
|
||||
|
||||
filters:
|
||||
program_degree_types:
|
||||
include: ["MSc", "MSci", "MA", "MBA", "MEng", "LLM", "MRes"]
|
||||
exclude: ["PhD", "Bachelor", "BSc", "BA", "PGCert"]
|
||||
exclude_schools: []
|
||||
Reference in New Issue
Block a user