Files
University-Playwright-Codeg…/configs/harvard.yaml
yangxiaoyu-crypto 426cf4d2cd Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 15:25:08 +08:00

144 lines
3.8 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

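# Harvard University scraper configuration
# Organized as a hierarchy: school -> program -> faculty advisor
#
# Harvard-specific note: there is a centralized program listing page from
# which all programs can be obtained; programs are then linked to each school
# and its faculty information through the GSAS pages.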
# Identity of the university this configuration describes.
university:
  name: "Harvard University"
  url: "https://www.harvard.edu/"
  country: "USA"
# Level 1: list of schools.
schools:
  # NOTE(review): "static_list" presumably means the entries below are used
  # as-is instead of being discovered by crawling - confirm with the consumer.
  discovery_method: "static_list"
  static_list:
    # Graduate School of Arts and Sciences - the main hub for graduate programs
    - name: "Graduate School of Arts and Sciences (GSAS)"
      url: "https://gsas.harvard.edu/"
    # School of Engineering and Applied Sciences
    - name: "John A. Paulson School of Engineering and Applied Sciences (SEAS)"
      url: "https://seas.harvard.edu/"
    # Business School
    - name: "Harvard Business School (HBS)"
      url: "https://www.hbs.edu/"
    # School of Design
    - name: "Graduate School of Design (GSD)"
      url: "https://www.gsd.harvard.edu/"
    # School of Education
    - name: "Graduate School of Education (HGSE)"
      url: "https://www.gse.harvard.edu/"
    # Kennedy School of Government
    - name: "Harvard Kennedy School (HKS)"
      url: "https://www.hks.harvard.edu/"
    # Law School
    - name: "Harvard Law School (HLS)"
      url: "https://hls.harvard.edu/"
    # Medical School
    - name: "Harvard Medical School (HMS)"
      url: "https://hms.harvard.edu/"
    # School of Public Health
    - name: "T.H. Chan School of Public Health (HSPH)"
      url: "https://www.hsph.harvard.edu/"
    # Divinity School
    - name: "Harvard Divinity School (HDS)"
      url: "https://hds.harvard.edu/"
    # School of Dental Medicine
    - name: "Harvard School of Dental Medicine (HSDM)"
      url: "https://hsdm.harvard.edu/"
# Level 2: program discovery configuration.
programs:
  # Paths to try on a school's website when looking for its program listing.
  paths_to_try:
    - "/programs"
    - "/academics/programs"
    - "/academics/graduate-programs"
    - "/academics/masters-programs"
    - "/graduate"
    - "/degrees"
    - "/academics"
  # Link patterns used to find the program-listing page from a school's
  # home page. Each entry pairs link-text fragments with href fragments.
  # NOTE(review): whether text and href must both match, and whether matching
  # is case-insensitive, is decided by the consumer - confirm.
  link_patterns:
    - text_contains: ["program", "degree", "academics"]
      href_contains: ["/program", "/degree", "/academic"]
    - text_contains: ["master", "graduate"]
      href_contains: ["/master", "/graduate"]
  # CSS selectors for the program-listing page. Each value is a
  # comma-separated selector group.
  selectors:
    program_item: "div.program-item, li.program, .degree-program, article.program, a[href*='/program']"
    program_name: "h3, h4, .title, .program-title, .name"
    program_url: "a[href]"
    degree_type: ".degree, .credential, .degree-type"
  # Pagination configuration.
  pagination:
    type: "none"
# Level 3: faculty (advisor) discovery configuration.
faculty:
  # Two strategy entries: "link_in_page" and "url_pattern".
  # NOTE(review): presumably tried in the order listed - confirm.
  discovery_strategies:
    # Look for links on the page whose text/href contain these fragments.
    - type: "link_in_page"
      patterns:
        - text_contains: ["faculty", "people", "advisor"]
          href_contains: ["/faculty", "/people", "/advisor"]
        - text_contains: ["see list", "view all"]
          href_contains: ["/people", "/faculty"]
    # Candidate URLs built from templates with {program_url} / {school_url}
    # placeholders.
    # NOTE(review): the school URLs in schools.static_list end with "/", so
    # "{school_url}/faculty" expands to ".../​/faculty"-style double slashes
    # if the placeholder is substituted verbatim - confirm that the consumer
    # normalizes the joined URL.
    - type: "url_pattern"
      patterns:
        - "{program_url}/faculty"
        - "{program_url}/people"
        - "{school_url}/faculty"
        - "{school_url}/people"
  # CSS selectors for the faculty listing page. Each value is a
  # comma-separated selector group.
  selectors:
    faculty_item: "div.faculty, li.person, .profile-card, article.person"
    faculty_name: "h3, h4, .name, .title a"
    faculty_url: "a[href*='/people/'], a[href*='/faculty/'], a[href*='/profile/']"
    faculty_title: ".title, .position, .role, .job-title"
# Filtering rules.
filters:
  # Degree-type strings to keep (include) and to drop (exclude).
  # NOTE(review): how these strings are compared (substring vs. exact match,
  # case sensitivity, precedence when both lists match) is decided by the
  # consumer - confirm. Some entries have no trailing period
  # (e.g. "M.P.P", "M.Eng"); keep them as written unless matching is exact.
  program_degree_types:
    include:
      - "Master"
      - "M.S."
      - "M.A."
      - "MBA"
      - "M.Eng"
      - "M.Ed"
      - "M.P.P"
      - "M.P.A"
      - "M.Arch"
      - "M.L.A"
      - "M.Div"
      - "M.T.S"
      - "LL.M"
      - "S.M."
      - "A.M."
      - "A.L.M."
    exclude:
      - "Ph.D."
      - "Doctor"
      - "Bachelor"
      - "B.S."
      - "B.A."
      - "Certificate"
      - "Undergraduate"
  # Explicit empty list: no schools are listed for exclusion.
  exclude_schools: []