Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
yangxiaoyu-crypto
2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions

143
configs/harvard.yaml Normal file
View File

@ -0,0 +1,143 @@
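# Harvard University scraper configuration
# Organized as a three-level hierarchy: school → program → faculty advisor
#
# Harvard-specific note: there is a centralized program listing page from which
# all programs can be collected; they are then linked to the individual schools
# and to faculty information through the GSAS pages.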
# Identity of the university this configuration describes.
university:
  name: "Harvard University"
  url: "https://www.harvard.edu/"
  country: "USA"
# Level 1: list of schools.
schools:
  # NOTE(review): "static_list" presumably makes the scraper use the entries
  # below instead of discovering schools dynamically — confirm in the consumer.
  discovery_method: "static_list"
  static_list:
    # Graduate School of Arts and Sciences - where most graduate programs are concentrated
    - name: "Graduate School of Arts and Sciences (GSAS)"
      url: "https://gsas.harvard.edu/"
    # School of Engineering and Applied Sciences
    - name: "John A. Paulson School of Engineering and Applied Sciences (SEAS)"
      url: "https://seas.harvard.edu/"
    # Business School
    - name: "Harvard Business School (HBS)"
      url: "https://www.hbs.edu/"
    # School of Design
    - name: "Graduate School of Design (GSD)"
      url: "https://www.gsd.harvard.edu/"
    # School of Education
    - name: "Graduate School of Education (HGSE)"
      url: "https://www.gse.harvard.edu/"
    # Kennedy School of Government
    - name: "Harvard Kennedy School (HKS)"
      url: "https://www.hks.harvard.edu/"
    # Law School
    - name: "Harvard Law School (HLS)"
      url: "https://hls.harvard.edu/"
    # Medical School
    - name: "Harvard Medical School (HMS)"
      url: "https://hms.harvard.edu/"
    # School of Public Health
    - name: "T.H. Chan School of Public Health (HSPH)"
      url: "https://www.hsph.harvard.edu/"
    # Divinity School
    - name: "Harvard Divinity School (HDS)"
      url: "https://hds.harvard.edu/"
    # School of Dental Medicine
    - name: "Harvard School of Dental Medicine (HSDM)"
      url: "https://hsdm.harvard.edu/"
# Level 2: program discovery configuration.
programs:
  # Paths to try on each school's website when looking for the program list.
  paths_to_try:
    - "/programs"
    - "/academics/programs"
    - "/academics/graduate-programs"
    - "/academics/masters-programs"
    - "/graduate"
    - "/degrees"
    - "/academics"
  # Link patterns used to find the program listing page from a school's home page.
  link_patterns:
    - text_contains: ["program", "degree", "academics"]
      href_contains: ["/program", "/degree", "/academic"]
    - text_contains: ["master", "graduate"]
      href_contains: ["/master", "/graduate"]
  # Selectors applied on the program listing page.
  selectors:
    program_item: "div.program-item, li.program, .degree-program, article.program, a[href*='/program']"
    program_name: "h3, h4, .title, .program-title, .name"
    program_url: "a[href]"
    degree_type: ".degree, .credential, .degree-type"
  # Pagination configuration.
  pagination:
    type: "none"
# Level 3: faculty (advisor) discovery configuration.
faculty:
  # Each strategy entry pairs a `type` with the `patterns` it uses.
  discovery_strategies:
    - type: "link_in_page"
      patterns:
        - text_contains: ["faculty", "people", "advisor"]
          href_contains: ["/faculty", "/people", "/advisor"]
        - text_contains: ["see list", "view all"]
          href_contains: ["/people", "/faculty"]
    - type: "url_pattern"
      patterns:
        - "{program_url}/faculty"
        - "{program_url}/people"
        - "{school_url}/faculty"
        - "{school_url}/people"
  # NOTE(review): placed as a sibling of discovery_strategies, mirroring
  # programs.selectors — confirm against the consumer's schema.
  selectors:
    faculty_item: "div.faculty, li.person, .profile-card, article.person"
    faculty_name: "h3, h4, .name, .title a"
    faculty_url: "a[href*='/people/'], a[href*='/faculty/'], a[href*='/profile/']"
    faculty_title: ".title, .position, .role, .job-title"
# Filter rules.
filters:
  # Degree-type labels to keep (`include`) and to drop (`exclude`).
  program_degree_types:
    include:
      - "Master"
      - "M.S."
      - "M.A."
      - "MBA"
      - "M.Eng"
      - "M.Ed"
      - "M.P.P"
      - "M.P.A"
      - "M.Arch"
      - "M.L.A"
      - "M.Div"
      - "M.T.S"
      - "LL.M"
      - "S.M."
      - "A.M."
      - "A.L.M."
    exclude:
      - "Ph.D."
      - "Doctor"
      - "Bachelor"
      - "B.S."
      - "B.A."
      - "Certificate"
      - "Undergraduate"
  # Explicit empty list: no schools are excluded.
  exclude_schools: []