---
# Harvard University crawler configuration
# Organized as a hierarchy: school -> program -> faculty advisor
#
# Harvard-specific note: there is a centralized program listing page from
# which all programs can be retrieved; programs are then linked to the
# individual schools and faculty information via the GSAS pages.
#
# NOTE(review): the nesting below follows the key order and the
# "Level 1/2/3" section comments; confirm it against the schema the
# crawler expects.

university:
  name: "Harvard University"
  url: "https://www.harvard.edu/"
  country: "USA"

# Level 1: list of schools
schools:
  discovery_method: "static_list"

  static_list:
    # Graduate School of Arts and Sciences - where most graduate programs
    # are concentrated
    - name: "Graduate School of Arts and Sciences (GSAS)"
      url: "https://gsas.harvard.edu/"

    # School of Engineering and Applied Sciences
    - name: "John A. Paulson School of Engineering and Applied Sciences (SEAS)"
      url: "https://seas.harvard.edu/"

    # Business school
    - name: "Harvard Business School (HBS)"
      url: "https://www.hbs.edu/"

    # School of design
    - name: "Graduate School of Design (GSD)"
      url: "https://www.gsd.harvard.edu/"

    # School of education
    - name: "Graduate School of Education (HGSE)"
      url: "https://www.gse.harvard.edu/"

    # Kennedy School of Government
    - name: "Harvard Kennedy School (HKS)"
      url: "https://www.hks.harvard.edu/"

    # Law school
    - name: "Harvard Law School (HLS)"
      url: "https://hls.harvard.edu/"

    # Medical school
    - name: "Harvard Medical School (HMS)"
      url: "https://hms.harvard.edu/"

    # School of public health
    - name: "T.H. Chan School of Public Health (HSPH)"
      url: "https://www.hsph.harvard.edu/"

    # Divinity school
    - name: "Harvard Divinity School (HDS)"
      url: "https://hds.harvard.edu/"

    # School of dental medicine
    - name: "Harvard School of Dental Medicine (HSDM)"
      url: "https://hsdm.harvard.edu/"

# Level 2: program discovery settings
programs:
  # Paths to try on each school's website when looking for a program list
  paths_to_try:
    - "/programs"
    - "/academics/programs"
    - "/academics/graduate-programs"
    - "/academics/masters-programs"
    - "/graduate"
    - "/degrees"
    - "/academics"

  # Link patterns used to find the program-list page from a school's home page
  link_patterns:
    - text_contains: ["program", "degree", "academics"]
      href_contains: ["/program", "/degree", "/academic"]
    - text_contains: ["master", "graduate"]
      href_contains: ["/master", "/graduate"]

  # CSS selectors for the program-list page
  selectors:
    program_item: "div.program-item, li.program, .degree-program, article.program, a[href*='/program']"
    program_name: "h3, h4, .title, .program-title, .name"
    program_url: "a[href]"
    degree_type: ".degree, .credential, .degree-type"

  # Pagination settings
  pagination:
    type: "none"

# Level 3: faculty advisor discovery settings
faculty:
  discovery_strategies:
    - type: "link_in_page"
      patterns:
        - text_contains: ["faculty", "people", "advisor"]
          href_contains: ["/faculty", "/people", "/advisor"]
        - text_contains: ["see list", "view all"]
          href_contains: ["/people", "/faculty"]

    # NOTE(review): {program_url} / {school_url} look like placeholders
    # filled in by the crawler - confirm against the consumer.
    - type: "url_pattern"
      patterns:
        - "{program_url}/faculty"
        - "{program_url}/people"
        - "{school_url}/faculty"
        - "{school_url}/people"

  selectors:
    faculty_item: "div.faculty, li.person, .profile-card, article.person"
    faculty_name: "h3, h4, .name, .title a"
    faculty_url: "a[href*='/people/'], a[href*='/faculty/'], a[href*='/profile/']"
    faculty_title: ".title, .position, .role, .job-title"

# Filter rules
filters:
  program_degree_types:
    include:
      - "Master"
      - "M.S."
      - "M.A."
      - "MBA"
      - "M.Eng"
      - "M.Ed"
      - "M.P.P"
      - "M.P.A"
      - "M.Arch"
      - "M.L.A"
      - "M.Div"
      - "M.T.S"
      - "LL.M"
      - "S.M."
      - "A.M."
      - "A.L.M."
    exclude:
      - "Ph.D."
      - "Doctor"
      - "Bachelor"
      - "B.S."
      - "B.A."
      - "Certificate"
      - "Undergraduate"

  exclude_schools: []