From 426cf4d2cd20b064175048d7155e34a7d32813be Mon Sep 17 00:00:00 2001 From: yangxiaoyu-crypto <532075404@qq.com> Date: Mon, 22 Dec 2025 15:25:08 +0800 Subject: [PATCH] Add university scraper system with backend, frontend, and configs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add src/university_scraper module with scraper, analyzer, and CLI - Add backend FastAPI service with API endpoints and database models - Add frontend React app with university management pages - Add configs for Harvard, Manchester, and UCL universities - Add artifacts with various scraper implementations - Add Docker compose configuration for deployment - Update .gitignore to exclude generated files 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .gitignore | 28 + SYSTEM_DESIGN.md | 261 ++ artifacts/debug_cs_faculty.py | 83 + artifacts/explore_faculty_page.py | 110 + artifacts/explore_manchester.py | 173 + artifacts/explore_program_page.py | 226 ++ artifacts/harvard_faculty_scraper.py | 10 +- artifacts/harvard_programs_scraper.py | 466 +++ .../harvard_programs_with_faculty_scraper.py | 356 ++ artifacts/manchester_complete_scraper.py | 910 +++++ artifacts/manchester_improved_scraper.py | 229 ++ artifacts/test_faculty_scraper.py | 165 + artifacts/test_manchester_scraper.py | 464 +++ backend/Dockerfile | 25 + backend/app/__init__.py | 1 + backend/app/api/__init__.py | 15 + backend/app/api/jobs.py | 144 + backend/app/api/results.py | 175 + backend/app/api/scripts.py | 167 + backend/app/api/universities.py | 165 + backend/app/config.py | 37 + backend/app/database.py | 35 + backend/app/main.py | 72 + backend/app/models/__init__.py | 8 + backend/app/models/job.py | 56 + backend/app/models/result.py | 34 + backend/app/models/script.py | 34 + backend/app/models/university.py | 31 + backend/app/schemas/__init__.py | 33 + backend/app/schemas/job.py | 52 + backend/app/schemas/result.py | 67 + backend/app/schemas/script.py | 46 + backend/app/schemas/university.py | 48 + backend/app/services/__init__.py | 6 + backend/app/services/scraper_runner.py | 177 + backend/app/services/script_generator.py | 558 +++ backend/app/tasks/__init__.py | 1 + backend/requirements.txt | 25 + configs/harvard.yaml | 143 + configs/manchester.yaml | 331 ++ configs/templates/README.md | 24 + .../uk_department_directory_template.yaml | 95 + .../uk_research_explorer_template.yaml | 101 + configs/ucl.yaml | 169 + docker-compose.yml | 54 + frontend/Dockerfile | 26 + frontend/index.html | 12 + frontend/nginx.conf | 21 + frontend/package-lock.json | 3051 +++++++++++++++++ frontend/package.json | 26 + frontend/src/App.tsx | 75 + frontend/src/index.css | 29 + frontend/src/main.tsx | 26 + frontend/src/pages/AddUniversityPage.tsx | 165 + frontend/src/pages/HomePage.tsx | 185 + frontend/src/pages/UniversityDetailPage.tsx | 368 ++ frontend/src/services/api.ts | 77 + frontend/src/vite-env.d.ts | 1 + frontend/tsconfig.json | 21 + frontend/tsconfig.node.json | 10 + frontend/vite.config.ts | 15 + scripts/reorganize_by_school.py | 164 + scripts/start_backend.py | 45 + scripts/start_dev.bat | 42 + scripts/test_harvard.py | 126 + src/university_scraper/__init__.py | 7 + src/university_scraper/__main__.py | 8 + src/university_scraper/analyzer.py | 374 ++ src/university_scraper/cli.py | 105 + src/university_scraper/config.py | 232 ++ src/university_scraper/harvard_scraper.py | 405 +++ src/university_scraper/models.py | 105 + src/university_scraper/scraper.py | 1360 ++++++++ 
任务1.txt | 6 +- 对话总结.txt | 32 + 75 files changed, 13527 insertions(+), 2 deletions(-) create mode 100644 SYSTEM_DESIGN.md create mode 100644 artifacts/debug_cs_faculty.py create mode 100644 artifacts/explore_faculty_page.py create mode 100644 artifacts/explore_manchester.py create mode 100644 artifacts/explore_program_page.py create mode 100644 artifacts/harvard_programs_scraper.py create mode 100644 artifacts/harvard_programs_with_faculty_scraper.py create mode 100644 artifacts/manchester_complete_scraper.py create mode 100644 artifacts/manchester_improved_scraper.py create mode 100644 artifacts/test_faculty_scraper.py create mode 100644 artifacts/test_manchester_scraper.py create mode 100644 backend/Dockerfile create mode 100644 backend/app/__init__.py create mode 100644 backend/app/api/__init__.py create mode 100644 backend/app/api/jobs.py create mode 100644 backend/app/api/results.py create mode 100644 backend/app/api/scripts.py create mode 100644 backend/app/api/universities.py create mode 100644 backend/app/config.py create mode 100644 backend/app/database.py create mode 100644 backend/app/main.py create mode 100644 backend/app/models/__init__.py create mode 100644 backend/app/models/job.py create mode 100644 backend/app/models/result.py create mode 100644 backend/app/models/script.py create mode 100644 backend/app/models/university.py create mode 100644 backend/app/schemas/__init__.py create mode 100644 backend/app/schemas/job.py create mode 100644 backend/app/schemas/result.py create mode 100644 backend/app/schemas/script.py create mode 100644 backend/app/schemas/university.py create mode 100644 backend/app/services/__init__.py create mode 100644 backend/app/services/scraper_runner.py create mode 100644 backend/app/services/script_generator.py create mode 100644 backend/app/tasks/__init__.py create mode 100644 backend/requirements.txt create mode 100644 configs/harvard.yaml create mode 100644 configs/manchester.yaml create mode 100644 configs/templates/README.md create mode 100644 configs/templates/uk_department_directory_template.yaml create mode 100644 configs/templates/uk_research_explorer_template.yaml create mode 100644 configs/ucl.yaml create mode 100644 docker-compose.yml create mode 100644 frontend/Dockerfile create mode 100644 frontend/index.html create mode 100644 frontend/nginx.conf create mode 100644 frontend/package-lock.json create mode 100644 frontend/package.json create mode 100644 frontend/src/App.tsx create mode 100644 frontend/src/index.css create mode 100644 frontend/src/main.tsx create mode 100644 frontend/src/pages/AddUniversityPage.tsx create mode 100644 frontend/src/pages/HomePage.tsx create mode 100644 frontend/src/pages/UniversityDetailPage.tsx create mode 100644 frontend/src/services/api.ts create mode 100644 frontend/src/vite-env.d.ts create mode 100644 frontend/tsconfig.json create mode 100644 frontend/tsconfig.node.json create mode 100644 frontend/vite.config.ts create mode 100644 scripts/reorganize_by_school.py create mode 100644 scripts/start_backend.py create mode 100644 scripts/start_dev.bat create mode 100644 scripts/test_harvard.py create mode 100644 src/university_scraper/__init__.py create mode 100644 src/university_scraper/__main__.py create mode 100644 src/university_scraper/analyzer.py create mode 100644 src/university_scraper/cli.py create mode 100644 src/university_scraper/config.py create mode 100644 src/university_scraper/harvard_scraper.py create mode 100644 src/university_scraper/models.py create mode 100644 
src/university_scraper/scraper.py create mode 100644 对话总结.txt diff --git a/.gitignore b/.gitignore index 436fcf5..2d57401 100644 --- a/.gitignore +++ b/.gitignore @@ -179,3 +179,31 @@ nul # Scraper output files *_results.json + +# Output directories +output/ + +# Screenshots and debug images +*.png +artifacts/*.html + +# Windows +desktop.ini + +# Claude settings (local) +.claude/ + +# Progress files +*_progress.json + +# Test result files +*_test_result.json + +# Node modules +node_modules/ + +# Database files +*.db + +# Frontend build +frontend/nul diff --git a/SYSTEM_DESIGN.md b/SYSTEM_DESIGN.md new file mode 100644 index 0000000..aa4de01 --- /dev/null +++ b/SYSTEM_DESIGN.md @@ -0,0 +1,261 @@ +# 大学爬虫Web系统设计方案 + +## 一、系统架构 + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ 前端 (React/Vue) │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │ 输入大学URL │ │ 一键生成脚本 │ │ 查看/验证爬取数据 │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ 后端 API (FastAPI) │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────��───────────────────┐ │ +│ │ 脚本生成API │ │ 脚本执行API │ │ 数据查询API │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ + │ + ┌─────────────────┼─────────────────┐ + ▼ ▼ ▼ +┌───────────────────┐ ┌───────────────┐ ┌───────────────────────┐ +│ PostgreSQL │ │ 任务队列 │ │ Agent (Claude) │ +│ 数据库 │ │ (Celery) │ │ 分析+生成脚本 │ +│ - 爬虫脚本 │ └───────────────┘ └───────────────────────┘ +│ - 爬取结果 │ +│ - 执行日志 │ +└───────────────────┘ +``` + +## 二、技术栈选择 + +### 后端 +- **框架**: FastAPI (Python,与现有爬虫代码无缝集成) +- **数据库**: PostgreSQL (存储脚本、结果、日志) +- **任务队列**: Celery + Redis (异步执行爬虫任务) +- **ORM**: SQLAlchemy + +### 前端 +- **框架**: React + TypeScript (或 Vue.js) +- **UI库**: Ant Design / Material-UI +- **状态管理**: React Query (数据获取和缓存) + +### 部署 +- **容器化**: Docker + Docker Compose +- **云平台**: 可部署到 AWS/阿里云/腾讯云 + +## 三、数据库设计 + +```sql +-- 大学表 +CREATE TABLE universities ( + id SERIAL PRIMARY KEY, + name VARCHAR(255) NOT NULL, + url VARCHAR(500) NOT NULL, + country VARCHAR(100), + created_at TIMESTAMP DEFAULT NOW(), + updated_at TIMESTAMP DEFAULT NOW() +); + +-- 爬虫脚本表 +CREATE TABLE scraper_scripts ( + id SERIAL PRIMARY KEY, + university_id INTEGER REFERENCES universities(id), + script_name VARCHAR(255) NOT NULL, + script_content TEXT NOT NULL, -- Python脚本代码 + config_content TEXT, -- YAML配置 + version INTEGER DEFAULT 1, + status VARCHAR(50) DEFAULT 'draft', -- draft, active, deprecated + created_at TIMESTAMP DEFAULT NOW(), + updated_at TIMESTAMP DEFAULT NOW() +); + +-- 爬取任务表 +CREATE TABLE scrape_jobs ( + id SERIAL PRIMARY KEY, + university_id INTEGER REFERENCES universities(id), + script_id INTEGER REFERENCES scraper_scripts(id), + status VARCHAR(50) DEFAULT 'pending', -- pending, running, completed, failed + started_at TIMESTAMP, + completed_at TIMESTAMP, + error_message TEXT, + created_at TIMESTAMP DEFAULT NOW() +); + +-- 爬取结果表 (JSON存储层级数据) +CREATE TABLE scrape_results ( + id SERIAL PRIMARY KEY, + job_id INTEGER REFERENCES scrape_jobs(id), + university_id INTEGER REFERENCES universities(id), + result_data JSONB NOT NULL, -- 学院→项目→导师 JSON数据 + schools_count INTEGER, + programs_count INTEGER, + faculty_count INTEGER, + created_at TIMESTAMP DEFAULT NOW() +); + +-- 执行日志表 +CREATE TABLE scrape_logs ( + id SERIAL PRIMARY KEY, + job_id INTEGER REFERENCES scrape_jobs(id), 
+ level VARCHAR(20), -- info, warning, error + message TEXT, + created_at TIMESTAMP DEFAULT NOW() +); +``` + +## 四、API接口设计 + +### 1. 大学管理 +``` +POST /api/universities 创建大学 +GET /api/universities 获取大学列表 +GET /api/universities/{id} 获取大学详情 +DELETE /api/universities/{id} 删除大学 +``` + +### 2. 爬虫脚本 +``` +POST /api/scripts/generate 生成爬虫脚本 (Agent自动分析) +GET /api/scripts/{university_id} 获取大学的爬虫脚本 +PUT /api/scripts/{id} 更新脚本 +``` + +### 3. 爬取任务 +``` +POST /api/jobs/start/{university_id} 启动爬取任务 +GET /api/jobs/{id} 获取任务状态 +GET /api/jobs/university/{id} 获取大学的任务列表 +POST /api/jobs/{id}/cancel 取消任务 +``` + +### 4. 数据结果 +``` +GET /api/results/{university_id} 获取爬取结果 +GET /api/results/{university_id}/schools 获取学院列表 +GET /api/results/{university_id}/programs 获取项目列表 +GET /api/results/{university_id}/faculty 获取导师列表 +GET /api/results/{university_id}/export?format=json 导出数据 +``` + +## 五、前端页面设计 + +### 页面1: 首页/大学列表 +- 显示已添加的大学列表 +- "添加新大学" 按钮 +- 每个大学卡片显示:名称、状态、项目数、导师数、操作按钮 + +### 页面2: 添加大学 (一键生成脚本) +- 输入框:大学官网URL +- "分析并生成脚本" 按钮 +- 显示分析进度和日志 +- 生成完成后自动跳转到管理页面 + +### 页面3: 大学管理页面 +- 大学基本信息 +- 爬虫脚本状态 +- "一键运行爬虫" 按钮 +- 运行进度和日志实时显示 +- 历史任务列表 + +### 页面4: 数据查看页面 +- 树形结构展示:学院 → 项目 → 导师 +- 搜索和筛选功能 +- 数据导出按钮 (JSON/Excel) +- 数据校验和编辑功能 + +## 六、实现步骤 + +### 阶段1: 后端基础 (优先) +1. 创建 FastAPI 项目结构 +2. 设计数据库模型 (SQLAlchemy) +3. 实现基础 CRUD API +4. 集成现有爬虫代码 + +### 阶段2: 脚本生成与执行 +1. 实现 Agent 自动分析逻辑 +2. 实现脚本存储和版本管理 +3. 集成 Celery 异步任务队列 +4. 实现爬虫执行和日志记录 + +### 阶段3: 前端开发 +1. 搭建 React 项目 +2. 实现大学列表页面 +3. 实现脚本生成页面 +4. 实现数据查看页面 + +### 阶段4: 部署上线 +1. Docker 容器化 +2. 部署到云服务器 +3. 配置域名和 HTTPS + +## 七、目录结构 + +``` +university-scraper-web/ +├── backend/ +│ ├── app/ +│ │ ├── __init__.py +│ │ ├── main.py # FastAPI入口 +│ │ ├── config.py # 配置 +│ │ ├── database.py # 数据库连接 +│ │ ├── models/ # SQLAlchemy模型 +│ │ │ ├── university.py +│ │ │ ├── script.py +│ │ │ ├── job.py +│ │ │ └── result.py +│ │ ├── schemas/ # Pydantic模型 +│ │ ├── api/ # API路由 +│ │ │ ├── universities.py +│ │ │ ├── scripts.py +│ │ │ ├── jobs.py +│ │ │ └── results.py +│ │ ├── services/ # 业务逻辑 +│ │ │ ├── scraper_service.py +│ │ │ └── agent_service.py +│ │ └── tasks/ # Celery任务 +│ │ └── scrape_task.py +│ ├── requirements.txt +│ └── Dockerfile +├── frontend/ +│ ├── src/ +│ │ ├── components/ +│ │ ├── pages/ +│ │ ├── services/ +│ │ └── App.tsx +│ ├── package.json +│ └── Dockerfile +├── docker-compose.yml +└── README.md +``` + +## 八、关于脚本存储位置的建议 + +### 推荐方案:PostgreSQL + 文件系统混合 + +1. **PostgreSQL 存储**: + - 脚本元数据 (名称、版本、状态) + - 脚本代码内容 (TEXT字段) + - 配置文件内容 (JSONB字段) + - 爬取结果 (JSONB字段) + +2. **优点**: + - 事务支持,数据一致性 + - 版本管理方便 + - 查询和搜索方便 + - 备份和迁移简单 + - 与后端集成紧密 + +3. 
**云部署选项**: + - AWS RDS PostgreSQL + - 阿里云 RDS PostgreSQL + - 腾讯云 TDSQL-C + +### 备选方案:MongoDB + +如果数据结构经常变化,可以考虑 MongoDB: +- 灵活的文档结构 +- 适合存储层级化的爬取结果 +- 但 Python 生态对 PostgreSQL 支持更好 diff --git a/artifacts/debug_cs_faculty.py b/artifacts/debug_cs_faculty.py new file mode 100644 index 0000000..97f55f3 --- /dev/null +++ b/artifacts/debug_cs_faculty.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +""" +调试Computer Science的Faculty页面 +""" + +import asyncio +from playwright.async_api import async_playwright + + +async def debug_cs(): + async with async_playwright() as p: + browser = await p.chromium.launch(headless=False) + page = await browser.new_page() + + # 访问Computer Science GSAS页面 + gsas_url = "https://gsas.harvard.edu/program/computer-science" + print(f"访问: {gsas_url}") + + await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000) + await page.wait_for_timeout(3000) + + await page.screenshot(path="cs_gsas_page.png", full_page=True) + print("截图已保存: cs_gsas_page.png") + + # 查找所有链接 + links = await page.evaluate('''() => { + const links = []; + document.querySelectorAll('a[href]').forEach(a => { + const text = a.innerText.trim(); + const href = a.href; + if (text && text.length > 2 && text.length < 100) { + links.push({text: text, href: href}); + } + }); + return links; + }''') + + print(f"\n页面上的所有链接 ({len(links)} 个):") + for link in links: + print(f" - {link['text'][:60]} -> {link['href']}") + + # 查找可能的Faculty或People链接 + print("\n\n查找Faculty/People相关链接:") + for link in links: + text_lower = link['text'].lower() + href_lower = link['href'].lower() + if 'faculty' in text_lower or 'people' in href_lower or 'faculty' in href_lower or 'website' in text_lower: + print(f" * {link['text']} -> {link['href']}") + + # 尝试访问SEAS (School of Engineering) + print("\n\n尝试访问SEAS Computer Science页面...") + seas_url = "https://seas.harvard.edu/computer-science" + await page.goto(seas_url, wait_until="domcontentloaded", timeout=30000) + await page.wait_for_timeout(2000) + + await page.screenshot(path="seas_cs_page.png", full_page=True) + print("截图已保存: seas_cs_page.png") + + seas_links = await page.evaluate('''() => { + const links = []; + document.querySelectorAll('a[href]').forEach(a => { + const text = a.innerText.trim(); + const href = a.href; + const lowerText = text.toLowerCase(); + const lowerHref = href.toLowerCase(); + if ((lowerText.includes('faculty') || lowerText.includes('people') || + lowerHref.includes('faculty') || lowerHref.includes('people')) && + text.length > 2) { + links.push({text: text, href: href}); + } + }); + return links; + }''') + + print(f"\nSEAS页面上的Faculty/People链接:") + for link in seas_links: + print(f" * {link['text']} -> {link['href']}") + + await browser.close() + + +if __name__ == "__main__": + asyncio.run(debug_cs()) diff --git a/artifacts/explore_faculty_page.py b/artifacts/explore_faculty_page.py new file mode 100644 index 0000000..d86410d --- /dev/null +++ b/artifacts/explore_faculty_page.py @@ -0,0 +1,110 @@ +""" +探索Harvard院系People/Faculty页面结构,获取导师列表 +""" +import asyncio +from playwright.async_api import async_playwright + +async def explore_faculty_page(): + async with async_playwright() as p: + browser = await p.chromium.launch(headless=False) + page = await browser.new_page() + + # 访问AAAS院系People页面 + people_url = "https://aaas.fas.harvard.edu/aaas-people" + print(f"访问院系People页面: {people_url}") + + await page.goto(people_url, wait_until='networkidle') + await page.wait_for_timeout(3000) + + # 截图保存 + await page.screenshot(path="aaas_people_page.png", full_page=True) + 
print("已保存截图: aaas_people_page.png") + + # 获取所有教职员工链接 + faculty_info = await page.evaluate('''() => { + const faculty = []; + + // 查找所有 /people/ 路径的链接 + document.querySelectorAll('a[href*="/people/"]').forEach(a => { + const href = a.href || ''; + const text = a.innerText.trim(); + + // 过滤掉导航链接,只保留个人页面链接 + if (href.includes('/people/') && text.length > 3 && + !text.toLowerCase().includes('people') && + !href.endsWith('/people/') && + !href.endsWith('/aaas-people')) { + faculty.push({ + name: text, + url: href + }); + } + }); + + return faculty; + }''') + + print(f"\n找到 {len(faculty_info)} 个教职员工:") + for f in faculty_info: + print(f" - {f['name']} -> {f['url']}") + + # 尝试经济学院系的Faculty页面 + print("\n\n========== 尝试经济学院系Faculty页面 ==========") + econ_faculty_url = "http://economics.harvard.edu/people/people-type/faculty" + print(f"访问: {econ_faculty_url}") + + await page.goto(econ_faculty_url, wait_until='networkidle') + await page.wait_for_timeout(3000) + + await page.screenshot(path="econ_faculty_page.png", full_page=True) + print("已保存截图: econ_faculty_page.png") + + econ_faculty = await page.evaluate('''() => { + const faculty = []; + + // 查找所有可能的faculty链接 + document.querySelectorAll('a[href]').forEach(a => { + const href = a.href || ''; + const text = a.innerText.trim(); + const lowerHref = href.toLowerCase(); + + // 查找个人页面链接 + if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') || + lowerHref.includes('/profile/')) && + text.length > 3 && text.length < 100 && + !text.toLowerCase().includes('faculty') && + !text.toLowerCase().includes('people')) { + faculty.push({ + name: text, + url: href + }); + } + }); + + return faculty; + }''') + + print(f"\n找到 {len(econ_faculty)} 个教职员工:") + for f in econ_faculty[:30]: + print(f" - {f['name']} -> {f['url']}") + + # 查看页面上所有链接用于调试 + print("\n\n页面上的所有链接:") + all_links = await page.evaluate('''() => { + const links = []; + document.querySelectorAll('a[href]').forEach(a => { + const href = a.href || ''; + const text = a.innerText.trim(); + if (text && text.length > 2 && text.length < 100) { + links.push({text: text, href: href}); + } + }); + return links; + }''') + for link in all_links[:40]: + print(f" - {link['text'][:50]} -> {link['href']}") + + await browser.close() + +if __name__ == "__main__": + asyncio.run(explore_faculty_page()) diff --git a/artifacts/explore_manchester.py b/artifacts/explore_manchester.py new file mode 100644 index 0000000..0350789 --- /dev/null +++ b/artifacts/explore_manchester.py @@ -0,0 +1,173 @@ +""" +探索曼彻斯特大学硕士课程页面结构 +""" + +import asyncio +import json +from playwright.async_api import async_playwright + + +async def explore_manchester(): + """探索曼彻斯特大学网站结构""" + async with async_playwright() as p: + browser = await p.chromium.launch(headless=False) + context = await browser.new_context( + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + ) + page = await context.new_page() + + # 直接访问硕士课程A-Z列表页 + print("访问硕士课程A-Z列表页面...") + await page.goto("https://www.manchester.ac.uk/study/masters/courses/list/", + wait_until="domcontentloaded", timeout=60000) + await page.wait_for_timeout(5000) + + # 截图 + await page.screenshot(path="manchester_masters_page.png", full_page=False) + print("截图已保存: manchester_masters_page.png") + + # 分析页面结构 + page_info = await page.evaluate("""() => { + const info = { + title: document.title, + url: window.location.href, + all_links: [], + course_candidates: [], + page_sections: [] + }; + + // 获取所有链接 + document.querySelectorAll('a[href]').forEach(a => { + const href = 
a.href; + const text = a.innerText.trim().substring(0, 100); + if (href && text) { + info.all_links.push({href, text}); + } + }); + + // 查找可能的课程链接 - 包含 /course/ 或 list-item + document.querySelectorAll('a[href*="/course/"], .course-link, [class*="course"] a, .search-result a, .list-item a').forEach(a => { + info.course_candidates.push({ + href: a.href, + text: a.innerText.trim().substring(0, 100), + classes: a.className, + parent_classes: a.parentElement?.className || '' + }); + }); + + // 获取页面主要区块 + document.querySelectorAll('main, [role="main"], .content, #content, .results, .course-list').forEach(el => { + info.page_sections.push({ + tag: el.tagName, + id: el.id, + classes: el.className, + children_count: el.children.length + }); + }); + + return info; + }""") + + print(f"\n页面标题: {page_info['title']}") + print(f"当前URL: {page_info['url']}") + print(f"\n总链接数: {len(page_info['all_links'])}") + print(f"课程候选链接数: {len(page_info['course_candidates'])}") + + # 查找包含 masters/courses/ 的链接 + masters_links = [l for l in page_info['all_links'] + if 'masters/courses/' in l['href'].lower() + and l['href'] != page_info['url']] + + print(f"\n硕士课程相关链接 ({len(masters_links)}):") + for link in masters_links[:20]: + print(f" - {link['text'][:50]}: {link['href']}") + + print(f"\n课程候选详情:") + for c in page_info['course_candidates'][:10]: + print(f" - {c['text'][:50]}") + print(f" URL: {c['href']}") + print(f" Classes: {c['classes']}") + + # 检查是否有搜索/筛选功能 + search_elements = await page.evaluate("""() => { + const elements = []; + document.querySelectorAll('input[type="search"], input[type="text"], select, .filter, .search').forEach(el => { + elements.push({ + tag: el.tagName, + type: el.type || '', + id: el.id, + name: el.name || '', + classes: el.className + }); + }); + return elements; + }""") + + print(f"\n搜索/筛选元素: {len(search_elements)}") + for el in search_elements[:5]: + print(f" - {el}") + + # 尝试找到课程列表的实际结构 + print("\n\n正在分析页面中的课程列表结构...") + + list_structures = await page.evaluate("""() => { + const structures = []; + + // 查找各种可能的列表结构 + const selectors = [ + 'ul li a[href*="course"]', + 'div[class*="result"] a', + 'div[class*="course"] a', + 'article a[href]', + '.search-results a', + '[data-course] a', + 'table tr td a' + ]; + + for (const selector of selectors) { + const elements = document.querySelectorAll(selector); + if (elements.length > 0) { + const samples = []; + elements.forEach((el, i) => { + if (i < 5) { + samples.push({ + href: el.href, + text: el.innerText.trim().substring(0, 80) + }); + } + }); + structures.push({ + selector: selector, + count: elements.length, + samples: samples + }); + } + } + + return structures; + }""") + + print("\n找到的列表结构:") + for s in list_structures: + print(f"\n 选择器: {s['selector']} (共 {s['count']} 个)") + for sample in s['samples']: + print(f" - {sample['text']}: {sample['href']}") + + # 保存完整分析结果 + with open("manchester_analysis.json", "w", encoding="utf-8") as f: + json.dump(page_info, f, indent=2, ensure_ascii=False) + + print("\n\n完整分析已保存到 manchester_analysis.json") + + # 等待用户查看 + print("\n按 Ctrl+C 关闭浏览器...") + try: + await asyncio.sleep(30) + except: + pass + + await browser.close() + + +if __name__ == "__main__": + asyncio.run(explore_manchester()) diff --git a/artifacts/explore_program_page.py b/artifacts/explore_program_page.py new file mode 100644 index 0000000..ce48d1e --- /dev/null +++ b/artifacts/explore_program_page.py @@ -0,0 +1,226 @@ +""" +探索Harvard项目页面结构,寻找导师信息 +""" +import asyncio +from playwright.async_api import async_playwright + +async def 
explore_program_page(): + async with async_playwright() as p: + browser = await p.chromium.launch(headless=False) + page = await browser.new_page() + + # 访问研究生院系页面 (GSAS) + gsas_url = "https://gsas.harvard.edu/program/african-and-african-american-studies" + print(f"访问研究生院系页面: {gsas_url}") + + await page.goto(gsas_url, wait_until='networkidle') + await page.wait_for_timeout(3000) + + # 截图保存 + await page.screenshot(path="gsas_program_page.png", full_page=True) + print("已保存截图: gsas_program_page.png") + + # 分析页面结构 + page_info = await page.evaluate('''() => { + const info = { + title: document.title, + h1: document.querySelector('h1')?.innerText || '', + allHeadings: [], + facultyLinks: [], + peopleLinks: [], + allLinks: [] + }; + + // 获取所有标题 + document.querySelectorAll('h1, h2, h3, h4').forEach(h => { + info.allHeadings.push({ + tag: h.tagName, + text: h.innerText.trim().substring(0, 100) + }); + }); + + // 查找所有链接 + document.querySelectorAll('a[href]').forEach(a => { + const href = a.href || ''; + const text = a.innerText.trim(); + + // 检查是否与教职员工相关 + const lowerHref = href.toLowerCase(); + const lowerText = text.toLowerCase(); + + if (lowerHref.includes('faculty') || lowerHref.includes('people') || + lowerHref.includes('professor') || lowerHref.includes('staff') || + lowerText.includes('faculty') || lowerText.includes('people')) { + info.facultyLinks.push({ + text: text.substring(0, 100), + href: href + }); + } + + // 检查是否是个人页面链接 + if (href.includes('/people/') || href.includes('/faculty/') || + href.includes('/profile/') || href.includes('/person/')) { + info.peopleLinks.push({ + text: text.substring(0, 100), + href: href + }); + } + + // 保存所有主要链接 + if (href && text.length > 2 && text.length < 150) { + info.allLinks.push({ + text: text, + href: href + }); + } + }); + + return info; + }''') + + print(f"\n页面标题: {page_info['title']}") + print(f"H1: {page_info['h1']}") + + print(f"\n所有标题 ({len(page_info['allHeadings'])}):") + for h in page_info['allHeadings']: + print(f" <{h['tag']}>: {h['text']}") + + print(f"\n教职员工相关链接 ({len(page_info['facultyLinks'])}):") + for f in page_info['facultyLinks']: + print(f" - {f['text']} -> {f['href']}") + + print(f"\n个人页面链接 ({len(page_info['peopleLinks'])}):") + for p in page_info['peopleLinks']: + print(f" - {p['text']} -> {p['href']}") + + print(f"\n所有链接 ({len(page_info['allLinks'])}):") + for link in page_info['allLinks'][:50]: + print(f" - {link['text'][:60]} -> {link['href']}") + + # 尝试另一个项目页面看看是否有不同结构 + print("\n\n========== 尝试另一个项目页面 ==========") + economics_url = "https://gsas.harvard.edu/program/economics" + print(f"访问: {economics_url}") + + await page.goto(economics_url, wait_until='networkidle') + await page.wait_for_timeout(3000) + + # 截图保存 + await page.screenshot(path="gsas_economics_page.png", full_page=True) + print("已保存截图: gsas_economics_page.png") + + # 分析 + econ_info = await page.evaluate('''() => { + const info = { + title: document.title, + facultyLinks: [], + peopleLinks: [] + }; + + document.querySelectorAll('a[href]').forEach(a => { + const href = a.href || ''; + const text = a.innerText.trim(); + const lowerHref = href.toLowerCase(); + const lowerText = text.toLowerCase(); + + if (lowerHref.includes('faculty') || lowerHref.includes('people') || + lowerText.includes('faculty') || lowerText.includes('people')) { + info.facultyLinks.push({ + text: text.substring(0, 100), + href: href + }); + } + + if (href.includes('/people/') || href.includes('/faculty/') || + href.includes('/profile/') || href.includes('/person/')) { + 
info.peopleLinks.push({ + text: text.substring(0, 100), + href: href + }); + } + }); + + return info; + }''') + + print(f"\n教职员工相关链接 ({len(econ_info['facultyLinks'])}):") + for f in econ_info['facultyLinks']: + print(f" - {f['text']} -> {f['href']}") + + print(f"\n个人页面链接 ({len(econ_info['peopleLinks'])}):") + for p in econ_info['peopleLinks']: + print(f" - {p['text']} -> {p['href']}") + + # 访问院系主页看看有没有Faculty页面 + print("\n\n========== 尝试访问院系主页 ==========") + dept_url = "https://aaas.fas.harvard.edu/" + print(f"访问院系主页: {dept_url}") + + await page.goto(dept_url, wait_until='networkidle') + await page.wait_for_timeout(3000) + + await page.screenshot(path="aaas_dept_page.png", full_page=True) + print("已保存截图: aaas_dept_page.png") + + dept_info = await page.evaluate('''() => { + const info = { + title: document.title, + navLinks: [], + facultyLinks: [], + peopleLinks: [] + }; + + // 获取导航链接 + document.querySelectorAll('nav a, [class*="nav"] a, [class*="menu"] a').forEach(a => { + const href = a.href || ''; + const text = a.innerText.trim(); + if (text && text.length > 1 && text.length < 50) { + info.navLinks.push({ + text: text, + href: href + }); + } + }); + + document.querySelectorAll('a[href]').forEach(a => { + const href = a.href || ''; + const text = a.innerText.trim(); + const lowerHref = href.toLowerCase(); + const lowerText = text.toLowerCase(); + + if (lowerHref.includes('faculty') || lowerHref.includes('people') || + lowerText.includes('faculty') || lowerText.includes('people')) { + info.facultyLinks.push({ + text: text.substring(0, 100), + href: href + }); + } + + if (href.includes('/people/') || href.includes('/faculty/') || + href.includes('/profile/')) { + info.peopleLinks.push({ + text: text.substring(0, 100), + href: href + }); + } + }); + + return info; + }''') + + print(f"\n导航链接 ({len(dept_info['navLinks'])}):") + for link in dept_info['navLinks'][:20]: + print(f" - {link['text']} -> {link['href']}") + + print(f"\n教职员工相关链接 ({len(dept_info['facultyLinks'])}):") + for f in dept_info['facultyLinks']: + print(f" - {f['text']} -> {f['href']}") + + print(f"\n个人页面链接 ({len(dept_info['peopleLinks'])}):") + for p in dept_info['peopleLinks'][:30]: + print(f" - {p['text']} -> {p['href']}") + + await browser.close() + +if __name__ == "__main__": + asyncio.run(explore_program_page()) diff --git a/artifacts/harvard_faculty_scraper.py b/artifacts/harvard_faculty_scraper.py index 6d2d448..c562d21 100644 --- a/artifacts/harvard_faculty_scraper.py +++ b/artifacts/harvard_faculty_scraper.py @@ -125,6 +125,7 @@ class ScrapeSettings: output: Path verify_links: bool = True request_delay: float = 1.0 # Polite crawling delay + timeout: int = 60000 # Navigation timeout in ms async def extract_links(page: Page) -> List[Tuple[str, str]]: @@ -210,7 +211,7 @@ async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink page = await context.new_page() try: response = await page.goto( - normalized_url, wait_until="domcontentloaded", timeout=20000 + normalized_url, wait_until="domcontentloaded", timeout=settings.timeout ) if not response or response.status >= 400: await page.close() @@ -411,6 +412,12 @@ def parse_args() -> argparse.Namespace: default=1.0, help="Delay between requests in seconds (polite crawling).", ) + parser.add_argument( + "--timeout", + type=int, + default=60000, + help="Navigation timeout in milliseconds (default: 60000 = 60s).", + ) return parser.parse_args() @@ -424,6 +431,7 @@ async def main_async() -> None: output=args.output, verify_links=not args.no_verify, 
request_delay=args.delay, + timeout=args.timeout, ) links = await crawl(settings, browser_name=args.browser) serialize(links, settings.output, settings.root_url) diff --git a/artifacts/harvard_programs_scraper.py b/artifacts/harvard_programs_scraper.py new file mode 100644 index 0000000..7107af7 --- /dev/null +++ b/artifacts/harvard_programs_scraper.py @@ -0,0 +1,466 @@ +#!/usr/bin/env python3 +""" +Harvard Graduate Programs Scraper +专门爬取 https://www.harvard.edu/programs/?degree_levels=graduate 页面的所有研究生项目 +通过点击分页按钮遍历所有页面 +""" + +import asyncio +import json +import re +from datetime import datetime, timezone +from pathlib import Path +from playwright.async_api import async_playwright + + +async def scrape_harvard_programs(): + """爬取Harvard研究生项目列表页面 - 通过点击分页按钮""" + + all_programs = [] + base_url = "https://www.harvard.edu/programs/?degree_levels=graduate" + + async with async_playwright() as p: + # 使用无头模式 + browser = await p.chromium.launch(headless=True) + context = await browser.new_context( + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + viewport={'width': 1920, 'height': 1080} + ) + page = await context.new_page() + + print(f"正在访问: {base_url}") + # 使用 domcontentloaded 而非 networkidle,更快加载 + await page.goto(base_url, wait_until="domcontentloaded", timeout=60000) + # 等待页面内容加载 + await page.wait_for_timeout(5000) + + # 滚动到页面底部以确保分页按钮加载 + print("滚动到页面底部...") + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(2000) + + current_page = 1 + max_pages = 15 + + while current_page <= max_pages: + print(f"\n========== 第 {current_page} 页 ==========") + + # 等待内容加载 + await page.wait_for_timeout(2000) + + # 提取当前页面的项目 + # 从调试输出得知,项目按钮的class是 'records__record___PbPhG c-programs-item__title-link' + # 需要点击按钮来获取URL,因为Harvard使用JavaScript导航 + + # 首先获取所有项目按钮信息 + page_data = await page.evaluate('''() => { + const programs = []; + + // 查找所有项目行/容器 + const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]'); + + programItems.forEach((item, index) => { + // 获取项目名称按钮 + const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]'); + if (!nameBtn) return; + + const name = nameBtn.innerText.trim(); + if (!name || name.length < 3) return; + + // 获取学位信息 + let degrees = ''; + const allText = item.innerText; + const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g); + if (degreeMatch) { + degrees = degreeMatch.join(', '); + } + + // 查找链接 - 检查各种可能的位置 + let url = ''; + + // 方法1: 查找 标签 + const link = item.querySelector('a[href]'); + if (link && link.href) { + url = link.href; + } + + // 方法2: 检查data属性 + if (!url) { + const dataUrl = nameBtn.getAttribute('data-url') || + nameBtn.getAttribute('data-href') || + item.getAttribute('data-url'); + if (dataUrl) url = dataUrl; + } + + // 方法3: 检查onclick属性 + if (!url) { + const onclick = nameBtn.getAttribute('onclick') || ''; + const urlMatch = onclick.match(/['"]([^'"]*\\/programs\\/[^'"]*)['"]/); + if (urlMatch) url = urlMatch[1]; + } + + programs.push({ + name: name, + degrees: degrees, + url: url, + index: index + }); + }); + + // 
如果方法1没找到项目,使用备选方法 + if (programs.length === 0) { + // 查找所有项目按钮 + const buttons = document.querySelectorAll('button'); + buttons.forEach((btn, index) => { + const className = btn.className || ''; + if (className.includes('c-programs-item') || className.includes('title-link')) { + const name = btn.innerText.trim(); + if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) { + programs.push({ + name: name, + degrees: '', + url: '', + index: index + }); + } + } + }); + } + + return { + programs: programs, + totalFound: programs.length + }; + }''') + + # 第一页时调试输出HTML结构 + if current_page == 1 and len(page_data['programs']) == 0: + print("未找到项目,调试HTML结构...") + html_debug = await page.evaluate('''() => { + const debug = { + allButtons: [], + allLinks: [], + sampleHTML: '' + }; + + // 获取所有按钮 + document.querySelectorAll('button').forEach(btn => { + const text = btn.innerText.trim().substring(0, 50); + if (text && text.length > 3) { + debug.allButtons.push({ + text: text, + class: btn.className.substring(0, 80) + }); + } + }); + + // 获取main区域的HTML片段 + const main = document.querySelector('main') || document.body; + debug.sampleHTML = main.innerHTML.substring(0, 3000); + + return debug; + }''') + print(f"找到 {len(html_debug['allButtons'])} 个按钮:") + for btn in html_debug['allButtons'][:20]: + print(f" - {btn['text']} | class: {btn['class']}") + print(f"\nHTML片段:\n{html_debug['sampleHTML'][:1500]}") + + print(f" 本页找到 {len(page_data['programs'])} 个项目") + + # 打印找到的项目 + for prog in page_data['programs']: + print(f" - {prog['name']} ({prog['degrees']})") + + # 添加到总列表(去重) + for prog in page_data['programs']: + name = prog['name'].strip() + if name and not any(p['name'] == name for p in all_programs): + all_programs.append({ + 'name': name, + 'degrees': prog.get('degrees', ''), + 'url': prog.get('url', ''), + 'page': current_page + }) + + # 尝试点击下一页按钮 + try: + clicked = False + + # 首先打印所有分页相关元素用于调试 + if current_page == 1: + # 截图保存以便调试 + await page.screenshot(path="harvard_debug_pagination.png", full_page=True) + print("已保存调试截图: harvard_debug_pagination.png") + + pagination_info = await page.evaluate('''() => { + const result = { + links: [], + buttons: [], + allClickable: [], + pageNumbers: [], + allText: [] + }; + + // 查找所有链接 + document.querySelectorAll('a').forEach(a => { + const text = a.innerText.trim(); + if (text.match(/^[0-9]+$|Next|page|Prev/i)) { + result.links.push({ + text: text.substring(0, 50), + href: a.href, + visible: a.offsetParent !== null, + className: a.className + }); + } + }); + + // 查找所有按钮 + document.querySelectorAll('button').forEach(b => { + const text = b.innerText.trim(); + if (text.match(/^[0-9]+$|Next|page|Prev/i) || text.length < 20) { + result.buttons.push({ + text: text.substring(0, 50), + visible: b.offsetParent !== null, + className: b.className + }); + } + }); + + // 查找所有包含数字的可点击元素(可能是分页) + document.querySelectorAll('a, button, span[role="button"], div[role="button"], li a, nav a').forEach(el => { + const text = el.innerText.trim(); + if (text.match(/^[0-9]$/) || text === 'Next page' || text.includes('Next')) { + result.pageNumbers.push({ + tag: el.tagName, + text: text, + className: el.className, + id: el.id, + ariaLabel: el.getAttribute('aria-label'), + visible: el.offsetParent !== null + }); + } + }); + + // 查找页面底部区域的所有可点击元素 + const bodyRect = document.body.getBoundingClientRect(); + document.querySelectorAll('*').forEach(el => { + const rect = el.getBoundingClientRect(); + const text = el.innerText?.trim() || ''; + // 只看页面下半部分的元素且文本短 + if 
(rect.top > bodyRect.height * 0.5 && text.length > 0 && text.length < 30) { + const style = window.getComputedStyle(el); + if (style.cursor === 'pointer' || el.tagName === 'A' || el.tagName === 'BUTTON') { + result.allClickable.push({ + tag: el.tagName, + text: text.substring(0, 30), + top: Math.round(rect.top), + className: el.className?.substring?.(0, 50) || '' + }); + } + } + }); + + // 输出页面底部所有文本以便调试 + const bodyText = document.body.innerText; + const lines = bodyText.split('\\n').filter(l => l.trim()); + // 找到包含数字1-9的行 + for (let i = 0; i < lines.length; i++) { + if (lines[i].match(/^[1-9]$|Next page|Previous/)) { + result.allText.push(lines[i]); + } + } + + return result; + }''') + print(f"\n分页相关链接 ({len(pagination_info['links'])} 个):") + for link in pagination_info['links']: + print(f" a: '{link['text']}' class='{link.get('className', '')}' (visible: {link['visible']})") + print(f"\n分页相关按钮 ({len(pagination_info['buttons'])} 个):") + for btn in pagination_info['buttons']: + print(f" button: '{btn['text']}' class='{btn.get('className', '')}' (visible: {btn['visible']})") + print(f"\n页码元素 ({len(pagination_info['pageNumbers'])} 个):") + for pn in pagination_info['pageNumbers']: + print(f" {pn['tag']}: '{pn['text']}' aria-label='{pn.get('ariaLabel')}' visible={pn['visible']}") + print(f"\n页面下半部分可点击元素 ({len(pagination_info['allClickable'])} 个):") + for el in pagination_info['allClickable'][:30]: + print(f" {el['tag']}: '{el['text']}' (top: {el['top']})") + print(f"\n页面中的分页文本 ({len(pagination_info['allText'])} 个):") + for txt in pagination_info['allText'][:20]: + print(f" '{txt}'") + + # 方法1: 直接使用CSS选择器查找 "Next page" 按钮 (最可靠) + # 从调试输出得知,分页按钮是 + + + +
+ 支持的大学类型:
+   • 美国大学 (如 Harvard, MIT, Stanford)
+   • 英国大学 (如 Oxford, Cambridge)
+   • 其他海外大学
+ + )} + + {currentStep === 1 && ( +
+ 正在分析网站结构...
+ 系统正在访问大学官网,分析页面结构并生成爬虫脚本
+ 这可能需要几秒钟,请稍候...
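+ {/*
+   示意(推测):此分析步骤对应后端的 POST /api/scripts/generate 接口,
+   前端封装见 services/api.ts 中的 scriptApi.generate,请求体形如
+   { university_url: string, university_name?: string }。
+ */}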
+ )} + + {currentStep === 2 && ( + navigate(`/university/${universityId}`)} + > + 进入大学管理页面 + , + + ]} + /> + )} + + ) +} diff --git a/frontend/src/pages/HomePage.tsx b/frontend/src/pages/HomePage.tsx new file mode 100644 index 0000000..806fd8c --- /dev/null +++ b/frontend/src/pages/HomePage.tsx @@ -0,0 +1,185 @@ +/** + * 首页 - 大学列表 + */ +import { useState } from 'react' +import { useNavigate } from 'react-router-dom' +import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query' +import { + Card, Table, Button, Input, Space, Tag, message, Popconfirm, Typography, Row, Col, Statistic +} from 'antd' +import { + PlusOutlined, SearchOutlined, DeleteOutlined, EyeOutlined, ReloadOutlined +} from '@ant-design/icons' +import { universityApi } from '../services/api' + +const { Title } = Typography + +// 状态标签映射 +const statusTags: Record = { + pending: { color: 'default', text: '待分析' }, + analyzing: { color: 'processing', text: '分析中' }, + ready: { color: 'success', text: '就绪' }, + error: { color: 'error', text: '错误' } +} + +export default function HomePage() { + const navigate = useNavigate() + const queryClient = useQueryClient() + const [search, setSearch] = useState('') + + // 获取大学列表 + const { data, isLoading, refetch } = useQuery({ + queryKey: ['universities', search], + queryFn: () => universityApi.list({ search: search || undefined }) + }) + + // 删除大学 + const deleteMutation = useMutation({ + mutationFn: universityApi.delete, + onSuccess: () => { + message.success('删除成功') + queryClient.invalidateQueries({ queryKey: ['universities'] }) + }, + onError: () => { + message.error('删除失败') + } + }) + + const universities = data?.data?.items || [] + const total = data?.data?.total || 0 + + // 统计 + const readyCount = universities.filter((u: any) => u.status === 'ready').length + const totalPrograms = universities.reduce((sum: number, u: any) => + sum + (u.latest_result?.programs_count || 0), 0) + const totalFaculty = universities.reduce((sum: number, u: any) => + sum + (u.latest_result?.faculty_count || 0), 0) + + const columns = [ + { + title: '大学名称', + dataIndex: 'name', + key: 'name', + render: (text: string, record: any) => ( +
navigate(`/university/${record.id}`)}>{text} + ) + }, + { + title: '国家', + dataIndex: 'country', + key: 'country', + width: 100 + }, + { + title: '状态', + dataIndex: 'status', + key: 'status', + width: 100, + render: (status: string) => { + const tag = statusTags[status] || { color: 'default', text: status } + return {tag.text} + } + }, + { + title: '项目数', + key: 'programs', + width: 100, + render: (_: any, record: any) => record.latest_result?.programs_count || '-' + }, + { + title: '导师数', + key: 'faculty', + width: 100, + render: (_: any, record: any) => record.latest_result?.faculty_count || '-' + }, + { + title: '操作', + key: 'actions', + width: 150, + render: (_: any, record: any) => ( + + + deleteMutation.mutate(record.id)} + okText="确定" + cancelText="取消" + > + + + + ) + } + ] + + return ( +
+ {/* 统计卡片 */} + + + + + + + + + + + + + + + + + + + + + + + + {/* 大学列表 */} + 大学列表} + extra={ + + } + value={search} + onChange={(e) => setSearch(e.target.value)} + style={{ width: 200 }} + allowClear + /> + + + + } + > + `共 ${t} 所大学` + }} + /> + + + ) +} diff --git a/frontend/src/pages/UniversityDetailPage.tsx b/frontend/src/pages/UniversityDetailPage.tsx new file mode 100644 index 0000000..2fc7f76 --- /dev/null +++ b/frontend/src/pages/UniversityDetailPage.tsx @@ -0,0 +1,368 @@ +/** + * 大学详情页面 - 管理爬虫、运行爬虫、查看数据 + */ +import { useState, useEffect } from 'react' +import { useParams, useNavigate } from 'react-router-dom' +import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query' +import { + Card, Tabs, Button, Typography, Tag, Space, Table, Progress, Timeline, Spin, + message, Descriptions, Tree, Input, Row, Col, Statistic, Empty, Modal +} from 'antd' +import { + PlayCircleOutlined, ReloadOutlined, DownloadOutlined, ArrowLeftOutlined, + CheckCircleOutlined, ClockCircleOutlined, ExclamationCircleOutlined, + SearchOutlined, TeamOutlined, BookOutlined, BankOutlined +} from '@ant-design/icons' +import { universityApi, scriptApi, jobApi, resultApi } from '../services/api' + +const { Title, Text, Paragraph } = Typography +const { TabPane } = Tabs + +// 状态映射 +const statusMap: Record = { + pending: { color: 'default', text: '等待中', icon: }, + running: { color: 'processing', text: '运行中', icon: }, + completed: { color: 'success', text: '已完成', icon: }, + failed: { color: 'error', text: '失败', icon: }, + cancelled: { color: 'warning', text: '已取消', icon: } +} + +export default function UniversityDetailPage() { + const { id } = useParams<{ id: string }>() + const navigate = useNavigate() + const queryClient = useQueryClient() + const universityId = parseInt(id || '0') + + const [activeTab, setActiveTab] = useState('overview') + const [pollingJobId, setPollingJobId] = useState(null) + const [searchKeyword, setSearchKeyword] = useState('') + + // 获取大学详情 + const { data: universityData, isLoading: universityLoading } = useQuery({ + queryKey: ['university', universityId], + queryFn: () => universityApi.get(universityId) + }) + + // 获取脚本 + const { data: scriptsData } = useQuery({ + queryKey: ['scripts', universityId], + queryFn: () => scriptApi.getByUniversity(universityId) + }) + + // 获取任务列表 + const { data: jobsData, refetch: refetchJobs } = useQuery({ + queryKey: ['jobs', universityId], + queryFn: () => jobApi.getByUniversity(universityId) + }) + + // 获取结果数据 + const { data: resultData } = useQuery({ + queryKey: ['result', universityId], + queryFn: () => resultApi.get(universityId), + enabled: activeTab === 'data' + }) + + // 获取任务状态 (轮询) + const { data: jobStatusData } = useQuery({ + queryKey: ['job-status', pollingJobId], + queryFn: () => jobApi.getStatus(pollingJobId!), + enabled: !!pollingJobId, + refetchInterval: pollingJobId ? 
2000 : false + }) + + // 启动爬虫任务 + const startJobMutation = useMutation({ + mutationFn: () => jobApi.start(universityId), + onSuccess: (response) => { + message.success('爬虫任务已启动') + setPollingJobId(response.data.id) + refetchJobs() + }, + onError: (error: any) => { + message.error(error.response?.data?.detail || '启动失败') + } + }) + + // 监听任务完成 + useEffect(() => { + if (jobStatusData?.data?.status === 'completed' || jobStatusData?.data?.status === 'failed') { + setPollingJobId(null) + refetchJobs() + queryClient.invalidateQueries({ queryKey: ['university', universityId] }) + queryClient.invalidateQueries({ queryKey: ['result', universityId] }) + + if (jobStatusData?.data?.status === 'completed') { + message.success('爬取完成!') + } else { + message.error('爬取失败') + } + } + }, [jobStatusData?.data?.status]) + + const university = universityData?.data + const scripts = scriptsData?.data || [] + const jobs = jobsData?.data || [] + const result = resultData?.data + + // 构建数据树 + const buildDataTree = () => { + if (!result?.result_data?.schools) return [] + + return result.result_data.schools.map((school: any, si: number) => ({ + key: `school-${si}`, + title: ( + + + {school.name} ({school.programs?.length || 0}个项目) + + ), + children: school.programs?.map((prog: any, pi: number) => ({ + key: `program-${si}-${pi}`, + title: ( + + + {prog.name} ({prog.faculty?.length || 0}位导师) + + ), + children: prog.faculty?.map((fac: any, fi: number) => ({ + key: `faculty-${si}-${pi}-${fi}`, + title: ( + + + {fac.name} + + ), + isLeaf: true + })) + })) + })) + } + + if (universityLoading) { + return + } + + if (!university) { + return + } + + const activeScript = scripts.find((s: any) => s.status === 'active') + const latestJob = jobs[0] + const isRunning = pollingJobId !== null || latestJob?.status === 'running' + + return ( +
+ {/* 头部 */} + + + + + + +
+ {university.name} + + {university.url} + + + {university.country || '未知国家'} + + {university.status === 'ready' ? '就绪' : university.status} + + + + + + + + + {/* 统计 */} + + + + + + + + + + + + + + + + + {/* 运行进度 */} + {pollingJobId && jobStatusData?.data && ( + + 爬虫运行中 + + {jobStatusData.data.current_step} + +
+ ({ + color: log.level === 'error' ? 'red' : log.level === 'warning' ? 'orange' : 'blue', + children: {log.message} + }))} + /> +
+
+ )} + + {/* 标签页 */} + + + {/* 概览 */} + + + {university.name} + + {university.url} + + {university.country || '-'} + + + {university.status} + + + + {new Date(university.created_at).toLocaleString()} + + + {new Date(university.updated_at).toLocaleString()} + + + + 历史任务 +
{ + const s = statusMap[status] || { color: 'default', text: status } + return {s.icon} {s.text} + } + }, + { + title: '进度', + dataIndex: 'progress', + width: 150, + render: (progress: number) => + }, + { + title: '开始时间', + dataIndex: 'started_at', + render: (t: string) => t ? new Date(t).toLocaleString() : '-' + }, + { + title: '完成时间', + dataIndex: 'completed_at', + render: (t: string) => t ? new Date(t).toLocaleString() : '-' + } + ]} + /> + + + {/* 数据查看 */} + + {result?.result_data ? ( +
+ +
+ } + value={searchKeyword} + onChange={(e) => setSearchKeyword(e.target.value)} + style={{ width: 300 }} + /> + + + + + + + + + ) : ( + + )} + + + {/* 脚本管理 */} + + {activeScript ? ( +
+ + {activeScript.script_name} + v{activeScript.version} + + 活跃 + + + {new Date(activeScript.created_at).toLocaleString()} + + + + 脚本代码 +
+                  {activeScript.script_content}
+                
+
+ ) : ( + + )} +
+ + + + ) +} diff --git a/frontend/src/services/api.ts b/frontend/src/services/api.ts new file mode 100644 index 0000000..19e8e47 --- /dev/null +++ b/frontend/src/services/api.ts @@ -0,0 +1,77 @@ +/** + * API服务 + */ +import axios from 'axios' + +const api = axios.create({ + baseURL: '/api', + timeout: 60000 +}) + +// 大学相关API +export const universityApi = { + list: (params?: { skip?: number; limit?: number; search?: string }) => + api.get('/universities', { params }), + + get: (id: number) => + api.get(`/universities/${id}`), + + create: (data: { name: string; url: string; country?: string }) => + api.post('/universities', data), + + update: (id: number, data: { name?: string; url?: string; country?: string }) => + api.put(`/universities/${id}`, data), + + delete: (id: number) => + api.delete(`/universities/${id}`) +} + +// 脚本相关API +export const scriptApi = { + generate: (data: { university_url: string; university_name?: string }) => + api.post('/scripts/generate', data), + + getByUniversity: (universityId: number) => + api.get(`/scripts/university/${universityId}`), + + get: (id: number) => + api.get(`/scripts/${id}`) +} + +// 任务相关API +export const jobApi = { + start: (universityId: number) => + api.post(`/jobs/start/${universityId}`), + + get: (id: number) => + api.get(`/jobs/${id}`), + + getStatus: (id: number) => + api.get(`/jobs/${id}/status`), + + getByUniversity: (universityId: number) => + api.get(`/jobs/university/${universityId}`), + + cancel: (id: number) => + api.post(`/jobs/${id}/cancel`) +} + +// 结果相关API +export const resultApi = { + get: (universityId: number) => + api.get(`/results/university/${universityId}`), + + getSchools: (universityId: number) => + api.get(`/results/university/${universityId}/schools`), + + getPrograms: (universityId: number, params?: { school_name?: string; search?: string }) => + api.get(`/results/university/${universityId}/programs`, { params }), + + getFaculty: (universityId: number, params?: { school_name?: string; program_name?: string; search?: string; skip?: number; limit?: number }) => + api.get(`/results/university/${universityId}/faculty`, { params }), + + export: (universityId: number) => + api.get(`/results/university/${universityId}/export`, { responseType: 'blob' }) +} + +export default api diff --git a/frontend/src/vite-env.d.ts b/frontend/src/vite-env.d.ts new file mode 100644 index 0000000..11f02fe --- /dev/null +++ b/frontend/src/vite-env.d.ts @@ -0,0 +1 @@ +/// diff --git a/frontend/tsconfig.json b/frontend/tsconfig.json new file mode 100644 index 0000000..3934b8f --- /dev/null +++ b/frontend/tsconfig.json @@ -0,0 +1,21 @@ +{ + "compilerOptions": { + "target": "ES2020", + "useDefineForClassFields": true, + "lib": ["ES2020", "DOM", "DOM.Iterable"], + "module": "ESNext", + "skipLibCheck": true, + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "resolveJsonModule": true, + "isolatedModules": true, + "noEmit": true, + "jsx": "react-jsx", + "strict": true, + "noUnusedLocals": true, + "noUnusedParameters": true, + "noFallthroughCasesInSwitch": true + }, + "include": ["src"], + "references": [{ "path": "./tsconfig.node.json" }] +} diff --git a/frontend/tsconfig.node.json b/frontend/tsconfig.node.json new file mode 100644 index 0000000..42872c5 --- /dev/null +++ b/frontend/tsconfig.node.json @@ -0,0 +1,10 @@ +{ + "compilerOptions": { + "composite": true, + "skipLibCheck": true, + "module": "ESNext", + "moduleResolution": "bundler", + "allowSyntheticDefaultImports": true + }, + "include": ["vite.config.ts"] +} diff 
--git a/frontend/vite.config.ts b/frontend/vite.config.ts new file mode 100644 index 0000000..b69c5ce --- /dev/null +++ b/frontend/vite.config.ts @@ -0,0 +1,15 @@ +import { defineConfig } from 'vite' +import react from '@vitejs/plugin-react' + +export default defineConfig({ + plugins: [react()], + server: { + port: 3000, + proxy: { + '/api': { + target: 'http://localhost:8000', + changeOrigin: true + } + } + } +}) diff --git a/scripts/reorganize_by_school.py b/scripts/reorganize_by_school.py new file mode 100644 index 0000000..494d601 --- /dev/null +++ b/scripts/reorganize_by_school.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +""" +将已爬取的Harvard数据按学院重新组织 + +读取原始扁平数据,按 学院 → 项目 → 导师 层级重新组织输出 +""" + +import json +from pathlib import Path +from datetime import datetime, timezone +from urllib.parse import urlparse +from collections import defaultdict + +# Harvard学院映射 - 根据URL子域名判断所属学院 +SCHOOL_MAPPING = { + "gsas.harvard.edu": "Graduate School of Arts and Sciences (GSAS)", + "seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)", + "hbs.edu": "Harvard Business School (HBS)", + "www.hbs.edu": "Harvard Business School (HBS)", + "gsd.harvard.edu": "Graduate School of Design (GSD)", + "www.gsd.harvard.edu": "Graduate School of Design (GSD)", + "gse.harvard.edu": "Graduate School of Education (HGSE)", + "www.gse.harvard.edu": "Graduate School of Education (HGSE)", + "hks.harvard.edu": "Harvard Kennedy School (HKS)", + "www.hks.harvard.edu": "Harvard Kennedy School (HKS)", + "hls.harvard.edu": "Harvard Law School (HLS)", + "hms.harvard.edu": "Harvard Medical School (HMS)", + "hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)", + "www.hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)", + "hds.harvard.edu": "Harvard Divinity School (HDS)", + "hsdm.harvard.edu": "Harvard School of Dental Medicine (HSDM)", + "fas.harvard.edu": "Faculty of Arts and Sciences (FAS)", + "aaas.fas.harvard.edu": "Faculty of Arts and Sciences (FAS)", + "dce.harvard.edu": "Division of Continuing Education (DCE)", + "extension.harvard.edu": "Harvard Extension School", + "cs.seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)", +} + +# 学院URL映射 +SCHOOL_URLS = { + "Graduate School of Arts and Sciences (GSAS)": "https://gsas.harvard.edu/", + "John A. Paulson School of Engineering and Applied Sciences (SEAS)": "https://seas.harvard.edu/", + "Harvard Business School (HBS)": "https://www.hbs.edu/", + "Graduate School of Design (GSD)": "https://www.gsd.harvard.edu/", + "Graduate School of Education (HGSE)": "https://www.gse.harvard.edu/", + "Harvard Kennedy School (HKS)": "https://www.hks.harvard.edu/", + "Harvard Law School (HLS)": "https://hls.harvard.edu/", + "Harvard Medical School (HMS)": "https://hms.harvard.edu/", + "T.H. 
Chan School of Public Health (HSPH)": "https://www.hsph.harvard.edu/", + "Harvard Divinity School (HDS)": "https://hds.harvard.edu/", + "Harvard School of Dental Medicine (HSDM)": "https://hsdm.harvard.edu/", + "Faculty of Arts and Sciences (FAS)": "https://fas.harvard.edu/", + "Division of Continuing Education (DCE)": "https://dce.harvard.edu/", + "Harvard Extension School": "https://extension.harvard.edu/", + "Other": "https://www.harvard.edu/", +} + + +def determine_school_from_url(url: str) -> str: + """根据URL判断所属学院""" + if not url: + return "Other" + + parsed = urlparse(url) + domain = parsed.netloc.lower() + + # 先尝试完全匹配 + for pattern, school_name in SCHOOL_MAPPING.items(): + if domain == pattern: + return school_name + + # 再尝试部分匹配 + for pattern, school_name in SCHOOL_MAPPING.items(): + if pattern in domain: + return school_name + + return "Other" + + +def reorganize_data(input_path: str, output_path: str): + """重新组织数据按学院层级""" + + # 读取原始数据 + with open(input_path, 'r', encoding='utf-8') as f: + data = json.load(f) + + print(f"读取原始数据: {data['total_programs']} 个项目, {data['total_faculty_found']} 位导师") + + # 按学院分组 + schools_dict = defaultdict(lambda: {"name": "", "url": "", "programs": []}) + + for prog in data['programs']: + # 根据faculty_page_url判断学院 + faculty_url = prog.get('faculty_page_url', '') + school_name = determine_school_from_url(faculty_url) + + # 如果没有faculty_page_url,尝试从program url推断 + if school_name == "Other" and prog.get('url'): + school_name = determine_school_from_url(prog['url']) + + # 创建项目对象 + program = { + "name": prog['name'], + "url": prog.get('url', ''), + "degree_type": prog.get('degrees', ''), + "faculty_page_url": faculty_url, + "faculty": prog.get('faculty', []) + } + + # 添加到学院 + if not schools_dict[school_name]["name"]: + schools_dict[school_name]["name"] = school_name + schools_dict[school_name]["url"] = SCHOOL_URLS.get(school_name, "") + + schools_dict[school_name]["programs"].append(program) + + # 转换为列表并排序 + schools_list = sorted(schools_dict.values(), key=lambda s: s["name"]) + + # 构建输出结构 + result = { + "name": "Harvard University", + "url": "https://www.harvard.edu/", + "country": "USA", + "scraped_at": datetime.now(timezone.utc).isoformat(), + "schools": schools_list + } + + # 打印统计 + print("\n" + "=" * 60) + print("按学院重新组织完成!") + print("=" * 60) + print(f"大学: {result['name']}") + print(f"学院数: {len(schools_list)}") + + total_programs = sum(len(s['programs']) for s in schools_list) + total_faculty = sum(len(p['faculty']) for s in schools_list for p in s['programs']) + + print(f"项目数: {total_programs}") + print(f"导师数: {total_faculty}") + + print("\n各学院统计:") + for school in schools_list: + prog_count = len(school['programs']) + fac_count = sum(len(p['faculty']) for p in school['programs']) + print(f" {school['name']}: {prog_count}个项目, {fac_count}位导师") + + # 保存结果 + output_file = Path(output_path) + output_file.parent.mkdir(parents=True, exist_ok=True) + + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + print(f"\n结果已保存到: {output_path}") + + return result + + +if __name__ == "__main__": + input_file = "artifacts/harvard_programs_with_faculty.json" + output_file = "output/harvard_hierarchical_result.json" + + reorganize_data(input_file, output_file) diff --git a/scripts/start_backend.py b/scripts/start_backend.py new file mode 100644 index 0000000..b2b4bb1 --- /dev/null +++ b/scripts/start_backend.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +""" +启动后端API服务 (本地开发) +""" + +import subprocess +import sys 
+import os + +# 切换到项目根目录 +project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +os.chdir(project_root) + +# 添加backend到Python路径 +backend_path = os.path.join(project_root, "backend") +sys.path.insert(0, backend_path) + +print("=" * 60) +print("启动大学爬虫 Web API 服务") +print("=" * 60) +print(f"项目目录: {project_root}") +print(f"后端目录: {backend_path}") +print() + +# 检查是否安装了依赖 +try: + import fastapi + import uvicorn +except ImportError: + print("正在安装后端依赖...") + subprocess.run([sys.executable, "-m", "pip", "install", "-r", "backend/requirements.txt"]) + +# 初始化数据库 +print("初始化数据库...") +os.chdir(backend_path) + +# 启动服务 +print() +print("启动 FastAPI 服务...") +print("API文档: http://localhost:8000/docs") +print("Swagger UI: http://localhost:8000/redoc") +print() + +import uvicorn +uvicorn.run("app.main:app", host="0.0.0.0", port=8000, reload=True) diff --git a/scripts/start_dev.bat b/scripts/start_dev.bat new file mode 100644 index 0000000..fd89f42 --- /dev/null +++ b/scripts/start_dev.bat @@ -0,0 +1,42 @@ +@echo off +echo ============================================================ +echo 大学爬虫 Web 系统 - 本地开发启动 +echo ============================================================ + +echo. +echo 启动后端API服务... +cd /d "%~dp0..\backend" + +REM 安装后端依赖 +pip install -r requirements.txt -q + +REM 启动后端 +start cmd /k "cd /d %~dp0..\backend && uvicorn app.main:app --reload --port 8000" + +echo 后端已启动: http://localhost:8000 +echo API文档: http://localhost:8000/docs + +echo. +echo 启动前端服务... +cd /d "%~dp0..\frontend" + +REM 安装前端依赖 +if not exist node_modules ( + echo 安装前端依赖... + npm install +) + +REM 启动前端 +start cmd /k "cd /d %~dp0..\frontend && npm run dev" + +echo 前端已启动: http://localhost:3000 + +echo. +echo ============================================================ +echo 系统启动完成! +echo. +echo 后端API: http://localhost:8000/docs +echo 前端页面: http://localhost:3000 +echo ============================================================ + +pause diff --git a/scripts/test_harvard.py b/scripts/test_harvard.py new file mode 100644 index 0000000..9fc404b --- /dev/null +++ b/scripts/test_harvard.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +""" +测试Harvard大学爬取 - 只测试2个学院 +""" + +import asyncio +import sys +from pathlib import Path + +# 添加项目路径 +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from university_scraper.config import ScraperConfig +from university_scraper.scraper import UniversityScraper + + +# 简化的测试配置 - 只测试2个学院 +TEST_CONFIG = { + "university": { + "name": "Harvard University", + "url": "https://www.harvard.edu/", + "country": "USA" + }, + "schools": { + "discovery_method": "static_list", + "static_list": [ + { + "name": "John A. 
Paulson School of Engineering and Applied Sciences (SEAS)", + "url": "https://seas.harvard.edu/" + }, + { + "name": "Graduate School of Design (GSD)", + "url": "https://www.gsd.harvard.edu/" + } + ] + }, + "programs": { + "paths_to_try": [ + "/academics/graduate-programs", + "/programs", + "/academics/programs", + "/graduate" + ], + "link_patterns": [ + {"text_contains": ["program", "degree"], "href_contains": ["/program", "/degree"]}, + {"text_contains": ["master", "graduate"], "href_contains": ["/master", "/graduate"]} + ], + "selectors": { + "program_item": "div.program-item, li.program, a[href*='/program']", + "program_name": "h3, .title", + "program_url": "a[href]", + "degree_type": ".degree" + }, + "pagination": {"type": "none"} + }, + "faculty": { + "discovery_strategies": [ + { + "type": "link_in_page", + "patterns": [ + {"text_contains": ["faculty", "people"], "href_contains": ["/faculty", "/people"]} + ] + }, + { + "type": "url_pattern", + "patterns": [ + "{school_url}/faculty", + "{school_url}/people" + ] + } + ], + "selectors": { + "faculty_item": "div.faculty, li.person", + "faculty_name": "h3, .name", + "faculty_url": "a[href*='/people/'], a[href*='/faculty/']" + } + }, + "filters": { + "program_degree_types": { + "include": ["Master", "M.S.", "M.A.", "MBA", "M.Eng", "S.M."], + "exclude": ["Ph.D.", "Doctor", "Bachelor"] + }, + "exclude_schools": [] + } +} + + +async def test_harvard(): + """测试Harvard爬取""" + print("=" * 60) + print("测试Harvard大学爬取(简化版 - 2个学院)") + print("=" * 60) + + config = ScraperConfig.from_dict(TEST_CONFIG) + + async with UniversityScraper(config, headless=False) as scraper: + university = await scraper.scrape() + scraper.save_results("output/harvard_test_result.json") + + # 打印详细结果 + print("\n" + "=" * 60) + print("详细结果:") + print("=" * 60) + + for school in university.schools: + print(f"\n学院: {school.name}") + print(f" URL: {school.url}") + print(f" 项目数: {len(school.programs)}") + + for prog in school.programs[:5]: + print(f"\n 项目: {prog.name}") + print(f" URL: {prog.url}") + print(f" 学位: {prog.degree_type}") + print(f" 导师数: {len(prog.faculty)}") + + if prog.faculty: + print(" 导师示例:") + for f in prog.faculty[:3]: + print(f" - {f.name}: {f.url}") + + if len(school.programs) > 5: + print(f"\n ... 
还有 {len(school.programs) - 5} 个项目") + + +if __name__ == "__main__": + asyncio.run(test_harvard()) diff --git a/src/university_scraper/__init__.py b/src/university_scraper/__init__.py new file mode 100644 index 0000000..cb61906 --- /dev/null +++ b/src/university_scraper/__init__.py @@ -0,0 +1,7 @@ +""" +University Scraper - 通用大学官网爬虫框架 + +支持按照 学院 → 项目 → 导师 的层级结构爬取任意海外大学官网 +""" + +__version__ = "1.0.0" diff --git a/src/university_scraper/__main__.py b/src/university_scraper/__main__.py new file mode 100644 index 0000000..cfbd337 --- /dev/null +++ b/src/university_scraper/__main__.py @@ -0,0 +1,8 @@ +""" +模块入口点,支持 python -m university_scraper 运行 +""" + +from .cli import main + +if __name__ == "__main__": + main() diff --git a/src/university_scraper/analyzer.py b/src/university_scraper/analyzer.py new file mode 100644 index 0000000..c2cdfed --- /dev/null +++ b/src/university_scraper/analyzer.py @@ -0,0 +1,374 @@ +""" +AI辅助页面分析工具 + +帮助分析新大学官网的页面结构,生成配置建议 +""" + +import asyncio +import json +from typing import Dict, Any, List, Optional +from urllib.parse import urljoin, urlparse + +from playwright.async_api import async_playwright, Page + + +class PageAnalyzer: + """页面结构分析器""" + + def __init__(self): + self.browser = None + self.page: Optional[Page] = None + + async def __aenter__(self): + playwright = await async_playwright().start() + self.browser = await playwright.chromium.launch(headless=False) + context = await self.browser.new_context( + viewport={'width': 1920, 'height': 1080} + ) + self.page = await context.new_page() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + if self.browser: + await self.browser.close() + + async def analyze_university_homepage(self, url: str) -> Dict[str, Any]: + """分析大学官网首页,寻找学院链接""" + print(f"\n分析大学首页: {url}") + + await self.page.goto(url, wait_until='networkidle') + await self.page.wait_for_timeout(3000) + + analysis = await self.page.evaluate('''() => { + const result = { + title: document.title, + schools_links: [], + navigation_links: [], + potential_schools_pages: [], + all_harvard_subdomains: new Set() + }; + + // 查找可能的学院链接 + const schoolKeywords = ['school', 'college', 'faculty', 'institute', 'academy', 'department']; + const navKeywords = ['academics', 'schools', 'colleges', 'programs', 'education']; + + document.querySelectorAll('a[href]').forEach(a => { + const href = a.href || ''; + const text = a.innerText.trim().toLowerCase(); + + // 收集所有子域名 + try { + const urlObj = new URL(href); + if (urlObj.hostname.includes('harvard.edu') && + urlObj.hostname !== 'www.harvard.edu') { + result.all_harvard_subdomains.add(urlObj.origin); + } + } catch(e) {} + + // 查找学院链接 + if (schoolKeywords.some(kw => text.includes(kw)) || + schoolKeywords.some(kw => href.toLowerCase().includes(kw))) { + result.schools_links.push({ + text: a.innerText.trim().substring(0, 100), + href: href + }); + } + + // 查找导航到学院列表的链接 + if (navKeywords.some(kw => text.includes(kw))) { + result.potential_schools_pages.push({ + text: a.innerText.trim().substring(0, 50), + href: href + }); + } + }); + + // 转换Set为数组 + result.all_harvard_subdomains = Array.from(result.all_harvard_subdomains); + + return result; + }''') + + print(f"\n页面标题: {analysis['title']}") + print(f"\n发现的子域名 ({len(analysis['all_harvard_subdomains'])} 个):") + for subdomain in analysis['all_harvard_subdomains'][:20]: + print(f" - {subdomain}") + + print(f"\n可能的学院链接 ({len(analysis['schools_links'])} 个):") + for link in analysis['schools_links'][:15]: + print(f" - {link['text'][:50]} -> {link['href']}") + + 
return analysis + + async def analyze_school_page(self, url: str) -> Dict[str, Any]: + """分析学院页面,寻找项目列表""" + print(f"\n分析学院页面: {url}") + + await self.page.goto(url, wait_until='networkidle') + await self.page.wait_for_timeout(3000) + + analysis = await self.page.evaluate('''() => { + const result = { + title: document.title, + navigation: [], + program_links: [], + degree_mentions: [], + faculty_links: [] + }; + + // 分析导航结构 + document.querySelectorAll('nav a, [class*="nav"] a, header a').forEach(a => { + const text = a.innerText.trim(); + const href = a.href || ''; + if (text.length > 2 && text.length < 50) { + result.navigation.push({ text, href }); + } + }); + + // 查找项目/学位链接 + const programKeywords = ['program', 'degree', 'master', 'graduate', 'academic', 'study']; + + document.querySelectorAll('a[href]').forEach(a => { + const text = a.innerText.trim().toLowerCase(); + const href = a.href.toLowerCase(); + + if (programKeywords.some(kw => text.includes(kw) || href.includes(kw))) { + result.program_links.push({ + text: a.innerText.trim().substring(0, 100), + href: a.href + }); + } + + // 查找Faculty链接 + if (text.includes('faculty') || text.includes('people') || + href.includes('/faculty') || href.includes('/people')) { + result.faculty_links.push({ + text: a.innerText.trim().substring(0, 100), + href: a.href + }); + } + }); + + return result; + }''') + + print(f"\n导航链接:") + for nav in analysis['navigation'][:10]: + print(f" - {nav['text']} -> {nav['href']}") + + print(f"\n项目相关链接 ({len(analysis['program_links'])} 个):") + for link in analysis['program_links'][:15]: + print(f" - {link['text'][:50]} -> {link['href']}") + + print(f"\nFaculty链接 ({len(analysis['faculty_links'])} 个):") + for link in analysis['faculty_links'][:10]: + print(f" - {link['text'][:50]} -> {link['href']}") + + return analysis + + async def analyze_programs_page(self, url: str) -> Dict[str, Any]: + """分析项目列表页面,识别项目选择器""" + print(f"\n分析项目列表页面: {url}") + + await self.page.goto(url, wait_until='networkidle') + await self.page.wait_for_timeout(3000) + + # 保存截图 + screenshot_path = f"analysis_{urlparse(url).netloc.replace('.', '_')}.png" + await self.page.screenshot(path=screenshot_path, full_page=True) + print(f"截图已保存: {screenshot_path}") + + analysis = await self.page.evaluate('''() => { + const result = { + title: document.title, + potential_program_containers: [], + program_items: [], + pagination: null, + selectors_suggestion: {} + }; + + // 分析页面结构,寻找重复的项目容器 + const containers = [ + 'div[class*="program"]', + 'li[class*="program"]', + 'article[class*="program"]', + 'div[class*="degree"]', + 'div[class*="card"]', + 'li.item', + 'div.item' + ]; + + containers.forEach(selector => { + const elements = document.querySelectorAll(selector); + if (elements.length >= 3) { + result.potential_program_containers.push({ + selector: selector, + count: elements.length, + sample: elements[0].outerHTML.substring(0, 500) + }); + } + }); + + // 查找所有看起来像项目的链接 + document.querySelectorAll('a[href]').forEach(a => { + const href = a.href.toLowerCase(); + const text = a.innerText.trim(); + + if ((href.includes('/program') || href.includes('/degree') || + href.includes('/master') || href.includes('/graduate')) && + text.length > 5 && text.length < 150) { + + result.program_items.push({ + text: text, + href: a.href, + parentClass: a.parentElement?.className || '', + grandparentClass: a.parentElement?.parentElement?.className || '' + }); + } + }); + + // 查找分页元素 + const paginationSelectors = [ + '.pagination', + '[class*="pagination"]', + 
'nav[aria-label*="page"]', + '.pager' + ]; + + for (const selector of paginationSelectors) { + const elem = document.querySelector(selector); + if (elem) { + result.pagination = { + selector: selector, + html: elem.outerHTML.substring(0, 300) + }; + break; + } + } + + return result; + }''') + + print(f"\n可能的项目容器:") + for container in analysis['potential_program_containers']: + print(f" 选择器: {container['selector']} (找到 {container['count']} 个)") + + print(f"\n找到的项目链接 ({len(analysis['program_items'])} 个):") + for item in analysis['program_items'][:10]: + print(f" - {item['text'][:60]}") + print(f" 父元素class: {item['parentClass'][:50]}") + + if analysis['pagination']: + print(f"\n分页元素: {analysis['pagination']['selector']}") + + return analysis + + async def analyze_faculty_page(self, url: str) -> Dict[str, Any]: + """分析导师列表页面,识别导师选择器""" + print(f"\n分析导师列表页面: {url}") + + await self.page.goto(url, wait_until='networkidle') + await self.page.wait_for_timeout(3000) + + analysis = await self.page.evaluate('''() => { + const result = { + title: document.title, + faculty_links: [], + potential_containers: [], + url_patterns: new Set() + }; + + // 查找个人页面链接 + const personPatterns = ['/people/', '/faculty/', '/profile/', '/person/', '/directory/']; + + document.querySelectorAll('a[href]').forEach(a => { + const href = a.href.toLowerCase(); + const text = a.innerText.trim(); + + if (personPatterns.some(p => href.includes(p)) && + text.length > 3 && text.length < 100) { + + result.faculty_links.push({ + name: text, + url: a.href, + parentClass: a.parentElement?.className || '' + }); + + // 记录URL模式 + personPatterns.forEach(p => { + if (href.includes(p)) { + result.url_patterns.add(p); + } + }); + } + }); + + result.url_patterns = Array.from(result.url_patterns); + + return result; + }''') + + print(f"\n发现的导师链接 ({len(analysis['faculty_links'])} 个):") + for faculty in analysis['faculty_links'][:15]: + print(f" - {faculty['name']} -> {faculty['url']}") + + print(f"\nURL模式: {analysis['url_patterns']}") + + return analysis + + async def generate_config_suggestion(self, university_url: str) -> str: + """生成配置文件建议""" + print(f"\n{'='*60}") + print(f"开始分析: {university_url}") + print(f"{'='*60}") + + # 分析首页 + homepage_analysis = await self.analyze_university_homepage(university_url) + + # 生成配置建议 + domain = urlparse(university_url).netloc + config_suggestion = f'''# {homepage_analysis['title']} 爬虫配置 +# 自动生成的配置建议,请根据实际情况调整 + +university: + name: "{homepage_analysis['title'].split(' - ')[0].split(' | ')[0]}" + url: "{university_url}" + country: "TODO" + +# 发现的子域名(可能是学院网站): +# {chr(10).join(['# - ' + s for s in homepage_analysis['all_harvard_subdomains'][:10]])} + +schools: + discovery_method: "static_list" + + # TODO: 根据上面的子域名和学院链接,手动填写学院列表 + static_list: + # 示例: + # - name: "School of Engineering" + # url: "https://engineering.{domain}/" +''' + + print(f"\n{'='*60}") + print("配置建议:") + print(f"{'='*60}") + print(config_suggestion) + + return config_suggestion + + +async def analyze_new_university(url: str): + """分析新大学的便捷函数""" + async with PageAnalyzer() as analyzer: + await analyzer.generate_config_suggestion(url) + + +# CLI入口 +if __name__ == "__main__": + import sys + + if len(sys.argv) < 2: + print("用法: python analyzer.py ") + print("示例: python analyzer.py https://www.stanford.edu/") + sys.exit(1) + + asyncio.run(analyze_new_university(sys.argv[1])) diff --git a/src/university_scraper/cli.py b/src/university_scraper/cli.py new file mode 100644 index 0000000..7602ee4 --- /dev/null +++ b/src/university_scraper/cli.py 
@@ -0,0 +1,105 @@ +""" +命令行工具 + +用法: + # 爬取指定大学 + python -m university_scraper scrape harvard + + # 分析新大学 + python -m university_scraper analyze https://www.stanford.edu/ + + # 列出可用配置 + python -m university_scraper list +""" + +import asyncio +import argparse +from pathlib import Path + + +def main(): + parser = argparse.ArgumentParser( + description="通用大学官网爬虫 - 按照 学院→项目→导师 层级爬取" + ) + + subparsers = parser.add_subparsers(dest='command', help='可用命令') + + # 爬取命令 + scrape_parser = subparsers.add_parser('scrape', help='爬取指定大学') + scrape_parser.add_argument('university', help='大学名称(配置文件名,不含.yaml)') + scrape_parser.add_argument('-o', '--output', help='输出文件路径', default=None) + scrape_parser.add_argument('--headless', action='store_true', help='无头模式运行') + scrape_parser.add_argument('--config-dir', default='configs', help='配置文件目录') + + # 分析命令 + analyze_parser = subparsers.add_parser('analyze', help='分析新大学官网结构') + analyze_parser.add_argument('url', help='大学官网URL') + + # 列出命令 + list_parser = subparsers.add_parser('list', help='列出可用的大学配置') + list_parser.add_argument('--config-dir', default='configs', help='配置文件目录') + + args = parser.parse_args() + + if args.command == 'scrape': + asyncio.run(run_scrape(args)) + elif args.command == 'analyze': + asyncio.run(run_analyze(args)) + elif args.command == 'list': + run_list(args) + else: + parser.print_help() + + +async def run_scrape(args): + """执行爬取""" + from .config import load_config + from .scraper import UniversityScraper + + config_path = Path(args.config_dir) / f"{args.university}.yaml" + + if not config_path.exists(): + print(f"错误: 配置文件不存在 - {config_path}") + print(f"可用配置: {list_configs(args.config_dir)}") + return + + config = load_config(str(config_path)) + + output_path = args.output or f"output/{args.university}_result.json" + + async with UniversityScraper(config, headless=args.headless) as scraper: + await scraper.scrape() + scraper.save_results(output_path) + + +async def run_analyze(args): + """执行分析""" + from .analyzer import PageAnalyzer + + async with PageAnalyzer() as analyzer: + await analyzer.generate_config_suggestion(args.url) + + +def run_list(args): + """列出可用配置""" + configs = list_configs(args.config_dir) + + if configs: + print("可用的大学配置:") + for name in configs: + print(f" - {name}") + else: + print(f"在 {args.config_dir} 目录下没有找到配置文件") + + +def list_configs(config_dir: str): + """列出配置文件""" + path = Path(config_dir) + if not path.exists(): + return [] + + return [f.stem for f in path.glob("*.yaml")] + [f.stem for f in path.glob("*.yml")] + + +if __name__ == "__main__": + main() diff --git a/src/university_scraper/config.py b/src/university_scraper/config.py new file mode 100644 index 0000000..517ff9a --- /dev/null +++ b/src/university_scraper/config.py @@ -0,0 +1,232 @@ +""" +配置文件加载和验证 + +配置文件格式 (YAML): + +university: + name: "Harvard University" + url: "https://www.harvard.edu/" + country: "USA" + +# 第一层:学院列表页面 +schools: + # 获取学院列表的方式 + discovery_method: "static_list" # static_list | scrape_page | sitemap + + # 方式1: 静态列表 (手动配置已知学院) + static_list: + - name: "School of Engineering and Applied Sciences" + url: "https://seas.harvard.edu/" + keywords: ["engineering", "computer"] + faculty_pages: + - url: "https://seas.harvard.edu/people" + extract_method: "links" # links | table | research_explorer + request: + timeout_ms: 90000 + wait_for_selector: ".profile-card" + - name: "Graduate School of Arts and Sciences" + url: "https://gsas.harvard.edu/" + + # 方式2: 从页面爬取 + scrape_config: + url: "https://www.harvard.edu/schools/" + selector: 
"a.school-link" + name_attribute: "text" # text | title | data-name + url_attribute: "href" + +# 第二层:每个学院下的项目列表 +programs: + # 相对于学院URL的路径模式 + paths_to_try: + - "/academics/graduate-programs" + - "/programs" + - "/graduate" + - "/academics/masters" + + # 或者使用选择器从学院首页查找 + link_patterns: + - text_contains: ["graduate", "master", "program"] + - href_contains: ["/program", "/graduate", "/academics"] + + # 项目列表页面的选择器 + selectors: + program_item: "div.program-item, li.program, a.program-link" + program_name: "h3, .title, .program-name" + program_url: "a[href]" + degree_type: ".degree, .credential" + request: + timeout_ms: 45000 + max_retries: 3 + retry_backoff_ms: 3000 + + # 分页配置 + pagination: + type: "none" # none | click | url_param | infinite_scroll + next_selector: "a.next, button.next-page" + param_name: "page" + +# 第三层:每个项目下的导师列表 +faculty: + # 查找导师页面的策略 + discovery_strategies: + - type: "link_in_page" + patterns: + - text_contains: ["faculty", "people", "advisor", "professor"] + - href_contains: ["/faculty", "/people", "/directory"] + + - type: "url_pattern" + patterns: + - "{program_url}/faculty" + - "{program_url}/people" + - "{school_url}/people" + - type: "school_directory" + assign_to_all: true + match_by_school_keywords: true + request: + timeout_ms: 90000 + wait_for_selector: "a.link.person" + + # 导师列表页面的选择器 + selectors: + faculty_item: "div.faculty-item, li.person, .profile-card" + faculty_name: "h3, .name, .title a" + faculty_url: "a[href*='/people/'], a[href*='/faculty/'], a[href*='/profile/']" + faculty_title: ".title, .position, .role" + faculty_email: "a[href^='mailto:']" + +# 过滤规则 +filters: + # 只爬取硕士项目 + program_degree_types: + include: ["M.S.", "M.A.", "MBA", "Master", "M.Eng", "M.Ed", "M.P.P", "M.P.A"] + exclude: ["Ph.D.", "Bachelor", "B.S.", "B.A.", "Certificate"] + + # 排除某些学院 + exclude_schools: + - "Summer School" + - "Extension School" +""" + +import yaml +from pathlib import Path +from typing import Dict, Any, List, Optional +from dataclasses import dataclass, field + + +@dataclass +class UniversityConfig: + """大学基本信息配置""" + name: str + url: str + country: str = "Unknown" + + +@dataclass +class SchoolsConfig: + """学院发现配置""" + discovery_method: str = "static_list" + static_list: List[Dict[str, str]] = field(default_factory=list) + scrape_config: Optional[Dict[str, Any]] = None + request: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class ProgramsConfig: + """项目发现配置""" + paths_to_try: List[str] = field(default_factory=list) + link_patterns: List[Dict[str, List[str]]] = field(default_factory=list) + selectors: Dict[str, str] = field(default_factory=dict) + pagination: Dict[str, Any] = field(default_factory=dict) + request: Dict[str, Any] = field(default_factory=dict) + global_catalog: Optional[Dict[str, Any]] = None + + +@dataclass +class FacultyConfig: + """导师发现配置""" + discovery_strategies: List[Dict[str, Any]] = field(default_factory=list) + selectors: Dict[str, str] = field(default_factory=dict) + request: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class FiltersConfig: + """过滤规则配置""" + program_degree_types: Dict[str, List[str]] = field(default_factory=dict) + exclude_schools: List[str] = field(default_factory=list) + +@dataclass +class PlaywrightConfig: + """Playwright运行环境配置""" + stealth: bool = False + user_agent: Optional[str] = None + locale: Optional[str] = None + timezone_id: Optional[str] = None + viewport: Optional[Dict[str, int]] = None + ignore_https_errors: bool = False + extra_headers: Dict[str, str] = 
field(default_factory=dict) + cookies: List[Dict[str, Any]] = field(default_factory=list) + add_init_scripts: List[str] = field(default_factory=list) + + +@dataclass +class ScraperConfig: + """完整的爬虫配置""" + university: UniversityConfig + schools: SchoolsConfig + programs: ProgramsConfig + faculty: FacultyConfig + filters: FiltersConfig + playwright: PlaywrightConfig = field(default_factory=PlaywrightConfig) + + @classmethod + def from_yaml(cls, yaml_path: str) -> "ScraperConfig": + """从YAML文件加载配置""" + with open(yaml_path, 'r', encoding='utf-8') as f: + data = yaml.safe_load(f) + + return cls( + university=UniversityConfig(**data.get('university', {})), + schools=SchoolsConfig(**data.get('schools', {})), + programs=ProgramsConfig(**data.get('programs', {})), + faculty=FacultyConfig(**data.get('faculty', {})), + filters=FiltersConfig(**data.get('filters', {})), + playwright=PlaywrightConfig(**data.get('playwright', {})) + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "ScraperConfig": + """从字典创建配置""" + return cls( + university=UniversityConfig(**data.get('university', {})), + schools=SchoolsConfig(**data.get('schools', {})), + programs=ProgramsConfig(**data.get('programs', {})), + faculty=FacultyConfig(**data.get('faculty', {})), + filters=FiltersConfig(**data.get('filters', {})), + playwright=PlaywrightConfig(**data.get('playwright', {})) + ) + + +def load_config(config_path: str) -> ScraperConfig: + """加载配置文件""" + path = Path(config_path) + if not path.exists(): + raise FileNotFoundError(f"配置文件不存在: {config_path}") + + if path.suffix in ['.yaml', '.yml']: + return ScraperConfig.from_yaml(config_path) + else: + raise ValueError(f"不支持的配置文件格式: {path.suffix}") + + +def list_available_configs(configs_dir: str = "configs") -> List[str]: + """列出所有可用的配置文件""" + path = Path(configs_dir) + if not path.exists(): + return [] + + return [ + f.stem for f in path.glob("*.yaml") + ] + [ + f.stem for f in path.glob("*.yml") + ] diff --git a/src/university_scraper/harvard_scraper.py b/src/university_scraper/harvard_scraper.py new file mode 100644 index 0000000..7e0a400 --- /dev/null +++ b/src/university_scraper/harvard_scraper.py @@ -0,0 +1,405 @@ +#!/usr/bin/env python3 +""" +Harvard专用爬虫 + +Harvard的特殊情况: +1. 有一个集中的项目列表页面 (harvard.edu/programs) +2. 项目详情在GSAS页面 (gsas.harvard.edu/program/xxx) +3. 导师信息在各院系网站 + +爬取流程: +1. 从集中页面获取所有硕士项目 +2. 通过GSAS页面确定每个项目所属学院 +3. 从院系网站获取导师信息 +4. 按 学院→项目→导师 层级组织输出 +""" + +import asyncio +import json +import re +from datetime import datetime, timezone +from pathlib import Path +from typing import List, Dict, Optional, Tuple +from urllib.parse import urljoin + +from playwright.async_api import async_playwright, Page, Browser + +from .models import University, School, Program, Faculty + + +# Harvard学院映射 - 根据URL子域名判断所属学院 +SCHOOL_MAPPING = { + "gsas.harvard.edu": "Graduate School of Arts and Sciences (GSAS)", + "seas.harvard.edu": "John A. Paulson School of Engineering and Applied Sciences (SEAS)", + "hbs.edu": "Harvard Business School (HBS)", + "www.hbs.edu": "Harvard Business School (HBS)", + "gsd.harvard.edu": "Graduate School of Design (GSD)", + "www.gsd.harvard.edu": "Graduate School of Design (GSD)", + "gse.harvard.edu": "Graduate School of Education (HGSE)", + "www.gse.harvard.edu": "Graduate School of Education (HGSE)", + "hks.harvard.edu": "Harvard Kennedy School (HKS)", + "www.hks.harvard.edu": "Harvard Kennedy School (HKS)", + "hls.harvard.edu": "Harvard Law School (HLS)", + "hms.harvard.edu": "Harvard Medical School (HMS)", + "hsph.harvard.edu": "T.H. 
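To tie the config dataclasses and loader above back to the YAML layout documented at the top of config.py, the same structure can also be built directly as a dict. A minimal sketch; the university name, URLs and selectors are placeholders:

    from university_scraper.config import ScraperConfig

    minimal_config = {
        "university": {"name": "Example University", "url": "https://www.example.edu/", "country": "USA"},
        "schools": {
            "discovery_method": "static_list",
            "static_list": [{"name": "School of Engineering", "url": "https://engineering.example.edu/"}],
        },
        "programs": {
            "paths_to_try": ["/programs"],
            "selectors": {"program_item": "a[href*='/program']"},
        },
        "faculty": {
            "discovery_strategies": [{"type": "url_pattern", "patterns": ["{school_url}/people"]}],
        },
        "filters": {"program_degree_types": {"include": ["Master"], "exclude": ["Ph.D."]}},
    }

    config = ScraperConfig.from_dict(minimal_config)   # same sections as from_yaml()
    print(config.university.name, config.schools.static_list[0]["url"])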
Chan School of Public Health (HSPH)", + "www.hsph.harvard.edu": "T.H. Chan School of Public Health (HSPH)", + "hds.harvard.edu": "Harvard Divinity School (HDS)", + "hsdm.harvard.edu": "Harvard School of Dental Medicine (HSDM)", + "fas.harvard.edu": "Faculty of Arts and Sciences (FAS)", + "dce.harvard.edu": "Division of Continuing Education (DCE)", + "extension.harvard.edu": "Harvard Extension School", +} + +# 学院URL映射 +SCHOOL_URLS = { + "Graduate School of Arts and Sciences (GSAS)": "https://gsas.harvard.edu/", + "John A. Paulson School of Engineering and Applied Sciences (SEAS)": "https://seas.harvard.edu/", + "Harvard Business School (HBS)": "https://www.hbs.edu/", + "Graduate School of Design (GSD)": "https://www.gsd.harvard.edu/", + "Graduate School of Education (HGSE)": "https://www.gse.harvard.edu/", + "Harvard Kennedy School (HKS)": "https://www.hks.harvard.edu/", + "Harvard Law School (HLS)": "https://hls.harvard.edu/", + "Harvard Medical School (HMS)": "https://hms.harvard.edu/", + "T.H. Chan School of Public Health (HSPH)": "https://www.hsph.harvard.edu/", + "Harvard Divinity School (HDS)": "https://hds.harvard.edu/", + "Harvard School of Dental Medicine (HSDM)": "https://hsdm.harvard.edu/", + "Faculty of Arts and Sciences (FAS)": "https://fas.harvard.edu/", + "Other": "https://www.harvard.edu/", +} + + +def name_to_slug(name: str) -> str: + """将项目名称转换为URL slug""" + slug = name.lower() + slug = re.sub(r'[^\w\s-]', '', slug) + slug = re.sub(r'[\s_]+', '-', slug) + slug = re.sub(r'-+', '-', slug) + slug = slug.strip('-') + return slug + + +def determine_school_from_url(url: str) -> str: + """根据URL判断所属学院""" + if not url: + return "Other" + + from urllib.parse import urlparse + parsed = urlparse(url) + domain = parsed.netloc.lower() + + for pattern, school_name in SCHOOL_MAPPING.items(): + if pattern in domain: + return school_name + + return "Other" + + +class HarvardScraper: + """Harvard专用爬虫""" + + def __init__(self, headless: bool = True): + self.headless = headless + self.browser: Optional[Browser] = None + self.page: Optional[Page] = None + self._playwright = None + + async def __aenter__(self): + self._playwright = await async_playwright().start() + self.browser = await self._playwright.chromium.launch(headless=self.headless) + context = await self.browser.new_context( + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + viewport={'width': 1920, 'height': 1080}, + java_script_enabled=True, + ) + self.page = await context.new_page() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + if self.browser: + await self.browser.close() + if self._playwright: + await self._playwright.stop() + + async def _safe_goto(self, url: str, timeout: int = 30000, retries: int = 3) -> bool: + """安全的页面导航,带重试机制""" + for attempt in range(retries): + try: + await self.page.goto(url, wait_until="domcontentloaded", timeout=timeout) + await self.page.wait_for_timeout(2000) + return True + except Exception as e: + print(f" 导航失败 (尝试 {attempt + 1}/{retries}): {str(e)[:50]}") + if attempt < retries - 1: + await self.page.wait_for_timeout(3000) + return False + + async def scrape(self) -> University: + """执行完整的爬取流程""" + print(f"\n{'='*60}") + print("Harvard University 专用爬虫") + print(f"{'='*60}") + + # 创建大学对象 + university = University( + name="Harvard University", + url="https://www.harvard.edu/", + country="USA" + ) + + # 第一阶段:从集中页面获取所有硕士项目 + print("\n[阶段1] 从集中页面获取项目列表...") + raw_programs = await 
self._scrape_programs_list() + print(f" 找到 {len(raw_programs)} 个项目") + + # 第二阶段:获取每个项目的详情和导师信息 + print("\n[阶段2] 获取项目详情和导师信息...") + + # 按学院组织的项目 + schools_dict: Dict[str, School] = {} + + for i, prog_data in enumerate(raw_programs, 1): + print(f"\n [{i}/{len(raw_programs)}] {prog_data['name']}") + + # 获取项目详情和导师 + program, school_name = await self._get_program_details(prog_data) + + if program: + # 添加到对应学院 + if school_name not in schools_dict: + schools_dict[school_name] = School( + name=school_name, + url=SCHOOL_URLS.get(school_name, "") + ) + schools_dict[school_name].programs.append(program) + + print(f" 学院: {school_name}") + print(f" 导师: {len(program.faculty)}位") + + # 避免请求过快 + await self.page.wait_for_timeout(1000) + + # 转换为列表并排序 + university.schools = sorted(schools_dict.values(), key=lambda s: s.name) + university.scraped_at = datetime.now(timezone.utc).isoformat() + + # 打印统计 + self._print_summary(university) + + return university + + async def _scrape_programs_list(self) -> List[Dict]: + """从Harvard集中页面获取所有硕士项目""" + all_programs = [] + base_url = "https://www.harvard.edu/programs/?degree_levels=graduate" + + print(f" 访问: {base_url}") + if not await self._safe_goto(base_url, timeout=60000): + print(" 无法访问项目页面!") + return [] + await self.page.wait_for_timeout(3000) + + # 滚动到页面底部 + await self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await self.page.wait_for_timeout(2000) + + current_page = 1 + max_pages = 15 + + while current_page <= max_pages: + print(f" 第 {current_page} 页...") + await self.page.wait_for_timeout(2000) + + # 提取当前页面的项目 + page_data = await self.page.evaluate('''() => { + const programs = []; + const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]'); + + programItems.forEach((item) => { + const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]'); + if (!nameBtn) return; + + const name = nameBtn.innerText.trim(); + if (!name || name.length < 3) return; + + let degrees = ''; + const allText = item.innerText; + const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g); + if (degreeMatch) { + degrees = degreeMatch.join(', '); + } + + programs.push({ name, degrees }); + }); + + return programs; + }''') + + for prog in page_data: + name = prog['name'].strip() + if name and not any(p['name'] == name for p in all_programs): + all_programs.append(prog) + + # 尝试点击下一页 + try: + next_btn = self.page.locator('button.c-pagination__link--next') + if await next_btn.count() > 0: + await next_btn.first.scroll_into_view_if_needed() + await next_btn.first.click() + await self.page.wait_for_timeout(3000) + current_page += 1 + else: + break + except: + break + + # 过滤:只保留硕士项目 + master_keywords = ['M.A.', 'M.S.', 'S.M.', 'A.M.', 'MBA', 'M.Arch', 'M.L.A.', + 'M.Div', 'M.T.S', 'LL.M', 'M.P.P', 'M.P.A', 'M.Ed', 'Ed.M.', + 'A.L.M.', 'M.P.H.', 'M.M.Sc.', 'Master'] + phd_keywords = ['Ph.D.', 'Doctor', 'D.M.D.', 'D.M.Sc.', 'Ed.D.', 'Th.D.', 'J.D.', 'M.D.'] + + filtered = [] + for prog in all_programs: + degrees = prog.get('degrees', '') + name = prog.get('name', '') + + # 检查是否有硕士学位 + has_master = any(kw in degrees or kw in name for kw in master_keywords) + 
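The slug helper defined earlier in harvard_scraper.py is what lets _get_program_details() below guess the program page and GSAS URLs from a program name alone. A small illustration; the program name is arbitrary:

    import re

    def name_to_slug(name: str) -> str:        # same steps as name_to_slug() above
        slug = name.lower()
        slug = re.sub(r'[^\w\s-]', '', slug)   # drop punctuation
        slug = re.sub(r'[\s_]+', '-', slug)    # spaces/underscores -> hyphen
        slug = re.sub(r'-+', '-', slug)        # collapse repeats
        return slug.strip('-')

    slug = name_to_slug("Data Science")        # -> "data-science"
    program_url = f"https://www.harvard.edu/programs/{slug}/"
    gsas_url = f"https://gsas.harvard.edu/program/{slug}"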
+ # 排除纯博士项目 + is_phd_only = all(kw in degrees for kw in phd_keywords if kw in degrees) and not has_master + + if has_master or (not is_phd_only and not degrees): + filtered.append(prog) + + return filtered + + async def _get_program_details(self, prog_data: Dict) -> Tuple[Optional[Program], str]: + """获取项目详情和导师信息""" + name = prog_data['name'] + degrees = prog_data.get('degrees', '') + + # 生成URL + slug = name_to_slug(name) + program_url = f"https://www.harvard.edu/programs/{slug}/" + gsas_url = f"https://gsas.harvard.edu/program/{slug}" + + # 访问GSAS页面获取详情 + school_name = "Other" + faculty_list = [] + faculty_page_url = None + + try: + if await self._safe_goto(gsas_url, timeout=20000, retries=2): + # 检查页面是否有效 + title = await self.page.title() + if '404' not in title and 'not found' not in title.lower(): + school_name = "Graduate School of Arts and Sciences (GSAS)" + + # 查找Faculty链接 + faculty_link = await self.page.evaluate('''() => { + const links = document.querySelectorAll('a[href]'); + for (const link of links) { + const text = link.innerText.toLowerCase(); + const href = link.href; + if (text.includes('faculty') && text.includes('see list')) { + return href; + } + if ((text.includes('faculty') || text.includes('people')) && + (href.includes('/people') || href.includes('/faculty'))) { + return href; + } + } + return null; + }''') + + if faculty_link: + faculty_page_url = faculty_link + school_name = determine_school_from_url(faculty_link) + + # 访问导师页面 + if await self._safe_goto(faculty_link, timeout=20000, retries=2): + # 提取导师信息 + faculty_list = await self._extract_faculty() + + except Exception as e: + print(f" 获取详情失败: {str(e)[:50]}") + + # 创建项目对象 + program = Program( + name=name, + url=program_url, + degree_type=degrees, + faculty_page_url=faculty_page_url, + faculty=[Faculty(name=f['name'], url=f['url']) for f in faculty_list] + ) + + return program, school_name + + async def _extract_faculty(self) -> List[Dict]: + """从当前页面提取导师信息""" + return await self.page.evaluate('''() => { + const faculty = []; + const seen = new Set(); + const patterns = ['/people/', '/faculty/', '/profile/', '/person/']; + + document.querySelectorAll('a[href]').forEach(a => { + const href = a.href || ''; + const text = a.innerText.trim(); + const lowerHref = href.toLowerCase(); + const lowerText = text.toLowerCase(); + + const isPersonLink = patterns.some(p => lowerHref.includes(p)); + const isNavLink = ['people', 'faculty', 'directory', 'staff', 'all'].includes(lowerText); + + if (isPersonLink && !isNavLink && + text.length > 3 && text.length < 100 && + !seen.has(href)) { + seen.add(href); + faculty.push({ name: text, url: href }); + } + }); + + return faculty; + }''') + + def _print_summary(self, university: University): + """打印统计摘要""" + total_programs = sum(len(s.programs) for s in university.schools) + total_faculty = sum(len(p.faculty) for s in university.schools for p in s.programs) + + print(f"\n{'='*60}") + print("爬取完成!") + print(f"{'='*60}") + print(f"大学: {university.name}") + print(f"学院数: {len(university.schools)}") + print(f"项目数: {total_programs}") + print(f"导师数: {total_faculty}") + + print("\n各学院统计:") + for school in university.schools: + prog_count = len(school.programs) + fac_count = sum(len(p.faculty) for p in school.programs) + print(f" {school.name}: {prog_count}个项目, {fac_count}位导师") + + def save_results(self, university: University, output_path: str): + """保存结果""" + output = Path(output_path) + output.parent.mkdir(parents=True, exist_ok=True) + + with open(output, 'w', encoding='utf-8') as f: + 
            json.dump(university.to_dict(), f, ensure_ascii=False, indent=2)
+
+        print(f"\n结果已保存到: {output_path}")
+
+
+async def scrape_harvard(output_path: str = "output/harvard_full_result.json", headless: bool = True):
+    """爬取Harvard的便捷函数"""
+    async with HarvardScraper(headless=headless) as scraper:
+        university = await scraper.scrape()
+        scraper.save_results(university, output_path)
+        return university
+
+
+if __name__ == "__main__":
+    asyncio.run(scrape_harvard(headless=False))
diff --git a/src/university_scraper/models.py b/src/university_scraper/models.py
new file mode 100644
index 0000000..f6aeee1
--- /dev/null
+++ b/src/university_scraper/models.py
@@ -0,0 +1,105 @@
+"""
+数据模型定义 - 学院 → 项目 → 导师 层级结构
+"""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+
+@dataclass
+class Faculty:
+    """导师信息"""
+    name: str
+    url: str
+    title: Optional[str] = None
+    email: Optional[str] = None
+    department: Optional[str] = None
+
+    def to_dict(self) -> dict:
+        return {
+            "name": self.name,
+            "url": self.url,
+            "title": self.title,
+            "email": self.email,
+            "department": self.department
+        }
+
+
+@dataclass
+class Program:
+    """硕士项目信息"""
+    name: str
+    url: str
+    degree_type: Optional[str] = None  # M.S., M.A., MBA, etc.
+    description: Optional[str] = None
+    faculty_page_url: Optional[str] = None
+    faculty: List[Faculty] = field(default_factory=list)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> dict:
+        return {
+            "name": self.name,
+            "url": self.url,
+            "degree_type": self.degree_type,
+            "description": self.description,
+            "faculty_page_url": self.faculty_page_url,
+            "faculty_count": len(self.faculty),
+            "faculty": [f.to_dict() for f in self.faculty],
+            "metadata": self.metadata
+        }
+
+
+@dataclass
+class School:
+    """学院信息"""
+    name: str
+    url: str
+    description: Optional[str] = None
+    programs: List[Program] = field(default_factory=list)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    faculty_directory: List[Faculty] = field(default_factory=list)
+    faculty_directory_loaded: bool = False
+
+    def to_dict(self) -> dict:
+        return {
+            "name": self.name,
+            "url": self.url,
+            "description": self.description,
+            "program_count": len(self.programs),
+            "programs": [p.to_dict() for p in self.programs],
+            "faculty_directory_count": len(self.faculty_directory),
+            "faculty_directory": [f.to_dict() for f in self.faculty_directory]
+        }
+
+
+@dataclass
+class University:
+    """大学信息 - 顶层数据结构"""
+    name: str
+    url: str
+    country: Optional[str] = None
+    schools: List[School] = field(default_factory=list)
+    scraped_at: Optional[str] = None
+
+    def to_dict(self) -> dict:
+        # 统计
+        total_programs = sum(len(s.programs) for s in self.schools)
+        total_faculty = sum(
+            len(p.faculty)
+            for s in self.schools
+            for p in s.programs
+        )
+
+        return {
+            "university": self.name,
+            "url": self.url,
+            "country": self.country,
+            "scraped_at": self.scraped_at or datetime.utcnow().isoformat(),
+            "statistics": {
+                "total_schools": len(self.schools),
+                "total_programs": total_programs,
+                "total_faculty": total_faculty
+            },
+            "schools": [s.to_dict() for s in self.schools]
+        }
diff --git a/src/university_scraper/scraper.py b/src/university_scraper/scraper.py
new file mode 100644
index 0000000..23ba479
--- /dev/null
+++ b/src/university_scraper/scraper.py
@@ -0,0 +1,1360 @@
+"""
+通用大学爬虫核心实现
+
+支持按照 学院 → 项目 → 导师 的层级结构爬取
+"""
+
+import asyncio
+import json
+import re
+from datetime import datetime, timezone
+from pathlib
import Path +from typing import List, Optional, Dict, Any, Tuple +from urllib.parse import urljoin, urlencode, urlparse + +from playwright.async_api import async_playwright, Page, Browser, BrowserContext +import xml.etree.ElementTree as ET + +from .models import University, School, Program, Faculty +from .config import ScraperConfig + +DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36" +STEALTH_INIT_SCRIPT = """ +(() => { + Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); + window.chrome = window.chrome || { runtime: {} }; + const originalQuery = window.navigator.permissions?.query; + if (originalQuery) { + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' + ? Promise.resolve({ state: Notification.permission }) + : originalQuery(parameters) + ); + } + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3], + }); + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en'], + }); +})(); +""" + +JS_EXTRACT_TABLE_STAFF = '''() => { + const staff = []; + const seen = new Set(); + + document.querySelectorAll('table tr').forEach(row => { + const cells = row.querySelectorAll('td'); + if (cells.length >= 2) { + const link = cells[1]?.querySelector('a[href]') || cells[0]?.querySelector('a[href]'); + const titleCell = cells[2] || cells[1]; + + if (link) { + const name = link.innerText.trim(); + const url = link.href; + const title = titleCell ? titleCell.innerText.trim() : ''; + + if (name.length > 2 && !name.toLowerCase().includes('skip') && !seen.has(url)) { + seen.add(url); + staff.push({ + name: name, + url: url, + title: title + }); + } + } + } + }); + + return staff; +}''' + +JS_EXTRACT_LINK_STAFF = '''() => { + const staff = []; + const seen = new Set(); + + document.querySelectorAll('a[href]').forEach(a => { + const href = a.href; + const text = a.innerText.trim(); + + if (seen.has(href)) return; + if (text.length < 5 || text.length > 80) return; + if (text.toLowerCase().includes('skip') || + text.toLowerCase().includes('staff') || + text.toLowerCase().includes('people') || + text.toLowerCase().includes('academic') || + text.toLowerCase().includes('research profiles')) return; + + if (href.includes('research.') || + href.includes('/portal/en/researchers/') || + href.includes('/profile/') || + href.includes('/people/')) { + seen.add(href); + staff.push({ + name: text, + url: href, + title: '' + }); + } + }); + + return staff; +}''' + +JS_EXTRACT_RESEARCH_EXPLORER = '''() => { + const staff = []; + const seen = new Set(); + + document.querySelectorAll('a.link.person').forEach(a => { + const href = a.href; + const text = a.innerText.trim(); + + if (!seen.has(href) && text.length > 3 && text.length < 80) { + seen.add(href); + staff.push({ + name: text, + url: href, + title: '' + }); + } + }); + + if (staff.length === 0) { + document.querySelectorAll('a[href*="/persons/"]').forEach(a => { + const href = a.href; + const text = a.innerText.trim(); + + if (seen.has(href)) return; + if (text.length < 3 || text.length > 80) return; + if (text.toLowerCase().includes('person') || + text.toLowerCase().includes('next') || + text.toLowerCase().includes('previous')) return; + + seen.add(href); + staff.push({ + name: text, + url: href, + title: '' + }); + }); + } + + return staff; +}''' + + +class UniversityScraper: + """通用大学爬虫""" + + def __init__(self, config: ScraperConfig, headless: bool = True): + self.config = config + 
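As a quick check of how the School → Program → Faculty dataclasses from models.py above nest and serialize, a minimal sketch (all names and URLs invented):

    from university_scraper.models import University, School, Program, Faculty

    uni = University(name="Example University", url="https://www.example.edu/", country="USA")
    prog = Program(
        name="Computer Science",
        url="https://www.example.edu/programs/computer-science/",
        degree_type="M.S.",
        faculty=[Faculty(name="Jane Doe", url="https://www.example.edu/people/jane-doe")],
    )
    uni.schools.append(School(name="School of Engineering",
                              url="https://engineering.example.edu/",
                              programs=[prog]))

    data = uni.to_dict()
    print(data["statistics"])   # {'total_schools': 1, 'total_programs': 1, 'total_faculty': 1}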
self.headless = headless + self.browser: Optional[Browser] = None + self.context: Optional[BrowserContext] = None + self.page: Optional[Page] = None + self.playwright_config = getattr(self.config, "playwright", None) + + # 结果 + self.university: Optional[University] = None + + # 统计 + self.stats = { + "pages_visited": 0, + "schools_found": 0, + "programs_found": 0, + "faculty_found": 0, + "errors": [] + } + self._global_catalog_cache: Optional[List[Dict[str, Any]]] = None + self._global_catalog_assignments: Dict[str, str] = {} + self.default_request_settings = { + "timeout_ms": 30000, + "max_retries": 2, + "retry_backoff_ms": 2000, + "wait_after_ms": 2000, + "wait_until": "domcontentloaded", + "wait_for_selector": None + } + + async def __aenter__(self): + """异步上下文管理器入口""" + playwright = await async_playwright().start() + self.browser = await playwright.chromium.launch(headless=self.headless) + context_kwargs: Dict[str, Any] = { + "user_agent": DEFAULT_USER_AGENT, + "viewport": {'width': 1920, 'height': 1080} + } + if self.playwright_config: + if self.playwright_config.user_agent: + context_kwargs["user_agent"] = self.playwright_config.user_agent + if self.playwright_config.viewport: + context_kwargs["viewport"] = self.playwright_config.viewport + if self.playwright_config.locale: + context_kwargs["locale"] = self.playwright_config.locale + if self.playwright_config.timezone_id: + context_kwargs["timezone_id"] = self.playwright_config.timezone_id + if self.playwright_config.ignore_https_errors: + context_kwargs["ignore_https_errors"] = True + + self.context = await self.browser.new_context(**context_kwargs) + await self._configure_playwright_context() + self.page = await self.context.new_page() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """异步上下文管理器退出""" + if self.browser: + await self.browser.close() + self.context = None + + async def scrape(self) -> University: + """执行完整的爬取流程""" + print(f"\n{'='*60}") + print(f"开始爬取: {self.config.university.name}") + print(f"官网: {self.config.university.url}") + print(f"{'='*60}") + + # 创建大学对象 + self.university = University( + name=self.config.university.name, + url=self.config.university.url, + country=self.config.university.country + ) + + # 第一阶段:获取学院列表 + print("\n[阶段1] 获取学院列表...") + schools = await self._discover_schools() + print(f" 找到 {len(schools)} 个学院") + + # 第二阶段:获取每个学院的项目 + print("\n[阶段2] 获取各学院的硕士项目...") + for i, school in enumerate(schools, 1): + print(f"\n [{i}/{len(schools)}] {school.name}") + await self._discover_programs(school) + print(f" 找到 {len(school.programs)} 个项目") + self.university.schools.append(school) + + # 第三阶段:获取每个项目的导师 + print("\n[阶段3] 获取各项目的导师信息...") + total_programs = sum(len(s.programs) for s in self.university.schools) + program_idx = 0 + + for school in self.university.schools: + for program in school.programs: + program_idx += 1 + print(f"\n [{program_idx}/{total_programs}] {program.name}") + await self._discover_faculty(program, school) + print(f" 找到 {len(program.faculty)} 位导师") + + # 设置爬取时间 + self.university.scraped_at = datetime.now(timezone.utc).isoformat() + + # 打印统计 + self._print_summary() + + return self.university + + async def _discover_schools(self) -> List[School]: + """发现学院列表""" + schools = [] + method = self.config.schools.discovery_method + + if method == "static_list": + # 使用静态配置的学院列表 + for item in self.config.schools.static_list: + # 过滤排除的学院 + if item['name'] in self.config.filters.exclude_schools: + continue + metadata = { + k: v for k, v in item.items() + if k not in {'name', 
'url', 'description'}
+                }
+                schools.append(School(
+                    name=item['name'],
+                    url=item['url'],
+                    description=item.get('description'),
+                    metadata=metadata
+                ))
+
+        elif method == "scrape_page":
+            # 从页面爬取学院列表
+            cfg = self.config.schools.scrape_config
+            if cfg:
+                settings = self._get_request_settings("schools", cfg.get('request'))
+                if not await self._goto_with_retry(cfg['url'], settings=settings):
+                    return []
+
+                elements = await self.page.query_selector_all(cfg['selector'])
+                for elem in elements:
+                    name = await self._get_element_attribute(elem, cfg.get('name_attribute', 'text'))
+                    url = await self._get_element_attribute(elem, cfg.get('url_attribute', 'href'))
+
+                    if name and url and name not in self.config.filters.exclude_schools:
+                        schools.append(School(
+                            name=name.strip(),
+                            url=urljoin(self.config.university.url, url)
+                        ))
+
+        self.stats["schools_found"] = len(schools)
+        return schools
+
+    async def _discover_programs(self, school: School):
+        """发现学院下的硕士项目"""
+        programs_found = []
+        request_settings = self._get_request_settings("programs")
+
+        # 策略1: 尝试预定义的路径
+        for path in self.config.programs.paths_to_try:
+            programs_url = urljoin(school.url, path)
+            success = await self._goto_with_retry(programs_url, settings=request_settings)
+            if success:
+                self.stats["pages_visited"] += 1
+
+                if await self._is_valid_page():
+                    programs = await self._extract_programs_from_page(school.url)
+                    if programs:
+                        programs_found.extend(programs)
+                        break
+
+        # 策略2: 从学院首页查找项目链接
+        if not programs_found:
+            try:
+                success = await self._goto_with_retry(school.url, settings=request_settings)
+                if not success:
+                    raise RuntimeError("failed to open school page")
+
+                # 查找包含特定关键词的链接
+                programs_page_url = await self._find_link_by_patterns(
+                    self.config.programs.link_patterns
+                )
+
+                if programs_page_url:
+                    success = await self._goto_with_retry(programs_page_url, settings=request_settings)
+                    if not success:
+                        raise RuntimeError("failed to open program list")
+                    programs_found = await self._extract_programs_from_page(school.url)
+
+            except Exception as e:
+                self.stats["errors"].append(f"获取{school.name}项目失败: {str(e)}")
+
+        # 过滤:只保留硕士项目
+        filtered_programs = self._filter_master_programs(programs_found)
+        if not filtered_programs:
+            global_programs = await self._get_programs_from_global_catalog(school)
+            if global_programs:
+                filtered_programs = global_programs
+
+        school.programs = filtered_programs
+        self.stats["programs_found"] += len(filtered_programs)
+
+    async def _get_programs_from_global_catalog(self, school: School) -> List[Program]:
+        """当学院页面未发现硕士项目时,从全局项目目录为该学院分配匹配的项目"""
+        cfg = self.config.programs.global_catalog
+        if not cfg:
+            return []
+
+        catalog = await self._load_global_program_catalog()
+        if not catalog:
+            return []
+
+        per_school_limit = cfg.get('per_school_limit')
+        allow_multiple = cfg.get('allow_multiple_assignments', False)
+        matches: List[Program] = []
+
+        for record in catalog:
+            program_id = record.get('uid')
+            if program_id and (not allow_multiple) and program_id in self._global_catalog_assignments:
+                continue
+            if not self._global_catalog_matches_school(record, school, cfg):
+                continue
+
+            matches.append(self._clone_program_from_record(record))
+
+            if program_id and not allow_multiple:
+                self._global_catalog_assignments[program_id] = school.name
+
+            if per_school_limit and len(matches) >= per_school_limit:
+                break
+
+        return matches
+
+    async def _load_global_program_catalog(self) -> List[Dict[str, Any]]:
+        """加载并缓存全局项目目录(整所大学共用的项目列表页)"""
+        if self._global_catalog_cache is not None:
+            return self._global_catalog_cache
+
+        cfg = self.config.programs.global_catalog or {}
+        if not cfg:
+            self._global_catalog_cache = []
+            return self._global_catalog_cache
+
+        urls_to_try: List[str] = []
+        if cfg.get('url'):
+            urls_to_try.append(cfg['url'])
+        if cfg.get('path'):
+            urls_to_try.append(urljoin(self.config.university.url, cfg['path']))
+        for path_value in cfg.get('paths', []):
+            if path_value.startswith('http'):
+                urls_to_try.append(path_value)
+            else:
+                urls_to_try.append(urljoin(self.config.university.url, path_value))
+
+        settings = self._get_request_settings("programs", cfg.get('request'))
+
+        for target_url in urls_to_try:
+            if not target_url:
+                continue
+
+            success = await self._goto_with_retry(target_url, settings=settings, note="global program catalog")
+            if not success:
+                continue
+
+            self.stats["pages_visited"] += 1
+            programs = await self._extract_programs_from_page(self.config.university.url)
+            programs = self._filter_master_programs(programs)
+
+            if programs:
+                self._global_catalog_cache = [
+                    self._build_global_program_record(p) for p in programs
+                ]
+                return self._global_catalog_cache
+
+        self._global_catalog_cache = []
+        return self._global_catalog_cache
+
+    def _build_global_program_record(self, program: Program) -> Dict[str, Any]:
+        """将Program转换为全局目录记录,并生成用于学院匹配的检索文本"""
+        cfg = self.config.programs.global_catalog or {}
+        metadata = {"source": "global_catalog"}
+        if cfg.get('skip_program_faculty_lookup', True):
+            metadata['skip_program_faculty'] = True
+        search_parts = [program.name or ""]
+        if program.degree_type:
+            search_parts.append(program.degree_type)
+        if program.description:
+            search_parts.append(program.description)
+
+        search_text = " ".join(search_parts).lower()
+        return {
+            "uid": program.url or program.name.lower(),
+            "name": program.name,
+            "url": program.url,
+            "degree_type": program.degree_type,
+            "description": program.description,
+            "search_text": search_text,
+            "metadata": metadata
+        }
+
+    def _clone_program_from_record(self, record: Dict[str, Any]) -> Program:
+        """从全局目录记录克隆出新的Program对象"""
+        metadata = dict(record.get('metadata') or {})
+        return Program(
+            name=record.get('name', ''),
+            url=record.get('url', ''),
+            degree_type=record.get('degree_type'),
+            description=record.get('description'),
+            metadata=metadata
+        )
+
+    def _global_catalog_matches_school(self, record: Dict[str, Any], school: School, cfg: Dict[str, Any]) -> bool:
+        """根据学院关键词判断全局目录中的项目是否应归入该学院"""
+        if not cfg.get('assign_by_school_keywords', True):
+            return True
+
+        keywords = self._collect_school_keywords(school, cfg)
+        if not keywords:
+            return cfg.get('assign_if_no_keywords', False)
+
+        haystack = (record.get('search_text') or record.get('name', '')).lower()
+        return any(kw in haystack for kw in keywords)
+
+    def _collect_school_keywords(self, school: School, cfg: Dict[str, Any]) -> List[str]:
+        """从学院metadata与配置覆盖项中收集用于匹配的关键词"""
+        field = cfg.get('metadata_keyword_field', 'keywords')
+        keywords: List[str] = []
+
+        for kw in school.metadata.get(field, []):
+            if isinstance(kw, str):
+                cleaned = kw.strip().lower()
+                if cleaned:
+                    keywords.append(cleaned)
+
+        overrides = cfg.get('school_keyword_overrides', {})
+        if isinstance(overrides, dict):
+            extra = overrides.get(school.name)
+            if isinstance(extra, list):
+                for kw in extra:
+                    if isinstance(kw, str):
+                        cleaned = kw.strip().lower()
+                        if cleaned:
+                            keywords.append(cleaned)
+
+        return
keywords + + async def _extract_programs_from_page(self, school_url: str) -> List[Program]: + """从当前页面提取项目列表""" + programs = [] + selectors = self.config.programs.selectors + current_url = self.page.url if self.page else "" + if current_url and "manchester.ac.uk/study/masters/courses/list" in current_url: + specialized = await self._extract_manchester_course_list() + if specialized: + return specialized + + # 获取项目容器 + item_selector = selectors.get('program_item', 'div.program, li.program, a.program-link') + + items = await self.page.query_selector_all(item_selector) + seen_urls = set() + + for item in items: + try: + # 获取项目名称 + name_elem = await item.query_selector(selectors.get('program_name', 'h3, .title')) + name = await name_elem.inner_text() if name_elem else await item.inner_text() + name = name.strip() if name else "" + + # 获取项目URL + url_elem = await item.query_selector(selectors.get('program_url', 'a[href]')) + url = await url_elem.get_attribute('href') if url_elem else "" + url = urljoin(school_url, url) if url else "" + + # 获取学位类型 + degree_elem = await item.query_selector(selectors.get('degree_type', '.degree')) + degree = await degree_elem.inner_text() if degree_elem else "" + degree = degree.strip() if degree else self._extract_degree_from_name(name) + + if not name or len(name) <= 3: + continue + if self._is_navigation_program(name, url): + continue + key = url or name.lower() + if key in seen_urls: + continue + seen_urls.add(key) + + programs.append(Program( + name=name, + url=url, + degree_type=degree + )) + except Exception: + continue + + # 如果选择器没找到,尝试通用方法 + if not programs: + programs = await self._extract_programs_generic() + + return programs + + async def _extract_programs_generic(self) -> List[Program]: + """通用的项目提取方法""" + programs = [] + + # 使用JavaScript提取所有可能的项目链接 + data = await self.page.evaluate('''() => { + const programs = []; + const seen = new Set(); + + // 查找所有链接 + document.querySelectorAll('a[href]').forEach(a => { + const href = a.href || ''; + const text = a.innerText.trim(); + + // 检查是否像是项目链接 + const lowerHref = href.toLowerCase(); + const lowerText = text.toLowerCase(); + + if ((lowerHref.includes('/program') || + lowerHref.includes('/degree') || + lowerHref.includes('/master') || + lowerHref.includes('/graduate')) && + text.length > 5 && + text.length < 150 && + !seen.has(href)) { + + seen.add(href); + programs.push({ + name: text, + url: href + }); + } + }); + + return programs; + }''') + + seen = set() + for item in data: + name = item.get('name', '').strip() + url = item.get('url', '').strip() + if not name: + continue + if self._is_navigation_program(name, url): + continue + key = url or name.lower() + if key in seen: + continue + seen.add(key) + programs.append(Program( + name=name, + url=url, + degree_type=self._extract_degree_from_name(name) + )) + + return programs + + async def _extract_manchester_course_list(self) -> List[Program]: + """专门解析曼彻斯特课程列表页(包含5位课程ID)""" + data = await self.page.evaluate('''() => { + const results = []; + const seen = new Set(); + document.querySelectorAll('a[href]').forEach(a => { + const href = a.href || ''; + const text = a.innerText ? 
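The global-catalog fallback above is configuration-driven. Collected from the cfg.get(...) calls in those methods, a programs.global_catalog section could look like the following, expressed here as the Python dict it deserializes to; all values are illustrative:

    global_catalog = {
        "url": "https://www.example.edu/programs/",      # or use "path"/"paths" relative to the university URL
        "request": {"timeout_ms": 60000},                # optional per-request overrides
        "per_school_limit": 50,                          # stop after this many matches per school
        "allow_multiple_assignments": False,             # a program is assigned to at most one school
        "skip_program_faculty_lookup": True,             # cloned programs skip per-program faculty pages
        "assign_by_school_keywords": True,               # match by keywords instead of assigning blindly
        "assign_if_no_keywords": False,                  # schools without keywords receive nothing
        "metadata_keyword_field": "keywords",            # field read from each school's static_list metadata
        "school_keyword_overrides": {
            "School of Engineering": ["engineering", "computer"],
        },
    }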
a.innerText.trim().replace(/\\s+/g, ' ') : ''; + if (!href || !text) return; + if (!/\\/courses\\/list\\/\\d{5}\\//.test(href)) return; + if (text.length < 5 || text.length > 200) return; + const lower = text.toLowerCase(); + if (lower.includes('skip to') || + lower.includes('main content') || + lower.includes('masters degrees') || + lower.includes('student life') || + lower.includes('contact') || + lower.includes('why study')) { + return; + } + if (seen.has(href)) return; + seen.add(href); + results.push({name: text, url: href}); + }); + return results; + }''') + + programs = [] + seen = set() + for item in data: + name = item.get('name', '').strip() + url = item.get('url', '').strip() + if not name or not url: + continue + if self._is_navigation_program(name, url): + continue + if url in seen: + continue + seen.add(url) + programs.append(Program( + name=name, + url=url, + degree_type=self._extract_degree_from_name(name) + )) + + return programs + + def _is_navigation_program(self, name: str, url: str) -> bool: + """过滤掉导航/占位链接""" + if not name: + return True + + lower = name.lower() + nav_keywords = [ + "skip to", + "main content", + "masters degrees", + "contact us", + "student life", + "why study", + "how to apply", + "fees and funding", + "order a prospectus", + "visit us", + "share", + "read more", + "see more", + "find out more" + ] + if any(k in lower for k in nav_keywords): + return True + + if url: + lower_url = url.lower() + if lower_url.endswith("#") or "#main" in lower_url or "contact" in lower_url: + return True + + return False + + async def _discover_faculty(self, program: Program, school: School): + """发现项目的导师列表""" + faculty_found = [] + skip_program_faculty = bool(program.metadata.get('skip_program_faculty')) + + for strategy in self.config.faculty.discovery_strategies: + if faculty_found: + break + + strategy_type = strategy.get('type') + if skip_program_faculty and strategy_type in {"link_in_page", "url_pattern"}: + continue + + if strategy_type == "link_in_page": + # 策略1: 从项目页面查找导师链接 + request_settings = self._get_request_settings("faculty", strategy.get('request')) + try: + if program.url: + success = await self._goto_with_retry(program.url, settings=request_settings) + if not success: + continue + self.stats["pages_visited"] += 1 + + faculty_page_url = await self._find_link_by_patterns(strategy.get('patterns', [])) + + if faculty_page_url: + program.faculty_page_url = faculty_page_url + success = await self._goto_with_retry(faculty_page_url, settings=request_settings) + if success: + self.stats["pages_visited"] += 1 + faculty_found = await self._extract_faculty_from_page() + except Exception as e: + continue + + elif strategy_type == "url_pattern": + # 策略2: 尝试URL模式 + request_settings = self._get_request_settings("faculty", strategy.get('request')) + for pattern in strategy.get('patterns', []): + try: + faculty_url = pattern.replace('{program_url}', program.url.rstrip('/')) + faculty_url = faculty_url.replace('{school_url}', school.url.rstrip('/')) + + success = await self._goto_with_retry(faculty_url, settings=request_settings) + if not success: + continue + + self.stats["pages_visited"] += 1 + if await self._is_valid_page(): + extracted = await self._extract_faculty_from_page() + if extracted: + faculty_found = extracted + program.faculty_page_url = faculty_url + break + except Exception: + continue + + elif strategy_type == "school_directory": + if not school.faculty_directory_loaded: + school.faculty_directory = await self._scrape_school_faculty_directory(school, 
strategy)
+                    school.faculty_directory_loaded = True
+
+                if school.faculty_directory:
+                    if self._should_attach_school_faculty(program, school, strategy):
+                        faculty_found = self._clone_faculty_records(
+                            school.faculty_directory,
+                            strategy.get('limit_per_program')
+                        )
+
+        program.faculty = faculty_found
+        self.stats["faculty_found"] += len(faculty_found)
+
+    async def _extract_faculty_from_page(self) -> List[Faculty]:
+        """从当前页面提取导师列表"""
+        faculty = []
+
+        # 使用JavaScript提取
+        data = await self.page.evaluate('''() => {
+            const faculty = [];
+            const seen = new Set();
+
+            // 通用的个人页面链接模式
+            const patterns = ['/people/', '/faculty/', '/profile/', '/person/', '/directory/'];
+
+            document.querySelectorAll('a[href]').forEach(a => {
+                const href = a.href || '';
+                const text = a.innerText.trim();
+                const lowerHref = href.toLowerCase();
+                const lowerText = text.toLowerCase();
+
+                // 检查是否是个人页面链接
+                const isPersonLink = patterns.some(p => lowerHref.includes(p));
+
+                // 过滤掉导航链接
+                const isNavLink = ['people', 'faculty', 'directory', 'staff', 'all'].includes(lowerText);
+
+                if (isPersonLink &&
+                    !isNavLink &&
+                    text.length > 3 &&
+                    text.length < 100 &&
+                    !seen.has(href)) {
+
+                    seen.add(href);
+
+                    // 尝试获取职位信息
+                    let title = '';
+                    const parent = a.closest('div, li, article');
+                    if (parent) {
+                        const titleEl = parent.querySelector('.title, .position, .role, .job-title');
+                        if (titleEl) title = titleEl.innerText.trim();
+                    }
+
+                    faculty.push({
+                        name: text,
+                        url: href,
+                        title: title
+                    });
+                }
+            });
+
+            return faculty;
+        }''')
+
+        for item in data:
+            faculty.append(Faculty(
+                name=item['name'],
+                url=item['url'],
+                title=item.get('title') or None
+            ))
+
+        return faculty
+
+    def _get_request_settings(self, level: str, override: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        """合并请求参数"""
+        settings = dict(self.default_request_settings)
+
+        level_config: Dict[str, Any] = {}
+        if level == "schools":
+            level_config = getattr(self.config.schools, "request", {}) or {}
+        elif level == "programs":
+            level_config = getattr(self.config.programs, "request", {}) or {}
+        elif level == "faculty":
+            level_config = getattr(self.config.faculty, "request", {}) or {}
+
+        settings.update({k: v for k, v in level_config.items() if v is not None})
+        if override:
+            settings.update({k: v for k, v in override.items() if v is not None})
+
+        if 'post_wait_ms' in settings and settings.get('post_wait_ms') is not None:
+            settings['wait_after_ms'] = settings.pop('post_wait_ms')
+
+        return settings
+
+    async def _configure_playwright_context(self):
+        """为Playwright上下文应用额外请求头、初始化脚本和自定义Cookie"""
+        if not self.context or not self.playwright_config:
+            return
+
+        cfg = self.playwright_config
+
+        if cfg.extra_headers:
+            await self.context.set_extra_http_headers(cfg.extra_headers)
+
+        scripts = []
+        if cfg.stealth:
+            scripts.append(STEALTH_INIT_SCRIPT)
+        if cfg.add_init_scripts:
+            scripts.extend(cfg.add_init_scripts)
+
+        for script in scripts:
+            if script:
+                await self.context.add_init_script(script)
+
+        cookies = self._normalize_custom_cookies(cfg.cookies)
+        if cookies:
+            try:
+                await self.context.add_cookies(cookies)
+            except Exception as exc:
+                self.stats["errors"].append(f"添加自定义Cookie失败: {exc}")
+
+    def _normalize_custom_cookies(self, cookies: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """将配置中的自定义Cookie规范化为Playwright可接受的格式"""
+        if not cookies:
+            return []
+
+        normalized: List[Dict[str, Any]] = []
+        default_domain = urlparse(self.config.university.url).netloc
+
+        for raw in
cookies: + name = raw.get("name") + value = raw.get("value") + if not name or value is None: + continue + + cookie = {"name": name, "value": str(value), "path": raw.get("path") or "/"} + + if raw.get("url"): + cookie["url"] = raw["url"] + else: + cookie["domain"] = raw.get("domain") or default_domain + + if "secure" in raw: + cookie["secure"] = bool(raw["secure"]) + if "httpOnly" in raw: + cookie["httpOnly"] = bool(raw["httpOnly"]) + if "sameSite" in raw: + cookie["sameSite"] = raw["sameSite"] + if raw.get("expires"): + cookie["expires"] = int(raw["expires"]) + + normalized.append(cookie) + + return normalized + + + async def _goto_with_retry(self, url: str, *, settings: Optional[Dict[str, Any]] = None, note: str = "") -> bool: + """统一的页面访问+重试逻辑""" + if not self.page: + return False + + cfg = dict(settings or self.default_request_settings) + max_retries = max(1, int(cfg.get('max_retries', 1))) + timeout = cfg.get('timeout_ms', 30000) + wait_after = cfg.get('wait_after_ms', 2000) + wait_until = cfg.get('wait_until', 'domcontentloaded') + wait_for_selector = cfg.get('wait_for_selector') + retry_backoff = cfg.get('retry_backoff_ms', 2000) + + for attempt in range(max_retries): + try: + await self.page.goto(url, wait_until=wait_until, timeout=timeout) + if wait_for_selector: + await self.page.wait_for_selector(wait_for_selector, timeout=timeout) + if wait_after: + await self.page.wait_for_timeout(wait_after) + return True + except Exception as exc: + if attempt >= max_retries - 1: + label = note or url + self.stats["errors"].append(f"{label} 加载失败: {str(exc)}") + return False + backoff = retry_backoff * (attempt + 1) + await self.page.wait_for_timeout(backoff) + + async def _scrape_school_faculty_directory(self, school: School, strategy: Dict[str, Any]) -> List[Faculty]: + """按学院级别抓取导师目录""" + pages = school.metadata.get('faculty_pages') or strategy.get('pages', []) + has_research_config = bool(school.metadata.get('research_explorer')) or any( + page.get('extract_method') == 'research_explorer' for page in pages + ) + + if has_research_config: + api_faculty = await self._fetch_research_explorer_api(school) + if api_faculty: + return api_faculty + + if not pages: + return [] + + collected: List[Faculty] = [] + + for page_cfg in pages: + target_url = page_cfg.get('url') + if not target_url: + rel_path = page_cfg.get('path') + if rel_path: + target_url = urljoin(school.url, rel_path) + + if not target_url: + continue + + override_request = {} + if strategy.get('request'): + override_request.update(strategy['request']) + if page_cfg.get('request'): + override_request.update(page_cfg['request']) + + settings = self._get_request_settings("faculty", override_request) + if page_cfg.get('extract_method') == 'research_explorer' and not settings.get('wait_for_selector'): + settings['wait_for_selector'] = "a.link.person" + + success = await self._goto_with_retry(target_url, settings=settings, note=f"{school.name} staff") + if not success: + continue + + self.stats["pages_visited"] += 1 + method = page_cfg.get('extract_method', 'links') + raw_staff = await self._extract_staff_records(method) + + name_keywords = [kw.lower() for kw in page_cfg.get('name_keywords', [])] + if name_keywords: + filtered_staff = [] + for item in raw_staff: + text = item.get('name', '').lower() + if any(kw in text for kw in name_keywords): + filtered_staff.append(item) + raw_staff = filtered_staff + + for item in raw_staff: + faculty = self._convert_staff_record(item, school.name) + if faculty: + collected.append(faculty) + + return 
self._deduplicate_faculty(collected) + + async def _extract_staff_records(self, method: str) -> List[Dict[str, Any]]: + """根据提取方式获取原始导师记录""" + script_map = { + "table": JS_EXTRACT_TABLE_STAFF, + "links": JS_EXTRACT_LINK_STAFF, + "research_explorer": JS_EXTRACT_RESEARCH_EXPLORER + } + script = script_map.get(method, JS_EXTRACT_LINK_STAFF) + try: + data = await self.page.evaluate(script) + except Exception: + return [] + + if not data: + return [] + return data + + async def _fetch_research_explorer_api(self, school: School) -> List[Faculty]: + """优先通过Research Explorer API获取导师""" + if not self.context: + return [] + + config = school.metadata.get('research_explorer') or {} + if not config: + return [] + + page_size = config.get('page_size', 200) + timeout_ms = config.get('timeout_ms', 90000) + api_base = config.get('api_base', 'https://research.manchester.ac.uk/ws/portalapi.aspx') + slug = config.get('org_slug') or self._guess_research_explorer_slug(school.url) + candidate_urls: List[str] = [] + + if config.get('api_url'): + candidate_urls.append(config['api_url']) + + if slug: + params = { + "action": "search", + "language": "en", + "format": "json", + "site": "default", + "showall": "true", + "pageSize": page_size, + "organisations": slug, + } + candidate_urls.append(f"{api_base}?{urlencode(params)}") + + for page_cfg in school.metadata.get('faculty_pages', []): + if page_cfg.get('extract_method') != 'research_explorer': + continue + page_url = page_cfg.get('url') or school.url + candidate_urls.append(self._append_query(page_url, {"format": "json", "limit": page_size})) + candidate_urls.append(self._append_query(page_url, {"format": "xml", "limit": page_size})) + + for url in candidate_urls: + if not url: + continue + try: + response = await self.context.request.get(url, timeout=timeout_ms) + if response.status != 200: + continue + content_type = response.headers.get("content-type", "").lower() + if "json" in content_type: + payload = await response.json() + staff_raw = self._parse_research_explorer_json(payload, school.url) + else: + text = await response.text() + staff_raw = self._parse_research_explorer_xml(text, school.url) + + staff_raw = self._deduplicate_staff_dicts(staff_raw) + faculty = [ + self._convert_staff_record(item, school.name) + for item in staff_raw + ] + faculty = [f for f in faculty if f] + if faculty: + return faculty + except Exception: + continue + + return [] + + def _guess_research_explorer_slug(self, url: str) -> Optional[str]: + if not url: + return None + path = url.rstrip('/').split('/') + return path[-1] if path else None + + def _parse_research_explorer_json(self, data: Any, base_url: str) -> List[Dict[str, str]]: + items: List[Dict[str, Any]] = [] + if isinstance(data, list): + items = data + elif isinstance(data, dict): + for key in ("results", "items", "persons", "data", "entities", "rows"): + if isinstance(data.get(key), list): + items = data[key] + break + + staff: List[Dict[str, str]] = [] + for item in items: + if not isinstance(item, dict): + continue + name = item.get("name") or item.get("title") or item.get("fullName") + link = item.get("url") or item.get("href") or item.get("link") or item.get("primaryURL") + title = item.get("jobTitle") or item.get("position") or "" + if not name: + continue + if link: + link = urljoin(base_url, link) + staff.append({ + "name": name.strip(), + "url": (link or "").strip(), + "title": (title or "").strip() + }) + return staff + + def _parse_research_explorer_xml(self, text: str, base_url: str) -> List[Dict[str, 
str]]: + staff: List[Dict[str, str]] = [] + if not text: + return staff + try: + root = ET.fromstring(text) + except ET.ParseError: + return staff + + ns = {"atom": "http://www.w3.org/2005/Atom"} + for entry in root.findall(".//atom:entry", ns): + title = entry.findtext("atom:title", default="", namespaces=ns) + link_el = entry.find("atom:link", ns) + href = link_el.attrib.get("href") if link_el is not None else "" + if not title: + continue + staff.append({ + "name": title.strip(), + "url": urljoin(base_url, href) if href else "", + "title": "" + }) + return staff + + def _deduplicate_staff_dicts(self, records: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + deduped: Dict[str, Dict[str, Any]] = {} + for item in records: + name = (item.get("name") or "").strip() + url = (item.get("url") or "").strip() + if not name: + continue + key = url or name.lower() + if key not in deduped: + deduped[key] = {"name": name, "url": url, "title": item.get("title", "")} + return list(deduped.values()) + + def _append_query(self, url: str, params: Dict[str, Any]) -> str: + if not url: + return "" + delimiter = "&" if "?" in url else "?" + return f"{url}{delimiter}{urlencode(params)}" + + def _convert_staff_record(self, item: Dict[str, Any], department: str) -> Optional[Faculty]: + """将字典记录转换为 Faculty""" + name = (item.get('name') or "").strip() + url = (item.get('url') or "").strip() + if not name: + return None + + return Faculty( + name=name, + url=url, + title=(item.get('title') or "").strip() or None, + email=(item.get('email') or "").strip() or None, + department=department + ) + + def _should_attach_school_faculty(self, program: Program, school: School, strategy: Dict[str, Any]) -> bool: + """根据策略判断是否复用学院导师""" + if not strategy.get('match_by_school_keywords'): + return True + + field_name = strategy.get('metadata_keyword_field', 'keywords') + keywords = [] + for kw in school.metadata.get(field_name, []): + if isinstance(kw, str): + keywords.append(kw.lower()) + + if not keywords: + return strategy.get('assign_to_all', True) + + program_name = program.name.lower() + return any(kw in program_name for kw in keywords) + + def _clone_faculty_records(self, faculty_list: List[Faculty], limit: Optional[int]) -> List[Faculty]: + """复制学院级别导师,以免直接共享引用""" + cloned = [ + Faculty( + name=f.name, + url=f.url, + title=f.title, + email=f.email, + department=f.department + ) + for f in faculty_list + ] + + if limit and limit > 0: + return cloned[:limit] + return cloned + + def _deduplicate_faculty(self, faculty_list: List[Faculty]) -> List[Faculty]: + """按URL或姓名去重""" + deduped: Dict[str, Faculty] = {} + for faculty in faculty_list: + if not faculty.name: + continue + key = faculty.url or faculty.name.lower() + if key not in deduped: + deduped[key] = faculty + return list(deduped.values()) + + async def _find_link_by_patterns(self, patterns: List[Dict[str, Any]]) -> Optional[str]: + """根据模式查找链接""" + for pattern in patterns: + text_contains = pattern.get('text_contains', []) + href_contains = pattern.get('href_contains', []) + + result = await self.page.evaluate('''([textPatterns, hrefPatterns]) => { + const links = document.querySelectorAll('a[href]'); + + for (const link of links) { + const href = link.href || ''; + const text = link.innerText.toLowerCase().trim(); + const lowerHref = href.toLowerCase(); + + // 检查文本匹配 + const textMatch = textPatterns.length === 0 || + textPatterns.some(p => text.includes(p.toLowerCase())); + + // 检查href匹配 + const hrefMatch = hrefPatterns.length === 0 || + hrefPatterns.some(p => 
lowerHref.includes(p.toLowerCase())); + + if (textMatch && hrefMatch && href) { + return href; + } + } + + return null; + }''', [text_contains, href_contains]) + + if result: + return result + + return None + + async def _is_valid_page(self) -> bool: + """检查当前页面是否有效""" + try: + # 检查是否是404页面 + title = await self.page.title() + if '404' in title or 'not found' in title.lower(): + return False + + # 检查页面是否有内容 + body_text = await self.page.evaluate('document.body.innerText.length') + return body_text > 100 + + except Exception: + return False + + async def _get_element_attribute(self, element, attr: str) -> Optional[str]: + """获取元素属性""" + if attr == 'text': + return await element.inner_text() + elif attr == 'href': + return await element.get_attribute('href') + else: + return await element.get_attribute(attr) + + def _extract_degree_from_name(self, name: str) -> str: + """从项目名称中提取学位类型""" + patterns = [ + (r'\bM\.?S\.?\b', 'M.S.'), + (r'\bM\.?A\.?\b', 'M.A.'), + (r'\bMBA\b', 'MBA'), + (r'\bM\.?Eng\.?\b', 'M.Eng.'), + (r'\bM\.?Ed\.?\b', 'M.Ed.'), + (r'\bMaster\b', 'Master'), + (r'\bPh\.?D\.?\b', 'Ph.D.'), + ] + + for pattern, degree in patterns: + if re.search(pattern, name, re.IGNORECASE): + return degree + + return "" + + def _filter_master_programs(self, programs: List[Program]) -> List[Program]: + """过滤,只保留硕士项目""" + include = self.config.filters.program_degree_types.get('include', []) + exclude = self.config.filters.program_degree_types.get('exclude', []) + + if not include and not exclude: + return programs + + filtered = [] + for p in programs: + name_lower = p.name.lower() + degree_lower = (p.degree_type or "").lower() + + # 检查排除 + if exclude: + should_exclude = any( + ex.lower() in name_lower or ex.lower() in degree_lower + for ex in exclude + ) + if should_exclude: + continue + + # 检查包含 + if include: + should_include = any( + inc.lower() in name_lower or inc.lower() in degree_lower + for inc in include + ) + if should_include: + filtered.append(p) + else: + filtered.append(p) + + return filtered + + def _print_summary(self): + """打印爬取摘要""" + print(f"\n{'='*60}") + print("爬取完成!") + print(f"{'='*60}") + print(f"大学: {self.university.name}") + print(f"学院数: {len(self.university.schools)}") + print(f"项目数: {self.stats['programs_found']}") + print(f"导师数: {self.stats['faculty_found']}") + print(f"访问页面数: {self.stats['pages_visited']}") + + if self.stats['errors']: + print(f"\n错误数: {len(self.stats['errors'])}") + for err in self.stats['errors'][:5]: + print(f" - {err}") + + def save_results(self, output_path: str): + """保存结果到JSON文件""" + if not self.university: + raise ValueError("没有爬取结果可保存") + + output = Path(output_path) + output.parent.mkdir(parents=True, exist_ok=True) + + with open(output, 'w', encoding='utf-8') as f: + json.dump(self.university.to_dict(), f, ensure_ascii=False, indent=2) + + print(f"\n结果已保存到: {output_path}") + + +async def run_scraper(config_path: str, output_path: str, headless: bool = True): + """运行爬虫的便捷函数""" + from .config import load_config + + config = load_config(config_path) + + async with UniversityScraper(config, headless=headless) as scraper: + await scraper.scrape() + scraper.save_results(output_path) + + return scraper.university diff --git a/任务1.txt b/任务1.txt index 1f1d0ad..535334b 100644 --- a/任务1.txt +++ b/任务1.txt @@ -1,4 +1,8 @@ 构建一个自动化生成代码的agent,给定一个海外大学官网的网址,生成一套或者说一个python脚本能够爬取这个大学各级学院下的所有硕士项目的网址 和 硕士项目中各导师个人信息的网址 agent系统使用 https://docs.agno.com/ 然后其中的浏览器自动化使用playwright -使用的python工具库是,uv、ruff、ty,使用这三个构建项目。 \ No newline at end of file 
+使用的python工具库是,uv、ruff、ty,使用这三个构建项目。 +另外还需要研究一个方案: +当前生成的爬虫脚本还需要在 Python 环境中手动执行。我们希望实现的效果:生成的脚本可以存到数据库(你研究一下是存在哪里好),我们的目的是存到云端或者服务器上,后续我们可以通过接口来调用生成的脚本去爬取。 +理想情况下我们有一个页面,页面上有一个输入栏,用来输入要生成爬虫脚本的学校,然后点击“一键生成爬虫脚本”后,由我们的 agent 自动生成脚本。生成完成后会跳转到该学校的管理页面,页面上可以点击“一键运行爬虫脚本”来抓取对应项目和老师的一面数据。爬取完毕后,我们可以在页面上检查和验证爬虫抓取到的数据。 +目的是做成没有任何电脑知识的普通人也能操作的一个系统。 \ No newline at end of file diff --git a/对话总结.txt b/对话总结.txt new file mode 100644 index 0000000..e0da678 --- /dev/null +++ b/对话总结.txt @@ -0,0 +1,32 @@ +项目进度与存在的问题: +📊 项目进度总结 +✅ 已完成的功能 +Web系统架构 +FastAPI 后端 + React + TypeScript + Ant Design 前端 +SQLite 数据库存储大学、脚本、爬取结果 +动态脚本执行机制(通过 exec() 注入模块) +曼彻斯特大学爬虫 +成功爬取 297 个硕士项目 +创建了完整的学院配置 manchester_complete_scraper.py +实现了 3 种提取方法:table、links、research_explorer +项目按关键词自动分配到 13 个学院 +已修复的问题 +Windows asyncio 事件循环策略 +exec 命名空间问题(函数互相调用) +硕士项目过滤逻辑(排除本科/博士) +⚠️ 当前存在的问题 +问题 影响 原因 +网络超时 11/12 学院页面加载失败 网络不稳定或页面响应慢 +Research Explorer 页面 大量学院使用此系统 JavaScript 渲染慢,60秒超时不够 +导师数据不完整 仅获取 78 名导师(AMBS) 其他学院页面无法访问 +📈 数据统计 +指标 数量 +硕士项目总数 297 +学院分类数 13 +成功获取导师的学院 1/13 +导师总数 78 +🔧 建议的改进方向 +增加超时时间 - 对 Research Explorer 页面增加到 90-120 秒 +添加重试机制 - 失败后自动重试 2-3 次 +使用备选 URL - 为每个学院配置多个可能的 staff 页面 +分批爬取 - 将学院分批处理,避免同时请求过多 \ No newline at end of file
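
The timeout and retry suggestions listed above map directly onto the request settings that `_get_request_settings()` merges and `_goto_with_retry()` consumes in `src/university_scraper/scraper.py`. Below is a minimal sketch (not part of the patch): the dictionary keys are the ones the scraper already reads, while the concrete values, the `scraper` instance, and the `staff_url` variable in the commented usage are assumptions chosen to reflect the 90-120 second / 2-3 retry recommendation.

# Sketch: per-strategy request overrides for slow Research Explorer pages.
# Keys (timeout_ms, max_retries, retry_backoff_ms, wait_for_selector) are the
# ones _get_request_settings()/_goto_with_retry() already read; the values
# below are assumptions taken from the recommendations above.
research_explorer_request = {
    "timeout_ms": 120_000,                 # allow 90-120 s for JavaScript rendering
    "max_retries": 3,                      # retry a failed page load 2-3 times
    "retry_backoff_ms": 5_000,             # back off 5 s, 10 s, ... between attempts
    "wait_for_selector": "a.link.person",  # staff links on Research Explorer pages
}

# Hypothetical usage inside a faculty discovery strategy, assuming an already
# constructed UniversityScraper instance named `scraper` and a staff page URL:
# settings = scraper._get_request_settings("faculty", research_explorer_request)
# ok = await scraper._goto_with_retry(staff_url, settings=settings, note="staff directory")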