From a4dca812165ca504f16e90376b96c2dafb6e426c Mon Sep 17 00:00:00 2001
From: yangxiaoyu-crypto <532075404@qq.com>
Date: Wed, 10 Dec 2025 15:36:14 +0800
Subject: [PATCH] Rename test script and update documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Rename test_rwth.py to generate_scraper.py with CLI arguments
- Update README.md with comprehensive usage guide
- Add Harvard scraper as example output
- Document troubleshooting tips for common issues

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
---
 README.md                            | 207 ++++++++++---
 artifacts/harvard_faculty_scraper.py | 437 +++++++++++++++++++++++++++
 generate_scraper.py                  | 135 +++++++++
 test_rwth.py                         |  45 ---
 4 files changed, 729 insertions(+), 95 deletions(-)
 create mode 100644 artifacts/harvard_faculty_scraper.py
 create mode 100644 generate_scraper.py
 delete mode 100644 test_rwth.py

diff --git a/README.md b/README.md
index b8f0ea0..47324ff 100644
--- a/README.md
+++ b/README.md
@@ -1,73 +1,180 @@
 # University Playwright Codegen Agent
-An automated code-generation agent built on [Agno](https://docs.agno.com/): given the root URL of an overseas university website, it generates a Python script that uses **Playwright** to scrape the master's program URLs under each school/graduate school and the personal profile pages of the supervisors (Supervisor/Faculty) listed in those programs. The project uses `uv` for dependency management, `ruff` for linting, and `ty` for type checking, and provides a Typer-based CLI.
+An automated code-generation agent built on [Agno](https://docs.agno.com/): given the root URL of an overseas university website, it generates a Python script that uses **Playwright** to scrape the master's program URLs under each school/graduate school and the personal profile pages of the supervisors (Supervisor/Faculty) listed in those programs.
-## Features
+## Quick Start
-- ✅ **Agno Agent**: uses `output_schema` to enforce structured output, producing a `ScriptPlan` that is then rendered into an executable script.
-- ✅ **Playwright sampling**: before planning, the site is lightly crawled with Playwright to help the agent choose keywords and a navigation strategy.
-- ✅ **Deterministic script template**: the template includes BFS crawling, keyword filtering, and JSON output, so the "master's programs + supervisors" requirement is covered.
-- ✅ **uv + ruff + ty workflow**: a modern Python toolchain that works out of the box.
+### 1. Environment setup
-## Getting started
+```bash
+# Clone the project
+git clone https://git.prodream.cn/YXY/University-Playwright-Codegen-Agent.git
+cd University-Playwright-Codegen-Agent
-1. **Create a virtual environment and install dependencies**
+# Install dependencies (requires uv)
+uv sync
-   ```bash
-   uv venv --python 3.12
-   uv pip install -r pyproject.toml
-   playwright install  # install the browser binaries
-   ```
+# Install Playwright browsers
+uv run playwright install
+```
-2. **Configure the LLM API key**
+### 2. Configure the API key
-   - OpenAI: `export OPENAI_API_KEY=...`
-   - Anthropic: `export ANTHROPIC_API_KEY=...`
-   - Switch between `openai` and `anthropic` via the `CODEGEN_MODEL_PROVIDER` environment variable.
+The project calls Claude models through the OpenRouter API. Set the environment variable:
-3. **Run the CLI to generate a script**
+**Windows (PowerShell):**
+```powershell
+[Environment]::SetEnvironmentVariable("OPENROUTER_API_KEY", "your-api-key", "User")
+```
-   ```bash
-   uv run university-agent generate \
-     "https://www.example.edu" \
-     --campus "Example Campus" \
-     --language "English" \
-     --max-depth 2 \
-     --max-pages 60
-   ```
+**Windows (CMD):**
+```cmd
+setx OPENROUTER_API_KEY "your-api-key"
+```
-   When it finishes, the generated Playwright script appears under `artifacts/`, and the planned keywords and verification steps are printed in the terminal.
+**Linux/macOS:**
+```bash
+export OPENROUTER_API_KEY="your-api-key"
+```
-4. **Run Ruff & Ty checks**
+Alternatively, copy `.env.example` to `.env` and fill in the API key.
-   ```bash
-   uv run ruff check
-   uvx ty check
-   ```
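+
+A minimal `.env` sketch (assuming the pydantic `Settings` in `src/university_agent/config.py` loads these variables from `.env`; the two `CODEGEN_*` entries are optional and simply mirror the values that `generate_scraper.py` sets in code):
+
+```bash
+# .env - assumed key names; check .env.example for the authoritative list
+OPENROUTER_API_KEY=your-api-key
+CODEGEN_MODEL_PROVIDER=openrouter
+CODEGEN_OPENROUTER_MODEL=anthropic/claude-3-opus
+```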
+
+### 3. Generate the scraper script
-## Project structure
+**Option 1: command-line arguments**
+```bash
+uv run python generate_scraper.py \
+    --url "https://www.harvard.edu/" \
+    --name "Harvard" \
+    --language "English" \
+    --max-depth 3 \
+    --max-pages 30
+```
+
+**Option 2: edit the configuration in the script**
+
+Edit the configuration block at the top of `generate_scraper.py`:
+```python
+TARGET_URL = "https://www.example.edu/"
+CAMPUS_NAME = "Example University"
+LANGUAGE = "English"
+MAX_DEPTH = 3
+MAX_PAGES = 30
+```
+
+Then run:
+```bash
+uv run python generate_scraper.py
+```
+
+### 4. Run the generated scraper
+
+The generated scripts are saved in the `artifacts/` directory:
+
+```bash
+cd artifacts
+uv run python harvard_faculty_scraper.py --max-pages 50 --no-verify
+```
+
+**Common options:**
+| Option | Description | Default |
+|------|------|--------|
+| `--max-pages` | Maximum number of pages to crawl | 30 |
+| `--max-depth` | Maximum crawl depth | 3 |
+| `--no-verify` | Skip link verification (recommended) | False |
+| `--browser` | Browser engine (chromium/firefox/webkit) | chromium |
+| `--timeout` | Page load timeout (ms) | 20000 |
+| `--output` | Output file path | university-scraper_results.json |
+
+### 5. Inspect the results
+
+Crawl results are saved as a JSON file:
+
+```json
+{
+  "statistics": {
+    "total_links": 277,
+    "program_links": 8,
+    "faculty_links": 269,
+    "profile_pages": 265
+  },
+  "program_links": [...],
+  "faculty_links": [...]
+}
+```
+
+## Using the CLI (optional)
+
+The project also provides a Typer CLI:
+
+```bash
+uv run university-agent generate \
+  "https://www.example.edu" \
+  --campus "Example Campus" \
+  --language "English" \
+  --max-depth 2 \
+  --max-pages 60
+```
+
+## Tested universities
+
+| University | Status | Notes |
+|------|------|------|
+| Harvard | ✅ | Found 277 links |
+| RWTH Aachen | ✅ | Found 108 links |
+| KAUST | ✅ | Requires Firefox; the site is slow |
+
+## Troubleshooting
+
+### Timeout errors
+Some sites respond slowly; increase the timeout:
+```bash
+uv run python xxx_scraper.py --timeout 60000 --no-verify
+```
+
+### Browser blocked
+Some sites (such as KAUST) block Chromium; switch to Firefox:
+```bash
+uv run python xxx_scraper.py --browser firefox
+```
+
+### API key errors
+Make sure the `OPENROUTER_API_KEY` environment variable is set correctly:
+```bash
+echo $OPENROUTER_API_KEY   # Linux/macOS
+echo %OPENROUTER_API_KEY%  # Windows CMD
+```
+
+## Project Structure
 
 ```
 ├── README.md
+├── generate_scraper.py      # main entry script
+├── .env.example             # environment variable template
 ├── pyproject.toml
-├── src/university_agent
-│   ├── agent.py        # Agno Agent configuration
-│   ├── cli.py          # Typer CLI
-│   ├── config.py       # pydantic Settings
-│   ├── generator.py    # orchestration engine
-│   ├── models.py       # data models (request/plan/result)
-│   ├── renderer.py     # ScriptPlan -> Playwright script
-│   ├── sampler.py      # Playwright sampling
-│   ├── templates/
-│   │   └── playwright_script.py.jinja
-│   └── writer.py       # writes scripts to artifacts/
-└── 任务1.txt
+├── artifacts/               # generated scraper scripts
+│   ├── harvard_faculty_scraper.py
+│   ├── kaust_faculty_scraper.py
+│   └── ...
+└── src/university_agent/
+    ├── agent.py       # Agno Agent configuration
+    ├── cli.py         # Typer CLI
+    ├── config.py      # pydantic Settings
+    ├── generator.py   # orchestration engine
+    ├── models.py      # data models
+    ├── renderer.py    # ScriptPlan -> Playwright script
+    ├── sampler.py     # Playwright sampling
+    └── writer.py      # script writer
 ```
 
-## Tips
+## Features
 
-- `university-agent generate --help` shows all CLI options; you can skip sampling or export the plan as JSON.
-- If the Agno Agent needs additional tools, extend `agent.py` with custom `tool`s.
-- Playwright sampling may need extra browser dependencies in some environments; run `playwright install` as the official prompts suggest.
+- **Agno Agent**: uses `output_schema` to enforce structured output
+- **Playwright sampling**: lightweight crawl of the site before generation
+- **Deterministic script template**: BFS crawling, keyword filtering, JSON output
+- **OpenRouter support**: use Claude models via OpenRouter
+- **uv + ruff + ty workflow**: a modern Python toolchain
 
-Happy building! 
🎓🤖 +## License + +MIT diff --git a/artifacts/harvard_faculty_scraper.py b/artifacts/harvard_faculty_scraper.py new file mode 100644 index 0000000..6d2d448 --- /dev/null +++ b/artifacts/harvard_faculty_scraper.py @@ -0,0 +1,437 @@ +#!/usr/bin/env python +""" +Auto-generated by the Agno codegen agent. +Target university: Harvard (https://www.harvard.edu/) +Requested caps: depth=3, pages=30 + +Plan description: Playwright scraper for university master programs and faculty profiles. +Navigation strategy: Start at https://www.harvard.edu/ Follow links to /academics/ and /a-to-z/ to find list of schools and departments For each school/department, look for a 'faculty' or 'people' page On faculty directory pages, identify and follow links to individual profiles Check for school/department specific subdomains like hls.harvard.edu, hds.harvard.edu, etc. Prioritize crawling faculty directory pages over general site crawling +Verification checklist: +- Manually review a sample of scraped URLs to verify they are faculty profiles +- Check that major academic departments are represented in the results +- Verify the script is capturing profile page content, not just URLs +- Confirm no login pages, application forms, or directory pages are included +Playwright snapshot used to guide this plan: +1. Harvard University (https://www.harvard.edu/) + Snippet: Skip to main content Harvard University Learn about our lawsuits to protect our students and researchers Search Menu David Liu received the 2025 Breakthrough Prize in Life Sciences for developing a revolutionary gene-editing platforms that precisely corrects genetic mutations. + Anchors: Skip to main content -> https://www.harvard.edu/#main-content, Harvard University -> https://www.harvard.edu/, Learn about our lawsuits to protect our students and researchers -> https://www.harvard.edu/federal-lawsuits/, × -> javascript:void(0), A to Z index -> https://www.harvard.edu/a-to-z/, Academics -> https://www.harvard.edu/academics/ +2. Index of departments, schools, and affiliates - Harvard University (https://www.harvard.edu/a-to-z/) + Snippet: Skip to main content Harvard University Learn about our lawsuits to protect our students and researchers Search Menu David Liu received the 2025 Breakthrough Prize in Life Sciences for developing a revolutionary gene-editing platforms that precisely corrects genetic mutations. + Anchors: Skip to main content -> https://www.harvard.edu/a-to-z/#main-content, Harvard University -> https://www.harvard.edu/, Learn about our lawsuits to protect our students and researchers -> https://www.harvard.edu/federal-lawsuits/, × -> javascript:void(0), A to Z index -> https://www.harvard.edu/a-to-z/, Academics -> https://www.harvard.edu/academics/ +3. Academics - Harvard University (https://www.harvard.edu/academics/) + Snippet: Skip to main content Harvard University Learn about our lawsuits to protect our students and researchers Search Menu David Liu received the 2025 Breakthrough Prize in Life Sciences for developing a revolutionary gene-editing platforms that precisely corrects genetic mutations. + Anchors: Skip to main content -> https://www.harvard.edu/academics/#main-content, Harvard University -> https://www.harvard.edu/, Learn about our lawsuits to protect our students and researchers -> https://www.harvard.edu/federal-lawsuits/, A to Z index -> https://www.harvard.edu/a-to-z/, Academics -> https://www.harvard.edu/academics/, Undergraduate Degrees -> https://www.harvard.edu//programs/?degree_levels=undergraduate +4. 
Programs - Harvard University (https://www.harvard.edu//programs/?degree_levels=undergraduate) + Snippet: Skip to main content Harvard University Learn about our lawsuits to protect our students and researchers Search Menu David Liu received the 2025 Breakthrough Prize in Life Sciences for developing a revolutionary gene-editing platforms that precisely corrects genetic mutations. + Anchors: Skip to main content -> https://www.harvard.edu/programs/?degree_levels=undergraduate#main-content, Harvard University -> https://www.harvard.edu/, Learn about our lawsuits to protect our students and researchers -> https://www.harvard.edu/federal-lawsuits/, A to Z index -> https://www.harvard.edu/a-to-z/, Academics -> https://www.harvard.edu/academics/, Undergraduate Degrees -> https://www.harvard.edu//programs/?degree_levels=undergraduate +Snapshot truncated. + +Generated at: 2025-12-10T07:19:12.294884+00:00 +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import time +from collections import deque +from dataclasses import asdict, dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import Deque, Iterable, List, Set, Tuple +from urllib.parse import urljoin, urldefrag, urlparse + +from playwright.async_api import async_playwright, Page, Response + +PROGRAM_KEYWORDS = ['/graduate/', '/masters/', '/programs/?degree_levels=graduate', '/mpp/', 'Master of', 'M.S.', 'M.A.', 'graduate program'] +FACULTY_KEYWORDS = ['/people/', '/~', '/faculty/', '/profile/', 'professor', 'dr.', 'ph.d.', 'firstname-lastname'] +EXCLUSION_KEYWORDS = ['admissions', 'apply', 'tuition', 'news', 'events', 'calendar', 'careers', 'jobs', 'login', 'donate', 'alumni', 'giving'] +METADATA_FIELDS = ['url', 'title', 'entity_type', 'department', 'email', 'scraped_at'] +EXTRA_NOTES = ['Many Harvard faculty have profiles under the /~username/ URL pattern', 'Some faculty may be cross-listed in multiple departments', 'Prioritize finding profiles from professional schools (business, law, medicine, etc.)', "Check for non-standard faculty titles like 'lecturer', 'fellow', 'researcher'"] + +# URL patterns that indicate individual profile pages +PROFILE_URL_PATTERNS = [ + "/people/", "/person/", "/profile/", "/profiles/", + "/faculty/", "/staff/", "/directory/", + "/~", # Unix-style personal pages + "/bio/", "/about/", +] + +# URL patterns that indicate listing/directory pages (should be crawled deeper) +DIRECTORY_URL_PATTERNS = [ + "/faculty", "/people", "/directory", "/staff", + "/team", "/members", "/researchers", +] + + +def normalize_url(base: str, href: str) -> str: + """Normalize URL by resolving relative paths and removing fragments.""" + absolute = urljoin(base, href) + cleaned, _ = urldefrag(absolute) + # Remove trailing slash for consistency + return cleaned.rstrip("/") + + +def matches_any(text: str, keywords: Iterable[str]) -> bool: + """Check if text contains any of the keywords (case-insensitive).""" + lowered = text.lower() + return any(keyword.lower() in lowered for keyword in keywords) + + +def is_same_domain(url1: str, url2: str) -> bool: + """Check if two URLs belong to the same root domain.""" + domain1 = urlparse(url1).netloc.replace("www.", "") + domain2 = urlparse(url2).netloc.replace("www.", "") + # Allow subdomains of the same root domain + parts1 = domain1.split(".") + parts2 = domain2.split(".") + if len(parts1) >= 2 and len(parts2) >= 2: + return parts1[-2:] == parts2[-2:] + return domain1 == domain2 + + +def is_profile_url(url: str) 
-> bool: + """Check if URL pattern suggests an individual profile page.""" + url_lower = url.lower() + return any(pattern in url_lower for pattern in PROFILE_URL_PATTERNS) + + +def is_directory_url(url: str) -> bool: + """Check if URL pattern suggests a directory/listing page.""" + url_lower = url.lower() + return any(pattern in url_lower for pattern in DIRECTORY_URL_PATTERNS) + + +@dataclass +class ScrapedLink: + url: str + title: str + text: str + source_url: str + bucket: str # "program" or "faculty" + is_verified: bool = False + http_status: int = 0 + is_profile_page: bool = False + scraped_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat()) + + +@dataclass +class ScrapeSettings: + root_url: str + max_depth: int + max_pages: int + headless: bool + output: Path + verify_links: bool = True + request_delay: float = 1.0 # Polite crawling delay + + +async def extract_links(page: Page) -> List[Tuple[str, str]]: + """Extract all anchor links from the page.""" + anchors: Iterable[dict] = await page.eval_on_selector_all( + "a", + """elements => elements + .map(el => ({text: (el.textContent || '').trim(), href: el.href})) + .filter(item => item.text && item.href && item.href.startsWith('http'))""", + ) + return [(item["href"], item["text"]) for item in anchors] + + +async def get_page_title(page: Page) -> str: + """Get the page title safely.""" + try: + return await page.title() or "" + except Exception: + return "" + + +async def verify_link(context, url: str, timeout: int = 10000) -> Tuple[bool, int, str]: + """ + Verify a link by making a HEAD-like request. + Returns: (is_valid, status_code, page_title) + """ + page = await context.new_page() + try: + response: Response = await page.goto(url, wait_until="domcontentloaded", timeout=timeout) + if response: + status = response.status + title = await get_page_title(page) + is_valid = 200 <= status < 400 + return is_valid, status, title + return False, 0, "" + except Exception: + return False, 0, "" + finally: + await page.close() + + +async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink]: + """ + Crawl the website using BFS, collecting program and faculty links. 
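+    Links whose URLs match DIRECTORY_URL_PATTERNS are queued with priority 0 and are
+    dequeued before ordinary links (priority 1), so faculty/people directory pages are
+    visited early within the max_pages budget.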
+ Features: + - URL deduplication + - Link verification + - Profile page detection + - Polite crawling with delays + """ + async with async_playwright() as p: + browser_launcher = getattr(p, browser_name) + browser = await browser_launcher.launch(headless=settings.headless) + context = await browser.new_context() + + # Priority queue: (priority, url, depth) - lower priority = processed first + # Directory pages get priority 0, others get priority 1 + queue: Deque[Tuple[int, str, int]] = deque([(0, settings.root_url, 0)]) + visited: Set[str] = set() + found_urls: Set[str] = set() # For deduplication of results + results: List[ScrapedLink] = [] + + print(f"Starting crawl from: {settings.root_url}") + print(f"Max depth: {settings.max_depth}, Max pages: {settings.max_pages}") + + try: + while queue and len(visited) < settings.max_pages: + # Sort queue by priority (directory pages first) + queue = deque(sorted(queue, key=lambda x: x[0])) + priority, url, depth = queue.popleft() + + normalized_url = normalize_url(settings.root_url, url) + if normalized_url in visited or depth > settings.max_depth: + continue + + # Only crawl same-domain URLs + if not is_same_domain(settings.root_url, normalized_url): + continue + + visited.add(normalized_url) + print(f"[{len(visited)}/{settings.max_pages}] Depth {depth}: {normalized_url[:80]}...") + + page = await context.new_page() + try: + response = await page.goto( + normalized_url, wait_until="domcontentloaded", timeout=20000 + ) + if not response or response.status >= 400: + await page.close() + continue + except Exception as e: + print(f" Error: {e}") + await page.close() + continue + + page_title = await get_page_title(page) + links = await extract_links(page) + + for href, text in links: + normalized_href = normalize_url(normalized_url, href) + + # Skip if already found or is excluded + if normalized_href in found_urls: + continue + if matches_any(text, EXCLUSION_KEYWORDS) or matches_any(normalized_href, EXCLUSION_KEYWORDS): + continue + + text_lower = text.lower() + href_lower = normalized_href.lower() + is_profile = is_profile_url(normalized_href) + + # Check for program links + if matches_any(text_lower, PROGRAM_KEYWORDS) or matches_any(href_lower, PROGRAM_KEYWORDS): + found_urls.add(normalized_href) + results.append( + ScrapedLink( + url=normalized_href, + title="", + text=text[:200], + source_url=normalized_url, + bucket="program", + is_profile_page=False, + ) + ) + + # Check for faculty links + if matches_any(text_lower, FACULTY_KEYWORDS) or matches_any(href_lower, FACULTY_KEYWORDS): + found_urls.add(normalized_href) + results.append( + ScrapedLink( + url=normalized_href, + title="", + text=text[:200], + source_url=normalized_url, + bucket="faculty", + is_profile_page=is_profile, + ) + ) + + # Queue for further crawling + if depth < settings.max_depth and is_same_domain(settings.root_url, normalized_href): + # Prioritize directory pages + link_priority = 0 if is_directory_url(normalized_href) else 1 + queue.append((link_priority, normalized_href, depth + 1)) + + await page.close() + + # Polite delay between requests + await asyncio.sleep(settings.request_delay) + + finally: + await context.close() + await browser.close() + + # Verify links if enabled + if settings.verify_links and results: + print(f"\nVerifying {len(results)} links...") + browser = await browser_launcher.launch(headless=True) + context = await browser.new_context() + + verified_results = [] + for i, link in enumerate(results): + if link.url in [r.url for r in verified_results]: + 
continue # Skip duplicates + + print(f" [{i+1}/{len(results)}] Verifying: {link.url[:60]}...") + is_valid, status, title = await verify_link(context, link.url) + link.is_verified = True + link.http_status = status + link.title = title or link.text + + if is_valid: + verified_results.append(link) + else: + print(f" Invalid (HTTP {status})") + + await asyncio.sleep(0.5) # Delay between verifications + + await context.close() + await browser.close() + results = verified_results + + return results + + +def deduplicate_results(results: List[ScrapedLink]) -> List[ScrapedLink]: + """Remove duplicate URLs, keeping the first occurrence.""" + seen: Set[str] = set() + unique = [] + for link in results: + if link.url not in seen: + seen.add(link.url) + unique.append(link) + return unique + + +def serialize(results: List[ScrapedLink], target: Path, root_url: str) -> None: + """Save results to JSON file with statistics.""" + results = deduplicate_results(results) + + program_links = [link for link in results if link.bucket == "program"] + faculty_links = [link for link in results if link.bucket == "faculty"] + profile_pages = [link for link in faculty_links if link.is_profile_page] + + payload = { + "root_url": root_url, + "generated_at": datetime.now(timezone.utc).isoformat(), + "statistics": { + "total_links": len(results), + "program_links": len(program_links), + "faculty_links": len(faculty_links), + "profile_pages": len(profile_pages), + "verified_links": len([r for r in results if r.is_verified and r.http_status == 200]), + }, + "program_links": [asdict(link) for link in program_links], + "faculty_links": [asdict(link) for link in faculty_links], + "notes": EXTRA_NOTES, + "metadata_fields": METADATA_FIELDS, + } + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") + + print(f"\nResults saved to: {target}") + print(f" Total links: {len(results)}") + print(f" Program links: {len(program_links)}") + print(f" Faculty links: {len(faculty_links)}") + print(f" Profile pages: {len(profile_pages)}") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Playwright scraper generated by the Agno agent for https://www.harvard.edu/." 
+ ) + parser.add_argument( + "--root-url", + default="https://www.harvard.edu/", + help="Seed url to start crawling from.", + ) + parser.add_argument( + "--max-depth", + type=int, + default=3, + help="Maximum crawl depth.", + ) + parser.add_argument( + "--max-pages", + type=int, + default=30, + help="Maximum number of pages to visit.", + ) + parser.add_argument( + "--output", + type=Path, + default=Path("university-scraper_results.json"), + help="Where to save the JSON output.", + ) + parser.add_argument( + "--headless", + action="store_true", + default=True, + help="Run browser in headless mode (default: True).", + ) + parser.add_argument( + "--no-headless", + action="store_false", + dest="headless", + help="Run browser with visible window.", + ) + parser.add_argument( + "--browser", + choices=["chromium", "firefox", "webkit"], + default="chromium", + help="Browser engine to launch via Playwright.", + ) + parser.add_argument( + "--no-verify", + action="store_true", + default=False, + help="Skip link verification step.", + ) + parser.add_argument( + "--delay", + type=float, + default=1.0, + help="Delay between requests in seconds (polite crawling).", + ) + return parser.parse_args() + + +async def main_async() -> None: + args = parse_args() + settings = ScrapeSettings( + root_url=args.root_url, + max_depth=args.max_depth, + max_pages=args.max_pages, + headless=args.headless, + output=args.output, + verify_links=not args.no_verify, + request_delay=args.delay, + ) + links = await crawl(settings, browser_name=args.browser) + serialize(links, settings.output, settings.root_url) + + +def main() -> None: + asyncio.run(main_async()) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/generate_scraper.py b/generate_scraper.py new file mode 100644 index 0000000..0ef8f35 --- /dev/null +++ b/generate_scraper.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +""" +University Scraper Generator + +This script generates a Playwright-based web scraper for any university website. +It uses an AI agent to analyze the university's website structure and create +a customized scraper that collects master's program pages and faculty profiles. 
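+The generated script is written to the artifacts/ directory; its exact path is printed
+when generation finishes.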
+ +Usage: + python generate_scraper.py + +Configuration: + Set the following variables below: + - TARGET_URL: The university homepage URL + - CAMPUS_NAME: Short name for the university + - LANGUAGE: Primary language of the website + - MAX_DEPTH: How deep to crawl (default: 3) + - MAX_PAGES: Maximum pages to visit during sampling (default: 30) +""" +import argparse +import os +import sys + +# ============================================================================ +# CONFIGURATION - Modify these values for your target university +# ============================================================================ +TARGET_URL = "https://www.harvard.edu/" +CAMPUS_NAME = "Harvard" +LANGUAGE = "English" +MAX_DEPTH = 3 +MAX_PAGES = 30 +# ============================================================================ + + +def get_env_key(name: str) -> str | None: + """Get environment variable, with Windows registry fallback.""" + # Try standard environment variable first + value = os.environ.get(name) + if value: + return value + + # Windows: try reading from user environment in registry + if sys.platform == "win32": + try: + import winreg + with winreg.OpenKey(winreg.HKEY_CURRENT_USER, r"Environment") as key: + return winreg.QueryValueEx(key, name)[0] + except Exception: + pass + + return None + + +def main(): + parser = argparse.ArgumentParser( + description="Generate a Playwright scraper for a university website" + ) + parser.add_argument( + "--url", + default=TARGET_URL, + help="University homepage URL" + ) + parser.add_argument( + "--name", + default=CAMPUS_NAME, + help="Short name for the university" + ) + parser.add_argument( + "--language", + default=LANGUAGE, + help="Primary language of the website" + ) + parser.add_argument( + "--max-depth", + type=int, + default=MAX_DEPTH, + help="Maximum crawl depth" + ) + parser.add_argument( + "--max-pages", + type=int, + default=MAX_PAGES, + help="Maximum pages to visit during sampling" + ) + parser.add_argument( + "--no-snapshot", + action="store_true", + help="Skip browser snapshot capture" + ) + args = parser.parse_args() + + # Configure OpenRouter API + openrouter_key = get_env_key("OPENROUTER_API_KEY") + if not openrouter_key: + print("Error: OPENROUTER_API_KEY environment variable not set") + print("Please set it with your OpenRouter API key") + sys.exit(1) + + os.environ["OPENAI_API_KEY"] = openrouter_key + os.environ["CODEGEN_MODEL_PROVIDER"] = "openrouter" + os.environ["CODEGEN_OPENROUTER_MODEL"] = "anthropic/claude-3-opus" + + # Import after environment is configured + from university_agent import GenerationEngine, GenerationRequest, Settings + + settings = Settings() + print(f"Provider: {settings.model_provider}") + print(f"Model: {settings.openrouter_model}") + + engine = GenerationEngine(settings) + request = GenerationRequest( + target_url=args.url, + campus_name=args.name, + assumed_language=args.language, + max_depth=args.max_depth, + max_pages=args.max_pages, + ) + + print(f"\nGenerating scraper for: {args.name}") + print(f"URL: {args.url}") + print(f"Max depth: {args.max_depth}, Max pages: {args.max_pages}") + print("-" * 50) + + result = engine.generate(request, capture_snapshot=not args.no_snapshot) + + print("-" * 50) + print(f"Script saved to: {result.script_path}") + print(f"Project slug: {result.plan.project_slug}") + print(f"\nTo run the scraper:") + print(f" cd artifacts") + print(f" uv run python {result.script_path.name} --max-pages 50 --no-verify") + + +if __name__ == "__main__": + main() diff --git a/test_rwth.py 
b/test_rwth.py deleted file mode 100644 index dc431e6..0000000 --- a/test_rwth.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python -"""Test script to run the agent for RWTH Aachen University.""" -import os - -# Configure OpenRouter - set OPENROUTER_API_KEY environment variable -import winreg - -def get_user_env(name): - """Get user environment variable on Windows.""" - try: - with winreg.OpenKey(winreg.HKEY_CURRENT_USER, r"Environment") as key: - return winreg.QueryValueEx(key, name)[0] - except Exception: - return os.environ.get(name) - -openrouter_key = get_user_env("OPENROUTER_API_KEY") -if not openrouter_key: - raise ValueError("Please set OPENROUTER_API_KEY environment variable") - -os.environ["OPENAI_API_KEY"] = openrouter_key -os.environ["CODEGEN_MODEL_PROVIDER"] = "openrouter" -os.environ["CODEGEN_OPENROUTER_MODEL"] = "anthropic/claude-3-opus" - -print("Using OpenRouter with Claude 3 Opus") - -# Run the agent -from university_agent import GenerationEngine, GenerationRequest, Settings - -settings = Settings() -print(f"Provider: {settings.model_provider}") -print(f"Model: {settings.openrouter_model}") - -engine = GenerationEngine(settings) -request = GenerationRequest( - target_url="https://www.kaust.edu.sa/en/", - campus_name="KAUST", - assumed_language="English", - max_depth=3, - max_pages=30, -) - -print("Starting generation for KAUST...") -result = engine.generate(request, capture_snapshot=True) -print(f"Script saved to: {result.script_path}") -print(f"Project slug: {result.plan.project_slug}")