University-Playwright-Codeg…/generate_scraper.py
yangxiaoyu-crypto a4dca81216 Rename test script and update documentation
- Rename test_rwth.py to generate_scraper.py with CLI arguments
- Update README.md with comprehensive usage guide
- Add Harvard scraper as example output
- Document troubleshooting tips for common issues

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-10 15:36:14 +08:00


#!/usr/bin/env python
"""
University Scraper Generator
This script generates a Playwright-based web scraper for any university website.
It uses an AI agent to analyze the university's website structure and create
a customized scraper that collects master's program pages and faculty profiles.
Usage:
python generate_scraper.py
Configuration:
Set the following variables below:
- TARGET_URL: The university homepage URL
- CAMPUS_NAME: Short name for the university
- LANGUAGE: Primary language of the website
- MAX_DEPTH: How deep to crawl (default: 3)
- MAX_PAGES: Maximum pages to visit during sampling (default: 30)
"""
import argparse
import os
import sys
# ============================================================================
# CONFIGURATION - Modify these values for your target university
# ============================================================================
TARGET_URL = "https://www.harvard.edu/"
CAMPUS_NAME = "Harvard"
LANGUAGE = "English"
MAX_DEPTH = 3
MAX_PAGES = 30
# ============================================================================
def get_env_key(name: str) -> str | None:
    """Get environment variable, with Windows registry fallback."""
    # Try standard environment variable first
    value = os.environ.get(name)
    if value:
        return value

    # Windows: try reading from user environment in registry
    if sys.platform == "win32":
        try:
            import winreg
            with winreg.OpenKey(winreg.HKEY_CURRENT_USER, r"Environment") as key:
                return winreg.QueryValueEx(key, name)[0]
        except Exception:
            pass
    return None
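# Note: on Windows, per-user variables set with `setx` or the System Properties
# dialog are stored under HKEY_CURRENT_USER\Environment and are not visible to
# shells that were already running, which is presumably why the registry
# fallback above exists. Usage sketch:
#
#   api_key = get_env_key("OPENROUTER_API_KEY")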
def main():
    parser = argparse.ArgumentParser(
        description="Generate a Playwright scraper for a university website"
    )
    parser.add_argument(
        "--url",
        default=TARGET_URL,
        help="University homepage URL"
    )
    parser.add_argument(
        "--name",
        default=CAMPUS_NAME,
        help="Short name for the university"
    )
    parser.add_argument(
        "--language",
        default=LANGUAGE,
        help="Primary language of the website"
    )
    parser.add_argument(
        "--max-depth",
        type=int,
        default=MAX_DEPTH,
        help="Maximum crawl depth"
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=MAX_PAGES,
        help="Maximum pages to visit during sampling"
    )
    parser.add_argument(
        "--no-snapshot",
        action="store_true",
        help="Skip browser snapshot capture"
    )
    args = parser.parse_args()
    # Configure OpenRouter API
    openrouter_key = get_env_key("OPENROUTER_API_KEY")
    if not openrouter_key:
        print("Error: OPENROUTER_API_KEY environment variable not set")
        print("Please set it with your OpenRouter API key")
        sys.exit(1)

    os.environ["OPENAI_API_KEY"] = openrouter_key
    os.environ["CODEGEN_MODEL_PROVIDER"] = "openrouter"
    os.environ["CODEGEN_OPENROUTER_MODEL"] = "anthropic/claude-3-opus"
    # Import after environment is configured
    from university_agent import GenerationEngine, GenerationRequest, Settings

    settings = Settings()
    print(f"Provider: {settings.model_provider}")
    print(f"Model: {settings.openrouter_model}")

    engine = GenerationEngine(settings)
    request = GenerationRequest(
        target_url=args.url,
        campus_name=args.name,
        assumed_language=args.language,
        max_depth=args.max_depth,
        max_pages=args.max_pages,
    )

    print(f"\nGenerating scraper for: {args.name}")
    print(f"URL: {args.url}")
    print(f"Max depth: {args.max_depth}, Max pages: {args.max_pages}")
    print("-" * 50)

    result = engine.generate(request, capture_snapshot=not args.no_snapshot)

    print("-" * 50)
    print(f"Script saved to: {result.script_path}")
    print(f"Project slug: {result.plan.project_slug}")
    print("\nTo run the scraper:")
    print("  cd artifacts")
    print(f"  uv run python {result.script_path.name} --max-pages 50 --no-verify")


if __name__ == "__main__":
    main()