#!/usr/bin/env python
"""
University Scraper Generator

This script generates a Playwright-based web scraper for any university
website. It uses an AI agent to analyze the university's website structure
and create a customized scraper that collects master's program pages and
faculty profiles.

Usage:
    python generate_scraper.py

Configuration:
    Set the following variables below:
    - TARGET_URL: The university homepage URL
    - CAMPUS_NAME: Short name for the university
    - LANGUAGE: Primary language of the website
    - MAX_DEPTH: How deep to crawl (default: 3)
    - MAX_PAGES: Maximum pages to visit during sampling (default: 30)
"""

import argparse
import os
import sys

# ============================================================================
# CONFIGURATION - Modify these values for your target university
# ============================================================================

TARGET_URL = "https://www.harvard.edu/"
CAMPUS_NAME = "Harvard"
LANGUAGE = "English"
MAX_DEPTH = 3
MAX_PAGES = 30

# ============================================================================


def get_env_key(name: str) -> str | None:
    """Get environment variable, with Windows registry fallback."""
    # Try standard environment variable first
    value = os.environ.get(name)
    if value:
        return value

    # Windows: try reading from user environment in registry
    if sys.platform == "win32":
        try:
            import winreg

            with winreg.OpenKey(winreg.HKEY_CURRENT_USER, r"Environment") as key:
                return winreg.QueryValueEx(key, name)[0]
        except Exception:
            pass

    return None


def main():
    parser = argparse.ArgumentParser(
        description="Generate a Playwright scraper for a university website"
    )
    parser.add_argument(
        "--url", default=TARGET_URL, help="University homepage URL"
    )
    parser.add_argument(
        "--name", default=CAMPUS_NAME, help="Short name for the university"
    )
    parser.add_argument(
        "--language", default=LANGUAGE, help="Primary language of the website"
    )
    parser.add_argument(
        "--max-depth", type=int, default=MAX_DEPTH, help="Maximum crawl depth"
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=MAX_PAGES,
        help="Maximum pages to visit during sampling",
    )
    parser.add_argument(
        "--no-snapshot", action="store_true", help="Skip browser snapshot capture"
    )
    args = parser.parse_args()

    # Configure OpenRouter API
    openrouter_key = get_env_key("OPENROUTER_API_KEY")
    if not openrouter_key:
        print("Error: OPENROUTER_API_KEY environment variable not set")
        print("Please set it with your OpenRouter API key")
        sys.exit(1)

    os.environ["OPENAI_API_KEY"] = openrouter_key
    os.environ["CODEGEN_MODEL_PROVIDER"] = "openrouter"
    os.environ["CODEGEN_OPENROUTER_MODEL"] = "anthropic/claude-3-opus"

    # Import after environment is configured
    from university_agent import GenerationEngine, GenerationRequest, Settings

    settings = Settings()
    print(f"Provider: {settings.model_provider}")
    print(f"Model: {settings.openrouter_model}")

    engine = GenerationEngine(settings)
    request = GenerationRequest(
        target_url=args.url,
        campus_name=args.name,
        assumed_language=args.language,
        max_depth=args.max_depth,
        max_pages=args.max_pages,
    )

    print(f"\nGenerating scraper for: {args.name}")
    print(f"URL: {args.url}")
    print(f"Max depth: {args.max_depth}, Max pages: {args.max_pages}")
    print("-" * 50)

    result = engine.generate(request, capture_snapshot=not args.no_snapshot)

    print("-" * 50)
    print(f"Script saved to: {result.script_path}")
    print(f"Project slug: {result.plan.project_slug}")
    print("\nTo run the scraper:")
    print("  cd artifacts")
    print(f"  uv run python {result.script_path.name} --max-pages 50 --no-verify")


if __name__ == "__main__":
    main()