- Rename test_rwth.py to generate_scraper.py with CLI arguments - Update README.md with comprehensive usage guide - Add Harvard scraper as example output - Document troubleshooting tips for common issues 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
136 lines
4.0 KiB
Python
136 lines
4.0 KiB
Python
#!/usr/bin/env python
"""
University Scraper Generator

This script generates a Playwright-based web scraper for any university website.
It uses an AI agent to analyze the university's website structure and create
a customized scraper that collects master's program pages and faculty profiles.

Usage:
    python generate_scraper.py

Configuration:
    Set the following variables below:
    - TARGET_URL: The university homepage URL
    - CAMPUS_NAME: Short name for the university
    - LANGUAGE: Primary language of the website
    - MAX_DEPTH: How deep to crawl (default: 3)
    - MAX_PAGES: Maximum pages to visit during sampling (default: 30)
"""
|
|
import argparse
import os
import sys

# ============================================================================
# CONFIGURATION - Modify these values for your target university
# ============================================================================
# These module-level constants are only the *defaults*; each one can be
# overridden on the command line via the matching argparse option in main().
TARGET_URL = "https://www.harvard.edu/"  # University homepage URL (--url)
CAMPUS_NAME = "Harvard"                  # Short name for the university (--name)
LANGUAGE = "English"                     # Primary language of the website (--language)
MAX_DEPTH = 3                            # Maximum crawl depth (--max-depth)
MAX_PAGES = 30                           # Max pages visited during sampling (--max-pages)
# ============================================================================
|
|
|
|
|
|
def get_env_key(name: str) -> str | None:
|
|
"""Get environment variable, with Windows registry fallback."""
|
|
# Try standard environment variable first
|
|
value = os.environ.get(name)
|
|
if value:
|
|
return value
|
|
|
|
# Windows: try reading from user environment in registry
|
|
if sys.platform == "win32":
|
|
try:
|
|
import winreg
|
|
with winreg.OpenKey(winreg.HKEY_CURRENT_USER, r"Environment") as key:
|
|
return winreg.QueryValueEx(key, name)[0]
|
|
except Exception:
|
|
pass
|
|
|
|
return None
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser; the module-level constants supply defaults."""
    parser = argparse.ArgumentParser(
        description="Generate a Playwright scraper for a university website"
    )
    parser.add_argument(
        "--url",
        default=TARGET_URL,
        help="University homepage URL",
    )
    parser.add_argument(
        "--name",
        default=CAMPUS_NAME,
        help="Short name for the university",
    )
    parser.add_argument(
        "--language",
        default=LANGUAGE,
        help="Primary language of the website",
    )
    parser.add_argument(
        "--max-depth",
        type=int,
        default=MAX_DEPTH,
        help="Maximum crawl depth",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=MAX_PAGES,
        help="Maximum pages to visit during sampling",
    )
    parser.add_argument(
        "--no-snapshot",
        action="store_true",
        help="Skip browser snapshot capture",
    )
    return parser


def _configure_api_environment() -> None:
    """Resolve the OpenRouter API key and export provider settings.

    Exits the process with status 1 (after printing guidance) when no
    key can be found in the environment or Windows registry.
    """
    openrouter_key = get_env_key("OPENROUTER_API_KEY")
    if not openrouter_key:
        print("Error: OPENROUTER_API_KEY environment variable not set")
        print("Please set it with your OpenRouter API key")
        sys.exit(1)

    # The downstream engine reads the key via the OpenAI-compatible variable.
    os.environ["OPENAI_API_KEY"] = openrouter_key
    os.environ["CODEGEN_MODEL_PROVIDER"] = "openrouter"
    os.environ["CODEGEN_OPENROUTER_MODEL"] = "anthropic/claude-3-opus"


def main():
    """Parse CLI arguments, configure the model provider, and run generation.

    Prints progress to stdout, runs the generation engine against the
    requested university site, and reports where the generated scraper
    script was saved along with a command line to run it.
    """
    args = _build_parser().parse_args()
    _configure_api_environment()

    # Import after the environment is configured so the library picks up
    # the provider/model settings when it initializes.
    from university_agent import GenerationEngine, GenerationRequest, Settings

    settings = Settings()
    print(f"Provider: {settings.model_provider}")
    print(f"Model: {settings.openrouter_model}")

    engine = GenerationEngine(settings)
    request = GenerationRequest(
        target_url=args.url,
        campus_name=args.name,
        assumed_language=args.language,
        max_depth=args.max_depth,
        max_pages=args.max_pages,
    )

    print(f"\nGenerating scraper for: {args.name}")
    print(f"URL: {args.url}")
    print(f"Max depth: {args.max_depth}, Max pages: {args.max_pages}")
    print("-" * 50)

    result = engine.generate(request, capture_snapshot=not args.no_snapshot)

    print("-" * 50)
    print(f"Script saved to: {result.script_path}")
    print(f"Project slug: {result.plan.project_slug}")
    print("\nTo run the scraper:")
    print(" cd artifacts")
    print(f" uv run python {result.script_path.name} --max-pages 50 --no-verify")
|
|
|
# Script entry point: run the generator only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()