Rename test script and update documentation
- Rename test_rwth.py to generate_scraper.py with CLI arguments - Update README.md with comprehensive usage guide - Add Harvard scraper as example output - Document troubleshooting tips for common issues 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
135
generate_scraper.py
Normal file
135
generate_scraper.py
Normal file
@ -0,0 +1,135 @@
|
||||
#!/usr/bin/env python
"""
University Scraper Generator

This script generates a Playwright-based web scraper for any university website.
It uses an AI agent to analyze the university's website structure and create
a customized scraper that collects master's program pages and faculty profiles.

Usage:
    python generate_scraper.py

    Each configuration constant below can also be overridden per-run via a
    CLI flag (see ``main``): --url, --name, --language, --max-depth,
    --max-pages, --no-snapshot.

Configuration:
    Set the following variables below (used as CLI defaults):
    - TARGET_URL: The university homepage URL
    - CAMPUS_NAME: Short name for the university
    - LANGUAGE: Primary language of the website
    - MAX_DEPTH: How deep to crawl (default: 3)
    - MAX_PAGES: Maximum pages to visit during sampling (default: 30)
"""
import argparse
import os
import sys

# ============================================================================
# CONFIGURATION - Modify these values for your target university
# (these are only defaults; every value can be overridden on the command line)
# ============================================================================
TARGET_URL = "https://www.harvard.edu/"
CAMPUS_NAME = "Harvard"
LANGUAGE = "English"
MAX_DEPTH = 3
MAX_PAGES = 30
# ============================================================================
def get_env_key(name: str) -> str | None:
|
||||
"""Get environment variable, with Windows registry fallback."""
|
||||
# Try standard environment variable first
|
||||
value = os.environ.get(name)
|
||||
if value:
|
||||
return value
|
||||
|
||||
# Windows: try reading from user environment in registry
|
||||
if sys.platform == "win32":
|
||||
try:
|
||||
import winreg
|
||||
with winreg.OpenKey(winreg.HKEY_CURRENT_USER, r"Environment") as key:
|
||||
return winreg.QueryValueEx(key, name)[0]
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main():
    """Parse CLI arguments, configure the model environment, and generate a scraper.

    Exits with status 1 if ``OPENROUTER_API_KEY`` is not available. The
    ``university_agent`` import is deliberately deferred until after the
    environment variables are set so that ``Settings`` picks them up.
    """
    parser = argparse.ArgumentParser(
        description="Generate a Playwright scraper for a university website"
    )
    parser.add_argument(
        "--url",
        default=TARGET_URL,
        help="University homepage URL",
    )
    parser.add_argument(
        "--name",
        default=CAMPUS_NAME,
        help="Short name for the university",
    )
    parser.add_argument(
        "--language",
        default=LANGUAGE,
        help="Primary language of the website",
    )
    parser.add_argument(
        "--max-depth",
        type=int,
        default=MAX_DEPTH,
        help="Maximum crawl depth",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=MAX_PAGES,
        help="Maximum pages to visit during sampling",
    )
    parser.add_argument(
        "--model",
        # Respect a pre-set CODEGEN_OPENROUTER_MODEL instead of silently
        # clobbering it; the original hard-coded value remains the default.
        default=os.environ.get("CODEGEN_OPENROUTER_MODEL", "anthropic/claude-3-opus"),
        help="OpenRouter model identifier",
    )
    parser.add_argument(
        "--no-snapshot",
        action="store_true",
        help="Skip browser snapshot capture",
    )
    args = parser.parse_args()

    # Configure OpenRouter API
    openrouter_key = get_env_key("OPENROUTER_API_KEY")
    if not openrouter_key:
        print("Error: OPENROUTER_API_KEY environment variable not set")
        print("Please set it with your OpenRouter API key")
        sys.exit(1)

    os.environ["OPENAI_API_KEY"] = openrouter_key
    os.environ["CODEGEN_MODEL_PROVIDER"] = "openrouter"
    os.environ["CODEGEN_OPENROUTER_MODEL"] = args.model

    # Import after environment is configured so Settings reads the values above.
    from university_agent import GenerationEngine, GenerationRequest, Settings

    settings = Settings()
    print(f"Provider: {settings.model_provider}")
    print(f"Model: {settings.openrouter_model}")

    engine = GenerationEngine(settings)
    request = GenerationRequest(
        target_url=args.url,
        campus_name=args.name,
        assumed_language=args.language,
        max_depth=args.max_depth,
        max_pages=args.max_pages,
    )

    print(f"\nGenerating scraper for: {args.name}")
    print(f"URL: {args.url}")
    print(f"Max depth: {args.max_depth}, Max pages: {args.max_pages}")
    print("-" * 50)

    result = engine.generate(request, capture_snapshot=not args.no_snapshot)

    print("-" * 50)
    print(f"Script saved to: {result.script_path}")
    print(f"Project slug: {result.plan.project_slug}")
    # Plain strings here: the originals carried an `f` prefix with no
    # placeholders, which is a no-op and a lint warning (F541).
    print("\nTo run the scraper:")
    print("  cd artifacts")
    print(f"  uv run python {result.script_path.name} --max-pages 50 --no-verify")


if __name__ == "__main__":
    main()
Reference in New Issue
Block a user