- Add OpenRouter as third LLM provider option in config.py
- Implement _extract_json() to handle markdown-wrapped JSON responses
- Add default values for missing required fields in ScriptPlan
- Handle navigation_strategy as list or string
- Add .env.example with configuration templates
- Add test script and sample generated scrapers for RWTH and KAUST

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
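config.py and the ScriptPlan model are not part of the file below, so the helpers here are only a minimal sketch of the two parsing fixes listed in the commit message, assuming the LLM response arrives as a plain string and that navigation_strategy should end up as a list of steps; the names and signatures are illustrative, not the actual implementation.

import json
import re
from typing import Any, List, Union


def _extract_json(text: str) -> Any:
    """Parse JSON that may be wrapped in a markdown code fence (illustrative sketch)."""
    # Prefer the contents of a ```json ... ``` (or plain ```) fence if one is present.
    match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL)
    payload = match.group(1) if match else text
    return json.loads(payload)


def _normalize_navigation_strategy(value: Union[str, List[str]]) -> List[str]:
    """Accept navigation_strategy as either a single string or a list of steps."""
    if isinstance(value, str):
        return [step.strip() for step in value.splitlines() if step.strip()]
    return [str(step) for step in value]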
#!/usr/bin/env python
"""
Auto-generated by the Agno codegen agent.

Target university: KAUST (https://www.kaust.edu.sa/en/)
Requested caps: depth=3, pages=30

Plan description: Playwright scraper for university master programs and faculty profiles.

Navigation strategy:
- Start at https://www.kaust.edu.sa/en/
- Navigate to /study/ to find degree program links
- Follow links to individual degree pages under /degree-programs/
- Separately, look for links to /faculty/ or /people/ directories
- Crawl faculty directories to extract links to individual bio pages
- Individual faculty are often under a subdomain like bio.kaust.edu.sa

Verification checklist:
- Verify master's programs are under /study/ or /degree-programs/
- Check that faculty directory pages contain links to individual bios
- Confirm individual faculty pages have research/expertise details
- Ensure exclusion keywords successfully skip irrelevant pages

Playwright snapshot used to guide this plan:
No browser snapshot was captured.

Generated at: 2025-12-10T02:48:42.571899+00:00
"""
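# Example invocation (a sketch, assuming Playwright and the Firefox browser binary
# are installed, e.g. `pip install playwright` followed by `playwright install firefox`,
# and that this file is saved as kaust_scraper.py -- adjust to the actual filename):
#
#   python kaust_scraper.py --max-depth 2 --max-pages 20 --no-verify \
#       --output kaust_results.json
#
# All flags are optional; the defaults match the requested caps above
# (depth=3, pages=30, firefox, headless, link verification enabled).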
from __future__ import annotations

import argparse
import asyncio
import json
from collections import deque
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Deque, Iterable, List, Set, Tuple
from urllib.parse import urljoin, urldefrag, urlparse

from playwright.async_api import async_playwright, Page, Response

# Keyword lists produced by the codegen plan: link text or URLs matching these
# are bucketed as program pages, bucketed as faculty pages, or skipped entirely.
PROGRAM_KEYWORDS = ['/study/', '/degree-programs/', '/academics/', 'M.Sc.', 'Master of Science', 'graduate program']
FACULTY_KEYWORDS = ['/people/', '/profiles/faculty/', 'Professor', 'faculty-member', '/faculty/', 'bio.kaust.edu.sa']
EXCLUSION_KEYWORDS = ['/admissions/', '/apply/', '/tuition/', '/events/', '/news/', '/careers/', '/jobs/', '/login/', '/alumni/', '/giving/', 'inquiry.kaust.edu.sa']
METADATA_FIELDS = ['url', 'title', 'entity_type', 'department', 'email', 'scraped_at']
EXTRA_NOTES = ['Many faculty are listed under a separate subdomain bio.kaust.edu.sa', 'Prioritize crawling the centralized faculty directory first', 'Alumni and affiliated faculty may not have full profile pages']

# URL patterns that indicate individual profile pages
PROFILE_URL_PATTERNS = [
    "/people/", "/person/", "/profile/", "/profiles/",
    "/faculty/", "/staff/", "/directory/",
    "/~",  # Unix-style personal pages
    "/bio/", "/about/",
]

# URL patterns that indicate listing/directory pages (should be crawled deeper)
DIRECTORY_URL_PATTERNS = [
    "/faculty", "/people", "/directory", "/staff",
    "/team", "/members", "/researchers",
]

def normalize_url(base: str, href: str) -> str:
    """Normalize URL by resolving relative paths and removing fragments."""
    absolute = urljoin(base, href)
    cleaned, _ = urldefrag(absolute)
    # Remove trailing slash for consistency
    return cleaned.rstrip("/")

def matches_any(text: str, keywords: Iterable[str]) -> bool:
    """Check if text contains any of the keywords (case-insensitive)."""
    lowered = text.lower()
    return any(keyword.lower() in lowered for keyword in keywords)

def is_same_domain(url1: str, url2: str) -> bool:
    """Check if two URLs belong to the same root domain."""
    domain1 = urlparse(url1).netloc.replace("www.", "")
    domain2 = urlparse(url2).netloc.replace("www.", "")
    # Allow subdomains of the same root domain by comparing the last two labels.
    # Note: for two-label public suffixes such as edu.sa this is permissive --
    # any *.edu.sa host is treated as the same domain.
    parts1 = domain1.split(".")
    parts2 = domain2.split(".")
    if len(parts1) >= 2 and len(parts2) >= 2:
        return parts1[-2:] == parts2[-2:]
    return domain1 == domain2

def is_profile_url(url: str) -> bool:
    """Check if URL pattern suggests an individual profile page."""
    url_lower = url.lower()
    return any(pattern in url_lower for pattern in PROFILE_URL_PATTERNS)


def is_directory_url(url: str) -> bool:
    """Check if URL pattern suggests a directory/listing page."""
    url_lower = url.lower()
    return any(pattern in url_lower for pattern in DIRECTORY_URL_PATTERNS)

@dataclass
class ScrapedLink:
    url: str
    title: str
    text: str
    source_url: str
    bucket: str  # "program" or "faculty"
    is_verified: bool = False
    http_status: int = 0
    is_profile_page: bool = False
    scraped_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())


@dataclass
class ScrapeSettings:
    root_url: str
    max_depth: int
    max_pages: int
    headless: bool
    output: Path
    verify_links: bool = True
    request_delay: float = 1.0  # Polite crawling delay
    timeout: int = 60000  # Navigation timeout in ms (default 60s for slow sites)

async def extract_links(page: Page) -> List[Tuple[str, str]]:
    """Extract all anchor links from the page."""
    anchors: Iterable[dict] = await page.eval_on_selector_all(
        "a",
        """elements => elements
            .map(el => ({text: (el.textContent || '').trim(), href: el.href}))
            .filter(item => item.text && item.href && item.href.startsWith('http'))""",
    )
    return [(item["href"], item["text"]) for item in anchors]


async def get_page_title(page: Page) -> str:
    """Get the page title safely."""
    try:
        return await page.title() or ""
    except Exception:
        return ""

async def verify_link(context, url: str, timeout: int = 10000) -> Tuple[bool, int, str]:
    """
    Verify a link by loading it in a throwaway page.

    Returns: (is_valid, status_code, page_title)
    """
    page = await context.new_page()
    try:
        response: Response = await page.goto(url, wait_until="domcontentloaded", timeout=timeout)
        if response:
            status = response.status
            title = await get_page_title(page)
            is_valid = 200 <= status < 400
            return is_valid, status, title
        return False, 0, ""
    except Exception:
        return False, 0, ""
    finally:
        await page.close()

async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink]:
    """
    Crawl the website using BFS, collecting program and faculty links.

    Features:
    - URL deduplication
    - Link verification
    - Profile page detection
    - Polite crawling with delays
    """
    async with async_playwright() as p:
        browser_launcher = getattr(p, browser_name)
        browser = await browser_launcher.launch(headless=settings.headless)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )

        # Priority queue: (priority, url, depth) - lower priority = processed first
        # Directory pages get priority 0, others get priority 1
        queue: Deque[Tuple[int, str, int]] = deque([(0, settings.root_url, 0)])
        visited: Set[str] = set()
        found_urls: Set[str] = set()  # For deduplication of results
        results: List[ScrapedLink] = []

        print(f"Starting crawl from: {settings.root_url}")
        print(f"Max depth: {settings.max_depth}, Max pages: {settings.max_pages}")

        try:
            while queue and len(visited) < settings.max_pages:
                # Sort queue by priority (directory pages first)
                queue = deque(sorted(queue, key=lambda x: x[0]))
                priority, url, depth = queue.popleft()

                normalized_url = normalize_url(settings.root_url, url)
                if normalized_url in visited or depth > settings.max_depth:
                    continue

                # Only crawl same-domain URLs
                if not is_same_domain(settings.root_url, normalized_url):
                    continue

                visited.add(normalized_url)
                print(f"[{len(visited)}/{settings.max_pages}] Depth {depth}: {normalized_url[:80]}...")

                page = await context.new_page()
                try:
                    response = await page.goto(
                        normalized_url, wait_until="load", timeout=settings.timeout
                    )
                    if not response or response.status >= 400:
                        await page.close()
                        continue
                except Exception as e:
                    print(f"  Error: {e}")
                    await page.close()
                    continue

                page_title = await get_page_title(page)
                links = await extract_links(page)

                for href, text in links:
                    normalized_href = normalize_url(normalized_url, href)

                    # Skip if already found or is excluded
                    if normalized_href in found_urls:
                        continue
                    if matches_any(text, EXCLUSION_KEYWORDS) or matches_any(normalized_href, EXCLUSION_KEYWORDS):
                        continue

                    text_lower = text.lower()
                    href_lower = normalized_href.lower()
                    is_profile = is_profile_url(normalized_href)

                    # Check for program links
                    if matches_any(text_lower, PROGRAM_KEYWORDS) or matches_any(href_lower, PROGRAM_KEYWORDS):
                        found_urls.add(normalized_href)
                        results.append(
                            ScrapedLink(
                                url=normalized_href,
                                title="",
                                text=text[:200],
                                source_url=normalized_url,
                                bucket="program",
                                is_profile_page=False,
                            )
                        )

                    # Check for faculty links
                    if matches_any(text_lower, FACULTY_KEYWORDS) or matches_any(href_lower, FACULTY_KEYWORDS):
                        found_urls.add(normalized_href)
                        results.append(
                            ScrapedLink(
                                url=normalized_href,
                                title="",
                                text=text[:200],
                                source_url=normalized_url,
                                bucket="faculty",
                                is_profile_page=is_profile,
                            )
                        )

                    # Queue for further crawling
                    if depth < settings.max_depth and is_same_domain(settings.root_url, normalized_href):
                        # Prioritize directory pages
                        link_priority = 0 if is_directory_url(normalized_href) else 1
                        queue.append((link_priority, normalized_href, depth + 1))

                await page.close()

                # Polite delay between requests
                await asyncio.sleep(settings.request_delay)

        finally:
            await context.close()
            await browser.close()

        # Verify links if enabled (still inside the Playwright context, so the
        # launcher can open a fresh browser for the verification pass)
        if settings.verify_links and results:
            print(f"\nVerifying {len(results)} links...")
            browser = await browser_launcher.launch(headless=True)
            context = await browser.new_context()

            verified_results = []
            for i, link in enumerate(results):
                if link.url in [r.url for r in verified_results]:
                    continue  # Skip duplicates

                print(f"  [{i+1}/{len(results)}] Verifying: {link.url[:60]}...")
                is_valid, status, title = await verify_link(context, link.url)
                link.is_verified = True
                link.http_status = status
                link.title = title or link.text

                if is_valid:
                    verified_results.append(link)
                else:
                    print(f"    Invalid (HTTP {status})")

                await asyncio.sleep(0.5)  # Delay between verifications

            await context.close()
            await browser.close()
            results = verified_results

    return results

def deduplicate_results(results: List[ScrapedLink]) -> List[ScrapedLink]:
    """Remove duplicate URLs, keeping the first occurrence."""
    seen: Set[str] = set()
    unique = []
    for link in results:
        if link.url not in seen:
            seen.add(link.url)
            unique.append(link)
    return unique

def serialize(results: List[ScrapedLink], target: Path, root_url: str) -> None:
    """Save results to JSON file with statistics."""
    results = deduplicate_results(results)

    program_links = [link for link in results if link.bucket == "program"]
    faculty_links = [link for link in results if link.bucket == "faculty"]
    profile_pages = [link for link in faculty_links if link.is_profile_page]

    payload = {
        "root_url": root_url,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "statistics": {
            "total_links": len(results),
            "program_links": len(program_links),
            "faculty_links": len(faculty_links),
            "profile_pages": len(profile_pages),
            "verified_links": len([r for r in results if r.is_verified and r.http_status == 200]),
        },
        "program_links": [asdict(link) for link in program_links],
        "faculty_links": [asdict(link) for link in faculty_links],
        "notes": EXTRA_NOTES,
        "metadata_fields": METADATA_FIELDS,
    }
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")

    print(f"\nResults saved to: {target}")
    print(f"  Total links: {len(results)}")
    print(f"  Program links: {len(program_links)}")
    print(f"  Faculty links: {len(faculty_links)}")
    print(f"  Profile pages: {len(profile_pages)}")

def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Playwright scraper generated by the Agno agent for https://www.kaust.edu.sa/en/."
    )
    parser.add_argument(
        "--root-url",
        default="https://www.kaust.edu.sa/en/",
        help="Seed URL to start crawling from.",
    )
    parser.add_argument(
        "--max-depth",
        type=int,
        default=3,
        help="Maximum crawl depth.",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=30,
        help="Maximum number of pages to visit.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("university-scraper_results.json"),
        help="Where to save the JSON output.",
    )
    parser.add_argument(
        "--headless",
        action="store_true",
        default=True,
        help="Run browser in headless mode (default: True).",
    )
    parser.add_argument(
        "--no-headless",
        action="store_false",
        dest="headless",
        help="Run browser with a visible window.",
    )
    parser.add_argument(
        "--browser",
        choices=["chromium", "firefox", "webkit"],
        default="firefox",
        help="Browser engine to launch via Playwright (firefox recommended for KAUST).",
    )
    parser.add_argument(
        "--no-verify",
        action="store_true",
        default=False,
        help="Skip the link verification step.",
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=1.0,
        help="Delay between requests in seconds (polite crawling).",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=60000,
        help="Navigation timeout in milliseconds (default: 60000 = 60s).",
    )
    return parser.parse_args()

async def main_async() -> None:
    args = parse_args()
    settings = ScrapeSettings(
        root_url=args.root_url,
        max_depth=args.max_depth,
        max_pages=args.max_pages,
        headless=args.headless,
        output=args.output,
        verify_links=not args.no_verify,
        request_delay=args.delay,
        timeout=args.timeout,
    )
    links = await crawl(settings, browser_name=args.browser)
    serialize(links, settings.output, settings.root_url)


def main() -> None:
    asyncio.run(main_async())


if __name__ == "__main__":
    main()
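# Shape of the JSON document written by serialize(), with illustrative values:
#
# {
#   "root_url": "https://www.kaust.edu.sa/en/",
#   "generated_at": "<UTC ISO timestamp>",
#   "statistics": {
#     "total_links": 12,
#     "program_links": 5,
#     "faculty_links": 7,
#     "profile_pages": 3,
#     "verified_links": 11
#   },
#   "program_links": [ ...asdict(ScrapedLink)... ],
#   "faculty_links": [ ...asdict(ScrapedLink)... ],
#   "notes": EXTRA_NOTES,
#   "metadata_fields": METADATA_FIELDS
# }
#
# Each ScrapedLink entry carries: url, title, text, source_url, bucket,
# is_verified, http_status, is_profile_page, scraped_at.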