#!/usr/bin/env python
"""
Auto-generated by the Agno codegen agent.
Target university: Stanford (https://www.stanford.edu)
Requested caps: depth=2, pages=20
Plan description: Async Playwright scraper targeting Stanford University to collect (1) master's program landing pages and (2) faculty/supervisor profiles linked from those programs. Respects depth=2, pages=20 caps. Starts from the main site, navigates through Academics and school/department portals to locate graduate program listings and associated faculty directories.
Navigation strategy:
1. Seed from https://www.stanford.edu and follow the 'Academics' top-nav link to reach school/department listings.
2. On school pages, look for anchors containing master_program_keywords (href fragments like /programs/, /graduate/, /masters/) and queue them.
3. On each master's program page, extract metadata and scan for faculty_keywords in link text or href (e.g., /people/, /faculty/, /directory/).
4. Follow qualifying faculty links up to the depth cap and record each profile.
5. Use URL deduplication (set of visited URLs) and respect the 20-page cap by counting unique pages fetched.
6. Prioritize links via keyword score so high-relevance pages are visited first if cap is approached.
Verification checklist:
- Assert each JSONL row contains a non-empty 'url' and 'title' field.
- Confirm at least one record has entity_type='master_program' and one has entity_type='faculty'.
- Validate that no URL appears more than once in the output file.
- Check that total scraped pages does not exceed the requested cap of 20.
Playwright snapshot used to guide this plan:
1. Stanford University (https://www.stanford.edu)
Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Stanford Explore Stanford Main Content A Mission Defined by Possibility At Stanford,
Anchors: Skip to content -> https://www.stanford.edu/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/
2. Stanford University (https://www.stanford.edu/)
Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Stanford Explore Stanford Main Content A Mission Defined by Possibility At Stanford,
Anchors: Skip to content -> https://www.stanford.edu/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/
3. Gateway for Students Stanford University (https://www.stanford.edu/student-gateway/)
Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Main Content Gateway for Students Gateway for Students Resources, offices and services t
Anchors: Skip to content -> https://www.stanford.edu/student-gateway/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/
4. Gateway for Faculty & Staff Stanford University (https://www.stanford.edu/faculty-staff-gateway/)
Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Main Content Gateway for Faculty & Staff Gateway for Faculty & Staff Resources, offices,
Anchors: Skip to content -> https://www.stanford.edu/faculty-staff-gateway/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/
Snapshot truncated.
Generated at: 2025-12-09T04:03:37.802236+00:00
"""
from __future__ import annotations
import argparse
import asyncio
import json
from collections import deque
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Deque, Iterable, List, Set, Tuple
from urllib.parse import urljoin, urldefrag
from playwright.async_api import async_playwright, Page
PROGRAM_KEYWORDS = ['master', 'graduate program', 'MS degree', 'MA degree', 'graduate studies', 'postgraduate']
FACULTY_KEYWORDS = ['faculty', 'professor', 'advisor', 'researcher', 'people', 'directory']
EXCLUSION_KEYWORDS = ['admission', 'apply', 'tuition', 'financial aid', 'login', 'news', 'events', 'alumni', 'athletics', 'giving', 'donate', 'careers', 'jobs']
METADATA_FIELDS = ['url', 'title', 'entity_type', 'department', 'school', 'description', 'contact_email', 'scraped_at']
EXTRA_NOTES = ["Stanford's seven schools each host their own subdomains (e.g., engineering.stanford.edu, law.stanford.edu); the script should allow cross-subdomain crawling within *.stanford.edu while still counting against the page cap.", "Many program pages load additional content via JavaScript; use page.wait_for_load_state('networkidle') or explicit waits before extracting links.", 'Faculty directories may paginate or use infinite scroll; handle at least the first visible batch given the tight page cap.', 'Respect polite crawling: insert a 1-2 second delay between requests to avoid rate-limiting.']


def normalize_url(base: str, href: str) -> str:
    absolute = urljoin(base, href)
    cleaned, _ = urldefrag(absolute)
    return cleaned


def matches_any(text: str, keywords: Iterable[str]) -> bool:
    lowered = text.lower()
    return any(keyword.lower() in lowered for keyword in keywords)
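

# --- Illustrative helper (not part of the generated scraper) -------------------
# EXTRA_NOTES suggests allowing cross-subdomain crawling within *.stanford.edu,
# but crawl() below never filters links by host. A minimal sketch of such a
# check, assuming the allowed suffix is stanford.edu; the name is_within_scope
# is hypothetical.
def is_within_scope(url: str, allowed_suffix: str = "stanford.edu") -> bool:
    from urllib.parse import urlparse

    host = urlparse(url).hostname or ""
    return host == allowed_suffix or host.endswith("." + allowed_suffix)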


@dataclass
class ScrapedLink:
    url: str
    text: str
    source_url: str
    bucket: str  # either "program" or "faculty"


@dataclass
class ScrapeSettings:
    root_url: str
    max_depth: int
    max_pages: int
    headless: bool
    output: Path


async def extract_links(page: Page) -> List[Tuple[str, str]]:
    anchors: Iterable[dict] = await page.eval_on_selector_all(
        "a",
        """elements => elements
            .map(el => ({text: (el.textContent || '').trim(), href: el.href}))
            .filter(item => item.text && item.href)""",
    )
    return [(item["href"], item["text"]) for item in anchors]


async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink]:
    async with async_playwright() as p:
        browser_launcher = getattr(p, browser_name)
        browser = await browser_launcher.launch(headless=settings.headless)
        context = await browser.new_context()
        queue: Deque[Tuple[str, int]] = deque([(settings.root_url, 0)])
        visited: Set[str] = set()
        results: List[ScrapedLink] = []
        try:
            # Breadth-first crawl bounded by the depth cap and the unique-page cap.
            while queue and len(visited) < settings.max_pages:
                url, depth = queue.popleft()
                if url in visited or depth > settings.max_depth:
                    continue
                visited.add(url)
                page = await context.new_page()
                try:
                    await page.goto(url, wait_until="domcontentloaded", timeout=20000)
                except Exception:
                    await page.close()
                    continue
                links = await extract_links(page)
                for href, text in links:
                    normalized = normalize_url(url, href)
                    if matches_any(text, EXCLUSION_KEYWORDS) or matches_any(normalized, EXCLUSION_KEYWORDS):
                        continue
                    text_lower = text.lower()
                    normalized_lower = normalized.lower()
                    if matches_any(text_lower, PROGRAM_KEYWORDS) or matches_any(normalized_lower, PROGRAM_KEYWORDS):
                        results.append(
                            ScrapedLink(
                                url=normalized,
                                text=text,
                                source_url=url,
                                bucket="program",
                            )
                        )
                    if matches_any(text_lower, FACULTY_KEYWORDS) or matches_any(normalized_lower, FACULTY_KEYWORDS):
                        results.append(
                            ScrapedLink(
                                url=normalized,
                                text=text,
                                source_url=url,
                                bucket="faculty",
                            )
                        )
                    # Queue every non-excluded link for the next depth level.
                    if depth < settings.max_depth:
                        queue.append((normalized, depth + 1))
                await page.close()
        finally:
            await context.close()
            await browser.close()
    return results
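

# --- Illustrative sketch (not part of the generated scraper) -------------------
# The navigation strategy in the module docstring calls for prioritising links by
# keyword score so high-relevance pages are visited first, but crawl() above uses
# a plain FIFO deque. A minimal scoring sketch, assuming program matches should
# outrank faculty matches; the name score_link is hypothetical.
def score_link(text: str, url: str) -> int:
    score = 0
    if matches_any(text, PROGRAM_KEYWORDS) or matches_any(url, PROGRAM_KEYWORDS):
        score += 2
    if matches_any(text, FACULTY_KEYWORDS) or matches_any(url, FACULTY_KEYWORDS):
        score += 1
    return score


# Usage sketch: sort a page's outgoing (href, text) pairs before queueing so
# that, as the page cap approaches, higher-scoring URLs are dequeued first:
#     links.sort(key=lambda item: score_link(item[1], item[0]), reverse=True)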


def serialize(results: List[ScrapedLink], target: Path, root_url: str) -> None:
    payload = {
        "root_url": root_url,
        "program_links": [asdict(link) for link in results if link.bucket == "program"],
        "faculty_links": [asdict(link) for link in results if link.bucket == "faculty"],
        "notes": EXTRA_NOTES,
        "metadata_fields": METADATA_FIELDS,
    }
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
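

# --- Illustrative check (not part of the generated scraper) --------------------
# The verification checklist in the module docstring is phrased against a JSONL
# output with 'title' and 'entity_type' fields, which this script does not emit;
# serialize() above writes a single JSON document with program_links and
# faculty_links. A minimal sketch adapting those checks to that structure; the
# name verify_output is hypothetical.
def verify_output(target: Path, max_pages: int) -> None:
    payload = json.loads(target.read_text(encoding="utf-8"))
    links = payload["program_links"] + payload["faculty_links"]
    # Every record should carry a non-empty URL and anchor text.
    assert all(link["url"] and link["text"] for link in links)
    # At least one program link and one faculty link should have been found.
    assert payload["program_links"] and payload["faculty_links"]
    # No URL should appear more than once within a bucket.
    for bucket in ("program_links", "faculty_links"):
        urls = [link["url"] for link in payload[bucket]]
        assert len(urls) == len(set(urls))
    # The crawl should not have visited more unique source pages than allowed.
    assert len({link["source_url"] for link in links}) <= max_pages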


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Playwright scraper generated by the Agno agent for https://www.stanford.edu."
    )
    parser.add_argument(
        "--root-url",
        default="https://www.stanford.edu",
        help="Seed URL to start crawling from.",
    )
    parser.add_argument(
        "--max-depth",
        type=int,
        default=2,
        help="Maximum crawl depth.",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=20,
        help="Maximum number of pages to visit.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("stanford-masters-faculty_masters.json"),
        help="Where to save the JSON output.",
    )
    parser.add_argument(
        "--headless",
        action="store_true",
        default=True,
        help="Run browser in headless mode (default: True).",
    )
    parser.add_argument(
        "--no-headless",
        action="store_false",
        dest="headless",
        help="Run browser with a visible window.",
    )
    parser.add_argument(
        "--browser",
        choices=["chromium", "firefox", "webkit"],
        default="chromium",
        help="Browser engine to launch via Playwright.",
    )
    return parser.parse_args()


async def main_async() -> None:
    args = parse_args()
    settings = ScrapeSettings(
        root_url=args.root_url,
        max_depth=args.max_depth,
        max_pages=args.max_pages,
        headless=args.headless,
        output=args.output,
    )
    links = await crawl(settings, browser_name=args.browser)
    serialize(links, settings.output, settings.root_url)


def main() -> None:
    asyncio.run(main_async())


if __name__ == "__main__":
    main()