#!/usr/bin/env python """ Auto-generated by the Agno codegen agent. Target university: Stanford (https://www.stanford.edu) Requested caps: depth=2, pages=20 Plan description: Async Playwright scraper targeting Stanford University to collect (1) master's program landing pages and (2) faculty/supervisor profiles linked from those programs. Respects depth=2, pages=20 caps. Starts from the main site, navigates through Academics and school/department portals to locate graduate program listings and associated faculty directories. Navigation strategy: 1. Seed from https://www.stanford.edu and follow the 'Academics' top-nav link to reach school/department listings. 2. On school pages, look for anchors containing master_program_keywords (href fragments like /programs/, /graduate/, /masters/) and queue them. 3. On each master's program page, extract metadata and scan for faculty_keywords in link text or href (e.g., /people/, /faculty/, /directory/). 4. Follow qualifying faculty links up to the depth cap and record each profile. 5. Use URL deduplication (set of visited URLs) and respect the 20-page cap by counting unique pages fetched. 6. Prioritize links via keyword score so high-relevance pages are visited first if cap is approached. Verification checklist: - Assert each JSONL row contains a non-empty 'url' and 'title' field. - Confirm at least one record has entity_type='master_program' and one has entity_type='faculty'. - Validate that no URL appears more than once in the output file. - Check that total scraped pages does not exceed the requested cap of 20. Playwright snapshot used to guide this plan: 1. Stanford University (https://www.stanford.edu) Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Stanford Explore Stanford Main Content A Mission Defined by Possibility At Stanford, Anchors: Skip to content -> https://www.stanford.edu/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/ 2. Stanford University (https://www.stanford.edu/) Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Stanford Explore Stanford Main Content A Mission Defined by Possibility At Stanford, Anchors: Skip to content -> https://www.stanford.edu/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/ 3. Gateway for Students – Stanford University (https://www.stanford.edu/student-gateway/) Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Main Content Gateway for Students Gateway for Students Resources, offices and services t Anchors: Skip to content -> https://www.stanford.edu/student-gateway/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/ 4. Gateway for Faculty & Staff – Stanford University (https://www.stanford.edu/faculty-staff-gateway/) Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Main Content Gateway for Faculty & Staff Gateway for Faculty & Staff Resources, offices, Anchors: Skip to content -> https://www.stanford.edu/faculty-staff-gateway/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/ Snapshot truncated. Generated at: 2025-12-09T04:03:37.802236+00:00 """ from __future__ import annotations import argparse import asyncio import json from collections import deque from dataclasses import asdict, dataclass from pathlib import Path from typing import Deque, Iterable, List, Set, Tuple from urllib.parse import urljoin, urldefrag from playwright.async_api import async_playwright, Page PROGRAM_KEYWORDS = ['master', 'graduate program', 'MS degree', 'MA degree', 'graduate studies', 'postgraduate'] FACULTY_KEYWORDS = ['faculty', 'professor', 'advisor', 'researcher', 'people', 'directory'] EXCLUSION_KEYWORDS = ['admission', 'apply', 'tuition', 'financial aid', 'login', 'news', 'events', 'alumni', 'athletics', 'giving', 'donate', 'careers', 'jobs'] METADATA_FIELDS = ['url', 'title', 'entity_type', 'department', 'school', 'description', 'contact_email', 'scraped_at'] EXTRA_NOTES = ["Stanford's seven schools each host their own subdomains (e.g., engineering.stanford.edu, law.stanford.edu); the script should allow cross-subdomain crawling within *.stanford.edu while still counting against the page cap.", "Many program pages load additional content via JavaScript; use page.wait_for_load_state('networkidle') or explicit waits before extracting links.", 'Faculty directories may paginate or use infinite scroll; handle at least the first visible batch given the tight page cap.', 'Respect polite crawling: insert a 1-2 second delay between requests to avoid rate-limiting.'] def normalize_url(base: str, href: str) -> str: absolute = urljoin(base, href) cleaned, _ = urldefrag(absolute) return cleaned def matches_any(text: str, keywords: Iterable[str]) -> bool: lowered = text.lower() return any(keyword.lower() in lowered for keyword in keywords) @dataclass class ScrapedLink: url: str text: str source_url: str bucket: str # either "program" or "faculty" @dataclass class ScrapeSettings: root_url: str max_depth: int max_pages: int headless: bool output: Path async def extract_links(page: Page) -> List[Tuple[str, str]]: anchors: Iterable[dict] = await page.eval_on_selector_all( "a", """elements => elements .map(el => ({text: (el.textContent || '').trim(), href: el.href})) .filter(item => item.text && item.href)""", ) return [(item["href"], item["text"]) for item in anchors] async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink]: async with async_playwright() as p: browser_launcher = getattr(p, browser_name) browser = await browser_launcher.launch(headless=settings.headless) context = await browser.new_context() queue: Deque[Tuple[str, int]] = deque([(settings.root_url, 0)]) visited: Set[str] = set() results: List[ScrapedLink] = [] try: while queue and len(visited) < settings.max_pages: url, depth = queue.popleft() if url in visited or depth > settings.max_depth: continue visited.add(url) page = await context.new_page() try: await page.goto(url, wait_until="domcontentloaded", timeout=20000) except Exception: await page.close() continue links = await extract_links(page) for href, text in links: normalized = normalize_url(url, href) if matches_any(text, EXCLUSION_KEYWORDS) or matches_any(normalized, EXCLUSION_KEYWORDS): continue text_lower = text.lower() normalized_lower = normalized.lower() if matches_any(text_lower, PROGRAM_KEYWORDS) or matches_any(normalized_lower, PROGRAM_KEYWORDS): results.append( ScrapedLink( url=normalized, text=text, source_url=url, bucket="program", ) ) if matches_any(text_lower, FACULTY_KEYWORDS) or matches_any(normalized_lower, FACULTY_KEYWORDS): results.append( ScrapedLink( url=normalized, text=text, source_url=url, bucket="faculty", ) ) if depth < settings.max_depth: queue.append((normalized, depth + 1)) await page.close() finally: await context.close() await browser.close() return results def serialize(results: List[ScrapedLink], target: Path, root_url: str) -> None: payload = { "root_url": root_url, "program_links": [ {**asdict(link), "bucket": link.bucket} for link in results if link.bucket == "program" ], "faculty_links": [ {**asdict(link), "bucket": link.bucket} for link in results if link.bucket == "faculty" ], "notes": EXTRA_NOTES, "metadata_fields": METADATA_FIELDS, } target.parent.mkdir(parents=True, exist_ok=True) target.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Playwright scraper generated by the Agno agent for https://www.stanford.edu." ) parser.add_argument( "--root-url", default="https://www.stanford.edu", help="Seed url to start crawling from.", ) parser.add_argument( "--max-depth", type=int, default=2, help="Maximum crawl depth.", ) parser.add_argument( "--max-pages", type=int, default=20, help="Maximum number of pages to visit.", ) parser.add_argument( "--output", type=Path, default=Path("stanford-masters-faculty_masters.json"), help="Where to save the JSON output.", ) parser.add_argument( "--headless", action="store_true", default=True, help="Run browser in headless mode (default: True).", ) parser.add_argument( "--no-headless", action="store_false", dest="headless", help="Run browser with visible window.", ) parser.add_argument( "--browser", choices=["chromium", "firefox", "webkit"], default="chromium", help="Browser engine to launch via Playwright.", ) return parser.parse_args() async def main_async() -> None: args = parse_args() settings = ScrapeSettings( root_url=args.root_url, max_depth=args.max_depth, max_pages=args.max_pages, headless=args.headless, output=args.output, ) links = await crawl(settings, browser_name=args.browser) serialize(links, settings.output, settings.root_url) def main() -> None: asyncio.run(main_async()) if __name__ == "__main__": main()