Initial commit: University Playwright Codegen Agent
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
artifacts/stanford_playwright_scraper.py | 234 lines (new file)
@@ -0,0 +1,234 @@
#!/usr/bin/env python
"""
Auto-generated by the Agno codegen agent.
Target university: Stanford (https://www.stanford.edu)
Requested caps: depth=2, pages=20

Plan description: Async Playwright scraper targeting Stanford University to collect
(1) master's program landing pages and (2) faculty/supervisor profiles linked from
those programs. Respects the depth=2 and pages=20 caps. Starts from the main site
and navigates through Academics and school/department portals to locate graduate
program listings and associated faculty directories.

Navigation strategy:
1. Seed from https://www.stanford.edu and follow the 'Academics' top-nav link to
   reach school/department listings.
2. On school pages, look for anchors containing master_program_keywords (href
   fragments like /programs/, /graduate/, /masters/) and queue them.
3. On each master's program page, extract metadata and scan for faculty_keywords
   in link text or href (e.g., /people/, /faculty/, /directory/).
4. Follow qualifying faculty links up to the depth cap and record each profile.
5. Use URL deduplication (a set of visited URLs) and respect the 20-page cap by
   counting unique pages fetched.
6. Prioritize links via keyword score so high-relevance pages are visited first
   if the cap is approached.

Verification checklist:
- Assert each JSONL row contains non-empty 'url' and 'title' fields.
- Confirm at least one record has entity_type='master_program' and one has
  entity_type='faculty'.
- Validate that no URL appears more than once in the output file.
- Check that the total number of scraped pages does not exceed the requested cap of 20.

Playwright snapshot used to guide this plan:
1. Stanford University (https://www.stanford.edu)
   Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Stanford Explore Stanford Main Content A Mission Defined by Possibility At Stanford,
   Anchors: Skip to content -> https://www.stanford.edu/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/
2. Stanford University (https://www.stanford.edu/)
   Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Stanford Explore Stanford Main Content A Mission Defined by Possibility At Stanford,
   Anchors: Skip to content -> https://www.stanford.edu/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/
3. Gateway for Students – Stanford University (https://www.stanford.edu/student-gateway/)
   Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Main Content Gateway for Students Gateway for Students Resources, offices and services t
   Anchors: Skip to content -> https://www.stanford.edu/student-gateway/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/
4. Gateway for Faculty & Staff – Stanford University (https://www.stanford.edu/faculty-staff-gateway/)
   Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Main Content Gateway for Faculty & Staff Gateway for Faculty & Staff Resources, offices,
   Anchors: Skip to content -> https://www.stanford.edu/faculty-staff-gateway/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/
Snapshot truncated.

Generated at: 2025-12-09T04:03:37.802236+00:00
"""

from __future__ import annotations

import argparse
import asyncio
import json
from collections import deque
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Deque, Iterable, List, Set, Tuple
from urllib.parse import urljoin, urldefrag

from playwright.async_api import async_playwright, Page

PROGRAM_KEYWORDS = ['master', 'graduate program', 'MS degree', 'MA degree', 'graduate studies', 'postgraduate']
FACULTY_KEYWORDS = ['faculty', 'professor', 'advisor', 'researcher', 'people', 'directory']
EXCLUSION_KEYWORDS = ['admission', 'apply', 'tuition', 'financial aid', 'login', 'news', 'events', 'alumni', 'athletics', 'giving', 'donate', 'careers', 'jobs']
METADATA_FIELDS = ['url', 'title', 'entity_type', 'department', 'school', 'description', 'contact_email', 'scraped_at']
EXTRA_NOTES = [
    "Stanford's seven schools each host their own subdomains (e.g., engineering.stanford.edu, law.stanford.edu); the script should allow cross-subdomain crawling within *.stanford.edu while still counting against the page cap.",
    "Many program pages load additional content via JavaScript; use page.wait_for_load_state('networkidle') or explicit waits before extracting links.",
    'Faculty directories may paginate or use infinite scroll; handle at least the first visible batch given the tight page cap.',
    'Respect polite crawling: insert a 1-2 second delay between requests to avoid rate-limiting.',
]


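# Resolve a possibly relative href against the page it was found on and drop any
# '#fragment' so deduplication treats in-page anchors as the same URL.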
def normalize_url(base: str, href: str) -> str:
    absolute = urljoin(base, href)
    cleaned, _ = urldefrag(absolute)
    return cleaned


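# EXTRA_NOTES asks for crawling to stay within *.stanford.edu, but crawl() below
# queues links without any host check. A minimal sketch of such a filter follows;
# it is a hypothetical helper and is not called anywhere in this file.
def is_stanford_url(url: str) -> bool:
    from urllib.parse import urlparse  # local import; only this sketch needs it

    host = urlparse(url).hostname or ""
    return host == "stanford.edu" or host.endswith(".stanford.edu")

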
def matches_any(text: str, keywords: Iterable[str]) -> bool:
    lowered = text.lower()
    return any(keyword.lower() in lowered for keyword in keywords)


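# Step 6 of the navigation strategy in the module docstring mentions prioritizing
# links by keyword score, but crawl() below uses a plain FIFO queue. A minimal
# sketch of such a score, assuming simple substring counting, is given here as a
# hypothetical helper; it is not called anywhere in this file.
def keyword_score(text: str, url: str, keywords: Iterable[str]) -> int:
    haystack = f"{text} {url}".lower()
    return sum(haystack.count(keyword.lower()) for keyword in keywords)

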
@dataclass
class ScrapedLink:
    url: str
    text: str
    source_url: str
    bucket: str  # either "program" or "faculty"


@dataclass
class ScrapeSettings:
    root_url: str
    max_depth: int
    max_pages: int
    headless: bool
    output: Path


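# Collect (href, text) pairs for every anchor on the page in a single
# eval_on_selector_all round trip, skipping anchors with an empty href or text.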
async def extract_links(page: Page) -> List[Tuple[str, str]]:
    anchors: Iterable[dict] = await page.eval_on_selector_all(
        "a",
        """elements => elements
            .map(el => ({text: (el.textContent || '').trim(), href: el.href}))
            .filter(item => item.text && item.href)""",
    )
    return [(item["href"], item["text"]) for item in anchors]


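# Breadth-first crawl: pop (url, depth) pairs from a FIFO queue, skip URLs already
# visited or beyond max_depth, and stop once max_pages unique URLs have been
# attempted (failed page loads still count against the cap). Keyword-matching
# links are recorded as ScrapedLink rows; all non-excluded links are re-queued
# one level deeper.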
async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink]:
    async with async_playwright() as p:
        browser_launcher = getattr(p, browser_name)
        browser = await browser_launcher.launch(headless=settings.headless)
        context = await browser.new_context()
        queue: Deque[Tuple[str, int]] = deque([(settings.root_url, 0)])
        visited: Set[str] = set()
        results: List[ScrapedLink] = []

        try:
            while queue and len(visited) < settings.max_pages:
                url, depth = queue.popleft()
                if url in visited or depth > settings.max_depth:
                    continue
                visited.add(url)

                page = await context.new_page()
                try:
                    await page.goto(url, wait_until="domcontentloaded", timeout=20000)
                except Exception:
                    await page.close()
                    continue

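                # EXTRA_NOTES suggests waiting for network idle and pausing 1-2
                # seconds between requests; neither is implemented in this loop.
                # A minimal, optional sketch of both (assuming the extra delay is
                # acceptable under the 20-page cap) would be:
                #     await page.wait_for_load_state("networkidle")
                #     await asyncio.sleep(1.5)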
                links = await extract_links(page)
                for href, text in links:
                    normalized = normalize_url(url, href)

                    if matches_any(text, EXCLUSION_KEYWORDS) or matches_any(normalized, EXCLUSION_KEYWORDS):
                        continue

                    text_lower = text.lower()
                    normalized_lower = normalized.lower()

                    if matches_any(text_lower, PROGRAM_KEYWORDS) or matches_any(normalized_lower, PROGRAM_KEYWORDS):
                        results.append(
                            ScrapedLink(
                                url=normalized,
                                text=text,
                                source_url=url,
                                bucket="program",
                            )
                        )
                    if matches_any(text_lower, FACULTY_KEYWORDS) or matches_any(normalized_lower, FACULTY_KEYWORDS):
                        results.append(
                            ScrapedLink(
                                url=normalized,
                                text=text,
                                source_url=url,
                                bucket="faculty",
                            )
                        )

                    if depth < settings.max_depth:
                        queue.append((normalized, depth + 1))
                await page.close()
        finally:
            await context.close()
            await browser.close()

    return results


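# Write a single JSON document (not JSONL) containing the root URL, the collected
# program and faculty links, the planner's notes, and the metadata fields intended
# for downstream enrichment.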
def serialize(results: List[ScrapedLink], target: Path, root_url: str) -> None:
    payload = {
        "root_url": root_url,
        "program_links": [
            {**asdict(link), "bucket": link.bucket} for link in results if link.bucket == "program"
        ],
        "faculty_links": [
            {**asdict(link), "bucket": link.bucket} for link in results if link.bucket == "faculty"
        ],
        "notes": EXTRA_NOTES,
        "metadata_fields": METADATA_FIELDS,
    }
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")


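# Sketch of the docstring's verification checklist, adapted to the JSON payload
# that serialize() actually writes (the checklist describes JSONL rows with an
# 'entity_type' field, which this script does not emit). Hypothetical helper,
# not called anywhere in this file.
def verify_output(target: Path, max_pages: int) -> None:
    payload = json.loads(target.read_text(encoding="utf-8"))
    links = payload["program_links"] + payload["faculty_links"]
    assert all(link["url"] and link["text"] for link in links), "empty url/text"
    assert payload["program_links"], "no master's program links recorded"
    assert payload["faculty_links"], "no faculty links recorded"
    # The page cap applies to pages visited, not links recorded, so only the
    # distinct source pages are checked here.
    assert len({link["source_url"] for link in links}) <= max_pages, "page cap exceeded"

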
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Playwright scraper generated by the Agno agent for https://www.stanford.edu."
    )
    parser.add_argument(
        "--root-url",
        default="https://www.stanford.edu",
        help="Seed URL to start crawling from.",
    )
    parser.add_argument(
        "--max-depth",
        type=int,
        default=2,
        help="Maximum crawl depth.",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=20,
        help="Maximum number of pages to visit.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("stanford-masters-faculty_masters.json"),
        help="Where to save the JSON output.",
    )
    parser.add_argument(
        "--headless",
        action="store_true",
        default=True,
        help="Run browser in headless mode (default: True).",
    )
    parser.add_argument(
        "--no-headless",
        action="store_false",
        dest="headless",
        help="Run browser with a visible window.",
    )
    parser.add_argument(
        "--browser",
        choices=["chromium", "firefox", "webkit"],
        default="chromium",
        help="Browser engine to launch via Playwright.",
    )
    return parser.parse_args()


async def main_async() -> None:
    args = parse_args()
    settings = ScrapeSettings(
        root_url=args.root_url,
        max_depth=args.max_depth,
        max_pages=args.max_pages,
        headless=args.headless,
        output=args.output,
    )
    links = await crawl(settings, browser_name=args.browser)
    serialize(links, settings.output, settings.root_url)


def main() -> None:
    asyncio.run(main_async())


if __name__ == "__main__":
    main()
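
# Example invocation (assuming Playwright and a Chromium build are installed,
# e.g. `pip install playwright` followed by `playwright install chromium`):
#
#   python artifacts/stanford_playwright_scraper.py --max-depth 2 --max-pages 20 \
#       --output stanford-masters-faculty_masters.json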