#!/usr/bin/env python
"""
Auto-generated by the Agno codegen agent.
Target university: Stanford (https://www.stanford.edu)
Requested caps: depth=2, pages=20
Plan description: Async Playwright scraper targeting Stanford University to collect (1) master's program landing pages and (2) faculty/supervisor profiles linked from those programs. Respects depth=2, pages=20 caps. Starts from the main site, navigates through Academics and school/department portals to locate graduate program listings and associated faculty directories.
Navigation strategy:
1. Seed from https://www.stanford.edu and follow the 'Academics' top-nav link to reach school/department listings.
2. On school pages, look for anchors containing master_program_keywords (href fragments like /programs/, /graduate/, /masters/) and queue them.
3. On each master's program page, extract metadata and scan for faculty_keywords in link text or href (e.g., /people/, /faculty/, /directory/).
4. Follow qualifying faculty links up to the depth cap and record each profile.
5. Use URL deduplication (set of visited URLs) and respect the 20-page cap by counting unique pages fetched.
6. Prioritize links via keyword score so high-relevance pages are visited first if cap is approached.
Verification checklist:
- Assert each JSONL row contains a non-empty 'url' and 'title' field.
- Confirm at least one record has entity_type='master_program' and one has entity_type='faculty'.
- Validate that no URL appears more than once in the output file.
- Check that total scraped pages does not exceed the requested cap of 20.
Playwright snapshot used to guide this plan:
1. Stanford University (https://www.stanford.edu)
Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Stanford Explore Stanford Main Content A Mission Defined by Possibility At Stanford,
Anchors: Skip to content -> https://www.stanford.edu/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/
2. Stanford University (https://www.stanford.edu/)
Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Stanford Explore Stanford Main Content A Mission Defined by Possibility At Stanford,
Anchors: Skip to content -> https://www.stanford.edu/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/
3. Gateway for Students Stanford University (https://www.stanford.edu/student-gateway/)
Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Main Content Gateway for Students Gateway for Students Resources, offices and services t
Anchors: Skip to content -> https://www.stanford.edu/student-gateway/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/
4. Gateway for Faculty & Staff Stanford University (https://www.stanford.edu/faculty-staff-gateway/)
Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Main Content Gateway for Faculty & Staff Gateway for Faculty & Staff Resources, offices,
Anchors: Skip to content -> https://www.stanford.edu/faculty-staff-gateway/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/
Snapshot truncated.
Generated at: 2025-12-09T04:03:37.802236+00:00
"""
from __future__ import annotations
import argparse
import asyncio
import json
from collections import deque
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Deque, Iterable, List, Set, Tuple
from urllib.parse import urljoin, urldefrag
from playwright.async_api import async_playwright, Page
PROGRAM_KEYWORDS = ['master', 'graduate program', 'MS degree', 'MA degree', 'graduate studies', 'postgraduate']
FACULTY_KEYWORDS = ['faculty', 'professor', 'advisor', 'researcher', 'people', 'directory']
EXCLUSION_KEYWORDS = ['admission', 'apply', 'tuition', 'financial aid', 'login', 'news', 'events', 'alumni', 'athletics', 'giving', 'donate', 'careers', 'jobs']
METADATA_FIELDS = ['url', 'title', 'entity_type', 'department', 'school', 'description', 'contact_email', 'scraped_at']
EXTRA_NOTES = ["Stanford's seven schools each host their own subdomains (e.g., engineering.stanford.edu, law.stanford.edu); the script should allow cross-subdomain crawling within *.stanford.edu while still counting against the page cap.", "Many program pages load additional content via JavaScript; use page.wait_for_load_state('networkidle') or explicit waits before extracting links.", 'Faculty directories may paginate or use infinite scroll; handle at least the first visible batch given the tight page cap.', 'Respect polite crawling: insert a 1-2 second delay between requests to avoid rate-limiting.']


def normalize_url(base: str, href: str) -> str:
    absolute = urljoin(base, href)
    cleaned, _ = urldefrag(absolute)
    return cleaned


def matches_any(text: str, keywords: Iterable[str]) -> bool:
    lowered = text.lower()
    return any(keyword.lower() in lowered for keyword in keywords)
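

# --- Illustrative helper (not part of the generated scraper) -------------------
# EXTRA_NOTES suggests allowing cross-subdomain crawling within *.stanford.edu,
# but crawl() below never filters links by host. A minimal sketch of such a
# check, assuming the allowed suffix is stanford.edu; the name is_within_scope
# is hypothetical.
def is_within_scope(url: str, allowed_suffix: str = "stanford.edu") -> bool:
    from urllib.parse import urlparse

    host = urlparse(url).hostname or ""
    return host == allowed_suffix or host.endswith("." + allowed_suffix)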


@dataclass
class ScrapedLink:
    url: str
    text: str
    source_url: str
    bucket: str  # either "program" or "faculty"


@dataclass
class ScrapeSettings:
    root_url: str
    max_depth: int
    max_pages: int
    headless: bool
    output: Path


async def extract_links(page: Page) -> List[Tuple[str, str]]:
    anchors: Iterable[dict] = await page.eval_on_selector_all(
        "a",
        """elements => elements
            .map(el => ({text: (el.textContent || '').trim(), href: el.href}))
            .filter(item => item.text && item.href)""",
    )
    return [(item["href"], item["text"]) for item in anchors]


async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink]:
    async with async_playwright() as p:
        browser_launcher = getattr(p, browser_name)
        browser = await browser_launcher.launch(headless=settings.headless)
        context = await browser.new_context()
        queue: Deque[Tuple[str, int]] = deque([(settings.root_url, 0)])
        visited: Set[str] = set()
        results: List[ScrapedLink] = []
        try:
            # Breadth-first crawl bounded by the depth cap and the unique-page cap.
            while queue and len(visited) < settings.max_pages:
                url, depth = queue.popleft()
                if url in visited or depth > settings.max_depth:
                    continue
                visited.add(url)
                page = await context.new_page()
                try:
                    await page.goto(url, wait_until="domcontentloaded", timeout=20000)
                except Exception:
                    await page.close()
                    continue
                links = await extract_links(page)
                for href, text in links:
                    normalized = normalize_url(url, href)
                    if matches_any(text, EXCLUSION_KEYWORDS) or matches_any(normalized, EXCLUSION_KEYWORDS):
                        continue
                    text_lower = text.lower()
                    normalized_lower = normalized.lower()
                    if matches_any(text_lower, PROGRAM_KEYWORDS) or matches_any(normalized_lower, PROGRAM_KEYWORDS):
                        results.append(
                            ScrapedLink(
                                url=normalized,
                                text=text,
                                source_url=url,
                                bucket="program",
                            )
                        )
                    if matches_any(text_lower, FACULTY_KEYWORDS) or matches_any(normalized_lower, FACULTY_KEYWORDS):
                        results.append(
                            ScrapedLink(
                                url=normalized,
                                text=text,
                                source_url=url,
                                bucket="faculty",
                            )
                        )
                    # Queue every non-excluded link for the next depth level.
                    if depth < settings.max_depth:
                        queue.append((normalized, depth + 1))
                await page.close()
        finally:
            await context.close()
            await browser.close()
    return results
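

# --- Illustrative sketch (not part of the generated scraper) -------------------
# The navigation strategy in the module docstring calls for prioritising links by
# keyword score so high-relevance pages are visited first, but crawl() above uses
# a plain FIFO deque. A minimal scoring sketch, assuming program matches should
# outrank faculty matches; the name score_link is hypothetical.
def score_link(text: str, url: str) -> int:
    score = 0
    if matches_any(text, PROGRAM_KEYWORDS) or matches_any(url, PROGRAM_KEYWORDS):
        score += 2
    if matches_any(text, FACULTY_KEYWORDS) or matches_any(url, FACULTY_KEYWORDS):
        score += 1
    return score


# Usage sketch: sort a page's outgoing (href, text) pairs before queueing so
# that, as the page cap approaches, higher-scoring URLs are dequeued first:
#     links.sort(key=lambda item: score_link(item[1], item[0]), reverse=True)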


def serialize(results: List[ScrapedLink], target: Path, root_url: str) -> None:
    payload = {
        "root_url": root_url,
        "program_links": [asdict(link) for link in results if link.bucket == "program"],
        "faculty_links": [asdict(link) for link in results if link.bucket == "faculty"],
        "notes": EXTRA_NOTES,
        "metadata_fields": METADATA_FIELDS,
    }
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
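

# --- Illustrative check (not part of the generated scraper) --------------------
# The verification checklist in the module docstring is phrased against a JSONL
# output with 'title' and 'entity_type' fields, which this script does not emit;
# serialize() above writes a single JSON document with program_links and
# faculty_links. A minimal sketch adapting those checks to that structure; the
# name verify_output is hypothetical.
def verify_output(target: Path, max_pages: int) -> None:
    payload = json.loads(target.read_text(encoding="utf-8"))
    links = payload["program_links"] + payload["faculty_links"]
    # Every record should carry a non-empty URL and anchor text.
    assert all(link["url"] and link["text"] for link in links)
    # At least one program link and one faculty link should have been found.
    assert payload["program_links"] and payload["faculty_links"]
    # No URL should appear more than once within a bucket.
    for bucket in ("program_links", "faculty_links"):
        urls = [link["url"] for link in payload[bucket]]
        assert len(urls) == len(set(urls))
    # The crawl should not have visited more unique source pages than allowed.
    assert len({link["source_url"] for link in links}) <= max_pages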


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Playwright scraper generated by the Agno agent for https://www.stanford.edu."
    )
    parser.add_argument(
        "--root-url",
        default="https://www.stanford.edu",
        help="Seed URL to start crawling from.",
    )
    parser.add_argument(
        "--max-depth",
        type=int,
        default=2,
        help="Maximum crawl depth.",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=20,
        help="Maximum number of pages to visit.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("stanford-masters-faculty_masters.json"),
        help="Where to save the JSON output.",
    )
    parser.add_argument(
        "--headless",
        action="store_true",
        default=True,
        help="Run browser in headless mode (default: True).",
    )
    parser.add_argument(
        "--no-headless",
        action="store_false",
        dest="headless",
        help="Run browser with a visible window.",
    )
    parser.add_argument(
        "--browser",
        choices=["chromium", "firefox", "webkit"],
        default="chromium",
        help="Browser engine to launch via Playwright.",
    )
    return parser.parse_args()


async def main_async() -> None:
    args = parse_args()
    settings = ScrapeSettings(
        root_url=args.root_url,
        max_depth=args.max_depth,
        max_pages=args.max_pages,
        headless=args.headless,
        output=args.output,
    )
    links = await crawl(settings, browser_name=args.browser)
    serialize(links, settings.output, settings.root_url)


def main() -> None:
    asyncio.run(main_async())


if __name__ == "__main__":
    main()