Initial commit: University Playwright Codegen Agent
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
artifacts/stanford_playwright_scraper.py | 234 lines (new file)
@@ -0,0 +1,234 @@
#!/usr/bin/env python
"""
Auto-generated by the Agno codegen agent.
Target university: Stanford (https://www.stanford.edu)
Requested caps: depth=2, pages=20

Plan description: Async Playwright scraper targeting Stanford University to collect
(1) master's program landing pages and (2) faculty/supervisor profiles linked from
those programs. Respects the depth=2 and pages=20 caps. Starts from the main site
and navigates through Academics and school/department portals to locate graduate
program listings and associated faculty directories.

Navigation strategy:
1. Seed from https://www.stanford.edu and follow the 'Academics' top-nav link to
   reach school/department listings.
2. On school pages, look for anchors containing master_program_keywords (href
   fragments like /programs/, /graduate/, /masters/) and queue them.
3. On each master's program page, extract metadata and scan for faculty_keywords
   in link text or href (e.g., /people/, /faculty/, /directory/).
4. Follow qualifying faculty links up to the depth cap and record each profile.
5. Use URL deduplication (a set of visited URLs) and respect the 20-page cap by
   counting unique pages fetched.
6. Prioritize links via keyword score so high-relevance pages are visited first
   if the cap is approached.

Verification checklist:
- Assert each JSONL row contains non-empty 'url' and 'title' fields.
- Confirm at least one record has entity_type='master_program' and one has
  entity_type='faculty'.
- Validate that no URL appears more than once in the output file.
- Check that the total number of scraped pages does not exceed the requested cap of 20.

Playwright snapshot used to guide this plan:
1. Stanford University (https://www.stanford.edu)
   Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Stanford Explore Stanford Main Content A Mission Defined by Possibility At Stanford,
   Anchors: Skip to content -> https://www.stanford.edu/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/
2. Stanford University (https://www.stanford.edu/)
   Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Stanford Explore Stanford Main Content A Mission Defined by Possibility At Stanford,
   Anchors: Skip to content -> https://www.stanford.edu/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/
3. Gateway for Students – Stanford University (https://www.stanford.edu/student-gateway/)
   Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Main Content Gateway for Students Gateway for Students Resources, offices and services t
   Anchors: Skip to content -> https://www.stanford.edu/student-gateway/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/
4. Gateway for Faculty & Staff – Stanford University (https://www.stanford.edu/faculty-staff-gateway/)
   Snippet: Skip to content Stanford University Information for: Students Faculty & Staff Families Visitors Alumni Search Academics Research Health Care Campus Life Athletics Admission About News Events Main Content Gateway for Faculty & Staff Gateway for Faculty & Staff Resources, offices,
   Anchors: Skip to content -> https://www.stanford.edu/faculty-staff-gateway/#main-content, Stanford University -> https://www.stanford.edu/, Students -> https://www.stanford.edu/student-gateway/, Faculty & Staff -> https://www.stanford.edu/faculty-staff-gateway/, Families -> http://parents.stanford.edu/, Visitors -> https://visit.stanford.edu/
Snapshot truncated.

Generated at: 2025-12-09T04:03:37.802236+00:00
"""

from __future__ import annotations

import argparse
import asyncio
import json
from collections import deque
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Deque, Iterable, List, Set, Tuple
from urllib.parse import urljoin, urldefrag

from playwright.async_api import async_playwright, Page

PROGRAM_KEYWORDS = ['master', 'graduate program', 'MS degree', 'MA degree', 'graduate studies', 'postgraduate']
FACULTY_KEYWORDS = ['faculty', 'professor', 'advisor', 'researcher', 'people', 'directory']
EXCLUSION_KEYWORDS = ['admission', 'apply', 'tuition', 'financial aid', 'login', 'news', 'events', 'alumni', 'athletics', 'giving', 'donate', 'careers', 'jobs']
METADATA_FIELDS = ['url', 'title', 'entity_type', 'department', 'school', 'description', 'contact_email', 'scraped_at']
EXTRA_NOTES = [
    "Stanford's seven schools each host their own subdomains (e.g., engineering.stanford.edu, law.stanford.edu); the script should allow cross-subdomain crawling within *.stanford.edu while still counting against the page cap.",
    "Many program pages load additional content via JavaScript; use page.wait_for_load_state('networkidle') or explicit waits before extracting links.",
    'Faculty directories may paginate or use infinite scroll; handle at least the first visible batch given the tight page cap.',
    'Respect polite crawling: insert a 1-2 second delay between requests to avoid rate-limiting.',
]


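# Resolve a possibly relative href against the page it was found on and drop any
# '#fragment' so deduplication treats in-page anchors as the same URL.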
def normalize_url(base: str, href: str) -> str:
    absolute = urljoin(base, href)
    cleaned, _ = urldefrag(absolute)
    return cleaned


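# EXTRA_NOTES asks for crawling to stay within *.stanford.edu, but crawl() below
# queues links without any host check. A minimal sketch of such a filter follows;
# it is a hypothetical helper and is not called anywhere in this file.
def is_stanford_url(url: str) -> bool:
    from urllib.parse import urlparse  # local import; only this sketch needs it

    host = urlparse(url).hostname or ""
    return host == "stanford.edu" or host.endswith(".stanford.edu")

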
def matches_any(text: str, keywords: Iterable[str]) -> bool:
    lowered = text.lower()
    return any(keyword.lower() in lowered for keyword in keywords)


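# Step 6 of the navigation strategy in the module docstring mentions prioritizing
# links by keyword score, but crawl() below uses a plain FIFO queue. A minimal
# sketch of such a score, assuming simple substring counting, is given here as a
# hypothetical helper; it is not called anywhere in this file.
def keyword_score(text: str, url: str, keywords: Iterable[str]) -> int:
    haystack = f"{text} {url}".lower()
    return sum(haystack.count(keyword.lower()) for keyword in keywords)

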
@dataclass
class ScrapedLink:
    url: str
    text: str
    source_url: str
    bucket: str  # either "program" or "faculty"


@dataclass
class ScrapeSettings:
    root_url: str
    max_depth: int
    max_pages: int
    headless: bool
    output: Path


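# Collect (href, text) pairs for every anchor on the page in a single
# eval_on_selector_all round trip, skipping anchors with an empty href or text.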
async def extract_links(page: Page) -> List[Tuple[str, str]]:
    anchors: Iterable[dict] = await page.eval_on_selector_all(
        "a",
        """elements => elements
            .map(el => ({text: (el.textContent || '').trim(), href: el.href}))
            .filter(item => item.text && item.href)""",
    )
    return [(item["href"], item["text"]) for item in anchors]


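# Breadth-first crawl: pop (url, depth) pairs from a FIFO queue, skip URLs already
# visited or beyond max_depth, and stop once max_pages unique URLs have been
# attempted (failed page loads still count against the cap). Keyword-matching
# links are recorded as ScrapedLink rows; all non-excluded links are re-queued
# one level deeper.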
async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink]:
    async with async_playwright() as p:
        browser_launcher = getattr(p, browser_name)
        browser = await browser_launcher.launch(headless=settings.headless)
        context = await browser.new_context()
        queue: Deque[Tuple[str, int]] = deque([(settings.root_url, 0)])
        visited: Set[str] = set()
        results: List[ScrapedLink] = []

        try:
            while queue and len(visited) < settings.max_pages:
                url, depth = queue.popleft()
                if url in visited or depth > settings.max_depth:
                    continue
                visited.add(url)

                page = await context.new_page()
                try:
                    await page.goto(url, wait_until="domcontentloaded", timeout=20000)
                except Exception:
                    await page.close()
                    continue

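                # EXTRA_NOTES suggests waiting for network idle and pausing 1-2
                # seconds between requests; neither is implemented in this loop.
                # A minimal, optional sketch of both (assuming the extra delay is
                # acceptable under the 20-page cap) would be:
                #     await page.wait_for_load_state("networkidle")
                #     await asyncio.sleep(1.5)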
                links = await extract_links(page)
                for href, text in links:
                    normalized = normalize_url(url, href)

                    if matches_any(text, EXCLUSION_KEYWORDS) or matches_any(normalized, EXCLUSION_KEYWORDS):
                        continue

                    text_lower = text.lower()
                    normalized_lower = normalized.lower()

                    if matches_any(text_lower, PROGRAM_KEYWORDS) or matches_any(normalized_lower, PROGRAM_KEYWORDS):
                        results.append(
                            ScrapedLink(
                                url=normalized,
                                text=text,
                                source_url=url,
                                bucket="program",
                            )
                        )
                    if matches_any(text_lower, FACULTY_KEYWORDS) or matches_any(normalized_lower, FACULTY_KEYWORDS):
                        results.append(
                            ScrapedLink(
                                url=normalized,
                                text=text,
                                source_url=url,
                                bucket="faculty",
                            )
                        )

                    if depth < settings.max_depth:
                        queue.append((normalized, depth + 1))
                await page.close()
        finally:
            await context.close()
            await browser.close()

    return results


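# Write a single JSON document (not JSONL) containing the root URL, the collected
# program and faculty links, the planner's notes, and the metadata fields intended
# for downstream enrichment.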
def serialize(results: List[ScrapedLink], target: Path, root_url: str) -> None:
    payload = {
        "root_url": root_url,
        "program_links": [
            {**asdict(link), "bucket": link.bucket} for link in results if link.bucket == "program"
        ],
        "faculty_links": [
            {**asdict(link), "bucket": link.bucket} for link in results if link.bucket == "faculty"
        ],
        "notes": EXTRA_NOTES,
        "metadata_fields": METADATA_FIELDS,
    }
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")


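# Sketch of the docstring's verification checklist, adapted to the JSON payload
# that serialize() actually writes (the checklist describes JSONL rows with an
# 'entity_type' field, which this script does not emit). Hypothetical helper,
# not called anywhere in this file.
def verify_output(target: Path, max_pages: int) -> None:
    payload = json.loads(target.read_text(encoding="utf-8"))
    links = payload["program_links"] + payload["faculty_links"]
    assert all(link["url"] and link["text"] for link in links), "empty url/text"
    assert payload["program_links"], "no master's program links recorded"
    assert payload["faculty_links"], "no faculty links recorded"
    # The page cap applies to pages visited, not links recorded, so only the
    # distinct source pages are checked here.
    assert len({link["source_url"] for link in links}) <= max_pages, "page cap exceeded"

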
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Playwright scraper generated by the Agno agent for https://www.stanford.edu."
    )
    parser.add_argument(
        "--root-url",
        default="https://www.stanford.edu",
        help="Seed URL to start crawling from.",
    )
    parser.add_argument(
        "--max-depth",
        type=int,
        default=2,
        help="Maximum crawl depth.",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=20,
        help="Maximum number of pages to visit.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("stanford-masters-faculty_masters.json"),
        help="Where to save the JSON output.",
    )
    parser.add_argument(
        "--headless",
        action="store_true",
        default=True,
        help="Run browser in headless mode (default: True).",
    )
    parser.add_argument(
        "--no-headless",
        action="store_false",
        dest="headless",
        help="Run browser with a visible window.",
    )
    parser.add_argument(
        "--browser",
        choices=["chromium", "firefox", "webkit"],
        default="chromium",
        help="Browser engine to launch via Playwright.",
    )
    return parser.parse_args()


async def main_async() -> None:
    args = parse_args()
    settings = ScrapeSettings(
        root_url=args.root_url,
        max_depth=args.max_depth,
        max_pages=args.max_pages,
        headless=args.headless,
        output=args.output,
    )
    links = await crawl(settings, browser_name=args.browser)
    serialize(links, settings.output, settings.root_url)


def main() -> None:
    asyncio.run(main_async())


if __name__ == "__main__":
    main()
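
# Example invocation (assuming Playwright and a Chromium build are installed,
# e.g. `pip install playwright` followed by `playwright install chromium`):
#
#   python artifacts/stanford_playwright_scraper.py --max-depth 2 --max-pages 20 \
#       --output stanford-masters-faculty_masters.json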