#!/usr/bin/env python """ Auto-generated by the Agno codegen agent. Target university: Harvard (https://www.harvard.edu/) Requested caps: depth=3, pages=30 Plan description: Playwright scraper for university master programs and faculty profiles. Navigation strategy: Start at https://www.harvard.edu/ Follow links to /academics/ and /a-to-z/ to find list of schools and departments For each school/department, look for a 'faculty' or 'people' page On faculty directory pages, identify and follow links to individual profiles Check for school/department specific subdomains like hls.harvard.edu, hds.harvard.edu, etc. Prioritize crawling faculty directory pages over general site crawling Verification checklist: - Manually review a sample of scraped URLs to verify they are faculty profiles - Check that major academic departments are represented in the results - Verify the script is capturing profile page content, not just URLs - Confirm no login pages, application forms, or directory pages are included Playwright snapshot used to guide this plan: 1. Harvard University (https://www.harvard.edu/) Snippet: Skip to main content Harvard University Learn about our lawsuits to protect our students and researchers Search Menu David Liu received the 2025 Breakthrough Prize in Life Sciences for developing a revolutionary gene-editing platforms that precisely corrects genetic mutations. Anchors: Skip to main content -> https://www.harvard.edu/#main-content, Harvard University -> https://www.harvard.edu/, Learn about our lawsuits to protect our students and researchers -> https://www.harvard.edu/federal-lawsuits/, × -> javascript:void(0), A to Z index -> https://www.harvard.edu/a-to-z/, Academics -> https://www.harvard.edu/academics/ 2. Index of departments, schools, and affiliates - Harvard University (https://www.harvard.edu/a-to-z/) Snippet: Skip to main content Harvard University Learn about our lawsuits to protect our students and researchers Search Menu David Liu received the 2025 Breakthrough Prize in Life Sciences for developing a revolutionary gene-editing platforms that precisely corrects genetic mutations. Anchors: Skip to main content -> https://www.harvard.edu/a-to-z/#main-content, Harvard University -> https://www.harvard.edu/, Learn about our lawsuits to protect our students and researchers -> https://www.harvard.edu/federal-lawsuits/, × -> javascript:void(0), A to Z index -> https://www.harvard.edu/a-to-z/, Academics -> https://www.harvard.edu/academics/ 3. Academics - Harvard University (https://www.harvard.edu/academics/) Snippet: Skip to main content Harvard University Learn about our lawsuits to protect our students and researchers Search Menu David Liu received the 2025 Breakthrough Prize in Life Sciences for developing a revolutionary gene-editing platforms that precisely corrects genetic mutations. Anchors: Skip to main content -> https://www.harvard.edu/academics/#main-content, Harvard University -> https://www.harvard.edu/, Learn about our lawsuits to protect our students and researchers -> https://www.harvard.edu/federal-lawsuits/, A to Z index -> https://www.harvard.edu/a-to-z/, Academics -> https://www.harvard.edu/academics/, Undergraduate Degrees -> https://www.harvard.edu//programs/?degree_levels=undergraduate 4. 
    4. Programs - Harvard University (https://www.harvard.edu//programs/?degree_levels=undergraduate)
       Snippet: same homepage banner text as entry 1.
       Anchors:
           Skip to main content -> https://www.harvard.edu/programs/?degree_levels=undergraduate#main-content
           Harvard University -> https://www.harvard.edu/
           Learn about our lawsuits to protect our students and researchers -> https://www.harvard.edu/federal-lawsuits/
           A to Z index -> https://www.harvard.edu/a-to-z/
           Academics -> https://www.harvard.edu/academics/
           Undergraduate Degrees -> https://www.harvard.edu//programs/?degree_levels=undergraduate

    Snapshot truncated.

Generated at: 2025-12-10T07:19:12.294884+00:00
"""

from __future__ import annotations

import argparse
import asyncio
import json
from collections import deque
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Deque, Iterable, List, Set, Tuple
from urllib.parse import urljoin, urldefrag, urlparse

from playwright.async_api import async_playwright, Page, Response

# Keywords that suggest a link points at a (graduate/master's) program page.
PROGRAM_KEYWORDS = [
    '/graduate/',
    '/masters/',
    '/programs/?degree_levels=graduate',
    '/mpp/',
    'Master of',
    'M.S.',
    'M.A.',
    'graduate program',
]

# Keywords that suggest a link points at a faculty member or faculty directory.
FACULTY_KEYWORDS = [
    '/people/',
    '/~',
    '/faculty/',
    '/profile/',
    'professor',
    'dr.',
    'ph.d.',
    'firstname-lastname',
]

# Links whose text or URL contains any of these keywords are skipped entirely.
EXCLUSION_KEYWORDS = [
    'admissions',
    'apply',
    'tuition',
    'news',
    'events',
    'calendar',
    'careers',
    'jobs',
    'login',
    'donate',
    'alumni',
    'giving',
]

# Metadata field names from the plan, echoed into the JSON output for reference.
METADATA_FIELDS = ['url', 'title', 'entity_type', 'department', 'email', 'scraped_at']

# Free-form notes from the plan, echoed into the JSON output.
EXTRA_NOTES = [
    'Many Harvard faculty have profiles under the /~username/ URL pattern',
    'Some faculty may be cross-listed in multiple departments',
    'Prioritize finding profiles from professional schools (business, law, medicine, etc.)',
    "Check for non-standard faculty titles like 'lecturer', 'fellow', 'researcher'",
]

# URL patterns that indicate individual profile pages
PROFILE_URL_PATTERNS = [
    "/people/",
    "/person/",
    "/profile/",
    "/profiles/",
    "/faculty/",
    "/staff/",
    "/directory/",
    "/~",  # Unix-style personal pages
    "/bio/",
    "/about/",
]

# URL patterns that indicate listing/directory pages (should be crawled deeper)
DIRECTORY_URL_PATTERNS = [
    "/faculty",
    "/people",
    "/directory",
    "/staff",
    "/team",
    "/members",
    "/researchers",
]


def normalize_url(base: str, href: str) -> str:
    """Normalize a URL by resolving relative paths and removing fragments."""
    absolute = urljoin(base, href)
    cleaned, _ = urldefrag(absolute)
    # Remove trailing slash for consistency
    return cleaned.rstrip("/")


def matches_any(text: str, keywords: Iterable[str]) -> bool:
    """Check if text contains any of the keywords (case-insensitive)."""
    lowered = text.lower()
    return any(keyword.lower() in lowered for keyword in keywords)


def is_same_domain(url1: str, url2: str) -> bool:
    """Check if two URLs belong to the same root domain."""
    domain1 = urlparse(url1).netloc.replace("www.", "")
    domain2 = urlparse(url2).netloc.replace("www.", "")
    # Allow subdomains of the same root domain
    parts1 = domain1.split(".")
    parts2 = domain2.split(".")
    if len(parts1) >= 2 and len(parts2) >= 2:
        return parts1[-2:] == parts2[-2:]
    return domain1 == domain2
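
# Illustrative behaviour of the URL helpers above. The inputs are hypothetical
# examples chosen for this sketch, not taken from a real crawl:
#
#   normalize_url("https://www.harvard.edu/academics/", "../a-to-z/#main-content")
#       -> "https://www.harvard.edu/a-to-z"   (relative path resolved, fragment and
#                                              trailing slash removed)
#   is_same_domain("https://www.harvard.edu/", "https://hls.harvard.edu/faculty/")
#       -> True    (subdomains such as hls.harvard.edu share the harvard.edu root)
#   is_same_domain("https://www.harvard.edu/", "https://en.wikipedia.org/")
#       -> False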


def is_profile_url(url: str) -> bool:
    """Check if URL pattern suggests an individual profile page."""
    url_lower = url.lower()
    return any(pattern in url_lower for pattern in PROFILE_URL_PATTERNS)


def is_directory_url(url: str) -> bool:
    """Check if URL pattern suggests a directory/listing page."""
    url_lower = url.lower()
    return any(pattern in url_lower for pattern in DIRECTORY_URL_PATTERNS)


@dataclass
class ScrapedLink:
    url: str
    title: str
    text: str
    source_url: str
    bucket: str  # "program" or "faculty"
    is_verified: bool = False
    http_status: int = 0
    is_profile_page: bool = False
    scraped_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())


@dataclass
class ScrapeSettings:
    root_url: str
    max_depth: int
    max_pages: int
    headless: bool
    output: Path
    verify_links: bool = True
    request_delay: float = 1.0  # Polite crawling delay in seconds
    timeout: int = 60000  # Navigation timeout in ms


async def extract_links(page: Page) -> List[Tuple[str, str]]:
    """Extract all anchor links from the page as (href, text) pairs."""
    anchors: Iterable[dict] = await page.eval_on_selector_all(
        "a",
        """elements => elements
            .map(el => ({text: (el.textContent || '').trim(), href: el.href}))
            .filter(item => item.text && item.href && item.href.startsWith('http'))""",
    )
    return [(item["href"], item["text"]) for item in anchors]


async def get_page_title(page: Page) -> str:
    """Get the page title safely."""
    try:
        return await page.title() or ""
    except Exception:
        return ""


async def verify_link(context, url: str, timeout: int = 10000) -> Tuple[bool, int, str]:
    """Verify a link by loading it in a temporary page.

    Returns:
        (is_valid, status_code, page_title)
    """
    page = await context.new_page()
    try:
        response: Response = await page.goto(url, wait_until="domcontentloaded", timeout=timeout)
        if response:
            status = response.status
            title = await get_page_title(page)
            is_valid = 200 <= status < 400
            return is_valid, status, title
        return False, 0, ""
    except Exception:
        return False, 0, ""
    finally:
        await page.close()
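
# Minimal sketch of calling verify_link() on its own, outside of crawl(). The URL and
# helper name (_check_one) are illustrative only:
#
#   async def _check_one() -> None:
#       async with async_playwright() as p:
#           browser = await p.chromium.launch(headless=True)
#           context = await browser.new_context()
#           ok, status, title = await verify_link(context, "https://www.harvard.edu/academics/")
#           print(ok, status, title)
#           await browser.close()
#
#   asyncio.run(_check_one())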


async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink]:
    """Crawl the website using BFS, collecting program and faculty links.

    Features:
        - URL deduplication
        - Link verification
        - Profile page detection
        - Polite crawling with delays
    """
    async with async_playwright() as p:
        browser_launcher = getattr(p, browser_name)
        browser = await browser_launcher.launch(headless=settings.headless)
        context = await browser.new_context()

        # Priority queue: (priority, url, depth) - lower priority = processed first.
        # Directory pages get priority 0, others get priority 1.
        queue: Deque[Tuple[int, str, int]] = deque([(0, settings.root_url, 0)])
        visited: Set[str] = set()
        found_urls: Set[str] = set()  # For deduplication of results
        results: List[ScrapedLink] = []

        print(f"Starting crawl from: {settings.root_url}")
        print(f"Max depth: {settings.max_depth}, Max pages: {settings.max_pages}")

        try:
            while queue and len(visited) < settings.max_pages:
                # Sort queue by priority (directory pages first)
                queue = deque(sorted(queue, key=lambda x: x[0]))
                priority, url, depth = queue.popleft()
                normalized_url = normalize_url(settings.root_url, url)

                if normalized_url in visited or depth > settings.max_depth:
                    continue

                # Only crawl same-domain URLs
                if not is_same_domain(settings.root_url, normalized_url):
                    continue

                visited.add(normalized_url)
                print(f"[{len(visited)}/{settings.max_pages}] Depth {depth}: {normalized_url[:80]}...")

                page = await context.new_page()
                try:
                    response = await page.goto(
                        normalized_url, wait_until="domcontentloaded", timeout=settings.timeout
                    )
                    if not response or response.status >= 400:
                        await page.close()
                        continue
                except Exception as e:
                    print(f"  Error: {e}")
                    await page.close()
                    continue

                page_title = await get_page_title(page)
                links = await extract_links(page)

                for href, text in links:
                    normalized_href = normalize_url(normalized_url, href)

                    # Skip if already found or is excluded
                    if normalized_href in found_urls:
                        continue
                    if matches_any(text, EXCLUSION_KEYWORDS) or matches_any(normalized_href, EXCLUSION_KEYWORDS):
                        continue

                    text_lower = text.lower()
                    href_lower = normalized_href.lower()
                    is_profile = is_profile_url(normalized_href)

                    # Check for program links
                    if matches_any(text_lower, PROGRAM_KEYWORDS) or matches_any(href_lower, PROGRAM_KEYWORDS):
                        found_urls.add(normalized_href)
                        results.append(
                            ScrapedLink(
                                url=normalized_href,
                                title="",
                                text=text[:200],
                                source_url=normalized_url,
                                bucket="program",
                                is_profile_page=False,
                            )
                        )

                    # Check for faculty links
                    if matches_any(text_lower, FACULTY_KEYWORDS) or matches_any(href_lower, FACULTY_KEYWORDS):
                        found_urls.add(normalized_href)
                        results.append(
                            ScrapedLink(
                                url=normalized_href,
                                title="",
                                text=text[:200],
                                source_url=normalized_url,
                                bucket="faculty",
                                is_profile_page=is_profile,
                            )
                        )

                    # Queue for further crawling
                    if depth < settings.max_depth and is_same_domain(settings.root_url, normalized_href):
                        # Prioritize directory pages
                        link_priority = 0 if is_directory_url(normalized_href) else 1
                        queue.append((link_priority, normalized_href, depth + 1))

                await page.close()
                # Polite delay between requests
                await asyncio.sleep(settings.request_delay)
        finally:
            await context.close()
            await browser.close()

        # Verify links if enabled
        if settings.verify_links and results:
            print(f"\nVerifying {len(results)} links...")
            browser = await browser_launcher.launch(headless=True)
            context = await browser.new_context()

            verified_results = []
            for i, link in enumerate(results):
                if link.url in [r.url for r in verified_results]:
                    continue  # Skip duplicates
                print(f"  [{i+1}/{len(results)}] Verifying: {link.url[:60]}...")
                is_valid, status, title = await verify_link(context, link.url)
                link.is_verified = True
                link.http_status = status
                link.title = title or link.text
                if is_valid:
                    verified_results.append(link)
                else:
                    print(f"    Invalid (HTTP {status})")
                await asyncio.sleep(0.5)  # Delay between verifications

            await context.close()
            await browser.close()
            results = verified_results

    return results
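
# Programmatic usage sketch for crawl(), bypassing the CLI. The caps and output path
# below are illustrative, not the script defaults:
#
#   settings = ScrapeSettings(
#       root_url="https://www.harvard.edu/",
#       max_depth=2,
#       max_pages=10,
#       headless=True,
#       output=Path("sample_run.json"),
#       verify_links=False,
#   )
#   links = asyncio.run(crawl(settings, browser_name="chromium"))
#   print(len(links), "links collected")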


def deduplicate_results(results: List[ScrapedLink]) -> List[ScrapedLink]:
    """Remove duplicate URLs, keeping the first occurrence."""
    seen: Set[str] = set()
    unique = []
    for link in results:
        if link.url not in seen:
            seen.add(link.url)
            unique.append(link)
    return unique


def serialize(results: List[ScrapedLink], target: Path, root_url: str) -> None:
    """Save results to a JSON file with statistics."""
    results = deduplicate_results(results)

    program_links = [link for link in results if link.bucket == "program"]
    faculty_links = [link for link in results if link.bucket == "faculty"]
    profile_pages = [link for link in faculty_links if link.is_profile_page]

    payload = {
        "root_url": root_url,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "statistics": {
            "total_links": len(results),
            "program_links": len(program_links),
            "faculty_links": len(faculty_links),
            "profile_pages": len(profile_pages),
            "verified_links": len([r for r in results if r.is_verified and r.http_status == 200]),
        },
        "program_links": [asdict(link) for link in program_links],
        "faculty_links": [asdict(link) for link in faculty_links],
        "notes": EXTRA_NOTES,
        "metadata_fields": METADATA_FIELDS,
    }

    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")

    print(f"\nResults saved to: {target}")
    print(f"  Total links: {len(results)}")
    print(f"  Program links: {len(program_links)}")
    print(f"  Faculty links: {len(faculty_links)}")
    print(f"  Profile pages: {len(profile_pages)}")
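
# Shape of the JSON document written by serialize(); the values shown are placeholders:
#
#   {
#     "root_url": "https://www.harvard.edu/",
#     "generated_at": "<ISO-8601 timestamp>",
#     "statistics": {
#       "total_links": 0, "program_links": 0, "faculty_links": 0,
#       "profile_pages": 0, "verified_links": 0
#     },
#     "program_links": [ ...ScrapedLink dicts... ],
#     "faculty_links": [ ...ScrapedLink dicts... ],
#     "notes": [ ...EXTRA_NOTES... ],
#     "metadata_fields": [ ...METADATA_FIELDS... ]
#   }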


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Playwright scraper generated by the Agno agent for https://www.harvard.edu/."
    )
    parser.add_argument(
        "--root-url",
        default="https://www.harvard.edu/",
        help="Seed URL to start crawling from.",
    )
    parser.add_argument(
        "--max-depth",
        type=int,
        default=3,
        help="Maximum crawl depth.",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=30,
        help="Maximum number of pages to visit.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("university-scraper_results.json"),
        help="Where to save the JSON output.",
    )
    parser.add_argument(
        "--headless",
        action="store_true",
        default=True,
        help="Run the browser in headless mode (default: True).",
    )
    parser.add_argument(
        "--no-headless",
        action="store_false",
        dest="headless",
        help="Run the browser with a visible window.",
    )
    parser.add_argument(
        "--browser",
        choices=["chromium", "firefox", "webkit"],
        default="chromium",
        help="Browser engine to launch via Playwright.",
    )
    parser.add_argument(
        "--no-verify",
        action="store_true",
        default=False,
        help="Skip the link verification step.",
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=1.0,
        help="Delay between requests in seconds (polite crawling).",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=60000,
        help="Navigation timeout in milliseconds (default: 60000 = 60s).",
    )
    return parser.parse_args()


async def main_async() -> None:
    args = parse_args()
    settings = ScrapeSettings(
        root_url=args.root_url,
        max_depth=args.max_depth,
        max_pages=args.max_pages,
        headless=args.headless,
        output=args.output,
        verify_links=not args.no_verify,
        request_delay=args.delay,
        timeout=args.timeout,
    )
    links = await crawl(settings, browser_name=args.browser)
    serialize(links, settings.output, settings.root_url)


def main() -> None:
    asyncio.run(main_async())


if __name__ == "__main__":
    main()
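
# Example invocations (the filename harvard_scraper.py is illustrative; use whatever this
# file is saved as). Assumes Playwright is installed and a browser has been fetched with
# `pip install playwright` followed by `playwright install chromium`:
#
#   python harvard_scraper.py --max-depth 2 --max-pages 20 --no-verify
#   python harvard_scraper.py --no-headless --browser firefox --output runs/harvard.json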