# University-Playwright-Codeg…/artifacts/kaust_faculty_scraper.py

#!/usr/bin/env python
"""
Auto-generated by the Agno codegen agent.
Target university: KAUST (https://www.kaust.edu.sa/en/)
Requested caps: depth=3, pages=30

Plan description: Playwright scraper for university master programs and faculty profiles.
Navigation strategy: Start at https://www.kaust.edu.sa/en/; navigate to /study/ to find degree program links; follow links to individual degree pages under /degree-programs/. Separately, look for links to /faculty/ or /people/ directories; crawl faculty directories to extract links to individual bio pages. Individual faculty are often under a subdomain like bio.kaust.edu.sa.
Verification checklist:
- Verify master's programs are under /study/ or /degree-programs/
- Check that faculty directory pages contain links to individual bios
- Confirm individual faculty pages have research/expertise details
- Ensure exclusion keywords successfully skip irrelevant pages
Playwright snapshot used to guide this plan:
No browser snapshot was captured.

Generated at: 2025-12-10T02:48:42.571899+00:00
"""

from __future__ import annotations

import argparse
import asyncio
import json
import time
from collections import deque
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Deque, Iterable, List, Set, Tuple
from urllib.parse import urljoin, urldefrag, urlparse

from playwright.async_api import async_playwright, Page, Response

# Case-insensitive substrings: a link whose text or URL contains any of these is
# recorded in the "program" bucket.
PROGRAM_KEYWORDS = ['/study/', '/degree-programs/', '/academics/', 'M.Sc.', 'Master of Science', 'graduate program']
# Case-insensitive substrings: a link whose text or URL contains any of these is
# recorded in the "faculty" bucket.
# '/faculty/' replaces the template placeholder '/faculty/firstname-lastname',
# which could only ever match a URL containing that literal text.
FACULTY_KEYWORDS = ['/people/', '/profiles/faculty/', 'Professor', 'faculty-member', '/faculty/', 'bio.kaust.edu.sa']
# Links whose text or URL contains any of these are neither recorded nor followed.
EXCLUSION_KEYWORDS = ['/admissions/', '/apply/', '/tuition/', '/events/', '/news/', '/careers/', '/jobs/', '/login/', '/alumni/', '/giving/', 'inquiry.kaust.edu.sa']
# Copied verbatim into the JSON output under "metadata_fields".
METADATA_FIELDS = ['url', 'title', 'entity_type', 'department', 'email', 'scraped_at']
# Copied verbatim into the JSON output under "notes".
EXTRA_NOTES = ['Many faculty are listed under a separate subdomain bio.kaust.edu.sa', 'Prioritize crawling the centralized faculty directory first', 'Alumni and affiliated faculty may not have full profile pages']

# Substrings that suggest a URL points at an individual profile page.
# is_profile_url() tests them against the lowercased URL, so every entry must
# itself be lowercase.
# NOTE(review): several entries ("/faculty/", "/staff/", "/directory/", "/about/")
# can also appear in listing-page URLs, so a match is a hint, not proof.
PROFILE_URL_PATTERNS = [
    "/people/", "/person/", "/profile/", "/profiles/",
    "/faculty/", "/staff/", "/directory/",
    "/~",  # Unix-style personal pages (e.g. /~username)
    "/bio/", "/about/",
]

# Substrings that suggest a URL is a listing/directory page; crawl() dequeues
# such URLs ahead of all others. Entries carry no trailing slash, so they match
# both the listing itself ("/faculty") and anything nested beneath it
# ("/faculty/jane-doe"), since is_directory_url() is a plain substring test.
DIRECTORY_URL_PATTERNS = [
    "/faculty", "/people", "/directory", "/staff",
    "/team", "/members", "/researchers",
]


def normalize_url(base: str, href: str) -> str:
    """Resolve *href* against *base* and return it in canonical form.

    The canonical form has no ``#fragment`` and no trailing slash(es), so
    that different spellings of the same page compare equal.
    """
    without_fragment = urldefrag(urljoin(base, href)).url
    return without_fragment.rstrip("/")


def matches_any(text: str, keywords: Iterable[str]) -> bool:
    """Return True if *text* contains at least one keyword, ignoring case.

    Matching is a plain substring test; an empty *keywords* iterable
    yields False.
    """
    haystack = text.lower()
    for needle in keywords:
        if needle.lower() in haystack:
            return True
    return False


# Labels that commonly act as a second-level public suffix beneath a two-letter
# country-code TLD (e.g. "edu.sa", "co.uk", "ac.jp"). This is a heuristic, not
# a full Public Suffix List.
_COUNTRY_SECOND_LEVEL_LABELS = frozenset(
    {"ac", "co", "com", "edu", "gov", "mil", "net", "org", "sch"}
)


def _registrable_domain(url: str) -> str:
    """Return the lowercased site domain of *url* (e.g. ``kaust.edu.sa``)."""
    # .hostname drops any port/userinfo and lowercases the host.
    host = (urlparse(url).hostname or "").rstrip(".")
    if host.startswith("www."):
        host = host[len("www."):]

    labels = host.split(".")
    # Single-label hosts and IPv4 literals have no parent domain to share.
    if len(labels) < 2 or all(label.isdigit() for label in labels):
        return host
    # Under suffixes like "edu.sa" the organisation is the third label from
    # the right; keeping only two labels would equate unrelated institutions.
    if (
        len(labels) >= 3
        and len(labels[-1]) == 2
        and labels[-2] in _COUNTRY_SECOND_LEVEL_LABELS
    ):
        return ".".join(labels[-3:])
    return ".".join(labels[-2:])


def is_same_domain(url1: str, url2: str) -> bool:
    """Check whether two URLs belong to the same site, subdomains included.

    Args:
        url1: First absolute URL.
        url2: Second absolute URL.

    Returns:
        True when both hosts reduce to the same registrable domain. A leading
        ``www.`` label, the port, and letter case are ignored. Hosts under a
        country-code second-level suffix (``*.edu.sa``, ``*.co.uk``) are
        compared on three labels, so ``bio.kaust.edu.sa`` matches
        ``www.kaust.edu.sa`` but ``ksu.edu.sa`` does not. IPv4 literals must
        match exactly.
    """
    return _registrable_domain(url1) == _registrable_domain(url2)


def is_profile_url(url: str) -> bool:
    """Return True if the lowercased URL contains any PROFILE_URL_PATTERNS entry."""
    candidate = url.lower()
    for marker in PROFILE_URL_PATTERNS:
        if marker in candidate:
            return True
    return False


def is_directory_url(url: str) -> bool:
    """Return True if the lowercased URL contains any DIRECTORY_URL_PATTERNS entry."""
    candidate = url.lower()
    for marker in DIRECTORY_URL_PATTERNS:
        if marker in candidate:
            return True
    return False


def _utc_timestamp() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


@dataclass
class ScrapedLink:
    """One link discovered during the crawl, plus its verification outcome."""

    url: str  # normalized absolute URL of the link target
    title: str  # page title; empty until verification fills it in
    text: str  # anchor text of the link
    source_url: str  # page on which the link was found
    bucket: str  # "program" or "faculty"
    is_verified: bool = False  # set once a verification attempt has been made
    http_status: int = 0  # status seen during verification; 0 = none
    is_profile_page: bool = False  # URL looks like an individual profile
    scraped_at: str = field(default_factory=_utc_timestamp)  # creation time, UTC


@dataclass
class ScrapeSettings:
    """Run-time configuration for a single crawl, built from the CLI arguments."""

    # Seed URL; also the reference for the same-domain check during the crawl.
    root_url: str
    # Pages deeper than this many link hops from the seed are not visited.
    max_depth: int
    # Upper bound on the number of pages the crawl will attempt to load.
    max_pages: int
    # Whether the crawling browser is launched without a visible window.
    headless: bool
    # Destination path of the JSON report.
    output: Path
    # When True, every collected link is re-opened to confirm it resolves.
    verify_links: bool = True
    request_delay: float = 1.0  # Polite crawling delay, in seconds, slept after each processed page
    timeout: int = 60000  # Navigation timeout in ms (default 60s for slow sites)


async def extract_links(page: Page) -> List[Tuple[str, str]]:
    """Collect ``(href, text)`` pairs for the anchors on *page*.

    Only anchors with non-empty trimmed text and an href that starts with
    ``http`` are returned; the filtering happens in the browser.
    """
    collect_anchors_js = """elements => elements
        .map(el => ({text: (el.textContent || '').trim(), href: el.href}))
        .filter(item => item.text && item.href && item.href.startsWith('http'))"""
    found = await page.eval_on_selector_all("a", collect_anchors_js)
    return [(entry["href"], entry["text"]) for entry in found]


async def get_page_title(page: Page) -> str:
    """Return the page title, or an empty string if it is missing or unreadable."""
    try:
        title = await page.title()
    except Exception:
        # Best effort: a page that cannot report its title simply has none.
        return ""
    return title or ""


async def verify_link(context, url: str, timeout: int = 10000) -> Tuple[bool, int, str]:
    """
    Verify a link by navigating to it in a throwaway page.

    The page is opened in *context*, loaded until DOMContentLoaded, and
    always closed afterwards. *timeout* is the navigation timeout in ms.

    Returns: (is_valid, status_code, page_title). is_valid is True for
    statuses 200-399. Any navigation error, or a navigation that yields no
    response, gives (False, 0, "").
    """
    probe = await context.new_page()
    try:
        reply = await probe.goto(url, wait_until="domcontentloaded", timeout=timeout)
        if not reply:
            return False, 0, ""
        code = reply.status
        page_title = await get_page_title(probe)
        return 200 <= code < 400, code, page_title
    except Exception:
        return False, 0, ""
    finally:
        await probe.close()


def _classify_link(href: str, text: str, source_url: str) -> List[ScrapedLink]:
    """Return one record per bucket ("program", "faculty") whose keywords match the link."""
    text_lower = text.lower()
    href_lower = href.lower()
    records: List[ScrapedLink] = []

    if matches_any(text_lower, PROGRAM_KEYWORDS) or matches_any(href_lower, PROGRAM_KEYWORDS):
        records.append(
            ScrapedLink(
                url=href,
                title="",
                text=text[:200],
                source_url=source_url,
                bucket="program",
                is_profile_page=False,
            )
        )

    if matches_any(text_lower, FACULTY_KEYWORDS) or matches_any(href_lower, FACULTY_KEYWORDS):
        records.append(
            ScrapedLink(
                url=href,
                title="",
                text=text[:200],
                source_url=source_url,
                bucket="faculty",
                is_profile_page=is_profile_url(href),
            )
        )

    return records


async def _collect_links(context, settings: ScrapeSettings) -> List[ScrapedLink]:
    """Run the bounded breadth-first crawl and return every matching link found."""
    # Two FIFO queues act as a two-level priority queue: directory-like URLs
    # are always dequeued before the rest, each group in discovery order.
    # Entries are (url, depth). The seed goes in the high-priority queue.
    directory_queue: Deque[Tuple[str, int]] = deque([(settings.root_url, 0)])
    other_queue: Deque[Tuple[str, int]] = deque()

    visited: Set[str] = set()
    # URLs already scheduled, so the queues cannot fill up with repeats.
    queued: Set[str] = {normalize_url(settings.root_url, settings.root_url)}
    found_urls: Set[str] = set()  # URLs already recorded in results
    results: List[ScrapedLink] = []

    while (directory_queue or other_queue) and len(visited) < settings.max_pages:
        url, depth = (directory_queue or other_queue).popleft()

        normalized_url = normalize_url(settings.root_url, url)
        if normalized_url in visited or depth > settings.max_depth:
            continue

        # Only crawl same-domain URLs
        if not is_same_domain(settings.root_url, normalized_url):
            continue

        # Counted before loading, so failed pages also consume the page budget.
        visited.add(normalized_url)
        print(f"[{len(visited)}/{settings.max_pages}] Depth {depth}: {normalized_url[:80]}...")

        page = await context.new_page()
        try:
            response = await page.goto(
                normalized_url, wait_until="load", timeout=settings.timeout
            )
            if not response or response.status >= 400:
                continue
            links = await extract_links(page)
        except Exception as e:
            # A page that fails to load or to evaluate is skipped; it must not
            # abort the crawl and discard everything collected so far.
            print(f"  Error: {e}")
            continue
        finally:
            await page.close()

        for href, text in links:
            normalized_href = normalize_url(normalized_url, href)

            # Skip if already found or is excluded
            if normalized_href in found_urls:
                continue
            # normalize_url() strips the trailing slash, so a section landing
            # page ("…/news") would never contain "/news/"; test the URL with
            # the slash restored so the section root is excluded as well.
            if matches_any(text, EXCLUSION_KEYWORDS) or matches_any(
                normalized_href + "/", EXCLUSION_KEYWORDS
            ):
                continue

            records = _classify_link(normalized_href, text, normalized_url)
            if records:
                found_urls.add(normalized_href)
                results.extend(records)

            # Queue for further crawling
            if (
                depth < settings.max_depth
                and normalized_href not in queued
                and is_same_domain(settings.root_url, normalized_href)
            ):
                queued.add(normalized_href)
                target_queue = (
                    directory_queue if is_directory_url(normalized_href) else other_queue
                )
                target_queue.append((normalized_href, depth + 1))

        # Polite delay between requests
        await asyncio.sleep(settings.request_delay)

    return results


async def _verify_results(browser_launcher, results: List[ScrapedLink]) -> List[ScrapedLink]:
    """Re-open each collected link in a fresh headless browser and keep the reachable ones."""
    print(f"\nVerifying {len(results)} links...")
    browser = await browser_launcher.launch(headless=True)
    try:
        context = await browser.new_context()
        try:
            verified_results: List[ScrapedLink] = []
            valid_urls: Set[str] = set()  # URLs already confirmed reachable

            for position, link in enumerate(results, start=1):
                if link.url in valid_urls:
                    continue  # Skip duplicates

                print(f"  [{position}/{len(results)}] Verifying: {link.url[:60]}...")
                is_valid, status, title = await verify_link(context, link.url)
                link.is_verified = True
                link.http_status = status
                link.title = title or link.text

                if is_valid:
                    valid_urls.add(link.url)
                    verified_results.append(link)
                else:
                    print(f"    Invalid (HTTP {status})")

                await asyncio.sleep(0.5)  # Delay between verifications

            return verified_results
        finally:
            await context.close()
    finally:
        await browser.close()


async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink]:
    """
    Crawl the website breadth-first, collecting program and faculty links.

    Features:
    - URL deduplication (each page is visited, and each URL queued, at most once)
    - Directory-like URLs are visited before other URLs
    - Links matching EXCLUSION_KEYWORDS are neither recorded nor followed
    - Optional link verification (drops links that do not resolve)
    - Profile page detection
    - Polite crawling with delays

    Args:
        settings: Crawl limits, timeouts, and feature switches.
        browser_name: Playwright engine attribute to launch
            ("chromium", "firefox" or "webkit").

    Returns:
        The collected links. With ``settings.verify_links`` enabled only
        links that verified successfully are returned, each URL once.
        Otherwise a URL matching both buckets appears once per bucket.

    A page that fails to load or whose links cannot be extracted is logged
    and skipped; it still counts against ``settings.max_pages``.
    """
    async with async_playwright() as p:
        browser_launcher = getattr(p, browser_name)
        browser = await browser_launcher.launch(headless=settings.headless)
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            try:
                print(f"Starting crawl from: {settings.root_url}")
                print(f"Max depth: {settings.max_depth}, Max pages: {settings.max_pages}")
                results = await _collect_links(context, settings)
            finally:
                await context.close()
        finally:
            await browser.close()

        # Verify links if enabled
        if settings.verify_links and results:
            results = await _verify_results(browser_launcher, results)

        return results


def deduplicate_results(results: List[ScrapedLink]) -> List[ScrapedLink]:
    """Return *results* with repeated URLs dropped, keeping the first of each.

    The relative order of the surviving entries is unchanged.
    """
    first_by_url = {}
    for record in results:
        # setdefault only stores the record the first time a URL is seen.
        first_by_url.setdefault(record.url, record)
    return list(first_by_url.values())


def serialize(results: List[ScrapedLink], target: Path, root_url: str) -> None:
    """Write the de-duplicated results and summary statistics to *target* as JSON.

    Parent directories of *target* are created if missing. A short summary
    is printed to stdout afterwards.
    """
    unique_links = deduplicate_results(results)

    # Split the links by bucket in one pass; other bucket values are ignored.
    by_bucket = {"program": [], "faculty": []}
    for record in unique_links:
        if record.bucket in by_bucket:
            by_bucket[record.bucket].append(record)
    programs = by_bucket["program"]
    faculty = by_bucket["faculty"]

    profile_count = sum(1 for record in faculty if record.is_profile_page)
    ok_count = sum(
        1 for record in unique_links if record.is_verified and record.http_status == 200
    )

    statistics = {
        "total_links": len(unique_links),
        "program_links": len(programs),
        "faculty_links": len(faculty),
        "profile_pages": profile_count,
        "verified_links": ok_count,
    }
    payload = {
        "root_url": root_url,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "statistics": statistics,
        "program_links": [asdict(record) for record in programs],
        "faculty_links": [asdict(record) for record in faculty],
        "notes": EXTRA_NOTES,
        "metadata_fields": METADATA_FIELDS,
    }

    target.parent.mkdir(parents=True, exist_ok=True)
    document = json.dumps(payload, indent=2, ensure_ascii=False)
    target.write_text(document, encoding="utf-8")

    summary_lines = [
        f"\nResults saved to: {target}",
        f"  Total links: {len(unique_links)}",
        f"  Program links: {len(programs)}",
        f"  Faculty links: {len(faculty)}",
        f"  Profile pages: {profile_count}",
    ]
    print("\n".join(summary_lines))


def parse_args() -> argparse.Namespace:
    """Parse the scraper's command-line options from ``sys.argv``."""
    # (flag, add_argument keyword arguments), in the order shown by --help.
    option_table = (
        ("--root-url", dict(
            default="https://www.kaust.edu.sa/en/",
            help="Seed url to start crawling from.",
        )),
        ("--max-depth", dict(
            type=int,
            default=3,
            help="Maximum crawl depth.",
        )),
        ("--max-pages", dict(
            type=int,
            default=30,
            help="Maximum number of pages to visit.",
        )),
        ("--output", dict(
            type=Path,
            default=Path("university-scraper_results.json"),
            help="Where to save the JSON output.",
        )),
        ("--headless", dict(
            action="store_true",
            default=True,
            help="Run browser in headless mode (default: True).",
        )),
        # Shares dest with --headless, so it is the only way to turn it off.
        ("--no-headless", dict(
            action="store_false",
            dest="headless",
            help="Run browser with visible window.",
        )),
        ("--browser", dict(
            choices=["chromium", "firefox", "webkit"],
            default="firefox",
            help="Browser engine to launch via Playwright (firefox recommended for KAUST).",
        )),
        ("--no-verify", dict(
            action="store_true",
            default=False,
            help="Skip link verification step.",
        )),
        ("--delay", dict(
            type=float,
            default=1.0,
            help="Delay between requests in seconds (polite crawling).",
        )),
        ("--timeout", dict(
            type=int,
            default=60000,
            help="Navigation timeout in milliseconds (default: 60000 = 60s).",
        )),
    )

    cli = argparse.ArgumentParser(
        description="Playwright scraper generated by the Agno agent for https://www.kaust.edu.sa/en/."
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()


async def main_async() -> None:
    """Parse the CLI options, run the crawl, and write the JSON report."""
    cli = parse_args()
    settings = ScrapeSettings(
        root_url=cli.root_url,
        max_depth=cli.max_depth,
        max_pages=cli.max_pages,
        headless=cli.headless,
        output=cli.output,
        # The CLI flag is negative (--no-verify); the setting is positive.
        verify_links=not cli.no_verify,
        request_delay=cli.delay,
        timeout=cli.timeout,
    )
    collected = await crawl(settings, browser_name=cli.browser)
    serialize(collected, settings.output, settings.root_url)


def main() -> None:
    """Synchronous entry point: run the async pipeline to completion."""
    pipeline = main_async()
    asyncio.run(pipeline)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()