# Site sampling: a lightweight Playwright-powered crawler used to prime the agent.
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from collections import deque
|
|
from collections.abc import Iterable
|
|
from dataclasses import dataclass
|
|
from urllib.parse import urldefrag, urljoin
|
|
|
|
from pydantic import HttpUrl
|
|
|
|
from .config import Settings
|
|
from .models import AnchorSample, SiteSample, SiteSummary
|
|
|
|
|
|
def _normalize_url(base: str, raw: str) -> str:
|
|
absolute = urljoin(base, raw)
|
|
cleaned, _ = urldefrag(absolute)
|
|
return cleaned
|
|
|
|
|
|
@dataclass
class SiteSampler:
    """Lightweight Playwright-powered crawler used to prime the agent."""

    # Runtime knobs read below: playwright_browser, playwright_headless,
    # navigation_timeout_ms.
    settings: Settings

    async def _gather(self, target: HttpUrl | str, max_pages: int, max_depth: int) -> SiteSummary:
        """Breadth-first crawl starting at *target*.

        Collects up to *max_pages* page samples, following anchors no deeper
        than *max_depth* links from the start page. Pages that fail to load
        are skipped silently; other per-page errors propagate.

        Raises:
            RuntimeError: if Playwright is not installed.
        """
        try:
            # Imported lazily so the module is usable without Playwright.
            from playwright.async_api import async_playwright
        except ImportError as exc:
            raise RuntimeError(
                "Playwright is required for sampling but is not installed. "
                "Install it with `uv pip install playwright` and `playwright install`."
            ) from exc

        summary = SiteSummary(target_url=target)
        visited: set[str] = set()
        # BFS frontier of (url, depth) pairs.
        queue: deque[tuple[str, int]] = deque([(str(target), 0)])

        async with async_playwright() as p:
            # Settings names the launcher attribute directly (e.g. "chromium").
            browser_launcher = getattr(p, self.settings.playwright_browser)
            browser = await browser_launcher.launch(headless=self.settings.playwright_headless)
            context = await browser.new_context()

            try:
                while queue and len(summary.pages) < max_pages:
                    url, depth = queue.popleft()
                    if url in visited or depth > max_depth:
                        continue
                    visited.add(url)
                    page = await context.new_page()
                    try:
                        try:
                            timeout = self.settings.navigation_timeout_ms
                            await page.goto(url, wait_until="domcontentloaded", timeout=timeout)
                        except Exception:
                            # Unreachable or slow pages are skipped, not fatal.
                            continue

                        title = await page.title()
                        text_timeout = self.settings.navigation_timeout_ms // 2
                        snippet = await page.inner_text("body", timeout=text_timeout)
                        snippet = snippet.replace("\n", " ").strip()
                        snippet = snippet[:500]
                        anchors_raw: Iterable[dict] = await page.eval_on_selector_all(
                            "a",
                            """elements => elements
                                .map(el => ({text: (el.textContent || '').trim(), href: el.href}))
                                .filter(item => item.href && item.text)""",
                        )
                        anchors: list[AnchorSample] = []
                        for anchor in anchors_raw:
                            # Cap at 12 anchors per page; only sampled anchors
                            # feed the crawl frontier.
                            if len(anchors) >= 12:
                                break
                            anchors.append(AnchorSample(text=anchor["text"][:80], href=anchor["href"]))
                            if depth < max_depth:
                                queue.append((_normalize_url(url, anchor["href"]), depth + 1))

                        summary.pages.append(
                            SiteSample(
                                url=url,
                                title=title or url,
                                snippet=snippet or "n/a",
                                anchors=anchors,
                            )
                        )
                    finally:
                        # Fix: the original only closed the page on the success
                        # path or a goto failure, leaking it if title()/
                        # inner_text()/eval raised. Always close it.
                        await page.close()
            finally:
                await context.close()
                await browser.close()

        # Truncated means we hit the page budget with work still queued.
        summary.truncated = len(summary.pages) >= max_pages and bool(queue)
        return summary

    def snapshot(self, target: HttpUrl | str, *, max_pages: int, max_depth: int) -> SiteSummary:
        """Collect a synchronous snapshot by spinning up an event loop if needed."""

        async def runner() -> SiteSummary:
            return await self._gather(target, max_pages=max_pages, max_depth=max_depth)

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running, so it is safe to own one.
            return asyncio.run(runner())
        # A loop IS running: asyncio.run() would fail, so fail loudly instead.
        raise RuntimeError(
            "The Playwright sampler cannot run inside an active asyncio loop. "
            "Call the private `_gather` coroutine directly if you need async integration."
        )
|