from __future__ import annotations

import asyncio
from collections import deque
from collections.abc import Iterable
from dataclasses import dataclass
from urllib.parse import urldefrag, urljoin

from pydantic import HttpUrl

from .config import Settings
from .models import AnchorSample, SiteSample, SiteSummary


def _normalize_url(base: str, raw: str) -> str:
    """Resolve a possibly-relative href against its page and drop the fragment."""
    absolute = urljoin(base, raw)
    cleaned, _ = urldefrag(absolute)
    return cleaned


@dataclass
class SiteSampler:
    """Lightweight Playwright-powered crawler used to prime the agent."""

    settings: Settings

    async def _gather(self, target: HttpUrl | str, max_pages: int, max_depth: int) -> SiteSummary:
        # Import lazily so the rest of the package works without Playwright installed.
        try:
            from playwright.async_api import async_playwright
        except ImportError as exc:
            raise RuntimeError(
                "Playwright is required for sampling but is not installed. "
                "Install it with `uv pip install playwright` and `playwright install`."
            ) from exc

        summary = SiteSummary(target_url=target)
        visited: set[str] = set()
        # Breadth-first frontier of (url, depth) pairs, seeded with the target.
        queue: deque[tuple[str, int]] = deque([(str(target), 0)])

        async with async_playwright() as p:
            browser_launcher = getattr(p, self.settings.playwright_browser)
            browser = await browser_launcher.launch(headless=self.settings.playwright_headless)
            context = await browser.new_context()
            try:
                while queue and len(summary.pages) < max_pages:
                    url, depth = queue.popleft()
                    if url in visited or depth > max_depth:
                        continue
                    visited.add(url)

                    page = await context.new_page()
                    try:
                        await page.goto(
                            url,
                            wait_until="domcontentloaded",
                            timeout=self.settings.navigation_timeout_ms,
                        )

                        title = await page.title()

                        # Grab a short plain-text snippet of the page body.
                        text_timeout = self.settings.navigation_timeout_ms // 2
                        snippet = await page.inner_text("body", timeout=text_timeout)
                        snippet = snippet.replace("\n", " ").strip()[:500]

                        # Collect anchors that have both visible text and an href.
                        anchors_raw: Iterable[dict] = await page.eval_on_selector_all(
                            "a",
                            """elements => elements
                                .map(el => ({text: (el.textContent || '').trim(), href: el.href}))
                                .filter(item => item.href && item.text)""",
                        )
                    except Exception:
                        # Sampling is best-effort: skip pages that fail to load or render.
                        continue
                    finally:
                        # Close the page on success and failure alike so tabs never leak.
                        await page.close()

                    anchors: list[AnchorSample] = []
                    for anchor in anchors_raw:
                        if len(anchors) >= 12:
                            break
                        anchors.append(AnchorSample(text=anchor["text"][:80], href=anchor["href"]))
                        # Only follow links discovered above the depth limit.
                        if depth < max_depth:
                            queue.append((_normalize_url(url, anchor["href"]), depth + 1))

                    summary.pages.append(
                        SiteSample(
                            url=url,
                            title=title or url,
                            snippet=snippet or "n/a",
                            anchors=anchors,
                        )
                    )
            finally:
                await context.close()
                await browser.close()

        # Mark the summary truncated when the page budget ran out before the frontier did.
        summary.truncated = len(summary.pages) >= max_pages and bool(queue)
        return summary

    def snapshot(self, target: HttpUrl | str, *, max_pages: int, max_depth: int) -> SiteSummary:
        """Collect a synchronous snapshot by spinning up an event loop if needed."""

        async def runner() -> SiteSummary:
            return await self._gather(target, max_pages=max_pages, max_depth=max_depth)

        # asyncio.run() cannot be called from a running event loop, so refuse
        # explicitly and point async callers at the coroutine instead.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            return asyncio.run(runner())
        raise RuntimeError(
            "The Playwright sampler cannot run inside an active asyncio loop. "
            "Call the private `_gather` coroutine directly if you need async integration."
        )
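

# --- Example usage (illustrative sketch, not executed by this module) ---
# A minimal synchronous call, assuming `Settings` accepts the fields this
# module reads (`playwright_browser`, `playwright_headless`,
# `navigation_timeout_ms`); the `myagent` import path below is hypothetical.
#
#     from myagent.config import Settings
#     from myagent.sampler import SiteSampler
#
#     settings = Settings(
#         playwright_browser="chromium",  # any launcher Playwright exposes
#         playwright_headless=True,
#         navigation_timeout_ms=15_000,
#     )
#     sampler = SiteSampler(settings=settings)
#     summary = sampler.snapshot("https://example.com", max_pages=5, max_depth=1)
#     for sample in summary.pages:
#         print(sample.url, "-", sample.title)
#
# From async code, await the coroutine directly instead of calling `snapshot`:
#
#     summary = await sampler._gather("https://example.com", max_pages=5, max_depth=1)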