# Site sampling: a lightweight Playwright-powered crawler used to prime the agent.
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from collections import deque
|
|
from collections.abc import Iterable
|
|
from dataclasses import dataclass
|
|
from urllib.parse import urldefrag, urljoin
|
|
|
|
from pydantic import HttpUrl
|
|
|
|
from .config import Settings
|
|
from .models import AnchorSample, SiteSample, SiteSummary
|
|
|
|
|
|
def _normalize_url(base: str, raw: str) -> str:
|
|
absolute = urljoin(base, raw)
|
|
cleaned, _ = urldefrag(absolute)
|
|
return cleaned
|
|
|
|
|
|
@dataclass
class SiteSampler:
    """Lightweight Playwright-powered crawler used to prime the agent."""

    # Runtime knobs read below: playwright_browser, playwright_headless,
    # navigation_timeout_ms.
    settings: Settings

    async def _gather(self, target: HttpUrl | str, max_pages: int, max_depth: int) -> SiteSummary:
        """Breadth-first crawl starting at *target*.

        Collects up to *max_pages* page samples, following anchors no deeper
        than *max_depth* links from the start page. Pages that fail to load
        are skipped silently; other per-page errors propagate.

        Raises:
            RuntimeError: if Playwright is not installed.
        """
        try:
            # Imported lazily so the module is usable without Playwright.
            from playwright.async_api import async_playwright
        except ImportError as exc:
            raise RuntimeError(
                "Playwright is required for sampling but is not installed. "
                "Install it with `uv pip install playwright` and `playwright install`."
            ) from exc

        summary = SiteSummary(target_url=target)
        visited: set[str] = set()
        # BFS frontier of (url, depth) pairs.
        queue: deque[tuple[str, int]] = deque([(str(target), 0)])

        async with async_playwright() as p:
            # Settings names the launcher attribute directly (e.g. "chromium").
            browser_launcher = getattr(p, self.settings.playwright_browser)
            browser = await browser_launcher.launch(headless=self.settings.playwright_headless)
            context = await browser.new_context()

            try:
                while queue and len(summary.pages) < max_pages:
                    url, depth = queue.popleft()
                    if url in visited or depth > max_depth:
                        continue
                    visited.add(url)
                    page = await context.new_page()
                    try:
                        try:
                            timeout = self.settings.navigation_timeout_ms
                            await page.goto(url, wait_until="domcontentloaded", timeout=timeout)
                        except Exception:
                            # Unreachable or slow pages are skipped, not fatal.
                            continue

                        title = await page.title()
                        text_timeout = self.settings.navigation_timeout_ms // 2
                        snippet = await page.inner_text("body", timeout=text_timeout)
                        snippet = snippet.replace("\n", " ").strip()
                        snippet = snippet[:500]
                        anchors_raw: Iterable[dict] = await page.eval_on_selector_all(
                            "a",
                            """elements => elements
                                .map(el => ({text: (el.textContent || '').trim(), href: el.href}))
                                .filter(item => item.href && item.text)""",
                        )
                        anchors: list[AnchorSample] = []
                        for anchor in anchors_raw:
                            # Cap at 12 anchors per page; only sampled anchors
                            # feed the crawl frontier.
                            if len(anchors) >= 12:
                                break
                            anchors.append(AnchorSample(text=anchor["text"][:80], href=anchor["href"]))
                            if depth < max_depth:
                                queue.append((_normalize_url(url, anchor["href"]), depth + 1))

                        summary.pages.append(
                            SiteSample(
                                url=url,
                                title=title or url,
                                snippet=snippet or "n/a",
                                anchors=anchors,
                            )
                        )
                    finally:
                        # Fix: the original only closed the page on the success
                        # path or a goto failure, leaking it if title()/
                        # inner_text()/eval raised. Always close it.
                        await page.close()
            finally:
                await context.close()
                await browser.close()

        # Truncated means we hit the page budget with work still queued.
        summary.truncated = len(summary.pages) >= max_pages and bool(queue)
        return summary

    def snapshot(self, target: HttpUrl | str, *, max_pages: int, max_depth: int) -> SiteSummary:
        """Collect a synchronous snapshot by spinning up an event loop if needed."""

        async def runner() -> SiteSummary:
            return await self._gather(target, max_pages=max_pages, max_depth=max_depth)

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running, so it is safe to own one.
            return asyncio.run(runner())
        # A loop IS running: asyncio.run() would fail, so fail loudly instead.
        raise RuntimeError(
            "The Playwright sampler cannot run inside an active asyncio loop. "
            "Call the private `_gather` coroutine directly if you need async integration."
        )
|