University-Playwright-Codeg…/src/university_agent/sampler.py

from __future__ import annotations

import asyncio
from collections import deque
from collections.abc import Iterable
from dataclasses import dataclass
from urllib.parse import urldefrag, urljoin

from pydantic import HttpUrl

from .config import Settings
from .models import AnchorSample, SiteSample, SiteSummary


def _normalize_url(base: str, raw: str) -> str:
    """Resolve `raw` against `base` and strip any URL fragment."""
    absolute = urljoin(base, raw)
    cleaned, _ = urldefrag(absolute)
    return cleaned


@dataclass
class SiteSampler:
    """Lightweight Playwright-powered crawler used to prime the agent."""

    settings: Settings

    async def _gather(self, target: HttpUrl | str, max_pages: int, max_depth: int) -> SiteSummary:
        try:
            from playwright.async_api import async_playwright
        except ImportError as exc:
            raise RuntimeError(
                "Playwright is required for sampling but is not installed. "
                "Install it with `uv pip install playwright` and `playwright install`."
            ) from exc

        summary = SiteSummary(target_url=target)
        visited: set[str] = set()
        queue: deque[tuple[str, int]] = deque([(str(target), 0)])

        async with async_playwright() as p:
            browser_launcher = getattr(p, self.settings.playwright_browser)
            browser = await browser_launcher.launch(headless=self.settings.playwright_headless)
            context = await browser.new_context()
            try:
                # Breadth-first crawl, bounded by the page budget and depth limit.
                while queue and len(summary.pages) < max_pages:
                    url, depth = queue.popleft()
                    if url in visited or depth > max_depth:
                        continue
                    visited.add(url)
                    page = await context.new_page()
                    try:
                        timeout = self.settings.navigation_timeout_ms
                        await page.goto(url, wait_until="domcontentloaded", timeout=timeout)
                    except Exception:
                        # Skip unreachable or slow pages instead of aborting the crawl.
                        await page.close()
                        continue
                    title = await page.title()
                    text_timeout = self.settings.navigation_timeout_ms // 2
                    snippet = await page.inner_text("body", timeout=text_timeout)
                    snippet = snippet.replace("\n", " ").strip()[:500]
                    anchors_raw: Iterable[dict] = await page.eval_on_selector_all(
                        "a",
                        """elements => elements
                            .map(el => ({text: (el.textContent || '').trim(), href: el.href}))
                            .filter(item => item.href && item.text)""",
                    )
                    # Keep at most 12 anchors per page; enqueue their targets for
                    # further crawling while still below the depth limit.
                    anchors: list[AnchorSample] = []
                    for anchor in anchors_raw:
                        if len(anchors) >= 12:
                            break
                        anchors.append(AnchorSample(text=anchor["text"][:80], href=anchor["href"]))
                        if depth < max_depth:
                            queue.append((_normalize_url(url, anchor["href"]), depth + 1))
                    summary.pages.append(
                        SiteSample(
                            url=url,
                            title=title or url,
                            snippet=snippet or "n/a",
                            anchors=anchors,
                        )
                    )
                    await page.close()
            finally:
                await context.close()
                await browser.close()

        summary.truncated = len(summary.pages) >= max_pages and bool(queue)
        return summary

    def snapshot(self, target: HttpUrl | str, *, max_pages: int, max_depth: int) -> SiteSummary:
        """Collect a synchronous snapshot by spinning up an event loop if needed."""

        async def runner() -> SiteSummary:
            return await self._gather(target, max_pages=max_pages, max_depth=max_depth)

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No event loop is running, so it is safe to start one.
            return asyncio.run(runner())
        raise RuntimeError(
            "The Playwright sampler cannot run inside an active asyncio loop. "
            "Call the private `_gather` coroutine directly if you need async integration."
        )