Initial commit: University Playwright Codegen Agent
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
107
src/university_agent/sampler.py
Normal file
107
src/university_agent/sampler.py
Normal file
@ -0,0 +1,107 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from collections import deque
|
||||
from collections.abc import Iterable
|
||||
from dataclasses import dataclass
|
||||
from urllib.parse import urldefrag, urljoin
|
||||
|
||||
from pydantic import HttpUrl
|
||||
|
||||
from .config import Settings
|
||||
from .models import AnchorSample, SiteSample, SiteSummary
|
||||
|
||||
|
||||
def _normalize_url(base: str, raw: str) -> str:
    """Resolve *raw* against *base* and return it without its ``#fragment``.

    Relative references are joined onto *base*; absolute ones replace it.
    """
    return urldefrag(urljoin(base, raw)).url
|
||||
|
||||
|
||||
@dataclass
class SiteSampler:
    """Lightweight Playwright-powered crawler used to prime the agent.

    Starting from a target URL the sampler follows links breadth-first and
    records, for every page that loads, its title, a short text snippet and
    the first few anchors found on it.

    Attributes:
        settings: Supplies ``playwright_browser``, ``playwright_headless`` and
            ``navigation_timeout_ms``.
    """

    settings: Settings

    # Deliberately un-annotated so ``@dataclass`` treats these as plain class
    # constants instead of turning them into ``__init__`` fields.
    _MAX_ANCHORS_PER_PAGE = 12
    _MAX_ANCHOR_TEXT_CHARS = 80
    _MAX_SNIPPET_CHARS = 500

    async def _sample_page(self, page, url: str) -> tuple[SiteSample, list[str]] | None:
        """Load *url* in *page* and extract its sample.

        Args:
            page: An open Playwright page; the caller owns and closes it.
            url: Address to navigate to.

        Returns:
            ``None`` when navigation fails. Otherwise a pair of the page's
            ``SiteSample`` and the raw hrefs of the anchors kept in it, in the
            order the page returned them.
        """
        nav_timeout = self.settings.navigation_timeout_ms
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=nav_timeout)
        except Exception:
            # Best-effort crawl: a page that cannot be loaded is skipped
            # rather than aborting the whole run.
            return None

        title = await page.title()
        # Reading the body text gets half of the navigation budget.
        body_text = await page.inner_text("body", timeout=nav_timeout // 2)
        snippet = body_text.replace("\n", " ").strip()[: self._MAX_SNIPPET_CHARS]

        anchors_raw: Iterable[dict] = await page.eval_on_selector_all(
            "a",
            """elements => elements
                .map(el => ({text: (el.textContent || '').trim(), href: el.href}))
                .filter(item => item.href && item.text)""",
        )

        anchors: list[AnchorSample] = []
        hrefs: list[str] = []
        for raw in anchors_raw:
            if len(anchors) >= self._MAX_ANCHORS_PER_PAGE:
                break
            anchors.append(
                AnchorSample(text=raw["text"][: self._MAX_ANCHOR_TEXT_CHARS], href=raw["href"])
            )
            hrefs.append(raw["href"])

        sample = SiteSample(
            url=url,
            title=title or url,
            snippet=snippet or "n/a",
            anchors=anchors,
        )
        return sample, hrefs

    async def _gather(self, target: HttpUrl | str, max_pages: int, max_depth: int) -> SiteSummary:
        """Crawl breadth-first from *target* and return the collected samples.

        Args:
            target: URL the crawl starts from (depth 0).
            max_pages: Stop once this many pages have been sampled.
            max_depth: Links found on pages at this depth are not followed.

        Returns:
            A ``SiteSummary`` whose ``pages`` holds one ``SiteSample`` per page
            that loaded. ``truncated`` is set only when the page budget ran out
            while URLs that would still have been crawled remained queued.

        Raises:
            RuntimeError: If Playwright is not installed.
        """
        try:
            from playwright.async_api import async_playwright
        except ImportError as exc:
            raise RuntimeError(
                "Playwright is required for sampling but is not installed. "
                "Install it with `uv pip install playwright` and `playwright install`."
            ) from exc

        summary = SiteSummary(target_url=target)
        visited: set[str] = set()
        queue: deque[tuple[str, int]] = deque([(str(target), 0)])

        async with async_playwright() as p:
            browser_launcher = getattr(p, self.settings.playwright_browser)
            browser = await browser_launcher.launch(headless=self.settings.playwright_headless)
            try:
                context = await browser.new_context()
                try:
                    while queue and len(summary.pages) < max_pages:
                        url, depth = queue.popleft()
                        if url in visited or depth > max_depth:
                            continue
                        visited.add(url)

                        page = await context.new_page()
                        try:
                            result = await self._sample_page(page, url)
                        finally:
                            # Close the tab even when extraction raised.
                            await page.close()
                        if result is None:
                            continue

                        sample, hrefs = result
                        summary.pages.append(sample)
                        if depth < max_depth:
                            for href in hrefs:
                                link = _normalize_url(url, href)
                                # Already-crawled links would only be popped
                                # and discarded later; keep the queue small.
                                if link not in visited:
                                    queue.append((link, depth + 1))
                finally:
                    await context.close()
            finally:
                # Runs even if creating or closing the context failed.
                await browser.close()

        # Leftover queue entries only count as "more to crawl" if the loop
        # above would actually have processed them; otherwise links back to
        # already-sampled pages would flag every small site as truncated.
        budget_exhausted = len(summary.pages) >= max_pages
        summary.truncated = budget_exhausted and any(
            pending_url not in visited and pending_depth <= max_depth
            for pending_url, pending_depth in queue
        )
        return summary

    def snapshot(self, target: HttpUrl | str, *, max_pages: int, max_depth: int) -> SiteSummary:
        """Run the crawl to completion on a fresh event loop and return its summary.

        Args:
            target: URL the crawl starts from.
            max_pages: Maximum number of pages to sample.
            max_depth: Maximum link depth to follow from *target*.

        Raises:
            RuntimeError: If an asyncio event loop is already running in the
                calling thread, or if Playwright is not installed.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            loop_running = False
        else:
            loop_running = True

        if loop_running:
            raise RuntimeError(
                "The Playwright sampler cannot run inside an active asyncio loop. "
                "Call the private `_gather` coroutine directly if you need async integration."
            )

        # Started outside the ``except`` block so crawl errors are not chained
        # to the unrelated "no running event loop" exception.
        return asyncio.run(self._gather(target, max_pages=max_pages, max_depth=max_depth))
|
||||
Reference in New Issue
Block a user