Rename test script and update documentation
- Rename test_rwth.py to generate_scraper.py with CLI arguments
- Update README.md with comprehensive usage guide
- Add Harvard scraper as example output
- Document troubleshooting tips for common issues

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
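For a quick smoke test of the bundled artifact, the Harvard scraper's CLI (defined in `parse_args()` below) can be driven as in this minimal sketch — it assumes Playwright and its browser binaries are installed and that the command is run from the repository root; the flag values are illustrative only:

```python
# Minimal sketch: run the bundled Harvard artifact with a few of its documented
# flags. Assumes Playwright (and its browser binaries) are installed and that
# this is executed from the repository root; the flag values are illustrative.
import subprocess
import sys

subprocess.run(
    [
        sys.executable,
        "artifacts/harvard_faculty_scraper.py",
        "--max-depth", "2",      # stay shallow for a quick test
        "--max-pages", "10",
        "--output", "harvard_results.json",
        "--no-verify",           # skip the second verification pass
    ],
    check=True,
)
```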
artifacts/harvard_faculty_scraper.py (new file, 437 lines)
@@ -0,0 +1,437 @@
#!/usr/bin/env python
"""
Auto-generated by the Agno codegen agent.
Target university: Harvard (https://www.harvard.edu/)
Requested caps: depth=3, pages=30

Plan description: Playwright scraper for university master programs and faculty profiles.

Navigation strategy:
- Start at https://www.harvard.edu/
- Follow links to /academics/ and /a-to-z/ to find list of schools and departments
- For each school/department, look for a 'faculty' or 'people' page
- On faculty directory pages, identify and follow links to individual profiles
- Check for school/department specific subdomains like hls.harvard.edu, hds.harvard.edu, etc.
- Prioritize crawling faculty directory pages over general site crawling

Verification checklist:
- Manually review a sample of scraped URLs to verify they are faculty profiles
- Check that major academic departments are represented in the results
- Verify the script is capturing profile page content, not just URLs
- Confirm no login pages, application forms, or directory pages are included

Playwright snapshot used to guide this plan:
1. Harvard University (https://www.harvard.edu/)
   Snippet: Skip to main content Harvard University Learn about our lawsuits to protect our students and researchers Search Menu David Liu received the 2025 Breakthrough Prize in Life Sciences for developing a revolutionary gene-editing platforms that precisely corrects genetic mutations.
   Anchors: Skip to main content -> https://www.harvard.edu/#main-content, Harvard University -> https://www.harvard.edu/, Learn about our lawsuits to protect our students and researchers -> https://www.harvard.edu/federal-lawsuits/, × -> javascript:void(0), A to Z index -> https://www.harvard.edu/a-to-z/, Academics -> https://www.harvard.edu/academics/
2. Index of departments, schools, and affiliates - Harvard University (https://www.harvard.edu/a-to-z/)
   Snippet: Skip to main content Harvard University Learn about our lawsuits to protect our students and researchers Search Menu David Liu received the 2025 Breakthrough Prize in Life Sciences for developing a revolutionary gene-editing platforms that precisely corrects genetic mutations.
   Anchors: Skip to main content -> https://www.harvard.edu/a-to-z/#main-content, Harvard University -> https://www.harvard.edu/, Learn about our lawsuits to protect our students and researchers -> https://www.harvard.edu/federal-lawsuits/, × -> javascript:void(0), A to Z index -> https://www.harvard.edu/a-to-z/, Academics -> https://www.harvard.edu/academics/
3. Academics - Harvard University (https://www.harvard.edu/academics/)
   Snippet: Skip to main content Harvard University Learn about our lawsuits to protect our students and researchers Search Menu David Liu received the 2025 Breakthrough Prize in Life Sciences for developing a revolutionary gene-editing platforms that precisely corrects genetic mutations.
   Anchors: Skip to main content -> https://www.harvard.edu/academics/#main-content, Harvard University -> https://www.harvard.edu/, Learn about our lawsuits to protect our students and researchers -> https://www.harvard.edu/federal-lawsuits/, A to Z index -> https://www.harvard.edu/a-to-z/, Academics -> https://www.harvard.edu/academics/, Undergraduate Degrees -> https://www.harvard.edu//programs/?degree_levels=undergraduate
4. Programs - Harvard University (https://www.harvard.edu//programs/?degree_levels=undergraduate)
   Snippet: Skip to main content Harvard University Learn about our lawsuits to protect our students and researchers Search Menu David Liu received the 2025 Breakthrough Prize in Life Sciences for developing a revolutionary gene-editing platforms that precisely corrects genetic mutations.
   Anchors: Skip to main content -> https://www.harvard.edu/programs/?degree_levels=undergraduate#main-content, Harvard University -> https://www.harvard.edu/, Learn about our lawsuits to protect our students and researchers -> https://www.harvard.edu/federal-lawsuits/, A to Z index -> https://www.harvard.edu/a-to-z/, Academics -> https://www.harvard.edu/academics/, Undergraduate Degrees -> https://www.harvard.edu//programs/?degree_levels=undergraduate
Snapshot truncated.

Generated at: 2025-12-10T07:19:12.294884+00:00
"""

from __future__ import annotations

import argparse
import asyncio
import json
import time
from collections import deque
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Deque, Iterable, List, Set, Tuple
from urllib.parse import urljoin, urldefrag, urlparse

from playwright.async_api import async_playwright, Page, Response

PROGRAM_KEYWORDS = ['/graduate/', '/masters/', '/programs/?degree_levels=graduate', '/mpp/', 'Master of', 'M.S.', 'M.A.', 'graduate program']
FACULTY_KEYWORDS = ['/people/', '/~', '/faculty/', '/profile/', 'professor', 'dr.', 'ph.d.', 'firstname-lastname']
EXCLUSION_KEYWORDS = ['admissions', 'apply', 'tuition', 'news', 'events', 'calendar', 'careers', 'jobs', 'login', 'donate', 'alumni', 'giving']
METADATA_FIELDS = ['url', 'title', 'entity_type', 'department', 'email', 'scraped_at']
EXTRA_NOTES = ['Many Harvard faculty have profiles under the /~username/ URL pattern', 'Some faculty may be cross-listed in multiple departments', 'Prioritize finding profiles from professional schools (business, law, medicine, etc.)', "Check for non-standard faculty titles like 'lecturer', 'fellow', 'researcher'"]

# URL patterns that indicate individual profile pages
PROFILE_URL_PATTERNS = [
    "/people/", "/person/", "/profile/", "/profiles/",
    "/faculty/", "/staff/", "/directory/",
    "/~",  # Unix-style personal pages
    "/bio/", "/about/",
]

# URL patterns that indicate listing/directory pages (should be crawled deeper)
DIRECTORY_URL_PATTERNS = [
    "/faculty", "/people", "/directory", "/staff",
    "/team", "/members", "/researchers",
]


def normalize_url(base: str, href: str) -> str:
    """Normalize URL by resolving relative paths and removing fragments."""
    absolute = urljoin(base, href)
    cleaned, _ = urldefrag(absolute)
    # Remove trailing slash for consistency
    return cleaned.rstrip("/")


def matches_any(text: str, keywords: Iterable[str]) -> bool:
    """Check if text contains any of the keywords (case-insensitive)."""
    lowered = text.lower()
    return any(keyword.lower() in lowered for keyword in keywords)


def is_same_domain(url1: str, url2: str) -> bool:
    """Check if two URLs belong to the same root domain."""
    domain1 = urlparse(url1).netloc.replace("www.", "")
    domain2 = urlparse(url2).netloc.replace("www.", "")
    # Allow subdomains of the same root domain
    parts1 = domain1.split(".")
    parts2 = domain2.split(".")
    if len(parts1) >= 2 and len(parts2) >= 2:
        return parts1[-2:] == parts2[-2:]
    return domain1 == domain2


def is_profile_url(url: str) -> bool:
    """Check if URL pattern suggests an individual profile page."""
    url_lower = url.lower()
    return any(pattern in url_lower for pattern in PROFILE_URL_PATTERNS)


def is_directory_url(url: str) -> bool:
    """Check if URL pattern suggests a directory/listing page."""
    url_lower = url.lower()
    return any(pattern in url_lower for pattern in DIRECTORY_URL_PATTERNS)


@dataclass
class ScrapedLink:
    url: str
    title: str
    text: str
    source_url: str
    bucket: str  # "program" or "faculty"
    is_verified: bool = False
    http_status: int = 0
    is_profile_page: bool = False
    scraped_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())


@dataclass
class ScrapeSettings:
    root_url: str
    max_depth: int
    max_pages: int
    headless: bool
    output: Path
    verify_links: bool = True
    request_delay: float = 1.0  # Polite crawling delay


async def extract_links(page: Page) -> List[Tuple[str, str]]:
    """Extract all anchor links from the page."""
    anchors: Iterable[dict] = await page.eval_on_selector_all(
        "a",
        """elements => elements
            .map(el => ({text: (el.textContent || '').trim(), href: el.href}))
            .filter(item => item.text && item.href && item.href.startsWith('http'))""",
    )
    return [(item["href"], item["text"]) for item in anchors]


async def get_page_title(page: Page) -> str:
    """Get the page title safely."""
    try:
        return await page.title() or ""
    except Exception:
        return ""


async def verify_link(context, url: str, timeout: int = 10000) -> Tuple[bool, int, str]:
    """
    Verify a link by making a HEAD-like request.
    Returns: (is_valid, status_code, page_title)
    """
    page = await context.new_page()
    try:
        response: Response = await page.goto(url, wait_until="domcontentloaded", timeout=timeout)
        if response:
            status = response.status
            title = await get_page_title(page)
            is_valid = 200 <= status < 400
            return is_valid, status, title
        return False, 0, ""
    except Exception:
        return False, 0, ""
    finally:
        await page.close()


async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink]:
    """
    Crawl the website using BFS, collecting program and faculty links.
    Features:
    - URL deduplication
    - Link verification
    - Profile page detection
    - Polite crawling with delays
    """
    async with async_playwright() as p:
        browser_launcher = getattr(p, browser_name)
        browser = await browser_launcher.launch(headless=settings.headless)
        context = await browser.new_context()

        # Priority queue: (priority, url, depth) - lower priority = processed first
        # Directory pages get priority 0, others get priority 1
        queue: Deque[Tuple[int, str, int]] = deque([(0, settings.root_url, 0)])
        visited: Set[str] = set()
        found_urls: Set[str] = set()  # For deduplication of results
        results: List[ScrapedLink] = []

        print(f"Starting crawl from: {settings.root_url}")
        print(f"Max depth: {settings.max_depth}, Max pages: {settings.max_pages}")

        try:
            while queue and len(visited) < settings.max_pages:
                # Sort queue by priority (directory pages first)
                queue = deque(sorted(queue, key=lambda x: x[0]))
                priority, url, depth = queue.popleft()

                normalized_url = normalize_url(settings.root_url, url)
                if normalized_url in visited or depth > settings.max_depth:
                    continue

                # Only crawl same-domain URLs
                if not is_same_domain(settings.root_url, normalized_url):
                    continue

                visited.add(normalized_url)
                print(f"[{len(visited)}/{settings.max_pages}] Depth {depth}: {normalized_url[:80]}...")

                page = await context.new_page()
                try:
                    response = await page.goto(
                        normalized_url, wait_until="domcontentloaded", timeout=20000
                    )
                    if not response or response.status >= 400:
                        await page.close()
                        continue
                except Exception as e:
                    print(f"  Error: {e}")
                    await page.close()
                    continue

                page_title = await get_page_title(page)
                links = await extract_links(page)

                for href, text in links:
                    normalized_href = normalize_url(normalized_url, href)

                    # Skip if already found or is excluded
                    if normalized_href in found_urls:
                        continue
                    if matches_any(text, EXCLUSION_KEYWORDS) or matches_any(normalized_href, EXCLUSION_KEYWORDS):
                        continue

                    text_lower = text.lower()
                    href_lower = normalized_href.lower()
                    is_profile = is_profile_url(normalized_href)

                    # Check for program links
                    if matches_any(text_lower, PROGRAM_KEYWORDS) or matches_any(href_lower, PROGRAM_KEYWORDS):
                        found_urls.add(normalized_href)
                        results.append(
                            ScrapedLink(
                                url=normalized_href,
                                title="",
                                text=text[:200],
                                source_url=normalized_url,
                                bucket="program",
                                is_profile_page=False,
                            )
                        )

                    # Check for faculty links
                    if matches_any(text_lower, FACULTY_KEYWORDS) or matches_any(href_lower, FACULTY_KEYWORDS):
                        found_urls.add(normalized_href)
                        results.append(
                            ScrapedLink(
                                url=normalized_href,
                                title="",
                                text=text[:200],
                                source_url=normalized_url,
                                bucket="faculty",
                                is_profile_page=is_profile,
                            )
                        )

                    # Queue for further crawling
                    if depth < settings.max_depth and is_same_domain(settings.root_url, normalized_href):
                        # Prioritize directory pages
                        link_priority = 0 if is_directory_url(normalized_href) else 1
                        queue.append((link_priority, normalized_href, depth + 1))

                await page.close()

                # Polite delay between requests
                await asyncio.sleep(settings.request_delay)

        finally:
            await context.close()
            await browser.close()

        # Verify links if enabled
        if settings.verify_links and results:
            print(f"\nVerifying {len(results)} links...")
            browser = await browser_launcher.launch(headless=True)
            context = await browser.new_context()

            verified_results = []
            for i, link in enumerate(results):
                if link.url in [r.url for r in verified_results]:
                    continue  # Skip duplicates

                print(f"  [{i+1}/{len(results)}] Verifying: {link.url[:60]}...")
                is_valid, status, title = await verify_link(context, link.url)
                link.is_verified = True
                link.http_status = status
                link.title = title or link.text

                if is_valid:
                    verified_results.append(link)
                else:
                    print(f"    Invalid (HTTP {status})")

                await asyncio.sleep(0.5)  # Delay between verifications

            await context.close()
            await browser.close()
            results = verified_results

        return results


def deduplicate_results(results: List[ScrapedLink]) -> List[ScrapedLink]:
    """Remove duplicate URLs, keeping the first occurrence."""
    seen: Set[str] = set()
    unique = []
    for link in results:
        if link.url not in seen:
            seen.add(link.url)
            unique.append(link)
    return unique


def serialize(results: List[ScrapedLink], target: Path, root_url: str) -> None:
    """Save results to JSON file with statistics."""
    results = deduplicate_results(results)

    program_links = [link for link in results if link.bucket == "program"]
    faculty_links = [link for link in results if link.bucket == "faculty"]
    profile_pages = [link for link in faculty_links if link.is_profile_page]

    payload = {
        "root_url": root_url,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "statistics": {
            "total_links": len(results),
            "program_links": len(program_links),
            "faculty_links": len(faculty_links),
            "profile_pages": len(profile_pages),
            "verified_links": len([r for r in results if r.is_verified and r.http_status == 200]),
        },
        "program_links": [asdict(link) for link in program_links],
        "faculty_links": [asdict(link) for link in faculty_links],
        "notes": EXTRA_NOTES,
        "metadata_fields": METADATA_FIELDS,
    }
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")

    print(f"\nResults saved to: {target}")
    print(f"  Total links: {len(results)}")
    print(f"  Program links: {len(program_links)}")
    print(f"  Faculty links: {len(faculty_links)}")
    print(f"  Profile pages: {len(profile_pages)}")


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Playwright scraper generated by the Agno agent for https://www.harvard.edu/."
    )
    parser.add_argument(
        "--root-url",
        default="https://www.harvard.edu/",
        help="Seed url to start crawling from.",
    )
    parser.add_argument(
        "--max-depth",
        type=int,
        default=3,
        help="Maximum crawl depth.",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=30,
        help="Maximum number of pages to visit.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("university-scraper_results.json"),
        help="Where to save the JSON output.",
    )
    parser.add_argument(
        "--headless",
        action="store_true",
        default=True,
        help="Run browser in headless mode (default: True).",
    )
    parser.add_argument(
        "--no-headless",
        action="store_false",
        dest="headless",
        help="Run browser with visible window.",
    )
    parser.add_argument(
        "--browser",
        choices=["chromium", "firefox", "webkit"],
        default="chromium",
        help="Browser engine to launch via Playwright.",
    )
    parser.add_argument(
        "--no-verify",
        action="store_true",
        default=False,
        help="Skip link verification step.",
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=1.0,
        help="Delay between requests in seconds (polite crawling).",
    )
    return parser.parse_args()


async def main_async() -> None:
    args = parse_args()
    settings = ScrapeSettings(
        root_url=args.root_url,
        max_depth=args.max_depth,
        max_pages=args.max_pages,
        headless=args.headless,
        output=args.output,
        verify_links=not args.no_verify,
        request_delay=args.delay,
    )
    links = await crawl(settings, browser_name=args.browser)
    serialize(links, settings.output, settings.root_url)


def main() -> None:
    asyncio.run(main_async())


if __name__ == "__main__":
    main()
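The JSON written by serialize() groups results under "program_links" and "faculty_links", with each entry mirroring the ScrapedLink dataclass. A minimal sketch of consuming that output, assuming the default --output path and that verification was left enabled:

```python
# Minimal sketch: load the scraper's JSON output and list verified faculty
# profile pages. Assumes the default output path and that --no-verify was not
# used (otherwise http_status stays 0).
import json
from pathlib import Path

payload = json.loads(Path("university-scraper_results.json").read_text(encoding="utf-8"))

print("Crawl statistics:", payload["statistics"])
for link in payload["faculty_links"]:
    if link["is_profile_page"] and link["http_status"] == 200:
        print(f'{link["title"] or link["text"]} -> {link["url"]}')
```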