Add OpenRouter support and improve JSON parsing robustness
- Add OpenRouter as third LLM provider option in config.py
- Implement _extract_json() to handle markdown-wrapped JSON responses
- Add default values for missing required fields in ScriptPlan
- Handle navigation_strategy as list or string
- Add .env.example with configuration templates
- Add test script and sample generated scrapers for RWTH and KAUST

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
435
artifacts/kaust_faculty_scraper.py
Normal file
435
artifacts/kaust_faculty_scraper.py
Normal file
@ -0,0 +1,435 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Auto-generated by the Agno codegen agent.
|
||||
Target university: KAUST (https://www.kaust.edu.sa/en/)
|
||||
Requested caps: depth=3, pages=30
|
||||
|
||||
Plan description: Playwright scraper for university master programs and faculty profiles.
|
||||
Navigation strategy: Start at https://www.kaust.edu.sa/en/. Navigate to /study/ to find degree program links. Follow links to individual degree pages under /degree-programs/. Separately, look for links to /faculty/ or /people/ directories. Crawl faculty directories to extract links to individual bio pages. Individual faculty are often under a subdomain like bio.kaust.edu.sa.
|
||||
Verification checklist:
|
||||
- Verify master's programs are under /study/ or /degree-programs/
|
||||
- Check that faculty directory pages contain links to individual bios
|
||||
- Confirm individual faculty pages have research/expertise details
|
||||
- Ensure exclusion keywords successfully skip irrelevant pages
|
||||
Playwright snapshot used to guide this plan:
|
||||
No browser snapshot was captured.
|
||||
|
||||
Generated at: 2025-12-10T02:48:42.571899+00:00
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import time
|
||||
from collections import deque
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Deque, Iterable, List, Set, Tuple
|
||||
from urllib.parse import urljoin, urldefrag, urlparse
|
||||
|
||||
from playwright.async_api import async_playwright, Page, Response
|
||||
|
||||
# Keyword lists used to classify links. They are compared against both the
# anchor text and the link URL via matches_any(), i.e. as case-insensitive
# substrings.
PROGRAM_KEYWORDS = [
    '/study/',
    '/degree-programs/',
    '/academics/',
    'M.Sc.',
    'Master of Science',
    'graduate program',
]
FACULTY_KEYWORDS = [
    '/people/',
    '/profiles/faculty/',
    'Professor',
    'faculty-member',
    # NOTE(review): looks like a template placeholder rather than a real
    # URL fragment -- confirm it is intended.
    '/faculty/firstname-lastname',
    'bio.kaust.edu.sa',
]
EXCLUSION_KEYWORDS = [
    '/admissions/',
    '/apply/',
    '/tuition/',
    '/events/',
    '/news/',
    '/careers/',
    '/jobs/',
    '/login/',
    '/alumni/',
    '/giving/',
    'inquiry.kaust.edu.sa',
]

# Copied verbatim into the JSON report written by serialize().
METADATA_FIELDS = [
    'url',
    'title',
    'entity_type',
    'department',
    'email',
    'scraped_at',
]
EXTRA_NOTES = [
    'Many faculty are listed under a separate subdomain bio.kaust.edu.sa',
    'Prioritize crawling the centralized faculty directory first',
    'Alumni and affiliated faculty may not have full profile pages',
]

# URL fragments that indicate individual profile pages.
PROFILE_URL_PATTERNS = [
    "/people/",
    "/person/",
    "/profile/",
    "/profiles/",
    "/faculty/",
    "/staff/",
    "/directory/",
    "/~",  # Unix-style personal pages
    "/bio/",
    "/about/",
]

# URL fragments that indicate listing/directory pages (crawled first).
DIRECTORY_URL_PATTERNS = [
    "/faculty",
    "/people",
    "/directory",
    "/staff",
    "/team",
    "/members",
    "/researchers",
]
|
||||
|
||||
|
||||
def normalize_url(base: str, href: str) -> str:
    """Return *href* as an absolute URL in the crawler's canonical form.

    *href* is resolved against *base*, the ``#fragment`` is dropped, and all
    trailing slashes are stripped so equivalent spellings compare equal.
    """
    without_fragment = urldefrag(urljoin(base, href)).url
    return without_fragment.rstrip("/")
|
||||
|
||||
|
||||
def matches_any(text: str, keywords: Iterable[str]) -> bool:
    """Return True when at least one keyword occurs in *text*, ignoring case.

    Matching is plain substring containment after lower-casing both sides.
    """
    haystack = text.lower()
    for needle in keywords:
        if needle.lower() in haystack:
            return True
    return False
|
||||
|
||||
|
||||
# Second-level labels that commonly sit directly under a two-letter country
# TLD (e.g. ``edu.sa``, ``ac.uk``, ``co.jp``). This is a heuristic, not the
# full Public Suffix List.
_SECOND_LEVEL_LABELS = frozenset({"ac", "co", "com", "edu", "gov", "net", "org"})


def _registrable_domain(url: str) -> str:
    """Return a best-effort registrable domain for *url* ("" if it has no host)."""
    # ``hostname`` (unlike ``netloc``) is lower-cased and excludes any port
    # or user-info part.
    host = (urlparse(url).hostname or "").rstrip(".")
    if host.startswith("www."):
        host = host[len("www."):]

    labels = host.split(".")
    # Single-label hosts and dotted IPv4 addresses have no parent domain;
    # compare them as a whole.
    if len(labels) < 2 or labels[-1].isdigit():
        return host

    # Under suffixes such as ``edu.sa`` the organisation is the third label
    # from the right, so keep three labels instead of two.
    under_country_suffix = (
        len(labels) >= 3
        and len(labels[-1]) == 2
        and labels[-2] in _SECOND_LEVEL_LABELS
    )
    return ".".join(labels[-3:] if under_country_suffix else labels[-2:])


def is_same_domain(url1: str, url2: str) -> bool:
    """Check whether two URLs belong to the same site, allowing subdomains.

    Hosts are compared case-insensitively, without port and without a leading
    ``www.``. Two hosts match when their registrable domains are equal, so
    ``www.kaust.edu.sa`` and ``bio.kaust.edu.sa`` match, while
    ``kaust.edu.sa`` and ``ksu.edu.sa`` do not.

    Returns:
        True if both URLs have a host and the hosts share a registrable
        domain. False otherwise, including when either URL has no host
        (for example ``mailto:`` links).
    """
    site1 = _registrable_domain(url1)
    site2 = _registrable_domain(url2)
    return bool(site1) and site1 == site2
|
||||
|
||||
|
||||
def is_profile_url(url: str) -> bool:
    """Return True if *url* contains any fragment from PROFILE_URL_PATTERNS.

    The URL is lower-cased before the substring comparison.
    """
    candidate = url.lower()
    for marker in PROFILE_URL_PATTERNS:
        if marker in candidate:
            return True
    return False
|
||||
|
||||
|
||||
def is_directory_url(url: str) -> bool:
    """Return True if *url* contains any fragment from DIRECTORY_URL_PATTERNS.

    The URL is lower-cased before the substring comparison.
    """
    candidate = url.lower()
    for marker in DIRECTORY_URL_PATTERNS:
        if marker in candidate:
            return True
    return False
|
||||
|
||||
|
||||
def _utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


@dataclass
class ScrapedLink:
    """A single link recorded by the crawler, plus its verification state."""

    # Set when the link is discovered.
    url: str
    title: str
    text: str
    source_url: str  # page on which the link was found
    bucket: str  # "program" or "faculty"

    # Filled in by the verification pass.
    is_verified: bool = False
    http_status: int = 0

    is_profile_page: bool = False
    # Creation time of this record (UTC, ISO-8601).
    scraped_at: str = field(default_factory=_utc_now_iso)
|
||||
|
||||
|
||||
@dataclass
class ScrapeSettings:
    """Runtime configuration for one crawl."""

    # Required settings.
    root_url: str  # seed URL
    max_depth: int  # maximum link depth followed from the seed
    max_pages: int  # maximum number of pages visited
    headless: bool  # whether the crawl browser runs without a window
    output: Path  # destination of the JSON report

    # Optional settings.
    verify_links: bool = True
    request_delay: float = 1.0  # Polite crawling delay (seconds)
    timeout: int = 60000  # Navigation timeout in ms (default 60s for slow sites)
|
||||
|
||||
|
||||
async def extract_links(page: Page) -> List[Tuple[str, str]]:
    """Return ``(href, text)`` pairs for the anchors on *page*.

    The filtering runs in the browser: an anchor is kept only when its
    trimmed text is non-empty and its resolved ``href`` starts with ``http``.
    """
    collect_anchors_js = """elements => elements
        .map(el => ({text: (el.textContent || '').trim(), href: el.href}))
        .filter(item => item.text && item.href && item.href.startsWith('http'))"""
    raw_anchors = await page.eval_on_selector_all("a", collect_anchors_js)

    pairs = []
    for entry in raw_anchors:
        pairs.append((entry["href"], entry["text"]))
    return pairs
|
||||
|
||||
|
||||
async def get_page_title(page: Page) -> str:
    """Return the title of *page*, or "" if it is empty or cannot be read."""
    try:
        title = await page.title()
    except Exception:
        # Best effort: a missing title must not abort the crawl.
        return ""
    return title or ""
|
||||
|
||||
|
||||
async def verify_link(context, url: str, timeout: int = 10000) -> Tuple[bool, int, str]:
    """Open *url* in a fresh page and report whether it loaded successfully.

    This is a full navigation (waiting for ``domcontentloaded``), not an HTTP
    HEAD request. The page is always closed before returning.

    Args:
        context: Browser context used to open the temporary page.
        url: Address to check.
        timeout: Navigation timeout in milliseconds.

    Returns:
        ``(is_valid, status_code, page_title)``. ``is_valid`` is True for
        statuses in ``[200, 400)``. Any navigation error, or a navigation
        that yields no response, gives ``(False, 0, "")``.
    """
    failure = (False, 0, "")
    page = await context.new_page()
    try:
        response = await page.goto(url, wait_until="domcontentloaded", timeout=timeout)
        if not response:
            return failure
        status = response.status
        title = await get_page_title(page)
        return 200 <= status < 400, status, title
    except Exception:
        return failure
    finally:
        await page.close()
|
||||
|
||||
|
||||
async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink]:
    """Crawl from ``settings.root_url`` and collect program and faculty links.

    Pages are visited in priority order: URLs that look like directory
    listings (``is_directory_url``) come first, and URLs of equal priority
    are visited in discovery order. The loop ends when the frontier is empty
    or ``settings.max_pages`` pages have been attempted. Only same-domain
    URLs no deeper than ``settings.max_depth`` are visited. A link to any
    domain is recorded when its text or URL matches the program or faculty
    keywords and none of the exclusion keywords.

    When ``settings.verify_links`` is true, each distinct recorded URL is
    opened once in a separate headless browser and only links that answer
    with a 2xx/3xx status are returned.

    Args:
        settings: Crawl limits, delays and flags.
        browser_name: Attribute name of the Playwright browser type to launch
            (for example ``"firefox"``).

    Returns:
        The collected links. Without verification, a URL that matches both
        keyword sets appears twice, once per bucket.
    """
    # Function-scope imports: only this function needs them.
    import heapq
    import itertools

    # Shared by the crawl and verification contexts so both phases present
    # the same browser identity to the server.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )

    async with async_playwright() as p:
        browser_launcher = getattr(p, browser_name)
        browser = await browser_launcher.launch(headless=settings.headless)
        context = await browser.new_context(user_agent=user_agent)

        # Min-heap of (priority, sequence, url, depth). Priority 0 marks
        # directory pages, 1 everything else. The sequence number keeps
        # discovery order among entries of equal priority.
        sequence = itertools.count()
        root = normalize_url(settings.root_url, settings.root_url)
        frontier: List[Tuple[int, int, str, int]] = [(0, next(sequence), root, 0)]
        queued: Set[str] = {root}  # URLs ever placed on the frontier
        visited: Set[str] = set()
        found_urls: Set[str] = set()  # For deduplication of results
        results: List[ScrapedLink] = []

        print(f"Starting crawl from: {settings.root_url}")
        print(f"Max depth: {settings.max_depth}, Max pages: {settings.max_pages}")

        try:
            while frontier and len(visited) < settings.max_pages:
                _priority, _order, url, depth = heapq.heappop(frontier)

                if url in visited or depth > settings.max_depth:
                    continue

                # Only crawl same-domain URLs
                if not is_same_domain(settings.root_url, url):
                    continue

                visited.add(url)
                print(f"[{len(visited)}/{settings.max_pages}] Depth {depth}: {url[:80]}...")

                page = await context.new_page()
                try:
                    try:
                        response = await page.goto(
                            url, wait_until="load", timeout=settings.timeout
                        )
                    except Exception as e:
                        print(f" Error: {e}")
                        continue
                    if not response or response.status >= 400:
                        continue
                    links = await extract_links(page)
                finally:
                    # Runs on every path, including the ``continue``s above.
                    await page.close()

                for href, text in links:
                    target = normalize_url(url, href)

                    # Skip if already found or is excluded
                    if target in found_urls:
                        continue
                    if matches_any(text, EXCLUSION_KEYWORDS) or matches_any(target, EXCLUSION_KEYWORDS):
                        continue

                    # The two checks are independent: a link may be recorded
                    # in both buckets.
                    if matches_any(text, PROGRAM_KEYWORDS) or matches_any(target, PROGRAM_KEYWORDS):
                        found_urls.add(target)
                        results.append(
                            ScrapedLink(
                                url=target,
                                title="",
                                text=text[:200],
                                source_url=url,
                                bucket="program",
                                is_profile_page=False,
                            )
                        )

                    if matches_any(text, FACULTY_KEYWORDS) or matches_any(target, FACULTY_KEYWORDS):
                        found_urls.add(target)
                        results.append(
                            ScrapedLink(
                                url=target,
                                title="",
                                text=text[:200],
                                source_url=url,
                                bucket="faculty",
                                is_profile_page=is_profile_url(target),
                            )
                        )

                    # Queue for further crawling. Each URL is queued once;
                    # later sightings would be skipped as visited anyway.
                    if (
                        depth < settings.max_depth
                        and target not in queued
                        and is_same_domain(settings.root_url, target)
                    ):
                        queued.add(target)
                        link_priority = 0 if is_directory_url(target) else 1
                        heapq.heappush(
                            frontier, (link_priority, next(sequence), target, depth + 1)
                        )

                # Polite delay between requests
                await asyncio.sleep(settings.request_delay)

        finally:
            await context.close()
            await browser.close()

        # Verify links if enabled
        if settings.verify_links and results:
            print(f"\nVerifying {len(results)} links...")
            browser = await browser_launcher.launch(headless=True)
            try:
                context = await browser.new_context(user_agent=user_agent)

                verified_results = []
                checked: Set[str] = set()
                for i, link in enumerate(results):
                    # Check each URL once, whether or not it proved valid.
                    if link.url in checked:
                        continue
                    checked.add(link.url)

                    print(f" [{i+1}/{len(results)}] Verifying: {link.url[:60]}...")
                    is_valid, status, title = await verify_link(context, link.url)
                    link.is_verified = True
                    link.http_status = status
                    link.title = title or link.text

                    if is_valid:
                        verified_results.append(link)
                    else:
                        print(f" Invalid (HTTP {status})")

                    await asyncio.sleep(0.5)  # Delay between verifications
            finally:
                # Close the verification browser even if a check raised.
                await browser.close()

            results = verified_results

    return results
|
||||
|
||||
|
||||
def deduplicate_results(results: List[ScrapedLink]) -> List[ScrapedLink]:
    """Return *results* with later entries for an already-seen URL dropped.

    The first entry for each URL is kept and input order is preserved.
    """
    first_by_url = {}
    for entry in results:
        # setdefault stores only the first entry for a URL; dicts keep
        # insertion order.
        first_by_url.setdefault(entry.url, entry)
    return list(first_by_url.values())
|
||||
|
||||
|
||||
def serialize(results: List[ScrapedLink], target: Path, root_url: str) -> None:
    """Write the de-duplicated links and summary counts to *target* as JSON.

    Parent directories of *target* are created if missing. A short summary
    is printed after the file is written.

    Args:
        results: Links returned by the crawl (duplicates allowed).
        target: Output file path.
        root_url: Seed URL, recorded in the report.
    """
    unique_links = deduplicate_results(results)

    programs = []
    faculty = []
    for entry in unique_links:
        if entry.bucket == "program":
            programs.append(entry)
        elif entry.bucket == "faculty":
            faculty.append(entry)
    profiles = [entry for entry in faculty if entry.is_profile_page]
    verified_ok = sum(
        1 for entry in unique_links if entry.is_verified and entry.http_status == 200
    )

    statistics = {
        "total_links": len(unique_links),
        "program_links": len(programs),
        "faculty_links": len(faculty),
        "profile_pages": len(profiles),
        "verified_links": verified_ok,
    }
    payload = {
        "root_url": root_url,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "statistics": statistics,
        "program_links": [asdict(entry) for entry in programs],
        "faculty_links": [asdict(entry) for entry in faculty],
        "notes": EXTRA_NOTES,
        "metadata_fields": METADATA_FIELDS,
    }

    target.parent.mkdir(parents=True, exist_ok=True)
    report = json.dumps(payload, indent=2, ensure_ascii=False)
    target.write_text(report, encoding="utf-8")

    print(f"\nResults saved to: {target}")
    print(f" Total links: {len(unique_links)}")
    print(f" Program links: {len(programs)}")
    print(f" Faculty links: {len(faculty)}")
    print(f" Profile pages: {len(profiles)}")
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the scraper's command-line options from ``sys.argv``.

    ``--headless`` and ``--no-headless`` both write to ``headless``, which
    defaults to True. ``--no-verify`` is stored as ``no_verify``.
    """
    cli = argparse.ArgumentParser(
        description="Playwright scraper generated by the Agno agent for https://www.kaust.edu.sa/en/."
    )
    option = cli.add_argument

    # Crawl scope.
    option("--root-url", default="https://www.kaust.edu.sa/en/", help="Seed url to start crawling from.")
    option("--max-depth", type=int, default=3, help="Maximum crawl depth.")
    option("--max-pages", type=int, default=30, help="Maximum number of pages to visit.")
    option(
        "--output",
        type=Path,
        default=Path("university-scraper_results.json"),
        help="Where to save the JSON output.",
    )

    # Browser behaviour.
    option(
        "--headless",
        action="store_true",
        default=True,
        help="Run browser in headless mode (default: True).",
    )
    option("--no-headless", action="store_false", dest="headless", help="Run browser with visible window.")
    option(
        "--browser",
        choices=["chromium", "firefox", "webkit"],
        default="firefox",
        help="Browser engine to launch via Playwright (firefox recommended for KAUST).",
    )

    # Verification and pacing.
    option("--no-verify", action="store_true", default=False, help="Skip link verification step.")
    option("--delay", type=float, default=1.0, help="Delay between requests in seconds (polite crawling).")
    option(
        "--timeout",
        type=int,
        default=60000,
        help="Navigation timeout in milliseconds (default: 60000 = 60s).",
    )

    return cli.parse_args()
|
||||
|
||||
|
||||
async def main_async() -> None:
    """Parse the CLI options, run the crawl and write the JSON report."""
    cli = parse_args()
    config = ScrapeSettings(
        root_url=cli.root_url,
        max_depth=cli.max_depth,
        max_pages=cli.max_pages,
        headless=cli.headless,
        output=cli.output,
        # The CLI flag is negative ("--no-verify"); the setting is positive.
        verify_links=not cli.no_verify,
        request_delay=cli.delay,
        timeout=cli.timeout,
    )
    collected = await crawl(config, browser_name=cli.browser)
    serialize(collected, config.output, config.root_url)
|
||||
|
||||
|
||||
def main() -> None:
    """Synchronous entry point: run ``main_async`` with ``asyncio.run``."""
    asyncio.run(main_async())


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
438
artifacts/rwth_aachen_playwright_scraper.py
Normal file
438
artifacts/rwth_aachen_playwright_scraper.py
Normal file
@ -0,0 +1,438 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Auto-generated by the Agno codegen agent.
|
||||
Target university: RWTH Aachen (https://www.rwth-aachen.de/go/id/a/?lidx=1)
|
||||
Requested caps: depth=3, pages=30
|
||||
|
||||
Plan description: Playwright scraper for university master programs and faculty profiles.
|
||||
Navigation strategy: Start from the main university page and look for faculty/department directories. RWTH Aachen likely structures content with faculty organized by departments. Look for department pages (like 'Fakultäten'), then navigate to individual department sites, find 'Mitarbeiter' or 'Personal' sections, and extract individual faculty profile URLs. The university uses both German and English, so check for patterns in both languages. Individual faculty pages likely follow patterns like '/mitarbeiter/firstname-lastname' or similar German naming conventions.
|
||||
Verification checklist:
|
||||
- Verify that faculty URLs point to individual person pages, not department listings
|
||||
- Check that master's program pages contain degree information and curriculum details
|
||||
- Ensure scraped faculty pages include personal information like research interests, contact details, or CV
|
||||
- Validate that URLs contain individual identifiers (names, personal paths) rather than generic terms
|
||||
- Cross-check that German and English versions of pages are both captured when available
|
||||
Playwright snapshot used to guide this plan:
|
||||
1. RWTH Aachen University | Rheinisch-Westfälische Technische Hochschule | EN (https://www.rwth-aachen.de/go/id/a/?lidx=1)
|
||||
Snippet: Skip to Content Skip to Main Navigation Skip to Landing Pages for Target Groups Skip to Quick Access Skip to Search Skip to Footer News Information for... Quick Access DE Search for Search Copyright: © Copyright: © Copyright: © Copyright: © Studying at RWTH Welc
|
||||
Anchors: Skip to Content -> https://www.rwth-aachen.de/go/id/a/?lidx=1#main, Skip to Main Navigation -> https://www.rwth-aachen.de/go/id/a/?lidx=1#main-nav-control, Skip to Landing Pages for Target Groups -> https://www.rwth-aachen.de/go/id/a/?lidx=1#persona-control, Skip to Quick Access -> https://www.rwth-aachen.de/go/id/a/?lidx=1#quick-start-control, Skip to Search -> https://www.rwth-aachen.de/go/id/a/?lidx=1#searchbar, Skip to Footer -> https://www.rwth-aachen.de/go/id/a/?lidx=1#footer
|
||||
2. Prospective Students | RWTH Aachen University | EN (https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~svo/Studieninteressierte/lidx/1/)
|
||||
Snippet: Skip to Content Skip to Main Navigation Skip to Landing Pages for Target Groups Skip to Quick Access Skip to Search Skip to Footer News Information for... Quick Access DE Search for Search Prospective Students Choosing A Course of Study Copyright: © Mario Irrmischer Adv
|
||||
Anchors: Skip to Content -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~svo/Studieninteressierte/lidx/1/#main, Skip to Main Navigation -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~svo/Studieninteressierte/lidx/1/#main-nav-control, Skip to Landing Pages for Target Groups -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~svo/Studieninteressierte/lidx/1/#persona-control, Skip to Quick Access -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~svo/Studieninteressierte/lidx/1/#quick-start-control, Skip to Search -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~svo/Studieninteressierte/lidx/1/#searchbar, Skip to Footer -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~svo/Studieninteressierte/lidx/1/#footer
|
||||
3. First-Year Students | RWTH Aachen University | EN (https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~cgjnl/Studienanfaengerinnen-und-anfaenger/lidx/1/)
|
||||
Snippet: Skip to Content Skip to Main Navigation Skip to Landing Pages for Target Groups Skip to Quick Access Skip to Search Skip to Footer News Information for... Quick Access DE Search for Search First-Year Students Preparing for Your Studies – Recommended Subject-Specific Res
|
||||
Anchors: Skip to Content -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~cgjnl/Studienanfaengerinnen-und-anfaenger/lidx/1/#main, Skip to Main Navigation -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~cgjnl/Studienanfaengerinnen-und-anfaenger/lidx/1/#main-nav-control, Skip to Landing Pages for Target Groups -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~cgjnl/Studienanfaengerinnen-und-anfaenger/lidx/1/#persona-control, Skip to Quick Access -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~cgjnl/Studienanfaengerinnen-und-anfaenger/lidx/1/#quick-start-control, Skip to Search -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~cgjnl/Studienanfaengerinnen-und-anfaenger/lidx/1/#searchbar, Skip to Footer -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~cgjnl/Studienanfaengerinnen-und-anfaenger/lidx/1/#footer
|
||||
4. Students | RWTH Aachen University | EN (https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~tpi/Studierende/lidx/1/)
|
||||
Snippet: Skip to Content Skip to Main Navigation Skip to Landing Pages for Target Groups Skip to Quick Access Skip to Search Skip to Footer News Information for... Quick Access DE Search for Search Students Teaser Copyright: © Martin Braun Classes What lectures do you have next
|
||||
Anchors: Skip to Content -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~tpi/Studierende/lidx/1/#main, Skip to Main Navigation -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~tpi/Studierende/lidx/1/#main-nav-control, Skip to Landing Pages for Target Groups -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~tpi/Studierende/lidx/1/#persona-control, Skip to Quick Access -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~tpi/Studierende/lidx/1/#quick-start-control, Skip to Search -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~tpi/Studierende/lidx/1/#searchbar, Skip to Footer -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~tpi/Studierende/lidx/1/#footer
|
||||
Snapshot truncated.
|
||||
|
||||
Generated at: 2025-12-09T10:27:25.950820+00:00
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import time
|
||||
from collections import deque
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Deque, Iterable, List, Set, Tuple
|
||||
from urllib.parse import urljoin, urldefrag, urlparse
|
||||
|
||||
from playwright.async_api import async_playwright, Page, Response
|
||||
|
||||
# Keyword lists used to classify links. They are compared against both the
# anchor text and the link URL via matches_any(), i.e. as case-insensitive
# substrings.
PROGRAM_KEYWORDS = [
    'Master',
    'M.Sc.',
    'M.A.',
    'Graduate',
    'Masterstudiengang',
    '/studium/',
    '/studiengänge/',
    'Postgraduate',
]
FACULTY_KEYWORDS = [
    'Prof.',
    'Dr.',
    'Professor',
    '/mitarbeiter/',
    '/people/',
    '/personal/',
    '/~',
    'Professorin',
]
EXCLUSION_KEYWORDS = [
    'bewerbung',
    'admission',
    'apply',
    'bewerben',
    'news',
    'nachrichten',
    'events',
    'veranstaltungen',
    'career',
    'stellenangebote',
    'login',
    'anmelden',
    'alumni',
    'donate',
    'spenden',
    'studienanfänger',
]

# Copied verbatim into the JSON report written by serialize().
METADATA_FIELDS = [
    'url',
    'title',
    'entity_type',
    'department',
    'email',
    'scraped_at',
]
EXTRA_NOTES = [
    (
        "RWTH Aachen is a major German technical university with content in both German and English. "
        "The site structure appears to use target group portals ('Zielgruppenportale') for different audiences. "
        "Faculty information will likely be distributed across different department websites. "
        "The university uses German academic titles (Prof., Dr.) extensively. "
        "Be prepared to handle both '/cms/root/' URL structures and potential subdomain variations for different faculties."
    ),
]

# URL fragments that indicate individual profile pages.
PROFILE_URL_PATTERNS = [
    "/people/",
    "/person/",
    "/profile/",
    "/profiles/",
    "/faculty/",
    "/staff/",
    "/directory/",
    "/~",  # Unix-style personal pages
    "/bio/",
    "/about/",
]

# URL fragments that indicate listing/directory pages (crawled first).
DIRECTORY_URL_PATTERNS = [
    "/faculty",
    "/people",
    "/directory",
    "/staff",
    "/team",
    "/members",
    "/researchers",
]
|
||||
|
||||
|
||||
def normalize_url(base: str, href: str) -> str:
    """Return *href* as an absolute URL in the crawler's canonical form.

    *href* is resolved against *base*, the ``#fragment`` is dropped, and all
    trailing slashes are stripped so equivalent spellings compare equal.
    """
    without_fragment = urldefrag(urljoin(base, href)).url
    return without_fragment.rstrip("/")
|
||||
|
||||
|
||||
def matches_any(text: str, keywords: Iterable[str]) -> bool:
    """Return True when at least one keyword occurs in *text*, ignoring case.

    Matching is plain substring containment after lower-casing both sides.
    """
    haystack = text.lower()
    for needle in keywords:
        if needle.lower() in haystack:
            return True
    return False
|
||||
|
||||
|
||||
# Second-level labels that commonly sit directly under a two-letter country
# TLD (e.g. ``edu.sa``, ``ac.uk``, ``co.jp``). This is a heuristic, not the
# full Public Suffix List.
_SECOND_LEVEL_LABELS = frozenset({"ac", "co", "com", "edu", "gov", "net", "org"})


def _registrable_domain(url: str) -> str:
    """Return a best-effort registrable domain for *url* ("" if it has no host)."""
    # ``hostname`` (unlike ``netloc``) is lower-cased and excludes any port
    # or user-info part.
    host = (urlparse(url).hostname or "").rstrip(".")
    if host.startswith("www."):
        host = host[len("www."):]

    labels = host.split(".")
    # Single-label hosts and dotted IPv4 addresses have no parent domain;
    # compare them as a whole.
    if len(labels) < 2 or labels[-1].isdigit():
        return host

    # Under suffixes such as ``ac.uk`` the organisation is the third label
    # from the right, so keep three labels instead of two.
    under_country_suffix = (
        len(labels) >= 3
        and len(labels[-1]) == 2
        and labels[-2] in _SECOND_LEVEL_LABELS
    )
    return ".".join(labels[-3:] if under_country_suffix else labels[-2:])


def is_same_domain(url1: str, url2: str) -> bool:
    """Check whether two URLs belong to the same site, allowing subdomains.

    Hosts are compared case-insensitively, without port and without a leading
    ``www.``. Two hosts match when their registrable domains are equal, so
    ``www.rwth-aachen.de`` and ``cs.rwth-aachen.de`` match, while
    ``example.ac.uk`` and ``other.ac.uk`` do not.

    Returns:
        True if both URLs have a host and the hosts share a registrable
        domain. False otherwise, including when either URL has no host
        (for example ``mailto:`` links).
    """
    site1 = _registrable_domain(url1)
    site2 = _registrable_domain(url2)
    return bool(site1) and site1 == site2
|
||||
|
||||
|
||||
def is_profile_url(url: str) -> bool:
    """Return True if *url* contains any fragment from PROFILE_URL_PATTERNS.

    The URL is lower-cased before the substring comparison.
    """
    candidate = url.lower()
    for marker in PROFILE_URL_PATTERNS:
        if marker in candidate:
            return True
    return False
|
||||
|
||||
|
||||
def is_directory_url(url: str) -> bool:
    """Return True if *url* contains any fragment from DIRECTORY_URL_PATTERNS.

    The URL is lower-cased before the substring comparison.
    """
    candidate = url.lower()
    for marker in DIRECTORY_URL_PATTERNS:
        if marker in candidate:
            return True
    return False
|
||||
|
||||
|
||||
def _utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


@dataclass
class ScrapedLink:
    """A single link recorded by the crawler, plus its verification state."""

    # Set when the link is discovered.
    url: str
    title: str
    text: str
    source_url: str  # page on which the link was found
    bucket: str  # "program" or "faculty"

    # Filled in by the verification pass.
    is_verified: bool = False
    http_status: int = 0

    is_profile_page: bool = False
    # Creation time of this record (UTC, ISO-8601).
    scraped_at: str = field(default_factory=_utc_now_iso)
|
||||
|
||||
|
||||
@dataclass
class ScrapeSettings:
    """Runtime configuration for one crawl."""

    # Required settings.
    root_url: str  # seed URL
    max_depth: int  # maximum link depth followed from the seed
    max_pages: int  # maximum number of pages visited
    headless: bool  # whether the crawl browser runs without a window
    output: Path  # destination of the JSON report

    # Optional settings.
    verify_links: bool = True
    request_delay: float = 1.0  # Polite crawling delay (seconds)
|
||||
|
||||
|
||||
async def extract_links(page: Page) -> List[Tuple[str, str]]:
    """Return ``(href, text)`` pairs for the anchors on *page*.

    The filtering runs in the browser: an anchor is kept only when its
    trimmed text is non-empty and its resolved ``href`` starts with ``http``.
    """
    collect_anchors_js = """elements => elements
        .map(el => ({text: (el.textContent || '').trim(), href: el.href}))
        .filter(item => item.text && item.href && item.href.startsWith('http'))"""
    raw_anchors = await page.eval_on_selector_all("a", collect_anchors_js)

    pairs = []
    for entry in raw_anchors:
        pairs.append((entry["href"], entry["text"]))
    return pairs
|
||||
|
||||
|
||||
async def get_page_title(page: Page) -> str:
    """Return the title of *page*, or "" if it is empty or cannot be read."""
    try:
        title = await page.title()
    except Exception:
        # Best effort: a missing title must not abort the crawl.
        return ""
    return title or ""
|
||||
|
||||
|
||||
async def verify_link(context, url: str, timeout: int = 10000) -> Tuple[bool, int, str]:
    """Open *url* in a fresh page and report whether it loaded successfully.

    This is a full navigation (waiting for ``domcontentloaded``), not an HTTP
    HEAD request. The page is always closed before returning.

    Args:
        context: Browser context used to open the temporary page.
        url: Address to check.
        timeout: Navigation timeout in milliseconds.

    Returns:
        ``(is_valid, status_code, page_title)``. ``is_valid`` is True for
        statuses in ``[200, 400)``. Any navigation error, or a navigation
        that yields no response, gives ``(False, 0, "")``.
    """
    failure = (False, 0, "")
    page = await context.new_page()
    try:
        response = await page.goto(url, wait_until="domcontentloaded", timeout=timeout)
        if not response:
            return failure
        status = response.status
        title = await get_page_title(page)
        return 200 <= status < 400, status, title
    except Exception:
        return failure
    finally:
        await page.close()
|
||||
|
||||
|
||||
async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink]:
    """Crawl from ``settings.root_url`` and collect program and faculty links.

    Pages are visited in priority order: URLs that look like directory
    listings (``is_directory_url``) come first, and URLs of equal priority
    are visited in discovery order. The loop ends when the frontier is empty
    or ``settings.max_pages`` pages have been attempted. Only same-domain
    URLs no deeper than ``settings.max_depth`` are visited. A link to any
    domain is recorded when its text or URL matches the program or faculty
    keywords and none of the exclusion keywords.

    When ``settings.verify_links`` is true, each distinct recorded URL is
    opened once in a separate headless browser and only links that answer
    with a 2xx/3xx status are returned.

    Args:
        settings: Crawl limits, delays and flags.
        browser_name: Attribute name of the Playwright browser type to launch
            (for example ``"chromium"``).

    Returns:
        The collected links. Without verification, a URL that matches both
        keyword sets appears twice, once per bucket.
    """
    # Function-scope imports: only this function needs them.
    import heapq
    import itertools

    async with async_playwright() as p:
        browser_launcher = getattr(p, browser_name)
        browser = await browser_launcher.launch(headless=settings.headless)
        context = await browser.new_context()

        # Min-heap of (priority, sequence, url, depth). Priority 0 marks
        # directory pages, 1 everything else. The sequence number keeps
        # discovery order among entries of equal priority.
        sequence = itertools.count()
        root = normalize_url(settings.root_url, settings.root_url)
        frontier: List[Tuple[int, int, str, int]] = [(0, next(sequence), root, 0)]
        queued: Set[str] = {root}  # URLs ever placed on the frontier
        visited: Set[str] = set()
        found_urls: Set[str] = set()  # For deduplication of results
        results: List[ScrapedLink] = []

        print(f"Starting crawl from: {settings.root_url}")
        print(f"Max depth: {settings.max_depth}, Max pages: {settings.max_pages}")

        try:
            while frontier and len(visited) < settings.max_pages:
                _priority, _order, url, depth = heapq.heappop(frontier)

                if url in visited or depth > settings.max_depth:
                    continue

                # Only crawl same-domain URLs
                if not is_same_domain(settings.root_url, url):
                    continue

                visited.add(url)
                print(f"[{len(visited)}/{settings.max_pages}] Depth {depth}: {url[:80]}...")

                page = await context.new_page()
                try:
                    try:
                        response = await page.goto(
                            url, wait_until="domcontentloaded", timeout=20000
                        )
                    except Exception as e:
                        print(f" Error: {e}")
                        continue
                    if not response or response.status >= 400:
                        continue
                    links = await extract_links(page)
                finally:
                    # Runs on every path, including the ``continue``s above.
                    await page.close()

                for href, text in links:
                    target = normalize_url(url, href)

                    # Skip if already found or is excluded
                    if target in found_urls:
                        continue
                    if matches_any(text, EXCLUSION_KEYWORDS) or matches_any(target, EXCLUSION_KEYWORDS):
                        continue

                    # The two checks are independent: a link may be recorded
                    # in both buckets.
                    if matches_any(text, PROGRAM_KEYWORDS) or matches_any(target, PROGRAM_KEYWORDS):
                        found_urls.add(target)
                        results.append(
                            ScrapedLink(
                                url=target,
                                title="",
                                text=text[:200],
                                source_url=url,
                                bucket="program",
                                is_profile_page=False,
                            )
                        )

                    if matches_any(text, FACULTY_KEYWORDS) or matches_any(target, FACULTY_KEYWORDS):
                        found_urls.add(target)
                        results.append(
                            ScrapedLink(
                                url=target,
                                title="",
                                text=text[:200],
                                source_url=url,
                                bucket="faculty",
                                is_profile_page=is_profile_url(target),
                            )
                        )

                    # Queue for further crawling. Each URL is queued once;
                    # later sightings would be skipped as visited anyway.
                    if (
                        depth < settings.max_depth
                        and target not in queued
                        and is_same_domain(settings.root_url, target)
                    ):
                        queued.add(target)
                        link_priority = 0 if is_directory_url(target) else 1
                        heapq.heappush(
                            frontier, (link_priority, next(sequence), target, depth + 1)
                        )

                # Polite delay between requests
                await asyncio.sleep(settings.request_delay)

        finally:
            await context.close()
            await browser.close()

        # Verify links if enabled
        if settings.verify_links and results:
            print(f"\nVerifying {len(results)} links...")
            browser = await browser_launcher.launch(headless=True)
            try:
                context = await browser.new_context()

                verified_results = []
                checked: Set[str] = set()
                for i, link in enumerate(results):
                    # Check each URL once, whether or not it proved valid.
                    if link.url in checked:
                        continue
                    checked.add(link.url)

                    print(f" [{i+1}/{len(results)}] Verifying: {link.url[:60]}...")
                    is_valid, status, title = await verify_link(context, link.url)
                    link.is_verified = True
                    link.http_status = status
                    link.title = title or link.text

                    if is_valid:
                        verified_results.append(link)
                    else:
                        print(f" Invalid (HTTP {status})")

                    await asyncio.sleep(0.5)  # Delay between verifications
            finally:
                # Close the verification browser even if a check raised.
                await browser.close()

            results = verified_results

    return results
|
||||
|
||||
|
||||
def deduplicate_results(results: List[ScrapedLink]) -> List[ScrapedLink]:
    """Return ``results`` without repeated URLs, keeping the first occurrence.

    Args:
        results: Scraped links in discovery order.

    Returns:
        A new list, in the original order, holding one entry per distinct
        ``url``. The input list is not modified.
    """
    seen: Set[str] = set()
    unique: List[ScrapedLink] = []
    for link in results:
        if link.url in seen:
            continue
        seen.add(link.url)
        unique.append(link)
    return unique
|
||||
|
||||
|
||||
def serialize(results: List[ScrapedLink], target: Path, root_url: str) -> None:
    """Write the de-duplicated results and summary statistics to a JSON file.

    Args:
        results: Links collected by the crawl (duplicates are dropped here).
        target: Output file; missing parent directories are created.
        root_url: Seed URL, recorded verbatim in the payload.
    """
    results = deduplicate_results(results)

    program_links = [link for link in results if link.bucket == "program"]
    faculty_links = [link for link in results if link.bucket == "faculty"]
    profile_pages = [link for link in faculty_links if link.is_profile_page]
    # NOTE(review): only an exact HTTP 200 is counted here, whereas the
    # verification step accepts any status in [200, 400) - confirm intent.
    verified_count = len(
        [r for r in results if r.is_verified and r.http_status == 200]
    )

    payload = {
        "root_url": root_url,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "statistics": {
            "total_links": len(results),
            "program_links": len(program_links),
            "faculty_links": len(faculty_links),
            "profile_pages": len(profile_pages),
            "verified_links": verified_count,
        },
        "program_links": [asdict(link) for link in program_links],
        "faculty_links": [asdict(link) for link in faculty_links],
        "notes": EXTRA_NOTES,
        "metadata_fields": METADATA_FIELDS,
    }
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    print(f"\nResults saved to: {target}")
    print(f" Total links: {len(results)}")
    print(f" Program links: {len(program_links)}")
    print(f" Faculty links: {len(faculty_links)}")
    print(f" Profile pages: {len(profile_pages)}")
|
||||
|
||||
|
||||
def parse_args(argv: "List[str] | None" = None) -> argparse.Namespace:
    """Parse the command-line options of the KAUST scraper.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        Namespace with ``root_url``, ``max_depth``, ``max_pages``, ``output``,
        ``headless``, ``browser``, ``no_verify`` and ``delay``.
    """
    # This file targets KAUST; the seed URL below must match the module
    # header rather than another university's site.
    parser = argparse.ArgumentParser(
        description="Playwright scraper generated by the Agno agent for https://www.kaust.edu.sa/en/."
    )
    parser.add_argument(
        "--root-url",
        default="https://www.kaust.edu.sa/en/",
        help="Seed url to start crawling from.",
    )
    parser.add_argument(
        "--max-depth",
        type=int,
        default=3,
        help="Maximum crawl depth.",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=30,
        help="Maximum number of pages to visit.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("university-scraper_results.json"),
        help="Where to save the JSON output.",
    )
    # Headless is already the default; the flag is accepted for explicitness
    # and --no-headless (same dest) is the switch that changes behaviour.
    parser.add_argument(
        "--headless",
        action="store_true",
        default=True,
        help="Run browser in headless mode (default: True).",
    )
    parser.add_argument(
        "--no-headless",
        action="store_false",
        dest="headless",
        help="Run browser with visible window.",
    )
    parser.add_argument(
        "--browser",
        choices=["chromium", "firefox", "webkit"],
        default="chromium",
        help="Browser engine to launch via Playwright.",
    )
    parser.add_argument(
        "--no-verify",
        action="store_true",
        default=False,
        help="Skip link verification step.",
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=1.0,
        help="Delay between requests in seconds (polite crawling).",
    )
    return parser.parse_args(argv)
|
||||
|
||||
|
||||
async def main_async() -> None:
    """Parse CLI options, run the crawl and write the results to disk."""
    args = parse_args()
    settings = ScrapeSettings(
        root_url=args.root_url,
        max_depth=args.max_depth,
        max_pages=args.max_pages,
        headless=args.headless,
        output=args.output,
        # The CLI exposes the negative flag; settings store the positive one.
        verify_links=not args.no_verify,
        request_delay=args.delay,
    )
    links = await crawl(settings, browser_name=args.browser)
    serialize(links, settings.output, settings.root_url)
|
||||
|
||||
|
||||
def main() -> None:
    """Synchronous entry point: run ``main_async`` on a fresh event loop."""
    asyncio.run(main_async())


if __name__ == "__main__":
    main()
|
||||
437
artifacts/rwth_aachen_university_scraper.py
Normal file
437
artifacts/rwth_aachen_university_scraper.py
Normal file
@ -0,0 +1,437 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Auto-generated by the Agno codegen agent.
|
||||
Target university: RWTH Aachen (https://www.rwth-aachen.de/go/id/a/?lidx=1)
|
||||
Requested caps: depth=3, pages=30
|
||||
|
||||
Plan description: Playwright scraper for university master programs and faculty profiles.
|
||||
Navigation strategy: Start at the university homepage: https://www.rwth-aachen.de/ Navigate to faculty/department pages, e.g. /fakultaeten/, /fachbereiche/ Look for staff/people directory pages within each department Crawl the staff directories to find individual profile pages Some departments may use subdomains like informatik.rwth-aachen.de
|
||||
Verification checklist:
|
||||
- Check that collected URLs are for individual people, not directories
|
||||
- Spot check profile pages to ensure they represent faculty members
|
||||
- Verify relevant graduate program pages were found
|
||||
- Confirm noise pages like news, events, jobs were excluded
|
||||
Playwright snapshot used to guide this plan:
|
||||
1. RWTH Aachen University | Rheinisch-Westfälische Technische Hochschule | EN (https://www.rwth-aachen.de/go/id/a/?lidx=1)
|
||||
Snippet: Skip to Content Skip to Main Navigation Skip to Landing Pages for Target Groups Skip to Quick Access Skip to Search Skip to Footer News Information for... Quick Access DE Search for Search Copyright: © Copyright: © Copyright: © Copyright: © Studying at RWTH Welc
|
||||
Anchors: Skip to Content -> https://www.rwth-aachen.de/go/id/a/?lidx=1#main, Skip to Main Navigation -> https://www.rwth-aachen.de/go/id/a/?lidx=1#main-nav-control, Skip to Landing Pages for Target Groups -> https://www.rwth-aachen.de/go/id/a/?lidx=1#persona-control, Skip to Quick Access -> https://www.rwth-aachen.de/go/id/a/?lidx=1#quick-start-control, Skip to Search -> https://www.rwth-aachen.de/go/id/a/?lidx=1#searchbar, Skip to Footer -> https://www.rwth-aachen.de/go/id/a/?lidx=1#footer
|
||||
2. Prospective Students | RWTH Aachen University | EN (https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~svo/Studieninteressierte/lidx/1/)
|
||||
Snippet: Skip to Content Skip to Main Navigation Skip to Landing Pages for Target Groups Skip to Quick Access Skip to Search Skip to Footer News Information for... Quick Access DE Search for Search Prospective Students Choosing A Course of Study Copyright: © Mario Irrmischer Adv
|
||||
Anchors: Skip to Content -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~svo/Studieninteressierte/lidx/1/#main, Skip to Main Navigation -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~svo/Studieninteressierte/lidx/1/#main-nav-control, Skip to Landing Pages for Target Groups -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~svo/Studieninteressierte/lidx/1/#persona-control, Skip to Quick Access -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~svo/Studieninteressierte/lidx/1/#quick-start-control, Skip to Search -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~svo/Studieninteressierte/lidx/1/#searchbar, Skip to Footer -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~svo/Studieninteressierte/lidx/1/#footer
|
||||
3. First-Year Students | RWTH Aachen University | EN (https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~cgjnl/Studienanfaengerinnen-und-anfaenger/lidx/1/)
|
||||
Snippet: Skip to Content Skip to Main Navigation Skip to Landing Pages for Target Groups Skip to Quick Access Skip to Search Skip to Footer News Information for... Quick Access DE Search for Search First-Year Students Preparing for Your Studies – Recommended Subject-Specific Res
|
||||
Anchors: Skip to Content -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~cgjnl/Studienanfaengerinnen-und-anfaenger/lidx/1/#main, Skip to Main Navigation -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~cgjnl/Studienanfaengerinnen-und-anfaenger/lidx/1/#main-nav-control, Skip to Landing Pages for Target Groups -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~cgjnl/Studienanfaengerinnen-und-anfaenger/lidx/1/#persona-control, Skip to Quick Access -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~cgjnl/Studienanfaengerinnen-und-anfaenger/lidx/1/#quick-start-control, Skip to Search -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~cgjnl/Studienanfaengerinnen-und-anfaenger/lidx/1/#searchbar, Skip to Footer -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~cgjnl/Studienanfaengerinnen-und-anfaenger/lidx/1/#footer
|
||||
4. Students | RWTH Aachen University | EN (https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~tpi/Studierende/lidx/1/)
|
||||
Snippet: Skip to Content Skip to Main Navigation Skip to Landing Pages for Target Groups Skip to Quick Access Skip to Search Skip to Footer News Information for... Quick Access DE Search for Search Students Teaser Copyright: © Martin Braun Classes What lectures do you have next
|
||||
Anchors: Skip to Content -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~tpi/Studierende/lidx/1/#main, Skip to Main Navigation -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~tpi/Studierende/lidx/1/#main-nav-control, Skip to Landing Pages for Target Groups -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~tpi/Studierende/lidx/1/#persona-control, Skip to Quick Access -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~tpi/Studierende/lidx/1/#quick-start-control, Skip to Search -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~tpi/Studierende/lidx/1/#searchbar, Skip to Footer -> https://www.rwth-aachen.de/cms/root/Zielgruppenportale/~tpi/Studierende/lidx/1/#footer
|
||||
Snapshot truncated.
|
||||
|
||||
Generated at: 2025-12-09T15:00:09.586788+00:00
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import time
|
||||
from collections import deque
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Deque, Iterable, List, Set, Tuple
|
||||
from urllib.parse import urljoin, urldefrag, urlparse
|
||||
|
||||
from playwright.async_api import async_playwright, Page, Response
|
||||
|
||||
# Substrings that classify a link as a study-programme link. They are matched
# case-insensitively against both the anchor text and the URL.
PROGRAM_KEYWORDS = [
    '/studium/', '/studiengaenge/', 'master', 'graduate', 'postgraduate',
    'm.sc.', 'm.a.',
]
# Substrings that classify a link as a faculty/staff link (same matching).
FACULTY_KEYWORDS = [
    '/staff/', '/profile/', '/personen/', '/person/', '/aw/personen/',
    'prof.', 'dr.', 'professor',
]
# Links whose text or URL contains any of these are ignored entirely.
EXCLUSION_KEYWORDS = [
    'studieninteressierte', 'studienanfaenger', 'zulassung', 'bewerbung',
    'studienbeitraege', 'studienfinanzierung', 'aktuelles',
    'veranstaltungen', 'karriere', 'stellenangebote', 'alumni', 'anmeldung',
]
# Copied verbatim into the JSON output under "metadata_fields".
METADATA_FIELDS = [
    'url', 'title', 'entity_type', 'department', 'email', 'scraped_at',
]
# Copied verbatim into the JSON output under "notes".
EXTRA_NOTES = [
    'Site is primarily in German, so use German keywords',
    'Faculty profile URLs contain /personen/ or /person/',
    'Graduate program pages use /studium/ and /studiengaenge/',
]

# URL patterns that indicate individual profile pages
PROFILE_URL_PATTERNS = [
    "/people/", "/person/", "/profile/", "/profiles/",
    "/faculty/", "/staff/", "/directory/",
    "/~",  # Unix-style personal pages
    "/bio/", "/about/",
]

# URL patterns that indicate listing/directory pages (should be crawled deeper)
DIRECTORY_URL_PATTERNS = [
    "/faculty", "/people", "/directory", "/staff",
    "/team", "/members", "/researchers",
]
|
||||
|
||||
|
||||
def normalize_url(base: str, href: str) -> str:
    """Resolve ``href`` against ``base`` and return a canonical form.

    The fragment (``#...``) is dropped and trailing slashes are removed so
    that equivalent URLs compare equal in the visited/found sets.

    Args:
        base: URL of the page the link was found on.
        href: Absolute or relative link target.

    Returns:
        Absolute URL without fragment and without trailing ``/``.
    """
    absolute = urljoin(base, href)
    without_fragment, _ = urldefrag(absolute)
    # Remove trailing slash for consistency
    return without_fragment.rstrip("/")
|
||||
|
||||
|
||||
def matches_any(text: str, keywords: Iterable[str]) -> bool:
    """Return True if ``text`` contains any keyword, ignoring case.

    Args:
        text: String to search in.
        keywords: Substrings to look for; an empty iterable yields False.
    """
    haystack = text.lower()
    return any(keyword.lower() in haystack for keyword in keywords)
|
||||
|
||||
|
||||
def is_same_domain(url1: str, url2: str) -> bool:
    """Return True if both URLs belong to the same root domain.

    Sub-domains count as the same domain: the comparison uses the last two
    labels of each host (``informatik.rwth-aachen.de`` matches
    ``www.rwth-aachen.de``). Hosts are compared case-insensitively.

    Limitation: for hosts under a two-label public suffix (``*.edu.sa``,
    ``*.co.uk``) the last two labels are the suffix itself, so unrelated
    sites under that suffix also compare equal.

    Args:
        url1: First URL.
        url2: Second URL.
    """

    def _host(url: str) -> str:
        host = urlparse(url).netloc.lower()
        # Strip only a *leading* "www."; a blanket str.replace would also
        # mangle hosts that merely contain it (e.g. "mywww.de" -> "myde").
        return host[4:] if host.startswith("www.") else host

    domain1 = _host(url1)
    domain2 = _host(url2)
    # Allow subdomains of the same root domain
    parts1 = domain1.split(".")
    parts2 = domain2.split(".")
    if len(parts1) >= 2 and len(parts2) >= 2:
        return parts1[-2:] == parts2[-2:]
    return domain1 == domain2
|
||||
|
||||
|
||||
def is_profile_url(url: str) -> bool:
    """Return True if ``url`` contains any ``PROFILE_URL_PATTERNS`` entry.

    The check is a case-insensitive substring match on the whole URL; it is a
    heuristic hint that the page is an individual's profile.
    """
    lowered = url.lower()
    return any(pattern in lowered for pattern in PROFILE_URL_PATTERNS)
|
||||
|
||||
|
||||
def is_directory_url(url: str) -> bool:
    """Return True if ``url`` contains any ``DIRECTORY_URL_PATTERNS`` entry.

    The check is a case-insensitive substring match on the whole URL; the
    crawler uses it to visit listing pages before other pages.
    """
    lowered = url.lower()
    return any(pattern in lowered for pattern in DIRECTORY_URL_PATTERNS)
|
||||
|
||||
|
||||
@dataclass
class ScrapedLink:
    """One link discovered during the crawl, plus its verification state."""

    url: str  # normalized absolute URL of the link target
    title: str  # empty at discovery; set during verification
    text: str  # anchor text (the crawler stores at most 200 characters)
    source_url: str  # page on which the link was found
    bucket: str  # "program" or "faculty"
    is_verified: bool = False  # True once the verification step visited it
    http_status: int = 0  # status seen during verification; 0 = none/failed
    is_profile_page: bool = False  # URL looks like an individual profile
    # UTC timestamp (ISO 8601) of when this record was created.
    scraped_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
|
||||
|
||||
|
||||
@dataclass
class ScrapeSettings:
    """Run-time configuration for a single crawl."""

    root_url: str  # seed URL; also defines the domain the crawl stays on
    max_depth: int  # maximum link depth followed from the seed
    max_pages: int  # upper bound on the number of pages visited
    headless: bool  # whether the crawl browser runs without a window
    output: Path  # destination of the JSON results file
    verify_links: bool = True  # re-visit every collected link after the crawl
    request_delay: float = 1.0  # Polite crawling delay (seconds)
|
||||
|
||||
|
||||
async def extract_links(page: Page) -> List[Tuple[str, str]]:
    """Collect the anchors of the loaded page as ``(href, text)`` pairs.

    The filtering runs in the browser: anchors with empty trimmed text, an
    empty ``href``, or an ``href`` that does not start with ``http`` are
    dropped. ``el.href`` is the browser-resolved (absolute) URL.

    Args:
        page: Playwright page that has already been navigated.

    Returns:
        List of ``(href, text)`` tuples in document order.
    """
    anchors: Iterable[dict] = await page.eval_on_selector_all(
        "a",
        """elements => elements
            .map(el => ({text: (el.textContent || '').trim(), href: el.href}))
            .filter(item => item.text && item.href && item.href.startsWith('http'))""",
    )
    return [(item["href"], item["text"]) for item in anchors]
|
||||
|
||||
|
||||
async def get_page_title(page: Page) -> str:
    """Return the page title, or ``""`` if it is empty or cannot be read."""
    try:
        return await page.title() or ""
    except Exception:
        # Deliberate best-effort: a missing title must never abort the crawl.
        return ""
|
||||
|
||||
|
||||
async def verify_link(context, url: str, timeout: int = 10000) -> Tuple[bool, int, str]:
    """Check that ``url`` loads by navigating to it in a temporary page.

    This is a full page navigation (waiting for ``domcontentloaded``), not a
    HEAD request, which is why the page title can be returned as well.

    Args:
        context: Playwright browser context used to open the page.
        url: Address to verify.
        timeout: Navigation timeout in milliseconds.

    Returns:
        ``(is_valid, status_code, page_title)``. ``is_valid`` is True for a
        status in ``[200, 400)``. Any navigation error or missing response
        yields ``(False, 0, "")``.
    """
    page = await context.new_page()
    try:
        response: Response | None = await page.goto(
            url, wait_until="domcontentloaded", timeout=timeout
        )
        if not response:
            return False, 0, ""
        status = response.status
        title = await get_page_title(page)
        return 200 <= status < 400, status, title
    except Exception:
        # Timeouts, DNS failures, aborted navigations: treat all as invalid.
        return False, 0, ""
    finally:
        await page.close()
|
||||
|
||||
|
||||
async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink]:
    """Crawl from ``settings.root_url`` and collect program and faculty links.

    Pages are visited breadth-first, except that URLs matching a directory
    pattern are always processed before other queued URLs. Only pages on the
    root URL's domain are visited, at most ``settings.max_pages`` of them and
    no deeper than ``settings.max_depth``. Links matching the exclusion
    keywords are ignored entirely (neither recorded nor followed).

    Args:
        settings: Crawl limits, delay and verification switch.
        browser_name: Attribute name of the Playwright browser type
            ("chromium", "firefox" or "webkit").

    Returns:
        The collected links. When ``settings.verify_links`` is set, only
        links whose verification succeeded are returned, one per URL.
    """
    async with async_playwright() as p:
        browser_launcher = getattr(p, browser_name)
        browser = await browser_launcher.launch(headless=settings.headless)
        context = await browser.new_context()

        # Priority queue: (priority, url, depth) - lower priority = processed first
        # Directory pages get priority 0, others get priority 1
        queue: Deque[Tuple[int, str, int]] = deque([(0, settings.root_url, 0)])
        visited: Set[str] = set()
        found_urls: Set[str] = set()  # For deduplication of results
        results: List[ScrapedLink] = []

        print(f"Starting crawl from: {settings.root_url}")
        print(f"Max depth: {settings.max_depth}, Max pages: {settings.max_pages}")

        try:
            while queue and len(visited) < settings.max_pages:
                # Stable sort: directory pages first, FIFO within a priority.
                queue = deque(sorted(queue, key=lambda item: item[0]))
                _, url, depth = queue.popleft()

                normalized_url = normalize_url(settings.root_url, url)
                if normalized_url in visited or depth > settings.max_depth:
                    continue

                # Only crawl same-domain URLs
                if not is_same_domain(settings.root_url, normalized_url):
                    continue

                visited.add(normalized_url)
                print(f"[{len(visited)}/{settings.max_pages}] Depth {depth}: {normalized_url[:80]}...")

                page = await context.new_page()
                # try/finally guarantees the page is closed on every path,
                # including an exception raised while extracting links.
                try:
                    try:
                        response = await page.goto(
                            normalized_url, wait_until="domcontentloaded", timeout=20000
                        )
                    except Exception as e:
                        print(f" Error: {e}")
                        continue
                    if not response or response.status >= 400:
                        continue

                    links = await extract_links(page)

                    for href, text in links:
                        normalized_href = normalize_url(normalized_url, href)

                        # Skip if already found or is excluded
                        if normalized_href in found_urls:
                            continue
                        if matches_any(text, EXCLUSION_KEYWORDS) or matches_any(normalized_href, EXCLUSION_KEYWORDS):
                            continue

                        text_lower = text.lower()
                        href_lower = normalized_href.lower()
                        is_profile = is_profile_url(normalized_href)

                        # Check for program links
                        if matches_any(text_lower, PROGRAM_KEYWORDS) or matches_any(href_lower, PROGRAM_KEYWORDS):
                            found_urls.add(normalized_href)
                            results.append(
                                ScrapedLink(
                                    url=normalized_href,
                                    title="",
                                    text=text[:200],
                                    source_url=normalized_url,
                                    bucket="program",
                                    is_profile_page=False,
                                )
                            )

                        # Check for faculty links (a link may match both
                        # buckets; later de-duplication keeps the first).
                        if matches_any(text_lower, FACULTY_KEYWORDS) or matches_any(href_lower, FACULTY_KEYWORDS):
                            found_urls.add(normalized_href)
                            results.append(
                                ScrapedLink(
                                    url=normalized_href,
                                    title="",
                                    text=text[:200],
                                    source_url=normalized_url,
                                    bucket="faculty",
                                    is_profile_page=is_profile,
                                )
                            )

                        # Queue for further crawling
                        if depth < settings.max_depth and is_same_domain(settings.root_url, normalized_href):
                            # Prioritize directory pages
                            link_priority = 0 if is_directory_url(normalized_href) else 1
                            queue.append((link_priority, normalized_href, depth + 1))
                finally:
                    await page.close()

                # Polite delay between requests
                await asyncio.sleep(settings.request_delay)

        finally:
            await context.close()
            await browser.close()

        # Verify links if enabled
        if settings.verify_links and results:
            print(f"\nVerifying {len(results)} links...")
            browser = await browser_launcher.launch(headless=True)
            try:
                context = await browser.new_context()

                verified_results: List[ScrapedLink] = []
                # URLs already accepted; a set keeps the duplicate check O(1)
                # instead of rebuilding a list on every iteration.
                verified_urls: Set[str] = set()
                for i, link in enumerate(results):
                    if link.url in verified_urls:
                        continue  # Skip duplicates

                    print(f" [{i+1}/{len(results)}] Verifying: {link.url[:60]}...")
                    is_valid, status, title = await verify_link(context, link.url)
                    link.is_verified = True
                    link.http_status = status
                    link.title = title or link.text

                    if is_valid:
                        verified_results.append(link)
                        verified_urls.add(link.url)
                    else:
                        print(f" Invalid (HTTP {status})")

                    await asyncio.sleep(0.5)  # Delay between verifications

                await context.close()
            finally:
                # Closing the browser also disposes of its contexts.
                await browser.close()
            results = verified_results

        return results
|
||||
|
||||
|
||||
def deduplicate_results(results: List[ScrapedLink]) -> List[ScrapedLink]:
    """Return ``results`` without repeated URLs, keeping the first occurrence.

    Args:
        results: Scraped links in discovery order.

    Returns:
        A new list, in the original order, holding one entry per distinct
        ``url``. The input list is not modified.
    """
    seen: Set[str] = set()
    unique: List[ScrapedLink] = []
    for link in results:
        if link.url in seen:
            continue
        seen.add(link.url)
        unique.append(link)
    return unique
|
||||
|
||||
|
||||
def serialize(results: List[ScrapedLink], target: Path, root_url: str) -> None:
    """Write the de-duplicated results and summary statistics to a JSON file.

    Args:
        results: Links collected by the crawl (duplicates are dropped here).
        target: Output file; missing parent directories are created.
        root_url: Seed URL, recorded verbatim in the payload.
    """
    results = deduplicate_results(results)

    program_links = [link for link in results if link.bucket == "program"]
    faculty_links = [link for link in results if link.bucket == "faculty"]
    profile_pages = [link for link in faculty_links if link.is_profile_page]
    # NOTE(review): only an exact HTTP 200 is counted here, whereas
    # verify_link accepts any status in [200, 400) - confirm intent.
    verified_count = len(
        [r for r in results if r.is_verified and r.http_status == 200]
    )

    payload = {
        "root_url": root_url,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "statistics": {
            "total_links": len(results),
            "program_links": len(program_links),
            "faculty_links": len(faculty_links),
            "profile_pages": len(profile_pages),
            "verified_links": verified_count,
        },
        "program_links": [asdict(link) for link in program_links],
        "faculty_links": [asdict(link) for link in faculty_links],
        "notes": EXTRA_NOTES,
        "metadata_fields": METADATA_FIELDS,
    }
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    print(f"\nResults saved to: {target}")
    print(f" Total links: {len(results)}")
    print(f" Program links: {len(program_links)}")
    print(f" Faculty links: {len(faculty_links)}")
    print(f" Profile pages: {len(profile_pages)}")
|
||||
|
||||
|
||||
def parse_args(argv: "List[str] | None" = None) -> argparse.Namespace:
    """Parse the command-line options of the RWTH Aachen scraper.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        Namespace with ``root_url``, ``max_depth``, ``max_pages``, ``output``,
        ``headless``, ``browser``, ``no_verify`` and ``delay``.
    """
    parser = argparse.ArgumentParser(
        description="Playwright scraper generated by the Agno agent for https://www.rwth-aachen.de/go/id/a/?lidx=1."
    )
    parser.add_argument(
        "--root-url",
        default="https://www.rwth-aachen.de/go/id/a/?lidx=1",
        help="Seed url to start crawling from.",
    )
    parser.add_argument(
        "--max-depth",
        type=int,
        default=3,
        help="Maximum crawl depth.",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=30,
        help="Maximum number of pages to visit.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("university-scraper_results.json"),
        help="Where to save the JSON output.",
    )
    # Headless is already the default; the flag is accepted for explicitness
    # and --no-headless (same dest) is the switch that changes behaviour.
    parser.add_argument(
        "--headless",
        action="store_true",
        default=True,
        help="Run browser in headless mode (default: True).",
    )
    parser.add_argument(
        "--no-headless",
        action="store_false",
        dest="headless",
        help="Run browser with visible window.",
    )
    parser.add_argument(
        "--browser",
        choices=["chromium", "firefox", "webkit"],
        default="chromium",
        help="Browser engine to launch via Playwright.",
    )
    parser.add_argument(
        "--no-verify",
        action="store_true",
        default=False,
        help="Skip link verification step.",
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=1.0,
        help="Delay between requests in seconds (polite crawling).",
    )
    return parser.parse_args(argv)
|
||||
|
||||
|
||||
async def main_async() -> None:
    """Parse CLI options, run the crawl and write the results to disk."""
    args = parse_args()
    settings = ScrapeSettings(
        root_url=args.root_url,
        max_depth=args.max_depth,
        max_pages=args.max_pages,
        headless=args.headless,
        output=args.output,
        # The CLI exposes the negative flag; settings store the positive one.
        verify_links=not args.no_verify,
        request_delay=args.delay,
    )
    links = await crawl(settings, browser_name=args.browser)
    serialize(links, settings.output, settings.root_url)
|
||||
|
||||
|
||||
def main() -> None:
    """Synchronous entry point: run ``main_async`` on a fresh event loop."""
    asyncio.run(main_async())


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user