Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -125,6 +125,7 @@ class ScrapeSettings:
     output: Path
     verify_links: bool = True
     request_delay: float = 1.0  # Polite crawling delay
+    timeout: int = 60000  # Navigation timeout in ms


 async def extract_links(page: Page) -> List[Tuple[str, str]]:
@@ -210,7 +211,7 @@ async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink
         page = await context.new_page()
         try:
             response = await page.goto(
-                normalized_url, wait_until="domcontentloaded", timeout=20000
+                normalized_url, wait_until="domcontentloaded", timeout=settings.timeout
             )
             if not response or response.status >= 400:
                 await page.close()
@@ -411,6 +412,12 @@ def parse_args() -> argparse.Namespace:
         default=1.0,
         help="Delay between requests in seconds (polite crawling).",
     )
+    parser.add_argument(
+        "--timeout",
+        type=int,
+        default=60000,
+        help="Navigation timeout in milliseconds (default: 60000 = 60s).",
+    )
     return parser.parse_args()


@@ -424,6 +431,7 @@ async def main_async() -> None:
         output=args.output,
         verify_links=not args.no_verify,
         request_delay=args.delay,
+        timeout=args.timeout,
     )
     links = await crawl(settings, browser_name=args.browser)
     serialize(links, settings.output, settings.root_url)

Reference in New Issue
Block a user