Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaoyu-crypto
2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions
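
The summary above lists a FastAPI backend with API endpoints and database models; none of that code appears in the diff below, which touches only the scraper CLI. For orientation, here is a minimal, hypothetical sketch of what one such endpoint could look like; the route, model fields, and in-memory store are assumptions, not taken from this commit.

# Hypothetical sketch only: names and fields are not from this commit.
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class University(BaseModel):
    name: str       # e.g. "Harvard"
    base_url: str   # root URL handed to the scraper


# In-memory list standing in for the real database models.
universities: list[University] = []


@app.post("/universities")
def add_university(u: University) -> University:
    universities.append(u)
    return u


@app.get("/universities")
def list_universities() -> list[University]:
    return universities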

@@ -125,6 +125,7 @@ class ScrapeSettings:
     output: Path
     verify_links: bool = True
     request_delay: float = 1.0  # Polite crawling delay
+    timeout: int = 60000  # Navigation timeout in ms


 async def extract_links(page: Page) -> List[Tuple[str, str]]:
@@ -210,7 +211,7 @@ async def crawl(settings: ScrapeSettings, browser_name: str) -> List[ScrapedLink
         page = await context.new_page()
         try:
             response = await page.goto(
-                normalized_url, wait_until="domcontentloaded", timeout=20000
+                normalized_url, wait_until="domcontentloaded", timeout=settings.timeout
             )
             if not response or response.status >= 400:
                 await page.close()
@@ -411,6 +412,12 @@ def parse_args() -> argparse.Namespace:
         default=1.0,
         help="Delay between requests in seconds (polite crawling).",
     )
+    parser.add_argument(
+        "--timeout",
+        type=int,
+        default=60000,
+        help="Navigation timeout in milliseconds (default: 60000 = 60s).",
+    )
     return parser.parse_args()
@@ -424,6 +431,7 @@ async def main_async() -> None:
         output=args.output,
         verify_links=not args.no_verify,
         request_delay=args.delay,
+        timeout=args.timeout,
     )
     links = await crawl(settings, browser_name=args.browser)
     serialize(links, settings.output, settings.root_url)
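
Taken together, the hunks above replace the hard-coded 20-second page.goto() timeout with a ScrapeSettings.timeout field that defaults to 60 seconds and can be overridden from the CLI via --timeout (milliseconds). A minimal sketch of driving the same pipeline programmatically, assuming the module is importable as university_scraper.scraper (inferred from the commit summary) and that ScrapeSettings needs no required fields beyond those visible in this diff:

# Sketch under assumptions: the import path and the exact ScrapeSettings field
# set are inferred from this diff and the commit summary, not confirmed.
import asyncio
from pathlib import Path

from university_scraper.scraper import ScrapeSettings, crawl, serialize


async def run() -> None:
    settings = ScrapeSettings(
        root_url="https://www.example.edu",  # serialize() reads settings.root_url
        output=Path("links.json"),
        verify_links=True,
        request_delay=1.0,   # polite delay between requests, in seconds
        timeout=120_000,     # navigation timeout in ms; overrides the 60s default
    )
    links = await crawl(settings, browser_name="chromium")
    serialize(links, settings.output, settings.root_url)


if __name__ == "__main__":
    asyncio.run(run())

From the command line the same override is just --timeout 120000; leaving the flag off keeps the new 60-second default, which is more forgiving than the previous fixed 20-second limit.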