Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
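The diff below covers only the debug artifact. As a rough sketch of the backend shape the message describes (module, route, model, and field names here are assumptions for illustration, not taken from this commit):

# Hypothetical sketch only -- not from this commit.
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()


class University(BaseModel):
    slug: str       # e.g. "harvard" (field names are assumed)
    name: str
    base_url: str


# In-memory stand-in for the database models the commit message mentions.
UNIVERSITIES = {
    "harvard": University(slug="harvard", name="Harvard University",
                          base_url="https://www.harvard.edu"),
}


@app.get("/universities/{slug}")
def get_university(slug: str) -> University:
    # Return the configured university, or 404 if the slug is unknown.
    uni = UNIVERSITIES.get(slug)
    if uni is None:
        raise HTTPException(status_code=404, detail="unknown university")
    return uni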
artifacts/debug_cs_faculty.py (new file, 83 lines)
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""
Debug the Computer Science Faculty pages.
"""

import asyncio

from playwright.async_api import async_playwright


async def debug_cs():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        # Visit the Computer Science GSAS page
        gsas_url = "https://gsas.harvard.edu/program/computer-science"
        print(f"Visiting: {gsas_url}")

        await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(3000)

        await page.screenshot(path="cs_gsas_page.png", full_page=True)
        print("Screenshot saved: cs_gsas_page.png")

        # Collect every link on the page
        links = await page.evaluate('''() => {
            const links = [];
            document.querySelectorAll('a[href]').forEach(a => {
                const text = a.innerText.trim();
                const href = a.href;
                if (text && text.length > 2 && text.length < 100) {
                    links.push({text: text, href: href});
                }
            });
            return links;
        }''')

        print(f"\nAll links on the page ({len(links)} total):")
        for link in links:
            print(f"  - {link['text'][:60]} -> {link['href']}")

        # Look for likely Faculty or People links
        print("\n\nSearching for Faculty/People-related links:")
        for link in links:
            text_lower = link['text'].lower()
            href_lower = link['href'].lower()
            if 'faculty' in text_lower or 'people' in href_lower or 'faculty' in href_lower or 'website' in text_lower:
                print(f"  * {link['text']} -> {link['href']}")

        # Try the SEAS (School of Engineering) page
        print("\n\nTrying the SEAS Computer Science page...")
        seas_url = "https://seas.harvard.edu/computer-science"
        await page.goto(seas_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)

        await page.screenshot(path="seas_cs_page.png", full_page=True)
        print("Screenshot saved: seas_cs_page.png")

        seas_links = await page.evaluate('''() => {
            const links = [];
            document.querySelectorAll('a[href]').forEach(a => {
                const text = a.innerText.trim();
                const href = a.href;
                const lowerText = text.toLowerCase();
                const lowerHref = href.toLowerCase();
                if ((lowerText.includes('faculty') || lowerText.includes('people') ||
                     lowerHref.includes('faculty') || lowerHref.includes('people')) &&
                    text.length > 2) {
                    links.push({text: text, href: href});
                }
            });
            return links;
        }''')

        print("\nFaculty/People links on the SEAS page:")
        for link in seas_links:
            print(f"  * {link['text']} -> {link['href']}")

        await browser.close()


if __name__ == "__main__":
    asyncio.run(debug_cs())
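To run the debug script locally, Playwright and its Chromium build must be installed first; the usual setup is:

pip install playwright
playwright install chromium
python artifacts/debug_cs_faculty.py

Both evaluate() blocks above gather (text, href) pairs the same way, so the keyword filtering could live in plain Python, where it is testable without a browser. A hypothetical helper, not part of this commit:

# Sketch only: shared link collection plus a Python-side keyword filter.
LINK_JS = '''() => Array.from(document.querySelectorAll('a[href]'))
    .map(a => ({text: a.innerText.trim(), href: a.href}))
    .filter(l => l.text.length > 2)'''

def filter_links(links, keywords=("faculty", "people")):
    # Keep links whose visible text or target URL mentions any keyword.
    return [l for l in links
            if any(k in l["text"].lower() or k in l["href"].lower()
                   for k in keywords)]

With this, the SEAS scan would reduce to filter_links(await page.evaluate(LINK_JS)).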