Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
artifacts/test_faculty_scraper.py (new file, 165 lines)
@@ -0,0 +1,165 @@
#!/usr/bin/env python3
"""
Test the faculty-scraping logic - only test 3 programs.
"""

import asyncio
import json
import re

from playwright.async_api import async_playwright


def name_to_slug(name):
    """Convert a program name to a URL slug."""
    slug = name.lower()
    slug = re.sub(r'[^\w\s-]', '', slug)   # drop punctuation, keep word chars/spaces/hyphens
    slug = re.sub(r'[\s_]+', '-', slug)    # whitespace and underscores -> hyphens
    slug = re.sub(r'-+', '-', slug)        # collapse repeated hyphens
    slug = slug.strip('-')
    return slug
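
# Example of the slug mapping (added illustration, not part of the original script):
#   name_to_slug("African and African American Studies")
#   -> "african-and-african-american-studies"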


async def get_faculty_from_gsas_page(page, gsas_url):
    """Find the Faculty link on a GSAS program page, then visit the
    department's People page and collect the faculty list."""
    faculty_list = []
    faculty_page_url = None

    try:
        print(f"  Visiting GSAS page: {gsas_url}")
        await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)

        # Look for a link in the Faculty section
        faculty_link = await page.evaluate('''() => {
            const links = document.querySelectorAll('a[href]');
            for (const link of links) {
                const text = link.innerText.toLowerCase();
                const href = link.href;
                if (text.includes('faculty') && text.includes('see list')) {
                    return href;
                }
                if (text.includes('faculty') && (href.includes('/people') || href.includes('/faculty'))) {
                    return href;
                }
            }
            return null;
        }''')

        if faculty_link:
            faculty_page_url = faculty_link
            print(f"  Found Faculty page link: {faculty_link}")

            # Visit the Faculty/People page
            await page.goto(faculty_link, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(2000)

            # Extract faculty entries: keep profile-like links with a plausible
            # name, skip navigation links and the People/Faculty index pages
            faculty_list = await page.evaluate('''() => {
                const faculty = [];
                const seen = new Set();

                document.querySelectorAll('a[href]').forEach(a => {
                    const href = a.href || '';
                    const text = a.innerText.trim();
                    const lowerHref = href.toLowerCase();

                    if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
                         lowerHref.includes('/profile/')) &&
                        text.length > 3 && text.length < 100 &&
                        !text.toLowerCase().includes('people') &&
                        !text.toLowerCase().includes('faculty') &&
                        !lowerHref.endsWith('/people/') &&
                        !lowerHref.endsWith('/faculty/')) {

                        if (!seen.has(href)) {
                            seen.add(href);
                            faculty.push({
                                name: text,
                                url: href
                            });
                        }
                    }
                });

                return faculty;
            }''')

            print(f"  Found {len(faculty_list)} faculty members")
            for f in faculty_list[:5]:
                print(f"    - {f['name']}: {f['url']}")
            if len(faculty_list) > 5:
                print(f"    ... and {len(faculty_list) - 5} more")
        else:
            print("  No Faculty page link found")

    except Exception as e:
        print(f"  Failed to fetch faculty info: {e}")

    return faculty_list, faculty_page_url
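
# Usage sketch (added illustration; assumes an already-open Playwright page):
#   faculty, people_url = await get_faculty_from_gsas_page(page, gsas_url)
#   `faculty` is a list of {'name', 'url'} dicts; `people_url` is the
#   Faculty/People page that was scraped, or None if no link was found.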


async def test_faculty_scraper():
    """Test faculty scraping against a small sample."""

    # Test 3 programs
    test_programs = [
        "African and African American Studies",
        "Economics",
        "Computer Science"
    ]

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            viewport={'width': 1920, 'height': 1080}
        )
        page = await context.new_page()

        results = []

        for i, name in enumerate(test_programs, 1):
            print(f"\n{'='*60}")
            print(f"[{i}/{len(test_programs)}] Testing: {name}")
            print(f"{'='*60}")

            slug = name_to_slug(name)
            program_url = f"https://www.harvard.edu/programs/{slug}/"
            gsas_url = f"https://gsas.harvard.edu/program/{slug}"

            print(f"Program URL: {program_url}")
            print(f"GSAS URL: {gsas_url}")

            faculty_list, faculty_page_url = await get_faculty_from_gsas_page(page, gsas_url)

            results.append({
                'name': name,
                'url': program_url,
                'gsas_url': gsas_url,
                'faculty_page_url': faculty_page_url,
                'faculty': faculty_list,
                'faculty_count': len(faculty_list)
            })

            await page.wait_for_timeout(1000)

        await browser.close()

    # Print a summary
    print(f"\n\n{'='*60}")
    print("Test results summary")
    print(f"{'='*60}")

    for r in results:
        print(f"\n{r['name']}:")
        print(f"  Faculty page: {r['faculty_page_url'] or 'not found'}")
        print(f"  Faculty count: {r['faculty_count']}")

    # Save test results
    with open('test_faculty_results.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print("\nTest results saved to: test_faculty_results.json")


if __name__ == "__main__":
    asyncio.run(test_faculty_scraper())
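
To try the script locally (a minimal sketch; assumes Playwright and its Chromium build are installed):

    pip install playwright
    playwright install chromium
    python artifacts/test_faculty_scraper.py

The run opens a visible browser window (headless=False) and writes test_faculty_results.json to the current directory.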