Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: yangxiaoyu-crypto
Date: 2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd

75 changed files with 13527 additions and 2 deletions

@@ -0,0 +1,356 @@
#!/usr/bin/env python3
"""
Harvard Graduate Programs Scraper with Faculty Information
爬取 https://www.harvard.edu/programs/?degree_levels=graduate 页面的所有研究生项目
并获取每个项目的导师个人信息页面URL
"""
import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright


def name_to_slug(name):
"""将项目名称转换为URL slug"""
slug = name.lower()
slug = re.sub(r'[^\w\s-]', '', slug)
slug = re.sub(r'[\s_]+', '-', slug)
slug = re.sub(r'-+', '-', slug)
slug = slug.strip('-')
    return slug


async def extract_faculty_from_page(page):
    """Extract all faculty profile links from the current page as {name, url} dicts."""
faculty_list = await page.evaluate('''() => {
const faculty = [];
const seen = new Set();
document.querySelectorAll('a[href]').forEach(a => {
const href = a.href || '';
const text = a.innerText.trim();
const lowerHref = href.toLowerCase();
const lowerText = text.toLowerCase();
            // Check whether this looks like a personal profile link
if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
lowerHref.includes('/profile/') || lowerHref.includes('/person/')) &&
text.length > 3 && text.length < 100 &&
!lowerText.includes('people') &&
!lowerText.includes('faculty') &&
!lowerText.includes('profile') &&
!lowerText.includes('staff') &&
!lowerHref.endsWith('/people/') &&
!lowerHref.endsWith('/people') &&
!lowerHref.endsWith('/faculty/') &&
!lowerHref.endsWith('/faculty')) {
if (!seen.has(href)) {
seen.add(href);
faculty.push({
name: text,
url: href
});
}
}
});
return faculty;
}''')
    return faculty_list


async def get_faculty_from_gsas_page(page, gsas_url, program_name):
    """Find the faculty link on a GSAS program page, then visit the
    department's People page and collect the faculty list."""
faculty_list = []
faculty_page_url = None
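    # Fallback chain: (1) an explicit "See list of ... faculty" link on the
    # GSAS page, (2) any faculty/people link there, (3) the department
    # website's own People/Faculty page.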
try:
print(f" 访问GSAS页面: {gsas_url}")
await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(2000)
        # Strategy 1: look for a "See list of ... faculty" link
faculty_link = await page.evaluate('''() => {
const links = document.querySelectorAll('a[href]');
for (const link of links) {
const text = link.innerText.toLowerCase();
const href = link.href;
if (text.includes('faculty') && text.includes('see list')) {
return href;
}
}
return null;
}''')
        # Strategy 2: look for any link containing /people or /faculty
if not faculty_link:
faculty_link = await page.evaluate('''() => {
const links = document.querySelectorAll('a[href]');
for (const link of links) {
const text = link.innerText.toLowerCase();
const href = link.href.toLowerCase();
                    // Look for faculty-related links
if ((text.includes('faculty') || text.includes('people')) &&
(href.includes('/people') || href.includes('/faculty'))) {
return link.href;
}
}
return null;
}''')
        # Strategy 3: find the department website link on the page, then try its People page
if not faculty_link:
dept_website = await page.evaluate('''() => {
const links = document.querySelectorAll('a[href]');
for (const link of links) {
const text = link.innerText.toLowerCase();
const href = link.href;
                    // Look for the Website link (usually the department homepage)
if (text.includes('website') && href.includes('harvard.edu') &&
!href.includes('gsas.harvard.edu')) {
return href;
}
}
return null;
}''')
if dept_website:
print(f" 找到院系网站: {dept_website}")
try:
await page.goto(dept_website, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(2000)
                    # Look for a People/Faculty link on the department site
faculty_link = await page.evaluate('''() => {
const links = document.querySelectorAll('a[href]');
for (const link of links) {
const text = link.innerText.toLowerCase().trim();
const href = link.href;
if ((text === 'people' || text === 'faculty' ||
text === 'faculty & research' || text.includes('our faculty')) &&
(href.includes('/people') || href.includes('/faculty'))) {
return href;
}
}
return null;
}''')
except Exception as e:
print(f" 访问院系网站失败: {e}")
if faculty_link:
faculty_page_url = faculty_link
print(f" 找到Faculty页面: {faculty_link}")
# 访问Faculty/People页面
await page.goto(faculty_link, wait_until="domcontentloaded", timeout=30000)
await page.wait_for_timeout(2000)
            # Extract all faculty entries
faculty_list = await extract_faculty_from_page(page)
            # If nothing was found on the first pass, retry once to handle
            # paginated or JavaScript-rendered layouts
            if len(faculty_list) == 0:
                # Content may load late; wait and re-extract
await page.wait_for_timeout(2000)
faculty_list = await extract_faculty_from_page(page)
print(f" 找到 {len(faculty_list)} 位导师")
else:
print(f" 未找到Faculty页面链接")
except Exception as e:
print(f" 获取Faculty信息失败: {e}")
    return faculty_list, faculty_page_url


async def scrape_harvard_programs_with_faculty():
    """Scrape the Harvard graduate program list and each program's faculty info."""
all_programs = []
base_url = "https://www.harvard.edu/programs/?degree_levels=graduate"
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
viewport={'width': 1920, 'height': 1080}
)
page = await context.new_page()
print(f"正在访问: {base_url}")
await page.goto(base_url, wait_until="domcontentloaded", timeout=60000)
await page.wait_for_timeout(5000)
        # Scroll to the bottom of the page
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(2000)
current_page = 1
max_pages = 15
        # Phase 1: collect basic info for every program
        print("\n========== Phase 1: collecting the program list ==========")
while current_page <= max_pages:
print(f"\n--- 第 {current_page} 页 ---")
await page.wait_for_timeout(2000)
            # Extract the programs on the current page
page_data = await page.evaluate('''() => {
const programs = [];
const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]');
programItems.forEach((item, index) => {
const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]');
if (!nameBtn) return;
const name = nameBtn.innerText.trim();
if (!name || name.length < 3) return;
let degrees = '';
const allText = item.innerText;
const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g);
if (degreeMatch) {
degrees = degreeMatch.join(', ');
}
programs.push({
name: name,
degrees: degrees
});
});
if (programs.length === 0) {
const buttons = document.querySelectorAll('button');
buttons.forEach((btn) => {
const className = btn.className || '';
if (className.includes('c-programs-item') || className.includes('title-link')) {
const name = btn.innerText.trim();
if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) {
programs.push({
name: name,
degrees: ''
});
}
}
});
}
return programs;
}''')
print(f" 本页找到 {len(page_data)} 个项目")
for prog in page_data:
name = prog['name'].strip()
if name and not any(p['name'] == name for p in all_programs):
all_programs.append({
'name': name,
'degrees': prog.get('degrees', ''),
'page': current_page
})
            # Try to click the next-page button
try:
next_btn = page.locator('button.c-pagination__link--next')
if await next_btn.count() > 0:
await next_btn.first.scroll_into_view_if_needed()
await next_btn.first.click()
await page.wait_for_timeout(3000)
current_page += 1
else:
print("没有下一页按钮,结束收集")
break
except Exception as e:
print(f"分页失败: {e}")
break
print(f"\n共收集到 {len(all_programs)} 个项目")
# 第二阶段:为每个项目获取导师信息
print("\n========== 第二阶段:获取导师信息 ==========")
print("注意这将访问每个项目的GSAS页面可能需要较长时间...")
for i, prog in enumerate(all_programs, 1):
print(f"\n[{i}/{len(all_programs)}] {prog['name']}")
            # Build the program URL
            slug = name_to_slug(prog['name'])
            prog['url'] = f"https://www.harvard.edu/programs/{slug}/"
            # Build the corresponding GSAS URL
            gsas_url = f"https://gsas.harvard.edu/program/{slug}"
            # Fetch faculty info
faculty_list, faculty_page_url = await get_faculty_from_gsas_page(page, gsas_url, prog['name'])
prog['faculty_page_url'] = faculty_page_url or ""
prog['faculty'] = faculty_list
prog['faculty_count'] = len(faculty_list)
            # Save progress every 10 programs
if i % 10 == 0:
temp_result = {
'source_url': base_url,
'scraped_at': datetime.now(timezone.utc).isoformat(),
'progress': f"{i}/{len(all_programs)}",
'programs': all_programs[:i]
}
with open('harvard_programs_progress.json', 'w', encoding='utf-8') as f:
json.dump(temp_result, f, ensure_ascii=False, indent=2)
print(f" [进度已保存]")
            # Throttle to avoid hitting the site too fast
await page.wait_for_timeout(1500)
await browser.close()
    # Sort programs by name
    programs = sorted(all_programs, key=lambda x: x['name'])
    # Summary statistics
    total_faculty = sum(p['faculty_count'] for p in programs)
    programs_with_faculty = sum(1 for p in programs if p['faculty_count'] > 0)
    # Save the final result
result = {
'source_url': base_url,
'scraped_at': datetime.now(timezone.utc).isoformat(),
'total_pages_scraped': current_page,
'total_programs': len(programs),
'programs_with_faculty': programs_with_faculty,
'total_faculty_found': total_faculty,
'programs': programs
}
output_file = Path('harvard_programs_with_faculty.json')
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(result, f, ensure_ascii=False, indent=2)
print(f"\n{'='*60}")
print(f"爬取完成!")
print(f"共爬取 {current_page}")
print(f"共找到 {len(programs)} 个研究生项目")
print(f"其中 {programs_with_faculty} 个项目有导师信息")
print(f"共找到 {total_faculty} 位导师")
print(f"结果保存到: {output_file}")
print(f"{'='*60}")
    # Print a summary of the first 30 programs
    print("\nProgram summary (first 30):")
    for i, prog in enumerate(programs[:30], 1):
        faculty_info = f"({prog['faculty_count']} faculty)" if prog['faculty_count'] > 0 else "(no faculty info)"
        print(f"{i:3}. {prog['name']} {faculty_info}")
    if len(programs) > 30:
        print(f"... and {len(programs) - 30} more programs")
    return result


if __name__ == "__main__":
asyncio.run(scrape_harvard_programs_with_faculty())