Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaoyu-crypto
2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions


@@ -0,0 +1,466 @@
#!/usr/bin/env python3
"""
Harvard Graduate Programs Scraper
Scrapes every graduate program listed at https://www.harvard.edu/programs/?degree_levels=graduate
Walks through all result pages by clicking the pagination buttons
"""
import asyncio
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright
async def scrape_harvard_programs():
"""爬取Harvard研究生项目列表页面 - 通过点击分页按钮"""
all_programs = []
base_url = "https://www.harvard.edu/programs/?degree_levels=graduate"
async with async_playwright() as p:
# Launch the browser in headless mode
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
viewport={'width': 1920, 'height': 1080}
)
page = await context.new_page()
print(f"正在访问: {base_url}")
# 使用 domcontentloaded 而非 networkidle更快加载
await page.goto(base_url, wait_until="domcontentloaded", timeout=60000)
# 等待页面内容加载
await page.wait_for_timeout(5000)
# 滚动到页面底部以确保分页按钮加载
print("滚动到页面底部...")
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(2000)
current_page = 1
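# Safety cap: stop paginating after max_pages even if a "Next page" button is still present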
max_pages = 15
while current_page <= max_pages:
print(f"\n========== 第 {current_page} 页 ==========")
# 等待内容加载
await page.wait_for_timeout(2000)
# 提取当前页面的项目
# 从调试输出得知项目按钮的class是 'records__record___PbPhG c-programs-item__title-link'
# 需要点击按钮来获取URL因为Harvard使用JavaScript导航
# 首先获取所有项目按钮信息
page_data = await page.evaluate('''() => {
const programs = [];
// Find every program row/container
const programItems = document.querySelectorAll('[class*="records__record"], [class*="c-programs-item"]');
programItems.forEach((item, index) => {
// Get the program-name button
const nameBtn = item.querySelector('button[class*="title-link"], button[class*="c-programs-item"]');
if (!nameBtn) return;
const name = nameBtn.innerText.trim();
if (!name || name.length < 3) return;
// Extract the degree information
let degrees = '';
const allText = item.innerText;
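// Match any known Harvard degree abbreviation (A.B., Ph.D., M.A., S.M., ...) in the row text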
const degreeMatch = allText.match(/(A\\.B\\.|Ph\\.D\\.|M\\.A\\.|S\\.M\\.|M\\.Arch\\.|LL\\.M\\.|S\\.B\\.|A\\.L\\.B\\.|A\\.L\\.M\\.|M\\.M\\.Sc\\.|Ed\\.D\\.|Ed\\.M\\.|M\\.P\\.A\\.|M\\.P\\.P\\.|M\\.P\\.H\\.|J\\.D\\.|M\\.B\\.A\\.|M\\.D\\.|D\\.M\\.D\\.|Th\\.D\\.|M\\.Div\\.|M\\.T\\.S\\.|M\\.E\\.|D\\.M\\.Sc\\.|M\\.H\\.C\\.M\\.|M\\.L\\.A\\.|M\\.D\\.E\\.|M\\.R\\.E\\.|M\\.A\\.U\\.D\\.|M\\.R\\.P\\.L\\.)/g);
if (degreeMatch) {
degrees = degreeMatch.join(', ');
}
// Look for a link - check the various places it might live
let url = '';
// Method 1: look for an <a> tag
const link = item.querySelector('a[href]');
if (link && link.href) {
url = link.href;
}
// Method 2: check data attributes
if (!url) {
const dataUrl = nameBtn.getAttribute('data-url') ||
nameBtn.getAttribute('data-href') ||
item.getAttribute('data-url');
if (dataUrl) url = dataUrl;
}
// Method 3: check the onclick attribute
if (!url) {
const onclick = nameBtn.getAttribute('onclick') || '';
const urlMatch = onclick.match(/['"]([^'"]*\\/programs\\/[^'"]*)['"]/);
if (urlMatch) url = urlMatch[1];
}
programs.push({
name: name,
degrees: degrees,
url: url,
index: index
});
});
// If nothing was found above, fall back to scanning every button
if (programs.length === 0) {
// Find all program buttons
const buttons = document.querySelectorAll('button');
buttons.forEach((btn, index) => {
const className = btn.className || '';
if (className.includes('c-programs-item') || className.includes('title-link')) {
const name = btn.innerText.trim();
if (name && name.length > 3 && !name.match(/^(Page|Next|Previous|Search|Menu|Filter)/)) {
programs.push({
name: name,
degrees: '',
url: '',
index: index
});
}
}
});
}
return {
programs: programs,
totalFound: programs.length
};
}''')
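# page_data is a dict: {'programs': [{name, degrees, url, index}, ...], 'totalFound': N}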
# If the first page yields no programs, dump the HTML structure for debugging
if current_page == 1 and len(page_data['programs']) == 0:
print("No programs found; dumping the HTML structure for debugging...")
html_debug = await page.evaluate('''() => {
const debug = {
allButtons: [],
allLinks: [],
sampleHTML: ''
};
// Collect all buttons
document.querySelectorAll('button').forEach(btn => {
const text = btn.innerText.trim().substring(0, 50);
if (text && text.length > 3) {
debug.allButtons.push({
text: text,
class: btn.className.substring(0, 80)
});
}
});
// Grab a snippet of the <main> region's HTML
const main = document.querySelector('main') || document.body;
debug.sampleHTML = main.innerHTML.substring(0, 3000);
return debug;
}''')
print(f"找到 {len(html_debug['allButtons'])} 个按钮:")
for btn in html_debug['allButtons'][:20]:
print(f" - {btn['text']} | class: {btn['class']}")
print(f"\nHTML片段:\n{html_debug['sampleHTML'][:1500]}")
print(f" 本页找到 {len(page_data['programs'])} 个项目")
# 打印找到的项目
for prog in page_data['programs']:
print(f" - {prog['name']} ({prog['degrees']})")
# 添加到总列表(去重)
for prog in page_data['programs']:
name = prog['name'].strip()
if name and not any(p['name'] == name for p in all_programs):
all_programs.append({
'name': name,
'degrees': prog.get('degrees', ''),
'url': prog.get('url', ''),
'page': current_page
})
# Try to click the next-page button
try:
clicked = False
# On the first page, print every pagination-related element for debugging
if current_page == 1:
# Save a full-page screenshot for debugging
await page.screenshot(path="harvard_debug_pagination.png", full_page=True)
print("Saved debug screenshot: harvard_debug_pagination.png")
pagination_info = await page.evaluate('''() => {
const result = {
links: [],
buttons: [],
allClickable: [],
pageNumbers: [],
allText: []
};
// Find all links
document.querySelectorAll('a').forEach(a => {
const text = a.innerText.trim();
if (text.match(/^[0-9]+$|Next|page|Prev/i)) {
result.links.push({
text: text.substring(0, 50),
href: a.href,
visible: a.offsetParent !== null,
className: a.className
});
}
});
// Find all buttons
document.querySelectorAll('button').forEach(b => {
const text = b.innerText.trim();
if (text.match(/^[0-9]+$|Next|page|Prev/i) || text.length < 20) {
result.buttons.push({
text: text.substring(0, 50),
visible: b.offsetParent !== null,
className: b.className
});
}
});
// Find all clickable elements containing digits (likely pagination)
document.querySelectorAll('a, button, span[role="button"], div[role="button"], li a, nav a').forEach(el => {
const text = el.innerText.trim();
if (text.match(/^[0-9]$/) || text === 'Next page' || text.includes('Next')) {
result.pageNumbers.push({
tag: el.tagName,
text: text,
className: el.className,
id: el.id,
ariaLabel: el.getAttribute('aria-label'),
visible: el.offsetParent !== null
});
}
});
// Find every clickable element near the bottom of the page
const bodyRect = document.body.getBoundingClientRect();
document.querySelectorAll('*').forEach(el => {
const rect = el.getBoundingClientRect();
const text = el.innerText?.trim() || '';
// Only consider elements in the lower half of the page with short text
if (rect.top > bodyRect.height * 0.5 && text.length > 0 && text.length < 30) {
const style = window.getComputedStyle(el);
if (style.cursor === 'pointer' || el.tagName === 'A' || el.tagName === 'BUTTON') {
result.allClickable.push({
tag: el.tagName,
text: text.substring(0, 30),
top: Math.round(rect.top),
className: el.className?.substring?.(0, 50) || ''
});
}
}
});
// Dump the text at the bottom of the page for debugging
const bodyText = document.body.innerText;
const lines = bodyText.split('\\n').filter(l => l.trim());
// Find lines that are a single digit 1-9 or a pagination label
for (let i = 0; i < lines.length; i++) {
if (lines[i].match(/^[1-9]$|Next page|Previous/)) {
result.allText.push(lines[i]);
}
}
return result;
}''')
print(f"\n分页相关链接 ({len(pagination_info['links'])} 个):")
for link in pagination_info['links']:
print(f" a: '{link['text']}' class='{link.get('className', '')}' (visible: {link['visible']})")
print(f"\n分页相关按钮 ({len(pagination_info['buttons'])} 个):")
for btn in pagination_info['buttons']:
print(f" button: '{btn['text']}' class='{btn.get('className', '')}' (visible: {btn['visible']})")
print(f"\n页码元素 ({len(pagination_info['pageNumbers'])} 个):")
for pn in pagination_info['pageNumbers']:
print(f" {pn['tag']}: '{pn['text']}' aria-label='{pn.get('ariaLabel')}' visible={pn['visible']}")
print(f"\n页面下半部分可点击元素 ({len(pagination_info['allClickable'])} 个):")
for el in pagination_info['allClickable'][:30]:
print(f" {el['tag']}: '{el['text']}' (top: {el['top']})")
print(f"\n页面中的分页文本 ({len(pagination_info['allText'])} 个):")
for txt in pagination_info['allText'][:20]:
print(f" '{txt}'")
# Method 1: find the "Next page" button directly with a CSS selector (most reliable).
# Debug output showed the pagination button is <button class="c-pagination__link c-pagination__link--next">.
next_page_num = str(current_page + 1)
try:
next_btn = page.locator('button.c-pagination__link--next')
if await next_btn.count() > 0:
print(f"\n找到 'Next page' 按钮 (CSS选择器),尝试点击...")
await next_btn.first.scroll_into_view_if_needed()
await next_btn.first.click()
await page.wait_for_timeout(3000)
current_page += 1
clicked = True
except Exception as e:
print(f"方法1失败: {e}")
if clicked:
continue
# Method 2: look the button up with get_by_role
try:
next_btn = page.get_by_role("button", name="Next page")
if await next_btn.count() > 0:
print(f"\n通过role找到 'Next page' 按钮,尝试点击...")
await next_btn.first.scroll_into_view_if_needed()
await next_btn.first.click()
await page.wait_for_timeout(3000)
current_page += 1
clicked = True
except Exception as e:
print(f"方法2失败: {e}")
if clicked:
continue
# Method 3: iterate over all pagination buttons and click "Next page"
try:
pagination_buttons = await page.query_selector_all('button.c-pagination__link')
for btn in pagination_buttons:
text = await btn.inner_text()
if 'Next page' in text:
print(f"\n通过遍历分页按钮找到 'Next page',点击...")
await btn.scroll_into_view_if_needed()
await btn.click()
await page.wait_for_timeout(3000)
current_page += 1
clicked = True
break
except Exception as e:
print(f"方法3失败: {e}")
if clicked:
continue
# Method 4: click the pagination button directly via JavaScript
try:
js_clicked = await page.evaluate('''() => {
// Look for the Next page button
const nextBtn = document.querySelector('button.c-pagination__link--next');
if (nextBtn) {
nextBtn.click();
return true;
}
// Fallback: scan every pagination button
const buttons = document.querySelectorAll('button.c-pagination__link');
for (const btn of buttons) {
if (btn.innerText.includes('Next page')) {
btn.click();
return true;
}
}
return false;
}''')
if js_clicked:
print(f"\n通过JavaScript点击 'Next page' 成功")
await page.wait_for_timeout(3000)
current_page += 1
clicked = True
except Exception as e:
print(f"方法4失败: {e}")
if clicked:
continue
# Method 5: fall back to scanning every button on the page
try:
all_buttons = await page.query_selector_all('button')
for btn in all_buttons:
try:
text = await btn.inner_text()
if 'Next page' in text:
visible = await btn.is_visible()
if visible:
print(f"\n遍历所有按钮找到 'Next page',点击...")
await btn.scroll_into_view_if_needed()
await btn.click()
await page.wait_for_timeout(3000)
current_page += 1
clicked = True
break
except:
continue
except Exception as e:
print(f"方法5失败: {e}")
if clicked:
continue
print("没有找到下一页按钮,结束爬取")
break
except Exception as e:
print(f"点击下一页时出错: {e}")
break
# Generate the program URLs - Harvard's program pages follow the format
# https://www.harvard.edu/programs/{program-name-slug}/
# e.g. african-and-african-american-studies
def name_to_slug(name):
"""将项目名称转换为URL slug"""
# 转小写
slug = name.lower()
# 将特殊字符替换为空格
slug = re.sub(r'[^\w\s-]', '', slug)
# 替换空格为连字符
slug = re.sub(r'[\s_]+', '-', slug)
# 移除多余的连字符
slug = re.sub(r'-+', '-', slug)
# 移除首尾连字符
slug = slug.strip('-')
return slug
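# e.g. "African and African American Studies" -> "african-and-african-american-studies"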
print("\n正在生成项目URL...")
for prog in all_programs:
slug = name_to_slug(prog['name'])
prog['url'] = f"https://www.harvard.edu/programs/{slug}/"
print(f" {prog['name']} -> {prog['url']}")
await browser.close()
# Sort by program name
programs = sorted(all_programs, key=lambda x: x['name'])
# Save the results
result = {
'source_url': base_url,
'scraped_at': datetime.now(timezone.utc).isoformat(),
'total_pages_scraped': current_page,
'total_programs': len(programs),
'programs': programs
}
output_file = Path('harvard_programs_results.json')
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(result, f, ensure_ascii=False, indent=2)
print(f"\n{'='*60}")
print(f"爬取完成!")
print(f"共爬取 {current_page}")
print(f"共找到 {len(programs)} 个研究生项目")
print(f"结果保存到: {output_file}")
print(f"{'='*60}")
# Print the full list
print("\nComplete list of graduate programs:")
for i, prog in enumerate(programs, 1):
print(f"{i:3}. {prog['name']} - {prog['degrees']}")
return result
if __name__ == "__main__":
asyncio.run(scrape_harvard_programs())