Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaoyu-crypto committed 2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions


@@ -0,0 +1,229 @@
"""
曼彻斯特大学专用爬虫脚本
改进版 - 从学院Staff页面提取导师信息
"""
import asyncio
import json
import re
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse
from playwright.async_api import async_playwright
# 曼彻斯特大学学院Staff页面映射
# 项目关键词 -> 学院Staff页面URL
SCHOOL_STAFF_MAPPING = {
    # Alliance Manchester Business School (AMBS)
    "accounting": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
    "finance": "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/",
    "business": "https://www.alliancembs.manchester.ac.uk/about/our-people/",
    "management": "https://www.alliancembs.manchester.ac.uk/about/our-people/",
    "marketing": "https://www.alliancembs.manchester.ac.uk/research/management-sciences-and-marketing/",
    "mba": "https://www.alliancembs.manchester.ac.uk/about/our-people/",
    # Further schools can be added here...
    # "computer": "...",
    # "engineering": "...",
}

# General staff pages to fall back on when no keyword matches
GENERAL_STAFF_PAGES = [
    "https://www.alliancembs.manchester.ac.uk/about/our-people/",
]
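
# --- Illustrative sketch only (an assumption, not part of the original flow;
# scrape() below never calls it): a hypothetical helper showing how the
# keyword mapping above could be resolved for a given programme name,
# falling back to GENERAL_STAFF_PAGES when nothing matches. ---
def resolve_staff_pages(program_name):
    """Return staff-page URLs whose keyword appears in the programme name."""
    name_lower = program_name.lower()
    matches = [url for kw, url in SCHOOL_STAFF_MAPPING.items() if kw in name_lower]
    # De-duplicate while keeping order; fall back to the general pages
    return list(dict.fromkeys(matches)) or list(GENERAL_STAFF_PAGES)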


async def scrape(output_callback=None):
    """Run the scrape."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )
        page = await context.new_page()
        base_url = "https://www.manchester.ac.uk/"
        result = {
            "name": "The University of Manchester",
            "url": base_url,
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "schools": []
        }
        try:
            # Step 1: scrape the masters programme list
            if output_callback:
                output_callback("info", "Step 1: Scraping masters programs list...")
            courses_url = "https://www.manchester.ac.uk/study/masters/courses/list/"
            await page.goto(courses_url, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(3000)

            # Extract every masters programme link from the course list page
            programs_data = await page.evaluate('''() => {
                const programs = [];
                const seen = new Set();
                document.querySelectorAll('a[href]').forEach(a => {
                    const href = a.href;
                    const text = a.innerText.trim().replace(/\\s+/g, ' ');
                    if (!href || seen.has(href)) return;
                    if (text.length < 10 || text.length > 200) return;
                    const hrefLower = href.toLowerCase();
                    const textLower = text.toLowerCase();
                    // Exclude navigation links
                    if (textLower === 'courses' || textLower === 'masters' ||
                        textLower.includes('admission') || textLower.includes('fees') ||
                        textLower.includes('skip to') || textLower.includes('skip navigation') ||
                        textLower === 'home' || textLower === 'search' ||
                        textLower.includes('contact') || textLower.includes('footer') ||
                        hrefLower.endsWith('/courses/') || hrefLower.endsWith('/masters/') ||
                        hrefLower.includes('#')) {
                        return;
                    }
                    // Course links must contain a 5-digit course ID
                    const hasNumericId = /\\/\\d{5}\\//.test(href);
                    const isCoursePage = hrefLower.includes('/courses/list/') && hasNumericId;
                    if (isCoursePage) {
                        seen.add(href);
                        programs.push({
                            name: text,
                            url: href
                        });
                    }
                });
                return programs;
            }''')
            if output_callback:
                output_callback("info", f"Found {len(programs_data)} masters programs")

            # Step 2: scrape supervisor information from the school staff pages
            if output_callback:
                output_callback("info", "Step 2: Scraping faculty from school staff pages...")
            all_faculty = {}  # school_url -> faculty list

            # Scrape the AMBS Accounting & Finance staff page
            staff_url = "https://www.alliancembs.manchester.ac.uk/research/accounting-and-finance/staff/"
            if output_callback:
                output_callback("info", f"Scraping staff from: {staff_url}")
            await page.goto(staff_url, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(3000)

            # Extract staff members from the page's table
            faculty_data = await page.evaluate('''() => {
                const faculty = [];
                const rows = document.querySelectorAll('table tr');
                rows.forEach(row => {
                    const cells = row.querySelectorAll('td');
                    if (cells.length >= 2) {
                        const link = cells[1]?.querySelector('a[href]');
                        const titleCell = cells[2];
                        if (link) {
                            const name = link.innerText.trim();
                            const url = link.href;
                            const title = titleCell ? titleCell.innerText.trim() : '';
                            if (name.length > 2 && !name.toLowerCase().includes('skip')) {
                                faculty.push({
                                    name: name,
                                    url: url,
                                    title: title
                                });
                            }
                        }
                    }
                });
                return faculty;
            }''')
            if output_callback:
                output_callback("info", f"Found {len(faculty_data)} faculty members from AMBS")
            all_faculty["AMBS - Accounting and Finance"] = faculty_data

            # Step 3: assemble the result - assign each programme to a school
            # by keyword match
            schools_data = {}
            for prog in programs_data:
                prog_name_lower = prog['name'].lower()
                # Determine which school the programme belongs to
                school_name = "Other Programs"
                matched_faculty = []
                for keyword in SCHOOL_STAFF_MAPPING:
                    if keyword in prog_name_lower:
                        if "accounting" in keyword or "finance" in keyword:
                            school_name = "Alliance Manchester Business School"
                            matched_faculty = all_faculty.get("AMBS - Accounting and Finance", [])
                        elif "business" in keyword or "management" in keyword or "mba" in keyword:
                            school_name = "Alliance Manchester Business School"
                            matched_faculty = all_faculty.get("AMBS - Accounting and Finance", [])
                        break
                if school_name not in schools_data:
                    schools_data[school_name] = {
                        "name": school_name,
                        "url": "",
                        "programs": [],
                        "faculty": matched_faculty  # school-level supervisors
                    }
                schools_data[school_name]["programs"].append({
                    "name": prog['name'],
                    "url": prog['url'],
                    "faculty": []  # programme-level faculty not populated for now
                })
result["schools"] = list(schools_data.values())
# 统计
total_programs = sum(len(s['programs']) for s in result['schools'])
total_faculty = sum(len(s.get('faculty', [])) for s in result['schools'])
if output_callback:
output_callback("info", f"Done! {len(result['schools'])} schools, {total_programs} programs, {total_faculty} faculty")
except Exception as e:
if output_callback:
output_callback("error", f"Scraping error: {str(e)}")
finally:
await browser.close()
return result


if __name__ == "__main__":
    import os
    import sys

    # Playwright spawns subprocesses, which on Windows requires the
    # Proactor event loop policy
    if sys.platform == "win32":
        asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

    def print_callback(level, msg):
        print(f"[{level}] {msg}")

    result = asyncio.run(scrape(output_callback=print_callback))

    # Save the result; create the output directory first so open() cannot fail
    os.makedirs("output", exist_ok=True)
    with open("output/manchester_improved_result.json", "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
    print("\nResult saved to output/manchester_improved_result.json")
    print(f"Schools: {len(result['schools'])}")
    for school in result['schools']:
        print(f"  - {school['name']}: {len(school['programs'])} programs, {len(school.get('faculty', []))} faculty")