Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
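As a purely illustrative aside (not part of this diff, and not the actual schema of the configs added in this commit), the kind of per-university settings such a scraper typically consumes can be sketched from what the Manchester exploration script below probes for:

    # Hypothetical sketch only - field names and values are assumptions, not the shipped config.
    MANCHESTER_CONFIG = {
        "university": "University of Manchester",
        "list_url": "https://www.manchester.ac.uk/study/masters/courses/list/",
        "course_link_selector": 'a[href*="/course/"]',  # one of the candidate selectors probed below
        "wait_until": "domcontentloaded",
        "timeout_ms": 60000,
    }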
artifacts/explore_manchester.py (new file)
@@ -0,0 +1,173 @@
"""
|
||||
探索曼彻斯特大学硕士课程页面结构
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
|
||||
async def explore_manchester():
|
||||
"""探索曼彻斯特大学网站结构"""
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=False)
|
||||
context = await browser.new_context(
|
||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
)
|
||||
page = await context.new_page()
|
||||
|
||||
# 直接访问硕士课程A-Z列表页
|
||||
print("访问硕士课程A-Z列表页面...")
|
||||
await page.goto("https://www.manchester.ac.uk/study/masters/courses/list/",
|
||||
wait_until="domcontentloaded", timeout=60000)
|
||||
await page.wait_for_timeout(5000)
|
||||
|
||||
# 截图
|
||||
await page.screenshot(path="manchester_masters_page.png", full_page=False)
|
||||
print("截图已保存: manchester_masters_page.png")
|
||||
|
||||
# 分析页面结构
|
||||
page_info = await page.evaluate("""() => {
|
||||
const info = {
|
||||
title: document.title,
|
||||
url: window.location.href,
|
||||
all_links: [],
|
||||
course_candidates: [],
|
||||
page_sections: []
|
||||
};
|
||||
|
||||
// 获取所有链接
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href;
|
||||
const text = a.innerText.trim().substring(0, 100);
|
||||
if (href && text) {
|
||||
info.all_links.push({href, text});
|
||||
}
|
||||
});
|
||||
|
||||
// 查找可能的课程链接 - 包含 /course/ 或 list-item
|
||||
document.querySelectorAll('a[href*="/course/"], .course-link, [class*="course"] a, .search-result a, .list-item a').forEach(a => {
|
||||
info.course_candidates.push({
|
||||
href: a.href,
|
||||
text: a.innerText.trim().substring(0, 100),
|
||||
classes: a.className,
|
||||
parent_classes: a.parentElement?.className || ''
|
||||
});
|
||||
});
|
||||
|
||||
// 获取页面主要区块
|
||||
document.querySelectorAll('main, [role="main"], .content, #content, .results, .course-list').forEach(el => {
|
||||
info.page_sections.push({
|
||||
tag: el.tagName,
|
||||
id: el.id,
|
||||
classes: el.className,
|
||||
children_count: el.children.length
|
||||
});
|
||||
});
|
||||
|
||||
return info;
|
||||
}""")
|
||||
|
||||
print(f"\n页面标题: {page_info['title']}")
|
||||
print(f"当前URL: {page_info['url']}")
|
||||
print(f"\n总链接数: {len(page_info['all_links'])}")
|
||||
print(f"课程候选链接数: {len(page_info['course_candidates'])}")
|
||||
|
||||
# 查找包含 masters/courses/ 的链接
|
||||
masters_links = [l for l in page_info['all_links']
|
||||
if 'masters/courses/' in l['href'].lower()
|
||||
and l['href'] != page_info['url']]
|
||||
|
||||
print(f"\n硕士课程相关链接 ({len(masters_links)}):")
|
||||
for link in masters_links[:20]:
|
||||
print(f" - {link['text'][:50]}: {link['href']}")
|
||||
|
||||
print(f"\n课程候选详情:")
|
||||
for c in page_info['course_candidates'][:10]:
|
||||
print(f" - {c['text'][:50]}")
|
||||
print(f" URL: {c['href']}")
|
||||
print(f" Classes: {c['classes']}")
|
||||
|
||||
# 检查是否有搜索/筛选功能
|
||||
search_elements = await page.evaluate("""() => {
|
||||
const elements = [];
|
||||
document.querySelectorAll('input[type="search"], input[type="text"], select, .filter, .search').forEach(el => {
|
||||
elements.push({
|
||||
tag: el.tagName,
|
||||
type: el.type || '',
|
||||
id: el.id,
|
||||
name: el.name || '',
|
||||
classes: el.className
|
||||
});
|
||||
});
|
||||
return elements;
|
||||
}""")
|
||||
|
||||
print(f"\n搜索/筛选元素: {len(search_elements)}")
|
||||
for el in search_elements[:5]:
|
||||
print(f" - {el}")
|
||||
|
||||
# 尝试找到课程列表的实际结构
|
||||
print("\n\n正在分析页面中的课程列表结构...")
|
||||
|
||||
list_structures = await page.evaluate("""() => {
|
||||
const structures = [];
|
||||
|
||||
// 查找各种可能的列表结构
|
||||
const selectors = [
|
||||
'ul li a[href*="course"]',
|
||||
'div[class*="result"] a',
|
||||
'div[class*="course"] a',
|
||||
'article a[href]',
|
||||
'.search-results a',
|
||||
'[data-course] a',
|
||||
'table tr td a'
|
||||
];
|
||||
|
||||
for (const selector of selectors) {
|
||||
const elements = document.querySelectorAll(selector);
|
||||
if (elements.length > 0) {
|
||||
const samples = [];
|
||||
elements.forEach((el, i) => {
|
||||
if (i < 5) {
|
||||
samples.push({
|
||||
href: el.href,
|
||||
text: el.innerText.trim().substring(0, 80)
|
||||
});
|
||||
}
|
||||
});
|
||||
structures.push({
|
||||
selector: selector,
|
||||
count: elements.length,
|
||||
samples: samples
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return structures;
|
||||
}""")
|
||||
|
||||
print("\n找到的列表结构:")
|
||||
for s in list_structures:
|
||||
print(f"\n 选择器: {s['selector']} (共 {s['count']} 个)")
|
||||
for sample in s['samples']:
|
||||
print(f" - {sample['text']}: {sample['href']}")
|
||||
|
||||
# 保存完整分析结果
|
||||
with open("manchester_analysis.json", "w", encoding="utf-8") as f:
|
||||
json.dump(page_info, f, indent=2, ensure_ascii=False)
|
||||
|
||||
print("\n\n完整分析已保存到 manchester_analysis.json")
|
||||
|
||||
# 等待用户查看
|
||||
print("\n按 Ctrl+C 关闭浏览器...")
|
||||
try:
|
||||
await asyncio.sleep(30)
|
||||
except:
|
||||
pass
|
||||
|
||||
await browser.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(explore_manchester())
|
||||
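A minimal follow-up sketch (not included in this commit) of how the manchester_analysis.json file written above could be filtered for course URLs afterwards; the key names match what explore_manchester() saves:

    # Hypothetical usage example - reads the JSON dumped by explore_manchester().
    import json

    with open("manchester_analysis.json", encoding="utf-8") as f:
        analysis = json.load(f)

    course_urls = sorted({
        link["href"]
        for link in analysis["all_links"]
        if "masters/courses/" in link["href"].lower()
    })
    print(f"Found {len(course_urls)} unique master's course URLs")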