Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities (see the sketch after this list)
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files
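
None of the config files appear in the excerpt below, so the following is a purely hypothetical sketch (HARVARD_CONFIG and its fields are assumptions, not the committed format): a per-university config might pair identifying metadata with the pages the scraper visits.

    # Hypothetical sketch only; the committed config format is not shown in this diff.
    HARVARD_CONFIG = {
        "university": "Harvard University",
        "base_url": "https://gsas.harvard.edu",
        "program_pages": [
            "https://gsas.harvard.edu/program/african-and-african-american-studies",
            "https://gsas.harvard.edu/program/economics",
        ],
        # Department homepages sometimes expose Faculty pages that program pages lack
        "department_pages": ["https://aaas.fas.harvard.edu/"],
    }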

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaoyu-crypto
2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions


@@ -0,0 +1,226 @@
"""
探索Harvard项目页面结构寻找导师信息
"""
import asyncio
from playwright.async_api import async_playwright
async def explore_program_page():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        # Visit a Graduate School of Arts and Sciences (GSAS) program page
        gsas_url = "https://gsas.harvard.edu/program/african-and-african-american-studies"
        print(f"Visiting GSAS program page: {gsas_url}")
        await page.goto(gsas_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)

        # Save a full-page screenshot for manual inspection
        await page.screenshot(path="gsas_program_page.png", full_page=True)
        print("Screenshot saved: gsas_program_page.png")

        # Analyze the page structure in the browser context
        page_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                h1: document.querySelector('h1')?.innerText || '',
                allHeadings: [],
                facultyLinks: [],
                peopleLinks: [],
                allLinks: []
            };
            // Collect all headings
            document.querySelectorAll('h1, h2, h3, h4').forEach(h => {
                info.allHeadings.push({
                    tag: h.tagName,
                    text: h.innerText.trim().substring(0, 100)
                });
            });
            // Inspect every link on the page
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                // Check whether the link looks faculty-related
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();
                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                        lowerHref.includes('professor') || lowerHref.includes('staff') ||
                        lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
                // Check whether the link points to a personal profile page
                if (href.includes('/people/') || href.includes('/faculty/') ||
                        href.includes('/profile/') || href.includes('/person/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
                // Keep every link with a reasonably sized label
                if (href && text.length > 2 && text.length < 150) {
                    info.allLinks.push({
                        text: text,
                        href: href
                    });
                }
            });
            return info;
        }''')
print(f"\n页面标题: {page_info['title']}")
print(f"H1: {page_info['h1']}")
print(f"\n所有标题 ({len(page_info['allHeadings'])}):")
for h in page_info['allHeadings']:
print(f" <{h['tag']}>: {h['text']}")
print(f"\n教职员工相关链接 ({len(page_info['facultyLinks'])}):")
for f in page_info['facultyLinks']:
print(f" - {f['text']} -> {f['href']}")
print(f"\n个人页面链接 ({len(page_info['peopleLinks'])}):")
for p in page_info['peopleLinks']:
print(f" - {p['text']} -> {p['href']}")
print(f"\n所有链接 ({len(page_info['allLinks'])}):")
for link in page_info['allLinks'][:50]:
print(f" - {link['text'][:60]} -> {link['href']}")

        # Try another program page to see whether the structure differs
        print("\n\n========== Trying another program page ==========")
        economics_url = "https://gsas.harvard.edu/program/economics"
        print(f"Visiting: {economics_url}")
        await page.goto(economics_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)

        # Save a full-page screenshot
        await page.screenshot(path="gsas_economics_page.png", full_page=True)
        print("Screenshot saved: gsas_economics_page.png")

        # Analyze the links on this page
        econ_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                facultyLinks: [],
                peopleLinks: []
            };
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();
                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                        lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
                if (href.includes('/people/') || href.includes('/faculty/') ||
                        href.includes('/profile/') || href.includes('/person/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
            });
            return info;
        }''')

        print(f"\nFaculty-related links ({len(econ_info['facultyLinks'])}):")
        for f in econ_info['facultyLinks']:
            print(f"  - {f['text']} -> {f['href']}")
        print(f"\nPersonal profile links ({len(econ_info['peopleLinks'])}):")
        for person in econ_info['peopleLinks']:
            print(f"  - {person['text']} -> {person['href']}")

        # Visit the department homepage to look for a dedicated Faculty page
        print("\n\n========== Trying the department homepage ==========")
        dept_url = "https://aaas.fas.harvard.edu/"
        print(f"Visiting department homepage: {dept_url}")
        await page.goto(dept_url, wait_until='networkidle')
        await page.wait_for_timeout(3000)
        await page.screenshot(path="aaas_dept_page.png", full_page=True)
        print("Screenshot saved: aaas_dept_page.png")

        dept_info = await page.evaluate('''() => {
            const info = {
                title: document.title,
                navLinks: [],
                facultyLinks: [],
                peopleLinks: []
            };
            // Collect navigation links
            document.querySelectorAll('nav a, [class*="nav"] a, [class*="menu"] a').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                if (text && text.length > 1 && text.length < 50) {
                    info.navLinks.push({
                        text: text,
                        href: href
                    });
                }
            });
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href || '';
                const text = a.innerText.trim();
                const lowerHref = href.toLowerCase();
                const lowerText = text.toLowerCase();
                if (lowerHref.includes('faculty') || lowerHref.includes('people') ||
                        lowerText.includes('faculty') || lowerText.includes('people')) {
                    info.facultyLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
                if (href.includes('/people/') || href.includes('/faculty/') ||
                        href.includes('/profile/')) {
                    info.peopleLinks.push({
                        text: text.substring(0, 100),
                        href: href
                    });
                }
            });
            return info;
        }''')

        print(f"\nNavigation links ({len(dept_info['navLinks'])}):")
        for link in dept_info['navLinks'][:20]:
            print(f"  - {link['text']} -> {link['href']}")
        print(f"\nFaculty-related links ({len(dept_info['facultyLinks'])}):")
        for f in dept_info['facultyLinks']:
            print(f"  - {f['text']} -> {f['href']}")
        print(f"\nPersonal profile links ({len(dept_info['peopleLinks'])}):")
        for person in dept_info['peopleLinks'][:30]:
            print(f"  - {person['text']} -> {person['href']}")

        await browser.close()


if __name__ == "__main__":
    asyncio.run(explore_program_page())
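
The three page.evaluate calls above inline nearly identical link-bucketing JavaScript. A minimal sketch of how that extraction could be hoisted into one reusable helper follows; it is an illustration under the same faculty/people heuristics, not code from this commit, and EXTRACT_LINKS_JS / extract_faculty_links are hypothetical names. Running headless also makes it usable without a display.

import asyncio
from playwright.async_api import async_playwright

# Sketch: the same faculty/people heuristics as explore_program_page(),
# factored into one JS snippet so every page visit reuses it.
EXTRACT_LINKS_JS = """() => {
    const buckets = { facultyLinks: [], peopleLinks: [] };
    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href || '';
        const text = a.innerText.trim();
        const l = (href + ' ' + text).toLowerCase();
        if (l.includes('faculty') || l.includes('people')) {
            buckets.facultyLinks.push({ text: text.substring(0, 100), href: href });
        }
        if (/\\/(people|faculty|profile|person)\\//.test(href)) {
            buckets.peopleLinks.push({ text: text.substring(0, 100), href: href });
        }
    });
    return buckets;
}"""

async def extract_faculty_links(page, url):
    """Visit url and return its facultyLinks/peopleLinks buckets."""
    await page.goto(url, wait_until='networkidle')
    return await page.evaluate(EXTRACT_LINKS_JS)

async def main():
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)  # headless for unattended runs
        page = await browser.new_page()
        for url in ("https://gsas.harvard.edu/program/economics",
                    "https://aaas.fas.harvard.edu/"):
            links = await extract_faculty_links(page, url)
            print(url, len(links['facultyLinks']), len(links['peopleLinks']))
        await browser.close()

if __name__ == "__main__":
    asyncio.run(main())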