Add university scraper system with backend, frontend, and configs

- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
yangxiaoyu-crypto
2025-12-22 15:25:08 +08:00
parent 2714c8ad5c
commit 426cf4d2cd
75 changed files with 13527 additions and 2 deletions

126
scripts/test_harvard.py Normal file
View File

@ -0,0 +1,126 @@
#!/usr/bin/env python3
"""
Test scraping Harvard University — exercises only 2 schools.
"""
import asyncio
import sys
from pathlib import Path

# Make the project's src/ directory importable when running this script directly.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from university_scraper.config import ScraperConfig
from university_scraper.scraper import UniversityScraper
# Simplified test configuration — only 2 schools are scraped.
TEST_CONFIG = {
    # Top-level university identity used by the scraper.
    "university": {
        "name": "Harvard University",
        "url": "https://www.harvard.edu/",
        "country": "USA"
    },
    # Schools are given as a fixed list rather than discovered dynamically.
    "schools": {
        "discovery_method": "static_list",
        "static_list": [
            {
                "name": "John A. Paulson School of Engineering and Applied Sciences (SEAS)",
                "url": "https://seas.harvard.edu/"
            },
            {
                "name": "Graduate School of Design (GSD)",
                "url": "https://www.gsd.harvard.edu/"
            }
        ]
    },
    # How to locate degree-program pages under each school site.
    "programs": {
        # Candidate URL paths appended to a school URL, tried in order.
        "paths_to_try": [
            "/academics/graduate-programs",
            "/programs",
            "/academics/programs",
            "/graduate"
        ],
        # Heuristics for recognizing program links by anchor text and href.
        "link_patterns": [
            {"text_contains": ["program", "degree"], "href_contains": ["/program", "/degree"]},
            {"text_contains": ["master", "graduate"], "href_contains": ["/master", "/graduate"]}
        ],
        # CSS selectors for extracting program fields from a listing page.
        "selectors": {
            "program_item": "div.program-item, li.program, a[href*='/program']",
            "program_name": "h3, .title",
            "program_url": "a[href]",
            "degree_type": ".degree"
        },
        "pagination": {"type": "none"}
    },
    # How to locate faculty/people pages for each school or program.
    "faculty": {
        # Strategies are presumably tried in order until one succeeds — TODO confirm
        # against university_scraper.scraper.
        "discovery_strategies": [
            {
                "type": "link_in_page",
                "patterns": [
                    {"text_contains": ["faculty", "people"], "href_contains": ["/faculty", "/people"]}
                ]
            },
            {
                "type": "url_pattern",
                "patterns": [
                    "{school_url}/faculty",
                    "{school_url}/people"
                ]
            }
        ],
        "selectors": {
            "faculty_item": "div.faculty, li.person",
            "faculty_name": "h3, .name",
            "faculty_url": "a[href*='/people/'], a[href*='/faculty/']"
        }
    },
    # Post-scrape filtering: keep master's-level degrees, drop PhD/bachelor's.
    "filters": {
        "program_degree_types": {
            "include": ["Master", "M.S.", "M.A.", "MBA", "M.Eng", "S.M."],
            "exclude": ["Ph.D.", "Doctor", "Bachelor"]
        },
        "exclude_schools": []
    }
}
async def test_harvard():
    """Run a small end-to-end scrape of Harvard (2 schools) and print a summary."""
    banner = "=" * 60
    print(banner)
    print("测试Harvard大学爬取简化版 - 2个学院")
    print(banner)

    config = ScraperConfig.from_dict(TEST_CONFIG)
    # headless=False keeps the browser window visible during the test run.
    async with UniversityScraper(config, headless=False) as scraper:
        university = await scraper.scrape()
        scraper.save_results("output/harvard_test_result.json")
        _print_details(university)


def _print_details(university):
    """Pretty-print scraped schools, their first few programs, and sample faculty."""
    banner = "=" * 60
    print("\n" + banner)
    print("详细结果:")
    print(banner)
    for school in university.schools:
        print(f"\n学院: {school.name}")
        print(f" URL: {school.url}")
        print(f" 项目数: {len(school.programs)}")
        # Show at most the first five programs per school.
        for prog in school.programs[:5]:
            print(f"\n 项目: {prog.name}")
            print(f" URL: {prog.url}")
            print(f" 学位: {prog.degree_type}")
            print(f" 导师数: {len(prog.faculty)}")
            if prog.faculty:
                print(" 导师示例:")
                # Show at most three sample faculty members per program.
                for f in prog.faculty[:3]:
                    print(f" - {f.name}: {f.url}")
        if len(school.programs) > 5:
            print(f"\n ... 还有 {len(school.programs) - 5} 个项目")
# Script entry point: drive the async test on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(test_harvard())