#!/usr/bin/env python3
"""Test the faculty-scraping logic — only 3 sample programs are tested."""
import asyncio
import json
import re

from playwright.async_api import async_playwright


def name_to_slug(name):
    """Convert a program name into a lowercase, hyphen-separated URL slug."""
    slug = name.lower()
    slug = re.sub(r'[^\w\s-]', '', slug)  # drop punctuation (keep word chars, spaces, hyphens)
    slug = re.sub(r'[\s_]+', '-', slug)   # whitespace/underscore runs -> single hyphen
    slug = re.sub(r'-+', '-', slug)       # collapse consecutive hyphens
    return slug.strip('-')


async def get_faculty_from_gsas_page(page, gsas_url):
    """Find the Faculty link on a GSAS program page, then visit the
    department People page it points to and scrape the faculty list.

    Returns a tuple ``(faculty_list, faculty_page_url)``. Both are
    empty/None when nothing is found or an error occurs — scraping is
    deliberately best-effort and never raises to the caller.
    """
    faculty_list = []
    faculty_page_url = None
    try:
        print(f"  访问GSAS页面: {gsas_url}")
        await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)  # give dynamic content time to settle

        # Look for a link to the Faculty section: either "faculty ... see list"
        # anchor text, or a faculty-labelled link whose URL targets /people or /faculty.
        faculty_link = await page.evaluate('''() => {
            const links = document.querySelectorAll('a[href]');
            for (const link of links) {
                const text = link.innerText.toLowerCase();
                const href = link.href;
                if (text.includes('faculty') && text.includes('see list')) {
                    return href;
                }
                if (text.includes('faculty') &&
                    (href.includes('/people') || href.includes('/faculty'))) {
                    return href;
                }
            }
            return null;
        }''')

        if faculty_link:
            faculty_page_url = faculty_link
            print(f"  找到Faculty页面链接: {faculty_link}")

            # Visit the Faculty/People page and extract individual profiles.
            await page.goto(faculty_link, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(2000)

            # Collect profile links, de-duplicated by href. Navigation links
            # (anchor text containing "people"/"faculty") and bare section
            # index URLs (ending in /people/ or /faculty/) are skipped.
            faculty_list = await page.evaluate('''() => {
                const faculty = [];
                const seen = new Set();
                document.querySelectorAll('a[href]').forEach(a => {
                    const href = a.href || '';
                    const text = a.innerText.trim();
                    const lowerHref = href.toLowerCase();
                    if ((lowerHref.includes('/people/') ||
                         lowerHref.includes('/faculty/') ||
                         lowerHref.includes('/profile/')) &&
                        text.length > 3 && text.length < 100 &&
                        !text.toLowerCase().includes('people') &&
                        !text.toLowerCase().includes('faculty') &&
                        !lowerHref.endsWith('/people/') &&
                        !lowerHref.endsWith('/faculty/')) {
                        if (!seen.has(href)) {
                            seen.add(href);
                            faculty.push({ name: text, url: href });
                        }
                    }
                });
                return faculty;
            }''')

            print(f"  找到 {len(faculty_list)} 位导师")
            for f in faculty_list[:5]:
                print(f"    - {f['name']}: {f['url']}")
            if len(faculty_list) > 5:
                print(f"    ... 还有 {len(faculty_list) - 5} 位")
        else:
            print("  未找到Faculty页面链接")
    except Exception as e:
        # Best-effort: log and fall through with whatever was collected.
        print(f"  获取Faculty信息失败: {e}")
    return faculty_list, faculty_page_url


async def test_faculty_scraper():
    """Run the faculty scraper against 3 sample programs, print a summary,
    and save the results to test_faculty_results.json."""
    test_programs = [
        "African and African American Studies",
        "Economics",
        "Computer Science",
    ]

    async with async_playwright() as p:
        # headless=False so the scraping can be watched during this test run.
        browser = await p.chromium.launch(headless=False)
        results = []
        try:
            # FIX: browser.close() was unprotected — an exception mid-scrape
            # leaked the Chromium process. try/finally guarantees cleanup.
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
                viewport={'width': 1920, 'height': 1080},
            )
            page = await context.new_page()

            for i, name in enumerate(test_programs, 1):
                print(f"\n{'='*60}")
                print(f"[{i}/{len(test_programs)}] 测试: {name}")
                print(f"{'='*60}")

                slug = name_to_slug(name)
                program_url = f"https://www.harvard.edu/programs/{slug}/"
                gsas_url = f"https://gsas.harvard.edu/program/{slug}"
                print(f"项目URL: {program_url}")
                print(f"GSAS URL: {gsas_url}")

                faculty_list, faculty_page_url = await get_faculty_from_gsas_page(page, gsas_url)

                results.append({
                    'name': name,
                    'url': program_url,
                    'gsas_url': gsas_url,
                    'faculty_page_url': faculty_page_url,
                    'faculty': faculty_list,
                    'faculty_count': len(faculty_list),
                })
                await page.wait_for_timeout(1000)  # polite pause between programs
        finally:
            await browser.close()

    # Print the per-program summary.
    print(f"\n\n{'='*60}")
    print("测试结果汇总")
    print(f"{'='*60}")
    for r in results:
        print(f"\n{r['name']}:")
        print(f"  Faculty页面: {r['faculty_page_url'] or '未找到'}")
        print(f"  导师数量: {r['faculty_count']}")

    # Persist the full results for later inspection.
    with open('test_faculty_results.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print("\n测试结果已保存到: test_faculty_results.json")


if __name__ == "__main__":
    asyncio.run(test_faculty_scraper())