University-Playwright-Codeg…/artifacts/test_faculty_scraper.py
yangxiaoyu-crypto 426cf4d2cd Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 15:25:08 +08:00


#!/usr/bin/env python3
"""
Test the faculty-scraping logic - only 3 programs.
"""
import asyncio
import json
import re

from playwright.async_api import async_playwright


def name_to_slug(name):
    """Convert a program name to a URL slug."""
    slug = name.lower()
    slug = re.sub(r'[^\w\s-]', '', slug)   # drop everything except word chars, spaces, hyphens
    slug = re.sub(r'[\s_]+', '-', slug)    # spaces/underscores -> hyphens
    slug = re.sub(r'-+', '-', slug)        # collapse repeated hyphens
    slug = slug.strip('-')
    return slug
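
# Worked example (added for illustration; not in the original script):
#   name_to_slug("African and African American Studies")
#   -> "african-and-african-american-studies"
# which is exactly the path segment plugged into the GSAS URLs built below.
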
async def get_faculty_from_gsas_page(page, gsas_url):
    """Find the Faculty link on a GSAS program page, then visit the
    department's People page and collect the faculty list."""
    faculty_list = []
    faculty_page_url = None
    try:
        print(f"  Visiting GSAS page: {gsas_url}")
        await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)

        # Look for the link in the page's Faculty section
        faculty_link = await page.evaluate('''() => {
            const links = document.querySelectorAll('a[href]');
            for (const link of links) {
                const text = link.innerText.toLowerCase();
                const href = link.href;
                if (text.includes('faculty') && text.includes('see list')) {
                    return href;
                }
                if (text.includes('faculty') && (href.includes('/people') || href.includes('/faculty'))) {
                    return href;
                }
            }
            return null;
        }''')

        if faculty_link:
            faculty_page_url = faculty_link
            print(f"  Found Faculty page link: {faculty_link}")

            # Visit the Faculty/People page
            await page.goto(faculty_link, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(2000)

            # Extract every faculty entry: keep profile-like links whose text
            # looks like a person's name, deduplicated by href
            faculty_list = await page.evaluate('''() => {
                const faculty = [];
                const seen = new Set();
                document.querySelectorAll('a[href]').forEach(a => {
                    const href = a.href || '';
                    const text = a.innerText.trim();
                    const lowerHref = href.toLowerCase();
                    if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
                         lowerHref.includes('/profile/')) &&
                        text.length > 3 && text.length < 100 &&
                        !text.toLowerCase().includes('people') &&
                        !text.toLowerCase().includes('faculty') &&
                        !lowerHref.endsWith('/people/') &&
                        !lowerHref.endsWith('/faculty/')) {
                        if (!seen.has(href)) {
                            seen.add(href);
                            faculty.push({
                                name: text,
                                url: href
                            });
                        }
                    }
                });
                return faculty;
            }''')

            print(f"  Found {len(faculty_list)} faculty members")
            for f in faculty_list[:5]:
                print(f"    - {f['name']}: {f['url']}")
            if len(faculty_list) > 5:
                print(f"    ... and {len(faculty_list) - 5} more")
        else:
            print("  No Faculty page link found")
    except Exception as e:
        print(f"  Failed to fetch faculty info: {e}")
    return faculty_list, faculty_page_url
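
# Usage sketch (illustrative, not part of the original test): with an
# already-open Playwright page, the helper returns (faculty_list, people_url),
# or ([], None) when no Faculty link is found on the program page.
#
#   faculty, people_url = await get_faculty_from_gsas_page(
#       page, "https://gsas.harvard.edu/program/economics")
#   for prof in faculty:
#       print(prof['name'], prof['url'])
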
async def test_faculty_scraper():
    """Test the faculty scraper."""
    # Test 3 programs
    test_programs = [
        "African and African American Studies",
        "Economics",
        "Computer Science"
    ]

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            viewport={'width': 1920, 'height': 1080}
        )
        page = await context.new_page()

        results = []
        for i, name in enumerate(test_programs, 1):
            print(f"\n{'='*60}")
            print(f"[{i}/{len(test_programs)}] Testing: {name}")
            print(f"{'='*60}")

            slug = name_to_slug(name)
            program_url = f"https://www.harvard.edu/programs/{slug}/"
            gsas_url = f"https://gsas.harvard.edu/program/{slug}"
            print(f"Program URL: {program_url}")
            print(f"GSAS URL: {gsas_url}")

            faculty_list, faculty_page_url = await get_faculty_from_gsas_page(page, gsas_url)
            results.append({
                'name': name,
                'url': program_url,
                'gsas_url': gsas_url,
                'faculty_page_url': faculty_page_url,
                'faculty': faculty_list,
                'faculty_count': len(faculty_list)
            })
            await page.wait_for_timeout(1000)

        await browser.close()

    # Print a summary
    print(f"\n\n{'='*60}")
    print("Test result summary")
    print(f"{'='*60}")
    for r in results:
        print(f"\n{r['name']}:")
        print(f"  Faculty page: {r['faculty_page_url'] or 'not found'}")
        print(f"  Faculty count: {r['faculty_count']}")

    # Save the test results
    with open('test_faculty_results.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print("\nTest results saved to: test_faculty_results.json")


if __name__ == "__main__":
    asyncio.run(test_faculty_scraper())
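
# For reference, test_faculty_results.json is a list of per-program records
# with the fields assembled above; the values below are illustrative
# placeholders, not real scrape output:
#
# [
#   {
#     "name": "Economics",
#     "url": "https://www.harvard.edu/programs/economics/",
#     "gsas_url": "https://gsas.harvard.edu/program/economics",
#     "faculty_page_url": "https://economics.harvard.edu/people",
#     "faculty": [{"name": "...", "url": "..."}],
#     "faculty_count": 1
#   },
#   ...
# ]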