Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
artifacts/test_faculty_scraper.py (new file, 165 lines)
@@ -0,0 +1,165 @@
#!/usr/bin/env python3
"""
Test the faculty-scraping logic - only test 3 programs.
"""

import asyncio
import json
import re

from playwright.async_api import async_playwright


def name_to_slug(name):
    """Convert a program name to a URL slug."""
    slug = name.lower()
    slug = re.sub(r'[^\w\s-]', '', slug)   # drop punctuation, keep word chars/spaces/hyphens
    slug = re.sub(r'[\s_]+', '-', slug)    # whitespace and underscores -> hyphens
    slug = re.sub(r'-+', '-', slug)        # collapse repeated hyphens
    slug = slug.strip('-')
    return slug
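
# Example of the slug mapping (added illustration, not part of the original script):
#   name_to_slug("African and African American Studies")
#   -> "african-and-african-american-studies"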


async def get_faculty_from_gsas_page(page, gsas_url):
    """Find the Faculty link on a GSAS program page, then visit the
    department's People page and collect the faculty list."""
    faculty_list = []
    faculty_page_url = None

    try:
        print(f"  Visiting GSAS page: {gsas_url}")
        await page.goto(gsas_url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)

        # Look for a link in the Faculty section
        faculty_link = await page.evaluate('''() => {
            const links = document.querySelectorAll('a[href]');
            for (const link of links) {
                const text = link.innerText.toLowerCase();
                const href = link.href;
                if (text.includes('faculty') && text.includes('see list')) {
                    return href;
                }
                if (text.includes('faculty') && (href.includes('/people') || href.includes('/faculty'))) {
                    return href;
                }
            }
            return null;
        }''')

        if faculty_link:
            faculty_page_url = faculty_link
            print(f"  Found Faculty page link: {faculty_link}")

            # Visit the Faculty/People page
            await page.goto(faculty_link, wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(2000)

            # Extract faculty entries: keep profile-like links with a plausible
            # name, skip navigation links and the People/Faculty index pages
            faculty_list = await page.evaluate('''() => {
                const faculty = [];
                const seen = new Set();

                document.querySelectorAll('a[href]').forEach(a => {
                    const href = a.href || '';
                    const text = a.innerText.trim();
                    const lowerHref = href.toLowerCase();

                    if ((lowerHref.includes('/people/') || lowerHref.includes('/faculty/') ||
                         lowerHref.includes('/profile/')) &&
                        text.length > 3 && text.length < 100 &&
                        !text.toLowerCase().includes('people') &&
                        !text.toLowerCase().includes('faculty') &&
                        !lowerHref.endsWith('/people/') &&
                        !lowerHref.endsWith('/faculty/')) {

                        if (!seen.has(href)) {
                            seen.add(href);
                            faculty.push({
                                name: text,
                                url: href
                            });
                        }
                    }
                });

                return faculty;
            }''')

            print(f"  Found {len(faculty_list)} faculty members")
            for f in faculty_list[:5]:
                print(f"    - {f['name']}: {f['url']}")
            if len(faculty_list) > 5:
                print(f"    ... and {len(faculty_list) - 5} more")
        else:
            print("  No Faculty page link found")

    except Exception as e:
        print(f"  Failed to fetch faculty info: {e}")

    return faculty_list, faculty_page_url
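
# Usage sketch (added illustration; assumes an already-open Playwright page):
#   faculty, people_url = await get_faculty_from_gsas_page(page, gsas_url)
#   `faculty` is a list of {'name', 'url'} dicts; `people_url` is the
#   Faculty/People page that was scraped, or None if no link was found.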


async def test_faculty_scraper():
    """Test faculty scraping against a small sample."""

    # Test 3 programs
    test_programs = [
        "African and African American Studies",
        "Economics",
        "Computer Science"
    ]

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            viewport={'width': 1920, 'height': 1080}
        )
        page = await context.new_page()

        results = []

        for i, name in enumerate(test_programs, 1):
            print(f"\n{'='*60}")
            print(f"[{i}/{len(test_programs)}] Testing: {name}")
            print(f"{'='*60}")

            slug = name_to_slug(name)
            program_url = f"https://www.harvard.edu/programs/{slug}/"
            gsas_url = f"https://gsas.harvard.edu/program/{slug}"

            print(f"Program URL: {program_url}")
            print(f"GSAS URL: {gsas_url}")

            faculty_list, faculty_page_url = await get_faculty_from_gsas_page(page, gsas_url)

            results.append({
                'name': name,
                'url': program_url,
                'gsas_url': gsas_url,
                'faculty_page_url': faculty_page_url,
                'faculty': faculty_list,
                'faculty_count': len(faculty_list)
            })

            await page.wait_for_timeout(1000)

        await browser.close()

    # Print a summary
    print(f"\n\n{'='*60}")
    print("Test results summary")
    print(f"{'='*60}")

    for r in results:
        print(f"\n{r['name']}:")
        print(f"  Faculty page: {r['faculty_page_url'] or 'not found'}")
        print(f"  Faculty count: {r['faculty_count']}")

    # Save test results
    with open('test_faculty_results.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print("\nTest results saved to: test_faculty_results.json")


if __name__ == "__main__":
    asyncio.run(test_faculty_scraper())
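
To try the script locally (a minimal sketch; assumes Playwright and its Chromium build are installed):

    pip install playwright
    playwright install chromium
    python artifacts/test_faculty_scraper.py

The run opens a visible browser window (headless=False) and writes test_faculty_results.json to the current directory.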