Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
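As a purely illustrative aside (not part of this diff, and not the actual schema of the configs added in this commit), the kind of per-university settings such a scraper typically consumes can be sketched from what the Manchester exploration script below probes for:

    # Hypothetical sketch only - field names and values are assumptions, not the shipped config.
    MANCHESTER_CONFIG = {
        "university": "University of Manchester",
        "list_url": "https://www.manchester.ac.uk/study/masters/courses/list/",
        "course_link_selector": 'a[href*="/course/"]',  # one of the candidate selectors probed below
        "wait_until": "domcontentloaded",
        "timeout_ms": 60000,
    }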
artifacts/explore_manchester.py (new file)
@@ -0,0 +1,173 @@
"""
|
||||
探索曼彻斯特大学硕士课程页面结构
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
|
||||
async def explore_manchester():
|
||||
"""探索曼彻斯特大学网站结构"""
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=False)
|
||||
context = await browser.new_context(
|
||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
)
|
||||
page = await context.new_page()
|
||||
|
||||
# 直接访问硕士课程A-Z列表页
|
||||
print("访问硕士课程A-Z列表页面...")
|
||||
await page.goto("https://www.manchester.ac.uk/study/masters/courses/list/",
|
||||
wait_until="domcontentloaded", timeout=60000)
|
||||
await page.wait_for_timeout(5000)
|
||||
|
||||
# 截图
|
||||
await page.screenshot(path="manchester_masters_page.png", full_page=False)
|
||||
print("截图已保存: manchester_masters_page.png")
|
||||
|
||||
# 分析页面结构
|
||||
page_info = await page.evaluate("""() => {
|
||||
const info = {
|
||||
title: document.title,
|
||||
url: window.location.href,
|
||||
all_links: [],
|
||||
course_candidates: [],
|
||||
page_sections: []
|
||||
};
|
||||
|
||||
// 获取所有链接
|
||||
document.querySelectorAll('a[href]').forEach(a => {
|
||||
const href = a.href;
|
||||
const text = a.innerText.trim().substring(0, 100);
|
||||
if (href && text) {
|
||||
info.all_links.push({href, text});
|
||||
}
|
||||
});
|
||||
|
||||
// 查找可能的课程链接 - 包含 /course/ 或 list-item
|
||||
document.querySelectorAll('a[href*="/course/"], .course-link, [class*="course"] a, .search-result a, .list-item a').forEach(a => {
|
||||
info.course_candidates.push({
|
||||
href: a.href,
|
||||
text: a.innerText.trim().substring(0, 100),
|
||||
classes: a.className,
|
||||
parent_classes: a.parentElement?.className || ''
|
||||
});
|
||||
});
|
||||
|
||||
// 获取页面主要区块
|
||||
document.querySelectorAll('main, [role="main"], .content, #content, .results, .course-list').forEach(el => {
|
||||
info.page_sections.push({
|
||||
tag: el.tagName,
|
||||
id: el.id,
|
||||
classes: el.className,
|
||||
children_count: el.children.length
|
||||
});
|
||||
});
|
||||
|
||||
return info;
|
||||
}""")
|
||||
|
||||
print(f"\n页面标题: {page_info['title']}")
|
||||
print(f"当前URL: {page_info['url']}")
|
||||
print(f"\n总链接数: {len(page_info['all_links'])}")
|
||||
print(f"课程候选链接数: {len(page_info['course_candidates'])}")
|
||||
|
||||
# 查找包含 masters/courses/ 的链接
|
||||
masters_links = [l for l in page_info['all_links']
|
||||
if 'masters/courses/' in l['href'].lower()
|
||||
and l['href'] != page_info['url']]
|
||||
|
||||
print(f"\n硕士课程相关链接 ({len(masters_links)}):")
|
||||
for link in masters_links[:20]:
|
||||
print(f" - {link['text'][:50]}: {link['href']}")
|
||||
|
||||
print(f"\n课程候选详情:")
|
||||
for c in page_info['course_candidates'][:10]:
|
||||
print(f" - {c['text'][:50]}")
|
||||
print(f" URL: {c['href']}")
|
||||
print(f" Classes: {c['classes']}")
|
||||
|
||||
# 检查是否有搜索/筛选功能
|
||||
search_elements = await page.evaluate("""() => {
|
||||
const elements = [];
|
||||
document.querySelectorAll('input[type="search"], input[type="text"], select, .filter, .search').forEach(el => {
|
||||
elements.push({
|
||||
tag: el.tagName,
|
||||
type: el.type || '',
|
||||
id: el.id,
|
||||
name: el.name || '',
|
||||
classes: el.className
|
||||
});
|
||||
});
|
||||
return elements;
|
||||
}""")
|
||||
|
||||
print(f"\n搜索/筛选元素: {len(search_elements)}")
|
||||
for el in search_elements[:5]:
|
||||
print(f" - {el}")
|
||||
|
||||
# 尝试找到课程列表的实际结构
|
||||
print("\n\n正在分析页面中的课程列表结构...")
|
||||
|
||||
list_structures = await page.evaluate("""() => {
|
||||
const structures = [];
|
||||
|
||||
// 查找各种可能的列表结构
|
||||
const selectors = [
|
||||
'ul li a[href*="course"]',
|
||||
'div[class*="result"] a',
|
||||
'div[class*="course"] a',
|
||||
'article a[href]',
|
||||
'.search-results a',
|
||||
'[data-course] a',
|
||||
'table tr td a'
|
||||
];
|
||||
|
||||
for (const selector of selectors) {
|
||||
const elements = document.querySelectorAll(selector);
|
||||
if (elements.length > 0) {
|
||||
const samples = [];
|
||||
elements.forEach((el, i) => {
|
||||
if (i < 5) {
|
||||
samples.push({
|
||||
href: el.href,
|
||||
text: el.innerText.trim().substring(0, 80)
|
||||
});
|
||||
}
|
||||
});
|
||||
structures.push({
|
||||
selector: selector,
|
||||
count: elements.length,
|
||||
samples: samples
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return structures;
|
||||
}""")
|
||||
|
||||
print("\n找到的列表结构:")
|
||||
for s in list_structures:
|
||||
print(f"\n 选择器: {s['selector']} (共 {s['count']} 个)")
|
||||
for sample in s['samples']:
|
||||
print(f" - {sample['text']}: {sample['href']}")
|
||||
|
||||
# 保存完整分析结果
|
||||
with open("manchester_analysis.json", "w", encoding="utf-8") as f:
|
||||
json.dump(page_info, f, indent=2, ensure_ascii=False)
|
||||
|
||||
print("\n\n完整分析已保存到 manchester_analysis.json")
|
||||
|
||||
# 等待用户查看
|
||||
print("\n按 Ctrl+C 关闭浏览器...")
|
||||
try:
|
||||
await asyncio.sleep(30)
|
||||
except:
|
||||
pass
|
||||
|
||||
await browser.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(explore_manchester())
|
||||
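A minimal follow-up sketch (not included in this commit) of how the manchester_analysis.json file written above could be filtered for course URLs afterwards; the key names match what explore_manchester() saves:

    # Hypothetical usage example - reads the JSON dumped by explore_manchester().
    import json

    with open("manchester_analysis.json", encoding="utf-8") as f:
        analysis = json.load(f)

    course_urls = sorted({
        link["href"]
        for link in analysis["all_links"]
        if "masters/courses/" in link["href"].lower()
    })
    print(f"Found {len(course_urls)} unique master's course URLs")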