- Add src/university_scraper module with scraper, analyzer, and CLI - Add backend FastAPI service with API endpoints and database models - Add frontend React app with university management pages - Add configs for Harvard, Manchester, and UCL universities - Add artifacts with various scraper implementations - Add Docker compose configuration for deployment - Update .gitignore to exclude generated files 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
174 lines
6.1 KiB
Python
174 lines
6.1 KiB
Python
"""
Explore the page structure of the University of Manchester master's course listings.
"""
|
|
|
|
import asyncio
|
|
import json
|
|
from playwright.async_api import async_playwright
|
|
|
|
|
|
async def explore_manchester():
    """Explore the structure of the University of Manchester masters course list page.

    Launches a non-headless Chromium window, opens the masters A-Z course
    list, and then:

    * saves a screenshot to ``manchester_masters_page.png``;
    * prints a summary of all links, candidate course links, search/filter
      controls and list-like structures found on the page;
    * writes the collected page information (title, URL, links, course
      candidates, main sections) to ``manchester_analysis.json``;
    * keeps the browser open for up to 30 seconds so the page can be
      inspected by hand (Ctrl+C ends the wait early).

    The browser is closed in a ``finally`` block, so it is released even if
    navigation, page evaluation or writing the JSON file fails.

    Returns:
        None. All results are printed or written to the two files above.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            )
            page = await context.new_page()

            # Go straight to the masters A-Z course list page.
            print("访问硕士课程A-Z列表页面...")
            await page.goto(
                "https://www.manchester.ac.uk/study/masters/courses/list/",
                wait_until="domcontentloaded",
                timeout=60000,
            )
            # Wait a further 5 s after DOMContentLoaded before inspecting the
            # page (presumably so late-loading content can appear).
            await page.wait_for_timeout(5000)

            # Screenshot (full_page=False).
            await page.screenshot(path="manchester_masters_page.png", full_page=False)
            print("截图已保存: manchester_masters_page.png")

            # Analyse the page structure: title, URL, every link with text,
            # links matching course-like selectors, and main content sections.
            page_info = await page.evaluate("""() => {
                const info = {
                    title: document.title,
                    url: window.location.href,
                    all_links: [],
                    course_candidates: [],
                    page_sections: []
                };

                // 获取所有链接
                document.querySelectorAll('a[href]').forEach(a => {
                    const href = a.href;
                    const text = a.innerText.trim().substring(0, 100);
                    if (href && text) {
                        info.all_links.push({href, text});
                    }
                });

                // 查找可能的课程链接 - 包含 /course/ 或 list-item
                document.querySelectorAll('a[href*="/course/"], .course-link, [class*="course"] a, .search-result a, .list-item a').forEach(a => {
                    info.course_candidates.push({
                        href: a.href,
                        text: a.innerText.trim().substring(0, 100),
                        classes: a.className,
                        parent_classes: a.parentElement?.className || ''
                    });
                });

                // 获取页面主要区块
                document.querySelectorAll('main, [role="main"], .content, #content, .results, .course-list').forEach(el => {
                    info.page_sections.push({
                        tag: el.tagName,
                        id: el.id,
                        classes: el.className,
                        children_count: el.children.length
                    });
                });

                return info;
            }""")

            print(f"\n页面标题: {page_info['title']}")
            print(f"当前URL: {page_info['url']}")
            print(f"\n总链接数: {len(page_info['all_links'])}")
            print(f"课程候选链接数: {len(page_info['course_candidates'])}")

            # Links whose URL contains masters/courses/, excluding the page itself.
            masters_links = [
                link
                for link in page_info['all_links']
                if 'masters/courses/' in link['href'].lower()
                and link['href'] != page_info['url']
            ]

            print(f"\n硕士课程相关链接 ({len(masters_links)}):")
            for link in masters_links[:20]:
                print(f" - {link['text'][:50]}: {link['href']}")

            print("\n课程候选详情:")
            for c in page_info['course_candidates'][:10]:
                print(f" - {c['text'][:50]}")
                print(f" URL: {c['href']}")
                print(f" Classes: {c['classes']}")

            # Check whether the page offers search / filter controls.
            search_elements = await page.evaluate("""() => {
                const elements = [];
                document.querySelectorAll('input[type="search"], input[type="text"], select, .filter, .search').forEach(el => {
                    elements.push({
                        tag: el.tagName,
                        type: el.type || '',
                        id: el.id,
                        name: el.name || '',
                        classes: el.className
                    });
                });
                return elements;
            }""")

            print(f"\n搜索/筛选元素: {len(search_elements)}")
            for el in search_elements[:5]:
                print(f" - {el}")

            # Probe a set of candidate selectors to find how the course list
            # is actually marked up; up to five samples are kept per selector.
            print("\n\n正在分析页面中的课程列表结构...")

            list_structures = await page.evaluate("""() => {
                const structures = [];

                // 查找各种可能的列表结构
                const selectors = [
                    'ul li a[href*="course"]',
                    'div[class*="result"] a',
                    'div[class*="course"] a',
                    'article a[href]',
                    '.search-results a',
                    '[data-course] a',
                    'table tr td a'
                ];

                for (const selector of selectors) {
                    const elements = document.querySelectorAll(selector);
                    if (elements.length > 0) {
                        const samples = [];
                        elements.forEach((el, i) => {
                            if (i < 5) {
                                samples.push({
                                    href: el.href,
                                    text: el.innerText.trim().substring(0, 80)
                                });
                            }
                        });
                        structures.push({
                            selector: selector,
                            count: elements.length,
                            samples: samples
                        });
                    }
                }

                return structures;
            }""")

            print("\n找到的列表结构:")
            for s in list_structures:
                print(f"\n 选择器: {s['selector']} (共 {s['count']} 个)")
                for sample in s['samples']:
                    print(f" - {sample['text']}: {sample['href']}")

            # Save the page_info analysis (search_elements and
            # list_structures are only printed, not written to the file).
            with open("manchester_analysis.json", "w", encoding="utf-8") as f:
                json.dump(page_info, f, indent=2, ensure_ascii=False)

            print("\n\n完整分析已保存到 manchester_analysis.json")

            # Keep the browser open so the page can be inspected manually.
            print("\n按 Ctrl+C 关闭浏览器...")
            try:
                await asyncio.sleep(30)
            except (asyncio.CancelledError, KeyboardInterrupt):
                # Ctrl+C is the advertised way to end the wait early. Under
                # asyncio.run() it typically reaches this coroutine as a
                # CancelledError; KeyboardInterrupt is handled as well in
                # case it is raised directly. Only these are swallowed so
                # that unrelated errors are no longer hidden.
                pass
        finally:
            # Always release the browser, including on errors above.
            await browser.close()
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the exploration coroutine and run it to
    # completion on a fresh event loop.
    exploration = explore_manchester()
    asyncio.run(exploration)
|