Files
University-Playwright-Codeg…/artifacts/explore_manchester.py
yangxiaoyu-crypto 426cf4d2cd Add university scraper system with backend, frontend, and configs
- Add src/university_scraper module with scraper, analyzer, and CLI
- Add backend FastAPI service with API endpoints and database models
- Add frontend React app with university management pages
- Add configs for Harvard, Manchester, and UCL universities
- Add artifacts with various scraper implementations
- Add Docker compose configuration for deployment
- Update .gitignore to exclude generated files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 15:25:08 +08:00

174 lines
6.1 KiB
Python

"""
探索曼彻斯特大学硕士课程页面结构
"""
import asyncio
import json
from playwright.async_api import async_playwright
# Landing page of the University of Manchester masters course A-Z list.
_MASTERS_LIST_URL = "https://www.manchester.ac.uk/study/masters/courses/list/"

# Desktop user agent sent with every request of the browser context.
_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

# In-page script: returns the document title, the current URL, every anchor
# that has both an href and visible text, anchors matching course-like
# selectors, and a summary of the main content containers.
_PAGE_INFO_JS = """() => {
    const info = {
        title: document.title,
        url: window.location.href,
        all_links: [],
        course_candidates: [],
        page_sections: []
    };
    // 获取所有链接
    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href;
        const text = a.innerText.trim().substring(0, 100);
        if (href && text) {
            info.all_links.push({href, text});
        }
    });
    // 查找可能的课程链接 - 包含 /course/ 或 list-item
    document.querySelectorAll('a[href*="/course/"], .course-link, [class*="course"] a, .search-result a, .list-item a').forEach(a => {
        info.course_candidates.push({
            href: a.href,
            text: a.innerText.trim().substring(0, 100),
            classes: a.className,
            parent_classes: a.parentElement?.className || ''
        });
    });
    // 获取页面主要区块
    document.querySelectorAll('main, [role="main"], .content, #content, .results, .course-list').forEach(el => {
        info.page_sections.push({
            tag: el.tagName,
            id: el.id,
            classes: el.className,
            children_count: el.children.length
        });
    });
    return info;
}"""

# In-page script: lists text/search inputs, selects and elements carrying a
# "filter" or "search" class, with their tag, type, id, name and classes.
_SEARCH_ELEMENTS_JS = """() => {
    const elements = [];
    document.querySelectorAll('input[type="search"], input[type="text"], select, .filter, .search').forEach(el => {
        elements.push({
            tag: el.tagName,
            type: el.type || '',
            id: el.id,
            name: el.name || '',
            classes: el.className
        });
    });
    return elements;
}"""

# In-page script: probes a fixed set of candidate selectors and, for each one
# that matches at least one element, reports the match count and up to five
# sample links.
_LIST_STRUCTURES_JS = """() => {
    const structures = [];
    // 查找各种可能的列表结构
    const selectors = [
        'ul li a[href*="course"]',
        'div[class*="result"] a',
        'div[class*="course"] a',
        'article a[href]',
        '.search-results a',
        '[data-course] a',
        'table tr td a'
    ];
    for (const selector of selectors) {
        const elements = document.querySelectorAll(selector);
        if (elements.length > 0) {
            const samples = [];
            elements.forEach((el, i) => {
                if (i < 5) {
                    samples.push({
                        href: el.href,
                        text: el.innerText.trim().substring(0, 80)
                    });
                }
            });
            structures.push({
                selector: selector,
                count: elements.length,
                samples: samples
            });
        }
    }
    return structures;
}"""


async def explore_manchester():
    """Explore the structure of the Manchester masters course list page.

    Launches a visible (non-headless) Chromium browser, opens the masters
    A-Z list page, waits five seconds, and saves a viewport screenshot to
    ``manchester_masters_page.png``. It then runs three in-page scripts to
    collect links, course-link candidates, main page sections, search/filter
    controls and candidate list structures, printing a summary of each.

    Only the first script's result (title, URL, links, course candidates and
    page sections) is written to ``manchester_analysis.json``. The browser
    is then kept open for up to 30 seconds so the page can be inspected.

    The browser is closed in a ``finally`` clause, so it is also closed when
    navigation, script evaluation or file writing raises.

    Returns:
        None. Results are reported via stdout and the two output files.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            context = await browser.new_context(user_agent=_USER_AGENT)
            page = await context.new_page()

            # Go straight to the masters course A-Z list page.
            print("访问硕士课程A-Z列表页面...")
            await page.goto(
                _MASTERS_LIST_URL,
                wait_until="domcontentloaded",
                timeout=60000,
            )
            # Fixed pause after DOMContentLoaded before inspecting the page.
            await page.wait_for_timeout(5000)

            # Screenshot of the visible viewport only.
            await page.screenshot(
                path="manchester_masters_page.png", full_page=False
            )
            print("截图已保存: manchester_masters_page.png")

            # Analyse the overall page structure.
            page_info = await page.evaluate(_PAGE_INFO_JS)
            print(f"\n页面标题: {page_info['title']}")
            print(f"当前URL: {page_info['url']}")
            print(f"\n总链接数: {len(page_info['all_links'])}")
            print(f"课程候选链接数: {len(page_info['course_candidates'])}")

            # Links under masters/courses/, excluding the page's own URL.
            masters_links = [
                link
                for link in page_info['all_links']
                if 'masters/courses/' in link['href'].lower()
                and link['href'] != page_info['url']
            ]
            print(f"\n硕士课程相关链接 ({len(masters_links)}):")
            for link in masters_links[:20]:
                print(f" - {link['text'][:50]}: {link['href']}")

            print("\n课程候选详情:")
            for c in page_info['course_candidates'][:10]:
                print(f" - {c['text'][:50]}")
                print(f" URL: {c['href']}")
                print(f" Classes: {c['classes']}")

            # Check whether the page offers search / filter controls.
            search_elements = await page.evaluate(_SEARCH_ELEMENTS_JS)
            print(f"\n搜索/筛选元素: {len(search_elements)}")
            for el in search_elements[:5]:
                print(f" - {el}")

            # Try to discover the actual markup used for the course list.
            print("\n\n正在分析页面中的课程列表结构...")
            list_structures = await page.evaluate(_LIST_STRUCTURES_JS)
            print("\n找到的列表结构:")
            for s in list_structures:
                print(f"\n 选择器: {s['selector']} (共 {s['count']} 个)")
                for sample in s['samples']:
                    print(f" - {sample['text']}: {sample['href']}")

            # Persist the page_info analysis.
            with open("manchester_analysis.json", "w", encoding="utf-8") as f:
                json.dump(page_info, f, indent=2, ensure_ascii=False)
            print("\n\n完整分析已保存到 manchester_analysis.json")

            # Keep the window open for a while so the user can look at it.
            print("\n按 Ctrl+C 关闭浏览器...")
            try:
                await asyncio.sleep(30)
            except (asyncio.CancelledError, KeyboardInterrupt):
                # Ctrl+C during the wait is the expected way to end early;
                # swallow it deliberately and fall through to the cleanup.
                pass
        finally:
            await browser.close()
if __name__ == "__main__":
    # Executed as a script: build the exploration coroutine, then let
    # asyncio.run() drive it to completion on a fresh event loop.
    exploration = explore_manchester()
    asyncio.run(exploration)