""" 探索曼彻斯特大学硕士课程页面结构 """ import asyncio import json from playwright.async_api import async_playwright async def explore_manchester(): """探索曼彻斯特大学网站结构""" async with async_playwright() as p: browser = await p.chromium.launch(headless=False) context = await browser.new_context( user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" ) page = await context.new_page() # 直接访问硕士课程A-Z列表页 print("访问硕士课程A-Z列表页面...") await page.goto("https://www.manchester.ac.uk/study/masters/courses/list/", wait_until="domcontentloaded", timeout=60000) await page.wait_for_timeout(5000) # 截图 await page.screenshot(path="manchester_masters_page.png", full_page=False) print("截图已保存: manchester_masters_page.png") # 分析页面结构 page_info = await page.evaluate("""() => { const info = { title: document.title, url: window.location.href, all_links: [], course_candidates: [], page_sections: [] }; // 获取所有链接 document.querySelectorAll('a[href]').forEach(a => { const href = a.href; const text = a.innerText.trim().substring(0, 100); if (href && text) { info.all_links.push({href, text}); } }); // 查找可能的课程链接 - 包含 /course/ 或 list-item document.querySelectorAll('a[href*="/course/"], .course-link, [class*="course"] a, .search-result a, .list-item a').forEach(a => { info.course_candidates.push({ href: a.href, text: a.innerText.trim().substring(0, 100), classes: a.className, parent_classes: a.parentElement?.className || '' }); }); // 获取页面主要区块 document.querySelectorAll('main, [role="main"], .content, #content, .results, .course-list').forEach(el => { info.page_sections.push({ tag: el.tagName, id: el.id, classes: el.className, children_count: el.children.length }); }); return info; }""") print(f"\n页面标题: {page_info['title']}") print(f"当前URL: {page_info['url']}") print(f"\n总链接数: {len(page_info['all_links'])}") print(f"课程候选链接数: {len(page_info['course_candidates'])}") # 查找包含 masters/courses/ 的链接 masters_links = [l for l in page_info['all_links'] if 'masters/courses/' in l['href'].lower() and l['href'] != page_info['url']] print(f"\n硕士课程相关链接 ({len(masters_links)}):") for link in masters_links[:20]: print(f" - {link['text'][:50]}: {link['href']}") print(f"\n课程候选详情:") for c in page_info['course_candidates'][:10]: print(f" - {c['text'][:50]}") print(f" URL: {c['href']}") print(f" Classes: {c['classes']}") # 检查是否有搜索/筛选功能 search_elements = await page.evaluate("""() => { const elements = []; document.querySelectorAll('input[type="search"], input[type="text"], select, .filter, .search').forEach(el => { elements.push({ tag: el.tagName, type: el.type || '', id: el.id, name: el.name || '', classes: el.className }); }); return elements; }""") print(f"\n搜索/筛选元素: {len(search_elements)}") for el in search_elements[:5]: print(f" - {el}") # 尝试找到课程列表的实际结构 print("\n\n正在分析页面中的课程列表结构...") list_structures = await page.evaluate("""() => { const structures = []; // 查找各种可能的列表结构 const selectors = [ 'ul li a[href*="course"]', 'div[class*="result"] a', 'div[class*="course"] a', 'article a[href]', '.search-results a', '[data-course] a', 'table tr td a' ]; for (const selector of selectors) { const elements = document.querySelectorAll(selector); if (elements.length > 0) { const samples = []; elements.forEach((el, i) => { if (i < 5) { samples.push({ href: el.href, text: el.innerText.trim().substring(0, 80) }); } }); structures.push({ selector: selector, count: elements.length, samples: samples }); } } return structures; }""") print("\n找到的列表结构:") for s in list_structures: print(f"\n 选择器: {s['selector']} (共 {s['count']} 个)") for sample in s['samples']: print(f" - {sample['text']}: {sample['href']}") # 保存完整分析结果 with open("manchester_analysis.json", "w", encoding="utf-8") as f: json.dump(page_info, f, indent=2, ensure_ascii=False) print("\n\n完整分析已保存到 manchester_analysis.json") # 等待用户查看 print("\n按 Ctrl+C 关闭浏览器...") try: await asyncio.sleep(30) except: pass await browser.close() if __name__ == "__main__": asyncio.run(explore_manchester())