University-Playwright-Codeg…/artifacts/explore_manchester.py

"""
探索曼彻斯特大学硕士课程页面结构
"""

import asyncio
import json
from playwright.async_api import async_playwright


async def explore_manchester():
    """Explore the structure of the Manchester master's course A-Z list page.

    Launches a visible (non-headless) Chromium browser, opens the course
    list page, saves a viewport screenshot to ``manchester_masters_page.png``
    and prints a summary of:

    * every link on the page and the subset that looks course-related,
    * search/filter form elements,
    * which of several candidate CSS selectors match list-like structures.

    The link/section analysis (``page_info``) is written to
    ``manchester_analysis.json`` in the current working directory. The
    function then waits up to 30 seconds so the page can be inspected by
    hand; Ctrl+C ends that wait early.

    The browser is closed on every exit path, including errors raised while
    navigating or evaluating scripts.

    Returns:
        None. All results are printed or written to disk.
    """
    # Local import keeps this function self-contained with respect to the
    # module's import block.
    from contextlib import AsyncExitStack

    async with async_playwright() as p, AsyncExitStack() as cleanup:
        browser = await p.chromium.launch(headless=False)
        # Register the close right away so a failure anywhere below (timeout,
        # script error, file write) still shuts the browser down.
        cleanup.push_async_callback(browser.close)

        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )
        page = await context.new_page()

        # Go straight to the master's courses A-Z list page.
        print("访问硕士课程A-Z列表页面...")
        await page.goto(
            "https://www.manchester.ac.uk/study/masters/courses/list/",
            wait_until="domcontentloaded",
            timeout=60000,
        )
        # Fixed pause after DOMContentLoaded before inspecting the DOM,
        # presumably to let client-side rendering finish.
        await page.wait_for_timeout(5000)

        # Screenshot of the current viewport only (not the full page).
        await page.screenshot(path="manchester_masters_page.png", full_page=False)
        print("截图已保存: manchester_masters_page.png")

        # Collect links, course-like anchors and main page sections in the
        # browser; the result comes back as a plain dict.
        page_info = await page.evaluate("""() => {
            const info = {
                title: document.title,
                url: window.location.href,
                all_links: [],
                course_candidates: [],
                page_sections: []
            };

            // 获取所有链接
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.href;
                const text = a.innerText.trim().substring(0, 100);
                if (href && text) {
                    info.all_links.push({href, text});
                }
            });

            // 查找可能的课程链接 - 包含 /course/ 或 list-item
            document.querySelectorAll('a[href*="/course/"], .course-link, [class*="course"] a, .search-result a, .list-item a').forEach(a => {
                info.course_candidates.push({
                    href: a.href,
                    text: a.innerText.trim().substring(0, 100),
                    classes: a.className,
                    parent_classes: a.parentElement?.className || ''
                });
            });

            // 获取页面主要区块
            document.querySelectorAll('main, [role="main"], .content, #content, .results, .course-list').forEach(el => {
                info.page_sections.push({
                    tag: el.tagName,
                    id: el.id,
                    classes: el.className,
                    children_count: el.children.length
                });
            });

            return info;
        }""")

        print(f"\n页面标题: {page_info['title']}")
        print(f"当前URL: {page_info['url']}")
        print(f"\n总链接数: {len(page_info['all_links'])}")
        print(f"课程候选链接数: {len(page_info['course_candidates'])}")

        # Links whose URL contains "masters/courses/", excluding the list
        # page itself.
        masters_links = [
            link
            for link in page_info['all_links']
            if 'masters/courses/' in link['href'].lower()
            and link['href'] != page_info['url']
        ]

        print(f"\n硕士课程相关链接 ({len(masters_links)}):")
        for link in masters_links[:20]:
            print(f"  - {link['text'][:50]}: {link['href']}")

        print("\n课程候选详情:")
        for c in page_info['course_candidates'][:10]:
            print(f"  - {c['text'][:50]}")
            print(f"    URL: {c['href']}")
            print(f"    Classes: {c['classes']}")

        # Look for search / filter controls on the page.
        search_elements = await page.evaluate("""() => {
            const elements = [];
            document.querySelectorAll('input[type="search"], input[type="text"], select, .filter, .search').forEach(el => {
                elements.push({
                    tag: el.tagName,
                    type: el.type || '',
                    id: el.id,
                    name: el.name || '',
                    classes: el.className
                });
            });
            return elements;
        }""")

        print(f"\n搜索/筛选元素: {len(search_elements)}")
        for el in search_elements[:5]:
            print(f"  - {el}")

        # Probe several candidate selectors to find the markup that actually
        # holds the course list; up to five sample anchors are kept for each
        # selector that matches.
        print("\n\n正在分析页面中的课程列表结构...")

        list_structures = await page.evaluate("""() => {
            const structures = [];

            // 查找各种可能的列表结构
            const selectors = [
                'ul li a[href*="course"]',
                'div[class*="result"] a',
                'div[class*="course"] a',
                'article a[href]',
                '.search-results a',
                '[data-course] a',
                'table tr td a'
            ];

            for (const selector of selectors) {
                const elements = document.querySelectorAll(selector);
                if (elements.length > 0) {
                    const samples = [];
                    elements.forEach((el, i) => {
                        if (i < 5) {
                            samples.push({
                                href: el.href,
                                text: el.innerText.trim().substring(0, 80)
                            });
                        }
                    });
                    structures.push({
                        selector: selector,
                        count: elements.length,
                        samples: samples
                    });
                }
            }

            return structures;
        }""")

        print("\n找到的列表结构:")
        for s in list_structures:
            print(f"\n  选择器: {s['selector']} (共 {s['count']} 个)")
            for sample in s['samples']:
                print(f"    - {sample['text']}: {sample['href']}")

        # Persist the link/section analysis. Only page_info is written;
        # search_elements and list_structures are printed above but not saved.
        with open("manchester_analysis.json", "w", encoding="utf-8") as f:
            json.dump(page_info, f, indent=2, ensure_ascii=False)

        print("\n\n完整分析已保存到 manchester_analysis.json")

        # Keep the browser open for a short manual inspection.
        print("\n按 Ctrl+C 关闭浏览器...")
        try:
            await asyncio.sleep(30)
        except asyncio.CancelledError:
            # Ctrl+C reaches this coroutine as a task cancellation. Absorbing
            # it here lets the script finish quietly while the exit stack
            # still closes the browser.
            pass


if __name__ == "__main__":
    # Script entry point: run the exploration coroutine to completion on a
    # fresh event loop when this file is executed directly.
    asyncio.run(explore_manchester())