Skip to main content
Glama
mugoosse

Sitemap MCP Server

get_sitemap_pages

Extract all pages from a website's sitemap, with options for pagination, filtering by route, including metadata, and setting limits for efficient data retrieval.

Instructions

Get all pages from a website's sitemap with optional limits and filtering options. Supports cursor-based pagination.

Input Schema

TableJSON Schema
NameRequiredDescriptionDefault
cursorNoPagination cursor for fetching the next page of results
include_metadataNoWhether to include additional page metadata (priority, lastmod, etc.)
limitNoMaximum number of pages to return per page (0 for default of 100)
routeNoOptional route path to filter pages by (e.g., '/blog')
sitemap_urlNoOptional URL of a specific sitemap to get pages from
urlYesThe URL of the website homepage (e.g., https://example.com)

Implementation Reference

  • Implementation of the get_sitemap_pages tool handler, including inline Pydantic schema for parameters and the full async function logic for fetching, filtering, paginating, and returning sitemap pages.
    @mcp.tool(
        description="Get all pages from a website's sitemap with optional limits and filtering options. Supports cursor-based pagination.",
    )
    async def get_sitemap_pages(
        ctx: Context,
        url: str = Field(
            ..., description="The URL of the website homepage (e.g., https://example.com)"
        ),
        limit: int = Field(
            0,
            description="Maximum number of pages to return per page (0 for default of 100)",
        ),
        include_metadata: bool = Field(
            False,
            description="Whether to include additional page metadata (priority, lastmod, etc.)",
        ),
        route: str = Field(
            "", description="Optional route path to filter pages by (e.g., '/blog')"
        ),
        sitemap_url: str = Field(
            "", description="Optional URL of a specific sitemap to get pages from"
        ),
        cursor: str = Field(
            "", description="Pagination cursor for fetching the next page of results"
        ),
    ) -> str:
        """Return a JSON string of pages discovered in a website's sitemap.

        Fetches (or reuses the cached) sitemap tree for ``url`` via the
        lifespan context, optionally narrows the result to a single child
        sitemap (``sitemap_url``) and/or a URL-path prefix (``route``), then
        returns one page of results using an opaque base64-encoded JSON
        cursor of the form ``{"page": N}``.

        Returns:
            A JSON string with keys ``url``, ``page_count`` (total matches
            before pagination), ``pages``, ``limit`` (effective page size),
            and ``nextCursor`` when more results remain.  On failure an
            ``{"error": ..., "type": ...}`` JSON object is returned instead
            of raising.
        """
        try:
            normalized_url = normalize_and_validate_url(url)
            if not normalized_url:
                return safe_json_dumps(
                    {
                        "error": "Invalid URL provided. Please provide a valid HTTP or HTTPS URL.",
                        "type": "ValidationError",
                    }
                )
            url = normalized_url
            main_tree = ctx.request_context.lifespan_context.get_sitemap(url)
            target_sitemap = main_tree
            # If filtering by sitemap_url, find the specific sitemap
            if sitemap_url and sitemap_url.strip():
                found = False
                for sitemap in main_tree.all_sitemaps():
                    if hasattr(sitemap, "url") and sitemap.url == sitemap_url:
                        target_sitemap = sitemap
                        found = True
                        break
                if not found:
                    # Soft failure: report the miss instead of erroring so the
                    # caller can retry with a sitemap URL from the index.
                    return safe_json_dumps(
                        {
                            "base_url": url,
                            "sitemap_url": sitemap_url,
                            "pages": [],
                            "warning": f"Sitemap URL {sitemap_url} not found",
                        },
                        cls=CustomJSONEncoder,
                    )

            # Collect matching pages
            matching_pages = []

            # Normalize and validate route
            filter_by_route = bool(route and route.strip())
            if filter_by_route:
                # Ensure route starts with / and does not end with / unless it's just "/"
                if not route.startswith("/"):
                    route = "/" + route
                if route.endswith("/") and len(route) > 1:
                    route = route[:-1]
                parsed_url = urlparse(url)
                base_domain = f"{parsed_url.scheme}://{parsed_url.netloc}"

            for page in target_sitemap.all_pages():
                if filter_by_route:
                    page_url = page.url
                    # route == "/" matches every page of the site
                    if route == "/":
                        pass
                    else:
                        # Match the route itself (with or without trailing
                        # slash) or anything nested beneath it; plain
                        # startswith would wrongly match "/blog-archive"
                        # for route "/blog".
                        if not (
                            page_url == base_domain + route
                            or page_url == base_domain + route + "/"
                            or page_url.startswith(base_domain + route + "/")
                        ):
                            continue
                if include_metadata:
                    matching_pages.append(page.to_dict())
                else:
                    matching_pages.append({"url": page.url})
            # Pagination logic.
            # FIX: treat any non-positive limit as "use the default of 100";
            # previously a negative limit produced a negative page size and
            # garbled slice bounds.  Values above 100 are still capped at 100.
            total_pages = len(matching_pages)
            page_size = 100 if limit <= 0 else min(limit, 100)
            page_number = 0
            if cursor and cursor.strip():
                try:
                    cursor_data = json.loads(base64.b64decode(cursor).decode("utf-8"))
                    if "page" in cursor_data:
                        # FIX: clamp to >= 0 so a tampered/negative cursor
                        # cannot produce negative slice indices (which would
                        # silently return pages from the end of the list).
                        page_number = max(0, int(cursor_data["page"]))
                except Exception:
                    # Malformed cursors are ignored on purpose: fall back to
                    # the first page rather than failing the whole request.
                    pass
            start_idx = page_number * page_size
            end_idx = min(start_idx + page_size, total_pages)
            current_page = matching_pages[start_idx:end_idx]
            # Generate next cursor if there are more pages
            next_page = page_number + 1
            next_start_idx = next_page * page_size
            next_cursor = None
            if next_start_idx < total_pages:
                cursor_data = {"page": next_page}
                next_cursor = base64.b64encode(
                    json.dumps(cursor_data).encode("utf-8")
                ).decode("utf-8")
            # Build response
            response = {
                "url": url,
                "page_count": total_pages,
                "pages": current_page,
                "limit": page_size,
            }
            if next_cursor:
                response["nextCursor"] = next_cursor
            return safe_json_dumps(response)

        except Exception as e:
            # Top-level boundary: errors are serialized into the tool's JSON
            # reply rather than propagated to the MCP framework.
            return safe_json_dumps(
                {
                    "error": f"Error fetching sitemap pages for {url}: {str(e)}",
                    "type": type(e).__name__,
                }
            )
Install Server

Other Tools

Related Tools

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/mugoosse/sitemap-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server