get_sitemap_pages
Extract all pages from a website's sitemap, with cursor-based pagination, optional route filtering, optional per-page metadata, and result limits for efficient retrieval.
Instructions
Get all pages from a website's sitemap with optional limits and filtering options. Supports cursor-based pagination.
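For example, a client can invoke this tool over MCP with the official Python SDK. This is a minimal sketch, assuming the server is launched over stdio; the `uvx sitemap-mcp-server` command and the `/blog` route are illustrative assumptions, not confirmed by this page:

```python
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

# Assumed launch command; adjust to however you run the server locally.
server_params = StdioServerParameters(command="uvx", args=["sitemap-mcp-server"])


async def main() -> None:
    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # Fetch up to 50 pages under /blog, with per-page metadata.
            result = await session.call_tool(
                "get_sitemap_pages",
                {
                    "url": "https://example.com",
                    "route": "/blog",
                    "limit": 50,
                    "include_metadata": True,
                },
            )
            print(result.content[0].text)  # JSON string returned by the tool


asyncio.run(main())
```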
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| cursor | No | Pagination cursor for fetching the next page of results | |
| include_metadata | No | Whether to include additional page metadata (priority, lastmod, etc.) | |
| limit | No | Maximum number of pages to return per page (0 for default of 100) | |
| route | No | Optional route path to filter pages by (e.g., '/blog') | |
| sitemap_url | No | Optional URL of a specific sitemap to get pages from | |
| url | Yes | The URL of the website homepage (e.g., https://example.com) | |
Input Schema (JSON Schema)
```json
{
"properties": {
"cursor": {
"default": "",
"description": "Pagination cursor for fetching the next page of results",
"title": "Cursor",
"type": "string"
},
"include_metadata": {
"default": false,
"description": "Whether to include additional page metadata (priority, lastmod, etc.)",
"title": "Include Metadata",
"type": "boolean"
},
"limit": {
"default": 0,
"description": "Maximum number of pages to return per page (0 for default of 100)",
"title": "Limit",
"type": "integer"
},
"route": {
"default": "",
"description": "Optional route path to filter pages by (e.g., '/blog')",
"title": "Route",
"type": "string"
},
"sitemap_url": {
"default": "",
"description": "Optional URL of a specific sitemap to get pages from",
"title": "Sitemap Url",
"type": "string"
},
"url": {
"description": "The URL of the website homepage (e.g., https://example.com)",
"title": "Url",
"type": "string"
}
},
"required": [
"url"
],
"title": "get_sitemap_pagesArguments",
"type": "object"
}
```
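As a quick sanity check, arguments can be validated against this schema client-side before calling the tool. A sketch assuming the third-party `jsonschema` package, with the schema abbreviated to the fields exercised here:

```python
from jsonschema import ValidationError, validate  # pip install jsonschema

# Abbreviated copy of the schema above; only `url` is required.
schema = {
    "type": "object",
    "required": ["url"],
    "properties": {
        "url": {"type": "string"},
        "limit": {"type": "integer", "default": 0},
        "route": {"type": "string", "default": ""},
    },
}

validate(instance={"url": "https://example.com", "limit": 25}, schema=schema)  # passes

try:
    validate(instance={"route": "/blog"}, schema=schema)  # missing required field
except ValidationError as err:
    print(err.message)  # "'url' is a required property"
```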
Implementation Reference
- src/sitemap_mcp_server/server.py:215-343 (handler): implementation of the get_sitemap_pages tool handler, including the inline Pydantic schema for parameters and the full async function logic for fetching, filtering, paginating, and returning sitemap pages.

```python
@mcp.tool(
    description="Get all pages from a website's sitemap with optional limits and filtering options. Supports cursor-based pagination.",
)
async def get_sitemap_pages(
    ctx: Context,
    url: str = Field(
        ..., description="The URL of the website homepage (e.g., https://example.com)"
    ),
    limit: int = Field(
        0,
        description="Maximum number of pages to return per page (0 for default of 100)",
    ),
    include_metadata: bool = Field(
        False,
        description="Whether to include additional page metadata (priority, lastmod, etc.)",
    ),
    route: str = Field(
        "", description="Optional route path to filter pages by (e.g., '/blog')"
    ),
    sitemap_url: str = Field(
        "", description="Optional URL of a specific sitemap to get pages from"
    ),
    cursor: str = Field(
        "", description="Pagination cursor for fetching the next page of results"
    ),
) -> str:
    try:
        normalized_url = normalize_and_validate_url(url)
        if not normalized_url:
            return safe_json_dumps(
                {
                    "error": "Invalid URL provided. Please provide a valid HTTP or HTTPS URL.",
                    "type": "ValidationError",
                }
            )
        url = normalized_url

        main_tree = ctx.request_context.lifespan_context.get_sitemap(url)
        target_sitemap = main_tree

        # If filtering by sitemap_url, find the specific sitemap
        if sitemap_url and sitemap_url.strip():
            found = False
            for sitemap in main_tree.all_sitemaps():
                if hasattr(sitemap, "url") and sitemap.url == sitemap_url:
                    target_sitemap = sitemap
                    found = True
                    break
            if not found:
                return safe_json_dumps(
                    {
                        "base_url": url,
                        "sitemap_url": sitemap_url,
                        "pages": [],
                        "warning": f"Sitemap URL {sitemap_url} not found",
                    },
                    cls=CustomJSONEncoder,
                )

        # Collect matching pages
        matching_pages = []

        # Normalize and validate route
        filter_by_route = bool(route and route.strip())
        if filter_by_route:
            # Ensure route starts with / and does not end with / unless it's just "/"
            if not route.startswith("/"):
                route = "/" + route
            if route.endswith("/") and len(route) > 1:
                route = route[:-1]

        parsed_url = urlparse(url)
        base_domain = f"{parsed_url.scheme}://{parsed_url.netloc}"

        for page in target_sitemap.all_pages():
            if filter_by_route:
                page_url = page.url
                # Allow all if route == "/"
                if route == "/":
                    pass
                else:
                    if not (
                        page_url == base_domain + route
                        or page_url == base_domain + route + "/"
                        or page_url.startswith(base_domain + route + "/")
                    ):
                        continue
            if include_metadata:
                matching_pages.append(page.to_dict())
            else:
                matching_pages.append({"url": page.url})

        # Pagination logic
        total_pages = len(matching_pages)
        page_size = 100 if limit == 0 else min(limit, 100)
        page_number = 0
        if cursor and cursor.strip():
            try:
                cursor_data = json.loads(base64.b64decode(cursor).decode("utf-8"))
                if "page" in cursor_data:
                    page_number = int(cursor_data["page"])
            except Exception:
                pass

        start_idx = page_number * page_size
        end_idx = min(start_idx + page_size, total_pages)
        current_page = matching_pages[start_idx:end_idx]

        # Generate next cursor if there are more pages
        next_page = page_number + 1
        next_start_idx = next_page * page_size
        next_cursor = None
        if next_start_idx < total_pages:
            cursor_data = {"page": next_page}
            next_cursor = base64.b64encode(
                json.dumps(cursor_data).encode("utf-8")
            ).decode("utf-8")

        # Build response
        response = {
            "url": url,
            "page_count": total_pages,
            "pages": current_page,
            "limit": page_size,
        }
        if next_cursor:
            response["nextCursor"] = next_cursor

        return safe_json_dumps(response)
    except Exception as e:
        return safe_json_dumps(
            {
                "error": f"Error fetching sitemap pages for {url}: {str(e)}",
                "type": type(e).__name__,
            }
        )
```
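As the handler shows, the cursor is an opaque base64-encoded JSON object of the form `{"page": n}`, so a caller only needs to echo `nextCursor` back until it disappears from the response. A minimal paging loop, assuming an initialized `ClientSession` as in the client sketch earlier; the `result.content[0].text` access assumes the tool returns a single text content block:

```python
import json

from mcp import ClientSession


async def fetch_all_sitemap_pages(session: ClientSession, url: str) -> list[dict]:
    """Follow nextCursor until exhausted, collecting every page entry.

    Treats the cursor as opaque, which is all a well-behaved client
    should assume about its format.
    """
    pages: list[dict] = []
    cursor = ""
    while True:
        result = await session.call_tool(
            "get_sitemap_pages",
            {"url": url, "cursor": cursor},
        )
        payload = json.loads(result.content[0].text)
        pages.extend(payload.get("pages", []))
        cursor = payload.get("nextCursor", "")
        if not cursor:
            return pages
```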