Skip to main content
Glama

get_sitemap_pages

Extract all pages from a website's sitemap, with options for pagination, filtering by route, including metadata, and setting limits for efficient data retrieval.

Instructions

Get all pages from a website's sitemap with optional limits and filtering options. Supports cursor-based pagination.

Input Schema

| Name | Required | Description | Default |
| --- | --- | --- | --- |
| cursor | No | Pagination cursor for fetching the next page of results | "" |
| include_metadata | No | Whether to include additional page metadata (priority, lastmod, etc.) | false |
| limit | No | Maximum number of pages to return per page (0 for default of 100) | 0 |
| route | No | Optional route path to filter pages by (e.g., '/blog') | "" |
| sitemap_url | No | Optional URL of a specific sitemap to get pages from | "" |
| url | Yes | The URL of the website homepage (e.g., https://example.com) | — |

Input Schema (JSON Schema)

{ "properties": { "cursor": { "default": "", "description": "Pagination cursor for fetching the next page of results", "title": "Cursor", "type": "string" }, "include_metadata": { "default": false, "description": "Whether to include additional page metadata (priority, lastmod, etc.)", "title": "Include Metadata", "type": "boolean" }, "limit": { "default": 0, "description": "Maximum number of pages to return per page (0 for default of 100)", "title": "Limit", "type": "integer" }, "route": { "default": "", "description": "Optional route path to filter pages by (e.g., '/blog')", "title": "Route", "type": "string" }, "sitemap_url": { "default": "", "description": "Optional URL of a specific sitemap to get pages from", "title": "Sitemap Url", "type": "string" }, "url": { "description": "The URL of the website homepage (e.g., https://example.com)", "title": "Url", "type": "string" } }, "required": [ "url" ], "title": "get_sitemap_pagesArguments", "type": "object" }

Implementation Reference

  • Implementation of the get_sitemap_pages tool handler, including inline Pydantic schema for parameters and the full async function logic for fetching, filtering, paginating, and returning sitemap pages.
    @mcp.tool( description="Get all pages from a website's sitemap with optional limits and filtering options. Supports cursor-based pagination.", ) async def get_sitemap_pages( ctx: Context, url: str = Field( ..., description="The URL of the website homepage (e.g., https://example.com)" ), limit: int = Field( 0, description="Maximum number of pages to return per page (0 for default of 100)", ), include_metadata: bool = Field( False, description="Whether to include additional page metadata (priority, lastmod, etc.)", ), route: str = Field( "", description="Optional route path to filter pages by (e.g., '/blog')" ), sitemap_url: str = Field( "", description="Optional URL of a specific sitemap to get pages from" ), cursor: str = Field( "", description="Pagination cursor for fetching the next page of results" ), ) -> str: try: normalized_url = normalize_and_validate_url(url) if not normalized_url: return safe_json_dumps( { "error": "Invalid URL provided. Please provide a valid HTTP or HTTPS URL.", "type": "ValidationError", } ) url = normalized_url main_tree = ctx.request_context.lifespan_context.get_sitemap(url) target_sitemap = main_tree # If filtering by sitemap_url, find the specific sitemap if sitemap_url and sitemap_url.strip(): found = False for sitemap in main_tree.all_sitemaps(): if hasattr(sitemap, "url") and sitemap.url == sitemap_url: target_sitemap = sitemap found = True break if not found: return safe_json_dumps( { "base_url": url, "sitemap_url": sitemap_url, "pages": [], "warning": f"Sitemap URL {sitemap_url} not found", }, cls=CustomJSONEncoder, ) # Collect matching pages matching_pages = [] # Normalize and validate route filter_by_route = bool(route and route.strip()) if filter_by_route: # Ensure route starts with / and does not end with / unless it's just "/" if not route.startswith("/"): route = "/" + route if route.endswith("/") and len(route) > 1: route = route[:-1] parsed_url = urlparse(url) base_domain = 
f"{parsed_url.scheme}://{parsed_url.netloc}" for page in target_sitemap.all_pages(): if filter_by_route: page_url = page.url # Allow all if route == "/" if route == "/": pass else: if not ( page_url == base_domain + route or page_url == base_domain + route + "/" or page_url.startswith(base_domain + route + "/") ): continue if include_metadata: matching_pages.append(page.to_dict()) else: matching_pages.append({"url": page.url}) # Pagination logic total_pages = len(matching_pages) page_size = 100 if limit == 0 else min(limit, 100) page_number = 0 if cursor and cursor.strip(): try: cursor_data = json.loads(base64.b64decode(cursor).decode("utf-8")) if "page" in cursor_data: page_number = int(cursor_data["page"]) except Exception: pass start_idx = page_number * page_size end_idx = min(start_idx + page_size, total_pages) current_page = matching_pages[start_idx:end_idx] # Generate next cursor if there are more pages next_page = page_number + 1 next_start_idx = next_page * page_size next_cursor = None if next_start_idx < total_pages: cursor_data = {"page": next_page} next_cursor = base64.b64encode( json.dumps(cursor_data).encode("utf-8") ).decode("utf-8") # Build response response = { "url": url, "page_count": total_pages, "pages": current_page, "limit": page_size, } if next_cursor: response["nextCursor"] = next_cursor return safe_json_dumps(response) except Exception as e: return safe_json_dumps( { "error": f"Error fetching sitemap pages for {url}: {str(e)}", "type": type(e).__name__, } )

Other Tools

Related Tools

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/mugoosse/sitemap-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server