analytics_data_quality_report
Analyze data quality by comparing Google Search Console and Google Analytics 4 metrics to identify coverage gaps and top URL mismatches for SEO optimization.
Instructions
Show merge coverage and top URL mismatches between GSC and GA4.
Input Schema
TableJSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| site_url | No | ||
| property_id | No | ||
| start_date | No | ||
| end_date | No | ||
| max_rows | No | | 50000 |
| top_n_unmatched | No | | 20 |
Implementation Reference
# src/seo_analytics_mcp/server.py:755-780 (handler)
# The analytics_data_quality_report tool handler - an MCP tool decorated function that
# fetches page data from GSC and GA4, then calls build_data_quality_report() to analyze
# merge coverage and URL mismatches between the two data sources.
def analytics_data_quality_report(
    site_url: str | None = None,
    property_id: str | None = None,
    start_date: str | None = None,
    end_date: str | None = None,
    max_rows: int = 50000,
    top_n_unmatched: int = 20,
) -> dict[str, Any]:
    """Show merge coverage and top URL mismatches between GSC and GA4."""
    # NOTE(review): the docstring is kept verbatim because it matches the tool's
    # published instructions text; presumably it is surfaced to clients as the
    # tool description - confirm before rewording it.
    #
    # Parameters are forwarded unchanged to _fetch_page_data(); only the current
    # period is requested (include_previous_period=False).
    page_data = _fetch_page_data(
        site_url=site_url,
        property_id=property_id,
        start_date=start_date,
        end_date=end_date,
        include_previous_period=False,
        max_rows=max_rows,
    )

    # Echo back the resolved request context first, in the published key order.
    report: dict[str, Any] = {
        field: page_data[field] for field in ("ranges", "site_url", "property_id")
    }

    # Always ask for at least one unmatched URL per side, even if the caller
    # passed zero or a negative number.
    unmatched_limit = max(1, top_n_unmatched)
    report["quality"] = build_data_quality_report(
        page_data["merged_pages"],
        top_n_unmatched=unmatched_limit,
    )
    return report
- src/seo_analytics_mcp/server.py:260-277 (registration)Tool registration in the capabilities() function - lists 'analytics_data_quality_report' among the available tools in the tool registry."tools": [ "gsc_list_sites", "gsc_search_analytics_raw", "gsc_top_pages", "gsc_top_queries", "gsc_query_page_pairs", "ga4_run_report_raw", "ga4_landing_pages", "ga4_channel_report", "analytics_merge_page_metrics", "analytics_generate_action_items", "analytics_popularity_snapshot", "analytics_trend_report", "analytics_data_quality_report", "analytics_query_page_opportunities", "analytics_topic_clusters", ], }
# The build_data_quality_report() helper function - analyzes merged pages to categorize
# data coverage, identifying pages with GSC only, GA4 only, or both sources, and returns
# counts and top unmatched pages.
def build_data_quality_report(
    merged_pages: list[dict[str, Any]],
    *,
    top_n_unmatched: int = 20,
) -> dict[str, Any]:
    """Summarize how well GSC and GA4 page rows line up after merging.

    A page "has GSC data" when its ``gsc_impressions`` is greater than zero and
    "has GA4 data" when its ``ga4_sessions`` is greater than zero. Missing or
    ``None`` metrics count as zero.

    Args:
        merged_pages: One dict per page. Each dict that ends up in a "top"
            list must carry a ``"url"`` key; metric keys are optional.
        top_n_unmatched: Maximum number of rows in each "top ... only" list.
            Zero or negative values yield empty lists.

    Returns:
        A dict with three keys:

        * ``"counts"`` - totals for merged pages, pages with GSC data, pages
          with GA4 data, pages with both, GSC-only pages and GA4-only pages.
        * ``"top_gsc_only_pages"`` - GSC-only pages ordered by impressions
          (descending), each with ``url``, ``gsc_impressions``, ``gsc_clicks``.
        * ``"top_ga4_only_pages"`` - GA4-only pages ordered by sessions
          (descending), each with ``url``, ``ga4_sessions``, ``ga4_conversions``.

    Raises:
        KeyError: If a page selected for a "top" list has no ``"url"`` key.
    """

    def metric(page: dict[str, Any], key: str) -> float:
        # ``or 0.0`` maps None (key present but null) to zero, the same value
        # a missing key gets, instead of letting float(None) raise TypeError.
        return float(page.get(key) or 0.0)

    # A negative slice bound would mean "everything except the last N", which
    # is never what a "top N" caller wants; clamp it to an empty selection.
    limit = max(0, top_n_unmatched)

    with_gsc = with_ga4 = with_both = 0
    gsc_only: list[dict[str, Any]] = []
    ga4_only: list[dict[str, Any]] = []

    # Single pass: read each metric once and classify the page.
    for page in merged_pages:
        impressions = metric(page, "gsc_impressions")
        sessions = metric(page, "ga4_sessions")
        has_gsc = impressions > 0
        has_ga4 = sessions > 0

        if has_gsc:
            with_gsc += 1
        if has_ga4:
            with_ga4 += 1
        if has_gsc and has_ga4:
            with_both += 1
        # Explicit ``<= 0`` (rather than ``not has_*``) keeps NaN metrics out
        # of both "only" buckets.
        if has_gsc and sessions <= 0:
            gsc_only.append(page)
        if has_ga4 and impressions <= 0:
            ga4_only.append(page)

    # sorted() is stable, so ties keep their input order.
    top_gsc_only = sorted(
        gsc_only, key=lambda p: metric(p, "gsc_impressions"), reverse=True
    )[:limit]
    top_ga4_only = sorted(
        ga4_only, key=lambda p: metric(p, "ga4_sessions"), reverse=True
    )[:limit]

    return {
        "counts": {
            "total_merged_pages": len(merged_pages),
            "pages_with_gsc": with_gsc,
            "pages_with_ga4": with_ga4,
            "pages_with_both": with_both,
            "gsc_only_pages": len(gsc_only),
            "ga4_only_pages": len(ga4_only),
        },
        "top_gsc_only_pages": [
            {
                "url": page["url"],
                "gsc_impressions": round(metric(page, "gsc_impressions"), 2),
                "gsc_clicks": round(metric(page, "gsc_clicks"), 2),
            }
            for page in top_gsc_only
        ],
        "top_ga4_only_pages": [
            {
                "url": page["url"],
                "ga4_sessions": round(metric(page, "ga4_sessions"), 2),
                "ga4_conversions": round(metric(page, "ga4_conversions"), 2),
            }
            for page in top_ga4_only
        ],
    }
# The _fetch_page_data() helper function - fetches and normalizes data from both GSC and
# GA4 connectors, then merges the page metrics together.
def _fetch_page_data(
    site_url: str | None,
    property_id: str | None,
    start_date: str | None,
    end_date: str | None,
    *,
    include_previous_period: bool,
    max_rows: int,
) -> dict[str, Any]:
    """Fetch per-page GSC and GA4 metrics and merge them by page.

    Each source is queried only when enabled in settings; GA4 additionally
    needs a resolved property id. A source that is not queried contributes an
    empty mapping to the merge.

    Args:
        site_url: GSC site to query; a falsy value falls back to the settings
            default (and, when GSC is enabled, to ``_resolve_site_url``).
        property_id: GA4 property to query; a falsy value falls back to the
            settings default.
        start_date: Start of the current period, passed to ``_default_dates``.
        end_date: End of the current period, passed to ``_default_dates``.
        include_previous_period: Also fetch the "previous" range and hand it
            to the merge step.
        max_rows: Row cap forwarded to both connectors.

    Returns:
        A dict with ``ranges``, the resolved ``site_url`` and ``property_id``,
        the number of normalized current-period pages per source
        (``gsc_pages``, ``ga4_pages``) and the merged rows (``merged_pages``).
    """
    settings = _get_settings()

    period_start, period_end = _default_dates(start_date, end_date)
    ranges = current_and_previous_ranges(
        period_start, period_end, settings.default_lookback_days
    )
    # Ensure current range reflects explicit values.
    ranges["current"] = (period_start, period_end)

    # The previous period is only touched when the caller asked for it.
    periods = ("current", "previous") if include_previous_period else ("current",)

    # Normalized rows keyed by period; they stay empty for a disabled source.
    gsc_by_period: dict[str, dict[str, dict[str, Any]]] = {"current": {}, "previous": {}}
    ga4_by_period: dict[str, dict[str, dict[str, Any]]] = {"current": {}, "previous": {}}

    resolved_site_url: str | None = site_url or settings.default_gsc_site_url
    resolved_property_id = property_id or settings.default_ga4_property_id

    if settings.enable_gsc:
        resolved_site_url = _resolve_site_url(site_url)
        gsc = _get_gsc_connector()
        for period in periods:
            gsc_response = gsc.search_analytics_all(
                resolved_site_url,
                ranges[period][0],
                ranges[period][1],
                dimensions=["page"],
                search_type="web",
                aggregation_type="byPage",
                max_rows=max_rows,
            )
            gsc_by_period[period] = normalize_gsc_rows_by_page(
                gsc_response["rows"],
                dimensions=["page"],
                base_url=settings.canonical_base_url,
            )

    if settings.enable_ga4 and resolved_property_id:
        ga4 = _get_ga4_connector()
        # Report definition shared by every GA4 request in this call.
        report_kwargs: dict[str, Any] = {
            "dimensions": ["landingPagePlusQueryString"],
            "metrics": [
                "sessions",
                "engagedSessions",
                "conversions",
                "totalUsers",
                "screenPageViews",
                "userEngagementDuration",
            ],
            "order_bys": [{"metric": "sessions", "desc": True}],
            "max_rows": max_rows,
        }
        for period in periods:
            ga4_response = ga4.run_report_all(
                resolved_property_id,
                ranges[period][0],
                ranges[period][1],
                **report_kwargs,
            )
            ga4_by_period[period] = normalize_ga4_rows_by_page(
                ga4_response["rows"],
                base_url=settings.canonical_base_url,
            )

    gsc_current = gsc_by_period["current"]
    ga4_current = ga4_by_period["current"]

    # Without a previous period the merge receives None rather than an empty
    # mapping for the previous-period arguments.
    if include_previous_period:
        gsc_previous = gsc_by_period["previous"]
        ga4_previous = ga4_by_period["previous"]
    else:
        gsc_previous = None
        ga4_previous = None

    merged = merge_page_metrics(
        gsc_current,
        ga4_current,
        gsc_previous=gsc_previous,
        ga4_previous=ga4_previous,
    )

    return {
        "ranges": ranges,
        "site_url": resolved_site_url,
        "property_id": resolved_property_id,
        "gsc_pages": len(gsc_current),
        "ga4_pages": len(ga4_current),
        "merged_pages": merged,
    }