ChatSpatial

spatial_genes.py•33.1 KiB

""" Spatial Variable Genes (SVG) identification for ChatSpatial MCP. This module provides implementations for SVG detection methods including SpatialDE and SPARK-X, enabling comprehensive spatial transcriptomics analysis. Each method offers distinct advantages for identifying genes with spatial expression patterns. Methods Overview: - SPARK-X (default): Non-parametric statistical method, best accuracy, requires R - SpatialDE: Gaussian process-based kernel method, statistically rigorous The module integrates these tools into the ChatSpatial MCP framework, handling data preparation, execution, result formatting, and error management across different computational backends. """ from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from ..spatial_mcp_adapter import ToolContext from collections import Counter import numpy as np import pandas as pd import scipy.sparse as sp from ..models.analysis import SpatialVariableGenesResult # noqa: E402 from ..models.data import SpatialVariableGenesParameters # noqa: E402 from ..utils import validate_var_column # noqa: E402 from ..utils.adata_utils import ( # noqa: E402 get_raw_data_source, require_spatial_coords, to_dense, ) from ..utils.dependency_manager import require # noqa: E402 from ..utils.exceptions import DataNotFoundError # noqa: E402 from ..utils.exceptions import DataError, ParameterError, ProcessingError from ..utils.mcp_utils import suppress_output # noqa: E402 # ============================================================================= # Shared Utilities for Spatial Variable Gene Detection # ============================================================================= # Default limit for spatial_genes list returned to LLM # Full results stored in adata.var for complete access DEFAULT_TOP_GENES_LIMIT = 500 def _ensure_unique_gene_names(gene_names: list[str]) -> list[str]: """Ensure gene names are unique by adding suffixes to duplicates. Required for R-based methods (SPARK-X) that use gene names as rownames. Args: gene_names: List of gene names (may contain duplicates) Returns: List of unique gene names with suffixes added to duplicates """ if len(gene_names) == len(set(gene_names)): return gene_names gene_counts = Counter(gene_names) unique_names = [] seen_counts: dict[str, int] = {} for gene in gene_names: if gene_counts[gene] > 1: if gene not in seen_counts: seen_counts[gene] = 0 unique_names.append(gene) else: seen_counts[gene] += 1 unique_names.append(f"{gene}_{seen_counts[gene]}") else: unique_names.append(gene) return unique_names def _calculate_sparse_gene_stats(X) -> tuple[np.ndarray, np.ndarray]: """Calculate gene statistics on sparse or dense matrix. Efficiently computes gene totals and expression counts without densifying the entire matrix. Args: X: Gene expression matrix (cells × genes), sparse or dense Returns: Tuple of (gene_totals, n_expressed_per_gene) as 1D arrays """ is_sparse = sp.issparse(X) if is_sparse: gene_totals = np.array(X.sum(axis=0)).flatten() n_expressed = np.array((X > 0).sum(axis=0)).flatten() else: gene_totals = np.asarray(X.sum(axis=0)).flatten() n_expressed = np.asarray((X > 0).sum(axis=0)).flatten() return gene_totals, n_expressed async def identify_spatial_genes( data_id: str, ctx: "ToolContext", params: SpatialVariableGenesParameters, ) -> SpatialVariableGenesResult: """ Identify spatial variable genes using statistical methods. This is the main entry point for spatial gene detection, routing to the appropriate method based on params.method. Each method has different strengths: Method Selection Guide: - SPARK-X (default): Best for accuracy, handles large datasets efficiently - SpatialDE: Best for statistical rigor in publication-ready analyses Data Requirements: - SPARK-X: Works with raw counts or normalized data - SpatialDE: Works with raw count data Args: data_id: Dataset identifier in data store ctx: ToolContext for data access and logging params: Method-specific parameters (see SpatialVariableGenesParameters) Returns: SpatialVariableGenesResult containing: - List of significant spatial genes - Statistical metrics (p-values, q-values) - Method-specific results Raises: ValueError: If dataset not found or spatial coordinates missing ImportError: If required method dependencies not installed Performance Notes: - SPARK-X: ~2-5 min for 3000 spots × 20000 genes - SpatialDE: ~15-30 min (scales with spot count squared) """ # Get data via ToolContext adata = await ctx.get_adata(data_id) # Validate spatial coordinates exist require_spatial_coords(adata, spatial_key=params.spatial_key) # Route to appropriate method if params.method == "spatialde": return await _identify_spatial_genes_spatialde(data_id, adata, params, ctx) elif params.method == "sparkx": return await _identify_spatial_genes_sparkx(data_id, adata, params, ctx) else: raise ParameterError( f"Unsupported method: {params.method}. Available methods: spatialde, sparkx" ) async def _identify_spatial_genes_spatialde( data_id: str, adata: Any, params: SpatialVariableGenesParameters, ctx: "ToolContext", ) -> SpatialVariableGenesResult: """ Identify spatial variable genes using the SpatialDE statistical framework. SpatialDE employs Gaussian process regression with spatial kernels to decompose gene expression variance into spatial and non-spatial components. It provides rigorous statistical testing for spatial expression patterns with multiple testing correction. Official Preprocessing Workflow (Implemented): This implementation follows the official SpatialDE best practices: 1. Filter low-expression genes (total_counts >= 3) 2. Variance stabilization (NaiveDE.stabilize) 3. Regress out library size effects (NaiveDE.regress_out) 4. Run SpatialDE spatial covariance test 5. Apply FDR correction (Storey q-value) Method Details: - Models spatial correlation using squared exponential kernel - Tests significance via likelihood ratio test - Applies FDR correction for multiple testing - Returns both raw and adjusted p-values Key Parameters: - n_top_genes: Limit analysis to top N genes (for performance) * If provided, preferentially uses HVGs if available * Recommended: 1000-3000 for quick analysis * None (default): Test all genes (may take 15-30 min for large datasets) Performance Notes: - ~10 minutes for 14,000 genes (official benchmark) - Scales approximately linearly with gene count - Performance warning issued when n_genes > 5000 - Tip: Use n_top_genes parameter to reduce runtime Data Requirements: - Raw count data (from adata.raw or adata.X) - 2D spatial coordinates in adata.obsm['spatial'] - Data will be automatically preprocessed using official workflow Returns: Results including: - List of significant spatial genes (q-value < 0.05) - Log-likelihood ratios as test statistics - Raw p-values and FDR-corrected q-values - Spatial correlation length scale per gene Requirements: - SpatialDE package with NaiveDE module - 2D spatial coordinates - Raw count data (not normalized) References: Svensson et al. (2018) "SpatialDE: identification of spatially variable genes" Nature Methods, DOI: 10.1038/nmeth.4636 Official tutorial: https://github.com/Teichlab/SpatialDE """ # Use centralized dependency manager for consistent error handling require("spatialde") # Raises ImportError with install instructions if missing # Apply scipy compatibility patch for SpatialDE (scipy >= 1.14 removed scipy.misc.derivative) from ..utils.compat import ensure_spatialde_compat ensure_spatialde_compat() import NaiveDE import SpatialDE from SpatialDE.util import qvalue # Prepare spatial coordinates coords = pd.DataFrame( adata.obsm[params.spatial_key][:, :2], # Ensure 2D coordinates columns=["x", "y"], index=adata.obs_names, ) # Get raw count data for SpatialDE preprocessing # Use get_raw_data_source (single source of truth) for complete gene coverage # OPTIMIZATION: Filter genes on SPARSE matrix first, then convert only selected genes to dense raw_result = get_raw_data_source(adata, prefer_complete_genes=True) raw_data = raw_result.X var_names = raw_result.var_names var_df = adata.var # For HVG lookup (HVG info is in adata.var) # Step 1: Filter low-expression genes ON SPARSE MATRIX (Official recommendation) # SpatialDE README: "Filter practically unobserved genes" with total_counts >= 3 gene_totals, _ = _calculate_sparse_gene_stats(raw_data) keep_genes_mask = gene_totals >= 3 selected_var_names = var_names[keep_genes_mask] # Step 2: Select top N HVGs ON SPARSE MATRIX (if requested) # This further reduces genes BEFORE densification final_genes = selected_var_names if params.n_top_genes is not None and params.n_top_genes < len(selected_var_names): if "highly_variable" in var_df.columns: # Prioritize HVGs if available hvg_mask = var_df.loc[selected_var_names, "highly_variable"] hvg_genes = selected_var_names[hvg_mask] if len(hvg_genes) >= params.n_top_genes: # Use HVGs final_genes = hvg_genes[: params.n_top_genes] else: # Not enough HVGs, select by expression gene_totals_filtered = gene_totals[keep_genes_mask] top_indices = np.argsort(gene_totals_filtered)[-params.n_top_genes :][ ::-1 ] final_genes = selected_var_names[top_indices] else: # Select by expression gene_totals_filtered = gene_totals[keep_genes_mask] top_indices = np.argsort(gene_totals_filtered)[-params.n_top_genes :][::-1] final_genes = selected_var_names[top_indices] # Step 3: Slice sparse matrix to final genes, THEN convert to dense # This is where the memory optimization happens: only convert selected genes # Directly use raw_result (single source of truth) - no need for double access gene_indices = [raw_result.var_names.get_loc(g) for g in final_genes] # Now create DataFrame from the SUBSET (much smaller memory footprint) counts = pd.DataFrame( to_dense(raw_result.X[:, gene_indices]), columns=final_genes, index=adata.obs_names, ) # Performance warning for large gene sets n_genes = counts.shape[1] n_spots = counts.shape[0] if n_genes > 5000: estimated_time = int(n_genes / 14000 * 10) # Based on 14k genes = 10 min await ctx.warning( f"WARNING:Running SpatialDE on {n_genes} genes × {n_spots} spots may take {estimated_time}-{estimated_time*2} minutes.\n" f" • Official benchmark: ~10 min for 14,000 genes\n" f" • Tip: Use n_top_genes=1000-3000 to test fewer genes\n" f" • Or use method='sparkx' for faster analysis (2-5 min)" ) # Calculate total counts per spot for regress_out total_counts = pd.DataFrame( {"total_counts": counts.sum(axis=1)}, index=counts.index ) # Apply official SpatialDE preprocessing workflow # Step 1: Variance stabilization norm_expr = NaiveDE.stabilize(counts.T).T # Step 2: Regress out library size effects resid_expr = NaiveDE.regress_out( total_counts, norm_expr.T, "np.log(total_counts)" ).T # Step 3: Run SpatialDE results = SpatialDE.run(coords.values, resid_expr) # Multiple testing correction using Storey q-value method if params.spatialde_pi0 is not None: # User-specified pi0 value results["qval"] = qvalue(results["pval"].values, pi0=params.spatialde_pi0) else: # Adaptive pi0 estimation (SpatialDE default, recommended) results["qval"] = qvalue(results["pval"].values) # Sort by q-value results = results.sort_values("qval") # Filter significant genes significant_genes_all = results[results["qval"] < 0.05]["g"].tolist() # Limit for MCP response (full results stored in adata.var) limit = params.n_top_genes or DEFAULT_TOP_GENES_LIMIT significant_genes = significant_genes_all[:limit] # Store results in adata results_key = f"spatialde_results_{data_id}" adata.var["spatialde_pval"] = results.set_index("g")["pval"] adata.var["spatialde_qval"] = results.set_index("g")["qval"] adata.var["spatialde_l"] = results.set_index("g")["l"] # Store scientific metadata for reproducibility from ..utils.adata_utils import store_analysis_metadata from ..utils.results_export import export_analysis_result store_analysis_metadata( adata, analysis_name="spatial_genes_spatialde", method="spatialde_official_workflow", parameters={ "kernel": params.spatialde_kernel, "preprocessing": "NaiveDE.stabilize + NaiveDE.regress_out", "gene_filter_threshold": 3, "n_genes_tested": n_genes, "n_spots": n_spots, "pi0": ( params.spatialde_pi0 if params.spatialde_pi0 is not None else "adaptive" ), }, results_keys={ "var": ["spatialde_pval", "spatialde_qval", "spatialde_l"], "obs": [], "obsm": [], "uns": [], }, statistics={ "n_genes_analyzed": len(results), "n_significant_genes": len( results[results["qval"] < 0.05] # FDR standard threshold ), }, ) # Export results to CSV for reproducibility export_analysis_result(adata, data_id, "spatial_genes_spatialde") # Note: Detailed statistics (gene_statistics, p_values, q_values) are excluded # from MCP response via Field(exclude=True) in SpatialVariableGenesResult. # Full results are accessible via adata.var['spatialde_pval', 'spatialde_qval']. result = SpatialVariableGenesResult( data_id=data_id, method="spatialde", n_genes_analyzed=len(results), n_significant_genes=len(significant_genes_all), spatial_genes=significant_genes, results_key=results_key, ) return result async def _identify_spatial_genes_sparkx( data_id: str, adata: Any, params: SpatialVariableGenesParameters, ctx: "ToolContext", ) -> SpatialVariableGenesResult: """ Identify spatial variable genes using the SPARK-X non-parametric method. SPARK-X is an efficient non-parametric method for detecting spatially variable genes without assuming specific distribution models. It uses spatial covariance testing and is particularly effective for large-scale datasets. The method is implemented in R and accessed via rpy2. Method Advantages: - Non-parametric: No distributional assumptions required - Computationally efficient: Scales well with gene count - Robust: Handles various spatial patterns effectively - Flexible: Works with both single and mixture spatial kernels Gene Filtering Pipeline (based on SPARK-X paper + 2024 best practices): TIER 1 - Standard Filtering (SPARK-X paper): - filter_mt_genes: Remove mitochondrial genes (MT-*, mt-*) [default: True] - filter_ribo_genes: Remove ribosomal genes (RPS*, RPL*) [default: False] - Expression filtering: Min percentage + total counts TIER 2 - Advanced Options (2024 best practice from PMC11537352): - test_only_hvg: Test only highly variable genes [default: False] * Reduces housekeeping gene dominance * Requires prior HVG computation in preprocessing TIER 3 - Quality Warnings: - warn_housekeeping: Warn if >30% top genes are housekeeping [default: True] * Alerts about potential biological interpretation issues Key Parameters: - sparkx_option: 'single' or 'mixture' kernel (default: 'mixture') - sparkx_percentage: Min percentage of cells expressing gene (default: 0.1) - sparkx_min_total_counts: Min total counts per gene (default: 10) - sparkx_num_core: Number of CPU cores for parallel processing - filter_mt_genes: Filter mitochondrial genes (default: True) - filter_ribo_genes: Filter ribosomal genes (default: False) - test_only_hvg: Test only HVGs (default: False) - warn_housekeeping: Warn about housekeeping dominance (default: True) Data Processing: - Automatically filters low-expression genes based on parameters - Uses raw counts when available (adata.raw), otherwise current matrix - Handles duplicate gene names by adding suffixes Returns: Results including: - List of significant spatial genes (adjusted p-value < 0.05) - Raw p-values from spatial covariance test - Bonferroni-adjusted p-values - Results dataframe with all tested genes - Quality warnings if housekeeping genes dominate Requirements: - R installation with SPARK package - rpy2 Python package for R integration - Raw count data preferred (will use adata.raw if available) Performance: - Fastest among the three methods - ~2-5 minutes for typical datasets (3000 spots × 20000 genes) - Memory efficient through gene filtering References: - SPARK-X paper: Sun et al. (2021) Genome Biology - HVG+SVG best practice: PMC11537352 (2024) """ # Use centralized dependency manager for consistent error handling require("rpy2") # Raises ImportError with install instructions if missing from rpy2 import robjects as ro from rpy2.rinterface_lib import openrlib # For thread safety from rpy2.robjects import conversion, default_converter from rpy2.robjects.packages import importr # Prepare spatial coordinates - SPARK needs data.frame format coords_array = adata.obsm[params.spatial_key][:, :2].astype(float) n_spots, n_genes = adata.shape # ==================== OPTIMIZED: Filter on sparse matrix, then convert ==================== # Strategy: Keep data sparse throughout filtering, only convert final filtered result # Benefit: For 30k cells × 20k genes → 3k genes: save ~15GB memory # Get sparse count matrix using get_raw_data_source (single source of truth) raw_result = get_raw_data_source(adata, prefer_complete_genes=True) sparse_counts = raw_result.X # Keep sparse! gene_names = [str(name) for name in raw_result.var_names] n_genes = len(gene_names) # Ensure gene names are unique (required for SPARK-X R rownames) gene_names = _ensure_unique_gene_names(gene_names) # ==================== Gene Filtering Pipeline (ON SPARSE MATRIX) ==================== # Following SPARK-X paper best practices + 2024 literature recommendations # All filtering done on sparse matrix to minimize memory usage # Initialize gene mask (all True = keep all genes initially) gene_mask = np.ones(len(gene_names), dtype=bool) # TIER 1: Mitochondrial gene filtering (SPARK-X paper standard practice) # Use pattern-based detection on gene names (works regardless of data source) if params.filter_mt_genes: mt_mask = np.array( [gene.startswith(("MT-", "mt-")) for gene in gene_names] ) n_mt_genes = mt_mask.sum() if n_mt_genes > 0: gene_mask &= ~mt_mask # Exclude MT genes # TIER 1: Ribosomal gene filtering (optional) # Use pattern-based detection on gene names if params.filter_ribo_genes: ribo_mask = np.array( [gene.startswith(("RPS", "RPL", "Rps", "Rpl")) for gene in gene_names] ) n_ribo_genes = ribo_mask.sum() if n_ribo_genes > 0: gene_mask &= ~ribo_mask # Exclude ribosomal genes # TIER 2: HVG-only testing (2024 best practice from PMC11537352) if params.test_only_hvg: # Check if HVGs are available in adata.var (the preprocessed data) validate_var_column( adata, "highly_variable", "Highly variable genes marker (test_only_hvg=True requires this)", ) # Get HVG list from preprocessed data (adata.var) hvg_genes_set = set(adata.var_names[adata.var["highly_variable"]]) if len(hvg_genes_set) == 0: raise DataNotFoundError("No HVGs found. Run preprocessing first.") # Filter gene_names to only include HVGs hvg_mask = np.array([gene in hvg_genes_set for gene in gene_names]) n_hvg = hvg_mask.sum() if n_hvg == 0: # No overlap between current gene list and HVGs raise DataError( f"test_only_hvg=True but no overlap found between current gene list ({len(gene_names)} genes) " f"and HVGs ({len(hvg_genes_set)} genes). " "This may occur if adata.raw contains different genes than the preprocessed data. " "Try setting test_only_hvg=False or ensure adata.raw is None." ) gene_mask &= hvg_mask # Keep only HVGs # TIER 1: Apply SPARK-X standard filtering (expression-based) - ON SPARSE MATRIX percentage = params.sparkx_percentage min_total_counts = params.sparkx_min_total_counts # Calculate gene statistics on sparse matrix (efficient!) gene_totals, n_expressed = _calculate_sparse_gene_stats(sparse_counts) # Filter genes: must be expressed in at least percentage of cells AND have min total counts min_cells = int(np.ceil(n_spots * percentage)) expr_mask = (n_expressed >= min_cells) & (gene_totals >= min_total_counts) gene_mask &= expr_mask # Combine with previous filters # Apply combined filter mask to sparse matrix (still sparse!) if gene_mask.sum() < len(gene_names): filtered_sparse = sparse_counts[:, gene_mask] gene_names = [ gene for gene, keep in zip(gene_names, gene_mask, strict=False) if keep ] else: filtered_sparse = sparse_counts # NOW convert filtered sparse matrix to dense (much smaller!) # copy=True ensures we don't modify original for dense input counts_matrix = to_dense(filtered_sparse, copy=True) # Ensure counts are non-negative integers counts_matrix = np.maximum(counts_matrix, 0).astype(int) # Update gene count after filtering n_genes = len(gene_names) # Transpose for SPARK format (genes × spots) counts_transposed = counts_matrix.T # Create spot names spot_names = [str(name) for name in adata.obs_names] # Wrap ALL R operations in thread lock and localconverter for proper contextvars handling # This prevents "Conversion rules missing" errors in multithreaded/async environments with openrlib.rlock: # Thread safety lock with conversion.localconverter(default_converter): # Conversion context # Import SPARK package inside context (FIX for contextvars issue) try: spark = importr("SPARK") except Exception as e: raise ImportError( f"SPARK not installed in R. Install with: install.packages('SPARK'). Error: {e}" ) from e # Convert to R format (already in context) # Count matrix: genes × spots r_counts = ro.r.matrix( ro.IntVector(counts_transposed.flatten()), nrow=n_genes, ncol=n_spots, byrow=True, ) r_counts.rownames = ro.StrVector(gene_names) r_counts.colnames = ro.StrVector(spot_names) # Coordinates as data.frame (SPARK requirement) coords_df = pd.DataFrame(coords_array, columns=["x", "y"], index=spot_names) r_coords = ro.r["data.frame"]( x=ro.FloatVector(coords_df["x"]), y=ro.FloatVector(coords_df["y"]), row_names=ro.StrVector(coords_df.index), ) try: # Execute SPARK-X analysis inside context (FIX for contextvars issue) # Keep suppress_output for MCP communication compatibility with suppress_output(): results = spark.sparkx( count_in=r_counts, locus_in=r_coords, X_in=ro.NULL, # No additional covariates (could be extended in future) numCores=params.sparkx_num_core, option=params.sparkx_option, verbose=False, # Ensure verbose is off for cleaner MCP communication ) # Extract p-values from results (inside context for proper conversion) # SPARK-X returns res_mtest as a data.frame with columns: # - combinedPval: combined p-values across spatial kernels # - adjustedPval: BY-adjusted p-values (Benjamini-Yekutieli FDR correction) # Reference: SPARK R package documentation try: pvals = results.rx2("res_mtest") if pvals is None: raise ProcessingError( "SPARK-X returned None for res_mtest. " "This may indicate the analysis failed silently." ) # Verify expected data.frame format is_dataframe = ro.r["is.data.frame"](pvals)[0] if not is_dataframe: raise ProcessingError( "SPARK-X output format error. Requires SPARK >= 1.1.0." ) # Extract combinedPval (raw p-values combined across kernels) combined_pvals = ro.r["$"](pvals, "combinedPval") if combined_pvals is None: raise ProcessingError( "SPARK-X res_mtest missing 'combinedPval' column. " "This is required for spatial gene identification." ) pval_list = [float(p) for p in combined_pvals] # Extract adjustedPval (BY-corrected p-values from SPARK-X) adjusted_pvals = ro.r["$"](pvals, "adjustedPval") if adjusted_pvals is None: raise ProcessingError( "SPARK-X res_mtest missing 'adjustedPval' column. " "This column contains BY-corrected p-values for multiple testing." ) adjusted_pval_list = [float(p) for p in adjusted_pvals] # Create results dataframe results_df = pd.DataFrame( { "gene": gene_names[: len(pval_list)], "pvalue": pval_list, "adjusted_pvalue": adjusted_pval_list, # BY-corrected by SPARK-X } ) # Warn if returned genes much fewer than input genes if len(results_df) < n_genes * 0.5: await ctx.warning( f"SPARK-X returned results for only {len(results_df)}/{n_genes} genes. " f"This may indicate a problem with the R environment, SPARK package, or input data. " f"Consider checking R logs or trying SpatialDE as an alternative method." ) except Exception as e: # P-value extraction failed - provide clear error message raise ProcessingError( f"SPARK-X p-value extraction failed: {e}\n\n" f"Expected SPARK-X output format:\n" f"SPARK-X output invalid. Requires SPARK >= 1.1.0." ) from e except Exception as e: raise ProcessingError(f"SPARK-X analysis failed: {e}") from e # Sort by adjusted p-value results_df = results_df.sort_values("adjusted_pvalue") # Filter significant genes significant_genes_all = results_df[results_df["adjusted_pvalue"] < 0.05][ "gene" ].tolist() # Limit for MCP response (full results stored in adata.var) limit = params.n_top_genes or DEFAULT_TOP_GENES_LIMIT significant_genes = significant_genes_all[:limit] # TIER 3: Housekeeping gene warnings (post-processing quality check) if params.warn_housekeeping and len(results_df) > 0: # Define housekeeping gene patterns (based on literature) housekeeping_patterns = [ "RPS", # Ribosomal protein small subunit "RPL", # Ribosomal protein large subunit "Rps", # Mouse ribosomal small "Rpl", # Mouse ribosomal large "MT-", # Mitochondrial (human) "mt-", # Mitochondrial (mouse) "ACTB", # Beta-actin "GAPDH", # Glyceraldehyde-3-phosphate dehydrogenase "EEF1A1", # Eukaryotic translation elongation factor 1 alpha 1 "TUBA1B", # Tubulin alpha 1b "B2M", # Beta-2-microglobulin ] # Check top significant genes (up to 50) top_genes_to_check = results_df.head(50)["gene"].tolist() # Mark housekeeping genes housekeeping_genes = [ gene for gene in top_genes_to_check if any( gene.startswith(pattern) or gene == pattern for pattern in housekeeping_patterns ) ] n_housekeeping = len(housekeeping_genes) n_top = len(top_genes_to_check) housekeeping_ratio = n_housekeeping / n_top if n_top > 0 else 0 # Warn if >30% are housekeeping genes if housekeeping_ratio > 0.3: await ctx.warning( f"WARNING:Housekeeping gene dominance detected: {n_housekeeping}/{n_top} ({housekeeping_ratio*100:.1f}%) of top genes are housekeeping genes.\n" f" • Housekeeping genes found: {', '.join(housekeeping_genes[:10])}{'...' if len(housekeeping_genes) > 10 else ''}\n" f" • These genes may not represent true spatial patterns\n" f" • Recommendations:\n" f" 1. Use test_only_hvg=True to reduce housekeeping dominance (2024 best practice)\n" f" 2. Use filter_ribo_genes=True to filter ribosomal genes\n" f" 3. Focus on genes with clear biological relevance\n" f" • Note: This is a quality warning, not an error" ) # Store results in adata results_key = f"sparkx_results_{data_id}" adata.var["sparkx_pval"] = pd.Series( dict(zip(results_df["gene"], results_df["pvalue"], strict=False)), name="sparkx_pval", ).reindex(adata.var_names, fill_value=1.0) adata.var["sparkx_qval"] = pd.Series( dict(zip(results_df["gene"], results_df["adjusted_pvalue"], strict=False)), name="sparkx_qval", ).reindex(adata.var_names, fill_value=1.0) # Store scientific metadata for reproducibility from ..utils.adata_utils import store_analysis_metadata from ..utils.results_export import export_analysis_result store_analysis_metadata( adata, analysis_name="spatial_genes_sparkx", method="sparkx", parameters={ "num_core": params.sparkx_num_core, "percentage": params.sparkx_percentage, "min_total_counts": params.sparkx_min_total_counts, "option": params.sparkx_option, "filter_mt_genes": params.filter_mt_genes, "filter_ribo_genes": params.filter_ribo_genes, "test_only_hvg": params.test_only_hvg, "warn_housekeeping": params.warn_housekeeping, }, results_keys={ "var": ["sparkx_pval", "sparkx_qval"], "obs": [], "obsm": [], "uns": [], }, statistics={ "n_genes_analyzed": len(results_df), "n_significant_genes": len(significant_genes_all), }, ) # Export results to CSV for reproducibility export_analysis_result(adata, data_id, "spatial_genes_sparkx") # Note: Detailed statistics (gene_statistics, p_values, q_values) are excluded # from MCP response via Field(exclude=True) in SpatialVariableGenesResult. # Full results are accessible via adata.var['sparkx_pval', 'sparkx_qval']. result = SpatialVariableGenesResult( data_id=data_id, method="sparkx", n_genes_analyzed=len(results_df), n_significant_genes=len(significant_genes_all), spatial_genes=significant_genes, results_key=results_key, ) return result

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/cafferychen777/ChatSpatial'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

spatial_genes.py•33.1 KiB