Skip to main content
Glama

Skill-to-MCP

qc_analysis.py9.53 kB
#!/usr/bin/env python3 """ Quality Control Analysis for Single-Cell RNA-seq Data Following scverse best practices from: https://www.sc-best-practices.org/preprocessing_visualization/quality_control.html This is a convenience script that runs a complete QC workflow using the modular functions from qc_core.py and qc_plotting.py. """ import anndata as ad import scanpy as sc import sys import os import argparse # Import our modular utilities from qc_core import ( calculate_qc_metrics, detect_outliers_mad, apply_hard_threshold, filter_cells, filter_genes, print_qc_summary ) from qc_plotting import ( plot_qc_distributions, plot_filtering_thresholds, plot_qc_after_filtering ) print("=" * 80) print("Single-Cell RNA-seq Quality Control Analysis") print("=" * 80) # Default parameters (single source of truth) DEFAULT_MAD_COUNTS = 5 DEFAULT_MAD_GENES = 5 DEFAULT_MAD_MT = 3 DEFAULT_MT_THRESHOLD = 8 DEFAULT_MIN_CELLS = 20 DEFAULT_MT_PATTERN = 'mt-,MT-' DEFAULT_RIBO_PATTERN = 'Rpl,Rps,RPL,RPS' DEFAULT_HB_PATTERN = '^Hb[^(p)]|^HB[^(P)]' # Parse command-line arguments parser = argparse.ArgumentParser( description='Quality Control Analysis for Single-Cell RNA-seq Data', formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: python3 qc_analysis.py data.h5ad python3 qc_analysis.py raw_feature_bc_matrix.h5 python3 qc_analysis.py data.h5ad --mad-counts 4 --mad-genes 4 --mad-mt 2.5 python3 qc_analysis.py data.h5ad --mt-threshold 10 --min-cells 10 python3 qc_analysis.py data.h5ad --mt-pattern "^mt-" --ribo-pattern "^Rpl,^Rps" """ ) parser.add_argument('input_file', help='Input .h5ad or .h5 file (10X Genomics format)') parser.add_argument('--output-dir', type=str, help='Output directory (default: <input_basename>_qc_results)') parser.add_argument('--mad-counts', type=float, default=DEFAULT_MAD_COUNTS, help=f'MAD threshold for total counts (default: {DEFAULT_MAD_COUNTS})') parser.add_argument('--mad-genes', type=float, default=DEFAULT_MAD_GENES, help=f'MAD threshold for gene counts (default: {DEFAULT_MAD_GENES})') parser.add_argument('--mad-mt', type=float, default=DEFAULT_MAD_MT, help=f'MAD threshold for mitochondrial percentage (default: {DEFAULT_MAD_MT})') parser.add_argument('--mt-threshold', type=float, default=DEFAULT_MT_THRESHOLD, help=f'Hard threshold for mitochondrial percentage (default: {DEFAULT_MT_THRESHOLD})') parser.add_argument('--min-cells', type=int, default=DEFAULT_MIN_CELLS, help=f'Minimum cells for gene filtering (default: {DEFAULT_MIN_CELLS})') parser.add_argument('--mt-pattern', type=str, default=DEFAULT_MT_PATTERN, help=f'Comma-separated mitochondrial gene prefixes (default: "{DEFAULT_MT_PATTERN}")') parser.add_argument('--ribo-pattern', type=str, default=DEFAULT_RIBO_PATTERN, help=f'Comma-separated ribosomal gene prefixes (default: "{DEFAULT_RIBO_PATTERN}")') parser.add_argument('--hb-pattern', type=str, default=DEFAULT_HB_PATTERN, help=f'Hemoglobin gene regex pattern (default: "{DEFAULT_HB_PATTERN}")') args = parser.parse_args() # Verify input file exists if not os.path.exists(args.input_file): print(f"\nError: File '{args.input_file}' not found!") sys.exit(1) input_file = args.input_file base_name = os.path.splitext(os.path.basename(input_file))[0] # Set up output directory if args.output_dir: output_dir = args.output_dir else: output_dir = f"{base_name}_qc_results" os.makedirs(output_dir, exist_ok=True) print(f"\nOutput directory: {output_dir}") # Display parameters print(f"\nParameters:") print(f" MAD thresholds: counts={args.mad_counts}, genes={args.mad_genes}, MT%={args.mad_mt}") print(f" MT hard threshold: {args.mt_threshold}%") print(f" Min cells for gene filtering: {args.min_cells}") print(f" Gene patterns: MT={args.mt_pattern}, Ribo={args.ribo_pattern}") # Load the data print("\n[1/5] Loading data...") file_ext = os.path.splitext(input_file)[1].lower() if file_ext == '.h5ad': adata = ad.read_h5ad(input_file) print(f"Loaded .h5ad file: {adata.n_obs} cells × {adata.n_vars} genes") elif file_ext == '.h5': adata = sc.read_10x_h5(input_file) print(f"Loaded 10X .h5 file: {adata.n_obs} cells × {adata.n_vars} genes") # Make variable names unique (10X data sometimes has duplicate gene names) adata.var_names_make_unique() else: print(f"\nError: Unsupported file format '{file_ext}'. Expected .h5ad or .h5") sys.exit(1) # Store original counts for comparison n_cells_original = adata.n_obs n_genes_original = adata.n_vars # Calculate QC metrics print("\n[2/5] Calculating QC metrics...") calculate_qc_metrics(adata, mt_pattern=args.mt_pattern, ribo_pattern=args.ribo_pattern, hb_pattern=args.hb_pattern, inplace=True) print(f" Found {adata.var['mt'].sum()} mitochondrial genes (pattern: {args.mt_pattern})") print(f" Found {adata.var['ribo'].sum()} ribosomal genes (pattern: {args.ribo_pattern})") print(f" Found {adata.var['hb'].sum()} hemoglobin genes (pattern: {args.hb_pattern})") print_qc_summary(adata, label='QC Metrics Summary (before filtering)') # Create before-filtering visualizations print("\n[3/5] Creating QC visualizations...") before_plot = os.path.join(output_dir, 'qc_metrics_before_filtering.png') plot_qc_distributions(adata, before_plot, title='Quality Control Metrics - Before Filtering') print(f" Saved: {before_plot}") # Apply MAD-based filtering print("\n[4/5] Applying MAD-based filtering thresholds...") # Detect outliers for each metric adata.obs['outlier_counts'] = detect_outliers_mad(adata, 'total_counts', args.mad_counts) adata.obs['outlier_genes'] = detect_outliers_mad(adata, 'n_genes_by_counts', args.mad_genes) adata.obs['outlier_mt'] = detect_outliers_mad(adata, 'pct_counts_mt', args.mad_mt) # Apply hard threshold for mitochondrial content print(f"\n Applying hard threshold for mitochondrial content (>{args.mt_threshold}%):") high_mt_mask = apply_hard_threshold(adata, 'pct_counts_mt', args.mt_threshold, operator='>') # Combine MT filters (MAD + hard threshold) adata.obs['outlier_mt'] = adata.obs['outlier_mt'] | high_mt_mask # Overall filtering decision adata.obs['pass_qc'] = ~( adata.obs['outlier_counts'] | adata.obs['outlier_genes'] | adata.obs['outlier_mt'] ) print(f"\n Total cells failing QC: {(~adata.obs['pass_qc']).sum()} ({(~adata.obs['pass_qc']).sum()/adata.n_obs*100:.2f}%)") print(f" Cells passing QC: {adata.obs['pass_qc'].sum()} ({adata.obs['pass_qc'].sum()/adata.n_obs*100:.2f}%)") # Visualize filtering thresholds outlier_masks = { 'total_counts': adata.obs['outlier_counts'].values, 'n_genes_by_counts': adata.obs['outlier_genes'].values, 'pct_counts_mt': adata.obs['outlier_mt'].values } thresholds = { 'total_counts': {'n_mads': args.mad_counts}, 'n_genes_by_counts': {'n_mads': args.mad_genes}, 'pct_counts_mt': {'n_mads': args.mad_mt, 'hard': args.mt_threshold} } threshold_plot = os.path.join(output_dir, 'qc_filtering_thresholds.png') plot_filtering_thresholds(adata, outlier_masks, thresholds, threshold_plot) print(f"\n Saved: {threshold_plot}") # Apply filtering print("\n[5/5] Applying filters...") adata_filtered = filter_cells(adata, adata.obs['pass_qc'].values, inplace=False) print(f" Cells after filtering: {adata_filtered.n_obs} (removed {n_cells_original - adata_filtered.n_obs})") # Filter genes print(f"\n Filtering genes detected in <{args.min_cells} cells...") filter_genes(adata_filtered, min_cells=args.min_cells, inplace=True) print(f" Genes after filtering: {adata_filtered.n_vars} (removed {n_genes_original - adata_filtered.n_vars})") # Generate summary statistics print("\n" + "=" * 80) print("QC Summary") print("=" * 80) print("\nBefore filtering:") print(f" Cells: {n_cells_original}") print(f" Genes: {n_genes_original}") print("\nAfter filtering:") print(f" Cells: {adata_filtered.n_obs} ({adata_filtered.n_obs/n_cells_original*100:.1f}% retained)") print(f" Genes: {adata_filtered.n_vars} ({adata_filtered.n_vars/n_genes_original*100:.1f}% retained)") print_qc_summary(adata_filtered, label='\nFiltered data QC metrics') # Create after-filtering visualizations after_plot = os.path.join(output_dir, 'qc_metrics_after_filtering.png') plot_qc_after_filtering(adata_filtered, after_plot) print(f"\n Saved: {after_plot}") # Save filtered data print("\nSaving filtered data...") output_filtered = os.path.join(output_dir, f'{base_name}_filtered.h5ad') output_with_qc = os.path.join(output_dir, f'{base_name}_with_qc.h5ad') adata_filtered.write(output_filtered) print(f" Saved: {output_filtered}") # Also save the unfiltered data with QC annotations adata.write(output_with_qc) print(f" Saved: {output_with_qc} (original data with QC annotations)") print("\n" + "=" * 80) print("Quality Control Analysis Complete!") print("=" * 80) print(f"\nAll results saved to: {output_dir}/") print("\nGenerated files:") print(" 1. qc_metrics_before_filtering.png - Initial QC visualizations") print(" 2. qc_filtering_thresholds.png - MAD-based threshold visualization") print(" 3. qc_metrics_after_filtering.png - Post-filtering QC visualizations") print(f" 4. {base_name}_filtered.h5ad - Filtered dataset") print(f" 5. {base_name}_with_qc.h5ad - Original dataset with QC annotations") print("\nNext steps:") print(" - Consider ambient RNA correction (SoupX)") print(" - Consider doublet detection (scDblFinder)") print(" - Proceed with normalization and downstream analysis")

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/biocontext-ai/skill-to-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server