Medical GraphRAG Assistant

ingest_mimic_images.py•10.6 kB

#!/usr/bin/env python3
"""
Ingest MIMIC-CXR Images into IRIS (VectorSearch.MIMICCXRImages)

Loads DICOM chest X-rays with NV-CLIP embeddings for semantic search.
Supports both local testing (mock embeddings) and production (real NV-CLIP).
"""

import sys
import os
import json
import time
from pathlib import Path
from typing import Optional, List, Dict

# Add project root to path
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '.'))
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

from src.db.connection import get_connection

# Try to import NV-CLIP embeddings
try:
    from src.embeddings.nvclip_embeddings import NVCLIPEmbeddings
    NVCLIP_AVAILABLE = True
except ImportError:
    NVCLIP_AVAILABLE = False
    print("Warning: NV-CLIP not available, will use mock embeddings", file=sys.stderr)

# Length of the all-zero placeholder vector stored when no real embedding is
# produced. NOTE(review): presumably matches the width of the Vector column
# and of the NV-CLIP output -- confirm against the table definition.
EMBEDDING_DIM = 1024

# Progress is reported every this many files (and after the last file).
PROGRESS_INTERVAL = 100


def get_embedder():
    """Get NV-CLIP embedder or None if unavailable.

    Returns:
        An ``NVCLIPEmbeddings`` instance, or None when the module could not
        be imported or its constructor raised (a warning is printed to
        stderr in the latter case).
    """
    if not NVCLIP_AVAILABLE:
        return None

    try:
        return NVCLIPEmbeddings()
    except Exception as e:
        # Best effort: callers fall back to mock embeddings.
        print(f"Warning: Could not initialize NV-CLIP: {e}", file=sys.stderr)
        return None


def find_dicom_files(base_path: str, limit: Optional[int] = None) -> List[Path]:
    """
    Find DICOM files in MIMIC-CXR directory structure.

    MIMIC-CXR structure: files/pXX/pXXXXXXXX/sXXXXXXXX/*.dcm

    Args:
        base_path: Root path to MIMIC-CXR files
        limit: Optional limit on number of files to return
            (None or 0 means no limit)

    Returns:
        List of Path objects to DICOM files

    Raises:
        FileNotFoundError: If base_path does not exist
    """
    base = Path(base_path)

    if not base.exists():
        raise FileNotFoundError(f"MIMIC-CXR path not found: {base_path}")

    print(f"📂 Scanning for DICOM files in {base_path}...")

    dicom_files = []
    for dcm_file in base.rglob("*.dcm"):
        dicom_files.append(dcm_file)
        # Stop walking the tree as soon as enough files were collected.
        if limit and len(dicom_files) >= limit:
            break

    print(f"Found {len(dicom_files)} DICOM files")
    return dicom_files


def _is_patient_folder(name: str) -> bool:
    """Return True for a MIMIC-CXR patient folder name: 'p' + 8 digits."""
    return len(name) == 9 and name.startswith('p') and name[1:].isdigit()


def _is_study_folder(name: str) -> bool:
    """Return True for a MIMIC-CXR study folder name: 's' + digits."""
    return name.startswith('s') and name[1:].isdigit()


def extract_metadata_from_path(dcm_path: Path) -> Dict:
    """
    Extract patient, study, and image IDs from MIMIC-CXR file path.

    Path format: .../pXX/pXXXXXXXX/sXXXXXXXX/IMAGE_ID.dcm

    The patient folder must be 'p' followed by 8 digits. A bare
    "9 characters starting with p" test would also accept unrelated parent
    directories such as ``processed`` or ``physionet`` and mis-label every
    image below them.

    Args:
        dcm_path: Path to DICOM file

    Returns:
        Dictionary with subject_id, study_id, image_id and file_path.
        subject_id / study_id are 'unknown' when the corresponding folder
        cannot be identified.
    """
    parts = dcm_path.parts

    patient_folder = None
    study_folder = None

    for i, part in enumerate(parts):
        if _is_patient_folder(part):
            patient_folder = part
            # Study folder is next level down
            if i + 1 < len(parts) and _is_study_folder(parts[i + 1]):
                study_folder = parts[i + 1]
            break

    return {
        'subject_id': patient_folder if patient_folder else 'unknown',
        'study_id': study_folder if study_folder else 'unknown',
        'image_id': dcm_path.stem,  # filename without .dcm
        'file_path': str(dcm_path)
    }


def load_dicom_metadata(dcm_path: Path) -> Optional[Dict]:
    """
    Load DICOM metadata including view position.

    Only the header is read (``stop_before_pixels=True``); pixel data is
    never loaded.

    Args:
        dcm_path: Path to DICOM file

    Returns:
        Dictionary with DICOM metadata, or None if unable to read
        (including when pydicom is not installed)
    """
    try:
        import pydicom
        dcm = pydicom.dcmread(str(dcm_path), stop_before_pixels=True)

        return {
            # A present-but-empty ViewPosition is treated like a missing one
            # so the stored value and the embedding text stay meaningful.
            'view_position': getattr(dcm, 'ViewPosition', None) or 'UNKNOWN',
            'modality': getattr(dcm, 'Modality', 'CR'),
            'patient_id': getattr(dcm, 'PatientID', 'unknown'),
            'study_date': getattr(dcm, 'StudyDate', 'unknown'),
            'series_description': getattr(dcm, 'SeriesDescription', '')
        }
    except Exception as e:
        print(f"Warning: Could not read DICOM metadata from {dcm_path}: {e}", file=sys.stderr)
        return None


def _image_exists(cursor, image_id: str) -> bool:
    """Return True if a row with this ImageID is already in the table."""
    cursor.execute(
        """
        SELECT ImageID FROM VectorSearch.MIMICCXRImages
        WHERE ImageID = ?
        """,
        (image_id,),
    )
    return cursor.fetchone() is not None


def ingest_image(cursor, conn, dcm_path: Path, embedder, dry_run: bool = False,
                 *, check_exists: bool = True) -> bool:
    """
    Ingest a single DICOM image into IRIS.

    The embedding is computed from the text "Chest X-ray <view> view" via
    ``embedder.embed_text``; pixel data is not used. When there is no
    embedder, or embedding fails, a zero vector of length EMBEDDING_DIM is
    stored and the row's metadata is labelled ``'mock'``.

    Each successful insert is committed immediately; a failed insert is
    rolled back.

    Args:
        cursor: IRIS database cursor
        conn: IRIS database connection
        dcm_path: Path to DICOM file
        embedder: NV-CLIP embedder (or None for mock)
        dry_run: If True, don't actually insert
        check_exists: If True (default), skip images whose ImageID is
            already present. Pass False when the caller has already done
            that check, to avoid a second query.

    Returns:
        True if successful (or already present / dry run), False otherwise
    """
    # Extract path-based metadata
    path_meta = extract_metadata_from_path(dcm_path)
    image_id = path_meta['image_id']

    if check_exists and _image_exists(cursor, image_id):
        return True  # Already exists, skip

    # Load DICOM metadata
    dicom_meta = load_dicom_metadata(dcm_path)
    view_position = dicom_meta['view_position'] if dicom_meta else 'UNKNOWN'

    # Generate embedding; remember where it really came from so the stored
    # metadata never claims 'nvclip' for a placeholder vector.
    embedding = None
    if embedder:
        try:
            # Use view position as text for embedding
            text = f"Chest X-ray {view_position} view"
            embedding = embedder.embed_text(text)
        except Exception as e:
            print(f"Warning: Embedding failed for {image_id}: {e}", file=sys.stderr)

    if embedding is None:
        embedding = [0.0] * EMBEDDING_DIM
        embedding_source = 'mock'
    else:
        embedding_source = 'nvclip'

    # Prepare metadata JSON
    metadata = {
        'path_metadata': path_meta,
        'dicom_metadata': dicom_meta if dicom_meta else {},
        'embedding_source': embedding_source
    }
    metadata_str = json.dumps(metadata)

    if dry_run:
        print(f"[DRY RUN] Would insert: {image_id} ({view_position})")
        return True

    # Insert into database
    try:
        embedding_str = ','.join(map(str, embedding))

        cursor.execute(
            """
            INSERT INTO VectorSearch.MIMICCXRImages
            (ImageID, StudyID, SubjectID, ViewPosition, ImagePath, Vector, Metadata)
            VALUES (?, ?, ?, ?, ?, TO_VECTOR(?, DOUBLE), ?)
            """,
            (
                image_id,
                path_meta['study_id'],
                path_meta['subject_id'],
                view_position,
                path_meta['file_path'],
                embedding_str,
                metadata_str
            ),
        )

        conn.commit()
        return True

    except Exception as e:
        print(f"Error inserting {image_id}: {e}", file=sys.stderr)
        conn.rollback()
        return False


def ingest_mimic_images(
    mimic_path: str,
    limit: Optional[int] = None,
    dry_run: bool = False,
    batch_size: int = 100
):
    """
    Ingest MIMIC-CXR images into IRIS.

    Args:
        mimic_path: Path to MIMIC-CXR files directory
        limit: Optional limit on number of images
        dry_run: If True, don't actually insert
        batch_size: Commit after this many files (must be >= 1). Note that
            ingest_image already commits each insert, so this is an extra
            safety commit.

    Raises:
        ValueError: If batch_size is less than 1
        FileNotFoundError: If mimic_path does not exist
    """
    # Reject this up front: inside the loop, ``i % 0`` would be swallowed by
    # the per-file error handler and miscounted as an ingestion error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print("="*60)
    print("MIMIC-CXR Image Ingestion")
    print("="*60)
    print(f"MIMIC path: {mimic_path}")
    print(f"Limit: {limit if limit else 'None (all files)'}")
    print(f"Dry run: {dry_run}")
    print()

    # Initialize embedder
    embedder = get_embedder()
    if embedder:
        print("✅ NV-CLIP embedder initialized")
    else:
        print("⚠️ Using mock embeddings (NV-CLIP not available)")
    print()

    # Find DICOM files
    dicom_files = find_dicom_files(mimic_path, limit)
    if not dicom_files:
        print("❌ No DICOM files found")
        return

    total = len(dicom_files)

    # Connect to database
    conn = get_connection()
    cursor = conn.cursor()

    try:
        # Process files
        print(f"Loading {total} images into database...")
        print()

        success_count = 0
        skip_count = 0
        error_count = 0
        processed = 0  # files actually handled (differs from total on Ctrl-C)
        start_time = time.time()

        for i, dcm_path in enumerate(dicom_files, 1):
            try:
                image_id = extract_metadata_from_path(dcm_path)['image_id']

                if _image_exists(cursor, image_id):
                    skip_count += 1
                elif ingest_image(cursor, conn, dcm_path, embedder, dry_run,
                                  check_exists=False):
                    # Existence was checked just above, so skip the re-query.
                    success_count += 1
                else:
                    error_count += 1

                # Batch commit
                if not dry_run and i % batch_size == 0:
                    conn.commit()

            except KeyboardInterrupt:
                print("\n\n⚠️ Interrupted by user")
                break
            except Exception as e:
                print(f"Error processing {dcm_path}: {e}", file=sys.stderr)
                error_count += 1

            processed = i

            # Progress update (same format whether the file was skipped or not)
            if i % PROGRESS_INTERVAL == 0 or i == total:
                elapsed = time.time() - start_time
                rate = i / elapsed if elapsed > 0 else 0
                print(f" {i}/{total}: Skipped {skip_count}, Added {success_count}, Errors {error_count} ({rate:.1f} img/sec)")

        # Final commit
        if not dry_run:
            conn.commit()

        # Summary
        elapsed = time.time() - start_time
        print()
        print("="*60)
        print("Ingestion Complete!")
        print("="*60)
        print(f"Time elapsed: {elapsed:.1f} seconds")
        print(f"Images processed: {processed}")
        print(f" - Skipped (already exist): {skip_count}")
        print(f" - Successfully added: {success_count}")
        print(f" - Errors: {error_count}")
        print()

        # Verify
        cursor.execute("SELECT COUNT(*) FROM VectorSearch.MIMICCXRImages")
        total_count = cursor.fetchone()[0]
        print(f"Total images in database: {total_count}")

    finally:
        cursor.close()
        conn.close()


def main():
    """Parse command-line arguments and run the ingestion."""
    import argparse

    parser = argparse.ArgumentParser(description='Ingest MIMIC-CXR images into IRIS')
    parser.add_argument('mimic_path', help='Path to MIMIC-CXR files directory')
    parser.add_argument('--limit', type=int, help='Limit number of images to ingest')
    parser.add_argument('--dry-run', action='store_true', help="Dry run (don't actually insert)")
    parser.add_argument('--batch-size', type=int, default=100, help='Commit batch size')

    args = parser.parse_args()

    ingest_mimic_images(
        mimic_path=args.mimic_path,
        limit=args.limit,
        dry_run=args.dry_run,
        batch_size=args.batch_size
    )


if __name__ == '__main__':
    main()

Latest Blog Posts

Model Context Protocol Proxies: Enabling Enterprise Control with Virtual MCPs
By Om-Shree-0709 on December 9, 2025.
AI Security
Virtual MCP
Kubernetes Operator
The State of MCP in 2025: Who's Building What and Why It Matters
By punkpeye on December 7, 2025.
mcp
startups
MCP hosting with persistent storage
By punkpeye on December 6, 2025.
changelog

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/isc-tdyar/medical-graphrag-assistant'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.