Skip to main content
Glama
tutorial2_vector_db.py3.53 kB
#!/usr/bin/env python3
"""
Tutorial 2: Creating a Vector Database

Following the steps from 2-Creating-Vector-DB.ipynb: fetch clinical notes
from IRIS, hex-decode them, embed them with a sentence transformer, and
prepare a vector table for insertion.
"""
import sys

# Make the tutorial's Utils package importable from its absolute location.
sys.path.insert(0, '/Users/tdyar/ws/FHIR-AI-Hackathon-Kit/Tutorial')

from Utils.get_iris_connection import get_cursor
import pandas as pd
import time
from sentence_transformers import SentenceTransformer

print("=" * 60)
print("Tutorial 2: Creating a Vector Database")
print("=" * 60)

# Step 1: Get cursor and fetch data
print("\n[Step 1] Connecting to IRIS and fetching data...")
cursor = get_cursor()
sql = """SELECT DocumentReferenceContentAttachmentData, DocumentReferenceSubjectReference
FROM VectorSearchApp.DocumentReference"""
cursor.execute(sql)
out = cursor.fetchall()
cols = ["ClinicalNotes", "Patient"]
df = pd.DataFrame(out, columns=cols)
# BUG FIX: .str.strip("Patient/") treats the argument as a character SET and
# strips any of {P,a,t,i,e,n,/} from BOTH ends of the string — not the literal
# prefix. removeprefix() drops exactly the leading "Patient/" and nothing else.
df["PatientID"] = pd.to_numeric(df["Patient"].astype(str).str.removeprefix("Patient/"))
print(f"✅ Fetched {len(df)} clinical notes from {len(df['PatientID'].unique())} patients")

# Step 2: Decode clinical notes (stored hex-encoded in the attachment column)
print("\n[Step 2] Decoding clinical notes from hex to plain text...")
df["NotesDecoded"] = df["ClinicalNotes"].apply(
    lambda x: bytes.fromhex(x).decode("utf-8", errors="replace")
)
print("✅ Clinical notes decoded")
print("\nExample note (first 200 chars):")
print(df["NotesDecoded"][0][:200] + "...")

# Step 3: Generate embeddings
print("\n[Step 3] Loading sentence transformer model and generating embeddings...")
print("(This may take a moment...)")
model = SentenceTransformer('all-MiniLM-L6-v2')  # produces 384-dim vectors
# Normalized embeddings let cosine similarity be computed as a plain dot product.
embeddings = model.encode(df['NotesDecoded'].tolist(), normalize_embeddings=True)
df['Notes_Vector'] = embeddings.tolist()
print(f"✅ Generated {len(embeddings)} embeddings with {len(embeddings[0])} dimensions")

# Step 4: Create table
print("\n[Step 4] Creating vector table in IRIS...")
table_name = "VectorSearch.DocRefVectors"
# Drop table if it exists; on a first run the DROP raises because the table
# is absent, which is expected and safe to ignore.
try:
    cursor.execute(f"DROP TABLE {table_name}")
    print(f" Dropped existing table {table_name}")
except Exception:  # BUG FIX: bare `except:` also swallowed KeyboardInterrupt/SystemExit
    print(f" No existing table to drop")
# Step 4 (cont.): create the vector table schema — one 384-dim DOUBLE vector
# column to match the all-MiniLM-L6-v2 embedding size.
ddl = f"""
CREATE TABLE {table_name} (
    PatientID INTEGER,
    ClinicalNotes LONGVARCHAR,
    NotesVector VECTOR(DOUBLE, 384)
)
"""
cursor.execute(ddl)
print(f"✅ Created table {table_name}")

# Step 5: Insert data (using the faster executemany method)
print("\n[Step 5] Inserting data into vector table...")
insert_stmt = (
    f"INSERT INTO {table_name} (PatientID, ClinicalNotes, NotesVector) "
    "VALUES (?, ?, TO_VECTOR(?))"
)
started = time.time()
# Serialize each vector to its Python-list string form; TO_VECTOR parses it
# back into an IRIS vector on the server side.
df["Notes_Vector_str"] = df["Notes_Vector"].astype(str)
batch = df[["PatientID", "NotesDecoded", "Notes_Vector_str"]].values.tolist()
cursor.executemany(insert_stmt, batch)
took = time.time() - started
print(f"✅ Inserted {len(batch)} rows in {took:.3f} seconds")

# Step 6: Verify data
print("\n[Step 6] Verifying data in table...")
cursor.execute(f"SELECT TOP 3 * FROM {table_name}")
sample = pd.DataFrame(
    cursor.fetchall(),
    columns=["PatientID", "NotesDecoded", "Notes_Vector"],
)
print(f"✅ Table contains data. Sample rows:")
print(sample[["PatientID"]].head())
print(f"\nFirst vector (first 100 chars): {sample['Notes_Vector'][0][:100]}...")

print("\n" + "=" * 60)
print("✅ Tutorial 2 Complete!")
print("=" * 60)
print(f"\nVector database created: {table_name}")
print(f"- {len(df)} clinical notes vectorized")
print(f"- {len(df['PatientID'].unique())} unique patients")
print(f"- 384-dimensional vectors")
print("\nReady for Tutorial 3: Vector Search & LLM Prompting!")

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/isc-tdyar/medical-graphrag-assistant'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.