# embed_to_redis.py
#!/usr/bin/env python3
import os
import json
import redis
import hashlib
from sentence_transformers import SentenceTransformer
from urllib.parse import urlparse
# Headings that are full documentation URLs are embedded without this prefix.
_URL_PREFIX = 'https://modelcontextprotocol.io/'


def _iter_headings(content):
    """Yield the text of each '#'-prefixed heading line in a Markdown string.

    Lines inside fenced code blocks (``` or ~~~) are skipped, so shell or
    Python comments in code samples are not mistaken for headings. An
    unclosed fence hides everything after it, matching how Markdown renders.
    Headings that are empty after removing the '#' markers are dropped.
    """
    open_fence = None  # fence marker of the code block we are inside, if any
    for raw_line in content.split('\n'):
        stripped = raw_line.strip()
        if open_fence is not None:
            # A closing fence uses the same character, is at least as long
            # as the opening fence, and carries no other text.
            if stripped.startswith(open_fence) and not stripped.strip(open_fence[0]):
                open_fence = None
            continue
        if stripped.startswith(('```', '~~~')):
            fence_char = stripped[0]
            fence_len = len(stripped) - len(stripped.lstrip(fence_char))
            open_fence = fence_char * fence_len
            continue
        if stripped.startswith('#'):
            heading_text = stripped.lstrip('#').strip()
            if heading_text:
                yield heading_text


def _embedding_text(heading_text):
    """Return the string that is embedded for a given heading.

    URL headings under ``_URL_PREFIX`` are reduced to their path with '/'
    turned into spaces; only the leading prefix is removed. If nothing is
    left (the heading is the bare site root), the full heading is used so
    the model never receives an empty string.
    """
    if not heading_text.startswith(_URL_PREFIX):
        return heading_text
    path_words = heading_text[len(_URL_PREFIX):].replace('/', ' ').strip()
    return path_words or heading_text


def main(output_dir='output'):
    """Embed every Markdown heading under *output_dir* and store it in Redis.

    For each ``.md`` file directly inside *output_dir*, every heading is
    encoded with the 'all-MiniLM-L6-v2' sentence-transformer model and
    written to the hash ``doc:<id>`` with the fields ``heading``,
    ``file_path`` and ``embedding`` (a JSON-encoded list of floats). The id
    is the MD5 hex digest of ``"<file_path>_<heading>"``, so re-running the
    script overwrites existing entries instead of duplicating them. Every id
    is also added to the Redis set ``doc_ids``.

    Args:
        output_dir: Directory scanned (non-recursively) for ``.md`` files.

    Raises:
        FileNotFoundError: If *output_dir* does not exist.
    """
    # Connects to a local Redis instance; decode_responses gives str replies.
    r = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Sorted so that processing order and log output are reproducible.
    for filename in sorted(os.listdir(output_dir)):
        if not filename.endswith('.md'):
            continue
        filepath = os.path.join(output_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        # Queue all writes for this file and send them in one round-trip.
        with r.pipeline(transaction=False) as pipe:
            for heading_text in _iter_headings(content):
                embedding = model.encode(_embedding_text(heading_text)).tolist()
                # MD5 is used only as a stable identifier, not for security.
                doc_id = hashlib.md5(
                    f"{filepath}_{heading_text}".encode()
                ).hexdigest()
                pipe.hset(f"doc:{doc_id}", mapping={
                    'heading': heading_text,
                    'file_path': filepath,
                    'embedding': json.dumps(embedding),
                })
                pipe.sadd('doc_ids', doc_id)
            pipe.execute()
        print(f"Processed {filename}: embedded individual headings")

    print(f"Total headings in Redis: {r.scard('doc_ids')}")
# Run the embedding job only when executed as a script, not when imported.
if __name__ == "__main__":
    main()