#!/bin/bash
# test-embedding-comparison.sh
#
# Indexes a small TypeScript fixture with the ONNX embedding provider and,
# when the model is available, with Ollama's nomic-embed-code, then runs the
# same semantic-search queries against each provider.
echo "🧠 CodeGraph Embedding Provider Comparison Test"
echo "ONNX (Speed) vs Ollama nomic-embed-code (Code-Specialized)"
# Shell has no string-repeat operator: an unquoted `*` is a glob and would
# print the names of the files in the current directory. Build the 60-column
# rule by padding an empty string to width 60 and swapping spaces for '='.
printf -v separator '%*s' 60 ''
echo "${separator// /=}"
# Check if nomic-embed-code is available
# Leaves OLLAMA_AVAILABLE set to "true" when `ollama list` already mentions the
# model or a fresh `ollama pull` succeeds, and to "false" when the pull fails.
if ollama list | grep -q "nomic-embed-code"; then
    echo "✅ nomic-embed-code already available"
    OLLAMA_AVAILABLE=true
else
    echo "📦 Installing nomic-embed-code for comparison..."
    # Branch directly on the pull's exit status.
    if ollama pull hf.co/nomic-ai/nomic-embed-code-GGUF:Q4_K_M; then
        echo "✅ nomic-embed-code installed"
        OLLAMA_AVAILABLE=true
    else
        echo "❌ Failed to install nomic-embed-code"
        echo "Proceeding with ONNX-only test..."
        OLLAMA_AVAILABLE=false
    fi
fi
# Build with both embedding providers
# Compiles the codegraph-mcp package with the ONNX and Ollama embedding
# features enabled; the script stops with status 1 if the build fails.
printf '\n'
echo "🔧 Building CodeGraph with both embedding providers..."
if ! MACOSX_DEPLOYMENT_TARGET=11.0 cargo build -p codegraph-mcp \
        --features "qwen-integration,faiss,embeddings,embeddings-ollama,codegraph-vector/onnx"; then
    echo "❌ Build failed"
    exit 1
fi
echo "✅ Build successful with both providers"
# Test directory setup
# Create a timestamp-named scratch directory under the current directory and
# make it the working directory for the rest of the run.
TEST_DIR="embedding-test-$(date +%s)"
if ! mkdir -p "$TEST_DIR"; then
    echo "❌ Failed to create test directory: $TEST_DIR" >&2
    exit 1
fi
# Stop here if the directory cannot be entered: carrying on would write the
# fixture files into the caller's directory and run the later `cd ..` and
# `rm -rf` from the wrong location.
if ! cd "$TEST_DIR"; then
    echo "❌ Failed to enter test directory: $TEST_DIR" >&2
    exit 1
fi
# Create test files with different code patterns
# Fixture 1 of 3: AuthService.ts declares a User interface and an AuthService
# class that keeps users in an in-memory Map and offers credential validation
# and user creation.
# The quoted 'EOF' delimiter makes the heredoc literal, so the TypeScript
# template string `hashed_${password}` is written to the file unexpanded.
echo "📝 Creating test codebase..."
cat > AuthService.ts << 'EOF'
export interface User {
    id: string;
    email: string;
    password: string;
}
export class AuthService {
    private users: Map<string, User> = new Map();
    async validateCredentials(email: string, password: string): Promise<User | null> {
        const user = Array.from(this.users.values()).find(u => u.email === email);
        if (!user) {
            return null;
        }
        const isValid = await this.comparePassword(password, user.password);
        return isValid ? user : null;
    }
    private async comparePassword(plaintext: string, hashed: string): Promise<boolean> {
        // In real implementation, use bcrypt or similar
        return plaintext === hashed;
    }
    async createUser(email: string, password: string): Promise<User> {
        const user: User = {
            id: crypto.randomUUID(),
            email,
            password: await this.hashPassword(password)
        };
        this.users.set(user.id, user);
        return user;
    }
    private async hashPassword(password: string): Promise<string> {
        // In real implementation, use bcrypt
        return `hashed_${password}`;
    }
}
EOF
# Fixture 2 of 3: DatabaseService.ts declares a ConnectionConfig interface and
# a DatabaseService class with connect() and query(); its createConnection()
# is a mock whose query always resolves to an empty `rows` array.
# The quoted 'EOF' delimiter keeps `${config.database}` literal in the file.
cat > DatabaseService.ts << 'EOF'
export interface ConnectionConfig {
    host: string;
    port: number;
    database: string;
    username: string;
    password: string;
}
export class DatabaseService {
    private connection: any = null;
    async connect(config: ConnectionConfig): Promise<void> {
        try {
            this.connection = await this.createConnection(config);
            console.log(`Connected to database: ${config.database}`);
        } catch (error) {
            console.error('Database connection failed:', error);
            throw error;
        }
    }
    async query<T>(sql: string, params: any[] = []): Promise<T[]> {
        if (!this.connection) {
            throw new Error('Database not connected');
        }
        try {
            const result = await this.connection.query(sql, params);
            return result.rows;
        } catch (error) {
            console.error('Query failed:', error);
            throw error;
        }
    }
    private async createConnection(config: ConnectionConfig) {
        // Mock connection creation
        return {
            query: async (sql: string, params: any[]) => ({
                rows: []
            })
        };
    }
}
EOF
# Fixture 3 of 3: APIController.ts imports the two services written above and
# declares an APIResponse<T> interface plus an APIController class with
# handleLogin(), handleUserCreation() and a mock generateJWT().
# The quoted 'EOF' delimiter keeps `${user.id}` literal in the file.
cat > APIController.ts << 'EOF'
import { AuthService } from './AuthService';
import { DatabaseService } from './DatabaseService';
export interface APIResponse<T> {
    success: boolean;
    data?: T;
    error?: string;
}
export class APIController {
    constructor(
        private authService: AuthService,
        private dbService: DatabaseService
    ) {}
    async handleLogin(email: string, password: string): Promise<APIResponse<{token: string}>> {
        try {
            const user = await this.authService.validateCredentials(email, password);
            if (!user) {
                return {
                    success: false,
                    error: 'Invalid credentials'
                };
            }
            const token = this.generateJWT(user);
            return {
                success: true,
                data: { token }
            };
        } catch (error) {
            return {
                success: false,
                error: 'Internal server error'
            };
        }
    }
    async handleUserCreation(email: string, password: string): Promise<APIResponse<{user: any}>> {
        try {
            const existingUser = await this.dbService.query(
                'SELECT id FROM users WHERE email = ?',
                [email]
            );
            if (existingUser.length > 0) {
                return {
                    success: false,
                    error: 'User already exists'
                };
            }
            const user = await this.authService.createUser(email, password);
            return {
                success: true,
                data: { user }
            };
        } catch (error) {
            return {
                success: false,
                error: 'User creation failed'
            };
        }
    }
    private generateJWT(user: any): string {
        // Mock JWT generation
        return `jwt_token_for_${user.id}`;
    }
}
EOF
# Report that all three fixture files have been written.
echo "✅ Test codebase created (3 TypeScript files with authentication patterns)"
# Initialize CodeGraph
# Runs `codegraph init` on the current directory using the debug binary one
# level up, then prints the heading for the provider comparison section.
printf '\n%s\n' "🚀 Initializing CodeGraph..."
../target/debug/codegraph init .
printf '\n%s\n\n' "📊 Testing Embedding Provider Performance Comparison"
# Test 1: ONNX Embeddings (Speed optimized)
# Selects the ONNX provider, points it at a local all-MiniLM-L6-v2 ONNX
# snapshot, and times a forced re-index of the TypeScript fixtures.
echo "🔥 Test 1: ONNX Embeddings (Speed Optimized)"
echo "Provider: ONNX Runtime with optimized models"
echo "Expected: Fast indexing, good general embeddings"
export CODEGRAPH_EMBEDDING_PROVIDER=onnx
# Use the caller's CODEGRAPH_LOCAL_MODEL when it is set; otherwise default to
# the snapshot under the current user's home directory instead of a path tied
# to one specific account.
export CODEGRAPH_LOCAL_MODEL="${CODEGRAPH_LOCAL_MODEL:-${HOME}/.cache/huggingface/hub/models--Qdrant--all-MiniLM-L6-v2-onnx/snapshots/5f1b8cd78bc4fb444dd171e59b18f3a3af89a079}"
echo "Starting ONNX embedding test..."
# grep exits non-zero when no line matches the filter; `|| true` keeps that
# from being treated as a failure of this step.
time ../target/debug/codegraph index . --force --languages typescript --verbose 2>&1 | grep -E "(Found|embeddings|complete|ONNX)" || true
echo ""
# Test 2: Ollama Embeddings (Code-specialized)
# Runs only when OLLAMA_AVAILABLE is the string "true"; any other value prints
# the skip notice and leaves the provider variables untouched.
if [ "$OLLAMA_AVAILABLE" != true ]; then
    echo "🚫 Test 2: Skipped (nomic-embed-code not available)"
else
    printf '%s\n' \
        "🧠 Test 2: Ollama nomic-embed-code (Code-Specialized)" \
        "Provider: Ollama with nomic-embed-code" \
        "Expected: Superior code understanding, better semantic search"
    export CODEGRAPH_EMBEDDING_PROVIDER=ollama
    export CODEGRAPH_EMBEDDING_MODEL=hf.co/nomic-ai/nomic-embed-code-GGUF:Q4_K_M
    echo "Starting Ollama embedding test..."
    # The trailing `|| true` absorbs grep's non-zero status when nothing matches.
    time ../target/debug/codegraph index . --force --languages typescript --verbose 2>&1 \
        | grep -E "(Found|embeddings|complete|Ollama|nomic)" \
        || true
fi
echo ""
echo "🔍 Testing Semantic Search Quality"
# Test semantic search with both providers
TEST_QUERIES=("authentication pattern" "database connection" "error handling" "user validation" "API endpoint")

# Send one `vector.search` JSON-RPC request to `codegraph start stdio` and
# print up to three result names, or "No results" when nothing comes back.
# Globals:   CODEGRAPH_EMBEDDING_PROVIDER (exported, set to $1)
# Arguments: $1 - embedding provider name, $2 - search query text
# Outputs:   result names (or "No results") on stdout
run_semantic_search() {
    local provider=$1 query=$2
    local request names
    export CODEGRAPH_EMBEDDING_PROVIDER="$provider"
    # Let jq build the request so the query text is always valid, escaped JSON.
    request=$(jq -cn --arg query "$query" \
        '{jsonrpc: "2.0", id: 1, method: "vector.search", params: {query: $query, limit: 3}}')
    # A pipeline's exit status is that of its last command (head), so a
    # trailing `|| echo` never runs when codegraph or jq fail. Capture the
    # output and fall back on emptiness instead.
    names=$(printf '%s\n' "$request" \
        | ../target/debug/codegraph start stdio 2>/dev/null \
        | jq -r '.result.results[]?.name // "No results"' 2>/dev/null \
        | head -3)
    printf '%s\n' "${names:-No results}"
}

for query in "${TEST_QUERIES[@]}"; do
    echo ""
    echo "Query: '$query'"
    echo "ONNX Results:"
    run_semantic_search onnx "$query"
    if [ "$OLLAMA_AVAILABLE" = true ]; then
        echo "Ollama Results:"
        run_semantic_search ollama "$query"
    fi
done
# Cleanup
# Leave the scratch directory and delete it. The removal is skipped when the
# `cd ..` fails or TEST_DIR is empty, so `rm -rf` never runs from an
# unexpected directory or with an empty operand.
if cd .. && [ -n "${TEST_DIR:-}" ]; then
    rm -rf -- "$TEST_DIR"
else
    echo "Warning: skipped test directory cleanup (cd failed or TEST_DIR is empty)" >&2
fi
# Final report: completion banner, per-provider summary (the Ollama section
# depends on whether OLLAMA_AVAILABLE is "true"), and suggested next steps.
cat <<'REPORT'

🎉 Embedding Provider Comparison Complete!

📊 Summary:
✅ ONNX: Fast, general-purpose embeddings
REPORT
case "$OLLAMA_AVAILABLE" in
    true)
        cat <<'REPORT'
✅ Ollama: Code-specialized embeddings with nomic-embed-code

🚀 Revolutionary Architecture Complete:
  • Code-specialized embeddings (nomic-embed-code)
  • SOTA code analysis (Qwen2.5-Coder-14B-128K)
  • 100% local AI development platform
  • Zero external dependencies
  • Best-in-class code understanding at every level
REPORT
        ;;
    *)
        echo "⚠️ Ollama: Not tested (model not available)"
        ;;
esac
cat <<'REPORT'

🎯 Next Steps:
1. Choose optimal embedding provider for your use case
2. Configure environment variables in Claude Desktop
3. Experience revolutionary local-first AI development
REPORT