Skip to main content
Glama
orneryd

M.I.M.I.R - Multi-agent Intelligent Memory & Insight Repository

by orneryd
harness_test.go (9.99 kB)
package eval

import (
	"bytes"
	"context"
	"testing"

	"github.com/orneryd/nornicdb/pkg/search"
	"github.com/orneryd/nornicdb/pkg/storage"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// =============================================================================
// Metric Calculation Tests
// =============================================================================

// TestPrecision exercises precision at several cutoffs against a fixed set of
// three relevant IDs, including the empty-result case.
func TestPrecision(t *testing.T) {
	expected := map[string]bool{"a": true, "b": true, "c": true}

	t.Run("perfect_precision", func(t *testing.T) {
		returned := []string{"a", "b", "c", "d", "e"}
		p := precision(returned, expected, 3)
		assert.Equal(t, 1.0, p) // 3/3 relevant in top 3
	})

	t.Run("partial_precision", func(t *testing.T) {
		returned := []string{"a", "x", "b", "y", "c"}
		p := precision(returned, expected, 5)
		// 0.6 is not exactly representable in binary floating point, so compare
		// with a tolerance instead of exact equality.
		assert.InDelta(t, 0.6, p, 1e-9) // 3/5 relevant in top 5
	})

	t.Run("zero_precision", func(t *testing.T) {
		returned := []string{"x", "y", "z"}
		p := precision(returned, expected, 3)
		assert.Equal(t, 0.0, p)
	})

	t.Run("empty_returned", func(t *testing.T) {
		p := precision([]string{}, expected, 3)
		assert.Equal(t, 0.0, p)
	})
}

// TestRecall exercises recall against a fixed set of four relevant IDs.
func TestRecall(t *testing.T) {
	expected := map[string]bool{"a": true, "b": true, "c": true, "d": true}

	t.Run("perfect_recall", func(t *testing.T) {
		returned := []string{"a", "b", "c", "d", "x", "y"}
		r := recall(returned, expected, 10)
		assert.Equal(t, 1.0, r) // 4/4 found
	})

	t.Run("partial_recall", func(t *testing.T) {
		returned := []string{"a", "b", "x", "y", "z"}
		r := recall(returned, expected, 5)
		assert.Equal(t, 0.5, r) // 2/4 found
	})

	t.Run("zero_recall", func(t *testing.T) {
		returned := []string{"x", "y", "z"}
		r := recall(returned, expected, 3)
		assert.Equal(t, 0.0, r)
	})
}

// TestMRR checks the reciprocal rank for the first relevant result appearing
// at positions 1, 2 and 3, and when no relevant result is returned.
func TestMRR(t *testing.T) {
	expected := map[string]bool{"a": true, "b": true}

	t.Run("first_position", func(t *testing.T) {
		returned := []string{"a", "x", "y"}
		m := mrr(returned, expected)
		assert.Equal(t, 1.0, m) // 1/1
	})

	t.Run("second_position", func(t *testing.T) {
		returned := []string{"x", "a", "y"}
		m := mrr(returned, expected)
		assert.Equal(t, 0.5, m) // 1/2
	})

	t.Run("third_position", func(t *testing.T) {
		returned := []string{"x", "y", "b"}
		m := mrr(returned, expected)
		assert.InDelta(t, 0.333, m, 0.01) // 1/3
	})

	t.Run("not_found", func(t *testing.T) {
		returned := []string{"x", "y", "z"}
		m := mrr(returned, expected)
		assert.Equal(t, 0.0, m)
	})
}

// TestNDCG checks NDCG with graded relevance for an ideal ordering, a reversed
// ordering, and a result list with no graded documents.
func TestNDCG(t *testing.T) {
	// Graded relevance: 3=highly relevant, 2=relevant, 1=marginal
	grades := map[string]int{
		"best": 3,
		"good": 2,
		"ok":   1,
	}

	t.Run("perfect_ranking", func(t *testing.T) {
		// Best possible order
		returned := []string{"best", "good", "ok"}
		n := ndcg(returned, grades, 3)
		assert.InDelta(t, 1.0, n, 0.01) // Perfect ranking = 1.0
	})

	t.Run("worst_ranking", func(t *testing.T) {
		// Worst order (but still has relevant docs)
		returned := []string{"ok", "good", "best"}
		n := ndcg(returned, grades, 3)
		assert.Less(t, n, 1.0)    // Not perfect
		assert.Greater(t, n, 0.0) // Still has relevant docs
	})

	t.Run("no_relevant_docs", func(t *testing.T) {
		returned := []string{"x", "y", "z"}
		n := ndcg(returned, grades, 3)
		assert.Equal(t, 0.0, n)
	})
}

// TestAveragePrecision checks average precision for an ideal ranking and for
// a ranking where relevant and irrelevant results alternate.
func TestAveragePrecision(t *testing.T) {
	expected := map[string]bool{"a": true, "b": true, "c": true}

	t.Run("perfect_ranking", func(t *testing.T) {
		returned := []string{"a", "b", "c", "x", "y"}
		ap := averagePrecision(returned, expected)
		// AP = (1/1 + 2/2 + 3/3) / 3 = (1 + 1 + 1) / 3 = 1.0
		assert.Equal(t, 1.0, ap)
	})

	t.Run("interleaved_ranking", func(t *testing.T) {
		returned := []string{"a", "x", "b", "y", "c"}
		ap := averagePrecision(returned, expected)
		// AP = (1/1 + 2/3 + 3/5) / 3 = (1 + 0.667 + 0.6) / 3 ≈ 0.756
		assert.InDelta(t, 0.756, ap, 0.01)
	})
}

// TestHitRate checks that hitRate is 1.0 when any relevant ID is returned and
// 0.0 when none is.
func TestHitRate(t *testing.T) {
	expected := map[string]bool{"a": true, "b": true}

	t.Run("has_hit", func(t *testing.T) {
		returned := []string{"x", "y", "a"}
		hr := hitRate(returned, expected)
		assert.Equal(t, 1.0, hr)
	})

	t.Run("no_hit", func(t *testing.T) {
		returned := []string{"x", "y", "z"}
		hr := hitRate(returned, expected)
		assert.Equal(t, 0.0, hr)
	})
}

// =============================================================================
// Harness Integration Tests
// =============================================================================

// TestHarnessBasic indexes ten documents, registers a single test case and
// runs the harness once. The two subtests share the harness, so
// "run_evaluation" relies on "add_test_case" having run first.
func TestHarnessBasic(t *testing.T) {
	// Create a search service with test data
	store := storage.NewMemoryEngine()
	service := search.NewService(store)

	// Add test nodes
	for i := 0; i < 10; i++ {
		node := &storage.Node{
			ID:     storage.NodeID(string(rune('a' + i))),
			Labels: []string{"Doc"},
			Properties: map[string]interface{}{
				"title":   "Document " + string(rune('a'+i)),
				"content": "Test content for search",
			},
		}
		// NOTE(review): any results returned by CreateNode/IndexNode are
		// discarded here; if they return errors, check them with require.NoError.
		store.CreateNode(node)
		service.IndexNode(node)
	}

	harness := NewHarness(service)

	t.Run("add_test_case", func(t *testing.T) {
		harness.AddTestCase(TestCase{
			Name:     "find docs",
			Query:    "test content",
			Expected: []string{"a", "b", "c"},
		})
	})

	t.Run("run_evaluation", func(t *testing.T) {
		ctx := context.Background()
		result, err := harness.Run(ctx)
		require.NoError(t, err)
		assert.Equal(t, 1, result.TotalTests)
		// NOTE(review): Aggregate looks like a Metrics struct value (see the
		// reporter tests), in which case NotNil cannot fail — consider
		// asserting on a specific metric instead.
		assert.NotNil(t, result.Aggregate)
	})
}

// TestHarnessMultipleTests runs two test cases over a small mixed corpus and
// checks that both are counted and reported.
func TestHarnessMultipleTests(t *testing.T) {
	store := storage.NewMemoryEngine()
	service := search.NewService(store)

	// Add diverse test nodes
	nodes := []struct {
		id      string
		title   string
		content string
	}{
		{"ml1", "Machine Learning", "neural networks and deep learning"},
		{"ml2", "ML Algorithms", "supervised and unsupervised learning"},
		{"db1", "Database Design", "SQL and NoSQL databases"},
		{"db2", "Graph Databases", "nodes and relationships"},
	}

	for _, n := range nodes {
		node := &storage.Node{
			ID:     storage.NodeID(n.id),
			Labels: []string{"Doc"},
			Properties: map[string]interface{}{
				"title":   n.title,
				"content": n.content,
			},
		}
		// NOTE(review): results of CreateNode/IndexNode are discarded; confirm
		// whether they return errors worth checking.
		store.CreateNode(node)
		service.IndexNode(node)
	}

	harness := NewHarness(service)

	// Add test cases
	harness.AddTestCases([]TestCase{
		{
			Name:     "ML search",
			Query:    "machine learning neural",
			Expected: []string{"ml1", "ml2"},
		},
		{
			Name:     "Database search",
			Query:    "database SQL",
			Expected: []string{"db1", "db2"},
		},
	})

	ctx := context.Background()
	result, err := harness.Run(ctx)
	require.NoError(t, err)
	assert.Equal(t, 2, result.TotalTests)
	assert.Len(t, result.Results, 2)
}

// TestHarnessNoTestCases checks that running a harness with no registered
// test cases returns an error mentioning "no test cases".
func TestHarnessNoTestCases(t *testing.T) {
	store := storage.NewMemoryEngine()
	service := search.NewService(store)
	harness := NewHarness(service)

	ctx := context.Background()
	_, err := harness.Run(ctx)
	// require (not assert): err.Error() below would panic on a nil error, so
	// the test must stop here if no error was returned.
	require.Error(t, err)
	assert.Contains(t, err.Error(), "no test cases")
}

// TestThresholds pins the values returned by DefaultThresholds.
func TestThresholds(t *testing.T) {
	defaults := DefaultThresholds()
	assert.Equal(t, 0.5, defaults.Precision10)
	assert.Equal(t, 0.3, defaults.Recall10)
	assert.Equal(t, 0.5, defaults.MRR)
	assert.Equal(t, 0.8, defaults.HitRate)
}

// =============================================================================
// Reporter Tests
// =============================================================================

// TestReporterPrintCompact checks that the compact report written for a
// result with failures contains the status, pass ratio and P@10 value.
func TestReporterPrintCompact(t *testing.T) {
	var buf bytes.Buffer
	reporter := NewReporter(&buf)

	result := &EvalResult{
		TotalTests:  10,
		PassedTests: 8,
		FailedTests: 2,
		Aggregate: Metrics{
			Precision10: 0.75,
			Recall10:    0.60,
			MRR:         0.85,
			NDCG10:      0.70,
			HitRate:     0.90,
		},
	}

	reporter.PrintCompact(result)

	output := buf.String()
	assert.Contains(t, output, "FAIL")
	assert.Contains(t, output, "8/10")
	assert.Contains(t, output, "P@10=0.75")
}

// TestReporterPrintSummary checks that the summary report contains the suite
// name, the pass ratio and a "Precision" entry.
func TestReporterPrintSummary(t *testing.T) {
	var buf bytes.Buffer
	reporter := NewReporter(&buf)

	result := &EvalResult{
		SuiteName:   "test-suite",
		TotalTests:  5,
		PassedTests: 5,
		FailedTests: 0,
		Aggregate: Metrics{
			Precision10: 0.80,
			Recall10:    0.70,
			MRR:         0.90,
		},
		Thresholds: DefaultThresholds(),
	}

	reporter.PrintSummary(result)

	output := buf.String()
	assert.Contains(t, output, "test-suite")
	assert.Contains(t, output, "5/5")
	assert.Contains(t, output, "Precision")
}

// =============================================================================
// Test Suite JSON Loading
// =============================================================================

// TestLoadSuiteJSON currently only builds a TestSuite value in memory and
// checks its fields; it does not read or parse any JSON.
func TestLoadSuiteJSON(t *testing.T) {
	// This would test loading from a file
	// For now, just test the structure
	suite := TestSuite{
		Name:        "sample-suite",
		Description: "A sample test suite",
		Version:     "1.0",
		TestCases: []TestCase{
			{
				Name:     "test1",
				Query:    "sample query",
				Expected: []string{"doc1", "doc2"},
			},
		},
	}

	assert.Equal(t, "sample-suite", suite.Name)
	assert.Len(t, suite.TestCases, 1)
}

// =============================================================================
// Graded Relevance Tests
// =============================================================================

// TestGradedRelevance runs a single test case that supplies RelevanceGrades
// and checks that the reported NDCG@10 is a valid normalized score.
func TestGradedRelevance(t *testing.T) {
	store := storage.NewMemoryEngine()
	service := search.NewService(store)

	// Add nodes
	for _, id := range []string{"best", "good", "ok", "bad"} {
		node := &storage.Node{
			ID:     storage.NodeID(id),
			Labels: []string{"Doc"},
			Properties: map[string]interface{}{
				"title": id,
			},
		}
		// NOTE(review): results of CreateNode/IndexNode are discarded; confirm
		// whether they return errors worth checking.
		store.CreateNode(node)
		service.IndexNode(node)
	}

	harness := NewHarness(service)
	harness.AddTestCase(TestCase{
		Name:     "graded test",
		Query:    "test",
		Expected: []string{"best", "good", "ok"}, // Required for binary metrics
		RelevanceGrades: map[string]int{
			"best": 3,
			"good": 2,
			"ok":   1,
			"bad":  0,
		},
	})

	ctx := context.Background()
	result, err := harness.Run(ctx)
	require.NoError(t, err)

	// Guard the index below so an empty result set fails the test instead of
	// panicking.
	require.NotEmpty(t, result.Results)

	// NDCG10 is a numeric field, so a NotNil check on it can never fail.
	// Assert instead that it is a valid normalized score; the comparison also
	// rejects NaN. The small slack above 1 absorbs floating-point rounding.
	ndcg10 := result.Results[0].Metrics.NDCG10
	assert.True(t, ndcg10 >= 0 && ndcg10 <= 1.000001,
		"NDCG@10 = %v, want a value in [0, 1]", ndcg10)
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/orneryd/Mimir'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.