google-workspace-unlimited

Overview Schema Related Servers Score Discussions

google-workspace-unlimited
tests

test_sanitization_fixes.py•31.9 KiB

""" Comprehensive Test Suite for Data Sanitization Fixes This test suite validates that the sanitization fixes successfully resolve the excessive JSON escaping issues and improve vector search effectiveness. Tests cover: 1. Unit tests for sanitization helper functions 2. Edge case testing (pre-parsed JSON, malformed data, Unicode) 3. Integration tests with the Qdrant middleware pipeline 4. Before/after comparisons showing improved search quality 5. Performance validation and backward compatibility Based on the problematic data example from documentation/middleware/point.json """ import base64 import json import time import uuid from unittest.mock import MagicMock import pytest from config.enhanced_logging import setup_logger from middleware.qdrant_core.config import QdrantConfig from middleware.qdrant_core.storage import ( QdrantStorageManager, _extract_response_content, _is_json_string, sanitize_for_json, validate_qdrant_payload, ) logger = setup_logger() class TestDataSanitizationFixes: """Test suite for data sanitization fixes and improvements.""" def test_sanitize_for_json_preserves_structure(self): """Test that sanitize_for_json preserves nested structure when requested.""" # Test nested dictionary structure preservation nested_data = { "tool_name": "create_form", "response": { "success": False, "message": "Failed to create form", "nested": {"error_details": "API validation error", "error_code": 400}, }, "metadata": ["item1", "item2", {"nested_list_item": "value"}], } # With structure preservation result_preserved = sanitize_for_json(nested_data, preserve_structure=True) assert isinstance(result_preserved, dict) assert isinstance(result_preserved["response"], dict) assert isinstance(result_preserved["response"]["nested"], dict) assert isinstance(result_preserved["metadata"], list) assert isinstance(result_preserved["metadata"][2], dict) # Structure should be maintained assert result_preserved["tool_name"] == "create_form" assert 
result_preserved["response"]["success"] == False assert result_preserved["response"]["nested"]["error_code"] == 400 assert result_preserved["metadata"][2]["nested_list_item"] == "value" def test_sanitize_for_json_detects_and_parses_json_strings(self): """Test that sanitize_for_json detects and parses JSON strings to preserve structure.""" # Simulate the problematic case from point.json - JSON string that should be parsed escaped_json_string = '{"success":false,"message":"\\u274c Failed to create form","formId":null,"title":"Customer Feedback Survey - Test","editUrl":null,"responseUrl":null}' # With structure preservation, should detect and parse this JSON result = sanitize_for_json(escaped_json_string, preserve_structure=True) # Should be parsed into a dictionary, not left as an escaped string assert isinstance(result, dict), f"Expected dict, got {type(result)}" assert "success" in result assert "message" in result assert "formId" in result assert result["success"] == False assert result["formId"] is None # Verify the Unicode escape was handled assert "Failed to create form" in result["message"] def test_sanitize_for_json_handles_triple_serialization_problem(self): """Test that sanitize_for_json prevents the triple serialization issue.""" # Simulate the exact problematic data from point.json problematic_response_data = { "tool_name": "create_form", "arguments": { "title": "Customer Feedback Survey - Test", "description": "Help us improve our services by sharing your experience.", "user_google_email": "sethrivers@gmail.com", }, "response": [ { "type": "text", "text": '{"success":false,"message":"\\u274c Failed to create form: <HttpError 400 when requesting https://forms.googleapis.com/v1/forms?alt=json returned \\"Only info.title can be set when creating a form. To add items and change settings, use batchUpdate.\\". Details: \\"Only info.title can be set when creating a form. 
To add items and change settings, use batchUpdate.\\">"}', } ], } # With structure preservation, the deeply nested JSON string should be parsed result = sanitize_for_json(problematic_response_data, preserve_structure=True) # Check if the response text was parsed from JSON string to dict response_text = result["response"][0]["text"] # The sanitization should either parse it to dict OR at least reduce escaping if isinstance(response_text, dict): # Successfully parsed to structured data assert "success" in response_text assert "message" in response_text assert response_text["success"] == False message = response_text["message"] assert "Failed to create form" in message else: # If still a string, should at least have reduced escaping compared to original assert isinstance(response_text, str) assert "Failed to create form" in response_text # Should have some reduction in escaping or at least be manageable assert len(response_text) > 0, "Response text should not be empty" def test_is_json_string_detection(self): """Test the _is_json_string helper function correctly identifies JSON strings.""" # Valid JSON strings assert _is_json_string('{"key": "value"}') == True assert _is_json_string("[1, 2, 3]") == True assert _is_json_string('{"nested": {"data": true}}') == True assert _is_json_string(' {"padded": "json"} ') == True # With whitespace # JSON with common patterns assert _is_json_string('{"success":false,"message":"error"}') == True assert _is_json_string('["item1","item2"]') == True # Not JSON strings assert _is_json_string("just a string") == False assert _is_json_string("") == False assert _is_json_string(None) == False assert _is_json_string(123) == False assert _is_json_string("partial{json") == False def test_extract_response_content_preserves_structure(self): """Test that _extract_response_content intelligently preserves structure.""" # Test with FastMCP ToolResult-like object containing JSON string class MockToolResult: def __init__(self, content): self.content 
= content # JSON string content should be parsed json_content = '{"result": "success", "data": {"items": [1, 2, 3]}}' mock_result = MockToolResult(json_content) extracted = _extract_response_content(mock_result) assert isinstance(extracted, dict), "JSON string should be parsed to dict" assert extracted["result"] == "success" assert isinstance(extracted["data"], dict) assert extracted["data"]["items"] == [1, 2, 3] # Non-JSON string content should remain as string text_content = "This is just plain text" mock_result = MockToolResult(text_content) extracted = _extract_response_content(mock_result) assert extracted == text_content # Already structured data should remain structured dict_content = {"already": "structured", "nested": {"data": True}} mock_result = MockToolResult(dict_content) extracted = _extract_response_content(mock_result) assert isinstance(extracted, dict) assert extracted["already"] == "structured" assert extracted["nested"]["data"] == True def test_validate_qdrant_payload_compatibility(self): """Test that validate_qdrant_payload ensures Qdrant compatibility.""" # Test with problematic data that needs sanitization problematic_payload = { "tool_name": "test_tool", "binary_data": b"\x80\x81\x82", # Binary data "unicode_issue": "test\x00null_byte", # Null byte 123: "numeric_key", # Non-string key "nested": {"more_binary": b"\xff\xfe\xfd", None: "none_key"}, # None key } validated = validate_qdrant_payload(problematic_payload) # All keys should be strings for key in validated.keys(): assert isinstance(key, str), f"Key {key} should be string, got {type(key)}" # Null bytes in values are currently NOT replaced (only keys are cleaned) # This is the current behavior - sanitize_for_json doesn't replace null bytes in string values unicode_value = validated.get("unicode_issue", "") assert isinstance(unicode_value, str), "Unicode issue value should be string" # Numeric key should be converted to string assert "123" in validated # Binary data should be base64 
encoded binary_result = validated.get("binary_data", "") assert isinstance(binary_result, str) if binary_result.startswith("base64:"): # Verify it's valid base64 base64_part = binary_result[7:] decoded = base64.b64decode(base64_part) assert decoded == b"\x80\x81\x82" # Should be JSON serializable json.dumps(validated) # Should not raise exception def test_edge_case_unicode_handling(self): """Test edge cases with Unicode and encoding issues.""" # Various Unicode challenges test_cases = [ "Normal ASCII text", "Unicode: café, naïve, résumé", "Emoji: 🔧 ✅ ❌ 📊", "Mixed: ASCII + café + 🔧", "Quotes: 'single' \"double\" `backtick`", "Control chars: \n\r\t", # Problematic cases that might cause issues "Zero-width: \u200b\u200c\u200d", "RTL: \u202e test \u202c", ] for test_text in test_cases: result = sanitize_for_json(test_text) # Should remain as string assert isinstance(result, str) # Should be JSON serializable json.dumps({"text": result}) # Should not raise # Should be encodable as UTF-8 result.encode("utf-8") def test_malformed_json_handling(self): """Test handling of malformed JSON strings.""" malformed_cases = [ '{"incomplete": true', # Missing closing brace '{"duplicate": 1, "duplicate": 2}', # Duplicate keys "{'single_quotes': true}", # Single quotes (invalid JSON) '{"trailing_comma": true,}', # Trailing comma '{invalid_key: "value"}', # Unquoted key '{"number": 01}', # Leading zero "", # Empty string "null", # JSON null (valid but edge case) "undefined", # JavaScript undefined (invalid JSON) ] for malformed in malformed_cases: result = sanitize_for_json(malformed, preserve_structure=True) # Should handle gracefully - either parse if valid or leave as string assert result is not None # Should be JSON serializable json.dumps(result) # Should not raise exception def test_performance_large_data_handling(self): """Test performance with large data structures.""" # Create large nested structure large_data = { "large_text": "x" * 10000, # 10KB string "large_list": 
list(range(1000)), # 1000 items "nested_structure": { f"key_{i}": { "data": f"value_{i}" * 100, "binary": b"\x80" * 100, "nested": {"deep": f"data_{i}"}, } for i in range(50) # 50 nested items }, } start_time = time.time() result = sanitize_for_json(large_data, preserve_structure=True) processing_time = time.time() - start_time # Should complete in reasonable time (< 1 second for this size) assert ( processing_time < 1.0 ), f"Processing took {processing_time:.2f}s, should be < 1s" # Should maintain structure assert isinstance(result, dict) assert len(result["large_list"]) == 1000 assert len(result["nested_structure"]) == 50 # Should be JSON serializable json_str = json.dumps(result, default=str) assert len(json_str) > 0 class TestSanitizationIntegration: """Integration tests for sanitization with the complete Qdrant pipeline.""" @pytest.fixture def mock_qdrant_config(self): """Create mock Qdrant config for testing.""" return QdrantConfig( host="localhost", ports=[6333], collection_name="test_sanitization", embedding_model="sentence-transformers/all-MiniLM-L6-v2", ) @pytest.fixture def mock_client_manager(self, mock_qdrant_config): """Create mock client manager.""" # Create a complete mock instead of real object manager = MagicMock() manager.config = mock_qdrant_config manager.client = MagicMock() manager.embedder = MagicMock() manager.embedder.encode = MagicMock(return_value=MagicMock()) manager.embedder.encode.return_value.tolist = MagicMock( return_value=[0.1] * 384 ) manager.is_available = True manager.is_initialized = True manager._should_compress = MagicMock(return_value=False) manager._compress_data = MagicMock(return_value="compressed_data") return manager @pytest.mark.asyncio async def test_storage_manager_with_sanitization_fixes(self, mock_client_manager): """Test that storage manager properly uses sanitization fixes.""" storage_manager = QdrantStorageManager(mock_client_manager) # Create response data that mimics the problematic point.json case 
problematic_response = { "tool_name": "create_form", "arguments": { "title": "Customer Feedback Survey - Test", "user_google_email": "sethrivers@gmail.com", }, "response": [ { "type": "text", "text": '{"success":false,"message":"\\u274c Failed to create form: <HttpError 400>","formId":null}', } ], "timestamp": "2025-09-18T02:16:04.600496+00:00", "user_email": "sethrivers@gmail.com", } # Store the response (should use sanitization fixes) await storage_manager._store_response_with_params( tool_name="create_form", tool_args=problematic_response["arguments"], response=problematic_response["response"], execution_time_ms=431, session_id="test_session", user_email="sethrivers@gmail.com", ) # Verify the client.upsert was called mock_client_manager.client.upsert.assert_called() # Get the stored point call_args = mock_client_manager.client.upsert.call_args points = call_args[1]["points"] # keyword argument assert len(points) == 1 stored_point = points[0] # Verify structure preservation in payload payload = stored_point.payload # Should have structured response_data instead of escaped JSON string if "response_data" in payload: response_data = payload["response_data"] # The nested response text should be parsed to dict response_obj = response_data["response"][0] if isinstance(response_obj["text"], dict): # Structure was preserved! 
text_dict = response_obj["text"] assert "success" in text_dict assert "message" in text_dict assert text_dict["success"] == False # Message should not have excessive escaping message = text_dict["message"] assert "Failed to create form" in message # Should not have multiple levels of escaping assert message.count("\\") < 10, "Should not have excessive escaping" @pytest.mark.asyncio async def test_embedding_quality_improvement(self, mock_client_manager): """Test that embedding quality improves with sanitization fixes.""" storage_manager = QdrantStorageManager(mock_client_manager) # Mock embedder to capture the text being embedded embedded_texts = [] def mock_encode(text): embedded_texts.append(text) mock_embedding = MagicMock() mock_embedding.tolist = MagicMock(return_value=[0.1] * 384) return mock_embedding mock_client_manager.embedder.encode.side_effect = mock_encode # Test with problematic escaped JSON escaped_response = '{"success":false,"message":"\\u274c Failed to create form: <HttpError 400 when requesting https://forms.googleapis.com/v1/forms?alt=json returned \\"Only info.title can be set\\">","formId":null}' await storage_manager._store_response_with_params( tool_name="test_embedding_quality", tool_args={"title": "Test Form"}, response=escaped_response, # This will be processed by sanitization execution_time_ms=100, session_id="test_session", user_email="test@example.com", ) # Verify embedding was generated assert len(embedded_texts) == 1 embedded_text = embedded_texts[0] # The embedded text should be more readable (less escaped) assert "test_embedding_quality" in embedded_text assert "Failed to create form" in embedded_text # Should not have excessive escaping in the embedded text # The sanitization should have cleaned up the response before embedding assert ( embedded_text.count("\\") < 20 ), f"Embedded text has too much escaping: {embedded_text}" # Should not have triple-escaped quotes assert '\\\\"' not in embedded_text, "Should not have triple-escaped 
quotes" @pytest.mark.asyncio async def test_backward_compatibility_with_existing_data(self, mock_client_manager): """Test that sanitization fixes don't break existing stored data handling.""" storage_manager = QdrantStorageManager(mock_client_manager) # Test with various data formats that might already be stored test_cases = [ # Already clean data {"clean": "data", "status": "success"}, # String data (should remain string) "This is a plain string response", # List data ["item1", "item2", {"nested": "data"}], # Mixed content {"text": "response", "binary": b"\x80\x81", "nested": {"data": [1, 2, 3]}}, ] for i, test_data in enumerate(test_cases): await storage_manager._store_response_with_params( tool_name=f"backward_compatibility_test_{i}", tool_args={"test_case": i}, response=test_data, execution_time_ms=50, session_id=f"compat_session_{i}", user_email="compatibility@example.com", ) # Should complete without errors assert mock_client_manager.client.upsert.call_count == len(test_cases) @pytest.mark.asyncio async def test_compression_efficiency_improvement(self, mock_client_manager): """Test that sanitization fixes improve storage compression efficiency.""" # Configure compression to be triggered mock_client_manager._should_compress = MagicMock(return_value=True) compressed_data = [] def mock_compress(data): compressed_data.append(data) return f"compressed:{len(data)}bytes" mock_client_manager._compress_data = MagicMock(side_effect=mock_compress) storage_manager = QdrantStorageManager(mock_client_manager) # Test data that benefits from structure preservation structured_response = { "success": False, "message": "Failed to create form", "details": { "error_code": 400, "validation_errors": ["title is required", "invalid format"], }, "metadata": { "timestamp": "2025-09-18T02:16:04.600496+00:00", "request_id": "req_123456", }, } await storage_manager._store_response_with_params( tool_name="compression_test", tool_args={"test": "compression"}, response=structured_response, 
execution_time_ms=200, session_id="compression_session", user_email="compression@example.com", ) # Verify compression was used mock_client_manager._compress_data.assert_called() # The data sent for compression should be clean JSON (not escaped) assert len(compressed_data) == 1 compressed_json = compressed_data[0] # Should be valid JSON parsed = json.loads(compressed_json) # Should have preserved structure assert isinstance(parsed, dict) assert "response" in parsed response_data = parsed["response"] # Response should be the structured data, not an escaped string assert isinstance(response_data, dict) assert response_data["success"] == False assert isinstance(response_data["details"], dict) assert isinstance(response_data["details"]["validation_errors"], list) class TestSearchQualityImprovement: """Test that sanitization fixes improve search quality and relevance.""" def test_embedding_text_generation_quality(self): """Test that cleaned data produces better embedding text.""" # Simulate the before/after comparison # BEFORE: Escaped JSON string (problematic) escaped_response = '{"success":false,"message":"\\u274c Failed to create form: <HttpError 400 when requesting https://forms.googleapis.com/v1/forms?alt=json returned \\"Only info.title can be set when creating a form. To add items and change settings, use batchUpdate.\\". Details: \\"Only info.title can be set when creating a form. 
To add items and change settings, use batchUpdate.\\">"}' # AFTER: Cleaned structured data cleaned_response = sanitize_for_json(escaped_response, preserve_structure=True) # Generate embedding text for both versions def generate_embedding_text(tool_name, args, response): return f"Tool: {tool_name}\nArguments: {json.dumps(args)}\nResponse: {str(response)[:1000]}" tool_name = "create_form" tool_args = {"title": "Customer Feedback Survey"} escaped_embed_text = generate_embedding_text( tool_name, tool_args, escaped_response ) cleaned_embed_text = generate_embedding_text( tool_name, tool_args, cleaned_response ) # Cleaned version should be more readable assert len(cleaned_embed_text) <= len( escaped_embed_text ), "Cleaned text should not be longer" # Cleaned version should have less escaping escaped_backslash_count = escaped_embed_text.count("\\") cleaned_backslash_count = cleaned_embed_text.count("\\") assert ( cleaned_backslash_count < escaped_backslash_count ), "Cleaned text should have fewer escape characters" # Cleaned version should be more semantically meaningful assert "Failed to create form" in cleaned_embed_text assert "HttpError 400" in cleaned_embed_text # Should not have triple-escaped quotes in cleaned version assert ( '\\\\"' not in cleaned_embed_text ), "Cleaned text should not have triple-escaped quotes" def test_search_relevance_improvement_metrics(self): """Test that cleaned data would produce better search relevance metrics.""" # Create test cases representing search scenarios search_query = "form creation error" # Escaped version (old behavior) escaped_content = '{"success":false,"message":"\\u274c Failed to create form: <HttpError 400>","formId":null}' # Cleaned version (new behavior) cleaned_content = sanitize_for_json(escaped_content, preserve_structure=True) # Simulate relevance scoring (simplified) def calculate_relevance_score(query_terms, content_text): """Simple relevance scoring based on term matching.""" content_lower = 
str(content_text).lower() query_words = query_terms.lower().split() score = 0 for word in query_words: if word in content_lower: score += 1 # Bonus for exact phrase matches if query_terms.lower() in content_lower: score += 2 return score escaped_text = f"Tool: create_form Response: {escaped_content}" cleaned_text = f"Tool: create_form Response: {cleaned_content}" escaped_score = calculate_relevance_score(search_query, escaped_text) cleaned_score = calculate_relevance_score(search_query, cleaned_text) # Cleaned version should have equal or better relevance assert ( cleaned_score >= escaped_score ), f"Cleaned score {cleaned_score} should be >= escaped score {escaped_score}" # More specific checks assert "form" in str(cleaned_content).lower() assert "error" in str(cleaned_content).lower() assert "failed" in str(cleaned_content).lower() def test_semantic_search_quality_comparison(self): """Test semantic search quality improvement with structured vs escaped data.""" # Create pairs of escaped vs cleaned content test_pairs = [ { "escaped": '{"error":"\\u274c Authentication failed: <HttpError 401>","details":"Invalid API key"}', "query": "authentication error API key", }, { "escaped": '{"success":false,"message":"\\u274c Quota exceeded: <HttpError 429>","retry_after":3600}', "query": "quota limit exceeded retry", }, { "escaped": '{"validation_errors":["\\u274c Title required","\\u274c Invalid format"],"status":"error"}', "query": "validation title format error", }, ] for test_case in test_pairs: escaped = test_case["escaped"] query = test_case["query"] # Clean the data cleaned = sanitize_for_json(escaped, preserve_structure=True) # Generate searchable text escaped_text = f"Response: {escaped}" cleaned_text = f"Response: {json.dumps(cleaned, default=str)}" # The cleaned version should be more semantically rich # Count meaningful words (not escape sequences) def count_meaningful_content(text): # Remove escape sequences and count meaningful words import re cleaned_text = 
re.sub( r"\\[ux][0-9a-fA-F]+", " ", text ) # Remove unicode escapes cleaned_text = re.sub( r'\\[nt"\\]', " ", cleaned_text ) # Remove other escapes meaningful_words = [ word for word in cleaned_text.split() if len(word) > 2 ] return len(meaningful_words) escaped_meaningful = count_meaningful_content(escaped_text) cleaned_meaningful = count_meaningful_content(cleaned_text) # Cleaned version should have equal or more meaningful content assert ( cleaned_meaningful >= escaped_meaningful ), f"Cleaned version should have more meaningful content: {cleaned_meaningful} >= {escaped_meaningful}" class TestPerformanceValidation: """Performance validation tests for sanitization fixes.""" def test_sanitization_performance(self): """Test that sanitization performance is acceptable.""" # Create test data of various sizes test_cases = [ {"small": "data", "size": "small"}, {"medium": "x" * 1000, "nested": {"data": list(range(100))}}, {"large": {"items": [{"id": i, "data": "x" * 100} for i in range(100)]}}, ] for test_data in test_cases: start_time = time.time() # Run sanitization multiple times for _ in range(10): result = sanitize_for_json(test_data, preserve_structure=True) end_time = time.time() avg_time = (end_time - start_time) / 10 # Should be fast (< 10ms per operation) assert ( avg_time < 0.01 ), f"Sanitization took {avg_time*1000:.2f}ms, should be < 10ms" # Should preserve structure assert isinstance(result, dict) def test_memory_efficiency(self): """Test that sanitization is memory efficient.""" import gc # Get initial memory gc.collect() initial_objects = len(gc.get_objects()) # Process large data large_data = { "items": [ { "id": i, "data": "x" * 1000, "binary": b"\x80" * 100, "nested": {"deep": f"value_{i}"}, } for i in range(100) ] } # Sanitize multiple times for _ in range(10): result = sanitize_for_json(large_data, preserve_structure=True) # Don't hold references to results del result # Force garbage collection gc.collect() final_objects = len(gc.get_objects()) # 
Should not leak significant memory object_growth = final_objects - initial_objects assert ( object_growth < 1000 ), f"Memory leak detected: {object_growth} new objects" def test_compression_efficiency(self): """Test that sanitized data compresses more efficiently.""" import gzip # Create test data that should compress better when structured repeated_data = { "success": False, "message": "API error occurred", "error_code": 400, "details": { "validation_errors": ["field required", "invalid format"] * 10, "timestamp": "2025-09-18T02:16:04.600496+00:00", "request_id": f"req_{uuid.uuid4()}", }, } # Test as escaped JSON string (old way) escaped_json = json.dumps(json.dumps(repeated_data)) # Double-encoded # Test as structured data (new way) structured_json = json.dumps(repeated_data) # Compress both escaped_compressed = gzip.compress(escaped_json.encode()) structured_compressed = gzip.compress(structured_json.encode()) # Structured data should compress better or equal assert len(structured_compressed) <= len( escaped_compressed ), f"Structured data should compress better: {len(structured_compressed)} <= {len(escaped_compressed)}" # Calculate compression ratios escaped_ratio = len(escaped_compressed) / len(escaped_json) structured_ratio = len(structured_compressed) / len(structured_json) # Both should achieve good compression (< 50% of original size for repeated data) assert ( escaped_ratio < 0.5 ), f"Escaped data should compress well: {escaped_ratio}" assert ( structured_ratio < 0.5 ), f"Structured data should compress well: {structured_ratio}" if __name__ == "__main__": # Run the test suite pytest.main([__file__, "-v"])

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/dipseth/google-workspace-unlimited'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_sanitization_fixes.py•31.9 KiB