"""
Copyright 2024, Zep Software, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import json
from graphiti_core.nodes import EpisodeType
from graphiti_core.utils.content_chunking import (
CHARS_PER_TOKEN,
_count_json_keys,
_json_likely_dense,
_text_likely_dense,
chunk_json_content,
chunk_message_content,
chunk_text_content,
estimate_tokens,
should_chunk,
)
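
# These tests cover the public chunking helpers (estimate_tokens, should_chunk,
# and the chunk_*_content functions) plus the private density heuristics. As
# the tests below exercise it, chunk_size_tokens is the per-chunk token budget
# and overlap_tokens is the amount of trailing content re-emitted at the start
# of the following chunk.
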
class TestEstimateTokens:
def test_empty_string(self):
assert estimate_tokens('') == 0
def test_short_string(self):
# 4 chars per token
assert estimate_tokens('abcd') == 1
assert estimate_tokens('abcdefgh') == 2
def test_long_string(self):
text = 'a' * 400
assert estimate_tokens(text) == 100
def test_uses_chars_per_token_constant(self):
text = 'x' * (CHARS_PER_TOKEN * 10)
assert estimate_tokens(text) == 10
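
# A minimal sketch of the behavior these cases pin down (an assumption about
# the implementation, not a copy of it):
#
#     def estimate_tokens(text: str) -> int:
#         return len(text) // CHARS_PER_TOKEN
#
# Any heuristic that maps N characters to N // CHARS_PER_TOKEN passes all four
# tests above.
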
class TestChunkJsonArray:
def test_small_array_no_chunking(self):
data = [{'name': 'Alice'}, {'name': 'Bob'}]
content = json.dumps(data)
chunks = chunk_json_content(content, chunk_size_tokens=1000)
assert len(chunks) == 1
assert json.loads(chunks[0]) == data
def test_empty_array(self):
chunks = chunk_json_content('[]', chunk_size_tokens=100)
assert chunks == ['[]']
def test_array_splits_at_element_boundaries(self):
# Create array that exceeds chunk size
data = [{'id': i, 'data': 'x' * 100} for i in range(20)]
content = json.dumps(data)
# Use small chunk size to force splitting
chunks = chunk_json_content(content, chunk_size_tokens=100, overlap_tokens=20)
# Verify all chunks are valid JSON arrays
for chunk in chunks:
parsed = json.loads(chunk)
assert isinstance(parsed, list)
# Each element should be a complete object
for item in parsed:
assert 'id' in item
assert 'data' in item
def test_array_preserves_all_elements(self):
data = [{'id': i} for i in range(10)]
content = json.dumps(data)
chunks = chunk_json_content(content, chunk_size_tokens=50, overlap_tokens=10)
# Collect all unique IDs across chunks (accounting for overlap)
seen_ids = set()
for chunk in chunks:
parsed = json.loads(chunk)
for item in parsed:
seen_ids.add(item['id'])
# All original IDs should be present
assert seen_ids == set(range(10))
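
# Because overlap re-emits boundary elements, callers reassembling a chunked
# array need to de-duplicate. A sketch of that pattern (merge_array_chunks is
# a hypothetical helper, not part of the library):
#
#     def merge_array_chunks(chunks: list[str]) -> list[dict]:
#         seen: set[str] = set()
#         merged: list[dict] = []
#         for chunk in chunks:
#             for item in json.loads(chunk):
#                 key = json.dumps(item, sort_keys=True)
#                 if key not in seen:
#                     seen.add(key)
#                     merged.append(item)
#         return merged
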
class TestChunkJsonObject:
def test_small_object_no_chunking(self):
data = {'name': 'Alice', 'age': 30}
content = json.dumps(data)
chunks = chunk_json_content(content, chunk_size_tokens=1000)
assert len(chunks) == 1
assert json.loads(chunks[0]) == data
def test_empty_object(self):
chunks = chunk_json_content('{}', chunk_size_tokens=100)
assert chunks == ['{}']
def test_object_splits_at_key_boundaries(self):
# Create object that exceeds chunk size
data = {f'key_{i}': 'x' * 100 for i in range(20)}
content = json.dumps(data)
chunks = chunk_json_content(content, chunk_size_tokens=100, overlap_tokens=20)
# Verify all chunks are valid JSON objects
for chunk in chunks:
parsed = json.loads(chunk)
assert isinstance(parsed, dict)
# Each key-value pair should be complete
for key in parsed:
assert key.startswith('key_')
def test_object_preserves_all_keys(self):
data = {f'key_{i}': f'value_{i}' for i in range(10)}
content = json.dumps(data)
chunks = chunk_json_content(content, chunk_size_tokens=50, overlap_tokens=10)
# Collect all unique keys across chunks
seen_keys = set()
for chunk in chunks:
parsed = json.loads(chunk)
seen_keys.update(parsed.keys())
# All original keys should be present
expected_keys = {f'key_{i}' for i in range(10)}
assert seen_keys == expected_keys
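
# The object case is simpler to reassemble: keys are unique, so merging chunks
# back together is a plain dict union (a sketch, not library API):
#
#     merged: dict = {}
#     for chunk in chunks:
#         merged.update(json.loads(chunk))
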
class TestChunkJsonInvalid:
def test_invalid_json_falls_back_to_text(self):
invalid_json = 'not valid json {'
chunks = chunk_json_content(invalid_json, chunk_size_tokens=1000)
# Should fall back to text chunking
assert len(chunks) >= 1
assert invalid_json in chunks[0]
def test_scalar_value_returns_as_is(self):
for scalar in ['"string"', '123', 'true', 'null']:
chunks = chunk_json_content(scalar, chunk_size_tokens=1000)
assert chunks == [scalar]

class TestChunkTextContent:
def test_small_text_no_chunking(self):
text = 'This is a short text.'
chunks = chunk_text_content(text, chunk_size_tokens=1000)
assert len(chunks) == 1
assert chunks[0] == text
def test_splits_at_paragraph_boundaries(self):
paragraphs = ['Paragraph one.', 'Paragraph two.', 'Paragraph three.']
text = '\n\n'.join(paragraphs)
# Use small chunk size to force splitting
chunks = chunk_text_content(text, chunk_size_tokens=10, overlap_tokens=5)
# Each chunk should contain complete paragraphs (possibly with overlap)
for chunk in chunks:
            # A clean paragraph-boundary split never leaves a dangling trailing
            # space; ending in one would indicate a cut mid-paragraph
            assert not chunk.endswith(' ')
def test_splits_at_sentence_boundaries_for_large_paragraphs(self):
# Create a single long paragraph with multiple sentences
sentences = ['This is sentence number ' + str(i) + '.' for i in range(20)]
long_paragraph = ' '.join(sentences)
chunks = chunk_text_content(long_paragraph, chunk_size_tokens=50, overlap_tokens=10)
# Should have multiple chunks
assert len(chunks) > 1
        # Every non-final chunk should end at a sentence boundary: all input
        # sentences end with '.', so a mid-sentence cut would leave a chunk
        # without terminal punctuation
        for chunk in chunks[:-1]:  # All except the last
            assert chunk.rstrip().endswith(('.', '!', '?'))
def test_preserves_text_completeness(self):
text = 'Alpha beta gamma delta epsilon zeta eta theta.'
chunks = chunk_text_content(text, chunk_size_tokens=10, overlap_tokens=2)
# All words should appear in at least one chunk
all_words = set(text.replace('.', '').split())
found_words = set()
for chunk in chunks:
found_words.update(chunk.replace('.', '').split())
assert all_words <= found_words
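
# Splitting order implied by the two tests above (an inference from observed
# behavior, not a documented contract): split on blank lines into paragraphs
# first, fall back to sentence boundaries when a single paragraph exceeds the
# chunk budget, and carry roughly overlap_tokens of trailing text forward.
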
class TestChunkMessageContent:
def test_small_message_no_chunking(self):
content = 'Alice: Hello!\nBob: Hi there!'
chunks = chunk_message_content(content, chunk_size_tokens=1000)
assert len(chunks) == 1
assert chunks[0] == content
def test_preserves_speaker_message_format(self):
messages = [f'Speaker{i}: This is message number {i}.' for i in range(10)]
content = '\n'.join(messages)
chunks = chunk_message_content(content, chunk_size_tokens=50, overlap_tokens=10)
# Each chunk should have complete speaker:message pairs
for chunk in chunks:
lines = [line for line in chunk.split('\n') if line.strip()]
for line in lines:
# Should have speaker: format
assert ':' in line
def test_json_message_array_format(self):
messages = [{'role': 'user', 'content': f'Message {i}'} for i in range(10)]
content = json.dumps(messages)
chunks = chunk_message_content(content, chunk_size_tokens=50, overlap_tokens=10)
# Each chunk should be valid JSON array
for chunk in chunks:
parsed = json.loads(chunk)
assert isinstance(parsed, list)
for msg in parsed:
assert 'role' in msg
assert 'content' in msg
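
# chunk_message_content handles both input shapes exercised above: plain
# 'Speaker: message' transcript lines and a JSON array of role/content
# messages. In either shape, chunk boundaries fall between whole messages.
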
class TestChunkOverlap:
def test_json_array_overlap_captures_boundary_elements(self):
data = [{'id': i, 'name': f'Entity {i}'} for i in range(10)]
content = json.dumps(data)
# Use settings that will create overlap
chunks = chunk_json_content(content, chunk_size_tokens=80, overlap_tokens=30)
if len(chunks) > 1:
# Check that adjacent chunks share some elements
for i in range(len(chunks) - 1):
current = json.loads(chunks[i])
next_chunk = json.loads(chunks[i + 1])
# Get IDs from end of current and start of next
current_ids = {item['id'] for item in current}
next_ids = {item['id'] for item in next_chunk}
                # Each side of the boundary must contain at least one complete
                # element
                assert current_ids and next_ids
                # Adjacent chunks may share boundary IDs, but the overlap can
                # legitimately be empty when elements are large relative to
                # overlap_tokens, so the exact overlap amount is not asserted
                _ = current_ids & next_ids
def test_text_overlap_captures_boundary_text(self):
paragraphs = [f'Paragraph {i} with some content here.' for i in range(10)]
text = '\n\n'.join(paragraphs)
chunks = chunk_text_content(text, chunk_size_tokens=50, overlap_tokens=20)
if len(chunks) > 1:
# Adjacent chunks should have some shared content
for i in range(len(chunks) - 1):
current_words = set(chunks[i].split())
next_words = set(chunks[i + 1].split())
# There should be some overlap
overlap = current_words & next_words
# At minimum, common words like 'Paragraph', 'with', etc.
assert len(overlap) > 0

class TestEdgeCases:
def test_very_large_single_element(self):
# Single element larger than chunk size
data = [{'content': 'x' * 10000}]
content = json.dumps(data)
chunks = chunk_json_content(content, chunk_size_tokens=100, overlap_tokens=10)
# Should handle gracefully - may return single chunk or fall back
assert len(chunks) >= 1
def test_empty_content(self):
assert chunk_text_content('', chunk_size_tokens=100) == ['']
assert chunk_message_content('', chunk_size_tokens=100) == ['']
def test_whitespace_only(self):
chunks = chunk_text_content(' \n\n ', chunk_size_tokens=100)
assert len(chunks) >= 1

class TestShouldChunk:
def test_empty_content_never_chunks(self):
"""Empty content should never chunk."""
assert not should_chunk('', EpisodeType.text)
assert not should_chunk('', EpisodeType.json)
def test_short_content_never_chunks(self, monkeypatch):
"""Short content should never chunk regardless of density."""
from graphiti_core.utils import content_chunking
# Set very low thresholds that would normally trigger chunking
monkeypatch.setattr(content_chunking, 'CHUNK_DENSITY_THRESHOLD', 0.001)
monkeypatch.setattr(content_chunking, 'CHUNK_MIN_TOKENS', 1000)
        # Dense but short JSON (~270 tokens, well below the 1000-token minimum)
dense_data = [{'name': f'Entity{i}'} for i in range(50)]
dense_json = json.dumps(dense_data)
assert not should_chunk(dense_json, EpisodeType.json)
def test_high_density_large_json_chunks(self, monkeypatch):
"""Large high-density JSON should trigger chunking."""
from graphiti_core.utils import content_chunking
monkeypatch.setattr(content_chunking, 'CHUNK_DENSITY_THRESHOLD', 0.01)
monkeypatch.setattr(content_chunking, 'CHUNK_MIN_TOKENS', 500)
# Dense JSON: many elements, large enough to exceed minimum
dense_data = [{'name': f'Entity{i}', 'desc': 'x' * 20} for i in range(200)]
dense_json = json.dumps(dense_data)
assert should_chunk(dense_json, EpisodeType.json)
def test_low_density_text_no_chunk(self, monkeypatch):
"""Low-density prose should not trigger chunking."""
from graphiti_core.utils import content_chunking
monkeypatch.setattr(content_chunking, 'CHUNK_DENSITY_THRESHOLD', 0.05)
monkeypatch.setattr(content_chunking, 'CHUNK_MIN_TOKENS', 100)
# Low-density prose: mostly lowercase narrative
prose = 'the quick brown fox jumps over the lazy dog. ' * 50
assert not should_chunk(prose, EpisodeType.text)
def test_low_density_json_no_chunk(self, monkeypatch):
"""Low-density JSON (few elements, lots of content) should not chunk."""
from graphiti_core.utils import content_chunking
monkeypatch.setattr(content_chunking, 'CHUNK_DENSITY_THRESHOLD', 0.05)
monkeypatch.setattr(content_chunking, 'CHUNK_MIN_TOKENS', 100)
# Sparse JSON: few elements with lots of content each
sparse_data = [{'content': 'x' * 1000}, {'content': 'y' * 1000}]
sparse_json = json.dumps(sparse_data)
assert not should_chunk(sparse_json, EpisodeType.json)
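
# Decision rule implied by the four tests above (a sketch assuming should_chunk
# composes the helpers tested below; the real gate lives in content_chunking):
#
#     def should_chunk(content: str, episode_type: EpisodeType) -> bool:
#         tokens = estimate_tokens(content)
#         if tokens < CHUNK_MIN_TOKENS:
#             return False
#         if episode_type == EpisodeType.json:
#             return _json_likely_dense(content, tokens)
#         return _text_likely_dense(content, tokens)
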
class TestJsonDensityEstimation:
def test_dense_array_detected(self, monkeypatch):
"""Arrays with many elements should be detected as dense."""
from graphiti_core.utils import content_chunking
monkeypatch.setattr(content_chunking, 'CHUNK_DENSITY_THRESHOLD', 0.01)
        # Array with 100 elements is ~1,200 chars, i.e. roughly 300 tokens:
        # about one element per 3 tokens, far above the 0.01 threshold
data = [{'id': i} for i in range(100)]
content = json.dumps(data)
tokens = estimate_tokens(content)
assert _json_likely_dense(content, tokens)
def test_sparse_array_not_dense(self, monkeypatch):
"""Arrays with few elements should not be detected as dense."""
from graphiti_core.utils import content_chunking
monkeypatch.setattr(content_chunking, 'CHUNK_DENSITY_THRESHOLD', 0.05)
# Array with 2 elements but lots of content each
data = [{'content': 'x' * 1000}, {'content': 'y' * 1000}]
content = json.dumps(data)
tokens = estimate_tokens(content)
assert not _json_likely_dense(content, tokens)
def test_dense_object_detected(self, monkeypatch):
"""Objects with many keys should be detected as dense."""
from graphiti_core.utils import content_chunking
monkeypatch.setattr(content_chunking, 'CHUNK_DENSITY_THRESHOLD', 0.01)
# Object with 50 keys
data = {f'key_{i}': f'value_{i}' for i in range(50)}
content = json.dumps(data)
tokens = estimate_tokens(content)
assert _json_likely_dense(content, tokens)
def test_count_json_keys_shallow(self):
"""Key counting should work for nested structures."""
data = {
'a': 1,
'b': {'c': 2, 'd': 3},
'e': [{'f': 4}, {'g': 5}],
}
# At depth 2: a, b, c, d, e, f, g = 7 keys
assert _count_json_keys(data, max_depth=2) == 7
def test_count_json_keys_depth_limit(self):
"""Key counting should respect depth limit."""
data = {
'a': {'b': {'c': {'d': 1}}},
}
# At depth 1: only 'a'
assert _count_json_keys(data, max_depth=1) == 1
# At depth 2: 'a' and 'b'
assert _count_json_keys(data, max_depth=2) == 2
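
# Taken together, these tests suggest JSON density is counted structure
# (elements or keys, via _count_json_keys) per token, compared against
# CHUNK_DENSITY_THRESHOLD. That formula is an inference: only the boolean
# outcome is asserted here.
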
class TestTextDensityEstimation:
def test_entity_rich_text_detected(self, monkeypatch):
"""Text with many proper nouns should be detected as dense."""
from graphiti_core.utils import content_chunking
monkeypatch.setattr(content_chunking, 'CHUNK_DENSITY_THRESHOLD', 0.01)
# Entity-rich text: many capitalized names
text = 'Alice met Bob at Acme Corp. Then Carol and David joined them. '
text += 'Eve from Globex introduced Frank and Grace. '
text += 'Later Henry and Iris arrived from Initech. '
text = text * 10
tokens = estimate_tokens(text)
assert _text_likely_dense(text, tokens)
def test_prose_not_dense(self, monkeypatch):
"""Narrative prose should not be detected as dense."""
from graphiti_core.utils import content_chunking
monkeypatch.setattr(content_chunking, 'CHUNK_DENSITY_THRESHOLD', 0.05)
# Low-entity prose
prose = """
the sun was setting over the horizon as the old man walked slowly
down the dusty road. he had been traveling for many days and his
feet were tired. the journey had been long but he knew that soon
he would reach his destination. the wind whispered through the trees
and the birds sang their evening songs.
"""
prose = prose * 10
tokens = estimate_tokens(prose)
assert not _text_likely_dense(prose, tokens)
def test_sentence_starters_ignored(self, monkeypatch):
"""Capitalized words after periods should be ignored."""
from graphiti_core.utils import content_chunking
monkeypatch.setattr(content_chunking, 'CHUNK_DENSITY_THRESHOLD', 0.05)
# Many sentences but no mid-sentence proper nouns
text = 'This is a sentence. Another one follows. Yet another here. '
text = text * 50
tokens = estimate_tokens(text)
# Should not be dense since capitals are sentence starters
assert not _text_likely_dense(text, tokens)
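
# The text heuristic evidently treats mid-sentence capitalized words as an
# entity proxy and discounts sentence-initial capitals; as with the JSON
# heuristic, this is inferred from the asserted booleans, not from the
# implementation.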