Skip to main content
Glama

OpenFGA MCP

DocumentationChunker.php (15.8 kB)
<?php

declare(strict_types=1);

namespace OpenFGA\MCP\Documentation;

use function array_slice;
use function count;
use function is_int;
use function strlen;

/**
 * Splits Markdown documentation into smaller chunks using several strategies
 * (code fences, headers, line counts, byte size, source markers).
 *
 * All sizes handled by this class are measured with strlen(), i.e. in bytes.
 */
final class DocumentationChunker
{
    /**
     * Default maximum chunk size, in bytes.
     */
    private const int DEFAULT_CHUNK_SIZE = 3000;

    /**
     * A chunk must exceed this many bytes before it may be closed early.
     */
    private const int MIN_CHUNK_SIZE = 500;

    /**
     * Number of plain-text lines buffered before a text chunk is flushed.
     */
    private const int TEXT_BUFFER_LINES = 50;

    /**
     * Number of trailing lines repeated at the start of the next line-based chunk.
     */
    private const int LINE_OVERLAP = 10;

    /**
     * Number of trailing words repeated at the start of the next size-based chunk.
     */
    private const int WORD_OVERLAP = 20;

    /**
     * Matches a fenced-code delimiter line and captures the optional language.
     *
     * Accepts identifiers such as "c++", "c#" or "objective-c", and tolerates
     * trailing whitespace (including the "\r" left behind by CRLF input).
     */
    private const string CODE_FENCE_PATTERN = '/^```([\w+#.-]*)\s*$/';

    /**
     * Matches an ATX header line: 1-6 hashes, one space, then the title.
     */
    private const string HEADER_PATTERN = '/^(#{1,6}) (.+)$/';

    /**
     * Splits content into alternating text and fenced-code chunks.
     *
     * Code chunks include their opening and closing fence lines. Text is
     * flushed whenever a fence opens, and also every TEXT_BUFFER_LINES lines.
     * A code block still open at the end of the input is emitted as a code chunk.
     *
     * @param string $content Markdown text to split.
     *
     * @return array<int, array{type: string, language: string|null, content: string}>
     */
    public function chunkByCodeBlocks(string $content): array
    {
        $chunks = [];
        $textBuffer = [];
        $codeLines = [];
        $codeLanguage = null;
        $inCodeBlock = false;

        foreach (explode("\n", $content) as $line) {
            if (1 === preg_match(self::CODE_FENCE_PATTERN, $line, $matches)) {
                if ($inCodeBlock) {
                    // Closing fence: emit the finished code block.
                    $codeLines[] = $line;
                    $chunks[] = [
                        'type' => 'code',
                        'language' => $codeLanguage,
                        'content' => implode("\n", $codeLines),
                    ];
                    $codeLines = [];
                    $codeLanguage = null;
                    $inCodeBlock = false;

                    continue;
                }

                // Opening fence: flush any text gathered so far.
                if ([] !== $textBuffer) {
                    $chunks[] = [
                        'type' => 'text',
                        'language' => null,
                        'content' => implode("\n", $textBuffer),
                    ];
                    $textBuffer = [];
                }

                $language = $matches[1] ?? '';
                $codeLanguage = '' !== $language ? $language : 'plaintext';
                $codeLines = [$line];
                $inCodeBlock = true;

                continue;
            }

            if ($inCodeBlock) {
                $codeLines[] = $line;

                continue;
            }

            $textBuffer[] = $line;

            if (count($textBuffer) >= self::TEXT_BUFFER_LINES) {
                $chunks[] = [
                    'type' => 'text',
                    'language' => null,
                    'content' => implode("\n", $textBuffer),
                ];
                $textBuffer = [];
            }
        }

        if ([] !== $textBuffer) {
            $chunks[] = [
                'type' => 'text',
                'language' => null,
                'content' => implode("\n", $textBuffer),
            ];
        }

        // $codeLines is only non-empty while a fence is still open (unterminated block).
        if ([] !== $codeLines) {
            $chunks[] = [
                'type' => $inCodeBlock ? 'code' : 'text',
                'language' => $codeLanguage,
                'content' => implode("\n", $codeLines),
            ];
        }

        return $chunks;
    }

    /**
     * Splits content into one chunk per Markdown header section.
     *
     * Each chunk starts with its header line. Text before the first header is
     * returned with a null header and level 0. Header detection here is not
     * fence-aware: a line such as "# comment" inside a code block is treated
     * as a header.
     *
     * @param string $content Markdown text to split.
     *
     * @return array<int, array{header: string|null, content: string, level: int}>
     */
    public function chunkByHeaders(string $content): array
    {
        if ('' === $content) {
            return [];
        }

        $chunks = [];
        $sectionLines = [];
        $sectionHeader = null;
        $sectionLevel = 0;

        foreach (explode("\n", $content) as $line) {
            if (1 !== preg_match(self::HEADER_PATTERN, $line, $matches)) {
                $sectionLines[] = $line;

                continue;
            }

            if ([] !== $sectionLines) {
                $chunks[] = [
                    'header' => $sectionHeader,
                    'content' => implode("\n", $sectionLines),
                    'level' => $sectionLevel,
                ];
            }

            $sectionHeader = trim($matches[2]);
            $sectionLevel = strlen($matches[1]);
            $sectionLines = [$line];
        }

        // Non-empty input always yields at least one line, so this section is never empty.
        $chunks[] = [
            'header' => $sectionHeader,
            'content' => implode("\n", $sectionLines),
            'level' => $sectionLevel,
        ];

        return $chunks;
    }

    /**
     * Splits content into chunks of at most $maxLines lines with overlap.
     *
     * Consecutive chunks share up to LINE_OVERLAP trailing lines. The overlap
     * is capped at $maxLines - 1 so that every chunk adds at least one new
     * line and never exceeds $maxLines. Values of $maxLines below 1 are
     * treated as 1. A trailing chunk is only emitted when it holds lines that
     * were not already part of the previous chunk.
     *
     * @param string $content  Text to split.
     * @param int    $maxLines Maximum number of lines per chunk.
     *
     * @return array<int, string>
     */
    public function chunkByLines(string $content, int $maxLines = 100): array
    {
        $maxLines = max(1, $maxLines);
        $overlap = min(self::LINE_OVERLAP, $maxLines - 1);

        $chunks = [];
        $window = [];
        $freshLines = 0;

        foreach (explode("\n", $content) as $line) {
            $window[] = $line;
            ++$freshLines;

            if (count($window) < $maxLines) {
                continue;
            }

            $chunks[] = implode("\n", $window);

            // array_slice($x, -0) would return the whole array, hence the guard.
            $window = 0 < $overlap ? array_slice($window, -$overlap) : [];
            $freshLines = 0;
        }

        if (0 < $freshLines) {
            $chunks[] = implode("\n", $window);
        }

        return $chunks;
    }

    /**
     * Splits content into sentence-aligned chunks of roughly $maxSize bytes.
     *
     * A chunk is closed when adding the next sentence would exceed $maxSize
     * and the chunk is already larger than MIN_CHUNK_SIZE. The next chunk
     * starts with the last WORD_OVERLAP words of the closed one. A single
     * sentence longer than $maxSize is never split.
     *
     * @param string $content Text to split.
     * @param int    $maxSize Target maximum chunk size in bytes.
     *
     * @return array<int, string>
     */
    public function chunkBySize(string $content, int $maxSize = self::DEFAULT_CHUNK_SIZE): array
    {
        $chunks = [];
        $buffer = '';

        foreach ($this->splitIntoSentences($content) as $sentence) {
            $bufferSize = strlen($buffer);
            $wouldOverflow = $bufferSize + strlen($sentence) > $maxSize;

            if ($wouldOverflow && self::MIN_CHUNK_SIZE < $bufferSize) {
                $chunks[] = trim($buffer);
                $buffer = $this->getOverlapText($buffer) . ' ' . $sentence;

                continue;
            }

            $buffer .= ' ' . $sentence;
        }

        $remainder = trim($buffer);

        if ('' !== $remainder) {
            $chunks[] = $remainder;
        }

        return $chunks;
    }

    /**
     * Splits content on "<!-- Source: ... -->" / "<!-- End of ... -->" markers.
     *
     * Marker lines themselves are not included in any chunk. Every chunk
     * flushed at a marker is typed "source_block"; the final chunk is typed
     * "general" unless a source block was still open.
     *
     * @param string $content Text containing source markers.
     *
     * @return array<int, array{source: string|null, content: string, type: string}>
     */
    public function chunkBySourceBlocks(string $content): array
    {
        $chunks = [];
        $blockLines = [];
        $sourceFile = null;
        $inSourceBlock = false;

        foreach (explode("\n", $content) as $line) {
            $opensBlock = 1 === preg_match('/^<!-- Source: (.+) -->$/', $line, $matches);
            $closesBlock = ! $opensBlock && 1 === preg_match('/^<!-- End of .+ -->$/', $line);

            if (! $opensBlock && ! $closesBlock) {
                $blockLines[] = $line;

                continue;
            }

            // Either marker terminates whatever has been collected so far.
            if ([] !== $blockLines) {
                $chunks[] = [
                    'source' => $sourceFile,
                    'content' => implode("\n", $blockLines),
                    'type' => 'source_block',
                ];
            }

            $inSourceBlock = $opensBlock;
            $sourceFile = $opensBlock && isset($matches[1]) ? trim($matches[1]) : null;
            $blockLines = [];
        }

        if ([] !== $blockLines) {
            $chunks[] = [
                'source' => $sourceFile,
                'content' => implode("\n", $blockLines),
                'type' => $inSourceBlock ? 'source_block' : 'general',
            ];
        }

        return $chunks;
    }

    /**
     * Extracts every terminated fenced code block with a short description.
     *
     * The description is derived from up to five lines preceding the opening
     * fence. "line_number" is the zero-based index of the first code line,
     * which equals the one-based line number of the opening fence. A code
     * block that is never closed is not returned.
     *
     * @param string $content Markdown text to scan.
     *
     * @return array<int, array{language: string, code: string, description: string, line_number: int}>
     */
    public function extractCodeExamples(string $content): array
    {
        $examples = [];
        $lines = explode("\n", $content);
        $codeLines = [];
        $codeLanguage = null;
        $precedingText = '';
        $inCodeBlock = false;

        foreach ($lines as $index => $line) {
            if (1 !== preg_match(self::CODE_FENCE_PATTERN, $line, $matches)) {
                if ($inCodeBlock) {
                    $codeLines[] = $line;
                }

                continue;
            }

            if (! $inCodeBlock) {
                $language = $matches[1] ?? '';
                $codeLanguage = '' !== $language ? $language : 'plaintext';
                $precedingText = $this->getPrecedingText($lines, $index, 5);
                $inCodeBlock = true;

                continue;
            }

            $examples[] = [
                'language' => $codeLanguage ?? 'plaintext',
                'code' => implode("\n", $codeLines),
                'description' => $this->extractDescription($precedingText),
                'line_number' => $index - count($codeLines),
            ];

            $codeLines = [];
            $codeLanguage = null;
            $precedingText = '';
            $inCodeBlock = false;
        }

        return $examples;
    }

    /**
     * Chunks content by size while respecting headers and code fences.
     *
     * Supported options:
     *  - "max_size" (int): target chunk size in bytes; non-int values fall back to DEFAULT_CHUNK_SIZE.
     *  - "preserve_headers" (bool, default true): start a size-split chunk with "<header> (continued)".
     *  - "include_metadata" (bool, default true): return arrays with content and metadata instead of strings.
     *
     * Chunks are never closed inside a fenced code block. A header closes the
     * current chunk only once it exceeds MIN_CHUNK_SIZE. A single line longer
     * than the maximum size is split by sentences via chunkBySize(); if such a
     * line contains nothing but whitespace it is dropped.
     *
     * @param string               $content Markdown text to split.
     * @param array<string, mixed> $options See the option list above.
     *
     * @return array<int, array{content: string, metadata: array<string, mixed>}|string>
     */
    public function smartChunk(string $content, array $options = []): array
    {
        /** @var mixed $maxSizeOption */
        $maxSizeOption = $options['max_size'] ?? null;
        $maxSize = is_int($maxSizeOption) ? $maxSizeOption : self::DEFAULT_CHUNK_SIZE;
        $preserveHeaders = (bool) ($options['preserve_headers'] ?? true);
        $includeMetadata = (bool) ($options['include_metadata'] ?? true);

        $chunks = [];
        $currentChunk = [];
        $currentMetadata = [];
        $currentHeader = null;
        $currentLevel = 0;
        $currentSize = 0;
        $inCodeBlock = false;

        foreach (explode("\n", $content) as $line) {
            $lineSize = strlen($line);

            if (1 === preg_match('/^```/', $line)) {
                $inCodeBlock = ! $inCodeBlock;
            }

            if (! $inCodeBlock && 1 === preg_match(self::HEADER_PATTERN, $line, $matches)) {
                if (self::MIN_CHUNK_SIZE < $currentSize) {
                    $this->finalizeChunk($chunks, $currentChunk, $currentMetadata, $includeMetadata);
                    $currentChunk = [];
                    $currentSize = 0;
                }

                $currentHeader = trim($matches[2]);
                $currentLevel = strlen($matches[1]);
                $currentMetadata['header'] = $currentHeader;
                $currentMetadata['header_level'] = $currentLevel;
            }

            // A single over-long line outside a code block is split by sentences.
            if ($lineSize > $maxSize && ! $inCodeBlock) {
                if ([] !== $currentChunk) {
                    $this->finalizeChunk($chunks, $currentChunk, $currentMetadata, $includeMetadata);
                }

                $pieces = $this->chunkBySize($line, $maxSize);

                // The last piece stays open so following lines can join it.
                // chunkBySize() returns [] for whitespace-only input, in which
                // case array_pop() yields null and nothing is carried over.
                $tail = array_pop($pieces);

                foreach ($pieces as $piece) {
                    $this->finalizeChunk($chunks, [$piece], $currentMetadata, $includeMetadata);
                }

                if (null === $tail) {
                    $currentChunk = [];
                    $currentSize = 0;
                } else {
                    $currentChunk = [$tail];
                    $currentSize = strlen($tail);
                }

                continue;
            }

            $currentChunk[] = $line;
            $currentSize += $lineSize;

            if ($currentSize < $maxSize || $inCodeBlock) {
                continue;
            }

            $this->finalizeChunk($chunks, $currentChunk, $currentMetadata, $includeMetadata);

            if ($preserveHeaders && null !== $currentHeader) {
                $continued = str_repeat('#', $currentLevel) . ' ' . $currentHeader . ' (continued)';
                $currentChunk = [$continued];
                $currentSize = strlen($continued);
            } else {
                $currentChunk = [];
                $currentSize = 0;
            }
        }

        if ([] !== $currentChunk) {
            $this->finalizeChunk($chunks, $currentChunk, $currentMetadata, $includeMetadata);
        }

        return $chunks;
    }

    /**
     * Derives a one-line description from the text preceding a code block.
     *
     * Prefers whatever follows an "Example:", "Usage:", "Sample:" or "Code:"
     * label; otherwise returns the last sentence, or an empty string.
     *
     * @param string $text Text found just before the code block.
     */
    private function extractDescription(string $text): string
    {
        $text = trim($text);

        if (1 === preg_match('/(?:Example|Usage|Sample|Code):\s*(.+)$/i', $text, $matches) && isset($matches[1])) {
            return trim($matches[1]);
        }

        $sentences = $this->splitIntoSentences($text);

        return $sentences[count($sentences) - 1] ?? '';
    }

    /**
     * Appends a finished chunk to $chunks, optionally wrapped with metadata.
     *
     * With metadata enabled, "size" (bytes) and "line_count" are added to the
     * supplied metadata.
     *
     * @param array<int, array{content: string, metadata: array<string, mixed>}|string> $chunks          Result list, modified in place.
     * @param array<int, string>                                                        $lines           Lines forming the chunk.
     * @param array<string, mixed>                                                      $metadata        Metadata gathered so far.
     * @param bool                                                                      $includeMetadata Whether to wrap the content with metadata.
     */
    private function finalizeChunk(array &$chunks, array $lines, array $metadata, bool $includeMetadata): void
    {
        $content = implode("\n", $lines);

        if (! $includeMetadata) {
            $chunks[] = $content;

            return;
        }

        $chunks[] = [
            'content' => $content,
            'metadata' => array_merge($metadata, [
                'size' => strlen($content),
                'line_count' => count($lines),
            ]),
        ];
    }

    /**
     * Returns the last WORD_OVERLAP space-separated words of a chunk.
     *
     * @param string $chunk Chunk text to take the overlap from.
     */
    private function getOverlapText(string $chunk): string
    {
        return implode(' ', array_slice(explode(' ', $chunk), -self::WORD_OVERLAP));
    }

    /**
     * Joins the non-blank lines directly before $currentIndex with spaces.
     *
     * @param array<int, string> $lines        All lines of the document.
     * @param int                $currentIndex Index of the line to look back from (exclusive).
     * @param int                $lookback     Maximum number of lines to inspect.
     */
    private function getPrecedingText(array $lines, int $currentIndex, int $lookback = 5): string
    {
        $start = max(0, $currentIndex - $lookback);
        $window = array_slice($lines, $start, $currentIndex - $start);

        return implode(' ', array_filter($window, static fn (string $line): bool => '' !== trim($line)));
    }

    /**
     * Splits text after ".", "!" or "?" followed by whitespace.
     *
     * Falls back to the whole text as a single element if the split fails.
     *
     * @param string $text Text to split.
     *
     * @return array<int, string>
     */
    private function splitIntoSentences(string $text): array
    {
        $sentences = preg_split('/(?<=[.!?])\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);

        return false !== $sentences ? $sentences : [$text];
    }
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/evansims/openfga-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.