DocumentationChunker.php•15.8 kB
<?php
declare(strict_types=1);
namespace OpenFGA\MCP\Documentation;
use function array_slice;
use function count;
use function is_int;
use function strlen;
/**
 * Splits Markdown-style documentation into smaller pieces.
 *
 * Several independent strategies are offered: by fenced code block, by
 * header, by line count, by approximate byte size, by "Source" comment
 * markers, and a combined "smart" strategy. The class holds no state, so
 * every method can be called repeatedly on the same instance.
 */
final class DocumentationChunker
{
    /** Default upper bound, in bytes, for size-based chunking. */
    private const int DEFAULT_CHUNK_SIZE = 3000;

    /** A chunk must be larger than this many bytes before it may be split off. */
    private const int MIN_CHUNK_SIZE = 500;

    /** Maximum number of lines repeated at the start of the next line-based chunk. */
    private const int LINE_OVERLAP = 10;

    /** Number of words carried over between consecutive size-based chunks. */
    private const int OVERLAP_WORDS = 20;

    /** Number of plain-text lines buffered before a text chunk is emitted. */
    private const int TEXT_BUFFER_LINES = 50;

    /**
     * Separates fenced code blocks from the surrounding text.
     *
     * A fence is a line consisting of three backticks optionally followed by
     * a word-character language tag. Code chunks include both fence lines.
     * Text between code blocks is emitted in runs of at most 50 lines. A code
     * block that is still open at the end of the input is emitted as a code
     * chunk without a closing fence.
     *
     * @param  string                                                                  $content
     * @return array<int, array{type: string, language: string|null, content: string}>
     */
    public function chunkByCodeBlocks(string $content): array
    {
        $chunks = [];
        $codeLines = [];
        $textBuffer = [];
        $inCodeBlock = false;
        $codeLanguage = null;

        foreach (explode("\n", $content) as $line) {
            $fenceLanguage = $this->fenceLanguage($line);

            if (null !== $fenceLanguage) {
                if ($inCodeBlock) {
                    // Closing fence: the block is complete, fence lines included.
                    $codeLines[] = $line;
                    $chunks[] = [
                        'type' => 'code',
                        'language' => $codeLanguage,
                        'content' => implode("\n", $codeLines),
                    ];
                    $codeLines = [];
                    $inCodeBlock = false;
                    $codeLanguage = null;

                    continue;
                }

                // Opening fence: flush pending text so chunk order follows the document.
                if ([] !== $textBuffer) {
                    $chunks[] = $this->textChunk($textBuffer);
                    $textBuffer = [];
                }

                $inCodeBlock = true;
                $codeLanguage = $fenceLanguage;
                $codeLines = [$line];

                continue;
            }

            if ($inCodeBlock) {
                $codeLines[] = $line;

                continue;
            }

            $textBuffer[] = $line;

            if (self::TEXT_BUFFER_LINES <= count($textBuffer)) {
                $chunks[] = $this->textChunk($textBuffer);
                $textBuffer = [];
            }
        }

        if ([] !== $textBuffer) {
            $chunks[] = $this->textChunk($textBuffer);
        }

        // Lines are only collected here while a fence is open, so leftovers
        // always belong to an unterminated code block.
        if ([] !== $codeLines) {
            $chunks[] = [
                'type' => $inCodeBlock ? 'code' : 'text',
                'language' => $codeLanguage,
                'content' => implode("\n", $codeLines),
            ];
        }

        return $chunks;
    }

    /**
     * Splits content at Markdown ATX headers ("# " through "###### ").
     *
     * Each chunk starts with its header line. Content before the first header
     * is returned with a null header and level 0. Lines inside fenced code
     * blocks are never treated as headers, so "# comment" lines in shell or
     * YAML snippets do not split a section.
     *
     * @param  string                                                              $content
     * @return array<int, array{header: string|null, content: string, level: int}>
     */
    public function chunkByHeaders(string $content): array
    {
        if ('' === $content) {
            return [];
        }

        $chunks = [];
        $currentChunk = [];
        $currentHeader = null;
        $currentLevel = 0;
        $inCodeBlock = false;

        foreach (explode("\n", $content) as $line) {
            // Same fence rule as smartChunk(): any line starting with ``` toggles the state.
            if (str_starts_with($line, '```')) {
                $inCodeBlock = ! $inCodeBlock;
            }

            if (! $inCodeBlock && 1 === preg_match('/^(#{1,6}) (.+)$/', $line, $matches)) {
                if ([] !== $currentChunk) {
                    $chunks[] = [
                        'header' => $currentHeader,
                        'content' => implode("\n", $currentChunk),
                        'level' => $currentLevel,
                    ];
                }

                $currentHeader = trim($matches[2]);
                $currentLevel = strlen($matches[1]);
                $currentChunk = [$line];

                continue;
            }

            $currentChunk[] = $line;
        }

        // Non-empty input always yields at least one line, so there is always a final chunk.
        $chunks[] = [
            'header' => $currentHeader,
            'content' => implode("\n", $currentChunk),
            'level' => $currentLevel,
        ];

        return $chunks;
    }

    /**
     * Splits content into chunks of at most $maxLines lines.
     *
     * Consecutive chunks overlap by up to 10 lines. The overlap is always kept
     * smaller than the chunk size so every chunk adds at least one new line
     * and never exceeds $maxLines. Values of $maxLines below 1 are treated
     * as 1. A trailing chunk is only emitted when it holds lines that were
     * not already part of the previous chunk.
     *
     * @param  string             $content
     * @param  int                $maxLines
     * @return array<int, string>
     */
    public function chunkByLines(string $content, int $maxLines = 100): array
    {
        $limit = max(1, $maxLines);
        $overlap = min(self::LINE_OVERLAP, $limit - 1);

        $chunks = [];
        $window = [];
        $freshLines = 0;

        foreach (explode("\n", $content) as $line) {
            $window[] = $line;
            ++$freshLines;

            if (count($window) >= $limit) {
                $chunks[] = implode("\n", $window);
                // array_slice() with offset -0 would return the whole array, hence the guard.
                $window = 0 < $overlap ? array_slice($window, -$overlap) : [];
                $freshLines = 0;
            }
        }

        if (0 < $freshLines) {
            $chunks[] = implode("\n", $window);
        }

        return $chunks;
    }

    /**
     * Groups sentences into chunks of roughly $maxSize bytes.
     *
     * Sentences are re-joined with single spaces. A chunk is closed when the
     * next sentence would push it past $maxSize, but only once the chunk is
     * larger than 500 bytes. The last 20 space-separated words of a closed
     * chunk are repeated at the start of the next one. Sizes are measured
     * with strlen(), i.e. in bytes.
     *
     * @param  string             $content
     * @param  int                $maxSize
     * @return array<int, string>
     */
    public function chunkBySize(string $content, int $maxSize = self::DEFAULT_CHUNK_SIZE): array
    {
        $chunks = [];
        $currentChunk = '';

        foreach ($this->splitIntoSentences($content) as $sentence) {
            $currentLength = strlen($currentChunk);
            $wouldOverflow = $currentLength + strlen($sentence) > $maxSize;

            if ($wouldOverflow && self::MIN_CHUNK_SIZE < $currentLength) {
                $chunks[] = trim($currentChunk);
                $currentChunk = $this->getOverlapText($currentChunk) . ' ' . $sentence;

                continue;
            }

            $currentChunk .= ' ' . $sentence;
        }

        $remainder = trim($currentChunk);

        if ('' !== $remainder) {
            $chunks[] = $remainder;
        }

        return $chunks;
    }

    /**
     * Splits content on "<!-- Source: ... -->" and "<!-- End of ... -->" marker lines.
     *
     * Marker lines themselves are not part of any chunk. Lines collected
     * while a source block is open are typed "source_block" and carry the
     * trimmed source name; lines outside any source block are typed
     * "general" with a null source, wherever they appear in the input.
     *
     * @param  string                                                                $content
     * @return array<int, array{source: string|null, content: string, type: string}>
     */
    public function chunkBySourceBlocks(string $content): array
    {
        $chunks = [];
        $pending = [];
        $inSourceBlock = false;
        $sourceFile = null;

        foreach (explode("\n", $content) as $line) {
            if (1 === preg_match('/^<!-- Source: (.+) -->$/', $line, $matches)) {
                if ([] !== $pending) {
                    $chunks[] = $this->sourceChunk($pending, $sourceFile, $inSourceBlock);
                }

                $inSourceBlock = true;
                $sourceFile = trim($matches[1]);
                $pending = [];

                continue;
            }

            if (1 === preg_match('/^<!-- End of .+ -->$/', $line)) {
                if ([] !== $pending) {
                    $chunks[] = $this->sourceChunk($pending, $sourceFile, $inSourceBlock);
                }

                $inSourceBlock = false;
                $sourceFile = null;
                $pending = [];

                continue;
            }

            $pending[] = $line;
        }

        if ([] !== $pending) {
            $chunks[] = $this->sourceChunk($pending, $sourceFile, $inSourceBlock);
        }

        return $chunks;
    }

    /**
     * Collects every closed fenced code block together with a short description.
     *
     * The description is derived from up to five lines preceding the opening
     * fence. "line_number" is the 1-based line of the opening fence. The
     * "code" value excludes both fence lines. A block that is never closed is
     * not reported.
     *
     * @param  string                                                                                   $content
     * @return array<int, array{language: string, code: string, description: string, line_number: int}>
     */
    public function extractCodeExamples(string $content): array
    {
        $examples = [];
        $lines = explode("\n", $content);
        $inCodeBlock = false;
        $currentCode = [];
        $codeLanguage = null;
        $precedingText = '';

        foreach ($lines as $index => $line) {
            $fenceLanguage = $this->fenceLanguage($line);

            if (null === $fenceLanguage) {
                if ($inCodeBlock) {
                    $currentCode[] = $line;
                }

                continue;
            }

            if (! $inCodeBlock) {
                $inCodeBlock = true;
                $codeLanguage = $fenceLanguage;
                $precedingText = $this->getPrecedingText($lines, $index, 5);

                continue;
            }

            $examples[] = [
                'language' => $codeLanguage ?? 'plaintext',
                'code' => implode("\n", $currentCode),
                'description' => $this->extractDescription($precedingText),
                // 0-based index of the first code line == 1-based line of the opening fence.
                'line_number' => $index - count($currentCode),
            ];
            $currentCode = [];
            $inCodeBlock = false;
            $codeLanguage = null;
            $precedingText = '';
        }

        return $examples;
    }

    /**
     * Chunks content by size while respecting headers and fenced code blocks.
     *
     * Recognised options:
     *  - "max_size" (int): target chunk size in bytes; any non-int value falls back to 3000.
     *  - "preserve_headers" (cast to bool, default true): when a section is split
     *    because of its size, start the next chunk with a "<header> (continued)" line.
     *  - "include_metadata" (cast to bool, default true): return arrays with
     *    "content" and "metadata" instead of plain strings.
     *
     * A header outside a code block starts a new chunk once the current chunk
     * is larger than 500 bytes. Chunks are never split inside a fenced code
     * block. A single line longer than the maximum size is split with
     * chunkBySize(); if such a line contains only whitespace it is dropped.
     *
     * @param  array<string, mixed>                                                      $options
     * @param  string                                                                    $content
     * @return array<int, array{content: string, metadata: array<string, mixed>}|string>
     */
    public function smartChunk(string $content, array $options = []): array
    {
        /** @var mixed $maxSizeValue */
        $maxSizeValue = $options['max_size'] ?? null;
        $maxSize = is_int($maxSizeValue) ? $maxSizeValue : self::DEFAULT_CHUNK_SIZE;
        $preserveHeaders = (bool) ($options['preserve_headers'] ?? true);
        $includeMetadata = (bool) ($options['include_metadata'] ?? true);

        $chunks = [];
        $currentChunk = [];
        $currentMetadata = [];
        $inCodeBlock = false;
        $currentHeader = null;
        $currentLevel = 2;
        $currentSize = 0;

        foreach (explode("\n", $content) as $line) {
            $lineSize = strlen($line);

            if (str_starts_with($line, '```')) {
                $inCodeBlock = ! $inCodeBlock;
            }

            if (! $inCodeBlock && 1 === preg_match('/^(#{1,6}) (.+)$/', $line, $matches)) {
                if (self::MIN_CHUNK_SIZE < $currentSize) {
                    $this->finalizeChunk($chunks, $currentChunk, $currentMetadata, $includeMetadata);
                    $currentChunk = [];
                    $currentSize = 0;
                }

                $currentHeader = trim($matches[2]);
                $currentLevel = strlen($matches[1]);
                $currentMetadata['header'] = $currentHeader;
                $currentMetadata['header_level'] = $currentLevel;
            }

            // A single line longer than the limit is split sentence by sentence.
            if ($lineSize > $maxSize && ! $inCodeBlock) {
                if ([] !== $currentChunk) {
                    $this->finalizeChunk($chunks, $currentChunk, $currentMetadata, $includeMetadata);
                }

                $sentenceChunks = $this->chunkBySize($line, $maxSize);

                // chunkBySize() trims its output, so a whitespace-only line yields
                // no chunks at all; there is nothing to emit or to carry forward.
                if ([] === $sentenceChunks) {
                    $currentChunk = [];
                    $currentSize = 0;

                    continue;
                }

                // The last piece stays open so following lines can be appended to it.
                $carried = array_pop($sentenceChunks);

                foreach ($sentenceChunks as $piece) {
                    $this->finalizeChunk($chunks, [$piece], $currentMetadata, $includeMetadata);
                }

                $currentChunk = [$carried];
                $currentSize = strlen($carried);

                continue;
            }

            $currentChunk[] = $line;
            $currentSize += $lineSize;

            if ($currentSize < $maxSize || $inCodeBlock) {
                continue;
            }

            $this->finalizeChunk($chunks, $currentChunk, $currentMetadata, $includeMetadata);

            if ($preserveHeaders && null !== $currentHeader) {
                $continuation = str_repeat('#', $currentLevel) . ' ' . $currentHeader . ' (continued)';
                $currentChunk = [$continuation];
                $currentSize = strlen($continuation);
            } else {
                $currentChunk = [];
                $currentSize = 0;
            }
        }

        if ([] !== $currentChunk) {
            $this->finalizeChunk($chunks, $currentChunk, $currentMetadata, $includeMetadata);
        }

        return $chunks;
    }

    /**
     * Derives a one-line description from the text preceding a code block.
     *
     * Text following an "Example:", "Usage:", "Sample:" or "Code:" label
     * (case-insensitive) wins; otherwise the last sentence of the text is
     * used. Returns an empty string when no sentence is available.
     */
    private function extractDescription(string $text): string
    {
        $text = trim($text);

        if (1 === preg_match('/(?:Example|Usage|Sample|Code):\s*(.+)$/i', $text, $matches)) {
            return trim($matches[1]);
        }

        $sentences = $this->splitIntoSentences($text);

        return $sentences[count($sentences) - 1] ?? '';
    }

    /**
     * Returns the language of a fence line, or null when the line is not a fence.
     *
     * A fence is exactly three backticks followed by zero or more word
     * characters; a fence without a language tag is reported as "plaintext".
     */
    private function fenceLanguage(string $line): ?string
    {
        if (1 !== preg_match('/^```(\w*)$/', $line, $matches)) {
            return null;
        }

        $language = $matches[1] ?? '';

        return '' !== $language ? $language : 'plaintext';
    }

    /**
     * Appends a chunk built from $lines to $chunks.
     *
     * With metadata enabled the chunk is an array holding the joined content
     * plus $metadata extended by "size" (bytes) and "line_count"; otherwise
     * only the joined content string is appended.
     *
     * @param array<int, array{content: string, metadata: array<string, mixed>}|string> $chunks
     * @param array<int, string>                                                        $lines
     * @param array<string, mixed>                                                      $metadata
     * @param bool                                                                      $includeMetadata
     */
    private function finalizeChunk(array &$chunks, array $lines, array $metadata, bool $includeMetadata): void
    {
        $content = implode("\n", $lines);

        if (! $includeMetadata) {
            $chunks[] = $content;

            return;
        }

        $chunks[] = [
            'content' => $content,
            'metadata' => array_merge($metadata, [
                'size' => strlen($content),
                'line_count' => count($lines),
            ]),
        ];
    }

    /**
     * Returns the last 20 space-separated words of a chunk, joined by spaces.
     */
    private function getOverlapText(string $chunk): string
    {
        return implode(' ', array_slice(explode(' ', $chunk), -self::OVERLAP_WORDS));
    }

    /**
     * Joins the non-blank lines found in the $lookback lines before $currentIndex.
     *
     * @param array<int, string> $lines
     * @param int                $currentIndex
     * @param int                $lookback
     */
    private function getPrecedingText(array $lines, int $currentIndex, int $lookback = 5): string
    {
        $start = max(0, $currentIndex - $lookback);
        $precedingLines = array_slice($lines, $start, $currentIndex - $start);

        return implode(' ', array_filter(
            $precedingLines,
            static fn (string $line): bool => '' !== trim($line),
        ));
    }

    /**
     * Builds one chunk for chunkBySourceBlocks().
     *
     * @param  array<int, string>                                        $lines
     * @param  string|null                                               $source
     * @param  bool                                                      $insideSourceBlock
     * @return array{source: string|null, content: string, type: string}
     */
    private function sourceChunk(array $lines, ?string $source, bool $insideSourceBlock): array
    {
        return [
            'source' => $source,
            'content' => implode("\n", $lines),
            'type' => $insideSourceBlock ? 'source_block' : 'general',
        ];
    }

    /**
     * Splits text after ".", "!" or "?" followed by whitespace.
     *
     * Empty pieces are discarded. If the split itself fails, the whole text
     * is returned as a single sentence.
     *
     * @param  string             $text
     * @return array<int, string>
     */
    private function splitIntoSentences(string $text): array
    {
        $sentences = preg_split('/(?<=[.!?])\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);

        return false !== $sentences ? $sentences : [$text];
    }

    /**
     * Builds a plain-text chunk for chunkByCodeBlocks().
     *
     * @param  array<int, string>                                          $lines
     * @return array{type: string, language: string|null, content: string}
     */
    private function textChunk(array $lines): array
    {
        return [
            'type' => 'text',
            'language' => null,
            'content' => implode("\n", $lines),
        ];
    }
}