package indexing
import (
"bytes"
"context"
"fmt"
"log/slog"
"path/filepath"
"strconv"
"strings"
"github.com/helixml/kodit/application/handler"
"github.com/helixml/kodit/domain/chunk"
"github.com/helixml/kodit/domain/enrichment"
"github.com/helixml/kodit/domain/repository"
"github.com/helixml/kodit/domain/task"
"github.com/helixml/kodit/infrastructure/chunking"
)

// binaryProbeSize is the maximum number of leading bytes inspected for null
// bytes when detecting binary files.
const binaryProbeSize = 8192

// FileContentSource reads file content at a specific commit.
type FileContentSource interface {
FileContent(ctx context.Context, localPath string, commitSHA string, filePath string) ([]byte, error)
}
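
// A concrete implementation would typically shell out to git to read a blob at
// the requested commit. A minimal sketch, assuming an os/exec based approach
// (not necessarily the production implementation):
//
//	type gitFileContentSource struct{}
//
//	func (gitFileContentSource) FileContent(ctx context.Context, localPath, commitSHA, filePath string) ([]byte, error) {
//		// git show requires a repository-relative path; see relativeFilePath below.
//		out, err := exec.CommandContext(ctx, "git", "-C", localPath, "show", commitSHA+":"+filePath).Output()
//		if err != nil {
//			return nil, fmt.Errorf("git show %s:%s: %w", commitSHA, filePath, err)
//		}
//		return out, nil
//	}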

// ChunkFiles splits the files of a commit into fixed-size text chunks and
// persists them as chunk enrichments with line ranges and commit, file, and
// repository associations.
type ChunkFiles struct {
repoStore repository.RepositoryStore
enrichmentStore enrichment.EnrichmentStore
associationStore enrichment.AssociationStore
lineRangeStore chunk.LineRangeStore
fileStore repository.FileStore
fileContent FileContentSource
params chunking.ChunkParams
trackerFactory handler.TrackerFactory
logger *slog.Logger
}

// NewChunkFiles creates a new ChunkFiles handler.
func NewChunkFiles(
repoStore repository.RepositoryStore,
enrichmentStore enrichment.EnrichmentStore,
associationStore enrichment.AssociationStore,
lineRangeStore chunk.LineRangeStore,
fileStore repository.FileStore,
fileContent FileContentSource,
params chunking.ChunkParams,
trackerFactory handler.TrackerFactory,
logger *slog.Logger,
) *ChunkFiles {
return &ChunkFiles{
repoStore: repoStore,
enrichmentStore: enrichmentStore,
associationStore: associationStore,
lineRangeStore: lineRangeStore,
fileStore: fileStore,
fileContent: fileContent,
params: params,
trackerFactory: trackerFactory,
logger: logger,
}
}

// Execute processes the EXTRACT_SNIPPETS_FOR_COMMIT task using fixed-size text
// chunking. Commits that already have chunk enrichments are skipped, as are
// files that are not indexable or that appear to be binary.
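//
// A minimal sketch of an invocation; the payload keys shown are assumptions
// for illustration only, and handler.ExtractCommitPayload defines the real
// contract:
//
//	err := chunkFiles.Execute(ctx, map[string]any{
//		"repository_id": int64(42),  // hypothetical key
//		"commit_sha":    "0123abcd", // hypothetical key
//	})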
func (h *ChunkFiles) Execute(ctx context.Context, payload map[string]any) error {
cp, err := handler.ExtractCommitPayload(payload)
if err != nil {
return err
}
tracker := h.trackerFactory.ForOperation(
task.OperationExtractSnippetsForCommit,
task.TrackableTypeRepository,
cp.RepoID(),
)
existing, err := h.enrichmentStore.Find(ctx,
enrichment.WithCommitSHA(cp.CommitSHA()),
enrichment.WithType(enrichment.TypeDevelopment),
enrichment.WithSubtype(enrichment.SubtypeChunk),
)
if err != nil {
return fmt.Errorf("check existing chunks: %w", err)
}
if len(existing) > 0 {
tracker.Skip(ctx, "Chunks already created for commit")
return nil
}
repo, err := h.repoStore.FindOne(ctx, repository.WithID(cp.RepoID()))
if err != nil {
return fmt.Errorf("get repository: %w", err)
}
clonedPath := repo.WorkingCopy().Path()
if clonedPath == "" {
return fmt.Errorf("repository %d has never been cloned", cp.RepoID())
}
files, err := h.fileStore.Find(ctx,
repository.WithCommitSHA(cp.CommitSHA()),
repository.WithOrderAsc("path"),
)
if err != nil {
return fmt.Errorf("get commit files: %w", err)
}
if len(files) == 0 {
tracker.Skip(ctx, "No files found for commit")
return nil
}
tracker.SetTotal(ctx, len(files))
repoIDStr := strconv.FormatInt(cp.RepoID(), 10)
processed := 0
for _, f := range files {
tracker.SetCurrent(ctx, processed, fmt.Sprintf("Chunking %s", f.Path()))
if !isIndexable(f.Path()) {
processed++
continue
}
relPath := relativeFilePath(f.Path(), clonedPath)
content, readErr := h.fileContent.FileContent(ctx, clonedPath, cp.CommitSHA(), relPath)
if readErr != nil {
h.logger.Warn("failed to read file content",
slog.String("path", f.Path()),
slog.String("error", readErr.Error()),
)
processed++
continue
}
if isBinary(content) {
processed++
continue
}
textChunks, chunkErr := chunking.NewTextChunks(string(content), h.params)
if chunkErr != nil {
h.logger.Warn("failed to chunk file",
slog.String("path", f.Path()),
slog.String("error", chunkErr.Error()),
)
processed++
continue
}
for _, ch := range textChunks.All() {
e := enrichment.NewChunkEnrichmentWithLanguage(ch.Content(), f.Extension())
saved, saveErr := h.enrichmentStore.Save(ctx, e)
if saveErr != nil {
return fmt.Errorf("save chunk enrichment: %w", saveErr)
}
lr := chunk.NewLineRange(saved.ID(), ch.StartLine(), ch.EndLine())
if _, err := h.lineRangeStore.Save(ctx, lr); err != nil {
return fmt.Errorf("save chunk line range: %w", err)
}
if _, err := h.associationStore.Save(ctx, enrichment.CommitAssociation(saved.ID(), cp.CommitSHA())); err != nil {
return fmt.Errorf("save commit association: %w", err)
}
if f.ID() != 0 {
if _, err := h.associationStore.Save(ctx, enrichment.FileAssociation(saved.ID(), strconv.FormatInt(f.ID(), 10))); err != nil {
return fmt.Errorf("save file association: %w", err)
}
}
if _, err := h.associationStore.Save(ctx, enrichment.RepositoryAssociation(saved.ID(), repoIDStr)); err != nil {
return fmt.Errorf("save repository association: %w", err)
}
}
processed++
}
h.logger.Info("text chunks created",
slog.Int("files", len(files)),
slog.String("commit", handler.ShortSHA(cp.CommitSHA())),
)
return nil
}

// relativeFilePath converts a file path to a path relative to a git repository.
// File records from legacy database migrations may contain absolute paths instead of
// repository-relative paths. This function normalizes both cases so that git show
// (which requires repo-relative paths) works correctly.
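//
// For example, with a working copy cloned at /data/clones/myrepo (illustrative
// paths only):
//
//	relativeFilePath("pkg/util.go", "/data/clones/myrepo")                        // "pkg/util.go" (already relative)
//	relativeFilePath("/data/clones/myrepo/pkg/util.go", "/data/clones/myrepo")    // "pkg/util.go" (relative to the clone)
//	relativeFilePath("/old-data/repos/myrepo/cmd/main.go", "/data/clones/myrepo") // "cmd/main.go" (legacy pattern fallback)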
func relativeFilePath(filePath, clonedPath string) string {
if !filepath.IsAbs(filePath) {
return filePath
}
// If the path starts with the current clone directory, strip it.
clonedPath = filepath.Clean(clonedPath)
if rel, err := filepath.Rel(clonedPath, filePath); err == nil && !strings.HasPrefix(rel, "..") {
return rel
}
// Legacy absolute paths follow the pattern: /<data-dir>/<type>/<repo-name>/<relative-path>
// where <type> is "clones" or "repos". Find the last such segment and extract the
// relative portion after the repo directory.
parts := strings.Split(filepath.Clean(filePath), string(filepath.Separator))
lastIdx := -1
for i, part := range parts {
if part == "clones" || part == "repos" {
lastIdx = i
}
}
if lastIdx >= 0 && lastIdx+2 < len(parts) {
return filepath.Join(parts[lastIdx+2:]...)
}
return filePath
}

// indexableExtensions lists file extensions that contain human-written source
// code or documentation worth indexing. Everything else (lock files, images,
// binary formats, data files) is skipped.
var indexableExtensions = map[string]bool{
// Go
".go": true,
// Python
".py": true, ".pyi": true, ".pyx": true,
// JavaScript / TypeScript
".js": true, ".mjs": true, ".cjs": true, ".jsx": true,
".ts": true, ".mts": true, ".cts": true, ".tsx": true,
// Ruby
".rb": true, ".erb": true,
// Rust
".rs": true,
// Java / Kotlin / Scala / Groovy
".java": true, ".kt": true, ".kts": true, ".scala": true, ".groovy": true,
// C / C++ / Objective-C
".c": true, ".h": true, ".cpp": true, ".cc": true, ".cxx": true,
".hpp": true, ".hxx": true, ".m": true, ".mm": true,
// C# / F#
".cs": true, ".fs": true, ".fsx": true,
// PHP
".php": true,
// Swift
".swift": true,
// Shell
".sh": true, ".bash": true, ".zsh": true, ".fish": true,
// SQL
".sql": true,
// R
".r": true,
// Lua
".lua": true,
// Perl
".pl": true, ".pm": true,
// Elixir / Erlang
".ex": true, ".exs": true, ".erl": true, ".hrl": true,
// Haskell
".hs": true,
// Clojure
".clj": true, ".cljs": true, ".cljc": true,
// Dart
".dart": true,
// Zig / Nim
".zig": true, ".nim": true,
// Julia
".jl": true,
// OCaml
".ml": true, ".mli": true,
// V / D
".v": true, ".d": true,
// Web
".html": true, ".htm": true, ".css": true, ".scss": true,
".sass": true, ".less": true, ".vue": true, ".svelte": true,
// Documentation
".md": true, ".mdx": true, ".rst": true, ".adoc": true, ".tex": true,
// IDL / Schema
".proto": true, ".graphql": true, ".gql": true, ".thrift": true,
}

// isIndexable returns true if the file extension is in the whitelist of
// source code and documentation formats worth indexing.
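//
// For example, isIndexable("cmd/main.go") and isIndexable("docs/README.md") are
// true, while isIndexable("go.sum") and isIndexable("logo.png") are false.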
func isIndexable(path string) bool {
ext := strings.ToLower(filepath.Ext(path))
return indexableExtensions[ext]
}

// isBinary returns true if the content contains a null byte within its first
// binaryProbeSize bytes.
func isBinary(content []byte) bool {
probe := content
if len(probe) > binaryProbeSize {
probe = probe[:binaryProbeSize]
}
return bytes.ContainsRune(probe, 0)
}