Code Context MCP Server
by fkesheh
Verified
- code-context-mcp
- utils
import { simpleGit } from "simple-git";
import fs from "fs";
import path from "path";
import db from "./db.js";
import { extensionToSplitter, splitDocument } from "./codeSplitter.js";
import config from "../config.js";
import { generateOllamaEmbeddings } from "./ollamaEmbeddings.js";
/**
* Clone a git repository if it doesn't exist locally
* @param repoUrl URL of the git repository
* @param localPath Local path to clone to
* @returns Path to the cloned repository
*/
export const cloneRepository = async (
repoUrl: string,
localPath: string
): Promise<string> => {
// Extract repository name from URL
const repoName = path.basename(repoUrl, ".git");
const fullPath = path.join(localPath, repoName);
// Check if repository already exists
if (fs.existsSync(fullPath)) {
console.error(`Repository already exists at ${fullPath}`);
return fullPath;
}
// Clone the repository
console.error(`Cloning repository ${repoUrl} to ${fullPath}`);
const git = simpleGit();
await git.clone(repoUrl, fullPath);
return fullPath;
};
/**
* Register a repository in the database
* @param repoPath Path to the local repository
* @returns Repository ID
*/
export const registerRepository = (repoPath: string): number => {
const repoName = path.basename(repoPath);
// Insert repository into database
const result = db.run(
"INSERT INTO repository (name, path) VALUES (:name, :path) ON CONFLICT(path) DO UPDATE SET last_updated = CURRENT_TIMESTAMP RETURNING id",
{ name: repoName, path: repoPath }
);
// If no id was returned, get it
if (!result.lastInsertRowid) {
const repo = db.get("SELECT id FROM repository WHERE path = :path", {
path: repoPath,
}) as { id: number };
return repo.id;
}
return result.lastInsertRowid as number;
};
/**
* Register a branch in the database
* @param branchName Name of the branch
* @param repoId Repository ID
* @param commitSha Latest commit SHA
* @returns Branch ID
*/
export const registerBranch = (
branchName: string,
repoId: number,
commitSha: string
): number => {
// Insert branch into database
const result = db.run(
`INSERT INTO branch (name, repository_id, last_commit_sha, status)
VALUES (:name, :repoId, :commitSha, 'pending')
ON CONFLICT(name, repository_id) DO UPDATE
SET last_commit_sha = :commitSha,
status = CASE
WHEN last_commit_sha <> :commitSha THEN 'pending'
ELSE status
END
RETURNING id`,
{ name: branchName, repoId, commitSha }
);
// If no id was returned, get it
if (!result.lastInsertRowid) {
const branch = db.get(
"SELECT id FROM branch WHERE name = :name AND repository_id = :repoId",
{ name: branchName, repoId }
) as { id: number };
return branch.id;
}
return result.lastInsertRowid as number;
};
interface RepositoryFile {
path: string;
name: string;
sha: string;
}
interface RepositoryFilesResult {
files: RepositoryFile[];
commitSha: string;
}
/**
* Get the files in a repository branch
* @param repoPath Path to the repository
* @param branchName Name of the branch
* @returns List of files with their metadata
*/
export const getRepositoryFiles = async (
repoPath: string,
branchName: string
): Promise<RepositoryFilesResult> => {
const git = simpleGit(repoPath);
// Checkout the branch
await git.checkout(branchName);
// Get the latest commit SHA
const latestCommit = await git.revparse([branchName]);
// Get the file tree
const files: RepositoryFile[] = [];
// Use git ls-tree to get all files recursively
const result = await git.raw(["ls-tree", "-r", branchName]);
const stdout = result.toString();
// Parse the output
const lines = stdout.split("\n").filter((line) => line.trim() !== "");
for (const line of lines) {
// Format: <mode> <type> <object> <file>
const [info, filePath] = line.split("\t");
const [, , sha] = info.split(" ");
if (filePath) {
files.push({
path: filePath,
name: path.basename(filePath),
sha,
});
}
}
return { files, commitSha: latestCommit };
};
/**
* Process files for a branch
* @param branchId Branch ID
* @param repoId Repository ID
* @param repoPath Path to the repository
* @param branchName Name of the branch
*/
export const processRepositoryFiles = async (
branchId: number,
repoId: number,
repoPath: string,
branchName: string
): Promise<void> => {
// Get files in the repository
const { files, commitSha } = await getRepositoryFiles(repoPath, branchName);
// Process each file
db.transaction((db) => {
for (const file of files) {
// Insert file information
const fileResult = db.run(
`INSERT INTO file (repository_id, path, sha, name, status)
VALUES (:repoId, :path, :sha, :name, :status)
ON CONFLICT(repository_id, path, sha) DO NOTHING
RETURNING id`,
{
repoId,
path: file.path,
sha: file.sha,
name: file.name,
status: "pending",
}
);
let fileId;
if (fileResult.lastInsertRowid) {
fileId = fileResult.lastInsertRowid;
} else {
// Get the existing file ID
const existingFile = db.get(
"SELECT id FROM file WHERE repository_id = :repoId AND path = :path AND sha = :sha",
{
repoId,
path: file.path,
sha: file.sha,
}
) as { id: number };
fileId = existingFile.id;
}
// Associate file with branch
db.run(
"INSERT INTO branch_file_association (branch_id, file_id) VALUES (:branchId, :fileId) ON CONFLICT DO NOTHING",
{
branchId,
fileId,
}
);
// Update branch status
db.run(
"UPDATE branch SET last_commit_sha = :commitSha, status = :status WHERE id = :branchId",
{
commitSha,
status: "files_processed",
branchId,
}
);
}
});
};
interface PendingFile {
id: number;
path: string;
sha: string;
}
interface FileChunk {
id: number;
chunk_number: number;
content: string;
file_id: number;
}
/**
* Process file content and split into chunks
* @param branchName Branch name
* @param repoPath Repository path
*/
export const processFileContents = async (
branchName: string,
repoPath: string
): Promise<void> => {
const git = simpleGit(repoPath);
// Checkout the branch
await git.checkout(branchName);
// Get repository and branch IDs
const repo = db.get("SELECT id FROM repository WHERE path = :path", {
path: repoPath,
}) as { id: number };
const branch = db.get(
"SELECT id FROM branch WHERE name = :name AND repository_id = :repoId",
{ name: branchName, repoId: repo.id }
) as { id: number };
// Get all pending files for the branch
const pendingFiles = db.all(
`SELECT f.id, f.path, f.sha
FROM file f
JOIN branch_file_association bfa ON f.id = bfa.file_id
WHERE f.status = 'pending' AND bfa.branch_id = :branchId`,
{ branchId: branch.id }
) as PendingFile[];
for (const file of pendingFiles) {
console.error(`Processing file: ${file.path}`);
const extension = file.path.split(".").pop()?.toLowerCase();
const splitType = extension ? extensionToSplitter(extension) : "ignore";
if (splitType !== "ignore") {
try {
// Get file content
const filePath = path.join(repoPath, file.path);
// Skip if file doesn't exist (might have been deleted)
if (!fs.existsSync(filePath)) {
console.error(`File ${file.path} doesn't exist, skipping`);
continue;
}
let content = fs.readFileSync(filePath, "utf-8");
// Check for null bytes in the content
if (content.includes("\0")) {
console.error(
`File ${file.path} contains null bytes. Removing them.`
);
content = content.replace(/\0/g, "");
}
// Check if the content is valid UTF-8
try {
new TextDecoder("utf-8", { fatal: true }).decode(
new TextEncoder().encode(content)
);
} catch (e) {
console.error(
`File ${file.path} contains invalid UTF-8 characters. Replacing them.`
);
content = content.replace(/[^\x00-\x7F]/g, ""); // Remove non-ASCII characters
}
// Truncate content if it's too long
const maxLength = 1000000; // Adjust this value based on your database column size
if (content.length > maxLength) {
console.error(
`File ${file.path} content is too long. Truncating to ${maxLength} characters.`
);
content = content.substring(0, maxLength);
}
// Split the document
const chunks = await splitDocument(file.path, content);
// Store chunks in the database
db.transaction((db) => {
for (let i = 0; i < chunks.length; i++) {
db.run(
`INSERT INTO file_chunk (file_id, content, chunk_number)
VALUES (:fileId, :content, :chunkNumber)
ON CONFLICT(file_id, chunk_number) DO NOTHING`,
{
fileId: file.id,
content: chunks[i].pageContent,
chunkNumber: i + 1,
}
);
}
// Update file status to 'fetched'
db.run("UPDATE file SET status = :status WHERE id = :fileId", {
status: "fetched",
fileId: file.id,
});
});
} catch (error) {
console.error(`Error processing file ${file.path}:`, error);
}
} else {
// Update file status to 'done' for ignored files
db.run("UPDATE file SET status = :status WHERE id = :fileId", {
status: "done",
fileId: file.id,
});
}
}
};
/**
* Generate embeddings for file chunks
* @param branchName Branch name
* @param repoPath Repository path
*/
export const generateEmbeddings = async (
branchName: string,
repoPath: string
): Promise<void> => {
// Get repository ID
const repo = db.get("SELECT id FROM repository WHERE path = :path", {
path: repoPath,
}) as { id: number };
if (!repo) {
throw new Error(`Repository not found at path ${repoPath}`);
}
// Get branch ID
const branch = db.get(
"SELECT id FROM branch WHERE name = :name AND repository_id = :repoId",
{ name: branchName, repoId: repo.id }
) as { id: number };
if (!branch) {
throw new Error(`Branch ${branchName} not found in repository`);
}
// Update branch status to processing
db.run("UPDATE branch SET status = 'processing_embeddings' WHERE id = :id", {
id: branch.id,
});
// Get chunks that need embeddings
const chunks = db.all(
`SELECT fc.id, fc.content
FROM file_chunk fc
JOIN file f ON fc.file_id = f.id
JOIN branch_file_association bfa ON f.id = bfa.file_id
WHERE bfa.branch_id = :branchId
AND fc.embedding IS NULL`,
{ branchId: branch.id }
) as { id: number; content: string }[];
console.error(`Found ${chunks.length} chunks without embeddings`);
// Process in batches
for (let i = 0; i < chunks.length; i += config.BATCH_SIZE) {
const batchChunks = chunks.slice(i, i + config.BATCH_SIZE);
const batchTexts = batchChunks.map((chunk) => chunk.content);
try {
// Generate embeddings with HuggingFace
const embeddings = await generateOllamaEmbeddings(batchTexts);
// Update chunks with embeddings
db.transaction((db) => {
for (let j = 0; j < batchChunks.length; j++) {
const chunkId = batchChunks[j].id;
const embedding = JSON.stringify(embeddings[j]);
db.run(
`UPDATE file_chunk SET
embedding = :embedding,
model_version = :modelVersion,
token_count = :tokenCount,
updated_at = :updatedAt
WHERE id = :id`,
{
embedding,
modelVersion: config.EMBEDDING_MODEL.model,
tokenCount: batchTexts[j].length,
updatedAt: new Date().toISOString(),
id: chunkId,
}
);
}
})();
console.error(
`Processed embeddings for batch ${
i / config.BATCH_SIZE + 1
}/${Math.ceil(chunks.length / config.BATCH_SIZE)}`
);
} catch (error) {
console.error("Error generating embeddings:", error);
throw error;
}
}
// Update branch status to complete
db.run("UPDATE branch SET status = 'complete' WHERE id = :id", {
id: branch.id,
});
};
/**
* Get the default branch of a repository
* @param owner GitHub repository owner
* @param repo GitHub repository name
* @returns Default branch name (usually main or master)
*/
export const getDefaultBranch = async (
owner: string,
repo: string
): Promise<string> => {
try {
// Check if we have this repo locally
// Note: Repository is stored with just the repo name, not in owner/repo structure
const repoDir = path.join(config.REPO_CACHE_DIR, repo);
if (fs.existsSync(repoDir)) {
console.error(`Found local repo at ${repoDir}, determining default branch`);
// If we have the repo locally, use git to determine the default branch
const git = simpleGit(repoDir);
// Fetch the latest info from the remote
await git.fetch();
// Get the symbolic-ref of HEAD
const headRef = await git.raw([
"symbolic-ref",
"refs/remotes/origin/HEAD",
]);
// Extract the branch name
const branchMatch = headRef.trim().match(/refs\/remotes\/origin\/(.+)$/);
if (branchMatch && branchMatch[1]) {
console.error(`Found default branch: ${branchMatch[1]}`);
return branchMatch[1];
}
} else {
console.error(`Repository not found at ${repoDir}`);
}
// Fallback to main as the default branch
console.error(`Fallback to 'main' as default branch`);
return "main";
} catch (error) {
console.error("Error getting default branch:", error);
// Default to 'main' if there's an error
return "main";
}
};
/**
* Query a repository using embeddings to find relevant code
* @param query Search query text
* @param owner Repository owner
* @param repo Repository name
* @param branch Branch name
* @param topK Number of results to return
* @returns Array of matching code chunks with metadata
*/
export const queryRepository = async (
query: string,
owner: string,
repo: string,
branch: string,
topK: number = 5
): Promise<any[]> => {
// Generate embedding for the query
const queryEmbeddingList = await generateOllamaEmbeddings([query]);
const queryEmbedding = queryEmbeddingList[0];
// Get repository metadata from the database
const repoResult = db.get("SELECT id FROM repository WHERE name = :repo", {
repo,
}) as { id: number } | undefined;
if (!repoResult) {
throw new Error(`Repository ${owner}/${repo} not found in database`);
}
const repoId = repoResult.id;
// Get branch metadata
const branchResult = db.get(
"SELECT id FROM branch WHERE name = :branch AND repository_id = :repoId",
{ branch, repoId }
) as { id: number } | undefined;
if (!branchResult) {
throw new Error(
`Branch ${branch} not found in repository ${owner}/${repo}`
);
}
// Query for similar chunks using vector similarity
const chunkResults = db.all(
`SELECT
c.id,
c.content,
c.chunk_number,
f.path,
vector_similarity(c.embedding, :queryEmbedding) as similarity
FROM
chunk c
JOIN
file f ON c.file_id = f.id
JOIN
branch b ON b.repository_id = f.repository_id
WHERE
b.id = :branchId AND
f.repository_id = :repoId AND
c.embedding IS NOT NULL
ORDER BY
similarity DESC
LIMIT :topK`,
{
branchId: branchResult.id,
repoId,
queryEmbedding,
topK,
}
);
return chunkResults;
};