// resultStorage.ts
export interface LLMToolCall {
    toolCallId: string;
    toolName: string;
    parameters: Record<string, unknown>;
}
export type ExpectedToolCall = Omit<LLMToolCall, "toolCallId"> & {
    optional?: boolean;
};
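// Illustrative sketch only: a hypothetical expected tool call. The tool name and
// parameters below are made up for demonstration and are not taken from the real
// accuracy test definitions.
export const exampleExpectedToolCall: ExpectedToolCall = {
    toolName: "find",
    parameters: { database: "sample_db", collection: "movies" },
    // Presumably, marking the call optional means the scoring should not penalize
    // the model for omitting it.
    optional: true,
};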
export const AccuracyRunStatus = {
    Done: "done",
    Failed: "failed",
    InProgress: "in-progress",
} as const;
export type AccuracyRunStatuses = (typeof AccuracyRunStatus)[keyof typeof AccuracyRunStatus];
export interface AccuracyResult {
    /**
     * A unique id for each accuracy run. It should either be generated by the
     * script triggering the accuracy run or provided via environment variables. */
    runId: string;
    /**
     * The status of the accuracy run. During a run, each test completion is
     * expected to submit an accuracy result entry with the InProgress status;
     * after the run finishes, the status is updated to either Done or Failed,
     * depending on whether any errors occurred during the run. */
    runStatus: AccuracyRunStatuses;
    /**
     * Timestamp of when this result entry was generated. */
    createdOn: number;
    /**
     * The commit SHA for which the accuracy run was triggered. */
    commitSHA: string;
    /**
     * A list of results for the different prompts tested in the accuracy run. */
    promptResults: PromptResult[];
}
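// Illustrative sketch only: the shape of a result entry at the start of an accuracy
// run. The runId and commitSHA values are hypothetical; promptResults is filled in
// as individual prompt results are submitted, and runStatus is later flipped to
// Done or Failed once the run completes.
export const exampleInProgressResult: AccuracyResult = {
    runId: "run-2024-01-01-local",
    runStatus: AccuracyRunStatus.InProgress,
    createdOn: Date.now(),
    commitSHA: "0123456789abcdef0123456789abcdef01234567",
    promptResults: [],
};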
export interface PromptResult {
    /**
     * The actual prompt that was provided to the LLM as the test input. */
    prompt: string;
    /**
     * A list of tools, along with their parameters, that the LLM is expected
     * to call during the test. */
    expectedToolCalls: ExpectedToolCall[];
    /**
     * The responses from the LLMs under test when provided with the prompt. */
    modelResponses: ModelResponse[];
}
export interface ModelResponse {
    /**
     * The LLM provider serving the LLM APIs. */
    provider: string;
    /**
     * The LLM that was requested to respond to our test prompt. */
    requestedModel: string;
    /**
     * The ID of the model that actually responded to our prompt request. */
    respondingModel: string;
    /**
     * The total time taken by the LLM to respond to our prompt. */
    llmResponseTime: number;
    /**
     * A number between 0 and 1 representing how accurately the expected tools
     * were called by the LLM when responding to the provided prompt. To learn
     * how this number is generated, see toolCallingAccuracy.ts. */
    toolCallingAccuracy: number;
    /**
     * A list of tools, along with their parameters, that the LLM actually
     * called during the test. */
    llmToolCalls: LLMToolCall[];
    /**
     * Token usage data, returned as part of the LLM prompt response. */
    tokensUsed?: TokensUsed;
    /**
     * The final response text generated by the LLM in response to our prompt
     * request. */
    text?: string;
    /**
     * A list of messages exchanged between the LLM and our testing agent in
     * response to our prompt request. This is particularly helpful for
     * debugging. */
    messages?: Record<string, unknown>[];
}
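// Hypothetical sketch of how a tool-calling accuracy number could be derived; the
// actual scoring lives in toolCallingAccuracy.ts and may well differ (for example,
// by also comparing parameters). This version only reports the fraction of required
// (non-optional) expected tools that the model actually called.
export function sketchToolCallingAccuracy(expected: ExpectedToolCall[], actual: LLMToolCall[]): number {
    const required = expected.filter((call) => !call.optional);
    if (required.length === 0) {
        return 1;
    }
    const calledNames = new Set(actual.map((call) => call.toolName));
    const matched = required.filter((call) => calledNames.has(call.toolName)).length;
    return matched / required.length;
}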
interface TokensUsed {
    promptTokens?: number;
    completionTokens?: number;
    totalTokens?: number;
}
export interface AccuracyResultStorage {
    /**
     * Retrieves the accuracy result for the provided commit SHA and, optionally,
     * the run id. When the run id is omitted, the implementation fetches the
     * result of the last successful accuracy run; otherwise it fetches the
     * result for that run regardless of its status. */
    getAccuracyResult(commitSHA: string, runId?: string): Promise<AccuracyResult | null>;
    /**
     * Updates the status of the run. */
    updateRunStatus(commitSHA: string, runId: string, status: AccuracyRunStatuses): Promise<void>;
    /**
     * Attempts to atomically insert the model response for the prompt into the
     * stored accuracy result. */
    saveModelResponseForPrompt(data: {
        commitSHA: string;
        runId: string;
        prompt: string;
        expectedToolCalls: ExpectedToolCall[];
        modelResponse: ModelResponse;
    }): Promise<void>;
    close(): Promise<void>;
}
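// Hypothetical usage sketch tying the pieces above together; the storage
// implementation, prompt, tool names, and model identifiers are illustrative and
// not part of this module.
export async function exampleAccuracyRunLifecycle(
    storage: AccuracyResultStorage,
    commitSHA: string,
    runId: string
): Promise<void> {
    // While the run is in progress, each test completion submits its model response.
    await storage.saveModelResponseForPrompt({
        commitSHA,
        runId,
        prompt: "List the collections in the 'sample_db' database.",
        expectedToolCalls: [{ toolName: "list-collections", parameters: { database: "sample_db" } }],
        modelResponse: {
            provider: "example-provider",
            requestedModel: "example-model",
            respondingModel: "example-model-2024-01-01",
            llmResponseTime: 1234,
            toolCallingAccuracy: 1,
            llmToolCalls: [
                { toolCallId: "call-1", toolName: "list-collections", parameters: { database: "sample_db" } },
            ],
        },
    });
    // Once every prompt has been evaluated, the run is marked Done (or Failed if
    // errors occurred).
    await storage.updateRunStatus(commitSHA, runId, AccuracyRunStatus.Done);
    // Stored results can then be read back, either for this specific run id or, by
    // omitting the run id, for the last successful run on the commit.
    const result = await storage.getAccuracyResult(commitSHA, runId);
    console.log(result?.runStatus);
    await storage.close();
}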