/**
* Unified API Client — ONE server proxy caller with retry and config builder.
*
* Replaces proxy+direct call paths in agent-loop, subagent, teammate.
* CLI-only: always calls through server proxy (no direct Anthropic SDK usage).
*/
import {
getContextManagement,
getMaxOutputTokens,
getThinkingConfig,
isRetryableError,
addPromptCaching,
} from "./agent-core.js";
import { getProvider } from "./constants.js";
import type { ContextProfile, APIRequestConfig } from "./types.js";
// ============================================================================
// CONSTANTS
// ============================================================================
// Number of retries after the initial attempt (so up to 4 attempts total).
const MAX_RETRIES = 3;
// Base delay for exponential backoff: 1s, then 2s, then 4s across retries.
const RETRY_BASE_DELAY_MS = 1000;
// ============================================================================
// API REQUEST CONFIG BUILDER
// ============================================================================
/**
 * Build all Anthropic API config from a simple profile.
 *
 * Profiles:
 * - 'main': clear at 80K/keep 3, compact at 120K (Opus only)
 * - 'subagent': clear at 60K/keep 2, no compaction, 8192 max tokens
 * - 'teammate': clear at 80K/keep 3, no compaction
 *
 * @param opts.model - Model identifier; provider is derived via getProvider().
 * @param opts.contextProfile - 'main' | 'subagent' | 'teammate'.
 * @param opts.thinkingEnabled - Enables extended thinking (default false).
 * @param opts.maxOutputTokens - Optional override for the profile's max output tokens.
 * @returns Betas, context-management edits, thinking param, and max tokens.
 */
export function buildAPIRequest(opts: {
  model: string;
  contextProfile: ContextProfile;
  thinkingEnabled?: boolean;
  maxOutputTokens?: number;
}): APIRequestConfig {
  const { model, contextProfile, thinkingEnabled = false, maxOutputTokens } = opts;
  // Non-Anthropic models (Gemini, OpenAI, etc.) don't support Anthropic betas
  // or server-side context management.
  const provider = getProvider(model);
  const isAnthropicModel = provider === "anthropic" || provider === "bedrock";
  let betas: string[] = isAnthropicModel ? ["context-management-2025-06-27"] : [];
  let edits: Array<Record<string, unknown>> = [];
  switch (contextProfile) {
    // 'main' and 'teammate' share the model-derived context-management config.
    case "main":
    case "teammate": {
      const ctxMgmt = getContextManagement(model);
      betas = [...ctxMgmt.betas];
      edits = ctxMgmt.config.edits;
      break;
    }
    case "subagent":
      if (isAnthropicModel) {
        // Tighter budget for subagents: clear tool uses at 60K input tokens,
        // keeping only the 2 most recent.
        edits = [
          {
            type: "clear_tool_uses_20250919",
            trigger: { type: "input_tokens", value: 60_000 },
            keep: { type: "tool_uses", value: 2 },
          },
        ];
      }
      break;
  }
  // Thinking config. FIX: the thinking beta was previously pushed for every
  // provider, contradicting the non-Anthropic rule above; gate it the same way
  // as the thinking param below.
  const thinkingCfg = getThinkingConfig(model, thinkingEnabled);
  if (isAnthropicModel && thinkingCfg.beta) betas.push(thinkingCfg.beta);
  // Max tokens: subagents default to 8192 unless explicitly overridden.
  const profileMaxTokens = contextProfile === "subagent" ? 8192 : undefined;
  const maxTokens = getMaxOutputTokens(model, maxOutputTokens ?? profileMaxTokens);
  // Build thinking param (ensure budget < maxTokens).
  // Only include for Anthropic/Bedrock — Gemini/OpenAI handle thinking internally.
  let thinking: { type: string; budget_tokens?: number } | undefined;
  if (isAnthropicModel && thinkingCfg.thinking.type !== "disabled") {
    thinking = thinkingCfg.thinking.budget_tokens
      ? {
          ...thinkingCfg.thinking,
          budget_tokens: Math.min(thinkingCfg.thinking.budget_tokens, maxTokens - 1),
        }
      : thinkingCfg.thinking;
  }
  return {
    betas,
    contextManagement: { edits },
    thinking,
    maxTokens,
  };
}
// ============================================================================
// SERVER PROXY CALLER
// ============================================================================
/** Input to callServerProxy: endpoint, auth, request payload, and retry hooks. */
export interface CallServerProxyConfig {
  /** Server proxy endpoint; request is POSTed here. */
  proxyUrl: string;
  /** Bearer token sent in the Authorization header. */
  token: string;
  /** Model identifier; may be switched to fallbackModel on the final retry. */
  model: string;
  /** System prompt blocks. */
  system: Array<Record<string, unknown>>;
  /** Conversation messages. */
  messages: Array<Record<string, unknown>>;
  /** Tool definitions. */
  tools: Array<Record<string, unknown>>;
  /** Betas/context-management/thinking/max-tokens built by buildAPIRequest(). */
  apiConfig: APIRequestConfig;
  /** External abort signal; an abort is never retried. */
  signal?: AbortSignal;
  /** Per-attempt timeout in milliseconds. */
  timeoutMs?: number;
  /** Model to switch to before the last retry attempt. */
  fallbackModel?: string;
  storeId?: string; // Required for non-Anthropic providers (credential resolution)
  /** Invoked when the fallback model is substituted. */
  onFallback?: (fromModel: string, toModel: string) => void;
  /** Invoked before each backoff delay with the 1-based attempt number. */
  onRetry?: (attempt: number, maxRetries: number, error: string) => void;
}
/**
 * Call the server proxy endpoint and return the raw SSE stream.
 *
 * Retries with exponential backoff (1s/2s/4s) on retryable errors (per
 * isRetryableError, e.g. 429/500/529); before the final attempt, switches to
 * `fallbackModel` when one is configured. Caller-initiated aborts are never
 * retried.
 *
 * @returns The raw response body stream on success.
 * @throws The last error after retries are exhausted, or immediately on a
 *   non-retryable error or external abort.
 */
export async function callServerProxy(
  config: CallServerProxyConfig,
): Promise<ReadableStream<Uint8Array>> {
  const { proxyUrl, token, signal, timeoutMs } = config;
  // Track the model locally instead of mutating the caller's config object.
  let model = config.model;
  for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
    try {
      // Rebuild the body each attempt so a fallback-model switch takes effect.
      const body: Record<string, unknown> = {
        mode: "proxy",
        messages: config.messages,
        system: config.system,
        tools: config.tools,
        model,
        max_tokens: config.apiConfig.maxTokens,
        stream: true,
        betas: config.apiConfig.betas,
        ...(config.apiConfig.contextManagement?.edits?.length
          ? { context_management: config.apiConfig.contextManagement }
          : {}),
      };
      if (config.apiConfig.thinking) {
        body.thinking = config.apiConfig.thinking;
      }
      if (config.storeId) {
        body.store_id = config.storeId;
      }
      const fetchOpts: RequestInit = {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "Authorization": `Bearer ${token}`,
        },
        body: JSON.stringify(body),
        signal,
      };
      // Apply timeout if specified. FIX: previously the timeout was silently
      // ignored when an external signal was also provided; now the external
      // abort is forwarded into the timeout controller so either can cancel.
      let timeout: ReturnType<typeof setTimeout> | undefined;
      if (timeoutMs) {
        const controller = new AbortController();
        timeout = setTimeout(() => controller.abort(), timeoutMs);
        if (signal) {
          if (signal.aborted) controller.abort();
          else signal.addEventListener("abort", () => controller.abort(), { once: true });
        }
        fetchOpts.signal = controller.signal;
      }
      try {
        const response = await fetch(proxyUrl, fetchOpts);
        if (response.ok && response.body) {
          return response.body;
        }
        // Non-2xx: surface the proxy's error text with the status attached so
        // isRetryableError can classify it.
        const errorBody = await response.text();
        throw Object.assign(
          new Error(`Proxy error (${response.status}): ${errorBody}`),
          { status: response.status },
        );
      } finally {
        if (timeout) clearTimeout(timeout);
      }
    } catch (err: unknown) {
      if (signal?.aborted) throw err; // caller-initiated abort: never retry
      if (attempt < MAX_RETRIES && isRetryableError(err)) {
        const errMsg = err instanceof Error ? err.message : String(err);
        config.onRetry?.(attempt + 1, MAX_RETRIES, errMsg);
        const delay = RETRY_BASE_DELAY_MS * Math.pow(2, attempt);
        await new Promise((resolve) => setTimeout(resolve, delay));
        // Switch to the fallback model for the final attempt.
        if (attempt === MAX_RETRIES - 1 && config.fallbackModel) {
          const fromModel = model;
          model = config.fallbackModel;
          config.onFallback?.(fromModel, model);
        }
        continue;
      }
      throw err;
    }
  }
  throw new Error("Failed to get response after retries");
}
// ============================================================================
// HELPERS — system prompt + caching setup
// ============================================================================
/**
 * Build system blocks with prompt caching.
 * The system prompt gets an ephemeral cache_control marker when caching is
 * enabled; the optional dynamic cost context is appended after it (past the
 * cache breakpoint) so it never invalidates the cached prefix.
 */
export function buildSystemBlocks(
  systemPrompt: string,
  costContext?: string,
  enableCaching = true,
): Array<Record<string, unknown>> {
  const promptBlock: Record<string, unknown> = enableCaching
    ? { type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }
    : { type: "text", text: systemPrompt };
  const blocks: Array<Record<string, unknown>> = [promptBlock];
  if (costContext) {
    blocks.push({ type: "text", text: costContext });
  }
  return blocks;
}
/**
 * Prepare messages and tools with prompt caching.
 * Skips cache_control injection for Gemini and OpenAI models (Gemini relies
 * on implicit caching); everything else goes through addPromptCaching.
 */
export function prepareWithCaching(
  tools: Array<Record<string, unknown>>,
  messages: Array<Record<string, unknown>>,
  model?: string,
): { tools: Array<Record<string, unknown>>; messages: Array<Record<string, unknown>> } {
  if (!model) {
    return addPromptCaching(tools, messages);
  }
  const provider = getProvider(model);
  const skipInjection = provider === "gemini" || provider === "openai";
  return skipInjection ? { tools, messages } : addPromptCaching(tools, messages);
}
// ============================================================================
// CLIENT-SIDE CONTEXT TRIMMING (Gemini / OpenAI)
// ============================================================================
/**
 * Shared trimming implementation (FIX: was duplicated verbatim between the
 * Gemini and OpenAI variants, which differed only in default threshold).
 *
 * When estimated input tokens meet or exceed `threshold`, replaces the
 * contents of the oldest tool_result blocks with "[trimmed]", leaving the most
 * recent `keepRecent` results intact. Input messages are not mutated; affected
 * messages and blocks are shallow-copied. Returns the input array unchanged
 * when under threshold or when there is nothing to trim.
 */
function trimToolResults(
  messages: Array<Record<string, unknown>>,
  estimatedTokens: number,
  threshold: number,
  keepRecent: number,
): Array<Record<string, unknown>> {
  if (estimatedTokens < threshold) return messages;
  // Count tool_result blocks to decide how many of the oldest to trim.
  let toolResultCount = 0;
  for (const msg of messages) {
    if (Array.isArray(msg.content)) {
      for (const block of msg.content as Array<Record<string, unknown>>) {
        if (block.type === "tool_result") toolResultCount++;
      }
    }
  }
  if (toolResultCount <= keepRecent) return messages;
  const trimCount = toolResultCount - keepRecent;
  let trimmed = 0;
  return messages.map((msg) => {
    if (!Array.isArray(msg.content)) return msg;
    const content = (msg.content as Array<Record<string, unknown>>).map((block) => {
      // Messages iterate oldest-first, so the first trimCount results found
      // are the oldest ones.
      if (block.type === "tool_result" && trimmed < trimCount) {
        trimmed++;
        if (typeof block.content === "string") {
          return { ...block, content: "[trimmed]" };
        } else if (Array.isArray(block.content)) {
          return { ...block, content: [{ type: "text", text: "[trimmed]" }] };
        }
      }
      return block;
    });
    return { ...msg, content };
  });
}
/**
 * Client-side context trimming for Gemini (no server-side context management).
 * When estimated input tokens exceed threshold, replaces old tool result contents
 * with "[trimmed]" to stay within Gemini's 1M context window.
 */
export function trimGeminiContext(
  messages: Array<Record<string, unknown>>,
  estimatedTokens: number,
  threshold = 800_000,
  keepRecent = 5,
): Array<Record<string, unknown>> {
  return trimToolResults(messages, estimatedTokens, threshold, keepRecent);
}
/**
 * Client-side context trimming for OpenAI (no server-side context management).
 * When estimated input tokens exceed threshold, replaces old tool result contents
 * with "[trimmed]" to stay within OpenAI's 200K context window.
 */
export function trimOpenAIContext(
  messages: Array<Record<string, unknown>>,
  estimatedTokens: number,
  threshold = 150_000,
  keepRecent = 5,
): Array<Record<string, unknown>> {
  return trimToolResults(messages, estimatedTokens, threshold, keepRecent);
}