import { z } from "zod";
import { Logger } from "../utils/logger.js";
/**
* Default inference model configuration
* Uses environment variables with sensible defaults for local deployment
* Requirements: 14.1, 14.5, 14.7
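 *
 * @example
 * // Override via environment variables (illustrative values and entry point, not prescriptive):
 * //   LLM_MODEL=llama3.1:8b LLM_TIMEOUT=120000 LLM_TEMPERATURE=0.3 node dist/index.js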
*/
export const DEFAULT_LLM_CONFIG = {
/** Default model - small, efficient model suitable for local deployment */
modelName: "llama3.2:1b",
/** Default timeout in milliseconds - accounts for cold starts (model loading) */
timeout: 60000,
/** Default temperature for generation (0.0-1.0) */
temperature: 0.7,
} as const;
export interface LLMConfig {
host: string;
modelName: string;
timeout?: number;
temperature?: number;
}
export interface ChatMessage {
role: "system" | "user" | "assistant";
content: string;
}
/**
* Status of the inference model
* Requirements: 15.1
*/
export interface InferenceModelStatus {
available: boolean;
modelName: string;
lastChecked: Date;
averageLatencyMs: number;
errorCount: number;
error?: string;
}
/**
 * Client for interacting with the Ollama generation (chat/completion) API
 * Distinct from the embeddings client - this one handles text generation and reasoning
 *
 * Configuration via environment variables:
 * - LLM_MODEL: Model name (default: llama3.2:1b)
 * - LLM_TIMEOUT: Request timeout in ms (default: 60000)
* - LLM_TEMPERATURE: Generation temperature 0.0-1.0 (default: 0.7)
* - OLLAMA_HOST: Ollama API URL (default: http://localhost:11434)
*
* Requirements: 14.1, 14.5, 14.7
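 *
 * @example
 * // Usage sketch - assumes a local Ollama instance with the configured model already pulled:
 * const client = new LLMClient();
 * await client.checkModelAvailability();
 * const answer = await client.generate("Summarize retrieval-augmented generation in one sentence.");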
*/
export class LLMClient {
private readonly baseUrl: string;
private readonly modelName: string;
private readonly timeout: number;
private readonly temperature: number;
/** Track model status for health checks */
private modelStatus: InferenceModelStatus;
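  /**
   * @param configOverride Optional overrides; any omitted field falls back to the
   * corresponding environment variable and then to DEFAULT_LLM_CONFIG.
   */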
constructor(configOverride?: Partial<LLMConfig>) {
this.baseUrl = configOverride?.host ?? process.env.OLLAMA_HOST ?? "http://localhost:11434";
this.modelName =
configOverride?.modelName ?? process.env.LLM_MODEL ?? DEFAULT_LLM_CONFIG.modelName;
    const envTimeout = process.env.LLM_TIMEOUT
      ? Number.parseInt(process.env.LLM_TIMEOUT, 10)
      : Number.NaN;
    this.timeout =
      configOverride?.timeout ??
      (Number.isFinite(envTimeout) ? envTimeout : DEFAULT_LLM_CONFIG.timeout);
    const envTemperature = process.env.LLM_TEMPERATURE
      ? Number.parseFloat(process.env.LLM_TEMPERATURE)
      : Number.NaN;
    this.temperature =
      configOverride?.temperature ??
      (Number.isFinite(envTemperature) ? envTemperature : DEFAULT_LLM_CONFIG.temperature);
// Initialize model status
this.modelStatus = {
available: false,
modelName: this.modelName,
lastChecked: new Date(0), // Never checked
averageLatencyMs: 0,
errorCount: 0,
};
}
/**
* Get the configured model name
*/
getModelName(): string {
return this.modelName;
}
/**
* Get the configured temperature
*/
getTemperature(): number {
return this.temperature;
}
/**
* Get the configured timeout
*/
getTimeout(): number {
return this.timeout;
}
/**
* Get the current model status for health checks
* Requirements: 15.1
*/
getModelStatus(): InferenceModelStatus {
return { ...this.modelStatus };
}
/**
* Check if the inference model is available
* Makes a request to Ollama to verify the model is loaded and responding
* Requirements: 14.1, 15.1
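   *
   * @example
   * // Health-check sketch (assumes an LLMClient instance `client`) - run at startup or on a timer:
   * const status = await client.checkModelAvailability();
   * if (!status.available) Logger.warn("Inference model unavailable", { error: status.error });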
*/
async checkModelAvailability(): Promise<InferenceModelStatus> {
const startTime = Date.now();
try {
// First check if Ollama is responding
const tagsResponse = await fetch(`${this.baseUrl}/api/tags`, {
method: "GET",
signal: AbortSignal.timeout(5000),
});
if (!tagsResponse.ok) {
this.modelStatus = {
available: false,
modelName: this.modelName,
lastChecked: new Date(),
averageLatencyMs: Date.now() - startTime,
errorCount: this.modelStatus.errorCount + 1,
error: `Ollama API error: ${tagsResponse.statusText}`,
};
Logger.warn("LLM Model Availability Check Failed", {
model: this.modelName,
error: this.modelStatus.error,
});
return this.modelStatus;
}
const data = (await tagsResponse.json()) as { models?: Array<{ name: string }> };
const models = data.models ?? [];
// Check if our specific model is available
const modelAvailable = models.some(
(m) => m.name === this.modelName || m.name === `${this.modelName}:latest`
);
if (!modelAvailable) {
this.modelStatus = {
available: false,
modelName: this.modelName,
lastChecked: new Date(),
averageLatencyMs: Date.now() - startTime,
errorCount: this.modelStatus.errorCount + 1,
error: `Model ${this.modelName} not found. Available models: ${models.map((m) => m.name).join(", ")}`,
};
Logger.warn("LLM Model Not Available", {
model: this.modelName,
availableModels: models.map((m) => m.name),
});
return this.modelStatus;
}
// Model is available
const latency = Date.now() - startTime;
this.modelStatus = {
available: true,
modelName: this.modelName,
lastChecked: new Date(),
averageLatencyMs:
this.modelStatus.averageLatencyMs > 0
? (this.modelStatus.averageLatencyMs + latency) / 2
: latency,
errorCount: 0, // Reset error count on success
};
Logger.info("LLM Model Available", {
model: this.modelName,
latencyMs: latency,
});
return this.modelStatus;
} catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
this.modelStatus = {
available: false,
modelName: this.modelName,
lastChecked: new Date(),
averageLatencyMs: Date.now() - startTime,
errorCount: this.modelStatus.errorCount + 1,
error: errorMessage,
};
Logger.warn("LLM Model Availability Check Failed", {
model: this.modelName,
error: errorMessage,
});
return this.modelStatus;
}
}
/**
* Check if the model is currently available (based on last check)
* Does not make a network request - use checkModelAvailability() for that
*/
isModelAvailable(): boolean {
return this.modelStatus.available;
}
/**
* Generate text from a prompt
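   *
   * @example
   * // Minimal usage sketch (assumes an LLMClient instance `client`):
   * const text = await client.generate("List three uses of embeddings.", "You are concise.");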
*/
async generate(prompt: string, system?: string): Promise<string> {
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), this.timeout);
const startTime = Date.now();
try {
const response = await fetch(`${this.baseUrl}/api/generate`, {
method: "POST",
headers: { "Content-Type": "application/json" },
signal: controller.signal,
body: JSON.stringify({
model: this.modelName,
prompt,
system,
stream: false,
options: {
temperature: this.temperature,
},
}),
});
if (!response.ok) {
this.modelStatus.errorCount++;
throw new Error(`Ollama API error: ${response.statusText}`);
}
const data = (await response.json()) as { response: string };
      // Update latency tracking and warn if the 5s threshold is exceeded
      this.updateLatencyMetrics(startTime, "LLM Generation High Latency");
return data.response;
} catch (error) {
if (error instanceof Error && error.name === "AbortError") {
Logger.error("LLM Generation Timed Out", `Timeout after ${this.timeout}ms`);
this.modelStatus.errorCount++;
throw new Error(`LLM generation timed out after ${this.timeout}ms`);
}
Logger.error("LLM Generation Failed", error instanceof Error ? error.message : String(error));
this.modelStatus.errorCount++;
throw error;
} finally {
clearTimeout(timeoutId);
}
}
/**
* Chat completion with JSON Schema enforcement
* @param messages Conversation history
* @param schema Optional Zod schema to enforce valid JSON output
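   *
   * @example
   * // Structured-output sketch (hypothetical schema; assumes an LLMClient instance `client`):
   * const Verdict = z.object({ answer: z.string(), confidence: z.number().min(0).max(1) });
   * const verdict = await client.chat([{ role: "user", content: "Is TypeScript typed?" }], Verdict);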
*/
async chat<T>(messages: ChatMessage[], schema?: z.ZodSchema<T>): Promise<T | string> {
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), this.timeout);
const startTime = Date.now();
try {
      // Copy messages; schema-to-prompt binding belongs in PromptBinder.
      // Here we only request Ollama's JSON mode (and a lower temperature) when a schema is provided.
      const finalMessages = [...messages];
      const format = schema ? "json" : undefined;
// Use lower temperature for structured output
const chatTemperature = schema ? Math.min(this.temperature, 0.2) : this.temperature;
const response = await fetch(`${this.baseUrl}/api/chat`, {
method: "POST",
headers: { "Content-Type": "application/json" },
signal: controller.signal,
body: JSON.stringify({
model: this.modelName,
messages: finalMessages,
stream: false,
format, // Force JSON mode if schema exists
options: {
temperature: chatTemperature,
},
}),
});
if (!response.ok) {
this.modelStatus.errorCount++;
throw new Error(`Ollama Chat API error: ${response.statusText}`);
}
const data = (await response.json()) as { message: { content: string } };
const content = data.message.content;
      // Update latency tracking and warn if the 5s threshold is exceeded
      this.updateLatencyMetrics(startTime, "LLM Chat High Latency");
if (schema) {
try {
// Repair and clean the response before parsing
const repairedContent = this.repairJsonResponse(content);
const parsed = JSON.parse(repairedContent);
          // schema.parse accepts unknown input, so no type suppression is needed
          return schema.parse(parsed);
} catch (validationError) {
Logger.error("LLM Output Validation Failed", { content, error: String(validationError) });
throw new Error(`LLM failed to generate valid JSON: ${String(validationError)}`);
}
}
return content;
} catch (error) {
if (error instanceof Error && error.name === "AbortError") {
Logger.error("LLM Chat Timed Out", `Timeout after ${this.timeout}ms`);
this.modelStatus.errorCount++;
throw new Error(`LLM chat timed out after ${this.timeout}ms`);
}
Logger.error("LLM Chat Failed", error instanceof Error ? error.message : String(error));
this.modelStatus.errorCount++;
throw error;
} finally {
clearTimeout(timeoutId);
}
}
/**
* Stream chat completion
* @param messages Conversation history
* @param onChunk Callback for each token/chunk received
   * @param schema Optional Zod schema (used only to request JSON mode and a lower temperature; chunks are not validated during streaming)
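   *
   * @example
   * // Streaming sketch (assumes an LLMClient instance `client`) - forward tokens as they arrive:
   * const full = await client.streamChat(
   *   [{ role: "user", content: "Explain vector search briefly." }],
   *   (chunk) => process.stdout.write(chunk)
   * );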
*/
async streamChat<T>(
messages: ChatMessage[],
onChunk: (content: string) => void,
schema?: z.ZodSchema<T>
): Promise<string> {
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), this.timeout);
const startTime = Date.now();
try {
const response = await this.makeStreamRequest(messages, controller.signal, schema);
const fullContent = await this.processStreamResponse(response, onChunk);
this.updateLatencyMetrics(startTime, "LLM Streaming High Latency");
return fullContent;
} catch (error) {
      // handleStreamError always logs, bumps the error count, and throws
      this.handleStreamError(error);
      throw error; // Unreachable, but keeps control-flow analysis explicit
} finally {
clearTimeout(timeoutId);
}
}
/**
* Make the streaming request to Ollama
*/
private async makeStreamRequest<T>(
messages: ChatMessage[],
signal: AbortSignal,
schema?: z.ZodSchema<T>
): Promise<Response> {
const format = schema ? "json" : undefined;
const chatTemperature = schema ? Math.min(this.temperature, 0.2) : this.temperature;
const response = await fetch(`${this.baseUrl}/api/chat`, {
method: "POST",
headers: { "Content-Type": "application/json" },
signal,
body: JSON.stringify({
model: this.modelName,
messages: [...messages],
stream: true,
format,
options: { temperature: chatTemperature },
}),
});
if (!response.ok) {
this.modelStatus.errorCount++;
throw new Error(`Ollama Chat API error: ${response.statusText}`);
}
if (!response.body) {
throw new Error("Response body is null");
}
return response;
}
/**
* Process the streaming response and accumulate content
*/
private async processStreamResponse(
response: Response,
onChunk: (content: string) => void
): Promise<string> {
if (!response.body) {
throw new Error("Response body is null");
}
const reader = response.body.getReader();
const decoder = new TextDecoder();
    let fullContent = "";
    // Buffer partial lines: a single NDJSON object can be split across network chunks
    let buffer = "";
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split("\n");
      // Keep the last (possibly incomplete) line in the buffer for the next read
      buffer = lines.pop() ?? "";
      for (const line of lines) {
        if (line.trim() === "") continue;
        const result = this.processStreamLine(line, onChunk);
        if (result.content) {
          fullContent += result.content;
        }
        if (result.done) {
          return fullContent;
        }
      }
    }
    // Flush any trailing buffered line left after the stream ends
    if (buffer.trim() !== "") {
      const result = this.processStreamLine(buffer, onChunk);
      if (result.content) {
        fullContent += result.content;
      }
    }
    return fullContent;
}
/**
* Update latency metrics and log warnings if threshold exceeded
*/
private updateLatencyMetrics(startTime: number, warningMessage: string): void {
const latency = Date.now() - startTime;
this.modelStatus.averageLatencyMs =
this.modelStatus.averageLatencyMs > 0
? (this.modelStatus.averageLatencyMs + latency) / 2
: latency;
if (latency > 5000) {
Logger.warn(warningMessage, {
model: this.modelName,
latencyMs: latency,
threshold: 5000,
});
}
}
/**
* Handle streaming errors with appropriate logging
*/
private handleStreamError(error: unknown): never {
if (error instanceof Error && error.name === "AbortError") {
      Logger.error("LLM Streaming Timed Out", `Timeout after ${this.timeout}ms`);
this.modelStatus.errorCount++;
throw new Error(`LLM streaming timed out after ${this.timeout}ms`);
}
Logger.error("LLM Streaming Failed", error instanceof Error ? error.message : String(error));
this.modelStatus.errorCount++;
throw error;
}
/**
* Repair malformed JSON responses from LLMs
* Handles common issues like markdown code blocks, schema echoing, and formatting problems
*
* @param content - Raw LLM response content
* @returns Cleaned JSON string ready for parsing
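   *
   * @example
   * // '```json\n{"ok": true}\n```'  ->  '{"ok": true}'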
*/
private repairJsonResponse(content: string): string {
let cleaned = content;
// Step 1: Remove markdown code blocks (common LLM quirk)
cleaned = cleaned
.replace(/```json\n?/gi, "")
.replace(/\n?```/g, "")
.trim();
// Step 2: Remove any leading/trailing whitespace and newlines
cleaned = cleaned.trim();
// Step 3: Handle schema echoing - if model echoed the schema structure
if (cleaned.includes('"$ref"') || cleaned.includes('"definitions"')) {
cleaned = this.extractDataFromSchemaEcho(cleaned);
}
// Step 4: Fix common JSON issues
cleaned = this.fixCommonJsonIssues(cleaned);
// Step 5: If still not valid JSON, try to extract JSON object
if (!this.isValidJson(cleaned)) {
const extracted = this.extractJsonObject(cleaned);
if (extracted) {
cleaned = extracted;
}
}
return cleaned;
}
/**
* Extract actual data from a response where the model echoed the schema
*
* @param content - Content with schema echoing
* @returns Extracted data or original content
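   *
   * @example
   * // A schema echo typically looks like (shape is illustrative):
   * //   { "$ref": "#/definitions/Output", "definitions": { "Output": { "properties": { ... } } } }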
*/
private extractDataFromSchemaEcho(content: string): string {
try {
const parsed = JSON.parse(content);
// If it has $ref and definitions, try to extract actual data
if (parsed.$ref && parsed.definitions) {
const extractedData = this.extractFromDefinitions(parsed.definitions);
if (extractedData) {
return JSON.stringify(extractedData);
}
}
// If we can't extract, return original
return content;
} catch {
return content;
}
}
/**
* Extract data from schema definitions
*
* @param definitions - Schema definitions object
* @returns Extracted data or null
*/
private extractFromDefinitions(
definitions: Record<string, unknown>
): Record<string, unknown> | null {
for (const key of Object.keys(definitions)) {
const def = definitions[key] as Record<string, unknown>;
if (def.properties) {
const extractedData = this.extractDataFromProperties(
def.properties as Record<string, unknown>
);
if (Object.keys(extractedData).length > 0) {
return extractedData;
}
}
}
return null;
}
/**
* Extract data from schema properties
*
* @param properties - Properties object from schema
* @returns Extracted data object
*/
private extractDataFromProperties(properties: Record<string, unknown>): Record<string, unknown> {
const extractedData: Record<string, unknown> = {};
for (const [propKey, propValue] of Object.entries(properties)) {
const prop = propValue as Record<string, unknown>;
// If the property value is an array with actual content, extract it
if (Array.isArray(prop)) {
extractedData[propKey] = prop;
} else if (prop && typeof prop === "object" && !("type" in prop)) {
// If it's an object without 'type' field, it might be actual data
extractedData[propKey] = prop;
}
}
return extractedData;
}
/**
* Fix common JSON formatting issues
*
* @param content - JSON string with potential issues
* @returns Fixed JSON string
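   *
   * @example
   * // '{ answer: "yes", }'  ->  '{ "answer": "yes" }'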
*/
private fixCommonJsonIssues(content: string): string {
let fixed = content;
// Remove trailing commas before closing brackets/braces
fixed = fixed.replace(/,(\s*[}\]])/g, "$1");
    // Quote unquoted property names (simple heuristic; may misfire on string values containing "key:" text)
    fixed = fixed.replace(/([{,]\s*)(\w+)(\s*:)/g, '$1"$2"$3');
    // Remove a BOM at the start, if present
    fixed = fixed.replace(/^\uFEFF/, "");
    // Replace escaped newlines with spaces (lossy for multi-line string values, but tolerant of stray \n noise)
    fixed = fixed.replace(/\\n/g, " ");
return fixed;
}
/**
* Check if a string is valid JSON
*
* @param str - String to check
* @returns True if valid JSON
*/
private isValidJson(str: string): boolean {
try {
JSON.parse(str);
return true;
} catch {
return false;
}
}
/**
* Try to extract a JSON object from a string that may contain extra content
*
* @param content - String potentially containing JSON
* @returns Extracted JSON string or null
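   *
   * @example
   * // 'Sure! Here is the result: {"ok": true}. Let me know.'  ->  '{"ok": true}'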
*/
private extractJsonObject(content: string): string | null {
// Find the first { and last }
const firstBrace = content.indexOf("{");
const lastBrace = content.lastIndexOf("}");
if (firstBrace !== -1 && lastBrace !== -1 && lastBrace > firstBrace) {
const extracted = content.substring(firstBrace, lastBrace + 1);
if (this.isValidJson(extracted)) {
return extracted;
}
}
// Try to find JSON array
const firstBracket = content.indexOf("[");
const lastBracket = content.lastIndexOf("]");
if (firstBracket !== -1 && lastBracket !== -1 && lastBracket > firstBracket) {
const extracted = content.substring(firstBracket, lastBracket + 1);
if (this.isValidJson(extracted)) {
return extracted;
}
}
return null;
}
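  /**
   * Parse a single NDJSON line from the Ollama stream, forwarding any message content to onChunk
   * Lines that are not complete JSON yield an empty result and are skipped by the caller
   */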
private processStreamLine(
line: string,
onChunk: (content: string) => void
): { content?: string; done?: boolean } {
try {
const data = JSON.parse(line);
if (data.message?.content) {
onChunk(data.message.content);
}
return { content: data.message?.content, done: data.done };
} catch {
return {};
}
}
}