DM20 Protocol

Overview Schema Related Servers Score Discussions

kokoro.py•6.08 KiB

"""
Kokoro 82M TTS engine wrapper.

Kokoro is a lightweight, fast TTS model optimized for Apple Silicon.
It serves as the Tier 1 (Speed) engine for Apple Silicon Macs,
providing low-latency synthesis suitable for combat narration.

Requires the `kokoro` package: pip install kokoro
"""

import io
import logging
import struct
import time
from typing import Optional

from .base import AudioFormat, TTSEngine, TTSResult, VoiceConfig

logger = logging.getLogger("dm20-protocol.voice.kokoro")

# Map language codes to Kokoro language identifiers
_LANGUAGE_MAP: dict[str, str] = {
    "en": "en-us",
    "en-us": "en-us",
    "en-gb": "en-gb",
    "it": "it",
}

# Default voice IDs per language
_DEFAULT_VOICES: dict[str, str] = {
    "en": "af_heart",
    "it": "af_heart",
}


def _check_kokoro_available() -> bool:
    """Check if the kokoro package is importable."""
    try:
        import kokoro  # noqa: F401

        return True
    except ImportError:
        return False


def _audio_to_wav(samples: list[float], sample_rate: int) -> bytes:
    """Convert raw float audio samples to WAV bytes.

    Args:
        samples: Audio samples as floats in [-1.0, 1.0].
        sample_rate: Sample rate in Hz.

    Returns:
        WAV file bytes.
    """
    buf = io.BytesIO()
    num_samples = len(samples)
    data_size = num_samples * 2  # 16-bit = 2 bytes per sample

    # WAV header
    buf.write(b"RIFF")
    buf.write(struct.pack("<I", 36 + data_size))
    buf.write(b"WAVE")

    # fmt chunk
    buf.write(b"fmt ")
    buf.write(struct.pack("<I", 16))  # chunk size
    buf.write(struct.pack("<H", 1))  # PCM format
    buf.write(struct.pack("<H", 1))  # mono
    buf.write(struct.pack("<I", sample_rate))
    buf.write(struct.pack("<I", sample_rate * 2))  # byte rate
    buf.write(struct.pack("<H", 2))  # block align
    buf.write(struct.pack("<H", 16))  # bits per sample

    # data chunk
    buf.write(b"data")
    buf.write(struct.pack("<I", data_size))
    for sample in samples:
        clamped = max(-1.0, min(1.0, sample))
        buf.write(struct.pack("<h", int(clamped * 32767)))

    return buf.getvalue()


class KokoroEngine(TTSEngine):
    """Kokoro 82M TTS engine for fast speech synthesis on Apple Silicon.

    This engine wraps the kokoro Python package and provides low-latency
    text-to-speech suitable for real-time combat narration.
    """

    def __init__(self) -> None:
        self._pipeline: object | None = None
        self._available: bool | None = None

    @property
    def name(self) -> str:
        return "kokoro"

    def is_available(self) -> bool:
        if self._available is None:
            self._available = _check_kokoro_available()
            if not self._available:
                logger.debug("Kokoro package not installed")
        return self._available

    async def warmup(self) -> None:
        """Preload the Kokoro pipeline for faster first synthesis."""
        if not self.is_available():
            return

        if self._pipeline is None:
            try:
                from kokoro import KPipeline

                self._pipeline = KPipeline(lang_code="a")
                logger.info("Kokoro pipeline loaded successfully")
            except Exception as exc:
                logger.warning("Failed to load Kokoro pipeline: %s", exc)
                self._available = False

    async def synthesize(
        self,
        text: str,
        voice_config: Optional[VoiceConfig] = None,
    ) -> TTSResult:
        """Synthesize text using Kokoro 82M.

        Args:
            text: Text to synthesize.
            voice_config: Optional voice configuration.

        Returns:
            TTSResult with WAV audio data.

        Raises:
            RuntimeError: If Kokoro is not available or synthesis fails.
        """
        if not self.is_available():
            raise RuntimeError("Kokoro engine is not available (package not installed)")

        config = voice_config or VoiceConfig()
        lang = config.language
        kokoro_lang = _LANGUAGE_MAP.get(lang, "en-us")
        voice_id = config.voice_id
        if voice_id == "default":
            voice_id = _DEFAULT_VOICES.get(lang, "af_heart")

        try:
            if self._pipeline is None:
                from kokoro import KPipeline

                self._pipeline = KPipeline(lang_code=kokoro_lang[0])

            start_time = time.monotonic()

            # Kokoro generates audio as a generator of (graphemes, phonemes, audio) tuples
            all_samples: list[float] = []
            sample_rate = 24000  # Kokoro default

            for _gs, _ps, audio in self._pipeline(
                text, voice=voice_id, speed=config.speed
            ):
                if audio is not None:
                    # audio is a numpy/torch tensor or list of floats
                    if hasattr(audio, "tolist"):
                        all_samples.extend(audio.tolist())
                    elif hasattr(audio, "numpy"):
                        all_samples.extend(audio.numpy().tolist())
                    else:
                        all_samples.extend(list(audio))

            elapsed_ms = (time.monotonic() - start_time) * 1000
            duration_ms = (len(all_samples) / sample_rate) * 1000

            wav_data = _audio_to_wav(all_samples, sample_rate)

            logger.info(
                "Kokoro synthesis: %.0fms latency, %.0fms audio duration",
                elapsed_ms,
                duration_ms,
            )

            return TTSResult(
                audio_data=wav_data,
                format=AudioFormat.WAV,
                sample_rate=sample_rate,
                duration_ms=duration_ms,
                engine_name=self.name,
            )

        except ImportError:
            self._available = False
            raise RuntimeError("Kokoro package not found during synthesis")
        except Exception as exc:
            raise RuntimeError(f"Kokoro synthesis failed: {exc}") from exc

    async def shutdown(self) -> None:
        """Release the Kokoro pipeline."""
        self._pipeline = None
        logger.debug("Kokoro pipeline released")

    def supported_languages(self) -> list[str]:
        return ["en", "it"]

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Polloinfilzato/dm20-protocol'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

kokoro.py•6.08 KiB

"""
Kokoro 82M TTS engine wrapper.

Kokoro is a lightweight, fast TTS model optimized for Apple Silicon.
It serves as the Tier 1 (Speed) engine for Apple Silicon Macs,
providing low-latency synthesis suitable for combat narration.

Requires the `kokoro` package: pip install kokoro
"""

import io
import logging
import struct
import time
from typing import Optional

from .base import AudioFormat, TTSEngine, TTSResult, VoiceConfig

logger = logging.getLogger("dm20-protocol.voice.kokoro")

# Map language codes to Kokoro language identifiers
_LANGUAGE_MAP: dict[str, str] = {
    "en": "en-us",
    "en-us": "en-us",
    "en-gb": "en-gb",
    "it": "it",
}

# Default voice IDs per language
_DEFAULT_VOICES: dict[str, str] = {
    "en": "af_heart",
    "it": "af_heart",
}


def _check_kokoro_available() -> bool:
    """Check if the kokoro package is importable."""
    try:
        import kokoro  # noqa: F401

        return True
    except ImportError:
        return False


def _audio_to_wav(samples: list[float], sample_rate: int) -> bytes:
    """Convert raw float audio samples to WAV bytes.

    Args:
        samples: Audio samples as floats in [-1.0, 1.0].
        sample_rate: Sample rate in Hz.

    Returns:
        WAV file bytes.
    """
    buf = io.BytesIO()
    num_samples = len(samples)
    data_size = num_samples * 2  # 16-bit = 2 bytes per sample

    # WAV header
    buf.write(b"RIFF")
    buf.write(struct.pack("<I", 36 + data_size))
    buf.write(b"WAVE")

    # fmt chunk
    buf.write(b"fmt ")
    buf.write(struct.pack("<I", 16))  # chunk size
    buf.write(struct.pack("<H", 1))  # PCM format
    buf.write(struct.pack("<H", 1))  # mono
    buf.write(struct.pack("<I", sample_rate))
    buf.write(struct.pack("<I", sample_rate * 2))  # byte rate
    buf.write(struct.pack("<H", 2))  # block align
    buf.write(struct.pack("<H", 16))  # bits per sample

    # data chunk
    buf.write(b"data")
    buf.write(struct.pack("<I", data_size))
    for sample in samples:
        clamped = max(-1.0, min(1.0, sample))
        buf.write(struct.pack("<h", int(clamped * 32767)))

    return buf.getvalue()


class KokoroEngine(TTSEngine):
    """Kokoro 82M TTS engine for fast speech synthesis on Apple Silicon.

    This engine wraps the kokoro Python package and provides low-latency
    text-to-speech suitable for real-time combat narration.
    """

    def __init__(self) -> None:
        self._pipeline: object | None = None
        self._available: bool | None = None

    @property
    def name(self) -> str:
        return "kokoro"

    def is_available(self) -> bool:
        if self._available is None:
            self._available = _check_kokoro_available()
            if not self._available:
                logger.debug("Kokoro package not installed")
        return self._available

    async def warmup(self) -> None:
        """Preload the Kokoro pipeline for faster first synthesis."""
        if not self.is_available():
            return

        if self._pipeline is None:
            try:
                from kokoro import KPipeline

                self._pipeline = KPipeline(lang_code="a")
                logger.info("Kokoro pipeline loaded successfully")
            except Exception as exc:
                logger.warning("Failed to load Kokoro pipeline: %s", exc)
                self._available = False

    async def synthesize(
        self,
        text: str,
        voice_config: Optional[VoiceConfig] = None,
    ) -> TTSResult:
        """Synthesize text using Kokoro 82M.

        Args:
            text: Text to synthesize.
            voice_config: Optional voice configuration.

        Returns:
            TTSResult with WAV audio data.

        Raises:
            RuntimeError: If Kokoro is not available or synthesis fails.
        """
        if not self.is_available():
            raise RuntimeError("Kokoro engine is not available (package not installed)")

        config = voice_config or VoiceConfig()
        lang = config.language
        kokoro_lang = _LANGUAGE_MAP.get(lang, "en-us")
        voice_id = config.voice_id
        if voice_id == "default":
            voice_id = _DEFAULT_VOICES.get(lang, "af_heart")

        try:
            if self._pipeline is None:
                from kokoro import KPipeline

                self._pipeline = KPipeline(lang_code=kokoro_lang[0])

            start_time = time.monotonic()

            # Kokoro generates audio as a generator of (graphemes, phonemes, audio) tuples
            all_samples: list[float] = []
            sample_rate = 24000  # Kokoro default

            for _gs, _ps, audio in self._pipeline(
                text, voice=voice_id, speed=config.speed
            ):
                if audio is not None:
                    # audio is a numpy/torch tensor or list of floats
                    if hasattr(audio, "tolist"):
                        all_samples.extend(audio.tolist())
                    elif hasattr(audio, "numpy"):
                        all_samples.extend(audio.numpy().tolist())
                    else:
                        all_samples.extend(list(audio))

            elapsed_ms = (time.monotonic() - start_time) * 1000
            duration_ms = (len(all_samples) / sample_rate) * 1000

            wav_data = _audio_to_wav(all_samples, sample_rate)

            logger.info(
                "Kokoro synthesis: %.0fms latency, %.0fms audio duration",
                elapsed_ms,
                duration_ms,
            )

            return TTSResult(
                audio_data=wav_data,
                format=AudioFormat.WAV,
                sample_rate=sample_rate,
                duration_ms=duration_ms,
                engine_name=self.name,
            )

        except ImportError:
            self._available = False
            raise RuntimeError("Kokoro package not found during synthesis")
        except Exception as exc:
            raise RuntimeError(f"Kokoro synthesis failed: {exc}") from exc

    async def shutdown(self) -> None:
        """Release the Kokoro pipeline."""
        self._pipeline = None
        logger.debug("Kokoro pipeline released")

    def supported_languages(self) -> list[str]:
        return ["en", "it"]