#!/usr/bin/env tsx
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import {
CallToolRequestSchema,
ListToolsRequestSchema,
Tool,
} from "@modelcontextprotocol/sdk/types.js";
import axios, { AxiosInstance } from "axios";
import * as dotenv from "dotenv";
import * as fs from "fs/promises";
import * as path from "path";
import { spawn } from "child_process";
import { tmpdir } from "os";
dotenv.config();
/**
 * Optional voice parameters sent to the daemon as the `Speaker` field of a
 * speech request. Every field is optional; only the fields that are set are
 * included in the request.
 *
 * The ranges quoted below are the ones advertised by the `speak_text` tool
 * schema in this file.
 */
interface VoiceroidSpeaker {
/** Voice volume (tool schema: 0-2). */
Volume?: number;
/** Speech speed (tool schema: 0.5-4). */
Speed?: number;
/** Voice pitch (tool schema: 0.5-2). */
Pitch?: number;
/** Emphasis level (tool schema: 0-2). */
Emphasis?: number;
// The three pause settings are not exposed as tool arguments; in this file
// they are only ever filled from the VOICEROID_DEFAULT_PAUSE_* environment
// variables. NOTE(review): units and valid ranges are not visible here --
// confirm against the daemon's documentation.
PauseMiddle?: number;
PauseLong?: number;
PauseSentence?: number;
}
/**
 * JSON body posted to the daemon's `/api/speechtext` endpoint.
 */
interface SpeechTextRequest {
/** Text to speak. */
Text: string;
/** Optional phonetic reading in kana; left out of the body when not supplied. */
Kana?: string;
/** Optional voice parameters; left out of the body when not supplied. */
Speaker?: VoiceroidSpeaker;
}
/**
 * Small HTTP client for the voiceroid_daemon endpoints used by this server:
 * a reachability probe, text-to-kana conversion, and speech synthesis.
 */
class VoiceroidDaemonClient {
  private readonly client: AxiosInstance;

  /**
   * @param baseURL  Root URL every request is resolved against.
   * @param username Passed as axios `auth` together with `password`; the
   *                 credentials are only sent when both are non-empty.
   * @param password See `username`.
   */
  constructor(baseURL: string, username?: string, password?: string) {
    const credentials = username && password ? { username, password } : undefined;
    this.client = axios.create({
      baseURL,
      headers: { "Content-Type": "application/json" },
      auth: credentials,
    });
  }

  /**
   * Probes the daemon root path.
   *
   * @returns `true` only for an HTTP 200 answer; any request failure is
   *          reported as `false` rather than thrown.
   */
  async testConnection(): Promise<boolean> {
    try {
      const { status } = await this.client.get("/");
      return status === 200;
    } catch {
      return false;
    }
  }

  /**
   * Posts `text` to `/api/converttext` and returns the response body.
   *
   * @param text Text to convert.
   */
  async convertText(text: string): Promise<string> {
    const { data } = await this.client.post<string>("/api/converttext", {
      Text: text,
    });
    return data;
  }

  /**
   * Posts a synthesis request to `/api/speechtext` and returns the raw
   * response bytes.
   *
   * @param text    Text to speak.
   * @param kana    Optional reading; included only when non-empty.
   * @param speaker Optional voice parameters; included only when provided.
   */
  async speechText(
    text: string,
    kana?: string,
    speaker?: VoiceroidSpeaker
  ): Promise<Buffer> {
    const payload: SpeechTextRequest = { Text: text };
    if (kana) {
      payload.Kana = kana;
    }
    if (speaker) {
      payload.Speaker = speaker;
    }

    // The body is requested as an ArrayBuffer and wrapped in a Buffer.
    const { data } = await this.client.post("/api/speechtext", payload, {
      responseType: "arraybuffer",
    });
    return Buffer.from(data);
  }
}
/**
 * Writes `wavBuffer` to a temporary WAV file and plays it with the platform's
 * command-line player: `afplay` on macOS, PowerShell's `Media.SoundPlayer` on
 * Windows, and `aplay` everywhere else.
 *
 * @param wavBuffer Audio data to play.
 * @returns Resolves once the player process exits with code 0.
 * @throws Error when the file cannot be written, the player cannot be
 *         started, or the player exits unsuccessfully.
 */
async function playWavFile(wavBuffer: Buffer): Promise<void> {
  // A fresh directory from mkdtemp gives every call its own file, so two
  // calls in the same millisecond cannot overwrite each other's audio and the
  // path is not predictable inside a shared temp directory.
  const tempDir = await fs.mkdtemp(path.join(tmpdir(), "voiceroid_"));
  const tempFile = path.join(tempDir, "speech.wav");

  try {
    await fs.writeFile(tempFile, wavBuffer);

    let command: string;
    let args: string[];
    if (process.platform === "darwin") {
      command = "afplay";
      args = [tempFile];
    } else if (process.platform === "win32") {
      // Single-quoted PowerShell strings are literal (no `$` or backtick
      // expansion); an embedded single quote is escaped by doubling it.
      const quotedPath = `'${tempFile.replace(/'/g, "''")}'`;
      command = "powershell";
      args = ["-c", `(New-Object Media.SoundPlayer ${quotedPath}).PlaySync()`];
    } else {
      command = "aplay";
      args = [tempFile];
    }

    await new Promise<void>((resolve, reject) => {
      // The player's output is never read, so do not create pipes for it.
      const player = spawn(command, args, { stdio: "ignore" });
      player.once("error", reject);
      player.once("close", (code, signal) => {
        if (code === 0) {
          resolve();
        } else if (code === null) {
          // A null exit code means the process was ended by a signal.
          reject(new Error(`Player terminated by signal ${signal}`));
        } else {
          reject(new Error(`Player exited with code ${code}`));
        }
      });
    });
  } finally {
    // Best-effort cleanup: a failure here must not hide the playback result.
    await fs.rm(tempDir, { recursive: true, force: true }).catch(() => {});
  }
}
// Name and version handed to the MCP Server constructor.
const serverInfo = {
  name: "voiceroid-daemon-mcp",
  version: "1.0.0",
};

// Only the `tools` capability is declared.
const serverOptions = {
  capabilities: {
    tools: {},
  },
};

const server = new Server(serverInfo, serverOptions);
/**
 * Reads an optional numeric setting from the environment.
 *
 * @param name Environment variable to read.
 * @returns The parsed number, or `undefined` when the variable is unset,
 *          empty, or does not parse to a finite number. A malformed value is
 *          reported on stderr and ignored, because a `NaN` default would be
 *          serialised as `null` in the JSON request body.
 */
function readNumericEnv(name: string): number | undefined {
  const raw = process.env[name];
  if (!raw) {
    return undefined;
  }
  const value = parseFloat(raw);
  if (!Number.isFinite(value)) {
    console.error(`Ignoring ${name}: "${raw}" is not a finite number`);
    return undefined;
  }
  return value;
}

// Daemon location and optional credentials.
const VOICEROID_URL = process.env.VOICEROID_DAEMON_URL || "http://127.0.0.1:8080";
const VOICEROID_USERNAME = process.env.VOICEROID_DAEMON_USERNAME;
const VOICEROID_PASSWORD = process.env.VOICEROID_DAEMON_PASSWORD;

// Default speaker parameters from environment variables; `undefined` means
// "not configured", in which case the parameter is left out of requests.
const DEFAULT_VOLUME = readNumericEnv("VOICEROID_DEFAULT_VOLUME");
const DEFAULT_SPEED = readNumericEnv("VOICEROID_DEFAULT_SPEED");
const DEFAULT_PITCH = readNumericEnv("VOICEROID_DEFAULT_PITCH");
const DEFAULT_EMPHASIS = readNumericEnv("VOICEROID_DEFAULT_EMPHASIS");
const DEFAULT_PAUSE_MIDDLE = readNumericEnv("VOICEROID_DEFAULT_PAUSE_MIDDLE");
const DEFAULT_PAUSE_LONG = readNumericEnv("VOICEROID_DEFAULT_PAUSE_LONG");
const DEFAULT_PAUSE_SENTENCE = readNumericEnv("VOICEROID_DEFAULT_PAUSE_SENTENCE");
// Single daemon client shared by every tool call.
const client = new VoiceroidDaemonClient(VOICEROID_URL, VOICEROID_USERNAME, VOICEROID_PASSWORD);
/**
 * Builds the schema entry for a numeric tool argument limited to
 * `[minimum, maximum]`.
 */
function boundedNumber(description: string, minimum: number, maximum: number) {
  return { type: "number", description, minimum, maximum };
}

/** Tool definitions returned to clients that list this server's tools. */
const TOOLS: Tool[] = [
  {
    name: "test_connection",
    description: "Test connection to voiceroid_daemon server",
    inputSchema: { type: "object", properties: {}, required: [] },
  },
  {
    name: "convert_text",
    description: "Convert text to phonetic kana reading",
    inputSchema: {
      type: "object",
      properties: {
        text: { type: "string", description: "Text to convert to kana" },
      },
      required: ["text"],
    },
  },
  {
    name: "speak_text",
    description: "Generate speech audio from text and play it",
    inputSchema: {
      type: "object",
      properties: {
        text: { type: "string", description: "Text to speak" },
        kana: { type: "string", description: "Optional phonetic reading in kana" },
        volume: boundedNumber("Voice volume (0-2, default: 1)", 0, 2),
        speed: boundedNumber("Speech speed (0.5-4, default: 1)", 0.5, 4),
        pitch: boundedNumber("Voice pitch (0.5-2, default: 1)", 0.5, 2),
        emphasis: boundedNumber("Emphasis level (0-2, default: 1)", 0, 2),
      },
      required: ["text"],
    },
  },
];
// Tool listing: always answers with the static TOOLS table.
server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));
/** Wraps `text` in the single-item text result every tool here returns. */
function toolText(text: string) {
  return { content: [{ type: "text" as const, text }] };
}

/** Same as `toolText`, but flagged with `isError` so clients can tell a failure from a success. */
function toolError(text: string) {
  return { ...toolText(text), isError: true };
}

/** Returns `input[key]` when it is a string; throws a descriptive Error otherwise. */
function requireString(input: Record<string, unknown>, key: string): string {
  const value = input[key];
  if (typeof value !== "string") {
    throw new Error(`Argument "${key}" is required and must be a string`);
  }
  return value;
}

/** Returns `input[key]` as a string, `undefined` when absent or null; throws on any other type. */
function optionalString(input: Record<string, unknown>, key: string): string | undefined {
  const value = input[key];
  if (value === undefined || value === null) {
    return undefined;
  }
  if (typeof value !== "string") {
    throw new Error(`Argument "${key}" must be a string`);
  }
  return value;
}

/** Returns `input[key]` as a finite number, `undefined` when absent or null; throws on any other value. */
function optionalNumber(input: Record<string, unknown>, key: string): number | undefined {
  const value = input[key];
  if (value === undefined || value === null) {
    return undefined;
  }
  if (typeof value !== "number" || !Number.isFinite(value)) {
    throw new Error(`Argument "${key}" must be a finite number`);
  }
  return value;
}

/**
 * Tool dispatcher. Runs the requested tool and answers with a text result.
 * Any failure (unknown tool, invalid argument, daemon or playback error) is
 * turned into a text result flagged with `isError: true` instead of being
 * thrown.
 */
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name } = request.params;
  // `arguments` may be omitted by the caller; treat that as "no arguments" so
  // validation below produces a readable message instead of a TypeError.
  const input: Record<string, unknown> = request.params.arguments ?? {};

  try {
    switch (name) {
      case "test_connection": {
        const isConnected = await client.testConnection();
        return toolText(
          isConnected
            ? `Successfully connected to voiceroid_daemon at ${VOICEROID_URL}`
            : `Failed to connect to voiceroid_daemon at ${VOICEROID_URL}`
        );
      }
      case "convert_text": {
        const text = requireString(input, "text");
        const kana = await client.convertText(text);
        return toolText(`Converted text: ${kana}`);
      }
      case "speak_text": {
        const text = requireString(input, "text");
        const kana = optionalString(input, "kana");

        // An explicit argument wins over the environment default; a parameter
        // with neither is left out of the request entirely.
        const volume = optionalNumber(input, "volume") ?? DEFAULT_VOLUME;
        const speed = optionalNumber(input, "speed") ?? DEFAULT_SPEED;
        const pitch = optionalNumber(input, "pitch") ?? DEFAULT_PITCH;
        const emphasis = optionalNumber(input, "emphasis") ?? DEFAULT_EMPHASIS;

        const speaker: VoiceroidSpeaker = {};
        if (volume !== undefined) speaker.Volume = volume;
        if (speed !== undefined) speaker.Speed = speed;
        if (pitch !== undefined) speaker.Pitch = pitch;
        if (emphasis !== undefined) speaker.Emphasis = emphasis;
        // Pause settings are not tool arguments; they come from the
        // environment only.
        if (DEFAULT_PAUSE_MIDDLE !== undefined) speaker.PauseMiddle = DEFAULT_PAUSE_MIDDLE;
        if (DEFAULT_PAUSE_LONG !== undefined) speaker.PauseLong = DEFAULT_PAUSE_LONG;
        if (DEFAULT_PAUSE_SENTENCE !== undefined) speaker.PauseSentence = DEFAULT_PAUSE_SENTENCE;

        const wavBuffer = await client.speechText(
          text,
          kana,
          Object.keys(speaker).length > 0 ? speaker : undefined
        );
        await playWavFile(wavBuffer);
        return toolText(`Successfully played speech for: "${text}"`);
      }
      default:
        throw new Error(`Unknown tool: ${name}`);
    }
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    return toolError(`Error: ${errorMessage}`);
  }
});
/**
 * Connects the server to a stdio transport and logs a startup notice on
 * stderr.
 */
async function main(): Promise<void> {
  await server.connect(new StdioServerTransport());
  console.error("voiceroid-daemon-mcp server started");
}

/** Logs a startup failure and exits the process with status 1. */
const exitOnStartupFailure = (error: unknown): never => {
  console.error("Server error:", error);
  process.exit(1);
};

main().catch(exitOnStartupFailure);