aivis-speech-synthesis

Convert text to high-quality speech using customizable parameters like speaker ID, style, speed, pitch, and volume. Integrate the tool via the AivisSpeech MCP Server API for AI-powered voice synthesis.

Input Schema

Table | JSON Schema

Name	Required	Description
`intonation_scale`	No	イントネーションのスケール（1.0が標準）
`output_sampling_rate`	No	出力音声のサンプリングレート（Hz）
`pitch_scale`	No	音高のスケール（1.0が標準）
`post_phoneme_length`	No	音声の末尾の無音時間（秒）
`pre_phoneme_length`	No	音声の先頭の無音時間（秒）
`speaker_id`	No	音声合成に使用するスピーカーのID
`speed_scale`	No	話速のスケール（1.0が標準）
`style_id`	No	音声合成に使用するスタイルのID
`text`	Yes	音声合成するテキスト
`volume_scale`	No	音量のスケール（1.0が標準）

Implementation Reference

src/services/mcp-service.ts:36-99 (registration)

Registers the MCP tool 'aivis-speech-synthesis' using mcpServer.tool(), defining schema and handler inline.

// Registers the speech-synthesis tool under MCP_MODEL_ID: a Zod input schema
// followed by the async handler that forwards the request to aivisSpeechService.
this.mcpServer.tool(
  MCP_MODEL_ID,
  {
    // Text to synthesize; the only required field.
    text: z.string().describe('音声合成するテキスト'),
    // Optional speaker ID; the handler falls back to a default when omitted.
    speaker_id: z.number().optional().describe('音声合成に使用するスピーカーのID'),
    // Optional style ID; passed through unchanged.
    style_id: z.number().optional().describe('音声合成に使用するスタイルのID'),
    // Speech rate scale, 0.5–2.0, default 1.0.
    speed_scale: z.number().min(0.5).max(2.0).optional().default(1.0).describe('話速のスケール（1.0が標準）'),
    // Pitch scale, 0.5–2.0, default 1.0.
    pitch_scale: z.number().min(0.5).max(2.0).optional().default(1.0).describe('音高のスケール（1.0が標準）'),
    // Intonation scale, 0.0–2.0, default 1.0.
    intonation_scale: z.number().min(0.0).max(2.0).optional().default(1.0).describe('イントネーションのスケール（1.0が標準）'),
    // Volume scale, 0.0–2.0, default 1.0.
    volume_scale: z.number().min(0.0).max(2.0).optional().default(1.0).describe('音量のスケール（1.0が標準）'),
    // Leading silence in seconds, 0.0–1.0, default 0.1.
    pre_phoneme_length: z.number().min(0.0).max(1.0).optional().default(0.1).describe('音声の先頭の無音時間（秒）'),
    // Trailing silence in seconds, 0.0–1.0, default 0.1.
    post_phoneme_length: z.number().min(0.0).max(1.0).optional().default(0.1).describe('音声の末尾の無音時間（秒）'),
    // Output sampling rate in Hz, default 24000.
    output_sampling_rate: z.number().optional().default(24000).describe('出力音声のサンプリングレート（Hz）'),
  },
  async (params, extra) => {
    try {
      // Default speaker ID comes from the environment. An unset, empty or
      // non-numeric value falls back to the built-in ID instead of becoming NaN.
      const envSpeakerId = Number.parseInt(process.env.AIVIS_SPEECH_SPEAKER_ID ?? '', 10);
      const defaultSpeakerId = Number.isNaN(envSpeakerId) ? 888753760 : envSpeakerId;

      // Build the AivisSpeech API request.
      const synthesisRequest: SynthesisRequest = {
        text: params.text,
        // `??` (not `||`) so an explicitly supplied speaker_id of 0 is respected.
        speaker: params.speaker_id ?? defaultSpeakerId,
        style_id: params.style_id,
        speed_scale: params.speed_scale,
        pitch_scale: params.pitch_scale,
        intonation_scale: params.intonation_scale,
        volume_scale: params.volume_scale,
        pre_phoneme_length: params.pre_phoneme_length,
        post_phoneme_length: params.post_phoneme_length,
        output_sampling_rate: params.output_sampling_rate,
      };

      try {
        // Run the synthesis through the AivisSpeech service.
        await aivisSpeechService.synthesize(synthesisRequest);

        // Success: reply in the MCP content format.
        return {
          content: [
            {
              type: "text",
              text: `「${params.text}」の音声合成が完了しました`
            }
          ]
        };
      } catch (synthesisError) {
        // Synthesis failures are reported to the client as an MCP error result
        // rather than rethrown.
        console.error('Synthesis error:', synthesisError);
        const errorMessage = synthesisError instanceof Error ? synthesisError.message : '音声合成処理中にエラーが発生しました';
        return {
          content: [{ type: "text", text: `音声合成に失敗しました: ${errorMessage}` }],
          isError: true
        };
      }
    } catch (error) {
      // Safety net for anything thrown while preparing the request.
      console.error('Request handling error:', error);
      const errorMessage = error instanceof Error ? error.message : '音声合成リクエストの処理中にエラーが発生しました';
      return {
        content: [{ type: "text", text: `音声合成に失敗しました: ${errorMessage}` }],
        isError: true
      };
    }
  }
);

src/services/mcp-service.ts:38-49 (schema)

Zod input schema for the speech synthesis parameters.

// Zod shape describing the tool's input. Only `text` is required; the scale,
// silence and sampling-rate fields carry schema-level defaults.
{
  // Text to synthesize (required).
  text: z.string().describe('音声合成するテキスト'),
  // Optional speaker ID; no schema default (the handler supplies a fallback).
  speaker_id: z.number().optional().describe('音声合成に使用するスピーカーのID'),
  // Optional style ID; no schema default.
  style_id: z.number().optional().describe('音声合成に使用するスタイルのID'),
  // Speech rate scale; accepted range 0.5–2.0, defaults to 1.0.
  speed_scale: z.number().min(0.5).max(2.0).optional().default(1.0).describe('話速のスケール（1.0が標準）'),
  // Pitch scale; accepted range 0.5–2.0, defaults to 1.0.
  pitch_scale: z.number().min(0.5).max(2.0).optional().default(1.0).describe('音高のスケール（1.0が標準）'),
  // Intonation scale; accepted range 0.0–2.0, defaults to 1.0.
  intonation_scale: z.number().min(0.0).max(2.0).optional().default(1.0).describe('イントネーションのスケール（1.0が標準）'),
  // Volume scale; accepted range 0.0–2.0, defaults to 1.0.
  volume_scale: z.number().min(0.0).max(2.0).optional().default(1.0).describe('音量のスケール（1.0が標準）'),
  // Silence before the speech, in seconds; accepted range 0.0–1.0, defaults to 0.1.
  pre_phoneme_length: z.number().min(0.0).max(1.0).optional().default(0.1).describe('音声の先頭の無音時間（秒）'),
  // Silence after the speech, in seconds; accepted range 0.0–1.0, defaults to 0.1.
  post_phoneme_length: z.number().min(0.0).max(1.0).optional().default(0.1).describe('音声の末尾の無音時間（秒）'),
  // Output sampling rate in Hz; no range constraint, defaults to 24000.
  output_sampling_rate: z.number().optional().default(24000).describe('出力音声のサンプリングレート（Hz）'),
},

src/services/mcp-service.ts:50-98 (handler)

Inline handler function that processes params, calls aivisSpeechService.synthesize(), and formats MCP response.

// Tool handler: maps validated tool params onto a SynthesisRequest, runs the
// synthesis, and returns an MCP text result (with isError set on failure).
async (params, extra) => {
  try {
    // Default speaker ID comes from the environment. An unset, empty or
    // non-numeric value falls back to the built-in ID instead of becoming NaN.
    const envSpeakerId = Number.parseInt(process.env.AIVIS_SPEECH_SPEAKER_ID ?? '', 10);
    const defaultSpeakerId = Number.isNaN(envSpeakerId) ? 888753760 : envSpeakerId;

    // Build the AivisSpeech API request.
    const synthesisRequest: SynthesisRequest = {
      text: params.text,
      // `??` (not `||`) so an explicitly supplied speaker_id of 0 is respected.
      speaker: params.speaker_id ?? defaultSpeakerId,
      style_id: params.style_id,
      speed_scale: params.speed_scale,
      pitch_scale: params.pitch_scale,
      intonation_scale: params.intonation_scale,
      volume_scale: params.volume_scale,
      pre_phoneme_length: params.pre_phoneme_length,
      post_phoneme_length: params.post_phoneme_length,
      output_sampling_rate: params.output_sampling_rate,
    };

    try {
      // Run the synthesis through the AivisSpeech service.
      await aivisSpeechService.synthesize(synthesisRequest);

      // Success: reply in the MCP content format.
      return {
        content: [
          {
            type: "text",
            text: `「${params.text}」の音声合成が完了しました`
          }
        ]
      };
    } catch (synthesisError) {
      // Synthesis failures are reported to the client as an MCP error result
      // rather than rethrown.
      console.error('Synthesis error:', synthesisError);
      const errorMessage = synthesisError instanceof Error ? synthesisError.message : '音声合成処理中にエラーが発生しました';
      return {
        content: [{ type: "text", text: `音声合成に失敗しました: ${errorMessage}` }],
        isError: true
      };
    }
  } catch (error) {
    // Safety net for anything thrown while preparing the request.
    console.error('Request handling error:', error);
    const errorMessage = error instanceof Error ? error.message : '音声合成リクエストの処理中にエラーが発生しました';
    return {
      content: [{ type: "text", text: `音声合成に失敗しました: ${errorMessage}` }],
      isError: true
    };
  }
}

src/services/aivis-speech-service.ts:85-185 (helper)

synthesize() method implementing the core speech synthesis: it calls the AivisSpeech API endpoints /audio_query and /synthesis, saves the returned audio as a WAV file, and plays it.

/**
 * Synthesizes speech for the given request and plays it locally.
 *
 * Steps: POST `/audio_query` to obtain a query object, copy any caller-supplied
 * overrides onto it, POST it to `/synthesis`, write the returned audio to a
 * uniquely named WAV file under `<cwd>/temp`, then hand the file to `wavPlayer`.
 *
 * @param params - Text, speaker and optional synthesis overrides.
 * @returns An object whose `audioData` is the body returned by `/synthesis`.
 * @throws Error when an HTTP call or the file write fails. Playback failures
 *   are only logged and do not fail the call.
 */
async synthesize(params: SynthesisRequest): Promise<SynthesisResponse> {
  try {
    // 1. Fetch the audio query for this text and speaker.
    const queryResponse = await axios.post(`${this.baseUrl}/audio_query`, null, {
      params: {
        text: params.text,
        speaker: params.speaker
      }
    });

    // 2. Copy every override the caller supplied onto the query, in a fixed order.
    // NOTE(review): VOICEVOX-compatible engines appear to name AudioQuery fields
    // in camelCase (speedScale, pitchScale, ...). Confirm that these snake_case
    // keys are actually honored by the AivisSpeech engine.
    const audioQuery = queryResponse.data;
    const overrideKeys = [
      'style_id',
      'speed_scale',
      'pitch_scale',
      'intonation_scale',
      'volume_scale',
      'pre_phoneme_length',
      'post_phoneme_length',
      'output_sampling_rate',
    ] as const;
    for (const key of overrideKeys) {
      const value = params[key];
      if (value !== undefined) {
        audioQuery[key] = value;
      }
    }

    // 3. Synthesize audio from the (possibly updated) query.
    const synthesisResponse = await axios.post<ArrayBuffer>(
      `${this.baseUrl}/synthesis`,
      audioQuery,
      {
        responseType: 'arraybuffer',
        params: {
          speaker: params.speaker
        },
        headers: {
          'Accept': 'audio/wav',
          'Content-Type': 'application/json'
        }
      }
    );

    // 4. Persist the audio to a temp file so it can be played.
    const audioData = synthesisResponse.data;
    const tempDir = path.join(process.cwd(), 'temp');

    // `recursive: true` makes this a no-op when the directory already exists,
    // so no separate existence check (and no check-then-create race) is needed.
    await fs.promises.mkdir(tempDir, { recursive: true });

    // A timestamp alone collides when two requests arrive in the same
    // millisecond; the random suffix keeps concurrent writes apart.
    const uniqueSuffix = `${Date.now()}_${Math.random().toString(36).slice(2, 10)}`;
    const audioFilePath = path.join(tempDir, `speech_${uniqueSuffix}.wav`);

    // Async write so the event loop is not blocked while the file is flushed.
    await fs.promises.writeFile(audioFilePath, Buffer.from(audioData));

    // 5. Play the file through wavPlayer (no media player window is launched).
    // NOTE(review): the file is not removed here, so files accumulate in temp/.
    try {
      await wavPlayer.play({
        path: audioFilePath,
        sync: false // asynchronous playback
      });
    } catch (playError) {
      // Playback is best-effort: the synthesized audio is still returned.
      console.error('Error playing audio:', playError);
    }

    return {
      audioData
    };
  } catch (error) {
    console.error('Error in synthesize:', error);
    this.logDetailedError(error);
    throw new Error(`音声合成に失敗しました: ${this.getErrorMessage(error)}`);
  }
}

src/types/aivis-speech.ts:22-72 (schema)

SynthesisRequest interface defining the parameters passed to the synthesize helper.

/** Parameters passed to the speech synthesis helper. */
export interface SynthesisRequest {
  /** Text to synthesize. */
  text: string;

  /** Speaker ID. */
  speaker: number;

  /** Style ID. */
  style_id?: number;

  /** Speech rate scale (1.0 is standard). */
  speed_scale?: number;

  /** Pitch scale (1.0 is standard). */
  pitch_scale?: number;

  /** Intonation scale (1.0 is standard). */
  intonation_scale?: number;

  /** Volume scale (1.0 is standard). */
  volume_scale?: number;

  /** Silence at the start of the audio, in seconds. */
  pre_phoneme_length?: number;

  /** Silence at the end of the audio, in seconds. */
  post_phoneme_length?: number;

  /** Sampling rate of the output audio, in Hz. */
  output_sampling_rate?: number;
}

AivisSpeech MCP Server

aivis-speech-synthesis

Input Schema

Implementation Reference

Tool Definition Quality

Other Tools

Related Tools

Latest Blog Posts

MCP directory API