feat: scaffold Fish Audio speech provider plugin

- index.ts: plugin entry with definePluginEntry + registerSpeechProvider - speech-provider.ts: full SpeechProviderPlugin implementation - resolveConfig from messages.tts.providers.fish-audio - parseDirectiveToken for voice, model, speed, latency, temperature, top_p - listVoices merging official + user's own voices - synthesize with format-aware output (opus for voice-note, mp3 otherwise) - stub Talk Mode (resolveTalkConfig/resolveTalkOverrides) - tts.ts: raw fishAudioTTS() fetch + listFishAudioVoices() - streaming chunked → buffer, error body included in exceptions - parallel voice listing with graceful partial failure - speech-provider.test.ts: voice ID validation tests - openclaw.plugin.json: speechProviders contract - package.json: peer dep on openclaw >=2026.3.0
2026-03-29 18:14:29 +11:00
parent ee1eb27cf0
commit 4842dc64a5
7 changed files with 675 additions and 2 deletions
--- a/README.md
+++ b/README.md
@@ -1,3 +1,97 @@
-# fish-audio-plugin
+# Fish Audio Speech Plugin for OpenClaw
-Fish Audio TTS speech provider plugin for OpenClaw
+A speech provider plugin that integrates [Fish Audio](https://fish.audio) TTS with OpenClaw.
 ## Features
 - **Fish Audio S2-Pro / S1 / S2** model support
 - **Dynamic voice listing** — your own cloned voices + official Fish Audio voices
 - **Format-aware output** — opus for voice notes (Telegram, WhatsApp), mp3 otherwise
 - **Inline directives** — switch voice, model, speed, latency, temperature, and top_p mid-message
 - **No core changes required** — standard `SpeechProviderPlugin` extension
 ## Installation
 ```bash
 openclaw plugins install @openclaw/fish-audio-speech
 ```
 ## Configuration
 In your `openclaw.json`:
 ```json
 {
  "messages": {
    "tts": {
      "provider": "fish-audio",
      "providers": {
        "fish-audio": {
          "apiKey": "your-fish-audio-api-key",
          "voiceId": "8a2d42279389471993460b85340235c5",
          "model": "s2-pro",
          "latency": "normal",
          "speed": 1.0
        }
      }
    }
  }
 }
 ```
 ### Config Options
 | Field | Type | Default | Description |
 |-------|------|---------|-------------|
 | `apiKey` | string | — | **Required** unless the `FISH_AUDIO_API_KEY` environment variable is set. Fish Audio API key |
 | `voiceId` | string | `8a2d42...` | Reference ID of the voice to use |
 | `model` | string | `s2-pro` | TTS model (`s2-pro`, `s1`, `s2`) |
 | `latency` | string | `normal` | Latency mode (`normal`, `balanced`, `low`) |
 | `speed` | number | — | Prosody speed (0.5–2.0) |
 | `temperature` | number | — | Sampling temperature (0–1) |
 | `topP` | number | — | Top-p sampling (0–1) |
 | `baseUrl` | string | `https://api.fish.audio` | API base URL |
 ### Environment Variable
 You can also supply the API key via the `FISH_AUDIO_API_KEY` environment variable. It is used only when no `apiKey` is present in the config:
 ```bash
 FISH_AUDIO_API_KEY=your-key
 ```
 ## Directives
 Use inline directives in your messages to control TTS per-message:
 ```
 [[tts:voice=<ref_id>]]     Switch voice
 [[tts:speed=1.2]]          Prosody speed (0.5–2.0)
 [[tts:model=s1]]           Model override
 [[tts:latency=low]]        Latency mode
 [[tts:temperature=0.7]]    Sampling temperature
 [[tts:top_p=0.8]]          Top-p sampling
 ```
 ## Voice Listing
 The plugin dynamically lists available voices via `/tts voices`:
 - **Official Fish Audio voices** (~38 voices)
 - **Your own cloned/trained voices** (marked with "(mine)")
 ## Output Format
 The plugin automatically selects the best format based on the channel:
 - **Voice note channels** (Telegram, WhatsApp, Matrix, Feishu) → Opus
 - **All other channels** → MP3
 Both formats set `voiceCompatible: true` — Fish Audio output works cleanly as native voice notes.
 ## Requirements
 - OpenClaw ≥ 2026.3.0
 - Fish Audio API key ([get one here](https://fish.audio))
 ## License
 MIT
--- a/index.ts
+++ b/index.ts
@@ -0,0 +1,11 @@
 import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
 import { buildFishAudioSpeechProvider } from "./speech-provider.js";
 /**
  * Plugin entry for the Fish Audio speech provider.
  *
  * The `register` hook adds the provider returned by
  * `buildFishAudioSpeechProvider()` through `api.registerSpeechProvider`.
  */
 export default definePluginEntry({
  id: "fish-audio",
  name: "Fish Audio Speech",
  description: "Fish Audio TTS speech provider for OpenClaw",
  register(api) {
    // Single registration: this plugin contributes exactly one speech provider.
    api.registerSpeechProvider(buildFishAudioSpeechProvider());
  },
 });
--- a/openclaw.plugin.json
+++ b/openclaw.plugin.json
@@ -0,0 +1,11 @@
 {
  "id": "fish-audio",
  "contracts": {
    "speechProviders": ["fish-audio"]
  },
  "configSchema": {
    "type": "object",
    "additionalProperties": false,
    "properties": {}
  }
 }
--- a/package.json
+++ b/package.json
@@ -0,0 +1,21 @@
 {
  "name": "@openclaw/fish-audio-speech",
  "version": "0.1.0",
  "description": "Fish Audio TTS speech provider plugin for OpenClaw",
  "type": "module",
  "license": "MIT",
  "openclaw": {
    "extensions": [
      "./index.ts"
    ]
  },
  "peerDependencies": {
    "openclaw": ">=2026.3.0"
  },
  "devDependencies": {
    "vitest": "^3.0.0"
  },
  "scripts": {
    "test": "vitest run"
  }
 }
--- a/speech-provider.test.ts
+++ b/speech-provider.test.ts
@@ -0,0 +1,39 @@
 import { describe, expect, it } from "vitest";
 import { isValidFishAudioVoiceId } from "./speech-provider.js";
 /**
  * Tests for the voice-ID validator exported by speech-provider.ts.
  * The validator is expected to accept only hexadecimal strings of 24 to 40
  * characters (case-insensitive).
  */
 describe("fish-audio speech provider", () => {
  describe("isValidFishAudioVoiceId", () => {
    // Positive cases: hex strings covering the 24-char minimum, a 28-char
    // value, and the common 32-char form.
    it("accepts valid Fish Audio ref IDs (24-40 char hex)", () => {
      const valid = [
        "8a2d42279389471993460b85340235c5", // 32 char - standard
        "0dad9e24630447cf97803f4beee10481", // 32 char
        "5796fe24630447cf97803f4beee10481", // 32 char
        "d8b0991f96b44e489422ca2ddf0bd31d", // 32 char - author id
        "aabbccddee112233445566778899", // 28 char
        "aabbccddee11223344556677", // 24 char (minimum)
      ];
      for (const v of valid) {
        // The second argument is the failure message, naming the offending ID.
        expect(isValidFishAudioVoiceId(v), `expected valid: ${v}`).toBe(true);
      }
    });
    // Negative cases: wrong length, non-hex characters, and strings that
    // would be unsafe if interpolated into a URL or path.
    it("rejects invalid voice IDs", () => {
      const invalid = [
        "", // empty
        "abc123", // too short
        "12345678901234567890123", // 23 chars - below minimum
        "a".repeat(41), // too long
        "8a2d4227-9389-4719-9346-0b85340235c5", // UUID with dashes
        "../../../etc/passwd", // path traversal
        "voice?param=value", // query string
        "pMsXgVXv3BLzUgSXRplE", // ElevenLabs-style (mixed case, 20 chars)
        "ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ", // non-hex chars
      ];
      for (const v of invalid) {
        expect(isValidFishAudioVoiceId(v), `expected invalid: ${v}`).toBe(
          false,
        );
      }
    });
  });
 });
--- a/speech-provider.ts
+++ b/speech-provider.ts
@@ -0,0 +1,322 @@
 import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
 import type {
  SpeechDirectiveTokenParseContext,
  SpeechProviderConfig,
  SpeechProviderPlugin,
  SpeechVoiceOption,
 } from "openclaw/plugin-sdk/speech-core";
 import { requireInRange } from "openclaw/plugin-sdk/speech-core";
 import { fishAudioTTS, listFishAudioVoices } from "./tts.js";
 // ── Defaults ────────────────────────────────────────────────────────────────
 /** API origin used when no `baseUrl` is configured. */
 const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio";
 /** Voice reference ID used when no `voiceId` is configured. */
 const DEFAULT_VOICE_ID = "8a2d42279389471993460b85340235c5"; // SJ voice
 /** Model used when no model is configured. */
 const DEFAULT_MODEL = "s2-pro";
 /** Latency mode used when the supplied value is missing or unrecognised. */
 const DEFAULT_LATENCY = "normal" as const;
 /** Model identifiers exposed through the provider's `models` field. */
 const FISH_AUDIO_MODELS = ["s2-pro", "s1", "s2"] as const;
 // ── Types ───────────────────────────────────────────────────────────────────
 /** Resolved Fish Audio settings, as produced by the config normalizers in this file. */
 type FishAudioProviderConfig = {
  /** API key; when absent, callers fall back to the `FISH_AUDIO_API_KEY` environment variable. */
  apiKey?: string;
  /** API origin with trailing slashes removed. */
  baseUrl: string;
  /** Voice reference ID sent as `reference_id` when synthesizing. */
  voiceId: string;
  /** Model identifier; not restricted to `FISH_AUDIO_MODELS`. */
  model: string;
  /** Latency mode; unrecognised input is normalised to `"normal"`. */
  latency: "normal" | "balanced" | "low";
  /** Optional prosody speed; range-checked (0.5–2.0) at synthesis time. */
  speed?: number;
  /** Optional sampling temperature, forwarded to the request when set. */
  temperature?: number;
  /** Optional top-p value, forwarded to the request when set. */
  topP?: number;
 };
 // ── Helpers ─────────────────────────────────────────────────────────────────
 /**
  * Returns the trimmed form of a string input, or `undefined` when the input
  * is not a string or contains nothing but whitespace.
  */
 function trimToUndefined(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const stripped = value.trim();
  return stripped.length > 0 ? stripped : undefined;
 }
 /**
  * Passes finite numbers through unchanged; everything else (non-numbers,
  * `NaN`, `±Infinity`) becomes `undefined`.
  */
 function asNumber(value: unknown): number | undefined {
  if (typeof value !== "number") {
    return undefined;
  }
  if (!Number.isFinite(value)) {
    return undefined;
  }
  return value;
 }
 /**
  * Narrows an unknown value to a plain key/value record.
  * Arrays, `null`, and primitives yield `undefined`.
  */
 function asObject(value: unknown): Record<string, unknown> | undefined {
  if (value === null || typeof value !== "object") {
    return undefined;
  }
  if (Array.isArray(value)) {
    return undefined;
  }
  return value as Record<string, unknown>;
 }
 /** Matches a complete decimal literal (optional sign, fraction, and exponent). */
 const STRICT_DECIMAL_PATTERN = /^[+-]?(?:\d+\.?\d*|\.\d+)(?:e[+-]?\d+)?$/i;
 /**
  * Parses a directive value as a finite decimal number.
  *
  * The whole (trimmed) string must be a number. `Number.parseFloat` would
  * accept a numeric prefix and ignore the rest, so `"1,5"` would silently
  * become `1` and `"1.2abc"` would become `1.2`; such input is rejected here.
  *
  * @param value Raw directive value.
  * @returns The parsed number, or `undefined` when the value is not a finite
  *   decimal number.
  */
 function parseNumberValue(value: string): number | undefined {
  const candidate = value.trim();
  if (!STRICT_DECIMAL_PATTERN.test(candidate)) {
    return undefined;
  }
  const parsed = Number(candidate);
  // Guards against overflow such as "1e999", which matches the pattern.
  return Number.isFinite(parsed) ? parsed : undefined;
 }
 /**
  * Trims the base URL and strips trailing slashes. Falls back to
  * `DEFAULT_FISH_AUDIO_BASE_URL` when nothing usable remains.
  */
 function normalizeBaseUrl(baseUrl: string | undefined): string {
  if (baseUrl == null) {
    return DEFAULT_FISH_AUDIO_BASE_URL;
  }
  const withoutTrailingSlashes = baseUrl.trim().replace(/\/+$/, "");
  if (withoutTrailingSlashes === "") {
    return DEFAULT_FISH_AUDIO_BASE_URL;
  }
  return withoutTrailingSlashes;
 }
 /**
  * Maps arbitrary input onto a latency mode. Only `"balanced"` and `"low"`
  * (case-insensitive, surrounding whitespace ignored) are recognised;
  * anything else resolves to `DEFAULT_LATENCY`.
  */
 function normalizeLatency(value: unknown): "normal" | "balanced" | "low" {
  if (typeof value !== "string") {
    return DEFAULT_LATENCY;
  }
  switch (value.trim().toLowerCase()) {
    case "balanced":
      return "balanced";
    case "low":
      return "low";
    default:
      return DEFAULT_LATENCY;
  }
 }
 /**
  * Returns the trimmed model name, or `DEFAULT_MODEL` when the input is not a
  * string or is blank.
  */
 function normalizeModel(value: unknown): string {
  if (typeof value === "string") {
    const model = value.trim();
    if (model !== "") {
      return model;
    }
  }
  return DEFAULT_MODEL;
 }
 /** Hexadecimal string of 24 to 40 characters, matched case-insensitively. */
 const FISH_AUDIO_VOICE_ID_PATTERN = /^[a-f0-9]{24,40}$/i;
 /**
  * Checks whether a string has the shape of a Fish Audio voice reference ID:
  * 24–40 hexadecimal characters and nothing else.
  */
 export function isValidFishAudioVoiceId(voiceId: string): boolean {
  const matchesShape = FISH_AUDIO_VOICE_ID_PATTERN.test(voiceId);
  return matchesShape;
 }
 // ── Config resolution ───────────────────────────────────────────────────────
 /**
  * Builds a `FishAudioProviderConfig` from a raw TTS config object.
  *
  * The Fish Audio section is read from `rawConfig.providers["fish-audio"]`
  * when that is an object, otherwise from `rawConfig["fish-audio"]`. Missing
  * or unusable fields fall back to the module defaults; the optional numeric
  * fields stay `undefined` unless they are finite numbers.
  */
 function normalizeFishAudioProviderConfig(
  rawConfig: Record<string, unknown>,
 ): FishAudioProviderConfig {
  const nestedSection = asObject(asObject(rawConfig.providers)?.["fish-audio"]);
  const section = nestedSection ?? asObject(rawConfig["fish-audio"]);

  const apiKey = normalizeResolvedSecretInputString({
    value: section?.apiKey,
    path: "messages.tts.providers.fish-audio.apiKey",
  });
  const configuredVoiceId = trimToUndefined(section?.voiceId);

  return {
    apiKey,
    baseUrl: normalizeBaseUrl(trimToUndefined(section?.baseUrl)),
    voiceId: configuredVoiceId ?? DEFAULT_VOICE_ID,
    model: normalizeModel(section?.model),
    latency: normalizeLatency(section?.latency),
    speed: asNumber(section?.speed),
    temperature: asNumber(section?.temperature),
    topP: asNumber(section?.topP),
  };
 }
 /**
  * Re-reads a provider config object into a `FishAudioProviderConfig`.
  * Any field that is missing or unusable takes the value produced by
  * normalising an empty raw config.
  */
 function readFishAudioProviderConfig(
  config: SpeechProviderConfig,
 ): FishAudioProviderConfig {
  const fallback = normalizeFishAudioProviderConfig({});
  const candidateBaseUrl = trimToUndefined(config.baseUrl) ?? fallback.baseUrl;

  const resolved: FishAudioProviderConfig = {
    apiKey: trimToUndefined(config.apiKey) ?? fallback.apiKey,
    baseUrl: normalizeBaseUrl(candidateBaseUrl),
    voiceId: trimToUndefined(config.voiceId) ?? fallback.voiceId,
    model: normalizeModel(config.model) || fallback.model,
    latency: normalizeLatency(config.latency),
    speed: asNumber(config.speed) ?? fallback.speed,
    temperature: asNumber(config.temperature) ?? fallback.temperature,
    topP: asNumber(config.topP) ?? fallback.topP,
  };
  return resolved;
 }
 // ── Directive parsing ───────────────────────────────────────────────────────
 /**
  * Interprets one inline TTS directive token (`key=value`) for Fish Audio.
  *
  * Recognised keys cover voice, model, speed, latency, temperature, and top_p
  * (with aliases). For a recognised key the result always has
  * `handled: true`:
  * - when the matching policy flag is off, the token is consumed and ignored;
  * - when the value is invalid, a warning is returned and no override is set;
  * - otherwise the override is merged on top of `ctx.currentOverrides`.
  * Unrecognised keys return `{ handled: false }`.
  *
  * Range violations thrown by `requireInRange` are caught and reported as
  * warnings rather than propagated.
  */
 function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
  try {
    switch (ctx.key) {
      case "voice":
      case "voiceid":
      case "voice_id":
      case "fish_voice":
      case "fishvoice":
      case "reference_id":
        if (!ctx.policy.allowVoice) {
          return { handled: true };
        }
        if (!isValidFishAudioVoiceId(ctx.value)) {
          return {
            handled: true,
            warnings: [`invalid Fish Audio voice ID "${ctx.value}"`],
          };
        }
        return {
          handled: true,
          overrides: { ...(ctx.currentOverrides ?? {}), voiceId: ctx.value },
        };
      case "model":
      case "modelid":
      case "model_id":
      case "fish_model":
      case "fishmodel": {
        if (!ctx.policy.allowModelId) {
          return { handled: true };
        }
        // A blank model would be stored as an override and then dropped
        // downstream; report it instead of failing silently.
        const model = ctx.value.trim();
        if (!model) {
          return {
            handled: true,
            warnings: ["invalid Fish Audio model (empty value)"],
          };
        }
        return {
          handled: true,
          overrides: { ...(ctx.currentOverrides ?? {}), model },
        };
      }
      case "speed": {
        if (!ctx.policy.allowVoiceSettings) {
          return { handled: true };
        }
        const value = parseNumberValue(ctx.value);
        if (value == null) {
          return { handled: true, warnings: ["invalid speed value"] };
        }
        requireInRange(value, 0.5, 2.0, "speed");
        return {
          handled: true,
          overrides: { ...(ctx.currentOverrides ?? {}), speed: value },
        };
      }
      case "latency":
      case "fish_latency": {
        if (!ctx.policy.allowVoiceSettings) {
          return { handled: true };
        }
        // Validate explicitly: silently coercing a typo such as "fast" to
        // "normal" would hide the mistake from the user.
        const latency = ctx.value.trim().toLowerCase();
        if (latency !== "normal" && latency !== "balanced" && latency !== "low") {
          return {
            handled: true,
            warnings: [
              `invalid latency "${ctx.value}" (expected normal, balanced, or low)`,
            ],
          };
        }
        return {
          handled: true,
          overrides: { ...(ctx.currentOverrides ?? {}), latency },
        };
      }
      case "temperature":
      case "temp": {
        if (!ctx.policy.allowVoiceSettings) {
          return { handled: true };
        }
        const value = parseNumberValue(ctx.value);
        if (value == null) {
          return { handled: true, warnings: ["invalid temperature value"] };
        }
        requireInRange(value, 0, 1, "temperature");
        return {
          handled: true,
          overrides: { ...(ctx.currentOverrides ?? {}), temperature: value },
        };
      }
      case "top_p":
      case "topp": {
        if (!ctx.policy.allowVoiceSettings) {
          return { handled: true };
        }
        const value = parseNumberValue(ctx.value);
        if (value == null) {
          return { handled: true, warnings: ["invalid top_p value"] };
        }
        requireInRange(value, 0, 1, "top_p");
        return {
          handled: true,
          overrides: { ...(ctx.currentOverrides ?? {}), topP: value },
        };
      }
      default:
        return { handled: false };
    }
  } catch (error) {
    // Converts validation errors (e.g. out-of-range values) into warnings.
    return {
      handled: true,
      warnings: [error instanceof Error ? error.message : String(error)],
    };
  }
 }
 // ── Provider ────────────────────────────────────────────────────────────────
 /**
  * Creates the Fish Audio `SpeechProviderPlugin`.
  *
  * - `resolveConfig` / `resolveTalkConfig` normalise raw config objects.
  * - `listVoices` and `synthesize` resolve the API key from the request or
  *   config first and fall back to `FISH_AUDIO_API_KEY`; both throw when no
  *   key is available.
  * - `synthesize` requests Opus when `req.target` is `"voice-note"` and MP3
  *   otherwise, applying per-request overrides on top of the config.
  *
  * @returns The provider object to pass to `registerSpeechProvider`.
  */
 export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "fish-audio",
    label: "Fish Audio",
    autoSelectOrder: 30,
    models: FISH_AUDIO_MODELS,
    resolveConfig: ({ rawConfig }) =>
      normalizeFishAudioProviderConfig(rawConfig),
    parseDirectiveToken,
    // Talk Mode — v2, stub for now
    resolveTalkConfig: ({ baseTtsConfig }) =>
      normalizeFishAudioProviderConfig(baseTtsConfig),
    // Only keys with a usable value are included in the result.
    resolveTalkOverrides: ({ params }) => ({
      ...(trimToUndefined(params.voiceId) == null
        ? {}
        : { voiceId: trimToUndefined(params.voiceId) }),
      ...(trimToUndefined(params.model) == null
        ? {}
        : { model: trimToUndefined(params.model) }),
      ...(asNumber(params.speed) == null
        ? {}
        : { speed: asNumber(params.speed) }),
    }),
    listVoices: async (req) => {
      const config = req.providerConfig
        ? readFishAudioProviderConfig(req.providerConfig)
        : undefined;
      // Precedence: explicit request key, then config, then environment.
      const apiKey =
        req.apiKey ||
        config?.apiKey ||
        process.env.FISH_AUDIO_API_KEY;
      if (!apiKey) {
        throw new Error("Fish Audio API key missing");
      }
      const raw = await listFishAudioVoices({
        apiKey,
        baseUrl: req.baseUrl ?? config?.baseUrl,
      });
      return raw as SpeechVoiceOption[];
    },
    isConfigured: ({ providerConfig }) =>
      Boolean(
        readFishAudioProviderConfig(providerConfig).apiKey ||
          process.env.FISH_AUDIO_API_KEY,
      ),
    synthesize: async (req) => {
      const config = readFishAudioProviderConfig(req.providerConfig);
      const overrides = req.providerOverrides ?? {};
      const apiKey =
        config.apiKey || process.env.FISH_AUDIO_API_KEY;
      if (!apiKey) {
        throw new Error("Fish Audio API key missing");
      }
      // Pick format based on target channel
      const useOpus = req.target === "voice-note";
      const format = useOpus ? "opus" : "mp3";
      const speed = asNumber(overrides.speed) ?? config.speed;
      if (speed != null) {
        requireInRange(speed, 0.5, 2.0, "speed");
      }
      // An explicit override wins whenever it names a valid mode. Comparing
      // the override against the default instead would make it impossible to
      // override a configured "balanced"/"low" back to "normal".
      const requestedLatency = trimToUndefined(overrides.latency)?.toLowerCase();
      const latency =
        requestedLatency === "normal" ||
        requestedLatency === "balanced" ||
        requestedLatency === "low"
          ? requestedLatency
          : config.latency;
      const audioBuffer = await fishAudioTTS({
        text: req.text,
        apiKey,
        baseUrl: config.baseUrl,
        referenceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
        model: trimToUndefined(overrides.model) ?? config.model,
        format,
        latency,
        speed,
        temperature: asNumber(overrides.temperature) ?? config.temperature,
        topP: asNumber(overrides.topP) ?? config.topP,
        timeoutMs: req.timeoutMs,
      });
      return {
        audioBuffer,
        outputFormat: format,
        fileExtension: useOpus ? ".opus" : ".mp3",
        voiceCompatible: true, // Fish Audio output works as voice note in both formats
      };
    },
  };
 }
--- a/tts.ts
+++ b/tts.ts
@@ -0,0 +1,175 @@
 /** API origin used when the caller supplies no usable base URL. */
 const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio";
 /**
  * Trims the base URL and strips trailing slashes.
  *
  * Falls back to the default origin when the input is missing, blank, or
  * consists only of slashes. Without the last case, `"/"` would normalise to
  * an empty string and produce a relative request URL.
  *
  * @param baseUrl Optional caller-supplied API origin.
  * @returns A non-empty origin without trailing slashes.
  */
 function normalizeFishAudioBaseUrl(baseUrl?: string): string {
  const stripped = baseUrl?.trim().replace(/\/+$/, "");
  return stripped ? stripped : DEFAULT_FISH_AUDIO_BASE_URL;
 }
 /**
  * Reads the body of a failed response for inclusion in an error message.
  * Returns `": <body>"` when a body is present and `""` when it is empty or
  * cannot be read.
  */
 async function readFishAudioErrorDetail(response: Response): Promise<string> {
  try {
    const raw = await response.text();
    return raw ? `: ${raw}` : "";
  } catch {
    // Ignore error body read failure
    return "";
  }
 }
 /**
  * Sends a synthesis request to `<baseUrl>/v1/tts` and returns the audio bytes.
  *
  * The model is sent as a `model` request header; voice, format, and the
  * optional tuning fields go in the JSON body. The request is aborted after
  * `timeoutMs` milliseconds.
  *
  * @throws Error when the text or reference ID is blank, when the API answers
  *   with a non-2xx status (status and body are included in the message), or
  *   when the response body is empty.
  */
 export async function fishAudioTTS(params: {
  text: string;
  apiKey: string;
  baseUrl?: string;
  referenceId: string;
  model: string;
  format: "mp3" | "opus" | "wav" | "pcm";
  latency?: "normal" | "balanced" | "low";
  speed?: number;
  temperature?: number;
  topP?: number;
  timeoutMs: number;
 }): Promise<Buffer> {
  if (!params.text.trim()) {
    throw new Error("Fish Audio TTS: empty text");
  }
  if (!params.referenceId.trim()) {
    throw new Error("Fish Audio TTS: missing reference_id (voice)");
  }

  const abortController = new AbortController();
  const timer = setTimeout(() => abortController.abort(), params.timeoutMs);
  try {
    const endpoint = `${normalizeFishAudioBaseUrl(params.baseUrl)}/v1/tts`;
    // Optional fields are added only when set; "normal" latency is omitted.
    const payload: Record<string, unknown> = {
      text: params.text,
      reference_id: params.referenceId,
      format: params.format,
      ...(params.latency && params.latency !== "normal"
        ? { latency: params.latency }
        : {}),
      // Prosody settings
      ...(params.speed != null ? { prosody: { speed: params.speed } } : {}),
      ...(params.temperature != null
        ? { temperature: params.temperature }
        : {}),
      ...(params.topP != null ? { top_p: params.topP } : {}),
    };

    const response = await fetch(endpoint, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${params.apiKey}`,
        "Content-Type": "application/json",
        model: params.model,
      },
      body: JSON.stringify(payload),
      signal: abortController.signal,
    });
    if (!response.ok) {
      const detail = await readFishAudioErrorDetail(response);
      throw new Error(`Fish Audio API error (${response.status})${detail}`);
    }

    const audio = Buffer.from(await response.arrayBuffer());
    if (audio.length === 0) {
      throw new Error("Fish Audio TTS produced empty audio");
    }
    return audio;
  } finally {
    clearTimeout(timer);
  }
 }
 /** Result of querying one voice source: the usable voices plus an error note, if any. */
 type FishAudioVoiceSourceResult = {
  voices: Array<{ id: string; title: string | undefined }>;
  error: string | undefined;
 };
 /**
  * Fetches one voice listing and extracts `{ id, title }` pairs from
  * `items`. Never rejects: network errors, non-2xx statuses, and unparsable
  * bodies are reported through `error` so one failing source cannot take
  * down the other.
  */
 async function fetchFishAudioVoiceSource(
  url: string,
  apiKey: string,
 ): Promise<FishAudioVoiceSourceResult> {
  let response: Response;
  try {
    response = await fetch(url, {
      headers: { Authorization: `Bearer ${apiKey}` },
    });
  } catch (error) {
    return { voices: [], error: String(error) };
  }
  if (!response.ok) {
    return { voices: [], error: `HTTP ${response.status}` };
  }

  let payload: unknown;
  try {
    payload = await response.json();
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    return { voices: [], error: `invalid JSON response (${reason})` };
  }

  const items =
    typeof payload === "object" && payload !== null
      ? (payload as { items?: unknown }).items
      : undefined;
  const voices: FishAudioVoiceSourceResult["voices"] = [];
  if (Array.isArray(items)) {
    for (const item of items) {
      // Skip malformed entries instead of throwing on them.
      if (typeof item !== "object" || item === null) {
        continue;
      }
      const { _id, title } = item as { _id?: unknown; title?: unknown };
      const id = typeof _id === "string" ? _id.trim() : "";
      if (!id) {
        continue;
      }
      const name = typeof title === "string" ? title.trim() : "";
      voices.push({ id, title: name || undefined });
    }
  }
  return { voices, error: undefined };
 }
 /**
  * Lists the voices available to the given API key.
  *
  * Two listings are requested in parallel: the official Fish Audio voices
  * and the caller's own voices. Results are merged by ID. On conflict the
  * caller's own entry wins and its name is suffixed with " (mine)". A voice
  * without a title is named by its ID.
  *
  * @param params.apiKey API key sent as a bearer token.
  * @param params.baseUrl Optional API origin override.
  * @returns The merged voices; may be empty when both listings succeed but
  *   contain no usable entries.
  * @throws Error when no voice was obtained and at least one listing failed.
  */
 export async function listFishAudioVoices(params: {
  apiKey: string;
  baseUrl?: string;
 }): Promise<Array<{ id: string; name: string }>> {
  const base = normalizeFishAudioBaseUrl(params.baseUrl);
  // Two parallel calls: official voices + user's own voices
  const [official, own] = await Promise.all([
    fetchFishAudioVoiceSource(
      `${base}/model?type=tts&author_id=d8b0991f96b44e489422ca2ddf0bd31d&page_size=100`,
      params.apiKey,
    ),
    fetchFishAudioVoiceSource(
      `${base}/model?type=tts&self=true&page_size=100`,
      params.apiKey,
    ),
  ]);

  const voices = new Map<string, string>();
  // Official voices go in first so the user's own voices can overwrite them.
  for (const voice of official.voices) {
    voices.set(voice.id, voice.title ?? voice.id);
  }
  for (const voice of own.voices) {
    voices.set(voice.id, voice.title ? `${voice.title} (mine)` : voice.id);
  }

  // Partial failure is tolerated; only fail when nothing at all was obtained.
  if (voices.size === 0) {
    const errors: string[] = [];
    if (official.error !== undefined) {
      errors.push(`official: ${official.error}`);
    }
    if (own.error !== undefined) {
      errors.push(`self: ${own.error}`);
    }
    if (errors.length > 0) {
      throw new Error(`Fish Audio voices API error: ${errors.join("; ")}`);
    }
  }
  return Array.from(voices, ([id, name]) => ({ id, name }));
 }