feat: scaffold Fish Audio speech provider plugin

- index.ts: plugin entry with definePluginEntry + registerSpeechProvider - speech-provider.ts: full SpeechProviderPlugin implementation - resolveConfig from messages.tts.providers.fish-audio - parseDirectiveToken for voice, model, speed, latency, temperature, top_p - listVoices merging official + user's own voices - synthesize with format-aware output (opus for voice-note, mp3 otherwise) - stub Talk Mode (resolveTalkConfig/resolveTalkOverrides) - tts.ts: raw fishAudioTTS() fetch + listFishAudioVoices() - streaming chunked → buffer, error body included in exceptions - parallel voice listing with graceful partial failure - speech-provider.test.ts: voice ID validation tests - openclaw.plugin.json: speechProviders contract - package.json: peer dep on openclaw >=2026.3.0
2026-03-29 18:14:29 +11:00
parent ee1eb27cf0
commit 4842dc64a5
7 changed files with 675 additions and 2 deletions
--- a/speech-provider.ts
+++ b/speech-provider.ts
@@ -0,0 +1,322 @@
+import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
+import type {
+  SpeechDirectiveTokenParseContext,
+  SpeechProviderConfig,
+  SpeechProviderPlugin,
+  SpeechVoiceOption,
+} from "openclaw/plugin-sdk/speech-core";
+import { requireInRange } from "openclaw/plugin-sdk/speech-core";
+import { fishAudioTTS, listFishAudioVoices } from "./tts.js";
+
+// ── Defaults ────────────────────────────────────────────────────────────────
+
+const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio";
+const DEFAULT_VOICE_ID = "8a2d42279389471993460b85340235c5"; // SJ voice
+const DEFAULT_MODEL = "s2-pro";
+const DEFAULT_LATENCY = "normal" as const;
+
+const FISH_AUDIO_MODELS = ["s2-pro", "s1", "s2"] as const;
+
+// ── Types ───────────────────────────────────────────────────────────────────
+
+type FishAudioProviderConfig = {
+  apiKey?: string;
+  baseUrl: string;
+  voiceId: string;
+  model: string;
+  latency: "normal" | "balanced" | "low";
+  speed?: number;
+  temperature?: number;
+  topP?: number;
+};
+
+// ── Helpers ─────────────────────────────────────────────────────────────────
+
+function trimToUndefined(value: unknown): string | undefined {
+  return typeof value === "string" && value.trim() ? value.trim() : undefined;
+}
+
+function asNumber(value: unknown): number | undefined {
+  return typeof value === "number" && Number.isFinite(value) ? value : undefined;
+}
+
+function asObject(value: unknown): Record<string, unknown> | undefined {
+  return typeof value === "object" && value !== null && !Array.isArray(value)
+    ? (value as Record<string, unknown>)
+    : undefined;
+}
+
+function parseNumberValue(value: string): number | undefined {
+  const parsed = Number.parseFloat(value);
+  return Number.isFinite(parsed) ? parsed : undefined;
+}
+
+function normalizeBaseUrl(baseUrl: string | undefined): string {
+  const trimmed = baseUrl?.trim();
+  return trimmed?.replace(/\/+$/, "") || DEFAULT_FISH_AUDIO_BASE_URL;
+}
+
+function normalizeLatency(value: unknown): "normal" | "balanced" | "low" {
+  const s = typeof value === "string" ? value.trim().toLowerCase() : "";
+  if (s === "balanced" || s === "low") return s;
+  return DEFAULT_LATENCY;
+}
+
+function normalizeModel(value: unknown): string {
+  const s = typeof value === "string" ? value.trim() : "";
+  return s || DEFAULT_MODEL;
+}
+
+/** Fish Audio ref IDs are 32-char hex strings */
+export function isValidFishAudioVoiceId(voiceId: string): boolean {
+  return /^[a-f0-9]{24,40}$/i.test(voiceId);
+}
+
+// ── Config resolution ───────────────────────────────────────────────────────
+
+function normalizeFishAudioProviderConfig(
+  rawConfig: Record<string, unknown>,
+): FishAudioProviderConfig {
+  const providers = asObject(rawConfig.providers);
+  const raw =
+    asObject(providers?.["fish-audio"]) ?? asObject(rawConfig["fish-audio"]);
+  return {
+    apiKey: normalizeResolvedSecretInputString({
+      value: raw?.apiKey,
+      path: "messages.tts.providers.fish-audio.apiKey",
+    }),
+    baseUrl: normalizeBaseUrl(trimToUndefined(raw?.baseUrl)),
+    voiceId: trimToUndefined(raw?.voiceId) ?? DEFAULT_VOICE_ID,
+    model: normalizeModel(raw?.model),
+    latency: normalizeLatency(raw?.latency),
+    speed: asNumber(raw?.speed),
+    temperature: asNumber(raw?.temperature),
+    topP: asNumber(raw?.topP),
+  };
+}
+
+function readFishAudioProviderConfig(
+  config: SpeechProviderConfig,
+): FishAudioProviderConfig {
+  const defaults = normalizeFishAudioProviderConfig({});
+  return {
+    apiKey: trimToUndefined(config.apiKey) ?? defaults.apiKey,
+    baseUrl: normalizeBaseUrl(
+      trimToUndefined(config.baseUrl) ?? defaults.baseUrl,
+    ),
+    voiceId: trimToUndefined(config.voiceId) ?? defaults.voiceId,
+    model: normalizeModel(config.model) || defaults.model,
+    latency: normalizeLatency(config.latency),
+    speed: asNumber(config.speed) ?? defaults.speed,
+    temperature: asNumber(config.temperature) ?? defaults.temperature,
+    topP: asNumber(config.topP) ?? defaults.topP,
+  };
+}
+
+// ── Directive parsing ───────────────────────────────────────────────────────
+
+function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
+  try {
+    switch (ctx.key) {
+      case "voice":
+      case "voiceid":
+      case "voice_id":
+      case "fish_voice":
+      case "fishvoice":
+      case "reference_id":
+        if (!ctx.policy.allowVoice) {
+          return { handled: true };
+        }
+        if (!isValidFishAudioVoiceId(ctx.value)) {
+          return {
+            handled: true,
+            warnings: [`invalid Fish Audio voice ID "${ctx.value}"`],
+          };
+        }
+        return {
+          handled: true,
+          overrides: { ...(ctx.currentOverrides ?? {}), voiceId: ctx.value },
+        };
+
+      case "model":
+      case "modelid":
+      case "model_id":
+      case "fish_model":
+      case "fishmodel":
+        if (!ctx.policy.allowModelId) {
+          return { handled: true };
+        }
+        return {
+          handled: true,
+          overrides: { ...(ctx.currentOverrides ?? {}), model: ctx.value },
+        };
+
+      case "speed": {
+        if (!ctx.policy.allowVoiceSettings) {
+          return { handled: true };
+        }
+        const value = parseNumberValue(ctx.value);
+        if (value == null) {
+          return { handled: true, warnings: ["invalid speed value"] };
+        }
+        requireInRange(value, 0.5, 2.0, "speed");
+        return {
+          handled: true,
+          overrides: { ...(ctx.currentOverrides ?? {}), speed: value },
+        };
+      }
+
+      case "latency":
+      case "fish_latency":
+        if (!ctx.policy.allowVoiceSettings) {
+          return { handled: true };
+        }
+        {
+          const lat = normalizeLatency(ctx.value);
+          return {
+            handled: true,
+            overrides: { ...(ctx.currentOverrides ?? {}), latency: lat },
+          };
+        }
+
+      case "temperature":
+      case "temp": {
+        if (!ctx.policy.allowVoiceSettings) {
+          return { handled: true };
+        }
+        const value = parseNumberValue(ctx.value);
+        if (value == null) {
+          return { handled: true, warnings: ["invalid temperature value"] };
+        }
+        requireInRange(value, 0, 1, "temperature");
+        return {
+          handled: true,
+          overrides: { ...(ctx.currentOverrides ?? {}), temperature: value },
+        };
+      }
+
+      case "top_p":
+      case "topp": {
+        if (!ctx.policy.allowVoiceSettings) {
+          return { handled: true };
+        }
+        const value = parseNumberValue(ctx.value);
+        if (value == null) {
+          return { handled: true, warnings: ["invalid top_p value"] };
+        }
+        requireInRange(value, 0, 1, "top_p");
+        return {
+          handled: true,
+          overrides: { ...(ctx.currentOverrides ?? {}), topP: value },
+        };
+      }
+
+      default:
+        return { handled: false };
+    }
+  } catch (error) {
+    return {
+      handled: true,
+      warnings: [error instanceof Error ? error.message : String(error)],
+    };
+  }
+}
+
+// ── Provider ────────────────────────────────────────────────────────────────
+
+export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {
+  return {
+    id: "fish-audio",
+    label: "Fish Audio",
+    autoSelectOrder: 30,
+    models: FISH_AUDIO_MODELS,
+
+    resolveConfig: ({ rawConfig }) =>
+      normalizeFishAudioProviderConfig(rawConfig),
+
+    parseDirectiveToken,
+
+    // Talk Mode — v2, stub for now
+    resolveTalkConfig: ({ baseTtsConfig }) =>
+      normalizeFishAudioProviderConfig(baseTtsConfig),
+
+    resolveTalkOverrides: ({ params }) => ({
+      ...(trimToUndefined(params.voiceId) == null
+        ? {}
+        : { voiceId: trimToUndefined(params.voiceId) }),
+      ...(trimToUndefined(params.model) == null
+        ? {}
+        : { model: trimToUndefined(params.model) }),
+      ...(asNumber(params.speed) == null
+        ? {}
+        : { speed: asNumber(params.speed) }),
+    }),
+
+    listVoices: async (req) => {
+      const config = req.providerConfig
+        ? readFishAudioProviderConfig(req.providerConfig)
+        : undefined;
+      const apiKey =
+        req.apiKey ||
+        config?.apiKey ||
+        process.env.FISH_AUDIO_API_KEY;
+      if (!apiKey) {
+        throw new Error("Fish Audio API key missing");
+      }
+      const raw = await listFishAudioVoices({
+        apiKey,
+        baseUrl: req.baseUrl ?? config?.baseUrl,
+      });
+      return raw as SpeechVoiceOption[];
+    },
+
+    isConfigured: ({ providerConfig }) =>
+      Boolean(
+        readFishAudioProviderConfig(providerConfig).apiKey ||
+          process.env.FISH_AUDIO_API_KEY,
+      ),
+
+    synthesize: async (req) => {
+      const config = readFishAudioProviderConfig(req.providerConfig);
+      const overrides = req.providerOverrides ?? {};
+      const apiKey =
+        config.apiKey || process.env.FISH_AUDIO_API_KEY;
+      if (!apiKey) {
+        throw new Error("Fish Audio API key missing");
+      }
+
+      // Pick format based on target channel
+      const useOpus = req.target === "voice-note";
+      const format = useOpus ? "opus" : "mp3";
+
+      const speed = asNumber(overrides.speed) ?? config.speed;
+      if (speed != null) {
+        requireInRange(speed, 0.5, 2.0, "speed");
+      }
+
+      const audioBuffer = await fishAudioTTS({
+        text: req.text,
+        apiKey,
+        baseUrl: config.baseUrl,
+        referenceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
+        model: trimToUndefined(overrides.model) ?? config.model,
+        format,
+        latency:
+          normalizeLatency(overrides.latency) !== DEFAULT_LATENCY
+            ? normalizeLatency(overrides.latency)
+            : config.latency,
+        speed,
+        temperature: asNumber(overrides.temperature) ?? config.temperature,
+        topP: asNumber(overrides.topP) ?? config.topP,
+        timeoutMs: req.timeoutMs,
+      });
+
+      return {
+        audioBuffer,
+        outputFormat: format,
+        fileExtension: useOpus ? ".opus" : ".mp3",
+        voiceCompatible: true, // Fish Audio output works as voice note in both formats
+      };
+    },
+  };
+}