sync: match upstream PR #56891 at rebase 2026-03-30

Brings the Gitea mirror up to date with the current state of the openclaw/openclaw PR branch, including all fixes from Codex review:

- Namespaced directive keys (fishaudio_*/fish_* prefixes only)
- Strict latency directive validation with warnings
- Code quality cleanup, s2 model removal
- Contract and directive parsing tests
- README updated with prefixed directive docs

Source: Conan-Scott/openclaw@9787ef6e (feat/fish-audio-speech-provider)
2026-03-30 18:14:52 +11:00
parent c23359d50d
commit 4af0789ebe
7 changed files with 212 additions and 191 deletions
--- a/speech-provider.ts
+++ b/speech-provider.ts
@@ -6,11 +6,9 @@ import type {
  SpeechVoiceOption,
 } from "openclaw/plugin-sdk/speech-core";
 import { requireInRange } from "openclaw/plugin-sdk/speech-core";
-import { fishAudioTTS, listFishAudioVoices } from "./tts.js";
+import { DEFAULT_FISH_AUDIO_BASE_URL, fishAudioTTS, listFishAudioVoices, normalizeFishAudioBaseUrl } from "./tts.js";

 // ── Defaults ────────────────────────────────────────────────────────────────
-
-const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio";
 // No default voice — users must configure one. Fish Audio has no universal
 // "default" voice like ElevenLabs does, and shipping a personal clone ID
 // as default would be wrong for community users.
@@ -18,7 +16,7 @@ const DEFAULT_VOICE_ID = "";
 const DEFAULT_MODEL = "s2-pro";
 const DEFAULT_LATENCY = "normal" as const;

-const FISH_AUDIO_MODELS = ["s2-pro", "s1", "s2"] as const;
+const FISH_AUDIO_MODELS = ["s2-pro", "s1"] as const;

 // ── Types ───────────────────────────────────────────────────────────────────

@@ -54,11 +52,6 @@ function parseNumberValue(value: string): number | undefined {
  return Number.isFinite(parsed) ? parsed : undefined;
 }

-function normalizeBaseUrl(baseUrl: string | undefined): string {
-  const trimmed = baseUrl?.trim();
-  return trimmed?.replace(/\/+$/, "") || DEFAULT_FISH_AUDIO_BASE_URL;
-}
-
 function normalizeLatency(value: unknown): "normal" | "balanced" | "low" {
  const s = typeof value === "string" ? value.trim().toLowerCase() : "";
  if (s === "balanced" || s === "low") return s;
@@ -90,7 +83,7 @@ function normalizeFishAudioProviderConfig(
      value: raw?.apiKey,
      path: "messages.tts.providers.fish-audio.apiKey",
    }),
-    baseUrl: normalizeBaseUrl(trimToUndefined(raw?.baseUrl)),
+    baseUrl: normalizeFishAudioBaseUrl(trimToUndefined(raw?.baseUrl)),
    voiceId: trimToUndefined(raw?.voiceId) ?? DEFAULT_VOICE_ID,
    model: normalizeModel(raw?.model),
    latency: normalizeLatency(raw?.latency),
@@ -106,7 +99,7 @@ function readFishAudioProviderConfig(
  const defaults = normalizeFishAudioProviderConfig({});
  return {
    apiKey: trimToUndefined(config.apiKey) ?? defaults.apiKey,
-    baseUrl: normalizeBaseUrl(
+    baseUrl: normalizeFishAudioBaseUrl(
      trimToUndefined(config.baseUrl) ?? defaults.baseUrl,
    ),
    voiceId: trimToUndefined(config.voiceId) ?? defaults.voiceId,
@@ -121,14 +114,17 @@ function readFishAudioProviderConfig(
 // ── Directive parsing ───────────────────────────────────────────────────────

 function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
+  // Only claim provider-prefixed keys to avoid dispatch collisions.
+  // `parseTtsDirectives` stops at the first provider whose `parseDirectiveToken`
+  // returns `handled: true`, and bundled providers with lower `autoSelectOrder`
+  // (e.g. OpenAI at 10) are visited first. Generic keys like "voice" or "model"
+  // would be swallowed by earlier providers and never reach us.
+  // Convention matches ElevenLabs: `elevenlabs_voice`, `elevenlabs_model`, etc.
  try {
    switch (ctx.key) {
-      case "voice":
-      case "voiceid":
-      case "voice_id":
+      case "fishaudio_voice":
      case "fish_voice":
-      case "fishvoice":
-      case "reference_id":
+      case "fishaudio_voiceid":
        if (!ctx.policy.allowVoice) {
          return { handled: true };
        }
@@ -143,11 +139,8 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
          overrides: { ...(ctx.currentOverrides ?? {}), voiceId: ctx.value },
        };

-      case "model":
-      case "modelid":
-      case "model_id":
+      case "fishaudio_model":
      case "fish_model":
-      case "fishmodel":
        if (!ctx.policy.allowModelId) {
          return { handled: true };
        }
@@ -156,7 +149,8 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
          overrides: { ...(ctx.currentOverrides ?? {}), model: ctx.value },
        };

-      case "speed": {
+      case "fishaudio_speed":
+      case "fish_speed": {
        if (!ctx.policy.allowVoiceSettings) {
          return { handled: true };
        }
@@ -171,21 +165,26 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
        };
      }

-      case "latency":
-      case "fish_latency":
+      case "fishaudio_latency":
+      case "fish_latency": {
        if (!ctx.policy.allowVoiceSettings) {
          return { handled: true };
        }
-        {
-          const lat = normalizeLatency(ctx.value);
+        const raw = typeof ctx.value === "string" ? ctx.value.trim().toLowerCase() : "";
+        if (raw !== "normal" && raw !== "balanced" && raw !== "low") {
          return {
            handled: true,
-            overrides: { ...(ctx.currentOverrides ?? {}), latency: lat },
+            warnings: [`invalid Fish Audio latency "${ctx.value}" (expected: normal, balanced, low)`],
          };
        }
+        return {
+          handled: true,
+          overrides: { ...(ctx.currentOverrides ?? {}), latency: raw },
+        };
+      }

-      case "temperature":
-      case "temp": {
+      case "fishaudio_temperature":
+      case "fish_temperature": {
        if (!ctx.policy.allowVoiceSettings) {
          return { handled: true };
        }
@@ -200,8 +199,8 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
        };
      }

-      case "top_p":
-      case "topp": {
+      case "fishaudio_top_p":
+      case "fish_top_p": {
        if (!ctx.policy.allowVoiceSettings) {
          return { handled: true };
        }
@@ -233,6 +232,9 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "fish-audio",
    label: "Fish Audio",
+    // Lower value = higher priority in auto-detect fallback. 15 places Fish Audio
+    // after OpenAI (10) but ahead of ElevenLabs (20) and Microsoft (30), since Fish
+    // Audio requires explicit configuration (apiKey + voiceId) to pass isConfigured().
    autoSelectOrder: 15,
    models: FISH_AUDIO_MODELS,

@@ -241,17 +243,43 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {

    parseDirectiveToken,

-    // Talk Mode — v2, stub for now
-    resolveTalkConfig: ({ baseTtsConfig }) =>
-      normalizeFishAudioProviderConfig(baseTtsConfig),
+    resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
+      const base = normalizeFishAudioProviderConfig(baseTtsConfig);
+      return {
+        ...base,
+        ...(talkProviderConfig.apiKey === undefined
+          ? {}
+          : {
+              apiKey: normalizeResolvedSecretInputString({
+                value: talkProviderConfig.apiKey,
+                path: "talk.providers.fish-audio.apiKey",
+              }),
+            }),
+        ...(trimToUndefined(talkProviderConfig.baseUrl) == null
+          ? {}
+          : { baseUrl: normalizeFishAudioBaseUrl(trimToUndefined(talkProviderConfig.baseUrl)) }),
+        ...(trimToUndefined(talkProviderConfig.voiceId) == null
+          ? {}
+          : { voiceId: trimToUndefined(talkProviderConfig.voiceId) }),
+        ...(trimToUndefined(talkProviderConfig.modelId) == null
+          ? {}
+          : { model: normalizeModel(talkProviderConfig.modelId) }),
+        ...(talkProviderConfig.latency == null
+          ? {}
+          : { latency: normalizeLatency(talkProviderConfig.latency) }),
+        ...(asNumber(talkProviderConfig.speed) == null
+          ? {}
+          : { speed: asNumber(talkProviderConfig.speed) }),
+      };
+    },

    resolveTalkOverrides: ({ params }) => ({
      ...(trimToUndefined(params.voiceId) == null
        ? {}
        : { voiceId: trimToUndefined(params.voiceId) }),
-      ...(trimToUndefined(params.model) == null
+      ...(trimToUndefined(params.modelId) == null
        ? {}
-        : { model: trimToUndefined(params.model) }),
+        : { model: trimToUndefined(params.modelId) }),
      ...(asNumber(params.speed) == null
        ? {}
        : { speed: asNumber(params.speed) }),
@@ -314,10 +342,9 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {
        referenceId: voiceId,
        model: trimToUndefined(overrides.model) ?? config.model,
        format,
-        latency:
-          normalizeLatency(overrides.latency) !== DEFAULT_LATENCY
-            ? normalizeLatency(overrides.latency)
-            : config.latency,
+        latency: overrides.latency != null
+          ? normalizeLatency(overrides.latency)
+          : config.latency,
        speed,
        temperature: asNumber(overrides.temperature) ?? config.temperature,
        topP: asNumber(overrides.topP) ?? config.topP,