From ed505dcce15f07879ae3e7ea12251a1acb81ff02 Mon Sep 17 00:00:00 2001
From: Clawdbot <clawdbot@apilab.us>
Date: Sun, 29 Mar 2026 18:17:06 +1100
Subject: [PATCH] =?UTF-8?q?fix:=20Opus=20review=20pass=20=E2=80=94=20harde?=
 =?UTF-8?q?n=20before=20building?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Widen voice ID validation to 20-64 alphanumeric (future-proof)
- Remove hardcoded default voiceId (SJ personal clone)
- Require voiceId in isConfigured; add a synthesize guard that throws a clear error
- Add comment explaining that Fish Audio selects the model via the `model` HTTP header
- Truncate error bodies to 500 chars to prevent log pollution
- Update tests and README to match
---
 README.md               |  2 +-
 speech-provider.test.ts | 25 +++++++++++++------------
 speech-provider.ts      | 31 ++++++++++++++++++++++---------
 tts.ts                  |  8 +++++++-
 4 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 47c6521..81a1b2a 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ In your `openclaw.json`:
 | Field | Type | Default | Description |
 |-------|------|---------|-------------|
 | `apiKey` | string | — | **Required.** Fish Audio API key |
-| `voiceId` | string | `8a2d42...` | Reference ID of the voice to use |
+| `voiceId` | string | — | **Required.** Reference ID of the voice to use |
 | `model` | string | `s2-pro` | TTS model (`s2-pro`, `s1`, `s2`) |
 | `latency` | string | `normal` | Latency mode (`normal`, `balanced`, `low`) |
 | `speed` | number | — | Prosody speed (0.5–2.0) |
diff --git a/speech-provider.test.ts b/speech-provider.test.ts
index ce974b9..c3673ae 100644
--- a/speech-provider.test.ts
+++ b/speech-provider.test.ts
@@ -3,14 +3,15 @@ import { isValidFishAudioVoiceId } from "./speech-provider.js";
 
 describe("fish-audio speech provider", () => {
   describe("isValidFishAudioVoiceId", () => {
-    it("accepts valid Fish Audio ref IDs (24-40 char hex)", () => {
+    it("accepts valid Fish Audio ref IDs (20-64 alphanumeric chars)", () => {
       const valid = [
-        "8a2d42279389471993460b85340235c5", // 32 char - standard
-        "0dad9e24630447cf97803f4beee10481", // 32 char
-        "5796fe24630447cf97803f4beee10481", // 32 char
-        "d8b0991f96b44e489422ca2ddf0bd31d", // 32 char - author id
-        "aabbccddee112233445566778899", // 28 char
-        "aabbccddee11223344556677", // 24 char (minimum)
+        "8a2d42279389471993460b85340235c5", // 32 char hex - standard
+        "0dad9e24630447cf97803f4beee10481", // 32 char hex
+        "d8b0991f96b44e489422ca2ddf0bd31d", // 32 char hex - author id
+        "aabbccddee112233445566778899aabb", // 32 char hex
+        "abcdefABCDEF12345678901234567890", // mixed case alphanumeric
+        "a1b2c3d4e5f6g7h8i9j0", // 20 char (minimum)
+        "a".repeat(64), // 64 char (maximum)
       ];
       for (const v of valid) {
         expect(isValidFishAudioVoiceId(v), `expected valid: ${v}`).toBe(true);
@@ -20,14 +21,14 @@ describe("fish-audio speech provider", () => {
     it("rejects invalid voice IDs", () => {
       const invalid = [
         "", // empty
-        "abc123", // too short
-        "12345678901234567890123", // 23 chars - below minimum
-        "a".repeat(41), // too long
+        "abc123", // too short (6)
+        "1234567890123456789", // 19 chars - below minimum
+        "a".repeat(65), // too long (65)
         "8a2d4227-9389-4719-9346-0b85340235c5", // UUID with dashes
         "../../../etc/passwd", // path traversal
         "voice?param=value", // query string
-        "pMsXgVXv3BLzUgSXRplE", // ElevenLabs-style (mixed case, 20 chars)
-        "ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ", // non-hex chars
+        "hello world 1234567890", // spaces
+        "abcdef!@#$%^&*()12345678", // special chars
       ];
       for (const v of invalid) {
         expect(isValidFishAudioVoiceId(v), `expected invalid: ${v}`).toBe(
diff --git a/speech-provider.ts b/speech-provider.ts
index 5505dee..1530493 100644
--- a/speech-provider.ts
+++ b/speech-provider.ts
@@ -11,7 +11,10 @@ import { fishAudioTTS, listFishAudioVoices } from "./tts.js";
 // ── Defaults ────────────────────────────────────────────────────────────────
 
 const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio";
-const DEFAULT_VOICE_ID = "8a2d42279389471993460b85340235c5"; // SJ voice
+// No default voice — users must configure one. Fish Audio has no universal
+// "default" voice like ElevenLabs does, and shipping a personal clone ID
+// as default would be wrong for community users.
+const DEFAULT_VOICE_ID = "";
 const DEFAULT_MODEL = "s2-pro";
 const DEFAULT_LATENCY = "normal" as const;
 
@@ -67,9 +70,11 @@ function normalizeModel(value: unknown): string {
   return s || DEFAULT_MODEL;
 }
 
-/** Fish Audio ref IDs are 32-char hex strings */
+/** Fish Audio voice ref IDs — alphanumeric, 20-64 chars. Permissive enough
+ *  to handle future ID format changes while still rejecting path traversal
+ *  and injection attempts. */
 export function isValidFishAudioVoiceId(voiceId: string): boolean {
-  return /^[a-f0-9]{24,40}$/i.test(voiceId);
+  return /^[a-zA-Z0-9]{20,64}$/.test(voiceId);
 }
 
 // ── Config resolution ───────────────────────────────────────────────────────
@@ -270,11 +275,12 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {
       return raw as SpeechVoiceOption[];
     },
 
-    isConfigured: ({ providerConfig }) =>
-      Boolean(
-        readFishAudioProviderConfig(providerConfig).apiKey ||
-          process.env.FISH_AUDIO_API_KEY,
-      ),
+    isConfigured: ({ providerConfig }) => {
+      const config = readFishAudioProviderConfig(providerConfig);
+      const hasKey = Boolean(config.apiKey || process.env.FISH_AUDIO_API_KEY);
+      const hasVoice = Boolean(config.voiceId);
+      return hasKey && hasVoice;
+    },
 
     synthesize: async (req) => {
       const config = readFishAudioProviderConfig(req.providerConfig);
@@ -285,6 +291,13 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {
         throw new Error("Fish Audio API key missing");
       }
 
+      const voiceId = trimToUndefined(overrides.voiceId) ?? config.voiceId;
+      if (!voiceId) {
+        throw new Error(
+          "Fish Audio: no voiceId configured. Set messages.tts.providers.fish-audio.voiceId",
+        );
+      }
+
       // Pick format based on target channel
       const useOpus = req.target === "voice-note";
       const format = useOpus ? "opus" : "mp3";
@@ -298,7 +311,7 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {
         text: req.text,
         apiKey,
         baseUrl: config.baseUrl,
-        referenceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
+        referenceId: voiceId,
         model: trimToUndefined(overrides.model) ?? config.model,
         format,
         latency:
diff --git a/tts.ts b/tts.ts
index e7c17a9..05d66b7 100644
--- a/tts.ts
+++ b/tts.ts
@@ -70,6 +70,9 @@ export async function fishAudioTTS(params: {
       body.top_p = topP;
     }
 
+    // Fish Audio uses the `model` HTTP header (not a body field) to select
+    // the TTS model. This is intentional per their API spec — don't move it
+    // into the JSON body.
     const response = await fetch(url, {
       method: "POST",
       headers: {
@@ -85,7 +88,10 @@ export async function fishAudioTTS(params: {
       let errorDetail = "";
       try {
         const errorBody = await response.text();
-        errorDetail = errorBody ? `: ${errorBody}` : "";
+        // Cap at 500 chars to avoid log pollution from large error responses
+        const truncated =
+          errorBody.length > 500 ? `${errorBody.slice(0, 500)}…` : errorBody;
+        errorDetail = truncated ? `: ${truncated}` : "";
       } catch {
         // Ignore error body read failure
       }