From ed505dcce15f07879ae3e7ea12251a1acb81ff02 Mon Sep 17 00:00:00 2001 From: Clawdbot Date: Sun, 29 Mar 2026 18:17:06 +1100 Subject: [PATCH] =?UTF-8?q?fix:=20Opus=20review=20pass=20=E2=80=94=20harde?= =?UTF-8?q?n=20before=20building?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Widen voice ID validation to 20-64 alphanumeric (future-proof) - Remove hardcoded default voiceId (SJ personal clone) - Require voiceId in isConfigured + synthesize guard with clear error - Add model header comment explaining Fish Audio's non-standard API - Truncate error bodies to 500 chars to prevent log pollution - Update tests and README to match --- README.md | 2 +- speech-provider.test.ts | 25 +++++++++++++------------ speech-provider.ts | 31 ++++++++++++++++++++++--------- tts.ts | 8 +++++++- 4 files changed, 43 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 47c6521..81a1b2a 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ In your `openclaw.json`: | Field | Type | Default | Description | |-------|------|---------|-------------| | `apiKey` | string | — | **Required.** Fish Audio API key | -| `voiceId` | string | `8a2d42...` | Reference ID of the voice to use | +| `voiceId` | string | — | **Required.** Reference ID of the voice to use | | `model` | string | `s2-pro` | TTS model (`s2-pro`, `s1`, `s2`) | | `latency` | string | `normal` | Latency mode (`normal`, `balanced`, `low`) | | `speed` | number | — | Prosody speed (0.5–2.0) | diff --git a/speech-provider.test.ts b/speech-provider.test.ts index ce974b9..c3673ae 100644 --- a/speech-provider.test.ts +++ b/speech-provider.test.ts @@ -3,14 +3,15 @@ import { isValidFishAudioVoiceId } from "./speech-provider.js"; describe("fish-audio speech provider", () => { describe("isValidFishAudioVoiceId", () => { - it("accepts valid Fish Audio ref IDs (24-40 char hex)", () => { + it("accepts valid Fish Audio ref IDs (20-64 alphanumeric chars)", () => { const valid = [ - "8a2d42279389471993460b85340235c5", // 32 char - standard - "0dad9e24630447cf97803f4beee10481", // 32 char - "5796fe24630447cf97803f4beee10481", // 32 char - "d8b0991f96b44e489422ca2ddf0bd31d", // 32 char - author id - "aabbccddee112233445566778899", // 28 char - "aabbccddee11223344556677", // 24 char (minimum) + "8a2d42279389471993460b85340235c5", // 32 char hex - standard + "0dad9e24630447cf97803f4beee10481", // 32 char hex + "d8b0991f96b44e489422ca2ddf0bd31d", // 32 char hex - author id + "aabbccddee112233445566778899aabb", // 32 char hex + "abcdefABCDEF12345678901234567890", // mixed case alphanumeric + "a1b2c3d4e5f6g7h8i9j0", // 20 char (minimum) + "a".repeat(64), // 64 char (maximum) ]; for (const v of valid) { expect(isValidFishAudioVoiceId(v), `expected valid: ${v}`).toBe(true); @@ -20,14 +21,14 @@ describe("fish-audio speech provider", () => { it("rejects invalid voice IDs", () => { const invalid = [ "", // empty - "abc123", // too short - "12345678901234567890123", // 23 chars - below minimum - "a".repeat(41), // too long + "abc123", // too short (6) + "1234567890123456789", // 19 chars - below minimum + "a".repeat(65), // too long (65) "8a2d4227-9389-4719-9346-0b85340235c5", // UUID with dashes "../../../etc/passwd", // path traversal "voice?param=value", // query string - "pMsXgVXv3BLzUgSXRplE", // ElevenLabs-style (mixed case, 20 chars) - "ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ", // non-hex chars + "hello world 1234567890", // spaces + "abcdef!@#$%^&*()12345678", // special chars ]; for (const v of invalid) { expect(isValidFishAudioVoiceId(v), `expected invalid: ${v}`).toBe( diff --git a/speech-provider.ts b/speech-provider.ts index 5505dee..1530493 100644 --- a/speech-provider.ts +++ b/speech-provider.ts @@ -11,7 +11,10 @@ import { fishAudioTTS, listFishAudioVoices } from "./tts.js"; // ── Defaults ──────────────────────────────────────────────────────────────── const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio"; -const DEFAULT_VOICE_ID = "8a2d42279389471993460b85340235c5"; // SJ voice +// No default voice — users must configure one. Fish Audio has no universal +// "default" voice like ElevenLabs does, and shipping a personal clone ID +// as default would be wrong for community users. +const DEFAULT_VOICE_ID = ""; const DEFAULT_MODEL = "s2-pro"; const DEFAULT_LATENCY = "normal" as const; @@ -67,9 +70,11 @@ function normalizeModel(value: unknown): string { return s || DEFAULT_MODEL; } -/** Fish Audio ref IDs are 32-char hex strings */ +/** Fish Audio voice ref IDs — alphanumeric, 20-64 chars. Permissive enough + * to handle future ID format changes while still rejecting path traversal + * and injection attempts. */ export function isValidFishAudioVoiceId(voiceId: string): boolean { - return /^[a-f0-9]{24,40}$/i.test(voiceId); + return /^[a-zA-Z0-9]{20,64}$/.test(voiceId); } // ── Config resolution ─────────────────────────────────────────────────────── @@ -270,11 +275,12 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin { return raw as SpeechVoiceOption[]; }, - isConfigured: ({ providerConfig }) => - Boolean( - readFishAudioProviderConfig(providerConfig).apiKey || - process.env.FISH_AUDIO_API_KEY, - ), + isConfigured: ({ providerConfig }) => { + const config = readFishAudioProviderConfig(providerConfig); + const hasKey = Boolean(config.apiKey || process.env.FISH_AUDIO_API_KEY); + const hasVoice = Boolean(config.voiceId); + return hasKey && hasVoice; + }, synthesize: async (req) => { const config = readFishAudioProviderConfig(req.providerConfig); @@ -285,6 +291,13 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin { throw new Error("Fish Audio API key missing"); } + const voiceId = trimToUndefined(overrides.voiceId) ?? config.voiceId; + if (!voiceId) { + throw new Error( + "Fish Audio: no voiceId configured. Set messages.tts.providers.fish-audio.voiceId", + ); + } + // Pick format based on target channel const useOpus = req.target === "voice-note"; const format = useOpus ? "opus" : "mp3"; @@ -298,7 +311,7 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin { text: req.text, apiKey, baseUrl: config.baseUrl, - referenceId: trimToUndefined(overrides.voiceId) ?? config.voiceId, + referenceId: voiceId, model: trimToUndefined(overrides.model) ?? config.model, format, latency: diff --git a/tts.ts b/tts.ts index e7c17a9..05d66b7 100644 --- a/tts.ts +++ b/tts.ts @@ -70,6 +70,9 @@ export async function fishAudioTTS(params: { body.top_p = topP; } + // Fish Audio uses the `model` HTTP header (not a body field) to select + // the TTS model. This is intentional per their API spec — don't move it + // into the JSON body. const response = await fetch(url, { method: "POST", headers: { @@ -85,7 +88,10 @@ export async function fishAudioTTS(params: { let errorDetail = ""; try { const errorBody = await response.text(); - errorDetail = errorBody ? `: ${errorBody}` : ""; + // Cap at 500 chars to avoid log pollution from large error responses + const truncated = + errorBody.length > 500 ? `${errorBody.slice(0, 500)}…` : errorBody; + errorDetail = truncated ? `: ${truncated}` : ""; } catch { // Ignore error body read failure }