fix: Opus review pass — harden before building
- Widen voice ID validation to 20-64 alphanumeric (future-proof)
- Remove hardcoded default voiceId (SJ personal clone)
- Require voiceId in isConfigured + synthesize guard with clear error
- Add model header comment explaining Fish Audio's non-standard API
- Truncate error bodies to 500 chars to prevent log pollution
- Update tests and README to match
This commit is contained in:
@@ -44,7 +44,7 @@ In your `openclaw.json`:
|
|||||||
| Field | Type | Default | Description |
|
| Field | Type | Default | Description |
|
||||||
|-------|------|---------|-------------|
|
|-------|------|---------|-------------|
|
||||||
| `apiKey` | string | — | **Required.** Fish Audio API key |
|
| `apiKey` | string | — | **Required.** Fish Audio API key |
|
||||||
| `voiceId` | string | `8a2d42...` | Reference ID of the voice to use |
|
| `voiceId` | string | — | **Required.** Reference ID of the voice to use |
|
||||||
| `model` | string | `s2-pro` | TTS model (`s2-pro`, `s1`, `s2`) |
|
| `model` | string | `s2-pro` | TTS model (`s2-pro`, `s1`, `s2`) |
|
||||||
| `latency` | string | `normal` | Latency mode (`normal`, `balanced`, `low`) |
|
| `latency` | string | `normal` | Latency mode (`normal`, `balanced`, `low`) |
|
||||||
| `speed` | number | — | Prosody speed (0.5–2.0) |
|
| `speed` | number | — | Prosody speed (0.5–2.0) |
|
||||||
|
|||||||
@@ -3,14 +3,15 @@ import { isValidFishAudioVoiceId } from "./speech-provider.js";
|
|||||||
|
|
||||||
describe("fish-audio speech provider", () => {
|
describe("fish-audio speech provider", () => {
|
||||||
describe("isValidFishAudioVoiceId", () => {
|
describe("isValidFishAudioVoiceId", () => {
|
||||||
it("accepts valid Fish Audio ref IDs (24-40 char hex)", () => {
|
it("accepts valid Fish Audio ref IDs (20-64 alphanumeric chars)", () => {
|
||||||
const valid = [
|
const valid = [
|
||||||
"8a2d42279389471993460b85340235c5", // 32 char - standard
|
"8a2d42279389471993460b85340235c5", // 32 char hex - standard
|
||||||
"0dad9e24630447cf97803f4beee10481", // 32 char
|
"0dad9e24630447cf97803f4beee10481", // 32 char hex
|
||||||
"5796fe24630447cf97803f4beee10481", // 32 char
|
"d8b0991f96b44e489422ca2ddf0bd31d", // 32 char hex - author id
|
||||||
"d8b0991f96b44e489422ca2ddf0bd31d", // 32 char - author id
|
"aabbccddee112233445566778899aabb", // 32 char hex
|
||||||
"aabbccddee112233445566778899", // 28 char
|
"abcdefABCDEF12345678901234567890", // mixed case alphanumeric
|
||||||
"aabbccddee11223344556677", // 24 char (minimum)
|
"a1b2c3d4e5f6g7h8i9j0", // 20 char (minimum)
|
||||||
|
"a".repeat(64), // 64 char (maximum)
|
||||||
];
|
];
|
||||||
for (const v of valid) {
|
for (const v of valid) {
|
||||||
expect(isValidFishAudioVoiceId(v), `expected valid: ${v}`).toBe(true);
|
expect(isValidFishAudioVoiceId(v), `expected valid: ${v}`).toBe(true);
|
||||||
@@ -20,14 +21,14 @@ describe("fish-audio speech provider", () => {
|
|||||||
it("rejects invalid voice IDs", () => {
|
it("rejects invalid voice IDs", () => {
|
||||||
const invalid = [
|
const invalid = [
|
||||||
"", // empty
|
"", // empty
|
||||||
"abc123", // too short
|
"abc123", // too short (6)
|
||||||
"12345678901234567890123", // 23 chars - below minimum
|
"1234567890123456789", // 19 chars - below minimum
|
||||||
"a".repeat(41), // too long
|
"a".repeat(65), // too long (65)
|
||||||
"8a2d4227-9389-4719-9346-0b85340235c5", // UUID with dashes
|
"8a2d4227-9389-4719-9346-0b85340235c5", // UUID with dashes
|
||||||
"../../../etc/passwd", // path traversal
|
"../../../etc/passwd", // path traversal
|
||||||
"voice?param=value", // query string
|
"voice?param=value", // query string
|
||||||
"pMsXgVXv3BLzUgSXRplE", // ElevenLabs-style (mixed case, 20 chars)
|
"hello world 1234567890", // spaces
|
||||||
"ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ", // non-hex chars
|
"abcdef!@#$%^&*()12345678", // special chars
|
||||||
];
|
];
|
||||||
for (const v of invalid) {
|
for (const v of invalid) {
|
||||||
expect(isValidFishAudioVoiceId(v), `expected invalid: ${v}`).toBe(
|
expect(isValidFishAudioVoiceId(v), `expected invalid: ${v}`).toBe(
|
||||||
|
|||||||
@@ -11,7 +11,10 @@ import { fishAudioTTS, listFishAudioVoices } from "./tts.js";
|
|||||||
// ── Defaults ────────────────────────────────────────────────────────────────
|
// ── Defaults ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio";
|
const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio";
|
||||||
const DEFAULT_VOICE_ID = "8a2d42279389471993460b85340235c5"; // SJ voice
|
// No default voice — users must configure one. Fish Audio has no universal
|
||||||
|
// "default" voice like ElevenLabs does, and shipping a personal clone ID
|
||||||
|
// as default would be wrong for community users.
|
||||||
|
const DEFAULT_VOICE_ID = "";
|
||||||
const DEFAULT_MODEL = "s2-pro";
|
const DEFAULT_MODEL = "s2-pro";
|
||||||
const DEFAULT_LATENCY = "normal" as const;
|
const DEFAULT_LATENCY = "normal" as const;
|
||||||
|
|
||||||
@@ -67,9 +70,11 @@ function normalizeModel(value: unknown): string {
|
|||||||
return s || DEFAULT_MODEL;
|
return s || DEFAULT_MODEL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Fish Audio ref IDs are 32-char hex strings */
|
/** Fish Audio voice ref IDs — alphanumeric, 20-64 chars. Permissive enough
|
||||||
|
* to handle future ID format changes while still rejecting path traversal
|
||||||
|
* and injection attempts. */
|
||||||
export function isValidFishAudioVoiceId(voiceId: string): boolean {
|
export function isValidFishAudioVoiceId(voiceId: string): boolean {
|
||||||
return /^[a-f0-9]{24,40}$/i.test(voiceId);
|
return /^[a-zA-Z0-9]{20,64}$/.test(voiceId);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Config resolution ───────────────────────────────────────────────────────
|
// ── Config resolution ───────────────────────────────────────────────────────
|
||||||
@@ -270,11 +275,12 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {
|
|||||||
return raw as SpeechVoiceOption[];
|
return raw as SpeechVoiceOption[];
|
||||||
},
|
},
|
||||||
|
|
||||||
isConfigured: ({ providerConfig }) =>
|
isConfigured: ({ providerConfig }) => {
|
||||||
Boolean(
|
const config = readFishAudioProviderConfig(providerConfig);
|
||||||
readFishAudioProviderConfig(providerConfig).apiKey ||
|
const hasKey = Boolean(config.apiKey || process.env.FISH_AUDIO_API_KEY);
|
||||||
process.env.FISH_AUDIO_API_KEY,
|
const hasVoice = Boolean(config.voiceId);
|
||||||
),
|
return hasKey && hasVoice;
|
||||||
|
},
|
||||||
|
|
||||||
synthesize: async (req) => {
|
synthesize: async (req) => {
|
||||||
const config = readFishAudioProviderConfig(req.providerConfig);
|
const config = readFishAudioProviderConfig(req.providerConfig);
|
||||||
@@ -285,6 +291,13 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {
|
|||||||
throw new Error("Fish Audio API key missing");
|
throw new Error("Fish Audio API key missing");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const voiceId = trimToUndefined(overrides.voiceId) ?? config.voiceId;
|
||||||
|
if (!voiceId) {
|
||||||
|
throw new Error(
|
||||||
|
"Fish Audio: no voiceId configured. Set messages.tts.providers.fish-audio.voiceId",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
// Pick format based on target channel
|
// Pick format based on target channel
|
||||||
const useOpus = req.target === "voice-note";
|
const useOpus = req.target === "voice-note";
|
||||||
const format = useOpus ? "opus" : "mp3";
|
const format = useOpus ? "opus" : "mp3";
|
||||||
@@ -298,7 +311,7 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {
|
|||||||
text: req.text,
|
text: req.text,
|
||||||
apiKey,
|
apiKey,
|
||||||
baseUrl: config.baseUrl,
|
baseUrl: config.baseUrl,
|
||||||
referenceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
|
referenceId: voiceId,
|
||||||
model: trimToUndefined(overrides.model) ?? config.model,
|
model: trimToUndefined(overrides.model) ?? config.model,
|
||||||
format,
|
format,
|
||||||
latency:
|
latency:
|
||||||
|
|||||||
8
tts.ts
8
tts.ts
@@ -70,6 +70,9 @@ export async function fishAudioTTS(params: {
|
|||||||
body.top_p = topP;
|
body.top_p = topP;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fish Audio uses the `model` HTTP header (not a body field) to select
|
||||||
|
// the TTS model. This is intentional per their API spec — don't move it
|
||||||
|
// into the JSON body.
|
||||||
const response = await fetch(url, {
|
const response = await fetch(url, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: {
|
headers: {
|
||||||
@@ -85,7 +88,10 @@ export async function fishAudioTTS(params: {
|
|||||||
let errorDetail = "";
|
let errorDetail = "";
|
||||||
try {
|
try {
|
||||||
const errorBody = await response.text();
|
const errorBody = await response.text();
|
||||||
errorDetail = errorBody ? `: ${errorBody}` : "";
|
// Cap at 500 chars to avoid log pollution from large error responses
|
||||||
|
const truncated =
|
||||||
|
errorBody.length > 500 ? `${errorBody.slice(0, 500)}…` : errorBody;
|
||||||
|
errorDetail = truncated ? `: ${truncated}` : "";
|
||||||
} catch {
|
} catch {
|
||||||
// Ignore error body read failure
|
// Ignore error body read failure
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user