sync: match upstream PR #56891 at rebase 2026-03-30
Brings the Gitea mirror up to date with the current state of the openclaw/openclaw PR branch, including all fixes from the Codex review:
- Namespaced directive keys (fishaudio_*/fish_* prefixes only)
- Strict latency directive validation with warnings
- Code quality cleanup, s2 model removal
- Contract and directive parsing tests
- README updated with prefixed directive docs

Source: Conan-Scott/openclaw@9787ef6e (feat/fish-audio-speech-provider)
This commit is contained in:
@@ -6,11 +6,9 @@ import type {
|
||||
SpeechVoiceOption,
|
||||
} from "openclaw/plugin-sdk/speech-core";
|
||||
import { requireInRange } from "openclaw/plugin-sdk/speech-core";
|
||||
import { fishAudioTTS, listFishAudioVoices } from "./tts.js";
|
||||
import { DEFAULT_FISH_AUDIO_BASE_URL, fishAudioTTS, listFishAudioVoices, normalizeFishAudioBaseUrl } from "./tts.js";
|
||||
|
||||
// ── Defaults ────────────────────────────────────────────────────────────────
|
||||
|
||||
const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio";
|
||||
// No default voice — users must configure one. Fish Audio has no universal
|
||||
// "default" voice like ElevenLabs does, and shipping a personal clone ID
|
||||
// as default would be wrong for community users.
|
||||
@@ -18,7 +16,7 @@ const DEFAULT_VOICE_ID = "";
|
||||
const DEFAULT_MODEL = "s2-pro";
|
||||
const DEFAULT_LATENCY = "normal" as const;
|
||||
|
||||
const FISH_AUDIO_MODELS = ["s2-pro", "s1", "s2"] as const;
|
||||
const FISH_AUDIO_MODELS = ["s2-pro", "s1"] as const;
|
||||
|
||||
// ── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -54,11 +52,6 @@ function parseNumberValue(value: string): number | undefined {
|
||||
return Number.isFinite(parsed) ? parsed : undefined;
|
||||
}
|
||||
|
||||
function normalizeBaseUrl(baseUrl: string | undefined): string {
|
||||
const trimmed = baseUrl?.trim();
|
||||
return trimmed?.replace(/\/+$/, "") || DEFAULT_FISH_AUDIO_BASE_URL;
|
||||
}
|
||||
|
||||
function normalizeLatency(value: unknown): "normal" | "balanced" | "low" {
|
||||
const s = typeof value === "string" ? value.trim().toLowerCase() : "";
|
||||
if (s === "balanced" || s === "low") return s;
|
||||
@@ -90,7 +83,7 @@ function normalizeFishAudioProviderConfig(
|
||||
value: raw?.apiKey,
|
||||
path: "messages.tts.providers.fish-audio.apiKey",
|
||||
}),
|
||||
baseUrl: normalizeBaseUrl(trimToUndefined(raw?.baseUrl)),
|
||||
baseUrl: normalizeFishAudioBaseUrl(trimToUndefined(raw?.baseUrl)),
|
||||
voiceId: trimToUndefined(raw?.voiceId) ?? DEFAULT_VOICE_ID,
|
||||
model: normalizeModel(raw?.model),
|
||||
latency: normalizeLatency(raw?.latency),
|
||||
@@ -106,7 +99,7 @@ function readFishAudioProviderConfig(
|
||||
const defaults = normalizeFishAudioProviderConfig({});
|
||||
return {
|
||||
apiKey: trimToUndefined(config.apiKey) ?? defaults.apiKey,
|
||||
baseUrl: normalizeBaseUrl(
|
||||
baseUrl: normalizeFishAudioBaseUrl(
|
||||
trimToUndefined(config.baseUrl) ?? defaults.baseUrl,
|
||||
),
|
||||
voiceId: trimToUndefined(config.voiceId) ?? defaults.voiceId,
|
||||
@@ -121,14 +114,17 @@ function readFishAudioProviderConfig(
|
||||
// ── Directive parsing ───────────────────────────────────────────────────────
|
||||
|
||||
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
|
||||
// Only claim provider-prefixed keys to avoid dispatch collisions.
|
||||
// `parseTtsDirectives` stops at the first provider whose `parseDirectiveToken`
|
||||
// returns `handled: true`, and bundled providers with lower `autoSelectOrder`
|
||||
// (e.g. OpenAI at 10) are visited first. Generic keys like "voice" or "model"
|
||||
// would be swallowed by earlier providers and never reach us.
|
||||
// Convention matches ElevenLabs: `elevenlabs_voice`, `elevenlabs_model`, etc.
|
||||
try {
|
||||
switch (ctx.key) {
|
||||
case "voice":
|
||||
case "voiceid":
|
||||
case "voice_id":
|
||||
case "fishaudio_voice":
|
||||
case "fish_voice":
|
||||
case "fishvoice":
|
||||
case "reference_id":
|
||||
case "fishaudio_voiceid":
|
||||
if (!ctx.policy.allowVoice) {
|
||||
return { handled: true };
|
||||
}
|
||||
@@ -143,11 +139,8 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
|
||||
overrides: { ...(ctx.currentOverrides ?? {}), voiceId: ctx.value },
|
||||
};
|
||||
|
||||
case "model":
|
||||
case "modelid":
|
||||
case "model_id":
|
||||
case "fishaudio_model":
|
||||
case "fish_model":
|
||||
case "fishmodel":
|
||||
if (!ctx.policy.allowModelId) {
|
||||
return { handled: true };
|
||||
}
|
||||
@@ -156,7 +149,8 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
|
||||
overrides: { ...(ctx.currentOverrides ?? {}), model: ctx.value },
|
||||
};
|
||||
|
||||
case "speed": {
|
||||
case "fishaudio_speed":
|
||||
case "fish_speed": {
|
||||
if (!ctx.policy.allowVoiceSettings) {
|
||||
return { handled: true };
|
||||
}
|
||||
@@ -171,21 +165,26 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
|
||||
};
|
||||
}
|
||||
|
||||
case "latency":
|
||||
case "fish_latency":
|
||||
case "fishaudio_latency":
|
||||
case "fish_latency": {
|
||||
if (!ctx.policy.allowVoiceSettings) {
|
||||
return { handled: true };
|
||||
}
|
||||
{
|
||||
const lat = normalizeLatency(ctx.value);
|
||||
const raw = typeof ctx.value === "string" ? ctx.value.trim().toLowerCase() : "";
|
||||
if (raw !== "normal" && raw !== "balanced" && raw !== "low") {
|
||||
return {
|
||||
handled: true,
|
||||
overrides: { ...(ctx.currentOverrides ?? {}), latency: lat },
|
||||
warnings: [`invalid Fish Audio latency "${ctx.value}" (expected: normal, balanced, low)`],
|
||||
};
|
||||
}
|
||||
return {
|
||||
handled: true,
|
||||
overrides: { ...(ctx.currentOverrides ?? {}), latency: raw },
|
||||
};
|
||||
}
|
||||
|
||||
case "temperature":
|
||||
case "temp": {
|
||||
case "fishaudio_temperature":
|
||||
case "fish_temperature": {
|
||||
if (!ctx.policy.allowVoiceSettings) {
|
||||
return { handled: true };
|
||||
}
|
||||
@@ -200,8 +199,8 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
|
||||
};
|
||||
}
|
||||
|
||||
case "top_p":
|
||||
case "topp": {
|
||||
case "fishaudio_top_p":
|
||||
case "fish_top_p": {
|
||||
if (!ctx.policy.allowVoiceSettings) {
|
||||
return { handled: true };
|
||||
}
|
||||
@@ -233,6 +232,9 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {
|
||||
return {
|
||||
id: "fish-audio",
|
||||
label: "Fish Audio",
|
||||
// Lower = higher priority in auto-detect fallback. Positioned below OpenAI (10)
|
||||
// but above ElevenLabs (20) and Microsoft (30) since Fish Audio requires
|
||||
// explicit configuration (apiKey + voiceId) to pass isConfigured().
|
||||
autoSelectOrder: 15,
|
||||
models: FISH_AUDIO_MODELS,
|
||||
|
||||
@@ -241,17 +243,43 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {
|
||||
|
||||
parseDirectiveToken,
|
||||
|
||||
// Talk Mode — resolves talk-specific provider config and per-request overrides
|
||||
resolveTalkConfig: ({ baseTtsConfig }) =>
|
||||
normalizeFishAudioProviderConfig(baseTtsConfig),
|
||||
resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
|
||||
const base = normalizeFishAudioProviderConfig(baseTtsConfig);
|
||||
return {
|
||||
...base,
|
||||
...(talkProviderConfig.apiKey === undefined
|
||||
? {}
|
||||
: {
|
||||
apiKey: normalizeResolvedSecretInputString({
|
||||
value: talkProviderConfig.apiKey,
|
||||
path: "talk.providers.fish-audio.apiKey",
|
||||
}),
|
||||
}),
|
||||
...(trimToUndefined(talkProviderConfig.baseUrl) == null
|
||||
? {}
|
||||
: { baseUrl: normalizeFishAudioBaseUrl(trimToUndefined(talkProviderConfig.baseUrl)) }),
|
||||
...(trimToUndefined(talkProviderConfig.voiceId) == null
|
||||
? {}
|
||||
: { voiceId: trimToUndefined(talkProviderConfig.voiceId) }),
|
||||
...(trimToUndefined(talkProviderConfig.modelId) == null
|
||||
? {}
|
||||
: { model: normalizeModel(talkProviderConfig.modelId) }),
|
||||
...(talkProviderConfig.latency == null
|
||||
? {}
|
||||
: { latency: normalizeLatency(talkProviderConfig.latency) }),
|
||||
...(asNumber(talkProviderConfig.speed) == null
|
||||
? {}
|
||||
: { speed: asNumber(talkProviderConfig.speed) }),
|
||||
};
|
||||
},
|
||||
|
||||
resolveTalkOverrides: ({ params }) => ({
|
||||
...(trimToUndefined(params.voiceId) == null
|
||||
? {}
|
||||
: { voiceId: trimToUndefined(params.voiceId) }),
|
||||
...(trimToUndefined(params.model) == null
|
||||
...(trimToUndefined(params.modelId) == null
|
||||
? {}
|
||||
: { model: trimToUndefined(params.model) }),
|
||||
: { model: trimToUndefined(params.modelId) }),
|
||||
...(asNumber(params.speed) == null
|
||||
? {}
|
||||
: { speed: asNumber(params.speed) }),
|
||||
@@ -314,10 +342,9 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {
|
||||
referenceId: voiceId,
|
||||
model: trimToUndefined(overrides.model) ?? config.model,
|
||||
format,
|
||||
latency:
|
||||
normalizeLatency(overrides.latency) !== DEFAULT_LATENCY
|
||||
? normalizeLatency(overrides.latency)
|
||||
: config.latency,
|
||||
latency: overrides.latency != null
|
||||
? normalizeLatency(overrides.latency)
|
||||
: config.latency,
|
||||
speed,
|
||||
temperature: asNumber(overrides.temperature) ?? config.temperature,
|
||||
topP: asNumber(overrides.topP) ?? config.topP,
|
||||
|
||||
Reference in New Issue
Block a user