feat: scaffold Fish Audio speech provider plugin
- index.ts: plugin entry with definePluginEntry + registerSpeechProvider - speech-provider.ts: full SpeechProviderPlugin implementation - resolveConfig from messages.tts.providers.fish-audio - parseDirectiveToken for voice, model, speed, latency, temperature, top_p - listVoices merging official + user's own voices - synthesize with format-aware output (opus for voice-note, mp3 otherwise) - stub Talk Mode (resolveTalkConfig/resolveTalkOverrides) - tts.ts: raw fishAudioTTS() fetch + listFishAudioVoices() - streaming chunked → buffer, error body included in exceptions - parallel voice listing with graceful partial failure - speech-provider.test.ts: voice ID validation tests - openclaw.plugin.json: speechProviders contract - package.json: peer dep on openclaw >=2026.3.0
This commit is contained in:
322
speech-provider.ts
Normal file
322
speech-provider.ts
Normal file
@@ -0,0 +1,322 @@
|
||||
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
|
||||
import type {
|
||||
SpeechDirectiveTokenParseContext,
|
||||
SpeechProviderConfig,
|
||||
SpeechProviderPlugin,
|
||||
SpeechVoiceOption,
|
||||
} from "openclaw/plugin-sdk/speech-core";
|
||||
import { requireInRange } from "openclaw/plugin-sdk/speech-core";
|
||||
import { fishAudioTTS, listFishAudioVoices } from "./tts.js";
|
||||
|
||||
// ── Defaults ────────────────────────────────────────────────────────────────
|
||||
|
||||
const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio";
|
||||
const DEFAULT_VOICE_ID = "8a2d42279389471993460b85340235c5"; // SJ voice
|
||||
const DEFAULT_MODEL = "s2-pro";
|
||||
const DEFAULT_LATENCY = "normal" as const;
|
||||
|
||||
const FISH_AUDIO_MODELS = ["s2-pro", "s1", "s2"] as const;
|
||||
|
||||
// ── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
type FishAudioProviderConfig = {
|
||||
apiKey?: string;
|
||||
baseUrl: string;
|
||||
voiceId: string;
|
||||
model: string;
|
||||
latency: "normal" | "balanced" | "low";
|
||||
speed?: number;
|
||||
temperature?: number;
|
||||
topP?: number;
|
||||
};
|
||||
|
||||
// ── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
function trimToUndefined(value: unknown): string | undefined {
|
||||
return typeof value === "string" && value.trim() ? value.trim() : undefined;
|
||||
}
|
||||
|
||||
function asNumber(value: unknown): number | undefined {
|
||||
return typeof value === "number" && Number.isFinite(value) ? value : undefined;
|
||||
}
|
||||
|
||||
function asObject(value: unknown): Record<string, unknown> | undefined {
|
||||
return typeof value === "object" && value !== null && !Array.isArray(value)
|
||||
? (value as Record<string, unknown>)
|
||||
: undefined;
|
||||
}
|
||||
|
||||
function parseNumberValue(value: string): number | undefined {
|
||||
const parsed = Number.parseFloat(value);
|
||||
return Number.isFinite(parsed) ? parsed : undefined;
|
||||
}
|
||||
|
||||
function normalizeBaseUrl(baseUrl: string | undefined): string {
|
||||
const trimmed = baseUrl?.trim();
|
||||
return trimmed?.replace(/\/+$/, "") || DEFAULT_FISH_AUDIO_BASE_URL;
|
||||
}
|
||||
|
||||
function normalizeLatency(value: unknown): "normal" | "balanced" | "low" {
|
||||
const s = typeof value === "string" ? value.trim().toLowerCase() : "";
|
||||
if (s === "balanced" || s === "low") return s;
|
||||
return DEFAULT_LATENCY;
|
||||
}
|
||||
|
||||
function normalizeModel(value: unknown): string {
|
||||
const s = typeof value === "string" ? value.trim() : "";
|
||||
return s || DEFAULT_MODEL;
|
||||
}
|
||||
|
||||
/** Fish Audio ref IDs are 32-char hex strings */
|
||||
export function isValidFishAudioVoiceId(voiceId: string): boolean {
|
||||
return /^[a-f0-9]{24,40}$/i.test(voiceId);
|
||||
}
|
||||
|
||||
// ── Config resolution ───────────────────────────────────────────────────────
|
||||
|
||||
function normalizeFishAudioProviderConfig(
|
||||
rawConfig: Record<string, unknown>,
|
||||
): FishAudioProviderConfig {
|
||||
const providers = asObject(rawConfig.providers);
|
||||
const raw =
|
||||
asObject(providers?.["fish-audio"]) ?? asObject(rawConfig["fish-audio"]);
|
||||
return {
|
||||
apiKey: normalizeResolvedSecretInputString({
|
||||
value: raw?.apiKey,
|
||||
path: "messages.tts.providers.fish-audio.apiKey",
|
||||
}),
|
||||
baseUrl: normalizeBaseUrl(trimToUndefined(raw?.baseUrl)),
|
||||
voiceId: trimToUndefined(raw?.voiceId) ?? DEFAULT_VOICE_ID,
|
||||
model: normalizeModel(raw?.model),
|
||||
latency: normalizeLatency(raw?.latency),
|
||||
speed: asNumber(raw?.speed),
|
||||
temperature: asNumber(raw?.temperature),
|
||||
topP: asNumber(raw?.topP),
|
||||
};
|
||||
}
|
||||
|
||||
function readFishAudioProviderConfig(
|
||||
config: SpeechProviderConfig,
|
||||
): FishAudioProviderConfig {
|
||||
const defaults = normalizeFishAudioProviderConfig({});
|
||||
return {
|
||||
apiKey: trimToUndefined(config.apiKey) ?? defaults.apiKey,
|
||||
baseUrl: normalizeBaseUrl(
|
||||
trimToUndefined(config.baseUrl) ?? defaults.baseUrl,
|
||||
),
|
||||
voiceId: trimToUndefined(config.voiceId) ?? defaults.voiceId,
|
||||
model: normalizeModel(config.model) || defaults.model,
|
||||
latency: normalizeLatency(config.latency),
|
||||
speed: asNumber(config.speed) ?? defaults.speed,
|
||||
temperature: asNumber(config.temperature) ?? defaults.temperature,
|
||||
topP: asNumber(config.topP) ?? defaults.topP,
|
||||
};
|
||||
}
|
||||
|
||||
// ── Directive parsing ───────────────────────────────────────────────────────
|
||||
|
||||
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
|
||||
try {
|
||||
switch (ctx.key) {
|
||||
case "voice":
|
||||
case "voiceid":
|
||||
case "voice_id":
|
||||
case "fish_voice":
|
||||
case "fishvoice":
|
||||
case "reference_id":
|
||||
if (!ctx.policy.allowVoice) {
|
||||
return { handled: true };
|
||||
}
|
||||
if (!isValidFishAudioVoiceId(ctx.value)) {
|
||||
return {
|
||||
handled: true,
|
||||
warnings: [`invalid Fish Audio voice ID "${ctx.value}"`],
|
||||
};
|
||||
}
|
||||
return {
|
||||
handled: true,
|
||||
overrides: { ...(ctx.currentOverrides ?? {}), voiceId: ctx.value },
|
||||
};
|
||||
|
||||
case "model":
|
||||
case "modelid":
|
||||
case "model_id":
|
||||
case "fish_model":
|
||||
case "fishmodel":
|
||||
if (!ctx.policy.allowModelId) {
|
||||
return { handled: true };
|
||||
}
|
||||
return {
|
||||
handled: true,
|
||||
overrides: { ...(ctx.currentOverrides ?? {}), model: ctx.value },
|
||||
};
|
||||
|
||||
case "speed": {
|
||||
if (!ctx.policy.allowVoiceSettings) {
|
||||
return { handled: true };
|
||||
}
|
||||
const value = parseNumberValue(ctx.value);
|
||||
if (value == null) {
|
||||
return { handled: true, warnings: ["invalid speed value"] };
|
||||
}
|
||||
requireInRange(value, 0.5, 2.0, "speed");
|
||||
return {
|
||||
handled: true,
|
||||
overrides: { ...(ctx.currentOverrides ?? {}), speed: value },
|
||||
};
|
||||
}
|
||||
|
||||
case "latency":
|
||||
case "fish_latency":
|
||||
if (!ctx.policy.allowVoiceSettings) {
|
||||
return { handled: true };
|
||||
}
|
||||
{
|
||||
const lat = normalizeLatency(ctx.value);
|
||||
return {
|
||||
handled: true,
|
||||
overrides: { ...(ctx.currentOverrides ?? {}), latency: lat },
|
||||
};
|
||||
}
|
||||
|
||||
case "temperature":
|
||||
case "temp": {
|
||||
if (!ctx.policy.allowVoiceSettings) {
|
||||
return { handled: true };
|
||||
}
|
||||
const value = parseNumberValue(ctx.value);
|
||||
if (value == null) {
|
||||
return { handled: true, warnings: ["invalid temperature value"] };
|
||||
}
|
||||
requireInRange(value, 0, 1, "temperature");
|
||||
return {
|
||||
handled: true,
|
||||
overrides: { ...(ctx.currentOverrides ?? {}), temperature: value },
|
||||
};
|
||||
}
|
||||
|
||||
case "top_p":
|
||||
case "topp": {
|
||||
if (!ctx.policy.allowVoiceSettings) {
|
||||
return { handled: true };
|
||||
}
|
||||
const value = parseNumberValue(ctx.value);
|
||||
if (value == null) {
|
||||
return { handled: true, warnings: ["invalid top_p value"] };
|
||||
}
|
||||
requireInRange(value, 0, 1, "top_p");
|
||||
return {
|
||||
handled: true,
|
||||
overrides: { ...(ctx.currentOverrides ?? {}), topP: value },
|
||||
};
|
||||
}
|
||||
|
||||
default:
|
||||
return { handled: false };
|
||||
}
|
||||
} catch (error) {
|
||||
return {
|
||||
handled: true,
|
||||
warnings: [error instanceof Error ? error.message : String(error)],
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// ── Provider ────────────────────────────────────────────────────────────────
|
||||
|
||||
export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {
|
||||
return {
|
||||
id: "fish-audio",
|
||||
label: "Fish Audio",
|
||||
autoSelectOrder: 30,
|
||||
models: FISH_AUDIO_MODELS,
|
||||
|
||||
resolveConfig: ({ rawConfig }) =>
|
||||
normalizeFishAudioProviderConfig(rawConfig),
|
||||
|
||||
parseDirectiveToken,
|
||||
|
||||
// Talk Mode — v2, stub for now
|
||||
resolveTalkConfig: ({ baseTtsConfig }) =>
|
||||
normalizeFishAudioProviderConfig(baseTtsConfig),
|
||||
|
||||
resolveTalkOverrides: ({ params }) => ({
|
||||
...(trimToUndefined(params.voiceId) == null
|
||||
? {}
|
||||
: { voiceId: trimToUndefined(params.voiceId) }),
|
||||
...(trimToUndefined(params.model) == null
|
||||
? {}
|
||||
: { model: trimToUndefined(params.model) }),
|
||||
...(asNumber(params.speed) == null
|
||||
? {}
|
||||
: { speed: asNumber(params.speed) }),
|
||||
}),
|
||||
|
||||
listVoices: async (req) => {
|
||||
const config = req.providerConfig
|
||||
? readFishAudioProviderConfig(req.providerConfig)
|
||||
: undefined;
|
||||
const apiKey =
|
||||
req.apiKey ||
|
||||
config?.apiKey ||
|
||||
process.env.FISH_AUDIO_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("Fish Audio API key missing");
|
||||
}
|
||||
const raw = await listFishAudioVoices({
|
||||
apiKey,
|
||||
baseUrl: req.baseUrl ?? config?.baseUrl,
|
||||
});
|
||||
return raw as SpeechVoiceOption[];
|
||||
},
|
||||
|
||||
isConfigured: ({ providerConfig }) =>
|
||||
Boolean(
|
||||
readFishAudioProviderConfig(providerConfig).apiKey ||
|
||||
process.env.FISH_AUDIO_API_KEY,
|
||||
),
|
||||
|
||||
synthesize: async (req) => {
|
||||
const config = readFishAudioProviderConfig(req.providerConfig);
|
||||
const overrides = req.providerOverrides ?? {};
|
||||
const apiKey =
|
||||
config.apiKey || process.env.FISH_AUDIO_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("Fish Audio API key missing");
|
||||
}
|
||||
|
||||
// Pick format based on target channel
|
||||
const useOpus = req.target === "voice-note";
|
||||
const format = useOpus ? "opus" : "mp3";
|
||||
|
||||
const speed = asNumber(overrides.speed) ?? config.speed;
|
||||
if (speed != null) {
|
||||
requireInRange(speed, 0.5, 2.0, "speed");
|
||||
}
|
||||
|
||||
const audioBuffer = await fishAudioTTS({
|
||||
text: req.text,
|
||||
apiKey,
|
||||
baseUrl: config.baseUrl,
|
||||
referenceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
|
||||
model: trimToUndefined(overrides.model) ?? config.model,
|
||||
format,
|
||||
latency:
|
||||
normalizeLatency(overrides.latency) !== DEFAULT_LATENCY
|
||||
? normalizeLatency(overrides.latency)
|
||||
: config.latency,
|
||||
speed,
|
||||
temperature: asNumber(overrides.temperature) ?? config.temperature,
|
||||
topP: asNumber(overrides.topP) ?? config.topP,
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
|
||||
return {
|
||||
audioBuffer,
|
||||
outputFormat: format,
|
||||
fileExtension: useOpus ? ".opus" : ".mp3",
|
||||
voiceCompatible: true, // Fish Audio output works as voice note in both formats
|
||||
};
|
||||
},
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user