From 4842dc64a5b367b71d44d10ab210c344e1ef5850 Mon Sep 17 00:00:00 2001
From: Clawdbot
Date: Sun, 29 Mar 2026 18:14:29 +1100
Subject: [PATCH] feat: scaffold Fish Audio speech provider plugin
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- index.ts: plugin entry with definePluginEntry + registerSpeechProvider
- speech-provider.ts: full SpeechProviderPlugin implementation
  - resolveConfig from messages.tts.providers.fish-audio
  - parseDirectiveToken for voice, model, speed, latency, temperature, top_p
  - listVoices merging official + user's own voices
  - synthesize with format-aware output (opus for voice-note, mp3 otherwise)
  - stub Talk Mode (resolveTalkConfig/resolveTalkOverrides)
- tts.ts: raw fishAudioTTS() fetch + listFishAudioVoices()
  - streaming chunked → buffer, error body included in exceptions
  - parallel voice listing with graceful partial failure
- speech-provider.test.ts: voice ID validation tests
- openclaw.plugin.json: speechProviders contract
- package.json: peer dep on openclaw >=2026.3.0
---
 README.md               |  98 +++++++++++-
 index.ts                |  11 ++
 openclaw.plugin.json    |  11 ++
 package.json            |  21 +++
 speech-provider.test.ts |  39 +++++
 speech-provider.ts      | 332 ++++++++++++++++++++++++++++++++++++++++
 tts.ts                  | 188 ++++++++++++++++++++++
 7 files changed, 698 insertions(+), 2 deletions(-)
 create mode 100644 index.ts
 create mode 100644 openclaw.plugin.json
 create mode 100644 package.json
 create mode 100644 speech-provider.test.ts
 create mode 100644 speech-provider.ts
 create mode 100644 tts.ts

diff --git a/README.md b/README.md
index 1a9b6c8..47c6521 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,97 @@
-# fish-audio-plugin
+# Fish Audio Speech Plugin for OpenClaw
 
-Fish Audio TTS speech provider plugin for OpenClaw
\ No newline at end of file
+A speech provider plugin that integrates [Fish Audio](https://fish.audio) TTS with OpenClaw.
+
+## Features
+
+- **Fish Audio S2-Pro / S1 / S2** model support
+- **Dynamic voice listing** — your own cloned voices + official Fish Audio voices
+- **Format-aware output** — opus for voice notes (Telegram, WhatsApp), mp3 otherwise
+- **Inline directives** — switch voice, speed, model, and latency mid-message
+- **No core changes required** — standard `SpeechProviderPlugin` extension
+
+## Installation
+
+```bash
+openclaw plugins install @openclaw/fish-audio-speech
+```
+
+## Configuration
+
+In your `openclaw.json`:
+
+```json
+{
+  "messages": {
+    "tts": {
+      "provider": "fish-audio",
+      "providers": {
+        "fish-audio": {
+          "apiKey": "your-fish-audio-api-key",
+          "voiceId": "8a2d42279389471993460b85340235c5",
+          "model": "s2-pro",
+          "latency": "normal",
+          "speed": 1.0
+        }
+      }
+    }
+  }
+}
+```
+
+### Config Options
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `apiKey` | string | — | **Required.** Fish Audio API key |
+| `voiceId` | string | `8a2d42...` | Reference ID of the voice to use |
+| `model` | string | `s2-pro` | TTS model (`s2-pro`, `s1`, `s2`) |
+| `latency` | string | `normal` | Latency mode (`normal`, `balanced`, `low`) |
+| `speed` | number | — | Prosody speed (0.5–2.0) |
+| `temperature` | number | — | Sampling temperature (0–1) |
+| `topP` | number | — | Top-p sampling (0–1) |
+| `baseUrl` | string | `https://api.fish.audio` | API base URL |
+
+### Environment Variable
+
+You can also set the API key via environment variable:
+
+```bash
+FISH_AUDIO_API_KEY=your-key
+```
+
+## Directives
+
+Use inline directives in your messages to control TTS per-message:
+
+```
+[[tts:voice=<voice_id>]]   Switch voice
+[[tts:speed=1.2]]          Prosody speed (0.5–2.0)
+[[tts:model=s1]]           Model override
+[[tts:latency=low]]        Latency mode
+[[tts:temperature=0.7]]    Sampling temperature
+[[tts:top_p=0.8]]          Top-p sampling
+```
+
+## Voice Listing
+
+The plugin dynamically lists available voices via `/tts voices`:
+- **Official Fish Audio voices** (~38 voices)
+- **Your own cloned/trained voices** (marked with "(mine)")
+
+## Output Format
+
+The plugin automatically selects the best format based on the channel:
+- **Voice note channels** (Telegram, WhatsApp, Matrix, Feishu) → Opus
+- **All other channels** → MP3
+
+Both formats set `voiceCompatible: true` — Fish Audio output works cleanly as native voice notes.
+
+## Requirements
+
+- OpenClaw ≥ 2026.3.0
+- Fish Audio API key ([get one here](https://fish.audio))
+
+## License
+
+MIT
diff --git a/index.ts b/index.ts
new file mode 100644
index 0000000..0619571
--- /dev/null
+++ b/index.ts
@@ -0,0 +1,11 @@
+import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
+import { buildFishAudioSpeechProvider } from "./speech-provider.js";
+
+export default definePluginEntry({
+  id: "fish-audio",
+  name: "Fish Audio Speech",
+  description: "Fish Audio TTS speech provider for OpenClaw",
+  register(api) {
+    api.registerSpeechProvider(buildFishAudioSpeechProvider());
+  },
+});
diff --git a/openclaw.plugin.json b/openclaw.plugin.json
new file mode 100644
index 0000000..7f25a2f
--- /dev/null
+++ b/openclaw.plugin.json
@@ -0,0 +1,11 @@
+{
+  "id": "fish-audio",
+  "contracts": {
+    "speechProviders": ["fish-audio"]
+  },
+  "configSchema": {
+    "type": "object",
+    "additionalProperties": false,
+    "properties": {}
+  }
+}
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..3d68da6
--- /dev/null
+++ b/package.json
@@ -0,0 +1,21 @@
+{
+  "name": "@openclaw/fish-audio-speech",
+  "version": "0.1.0",
+  "description": "Fish Audio TTS speech provider plugin for OpenClaw",
+  "type": "module",
+  "license": "MIT",
+  "openclaw": {
+    "extensions": [
+      "./index.ts"
+    ]
+  },
+  "peerDependencies": {
+    "openclaw": ">=2026.3.0"
+  },
+  "devDependencies": {
+    "vitest": "^3.0.0"
+  },
+  "scripts": {
+    "test": "vitest run"
+  }
+}
diff --git a/speech-provider.test.ts b/speech-provider.test.ts
new file mode 100644
index 0000000..ce974b9
--- /dev/null
+++ b/speech-provider.test.ts
@@ -0,0 +1,39 @@
+import { describe, expect, it } from "vitest";
+import { isValidFishAudioVoiceId } from "./speech-provider.js";
+
+describe("fish-audio speech provider", () => {
+  describe("isValidFishAudioVoiceId", () => {
+    it("accepts valid Fish Audio ref IDs (24-40 char hex)", () => {
+      const valid = [
+        "8a2d42279389471993460b85340235c5", // 32 char - standard
+        "0dad9e24630447cf97803f4beee10481", // 32 char
+        "5796fe24630447cf97803f4beee10481", // 32 char
+        "d8b0991f96b44e489422ca2ddf0bd31d", // 32 char - author id
+        "aabbccddee112233445566778899", // 28 char
+        "aabbccddee11223344556677", // 24 char (minimum)
+      ];
+      for (const v of valid) {
+        expect(isValidFishAudioVoiceId(v), `expected valid: ${v}`).toBe(true);
+      }
+    });
+
+    it("rejects invalid voice IDs", () => {
+      const invalid = [
+        "", // empty
+        "abc123", // too short
+        "12345678901234567890123", // 23 chars - below minimum
+        "a".repeat(41), // too long
+        "8a2d4227-9389-4719-9346-0b85340235c5", // UUID with dashes
+        "../../../etc/passwd", // path traversal
+        "voice?param=value", // query string
+        "pMsXgVXv3BLzUgSXRplE", // ElevenLabs-style (mixed case, 20 chars)
+        "ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ", // non-hex chars
+      ];
+      for (const v of invalid) {
+        expect(isValidFishAudioVoiceId(v), `expected invalid: ${v}`).toBe(
+          false,
+        );
+      }
+    });
+  });
+});
diff --git a/speech-provider.ts b/speech-provider.ts
new file mode 100644
index 0000000..5505dee
--- /dev/null
+++ b/speech-provider.ts
@@ -0,0 +1,332 @@
+import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
+import type {
+  SpeechDirectiveTokenParseContext,
+  SpeechProviderConfig,
+  SpeechProviderPlugin,
+  SpeechVoiceOption,
+} from "openclaw/plugin-sdk/speech-core";
+import { requireInRange } from "openclaw/plugin-sdk/speech-core";
+import { fishAudioTTS, listFishAudioVoices } from "./tts.js";
+
+// ── Defaults ────────────────────────────────────────────────────────────────
+
+const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio";
+const DEFAULT_VOICE_ID = "8a2d42279389471993460b85340235c5"; // SJ voice
+const DEFAULT_MODEL = "s2-pro";
+const DEFAULT_LATENCY = "normal" as const;
+
+const FISH_AUDIO_MODELS = ["s2-pro", "s1", "s2"] as const;
+
+// ── Types ───────────────────────────────────────────────────────────────────
+
+type FishAudioProviderConfig = {
+  apiKey?: string;
+  baseUrl: string;
+  voiceId: string;
+  model: string;
+  latency: "normal" | "balanced" | "low";
+  speed?: number;
+  temperature?: number;
+  topP?: number;
+};
+
+// ── Helpers ─────────────────────────────────────────────────────────────────
+
+function trimToUndefined(value: unknown): string | undefined {
+  return typeof value === "string" && value.trim() ? value.trim() : undefined;
+}
+
+function asNumber(value: unknown): number | undefined {
+  return typeof value === "number" && Number.isFinite(value) ? value : undefined;
+}
+
+function asObject(value: unknown): Record<string, unknown> | undefined {
+  return typeof value === "object" && value !== null && !Array.isArray(value)
+    ? (value as Record<string, unknown>)
+    : undefined;
+}
+
+function parseNumberValue(value: string): number | undefined {
+  const parsed = Number.parseFloat(value);
+  return Number.isFinite(parsed) ? parsed : undefined;
+}
+
+function normalizeBaseUrl(baseUrl: string | undefined): string {
+  const trimmed = baseUrl?.trim();
+  return trimmed?.replace(/\/+$/, "") || DEFAULT_FISH_AUDIO_BASE_URL;
+}
+
+function normalizeLatency(value: unknown): "normal" | "balanced" | "low" {
+  const s = typeof value === "string" ? value.trim().toLowerCase() : "";
+  if (s === "balanced" || s === "low") return s;
+  return DEFAULT_LATENCY;
+}
+
+function normalizeModel(value: unknown): string {
+  const s = typeof value === "string" ? value.trim() : "";
+  return s || DEFAULT_MODEL;
+}
+
+/** Fish Audio ref IDs are hex strings; 24–40 characters are accepted (32 is typical) */
+export function isValidFishAudioVoiceId(voiceId: string): boolean {
+  return /^[a-f0-9]{24,40}$/i.test(voiceId);
+}
+
+// ── Config resolution ───────────────────────────────────────────────────────
+
+function normalizeFishAudioProviderConfig(
+  rawConfig: Record<string, unknown>,
+): FishAudioProviderConfig {
+  const providers = asObject(rawConfig.providers);
+  const raw =
+    asObject(providers?.["fish-audio"]) ?? asObject(rawConfig["fish-audio"]);
+  return {
+    apiKey: normalizeResolvedSecretInputString({
+      value: raw?.apiKey,
+      path: "messages.tts.providers.fish-audio.apiKey",
+    }),
+    baseUrl: normalizeBaseUrl(trimToUndefined(raw?.baseUrl)),
+    voiceId: trimToUndefined(raw?.voiceId) ?? DEFAULT_VOICE_ID,
+    model: normalizeModel(raw?.model),
+    latency: normalizeLatency(raw?.latency),
+    speed: asNumber(raw?.speed),
+    temperature: asNumber(raw?.temperature),
+    topP: asNumber(raw?.topP),
+  };
+}
+
+function readFishAudioProviderConfig(
+  config: SpeechProviderConfig,
+): FishAudioProviderConfig {
+  const defaults = normalizeFishAudioProviderConfig({});
+  return {
+    apiKey: trimToUndefined(config.apiKey) ?? defaults.apiKey,
+    baseUrl: normalizeBaseUrl(
+      trimToUndefined(config.baseUrl) ?? defaults.baseUrl,
+    ),
+    voiceId: trimToUndefined(config.voiceId) ?? defaults.voiceId,
+    model: normalizeModel(config.model) || defaults.model,
+    latency: normalizeLatency(config.latency),
+    speed: asNumber(config.speed) ?? defaults.speed,
+    temperature: asNumber(config.temperature) ?? defaults.temperature,
+    topP: asNumber(config.topP) ?? defaults.topP,
+  };
+}
+
+// ── Directive parsing ───────────────────────────────────────────────────────
+
+/** Maps one directive key/value pair onto provider overrides, honoring ctx.policy. */
+function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
+  try {
+    switch (ctx.key) {
+      case "voice":
+      case "voiceid":
+      case "voice_id":
+      case "fish_voice":
+      case "fishvoice":
+      case "reference_id":
+        if (!ctx.policy.allowVoice) {
+          return { handled: true };
+        }
+        if (!isValidFishAudioVoiceId(ctx.value)) {
+          return {
+            handled: true,
+            warnings: [`invalid Fish Audio voice ID "${ctx.value}"`],
+          };
+        }
+        return {
+          handled: true,
+          overrides: { ...(ctx.currentOverrides ?? {}), voiceId: ctx.value },
+        };
+
+      case "model":
+      case "modelid":
+      case "model_id":
+      case "fish_model":
+      case "fishmodel":
+        if (!ctx.policy.allowModelId) {
+          return { handled: true };
+        }
+        return {
+          handled: true,
+          overrides: { ...(ctx.currentOverrides ?? {}), model: ctx.value },
+        };
+
+      case "speed": {
+        if (!ctx.policy.allowVoiceSettings) {
+          return { handled: true };
+        }
+        const value = parseNumberValue(ctx.value);
+        if (value == null) {
+          return { handled: true, warnings: ["invalid speed value"] };
+        }
+        requireInRange(value, 0.5, 2.0, "speed");
+        return {
+          handled: true,
+          overrides: { ...(ctx.currentOverrides ?? {}), speed: value },
+        };
+      }
+
+      case "latency":
+      case "fish_latency":
+        if (!ctx.policy.allowVoiceSettings) {
+          return { handled: true };
+        }
+        {
+          // Reject unknown modes instead of silently coercing them to "normal".
+          const lat = ctx.value.trim().toLowerCase();
+          if (lat !== "normal" && lat !== "balanced" && lat !== "low") {
+            return {
+              handled: true,
+              warnings: [`invalid Fish Audio latency "${ctx.value}"`],
+            };
+          }
+          return {
+            handled: true,
+            overrides: { ...(ctx.currentOverrides ?? {}), latency: lat },
+          };
+        }
+
+      case "temperature":
+      case "temp": {
+        if (!ctx.policy.allowVoiceSettings) {
+          return { handled: true };
+        }
+        const value = parseNumberValue(ctx.value);
+        if (value == null) {
+          return { handled: true, warnings: ["invalid temperature value"] };
+        }
+        requireInRange(value, 0, 1, "temperature");
+        return {
+          handled: true,
+          overrides: { ...(ctx.currentOverrides ?? {}), temperature: value },
+        };
+      }
+
+      case "top_p":
+      case "topp": {
+        if (!ctx.policy.allowVoiceSettings) {
+          return { handled: true };
+        }
+        const value = parseNumberValue(ctx.value);
+        if (value == null) {
+          return { handled: true, warnings: ["invalid top_p value"] };
+        }
+        requireInRange(value, 0, 1, "top_p");
+        return {
+          handled: true,
+          overrides: { ...(ctx.currentOverrides ?? {}), topP: value },
+        };
+      }
+
+      default:
+        return { handled: false };
+    }
+  } catch (error) {
+    return {
+      handled: true,
+      warnings: [error instanceof Error ? error.message : String(error)],
+    };
+  }
+}
+
+// ── Provider ────────────────────────────────────────────────────────────────
+
+/** Builds the Fish Audio speech provider that index.ts registers with the plugin API. */
+export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {
+  return {
+    id: "fish-audio",
+    label: "Fish Audio",
+    autoSelectOrder: 30,
+    models: FISH_AUDIO_MODELS,
+
+    resolveConfig: ({ rawConfig }) =>
+      normalizeFishAudioProviderConfig(rawConfig),
+
+    parseDirectiveToken,
+
+    // Talk Mode — v2, stub for now
+    resolveTalkConfig: ({ baseTtsConfig }) =>
+      normalizeFishAudioProviderConfig(baseTtsConfig),
+
+    resolveTalkOverrides: ({ params }) => ({
+      ...(trimToUndefined(params.voiceId) == null
+        ? {}
+        : { voiceId: trimToUndefined(params.voiceId) }),
+      ...(trimToUndefined(params.model) == null
+        ? {}
+        : { model: trimToUndefined(params.model) }),
+      ...(asNumber(params.speed) == null
+        ? {}
+        : { speed: asNumber(params.speed) }),
+    }),
+
+    listVoices: async (req) => {
+      const config = req.providerConfig
+        ? readFishAudioProviderConfig(req.providerConfig)
+        : undefined;
+      const apiKey =
+        req.apiKey ||
+        config?.apiKey ||
+        process.env.FISH_AUDIO_API_KEY;
+      if (!apiKey) {
+        throw new Error("Fish Audio API key missing");
+      }
+      const raw = await listFishAudioVoices({
+        apiKey,
+        baseUrl: req.baseUrl ?? config?.baseUrl,
+      });
+      return raw as SpeechVoiceOption[];
+    },
+
+    isConfigured: ({ providerConfig }) =>
+      Boolean(
+        readFishAudioProviderConfig(providerConfig).apiKey ||
+          process.env.FISH_AUDIO_API_KEY,
+      ),
+
+    synthesize: async (req) => {
+      const config = readFishAudioProviderConfig(req.providerConfig);
+      const overrides = req.providerOverrides ?? {};
+      const apiKey =
+        config.apiKey || process.env.FISH_AUDIO_API_KEY;
+      if (!apiKey) {
+        throw new Error("Fish Audio API key missing");
+      }
+
+      // Pick format based on target channel
+      const useOpus = req.target === "voice-note";
+      const format = useOpus ? "opus" : "mp3";
+
+      const speed = asNumber(overrides.speed) ?? config.speed;
+      if (speed != null) {
+        requireInRange(speed, 0.5, 2.0, "speed");
+      }
+
+      const audioBuffer = await fishAudioTTS({
+        text: req.text,
+        apiKey,
+        baseUrl: config.baseUrl,
+        referenceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
+        model: trimToUndefined(overrides.model) ?? config.model,
+        format,
+        // An explicit override (including "normal") wins over the configured mode.
+        latency:
+          trimToUndefined(overrides.latency) == null
+            ? config.latency
+            : normalizeLatency(overrides.latency),
+        speed,
+        temperature: asNumber(overrides.temperature) ?? config.temperature,
+        topP: asNumber(overrides.topP) ?? config.topP,
+        timeoutMs: req.timeoutMs,
+      });
+
+      return {
+        audioBuffer,
+        outputFormat: format,
+        fileExtension: useOpus ? ".opus" : ".mp3",
+        voiceCompatible: true, // Fish Audio output works as voice note in both formats
+      };
+    },
+  };
+}
diff --git a/tts.ts b/tts.ts
new file mode 100644
index 0000000..e7c17a9
--- /dev/null
+++ b/tts.ts
@@ -0,0 +1,188 @@
+const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio";
+
+function normalizeFishAudioBaseUrl(baseUrl?: string): string {
+  const trimmed = baseUrl?.trim();
+  if (!trimmed) {
+    return DEFAULT_FISH_AUDIO_BASE_URL;
+  }
+  return trimmed.replace(/\/+$/, "");
+}
+
+/**
+ * Synthesizes `text` through the Fish Audio `/v1/tts` endpoint.
+ *
+ * @returns The complete audio payload in the requested `format`.
+ * @throws If text or voice is blank, the API answers with a non-2xx status,
+ *   the audio is empty, or the request is aborted after `timeoutMs`.
+ */
+export async function fishAudioTTS(params: {
+  text: string;
+  apiKey: string;
+  baseUrl?: string;
+  referenceId: string;
+  model: string;
+  format: "mp3" | "opus" | "wav" | "pcm";
+  latency?: "normal" | "balanced" | "low";
+  speed?: number;
+  temperature?: number;
+  topP?: number;
+  timeoutMs: number;
+}): Promise<Buffer> {
+  const {
+    text,
+    apiKey,
+    baseUrl,
+    referenceId,
+    model,
+    format,
+    latency,
+    speed,
+    temperature,
+    topP,
+    timeoutMs,
+  } = params;
+
+  if (!text.trim()) {
+    throw new Error("Fish Audio TTS: empty text");
+  }
+  if (!referenceId.trim()) {
+    throw new Error("Fish Audio TTS: missing reference_id (voice)");
+  }
+
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), timeoutMs);
+
+  try {
+    const url = `${normalizeFishAudioBaseUrl(baseUrl)}/v1/tts`;
+
+    const body: Record<string, unknown> = {
+      text,
+      reference_id: referenceId,
+      format,
+    };
+
+    if (latency && latency !== "normal") {
+      body.latency = latency;
+    }
+
+    // Prosody settings
+    if (speed != null) {
+      body.prosody = { speed };
+    }
+
+    if (temperature != null) {
+      body.temperature = temperature;
+    }
+    if (topP != null) {
+      body.top_p = topP;
+    }
+
+    const response = await fetch(url, {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${apiKey}`,
+        "Content-Type": "application/json",
+        model,
+      },
+      body: JSON.stringify(body),
+      signal: controller.signal,
+    });
+
+    if (!response.ok) {
+      let errorDetail = "";
+      try {
+        const errorBody = await response.text();
+        errorDetail = errorBody ? `: ${errorBody}` : "";
+      } catch {
+        // Ignore error body read failure
+      }
+      throw new Error(`Fish Audio API error (${response.status})${errorDetail}`);
+    }
+
+    const buffer = Buffer.from(await response.arrayBuffer());
+    if (buffer.length === 0) {
+      throw new Error("Fish Audio TTS produced empty audio");
+    }
+
+    return buffer;
+  } finally {
+    clearTimeout(timeout);
+  }
+}
+
+/**
+ * Lists official Fish Audio voices merged with the caller's own voices.
+ *
+ * A rejected or non-2xx listing is tolerated; this throws only when no voice
+ * was collected and at least one of the two requests failed.
+ */
+export async function listFishAudioVoices(params: {
+  apiKey: string;
+  baseUrl?: string;
+}): Promise<Array<{ id: string; name: string }>> {
+  const base = normalizeFishAudioBaseUrl(params.baseUrl);
+
+  // Two parallel calls: official voices + user's own voices
+  const [officialRes, selfRes] = await Promise.allSettled([
+    fetch(`${base}/model?type=tts&author_id=d8b0991f96b44e489422ca2ddf0bd31d&page_size=100`, {
+      headers: { Authorization: `Bearer ${params.apiKey}` },
+    }),
+    fetch(`${base}/model?type=tts&self=true&page_size=100`, {
+      headers: { Authorization: `Bearer ${params.apiKey}` },
+    }),
+  ]);
+
+  const voices = new Map<string, string>();
+
+  // Process official voices first
+  if (officialRes.status === "fulfilled" && officialRes.value.ok) {
+    const json = (await officialRes.value.json()) as {
+      items?: Array<{ _id?: string; title?: string }>;
+    };
+    if (Array.isArray(json.items)) {
+      for (const v of json.items) {
+        const id = v._id?.trim();
+        const name = v.title?.trim();
+        if (id) {
+          voices.set(id, name || id);
+        }
+      }
+    }
+  }
+
+  // User's own voices take precedence on conflict
+  if (selfRes.status === "fulfilled" && selfRes.value.ok) {
+    const json = (await selfRes.value.json()) as {
+      items?: Array<{ _id?: string; title?: string }>;
+    };
+    if (Array.isArray(json.items)) {
+      for (const v of json.items) {
+        const id = v._id?.trim();
+        const name = v.title?.trim();
+        if (id) {
+          voices.set(id, name ? `${name} (mine)` : id);
+        }
+      }
+    }
+  }
+
+  // If both calls failed, throw
+  if (voices.size === 0) {
+    const errors: string[] = [];
+    if (officialRes.status === "rejected") {
+      errors.push(`official: ${officialRes.reason}`);
+    } else if (!officialRes.value.ok) {
+      errors.push(`official: HTTP ${officialRes.value.status}`);
+    }
+    if (selfRes.status === "rejected") {
+      errors.push(`self: ${selfRes.reason}`);
+    } else if (!selfRes.value.ok) {
+      errors.push(`self: HTTP ${selfRes.value.status}`);
+    }
+    if (errors.length > 0) {
+      throw new Error(`Fish Audio voices API error: ${errors.join("; ")}`);
+    }
+  }
+
+  return Array.from(voices.entries()).map(([id, name]) => ({ id, name }));
+}