diff --git a/README.md b/README.md index 81a1b2a..d7cac26 100644 --- a/README.md +++ b/README.md @@ -1,97 +1,51 @@ -# Fish Audio Speech Plugin for OpenClaw +# Fish Audio Speech -A speech provider plugin that integrates [Fish Audio](https://fish.audio) TTS with OpenClaw. +Bundled [Fish Audio](https://fish.audio) TTS speech provider for OpenClaw. ## Features -- **Fish Audio S2-Pro / S1 / S2** model support -- **Dynamic voice listing** — your own cloned voices + official Fish Audio voices -- **Format-aware output** — opus for voice notes (Telegram, WhatsApp), mp3 otherwise -- **Inline directives** — switch voice, speed, model, and latency mid-message -- **No core changes required** — standard `SpeechProviderPlugin` extension - -## Installation - -```bash -openclaw plugins install @openclaw/fish-audio-speech -``` +- Fish Audio S2-Pro and S1 model support +- Dynamic voice listing (user's own cloned/trained voices via `self=true`) +- Format-aware output: opus for voice notes (Telegram, WhatsApp), mp3 otherwise +- Inline directives: voice, speed, model, latency, temperature, top_p +- `voiceCompatible: true` for both formats ## Configuration -In your `openclaw.json`: - -```json +```json5 { - "messages": { - "tts": { - "provider": "fish-audio", - "providers": { + messages: { + tts: { + provider: "fish-audio", + providers: { "fish-audio": { - "apiKey": "your-fish-audio-api-key", - "voiceId": "8a2d42279389471993460b85340235c5", - "model": "s2-pro", - "latency": "normal", - "speed": 1.0 - } - } - } - } + apiKey: "your-fish-audio-api-key", + voiceId: "reference-id-of-voice", + model: "s2-pro", // s2-pro | s1 + latency: "normal", // normal | balanced | low + // speed: 1.0, // 0.5–2.0 (optional) + // temperature: 0.7, // 0–1 (optional) + // topP: 0.8, // 0–1 (optional) + }, + }, + }, + }, } ``` -### Config Options - -| Field | Type | Default | Description | -|-------|------|---------|-------------| -| `apiKey` | string | — | **Required.** Fish Audio API key | -| `voiceId` | 
string | — | **Required.** Reference ID of the voice to use | -| `model` | string | `s2-pro` | TTS model (`s2-pro`, `s1`, `s2`) | -| `latency` | string | `normal` | Latency mode (`normal`, `balanced`, `low`) | -| `speed` | number | — | Prosody speed (0.5–2.0) | -| `temperature` | number | — | Sampling temperature (0–1) | -| `topP` | number | — | Top-p sampling (0–1) | -| `baseUrl` | string | `https://api.fish.audio` | API base URL | - -### Environment Variable - -You can also set the API key via environment variable: - -```bash -FISH_AUDIO_API_KEY=your-key -``` +Environment variable fallback: `FISH_AUDIO_API_KEY`. ## Directives -Use inline directives in your messages to control TTS per-message: +All directive keys are provider-prefixed to avoid dispatch collisions with +bundled providers (OpenAI, ElevenLabs) that claim generic keys like `voice` +and `model`. Both `fishaudio_*` and shorter `fish_*` aliases are accepted. ``` -[[tts:voice=<ref_id>]] Switch voice -[[tts:speed=1.2]] Prosody speed (0.5–2.0) -[[tts:model=s1]] Model override -[[tts:latency=low]] Latency mode -[[tts:temperature=0.7]] Sampling temperature -[[tts:top_p=0.8]] Top-p sampling +[[tts:fishaudio_voice=<ref_id>]] Switch voice (or fish_voice) +[[tts:fishaudio_speed=1.2]] Prosody speed 0.5–2.0 (or fish_speed) +[[tts:fishaudio_model=s1]] Model override (or fish_model) +[[tts:fishaudio_latency=low]] Latency mode (or fish_latency) +[[tts:fishaudio_temperature=0.7]] Sampling temperature (or fish_temperature) +[[tts:fishaudio_top_p=0.8]] Top-p sampling (or fish_top_p) ``` - -## Voice Listing - -The plugin dynamically lists available voices via `/tts voices`: -- **Official Fish Audio voices** (~38 voices) -- **Your own cloned/trained voices** (marked with "(mine)") - -## Output Format - -The plugin automatically selects the best format based on the channel: -- **Voice note channels** (Telegram, WhatsApp, Matrix, Feishu) → Opus -- **All other channels** → MP3 - -Both formats set `voiceCompatible: true` — Fish Audio output 
works cleanly as native voice notes. - -## Requirements - -- OpenClaw ≥ 2026.3.0 -- Fish Audio API key ([get one here](https://fish.audio)) - -## License - -MIT diff --git a/package.json b/package.json index 3d68da6..ffdd463 100644 --- a/package.json +++ b/package.json @@ -1,21 +1,12 @@ { "name": "@openclaw/fish-audio-speech", - "version": "0.1.0", - "description": "Fish Audio TTS speech provider plugin for OpenClaw", + "version": "2026.3.28", + "private": true, + "description": "OpenClaw Fish Audio speech plugin", "type": "module", - "license": "MIT", "openclaw": { "extensions": [ "./index.ts" ] - }, - "peerDependencies": { - "openclaw": ">=2026.3.0" - }, - "devDependencies": { - "vitest": "^3.0.0" - }, - "scripts": { - "test": "vitest run" } } diff --git a/plugin-registration.contract.test.ts b/plugin-registration.contract.test.ts new file mode 100644 index 0000000..b17d702 --- /dev/null +++ b/plugin-registration.contract.test.ts @@ -0,0 +1,7 @@ +import { describePluginRegistrationContract } from "../../test/helpers/plugins/plugin-registration-contract.js"; + +describePluginRegistrationContract({ + pluginId: "fish-audio", + speechProviderIds: ["fish-audio"], + requireSpeechVoices: true, +}); diff --git a/speech-provider.test.ts b/speech-provider.test.ts index c3673ae..6c342c2 100644 --- a/speech-provider.test.ts +++ b/speech-provider.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from "vitest"; -import { isValidFishAudioVoiceId } from "./speech-provider.js"; +import { buildFishAudioSpeechProvider, isValidFishAudioVoiceId } from "./speech-provider.js"; describe("fish-audio speech provider", () => { describe("isValidFishAudioVoiceId", () => { @@ -37,4 +37,80 @@ describe("fish-audio speech provider", () => { } }); }); + + describe("parseDirectiveToken", () => { + const provider = buildFishAudioSpeechProvider(); + const parse = provider.parseDirectiveToken!; + + const policy = { allowVoice: true, allowModelId: true, allowVoiceSettings: true, allowProvider: 
true }; + + it("handles provider-prefixed voice keys", () => { + const voiceId = "8a2d42279389471993460b85340235c5"; + for (const key of ["fishaudio_voice", "fish_voice", "fishaudio_voiceid"]) { + const result = parse({ key, value: voiceId, policy, currentOverrides: {} }); + expect(result.handled, `${key} should be handled`).toBe(true); + expect(result.overrides?.voiceId).toBe(voiceId); + } + }); + + it("handles provider-prefixed model keys", () => { + for (const key of ["fishaudio_model", "fish_model"]) { + const result = parse({ key, value: "s1", policy, currentOverrides: {} }); + expect(result.handled, `${key} should be handled`).toBe(true); + expect(result.overrides?.model).toBe("s1"); + } + }); + + it("handles provider-prefixed speed keys", () => { + for (const key of ["fishaudio_speed", "fish_speed"]) { + const result = parse({ key, value: "1.5", policy, currentOverrides: {} }); + expect(result.handled, `${key} should be handled`).toBe(true); + expect(result.overrides?.speed).toBe(1.5); + } + }); + + it("handles provider-prefixed latency keys", () => { + for (const key of ["fishaudio_latency", "fish_latency"]) { + const result = parse({ key, value: "low", policy, currentOverrides: {} }); + expect(result.handled, `${key} should be handled`).toBe(true); + expect(result.overrides?.latency).toBe("low"); + } + }); + + it("does NOT claim generic keys (voice, model, speed)", () => { + for (const key of ["voice", "model", "speed", "voiceid", "voice_id", "modelid", "model_id", "latency", "temperature", "temp", "top_p", "topp"]) { + const result = parse({ key, value: "anything", policy, currentOverrides: {} }); + expect(result.handled, `generic key "${key}" should NOT be handled`).toBe(false); + } + }); + + it("rejects invalid voice ID with warning", () => { + const result = parse({ key: "fishaudio_voice", value: "bad!", policy, currentOverrides: {} }); + expect(result.handled).toBe(true); + expect(result.warnings?.length).toBeGreaterThan(0); + 
expect(result.overrides).toBeUndefined(); + }); + + it("validates speed range", () => { + const result = parse({ key: "fishaudio_speed", value: "5.0", policy, currentOverrides: {} }); + expect(result.handled).toBe(true); + expect(result.warnings?.length).toBeGreaterThan(0); + }); + + it("rejects invalid latency values with warning instead of silently defaulting", () => { + const result = parse({ key: "fishaudio_latency", value: "fast", policy, currentOverrides: {} }); + expect(result.handled).toBe(true); + expect(result.warnings?.length).toBeGreaterThan(0); + expect(result.overrides).toBeUndefined(); + }); + + it("accepts valid latency values", () => { + for (const value of ["normal", "balanced", "low"]) { + const result = parse({ key: "fishaudio_latency", value, policy, currentOverrides: {} }); + expect(result.handled).toBe(true); + expect(result.overrides?.latency).toBe(value); + expect(result.warnings).toBeUndefined(); + } + }); + }); }); diff --git a/speech-provider.ts b/speech-provider.ts index 556b387..dd7b417 100644 --- a/speech-provider.ts +++ b/speech-provider.ts @@ -6,11 +6,9 @@ import type { SpeechVoiceOption, } from "openclaw/plugin-sdk/speech-core"; import { requireInRange } from "openclaw/plugin-sdk/speech-core"; -import { fishAudioTTS, listFishAudioVoices } from "./tts.js"; +import { DEFAULT_FISH_AUDIO_BASE_URL, fishAudioTTS, listFishAudioVoices, normalizeFishAudioBaseUrl } from "./tts.js"; // ── Defaults ──────────────────────────────────────────────────────────────── - -const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio"; // No default voice — users must configure one. Fish Audio has no universal // "default" voice like ElevenLabs does, and shipping a personal clone ID // as default would be wrong for community users. 
@@ -18,7 +16,7 @@ const DEFAULT_VOICE_ID = ""; const DEFAULT_MODEL = "s2-pro"; const DEFAULT_LATENCY = "normal" as const; -const FISH_AUDIO_MODELS = ["s2-pro", "s1", "s2"] as const; +const FISH_AUDIO_MODELS = ["s2-pro", "s1"] as const; // ── Types ─────────────────────────────────────────────────────────────────── @@ -54,11 +52,6 @@ function parseNumberValue(value: string): number | undefined { return Number.isFinite(parsed) ? parsed : undefined; } -function normalizeBaseUrl(baseUrl: string | undefined): string { - const trimmed = baseUrl?.trim(); - return trimmed?.replace(/\/+$/, "") || DEFAULT_FISH_AUDIO_BASE_URL; -} - function normalizeLatency(value: unknown): "normal" | "balanced" | "low" { const s = typeof value === "string" ? value.trim().toLowerCase() : ""; if (s === "balanced" || s === "low") return s; @@ -90,7 +83,7 @@ function normalizeFishAudioProviderConfig( value: raw?.apiKey, path: "messages.tts.providers.fish-audio.apiKey", }), - baseUrl: normalizeBaseUrl(trimToUndefined(raw?.baseUrl)), + baseUrl: normalizeFishAudioBaseUrl(trimToUndefined(raw?.baseUrl)), voiceId: trimToUndefined(raw?.voiceId) ?? DEFAULT_VOICE_ID, model: normalizeModel(raw?.model), latency: normalizeLatency(raw?.latency), @@ -106,7 +99,7 @@ function readFishAudioProviderConfig( const defaults = normalizeFishAudioProviderConfig({}); return { apiKey: trimToUndefined(config.apiKey) ?? defaults.apiKey, - baseUrl: normalizeBaseUrl( + baseUrl: normalizeFishAudioBaseUrl( trimToUndefined(config.baseUrl) ?? defaults.baseUrl, ), voiceId: trimToUndefined(config.voiceId) ?? defaults.voiceId, @@ -121,14 +114,17 @@ function readFishAudioProviderConfig( // ── Directive parsing ─────────────────────────────────────────────────────── function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) { + // Only claim provider-prefixed keys to avoid dispatch collisions. 
+ // `parseTtsDirectives` stops at the first provider whose `parseDirectiveToken` + // returns `handled: true`, and bundled providers with lower `autoSelectOrder` + // (e.g. OpenAI at 10) are visited first. Generic keys like "voice" or "model" + // would be swallowed by earlier providers and never reach us. + // Convention matches ElevenLabs: `elevenlabs_voice`, `elevenlabs_model`, etc. try { switch (ctx.key) { - case "voice": - case "voiceid": - case "voice_id": + case "fishaudio_voice": case "fish_voice": - case "fishvoice": - case "reference_id": + case "fishaudio_voiceid": if (!ctx.policy.allowVoice) { return { handled: true }; } @@ -143,11 +139,8 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) { overrides: { ...(ctx.currentOverrides ?? {}), voiceId: ctx.value }, }; - case "model": - case "modelid": - case "model_id": + case "fishaudio_model": case "fish_model": - case "fishmodel": if (!ctx.policy.allowModelId) { return { handled: true }; } @@ -156,7 +149,8 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) { overrides: { ...(ctx.currentOverrides ?? {}), model: ctx.value }, }; - case "speed": { + case "fishaudio_speed": + case "fish_speed": { if (!ctx.policy.allowVoiceSettings) { return { handled: true }; } @@ -171,21 +165,26 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) { }; } - case "latency": - case "fish_latency": + case "fishaudio_latency": + case "fish_latency": { if (!ctx.policy.allowVoiceSettings) { return { handled: true }; } - { - const lat = normalizeLatency(ctx.value); + const raw = typeof ctx.value === "string" ? ctx.value.trim().toLowerCase() : ""; + if (raw !== "normal" && raw !== "balanced" && raw !== "low") { return { handled: true, - overrides: { ...(ctx.currentOverrides ?? {}), latency: lat }, + warnings: [`invalid Fish Audio latency "${ctx.value}" (expected: normal, balanced, low)`], }; } + return { + handled: true, + overrides: { ...(ctx.currentOverrides ?? 
{}), latency: raw }, + }; + } - case "temperature": - case "temp": { + case "fishaudio_temperature": + case "fish_temperature": { if (!ctx.policy.allowVoiceSettings) { return { handled: true }; } @@ -200,8 +199,8 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) { }; } - case "top_p": - case "topp": { + case "fishaudio_top_p": + case "fish_top_p": { if (!ctx.policy.allowVoiceSettings) { return { handled: true }; } @@ -233,6 +232,9 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin { return { id: "fish-audio", label: "Fish Audio", + // Lower = higher priority in auto-detect fallback. Positioned below OpenAI (10) + // but above ElevenLabs (20) and Microsoft (30) since Fish Audio requires + // explicit configuration (apiKey + voiceId) to pass isConfigured(). autoSelectOrder: 15, models: FISH_AUDIO_MODELS, @@ -241,17 +243,43 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin { parseDirectiveToken, - // Talk Mode — v2, stub for now - resolveTalkConfig: ({ baseTtsConfig }) => - normalizeFishAudioProviderConfig(baseTtsConfig), + resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => { + const base = normalizeFishAudioProviderConfig(baseTtsConfig); + return { + ...base, + ...(talkProviderConfig.apiKey === undefined + ? {} + : { + apiKey: normalizeResolvedSecretInputString({ + value: talkProviderConfig.apiKey, + path: "talk.providers.fish-audio.apiKey", + }), + }), + ...(trimToUndefined(talkProviderConfig.baseUrl) == null + ? {} + : { baseUrl: normalizeFishAudioBaseUrl(trimToUndefined(talkProviderConfig.baseUrl)) }), + ...(trimToUndefined(talkProviderConfig.voiceId) == null + ? {} + : { voiceId: trimToUndefined(talkProviderConfig.voiceId) }), + ...(trimToUndefined(talkProviderConfig.modelId) == null + ? {} + : { model: normalizeModel(talkProviderConfig.modelId) }), + ...(talkProviderConfig.latency == null + ? 
{} + : { latency: normalizeLatency(talkProviderConfig.latency) }), + ...(asNumber(talkProviderConfig.speed) == null + ? {} + : { speed: asNumber(talkProviderConfig.speed) }), + }; + }, resolveTalkOverrides: ({ params }) => ({ ...(trimToUndefined(params.voiceId) == null ? {} : { voiceId: trimToUndefined(params.voiceId) }), - ...(trimToUndefined(params.model) == null + ...(trimToUndefined(params.modelId) == null ? {} - : { model: trimToUndefined(params.model) }), + : { model: trimToUndefined(params.modelId) }), ...(asNumber(params.speed) == null ? {} : { speed: asNumber(params.speed) }), @@ -314,10 +342,9 @@ export function buildFishAudioSpeechProvider(): SpeechProviderPlugin { referenceId: voiceId, model: trimToUndefined(overrides.model) ?? config.model, format, - latency: - normalizeLatency(overrides.latency) !== DEFAULT_LATENCY - ? normalizeLatency(overrides.latency) - : config.latency, + latency: overrides.latency != null + ? normalizeLatency(overrides.latency) + : config.latency, speed, temperature: asNumber(overrides.temperature) ?? config.temperature, topP: asNumber(overrides.topP) ?? 
config.topP, diff --git a/test-api.ts b/test-api.ts new file mode 100644 index 0000000..be9945d --- /dev/null +++ b/test-api.ts @@ -0,0 +1 @@ +export { buildFishAudioSpeechProvider } from "./speech-provider.js"; diff --git a/tts.ts b/tts.ts index 05d66b7..5b5866d 100644 --- a/tts.ts +++ b/tts.ts @@ -1,6 +1,6 @@ -const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio"; +export const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio"; -function normalizeFishAudioBaseUrl(baseUrl?: string): string { +export function normalizeFishAudioBaseUrl(baseUrl?: string): string { const trimmed = baseUrl?.trim(); if (!trimmed) { return DEFAULT_FISH_AUDIO_BASE_URL; @@ -115,67 +115,32 @@ export async function listFishAudioVoices(params: { }): Promise<Array<{ id: string; name: string }>> { const base = normalizeFishAudioBaseUrl(params.baseUrl); - // Two parallel calls: official voices + user's own voices - const [officialRes, selfRes] = await Promise.allSettled([ - fetch(`${base}/model?type=tts&author_id=d8b0991f96b44e489422ca2ddf0bd31d&page_size=100`, { - headers: { Authorization: `Bearer ${params.apiKey}` }, - }), - fetch(`${base}/model?type=tts&self=true&page_size=100`, { - headers: { Authorization: `Bearer ${params.apiKey}` }, - }), - ]); + // List the authenticated user's own voices (cloned/trained). + // Fish Audio has no stable API for fetching a curated "official" voice + // catalogue — the public model listing returns the entire community corpus + // (1M+ entries) and filtering by undocumented author IDs would be fragile. + // Users can browse and select voices at https://fish.audio and configure + // their chosen voiceId directly. 
+ const res = await fetch(`${base}/model?type=tts&self=true&page_size=100`, { + headers: { Authorization: `Bearer ${params.apiKey}` }, + }); - const voices = new Map<string, string>(); - - // Process official voices first - if (officialRes.status === "fulfilled" && officialRes.value.ok) { - const json = (await officialRes.value.json()) as { - items?: Array<{ _id?: string; title?: string }>; - }; - if (Array.isArray(json.items)) { - for (const v of json.items) { - const id = v._id?.trim(); - const name = v.title?.trim(); - if (id) { - voices.set(id, name || id); - } - } - } + if (!res.ok) { + throw new Error(`Fish Audio voices API error (${res.status})`); } - // User's own voices take precedence on conflict - if (selfRes.status === "fulfilled" && selfRes.value.ok) { - const json = (await selfRes.value.json()) as { - items?: Array<{ _id?: string; title?: string }>; - }; - if (Array.isArray(json.items)) { - for (const v of json.items) { - const id = v._id?.trim(); - const name = v.title?.trim(); - if (id) { - voices.set(id, name ? `${name} (mine)` : id); - } - } - } + const json = (await res.json()) as { + items?: Array<{ _id?: string; title?: string }>; + }; + + if (!Array.isArray(json.items)) { + return []; } - // If both calls failed, throw - if (voices.size === 0) { - const errors: string[] = []; - if (officialRes.status === "rejected") { - errors.push(`official: ${officialRes.reason}`); - } else if (!officialRes.value.ok) { - errors.push(`official: HTTP ${officialRes.value.status}`); - } - if (selfRes.status === "rejected") { - errors.push(`self: ${selfRes.reason}`); - } else if (!selfRes.value.ok) { - errors.push(`self: HTTP ${selfRes.value.status}`); - } - if (errors.length > 0) { - throw new Error(`Fish Audio voices API error: ${errors.join("; ")}`); - } - } - - return Array.from(voices.entries()).map(([id, name]) => ({ id, name })); + return json.items + .map((v) => ({ + id: v._id?.trim() ?? 
"", + name: v.title?.trim() || v._id?.trim() || "", + })) + .filter((v) => v.id.length > 0); }