feat: scaffold Fish Audio speech provider plugin
- index.ts: plugin entry with definePluginEntry + registerSpeechProvider - speech-provider.ts: full SpeechProviderPlugin implementation - resolveConfig from messages.tts.providers.fish-audio - parseDirectiveToken for voice, model, speed, latency, temperature, top_p - listVoices merging official + user's own voices - synthesize with format-aware output (opus for voice-note, mp3 otherwise) - stub Talk Mode (resolveTalkConfig/resolveTalkOverrides) - tts.ts: raw fishAudioTTS() fetch + listFishAudioVoices() - streaming chunked → buffer, error body included in exceptions - parallel voice listing with graceful partial failure - speech-provider.test.ts: voice ID validation tests - openclaw.plugin.json: speechProviders contract - package.json: peer dep on openclaw >=2026.3.0
This commit is contained in:
98
README.md
98
README.md
@@ -1,3 +1,97 @@
|
|||||||
# fish-audio-plugin
|
# Fish Audio Speech Plugin for OpenClaw
|
||||||
|
|
||||||
Fish Audio TTS speech provider plugin for OpenClaw
|
A speech provider plugin that integrates [Fish Audio](https://fish.audio) TTS with OpenClaw.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **Fish Audio S2-Pro / S1 / S2** model support
|
||||||
|
- **Dynamic voice listing** — your own cloned voices + official Fish Audio voices
|
||||||
|
- **Format-aware output** — opus for voice notes (Telegram, WhatsApp), mp3 otherwise
|
||||||
|
- **Inline directives** — switch voice, speed, model, and latency mid-message
|
||||||
|
- **No core changes required** — standard `SpeechProviderPlugin` extension
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
openclaw plugins install @openclaw/fish-audio-speech
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
In your `openclaw.json`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"messages": {
|
||||||
|
"tts": {
|
||||||
|
"provider": "fish-audio",
|
||||||
|
"providers": {
|
||||||
|
"fish-audio": {
|
||||||
|
"apiKey": "your-fish-audio-api-key",
|
||||||
|
"voiceId": "8a2d42279389471993460b85340235c5",
|
||||||
|
"model": "s2-pro",
|
||||||
|
"latency": "normal",
|
||||||
|
"speed": 1.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Config Options
|
||||||
|
|
||||||
|
| Field | Type | Default | Description |
|
||||||
|
|-------|------|---------|-------------|
|
||||||
|
| `apiKey` | string | — | **Required.** Fish Audio API key |
|
||||||
|
| `voiceId` | string | `8a2d42...` | Reference ID of the voice to use |
|
||||||
|
| `model` | string | `s2-pro` | TTS model (`s2-pro`, `s1`, `s2`) |
|
||||||
|
| `latency` | string | `normal` | Latency mode (`normal`, `balanced`, `low`) |
|
||||||
|
| `speed` | number | — | Prosody speed (0.5–2.0) |
|
||||||
|
| `temperature` | number | — | Sampling temperature (0–1) |
|
||||||
|
| `topP` | number | — | Top-p sampling (0–1) |
|
||||||
|
| `baseUrl` | string | `https://api.fish.audio` | API base URL |
|
||||||
|
|
||||||
|
### Environment Variable
|
||||||
|
|
||||||
|
You can also set the API key via environment variable:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
FISH_AUDIO_API_KEY=your-key
|
||||||
|
```
|
||||||
|
|
||||||
|
## Directives
|
||||||
|
|
||||||
|
Use inline directives in your messages to control TTS per-message:
|
||||||
|
|
||||||
|
```
|
||||||
|
[[tts:voice=<ref_id>]] Switch voice
|
||||||
|
[[tts:speed=1.2]] Prosody speed (0.5–2.0)
|
||||||
|
[[tts:model=s1]] Model override
|
||||||
|
[[tts:latency=low]] Latency mode
|
||||||
|
[[tts:temperature=0.7]] Sampling temperature
|
||||||
|
[[tts:top_p=0.8]] Top-p sampling
|
||||||
|
```
|
||||||
|
|
||||||
|
## Voice Listing
|
||||||
|
|
||||||
|
The plugin dynamically lists available voices via `/tts voices`:
|
||||||
|
- **Official Fish Audio voices** (~38 voices)
|
||||||
|
- **Your own cloned/trained voices** (marked with "(mine)")
|
||||||
|
|
||||||
|
## Output Format
|
||||||
|
|
||||||
|
The plugin automatically selects the best format based on the channel:
|
||||||
|
- **Voice note channels** (Telegram, WhatsApp, Matrix, Feishu) → Opus
|
||||||
|
- **All other channels** → MP3
|
||||||
|
|
||||||
|
Both formats set `voiceCompatible: true` — Fish Audio output works cleanly as native voice notes.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- OpenClaw ≥ 2026.3.0
|
||||||
|
- Fish Audio API key ([get one here](https://fish.audio))
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
MIT
|
||||||
|
|||||||
11
index.ts
Normal file
11
index.ts
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
|
||||||
|
import { buildFishAudioSpeechProvider } from "./speech-provider.js";
|
||||||
|
|
||||||
|
/**
 * Plugin entry for the Fish Audio speech provider.
 *
 * Registration is the only side effect: when the host invokes `register`,
 * a freshly built Fish Audio speech provider is handed to
 * `api.registerSpeechProvider`.
 */
const fishAudioPluginEntry = definePluginEntry({
  id: "fish-audio",
  name: "Fish Audio Speech",
  description: "Fish Audio TTS speech provider for OpenClaw",
  register: (api) => {
    const provider = buildFishAudioSpeechProvider();
    api.registerSpeechProvider(provider);
  },
});

export default fishAudioPluginEntry;
|
||||||
11
openclaw.plugin.json
Normal file
11
openclaw.plugin.json
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
{
|
||||||
|
"id": "fish-audio",
|
||||||
|
"contracts": {
|
||||||
|
"speechProviders": ["fish-audio"]
|
||||||
|
},
|
||||||
|
"configSchema": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": false,
|
||||||
|
"properties": {}
|
||||||
|
}
|
||||||
|
}
|
||||||
21
package.json
Normal file
21
package.json
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
{
|
||||||
|
"name": "@openclaw/fish-audio-speech",
|
||||||
|
"version": "0.1.0",
|
||||||
|
"description": "Fish Audio TTS speech provider plugin for OpenClaw",
|
||||||
|
"type": "module",
|
||||||
|
"license": "MIT",
|
||||||
|
"openclaw": {
|
||||||
|
"extensions": [
|
||||||
|
"./index.ts"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"peerDependencies": {
|
||||||
|
"openclaw": ">=2026.3.0"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"vitest": "^3.0.0"
|
||||||
|
},
|
||||||
|
"scripts": {
|
||||||
|
"test": "vitest run"
|
||||||
|
}
|
||||||
|
}
|
||||||
39
speech-provider.test.ts
Normal file
39
speech-provider.test.ts
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
import { describe, expect, it } from "vitest";
|
||||||
|
import { isValidFishAudioVoiceId } from "./speech-provider.js";
|
||||||
|
|
||||||
|
// Voice ID validation suite: the validator must accept 24-40 character hex
// reference IDs and reject everything else (wrong length, non-hex characters,
// separators, path/query fragments, other providers' ID formats).
describe("fish-audio speech provider", () => {
  describe("isValidFishAudioVoiceId", () => {
    it("accepts valid Fish Audio ref IDs (24-40 char hex)", () => {
      const accepted: string[] = [
        "8a2d42279389471993460b85340235c5", // 32 char - standard
        "0dad9e24630447cf97803f4beee10481", // 32 char
        "5796fe24630447cf97803f4beee10481", // 32 char
        "d8b0991f96b44e489422ca2ddf0bd31d", // 32 char - author id
        "aabbccddee112233445566778899", // 28 char
        "aabbccddee11223344556677", // 24 char (minimum)
      ];

      accepted.forEach((candidate) => {
        const verdict = isValidFishAudioVoiceId(candidate);
        expect(verdict, `expected valid: ${candidate}`).toBe(true);
      });
    });

    it("rejects invalid voice IDs", () => {
      const rejected: string[] = [
        "", // empty
        "abc123", // too short
        "12345678901234567890123", // 23 chars - below minimum
        "a".repeat(41), // too long
        "8a2d4227-9389-4719-9346-0b85340235c5", // UUID with dashes
        "../../../etc/passwd", // path traversal
        "voice?param=value", // query string
        "pMsXgVXv3BLzUgSXRplE", // ElevenLabs-style (mixed case, 20 chars)
        "ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ", // non-hex chars
      ];

      rejected.forEach((candidate) => {
        const verdict = isValidFishAudioVoiceId(candidate);
        expect(verdict, `expected invalid: ${candidate}`).toBe(false);
      });
    });
  });
});
|
||||||
322
speech-provider.ts
Normal file
322
speech-provider.ts
Normal file
@@ -0,0 +1,322 @@
|
|||||||
|
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
|
||||||
|
import type {
|
||||||
|
SpeechDirectiveTokenParseContext,
|
||||||
|
SpeechProviderConfig,
|
||||||
|
SpeechProviderPlugin,
|
||||||
|
SpeechVoiceOption,
|
||||||
|
} from "openclaw/plugin-sdk/speech-core";
|
||||||
|
import { requireInRange } from "openclaw/plugin-sdk/speech-core";
|
||||||
|
import { fishAudioTTS, listFishAudioVoices } from "./tts.js";
|
||||||
|
|
||||||
|
// ── Defaults ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Base URL used when the configuration does not provide one. */
const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio";

/** Voice reference ID used when no `voiceId` is configured (SJ voice). */
const DEFAULT_VOICE_ID = "8a2d42279389471993460b85340235c5";

/** Models advertised by this provider. */
const FISH_AUDIO_MODELS = ["s2-pro", "s1", "s2"] as const;

/** Model used when no `model` is configured. */
const DEFAULT_MODEL = "s2-pro";

/** Latency mode used when none (or an unrecognised one) is configured. */
const DEFAULT_LATENCY: "normal" = "normal";
|
||||||
|
|
||||||
|
// ── Types ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Latency modes understood by this provider. */
type FishAudioLatency = "normal" | "balanced" | "low";

/** Fully normalized Fish Audio provider settings. */
type FishAudioProviderConfig = {
  /** API key from configuration; callers fall back to `FISH_AUDIO_API_KEY` when absent. */
  apiKey?: string;
  /** API base URL without trailing slashes. */
  baseUrl: string;
  /** Voice reference ID sent to the API as `reference_id`. */
  voiceId: string;
  /** TTS model name. */
  model: string;
  /** Latency mode requested from the API. */
  latency: FishAudioLatency;
  /** Optional prosody speed. */
  speed?: number;
  /** Optional sampling temperature. */
  temperature?: number;
  /** Optional top-p sampling value. */
  topP?: number;
};
|
||||||
|
|
||||||
|
// ── Helpers ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Returns the trimmed string, or `undefined` when `value` is not a string or
 * contains only whitespace.
 */
function trimToUndefined(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
|
||||||
|
|
||||||
|
/**
 * Returns `value` when it is a finite number; otherwise `undefined`
 * (non-numbers, `NaN` and ±`Infinity` are all rejected).
 */
function asNumber(value: unknown): number | undefined {
  if (typeof value !== "number") {
    return undefined;
  }
  return Number.isFinite(value) ? value : undefined;
}
|
||||||
|
|
||||||
|
/**
 * Returns `value` viewed as a string-keyed record when it is a non-null,
 * non-array object; otherwise `undefined`.
 */
function asObject(value: unknown): Record<string, unknown> | undefined {
  if (value === null || typeof value !== "object") {
    return undefined;
  }
  if (Array.isArray(value)) {
    return undefined;
  }
  return value as Record<string, unknown>;
}
|
||||||
|
|
||||||
|
/** Plain decimal number: optional sign, digits with optional fraction, optional exponent. */
const DECIMAL_NUMBER_PATTERN = /^[+-]?(?:\d+\.?\d*|\.\d+)(?:e[+-]?\d+)?$/i;

/**
 * Parses a directive value as a finite decimal number.
 *
 * The whole (trimmed) string must be a number: unlike `Number.parseFloat`,
 * inputs with trailing garbage such as `"1.2abc"` or `"0.7,0.8"` are rejected
 * instead of being silently truncated to their numeric prefix.
 *
 * @param value Raw directive value.
 * @returns The parsed number, or `undefined` when `value` is empty, malformed
 *   or not finite.
 */
function parseNumberValue(value: string): number | undefined {
  const candidate = value.trim();
  if (!DECIMAL_NUMBER_PATTERN.test(candidate)) {
    return undefined;
  }
  const parsed = Number(candidate);
  return Number.isFinite(parsed) ? parsed : undefined;
}
|
||||||
|
|
||||||
|
/**
 * Trims `baseUrl` and strips trailing slashes. Falls back to
 * `DEFAULT_FISH_AUDIO_BASE_URL` when nothing is left (missing, blank or
 * slash-only input).
 */
function normalizeBaseUrl(baseUrl: string | undefined): string {
  const withoutTrailingSlashes = (baseUrl ?? "").trim().replace(/\/+$/, "");
  if (withoutTrailingSlashes === "") {
    return DEFAULT_FISH_AUDIO_BASE_URL;
  }
  return withoutTrailingSlashes;
}
|
||||||
|
|
||||||
|
/**
 * Maps an arbitrary value onto a supported latency mode.
 *
 * Matching is case-insensitive and ignores surrounding whitespace. Anything
 * other than `"balanced"` or `"low"` (including non-strings) yields
 * `DEFAULT_LATENCY`.
 */
function normalizeLatency(value: unknown): "normal" | "balanced" | "low" {
  if (typeof value !== "string") {
    return DEFAULT_LATENCY;
  }
  switch (value.trim().toLowerCase()) {
    case "balanced":
      return "balanced";
    case "low":
      return "low";
    default:
      return DEFAULT_LATENCY;
  }
}
|
||||||
|
|
||||||
|
/**
 * Returns the trimmed model name, or `DEFAULT_MODEL` when `value` is not a
 * string or is blank. The name is not checked against a list of known models.
 */
function normalizeModel(value: unknown): string {
  if (typeof value === "string") {
    const candidate = value.trim();
    if (candidate !== "") {
      return candidate;
    }
  }
  return DEFAULT_MODEL;
}
|
||||||
|
|
||||||
|
/** 24 to 40 hexadecimal characters, matched case-insensitively. */
const FISH_AUDIO_VOICE_ID_PATTERN = /^[a-f0-9]{24,40}$/i;

/**
 * Checks whether `voiceId` looks like a Fish Audio reference ID.
 *
 * An ID is accepted when it consists solely of 24–40 hex characters (either
 * case); separators, path or query characters and any other symbols are
 * rejected.
 *
 * @param voiceId Candidate reference ID.
 * @returns `true` when the ID matches the accepted shape.
 */
export function isValidFishAudioVoiceId(voiceId: string): boolean {
  return FISH_AUDIO_VOICE_ID_PATTERN.test(voiceId);
}
|
||||||
|
|
||||||
|
// ── Config resolution ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Builds a normalized provider config from a raw TTS config object.
 *
 * The Fish Audio section is looked up at `rawConfig.providers["fish-audio"]`
 * first and at `rawConfig["fish-audio"]` second. Missing or malformed fields
 * fall back to the module defaults (base URL, voice, model, latency) or stay
 * `undefined` (speed, temperature, topP).
 *
 * @param rawConfig Raw configuration object.
 * @returns The normalized provider configuration.
 */
function normalizeFishAudioProviderConfig(
  rawConfig: Record<string, unknown>,
): FishAudioProviderConfig {
  const nestedSection = asObject(asObject(rawConfig.providers)?.["fish-audio"]);
  const section = nestedSection ?? asObject(rawConfig["fish-audio"]);

  const apiKey = normalizeResolvedSecretInputString({
    value: section?.apiKey,
    path: "messages.tts.providers.fish-audio.apiKey",
  });
  const baseUrl = normalizeBaseUrl(trimToUndefined(section?.baseUrl));
  const voiceId = trimToUndefined(section?.voiceId) ?? DEFAULT_VOICE_ID;
  const model = normalizeModel(section?.model);
  const latency = normalizeLatency(section?.latency);
  const speed = asNumber(section?.speed);
  const temperature = asNumber(section?.temperature);
  const topP = asNumber(section?.topP);

  return { apiKey, baseUrl, voiceId, model, latency, speed, temperature, topP };
}
|
||||||
|
|
||||||
|
/**
 * Re-normalizes a provider config supplied by the host.
 *
 * Each field is read from `config`, cleaned with the same helpers used during
 * config resolution, and falls back to the values produced by normalizing an
 * empty raw config.
 *
 * @param config Provider configuration passed in by the host.
 * @returns The normalized provider configuration.
 */
function readFishAudioProviderConfig(
  config: SpeechProviderConfig,
): FishAudioProviderConfig {
  const fallback = normalizeFishAudioProviderConfig({});

  const apiKey = trimToUndefined(config.apiKey) ?? fallback.apiKey;
  const baseUrl = normalizeBaseUrl(
    trimToUndefined(config.baseUrl) ?? fallback.baseUrl,
  );
  const voiceId = trimToUndefined(config.voiceId) ?? fallback.voiceId;
  // normalizeModel already substitutes the default model for blank input; the
  // extra fallback only guards against an empty result.
  const model = normalizeModel(config.model) || fallback.model;
  const latency = normalizeLatency(config.latency);
  const speed = asNumber(config.speed) ?? fallback.speed;
  const temperature = asNumber(config.temperature) ?? fallback.temperature;
  const topP = asNumber(config.topP) ?? fallback.topP;

  return { apiKey, baseUrl, voiceId, model, latency, speed, temperature, topP };
}
|
||||||
|
|
||||||
|
// ── Directive parsing ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Interprets a single inline TTS directive (`key=value`) for Fish Audio.
 *
 * Result shape:
 * - `{ handled: false }` — the key is not one this provider understands.
 * - `{ handled: true }` — the key is understood but the policy in
 *   `ctx.policy` disallows it; the directive is consumed without effect.
 * - `{ handled: true, warnings }` — the value is invalid; no override is set.
 * - `{ handled: true, overrides }` — `ctx.currentOverrides` merged with the
 *   newly parsed setting.
 *
 * Anything thrown while parsing (the range checks via `requireInRange` are the
 * expected source) is converted into a warning rather than propagated.
 *
 * @param ctx Directive key/value, the active policy and overrides gathered so far.
 */
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
  try {
    switch (ctx.key) {
      case "voice":
      case "voiceid":
      case "voice_id":
      case "fish_voice":
      case "fishvoice":
      case "reference_id":
        if (!ctx.policy.allowVoice) {
          return { handled: true };
        }
        if (!isValidFishAudioVoiceId(ctx.value)) {
          return {
            handled: true,
            warnings: [`invalid Fish Audio voice ID "${ctx.value}"`],
          };
        }
        return {
          handled: true,
          overrides: { ...(ctx.currentOverrides ?? {}), voiceId: ctx.value },
        };

      case "model":
      case "modelid":
      case "model_id":
      case "fish_model":
      case "fishmodel":
        if (!ctx.policy.allowModelId) {
          return { handled: true };
        }
        return {
          handled: true,
          overrides: { ...(ctx.currentOverrides ?? {}), model: ctx.value },
        };

      case "speed": {
        if (!ctx.policy.allowVoiceSettings) {
          return { handled: true };
        }
        const value = parseNumberValue(ctx.value);
        if (value == null) {
          return { handled: true, warnings: ["invalid speed value"] };
        }
        requireInRange(value, 0.5, 2.0, "speed");
        return {
          handled: true,
          overrides: { ...(ctx.currentOverrides ?? {}), speed: value },
        };
      }

      case "latency":
      case "fish_latency": {
        if (!ctx.policy.allowVoiceSettings) {
          return { handled: true };
        }
        // Validate before normalizing: normalizeLatency maps every unknown
        // value to the default, which would silently turn a typo such as
        // "latency=fast" into an explicit "normal" override.
        const requested = ctx.value.trim().toLowerCase();
        if (
          requested !== "normal" &&
          requested !== "balanced" &&
          requested !== "low"
        ) {
          return {
            handled: true,
            warnings: [
              `invalid latency value "${ctx.value}" (expected normal, balanced, or low)`,
            ],
          };
        }
        return {
          handled: true,
          overrides: {
            ...(ctx.currentOverrides ?? {}),
            latency: normalizeLatency(requested),
          },
        };
      }

      case "temperature":
      case "temp": {
        if (!ctx.policy.allowVoiceSettings) {
          return { handled: true };
        }
        const value = parseNumberValue(ctx.value);
        if (value == null) {
          return { handled: true, warnings: ["invalid temperature value"] };
        }
        requireInRange(value, 0, 1, "temperature");
        return {
          handled: true,
          overrides: { ...(ctx.currentOverrides ?? {}), temperature: value },
        };
      }

      case "top_p":
      case "topp": {
        if (!ctx.policy.allowVoiceSettings) {
          return { handled: true };
        }
        const value = parseNumberValue(ctx.value);
        if (value == null) {
          return { handled: true, warnings: ["invalid top_p value"] };
        }
        requireInRange(value, 0, 1, "top_p");
        return {
          handled: true,
          overrides: { ...(ctx.currentOverrides ?? {}), topP: value },
        };
      }

      default:
        return { handled: false };
    }
  } catch (error) {
    // Report parsing failures as warnings so a bad directive never aborts the message.
    return {
      handled: true,
      warnings: [error instanceof Error ? error.message : String(error)],
    };
  }
}
|
||||||
|
|
||||||
|
// ── Provider ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Builds the Fish Audio `SpeechProviderPlugin`.
 *
 * The returned provider:
 * - resolves its config from the raw TTS configuration,
 * - parses inline directives via `parseDirectiveToken`,
 * - lists voices through `listFishAudioVoices`,
 * - synthesizes speech through `fishAudioTTS`, producing Opus when the request
 *   target is `"voice-note"` and MP3 otherwise.
 *
 * The API key comes from the request/config and falls back to the
 * `FISH_AUDIO_API_KEY` environment variable.
 *
 * @returns The provider object to pass to `registerSpeechProvider`.
 */
export function buildFishAudioSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "fish-audio",
    label: "Fish Audio",
    autoSelectOrder: 30,
    models: FISH_AUDIO_MODELS,

    resolveConfig: ({ rawConfig }) =>
      normalizeFishAudioProviderConfig(rawConfig),

    parseDirectiveToken,

    // Talk Mode — v2, stub for now
    resolveTalkConfig: ({ baseTtsConfig }) =>
      normalizeFishAudioProviderConfig(baseTtsConfig),

    resolveTalkOverrides: ({ params }) => {
      const voiceId = trimToUndefined(params.voiceId);
      const model = trimToUndefined(params.model);
      const speed = asNumber(params.speed);
      // Only keys with a usable value are included in the result.
      return {
        ...(voiceId == null ? {} : { voiceId }),
        ...(model == null ? {} : { model }),
        ...(speed == null ? {} : { speed }),
      };
    },

    listVoices: async (req) => {
      const config = req.providerConfig
        ? readFishAudioProviderConfig(req.providerConfig)
        : undefined;
      const apiKey =
        req.apiKey || config?.apiKey || process.env.FISH_AUDIO_API_KEY;
      if (!apiKey) {
        throw new Error("Fish Audio API key missing");
      }
      const raw = await listFishAudioVoices({
        apiKey,
        baseUrl: req.baseUrl ?? config?.baseUrl,
      });
      return raw as SpeechVoiceOption[];
    },

    isConfigured: ({ providerConfig }) =>
      Boolean(
        readFishAudioProviderConfig(providerConfig).apiKey ||
          process.env.FISH_AUDIO_API_KEY,
      ),

    synthesize: async (req) => {
      const config = readFishAudioProviderConfig(req.providerConfig);
      const overrides = req.providerOverrides ?? {};
      const apiKey = config.apiKey || process.env.FISH_AUDIO_API_KEY;
      if (!apiKey) {
        throw new Error("Fish Audio API key missing");
      }

      // Pick format based on target channel
      const useOpus = req.target === "voice-note";
      const format = useOpus ? "opus" : "mp3";

      const speed = asNumber(overrides.speed) ?? config.speed;
      if (speed != null) {
        requireInRange(speed, 0.5, 2.0, "speed");
      }

      // Honour any valid latency override, including an explicit "normal".
      // Comparing the normalized override against the default would make
      // "normal" indistinguishable from "no override", so a directive could
      // never switch a configured low/balanced latency back to normal.
      const requestedLatency =
        typeof overrides.latency === "string"
          ? overrides.latency.trim().toLowerCase()
          : undefined;
      const latency =
        requestedLatency === "normal" ||
        requestedLatency === "balanced" ||
        requestedLatency === "low"
          ? requestedLatency
          : config.latency;

      const audioBuffer = await fishAudioTTS({
        text: req.text,
        apiKey,
        baseUrl: config.baseUrl,
        referenceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
        model: trimToUndefined(overrides.model) ?? config.model,
        format,
        latency,
        speed,
        temperature: asNumber(overrides.temperature) ?? config.temperature,
        topP: asNumber(overrides.topP) ?? config.topP,
        timeoutMs: req.timeoutMs,
      });

      return {
        audioBuffer,
        outputFormat: format,
        fileExtension: useOpus ? ".opus" : ".mp3",
        voiceCompatible: true, // Fish Audio output works as voice note in both formats
      };
    },
  };
}
|
||||||
175
tts.ts
Normal file
175
tts.ts
Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
/** Base URL used when the caller does not supply a usable one. */
const DEFAULT_FISH_AUDIO_BASE_URL = "https://api.fish.audio";

/**
 * Normalizes an API base URL: trims whitespace and removes trailing slashes.
 *
 * Falls back to `DEFAULT_FISH_AUDIO_BASE_URL` when the input is missing,
 * blank, or consists only of slashes. Without the last case, `"/"` would
 * normalize to an empty string and requests would be built against a relative
 * path such as `/v1/tts`.
 *
 * @param baseUrl Optional base URL from configuration.
 * @returns A base URL with no trailing slash.
 */
function normalizeFishAudioBaseUrl(baseUrl?: string): string {
  const stripped = (baseUrl ?? "").trim().replace(/\/+$/, "");
  return stripped === "" ? DEFAULT_FISH_AUDIO_BASE_URL : stripped;
}
|
||||||
|
|
||||||
|
/**
 * Reads the body of a failed response for inclusion in an error message.
 * Returns `": <body>"` when a non-empty body could be read, otherwise `""`.
 */
async function readFishAudioErrorDetail(response: Response): Promise<string> {
  try {
    const bodyText = await response.text();
    return bodyText ? `: ${bodyText}` : "";
  } catch {
    // Ignore error body read failure
    return "";
  }
}

/**
 * Synthesizes speech with a single POST to `<baseUrl>/v1/tts` and returns the
 * complete audio as a `Buffer`.
 *
 * The model is sent in the `model` request header. `latency` is only included
 * in the request body when it is set and differs from `"normal"`; `speed` is
 * sent as `prosody.speed`; `temperature` and `topP` (as `top_p`) are included
 * only when provided. The request is aborted after `timeoutMs` milliseconds.
 *
 * @param params Text, credentials, voice/model selection and tuning options.
 * @returns The audio bytes in the requested `format`.
 * @throws Error when `text` or `referenceId` is blank, when the API responds
 *   with a non-2xx status (status and response body are included in the
 *   message), or when the response body is empty. Network and abort failures
 *   from `fetch` propagate unchanged.
 */
export async function fishAudioTTS(params: {
  text: string;
  apiKey: string;
  baseUrl?: string;
  referenceId: string;
  model: string;
  format: "mp3" | "opus" | "wav" | "pcm";
  latency?: "normal" | "balanced" | "low";
  speed?: number;
  temperature?: number;
  topP?: number;
  timeoutMs: number;
}): Promise<Buffer> {
  if (params.text.trim() === "") {
    throw new Error("Fish Audio TTS: empty text");
  }
  if (params.referenceId.trim() === "") {
    throw new Error("Fish Audio TTS: missing reference_id (voice)");
  }

  const abortController = new AbortController();
  const timer = setTimeout(() => abortController.abort(), params.timeoutMs);

  try {
    const endpoint = `${normalizeFishAudioBaseUrl(params.baseUrl)}/v1/tts`;

    // Optional fields are only added when set, in a fixed order.
    const payload: Record<string, unknown> = {
      text: params.text,
      reference_id: params.referenceId,
      format: params.format,
      ...(params.latency && params.latency !== "normal"
        ? { latency: params.latency }
        : {}),
      // Prosody settings
      ...(params.speed != null ? { prosody: { speed: params.speed } } : {}),
      ...(params.temperature != null
        ? { temperature: params.temperature }
        : {}),
      ...(params.topP != null ? { top_p: params.topP } : {}),
    };

    const response = await fetch(endpoint, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${params.apiKey}`,
        "Content-Type": "application/json",
        model: params.model,
      },
      body: JSON.stringify(payload),
      signal: abortController.signal,
    });

    if (!response.ok) {
      const errorDetail = await readFishAudioErrorDetail(response);
      throw new Error(`Fish Audio API error (${response.status})${errorDetail}`);
    }

    const audio = Buffer.from(await response.arrayBuffer());
    if (audio.length === 0) {
      throw new Error("Fish Audio TTS produced empty audio");
    }
    return audio;
  } finally {
    // Always cancel the pending abort timer, on success and on failure.
    clearTimeout(timer);
  }
}
|
||||||
|
|
||||||
|
/** Author ID used to select the official voice catalogue. */
const FISH_AUDIO_OFFICIAL_AUTHOR_ID = "d8b0991f96b44e489422ca2ddf0bd31d";

/** Subset of the `/model` listing response that is read here. */
type FishAudioModelListResponse = {
  items?: Array<{ _id?: unknown; title?: unknown }>;
};

/**
 * Merges the voices from one settled listing request into `voices`.
 *
 * Any failure (rejected fetch, non-2xx status, unparseable body) is appended
 * to `errors` instead of being thrown, so one failing listing cannot discard
 * the results of the other.
 */
async function collectFishAudioVoices(
  settled: PromiseSettledResult<Response>,
  label: string,
  voices: Map<string, string>,
  errors: string[],
  formatName: (title: string) => string,
): Promise<void> {
  if (settled.status === "rejected") {
    errors.push(`${label}: ${settled.reason}`);
    return;
  }
  const response = settled.value;
  if (!response.ok) {
    errors.push(`${label}: HTTP ${response.status}`);
    return;
  }

  let payload: FishAudioModelListResponse | null;
  try {
    payload = (await response.json()) as FishAudioModelListResponse | null;
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    errors.push(`${label}: invalid JSON response (${reason})`);
    return;
  }

  const items = payload?.items;
  if (!Array.isArray(items)) {
    return;
  }
  for (const item of items) {
    // Guard the field types: the payload is external data.
    const id = typeof item?._id === "string" ? item._id.trim() : "";
    if (!id) {
      continue;
    }
    const title = typeof item?.title === "string" ? item.title.trim() : "";
    voices.set(id, title ? formatName(title) : id);
  }
}

/**
 * Lists the voices available to the given API key.
 *
 * Two listings are requested in parallel from `<baseUrl>/model`: the official
 * catalogue (selected by author ID) and the caller's own voices
 * (`self=true`). Results are merged by voice ID; the caller's own voices are
 * applied last, so they win on conflict, and their names are suffixed with
 * `" (mine)"`. A voice without a title is named after its ID.
 *
 * Partial failure is tolerated: if one listing fails, the voices from the
 * other are still returned.
 *
 * @param params API key and optional base URL.
 * @returns `{ id, name }` entries, official voices first.
 * @throws Error when no voice could be collected and at least one listing
 *   failed; the message names each failure.
 */
export async function listFishAudioVoices(params: {
  apiKey: string;
  baseUrl?: string;
}): Promise<Array<{ id: string; name: string }>> {
  const base = normalizeFishAudioBaseUrl(params.baseUrl);
  const headers = { Authorization: `Bearer ${params.apiKey}` };

  // Two parallel calls: official voices + user's own voices
  const [officialRes, selfRes] = await Promise.allSettled([
    fetch(
      `${base}/model?type=tts&author_id=${FISH_AUDIO_OFFICIAL_AUTHOR_ID}&page_size=100`,
      { headers },
    ),
    fetch(`${base}/model?type=tts&self=true&page_size=100`, { headers }),
  ]);

  const voices = new Map<string, string>();
  const errors: string[] = [];

  // Official voices first, so the user's own voices take precedence on conflict.
  await collectFishAudioVoices(
    officialRes,
    "official",
    voices,
    errors,
    (title) => title,
  );
  await collectFishAudioVoices(
    selfRes,
    "self",
    voices,
    errors,
    (title) => `${title} (mine)`,
  );

  // Only fail when nothing was collected and something actually went wrong;
  // two successful but empty listings yield an empty array.
  if (voices.size === 0 && errors.length > 0) {
    throw new Error(`Fish Audio voices API error: ${errors.join("; ")}`);
  }

  return Array.from(voices.entries()).map(([id, name]) => ({ id, name }));
}
|
||||||
Reference in New Issue
Block a user