Use official fishaudio SDK for ASR

This commit is contained in:
2026-03-24 01:23:53 +00:00
parent 1bdf28e1d4
commit f18791e807

View File

@@ -20,20 +20,15 @@ Dependencies (must be on PATH):
"""
import argparse
import http.client
import json
import math
import os
import ssl
import subprocess
import sys
import time
import uuid
from pathlib import Path
FISH_AUDIO_ASR_URL = "https://api.fish.audio/v1/asr"
# Chapters where frame extraction adds real value (demo-heavy sections).
DEMO_CHAPTER_KEYWORDS = [
"transcoding",
@@ -110,64 +105,34 @@ def split_audio_chunk(audio_path, start_sec, end_sec, out_path):
def transcribe_chunk_fish(chunk_path, api_key, start_offset_sec, language="en"):
    """
    Send a chunk to Fish Audio ASR via the official SDK.

    Args:
        chunk_path: Path object of the audio chunk to transcribe (its
            ``.name`` is used for the progress message).
        api_key: Fish Audio API key handed to the SDK client.
        start_offset_sec: Position of this chunk inside the full audio, in
            seconds. It is added to every returned timestamp.
        language: Language code passed to the ASR call. Defaults to "en".

    Returns:
        A list of dicts with keys "start", "end" (seconds, relative to the
        full audio) and "text". If the response has text but no segments,
        a single segment covering a nominal 60 seconds is returned. The
        list is empty when the response has neither.
    """
    # Function-scope import: the SDK is only needed when ASR is actually run.
    from fishaudio import FishAudio

    print(f"  → Fish Audio ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)

    # Read the audio exactly once and close the file before the network call.
    with open(chunk_path, "rb") as f:
        audio_bytes = f.read()

    client = FishAudio(api_key=api_key)
    result = client.asr.transcribe(audio=audio_bytes, language=language)

    # Segment times are treated as milliseconds: convert to seconds and shift
    # by the chunk's start offset in the full audio.
    # NOTE(review): confirm the unit of segment.start/end against the SDK docs.
    segments = [
        {
            "start": seg.start / 1000.0 + start_offset_sec,
            "end": seg.end / 1000.0 + start_offset_sec,
            "text": seg.text,
        }
        for seg in (result.segments or ())
    ]

    # Fallback: if no segments came back, use the full text as one block
    # spanning a hard-coded 60 seconds from the chunk start.
    if not segments and result.text:
        segments.append({
            "start": start_offset_sec,
            "end": start_offset_sec + 60,
            "text": result.text,
        })

    return segments