Use official fishaudio SDK for ASR

This commit is contained in:
2026-03-24 01:23:53 +00:00
parent 1bdf28e1d4
commit f18791e807

View File

@@ -20,20 +20,15 @@ Dependencies (must be on PATH):
""" """
import argparse import argparse
import http.client
import json import json
import math import math
import os import os
import ssl
import subprocess import subprocess
import sys import sys
import time import time
import uuid
from pathlib import Path from pathlib import Path
FISH_AUDIO_ASR_URL = "https://api.fish.audio/v1/asr"
# Chapters where frame extraction adds real value (demo-heavy sections). # Chapters where frame extraction adds real value (demo-heavy sections).
DEMO_CHAPTER_KEYWORDS = [ DEMO_CHAPTER_KEYWORDS = [
"transcoding", "transcoding",
@@ -110,64 +105,34 @@ def split_audio_chunk(audio_path, start_sec, end_sec, out_path):
def transcribe_chunk_fish(chunk_path, api_key, start_offset_sec):
    """
    Send a chunk to Fish Audio ASR via the official SDK.

    Args:
        chunk_path: Path object of the audio chunk to upload; its ``.name``
            is logged and the file is read in binary mode.
        api_key: Fish Audio API key handed to the SDK client.
        start_offset_sec: Position of this chunk within the full audio, in
            seconds. It is added to every returned timestamp.

    Returns:
        A list of ``{"start", "end", "text"}`` dicts whose times are in
        seconds relative to the full audio. If the response carries text
        but no segments, a single segment covering a nominal 60 seconds
        from ``start_offset_sec`` is returned instead.

    Note: segment timestamps are treated as milliseconds.
    """
    # Imported inside the function so the SDK is only needed when ASR runs.
    from fishaudio import FishAudio

    print(f" → Fish Audio ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)

    client = FishAudio(api_key=api_key)

    # Read the bytes first so the file handle is closed before the
    # (potentially slow) network request is made.
    with open(chunk_path, "rb") as f:
        audio_bytes = f.read()
    result = client.asr.transcribe(audio=audio_bytes, language="en")

    # Segment timestamps are treated as milliseconds: convert to seconds and
    # shift by the chunk's start offset in the full audio.
    # NOTE(review): confirm against the SDK's ASRSegment documentation that
    # start/end really are milliseconds and not seconds.
    segments = [
        {
            "start": seg.start / 1000.0 + start_offset_sec,
            "end": seg.end / 1000.0 + start_offset_sec,
            "text": seg.text,
        }
        # `or []` tolerates a response whose segments field is None.
        for seg in (result.segments or [])
    ]

    # Fallback: if no segments came back, use the full text as one block.
    # The 60-second length is a placeholder, not a measured duration.
    if not segments and result.text:
        segments.append({
            "start": start_offset_sec,
            "end": start_offset_sec + 60,
            "text": result.text,
        })

    return segments