Use official fishaudio SDK for ASR

2026-03-24 01:23:53 +00:00
parent 1bdf28e1d4
commit f18791e807
1 changed file with 13 additions and 48 deletions
--- a/ingest.py
+++ b/ingest.py
@@ -20,20 +20,15 @@ Dependencies (must be on PATH):
 """

 import argparse
-import http.client
 import json
 import math
 import os
-import ssl
 import subprocess
 import sys
 import time
-import uuid
 from pathlib import Path


-FISH_AUDIO_ASR_URL = "https://api.fish.audio/v1/asr"
-
 # Chapters where frame extraction adds real value (demo-heavy sections).
 DEMO_CHAPTER_KEYWORDS = [
    "transcoding",
@@ -110,64 +105,34 @@ def split_audio_chunk(audio_path, start_sec, end_sec, out_path):

 def transcribe_chunk_fish(chunk_path, api_key, start_offset_sec):
    """
-    Send a chunk to Fish Audio ASR. Returns a list of segments with
-    timestamps adjusted by start_offset_sec.
+    Send a chunk to Fish Audio ASR via the official SDK.
+    Returns a list of segments with timestamps adjusted by start_offset_sec.
    Note: Fish Audio returns timestamps in milliseconds.
    """
+    from fishaudio import FishAudio
+
    print(f"  → Fish Audio ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)

-    boundary = uuid.uuid4().hex
+    client = FishAudio(api_key=api_key)
    with open(chunk_path, "rb") as f:
-        audio_data = f.read()
-
-    body = (
-        f"--{boundary}\r\n"
-        f'Content-Disposition: form-data; name="audio"; filename="{chunk_path.name}"\r\n'
-        f"Content-Type: audio/mpeg\r\n\r\n"
-    ).encode() + audio_data + (
-        f"\r\n--{boundary}\r\n"
-        f'Content-Disposition: form-data; name="language"\r\n\r\n'
-        f"en"
-        f"\r\n--{boundary}--\r\n"
-    ).encode()
-
-    ctx = ssl.create_default_context()
-    conn = http.client.HTTPSConnection("api.fish.audio", timeout=300, context=ctx)
-    conn.request(
-        "POST",
-        "/v1/asr",
-        body=body,
-        headers={
-            "Authorization": f"Bearer {api_key}",
-            "Content-Type": f"multipart/form-data; boundary={boundary}",
-            "Content-Length": str(len(body)),
-        },
-    )
-    resp = conn.getresponse()
-    resp_body = resp.read().decode()
-    conn.close()
-
-    if resp.status != 200:
-        raise RuntimeError(f"Fish Audio ASR error {resp.status}: {resp_body}")
-
-    data = json.loads(resp_body)
+        result = client.asr.transcribe(audio=f.read(), language="en")

    # Fish Audio returns timestamps in milliseconds — convert to seconds
    # and adjust by the chunk's start offset in the full audio
    segments = []
-    for seg in data.get("segments", []):
+    for seg in result.segments:
        segments.append({
-            "start": seg["start"] / 1000.0 + start_offset_sec,
-            "end": seg["end"] / 1000.0 + start_offset_sec,
-            "text": seg["text"],
+            "start": seg.start / 1000.0 + start_offset_sec,
+            "end": seg.end / 1000.0 + start_offset_sec,
+            "text": seg.text,
        })

-    # If Fish returns flat text with no segments, synthesise one segment
-    if not segments and data.get("text"):
+    # Fallback: if no segments, use full text as one block
+    if not segments and result.text:
        segments.append({
            "start": start_offset_sec,
            "end": start_offset_sec + 60,
-            "text": data["text"],
+            "text": result.text,
        })

    return segments