Use official fishaudio SDK for ASR

2026-03-24 01:23:53 +00:00
parent 1bdf28e1d4
commit f18791e807
1 changed file with 13 additions and 48 deletions
--- a/ingest.py
+++ b/ingest.py
@@ -20,20 +20,15 @@ Dependencies (must be on PATH):
 """
 import argparse
 import http.client
 import json
 import math
 import os
 import ssl
 import subprocess
 import sys
 import time
 import uuid
 from pathlib import Path
 FISH_AUDIO_ASR_URL = "https://api.fish.audio/v1/asr"
 # Chapters where frame extraction adds real value (demo-heavy sections).
 DEMO_CHAPTER_KEYWORDS = [
    "transcoding",
@@ -110,64 +105,34 @@ def split_audio_chunk(audio_path, start_sec, end_sec, out_path):
 def transcribe_chunk_fish(chunk_path, api_key, start_offset_sec):
    """
-    Send a chunk to Fish Audio ASR. Returns a list of segments with
+    Send a chunk to Fish Audio ASR via the official SDK.
-    timestamps adjusted by start_offset_sec.
+    Returns a list of segments with timestamps adjusted by start_offset_sec.
    Note: Fish Audio returns timestamps in milliseconds.
    """
    from fishaudio import FishAudio
    print(f"  → Fish Audio ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)
-    boundary = uuid.uuid4().hex
+    client = FishAudio(api_key=api_key)
    with open(chunk_path, "rb") as f:
-        audio_data = f.read()
+        result = client.asr.transcribe(audio=f.read(), language="en")
    body = (
        f"--{boundary}\r\n"
        f'Content-Disposition: form-data; name="audio"; filename="{chunk_path.name}"\r\n'
        f"Content-Type: audio/mpeg\r\n\r\n"
    ).encode() + audio_data + (
        f"\r\n--{boundary}\r\n"
        f'Content-Disposition: form-data; name="language"\r\n\r\n'
        f"en"
        f"\r\n--{boundary}--\r\n"
    ).encode()
    ctx = ssl.create_default_context()
    conn = http.client.HTTPSConnection("api.fish.audio", timeout=300, context=ctx)
    conn.request(
        "POST",
        "/v1/asr",
        body=body,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": f"multipart/form-data; boundary={boundary}",
            "Content-Length": str(len(body)),
        },
    )
    resp = conn.getresponse()
    resp_body = resp.read().decode()
    conn.close()
    if resp.status != 200:
        raise RuntimeError(f"Fish Audio ASR error {resp.status}: {resp_body}")
    data = json.loads(resp_body)
    # Fish Audio returns timestamps in milliseconds — convert to seconds
    # and adjust by the chunk's start offset in the full audio
    segments = []
-    for seg in data.get("segments", []):
+    for seg in result.segments:
        segments.append({
-            "start": seg["start"] / 1000.0 + start_offset_sec,
+            "start": seg.start / 1000.0 + start_offset_sec,
-            "end": seg["end"] / 1000.0 + start_offset_sec,
+            "end": seg.end / 1000.0 + start_offset_sec,
-            "text": seg["text"],
+            "text": seg.text,
        })
-    # If Fish returns flat text with no segments, synthesise one segment
+    # Fallback: if no segments, use full text as one block
-    if not segments and data.get("text"):
+    if not segments and result.text:
        segments.append({
            "start": start_offset_sec,
            "end": start_offset_sec + 60,
-            "text": data["text"],
+            "text": result.text,
        })
    return segments