Switch ASR from Fish Audio to Deepgram Nova-3

2026-03-24 03:20:41 +00:00
parent ecf37b8798
commit eff844cde5
1 changed file with 57 additions and 42 deletions
--- a/ingest.py
+++ b/ingest.py
@@ -44,7 +44,7 @@ DEMO_CHAPTER_KEYWORDS = [
 # Target chunk size in seconds — split audio into groups of chapters
 # totalling no more than this before sending to ASR.
-# 10 min = 600s keeps each chunk well under Fish Audio's size limit.
+# 10 min = 600s keeps each chunk at a manageable size for the Deepgram API.
 CHUNK_MAX_SECONDS = 600
@@ -103,51 +103,66 @@ def split_audio_chunk(audio_path, start_sec, end_sec, out_path):
    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-def transcribe_chunk_fish(chunk_path, api_key, start_offset_sec):
+def transcribe_chunk_deepgram(chunk_path, api_key, start_offset_sec):
    """
-    Send a chunk to Fish Audio ASR via the official SDK.
+    Send a chunk to the Deepgram ASR API, which reports timestamps in seconds.
    Returns a list of segments with timestamps adjusted by start_offset_sec.
-    Note: Fish Audio returns timestamps in milliseconds.
+    Uses the Deepgram Nova-3 model and requests utterance-level segments.
    """
-    import httpx
+    import http.client
    import ssl
-    print(f"  → Fish Audio ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)
+    print(f"  → Deepgram ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)
    with open(chunk_path, "rb") as f:
        audio_bytes = f.read()
-    print(f"    Sending {len(audio_bytes)/1024/1024:.1f}MB to Fish Audio...", flush=True)
+    print(f"    Sending {len(audio_bytes)/1024/1024:.1f}MB...", flush=True)
-    with httpx.Client(timeout=300) as client:
+    params = "?model=nova-3&language=en&punctuate=true&utterances=true&smart_format=true"
-        resp = client.post(
+    ctx = ssl.create_default_context()
-            "https://api.fish.audio/v1/asr",
+    conn = http.client.HTTPSConnection("api.deepgram.com", timeout=120, context=ctx)
-            headers={"Authorization": f"Bearer {api_key}"},
+    conn.request(
-            files={"audio": (chunk_path.name, audio_bytes, "audio/mpeg")},
+        "POST",
-            data={"language": "en", "ignore_timestamps": "false"},
+        f"/v1/listen{params}",
-        )
+        body=audio_bytes,
+        headers={
+            "Authorization": f"Token {api_key}",
+            "Content-Type": "audio/mp3",
+        },
+    )
+    resp = conn.getresponse()
+    resp_body = resp.read().decode()
+    conn.close()
-    print(f"    HTTP {resp.status_code}", flush=True)
+    print(f"    HTTP {resp.status}", flush=True)
-    if resp.status_code != 200:
+    if resp.status != 200:
-        raise RuntimeError(f"Fish Audio ASR error {resp.status_code}: {resp.text!r}")
+        raise RuntimeError(f"Deepgram ASR error {resp.status}: {resp_body!r}")
-    data = resp.json()
+    data = json.loads(resp_body)
-    # Fish Audio returns timestamps in milliseconds — convert to seconds
+    # Deepgram returns timestamps in seconds natively
-    # and adjust by the chunk's start offset in the full audio
+    # Use utterances for clean sentence-level segments
+    utterances = data.get("results", {}).get("utterances", [])
    segments = []
-    for seg in data.get("segments", []):
+    for u in utterances:
        segments.append({
-            "start": seg["start"] / 1000.0 + start_offset_sec,
+            "start": u["start"] + start_offset_sec,
-            "end": seg["end"] / 1000.0 + start_offset_sec,
+            "end": u["end"] + start_offset_sec,
-            "text": seg["text"],
+            "text": u["transcript"],
        })
-    # Fallback: if no segments, use full text as one block
+    # Fallback: if no utterances, use the full transcript as one 60 s block
-    if not segments and data.get("text"):
+    if not segments:
-        segments.append({
+        try:
-            "start": start_offset_sec,
+            transcript = data["results"]["channels"][0]["alternatives"][0]["transcript"]
-            "end": start_offset_sec + 60,
+            if transcript:
-            "text": data["text"],
+                segments.append({
-        })
+                    "start": start_offset_sec,
+                    "end": start_offset_sec + 60,
+                    "text": transcript,
+                })
+        except (KeyError, IndexError):
+            pass
    return segments
@@ -188,9 +203,9 @@ def group_chapters_into_chunks(chapters, total_duration_sec):
    return groups
-def transcribe_with_fish(audio_path, out_dir, chapters, api_key):
+def transcribe_with_deepgram(audio_path, out_dir, chapters, api_key):
    """
-    Split audio by chapter groups, transcribe each with Fish Audio ASR,
+    Split audio by chapter groups, transcribe each with Deepgram ASR,
    reassemble into a unified segment list. Caches per-chunk results.
    """
    transcript_path = out_dir / "transcript.json"
@@ -226,8 +241,8 @@ def transcribe_with_fish(audio_path, out_dir, chapters, api_key):
            if not chunk_path.exists():
                split_audio_chunk(audio_path, g_start, g_end, chunk_path)
-            # Send to Fish Audio
+            # Send to Deepgram
-            segments = transcribe_chunk_fish(chunk_path, api_key, g_start)
+            segments = transcribe_chunk_deepgram(chunk_path, api_key, g_start)
            # Cache the result
            with open(cache_path, "w") as f:
@@ -282,14 +297,14 @@ def main():
    parser.add_argument("--chapters", required=True, help="Path to chapters JSON file")
    parser.add_argument("--out", default="out", help="Output directory")
    parser.add_argument("--frames", action="store_true", help="Extract frames for demo chapters")
-    parser.add_argument("--fish-api-key", default=os.environ.get("FISH_API_KEY", ""),
+    parser.add_argument("--deepgram-api-key", default=os.environ.get("DEEPGRAM_API_KEY", ""),
-                        help="Fish Audio API key (or set FISH_API_KEY env var)")
+                        help="Deepgram API key (or set DEEPGRAM_API_KEY env var)")
    parser.add_argument("--video-id", default=None,
                        help="Override video ID (extracted from URL if omitted)")
    args = parser.parse_args()
-    if not args.fish_api_key:
+    if not args.deepgram_api_key:
-        print("ERROR: Fish Audio API key required. Pass --fish-api-key or set FISH_API_KEY env var.")
+        print("ERROR: Deepgram API key required. Pass --deepgram-api-key or set DEEPGRAM_API_KEY env var.")
        sys.exit(1)
    video_id = args.video_id or args.url.rstrip("/").split("/")[-1].split("?")[0]
@@ -339,9 +354,9 @@ def main():
                video_path = matches[0]
        print()
-    # Step 2: Transcribe via Fish Audio
+    # Step 2: Transcribe via Deepgram
-    print("=== Step 2: Transcribe via Fish Audio ASR ===")
+    print("=== Step 2: Transcribe via Deepgram ASR ===")
-    transcript = transcribe_with_fish(audio_path, out_dir, chapters, args.fish_api_key)
+    transcript = transcribe_with_deepgram(audio_path, out_dir, chapters, args.deepgram_api_key)
    segments = transcript.get("segments", [])
    print(f"  Got {len(segments)} transcript segments\n")