def transcribe_chunk_deepgram(chunk_path, api_key, start_offset_sec):
    """
    Transcribe one audio chunk with Deepgram ASR (Nova-3, utterance-level).

    Parameters:
        chunk_path: Path to the mp3 chunk to upload.
        api_key: Deepgram API key.
        start_offset_sec: Offset of this chunk within the full audio, in
            seconds; added to every returned timestamp so segments line up
            with the original file.

    Returns a list of {"start", "end", "text"} dicts, timestamps in seconds
    (Deepgram reports seconds natively — no ms conversion needed).
    Raises RuntimeError on a non-200 response.
    """
    import http.client
    import ssl

    print(f"  → Deepgram ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)

    with open(chunk_path, "rb") as f:
        audio_bytes = f.read()

    print(f"    Sending {len(audio_bytes)/1024/1024:.1f}MB...", flush=True)

    params = "?model=nova-3&language=en&punctuate=true&utterances=true&smart_format=true"
    ctx = ssl.create_default_context()
    # Chunks are up to CHUNK_MAX_SECONDS (10 min) of audio, i.e. multi-MB
    # uploads — keep the original 300s timeout so slow links aren't cut off.
    conn = http.client.HTTPSConnection("api.deepgram.com", timeout=300, context=ctx)
    try:
        conn.request(
            "POST",
            f"/v1/listen{params}",
            body=audio_bytes,
            headers={
                "Authorization": f"Token {api_key}",
                # audio/mpeg is the registered MIME type for mp3 content
                # (and what the pre-switch code sent).
                "Content-Type": "audio/mpeg",
            },
        )
        resp = conn.getresponse()
        status = resp.status
        resp_body = resp.read().decode()
    finally:
        # Always release the socket, even if the request or read raises.
        conn.close()

    print(f"    HTTP {status}", flush=True)
    if status != 200:
        raise RuntimeError(f"Deepgram ASR error {status}: {resp_body!r}")

    return parse_deepgram_segments(json.loads(resp_body), start_offset_sec)


def parse_deepgram_segments(data, start_offset_sec):
    """
    Convert a parsed Deepgram response dict into segment dicts, shifting
    every timestamp by start_offset_sec.

    Prefers utterance-level segments; when the response carries no
    utterances, falls back to the first channel's first alternative
    transcript as a single coarse 60-second block. Returns [] when the
    response contains no text at all.
    """
    segments = [
        {
            "start": u["start"] + start_offset_sec,
            "end": u["end"] + start_offset_sec,
            "text": u["transcript"],
        }
        for u in data.get("results", {}).get("utterances", [])
    ]

    # Fallback: no utterances in the response — use the word-grouped
    # channel transcript if present.
    if not segments:
        try:
            transcript = data["results"]["channels"][0]["alternatives"][0]["transcript"]
        except (KeyError, IndexError):
            transcript = ""
        if transcript:
            segments.append({
                "start": start_offset_sec,
                "end": start_offset_sec + 60,
                "text": transcript,
            })

    return segments
""" transcript_path = out_dir / "transcript.json" @@ -226,8 +241,8 @@ def transcribe_with_fish(audio_path, out_dir, chapters, api_key): if not chunk_path.exists(): split_audio_chunk(audio_path, g_start, g_end, chunk_path) - # Send to Fish Audio - segments = transcribe_chunk_fish(chunk_path, api_key, g_start) + # Send to Deepgram + segments = transcribe_chunk_deepgram(chunk_path, api_key, g_start) # Cache the result with open(cache_path, "w") as f: @@ -282,14 +297,14 @@ def main(): parser.add_argument("--chapters", required=True, help="Path to chapters JSON file") parser.add_argument("--out", default="out", help="Output directory") parser.add_argument("--frames", action="store_true", help="Extract frames for demo chapters") - parser.add_argument("--fish-api-key", default=os.environ.get("FISH_API_KEY", ""), - help="Fish Audio API key (or set FISH_API_KEY env var)") + parser.add_argument("--deepgram-api-key", default=os.environ.get("DEEPGRAM_API_KEY", ""), + help="Deepgram API key (or set DEEPGRAM_API_KEY env var)") parser.add_argument("--video-id", default=None, help="Override video ID (extracted from URL if omitted)") args = parser.parse_args() - if not args.fish_api_key: - print("ERROR: Fish Audio API key required. Pass --fish-api-key or set FISH_API_KEY env var.") + if not args.deepgram_api_key: + print("ERROR: Deepgram API key required. 
Pass --deepgram-api-key or set DEEPGRAM_API_KEY env var.") sys.exit(1) video_id = args.video_id or args.url.rstrip("/").split("/")[-1].split("?")[0] @@ -339,9 +354,9 @@ def main(): video_path = matches[0] print() - # Step 2: Transcribe via Fish Audio - print("=== Step 2: Transcribe via Fish Audio ASR ===") - transcript = transcribe_with_fish(audio_path, out_dir, chapters, args.fish_api_key) + # Step 2: Transcribe via Deepgram + print("=== Step 2: Transcribe via Deepgram ASR ===") + transcript = transcribe_with_deepgram(audio_path, out_dir, chapters, args.deepgram_api_key) segments = transcript.get("segments", []) print(f" Got {len(segments)} transcript segments\n")