def transcribe_chunk_deepgram(chunk_path, api_key, start_offset_sec):
    """
    Transcribe one audio chunk with Deepgram ASR (Nova-3, utterance-level).

    Parameters:
        chunk_path: Path to the mp3 chunk to upload.
        api_key: Deepgram API key.
        start_offset_sec: Offset of this chunk within the full audio, in
            seconds; added to every returned timestamp so segments line up
            with the original file.

    Returns a list of {"start", "end", "text"} dicts, timestamps in seconds
    (Deepgram reports seconds natively — no ms conversion needed).
    Raises RuntimeError on a non-200 response.
    """
    import http.client
    import ssl

    print(f"  → Deepgram ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)

    with open(chunk_path, "rb") as f:
        audio_bytes = f.read()

    print(f"    Sending {len(audio_bytes)/1024/1024:.1f}MB...", flush=True)

    params = "?model=nova-3&language=en&punctuate=true&utterances=true&smart_format=true"
    ctx = ssl.create_default_context()
    # Chunks are up to CHUNK_MAX_SECONDS (10 min) of audio, i.e. multi-MB
    # uploads — keep the original 300s timeout so slow links aren't cut off.
    conn = http.client.HTTPSConnection("api.deepgram.com", timeout=300, context=ctx)
    try:
        conn.request(
            "POST",
            f"/v1/listen{params}",
            body=audio_bytes,
            headers={
                "Authorization": f"Token {api_key}",
                # audio/mpeg is the registered MIME type for mp3 content
                # (and what the pre-switch code sent).
                "Content-Type": "audio/mpeg",
            },
        )
        resp = conn.getresponse()
        status = resp.status
        resp_body = resp.read().decode()
    finally:
        # Always release the socket, even if the request or read raises.
        conn.close()

    print(f"    HTTP {status}", flush=True)
    if status != 200:
        raise RuntimeError(f"Deepgram ASR error {status}: {resp_body!r}")

    return parse_deepgram_segments(json.loads(resp_body), start_offset_sec)


def parse_deepgram_segments(data, start_offset_sec):
    """
    Convert a parsed Deepgram response dict into segment dicts, shifting
    every timestamp by start_offset_sec.

    Prefers utterance-level segments; when the response carries no
    utterances, falls back to the first channel's first alternative
    transcript as a single coarse 60-second block. Returns [] when the
    response contains no text at all.
    """
    segments = [
        {
            "start": u["start"] + start_offset_sec,
            "end": u["end"] + start_offset_sec,
            "text": u["transcript"],
        }
        for u in data.get("results", {}).get("utterances", [])
    ]

    # Fallback: no utterances in the response — use the word-grouped
    # channel transcript if present.
    if not segments:
        try:
            transcript = data["results"]["channels"][0]["alternatives"][0]["transcript"]
        except (KeyError, IndexError):
            transcript = ""
        if transcript:
            segments.append({
                "start": start_offset_sec,
                "end": start_offset_sec + 60,
                "text": transcript,
            })

    return segments
""" transcript_path = out_dir / "transcript.json" @@ -226,8 +241,8 @@ def transcribe_with_fish(audio_path, out_dir, chapters, api_key): if not chunk_path.exists(): split_audio_chunk(audio_path, g_start, g_end, chunk_path) - # Send to Fish Audio - segments = transcribe_chunk_fish(chunk_path, api_key, g_start) + # Send to Deepgram + segments = transcribe_chunk_deepgram(chunk_path, api_key, g_start) # Cache the result with open(cache_path, "w") as f: @@ -282,14 +297,14 @@ def main(): parser.add_argument("--chapters", required=True, help="Path to chapters JSON file") parser.add_argument("--out", default="out", help="Output directory") parser.add_argument("--frames", action="store_true", help="Extract frames for demo chapters") - parser.add_argument("--fish-api-key", default=os.environ.get("FISH_API_KEY", ""), - help="Fish Audio API key (or set FISH_API_KEY env var)") + parser.add_argument("--deepgram-api-key", default=os.environ.get("DEEPGRAM_API_KEY", ""), + help="Deepgram API key (or set DEEPGRAM_API_KEY env var)") parser.add_argument("--video-id", default=None, help="Override video ID (extracted from URL if omitted)") args = parser.parse_args() - if not args.fish_api_key: - print("ERROR: Fish Audio API key required. Pass --fish-api-key or set FISH_API_KEY env var.") + if not args.deepgram_api_key: + print("ERROR: Deepgram API key required. 
Pass --deepgram-api-key or set DEEPGRAM_API_KEY env var.") sys.exit(1) video_id = args.video_id or args.url.rstrip("/").split("/")[-1].split("?")[0] @@ -339,9 +354,9 @@ def main(): video_path = matches[0] print() - # Step 2: Transcribe via Fish Audio - print("=== Step 2: Transcribe via Fish Audio ASR ===") - transcript = transcribe_with_fish(audio_path, out_dir, chapters, args.fish_api_key) + # Step 2: Transcribe via Deepgram + print("=== Step 2: Transcribe via Deepgram ASR ===") + transcript = transcribe_with_deepgram(audio_path, out_dir, chapters, args.deepgram_api_key) segments = transcript.get("segments", []) print(f" Got {len(segments)} transcript segments\n")