Switch ASR from Fish Audio to Deepgram Nova-3

This commit is contained in:
2026-03-24 03:20:41 +00:00
parent ecf37b8798
commit eff844cde5

View File

@@ -44,7 +44,7 @@ DEMO_CHAPTER_KEYWORDS = [
# Target chunk size in seconds — split audio into groups of chapters # Target chunk size in seconds — split audio into groups of chapters
# totalling no more than this before sending to ASR. # totalling no more than this before sending to ASR.
# 10 min = 600s keeps each chunk well under Fish Audio's size limit. # 10 min = 600s keeps chunks to a manageable size for Deepgram API.
CHUNK_MAX_SECONDS = 600 CHUNK_MAX_SECONDS = 600
@@ -103,51 +103,66 @@ def split_audio_chunk(audio_path, start_sec, end_sec, out_path):
], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
def _segments_from_deepgram(data, start_offset_sec):
    """Turn a parsed Deepgram response into offset-adjusted segment dicts."""
    # `or {}` / `or []` also cover keys that are present but null.
    results = data.get("results") or {}

    # Deepgram returns timestamps in seconds natively, relative to the start
    # of the uploaded chunk; utterances give sentence-level segments.
    segments = [
        {
            "start": u["start"] + start_offset_sec,
            "end": u["end"] + start_offset_sec,
            "text": u["transcript"],
        }
        for u in results.get("utterances") or []
    ]
    if segments:
        return segments

    # Fallback: no utterances, so emit the whole channel transcript as a
    # single block spanning the chunk.
    try:
        transcript = results["channels"][0]["alternatives"][0]["transcript"]
    except (KeyError, IndexError, TypeError):
        return segments

    if transcript:
        # Prefer the duration Deepgram reports for the submitted audio; keep
        # the historical 60s guess only when it is missing or unusable.
        duration = (data.get("metadata") or {}).get("duration")
        if not isinstance(duration, (int, float)) or duration <= 0:
            duration = 60
        segments.append({
            "start": start_offset_sec,
            "end": start_offset_sec + duration,
            "text": transcript,
        })
    return segments


def transcribe_chunk_deepgram(chunk_path, api_key, start_offset_sec):
    """
    Send one audio chunk to Deepgram ASR (Nova-3 model, utterances enabled).

    The chunk is read fully into memory and POSTed as the raw request body to
    /v1/listen on api.deepgram.com. Timestamps in the response are shifted by
    start_offset_sec so they refer to positions in the full audio.

    Args:
        chunk_path: Path object of the audio chunk (its ``.name`` is logged
            and the file is opened for binary reading).
        api_key: Deepgram API key, sent as ``Authorization: Token <key>``.
        start_offset_sec: Start of this chunk within the full audio, in seconds.

    Returns:
        A list of ``{"start", "end", "text"}`` dicts, one per utterance. If
        the response has no utterances but has a non-empty transcript, a
        single segment covering the chunk is returned; otherwise an empty list.

    Raises:
        RuntimeError: if Deepgram responds with a status other than 200.
    """
    import http.client
    import ssl

    print(f"Deepgram ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)

    with open(chunk_path, "rb") as f:
        audio_bytes = f.read()

    print(f" Sending {len(audio_bytes)/1024/1024:.1f}MB...", flush=True)

    params = "?model=nova-3&language=en&punctuate=true&utterances=true&smart_format=true"
    ctx = ssl.create_default_context()
    conn = http.client.HTTPSConnection("api.deepgram.com", timeout=120, context=ctx)
    try:
        conn.request(
            "POST",
            f"/v1/listen{params}",
            body=audio_bytes,
            headers={
                "Authorization": f"Token {api_key}",
                "Content-Type": "audio/mp3",
            },
        )
        resp = conn.getresponse()
        status = resp.status
        # errors="replace" keeps a badly-encoded error body from masking the
        # HTTP status with a UnicodeDecodeError.
        resp_body = resp.read().decode("utf-8", errors="replace")
    finally:
        # Always release the socket, including on timeouts or TLS failures.
        conn.close()

    print(f" HTTP {status}", flush=True)

    if status != 200:
        raise RuntimeError(f"Deepgram ASR error {status}: {resp_body!r}")

    return _segments_from_deepgram(json.loads(resp_body), start_offset_sec)
@@ -188,9 +203,9 @@ def group_chapters_into_chunks(chapters, total_duration_sec):
return groups return groups
def transcribe_with_fish(audio_path, out_dir, chapters, api_key): def transcribe_with_deepgram(audio_path, out_dir, chapters, api_key):
""" """
Split audio by chapter groups, transcribe each with Fish Audio ASR, Split audio by chapter groups, transcribe each with Deepgram ASR,
reassemble into a unified segment list. Caches per-chunk results. reassemble into a unified segment list. Caches per-chunk results.
""" """
transcript_path = out_dir / "transcript.json" transcript_path = out_dir / "transcript.json"
@@ -226,8 +241,8 @@ def transcribe_with_fish(audio_path, out_dir, chapters, api_key):
if not chunk_path.exists(): if not chunk_path.exists():
split_audio_chunk(audio_path, g_start, g_end, chunk_path) split_audio_chunk(audio_path, g_start, g_end, chunk_path)
# Send to Fish Audio # Send to Deepgram
segments = transcribe_chunk_fish(chunk_path, api_key, g_start) segments = transcribe_chunk_deepgram(chunk_path, api_key, g_start)
# Cache the result # Cache the result
with open(cache_path, "w") as f: with open(cache_path, "w") as f:
@@ -282,14 +297,14 @@ def main():
parser.add_argument("--chapters", required=True, help="Path to chapters JSON file") parser.add_argument("--chapters", required=True, help="Path to chapters JSON file")
parser.add_argument("--out", default="out", help="Output directory") parser.add_argument("--out", default="out", help="Output directory")
parser.add_argument("--frames", action="store_true", help="Extract frames for demo chapters") parser.add_argument("--frames", action="store_true", help="Extract frames for demo chapters")
parser.add_argument("--fish-api-key", default=os.environ.get("FISH_API_KEY", ""), parser.add_argument("--deepgram-api-key", default=os.environ.get("DEEPGRAM_API_KEY", ""),
help="Fish Audio API key (or set FISH_API_KEY env var)") help="Deepgram API key (or set DEEPGRAM_API_KEY env var)")
parser.add_argument("--video-id", default=None, parser.add_argument("--video-id", default=None,
help="Override video ID (extracted from URL if omitted)") help="Override video ID (extracted from URL if omitted)")
args = parser.parse_args() args = parser.parse_args()
if not args.fish_api_key: if not args.deepgram_api_key:
print("ERROR: Fish Audio API key required. Pass --fish-api-key or set FISH_API_KEY env var.") print("ERROR: Deepgram API key required. Pass --deepgram-api-key or set DEEPGRAM_API_KEY env var.")
sys.exit(1) sys.exit(1)
video_id = args.video_id or args.url.rstrip("/").split("/")[-1].split("?")[0] video_id = args.video_id or args.url.rstrip("/").split("/")[-1].split("?")[0]
@@ -339,9 +354,9 @@ def main():
video_path = matches[0] video_path = matches[0]
print() print()
# Step 2: Transcribe via Fish Audio # Step 2: Transcribe via Deepgram
print("=== Step 2: Transcribe via Fish Audio ASR ===") print("=== Step 2: Transcribe via Deepgram ASR ===")
transcript = transcribe_with_fish(audio_path, out_dir, chapters, args.fish_api_key) transcript = transcribe_with_deepgram(audio_path, out_dir, chapters, args.deepgram_api_key)
segments = transcript.get("segments", []) segments = transcript.get("segments", [])
print(f" Got {len(segments)} transcript segments\n") print(f" Got {len(segments)} transcript segments\n")