Switch ASR from Fish Audio to Deepgram Nova-3

This commit is contained in:
2026-03-24 03:20:41 +00:00
parent ecf37b8798
commit eff844cde5

View File

@@ -44,7 +44,7 @@ DEMO_CHAPTER_KEYWORDS = [
# Target chunk size in seconds — split audio into groups of chapters
# totalling no more than this before sending to ASR.
# 10 min = 600s keeps chunks to a manageable size for Deepgram API.
CHUNK_MAX_SECONDS = 600
@@ -103,51 +103,66 @@ def split_audio_chunk(audio_path, start_sec, end_sec, out_path):
], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
def transcribe_chunk_deepgram(chunk_path, api_key, start_offset_sec):
    """
    Send one audio chunk to Deepgram's prerecorded ASR endpoint.

    Uses the Nova-3 model with utterance-level segmentation. Deepgram
    returns timestamps in seconds natively, so no unit conversion is
    needed; each segment's start/end is shifted by ``start_offset_sec``
    so timestamps refer to positions in the full, unsplit audio.

    Args:
        chunk_path: Path to the audio chunk file (sent as audio/mp3).
        api_key: Deepgram API key.
        start_offset_sec: Offset of this chunk within the full audio,
            in seconds.

    Returns:
        List of dicts with "start", "end" (seconds, offset-adjusted)
        and "text" keys.

    Raises:
        RuntimeError: If Deepgram responds with a non-200 status.
    """
    import http.client
    import ssl

    print(f"Deepgram ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)
    with open(chunk_path, "rb") as f:
        audio_bytes = f.read()
    print(f" Sending {len(audio_bytes)/1024/1024:.1f}MB...", flush=True)

    params = "?model=nova-3&language=en&punctuate=true&utterances=true&smart_format=true"
    ctx = ssl.create_default_context()
    conn = http.client.HTTPSConnection("api.deepgram.com", timeout=120, context=ctx)
    # try/finally guarantees the connection is released even if the
    # request or read raises (e.g. timeout mid-transfer).
    try:
        conn.request(
            "POST",
            f"/v1/listen{params}",
            body=audio_bytes,
            headers={
                "Authorization": f"Token {api_key}",
                "Content-Type": "audio/mp3",
            },
        )
        resp = conn.getresponse()
        resp_body = resp.read().decode()
        status = resp.status
    finally:
        conn.close()

    print(f" HTTP {status}", flush=True)
    if status != 200:
        raise RuntimeError(f"Deepgram ASR error {status}: {resp_body!r}")
    data = json.loads(resp_body)

    # Deepgram returns timestamps in seconds natively.
    # Use utterances for clean sentence-level segments.
    utterances = data.get("results", {}).get("utterances", [])
    segments = [
        {
            "start": u["start"] + start_offset_sec,
            "end": u["end"] + start_offset_sec,
            "text": u["transcript"],
        }
        for u in utterances
    ]

    # Fallback: if no utterances came back, use the channel-level
    # transcript as one block (the +60s end time is a rough placeholder).
    if not segments:
        try:
            transcript = data["results"]["channels"][0]["alternatives"][0]["transcript"]
            if transcript:
                segments.append({
                    "start": start_offset_sec,
                    "end": start_offset_sec + 60,
                    "text": transcript,
                })
        except (KeyError, IndexError):
            pass
    return segments
@@ -188,9 +203,9 @@ def group_chapters_into_chunks(chapters, total_duration_sec):
return groups
def transcribe_with_fish(audio_path, out_dir, chapters, api_key):
def transcribe_with_deepgram(audio_path, out_dir, chapters, api_key):
"""
Split audio by chapter groups, transcribe each with Fish Audio ASR,
Split audio by chapter groups, transcribe each with Deepgram ASR,
reassemble into a unified segment list. Caches per-chunk results.
"""
transcript_path = out_dir / "transcript.json"
@@ -226,8 +241,8 @@ def transcribe_with_fish(audio_path, out_dir, chapters, api_key):
if not chunk_path.exists():
split_audio_chunk(audio_path, g_start, g_end, chunk_path)
# Send to Fish Audio
segments = transcribe_chunk_fish(chunk_path, api_key, g_start)
# Send to Deepgram
segments = transcribe_chunk_deepgram(chunk_path, api_key, g_start)
# Cache the result
with open(cache_path, "w") as f:
@@ -282,14 +297,14 @@ def main():
parser.add_argument("--chapters", required=True, help="Path to chapters JSON file")
parser.add_argument("--out", default="out", help="Output directory")
parser.add_argument("--frames", action="store_true", help="Extract frames for demo chapters")
parser.add_argument("--fish-api-key", default=os.environ.get("FISH_API_KEY", ""),
help="Fish Audio API key (or set FISH_API_KEY env var)")
parser.add_argument("--deepgram-api-key", default=os.environ.get("DEEPGRAM_API_KEY", ""),
help="Deepgram API key (or set DEEPGRAM_API_KEY env var)")
parser.add_argument("--video-id", default=None,
help="Override video ID (extracted from URL if omitted)")
args = parser.parse_args()
if not args.fish_api_key:
print("ERROR: Fish Audio API key required. Pass --fish-api-key or set FISH_API_KEY env var.")
if not args.deepgram_api_key:
print("ERROR: Deepgram API key required. Pass --deepgram-api-key or set DEEPGRAM_API_KEY env var.")
sys.exit(1)
video_id = args.video_id or args.url.rstrip("/").split("/")[-1].split("?")[0]
@@ -339,9 +354,9 @@ def main():
video_path = matches[0]
print()
# Step 2: Transcribe via Fish Audio
print("=== Step 2: Transcribe via Fish Audio ASR ===")
transcript = transcribe_with_fish(audio_path, out_dir, chapters, args.fish_api_key)
# Step 2: Transcribe via Deepgram
print("=== Step 2: Transcribe via Deepgram ASR ===")
transcript = transcribe_with_deepgram(audio_path, out_dir, chapters, args.deepgram_api_key)
segments = transcript.get("segments", [])
print(f" Got {len(segments)} transcript segments\n")