Switch ASR from Fish Audio to Deepgram Nova-3
This commit is contained in:
99
ingest.py
99
ingest.py
@@ -44,7 +44,7 @@ DEMO_CHAPTER_KEYWORDS = [
|
|||||||
|
|
||||||
# Target chunk size in seconds — split audio into groups of chapters
|
# Target chunk size in seconds — split audio into groups of chapters
|
||||||
# totalling no more than this before sending to ASR.
|
# totalling no more than this before sending to ASR.
|
||||||
# 10 min = 600s keeps each chunk well under Fish Audio's size limit.
|
# 10 min = 600s keeps each chunk at a manageable size for the Deepgram API.
|
||||||
CHUNK_MAX_SECONDS = 600
|
CHUNK_MAX_SECONDS = 600
|
||||||
|
|
||||||
|
|
||||||
@@ -103,51 +103,66 @@ def split_audio_chunk(audio_path, start_sec, end_sec, out_path):
|
|||||||
], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||||
|
|
||||||
|
|
||||||
def transcribe_chunk_fish(chunk_path, api_key, start_offset_sec):
|
def transcribe_chunk_deepgram(chunk_path, api_key, start_offset_sec):
|
||||||
"""
|
"""
|
||||||
Send a chunk to Fish Audio ASR via the official SDK.
|
Send a chunk to Deepgram ASR. Returns segments with timestamps in seconds.
|
||||||
Returns a list of segments with timestamps adjusted by start_offset_sec.
|
Returns a list of segments with timestamps adjusted by start_offset_sec.
|
||||||
Note: Fish Audio returns timestamps in milliseconds.
|
Uses the Deepgram Nova-3 model and builds segments from utterance-level results.
|
||||||
"""
|
"""
|
||||||
import httpx
|
import http.client
|
||||||
|
import ssl
|
||||||
|
|
||||||
print(f" → Fish Audio ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)
|
print(f" → Deepgram ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)
|
||||||
|
|
||||||
with open(chunk_path, "rb") as f:
|
with open(chunk_path, "rb") as f:
|
||||||
audio_bytes = f.read()
|
audio_bytes = f.read()
|
||||||
print(f" Sending {len(audio_bytes)/1024/1024:.1f}MB to Fish Audio...", flush=True)
|
print(f" Sending {len(audio_bytes)/1024/1024:.1f}MB...", flush=True)
|
||||||
|
|
||||||
with httpx.Client(timeout=300) as client:
|
params = "?model=nova-3&language=en&punctuate=true&utterances=true&smart_format=true"
|
||||||
resp = client.post(
|
ctx = ssl.create_default_context()
|
||||||
"https://api.fish.audio/v1/asr",
|
conn = http.client.HTTPSConnection("api.deepgram.com", timeout=120, context=ctx)
|
||||||
headers={"Authorization": f"Bearer {api_key}"},
|
conn.request(
|
||||||
files={"audio": (chunk_path.name, audio_bytes, "audio/mpeg")},
|
"POST",
|
||||||
data={"language": "en", "ignore_timestamps": "false"},
|
f"/v1/listen{params}",
|
||||||
)
|
body=audio_bytes,
|
||||||
|
headers={
|
||||||
|
"Authorization": f"Token {api_key}",
|
||||||
|
"Content-Type": "audio/mp3",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
resp = conn.getresponse()
|
||||||
|
resp_body = resp.read().decode()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
print(f" HTTP {resp.status_code}", flush=True)
|
print(f" HTTP {resp.status}", flush=True)
|
||||||
if resp.status_code != 200:
|
if resp.status != 200:
|
||||||
raise RuntimeError(f"Fish Audio ASR error {resp.status_code}: {resp.text!r}")
|
raise RuntimeError(f"Deepgram ASR error {resp.status}: {resp_body!r}")
|
||||||
|
|
||||||
data = resp.json()
|
data = json.loads(resp_body)
|
||||||
|
|
||||||
# Fish Audio returns timestamps in milliseconds — convert to seconds
|
# Deepgram returns timestamps in seconds natively
|
||||||
# and adjust by the chunk's start offset in the full audio
|
# Use utterances for clean sentence-level segments
|
||||||
|
utterances = data.get("results", {}).get("utterances", [])
|
||||||
segments = []
|
segments = []
|
||||||
for seg in data.get("segments", []):
|
for u in utterances:
|
||||||
segments.append({
|
segments.append({
|
||||||
"start": seg["start"] / 1000.0 + start_offset_sec,
|
"start": u["start"] + start_offset_sec,
|
||||||
"end": seg["end"] / 1000.0 + start_offset_sec,
|
"end": u["end"] + start_offset_sec,
|
||||||
"text": seg["text"],
|
"text": u["transcript"],
|
||||||
})
|
})
|
||||||
|
|
||||||
# Fallback: if no segments, use full text as one block
|
# Fallback: if no utterances, use the full channel transcript as one block
|
||||||
if not segments and data.get("text"):
|
if not segments:
|
||||||
segments.append({
|
try:
|
||||||
"start": start_offset_sec,
|
transcript = data["results"]["channels"][0]["alternatives"][0]["transcript"]
|
||||||
"end": start_offset_sec + 60,
|
if transcript:
|
||||||
"text": data["text"],
|
segments.append({
|
||||||
})
|
"start": start_offset_sec,
|
||||||
|
"end": start_offset_sec + 60,
|
||||||
|
"text": transcript,
|
||||||
|
})
|
||||||
|
except (KeyError, IndexError):
|
||||||
|
pass
|
||||||
|
|
||||||
return segments
|
return segments
|
||||||
|
|
||||||
@@ -188,9 +203,9 @@ def group_chapters_into_chunks(chapters, total_duration_sec):
|
|||||||
return groups
|
return groups
|
||||||
|
|
||||||
|
|
||||||
def transcribe_with_fish(audio_path, out_dir, chapters, api_key):
|
def transcribe_with_deepgram(audio_path, out_dir, chapters, api_key):
|
||||||
"""
|
"""
|
||||||
Split audio by chapter groups, transcribe each with Fish Audio ASR,
|
Split audio by chapter groups, transcribe each with Deepgram ASR,
|
||||||
reassemble into a unified segment list. Caches per-chunk results.
|
reassemble into a unified segment list. Caches per-chunk results.
|
||||||
"""
|
"""
|
||||||
transcript_path = out_dir / "transcript.json"
|
transcript_path = out_dir / "transcript.json"
|
||||||
@@ -226,8 +241,8 @@ def transcribe_with_fish(audio_path, out_dir, chapters, api_key):
|
|||||||
if not chunk_path.exists():
|
if not chunk_path.exists():
|
||||||
split_audio_chunk(audio_path, g_start, g_end, chunk_path)
|
split_audio_chunk(audio_path, g_start, g_end, chunk_path)
|
||||||
|
|
||||||
# Send to Fish Audio
|
# Send to Deepgram
|
||||||
segments = transcribe_chunk_fish(chunk_path, api_key, g_start)
|
segments = transcribe_chunk_deepgram(chunk_path, api_key, g_start)
|
||||||
|
|
||||||
# Cache the result
|
# Cache the result
|
||||||
with open(cache_path, "w") as f:
|
with open(cache_path, "w") as f:
|
||||||
@@ -282,14 +297,14 @@ def main():
|
|||||||
parser.add_argument("--chapters", required=True, help="Path to chapters JSON file")
|
parser.add_argument("--chapters", required=True, help="Path to chapters JSON file")
|
||||||
parser.add_argument("--out", default="out", help="Output directory")
|
parser.add_argument("--out", default="out", help="Output directory")
|
||||||
parser.add_argument("--frames", action="store_true", help="Extract frames for demo chapters")
|
parser.add_argument("--frames", action="store_true", help="Extract frames for demo chapters")
|
||||||
parser.add_argument("--fish-api-key", default=os.environ.get("FISH_API_KEY", ""),
|
parser.add_argument("--deepgram-api-key", default=os.environ.get("DEEPGRAM_API_KEY", ""),
|
||||||
help="Fish Audio API key (or set FISH_API_KEY env var)")
|
help="Deepgram API key (or set DEEPGRAM_API_KEY env var)")
|
||||||
parser.add_argument("--video-id", default=None,
|
parser.add_argument("--video-id", default=None,
|
||||||
help="Override video ID (extracted from URL if omitted)")
|
help="Override video ID (extracted from URL if omitted)")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if not args.fish_api_key:
|
if not args.deepgram_api_key:
|
||||||
print("ERROR: Fish Audio API key required. Pass --fish-api-key or set FISH_API_KEY env var.")
|
print("ERROR: Deepgram API key required. Pass --deepgram-api-key or set DEEPGRAM_API_KEY env var.")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
video_id = args.video_id or args.url.rstrip("/").split("/")[-1].split("?")[0]
|
video_id = args.video_id or args.url.rstrip("/").split("/")[-1].split("?")[0]
|
||||||
@@ -339,9 +354,9 @@ def main():
|
|||||||
video_path = matches[0]
|
video_path = matches[0]
|
||||||
print()
|
print()
|
||||||
|
|
||||||
# Step 2: Transcribe via Fish Audio
|
# Step 2: Transcribe via Deepgram
|
||||||
print("=== Step 2: Transcribe via Fish Audio ASR ===")
|
print("=== Step 2: Transcribe via Deepgram ASR ===")
|
||||||
transcript = transcribe_with_fish(audio_path, out_dir, chapters, args.fish_api_key)
|
transcript = transcribe_with_deepgram(audio_path, out_dir, chapters, args.deepgram_api_key)
|
||||||
segments = transcript.get("segments", [])
|
segments = transcript.get("segments", [])
|
||||||
print(f" Got {len(segments)} transcript segments\n")
|
print(f" Got {len(segments)} transcript segments\n")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user