diff --git a/ingest.py b/ingest.py
index 73a8755..0135be0 100644
--- a/ingest.py
+++ b/ingest.py
@@ -20,20 +20,15 @@ Dependencies (must be on PATH):
 """
 
 import argparse
-import http.client
 import json
 import math
 import os
-import ssl
 import subprocess
 import sys
 import time
-import uuid
 from pathlib import Path
 
-FISH_AUDIO_ASR_URL = "https://api.fish.audio/v1/asr"
-
 # Chapters where frame extraction adds real value (demo-heavy sections).
 DEMO_CHAPTER_KEYWORDS = [
     "transcoding",
@@ -110,64 +105,34 @@ def split_audio_chunk(audio_path, start_sec, end_sec, out_path):
 
 def transcribe_chunk_fish(chunk_path, api_key, start_offset_sec):
     """
-    Send a chunk to Fish Audio ASR. Returns a list of segments with
-    timestamps adjusted by start_offset_sec.
+    Send a chunk to Fish Audio ASR via the official SDK.
+    Returns a list of segments with timestamps adjusted by start_offset_sec.
 
     Note: Fish Audio returns timestamps in milliseconds.
     """
+    from fishaudio import FishAudio
+
     print(f" → Fish Audio ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)
 
-    boundary = uuid.uuid4().hex
+    client = FishAudio(api_key=api_key)
 
     with open(chunk_path, "rb") as f:
-        audio_data = f.read()
-
-    body = (
-        f"--{boundary}\r\n"
-        f'Content-Disposition: form-data; name="audio"; filename="{chunk_path.name}"\r\n'
-        f"Content-Type: audio/mpeg\r\n\r\n"
-    ).encode() + audio_data + (
-        f"\r\n--{boundary}\r\n"
-        f'Content-Disposition: form-data; name="language"\r\n\r\n'
-        f"en"
-        f"\r\n--{boundary}--\r\n"
-    ).encode()
-
-    ctx = ssl.create_default_context()
-    conn = http.client.HTTPSConnection("api.fish.audio", timeout=300, context=ctx)
-    conn.request(
-        "POST",
-        "/v1/asr",
-        body=body,
-        headers={
-            "Authorization": f"Bearer {api_key}",
-            "Content-Type": f"multipart/form-data; boundary={boundary}",
-            "Content-Length": str(len(body)),
-        },
-    )
-    resp = conn.getresponse()
-    resp_body = resp.read().decode()
-    conn.close()
-
-    if resp.status != 200:
-        raise RuntimeError(f"Fish Audio ASR error {resp.status}: {resp_body}")
-
-    data = json.loads(resp_body)
+        result = client.asr.transcribe(audio=f.read(), language="en")
 
     # Fish Audio returns timestamps in milliseconds — convert to seconds
     # and adjust by the chunk's start offset in the full audio
     segments = []
-    for seg in data.get("segments", []):
+    for seg in result.segments:
         segments.append({
-            "start": seg["start"] / 1000.0 + start_offset_sec,
-            "end": seg["end"] / 1000.0 + start_offset_sec,
-            "text": seg["text"],
+            "start": seg.start / 1000.0 + start_offset_sec,
+            "end": seg.end / 1000.0 + start_offset_sec,
+            "text": seg.text,
         })
 
-    # If Fish returns flat text with no segments, synthesise one segment
-    if not segments and data.get("text"):
+    # Fallback: if no segments, use full text as one block
+    if not segments and result.text:
         segments.append({
             "start": start_offset_sec,
             "end": start_offset_sec + 60,
-            "text": data["text"],
+            "text": result.text,
         })
 
     return segments