diff --git a/ingest.py b/ingest.py index 363579e..73a8755 100644 --- a/ingest.py +++ b/ingest.py @@ -20,13 +20,14 @@ Dependencies (must be on PATH): """ import argparse +import http.client import json import math import os +import ssl import subprocess import sys import time -import urllib.request import uuid from pathlib import Path @@ -111,50 +112,53 @@ def transcribe_chunk_fish(chunk_path, api_key, start_offset_sec): """ Send a chunk to Fish Audio ASR. Returns a list of segments with timestamps adjusted by start_offset_sec. - Uses stdlib urllib + manual multipart encoding (no requests dep). + Note: Fish Audio returns timestamps in milliseconds. """ print(f" → Fish Audio ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True) - # Build multipart/form-data manually boundary = uuid.uuid4().hex with open(chunk_path, "rb") as f: audio_data = f.read() - def field(name, value): - return ( - f"--{boundary}\r\n" - f'Content-Disposition: form-data; name="{name}"\r\n\r\n' - f"{value}\r\n" - ).encode() - body = ( - field("language", "en") + - field("ignore_timestamps", "false") + - f"--{boundary}\r\n".encode() + - f'Content-Disposition: form-data; name="audio"; filename="{chunk_path.name}"\r\n'.encode() + - b"Content-Type: audio/mpeg\r\n\r\n" + - audio_data + - f"\r\n--{boundary}--\r\n".encode() - ) + f"--{boundary}\r\n" + f'Content-Disposition: form-data; name="audio"; filename="{chunk_path.name}"\r\n' + f"Content-Type: audio/mpeg\r\n\r\n" + ).encode() + audio_data + ( + f"\r\n--{boundary}\r\n" + f'Content-Disposition: form-data; name="language"\r\n\r\n' + f"en" + f"\r\n--{boundary}--\r\n" + ).encode() - req = urllib.request.Request( - FISH_AUDIO_ASR_URL, - data=body, + ctx = ssl.create_default_context() + conn = http.client.HTTPSConnection("api.fish.audio", timeout=300, context=ctx) + conn.request( + "POST", + "/v1/asr", + body=body, headers={ "Authorization": f"Bearer {api_key}", "Content-Type": f"multipart/form-data; boundary={boundary}", + "Content-Length": str(len(body)), }, - method="POST", ) - with urllib.request.urlopen(req, timeout=300) as resp: - data = json.loads(resp.read().decode()) + resp = conn.getresponse() + resp_body = resp.read().decode() + conn.close() - # Adjust timestamps by the chunk's start offset + if resp.status != 200: + raise RuntimeError(f"Fish Audio ASR error {resp.status}: {resp_body}") + + data = json.loads(resp_body) + + # Fish Audio returns timestamps in milliseconds — convert to seconds + # and adjust by the chunk's start offset in the full audio segments = [] for seg in data.get("segments", []): segments.append({ - "start": seg["start"] + start_offset_sec, - "end": seg["end"] + start_offset_sec, + "start": seg["start"] / 1000.0 + start_offset_sec, + "end": seg["end"] / 1000.0 + start_offset_sec, "text": seg["text"], })