Fix Fish Audio ASR request: switch to http.client, rebuild multipart body, convert ms->s timestamps

2026-03-24 01:18:28 +00:00
parent e69b42fc19
commit 1bdf28e1d4
1 changed files with 31 additions and 27 deletions
--- a/ingest.py
+++ b/ingest.py
@@ -20,13 +20,14 @@ Dependencies (must be on PATH):
 """

 import argparse
+import http.client
 import json
 import math
 import os
+import ssl
 import subprocess
 import sys
 import time
-import urllib.request
 import uuid
 from pathlib import Path

@@ -111,50 +112,53 @@ def transcribe_chunk_fish(chunk_path, api_key, start_offset_sec):
    """
    Send a chunk to Fish Audio ASR. Returns a list of segments with
    timestamps adjusted by start_offset_sec.
-    Uses stdlib urllib + manual multipart encoding (no requests dep).
+    Note: Fish Audio returns timestamps in milliseconds; they are converted to seconds here.
    """
    print(f"  → Fish Audio ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)

-    # Build multipart/form-data manually
    boundary = uuid.uuid4().hex
    with open(chunk_path, "rb") as f:
        audio_data = f.read()

-    def field(name, value):
-        return (
-            f"--{boundary}\r\n"
-            f'Content-Disposition: form-data; name="{name}"\r\n\r\n'
-            f"{value}\r\n"
-        ).encode()
-
    body = (
-        field("language", "en") +
-        field("ignore_timestamps", "false") +
-        f"--{boundary}\r\n".encode() +
-        f'Content-Disposition: form-data; name="audio"; filename="{chunk_path.name}"\r\n'.encode() +
-        b"Content-Type: audio/mpeg\r\n\r\n" +
-        audio_data +
-        f"\r\n--{boundary}--\r\n".encode()
-    )
+        f"--{boundary}\r\n"
+        f'Content-Disposition: form-data; name="audio"; filename="{chunk_path.name}"\r\n'
+        f"Content-Type: audio/mpeg\r\n\r\n"
+    ).encode() + audio_data + (
+        f"\r\n--{boundary}\r\n"
+        f'Content-Disposition: form-data; name="language"\r\n\r\n'
+        f"en"
+        f"\r\n--{boundary}--\r\n"
+    ).encode()

-    req = urllib.request.Request(
-        FISH_AUDIO_ASR_URL,
-        data=body,
+    ctx = ssl.create_default_context()
+    conn = http.client.HTTPSConnection("api.fish.audio", timeout=300, context=ctx)
+    conn.request(
+        "POST",
+        "/v1/asr",
+        body=body,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": f"multipart/form-data; boundary={boundary}",
+            "Content-Length": str(len(body)),
        },
-        method="POST",
    )
-    with urllib.request.urlopen(req, timeout=300) as resp:
-        data = json.loads(resp.read().decode())
+    resp = conn.getresponse()
+    resp_body = resp.read().decode()
+    conn.close()

-    # Adjust timestamps by the chunk's start offset
+    if resp.status != 200:
+        raise RuntimeError(f"Fish Audio ASR error {resp.status}: {resp_body}")
+
+    data = json.loads(resp_body)
+
+    # Fish Audio returns timestamps in milliseconds — convert to seconds
+    # and adjust by the chunk's start offset in the full audio
    segments = []
    for seg in data.get("segments", []):
        segments.append({
-            "start": seg["start"] + start_offset_sec,
-            "end": seg["end"] + start_offset_sec,
+            "start": seg["start"] / 1000.0 + start_offset_sec,
+            "end": seg["end"] / 1000.0 + start_offset_sec,
            "text": seg["text"],
        })