Fix HTTP client, multipart, ms->s timestamp conversion

This commit is contained in:
2026-03-24 01:18:28 +00:00
parent e69b42fc19
commit 1bdf28e1d4

View File

@@ -20,13 +20,14 @@ Dependencies (must be on PATH):
"""
import argparse
import http.client
import json
import math
import os
import ssl
import subprocess
import sys
import time
import urllib.request
import uuid
from pathlib import Path
@@ -111,50 +112,53 @@ def transcribe_chunk_fish(chunk_path, api_key, start_offset_sec):
"""
Send a chunk to Fish Audio ASR. Returns a list of segments with
timestamps adjusted by start_offset_sec.
Uses stdlib http.client + manual multipart encoding (no requests dep).
Note: Fish Audio returns timestamps in milliseconds.
"""
print(f" → Fish Audio ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)
# Build multipart/form-data manually
boundary = uuid.uuid4().hex
with open(chunk_path, "rb") as f:
audio_data = f.read()
def field(name, value):
return (
f"--{boundary}\r\n"
f'Content-Disposition: form-data; name="{name}"\r\n\r\n'
f"{value}\r\n"
).encode()
body = (
field("language", "en") +
field("ignore_timestamps", "false") +
f"--{boundary}\r\n".encode() +
f'Content-Disposition: form-data; name="audio"; filename="{chunk_path.name}"\r\n'.encode() +
b"Content-Type: audio/mpeg\r\n\r\n" +
audio_data +
f"\r\n--{boundary}--\r\n".encode()
)
f"--{boundary}\r\n"
f'Content-Disposition: form-data; name="audio"; filename="{chunk_path.name}"\r\n'
f"Content-Type: audio/mpeg\r\n\r\n"
).encode() + audio_data + (
f"\r\n--{boundary}\r\n"
f'Content-Disposition: form-data; name="language"\r\n\r\n'
f"en"
f"\r\n--{boundary}--\r\n"
).encode()
req = urllib.request.Request(
FISH_AUDIO_ASR_URL,
data=body,
ctx = ssl.create_default_context()
conn = http.client.HTTPSConnection("api.fish.audio", timeout=300, context=ctx)
conn.request(
"POST",
"/v1/asr",
body=body,
headers={
"Authorization": f"Bearer {api_key}",
"Content-Type": f"multipart/form-data; boundary={boundary}",
"Content-Length": str(len(body)),
},
method="POST",
)
with urllib.request.urlopen(req, timeout=300) as resp:
data = json.loads(resp.read().decode())
resp = conn.getresponse()
resp_body = resp.read().decode()
conn.close()
# Adjust timestamps by the chunk's start offset
if resp.status != 200:
raise RuntimeError(f"Fish Audio ASR error {resp.status}: {resp_body}")
data = json.loads(resp_body)
# Fish Audio returns timestamps in milliseconds — convert to seconds
# and adjust by the chunk's start offset in the full audio
segments = []
for seg in data.get("segments", []):
segments.append({
"start": seg["start"] + start_offset_sec,
"end": seg["end"] + start_offset_sec,
"start": seg["start"] / 1000.0 + start_offset_sec,
"end": seg["end"] / 1000.0 + start_offset_sec,
"text": seg["text"],
})