Fix HTTP client, multipart, ms->s timestamp conversion

This commit is contained in:
2026-03-24 01:18:28 +00:00
parent e69b42fc19
commit 1bdf28e1d4

View File

@@ -20,13 +20,14 @@ Dependencies (must be on PATH):
""" """
import argparse import argparse
import http.client
import json import json
import math import math
import os import os
import ssl
import subprocess import subprocess
import sys import sys
import time import time
import urllib.request
import uuid import uuid
from pathlib import Path from pathlib import Path
@@ -111,50 +112,53 @@ def transcribe_chunk_fish(chunk_path, api_key, start_offset_sec):
""" """
Send a chunk to Fish Audio ASR. Returns a list of segments with Send a chunk to Fish Audio ASR. Returns a list of segments with
timestamps adjusted by start_offset_sec. timestamps adjusted by start_offset_sec.
Uses stdlib urllib + manual multipart encoding (no requests dep). Note: Fish Audio returns timestamps in milliseconds.
""" """
print(f" → Fish Audio ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True) print(f" → Fish Audio ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)
# Build multipart/form-data manually
boundary = uuid.uuid4().hex boundary = uuid.uuid4().hex
with open(chunk_path, "rb") as f: with open(chunk_path, "rb") as f:
audio_data = f.read() audio_data = f.read()
def field(name, value):
return (
f"--{boundary}\r\n"
f'Content-Disposition: form-data; name="{name}"\r\n\r\n'
f"{value}\r\n"
).encode()
body = ( body = (
field("language", "en") + f"--{boundary}\r\n"
field("ignore_timestamps", "false") + f'Content-Disposition: form-data; name="audio"; filename="{chunk_path.name}"\r\n'
f"--{boundary}\r\n".encode() + f"Content-Type: audio/mpeg\r\n\r\n"
f'Content-Disposition: form-data; name="audio"; filename="{chunk_path.name}"\r\n'.encode() + ).encode() + audio_data + (
b"Content-Type: audio/mpeg\r\n\r\n" + f"\r\n--{boundary}\r\n"
audio_data + f'Content-Disposition: form-data; name="language"\r\n\r\n'
f"\r\n--{boundary}--\r\n".encode() f"en"
) f"\r\n--{boundary}--\r\n"
).encode()
req = urllib.request.Request( ctx = ssl.create_default_context()
FISH_AUDIO_ASR_URL, conn = http.client.HTTPSConnection("api.fish.audio", timeout=300, context=ctx)
data=body, conn.request(
"POST",
"/v1/asr",
body=body,
headers={ headers={
"Authorization": f"Bearer {api_key}", "Authorization": f"Bearer {api_key}",
"Content-Type": f"multipart/form-data; boundary={boundary}", "Content-Type": f"multipart/form-data; boundary={boundary}",
"Content-Length": str(len(body)),
}, },
method="POST",
) )
with urllib.request.urlopen(req, timeout=300) as resp: resp = conn.getresponse()
data = json.loads(resp.read().decode()) resp_body = resp.read().decode()
conn.close()
# Adjust timestamps by the chunk's start offset if resp.status != 200:
raise RuntimeError(f"Fish Audio ASR error {resp.status}: {resp_body}")
data = json.loads(resp_body)
# Fish Audio returns timestamps in milliseconds — convert to seconds
# and adjust by the chunk's start offset in the full audio
segments = [] segments = []
for seg in data.get("segments", []): for seg in data.get("segments", []):
segments.append({ segments.append({
"start": seg["start"] + start_offset_sec, "start": seg["start"] / 1000.0 + start_offset_sec,
"end": seg["end"] + start_offset_sec, "end": seg["end"] / 1000.0 + start_offset_sec,
"text": seg["text"], "text": seg["text"],
}) })