Use official fishaudio SDK for ASR
This commit is contained in:
61
ingest.py
61
ingest.py
@@ -20,20 +20,15 @@ Dependencies (must be on PATH):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import http.client
|
|
||||||
import json
|
import json
|
||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
import ssl
|
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import uuid
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
FISH_AUDIO_ASR_URL = "https://api.fish.audio/v1/asr"
|
|
||||||
|
|
||||||
# Chapters where frame extraction adds real value (demo-heavy sections).
|
# Chapters where frame extraction adds real value (demo-heavy sections).
|
||||||
DEMO_CHAPTER_KEYWORDS = [
|
DEMO_CHAPTER_KEYWORDS = [
|
||||||
"transcoding",
|
"transcoding",
|
||||||
@@ -110,64 +105,34 @@ def split_audio_chunk(audio_path, start_sec, end_sec, out_path):
|
|||||||
|
|
||||||
def transcribe_chunk_fish(chunk_path, api_key, start_offset_sec):
|
def transcribe_chunk_fish(chunk_path, api_key, start_offset_sec):
|
||||||
"""
|
"""
|
||||||
Send a chunk to Fish Audio ASR. Returns a list of segments with
|
Send a chunk to Fish Audio ASR via the official SDK.
|
||||||
timestamps adjusted by start_offset_sec.
|
Returns a list of segments with timestamps adjusted by start_offset_sec.
|
||||||
Note: Fish Audio returns timestamps in milliseconds.
|
Note: Fish Audio returns timestamps in milliseconds.
|
||||||
"""
|
"""
|
||||||
|
from fishaudio import FishAudio
|
||||||
|
|
||||||
print(f" → Fish Audio ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)
|
print(f" → Fish Audio ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)
|
||||||
|
|
||||||
boundary = uuid.uuid4().hex
|
client = FishAudio(api_key=api_key)
|
||||||
with open(chunk_path, "rb") as f:
|
with open(chunk_path, "rb") as f:
|
||||||
audio_data = f.read()
|
result = client.asr.transcribe(audio=f.read(), language="en")
|
||||||
|
|
||||||
body = (
|
|
||||||
f"--{boundary}\r\n"
|
|
||||||
f'Content-Disposition: form-data; name="audio"; filename="{chunk_path.name}"\r\n'
|
|
||||||
f"Content-Type: audio/mpeg\r\n\r\n"
|
|
||||||
).encode() + audio_data + (
|
|
||||||
f"\r\n--{boundary}\r\n"
|
|
||||||
f'Content-Disposition: form-data; name="language"\r\n\r\n'
|
|
||||||
f"en"
|
|
||||||
f"\r\n--{boundary}--\r\n"
|
|
||||||
).encode()
|
|
||||||
|
|
||||||
ctx = ssl.create_default_context()
|
|
||||||
conn = http.client.HTTPSConnection("api.fish.audio", timeout=300, context=ctx)
|
|
||||||
conn.request(
|
|
||||||
"POST",
|
|
||||||
"/v1/asr",
|
|
||||||
body=body,
|
|
||||||
headers={
|
|
||||||
"Authorization": f"Bearer {api_key}",
|
|
||||||
"Content-Type": f"multipart/form-data; boundary={boundary}",
|
|
||||||
"Content-Length": str(len(body)),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
resp = conn.getresponse()
|
|
||||||
resp_body = resp.read().decode()
|
|
||||||
conn.close()
|
|
||||||
|
|
||||||
if resp.status != 200:
|
|
||||||
raise RuntimeError(f"Fish Audio ASR error {resp.status}: {resp_body}")
|
|
||||||
|
|
||||||
data = json.loads(resp_body)
|
|
||||||
|
|
||||||
# Fish Audio returns timestamps in milliseconds — convert to seconds
|
# Fish Audio returns timestamps in milliseconds — convert to seconds
|
||||||
# and adjust by the chunk's start offset in the full audio
|
# and adjust by the chunk's start offset in the full audio
|
||||||
segments = []
|
segments = []
|
||||||
for seg in data.get("segments", []):
|
for seg in result.segments:
|
||||||
segments.append({
|
segments.append({
|
||||||
"start": seg["start"] / 1000.0 + start_offset_sec,
|
"start": seg.start / 1000.0 + start_offset_sec,
|
||||||
"end": seg["end"] / 1000.0 + start_offset_sec,
|
"end": seg.end / 1000.0 + start_offset_sec,
|
||||||
"text": seg["text"],
|
"text": seg.text,
|
||||||
})
|
})
|
||||||
|
|
||||||
# If Fish returns flat text with no segments, synthesise one segment
|
# Fallback: if no segments, use full text as one block
|
||||||
if not segments and data.get("text"):
|
if not segments and result.text:
|
||||||
segments.append({
|
segments.append({
|
||||||
"start": start_offset_sec,
|
"start": start_offset_sec,
|
||||||
"end": start_offset_sec + 60,
|
"end": start_offset_sec + 60,
|
||||||
"text": data["text"],
|
"text": result.text,
|
||||||
})
|
})
|
||||||
|
|
||||||
return segments
|
return segments
|
||||||
|
|||||||
Reference in New Issue
Block a user