156 lines
5.1 KiB
Python
156 lines
5.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
ask-annie/transcribe.py
|
|
|
|
Step 1 of the Ask Annie pipeline: download audio and transcribe with Deepgram.
|
|
Saves transcript.json ready for chapter generation.
|
|
|
|
Usage:
|
|
python3 transcribe.py --url <vimeo_url> --out out/ [--deepgram-api-key KEY] [--video-id ID]
|
|
|
|
Output:
|
|
out/<video_id>/audio.mp3 — downloaded audio (cached)
|
|
out/<video_id>/transcript.json — full Deepgram transcript with timestamps
|
|
out/<video_id>/transcript.txt — plain text version (for human review)
|
|
|
|
After running this, share transcript.txt for chapter generation, then run:
|
|
python3 ingest.py --url <url> --chapters chapters/<id>.json --out out/
|
|
|
|
Dependencies: yt-dlp, ffmpeg (on PATH)
|
|
"""
|
|
|
|
import argparse
|
|
import http.client
|
|
import json
|
|
import os
|
|
import ssl
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
|
|
def run(cmd, **kwargs):
    """Echo *cmd* to stdout, execute it, and return the completed process.

    Every element of *cmd* is stringified and space-joined for the echo line.
    Extra keyword arguments are forwarded to ``subprocess.run``. The command
    always runs with ``check=True``, so a non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    rendered = " ".join(map(str, cmd))
    print(f" $ {rendered}", flush=True)
    return subprocess.run(cmd, check=True, **kwargs)
|
|
|
|
|
|
def download_audio(url, out_dir):
    """Download the audio track of *url* into *out_dir* using yt-dlp.

    yt-dlp is asked to extract the audio and convert it to mp3, writing to
    ``<out_dir>/audio.<ext>``.

    Args:
        url: Address of the video to download.
        out_dir: ``pathlib.Path`` of an existing directory to write into.

    Returns:
        Path of the downloaded audio file. ``audio.mp3`` is preferred; if it
        is missing, the alphabetically first other ``audio.*`` file is used.

    Raises:
        FileNotFoundError: if no usable ``audio.*`` file exists afterwards.
        subprocess.CalledProcessError: propagated from ``run`` when yt-dlp
            exits with a non-zero status.
    """
    output_template = out_dir / "audio.%(ext)s"
    run([
        "yt-dlp",
        "--extract-audio",
        "--audio-format", "mp3",
        "--audio-quality", "3",
        "-o", str(output_template),
        url,
    ])

    # The mp3 is what was requested, so take it whenever it exists. Picking an
    # arbitrary glob hit could return a stale intermediate from an earlier,
    # interrupted run.
    mp3_path = out_dir / "audio.mp3"
    if mp3_path.is_file():
        return mp3_path

    # Fallback: any other finished audio file, in a deterministic order.
    # ".part" (partial download) and ".ytdl" (yt-dlp bookkeeping) are not audio.
    candidates = sorted(
        p for p in out_dir.glob("audio.*")
        if p.is_file() and p.suffix not in (".part", ".ytdl")
    )
    if not candidates:
        raise FileNotFoundError("Audio download failed — no audio.* file found")
    return candidates[0]
|
|
|
|
|
|
def transcribe_deepgram(audio_path, api_key):
    """Send the full audio file to Deepgram and return the parsed response JSON.

    Args:
        audio_path: ``pathlib.Path`` of the audio file. The whole file is read
            into memory and uploaded as a single request body.
        api_key: Deepgram API key, sent as ``Authorization: Token <key>``.

    Returns:
        The response body decoded with ``json.loads``.

    Raises:
        RuntimeError: if Deepgram answers with any status other than 200; the
            message carries the first 200 characters of the response body.
    """
    print(f" Sending {audio_path.stat().st_size/1024/1024:.1f}MB to Deepgram...", flush=True)

    audio_bytes = audio_path.read_bytes()

    params = "?model=nova-3&language=en&punctuate=true&utterances=true&smart_format=true"
    ctx = ssl.create_default_context()
    conn = http.client.HTTPSConnection("api.deepgram.com", timeout=600, context=ctx)
    try:
        conn.request(
            "POST",
            f"/v1/listen{params}",
            body=audio_bytes,
            headers={
                "Authorization": f"Token {api_key}",
                "Content-Type": "audio/mp3",
            },
        )
        resp = conn.getresponse()
        status = resp.status
        raw_body = resp.read()
    finally:
        # Always release the socket, including when the request or read fails.
        conn.close()

    print(f" HTTP {status}", flush=True)
    if status != 200:
        # Decode leniently so a malformed error body cannot hide the HTTP error
        # behind a UnicodeDecodeError.
        snippet = raw_body.decode("utf-8", errors="replace")[:200]
        raise RuntimeError(f"Deepgram error {status}: {snippet}")

    return json.loads(raw_body.decode("utf-8"))
|
|
|
|
|
|
def _video_id_from_url(url):
    """Return the last path segment of *url*, ignoring query string and fragment."""
    # Strip "?..." and "#..." first so ".../123456/?share=copy" yields "123456"
    # rather than an empty string.
    path = url.split("#", 1)[0].split("?", 1)[0]
    return path.rstrip("/").rsplit("/", 1)[-1]


def _find_cached_audio(out_dir):
    """Return an already-downloaded audio file in *out_dir*, or None if absent."""
    mp3_path = out_dir / "audio.mp3"
    if mp3_path.is_file():
        return mp3_path
    # Fall back to any other finished audio file, in a deterministic order.
    # ".part" (partial download) and ".ytdl" (yt-dlp bookkeeping) are not audio.
    leftovers = sorted(
        p for p in out_dir.glob("audio.*")
        if p.is_file() and p.suffix not in (".part", ".ytdl")
    )
    return leftovers[0] if leftovers else None


def _write_transcript_txt(txt_path, video_id, url, duration, utterances):
    """Write the plain-text transcript: '#' header lines, then '[m:ss] text' lines."""
    lines = [
        f"# Transcript: {video_id}\n",
        f"# URL: {url}\n",
        f"# Duration: {duration:.0f}s ({duration/60:.1f} min)\n",
        f"# Utterances: {len(utterances)}\n\n",
    ]
    for utterance in utterances:
        minutes, seconds = divmod(int(utterance["start"]), 60)
        lines.append(f"[{minutes}:{seconds:02d}] {utterance['transcript']}\n")
    # Explicit UTF-8: transcripts may contain characters the platform default
    # encoding cannot represent.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.writelines(lines)


def main():
    """Command-line entry point: download audio, transcribe it, write outputs.

    Steps, each skipped when its output already exists in ``<out>/<video_id>/``:
      1. Download the audio (``audio.*``).
      2. Transcribe it with Deepgram and save ``transcript.json``.
      3. Write ``transcript.txt`` from the utterances (always regenerated).

    The video ID is ``--video-id`` if given, otherwise the last path segment
    of ``--url``. Exits with status 1 when no Deepgram API key is available or
    when no video ID can be derived from the URL.
    """
    parser = argparse.ArgumentParser(description="Download and transcribe Ask Annie Vimeo session")
    parser.add_argument("--url", required=True, help="Vimeo URL")
    parser.add_argument("--out", default="out", help="Output directory")
    parser.add_argument("--deepgram-api-key", default=os.environ.get("DEEPGRAM_API_KEY", ""),
                        help="Deepgram API key (or set DEEPGRAM_API_KEY env var)")
    parser.add_argument("--video-id", default=None, help="Override video ID")
    args = parser.parse_args()

    if not args.deepgram_api_key:
        print("ERROR: Deepgram API key required. Pass --deepgram-api-key or set DEEPGRAM_API_KEY.",
              file=sys.stderr)
        sys.exit(1)

    video_id = args.video_id or _video_id_from_url(args.url)
    if not video_id:
        # An empty ID would put every output directly in --out and mix sessions.
        print("ERROR: Could not derive a video ID from --url. Pass --video-id.",
              file=sys.stderr)
        sys.exit(1)
    print(f"\n=== Transcribe: {video_id} ===\n")

    out_dir = Path(args.out) / video_id
    out_dir.mkdir(parents=True, exist_ok=True)

    transcript_path = out_dir / "transcript.json"
    txt_path = out_dir / "transcript.txt"

    # Step 1: Download audio
    print("=== Step 1: Download audio ===")
    audio_path = _find_cached_audio(out_dir)
    if audio_path is not None:
        print(f" [cache] {audio_path}")
    else:
        audio_path = download_audio(args.url, out_dir)
    print(f" Audio: {audio_path}\n")

    # Step 2: Transcribe
    if transcript_path.exists():
        print(f"=== Step 2: Transcript cached at {transcript_path} ===\n")
        with open(transcript_path, encoding="utf-8") as f:
            raw = json.load(f)
    else:
        print("=== Step 2: Transcribe with Deepgram ===")
        raw = transcribe_deepgram(audio_path, args.deepgram_api_key)
        with open(transcript_path, "w", encoding="utf-8") as f:
            json.dump(raw, f, indent=2)
        print(f" Saved: {transcript_path}\n")

    # Step 3: Write plain text with timestamps
    utterances = raw.get("results", {}).get("utterances", [])
    duration = raw.get("metadata", {}).get("duration", 0)
    _write_transcript_txt(txt_path, video_id, args.url, duration, utterances)

    print("=== Done ===")
    print(f" Transcript JSON: {transcript_path}")
    print(f" Transcript TXT: {txt_path}")
    print(f" Duration: {duration:.0f}s ({duration/60:.1f} min)")
    print(f" Utterances: {len(utterances)}")
    print("\nNext: share transcript.txt for chapter generation, then run ingest.py")
|
|
|
|
|
|
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|