ask-annie/transcribe.py

#!/usr/bin/env python3
"""
ask-annie/transcribe.py

Step 1 of the Ask Annie pipeline: download audio and transcribe with Deepgram.
Saves transcript.json ready for chapter generation.

Usage:
  python3 transcribe.py --url <vimeo_url> --out out/ [--deepgram-api-key KEY]

Output:
  out/<video_id>/audio.mp3         — downloaded audio (cached)
  out/<video_id>/transcript.json   — full Deepgram transcript with timestamps
  out/<video_id>/transcript.txt    — plain text version (for human review)

After running this, share transcript.txt for chapter generation, then run:
  python3 ingest.py --url <url> --chapters chapters/<id>.json --out out/

Dependencies: yt-dlp, ffmpeg (on PATH)
"""

import argparse
import http.client
import json
import os
import ssl
import subprocess
import sys
import time
from pathlib import Path


def run(cmd, **kwargs):
    """Echo a command line, then execute it and return the completed process.

    Every element of *cmd* is stringified and space-joined for the echo;
    the original *cmd* object is what gets passed to ``subprocess.run``.
    Extra keyword arguments are forwarded unchanged.

    Raises:
        subprocess.CalledProcessError: the command exited non-zero
            (``check=True`` is always set).
    """
    rendered = " ".join(map(str, cmd))
    print(f"  $ {rendered}", flush=True)
    return subprocess.run(cmd, check=True, **kwargs)


def download_audio(url, out_dir):
    """Download the audio track of *url* into *out_dir* via yt-dlp.

    yt-dlp is asked to extract audio and convert it to MP3, writing to
    ``audio.<ext>`` inside *out_dir*.

    Args:
        url: Media URL handed to yt-dlp.
        out_dir: ``pathlib.Path`` of an existing directory to download into.

    Returns:
        Path of the downloaded audio file. ``audio.mp3`` is preferred when it
        exists; otherwise the alphabetically first ``audio.*`` file that is
        not a ``.part`` download fragment.

    Raises:
        subprocess.CalledProcessError: yt-dlp exited non-zero.
        FileNotFoundError: yt-dlp succeeded but left no usable ``audio.*`` file.
    """
    output_template = out_dir / "audio.%(ext)s"
    run([
        "yt-dlp",
        "--extract-audio",
        "--audio-format", "mp3",
        "--audio-quality", "3",
        "-o", str(output_template),
        url,
    ])

    # The requested format is mp3, so pick that file explicitly instead of
    # relying on glob order (which is filesystem-dependent and could surface
    # a leftover sidecar or intermediate file first).
    mp3_path = out_dir / "audio.mp3"
    if mp3_path.is_file():
        return mp3_path

    # Fallback: keep the original "any audio.* that is not a .part" rule,
    # but sort so the choice is deterministic.
    candidates = sorted(p for p in out_dir.glob("audio.*") if p.suffix != ".part")
    if not candidates:
        raise FileNotFoundError("Audio download failed — no audio.* file found")
    return candidates[0]


def transcribe_deepgram(audio_path, api_key):
    """Send the full audio file to Deepgram and return the parsed response.

    The whole file is read into memory and POSTed in a single request to
    ``/v1/listen`` on ``api.deepgram.com`` with a 600-second socket timeout.

    Args:
        audio_path: ``pathlib.Path`` of the audio file to upload.
        api_key: Deepgram API key, sent as ``Authorization: Token <key>``.

    Returns:
        The response body decoded from JSON.

    Raises:
        RuntimeError: Deepgram answered with a non-200 status; the message
            carries the status and the first 200 characters of the body.
        OSError: connection, TLS, or timeout failures from ``http.client``.
    """
    print(f"  Sending {audio_path.stat().st_size/1024/1024:.1f}MB to Deepgram...", flush=True)

    with open(audio_path, "rb") as f:
        audio_bytes = f.read()

    params = "?model=nova-3&language=en&punctuate=true&utterances=true&smart_format=true"
    ctx = ssl.create_default_context()
    conn = http.client.HTTPSConnection("api.deepgram.com", timeout=600, context=ctx)
    try:
        conn.request(
            "POST",
            f"/v1/listen{params}",
            body=audio_bytes,
            headers={
                "Authorization": f"Token {api_key}",
                "Content-Type": "audio/mp3",
            },
        )
        resp = conn.getresponse()
        status = resp.status
        resp_body = resp.read().decode()
    finally:
        # Close the socket even when the request, response, or read raises
        # (timeout, TLS failure, dropped connection).
        conn.close()

    print(f"  HTTP {status}", flush=True)
    if status != 200:
        raise RuntimeError(f"Deepgram error {status}: {resp_body[:200]}")

    return json.loads(resp_body)


def _video_id_from_url(url):
    """Return the last path segment of *url*, ignoring query string and fragment."""
    # Drop "#fragment" and "?query" first so a trailing slash before the query
    # (".../12345/?h=abc") or a "/" inside the query cannot hide the ID.
    path = url.split("#", 1)[0].split("?", 1)[0]
    return path.rstrip("/").rsplit("/", 1)[-1]


def main():
    """Command-line entry point: download audio, transcribe it, write outputs.

    Steps, each skipped when its output already exists in ``<out>/<video_id>/``:

    1. Download audio (reuses any existing ``audio.*`` that is not ``.part``).
    2. Transcribe with Deepgram and save ``transcript.json``.
    3. Write ``transcript.txt`` with one ``[m:ss] text`` line per utterance
       (always regenerated).

    Exits with status 1 when no Deepgram API key is available or when no
    video ID can be derived from ``--url`` and ``--video-id`` is not given.
    """
    parser = argparse.ArgumentParser(description="Download and transcribe Ask Annie Vimeo session")
    parser.add_argument("--url", required=True, help="Vimeo URL")
    parser.add_argument("--out", default="out", help="Output directory")
    parser.add_argument("--deepgram-api-key", default=os.environ.get("DEEPGRAM_API_KEY", ""),
                        help="Deepgram API key (or set DEEPGRAM_API_KEY env var)")
    parser.add_argument("--video-id", default=None, help="Override video ID")
    args = parser.parse_args()

    if not args.deepgram_api_key:
        print("ERROR: Deepgram API key required. Pass --deepgram-api-key or set DEEPGRAM_API_KEY.",
              file=sys.stderr)
        sys.exit(1)

    video_id = args.video_id or _video_id_from_url(args.url)
    if not video_id:
        # An empty ID would make out_dir collapse onto --out itself and mix
        # the files of different videos together.
        print("ERROR: Could not derive a video ID from --url. Pass --video-id.", file=sys.stderr)
        sys.exit(1)
    print(f"\n=== Transcribe: {video_id} ===\n")

    out_dir = Path(args.out) / video_id
    out_dir.mkdir(parents=True, exist_ok=True)

    transcript_path = out_dir / "transcript.json"
    txt_path = out_dir / "transcript.txt"

    # Step 1: Download audio
    print("=== Step 1: Download audio ===")
    audio_path = None
    for f in out_dir.glob("audio.*"):
        if f.suffix == ".part":
            # Incomplete download fragment, not a usable cache entry.
            continue
        print(f"  [cache] {f}")
        audio_path = f
        break
    if audio_path is None:
        audio_path = download_audio(args.url, out_dir)
    print(f"  Audio: {audio_path}\n")

    # Step 2: Transcribe
    # Explicit UTF-8 keeps the files portable across platforms; JSON written
    # by earlier runs is ASCII-escaped by json.dump, so it still loads.
    if transcript_path.exists():
        print(f"=== Step 2: Transcript cached at {transcript_path} ===\n")
        with open(transcript_path, encoding="utf-8") as f:
            raw = json.load(f)
    else:
        print("=== Step 2: Transcribe with Deepgram ===")
        raw = transcribe_deepgram(audio_path, args.deepgram_api_key)
        with open(transcript_path, "w", encoding="utf-8") as f:
            json.dump(raw, f, indent=2)
        print(f"  Saved: {transcript_path}\n")

    # Step 3: Write plain text with timestamps
    utterances = raw.get("results", {}).get("utterances", [])
    duration = raw.get("metadata", {}).get("duration", 0)

    # Transcript text can contain non-ASCII characters; a locale-default
    # encoding could raise UnicodeEncodeError here.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(f"# Transcript: {video_id}\n")
        f.write(f"# URL: {args.url}\n")
        f.write(f"# Duration: {duration:.0f}s ({duration/60:.1f} min)\n")
        f.write(f"# Utterances: {len(utterances)}\n\n")
        for u in utterances:
            # Minutes are not wrapped into hours: 3725s renders as [62:05].
            m, s = divmod(int(u["start"]), 60)
            f.write(f"[{m}:{s:02d}] {u['transcript']}\n")

    print("=== Done ===")
    print(f"  Transcript JSON: {transcript_path}")
    print(f"  Transcript TXT:  {txt_path}")
    print(f"  Duration: {duration:.0f}s ({duration/60:.1f} min)")
    print(f"  Utterances: {len(utterances)}")
    print("\nNext: share transcript.txt for chapter generation, then run ingest.py")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()