Add transcribe.py — step 1 of pipeline

2026-03-24 04:25:43 +00:00
parent 53f748487e
commit cd8707f008
1 changed files with 155 additions and 0 deletions
--- a/transcribe.py
+++ b/transcribe.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+"""
+ask-annie/transcribe.py
+
+Step 1 of the Ask Annie pipeline: download audio and transcribe with Deepgram.
+Saves transcript.json ready for chapter generation.
+
+Usage:
+  python3 transcribe.py --url <vimeo_url> --out out/ [--deepgram-api-key KEY] [--video-id ID]
+
+Output:
+  out/<video_id>/audio.mp3         — downloaded audio (cached)
+  out/<video_id>/transcript.json   — full Deepgram transcript with timestamps
+  out/<video_id>/transcript.txt    — plain text version (for human review)
+
+After running this, share transcript.txt for chapter generation, then run:
+  python3 ingest.py --url <url> --chapters chapters/<id>.json --out out/
+
+Dependencies: yt-dlp, ffmpeg (on PATH)
+"""
+
+import argparse
+import http.client
+import json
+import os
+import ssl
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+
+def run(cmd, **kwargs):
+    """Echo a command line, execute it, and return the completed process.
+
+    The command is printed as ``  $ arg1 arg2 ...`` (arguments joined with
+    single spaces, no shell quoting). Extra keyword arguments are forwarded
+    to ``subprocess.run``; ``check=True`` is always set, so a non-zero exit
+    status raises ``subprocess.CalledProcessError``.
+    """
+    rendered = " ".join(map(str, cmd))
+    print(f"  $ {rendered}", flush=True)
+    return subprocess.run(cmd, check=True, **kwargs)
+
+
+def download_audio(url, out_dir):
+    """Download the audio track of *url* into *out_dir* as MP3 using yt-dlp.
+
+    Args:
+        url: Media URL, passed to yt-dlp unchanged.
+        out_dir: ``pathlib.Path`` of an existing directory for the output.
+
+    Returns:
+        Path of the downloaded file: ``out_dir / "audio.mp3"`` when present,
+        otherwise the alphabetically first non-``.part`` ``audio.*`` file.
+
+    Raises:
+        subprocess.CalledProcessError: yt-dlp exited with a non-zero status.
+        FileNotFoundError: yt-dlp finished but no ``audio.*`` file exists.
+    """
+    # %(ext)s is a yt-dlp output-template field, filled in by yt-dlp itself.
+    output_template = out_dir / "audio.%(ext)s"
+    run([
+        "yt-dlp",
+        "--extract-audio",
+        "--audio-format", "mp3",
+        "--audio-quality", "3",
+        "-o", str(output_template),
+        url,
+    ])
+
+    # --audio-format mp3 means the finished file should be audio.mp3; look for
+    # it first so a leftover intermediate file can never be picked instead.
+    expected = out_dir / "audio.mp3"
+    if expected.is_file():
+        return expected
+
+    # Fallback: glob order is unspecified, so sort for a reproducible choice.
+    candidates = sorted(f for f in out_dir.glob("audio.*") if f.suffix != ".part")
+    if not candidates:
+        raise FileNotFoundError("Audio download failed — no audio.* file found")
+    return candidates[0]
+
+
+def transcribe_deepgram(audio_path, api_key):
+    """Send the full audio file to Deepgram and return the parsed JSON response.
+
+    The file is read into memory and uploaded in a single POST to
+    ``/v1/listen`` on ``api.deepgram.com`` with the ``nova-3`` model, English
+    language, punctuation, utterances and smart formatting enabled.
+
+    Args:
+        audio_path: ``pathlib.Path`` of the audio file to upload.
+        api_key: Deepgram API key, sent as ``Authorization: Token <key>``.
+
+    Returns:
+        The response body decoded from JSON.
+
+    Raises:
+        RuntimeError: Deepgram answered with an HTTP status other than 200;
+            the message carries the first 200 characters of the body.
+    """
+    print(f"  Sending {audio_path.stat().st_size/1024/1024:.1f}MB to Deepgram...", flush=True)
+
+    audio_bytes = audio_path.read_bytes()
+
+    params = "?model=nova-3&language=en&punctuate=true&utterances=true&smart_format=true"
+    ctx = ssl.create_default_context()
+    # Generous timeout: the request blocks until Deepgram has processed the file.
+    conn = http.client.HTTPSConnection("api.deepgram.com", timeout=600, context=ctx)
+    try:
+        conn.request(
+            "POST",
+            f"/v1/listen{params}",
+            body=audio_bytes,
+            headers={
+                "Authorization": f"Token {api_key}",
+                "Content-Type": "audio/mp3",
+            },
+        )
+        resp = conn.getresponse()
+        status = resp.status
+        # errors="replace" keeps a malformed error body from masking the
+        # HTTP status with a UnicodeDecodeError.
+        resp_body = resp.read().decode("utf-8", errors="replace")
+    finally:
+        # Always release the socket, even when the upload or read fails.
+        conn.close()
+
+    print(f"  HTTP {status}", flush=True)
+    if status != 200:
+        raise RuntimeError(f"Deepgram error {status}: {resp_body[:200]}")
+
+    return json.loads(resp_body)
+
+
+def _video_id_from_url(url):
+    """Return the last path segment of *url*, ignoring query, fragment and trailing slashes."""
+    path = url.partition("#")[0].partition("?")[0]
+    return path.rstrip("/").rpartition("/")[2]
+
+
+def main():
+    """CLI entry point: download audio, transcribe it, write transcript files.
+
+    Steps (each one reuses files already present in ``<out>/<video_id>/``):
+      1. Download the audio with yt-dlp unless an ``audio.*`` file exists.
+      2. Transcribe with Deepgram unless ``transcript.json`` exists.
+      3. Always (re)write ``transcript.txt`` with one ``[m:ss] text`` line
+         per utterance.
+
+    Exits with status 1 when no Deepgram API key is available or no usable
+    video ID can be determined.
+    """
+    parser = argparse.ArgumentParser(description="Download and transcribe Ask Annie Vimeo session")
+    parser.add_argument("--url", required=True, help="Vimeo URL")
+    parser.add_argument("--out", default="out", help="Output directory")
+    parser.add_argument("--deepgram-api-key", default=os.environ.get("DEEPGRAM_API_KEY", ""),
+                        help="Deepgram API key (or set DEEPGRAM_API_KEY env var)")
+    parser.add_argument("--video-id", default=None, help="Override video ID")
+    args = parser.parse_args()
+
+    if not args.deepgram_api_key:
+        print("ERROR: Deepgram API key required. Pass --deepgram-api-key or set DEEPGRAM_API_KEY.",
+              file=sys.stderr)
+        sys.exit(1)
+
+    video_id = args.video_id or _video_id_from_url(args.url)
+    # An empty or dot-only ID would make out_dir resolve to --out itself (or
+    # its parent), mixing the files of different videos together.
+    if video_id in ("", ".", ".."):
+        print(f"ERROR: could not derive a video ID from {args.url!r}. Pass --video-id.",
+              file=sys.stderr)
+        sys.exit(1)
+    print(f"\n=== Transcribe: {video_id} ===\n")
+
+    out_dir = Path(args.out) / video_id
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    transcript_path = out_dir / "transcript.json"
+    txt_path = out_dir / "transcript.txt"
+
+    # Step 1: Download audio
+    print("=== Step 1: Download audio ===")
+    # Sorted so the cached file chosen does not depend on directory order;
+    # ".part" files are incomplete downloads and never count as cached.
+    audio_path = next(
+        (f for f in sorted(out_dir.glob("audio.*")) if f.suffix != ".part"),
+        None,
+    )
+    if audio_path is None:
+        audio_path = download_audio(args.url, out_dir)
+    else:
+        print(f"  [cache] {audio_path}")
+    print(f"  Audio: {audio_path}\n")
+
+    # Step 2: Transcribe
+    if transcript_path.exists():
+        print(f"=== Step 2: Transcript cached at {transcript_path} ===\n")
+        with open(transcript_path, encoding="utf-8") as f:
+            raw = json.load(f)
+    else:
+        print("=== Step 2: Transcribe with Deepgram ===")
+        raw = transcribe_deepgram(audio_path, args.deepgram_api_key)
+        with open(transcript_path, "w", encoding="utf-8") as f:
+            json.dump(raw, f, indent=2)
+        print(f"  Saved: {transcript_path}\n")
+
+    # Step 3: Write plain text with timestamps
+    utterances = raw.get("results", {}).get("utterances", [])
+    duration = raw.get("metadata", {}).get("duration", 0)
+
+    # Explicit UTF-8: transcript text may contain characters the platform's
+    # default encoding cannot represent.
+    with open(txt_path, "w", encoding="utf-8") as f:
+        f.write(f"# Transcript: {video_id}\n")
+        f.write(f"# URL: {args.url}\n")
+        f.write(f"# Duration: {duration:.0f}s ({duration/60:.1f} min)\n")
+        f.write(f"# Utterances: {len(utterances)}\n\n")
+        for u in utterances:
+            # Whole seconds are enough for a human-readable [m:ss] stamp.
+            m, s = divmod(int(u["start"]), 60)
+            f.write(f"[{m}:{s:02d}] {u['transcript']}\n")
+
+    print("=== Done ===")
+    print(f"  Transcript JSON: {transcript_path}")
+    print(f"  Transcript TXT:  {txt_path}")
+    print(f"  Duration: {duration:.0f}s ({duration/60:.1f} min)")
+    print(f"  Utterances: {len(utterances)}")
+    print("\nNext: share transcript.txt for chapter generation, then run ingest.py")
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()