Add batch_transcribe.py

2026-03-24 04:27:51 +00:00
parent cd8707f008
commit 6b3a2e739a
1 changed files with 149 additions and 0 deletions
--- a/batch_transcribe.py
+++ b/batch_transcribe.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+ask-annie/batch_transcribe.py
+
+Batch download + transcribe all sessions in ask-annie-videos-list.txt
+Skips any session whose transcript.json and transcript.txt already exist.
+
+Usage:
+  DEEPGRAM_API_KEY=xxx python3 batch_transcribe.py [--out ./out] [--delay 45]
+
+After this completes, git add/commit/push the out/ directory, then ping Clawd
+to generate chapters for each session.
+"""
+
+import argparse
+import http.client
+import json
+import os
+import ssl
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+
+def run(cmd, **kwargs):
+    """Echo a command line, then execute it and return the finished process.
+
+    Args:
+        cmd: Sequence of arguments. Each item is rendered with ``str()`` for
+            the echoed line; the sequence itself is handed to
+            ``subprocess.run`` unchanged.
+        **kwargs: Extra keyword arguments forwarded to ``subprocess.run``.
+
+    Returns:
+        The ``subprocess.CompletedProcess`` of the command.
+
+    Raises:
+        subprocess.CalledProcessError: If the command exits with a non-zero
+            status (``check=True`` is always passed).
+    """
+    rendered = " ".join(map(str, cmd))
+    print(f"  $ {rendered}", flush=True)
+    return subprocess.run(cmd, check=True, **kwargs)
+
+
+def download_audio(url, out_dir):
+    """Download the audio of ``url`` into ``out_dir`` as MP3 using yt-dlp.
+
+    Args:
+        url: Video URL handed to yt-dlp.
+        out_dir: ``pathlib.Path`` of the directory that receives the file;
+            the output template is ``audio.%(ext)s`` inside it.
+
+    Returns:
+        Path of the downloaded audio file.
+
+    Raises:
+        subprocess.CalledProcessError: If yt-dlp exits non-zero.
+        FileNotFoundError: If no ``audio.*`` file exists after the download.
+    """
+    template = out_dir / "audio.%(ext)s"
+    run(["yt-dlp", "--extract-audio", "--audio-format", "mp3",
+         "--audio-quality", "3", "-o", str(template), url])
+
+    # MP3 was requested explicitly, so that file is the real result. Check
+    # for it first instead of trusting whatever the glob yields first.
+    expected = out_dir / "audio.mp3"
+    if expected.is_file():
+        return expected
+
+    # Fallback: any other finished audio.* file. Skip partial downloads and
+    # yt-dlp's download-state files, and sort so the choice is deterministic.
+    candidates = sorted(
+        f for f in out_dir.glob("audio.*")
+        if f.is_file() and f.suffix not in (".part", ".ytdl")
+    )
+    if not candidates:
+        raise FileNotFoundError("Audio download failed")
+    return candidates[0]
+
+
+def transcribe_deepgram(audio_path, api_key):
+    """Send an audio file to Deepgram's ``/v1/listen`` endpoint.
+
+    The whole file is read into memory and POSTed in one request with the
+    ``nova-3`` model, English language, punctuation, utterances and smart
+    formatting enabled.
+
+    Args:
+        audio_path: ``pathlib.Path`` of the audio file to upload.
+        api_key: Deepgram API key, sent as ``Authorization: Token <key>``.
+
+    Returns:
+        The response body parsed from JSON.
+
+    Raises:
+        RuntimeError: If Deepgram answers with a status other than 200; the
+            message carries the first 200 characters of the response body.
+        OSError: On connection or file errors.
+    """
+    print(f"  Sending {audio_path.stat().st_size/1024/1024:.1f}MB to Deepgram...", flush=True)
+    audio_bytes = audio_path.read_bytes()
+    params = "?model=nova-3&language=en&punctuate=true&utterances=true&smart_format=true"
+    ctx = ssl.create_default_context()
+    conn = http.client.HTTPSConnection("api.deepgram.com", timeout=600, context=ctx)
+    try:
+        conn.request("POST", f"/v1/listen{params}", body=audio_bytes,
+                     headers={"Authorization": f"Token {api_key}", "Content-Type": "audio/mp3"})
+        resp = conn.getresponse()
+        status = resp.status
+        # Decode leniently: an error page that is not valid UTF-8 must not
+        # hide the HTTP status behind a UnicodeDecodeError.
+        resp_body = resp.read().decode("utf-8", errors="replace")
+    finally:
+        # Always release the socket, including on timeouts and resets.
+        conn.close()
+    print(f"  HTTP {status}", flush=True)
+    if status != 200:
+        raise RuntimeError(f"Deepgram error {status}: {resp_body[:200]}")
+    return json.loads(resp_body)
+
+
+def _load_cached_transcript(transcript_path):
+    """Return the transcript dict stored at ``transcript_path``, or None if it is missing or unusable."""
+    try:
+        with open(transcript_path, encoding="utf-8") as f:
+            data = json.load(f)
+    except (FileNotFoundError, ValueError):
+        # ValueError covers json.JSONDecodeError (e.g. a truncated file).
+        return None
+    return data if isinstance(data, dict) else None
+
+
+def _write_text_atomic(path, text):
+    """Write ``text`` to ``path`` through a sibling temp file so ``path`` is never left half-written."""
+    tmp = path.with_name(path.name + ".tmp")
+    with open(tmp, "w", encoding="utf-8") as f:
+        f.write(text)
+    tmp.replace(path)
+
+
+def process_video(url, out_base, api_key):
+    """Download, transcribe and write the transcript files for one video.
+
+    Outputs go to ``out_base/<video_id>/``: ``transcript.json`` (the raw
+    Deepgram response) and ``transcript.txt`` (one timestamped line per
+    utterance). Work that is already on disk is reused: both files present
+    means the video is skipped, an existing ``transcript.json`` is turned
+    into the text file without another API call, and an existing
+    ``audio.*`` file is not downloaded again.
+
+    Args:
+        url: Video URL. The id is the last path segment with any query
+            string removed.
+        out_base: ``pathlib.Path`` of the root output directory.
+        api_key: Deepgram API key.
+
+    Returns:
+        ``(True, video_id)``.
+
+    Raises:
+        ValueError: If no video id can be derived from ``url``.
+        Exception: Whatever the download or transcription step raises.
+    """
+    # NOTE: for URLs that carry the id in the query string (e.g. ``watch?v=``)
+    # this yields the last path segment, not the query value.
+    video_id = url.rstrip("/").split("/")[-1].split("?")[0]
+    if not video_id:
+        # An empty id would make out_dir equal to out_base itself.
+        raise ValueError(f"Cannot derive a video id from URL: {url!r}")
+    out_dir = out_base / video_id
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    transcript_path = out_dir / "transcript.json"
+    txt_path = out_dir / "transcript.txt"
+
+    if transcript_path.exists() and txt_path.exists():
+        print(f"  [skip] {video_id} — transcript already exists")
+        return True, video_id
+
+    # Reuse a transcript left by an earlier run that stopped before the text
+    # file was written, so the audio is not transcribed a second time.
+    raw = _load_cached_transcript(transcript_path)
+    if raw is not None:
+        print(f"  [cache] transcript: {transcript_path.name}")
+    else:
+        # Download audio unless a finished file is already present.
+        audio_path = next(
+            (f for f in sorted(out_dir.glob("audio.*")) if f.suffix != ".part"),
+            None,
+        )
+        if audio_path is not None:
+            print(f"  [cache] audio: {audio_path.name}")
+        else:
+            print(f"  Downloading audio...")
+            audio_path = download_audio(url, out_dir)
+
+        # Transcribe
+        raw = transcribe_deepgram(audio_path, api_key)
+        _write_text_atomic(transcript_path, json.dumps(raw, indent=2))
+
+    # Build the plain-text transcript fully in memory, then write it in one
+    # atomic step: a failure while formatting must not leave a truncated
+    # transcript.txt that later runs would treat as complete.
+    utterances = raw.get("results", {}).get("utterances") or []
+    duration = raw.get("metadata", {}).get("duration") or 0
+    lines = [
+        f"# Transcript: {video_id}\n",
+        f"# URL: {url}\n",
+        f"# Duration: {duration:.0f}s ({duration/60:.1f} min)\n\n",
+    ]
+    for u in utterances:
+        m, s = divmod(int(u["start"]), 60)
+        lines.append(f"[{m}:{s:02d}] {u['transcript']}\n")
+    _write_text_atomic(txt_path, "".join(lines))
+
+    print(f"  ✅ {video_id} — {duration/60:.1f} min, {len(utterances)} utterances")
+    return True, video_id
+
+
+def main():
+    """Command-line entry point: transcribe every URL in the videos list.
+
+    Reads one URL per line from ``--videos`` (blank lines and lines whose
+    first non-blank character is ``#`` are ignored), calls
+    ``process_video`` for each, waits ``--delay`` seconds between videos and
+    prints a summary of successes and failures. A failure on one video is
+    reported and does not stop the batch.
+
+    Exits with status 1 if no Deepgram API key is available from
+    ``--deepgram-api-key`` or the ``DEEPGRAM_API_KEY`` environment variable.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--videos", default="ask-annie-videos-list.txt")
+    parser.add_argument("--out", default="out")
+    parser.add_argument("--delay", type=int, default=45,
+                        help="Seconds to wait between videos (default: 45)")
+    parser.add_argument("--deepgram-api-key", default=os.environ.get("DEEPGRAM_API_KEY", ""))
+    args = parser.parse_args()
+
+    if not args.deepgram_api_key:
+        print("ERROR: Set DEEPGRAM_API_KEY or pass --deepgram-api-key")
+        sys.exit(1)
+
+    with open(args.videos) as f:
+        # Strip first, then test for "#", so indented comment lines are
+        # ignored too instead of being treated as URLs.
+        stripped = (line.strip() for line in f)
+        urls = [u for u in stripped if u and not u.startswith("#")]
+
+    out_base = Path(args.out)
+    print(f"=== Batch transcribe: {len(urls)} videos ===\n")
+
+    done, failed = [], []
+    for i, url in enumerate(urls):
+        video_id = url.rstrip("/").split("/")[-1].split("?")[0]
+        print(f"\n[{i+1}/{len(urls)}] {video_id}")
+        print(f"  URL: {url}")
+        try:
+            _, vid = process_video(url, out_base, args.deepgram_api_key)
+            done.append(vid)
+        except Exception as e:
+            # Top-level boundary: record the failure and keep going so one
+            # bad video does not abort the rest of the batch.
+            print(f"  ❌ FAILED: {e}")
+            failed.append((video_id, str(e)))
+
+        # Delay between videos (skip after last). A zero or negative delay
+        # means "no wait"; time.sleep() rejects negative values.
+        if i < len(urls) - 1 and args.delay > 0:
+            print(f"  Waiting {args.delay}s before next video...")
+            time.sleep(args.delay)
+
+    print(f"\n=== Done: {len(done)} ok, {len(failed)} failed ===")
+    for vid, err in failed:
+        print(f"  ❌ {vid}: {err}")
+    print(f"\nNext: git add out/ && git commit -m 'batch transcripts' && git push")
+    print("Then ping Clawd to generate chapters for each session.")
+
+# Run the batch only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()