#!/usr/bin/env python3
"""
ask-annie/transcribe.py

Step 1 of the Ask Annie pipeline: download audio and transcribe with Deepgram.
Saves transcript.json ready for chapter generation.

Usage:
    python3 transcribe.py --url <vimeo_url> --out out/ [--deepgram-api-key KEY]

Output:
    out/<video_id>/audio.mp3        — downloaded audio (cached)
    out/<video_id>/transcript.json  — full Deepgram transcript with timestamps
    out/<video_id>/transcript.txt   — plain text version (for human review)

After running this, share transcript.txt for chapter generation, then run:
    python3 ingest.py --url <vimeo_url> --chapters chapters/<video_id>.json --out out/

Dependencies: yt-dlp, ffmpeg (on PATH)
"""

import argparse
import http.client
import json
import os
import ssl
import subprocess
import sys
import time
from pathlib import Path
from urllib.parse import urlsplit

# Suffix prefixes of in-progress download artifacts that must never be mistaken
# for finished audio: "audio.<ext>.part", "audio.<ext>.part-FragN" and
# "audio.<ext>.ytdl".
_TEMP_SUFFIX_PREFIXES = (".part", ".ytdl")


def run(cmd, **kwargs):
    """Echo *cmd* to stdout, run it, and return the CompletedProcess.

    Extra keyword arguments are forwarded to ``subprocess.run``.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero
            (``check=True`` is always set).
    """
    print(f" $ {' '.join(str(c) for c in cmd)}", flush=True)
    return subprocess.run(cmd, check=True, **kwargs)


def _find_audio(out_dir):
    """Return a finished ``audio.*`` file in *out_dir*, or None if there is none.

    Temporary download artifacts are ignored, ``audio.mp3`` is preferred when
    present, and otherwise the alphabetically first candidate is returned so
    the choice is deterministic.
    """
    candidates = sorted(
        p
        for p in out_dir.glob("audio.*")
        if p.is_file() and not p.suffix.startswith(_TEMP_SUFFIX_PREFIXES)
    )
    for candidate in candidates:
        if candidate.suffix == ".mp3":
            return candidate
    return candidates[0] if candidates else None


def download_audio(url, out_dir):
    """Download the audio track of *url* into *out_dir* as MP3 via yt-dlp.

    Args:
        url: media URL handed to yt-dlp.
        out_dir: existing directory (``pathlib.Path``) to download into.

    Returns:
        Path of the downloaded ``audio.*`` file.

    Raises:
        subprocess.CalledProcessError: if yt-dlp exits non-zero.
        FileNotFoundError: if no finished ``audio.*`` file exists afterwards.
    """
    audio_path = out_dir / "audio.%(ext)s"
    run([
        "yt-dlp",
        "--extract-audio",
        "--audio-format", "mp3",
        "--audio-quality", "3",
        "-o", str(audio_path),
        url,
    ])
    found = _find_audio(out_dir)
    if found is None:
        raise FileNotFoundError("Audio download failed — no audio.* file found")
    return found


def transcribe_deepgram(audio_path, api_key):
    """Send full audio to Deepgram, return raw response JSON.

    Args:
        audio_path: ``pathlib.Path`` of the audio file; it is read fully into
            memory and sent as the request body.
        api_key: Deepgram API key, sent as ``Authorization: Token <key>``.

    Returns:
        The decoded JSON response.

    Raises:
        RuntimeError: if Deepgram answers with a status other than 200.
    """
    print(f" Sending {audio_path.stat().st_size/1024/1024:.1f}MB to Deepgram...", flush=True)

    audio_bytes = audio_path.read_bytes()

    params = "?model=nova-3&language=en&punctuate=true&utterances=true&smart_format=true"
    ctx = ssl.create_default_context()
    conn = http.client.HTTPSConnection("api.deepgram.com", timeout=600, context=ctx)
    try:
        conn.request(
            "POST",
            f"/v1/listen{params}",
            body=audio_bytes,
            headers={
                "Authorization": f"Token {api_key}",
                "Content-Type": "audio/mp3",
            },
        )
        resp = conn.getresponse()
        status = resp.status
        # Lenient decode: an undecodable error body must not hide the HTTP
        # status behind a UnicodeDecodeError.
        resp_body = resp.read().decode("utf-8", errors="replace")
    finally:
        # Always release the socket, including on timeouts and network errors.
        conn.close()

    print(f" HTTP {status}", flush=True)
    if status != 200:
        raise RuntimeError(f"Deepgram error {status}: {resp_body[:200]}")

    return json.loads(resp_body)


def _video_id_from_url(url):
    """Return the last non-empty path segment of *url*, or "" if there is none.

    Query string and fragment are ignored, so ``.../123456/?h=abc`` gives
    ``"123456"``.
    """
    return urlsplit(url).path.rstrip("/").rsplit("/", 1)[-1]


def _write_json_atomic(path, payload):
    """Write *payload* as indented JSON to *path* via a temp file and rename.

    An interrupted run therefore never leaves a truncated file at *path* for
    the cache check to pick up later.
    """
    tmp_path = path.with_name(path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)
    os.replace(tmp_path, path)


def _format_timestamp(seconds):
    """Format a start offset in seconds as ``[M:SS]`` (minutes are not capped at 59)."""
    minutes, secs = divmod(int(seconds), 60)
    return f"[{minutes}:{secs:02d}]"


def main(argv=None):
    """Command-line entry point: download audio, transcribe, write the transcripts.

    Args:
        argv: optional argument list; defaults to ``sys.argv[1:]``.

    Existing audio and ``transcript.json`` under ``<out>/<video_id>/`` are
    reused instead of being downloaded or transcribed again. Exits with
    status 1 when the API key is missing or no video ID can be derived.
    """
    parser = argparse.ArgumentParser(description="Download and transcribe Ask Annie Vimeo session")
    parser.add_argument("--url", required=True, help="Vimeo URL")
    parser.add_argument("--out", default="out", help="Output directory")
    parser.add_argument("--deepgram-api-key", default=os.environ.get("DEEPGRAM_API_KEY", ""),
                        help="Deepgram API key (or set DEEPGRAM_API_KEY env var)")
    parser.add_argument("--video-id", default=None, help="Override video ID")
    args = parser.parse_args(argv)

    if not args.deepgram_api_key:
        print("ERROR: Deepgram API key required. Pass --deepgram-api-key or set DEEPGRAM_API_KEY.")
        sys.exit(1)

    video_id = args.video_id or _video_id_from_url(args.url)
    if not video_id:
        # An empty ID would make every session share the bare output directory.
        print("ERROR: Could not derive a video ID from the URL. Pass --video-id.")
        sys.exit(1)
    print(f"\n=== Transcribe: {video_id} ===\n")

    out_dir = Path(args.out) / video_id
    out_dir.mkdir(parents=True, exist_ok=True)

    transcript_path = out_dir / "transcript.json"
    txt_path = out_dir / "transcript.txt"

    # Step 1: Download audio
    print("=== Step 1: Download audio ===")
    audio_path = _find_audio(out_dir)
    if audio_path is not None:
        print(f" [cache] {audio_path}")
    else:
        audio_path = download_audio(args.url, out_dir)
    print(f" Audio: {audio_path}\n")

    # Step 2: Transcribe
    if transcript_path.exists():
        print(f"=== Step 2: Transcript cached at {transcript_path} ===\n")
        with open(transcript_path, encoding="utf-8") as f:
            raw = json.load(f)
    else:
        print("=== Step 2: Transcribe with Deepgram ===")
        raw = transcribe_deepgram(audio_path, args.deepgram_api_key)
        _write_json_atomic(transcript_path, raw)
        print(f" Saved: {transcript_path}\n")

    # Step 3: Write plain text with timestamps
    # "or" fallbacks also cover keys that are present but null.
    utterances = raw.get("results", {}).get("utterances") or []
    duration = raw.get("metadata", {}).get("duration") or 0

    lines = [
        f"# Transcript: {video_id}\n",
        f"# URL: {args.url}\n",
        f"# Duration: {duration:.0f}s ({duration/60:.1f} min)\n",
        f"# Utterances: {len(utterances)}\n\n",
    ]
    lines.extend(
        f"{_format_timestamp(u['start'])} {u['transcript']}\n" for u in utterances
    )
    # Explicit UTF-8 so non-ASCII transcript text does not depend on the locale.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.writelines(lines)

    print("=== Done ===")
    print(f" Transcript JSON: {transcript_path}")
    print(f" Transcript TXT: {txt_path}")
    print(f" Duration: {duration:.0f}s ({duration/60:.1f} min)")
    print(f" Utterances: {len(utterances)}")
    print("\nNext: share transcript.txt for chapter generation, then run ingest.py")


if __name__ == "__main__":
    main()