Add transcribe.py — step 1 of pipeline

This commit is contained in:
2026-03-24 04:25:43 +00:00
parent 53f748487e
commit cd8707f008

155
transcribe.py Normal file
View File

@@ -0,0 +1,155 @@
#!/usr/bin/env python3
"""
ask-annie/transcribe.py
Step 1 of the Ask Annie pipeline: download audio and transcribe with Deepgram.
Saves transcript.json ready for chapter generation.
Usage:
python3 transcribe.py --url <vimeo_url> --out out/ [--deepgram-api-key KEY]
Output:
out/<video_id>/audio.mp3 — downloaded audio (cached)
out/<video_id>/transcript.json — full Deepgram transcript with timestamps
out/<video_id>/transcript.txt — plain text version (for human review)
After running this, share transcript.txt for chapter generation, then run:
python3 ingest.py --url <url> --chapters chapters/<id>.json --out out/
Dependencies: yt-dlp, ffmpeg (on PATH)
"""
import argparse
import http.client
import json
import os
import ssl
import subprocess
import sys
import time
from pathlib import Path
def run(cmd, **kwargs):
    """Echo a command line, then execute it with ``subprocess.run``.

    Args:
        cmd: Sequence of program arguments. Each item is passed through
            ``str()`` for the echoed line.
        **kwargs: Extra keyword arguments forwarded to ``subprocess.run``.
            ``check`` defaults to ``True`` but may be overridden.

    Returns:
        The ``subprocess.CompletedProcess`` produced by ``subprocess.run``.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero while
            ``check`` is true (the default).
    """
    print(f" $ {' '.join(str(c) for c in cmd)}", flush=True)
    # setdefault rather than a hard-coded check=True: a caller that passes
    # check=False now gets that behavior instead of a duplicate-keyword
    # TypeError.
    kwargs.setdefault("check", True)
    return subprocess.run(cmd, **kwargs)
def download_audio(url, out_dir):
    """Download the audio track of *url* into *out_dir* as MP3 via yt-dlp.

    Args:
        url: Media URL handed to ``yt-dlp``.
        out_dir: Directory (``str`` or ``Path``) that receives the
            ``audio.<ext>`` file.

    Returns:
        Path of the downloaded audio file: ``audio.mp3`` when it exists,
        otherwise the first remaining ``audio.*`` file in sorted order.

    Raises:
        subprocess.CalledProcessError: If ``yt-dlp`` exits with a non-zero
            status (``run`` executes it with ``check=True``).
        FileNotFoundError: If no finished ``audio.*`` file exists afterwards.
    """
    out_dir = Path(out_dir)
    # "%(ext)s" is left literal here; yt-dlp fills in the extension itself.
    output_template = out_dir / "audio.%(ext)s"
    run([
        "yt-dlp",
        "--extract-audio",
        "--audio-format", "mp3",
        "--audio-quality", "3",
        "-o", str(output_template),
        url,
    ])

    # MP3 is the format requested above, so prefer it over any other
    # audio.* file that may be left in the directory.
    mp3_path = out_dir / "audio.mp3"
    if mp3_path.is_file():
        return mp3_path

    # Fall back to whatever finished file exists. Sorting makes the choice
    # deterministic, because glob() yields entries in arbitrary order.
    candidates = sorted(
        f for f in out_dir.glob("audio.*") if f.suffix != ".part"
    )
    if not candidates:
        raise FileNotFoundError("Audio download failed — no audio.* file found")
    return candidates[0]
def transcribe_deepgram(audio_path, api_key):
    """Send the full audio file to Deepgram and return the parsed response.

    The whole file is read into memory and POSTed to ``/v1/listen`` on
    ``api.deepgram.com`` with a 600 second timeout.

    Args:
        audio_path: Path (``str`` or ``Path``) of the audio file to upload.
        api_key: Deepgram API key, sent as ``Authorization: Token <key>``.

    Returns:
        The response body decoded from JSON.

    Raises:
        FileNotFoundError: If *audio_path* does not exist.
        RuntimeError: If Deepgram answers with a status other than 200; the
            message carries the first 200 characters of the response body.
    """
    audio_path = Path(audio_path)
    print(f" Sending {audio_path.stat().st_size/1024/1024:.1f}MB to Deepgram...", flush=True)
    with open(audio_path, "rb") as f:
        audio_bytes = f.read()

    params = "?model=nova-3&language=en&punctuate=true&utterances=true&smart_format=true"
    ctx = ssl.create_default_context()
    conn = http.client.HTTPSConnection("api.deepgram.com", timeout=600, context=ctx)
    try:
        conn.request(
            "POST",
            f"/v1/listen{params}",
            body=audio_bytes,
            headers={
                "Authorization": f"Token {api_key}",
                "Content-Type": "audio/mp3",
            },
        )
        resp = conn.getresponse()
        status = resp.status
        raw_body = resp.read()
    finally:
        # Close the socket even when the request or the read fails
        # (timeout, connection reset, ...).
        conn.close()

    print(f" HTTP {status}", flush=True)
    if status != 200:
        # Decode leniently so a non-UTF-8 error page cannot hide the HTTP
        # status behind a UnicodeDecodeError.
        snippet = raw_body.decode("utf-8", errors="replace")[:200]
        raise RuntimeError(f"Deepgram error {status}: {snippet}")
    return json.loads(raw_body.decode("utf-8"))
def main():
    """Command-line entry point: download audio, transcribe it, write outputs.

    Parses ``--url``, ``--out``, ``--deepgram-api-key`` and ``--video-id``,
    then works inside ``<out>/<video_id>/``:

    1. Reuses an existing ``audio.*`` file or downloads one.
    2. Reuses an existing ``transcript.json`` or requests a new transcript
       and saves it.
    3. Writes ``transcript.txt`` with one ``[m:ss] text`` line per utterance.

    Exits with status 1 when no API key is available or when no video ID can
    be derived from the URL.
    """
    parser = argparse.ArgumentParser(description="Download and transcribe Ask Annie Vimeo session")
    parser.add_argument("--url", required=True, help="Vimeo URL")
    parser.add_argument("--out", default="out", help="Output directory")
    parser.add_argument("--deepgram-api-key", default=os.environ.get("DEEPGRAM_API_KEY", ""),
                        help="Deepgram API key (or set DEEPGRAM_API_KEY env var)")
    parser.add_argument("--video-id", default=None, help="Override video ID")
    args = parser.parse_args()

    if not args.deepgram_api_key:
        # Diagnostics go to stderr so they do not mix with progress output.
        print("ERROR: Deepgram API key required. Pass --deepgram-api-key or set DEEPGRAM_API_KEY.",
              file=sys.stderr)
        sys.exit(1)

    # Default ID: last path segment of the URL, without any query string.
    video_id = args.video_id or args.url.rstrip("/").split("/")[-1].split("?")[0]
    if not video_id:
        # An empty ID would make out_dir the --out root itself.
        print("ERROR: Could not derive a video ID from the URL. Pass --video-id.",
              file=sys.stderr)
        sys.exit(1)
    print(f"\n=== Transcribe: {video_id} ===\n")

    out_dir = Path(args.out) / video_id
    out_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = out_dir / "transcript.json"
    txt_path = out_dir / "transcript.txt"

    # Step 1: Download audio
    print("=== Step 1: Download audio ===")
    # Skip unfinished ".part" downloads. Put audio.mp3 first and sort the
    # rest so the cached pick does not depend on glob() ordering.
    cached = sorted(
        (f for f in out_dir.glob("audio.*") if f.suffix != ".part"),
        key=lambda f: (f.suffix != ".mp3", f.name),
    )
    if cached:
        audio_path = cached[0]
        print(f" [cache] {audio_path}")
    else:
        audio_path = download_audio(args.url, out_dir)
    print(f" Audio: {audio_path}\n")

    # Step 2: Transcribe
    if transcript_path.exists():
        print(f"=== Step 2: Transcript cached at {transcript_path} ===\n")
        with open(transcript_path, encoding="utf-8") as f:
            raw = json.load(f)
    else:
        print("=== Step 2: Transcribe with Deepgram ===")
        raw = transcribe_deepgram(audio_path, args.deepgram_api_key)
        with open(transcript_path, "w", encoding="utf-8") as f:
            json.dump(raw, f, indent=2)
        print(f" Saved: {transcript_path}\n")

    # Step 3: Write plain text with timestamps
    utterances = raw.get("results", {}).get("utterances", [])
    duration = raw.get("metadata", {}).get("duration", 0)

    # Build every line before opening the file, so a malformed utterance
    # cannot leave a truncated transcript.txt behind.
    lines = [
        f"# Transcript: {video_id}\n",
        f"# URL: {args.url}\n",
        f"# Duration: {duration:.0f}s ({duration/60:.1f} min)\n",
        f"# Utterances: {len(utterances)}\n\n",
    ]
    for u in utterances:
        m, s = divmod(int(u["start"]), 60)
        lines.append(f"[{m}:{s:02d}] {u['transcript']}\n")

    # Explicit UTF-8: transcript text may contain non-ASCII characters that
    # the locale's default encoding cannot represent.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.writelines(lines)

    print("=== Done ===")
    print(f" Transcript JSON: {transcript_path}")
    print(f" Transcript TXT: {txt_path}")
    print(f" Duration: {duration:.0f}s ({duration/60:.1f} min)")
    print(f" Utterances: {len(utterances)}")
    print("\nNext: share transcript.txt for chapter generation, then run ingest.py")
# Run the transcription step only when this file is executed as a script;
# importing it as a module must not start a download.
if __name__ == "__main__":
    main()