Add transcribe.py — step 1 of pipeline
This commit is contained in:
155
transcribe.py
Normal file
155
transcribe.py
Normal file
@@ -0,0 +1,155 @@
|
||||
#!/usr/bin/env python3
"""
ask-annie/transcribe.py

Step 1 of the Ask Annie pipeline: download audio and transcribe with Deepgram.
Saves transcript.json ready for chapter generation.

Usage:
    python3 transcribe.py --url <vimeo_url> --out out/ [--deepgram-api-key KEY] [--video-id ID]

    The API key may also be supplied via the DEEPGRAM_API_KEY environment
    variable. --video-id overrides the ID otherwise derived from the URL.

Output:
    out/<video_id>/audio.mp3        — downloaded audio (cached)
    out/<video_id>/transcript.json  — full Deepgram transcript with timestamps
    out/<video_id>/transcript.txt   — plain text version (for human review)

After running this, share transcript.txt for chapter generation, then run:
    python3 ingest.py --url <url> --chapters chapters/<id>.json --out out/

Dependencies: yt-dlp, ffmpeg (on PATH)
"""
|
||||
|
||||
import argparse
import http.client
import json
import os
import shlex
import ssl
import subprocess
import sys
import time
from pathlib import Path
|
||||
|
||||
|
||||
def run(cmd, **kwargs):
    """Echo a command, then execute it, raising if it exits non-zero.

    Args:
        cmd: Sequence of program arguments. Items are converted with
            ``str()`` for display only; the sequence itself is passed to
            ``subprocess.run`` unchanged.
        **kwargs: Forwarded verbatim to ``subprocess.run``.

    Returns:
        The ``subprocess.CompletedProcess`` returned by ``subprocess.run``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True`` is always passed).
    """
    # Quote every argument so the echoed line can be pasted into a shell
    # verbatim; a plain join breaks on arguments such as "audio.%(ext)s" or
    # URLs containing "?" / "&".
    printable = " ".join(shlex.quote(str(arg)) for arg in cmd)
    print(f" $ {printable}", flush=True)
    return subprocess.run(cmd, check=True, **kwargs)
|
||||
|
||||
|
||||
def download_audio(url, out_dir):
    """Download the audio track of *url* into *out_dir* as MP3 using yt-dlp.

    Args:
        url: Video URL handed to yt-dlp.
        out_dir: Directory (``pathlib.Path`` or ``str``) that receives the
            audio file.

    Returns:
        ``pathlib.Path`` of the downloaded audio file: ``audio.mp3`` when it
        exists, otherwise the alphabetically first ``audio.*`` file that is
        not a ``.part`` leftover.

    Raises:
        FileNotFoundError: If no ``audio.*`` file exists after yt-dlp ran.
        Exceptions raised by ``run`` (e.g. a failing yt-dlp) propagate.
    """
    out_dir = Path(out_dir)
    # "%(ext)s" is a yt-dlp output-template placeholder, filled in by yt-dlp.
    output_template = out_dir / "audio.%(ext)s"
    run([
        "yt-dlp",
        "--extract-audio",
        "--audio-format", "mp3",
        "--audio-quality", "3",
        "-o", str(output_template),
        url,
    ])

    # MP3 was requested, so prefer that exact file over whatever else happens
    # to match the glob (glob order is arbitrary).
    expected = out_dir / "audio.mp3"
    if expected.is_file():
        return expected

    candidates = sorted(
        path for path in out_dir.glob("audio.*") if path.suffix != ".part"
    )
    if not candidates:
        raise FileNotFoundError("Audio download failed — no audio.* file found")
    return candidates[0]
|
||||
|
||||
|
||||
def transcribe_deepgram(audio_path, api_key):
    """Send full audio to Deepgram, return raw response JSON.

    The whole file is read into memory and POSTed in a single request to
    ``api.deepgram.com/v1/listen``.

    Args:
        audio_path: Path (``pathlib.Path`` or ``str``) of the audio file.
        api_key: Deepgram API key, sent as ``Authorization: Token <key>``.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        FileNotFoundError: If *audio_path* does not exist.
        RuntimeError: If Deepgram answers with a status other than 200; the
            message carries the status and the first 200 characters of the
            response body.
        json.JSONDecodeError: If a 200 response body is not valid JSON.
    """
    audio_path = Path(audio_path)
    size_mb = audio_path.stat().st_size / 1024 / 1024
    print(f" Sending {size_mb:.1f}MB to Deepgram...", flush=True)

    audio_bytes = audio_path.read_bytes()

    params = "?model=nova-3&language=en&punctuate=true&utterances=true&smart_format=true"
    ctx = ssl.create_default_context()
    conn = http.client.HTTPSConnection("api.deepgram.com", timeout=600, context=ctx)
    try:
        conn.request(
            "POST",
            f"/v1/listen{params}",
            body=audio_bytes,
            headers={
                "Authorization": f"Token {api_key}",
                "Content-Type": "audio/mp3",
            },
        )
        resp = conn.getresponse()
        status = resp.status
        # errors="replace" keeps a malformed error body from masking the
        # HTTP status with a UnicodeDecodeError.
        resp_body = resp.read().decode("utf-8", errors="replace")
    finally:
        # Always release the socket, including when the request or read fails.
        conn.close()

    print(f" HTTP {status}", flush=True)
    if status != 200:
        raise RuntimeError(f"Deepgram error {status}: {resp_body[:200]}")

    return json.loads(resp_body)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Download and transcribe Ask Annie Vimeo session")
|
||||
parser.add_argument("--url", required=True, help="Vimeo URL")
|
||||
parser.add_argument("--out", default="out", help="Output directory")
|
||||
parser.add_argument("--deepgram-api-key", default=os.environ.get("DEEPGRAM_API_KEY", ""),
|
||||
help="Deepgram API key (or set DEEPGRAM_API_KEY env var)")
|
||||
parser.add_argument("--video-id", default=None, help="Override video ID")
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.deepgram_api_key:
|
||||
print("ERROR: Deepgram API key required. Pass --deepgram-api-key or set DEEPGRAM_API_KEY.")
|
||||
sys.exit(1)
|
||||
|
||||
video_id = args.video_id or args.url.rstrip("/").split("/")[-1].split("?")[0]
|
||||
print(f"\n=== Transcribe: {video_id} ===\n")
|
||||
|
||||
out_dir = Path(args.out) / video_id
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
transcript_path = out_dir / "transcript.json"
|
||||
txt_path = out_dir / "transcript.txt"
|
||||
|
||||
# Step 1: Download audio
|
||||
print("=== Step 1: Download audio ===")
|
||||
audio_path = None
|
||||
for f in out_dir.glob("audio.*"):
|
||||
if f.suffix == ".part":
|
||||
continue
|
||||
print(f" [cache] {f}")
|
||||
audio_path = f
|
||||
break
|
||||
if audio_path is None:
|
||||
audio_path = download_audio(args.url, out_dir)
|
||||
print(f" Audio: {audio_path}\n")
|
||||
|
||||
# Step 2: Transcribe
|
||||
if transcript_path.exists():
|
||||
print(f"=== Step 2: Transcript cached at {transcript_path} ===\n")
|
||||
with open(transcript_path) as f:
|
||||
raw = json.load(f)
|
||||
else:
|
||||
print("=== Step 2: Transcribe with Deepgram ===")
|
||||
raw = transcribe_deepgram(audio_path, args.deepgram_api_key)
|
||||
with open(transcript_path, "w") as f:
|
||||
json.dump(raw, f, indent=2)
|
||||
print(f" Saved: {transcript_path}\n")
|
||||
|
||||
# Step 3: Write plain text with timestamps
|
||||
utterances = raw.get("results", {}).get("utterances", [])
|
||||
duration = raw.get("metadata", {}).get("duration", 0)
|
||||
|
||||
with open(txt_path, "w") as f:
|
||||
f.write(f"# Transcript: {video_id}\n")
|
||||
f.write(f"# URL: {args.url}\n")
|
||||
f.write(f"# Duration: {duration:.0f}s ({duration/60:.1f} min)\n")
|
||||
f.write(f"# Utterances: {len(utterances)}\n\n")
|
||||
for u in utterances:
|
||||
start = u["start"]
|
||||
m, s = divmod(int(start), 60)
|
||||
f.write(f"[{m}:{s:02d}] {u['transcript']}\n")
|
||||
|
||||
print(f"=== Done ===")
|
||||
print(f" Transcript JSON: {transcript_path}")
|
||||
print(f" Transcript TXT: {txt_path}")
|
||||
print(f" Duration: {duration:.0f}s ({duration/60:.1f} min)")
|
||||
print(f" Utterances: {len(utterances)}")
|
||||
print(f"\nNext: share transcript.txt for chapter generation, then run ingest.py")
|
||||
|
||||
|
||||
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user