411 lines
14 KiB
Python
411 lines
14 KiB
Python
#!/usr/bin/env python3
"""
ask-annie/ingest.py

Given a Vimeo URL and a chapter list (JSON file), produces structured
per-chapter knowledge chunks suitable for ingestion into knowledge-mcp.

Usage:
    python3 ingest.py --url <vimeo_url> --chapters chapters/XXXX.json --out out/ [--frames]

Output:
    out/<video_id>/chunks.json      — array of chapter chunks
    out/<video_id>/frames/          — extracted frame images (if --frames)
    out/<video_id>/transcript.json  — full reassembled transcript (cached)
    out/<video_id>/audio_chunks/    — per-chapter audio splits (cached)

Dependencies (must be on PATH):
    yt-dlp, ffmpeg

No third-party Python packages are required: the Fish Audio ASR upload is
done with the standard library (urllib) only.
"""
|
||
|
||
import argparse
|
||
import json
|
||
import math
|
||
import os
|
||
import subprocess
|
||
import sys
|
||
import time
|
||
import urllib.request
|
||
import uuid
|
||
from pathlib import Path
|
||
|
||
|
||
# Fish Audio speech-to-text endpoint (multipart/form-data upload, Bearer auth).
FISH_AUDIO_ASR_URL = "https://api.fish.audio/v1/asr"

# Chapters where frame extraction adds real value (demo-heavy sections).
# Matched case-insensitively as substrings of the chapter title — see
# is_demo_chapter().
DEMO_CHAPTER_KEYWORDS = [
    "transcoding",
    "s3",
    "file tracking",
    "workbench",
    "new capability",
    "preview",
    "demo",
    "ui",
    "setup",
]

# Target chunk size in seconds — split audio into groups of chapters
# totalling no more than this before sending to ASR.
# 10 min = 600s keeps each chunk well under Fish Audio's size limit.
CHUNK_MAX_SECONDS = 600
|
||
|
||
|
||
def run(cmd, **kwargs):
    """Echo *cmd* to the console, then execute it.

    Raises subprocess.CalledProcessError on a non-zero exit (check=True);
    any extra keyword arguments are forwarded to subprocess.run.
    """
    printable = " ".join(str(part) for part in cmd)
    print(f" $ {printable}", flush=True)
    return subprocess.run(cmd, check=True, **kwargs)
|
||
|
||
|
||
def parse_timestamp(ts_str):
    """Parse 'M:SS' or 'H:MM:SS' to seconds."""
    fields = [int(piece) for piece in ts_str.strip().split(":")]
    if len(fields) not in (2, 3):
        raise ValueError(f"Cannot parse timestamp: {ts_str}")
    # Horner evaluation in base 60: (h*60 + m)*60 + s, or m*60 + s.
    total = 0
    for value in fields:
        total = total * 60 + value
    return total
|
||
|
||
|
||
def download_audio(url, out_dir):
    """Download the audio track of *url* as MP3 into *out_dir* via yt-dlp.

    Returns the path of the downloaded file.  Raises FileNotFoundError
    when yt-dlp produced nothing usable.
    """
    template = out_dir / "audio.%(ext)s"
    cmd = [
        "yt-dlp",
        "--extract-audio",
        "--audio-format", "mp3",
        "--audio-quality", "3",
        "-o", str(template),
        url,
    ]
    run(cmd)
    # yt-dlp decides the final extension; skip any leftover partial downloads.
    for candidate in out_dir.glob("audio.*"):
        if candidate.suffix != ".part":
            return candidate
    raise FileNotFoundError("Audio download failed — no audio.* file found")
|
||
|
||
|
||
def get_audio_duration(audio_path):
    """Return audio duration in seconds using ffprobe."""
    cmd = [
        "ffprobe", "-v", "quiet", "-print_format", "json",
        "-show_format", str(audio_path),
    ]
    # ffprobe prints container metadata as JSON on stdout.
    probe = subprocess.run(cmd, capture_output=True, text=True, check=True)
    metadata = json.loads(probe.stdout)
    return float(metadata["format"]["duration"])
|
||
|
||
|
||
def split_audio_chunk(audio_path, start_sec, end_sec, out_path):
    """Extract [start_sec, end_sec] from audio_path into out_path with ffmpeg."""
    cmd = [
        "ffmpeg", "-y",
        "-ss", str(start_sec),
        "-to", str(end_sec),
        "-i", str(audio_path),
        "-c", "copy",  # stream-copy: no re-encode, fast and lossless
        str(out_path),
    ]
    # ffmpeg is chatty on stderr even on success; silence both streams.
    run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||
|
||
|
||
def transcribe_chunk_fish(chunk_path, api_key, start_offset_sec):
    """
    Send a chunk to Fish Audio ASR. Returns a list of segments with
    timestamps adjusted by start_offset_sec.
    Uses stdlib urllib + manual multipart encoding (no requests dep).

    Each returned segment is a dict {"start": sec, "end": sec, "text": str}
    with times expressed on the full-audio timeline (chunk-local times
    shifted by start_offset_sec).
    """
    print(f" → Fish Audio ASR: {chunk_path.name} (offset +{start_offset_sec}s)", flush=True)

    # Build multipart/form-data manually
    # A random 32-hex boundary is vanishingly unlikely to occur in the payload.
    boundary = uuid.uuid4().hex
    with open(chunk_path, "rb") as f:
        audio_data = f.read()

    def field(name, value):
        # One text form field, CRLF-framed per the multipart format.
        return (
            f"--{boundary}\r\n"
            f'Content-Disposition: form-data; name="{name}"\r\n\r\n'
            f"{value}\r\n"
        ).encode()

    body = (
        field("language", "en") +
        # "false" => request per-segment timestamps from the API
        field("ignore_timestamps", "false") +
        # File part: headers are text, payload is raw bytes, then the
        # closing boundary terminates the body.
        f"--{boundary}\r\n".encode() +
        f'Content-Disposition: form-data; name="audio"; filename="{chunk_path.name}"\r\n'.encode() +
        b"Content-Type: audio/mpeg\r\n\r\n" +
        audio_data +
        f"\r\n--{boundary}--\r\n".encode()
    )

    req = urllib.request.Request(
        FISH_AUDIO_ASR_URL,
        data=body,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": f"multipart/form-data; boundary={boundary}",
        },
        method="POST",
    )
    # Generous timeout: ASR on a ~10 min chunk can take a while.
    # urlopen raises urllib.error.HTTPError on non-2xx responses.
    with urllib.request.urlopen(req, timeout=300) as resp:
        data = json.loads(resp.read().decode())

    # Adjust timestamps by the chunk's start offset
    segments = []
    for seg in data.get("segments", []):
        segments.append({
            "start": seg["start"] + start_offset_sec,
            "end": seg["end"] + start_offset_sec,
            "text": seg["text"],
        })

    # If Fish returns flat text with no segments, synthesise one segment
    # (end time is a nominal +60s placeholder, not a real duration).
    if not segments and data.get("text"):
        segments.append({
            "start": start_offset_sec,
            "end": start_offset_sec + 60,
            "text": data["text"],
        })

    return segments
|
||
|
||
|
||
def group_chapters_into_chunks(chapters, total_duration_sec):
    """
    Group chapters into batches where each batch's audio is <= CHUNK_MAX_SECONDS.
    Returns list of (start_sec, end_sec, [chapter_indices]).

    A single chapter longer than the cap still forms its own (oversized) batch.
    """
    count = len(chapters)

    def chapter_end(idx):
        # A chapter ends where the next one starts, or at end of audio.
        if idx + 1 < count:
            return parse_timestamp(chapters[idx + 1]["timestamp"])
        return total_duration_sec

    groups = []
    batch = []
    batch_start = None

    for idx in range(count):
        start = parse_timestamp(chapters[idx]["timestamp"])
        if batch_start is None:
            batch_start = start

        # Would extending the batch through this chapter exceed the cap?
        if batch and chapter_end(idx) - batch_start > CHUNK_MAX_SECONDS:
            # Flush: the batch ends where its last chapter ends.
            groups.append((batch_start, chapter_end(batch[-1]), list(batch)))
            batch = [idx]
            batch_start = start
        else:
            batch.append(idx)

    if batch:
        groups.append((batch_start, total_duration_sec, batch))

    return groups
|
||
|
||
|
||
def transcribe_with_fish(audio_path, out_dir, chapters, api_key):
    """
    Split audio by chapter groups, transcribe each with Fish Audio ASR,
    reassemble into a unified segment list. Caches per-chunk results.

    Returns the transcript dict: {"segments": [...]}.
    """
    transcript_path = out_dir / "transcript.json"
    # Whole-transcript cache: a prior successful run skips all ASR work.
    if transcript_path.exists():
        print(f" [cache] Using existing transcript at {transcript_path}")
        with open(transcript_path) as cached:
            return json.load(cached)

    chunks_dir = out_dir / "audio_chunks"
    chunks_dir.mkdir(exist_ok=True)

    total_duration = get_audio_duration(audio_path)
    print(f" Audio duration: {total_duration:.0f}s ({total_duration/60:.1f} min)")

    groups = group_chapters_into_chunks(chapters, total_duration)
    print(f" Split into {len(groups)} chunk(s) for ASR\n")

    merged = []
    last_index = len(groups) - 1

    for index, (grp_start, grp_end, member_indices) in enumerate(groups):
        piece_path = chunks_dir / f"chunk_{index:02d}.mp3"
        result_path = chunks_dir / f"chunk_{index:02d}.json"

        titles = [chapters[m]["title"][:40] for m in member_indices]
        print(f" Chunk {index}: {grp_start:.0f}s–{grp_end:.0f}s ({len(member_indices)} chapters: {titles[0]}…)")

        if result_path.exists():
            # Per-chunk cache: interrupted runs resume without re-transcribing.
            print(" [cache] Using cached ASR result")
            with open(result_path) as cached:
                piece_segments = json.load(cached)
        else:
            # Cut the audio segment first (itself cached on disk).
            if not piece_path.exists():
                split_audio_chunk(audio_path, grp_start, grp_end, piece_path)

            piece_segments = transcribe_chunk_fish(piece_path, api_key, grp_start)

            with open(result_path, "w") as sink:
                json.dump(piece_segments, sink, indent=2)

            # Be polite to the API between consecutive uploads.
            if index < last_index:
                time.sleep(1)

        merged.extend(piece_segments)
        print(f" Got {len(piece_segments)} segments")

    # Save full reassembled transcript for the whole-run cache above.
    transcript = {"segments": merged}
    with open(transcript_path, "w") as sink:
        json.dump(transcript, sink, indent=2)

    print(f"\n Total segments: {len(merged)}")
    return transcript
|
||
|
||
|
||
def extract_frame(video_path, timestamp_sec, out_path):
    """Save a single still frame of video_path at timestamp_sec to out_path."""
    cmd = [
        "ffmpeg", "-y",
        "-ss", str(timestamp_sec),
        "-i", str(video_path),
        "-frames:v", "1",  # exactly one frame
        "-q:v", "2",       # high JPEG quality
        str(out_path),
    ]
    # Silence ffmpeg's console output on both streams.
    run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||
|
||
|
||
def is_demo_chapter(title):
    """Return True when the chapter title contains any demo-heavy keyword."""
    lowered = title.lower()
    for keyword in DEMO_CHAPTER_KEYWORDS:
        if keyword in lowered:
            return True
    return False
|
||
|
||
|
||
def segments_in_window(segments, start_sec, end_sec):
    """Extract transcript text for a time window from segments.

    A segment is included when it overlaps [start_sec, end_sec] at all
    (boundary-touching segments count); missing start/end default to 0 and
    to the segment's start, respectively.
    """
    overlapping = [
        seg["text"].strip()
        for seg in segments
        if seg.get("end", seg.get("start", 0)) >= start_sec
        and seg.get("start", 0) <= end_sec
    ]
    return " ".join(overlapping)
|
||
|
||
|
||
def main():
    """CLI entry point: download, transcribe, chunk, and write one session.

    Pipeline: parse args → download audio (and video if --frames) →
    transcribe via Fish Audio → build per-chapter chunks → write chunks.json.
    All intermediate artifacts are cached under out/<video_id>/.
    """
    parser = argparse.ArgumentParser(description="Ingest Ask Annie Vimeo session")
    parser.add_argument("--url", required=True, help="Vimeo URL")
    parser.add_argument("--chapters", required=True, help="Path to chapters JSON file")
    parser.add_argument("--out", default="out", help="Output directory")
    parser.add_argument("--frames", action="store_true", help="Extract frames for demo chapters")
    parser.add_argument("--fish-api-key", default=os.environ.get("FISH_API_KEY", ""),
                        help="Fish Audio API key (or set FISH_API_KEY env var)")
    parser.add_argument("--video-id", default=None,
                        help="Override video ID (extracted from URL if omitted)")
    args = parser.parse_args()

    # Fail fast before any downloads: the key is only needed at Step 2,
    # but discovering that late would waste the download.
    if not args.fish_api_key:
        print("ERROR: Fish Audio API key required. Pass --fish-api-key or set FISH_API_KEY env var.")
        sys.exit(1)

    # Default video ID = last URL path component, query string stripped.
    video_id = args.video_id or args.url.rstrip("/").split("/")[-1].split("?")[0]
    print(f"\n=== Ask Annie Ingest: {video_id} ===\n")

    out_dir = Path(args.out) / video_id
    frames_dir = out_dir / "frames"
    out_dir.mkdir(parents=True, exist_ok=True)
    if args.frames:
        frames_dir.mkdir(exist_ok=True)

    # Load chapters (expected: list of {"timestamp", "title", ...} dicts).
    with open(args.chapters) as f:
        chapters = json.load(f)
    print(f"Loaded {len(chapters)} chapters\n")

    # Step 1: Download audio (reuse any previously completed download).
    print("=== Step 1: Download audio ===")
    audio_path = None
    for f in out_dir.glob("audio.*"):
        if f.suffix == ".part":
            # Partial yt-dlp download left over from an interrupted run.
            print(f" [skip] Ignoring partial download: {f}")
            continue
        print(f" [cache] Found existing audio: {f}")
        audio_path = f
        break
    if audio_path is None:
        audio_path = download_audio(args.url, out_dir)
    print(f" Audio: {audio_path}\n")

    # Step 1b: Download video (only if --frames requested)
    video_path = None
    if args.frames:
        print("=== Step 1b: Download video (for frame extraction) ===")
        existing = [f for f in out_dir.glob("video.*") if f.suffix != ".part"]
        if existing:
            video_path = existing[0]
            print(f" [cache] Found existing video: {video_path}")
        else:
            vid_out = out_dir / "video.%(ext)s"
            # 720p cap: frames are for reference only, keeps download small.
            run(["yt-dlp", "-f", "bestvideo[height<=720]", "-o", str(vid_out), args.url])
            matches = [f for f in out_dir.glob("video.*") if f.suffix != ".part"]
            if not matches:
                # Degrade gracefully: continue the ingest without frames.
                print(" WARNING: Video download failed, skipping frame extraction")
                args.frames = False
            else:
                video_path = matches[0]
        print()

    # Step 2: Transcribe via Fish Audio (cached inside transcribe_with_fish).
    print("=== Step 2: Transcribe via Fish Audio ASR ===")
    transcript = transcribe_with_fish(audio_path, out_dir, chapters, args.fish_api_key)
    segments = transcript.get("segments", [])
    print(f" Got {len(segments)} transcript segments\n")

    # Step 3: Build chunks — one per chapter, carrying its transcript slice.
    print("=== Step 3: Build chunks ===")
    chunks = []
    for i, chapter in enumerate(chapters):
        start_sec = parse_timestamp(chapter["timestamp"])
        if i + 1 < len(chapters):
            end_sec = parse_timestamp(chapters[i + 1]["timestamp"])
        else:
            # Last chapter: no next timestamp, so assume up to 10 more minutes.
            end_sec = start_sec + 600

        text = segments_in_window(segments, start_sec, end_sec)
        demo = is_demo_chapter(chapter["title"])

        frame_desc = None
        if args.frames and demo and video_path:
            frame_file = frames_dir / f"chapter_{i:02d}.jpg"
            print(f" Extracting frame for: {chapter['title']}")
            # +5s skips the chapter's transition moment for a cleaner frame.
            extract_frame(video_path, start_sec + 5, frame_file)
            # Placeholder text — a later vision pass is expected to fill it in.
            frame_desc = f"[frame: {frame_file.name} — vision description pending]"

        chunk = {
            "video_id": video_id,
            "video_url": args.url,
            "chapter_index": i,
            "timestamp": chapter["timestamp"],
            "timestamp_sec": start_sec,
            "title": chapter["title"],
            "summary": chapter.get("summary", ""),
            "transcript": text,
            "is_demo": demo,
            "frame_description": frame_desc,
            "source": "ask-annie",
            "series": "ST Best Practices Q&A",
        }
        chunks.append(chunk)
        print(f" [{i:02d}] {chapter['timestamp']} — {chapter['title'][:60]} ({'demo' if demo else 'qa'})")

    # Step 4: Write output
    chunks_path = out_dir / "chunks.json"
    with open(chunks_path, "w") as f:
        json.dump(chunks, f, indent=2)
    print(f"\n=== Done: {len(chunks)} chunks → {chunks_path} ===\n")
|
||
|
||
|
||
# Script entry point: only run the pipeline when executed directly.
if __name__ == "__main__":
    main()
|