CtrlK
BlogDocsLog inGet started
Tessl Logo

catalan-adobe/demo-narrate

Use when the user wants to narrate a demo, add voice-over to a screen recording, or create AI narration for a silent video. End-to-end pipeline that extracts frames, analyzes with parallel subagents, writes a word-budgeted voice-over script, generates TTS audio per act, and merges everything back.

94

Quality

94%

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

SecuritybySnyk

Passed

No known issues

Overview
Quality
Evals
Security
Files

demo-narrate.shscripts/

#!/usr/bin/env bash
set -euo pipefail

# demo-narrate: Free pipeline to analyze demo videos and produce voice-over scripts.
#
# Workflow:
#   1. extract    → ffmpeg pulls timestamped contact sheets from your video
#   2. (Claude)   → read the sheets in Claude Code, iterate on a voice-over script
#   3. tts-acts   → edge-tts converts per-act scripts to individual audio clips
#   4. merge-acts → ffmpeg layers timed audio clips onto the original video
#
# Dependencies: ffmpeg (required), edge-tts (auto-installed via uv/pipx)

# Tunables shared by the tts-acts commands. Declared readonly so that no
# function can reassign them by accident.
readonly MAX_RATE=15   # max speed-up percentage before we ask for text trimming
readonly SILENCE_GAP=1 # seconds of silence between acts

# Print the command reference, timing-file format and a sample workflow
# to stdout. The here-doc delimiter is quoted so the text is emitted
# literally (no variable or command expansion).
usage() {
  cat <<'USAGE_TEXT'
Usage: demo-narrate <command> [options]

Commands:
  extract <video> [fps]           Extract timestamped contact sheets (default: 1 fps)
  tts <script.txt> [voice]        Generate single voice-over audio from script
  tts-acts <acts-dir> <timing.txt> [voice]
                                  Generate per-act audio, auto-adjusting rate to fit
  tts-acts --dry-run <acts-dir> <timing.txt>
                                  Show word budgets without generating audio
  merge <video> <audio> [out]     Combine video with single audio file
  merge-acts <video> <acts-dir> <timing.txt> [out]
                                  Combine video with per-act audio at timed offsets
  fade-intro <video> [secs] [out] Add fade-in from black (freeze first frame)
  voices                          List available edge-tts voices
  deps                            Check and install dependencies

Timing file format (one line per act):
  <filename.mp3> <start_seconds>
  # Lines starting with # are ignored

  The max duration for each act is calculated as:
  (next_act_start - this_act_start - 1s silence gap).
  The last act has no upper bound (plays to end of video).

Workflow:
  1. demo-narrate extract my-demo.mp4          # or: extract my-demo.mp4 1
  2. In Claude Code: analyze contact sheets, write per-act scripts
  3. demo-narrate tts-acts ./narration/ ./narration/timing.txt
  4. demo-narrate merge-acts my-demo.mp4 ./narration/ ./narration/timing.txt

Optional:
  demo-narrate fade-intro my-demo.mp4 0.5      # add fade-in before step 4
  demo-narrate tts-acts --dry-run ./narration/ ./narration/timing.txt
USAGE_TEXT
}

# Abort with an install hint unless both ffmpeg and ffprobe are available.
#
# ffprobe is checked as well because the duration/framerate helpers call it
# with stderr silenced; without this check a missing ffprobe would surface
# only as a silent non-zero exit.
# Outputs: error message on stderr when a tool is missing.
# Exits:   1 if either tool cannot be found.
ensure_ffmpeg() {
  local tool
  for tool in ffmpeg ffprobe; do
    if ! command -v "$tool" &>/dev/null; then
      echo "Error: $tool not found. Install with: brew install ffmpeg" >&2
      exit 1
    fi
  done
}

# Make sure the edge-tts CLI is callable, installing it on demand.
#
# Returns immediately when edge-tts is already available. Otherwise installs
# it with uv (preferred) or pipx, then re-checks that the command resolves.
# All progress and error messages go to stderr.
# Exits: 1 when no installer is available or the tool is still not found
#        after installation.
ensure_edge_tts() {
  command -v edge-tts &>/dev/null && return 0

  printf '%s\n' "edge-tts not found. Installing..." >&2

  # Pick the first installer that is present; uv wins over pipx.
  local installer="" candidate
  for candidate in uv pipx; do
    if command -v "$candidate" &>/dev/null; then
      installer="$candidate"
      break
    fi
  done

  case "$installer" in
    uv)
      uv tool install edge-tts >&2
      ;;
    pipx)
      pipx install edge-tts >&2
      ;;
    *)
      printf '%s\n' "Error: need uv or pipx to install edge-tts." >&2
      printf '%s\n' "  brew install uv   # then re-run" >&2
      exit 1
      ;;
  esac

  # The installer succeeded, but its bin directory may not be on PATH.
  if ! command -v edge-tts &>/dev/null; then
    printf '%s\n' "Error: edge-tts installed but not on PATH." >&2
    printf '%s\n' "  Check: uv tool dir / pipx list" >&2
    exit 1
  fi

  printf '%s\n' "edge-tts installed successfully." >&2
}

# Print the container duration of a media file, in seconds, as reported by
# ffprobe (format=duration, bare CSV value). ffprobe's stderr is discarded.
# Arguments: $1 - path to the media file
get_duration() {
  local media_path="$1"
  local -a probe_args=(-v error -show_entries format=duration -of csv=p=0)
  ffprobe "${probe_args[@]}" "$media_path" 2>/dev/null
}

# Print the r_frame_rate of the first video stream as ffprobe reports it,
# i.e. a fraction string such as "30000/1001". ffprobe's stderr is discarded.
# Arguments: $1 - path to the video file
get_fps() {
  local video_path="$1"
  local -a probe_args=(
    -v error
    -select_streams v:0
    -show_entries stream=r_frame_rate
    -of csv=p=0
  )
  ffprobe "${probe_args[@]}" "$video_path" 2>/dev/null
}

# Count existing files matching a glob pattern (no ls parsing).
#
# Arguments: $1 - glob pattern, e.g. "/some dir/sheet_*.jpg"
# Outputs:   the number of matches on stdout (0 when nothing matches)
count_files() {
  local pattern="$1"
  local count=0
  local f
  # An empty IFS disables word splitting but still allows pathname
  # expansion, so directories containing spaces are matched correctly.
  local IFS=''
  # Intentional unquoted glob expansion — pattern is always internal
  # shellcheck disable=SC2086
  for f in $pattern; do
    # An unmatched glob stays as the literal pattern; -e filters it out.
    if [[ -e "$f" ]]; then
      count=$((count + 1))
    fi
  done
  echo "$count"
}

# Report on stdout whether ffmpeg, ffprobe and edge-tts can be found.
# This only inspects the environment; nothing is installed here.
cmd_deps() {
  printf '%s\n\n' "Checking dependencies..."

  if ! command -v ffmpeg &>/dev/null; then
    printf '%s\n' "  ffmpeg: NOT FOUND — brew install ffmpeg"
  else
    # First line of the version banner is enough to identify the build.
    printf '  ffmpeg: %s\n' "$(ffmpeg -version 2>&1 | head -1)"
  fi

  if ! command -v ffprobe &>/dev/null; then
    printf '%s\n' "  ffprobe: NOT FOUND — comes with ffmpeg"
  else
    printf '%s\n' "  ffprobe: OK"
  fi

  if ! command -v edge-tts &>/dev/null; then
    printf '%s\n' "  edge-tts: NOT FOUND — will auto-install on first use"
  else
    # Fall back to a plain OK if the version query itself fails.
    printf '  edge-tts: %s\n' "$(edge-tts --version 2>&1 || echo 'OK')"
  fi
}

# Extract timestamped frames and tiled contact sheets from a video.
#
# Arguments:
#   $1 - path to the video file (required)
#   $2 - sampling rate in frames per second (default: 1; must be > 0)
# Outputs:
#   Writes frame_NNNN.jpg and sheet_NNN.jpg into "<video-basename>_frames"
#   next to the video, and prints a short summary to stdout.
# Exits: 1 on a missing file, invalid fps, or unreadable duration.
cmd_extract() {
  local video="${1:?Usage: demo-narrate extract <video> [fps]}"
  local fps="${2:-1}"

  ensure_ffmpeg

  if [[ ! -f "$video" ]]; then
    echo "Error: file not found: $video" >&2
    exit 1
  fi

  if ! [[ "$fps" =~ ^[0-9]*\.?[0-9]+$ ]] || [[ "$(echo "$fps == 0" | bc -l)" == "1" ]]; then
    echo "Error: fps must be a positive number, got: $fps" >&2
    exit 1
  fi

  # Probe the duration before creating anything on disk. The "|| ..." keeps
  # set -e from aborting so the friendly message below is actually reached.
  local duration_full duration
  duration_full=$(ffprobe -v error -show_entries format=duration \
    -of csv=p=0 "$video") || duration_full=""

  # Reject empty and non-numeric values (e.g. "N/A") before they reach bc.
  if ! [[ "$duration_full" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    echo "Error: could not read video duration (is '$video' a valid video file?)" >&2
    exit 1
  fi

  # Whole seconds, for display only
  duration="${duration_full%%.*}"

  # Output next to the video file, not in cwd
  local video_dir video_base outdir
  video_dir="$(cd "$(dirname "$video")" && pwd)"
  video_base="$(basename "${video%.*}")"
  outdir="${video_dir}/${video_base}_frames"

  mkdir -p "$outdir"

  local frame_count
  frame_count=$(printf "%.0f" "$(echo "$duration_full * $fps" | bc -l)")
  # Very short clips can round down to zero, which would produce an invalid
  # "0x1" tile layout; always plan for at least one frame.
  if (( frame_count < 1 )); then
    frame_count=1
  fi

  local interval
  interval=$(printf "%.1f" "$(echo "scale=2; 1 / $fps" | bc)")
  interval="every ${interval}s"

  # Choose tile dimensions to match actual frame count
  local tile_spec
  if (( frame_count <= 4 )); then
    tile_spec="${frame_count}x1"
  elif (( frame_count <= 12 )); then
    # Four columns, with as many rows as needed (ceiling division)
    tile_spec="4x$(( (frame_count + 3) / 4 ))"
  else
    tile_spec="5x4"
  fi

  echo "Video: $video (${duration}s)"
  echo "Extracting ~${frame_count} frames (${interval}, fps=${fps})..."
  echo ""

  # Individual frames with timestamps
  ffmpeg -y -loglevel error -i "$video" \
    -vf "fps=${fps},scale=640:-2,drawtext=text='%{pts\:hms}':x=10:y=10:fontsize=20:fontcolor=white:box=1:boxcolor=black@0.7:boxborderw=5" \
    -q:v 2 \
    "${outdir}/frame_%04d.jpg"

  # Contact sheets with dynamic tile dimensions
  ffmpeg -y -loglevel error -i "$video" \
    -vf "fps=${fps},scale=384:-2,drawtext=text='%{pts\:hms}':x=5:y=5:fontsize=14:fontcolor=white:box=1:boxcolor=black@0.7:boxborderw=3,tile=${tile_spec}" \
    "${outdir}/sheet_%03d.jpg"

  local sheet_count individual_count
  sheet_count=$(count_files "${outdir}/sheet_*.jpg")
  individual_count=$(count_files "${outdir}/frame_*.jpg")

  echo "Done: ${sheet_count} contact sheet(s), ${individual_count} individual frames"
  echo "Output: ${outdir}/"
}

# Generate one voice-over MP3 from a single script file.
#
# Arguments:
#   $1 - path to the text script (required)
#   $2 - edge-tts voice name (default: en-US-AriaNeural)
# Outputs:
#   Writes "<script-without-extension>_voiceover.mp3" and prints the voice,
#   script path and resulting audio path to stdout.
# Exits: 1 if the script file does not exist.
cmd_tts() {
  local script_file="${1:?Usage: demo-narrate tts <script.txt> [voice]}"
  local voice="${2:-en-US-AriaNeural}"
  local output="${script_file%.*}_voiceover.mp3"

  if [[ ! -f "$script_file" ]]; then
    echo "Error: file not found: $script_file" >&2
    exit 1
  fi

  ensure_edge_tts

  echo "Voice: $voice"
  echo "Script: $script_file"
  edge-tts --file "$script_file" --voice "$voice" --write-media "$output"

  # The duration is informational only. Without the "|| ..." a failing or
  # missing ffprobe would abort the script under set -e/pipefail after the
  # audio was already written, and the "?" fallback would never be shown.
  local dur
  dur=$(ffprobe -v error -show_entries format=duration \
    -of csv=p=0 "$output" 2>/dev/null | cut -d. -f1) || dur=""
  echo "Audio: $output (${dur:-?}s)"
}

# Parse timing file into parallel arrays.
#
# Each non-blank, non-comment line must be "<filename> <start_seconds>".
# Arguments: $1 - path to the timing file
# Sets (globals): TIMING_FILES[], TIMING_OFFSETS[], TIMING_COUNT
# Exits: 1 when a line's offset is not a non-negative number.
parse_timing_file() {
  local timing_file="$1"
  TIMING_FILES=()
  TIMING_OFFSETS=()
  local line_num=0
  local filename offset_s
  local number_re='^[0-9]*\.?[0-9]+$'
  # "|| [[ -n ... ]]" keeps a final line that has no trailing newline;
  # read returns non-zero at EOF even though it filled the variables.
  while IFS=$' \t' read -r filename offset_s || [[ -n "$filename" ]]; do
    line_num=$((line_num + 1))
    # Tolerate CRLF line endings (files edited on Windows)
    filename="${filename%$'\r'}"
    offset_s="${offset_s%$'\r'}"
    if [[ -z "$filename" || "$filename" == "#"* ]]; then
      continue
    fi
    if ! [[ "$offset_s" =~ $number_re ]]; then
      echo "Error: timing.txt line $line_num: invalid offset '$offset_s' (must be a number)" >&2
      exit 1
    fi
    TIMING_FILES+=("$filename")
    TIMING_OFFSETS+=("$offset_s")
  done < "$timing_file"
  TIMING_COUNT=${#TIMING_FILES[@]}
}

# Print a per-act word-budget table without generating any audio.
#
# For every act except the last, the speaking window is
# (next start - this start - SILENCE_GAP) and the budget is about two words
# per second of that window. The last act has no limit.
# Arguments:
#   $1 - directory holding the per-act .txt scripts
#   $2 - timing file (parsed by parse_timing_file)
# Exits: 1 on a missing directory/file or an empty timing file.
cmd_tts_acts_dry_run() {
  local acts_dir="$1"
  local timing_file="$2"
  local silence_gap=$SILENCE_GAP
  local i  # keep the loop counter out of the global scope

  if [[ ! -d "$acts_dir" ]]; then
    echo "Error: directory not found: $acts_dir" >&2
    exit 1
  fi
  if [[ ! -f "$timing_file" ]]; then
    echo "Error: timing file not found: $timing_file" >&2
    exit 1
  fi

  parse_timing_file "$timing_file"

  if [[ $TIMING_COUNT -eq 0 ]]; then
    echo "Error: no entries in timing file" >&2
    exit 1
  fi

  echo "Dry run — word budgets (${silence_gap}s gap, ~2 words/sec):"
  echo ""

  for ((i = 0; i < TIMING_COUNT; i++)); do
    # The timing file lists .mp3 names; the script text sits beside it as .txt
    local mp3_name="${TIMING_FILES[i]}"
    local txt_name="${mp3_name%.mp3}.txt"
    local txt_path="${acts_dir}/${txt_name}"

    local max_label budget
    if (( i + 1 < TIMING_COUNT )); then
      local window
      window=$(echo "${TIMING_OFFSETS[i+1]} - ${TIMING_OFFSETS[i]}" | bc)
      local max_s
      max_s=$(echo "$window - $silence_gap" | bc)
      # No usable time left once the gap is subtracted: report and move on
      if (( $(echo "$max_s <= 0" | bc -l) )); then
        printf "  %-30s OVERLAP (window is %ss — check timing order)\n" "$txt_name" "$window"
        continue
      fi
      max_label="${max_s}s"
      budget="~$(printf "%.0f" "$(echo "$max_s * 2" | bc)") words"
    else
      max_label="(none)"
      budget="no limit"
    fi

    local word_count="(missing)"
    if [[ -f "$txt_path" ]]; then
      # tr strips the padding some wc implementations add
      word_count="$(wc -w < "$txt_path" | tr -d ' ') words"
    fi

    printf "  %-30s max %7s  budget: %-14s  current: %s\n" \
      "$txt_name" "$max_label" "$budget" "$word_count"
  done
}

# Generate one MP3 per act and make each fit its time window.
#
# Each act is first synthesised at normal speed. If it is longer than its
# window (next start - this start - SILENCE_GAP, rounded to whole seconds,
# minimum 1s) it is re-synthesised with a speaking-rate increase of at most
# MAX_RATE percent. Acts that still do not fit are reported as LONG.
# Arguments:
#   --dry-run <acts-dir> <timing.txt>   only print word budgets, or
#   $1 - directory with per-act .txt scripts (MP3s are written here too)
#   $2 - timing file
#   $3 - edge-tts voice name (default: en-US-AriaNeural)
# Exits: 1 on invalid input or if any act failed or did not fit.
cmd_tts_acts() {
  # Handle --dry-run flag
  if [[ "${1:-}" == "--dry-run" ]]; then
    shift
    local dr_dir="${1:?Usage: demo-narrate tts-acts --dry-run <acts-dir> <timing.txt>}"
    local dr_timing="${2:?Usage: demo-narrate tts-acts --dry-run <acts-dir> <timing.txt>}"
    cmd_tts_acts_dry_run "$dr_dir" "$dr_timing"
    return
  fi

  local acts_dir="${1:?Usage: demo-narrate tts-acts <acts-dir> <timing.txt> [voice]}"
  local timing_file="${2:?Usage: demo-narrate tts-acts <acts-dir> <timing.txt> [voice]}"
  local voice="${3:-en-US-AriaNeural}"
  local silence_gap=$SILENCE_GAP
  local i  # keep the loop counter out of the global scope

  if [[ ! -d "$acts_dir" ]]; then
    echo "Error: directory not found: $acts_dir" >&2
    exit 1
  fi
  if [[ ! -f "$timing_file" ]]; then
    echo "Error: timing file not found: $timing_file" >&2
    exit 1
  fi

  ensure_edge_tts
  ensure_ffmpeg

  parse_timing_file "$timing_file"

  if [[ $TIMING_COUNT -eq 0 ]]; then
    echo "Error: no entries in timing file" >&2
    exit 1
  fi

  # Calculate max duration for each act (next_start - this_start - gap)
  local max_durs=()
  for ((i = 0; i < TIMING_COUNT; i++)); do
    if (( i + 1 < TIMING_COUNT )); then
      local max_dur
      max_dur=$(printf "%.0f" "$(echo "${TIMING_OFFSETS[i+1]} - ${TIMING_OFFSETS[i]} - $silence_gap" | bc -l)")
      if (( max_dur < 1 )); then max_dur=1; fi
      max_durs+=("$max_dur")
    else
      max_durs+=("")  # last act: no upper bound
    fi
  done

  echo "Generating TTS for $TIMING_COUNT acts (voice: $voice, ${silence_gap}s gap)..."
  echo ""

  local has_errors=0
  for ((i = 0; i < TIMING_COUNT; i++)); do
    local mp3_name="${TIMING_FILES[i]}"
    local txt_name="${mp3_name%.mp3}.txt"
    local txt_path="${acts_dir}/${txt_name}"
    local mp3_path="${acts_dir}/${mp3_name}"
    local max="${max_durs[i]}"

    if [[ ! -f "$txt_path" ]]; then
      echo "  SKIP  ${txt_name} (file not found)"
      continue
    fi

    # First pass: generate at normal rate (measurement)
    if ! edge-tts --file "$txt_path" --voice "$voice" \
      --write-media "$mp3_path"; then
      echo "  ERROR ${txt_name} (edge-tts failed)" >&2
      has_errors=1
      continue
    fi

    # "|| dur=''" keeps set -e from aborting silently when the probe fails,
    # so the explicit error branch below is reached instead.
    local dur
    dur=$(get_duration "$mp3_path") || dur=""
    if [[ -z "$dur" ]]; then
      echo "  ERROR ${txt_name} (could not read duration from generated MP3)" >&2
      has_errors=1
      continue
    fi

    # If no max (last act), just report and move on
    if [[ -z "$max" ]]; then
      printf "  OK    %-30s %5.1fs (last act, no limit)\n" "$txt_name" "$dur"
      continue
    fi

    local max_f="${max}.0"

    # Check if it fits
    if (( $(echo "$dur <= $max_f" | bc -l) )); then
      local margin
      margin=$(echo "$max_f - $dur" | bc)
      printf "  OK    %-30s %5.1fs / %ss (%.1fs margin)\n" \
        "$txt_name" "$dur" "$max" "$margin"
      continue
    fi

    # Doesn't fit — calculate needed rate increase
    # Round up to ensure the rate increase is sufficient
    local rate_pct
    rate_pct=$(echo "scale=0; ($dur * 100 + $max_f - 1) / $max_f - 100" | bc -l)

    if (( rate_pct > MAX_RATE )); then
      # Rate alone won't fix it — report the problem
      printf "  LONG  %-30s %5.1fs / %ss (needs +%s%%, max +%s%%)\n" \
        "$txt_name" "$dur" "$max" "$rate_pct" "$MAX_RATE"
      echo "        → Trim text in ${txt_name} and re-run" >&2
      has_errors=1
      continue
    fi

    # Try rate adjustment, escalating until it fits or we hit MAX_RATE
    local applied_rate=$rate_pct
    local fitted=0
    while true; do
      if ! edge-tts --file "$txt_path" --voice "$voice" \
        --rate "+${applied_rate}%" --write-media "$mp3_path"; then
        echo "  ERROR ${txt_name} (edge-tts failed at +${applied_rate}%)" >&2
        has_errors=1
        break
      fi
      dur=$(get_duration "$mp3_path") || dur=""
      if [[ -z "$dur" ]]; then
        echo "  ERROR ${txt_name} (could not read duration after rate adjust)" >&2
        has_errors=1
        break
      fi

      if (( $(echo "$dur <= $max_f" | bc -l) )); then
        fitted=1
        break  # fits
      fi

      if (( applied_rate >= MAX_RATE )); then
        # Exhausted rate budget (MAX_RATE itself was tried) — mark as LONG
        printf "  LONG  %-30s %5.1fs / %ss (still over at +%s%%)\n" \
          "$txt_name" "$dur" "$max" "$MAX_RATE"
        echo "        → Trim text in ${txt_name} and re-run" >&2
        has_errors=1
        break
      fi

      # Bump and retry, clamped so the final attempt is exactly MAX_RATE
      applied_rate=$((applied_rate + 3))
      if (( applied_rate > MAX_RATE )); then
        applied_rate=$MAX_RATE
      fi
    done

    # Report success only when the loop ended with a fit. Using a flag
    # avoids re-evaluating a possibly empty duration after an error.
    if (( fitted )); then
      local margin
      margin=$(echo "$max_f - $dur" | bc)
      printf "  RATE  %-30s %5.1fs / %ss (+%s%%, %.1fs margin)\n" \
        "$txt_name" "$dur" "$max" "$applied_rate" "$margin"
    fi
  done

  echo ""
  if (( has_errors )); then
    echo "Some acts are too long. Trim the marked text files and re-run."
    echo "Max rate adjustment: +${MAX_RATE}%"
    exit 1
  else
    echo "All acts fit within their windows."
  fi
}

# Combine a video with a single audio file into a new MP4.
#
# The video stream is copied as-is and the audio is encoded to AAC at
# 192 kbit/s; -shortest ends the output when the shorter input ends.
# Arguments:
#   $1 - video file (required)
#   $2 - audio file (required)
#   $3 - output path (default: "<video-without-extension>_narrated.mp4")
# Exits: 1 if either input file is missing.
cmd_merge() {
  local video="${1:?Usage: demo-narrate merge <video> <audio> [output]}"
  local audio="${2:?Usage: demo-narrate merge <video> <audio> [output]}"
  local output="${3:-${video%.*}_narrated.mp4}"
  local f  # keep the loop variable out of the global scope

  ensure_ffmpeg

  for f in "$video" "$audio"; do
    if [[ ! -f "$f" ]]; then
      echo "Error: file not found: $f" >&2
      exit 1
    fi
  done

  echo "Merging: $(basename "$video") + $(basename "$audio")"
  ffmpeg -y -loglevel error -i "$video" -i "$audio" \
    -c:v copy -c:a aac -b:a 192k \
    -map 0:v:0 -map 1:a:0 \
    -shortest \
    "$output"

  # The duration is informational only; do not let a failed probe abort
  # the script after a successful merge (set -e/pipefail would otherwise).
  local dur
  dur=$(ffprobe -v error -show_entries format=duration \
    -of csv=p=0 "$output" 2>/dev/null | cut -d. -f1) || dur=""
  echo "Output: $output (${dur:-?}s)"
}

# Lay per-act audio clips onto a video at the offsets from the timing file.
#
# Each clip is delayed to its start offset (adelay), all clips are mixed
# (amix) and the mix is padded with trailing silence (apad) so that
# -shortest ends the output at the end of the video rather than at the end
# of the last clip. The video stream is copied; audio is AAC at 192 kbit/s.
# Arguments:
#   $1 - video file (required)
#   $2 - directory containing the act MP3s (required)
#   $3 - timing file (required)
#   $4 - output path (default: "<video-without-extension>_narrated.mp4")
# Exits: 1 on missing inputs or when no listed audio file exists.
cmd_merge_acts() {
  local video="${1:?Usage: demo-narrate merge-acts <video> <acts-dir> <timing.txt> [output]}"
  local acts_dir="${2:?Usage: demo-narrate merge-acts <video> <acts-dir> <timing.txt> [output]}"
  local timing_file="${3:?Usage: demo-narrate merge-acts <video> <acts-dir> <timing.txt> [output]}"
  local output="${4:-${video%.*}_narrated.mp4}"
  local i  # keep the loop counter out of the global scope

  ensure_ffmpeg

  if [[ ! -f "$video" ]]; then
    echo "Error: video not found: $video" >&2
    exit 1
  fi
  if [[ ! -d "$acts_dir" ]]; then
    echo "Error: acts directory not found: $acts_dir" >&2
    exit 1
  fi
  if [[ ! -f "$timing_file" ]]; then
    echo "Error: timing file not found: $timing_file" >&2
    exit 1
  fi

  parse_timing_file "$timing_file"

  # idx is the ffmpeg input index of the next clip; input 0 is the video.
  # It advances only for clips that exist, so labels and inputs stay aligned.
  local inputs=() filters=() labels=() idx=1
  for ((i = 0; i < TIMING_COUNT; i++)); do
    local mp3="${acts_dir}/${TIMING_FILES[i]}"
    if [[ ! -f "$mp3" ]]; then
      echo "Warning: $mp3 not found, skipping" >&2
      continue
    fi
    local offset_ms
    offset_ms=$(printf "%.0f" "$(echo "${TIMING_OFFSETS[i]} * 1000" | bc -l)")
    if [[ -z "$offset_ms" ]]; then
      echo "Warning: could not compute offset for ${TIMING_FILES[i]}, skipping" >&2
      continue
    fi
    inputs+=(-i "$mp3")
    # The delay is given twice, once per channel, so stereo clips shift as a whole
    filters+=("[${idx}]adelay=${offset_ms}|${offset_ms}[a${idx}]")
    labels+=("[a${idx}]")
    idx=$((idx + 1))
  done

  local n_inputs=${#labels[@]}
  if [[ $n_inputs -eq 0 ]]; then
    echo "Error: no valid audio files found" >&2
    exit 1
  fi

  local filter_complex
  filter_complex="$(IFS=';'; echo "${filters[*]}"); "
  filter_complex+="$(IFS=''; echo "${labels[*]}")"
  filter_complex+="amix=inputs=${n_inputs}:duration=longest:dropout_transition=0:normalize=0[mixed]; "
  # Pad with silence so -shortest cuts at the video's end instead of
  # truncating the video when the narration finishes first.
  filter_complex+="[mixed]apad[aout]"

  echo "Merging ${n_inputs} audio clips onto $(basename "$video")..."
  ffmpeg -y -loglevel error \
    -i "$video" "${inputs[@]}" \
    -filter_complex "$filter_complex" \
    -map 0:v -map "[aout]" \
    -c:v copy -c:a aac -b:a 192k -shortest \
    "$output"

  # The duration is informational only; do not let a failed probe abort
  # the script after a successful merge (set -e/pipefail would otherwise).
  local dur
  dur=$(ffprobe -v error -show_entries format=duration \
    -of csv=p=0 "$output" 2>/dev/null | cut -d. -f1) || dur=""
  echo "Output: $output (${dur:-?}s)"
}

# Prepend a fade-in from black, built from a frozen copy of the first frame.
#
# The first frame is extracted to a temporary JPEG, looped for the fade
# duration with a fade-in applied, and concatenated in front of the original
# video. The result is re-encoded with libx264 and contains video only: the
# concat uses a=0 and only the video output is mapped.
# Arguments:
#   $1 - video file (required)
#   $2 - fade length in seconds (default: 0.5; must be > 0)
#   $3 - output path (default: "<video-without-extension>_intro.mp4")
# Exits: 1 on a missing file, invalid fade value, or unreadable video.
cmd_fade_intro() {
  local video="${1:?Usage: demo-narrate fade-intro <video> [fade_seconds] [output]}"
  local fade="${2:-0.5}"
  local output="${3:-${video%.*}_intro.mp4}"

  ensure_ffmpeg

  if [[ ! -f "$video" ]]; then
    echo "Error: file not found: $video" >&2
    exit 1
  fi

  # Must be a number, and not one made up only of zeros and a dot (= 0)
  if ! [[ "$fade" =~ ^[0-9]*\.?[0-9]+$ ]] || [[ "$fade" =~ ^[0.]+$ ]]; then
    echo "Error: fade seconds must be a positive number, got: $fade" >&2
    exit 1
  fi

  # The probe helpers silence ffprobe, so report failures explicitly
  # instead of letting set -e end the script without any message.
  local dur fps_str
  dur=$(get_duration "$video") || dur=""
  fps_str=$(get_fps "$video") || fps_str=""
  if [[ -z "$dur" || -z "$fps_str" ]]; then
    echo "Error: could not read duration/framerate (is '$video' a valid video file?)" >&2
    exit 1
  fi

  echo "Adding ${fade}s fade-in from black (freeze frame)..."
  echo "  Source: $(basename "$video") (${dur}s, ${fps_str} fps)"

  # Extract first frame into a private temp directory. A directory is used
  # because BSD/macOS mktemp only substitutes X's at the end of a template,
  # so a template carrying a ".jpg" suffix is not portable.
  local tmp_dir tmp_frame
  tmp_dir="$(mktemp -d "${TMPDIR:-/tmp}/fade_frame_XXXXXX")"
  tmp_frame="${tmp_dir}/frame.jpg"
  # The path is expanded now, while the local is in scope: an EXIT trap that
  # expands it later runs after this function returned and would hit an
  # unbound variable under set -u.
  # shellcheck disable=SC2064
  trap "rm -rf -- $(printf '%q' "$tmp_dir")" EXIT
  # Turn signals into a normal exit so the EXIT cleanup above runs
  trap 'exit 130' INT
  trap 'exit 143' TERM

  ffmpeg -y -loglevel error -i "$video" \
    -vf "select=eq(n\,0)" -vsync vfr -q:v 2 -frames:v 1 \
    "$tmp_frame"

  # Build the output in one pass:
  # - Input 0: frozen first frame looped for ${fade}s with fade-from-black
  # - Input 1: original video
  # - Use source video's fps for the intro to match
  ffmpeg -y -loglevel error \
    -loop 1 -t "$fade" -i "$tmp_frame" \
    -i "$video" \
    -filter_complex "\
      [0:v]fps=${fps_str},format=yuv420p,fade=t=in:st=0:d=${fade}[intro]; \
      [1:v]setpts=PTS-STARTPTS[main]; \
      [intro][main]concat=n=2:v=1:a=0[vout]" \
    -map "[vout]" \
    -c:v libx264 -preset fast -crf 18 \
    -movflags +faststart \
    "$output"

  # Success path: clean up right away and drop the traps again
  rm -rf -- "$tmp_dir"
  trap - EXIT INT TERM

  # Informational only; an unreadable output duration must not fail the run
  local new_dur
  new_dur=$(get_duration "$output") || new_dur=""
  echo "Output: $output (${new_dur}s = ${fade}s intro + ${dur}s video)"
  echo "  Note: video re-encoded (libx264 crf=18) for seamless concat"
  echo ""
  echo "Remember: shift all timing offsets by +${fade}s in your timing.txt"
}

# Print up to 20 en-US / en-GB voices from edge-tts, framed by a heading
# and a hint on how to list every voice. Installs edge-tts first if needed.
cmd_voices() {
  ensure_edge_tts
  printf '%s\n\n' "Popular English voices for demo narration:"
  # Best-effort listing: an empty or failed listing must not abort the script
  edge-tts --list-voices 2>/dev/null | grep -E "en-(US|GB)" | head -20 || true
  printf '\n%s\n' "All voices: edge-tts --list-voices"
}

# Entry point: route the first CLI argument to its subcommand handler.
subcommand="${1:-}"

case "$subcommand" in
  "")
    # No command (or an empty one): show the help text and succeed.
    usage
    exit 0
    ;;
  extract)
    shift
    cmd_extract "$@"
    ;;
  tts)
    shift
    cmd_tts "$@"
    ;;
  tts-acts)
    shift
    cmd_tts_acts "$@"
    ;;
  merge)
    shift
    cmd_merge "$@"
    ;;
  merge-acts)
    shift
    cmd_merge_acts "$@"
    ;;
  fade-intro)
    shift
    cmd_fade_intro "$@"
    ;;
  voices)
    cmd_voices
    ;;
  deps)
    cmd_deps
    ;;
  -h|--help|help)
    usage
    ;;
  *)
    echo "Error: unknown command: $subcommand" >&2
    usage >&2
    exit 1
    ;;
esac

SKILL.md

tile.json