or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

audio

audio-processing.md · realtime-transcription.md · speech-to-speech.md · speech-to-text.md · text-to-speech.md
index.md
tile.json

docs/content/forced-alignment.md

Forced Alignment

Align audio files to text with precise character and word timing. Forced alignment provides accurate synchronization between audio and text for subtitles, karaoke, animation, and accessibility features.

Quick Reference

import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";

const client = new ElevenLabsClient({ apiKey: "your-api-key" });
// Access this API via: client.forcedAlignment

Capabilities

Create Forced Alignment

Align audio to text and get detailed timing information.

/**
 * @param request - Audio file and text to align
 * @param requestOptions - Optional request configuration
 * @returns Alignment with character and word timestamps
 * @throws UnprocessableEntityError if request fails
 */
client.forcedAlignment.create(
  request: BodyCreateForcedAlignmentV1ForcedAlignmentPost,
  requestOptions?: RequestOptions
): HttpResponsePromise<ForcedAlignmentResponseModel>;

interface BodyCreateForcedAlignmentV1ForcedAlignmentPost {
  /** Audio file to align */
  audio: File | Blob;
  /** Text transcript */
  text: string;
  /** Language code (e.g., "en", "es") */
  language?: string;
}

interface ForcedAlignmentResponseModel {
  /** Character-level alignment */
  alignment: Alignment;
  /** Word-level alignment */
  words: WordAlignment[];
}

interface Alignment {
  /** Array of characters */
  characters: string[];
  /** Start times for each character (seconds) */
  character_start_times_seconds: number[];
  /** End times for each character (seconds) */
  character_end_times_seconds: number[];
}

interface WordAlignment {
  /** Word text */
  word: string;
  /** Start time in seconds */
  start: number;
  /** End time in seconds */
  end: number;
  /** Confidence score (0.0 to 1.0) */
  confidence?: number;
}

Usage Examples

Basic Alignment

import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
import { readFile } from "fs/promises";

const client = new ElevenLabsClient({ apiKey: "your-api-key" });

// Load audio file
const audioFile = await readFile("speech.mp3");
// Transcript of what is spoken in the audio; alignment matches this text to the recording.
const transcript = "Hello world, this is a test of forced alignment.";

// Create alignment (language is optional — see the request interface above)
const alignment = await client.forcedAlignment.create({
  audio: new File([audioFile], "speech.mp3"),
  text: transcript,
  language: "en",
});

// Access character timing — parallel arrays: characters[i] spans
// character_start_times_seconds[i] .. character_end_times_seconds[i]
console.log("Characters:", alignment.alignment.characters);
console.log("Character start times:", alignment.alignment.character_start_times_seconds);

// Access word timing — one entry per word with start/end in seconds
for (const word of alignment.words) {
  console.log(`"${word.word}": ${word.start}s - ${word.end}s`);
}

Generate Subtitles

/**
 * Build an SRT subtitle document from a forced alignment of the audio.
 *
 * @param audioPath - Path to the audio file on disk
 * @param transcript - Full text spoken in the audio
 * @returns The subtitles as a single SRT-formatted string
 */
async function generateSubtitles(
  audioPath: string,
  transcript: string
): Promise<string> {
  const audioFile = await readFile(audioPath);

  const alignment = await client.forcedAlignment.create({
    audio: new File([audioFile], audioPath),
    text: transcript,
  });

  // Emit one numbered cue per chunk of at most 10 words.
  const wordsPerCue = 10;
  const cues: string[] = [];

  for (let offset = 0; offset < alignment.words.length; offset += wordsPerCue) {
    const cueWords = alignment.words.slice(offset, offset + wordsPerCue);
    const first = cueWords[0];
    const last = cueWords[cueWords.length - 1];
    const cueText = cueWords.map(w => w.word).join(" ");
    const cueNumber = offset / wordsPerCue + 1;

    cues.push(
      `${cueNumber}\n${formatSrtTime(first.start)} --> ${formatSrtTime(last.end)}\n${cueText}\n\n`
    );
  }

  return cues.join("");
}

// Format a timestamp (in seconds) as an SRT timecode: HH:MM:SS,mmm.
function formatSrtTime(seconds: number): string {
  const pad = (value: number, width: number): string =>
    String(value).padStart(width, "0");

  const hours = Math.floor(seconds / 3600);
  const minutes = Math.floor((seconds % 3600) / 60);
  const wholeSeconds = Math.floor(seconds % 60);
  const milliseconds = Math.floor((seconds % 1) * 1000);

  return `${pad(hours, 2)}:${pad(minutes, 2)}:${pad(wholeSeconds, 2)},${pad(milliseconds, 3)}`;
}

const srt = await generateSubtitles("audio.mp3", "The quick brown fox...");
await writeFile("subtitles.srt", srt);

Character Animation Timing

// Get timing for character-by-character animation.
// Returns one { char, start, end } object per aligned character.
async function getCharacterTimings(
  audioPath: string,
  text: string
): Promise<Array<{ char: string; start: number; end: number }>> {
  const audioFile = await readFile(audioPath);

  const alignment = await client.forcedAlignment.create({
    audio: new File([audioFile], audioPath),
    text,
  });

  // Zip the three parallel arrays into one object per character.
  const { characters, character_start_times_seconds, character_end_times_seconds } =
    alignment.alignment;

  return characters.map((char, i) => ({
    char,
    start: character_start_times_seconds[i],
    end: character_end_times_seconds[i],
  }));
}

const timings = await getCharacterTimings("audio.mp3", "Hello");
// Use timings for animation keyframes

Karaoke Lyrics

// Create karaoke-style lyrics with word highlighting
interface KaraokeLine {
  words: Array<{
    text: string;
    startTime: number;
    endTime: number;
  }>;
}

/**
 * Align lyrics to audio and group the aligned words into karaoke lines.
 *
 * @param audioPath - Path to the audio file on disk
 * @param lyrics - Lyrics text; lines are split on periods and newlines
 * @returns One KaraokeLine per non-empty lyric segment, with per-word timing
 */
async function createKaraokeLyrics(
  audioPath: string,
  lyrics: string
): Promise<KaraokeLine[]> {
  const audioFile = await readFile(audioPath);

  const alignment = await client.forcedAlignment.create({
    audio: new File([audioFile], audioPath),
    text: lyrics,
  });

  // Group into lines (split on newlines or periods)
  const sentences = lyrics.split(/[.\n]/);
  const lines: KaraokeLine[] = [];

  let wordIndex = 0;
  for (const sentence of sentences) {
    const trimmed = sentence.trim();
    // Skip empty segments (e.g. after a trailing period or a blank line).
    // Without this guard, "".split(/\s+/) yields [""] with length 1, which
    // consumes a word index and shifts every following line's timing.
    if (trimmed.length === 0) continue;

    const wordCount = trimmed.split(/\s+/).length;
    const lineWords = alignment.words.slice(wordIndex, wordIndex + wordCount);

    lines.push({
      words: lineWords.map(w => ({
        text: w.word,
        startTime: w.start,
        endTime: w.end,
      })),
    });

    wordIndex += wordCount;
  }

  return lines;
}

Accessibility Captions

// Generate accessible captions with proper timing, in WebVTT format.
async function generateAccessibleCaptions(
  audioPath: string,
  transcript: string
): Promise<string> {
  const audioFile = await readFile(audioPath);

  const alignment = await client.forcedAlignment.create({
    audio: new File([audioFile], audioPath),
    text: transcript,
  });

  // Every WebVTT document starts with this header.
  const parts: string[] = ["WEBVTT\n\n"];

  // Keep a readable pace: flush a cue after at most 2 seconds of speech.
  const maxCaptionSeconds = 2.0;
  const words = alignment.words;
  let pendingWords: string[] = [];
  let captionStart = 0;

  for (let i = 0; i < words.length; i++) {
    const word = words[i];

    if (pendingWords.length === 0) {
      captionStart = word.start;
    }
    pendingWords.push(word.word);

    const elapsed = word.end - captionStart;
    const isLastWord = i === words.length - 1;

    if (elapsed >= maxCaptionSeconds || isLastWord) {
      parts.push(
        `${formatWebVttTime(captionStart)} --> ${formatWebVttTime(word.end)}\n` +
          `${pendingWords.join(" ")}\n\n`
      );
      pendingWords = [];
    }
  }

  return parts.join("");
}

/**
 * Format a timestamp (in seconds) as a WebVTT timecode.
 *
 * Emits `MM:SS.mmm` for inputs under one hour and `HH:MM:SS.mmm` above it;
 * the WebVTT spec requires minutes in 00-59 when the hours component is
 * omitted, so the previous `61:40.000`-style output was invalid. Milliseconds
 * are truncated (not rounded), which also prevents `toFixed` from rounding
 * 59.9995s up to an illegal `60.000` seconds field.
 *
 * @param seconds - Non-negative time offset in seconds
 * @returns A spec-conformant WebVTT timestamp string
 */
function formatWebVttTime(seconds: number): string {
  const pad = (value: number, width: number): string =>
    String(value).padStart(width, "0");

  const hours = Math.floor(seconds / 3600);
  const mins = Math.floor((seconds % 3600) / 60);
  const secs = Math.floor(seconds % 60);
  const ms = Math.floor((seconds % 1) * 1000);

  const minutesAndBelow = `${pad(mins, 2)}:${pad(secs, 2)}.${pad(ms, 3)}`;
  return hours > 0 ? `${pad(hours, 2)}:${minutesAndBelow}` : minutesAndBelow;
}

Validate Transcript

// Check if transcript matches audio using confidence scores
/**
 * Align the transcript and judge its quality from per-word confidences.
 *
 * @param audioPath - Path to the audio file on disk
 * @param transcript - Candidate transcript to validate
 * @returns isValid (avg confidence > 0.7), the average confidence, and the
 *          words whose confidence is below 0.5
 */
async function validateTranscript(
  audioPath: string,
  transcript: string
): Promise<{ isValid: boolean; avgConfidence: number; lowConfidenceWords: string[] }> {
  const audioFile = await readFile(audioPath);

  const alignment = await client.forcedAlignment.create({
    audio: new File([audioFile], audioPath),
    text: transcript,
  });

  const confidences = alignment.words
    .filter(w => w.confidence !== undefined)
    .map(w => w.confidence!);

  // Guard the empty case: 0 / 0 would yield NaN and silently propagate it
  // to callers via avgConfidence.
  const avgConfidence =
    confidences.length > 0
      ? confidences.reduce((a, b) => a + b, 0) / confidences.length
      : 0;

  // Compare against undefined explicitly: a confidence of exactly 0 is
  // falsy, so `w.confidence && ...` would wrongly skip the least confident
  // words of all.
  const lowConfidenceWords = alignment.words
    .filter(w => w.confidence !== undefined && w.confidence < 0.5)
    .map(w => w.word);

  return {
    isValid: avgConfidence > 0.7,
    avgConfidence,
    lowConfidenceWords,
  };
}

Export to JSON

// Export alignment data (words + character timings) as a JSON file.
async function exportAlignment(
  audioPath: string,
  transcript: string,
  outputPath: string
): Promise<void> {
  const audioFile = await readFile(audioPath);

  const alignment = await client.forcedAlignment.create({
    audio: new File([audioFile], audioPath),
    text: transcript,
  });

  const characterData = alignment.alignment;
  const payload = {
    transcript,
    words: alignment.words,
    characters: {
      chars: characterData.characters,
      start_times: characterData.character_start_times_seconds,
      end_times: characterData.character_end_times_seconds,
    },
  };

  // Pretty-print with 2-space indentation for human inspection.
  await writeFile(outputPath, JSON.stringify(payload, null, 2));
}

await exportAlignment("audio.mp3", "Transcript text", "alignment.json");

Phoneme Lip Sync

// Generate phoneme timing for lip sync animation
async function getPhonemeTimings(
  audioPath: string,
  transcript: string
): Promise<Array<{ phoneme: string; start: number; end: number }>> {
  const audioFile = await readFile(audioPath);

  const alignment = await client.forcedAlignment.create({
    audio: new File([audioFile], audioPath),
    text: transcript,
  });

  // Use character timings as approximation for phonemes.
  // In production, you'd map characters to phonemes.
  const { characters, character_start_times_seconds, character_end_times_seconds } =
    alignment.alignment;

  const timings: Array<{ phoneme: string; start: number; end: number }> = [];
  for (let i = 0; i < characters.length; i++) {
    timings.push({
      phoneme: characters[i],
      start: character_start_times_seconds[i],
      end: character_end_times_seconds[i],
    });
  }

  return timings;
}

Highlight Text in Real-time

// Create data for real-time text highlighting
interface HighlightSegment {
  text: string;
  startTime: number;
  endTime: number;
  charStart: number;
  charEnd: number;
}

// Map each aligned word to its time span and its character span in the
// transcript. NOTE(review): the character offsets assume words are separated
// by exactly one space in the source text — confirm against the transcript
// format before relying on charStart/charEnd.
async function createHighlightMap(
  audioPath: string,
  transcript: string
): Promise<HighlightSegment[]> {
  const audioFile = await readFile(audioPath);

  const alignment = await client.forcedAlignment.create({
    audio: new File([audioFile], audioPath),
    text: transcript,
  });

  let cursor = 0;

  return alignment.words.map(({ word, start, end }) => {
    const segment: HighlightSegment = {
      text: word,
      startTime: start,
      endTime: end,
      charStart: cursor,
      charEnd: cursor + word.length,
    };
    cursor += word.length + 1; // +1 for the separating space
    return segment;
  });
}

// Use in player:
// segments.forEach(segment => {
//   at time segment.startTime: highlight text from charStart to charEnd
// });

Multi-Language Support

// Align audio in different languages
const languages = ["en", "es", "fr", "de"];

// Each language's audio is aligned sequentially against its own transcript.
for (const lang of languages) {
  const audioFile = await readFile(`audio_${lang}.mp3`);
  const transcript = getTranscript(lang); // Placeholder — supply your own per-language transcript lookup

  const alignment = await client.forcedAlignment.create({
    audio: new File([audioFile], `audio_${lang}.mp3`),
    text: transcript,
    language: lang, // ISO language code passed straight through to the API
  });

  console.log(`${lang} alignment complete:`, alignment.words.length, "words");
}

Batch Processing

// Process multiple files
interface AlignmentJob {
  audioPath: string;
  transcript: string;
  language: string;
}

// Align every job in turn, writing `<audioPath>.json` beside each audio file.
// Jobs run sequentially and each is wrapped in its own try/catch, so one
// failure never aborts the rest of the batch.
async function batchAlign(jobs: AlignmentJob[]): Promise<void> {
  for (const job of jobs) {
    try {
      const audioBytes = await readFile(job.audioPath);

      const result = await client.forcedAlignment.create({
        audio: new File([audioBytes], job.audioPath),
        text: job.transcript,
        language: job.language,
      });

      console.log(`✓ Aligned ${job.audioPath}`);

      // Save results
      const outputPath = `${job.audioPath}.json`;
      await writeFile(outputPath, JSON.stringify(result, null, 2));
    } catch (error) {
      console.error(`✗ Failed ${job.audioPath}:`, error);
    }
  }
}