Align audio files to text with precise character and word timing. Forced alignment provides accurate synchronization between audio and text for subtitles, karaoke, animation, and accessibility features.
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
// NOTE(review): examples use a literal key for brevity — load it from an
// environment variable in real code.
const client = new ElevenLabsClient({ apiKey: "your-api-key" });
// Access this API via: client.forcedAlignment
// Align audio to text and get detailed timing information.
/**
 * Align an audio file to its transcript (forced alignment).
 *
 * @param request - Audio file and text to align
 * @param requestOptions - Optional request configuration
 * @returns Alignment with character and word timestamps
 * @throws UnprocessableEntityError if request fails
 */
client.forcedAlignment.create(
  request: BodyCreateForcedAlignmentV1ForcedAlignmentPost,
  requestOptions?: RequestOptions
): HttpResponsePromise<ForcedAlignmentResponseModel>;
/** Request body for the forced-alignment endpoint. */
interface BodyCreateForcedAlignmentV1ForcedAlignmentPost {
  /** Audio file to align */
  audio: File | Blob;
  /** Text transcript of the audio */
  text: string;
  /** Language code (e.g., "en", "es") */
  language?: string;
}
/** Response returned by `client.forcedAlignment.create`. */
interface ForcedAlignmentResponseModel {
  /** Character-level alignment (parallel arrays, see Alignment) */
  alignment: Alignment;
  /** Word-level alignment, one entry per word in reading order */
  words: WordAlignment[];
}
/**
 * Character-level timing as three parallel arrays: index i of each array
 * refers to the same character.
 */
interface Alignment {
  /** Array of characters */
  characters: string[];
  /** Start times for each character (seconds) */
  character_start_times_seconds: number[];
  /** End times for each character (seconds) */
  character_end_times_seconds: number[];
}
/** Timing (and optional confidence) for a single aligned word. */
interface WordAlignment {
  /** Word text */
  word: string;
  /** Start time in seconds */
  start: number;
  /** End time in seconds */
  end: number;
  /** Confidence score (0.0 to 1.0); may be absent */
  confidence?: number;
}
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
import { readFile, writeFile } from "fs/promises";
const client = new ElevenLabsClient({ apiKey: "your-api-key" });
// Load audio file
const audioFile = await readFile("speech.mp3");
const transcript = "Hello world, this is a test of forced alignment.";
// Create alignment — wrap the raw bytes in a File so the SDK can upload them
const alignment = await client.forcedAlignment.create({
  audio: new File([audioFile], "speech.mp3"),
  text: transcript,
  language: "en",
});
// Access character timing (parallel arrays — same index, same character)
console.log("Characters:", alignment.alignment.characters);
console.log("Character start times:", alignment.alignment.character_start_times_seconds);
// Access word timing: each entry carries the word plus start/end seconds
for (const word of alignment.words) {
  console.log(`"${word.word}": ${word.start}s - ${word.end}s`);
}
// Create SRT subtitles from forced alignment
/**
 * Build an SRT subtitle document from a forced alignment of the audio,
 * grouping the aligned words into fixed-size cues.
 *
 * @param audioPath - Path of the audio file to align
 * @param transcript - Text spoken in the audio
 * @returns The complete SRT file contents as a string
 */
async function generateSubtitles(
  audioPath: string,
  transcript: string
): Promise<string> {
  const audioBytes = await readFile(audioPath);
  const alignment = await client.forcedAlignment.create({
    audio: new File([audioBytes], audioPath),
    text: transcript,
  });
  // Emit one cue per group of up to 10 words.
  const chunkSize = 10;
  const cues: string[] = [];
  for (let offset = 0; offset < alignment.words.length; offset += chunkSize) {
    const group = alignment.words.slice(offset, offset + chunkSize);
    const cueStart = group[0].start;
    const cueEnd = group[group.length - 1].end;
    const cueText = group.map(w => w.word).join(" ");
    const cueIndex = Math.floor(offset / chunkSize) + 1; // SRT indices are 1-based
    cues.push(
      `${cueIndex}\n${formatSrtTime(cueStart)} --> ${formatSrtTime(cueEnd)}\n${cueText}\n\n`
    );
  }
  return cues.join("");
}
/**
 * Format a time in seconds as an SRT timestamp: HH:MM:SS,mmm
 * (SRT uses a comma before the milliseconds).
 */
function formatSrtTime(seconds: number): string {
  const pad = (value: number, width: number) => String(value).padStart(width, "0");
  const hours = Math.floor(seconds / 3600);
  const minutes = Math.floor((seconds % 3600) / 60);
  const wholeSeconds = Math.floor(seconds % 60);
  const milliseconds = Math.floor((seconds % 1) * 1000);
  return `${pad(hours, 2)}:${pad(minutes, 2)}:${pad(wholeSeconds, 2)},${pad(milliseconds, 3)}`;
}
// Example usage: build the subtitles and save them next to the audio.
const srt = await generateSubtitles("audio.mp3", "The quick brown fox...");
await writeFile("subtitles.srt", srt);
// Get timing for character-by-character animation
/**
 * Return one { char, start, end } record per character, suitable for
 * driving per-character animation keyframes.
 *
 * @param audioPath - Path of the audio file to align
 * @param text - Text spoken in the audio
 */
async function getCharacterTimings(
  audioPath: string,
  text: string
): Promise<Array<{ char: string; start: number; end: number }>> {
  const audioBytes = await readFile(audioPath);
  const response = await client.forcedAlignment.create({
    audio: new File([audioBytes], audioPath),
    text,
  });
  // Zip the three parallel arrays of the character alignment into records.
  const charAlignment = response.alignment;
  return charAlignment.characters.map((char, i) => ({
    char,
    start: charAlignment.character_start_times_seconds[i],
    end: charAlignment.character_end_times_seconds[i],
  }));
}
// Example usage: fetch character timings for a short phrase.
const timings = await getCharacterTimings("audio.mp3", "Hello");
// Use timings for animation keyframes
// Create karaoke-style lyrics with word highlighting
/** One display line of karaoke lyrics with per-word timing. */
interface KaraokeLine {
  /** Words in the line, each with its highlight window in seconds */
  words: Array<{
    text: string;
    startTime: number;
    endTime: number;
  }>;
}
/**
 * Align song audio with its lyrics and group the timed words into
 * karaoke lines, splitting on newlines or periods.
 *
 * Fix: empty fragments produced by the split (e.g. after a trailing
 * period or a blank line) previously counted as one word, because
 * "".split(/\s+/) yields [""] — that shifted wordIndex and
 * desynchronized every following line. Empty fragments are now skipped.
 *
 * @param audioPath - Path of the song audio file
 * @param lyrics - Full lyrics text
 * @returns One KaraokeLine per non-empty sentence/line of the lyrics
 */
async function createKaraokeLyrics(
  audioPath: string,
  lyrics: string
): Promise<KaraokeLine[]> {
  const audioFile = await readFile(audioPath);
  const alignment = await client.forcedAlignment.create({
    audio: new File([audioFile], audioPath),
    text: lyrics,
  });
  // Group into lines (split on newlines or periods)
  const sentences = lyrics.split(/[.\n]/);
  const lines: KaraokeLine[] = [];
  let wordIndex = 0;
  for (const sentence of sentences) {
    const trimmed = sentence.trim();
    // Skip empty fragments so they don't consume a word slot.
    if (trimmed === "") continue;
    const wordCount = trimmed.split(/\s+/).length;
    const lineWords = alignment.words.slice(wordIndex, wordIndex + wordCount);
    lines.push({
      words: lineWords.map(w => ({
        text: w.word,
        startTime: w.start,
        endTime: w.end,
      })),
    });
    wordIndex += wordCount;
  }
  return lines;
}
// Generate accessible captions with proper timing
/**
 * Produce WebVTT captions paced for comfortable reading: each cue
 * covers roughly two seconds of speech (the final cue may be shorter).
 *
 * @param audioPath - Path of the audio file to align
 * @param transcript - Text spoken in the audio
 * @returns The complete WebVTT document as a string
 */
async function generateAccessibleCaptions(
  audioPath: string,
  transcript: string
): Promise<string> {
  const audioData = await readFile(audioPath);
  const alignment = await client.forcedAlignment.create({
    audio: new File([audioData], audioPath),
    text: transcript,
  });
  // WebVTT format
  let vtt = "WEBVTT\n\n";
  const maxDuration = 2.0; // seconds of speech per caption
  const words = alignment.words;
  let pending: string[] = [];
  let cueStart = 0;
  for (let i = 0; i < words.length; i++) {
    const word = words[i];
    if (pending.length === 0) {
      cueStart = word.start; // first word of a new cue anchors its start
    }
    pending.push(word.word);
    const isLastWord = i === words.length - 1;
    // Flush the cue when it spans enough time, or at end of input.
    if (word.end - cueStart >= maxDuration || isLastWord) {
      vtt += `${formatWebVttTime(cueStart)} --> ${formatWebVttTime(word.end)}\n`;
      vtt += `${pending.join(" ")}\n\n`;
      pending = [];
    }
  }
  return vtt;
}
/**
 * Format a time in seconds as a WebVTT timestamp.
 *
 * Fix: the previous version never emitted an hours component, so audio
 * longer than an hour produced minute values above 59 (e.g. "75:30.000"),
 * which is not a valid WebVTT timestamp. Output for times under one hour
 * is unchanged (MM:SS.mmm); longer times now use HH:MM:SS.mmm.
 */
function formatWebVttTime(seconds: number): string {
  const hours = Math.floor(seconds / 3600);
  const mins = Math.floor((seconds % 3600) / 60);
  const secs = seconds % 60;
  const mmss = `${String(mins).padStart(2, "0")}:${secs.toFixed(3).padStart(6, "0")}`;
  return hours > 0 ? `${String(hours).padStart(2, "0")}:${mmss}` : mmss;
}
// Check if transcript matches audio using confidence scores
/**
 * Sanity-check a transcript against its audio using the per-word
 * confidence scores returned by forced alignment.
 *
 * Fixes over the naive version:
 * - If no word carries a confidence score, the average was 0/0 = NaN;
 *   it now defaults to 0 (and the transcript reads as not valid).
 * - Low-confidence filtering used `w.confidence && ...`, which skipped a
 *   legitimate confidence of exactly 0 (falsy); it now compares against
 *   undefined explicitly.
 *
 * @param audioPath - Path of the audio file to align
 * @param transcript - Candidate transcript to validate
 * @returns Overall verdict, average confidence, and words below 0.5
 */
async function validateTranscript(
  audioPath: string,
  transcript: string
): Promise<{ isValid: boolean; avgConfidence: number; lowConfidenceWords: string[] }> {
  const audioFile = await readFile(audioPath);
  const alignment = await client.forcedAlignment.create({
    audio: new File([audioFile], audioPath),
    text: transcript,
  });
  const confidences = alignment.words
    .filter(w => w.confidence !== undefined)
    .map(w => w.confidence!);
  // Guard against division by zero when confidences is empty.
  const avgConfidence =
    confidences.length > 0
      ? confidences.reduce((a, b) => a + b, 0) / confidences.length
      : 0;
  const lowConfidenceWords = alignment.words
    .filter(w => w.confidence !== undefined && w.confidence < 0.5)
    .map(w => w.word);
  return {
    isValid: avgConfidence > 0.7,
    avgConfidence,
    lowConfidenceWords,
  };
}
// Export alignment data as JSON
/**
 * Serialize the word- and character-level alignment of an audio file to
 * a pretty-printed JSON file at outputPath.
 *
 * @param audioPath - Path of the audio file to align
 * @param transcript - Text spoken in the audio
 * @param outputPath - Destination path for the JSON document
 */
async function exportAlignment(
  audioPath: string,
  transcript: string,
  outputPath: string
): Promise<void> {
  const audioBytes = await readFile(audioPath);
  const result = await client.forcedAlignment.create({
    audio: new File([audioBytes], audioPath),
    text: transcript,
  });
  const payload = {
    transcript,
    words: result.words,
    characters: {
      chars: result.alignment.characters,
      start_times: result.alignment.character_start_times_seconds,
      end_times: result.alignment.character_end_times_seconds,
    },
  };
  await writeFile(outputPath, JSON.stringify(payload, null, 2));
}
await exportAlignment("audio.mp3", "Transcript text", "alignment.json");// Generate phoneme timing for lip sync animation
/**
 * Approximate per-phoneme timing for lip-sync animation using the
 * character-level alignment.
 *
 * NOTE: characters are only a rough stand-in for phonemes; a production
 * pipeline would map characters to phonemes first.
 *
 * @param audioPath - Path of the audio file to align
 * @param transcript - Text spoken in the audio
 */
async function getPhonemeTimings(
  audioPath: string,
  transcript: string
): Promise<Array<{ phoneme: string; start: number; end: number }>> {
  const audioBuffer = await readFile(audioPath);
  const response = await client.forcedAlignment.create({
    audio: new File([audioBuffer], audioPath),
    text: transcript,
  });
  const { characters, character_start_times_seconds, character_end_times_seconds } =
    response.alignment;
  const phonemes: Array<{ phoneme: string; start: number; end: number }> = [];
  for (let i = 0; i < characters.length; i++) {
    phonemes.push({
      phoneme: characters[i],
      start: character_start_times_seconds[i],
      end: character_end_times_seconds[i],
    });
  }
  return phonemes;
}
// Create data for real-time text highlighting
/** A word's highlight window plus its character span in the transcript. */
interface HighlightSegment {
  /** Word text */
  text: string;
  /** Highlight start, seconds into the audio */
  startTime: number;
  /** Highlight end, seconds into the audio */
  endTime: number;
  /** Index of the word's first character in the transcript */
  charStart: number;
  /** Index one past the word's last character */
  charEnd: number;
}
/**
 * Map each aligned word to its character offsets in the transcript so a
 * player can highlight the text in sync with audio playback.
 *
 * NOTE(review): offsets assume words are separated by exactly one space;
 * verify against transcripts containing other whitespace or punctuation.
 *
 * @param audioPath - Path of the audio file to align
 * @param transcript - Text spoken in the audio
 */
async function createHighlightMap(
  audioPath: string,
  transcript: string
): Promise<HighlightSegment[]> {
  const audioData = await readFile(audioPath);
  const alignment = await client.forcedAlignment.create({
    audio: new File([audioData], audioPath),
    text: transcript,
  });
  let offset = 0;
  return alignment.words.map(word => {
    const segment: HighlightSegment = {
      text: word.word,
      startTime: word.start,
      endTime: word.end,
      charStart: offset,
      charEnd: offset + word.word.length,
    };
    offset += word.word.length + 1; // +1 for the separating space
    return segment;
  });
}
// Use in player:
// segments.forEach(segment => {
// at time segment.startTime: highlight text from charStart to charEnd
// });
// Align audio in different languages
// Align one audio file per language, passing an explicit language code.
const languages = ["en", "es", "fr", "de"];
for (const lang of languages) {
  const audioFile = await readFile(`audio_${lang}.mp3`);
  // NOTE(review): getTranscript is a placeholder — supply your own lookup.
  const transcript = getTranscript(lang); // Get transcript for language
  const alignment = await client.forcedAlignment.create({
    audio: new File([audioFile], `audio_${lang}.mp3`),
    text: transcript,
    language: lang,
  });
  console.log(`${lang} alignment complete:`, alignment.words.length, "words");
}
// Process multiple files
/** One unit of work for batchAlign. */
interface AlignmentJob {
  /** Path of the audio file to align */
  audioPath: string;
  /** Text spoken in the audio */
  transcript: string;
  /** Language code (e.g., "en") */
  language: string;
}
/**
 * Process a list of alignment jobs sequentially, writing each result to
 * `<audioPath>.json`. A failed job is logged and does not stop the rest.
 *
 * @param jobs - Audio/transcript/language triples to align
 */
async function batchAlign(jobs: AlignmentJob[]): Promise<void> {
  for (const job of jobs) {
    try {
      const audioBytes = await readFile(job.audioPath);
      const result = await client.forcedAlignment.create({
        audio: new File([audioBytes], job.audioPath),
        text: job.transcript,
        language: job.language,
      });
      console.log(`✓ Aligned ${job.audioPath}`);
      // Persist the full response next to the source audio.
      await writeFile(`${job.audioPath}.json`, JSON.stringify(result, null, 2));
    } catch (error) {
      // Log and continue with the remaining jobs.
      console.error(`✗ Failed ${job.audioPath}:`, error);
    }
  }
}