0
# Audio Processing
1
2
## Audio Isolation
3
4
### audioIsolation.convert()
5
6
Remove background noise from audio.
7
8
```typescript { .api }
9
convert(
10
request: {
11
audio: File | Blob;
12
fileFormat?: string; // e.g., "mp3", "wav"
13
},
14
options?: RequestOptions
15
): Promise<ReadableStream<Uint8Array>>
16
```
17
18
**Example:**
19
```typescript
20
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
import fs from "fs";

const client = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY });

// readFileSync() returns a Node Buffer; wrap it in a Blob because the request
// is documented as taking `File | Blob`.
const noisyAudio = new Blob([fs.readFileSync("noisy.mp3")], { type: "audio/mpeg" });

const clean = await client.audioIsolation.convert({
  audio: noisyAudio,
  // `fileFormat` describes the input encoding, not the file extension:
  // "other" covers encoded audio such as MP3, while "pcm_s16le_16" is
  // reserved for raw 16-bit / 16 kHz / mono PCM input.
  fileFormat: "other"
});

// The result is a Web ReadableStream, so consume it with async iteration
// and copy each chunk into the destination file.
const output = fs.createWriteStream("clean.mp3");
for await (const chunk of clean) {
  output.write(chunk);
}
output.end();
37
```
38
39
## Forced Alignment
40
41
### forcedAlignment.create()
42
43
Align audio to text with precise timing for each word.
44
45
```typescript { .api }
46
create(
47
request: {
48
file: File | Blob;
49
text: string;
50
enabledSpooledFile?: boolean;
51
},
52
options?: RequestOptions
53
): Promise<ForcedAlignmentResponseModel>
54
55
interface ForcedAlignmentResponseModel {
56
characters: string[];
57
characterStartTimesSeconds: number[];
58
characterEndTimesSeconds: number[];
59
audioBase64?: string; // If enabledSpooledFile true
60
}
61
```
62
63
**Example:**
64
```typescript
65
// `file` is documented as `File | Blob`, so wrap the Buffer returned by
// readFileSync() in a Blob before uploading it.
const audioFile = new Blob([fs.readFileSync("speech.mp3")], { type: "audio/mpeg" });

const alignment = await client.forcedAlignment.create({
  file: audioFile,
  text: "This is the spoken text",
  enabledSpooledFile: true
});

// NOTE(review): the field names below follow the response model documented in
// this section; verify them against the typings of the installed SDK version.

// The three arrays are read in parallel: index i describes one character.
alignment.characters.forEach((char, i) => {
  console.log(
    `"${char}": ${alignment.characterStartTimesSeconds[i]}s - ${alignment.characterEndTimesSeconds[i]}s`
  );
});

// Persist the returned audio only when the response actually carries it.
if (alignment.audioBase64) {
  const audioBuffer = Buffer.from(alignment.audioBase64, "base64");
  fs.writeFileSync("aligned.mp3", audioBuffer);
}
83
```
84
85
## Speech-to-Speech
86
87
### speechToSpeech.convert()
88
89
Transform voice characteristics while preserving speech content.
90
91
```typescript { .api }
92
convert(
93
voiceId: string,
94
request: {
95
audio: File | Blob;
96
modelId?: string;
97
voiceSettings?: string;
98
seed?: number;
99
removeBackgroundNoise?: boolean;
100
},
101
options?: RequestOptions
102
): Promise<ReadableStream<Uint8Array>>
103
```
104
105
**Example:**
106
```typescript
107
// `audio` is documented as `File | Blob`, so wrap the Buffer returned by
// readFileSync() in a Blob before uploading it.
const sourceAudio = new Blob([fs.readFileSync("original_voice.mp3")], { type: "audio/mpeg" });

const transformed = await client.speechToSpeech.convert(
  "21m00Tcm4TlvDq8ikWAM", // Target voice ID
  {
    audio: sourceAudio,
    // Voice conversion needs a speech-to-speech ("sts") model; the plain
    // text-to-speech model ids are not valid for this endpoint.
    modelId: "eleven_multilingual_sts_v2",
    removeBackgroundNoise: true
  }
);

// The result is a Web ReadableStream: copy it to disk chunk by chunk.
const output = fs.createWriteStream("transformed_voice.mp3");
for await (const chunk of transformed) {
  output.write(chunk);
}
output.end();
123
```
124
125
## Stream Consumption Pattern
126
127
**CRITICAL**: Do NOT use `.pipe()` with Web Streams API.
128
129
```typescript
130
// ❌ WRONG - Do not use .pipe()
const stream = await client.audioIsolation.convert({ audio: file });
stream.pipe(outputStream); // TypeError: a Web ReadableStream has no .pipe() method

// ✅ CORRECT - Use async iteration
const iterated = await client.audioIsolation.convert({ audio: file });
for await (const chunk of iterated) {
  outputStream.write(chunk);
}

// ✅ CORRECT - Manual reader
const manual = await client.audioIsolation.convert({ audio: file });
const reader = manual.getReader();
try {
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    outputStream.write(value);
  }
} finally {
  // Release the lock even if a read or write throws, so the stream is not
  // left permanently locked to this reader.
  reader.releaseLock();
}
148
```
149
150
## Complete Examples
151
152
### Audio Cleanup Pipeline
153
154
```typescript
155
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
import fs from "fs";

const client = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY });

// 1. Remove background noise
// readFileSync() returns a Node Buffer; wrap it in a Blob because the request
// is documented as taking `File | Blob`.
const noisyAudio = new Blob([fs.readFileSync("recording.mp3")], { type: "audio/mpeg" });
const isolated = await client.audioIsolation.convert({
  audio: noisyAudio,
  // "other" = encoded input such as MP3; "pcm_s16le_16" is only for raw PCM.
  fileFormat: "other"
});

// 2. Save cleaned audio
const cleanFile = fs.createWriteStream("cleaned.mp3");
for await (const chunk of isolated) {
  cleanFile.write(chunk);
}

// 3. Wait until every buffered chunk has been flushed to disk, so the
//    success message below is only printed once the file is complete.
await new Promise<void>((resolve, reject) => {
  cleanFile.once("finish", resolve);
  cleanFile.once("error", reject);
  cleanFile.end();
});

console.log("Audio cleaned");
175
```
176
177
### Forced Alignment for Subtitles
178
179
```typescript
180
// `file` is documented as `File | Blob`, so wrap the Buffer returned by
// readFileSync() in a Blob before uploading it.
const audioFile = new Blob([fs.readFileSync("speech.mp3")], { type: "audio/mpeg" });
const spokenText = "Hello, welcome to our presentation. Today we'll discuss...";

// Request character-level timings for the transcript.
const alignment = await client.forcedAlignment.create({
  file: audioFile,
  text: spokenText,
  enabledSpooledFile: false
});
188
189
// Generate word-level timestamps

/** One word of the transcript with its start/end time in seconds. */
interface WordTiming {
  word: string;
  start: number;
  end: number;
}

/**
 * Groups per-character timings into per-word timings.
 *
 * The three arrays are read in parallel (index i describes one character).
 * Any whitespace character (space, tab, CR, LF, ...) ends the current word;
 * runs of whitespace never produce empty words.
 *
 * @param characters - the aligned characters, in order
 * @param startTimes - start time in seconds of each character
 * @param endTimes - end time in seconds of each character
 * @returns one entry per word, spanning its first to its last character
 */
function toWordTimings(
  characters: string[],
  startTimes: number[],
  endTimes: number[]
): WordTiming[] {
  const timings: WordTiming[] = [];
  let current: WordTiming | undefined;

  for (const [i, char] of characters.entries()) {
    if (char.length === 0) continue; // nothing to append

    if (/\s/.test(char)) {
      // Whitespace closes the word in progress, if there is one.
      if (current) {
        timings.push(current);
        current = undefined;
      }
      continue;
    }

    if (current) {
      // Extend the word; its end follows the latest character seen.
      current.word += char;
      current.end = endTimes[i];
    } else {
      current = { word: char, start: startTimes[i], end: endTimes[i] };
    }
  }

  // Add last word (text that does not end in whitespace)
  if (current) {
    timings.push(current);
  }

  return timings;
}

const words = toWordTimings(
  alignment.characters,
  alignment.characterStartTimesSeconds,
  alignment.characterEndTimesSeconds
);

words.forEach(w => {
  console.log(`"${w.word}": ${w.start}s - ${w.end}s`);
});
224
```
225
226
### Voice Transformation
227
228
```typescript
229
// Transform speaking style
// `audio` is documented as `File | Blob`, so wrap the Buffer returned by
// readFileSync() in a Blob before uploading it.
const recording = new Blob([fs.readFileSync("my_recording.mp3")], { type: "audio/mpeg" });

const professional = await client.speechToSpeech.convert(
  "professional_voice_id", // placeholder: replace with a real target voice ID
  {
    audio: recording,
    // Voice conversion needs a speech-to-speech ("sts") model.
    modelId: "eleven_multilingual_sts_v2",
    removeBackgroundNoise: true,
    seed: 12345 // For reproducibility
  }
);

const output = fs.createWriteStream("professional_voice.mp3");
for await (const chunk of professional) {
  output.write(chunk);
}

// Wait until every buffered chunk has been flushed to disk, so the success
// message below is only printed once the file is complete.
await new Promise<void>((resolve, reject) => {
  output.once("finish", resolve);
  output.once("error", reject);
  output.end();
});

console.log("Voice transformed");
249
```
250
251
## Important Notes
252
253
- **Audio Isolation**: Removes background noise, improves voice clarity
254
- **Forced Alignment**: Character-level precision, useful for subtitles/karaoke
255
- **Speech-to-Speech**: Transforms voice while preserving content and timing
256
- **Stream Handling**: Never use `.pipe()` with Web Streams API
257
- Use `for await...of` or manual reader
258
- Web Streams API returns `ReadableStream<Uint8Array>`
259
- **File Formats**: Supports MP3, WAV, M4A, FLAC, etc.
260
- **Background Noise Removal**: Available in multiple endpoints
261
- `audioIsolation.convert()`: Dedicated noise removal
262
- `speechToSpeech.convert()`: Optional parameter
263
- Voice cloning: Optional during voice creation
264