Python Client SDK for the Mistral AI API with chat completions, embeddings, fine-tuning, and agent capabilities.
---
Transcribe audio files to text with support for various audio formats and streaming. The audio API provides accurate speech-to-text conversion with language detection and formatting options.
Convert audio files to text with customizable options.
def transcribe(
    file: Union[str, BinaryIO],
    model: str,
    language: Optional[str] = None,
    prompt: Optional[str] = None,
    response_format: Optional[str] = None,
    temperature: Optional[float] = None,
    timestamp_granularities: Optional[List[str]] = None,
    **kwargs
) -> TranscriptionResponse:
    """
    Transcribe audio to text.

    Parameters:
    - file: Audio file path (string) or file-like object (BinaryIO)
    - model: Transcription model identifier
    - language: Optional language code (e.g., "en", "fr", "es")
    - prompt: Optional prompt to guide transcription
    - response_format: Output format ("json", "text", "srt", "vtt")
    - temperature: Sampling temperature for transcription
    - timestamp_granularities: Timestamp precision levels
    - **kwargs: Additional options passed through to the API

    Returns:
    TranscriptionResponse with transcribed text and metadata
    """
    ...  # reference stub — real implementation lives in the SDK

# Transcribe audio in real-time from streaming input.
def transcribe_stream(
    stream: Iterator[bytes],
    model: str,
    language: Optional[str] = None,
    **kwargs
) -> Iterator[TranscriptionStreamEvents]:
    """
    Transcribe streaming audio.

    Parameters:
    - stream: Iterator of audio bytes
    - model: Transcription model identifier
    - language: Optional language code
    - **kwargs: Additional options passed through to the API

    Returns:
    Iterator of transcription events with partial and final results
    """
    ...  # reference stub — real implementation lives in the SDK

from mistralai import Mistral
client = Mistral(api_key="your-api-key")

# Transcribe an audio file
with open("recording.mp3", "rb") as audio_file:
    response = client.audio.transcribe(
        file=audio_file,
        model="whisper-1",
        language="en",
        response_format="json"
    )

# `response` stays in scope after the `with` block closes the file
print("Transcription:")
print(response.text)
print(f"Language detected: {response.language}")
print(f"Duration: {response.duration} seconds")

# Get detailed transcription with timestamps
response = client.audio.transcribe(
    file="meeting_recording.wav",
    model="whisper-1",
    response_format="json",
    timestamp_granularities=["word", "segment"]
)

print("Detailed transcription:")
for segment in response.segments:
    start_time = segment.start
    end_time = segment.end
    text = segment.text
    print(f"[{start_time:.2f}s - {end_time:.2f}s]: {text}")

# Word-level timestamps (only present when "word" granularity was requested)
if hasattr(response, 'words'):
    print("\nWord-level timing:")
    for word in response.words[:10]:  # First 10 words
        print(f"'{word.word}' at {word.start:.2f}s")

# Get transcription in different formats
formats = ["json", "text", "srt", "vtt"]
for fmt in formats:  # renamed from `format` to avoid shadowing the builtin
    response = client.audio.transcribe(
        file="presentation.m4a",
        model="whisper-1",
        response_format=fmt
    )
    # Save to file; plain "text" output gets a .txt extension
    extension = "txt" if fmt == "text" else fmt
    with open(f"transcription.{extension}", "w") as f:
        if fmt == "json":
            f.write(response.text)
        else:
            # non-JSON formats — presumably the response is the raw string payload; verify
            f.write(response)
    print(f"Saved transcription in {fmt} format")

import pyaudio
import threading  # NOTE(review): unused in this example — kept from original
import queue      # NOTE(review): unused in this example — kept from original

# Setup audio stream
def audio_stream_generator():
    """Yield 1024-frame chunks of 16 kHz mono 16-bit PCM from the default input device."""
    audio = pyaudio.PyAudio()
    stream = audio.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=16000,
        input=True,
        frames_per_buffer=1024
    )
    try:
        while True:
            data = stream.read(1024)
            yield data
    finally:
        # Release the audio device even if the consumer stops iterating early
        stream.stop_stream()
        stream.close()
        audio.terminate()

# Transcribe streaming audio
print("Starting real-time transcription...")
stream = client.audio.transcribe_stream(
    stream=audio_stream_generator(),
    model="whisper-1",
    language="en"
)

for event in stream:
    if event.type == "transcription.partial":
        print(f"Partial: {event.text}", end="\r")
    elif event.type == "transcription.completed":
        print(f"\nFinal: {event.text}")

import os
# Process multiple audio files
audio_files = ["interview1.mp3", "interview2.wav", "lecture.m4a"]
transcriptions = {}

for audio_file in audio_files:
    if os.path.exists(audio_file):
        print(f"Processing {audio_file}...")
        response = client.audio.transcribe(
            file=audio_file,
            model="whisper-1",
            language="auto",  # Auto-detect language — NOTE(review): confirm "auto" is accepted; some APIs auto-detect when `language` is omitted
            response_format="json"
        )
        transcriptions[audio_file] = {
            "text": response.text,
            "language": response.language,
            "duration": response.duration
        }
        print(f"  Completed: {len(response.text)} characters")

# Save all transcriptions
import json
with open("all_transcriptions.json", "w") as f:
    json.dump(transcriptions, f, indent=2)


class AudioTranscriptionRequest:
    """Request payload for a one-shot audio transcription call."""
    file: Union[str, BinaryIO]                     # audio path or file-like object
    model: str                                     # transcription model identifier
    language: Optional[str]                        # optional language code
    prompt: Optional[str]                          # optional guiding prompt
    response_format: Optional[str]                 # "json", "text", "srt", "vtt"
    temperature: Optional[float]                   # sampling temperature
    timestamp_granularities: Optional[List[str]]   # e.g. ["word", "segment"]
class AudioTranscriptionRequestStream:
    """Request payload for a streaming audio transcription call."""
    stream: Iterator[bytes]    # iterator of raw audio chunks
    model: str                 # transcription model identifier
    language: Optional[str]    # optional language code


class TranscriptionResponse:
    """Result of a transcription call."""
    text: str                                       # full transcribed text
    language: Optional[str]                         # detected/requested language
    duration: Optional[float]                       # audio duration in seconds
    segments: Optional[List[TranscriptionSegment]]  # segment-level timing, when requested
    words: Optional[List[TranscriptionWord]]        # word-level timing, when requested
class TranscriptionSegment:
    """One time-aligned segment of the transcript."""
    id: int                             # sequential segment index
    start: float                        # segment start time, seconds
    end: float                          # segment end time, seconds
    text: str                           # transcribed text for this span
    temperature: Optional[float]        # sampling temperature used for this segment
    avg_logprob: Optional[float]        # average log-probability of the segment
    compression_ratio: Optional[float]  # compression ratio of the segment text
    no_speech_prob: Optional[float]     # probability the segment contains no speech
class TranscriptionWord:
    """Word-level timestamp entry."""
    word: str     # the word text
    start: float  # start time, seconds
    end: float    # end time, seconds
class TranscriptionStreamEvents:
    """Event emitted during streaming transcription."""
    type: str  # "transcription.partial", "transcription.completed", "error"
    text: Optional[str]         # transcribed text carried by the event
    language: Optional[str]     # detected language, when available
    timestamp: Optional[float]  # event timestamp, when available


class TranscriptionStreamEventTypes:
    """String constants for TranscriptionStreamEvents.type."""
    PARTIAL = "transcription.partial"
    COMPLETED = "transcription.completed"
    ERROR = "error"
    DONE = "done"

# Supports many languages including:
Install with the Tessl CLI:
`npx tessl i tessl/pypi-mistralai`