The official Python library for the Cerebras API
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Modern chat completion API for conversational AI applications. Supports system messages, user messages, assistant messages, streaming responses, function calling, and comprehensive response metadata including token usage and timing information.
Creates chat completions using the conversational message format with support for various AI models and extensive configuration options.
def create(
    self,
    *,
    messages: Iterable[completion_create_params.Message],
    model: str,
    frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN,
    logit_bias: Optional[object] | NotGiven = NOT_GIVEN,
    logprobs: Optional[bool] | NotGiven = NOT_GIVEN,
    max_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN,
    max_tokens: Optional[int] | NotGiven = NOT_GIVEN,
    min_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN,
    min_tokens: Optional[int] | NotGiven = NOT_GIVEN,
    n: Optional[int] | NotGiven = NOT_GIVEN,
    parallel_tool_calls: Optional[bool] | NotGiven = NOT_GIVEN,
    presence_penalty: Optional[float] | NotGiven = NOT_GIVEN,
    reasoning_effort: Optional[Literal["low", "medium", "high"]] | NotGiven = NOT_GIVEN,
    response_format: Optional[completion_create_params.ResponseFormat] | NotGiven = NOT_GIVEN,
    seed: Optional[int] | NotGiven = NOT_GIVEN,
    service_tier: Optional[Literal["auto", "default"]] | NotGiven = NOT_GIVEN,
    stop: Union[str, List[str], None] | NotGiven = NOT_GIVEN,
    stream: Optional[bool] | NotGiven = NOT_GIVEN,
    stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN,
    temperature: Optional[float] | NotGiven = NOT_GIVEN,
    tool_choice: Optional[completion_create_params.ToolChoice] | NotGiven = NOT_GIVEN,
    tools: Optional[Iterable[completion_create_params.Tool]] | NotGiven = NOT_GIVEN,
    top_logprobs: Optional[int] | NotGiven = NOT_GIVEN,
    top_p: Optional[float] | NotGiven = NOT_GIVEN,
    user: Optional[str] | NotGiven = NOT_GIVEN,
    cf_ray: str | NotGiven = NOT_GIVEN,
    x_amz_cf_id: str | NotGiven = NOT_GIVEN,
    x_delay_time: float | NotGiven = NOT_GIVEN,
    extra_headers: Headers | None = None,
    extra_query: Query | None = None,
    extra_body: Body | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> ChatCompletion | Stream[ChatCompletion]:
    """Create a chat completion.

    Parameters:
    - messages: List of conversation messages with roles (system, user, assistant)
    - model: ID of the model to use (e.g., "llama3.1-70b")
    - frequency_penalty: Penalty for frequent token usage (-2.0 to 2.0)
    - logit_bias: Modify likelihood of specific tokens appearing (JSON object)
    - logprobs: Return log probabilities of output tokens
    - max_completion_tokens: Maximum number of completion tokens to generate
    - max_tokens: Maximum number of tokens to generate (legacy parameter)
    - min_completion_tokens: Minimum number of completion tokens to generate
    - min_tokens: Minimum number of tokens to generate (legacy parameter)
    - n: Number of completion choices to generate
    - parallel_tool_calls: Enable parallel tool calling
    - presence_penalty: Penalty for token presence (-2.0 to 2.0)
    - reasoning_effort: Reasoning effort level ("low", "medium", "high")
    - response_format: Format of the response (text or json_object)
    - seed: Random seed for deterministic generation
    - service_tier: Service tier for request processing ("auto", "default")
    - stop: Sequences where generation should stop
    - stream: Enable streaming response (use stream=True for streaming)
    - stream_options: Additional streaming options
    - temperature: Sampling temperature (0.0 to 2.0)
    - tool_choice: Control tool calling behavior
    - tools: List of available tools/functions
    - top_logprobs: Number of top log probabilities to return per token
    - top_p: Nucleus sampling parameter
    - user: Unique identifier for the end-user
    - cf_ray: CloudFlare Ray ID for request tracing
    - x_amz_cf_id: Amazon CloudFront ID for request tracing
    - x_delay_time: Additional delay time for request processing
    - extra_headers: Additional headers to include with the request
    - extra_query: Additional query parameters
    - extra_body: Additional request body data
    - timeout: Request timeout override

    Returns:
        ChatCompletion object, or Stream[ChatCompletion] when stream=True.
    """
    ...  # Implementation omitted in this interface summary.

# Creates streaming chat completions for real-time token generation and immediate response delivery.
def create(
    self,
    *,
    messages: Iterable[CompletionCreateParams.Message],
    model: str,
    stream: Literal[True],
    **kwargs
) -> Stream[ChatCompletionChunk]:
    """Create a streaming chat completion.

    Parameters:
    - stream: Must be True for streaming responses
    - All other parameters same as non-streaming create()

    Returns:
        Stream object yielding ChatCompletionChunk objects as tokens are generated.
    """
    ...  # Implementation omitted in this interface summary.

# Synchronous and asynchronous resource classes that provide the chat completion API methods.
class ChatResource:
    """Synchronous chat resource."""

    # Sub-resource providing the completion create() methods.
    completions: CompletionsResource

    @cached_property
    def with_raw_response(self) -> ChatResourceWithRawResponse: ...

    @cached_property
    def with_streaming_response(self) -> ChatResourceWithStreamingResponse: ...
class AsyncChatResource:
    """Asynchronous chat resource."""

    # Sub-resource providing the async completion create() methods.
    completions: AsyncCompletionsResource

    @cached_property
    def with_raw_response(self) -> AsyncChatResourceWithRawResponse: ...

    @cached_property
    def with_streaming_response(self) -> AsyncChatResourceWithStreamingResponse: ...
class CompletionsResource(SyncAPIResource):
    """Synchronous chat completions resource."""
class AsyncCompletionsResource(AsyncAPIResource):
    """Asynchronous chat completions resource."""

class Message(TypedDict):
    """Base message structure for chat completions."""
    role: Literal["system", "user", "assistant", "tool"]
    content: str
    name: NotRequired[str]  # Optional name for the message author
class SystemMessage(Message):
    """System message for setting context and instructions."""
    role: Literal["system"]
    content: str
class UserMessage(Message):
    """User message containing the user's input."""
    role: Literal["user"]
    content: str
class AssistantMessage(Message):
    """Assistant message with the AI's response."""
    role: Literal["assistant"]
    content: str
    tool_calls: NotRequired[List[ToolCall]]  # Optional tool calls made by the assistant
class ToolMessage(Message):
    """Tool message containing tool execution results."""
    role: Literal["tool"]
    content: str
    tool_call_id: str  # ID of the tool call this responds to

class Tool(TypedDict):
    """Tool/function definition for function calling."""
    type: Literal["function"]
    function: FunctionDefinition
class FunctionDefinition(TypedDict):
    """Function definition with name, description, and parameters."""
    name: str
    description: str
    parameters: Dict[str, Any]  # JSON Schema describing the function's parameters
class ToolCall(TypedDict):
    """Tool call made by the assistant."""
    id: str
    type: Literal["function"]
    function: FunctionCall
class FunctionCall(TypedDict):
    """Function call details."""
    name: str
    arguments: str  # JSON string of arguments
class ToolChoice(TypedDict):
    """Tool choice configuration."""
    type: Literal["function"]
    function: Dict[str, str]  # e.g. {"name": "function_name"}

class ChatCompletion(BaseModel):
    """Complete chat completion response."""
    id: str
    choices: List[ChatCompletionChoice]
    created: int
    model: str
    object: Literal["chat.completion"]
    system_fingerprint: Optional[str]
    usage: Optional[ChatCompletionUsage]
    time_info: Optional[ChatCompletionTimeInfo]
class ChatCompletionChoice(BaseModel):
    """Individual completion choice."""
    finish_reason: Optional[Literal["stop", "length", "tool_calls", "content_filter"]]
    index: int
    logprobs: Optional[ChatCompletionLogprobs]
    message: ChatCompletionMessage
class ChatCompletionMessage(BaseModel):
    """Message in the completion response."""
    content: Optional[str]
    role: Literal["assistant"]
    tool_calls: Optional[List[ChatCompletionMessageToolCall]]
class ChatCompletionUsage(BaseModel):
    """Token usage information."""
    completion_tokens: int
    prompt_tokens: int
    total_tokens: int
    prompt_tokens_details: Optional[ChatCompletionUsagePromptTokensDetails]
class ChatCompletionTimeInfo(BaseModel):
    """Timing information for the completion."""
    queue_time: Optional[float]
    prompt_time: Optional[float]
    completion_time: Optional[float]
    total_time: Optional[float]

class ChatCompletionChunk(BaseModel):
    """Streaming chunk in chat completion."""
    id: str
    choices: List[ChatCompletionChunkChoice]
    created: int
    model: str
    object: Literal["chat.completion.chunk"]
    system_fingerprint: Optional[str]
    usage: Optional[ChatCompletionUsage]
    time_info: Optional[ChatCompletionTimeInfo]
class ChatCompletionChunkChoice(BaseModel):
    """Choice in streaming chunk."""
    delta: ChatCompletionChunkDelta
    finish_reason: Optional[Literal["stop", "length", "tool_calls", "content_filter"]]
    index: int
    logprobs: Optional[ChatCompletionLogprobs]
class ChatCompletionChunkDelta(BaseModel):
    """Delta information in streaming chunk."""
    content: Optional[str]
    role: Optional[Literal["assistant"]]
    tool_calls: Optional[List[ChatCompletionChunkDeltaToolCall]]

# Example: basic (non-streaming) chat completion.
from cerebras.cloud.sdk import Cerebras

client = Cerebras()
response = client.chat.completions.create(
    model="llama3.1-70b",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is machine learning?"}
    ],
    max_tokens=200,
    temperature=0.7
)
print(response.choices[0].message.content)
print(f"Used {response.usage.total_tokens} tokens")

from cerebras.cloud.sdk import Cerebras
# Example: streaming chat completion, printing tokens as they arrive.
client = Cerebras()
stream = client.chat.completions.create(
    model="llama3.1-70b",
    messages=[
        {"role": "user", "content": "Tell me a short story"}
    ],
    stream=True,
    max_tokens=500
)
print("Story: ", end="")
for chunk in stream:
    # A chunk's delta may carry a content fragment or be empty (e.g. role-only / final chunk).
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()

from cerebras.cloud.sdk import Cerebras
# Example: function/tool calling.
import json

client = Cerebras()

# Define a function
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get weather information for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state/country"
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "Temperature unit"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

response = client.chat.completions.create(
    model="llama3.1-70b",
    messages=[
        {"role": "user", "content": "What's the weather like in San Francisco?"}
    ],
    tools=tools,
    tool_choice="auto"
)

# Check if the model wants to call a function
message = response.choices[0].message
if message.tool_calls:
    tool_call = message.tool_calls[0]
    function_name = tool_call.function.name
    # arguments is a JSON-encoded string; decode it before use.
    function_args = json.loads(tool_call.function.arguments)
    print(f"Model wants to call {function_name} with args: {function_args}")

import asyncio
# Example: asynchronous usage with AsyncCerebras.
from cerebras.cloud.sdk import AsyncCerebras

async def chat_example():
    client = AsyncCerebras()
    response = await client.chat.completions.create(
        model="llama3.1-70b",
        messages=[
            {"role": "user", "content": "Explain quantum computing"}
        ],
        max_tokens=300
    )
    print(response.choices[0].message.content)
    # Close the underlying HTTP client when done.
    await client.aclose()

asyncio.run(chat_example())

from cerebras.cloud.sdk import Cerebras
client = Cerebras()
response = client.chat.completions.create(
model="llama3.1-70b",
messages=[
{"role": "user", "content": "Write a creative opening line for a story"}
],
n=3, # Generate 3 different completions
max_tokens=50,
temperature=0.9
)
for i, choice in enumerate(response.choices):
print(f"Option {i+1}: {choice.message.content}")Install with Tessl CLI
npx tessl i tessl/pypi-cerebras-cloud-sdk