CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-cerebras-cloud-sdk

The official Python library for the cerebras API

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

chat-completions.mddocs/

Chat Completions

Modern chat completion API for conversational AI applications. Supports system messages, user messages, assistant messages, streaming responses, function calling, and comprehensive response metadata including token usage and timing information.

Capabilities

Chat Completion Creation

Creates chat completions using the conversational message format with support for various AI models and extensive configuration options.

def create(
    self,
    *,
    messages: Iterable[completion_create_params.Message],
    model: str,
    frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN,
    logit_bias: Optional[object] | NotGiven = NOT_GIVEN,
    logprobs: Optional[bool] | NotGiven = NOT_GIVEN,
    max_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN,
    max_tokens: Optional[int] | NotGiven = NOT_GIVEN,
    min_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN,
    min_tokens: Optional[int] | NotGiven = NOT_GIVEN,
    n: Optional[int] | NotGiven = NOT_GIVEN,
    parallel_tool_calls: Optional[bool] | NotGiven = NOT_GIVEN,
    presence_penalty: Optional[float] | NotGiven = NOT_GIVEN,
    reasoning_effort: Optional[Literal["low", "medium", "high"]] | NotGiven = NOT_GIVEN,
    response_format: Optional[completion_create_params.ResponseFormat] | NotGiven = NOT_GIVEN,
    seed: Optional[int] | NotGiven = NOT_GIVEN,
    service_tier: Optional[Literal["auto", "default"]] | NotGiven = NOT_GIVEN,
    stop: Union[str, List[str], None] | NotGiven = NOT_GIVEN,
    stream: Optional[bool] | NotGiven = NOT_GIVEN,
    stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN,
    temperature: Optional[float] | NotGiven = NOT_GIVEN,
    tool_choice: Optional[completion_create_params.ToolChoice] | NotGiven = NOT_GIVEN,
    tools: Optional[Iterable[completion_create_params.Tool]] | NotGiven = NOT_GIVEN,
    top_logprobs: Optional[int] | NotGiven = NOT_GIVEN,
    top_p: Optional[float] | NotGiven = NOT_GIVEN,
    user: Optional[str] | NotGiven = NOT_GIVEN,
    cf_ray: str | NotGiven = NOT_GIVEN,
    x_amz_cf_id: str | NotGiven = NOT_GIVEN,
    x_delay_time: float | NotGiven = NOT_GIVEN,
    extra_headers: Headers | None = None,
    extra_query: Query | None = None,
    extra_body: Body | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> ChatCompletion | Stream[ChatCompletionChunk]:
    """
    Create a chat completion.
    
    Parameters:
    - messages: List of conversation messages with roles (system, user, assistant)
    - model: ID of the model to use (e.g., "llama3.1-70b")
    - frequency_penalty: Penalty for frequent token usage (-2.0 to 2.0)
    - logit_bias: Modify likelihood of specific tokens appearing (JSON object)
    - logprobs: Return log probabilities of output tokens
    - max_completion_tokens: Maximum number of completion tokens to generate
    - max_tokens: Maximum number of tokens to generate (legacy parameter)
    - min_completion_tokens: Minimum number of completion tokens to generate
    - min_tokens: Minimum number of tokens to generate (legacy parameter)
    - n: Number of completion choices to generate
    - parallel_tool_calls: Enable parallel tool calling
    - presence_penalty: Penalty for token presence (-2.0 to 2.0)
    - reasoning_effort: Reasoning effort level ("low", "medium", "high")
    - response_format: Format of the response (text or json_object)
    - seed: Random seed for deterministic generation
    - service_tier: Service tier for request processing ("auto", "default")
    - stop: Sequences where generation should stop
    - stream: Enable streaming response (use stream=True for streaming)
    - stream_options: Additional streaming options
    - temperature: Sampling temperature (0.0 to 2.0)
    - tool_choice: Control tool calling behavior
    - tools: List of available tools/functions
    - top_logprobs: Number of top log probabilities to return per token
    - top_p: Nucleus sampling parameter
    - user: Unique identifier for the end-user
    - cf_ray: CloudFlare Ray ID for request tracing
    - x_amz_cf_id: Amazon CloudFront ID for request tracing
    - x_delay_time: Additional delay time for request processing
    - extra_headers: Additional headers to include with the request
    - extra_query: Additional query parameters
    - extra_body: Additional request body data
    - timeout: Request timeout override
    
    Returns:
    ChatCompletion when stream is omitted or False; Stream[ChatCompletionChunk]
    (an iterable of incremental chunks) when stream=True.
    """

Streaming Chat Completion

Creates streaming chat completions for real-time token generation and immediate response delivery.

def create(
    self,
    *,
    messages: Iterable[completion_create_params.Message],
    model: str,
    stream: Literal[True],
    **kwargs
) -> Stream[ChatCompletionChunk]:
    """
    Create a streaming chat completion.
    
    Overload of create() selected when stream=True: instead of a single
    final ChatCompletion, the call returns an iterable stream of
    incremental ChatCompletionChunk objects as tokens are generated.
    
    Parameters:
    - stream: Must be True for streaming responses
    - All other parameters same as non-streaming create()
    
    Returns:
    Stream object yielding ChatCompletionChunk objects
    """

Resource Classes

Synchronous and asynchronous resource classes that provide the chat completion API methods.

class ChatResource:
    """Synchronous chat resource.

    Groups the chat API surface: the ``completions`` attribute exposes
    chat.completions.create(), and the two cached properties return
    variants of this resource for raw and streaming HTTP responses.
    """
    completions: CompletionsResource  # entry point used as client.chat.completions
    
    @cached_property
    def with_raw_response(self) -> ChatResourceWithRawResponse: ...
    
    @cached_property
    def with_streaming_response(self) -> ChatResourceWithStreamingResponse: ...

class AsyncChatResource:
    """Asynchronous chat resource.

    Async counterpart of ChatResource: ``completions`` exposes the
    awaitable chat.completions.create(), and the cached properties return
    raw- and streaming-response variants of this resource.
    """
    completions: AsyncCompletionsResource  # entry point used as client.chat.completions
    
    @cached_property
    def with_raw_response(self) -> AsyncChatResourceWithRawResponse: ...
    
    @cached_property
    def with_streaming_response(self) -> AsyncChatResourceWithStreamingResponse: ...

class CompletionsResource(SyncAPIResource):
    """Synchronous chat completions resource; hosts the blocking create() method."""
    
class AsyncCompletionsResource(AsyncAPIResource):
    """Asynchronous chat completions resource; hosts the awaitable create() method."""

Message Types

Message Structure

class Message(TypedDict):
    """Base message structure for chat completions."""
    role: Literal["system", "user", "assistant", "tool"]
    content: str
    name: NotRequired[str]  # Optional name for the message author

class SystemMessage(Message):
    """System message for setting context and instructions."""
    role: Literal["system"]
    content: str

class UserMessage(Message):
    """User message containing the user's input."""
    role: Literal["user"]
    content: str

class AssistantMessage(Message):
    """Assistant message with the AI's response."""
    role: Literal["assistant"]
    content: str
    tool_calls: NotRequired[List[ToolCall]]  # Optional tool calls

class ToolMessage(Message):
    """Tool message containing tool execution results."""
    role: Literal["tool"]
    content: str
    tool_call_id: str  # ID of the tool call this responds to

Tool Calling Types

class Tool(TypedDict):
    """Tool/function definition for function calling.

    Passed in the ``tools`` list of create(); the only tool type shown
    here is "function".
    """
    type: Literal["function"]
    function: FunctionDefinition

class FunctionDefinition(TypedDict):
    """Function definition with name, description, and parameters."""
    name: str
    description: str
    parameters: Dict[str, Any]  # JSON Schema object describing the accepted arguments

class ToolCall(TypedDict):
    """Tool call made by the assistant."""
    id: str  # identifier echoed back via ToolMessage.tool_call_id
    type: Literal["function"]
    function: FunctionCall

class FunctionCall(TypedDict):
    """Function call details."""
    name: str
    arguments: str  # JSON string of arguments; decode with json.loads()

class ToolChoice(TypedDict):
    """Tool choice configuration (force a specific function)."""
    type: Literal["function"]
    function: Dict[str, str]  # {"name": "function_name"}

Response Types

Chat Completion Response

class ChatCompletion(BaseModel):
    """Complete chat completion response.

    Returned by create() when streaming is not requested.
    """
    id: str
    choices: List[ChatCompletionChoice]  # one entry per requested completion (see the n parameter)
    created: int  # creation timestamp — presumably Unix seconds; verify against API docs
    model: str
    object: Literal["chat.completion"]
    system_fingerprint: Optional[str]
    usage: Optional[ChatCompletionUsage]  # token accounting, when reported
    time_info: Optional[ChatCompletionTimeInfo]  # server-side timing, when reported

class ChatCompletionChoice(BaseModel):
    """Individual completion choice."""
    # Why generation stopped; Optional, so it may be absent — TODO confirm when.
    finish_reason: Optional[Literal["stop", "length", "tool_calls", "content_filter"]]
    index: int  # position of this choice within ChatCompletion.choices
    logprobs: Optional[ChatCompletionLogprobs]  # presumably present only when logprobs was requested
    message: ChatCompletionMessage

class ChatCompletionMessage(BaseModel):
    """Message in the completion response."""
    content: Optional[str]  # may be None, e.g. when the reply consists of tool_calls
    role: Literal["assistant"]
    tool_calls: Optional[List[ChatCompletionMessageToolCall]]

class ChatCompletionUsage(BaseModel):
    """Token usage information."""
    completion_tokens: int
    prompt_tokens: int
    total_tokens: int
    prompt_tokens_details: Optional[ChatCompletionUsagePromptTokensDetails]

class ChatCompletionTimeInfo(BaseModel):
    """Timing information for the completion.

    Values appear to be durations in seconds — TODO confirm units.
    """
    queue_time: Optional[float]
    prompt_time: Optional[float]
    completion_time: Optional[float]
    total_time: Optional[float]

Streaming Response Types

class ChatCompletionChunk(BaseModel):
    """Streaming chunk in chat completion.

    Yielded by the stream returned from create(..., stream=True); mirrors
    ChatCompletion but carries incremental deltas instead of full messages.
    """
    id: str
    choices: List[ChatCompletionChunkChoice]
    created: int
    model: str
    object: Literal["chat.completion.chunk"]
    system_fingerprint: Optional[str]
    usage: Optional[ChatCompletionUsage]  # presumably populated only on the final chunk — TODO confirm
    time_info: Optional[ChatCompletionTimeInfo]

class ChatCompletionChunkChoice(BaseModel):
    """Choice in streaming chunk."""
    delta: ChatCompletionChunkDelta  # incremental content since the previous chunk
    finish_reason: Optional[Literal["stop", "length", "tool_calls", "content_filter"]]  # None until generation ends
    index: int
    logprobs: Optional[ChatCompletionLogprobs]

class ChatCompletionChunkDelta(BaseModel):
    """Delta information in streaming chunk."""
    content: Optional[str]  # new text fragment; may be None (e.g. a role-only delta)
    role: Optional[Literal["assistant"]]
    tool_calls: Optional[List[ChatCompletionChunkDeltaToolCall]]

Usage Examples

Basic Chat Completion

from cerebras.cloud.sdk import Cerebras

client = Cerebras()

# A system prompt steers the assistant; the user message carries the question.
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is machine learning?"},
]

completion = client.chat.completions.create(
    model="llama3.1-70b",
    messages=conversation,
    max_tokens=200,
    temperature=0.7,
)

answer = completion.choices[0].message.content
print(answer)
print(f"Used {completion.usage.total_tokens} tokens")

Streaming Chat Completion

from cerebras.cloud.sdk import Cerebras

client = Cerebras()

# With stream=True the call returns an iterable of incremental chunks
# instead of one final response.
chunks = client.chat.completions.create(
    model="llama3.1-70b",
    messages=[{"role": "user", "content": "Tell me a short story"}],
    stream=True,
    max_tokens=500,
)

print("Story: ", end="")
for piece in chunks:
    delta_text = piece.choices[0].delta.content
    if delta_text:
        print(delta_text, end="", flush=True)
print()

Function Calling

from cerebras.cloud.sdk import Cerebras
import json

client = Cerebras()

# Describe the callable tool the model is allowed to request.
weather_function = {
    "name": "get_weather",
    "description": "Get weather information for a location",
    "parameters": {
        "type": "object",
        "properties": {
            "location": {
                "type": "string",
                "description": "The city and state/country",
            },
            "unit": {
                "type": "string",
                "enum": ["celsius", "fahrenheit"],
                "description": "Temperature unit",
            },
        },
        "required": ["location"],
    },
}
tools = [{"type": "function", "function": weather_function}]

response = client.chat.completions.create(
    model="llama3.1-70b",
    messages=[
        {"role": "user", "content": "What's the weather like in San Francisco?"}
    ],
    tools=tools,
    tool_choice="auto",
)

# The model may answer directly or ask us to invoke the tool.
message = response.choices[0].message
if message.tool_calls:
    tool_call = message.tool_calls[0]
    function_name = tool_call.function.name
    function_args = json.loads(tool_call.function.arguments)
    print(f"Model wants to call {function_name} with args: {function_args}")

Async Chat Completion

import asyncio
from cerebras.cloud.sdk import AsyncCerebras

async def chat_example():
    # Use the client as an async context manager so the underlying HTTP
    # resources are released even if the request raises. (The original
    # called client.aclose(), which is the httpx method name; the SDK's
    # async client exposes close()/__aexit__ instead.)
    async with AsyncCerebras() as client:
        response = await client.chat.completions.create(
            model="llama3.1-70b",
            messages=[
                {"role": "user", "content": "Explain quantum computing"}
            ],
            max_tokens=300
        )

        print(response.choices[0].message.content)

asyncio.run(chat_example())

Multiple Completions

from cerebras.cloud.sdk import Cerebras

client = Cerebras()

# n=3 asks the API for three independent candidate completions in one call.
response = client.chat.completions.create(
    model="llama3.1-70b",
    messages=[
        {"role": "user", "content": "Write a creative opening line for a story"}
    ],
    n=3,
    max_tokens=50,
    temperature=0.9,
)

for i, choice in enumerate(response.choices):
    print(f"Option {i+1}: {choice.message.content}")

Install with Tessl CLI

npx tessl i tessl/pypi-cerebras-cloud-sdk

docs

chat-completions.md

client-management.md

index.md

legacy-completions.md

models.md

types-and-configuration.md

tile.json