# The official Python library for the Cerebras API
```bash
npx @tessl/cli install tessl/pypi-cerebras-cloud-sdk@1.50.0
```

The official Python library for the Cerebras Cloud API, providing access to Cerebras' Wafer-Scale Engine-3 (WSE-3) powered AI inference capabilities. The SDK offers both synchronous and asynchronous clients with comprehensive type definitions, streaming support, and built-in retry mechanisms for high-throughput AI inference workloads.
## Installation

```bash
pip install cerebras_cloud_sdk
```

## Imports

```python
import cerebras.cloud.sdk as cerebras
```

Most common imports:
```python
from cerebras.cloud.sdk import Cerebras, AsyncCerebras
```

For type annotations:
```python
from cerebras.cloud.sdk.types.chat import ChatCompletion, CompletionCreateParams
from cerebras.cloud.sdk import types
```

Complete import options:
```python
# Main client classes
from cerebras.cloud.sdk import Cerebras, AsyncCerebras, Client, AsyncClient

# Core types and utilities
from cerebras.cloud.sdk import BaseModel, NOT_GIVEN, NotGiven, Omit, NoneType
from cerebras.cloud.sdk import Timeout, RequestOptions, Transport, ProxiesTypes

# Streaming classes
from cerebras.cloud.sdk import Stream, AsyncStream

# Response wrappers
from cerebras.cloud.sdk import APIResponse, AsyncAPIResponse

# Exception handling
from cerebras.cloud.sdk import (
    CerebrasError, APIError, APIStatusError, APITimeoutError,
    APIConnectionError, APIResponseValidationError,
    BadRequestError, AuthenticationError, PermissionDeniedError,
    NotFoundError, ConflictError, UnprocessableEntityError,
    RateLimitError, InternalServerError,
)

# Configuration constants
from cerebras.cloud.sdk import DEFAULT_TIMEOUT, DEFAULT_MAX_RETRIES, DEFAULT_CONNECTION_LIMITS

# HTTP clients
from cerebras.cloud.sdk import DefaultHttpxClient, DefaultAsyncHttpxClient, DefaultAioHttpClient

# Utility functions
from cerebras.cloud.sdk import file_from_path

# Direct resources access (alternative)
from cerebras.cloud.sdk import resources
```

## Quick Start

```python
import os
from cerebras.cloud.sdk import Cerebras
# Initialize client (API key from CEREBRAS_API_KEY env var)
client = Cerebras(api_key=os.getenv("CEREBRAS_API_KEY"))
# Simple chat completion
response = client.chat.completions.create(
    model="llama3.1-70b",
    messages=[
        {"role": "user", "content": "What is machine learning?"}
    ],
    max_tokens=100,
)
print(response.choices[0].message.content)
# Async usage
import asyncio
from cerebras.cloud.sdk import AsyncCerebras
async def main():
    client = AsyncCerebras()
    response = await client.chat.completions.create(
        model="llama3.1-70b",
        messages=[{"role": "user", "content": "Hello!"}],
        max_tokens=50,
    )
    print(response.choices[0].message.content)
asyncio.run(main())
```

## Architecture

The SDK follows a resource-based architecture:
- `Cerebras` (sync) and `AsyncCerebras` (async) as the main entry points
- Resources exposed as client attributes (`chat`, `completions`, `models`)
- Streaming handled by `Stream` and `AsyncStream`

This design enables both simple usage patterns and advanced customization while maintaining full type safety and async/await compatibility.
## Client Configuration

Client initialization, configuration, and authentication for both synchronous and asynchronous usage patterns. Supports environment variable configuration, custom timeouts, retry policies, and HTTP client customization.
```python
class Cerebras:
    def __init__(
        self,
        *,
        api_key: str | None = None,
        base_url: str | httpx.URL | None = None,
        timeout: Union[float, Timeout, None, NotGiven] = NOT_GIVEN,
        max_retries: int = DEFAULT_MAX_RETRIES,
        default_headers: Mapping[str, str] | None = None,
        default_query: Mapping[str, object] | None = None,
        http_client: httpx.Client | None = None,
        _strict_response_validation: bool = False,
        warm_tcp_connection: bool = True,
    ) -> None: ...

class AsyncCerebras:
    def __init__(
        self,
        *,
        api_key: str | None = None,
        base_url: str | httpx.URL | None = None,
        timeout: Union[float, Timeout, None, NotGiven] = NOT_GIVEN,
        max_retries: int = DEFAULT_MAX_RETRIES,
        default_headers: Mapping[str, str] | None = None,
        default_query: Mapping[str, object] | None = None,
        http_client: httpx.AsyncClient | None = None,
        _strict_response_validation: bool = False,
        warm_tcp_connection: bool = True,
    ) -> None: ...
```
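For example, a minimal configuration sketch using the constructor parameters shown above (the timeout and retry values are illustrative, and the custom header is hypothetical):

```python
import os
from cerebras.cloud.sdk import Cerebras, Timeout

client = Cerebras(
    api_key=os.getenv("CEREBRAS_API_KEY"),  # the client also reads CEREBRAS_API_KEY when api_key is omitted
    timeout=Timeout(60.0, connect=5.0),     # illustrative: 60s overall, 5s connect
    max_retries=3,                          # illustrative retry budget
    default_headers={"X-Example-Header": "docs"},  # hypothetical header
)
```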
## Chat Completions

Modern chat completion API for conversational AI applications. Supports system messages, user messages, assistant messages, streaming responses, function calling, and comprehensive response metadata including token usage and timing information.

```python
def create(
    self,
    *,
    messages: Iterable[completion_create_params.Message],
    model: str,
    max_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN,
    max_tokens: Optional[int] | NotGiven = NOT_GIVEN,
    min_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN,
    parallel_tool_calls: Optional[bool] | NotGiven = NOT_GIVEN,
    reasoning_effort: Optional[Literal["low", "medium", "high"]] | NotGiven = NOT_GIVEN,
    service_tier: Optional[Literal["auto", "default"]] | NotGiven = NOT_GIVEN,
    temperature: Optional[float] | NotGiven = NOT_GIVEN,
    tool_choice: Optional[completion_create_params.ToolChoice] | NotGiven = NOT_GIVEN,
    tools: Optional[Iterable[completion_create_params.Tool]] | NotGiven = NOT_GIVEN,
    # ... additional parameters including cf_ray, x_amz_cf_id, extra_headers, etc.
) -> ChatCompletion | Stream[ChatCompletion]: ...
```
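A hedged sketch of function calling, assuming the OpenAI-compatible tool schema that the chat API accepts (`get_weather` is a hypothetical tool):

```python
from cerebras.cloud.sdk import Cerebras

client = Cerebras()
response = client.chat.completions.create(
    model="llama3.1-70b",
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_weather",  # hypothetical tool name
                "description": "Look up the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ],
    tool_choice="auto",
)
# When the model elects to call the tool, the call arrives on the message
print(response.choices[0].message.tool_calls)
```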
## Models

Model listing and information retrieval for discovering available models and their capabilities. Provides access to model metadata, supported features, and configuration options.

```python
def list(
    self,
    # ... request options omitted
) -> ModelListResponse: ...

def retrieve(
    self,
    model_id: str,
    # ... request options omitted
) -> ModelRetrieveResponse: ...
```
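A short usage sketch; it assumes the list response carries models under a `data` attribute with an `id` per entry, as in OpenAI-style list payloads:

```python
from cerebras.cloud.sdk import Cerebras

client = Cerebras()

models = client.models.list()
for model in models.data:  # assumption: OpenAI-style list payload
    print(model.id)

info = client.models.retrieve("llama3.1-70b")
print(info.id)
```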
## Completions (Legacy)

Legacy text completion API for traditional completion-style interactions. Supports text generation with various parameters including temperature, top-p sampling, frequency penalties, and custom stop sequences.

```python
def create(
    self,
    *,
    model: str,
    best_of: Optional[int] | NotGiven = NOT_GIVEN,
    echo: Optional[bool] | NotGiven = NOT_GIVEN,
    frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN,
    logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN,
    logprobs: Optional[int] | NotGiven = NOT_GIVEN,
    max_tokens: Optional[int] | NotGiven = NOT_GIVEN,
    n: Optional[int] | NotGiven = NOT_GIVEN,
    presence_penalty: Optional[float] | NotGiven = NOT_GIVEN,
    prompt: Union[str, List[str], List[int], List[List[int]], None] | NotGiven = NOT_GIVEN,
    seed: Optional[int] | NotGiven = NOT_GIVEN,
    stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN,
    stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN,
    stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN,
    suffix: Optional[str] | NotGiven = NOT_GIVEN,
    temperature: Optional[float] | NotGiven = NOT_GIVEN,
    top_p: Optional[float] | NotGiven = NOT_GIVEN,
    user: str | NotGiven = NOT_GIVEN,
    **kwargs,
) -> Completion: ...
```
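A brief sketch of the legacy API in use; parameter values are illustrative, and the `choices[0].text` access assumes an OpenAI-style completion payload:

```python
from cerebras.cloud.sdk import Cerebras

client = Cerebras()
completion = client.completions.create(
    model="llama3.1-70b",
    prompt="Machine learning is",
    max_tokens=50,
    temperature=0.7,
    stop=["\n\n"],
)
print(completion.choices[0].text)  # assumption: OpenAI-style text completion payload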
## Types, Errors, and Utilities

Comprehensive type system, exception handling, and configuration utilities. Includes Pydantic models for all API responses, TypedDict parameter classes, a complete exception hierarchy, and utility functions for file handling and configuration.

```python
# Core types
class BaseModel: ...
class NotGiven: ...
NOT_GIVEN: NotGiven

# Exception hierarchy
class CerebrasError(Exception): ...
class APIError(CerebrasError): ...
class APIStatusError(APIError): ...
class BadRequestError(APIStatusError): ...
class AuthenticationError(APIStatusError): ...
class RateLimitError(APIStatusError): ...

# Configuration types
Timeout: TypeAlias
Transport: TypeAlias
ProxiesTypes: TypeAlias
RequestOptions: TypeAlias

# Streaming classes
class Stream: ...
class AsyncStream: ...

# Response wrappers
class APIResponse: ...
class AsyncAPIResponse: ...
```
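One way to map the exception hierarchy above onto request handling (a sketch; the backoff strategy is up to the caller, and the `status_code` attribute on status errors is an assumption based on the usual shape of these exceptions):

```python
from cerebras.cloud.sdk import (
    Cerebras, APIConnectionError, APIStatusError, RateLimitError,
)

client = Cerebras()
try:
    response = client.chat.completions.create(
        model="llama3.1-70b",
        messages=[{"role": "user", "content": "Hello!"}],
    )
except RateLimitError:
    pass  # back off and retry later
except APIConnectionError:
    pass  # network-level failure; usually safe to retry
except APIStatusError as err:
    print(err.status_code)  # assumption: status errors expose status_code
```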
## Response Objects

All API methods return structured response objects with consistent patterns:

- `ChatCompletion` objects with choices, usage metadata, and timing information
- `Completion` objects with generated text and token information
- `ModelListResponse` and `ModelRetrieveResponse` with model metadata

## Streaming

Both chat and legacy completions support streaming responses for real-time token generation:
```python
# Streaming chat completion
stream = client.chat.completions.create(
    model="llama3.1-70b",
    messages=[{"role": "user", "content": "Tell me a story"}],
    stream=True,
)
for chunk in stream:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
```
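The async client mirrors this pattern with `AsyncStream` and `async for`; a sketch under the same assumptions as the synchronous example:

```python
import asyncio
from cerebras.cloud.sdk import AsyncCerebras

async def main():
    client = AsyncCerebras()
    stream = await client.chat.completions.create(
        model="llama3.1-70b",
        messages=[{"role": "user", "content": "Tell me a story"}],
        stream=True,
    )
    async for chunk in stream:  # AsyncStream supports async iteration
        if chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="")

asyncio.run(main())
```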
## Alternative Resource Access
The SDK provides an alternative way to access resources directly through the `resources` module:
```python
from cerebras.cloud.sdk import resources
# Direct resource access (not bound to a client instance)
# Note: Still requires a configured client context
```
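A rough, hedged sketch of how such direct access typically looks; the `CompletionsResource` class name and its constructor are assumptions, so check `cerebras.cloud.sdk.resources` for the actual names in your SDK version:

```python
from cerebras.cloud.sdk import Cerebras, resources

client = Cerebras()
# Assumption: resource classes wrap an existing, configured client instance
chat_completions = resources.chat.CompletionsResource(client)  # class name is an assumption
response = chat_completions.create(
    model="llama3.1-70b",
    messages=[{"role": "user", "content": "Hello!"}],
)
```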