The official Python client library for Ollama, providing both synchronous and asynchronous interfaces for text generation, chat, embeddings, and model management.
—
Complete synchronous and asynchronous client classes providing the full Ollama API with configurable hosts, custom headers, timeouts, and comprehensive error handling.
Type Imports: The signatures in this documentation use these typing imports:
from typing import Union, Sequence, Mapping, Callable, Literal, Any, Iterator
from pydantic.json_schema import JsonSchemaValue
—
Synchronous HTTP client for Ollama API operations with configurable connection settings.
class Client:
def __init__(
self,
host: str = None,
*,
follow_redirects: bool = True,
timeout: Any = None,
headers: dict[str, str] = None,
**kwargs
):
"""
Create a synchronous Ollama client.
Parameters:
- host (str, optional): Ollama server host URL. Defaults to OLLAMA_HOST env var or localhost:11434
- follow_redirects (bool): Whether to follow HTTP redirects. Default: True
- timeout: Request timeout configuration
- headers (dict): Custom HTTP headers
- **kwargs: Additional httpx client arguments
"""Generate text completions from prompts with extensive configuration options.
def generate(
self,
model: str = '',
prompt: str = '',
suffix: str = None,
*,
system: str = None,
template: str = None,
context: Sequence[int] = None,
stream: bool = False,
think: bool = None,
raw: bool = None,
format: str = None,
images: Sequence[Union[str, bytes, Image]] = None,
options: Union[Mapping[str, Any], Options] = None,
keep_alive: Union[float, str] = None
) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
"""
Generate text from a prompt.
Parameters:
- model (str): Model name to use for generation. Default: ''
- prompt (str): Text prompt for generation. Default: ''
- suffix (str, optional): Text to append after generation
- system (str, optional): System message to set context
- template (str, optional): Custom prompt template
- context (list[int], optional): Token context from previous generation
- stream (bool): Return streaming responses. Default: False
- think (bool): Enable thinking mode for reasoning models
- raw (bool): Use raw mode (no template processing)
- format (str, optional): Response format ('json', etc.)
- images (list[str | bytes | Image], optional): Images for multimodal models
- options (Options, optional): Model configuration options
- keep_alive (str, optional): Keep model loaded duration
Returns:
GenerateResponse or Iterator[GenerateResponse] if streaming
"""Conduct multi-turn conversations with context preservation and tool calling support.
def chat(
self,
model: str = '',
messages: Sequence[Union[Mapping[str, Any], Message]] = None,
*,
tools: Sequence[Union[Mapping[str, Any], Tool, Callable]] = None,
stream: bool = False,
think: Union[bool, Literal['low', 'medium', 'high']] = None,
format: Union[Literal['', 'json'], JsonSchemaValue] = None,
options: Union[Mapping[str, Any], Options] = None,
keep_alive: Union[float, str] = None
) -> Union[ChatResponse, Iterator[ChatResponse]]:
"""
Chat with a model using conversation history.
Parameters:
- model (str): Model name to use for chat. Default: ''
- messages (Sequence[Union[Mapping, Message]], optional): Conversation messages. Default: None
- tools (Sequence[Union[Mapping, Tool, Callable]], optional): Available tools for function calling
- stream (bool): Return streaming responses. Default: False
- think (Union[bool, Literal['low', 'medium', 'high']], optional): Enable thinking mode for reasoning models
- format (Literal['', 'json'] | JsonSchemaValue, optional): Response format ('', 'json', or a JSON schema)
- options (Options, optional): Model configuration options
- keep_alive (str, optional): Keep model loaded duration
Returns:
ChatResponse or Iterator[ChatResponse] if streaming
"""Generate vector embeddings from text inputs for semantic similarity and search applications.
def embed(
self,
model: str = '',
input: Union[str, Sequence[str]] = '',
truncate: bool = None,
options: Options = None,
keep_alive: str = None
) -> EmbedResponse:
"""
Generate embeddings for input text(s).
Parameters:
- model (str): Embedding model name
- input (str | list[str]): Text or list of texts to embed
- truncate (bool, optional): Truncate inputs that exceed model limits
- options (Options, optional): Model configuration options
- keep_alive (str, optional): Keep model loaded duration
Returns:
EmbedResponse containing embedding vectors
"""
def embeddings(
self,
model: str,
prompt: str,
options: Options = None,
keep_alive: str = None
) -> EmbeddingsResponse:
"""
Generate embeddings (deprecated - use embed instead).
Parameters:
- model (str): Embedding model name
- prompt (str): Text to embed
- options (Options, optional): Model configuration options
- keep_alive (str, optional): Keep model loaded duration
Returns:
EmbeddingsResponse containing single embedding vector
"""Download, upload, create, and manage Ollama models with progress tracking.
def pull(
self,
model: str,
*,
insecure: bool = False,
stream: bool = False
) -> ProgressResponse | Iterator[ProgressResponse]:
"""
Download a model from a model library.
Parameters:
- model (str): Model name to download
- insecure (bool): Allow insecure connections. Default: False
- stream (bool): Return streaming progress. Default: False
Returns:
ProgressResponse or Iterator[ProgressResponse] if streaming
"""
def push(
self,
model: str,
*,
insecure: bool = False,
stream: bool = False
) -> ProgressResponse | Iterator[ProgressResponse]:
"""
Upload a model to a model library.
Parameters:
- model (str): Model name to upload
- insecure (bool): Allow insecure connections. Default: False
- stream (bool): Return streaming progress. Default: False
Returns:
ProgressResponse or Iterator[ProgressResponse] if streaming
"""
def create(
self,
model: str,
quantize: str = None,
from_: str = None,
files: dict = None,
adapters: dict[str, str] = None,
template: str = None,
license: Union[str, list[str]] = None,
system: str = None,
parameters: dict = None,
messages: list[Message] = None,
*,
stream: bool = False
) -> ProgressResponse | Iterator[ProgressResponse]:
"""
Create a new model from a Modelfile.
Parameters:
- model (str): Name for the new model
- quantize (str, optional): Quantization method
- from_ (str, optional): Base model to inherit from
- files (dict, optional): Additional files to include
- adapters (dict[str, str], optional): Model adapters to apply
- template (str, optional): Prompt template
- license (str | list[str], optional): Model license
- system (str, optional): System message template
- parameters (dict, optional): Model parameters
- messages (list[Message], optional): Example messages
- stream (bool): Return streaming progress. Default: False
Returns:
ProgressResponse or Iterator[ProgressResponse] if streaming
"""
def create_blob(
self,
path: Union[str, Path]
) -> str:
"""
Create a blob from a file for model creation.
Parameters:
- path (str | Path): Path to file to create blob from
Returns:
str: Blob digest hash
"""
def delete(
self,
model: str
) -> StatusResponse:
"""
Delete a model.
Parameters:
- model (str): Name of model to delete
Returns:
StatusResponse with deletion status
"""
def copy(
self,
source: str,
destination: str
) -> StatusResponse:
"""
Copy a model.
Parameters:
- source (str): Source model name
- destination (str): Destination model name
Returns:
StatusResponse with copy status
"""Retrieve information about available and running models.
def list(
self
) -> ListResponse:
"""
List available models.
Returns:
ListResponse containing model information
"""
def show(
self,
model: str
) -> ShowResponse:
"""
Show information about a specific model.
Parameters:
- model (str): Model name to show information for
Returns:
ShowResponse with detailed model information
"""
def ps(
self
) -> ProcessResponse:
"""
List running models and their resource usage.
Returns:
ProcessResponse with currently running models
"""Asynchronous HTTP client for Ollama API operations with the same interface as Client but using async/await patterns.
class AsyncClient:
def __init__(
self,
host: str = None,
*,
follow_redirects: bool = True,
timeout: Any = None,
headers: dict[str, str] = None,
**kwargs
):
"""
Create an asynchronous Ollama client.
Parameters: Same as Client class
"""
async def generate(self, model: str = '', prompt: str = '', **kwargs):
"""Async version of Client.generate()"""
async def chat(self, model: str = '', messages: Sequence[Union[Mapping, Message]] = None, **kwargs):
"""Async version of Client.chat()"""
async def embed(self, model: str = '', input: Union[str, Sequence[str]] = '', **kwargs):
"""Async version of Client.embed()"""
async def embeddings(self, model: str, prompt: str, **kwargs):
"""Async version of Client.embeddings() (deprecated)"""
async def pull(self, model: str, **kwargs):
"""Async version of Client.pull()"""
async def push(self, model: str, **kwargs):
"""Async version of Client.push()"""
async def create(self, model: str, **kwargs):
"""Async version of Client.create()"""
async def create_blob(self, path: Union[str, Path]) -> str:
"""Async version of Client.create_blob()"""
async def delete(self, model: str) -> StatusResponse:
"""Async version of Client.delete()"""
async def copy(self, source: str, destination: str) -> StatusResponse:
"""Async version of Client.copy()"""
async def list(self) -> ListResponse:
"""Async version of Client.list()"""
async def show(self, model: str) -> ShowResponse:
"""Async version of Client.show()"""
async def ps(self) -> ProcessResponse:
"""Async version of Client.ps()"""from ollama import Client
import httpx
# Custom client with authentication
client = Client(
host='https://my-ollama-server.com',
headers={'Authorization': 'Bearer token'},
timeout=httpx.Timeout(30.0)
)
# Generate with custom client
response = client.generate(
model='custom-model',
prompt='Hello, world!',
options={'temperature': 0.7}
)
—
from ollama import Client
client = Client()
# Stream text generation
print("Generating story...")
for chunk in client.generate(
model='llama3.2',
prompt='Write a short story about a robot',
stream=True
):
if chunk.get('response'):
print(chunk['response'], end='', flush=True)
print("\n\nPulling model...")
# Stream model download progress
for progress in client.pull('phi3', stream=True):
if progress.get('completed') and progress.get('total'):
percent = (progress['completed'] / progress['total']) * 100
print(f"Progress: {percent:.1f}%")import asyncio
from ollama import AsyncClient
async def main():
async with AsyncClient() as client:
# Concurrent requests
tasks = [
client.generate(model='llama3.2', prompt=f'Story {i}')
for i in range(3)
]
responses = await asyncio.gather(*tasks)
for i, response in enumerate(responses):
print(f"Story {i}: {response['response'][:100]}...")
asyncio.run(main())
—
Install with Tessl CLI
npx tessl i tessl/pypi-ollama