Python client for Together's Cloud Platform providing comprehensive AI model APIs
Dedicated endpoint management for deploying and scaling AI models on Together's infrastructure. Provides capabilities for creating, managing, and monitoring custom deployments with dedicated compute resources, autoscaling configurations, and hardware optimization.
List all available endpoints with optional filtering by type (dedicated or serverless).
def list(type: Optional[Literal["dedicated", "serverless"]] = None) -> List[ListEndpoint]:
"""
List all endpoints; results can optionally be filtered by type.
Args:
type: Filter endpoints by type ("dedicated" or "serverless")
Returns:
List[ListEndpoint]: List of endpoint objects
"""Usage Example:
from together import Together
client = Together()
# List all endpoints
all_endpoints = client.endpoints.list()
# List only dedicated endpoints
dedicated_endpoints = client.endpoints.list(type="dedicated")
for endpoint in dedicated_endpoints:
print(f"Endpoint: {endpoint.id} - Status: {endpoint.status}")

Create new dedicated endpoints with custom model deployment, hardware configuration, and autoscaling settings.
def create(
*,
model: str,
hardware: str,
min_replicas: int,
max_replicas: int,
display_name: Optional[str] = None,
disable_prompt_cache: bool = False,
disable_speculative_decoding: bool = False,
state: Literal["STARTED", "STOPPED"] = "STARTED",
inactive_timeout: Optional[int] = None
) -> DedicatedEndpoint:
"""
Create a new dedicated endpoint.
Args:
model: The model to deploy on this endpoint
hardware: The hardware configuration to use for this endpoint
min_replicas: The minimum number of replicas to maintain
max_replicas: The maximum number of replicas to scale up to
display_name: A human-readable name for the endpoint
disable_prompt_cache: Whether to disable the prompt cache
disable_speculative_decoding: Whether to disable speculative decoding
state: The desired state of the endpoint ("STARTED" or "STOPPED")
inactive_timeout: Minutes of inactivity before automatic shutdown (0 to disable)
Returns:
DedicatedEndpoint: Object containing endpoint information
"""Usage Example:
from together import Together
client = Together()
# Create a dedicated endpoint with autoscaling
endpoint = client.endpoints.create(
model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
hardware="gpu_h100_80gb",
min_replicas=1,
max_replicas=5,
display_name="My Custom Llama Endpoint",
inactive_timeout=30 # Auto-stop after 30 minutes of inactivity
)
print(f"Created endpoint: {endpoint.id}")
print(f"Status: {endpoint.status}")
print(f"Model: {endpoint.model}")

Get detailed information about a specific endpoint including status, configuration, and performance metrics.
def get(endpoint_id: str) -> DedicatedEndpoint:
"""
Get details of a specific endpoint.
Args:
endpoint_id: ID of the endpoint to retrieve
Returns:
DedicatedEndpoint: Object containing endpoint information
"""Usage Example:
from together import Together
client = Together()
# Get endpoint details
endpoint = client.endpoints.get("endpoint-abc123")
print(f"Endpoint ID: {endpoint.id}")
print(f"Model: {endpoint.model}")
print(f"Status: {endpoint.status}")
print(f"Hardware: {endpoint.hardware}")
print(f"Min replicas: {endpoint.autoscaling.min_replicas}")
print(f"Max replicas: {endpoint.autoscaling.max_replicas}")

Update endpoint configuration including scaling parameters, state, and display properties.
def update(
endpoint_id: str,
*,
min_replicas: Optional[int] = None,
max_replicas: Optional[int] = None,
state: Optional[Literal["STARTED", "STOPPED"]] = None,
display_name: Optional[str] = None,
inactive_timeout: Optional[int] = None
) -> DedicatedEndpoint:
"""
Update an endpoint's configuration.
Args:
endpoint_id: ID of the endpoint to update
min_replicas: The minimum number of replicas to maintain
max_replicas: The maximum number of replicas to scale up to
state: The desired state of the endpoint ("STARTED" or "STOPPED")
display_name: A human-readable name for the endpoint
inactive_timeout: Minutes of inactivity before automatic shutdown
Returns:
DedicatedEndpoint: Object containing updated endpoint information
"""Usage Example:
from together import Together
client = Together()
# Scale up an endpoint and change its state
updated_endpoint = client.endpoints.update(
endpoint_id="endpoint-abc123",
min_replicas=2,
max_replicas=10,
state="STARTED",
display_name="High-Performance Llama Endpoint"
)
print(f"Updated endpoint: {updated_endpoint.id}")
print(f"New scaling: {updated_endpoint.autoscaling.min_replicas}-{updated_endpoint.autoscaling.max_replicas}")

Delete dedicated endpoints to clean up resources and stop billing.
def delete(endpoint_id: str) -> None:
"""
Delete a specific endpoint.
Args:
endpoint_id: ID of the endpoint to delete
"""Usage Example:
from together import Together
client = Together()
# Delete an endpoint
client.endpoints.delete("endpoint-abc123")
print("Endpoint deleted successfully")

List available hardware configurations with compatibility and availability information for different models.
def list_hardware(model: Optional[str] = None) -> List[HardwareWithStatus]:
"""
List available hardware configurations.
Args:
model: Filter hardware configurations by model compatibility
Returns:
List[HardwareWithStatus]: List of hardware configurations with status
"""Usage Example:
from together import Together
client = Together()
# List all available hardware
all_hardware = client.endpoints.list_hardware()
# List hardware compatible with a specific model
compatible_hw = client.endpoints.list_hardware(
model="meta-llama/Llama-3.2-3B-Instruct-Turbo"
)
for hw in compatible_hw:
print(f"Hardware: {hw.name}")
print(f" GPUs: {hw.gpu_count}x {hw.gpu_type}")
print(f" Memory: {hw.memory_gb}GB")
print(f" Status: {hw.status}")
print(f" Available: {hw.available}")

class DedicatedEndpoint:
id: str
model: str
hardware: str
status: str
display_name: Optional[str]
autoscaling: AutoscalingConfig
disable_prompt_cache: bool
disable_speculative_decoding: bool
inactive_timeout: Optional[int]
created_at: str
updated_at: str
class AutoscalingConfig:
min_replicas: int
max_replicas: int
class ListEndpoint:
id: str
model: str
status: str
type: str
display_name: Optional[str]
created_at: str
class HardwareWithStatus:
name: str
gpu_type: str
gpu_count: int
memory_gb: int
status: str
available: bool
description: Optional[str]

All endpoint operations support asynchronous execution through the AsyncTogether client:
import asyncio
from together import AsyncTogether
async def manage_endpoints():
client = AsyncTogether()
# Create endpoint asynchronously
endpoint = await client.endpoints.create(
model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
hardware="gpu_h100_80gb",
min_replicas=1,
max_replicas=3
)
# List endpoints asynchronously
endpoints = await client.endpoints.list(type="dedicated")
# Update endpoint asynchronously
updated = await client.endpoints.update(
endpoint_id=endpoint.id,
max_replicas=5
)
return updated
asyncio.run(manage_endpoints())

Endpoint operations may raise specific exceptions for various error conditions:
from together import Together
from together.error import APIError, RateLimitError
client = Together()
try:
endpoint = client.endpoints.create(
model="invalid-model",
hardware="gpu_h100_80gb",
min_replicas=1,
max_replicas=3
)
except APIError as e:
print(f"API Error: {e}")
except RateLimitError as e:
print(f"Rate limit exceeded: {e}")

Install with Tessl CLI
npx tessl i tessl/pypi-together