CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-together

Python client for Together's Cloud Platform providing comprehensive AI model APIs

Overview
Eval results
Files

docs/endpoints.md

Dedicated Endpoint Management

Dedicated endpoint management for deploying and scaling AI models on Together's infrastructure. Provides capabilities for creating, managing, and monitoring custom deployments with dedicated compute resources, autoscaling configurations, and hardware optimization.

Capabilities

Endpoint Listing

List all available endpoints with optional filtering by type (dedicated or serverless).

def list(type: Optional[Literal["dedicated", "serverless"]] = None) -> List[ListEndpoint]:
    """
    List all endpoints, can be filtered by type.
    
    Args:
        type: Filter endpoints by type ("dedicated" or "serverless")
    
    Returns:
        List[ListEndpoint]: List of endpoint objects
    """

Usage Example:

from together import Together

client = Together()

# List all endpoints
all_endpoints = client.endpoints.list()

# List only dedicated endpoints
dedicated_endpoints = client.endpoints.list(type="dedicated")

for endpoint in dedicated_endpoints:
    print(f"Endpoint: {endpoint.id} - Status: {endpoint.status}")

Endpoint Creation

Create new dedicated endpoints with custom model deployment, hardware configuration, and autoscaling settings.

def create(
    *,
    model: str,
    hardware: str,
    min_replicas: int,
    max_replicas: int,
    display_name: Optional[str] = None,
    disable_prompt_cache: bool = False,
    disable_speculative_decoding: bool = False,
    state: Literal["STARTED", "STOPPED"] = "STARTED",
    inactive_timeout: Optional[int] = None
) -> DedicatedEndpoint:
    """
    Create a new dedicated endpoint.
    
    Args:
        model: The model to deploy on this endpoint
        hardware: The hardware configuration to use for this endpoint
        min_replicas: The minimum number of replicas to maintain
        max_replicas: The maximum number of replicas to scale up to
        display_name: A human-readable name for the endpoint
        disable_prompt_cache: Whether to disable the prompt cache
        disable_speculative_decoding: Whether to disable speculative decoding
        state: The desired state of the endpoint ("STARTED" or "STOPPED")
        inactive_timeout: Minutes of inactivity before automatic shutdown (0 to disable)
    
    Returns:
        DedicatedEndpoint: Object containing endpoint information
    """

Usage Example:

from together import Together

client = Together()

# Create a dedicated endpoint with autoscaling
endpoint = client.endpoints.create(
    model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
    hardware="gpu_h100_80gb",
    min_replicas=1,
    max_replicas=5,
    display_name="My Custom Llama Endpoint",
    inactive_timeout=30  # Auto-stop after 30 minutes of inactivity
)

print(f"Created endpoint: {endpoint.id}")
print(f"Status: {endpoint.status}")
print(f"Model: {endpoint.model}")

Endpoint Retrieval

Get detailed information about a specific endpoint including status, configuration, and performance metrics.

def get(endpoint_id: str) -> DedicatedEndpoint:
    """
    Get details of a specific endpoint.
    
    Args:
        endpoint_id: ID of the endpoint to retrieve
    
    Returns:
        DedicatedEndpoint: Object containing endpoint information
    """

Usage Example:

from together import Together

client = Together()

# Get endpoint details
endpoint = client.endpoints.get("endpoint-abc123")

print(f"Endpoint ID: {endpoint.id}")
print(f"Model: {endpoint.model}")
print(f"Status: {endpoint.status}")
print(f"Hardware: {endpoint.hardware}")
print(f"Min replicas: {endpoint.autoscaling.min_replicas}")
print(f"Max replicas: {endpoint.autoscaling.max_replicas}")

Endpoint Updates

Update endpoint configuration including scaling parameters, state, and display properties.

def update(
    endpoint_id: str,
    *,
    min_replicas: Optional[int] = None,
    max_replicas: Optional[int] = None,
    state: Optional[Literal["STARTED", "STOPPED"]] = None,
    display_name: Optional[str] = None,
    inactive_timeout: Optional[int] = None
) -> DedicatedEndpoint:
    """
    Update an endpoint's configuration.
    
    Args:
        endpoint_id: ID of the endpoint to update
        min_replicas: The minimum number of replicas to maintain
        max_replicas: The maximum number of replicas to scale up to
        state: The desired state of the endpoint ("STARTED" or "STOPPED")
        display_name: A human-readable name for the endpoint
        inactive_timeout: Minutes of inactivity before automatic shutdown (0 to disable)
    
    Returns:
        DedicatedEndpoint: Object containing updated endpoint information
    """

Usage Example:

from together import Together

client = Together()

# Scale up an endpoint and change its state
updated_endpoint = client.endpoints.update(
    endpoint_id="endpoint-abc123",
    min_replicas=2,
    max_replicas=10,
    state="STARTED",
    display_name="High-Performance Llama Endpoint"
)

print(f"Updated endpoint: {updated_endpoint.id}")
print(f"New scaling: {updated_endpoint.autoscaling.min_replicas}-{updated_endpoint.autoscaling.max_replicas}")

Endpoint Deletion

Delete dedicated endpoints to clean up resources and stop billing.

def delete(endpoint_id: str) -> None:
    """
    Delete a specific endpoint.
    
    Args:
        endpoint_id: ID of the endpoint to delete
    """

Usage Example:

from together import Together

client = Together()

# Delete an endpoint
client.endpoints.delete("endpoint-abc123")
print("Endpoint deleted successfully")

Hardware Configuration Discovery

List available hardware configurations with compatibility and availability information for different models.

def list_hardware(model: Optional[str] = None) -> List[HardwareWithStatus]:
    """
    List available hardware configurations.
    
    Args:
        model: Filter hardware configurations by model compatibility
    
    Returns:
        List[HardwareWithStatus]: List of hardware configurations with status
    """

Usage Example:

from together import Together

client = Together()

# List all available hardware
all_hardware = client.endpoints.list_hardware()

# List hardware compatible with a specific model
compatible_hw = client.endpoints.list_hardware(
    model="meta-llama/Llama-3.2-3B-Instruct-Turbo"
)

for hw in compatible_hw:
    print(f"Hardware: {hw.name}")
    print(f"  GPUs: {hw.gpu_count}x {hw.gpu_type}")
    print(f"  Memory: {hw.memory_gb}GB")
    print(f"  Status: {hw.status}")
    print(f"  Available: {hw.available}")

Types

Core Endpoint Types

class DedicatedEndpoint:
    id: str
    model: str
    hardware: str
    status: str
    display_name: Optional[str]
    autoscaling: AutoscalingConfig
    disable_prompt_cache: bool
    disable_speculative_decoding: bool
    inactive_timeout: Optional[int]
    created_at: str
    updated_at: str

class AutoscalingConfig:
    min_replicas: int
    max_replicas: int

class ListEndpoint:
    id: str
    model: str
    status: str
    type: str
    display_name: Optional[str]
    created_at: str

class HardwareWithStatus:
    name: str
    gpu_type: str
    gpu_count: int
    memory_gb: int
    status: str
    available: bool
    description: Optional[str]

Asynchronous Usage

All endpoint operations support asynchronous execution through the AsyncTogether client:

import asyncio
from together import AsyncTogether

async def manage_endpoints():
    client = AsyncTogether()
    
    # Create endpoint asynchronously
    endpoint = await client.endpoints.create(
        model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
        hardware="gpu_h100_80gb",
        min_replicas=1,
        max_replicas=3
    )
    
    # List endpoints asynchronously
    endpoints = await client.endpoints.list(type="dedicated")
    
    # Update endpoint asynchronously
    updated = await client.endpoints.update(
        endpoint_id=endpoint.id,
        max_replicas=5
    )
    
    return updated

asyncio.run(manage_endpoints())

Error Handling

Endpoint operations may raise specific exceptions for various error conditions:

from together import Together
from together.error import APIError, RateLimitError

client = Together()

try:
    endpoint = client.endpoints.create(
        model="invalid-model",
        hardware="gpu_h100_80gb",
        min_replicas=1,
        max_replicas=3
    )
except APIError as e:
    print(f"API Error: {e}")
except RateLimitError as e:
    print(f"Rate limit exceeded: {e}")

Install with Tessl CLI

npx tessl i tessl/pypi-together

docs

audio.md

batch.md

chat-completions.md

code-interpreter.md

completions.md

embeddings.md

endpoints.md

evaluation.md

files.md

fine-tuning.md

images.md

index.md

models.md

rerank.md

tile.json