# Dedicated Endpoint Management

Dedicated endpoint management for deploying and scaling AI models on Together's infrastructure. Provides capabilities for creating, managing, and monitoring custom deployments with dedicated compute resources, autoscaling configurations, and hardware optimization.

## Capabilities

### Endpoint Listing

List all available endpoints, with optional filtering by type (dedicated or serverless).

```python { .api }
def list(type: Optional[Literal["dedicated", "serverless"]] = None) -> List[ListEndpoint]:
    """
    List all endpoints, optionally filtered by type.

    Args:
        type: Filter endpoints by type ("dedicated" or "serverless")

    Returns:
        List[ListEndpoint]: List of endpoint objects
    """
```

**Usage Example:**

```python
from together import Together

client = Together()

# List all endpoints
all_endpoints = client.endpoints.list()

# List only dedicated endpoints
dedicated_endpoints = client.endpoints.list(type="dedicated")

for endpoint in dedicated_endpoints:
    print(f"Endpoint: {endpoint.id} - Status: {endpoint.status}")
```
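When working with many endpoints, it is often handy to bucket them by status before acting on them. A minimal sketch of such a helper; it operates on plain dicts standing in for the `ListEndpoint` objects above so it runs standalone:

```python
from collections import defaultdict

def group_by_status(endpoints):
    """Group endpoint records by their status field, mapping status -> list of ids."""
    groups = defaultdict(list)
    for ep in endpoints:
        groups[ep["status"]].append(ep["id"])
    return dict(groups)

# Stand-in records mirroring the fields used in the example above
sample = [
    {"id": "endpoint-a", "status": "STARTED"},
    {"id": "endpoint-b", "status": "STOPPED"},
    {"id": "endpoint-c", "status": "STARTED"},
]
print(group_by_status(sample))
# {'STARTED': ['endpoint-a', 'endpoint-c'], 'STOPPED': ['endpoint-b']}
```

In practice you would feed it the result of `client.endpoints.list()`, reading the attributes instead of dict keys.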
### Endpoint Creation

Create new dedicated endpoints with custom model deployment, hardware configuration, and autoscaling settings.

```python { .api }
def create(
    *,
    model: str,
    hardware: str,
    min_replicas: int,
    max_replicas: int,
    display_name: Optional[str] = None,
    disable_prompt_cache: bool = False,
    disable_speculative_decoding: bool = False,
    state: Literal["STARTED", "STOPPED"] = "STARTED",
    inactive_timeout: Optional[int] = None
) -> DedicatedEndpoint:
    """
    Create a new dedicated endpoint.

    Args:
        model: The model to deploy on this endpoint
        hardware: The hardware configuration to use for this endpoint
        min_replicas: The minimum number of replicas to maintain
        max_replicas: The maximum number of replicas to scale up to
        display_name: A human-readable name for the endpoint
        disable_prompt_cache: Whether to disable the prompt cache
        disable_speculative_decoding: Whether to disable speculative decoding
        state: The desired state of the endpoint ("STARTED" or "STOPPED")
        inactive_timeout: Minutes of inactivity before automatic shutdown (0 to disable)

    Returns:
        DedicatedEndpoint: Object containing endpoint information
    """
```

**Usage Example:**

```python
from together import Together

client = Together()

# Create a dedicated endpoint with autoscaling
endpoint = client.endpoints.create(
    model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
    hardware="gpu_h100_80gb",
    min_replicas=1,
    max_replicas=5,
    display_name="My Custom Llama Endpoint",
    inactive_timeout=30  # Auto-stop after 30 minutes of inactivity
)

print(f"Created endpoint: {endpoint.id}")
print(f"Status: {endpoint.status}")
print(f"Model: {endpoint.model}")
```
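Provisioning a dedicated endpoint takes time on the server side, so scripts typically poll until the endpoint reports a ready status. A minimal polling sketch; the target status string and the idea of wrapping `client.endpoints.get(endpoint_id).status` in a callable are assumptions for illustration, demonstrated here with a stub:

```python
import time

def wait_for_status(fetch_status, target="STARTED", timeout=600, interval=5):
    """Poll fetch_status() until it returns `target`, or raise after `timeout` seconds."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        status = fetch_status()
        if status == target:
            return status
        time.sleep(interval)
    raise TimeoutError(f"endpoint did not reach {target!r} within {timeout}s")

# Stubbed status source for illustration; in practice fetch_status would be
# something like: lambda: client.endpoints.get(endpoint.id).status
statuses = iter(["PENDING", "PENDING", "STARTED"])
print(wait_for_status(lambda: next(statuses), interval=0))  # STARTED
```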
### Endpoint Retrieval

Get detailed information about a specific endpoint, including its status, hardware, and autoscaling configuration.

```python { .api }
def get(endpoint_id: str) -> DedicatedEndpoint:
    """
    Get details of a specific endpoint.

    Args:
        endpoint_id: ID of the endpoint to retrieve

    Returns:
        DedicatedEndpoint: Object containing endpoint information
    """
```

**Usage Example:**

```python
from together import Together

client = Together()

# Get endpoint details
endpoint = client.endpoints.get("endpoint-abc123")

print(f"Endpoint ID: {endpoint.id}")
print(f"Model: {endpoint.model}")
print(f"Status: {endpoint.status}")
print(f"Hardware: {endpoint.hardware}")
print(f"Min replicas: {endpoint.autoscaling.min_replicas}")
print(f"Max replicas: {endpoint.autoscaling.max_replicas}")
```
### Endpoint Updates

Update endpoint configuration including scaling parameters, state, and display properties.

```python { .api }
def update(
    endpoint_id: str,
    *,
    min_replicas: Optional[int] = None,
    max_replicas: Optional[int] = None,
    state: Optional[Literal["STARTED", "STOPPED"]] = None,
    display_name: Optional[str] = None,
    inactive_timeout: Optional[int] = None
) -> DedicatedEndpoint:
    """
    Update an endpoint's configuration.

    Args:
        endpoint_id: ID of the endpoint to update
        min_replicas: The minimum number of replicas to maintain
        max_replicas: The maximum number of replicas to scale up to
        state: The desired state of the endpoint ("STARTED" or "STOPPED")
        display_name: A human-readable name for the endpoint
        inactive_timeout: Minutes of inactivity before automatic shutdown

    Returns:
        DedicatedEndpoint: Object containing updated endpoint information
    """
```

**Usage Example:**

```python
from together import Together

client = Together()

# Scale up an endpoint and change its state
updated_endpoint = client.endpoints.update(
    endpoint_id="endpoint-abc123",
    min_replicas=2,
    max_replicas=10,
    state="STARTED",
    display_name="High-Performance Llama Endpoint"
)

print(f"Updated endpoint: {updated_endpoint.id}")
print(f"New scaling: {updated_endpoint.autoscaling.min_replicas}-{updated_endpoint.autoscaling.max_replicas}")
```
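Because every update field is optional, callers often build the keyword arguments dynamically and drop unset values, so only explicitly chosen fields are sent. A small sketch of that pattern; `build_update_kwargs` is a hypothetical helper, not part of the SDK:

```python
def build_update_kwargs(**fields):
    """Return only the fields that were explicitly set (i.e. not None)."""
    return {k: v for k, v in fields.items() if v is not None}

kwargs = build_update_kwargs(min_replicas=2, max_replicas=None, state="STOPPED")
print(kwargs)  # {'min_replicas': 2, 'state': 'STOPPED'}

# The result could then be applied with:
#   client.endpoints.update("endpoint-abc123", **kwargs)
```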
### Endpoint Deletion

Delete dedicated endpoints to clean up resources and stop billing.

```python { .api }
def delete(endpoint_id: str) -> None:
    """
    Delete a specific endpoint.

    Args:
        endpoint_id: ID of the endpoint to delete
    """
```

**Usage Example:**

```python
from together import Together

client = Together()

# Delete an endpoint
client.endpoints.delete("endpoint-abc123")
print("Endpoint deleted successfully")
```
### Hardware Configuration Discovery

List available hardware configurations with compatibility and availability information for different models.

```python { .api }
def list_hardware(model: Optional[str] = None) -> List[HardwareWithStatus]:
    """
    List available hardware configurations.

    Args:
        model: Filter hardware configurations by model compatibility

    Returns:
        List[HardwareWithStatus]: List of hardware configurations with status
    """
```

**Usage Example:**

```python
from together import Together

client = Together()

# List all available hardware
all_hardware = client.endpoints.list_hardware()

# List hardware compatible with a specific model
compatible_hw = client.endpoints.list_hardware(
    model="meta-llama/Llama-3.2-3B-Instruct-Turbo"
)

for hw in compatible_hw:
    print(f"Hardware: {hw.name}")
    print(f"  GPUs: {hw.gpu_count}x {hw.gpu_type}")
    print(f"  Memory: {hw.memory_gb}GB")
    print(f"  Status: {hw.status}")
    print(f"  Available: {hw.available}")
```
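A common follow-up is to pick the first configuration that is actually available before calling `create`. A small pure helper sketching that selection; it works on plain dicts mirroring the `HardwareWithStatus` fields so it runs standalone:

```python
def pick_available(hardware_list):
    """Return the name of the first available hardware config, or None if none are."""
    for hw in hardware_list:
        if hw["available"]:
            return hw["name"]
    return None

# Stand-in records mirroring the fields shown above
candidates = [
    {"name": "gpu_a100_80gb", "available": False},
    {"name": "gpu_h100_80gb", "available": True},
]
print(pick_available(candidates))  # gpu_h100_80gb
```

With the real SDK you would read `hw.available` and `hw.name` from the objects returned by `list_hardware`.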
## Types

### Core Endpoint Types

```python { .api }
class DedicatedEndpoint:
    id: str
    model: str
    hardware: str
    status: str
    display_name: Optional[str]
    autoscaling: AutoscalingConfig
    disable_prompt_cache: bool
    disable_speculative_decoding: bool
    inactive_timeout: Optional[int]
    created_at: str
    updated_at: str

class AutoscalingConfig:
    min_replicas: int
    max_replicas: int

class ListEndpoint:
    id: str
    model: str
    status: str
    type: str
    display_name: Optional[str]
    created_at: str

class HardwareWithStatus:
    name: str
    gpu_type: str
    gpu_count: int
    memory_gb: int
    status: str
    available: bool
    description: Optional[str]
```
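The replica bounds in `AutoscalingConfig` need to be consistent: a non-negative minimum that does not exceed the maximum. A client-side validation sketch, purely illustrative (the server performs its own validation, and the exact accepted ranges are an assumption here):

```python
def validate_autoscaling(min_replicas: int, max_replicas: int) -> None:
    """Raise ValueError if the replica bounds are inconsistent."""
    if min_replicas < 0:
        raise ValueError("min_replicas must be non-negative")
    if min_replicas > max_replicas:
        raise ValueError("min_replicas cannot exceed max_replicas")

validate_autoscaling(1, 5)  # OK, returns None
try:
    validate_autoscaling(3, 2)
except ValueError as e:
    print(e)  # min_replicas cannot exceed max_replicas
```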
## Asynchronous Usage

All endpoint operations support asynchronous execution through the `AsyncTogether` client:

```python
import asyncio
from together import AsyncTogether

async def manage_endpoints():
    client = AsyncTogether()

    # Create endpoint asynchronously
    endpoint = await client.endpoints.create(
        model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
        hardware="gpu_h100_80gb",
        min_replicas=1,
        max_replicas=3
    )

    # List endpoints asynchronously
    endpoints = await client.endpoints.list(type="dedicated")

    # Update endpoint asynchronously
    updated = await client.endpoints.update(
        endpoint_id=endpoint.id,
        max_replicas=5
    )

    return updated

asyncio.run(manage_endpoints())
```
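Beyond sequential awaits, the async client also makes it natural to run many lookups concurrently with `asyncio.gather`. The pattern is sketched below with a stub coroutine standing in for `client.endpoints.get`, so the example runs without network access:

```python
import asyncio

async def fake_get(endpoint_id):
    """Stand-in for `await client.endpoints.get(endpoint_id)`."""
    await asyncio.sleep(0)  # yield control, as a real network call would
    return {"id": endpoint_id, "status": "STARTED"}

async def fetch_all(ids):
    # Issue all lookups concurrently instead of awaiting them one by one;
    # gather preserves the input order in its results
    return await asyncio.gather(*(fake_get(i) for i in ids))

results = asyncio.run(fetch_all(["endpoint-a", "endpoint-b"]))
print([r["id"] for r in results])  # ['endpoint-a', 'endpoint-b']
```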
## Error Handling

Endpoint operations may raise specific exceptions for various error conditions:

```python
from together import Together
from together.error import APIError, RateLimitError

client = Together()

try:
    endpoint = client.endpoints.create(
        model="invalid-model",
        hardware="gpu_h100_80gb",
        min_replicas=1,
        max_replicas=3
    )
except APIError as e:
    print(f"API Error: {e}")
except RateLimitError as e:
    print(f"Rate limit exceeded: {e}")
```
```