
# Dedicated Endpoint Management

Dedicated endpoint management for deploying and scaling AI models on Together's infrastructure. Provides capabilities for creating, managing, and monitoring custom deployments with dedicated compute resources, autoscaling configurations, and hardware optimization.

## Capabilities

### Endpoint Listing

List all available endpoints, with optional filtering by type (dedicated or serverless).

```python { .api }
def list(type: Optional[Literal["dedicated", "serverless"]] = None) -> List[ListEndpoint]:
    """
    List all endpoints, optionally filtered by type.

    Args:
        type: Filter endpoints by type ("dedicated" or "serverless")

    Returns:
        List[ListEndpoint]: List of endpoint objects
    """
```

**Usage Example:**

```python
from together import Together

client = Together()

# List all endpoints
all_endpoints = client.endpoints.list()

# List only dedicated endpoints
dedicated_endpoints = client.endpoints.list(type="dedicated")

for endpoint in dedicated_endpoints:
    print(f"Endpoint: {endpoint.id} - Status: {endpoint.status}")
```

### Endpoint Creation

Create new dedicated endpoints with custom model deployment, hardware configuration, and autoscaling settings.

```python { .api }
def create(
    *,
    model: str,
    hardware: str,
    min_replicas: int,
    max_replicas: int,
    display_name: Optional[str] = None,
    disable_prompt_cache: bool = False,
    disable_speculative_decoding: bool = False,
    state: Literal["STARTED", "STOPPED"] = "STARTED",
    inactive_timeout: Optional[int] = None
) -> DedicatedEndpoint:
    """
    Create a new dedicated endpoint.

    Args:
        model: The model to deploy on this endpoint
        hardware: The hardware configuration to use for this endpoint
        min_replicas: The minimum number of replicas to maintain
        max_replicas: The maximum number of replicas to scale up to
        display_name: A human-readable name for the endpoint
        disable_prompt_cache: Whether to disable the prompt cache
        disable_speculative_decoding: Whether to disable speculative decoding
        state: The desired state of the endpoint ("STARTED" or "STOPPED")
        inactive_timeout: Minutes of inactivity before automatic shutdown (0 to disable)

    Returns:
        DedicatedEndpoint: Object containing endpoint information
    """
```

**Usage Example:**

```python
from together import Together

client = Together()

# Create a dedicated endpoint with autoscaling
endpoint = client.endpoints.create(
    model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
    hardware="gpu_h100_80gb",
    min_replicas=1,
    max_replicas=5,
    display_name="My Custom Llama Endpoint",
    inactive_timeout=30  # Auto-stop after 30 minutes of inactivity
)

print(f"Created endpoint: {endpoint.id}")
print(f"Status: {endpoint.status}")
print(f"Model: {endpoint.model}")
```

### Endpoint Retrieval

Get detailed information about a specific endpoint including status, configuration, and performance metrics.

```python { .api }
def get(endpoint_id: str) -> DedicatedEndpoint:
    """
    Get details of a specific endpoint.

    Args:
        endpoint_id: ID of the endpoint to retrieve

    Returns:
        DedicatedEndpoint: Object containing endpoint information
    """
```

**Usage Example:**

```python
from together import Together

client = Together()

# Get endpoint details
endpoint = client.endpoints.get("endpoint-abc123")

print(f"Endpoint ID: {endpoint.id}")
print(f"Model: {endpoint.model}")
print(f"Status: {endpoint.status}")
print(f"Hardware: {endpoint.hardware}")
print(f"Min replicas: {endpoint.autoscaling.min_replicas}")
print(f"Max replicas: {endpoint.autoscaling.max_replicas}")
```

### Endpoint Updates

Update endpoint configuration including scaling parameters, state, and display properties.

```python { .api }
def update(
    endpoint_id: str,
    *,
    min_replicas: Optional[int] = None,
    max_replicas: Optional[int] = None,
    state: Optional[Literal["STARTED", "STOPPED"]] = None,
    display_name: Optional[str] = None,
    inactive_timeout: Optional[int] = None
) -> DedicatedEndpoint:
    """
    Update an endpoint's configuration.

    Args:
        endpoint_id: ID of the endpoint to update
        min_replicas: The minimum number of replicas to maintain
        max_replicas: The maximum number of replicas to scale up to
        state: The desired state of the endpoint ("STARTED" or "STOPPED")
        display_name: A human-readable name for the endpoint
        inactive_timeout: Minutes of inactivity before automatic shutdown

    Returns:
        DedicatedEndpoint: Object containing updated endpoint information
    """
```

**Usage Example:**

```python
from together import Together

client = Together()

# Scale up an endpoint and change its state
updated_endpoint = client.endpoints.update(
    endpoint_id="endpoint-abc123",
    min_replicas=2,
    max_replicas=10,
    state="STARTED",
    display_name="High-Performance Llama Endpoint"
)

print(f"Updated endpoint: {updated_endpoint.id}")
print(f"New scaling: {updated_endpoint.autoscaling.min_replicas}-{updated_endpoint.autoscaling.max_replicas}")
```

### Endpoint Deletion

Delete dedicated endpoints to clean up resources and stop billing.

```python { .api }
def delete(endpoint_id: str) -> None:
    """
    Delete a specific endpoint.

    Args:
        endpoint_id: ID of the endpoint to delete
    """
```

**Usage Example:**

```python
from together import Together

client = Together()

# Delete an endpoint
client.endpoints.delete("endpoint-abc123")
print("Endpoint deleted successfully")
```

### Hardware Configuration Discovery

List available hardware configurations with compatibility and availability information for different models.

```python { .api }
def list_hardware(model: Optional[str] = None) -> List[HardwareWithStatus]:
    """
    List available hardware configurations.

    Args:
        model: Filter hardware configurations by model compatibility

    Returns:
        List[HardwareWithStatus]: List of hardware configurations with status
    """
```

**Usage Example:**

```python
from together import Together

client = Together()

# List all available hardware
all_hardware = client.endpoints.list_hardware()

# List hardware compatible with a specific model
compatible_hw = client.endpoints.list_hardware(
    model="meta-llama/Llama-3.2-3B-Instruct-Turbo"
)

for hw in compatible_hw:
    print(f"Hardware: {hw.name}")
    print(f" GPUs: {hw.gpu_count}x {hw.gpu_type}")
    print(f" Memory: {hw.memory_gb}GB")
    print(f" Status: {hw.status}")
    print(f" Available: {hw.available}")
```

## Types

### Core Endpoint Types

```python { .api }
class DedicatedEndpoint:
    id: str
    model: str
    hardware: str
    status: str
    display_name: Optional[str]
    autoscaling: AutoscalingConfig
    disable_prompt_cache: bool
    disable_speculative_decoding: bool
    inactive_timeout: Optional[int]
    created_at: str
    updated_at: str

class AutoscalingConfig:
    min_replicas: int
    max_replicas: int

class ListEndpoint:
    id: str
    model: str
    status: str
    type: str
    display_name: Optional[str]
    created_at: str

class HardwareWithStatus:
    name: str
    gpu_type: str
    gpu_count: int
    memory_gb: int
    status: str
    available: bool
    description: Optional[str]
```

## Asynchronous Usage

All endpoint operations support asynchronous execution through the `AsyncTogether` client:

```python
import asyncio
from together import AsyncTogether

async def manage_endpoints():
    client = AsyncTogether()

    # Create endpoint asynchronously
    endpoint = await client.endpoints.create(
        model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
        hardware="gpu_h100_80gb",
        min_replicas=1,
        max_replicas=3
    )

    # List endpoints asynchronously
    endpoints = await client.endpoints.list(type="dedicated")

    # Update endpoint asynchronously
    updated = await client.endpoints.update(
        endpoint_id=endpoint.id,
        max_replicas=5
    )

    return updated

asyncio.run(manage_endpoints())
```

## Error Handling

Endpoint operations may raise specific exceptions for various error conditions:

```python
from together import Together
from together.error import APIError, RateLimitError

client = Together()

try:
    endpoint = client.endpoints.create(
        model="invalid-model",
        hardware="gpu_h100_80gb",
        min_replicas=1,
        max_replicas=3
    )
except APIError as e:
    print(f"API Error: {e}")
except RateLimitError as e:
    print(f"Rate limit exceeded: {e}")
```