or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

assistants.mdaudio.mdbatches.mdchat-completions.mdchatkit.mdclient-initialization.mdcompletions.mdcontainers.mdconversations.mdembeddings.mdevals.mdfiles.mdfine-tuning.mdimages.mdindex.mdmodels.mdmoderations.mdrealtime.mdresponses.mdruns.mdthreads-messages.mduploads.mdvector-stores.mdvideos.mdwebhooks.md
KNOWN_ISSUES.md

moderations.mddocs/

0

# Moderations

Check content against OpenAI's usage policies to detect potentially harmful content across multiple categories including hate speech, violence, sexual content, and self-harm. Supports both text and image inputs for multi-modal moderation.

## Capabilities

### Create Moderation

Classify text and/or image content for policy violations.

```python { .api }
def create(
    self,
    *,
    input: str | list[str] | list[ModerationMultiModalInputParam],
    model: str | ModerationModel | Omit = omit,
    extra_headers: dict[str, str] | None = None,
    extra_query: dict[str, object] | None = None,
    extra_body: dict[str, object] | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> ModerationCreateResponse:
    """
    Classify text and/or image inputs against OpenAI's usage policies.

    Args:
        input: Content to classify. Can be:
            - Single string: "Text to moderate"
            - List of strings: ["Text 1", "Text 2"]
            - List of multi-modal inputs: [{"type": "text", "text": "..."},
              {"type": "image_url", "image_url": {"url": "..."}}]
            Maximum 32,768 characters per text input.

        model: Moderation model to use. Options:
            - "text-moderation-latest": Latest text model, automatically updated
            - "text-moderation-stable": Stable text model, less frequent updates
            - "omni-moderation-latest": Latest multi-modal model (supports text + images, default)
            - "omni-moderation-2024-09-26": Specific omni model version

        extra_headers: Additional HTTP headers.
        extra_query: Additional query parameters.
        extra_body: Additional JSON fields.
        timeout: Request timeout in seconds.

    Returns:
        ModerationCreateResponse: Contains flagged status and category scores
        for each input.

    Raises:
        BadRequestError: Input exceeds maximum length
        AuthenticationError: Invalid API key
    """
```

Usage examples:

```python
from openai import OpenAI

client = OpenAI()

# Check single text
response = client.moderations.create(
    input="I want to hurt someone"
)

result = response.results[0]
print(f"Flagged: {result.flagged}")

if result.flagged:
    print("Violated categories:")
    for category, flagged in result.categories.model_dump().items():
        if flagged:
            score = getattr(result.category_scores, category)
            print(f" {category}: {score:.4f}")

# Check multiple texts
texts = [
    "Hello, how are you?",
    "This is inappropriate content",
    "What's the weather like today?"
]

response = client.moderations.create(input=texts)

for i, result in enumerate(response.results):
    print(f"Text {i + 1}: {'Flagged' if result.flagged else 'Safe'}")

# Use latest omni model
response = client.moderations.create(
    model="omni-moderation-latest",
    input="Check this message for violations"
)

# Use stable model for consistent behavior
response = client.moderations.create(
    model="text-moderation-stable",
    input="Testing moderation"
)

# Multi-modal moderation with text and images
response = client.moderations.create(
    model="omni-moderation-latest",
    input=[
        {"type": "text", "text": "Check this message"},
        {
            "type": "image_url",
            "image_url": {"url": "https://example.com/image.jpg"}
        }
    ]
)

# Moderate image from base64
import base64

with open("image.jpg", "rb") as f:
    image_data = base64.b64encode(f.read()).decode()

response = client.moderations.create(
    model="omni-moderation-latest",
    input=[
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}
        }
    ]
)

# Detailed category analysis
response = client.moderations.create(
    input="Potentially problematic text"
)

result = response.results[0]

# All categories and scores
categories = result.categories
scores = result.category_scores

print("Category Analysis:")
print(f" Hate: {scores.hate:.4f} (flagged: {categories.hate})")
print(f" Hate/Threatening: {scores.hate_threatening:.4f} (flagged: {categories.hate_threatening})")
print(f" Harassment: {scores.harassment:.4f} (flagged: {categories.harassment})")
print(f" Harassment/Threatening: {scores.harassment_threatening:.4f} (flagged: {categories.harassment_threatening})")
print(f" Self-Harm: {scores.self_harm:.4f} (flagged: {categories.self_harm})")
print(f" Self-Harm/Intent: {scores.self_harm_intent:.4f} (flagged: {categories.self_harm_intent})")
print(f" Self-Harm/Instructions: {scores.self_harm_instructions:.4f} (flagged: {categories.self_harm_instructions})")
print(f" Sexual: {scores.sexual:.4f} (flagged: {categories.sexual})")
print(f" Sexual/Minors: {scores.sexual_minors:.4f} (flagged: {categories.sexual_minors})")
print(f" Violence: {scores.violence:.4f} (flagged: {categories.violence})")
print(f" Violence/Graphic: {scores.violence_graphic:.4f} (flagged: {categories.violence_graphic})")

# Filter user content example
def is_safe_content(text: str) -> tuple[bool, list[str]]:
    """
    Check if content is safe to use.
    Returns (is_safe, violated_categories)
    """
    response = client.moderations.create(input=text)
    result = response.results[0]

    if not result.flagged:
        return True, []

    violated = [
        category for category, flagged in result.categories.model_dump().items()
        if flagged
    ]

    return False, violated

# Use in application
user_input = "Some user-generated content"
is_safe, violations = is_safe_content(user_input)

if is_safe:
    print("Content approved")
else:
    print(f"Content rejected. Violations: {', '.join(violations)}")
```

## Types

```python { .api }
from typing import Literal, Union
from typing_extensions import TypedDict
from pydantic import BaseModel

class ModerationCreateResponse(BaseModel):
    """Moderation response."""
    id: str
    model: str
    results: list[ModerationResult]

class ModerationResult(BaseModel):
    """Single moderation result."""
    flagged: bool
    categories: ModerationCategories
    category_scores: ModerationCategoryScores
    category_applied_input_types: ModerationCategoryAppliedInputTypes

class ModerationCategories(BaseModel):
    """Category flags (true if violated)."""
    hate: bool
    hate_threatening: bool
    harassment: bool
    harassment_threatening: bool
    self_harm: bool
    self_harm_intent: bool
    self_harm_instructions: bool
    sexual: bool
    sexual_minors: bool
    violence: bool
    violence_graphic: bool
    illicit: bool
    illicit_violent: bool

class ModerationCategoryScores(BaseModel):
    """Confidence scores (0-1) for each category."""
    hate: float
    hate_threatening: float
    harassment: float
    harassment_threatening: float
    self_harm: float
    self_harm_intent: float
    self_harm_instructions: float
    sexual: float
    sexual_minors: float
    violence: float
    violence_graphic: float
    illicit: float
    illicit_violent: float

class ModerationCategoryAppliedInputTypes(BaseModel):
    """Input types that triggered each category."""
    hate: list[str]
    hate_threatening: list[str]
    harassment: list[str]
    harassment_threatening: list[str]
    self_harm: list[str]
    self_harm_intent: list[str]
    self_harm_instructions: list[str]
    sexual: list[str]
    sexual_minors: list[str]
    violence: list[str]
    violence_graphic: list[str]
    illicit: list[str]
    illicit_violent: list[str]

# Model type
ModerationModel = Literal[
    "text-moderation-latest",
    "text-moderation-stable",
    "omni-moderation-latest",
    "omni-moderation-2024-09-26"
]

# Multi-modal input types
class ModerationTextInputParam(TypedDict):
    """Text input for moderation."""
    text: str  # Required: Text content to moderate
    type: Literal["text"]  # Required: Always "text"

class ImageURL(TypedDict):
    """Image URL or base64 data."""
    url: str  # Required: URL or data:image/...;base64,... string

class ModerationImageURLInputParam(TypedDict):
    """Image input for moderation."""
    image_url: ImageURL  # Required: Image URL or base64 data
    type: Literal["image_url"]  # Required: Always "image_url"

# Union type for multi-modal inputs
ModerationMultiModalInputParam = Union[
    ModerationTextInputParam,
    ModerationImageURLInputParam
]
```

## Category Descriptions

| Category | Description |
|----------|-------------|
| hate | Content expressing, inciting, or promoting hate based on protected characteristics |
| hate/threatening | Hateful content that also includes violence or serious harm |
| harassment | Content harassing, bullying, or abusing an individual |
| harassment/threatening | Harassing content that also includes violence or serious harm |
| self-harm | Content promoting, encouraging, or depicting acts of self-harm |
| self-harm/intent | Content indicating intent to engage in self-harm |
| self-harm/instructions | Content providing instructions or advice for self-harm |
| sexual | Content meant to arouse sexual excitement |
| sexual/minors | Sexual content involving individuals under 18 |
| violence | Content depicting death, violence, or physical injury |
| violence/graphic | Graphic violent content with extreme detail |
| illicit | Content promoting illicit substances or illegal activities |
| illicit/violent | Illicit content involving violence |

## Best Practices

```python
from openai import OpenAI

client = OpenAI()

# 1. Moderate user-generated content before processing
def moderate_before_processing(user_text: str):
    response = client.moderations.create(input=user_text)

    if response.results[0].flagged:
        return None, "Content violates policies"

    # Safe to process
    return process_safe_content(user_text), None

# 2. Batch moderation for efficiency
user_messages = ["msg1", "msg2", "msg3"]
response = client.moderations.create(input=user_messages)

safe_messages = [
    msg for msg, result in zip(user_messages, response.results)
    if not result.flagged
]

# 3. Log violations for analysis
for i, result in enumerate(response.results):
    if result.flagged:
        violated_categories = [
            cat for cat, flagged in result.categories.model_dump().items()
            if flagged
        ]
        log_violation(user_messages[i], violated_categories)

# 4. Use thresholds for borderline content
def is_definitely_safe(text: str, threshold: float = 0.5) -> bool:
    response = client.moderations.create(input=text)
    result = response.results[0]

    # Check if any score exceeds threshold
    scores = result.category_scores.model_dump()
    return all(score < threshold for score in scores.values())
```

## Async Usage

```python
import asyncio
from openai import AsyncOpenAI

async def moderate_content(text: str):
    client = AsyncOpenAI()

    response = await client.moderations.create(input=text)
    return response.results[0].flagged

# Run async
is_flagged = asyncio.run(moderate_content("Check this text"))
```