# Moderations

Check content against OpenAI's usage policies to detect potentially harmful content across multiple categories including hate speech, violence, sexual content, and self-harm. Supports both text and image inputs for multi-modal moderation.

## Capabilities

### Create Moderation

Classify text and/or image content for policy violations.

```python { .api }
def create(
    self,
    *,
    input: str | list[str] | list[ModerationMultiModalInputParam],
    model: str | ModerationModel | Omit = omit,
    extra_headers: dict[str, str] | None = None,
    extra_query: dict[str, object] | None = None,
    extra_body: dict[str, object] | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> ModerationCreateResponse:
    """
    Classify text and/or image inputs against OpenAI's usage policies.

    Args:
        input: Content to classify. Can be:
            - Single string: "Text to moderate"
            - List of strings: ["Text 1", "Text 2"]
            - List of multi-modal inputs: [{"type": "text", "text": "..."},
              {"type": "image_url", "image_url": {"url": "..."}}]
            Maximum 32,768 characters per text input.

        model: Moderation model to use. Options:
            - "text-moderation-latest": Latest text model, automatically updated
            - "text-moderation-stable": Stable text model, less frequent updates
            - "omni-moderation-latest": Latest multi-modal model (supports text + images, default)
            - "omni-moderation-2024-09-26": Specific omni model version

        extra_headers: Additional HTTP headers.
        extra_query: Additional query parameters.
        extra_body: Additional JSON fields.
        timeout: Request timeout in seconds.

    Returns:
        ModerationCreateResponse: Contains flagged status and category scores
        for each input.

    Raises:
        BadRequestError: Input exceeds maximum length
        AuthenticationError: Invalid API key
    """
```

Usage examples:

```python
from openai import OpenAI

client = OpenAI()

# Check single text
response = client.moderations.create(
    input="I want to hurt someone"
)

result = response.results[0]
print(f"Flagged: {result.flagged}")

if result.flagged:
    print("Violated categories:")
    for category, flagged in result.categories.model_dump().items():
        if flagged:
            score = getattr(result.category_scores, category)
            print(f"  {category}: {score:.4f}")

# Check multiple texts
texts = [
    "Hello, how are you?",
    "This is inappropriate content",
    "What's the weather like today?"
]

response = client.moderations.create(input=texts)

for i, result in enumerate(response.results):
    print(f"Text {i + 1}: {'Flagged' if result.flagged else 'Safe'}")

# Use latest omni model
response = client.moderations.create(
    model="omni-moderation-latest",
    input="Check this message for violations"
)

# Use stable model for consistent behavior
response = client.moderations.create(
    model="text-moderation-stable",
    input="Testing moderation"
)

# Multi-modal moderation with text and images
response = client.moderations.create(
    model="omni-moderation-latest",
    input=[
        {"type": "text", "text": "Check this message"},
        {
            "type": "image_url",
            "image_url": {"url": "https://example.com/image.jpg"}
        }
    ]
)

# Moderate image from base64
import base64

with open("image.jpg", "rb") as f:
    image_data = base64.b64encode(f.read()).decode()

response = client.moderations.create(
    model="omni-moderation-latest",
    input=[
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}
        }
    ]
)

# Detailed category analysis
response = client.moderations.create(
    input="Potentially problematic text"
)

result = response.results[0]

# All categories and scores
categories = result.categories
scores = result.category_scores

print("Category Analysis:")
print(f"  Hate: {scores.hate:.4f} (flagged: {categories.hate})")
print(f"  Hate/Threatening: {scores.hate_threatening:.4f} (flagged: {categories.hate_threatening})")
print(f"  Harassment: {scores.harassment:.4f} (flagged: {categories.harassment})")
print(f"  Harassment/Threatening: {scores.harassment_threatening:.4f} (flagged: {categories.harassment_threatening})")
print(f"  Self-Harm: {scores.self_harm:.4f} (flagged: {categories.self_harm})")
print(f"  Self-Harm/Intent: {scores.self_harm_intent:.4f} (flagged: {categories.self_harm_intent})")
print(f"  Self-Harm/Instructions: {scores.self_harm_instructions:.4f} (flagged: {categories.self_harm_instructions})")
print(f"  Sexual: {scores.sexual:.4f} (flagged: {categories.sexual})")
print(f"  Sexual/Minors: {scores.sexual_minors:.4f} (flagged: {categories.sexual_minors})")
print(f"  Violence: {scores.violence:.4f} (flagged: {categories.violence})")
print(f"  Violence/Graphic: {scores.violence_graphic:.4f} (flagged: {categories.violence_graphic})")

# Filter user content example
def is_safe_content(text: str) -> tuple[bool, list[str]]:
    """
    Check if content is safe to use.
    Returns (is_safe, violated_categories)
    """
    response = client.moderations.create(input=text)
    result = response.results[0]

    if not result.flagged:
        return True, []

    violated = [
        category for category, flagged in result.categories.model_dump().items()
        if flagged
    ]

    return False, violated

# Use in application
user_input = "Some user-generated content"
is_safe, violations = is_safe_content(user_input)

if is_safe:
    print("Content approved")
else:
    print(f"Content rejected. Violations: {', '.join(violations)}")
```

## Types

```python { .api }
from typing import Literal, Union
from typing_extensions import TypedDict
from pydantic import BaseModel

class ModerationCreateResponse(BaseModel):
    """Moderation response."""
    id: str
    model: str
    results: list[ModerationResult]

class ModerationResult(BaseModel):
    """Single moderation result."""
    flagged: bool
    categories: ModerationCategories
    category_scores: ModerationCategoryScores
    category_applied_input_types: ModerationCategoryAppliedInputTypes

class ModerationCategories(BaseModel):
    """Category flags (true if violated)."""
    hate: bool
    hate_threatening: bool
    harassment: bool
    harassment_threatening: bool
    self_harm: bool
    self_harm_intent: bool
    self_harm_instructions: bool
    sexual: bool
    sexual_minors: bool
    violence: bool
    violence_graphic: bool
    illicit: bool
    illicit_violent: bool

class ModerationCategoryScores(BaseModel):
    """Confidence scores (0-1) for each category."""
    hate: float
    hate_threatening: float
    harassment: float
    harassment_threatening: float
    self_harm: float
    self_harm_intent: float
    self_harm_instructions: float
    sexual: float
    sexual_minors: float
    violence: float
    violence_graphic: float
    illicit: float
    illicit_violent: float

class ModerationCategoryAppliedInputTypes(BaseModel):
    """Input types that triggered each category."""
    hate: list[str]
    hate_threatening: list[str]
    harassment: list[str]
    harassment_threatening: list[str]
    self_harm: list[str]
    self_harm_intent: list[str]
    self_harm_instructions: list[str]
    sexual: list[str]
    sexual_minors: list[str]
    violence: list[str]
    violence_graphic: list[str]
    illicit: list[str]
    illicit_violent: list[str]

# Model type
ModerationModel = Literal[
    "text-moderation-latest",
    "text-moderation-stable",
    "omni-moderation-latest",
    "omni-moderation-2024-09-26"
]

# Multi-modal input types
class ModerationTextInputParam(TypedDict):
    """Text input for moderation."""
    text: str  # Required: Text content to moderate
    type: Literal["text"]  # Required: Always "text"

class ImageURL(TypedDict):
    """Image URL or base64 data."""
    url: str  # Required: URL or data:image/...;base64,... string

class ModerationImageURLInputParam(TypedDict):
    """Image input for moderation."""
    image_url: ImageURL  # Required: Image URL or base64 data
    type: Literal["image_url"]  # Required: Always "image_url"

# Union type for multi-modal inputs
ModerationMultiModalInputParam = Union[
    ModerationTextInputParam,
    ModerationImageURLInputParam
]
```

## Category Descriptions

| Category | Description |
|----------|-------------|
| hate | Content expressing, inciting, or promoting hate based on protected characteristics |
| hate/threatening | Hateful content that also includes violence or serious harm |
| harassment | Content harassing, bullying, or abusing an individual |
| harassment/threatening | Harassing content that also includes violence or serious harm |
| self-harm | Content promoting, encouraging, or depicting acts of self-harm |
| self-harm/intent | Content indicating intent to engage in self-harm |
| self-harm/instructions | Content providing instructions or advice for self-harm |
| sexual | Content meant to arouse sexual excitement |
| sexual/minors | Sexual content involving individuals under 18 |
| violence | Content depicting death, violence, or physical injury |
| violence/graphic | Graphic violent content with extreme detail |
| illicit | Content promoting illicit substances or illegal activities |
| illicit/violent | Illicit content involving violence |

## Best Practices

```python
from openai import OpenAI

client = OpenAI()

# 1. Moderate user-generated content before processing
def moderate_before_processing(user_text: str):
    response = client.moderations.create(input=user_text)

    if response.results[0].flagged:
        return None, "Content violates policies"

    # Safe to process
    return process_safe_content(user_text), None

# 2. Batch moderation for efficiency
user_messages = ["msg1", "msg2", "msg3"]
response = client.moderations.create(input=user_messages)

safe_messages = [
    msg for msg, result in zip(user_messages, response.results)
    if not result.flagged
]

# 3. Log violations for analysis
for i, result in enumerate(response.results):
    if result.flagged:
        violated_categories = [
            cat for cat, flagged in result.categories.model_dump().items()
            if flagged
        ]
        log_violation(user_messages[i], violated_categories)

# 4. Use thresholds for borderline content
def is_definitely_safe(text: str, threshold: float = 0.5) -> bool:
    response = client.moderations.create(input=text)
    result = response.results[0]

    # Check if any score exceeds threshold
    scores = result.category_scores.model_dump()
    return all(score < threshold for score in scores.values())
```

## Async Usage

```python
import asyncio
from openai import AsyncOpenAI

async def moderate_content(text: str):
    client = AsyncOpenAI()

    response = await client.moderations.create(input=text)
    return response.results[0].flagged

# Run async
is_flagged = asyncio.run(moderate_content("Check this text"))
```