docs
0
# Evaluations
1
2
Create and manage evaluations to test model performance with custom testing criteria and data sources. The Evals API enables systematic evaluation of different models and parameters against consistent benchmarks.
3
4
## Capabilities
5
6
### Create Evaluation
7
8
Create an evaluation structure with testing criteria and data source configuration.
9
10
```python { .api }
11
def create(
12
self,
13
*,
14
data_source_config: dict,
15
testing_criteria: Iterable[dict],
16
metadata: dict[str, str] | None | Omit = omit,
17
name: str | Omit = omit,
18
extra_headers: dict[str, str] | None = None,
19
extra_query: dict[str, object] | None = None,
20
extra_body: dict[str, object] | None = None,
21
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
22
) -> Eval:
23
"""
24
Create an evaluation for testing model performance.
25
26
Args:
27
data_source_config: Configuration for the data source used in eval runs.
28
Dictates the schema of data used in the evaluation.
29
Example: {
30
"type": "file",
31
"file_id": "file-abc123",
32
"schema": {
33
"input": {"type": "string"},
34
"expected_output": {"type": "string"}
35
}
36
}
37
38
testing_criteria: List of graders for all eval runs. Graders can reference
39
variables in the data source using double curly braces notation like
40
{{item.variable_name}}. To reference model output, use {{sample.output_text}}.
41
Example: [
42
{
43
"type": "exact_match",
44
"expected": "{{item.expected_output}}",
45
"actual": "{{sample.output_text}}"
46
},
47
{
48
"type": "contains",
49
"substring": "{{item.keyword}}",
50
"text": "{{sample.output_text}}"
51
}
52
]
53
54
metadata: Up to 16 key-value pairs for storing additional information.
55
Keys max 64 characters, values max 512 characters.
56
57
name: Name of the evaluation for identification.
58
59
extra_headers: Additional HTTP headers.
60
extra_query: Additional query parameters.
61
extra_body: Additional JSON fields.
62
timeout: Request timeout in seconds.
63
64
Returns:
65
Eval: Created evaluation object with ID for running evaluations.
66
67
Notes:
68
- After creating an evaluation, run it on different models/parameters
69
- See https://platform.openai.com/docs/guides/evals for grader types
70
- Supported graders: exact_match, contains, llm_judge, custom_code
71
"""
72
```
73
74
Usage example:
75
76
```python
from openai import OpenAI

client = OpenAI()

# Create evaluation with testing criteria
# (named `evaluation` to avoid shadowing the built-in `eval`)
evaluation = client.evals.create(
    name="Customer Support Eval",
    data_source_config={
        "type": "file",
        "file_id": "file-abc123",
        "schema": {
            "customer_query": {"type": "string"},
            "expected_tone": {"type": "string"},
            "expected_answer": {"type": "string"}
        }
    },
    testing_criteria=[
        {
            "type": "exact_match",
            "name": "Answer Correctness",
            "expected": "{{item.expected_answer}}",
            "actual": "{{sample.output_text}}"
        },
        {
            "type": "llm_judge",
            "name": "Tone Check",
            "prompt": "Does the response match the tone: {{item.expected_tone}}?",
            "text": "{{sample.output_text}}"
        }
    ],
    metadata={
        "team": "customer-success",
        "version": "v1"
    }
)

print(f"Created evaluation: {evaluation.id}")
```
115
116
### Retrieve Evaluation
117
118
Get an evaluation by ID.
119
120
```python { .api }
121
def retrieve(
122
self,
123
eval_id: str,
124
*,
125
extra_headers: dict[str, str] | None = None,
126
extra_query: dict[str, object] | None = None,
127
extra_body: dict[str, object] | None = None,
128
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
129
) -> Eval:
130
"""
131
Retrieve an evaluation by its ID.
132
133
Args:
134
eval_id: ID of the evaluation to retrieve.
135
136
extra_headers: Additional HTTP headers.
137
extra_query: Additional query parameters.
138
extra_body: Additional JSON fields.
139
timeout: Request timeout in seconds.
140
141
Returns:
142
Eval: Evaluation object with full configuration and metadata.
143
"""
144
```
145
146
### Update Evaluation
147
148
Update evaluation properties like name or metadata.
149
150
```python { .api }
151
def update(
152
self,
153
eval_id: str,
154
*,
155
metadata: dict[str, str] | None | Omit = omit,
156
name: str | Omit = omit,
157
extra_headers: dict[str, str] | None = None,
158
extra_query: dict[str, object] | None = None,
159
extra_body: dict[str, object] | None = None,
160
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
161
) -> Eval:
162
"""
163
Update certain properties of an evaluation.
164
165
Args:
166
eval_id: ID of the evaluation to update.
167
168
metadata: New metadata key-value pairs. Replaces existing metadata.
169
Up to 16 pairs, keys max 64 chars, values max 512 chars.
170
171
name: New name for the evaluation.
172
173
extra_headers: Additional HTTP headers.
174
extra_query: Additional query parameters.
175
extra_body: Additional JSON fields.
176
timeout: Request timeout in seconds.
177
178
Returns:
179
Eval: Updated evaluation object.
180
181
Notes:
182
- Only name and metadata can be updated
183
- Cannot modify data_source_config or testing_criteria after creation
184
"""
185
```
186
187
### List Evaluations
188
189
List all evaluations for the current project.
190
191
```python { .api }
192
def list(
193
self,
194
*,
195
after: str | Omit = omit,
196
limit: int | Omit = omit,
197
order: Literal["asc", "desc"] | Omit = omit,
198
order_by: Literal["created_at", "updated_at"] | Omit = omit,
199
extra_headers: dict[str, str] | None = None,
200
extra_query: dict[str, object] | None = None,
201
extra_body: dict[str, object] | None = None,
202
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
203
) -> SyncCursorPage[Eval]:
204
"""
205
List evaluations for a project.
206
207
Args:
208
after: Cursor for pagination. ID of last eval from previous request.
209
210
limit: Number of evaluations to retrieve. Default varies by API.
211
212
order: Sort order by timestamp.
213
- "asc": Ascending (oldest first)
214
- "desc": Descending (newest first, default)
215
216
order_by: Field to order by.
217
- "created_at": Creation time (default)
218
- "updated_at": Last modification time
219
220
extra_headers: Additional HTTP headers.
221
extra_query: Additional query parameters.
222
extra_body: Additional JSON fields.
223
timeout: Request timeout in seconds.
224
225
Returns:
226
SyncCursorPage[Eval]: Paginated list of evaluations.
227
Supports iteration: for eval in client.evals.list(): ...
228
"""
229
```
230
231
Usage examples:
232
233
```python
from openai import OpenAI

client = OpenAI()

# List all evaluations
# (loop variable named `evaluation` to avoid shadowing the built-in `eval`)
for evaluation in client.evals.list():
    print(f"{evaluation.name}: {evaluation.id}")

# List with pagination
page = client.evals.list(limit=10)
for evaluation in page:
    print(evaluation.name)

# Get next page
if page.has_more:
    next_page = client.evals.list(
        limit=10,
        after=page.data[-1].id
    )

# List by last updated
for evaluation in client.evals.list(order_by="updated_at", order="desc"):
    print(f"{evaluation.name} - Updated: {evaluation.updated_at}")
```
258
259
### Delete Evaluation
260
261
Delete an evaluation.
262
263
```python { .api }
264
def delete(
265
self,
266
eval_id: str,
267
*,
268
extra_headers: dict[str, str] | None = None,
269
extra_query: dict[str, object] | None = None,
270
extra_body: dict[str, object] | None = None,
271
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
272
) -> EvalDeleteResponse:
273
"""
274
Delete an evaluation.
275
276
Args:
277
eval_id: ID of the evaluation to delete.
278
279
extra_headers: Additional HTTP headers.
280
extra_query: Additional query parameters.
281
extra_body: Additional JSON fields.
282
timeout: Request timeout in seconds.
283
284
Returns:
285
EvalDeleteResponse: Confirmation of deletion with ID and status.
286
287
Notes:
288
- Deletion is permanent
289
- Associated eval runs are also deleted
290
"""
291
```
292
293
### Evaluation Runs
294
295
Run an evaluation against a model configuration.
296
297
```python { .api }
298
# Access via client.evals.runs
299
300
def create(
301
self,
302
eval_id: str,
303
*,
304
data_source: dict,
305
metadata: dict[str, str] | None | Omit = omit,
306
name: str | Omit = omit,
307
extra_headers: dict[str, str] | None = None,
308
extra_query: dict[str, object] | None = None,
309
extra_body: dict[str, object] | None = None,
310
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
311
) -> RunCreateResponse:
312
"""
313
Create a run of the evaluation with specified data source and configuration.
314
315
Args:
316
eval_id: ID of the evaluation to run.
317
318
data_source: Details about the run's data source. Can be one of:
- File content: {"type": "file_content", "content": [...]}
- File ID: {"type": "file_id", "file_id": "file-xxx"}
- Completions: {"type": "completions", "model": "...", "source": {...}, "sampling_params": {...}} to generate fresh model responses as part of the run (this is how the model under test and its sampling parameters are specified)
The data source will be validated against the schema specified in the evaluation config.
322
323
metadata: Set of up to 16 key-value pairs that can be attached to an object.
324
Keys have a maximum length of 64 characters.
325
Values have a maximum length of 512 characters.
326
327
name: The name of the run.
328
329
extra_headers: Additional HTTP headers.
330
extra_query: Additional query parameters.
331
extra_body: Additional JSON fields.
332
timeout: Request timeout in seconds.
333
334
Returns:
335
RunCreateResponse: Created run object. Use retrieve() to check status and results.
336
"""
337
338
def retrieve(
339
self,
340
eval_id: str,
341
run_id: str,
342
*,
343
extra_headers: dict[str, str] | None = None,
344
extra_query: dict[str, object] | None = None,
345
extra_body: dict[str, object] | None = None,
346
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
347
) -> EvalRun:
348
"""
349
Retrieve a specific evaluation run.
350
351
Args:
352
eval_id: ID of the evaluation.
353
run_id: ID of the run.
354
355
extra_headers: Additional HTTP headers.
356
extra_query: Additional query parameters.
357
extra_body: Additional JSON fields.
358
timeout: Request timeout in seconds.
359
360
Returns:
361
EvalRun: Run object with status, results, and scores.
362
"""
363
364
def cancel(
365
self,
366
run_id: str,
367
*,
368
eval_id: str,
369
extra_headers: dict[str, str] | None = None,
370
extra_query: dict[str, object] | None = None,
371
extra_body: dict[str, object] | None = None,
372
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
373
) -> RunCancelResponse:
374
"""
375
Cancel an ongoing evaluation run.
376
377
Args:
378
run_id: ID of the run to cancel.
379
eval_id: ID of the evaluation.
380
381
extra_headers: Additional HTTP headers.
382
extra_query: Additional query parameters.
383
extra_body: Additional JSON fields.
384
timeout: Request timeout in seconds.
385
386
Returns:
387
RunCancelResponse: Confirmation of run cancellation.
388
"""
389
390
def delete(
391
self,
392
run_id: str,
393
*,
394
eval_id: str,
395
extra_headers: dict[str, str] | None = None,
396
extra_query: dict[str, object] | None = None,
397
extra_body: dict[str, object] | None = None,
398
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
399
) -> RunDeleteResponse:
400
"""
401
Permanently delete an evaluation run.
402
403
Args:
404
run_id: ID of the run to delete.
405
eval_id: ID of the evaluation.
406
407
extra_headers: Additional HTTP headers.
408
extra_query: Additional query parameters.
409
extra_body: Additional JSON fields.
410
timeout: Request timeout in seconds.
411
412
Returns:
413
RunDeleteResponse: Confirmation of run deletion.
414
"""
415
416
def list(
417
self,
418
eval_id: str,
419
*,
420
after: str | Omit = omit,
421
limit: int | Omit = omit,
422
order: Literal["asc", "desc"] | Omit = omit,
423
status: Literal["queued", "in_progress", "completed", "canceled", "failed"] | Omit = omit,
424
extra_headers: dict[str, str] | None = None,
425
extra_query: dict[str, object] | None = None,
426
extra_body: dict[str, object] | None = None,
427
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
428
) -> SyncCursorPage[RunListResponse]:
429
"""
430
List all runs for an evaluation.
431
432
Args:
433
eval_id: ID of the evaluation.
434
after: Cursor for pagination.
435
limit: Number of runs to retrieve.
436
order: Sort order ("asc" or "desc").
437
status: Filter by run status ("queued", "in_progress", "completed", "canceled", or "failed").
438
439
extra_headers: Additional HTTP headers.
440
extra_query: Additional query parameters.
441
extra_body: Additional JSON fields.
442
timeout: Request timeout in seconds.
443
444
Returns:
445
SyncCursorPage[RunListResponse]: Paginated list of runs.
446
"""
447
```
448
449
### Evaluation Run Output Items
450
451
Inspect individual output items from an evaluation run.
452
453
```python { .api }
454
# Access via client.evals.runs.output_items
455
456
def retrieve(
457
self,
458
output_item_id: str,
459
*,
460
eval_id: str,
461
run_id: str,
462
extra_headers: dict[str, str] | None = None,
463
extra_query: dict[str, object] | None = None,
464
extra_body: dict[str, object] | None = None,
465
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
466
) -> OutputItemRetrieveResponse:
467
"""
468
Get an evaluation run output item by ID.
469
470
Args:
471
output_item_id: ID of the output item to retrieve.
472
473
eval_id: ID of the evaluation.
474
475
run_id: ID of the evaluation run.
476
477
extra_headers: Additional HTTP headers.
478
extra_query: Additional query parameters.
479
extra_body: Additional JSON fields.
480
timeout: Request timeout in seconds.
481
482
Returns:
483
OutputItemRetrieveResponse: Individual output item with test results
484
and grader scores for a specific data point in the evaluation.
485
"""
486
487
def list(
488
self,
489
run_id: str,
490
*,
491
eval_id: str,
492
after: str | Omit = omit,
493
limit: int | Omit = omit,
494
order: Literal["asc", "desc"] | Omit = omit,
495
status: Literal["fail", "pass"] | Omit = omit,
496
extra_headers: dict[str, str] | None = None,
497
extra_query: dict[str, object] | None = None,
498
extra_body: dict[str, object] | None = None,
499
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
500
) -> SyncCursorPage[OutputItemListResponse]:
501
"""
502
Get a list of output items for an evaluation run.
503
504
Args:
505
run_id: ID of the evaluation run.
506
507
eval_id: ID of the evaluation.
508
509
after: Cursor for pagination. ID of last output item from previous request.
510
511
limit: Number of output items to retrieve.
512
513
order: Sort order by timestamp.
514
- "asc": Ascending (oldest first)
515
- "desc": Descending (newest first, default)
516
517
status: Filter output items by status.
518
- "fail": Only failed output items
519
- "pass": Only passed output items
520
Omit to retrieve all output items.
521
522
extra_headers: Additional HTTP headers.
523
extra_query: Additional query parameters.
524
extra_body: Additional JSON fields.
525
timeout: Request timeout in seconds.
526
527
Returns:
528
SyncCursorPage[OutputItemListResponse]: Paginated list of output items.
529
Each output item contains the model output and grader results for
530
a single test case in the evaluation run.
531
"""
532
```
533
534
Usage example:
535
536
```python
537
from openai import OpenAI
538
539
client = OpenAI()
540
541
# List all output items for a run
542
for output_item in client.evals.runs.output_items.list(
543
eval_id="eval-abc123",
544
run_id="run-def456"
545
):
546
print(f"Item {output_item.id}: {output_item.status}")
547
if output_item.status == "fail":
548
print(f" Failed on: {output_item.grader_results}")
549
550
# Filter only failed items
551
failed_items = client.evals.runs.output_items.list(
552
eval_id="eval-abc123",
553
run_id="run-def456",
554
status="fail"
555
)
556
557
for item in failed_items:
558
print(f"Failed item: {item.id}")
559
# Retrieve detailed information
560
detail = client.evals.runs.output_items.retrieve(
561
eval_id="eval-abc123",
562
run_id="run-def456",
563
output_item_id=item.id
564
)
565
print(f" Model output: {detail.model_output}")
566
print(f" Expected: {detail.expected}")
567
568
# Paginate through all output items
569
page = client.evals.runs.output_items.list(
570
eval_id="eval-abc123",
571
run_id="run-def456",
572
limit=10,
573
order="asc"
574
)
575
576
for item in page:
577
print(f"{item.id}: {item.status}")
578
```
579
580
Complete workflow example:
581
582
```python
from openai import OpenAI
import time

client = OpenAI()

# 1. Create evaluation
# (named `evaluation` to avoid shadowing the built-in `eval`)
evaluation = client.evals.create(
    name="Model Comparison",
    data_source_config={
        "type": "file",
        "file_id": "file-abc123"
    },
    testing_criteria=[
        {
            "type": "exact_match",
            "expected": "{{item.expected}}",
            "actual": "{{sample.output_text}}"
        }
    ]
)

# 2. Run evaluation with different models.
# The model under test and its sampling parameters are specified
# inside the run's data_source.
run_gpt4 = client.evals.runs.create(
    eval_id=evaluation.id,
    name="gpt-4 run",
    data_source={
        "type": "completions",
        "source": {"type": "file_id", "file_id": "file-abc123"},
        "model": "gpt-4",
        "sampling_params": {"temperature": 0.7}
    },
    metadata={"variant": "gpt-4"}
)

run_gpt35 = client.evals.runs.create(
    eval_id=evaluation.id,
    name="gpt-3.5 run",
    data_source={
        "type": "completions",
        "source": {"type": "file_id", "file_id": "file-abc123"},
        "model": "gpt-3.5-turbo",
        "sampling_params": {"temperature": 0.7}
    },
    metadata={"variant": "gpt-3.5"}
)

# 3. Wait for completion and check results
def wait_for_run(eval_id: str, run_id: str):
    while True:
        run = client.evals.runs.retrieve(eval_id=eval_id, run_id=run_id)
        if run.status in ["completed", "failed", "canceled"]:
            return run
        time.sleep(2)

gpt4_results = wait_for_run(evaluation.id, run_gpt4.id)
gpt35_results = wait_for_run(evaluation.id, run_gpt35.id)

print(f"GPT-4 Score: {gpt4_results.score}")
print(f"GPT-3.5 Score: {gpt35_results.score}")

# 4. Compare results
for run in client.evals.runs.list(eval_id=evaluation.id):
    print(f"{run.model}: {run.score} ({run.status})")

# 5. Update evaluation name
client.evals.update(
    eval_id=evaluation.id,
    name="Model Comparison - Updated",
    metadata={"status": "active"}
)

# 6. Clean up
client.evals.delete(eval_id=evaluation.id)
```
647
648
## Async Usage
649
650
```python
import asyncio
from openai import AsyncOpenAI

async def run_eval():
    client = AsyncOpenAI()

    # Create evaluation
    evaluation = await client.evals.create(
        name="Async Eval",
        data_source_config={"type": "file", "file_id": "file-abc"},
        testing_criteria=[{"type": "exact_match", "expected": "{{item.expected}}", "actual": "{{sample.output_text}}"}]
    )

    # Run evaluation (model configuration goes inside the data_source)
    run = await client.evals.runs.create(
        eval_id=evaluation.id,
        data_source={
            "type": "completions",
            "source": {"type": "file_id", "file_id": "file-abc"},
            "model": "gpt-4"
        }
    )

    # Poll until the run finishes, then return the score
    while True:
        result = await client.evals.runs.retrieve(
            eval_id=evaluation.id,
            run_id=run.id
        )
        if result.status in ["completed", "failed", "canceled"]:
            return result.score
        await asyncio.sleep(2)

score = asyncio.run(run_eval())
```
680
681
## Types
682
683
```python { .api }
684
from typing import Iterator, Literal
from pydantic import BaseModel
686
687
class Eval(BaseModel):
688
"""Evaluation object."""
689
id: str
690
created_at: int
691
updated_at: int
692
name: str | None
693
object: Literal["eval"]
694
data_source_config: dict
695
testing_criteria: list[dict]
696
metadata: dict[str, str] | None
697
698
class EvalRun(BaseModel):
699
"""Evaluation run object."""
700
id: str
701
eval_id: str
702
created_at: int
703
model: str
704
status: Literal["queued", "in_progress", "completed", "failed", "canceled"]
705
score: float | None # Present when status="completed"
706
results: list[dict] | None # Detailed results per test case
707
metadata: dict[str, str] | None
708
709
class EvalDeleteResponse(BaseModel):
710
"""Response from delete operation."""
711
id: str
712
deleted: bool
713
object: Literal["eval.deleted"]
714
715
class OutputItemRetrieveResponse(BaseModel):
716
"""Individual evaluation run output item."""
717
id: str
718
eval_id: str
719
run_id: str
720
status: Literal["fail", "pass"]
721
model_output: str | dict # Model's generated output
722
expected: str | dict | None # Expected output if defined
723
grader_results: list[dict] # Results from each grader
724
created_at: int
725
726
class OutputItemListResponse(BaseModel):
727
"""Evaluation run output item in list responses."""
728
id: str
729
eval_id: str
730
run_id: str
731
status: Literal["fail", "pass"]
732
created_at: int
733
734
class SyncCursorPage[T]:
735
"""Cursor-based pagination."""
736
data: list[T]
737
has_more: bool
738
def __iter__(self) -> Iterator[T]: ...
739
740
class Omit:
741
"""Sentinel value for omitted parameters."""
742
```
743
744
## Access Pattern
745
746
```python
747
# Synchronous
748
from openai import OpenAI
749
client = OpenAI()
750
client.evals.create(...)
751
client.evals.retrieve(...)
752
client.evals.update(...)
753
client.evals.list(...)
754
client.evals.delete(...)
755
client.evals.runs.create(...)
756
client.evals.runs.retrieve(...)
757
client.evals.runs.list(...)
758
client.evals.runs.output_items.retrieve(...)
759
client.evals.runs.output_items.list(...)
760
761
# Asynchronous
762
from openai import AsyncOpenAI
763
client = AsyncOpenAI()
764
await client.evals.create(...)
765
await client.evals.retrieve(...)
766
await client.evals.update(...)
767
await client.evals.list(...)
768
await client.evals.delete(...)
769
await client.evals.runs.create(...)
770
await client.evals.runs.retrieve(...)
771
await client.evals.runs.list(...)
772
await client.evals.runs.output_items.retrieve(...)
773
await client.evals.runs.output_items.list(...)
774
```
775