docs
0
# Evaluations
1
2
Create and manage evaluations to test model performance with custom testing criteria and data sources. The Evals API enables systematic evaluation of different models and parameters against consistent benchmarks.
3
4
## Capabilities
5
6
### Create Evaluation
7
8
Create an evaluation structure with testing criteria and data source configuration.
9
10
```python { .api }
11
def create(
12
self,
13
*,
14
data_source_config: dict,
15
testing_criteria: Iterable[dict],
16
metadata: dict[str, str] | None | Omit = omit,
17
name: str | Omit = omit,
18
extra_headers: dict[str, str] | None = None,
19
extra_query: dict[str, object] | None = None,
20
extra_body: dict[str, object] | None = None,
21
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
22
) -> Eval:
23
"""
24
Create an evaluation for testing model performance.
25
26
Args:
27
data_source_config: Configuration for the data source used in eval runs.
28
Dictates the schema of data used in the evaluation.
29
Example: {
30
"type": "file",
31
"file_id": "file-abc123",
32
"schema": {
33
"input": {"type": "string"},
34
"expected_output": {"type": "string"}
35
}
36
}
37
38
testing_criteria: List of graders for all eval runs. Graders can reference
39
variables in the data source using double curly braces notation like
40
{{item.variable_name}}. To reference model output, use {{sample.output_text}}.
41
Example: [
42
{
43
"type": "exact_match",
44
"expected": "{{item.expected_output}}",
45
"actual": "{{sample.output_text}}"
46
},
47
{
48
"type": "contains",
49
"substring": "{{item.keyword}}",
50
"text": "{{sample.output_text}}"
51
}
52
]
53
54
metadata: Up to 16 key-value pairs for storing additional information.
55
Keys max 64 characters, values max 512 characters.
56
57
name: Name of the evaluation for identification.
58
59
extra_headers: Additional HTTP headers.
60
extra_query: Additional query parameters.
61
extra_body: Additional JSON fields.
62
timeout: Request timeout in seconds.
63
64
Returns:
65
Eval: Created evaluation object with ID for running evaluations.
66
67
Notes:
68
- After creating an evaluation, run it on different models/parameters
69
- See https://platform.openai.com/docs/guides/evals for grader types
70
- Supported graders: exact_match, contains, llm_judge, custom_code
71
"""
72
```
73
74
Usage example:
75
76
```python
from openai import OpenAI

client = OpenAI()

# Create evaluation with testing criteria
# (named `evaluation` to avoid shadowing the built-in `eval`)
evaluation = client.evals.create(
    name="Customer Support Eval",
    data_source_config={
        "type": "file",
        "file_id": "file-abc123",
        "schema": {
            "customer_query": {"type": "string"},
            "expected_tone": {"type": "string"},
            "expected_answer": {"type": "string"}
        }
    },
    testing_criteria=[
        {
            "type": "exact_match",
            "name": "Answer Correctness",
            "expected": "{{item.expected_answer}}",
            "actual": "{{sample.output_text}}"
        },
        {
            "type": "llm_judge",
            "name": "Tone Check",
            "prompt": "Does the response match the tone: {{item.expected_tone}}?",
            "text": "{{sample.output_text}}"
        }
    ],
    metadata={
        "team": "customer-success",
        "version": "v1"
    }
)

print(f"Created evaluation: {evaluation.id}")
```
115
116
### Retrieve Evaluation
117
118
Get an evaluation by ID.
119
120
```python { .api }
121
def retrieve(
122
self,
123
eval_id: str,
124
*,
125
extra_headers: dict[str, str] | None = None,
126
extra_query: dict[str, object] | None = None,
127
extra_body: dict[str, object] | None = None,
128
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
129
) -> Eval:
130
"""
131
Retrieve an evaluation by its ID.
132
133
Args:
134
eval_id: ID of the evaluation to retrieve.
135
136
extra_headers: Additional HTTP headers.
137
extra_query: Additional query parameters.
138
extra_body: Additional JSON fields.
139
timeout: Request timeout in seconds.
140
141
Returns:
142
Eval: Evaluation object with full configuration and metadata.
143
"""
144
```
145
146
### Update Evaluation
147
148
Update evaluation properties like name or metadata.
149
150
```python { .api }
151
def update(
152
self,
153
eval_id: str,
154
*,
155
metadata: dict[str, str] | None | Omit = omit,
156
name: str | Omit = omit,
157
extra_headers: dict[str, str] | None = None,
158
extra_query: dict[str, object] | None = None,
159
extra_body: dict[str, object] | None = None,
160
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
161
) -> Eval:
162
"""
163
Update certain properties of an evaluation.
164
165
Args:
166
eval_id: ID of the evaluation to update.
167
168
metadata: New metadata key-value pairs. Replaces existing metadata.
169
Up to 16 pairs, keys max 64 chars, values max 512 chars.
170
171
name: New name for the evaluation.
172
173
extra_headers: Additional HTTP headers.
174
extra_query: Additional query parameters.
175
extra_body: Additional JSON fields.
176
timeout: Request timeout in seconds.
177
178
Returns:
179
Eval: Updated evaluation object.
180
181
Notes:
182
- Only name and metadata can be updated
183
- Cannot modify data_source_config or testing_criteria after creation
184
"""
185
```
186
187
### List Evaluations
188
189
List all evaluations for the current project.
190
191
```python { .api }
192
def list(
193
self,
194
*,
195
after: str | Omit = omit,
196
limit: int | Omit = omit,
197
order: Literal["asc", "desc"] | Omit = omit,
198
order_by: Literal["created_at", "updated_at"] | Omit = omit,
199
extra_headers: dict[str, str] | None = None,
200
extra_query: dict[str, object] | None = None,
201
extra_body: dict[str, object] | None = None,
202
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
203
) -> SyncCursorPage[Eval]:
204
"""
205
List evaluations for a project.
206
207
Args:
208
after: Cursor for pagination. ID of last eval from previous request.
209
210
limit: Number of evaluations to retrieve. Default varies by API.
211
212
order: Sort order by timestamp.
213
- "asc": Ascending (oldest first)
214
- "desc": Descending (newest first, default)
215
216
order_by: Field to order by.
217
- "created_at": Creation time (default)
218
- "updated_at": Last modification time
219
220
extra_headers: Additional HTTP headers.
221
extra_query: Additional query parameters.
222
extra_body: Additional JSON fields.
223
timeout: Request timeout in seconds.
224
225
Returns:
226
SyncCursorPage[Eval]: Paginated list of evaluations.
227
Supports iteration: for eval in client.evals.list(): ...
228
"""
229
```
230
231
Usage examples:
232
233
```python
from openai import OpenAI

client = OpenAI()

# List all evaluations
# (loop variable named `evaluation` to avoid shadowing the built-in `eval`)
for evaluation in client.evals.list():
    print(f"{evaluation.name}: {evaluation.id}")

# List with pagination
page = client.evals.list(limit=10)
for evaluation in page:
    print(evaluation.name)

# Get next page
if page.has_more:
    next_page = client.evals.list(
        limit=10,
        after=page.data[-1].id
    )

# List by last updated
for evaluation in client.evals.list(order_by="updated_at", order="desc"):
    print(f"{evaluation.name} - Updated: {evaluation.updated_at}")
```
258
259
### Delete Evaluation
260
261
Delete an evaluation.
262
263
```python { .api }
264
def delete(
265
self,
266
eval_id: str,
267
*,
268
extra_headers: dict[str, str] | None = None,
269
extra_query: dict[str, object] | None = None,
270
extra_body: dict[str, object] | None = None,
271
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
272
) -> EvalDeleteResponse:
273
"""
274
Delete an evaluation.
275
276
Args:
277
eval_id: ID of the evaluation to delete.
278
279
extra_headers: Additional HTTP headers.
280
extra_query: Additional query parameters.
281
extra_body: Additional JSON fields.
282
timeout: Request timeout in seconds.
283
284
Returns:
285
EvalDeleteResponse: Confirmation of deletion with ID and status.
286
287
Notes:
288
- Deletion is permanent
289
- Associated eval runs are also deleted
290
"""
291
```
292
293
### Evaluation Runs
294
295
Run an evaluation against a model configuration.
296
297
```python { .api }
298
# Access via client.evals.runs
299
300
def create(
301
self,
302
eval_id: str,
303
*,
304
data_source: dict,
305
metadata: dict[str, str] | None | Omit = omit,
306
name: str | Omit = omit,
307
extra_headers: dict[str, str] | None = None,
308
extra_query: dict[str, object] | None = None,
309
extra_body: dict[str, object] | None = None,
310
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
311
) -> RunCreateResponse:
312
"""
313
Create a run of the evaluation with specified data source and configuration.
314
315
Args:
316
eval_id: ID of the evaluation to run.
317
318
data_source: Details about the run's data source. Can be one of:
- File content: {"type": "file_content", "content": [...]}
- File ID: {"type": "file_id", "file_id": "file-xxx"}
- Completions: {"type": "completions", "model": "...", "source": {...}, "sampling_params": {...}} to generate fresh model responses as part of the run (this is how the model under test and its sampling parameters are specified)
The data source will be validated against the schema specified in the evaluation config.
322
323
metadata: Set of up to 16 key-value pairs that can be attached to an object.
324
Keys have a maximum length of 64 characters.
325
Values have a maximum length of 512 characters.
326
327
name: The name of the run.
328
329
extra_headers: Additional HTTP headers.
330
extra_query: Additional query parameters.
331
extra_body: Additional JSON fields.
332
timeout: Request timeout in seconds.
333
334
Returns:
335
RunCreateResponse: Created run object. Use retrieve() to check status and results.
336
"""
337
338
def retrieve(
339
self,
340
eval_id: str,
341
run_id: str,
342
*,
343
extra_headers: dict[str, str] | None = None,
344
extra_query: dict[str, object] | None = None,
345
extra_body: dict[str, object] | None = None,
346
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
347
) -> EvalRun:
348
"""
349
Retrieve a specific evaluation run.
350
351
Args:
352
eval_id: ID of the evaluation.
353
run_id: ID of the run.
354
355
extra_headers: Additional HTTP headers.
356
extra_query: Additional query parameters.
357
extra_body: Additional JSON fields.
358
timeout: Request timeout in seconds.
359
360
Returns:
361
EvalRun: Run object with status, results, and scores.
362
"""
363
364
def cancel(
365
self,
366
run_id: str,
367
*,
368
eval_id: str,
369
extra_headers: dict[str, str] | None = None,
370
extra_query: dict[str, object] | None = None,
371
extra_body: dict[str, object] | None = None,
372
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
373
) -> RunCancelResponse:
374
"""
375
Cancel an ongoing evaluation run.
376
377
Args:
378
run_id: ID of the run to cancel.
379
eval_id: ID of the evaluation.
380
381
extra_headers: Additional HTTP headers.
382
extra_query: Additional query parameters.
383
extra_body: Additional JSON fields.
384
timeout: Request timeout in seconds.
385
386
Returns:
387
RunCancelResponse: Confirmation of run cancellation.
388
"""
389
390
def delete(
391
self,
392
run_id: str,
393
*,
394
eval_id: str,
395
extra_headers: dict[str, str] | None = None,
396
extra_query: dict[str, object] | None = None,
397
extra_body: dict[str, object] | None = None,
398
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
399
) -> RunDeleteResponse:
400
"""
401
Permanently delete an evaluation run.
402
403
Args:
404
run_id: ID of the run to delete.
405
eval_id: ID of the evaluation.
406
407
extra_headers: Additional HTTP headers.
408
extra_query: Additional query parameters.
409
extra_body: Additional JSON fields.
410
timeout: Request timeout in seconds.
411
412
Returns:
413
RunDeleteResponse: Confirmation of run deletion.
414
"""
415
416
def list(
417
self,
418
eval_id: str,
419
*,
420
after: str | Omit = omit,
421
limit: int | Omit = omit,
422
order: Literal["asc", "desc"] | Omit = omit,
423
status: Literal["queued", "in_progress", "completed", "canceled", "failed"] | Omit = omit,
424
extra_headers: dict[str, str] | None = None,
425
extra_query: dict[str, object] | None = None,
426
extra_body: dict[str, object] | None = None,
427
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
428
) -> SyncCursorPage[RunListResponse]:
429
"""
430
List all runs for an evaluation.
431
432
Args:
433
eval_id: ID of the evaluation.
434
after: Cursor for pagination.
435
limit: Number of runs to retrieve.
436
order: Sort order ("asc" or "desc").
437
status: Filter by run status ("queued", "in_progress", "completed", "canceled", or "failed").
438
439
extra_headers: Additional HTTP headers.
440
extra_query: Additional query parameters.
441
extra_body: Additional JSON fields.
442
timeout: Request timeout in seconds.
443
444
Returns:
445
SyncCursorPage[RunListResponse]: Paginated list of runs.
446
"""
447
```
448
449
### Evaluation Run Output Items
450
451
Inspect individual output items from an evaluation run.
452
453
```python { .api }
454
# Access via client.evals.runs.output_items
455
456
def retrieve(
457
self,
458
output_item_id: str,
459
*,
460
eval_id: str,
461
run_id: str,
462
extra_headers: dict[str, str] | None = None,
463
extra_query: dict[str, object] | None = None,
464
extra_body: dict[str, object] | None = None,
465
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
466
) -> OutputItemRetrieveResponse:
467
"""
468
Get an evaluation run output item by ID.
469
470
Args:
471
output_item_id: ID of the output item to retrieve.
472
473
eval_id: ID of the evaluation.
474
475
run_id: ID of the evaluation run.
476
477
extra_headers: Additional HTTP headers.
478
extra_query: Additional query parameters.
479
extra_body: Additional JSON fields.
480
timeout: Request timeout in seconds.
481
482
Returns:
483
OutputItemRetrieveResponse: Individual output item with test results
484
and grader scores for a specific data point in the evaluation.
485
"""
486
487
def list(
488
self,
489
run_id: str,
490
*,
491
eval_id: str,
492
after: str | Omit = omit,
493
limit: int | Omit = omit,
494
order: Literal["asc", "desc"] | Omit = omit,
495
status: Literal["fail", "pass"] | Omit = omit,
496
extra_headers: dict[str, str] | None = None,
497
extra_query: dict[str, object] | None = None,
498
extra_body: dict[str, object] | None = None,
499
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
500
) -> SyncCursorPage[OutputItemListResponse]:
501
"""
502
Get a list of output items for an evaluation run.
503
504
Args:
505
run_id: ID of the evaluation run.
506
507
eval_id: ID of the evaluation.
508
509
after: Cursor for pagination. ID of last output item from previous request.
510
511
limit: Number of output items to retrieve.
512
513
order: Sort order by timestamp.
514
- "asc": Ascending (oldest first)
515
- "desc": Descending (newest first, default)
516
517
status: Filter output items by status.
518
- "fail": Only failed output items
519
- "pass": Only passed output items
520
Omit to retrieve all output items.
521
522
extra_headers: Additional HTTP headers.
523
extra_query: Additional query parameters.
524
extra_body: Additional JSON fields.
525
timeout: Request timeout in seconds.
526
527
Returns:
528
SyncCursorPage[OutputItemListResponse]: Paginated list of output items.
529
Each output item contains the model output and grader results for
530
a single test case in the evaluation run.
531
"""
532
```
533
534
Usage example:
535
536
```python
537
from openai import OpenAI
538
539
client = OpenAI()
540
541
# List all output items for a run
542
for output_item in client.evals.runs.output_items.list(
543
eval_id="eval-abc123",
544
run_id="run-def456"
545
):
546
print(f"Item {output_item.id}: {output_item.status}")
547
if output_item.status == "fail":
548
print(f" Failed on: {output_item.grader_results}")
549
550
# Filter only failed items
551
failed_items = client.evals.runs.output_items.list(
552
eval_id="eval-abc123",
553
run_id="run-def456",
554
status="fail"
555
)
556
557
for item in failed_items:
558
print(f"Failed item: {item.id}")
559
# Retrieve detailed information
560
detail = client.evals.runs.output_items.retrieve(
561
eval_id="eval-abc123",
562
run_id="run-def456",
563
output_item_id=item.id
564
)
565
print(f" Model output: {detail.model_output}")
566
print(f" Expected: {detail.expected}")
567
568
# Paginate through all output items
569
page = client.evals.runs.output_items.list(
570
eval_id="eval-abc123",
571
run_id="run-def456",
572
limit=10,
573
order="asc"
574
)
575
576
for item in page:
577
print(f"{item.id}: {item.status}")
578
```
579
580
Complete workflow example:
581
582
```python
from openai import OpenAI
import time

client = OpenAI()

# 1. Create evaluation
# (named `evaluation` to avoid shadowing the built-in `eval`)
evaluation = client.evals.create(
    name="Model Comparison",
    data_source_config={
        "type": "file",
        "file_id": "file-abc123"
    },
    testing_criteria=[
        {
            "type": "exact_match",
            "expected": "{{item.expected}}",
            "actual": "{{sample.output_text}}"
        }
    ]
)

# 2. Run evaluation with different models.
# The model under test and its sampling parameters are specified
# inside the run's data_source.
run_gpt4 = client.evals.runs.create(
    eval_id=evaluation.id,
    name="gpt-4 run",
    data_source={
        "type": "completions",
        "source": {"type": "file_id", "file_id": "file-abc123"},
        "model": "gpt-4",
        "sampling_params": {"temperature": 0.7}
    },
    metadata={"variant": "gpt-4"}
)

run_gpt35 = client.evals.runs.create(
    eval_id=evaluation.id,
    name="gpt-3.5 run",
    data_source={
        "type": "completions",
        "source": {"type": "file_id", "file_id": "file-abc123"},
        "model": "gpt-3.5-turbo",
        "sampling_params": {"temperature": 0.7}
    },
    metadata={"variant": "gpt-3.5"}
)

# 3. Wait for completion and check results
def wait_for_run(eval_id: str, run_id: str):
    while True:
        run = client.evals.runs.retrieve(eval_id=eval_id, run_id=run_id)
        if run.status in ["completed", "failed", "canceled"]:
            return run
        time.sleep(2)

gpt4_results = wait_for_run(evaluation.id, run_gpt4.id)
gpt35_results = wait_for_run(evaluation.id, run_gpt35.id)

print(f"GPT-4 Score: {gpt4_results.score}")
print(f"GPT-3.5 Score: {gpt35_results.score}")

# 4. Compare results
for run in client.evals.runs.list(eval_id=evaluation.id):
    print(f"{run.model}: {run.score} ({run.status})")

# 5. Update evaluation name
client.evals.update(
    eval_id=evaluation.id,
    name="Model Comparison - Updated",
    metadata={"status": "active"}
)

# 6. Clean up
client.evals.delete(eval_id=evaluation.id)
```
647
648
## Async Usage
649
650
```python
import asyncio
from openai import AsyncOpenAI

async def run_eval():
    client = AsyncOpenAI()

    # Create evaluation
    evaluation = await client.evals.create(
        name="Async Eval",
        data_source_config={"type": "file", "file_id": "file-abc"},
        testing_criteria=[{"type": "exact_match", "expected": "{{item.expected}}", "actual": "{{sample.output_text}}"}]
    )

    # Run evaluation (model configuration goes inside the data_source)
    run = await client.evals.runs.create(
        eval_id=evaluation.id,
        data_source={
            "type": "completions",
            "source": {"type": "file_id", "file_id": "file-abc"},
            "model": "gpt-4"
        }
    )

    # Poll until the run finishes, then return the score
    while True:
        result = await client.evals.runs.retrieve(
            eval_id=evaluation.id,
            run_id=run.id
        )
        if result.status in ["completed", "failed", "canceled"]:
            return result.score
        await asyncio.sleep(2)

score = asyncio.run(run_eval())
```
680
681
## Types
682
683
```python { .api }
684
from typing import Iterator, Literal
from pydantic import BaseModel
686
687
class Eval(BaseModel):
688
"""Evaluation object."""
689
id: str
690
created_at: int
691
updated_at: int
692
name: str | None
693
object: Literal["eval"]
694
data_source_config: dict
695
testing_criteria: list[dict]
696
metadata: dict[str, str] | None
697
698
class EvalRun(BaseModel):
699
"""Evaluation run object."""
700
id: str
701
eval_id: str
702
created_at: int
703
model: str
704
status: Literal["queued", "in_progress", "completed", "failed", "canceled"]
705
score: float | None # Present when status="completed"
706
results: list[dict] | None # Detailed results per test case
707
metadata: dict[str, str] | None
708
709
class EvalDeleteResponse(BaseModel):
710
"""Response from delete operation."""
711
id: str
712
deleted: bool
713
object: Literal["eval.deleted"]
714
715
class OutputItemRetrieveResponse(BaseModel):
716
"""Individual evaluation run output item."""
717
id: str
718
eval_id: str
719
run_id: str
720
status: Literal["fail", "pass"]
721
model_output: str | dict # Model's generated output
722
expected: str | dict | None # Expected output if defined
723
grader_results: list[dict] # Results from each grader
724
created_at: int
725
726
class OutputItemListResponse(BaseModel):
727
"""Evaluation run output item in list responses."""
728
id: str
729
eval_id: str
730
run_id: str
731
status: Literal["fail", "pass"]
732
created_at: int
733
734
class SyncCursorPage[T]:
735
"""Cursor-based pagination."""
736
data: list[T]
737
has_more: bool
738
def __iter__(self) -> Iterator[T]: ...
739
740
class Omit:
741
"""Sentinel value for omitted parameters."""
742
```
743
744
## Access Pattern
745
746
```python
747
# Synchronous
748
from openai import OpenAI
749
client = OpenAI()
750
client.evals.create(...)
751
client.evals.retrieve(...)
752
client.evals.update(...)
753
client.evals.list(...)
754
client.evals.delete(...)
755
client.evals.runs.create(...)
756
client.evals.runs.retrieve(...)
757
client.evals.runs.list(...)
758
client.evals.runs.output_items.retrieve(...)
759
client.evals.runs.output_items.list(...)
760
761
# Asynchronous
762
from openai import AsyncOpenAI
763
client = AsyncOpenAI()
764
await client.evals.create(...)
765
await client.evals.retrieve(...)
766
await client.evals.update(...)
767
await client.evals.list(...)
768
await client.evals.delete(...)
769
await client.evals.runs.create(...)
770
await client.evals.runs.retrieve(...)
771
await client.evals.runs.list(...)
772
await client.evals.runs.output_items.retrieve(...)
773
await client.evals.runs.output_items.list(...)
774
```
775