The official TypeScript library for the OpenAI API
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Comprehensive API reference for batch processing and evaluation management in the OpenAI Node.js library. Use batches to process multiple API requests asynchronously, and use evaluations to systematically test model performance against defined criteria.
Batches allow you to send multiple API requests in a single operation, processed asynchronously by OpenAI. This is ideal for high-volume, non-time-sensitive workloads where cost efficiency is important.
Evaluations provide a framework to systematically assess model outputs against defined testing criteria. Run evaluations on different models, configurations, and data sources to compare performance.
Submits a new batch job for processing. The batch is created from a JSONL file containing API requests.
create(params: BatchCreateParams): Promise<Batch>Parameters:
interface BatchCreateParams {
/**
* The time frame within which the batch should be processed.
* Currently only `24h` is supported.
*/
completion_window: '24h';
/**
* The endpoint to be used for all requests in the batch.
* Supported: `/v1/responses`, `/v1/chat/completions`, `/v1/embeddings`,
* `/v1/completions`, `/v1/moderations`.
* Note: `/v1/embeddings` batches limited to 50,000 embedding inputs.
*/
endpoint:
| '/v1/responses'
| '/v1/chat/completions'
| '/v1/embeddings'
| '/v1/completions'
| '/v1/moderations';
/**
* The ID of an uploaded file containing requests for the batch.
* Must be a JSONL file uploaded with purpose `batch`.
* Max 50,000 requests, 200 MB file size.
*/
input_file_id: string;
/**
* Optional metadata (16 key-value pairs max).
* Keys: max 64 chars; Values: max 512 chars.
*/
metadata?: Metadata | null;
/**
* Optional expiration policy for output/error files.
*/
output_expires_after?: {
/**
* Anchor timestamp: `created_at` (file creation time).
*/
anchor: 'created_at';
/**
* Seconds after anchor: 3600 (1 hour) to 2592000 (30 days).
*/
seconds: number;
};
}Example:
import { OpenAI } from 'openai';
const client = new OpenAI();
// 1. Create a JSONL file with batch requests
const batchRequests = [
{
custom_id: 'request-1',
method: 'POST',
url: '/v1/chat/completions',
body: {
model: 'gpt-4o',
messages: [{ role: 'user', content: 'Translate "hello" to French' }],
max_tokens: 100,
},
},
{
custom_id: 'request-2',
method: 'POST',
url: '/v1/chat/completions',
body: {
model: 'gpt-4o',
messages: [{ role: 'user', content: 'Translate "goodbye" to Spanish' }],
max_tokens: 100,
},
},
];
// 2. Upload the file
const file = await client.files.create({
  // Use a named File: the SDK's Uploadable type requires a filename,
  // which a bare Blob does not carry.
  file: new File([batchRequests.map(r => JSON.stringify(r)).join('\n')], 'batch.jsonl', {
    type: 'application/jsonl',
  }),
  purpose: 'batch',
});
// 3. Create the batch
const batch = await client.batches.create({
input_file_id: file.id,
endpoint: '/v1/chat/completions',
completion_window: '24h',
});
console.log(`Batch ${batch.id} submitted`);Retrieves details about a specific batch job.
retrieve(batchID: string): Promise<Batch>Example:
const batch = await client.batches.retrieve('batch_abc123');
console.log(`Batch status: ${batch.status}`);
console.log(`Completed: ${batch.request_counts.completed}`);
console.log(`Failed: ${batch.request_counts.failed}`);Retrieves a paginated list of batch jobs for your organization.
list(params?: BatchListParams): Promise<BatchesPage>Parameters:
interface BatchListParams extends CursorPageParams {
// Pagination parameters inherited from CursorPageParams
}Example:
// List all batches
for await (const batch of client.batches.list()) {
console.log(`${batch.id}: ${batch.status}`);
}
// List with pagination
const page = await client.batches.list();
if (page.hasNextPage()) {
const nextPage = await page.getNextPage();
}Cancels a batch that is in progress. The batch transitions to cancelling status for up to 10 minutes, then becomes cancelled with any partial results available.
cancel(batchID: string): Promise<Batch>Example:
const cancelled = await client.batches.cancel('batch_abc123');
console.log(`Batch status: ${cancelled.status}`); // 'cancelling'
interface Batch {
/**
* Unique batch identifier.
*/
id: string;
/**
* The completion window (currently always `24h`).
*/
completion_window: string;
/**
* Unix timestamp (seconds) when batch was created.
*/
created_at: number;
/**
* The API endpoint used by this batch.
*/
endpoint: string;
/**
* The ID of the input file containing requests.
*/
input_file_id: string;
/**
* Object type identifier (always `batch`).
*/
object: 'batch';
/**
* Current batch status: validating, failed, in_progress, finalizing,
* completed, expired, cancelling, or cancelled.
*/
status:
| 'validating'
| 'failed'
| 'in_progress'
| 'finalizing'
| 'completed'
| 'expired'
| 'cancelling'
| 'cancelled';
/**
* Unix timestamp (seconds) when batch was cancelled (if applicable).
*/
cancelled_at?: number;
/**
* Unix timestamp (seconds) when batch started cancelling.
*/
cancelling_at?: number;
/**
* Unix timestamp (seconds) when batch completed.
*/
completed_at?: number;
/**
* ID of file containing errors (if any).
*/
error_file_id?: string;
/**
* List of batch-level errors.
*/
errors?: {
data?: BatchError[];
object?: string;
};
/**
* Unix timestamp (seconds) when batch expired.
*/
expired_at?: number;
/**
* Unix timestamp (seconds) when batch will expire.
*/
expires_at?: number;
/**
* Unix timestamp (seconds) when batch failed.
*/
failed_at?: number;
/**
* Unix timestamp (seconds) when batch started finalizing.
*/
finalizing_at?: number;
/**
* Unix timestamp (seconds) when batch started processing.
*/
in_progress_at?: number;
/**
* Optional metadata (16 key-value pairs max).
*/
metadata?: Metadata | null;
/**
* Model ID used (e.g., `gpt-5-2025-08-07`).
*/
model?: string;
/**
* ID of file containing successful outputs.
*/
output_file_id?: string;
/**
* Request count statistics.
*/
request_counts?: BatchRequestCounts;
/**
* Token usage details (batches created after Sept 7, 2025).
*/
usage?: BatchUsage;
}interface BatchError {
/**
* Error code identifying the error type.
*/
code?: string;
/**
* Line number in input file where error occurred.
*/
line?: number | null;
/**
* Human-readable error message.
*/
message?: string;
/**
* Parameter name that caused the error.
*/
param?: string | null;
}interface BatchRequestCounts {
/**
* Number of requests completed successfully.
*/
completed: number;
/**
* Number of requests that failed.
*/
failed: number;
/**
* Total number of requests in the batch.
*/
total: number;
}interface BatchUsage {
/**
* Number of input tokens.
*/
input_tokens: number;
/**
* Detailed breakdown of input tokens.
*/
input_tokens_details: {
/**
* Tokens retrieved from cache.
*/
cached_tokens: number;
};
/**
* Number of output tokens.
*/
output_tokens: number;
/**
* Detailed breakdown of output tokens.
*/
output_tokens_details: {
/**
* Reasoning tokens used (for reasoning models).
*/
reasoning_tokens: number;
};
/**
* Total tokens used.
*/
total_tokens: number;
}Defines the structure of an evaluation with testing criteria and data source configuration. After creation, run the evaluation on different models and parameters.
create(params: EvalCreateParams): Promise<EvalCreateResponse>Parameters:
interface EvalCreateParams {
/**
* Data source configuration determining the schema of data used in runs.
* Can be custom, logs, or stored_completions.
*/
data_source_config:
| EvalCreateParams.Custom
| EvalCreateParams.Logs
| EvalCreateParams.StoredCompletions;
/**
* List of graders (testing criteria) for all eval runs.
* Can reference variables using {{item.variable_name}} or {{sample.output_text}}.
*/
testing_criteria: Array<
| EvalCreateParams.LabelModel
| StringCheckGrader
| EvalCreateParams.TextSimilarity
| EvalCreateParams.Python
| EvalCreateParams.ScoreModel
>;
/**
* Optional metadata (16 key-value pairs max).
*/
metadata?: Metadata | null;
/**
* Optional evaluation name.
*/
name?: string;
}
namespace EvalCreateParams {
interface Custom {
/**
* JSON schema for each row in the data source.
*/
item_schema: Record<string, unknown>;
/**
* Data source type (always `custom`).
*/
type: 'custom';
/**
* Whether eval expects you to populate sample namespace.
*/
include_sample_schema?: boolean;
}
interface Logs {
/**
* Data source type (always `logs`).
*/
type: 'logs';
/**
* Metadata filters for logs query.
*/
metadata?: Record<string, unknown>;
}
interface StoredCompletions {
/**
* Data source type (always `stored_completions`).
* @deprecated Use Logs instead.
*/
type: 'stored_completions';
/**
* Metadata filters for stored completions.
*/
metadata?: Record<string, unknown>;
}
interface LabelModel {
/**
* List of messages forming the prompt (may include {{item.variable}}).
*/
input: Array<{
content: string;
role: string;
}>;
/**
* Labels to classify each item.
*/
labels: string[];
/**
* Model to use (must support structured outputs).
*/
model: string;
/**
* Grader name.
*/
name: string;
/**
* Labels indicating a passing result.
*/
passing_labels: string[];
/**
* Type (always `label_model`).
*/
type: 'label_model';
}
interface TextSimilarity extends GraderModelsAPI.TextSimilarityGrader {
/**
* Threshold for passing score.
*/
pass_threshold: number;
}
interface Python extends GraderModelsAPI.PythonGrader {
/**
* Optional threshold for passing score.
*/
pass_threshold?: number;
}
interface ScoreModel extends GraderModelsAPI.ScoreModelGrader {
/**
* Optional threshold for passing score.
*/
pass_threshold?: number;
}
}Example:
// Create evaluation for customer support chatbot
const evalResponse = await client.evals.create({
name: 'Customer Support Quality',
data_source_config: {
type: 'custom',
item_schema: {
type: 'object',
properties: {
customer_question: { type: 'string' },
expected_keywords: { type: 'array', items: { type: 'string' } },
},
required: ['customer_question'],
},
include_sample_schema: true,
},
testing_criteria: [
{
  // string_check graders compare `input` against `reference` using an
  // operation ('eq' | 'ne' | 'like' | 'ilike') — there is no `pass_keywords` field.
  type: 'string_check',
  name: 'Contains Required Keywords',
  input: '{{sample.output_text}}',
  operation: 'ilike',
  reference: '{{item.expected_keywords}}',
},
{
type: 'label_model',
name: 'Tone Assessment',
model: 'gpt-4o',
labels: ['professional', 'friendly', 'hostile'],
passing_labels: ['professional', 'friendly'],
input: [
{
role: 'system',
content: 'Assess the tone of the response.',
},
{
role: 'user',
content: '{{sample.output_text}}',
},
],
},
],
});
console.log(`Created evaluation: ${evalResponse.id}`);Gets details about a specific evaluation.
retrieve(evalID: string): Promise<EvalRetrieveResponse>Example:
// Note: `eval` is a reserved word in strict mode / ES modules, so use another name.
const evaluation = await client.evals.retrieve('eval_abc123');
console.log(`Evaluation: ${evaluation.name}`);
console.log(`Testing criteria count: ${evaluation.testing_criteria.length}`);Updates evaluation properties (name, metadata).
update(evalID: string, params: EvalUpdateParams): Promise<EvalUpdateResponse>Parameters:
interface EvalUpdateParams {
/**
* Optional metadata (16 key-value pairs max).
*/
metadata?: Metadata | null;
/**
* Rename the evaluation.
*/
name?: string;
}Example:
const updated = await client.evals.update('eval_abc123', {
name: 'Customer Support Quality - v2',
metadata: { version: '2', status: 'production' },
});Lists evaluations for your project.
list(params?: EvalListParams): Promise<EvalListResponsesPage>Parameters:
interface EvalListParams extends CursorPageParams {
/**
* Sort order: `asc` or `desc` (default: `asc`).
*/
order?: 'asc' | 'desc';
/**
* Sort by: `created_at` or `updated_at` (default: `created_at`).
*/
order_by?: 'created_at' | 'updated_at';
}Example:
// List all evaluations sorted by creation date
for await (const evaluation of client.evals.list({ order_by: 'created_at', order: 'desc' })) {
  console.log(`${evaluation.name} (${evaluation.id})`);
}Deletes an evaluation and all associated runs.
delete(evalID: string): Promise<EvalDeleteResponse>Example:
const result = await client.evals.delete('eval_abc123');
console.log(`Deleted: ${result.deleted}`);interface EvalCreateResponse {
/**
* Unique evaluation identifier.
*/
id: string;
/**
* Unix timestamp (seconds) when evaluation was created.
*/
created_at: number;
/**
* Data source configuration for runs.
*/
data_source_config:
| EvalCustomDataSourceConfig
| EvalCreateResponse.Logs
| EvalStoredCompletionsDataSourceConfig;
/**
* Optional metadata.
*/
metadata: Metadata | null;
/**
* Evaluation name.
*/
name: string;
/**
* Object type (always `eval`).
*/
object: 'eval';
/**
* List of testing criteria (graders).
*/
testing_criteria: Array<
| LabelModelGrader
| StringCheckGrader
| EvalCreateResponse.EvalGraderTextSimilarity
| EvalCreateResponse.EvalGraderPython
| EvalCreateResponse.EvalGraderScoreModel
>;
}interface EvalCustomDataSourceConfig {
/**
* JSON schema for run data source items.
*/
schema: Record<string, unknown>;
/**
* Data source type (always `custom`).
*/
type: 'custom';
}interface EvalStoredCompletionsDataSourceConfig {
/**
* JSON schema for run data source items.
*/
schema: Record<string, unknown>;
/**
* Data source type (always `stored_completions`).
* @deprecated Use LogsDataSourceConfig instead.
*/
type: 'stored_completions';
/**
* Optional metadata.
*/
metadata?: Metadata | null;
}interface EvalDeleteResponse {
deleted: boolean;
eval_id: string;
object: string;
}Starts an evaluation run for a given evaluation. Validates data source against evaluation schema.
create(evalID: string, params: RunCreateParams): Promise<RunCreateResponse>Parameters:
interface RunCreateParams {
/**
* Run data source: JSONL, completions, or responses.
*/
data_source:
| CreateEvalJSONLRunDataSource
| CreateEvalCompletionsRunDataSource
| RunCreateParams.CreateEvalResponsesRunDataSource;
/**
* Optional metadata (16 key-value pairs max).
*/
metadata?: Metadata | null;
/**
* Optional run name.
*/
name?: string;
}Data Sources:
interface CreateEvalJSONLRunDataSource {
/**
* JSONL source (file content or file ID).
*/
source:
| { type: 'file_content'; content: Array<{ item: Record<string, unknown>; sample?: Record<string, unknown> }> }
| { type: 'file_id'; id: string };
/**
* Data source type (always `jsonl`).
*/
type: 'jsonl';
}
interface CreateEvalCompletionsRunDataSource {
/**
* Source configuration.
*/
source:
| { type: 'file_content'; content: Array<{ item: Record<string, unknown> }> }
| { type: 'file_id'; id: string }
| {
type: 'stored_completions';
created_after?: number | null;
created_before?: number | null;
limit?: number | null;
metadata?: Metadata | null;
model?: string | null;
};
/**
* Data source type (always `completions`).
*/
type: 'completions';
/**
* Input messages (template or item reference).
*/
input_messages?: { type: 'template'; template: unknown[] } | { type: 'item_reference'; item_reference: string };
/**
* Model to use for sampling.
*/
model?: string;
/**
* Sampling parameters (temperature, max_tokens, etc.).
*/
sampling_params?: Record<string, unknown>;
}Example:
// Create run with JSONL data source
const run = await client.evals.runs.create('eval_abc123', {
name: 'Production Test Run',
data_source: {
type: 'jsonl',
source: {
type: 'file_content',
content: [
{
item: {
customer_question: 'How do I reset my password?',
expected_keywords: ['password', 'reset', 'account'],
},
sample: {
output_text: 'To reset your password, go to the login page and click "Forgot Password".',
},
},
{
item: {
customer_question: 'What are your business hours?',
expected_keywords: ['hours', 'open', 'close'],
},
sample: {
output_text: 'We are open 9 AM to 5 PM EST, Monday through Friday.',
},
},
],
},
},
});
console.log(`Run ${run.id} started with status: ${run.status}`);Gets details about a specific evaluation run.
retrieve(runID: string, params: RunRetrieveParams): Promise<RunRetrieveResponse>Example:
const run = await client.evals.runs.retrieve('run_xyz789', {
eval_id: 'eval_abc123',
});
console.log(`Status: ${run.status}`);
console.log(`Passed: ${run.result_counts.passed}`);
console.log(`Failed: ${run.result_counts.failed}`);
console.log(`Report: ${run.report_url}`);Lists evaluation runs for a given evaluation.
list(evalID: string, params?: RunListParams): Promise<RunListResponsesPage>Parameters:
interface RunListParams extends CursorPageParams {
/**
* Sort order: `asc` or `desc` (default: `asc`).
*/
order?: 'asc' | 'desc';
/**
* Filter by status: queued, in_progress, completed, canceled, failed.
*/
status?: 'queued' | 'in_progress' | 'completed' | 'canceled' | 'failed';
}Example:
// List completed runs
const runs = await client.evals.runs.list('eval_abc123', {
status: 'completed',
order: 'desc',
});
for await (const run of runs) {
console.log(`${run.name}: ${run.status}`);
}Deletes an evaluation run.
delete(runID: string, params: RunDeleteParams): Promise<RunDeleteResponse>Example:
await client.evals.runs.delete('run_xyz789', { eval_id: 'eval_abc123' });Cancels an ongoing evaluation run.
cancel(runID: string, params: RunCancelParams): Promise<RunCancelResponse>Example:
const cancelled = await client.evals.runs.cancel('run_xyz789', {
eval_id: 'eval_abc123',
});
console.log(`Cancelled: ${cancelled.id}`);interface RunCreateResponse {
/**
* Unique run identifier.
*/
id: string;
/**
* Unix timestamp (seconds) when run was created.
*/
created_at: number;
/**
* Run data source configuration.
*/
data_source:
| CreateEvalJSONLRunDataSource
| CreateEvalCompletionsRunDataSource
| RunCreateResponse.Responses;
/**
* Error information (if applicable).
*/
error: EvalAPIError;
/**
* Associated evaluation ID.
*/
eval_id: string;
/**
* Optional metadata.
*/
metadata: Metadata | null;
/**
* Model being evaluated.
*/
model: string;
/**
* Run name.
*/
name: string;
/**
* Object type (always `eval.run`).
*/
object: 'eval.run';
/**
* Per-model token usage statistics.
*/
per_model_usage: Array<{
cached_tokens: number;
completion_tokens: number;
invocation_count: number;
model_name: string;
prompt_tokens: number;
total_tokens: number;
}>;
/**
* Results per testing criteria.
*/
per_testing_criteria_results: Array<{
failed: number;
passed: number;
testing_criteria: string;
}>;
/**
* URL to rendered report on dashboard.
*/
report_url: string;
/**
* Result counts summarizing outcomes.
*/
result_counts: {
errored: number;
failed: number;
passed: number;
total: number;
};
/**
* Run status.
*/
status: string;
}interface EvalAPIError {
/**
* Error code.
*/
code: string;
/**
* Error message.
*/
message: string;
}Gets a specific output item from an evaluation run.
retrieve(outputItemID: string, params: OutputItemRetrieveParams): Promise<OutputItemRetrieveResponse>Parameters:
interface OutputItemRetrieveParams {
/**
* The evaluation ID.
*/
eval_id: string;
/**
* The run ID.
*/
run_id: string;
}Example:
const item = await client.evals.runs.outputItems.retrieve('item_123', {
eval_id: 'eval_abc123',
run_id: 'run_xyz789',
});
console.log(`Item status: ${item.status}`);
console.log(`Results:`);
item.results.forEach(r => {
console.log(` ${r.name}: ${r.passed ? 'PASSED' : 'FAILED'} (${r.score})`);
});Lists output items for an evaluation run.
list(runID: string, params: OutputItemListParams): Promise<OutputItemListResponsesPage>Parameters:
interface OutputItemListParams extends CursorPageParams {
/**
* The evaluation ID.
*/
eval_id: string;
/**
* Sort order: `asc` or `desc` (default: `asc`).
*/
order?: 'asc' | 'desc';
/**
* Filter by status: `fail` or `pass`.
*/
status?: 'fail' | 'pass';
}Example:
// List failed output items
const failed = await client.evals.runs.outputItems.list('run_xyz789', {
eval_id: 'eval_abc123',
status: 'fail',
});
for await (const item of failed) {
console.log(`Failed item: ${item.id}`);
item.results.forEach(r => {
if (!r.passed) {
console.log(` ${r.name}: score ${r.score}`);
}
});
}interface OutputItemRetrieveResponse {
/**
* Unique output item identifier.
*/
id: string;
/**
* Unix timestamp (seconds) when created.
*/
created_at: number;
/**
* Input data source item details.
*/
datasource_item: Record<string, unknown>;
/**
* Data source item identifier.
*/
datasource_item_id: number;
/**
* Evaluation ID.
*/
eval_id: string;
/**
* Object type (always `eval.run.output_item`).
*/
object: 'eval.run.output_item';
/**
* List of grader results.
*/
results: Array<{
/**
* Grader name.
*/
name: string;
/**
* Whether grader passed.
*/
passed: boolean;
/**
* Numeric score from grader.
*/
score: number;
/**
* Optional sample data from grader.
*/
sample?: Record<string, unknown> | null;
/**
* Grader type identifier.
*/
type?: string;
[k: string]: unknown;
}>;
/**
* Associated run ID.
*/
run_id: string;
/**
* Sample with input and output.
*/
sample: {
/**
* Error information (if applicable).
*/
error: EvalAPIError;
/**
* Finish reason (e.g., "stop", "max_tokens").
*/
finish_reason: string;
/**
* Input messages.
*/
input: Array<{
content: string;
role: string;
}>;
/**
* Maximum tokens for completion.
*/
max_completion_tokens: number;
/**
* Model used.
*/
model: string;
/**
* Output messages.
*/
output: Array<{
content?: string;
role?: string;
}>;
/**
* Seed used.
*/
seed: number;
/**
* Temperature used.
*/
temperature: number;
/**
* Top-p (nucleus sampling) value.
*/
top_p: number;
/**
* Token usage.
*/
usage: {
cached_tokens: number;
completion_tokens: number;
prompt_tokens: number;
total_tokens: number;
};
};
/**
* Status of the output item.
*/
status: string;
}import { OpenAI } from 'openai';
const client = new OpenAI();
/**
 * End-to-end batch workflow: upload a JSONL file of requests, create a
 * batch, poll until it reaches a terminal state, then fetch results/errors.
 * Uses the module-level `client` (OpenAI SDK instance).
 */
async function processBatch() {
  // 1. Prepare batch requests — one JSON object per line in the JSONL file.
  const requests = [
    {
      custom_id: 'translation-1',
      method: 'POST',
      url: '/v1/chat/completions',
      body: {
        model: 'gpt-4o',
        messages: [{ role: 'user', content: 'Translate "hello" to French' }],
      },
    },
    {
      custom_id: 'summary-1',
      method: 'POST',
      url: '/v1/chat/completions',
      body: {
        model: 'gpt-4o',
        messages: [{ role: 'user', content: 'Summarize: OpenAI creates AI models.' }],
      },
    },
  ];
  // 2. Upload the requests file. A named File is used instead of a bare
  // Blob: the SDK's Uploadable type requires a filename for the multipart
  // upload.
  const file = await client.files.create({
    file: new File([requests.map(r => JSON.stringify(r)).join('\n')], 'batch.jsonl'),
    purpose: 'batch',
  });
  // 3. Create the batch.
  const batch = await client.batches.create({
    input_file_id: file.id,
    endpoint: '/v1/chat/completions',
    completion_window: '24h',
  });
  console.log(`Batch ${batch.id} created with status: ${batch.status}`);
  // 4. Poll until the batch reaches a terminal state (or use webhooks).
  // 'cancelled' must be included — omitting it loops forever if the batch
  // is cancelled from elsewhere while we poll.
  const terminalStatuses = ['completed', 'failed', 'expired', 'cancelled'];
  let completed = batch;
  while (!terminalStatuses.includes(completed.status)) {
    await new Promise(resolve => setTimeout(resolve, 30000)); // wait 30s between polls
    completed = await client.batches.retrieve(batch.id);
    console.log(`Status: ${completed.status}`);
  }
  // 5. Retrieve successful results. files.content() returns a fetch
  // Response, so read the body as text (JSONL, one response per line)
  // rather than logging the Response object itself.
  if (completed.output_file_id) {
    const results = await client.files.content(completed.output_file_id);
    console.log('Results:', await results.text());
  }
  // 6. Check for per-request errors.
  if (completed.error_file_id) {
    const errors = await client.files.content(completed.error_file_id);
    console.log('Errors:', await errors.text());
  }
}
processBatch().catch(console.error);
import { OpenAI } from 'openai';
const client = new OpenAI();
/**
 * End-to-end evaluation workflow: define an eval with graders, start a run
 * with inline JSONL test data, poll until the run finishes, then inspect
 * failed output items. Uses the module-level `client` (OpenAI SDK instance).
 */
async function runEvaluation() {
  // 1. Define the evaluation: data schema plus graders (testing criteria).
  // `eval` is a reserved word in strict mode / ES modules, so the result
  // is bound to `evaluation` instead.
  const evaluation = await client.evals.create({
    name: 'Support Response Quality',
    data_source_config: {
      type: 'custom',
      item_schema: {
        type: 'object',
        properties: {
          question: { type: 'string' },
          expected_answer: { type: 'string' },
        },
      },
      include_sample_schema: true,
    },
    testing_criteria: [
      {
        // string_check graders compare `input` against `reference` with an
        // operation ('eq' | 'ne' | 'like' | 'ilike'); there is no
        // `pass_keywords` field.
        type: 'string_check',
        name: 'Contains Key Term',
        input: '{{sample.output_text}}',
        operation: 'ilike',
        reference: 'resolved',
      },
      {
        type: 'label_model',
        name: 'Tone Check',
        model: 'gpt-4o',
        labels: ['professional', 'casual', 'rude'],
        passing_labels: ['professional', 'casual'],
        input: [
          {
            role: 'system',
            content: 'Rate the tone of this response.',
          },
          {
            role: 'user',
            content: '{{sample.output_text}}',
          },
        ],
      },
    ],
  });
  console.log(`Created evaluation ${evaluation.id}`);
  // 2. Start a run with inline JSONL test data.
  const run = await client.evals.runs.create(evaluation.id, {
    name: 'First Run',
    data_source: {
      type: 'jsonl',
      source: {
        type: 'file_content',
        content: [
          {
            item: {
              question: 'How do I upgrade my account?',
              expected_answer: 'Go to settings',
            },
            sample: {
              output_text: 'Your issue has been resolved. Go to settings to upgrade.',
            },
          },
        ],
      },
    },
  });
  console.log(`Run ${run.id} started`);
  // 3. Poll until the run reaches a terminal state. 'canceled' must be
  // included — omitting it polls forever if the run is cancelled elsewhere.
  const terminalStatuses = ['completed', 'failed', 'canceled'];
  for (;;) {
    const updated = await client.evals.runs.retrieve(run.id, { eval_id: evaluation.id });
    if (terminalStatuses.includes(updated.status)) {
      console.log(`Run completed with status: ${updated.status}`);
      console.log(`Results: ${updated.result_counts.passed} passed, ${updated.result_counts.failed} failed`);
      console.log(`Report: ${updated.report_url}`);
      break;
    }
    await new Promise(resolve => setTimeout(resolve, 5000)); // wait 5s between polls
  }
  // 4. Inspect the items that failed grading.
  for await (const item of client.evals.runs.outputItems.list(run.id, { eval_id: evaluation.id, status: 'fail' })) {
    console.log(`Failed item ${item.id}:`);
    item.results.forEach(r => {
      console.log(`  ${r.name}: score ${r.score}`);
    });
  }
}
runEvaluation().catch(console.error);
{"item": {"question": "What is 2+2?", "expected": "4"}, "sample": {"output": "2+2 equals 4"}}
{"item": {"question": "What is the capital of France?", "expected": "Paris"}, "sample": {"output": "The capital of France is Paris"}}
const run = await client.evals.runs.create(evaluation.id, {
name: 'Test with Stored Completions',
data_source: {
type: 'completions',
source: {
type: 'stored_completions',
created_after: 1700000000, // Unix timestamp
model: 'gpt-4o',
metadata: { usecase: 'support', version: 'v2' },
},
model: 'gpt-4o-mini',
input_messages: {
type: 'template',
template: [
{ role: 'system', content: 'You are a helpful assistant.' },
{ role: 'user', content: '{{item.prompt}}' },
],
},
},
});
Use {{item.field}} to reference dataset inputs and {{sample.output}} for model outputs in grader templates.
Use report_url for rendered report visualizations on the dashboard.
Install with Tessl CLI
npx tessl i tessl/npm-openai