A TypeScript client for the Phoenix API providing AI observability, prompt management, datasets, experiments, and tracing capabilities.
Comprehensive experiment execution system with evaluation capabilities, progress tracking, and OpenTelemetry instrumentation for AI model testing and systematic evaluation workflows.
Run experiments on datasets with custom tasks and evaluators, including automatic instrumentation and progress tracking.
/**
 * Run an experiment on a dataset with evaluation
 *
 * Executes `task` once per dataset example (times `repetitions`), then applies
 * each evaluator to the task outputs. Resolves to the experiment together with
 * all runs and evaluation runs.
 * @param params - Experiment execution parameters
 * @returns Promise resolving to experiment results
 */
function runExperiment(params: {
  /** Optional Phoenix client; presumably a default client is used when omitted — confirm */
  client?: PhoenixClient;
  experimentName?: string;
  experimentDescription?: string;
  /** Arbitrary experiment metadata. NOTE: the key is `experimentMetadata`, not `metadata`. */
  experimentMetadata?: Record<string, unknown>;
  /** Dataset to run against (selected by id or name in the examples below). */
  dataset: DatasetSelector;
  /** Task executed for each dataset example. */
  task: ExperimentTask;
  /** Evaluators applied to each task output; omit to skip evaluation. */
  evaluators?: Evaluator[];
  /** Custom logger for progress output. */
  logger?: Logger;
  /** Presumably toggles trace recording — TODO confirm exact semantics. */
  record?: boolean;
  /** Max examples processed in parallel (default 1 per the examples below). */
  concurrency?: number;
  /** Dry-run mode; a number presumably limits how many examples are tried — confirm. */
  dryRun?: number | boolean;
  /** Whether to install the tracer provider globally. */
  setGlobalTracerProvider?: boolean;
  /** Times each example is run (default 1 per the examples below). */
  repetitions?: number;
  /** Use a batch span processor for exported spans. */
  useBatchSpanProcessor?: boolean;
}): Promise<RanExperiment>;
/**
 * A task executed once per dataset example. Receives the example and returns
 * the task output as a plain record (e.g. the model's answer).
 */
interface ExperimentTask {
  (example: Example): Promise<Record<string, unknown>>;
}
/** Scores a single task output against its dataset example. */
interface Evaluator {
  /** Evaluation name reported on each result (e.g. "accuracy"). */
  name: string;
  /** Annotator kind, e.g. "HEURISTIC" — see AnnotatorKind for the full set. */
  kind: AnnotatorKind;
  /** Produce an EvaluationResult for one (example, task output) pair. */
  evaluate: (
    example: Example,
    output: Record<string, unknown>
  ) => Promise<EvaluationResult>;
}
/**
 * Helper function to create an evaluator with proper typing
 *
 * Thin constructor: builds an Evaluator from the given name, annotator kind,
 * and evaluate function.
 * @param params - Evaluator configuration
 * @returns Evaluator instance
 */
function asEvaluator(params: {
  name: string;
  kind: AnnotatorKind;
  evaluate: Evaluator["evaluate"];
}): Evaluator;
/** Outcome of a single evaluator invocation. */
interface EvaluationResult {
  /** Evaluation name; by convention matches the evaluator's `name`. */
  name: string;
  /** Numeric score when applicable (e.g. 0/1 for binary checks). Note: optional number — not nullable. */
  score?: number;
  /** Categorical label, e.g. "correct" / "incorrect". */
  label?: string;
  /** Human-readable rationale for the score/label. */
  explanation?: string;
  /** Arbitrary extra data attached to the result. */
  metadata?: Record<string, unknown>;
}
/** An experiment together with its executed runs and (optionally) evaluation runs. */
interface RanExperiment extends ExperimentInfo {
  /** Task runs — a Record (presumably keyed by run id — confirm), not an array. */
  runs: Record<string, ExperimentRun>;
  /** Present when evaluators were supplied. */
  evaluationRuns?: ExperimentEvaluationRun[];
}
/** Lightweight experiment metadata (no runs or evaluations). */
interface ExperimentInfo {
  id: string;
  datasetId: string;
  /** Version of the dataset the experiment ran against. */
  datasetVersionId: string;
  /** Phoenix project name — presumably where the experiment's traces live; confirm. */
  projectName: string;
  metadata: Record<string, unknown>;
}
/** A single execution of the task against one dataset example. */
interface ExperimentRun {
  id: string;
  startTime: Date;
  endTime: Date;
  /** Id of the parent experiment. */
  experimentId: string;
  /** Id of the dataset example this run executed. */
  datasetExampleId: string;
  /** Task output; absent when the task produced none. */
  output?: string | boolean | number | object | null;
  /** Error message when the task threw; null on success. */
  error: string | null;
  /** Repetition index for this example — base (0 or 1) not shown here; confirm. */
  repetition: number;
}
/** The result of one evaluator applied to one experiment run. */
interface ExperimentEvaluationRun {
  id: string;
  /** Id of the ExperimentRun that was evaluated. */
  runId: string;
  evaluatorName: string;
  result: EvaluationResult;
  startTime: Date;
  endTime: Date;
}

Usage Example:
import { runExperiment, asEvaluator } from "@arizeai/phoenix-client/experiments";
// Task: run the model for one dataset example and return its output record.
const myTask: ExperimentTask = async (example) => {
  const question = example.input.question;
  // Call your AI model/API
  const response = await callMyModel(question);
  const { answer, confidence } = response;
  return { answer, confidence };
};
// Build an evaluator with the asEvaluator helper for proper typing.
const accuracyEvaluator = asEvaluator({
  name: "accuracy",
  kind: "HEURISTIC",
  evaluate: async (example, output) => {
    // Compare the task's answer against the example's reference answer.
    const correct = example.output?.answer === output.answer;
    return {
      name: "accuracy",
      score: correct ? 1 : 0,
      label: correct ? "correct" : "incorrect"
    };
  }
});
// Run the experiment
const results = await runExperiment({
  dataset: { datasetName: "qa-eval-set" },
  task: myTask,
  evaluators: [accuracyEvaluator],
  // NOTE: the parameter is `experimentMetadata`, not `metadata` — see runExperiment's signature.
  experimentMetadata: {
    model: "gpt-4o",
    temperature: 0.3,
    experiment_type: "accuracy_test"
  },
  concurrency: 5,
  repetitions: 1
});
// RanExperiment exposes `id` (not `experimentId`), `runs` as a Record (not an
// array), and `evaluationRuns` (not `evaluations`).
console.log(`Experiment ${results.id} completed`);
console.log(`Processed ${Object.keys(results.runs).length} examples`);
console.log(`Generated ${results.evaluationRuns?.length ?? 0} evaluations`);

Get experiment metadata without full run details for lightweight operations.
/**
 * Get experiment metadata
 *
 * Lightweight alternative to getExperiment: resolves only the experiment's
 * info record, without runs or evaluations.
 * @param params - Experiment info parameters
 * @param params.client - Optional Phoenix client; presumably a default is used when omitted — confirm
 * @param params.experimentId - Id of the experiment to look up
 * @returns Promise resolving to experiment information
 */
function getExperimentInfo(params: {
  client?: PhoenixClient;
  experimentId: string;
}): Promise<ExperimentInfo>;
/** Lightweight experiment metadata (no runs or evaluations). */
interface ExperimentInfo {
  id: string;
  datasetId: string;
  /** Version of the dataset the experiment ran against. */
  datasetVersionId: string;
  /** Phoenix project name — presumably where the experiment's traces live; confirm. */
  projectName: string;
  metadata: Record<string, unknown>;
}

Retrieve complete experiment data including all runs and evaluations.
/**
 * Get complete experiment data
 *
 * Resolves the experiment's info plus all of its runs and evaluation runs.
 * @param params - Experiment retrieval parameters
 * @param params.experimentId - Id of the experiment to fetch
 * @returns Promise resolving to full experiment data
 */
function getExperiment(params: {
  client?: PhoenixClient;
  experimentId: string;
}): Promise<RanExperiment>;
/** Full experiment data: info plus runs and (optionally) evaluation runs. */
interface RanExperiment extends ExperimentInfo {
  /** Task runs — a Record, not an array. */
  runs: Record<string, ExperimentRun>;
  evaluationRuns?: ExperimentEvaluationRun[];
}

Get experiment runs with optional filtering and pagination.
/**
 * Get experiment runs with filtering
 *
 * NOTE(review): the declared parameters accept only `experimentId` (plus an
 * optional client); the filtering/pagination options shown elsewhere in this
 * document (`limit`, `offset`, `status`) are not part of this signature —
 * confirm against the package's actual API.
 * @param params - Experiment runs parameters
 * @returns Promise resolving to experiment runs
 */
function getExperimentRuns(params: {
  client?: PhoenixClient;
  experimentId: string;
}): Promise<{ runs: ExperimentRun[] }>
type ExperimentRunID = string;

Usage Examples:
import { getExperimentInfo, getExperiment, getExperimentRuns } from "@arizeai/phoenix-client/experiments";
// Get basic experiment info
const info = await getExperimentInfo({
  experimentId: "exp_123"
});
// Get complete experiment with runs
const experiment = await getExperiment({
  experimentId: "exp_123"
});
// Get all runs for an experiment.
// NOTE: getExperimentRuns accepts only `experimentId` (plus an optional
// client) — `limit`, `offset`, and `status` are not in the declared signature.
const runs = await getExperimentRuns({
  experimentId: "exp_123"
});

Advanced configuration options for experiment execution behavior.
Concurrency Control:
// Run up to 10 examples in parallel
await runExperiment({
  dataset: { datasetId: "dataset_123" },
  task: myTask,
  concurrency: 10 // Default: 1
});

Repetitions:
// Run each example 3 times for reliability testing
await runExperiment({
  dataset: { datasetId: "dataset_123" },
  task: myTask,
  repetitions: 3 // Default: 1
});

Custom Logging:
import { Logger } from "@arizeai/phoenix-client/types/logger";
// A minimal Logger implementation that tags each message with its level.
const customLogger: Logger = {
  info: (message: string) => console.log(`[INFO] ${message}`),
  error: (message: string) => console.error(`[ERROR] ${message}`),
  warn: (message: string) => console.warn(`[WARN] ${message}`)
};
// Progress messages are routed through the supplied logger.
await runExperiment({
  dataset: { datasetId: "dataset_123" },
  task: myTask,
  logger: customLogger
});

Experiments automatically generate OpenTelemetry traces for observability and debugging.
Automatic Instrumentation:
By default, experiments are instrumented with OpenTelemetry — runExperiment records task and evaluator execution as traces (see the `record`, `setGlobalTracerProvider`, and `useBatchSpanProcessor` options) with no manual setup required.

Custom Instrumentation Provider:
import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
const provider = new NodeTracerProvider({
  // Custom tracer configuration
});
// runExperiment has no provider parameter (the original `instructionProvider`
// does not exist in its signature). Install the custom provider globally and
// tell runExperiment not to replace it.
provider.register();
await runExperiment({
  dataset: { datasetId: "dataset_123" },
  task: myTask,
  setGlobalTracerProvider: false // keep the custom provider installed
});

Common patterns for implementing evaluators for different use cases.
Binary Classification:
const binaryEvaluator: Evaluator = {
  name: "binary_accuracy",
  // `kind` is required by the Evaluator interface.
  kind: "HEURISTIC",
  evaluate: async (example, output) => {
    const expected = example.output?.label;
    const predicted = output.prediction;
    return {
      name: "binary_accuracy",
      score: expected === predicted ? 1 : 0,
      label: expected === predicted ? "correct" : "incorrect",
      explanation: `Expected: ${expected}, Got: ${predicted}`
    };
  }
};

Similarity-Based Evaluation:
const similarityEvaluator: Evaluator = {
  name: "semantic_similarity",
  // `kind` is required by the Evaluator interface.
  kind: "HEURISTIC",
  evaluate: async (example, output) => {
    const expected = example.output?.text;
    const generated = output.text;
    // Use your similarity calculation
    const similarity = await calculateSimilarity(expected, generated);
    return {
      name: "semantic_similarity",
      score: similarity,
      explanation: `Similarity score between expected and generated text`
    };
  }
};

LLM-as-Judge:
const llmJudgeEvaluator: Evaluator = {
  name: "llm_judge",
  // `kind` is required by the Evaluator interface; "LLM" suits a judge model —
  // confirm this value exists in AnnotatorKind for your package version.
  kind: "LLM",
  evaluate: async (example, output) => {
    const prompt = `Rate the quality of this response on a scale of 1-5:
Question: ${example.input.question}
Response: ${output.answer}
Provide a numeric score and brief explanation.`;
    const judgeResponse = await callJudgeModel(prompt);
    return {
      name: "llm_judge",
      score: judgeResponse.score,
      explanation: judgeResponse.explanation
    };
  }
};

Experiments include robust error handling with detailed error reporting.
Task Error Handling:
const robustTask: ExperimentTask = async (example) => {
  try {
    const result = await callAPI(example.input);
    return result;
  } catch (error) {
    // Under strict TS the catch variable is `unknown` — narrow before
    // reading `.message`.
    const message = error instanceof Error ? error.message : String(error);
    // Errors are automatically captured in experiment runs
    throw new Error(`Task failed: ${message}`);
  }
};

Evaluator Error Handling:
const safeEvaluator: Evaluator = {
  name: "safe_evaluator",
  // `kind` is required by the Evaluator interface.
  kind: "HEURISTIC",
  evaluate: async (example, output) => {
    try {
      const score = await computeScore(example, output);
      return { name: "safe_evaluator", score };
    } catch (error) {
      // Narrow the `unknown` catch variable before reading `.message`.
      const message = error instanceof Error ? error.message : String(error);
      // EvaluationResult.score is `number | undefined` — omit it on failure
      // (null is not assignable).
      return {
        name: "safe_evaluator",
        explanation: `Evaluation failed: ${message}`
      };
    }
  }
};

Install with Tessl CLI
npx tessl i tessl/npm-arizeai--phoenix-client