OpenAI-compatible model factory for the Embabel Agent Framework.
Real-world scenarios and patterns for using the OpenAI-compatible model factory.
Configure for Azure OpenAI Service with deployment-specific endpoints:
val azureFactory = OpenAiCompatibleModelFactory(
baseUrl = "https://your-resource.openai.azure.com",
apiKey = System.getenv("AZURE_OPENAI_API_KEY"),
completionsPath = "/openai/deployments/your-deployment/chat/completions?api-version=2024-02-15-preview",
embeddingsPath = "/openai/deployments/your-embedding-deployment/embeddings?api-version=2024-02-15-preview",
observationRegistry = ObservationRegistry.create()
)
val azureGpt4 = azureFactory.openAiCompatibleLlm(
model = "gpt-4",
pricingModel = PricingModel.usdPer1MTokens(30.0, 60.0),
provider = "Azure OpenAI",
knowledgeCutoffDate = LocalDate.of(2023, 4, 1)
)
val azureEmbeddings = azureFactory.openAiCompatibleEmbeddingService(
model = "text-embedding-3-small",
provider = "Azure OpenAI"
)

Key points:
- Replace your-resource with your Azure OpenAI resource name.
- Replace your-deployment with your GPT-4 deployment name.
- Replace your-embedding-deployment with your embedding deployment name.

Use cloud models for production and local models for development:
// OpenAI for production
val openAiFactory = OpenAiCompatibleModelFactory(
baseUrl = null,
apiKey = System.getenv("OPENAI_API_KEY"),
completionsPath = null,
embeddingsPath = null,
observationRegistry = observationRegistry
)
// Local Ollama for development
val localFactory = OpenAiCompatibleModelFactory(
baseUrl = "http://localhost:11434",
apiKey = null,
completionsPath = "/api/chat",
embeddingsPath = "/api/embeddings",
observationRegistry = observationRegistry
)
// Production service
val productionService = openAiFactory.openAiCompatibleLlm(
model = "gpt-4",
pricingModel = PricingModel.usdPer1MTokens(30.0, 60.0),
provider = "OpenAI",
knowledgeCutoffDate = LocalDate.of(2023, 4, 1)
)
// Development service
val devService = localFactory.openAiCompatibleLlm(
model = "llama3:70b",
pricingModel = PricingModel.ALL_YOU_CAN_EAT,
provider = "Ollama",
knowledgeCutoffDate = null
)
// Choose based on environment
val service = if (System.getenv("ENVIRONMENT") == "production")
productionService else devService

Configure different models optimized for different use cases:
val factory = OpenAiCompatibleModelFactory(
baseUrl = null,
apiKey = System.getenv("OPENAI_API_KEY"),
completionsPath = null,
embeddingsPath = null,
observationRegistry = ObservationRegistry.create()
)
// Fast, cheap model for simple classification
val classifier = factory.openAiCompatibleLlm(
model = "gpt-3.5-turbo",
pricingModel = PricingModel.usdPer1MTokens(0.5, 1.5),
provider = "OpenAI",
knowledgeCutoffDate = LocalDate.of(2021, 9, 1)
)
// Powerful model for complex reasoning
val reasoner = factory.openAiCompatibleLlm(
model = "gpt-4",
pricingModel = PricingModel.usdPer1MTokens(30.0, 60.0),
provider = "OpenAI",
knowledgeCutoffDate = LocalDate.of(2023, 4, 1)
)
// Latest model for cutting-edge features
val latest = factory.openAiCompatibleLlm(
model = "gpt-5-turbo",
pricingModel = PricingModel.usdPer1MTokens(10.0, 30.0),
provider = "OpenAI",
knowledgeCutoffDate = LocalDate.of(2024, 10, 1),
optionsConverter = Gpt5ChatOptionsConverter
)
// Embedding model for semantic search
val embeddings = factory.openAiCompatibleEmbeddingService(
model = "text-embedding-3-large",
provider = "OpenAI"
)
// Usage
fun processTask(complexity: TaskComplexity) {
val service = when (complexity) {
TaskComplexity.SIMPLE -> classifier
TaskComplexity.COMPLEX -> reasoner
TaskComplexity.CUTTING_EDGE -> latest
}
// Use service...
}

Configure for LM Studio local server:
val lmStudioFactory = OpenAiCompatibleModelFactory(
baseUrl = "http://localhost:1234",
apiKey = null, // LM Studio doesn't require auth
completionsPath = "/v1/chat/completions",
embeddingsPath = null, // LM Studio may not support embeddings
observationRegistry = ObservationRegistry.create()
)
val localModel = lmStudioFactory.openAiCompatibleLlm(
model = "local-model", // Model name from LM Studio
pricingModel = PricingModel.ALL_YOU_CAN_EAT,
provider = "LM Studio",
knowledgeCutoffDate = null
)

Configure for vLLM deployment:
val vllmFactory = OpenAiCompatibleModelFactory(
baseUrl = "http://vllm-server:8000",
apiKey = null,
completionsPath = "/v1/chat/completions",
embeddingsPath = null,
observationRegistry = ObservationRegistry.create()
)
val vllmService = vllmFactory.openAiCompatibleLlm(
model = "meta-llama/Llama-3-70b-chat-hf",
pricingModel = PricingModel.ALL_YOU_CAN_EAT,
provider = "vLLM",
knowledgeCutoffDate = null
)

Wrap services with rate limiting to avoid API quotas:
/**
 * Decorator that throttles sender creation on [underlying] to at most
 * [maxRequestsPerMinute] requests per minute.
 *
 * NOTE(review): only createMessageSender() is throttled; calls made later on
 * the returned LlmMessageSender are not rate limited here — confirm that
 * "one sender per request" is the intended usage pattern.
 */
class RateLimitedLlmService(
    private val underlying: LlmService<*>,
    private val maxRequestsPerMinute: Int
) {
    // Token-bucket limiter (presumably Guava's RateLimiter — confirm);
    // permits per second = requests per minute / 60.
    private val rateLimiter = RateLimiter.create(maxRequestsPerMinute / 60.0)
    // Blocks the calling thread until a permit is available, then delegates.
    fun createMessageSender(options: LlmOptions): LlmMessageSender {
        rateLimiter.acquire()
        return underlying.createMessageSender(options)
    }
}
// Usage
val factory = OpenAiCompatibleModelFactory(...)
val unlimitedService = factory.openAiCompatibleLlm(
model = "gpt-4",
pricingModel = PricingModel.usdPer1MTokens(30.0, 60.0),
provider = "OpenAI",
knowledgeCutoffDate = LocalDate.of(2023, 4, 1)
)
val rateLimitedService = RateLimitedLlmService(
underlying = unlimitedService,
maxRequestsPerMinute = 60
)

Select model based on cost constraints:
/**
 * Selects the most capable model whose estimated request cost fits within
 * [maxCostPerRequest] (USD).
 */
class CostAwareModelSelector(
    private val factory: OpenAiCompatibleModelFactory,
    private val maxCostPerRequest: Double
) {
    private val cheapModel = factory.openAiCompatibleLlm(
        model = "gpt-3.5-turbo",
        pricingModel = PricingModel.usdPer1MTokens(0.5, 1.5),
        provider = "OpenAI",
        knowledgeCutoffDate = LocalDate.of(2021, 9, 1)
    )

    private val expensiveModel = factory.openAiCompatibleLlm(
        model = "gpt-4",
        pricingModel = PricingModel.usdPer1MTokens(30.0, 60.0),
        provider = "OpenAI",
        knowledgeCutoffDate = LocalDate.of(2023, 4, 1)
    )

    /**
     * Returns the expensive model when affordable, otherwise the cheap model,
     * otherwise throws.
     *
     * @param estimatedTokens rough token count for the request (input + output)
     * @throws IllegalArgumentException if even the cheap model exceeds the budget,
     *         or if [estimatedTokens] is negative
     */
    fun selectModel(estimatedTokens: Int): LlmService<*> {
        require(estimatedTokens >= 0) { "estimatedTokens must be non-negative: $estimatedTokens" }
        val tokensInMillions = estimatedTokens / 1_000_000.0
        // Average of input ($0.5) and output ($1.5) price per 1M tokens.
        // Fixed: was 1.5, but (0.5 + 1.5) / 2 = 1.0 — the expensive model
        // below consistently uses the average ((30 + 60) / 2 = 45).
        val cheapCost = tokensInMillions * 1.0
        // Average of input ($30) and output ($60) price per 1M tokens.
        val expensiveCost = tokensInMillions * 45.0
        return when {
            expensiveCost <= maxCostPerRequest -> expensiveModel
            cheapCost <= maxCostPerRequest -> cheapModel
            else -> throw IllegalArgumentException("Request exceeds cost limit")
        }
    }
}

Implement fallback to alternative models on failure:
/**
 * Wraps a primary LLM service and falls back to an alternative when the
 * primary fails.
 *
 * NOTE(review): the fallback triggers only when *creating* the message sender
 * throws; failures during a later send() on the returned sender are not
 * caught here — confirm whether send-time fallback is also desired.
 */
class FallbackLlmService(
    private val primary: LlmService<*>,
    private val fallback: LlmService<*>
) {
    fun createMessageSender(options: LlmOptions): LlmMessageSender {
        return try {
            primary.createMessageSender(options)
        } catch (e: Exception) {
            // `logger` must be supplied by the enclosing scope — presumably a
            // class- or file-level SLF4J logger; verify it is defined.
            logger.warn("Primary service failed, falling back", e)
            fallback.createMessageSender(options)
        }
    }
}
// Usage
val openAiFactory = OpenAiCompatibleModelFactory(
baseUrl = null,
apiKey = System.getenv("OPENAI_API_KEY"),
completionsPath = null,
embeddingsPath = null,
observationRegistry = observationRegistry
)
val localFactory = OpenAiCompatibleModelFactory(
baseUrl = "http://localhost:11434",
apiKey = null,
completionsPath = null,
embeddingsPath = null,
observationRegistry = observationRegistry
)
val primaryService = openAiFactory.openAiCompatibleLlm(
model = "gpt-4",
pricingModel = PricingModel.usdPer1MTokens(30.0, 60.0),
provider = "OpenAI",
knowledgeCutoffDate = LocalDate.of(2023, 4, 1)
)
val fallbackService = localFactory.openAiCompatibleLlm(
model = "llama3:70b",
pricingModel = PricingModel.ALL_YOU_CAN_EAT,
provider = "Ollama",
knowledgeCutoffDate = null
)
val resilientService = FallbackLlmService(primaryService, fallbackService)

Cache responses to reduce API calls and costs:
/**
 * Caches responses keyed by (options, prompt) so identical requests skip the
 * underlying API call.
 */
class CachedLlmService(
    private val underlying: LlmService<*>,
    private val cache: Cache<String, Response>
) {
    fun createMessageSender(options: LlmOptions): LlmMessageSender {
        return object : LlmMessageSender {
            override fun send(prompt: String): Response {
                // Key on the full option string and prompt rather than their
                // hashCodes: two distinct prompts can share a hashCode, which
                // would silently return the wrong cached response. NUL is used
                // as a separator unlikely to appear in either part.
                // NOTE(review): assumes LlmOptions.toString() is value-based
                // (e.g. a data class) — confirm; otherwise build an explicit
                // canonical key from its fields.
                val cacheKey = "${options}\u0000$prompt"
                return cache.get(cacheKey) {
                    // Compute-on-miss: delegate to the real service.
                    underlying.createMessageSender(options).send(prompt)
                }
            }
        }
    }
}
// Usage with Caffeine cache
val cache = Caffeine.newBuilder()
.maximumSize(1000)
.expireAfterWrite(1, TimeUnit.HOURS)
.build<String, Response>()
val factory = OpenAiCompatibleModelFactory(...)
val uncachedService = factory.openAiCompatibleLlm(...)
val cachedService = CachedLlmService(uncachedService, cache)

Route requests to different models based on content:
/**
 * Routes a prompt to [codeModel] when it looks code-related, otherwise to
 * [textModel].
 */
class ContentBasedRouter(
    private val codeModel: LlmService<*>,
    private val textModel: LlmService<*>
) {
    /** Sends [prompt] through whichever model the keyword heuristic selects. */
    fun routeRequest(prompt: String, options: LlmOptions): Response {
        val service = if (isCodeRelated(prompt)) codeModel else textModel
        return service.createMessageSender(options).send(prompt)
    }

    // Naive keyword heuristic: any code-related keyword anywhere in the prompt.
    private fun isCodeRelated(prompt: String): Boolean {
        val lowered = prompt.lowercase() // lowercase once, not once per keyword
        return CODE_KEYWORDS.any { lowered.contains(it) }
    }

    companion object {
        // Hoisted so the list is not rebuilt on every routing call.
        private val CODE_KEYWORDS =
            listOf("code", "function", "class", "method", "bug", "error")
    }
}
// Usage
val factory = OpenAiCompatibleModelFactory(...)
val codeSpecialist = factory.openAiCompatibleLlm(
model = "gpt-4", // Better at code
pricingModel = PricingModel.usdPer1MTokens(30.0, 60.0),
provider = "OpenAI",
knowledgeCutoffDate = LocalDate.of(2023, 4, 1)
)
val textSpecialist = factory.openAiCompatibleLlm(
model = "gpt-3.5-turbo", // Cheaper for text
pricingModel = PricingModel.usdPer1MTokens(0.5, 1.5),
provider = "OpenAI",
knowledgeCutoffDate = LocalDate.of(2021, 9, 1)
)
val router = ContentBasedRouter(codeSpecialist, textSpecialist)

Distribute load across multiple API keys to increase rate limits:
/**
 * Round-robins requests across one factory per API key so each key's rate
 * limit is consumed independently.
 */
class LoadBalancedFactory(
    private val apiKeys: List<String>,
    private val observationRegistry: ObservationRegistry
) {
    init {
        // Fail fast: an empty key list would make getFactory() divide by zero.
        require(apiKeys.isNotEmpty()) { "At least one API key is required" }
    }

    // One factory per key, all sharing the same observation registry.
    private val factories = apiKeys.map { apiKey ->
        OpenAiCompatibleModelFactory(
            baseUrl = null,
            apiKey = apiKey,
            completionsPath = null,
            embeddingsPath = null,
            observationRegistry = observationRegistry
        )
    }

    private val roundRobin = AtomicInteger(0)

    /** Returns the next factory in round-robin order (thread-safe counter). */
    fun getFactory(): OpenAiCompatibleModelFactory {
        // floorMod (not %) keeps the index non-negative after the counter
        // overflows Int.MAX_VALUE and wraps to negative values; plain % would
        // eventually throw IndexOutOfBoundsException.
        val index = Math.floorMod(roundRobin.getAndIncrement(), factories.size)
        return factories[index]
    }

    /** Creates a GPT-4-priced service on whichever factory is next in rotation. */
    fun createLlmService(model: String): LlmService<*> {
        return getFactory().openAiCompatibleLlm(
            model = model,
            pricingModel = PricingModel.usdPer1MTokens(30.0, 60.0),
            provider = "OpenAI",
            knowledgeCutoffDate = LocalDate.of(2023, 4, 1)
        )
    }
}
// Usage
val apiKeys = listOf(
System.getenv("OPENAI_API_KEY_1"),
System.getenv("OPENAI_API_KEY_2"),
System.getenv("OPENAI_API_KEY_3")
)
val loadBalancer = LoadBalancedFactory(apiKeys, observationRegistry)
// Each call uses a different API key
val service1 = loadBalancer.createLlmService("gpt-4")
val service2 = loadBalancer.createLlmService("gpt-4")
val service3 = loadBalancer.createLlmService("gpt-4")

Use local models or mocks for testing:
// Production configuration
/**
 * Spring configuration active under the "production" profile: wires an
 * OpenAI-backed model factory and a GPT-4 LLM service bean.
 */
@Configuration
@Profile("production")
class ProductionConfig(private val observationRegistry: ObservationRegistry) {
    // Factory for the default OpenAI endpoint; null baseUrl/paths presumably
    // select the provider defaults — confirm against the factory's contract.
    @Bean
    fun modelFactory(@Value("\${openai.api.key}") apiKey: String) =
        OpenAiCompatibleModelFactory(
            baseUrl = null,
            apiKey = apiKey,
            completionsPath = null,
            embeddingsPath = null,
            observationRegistry = observationRegistry
        )
    // GPT-4 service priced at $30 / $60 per 1M tokens (presumably input /
    // output order — confirm against usdPer1MTokens).
    @Bean
    fun llmService(factory: OpenAiCompatibleModelFactory) =
        factory.openAiCompatibleLlm(
            model = "gpt-4",
            pricingModel = PricingModel.usdPer1MTokens(30.0, 60.0),
            provider = "OpenAI",
            knowledgeCutoffDate = LocalDate.of(2023, 4, 1)
        )
}
// Test configuration with local model
/**
 * Spring configuration active under the "test" profile: targets a local
 * Ollama server so tests need no API key and incur no API cost.
 */
@Configuration
@Profile("test")
class TestConfig(private val observationRegistry: ObservationRegistry) {
    // Factory pointed at a local Ollama instance (default port 11434);
    // apiKey is null because the local server requires no auth.
    @Bean
    fun modelFactory() =
        OpenAiCompatibleModelFactory(
            baseUrl = "http://localhost:11434",
            apiKey = null,
            completionsPath = null,
            embeddingsPath = null,
            observationRegistry = observationRegistry
        )
    // ALL_YOU_CAN_EAT pricing — presumably unmetered / zero-cost; confirm.
    @Bean
    fun llmService(factory: OpenAiCompatibleModelFactory) =
        factory.openAiCompatibleLlm(
            model = "llama3:8b", // Smaller, faster model for tests
            pricingModel = PricingModel.ALL_YOU_CAN_EAT,
            provider = "Ollama",
            knowledgeCutoffDate = null
        )
}

Configure different regions with fallback:
/**
 * Holds one factory per region (distinct API keys) and resolves the factory
 * for a region name, defaulting to US East for unknown regions.
 */
class MultiRegionFactory(
    private val observationRegistry: ObservationRegistry
) {
    private val usEastFactory = OpenAiCompatibleModelFactory(
        baseUrl = "https://api.openai.com",
        apiKey = System.getenv("OPENAI_API_KEY_US_EAST"),
        completionsPath = null,
        embeddingsPath = null,
        observationRegistry = observationRegistry
    )

    private val euWestFactory = OpenAiCompatibleModelFactory(
        baseUrl = "https://api.openai.com",
        apiKey = System.getenv("OPENAI_API_KEY_EU_WEST"),
        completionsPath = null,
        embeddingsPath = null,
        observationRegistry = observationRegistry
    )

    // Region-keyed lookup table; unknown regions fall back to US East.
    private val factoriesByRegion = mapOf(
        "us-east" to usEastFactory,
        "eu-west" to euWestFactory,
    )

    /** Returns the factory for [region]; US East is the default. */
    fun getFactoryForRegion(region: String): OpenAiCompatibleModelFactory =
        factoriesByRegion.getOrElse(region) { usEastFactory }
}

While the factory creates services that support streaming through Spring AI, here's how to configure for optimal streaming:
val factory = OpenAiCompatibleModelFactory(
baseUrl = null,
apiKey = System.getenv("OPENAI_API_KEY"),
completionsPath = null,
embeddingsPath = null,
observationRegistry = ObservationRegistry.create()
)
val streamingService = factory.openAiCompatibleLlm(
model = "gpt-4-turbo",
pricingModel = PricingModel.usdPer1MTokens(10.0, 30.0),
provider = "OpenAI",
knowledgeCutoffDate = LocalDate.of(2023, 12, 1)
)
// The service supports streaming through Spring AI's ChatModel interface
// Call stream() method on the underlying ChatModel for streaming responses

Install with Tessl CLI
npx tessl i tessl/maven-com-embabel-agent--embabel-agent-openai