tessl install tessl/pypi-kserve@0.16.1

KServe is a comprehensive Python SDK that provides standardized interfaces for building and deploying machine learning model serving infrastructure on Kubernetes.
KServe provides OpenAI-compatible endpoints for seamless integration with Large Language Models (LLMs) and embedding models. This enables drop-in compatibility with existing OpenAI client libraries and tools.
IMPORTANT: The OpenAI protocol support requires the vLLM dependency. Install it with:
pip install kserve[llm]

Or install vLLM directly:

pip install vllm

The OpenAI protocol support allows KServe model servers to expose endpoints compatible with OpenAI's API specification, including chat completions, text completions, embeddings, and reranking. This is particularly useful for serving LLMs through the familiar OpenAI API interface.
from kserve.protocol.rest.openai import (
OpenAIModel,
OpenAIGenerativeModel,
OpenAIEncoderModel,
OpenAIProxyModel,
OpenAIChatAdapterModel,
ChatPrompt
)
from kserve.protocol.rest.openai.types import (
ChatCompletion,
ChatCompletionRequest,
Completion,
CompletionRequest,
Embedding,
EmbeddingRequest,
Rerank,
RerankRequest,
ErrorResponse
)

Abstract base class for all OpenAI-compatible models. Provides the foundation for implementing OpenAI protocol endpoints.
class OpenAIModel(BaseKServeModel):
"""
Abstract base class for OpenAI-compatible models.
Args:
name (str): Model name
"""
def __init__(self, name: str): ...

Abstract base class for generative models supporting chat completions and text completions.
class OpenAIGenerativeModel(OpenAIModel):
"""
Abstract base for generative models (chat, completions).
Subclass this to implement custom LLM serving with OpenAI-compatible endpoints.
"""
async def create_completion(
self,
request: CompletionRequest,
raw_request: Request,
context: Dict
) -> Union[AsyncGenerator, Completion, ErrorResponse]:
"""
Generate text completion.
Args:
request (CompletionRequest): Completion request parameters
raw_request (Request): Raw FastAPI request object
context (Dict): Request context
Returns:
Union[AsyncGenerator, Completion, ErrorResponse]: Completion response or stream
"""
...
async def create_chat_completion(
self,
request: ChatCompletionRequest,
raw_request: Request,
context: Dict
) -> Union[AsyncGenerator, ChatCompletion, ErrorResponse]:
"""
Generate chat completion.
Args:
request (ChatCompletionRequest): Chat completion request with messages
raw_request (Request): Raw FastAPI request object
context (Dict): Request context
Returns:
Union[AsyncGenerator, ChatCompletion, ErrorResponse]: Chat completion or stream
"""
...

Usage Example:
import time
import uuid

from kserve.protocol.rest.openai import OpenAIGenerativeModel
from kserve.protocol.rest.openai.types import ChatCompletionRequest, ChatCompletion
class MyLLMModel(OpenAIGenerativeModel):
def __init__(self, name: str):
super().__init__(name)
self.model = None
def load(self):
# Load your LLM model (load_llm_model is a placeholder for your own loader)
self.model = load_llm_model()
self.ready = True
async def create_chat_completion(self, request, raw_request, context):
# Extract messages from request
messages = request.messages
# Generate response using your model
response_text = await self.model.generate(messages)
# Return OpenAI-compatible response
return ChatCompletion(
id=f"chatcmpl-{uuid.uuid4()}",
object="chat.completion",
created=int(time.time()),
model=self.name,
choices=[{
"index": 0,
"message": {
"role": "assistant",
"content": response_text
},
"finish_reason": "stop"
}]
)

Abstract base class for encoder models supporting embeddings and reranking.
class OpenAIEncoderModel(OpenAIModel):
"""
Abstract base for encoder models (embeddings, reranking).
Subclass this to implement custom embedding or reranking models.
"""
async def create_embedding(
self,
request: EmbeddingRequest,
raw_request: Request,
context: Dict
) -> Union[AsyncGenerator, Embedding, ErrorResponse]:
"""
Create embeddings for input text.
Args:
request (EmbeddingRequest): Embedding request with input texts
raw_request (Request): Raw FastAPI request object
context (Dict): Request context
Returns:
Union[AsyncGenerator, Embedding, ErrorResponse]: Embedding response
"""
...
async def create_rerank(
self,
request: RerankRequest,
raw_request: Request,
context: Dict
) -> Union[AsyncGenerator, Rerank, ErrorResponse]:
"""
Rerank documents based on query relevance.
Args:
request (RerankRequest): Reranking request with query and documents
raw_request (Request): Raw FastAPI request object
context (Dict): Request context
Returns:
Union[AsyncGenerator, Rerank, ErrorResponse]: Reranking results
"""
...

Usage Example:
from kserve.protocol.rest.openai import OpenAIEncoderModel
from kserve.protocol.rest.openai.types import EmbeddingRequest, Embedding
class MyEmbeddingModel(OpenAIEncoderModel):
def __init__(self, name: str):
super().__init__(name)
self.model = None
def load(self):
# Load your embedding model (load_embedding_model is a placeholder for your own loader)
self.model = load_embedding_model()
self.ready = True
async def create_embedding(self, request, raw_request, context):
# Extract input texts
inputs = request.input if isinstance(request.input, list) else [request.input]
# Generate embeddings
embeddings = await self.model.encode(inputs)
# Return OpenAI-compatible response
return Embedding(
object="list",
data=[{
"object": "embedding",
"embedding": emb.tolist(),
"index": i
} for i, emb in enumerate(embeddings)],
model=self.name,
usage={
"prompt_tokens": len(inputs),
"total_tokens": len(inputs)
}
)

Proxy model that forwards requests to an external OpenAI-compatible endpoint. Useful for load balancing, caching, or adding preprocessing logic.
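For example, a proxy can be created and served like any other KServe model. The following is a minimal sketch based on the constructor signature shown in the class reference below; the upstream URL and API key are hypothetical placeholders.

from kserve import ModelServer
from kserve.protocol.rest.openai import OpenAIProxyModel

# Hypothetical upstream OpenAI-compatible service; replace with your own endpoint
proxy = OpenAIProxyModel(
    name="proxy-llm",
    target_url="http://my-upstream-llm:8000/v1",  # placeholder URL
    api_key="my-secret-key",                      # optional, per the signature below
)

# Chat completion, completion, embedding, and rerank requests sent to this
# server are forwarded to the upstream service
ModelServer().start([proxy])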
class OpenAIProxyModel(OpenAIGenerativeModel, OpenAIEncoderModel):
"""
Proxy model that forwards requests to external OpenAI-compatible endpoint.
Supports chat completions, completions, embeddings, and reranking by proxying
to an external service.
Args:
name (str): Model name
target_url (str): Target OpenAI-compatible endpoint URL
api_key (str, optional): API key for authentication
"""
def __init__(self, name: str, target_url: str, api_key: str = None): ...

Adapter model that converts between OpenAI chat format and custom model interfaces. Useful for wrapping existing models to expose OpenAI-compatible endpoints.
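For example, an existing predictive model can be wrapped so that it answers OpenAI chat completion requests. This is a sketch based on the constructor signature shown in the class reference below; the underlying model here is only a placeholder.

from kserve import Model
from kserve.protocol.rest.openai import OpenAIChatAdapterModel

# Placeholder for an existing inference model; in practice this would be your
# own kserve.Model subclass with its predict logic implemented.
underlying = Model("my-llm")

# Wrap the model so OpenAI chat requests are converted to its native format
adapter = OpenAIChatAdapterModel(name="my-llm-chat", underlying_model=underlying)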
class OpenAIChatAdapterModel(OpenAIGenerativeModel):
"""
Adapter that converts OpenAI chat requests to custom model format.
Wraps an existing inference model to expose OpenAI chat completion endpoints.
Args:
name (str): Model name
underlying_model: The model to wrap
"""
def __init__(self, name: str, underlying_model): ...

Data class for structured chat prompts with role and content.
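For illustration, a prompt can be built from the attributes listed below (assuming keyword construction of the data class):

from kserve.protocol.rest.openai import ChatPrompt

# The flattened prompt text and the role to attribute the response to
chat_prompt = ChatPrompt(
    response_role="assistant",
    prompt="You are a helpful assistant.\nUser: Hello!",
)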
class ChatPrompt:
"""
Chat prompt with role and content.
Attributes:
response_role (str): Role for the response (e.g., "assistant")
prompt (str): The actual prompt text
"""
response_role: str
prompt: str

class ChatCompletionRequest:
"""
Request for chat completion.
Attributes:
model (str): Model identifier
messages (List[Dict]): List of message dicts with 'role' and 'content'
temperature (float, optional): Sampling temperature (0.0-2.0)
top_p (float, optional): Nucleus sampling parameter
n (int, optional): Number of completions to generate
stream (bool, optional): Whether to stream responses
max_tokens (int, optional): Maximum tokens to generate
presence_penalty (float, optional): Presence penalty (-2.0 to 2.0)
frequency_penalty (float, optional): Frequency penalty (-2.0 to 2.0)
stop (Union[str, List[str]], optional): Stop sequences
"""
...
class ChatCompletion:
"""
Chat completion response.
Attributes:
id (str): Unique identifier
object (str): Object type ("chat.completion")
created (int): Unix timestamp
model (str): Model used
choices (List[Dict]): Completion choices with message and finish_reason
usage (Dict, optional): Token usage statistics
"""
...

class CompletionRequest:
"""
Request for text completion.
Attributes:
model (str): Model identifier
prompt (Union[str, List[str]]): Input prompt(s)
temperature (float, optional): Sampling temperature
max_tokens (int, optional): Maximum tokens to generate
stream (bool, optional): Whether to stream responses
stop (Union[str, List[str]], optional): Stop sequences
"""
...
class Completion:
"""
Text completion response.
Attributes:
id (str): Unique identifier
object (str): Object type ("text_completion")
created (int): Unix timestamp
model (str): Model used
choices (List[Dict]): Completion choices with text and finish_reason
usage (Dict, optional): Token usage statistics
"""
...

class EmbeddingRequest:
"""
Request for embeddings.
Attributes:
model (str): Model identifier
input (Union[str, List[str]]): Input text(s) to embed
encoding_format (str, optional): Format for embeddings ("float" or "base64")
"""
...
class Embedding:
"""
Embedding response.
Attributes:
object (str): Object type ("list")
data (List[Dict]): List of embedding objects with 'embedding' and 'index'
model (str): Model used
usage (Dict): Token usage statistics
"""
...

class RerankRequest:
"""
Request for reranking documents.
Attributes:
model (str): Model identifier
query (str): Search query
documents (List[str]): Documents to rerank
top_n (int, optional): Number of top documents to return
"""
...
class Rerank:
"""
Reranking response.
Attributes:
object (str): Object type ("list")
data (List[Dict]): Reranked documents with relevance scores and indices
model (str): Model used
"""
...

class ErrorResponse:
"""
OpenAI-compatible error response.
Attributes:
error (Dict): Error object with 'message', 'type', 'code'
"""
error: Dict[str, Any]

When an OpenAI-compatible model is registered with ModelServer, the corresponding OpenAI endpoints for chat completions, text completions, embeddings, and reranking are automatically exposed.
These endpoints follow the OpenAI API specification and can be used with standard OpenAI client libraries.
from kserve import ModelServer
# Create your OpenAI-compatible model (e.g., the MyLLMModel class defined above)
model = MyLLMModel("my-llm")
model.load()
# Start server with OpenAI endpoints
server = ModelServer()
server.start([model])

The server will automatically detect OpenAI models and register the appropriate endpoints.
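Once the server is running, the endpoints can be exercised with the official openai Python client. The snippet below is a sketch only: the base URL assumes KServe's default HTTP port (8080) and the /openai/v1 route prefix, and the model name matches the MyLLMModel example above; adjust both to your deployment.

from openai import OpenAI

# Point the client at the KServe server (assumed host, port, and route prefix)
client = OpenAI(base_url="http://localhost:8080/openai/v1", api_key="not-needed")

# Non-streaming chat completion against the registered model
response = client.chat.completions.create(
    model="my-llm",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)

# Streaming variant: chunks arrive incrementally
stream = client.chat.completions.create(
    model="my-llm",
    messages=[{"role": "user", "content": "Tell me a short story."}],
    stream=True,
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="")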