Kubernetes Custom Resource Definition for serving predictive and generative machine learning models with high abstraction interfaces and features like GPU autoscaling, scale to zero, and canary rollouts.
pkg:github/kserve/kserve@0.15.x
```bash
npx @tessl/cli install tessl/github-kserve@0.15.0
```

KServe is a comprehensive Kubernetes-native machine learning model serving platform that enables production deployment of ML models through Custom Resource Definitions. It provides a unified interface for serving predictive and generative models with enterprise-grade features such as GPU autoscaling, scale-to-zero, canary rollouts, and multi-framework support.
Install the Python SDK:

```bash
pip install kserve
```

Use `pip install kserve[storage]` for cloud storage support or `pip install kserve[ray]` for Ray integration.

```python
import kserve
```

Common imports for model serving:
```python
from kserve import Model, ModelServer
```

For client operations:
```python
from kserve import InferenceRESTClient, InferenceGRPCClient, KServeClient
```

For protocol types:
```python
from kserve import InferRequest, InferResponse, InferInput, InferOutput
```

Implementing and serving a custom model:

```python
from kserve import Model, ModelServer


class MyModel(Model):
    def __init__(self, name: str):
        super().__init__(name)
        self.model = None
        self.ready = False

    def load(self):
        # Load your model here
        # self.model = load_model("path/to/model")
        self.ready = True

    async def predict(self, payload, headers=None):
        # Implement prediction logic
        # result = self.model.predict(payload)
        return {"predictions": "example_result"}


if __name__ == "__main__":
    model = MyModel("my-model")
    ModelServer().start([model])
```
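Once the server is running, the model is reachable over KServe's V1 REST protocol. A quick smoke test follows; the port and path are the defaults, and `requests` is used here purely for illustration:

```python
import requests

# Default HTTP port is 8080; the V1 protocol path is /v1/models/<name>:predict.
response = requests.post(
    "http://localhost:8080/v1/models/my-model:predict",
    json={"instances": [[1.0, 2.0, 3.0]]},
)
print(response.json())  # {"predictions": "example_result"} for the model above
```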
Making inference requests with the REST client:

```python
import asyncio

import numpy as np

from kserve import InferenceRESTClient, InferRequest, InferInput


async def main():
    client = InferenceRESTClient("http://localhost:8080")

    # Create input data
    data_array = np.random.rand(1, 784).astype(np.float32)
    input_data = InferInput(name="data", shape=[1, 784], datatype="FP32")
    input_data.set_data_from_numpy(data_array)

    # Create inference request
    request = InferRequest(model_name="my-model", inputs=[input_data])

    # Make prediction
    response = await client.infer(request)
    predictions = response.outputs[0].as_numpy()
    print(predictions)


asyncio.run(main())
```
Deploying an InferenceService with the Kubernetes client:

```python
from kserve import KServeClient, V1beta1InferenceService, V1beta1InferenceServiceSpec
from kserve import V1beta1PredictorSpec, V1beta1SKLearnSpec

# Create Kubernetes client
client = KServeClient()

# Define inference service
isvc = V1beta1InferenceService(
    api_version="serving.kserve.io/v1beta1",
    kind="InferenceService",
    metadata={"name": "sklearn-iris", "namespace": "default"},
    spec=V1beta1InferenceServiceSpec(
        predictor=V1beta1PredictorSpec(
            sklearn=V1beta1SKLearnSpec(
                storage_uri="gs://kfserving-examples/models/sklearn/1.0/model"
            )
        )
    ),
)

# Deploy the service
client.create(isvc, namespace="default")
```
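After creation, the service can be polled until it is ready. A minimal sketch, assuming the client exposes `wait_isvc_ready` and `get` as in recent KServe releases (the method and keyword names are assumptions here):

```python
from kserve import KServeClient

client = KServeClient()

# Block until the InferenceService reports Ready (method name and timeout
# keyword are assumed from recent KServe client releases).
client.wait_isvc_ready("sklearn-iris", namespace="default", timeout_seconds=600)

# The returned object is the raw custom resource; its status carries the URL.
isvc = client.get("sklearn-iris", namespace="default")
print(isvc["status"]["url"])
```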
KServe consists of several key components that work together to provide a complete ML serving solution, and it offers built-in support for popular ML frameworks through specialized servers. The main components are described below.
Core classes and interfaces for implementing custom model servers with lifecycle management, health checking, and protocol support.
```python
class Model:
    def __init__(self, name: str): ...
    def load(self): ...
    async def predict(self, payload): ...
    async def preprocess(self, payload): ...
    async def postprocess(self, payload): ...

class ModelServer:
    def __init__(self, http_port: int = 8080, grpc_port: int = 8081): ...
    def start(self, models: List[Model]): ...
    def register_model(self, model: Model): ...
```
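The preprocess and postprocess hooks wrap the prediction step. A minimal sketch based on the interface above; the tensor handling and `headers` keyword mirror common KServe examples and are illustrative rather than part of this spec:

```python
from kserve import Model, ModelServer


class NormalizingModel(Model):
    """Illustrative model that rescales inputs before prediction."""

    def __init__(self, name: str):
        super().__init__(name)
        self.ready = False

    def load(self):
        # A real implementation would load weights from storage here.
        self.ready = True

    async def preprocess(self, payload, headers=None):
        # Assumes a v1-style {"instances": [...]} payload; scale values to [0, 1].
        instances = [[x / 255.0 for x in row] for row in payload["instances"]]
        return {"instances": instances}

    async def predict(self, payload, headers=None):
        # Dummy prediction: sum each (already preprocessed) instance.
        return {"predictions": [sum(row) for row in payload["instances"]]}

    async def postprocess(self, result, headers=None):
        # Attach a coarse label to each score before returning to the caller.
        result["labels"] = ["high" if p > 1.0 else "low" for p in result["predictions"]]
        return result


if __name__ == "__main__":
    ModelServer().start([NormalizingModel("normalizing-model")])
```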
High-level async clients for making inference requests to KServe models with retry logic, SSL support, and protocol conversion.

```python
class InferenceRESTClient:
    def __init__(self, url: str, config: RESTConfig = None): ...
    async def infer(self, request: InferRequest) -> InferResponse: ...
    async def explain(self, request: InferRequest) -> InferResponse: ...
    async def is_model_ready(self, model_name: str) -> bool: ...

class InferenceGRPCClient:
    def __init__(self, url: str): ...
    async def infer(self, request: InferRequest) -> InferResponse: ...
```
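A gRPC inference call might look like the following sketch, which follows the interface listed above; the exact constructor and keyword names in the installed library may differ:

```python
import asyncio

import numpy as np

from kserve import InferenceGRPCClient, InferInput, InferRequest


async def main():
    # Assumes a model server exposing its gRPC endpoint on the default port 8081.
    client = InferenceGRPCClient("localhost:8081")

    tensor = np.ones((1, 784), dtype=np.float32)
    infer_input = InferInput(name="data", shape=[1, 784], datatype="FP32")
    infer_input.set_data_from_numpy(tensor)

    request = InferRequest(model_name="my-model", inputs=[infer_input])
    response = await client.infer(request)
    print(response.outputs[0].as_numpy())


asyncio.run(main())
```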
Standardized data structures for inference requests and responses with support for multiple protocols and data formats.

```python
class InferRequest:
    def __init__(self, model_name: str, inputs: List[InferInput]): ...
    def as_dataframe(self) -> pandas.DataFrame: ...
    def to_rest(self) -> dict: ...

class InferResponse:
    def __init__(self, model_name: str, outputs: List[InferOutput]): ...
    @classmethod
    def from_rest(cls, response: dict) -> 'InferResponse': ...

class InferInput:
    def __init__(self, name: str, shape: List[int], datatype: str): ...
    def set_data_from_numpy(self, input_tensor: numpy.ndarray): ...
    def as_numpy(self) -> numpy.ndarray: ...
```
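As a sketch of how these pieces compose (following the interface listed above; keyword and method shapes in the installed version may differ), a request can be built from a NumPy array, serialized to the REST wire format, and a response parsed back into tensors:

```python
import numpy as np

from kserve import InferInput, InferRequest, InferResponse

# Build a request from a NumPy tensor.
tensor = np.arange(4, dtype=np.float32).reshape(1, 4)
infer_input = InferInput(name="input-0", shape=[1, 4], datatype="FP32")
infer_input.set_data_from_numpy(tensor)
request = InferRequest(model_name="my-model", inputs=[infer_input])

# Serialize to the REST wire format (a plain dict).
print(request.to_rest())

# Parse a REST response dict back into typed outputs; the dict below is only
# illustrative of the Open Inference Protocol response shape.
raw_response = {
    "model_name": "my-model",
    "outputs": [
        {"name": "output-0", "shape": [1, 1], "datatype": "FP32", "data": [0.9]}
    ],
}
response = InferResponse.from_rest(raw_response)
print(response.outputs[0].as_numpy())
```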
Python client for managing KServe resources in Kubernetes clusters, including InferenceServices, TrainedModels, and InferenceGraphs.

```python
class KServeClient:
    def __init__(self, config_file: str = None): ...
    def create(self, obj, namespace: str = "default"): ...
    def get(self, name: str, namespace: str = "default"): ...
    def delete(self, name: str, namespace: str = "default"): ...
    def set_credentials(self, storage_type: str, **kwargs): ...
```
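A typical lifecycle using the methods above, assuming a kubeconfig is available to the process; the credential keyword arguments are illustrative assumptions:

```python
from kserve import KServeClient

client = KServeClient()

# Register storage credentials so predictors can pull model artifacts from S3.
# The keyword arguments shown are illustrative; consult set_credentials for the
# options supported by your KServe version.
client.set_credentials(
    storage_type="S3",
    namespace="default",
    credentials_file="~/.aws/credentials",
    service_account="kserve-sa",
)

# Inspect an existing InferenceService and clean it up when finished.
isvc = client.get("sklearn-iris", namespace="default")
print(isvc["metadata"]["name"])

client.delete("sklearn-iris", namespace="default")
```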
Comprehensive set of Kubernetes Custom Resource Definitions for defining inference services, serving runtimes, and model configurations.

```python
class V1beta1InferenceService:
    def __init__(self, metadata: dict, spec: V1beta1InferenceServiceSpec): ...

class V1beta1PredictorSpec:
    def __init__(self, sklearn: V1beta1SKLearnSpec = None,
                 pytorch: V1beta1TorchServeSpec = None): ...

class V1alpha1ServingRuntime:
    def __init__(self, metadata: dict, spec: V1alpha1ServingRuntimeSpec): ...
```
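The canary rollouts mentioned in the overview are expressed on these CRD objects. A sketch assuming the predictor spec exposes a `canary_traffic_percent` field, as in recent KServe Python SDKs (the field name is an assumption):

```python
from kserve import (
    V1beta1InferenceService,
    V1beta1InferenceServiceSpec,
    V1beta1PredictorSpec,
    V1beta1SKLearnSpec,
)

# Route 10% of traffic to the new revision while the previous revision keeps
# serving the remaining 90% (canary_traffic_percent is an assumed field name).
canary_isvc = V1beta1InferenceService(
    api_version="serving.kserve.io/v1beta1",
    kind="InferenceService",
    metadata={"name": "sklearn-iris", "namespace": "default"},
    spec=V1beta1InferenceServiceSpec(
        predictor=V1beta1PredictorSpec(
            canary_traffic_percent=10,
            sklearn=V1beta1SKLearnSpec(
                storage_uri="gs://kfserving-examples/models/sklearn/1.0/model"
            ),
        )
    ),
)

# Apply the updated spec with the Kubernetes client shown earlier, e.g. via a
# replace/patch call (method availability depends on the installed SDK version).
```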
Pre-built model servers for popular ML frameworks that extend the core KServe functionality with framework-specific optimizations.

```python
# Scikit-learn
from sklearnserver import SKLearnModel

# XGBoost
from xgbserver import XGBoostModel

# HuggingFace
from huggingfaceserver import HuggingFaceModel
```
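These servers plug into the same ModelServer entry point as custom models. A sketch assuming SKLearnModel takes a name and a local model directory (the constructor arguments are an assumption and may differ by version):

```python
from kserve import ModelServer
from sklearnserver import SKLearnModel

# model_dir is assumed to contain a serialized scikit-learn model
# (e.g. model.joblib) that was downloaded ahead of time.
model = SKLearnModel("sklearn-iris", model_dir="/mnt/models")
model.load()

ModelServer().start([model])
```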
Unified storage interface supporting multiple cloud providers and local storage for model artifact management.

```python
from kserve.storage import Storage

def download_model(uri: str, dest: str):
    Storage.download(uri, dest)
```
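Model servers commonly resolve the artifact location at load time. A sketch combining Storage with the Model interface from earlier; the URI is a placeholder, and Storage.download returning the local download path is an assumption:

```python
from kserve import Model, ModelServer
from kserve.storage import Storage


class RemoteModel(Model):
    def __init__(self, name: str, storage_uri: str):
        super().__init__(name)
        self.storage_uri = storage_uri
        self.ready = False

    def load(self):
        # Works with gs://, s3://, pvc://, or local paths; the return value is
        # assumed to be the local directory the artifact was downloaded to.
        local_path = Storage.download(self.storage_uri)
        # ... deserialize the model artifact found under local_path ...
        self.ready = True

    async def predict(self, payload, headers=None):
        return {"predictions": []}


if __name__ == "__main__":
    model = RemoteModel(
        "remote-model", "gs://kfserving-examples/models/sklearn/1.0/model"
    )
    ModelServer().start([model])
```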