```
tessl install tessl/pypi-kserve@0.16.1
```

KServe is a comprehensive Python SDK that provides standardized interfaces for building and deploying machine learning model serving infrastructure on Kubernetes.
Get started with KServe in minutes by building your first model server, making inference requests, and deploying to Kubernetes.
```bash
# Base installation
pip install kserve

# With storage support (S3, GCS, Azure)
pip install kserve[storage]

# With OpenAI protocol support for LLMs
pip install kserve[llm]

# With Ray Serve integration
pip install kserve[ray]
```

```python
from kserve import Model
import joblib


class SKLearnModel(Model):
    def __init__(self, name: str):
        super().__init__(name)
        self.model = None
        self.ready = False

    def load(self):
        """Load model artifacts"""
        self.model = joblib.load("/mnt/models/model.pkl")
        self.ready = True
        print(f"Model {self.name} loaded successfully")

    def predict(self, payload, headers=None):
        """Run inference"""
        instances = payload["instances"]
        predictions = self.model.predict(instances)
        return {"predictions": predictions.tolist()}
```

```python
from kserve import ModelServer
if __name__ == "__main__":
    # Create and load model
    model = SKLearnModel("sklearn-model")
    model.load()

    # Start server
    ModelServer().start([model])
```

```bash
# Check server health
curl http://localhost:8080/v2/health/live
curl http://localhost:8080/v2/health/ready
# Check model readiness
curl http://localhost:8080/v2/models/sklearn-model/ready
# Make prediction (v2 protocol)
curl -X POST http://localhost:8080/v2/models/sklearn-model/infer \
  -H "Content-Type: application/json" \
  -d '{
    "inputs": [{
      "name": "input-0",
      "shape": [1, 4],
      "datatype": "FP32",
      "data": [[5.1, 3.5, 1.4, 0.2]]
    }]
  }'
# Make prediction (v1 protocol)
curl -X POST http://localhost:8080/v1/models/sklearn-model:predict \
  -H "Content-Type: application/json" \
  -d '{"instances": [[5.1, 3.5, 1.4, 0.2]]}'
```

```python
import asyncio
from kserve import InferenceRESTClient, RESTConfig


async def main():
    # Create client
    config = RESTConfig(protocol="v2", timeout=60)
    client = InferenceRESTClient(config=config)

    # Make inference request
    response = await client.infer(
        base_url="http://localhost:8080",
        model_name="sklearn-model",
        data={
            "inputs": [{
                "name": "input-0",
                "shape": [1, 4],
                "datatype": "FP32",
                "data": [[5.1, 3.5, 1.4, 0.2]]
            }]
        }
    )
    print(f"Response: {response}")
    await client.close()


asyncio.run(main())
```

```python
import asyncio
from kserve import InferenceGRPCClient, InferInput, InferRequest
import numpy as np


async def main():
    # Create client
    client = InferenceGRPCClient(url="localhost:8081")

    # Prepare input
    data = np.array([[5.1, 3.5, 1.4, 0.2]], dtype=np.float32)
    input_tensor = InferInput(
        name="input-0",
        shape=list(data.shape),
        datatype="FP32"
    )
    input_tensor.set_data_from_numpy(data)

    # Create request
    request = InferRequest(
        model_name="sklearn-model",
        infer_inputs=[input_tensor]
    )

    # Make inference
    response = await client.infer(request)

    # Extract results
    output = response.outputs[0]
    predictions = output.as_numpy()
    print(f"Predictions: {predictions}")
    await client.close()


asyncio.run(main())
```

```python
from kserve import (
    KServeClient,
    V1beta1InferenceService,
    V1beta1InferenceServiceSpec,
    V1beta1PredictorSpec,
    V1beta1SKLearnSpec,
)

# Create KServe client
kserve_client = KServeClient()

# Define InferenceService
isvc = V1beta1InferenceService(
    api_version="serving.kserve.io/v1beta1",
    kind="InferenceService",
    metadata={
        "name": "sklearn-iris",
        "namespace": "default"
    },
    spec=V1beta1InferenceServiceSpec(
        predictor=V1beta1PredictorSpec(
            sklearn=V1beta1SKLearnSpec(
                storage_uri="gs://kfserving-examples/models/sklearn/iris",
                resources={
                    "limits": {"cpu": "1", "memory": "2Gi"},
                    "requests": {"cpu": "500m", "memory": "1Gi"}
                }
            )
        )
    )
)

# Create InferenceService
kserve_client.create(isvc, namespace="default")
```

```python
# Wait for InferenceService to be ready
try:
    isvc = kserve_client.wait_isvc_ready(
        name="sklearn-iris",
        namespace="default",
        timeout_seconds=300
    )
    print(f"InferenceService ready at: {isvc.status.url}")
except RuntimeError as e:
    print(f"Failed to become ready: {e}")
```

```python
# Get InferenceService status
status = kserve_client.get_isvc_status("sklearn-iris", namespace="default")
url = status.url

# Make prediction using the URL
import asyncio
from kserve import InferenceRESTClient


async def main():
    client = InferenceRESTClient()
    response = await client.infer(
        base_url=url,
        model_name="sklearn-iris",
        data={
            "inputs": [{
                "name": "input-0",
                "shape": [1, 4],
                "datatype": "FP32",
                "data": [[5.1, 3.5, 1.4, 0.2]]
            }]
        }
    )
    print(f"Response: {response}")


asyncio.run(main())
```

```python
from kserve import Model, ModelServer

# Create multiple models
model1 = SKLearnModel("iris-classifier")
model1.load()

model2 = SKLearnModel("wine-classifier")
model2.load()

# Start server with multiple models
ModelServer().start([model1, model2])
```

Access the models at `/v2/models/iris-classifier/infer` and `/v2/models/wine-classifier/infer`.
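For example, each model can be smoke-tested with curl on its own v2 endpoint. The request bodies below are illustrative only: the wine classifier will expect its own feature count and values, not the 4-feature iris payload shown here.

```bash
# Hypothetical checks against the two models started above
curl -X POST http://localhost:8080/v2/models/iris-classifier/infer \
  -H "Content-Type: application/json" \
  -d '{"inputs": [{"name": "input-0", "shape": [1, 4], "datatype": "FP32", "data": [[5.1, 3.5, 1.4, 0.2]]}]}'

# Placeholder payload: replace shape and data with your wine model's real features
curl -X POST http://localhost:8080/v2/models/wine-classifier/infer \
  -H "Content-Type: application/json" \
  -d '{"inputs": [{"name": "input-0", "shape": [1, 4], "datatype": "FP32", "data": [[5.1, 3.5, 1.4, 0.2]]}]}'
```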
```python
from kserve import Model
import numpy as np
import joblib


class PreprocessedModel(Model):
    def load(self):
        self.model = joblib.load("/mnt/models/model.pkl")
        self.mean = np.array([5.0, 3.0, 3.5, 1.0])
        self.std = np.array([1.0, 0.5, 1.5, 0.5])
        self.ready = True

    def preprocess(self, body, headers=None):
        """Normalize input data"""
        instances = np.array(body["instances"])
        normalized = (instances - self.mean) / self.std
        return {"instances": normalized.tolist()}

    def predict(self, payload, headers=None):
        """Run inference"""
        instances = payload["instances"]
        predictions = self.model.predict(instances)
        probabilities = self.model.predict_proba(instances)
        return {
            "predictions": predictions.tolist(),
            "probabilities": probabilities.tolist()
        }

    def postprocess(self, response, headers=None):
        """Add metadata"""
        return {
            **response,
            "model_name": self.name,
            "model_version": "1.0.0"
        }
```

```python
from kserve import KServeClient
client = KServeClient()
client.set_credentials(
    storage_type="GCS",
    namespace="default",
    credentials_file="/path/to/gcs-credentials.json",
    service_account="kserve-sa"
)
```

```python
client.set_credentials(
storage_type="S3",
namespace="default",
service_account="kserve-sa",
aws_access_key_id="AKIAIOSFODNN7EXAMPLE",
aws_secret_access_key="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
s3_endpoint="s3.amazonaws.com",
s3_region="us-west-2"
)client.set_credentials(
storage_type="Azure",
namespace="default",
credentials_file="/path/to/azure-credentials.json",
service_account="kserve-sa"
)from kserve import Model, logger
from kserve.metrics import PREDICT_HIST_TIME, get_labels
import time


class MonitoredModel(Model):
    def predict(self, payload, headers=None):
        # Log request
        logger.info(f"Processing prediction for {self.name}")

        # Time inference
        start = time.time()
        instances = payload["instances"]
        predictions = self.model.predict(instances)
        elapsed = time.time() - start

        # Record metrics
        labels = get_labels(self.name)
        PREDICT_HIST_TIME.labels(**labels).observe(elapsed)

        # Log completion
        logger.info(f"Prediction completed in {elapsed:.3f}s")
        return {"predictions": predictions.tolist()}
```

```bash
# Start with custom ports and workers
python model.py \
  --http_port 9000 \
  --grpc_port 9001 \
  --workers 4 \
  --max_threads 8
# Enable API documentation
python model.py --enable_docs_url true
# Access at http://localhost:8080/docs
# Disable gRPC server
python model.py --enable_grpc false
# Custom logging configuration
python model.py --log_config_file /path/to/logging.yaml
```

Key pieces covered above: `pip install kserve`, the `load()` and `predict()` methods, `ModelServer().start([model])`, `KServeClient`, `wait_isvc_ready()`, `preprocess()` and `postprocess()`, and `PredictorConfig`.

Troubleshooting:

- Model not loading? Check the pod logs: `kubectl logs <pod-name>`
- Predictions failing? Check model readiness: `GET /v2/models/{name}/ready`
- Connection timeouts?
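The commands below walk through those checks using the endpoints and names from the earlier examples; the pod name placeholder is left as-is, and the timeout value is only illustrative.

```bash
# Inspect the serving pod if the model never reports ready (replace <pod-name> with your pod)
kubectl logs <pod-name>

# Confirm the server is live and the model is ready before debugging payloads
curl http://localhost:8080/v2/health/live
curl http://localhost:8080/v2/models/sklearn-model/ready

# For slow models, allow more time per request on the client side (curl's --max-time is in seconds)
curl --max-time 120 -X POST http://localhost:8080/v2/models/sklearn-model/infer \
  -H "Content-Type: application/json" \
  -d '{"inputs": [{"name": "input-0", "shape": [1, 4], "datatype": "FP32", "data": [[5.1, 3.5, 1.4, 0.2]]}]}'
```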