Data serialization and deserialization utilities for converting between Python objects and the wire formats used in model inference.
Serializers convert Python objects to bytes for sending to SageMaker endpoints.
class BaseSerializer:
"""
Abstract base class for serializers.
Properties:
CONTENT_TYPE: str - Content type for serialized data
- MIME type string
- Used in HTTP Content-Type header
Abstract Methods:
serialize(data: object) -> bytes
Serialize data to bytes.
Parameters:
data: object - Python object to serialize
Returns:
bytes: Serialized data
Raises:
Exception: Serialization errors
Notes:
- Subclass for custom serialization
- Implement serialize() method
- Set CONTENT_TYPE class attribute
"""class JSONSerializer(BaseSerializer):
"""
JSON format serialization.
Properties:
CONTENT_TYPE: str = "application/json"
Methods:
serialize(data: object) -> bytes
Serialize to JSON bytes.
Parameters:
data: object - JSON-serializable object
Returns:
bytes: UTF-8 encoded JSON
Raises:
TypeError: If data not JSON-serializable
Supported Types:
- Dictionaries
- Lists
- Primitives (str, int, float, bool, None)
- Custom objects only via a custom encoder (json.dumps does not serialize arbitrary objects with __dict__ by default)
Notes:
- Uses json.dumps() internally
- Handles nested structures
- Float precision: double precision
"""Usage:
from sagemaker.core.serializers import JSONSerializer
serializer = JSONSerializer()
# Serialize dictionary
data = {
"features": [1.0, 2.0, 3.0],
"metadata": {"id": 123, "timestamp": "2024-01-15"}
}
payload = serializer.serialize(data)
# Use with endpoint
response = endpoint.invoke(
body=payload,
content_type=serializer.CONTENT_TYPE
)
# Serialize list
batch_data = [
{"features": [1.0, 2.0, 3.0]},
{"features": [4.0, 5.0, 6.0]}
]
payload = serializer.serialize(batch_data)

class CSVSerializer(SimpleBaseSerializer):
"""
CSV format serialization.
Properties:
CONTENT_TYPE: str = "text/csv"
Methods:
serialize(data: object) -> bytes
Serialize to CSV bytes.
Parameters:
data: object - Data to serialize
Returns:
bytes: CSV formatted bytes
Raises:
ValueError: If data format unsupported
Supported Types:
- Lists of lists (rows): [[1.0, 2.0], [3.0, 4.0]]
- NumPy arrays: np.array([[1, 2], [3, 4]])
- Pandas DataFrames: pd.DataFrame(...)
- Single row as list: [1.0, 2.0, 3.0]
Notes:
- No header row included
- Values comma-separated
- Newline-separated rows
- DataFrames serialized without index/column names
"""Usage:
from sagemaker.core.serializers import CSVSerializer
import pandas as pd
import numpy as np
serializer = CSVSerializer()
# Serialize list of rows
data = [
[1.0, 2.0, 3.0],
[4.0, 5.0, 6.0]
]
payload = serializer.serialize(data)
# Result: b"1.0,2.0,3.0\n4.0,5.0,6.0\n"
# Serialize pandas DataFrame
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'b', 'c'])
payload = serializer.serialize(df)
# Serialize NumPy array
arr = np.array([[1, 2, 3], [4, 5, 6]])
payload = serializer.serialize(arr)
# Single row
single_row = [1.0, 2.0, 3.0]
payload = serializer.serialize(single_row)

class NumpySerializer(BaseSerializer):
"""
NumPy array serialization to NPY format.
Properties:
CONTENT_TYPE: str = "application/x-npy"
Methods:
serialize(data: np.ndarray) -> bytes
Serialize NumPy array.
Parameters:
data: np.ndarray - NumPy array (required)
Returns:
bytes: NPY format bytes
Raises:
ValueError: If data not NumPy array
Supported Types:
- NumPy ndarrays of any dtype
- Preserves dtype and shape information
- Supports: float32, float64, int32, int64, bool, etc.
Notes:
- Binary format, more efficient than CSV/JSON
- Preserves numeric precision
- Shape and dtype encoded in format
"""Usage:
from sagemaker.core.serializers import NumpySerializer
import numpy as np
serializer = NumpySerializer()
# Serialize NumPy array
data = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)
payload = serializer.serialize(data)
# Preserves shape and dtype
multi_dim = np.random.randn(2, 3, 4) # 3D array
payload = serializer.serialize(multi_dim)
# Integer arrays
int_data = np.array([1, 2, 3, 4, 5], dtype=np.int64)
payload = serializer.serialize(int_data)

class TorchTensorSerializer(BaseSerializer):
"""
PyTorch tensor serialization.
Properties:
CONTENT_TYPE: str = "application/x-torch"
Methods:
serialize(data: torch.Tensor) -> bytes
Serialize PyTorch tensor.
Parameters:
data: torch.Tensor - PyTorch tensor (required)
Returns:
bytes: Serialized tensor
Raises:
ValueError: If data not torch.Tensor
Supported Types:
- PyTorch tensors of any dtype
- Preserves dtype and shape
- GPU tensors are automatically moved to CPU before serialization (device placement is not preserved)
Notes:
- Uses torch.save() internally
- Binary format
- Larger than NumPy for same data
"""class JSONLinesSerializer(SimpleBaseSerializer):
"""
JSON Lines format (newline-delimited JSON).
Properties:
CONTENT_TYPE: str = "application/jsonlines"
Methods:
serialize(data: List[dict]) -> bytes
Serialize to JSON Lines.
Parameters:
data: List[dict] - List of dictionaries (required)
Returns:
bytes: JSON Lines formatted bytes
Raises:
ValueError: If data not list of dicts
Supported Types:
- List of dictionaries
- Each dict becomes one JSON line
Notes:
- One JSON object per line
- No commas between objects
- Useful for batch data
"""Usage:
from sagemaker.core.serializers import JSONLinesSerializer
serializer = JSONLinesSerializer()
# Serialize list of records
data = [
{"text": "example 1", "label": 0, "score": 0.8},
{"text": "example 2", "label": 1, "score": 0.92},
{"text": "example 3", "label": 0, "score": 0.75}
]
payload = serializer.serialize(data)
# Result: b'{"text":"example 1","label":0,"score":0.8}\n{"text":"example 2",...}\n...'class StringSerializer(SimpleBaseSerializer):
"""
String serialization.
Properties:
CONTENT_TYPE: str = "text/plain"
Methods:
serialize(data: str) -> bytes
Serialize string to UTF-8 bytes.
Parameters:
data: str - String to serialize (required)
Returns:
bytes: UTF-8 encoded bytes
Supported Types:
- Strings only
Notes:
- UTF-8 encoding
- For plain text data
"""class DataSerializer(BaseSerializer):
"""
Generic data serialization with configurable content type.
Parameters:
content_type: str - Content type (default: "application/octet-stream")
Methods:
serialize(data: bytes) -> bytes
Pass through bytes unchanged.
Parameters:
data: bytes - Pre-serialized bytes (required)
Returns:
bytes: Same bytes
Usage:
For pre-serialized or binary data.
Set custom content_type for special formats.
Notes:
- No transformation applied
- Use when data already in correct format
"""class IdentitySerializer(SimpleBaseSerializer):
"""
Pass-through serialization (no transformation).
Properties:
CONTENT_TYPE: str = "application/octet-stream"
Methods:
serialize(data: bytes) -> bytes
Return data unchanged.
Usage:
When data is already in the correct format.
Notes:
- Equivalent to DataSerializer with default content type
"""class LibSVMSerializer(SimpleBaseSerializer):
"""
LibSVM format serialization for sparse data.
Properties:
CONTENT_TYPE: str = "text/libsvm"
Methods:
serialize(data: object) -> bytes
Serialize to LibSVM format.
Parameters:
data: object - Sparse data structure
Returns:
bytes: LibSVM formatted bytes
LibSVM Format:
label index1:value1 index2:value2 ...
Example:
1 1:0.5 3:0.8 7:1.0
0 2:0.3 5:0.9
Usage:
For sparse feature vectors (SVMs, linear models).
Notes:
- Efficient for sparse data
- One-based indexing
- Used by SageMaker built-in algorithms
"""class SparseMatrixSerializer(BaseSerializer):
"""
Sparse matrix serialization.
Properties:
CONTENT_TYPE: str = "application/x-sparse"
Methods:
serialize(data: scipy.sparse.spmatrix) -> bytes
Serialize sparse matrix.
Parameters:
data: scipy.sparse.spmatrix - Sparse matrix (required)
Returns:
bytes: Serialized sparse matrix
Supported Types:
- SciPy sparse matrices: CSR, CSC, COO formats
- Preserves sparsity pattern and values
Notes:
- Efficient for large sparse matrices
- Preserves matrix format
"""class RecordSerializer(BaseSerializer):
"""
RecordIO-protobuf format serialization.
Properties:
CONTENT_TYPE: str = "application/x-recordio-protobuf"
Methods:
serialize(data: object) -> bytes
Serialize to RecordIO format.
Returns:
bytes: RecordIO-protobuf formatted bytes
Usage:
For SageMaker built-in algorithms requiring RecordIO format.
Examples: Linear Learner, XGBoost, Image Classification.
Notes:
- Binary format
- Efficient for built-in algorithms
- Not human-readable
"""Deserializers convert bytes from SageMaker endpoints back to Python objects.

Deserializers convert bytes from SageMaker endpoints back to Python objects.

class BaseDeserializer:
"""
Abstract base class for deserializers.
Properties:
ACCEPT: str - Accept type for responses
- MIME type string
- Used in HTTP Accept header
Abstract Methods:
deserialize(stream: bytes, content_type: str) -> object
Deserialize bytes to Python object.
Parameters:
stream: bytes - Response bytes (required)
content_type: str - Content type from response (required)
Returns:
object: Deserialized Python object
Raises:
Exception: Deserialization errors
Notes:
- Subclass for custom deserialization
- Implement deserialize() method
- Set ACCEPT class attribute
"""class JSONDeserializer(SimpleBaseDeserializer):
"""
JSON format deserialization.
Properties:
ACCEPT: str = "application/json"
Methods:
deserialize(stream: bytes, content_type: str) -> object
Deserialize from JSON.
Returns:
object: Python dict, list, or primitive
- Dictionaries
- Lists
- Strings, numbers, booleans, None
Notes:
- UTF-8 decoding
- Handles nested structures
"""Usage:
from sagemaker.core.deserializers import JSONDeserializer
deserializer = JSONDeserializer()
# Deserialize response
response = endpoint.invoke(
body=input_data,
accept=deserializer.ACCEPT
)
result = deserializer.deserialize(
response['Body'].read(),
response['ContentType']
)
print(result) # {'predictions': [0.8, 0.2], 'metadata': {...}}

class CSVDeserializer(SimpleBaseDeserializer):
"""
CSV format deserialization.
Properties:
ACCEPT: str = "text/csv"
Methods:
deserialize(stream: bytes, content_type: str) -> List[List]
Deserialize from CSV.
Returns:
List[List[str]]: list of rows, each row a list of string values
- No header parsing
Notes:
- Comma-separated values
- Newline-separated rows
- No type conversion (all strings)
"""class NumpyDeserializer(BaseDeserializer):
"""
NumPy array deserialization from NPY format.
Properties:
ACCEPT: str = "application/x-npy"
Methods:
deserialize(stream: bytes, content_type: str) -> np.ndarray
Deserialize to NumPy array.
Returns:
np.ndarray: NumPy array, with dtype and shape preserved from serialization
Notes:
- Binary format
- Exact reconstruction of array
"""class PandasDeserializer(BaseDeserializer):
"""
Pandas DataFrame deserialization.
Properties:
ACCEPT: str = "text/csv" or "application/json"
- Configurable based on response
Methods:
deserialize(stream: bytes, content_type: str) -> pd.DataFrame
Deserialize to DataFrame.
Parameters:
stream: bytes - Response bytes
content_type: str - Response content type
- "text/csv": Parse as CSV
- "application/json": Parse as JSON
Returns:
pd.DataFrame: Pandas DataFrame, with dtypes inferred automatically
Notes:
- Flexible format support
- Type inference for CSV
- Handles both CSV and JSON responses
"""Usage:
from sagemaker.core.deserializers import PandasDeserializer
import pandas as pd
deserializer = PandasDeserializer()
# Invoke endpoint requesting CSV
response = endpoint.invoke(
body=input_data,
accept="text/csv"
)
df = deserializer.deserialize(
response['Body'].read(),
response['ContentType']
)
print(df.head())
print(df.dtypes)
# Or JSON format
response = endpoint.invoke(body=input_data, accept="application/json")
df = deserializer.deserialize(response['Body'].read(), response['ContentType'])

class TorchTensorDeserializer(BaseDeserializer):
"""
PyTorch tensor deserialization.
Properties:
ACCEPT: str = "application/x-torch"
Methods:
deserialize(stream: bytes, content_type: str) -> torch.Tensor
Deserialize to tensor.
Returns:
torch.Tensor: PyTorch tensor with dtype and shape preserved, loaded to CPU by default
Notes:
- Uses torch.load() internally
- Move to GPU manually if needed
"""class JSONLinesDeserializer(SimpleBaseDeserializer):
"""
JSON Lines format deserialization.
Properties:
ACCEPT: str = "application/jsonlines"
Methods:
deserialize(stream: bytes, content_type: str) -> List[dict]
Deserialize from JSON Lines.
Returns:
List[dict]: list of dictionaries, one per line
Notes:
- Inverse of JSONLinesSerializer
- Handles batch responses
"""class StringDeserializer(SimpleBaseDeserializer):
"""
String deserialization.
Properties:
ACCEPT: str = "text/plain"
Methods:
deserialize(stream: bytes, content_type: str) -> str
Deserialize to string.
Returns:
str: UTF-8 decoded string
Notes:
- For plain text responses
"""class BytesDeserializer(SimpleBaseDeserializer):
"""
Bytes deserialization (no transformation).
Properties:
ACCEPT: str = "application/octet-stream"
Methods:
deserialize(stream: bytes, content_type: str) -> bytes
Return bytes unchanged.
Returns:
bytes: raw bytes, unchanged
Notes:
- Use for binary responses
- Images, audio, compressed data, etc.
"""class StreamDeserializer(BaseDeserializer):
"""
Stream deserialization for large responses.
Properties:
ACCEPT: str = "*/*"
Methods:
deserialize(stream: IO, content_type: str) -> IO
Return stream handle.
Parameters:
stream: IO - Response stream (required)
content_type: str - Content type
Returns:
IO: file-like stream object; read incrementally to avoid loading the entire response into memory
Notes:
- For very large responses
- Memory efficient
- Read in chunks
"""class RecordDeserializer(BaseDeserializer):
"""
RecordIO-protobuf format deserialization.
Properties:
ACCEPT: str = "application/x-recordio-protobuf"
Methods:
deserialize(stream: bytes, content_type: str) -> object
Deserialize from RecordIO.
Returns:
object: deserialized records (exact structure depends on the algorithm)
Notes:
- For built-in algorithm responses
- Binary format
"""from sagemaker.serve import ModelBuilder, SchemaBuilder
from sagemaker.core.serializers import JSONSerializer
from sagemaker.core.deserializers import JSONDeserializer
import numpy as np
# Auto-detect from samples
schema_builder = SchemaBuilder(
sample_input={"features": [1.0, 2.0, 3.0]},
sample_output={"prediction": 0.8, "confidence": 0.95}
)
builder = ModelBuilder(
model=my_model,
schema_builder=schema_builder,
role_arn=role_arn
)
# schema_builder automatically sets appropriate serializers/deserializers
endpoint = builder.deploy(endpoint_name="auto-serialized-endpoint")
# Automatic serialization/deserialization
result = endpoint.invoke(data={"features": [2.0, 3.0, 4.0]})
print(result) # Already deserialized to dict

Usage: manual serialization against an existing endpoint
from sagemaker.core.resources import Endpoint
from sagemaker.core.serializers import CSVSerializer
from sagemaker.core.deserializers import JSONDeserializer
# Get endpoint
endpoint = Endpoint.get(endpoint_name="my-endpoint")
# Attach serializers manually
serializer = CSVSerializer()
deserializer = JSONDeserializer()
# Serialize input
data = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
payload = serializer.serialize(data)
# Invoke endpoint
response = endpoint.invoke(
body=payload,
content_type=serializer.CONTENT_TYPE,
accept=deserializer.ACCEPT
)
# Deserialize response
result = deserializer.deserialize(
response['Body'].read(),
response['ContentType']
)
print(result) # {'predictions': [0.8, 0.2]}

Usage: binary payloads with DataSerializer and BytesDeserializer
from sagemaker.core.serializers import DataSerializer
from sagemaker.core.deserializers import BytesDeserializer
# Custom image serialization
image_serializer = DataSerializer(content_type="image/jpeg")
image_deserializer = BytesDeserializer()
# Read image
with open("image.jpg", "rb") as f:
image_bytes = f.read()
# Send to endpoint
response = endpoint.invoke(
body=image_serializer.serialize(image_bytes),
content_type=image_serializer.content_type,
accept=image_deserializer.ACCEPT
)
# Get binary response
result = image_deserializer.deserialize(
response['Body'].read(),
response['ContentType']
)
# Save result image
with open("result.jpg", "wb") as f:
f.write(result)

Usage: selecting a serializer by data type
from sagemaker.core.serializers import CSVSerializer, JSONSerializer, NumpySerializer
import numpy as np
import pandas as pd
# Different serializers for different data types
csv_serializer = CSVSerializer()
json_serializer = JSONSerializer()
numpy_serializer = NumpySerializer()
def smart_serialize(data):
    """Choose serializer based on data type."""
    if isinstance(data, pd.DataFrame):
        return csv_serializer.serialize(data), csv_serializer.CONTENT_TYPE
    elif isinstance(data, np.ndarray):
        return numpy_serializer.serialize(data), numpy_serializer.CONTENT_TYPE
    elif isinstance(data, (dict, list)):
        return json_serializer.serialize(data), json_serializer.CONTENT_TYPE
    else:
        raise ValueError(f"Unsupported data type: {type(data)}")
# Use appropriate serializer
data = np.array([[1, 2, 3], [4, 5, 6]])
payload, content_type = smart_serialize(data)
response = endpoint.invoke(body=payload, content_type=content_type)

Usage: writing a custom serializer
from sagemaker.core.serializers import BaseSerializer
import pickle
class PickleSerializer(BaseSerializer):
    """Custom pickle serializer for Python objects."""
    CONTENT_TYPE = "application/python-pickle"

    def serialize(self, data):
        """Serialize with pickle."""
        try:
            return pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            raise RuntimeError(f"Pickle serialization failed: {e}")
# Use custom serializer
serializer = PickleSerializer()
payload = serializer.serialize({"data": my_complex_object})
response = endpoint.invoke(
body=payload,
content_type=serializer.CONTENT_TYPE
)

Usage: writing a custom deserializer
from sagemaker.core.deserializers import BaseDeserializer
import pickle
class PickleDeserializer(BaseDeserializer):
    """Custom pickle deserializer."""
    ACCEPT = "application/python-pickle"

    def deserialize(self, stream, content_type):
        """Deserialize with pickle.
        Only unpickle responses from trusted sources: pickle can execute
        arbitrary code during loading."""
        try:
            return pickle.loads(stream)
        except Exception as e:
            raise RuntimeError(f"Pickle deserialization failed: {e}")
# Use custom deserializer
deserializer = PickleDeserializer()
result = deserializer.deserialize(response_bytes, "application/python-pickle")

Usage: compressing payloads with a custom serializer
import gzip
import json
class GzipJSONSerializer(BaseSerializer):
    """JSON serializer with gzip compression."""
    CONTENT_TYPE = "application/json"

    def serialize(self, data):
        """Serialize to JSON, then compress with gzip.
        The serving container must expect gzip-compressed JSON bodies."""
        json_bytes = json.dumps(data).encode('utf-8')
        return gzip.compress(json_bytes)
# Reduces payload size for large JSON
serializer = GzipJSONSerializer()

Common Errors:
Serialization Type Error: the input type is not supported by the chosen serializer (e.g., NumpySerializer raises ValueError for non-ndarray input).
JSON Serialization Error: JSONSerializer raises TypeError when an object is not JSON-serializable; convert it first or supply a custom encoder.
Payload Too Large: the serialized request body exceeds the endpoint's payload size limit; batch fewer records or compress the payload.
Deserialization Error: the response bytes do not match the format the deserializer expects; check the Accept header against the endpoint's actual output format.
Encoding Error: non-UTF-8 bytes were passed to a text-based deserializer (JSON, CSV, String).
Shape Mismatch: a deserialized array's shape differs from what the model or client code expects; verify input shapes before serializing.