# Data Processing

This module provides data preprocessing capabilities, including built-in processing containers, custom processing jobs, and framework integration for large-scale data transformation, feature engineering, and model evaluation.

## Capabilities

### Base Processing Classes

Core processing functionality for running data preprocessing, validation, and evaluation jobs on SageMaker managed infrastructure.

```python { .api }
class Processor:
    """
    Base class for SageMaker Processing Jobs.

    Parameters:
    - role (str): IAM role ARN
    - image_uri (str): Docker image URI for processing
    - instance_count (int): Number of processing instances
    - instance_type (str): EC2 instance type
    - output_kms_key (str, optional): KMS key for output encryption
    - volume_size_in_gb (int, optional): Storage volume size
    - volume_kms_key (str, optional): KMS key for volume encryption
    - max_runtime_in_seconds (int, optional): Maximum runtime
    - base_job_name (str, optional): Base name for processing jobs
    - sagemaker_session (Session, optional): SageMaker session
    - env (dict, optional): Environment variables
    - tags (list, optional): Resource tags
    - network_config (NetworkConfig, optional): Network configuration
    """
    def __init__(self, role: str, image_uri: str, instance_count: int,
                 instance_type: str, **kwargs): ...

    def run(self, inputs: List[ProcessingInput] = None,
            outputs: List[ProcessingOutput] = None,
            arguments: List[str] = None, wait: bool = True,
            logs: bool = True, job_name: str = None,
            experiment_config: dict = None, kms_key: str = None): ...

class ScriptProcessor(Processor):
    """
    Processor for running custom scripts with configurable commands.

    Parameters:
    - command (List[str]): Command to run (e.g., ["python3"])
    - code_location (str, optional): S3 location for source code
    - All Processor parameters
    """
    def __init__(self, command: List[str], **kwargs): ...

class FrameworkProcessor(Processor):
    """
    Base class for framework-specific processors with pre-built containers.

    Parameters:
    - estimator_cls: Framework estimator class
    - framework_version (str): Framework version
    - py_version (str, optional): Python version
    - All Processor parameters
    """
    def __init__(self, estimator_cls, framework_version: str, **kwargs): ...
```
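
For the base class, the container's own entrypoint does the work and `run()` only wires up the data channels. A minimal sketch, assuming a custom image already pushed to ECR; the role ARN, image URI, and bucket paths below are placeholders:

```python
from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput

# All identifiers below are placeholders for illustration.
processor = Processor(
    role="arn:aws:iam::123456789012:role/SageMakerRole",  # placeholder IAM role
    image_uri="123456789012.dkr.ecr.us-east-1.amazonaws.com/clean-data:latest",  # placeholder image
    instance_count=1,
    instance_type="ml.m5.xlarge",
    max_runtime_in_seconds=3600,  # fail the job if it runs longer than an hour
)

# The image's ENTRYPOINT runs; inputs are staged to, and outputs collected
# from, the container paths given here.
processor.run(
    inputs=[ProcessingInput(source="s3://my-bucket/raw",
                            destination="/opt/ml/processing/input")],
    outputs=[ProcessingOutput(source="/opt/ml/processing/output",
                              destination="s3://my-bucket/clean")],
)
```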

### Framework-Specific Processors

Pre-built processing containers for popular ML frameworks with optimized environments and dependencies.

```python { .api }
class PyTorchProcessor(FrameworkProcessor):
    """
    Processor for PyTorch-based data processing.

    Parameters:
    - framework_version (str): PyTorch version
    - py_version (str, optional): Python version ("py38", "py39", "py310")
    - pytorch_version (str, optional): Alias for framework_version
    - All FrameworkProcessor parameters
    """
    def __init__(self, framework_version: str, **kwargs): ...

class TensorFlowProcessor(FrameworkProcessor):
    """
    Processor for TensorFlow-based data processing.

    Parameters:
    - framework_version (str): TensorFlow version
    - py_version (str, optional): Python version
    - All FrameworkProcessor parameters
    """
    def __init__(self, framework_version: str, **kwargs): ...

class SKLearnProcessor(FrameworkProcessor):
    """
    Processor for scikit-learn-based data processing.

    Parameters:
    - framework_version (str): Scikit-learn version
    - py_version (str, optional): Python version
    - All FrameworkProcessor parameters
    """
    def __init__(self, framework_version: str, **kwargs): ...

class XGBoostProcessor(FrameworkProcessor):
    """
    Processor for XGBoost-based data processing.

    Parameters:
    - framework_version (str): XGBoost version
    - py_version (str, optional): Python version
    - All FrameworkProcessor parameters
    """
    def __init__(self, framework_version: str, **kwargs): ...

class HuggingFaceProcessor(FrameworkProcessor):
    """
    Processor for Hugging Face Transformers-based processing.

    Parameters:
    - transformers_version (str): Transformers version
    - pytorch_version (str, optional): PyTorch version
    - tensorflow_version (str, optional): TensorFlow version
    - py_version (str, optional): Python version
    - All FrameworkProcessor parameters
    """
    def __init__(self, transformers_version: str, **kwargs): ...

class MXNetProcessor(FrameworkProcessor):
    """
    Processor for MXNet-based data processing.

    Parameters:
    - framework_version (str): MXNet version
    - py_version (str, optional): Python version
    - All FrameworkProcessor parameters
    """
    def __init__(self, framework_version: str, **kwargs): ...
```
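
Framework processors are all used the same way; a hedged sketch with `PyTorchProcessor`, where the version pair, role, and S3 paths are assumptions (check the published container list for valid framework/Python combinations) and `transform.py` is a hypothetical entry-point script:

```python
from sagemaker.pytorch.processing import PyTorchProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Version pair and identifiers below are assumptions for illustration.
pt_processor = PyTorchProcessor(
    framework_version="2.0",      # assumed published PyTorch version
    py_version="py310",
    role="arn:aws:iam::123456789012:role/SageMakerRole",  # placeholder
    instance_type="ml.g4dn.xlarge",  # GPU instance for tensor-heavy preprocessing
    instance_count=1,
)

# `code` points at the script the pre-built container should execute.
pt_processor.run(
    code="transform.py",  # hypothetical entry-point script
    inputs=[ProcessingInput(source="s3://my-bucket/images",
                            destination="/opt/ml/processing/input")],
    outputs=[ProcessingOutput(source="/opt/ml/processing/output",
                              destination="s3://my-bucket/tensors")],
)
```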

### Spark Integration

Apache Spark integration for large-scale distributed data processing on SageMaker.

```python { .api }
class SparkMLProcessor(Processor):
    """
    Processor for Apache Spark MLlib processing jobs.

    Parameters:
    - framework_version (str): Spark version
    - submit_app (str): Path to Spark application
    - submit_py_files (List[str], optional): Python files for Spark context
    - submit_files (List[str], optional): Additional files for Spark
    - submit_jars (List[str], optional): JAR files for Spark
    - submit_class (str, optional): Main class for Spark application
    - All Processor parameters
    """
    def __init__(self, framework_version: str, submit_app: str, **kwargs): ...
```
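
A sketch against the `SparkMLProcessor` interface defined above; the Spark version, image URI, role, and S3 paths are all placeholders, and the application script is hypothetical:

```python
# All values below are placeholders; SparkMLProcessor is used as documented
# in this section.
spark_processor = SparkMLProcessor(
    framework_version="3.3",                       # assumed Spark version
    submit_app="s3://my-bucket/code/etl_job.py",   # hypothetical PySpark script
    submit_py_files=["s3://my-bucket/code/helpers.py"],
    role="arn:aws:iam::123456789012:role/SageMakerRole",  # placeholder
    image_uri="<spark-processing-image-uri>",      # placeholder container image
    instance_type="ml.m5.4xlarge",
    instance_count=4,  # Spark distributes work across all four instances
)

# Arguments are forwarded to the Spark application.
spark_processor.run(
    arguments=["--input", "s3://my-bucket/events/",
               "--output", "s3://my-bucket/aggregates/"],
)
```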

## Processing Input/Output Configuration

```python { .api }
class ProcessingInput:
    """
    Input configuration for processing jobs.

    Parameters:
    - source (str): S3 URI or local path for input data
    - destination (str): Container path where input will be available
    - input_name (str, optional): Name for the input
    - s3_data_type (str, optional): S3 data type ("S3Prefix" or "ManifestFile")
    - s3_input_mode (str, optional): Input mode ("File" or "Pipe")
    - s3_data_distribution_type (str, optional): Distribution type ("FullyReplicated" or "ShardedByS3Key")
    - s3_compression_type (str, optional): Compression type ("None" or "Gzip")
    """
    def __init__(self, source: str, destination: str, **kwargs): ...

class ProcessingOutput:
    """
    Output configuration for processing jobs.

    Parameters:
    - source (str): Container path where output will be generated
    - destination (str): S3 URI for output storage
    - output_name (str, optional): Name for the output
    - s3_upload_mode (str, optional): Upload mode ("Continuous" or "EndOfJob")
    """
    def __init__(self, source: str, destination: str, **kwargs): ...
```
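
When a job runs on multiple instances, `s3_data_distribution_type` decides whether every instance sees the full dataset ("FullyReplicated", the default) or a disjoint shard of the S3 keys ("ShardedByS3Key"). A small sketch with placeholder paths:

```python
from sagemaker.processing import ProcessingInput

# Each instance receives a disjoint subset of the objects under the prefix,
# letting a multi-instance job partition the work. Paths are placeholders.
sharded_input = ProcessingInput(
    source="s3://my-bucket/clickstream/",
    destination="/opt/ml/processing/input",
    s3_data_distribution_type="ShardedByS3Key",
)
```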

## Usage Examples

### Custom Script Processing

```python
from sagemaker import get_execution_role
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

role = get_execution_role()  # IAM role the processing job will assume

# Create a script processor
processor = ScriptProcessor(
    command=["python3"],
    image_uri="your-account.dkr.ecr.region.amazonaws.com/processing:latest",
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1
)

# Run processing job
processor.run(
    code="process.py",  # local or S3 path to the script to execute (placeholder name)
    inputs=[
        ProcessingInput(
            source="s3://bucket/raw-data",
            destination="/opt/ml/processing/input"
        )
    ],
    outputs=[
        ProcessingOutput(
            source="/opt/ml/processing/output",
            destination="s3://bucket/processed-data"
        )
    ],
    arguments=["--input-path", "/opt/ml/processing/input",
               "--output-path", "/opt/ml/processing/output"]
)
```

### Framework Processing

```python
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Create sklearn processor (role as defined in the previous example)
sklearn_processor = SKLearnProcessor(
    framework_version="1.2-1",
    role=role,
    instance_type="ml.m5.large",
    instance_count=1
)

# Run feature engineering
sklearn_processor.run(
    code="preprocess.py",
    inputs=[
        ProcessingInput(
            source="s3://bucket/train.csv",
            destination="/opt/ml/processing/train"
        )
    ],
    outputs=[
        ProcessingOutput(
            source="/opt/ml/processing/features",
            destination="s3://bucket/features"
        )
    ]
)
```