or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

classification.md clustering.md datasets.md evaluation.md feature-engineering.md file-io.md index.md math-utils.md pattern-mining.md plotting.md preprocessing.md regression.md text-processing.md utilities.md

datasets.mddocs/

0

# Dataset Loading

1

2

Utilities for loading common machine learning datasets and generating synthetic data for experimentation and learning.

3

4

## Capabilities

5

6

### Classification Datasets

7

8

Standard datasets for classification tasks.

9

10

```python { .api }

11

def iris_data():

12

"""

13

Load the Iris flower classification dataset.

14

15

Returns:

16

- X: array, feature matrix (150 samples, 4 features)

17

- y: array, class labels (3 classes: setosa, versicolor, virginica)

18

"""

19

20

def wine_data():

21

"""

22

Load the Wine recognition dataset.

23

24

Returns:

25

- X: array, feature matrix (178 samples, 13 features)

26

- y: array, class labels (3 wine types)

27

"""

28

```

29

30

### Regression Datasets

31

32

Standard datasets for regression tasks.

33

34

```python { .api }

35

def boston_housing_data():

36

"""

37

Load the Boston Housing dataset.

38

39

Returns:

40

- X: array, feature matrix (506 samples, 13 features)

41

- y: array, housing prices (regression targets)

42

"""

43

44

def autompg_data():

45

"""

46

Load the Auto MPG dataset for regression.

47

48

Returns:

49

- X: array, feature matrix (392 samples, 8 features; rows with missing values removed)

50

- y: array, miles per gallon (regression target)

51

"""

52

```

53

54

### Image Datasets

55

56

Image recognition datasets.

57

58

```python { .api }

59

def mnist_data():

60

"""

61

Load the MNIST handwritten digit dataset.

62

63

Returns:

64

- X: array, image data (5000-sample subset of MNIST, 784 features)

65

- y: array, digit labels (0-9)

66

"""

67

68

def loadlocal_mnist(images_path, labels_path):

69

"""

70

Load MNIST data from local files.

71

72

Parameters:

73

- images_path: str, path to images file

74

- labels_path: str, path to labels file

75

76

Returns:

77

- X: array, image data

78

- y: array, digit labels

79

"""

80

```

81

82

### Synthetic Datasets

83

84

Functions for generating synthetic datasets.

85

86

```python { .api }

87

def three_blobs_data():

88

"""

89

Generate three Gaussian blobs for clustering.

90

91

Returns:

92

- X: array, feature matrix (150 samples, 2 features)

93

- y: array, cluster labels (3 clusters)

94

"""

95

96

def make_multiplexer_dataset(address_bits, sample_size, positive_class_ratio=0.5):

97

"""

98

Generate multiplexer boolean function dataset.

99

100

Parameters:

101

- address_bits: int, number of address bits

102

- sample_size: int, number of samples to generate

103

- positive_class_ratio: float, ratio of positive class samples

104

105

Returns:

106

- X: array, binary feature matrix

107

- y: array, binary class labels

108

"""

109

```

110

111

## Usage Examples

112

113

### Loading Standard Datasets

114

115

```python

116

from mlxtend.data import iris_data, wine_data, boston_housing_data

117

import matplotlib.pyplot as plt

118

119

# Load Iris dataset

120

X_iris, y_iris = iris_data()

121

print(f"Iris dataset: {X_iris.shape[0]} samples, {X_iris.shape[1]} features, {len(set(y_iris))} classes")

122

123

# Load Wine dataset

124

X_wine, y_wine = wine_data()

125

print(f"Wine dataset: {X_wine.shape[0]} samples, {X_wine.shape[1]} features, {len(set(y_wine))} classes")

126

127

# Load Boston Housing dataset

128

X_boston, y_boston = boston_housing_data()

129

print(f"Boston Housing: {X_boston.shape[0]} samples, {X_boston.shape[1]} features")

130

131

# Visualize Iris data

132

plt.figure(figsize=(8, 6))

133

colors = ['red', 'green', 'blue']

134

for i, color in enumerate(colors):

135

mask = y_iris == i

136

plt.scatter(X_iris[mask, 0], X_iris[mask, 1], c=color, label=f'Class {i}')

137

plt.xlabel('Sepal Length')

138

plt.ylabel('Sepal Width')

139

plt.title('Iris Dataset Visualization')

140

plt.legend()

141

plt.show()

142

```

143

144

### Working with MNIST Data

145

146

```python

147

from mlxtend.data import mnist_data

148

import matplotlib.pyplot as plt

149

import numpy as np

150

151

# Load MNIST dataset

152

X, y = mnist_data()

153

print(f"MNIST dataset: {X.shape[0]} samples, {X.shape[1]} features")

154

155

# Display sample digits

156

fig, axes = plt.subplots(2, 5, figsize=(10, 4))

157

for i, ax in enumerate(axes.flat):

158

# Reshape flat vector to 28x28 image

159

image = X[i].reshape(28, 28)

160

ax.imshow(image, cmap='gray')

161

ax.set_title(f'Label: {y[i]}')

162

ax.axis('off')

163

plt.tight_layout()

164

plt.show()

165

```

166

167

### Generating Synthetic Data

168

169

```python

170

from mlxtend.data import three_blobs_data, make_multiplexer_dataset

171

import matplotlib.pyplot as plt
import numpy as np

172

173

# Generate three blobs for clustering

174

X_blobs, y_blobs = three_blobs_data()

175

plt.figure(figsize=(8, 6))

176

plt.scatter(X_blobs[:, 0], X_blobs[:, 1], c=y_blobs, cmap='viridis')

177

plt.title('Three Blobs Dataset')

178

plt.xlabel('Feature 1')

179

plt.ylabel('Feature 2')

180

plt.show()

181

182

# Generate multiplexer dataset

183

X_mult, y_mult = make_multiplexer_dataset(address_bits=2, sample_size=1000)

184

print(f"Multiplexer dataset: {X_mult.shape[0]} samples, {X_mult.shape[1]} features")

185

print(f"Class distribution: {np.bincount(y_mult)}")

186

```