# Data Storage

Access to Apify's data storage systems: datasets for structured data and key-value stores for arbitrary records. Both provide persistent, scalable data management for Actor runs and general use.

## Capabilities

### Dataset Operations

Dataset management for structured data storage with support for multiple formats and streaming access.

```python { .api }
class DatasetClient:
    def get(self) -> dict | None:
        """Get dataset information."""

    def update(self, *, name: str | None = None, general_access: StorageGeneralAccess | None = None) -> dict:
        """Update dataset configuration.

        Args:
            name: Dataset name
            general_access: Storage access level (from apify_shared.consts)
        """

    def delete(self) -> None:
        """Delete dataset."""

    def list_items(self, **kwargs) -> ListPage:
        """List dataset items with filtering and pagination.

        Args:
            offset (int, optional): Starting offset
            limit (int, optional): Maximum items to return
            desc (bool, optional): Sort in descending order
            fields (list[str], optional): Fields to include
            omit (list[str], optional): Fields to exclude
            format (str, optional): Response format ('json', 'csv', 'xlsx', etc.)
            clean (bool, optional): Clean items before return
            **kwargs: Additional filtering parameters
        """

    def iterate_items(self, **kwargs) -> Iterator[dict]:
        """Iterate over all dataset items.

        Args:
            offset (int, optional): Starting offset
            limit (int, optional): Maximum items to iterate
            **kwargs: Additional parameters passed to list_items
        """

    def download_items(self, **kwargs) -> bytes:
        """Download items as bytes (deprecated - use get_items_as_bytes)."""

    def get_items_as_bytes(self, **kwargs) -> bytes:
        """Get items as raw bytes.

        Args:
            format (str, optional): Export format
            **kwargs: Additional export parameters
        """

    def stream_items(self, **kwargs) -> Iterator[Response]:
        """Stream items as context manager.

        Args:
            format (str, optional): Stream format
            **kwargs: Additional streaming parameters
        """

    def push_items(self, items: list | dict) -> None:
        """Push items to dataset.

        Args:
            items: Items to push (single item or list of items)
        """

    def get_statistics(self) -> dict | None:
        """Get dataset statistics including item count and size."""

    def create_items_public_url(self, **kwargs) -> str:
        """Generate public URL for dataset items.

        Args:
            format (str, optional): Export format
            **kwargs: Additional URL parameters
        """

class DatasetClientAsync:
    """Async version of DatasetClient with identical methods."""

class DatasetCollectionClient:
    def list(self, **kwargs) -> ListPage[dict]:
        """List datasets.

        Args:
            unnamed (bool, optional): Include unnamed datasets
            limit (int, optional): Maximum number of items
            offset (int, optional): Offset for pagination
            desc (bool, optional): Sort in descending order
        """

    def get_or_create(self, *, name: str | None = None, schema: dict | None = None) -> dict:
        """Get or create dataset.

        Args:
            name: Dataset name
            schema: Dataset schema definition
        """

class DatasetCollectionClientAsync:
    """Async version of DatasetCollectionClient with identical methods."""
```
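
Each sync client has an async counterpart with the same method set. A minimal sketch of the dataset flow using the async client; this assumes the async entry point is exported as `ApifyClientAsync`, as in current versions of the `apify-client` package:

```python
import asyncio

from apify_client import ApifyClientAsync

async def main() -> None:
    client = ApifyClientAsync('your-api-token')

    # Same resource path as the sync client, but methods are awaited
    dataset = await client.datasets().get_or_create(name='async-results')
    dataset_client = client.dataset(dataset['id'])

    await dataset_client.push_items([{'url': 'https://example.com', 'title': 'Example'}])

    # iterate_items yields asynchronously on the async client
    async for item in dataset_client.iterate_items():
        print(item['title'])

asyncio.run(main())
```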

### Key-Value Store Operations

Key-value store management for arbitrary data storage with support for binary data and streaming.

```python { .api }
class KeyValueStoreClient:
    def get(self) -> dict | None:
        """Get key-value store information."""

    def update(self, *, name: str | None = None, general_access: StorageGeneralAccess | None = None) -> dict:
        """Update store configuration.

        Args:
            name: Store name
            general_access: Storage access level (from apify_shared.consts)
        """

    def delete(self) -> None:
        """Delete store."""

    def list_keys(self, **kwargs) -> dict:
        """List keys in the store.

        Args:
            limit (int, optional): Maximum keys to return
            exclusive_start_key (str, optional): Key to start listing from
        """

    def get_record(self, key: str) -> dict | None:
        """Get record by key.

        Args:
            key: Record key
        """

    def record_exists(self, key: str) -> bool:
        """Check if record exists.

        Args:
            key: Record key
        """

    def get_record_as_bytes(self, key: str) -> bytes | None:
        """Get record as raw bytes.

        Args:
            key: Record key
        """

    def stream_record(self, key: str) -> Iterator[dict | None]:
        """Stream record as context manager.

        Args:
            key: Record key
        """

    def set_record(self, key: str, value: Any, content_type: str | None = None) -> None:
        """Set record value.

        Args:
            key: Record key
            value: Record value (dict, str, bytes, etc.)
            content_type: MIME content type
        """

    def delete_record(self, key: str) -> None:
        """Delete record.

        Args:
            key: Record key
        """

    def create_keys_public_url(self, **kwargs) -> str:
        """Generate public URL for accessing keys."""

class KeyValueStoreClientAsync:
    """Async version of KeyValueStoreClient with identical methods."""

class KeyValueStoreCollectionClient:
    def list(self, **kwargs) -> ListPage[dict]:
        """List key-value stores.

        Args:
            unnamed (bool, optional): Include unnamed stores
            limit (int, optional): Maximum number of items
            offset (int, optional): Offset for pagination
            desc (bool, optional): Sort in descending order
        """

    def get_or_create(self, *, name: str | None = None, schema: dict | None = None) -> dict:
        """Get or create key-value store.

        Args:
            name: Store name
            schema: Store schema definition
        """

class KeyValueStoreCollectionClientAsync:
    """Async version of KeyValueStoreCollectionClient with identical methods."""
```
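
The async key-value store clients follow the same pattern. A brief sketch under the same `ApifyClientAsync` assumption:

```python
import asyncio

from apify_client import ApifyClientAsync

async def main() -> None:
    client = ApifyClientAsync('your-api-token')

    # Get or create a store, write one record, and read it back
    store = await client.key_value_stores().get_or_create(name='async-config')
    store_client = client.key_value_store(store['id'])

    await store_client.set_record('status', {'ready': True}, content_type='application/json')
    record = await store_client.get_record('status')
    print(record)

asyncio.run(main())
```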

## Usage Examples

### Dataset Operations

```python
from apify_client import ApifyClient

client = ApifyClient('your-api-token')

# Create or get dataset
dataset = client.datasets().get_or_create(name='web-scraping-results')
dataset_client = client.dataset(dataset['id'])

# Push data to dataset
data = [
    {'url': 'https://example.com', 'title': 'Example Page', 'price': 29.99},
    {'url': 'https://example.org', 'title': 'Another Page', 'price': 39.99},
]
dataset_client.push_items(data)

# List items with pagination (parsed JSON is the default)
items = dataset_client.list_items(limit=100, offset=0)
print(f"Retrieved {items.count} items")

# Iterate over all items
for item in dataset_client.iterate_items():
    print(f"Title: {item['title']}, Price: {item['price']}")

# Export dataset as CSV
csv_data = dataset_client.get_items_as_bytes(format='csv')
with open('results.csv', 'wb') as f:
    f.write(csv_data)

# Get dataset statistics
stats = dataset_client.get_statistics()
if stats is not None:
    print(f"Dataset contains {stats['itemCount']} items")
```
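
For exports too large to buffer in memory, `stream_items` can write the response to disk incrementally. A sketch, assuming the yielded object is the underlying HTTP response (httpx) exposing `iter_bytes()`; parameter names follow the API block above:

```python
# Stream the dataset as CSV straight to disk without loading it all at once
with dataset_client.stream_items(format='csv') as response:
    with open('results-streamed.csv', 'wb') as f:
        for chunk in response.iter_bytes():
            f.write(chunk)
```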

### Key-Value Store Operations

```python
# Create or get key-value store
store = client.key_value_stores().get_or_create(name='app-config')
store_client = client.key_value_store(store['id'])

# Store configuration data
config = {
    'api_endpoint': 'https://api.example.com',
    'timeout': 30,
    'retry_count': 3,
}
store_client.set_record('config', config, content_type='application/json')

# Store binary data
with open('screenshot.png', 'rb') as f:
    image_data = f.read()
store_client.set_record('screenshot', image_data, content_type='image/png')

# Retrieve data (get_record returns a record wrapper; the payload is under 'value')
stored_config = store_client.get_record('config')
print(f"API endpoint: {stored_config['value']['api_endpoint']}")

# Check if record exists
if store_client.record_exists('screenshot'):
    image_bytes = store_client.get_record_as_bytes('screenshot')
    print(f"Screenshot size: {len(image_bytes)} bytes")

# List all keys (returned under 'items', each with 'key' and 'size')
keys = store_client.list_keys()
print(f"Store contains keys: {[k['key'] for k in keys['items']]}")

# Stream large records (the streamed record's 'value' is the raw HTTP response)
with store_client.stream_record('large-file') as record:
    if record is not None:
        for chunk in record['value'].iter_bytes():
            process_chunk(chunk)
```
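
Both storages can also mint shareable links through the `create_items_public_url` and `create_keys_public_url` helpers documented above. A brief sketch reusing the `dataset` and `store_client` handles from the previous examples; the exact URL shape and signing depend on the storage's access settings:

```python
# Shareable link to the dataset items, exported as JSON
items_url = client.dataset(dataset['id']).create_items_public_url(format='json')
print(f"Dataset items: {items_url}")

# Shareable link to the store's keys
keys_url = store_client.create_keys_public_url()
print(f"Store keys: {keys_url}")
```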

### Advanced Data Processing

```python
from datetime import datetime

# Process dataset items in batches
dataset_client = client.dataset('dataset-id')

def process_batch(items):
    # Enrich each item with a processing timestamp and a converted price
    processed = []
    for item in items:
        processed.append({
            **item,
            'processed_at': datetime.now().isoformat(),
            'price_usd': item['price'] * 1.2,  # Convert currency
        })
    return processed

# Create the target dataset once, outside the loop
processed_dataset = client.datasets().get_or_create(name='processed-results')
processed_client = client.dataset(processed_dataset['id'])

# Iterate with batch processing
batch_size = 1000
offset = 0

while True:
    batch = dataset_client.list_items(limit=batch_size, offset=offset)
    if not batch.items:
        break

    # Store processed results
    processed_client.push_items(process_batch(batch.items))

    offset += len(batch.items)
    print(f"Processed {offset} items")
```