# Data Storage

Access to Apify's data storage systems including datasets for structured data and key-value stores for arbitrary data storage. These storage systems provide persistent, scalable data management for Actor runs and general use.

## Capabilities

### Dataset Operations

Dataset management for structured data storage with support for multiple formats and streaming access.

```python { .api }
class DatasetClient:
    def get(self) -> dict | None:
        """Get dataset information."""

    def update(self, *, name: str | None = None, general_access: StorageGeneralAccess | None = None) -> dict:
        """Update dataset configuration.

        Args:
            name: Dataset name
            general_access: Storage access level (from apify_shared.consts)
        """

    def delete(self) -> None:
        """Delete dataset."""

    def list_items(self, **kwargs) -> ListPage:
        """List dataset items with filtering and pagination.

        Args:
            offset (int, optional): Starting offset
            limit (int, optional): Maximum items to return
            desc (bool, optional): Sort in descending order
            fields (list[str], optional): Fields to include
            omit (list[str], optional): Fields to exclude
            format (str, optional): Response format ('json', 'csv', 'xlsx', etc.)
            clean (bool, optional): Clean items before return
            **kwargs: Additional filtering parameters
        """

    def iterate_items(self, **kwargs) -> Iterator[dict]:
        """Iterate over all dataset items.

        Args:
            offset (int, optional): Starting offset
            limit (int, optional): Maximum items to iterate
            **kwargs: Additional parameters passed to list_items
        """

    def download_items(self, **kwargs) -> bytes:
        """Download items as bytes (deprecated - use get_items_as_bytes)."""

    def get_items_as_bytes(self, **kwargs) -> bytes:
        """Get items as raw bytes.

        Args:
            format (str, optional): Export format
            **kwargs: Additional export parameters
        """

    def stream_items(self, **kwargs) -> Iterator[Response]:
        """Stream items as context manager.

        Args:
            format (str, optional): Stream format
            **kwargs: Additional streaming parameters
        """

    def push_items(self, items: list | dict) -> None:
        """Push items to dataset.

        Args:
            items: Items to push (single item or list of items)
        """

    def get_statistics(self) -> dict | None:
        """Get dataset statistics including item count and size."""

    def create_items_public_url(self, **kwargs) -> str:
        """Generate public URL for dataset items.

        Args:
            format (str, optional): Export format
            **kwargs: Additional URL parameters
        """

class DatasetClientAsync:
    """Async version of DatasetClient with identical methods."""

class DatasetCollectionClient:
    def list(self, **kwargs) -> ListPage[dict]:
        """List datasets.

        Args:
            unnamed (bool, optional): Include unnamed datasets
            limit (int, optional): Maximum number of items
            offset (int, optional): Offset for pagination
            desc (bool, optional): Sort in descending order
        """

    def get_or_create(self, *, name: str | None = None, schema: dict | None = None) -> dict:
        """Get or create dataset.

        Args:
            name: Dataset name
            schema: Dataset schema definition
        """

class DatasetCollectionClientAsync:
    """Async version of DatasetCollectionClient with identical methods."""
```

### Key-Value Store Operations

Key-value store management for arbitrary data storage with support for binary data and streaming.

```python { .api }
class KeyValueStoreClient:
    def get(self) -> dict | None:
        """Get key-value store information."""

    def update(self, *, name: str | None = None, general_access: StorageGeneralAccess | None = None) -> dict:
        """Update store configuration.

        Args:
            name: Store name
            general_access: Storage access level (from apify_shared.consts)
        """

    def delete(self) -> None:
        """Delete store."""

    def list_keys(self, **kwargs) -> dict:
        """List keys in the store.

        Args:
            limit (int, optional): Maximum keys to return
            exclusive_start_key (str, optional): Key to start listing from
        """

    def get_record(self, key: str) -> dict | None:
        """Get record by key.

        Args:
            key: Record key
        """

    def record_exists(self, key: str) -> bool:
        """Check if record exists.

        Args:
            key: Record key
        """

    def get_record_as_bytes(self, key: str) -> bytes | None:
        """Get record as raw bytes.

        Args:
            key: Record key
        """

    def stream_record(self, key: str) -> Iterator[dict | None]:
        """Stream record as context manager.

        Args:
            key: Record key
        """

    def set_record(self, key: str, value: Any, content_type: str | None = None) -> None:
        """Set record value.

        Args:
            key: Record key
            value: Record value (dict, str, bytes, etc.)
            content_type: MIME content type
        """

    def delete_record(self, key: str) -> None:
        """Delete record.

        Args:
            key: Record key
        """

    def create_keys_public_url(self, **kwargs) -> str:
        """Generate public URL for accessing keys."""

class KeyValueStoreClientAsync:
    """Async version of KeyValueStoreClient with identical methods."""

class KeyValueStoreCollectionClient:
    def list(self, **kwargs) -> ListPage[dict]:
        """List key-value stores.

        Args:
            unnamed (bool, optional): Include unnamed stores
            limit (int, optional): Maximum number of items
            offset (int, optional): Offset for pagination
            desc (bool, optional): Sort in descending order
        """

    def get_or_create(self, *, name: str | None = None, schema: dict | None = None) -> dict:
        """Get or create key-value store.

        Args:
            name: Store name
            schema: Store schema definition
        """

class KeyValueStoreCollectionClientAsync:
    """Async version of KeyValueStoreCollectionClient with identical methods."""
```

## Usage Examples

### Dataset Operations

```python
from apify_client import ApifyClient

client = ApifyClient('your-api-token')

# Create or get dataset
dataset = client.datasets().get_or_create(name='web-scraping-results')
dataset_client = client.dataset(dataset['id'])

# Push data to dataset
data = [
    {'url': 'https://example.com', 'title': 'Example Page', 'price': 29.99},
    {'url': 'https://example.org', 'title': 'Another Page', 'price': 39.99}
]
dataset_client.push_items(data)

# List items with pagination
items = dataset_client.list_items(limit=100, offset=0, format='json')
print(f"Retrieved {items.count} items")

# Iterate over all items
for item in dataset_client.iterate_items():
    print(f"Title: {item['title']}, Price: {item['price']}")

# Export dataset as CSV
csv_data = dataset_client.get_items_as_bytes(format='csv')
with open('results.csv', 'wb') as f:
    f.write(csv_data)

# Get dataset statistics
stats = dataset_client.get_statistics()
print(f"Dataset contains {stats['itemCount']} items")
```

### Key-Value Store Operations

```python
# Create or get key-value store
store = client.key_value_stores().get_or_create(name='app-config')
store_client = client.key_value_store(store['id'])

# Store configuration data
config = {
    'api_endpoint': 'https://api.example.com',
    'timeout': 30,
    'retry_count': 3
}
store_client.set_record('config', config, content_type='application/json')

# Store binary data
with open('screenshot.png', 'rb') as f:
    image_data = f.read()
store_client.set_record('screenshot', image_data, content_type='image/png')

# Retrieve data
stored_config = store_client.get_record('config')
print(f"API endpoint: {stored_config['api_endpoint']}")

# Check if record exists
if store_client.record_exists('screenshot'):
    image_bytes = store_client.get_record_as_bytes('screenshot')
    print(f"Screenshot size: {len(image_bytes)} bytes")

# List all keys
keys = store_client.list_keys()
print(f"Store contains keys: {keys['keys']}")

# Stream large records
with store_client.stream_record('large-file') as stream:
    for chunk in stream:
        process_chunk(chunk)
```

### Advanced Data Processing

```python
from datetime import datetime

# Process dataset items in batches
dataset_client = client.dataset('dataset-id')

def process_batch(items):
    # Process items in batch
    processed = []
    for item in items:
        processed.append({
            **item,
            'processed_at': datetime.now().isoformat(),
            'price_usd': item['price'] * 1.2  # Convert currency
        })
    return processed

# Iterate with batch processing
batch_size = 1000
offset = 0

while True:
    batch = dataset_client.list_items(limit=batch_size, offset=offset)
    if not batch.items:
        break

    processed_items = process_batch(batch.items)

    # Store processed results
    processed_dataset = client.datasets().get_or_create(name='processed-results')
    client.dataset(processed_dataset['id']).push_items(processed_items)

    offset += batch_size
    print(f"Processed {offset} items")
```