# Pagination Support

Pagination utilities for handling large result sets across different TAXII versions with automatic page traversal. The `as_pages` function provides a consistent interface for paginated requests regardless of TAXII version.

## Capabilities

### TAXII 2.1 Pagination

TAXII 2.1 uses limit/next parameters for pagination with server-driven continuation tokens.

```python { .api }
def as_pages(func, per_request=0, *args, **kwargs):
    """
    Generator for TAXII 2.1 endpoints supporting pagination.

    Parameters:
    - func (callable): Collection method supporting pagination (get_objects, get_manifest)
    - per_request (int): Number of items to request per page (0 for server default)
    - *args: Positional arguments to pass to the function
    - **kwargs: Keyword arguments to pass to the function (filters, etc.)

    Yields:
    dict: Response envelope for each page containing objects and metadata

    Note:
    - Automatically handles 'next' tokens from server responses
    - Adjusts per_request if server returns different amount than requested
    - Stops when server indicates no more pages available
    """
```

### TAXII 2.0 Pagination

TAXII 2.0 uses start/per_request parameters with HTTP Range headers for pagination.

```python { .api }
def as_pages(func, start=0, per_request=0, *args, **kwargs):
    """
    Generator for TAXII 2.0 endpoints supporting pagination.

    Parameters:
    - func (callable): Collection method supporting pagination (get_objects, get_manifest)
    - start (int): Starting index for pagination (default: 0)
    - per_request (int): Number of items to request per page (0 for server default)
    - *args: Positional arguments to pass to the function
    - **kwargs: Keyword arguments to pass to the function (filters, etc.)

    Yields:
    dict: Response bundle for each page containing objects and metadata

    Note:
    - Uses HTTP Content-Range headers to determine total available items
    - Automatically calculates next start position
    - Handles server-specific Range header format variations
    """
```

## Usage Examples

### Basic Pagination (TAXII 2.1)

```python
from taxii2client import Collection, as_pages

collection = Collection("https://taxii-server.example.com/taxii2/api1/collections/indicators/")

# Paginate through all objects with default page size
total_objects = 0
for page in as_pages(collection.get_objects):
    objects = page.get('objects', [])
    total_objects += len(objects)
    print(f"Page contains {len(objects)} objects (total so far: {total_objects})")

    # Process objects in this page
    for obj in objects:
        print(f" {obj.get('type')}: {obj.get('id')}")

print(f"Total objects retrieved: {total_objects}")
```

### Custom Page Size

```python
# Request 50 objects per page
for page_num, page in enumerate(as_pages(collection.get_objects, per_request=50), 1):
    objects = page.get('objects', [])
    print(f"Page {page_num}: {len(objects)} objects")

    # Check if this is the last page
    if not page.get('more', False):  # TAXII 2.1
        print("This is the last page")
        break

# Request 100 objects per page with filter
for page in as_pages(collection.get_objects, per_request=100, type="indicator"):
    indicators = page.get('objects', [])
    print(f"Retrieved {len(indicators)} indicators")
```

### Paginated Manifest Retrieval

```python
# Paginate through object manifests instead of full objects
total_manifests = 0
for page in as_pages(collection.get_manifest, per_request=200):
    manifests = page.get('objects', [])  # Manifests are in 'objects' array
    total_manifests += len(manifests)

    print(f"Manifest page: {len(manifests)} objects")
    for manifest in manifests:
        obj_id = manifest.get('id')
        versions = manifest.get('versions', [])
        print(f" {obj_id}: {len(versions)} versions")

print(f"Total objects in collection: {total_manifests}")
```

### Filtered Pagination

```python
from datetime import datetime, timezone

# Paginate with date filter
recent_date = datetime(2023, 1, 1, tzinfo=timezone.utc)
for page in as_pages(collection.get_objects, per_request=100, added_after=recent_date):
    objects = page.get('objects', [])
    print(f"Recent objects page: {len(objects)}")

# Paginate with type filter
for page in as_pages(collection.get_objects, per_request=50, type=["indicator", "malware"]):
    objects = page.get('objects', [])
    indicators = [obj for obj in objects if obj.get('type') == 'indicator']
    malware = [obj for obj in objects if obj.get('type') == 'malware']
    print(f"Page: {len(indicators)} indicators, {len(malware)} malware")

# Paginate with multiple filters
filters = {
    'type': 'indicator',
    'added_after': recent_date
}
for page in as_pages(collection.get_objects, per_request=75, **filters):
    indicators = page.get('objects', [])
    print(f"Recent indicators: {len(indicators)}")
```

### TAXII 2.0 Specific Pagination

```python
from taxii2client.v20 import Collection, as_pages

# For TAXII 2.0, as_pages uses start/per_request parameters
collection = Collection("https://taxii2-server.example.com/api1/collections/indicators/")

# Start from beginning with custom page size
for page in as_pages(collection.get_objects, start=0, per_request=100):
    objects = page.get('objects', [])
    print(f"TAXII 2.0 page: {len(objects)} objects")

# Start from specific offset
for page in as_pages(collection.get_objects, start=500, per_request=50):
    objects = page.get('objects', [])
    print(f"Starting from offset 500: {len(objects)} objects")
```

### Processing Large Collections

```python
import time
from datetime import datetime

# Process very large collection with progress tracking
start_time = datetime.now()
total_processed = 0
page_count = 0

try:
    for page in as_pages(collection.get_objects, per_request=1000):
        page_count += 1
        objects = page.get('objects', [])

        # Process objects in batch
        for obj in objects:
            # Your processing logic here
            process_stix_object(obj)

        total_processed += len(objects)
        elapsed = (datetime.now() - start_time).total_seconds()
        rate = total_processed / elapsed if elapsed > 0 else 0

        print(f"Page {page_count}: Processed {len(objects)} objects")
        print(f" Total: {total_processed} objects in {elapsed:.1f}s ({rate:.1f} obj/s)")

        # Optional: Add delay to avoid overwhelming the server
        time.sleep(0.1)

except KeyboardInterrupt:
    print(f"\nInterrupted after processing {total_processed} objects")
except Exception as e:
    print(f"Error during pagination: {e}")

print(f"Final: Processed {total_processed} objects across {page_count} pages")
```

### Memory-Efficient Processing

```python
# Process large datasets without storing everything in memory
def process_collection_efficiently(collection, batch_size=500):
    """Process all objects in collection without loading everything into memory."""

    processed_count = 0
    error_count = 0

    for page in as_pages(collection.get_objects, per_request=batch_size):
        objects = page.get('objects', [])

        for obj in objects:
            try:
                # Process individual object
                result = analyze_stix_object(obj)
                if result:
                    processed_count += 1
            except Exception as e:
                print(f"Error processing {obj.get('id', 'unknown')}: {e}")
                error_count += 1

        # Clear page from memory
        del objects

        # Periodic status update
        if processed_count % 5000 == 0:
            print(f"Processed: {processed_count}, Errors: {error_count}")

    return processed_count, error_count

# Use the efficient processor
success_count, error_count = process_collection_efficiently(collection, batch_size=1000)
print(f"Processing complete: {success_count} successful, {error_count} errors")
```

### Handling Pagination Errors

```python
from taxii2client.exceptions import TAXIIServiceException

def robust_pagination(collection, page_size=100):
    """Paginate with error handling and retry logic."""

    page_count = 0
    total_objects = 0
    retry_count = 0
    max_retries = 3

    try:
        for page in as_pages(collection.get_objects, per_request=page_size):
            try:
                objects = page.get('objects', [])
                page_count += 1
                total_objects += len(objects)

                print(f"Page {page_count}: {len(objects)} objects")

                # Reset retry count on successful page
                retry_count = 0

            except TAXIIServiceException as e:
                retry_count += 1
                print(f"TAXII error on page {page_count + 1}: {e}")

                if retry_count >= max_retries:
                    print(f"Max retries ({max_retries}) exceeded, stopping")
                    break

                print(f"Retrying page {page_count + 1} (attempt {retry_count + 1})")
                time.sleep(2 ** retry_count)  # Exponential backoff

    except Exception as e:
        print(f"Unexpected error during pagination: {e}")

    return total_objects, page_count

total, pages = robust_pagination(collection, page_size=500)
print(f"Retrieved {total} objects across {pages} pages")
```

### Server-Specific Optimizations

```python
# Adapt page size based on server behavior
def adaptive_pagination(collection, initial_page_size=100):
    """Automatically adjust page size based on server responses."""

    page_size = initial_page_size
    total_objects = 0

    # NOTE(review): per_request is fixed when as_pages() is first called below;
    # reassigning page_size inside the loop does not change the size of
    # subsequent requests made by this generator — it only affects the
    # bookkeeping and messages printed here. Confirm against the as_pages
    # implementation before relying on this pattern.
    for page_num, page in enumerate(as_pages(collection.get_objects, per_request=page_size), 1):
        objects = page.get('objects', [])
        actual_size = len(objects)
        total_objects += actual_size

        print(f"Page {page_num}: requested {page_size}, got {actual_size}")

        # Adjust page size based on server response
        if actual_size < page_size * 0.5 and page_size > 50:
            # Server returned much less than requested, reduce page size
            page_size = max(50, page_size // 2)
            print(f" Reducing page size to {page_size}")
        elif actual_size == page_size and page_size < 1000:
            # Server returned exactly what we asked for, try larger pages
            page_size = min(1000, int(page_size * 1.5))
            print(f" Increasing page size to {page_size}")

    return total_objects

total = adaptive_pagination(collection)
print(f"Total objects retrieved with adaptive pagination: {total}")
```