# Batch Operations

Concurrent processing of multiple URLs with job monitoring, error handling, and webhook support.

## Core Batch Methods

```typescript { .api }
/**
 * Start a batch scrape job for multiple URLs
 * @param urls - Array of URLs to scrape
 * @param opts - Batch scraping options
 * @returns Promise resolving to job ID and invalid URLs
 */
startBatchScrape(urls: string[], opts?: BatchScrapeOptions): Promise<BatchScrapeResponse>;

/**
 * Get batch scrape job status and partial data
 * @param jobId - Batch job identifier
 * @param pagination - Pagination configuration for results
 * @returns Promise resolving to job status and data
 */
getBatchScrapeStatus(jobId: string, pagination?: PaginationConfig): Promise<BatchScrapeJob>;

/**
 * Retrieve batch scrape errors and robots.txt blocks
 * @param jobId - Batch job identifier
 * @returns Promise resolving to error details
 */
getBatchScrapeErrors(jobId: string): Promise<CrawlErrorsResponse>;

/**
 * Cancel a running batch scrape job
 * @param jobId - Batch job identifier
 * @returns Promise resolving to true if cancelled
 */
cancelBatchScrape(jobId: string): Promise<boolean>;

/**
 * Convenience waiter: start batch scrape and poll until completion
 * @param urls - Array of URLs to scrape
 * @param opts - Batch options plus waiter controls
 * @returns Promise resolving to final job snapshot
 */
batchScrape(urls: string[], opts?: BatchScrapeOptions & { pollInterval?: number; timeout?: number }): Promise<BatchScrapeJob>;
```

## Batch Configuration

```typescript { .api }
interface BatchScrapeOptions {
  // Scraping configuration for all URLs
  options?: ScrapeOptions;

  // Webhook notifications
  webhook?: string | WebhookConfig;

  // Job management
  appendToId?: string;
  ignoreInvalidURLs?: boolean;
  maxConcurrency?: number;

  // Privacy
  zeroDataRetention?: boolean;

  // Idempotency
  idempotencyKey?: string;

  // Integration tracking
  integration?: string;
}
```

## Response Types

```typescript { .api }
// Batch job initiation response
interface BatchScrapeResponse {
  id: string;
  url: string;
  invalidURLs?: string[];
}

// Batch job status and data
interface BatchScrapeJob {
  status: "scraping" | "completed" | "failed" | "cancelled";
  completed: number;
  total: number;
  creditsUsed?: number;
  expiresAt?: string;
  next?: string | null;
  data: Document[];
}
```

## Usage Examples

### Basic Batch Scraping

```typescript
const urls = [
  'https://example.com/page1',
  'https://example.com/page2',
  'https://example.com/page3',
  'https://example.com/page4'
];

const batchJob = await app.batchScrape(urls, {
  options: {
    formats: ['markdown', 'links']
  },
  maxConcurrency: 3
});

console.log(`Scraped ${batchJob.completed} of ${batchJob.total} URLs`);
console.log(batchJob.data); // Array of scraped documents
```

### Async Batch with Status Monitoring

```typescript
const urls = [
  'https://news.example.com/article1',
  'https://news.example.com/article2',
  'https://news.example.com/article3'
];

// Start batch job
const batchResponse = await app.startBatchScrape(urls, {
  options: {
    formats: ['markdown', {
      type: 'json',
      schema: {
        type: 'object',
        properties: {
          title: { type: 'string' },
          author: { type: 'string' },
          publishDate: { type: 'string' },
          content: { type: 'string' }
        }
      }
    }]
  },
  maxConcurrency: 2
});

console.log(`Started batch job: ${batchResponse.id}`);
if (batchResponse.invalidURLs?.length) {
  console.log('Invalid URLs detected:', batchResponse.invalidURLs);
}

// Monitor progress
let job: BatchScrapeJob;
do {
  await new Promise(resolve => setTimeout(resolve, 3000)); // Wait 3 seconds
  job = await app.getBatchScrapeStatus(batchResponse.id);
  console.log(`Progress: ${job.completed}/${job.total} - Status: ${job.status}`);
} while (job.status === 'scraping');

console.log('Batch completed!', job.data.length, 'documents scraped');
```

### Structured Data Extraction

```typescript
import { z } from 'zod';

const ProductSchema = z.object({
  name: z.string(),
  price: z.number(),
  description: z.string(),
  availability: z.enum(['in-stock', 'out-of-stock', 'pre-order']),
  images: z.array(z.string()),
  rating: z.number().optional()
});

const productUrls = [
  'https://shop.example.com/products/laptop-1',
  'https://shop.example.com/products/laptop-2',
  'https://shop.example.com/products/laptop-3'
];

const batchJob = await app.batchScrape(productUrls, {
  options: {
    formats: [{
      type: 'json',
      schema: ProductSchema
    }],
    onlyMainContent: true
  },
  maxConcurrency: 2
});

// Each document.json will be typed as ProductSchema
for (const doc of batchJob.data) {
  console.log('Product:', doc.json); // Typed product data
}
```

### Batch with Webhooks

```typescript
const urls = [
  'https://api.example.com/data1',
  'https://api.example.com/data2',
  'https://api.example.com/data3'
];

const batchJob = await app.batchScrape(urls, {
  options: {
    formats: ['json'],
    headers: {
      'Authorization': 'Bearer api-token'
    }
  },
  webhook: {
    url: 'https://myapp.com/webhooks/batch-complete',
    headers: {
      'X-API-Key': 'webhook-secret'
    },
    metadata: {
      'batchType': 'api-data-sync',
      'userId': '12345'
    },
    events: ['completed', 'failed']
  },
  maxConcurrency: 5
});
```

### Error Handling and Invalid URLs

```typescript
const urls = [
  'https://example.com/valid-page',
  'https://invalid-domain-xyz.com/page',
  'https://example.com/another-valid-page',
  'not-a-valid-url'
];

const batchResponse = await app.startBatchScrape(urls, {
  options: {
    formats: ['markdown']
  },
  ignoreInvalidURLs: true // Continue processing despite invalid URLs
});

// Check which URLs were invalid
if (batchResponse.invalidURLs?.length) {
  console.log('Invalid URLs that were skipped:', batchResponse.invalidURLs);
}

// Monitor and handle errors
const job = await app.getBatchScrapeStatus(batchResponse.id);
if (job.status === 'completed') {
  // Get detailed error information
  const errors = await app.getBatchScrapeErrors(batchResponse.id);

  if (errors.errors.length > 0) {
    console.log('Scraping errors:');
    errors.errors.forEach(error => {
      console.log(`- ${error.url}: ${error.error} (${error.code})`);
    });
  }

  if (errors.robotsBlocked.length > 0) {
    console.log('URLs blocked by robots.txt:', errors.robotsBlocked);
  }
}
```

### Idempotent Operations

```typescript
const urls = [
  'https://example.com/data1',
  'https://example.com/data2'
];

// First request with idempotency key
const batchJob1 = await app.startBatchScrape(urls, {
  options: { formats: ['markdown'] },
  idempotencyKey: 'batch-operation-123'
});

// Duplicate request with same key - will return existing job
const batchJob2 = await app.startBatchScrape(urls, {
  options: { formats: ['markdown'] },
  idempotencyKey: 'batch-operation-123'
});

console.log(batchJob1.id === batchJob2.id); // true
```

### Advanced Batch Configuration

```typescript
const urls = Array.from({ length: 100 }, (_, i) =>
  `https://api.example.com/items/${i + 1}`
);

const batchJob = await app.batchScrape(urls, {
  options: {
    formats: ['json', 'markdown'],
    headers: {
      'User-Agent': 'MyBot/1.0',
      'Accept': 'application/json'
    },
    timeout: 30000,
    mobile: false,
    fastMode: true,
    proxy: 'basic'
  },
  maxConcurrency: 10,
  zeroDataRetention: true,
  ignoreInvalidURLs: true,
  webhook: 'https://myapp.com/batch-webhook'
});
```

### Pagination with Large Results

```typescript
const urls = Array.from({ length: 1000 }, (_, i) =>
  `https://catalog.example.com/item/${i + 1}`
);

// Start large batch job
const batchResponse = await app.startBatchScrape(urls, {
  options: { formats: ['markdown'] },
  maxConcurrency: 20
});

// Get results with pagination
const job = await app.getBatchScrapeStatus(batchResponse.id, {
  autoPaginate: true,
  maxPages: 50,
  maxResults: 5000,
  maxWaitTime: 600 // 10 minutes
});

console.log(`Retrieved ${job.data.length} documents`);
```

### Canceling Batch Jobs

```typescript
// Start a large batch job
const batchResponse = await app.startBatchScrape(urls, {
  options: { formats: ['markdown'] }
});

// Cancel if needed
setTimeout(async () => {
  const cancelled = await app.cancelBatchScrape(batchResponse.id);
  if (cancelled) {
    console.log('Batch job cancelled successfully');
  }
}, 30000); // Cancel after 30 seconds
```