or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

batch.md · crawling.md · extraction.md · index.md · mapping.md · monitoring.md · scraping.md · search.md · usage.md · v1-api.md

docs/batch.md

# Batch Operations

Concurrent processing of multiple URLs with job monitoring, error handling, and webhook support.

## Core Batch Methods

```typescript { .api }
/**
 * Start a batch scrape job for multiple URLs
 * @param urls - Array of URLs to scrape
 * @param opts - Batch scraping options
 * @returns Promise resolving to job ID and invalid URLs
 */
startBatchScrape(urls: string[], opts?: BatchScrapeOptions): Promise<BatchScrapeResponse>;

/**
 * Get batch scrape job status and partial data
 * @param jobId - Batch job identifier
 * @param pagination - Pagination configuration for results
 * @returns Promise resolving to job status and data
 */
getBatchScrapeStatus(jobId: string, pagination?: PaginationConfig): Promise<BatchScrapeJob>;

/**
 * Retrieve batch scrape errors and robots.txt blocks
 * @param jobId - Batch job identifier
 * @returns Promise resolving to error details
 */
getBatchScrapeErrors(jobId: string): Promise<CrawlErrorsResponse>;

/**
 * Cancel a running batch scrape job
 * @param jobId - Batch job identifier
 * @returns Promise resolving to true if cancelled
 */
cancelBatchScrape(jobId: string): Promise<boolean>;

/**
 * Convenience waiter: start batch scrape and poll until completion
 * @param urls - Array of URLs to scrape
 * @param opts - Batch options plus waiter controls
 * @returns Promise resolving to final job snapshot
 */
batchScrape(urls: string[], opts?: BatchScrapeOptions & { pollInterval?: number; timeout?: number }): Promise<BatchScrapeJob>;
```

## Batch Configuration

```typescript { .api }
interface BatchScrapeOptions {
  // Scraping configuration for all URLs
  options?: ScrapeOptions;

  // Webhook notifications
  webhook?: string | WebhookConfig;

  // Job management
  appendToId?: string;
  ignoreInvalidURLs?: boolean;
  maxConcurrency?: number;

  // Privacy
  zeroDataRetention?: boolean;

  // Idempotency
  idempotencyKey?: string;

  // Integration tracking
  integration?: string;
}
```

## Response Types

```typescript { .api }
// Batch job initiation response
interface BatchScrapeResponse {
  id: string;
  url: string;
  invalidURLs?: string[];
}

// Batch job status and data
interface BatchScrapeJob {
  status: "scraping" | "completed" | "failed" | "cancelled";
  completed: number;
  total: number;
  creditsUsed?: number;
  expiresAt?: string;
  next?: string | null;
  data: Document[];
}
```

## Usage Examples

### Basic Batch Scraping

```typescript
const urls = [
  'https://example.com/page1',
  'https://example.com/page2',
  'https://example.com/page3',
  'https://example.com/page4'
];

const batchJob = await app.batchScrape(urls, {
  options: {
    formats: ['markdown', 'links']
  },
  maxConcurrency: 3
});

console.log(`Scraped ${batchJob.completed} of ${batchJob.total} URLs`);
console.log(batchJob.data); // Array of scraped documents
```

### Async Batch with Status Monitoring

```typescript
const urls = [
  'https://news.example.com/article1',
  'https://news.example.com/article2',
  'https://news.example.com/article3'
];

// Start batch job
const batchResponse = await app.startBatchScrape(urls, {
  options: {
    formats: ['markdown', {
      type: 'json',
      schema: {
        type: 'object',
        properties: {
          title: { type: 'string' },
          author: { type: 'string' },
          publishDate: { type: 'string' },
          content: { type: 'string' }
        }
      }
    }]
  },
  maxConcurrency: 2
});

console.log(`Started batch job: ${batchResponse.id}`);
if (batchResponse.invalidURLs?.length) {
  console.log('Invalid URLs detected:', batchResponse.invalidURLs);
}

// Monitor progress
let job: BatchScrapeJob;
do {
  await new Promise(resolve => setTimeout(resolve, 3000)); // Wait 3 seconds
  job = await app.getBatchScrapeStatus(batchResponse.id);
  console.log(`Progress: ${job.completed}/${job.total} - Status: ${job.status}`);
} while (job.status === 'scraping');

console.log('Batch completed!', job.data.length, 'documents scraped');
```

### Structured Data Extraction

```typescript
import { z } from 'zod';

const ProductSchema = z.object({
  name: z.string(),
  price: z.number(),
  description: z.string(),
  availability: z.enum(['in-stock', 'out-of-stock', 'pre-order']),
  images: z.array(z.string()),
  rating: z.number().optional()
});

const productUrls = [
  'https://shop.example.com/products/laptop-1',
  'https://shop.example.com/products/laptop-2',
  'https://shop.example.com/products/laptop-3'
];

const batchJob = await app.batchScrape(productUrls, {
  options: {
    formats: [{
      type: 'json',
      schema: ProductSchema
    }],
    onlyMainContent: true
  },
  maxConcurrency: 2
});

// Each document.json will be typed as ProductSchema
for (const doc of batchJob.data) {
  console.log('Product:', doc.json); // Typed product data
}
```

### Batch with Webhooks

```typescript
const urls = [
  'https://api.example.com/data1',
  'https://api.example.com/data2',
  'https://api.example.com/data3'
];

const batchJob = await app.batchScrape(urls, {
  options: {
    formats: ['json'],
    headers: {
      'Authorization': 'Bearer api-token'
    }
  },
  webhook: {
    url: 'https://myapp.com/webhooks/batch-complete',
    headers: {
      'X-API-Key': 'webhook-secret'
    },
    metadata: {
      'batchType': 'api-data-sync',
      'userId': '12345'
    },
    events: ['completed', 'failed']
  },
  maxConcurrency: 5
});
```

### Error Handling and Invalid URLs

```typescript
const urls = [
  'https://example.com/valid-page',
  'https://invalid-domain-xyz.com/page',
  'https://example.com/another-valid-page',
  'not-a-valid-url'
];

const batchResponse = await app.startBatchScrape(urls, {
  options: {
    formats: ['markdown']
  },
  ignoreInvalidURLs: true // Continue processing despite invalid URLs
});

// Check which URLs were invalid
if (batchResponse.invalidURLs?.length) {
  console.log('Invalid URLs that were skipped:', batchResponse.invalidURLs);
}

// Monitor and handle errors
const job = await app.getBatchScrapeStatus(batchResponse.id);
if (job.status === 'completed') {
  // Get detailed error information
  const errors = await app.getBatchScrapeErrors(batchResponse.id);

  if (errors.errors.length > 0) {
    console.log('Scraping errors:');
    errors.errors.forEach(error => {
      console.log(`- ${error.url}: ${error.error} (${error.code})`);
    });
  }

  if (errors.robotsBlocked.length > 0) {
    console.log('URLs blocked by robots.txt:', errors.robotsBlocked);
  }
}
```

### Idempotent Operations

```typescript
const urls = [
  'https://example.com/data1',
  'https://example.com/data2'
];

// First request with idempotency key
const batchJob1 = await app.startBatchScrape(urls, {
  options: { formats: ['markdown'] },
  idempotencyKey: 'batch-operation-123'
});

// Duplicate request with same key - will return existing job
const batchJob2 = await app.startBatchScrape(urls, {
  options: { formats: ['markdown'] },
  idempotencyKey: 'batch-operation-123'
});

console.log(batchJob1.id === batchJob2.id); // true
```

### Advanced Batch Configuration

```typescript
const urls = Array.from({ length: 100 }, (_, i) =>
  `https://api.example.com/items/${i + 1}`
);

const batchJob = await app.batchScrape(urls, {
  options: {
    formats: ['json', 'markdown'],
    headers: {
      'User-Agent': 'MyBot/1.0',
      'Accept': 'application/json'
    },
    timeout: 30000,
    mobile: false,
    fastMode: true,
    proxy: 'basic'
  },
  maxConcurrency: 10,
  zeroDataRetention: true,
  ignoreInvalidURLs: true,
  webhook: 'https://myapp.com/batch-webhook'
});
```

### Pagination with Large Results

```typescript
const urls = Array.from({ length: 1000 }, (_, i) =>
  `https://catalog.example.com/item/${i + 1}`
);

// Start large batch job
const batchResponse = await app.startBatchScrape(urls, {
  options: { formats: ['markdown'] },
  maxConcurrency: 20
});

// Get results with pagination
const job = await app.getBatchScrapeStatus(batchResponse.id, {
  autoPaginate: true,
  maxPages: 50,
  maxResults: 5000,
  maxWaitTime: 600 // 10 minutes
});

console.log(`Retrieved ${job.data.length} documents`);
```

### Canceling Batch Jobs

```typescript
// Start a large batch job
const batchResponse = await app.startBatchScrape(urls, {
  options: { formats: ['markdown'] }
});

// Cancel if needed
setTimeout(async () => {
  const cancelled = await app.cancelBatchScrape(batchResponse.id);
  if (cancelled) {
    console.log('Batch job cancelled successfully');
  }
}, 30000); // Cancel after 30 seconds
```