# Batch Processing

Batch operations for processing multiple URLs efficiently. Includes both batch scraping with full result polling and asynchronous job management for large-scale operations.

## Capabilities

### Complete Batch Scraping

Process multiple URLs in batch and return complete results, automatically polling for completion. Best for smaller batches or when you need immediate complete results.

```python { .api }
def batch_scrape(urls: List[str], options: Optional[ScrapeOptions] = None) -> BatchScrapeResponse:
    """
    Scrape multiple URLs in batch and return complete results.

    Parameters:
    - urls: List[str], list of URLs to scrape
    - options: ScrapeOptions, optional configuration applied to all URLs

    Returns:
    - BatchScrapeResponse: complete batch scraping results
    """
```

### Asynchronous Batch Processing

Start a batch scrape job and manage it asynchronously, ideal for large batches or when you need to track progress.

```python { .api }
def start_batch_scrape(urls: List[str], options: Optional[ScrapeOptions] = None) -> str:
    """
    Start a batch scrape job asynchronously.

    Parameters:
    - urls: List[str], list of URLs to scrape
    - options: ScrapeOptions, optional configuration for scraping behavior

    Returns:
    - str: batch job ID for status tracking
    """

def get_batch_scrape_status(batch_id: str) -> BatchScrapeJobStatus:
    """
    Get status of a running batch scrape job.

    Parameters:
    - batch_id: str, batch job ID from start_batch_scrape

    Returns:
    - BatchScrapeJobStatus: current status and progress information
    """

def cancel_batch_scrape(batch_id: str) -> dict:
    """
    Cancel a running batch scrape job.

    Parameters:
    - batch_id: str, batch job ID to cancel

    Returns:
    - dict: cancellation confirmation
    """

# Async clients only
async def wait_batch_scrape(job_id: str, poll_interval: int = 2, timeout: Optional[int] = None) -> BatchScrapeResponse:
    """
    Wait for batch scrape completion with automatic polling (AsyncFirecrawl only).

    Parameters:
    - job_id: str, batch job ID to wait for
    - poll_interval: int, polling interval in seconds (default: 2)
    - timeout: Optional[int], maximum wait time in seconds

    Returns:
    - BatchScrapeResponse: completed batch scrape results
    """
```
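
These signatures combine into a start/poll/cancel workflow. The sketch below is a minimal illustration using only the methods shown above; the 60-second time budget and the example URLs are illustrative, not requirements of the API.

```python
import time
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

# Start the job, then poll its status, cancelling if it exceeds a time budget
batch_id = app.start_batch_scrape(["https://example.com/a", "https://example.com/b"])
deadline = time.time() + 60  # illustrative 60-second budget

while True:
    status = app.get_batch_scrape_status(batch_id)
    if status.status in ["completed", "failed", "cancelled"]:
        break
    if time.time() > deadline:
        # Give up and cancel the job; returns a cancellation confirmation dict
        print(app.cancel_batch_scrape(batch_id))
        break
    time.sleep(2)
```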

### Batch Job Management

Manage and monitor batch jobs with error handling and progress tracking.

```python { .api }
def get_batch_scrape_errors(batch_id: str) -> dict:
    """
    Get errors from a batch scrape job.

    Parameters:
    - batch_id: str, batch job ID

    Returns:
    - dict: error information for failed URLs
    """
```
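
For example, once a job has finished you can pull its error report and log any failures. This is a minimal sketch; the `failed_urls` key mirrors the usage example later in this document and should be treated as an assumption about the error dict's shape.

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")
batch_id = app.start_batch_scrape(["https://example.com/a", "https://example.com/b"])

# ... later, once the job has reached a terminal state ...
status = app.get_batch_scrape_status(batch_id)
if status.status == "failed" or status.completed < status.total:
    errors = app.get_batch_scrape_errors(batch_id)
    # 'failed_urls' is assumed from the usage example below
    for url in errors.get("failed_urls", []):
        print(f"Failed: {url}")
```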

## Usage Examples

### Basic Batch Scraping

```python
from firecrawl import Firecrawl, ScrapeOptions

app = Firecrawl(api_key="your-api-key")

# Simple batch scrape
urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
]
result = app.batch_scrape(urls)
print(f"Scraped {len(result.data)} URLs")

# Batch scrape with options
options = ScrapeOptions(
    formats=["markdown"],
    include_tags=["article", "main"],
    wait_for=1000
)
result = app.batch_scrape(urls, options)
```

### Asynchronous Batch Management

```python
from firecrawl import Firecrawl, ScrapeOptions
import time

app = Firecrawl(api_key="your-api-key")

# Large batch of URLs
urls = [f"https://example.com/page{i}" for i in range(1, 101)]

# Start batch job
batch_id = app.start_batch_scrape(urls, ScrapeOptions(formats=["markdown"]))
print(f"Started batch job: {batch_id}")

# Monitor progress
while True:
    status = app.get_batch_scrape_status(batch_id)
    print(f"Status: {status.status}")
    print(f"Completed: {status.completed}/{status.total}")

    if status.status in ["completed", "failed", "cancelled"]:
        break

    time.sleep(10)

# Get final results
if status.status == "completed":
    print(f"Batch completed with {len(status.data)} pages")
    for doc in status.data:
        print(f"URL: {doc.url}, Content length: {len(doc.content)}")
else:
    # Check for errors
    errors = app.get_batch_scrape_errors(batch_id)
    print(f"Batch failed URLs: {len(errors.get('failed_urls', []))}")
```

### Processing Large URL Lists

```python
from firecrawl import Firecrawl, ScrapeOptions
import csv

app = Firecrawl(api_key="your-api-key")

# Read URLs from CSV file
urls = []
with open('urls.csv', 'r') as file:
    reader = csv.reader(file)
    urls = [row[0] for row in reader]

print(f"Processing {len(urls)} URLs")

# Configure scraping options
options = ScrapeOptions(
    formats=["markdown", "html"],
    include_tags=["article", "main", "content"],
    exclude_tags=["nav", "footer", "sidebar"],
    wait_for=2000
)

# Process in batches to manage resources
batch_size = 50
results = []

for i in range(0, len(urls), batch_size):
    batch_urls = urls[i:i+batch_size]
    print(f"Processing batch {i//batch_size + 1}")

    batch_result = app.batch_scrape(batch_urls, options)
    results.extend(batch_result.data)

    print(f"Completed {len(results)} URLs so far")

print(f"Total scraped: {len(results)} pages")
```

## Types

```python { .api }
class BatchScrapeResponse:
    """Response from batch scrape operation"""
    success: bool
    data: List[Document]

class BatchScrapeJobStatus:
    """Status information for batch scrape job"""
    status: str  # "pending", "running", "completed", "failed", "cancelled"
    job_id: str
    total: int  # Total URLs to scrape
    completed: int  # URLs completed
    data: Optional[List[Document]]  # Results (available when completed)

class BatchScrapeRequest:
    """Request structure for batch scraping"""
    urls: List[str]
    options: Optional[ScrapeOptions]
    webhook: Optional[str]  # Webhook URL for completion notification
```
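
The `completed` and `total` counters on `BatchScrapeJobStatus` are enough to drive a simple progress readout while polling. A minimal sketch, assuming the fields behave as described above; the `report_progress` helper is illustrative, not part of the SDK:

```python
from firecrawl import Firecrawl

def report_progress(status) -> str:
    """Format a BatchScrapeJobStatus as a one-line progress summary."""
    pct = (status.completed / status.total * 100) if status.total else 0.0
    return f"[{status.job_id}] {status.status}: {status.completed}/{status.total} ({pct:.0f}%)"

app = Firecrawl(api_key="your-api-key")
batch_id = app.start_batch_scrape(["https://example.com/a", "https://example.com/b"])
print(report_progress(app.get_batch_scrape_status(batch_id)))
```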

## Error Handling

Batch operations handle individual URL failures gracefully:

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

# Mix of valid and invalid URLs
urls = [
    "https://example.com/valid1",
    "https://invalid-domain-12345.com",  # This will fail
    "https://example.com/valid2",
    "https://httpstat.us/404"  # This will fail
]

result = app.batch_scrape(urls)

# Check results
successful_count = len(result.data)
total_count = len(urls)
failed_count = total_count - successful_count

print(f"Successful: {successful_count}/{total_count}")
print(f"Failed: {failed_count}")

# Individual results contain status information
for doc in result.data:
    if hasattr(doc, 'error'):
        print(f"Failed URL: {doc.url}, Error: {doc.error}")
    else:
        print(f"Success URL: {doc.url}, Content length: {len(doc.content)}")
```
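
Building on the pattern above, failed documents can be collected and retried with a second, smaller batch call. A minimal sketch, assuming failed entries carry the `error` attribute and `url` field used in the previous example:

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")
urls = ["https://example.com/valid1", "https://httpstat.us/404"]

result = app.batch_scrape(urls)

# Collect the URLs of documents that reported an error and retry them once
failed_urls = [doc.url for doc in result.data if hasattr(doc, 'error')]
if failed_urls:
    print(f"Retrying {len(failed_urls)} failed URLs")
    retry_result = app.batch_scrape(failed_urls)
    print(f"Recovered {len(retry_result.data)} documents on retry")
```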

## Async Usage

All batch operations have async equivalents:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def batch_scrape_async():
    app = AsyncFirecrawl(api_key="your-api-key")

    urls = ["https://example.com/1", "https://example.com/2"]

    # Async complete batch scrape
    result = await app.batch_scrape(urls)

    # Async job management
    batch_id = await app.start_batch_scrape(urls)
    status = await app.get_batch_scrape_status(batch_id)

    # Wait for completion (async-specific method)
    final_result = await app.wait_batch_scrape(batch_id)

asyncio.run(batch_scrape_async())
```
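
Because `AsyncFirecrawl` exposes `start_batch_scrape` and `wait_batch_scrape` as coroutines, several batch jobs can also be awaited concurrently. A minimal sketch using only the async methods shown above; the two URL groups are illustrative:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def run_batches_concurrently():
    app = AsyncFirecrawl(api_key="your-api-key")

    url_groups = [
        ["https://example.com/a1", "https://example.com/a2"],
        ["https://example.com/b1", "https://example.com/b2"],
    ]

    # Start one job per group, then wait for all of them at once
    job_ids = await asyncio.gather(*(app.start_batch_scrape(g) for g in url_groups))
    results = await asyncio.gather(*(app.wait_batch_scrape(job_id) for job_id in job_ids))

    for job_id, result in zip(job_ids, results):
        print(f"Job {job_id}: {len(result.data)} documents")

asyncio.run(run_batches_concurrently())
```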