# Batch Processing

Batch operations for processing multiple URLs efficiently. Includes both batch scraping with full result polling and asynchronous job management for large-scale operations.

## Capabilities

### Complete Batch Scraping

Process multiple URLs in a single batch and return complete results, automatically polling until the job finishes. Best for smaller batches or when you need complete results immediately.

```python { .api }
def batch_scrape(urls: List[str], options: Optional[ScrapeOptions] = None) -> BatchScrapeResponse:
    """
    Scrape multiple URLs in batch and return complete results.

    Parameters:
    - urls: List[str], list of URLs to scrape
    - options: ScrapeOptions, optional configuration applied to all URLs

    Returns:
    - BatchScrapeResponse: complete batch scraping results
    """
```
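
A minimal sketch of the blocking call, assuming a `Firecrawl` client configured as in the usage examples below; the `success` and `data` fields are those documented under Types:

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

# batch_scrape blocks until every URL in the batch has been processed
result = app.batch_scrape(["https://example.com/a", "https://example.com/b"])

# BatchScrapeResponse exposes success and data (see Types below)
if result.success:
    for doc in result.data:
        print(doc.url)
```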

### Asynchronous Batch Processing

Start a batch scrape job and manage it asynchronously. Ideal for large batches or when you need to track progress.

```python { .api }
def start_batch_scrape(urls: List[str], options: Optional[ScrapeOptions] = None) -> str:
    """
    Start a batch scrape job asynchronously.

    Parameters:
    - urls: List[str], list of URLs to scrape
    - options: ScrapeOptions, optional configuration for scraping behavior

    Returns:
    - str: batch job ID for status tracking
    """

def get_batch_scrape_status(batch_id: str) -> BatchScrapeJobStatus:
    """
    Get the status of a running batch scrape job.

    Parameters:
    - batch_id: str, batch job ID from start_batch_scrape

    Returns:
    - BatchScrapeJobStatus: current status and progress information
    """

def cancel_batch_scrape(batch_id: str) -> dict:
    """
    Cancel a running batch scrape job.

    Parameters:
    - batch_id: str, batch job ID to cancel

    Returns:
    - dict: cancellation confirmation
    """

# Async clients only
async def wait_batch_scrape(job_id: str, poll_interval: int = 2, timeout: Optional[int] = None) -> BatchScrapeResponse:
    """
    Wait for batch scrape completion with automatic polling (AsyncFirecrawl only).

    Parameters:
    - job_id: str, batch job ID to wait for
    - poll_interval: int, polling interval in seconds (default: 2)
    - timeout: Optional[int], maximum wait time in seconds

    Returns:
    - BatchScrapeResponse: completed batch scrape results
    """
```
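
A brief sketch of the start-and-wait pattern with the async client, assuming `wait_batch_scrape` polls as described above; the `poll_interval` and `timeout` values are illustrative:

```python
import asyncio
from firecrawl import AsyncFirecrawl, ScrapeOptions

async def start_and_wait():
    app = AsyncFirecrawl(api_key="your-api-key")
    urls = [f"https://example.com/page{i}" for i in range(1, 26)]

    # Kick off the job, then wait (asynchronously) until it finishes
    batch_id = await app.start_batch_scrape(urls, ScrapeOptions(formats=["markdown"]))
    result = await app.wait_batch_scrape(batch_id, poll_interval=5, timeout=600)
    print(f"Finished with {len(result.data)} documents")

asyncio.run(start_and_wait())
```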

### Batch Job Management

Manage and monitor batch jobs with error handling and progress tracking.

```python { .api }
def get_batch_scrape_errors(batch_id: str) -> dict:
    """
    Get errors from a batch scrape job.

    Parameters:
    - batch_id: str, batch job ID

    Returns:
    - dict: error information for failed URLs
    """
```
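
A sketch of cancelling a job and inspecting its errors. The `failed_urls` key follows the error-handling example later in this section; the exact shape of both returned dicts may differ:

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

batch_id = app.start_batch_scrape([f"https://example.com/page{i}" for i in range(1, 51)])

# Cancel the job if it is no longer needed
confirmation = app.cancel_batch_scrape(batch_id)
print(confirmation)

# Inspect per-URL errors; 'failed_urls' is assumed from the error-handling example below
errors = app.get_batch_scrape_errors(batch_id)
for failure in errors.get("failed_urls", []):
    print(failure)
```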

## Usage Examples

### Basic Batch Scraping

```python
from firecrawl import Firecrawl, ScrapeOptions

app = Firecrawl(api_key="your-api-key")

# Simple batch scrape
urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
]
result = app.batch_scrape(urls)
print(f"Scraped {len(result.data)} URLs")

# Batch scrape with options
options = ScrapeOptions(
    formats=["markdown"],
    include_tags=["article", "main"],
    wait_for=1000
)
result = app.batch_scrape(urls, options)
```

### Asynchronous Batch Management

```python
from firecrawl import Firecrawl, ScrapeOptions
import time

app = Firecrawl(api_key="your-api-key")

# Large batch of URLs
urls = [f"https://example.com/page{i}" for i in range(1, 101)]

# Start batch job
batch_id = app.start_batch_scrape(urls, ScrapeOptions(formats=["markdown"]))
print(f"Started batch job: {batch_id}")

# Monitor progress
while True:
    status = app.get_batch_scrape_status(batch_id)
    print(f"Status: {status.status}")
    print(f"Completed: {status.completed}/{status.total}")

    if status.status in ["completed", "failed", "cancelled"]:
        break

    time.sleep(10)

# Get final results
if status.status == "completed":
    print(f"Batch completed with {len(status.data)} pages")
    for doc in status.data:
        print(f"URL: {doc.url}, Content length: {len(doc.content)}")
else:
    # Check for errors
    errors = app.get_batch_scrape_errors(batch_id)
    print(f"Batch failed URLs: {len(errors.get('failed_urls', []))}")
```

### Processing Large URL Lists

```python
from firecrawl import Firecrawl, ScrapeOptions
import csv

app = Firecrawl(api_key="your-api-key")

# Read URLs from CSV file
urls = []
with open('urls.csv', 'r') as file:
    reader = csv.reader(file)
    urls = [row[0] for row in reader]

print(f"Processing {len(urls)} URLs")

# Configure scraping options
options = ScrapeOptions(
    formats=["markdown", "html"],
    include_tags=["article", "main", "content"],
    exclude_tags=["nav", "footer", "sidebar"],
    wait_for=2000
)

# Process in batches to manage resources
batch_size = 50
results = []

for i in range(0, len(urls), batch_size):
    batch_urls = urls[i:i + batch_size]
    print(f"Processing batch {i // batch_size + 1}")

    batch_result = app.batch_scrape(batch_urls, options)
    results.extend(batch_result.data)

    print(f"Completed {len(results)} URLs so far")

print(f"Total scraped: {len(results)} pages")
```

## Types

```python { .api }
class BatchScrapeResponse:
    """Response from batch scrape operation"""
    success: bool
    data: List[Document]

class BatchScrapeJobStatus:
    """Status information for batch scrape job"""
    status: str  # "pending", "running", "completed", "failed", "cancelled"
    job_id: str
    total: int  # Total URLs to scrape
    completed: int  # URLs completed
    data: Optional[List[Document]]  # Results (available when completed)

class BatchScrapeRequest:
    """Request structure for batch scraping"""
    urls: List[str]
    options: Optional[ScrapeOptions]
    webhook: Optional[str]  # Webhook URL for completion notification
```
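
The status fields above are enough to build a small progress report. A sketch, assuming a `BatchScrapeJobStatus` object returned by `get_batch_scrape_status`; the helper name `describe_progress` is illustrative and not part of the SDK:

```python
def describe_progress(status) -> str:
    """Format a one-line summary from a BatchScrapeJobStatus."""
    # Guard against division by zero in case total is not yet populated
    percent = (status.completed / status.total * 100) if status.total else 0.0
    return f"[{status.job_id}] {status.status}: {status.completed}/{status.total} ({percent:.0f}%)"

# Usage: print(describe_progress(app.get_batch_scrape_status(batch_id)))
```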

## Error Handling

Batch operations handle individual URL failures gracefully:

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

# Mix of valid and invalid URLs
urls = [
    "https://example.com/valid1",
    "https://invalid-domain-12345.com",  # This will fail
    "https://example.com/valid2",
    "https://httpstat.us/404"  # This will fail
]

result = app.batch_scrape(urls)

# Check results
successful_count = len(result.data)
total_count = len(urls)
failed_count = total_count - successful_count

print(f"Successful: {successful_count}/{total_count}")
print(f"Failed: {failed_count}")

# Individual results contain status information
for doc in result.data:
    if hasattr(doc, 'error'):
        print(f"Failed URL: {doc.url}, Error: {doc.error}")
    else:
        print(f"Success URL: {doc.url}, Content length: {len(doc.content)}")
```

## Async Usage

All batch operations have async equivalents:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def batch_scrape_async():
    app = AsyncFirecrawl(api_key="your-api-key")

    urls = ["https://example.com/1", "https://example.com/2"]

    # Async complete batch scrape
    result = await app.batch_scrape(urls)

    # Async job management
    batch_id = await app.start_batch_scrape(urls)
    status = await app.get_batch_scrape_status(batch_id)

    # Wait for completion (async-specific method)
    final_result = await app.wait_batch_scrape(batch_id)

asyncio.run(batch_scrape_async())
```
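
The async client also makes it straightforward to run several batches concurrently. A sketch using `asyncio.gather`, assuming the methods documented above; the `run_batches` helper and the URL grouping are illustrative:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def run_batches(url_groups):
    app = AsyncFirecrawl(api_key="your-api-key")
    # Each group is scraped as its own batch; gather awaits them concurrently
    responses = await asyncio.gather(*(app.batch_scrape(group) for group in url_groups))
    return [doc for response in responses for doc in response.data]

groups = [
    ["https://example.com/1", "https://example.com/2"],
    ["https://example.com/3", "https://example.com/4"],
]
docs = asyncio.run(run_batches(groups))
print(f"Scraped {len(docs)} documents across {len(groups)} batches")
```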