# Crawling Operations

Website crawling functionality for discovering and processing multiple pages from a website. Supports both complete crawling with result polling and asynchronous, job-based crawling for large sites.

## Capabilities

### Complete Crawling

Crawl a website and return complete results, polling automatically until the crawl finishes. Best for smaller sites, or when you need the full result set immediately.

```python { .api }
def crawl(url: str, options: Optional[CrawlOptions] = None) -> CrawlResponse:
    """
    Crawl a website and return complete results.

    Parameters:
    - url: str, target website URL to crawl
    - options: CrawlOptions, optional crawling configuration

    Returns:
    - CrawlResponse: complete crawl results with all discovered pages
    """
```
### Asynchronous Crawling

Start a crawl job and manage it asynchronously, ideal for large websites or when you need to track progress.

```python { .api }
def start_crawl(url: str, options: Optional[CrawlOptions] = None) -> str:
    """
    Start a crawl job asynchronously.

    Parameters:
    - url: str, target website URL to crawl
    - options: CrawlOptions, optional crawling configuration

    Returns:
    - str: crawl job ID for status tracking
    """

def get_crawl_status(crawl_id: str) -> CrawlJobStatus:
    """
    Get status of a running crawl job.

    Parameters:
    - crawl_id: str, crawl job ID from start_crawl

    Returns:
    - CrawlJobStatus: current status and progress information
    """

def cancel_crawl(crawl_id: str) -> dict:
    """
    Cancel a running crawl job.

    Parameters:
    - crawl_id: str, crawl job ID to cancel

    Returns:
    - dict: cancellation confirmation
    """

# Async clients only
async def wait_crawl(job_id: str, poll_interval: int = 2, timeout: Optional[int] = None) -> CrawlResponse:
    """
    Wait for crawl completion with automatic polling (AsyncFirecrawl only).

    Parameters:
    - job_id: str, crawl job ID to wait for
    - poll_interval: int, polling interval in seconds (default: 2)
    - timeout: Optional[int], maximum wait time in seconds

    Returns:
    - CrawlResponse: completed crawl results
    """
```
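For async clients, a minimal sketch of the start-then-wait pattern, assuming the `start_crawl` and `wait_crawl` signatures documented above; the `poll_interval` and `timeout` values are illustrative:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def crawl_and_wait():
    app = AsyncFirecrawl(api_key="your-api-key")

    # Kick off the job and get its ID back immediately
    crawl_id = await app.start_crawl("https://example.com")

    # Wait (asynchronously) until the crawl finishes, checking every 5 seconds
    # and giving up after 10 minutes
    result = await app.wait_crawl(crawl_id, poll_interval=5, timeout=600)
    print(f"Crawled {len(result.data)} pages")

asyncio.run(crawl_and_wait())
```
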
### Crawl Management

Manage and monitor crawl jobs with error handling and active job tracking.

```python { .api }
def get_crawl_errors(crawl_id: str) -> dict:
    """
    Get errors from a crawl job.

    Parameters:
    - crawl_id: str, crawl job ID

    Returns:
    - dict: error information and details
    """

def get_active_crawls() -> List[dict]:
    """
    Get list of active crawl jobs.

    Returns:
    - List[dict]: list of active crawl job information
    """

def crawl_params_preview(options: CrawlOptions) -> dict:
    """
    Preview crawl parameters and estimated scope.

    Parameters:
    - options: CrawlOptions, crawling configuration to preview

    Returns:
    - dict: preview information including estimated pages and cost
    """
```
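`get_active_crawls` and `cancel_crawl` are not exercised in the usage examples below, so here is a brief sketch, assuming the signatures shown above; the job ID passed to `cancel_crawl` is a placeholder:

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

# List jobs that are currently running
for job in app.get_active_crawls():
    print(job)

# Cancel a specific job by its ID (as returned earlier by start_crawl)
confirmation = app.cancel_crawl("crawl-job-id")  # placeholder job ID
print(confirmation)
```
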
## Usage Examples

### Basic Crawling

```python
from firecrawl import Firecrawl, CrawlOptions, ScrapeOptions

app = Firecrawl(api_key="your-api-key")

# Simple crawl
result = app.crawl("https://example.com")
print(f"Crawled {len(result.data)} pages")

# Crawl with options
scrape_options = ScrapeOptions(formats=["markdown"])
crawl_options = CrawlOptions(
    limit=50,
    max_depth=3,
    allowed_domains=["example.com"],
    scrape_options=scrape_options
)
result = app.crawl("https://example.com", crawl_options)
```
### Asynchronous Crawl Management

```python
from firecrawl import Firecrawl, CrawlOptions
import time

app = Firecrawl(api_key="your-api-key")

# Start crawl job
crawl_id = app.start_crawl("https://example.com", CrawlOptions(limit=100))
print(f"Started crawl job: {crawl_id}")

# Monitor progress
while True:
    status = app.get_crawl_status(crawl_id)
    print(f"Status: {status.status}")
    print(f"Completed: {status.completed}/{status.total}")

    if status.status in ["completed", "failed", "cancelled"]:
        break

    time.sleep(10)

# Get final results
if status.status == "completed":
    print(f"Crawl completed with {len(status.data)} pages")
else:
    # Check for errors
    errors = app.get_crawl_errors(crawl_id)
    print(f"Crawl failed: {errors}")
```
### Advanced Crawling

```python
from firecrawl import Firecrawl, CrawlOptions, ScrapeOptions

app = Firecrawl(api_key="your-api-key")

# Advanced crawl configuration
scrape_options = ScrapeOptions(
    formats=["markdown", "html"],
    include_tags=["article", "main", "content"],
    exclude_tags=["nav", "footer", "aside"],
    wait_for=2000
)

crawl_options = CrawlOptions(
    limit=200,
    max_depth=4,
    allowed_domains=["example.com", "blog.example.com"],
    ignored_paths=["/admin", "/api", "/search"],
    scrape_options=scrape_options
)

# Preview crawl scope
preview = app.crawl_params_preview(crawl_options)
print(f"Estimated pages: {preview.get('estimated_pages')}")
print(f"Estimated cost: {preview.get('estimated_credits')}")

# Start crawl
crawl_id = app.start_crawl("https://example.com", crawl_options)
```
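The `webhook` field of `CrawlOptions` (see Types below) is not demonstrated in these examples. A hedged sketch of passing it when starting a job, assuming the field simply takes a callback URL; the endpoint is a placeholder and the notification payload format is not documented here:

```python
from firecrawl import Firecrawl, CrawlOptions

app = Firecrawl(api_key="your-api-key")

# Ask the service to notify your endpoint when the job completes
# (placeholder URL; payload shape not covered in this document)
options = CrawlOptions(limit=100, webhook="https://your-app.example.com/hooks/crawl")
crawl_id = app.start_crawl("https://example.com", options)
```
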
## Types

```python { .api }
class CrawlOptions:
    """Configuration options for crawling operations"""
    limit: Optional[int]  # Maximum pages to crawl (default: 5000)
    max_depth: Optional[int]  # Maximum crawl depth (default: unlimited)
    allowed_domains: Optional[List[str]]  # Domains to crawl
    ignored_paths: Optional[List[str]]  # Paths to ignore
    include_paths: Optional[List[str]]  # Paths to include
    scrape_options: Optional[ScrapeOptions]  # Options for individual page scraping
    webhook: Optional[str]  # Webhook URL for job completion notification

class CrawlResponse:
    """Response from crawl operation"""
    success: bool
    data: List[Document]

class CrawlJobStatus:
    """Status information for crawl job"""
    status: str  # "pending", "running", "completed", "failed", "cancelled"
    job_id: str
    total: int  # Total pages to crawl
    completed: int  # Pages completed
    data: Optional[List[Document]]  # Results (available when completed)

class JobStatusType:
    """Enumeration of job status types"""
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
```
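The status-string comparisons in the examples above can also be written against the `JobStatusType` constants; a small sketch, assuming the class is importable from the package root:

```python
from firecrawl import Firecrawl, JobStatusType  # import location is an assumption

app = Firecrawl(api_key="your-api-key")

status = app.get_crawl_status("crawl-job-id")  # placeholder job ID

# Terminal states, expressed via the enumeration rather than raw strings
if status.status in (JobStatusType.COMPLETED, JobStatusType.FAILED, JobStatusType.CANCELLED):
    print(f"Job finished with status: {status.status}")
```
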
## Async Usage

All crawling operations have async equivalents:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def crawl_async():
    app = AsyncFirecrawl(api_key="your-api-key")

    # Async complete crawl
    result = await app.crawl("https://example.com")

    # Async job management
    crawl_id = await app.start_crawl("https://example.com")
    status = await app.get_crawl_status(crawl_id)

    # Wait for completion (async-specific method)
    final_result = await app.wait_crawl(crawl_id)

asyncio.run(crawl_async())
```
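Because `AsyncFirecrawl` methods are awaitable, several sites can be crawled concurrently with standard `asyncio` tooling. A sketch assuming the same client and `crawl` signature as above:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def crawl_many(urls):
    app = AsyncFirecrawl(api_key="your-api-key")

    # Launch one crawl per URL and wait for all of them to finish
    results = await asyncio.gather(*(app.crawl(url) for url in urls))

    for url, result in zip(urls, results):
        print(f"{url}: {len(result.data)} pages")

asyncio.run(crawl_many(["https://example.com", "https://blog.example.com"]))
```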