# Crawling Operations

Website crawling functionality for discovering and processing multiple pages from a website. Supports both complete crawling with result polling and asynchronous, job-based crawling for large sites.

## Capabilities

### Complete Crawling

Crawl a website and return complete results, polling automatically until the crawl finishes. Best for smaller sites, or when you need the full result set immediately.

```python { .api }
def crawl(url: str, options: Optional[CrawlOptions] = None) -> CrawlResponse:
    """
    Crawl a website and return complete results.

    Parameters:
    - url: str, target website URL to crawl
    - options: CrawlOptions, optional crawling configuration

    Returns:
    - CrawlResponse: complete crawl results with all discovered pages
    """
```
### Asynchronous Crawling

Start a crawl job and manage it asynchronously, ideal for large websites or when you need to track progress.

```python { .api }
def start_crawl(url: str, options: Optional[CrawlOptions] = None) -> str:
    """
    Start a crawl job asynchronously.

    Parameters:
    - url: str, target website URL to crawl
    - options: CrawlOptions, optional crawling configuration

    Returns:
    - str: crawl job ID for status tracking
    """

def get_crawl_status(crawl_id: str) -> CrawlJobStatus:
    """
    Get status of a running crawl job.

    Parameters:
    - crawl_id: str, crawl job ID from start_crawl

    Returns:
    - CrawlJobStatus: current status and progress information
    """

def cancel_crawl(crawl_id: str) -> dict:
    """
    Cancel a running crawl job.

    Parameters:
    - crawl_id: str, crawl job ID to cancel

    Returns:
    - dict: cancellation confirmation
    """

# Async clients only
async def wait_crawl(job_id: str, poll_interval: int = 2, timeout: Optional[int] = None) -> CrawlResponse:
    """
    Wait for crawl completion with automatic polling (AsyncFirecrawl only).

    Parameters:
    - job_id: str, crawl job ID to wait for
    - poll_interval: int, polling interval in seconds (default: 2)
    - timeout: Optional[int], maximum wait time in seconds

    Returns:
    - CrawlResponse: completed crawl results
    """
```
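For async clients, a minimal sketch of the start-then-wait pattern, assuming the `start_crawl` and `wait_crawl` signatures documented above; the `poll_interval` and `timeout` values are illustrative:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def crawl_and_wait():
    app = AsyncFirecrawl(api_key="your-api-key")

    # Kick off the job and get its ID back immediately
    crawl_id = await app.start_crawl("https://example.com")

    # Wait (asynchronously) until the crawl finishes, checking every 5 seconds
    # and giving up after 10 minutes
    result = await app.wait_crawl(crawl_id, poll_interval=5, timeout=600)
    print(f"Crawled {len(result.data)} pages")

asyncio.run(crawl_and_wait())
```
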
### Crawl Management

Manage and monitor crawl jobs with error handling and active job tracking.

```python { .api }
def get_crawl_errors(crawl_id: str) -> dict:
    """
    Get errors from a crawl job.

    Parameters:
    - crawl_id: str, crawl job ID

    Returns:
    - dict: error information and details
    """

def get_active_crawls() -> List[dict]:
    """
    Get list of active crawl jobs.

    Returns:
    - List[dict]: list of active crawl job information
    """

def crawl_params_preview(options: CrawlOptions) -> dict:
    """
    Preview crawl parameters and estimated scope.

    Parameters:
    - options: CrawlOptions, crawling configuration to preview

    Returns:
    - dict: preview information including estimated pages and cost
    """
```
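`get_active_crawls` and `cancel_crawl` are not exercised in the usage examples below, so here is a brief sketch, assuming the signatures shown above; the job ID passed to `cancel_crawl` is a placeholder:

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

# List jobs that are currently running
for job in app.get_active_crawls():
    print(job)

# Cancel a specific job by its ID (as returned earlier by start_crawl)
confirmation = app.cancel_crawl("crawl-job-id")  # placeholder job ID
print(confirmation)
```
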
## Usage Examples

### Basic Crawling

```python
from firecrawl import Firecrawl, CrawlOptions, ScrapeOptions

app = Firecrawl(api_key="your-api-key")

# Simple crawl
result = app.crawl("https://example.com")
print(f"Crawled {len(result.data)} pages")

# Crawl with options
scrape_options = ScrapeOptions(formats=["markdown"])
crawl_options = CrawlOptions(
    limit=50,
    max_depth=3,
    allowed_domains=["example.com"],
    scrape_options=scrape_options
)
result = app.crawl("https://example.com", crawl_options)
```
### Asynchronous Crawl Management

```python
from firecrawl import Firecrawl, CrawlOptions
import time

app = Firecrawl(api_key="your-api-key")

# Start crawl job
crawl_id = app.start_crawl("https://example.com", CrawlOptions(limit=100))
print(f"Started crawl job: {crawl_id}")

# Monitor progress
while True:
    status = app.get_crawl_status(crawl_id)
    print(f"Status: {status.status}")
    print(f"Completed: {status.completed}/{status.total}")

    if status.status in ["completed", "failed", "cancelled"]:
        break

    time.sleep(10)

# Get final results
if status.status == "completed":
    print(f"Crawl completed with {len(status.data)} pages")
else:
    # Check for errors
    errors = app.get_crawl_errors(crawl_id)
    print(f"Crawl failed: {errors}")
```
### Advanced Crawling

```python
from firecrawl import Firecrawl, CrawlOptions, ScrapeOptions

app = Firecrawl(api_key="your-api-key")

# Advanced crawl configuration
scrape_options = ScrapeOptions(
    formats=["markdown", "html"],
    include_tags=["article", "main", "content"],
    exclude_tags=["nav", "footer", "aside"],
    wait_for=2000
)

crawl_options = CrawlOptions(
    limit=200,
    max_depth=4,
    allowed_domains=["example.com", "blog.example.com"],
    ignored_paths=["/admin", "/api", "/search"],
    scrape_options=scrape_options
)

# Preview crawl scope
preview = app.crawl_params_preview(crawl_options)
print(f"Estimated pages: {preview.get('estimated_pages')}")
print(f"Estimated cost: {preview.get('estimated_credits')}")

# Start crawl
crawl_id = app.start_crawl("https://example.com", crawl_options)
```
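The `webhook` field of `CrawlOptions` (see Types below) is not demonstrated in these examples. A hedged sketch of passing it when starting a job, assuming the field simply takes a callback URL; the endpoint is a placeholder and the notification payload format is not documented here:

```python
from firecrawl import Firecrawl, CrawlOptions

app = Firecrawl(api_key="your-api-key")

# Ask the service to notify your endpoint when the job completes
# (placeholder URL; payload shape not covered in this document)
options = CrawlOptions(limit=100, webhook="https://your-app.example.com/hooks/crawl")
crawl_id = app.start_crawl("https://example.com", options)
```
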
## Types

```python { .api }
class CrawlOptions:
    """Configuration options for crawling operations"""
    limit: Optional[int]  # Maximum pages to crawl (default: 5000)
    max_depth: Optional[int]  # Maximum crawl depth (default: unlimited)
    allowed_domains: Optional[List[str]]  # Domains to crawl
    ignored_paths: Optional[List[str]]  # Paths to ignore
    include_paths: Optional[List[str]]  # Paths to include
    scrape_options: Optional[ScrapeOptions]  # Options for individual page scraping
    webhook: Optional[str]  # Webhook URL for job completion notification

class CrawlResponse:
    """Response from crawl operation"""
    success: bool
    data: List[Document]

class CrawlJobStatus:
    """Status information for crawl job"""
    status: str  # "pending", "running", "completed", "failed", "cancelled"
    job_id: str
    total: int  # Total pages to crawl
    completed: int  # Pages completed
    data: Optional[List[Document]]  # Results (available when completed)

class JobStatusType:
    """Enumeration of job status types"""
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
```
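The status-string comparisons in the examples above can also be written against the `JobStatusType` constants; a small sketch, assuming the class is importable from the package root:

```python
from firecrawl import Firecrawl, JobStatusType  # import location is an assumption

app = Firecrawl(api_key="your-api-key")

status = app.get_crawl_status("crawl-job-id")  # placeholder job ID

# Terminal states, expressed via the enumeration rather than raw strings
if status.status in (JobStatusType.COMPLETED, JobStatusType.FAILED, JobStatusType.CANCELLED):
    print(f"Job finished with status: {status.status}")
```
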
## Async Usage

All crawling operations have async equivalents:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def crawl_async():
    app = AsyncFirecrawl(api_key="your-api-key")

    # Async complete crawl
    result = await app.crawl("https://example.com")

    # Async job management
    crawl_id = await app.start_crawl("https://example.com")
    status = await app.get_crawl_status(crawl_id)

    # Wait for completion (async-specific method)
    final_result = await app.wait_crawl(crawl_id)

asyncio.run(crawl_async())
```
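Because `AsyncFirecrawl` methods are awaitable, several sites can be crawled concurrently with standard `asyncio` tooling. A sketch assuming the same client and `crawl` signature as above:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def crawl_many(urls):
    app = AsyncFirecrawl(api_key="your-api-key")

    # Launch one crawl per URL and wait for all of them to finish
    results = await asyncio.gather(*(app.crawl(url) for url in urls))

    for url, result in zip(urls, results):
        print(f"{url}: {len(result.data)} pages")

asyncio.run(crawl_many(["https://example.com", "https://blog.example.com"]))
```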