or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

batch.mdcrawling.mdextraction.mdindex.mdmonitoring.mdscraping.mdusage.mdv1-api.md

crawling.mddocs/

0

# Crawling Operations

Website crawling functionality for discovering and processing multiple pages from a website. Supports both complete crawling with result polling and asynchronous job-based crawling for large sites.

## Capabilities

### Complete Crawling

Crawl a website and return complete results, automatically polling for completion. Best for smaller sites or when you need immediate complete results.

```python { .api }
def crawl(url: str, options: Optional[CrawlOptions] = None) -> CrawlResponse:
    """
    Crawl a website and return complete results.

    Parameters:
    - url: str, target website URL to crawl
    - options: CrawlOptions, optional crawling configuration

    Returns:
    - CrawlResponse: complete crawl results with all discovered pages
    """
```

23

24

### Asynchronous Crawling

Start a crawl job and manage it asynchronously, ideal for large websites or when you need to track progress.

```python { .api }
def start_crawl(url: str, options: Optional[CrawlOptions] = None) -> str:
    """
    Start a crawl job asynchronously.

    Parameters:
    - url: str, target website URL to crawl
    - options: CrawlOptions, optional crawling configuration

    Returns:
    - str: crawl job ID for status tracking
    """

def get_crawl_status(crawl_id: str) -> CrawlJobStatus:
    """
    Get status of a running crawl job.

    Parameters:
    - crawl_id: str, crawl job ID from start_crawl

    Returns:
    - CrawlJobStatus: current status and progress information
    """

def cancel_crawl(crawl_id: str) -> dict:
    """
    Cancel a running crawl job.

    Parameters:
    - crawl_id: str, crawl job ID to cancel

    Returns:
    - dict: cancellation confirmation
    """

# Async clients only
async def wait_crawl(job_id: str, poll_interval: int = 2, timeout: Optional[int] = None) -> CrawlResponse:
    """
    Wait for crawl completion with automatic polling (AsyncFirecrawl only).

    Parameters:
    - job_id: str, crawl job ID to wait for
    - poll_interval: int, polling interval in seconds (default: 2)
    - timeout: Optional[int], maximum wait time in seconds

    Returns:
    - CrawlResponse: completed crawl results
    """
```

77

78

### Crawl Management

Manage and monitor crawl jobs with error handling and active job tracking.

```python { .api }
def get_crawl_errors(crawl_id: str) -> dict:
    """
    Get errors from a crawl job.

    Parameters:
    - crawl_id: str, crawl job ID

    Returns:
    - dict: error information and details
    """

def get_active_crawls() -> List[dict]:
    """
    Get list of active crawl jobs.

    Returns:
    - List[dict]: list of active crawl job information
    """

def crawl_params_preview(options: CrawlOptions) -> dict:
    """
    Preview crawl parameters and estimated scope.

    Parameters:
    - options: CrawlOptions, crawling configuration to preview

    Returns:
    - dict: preview information including estimated pages and cost
    """
```

113

114

## Usage Examples

### Basic Crawling

```python
from firecrawl import Firecrawl, CrawlOptions, ScrapeOptions

app = Firecrawl(api_key="your-api-key")

# Simple crawl
result = app.crawl("https://example.com")
print(f"Crawled {len(result.data)} pages")

# Crawl with options
scrape_options = ScrapeOptions(formats=["markdown"])
crawl_options = CrawlOptions(
    limit=50,
    max_depth=3,
    allowed_domains=["example.com"],
    scrape_options=scrape_options
)
result = app.crawl("https://example.com", crawl_options)
```

137

138

### Asynchronous Crawl Management

```python
from firecrawl import Firecrawl, CrawlOptions
import time

app = Firecrawl(api_key="your-api-key")

# Start crawl job
crawl_id = app.start_crawl("https://example.com",
                           CrawlOptions(limit=100))
print(f"Started crawl job: {crawl_id}")

# Monitor progress
while True:
    status = app.get_crawl_status(crawl_id)
    print(f"Status: {status.status}")
    print(f"Completed: {status.completed}/{status.total}")

    if status.status in ["completed", "failed", "cancelled"]:
        break

    time.sleep(10)

# Get final results
if status.status == "completed":
    print(f"Crawl completed with {len(status.data)} pages")
else:
    # Check for errors
    errors = app.get_crawl_errors(crawl_id)
    print(f"Crawl failed: {errors}")
```

170

171

### Advanced Crawling

```python
from firecrawl import Firecrawl, CrawlOptions, ScrapeOptions

app = Firecrawl(api_key="your-api-key")

# Advanced crawl configuration
scrape_options = ScrapeOptions(
    formats=["markdown", "html"],
    include_tags=["article", "main", "content"],
    exclude_tags=["nav", "footer", "aside"],
    wait_for=2000
)

crawl_options = CrawlOptions(
    limit=200,
    max_depth=4,
    allowed_domains=["example.com", "blog.example.com"],
    ignored_paths=["/admin", "/api", "/search"],
    scrape_options=scrape_options
)

# Preview crawl scope
preview = app.crawl_params_preview(crawl_options)
print(f"Estimated pages: {preview.get('estimated_pages')}")
print(f"Estimated cost: {preview.get('estimated_credits')}")

# Start crawl
crawl_id = app.start_crawl("https://example.com", crawl_options)
```

202

203

## Types

```python { .api }
class CrawlOptions:
    """Configuration options for crawling operations"""
    limit: Optional[int]  # Maximum pages to crawl (default: 5000)
    max_depth: Optional[int]  # Maximum crawl depth (default: unlimited)
    allowed_domains: Optional[List[str]]  # Domains to crawl
    ignored_paths: Optional[List[str]]  # Paths to ignore
    include_paths: Optional[List[str]]  # Paths to include
    scrape_options: Optional[ScrapeOptions]  # Options for individual page scraping
    webhook: Optional[str]  # Webhook URL for job completion notification

class CrawlResponse:
    """Response from crawl operation"""
    success: bool
    data: List[Document]

class CrawlJobStatus:
    """Status information for crawl job"""
    status: str  # "pending", "running", "completed", "failed", "cancelled"
    job_id: str
    total: int  # Total pages to crawl
    completed: int  # Pages completed
    data: Optional[List[Document]]  # Results (available when completed)

class JobStatusType:
    """Enumeration of job status types"""
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
```

237

238

## Async Usage

All crawling operations have async equivalents:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def crawl_async():
    app = AsyncFirecrawl(api_key="your-api-key")

    # Async complete crawl
    result = await app.crawl("https://example.com")

    # Async job management
    crawl_id = await app.start_crawl("https://example.com")
    status = await app.get_crawl_status(crawl_id)

    # Wait for completion (async-specific method)
    final_result = await app.wait_crawl(crawl_id)

asyncio.run(crawl_async())
```