# Data Extraction

AI-powered structured data extraction using custom schemas. Supports both immediate extraction with result polling and asynchronous job-based extraction for complex data processing.

## Capabilities

### Complete Data Extraction

Extract structured data from a URL using AI and return complete results, automatically polling for completion. Best for smaller extractions or when you need immediate results.

```python { .api }
def extract(url: str, schema: dict, options: Optional[ExtractOptions] = None) -> ExtractResponse:
    """
    Extract structured data from a URL using AI.

    Parameters:
    - url: str, target URL to extract data from
    - schema: dict, JSON schema defining the data structure to extract
    - options: ExtractOptions, optional extraction configuration

    Returns:
    - ExtractResponse: extracted structured data matching the schema
    """
```

### Asynchronous Data Extraction

Start an extraction job and manage it asynchronously, ideal for complex extractions or when you need to track progress.

```python { .api }
def start_extract(url: str, schema: dict, options: Optional[ExtractOptions] = None) -> str:
    """
    Start an extraction job asynchronously.

    Parameters:
    - url: str, target URL to extract data from
    - schema: dict, JSON schema defining the data structure to extract
    - options: ExtractOptions, optional extraction configuration

    Returns:
    - str: extraction job ID for status tracking
    """

def get_extract_status(extract_id: str) -> ExtractJobStatus:
    """
    Get status of a running extraction job.

    Parameters:
    - extract_id: str, extraction job ID from start_extract

    Returns:
    - ExtractJobStatus: current status and progress information
    """
```
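
The usage examples below poll inline; the two calls also compose into a small reusable helper. A sketch; the helper name and poll interval are illustrative, not part of the SDK:

```python
import time

def wait_for_extract(app, extract_id, poll_interval=5):
    """Poll get_extract_status until the job reaches a terminal state."""
    while True:
        status = app.get_extract_status(extract_id)
        if status.status in ("completed", "failed", "cancelled"):
            return status
        time.sleep(poll_interval)
```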

## Usage Examples

### Basic Data Extraction

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

# Define schema for product information
schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "price": {"type": "number"},
        "description": {"type": "string"},
        "features": {
            "type": "array",
            "items": {"type": "string"}
        },
        "availability": {"type": "string"}
    },
    "required": ["title", "price"]
}

# Extract product data
result = app.extract("https://store.example.com/product/123", schema)
print(result.data)
```

### Complex Schema Extraction

```python
from firecrawl import Firecrawl, ExtractOptions

app = Firecrawl(api_key="your-api-key")

# Complex schema for news article
schema = {
    "type": "object",
    "properties": {
        "headline": {"type": "string"},
        "author": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "bio": {"type": "string"},
                "email": {"type": "string"}
            }
        },
        "published_date": {"type": "string", "format": "date-time"},
        "content": {"type": "string"},
        "tags": {
            "type": "array",
            "items": {"type": "string"}
        },
        "related_articles": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "url": {"type": "string"}
                }
            }
        }
    },
    "required": ["headline", "content", "published_date"]
}

# Extract with options
options = ExtractOptions(
    prompt="Focus on extracting accurate publication dates and author information"
)

result = app.extract("https://news.example.com/article/123", schema, options)
print(f"Headline: {result.data['headline']}")
print(f"Author: {result.data['author']['name']}")
print(f"Tags: {', '.join(result.data['tags'])}")
```

### Asynchronous Extraction

```python
from firecrawl import Firecrawl
import time

app = Firecrawl(api_key="your-api-key")

# Schema for e-commerce catalog
schema = {
    "type": "object",
    "properties": {
        "products": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "price": {"type": "number"},
                    "category": {"type": "string"},
                    "rating": {"type": "number"},
                    "reviews_count": {"type": "integer"}
                }
            }
        },
        "total_products": {"type": "integer"},
        "page_info": {
            "type": "object",
            "properties": {
                "current_page": {"type": "integer"},
                "total_pages": {"type": "integer"}
            }
        }
    }
}

# Start extraction job
extract_id = app.start_extract("https://store.example.com/catalog", schema)
print(f"Started extraction job: {extract_id}")

# Monitor progress
while True:
    status = app.get_extract_status(extract_id)
    print(f"Status: {status.status}")

    if status.status in ["completed", "failed", "cancelled"]:
        break

    time.sleep(5)

# Get results
if status.status == "completed":
    products = status.data['products']
    print(f"Extracted {len(products)} products")
    for product in products[:5]:  # Show first 5
        print(f"- {product['name']}: ${product['price']}")
```

### Multi-Page Data Extraction

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

# Schema for extracting company information
company_schema = {
    "type": "object",
    "properties": {
        "company_name": {"type": "string"},
        "industry": {"type": "string"},
        "founded": {"type": "string"},
        "employees": {"type": "string"},
        "headquarters": {"type": "string"},
        "description": {"type": "string"},
        "key_people": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "position": {"type": "string"}
                }
            }
        },
        "contact": {
            "type": "object",
            "properties": {
                "email": {"type": "string"},
                "phone": {"type": "string"},
                "website": {"type": "string"}
            }
        }
    }
}

# Extract from multiple company pages
companies = []
urls = [
    "https://example1.com/about",
    "https://example2.com/company",
    "https://example3.com/about-us"
]

for url in urls:
    try:
        result = app.extract(url, company_schema)
        companies.append(result.data)
        print(f"Extracted: {result.data['company_name']}")
    except Exception as e:
        print(f"Failed to extract from {url}: {e}")

print(f"Total companies extracted: {len(companies)}")
```
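
For longer URL lists, the same loop can run concurrently with `AsyncFirecrawl` (documented under Async Usage below). A sketch reusing `urls` and `company_schema` from the example above; it assumes the async client's `extract()` mirrors the synchronous one:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def extract_companies(urls, schema):
    app = AsyncFirecrawl(api_key="your-api-key")
    # Run all extractions concurrently; failures come back as exception objects
    results = await asyncio.gather(
        *(app.extract(url, schema) for url in urls),
        return_exceptions=True,
    )
    companies = []
    for url, result in zip(urls, results):
        if isinstance(result, Exception):
            print(f"Failed to extract from {url}: {result}")
        else:
            companies.append(result.data)
    return companies

companies = asyncio.run(extract_companies(urls, company_schema))
```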

## Types

```python { .api }
class ExtractOptions:
    """Configuration options for extraction operations"""
    prompt: Optional[str]  # Additional prompt to guide extraction
    schema_description: Optional[str]  # Description of the schema purpose

class ExtractResponse:
    """Response from extract operation"""
    success: bool
    data: dict  # Extracted data matching the provided schema

class ExtractJobStatus:
    """Status information for extraction job"""
    status: str  # "pending", "running", "completed", "failed", "cancelled"
    job_id: str
    data: Optional[dict]  # Extracted data (available when completed)

class ExtractRequest:
    """Request structure for data extraction"""
    url: str
    schema: dict
    options: Optional[ExtractOptions]
    webhook: Optional[str]  # Webhook URL for completion notification
```
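
`schema_description` does not appear in the examples above; a minimal sketch passing both option fields (the prompt and description strings are illustrative):

```python
from firecrawl import Firecrawl, ExtractOptions

app = Firecrawl(api_key="your-api-key")

schema = {"type": "object", "properties": {"price": {"type": "number"}}}

# Both fields are optional; they guide the extraction model, not the schema itself
options = ExtractOptions(
    prompt="Prefer the price shown in the main product panel",
    schema_description="Price field for a retail product page",
)

result = app.extract("https://store.example.com/product/123", schema, options)
```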

## Schema Design Best Practices

### Simple Schema

```python
# For basic information extraction
simple_schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "content": {"type": "string"},
        "date": {"type": "string"}
    },
    "required": ["title", "content"]
}
```

### Nested Schema

```python
# For complex structured data
nested_schema = {
    "type": "object",
    "properties": {
        "article": {
            "type": "object",
            "properties": {
                "metadata": {
                    "type": "object",
                    "properties": {
                        "title": {"type": "string"},
                        "author": {"type": "string"},
                        "date": {"type": "string"}
                    }
                },
                "content": {
                    "type": "object",
                    "properties": {
                        "body": {"type": "string"},
                        "sections": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "heading": {"type": "string"},
                                    "text": {"type": "string"}
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
```

### Array Schema

```python
# For extracting lists of items
array_schema = {
    "type": "object",
    "properties": {
        "items": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "value": {"type": "string"},
                    "category": {"type": "string"}
                },
                "required": ["name"]
            }
        }
    }
}
```
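
Whichever shape you use, it can help to sanity-check a schema locally before sending it. A sketch using the third-party jsonschema package (an assumption; it is not part of the Firecrawl SDK), reusing `array_schema` from above:

```python
from firecrawl import Firecrawl
from jsonschema import Draft202012Validator, validate

# Raises jsonschema.SchemaError if the schema itself is malformed
Draft202012Validator.check_schema(array_schema)

app = Firecrawl(api_key="your-api-key")
result = app.extract("https://example.com/list", array_schema)

# Confirm the extracted data matches the schema; raises ValidationError on mismatch
validate(instance=result.data, schema=array_schema)
```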

## Error Handling

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "price": {"type": "number"}
    }
}

try:
    result = app.extract("https://example.com/product", schema)
    if result.success:
        print(f"Extracted: {result.data}")
    else:
        print("Extraction failed")
except Exception as e:
    print(f"Error during extraction: {e}")
```
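
For transient failures such as timeouts or rate limits, a retry with exponential backoff is often enough. A minimal sketch reusing `app` and `schema` from the example above; the attempt count and delays are illustrative, and it assumes failures surface as exceptions:

```python
import time

def extract_with_retry(app, url, schema, attempts=3, base_delay=2.0):
    """Retry app.extract with exponential backoff between attempts."""
    for attempt in range(attempts):
        try:
            return app.extract(url, schema)
        except Exception as e:
            if attempt == attempts - 1:
                raise  # out of retries; let the caller handle it
            delay = base_delay * (2 ** attempt)
            print(f"Attempt {attempt + 1} failed ({e}); retrying in {delay:.0f}s")
            time.sleep(delay)

result = extract_with_retry(app, "https://example.com/product", schema)
```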

## Async Usage

All extraction operations have async equivalents:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def extract_async():
    app = AsyncFirecrawl(api_key="your-api-key")

    schema = {"type": "object", "properties": {"title": {"type": "string"}}}

    # Async complete extraction
    result = await app.extract("https://example.com", schema)

    # Async job management
    extract_id = await app.start_extract("https://example.com", schema)
    status = await app.get_extract_status(extract_id)

asyncio.run(extract_async())
```