# Web Crawling

Recursive website crawling with configurable limits, path filtering, webhook support, and job monitoring.

## Core Crawling Methods

```typescript { .api }
/**
 * Start an async crawl job
 * @param url - Root URL to crawl
 * @param req - Crawl configuration options
 * @returns Promise resolving to job ID and URL
 */
startCrawl(url: string, req?: CrawlOptions): Promise<CrawlResponse>;

/**
 * Get crawl job status and partial data
 * @param jobId - Crawl job identifier
 * @param pagination - Pagination configuration for results
 * @returns Promise resolving to job status and data
 */
getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob>;

/**
 * Cancel a running crawl job
 * @param jobId - Crawl job identifier
 * @returns Promise resolving to true if cancelled
 */
cancelCrawl(jobId: string): Promise<boolean>;

/**
 * Convenience waiter: start a crawl and poll until completion
 * @param url - Root URL to crawl
 * @param req - Crawl configuration plus waiter controls
 * @returns Promise resolving to final job snapshot
 */
crawl(url: string, req?: CrawlOptions & { pollInterval?: number; timeout?: number }): Promise<CrawlJob>;

/**
 * Retrieve crawl errors and robots.txt blocks
 * @param crawlId - Crawl job identifier
 * @returns Promise resolving to error details
 */
getCrawlErrors(crawlId: string): Promise<CrawlErrorsResponse>;

/**
 * List active crawls for the authenticated team
 * @returns Promise resolving to active crawls list
 */
getActiveCrawls(): Promise<ActiveCrawlsResponse>;

/**
 * Preview normalized crawl parameters from natural language
 * @param url - Root URL
 * @param prompt - Natural language instruction
 * @returns Promise resolving to normalized parameters
 */
crawlParamsPreview(url: string, prompt: string): Promise<Record<string, unknown>>;
```
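
Cancellation is not covered in the usage examples below; here is a minimal sketch, assuming an initialized client `app` as in the other examples:

```typescript
// Start a job, keeping its id so it can be cancelled later
const { id } = await app.startCrawl('https://example.com', { limit: 1000 });

// ... later, e.g. after a user aborts the operation
const cancelled = await app.cancelCrawl(id);
console.log(cancelled ? 'Crawl cancelled' : 'Crawl could not be cancelled');
```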

## Crawl Configuration

```typescript { .api }
interface CrawlOptions {
  // Natural language crawl configuration
  prompt?: string | null;

  // Path filtering
  excludePaths?: string[] | null;
  includePaths?: string[] | null;

  // Crawl behavior
  maxDiscoveryDepth?: number | null;
  sitemap?: "skip" | "include";
  ignoreQueryParameters?: boolean;
  limit?: number | null;
  crawlEntireDomain?: boolean;
  allowExternalLinks?: boolean;
  allowSubdomains?: boolean;

  // Performance control
  delay?: number | null;
  maxConcurrency?: number | null;

  // Notifications
  webhook?: string | WebhookConfig | null;

  // Content processing
  scrapeOptions?: ScrapeOptions | null;

  // Privacy
  zeroDataRetention?: boolean;

  // Integration tracking
  integration?: string;
}
```
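
Because `startCrawl` and the `crawl` waiter accept the same options shape, a configuration can be typed once against this interface and reused. A small sketch, assuming `CrawlOptions` is exported by the SDK:

```typescript
// Shared crawl configuration, typed against the interface above
const docsCrawlOptions: CrawlOptions = {
  includePaths: ['/docs/*'],
  limit: 100,
  maxDiscoveryDepth: 2,
  scrapeOptions: { formats: ['markdown'] }
};

// The same object works for the async starter and for crawl()
const { id } = await app.startCrawl('https://docs.example.com', docsCrawlOptions);
```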

## Response Types

```typescript { .api }
// Crawl initiation response
interface CrawlResponse {
  id: string;
  url: string;
}

// Crawl job status and data
interface CrawlJob {
  status: "scraping" | "completed" | "failed" | "cancelled";
  total: number;
  completed: number;
  creditsUsed?: number;
  expiresAt?: string;
  next?: string | null;
  data: Document[];
}

// Crawl error details
interface CrawlErrorsResponse {
  errors: {
    id: string;
    timestamp?: string;
    url: string;
    code?: string;
    error: string;
  }[];
  robotsBlocked: string[];
}

// Active crawls listing
interface ActiveCrawlsResponse {
  success: boolean;
  crawls: ActiveCrawl[];
}

interface ActiveCrawl {
  id: string;
  teamId: string;
  url: string;
  options?: Record<string, unknown> | null;
}
```
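
The `status` union on `CrawlJob` narrows in TypeScript, which makes result handling straightforward. A brief sketch of summarizing a job snapshot returned by `crawl()` or `getCrawlStatus()`:

```typescript
function summarizeJob(job: CrawlJob): void {
  switch (job.status) {
    case 'completed':
      console.log(`Done: ${job.completed}/${job.total} pages, ${job.data.length} documents`);
      break;
    case 'failed':
    case 'cancelled':
      console.warn(`Crawl ended early with status "${job.status}"`);
      break;
    case 'scraping':
      console.log(`Still running: ${job.completed}/${job.total} pages so far`);
      break;
  }
}
```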

## Webhook Configuration

```typescript { .api }
interface WebhookConfig {
  url: string;
  headers?: Record<string, string>;
  metadata?: Record<string, string>;
  events?: Array<"completed" | "failed" | "page" | "started">;
}
```
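
On the receiving end, the webhook `url` only needs to accept POST requests. The event payload shape is not documented here, so the sketch below (a plain Node server with a placeholder port and path) simply logs the raw body:

```typescript
import { createServer } from 'node:http';

// Minimal receiver for crawl webhook events; logs whatever payload arrives
createServer((req, res) => {
  if (req.method === 'POST' && req.url === '/webhooks/crawl-complete') {
    let body = '';
    req.on('data', chunk => { body += chunk; });
    req.on('end', () => {
      console.log('Crawl webhook event:', body); // payload shape not assumed
      res.statusCode = 200;
      res.end('ok');
    });
  } else {
    res.statusCode = 404;
    res.end();
  }
}).listen(3000);
```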

## Pagination Configuration

```typescript { .api }
interface PaginationConfig {
  // Automatically follow `next` links and aggregate documents
  autoPaginate?: boolean;

  // Maximum additional pages to fetch after the first response
  maxPages?: number;

  // Maximum total documents to return across all pages
  maxResults?: number;

  // Maximum time to spend fetching additional pages (seconds)
  maxWaitTime?: number;
}
```
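
These fields are passed as the second argument to `getCrawlStatus`; for instance, a bounded auto-paginated fetch (the values are illustrative):

```typescript
// Fetch up to 5 extra pages or 500 documents, whichever limit is hit first
const job = await app.getCrawlStatus('crawl-job-id', {
  autoPaginate: true,
  maxPages: 5,
  maxResults: 500,
  maxWaitTime: 60 // seconds
});
console.log(`Aggregated ${job.data.length} documents`);
```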

## Usage Examples

### Basic Crawling

```typescript
// Simple crawl with limit
const crawlJob = await app.crawl('https://example.com', {
  limit: 50,
  scrapeOptions: {
    formats: ['markdown']
  }
});

console.log(`Crawled ${crawlJob.completed} of ${crawlJob.total} pages`);
console.log(crawlJob.data); // Array of scraped documents
```

### Async Crawl with Status Monitoring

```typescript
// Start crawl job
const crawlResponse = await app.startCrawl('https://example.com', {
  limit: 100,
  maxConcurrency: 5,
  scrapeOptions: {
    formats: ['markdown', 'links']
  }
});

console.log(`Started crawl job: ${crawlResponse.id}`);

// Monitor status
let job: CrawlJob;
do {
  await new Promise(resolve => setTimeout(resolve, 5000)); // Wait 5 seconds
  job = await app.getCrawlStatus(crawlResponse.id);
  console.log(`Progress: ${job.completed}/${job.total} - Status: ${job.status}`);
} while (job.status === 'scraping');

console.log('Crawl completed!', job.data.length, 'pages scraped');
```
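
Instead of hand-rolling the polling loop, the `crawl()` waiter accepts `pollInterval` and `timeout` alongside the crawl options (see the method signature above). Their units are not specified in this document; the sketch below assumes seconds:

```typescript
// Let the SDK poll for us instead of writing the loop by hand
const job = await app.crawl('https://example.com', {
  limit: 100,
  scrapeOptions: { formats: ['markdown'] },
  pollInterval: 5, // assumed: seconds between status checks
  timeout: 300     // assumed: seconds before the waiter gives up
});

console.log(`Finished with status "${job.status}" after ${job.completed} pages`);
```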

### Path Filtering

```typescript
const crawlJob = await app.crawl('https://docs.example.com', {
  includePaths: ['/api/*', '/guides/*'],
  excludePaths: ['/api/v1/*', '*/deprecated/*'],
  limit: 200,
  scrapeOptions: {
    formats: ['markdown'],
    onlyMainContent: true
  }
});
```

### Natural Language Crawl Configuration

```typescript
// Preview what the natural language prompt will do
const preview = await app.crawlParamsPreview(
  'https://blog.example.com',
  'Crawl all blog posts from 2024, exclude author pages and tag pages'
);
console.log('Generated parameters:', preview);

// Use natural language prompt
const crawlJob = await app.crawl('https://blog.example.com', {
  prompt: 'Crawl all blog posts from 2024, exclude author pages and tag pages',
  limit: 500,
  scrapeOptions: {
    formats: ['markdown', {
      type: 'json',
      schema: {
        type: 'object',
        properties: {
          title: { type: 'string' },
          author: { type: 'string' },
          publishDate: { type: 'string' },
          content: { type: 'string' },
          tags: { type: 'array', items: { type: 'string' } }
        }
      }
    }]
  }
});
```

### Webhook Integration

```typescript
const crawlJob = await app.crawl('https://example.com', {
  limit: 100,
  webhook: {
    url: 'https://myapp.com/webhooks/crawl-complete',
    headers: {
      'Authorization': 'Bearer my-webhook-token'
    },
    metadata: {
      'userId': '12345',
      'jobType': 'content-audit'
    },
    events: ['completed', 'failed', 'page']
  },
  scrapeOptions: {
    formats: ['markdown']
  }
});
```

### Advanced Crawl Configuration

```typescript
const crawlJob = await app.crawl('https://example.com', {
  // Crawl configuration
  maxDiscoveryDepth: 3,
  sitemap: 'include',
  crawlEntireDomain: false,
  allowSubdomains: true,
  allowExternalLinks: false,
  ignoreQueryParameters: true,

  // Performance
  delay: 1000, // 1 second between requests
  maxConcurrency: 3,
  limit: 500,

  // Content filtering
  includePaths: ['/docs/*', '/api/*'],
  excludePaths: ['*/private/*', '/admin/*'],

  // Privacy
  zeroDataRetention: true,

  // Scraping options
  scrapeOptions: {
    formats: ['markdown', 'links'],
    onlyMainContent: true,
    blockAds: true,
    mobile: false
  }
});
```

### Error Handling and Monitoring

```typescript
try {
  // Start the job explicitly so its id is available for error reporting
  const { id } = await app.startCrawl('https://example.com', {
    limit: 100
  });

  // Poll until the job finishes
  let job = await app.getCrawlStatus(id);
  while (job.status === 'scraping') {
    await new Promise(resolve => setTimeout(resolve, 5000));
    job = await app.getCrawlStatus(id);
  }

  // Check for errors and robots.txt blocks
  const errors = await app.getCrawlErrors(id);
  if (errors.errors.length > 0) {
    console.log('Crawl errors:', errors.errors);
  }
  if (errors.robotsBlocked.length > 0) {
    console.log('URLs blocked by robots.txt:', errors.robotsBlocked);
  }

} catch (error) {
  console.error('Crawl failed:', error);
}

// List all active crawls
const activeCrawls = await app.getActiveCrawls();
console.log('Currently active crawls:', activeCrawls.crawls);
```

### Pagination Handling

```typescript
// Get first page of results
let job = await app.getCrawlStatus('crawl-job-id', {
  autoPaginate: false,
  maxResults: 10
});

console.log('First 10 results:', job.data);

// Get all remaining results with pagination
if (job.next) {
  const allResults = await app.getCrawlStatus('crawl-job-id', {
    autoPaginate: true,
    maxPages: 10,
    maxResults: 1000,
    maxWaitTime: 300 // 5 minutes
  });
  console.log('All results:', allResults.data);
}
```