# Web Crawling

Recursive website crawling with configurable limits, path filtering, webhook support, and job monitoring.

## Core Crawling Methods

```typescript { .api }
/**
 * Start an async crawl job
 * @param url - Root URL to crawl
 * @param req - Crawl configuration options
 * @returns Promise resolving to job ID and URL
 */
startCrawl(url: string, req?: CrawlOptions): Promise<CrawlResponse>;

/**
 * Get crawl job status and partial data
 * @param jobId - Crawl job identifier
 * @param pagination - Pagination configuration for results
 * @returns Promise resolving to job status and data
 */
getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob>;

/**
 * Cancel a running crawl job
 * @param jobId - Crawl job identifier
 * @returns Promise resolving to true if cancelled
 */
cancelCrawl(jobId: string): Promise<boolean>;

/**
 * Convenience waiter: start a crawl and poll until completion
 * @param url - Root URL to crawl
 * @param req - Crawl configuration plus waiter controls
 * @returns Promise resolving to final job snapshot
 */
crawl(url: string, req?: CrawlOptions & { pollInterval?: number; timeout?: number }): Promise<CrawlJob>;

/**
 * Retrieve crawl errors and robots.txt blocks
 * @param crawlId - Crawl job identifier
 * @returns Promise resolving to error details
 */
getCrawlErrors(crawlId: string): Promise<CrawlErrorsResponse>;

/**
 * List active crawls for the authenticated team
 * @returns Promise resolving to active crawls list
 */
getActiveCrawls(): Promise<ActiveCrawlsResponse>;

/**
 * Preview normalized crawl parameters from natural language
 * @param url - Root URL
 * @param prompt - Natural language instruction
 * @returns Promise resolving to normalized parameters
 */
crawlParamsPreview(url: string, prompt: string): Promise<Record<string, unknown>>;
```
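
Cancellation is not covered in the usage examples below; here is a minimal sketch, assuming an initialized client `app` as in the other examples:

```typescript
// Start a job, keeping its id so it can be cancelled later
const { id } = await app.startCrawl('https://example.com', { limit: 1000 });

// ... later, e.g. after a user aborts the operation
const cancelled = await app.cancelCrawl(id);
console.log(cancelled ? 'Crawl cancelled' : 'Crawl could not be cancelled');
```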

## Crawl Configuration

```typescript { .api }
interface CrawlOptions {
  // Natural language crawl configuration
  prompt?: string | null;

  // Path filtering
  excludePaths?: string[] | null;
  includePaths?: string[] | null;

  // Crawl behavior
  maxDiscoveryDepth?: number | null;
  sitemap?: "skip" | "include";
  ignoreQueryParameters?: boolean;
  limit?: number | null;
  crawlEntireDomain?: boolean;
  allowExternalLinks?: boolean;
  allowSubdomains?: boolean;

  // Performance control
  delay?: number | null;
  maxConcurrency?: number | null;

  // Notifications
  webhook?: string | WebhookConfig | null;

  // Content processing
  scrapeOptions?: ScrapeOptions | null;

  // Privacy
  zeroDataRetention?: boolean;

  // Integration tracking
  integration?: string;
}
```
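
Because `startCrawl` and the `crawl` waiter accept the same options shape, a configuration can be typed once against this interface and reused. A small sketch, assuming `CrawlOptions` is exported by the SDK:

```typescript
// Shared crawl configuration, typed against the interface above
const docsCrawlOptions: CrawlOptions = {
  includePaths: ['/docs/*'],
  limit: 100,
  maxDiscoveryDepth: 2,
  scrapeOptions: { formats: ['markdown'] }
};

// The same object works for the async starter and for crawl()
const { id } = await app.startCrawl('https://docs.example.com', docsCrawlOptions);
```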

## Response Types

```typescript { .api }
// Crawl initiation response
interface CrawlResponse {
  id: string;
  url: string;
}

// Crawl job status and data
interface CrawlJob {
  status: "scraping" | "completed" | "failed" | "cancelled";
  total: number;
  completed: number;
  creditsUsed?: number;
  expiresAt?: string;
  next?: string | null;
  data: Document[];
}

// Crawl error details
interface CrawlErrorsResponse {
  errors: {
    id: string;
    timestamp?: string;
    url: string;
    code?: string;
    error: string;
  }[];
  robotsBlocked: string[];
}

// Active crawls listing
interface ActiveCrawlsResponse {
  success: boolean;
  crawls: ActiveCrawl[];
}

interface ActiveCrawl {
  id: string;
  teamId: string;
  url: string;
  options?: Record<string, unknown> | null;
}
```
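
The `status` union on `CrawlJob` narrows in TypeScript, which makes result handling straightforward. A brief sketch of summarizing a job snapshot returned by `crawl()` or `getCrawlStatus()`:

```typescript
function summarizeJob(job: CrawlJob): void {
  switch (job.status) {
    case 'completed':
      console.log(`Done: ${job.completed}/${job.total} pages, ${job.data.length} documents`);
      break;
    case 'failed':
    case 'cancelled':
      console.warn(`Crawl ended early with status "${job.status}"`);
      break;
    case 'scraping':
      console.log(`Still running: ${job.completed}/${job.total} pages so far`);
      break;
  }
}
```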

## Webhook Configuration

```typescript { .api }
interface WebhookConfig {
  url: string;
  headers?: Record<string, string>;
  metadata?: Record<string, string>;
  events?: Array<"completed" | "failed" | "page" | "started">;
}
```
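
On the receiving end, the webhook `url` only needs to accept POST requests. The event payload shape is not documented here, so the sketch below (a plain Node server with a placeholder port and path) simply logs the raw body:

```typescript
import { createServer } from 'node:http';

// Minimal receiver for crawl webhook events; logs whatever payload arrives
createServer((req, res) => {
  if (req.method === 'POST' && req.url === '/webhooks/crawl-complete') {
    let body = '';
    req.on('data', chunk => { body += chunk; });
    req.on('end', () => {
      console.log('Crawl webhook event:', body); // payload shape not assumed
      res.statusCode = 200;
      res.end('ok');
    });
  } else {
    res.statusCode = 404;
    res.end();
  }
}).listen(3000);
```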

## Pagination Configuration

```typescript { .api }
interface PaginationConfig {
  // Automatically follow `next` links and aggregate documents
  autoPaginate?: boolean;

  // Maximum additional pages to fetch after the first response
  maxPages?: number;

  // Maximum total documents to return across all pages
  maxResults?: number;

  // Maximum time to spend fetching additional pages (seconds)
  maxWaitTime?: number;
}
```
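
These fields are passed as the second argument to `getCrawlStatus`; for instance, a bounded auto-paginated fetch (the values are illustrative):

```typescript
// Fetch up to 5 extra pages or 500 documents, whichever limit is hit first
const job = await app.getCrawlStatus('crawl-job-id', {
  autoPaginate: true,
  maxPages: 5,
  maxResults: 500,
  maxWaitTime: 60 // seconds
});
console.log(`Aggregated ${job.data.length} documents`);
```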

## Usage Examples

### Basic Crawling

```typescript
// Simple crawl with limit
const crawlJob = await app.crawl('https://example.com', {
  limit: 50,
  scrapeOptions: {
    formats: ['markdown']
  }
});

console.log(`Crawled ${crawlJob.completed} of ${crawlJob.total} pages`);
console.log(crawlJob.data); // Array of scraped documents
```

### Async Crawl with Status Monitoring

```typescript
// Start crawl job
const crawlResponse = await app.startCrawl('https://example.com', {
  limit: 100,
  maxConcurrency: 5,
  scrapeOptions: {
    formats: ['markdown', 'links']
  }
});

console.log(`Started crawl job: ${crawlResponse.id}`);

// Monitor status
let job: CrawlJob;
do {
  await new Promise(resolve => setTimeout(resolve, 5000)); // Wait 5 seconds
  job = await app.getCrawlStatus(crawlResponse.id);
  console.log(`Progress: ${job.completed}/${job.total} - Status: ${job.status}`);
} while (job.status === 'scraping');

console.log('Crawl completed!', job.data.length, 'pages scraped');
```
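
Instead of hand-rolling the polling loop, the `crawl()` waiter accepts `pollInterval` and `timeout` alongside the crawl options (see the method signature above). Their units are not specified in this document; the sketch below assumes seconds:

```typescript
// Let the SDK poll for us instead of writing the loop by hand
const job = await app.crawl('https://example.com', {
  limit: 100,
  scrapeOptions: { formats: ['markdown'] },
  pollInterval: 5, // assumed: seconds between status checks
  timeout: 300     // assumed: seconds before the waiter gives up
});

console.log(`Finished with status "${job.status}" after ${job.completed} pages`);
```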

### Path Filtering

```typescript
const crawlJob = await app.crawl('https://docs.example.com', {
  includePaths: ['/api/*', '/guides/*'],
  excludePaths: ['/api/v1/*', '*/deprecated/*'],
  limit: 200,
  scrapeOptions: {
    formats: ['markdown'],
    onlyMainContent: true
  }
});
```

### Natural Language Crawl Configuration

```typescript
// Preview what the natural language prompt will do
const preview = await app.crawlParamsPreview(
  'https://blog.example.com',
  'Crawl all blog posts from 2024, exclude author pages and tag pages'
);
console.log('Generated parameters:', preview);

// Use natural language prompt
const crawlJob = await app.crawl('https://blog.example.com', {
  prompt: 'Crawl all blog posts from 2024, exclude author pages and tag pages',
  limit: 500,
  scrapeOptions: {
    formats: ['markdown', {
      type: 'json',
      schema: {
        type: 'object',
        properties: {
          title: { type: 'string' },
          author: { type: 'string' },
          publishDate: { type: 'string' },
          content: { type: 'string' },
          tags: { type: 'array', items: { type: 'string' } }
        }
      }
    }]
  }
});
```

### Webhook Integration

```typescript
const crawlJob = await app.crawl('https://example.com', {
  limit: 100,
  webhook: {
    url: 'https://myapp.com/webhooks/crawl-complete',
    headers: {
      'Authorization': 'Bearer my-webhook-token'
    },
    metadata: {
      'userId': '12345',
      'jobType': 'content-audit'
    },
    events: ['completed', 'failed', 'page']
  },
  scrapeOptions: {
    formats: ['markdown']
  }
});
```

### Advanced Crawl Configuration

```typescript
const crawlJob = await app.crawl('https://example.com', {
  // Crawl configuration
  maxDiscoveryDepth: 3,
  sitemap: 'include',
  crawlEntireDomain: false,
  allowSubdomains: true,
  allowExternalLinks: false,
  ignoreQueryParameters: true,

  // Performance
  delay: 1000, // 1 second between requests
  maxConcurrency: 3,
  limit: 500,

  // Content filtering
  includePaths: ['/docs/*', '/api/*'],
  excludePaths: ['*/private/*', '/admin/*'],

  // Privacy
  zeroDataRetention: true,

  // Scraping options
  scrapeOptions: {
    formats: ['markdown', 'links'],
    onlyMainContent: true,
    blockAds: true,
    mobile: false
  }
});
```

### Error Handling and Monitoring

```typescript
try {
  // Start the job explicitly so its id is available for error reporting
  const { id } = await app.startCrawl('https://example.com', {
    limit: 100
  });

  // Poll until the job finishes
  let job = await app.getCrawlStatus(id);
  while (job.status === 'scraping') {
    await new Promise(resolve => setTimeout(resolve, 5000));
    job = await app.getCrawlStatus(id);
  }

  // Check for errors and robots.txt blocks
  const errors = await app.getCrawlErrors(id);
  if (errors.errors.length > 0) {
    console.log('Crawl errors:', errors.errors);
  }
  if (errors.robotsBlocked.length > 0) {
    console.log('URLs blocked by robots.txt:', errors.robotsBlocked);
  }

} catch (error) {
  console.error('Crawl failed:', error);
}

// List all active crawls
const activeCrawls = await app.getActiveCrawls();
console.log('Currently active crawls:', activeCrawls.crawls);
```

### Pagination Handling

```typescript
// Get first page of results
let job = await app.getCrawlStatus('crawl-job-id', {
  autoPaginate: false,
  maxResults: 10
});

console.log('First 10 results:', job.data);

// Get all remaining results with pagination
if (job.next) {
  const allResults = await app.getCrawlStatus('crawl-job-id', {
    autoPaginate: true,
    maxPages: 10,
    maxResults: 1000,
    maxWaitTime: 300 // 5 minutes
  });
  console.log('All results:', allResults.data);
}
```