# Web Crawling

Recursive website crawling with configurable limits, path filtering, webhook support, and job monitoring.

## Core Crawling Methods

```typescript { .api }
/**
 * Start an async crawl job
 * @param url - Root URL to crawl
 * @param req - Crawl configuration options
 * @returns Promise resolving to job ID and URL
 */
startCrawl(url: string, req?: CrawlOptions): Promise<CrawlResponse>;

/**
 * Get crawl job status and partial data
 * @param jobId - Crawl job identifier
 * @param pagination - Pagination configuration for results
 * @returns Promise resolving to job status and data
 */
getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob>;

/**
 * Cancel a running crawl job
 * @param jobId - Crawl job identifier
 * @returns Promise resolving to true if cancelled
 */
cancelCrawl(jobId: string): Promise<boolean>;

/**
 * Convenience waiter: start crawl and poll until completion
 * @param url - Root URL to crawl
 * @param req - Crawl configuration plus waiter controls
 * @returns Promise resolving to final job snapshot
 */
crawl(url: string, req?: CrawlOptions & { pollInterval?: number; timeout?: number }): Promise<CrawlJob>;

/**
 * Retrieve crawl errors and robots.txt blocks
 * @param crawlId - Crawl job identifier
 * @returns Promise resolving to error details
 */
getCrawlErrors(crawlId: string): Promise<CrawlErrorsResponse>;

/**
 * List active crawls for the authenticated team
 * @returns Promise resolving to active crawls list
 */
getActiveCrawls(): Promise<ActiveCrawlsResponse>;

/**
 * Preview normalized crawl parameters from natural language
 * @param url - Root URL
 * @param prompt - Natural language instruction
 * @returns Promise resolving to normalized parameters
 */
crawlParamsPreview(url: string, prompt: string): Promise<Record<string, unknown>>;
```

## Crawl Configuration

```typescript { .api }
interface CrawlOptions {
  // Natural language crawl configuration
  prompt?: string | null;

  // Path filtering
  excludePaths?: string[] | null;
  includePaths?: string[] | null;

  // Crawl behavior
  maxDiscoveryDepth?: number | null;
  sitemap?: "skip" | "include";
  ignoreQueryParameters?: boolean;
  limit?: number | null;
  crawlEntireDomain?: boolean;
  allowExternalLinks?: boolean;
  allowSubdomains?: boolean;

  // Performance control
  delay?: number | null;
  maxConcurrency?: number | null;

  // Notifications
  webhook?: string | WebhookConfig | null;

  // Content processing
  scrapeOptions?: ScrapeOptions | null;

  // Privacy
  zeroDataRetention?: boolean;

  // Integration tracking
  integration?: string;
}
```

## Response Types

```typescript { .api }
// Crawl initiation response
interface CrawlResponse {
  id: string;
  url: string;
}

// Crawl job status and data
interface CrawlJob {
  status: "scraping" | "completed" | "failed" | "cancelled";
  total: number;
  completed: number;
  creditsUsed?: number;
  expiresAt?: string;
  next?: string | null;
  data: Document[];
}

// Crawl error details
interface CrawlErrorsResponse {
  errors: {
    id: string;
    timestamp?: string;
    url: string;
    code?: string;
    error: string;
  }[];
  robotsBlocked: string[];
}

// Active crawls listing
interface ActiveCrawlsResponse {
  success: boolean;
  crawls: ActiveCrawl[];
}

interface ActiveCrawl {
  id: string;
  teamId: string;
  url: string;
  options?: Record<string, unknown> | null;
}
```
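As a quick illustration of how these shapes are consumed (a sketch only; `'crawl-job-id'` is a placeholder, and `app` is the client instance used in the usage examples below):

```typescript
// Inspect a CrawlJob snapshot returned by getCrawlStatus
const job = await app.getCrawlStatus('crawl-job-id');

console.log(`${job.completed}/${job.total} pages, status: ${job.status}`);

if (job.creditsUsed !== undefined) {
  console.log(`Credits used so far: ${job.creditsUsed}`);
}

if (job.next) {
  // More documents exist beyond this page of results; PaginationConfig
  // (below) controls whether they are fetched automatically.
  console.log('Further results available at:', job.next);
}
```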

## Webhook Configuration

```typescript { .api }
interface WebhookConfig {
  url: string;
  headers?: Record<string, string>;
  metadata?: Record<string, string>;
  events?: Array<"completed" | "failed" | "page" | "started">;
}
```
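Per the `CrawlOptions` type above, `webhook` also accepts a plain string. A minimal sketch, assuming the string form behaves like a bare notification URL with default delivery behavior (use the structured form when you need custom headers, metadata, or event filtering):

```typescript
// String shorthand for the webhook option (assumed equivalent to
// passing a WebhookConfig with only `url` set)
const crawlJob = await app.crawl('https://example.com', {
  limit: 50,
  webhook: 'https://myapp.com/webhooks/crawl-complete'
});
```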

## Pagination Configuration

```typescript { .api }
interface PaginationConfig {
  // Automatically follow `next` links and aggregate documents
  autoPaginate?: boolean;

  // Maximum additional pages to fetch after first response
  maxPages?: number;

  // Maximum total documents to return across all pages
  maxResults?: number;

  // Maximum time to spend fetching additional pages (seconds)
  maxWaitTime?: number;
}
```

## Usage Examples

### Basic Crawling

```typescript
// Simple crawl with limit
const crawlJob = await app.crawl('https://example.com', {
  limit: 50,
  scrapeOptions: {
    formats: ['markdown']
  }
});

console.log(`Crawled ${crawlJob.completed} of ${crawlJob.total} pages`);
console.log(crawlJob.data); // Array of scraped documents
```
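The `crawl()` waiter also accepts `pollInterval` and `timeout` alongside the regular `CrawlOptions` (see the method signature above). A minimal sketch of using them; the signature does not state the units for either control, so the values below are illustrative only:

```typescript
// Waiter controls (units are not documented here; values are illustrative)
const job = await app.crawl('https://example.com', {
  limit: 25,
  scrapeOptions: { formats: ['markdown'] },
  pollInterval: 2,  // how often to re-check job status while waiting
  timeout: 120      // stop waiting after this long
});

if (job.status !== 'completed') {
  console.warn(`Crawl ended with status: ${job.status}`);
}
```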

### Async Crawl with Status Monitoring

```typescript
// Start crawl job
const crawlResponse = await app.startCrawl('https://example.com', {
  limit: 100,
  maxConcurrency: 5,
  scrapeOptions: {
    formats: ['markdown', 'links']
  }
});

console.log(`Started crawl job: ${crawlResponse.id}`);

// Monitor status
let job: CrawlJob;
do {
  await new Promise(resolve => setTimeout(resolve, 5000)); // Wait 5 seconds
  job = await app.getCrawlStatus(crawlResponse.id);
  console.log(`Progress: ${job.completed}/${job.total} - Status: ${job.status}`);
} while (job.status === 'scraping');

console.log('Crawl completed!', job.data.length, 'pages scraped');
```

### Path Filtering

```typescript
const crawlJob = await app.crawl('https://docs.example.com', {
  includePaths: ['/api/*', '/guides/*'],
  excludePaths: ['/api/v1/*', '*/deprecated/*'],
  limit: 200,
  scrapeOptions: {
    formats: ['markdown'],
    onlyMainContent: true
  }
});
```

### Natural Language Crawl Configuration

```typescript
// Preview what the natural language prompt will do
const preview = await app.crawlParamsPreview(
  'https://blog.example.com',
  'Crawl all blog posts from 2024, exclude author pages and tag pages'
);
console.log('Generated parameters:', preview);

// Use natural language prompt
const crawlJob = await app.crawl('https://blog.example.com', {
  prompt: 'Crawl all blog posts from 2024, exclude author pages and tag pages',
  limit: 500,
  scrapeOptions: {
    formats: ['markdown', {
      type: 'json',
      schema: {
        type: 'object',
        properties: {
          title: { type: 'string' },
          author: { type: 'string' },
          publishDate: { type: 'string' },
          content: { type: 'string' },
          tags: { type: 'array', items: { type: 'string' } }
        }
      }
    }]
  }
});
```

### Webhook Integration

```typescript
const crawlJob = await app.crawl('https://example.com', {
  limit: 100,
  webhook: {
    url: 'https://myapp.com/webhooks/crawl-complete',
    headers: {
      'Authorization': 'Bearer my-webhook-token'
    },
    metadata: {
      'userId': '12345',
      'jobType': 'content-audit'
    },
    events: ['completed', 'failed', 'page']
  },
  scrapeOptions: {
    formats: ['markdown']
  }
});
```

### Advanced Crawl Configuration

```typescript
const crawlJob = await app.crawl('https://example.com', {
  // Crawl configuration
  maxDiscoveryDepth: 3,
  sitemap: 'include',
  crawlEntireDomain: false,
  allowSubdomains: true,
  allowExternalLinks: false,
  ignoreQueryParameters: true,

  // Performance
  delay: 1000, // 1 second between requests
  maxConcurrency: 3,
  limit: 500,

  // Content filtering
  includePaths: ['/docs/*', '/api/*'],
  excludePaths: ['*/private/*', '/admin/*'],

  // Privacy
  zeroDataRetention: true,

  // Scraping options
  scrapeOptions: {
    formats: ['markdown', 'links'],
    onlyMainContent: true,
    blockAds: true,
    mobile: false
  }
});
```

### Error Handling and Monitoring

```typescript
try {
  // startCrawl returns the job id that getCrawlErrors needs
  // (the CrawlJob snapshot returned by crawl() does not include an id)
  const { id } = await app.startCrawl('https://example.com', {
    limit: 100
  });

  // Wait for the job to finish
  let job: CrawlJob;
  do {
    await new Promise(resolve => setTimeout(resolve, 5000));
    job = await app.getCrawlStatus(id);
  } while (job.status === 'scraping');

  // Check for errors
  const errors = await app.getCrawlErrors(id);
  if (errors.errors.length > 0) {
    console.log('Crawl errors:', errors.errors);
  }
  if (errors.robotsBlocked.length > 0) {
    console.log('URLs blocked by robots.txt:', errors.robotsBlocked);
  }

} catch (error) {
  console.error('Crawl failed:', error);
}

// List all active crawls
const activeCrawls = await app.getActiveCrawls();
console.log('Currently active crawls:', activeCrawls.crawls);
```

### Pagination Handling

```typescript
// Get first page of results
const job = await app.getCrawlStatus('crawl-job-id', {
  autoPaginate: false,
  maxResults: 10
});

console.log('First 10 results:', job.data);

// Get all remaining results with pagination
if (job.next) {
  const allResults = await app.getCrawlStatus('crawl-job-id', {
    autoPaginate: true,
    maxPages: 10,
    maxResults: 1000,
    maxWaitTime: 300 // 5 minutes
  });
  console.log('All results:', allResults.data);
}
```
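### Cancelling a Crawl

A sketch of stopping a job early with `cancelCrawl`, based on the method signatures above. Whether the job reports `'cancelled'` immediately after the call is an assumption here.

```typescript
// Start a large job, then cancel it before it finishes
const { id } = await app.startCrawl('https://example.com', {
  limit: 1000
});

// ... later, decide the crawl is no longer needed ...

const cancelled = await app.cancelCrawl(id);
console.log(cancelled ? 'Crawl cancelled' : 'Cancel request was not accepted');

// The job should now report a terminal status (e.g. 'cancelled')
const job = await app.getCrawlStatus(id);
console.log(`Final status: ${job.status}`);
```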