or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

browser-crawling.mdconfiguration-proxies.mdcore-crawling.mdhttp-crawling.mdindex.mdsession-management.mdstorage.mdutilities.md

utilities.mddocs/

0

# Utilities

1

2

Crawlee provides extensive utility functions for common crawling tasks including URL extraction, social media parsing, system detection, and various helper functions for web scraping operations.

3

4

## Capabilities

5

6

### Sleep Utility

7

8

Promise-based sleep function for introducing delays in crawling operations.

9

10

```typescript { .api }

11

/**

12

* Promise-based sleep function

13

* @param millis - Milliseconds to sleep (defaults to random between 1-5 seconds)

14

*/

15

function sleep(millis?: number): Promise<void>;

16

```

17

18

**Usage Examples:**

19

20

```typescript

21

import { sleep } from "crawlee";

22

23

// Sleep for 2 seconds

24

await sleep(2000);

25

26

// Random sleep between 1-5 seconds

27

await sleep();

28

29

// Use in crawler for rate limiting

30

const crawler = new CheerioCrawler({

31

requestHandler: async ({ request }) => {

32

// Process request

33

console.log(`Processing: ${request.url}`);

34

35

// Add delay between requests

36

await sleep(1000);

37

},

38

});

39

```

40

41

### Link Enqueueing

42

43

Extract and enqueue links from web pages with powerful filtering and transformation options.

44

45

```typescript { .api }

46

/**

47

* Extract and enqueue links from HTML pages

48

*/

49

function enqueueLinks(options: EnqueueLinksOptions): Promise<BatchAddRequestsResult>;

50

51

interface EnqueueLinksOptions {

52

/** Cheerio root object or HTML string */

53

$?: CheerioRoot;

54

55

/** Base URL for resolving relative links */

56

baseUrl?: string;

57

58

/** CSS selector for finding links */

59

selector?: string;

60

61

/** Pseudo-URLs for matching links */

62

pseudoUrls?: (string | PseudoUrl)[];

63

64

/** Glob patterns for URLs to include */

65

globs?: string[];

66

67

/** URLs or patterns to exclude */

68

exclude?: (string | RegExp)[];

69

70

/** Label to assign to enqueued requests */

71

label?: string;

72

73

/** Custom user data to attach */

74

userData?: Dictionary;

75

76

/** Transform function for request options */

77

transformRequestFunction?: (request: RequestOptions) => RequestOptions;

78

79

/** Request queue to add requests to */

80

requestQueue?: RequestQueue;

81

82

/** Maximum number of links to enqueue */

83

limit?: number;

84

85

/** Strategy for restricting which links are enqueued (by domain, subdomain, or origin) */

86

strategy?: EnqueueStrategy;

87

}

88

89

type EnqueueStrategy = 'all' | 'same-domain' | 'same-subdomain' | 'same-origin';

90

```

91

92

**Usage Examples:**

93

94

```typescript

95

import { CheerioCrawler, enqueueLinks } from "crawlee";

96

97

const crawler = new CheerioCrawler({

98

requestHandler: async ({ $, request, enqueueLinks: crawlerEnqueueLinks }) => {

99

// Using crawler's built-in enqueueLinks

100

await crawlerEnqueueLinks({

101

selector: 'a[href]',

102

globs: ['**/products/**', '**/category/**'],

103

exclude: [/\/admin\//, /\/login/],

104

label: 'PRODUCT_PAGE',

105

transformRequestFunction: (req) => ({

106

...req,

107

userData: { parentUrl: request.url },

108

}),

109

limit: 50,

110

});

111

112

// Using standalone enqueueLinks function

113

const result = await enqueueLinks({

114

$,

115

baseUrl: request.loadedUrl,

116

selector: '.pagination a',

117

label: 'PAGINATION',

118

strategy: 'same-domain',

119

});

120

121

console.log(`Enqueued ${result.processedRequests.length} pagination links`);

122

},

123

});

124

```

125

126

### Social Media Parsing

127

128

Comprehensive social media handle and contact extraction from text and HTML content.

129

130

```typescript { .api }

131

const social: {

132

/** Extract email addresses from text */

133

emailsFromText(text: string): string[];

134

135

/** Extract emails from mailto: URLs */

136

emailsFromUrls(urls: string[]): string[];

137

138

/** Extract phone numbers from text */

139

phonesFromText(text: string): string[];

140

141

/** Extract phones from tel: URLs */

142

phonesFromUrls(urls: string[]): string[];

143

144

/** Parse all social handles from HTML */

145

parseHandlesFromHtml(html: string): SocialHandles;

146

147

/** Regular expression patterns for matching emails */

148

EMAIL_REGEX: RegExp;

149

EMAIL_REGEX_GLOBAL: RegExp;

150

151

/** Social platform URL patterns */

152

LINKEDIN_REGEX: RegExp;

153

LINKEDIN_REGEX_GLOBAL: RegExp;

154

INSTAGRAM_REGEX: RegExp;

155

INSTAGRAM_REGEX_GLOBAL: RegExp;

156

TWITTER_REGEX: RegExp;

157

TWITTER_REGEX_GLOBAL: RegExp;

158

FACEBOOK_REGEX: RegExp;

159

FACEBOOK_REGEX_GLOBAL: RegExp;

160

YOUTUBE_REGEX: RegExp;

161

YOUTUBE_REGEX_GLOBAL: RegExp;

162

TIKTOK_REGEX: RegExp;

163

TIKTOK_REGEX_GLOBAL: RegExp;

164

PINTEREST_REGEX: RegExp;

165

PINTEREST_REGEX_GLOBAL: RegExp;

166

DISCORD_REGEX: RegExp;

167

DISCORD_REGEX_GLOBAL: RegExp;

168

};

169

170

interface SocialHandles {

171

emails: string[];

172

phones: string[];

173

linkedIns: string[];

174

twitters: string[];

175

instagrams: string[];

176

facebooks: string[];

177

youtubes: string[];

178

tiktoks: string[];

179

pinterests: string[];

180

discords: string[];

181

phonesUncertain: string[];

182

}

183

```

184

185

**Usage Examples:**

186

187

```typescript

188

import { CheerioCrawler, utils } from "crawlee";

189

190

const crawler = new CheerioCrawler({

191

requestHandler: async ({ $, request, pushData }) => {

192

const html = $.html();

193

const textContent = $.text();

194

195

// Extract all social handles

196

const socialHandles = utils.social.parseHandlesFromHtml(html);

197

198

// Extract emails from text content

199

const emailsInText = utils.social.emailsFromText(textContent);

200

201

// Extract phones from text

202

const phonesInText = utils.social.phonesFromText(textContent);

203

204

// Get all links and extract emails/phones from them

205

const allLinks = [];

206

$('a[href]').each((_, link) => {

207

allLinks.push($(link).attr('href'));

208

});

209

210

const emailsFromLinks = utils.social.emailsFromUrls(allLinks);

211

const phonesFromLinks = utils.social.phonesFromUrls(allLinks);

212

213

// Combine all contacts

214

const allContacts = {

215

url: request.loadedUrl,

216

emails: [...new Set([...socialHandles.emails, ...emailsInText, ...emailsFromLinks])],

217

phones: [...new Set([...socialHandles.phones, ...phonesInText, ...phonesFromLinks])],

218

socialMedia: {

219

linkedin: socialHandles.linkedIns,

220

twitter: socialHandles.twitters,

221

instagram: socialHandles.instagrams,

222

facebook: socialHandles.facebooks,

223

youtube: socialHandles.youtubes,

224

tiktok: socialHandles.tiktoks,

225

pinterest: socialHandles.pinterests,

226

discord: socialHandles.discords,

227

},

228

};

229

230

await pushData(allContacts);

231

},

232

});

233

234

// Custom social media extraction

235

const customText = "Contact us at info@example.com or follow @example on Twitter";

236

const emails = utils.social.emailsFromText(customText);

237

const twitterMatches = customText.match(utils.social.TWITTER_REGEX_GLOBAL);

238

239

console.log('Emails found:', emails);

240

console.log('Twitter handles:', twitterMatches);

241

```

242

243

### URL Utilities

244

245

Functions for URL extraction, validation, and manipulation.

246

247

```typescript { .api }

248

/** Regular expressions for matching URLs */

249

const URL_NO_COMMAS_REGEX: RegExp;

250

const URL_WITH_COMMAS_REGEX: RegExp;

251

252

/**

253

* Extract URLs from text content

254

*/

255

function extractUrls(options: ExtractUrlsOptions): string[];

256

257

/**

258

* Download and parse a list of URLs from a remote source

259

*/

260

function downloadListOfUrls(options: DownloadListOfUrlsOptions): Promise<string[]>;

261

262

/**

263

* Safely create absolute URLs from relative URLs

264

*/

265

function tryAbsoluteURL(href: string, baseUrl: string): string | null;

266

267

interface ExtractUrlsOptions {

268

/** Text content to extract URLs from */

269

string: string;

270

271

/** Custom regular expression to use for matching URLs (e.g. URL_WITH_COMMAS_REGEX) */

272

urlRegex?: RegExp;

273

}

274

275

interface DownloadListOfUrlsOptions {

276

/** URL of the list to download */

277

url: string;

278

279

/** Character encoding */

280

encoding?: BufferEncoding;

281

282

/** Regex pattern to match URLs in the content */

283

urlRegex?: RegExp;

284

}

285

```

286

287

**Usage Examples:**

288

289

```typescript

290

import { utils, CheerioCrawler } from "crawlee";

291

292

// Extract URLs from text

293

const textWithUrls = "Visit https://example.com or check out http://test.com/page";

294

const extractedUrls = utils.extractUrls({ string: textWithUrls });

295

console.log('Found URLs:', extractedUrls);

296

297

// Download URL list from remote source

298

const urlList = await utils.downloadListOfUrls({

299

url: 'https://example.com/sitemap.txt',

300

encoding: 'utf8',

301

});

302

303

// Use in crawler for URL validation

304

const crawler = new CheerioCrawler({

305

requestHandler: async ({ $, request, enqueueLinks }) => {

306

// Extract and validate URLs

307

const allLinks = [];

308

$('a[href]').each((_, element) => {

309

const href = $(element).attr('href');

310

const absoluteUrl = utils.tryAbsoluteURL(href, request.loadedUrl);

311

312

if (absoluteUrl) {

313

allLinks.push(absoluteUrl);

314

}

315

});

316

317

// Find URLs in text content

318

const textContent = $.text();

319

const urlsInText = utils.extractUrls({

320

string: textContent,

321

urlRegex: utils.URL_WITH_COMMAS_REGEX,

322

});

323

324

console.log(`Found ${allLinks.length} links and ${urlsInText.length} URLs in text`);

325

326

await enqueueLinks({

327

urls: allLinks.slice(0, 100), // Limit to first 100 URLs

328

label: 'DISCOVERED',

329

});

330

},

331

});

332

```

333

334

### System Detection

335

336

Functions for detecting the runtime environment and system capabilities.

337

338

```typescript { .api }

339

/**

340

* Detect if running in Docker container

341

* @param forceReset - Force rechecking (internal use)

342

*/

343

function isDocker(forceReset?: boolean): Promise<boolean>;

344

345

/**

346

* Detect if running in any containerized environment

347

*/

348

function isContainerized(): Promise<boolean>;

349

350

/**

351

* Detect if running in AWS Lambda

352

*/

353

function isLambda(): boolean;

354

355

/**

356

* Get cgroup version (V1 or V2)

357

* @param forceReset - Force rechecking (internal use)

358

*/

359

function getCgroupsVersion(forceReset?: boolean): Promise<'V1' | 'V2' | null>;

360

361

interface CpuTicks {

362

/** User CPU time */

363

user: number;

364

365

/** System CPU time */

366

system: number;

367

368

/** Idle CPU time */

369

idle: number;

370

371

/** I/O wait time */

372

iowait: number;

373

374

/** IRQ time */

375

irq: number;

376

377

/** Soft IRQ time */

378

softirq: number;

379

380

/** Steal time */

381

steal: number;

382

383

/** Guest time */

384

guest: number;

385

}

386

387

interface MemoryInfo {

388

/** Total system memory in bytes */

389

totalBytes: number;

390

391

/** Free memory in bytes */

392

freeBytes: number;

393

394

/** Used memory in bytes */

395

usedBytes: number;

396

397

/** Available memory in bytes */

398

availableBytes: number;

399

400

/** Memory usage as a ratio (0-1) */

401

ratio: number;

402

}

403

```

404

405

**Usage Examples:**

406

407

```typescript

408

import { utils, Configuration } from "crawlee";

409

410

// Detect environment and configure accordingly

411

if (await utils.isDocker()) {

412

console.log('Running in Docker - using optimized settings');

413

Configuration.getGlobalConfig().set('defaultDatasetId', 'docker-dataset');

414

}

415

416

if (utils.isLambda()) {

417

console.log('Running in Lambda - reducing memory usage');

418

Configuration.getGlobalConfig().set('memoryMbytes', 512);

419

}

420

421

// Monitor system resources

422

async function logSystemInfo() {

423

console.log('System Status:');

424

console.log(`Containerized: ${await utils.isContainerized()}`);

425

console.log(`Cgroups version: ${await utils.getCgroupsVersion()}`);

426

console.log(`Lambda environment: ${utils.isLambda()}`);

427

}

428

429

// Use in crawler for adaptive behavior

430

const crawler = new BasicCrawler({

431

requestHandler: async ({ request }) => {

432

// Check environment before processing

433

if (await utils.isContainerized()) {

434

console.log('Running in containerized environment');

435

}

436

437

// Process request...

438

},

439

440

// Adjust concurrency based on environment (set at initialization)

441

maxConcurrency: utils.isLambda() ? 1 : 10,

442

});

443

```

444

445

### OpenGraph Parsing

446

447

Extract OpenGraph metadata from HTML pages.

448

449

```typescript { .api }

450

/**

451

* Parse OpenGraph tags from HTML content

452

*/

453

function parseOpenGraph(html: string): Dictionary<string>;

454

```

455

456

**Usage Examples:**

457

458

```typescript

459

import { utils, CheerioCrawler } from "crawlee";

460

461

const crawler = new CheerioCrawler({

462

requestHandler: async ({ $, request, pushData, body }) => {

463

// Parse OpenGraph data

464

const ogData = utils.parseOpenGraph(body);

465

466

// Extract standard metadata

467

const metadata = {

468

url: request.loadedUrl,

469

title: $('title').text() || ogData['og:title'],

470

description: $('meta[name="description"]').attr('content') || ogData['og:description'],

471

image: ogData['og:image'],

472

type: ogData['og:type'],

473

siteName: ogData['og:site_name'],

474

author: ogData['article:author'],

475

publishedTime: ogData['article:published_time'],

476

twitterCard: ogData['twitter:card'],

477

twitterSite: ogData['twitter:site'],

478

// Include all OpenGraph data

479

openGraph: ogData,

480

};

481

482

await pushData(metadata);

483

},

484

});

485

```

486

487

### Mathematical Utilities

488

489

Helper functions for calculations and data processing.

490

491

```typescript { .api }

492

/**

493

* Calculate weighted average from values and weights

494

*/

495

function weightedAvg(values: number[], weights: number[]): number;

496

497

/**

498

* Convert snake_case strings to camelCase

499

*/

500

function snakeCaseToCamelCase(str: string): string;

501

```

502

503

**Usage Examples:**

504

505

```typescript

506

import { utils } from "crawlee";

507

508

// Calculate weighted ratings

509

const ratings = [4.5, 3.8, 4.9, 4.1];

510

const weights = [100, 50, 200, 75]; // Number of reviews

511

const averageRating = utils.weightedAvg(ratings, weights);

512

513

console.log(`Weighted average rating: ${averageRating.toFixed(2)}`);

514

515

// Convert API response keys

516

const apiResponse = {

517

product_name: 'Widget',

518

price_usd: 29.99,

519

is_available: true,

520

created_at: '2023-01-01',

521

};

522

523

const camelCaseResponse = {};

524

Object.entries(apiResponse).forEach(([key, value]) => {

525

const camelKey = utils.snakeCaseToCamelCase(key);

526

camelCaseResponse[camelKey] = value;

527

});

528

529

console.log(camelCaseResponse);

530

// Result: { productName: 'Widget', priceUsd: 29.99, isAvailable: true, createdAt: '2023-01-01' }

531

```

532

533

### DOM Utilities

534

535

Helper functions for DOM manipulation and processing.

536

537

```typescript { .api }

538

/**

539

* Expand shadow DOM roots to access shadow content

540

*/

541

function expandShadowRoots(document: Document): void;

542

```

543

544

**Usage Examples:**

545

546

```typescript

547

import { JSDOMCrawler, utils } from "crawlee";

548

549

const crawler = new JSDOMCrawler({

550

requestHandler: async ({ window, document, request, pushData }) => {

551

// Expand shadow DOM to access hidden content

552

utils.expandShadowRoots(document);

553

554

// Now you can query shadow DOM content

555

const shadowContent = document.querySelectorAll('[data-shadow-content]');

556

557

const extractedData = Array.from(shadowContent).map(element => ({

558

text: element.textContent?.trim(),

559

attributes: Array.from(element.attributes).reduce((attrs, attr) => {

560

attrs[attr.name] = attr.value;

561

return attrs;

562

}, {}),

563

}));

564

565

await pushData({

566

url: request.loadedUrl,

567

shadowDomData: extractedData,

568

hasShadowContent: shadowContent.length > 0,

569

});

570

},

571

});

572

```

573

574

### Unified Utils Object

575

576

The main utils object that combines all utility functions.

577

578

```typescript { .api }

579

const utils: {

580

/** Puppeteer utility functions */

581

puppeteer: typeof puppeteerUtils;

582

583

/** Playwright utility functions */

584

playwright: typeof playwrightUtils;

585

586

/** Logging utility */

587

log: Log;

588

589

/** Link enqueueing function */

590

enqueueLinks: typeof enqueueLinks;

591

592

/** Social media parsing utilities */

593

social: typeof social;

594

595

/** Sleep function */

596

sleep: typeof sleep;

597

598

/** URL list downloading */

599

downloadListOfUrls: typeof downloadListOfUrls;

600

601

/** OpenGraph parsing */

602

parseOpenGraph: typeof parseOpenGraph;

603

604

/** System detection functions */

605

isDocker: typeof isDocker;

606

isLambda: typeof isLambda;

607

isContainerized: typeof isContainerized;

608

getCgroupsVersion: typeof getCgroupsVersion;

609

610

// Note: System monitoring functions are available in utils object but not directly exported

611

612

/** Mathematical utilities */

613

weightedAvg: typeof weightedAvg;

614

615

/** String utilities */

616

snakeCaseToCamelCase: typeof snakeCaseToCamelCase;

617

618

/** URL utilities */

619

extractUrls: typeof extractUrls;

620

tryAbsoluteURL: typeof tryAbsoluteURL;

621

URL_NO_COMMAS_REGEX: RegExp;

622

URL_WITH_COMMAS_REGEX: RegExp;

623

624

/** DOM utilities */

625

expandShadowRoots: typeof expandShadowRoots;

626

};

627

```

628

629

**Usage Examples:**

630

631

```typescript

632

import { utils } from "crawlee";

633

634

// All utilities available through single import

635

console.log('Environment check:');

636

console.log(`Docker: ${await utils.isDocker()}`);

637

console.log(`Lambda: ${utils.isLambda()}`);

638

639

// Use social media parsing

640

const html = '<p>Contact: info@example.com, Twitter: @company</p>';

641

const contacts = utils.social.parseHandlesFromHtml(html);

642

643

// Use URL extraction

644

const text = 'Visit https://example.com for more info';

645

const urls = utils.extractUrls({ string: text });

646

647

// Use system detection

648

const isInDocker = await utils.isDocker();

649

console.log(`Running in Docker: ${isInDocker}`);

650

651

// Use in crawler with all utilities

652

const crawler = new CheerioCrawler({

653

requestHandler: async ({ $, request, pushData }) => {

654

// Rate limiting

655

await utils.sleep(1000);

656

657

// Extract data

658

const ogData = utils.parseOpenGraph($.html());

659

const socialData = utils.social.parseHandlesFromHtml($.html());

660

const urls = utils.extractUrls({ string: $.text() });

661

662

await pushData({

663

url: request.loadedUrl,

664

metadata: ogData,

665

contacts: socialData,

666

extractedUrls: urls,

667

systemInfo: {

668

isDocker: await utils.isDocker(),

669

isLambda: utils.isLambda(),

670

},

671

});

672

673

// Environment-aware link enqueueing

674

const isLimitedEnv = utils.isLambda() || await utils.isContainerized();

675

await utils.enqueueLinks({

676

$,

677

baseUrl: request.loadedUrl,

678

selector: 'a[href]',

679

limit: isLimitedEnv ? 10 : 50, // Reduce links in constrained environments

680

});

681

},

682

});

683

```

684

685

## Types

686

687

```typescript { .api }

688

interface Log {

689

/** Log debug message */

690

debug(message: string, data?: any): void;

691

692

/** Log info message */

693

info(message: string, data?: any): void;

694

695

/** Log warning message */

696

warning(message: string, data?: any): void;

697

698

/** Log error message */

699

error(message: string, error?: Error): void;

700

701

/** Log exception */

702

exception(error: Error, message?: string, data?: any): void;

703

704

/** Get child logger with prefix */

705

child(options: { prefix?: string; suffix?: string }): Log;

706

}

707

708

interface PseudoUrl {

709

/** Create pseudo-URL matcher */

710

new (purl: string, requestTemplate?: Partial<RequestOptions>): PseudoUrl;

711

712

/** Test if URL matches pattern */

713

matches(url: string): boolean;

714

715

/** Create request from matched URL */

716

createRequest(url: string): RequestOptions;

717

}

718

719

type BufferEncoding = 'ascii' | 'utf8' | 'utf16le' | 'ucs2' | 'base64' | 'latin1' | 'binary' | 'hex';

720

721

interface RequestTemplate {

722

/** Default user data for matched requests */

723

userData?: Dictionary;

724

725

/** Default label for matched requests */

726

label?: string;

727

728

/** Default HTTP method */

729

method?: HttpMethod;

730

731

/** Default headers */

732

headers?: Dictionary<string>;

733

}

734

```