or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

browser-crawling.md configuration-proxies.md core-crawling.md http-crawling.md index.md session-management.md storage.md utilities.md

browser-crawling.mddocs/

0

# Browser Crawling

1

2

Browser crawling provides full browser automation capabilities for handling JavaScript-heavy websites, dynamic content, and complex user interactions. This includes Puppeteer and Playwright integration with efficient browser pool management.

3

4

## Capabilities

5

6

### BrowserCrawler

7

8

Base browser crawler class that extends BasicCrawler with browser automation capabilities.

9

10

```typescript { .api }

11

/**

12

* Base browser crawler for browser automation with Puppeteer or Playwright

13

*/

14

class BrowserCrawler extends BasicCrawler<BrowserCrawlingContext> {

15

constructor(options: BrowserCrawlerOptions);

16

}

17

```

18

19

### BrowserCrawlerOptions

20

21

Configuration options for the BrowserCrawler.

22

23

```typescript { .api }

24

interface BrowserCrawlerOptions extends BasicCrawlerOptions<BrowserCrawlingContext> {

25

/** Browser launcher options */

26

launchContext?: LaunchContext;

27

28

/** Browser pool configuration */

29

browserPoolOptions?: BrowserPoolOptions;

30

31

/** Whether to block certain resource types for faster loading */

32

blockRequests?: boolean;

33

34

/** List of URL patterns to block */

35

blockedUrlPatterns?: string[];

36

37

/** Pre-navigation hooks to run before page navigation */

38

preNavigationHooks?: Array<(crawlingContext: BrowserCrawlingContext, gotoOptions: DirectNavigationOptions) => Promise<void>>;

39

40

/** Post-navigation hooks to run after page navigation */

41

postNavigationHooks?: Array<(crawlingContext: BrowserCrawlingContext) => Promise<void>>;

42

43

/** Custom page function to run on each page */

44

pageFunction?: (context: BrowserCrawlingContext) => Promise<void>;

45

46

/** Navigation timeout in seconds */

47

navigationTimeoutSecs?: number;

48

49

/** Whether to keep browser context alive between requests */

50

keepAlive?: boolean;

51

52

/** Handler function that receives the browser crawling context for each request */

53

requestHandler?: (context: BrowserCrawlingContext) => Promise<void>;

54

}

55

```

56

57

### BrowserCrawlingContext

58

59

The context object passed to browser crawler request handlers.

60

61

```typescript { .api }

62

interface BrowserCrawlingContext<UserData = Dictionary> extends BasicCrawlingContext<UserData> {

63

/** The browser page object */

64

page: Page;

65

66

/** Browser context */

67

browserContext: BrowserContext;

68

69

/** The response object from navigation */

70

response?: Response;

71

72

/** Enqueue links found on the page */

73

enqueueLinks(options?: CrawlerEnqueueLinksOptions): Promise<BatchAddRequestsResult>;

74

75

/** Save a snapshot of the page (HTML + screenshot) */

76

saveSnapshot(options?: SaveSnapshotOptions): Promise<void>;

77

78

/** Scroll page to load infinite content */

79

infiniteScroll(options?: InfiniteScrollOptions): Promise<void>;

80

81

/** Wait for a selector to appear */

82

waitForSelector(selector: string, options?: WaitForSelectorOptions): Promise<ElementHandle | null>;

83

84

/** Click elements matching selector */

85

clickElements(selector: string, options?: ClickElementsOptions): Promise<void>;

86

}

87

```

88

89

### PuppeteerCrawler

90

91

Browser crawler using Puppeteer for Chrome/Chromium automation.

92

93

```typescript { .api }

94

/**

95

* Puppeteer-based browser crawler for Chrome/Chromium automation

96

*/

97

class PuppeteerCrawler extends BrowserCrawler {

98

constructor(options: PuppeteerCrawlerOptions);

99

}

100

```

101

102

### PuppeteerCrawlerOptions

103

104

Configuration options specific to PuppeteerCrawler.

105

106

```typescript { .api }

107

interface PuppeteerCrawlerOptions extends BrowserCrawlerOptions {

108

/** Handler function that receives Puppeteer context */

109

requestHandler: (context: PuppeteerCrawlingContext) => Promise<void>;

110

111

/** Puppeteer launch options */

112

launchContext?: PuppeteerLaunchContext;

113

114

/** Whether to use Puppeteer request interception */

115

useRequestInterception?: boolean;

116

117

/** Handler invoked for intercepted requests */

118

interceptRequestHandler?: InterceptHandler;

119

120

/** Whether to block requests for faster crawling */

121

blockRequests?: boolean;

122

123

/** Custom viewport settings */

124

viewport?: Viewport;

125

126

/** Whether to use Chrome headless mode */

127

headless?: boolean | 'new';

128

129

/** Additional Chrome launch arguments */

130

args?: string[];

131

}

132

```

133

134

### PuppeteerCrawlingContext

135

136

The context object passed to Puppeteer crawler request handlers.

137

138

```typescript { .api }

139

interface PuppeteerCrawlingContext<UserData = Dictionary> extends BrowserCrawlingContext<UserData> {

140

/** The Puppeteer page object */

141

page: PuppeteerPage;

142

143

/** Browser context */

144

browserContext: PuppeteerBrowserContext;

145

146

/** The Puppeteer response object */

147

response?: PuppeteerResponse;

148

149

/** Enqueue links by clicking elements */

150

enqueueLinksByClickingElements(options: EnqueueLinksByClickingElementsOptions): Promise<BatchAddRequestsResult>;

151

152

/** Compile and evaluate script on page */

153

compileScript(pageFunction: string | Function, options?: CompileScriptOptions): Promise<any>;

154

}

155

```

156

157

**Usage Examples:**

158

159

```typescript

160

import { PuppeteerCrawler, Dataset } from "crawlee";

161

162

const crawler = new PuppeteerCrawler({

163

launchContext: {

164

launchOptions: {

165

headless: true,

166

args: ['--no-sandbox', '--disable-setuid-sandbox'],

167

},

168

},

169

170

requestHandler: async ({ page, request, enqueueLinks, infiniteScroll, saveSnapshot }) => {

171

// Wait for dynamic content to load

172

await page.waitForSelector('.product-list', { timeout: 10000 });

173

174

// Handle infinite scrolling

175

await infiniteScroll({

176

maxScrollHeight: 5000,

177

scrollDownAndUp: true,

178

});

179

180

// Extract data using browser APIs

181

const products = await page.evaluate(() => {

182

return Array.from(document.querySelectorAll('.product')).map(product => ({

183

name: product.querySelector('.name')?.textContent?.trim(),

184

price: product.querySelector('.price')?.textContent?.trim(),

185

image: product.querySelector('img')?.src,

186

rating: product.querySelector('.rating')?.getAttribute('data-rating'),

187

}));

188

});

189

190

// Take screenshot for debugging

191

await saveSnapshot({

192

key: `screenshot-${request.uniqueKey}`,

193

saveHtml: true,

194

});

195

196

await Dataset.pushData({

197

url: request.loadedUrl,

198

products,

199

extractedAt: new Date(),

200

});

201

202

// Find and click "Load More" buttons

203

await page.click('.load-more-btn').catch(() => {

204

// Ignore if button doesn't exist

205

});

206

207

// Enqueue pagination links

208

await enqueueLinks({

209

selector: 'a[href*="page="]',

210

label: 'LIST',

211

});

212

},

213

214

// Enable request blocking for faster crawling

215

blockRequests: true,

216

blockedUrlPatterns: [

217

'**/*.css',

218

'**/*.jpg',

219

'**/*.jpeg',

220

'**/*.png',

221

'**/*.svg',

222

'**/*.gif',

223

'**/*.woff',

224

'**/*.pdf',

225

'**/*.zip',

226

],

227

228

maxConcurrency: 3, // Lower concurrency for browser crawling

229

navigationTimeoutSecs: 30,

230

});

231

```

232

233

### PlaywrightCrawler

234

235

Browser crawler using Playwright for multi-browser automation.

236

237

```typescript { .api }

238

/**

239

* Playwright-based browser crawler supporting Chromium, Firefox, and WebKit

240

*/

241

class PlaywrightCrawler extends BrowserCrawler {

242

constructor(options: PlaywrightCrawlerOptions);

243

}

244

```

245

246

### PlaywrightCrawlerOptions

247

248

Configuration options specific to PlaywrightCrawler.

249

250

```typescript { .api }

251

interface PlaywrightCrawlerOptions extends BrowserCrawlerOptions {

252

/** Handler function that receives Playwright context */

253

requestHandler: (context: PlaywrightCrawlingContext) => Promise<void>;

254

255

/** Playwright launch context */

256

launchContext?: PlaywrightLaunchContext;

257

258

/** Browser type to use (chromium, firefox, webkit) */

259

browserName?: 'chromium' | 'firefox' | 'webkit';

260

261

/** Whether to use browser context fingerprinting */

262

useFingerprints?: boolean;

263

264

/** Additional browser launch options */

265

launchOptions?: LaunchOptions;

266

267

/** Experiment with different rendering strategies */

268

experimentalContainers?: boolean;

269

}

270

```

271

272

### PlaywrightCrawlingContext

273

274

The context object passed to Playwright crawler request handlers.

275

276

```typescript { .api }

277

interface PlaywrightCrawlingContext<UserData = Dictionary> extends BrowserCrawlingContext<UserData> {

278

/** The Playwright page object */

279

page: PlaywrightPage;

280

281

/** Browser context */

282

browserContext: PlaywrightBrowserContext;

283

284

/** The Playwright response object */

285

response?: PlaywrightResponse;

286

287

/** Wait for network to be idle */

288

waitForNetworkIdle(options?: WaitForNetworkIdleOptions): Promise<void>;

289

290

/** Handle dialogs (alerts, confirms, prompts) */

291

handleDialog(handler: (dialog: Dialog) => Promise<void>): void;

292

}

293

```

294

295

**Usage Examples:**

296

297

```typescript

298

import { PlaywrightCrawler, Dataset } from "crawlee";

299

300

const crawler = new PlaywrightCrawler({

301

launchContext: {

302

launcher: 'chromium', // or 'firefox', 'webkit'

303

launchOptions: {

304

headless: true,

305

viewport: { width: 1920, height: 1080 },

306

},

307

},

308

309

requestHandler: async ({ page, request, enqueueLinks, waitForNetworkIdle }) => {

310

// Handle JavaScript-heavy pages

311

await waitForNetworkIdle({ timeout: 30000 });

312

313

// Interact with dynamic forms

314

await page.fill('input[name="search"]', 'example query');

315

await page.click('button[type="submit"]');

316

await page.waitForSelector('.results', { timeout: 10000 });

317

318

// Extract data after JavaScript execution

319

const results = await page.locator('.result-item').evaluateAll(items => {

320

return items.map(item => ({

321

title: item.querySelector('.title')?.textContent?.trim(),

322

description: item.querySelector('.description')?.textContent?.trim(),

323

link: item.querySelector('a')?.href,

324

}));

325

});

326

327

await page.screenshot({

328

path: `screenshots/${request.uniqueKey}.png`,

329

fullPage: true,

330

});

331

332

await Dataset.pushData({

333

url: request.loadedUrl,

334

results,

335

totalCount: results.length,

336

});

337

338

// Handle pagination with JavaScript

339

const hasNextPage = await page.locator('.next-page:not(.disabled)').count() > 0;

340

if (hasNextPage) {

341

await page.click('.next-page');

342

await enqueueLinks({

343

selector: '.next-page',

344

label: 'LIST',

345

});

346

}

347

},

348

349

browserName: 'chromium',

350

maxConcurrency: 2,

351

});

352

```

353

354

### AdaptivePlaywrightCrawler

355

356

Intelligent crawler that automatically switches between HTTP and browser rendering based on page requirements.

357

358

```typescript { .api }

359

/**

360

* Adaptive crawler that switches between HTTP and browser rendering automatically

361

*/

362

class AdaptivePlaywrightCrawler extends PlaywrightCrawler {

363

constructor(options: AdaptivePlaywrightCrawlerOptions);

364

}

365

```

366

367

### AdaptivePlaywrightCrawlerOptions

368

369

Configuration options for the AdaptivePlaywrightCrawler.

370

371

```typescript { .api }

372

interface AdaptivePlaywrightCrawlerOptions extends PlaywrightCrawlerOptions {

373

/** Strategy for determining rendering type */

374

renderingTypeDecisionMaker?: RenderingTypePredictor;

375

376

/** HTTP crawler options for static pages */

377

httpCrawlerOptions?: HttpCrawlerOptions;

378

379

/** Threshold for switching to browser rendering */

380

browserRenderingThreshold?: number;

381

382

/** Whether to cache rendering decisions */

383

cacheDecisions?: boolean;

384

}

385

```

386

387

### RenderingTypePredictor

388

389

Service that predicts the optimal rendering strategy for websites.

390

391

```typescript { .api }

392

/**

393

* Predicts whether a website requires browser rendering or can use HTTP

394

*/

395

class RenderingTypePredictor {

396

constructor();

397

398

/** Predict rendering type for a URL */

399

predictRenderingType(url: string): Promise<RenderingType>;

400

401

/** Store rendering decision for future use */

402

storeResult(url: string, renderingType: RenderingType): void;

403

404

/** Get cached decision if available */

405

getCachedResult(url: string): RenderingType | null;

406

}

407

408

type RenderingType = 'http' | 'browser' | 'hybrid';

409

```

410

411

### BrowserPool

412

413

Manages browser instances efficiently for optimal resource usage.

414

415

```typescript { .api }

416

/**

417

* Pool for managing browser instances with automatic lifecycle management

418

*/

419

class BrowserPool {

420

constructor(options?: BrowserPoolOptions);

421

422

/** Get a browser page from the pool */

423

newPage(options?: NewPageOptions): Promise<{ page: Page; browser: Browser }>;

424

425

/** Return a page to the pool */

426

retire(page: Page): Promise<void>;

427

428

/** Destroy all browsers in the pool */

429

destroy(): Promise<void>;

430

431

/** Get current pool statistics */

432

getStatistics(): BrowserPoolStatistics;

433

}

434

```

435

436

### BrowserPoolOptions

437

438

Configuration options for BrowserPool.

439

440

```typescript { .api }

441

interface BrowserPoolOptions {

442

/** Maximum number of open pages per browser instance */

443

maxOpenPagesPerBrowser?: number;

444

445

/** Browser plugins to use */

446

browserPlugins?: BrowserPlugin[];

447

448

/** Browser fingerprinting options */

449

fingerprintOptions?: FingerprintGeneratorOptions;

450

451

/** Whether to use fingerprints */

452

useFingerprints?: boolean;

453

454

/** Browser launch context */

455

launchContext?: LaunchContext;

456

457

/** Number of pages a browser opens before it is retired */

458

retireBrowserAfterPageCount?: number;

459

460

/** Maximum browser idle time before retirement */

461

retireInactiveBrowserAfterSecs?: number;

462

}

463

464

interface BrowserPoolStatistics {

465

/** Number of active browsers */

466

activeBrowsers: number;

467

468

/** Number of active pages */

469

activePages: number;

470

471

/** Number of retired browsers */

472

retiredBrowsers: number;

473

474

/** Total pages created */

475

totalPagesCreated: number;

476

}

477

```

478

479

### Browser Launchers

480

481

Specialized launchers for different browser automation libraries.

482

483

```typescript { .api }

484

/**

485

* Puppeteer browser launcher

486

*/

487

class PuppeteerLauncher {

488

constructor(options?: PuppeteerLauncherOptions);

489

490

/** Launch a Puppeteer browser */

491

launch(options?: LaunchOptions): Promise<Browser>;

492

}

493

494

/**

495

* Playwright browser launcher

496

*/

497

class PlaywrightLauncher {

498

constructor(options?: PlaywrightLauncherOptions);

499

500

/** Launch a Playwright browser */

501

launch(options?: LaunchOptions): Promise<Browser>;

502

}

503

```

504

505

### Utility Functions

506

507

Browser automation helper functions.

508

509

```typescript { .api }

510

const puppeteerUtils: {

511

/** Block requests matching patterns */

512

blockRequests(page: PuppeteerPage, options?: BlockRequestsOptions): Promise<void>;

513

514

/** Cache responses for faster loading */

515

cacheResponses(page: PuppeteerPage, cache: Map<string, any>): Promise<void>;

516

517

/** Compile and inject JavaScript into page */

518

compileScript(scriptString: string, context?: any): CompiledScriptFunction;

519

520

/** Navigate with retries and error handling */

521

gotoExtended(page: PuppeteerPage, request: Request, options?: DirectNavigationOptions): Promise<Response | null>;

522

523

/** Infinite scroll implementation */

524

infiniteScroll(page: PuppeteerPage, options?: InfiniteScrollOptions): Promise<void>;

525

526

/** Save page snapshot (HTML + screenshot) */

527

saveSnapshot(page: PuppeteerPage, options?: SaveSnapshotOptions): Promise<void>;

528

529

/** Enqueue links by clicking elements */

530

enqueueLinksByClickingElements(options: EnqueueLinksByClickingElementsOptions): Promise<BatchAddRequestsResult>;

531

};

532

533

const playwrightUtils: {

534

/** Block requests matching patterns */

535

blockRequests(page: PlaywrightPage, options?: BlockRequestsOptions): Promise<void>;

536

537

/** Navigate with retries and error handling */

538

gotoExtended(page: PlaywrightPage, request: Request, options?: DirectNavigationOptions): Promise<Response | null>;

539

540

/** Infinite scroll implementation */

541

infiniteScroll(page: PlaywrightPage, options?: InfiniteScrollOptions): Promise<void>;

542

543

/** Save page snapshot (HTML + screenshot) */

544

saveSnapshot(page: PlaywrightPage, options?: SaveSnapshotOptions): Promise<void>;

545

546

/** Wait for network to be idle */

547

waitForNetworkIdle(page: PlaywrightPage, options?: WaitForNetworkIdleOptions): Promise<void>;

548

};

549

```

550

551

**Usage Examples:**

552

553

```typescript

554

import { PuppeteerCrawler, puppeteerUtils } from "crawlee";

555

556

const crawler = new PuppeteerCrawler({

557

preNavigationHooks: [

558

async ({ page }, gotoOptions) => {

559

// Block unnecessary resources

560

await puppeteerUtils.blockRequests(page, {

561

urlPatterns: ['.css', '.jpg', '.png'],

562

});

563

564

// Set custom headers

565

await page.setExtraHTTPHeaders({

566

'Accept-Language': 'en-US,en;q=0.9',

567

});

568

},

569

],

570

571

postNavigationHooks: [

572

async ({ page }) => {

573

// Wait for dynamic content

574

await page.waitForSelector('.dynamic-content', { timeout: 5000 });

575

576

// Inject custom scripts

577

await page.addScriptTag({

578

content: 'window.customFlag = true;',

579

});

580

},

581

],

582

583

requestHandler: async ({ page, request, infiniteScroll, saveSnapshot }) => {

584

// Use utility functions

585

await infiniteScroll({

586

maxScrollHeight: 10000,

587

waitForSecs: 2,

588

});

589

590

// Take snapshot for debugging

591

await saveSnapshot({

592

key: `snapshot-${Date.now()}`,

593

saveHtml: true,

594

saveScreenshot: true,

595

});

596

597

// Extract data...

598

},

599

});

600

```

601

602

## Types

603

604

```typescript { .api }

605

interface LaunchContext {

606

/** Browser launcher instance */

607

launcher?: any;

608

609

/** Browser launch options */

610

launchOptions?: LaunchOptions;

611

612

/** Browser type identifier */

613

browserName?: BrowserName;

614

615

/** Whether to use incognito browser contexts */

616

useIncognito?: boolean;

617

618

/** Proxy configuration */

619

proxyUrl?: string;

620

621

/** User data directory for persistent sessions */

622

userDataDir?: string;

623

}

624

625

interface DirectNavigationOptions {

626

/** Navigation timeout */

627

timeout?: number;

628

629

/** Wait until condition */

630

waitUntil?: 'load' | 'domcontentloaded' | 'networkidle0' | 'networkidle2';

631

632

/** Referer header */

633

referer?: string;

634

}

635

636

interface InfiniteScrollOptions {

637

/** Maximum height to scroll */

638

maxScrollHeight?: number;

639

640

/** Time to wait between scrolls */

641

waitForSecs?: number;

642

643

/** Scroll down and back up */

644

scrollDownAndUp?: boolean;

645

646

/** Custom scroll function */

647

scrollFunction?: string;

648

649

/** Stop scrolling condition */

650

stopScrollCallback?: () => boolean;

651

}

652

653

interface SaveSnapshotOptions {

654

/** Key to save under */

655

key: string;

656

657

/** Save HTML content */

658

saveHtml?: boolean;

659

660

/** Save screenshot */

661

saveScreenshot?: boolean;

662

663

/** Screenshot options */

664

screenshotOptions?: {

665

fullPage?: boolean;

666

quality?: number;

667

type?: 'png' | 'jpeg';

668

};

669

670

/** Key-value store to save to */

671

keyValueStore?: KeyValueStore;

672

}

673

674

interface BlockRequestsOptions {

675

/** URL patterns to block */

676

urlPatterns?: string[];

677

678

/** Extra URL patterns to block */

679

extraUrlPatterns?: string[];

680

681

/** Whether to block CSS */

682

blockCssRequests?: boolean;

683

684

/** Whether to block fonts */

685

blockFontRequests?: boolean;

686

687

/** Whether to block images */

688

blockImageRequests?: boolean;

689

690

/** Custom request handler */

691

requestHandler?: (request: any) => boolean;

692

}

693

694

interface ClickElementsOptions {

695

/** Maximum number of elements to click */

696

limit?: number;

697

698

/** Delay between clicks */

699

delay?: number;

700

701

/** Whether to wait for navigation after clicking */

702

waitForNavigation?: boolean;

703

704

/** Timeout for clicking each element */

705

timeout?: number;

706

}

707

708

interface EnqueueLinksByClickingElementsOptions extends CrawlerEnqueueLinksOptions {

709

/** Elements to click for finding links */

710

selector: string;

711

712

/** Wait for selector after clicking */

713

waitForSelector?: string;

714

715

/** Maximum number of clicks */

716

clickLimit?: number;

717

}

718

719

interface WaitForNetworkIdleOptions {

720

/** Timeout for network idle */

721

timeout?: number;

722

723

/** Time to wait with no network requests */

724

idleTime?: number;

725

}

726

727

interface CompileScriptOptions {

728

/** Context variables to inject */

729

context?: any;

730

731

/** Whether to return a promise */

732

async?: boolean;

733

}

734

735

type CompiledScriptFunction = (...args: any[]) => Promise<any>;

736

737

enum BrowserName {

738

CHROMIUM = 'chromium',

739

CHROME = 'chrome',

740

FIREFOX = 'firefox',

741

WEBKIT = 'webkit',

742

SAFARI = 'webkit',

743

}

744

745

interface Viewport {

746

/** Width in pixels */

747

width: number;

748

749

/** Height in pixels */

750

height: number;

751

752

/** Device scale factor */

753

deviceScaleFactor?: number;

754

755

/** Whether it's a mobile device */

756

isMobile?: boolean;

757

758

/** Whether it has touch support */

759

hasTouch?: boolean;

760

761

/** Whether it's in landscape mode */

762

isLandscape?: boolean;

763

}

764

765

interface FingerprintGeneratorOptions {

766

/** Browser fingerprints to generate */

767

browsers?: BrowserName[];

768

769

/** Operating systems to simulate */

770

operatingSystems?: OperatingSystemsName[];

771

772

/** Device categories to simulate */

773

devices?: DeviceCategory[];

774

775

/** Locale settings */

776

locales?: string[];

777

}

778

779

enum DeviceCategory {

780

DESKTOP = 'desktop',

781

MOBILE = 'mobile',

782

}

783

784

enum OperatingSystemsName {

785

WINDOWS = 'windows',

786

MACOS = 'macos',

787

LINUX = 'linux',

788

ANDROID = 'android',

789

IOS = 'ios',

790

}

791

```