or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

browser-automation.md · cli-tools.md · configuration.md · core-types.md · crawlers.md · error-handling.md · events.md · fingerprinting.md · http-clients.md · index.md · request-management.md · sessions.md · statistics.md · storage.md

docs/browser-automation.md

0

# Browser Automation

1

2

Optional Playwright integration for full browser automation with support for JavaScript-heavy sites and complex user interactions. Browser automation capabilities enable crawling of dynamic content that requires JavaScript execution.

3

4

## Capabilities

5

6

### Browser Pool

7

8

Pool of browser instances for efficient resource management and reuse across multiple crawler requests.

9

10

```python { .api }

11

class BrowserPool:
    """Pool of browser instances reused across multiple crawler requests."""

    def __init__(
        self,
        *,
        browser_type: Literal["chromium", "firefox", "webkit"] = "chromium",
        max_browsers: int = 10,
        idle_browser_ttl: timedelta = timedelta(minutes=5),
        # `typing.Any` is the "any value" type; lowercase `any` is the builtin
        # function and is not a valid type argument.
        browser_options: dict[str, Any] | None = None,
        page_options: dict[str, Any] | None = None,
    ): ...

    async def new_page(self, **page_options: Any) -> tuple[Page, Browser]:
        """
        Get new page from browser pool.

        Args:
            **page_options: Additional options for page creation

        Returns:
            Tuple of (Page, Browser) objects
        """

    async def retire_browser(self, browser: Browser) -> None:
        """Remove browser from pool and close it."""

    async def close(self) -> None:
        """Close all browsers and clean up pool."""

    @property
    def browser_type(self) -> str: ...

    @property
    def active_browsers(self) -> int:
        """Number of currently active browsers."""

45

```

46

47

### Playwright Browser Controller

48

49

Controller for managing Playwright browser instances with advanced configuration and lifecycle management.

50

51

```python { .api }

52

class PlaywrightBrowserController:
    """Controller for launching, configuring and closing Playwright browsers."""

    def __init__(
        self,
        *,
        browser_type: Literal["chromium", "firefox", "webkit"] = "chromium",
        # `typing.Any`, not the builtin function `any`, is the correct
        # value type for free-form option dictionaries.
        launch_options: dict[str, Any] | None = None,
        new_page_options: dict[str, Any] | None = None,
    ): ...

    async def launch(self) -> Browser:
        """
        Launch new browser instance.

        Returns:
            Playwright Browser object
        """

    async def new_page(
        self,
        browser: Browser | None = None,
        **page_options: Any,
    ) -> Page:
        """
        Create new page in browser.

        Args:
            browser: Browser instance (creates new if None)
            **page_options: Options for page creation

        Returns:
            Playwright Page object
        """

    async def close_browser(self, browser: Browser) -> None:
        """Close browser instance."""

    @property
    def browser_type(self) -> str: ...

    @property
    def launch_options(self) -> dict[str, Any]: ...

93

```

94

95

### Playwright Browser Plugin

96

97

Plugin system for extending browser functionality with custom behaviors and middleware.

98

99

```python { .api }

100

class PlaywrightBrowserPlugin:
    """Base class whose hooks run around browser and page lifecycle events."""

    async def before_launch(
        self,
        browser_type: str,
        # `typing.Any` replaces the builtin function `any`, which is not a type.
        launch_options: dict[str, Any],
    ) -> dict[str, Any]:
        """
        Hook called before browser launch.

        Args:
            browser_type: Type of browser being launched
            launch_options: Launch options for browser

        Returns:
            Modified launch options
        """

    async def after_launch(self, browser: Browser) -> None:
        """
        Hook called after browser launch.

        Args:
            browser: Launched browser instance
        """

    async def before_page_create(
        self,
        browser: Browser,
        page_options: dict[str, Any],
    ) -> dict[str, Any]:
        """
        Hook called before page creation.

        Args:
            browser: Browser instance
            page_options: Page creation options

        Returns:
            Modified page options
        """

    async def after_page_create(self, page: Page) -> None:
        """
        Hook called after page creation.

        Args:
            page: Created page instance
        """

    async def before_page_close(self, page: Page) -> None:
        """Hook called before page closes."""

    async def after_browser_close(self, browser: Browser) -> None:
        """Hook called after browser closes."""

154

```

155

156

## Browser Configuration

157

158

### Launch Options

159

160

Common Playwright browser launch options for customizing browser behavior.

161

162

```python { .api }

163

class BrowserLaunchOptions:
    """Common Playwright browser launch options and their defaults."""

    headless: bool = True
    slow_mo: int = 0
    timeout: int = 30000
    executable_path: str | None = None
    args: list[str] | None = None
    ignore_default_args: bool | list[str] = False
    handle_sigint: bool = True
    handle_sigterm: bool = True
    handle_sighup: bool = True
    proxy: ProxySettings | None = None
    downloads_path: str | None = None
    chromium_sandbox: bool | None = None
    # `typing.Any` is the value type; lowercase `any` is the builtin function.
    firefox_user_prefs: dict[str, Any] | None = None

177

```

178

179

### Page Options

180

181

Configuration options for Playwright page creation and behavior.

182

183

```python { .api }

184

class PageOptions:
    """Configuration options for Playwright page creation and behavior."""

    # Rendering surface
    viewport: ViewportSize | None = None
    screen: ScreenSize | None = None
    no_viewport: bool = False

    # Page security and scripting
    ignore_https_errors: bool = False
    java_script_enabled: bool = True
    bypass_csp: bool = False

    # Identity, locale and location
    user_agent: str | None = None
    locale: str | None = None
    timezone_id: str | None = None
    geolocation: Geolocation | None = None
    permissions: list[str] | None = None

    # Network
    extra_http_headers: dict[str, str] | None = None
    offline: bool = False
    http_credentials: HttpCredentials | None = None

    # Device emulation
    device_scale_factor: float | None = None
    is_mobile: bool | None = None
    has_touch: bool | None = None

    # Media-feature emulation
    color_scheme: Literal["light", "dark", "no-preference"] | None = None
    reduced_motion: Literal["reduce", "no-preference"] | None = None
    forced_colors: Literal["active", "none"] | None = None

205

```

206

207

## Usage Examples

208

209

### Basic Browser Pool Usage

210

211

```python

212

import asyncio
from datetime import timedelta  # required by `idle_browser_ttl` below

from crawlee.browsers import BrowserPool

async def main():
    """Open one page from a browser pool, read its title and take a screenshot."""
    # Create browser pool
    browser_pool = BrowserPool(
        browser_type="chromium",
        max_browsers=5,
        idle_browser_ttl=timedelta(minutes=3)
    )

    try:
        # Get page from pool
        page, browser = await browser_pool.new_page()

        # Navigate and interact with page
        await page.goto('https://example.com')
        await page.wait_for_load_state('networkidle')

        title = await page.title()
        print(f"Page title: {title}")

        # Take screenshot
        await page.screenshot(path='screenshot.png')

        # Close page (browser returns to pool)
        await page.close()

    finally:
        # Clean up pool
        await browser_pool.close()

asyncio.run(main())

245

```

246

247

### Browser Controller with Custom Options

248

249

```python

250

import asyncio

from crawlee.browsers import PlaywrightBrowserController

async def main():
    """Launch a configured browser, read the main heading, and always close it."""
    # Configure browser with custom options
    controller = PlaywrightBrowserController(
        browser_type="chromium",
        launch_options={
            'headless': False,  # Show browser window
            'slow_mo': 50,  # Slow down operations
            'args': [
                '--disable-blink-features=AutomationControlled',
                '--disable-dev-shm-usage'
            ]
        },
        new_page_options={
            'viewport': {'width': 1920, 'height': 1080},
            'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
    )

    # Stays None if launch() itself fails, so cleanup can be skipped safely.
    browser = None
    try:
        # Launch browser
        browser = await controller.launch()

        # Create page with configured options
        page = await controller.new_page(browser)

        # Navigate and interact
        await page.goto('https://example.com')

        # Wait for specific element
        await page.wait_for_selector('h1')

        # Click button if it exists
        button = page.locator('button:has-text("Accept")')
        if await button.count() > 0:
            await button.click()

        # Extract data
        heading = await page.locator('h1').text_content()
        print(f"Main heading: {heading}")

        await page.close()

    except Exception as e:
        print(f"Browser automation error: {e}")

    finally:
        # Close the browser on the error path too; otherwise a failure above
        # leaves the browser process running.
        if browser is not None:
            await controller.close_browser(browser)

asyncio.run(main())

300

```

301

302

### Custom Browser Plugin

303

304

```python

305

import asyncio

from crawlee.browsers import PlaywrightBrowserPlugin, BrowserPool

class StealthPlugin(PlaywrightBrowserPlugin):
    """Plugin to make browser appear more human-like."""

    async def before_launch(self, browser_type: str, launch_options: dict) -> dict:
        """Return a copy of ``launch_options`` with stealth arguments appended.

        Args:
            browser_type: Type of browser being launched (unused here).
            launch_options: Launch options for the browser; not mutated.

        Returns:
            New launch options dict whose ``args`` list includes the stealth flags.
        """
        # Copy the list so the caller's options are not mutated, and treat a
        # missing or None 'args' entry as empty.
        args = list(launch_options.get('args') or [])
        args.extend([
            '--disable-blink-features=AutomationControlled',
            '--disable-dev-shm-usage',
            '--no-sandbox',
            '--disable-setuid-sandbox'
        ])

        return {**launch_options, 'args': args}

    async def after_page_create(self, page):
        """Install init scripts that mask common automation signals.

        Args:
            page: Created page instance.
        """
        # Remove webdriver property
        await page.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {
                get: () => false,
            });
        """)

        # Override permissions query.
        # - The native method is bound so calling it does not lose `this`.
        # - The notification state comes from the browser's own
        #   `Notification.permission`, which exists in the page context.
        # - No top-level `return`, which is not valid in a plain script.
        await page.add_init_script("""
            const permissions = window.navigator.permissions;
            const originalQuery = permissions.query.bind(permissions);
            permissions.query = (parameters) => (
                parameters.name === 'notifications' ?
                    Promise.resolve({ state: Notification.permission }) :
                    originalQuery(parameters)
            );
        """)

async def main():
    """Open a page through a pool that uses StealthPlugin and report `navigator.webdriver`."""
    # Create browser pool with custom plugin
    plugin = StealthPlugin()

    # NOTE(review): `browser_plugins` is not listed in the BrowserPool
    # signature documented earlier on this page, and upstream crawlee appears
    # to name this keyword `plugins` — confirm against the installed version.
    browser_pool = BrowserPool(
        browser_type="chromium",
        browser_plugins=[plugin]
    )

    try:
        page, browser = await browser_pool.new_page()

        # Browser now has stealth features enabled
        await page.goto('https://bot-detection-test.com')

        # Check if bot detection was bypassed
        result = await page.evaluate('() => window.navigator.webdriver')
        print(f"Webdriver detected: {result}")

        await page.close()

    finally:
        await browser_pool.close()

asyncio.run(main())

367

```

368

369

### Integration with Playwright Crawler

370

371

```python

372

import asyncio
from urllib.parse import urljoin

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.browsers import BrowserPool

async def main():
    """Crawl a product listing with a custom browser pool and follow "Next" links."""
    # Create custom browser pool
    browser_pool = BrowserPool(
        browser_type="chromium",
        max_browsers=3,
        browser_options={
            'headless': True,
            'args': ['--disable-dev-shm-usage']
        },
        page_options={
            'viewport': {'width': 1366, 'height': 768},
            'user_agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
        }
    )

    # Create crawler with custom browser pool
    crawler = PlaywrightCrawler(
        browser_pool=browser_pool,
        max_requests_per_crawl=20
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext):
        """Extract product name/price pairs and enqueue pagination links."""
        page = context.page

        # Wait for page to be fully loaded
        await page.wait_for_load_state('networkidle')

        # Handle infinite scroll or load more buttons
        await context.infinite_scroll(max_scroll_height=5000)

        # Extract data using Playwright selectors
        products = await page.locator('.product').all()

        for product in products:
            name = await product.locator('.product-name').text_content()
            price = await product.locator('.price').text_content()

            data = {
                'url': context.request.url,
                'name': name.strip() if name else None,
                'price': price.strip() if price else None
            }

            await context.push_data(data)

        # Find and enqueue pagination links
        next_links = await page.locator('a:has-text("Next")').all()
        for link in next_links:
            href = await link.get_attribute('href')
            if href:
                # `href` is the raw attribute value and is often relative
                # (e.g. "?page=2"); resolve it against the current page URL
                # so an absolute URL is enqueued.
                await context.add_requests([urljoin(context.request.url, href)])

    await crawler.run(['https://example-store.com/products'])

asyncio.run(main())

432

```

433

434

### Advanced Page Interactions

435

436

```python

437

import asyncio

from crawlee.browsers import BrowserPool

async def main():
    """Log in, download a PDF, screenshot an element and evaluate JavaScript."""
    pool = BrowserPool()

    try:
        page, _browser = await pool.new_page()

        await page.goto('https://example.com/login')

        # Fill login form (dict order keeps username before password)
        form_values = {
            'input[name="username"]': 'myusername',
            'input[name="password"]': 'mypassword',
        }
        for selector, value in form_values.items():
            await page.fill(selector, value)

        # Click login button and wait for navigation
        async with page.expect_navigation():
            await page.click('button[type="submit"]')

        # Wait for dashboard to load
        await page.wait_for_selector('.dashboard')

        # Handle file download
        async with page.expect_download() as pending_download:
            await page.click('a[href$=".pdf"]')

        pdf = await pending_download.value
        await pdf.save_as('./downloaded_file.pdf')

        # Take screenshot of specific element
        await page.locator('.important-data').screenshot(path='element_screenshot.png')

        # Execute custom JavaScript
        page_info_script = '''
            () => {
                return {
                    title: document.title,
                    userAgent: navigator.userAgent,
                    cookies: document.cookie
                };
            }
        '''
        page_info = await page.evaluate(page_info_script)

        print(f"Page info: {page_info}")

        await page.close()

    finally:
        await pool.close()

asyncio.run(main())

489

```