0
# Browser Automation
1
2
Optional Playwright integration for full browser automation with support for JavaScript-heavy sites and complex user interactions. Browser automation capabilities enable crawling of dynamic content that requires JavaScript execution.
3
4
## Capabilities
5
6
### Browser Pool
7
8
Pool of browser instances for efficient resource management and reuse across multiple crawler requests.
9
10
```python { .api }
11
class BrowserPool:
    """Pool of browser instances reused across multiple crawler requests."""

    def __init__(
        self,
        *,
        browser_type: Literal["chromium", "firefox", "webkit"] = "chromium",
        max_browsers: int = 10,
        idle_browser_ttl: timedelta = timedelta(minutes=5),
        browser_options: dict[str, Any] | None = None,
        page_options: dict[str, Any] | None = None,
    ):
        """
        Configure the pool. All arguments are keyword-only.

        Args:
            browser_type: Browser engine: "chromium", "firefox" or "webkit"
            max_browsers: Browser limit for the pool (default 10)
            idle_browser_ttl: Time-to-live for idle browsers (default 5 minutes)
            browser_options: Optional mapping of browser options
            page_options: Optional mapping of page options
        """

    async def new_page(self, **page_options) -> tuple[Page, Browser]:
        """
        Get new page from browser pool.

        Args:
            **page_options: Additional options for page creation

        Returns:
            Tuple of (Page, Browser) objects
        """

    async def retire_browser(self, browser: Browser) -> None:
        """Remove browser from pool and close it."""

    async def close(self) -> None:
        """Close all browsers and clean up pool."""

    @property
    def browser_type(self) -> str:
        """Browser type of this pool."""

    @property
    def active_browsers(self) -> int:
        """Number of currently active browsers."""
45
```
46
47
### Playwright Browser Controller
48
49
Controller for managing Playwright browser instances with advanced configuration and lifecycle management.
50
51
```python { .api }
52
class PlaywrightBrowserController:
    """Controller for managing Playwright browser instances and their lifecycle."""

    def __init__(
        self,
        *,
        browser_type: Literal["chromium", "firefox", "webkit"] = "chromium",
        launch_options: dict[str, Any] | None = None,
        new_page_options: dict[str, Any] | None = None,
    ):
        """
        Configure the controller. All arguments are keyword-only.

        Args:
            browser_type: Browser engine: "chromium", "firefox" or "webkit"
            launch_options: Optional mapping of browser launch options
            new_page_options: Optional mapping of page creation options
        """

    async def launch(self) -> Browser:
        """
        Launch new browser instance.

        Returns:
            Playwright Browser object
        """

    async def new_page(
        self,
        browser: Browser | None = None,
        **page_options,
    ) -> Page:
        """
        Create new page in browser.

        Args:
            browser: Browser instance (creates new if None)
            **page_options: Options for page creation

        Returns:
            Playwright Page object
        """

    async def close_browser(self, browser: Browser) -> None:
        """Close browser instance."""

    @property
    def browser_type(self) -> str:
        """Browser type of this controller."""

    @property
    def launch_options(self) -> dict[str, Any]:
        """Launch options of this controller."""
93
```
94
95
### Playwright Browser Plugin
96
97
Plugin system for extending browser functionality with custom behaviors and middleware.
98
99
```python { .api }
100
class PlaywrightBrowserPlugin:
    """Base class whose hooks extend browser functionality with custom behaviors."""

    async def before_launch(
        self,
        browser_type: str,
        launch_options: dict[str, Any],
    ) -> dict[str, Any]:
        """
        Hook called before browser launch.

        Args:
            browser_type: Type of browser being launched
            launch_options: Launch options for browser

        Returns:
            Modified launch options
        """

    async def after_launch(self, browser: Browser) -> None:
        """
        Hook called after browser launch.

        Args:
            browser: Launched browser instance
        """

    async def before_page_create(
        self,
        browser: Browser,
        page_options: dict[str, Any],
    ) -> dict[str, Any]:
        """
        Hook called before page creation.

        Args:
            browser: Browser instance
            page_options: Page creation options

        Returns:
            Modified page options
        """

    async def after_page_create(self, page: Page) -> None:
        """
        Hook called after page creation.

        Args:
            page: Created page instance
        """

    async def before_page_close(self, page: Page) -> None:
        """Hook called before page closes."""

    async def after_browser_close(self, browser: Browser) -> None:
        """Hook called after browser closes."""
154
```
155
156
## Browser Configuration
157
158
### Launch Options
159
160
Common Playwright browser launch options for customizing browser behavior.
161
162
```python { .api }
163
class BrowserLaunchOptions:
    """Common Playwright browser launch options for customizing browser behavior."""

    headless: bool = True
    slow_mo: int = 0
    timeout: int = 30000
    executable_path: str | None = None
    args: list[str] | None = None
    ignore_default_args: bool | list[str] = False
    handle_sigint: bool = True
    handle_sigterm: bool = True
    handle_sighup: bool = True
    proxy: ProxySettings | None = None
    downloads_path: str | None = None
    chromium_sandbox: bool | None = None
    firefox_user_prefs: dict[str, Any] | None = None
177
```
178
179
### Page Options
180
181
Configuration options for Playwright page creation and behavior.
182
183
```python { .api }
184
class PageOptions:
    """Configuration options for Playwright page creation and behavior."""

    viewport: ViewportSize | None = None
    screen: ScreenSize | None = None
    no_viewport: bool = False
    ignore_https_errors: bool = False
    java_script_enabled: bool = True
    bypass_csp: bool = False
    user_agent: str | None = None
    locale: str | None = None
    timezone_id: str | None = None
    geolocation: Geolocation | None = None
    permissions: list[str] | None = None
    extra_http_headers: dict[str, str] | None = None
    offline: bool = False
    http_credentials: HttpCredentials | None = None
    device_scale_factor: float | None = None
    is_mobile: bool | None = None
    has_touch: bool | None = None
    color_scheme: Literal["light", "dark", "no-preference"] | None = None
    reduced_motion: Literal["reduce", "no-preference"] | None = None
    forced_colors: Literal["active", "none"] | None = None
205
```
206
207
## Usage Examples
208
209
### Basic Browser Pool Usage
210
211
```python
212
import asyncio
from datetime import timedelta

from crawlee.browsers import BrowserPool


async def main() -> None:
    """Open one page from a browser pool, print its title and save a screenshot."""
    # Create browser pool
    browser_pool = BrowserPool(
        browser_type="chromium",
        max_browsers=5,
        idle_browser_ttl=timedelta(minutes=3),
    )

    try:
        # Get page from pool
        page, browser = await browser_pool.new_page()

        # Navigate and interact with page
        await page.goto('https://example.com')
        await page.wait_for_load_state('networkidle')

        title = await page.title()
        print(f"Page title: {title}")

        # Take screenshot
        await page.screenshot(path='screenshot.png')

        # Close page (browser returns to pool)
        await page.close()

    finally:
        # Clean up pool even if navigation or the screenshot failed
        await browser_pool.close()


if __name__ == "__main__":
    asyncio.run(main())
245
```
246
247
### Browser Controller with Custom Options
248
249
```python
250
import asyncio

from crawlee.browsers import PlaywrightBrowserController


async def main() -> None:
    """Launch a configured browser, read the page heading, and always close the browser."""
    # Configure browser with custom options
    controller = PlaywrightBrowserController(
        browser_type="chromium",
        launch_options={
            'headless': False,  # Show browser window
            'slow_mo': 50,      # Slow down operations
            'args': [
                '--disable-blink-features=AutomationControlled',
                '--disable-dev-shm-usage',
            ],
        },
        new_page_options={
            'viewport': {'width': 1920, 'height': 1080},
            'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        },
    )

    # Stays None if launch itself fails, so cleanup knows there is nothing to close
    browser = None
    try:
        # Launch browser
        browser = await controller.launch()

        # Create page with configured options
        page = await controller.new_page(browser)

        # Navigate and interact
        await page.goto('https://example.com')

        # Wait for specific element
        await page.wait_for_selector('h1')

        # Click button if it exists
        button = page.locator('button:has-text("Accept")')
        if await button.count() > 0:
            await button.click()

        # Extract data
        heading = await page.locator('h1').text_content()
        print(f"Main heading: {heading}")

        await page.close()

    except Exception as e:
        print(f"Browser automation error: {e}")

    finally:
        # Close the browser on both the success and the error path
        if browser is not None:
            await controller.close_browser(browser)


if __name__ == "__main__":
    asyncio.run(main())
300
```
301
302
### Custom Browser Plugin
303
304
```python
305
import asyncio

from crawlee.browsers import PlaywrightBrowserPlugin, BrowserPool


class StealthPlugin(PlaywrightBrowserPlugin):
    """Plugin to make browser appear more human-like."""

    async def before_launch(self, browser_type: str, launch_options: dict) -> dict:
        """Add the stealth command-line arguments to the launch options and return them."""
        # Build a new list instead of extending in place: the caller's list is
        # not mutated, and an explicit `'args': None` no longer crashes.
        launch_options['args'] = [
            *(launch_options.get('args') or []),
            '--disable-blink-features=AutomationControlled',
            '--disable-dev-shm-usage',
            '--no-sandbox',
            '--disable-setuid-sandbox',
        ]

        return launch_options

    async def after_page_create(self, page):
        """Install init scripts that mask common automation fingerprints."""
        # Remove webdriver property
        await page.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {
                get: () => false,
            });
        """)

        # Override permissions query. Init scripts run as top-level scripts, so
        # no `return` statement is allowed; the original query is bound so that
        # calling it does not lose its `this`.
        await page.add_init_script("""
            const originalQuery = window.navigator.permissions.query.bind(
                window.navigator.permissions
            );
            window.navigator.permissions.query = (parameters) => (
                parameters.name === 'notifications' ?
                    Promise.resolve({ state: Notification.permission }) :
                    originalQuery(parameters)
            );
        """)


async def main() -> None:
    """Open a page through a pool that uses StealthPlugin and report navigator.webdriver."""
    # Create browser pool with custom plugin
    plugin = StealthPlugin()

    # NOTE(review): `browser_plugins` is not part of the BrowserPool signature
    # documented in this file - confirm the parameter name.
    browser_pool = BrowserPool(
        browser_type="chromium",
        browser_plugins=[plugin],
    )

    try:
        page, browser = await browser_pool.new_page()

        # Browser now has stealth features enabled
        await page.goto('https://bot-detection-test.com')

        # Check if bot detection was bypassed
        result = await page.evaluate('() => window.navigator.webdriver')
        print(f"Webdriver detected: {result}")

        await page.close()

    finally:
        await browser_pool.close()


if __name__ == "__main__":
    asyncio.run(main())
367
```
368
369
### Integration with Playwright Crawler
370
371
```python
372
import asyncio
from urllib.parse import urljoin

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.browsers import BrowserPool


async def main() -> None:
    """Crawl a product listing with a custom browser pool and follow "Next" links."""
    # Create custom browser pool
    browser_pool = BrowserPool(
        browser_type="chromium",
        max_browsers=3,
        browser_options={
            'headless': True,
            'args': ['--disable-dev-shm-usage'],
        },
        page_options={
            'viewport': {'width': 1366, 'height': 768},
            'user_agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
        },
    )

    # Create crawler with custom browser pool
    crawler = PlaywrightCrawler(
        browser_pool=browser_pool,
        max_requests_per_crawl=20,
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext):
        page = context.page

        # Wait for page to be fully loaded
        await page.wait_for_load_state('networkidle')

        # Handle infinite scroll or load more buttons
        await context.infinite_scroll(max_scroll_height=5000)

        # Extract data using Playwright selectors
        products = await page.locator('.product').all()

        for product in products:
            name = await product.locator('.product-name').text_content()
            price = await product.locator('.price').text_content()

            data = {
                'url': context.request.url,
                'name': name.strip() if name else None,
                'price': price.strip() if price else None,
            }

            await context.push_data(data)

        # Find and enqueue pagination links
        next_links = await page.locator('a:has-text("Next")').all()
        for link in next_links:
            href = await link.get_attribute('href')
            if href:
                # The attribute may hold a relative URL; resolve it against the
                # current page URL before enqueueing.
                await context.add_requests([urljoin(page.url, href)])

    await crawler.run(['https://example-store.com/products'])


if __name__ == "__main__":
    asyncio.run(main())
432
```
433
434
### Advanced Page Interactions
435
436
```python
437
import asyncio

from crawlee.browsers import BrowserPool


async def main() -> None:
    """Log in, download a PDF, screenshot an element and evaluate JavaScript on one page."""
    browser_pool = BrowserPool()

    try:
        page, browser = await browser_pool.new_page()

        await page.goto('https://example.com/login')

        # Fill login form
        await page.fill('input[name="username"]', 'myusername')
        await page.fill('input[name="password"]', 'mypassword')

        # Click login button and wait for navigation
        async with page.expect_navigation():
            await page.click('button[type="submit"]')

        # Wait for dashboard to load
        await page.wait_for_selector('.dashboard')

        # Handle file download
        async with page.expect_download() as download_info:
            await page.click('a[href$=".pdf"]')

        download = await download_info.value
        await download.save_as('./downloaded_file.pdf')

        # Take screenshot of specific element
        element = page.locator('.important-data')
        await element.screenshot(path='element_screenshot.png')

        # Execute custom JavaScript
        result = await page.evaluate('''
            () => {
                return {
                    title: document.title,
                    userAgent: navigator.userAgent,
                    cookies: document.cookie
                };
            }
        ''')

        print(f"Page info: {result}")

        await page.close()

    finally:
        await browser_pool.close()


if __name__ == "__main__":
    asyncio.run(main())
489
```