A comprehensive web scraping and browser automation library for Python, featuring human-like behavior and bot-protection bypass.
Optional Playwright integration for full browser automation with support for JavaScript-heavy sites and complex user interactions. Browser automation capabilities enable crawling of dynamic content that requires JavaScript execution.
Pool of browser instances for efficient resource management and reuse across multiple crawler requests.
class BrowserPool:
    """Pool of browser instances for efficient resource management and
    reuse across multiple crawler requests."""

    def __init__(
        self,
        *,
        browser_type: Literal["chromium", "firefox", "webkit"] = "chromium",
        max_browsers: int = 10,
        idle_browser_ttl: timedelta = timedelta(minutes=5),
        # NOTE: annotations fixed from `dict[str, any]` — `any` is the builtin
        # function, not a type; `typing.Any` is the intended annotation.
        browser_options: dict[str, Any] | None = None,
        page_options: dict[str, Any] | None = None,
    ): ...

    async def new_page(self, **page_options) -> tuple[Page, Browser]:
        """
        Get new page from browser pool.

        Args:
            **page_options: Additional options for page creation

        Returns:
            Tuple of (Page, Browser) objects
        """

    async def retire_browser(self, browser: Browser) -> None:
        """Remove browser from pool and close it."""

    async def close(self) -> None:
        """Close all browsers and clean up pool."""

    @property
    def browser_type(self) -> str: ...

    @property
    def active_browsers(self) -> int:
        """Number of currently active browsers."""


# Controller for managing Playwright browser instances with advanced
# configuration and lifecycle management.
class PlaywrightBrowserController:
    """Controller for managing Playwright browser instances with advanced
    configuration and lifecycle management."""

    def __init__(
        self,
        *,
        browser_type: Literal["chromium", "firefox", "webkit"] = "chromium",
        # NOTE: annotations fixed from `dict[str, any]` to `dict[str, Any]` —
        # the builtin `any` is a function, not a type.
        launch_options: dict[str, Any] | None = None,
        new_page_options: dict[str, Any] | None = None,
    ): ...

    async def launch(self) -> Browser:
        """
        Launch new browser instance.

        Returns:
            Playwright Browser object
        """

    async def new_page(
        self,
        browser: Browser | None = None,
        **page_options,
    ) -> Page:
        """
        Create new page in browser.

        Args:
            browser: Browser instance (creates new if None)
            **page_options: Options for page creation

        Returns:
            Playwright Page object
        """

    async def close_browser(self, browser: Browser) -> None:
        """Close browser instance."""

    @property
    def browser_type(self) -> str: ...

    @property
    def launch_options(self) -> dict[str, Any]: ...


# Plugin system for extending browser functionality with custom behaviors
# and middleware.
class PlaywrightBrowserPlugin:
    """Plugin base class for extending browser functionality with custom
    behaviors and middleware.

    Hooks run around browser launch, page creation, and close; the
    `before_*` hooks return (possibly modified) option dicts.
    """

    async def before_launch(
        self,
        browser_type: str,
        # NOTE: fixed `dict[str, any]` → `dict[str, Any]` (`any` is the
        # builtin function, not a type).
        launch_options: dict[str, Any],
    ) -> dict[str, Any]:
        """
        Hook called before browser launch.

        Args:
            browser_type: Type of browser being launched
            launch_options: Launch options for browser

        Returns:
            Modified launch options
        """

    async def after_launch(self, browser: Browser) -> None:
        """
        Hook called after browser launch.

        Args:
            browser: Launched browser instance
        """

    async def before_page_create(
        self,
        browser: Browser,
        page_options: dict[str, Any],
    ) -> dict[str, Any]:
        """
        Hook called before page creation.

        Args:
            browser: Browser instance
            page_options: Page creation options

        Returns:
            Modified page options
        """

    async def after_page_create(self, page: Page) -> None:
        """
        Hook called after page creation.

        Args:
            page: Created page instance
        """

    async def before_page_close(self, page: Page) -> None:
        """Hook called before page closes."""

    async def after_browser_close(self, browser: Browser) -> None:
        """Hook called after browser closes."""


# Common Playwright browser launch options for customizing browser behavior.
class BrowserLaunchOptions:
    """Common Playwright browser launch options for customizing browser behavior."""

    headless: bool = True                 # run without a visible window
    slow_mo: int = 0                      # delay between operations (presumably ms — Playwright convention)
    timeout: int = 30000                  # launch timeout (presumably ms — Playwright convention)
    executable_path: str | None = None    # custom browser binary path
    args: list[str] | None = None         # extra command-line arguments
    ignore_default_args: bool | list[str] = False
    handle_sigint: bool = True
    handle_sigterm: bool = True
    handle_sighup: bool = True
    proxy: ProxySettings | None = None
    downloads_path: str | None = None
    chromium_sandbox: bool | None = None
    # NOTE: fixed `dict[str, any]` → `dict[str, Any]` (`any` is the builtin
    # function, not a type).
    firefox_user_prefs: dict[str, Any] | None = None


# Configuration options for Playwright page creation and behavior.
class PageOptions:
    """Configuration options for Playwright page creation and behavior."""

    viewport: ViewportSize | None = None
    screen: ScreenSize | None = None
    no_viewport: bool = False
    ignore_https_errors: bool = False
    java_script_enabled: bool = True
    bypass_csp: bool = False              # bypass Content-Security-Policy
    user_agent: str | None = None
    locale: str | None = None
    timezone_id: str | None = None
    geolocation: Geolocation | None = None
    permissions: list[str] | None = None
    extra_http_headers: dict[str, str] | None = None
    offline: bool = False
    http_credentials: HttpCredentials | None = None
    device_scale_factor: float | None = None
    is_mobile: bool | None = None
    has_touch: bool | None = None
    color_scheme: Literal["light", "dark", "no-preference"] | None = None
    reduced_motion: Literal["reduce", "no-preference"] | None = None
    forced_colors: Literal["active", "none"] | None = None


import asyncio
# Fixed: `timedelta` was used below but never imported.
from datetime import timedelta

from crawlee.browsers import BrowserPool


async def main():
    # Create browser pool
    browser_pool = BrowserPool(
        browser_type="chromium",
        max_browsers=5,
        idle_browser_ttl=timedelta(minutes=3),
    )
    try:
        # Get page from pool
        page, browser = await browser_pool.new_page()

        # Navigate and interact with page
        await page.goto('https://example.com')
        await page.wait_for_load_state('networkidle')
        title = await page.title()
        print(f"Page title: {title}")

        # Take screenshot
        await page.screenshot(path='screenshot.png')

        # Close page (browser returns to pool)
        await page.close()
    finally:
        # Clean up pool
        await browser_pool.close()


asyncio.run(main())

import asyncio
from crawlee.browsers import PlaywrightBrowserController


async def main():
    # Configure browser with custom options
    controller = PlaywrightBrowserController(
        browser_type="chromium",
        launch_options={
            'headless': False,  # Show browser window
            'slow_mo': 50,      # Slow down operations
            'args': [
                '--disable-blink-features=AutomationControlled',
                '--disable-dev-shm-usage',
            ],
        },
        new_page_options={
            'viewport': {'width': 1920, 'height': 1080},
            'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        },
    )
    try:
        # Launch browser
        browser = await controller.launch()

        # Create page with configured options
        page = await controller.new_page(browser)

        # Navigate and interact
        await page.goto('https://example.com')

        # Wait for specific element
        await page.wait_for_selector('h1')

        # Click button if it exists
        button = page.locator('button:has-text("Accept")')
        if await button.count() > 0:
            await button.click()

        # Extract data
        heading = await page.locator('h1').text_content()
        print(f"Main heading: {heading}")

        await page.close()
        await controller.close_browser(browser)
    except Exception as e:
        print(f"Browser automation error: {e}")


asyncio.run(main())

import asyncio
from crawlee.browsers import PlaywrightBrowserPlugin, BrowserPool


class StealthPlugin(PlaywrightBrowserPlugin):
    """Plugin to make browser appear more human-like."""

    async def before_launch(self, browser_type: str, launch_options: dict) -> dict:
        # Add stealth arguments
        args = launch_options.get('args', [])
        args.extend([
            '--disable-blink-features=AutomationControlled',
            '--disable-dev-shm-usage',
            '--no-sandbox',
            '--disable-setuid-sandbox',
        ])
        launch_options['args'] = args
        return launch_options

    async def after_page_create(self, page):
        # Remove webdriver property
        await page.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {
                get: () => false,
            });
        """)
        # Override permissions query so a 'notifications' probe reports
        # 'granted' instead of exposing automation.
        # Fixed: the original snippet had a top-level `return` (a syntax
        # error in an init script) and referenced `Cypress.env('granted')`,
        # which is undefined in a page context — it must be the literal
        # string 'granted'.
        await page.add_init_script("""
            const originalQuery = window.navigator.permissions.query;
            window.navigator.permissions.query = (parameters) => (
                parameters.name === 'notifications' ?
                    Promise.resolve({ state: 'granted' }) :
                    originalQuery(parameters)
            );
        """)


async def main():
    # Create browser pool with custom plugin
    plugin = StealthPlugin()
    browser_pool = BrowserPool(
        browser_type="chromium",
        browser_plugins=[plugin],
    )
    try:
        page, browser = await browser_pool.new_page()

        # Browser now has stealth features enabled
        await page.goto('https://bot-detection-test.com')

        # Check if bot detection was bypassed
        result = await page.evaluate('() => window.navigator.webdriver')
        print(f"Webdriver detected: {result}")

        await page.close()
    finally:
        await browser_pool.close()


asyncio.run(main())

import asyncio
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.browsers import BrowserPool


async def main():
    # Create custom browser pool
    browser_pool = BrowserPool(
        browser_type="chromium",
        max_browsers=3,
        browser_options={
            'headless': True,
            'args': ['--disable-dev-shm-usage'],
        },
        page_options={
            'viewport': {'width': 1366, 'height': 768},
            'user_agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
        },
    )

    # Create crawler with custom browser pool
    crawler = PlaywrightCrawler(
        browser_pool=browser_pool,
        max_requests_per_crawl=20,
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext):
        page = context.page

        # Wait for page to be fully loaded
        await page.wait_for_load_state('networkidle')

        # Handle infinite scroll or load more buttons
        await context.infinite_scroll(max_scroll_height=5000)

        # Extract data using Playwright selectors
        products = await page.locator('.product').all()
        for product in products:
            name = await product.locator('.product-name').text_content()
            price = await product.locator('.price').text_content()
            data = {
                'url': context.request.url,
                'name': name.strip() if name else None,
                'price': price.strip() if price else None,
            }
            await context.push_data(data)

        # Find and enqueue pagination links
        next_links = await page.locator('a:has-text("Next")').all()
        for link in next_links:
            href = await link.get_attribute('href')
            if href:
                await context.add_requests([href])

    await crawler.run(['https://example-store.com/products'])


asyncio.run(main())

import asyncio
from crawlee.browsers import BrowserPool


async def main():
    browser_pool = BrowserPool()
    try:
        page, browser = await browser_pool.new_page()
        await page.goto('https://example.com/login')

        # Fill login form
        await page.fill('input[name="username"]', 'myusername')
        await page.fill('input[name="password"]', 'mypassword')

        # Click login button and wait for navigation
        async with page.expect_navigation():
            await page.click('button[type="submit"]')

        # Wait for dashboard to load
        await page.wait_for_selector('.dashboard')

        # Handle file download
        async with page.expect_download() as download_info:
            await page.click('a[href$=".pdf"]')
        download = await download_info.value
        await download.save_as('./downloaded_file.pdf')

        # Take screenshot of specific element
        element = page.locator('.important-data')
        await element.screenshot(path='element_screenshot.png')

        # Execute custom JavaScript
        result = await page.evaluate('''
            () => {
                return {
                    title: document.title,
                    userAgent: navigator.userAgent,
                    cookies: document.cookie
                };
            }
        ''')
        print(f"Page info: {result}")

        await page.close()
    finally:
        await browser_pool.close()


asyncio.run(main())

# Install with Tessl CLI
npx tessl i tessl/pypi-crawlee