A comprehensive web scraping and browser automation library for Python with human-like behavior and bot protection bypass
Command-line interface for project scaffolding and development workflow automation. The Crawlee CLI provides tools for quickly creating new projects with best practices and templates.
The main CLI entry point providing access to all Crawlee command-line tools.
# Command line usage:
# crawlee --help # Show help information
# crawlee --version # Display version information
# crawlee create # Create new project

Create new Crawlee projects using predefined templates with best practices and common patterns.
# Create new project interactively
crawlee create
# Create project with specific name
crawlee create my-crawler
# Create project with template
crawlee create my-crawler --template basic
# Create project in specific directory
crawlee create my-crawler --output-dir ./projects

Display version information for the installed Crawlee package.
# Show version
crawlee --version
# Alternative version command
crawlee version

The CLI provides several project templates optimized for different use cases:
Generated projects include:
my-crawler/
├── src/
│ └── main.py # Main crawler implementation
├── requirements.txt # Python dependencies
├── pyproject.toml # Project configuration
├── README.md # Project documentation
├── .gitignore # Git ignore rules
└── storage/ # Default storage directory
├── datasets/ # Scraped data storage
├── key_value_stores/ # Key-value storage
└── request_queues/ # Request queue storage

$ crawlee create
? Project name: my-web-scraper
? Select template:
> basic
playwright
beautifulsoup
adaptive
advanced
? Description: A web scraper for extracting product data
? Author name: John Doe
? Author email: john@example.com
? Use session management? (y/N): y
? Use proxy rotation? (y/N): n
? Include example handlers? (Y/n): y
✅ Project created successfully!
📁 Project location: ./my-web-scraper
📋 Next steps:
1. cd my-web-scraper
2. pip install -r requirements.txt
3. python src/main.py
🚀 Happy crawling!

# Generated src/main.py for basic template
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    """Entry point: configure and run a plain HTTP crawler."""
    crawler = HttpCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        # Called once for every request the crawler processes.
        context.log.info(f'Processing {context.request.url} ...')

        # Build the record to store for this page.
        record = {
            'url': context.request.url,
            'title': 'TODO: Extract title from response',  # Add your extraction logic
            'content_length': len(context.body),
        }

        # Store the record in the default dataset.
        await context.push_data(record)

        # Discover and enqueue links found on the current page.
        await context.enqueue_links()

    # Start crawling from the initial URL list.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())

# Generated src/main.py for playwright template
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Entry point: configure and run a browser-based Playwright crawler."""
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
        # Headless mode (set to False to see browser window)
        headless=True,
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Called once for every request the crawler processes.
        context.log.info(f'Processing {context.request.url} ...')

        # Wait until network activity settles so dynamic content is rendered.
        await context.page.wait_for_load_state('networkidle')

        # Extract data from the page using Playwright selectors.
        page_title = await context.page.title()
        record = {
            'url': context.request.url,
            'title': page_title,
        }

        # Store the record in the default dataset.
        await context.push_data(record)

        # Discover and enqueue links found on the current page.
        await context.enqueue_links()

    # Start crawling from the initial URL list.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())

# Generated src/main.py for beautifulsoup template
import asyncio
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
async def main() -> None:
crawler = BeautifulSoupCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
max_requests_per_crawl=10,
)
# Define the default request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')
# Extract data from the page using BeautifulSoup.
title_element = context.soup.find('title')
title = title_element.get_text().strip() if title_element else 'No title'
data = {
'url': context.request.url,
'title': title,
}
# Push the extracted data to the default dataset.
await context.push_data(data)
# Find and enqueue links from the current page.
await context.enqueue_links()
# Run the crawler with the initial list of URLs.
await crawler.run(['https://crawlee.dev'])
if __name__ == '__main__':
asyncio.run(main())The advanced template includes:
# Advanced template excerpt
import asyncio

from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool
from crawlee.statistics import Statistics


async def main() -> None:
    """Entry point: run an adaptive crawler with sessions, statistics and labeled routes."""
    # Configure session pool
    session_pool = SessionPool(max_pool_size=100)

    # Configure proxy rotation (optional)
    # proxy_config = ProxyConfiguration([
    #     'http://proxy1:8080',
    #     'http://proxy2:8080'
    # ])

    # Configure statistics
    stats = Statistics()

    crawler = AdaptivePlaywrightCrawler(
        max_requests_per_crawl=50,
        session_pool=session_pool,
        # proxy_configuration=proxy_config,
        statistics=stats,
        headless=True,
    )

    # Handler for requests labeled 'product'.
    # NOTE(review): confirm the decorator names `router.route` and
    # `router.error_handler` against the installed Crawlee version's Router API.
    @crawler.router.route('product')
    async def product_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing product: {context.request.url}')
        # Add your product extraction logic here
        record = {
            'url': context.request.url,
            'type': 'product',
            'title': 'TODO: Extract product title',
            'price': 'TODO: Extract product price',
        }
        await context.push_data(record)

    # Fallback handler for every page without a matching label.
    @crawler.router.default_handler
    async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')
        # Enqueue product links so they are routed to product_handler.
        await context.enqueue_links(
            selector='a[href*="/product"]',
            label='product',
        )
        await context.push_data({
            'url': context.request.url,
            'type': 'page',
        })

    # Record failed requests as dataset entries instead of dropping them.
    @crawler.router.error_handler
    async def error_handler(context: AdaptivePlaywrightCrawlingContext, error: Exception) -> None:
        context.log.error(f'Error processing {context.request.url}: {error}')
        await context.push_data({
            'url': context.request.url,
            'error': str(error),
            'type': 'error',
        })

    # Run the crawler and capture the statistics object it returns.
    final_stats = await crawler.run(['https://example-store.com'])
    print(f'Crawl completed. Success rate: {final_stats.success_rate:.1f}%')


if __name__ == '__main__':
    asyncio.run(main())

# Generated pyproject.toml
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project]
name = "my-web-scraper"
version = "0.1.0"
description = "A web scraper for extracting product data"
authors = [
{name = "John Doe", email = "john@example.com"},
]
readme = "README.md"
license = {file = "LICENSE"}
requires-python = ">=3.9"
dependencies = [
"crawlee[all]>=0.6.0",
]
[project.optional-dependencies]
dev = [
"pytest>=7.0",
"pytest-asyncio>=0.21.0",
"black>=23.0",
"ruff>=0.1.0",
]
[tool.black]
line-length = 100
[tool.ruff]
line-length = 100
select = ["E", "F", "I"]
[tool.pytest.ini_options]
asyncio_mode = "auto"

# Generated requirements.txt
crawlee[all]>=0.6.0
# Development dependencies (optional)
# pytest>=7.0
# pytest-asyncio>=0.21.0
# black>=23.0
# ruff>=0.1.0

# My Web Scraper
A web scraper for extracting product data built with Crawlee.
## Installation
1. Install dependencies:
```bash
pip install -r requirements.txt
```

2. Install Playwright browsers (required for the playwright template):

```bash
playwright install
```

## Usage

Run the scraper:
```bash
python src/main.py
```

## Customization

- Edit `src/main.py` to customize scraping logic
- Adjust `max_requests_per_crawl` to control crawl size
- Change the start URLs in the `crawler.run()` call

## Storage

Scraped data is saved to:
- `./storage/datasets/` - Structured data in JSON format
- `./storage/key_value_stores/` - Key-value pairs and files
- `./storage/request_queues/` - Request queue state

## Development

Run tests:
pytest

Format code:
black src/
ruff check src/

## License

This project is licensed under the MIT License.
## Advanced CLI Usage
### Custom Templates
Create custom templates by extending the CLI:
```python
# custom_template.py
from crawlee._cli import create_project_template
def create_custom_template(project_name: str, output_dir: str):
"""Create project with custom template."""
template_data = {
'project_name': project_name,
'crawler_type': 'custom',
'features': ['sessions', 'statistics', 'error_handling']
}
return create_project_template(
template_name='custom',
project_name=project_name,
output_dir=output_dir,
template_data=template_data
)Use CLI functionality programmatically:
import asyncio
from crawlee._cli import CLICommands
async def create_project_programmatically():
"""Create project using CLI programmatically."""
cli = CLICommands()
result = await cli.create_project(
project_name='automated-scraper',
template='playwright',
output_dir='./projects',
options={
'author_name': 'Automation Script',
'author_email': 'automation@example.com',
'include_sessions': True,
'include_examples': True
}
)
if result.success:
print(f"Project created: {result.project_path}")
else:
print(f"Failed to create project: {result.error}")
asyncio.run(create_project_programmatically())The Crawlee CLI provides a quick and efficient way to bootstrap new web scraping projects with industry best practices, allowing developers to focus on extraction logic rather than project setup and configuration.
Install with Tessl CLI
npx tessl i tessl/pypi-crawlee