A comprehensive web scraping and browser automation library for Python that makes crawlers appear human-like and helps them bypass modern bot protections.
Persistent storage solutions for managing crawled data, key-value pairs, and request queues. Crawlee provides three main storage types designed for different data persistence needs in web scraping workflows.
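
All three storages share the same lifecycle: an instance is opened (or created) by name with an asynchronous open() call, as in the usage examples later on this page. A minimal sketch that opens one of each; the storage names are illustrative, not required:

import asyncio
from crawlee.storages import Dataset, KeyValueStore, RequestQueue

async def main():
    # Open (or lazily create) one instance of each storage type by name.
    dataset = await Dataset.open(name='results')        # scraped records
    store = await KeyValueStore.open(name='run-state')  # config, state, binary blobs
    queue = await RequestQueue.open(name='frontier')    # URLs waiting to be crawled

    # Typical division of labor during a crawl:
    await queue.add_request('https://example.com')
    await store.set_value('started', True)
    await dataset.push_data({'url': 'https://example.com'})

asyncio.run(main())
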
Storage for structured data with built-in export capabilities to various formats including JSON, CSV, and Excel. Ideal for storing scraped data records.
class Dataset:
    def __init__(self, id: str | None = None, name: str | None = None): ...

    async def push_data(
        self,
        data: dict | list[dict],
        *,
        limit: int | None = None
    ) -> None:
        """
        Store structured data records.

        Args:
            data: Dictionary or list of dictionaries to store
            limit: Maximum number of records to store (None for unlimited)
        """

    async def get_data(
        self,
        *,
        offset: int = 0,
        limit: int | None = None,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None
    ) -> DatasetData:
        """
        Retrieve stored data with pagination and filtering.

        Args:
            offset: Number of records to skip
            limit: Maximum number of records to return
            clean: Remove empty records and standardize data
            desc: Return records in descending order
            fields: Specific fields to include in results

        Returns:
            DatasetData object containing items and metadata
        """

    async def export_to(
        self,
        path: str,
        *,
        format: Literal["json", "csv", "xlsx"] = "json",
        **kwargs
    ) -> None:
        """
        Export dataset to file in specified format.

        Args:
            path: File path for export
            format: Output format (json, csv, xlsx)
        """

    async def drop(self) -> None:
        """Delete the dataset and all its data."""

    async def get_info(self) -> DatasetInfo:
        """Get dataset metadata and statistics."""

    @property
    def id(self) -> str: ...

    @property
    def name(self) -> str | None: ...

class DatasetData:
    items: list[dict]
    total: int
    offset: int
    count: int
    limit: int | None

class DatasetInfo:
    id: str
    name: str | None
    title: str | None
    created_at: datetime
    modified_at: datetime
    accessed_at: datetime
    item_count: int
    clean_item_count: int
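
The filtering options of get_data compose: a short sketch based on the signature above that fetches the newest non-empty records and projects two fields. The dataset name and the 'url'/'price' field names are illustrative:

import asyncio
from crawlee.storages import Dataset

async def main():
    dataset = await Dataset.open(name='my-results')
    # Newest records first, empty records removed, only two fields returned.
    page = await dataset.get_data(limit=50, desc=True, clean=True, fields=['url', 'price'])
    for item in page.items:
        print(item.get('url'), item.get('price'))
    print(f"{page.count} of {page.total} records returned")

asyncio.run(main())
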
Storage for arbitrary data including binary files, configuration objects, and intermediate processing results. Supports any serializable data type.

class KeyValueStore:
    def __init__(self, id: str | None = None, name: str | None = None): ...

    async def set_value(
        self,
        key: str,
        value: Any,
        *,
        content_type: str | None = None
    ) -> None:
        """
        Store a value under the specified key.

        Args:
            key: Storage key
            value: Data to store (any serializable type)
            content_type: MIME type for the stored content
        """

    async def get_value(self, key: str) -> Any:
        """
        Retrieve value by key.

        Args:
            key: Storage key

        Returns:
            Stored value or None if key doesn't exist
        """

    async def delete(self, key: str) -> None:
        """Delete a key-value pair."""

    async def list_keys(
        self,
        *,
        limit: int | None = None,
        exclusive_start_key: str | None = None
    ) -> KeyValueStoreListPage:
        """
        List stored keys with pagination.

        Args:
            limit: Maximum number of keys to return
            exclusive_start_key: Start listing after this key

        Returns:
            Page of keys with metadata
        """

    async def drop(self) -> None:
        """Delete the store and all its contents."""

    async def get_info(self) -> KeyValueStoreInfo:
        """Get store metadata and statistics."""

    @property
    def id(self) -> str: ...

    @property
    def name(self) -> str | None: ...

class KeyValueStoreListPage:
    items: list[KeyValueStoreKey]
    total: int
    offset: int
    count: int
    limit: int | None

class KeyValueStoreKey:
    key: str
    size: int

class KeyValueStoreInfo:
    id: str
    name: str | None
    title: str | None
    created_at: datetime
    modified_at: datetime
    accessed_at: datetime
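
For stores with many keys, list_keys can be called repeatedly with exclusive_start_key to walk the full key space page by page. A sketch based on the signature above; the store name is illustrative:

import asyncio
from crawlee.storages import KeyValueStore

async def main():
    store = await KeyValueStore.open(name='my-store')
    start_key = None
    while True:
        # Fetch the next page of keys, resuming after the last key already seen.
        page = await store.list_keys(limit=100, exclusive_start_key=start_key)
        for item in page.items:
            print(f"{item.key}: {item.size} bytes")
        if page.count < 100:
            break  # short page: no more keys
        start_key = page.items[-1].key

asyncio.run(main())
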
FIFO queue for managing crawling requests with support for request deduplication, retry logic, and distributed processing.

class RequestQueue:
    def __init__(self, id: str | None = None, name: str | None = None): ...

    async def add_request(
        self,
        request: str | Request,
        *,
        forefront: bool = False
    ) -> RequestQueueOperationInfo:
        """
        Add request to the queue.

        Args:
            request: URL string or Request object
            forefront: Add to front of queue for priority processing

        Returns:
            Information about the add operation
        """

    async def add_requests_batched(
        self,
        requests: list[str | Request],
        *,
        forefront: bool = False
    ) -> BatchAddRequestsResult:
        """
        Add multiple requests efficiently in batch.

        Args:
            requests: List of URL strings or Request objects
            forefront: Add to front of queue for priority processing

        Returns:
            Batch operation results
        """

    async def fetch_next_request(self) -> Request | None:
        """
        Get next request from queue for processing.

        Returns:
            Request object or None if queue is empty
        """

    async def mark_request_as_handled(self, request: Request) -> None:
        """Mark request as successfully processed."""

    async def reclaim_request(self, request: Request) -> None:
        """Return request to queue for retry after failure."""

    async def is_empty(self) -> bool:
        """Check if queue has no pending requests."""

    async def is_finished(self) -> bool:
        """Check if all requests have been processed."""

    async def drop(self) -> None:
        """Delete the queue and all its requests."""

    async def get_info(self) -> RequestQueueInfo:
        """Get queue metadata and statistics."""

    @property
    def id(self) -> str: ...

    @property
    def name(self) -> str | None: ...

class RequestQueueOperationInfo:
    request_id: str
    was_already_present: bool
    was_already_handled: bool

class BatchAddRequestsResult:
    processed_requests: list[ProcessedRequest]
    unprocessed_requests: list[str | Request]

class ProcessedRequest:
    unique_key: str
    was_already_present: bool
    was_already_handled: bool
    request_id: str

class RequestQueueInfo:
    id: str
    name: str | None
    title: str | None
    created_at: datetime
    modified_at: datetime
    accessed_at: datetime
    total_request_count: int
    handled_request_count: int
    pending_request_count: int
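
Deduplication can be observed through the RequestQueueOperationInfo returned by add_request: re-adding a URL that is already in the queue is expected to be reported via was_already_present rather than enqueued twice, while forefront=True prioritizes a request. A brief sketch using the API above; the queue name is illustrative:

import asyncio
from crawlee.storages import RequestQueue

async def main():
    queue = await RequestQueue.open(name='dedup-demo')

    first = await queue.add_request('https://example.com/page')
    second = await queue.add_request('https://example.com/page')  # same URL again
    print(first.was_already_present)   # False: newly enqueued
    print(second.was_already_present)  # expected True: recognized as a duplicate

    # A high-priority request jumps ahead of everything already queued.
    await queue.add_request('https://example.com/urgent', forefront=True)

asyncio.run(main())
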
Abstract base class defining the interface for different storage backend implementations.

class StorageClient:
    async def dataset_exists(self, id: str) -> bool: ...
    async def dataset_list(self) -> list[DatasetInfo]: ...
    async def dataset_get_data(
        self,
        id: str,
        **kwargs
    ) -> DatasetData: ...
    async def dataset_push_data(
        self,
        id: str,
        data: dict | list[dict]
    ) -> None: ...
    async def dataset_delete(self, id: str) -> None: ...
    async def key_value_store_exists(self, id: str) -> bool: ...
    async def key_value_store_list(self) -> list[KeyValueStoreInfo]: ...
    async def key_value_store_get_record(
        self,
        id: str,
        key: str
    ) -> KeyValueStoreRecord | None: ...
    async def key_value_store_set_record(
        self,
        id: str,
        key: str,
        value: Any,
        content_type: str | None = None
    ) -> None: ...
    async def key_value_store_delete_record(
        self,
        id: str,
        key: str
    ) -> None: ...
    async def key_value_store_list_keys(
        self,
        id: str,
        **kwargs
    ) -> KeyValueStoreListPage: ...
    async def key_value_store_delete(self, id: str) -> None: ...
    async def request_queue_exists(self, id: str) -> bool: ...
    async def request_queue_list(self) -> list[RequestQueueInfo]: ...
    async def request_queue_add_request(
        self,
        id: str,
        request: Request,
        forefront: bool = False
    ) -> RequestQueueOperationInfo: ...
    async def request_queue_get_request(
        self,
        id: str
    ) -> Request | None: ...
    async def request_queue_update_request(
        self,
        id: str,
        request: Request
    ) -> None: ...
    async def request_queue_delete(self, id: str) -> None: ...
In-memory storage implementation for development, testing, and temporary data storage.

class MemoryStorageClient(StorageClient):
    def __init__(self): ...
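
Because the high-level storages reach their backend only through the StorageClient interface, swapping or wrapping backends amounts to subclassing it. A schematic sketch of a wrapper that logs dataset writes and delegates to the in-memory client; the import path is an assumption that may differ between Crawlee versions, and wiring the client into the storages is not shown:

# Import path is an assumption; adjust to where StorageClient and
# MemoryStorageClient live in your installation.
from crawlee.storage_clients import MemoryStorageClient, StorageClient

class LoggingStorageClient(StorageClient):
    """Delegating backend that logs dataset writes; other methods would delegate the same way."""

    def __init__(self, inner: StorageClient):
        self._inner = inner

    async def dataset_push_data(self, id: str, data: dict | list[dict]) -> None:
        count = len(data) if isinstance(data, list) else 1
        print(f"dataset {id}: pushing {count} item(s)")
        await self._inner.dataset_push_data(id, data)

    # ...remaining StorageClient methods delegate to self._inner unchanged.

client = LoggingStorageClient(MemoryStorageClient())
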
import asyncio
from crawlee.storages import Dataset

async def main():
    # Create or get existing dataset
    dataset = await Dataset.open(name='my-results')

    # Store single record
    await dataset.push_data({
        'url': 'https://example.com',
        'title': 'Example Page',
        'price': 19.99
    })

    # Store multiple records
    products = [
        {'name': 'Product 1', 'price': 10.00},
        {'name': 'Product 2', 'price': 15.00}
    ]
    await dataset.push_data(products)

    # Retrieve data with pagination
    data = await dataset.get_data(limit=10, offset=0)
    print(f"Found {data.total} total records")

    # Export to file
    await dataset.export_to('results.csv', format='csv')

asyncio.run(main())
import asyncio
from crawlee.storages import KeyValueStore

async def main():
    # Create or get existing store
    store = await KeyValueStore.open(name='my-store')

    # Store configuration
    config = {'timeout': 30, 'retries': 3}
    await store.set_value('config', config)

    # Store binary data
    with open('image.jpg', 'rb') as f:
        await store.set_value('logo', f.read(), content_type='image/jpeg')

    # Retrieve data
    saved_config = await store.get_value('config')
    print(f"Timeout: {saved_config['timeout']}")

    # List all keys
    keys_page = await store.list_keys(limit=100)
    for item in keys_page.items:
        print(f"Key: {item.key}, Size: {item.size} bytes")

asyncio.run(main())
import asyncio
from crawlee.storages import RequestQueue
from crawlee import Request

async def main():
    # Create or get existing queue
    queue = await RequestQueue.open(name='my-queue')

    # Add single request
    await queue.add_request('https://example.com')

    # Add request with custom data
    request = Request.from_url('https://example.com/products', user_data={'category': 'electronics'})
    await queue.add_request(request)

    # Add multiple requests
    urls = ['https://example.com/page1', 'https://example.com/page2']
    await queue.add_requests_batched(urls)

    # Process requests
    while not await queue.is_empty():
        request = await queue.fetch_next_request()
        if request:
            print(f"Processing: {request.url}")
            # Simulate processing
            try:
                # Process request here...
                await queue.mark_request_as_handled(request)
            except Exception:
                # Return to queue for retry
                await queue.reclaim_request(request)

    print("Queue processing complete!")

asyncio.run(main())
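
The storages are designed to work together: a request queue drives the crawl loop while results accumulate in a dataset. A schematic end-to-end sketch using only the calls documented above; the "scraping" step is a placeholder rather than a real HTTP request:

import asyncio
from crawlee.storages import Dataset, RequestQueue

async def main():
    queue = await RequestQueue.open(name='demo-frontier')
    dataset = await Dataset.open(name='demo-results')

    await queue.add_requests_batched([
        'https://example.com/page1',
        'https://example.com/page2',
    ])

    while not await queue.is_empty():
        request = await queue.fetch_next_request()
        if request is None:
            break
        try:
            # Placeholder for real fetching and parsing logic.
            record = {'url': request.url, 'status': 'ok'}
            await dataset.push_data(record)
            await queue.mark_request_as_handled(request)
        except Exception:
            # Failed requests go back to the queue for a later retry.
            await queue.reclaim_request(request)

    await dataset.export_to('demo-results.json', format='json')

asyncio.run(main())
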
Install with Tessl CLI
npx tessl i tessl/pypi-crawlee