Apify API client for Python providing access to web scraping and automation platform resources
Registry status:
- Quality: Pending — "Does it follow best practices?" has not yet been assessed.
- Impact: Pending — no eval scenarios have been run.
Access to Apify's data storage systems including datasets for structured data and key-value stores for arbitrary data storage. These storage systems provide persistent, scalable data management for Actor runs and general use.
Dataset management for structured data storage with support for multiple formats and streaming access.
class DatasetClient:
def get(self) -> dict | None:
"""Get dataset information."""
def update(self, *, name: str | None = None, general_access: StorageGeneralAccess | None = None) -> dict:
"""Update dataset configuration.
Args:
name: Dataset name
general_access: Storage access level (from apify_shared.consts)
"""
def delete(self) -> None:
"""Delete dataset."""
def list_items(self, **kwargs) -> ListPage:
"""List dataset items with filtering and pagination.
Args:
offset (int, optional): Starting offset
limit (int, optional): Maximum items to return
desc (bool, optional): Sort in descending order
fields (list[str], optional): Fields to include
omit (list[str], optional): Fields to exclude
format (str, optional): Response format ('json', 'csv', 'xlsx', etc.)
clean (bool, optional): Clean items before return
**kwargs: Additional filtering parameters
"""
def iterate_items(self, **kwargs) -> Iterator[dict]:
"""Iterate over all dataset items.
Args:
offset (int, optional): Starting offset
limit (int, optional): Maximum items to iterate
**kwargs: Additional parameters passed to list_items
"""
def download_items(self, **kwargs) -> bytes:
"""Download items as bytes (deprecated - use get_items_as_bytes)."""
def get_items_as_bytes(self, **kwargs) -> bytes:
"""Get items as raw bytes.
Args:
format (str, optional): Export format
**kwargs: Additional export parameters
"""
def stream_items(self, **kwargs) -> Iterator[Response]:
"""Stream items as context manager.
Args:
format (str, optional): Stream format
**kwargs: Additional streaming parameters
"""
def push_items(self, items: list | dict) -> None:
"""Push items to dataset.
Args:
items: Items to push (single item or list of items)
"""
def get_statistics(self) -> dict | None:
"""Get dataset statistics including item count and size."""
def create_items_public_url(self, **kwargs) -> str:
"""Generate public URL for dataset items.
Args:
format (str, optional): Export format
**kwargs: Additional URL parameters
"""
class DatasetClientAsync:
    """Async version of DatasetClient with identical methods."""
class DatasetCollectionClient:
def list(self, **kwargs) -> ListPage[dict]:
"""List datasets.
Args:
unnamed (bool, optional): Include unnamed datasets
limit (int, optional): Maximum number of items
offset (int, optional): Offset for pagination
desc (bool, optional): Sort in descending order
"""
def get_or_create(self, *, name: str | None = None, schema: dict | None = None) -> dict:
"""Get or create dataset.
Args:
name: Dataset name
schema: Dataset schema definition
"""
class DatasetCollectionClientAsync:
    """Async version of DatasetCollectionClient with identical methods."""


# Key-value store management for arbitrary data storage with support for
# binary data and streaming.
class KeyValueStoreClient:
def get(self) -> dict | None:
"""Get key-value store information."""
def update(self, *, name: str | None = None, general_access: StorageGeneralAccess | None = None) -> dict:
"""Update store configuration.
Args:
name: Store name
general_access: Storage access level (from apify_shared.consts)
"""
def delete(self) -> None:
"""Delete store."""
def list_keys(self, **kwargs) -> dict:
"""List keys in the store.
Args:
limit (int, optional): Maximum keys to return
exclusive_start_key (str, optional): Key to start listing from
"""
def get_record(self, key: str) -> dict | None:
"""Get record by key.
Args:
key: Record key
"""
def record_exists(self, key: str) -> bool:
"""Check if record exists.
Args:
key: Record key
"""
def get_record_as_bytes(self, key: str) -> bytes | None:
"""Get record as raw bytes.
Args:
key: Record key
"""
def stream_record(self, key: str) -> Iterator[dict | None]:
"""Stream record as context manager.
Args:
key: Record key
"""
def set_record(self, key: str, value: Any, content_type: str | None = None) -> None:
"""Set record value.
Args:
key: Record key
value: Record value (dict, str, bytes, etc.)
content_type: MIME content type
"""
def delete_record(self, key: str) -> None:
"""Delete record.
Args:
key: Record key
"""
def create_keys_public_url(self, **kwargs) -> str:
"""Generate public URL for accessing keys."""
class KeyValueStoreClientAsync:
    """Async version of KeyValueStoreClient with identical methods."""
class KeyValueStoreCollectionClient:
def list(self, **kwargs) -> ListPage[dict]:
"""List key-value stores.
Args:
unnamed (bool, optional): Include unnamed stores
limit (int, optional): Maximum number of items
offset (int, optional): Offset for pagination
desc (bool, optional): Sort in descending order
"""
def get_or_create(self, *, name: str | None = None, schema: dict | None = None) -> dict:
"""Get or create key-value store.
Args:
name: Store name
schema: Store schema definition
"""
class KeyValueStoreCollectionClientAsync:
    """Async version of KeyValueStoreCollectionClient with identical methods."""


# --- Usage examples ---

from apify_client import ApifyClient
# Example: push, list, iterate, and export dataset items.
client = ApifyClient('your-api-token')

# Create or get dataset
dataset = client.datasets().get_or_create(name='web-scraping-results')
dataset_client = client.dataset(dataset['id'])

# Push data to dataset
data = [
    {'url': 'https://example.com', 'title': 'Example Page', 'price': 29.99},
    {'url': 'https://example.org', 'title': 'Another Page', 'price': 39.99},
]
dataset_client.push_items(data)

# List items with pagination
items = dataset_client.list_items(limit=100, offset=0, format='json')
print(f"Retrieved {items.count} items")

# Iterate over all items
for item in dataset_client.iterate_items():
    print(f"Title: {item['title']}, Price: {item['price']}")

# Export dataset as CSV
csv_data = dataset_client.get_items_as_bytes(format='csv')
with open('results.csv', 'wb') as f:
    f.write(csv_data)

# Get dataset statistics
stats = dataset_client.get_statistics()
print(f"Dataset contains {stats['itemCount']} items")

# Create or get key-value store
# Example: store and retrieve JSON config, binary data, and streamed records.
store = client.key_value_stores().get_or_create(name='app-config')
store_client = client.key_value_store(store['id'])

# Store configuration data
config = {
    'api_endpoint': 'https://api.example.com',
    'timeout': 30,
    'retry_count': 3,
}
store_client.set_record('config', config, content_type='application/json')

# Store binary data
with open('screenshot.png', 'rb') as f:
    image_data = f.read()
store_client.set_record('screenshot', image_data, content_type='image/png')

# Retrieve data
stored_config = store_client.get_record('config')
print(f"API endpoint: {stored_config['api_endpoint']}")

# Check if record exists
if store_client.record_exists('screenshot'):
    image_bytes = store_client.get_record_as_bytes('screenshot')
    print(f"Screenshot size: {len(image_bytes)} bytes")

# List all keys
keys = store_client.list_keys()
print(f"Store contains keys: {keys['keys']}")

# Stream large records
with store_client.stream_record('large-file') as stream:
    for chunk in stream:
        process_chunk(chunk)

# Process dataset items in batches
# Example: read a dataset in pages, transform each page, and push the
# results to a second dataset.
from datetime import datetime  # needed by process_batch below

dataset_client = client.dataset('dataset-id')

def process_batch(items):
    """Return copies of *items* stamped with a timestamp and converted price."""
    processed = []
    for item in items:
        processed.append({
            **item,
            'processed_at': datetime.now().isoformat(),
            'price_usd': item['price'] * 1.2,  # Convert currency
        })
    return processed

# Iterate with batch processing
batch_size = 1000
offset = 0
# Create the output dataset once, before the loop — calling get_or_create on
# every iteration would issue a redundant API request per batch.
processed_dataset = client.datasets().get_or_create(name='processed-results')
processed_client = client.dataset(processed_dataset['id'])
while True:
    batch = dataset_client.list_items(limit=batch_size, offset=offset)
    if not batch.items:
        break
    processed_items = process_batch(batch.items)
    # Store processed results
    processed_client.push_items(processed_items)
    offset += batch_size
    print(f"Processed {offset} items")

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-apify-client