tessl/pypi-datadog

The Datadog Python library provides tools for interacting with Datadog's monitoring platform through HTTP API client functionality, DogStatsD metrics client, and command-line tools.

—

Pending

Overview

Eval results

Files

Error Handling

Name: tessl/pypi-datadog
Author: tessl

This section describes the library's exception hierarchy and the error-handling strategies for managing API errors, network issues, authentication problems, and client-side errors, including appropriate retry and recovery mechanisms.

Capabilities

Exception Hierarchy

Well-structured exception classes providing specific error information for different failure scenarios.

class DatadogException(Exception):
    """
    Root of the Datadog exception hierarchy.

    Catching this class handles every Datadog-specific error described
    in this section.

    Attributes:
    - message (str): Human-readable description of the error
    - code (int): HTTP status code, when the error has one
    """

class ApiError(DatadogException):
    """
    Error reported by the Datadog API itself, such as a failed
    authentication or an invalid request.

    Typical causes:
    - The API key or application key is invalid
    - The request is malformed
    - The API rate limit was hit
    - The requested resource does not exist
    - The caller lacks permission
    """

class ClientError(DatadogException):
    """
    Error raised on the client side of the HTTP exchange, covering
    communication and network problems.

    Serves as the parent class for:
    - Connection failures
    - Timeout errors
    - Proxy errors
    - SSL/TLS errors
    """

class HttpTimeout(ClientError):
    # NOTE(review): confirm this base class against the installed
    # datadog.api.exceptions module; upstream may derive HttpTimeout directly
    # from DatadogException, in which case `except ClientError` will not
    # catch it.
    """
    An API call did not complete within the configured timeout.

    Typical causes:
    - The request took longer than the specified timeout
    - Network latency delayed the request
    - The Datadog API is under high load
    """

class HttpBackoff(ClientError):
    # NOTE(review): confirm this base class against the installed
    # datadog.api.exceptions module; upstream may derive HttpBackoff directly
    # from DatadogException, in which case `except ClientError` will not
    # catch it.
    """
    The API is temporarily unavailable and the client is backing off.

    Typical causes:
    - The API answered with a 5xx server error
    - Rate limiting triggered a backoff
    - The service is temporarily disrupted
    """

class HTTPError(ClientError):
    # NOTE(review): confirm the base class and the attributes listed below
    # against the installed datadog.api.exceptions module; upstream may derive
    # HTTPError directly from DatadogException and may not expose `response`.
    """
    The API answered with a non-2xx HTTP status code.

    Attributes:
    - status_code (int): HTTP status code of the response
    - response (object): Raw HTTP response object

    Status codes that raise this error:
    - 400 Bad Request
    - 401 Unauthorized
    - 403 Forbidden
    - 404 Not Found
    - 429 Too Many Requests
    - 5xx Server Errors
    """

class ProxyError(ClientError):
    # NOTE(review): confirm this base class against the installed
    # datadog.api.exceptions module; upstream may derive ProxyError directly
    # from DatadogException, in which case `except ClientError` will not
    # catch it.
    """
    The connection through the configured proxy failed.

    Typical causes:
    - The proxy server cannot be reached
    - Authentication against the proxy failed
    - The proxy configuration is invalid
    """

class ApiNotInitialized(ApiError):
    # NOTE(review): confirm this base class against the installed
    # datadog.api.exceptions module; upstream may derive ApiNotInitialized
    # directly from DatadogException, in which case `except ApiError` will
    # not catch it.
    """
    An API call was attempted before the library was initialized.

    Typical causes:
    - No API key has been configured
    - No application key has been configured
    - initialize() was not called before using the API
    """

Error Suppression Control

Configure error handling behavior through the mute parameter and global settings.

# Global error suppression setting (configured via initialize())
# api._mute (bool): When True, suppresses ApiError and ClientError exceptions

# Error suppression affects:
# - API method calls (api.Event.create, api.Monitor.get, etc.)
# - HTTP client errors (timeouts, connection failures)
# - Authentication and authorization errors

# Errors are still logged, but not raised, when mute=True

StatsD Error Resilience

StatsD operations are designed to be fire-and-forget with built-in error resilience.

# StatsD error handling characteristics:
# - UDP transport failures are silently ignored
# - Socket errors don't interrupt application flow
# - Network issues don't block metric submission
# - Malformed metrics are dropped without errors

# StatsD errors that may occur:
# - Socket creation failures
# - DNS resolution errors for statsd_host
# - Permission errors for Unix Domain Sockets
# - Network unreachable errors

Usage Examples

Basic Error Handling

from datadog import initialize, api
from datadog.api.exceptions import ApiError, ClientError, ApiNotInitialized

# Turn muting off so that failures are raised as exceptions and can be
# handled by the except clauses below
initialize(
    api_key="your-api-key",
    app_key="your-app-key",
    mute=False  # Raise errors instead of suppressing them
)

try:
    # Any failure of this call is routed to one of the handlers below
    monitor = api.Monitor.create(
        type="metric alert",
        query="avg(last_5m):avg:system.cpu.user{*} > 80",
        name="High CPU usage"
    )
    print(f"Monitor created with ID: {monitor['id']}")
    
except ApiNotInitialized:
    # Listed before ApiError so that it is matched first
    print("ERROR: Datadog not properly initialized")
    
except ApiError as e:
    print(f"API Error: {e}")
    # Handle authentication, permission, or API-specific errors
    
except ClientError as e:
    print(f"Client Error: {e}")
    # Handle network, timeout, or connection errors
    
except Exception as e:
    # Catch-all for anything the handlers above did not match
    print(f"Unexpected error: {e}")

Specific Exception Handling

from datadog import api
from datadog.api.exceptions import HttpTimeout, HTTPError, ApiError

def create_monitor_with_retry(monitor_config, max_retries=3):
    """Create a monitor, retrying on transient failures.

    Args:
        monitor_config (dict): Keyword arguments forwarded to
            ``api.Monitor.create``.
        max_retries (int): Maximum number of attempts (default 3).

    Returns:
        The value returned by ``api.Monitor.create`` on the first
        successful attempt, or ``None`` when ``max_retries`` is less
        than 1 (no attempt is made in that case).

    Raises:
        HttpTimeout: The final attempt timed out.
        HTTPError: The status code is not retryable, or a retryable status
            (429 or >= 500) was returned on the final attempt.
        ApiError: Re-raised immediately; these errors are never retried.
    """
    # Imported locally because the snippet's import lines do not bring
    # ``time`` into scope; without it the first retry raises NameError.
    import time

    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1

        try:
            return api.Monitor.create(**monitor_config)

        except HttpTimeout:
            if is_last_attempt:
                print("Failed after maximum timeout retries")
                raise
            print(f"Timeout on attempt {attempt + 1}, retrying...")
            time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, 4s, ...

        except HTTPError as e:
            if not is_last_attempt:
                if e.status_code == 429:  # Rate limiting
                    print("Rate limited, waiting before retry...")
                    time.sleep(60)  # Wait 1 minute for rate limit reset
                    continue
                if e.status_code >= 500:  # Server errors
                    print(f"Server error {e.status_code}, retrying...")
                    time.sleep(5)
                    continue
            # Either the status is not retryable or the attempts are used up
            print(f"HTTP Error {e.status_code}: {e}")
            raise

        except ApiError as e:
            # Don't retry authentication or permission errors
            print(f"API Error (not retryable): {e}")
            raise

# Usage: keyword arguments that are forwarded to api.Monitor.create
monitor_config = {
    "type": "metric alert",
    "query": "avg(last_5m):avg:system.cpu.user{*} > 80",
    "name": "High CPU usage"
}

try:
    monitor = create_monitor_with_retry(monitor_config)
    print(f"Monitor created: {monitor['id']}")
except Exception as e:
    # Reached when the helper re-raises after giving up, or on any other error
    print(f"Failed to create monitor: {e}")

Error Handling with Raw Response Access

from datadog import initialize, api
from datadog.api.exceptions import HTTPError

# Ask for the raw HTTP response alongside the decoded data, and keep
# exceptions unmuted so HTTPError can be caught below
initialize(
    api_key="your-api-key",
    app_key="your-app-key",
    return_raw_response=True,
    mute=False
)

try:
    result = api.Event.create(
        title="Test Event",
        text="Testing error handling"
    )
    
    # With return_raw_response=True, result is indexed below as:
    # - result[0]: decoded response data
    # - result[1]: raw HTTP response object
    print(f"Event created: {result[0]['event']['id']}")
    print(f"Status code: {result[1].status_code}")
    print(f"Response headers: {result[1].headers}")
    
except HTTPError as e:
    # NOTE(review): this relies on HTTPError exposing `status_code` and
    # `response` -- confirm against the installed library version
    print(f"HTTP Status: {e.status_code}")
    print(f"Response body: {e.response.text}")
    print(f"Request headers: {e.response.request.headers}")
    
    # Print a hint for the common client-error status codes; any other
    # status code prints nothing further
    if e.status_code == 400:
        print("Bad request - check your parameters")
    elif e.status_code == 401:
        print("Unauthorized - check your API key")
    elif e.status_code == 403:
        print("Forbidden - check your permissions")
    elif e.status_code == 404:
        print("Resource not found")

Graceful Degradation Pattern

from datadog import api, statsd
from datadog.api.exceptions import DatadogException
import logging

logger = logging.getLogger(__name__)

def submit_metrics_with_fallback(metrics_data):
    """Send a metric through the HTTP API, falling back to StatsD.

    Args:
        metrics_data (dict): Keyword arguments for ``api.Metric.send``. The
            fallback path reads its ``'metric'`` and ``'points'`` keys and
            the optional ``'tags'`` key.

    Returns:
        bool: True when either submission path succeeded, False when both
        failed.
    """
    try:
        # Preferred path: persistent submission through the HTTP API
        api.Metric.send(**metrics_data)
        logger.info("Metrics submitted via API")
        return True
    except DatadogException as e:
        logger.warning(f"API submission failed: {e}")

    # Only reached when the API path failed: report the most recent point
    # as a gauge through StatsD instead
    try:
        send_gauge = statsd.gauge
        metric_name = metrics_data['metric']
        newest_value = metrics_data['points'][-1][1]  # value of the last point
        send_gauge(metric_name, newest_value, tags=metrics_data.get('tags', []))
        logger.info("Metrics submitted via StatsD fallback")
        return True
    except Exception as e:
        logger.error(f"StatsD fallback failed: {e}")
        return False

def create_monitor_with_fallback(monitor_config):
    """Create a monitor, retrying once with a reduced configuration.

    Args:
        monitor_config (dict): Keyword arguments for ``api.Monitor.create``.
            The fallback reads its ``'type'``, ``'query'`` and ``'name'``
            keys.

    Returns:
        The result of whichever ``api.Monitor.create`` call succeeded.

    Raises:
        DatadogException: The simplified creation failed as well.
    """
    try:
        # First attempt: the configuration exactly as supplied
        return api.Monitor.create(**monitor_config)
    except DatadogException as e:
        logger.warning(f"Full monitor creation failed: {e}")

        # Second attempt: keep only the essential fields and tag the name
        simplified_config = {key: monitor_config[key] for key in ('type', 'query')}
        simplified_config['name'] = f"[Simplified] {monitor_config['name']}"

        try:
            return api.Monitor.create(**simplified_config)
        except DatadogException as e:
            logger.error(f"Simplified monitor creation failed: {e}")
            raise

Circuit Breaker Pattern

from datadog import api
from datadog.api.exceptions import DatadogException
import time
from threading import Lock

class DatadogCircuitBreaker:
    """Three-state circuit breaker guarding Datadog API calls.

    States:
    - CLOSED: calls pass through; consecutive failures are counted.
    - OPEN: calls are rejected with DatadogException until
      ``recovery_timeout`` seconds have passed since the last failure.
    - HALF_OPEN: calls pass through again; a success closes the circuit.

    Only DatadogException counts as a failure. State changes are made
    while holding ``self.lock``.
    """

    def __init__(self, failure_threshold=5, recovery_timeout=60):
        """Start in the CLOSED state with no recorded failures.

        Args:
            failure_threshold (int): Failure count at which the circuit opens.
            recovery_timeout (int): Seconds the circuit stays OPEN before a
                trial call is allowed.
        """
        self.lock = Lock()
        self.state = 'CLOSED'  # One of CLOSED, OPEN, HALF_OPEN
        self.failure_count = 0
        self.last_failure_time = None
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout

    def call(self, func, *args, **kwargs):
        """Run ``func(*args, **kwargs)`` under circuit breaker protection.

        Returns:
            Whatever ``func`` returns.

        Raises:
            DatadogException: The circuit is OPEN, or ``func`` raised it.
        """
        self._admit()

        try:
            outcome = func(*args, **kwargs)
        except DatadogException:
            self._record_failure()
            raise

        self._record_success()
        return outcome

    def _admit(self):
        """Reject the call while OPEN, or move to HALF_OPEN after the timeout."""
        with self.lock:
            if self.state != 'OPEN':
                return

            elapsed = time.time() - self.last_failure_time
            if elapsed > self.recovery_timeout:
                self.state = 'HALF_OPEN'
                return

            raise DatadogException("Circuit breaker is OPEN")

    def _record_success(self):
        """Clear the failure count and close a HALF_OPEN circuit."""
        with self.lock:
            self.failure_count = 0
            if self.state == 'HALF_OPEN':
                self.state = 'CLOSED'

    def _record_failure(self):
        """Count a failure and open the circuit once the threshold is reached."""
        with self.lock:
            self.failure_count += 1
            self.last_failure_time = time.time()

            if self.failure_count >= self.failure_threshold:
                self.state = 'OPEN'

# Usage: one breaker instance shared by every protected call
circuit_breaker = DatadogCircuitBreaker()

def safe_api_call(func, *args, **kwargs):
    """Call ``func`` through the shared circuit breaker.

    Returns:
        The result of ``func``, or ``None`` when a DatadogException was
        raised (the failure is printed instead of propagated).
    """
    outcome = None
    try:
        outcome = circuit_breaker.call(func, *args, **kwargs)
    except DatadogException as e:
        print(f"API call failed (circuit breaker): {e}")
    return outcome

# Protected API calls: each result is None when the call raised
# DatadogException, because safe_api_call prints the error and returns None
event = safe_api_call(
    api.Event.create,
    title="Test Event",
    text="Circuit breaker test"
)

monitors = safe_api_call(api.Monitor.get_all)

Comprehensive Error Logging

from datadog import initialize, api
from datadog.api.exceptions import *
import logging
import traceback

# Configure logging: INFO level on the root logger, plus a module-level logger
# used by the helpers below
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize with error suppression disabled and raw responses enabled
initialize(
    api_key="your-api-key", 
    app_key="your-app-key",
    mute=False,
    return_raw_response=True
)

def log_datadog_error(operation, exception, **context):
    """Log a failed Datadog operation with as much detail as is available.

    Args:
        operation (str): Name of the operation that failed.
        exception (Exception): The exception that was raised.
        **context: Extra key/value pairs recorded under ``'context'``.

    Logs one ERROR record with the collected details and one DEBUG record
    with the exception's traceback. This function is meant to be safe to
    call from error paths, so missing attributes on the exception are
    reported as ``'N/A'`` instead of raising.
    """
    error_details = {
        'operation': operation,
        'exception_type': type(exception).__name__,
        'error_message': str(exception),
        'context': context,
    }

    if isinstance(exception, HTTPError):
        # The exception may carry no response, and a response may carry no
        # request; walk the chain with getattr so the logger never fails
        # while reporting another failure.
        response = getattr(exception, 'response', None)
        request = getattr(response, 'request', None)
        error_details.update({
            'status_code': getattr(exception, 'status_code', 'N/A'),
            'response_body': getattr(response, 'text', 'N/A'),
            'request_url': getattr(request, 'url', 'N/A'),
            'request_method': getattr(request, 'method', 'N/A'),
        })

    if isinstance(exception, (HttpTimeout, HttpBackoff)):
        error_details['retry_recommended'] = True

    logger.error(f"Datadog operation failed: {error_details}")

    # Format the traceback of the exception that was passed in rather than
    # the one "currently being handled", so the output is also meaningful
    # when this function is called outside an except block.
    formatted_traceback = ''.join(
        traceback.format_exception(type(exception), exception, exception.__traceback__)
    )
    logger.debug(f"Full traceback: {formatted_traceback}")

def robust_datadog_operation(operation_func, operation_name, **kwargs):
    """Run a Datadog operation, logging any failure before re-raising it.

    Args:
        operation_func (callable): Called as ``operation_func(**kwargs)``.
        operation_name (str): Label used in log messages.
        **kwargs: Arguments for the operation; also logged as context.

    Returns:
        The result of ``operation_func``.

    Raises:
        Exception: Whatever the operation raised, after it has been logged.
    """
    try:
        result = operation_func(**kwargs)
        logger.info(f"Datadog operation succeeded: {operation_name}")
        return result

    except Exception as e:
        # Every failure is logged in full first, then classified for any
        # additional, more specific message.
        log_datadog_error(operation_name, e, **kwargs)

        if isinstance(e, (ApiNotInitialized, HttpTimeout)):
            # Configuration problem or timeout: nothing to add here.
            # (Retry logic for timeouts could be implemented at this point.)
            pass

        elif isinstance(e, HTTPError):
            if e.status_code == 401:
                logger.critical("Authentication failed - check API keys")
            elif e.status_code == 403:
                logger.critical("Authorization failed - check permissions")
            elif e.status_code == 429:
                logger.warning("Rate limited - implement backoff")
            elif e.status_code >= 500:
                logger.warning("Server error - may be temporary")

        elif isinstance(e, (ApiError, ClientError)):
            pass

        else:
            logger.error(f"Unexpected error in Datadog operation: {e}")

        raise

# Usage examples: robust_datadog_operation re-raises after logging, so each
# call is wrapped to keep the script running
try:
    monitor = robust_datadog_operation(
        api.Monitor.create,
        "create_monitor",
        type="metric alert",
        query="avg(last_5m):avg:system.cpu.user{*} > 80",
        name="High CPU usage"
    )
except Exception:
    print("Monitor creation failed - check logs")

try:
    # start and end are forwarded unchanged to api.Event.query
    events = robust_datadog_operation(
        api.Event.query,
        "query_events", 
        start=1234567890,
        end=1234567899
    )
except Exception:
    print("Event query failed - check logs")

StatsD Error Resilience Patterns

from datadog import statsd
import logging
import socket

logger = logging.getLogger(__name__)

def resilient_statsd_submit(metric_name, value, **kwargs):
    """Submit a gauge through StatsD without letting failures propagate.

    Args:
        metric_name (str): Name of the metric.
        value: Gauge value.
        **kwargs: Passed through to ``statsd.gauge``.

    Returns:
        bool: True when the call completed, False when it raised.
    """
    submitted = False

    try:
        statsd.gauge(metric_name, value, **kwargs)
        submitted = True
    except socket.error as e:
        # A StatsD failure must not block the application
        logger.warning(f"StatsD socket error: {e}")
    except Exception as e:
        logger.warning(f"Unexpected StatsD error: {e}")

    return submitted

def batch_statsd_with_recovery(metrics_batch):
    """Submit a batch of StatsD metrics, recovering from per-metric failures.

    Args:
        metrics_batch (list[dict]): Each entry needs ``'type'`` (one of
            ``'gauge'``, ``'increment'``, ``'timing'``), ``'name'`` and
            ``'value'``; ``'tags'`` is optional.

    Returns:
        int: Number of metrics that were actually submitted. Entries with
        an unsupported type or that raised during submission are logged
        and not counted.
    """
    # Each supported type is also the name of the statsd method that sends it
    supported_types = ('gauge', 'increment', 'timing')
    success_count = 0

    for metric in metrics_batch:
        try:
            metric_type = metric['type']

            if metric_type not in supported_types:
                # Nothing is sent for an unknown type, so it must not be
                # reported as a success.
                logger.warning(
                    f"Skipping metric {metric.get('name')!r}: "
                    f"unsupported type {metric_type!r}"
                )
                continue

            submit = getattr(statsd, metric_type)
            submit(metric['name'], metric['value'], tags=metric.get('tags'))
            success_count += 1

        except Exception as e:
            # The failure may be a missing 'name' key (or a non-dict entry),
            # so the name is looked up defensively to keep this handler from
            # raising and aborting the rest of the batch.
            name = metric.get('name', '<unknown>') if isinstance(metric, dict) else '<unknown>'
            logger.warning(f"Failed to submit metric {name}: {e}")
            # Continue with remaining metrics

    logger.info(f"Submitted {success_count}/{len(metrics_batch)} metrics successfully")
    return success_count

# Usage: one entry per metric type handled by batch_statsd_with_recovery
metrics = [
    {'type': 'gauge', 'name': 'system.cpu.usage', 'value': 75.0, 'tags': ['host:web01']},
    {'type': 'increment', 'name': 'web.requests', 'value': 1, 'tags': ['endpoint:/api']},
    {'type': 'timing', 'name': 'db.query.time', 'value': 150, 'tags': ['table:users']}
]

# The return value (the success count) is not used here
batch_statsd_with_recovery(metrics)

Error Handling Best Practices

Appropriate Error Suppression

import os  # Needed for os.environ below

# The two initialize() calls below are alternative configurations shown side
# by side for comparison; use the one that matches your environment.

# Production: Suppress errors to prevent application crashes.
# Keys are read from the environment; a missing variable raises KeyError.
initialize(
    api_key=os.environ['DATADOG_API_KEY'],
    app_key=os.environ['DATADOG_APP_KEY'],
    mute=True  # Suppress exceptions in production
)

# Development: Enable errors for debugging
initialize(
    api_key="dev-api-key",
    app_key="dev-app-key",
    mute=False  # Show all errors during development
)

Monitoring and Alerting Resilience

# Critical monitoring should not fail application
def submit_critical_metric(metric_name, value):
    """Submit a gauge on a best-effort basis.

    Any ordinary error raised during submission is deliberately ignored so
    that metrics can never break the calling code path.

    Args:
        metric_name (str): Name of the metric.
        value: Gauge value.
    """
    try:
        statsd.gauge(metric_name, value)
    except Exception:
        # Never let metrics submission crash critical application flow.
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt and SystemExit still propagate and the process
        # can be interrupted or shut down normally.
        pass

# Non-critical operations can have explicit error handling
def create_dashboard_with_handling(dashboard_config):
    """Create a dashboard, returning None instead of raising on failure.

    Args:
        dashboard_config (dict): Keyword arguments for
            ``api.Dashboard.create``.

    Returns:
        The created dashboard, or ``None`` when a DatadogException was
        raised (the failure is logged).
    """
    dashboard = None
    try:
        dashboard = api.Dashboard.create(**dashboard_config)
    except DatadogException as e:
        logger.error(f"Dashboard creation failed: {e}")
    return dashboard  # None signals graceful degradation

Retry Strategy Guidelines

# Exception types treated as transient and therefore worth retrying:
# request timeouts and backoff signals
RETRYABLE_ERRORS = (HttpTimeout, HttpBackoff)

# Exception types treated as permanent and never retried: a client that was
# not initialized will keep failing until it is configured
NON_RETRYABLE_ERRORS = (ApiNotInitialized,)

# Conditional retry on HTTP errors
def should_retry_http_error(http_error):
    """Tell whether an HTTP error's status code marks a transient failure.

    Args:
        http_error: Object exposing a ``status_code`` attribute.

    Returns:
        bool: True for 429 (rate limited) and for 500, 502, 503 and 504;
        False for every other status code.
    """
    retryable_statuses = (429, 500, 502, 503, 504)
    status = http_error.status_code
    return status in retryable_statuses

Install with Tessl CLI