The Datadog Python library provides tools for interacting with Datadog's monitoring platform through HTTP API client functionality, DogStatsD metrics client, and command-line tools.
—
Comprehensive exception hierarchy and error handling strategies for managing API errors, network issues, authentication problems, and client-side errors with appropriate retry and recovery mechanisms.
Well-structured exception classes providing specific error information for different failure scenarios.
class DatadogException(Exception):
    """
    Base exception class for all Datadog-related errors.

    Catch this type to handle any Datadog-specific failure in one place.

    Attributes (as documented by this spec; the stub itself defines none):
    - message (str): Error description
    - code (int): HTTP status code (when applicable)
    """
class ApiError(DatadogException):
    """
    API-specific errors including authentication failures and invalid requests.

    Raised when:
    - Invalid API key or application key
    - Malformed API requests
    - API rate limiting
    - Resource not found
    - Permission denied
    """
class ClientError(DatadogException):
    """
    Client-side errors related to HTTP communication and network issues.

    Base class for:
    - Connection failures
    - Timeout errors
    - Proxy errors
    - SSL/TLS errors
    """
# NOTE(review): confirm the base class against datadog.api.exceptions in the
# installed datadog version before relying on `except ClientError` to catch this.
class HttpTimeout(ClientError):
    """
    Request timeout errors when API calls exceed configured timeout.

    Raised when:
    - API requests take longer than specified timeout
    - Network latency causes delays
    - Datadog API is experiencing high load
    """
# NOTE(review): confirm the base class against datadog.api.exceptions in the
# installed datadog version before relying on `except ClientError` to catch this.
class HttpBackoff(ClientError):
    """
    Backoff errors indicating temporary API unavailability.

    Raised when:
    - API returns 5xx server errors
    - Rate limiting triggers backoff
    - Temporary service disruptions
    """
# NOTE(review): confirm the base class and the `status_code` / `response`
# attributes against datadog.api.exceptions in the installed datadog version.
class HTTPError(ClientError):
    """
    HTTP response errors for non-2xx status codes.

    Attributes:
    - status_code (int): HTTP status code
    - response (object): Raw HTTP response object

    Raised for:
    - 400 Bad Request
    - 401 Unauthorized
    - 403 Forbidden
    - 404 Not Found
    - 429 Too Many Requests
    - 5xx Server Errors
    """
# NOTE(review): confirm the base class against datadog.api.exceptions in the
# installed datadog version before relying on `except ClientError` to catch this.
class ProxyError(ClientError):
    """
    Proxy connection and configuration errors.

    Raised when:
    - Proxy server is unreachable
    - Proxy authentication fails
    - Invalid proxy configuration
    """
# NOTE(review): confirm the base class against datadog.api.exceptions in the
# installed datadog version before relying on `except ApiError` to catch this.
class ApiNotInitialized(ApiError):
    """
    Error when attempting API calls without proper initialization.

    Raised when:
    - API key not configured
    - Application key not configured
    - initialize() not called before API usage
    """


# Configure error handling behavior through the mute parameter and global settings.
# Global error suppression setting (configured via initialize())
# api._mute (bool): When True, suppresses ApiError and ClientError exceptions
# Error suppression affects:
# - API method calls (api.Event.create, api.Monitor.get, etc.)
# - HTTP client errors (timeouts, connection failures)
# - Authentication and authorization errors
# Errors are still logged but not raised when mute=True
StatsD operations are designed to be fire-and-forget with built-in error resilience.
# StatsD error handling characteristics:
# - UDP transport failures are silently ignored
# - Socket errors don't interrupt application flow
# - Network issues don't block metric submission
# - Malformed metrics are dropped without errors
# StatsD errors that may occur:
# - Socket creation failures
# - DNS resolution errors for statsd_host
# - Permission errors for Unix Domain Sockets
# - Network unreachable errors
from datadog import initialize, api
from datadog.api.exceptions import ApiError, ClientError, ApiNotInitialized
# Configure with error suppression disabled for explicit handling
initialize(
    api_key="your-api-key",
    app_key="your-app-key",
    mute=False,  # Enable explicit error handling
)

# Handlers are ordered most-specific first so that a subclass is not shadowed
# by an earlier clause for its parent class.
try:
    # API call that might fail
    monitor = api.Monitor.create(
        type="metric alert",
        query="avg(last_5m):avg:system.cpu.user{*} > 80",
        name="High CPU usage",
    )
    print(f"Monitor created with ID: {monitor['id']}")
except ApiNotInitialized:
    print("ERROR: Datadog not properly initialized")
except ApiError as e:
    print(f"API Error: {e}")
    # Handle authentication, permission, or API-specific errors
except ClientError as e:
    print(f"Client Error: {e}")
    # Handle network, timeout, or connection errors
except Exception as e:
    print(f"Unexpected error: {e}")

from datadog import api
from datadog.api.exceptions import HttpTimeout, HTTPError, ApiError
def create_monitor_with_retry(monitor_config, max_retries=3):
    """Create a monitor, retrying on timeouts, rate limiting and server errors.

    Args:
        monitor_config: Mapping of keyword arguments forwarded to
            ``api.Monitor.create``.
        max_retries: Total number of attempts (must be at least 1).

    Returns:
        Whatever ``api.Monitor.create`` returns on the first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is less than 1 (previously the function
            silently returned ``None`` without ever calling the API).
        HttpTimeout: If the final attempt times out.
        HTTPError: For non-retryable statuses, or when retries are exhausted.
        ApiError: Immediately; these are never retried.
    """
    # Local import: the example uses time.sleep but never imported `time`.
    import time

    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    for attempt in range(max_retries):
        has_retries_left = attempt < max_retries - 1
        try:
            return api.Monitor.create(**monitor_config)
        except HttpTimeout:
            if has_retries_left:
                print(f"Timeout on attempt {attempt + 1}, retrying...")
                time.sleep(2 ** attempt)  # Exponential backoff
                continue
            print("Failed after maximum timeout retries")
            raise
        except HTTPError as e:
            if e.status_code == 429:  # Rate limiting
                if has_retries_left:
                    print("Rate limited, waiting before retry...")
                    time.sleep(60)  # Wait 1 minute for rate limit reset
                    continue
            elif e.status_code >= 500:  # Server errors
                if has_retries_left:
                    print(f"Server error {e.status_code}, retrying...")
                    time.sleep(5)
                    continue
            # Non-retryable status, or retries exhausted: report and propagate.
            print(f"HTTP Error {e.status_code}: {e}")
            raise
        except ApiError as e:
            # Don't retry authentication or permission errors
            print(f"API Error (not retryable): {e}")
            raise
# Usage
monitor_config = {
    "type": "metric alert",
    "query": "avg(last_5m):avg:system.cpu.user{*} > 80",
    "name": "High CPU usage",
}

# The helper re-raises once retries are exhausted, so catch at the call site.
try:
    monitor = create_monitor_with_retry(monitor_config)
    print(f"Monitor created: {monitor['id']}")
except Exception as e:
    print(f"Failed to create monitor: {e}")

from datadog import initialize, api
from datadog.api.exceptions import HTTPError
# Configure to include raw HTTP responses
initialize(
    api_key="your-api-key",
    app_key="your-app-key",
    return_raw_response=True,
    mute=False,
)

try:
    result = api.Event.create(
        title="Test Event",
        text="Testing error handling",
    )
    # With return_raw_response=True, result includes:
    # - Decoded response data   (result[0])
    # - Raw HTTP response object (result[1])
    print(f"Event created: {result[0]['event']['id']}")
    print(f"Status code: {result[1].status_code}")
    print(f"Response headers: {result[1].headers}")
except HTTPError as e:
    print(f"HTTP Status: {e.status_code}")
    print(f"Response body: {e.response.text}")
    print(f"Request headers: {e.response.request.headers}")
    # Handle specific HTTP status codes
    if e.status_code == 400:
        print("Bad request - check your parameters")
    elif e.status_code == 401:
        print("Unauthorized - check your API key")
    elif e.status_code == 403:
        print("Forbidden - check your permissions")
    elif e.status_code == 404:
        print("Resource not found")

from datadog import api, statsd
from datadog.api.exceptions import DatadogException
import logging
logger = logging.getLogger(__name__)
def submit_metrics_with_fallback(metrics_data):
    """Submit metrics via the HTTP API, falling back to StatsD on failure.

    Args:
        metrics_data: Mapping forwarded as keyword arguments to
            ``api.Metric.send``. The fallback reads its ``'metric'`` and
            ``'points'`` keys and the optional ``'tags'`` key.

    Returns:
        True if either submission path succeeded, False if both failed.
    """
    # Primary: Try API submission for persistent metrics
    try:
        api.Metric.send(**metrics_data)
        logger.info("Metrics submitted via API")
        return True
    except DatadogException as e:
        logger.warning(f"API submission failed: {e}")
        # Fallback: Use StatsD for real-time metrics
        try:
            statsd.gauge(
                metrics_data['metric'],
                # assumes 'points' is a sequence of (timestamp, value) pairs — TODO confirm
                metrics_data['points'][-1][1],  # Latest value
                tags=metrics_data.get('tags', []),
            )
            logger.info("Metrics submitted via StatsD fallback")
            return True
        except Exception as e:
            logger.error(f"StatsD fallback failed: {e}")
            return False
def create_monitor_with_fallback(monitor_config):
    """Create a monitor, retrying once with a simplified configuration.

    Args:
        monitor_config: Mapping forwarded to ``api.Monitor.create``; must
            contain the ``'type'``, ``'query'`` and ``'name'`` keys, which are
            the only ones carried over to the simplified attempt.

    Returns:
        The result of whichever ``api.Monitor.create`` call succeeded.

    Raises:
        DatadogException: If the simplified attempt fails as well.
    """
    try:
        # Try creating monitor with full configuration
        return api.Monitor.create(**monitor_config)
    except DatadogException as e:
        logger.warning(f"Full monitor creation failed: {e}")
        # Fallback: Create simplified monitor
        simplified_config = {
            'type': monitor_config['type'],
            'query': monitor_config['query'],
            'name': f"[Simplified] {monitor_config['name']}",
        }
        try:
            return api.Monitor.create(**simplified_config)
        except DatadogException as e:
            logger.error(f"Simplified monitor creation failed: {e}")
            raise


from datadog import api
from datadog.api.exceptions import DatadogException
import time
from threading import Lock
class DatadogCircuitBreaker:
    """Circuit breaker for Datadog API calls.

    States:
    - CLOSED: calls pass through; failures are counted.
    - OPEN: calls are rejected until ``recovery_timeout`` seconds have passed
      since the last recorded failure.
    - HALF_OPEN: calls pass through again; a success closes the circuit.
    """

    def __init__(self, failure_threshold=5, recovery_timeout=60):
        """Store the thresholds and start in the CLOSED state.

        Args:
            failure_threshold: Failure count at which the circuit opens.
            recovery_timeout: Seconds to wait in OPEN before allowing a trial call.
        """
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.failure_count = 0
        self.last_failure_time = None
        self.state = 'CLOSED'  # CLOSED, OPEN, HALF_OPEN
        self.lock = Lock()

    def call(self, func, *args, **kwargs):
        """Execute function with circuit breaker protection.

        Args:
            func: Callable to invoke.
            *args, **kwargs: Forwarded to ``func`` unchanged.

        Returns:
            Whatever ``func`` returns.

        Raises:
            DatadogException: If the circuit is OPEN and the recovery timeout
                has not elapsed, or re-raised from ``func``.
        """
        with self.lock:
            if self.state == 'OPEN':
                if time.time() - self.last_failure_time > self.recovery_timeout:
                    self.state = 'HALF_OPEN'
                else:
                    raise DatadogException("Circuit breaker is OPEN")

        # The lock is released while func runs so that slow calls do not
        # serialize every caller.
        try:
            result = func(*args, **kwargs)
        except DatadogException:
            with self.lock:
                self.failure_count += 1
                self.last_failure_time = time.time()
                if self.failure_count >= self.failure_threshold:
                    self.state = 'OPEN'
            raise
        else:
            with self.lock:
                # Success resets failure count
                self.failure_count = 0
                if self.state == 'HALF_OPEN':
                    self.state = 'CLOSED'
            return result
# Usage
circuit_breaker = DatadogCircuitBreaker()


def safe_api_call(func, *args, **kwargs):
    """Make API call with circuit breaker protection.

    Routes the call through the module-level ``circuit_breaker``.

    Returns:
        The result of ``func``, or None if a DatadogException was raised
        (including the breaker rejecting the call while OPEN).
    """
    try:
        return circuit_breaker.call(func, *args, **kwargs)
    except DatadogException as e:
        print(f"API call failed (circuit breaker): {e}")
        return None
# Protected API calls (each returns None instead of raising on failure)
event = safe_api_call(
    api.Event.create,
    title="Test Event",
    text="Circuit breaker test",
)
monitors = safe_api_call(api.Monitor.get_all)

from datadog import initialize, api
from datadog.api.exceptions import *
import logging
import traceback
# Logging setup: root logger at INFO, plus a module-level logger for this file
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Datadog setup: mute=False so failures raise, and raw HTTP responses are returned
initialize(
    mute=False,
    return_raw_response=True,
    api_key="your-api-key",
    app_key="your-app-key",
)
def log_datadog_error(operation, exception, **context):
    """Log a structured description of a failed Datadog operation.

    Args:
        operation: Name of the operation that failed.
        exception: The exception instance that was raised.
        **context: Extra key/value pairs recorded under ``'context'``.

    This function is meant to be called from error paths, so it reads every
    optional attribute defensively and must not raise itself.
    """
    error_details = {
        'operation': operation,
        'exception_type': type(exception).__name__,
        'error_message': str(exception),
        'context': context,
    }

    if isinstance(exception, HTTPError):
        # Resolve each hop with getattr: a missing or None response/request
        # would otherwise raise AttributeError while logging.
        response = getattr(exception, 'response', None)
        request = getattr(response, 'request', None)
        error_details.update({
            'status_code': getattr(exception, 'status_code', 'N/A'),
            'response_body': getattr(response, 'text', 'N/A'),
            'request_url': getattr(request, 'url', 'N/A'),
            'request_method': getattr(request, 'method', 'N/A'),
        })

    if isinstance(exception, (HttpTimeout, HttpBackoff)):
        error_details['retry_recommended'] = True

    logger.error(f"Datadog operation failed: {error_details}")

    # Log full traceback for debugging. Format the traceback attached to the
    # exception itself so the output does not depend on being called from
    # inside an active except block.
    formatted_traceback = "".join(
        traceback.format_exception(type(exception), exception, exception.__traceback__)
    )
    logger.debug(f"Full traceback: {formatted_traceback}")
def robust_datadog_operation(operation_func, operation_name, **kwargs):
    """Execute Datadog operation with comprehensive error handling.

    Args:
        operation_func: Callable invoked as ``operation_func(**kwargs)``.
        operation_name: Label used in log messages.
        **kwargs: Forwarded to ``operation_func`` and recorded as log context.

    Returns:
        The result of ``operation_func``.

    Raises:
        Exception: Every failure is logged via ``log_datadog_error`` and then
            re-raised unchanged; nothing is swallowed here.
    """
    try:
        result = operation_func(**kwargs)
        logger.info(f"Datadog operation succeeded: {operation_name}")
        return result
    except ApiNotInitialized as e:
        log_datadog_error(operation_name, e, **kwargs)
        raise  # Re-raise as this is a configuration issue
    except HttpTimeout as e:
        log_datadog_error(operation_name, e, **kwargs)
        # Could implement retry logic here
        raise
    except HTTPError as e:
        log_datadog_error(operation_name, e, **kwargs)
        # Add a severity-appropriate hint for the most common statuses.
        if e.status_code == 401:
            logger.critical("Authentication failed - check API keys")
        elif e.status_code == 403:
            logger.critical("Authorization failed - check permissions")
        elif e.status_code == 429:
            logger.warning("Rate limited - implement backoff")
        elif e.status_code >= 500:
            logger.warning("Server error - may be temporary")
        raise
    except ApiError as e:
        log_datadog_error(operation_name, e, **kwargs)
        raise
    except ClientError as e:
        log_datadog_error(operation_name, e, **kwargs)
        raise
    except Exception as e:
        log_datadog_error(operation_name, e, **kwargs)
        logger.error(f"Unexpected error in Datadog operation: {e}")
        raise
# Usage examples
# robust_datadog_operation logs the details and re-raises, so the call site
# only needs to decide how to react.
try:
    monitor = robust_datadog_operation(
        api.Monitor.create,
        "create_monitor",
        type="metric alert",
        query="avg(last_5m):avg:system.cpu.user{*} > 80",
        name="High CPU usage",
    )
except Exception:
    print("Monitor creation failed - check logs")

try:
    events = robust_datadog_operation(
        api.Event.query,
        "query_events",
        start=1234567890,
        end=1234567899,
    )
except Exception:
    print("Event query failed - check logs")

from datadog import statsd
import logging
import socket
logger = logging.getLogger(__name__)
def resilient_statsd_submit(metric_name, value, **kwargs):
    """Submit StatsD metric with error resilience.

    Args:
        metric_name: Name of the gauge to submit.
        value: Gauge value.
        **kwargs: Forwarded to ``statsd.gauge`` unchanged.

    Returns:
        True if ``statsd.gauge`` returned without raising, False otherwise.
        This function never raises an ``Exception`` subclass.
    """
    try:
        statsd.gauge(metric_name, value, **kwargs)
        return True
    except OSError as e:  # socket.error is an alias of OSError on Python 3
        logger.warning(f"StatsD socket error: {e}")
        # StatsD errors shouldn't block application
        return False
    except Exception as e:
        logger.warning(f"Unexpected StatsD error: {e}")
        return False
def batch_statsd_with_recovery(metrics_batch):
    """Submit batch of StatsD metrics with individual error recovery.

    Args:
        metrics_batch: Sized iterable of dicts with ``'type'`` (one of
            ``'gauge'``, ``'increment'``, ``'timing'``), ``'name'``,
            ``'value'`` and optional ``'tags'`` keys.

    Returns:
        Number of metrics that were actually submitted. Metrics with an
        unsupported type, or whose submission raised, are logged and skipped;
        they are not counted.
    """
    # Supported metric types mapped to the statsd method that submits them.
    senders = {
        'gauge': statsd.gauge,
        'increment': statsd.increment,
        'timing': statsd.timing,
    }

    success_count = 0
    for metric in metrics_batch:
        try:
            send = senders.get(metric['type'])
            if send is None:
                # Previously an unknown type fell through every branch and was
                # still counted as a success.
                logger.warning(
                    f"Skipping metric {metric.get('name')}: "
                    f"unsupported type {metric['type']!r}"
                )
                continue
            send(metric['name'], metric['value'], tags=metric.get('tags'))
            success_count += 1
        except Exception as e:
            # Use .get() so a missing 'name' key cannot raise inside the
            # handler and abort the rest of the batch.
            logger.warning(f"Failed to submit metric {metric.get('name')}: {e}")
            # Continue with remaining metrics

    logger.info(f"Submitted {success_count}/{len(metrics_batch)} metrics successfully")
    return success_count
# Usage: one entry per supported metric type
metrics = [
    {
        'type': 'gauge',
        'name': 'system.cpu.usage',
        'value': 75.0,
        'tags': ['host:web01'],
    },
    {
        'type': 'increment',
        'name': 'web.requests',
        'value': 1,
        'tags': ['endpoint:/api'],
    },
    {
        'type': 'timing',
        'name': 'db.query.time',
        'value': 150,
        'tags': ['table:users'],
    },
]
batch_statsd_with_recovery(metrics)

# Production: Suppress errors to prevent application crashes
import os  # needed for os.environ below; the original snippet never imported it

initialize(
    api_key=os.environ['DATADOG_API_KEY'],
    app_key=os.environ['DATADOG_APP_KEY'],
    mute=True,  # Suppress exceptions in production
)

# Development: Enable errors for debugging
# (alternative to the production call above; use one or the other)
initialize(
    api_key="dev-api-key",
    app_key="dev-app-key",
    mute=False,  # Show all errors during development
)

# Critical monitoring should not fail application
def submit_critical_metric(metric_name, value):
    """Submit a gauge on a best-effort basis; never raise an ordinary error.

    Args:
        metric_name: Name of the gauge to submit.
        value: Gauge value.

    Returns:
        None, whether or not the submission succeeded.
    """
    try:
        statsd.gauge(metric_name, value)
    except Exception:
        # Never let metrics submission crash critical application flow.
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt and SystemExit still propagate.
        pass
# Non-critical operations can have explicit error handling
def create_dashboard_with_handling(dashboard_config):
    """Create a dashboard, returning None instead of raising on failure.

    Args:
        dashboard_config: Mapping forwarded as keyword arguments to
            ``api.Dashboard.create``.

    Returns:
        The result of ``api.Dashboard.create``, or None if a
        DatadogException was raised (the error is logged).
    """
    try:
        return api.Dashboard.create(**dashboard_config)
    except DatadogException as e:
        logger.error(f"Dashboard creation failed: {e}")
        return None  # Graceful degradation


# Retry on transient errors
RETRYABLE_ERRORS = (
    HttpTimeout,
    HttpBackoff,
)
# Permanent failures: retrying cannot help, so fail fast
NON_RETRYABLE_ERRORS = (
    ApiNotInitialized,
)
# Conditional retry on HTTP errors
def should_retry_http_error(http_error):
    """Return True if the HTTP error's status code is worth retrying.

    Args:
        http_error: Object exposing an integer ``status_code`` attribute.

    Returns:
        True for 429 (rate limited) and 500/502/503/504 (server-side
        failures), False for every other status code.
    """
    return http_error.status_code in (429, 500, 502, 503, 504)


# Install with Tessl CLI:
#   npx tessl i tessl/pypi-datadog