Comprehensive Kubernetes debugging and troubleshooting toolkit. Use this skill when diagnosing Kubernetes cluster issues, debugging failing pods, investigating network connectivity problems, analyzing resource usage, troubleshooting deployments, or performing cluster health checks.
Overall score: 93%
Does it follow best practices?
Validation for skill structure
Symptoms:
CrashLoopBackOff
Common Causes:
Debugging Steps:
# Check pod events
kubectl describe pod <pod-name> -n <namespace>
# View current logs
kubectl logs <pod-name> -n <namespace>
# View previous container logs (from crashed container)
kubectl logs <pod-name> -n <namespace> --previous
# Check resource limits
kubectl get pod <pod-name> -n <namespace> -o yaml | grep -A 5 resources
# Check liveness/readiness probes
kubectl get pod <pod-name> -n <namespace> -o yaml | grep -A 10 livenessProbe
Solutions:
Symptoms:
ImagePullBackOff or ErrImagePull
Common Causes:
Debugging Steps:
# Check exact error message
kubectl describe pod <pod-name> -n <namespace>
# Verify image name and tag
kubectl get pod <pod-name> -n <namespace> -o yaml | grep image:
# Check image pull secrets
kubectl get pod <pod-name> -n <namespace> -o yaml | grep imagePullSecrets -A 2
# List secrets in namespace
kubectl get secrets -n <namespace>
# Test image pull manually on node
docker pull <image-name>
Solutions:
docker pull <image>
kubectl create secret docker-registry <secret-name> --docker-server=<registry> --docker-username=<user> --docker-password=<pass>
latest in production)
Symptoms:
Pending state
Common Causes:
Debugging Steps:
# Check scheduling events
kubectl describe pod <pod-name> -n <namespace>
# Check node resources
kubectl top nodes
kubectl describe nodes
# Check PVC status
kubectl get pvc -n <namespace>
# Check node selectors and taints
kubectl get pod <pod-name> -n <namespace> -o yaml | grep -A 5 nodeSelector
kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints
Solutions:
kubectl get resourcequota -n <namespace>
Symptoms:
OOMKilled
Debugging Steps:
# Check pod status and last state
kubectl get pod <pod-name> -n <namespace> -o yaml | grep -A 10 lastState
# Check memory limits
kubectl get pod <pod-name> -n <namespace> -o yaml | grep -A 5 resources
# Check actual memory usage
kubectl top pod <pod-name> -n <namespace> --containers
Solutions:
Symptoms:
Common Causes:
Debugging Steps:
# Check service configuration
kubectl get svc <service-name> -n <namespace> -o yaml
# Check endpoints
kubectl get endpoints <service-name> -n <namespace>
# Check pod labels
kubectl get pods -n <namespace> --show-labels
# Test from another pod
kubectl run tmp-shell --rm -i --tty --image nicolaka/netshoot -- /bin/bash
# Inside pod: curl <service-name>.<namespace>.svc.cluster.local
# Check network policies
kubectl get networkpolicies -n <namespace>
Solutions:
Symptoms:
nslookup or dig commands fail
Common Causes:
Debugging Steps:
# Check CoreDNS pods
kubectl get pods -n kube-system -l k8s-app=kube-dns
# Check CoreDNS logs
kubectl logs -n kube-system -l k8s-app=kube-dns
# Test DNS from pod
kubectl exec <pod-name> -n <namespace> -- nslookup kubernetes.default
# Check pod DNS config
kubectl exec <pod-name> -n <namespace> -- cat /etc/resolv.conf
# Check DNS service
kubectl get svc -n kube-system kube-dns
Solutions:
kubectl rollout restart deployment/coredns -n kube-system
Symptoms:
Pending state
Debugging Steps:
# Check PVC status
kubectl describe pvc <pvc-name> -n <namespace>
# List available PVs
kubectl get pv
# Check storage class
kubectl get storageclass
Solutions:
Symptoms:
Debugging Steps:
# List ConfigMaps
kubectl get configmaps -n <namespace>
# List Secrets
kubectl get secrets -n <namespace>
# Check pod configuration
kubectl get pod <pod-name> -n <namespace> -o yaml | grep -A 10 env
Solutions:
Debugging Steps:
# Check resource usage
kubectl top nodes
kubectl top pods -n <namespace>
# Check resource requests/limits
kubectl describe pod <pod-name> -n <namespace> | grep -A 5 Limits
# Get detailed metrics
kubectl get --raw /apis/metrics.k8s.io/v1beta1/namespaces/<namespace>/pods/<pod-name>
Solutions:
Symptoms:
Debugging Steps:
# Check rollout status
kubectl rollout status deployment/<deployment-name> -n <namespace>
# Check rollout history
kubectl rollout history deployment/<deployment-name> -n <namespace>
# Check replica sets
kubectl get rs -n <namespace>
# Check events
kubectl get events -n <namespace> --sort-by='.lastTimestamp'
Solutions:
kubectl rollout undo deployment/<deployment-name> -n <namespace>
Install with Tessl CLI
npx tessl i pantheon-ai/k8s-debug@0.1.0