CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

nitinjain999/platform-skills

Production-grade platform engineering handbook — Kubernetes, Terraform, Flux CD, GitHub Actions, AWS, and more.

64

Quality

80%

Does it follow best practices?

Impact

No eval scenarios have been run

Security by Snyk

Advisory

Suggest reviewing before use

Overview
Quality
Evals
Security
Files

examples/karpenter/karpenter-validate.sh

#!/usr/bin/env bash
# Validates Karpenter example manifests and checks live cluster health if available.
#
# Offline mode: field-level checks on all YAML files.
# Online mode: kubectl apply --dry-run=server + live NodePool/NodeClaim status.
#
# Usage: bash examples/karpenter/karpenter-validate.sh
# Requires: kubectl (for cluster validation, with Karpenter CRDs installed), jq (for live JSON parsing)

# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# Running count of failed checks; incremented by fail() and used for the exit status.
ERRORS=0

# Absolute directory of this script; manifests are looked up relative to it.
# CDPATH is cleared and cd's stdout discarded because, with a user-set CDPATH,
# `cd` may print the directory it resolved, which would otherwise end up inside
# SCRIPT_DIR alongside the output of pwd. `--` protects paths starting with '-'.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null && pwd)"

# Prints a successful check result. $1 - message text.
pass() {
  printf '  PASS: %s\n' "$1"
}
# Prints a failed check result and bumps the global ERRORS counter.
# $1 - message text.
fail() {
  printf '  FAIL: %s\n' "$1"
  # ':' keeps the return status 0 regardless of the arithmetic result.
  : "$((ERRORS += 1))"
}
# Prints a non-fatal warning; does not touch the ERRORS counter.
# $1 - message text.
warn() {
  printf '  WARN: %s\n' "$1"
}

# Example manifests to validate; each name is joined onto SCRIPT_DIR by the main loop.
declare -a YAML_FILES=(
  "nodepool-default-al2023.yaml"
  "nodepool-spot-flex.yaml"
  "nodepool-critical-ondemand.yaml"
  "nodepool-gpu.yaml"
  "ec2nodeclass-private-cluster.yaml"
)

# ─── Offline field checks ─────────────────────────────────────────────────────

#######################################
# Runs the offline, grep-based field checks against a single manifest and
# reports every result through pass / fail / warn.
# Globals:   ERRORS (incremented indirectly, through fail)
# Arguments: $1 - display name used as the prefix of every message
#            $2 - path of the YAML file to inspect
# Outputs:   one PASS / FAIL / WARN line per check on stdout
#######################################
_check_yaml_fields() {
  local name="$1"
  local file="$2"
  local consolidate_val consolidate_secs remaining

  # Matches an `apiVersion:` key at any indent, with an optional opening quote.
  local api_key="^[[:space:]]*apiVersion:[[:space:]]*[\"']?"

  # API version checks.
  # "karpenter.sh/v1" is a prefix of "karpenter.sh/v1beta1", so the deprecated
  # versions are tested first and the v1 pattern must end at a non-alphanumeric
  # character (or end of line). Otherwise deprecated manifests would pass.
  if grep -qE "${api_key}karpenter\.sh/v1(beta1|alpha5)" "$file"; then
    fail "$name — uses deprecated v1beta1/v1alpha5 API — migrate to karpenter.sh/v1"
  elif grep -qE "${api_key}karpenter\.sh/v1([^[:alnum:]]|\$)" "$file"; then
    pass "$name — uses karpenter.sh/v1 (v1.x API)"
  fi

  if grep -qE "${api_key}karpenter\.k8s\.aws/v1beta1" "$file"; then
    fail "$name — uses deprecated EC2NodeClass v1beta1 — migrate to karpenter.k8s.aws/v1"
  elif grep -qE "${api_key}karpenter\.k8s\.aws/v1([^[:alnum:]]|\$)" "$file"; then
    pass "$name — uses karpenter.k8s.aws/v1 EC2NodeClass (v1.x)"
  fi

  # Old v0.x resources
  if grep -qE "^kind: Provisioner$" "$file"; then
    fail "$name — contains Provisioner (v0.x) — replace with NodePool"
  fi
  if grep -qE "^kind: AWSNodeTemplate$" "$file"; then
    fail "$name — contains AWSNodeTemplate (v0.x) — replace with EC2NodeClass"
  fi

  # NodePool checks
  if grep -qE "^kind: NodePool$" "$file"; then
    if grep -q "limits:" "$file"; then
      pass "$name — NodePool has limits (prevents runaway provisioning)"
    else
      fail "$name — NodePool missing limits — uncapped NodePool can provision unbounded nodes"
    fi

    if grep -q "disruption:" "$file"; then
      pass "$name — NodePool has disruption policy"
    else
      fail "$name — NodePool missing disruption block — defaults to no consolidation"
    fi

    if grep -q "expireAfter:" "$file"; then
      pass "$name — NodePool has expireAfter (node rotation configured)"
    else
      warn "$name — NodePool missing expireAfter — nodes will not be rotated for AMI updates"
    fi

    # startupTaints are only valid when another component removes the taint after readiness.
    # karpenter.sh/not-ready has no built-in remover — flag it as an error if present.
    if grep -qE '^[[:space:]]+startupTaints:' "$file"; then
      # Only fail on a real `key:` entry (not a comment). The optional "- "
      # covers the usual YAML list form, where `key` is the first field of
      # the list item; an optional quote covers quoted values.
      if grep -qE "^[[:space:]]*(-[[:space:]]+)?key:[[:space:]]*[\"']?karpenter\.sh/not-ready" "$file"; then
        fail "$name — startupTaints uses karpenter.sh/not-ready which has no remover — pods will be permanently blocked; use a taint your CNI/device-plugin actually removes (e.g. node.cilium.io/agent-not-ready)"
      else
        pass "$name — NodePool has startupTaints (verify the taint key is removed by your CNI or device plugin)"
      fi
    fi
    # No warning for missing startupTaints — it is optional and correct to omit it

    # minValues is only meaningful for Spot NodePools — skip if On-Demand only
    if grep -q "values: \[on-demand\]" "$file" && ! grep -q "values: \[spot" "$file"; then
      pass "$name — NodePool is On-Demand only; minValues Spot diversity check skipped"
    elif grep -q "minValues:" "$file"; then
      pass "$name — NodePool has minValues on instance requirements (Spot diversity enforced)"
    else
      warn "$name — NodePool missing minValues — Spot diversity not enforced"
    fi
  fi

  # EC2NodeClass checks
  if grep -qE "^kind: EC2NodeClass$" "$file"; then
    if grep -q "httpTokens: required" "$file"; then
      pass "$name — EC2NodeClass enforces IMDSv2 (httpTokens: required)"
    else
      fail "$name — EC2NodeClass missing httpTokens: required — IMDSv2 not enforced"
    fi

    if grep -q "httpPutResponseHopLimit: 1" "$file"; then
      pass "$name — EC2NodeClass sets httpPutResponseHopLimit: 1 (blocks pod IMDS access)"
    else
      fail "$name — EC2NodeClass missing httpPutResponseHopLimit: 1 — pods can access instance metadata"
    fi

    if grep -q "encrypted: true" "$file"; then
      pass "$name — EC2NodeClass has encrypted EBS volumes"
    else
      fail "$name — EC2NodeClass missing encrypted: true on EBS — unencrypted volumes in production"
    fi

    if grep -q "amiSelectorTerms:" "$file"; then
      pass "$name — EC2NodeClass has amiSelectorTerms"
    else
      fail "$name — EC2NodeClass missing amiSelectorTerms — no AMI selection configured"
    fi

    if grep -q "subnetSelectorTerms:" "$file"; then
      pass "$name — EC2NodeClass has subnetSelectorTerms"
    else
      fail "$name — EC2NodeClass missing subnetSelectorTerms"
    fi

    if grep -q "securityGroupSelectorTerms:" "$file"; then
      pass "$name — EC2NodeClass has securityGroupSelectorTerms"
    else
      fail "$name — EC2NodeClass missing securityGroupSelectorTerms"
    fi

    # Flag the placeholder AMI
    if grep -q "ami-PLACEHOLDER" "$file"; then
      warn "$name — contains ami-PLACEHOLDER — replace with a real tested AMI ID before deploying"
    fi
  fi

  # NodePool content checks (consolidateAfter safety — not covered in structural checks above)
  if grep -qE "^kind: NodePool$" "$file"; then
    # consolidateAfter safety check — short values with WhenEmptyOrUnderutilized cause churn
    if grep -q "WhenEmptyOrUnderutilized" "$file"; then
      # First duration found on a consolidateAfter line, e.g. "30s", "5m", "1h30m".
      # `|| true`: a missing value is not an error, it just skips the check.
      consolidate_val=$(grep "consolidateAfter:" "$file" | grep -oE "([0-9]+[hms])+" | head -1 || true)

      # Convert to seconds so that second-based values ("30s") are compared
      # against the same 5-minute floor as minute-based ones.
      consolidate_secs=0
      remaining="$consolidate_val"
      while [[ "$remaining" =~ ^([0-9]+)([hms])(.*)$ ]]; do
        # 10# forces base 10 so values such as "08" are not parsed as octal.
        case "${BASH_REMATCH[2]}" in
          h) consolidate_secs=$((consolidate_secs + 10#${BASH_REMATCH[1]} * 3600)) ;;
          m) consolidate_secs=$((consolidate_secs + 10#${BASH_REMATCH[1]} * 60)) ;;
          s) consolidate_secs=$((consolidate_secs + 10#${BASH_REMATCH[1]})) ;;
        esac
        remaining="${BASH_REMATCH[3]}"
      done

      if [ -n "$consolidate_val" ]; then
        if [ "$consolidate_secs" -lt 300 ]; then
          fail "$name — consolidateAfter: ${consolidate_val} with WhenEmptyOrUnderutilized is too aggressive (resets on pod activity) — use 5m minimum"
        else
          pass "$name — consolidateAfter: ${consolidate_val} (safe for WhenEmptyOrUnderutilized)"
        fi
      fi
    fi
  fi
}

# ─── Online cluster checks ────────────────────────────────────────────────────

#######################################
# Inspects the live cluster: NodePool readiness, NodeClaims stuck before
# launch, the Karpenter controller pods and recent controller error logs.
# Every kubectl / jq call is guarded so that a failing query is reported
# instead of aborting the script under `set -euo pipefail`.
# Globals:   ERRORS (incremented indirectly, through fail)
#            KARPENTER_NAMESPACE (optional; namespace of the controller,
#            defaults to "karpenter")
# Arguments: none
# Outputs:   PASS / FAIL / WARN lines plus kubectl listings on stdout
# Returns:   0; returns early when jq is not installed
#######################################
_check_live_cluster() {
  local ns="${KARPENTER_NAMESPACE:-karpenter}"
  local selector="app.kubernetes.io/name=karpenter"
  local nodepool_json nodeclaim_json
  local not_ready pending_claims karpenter_ready log_errors

  echo ""
  echo "=== Live cluster checks ==="
  echo ""

  if ! command -v jq >/dev/null 2>&1; then
    warn "jq not found — skipping live cluster JSON checks (install jq to enable)"
    return
  fi

  # NodePool health: any NodePool whose Ready condition is not "True".
  if nodepool_json=$(kubectl get nodepool -A -o json 2>/dev/null); then
    not_ready=$(printf '%s' "$nodepool_json" | \
      jq -r '.items[] | select(.status.conditions[]? | select(.type=="Ready" and .status!="True")) | .metadata.name' || true)
    if [ -z "$not_ready" ]; then
      pass "All NodePools are Ready"
    else
      fail "NodePool(s) not Ready: $not_ready"
      # Diagnostic listing only; its failure must not abort the run.
      kubectl get nodepool -A || true
    fi
  else
    # kubectl exits 0 for an empty list, so this branch means the query itself failed.
    warn "Could not list NodePools — skipping NodePool readiness check"
  fi

  # Stuck NodeClaims: Launched condition explicitly "False".
  if nodeclaim_json=$(kubectl get nodeclaim -A -o json 2>/dev/null); then
    pending_claims=$(printf '%s' "$nodeclaim_json" | \
      jq -r '.items[] | select(.status.conditions[]? | select(.type=="Launched" and .status=="False")) | .metadata.name' || true)
    if [ -z "$pending_claims" ]; then
      pass "No NodeClaims stuck in Launched=False"
    else
      fail "NodeClaim(s) stuck (Launched=False): $pending_claims"
      kubectl get nodeclaim -A || true
    fi
  else
    # A failed query must not be reported as "nothing stuck".
    warn "Could not list NodeClaims — skipping stuck NodeClaim check"
  fi

  # Karpenter controller pod.
  # `|| true`: with pipefail a kubectl error would otherwise fail this plain
  # assignment and terminate the script; wc still prints 0 in that case.
  karpenter_ready=$(kubectl get pods -n "$ns" -l "$selector" \
    --field-selector=status.phase=Running -o name 2>/dev/null | wc -l | tr -d ' ' || true)
  if [ "${karpenter_ready:-0}" -gt 0 ]; then
    pass "Karpenter controller pod is Running ($karpenter_ready pod(s))"
  else
    fail "Karpenter controller pod is not Running"
    kubectl get pods -n "$ns" || true
  fi

  # Recent errors in Karpenter logs.
  # grep -c prints the count even when it is 0 (and then exits 1), hence `|| true`.
  log_errors=$(kubectl logs -n "$ns" \
    -l "$selector" \
    --since=5m --tail=200 2>/dev/null | grep -ci '"level":"error"' || true)
  if [ "${log_errors:-0}" -eq 0 ]; then
    pass "No errors in Karpenter logs (last 5 minutes)"
  else
    warn "$log_errors error log line(s) in last 5 minutes — run: kubectl logs -n ${ns} -l ${selector} --since=5m | grep error"
  fi
}

# ─── Main ─────────────────────────────────────────────────────────────────────

#######################################
# Runs `kubectl apply --dry-run=<mode>` once for a manifest and reports the
# result. The output is captured from that single run: re-running kubectl
# inside a pipeline on failure would itself fail under `set -euo pipefail`
# and end the script before the remaining manifests and the summary.
# Arguments: $1 - dry-run mode ("server" or "client")
#            $2 - display name of the manifest
#            $3 - path of the manifest
# Outputs:   PASS / FAIL line; on failure the kubectl output, indented
#######################################
_dry_run_manifest() {
  local mode="$1"
  local name="$2"
  local file="$3"
  local suffix=""
  local output

  # Client-side dry-run does not validate against the cluster's CRDs.
  if [ "$mode" = "client" ]; then
    suffix=" (syntax only)"
  fi

  if output=$(kubectl apply --dry-run="$mode" -f "$file" 2>&1); then
    pass "$name — kubectl dry-run=${mode} passed${suffix}"
  else
    fail "$name — kubectl dry-run=${mode} failed"
    printf '%s\n' "$output" | sed 's/^/    /'
  fi
}

echo ""
echo "=== Karpenter example manifest validation ==="
echo ""

# Detect what kind of validation is possible in this environment.
USE_KUBECTL=false
HAS_CRDS=false

if kubectl cluster-info >/dev/null 2>&1; then
  USE_KUBECTL=true
  if kubectl get crd nodepools.karpenter.sh >/dev/null 2>&1; then
    HAS_CRDS=true
    echo "  Mode: kubectl apply --dry-run=server + live checks (Karpenter CRDs available)"
  else
    echo "  Mode: offline field checks + kubectl syntax (cluster available, Karpenter CRDs not installed)"
    echo "        To enable full validation: helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter ..."
  fi
else
  echo "  Mode: offline field checks (no cluster)"
fi

echo ""

for manifest in "${YAML_FILES[@]}"; do
  filepath="$SCRIPT_DIR/$manifest"

  if [ ! -f "$filepath" ]; then
    fail "$manifest — file not found"
    continue
  fi

  echo "--- $manifest ---"

  # Offline field checks always run
  _check_yaml_fields "$manifest" "$filepath"

  # Server-side dry-run needs the Karpenter CRDs; otherwise fall back to a
  # client-side dry-run when a cluster is reachable at all.
  if [ "$HAS_CRDS" = "true" ]; then
    _dry_run_manifest server "$manifest" "$filepath"
  elif [ "$USE_KUBECTL" = "true" ]; then
    _dry_run_manifest client "$manifest" "$filepath"
  fi

  echo ""
done

# Live cluster checks (only when Karpenter CRDs are present)
if [ "$HAS_CRDS" = "true" ]; then
  _check_live_cluster
fi

# Manual follow-up hints. `kubectl run --serviceaccount` was removed in
# kubectl 1.24, so the service account is set through --overrides instead.
# The quoted delimiter keeps the text literal (no expansion).
cat <<'EOF'

=== IAM reachability check ===

  To verify the Karpenter controller can call EC2 APIs, run from inside the cluster:
  kubectl run -it --rm iamtest --image=amazon/aws-cli \
    --namespace=karpenter \
    --overrides='{"spec":{"serviceAccountName":"karpenter"}}' \
    -- ec2 describe-instance-types --region eu-north-1 --max-items 1

  To verify SQS interruption queue is reachable:
  kubectl run -it --rm sqstest --image=amazon/aws-cli \
    --namespace=karpenter \
    --overrides='{"spec":{"serviceAccountName":"karpenter"}}' \
    -- sqs get-queue-attributes \
       --queue-url <queue-url> --attribute-names All --region eu-north-1

EOF

# Exit status reflects the number of fail() calls made above.
if [ "$ERRORS" -gt 0 ]; then
  echo "FAIL: $ERRORS validation error(s)"
  exit 1
fi

echo "PASS: all Karpenter example checks passed"

examples

BEFORE_AFTER.md

CHANGELOG.md

CODE_OF_CONDUCT.md

COMMANDS.md

CONTRIBUTING.md

EDITOR_INTEGRATIONS.md

GETTING_STARTED.md

HOW_IT_WORKS.md

install.sh

INSTALLATION.md

LAUNCH.md

PROMPTS.md

QUICKSTART.md

README.md

renovate.json

SECURITY.md

SKILL.md

tessl.json

tile.json