Production-grade platform engineering handbook — Kubernetes, Terraform, Flux CD, GitHub Actions, AWS, and more.
67
84%
Does it follow best practices?
Impact
—
No eval scenarios have been run
Passed
No known issues
Example repository structure for managing multiple environments with Terraform.
multi-env-structure/
├── modules/ # Reusable modules
│ ├── networking/ # VPC, subnets, routing
│ └── eks-cluster/ # EKS cluster abstraction
├── live/ # Environment configurations
│ ├── staging/
│ │ ├── backend.tf # Remote state configuration
│ │ ├── main.tf # Module compositions
│ │ ├── variables.tf # Environment variables
│ │ ├── outputs.tf # Environment outputs
│ │ └── terraform.tfvars # Environment-specific values
│ └── production/
│ ├── backend.tf
│ ├── main.tf
│ ├── variables.tf
│ ├── outputs.tf
│ └── terraform.tfvars
└── README.md

Each environment has its own state file:
# live/production/backend.tf
terraform {
backend "s3" {
bucket = "my-terraform-state"
key = "production/terraform.tfstate"
region = "us-east-1"
encrypt = true
dynamodb_table = "terraform-locks"
}
}

Modules are environment-agnostic:
# live/production/main.tf
module "networking" {
source = "../../modules/networking"
environment = "production"
vpc_cidr = var.vpc_cidr
azs = var.availability_zones
}
module "eks_cluster" {
source = "../../modules/eks-cluster"
cluster_name = "production-cluster"
cluster_version = "1.29"
vpc_id = module.networking.vpc_id
subnet_ids = module.networking.private_subnet_ids
}

Use .tfvars files for environment differences:
# live/staging/terraform.tfvars
vpc_cidr = "10.1.0.0/16"
availability_zones = ["us-east-1a", "us-east-1b"]
cluster_node_count = 2
instance_types = ["t3.medium"]

# live/production/terraform.tfvars
vpc_cidr = "10.0.0.0/16"
availability_zones = ["us-east-1a", "us-east-1b", "us-east-1c"]
cluster_node_count = 5
instance_types = ["t3.large", "t3a.large"]

cd live/staging
terraform init
terraform plan -var-file=terraform.tfvars
terraform apply -var-file=terraform.tfvars

cd ../production
terraform init
terraform plan -var-file=terraform.tfvars

Configure in each environment:
# live/production/main.tf
terraform {
required_version = ">= 1.5.0"
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
}
}
provider "aws" {
region = var.aws_region
default_tags {
tags = {
Environment = "production"
ManagedBy = "terraform"
Owner = "platform-team"
}
}
}

Reference other state files when needed:
data "terraform_remote_state" "networking" {
backend = "s3"
config = {
bucket = "my-terraform-state"
key = "production/networking/terraform.tfstate"
region = "us-east-1"
}
}

Add validation to variables:
variable "environment" {
description = "Environment name"
type = string
validation {
condition = contains(["staging", "production"], var.environment)
error_message = "Environment must be staging or production"
}
}

Use naming conventions:
locals {
name_prefix = "${var.environment}-${var.project_name}"
common_tags = {
Environment = var.environment
Project = var.project_name
ManagedBy = "terraform"
}
}
resource "aws_s3_bucket" "example" {
bucket = "${local.name_prefix}-bucket"
tags = local.common_tags
}

Use DynamoDB for state locking:
terraform {
backend "s3" {
bucket = "my-terraform-state"
key = "production/terraform.tfstate"
region = "us-east-1"
encrypt = true
dynamodb_table = "terraform-locks"
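# Keep the backend's validation checks enabled (false is the default for each).
# Note: these flags do not protect against state deletion; enable S3 bucket versioning for that.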
skip_region_validation = false
skip_credentials_validation = false
skip_metadata_api_check = false
}
}

Moving resources between environments:
# Export from staging
cd live/staging
terraform state pull > staging-state.json
# Import to production
cd ../production
terraform import aws_s3_bucket.example my-bucket-name

jobs:
terraform:
runs-on: ubuntu-latest
strategy:
matrix:
environment: [staging, production]
steps:
- uses: actions/checkout@v4
- name: Terraform Plan
working-directory: live/${{ matrix.environment }}
run: |
terraform init
terraform plan -var-file=terraform.tfvars

Never commit sensitive values:
# ❌ Don't do this
variable "database_password" {
default = "hardcoded-secret"
}
# ✅ Do this
data "aws_secretsmanager_secret_version" "db_password" {
secret_id = "production/database/password"
}

Terraform execution role should have minimal permissions:
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"ec2:Describe*",
"eks:Describe*",
"eks:List*"
],
"Resource": "*",
"Condition": {
"StringEquals": {
"aws:RequestedRegion": "us-east-1"
}
}
}
]
}

# View lock info
aws dynamodb get-item \
--table-name terraform-locks \
--key '{"LockID":{"S":"my-terraform-state/production/terraform.tfstate"}}'
# Force unlock (only if certain no other process is running)
terraform force-unlock <lock-id>

# Refresh and show changes
terraform plan -refresh-only

.claude-plugin
.github
commands
docs
examples
agent-self-improve
argocd
awesome-docs
aws
cloudfront
functions
lambda-edge
functions
azure
compliance
conventional-commits
datadog
llm-observability
demo
documentation
dora
dynatrace
fluxcd
github-actions
composite-actions
configure-cloud
db-migrate
docker-build-push
k8s-deploy
notify-slack
pr-comment
release-tag
security-scan
setup-env
setup-terraform
terraform-plan
helm
web-service
templates
kubernetes
kyverno
mcp
observability
openshift
pr-review
ownership
runtime-security
supply-chain
terraform
references
scripts
skills
platform-skills
tests