Skip to main content

Best Practices Guide

Backup Strategy

The 3-2-1 Rule

Follow the industry-standard 3-2-1 backup rule adapted for infrastructure:

  • 3 copies of your state files
  • 2 different storage media types
  • 1 off-site backup location

Implementation Example

# One command group per copy required by the 3-2-1 rule.

# Primary: S3 with versioning
terraback config set storage.type s3
terraback config set storage.s3.versioning true

# Secondary: Local encrypted backup
terraback backup export --all --encrypted /mnt/backup/

# Tertiary: Git repository
# NOTE(review): Terraform state can contain secrets — confirm this remote is
# private and that what gets pushed to it is encrypted.
terraback git init --remote https://github.com/company/terraform-states

Backup Frequency Guidelines

| Environment | Recommended Frequency | Retention Period | Justification |
|-------------|-----------------------|------------------|---------------|
| Production  | Every hour            | 90 days          | Critical infrastructure needs frequent snapshots |
| Staging     | Every 6 hours         | 30 days          | Balance between protection and storage |
| Development | Daily                 | 7 days           | Lower change frequency, less critical |
| CI/CD       | On every apply        | 30 days          | Track all automated changes |

Automated Backup Configuration

# Production: hourly backups, kept for 90 days
terraback schedule create prod-hourly \
  --interval 1h \
  --retention 90d \
  --tags "production,automated" \
  --workspace production

# Staging: one backup every 6 hours, kept for 30 days
terraback schedule create staging-6h \
  --interval 6h \
  --retention 30d \
  --tags "staging,automated" \
  --workspace staging

# Development: one backup every 24 hours, kept for 7 days
terraback schedule create dev-daily \
  --interval 24h \
  --retention 7d \
  --tags "development,automated" \
  --workspace development

Pre and Post Operation Backups

# .terraback/config.yml
# Automatic backups around state-changing Terraform operations, plus
# shell hooks that run before and after `terraform apply`.
backup:
  automatic:
    on_apply: true  # Backup before terraform apply
    on_destroy: true  # Backup before terraform destroy
    on_import: true  # Backup before terraform import
    on_state_rm: true  # Backup before state manipulation

  # NOTE(review): nesting was reconstructed; confirm `hooks` belongs under
  # `backup` rather than at the top level.
  hooks:
    pre_apply: "terraback checkpoint create --name pre-apply-$(date +%Y%m%d-%H%M%S)"
    post_apply: "terraback backup create --name post-apply-$(date +%Y%m%d-%H%M%S)"

Security Best Practices

Encryption Standards

Always Enable Encryption

# Encryption at rest
storage:
  encryption:
    enabled: true
    algorithm: "AES256-GCM"
    key_management: "aws-kms"  # Use cloud KMS when possible

# Encryption in transit
network:
  tls:
    enabled: true
    min_version: "1.2"  # Quoted so it stays a string, not a float
    verify_certificates: true

Key Management

# Use cloud provider KMS
terraback config set storage.s3.kms_key_id "arn:aws:kms:us-east-1:123456789012:key/..."

# Rotate encryption keys regularly
terraback security rotate-keys --schedule monthly

# Never store keys in configuration files.
# Assign first, then export: `export VAR=$(cmd)` reports export's own exit
# status, which would hide a failing `vault read`.
TERRABACK_ENCRYPTION_KEY="$(vault read -field=key secret/terraback)"
export TERRABACK_ENCRYPTION_KEY

Access Control

Principle of Least Privilege

# Role-based access control: each entry binds a role name to a list of
# permissions and the users it applies to.
security:
  rbac:
    - role: "infrastructure-admin"
      permissions: ["all"]
      users: ["alice@company.com"]

    - role: "developer"
      permissions: ["backup:create", "backup:list", "restore:preview"]
      users: ["*@dev.company.com"]

    - role: "auditor"
      permissions: ["read"]
      users: ["audit-team@company.com"]

Multi-Factor Authentication

# Turn MFA on, set up a TOTP factor, then require MFA for production.

# Enable MFA requirement
terraback security mfa enable

# Configure TOTP
terraback security mfa setup --type totp

# Enforce for production operations
terraback config set security.production.require_mfa true

Audit Logging

# Comprehensive audit configuration: where audit records are sent, which
# event patterns are recorded, and the compliance retention settings.
audit:
  enabled: true
  destinations:
    - type: "file"
      path: "/var/log/terraback/audit.log"
      format: "json"
      rotation: "daily"

    - type: "siem"
      endpoint: "https://siem.company.com"
      api_key: "${SIEM_API_KEY}"  # Taken from the environment, not stored here

  events:
    - "backup.*"
    - "restore.*"
    - "config.change"
    - "auth.*"
    - "drift.detected"

  # NOTE(review): nesting was reconstructed; confirm `compliance` belongs
  # under `audit`.
  compliance:
    standards: ["SOC2", "ISO27001", "HIPAA"]
    retention: "7 years"

Drift Detection

Continuous Monitoring Setup

# Turn on drift detection: 15-minute interval, medium alert threshold,
# auto-remediation set to "safe"
terraback drift enable \
  --interval 15m \
  --alert-threshold medium \
  --auto-remediate safe

# Tune the detector: smart mode, tag handling, business hours
terraback drift config set detection.smart_mode true
terraback drift config set detection.ignore_tags true
terraback drift config set detection.business_hours "08:00-18:00"

Drift Response Playbook

# Drift response automation: one entry per severity level, each naming the
# action to take and the notification targets.
drift:
  response:
    low_severity:
      action: "log"
      notification: "email_summary"

    medium_severity:
      action: "alert"
      notification: "slack"
      require_acknowledgment: true

    high_severity:
      action: "page"
      notification: ["slack", "pagerduty", "email"]
      auto_create_ticket: true

    critical_severity:
      action: "block_changes"
      notification: ["all"]
      escalation: "immediate"

Handling Drift

# 1. Look at the detected drift in detail
terraback drift show --detailed

# 2. Write an HTML report that reviewers can open
terraback drift report --format html > drift-report.html

# 3. Branch on the reported severity
case "$(terraback drift severity)" in
  critical)
    # Open an incident
    terraback incident create --severity critical
    # Roll back straight away
    terraback restore last-known-good --emergency
    ;;
  *)
    # Preview the remediation first
    terraback drift fix --dry-run
    # Apply the fixes once the preview has been reviewed
    terraback drift fix --approved-by security-team
    ;;
esac

Performance Optimization

Large State Files

Optimization Strategies

# For state files > 100MB
# NOTE(review): nesting was reconstructed; confirm `chunking`, `compression`
# and `parallel_processing` belong under `large_state_optimization`.
performance:
  large_state_optimization:
    enabled: true
    chunking:
      enabled: true
      size: "10MB"
    compression:
      algorithm: "zstd"  # Better for large files
      level: 3  # Balance speed/compression
    parallel_processing:
      enabled: true
      workers: 8

State Splitting

# Break one large state apart by resource type, writing to ./split-states/
terraback state split \
  --strategy "resource-type" \
  --output-dir ./split-states/

# Alternative: break it apart per module
terraback state split \
  --strategy "module" \
  --preserve-dependencies

Network Optimization

# Network tuning: connection reuse, bandwidth caps and edge-location routing.
network:
  optimization:
    # Connection pooling
    connection_pool:
      size: 20
      timeout: "30s"
      keepalive: true

    # Bandwidth management
    bandwidth:
      upload_limit: "100Mbps"
      download_limit: "200Mbps"
      burst_allowance: 1.5

    # Regional optimization
    edge_locations:
      enabled: true
      prefer_closest: true

Storage Optimization

# Switch on intelligent tiering
terraback storage optimize --enable-tiering

# Lifecycle policy: cool tier after 30d, archive after 90d, delete after 365d
terraback storage lifecycle create \
  --move-to-cool-after 30d \
  --move-to-archive-after 90d \
  --delete-after 365d

# Remove duplicate backup data
terraback storage dedupe --aggressive

Disaster Recovery

RTO and RPO Targets

| Scenario            | RTO Target | RPO Target | Strategy |
|---------------------|------------|------------|----------|
| Critical Production | < 5 min    | < 1 hour   | Hot standby with continuous backup |
| Standard Production | < 30 min   | < 4 hours  | Automated recovery with recent backup |
| Non-Production      | < 2 hours  | < 24 hours | Manual recovery with daily backup |

Recovery Testing

# Monthly recovery drill against staging and dr-test; notifies the DR team
terraback dr schedule-drill \
  --frequency monthly \
  --environments "staging,dr-test" \
  --notification "dr-team@company.com"

# Run a complete-failure scenario against a 30-minute RTO target
terraback dr test \
  --scenario "complete-failure" \
  --target-rto "30m" \
  --report-output ./dr-test-results/

Emergency Recovery Procedures

#!/bin/bash
# emergency-recovery.sh
# Looks up a verified pre-incident backup, takes a checkpoint of the current
# state, restores the backup, then verifies and documents the recovery.

# 1. Assess the situation
terraback status --emergency-mode

# 2. Find last known good state
# NOTE(review): assumes `backup list --limit 1` prints a single backup
# identifier on stdout — confirm against the CLI output format.
LAST_GOOD=$(terraback backup list \
  --before-incident \
  --verified \
  --limit 1)

# Stop here if nothing was found: an empty value would otherwise run
# `restore apply` with no backup argument at all.
if [[ -z "$LAST_GOOD" ]]; then
  echo "No verified pre-incident backup found; aborting recovery." >&2
  exit 1
fi

# 3. Create emergency checkpoint
terraback checkpoint create \
  --name "emergency-$(date +%s)" \
  --priority critical

# 4. Perform recovery
# Quoted so the identifier is passed as exactly one argument.
terraback restore apply "$LAST_GOOD" \
  --emergency \
  --skip-validation \
  --notify-stakeholders

# 5. Verify recovery
terraback verify --comprehensive

# 6. Document incident
terraback incident document \
  --template post-mortem \
  --attach-logs

CI/CD Integration

Pipeline Best Practices

GitHub Actions Example

# .github/workflows/terraform-backup.yml
# Checkpoints state on pull requests, checks for drift, plans, and on pushes
# to main applies the plan and backs up the resulting state.
name: Terraform State Management

on:
  push:
    branches: [main, production]
  pull_request:
    # The activity type for new commits on a PR is `synchronize`
    # (`synchronized` is not a valid type).
    types: [opened, synchronize]
  schedule:
    - cron: '0 */6 * * *'  # Every 6 hours

jobs:
  backup:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Setup Terraback
        run: |
          curl -sSL https://terraback.io/install.sh | bash
          terraback version

      - name: Configure Terraback
        env:
          TERRABACK_API_KEY: ${{ secrets.TERRABACK_API_KEY }}
        run: |
          terraback config import .terraback/ci-config.yml

      - name: Pre-change Backup
        if: github.event_name == 'pull_request'
        run: |
          terraback checkpoint create \
            --name "pr-${{ github.event.pull_request.number }}" \
            --tags "ci,pr,automated"

      # The `|| { ... }` fallback means detected drift opens an issue instead
      # of failing this step.
      - name: Check Drift
        run: |
          terraback drift check --fail-on-drift || {
            echo "Drift detected, creating issue"
            terraback github issue create \
              --title "Drift detected in ${{ github.ref }}" \
              --labels "drift,automated"
          }

      - name: Terraform Plan
        id: plan
        run: terraform plan -out=tfplan

      - name: Backup After Plan
        if: steps.plan.outcome == 'success'
        run: |
          terraback backup create \
            --name "post-plan-${{ github.sha }}" \
            --attach-plan tfplan

      # Only pushes to main apply the saved plan.
      - name: Apply Changes
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        run: |
          terraform apply -auto-approve tfplan
          terraback backup create \
            --name "post-apply-${{ github.sha }}" \
            --tags "production,applied"

GitOps Workflow

# terraback-gitops.yml
# NOTE(review): nesting was reconstructed; confirm `sync`, `state_tracking`,
# `approval` and `rollback` all belong under `gitops`.
gitops:
  enabled: true
  repository: "github.com/company/infrastructure"
  branch: "main"

  sync:
    interval: "5m"
    auto_commit: true
    auto_push: true

  state_tracking:
    path: "states/"
    format: "encrypted-json"

  approval:
    required_reviewers: 2
    approved_by: ["sre-team", "security-team"]

  rollback:
    auto_rollback_on_failure: true
    keep_failed_attempts: true

Team Collaboration

Shared Configuration

# team-config.yml
# NOTE(review): nesting was reconstructed; confirm every section below
# belongs under `team`.
team:
  name: "infrastructure"

  shared_settings:
    backup_prefix: "team-infra-"
    default_tags:
      team: "infrastructure"
      managed_by: "terraback"
      cost_center: "engineering"

  permissions:
    default: "read"
    leads: ["alice@company.com", "bob@company.com"]

  notifications:
    channels:
      slack: "#infrastructure"  # Quoted: a bare leading `#` would start a comment
      email: "infra-team@company.com"

  workspace_ownership:
    production: ["sre-team"]
    staging: ["dev-team", "sre-team"]
    development: ["all"]

Conflict Prevention

# Enable state locking
terraback config set state.locking.enabled true
terraback config set state.locking.timeout "5m"

# Configure operation queueing
terraback config set operations.queue.enabled true
terraback config set operations.queue.max_wait "10m"

# Set up mutex for critical operations.
# Apply only when the mutex was actually obtained; otherwise the apply would
# run unprotected and the release would target a mutex this shell never held.
# NOTE(review): assumes `mutex acquire` exits non-zero when it cannot obtain
# the mutex — confirm.
if terraback mutex acquire --resource "production-apply"; then
  terraform apply
  # Runs whether or not the apply succeeded, so the mutex is not left held.
  terraback mutex release --resource "production-apply"
else
  echo "Could not acquire the production-apply mutex; skipping apply." >&2
fi

Monitoring and Alerting

Comprehensive Monitoring Setup

# Health checks, SLA targets and dashboards.
# NOTE(review): nesting was reconstructed; confirm `sla_monitoring` and
# `dashboards` belong under `monitoring`.
monitoring:
  health_checks:
    - name: "backup_freshness"
      interval: "1h"
      condition: "last_backup < 2h"
      alert_if_fails: 2

    - name: "storage_capacity"
      interval: "6h"
      condition: "usage < 80%"
      alert_if_fails: 1

    - name: "drift_detection"
      interval: "15m"
      condition: "drift_count == 0"
      alert_if_fails: 3

  sla_monitoring:
    backup_success_rate: 99.9
    recovery_success_rate: 99.5
    api_availability: 99.95

  dashboards:
    - type: "grafana"
      url: "https://grafana.company.com"
      dashboards:
        - "terraback-overview"
        - "backup-metrics"
        - "drift-analysis"

Alert Fatigue Prevention

# Alert-noise controls: deduplication, suppression, correlation, escalation.
# NOTE(review): nesting was reconstructed; the sections after `smart_alerts`
# are placed as its siblings under `alerting` — confirm they are not meant to
# be nested inside `smart_alerts`.
alerting:
  smart_alerts:
    enabled: true

  deduplication:
    window: "5m"
    key: ["resource", "severity"]

  suppression:
    maintenance_window: "Sat 02:00-06:00"
    repeated_alerts: "exponential_backoff"

  correlation:
    group_related: true
    time_window: "10m"

  escalation:
    levels:
      - delay: "0m"
        notify: ["on-call"]
      - delay: "15m"
        notify: ["team-lead"]
      - delay: "30m"
        notify: ["manager"]

Cost Optimization

Storage Cost Management

# Analyze storage costs
terraback cost analyze --breakdown-by age,size,type

# Implement cost-saving policies
terraback cost optimize \
  --enable-compression \
  --enable-deduplication \
  --lifecycle-transitions \
  --remove-redundant

# Set cost alerts
# Single quotes keep the dollar sign literal: inside double quotes the shell
# expands `$1`, turning "$100/month" into "00/month".
terraback cost alert create \
  --threshold '$100/month' \
  --notify "finance@company.com"

Intelligent Retention

# Retention rules matched against backup names by glob pattern.
# NOTE(review): nesting was reconstructed; `rules` is placed as a sibling of
# `intelligent` under `retention` — confirm it is not meant to be nested
# inside `intelligent`.
retention:
  intelligent:
    enabled: true

  rules:
    - name: "production_critical"
      pattern: "*production*critical*"  # Quoted: a bare leading `*` is a YAML alias
      keep_forever: true

    - name: "releases"
      pattern: "*release*"
      keep_days: 365

    - name: "daily_backups"
      pattern: "*daily*"
      keep_days: 30
      thin_after: 7  # Keep only one per day after 7 days

    - name: "development"
      pattern: "*dev*"
      keep_days: 7

Compliance and Governance

Regulatory Compliance

# Per-framework control settings; each framework lists its controls as
# single-key entries.
compliance:
  frameworks:
    - name: "SOC2"
      controls:
        - encryption_at_rest: true
        - encryption_in_transit: true
        - audit_logging: true
        - access_control: true
        - retention_policy: "7 years"

    - name: "HIPAA"
      controls:
        - phi_encryption: "AES256"
        - access_logs: true
        - backup_encryption: true
        - disaster_recovery: true

    - name: "GDPR"
      controls:
        - data_location: ["EU"]
        - right_to_deletion: true
        - data_portability: true
        - breach_notification: "72 hours"

Governance Policies

# Switch governance enforcement on
terraback governance enable

# Create a blocking policy that requires hourly production backups
terraback governance policy create \
  --name "backup-requirements" \
  --rule "all production workspaces must have hourly backups" \
  --enforcement "block"

# Produce a PDF compliance audit report
terraback governance audit \
  --report-format "pdf" \
  --output "./compliance-report.pdf"

Troubleshooting Guidelines

Diagnostic Runbook

# Step 1: Check system health
terraback doctor --comprehensive

# Step 2: Verify connectivity
terraback test connectivity --all

# Step 3: Validate configuration
terraback config validate --strict

# Step 4: Check permissions
terraback permissions verify

# Step 5: Review recent operations
terraback logs tail --lines 100 --level error

# Step 6: Generate support bundle
# The file name is quoted so the command substitution cannot be word-split
# or glob-expanded.
terraback support bundle \
  --include-logs \
  --include-config \
  --output "support-$(date +%Y%m%d).tar.gz"

Maintenance and Updates

Regular Maintenance Tasks

#!/bin/bash
# maintenance.sh - Run weekly

# Check for a Terraback update and apply it without prompting
terraback update check
terraback update apply --auto-approve

# Prune backups older than 90 days, keeping those tagged release or critical.
# NOTE(review): this runs with --dry-run, which presumably only reports what
# would be removed — confirm, and drop the flag if real deletion is intended.
terraback backup prune \
  --older-than 90d \
  --keep-tags "release,critical" \
  --dry-run

# Integrity check across the last 7 days of backups
terraback verify --all --last 7d

# Defragment storage
terraback storage optimize --defragment

# Regenerate the documentation as markdown
terraback docs generate --format markdown

# Email the weekly health report to the team
terraback report generate --type weekly \
  --email "team@company.com"

Version Management

# Update channel and schedule, supported Terraform range, and rollback
# settings.
version_policy:
  auto_update:
    enabled: true
    channel: "stable"  # stable, beta, edge
    schedule: "Sunday 02:00"

  compatibility:
    terraform_versions:
      minimum: "0.12.0"  # Version strings stay quoted so they are not read as numbers
      maximum: "1.6.x"
    warn_on_unsupported: true

  rollback:
    keep_previous: 3
    auto_rollback_on_error: true