From e8ab07038402152605080c83865f8fe43bfecc29 Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sat, 30 May 2026 17:16:34 -0700
Subject: [PATCH 01/31] harden GCP Cloud Run: Cloud SQL lockdown and Secret
 Manager credentials

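Remove public 0.0.0.0/0 access from Cloud SQL and mark credential outputs sensitive. Inject the MLflow backend DSN and the Grafana DB URL and admin password via Secret Manager value_from instead of plaintext env, drop the baked admin/admin default credentials from the Grafana image, and stop the Grafana entrypoint echoing the DB URL to logs. Add MLflow min-instances, a longer request timeout, and a startup health probe. Also stop setting MLFLOW_SERVER_DISABLE_SECURITY_MIDDLEWARE so the MLflow security middleware stays enabled, and remove the fixed 180s post-create sleep in favour of the verify_instance_running check.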
---
 .../docker/grafana-container/Dockerfile       | 11 ++-
 .../docker/grafana-container/entrypoint.sh    |  1 -
 .../templates/gcp/cloud_run/mlflow_main.tf.j2 | 26 +++++--
 .../modules/cloud_sql_postgres/main.tf        | 72 +++++++++++++++----
 .../modules/cloud_sql_postgres/outputs.tf     | 33 +++++++--
 .../grafana/cloud/gcp/cloud_run/main.tf       | 72 ++++++++++++++++---
 .../grafana/cloud/gcp/cloud_run/outputs.tf    | 13 +++-
 .../grafana/cloud/gcp/cloud_run/variables.tf  |  8 ++-
 .../mlflow/cloud/gcp/cloud_run/main.tf        | 44 ++++++++----
 .../mlflow/cloud/gcp/cloud_run/variables.tf   | 29 +++++++-
 10 files changed, 255 insertions(+), 54 deletions(-)

diff --git a/src/deployml/docker/grafana-container/Dockerfile b/src/deployml/docker/grafana-container/Dockerfile
index 5ba6664..4aa5578 100644
--- a/src/deployml/docker/grafana-container/Dockerfile
+++ b/src/deployml/docker/grafana-container/Dockerfile
@@ -3,12 +3,11 @@ FROM grafana/grafana:10.4.5
 # Switch to root to copy files and change permissions
 USER root
 
-# Configure Grafana to listen on Cloud Run's expected port
-ENV GF_SERVER_HTTP_PORT=8080 \
-    GF_SECURITY_ADMIN_USER=admin \
-    GF_SECURITY_ADMIN_PASSWORD=admin
+# Server port only. Admin user and password come from Cloud Run env vars,
+# which Terraform wires from Secret Manager. No baked default credentials.
+ENV GF_SERVER_HTTP_PORT=8080
 
-# Copy entrypoint script and set permissions 
+# Copy entrypoint script and set permissions
 COPY entrypoint.sh /usr/local/bin/entrypoint.sh
 RUN chmod 755 /usr/local/bin/entrypoint.sh
 
@@ -20,4 +19,4 @@ RUN mkdir -p /etc/grafana/provisioning/datasources \
 USER grafana
 
 # Set custom entrypoint
-ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
\ No newline at end of file
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
diff --git a/src/deployml/docker/grafana-container/entrypoint.sh b/src/deployml/docker/grafana-container/entrypoint.sh
index ef92453..c1af0b4 100644
--- a/src/deployml/docker/grafana-container/entrypoint.sh
+++ b/src/deployml/docker/grafana-container/entrypoint.sh
@@ -5,6 +5,5 @@ set -e
 export GF_SERVER_HTTP_PORT="${PORT:-${GF_SERVER_HTTP_PORT:-8080}}"
 
 echo "Starting Grafana on port ${GF_SERVER_HTTP_PORT}..."
-[ -n "${GF_DATABASE_URL:-}" ] && echo "Using database: ${GF_DATABASE_URL}"
 
 exec /run.sh
diff --git a/src/deployml/templates/gcp/cloud_run/mlflow_main.tf.j2 b/src/deployml/templates/gcp/cloud_run/mlflow_main.tf.j2
index 8302530..b69147d 100644
--- a/src/deployml/templates/gcp/cloud_run/mlflow_main.tf.j2
+++ b/src/deployml/templates/gcp/cloud_run/mlflow_main.tf.j2
@@ -74,7 +74,10 @@ module "{{ stage_name }}_mlflow" {
   container_concurrency = var.container_concurrency
   artifact_bucket = {% if create_artifact_bucket and tool.params.get("artifact_bucket") %}google_storage_bucket.{{ stage_name }}_mlflow_artifact.name{% else %}var.artifact_bucket{% endif %}
   {% if flags.needs_postgres %}
-  backend_store_uri = module.cloud_sql_postgres.connection_string
+  # DSN comes from Secret Manager so the password is not a plain env var.
+  # Plain backend_store_uri is set to empty so the module picks the secret path.
+  backend_store_uri = ""
+  backend_store_uri_secret_id = module.cloud_sql_postgres.mlflow_dsn_secret_id
   cloudsql_instance_annotation = module.cloud_sql_postgres.instance_connection_name
   {% else %}
   backend_store_uri = "{{ tool.params.get('backend_store_uri', 'sqlite:///mlflow.db') }}"
@@ -128,7 +131,8 @@ module "{{ stage_name }}_{{ tool.name }}" {
   memory_limit        = var.memory_limit
   allow_public_access = var.allow_public_access
   {% if flags.needs_postgres %}
-  backend_store_uri   = module.cloud_sql_postgres.connection_string
+  # Socket DSN. Public IP is blocked; Cloud SQL Auth Proxy via annotation only.
+  backend_store_uri   = module.cloud_sql_postgres.connection_string_cloud_sql
   # IMPORTANT: Provide a dedicated metrics database connection for the app
   # so prediction metrics are not written to the MLflow database.
   db_connection_string = module.cloud_sql_postgres.metrics_connection_string_cloud_sql
@@ -281,7 +285,9 @@ module "{{ stage_name }}_{{ tool.name }}" {
   memory_limit        = var.memory_limit
   allow_public_access = var.allow_public_access
   {% if flags.needs_postgres %}
-  metrics_connection_string = module.cloud_sql_postgres.grafana_connection_string_cloud_sql
+  # DSN comes from Secret Manager. Plain field is empty so the module picks the secret path.
+  metrics_connection_string = ""
+  metrics_connection_string_secret_id = module.cloud_sql_postgres.grafana_metrics_dsn_secret_id
   use_metrics_database = true
   cloudsql_instance_annotation = module.cloud_sql_postgres.instance_connection_name
   depends_on = [module.cloud_sql_postgres]
@@ -323,9 +329,17 @@ output "artifact_bucket" {
 {% endif %}
 {% if flags.needs_postgres %}
 output "postgresql_credentials" {
-  value = module.cloud_sql_postgres.postgresql_credentials
+  value     = module.cloud_sql_postgres.postgresql_credentials
   sensitive = true
 }
+output "mlflow_dsn_secret_id" {
+  description = "Secret Manager ID holding the MLflow DSN."
+  value       = module.cloud_sql_postgres.mlflow_dsn_secret_id
+}
+output "instance_connection_name" {
+  description = "Cloud SQL connection name. Needed to launch the Cloud SQL Auth Proxy locally."
+  value       = module.cloud_sql_postgres.instance_connection_name
+}
 {% endif %}
 output "bigquery_dataset" {
   value = module.bigquery.dataset_id
@@ -341,6 +355,10 @@ output "fastapi_url" {
     {% if stage_name == "model_monitoring" and tool.name == "grafana" %}
 output "grafana_url" {
   value = length(module.{{ stage_name }}_{{ tool.name }}) > 0 ? module.{{ stage_name }}_{{ tool.name }}[0].service_url : ""
+}
+output "grafana_admin_password_secret_id" {
+  description = "Secret Manager ID holding the Grafana admin password. Fetch the value with gcloud secrets versions access."
+  value       = length(module.{{ stage_name }}_{{ tool.name }}) > 0 ? module.{{ stage_name }}_{{ tool.name }}[0].admin_password_secret_id : ""
 }
     {% endif %}
     {% if stage_name == "feature_store" and tool.name == "feast" %}
diff --git a/src/deployml/terraform/modules/cloud_sql_postgres/main.tf b/src/deployml/terraform/modules/cloud_sql_postgres/main.tf
index 0baec00..6dd63db 100644
--- a/src/deployml/terraform/modules/cloud_sql_postgres/main.tf
+++ b/src/deployml/terraform/modules/cloud_sql_postgres/main.tf
@@ -17,10 +17,11 @@ resource "google_sql_database_instance" "postgres" {
       name  = "max_connections"
       value = var.max_connections
     }
+    # Public IP stays enabled so outputs that reference public_ip_address keep
+    # working, but no authorized_networks means no direct internet access.
+    # Cloud Run reaches the instance through the cloudsql-instances annotation,
+    # which tunnels via the Cloud SQL Auth Proxy and bypasses authorized_networks.
     ip_configuration {
-      authorized_networks {
-        value = "0.0.0.0/0"
-      }
       ipv4_enabled = true
     }
   }
@@ -28,17 +29,11 @@ resource "google_sql_database_instance" "postgres" {
   deletion_protection = false
 }
 
-# Wait for Cloud SQL instance to be fully ready before creating databases
-# Cloud SQL instances can take 2-5 minutes to become fully operational
-resource "time_sleep" "wait_for_instance" {
-  depends_on = [google_sql_database_instance.postgres]
-  create_duration = "180s"  # Wait 3 minutes for instance to be fully ready
-}
-
-# Additional check: Use a null_resource to verify instance is actually running
-# This helps catch cases where the instance exists but is stopped
+# Verify the instance is RUNNABLE before creating databases.
+# Previously paired with a fixed 180s sleep; the polling here makes the sleep
+# unnecessary and faster on the happy path.
 resource "null_resource" "verify_instance_running" {
-  depends_on = [time_sleep.wait_for_instance]
+  depends_on = [google_sql_database_instance.postgres]
   
   provisioner "local-exec" {
     command = <<-EOT
@@ -120,6 +115,57 @@ resource "google_sql_user" "users" {
   }
 }
 
+# Secret Manager holds the full MLflow DSN so Cloud Run env vars do not carry
+# the DB password in plaintext. The Cloud Run runtime SA reads it at start.
+data "google_project" "current" {}
+
+resource "google_secret_manager_secret" "mlflow_dsn" {
+  project   = var.project_id
+  secret_id = "${var.db_instance_name}-mlflow-dsn"
+  replication {
+    auto {}
+  }
+}
+
+resource "google_secret_manager_secret_version" "mlflow_dsn" {
+  secret      = google_secret_manager_secret.mlflow_dsn.id
+  secret_data = "postgresql+psycopg2://${var.db_user}:${urlencode(random_password.db_password.result)}@/${var.db_name}?host=/cloudsql/${google_sql_database_instance.postgres.connection_name}"
+  depends_on  = [google_sql_user.users]
+}
+
+resource "google_secret_manager_secret_iam_member" "mlflow_dsn_access" {
+  project   = var.project_id
+  secret_id = google_secret_manager_secret.mlflow_dsn.secret_id
+  role      = "roles/secretmanager.secretAccessor"
+  member    = "serviceAccount:${data.google_project.current.number}-compute@developer.gserviceaccount.com"
+}
+
+# Same pattern for the Grafana metrics DSN so Grafana's GF_DATABASE_URL env
+# does not carry the DB password in plaintext.
+resource "google_secret_manager_secret" "grafana_metrics_dsn" {
+  count     = var.create_metrics_db ? 1 : 0
+  project   = var.project_id
+  secret_id = "${var.db_instance_name}-grafana-metrics-dsn"
+  replication {
+    auto {}
+  }
+}
+
+resource "google_secret_manager_secret_version" "grafana_metrics_dsn" {
+  count       = var.create_metrics_db ? 1 : 0
+  secret      = google_secret_manager_secret.grafana_metrics_dsn[0].id
+  secret_data = "postgres://${var.db_user}:${random_password.db_password.result}@/metrics?host=/cloudsql/${google_sql_database_instance.postgres.connection_name}&sslmode=disable"
+  depends_on  = [google_sql_database.metrics_db]
+}
+
+resource "google_secret_manager_secret_iam_member" "grafana_metrics_dsn_access" {
+  count     = var.create_metrics_db ? 1 : 0
+  project   = var.project_id
+  secret_id = google_secret_manager_secret.grafana_metrics_dsn[0].secret_id
+  role      = "roles/secretmanager.secretAccessor"
+  member    = "serviceAccount:${data.google_project.current.number}-compute@developer.gserviceaccount.com"
+}
+
 resource "google_project_service" "required" {
   for_each           = toset(var.gcp_service_list)
   project            = var.project_id
diff --git a/src/deployml/terraform/modules/cloud_sql_postgres/outputs.tf b/src/deployml/terraform/modules/cloud_sql_postgres/outputs.tf
index c15598e..32ab971 100644
--- a/src/deployml/terraform/modules/cloud_sql_postgres/outputs.tf
+++ b/src/deployml/terraform/modules/cloud_sql_postgres/outputs.tf
@@ -5,6 +5,7 @@ output "db_user" {
 
 output "db_password" {
   value     = random_password.db_password.result
+  sensitive = true
 }
 
 output "db_name" {
@@ -17,18 +18,21 @@ output "db_public_ip" {
 
 output "connection_string" {
   value     = "postgresql+psycopg2://${var.db_user}:${urlencode(random_password.db_password.result)}@${google_sql_database_instance.postgres.public_ip_address}:5432/${var.db_name}"
-} 
+  sensitive = true
+}
 
 output "instance_connection_name" {
   value = google_sql_database_instance.postgres.connection_name
 }
 
 output "connection_string_cloud_sql" {
-  value = "postgresql+psycopg2://${var.db_user}:${urlencode(random_password.db_password.result)}@/mlflow?host=/cloudsql/${google_sql_database_instance.postgres.connection_name}"
+  value     = "postgresql+psycopg2://${var.db_user}:${urlencode(random_password.db_password.result)}@/mlflow?host=/cloudsql/${google_sql_database_instance.postgres.connection_name}"
+  sensitive = true
 }
 
 output "postgresql_credentials" {
   description = "All credentials and connection info for the Cloud SQL PostgreSQL instance."
+  sensitive   = true
   value = {
     db_user                  = var.db_user
     db_password              = random_password.db_password.result
@@ -40,24 +44,29 @@ output "postgresql_credentials" {
 }
 
 output "feast_connection_string" {
-  value = "postgresql+psycopg2://${var.db_user}:${urlencode(random_password.db_password.result)}@${google_sql_database_instance.postgres.public_ip_address}:5432/feast"
+  value     = "postgresql+psycopg2://${var.db_user}:${urlencode(random_password.db_password.result)}@${google_sql_database_instance.postgres.public_ip_address}:5432/feast"
+  sensitive = true
 }
 
 output "feast_connection_string_cloud_sql" {
-  value = "postgresql+psycopg2://${var.db_user}:${urlencode(random_password.db_password.result)}@/feast?host=/cloudsql/${google_sql_database_instance.postgres.connection_name}"
+  value     = "postgresql+psycopg2://${var.db_user}:${urlencode(random_password.db_password.result)}@/feast?host=/cloudsql/${google_sql_database_instance.postgres.connection_name}"
+  sensitive = true
 }
 
 output "metrics_connection_string" {
-  value = "postgresql+psycopg2://${var.db_user}:${urlencode(random_password.db_password.result)}@${google_sql_database_instance.postgres.public_ip_address}:5432/metrics"
+  value     = "postgresql+psycopg2://${var.db_user}:${urlencode(random_password.db_password.result)}@${google_sql_database_instance.postgres.public_ip_address}:5432/metrics"
+  sensitive = true
 }
 
 output "metrics_connection_string_cloud_sql" {
-  value = "postgresql+psycopg2://${var.db_user}:${urlencode(random_password.db_password.result)}@/metrics?host=/cloudsql/${google_sql_database_instance.postgres.connection_name}"
+  value     = "postgresql+psycopg2://${var.db_user}:${urlencode(random_password.db_password.result)}@/metrics?host=/cloudsql/${google_sql_database_instance.postgres.connection_name}"
+  sensitive = true
 }
 
 # Grafana-friendly connection URL using lib/pq style
 output "grafana_connection_string_cloud_sql" {
   description = "Postgres connection URL suitable for Grafana using Cloud SQL Unix socket"
+  sensitive   = true
   value = "postgres://${var.db_user}:${random_password.db_password.result}@/metrics?host=/cloudsql/${google_sql_database_instance.postgres.connection_name}&sslmode=disable"
 }
 
@@ -84,4 +93,14 @@ output "postgres_password" {
 
 output "instance_name" {
   value = google_sql_database_instance.postgres.name
-}
\ No newline at end of file
+}
+
+output "mlflow_dsn_secret_id" {
+  description = "Secret Manager ID holding the MLflow DSN. Cloud Run reads it via value_from instead of carrying the password in env."
+  value       = google_secret_manager_secret.mlflow_dsn.secret_id
+}
+
+output "grafana_metrics_dsn_secret_id" {
+  description = "Secret Manager ID holding the Grafana metrics DSN. Empty when create_metrics_db is false."
+  value       = var.create_metrics_db ? google_secret_manager_secret.grafana_metrics_dsn[0].secret_id : ""
+}
diff --git a/src/deployml/terraform/modules/grafana/cloud/gcp/cloud_run/main.tf b/src/deployml/terraform/modules/grafana/cloud/gcp/cloud_run/main.tf
index 440001a..e55d30b 100644
--- a/src/deployml/terraform/modules/grafana/cloud/gcp/cloud_run/main.tf
+++ b/src/deployml/terraform/modules/grafana/cloud/gcp/cloud_run/main.tf
@@ -1,5 +1,32 @@
 data "google_project" "current" {}
 
+# Auto-generated admin password. Lives in Secret Manager so the Grafana image
+# does not carry a baked default credential.
+resource "random_password" "grafana_admin" {
+  length  = 20
+  special = false
+}
+
+resource "google_secret_manager_secret" "grafana_admin" {
+  project   = var.project_id
+  secret_id = "${var.service_name}-admin-password"
+  replication {
+    auto {}
+  }
+}
+
+resource "google_secret_manager_secret_version" "grafana_admin" {
+  secret      = google_secret_manager_secret.grafana_admin.id
+  secret_data = random_password.grafana_admin.result
+}
+
+resource "google_secret_manager_secret_iam_member" "grafana_admin_access" {
+  project   = var.project_id
+  secret_id = google_secret_manager_secret.grafana_admin.secret_id
+  role      = "roles/secretmanager.secretAccessor"
+  member    = "serviceAccount:${data.google_project.current.number}-compute@developer.gserviceaccount.com"
+}
+
 resource "google_cloud_run_service" "grafana" {
   name     = var.service_name
   location = var.region
@@ -11,7 +38,7 @@ resource "google_cloud_run_service" "grafana" {
         "run.googleapis.com/cloudsql-instances" = var.cloudsql_instance_annotation
       } : {}
     }
-    
+
     spec {
       service_account_name = "${data.google_project.current.number}-compute@developer.gserviceaccount.com"
       containers {
@@ -25,16 +52,28 @@ resource "google_cloud_run_service" "grafana" {
         ports {
           container_port = 8080
         }
-        
-        # Add metrics database connection if enabled
+
+        # Metrics DB URL. Prefer Secret Manager when secret_id is provided.
         dynamic "env" {
-          for_each = var.use_metrics_database && var.metrics_connection_string != "" ? [1] : []
+          for_each = var.use_metrics_database && var.metrics_connection_string != "" && var.metrics_connection_string_secret_id == "" ? [1] : []
           content {
             name  = "GF_DATABASE_URL"
             value = var.metrics_connection_string
           }
         }
-        
+        dynamic "env" {
+          for_each = var.use_metrics_database && var.metrics_connection_string_secret_id != "" ? [1] : []
+          content {
+            name = "GF_DATABASE_URL"
+            value_from {
+              secret_key_ref {
+                name = var.metrics_connection_string_secret_id
+                key  = "latest"
+              }
+            }
+          }
+        }
+
         dynamic "env" {
           for_each = var.use_metrics_database ? [1] : []
           content {
@@ -43,11 +82,25 @@ resource "google_cloud_run_service" "grafana" {
           }
         }
 
-        # Ensure Grafana listens on Cloud Run's port
         env {
           name  = "GF_SERVER_HTTP_PORT"
           value = "8080"
         }
+
+        env {
+          name  = "GF_SECURITY_ADMIN_USER"
+          value = "admin"
+        }
+
+        env {
+          name = "GF_SECURITY_ADMIN_PASSWORD"
+          value_from {
+            secret_key_ref {
+              name = google_secret_manager_secret.grafana_admin.secret_id
+              key  = "latest"
+            }
+          }
+        }
       }
     }
   }
@@ -56,6 +109,10 @@ resource "google_cloud_run_service" "grafana" {
     percent         = 100
     latest_revision = true
   }
+
+  depends_on = [
+    google_secret_manager_secret_iam_member.grafana_admin_access,
+  ]
 }
 
 resource "google_cloud_run_service_iam_member" "public" {
@@ -67,9 +124,8 @@ resource "google_cloud_run_service_iam_member" "public" {
   member   = "allUsers"
 }
 
-# Allow service account to connect to Cloud SQL via socket
 resource "google_project_iam_member" "grafana_cloudsql_client" {
   project = var.project_id
   role    = "roles/cloudsql.client"
   member  = "serviceAccount:${data.google_project.current.number}-compute@developer.gserviceaccount.com"
-}
\ No newline at end of file
+}
diff --git a/src/deployml/terraform/modules/grafana/cloud/gcp/cloud_run/outputs.tf b/src/deployml/terraform/modules/grafana/cloud/gcp/cloud_run/outputs.tf
index 1da0f7d..2f4598e 100644
--- a/src/deployml/terraform/modules/grafana/cloud/gcp/cloud_run/outputs.tf
+++ b/src/deployml/terraform/modules/grafana/cloud/gcp/cloud_run/outputs.tf
@@ -1,4 +1,15 @@
 output "service_url" {
   description = "URL of the Grafana service"
   value       = google_cloud_run_service.grafana.status[0].url
-}
\ No newline at end of file
+}
+
+output "admin_password_secret_id" {
+  description = "Secret Manager ID holding the Grafana admin password. Fetch with: gcloud secrets versions access latest --secret=ID --project=PROJECT"
+  value       = google_secret_manager_secret.grafana_admin.secret_id
+}
+
+output "admin_password" {
+  description = "Grafana admin password. Sensitive."
+  value       = random_password.grafana_admin.result
+  sensitive   = true
+}
diff --git a/src/deployml/terraform/modules/grafana/cloud/gcp/cloud_run/variables.tf b/src/deployml/terraform/modules/grafana/cloud/gcp/cloud_run/variables.tf
index 5db3fbb..74eccb7 100644
--- a/src/deployml/terraform/modules/grafana/cloud/gcp/cloud_run/variables.tf
+++ b/src/deployml/terraform/modules/grafana/cloud/gcp/cloud_run/variables.tf
@@ -38,7 +38,13 @@ variable "allow_public_access" {
 
 variable "metrics_connection_string" {
   type        = string
-  description = "Connection string for the metrics database"
+  description = "Connection string for the metrics database. Used only when metrics_connection_string_secret_id is empty."
+  default     = ""
+}
+
+variable "metrics_connection_string_secret_id" {
+  type        = string
+  description = "Secret Manager secret ID holding the Grafana metrics DSN. When set, GF_DATABASE_URL is sourced via value_from."
   default     = ""
 }
 
diff --git a/src/deployml/terraform/modules/mlflow/cloud/gcp/cloud_run/main.tf b/src/deployml/terraform/modules/mlflow/cloud/gcp/cloud_run/main.tf
index f6e85a2..486f658 100644
--- a/src/deployml/terraform/modules/mlflow/cloud/gcp/cloud_run/main.tf
+++ b/src/deployml/terraform/modules/mlflow/cloud/gcp/cloud_run/main.tf
@@ -21,18 +21,29 @@ resource "google_cloud_run_service" "mlflow" {
     metadata {
       annotations = merge({
         "autoscaling.knative.dev/maxScale" = "10"
+        "autoscaling.knative.dev/minScale" = tostring(var.min_instances)
         "run.googleapis.com/cpu-throttling" = "false"
       }, var.cloudsql_instance_annotation != "" ? {
         "run.googleapis.com/cloudsql-instances" = var.cloudsql_instance_annotation
       } : {})
     }
-    
+
     spec {
       container_concurrency = 80
-      timeout_seconds       = 300
-      
+      timeout_seconds       = var.request_timeout_seconds
+
       containers {
-        image = var.image        
+        image = var.image
+        startup_probe {
+          http_get {
+            path = var.startup_probe_path
+            port = 8080
+          }
+          initial_delay_seconds = 10
+          period_seconds        = 10
+          timeout_seconds       = 5
+          failure_threshold     = 30
+        }
         # Always set basic MLflow environment
         env {
           name  = "MLFLOW_SERVER_HOST"
@@ -44,25 +55,34 @@ resource "google_cloud_run_service" "mlflow" {
           value = "8080"
         }
 
-        # Allow all host headers — required for Cloud Run since the Host header
-        # is the dynamic *.run.app URL which MLflow's DNS rebinding check rejects by default
+        # Allow any Host header ("*"): Cloud Run serves on a dynamic *.run.app
+        # subdomain. Security middleware stays ON; only the Host (DNS rebinding) check is relaxed.
         env {
           name  = "MLFLOW_SERVER_ALLOWED_HOSTS"
           value = "*"
         }
-        env {
-          name  = "MLFLOW_SERVER_DISABLE_SECURITY_MIDDLEWARE"
-          value = "true"
-        }
         
-        # Backend store URI
+        # Backend store URI. Prefer Secret Manager when secret_id provided so
+        # the password is not visible in the Cloud Run env tab.
         dynamic "env" {
-          for_each = var.backend_store_uri != "" ? [1] : []
+          for_each = var.backend_store_uri != "" && var.backend_store_uri_secret_id == "" ? [1] : []
           content {
             name  = "MLFLOW_BACKEND_STORE_URI"
             value = var.backend_store_uri
           }
         }
+        dynamic "env" {
+          for_each = var.backend_store_uri_secret_id != "" ? [1] : []
+          content {
+            name = "MLFLOW_BACKEND_STORE_URI"
+            value_from {
+              secret_key_ref {
+                name = var.backend_store_uri_secret_id
+                key  = "latest"
+              }
+            }
+          }
+        }
         
         # Artifact root - use bucket if provided, otherwise local
         env {
diff --git a/src/deployml/terraform/modules/mlflow/cloud/gcp/cloud_run/variables.tf b/src/deployml/terraform/modules/mlflow/cloud/gcp/cloud_run/variables.tf
index ded70fc..f823cef 100644
--- a/src/deployml/terraform/modules/mlflow/cloud/gcp/cloud_run/variables.tf
+++ b/src/deployml/terraform/modules/mlflow/cloud/gcp/cloud_run/variables.tf
@@ -52,7 +52,13 @@ variable "artifact_bucket" {
 # MLflow configuration
 variable "backend_store_uri" {
   type        = string
-  description = "URI for MLflow backend store (database)"
+  description = "URI for MLflow backend store (database). Used only when backend_store_uri_secret_id is empty."
+  default     = ""
+}
+
+variable "backend_store_uri_secret_id" {
+  type        = string
+  description = "Secret Manager secret ID holding the MLflow DSN. When set, the env var is sourced via value_from, keeping the password out of plain env."
   default     = ""
 }
 
@@ -87,6 +93,27 @@ variable "max_scale" {
   default     = 10
 }
 
+# Keep one warm MLflow instance: cold starts make the tracking UI feel broken.
+# Override to 0 to accept cold starts in exchange for zero idle cost. The ~$5/mo
+# estimate assumes idle-rate billing; with cpu-throttling=false expect more (verify).
+variable "min_instances" {
+  type        = number
+  description = "Minimum warm container instances. 1 avoids cold starts. 0 saves cost when idle."
+  default     = 1
+}
+
+variable "startup_probe_path" {
+  type        = string
+  description = "HTTP path used for the Cloud Run startup probe."
+  default     = "/health"
+}
+
+variable "request_timeout_seconds" {
+  type        = number
+  description = "Cloud Run request timeout. Long enough for MLflow log_artifact uploads and large queries."
+  default     = 1800
+}
+
 variable "container_concurrency" {
   type        = number
   description = "Maximum number of concurrent requests per container"

From 511d85806f387358dd15f5528b468e29d75e9dd7 Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sat, 30 May 2026 17:16:34 -0700
Subject: [PATCH 02/31] secure FastAPI and Feast database credentials

Drop the unused plaintext DATABASE_URL, BACKEND_STORE_URI, and USE_POSTGRES env from FastAPI since the app never reads them. Inject the Feast online-store password via Secret Manager and default the internal Feast feature server to private.
---
 .../fastapi/cloud/gcp/cloud_run/main.tf       |  34 ++---
 .../modules/feast/cloud/gcp/cloud_run/main.tf | 120 ++++++++++++------
 .../feast/cloud/gcp/cloud_run/variables.tf    |   2 +-
 3 files changed, 90 insertions(+), 66 deletions(-)

diff --git a/src/deployml/terraform/modules/fastapi/cloud/gcp/cloud_run/main.tf b/src/deployml/terraform/modules/fastapi/cloud/gcp/cloud_run/main.tf
index 40415a4..b2a3c71 100644
--- a/src/deployml/terraform/modules/fastapi/cloud/gcp/cloud_run/main.tf
+++ b/src/deployml/terraform/modules/fastapi/cloud/gcp/cloud_run/main.tf
@@ -8,10 +8,10 @@ resource "google_cloud_run_service" "fastapi" {
   template {
     metadata {
       annotations = merge({
-        "autoscaling.knative.dev/maxScale" = "10"
-        "run.googleapis.com/cpu-throttling" = "false"
+        "autoscaling.knative.dev/maxScale"         = "10"
+        "run.googleapis.com/cpu-throttling"        = "false"
         "run.googleapis.com/execution-environment" = "gen2"
-      }, var.use_postgres && var.cloudsql_instance_annotation != "" ? {
+        }, var.use_postgres && var.cloudsql_instance_annotation != "" ? {
         "run.googleapis.com/cloudsql-instances" = var.cloudsql_instance_annotation
       } : {})
     }
@@ -27,18 +27,6 @@ resource "google_cloud_run_service" "fastapi" {
           name  = "MODEL_URI"
           value = var.model_uri
         }
-        env {
-          name  = "BACKEND_STORE_URI"
-          value = var.backend_store_uri
-        }
-        env {
-          name  = "USE_POSTGRES"
-          value = var.use_postgres ? "true" : "false"
-        }
-        env {
-          name  = "DATABASE_URL"
-          value = var.use_postgres ? (var.db_connection_string != "" ? var.db_connection_string : var.backend_store_uri) : "sqlite:///app.db"
-        }
         env {
           name  = "FEAST_SERVICE_URL"
           value = var.feast_service_url
@@ -64,7 +52,7 @@ resource "google_cloud_run_service" "fastapi" {
         ports {
           container_port = 8080
         }
-        
+
         # Health check
         liveness_probe {
           http_get {
@@ -72,20 +60,20 @@ resource "google_cloud_run_service" "fastapi" {
             port = 8080
           }
           initial_delay_seconds = 30
-          timeout_seconds = 10
-          period_seconds = 30
-          failure_threshold = 3
+          timeout_seconds       = 10
+          period_seconds        = 30
+          failure_threshold     = 3
         }
-        
+
         startup_probe {
           http_get {
             path = "/health"
             port = 8080
           }
           initial_delay_seconds = 10
-          timeout_seconds = 10
-          period_seconds = 10
-          failure_threshold = 10
+          timeout_seconds       = 10
+          period_seconds        = 10
+          failure_threshold     = 10
         }
       }
     }
diff --git a/src/deployml/terraform/modules/feast/cloud/gcp/cloud_run/main.tf b/src/deployml/terraform/modules/feast/cloud/gcp/cloud_run/main.tf
index c8e32c0..dbb86a1 100644
--- a/src/deployml/terraform/modules/feast/cloud/gcp/cloud_run/main.tf
+++ b/src/deployml/terraform/modules/feast/cloud/gcp/cloud_run/main.tf
@@ -1,5 +1,30 @@
 data "google_project" "current" {}
 
+# Store the Feast DB password in Secret Manager so it is not visible as a
+# plaintext env var in the Cloud Run console or Cloud Logging.
+resource "google_secret_manager_secret" "feast_db_password" {
+  count     = var.use_postgres && var.postgres_password != "" ? 1 : 0
+  project   = var.project_id
+  secret_id = "${var.service_name}-feast-db-password"
+  replication {
+    auto {}
+  }
+}
+
+resource "google_secret_manager_secret_version" "feast_db_password" {
+  count       = var.use_postgres && var.postgres_password != "" ? 1 : 0
+  secret      = google_secret_manager_secret.feast_db_password[0].id
+  secret_data = var.postgres_password
+}
+
+resource "google_secret_manager_secret_iam_member" "feast_db_password_access" {
+  count     = var.use_postgres && var.postgres_password != "" ? 1 : 0
+  project   = var.project_id
+  secret_id = google_secret_manager_secret.feast_db_password[0].secret_id
+  role      = "roles/secretmanager.secretAccessor"
+  member    = "serviceAccount:${data.google_project.current.number}-compute@developer.gserviceaccount.com"
+}
+
 resource "google_cloud_run_service" "feast" {
   name     = var.service_name
   location = var.region
@@ -8,104 +33,114 @@ resource "google_cloud_run_service" "feast" {
   template {
     metadata {
       annotations = var.use_postgres && var.cloudsql_instance_annotation != "" ? {
-        "autoscaling.knative.dev/maxScale" = var.max_scale
-        "run.googleapis.com/cloudsql-instances" = var.cloudsql_instance_annotation
+        "autoscaling.knative.dev/maxScale"         = var.max_scale
+        "run.googleapis.com/cloudsql-instances"    = var.cloudsql_instance_annotation
         "run.googleapis.com/execution-environment" = "gen2"
-        "run.googleapis.com/memory" = var.memory_limit
-        "run.googleapis.com/cpu" = var.cpu_limit
-      } : {
-        "autoscaling.knative.dev/maxScale" = var.max_scale
+        "run.googleapis.com/memory"                = var.memory_limit
+        "run.googleapis.com/cpu"                   = var.cpu_limit
+        } : {
+        "autoscaling.knative.dev/maxScale"         = var.max_scale
         "run.googleapis.com/execution-environment" = "gen2"
-        "run.googleapis.com/memory" = var.memory_limit
-        "run.googleapis.com/cpu" = var.cpu_limit
+        "run.googleapis.com/memory"                = var.memory_limit
+        "run.googleapis.com/cpu"                   = var.cpu_limit
       }
     }
     spec {
-      service_account_name = "${data.google_project.current.number}-compute@developer.gserviceaccount.com"
+      service_account_name  = "${data.google_project.current.number}-compute@developer.gserviceaccount.com"
       container_concurrency = var.container_concurrency
       containers {
         image = var.image
-        
+
         env {
           name  = "FEAST_REGISTRY_TYPE"
           value = var.use_postgres ? "sql" : "file"
         }
-        
+
         env {
           name  = "FEAST_REGISTRY_PATH"
           value = var.backend_store_uri
         }
-        
+
         env {
           name  = "FEAST_ONLINE_STORE_TYPE"
           value = var.use_postgres ? "postgres" : "sqlite"
         }
-        
+
         env {
           name  = "FEAST_ONLINE_STORE_HOST"
           value = var.postgres_host
         }
-        
+
         env {
           name  = "FEAST_ONLINE_STORE_PORT"
           value = var.postgres_port
         }
-        
+
         env {
           name  = "FEAST_ONLINE_STORE_DATABASE"
           value = var.postgres_database
         }
-        
+
         env {
           name  = "FEAST_ONLINE_STORE_USER"
           value = var.postgres_user
         }
-        
-        env {
-          name  = "FEAST_ONLINE_STORE_PASSWORD"
-          value = var.postgres_password
+
+        # Inject the DB password via Secret Manager so it does not appear in
+        # Cloud Run env tab or Cloud Logging in plaintext.
+        dynamic "env" {
+          for_each = var.use_postgres && var.postgres_password != "" ? [1] : []
+          content {
+            name = "FEAST_ONLINE_STORE_PASSWORD"
+            value_from {
+              secret_key_ref {
+                name = google_secret_manager_secret.feast_db_password[0].secret_id
+                key  = "latest"
+              }
+            }
+          }
         }
-        
+
         env {
           name  = "USE_POSTGRES"
           value = var.use_postgres ? "true" : "false"
         }
-        
+
         env {
           name  = "FEAST_OFFLINE_STORE_TYPE"
           value = var.offline_store
         }
-        
+
         env {
           name  = "FEAST_ARTIFACT_BUCKET"
           value = var.artifact_bucket
         }
-        
+
         env {
           name  = "FEAST_BIGQUERY_PROJECT"
           value = var.bigquery_project != "" ? var.bigquery_project : var.project_id
         }
-        
+
         env {
           name  = "FEAST_BIGQUERY_DATASET"
           value = var.bigquery_dataset
         }
-        
+
         env {
           name  = "GOOGLE_CLOUD_PROJECT"
           value = var.bigquery_project != "" ? var.bigquery_project : var.project_id
         }
-        
+
         env {
           name  = "FEAST_OFFLINE_STORE_DATASET"
           value = var.bigquery_dataset
         }
-        
+
         env {
           name  = "FEAST_OFFLINE_STORE_PROJECT_ID"
           value = var.bigquery_project != "" ? var.bigquery_project : var.project_id
         }
-        
+
         resources {
           limits = {
             cpu    = var.cpu_limit
@@ -116,27 +151,27 @@ resource "google_cloud_run_service" "feast" {
             memory = var.memory_request
           }
         }
-        
+
         startup_probe {
           http_get {
             path = "/health"
             port = 8080
           }
-          failure_threshold     = 20       # Allow 20 failed attempts
-          initial_delay_seconds = 240      # Wait 2 minutes before first check
-          period_seconds        = 30       # Check every 30 seconds
-          timeout_seconds       = 10       # Each check times out after 10 seconds
+          failure_threshold     = 20  # Allow 20 failed attempts
+          initial_delay_seconds = 240 # Wait 4 minutes before first check
+          period_seconds        = 30  # Check every 30 seconds
+          timeout_seconds       = 10  # Each check times out after 10 seconds
         }
-        
+
         liveness_probe {
           http_get {
             path = "/health"
             port = 8080
           }
           initial_delay_seconds = 240
-          timeout_seconds = 25
-          period_seconds = 30
-          failure_threshold = 3
+          timeout_seconds       = 25
+          period_seconds        = 30
+          failure_threshold     = 3
         }
       }
     }
@@ -146,9 +181,10 @@ resource "google_cloud_run_service" "feast" {
     percent         = 100
     latest_revision = true
   }
-  
+
   depends_on = [
-    google_project_service.feast_apis
+    google_project_service.feast_apis,
+    google_secret_manager_secret_iam_member.feast_db_password_access,
   ]
 }
 
@@ -167,10 +203,10 @@ resource "google_project_service" "feast_apis" {
     "bigquery.googleapis.com",
     "storage.googleapis.com"
   ])
-  
+
   project = var.project_id
   service = each.value
-  
+
   disable_on_destroy = false
 }
 
diff --git a/src/deployml/terraform/modules/feast/cloud/gcp/cloud_run/variables.tf b/src/deployml/terraform/modules/feast/cloud/gcp/cloud_run/variables.tf
index cbef9ee..4a55cc5 100644
--- a/src/deployml/terraform/modules/feast/cloud/gcp/cloud_run/variables.tf
+++ b/src/deployml/terraform/modules/feast/cloud/gcp/cloud_run/variables.tf
@@ -105,7 +105,7 @@ variable "container_concurrency" {
 variable "allow_public_access" {
   type        = bool
   description = "Whether to allow public access to the Feast service"
-  default     = true
+  default     = false
 }
 
 variable "cloudsql_instance_annotation" {

From 8646ad9db05c22e46c87ef9666b8e9ce99bf5043 Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sat, 30 May 2026 17:16:34 -0700
Subject: [PATCH 03/31] bigquery: day-partition growth tables and allow clean
 destroy

Partition drift_metrics, ground_truth, and predictions by day to bound query scan cost, and set delete_contents_on_destroy so teardown does not fail on populated tables.
---
 .../modules/bigquery/cloud/gcp/main.tf        | 36 ++++++++++++++-----
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/src/deployml/terraform/modules/bigquery/cloud/gcp/main.tf b/src/deployml/terraform/modules/bigquery/cloud/gcp/main.tf
index 4db3b16..dd315b1 100644
--- a/src/deployml/terraform/modules/bigquery/cloud/gcp/main.tf
+++ b/src/deployml/terraform/modules/bigquery/cloud/gcp/main.tf
@@ -1,16 +1,25 @@
 resource "google_bigquery_dataset" "mlops" {
-  project    = var.project_id
-  dataset_id = var.dataset_id
-  location   = var.region
+  project                    = var.project_id
+  dataset_id                 = var.dataset_id
+  location                   = var.region
+  delete_contents_on_destroy = true
 }
 
- resource "google_bigquery_table" "drift_metrics" {
+# Partitioning by date keeps Grafana queries bounded and stops full table
+# scans from running away as the table grows. Same for ground_truth and
+# predictions, which both grow per request.
+resource "google_bigquery_table" "drift_metrics" {
   dataset_id = google_bigquery_dataset.mlops.dataset_id
   table_id   = "drift_metrics"
   project    = var.project_id
 
-  schema = file("${path.module}/schemas/drift_metrics.json")
+  schema              = file("${path.module}/schemas/drift_metrics.json")
   deletion_protection = false
+
+  time_partitioning {
+    type  = "DAY"
+    field = "metric_timestamp"
+  }
 }
 
 resource "google_bigquery_table" "ground_truth" {
@@ -18,8 +27,13 @@ resource "google_bigquery_table" "ground_truth" {
   table_id   = "ground_truth"
   project    = var.project_id
 
-  schema = file("${path.module}/schemas/ground_truth.json")
+  schema              = file("${path.module}/schemas/ground_truth.json")
   deletion_protection = false
+
+  time_partitioning {
+    type  = "DAY"
+    field = "event_timestamp"
+  }
 }
 
 resource "google_bigquery_table" "offline_features" {
@@ -27,7 +41,7 @@ resource "google_bigquery_table" "offline_features" {
   table_id   = "offline_features"
   project    = var.project_id
 
-  schema = file("${path.module}/schemas/offline_features.json")
+  schema              = file("${path.module}/schemas/offline_features.json")
   deletion_protection = false
 }
 
@@ -36,7 +50,11 @@ resource "google_bigquery_table" "predictions" {
   table_id   = "predictions"
   project    = var.project_id
 
-  schema = file("${path.module}/schemas/predictions.json")
+  schema              = file("${path.module}/schemas/predictions.json")
   deletion_protection = false
-}
 
+  time_partitioning {
+    type  = "DAY"
+    field = "prediction_timestamp"
+  }
+}

From 8552b79670b270f7d37407facf29fff0548bb435 Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sat, 30 May 2026 17:16:34 -0700
Subject: [PATCH 04/31] teardown: narrow the auto-teardown service account
 privileges

Tighten the teardown service account role set and remove project IAM admin.
---
 .../modules/teardown/cloud/gcp/main.tf        | 28 ++++++++++---------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/deployml/terraform/modules/teardown/cloud/gcp/main.tf b/src/deployml/terraform/modules/teardown/cloud/gcp/main.tf
index b6d5416..a0b53fb 100644
--- a/src/deployml/terraform/modules/teardown/cloud/gcp/main.tf
+++ b/src/deployml/terraform/modules/teardown/cloud/gcp/main.tf
@@ -40,22 +40,24 @@ resource "google_service_account" "teardown" {
   project      = var.project_id
 }
 
-# Grant permissions to service account
+# Grant teardown permissions. Narrowed scope: removed
+# roles/resourcemanager.projectIamAdmin because deleting resources does not
+# require IAM modification rights, and granting it is a project-wide
+# escalation risk if the SA is ever compromised.
 resource "google_project_iam_member" "teardown_permissions" {
   for_each = toset([
-    "roles/run.admin",                    # To destroy Cloud Run services and jobs
-    "roles/compute.instanceAdmin.v1",     # To destroy VMs
-    "roles/storage.admin",                # To destroy storage buckets
-    "roles/cloudsql.admin",               # To destroy Cloud SQL instances
-    "roles/iam.serviceAccountUser",       # To use service accounts
-    "roles/resourcemanager.projectIamAdmin", # To clean up IAM bindings
-    "roles/storage.objectAdmin",          # To read/write files in GCS
-    "roles/secretmanager.secretAccessor",  # To access secrets
-    "roles/pubsub.admin",                 # To delete Pub/Sub topics
-    "roles/cloudscheduler.admin",         # To delete scheduler jobs
-    "roles/cloudbuild.builds.builder"     # To delete Cloud Build triggers (if any exist from old deployments)
+    "roles/run.admin",                  # Destroy Cloud Run services and jobs
+    "roles/compute.instanceAdmin.v1",   # Destroy VMs
+    "roles/storage.admin",              # Destroy storage buckets
+    "roles/cloudsql.admin",             # Destroy Cloud SQL instances
+    "roles/iam.serviceAccountUser",     # Use service accounts
+    "roles/storage.objectAdmin",        # Read/write files in GCS
+    "roles/secretmanager.secretAccessor", # Access secrets
+    "roles/pubsub.admin",               # Delete Pub/Sub topics
+    "roles/cloudscheduler.admin",       # Delete scheduler jobs
+    "roles/cloudbuild.builds.builder"   # Delete Cloud Build triggers from old deployments
   ])
-  
+
   project = var.project_id
   role    = each.value
   member  = "serviceAccount:${google_service_account.teardown.email}"

From 1395ff4b50004eddfef60a9b085e5f68e0b3735e Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sat, 30 May 2026 17:16:34 -0700
Subject: [PATCH 05/31] cli: deploy/destroy lifecycle robustness and
 minikube/GKE commands

Add gcloud auth, ADC, and region preflight checks, keep the Terraform workspace name consistent across deploy, destroy, get-urls, and status, surface terraform stderr on failure, preserve state on a failed destroy, and clean up the Artifact Registry repo and Cloud Build staging bucket. Fix the generate overwrite flag and add the minikube and GKE commands with namespace support, persistent storage, and self-cleaning teardown.
---
 src/deployml/cli/cli.py             | 737 +++++++++++++++++++++++-----
 src/deployml/enum/cloud_provider.py |   5 +-
 src/deployml/utils/constants.py     |  14 +
 src/deployml/utils/helpers.py       | 134 +++++
 4 files changed, 768 insertions(+), 122 deletions(-)

diff --git a/src/deployml/cli/cli.py b/src/deployml/cli/cli.py
index fe9dc14..46da55b 100644
--- a/src/deployml/cli/cli.py
+++ b/src/deployml/cli/cli.py
@@ -14,6 +14,7 @@
     ANIMAL_NAMES,
     FALLBACK_WORDS,
     REQUIRED_GCP_APIS,
+    REQUIRED_GCP_IAM_ROLES,
 )
 from deployml.enum.cloud_provider import CloudProvider
 from jinja2 import Environment, FileSystemLoader
@@ -30,6 +31,13 @@
 from deployml.utils.helpers import (
     check,
     check_gcp_auth,
+    check_gcp_adc,
+    check_bq,
+    get_terraform_version,
+    validate_gcp_project,
+    validate_gcp_region,
+    get_missing_iam_roles,
+    check_docker_daemon,
     copy_modules_to_workspace,
     bucket_exists,
     generate_bucket_name,
@@ -458,6 +466,47 @@ def upload_resource_manifest(manifest: dict, terraform_dir: Path, project_id: st
 import json
 from datetime import datetime, timedelta
 
+def _load_config_or_exit(config_path: Path) -> dict:
+    """Load YAML config with clean error messages. Exits non-zero on failure."""
+    try:
+        data = yaml.safe_load(config_path.read_text())
+    except yaml.YAMLError as e:
+        typer.secho(f" Config file is not valid YAML: {e}", fg=typer.colors.RED)
+        raise typer.Exit(code=1)
+    if not isinstance(data, dict):
+        typer.secho(
+            f" Config file must be a YAML mapping at the top level, got {type(data).__name__}.",
+            fg=typer.colors.RED,
+        )
+        raise typer.Exit(code=1)
+    return data
+
+
+_SUPPORTED_PROVIDERS = {"gcp", "aws", "azure"}
+
+
+def _validate_deploy_config_or_exit(config: dict) -> None:
+    """Validate the fields deploy and destroy need. Exits non-zero on missing or bad values."""
+    provider = config.get("provider")
+    if not isinstance(provider, dict):
+        typer.secho(" Config is missing required field: provider (mapping).", fg=typer.colors.RED)
+        raise typer.Exit(code=1)
+    name = provider.get("name")
+    if name not in _SUPPORTED_PROVIDERS:
+        typer.secho(
+            f" provider.name must be one of {sorted(_SUPPORTED_PROVIDERS)}, got {name!r}.",
+            fg=typer.colors.RED,
+        )
+        raise typer.Exit(code=1)
+    if name == "gcp" and not provider.get("project_id"):
+        typer.secho(" GCP config is missing provider.project_id.", fg=typer.colors.RED)
+        raise typer.Exit(code=1)
+    deployment = config.get("deployment")
+    if not isinstance(deployment, dict) or not deployment.get("type"):
+        typer.secho(" Config is missing required field: deployment.type.", fg=typer.colors.RED)
+        raise typer.Exit(code=1)
+
+
 def get_version():
     """Get version from package metadata"""
     try:
@@ -520,9 +569,15 @@ def doctor(
     else:
         typer.secho("\n Docker is not installed", fg=typer.colors.RED)
 
-    # Terraform
+    # Terraform with version gate (need >= 1.0)
     if terraform_installed:
-        typer.secho("\n Terraform is installed", fg=typer.colors.GREEN)
+        tf_ver = get_terraform_version()
+        if tf_ver and tf_ver[0] >= 1:
+            typer.secho(f"\n Terraform {'.'.join(map(str, tf_ver))} (>= 1.0)", fg=typer.colors.GREEN)
+        elif tf_ver:
+            typer.secho(f"\n Terraform {'.'.join(map(str, tf_ver))} found, but requires >= 1.0", fg=typer.colors.RED)
+        else:
+            typer.secho("\n Terraform installed, version unknown", fg=typer.colors.YELLOW)
     else:
         typer.secho("\n Terraform is not installed", fg=typer.colors.RED)
 
@@ -543,11 +598,21 @@ def doctor(
             "\n GCP CLI installed and authenticated", fg=typer.colors.GREEN
         )
         # Check enabled GCP APIs
+        # ADC and bq are required for client libs and BigQuery work
+        if check_gcp_adc():
+            typer.secho("\n GCP Application Default Credentials configured", fg=typer.colors.GREEN)
+        else:
+            typer.secho("\n GCP Application Default Credentials NOT configured", fg=typer.colors.RED)
+            typer.echo("   Fix: gcloud auth application-default login")
+        if check_bq():
+            typer.secho("\n bq CLI is installed", fg=typer.colors.GREEN)
+        else:
+            typer.secho("\n bq CLI not installed", fg=typer.colors.YELLOW)
+            typer.echo("   Fix: gcloud components install bq")
         if not project_id:
-            project_id = typer.prompt(
-                "Enter your GCP Project ID to check enabled APIs",
-                default="",
-                show_default=False,
+            typer.secho(
+                "\nSKIP: API and IAM checks need --project-id. Re-run as: deployml doctor --project-id YOUR_GCP_PROJECT_ID",
+                fg=typer.colors.YELLOW,
             )
         if project_id:
             project_id = project_id.strip()
@@ -592,11 +657,21 @@ def doctor(
                     typer.echo(
                         "You can enable them with: deployml init --provider gcp --project-id <PROJECT_ID>"
                     )
-        elif project_id:  # Empty string after stripping
-            typer.secho(
-                "\nWARNING: No project ID provided. Skipping API check.",
-                fg=typer.colors.YELLOW,
-            )
+            # IAM role probe for the same project
+            missing_roles = get_missing_iam_roles(project_id, REQUIRED_GCP_IAM_ROLES)
+            if not missing_roles:
+                typer.secho(
+                    f"\n IAM roles on {project_id}: all required roles present",
+                    fg=typer.colors.GREEN,
+                )
+            else:
+                typer.secho(
+                    f"\nWARNING: Missing IAM roles on {project_id}:",
+                    fg=typer.colors.YELLOW,
+                )
+                for r in missing_roles:
+                    typer.echo(f"  - {r}")
+                typer.echo("   Fix: grant roles/owner OR each role via gcloud projects add-iam-policy-binding")
     elif gcp_installed:
         typer.secho(
             "\nWARNING: GCP CLI installed but not authenticated",
@@ -619,13 +694,22 @@ def doctor(
 @cli.command()
 def vm():
     """
-    Create a new Virtual Machine (VM) deployment.
+    Create a new Virtual Machine (VM) deployment. NOT YET IMPLEMENTED.
     """
-    pass
+    typer.secho(
+        " The `vm` command is not yet implemented. Use `deployml deploy` with "
+        "deployment.type: cloud_vm in config.yaml.",
+        fg=typer.colors.YELLOW,
+    )
+    raise typer.Exit(code=1)
 
 
 @cli.command()
-def generate():
+def generate(
+    force: bool = typer.Option(
+        False, "--force", "-f", help="Overwrite an existing config without confirming"
+    ),
+):
     """
     Generate a deployment configuration YAML file interactively.
     """
@@ -712,15 +796,12 @@ def generate():
 
     # Write configuration to file
     config_filename = "config.yaml"
-    
-    if not config_filename.exists():
-        typer.secho(
-            "config.yaml not found. Run 'mlops-infra init' first.",
-            fg=typer.colors.RED,
-        )
-        raise typer.Exit(code=1)
+    config_path = Path(config_filename)
 
-    if not force:
+    # Confirm only if the file already exists. Earlier code mistakenly called
+    # .exists() on a string and crashed for every user; now it gates on the
+    # real overwrite case.
+    if config_path.exists() and not force:
         confirm = typer.confirm(
             "This will overwrite the existing config.yaml. Continue?"
         )
@@ -753,27 +834,22 @@ def terraform(
     """
     Run Terraform actions (plan, apply, destroy) for the specified stack configuration.
     """
-    print(action)
     if action not in ["plan", "apply", "destroy"]:
         typer.secho(
             f" Invalid action: {action}. Use: plan, apply, destroy",
             fg=typer.colors.RED,
         )
+        raise typer.Exit(code=1)
 
     config_path = Path(stack_config_path)
-
-    print(config_path)
-    try:
-        with open(config_path, "r") as f:
-            config = yaml.safe_load(f)
-
-    except Exception as e:
-        typer.secho(
-            f" Failed to load configuration: {e}", fg=typer.colors.RED
-        )
+    if not config_path.exists():
+        typer.secho(f" Config file not found: {config_path}", fg=typer.colors.RED)
+        raise typer.Exit(code=1)
+    config = _load_config_or_exit(config_path)
 
     if not output_dir:
-        output_dir = Path.cwd() / ".deployml" / "terraform" / config["name"]
+        workspace = config.get("name") or "default"
+        output_dir = Path.cwd() / ".deployml" / workspace / "terraform"
     else:
         output_dir = Path(output_dir)
 
@@ -800,7 +876,8 @@ def deploy(
         typer.echo(f" Config file not found: {config_path}")
         raise typer.Exit(code=1)
 
-    config = yaml.safe_load(config_path.read_text())
+    config = _load_config_or_exit(config_path)
+    _validate_deploy_config_or_exit(config)
 
     # --- GCS bucket existence and unique name logic ---
     cloud = config["provider"]["name"]
@@ -837,7 +914,9 @@ def deploy(
                             "postgresql"
                         )
 
-    workspace_name = config.get("name") or "development"
+    # Workspace name MUST match across deploy, get-urls, and destroy.
+    # All three default to "default" when config has no name set.
+    workspace_name = config.get("name") or "default"
 
     DEPLOYML_DIR = Path.cwd() / ".deployml" / workspace_name
     DEPLOYML_TERRAFORM_DIR = DEPLOYML_DIR / "terraform"
@@ -849,7 +928,27 @@ def deploy(
     DEPLOYML_TERRAFORM_DIR.mkdir(parents=True, exist_ok=True)
     DEPLOYML_MODULES_DIR.mkdir(parents=True, exist_ok=True)
 
+    # Project ID drift detection. If this workspace was previously deployed to a
+    # different project, fail fast so we do not orphan resources in the old one.
+    project_marker = DEPLOYML_DIR / ".project_id"
+    if cloud == "gcp":
+        if project_marker.exists():
+            previous_project = project_marker.read_text().strip()
+            if previous_project and previous_project != project_id:
+                typer.secho(
+                    f" Workspace '{workspace_name}' was previously deployed to project "
+                    f"'{previous_project}'. Config now points at '{project_id}'.",
+                    fg=typer.colors.RED,
+                )
+                typer.echo("  Run `deployml destroy` first to clean up the old project,")
+                typer.echo("  or change `name:` in config.yaml to use a fresh workspace.")
+                raise typer.Exit(code=1)
+        project_marker.write_text(project_id)
+
     region = config["provider"]["region"]
+    if cloud == "gcp" and not validate_gcp_region(region, project_id):
+        typer.secho(f" Region '{region}' is not valid for GCP. Run: gcloud compute regions list", fg=typer.colors.RED)
+        raise typer.Exit(code=1)
     deployment_type = config["deployment"]["type"]
     stack = config["stack"]
 
@@ -888,10 +987,11 @@ def deploy(
             connect_to_gke_cluster,
         )
         
-        # Connect to GKE cluster
-        if not connect_to_gke_cluster(project_id, cluster_name, zone, region_gke):
+        # Connect to the cluster only when we will actually apply. --generate-only
+        # renders manifests offline, so it can run before the cluster exists.
+        if not generate_only and not connect_to_gke_cluster(project_id, cluster_name, zone, region_gke):
             raise typer.Exit(code=1)
-        
+
         # Process stack and generate manifests
         mlflow_manifest_dir = None
         fastapi_manifest_dir = None
@@ -901,7 +1001,11 @@ def deploy(
                 if stage_name == "experiment_tracking" and tool.get("name") == "mlflow":
                     params = tool.get("params", {})
                     image = params.get("image", f"gcr.io/{project_id}/mlflow/mlflow:latest")
-                    backend_uri = params.get("backend_store_uri", "sqlite:///mlflow.db")
+                    # Leave as None when unset so the GKE generator picks its
+                    # persistent default (sqlite on the mounted PVC). A hardcoded
+                    # sqlite:///mlflow.db here would override it with an ephemeral,
+                    # container-local store that is wiped on every pod restart.
+                    backend_uri = params.get("backend_store_uri")
                     artifact_root = params.get("artifact_root")
                     
                     mlflow_manifest_dir = manifests_dir / "mlflow"
@@ -930,6 +1034,14 @@ def deploy(
                         push_image=not image.startswith("gcr.io/"),
                     )
         
+        # --generate-only stops here: manifests are rendered but not applied.
+        # The documented flow is to then apply them with `deployml gke-apply`.
+        if generate_only:
+            typer.echo("\n Manifests generated (not applied).")
+            typer.echo(f" Manifests saved to: {manifests_dir}")
+            typer.echo(f" Apply with: deployml gke-apply --config-path {config_path}")
+            return
+
         # Deploy manifests
         if mlflow_manifest_dir and mlflow_manifest_dir.exists():
             typer.echo(f"\n Deploying MLflow to GKE...")
@@ -1043,6 +1155,10 @@ def deploy(
         "wandb": "wandb",
     }
     _ar_base = f"{region}-docker.pkg.dev/{project_id}/mlops-images"
+    # Tag defaults to the deployml version so deploys are reproducible.
+    # Users can pin to anything (a git SHA, a date string, a release tag) via
+    # config.provider.image_tag. Avoid :latest in production paths.
+    _image_tag = config.get("provider", {}).get("image_tag") or f"v{get_version()}"
     for stage in stack:
         for stage_name, tool in stage.items():
             tool_name = tool.get("name", "")
@@ -1051,13 +1167,13 @@ def deploy(
             if not existing_image or existing_image.startswith("gcr.io/"):
                 image_name = _TOOL_IMAGE_NAMES.get(tool_name)
                 if image_name:
-                    params["image"] = f"{_ar_base}/{image_name}:latest"
-            # Cron job images are per-job and must be set explicitly — skip here
+                    params["image"] = f"{_ar_base}/{image_name}:{_image_tag}"
+            # Cron job images are per-job: default any job with no image (or a gcr.io one).
             if stage_name == "workflow_orchestration" and tool_name == "cron":
                 for job in params.get("jobs", []):
                     if not job.get("image") or job.get("image", "").startswith("gcr.io/"):
                         job_name = job.get("service_name", "")
-                        job["image"] = f"{_ar_base}/{job_name}:latest"
+                        job["image"] = f"{_ar_base}/{job_name}:{_image_tag}"
 
     env = Environment(loader=FileSystemLoader(TEMPLATE_DIR))
     # PATCH: Use wandb_main.tf.j2 or mlflow_main.tf.j2 for cloud_run if present
@@ -1104,8 +1220,11 @@ def deploy(
     
     if teardown_enabled:
         duration_hours = teardown_config.get("duration_hours", 24)
-        deployed_at = datetime.utcnow()
-        # Add buffer to ensure schedule is in future (will be updated to exact time after deployment)
+        # Use timezone-aware UTC. datetime.utcnow() returns a naive datetime,
+        # and .timestamp() on a naive datetime treats it as local time,
+        # corrupting the schedule by the local TZ offset.
+        from datetime import timezone as _tz
+        deployed_at = datetime.now(_tz.utc)
         teardown_at = deployed_at + timedelta(hours=duration_hours, minutes=10)
         teardown_scheduled_timestamp = int(teardown_at.timestamp())
         teardown_cron_schedule = calculate_cron_from_timestamp(teardown_scheduled_timestamp)
@@ -1164,8 +1283,8 @@ def deploy(
     (DEPLOYML_TERRAFORM_DIR / "variables.tf").write_text(variables_tf)
     (DEPLOYML_TERRAFORM_DIR / "terraform.tfvars").write_text(tfvars_content)
 
-    # Deploy
-    typer.echo(f" Deploying {config['name']} to {cloud}...")
+    # Deploy. Falls back to workspace_name when config has no top-level 'name'.
+    typer.echo(f" Deploying {config.get('name', workspace_name)} to {cloud}...")
 
     if not check_gcp_auth():
         typer.echo(" Authenticating with GCP...")
@@ -1180,13 +1299,19 @@ def deploy(
     )
 
     typer.echo(" Initializing Terraform...")
-    # Suppress output of terraform init
-    subprocess.run(
+    # Capture stderr so init failures (state lock, missing ADC, bucket perms)
+    # surface a real message instead of a silent exit.
+    init_proc = subprocess.run(
         ["terraform", "init"],
         cwd=DEPLOYML_TERRAFORM_DIR,
-        stdout=subprocess.DEVNULL,
-        stderr=subprocess.DEVNULL,
+        capture_output=True,
+        text=True,
     )
+    if init_proc.returncode != 0:
+        typer.secho(" Terraform init failed.", fg=typer.colors.RED)
+        if init_proc.stderr.strip():
+            typer.echo(init_proc.stderr.strip())
+        raise typer.Exit(code=1)
 
     typer.echo(" Planning deployment...")
     result = subprocess.run(
@@ -1267,13 +1392,18 @@ def deploy(
     if yes or typer.confirm(confirmation_msg):
         estimated_time = estimate_terraform_time(result.stdout, "apply")
         typer.echo(f" Applying changes... (Estimated time: {estimated_time})")
-        # Suppress output of terraform init
-        subprocess.run(
+        # Re-init before apply; capture stderr to surface failures
+        init_proc2 = subprocess.run(
             ["terraform", "init"],
             cwd=DEPLOYML_TERRAFORM_DIR,
-            stdout=subprocess.DEVNULL,
-            stderr=subprocess.DEVNULL,
+            capture_output=True,
+            text=True,
         )
+        if init_proc2.returncode != 0:
+            typer.secho(" Terraform init failed before apply.", fg=typer.colors.RED)
+            if init_proc2.stderr.strip():
+                typer.echo(init_proc2.stderr.strip())
+            raise typer.Exit(code=1)
         # Parse estimated minutes from string (e.g., '~20 minutes ...')
         import re as _re
 
@@ -1314,11 +1444,11 @@ def deploy(
             # Handle auto-teardown metadata and update scheduler schedule
             if teardown_enabled:
                 duration_hours = teardown_config.get("duration_hours", 24)
-                # Calculate teardown time AFTER deployment completes (not before)
-                deployed_at = datetime.utcnow()
+                # Timezone-aware UTC. Avoid datetime.utcnow() to keep .timestamp() correct.
+                from datetime import timezone as _tz
+                deployed_at = datetime.now(_tz.utc)
                 teardown_at = deployed_at + timedelta(hours=duration_hours)
-                
-                # Calculate the correct cron schedule based on actual deployment completion time
+
                 teardown_scheduled_timestamp = int(teardown_at.timestamp())
                 correct_cron_schedule = calculate_cron_from_timestamp(teardown_scheduled_timestamp)
                 time_zone = teardown_config.get("time_zone", "UTC")
@@ -1462,6 +1592,9 @@ def get_urls(
     env_path: Path = typer.Option(
         Path(".env"), "--env-path", help="Path to write .env file"
     ),
+    show_secrets: bool = typer.Option(
+        False, "--show-secrets", help="Fetch and print Grafana admin password and MLflow DSN connection hint. Uses gcloud secrets versions access."
+    ),
 ):
     """
     Print service URLs from the last deployment and write them to a .env file.
@@ -1470,8 +1603,8 @@ def get_urls(
         typer.echo(f" Config file not found: {config_path}")
         raise typer.Exit(code=1)
 
-    config = yaml.safe_load(config_path.read_text())
-    workspace_name = config.get("name") or "development"
+    config = _load_config_or_exit(config_path)
+    workspace_name = config.get("name") or "default"
     project_id = config.get("provider", {}).get("project_id", "")
     terraform_dir = Path.cwd() / ".deployml" / workspace_name / "terraform"
 
@@ -1536,6 +1669,33 @@ def get_urls(
     env_path.write_text("\n".join(env_lines) + "\n")
     typer.echo(f"\n .env written to {env_path.resolve()}")
 
+    if show_secrets:
+        typer.secho("\n Secrets:", fg=typer.colors.YELLOW, bold=True)
+        # Grafana admin password
+        grafana_secret = outputs.get("grafana_admin_password_secret_id", {}).get("value", "")
+        if grafana_secret and project_id:
+            fetch = subprocess.run(
+                ["gcloud", "secrets", "versions", "access", "latest",
+                 "--secret", grafana_secret, "--project", project_id],
+                capture_output=True, text=True,
+            )
+            if fetch.returncode == 0:
+                typer.echo(f"  grafana_admin_user: admin")
+                typer.echo(f"  grafana_admin_password: {fetch.stdout.strip()}")
+            else:
+                typer.echo(f"  grafana_admin_password: (fetch failed: {fetch.stderr.strip()})")
+        # MLflow DSN. Public IP is blocked; print the Cloud SQL Auth Proxy steps.
+        instance = outputs.get("instance_connection_name", {}).get("value", "")
+        dsn_secret = outputs.get("mlflow_dsn_secret_id", {}).get("value", "")
+        if instance and dsn_secret and project_id:
+            typer.echo("")
+            typer.echo("  To connect to MLflow Postgres from your laptop, run the Cloud SQL Auth Proxy:")
+            typer.echo(f"    cloud-sql-proxy {instance} --port=5432")
+            typer.echo("  Install the proxy if you do not have it:")
+            typer.echo("    https://cloud.google.com/sql/docs/postgres/sql-proxy#install")
+            typer.echo("  Fetch the DSN with:")
+            typer.echo(f"    gcloud secrets versions access latest --secret={dsn_secret} --project={project_id}")
+
 
 @cli.command()
 def destroy(
@@ -1559,7 +1719,7 @@ def destroy(
         typer.echo(f" Config file not found: {config_path}")
         raise typer.Exit(code=1)
 
-    config = yaml.safe_load(config_path.read_text())
+    config = _load_config_or_exit(config_path)
 
     # Determine workspace name (same logic as deploy)
     workspace_name = config.get("name") or "default"
@@ -1576,6 +1736,8 @@ def destroy(
         )
         return
 
+    _validate_deploy_config_or_exit(config)
+
     # Extract project info
     cloud = config["provider"]["name"]
     if cloud == "gcp":
@@ -1660,14 +1822,48 @@ def destroy(
         if result.returncode == 0:
             typer.echo(" Infrastructure destroyed successfully!")
 
+            # Clean up the Artifact Registry repo created by build-images.
+            # Terraform does not manage it, so without this it lingers and bills.
+            region = config.get("provider", {}).get("region", "us-central1")
+            ar_repo = "mlops-images"
+            typer.echo(f" Removing Artifact Registry repo {ar_repo}...")
+            subprocess.run(
+                ["gcloud", "artifacts", "repositories", "delete", ar_repo,
+                 "--location", region, "--project", project_id, "--quiet"],
+                capture_output=True,
+            )
+
+            # Clean up the Cloud Build staging bucket that `gcloud builds submit`
+            # auto-creates during build-images. It is not Terraform-managed and
+            # accumulates source tarballs across cycles. Best-effort; Cloud Build
+            # recreates it on the next build if needed.
+            cb_bucket = f"gs://{project_id}_cloudbuild"
+            typer.echo(f" Removing Cloud Build staging bucket {cb_bucket}...")
+            subprocess.run(
+                ["gcloud", "storage", "rm", "--recursive", cb_bucket, "--quiet"],
+                capture_output=True,
+            )
+
             if clean_workspace:
                 typer.echo(" Cleaning workspace...")
                 shutil.rmtree(DEPLOYML_DIR)
                 typer.echo(" Workspace cleaned")
-            elif typer.confirm("Clean up Terraform state files?"):
+            elif yes or typer.confirm("Clean up Terraform state files?"):
+                # --yes propagates to the cleanup confirm so scripted runs do not hang
                 cleanup_terraform_files(DEPLOYML_TERRAFORM_DIR)
         else:
-            typer.echo(f" Destroy failed: {result.stderr}")
+            # PRESERVE state on partial failure so a re-run can reconcile.
+            typer.secho(
+                f"\n Destroy failed with exit code {result.returncode}. "
+                "Terraform state preserved at:",
+                fg=typer.colors.RED,
+            )
+            typer.echo(f"   {DEPLOYML_TERRAFORM_DIR}")
+            typer.echo("\nRecovery:")
+            typer.echo("  1. Inspect residual resources: gcloud asset search-all-resources "
+                       f"--scope=projects/{project_id}")
+            typer.echo(f"  2. Re-run: deployml destroy --yes")
+            typer.echo(f"  3. Or delete the whole project: gcloud projects delete {project_id}")
             raise typer.Exit(code=1)
 
     except Exception as e:
@@ -1676,19 +1872,58 @@ def destroy(
 
 
 @cli.command()
-def status():
+def status(
+    config_path: Path = typer.Option(
+        Path("config.yaml"), "--config-path", "-c", help="Path to YAML config file"
+    ),
+):
     """
-    Check the deployment status of the current workspace.
+    Show the current workspace, whether a deployment exists, and the latest service URLs.
     """
-    typer.echo("Checking deployment status...")
+    if not config_path.exists():
+        typer.echo(f" Config file not found: {config_path}")
+        raise typer.Exit(code=1)
+    config = _load_config_or_exit(config_path)
+    workspace_name = config.get("name") or "default"  # configs without a name use "default"
+    deployml_dir = Path.cwd() / ".deployml" / workspace_name
+    tf_dir = deployml_dir / "terraform"
+    typer.echo(f"Workspace: {workspace_name}")
+    typer.echo(f"Path: {deployml_dir}")
+    if not tf_dir.exists():
+        typer.secho("Status: not deployed (no terraform workspace found)", fg=typer.colors.YELLOW)
+        raise typer.Exit(code=0)  # "not deployed" is a normal result, so exit successfully
+    marker = deployml_dir / ".project_id"  # optional file; its text is printed as the project
+    if marker.exists():
+        typer.echo(f"Project: {marker.read_text().strip()}")
+    out_proc = subprocess.run(
+        ["terraform", "output", "-json"],
+        cwd=tf_dir, capture_output=True, text=True,
+    )
+    if out_proc.returncode == 0 and out_proc.stdout.strip():
+        try:  # any error raised below is reported as "not parseable" rather than crashing
+            outputs = json.loads(out_proc.stdout)  # next line keeps string values starting with "http"
+            urls = {k: v.get("value") for k, v in outputs.items() if isinstance(v.get("value"), str) and v.get("value", "").startswith("http")}
+            if urls:
+                typer.secho("Status: deployed", fg=typer.colors.GREEN)
+                for k, v in urls.items():
+                    typer.echo(f"  {k}: {v}")
+            else:
+                typer.secho("Status: workspace exists but no URL outputs found", fg=typer.colors.YELLOW)
+        except Exception:
+            typer.secho("Status: workspace exists but terraform output is not parseable", fg=typer.colors.YELLOW)
+    else:  # also reached when terraform exits non-zero, not only when stdout is empty
+        typer.secho("Status: workspace exists but terraform output is empty", fg=typer.colors.YELLOW)
 
 
 @cli.command()
 def teardown(
-    action: str = typer.Argument(..., help="Action: cancel, status, or schedule"),
+    action: str = typer.Argument(..., help="Action: cancel, status, update, or schedule"),
     config_path: Path = typer.Option(
         ..., "--config-path", "-c", help="Path to YAML config file"
     ),
+    hours: int = typer.Option(
+        24, "--hours", help="Hours until teardown. Used by schedule and update."
+    ),
 ):
     """
     Manage auto-teardown: cancel scheduled teardown, check status, update schedule, or schedule new teardown.
@@ -1696,19 +1931,19 @@ def teardown(
     if not config_path.exists():
         typer.echo(f" Config file not found: {config_path}")
         raise typer.Exit(code=1)
-    
-    config = yaml.safe_load(config_path.read_text())
+
+    config = _load_config_or_exit(config_path)
     workspace_name = config.get("name") or "default"
     DEPLOYML_DIR = Path.cwd() / ".deployml" / workspace_name
-    
+
     if action == "cancel":
         cancel_teardown(config, DEPLOYML_DIR, workspace_name)
     elif action == "status":
         show_teardown_status(config, DEPLOYML_DIR, workspace_name)
     elif action == "update":
-        update_teardown_schedule(config, DEPLOYML_DIR, workspace_name)
+        update_teardown_schedule(config, DEPLOYML_DIR, workspace_name, hours)
     elif action == "schedule":
-        schedule_teardown(config, DEPLOYML_DIR, workspace_name)
+        schedule_teardown(config, DEPLOYML_DIR, workspace_name, hours)
     else:
         typer.echo(f" Unknown action: {action}. Use: cancel, status, update, or schedule")
         raise typer.Exit(code=1)
@@ -1719,11 +1954,12 @@ def cancel_teardown(config: dict, deployml_dir: Path, workspace_name: str):
     project_id = config["provider"]["project_id"]
     region = config["provider"]["region"]
     
-    # Delete Cloud Scheduler job
+    # Delete Cloud Scheduler job. Cloud Scheduler uses --location, not --region.
+    # Earlier code passed --region which gcloud rejects, so cancel silently failed.
     scheduler_job_name = f"deployml-teardown-{workspace_name}"
     result = subprocess.run(
         ["gcloud", "scheduler", "jobs", "delete", scheduler_job_name,
-         "--project", project_id, "--region", region, "--quiet"],
+         "--project", project_id, "--location", region, "--quiet"],
         capture_output=True,
         text=True,
     )
@@ -1837,7 +2073,7 @@ def show_teardown_status(config: dict, deployml_dir: Path, workspace_name: str):
     typer.echo(f"   View in Console: https://console.cloud.google.com/cloudscheduler/jobs/edit/{region}/{scheduler_job_name}?project={project_id}")
 
 
-def update_teardown_schedule(config: dict, deployml_dir: Path, workspace_name: str):
+def update_teardown_schedule(config: dict, deployml_dir: Path, workspace_name: str, duration_hours: int = 24):
     """Update the scheduled teardown time."""
     project_id = config["provider"]["project_id"]
     region = config["provider"]["region"]
@@ -1878,9 +2114,7 @@ def update_teardown_schedule(config: dict, deployml_dir: Path, workspace_name: s
     except Exception:
         pass
     
-    # Get new duration
-    duration_hours = typer.prompt("Hours until new teardown time", default=24, type=int)
-    
+    # Duration is now passed in via CLI flag instead of interactive prompt.
     if duration_hours < 0:
         typer.echo(" Duration must be positive")
         raise typer.Exit(code=1)
@@ -1894,13 +2128,7 @@ def update_teardown_schedule(config: dict, deployml_dir: Path, workspace_name: s
     new_cron_schedule = calculate_cron_from_timestamp(teardown_scheduled_timestamp)
     
     typer.echo(f"\n New Schedule: {teardown_at.strftime('%Y-%m-%d %H:%M:%S UTC')}")
-    
-    # Confirm update
-    confirm = typer.confirm("Update the teardown schedule?", default=True)
-    if not confirm:
-        typer.echo(" Update cancelled")
-        return
-    
+
     # Update Cloud Scheduler job
     typer.echo("\n Updating Cloud Scheduler job...")
     typer.echo(f"   Cron schedule: {new_cron_schedule}")
@@ -1970,10 +2198,10 @@ def update_teardown_schedule(config: dict, deployml_dir: Path, workspace_name: s
     save_deployment_metadata(deployml_dir, metadata)
 
 
-def schedule_teardown(config: dict, deployml_dir: Path, workspace_name: str):
+def schedule_teardown(config: dict, deployml_dir: Path, workspace_name: str, duration_hours: int = 24):
     """Schedule a new teardown."""
-    duration_hours = typer.prompt("Hours until teardown", default=24, type=int)
-    deployed_at = datetime.utcnow()
+    from datetime import timezone as _tz
+    deployed_at = datetime.now(_tz.utc)
     teardown_at = deployed_at + timedelta(hours=duration_hours)
     
     metadata = {
@@ -2019,6 +2247,16 @@ def init(
         if not project_id:
             typer.echo(" --project-id is required for GCP.")
             raise typer.Exit(code=1)
+        if not check_gcp_auth():
+            typer.secho(" gcloud is not authenticated. Run: gcloud auth login", fg=typer.colors.RED)
+            raise typer.Exit(code=1)
+        if not check_gcp_adc():
+            typer.secho(" Application Default Credentials missing. Run: gcloud auth application-default login", fg=typer.colors.RED)
+            raise typer.Exit(code=1)
+        if not validate_gcp_project(project_id):
+            typer.secho(f" Project '{project_id}' not found or not accessible by your gcloud account.", fg=typer.colors.RED)
+            typer.echo("   Verify with: gcloud projects describe " + project_id)
+            raise typer.Exit(code=1)
         typer.echo(
             f" Enabling required GCP APIs for project: {project_id} ..."
         )
@@ -2068,22 +2306,50 @@ def init(
                 f"{config_path} already exists. Use --overwrite to replace."
             )
 
-        config_template = {
-            "# Run `mlops-infra generate` to create your config"
-        }
+        # Write a runnable starter config so the user can deploy immediately
+        # after build-images. Earlier code wrote a Python set literal here,
+        # which yaml.dump serialized as `!!set` and broke deploy.
+        if provider == "gcp":
+            config_template = {
+                "name": f"{provider}-mlops-stack-mlflow",
+                "provider": {
+                    "name": provider,
+                    "project_id": project_id,
+                    "region": "us-west1",
+                    "image_tag": f"v{get_version()}",
+                },
+                "deployment": {"type": "cloud_run"},
+                "stack": [
+                    {"experiment_tracking": {"name": "mlflow", "params": {"service_name": "mlflow-server"}}},
+                    {"artifact_tracking": {"name": "mlflow", "params": {"artifact_bucket": f"mlflow-artifacts-{project_id}"}}},
+                    {"model_registry": {"name": "mlflow", "params": {"backend_store_uri": "postgresql"}}},
+                    {"model_serving": {"name": "fastapi", "params": {"service_name": "fastapi-mlflow-server"}}},
+                    {"model_monitoring": {"name": "grafana", "params": {"service_name": "grafana-server"}}},
+                ],
+            }
+        else:
+            # AWS and Azure scaffolds. The full stack is not yet implemented for
+            # these providers, but the file is at least valid YAML the user can extend.
+            config_template = {
+                "name": f"{provider}-mlops-stack",
+                "provider": {"name": provider, "project_id": project_id, "region": ""},
+                "deployment": {"type": ""},
+                "stack": [],
+            }
 
         with open(config_path, "w") as f:
-            yaml.dump(config_template, f, sort_keys=False)
+            yaml.dump(config_template, f, sort_keys=False, default_flow_style=False)
 
         typer.secho("Project initialized successfully.", fg=typer.colors.GREEN)
         typer.echo()
         typer.echo("Created:")
         typer.echo("  - docker/")
-        typer.echo("  - config.yaml")
+        typer.echo(f"  - config.yaml  (runnable starter for {provider})")
         typer.echo()
         typer.echo("Next steps:")
-        typer.echo("  1. Edit config.yaml")
-        typer.echo("  2. Build images with: mlops-infra build-images --docker-root docker")
+        typer.echo("  1. Review config.yaml")
+        typer.echo("  2. deployml build-images --create-repo")
+        typer.echo("  3. deployml deploy --verbose")
 
     except Exception as e:
         typer.secho(f"Error: {e}", fg=typer.colors.RED)
@@ -2142,6 +2408,11 @@ def minikube_deploy(
         None, "--image-name", "-i",
         help="Docker image name to load into minikube (auto-detected from deployment.yaml if not provided)"
     ),
+    namespace: Optional[str] = typer.Option(
+        None, "--namespace", "-n",
+        help="Kubernetes namespace to deploy into. Defaults to the default namespace. "
+        "Use the same namespace for MLflow and FastAPI so service DNS resolves."
+    ),
 ):
     """
     Deploy FastAPI to minikube using kubectl apply.
@@ -2150,15 +2421,15 @@ def minikube_deploy(
     if not manifest_dir.exists():
         typer.echo(f"Directory not found: {manifest_dir}")
         raise typer.Exit(code=1)
-    
+
     if not check_minikube_running():
         typer.echo("Minikube is not running. Start it first:")
         typer.echo("   minikube start")
         typer.echo("   OR")
         typer.echo("   deployml minikube-init --start-cluster")
         raise typer.Exit(code=1)
-    
-    success = deploy_fastapi_to_minikube(manifest_dir, image_name=image_name)
+
+    success = deploy_fastapi_to_minikube(manifest_dir, image_name=image_name, namespace=namespace)
     
     if not success:
         raise typer.Exit(code=1)
@@ -2173,7 +2444,7 @@ def mlflow_init(
         ..., "--image", "-i", help="MLflow Docker image"
     ),
     backend_store_uri: Optional[str] = typer.Option(
-        None, "--backend-store-uri", "-b", help="Backend store URI (defaults to SQLite)"
+        None, "--backend-store-uri", "-b", help="Backend store URI. Default sqlite on the mounted PVC."
     ),
     artifact_root: Optional[str] = typer.Option(
         None, "--artifact-root", "-a", help="Artifact root path (defaults to /mlflow-artifacts)"
@@ -2182,10 +2453,18 @@ def mlflow_init(
         True, "--start-cluster/--no-start-cluster",
         help="Start minikube cluster if not running"
     ),
+    persistent_storage: bool = typer.Option(
+        True, "--persistent-storage/--ephemeral-storage",
+        help="Mount a PersistentVolumeClaim so sqlite and artifacts survive pod restarts. Default on."
+    ),
+    pvc_size: str = typer.Option(
+        "5Gi", "--pvc-size", help="PVC size when --persistent-storage is on."
+    ),
 ):
     """
     Initialize minikube and generate MLflow Kubernetes manifests.
-    Creates deployment.yaml and service.yaml in the specified directory.
+    Creates deployment.yaml, service.yaml, and (when --persistent-storage)
+    pvc.yaml in the specified directory.
     """
     if not check_minikube_running():
         if start_cluster:
@@ -2202,7 +2481,9 @@ def mlflow_init(
         output_dir=output_dir,
         image=image,
         backend_store_uri=backend_store_uri,
-        artifact_root=artifact_root
+        artifact_root=artifact_root,
+        use_pvc=persistent_storage,
+        pvc_size=pvc_size,
     )
     
     typer.echo("\nSetup complete! Next steps:")
@@ -2220,6 +2501,11 @@ def mlflow_deploy(
         None, "--image-name", "-i",
         help="Docker image name to load into minikube (auto-detected from deployment.yaml if not provided)"
     ),
+    namespace: Optional[str] = typer.Option(
+        None, "--namespace", "-n",
+        help="Kubernetes namespace to deploy into. Defaults to the default namespace. "
+        "Use the same namespace for MLflow and FastAPI so service DNS resolves."
+    ),
 ):
     """
     Deploy MLflow to minikube using kubectl apply.
@@ -2228,15 +2514,15 @@ def mlflow_deploy(
     if not manifest_dir.exists():
         typer.echo(f"Directory not found: {manifest_dir}")
         raise typer.Exit(code=1)
-    
+
     if not check_minikube_running():
         typer.echo("Minikube is not running. Start it first:")
         typer.echo("   minikube start")
         typer.echo("   OR")
         typer.echo("   deployml mlflow-init --start-cluster")
         raise typer.Exit(code=1)
-    
-    success = deploy_mlflow_to_minikube(manifest_dir, image_name=image_name)
+
+    success = deploy_mlflow_to_minikube(manifest_dir, image_name=image_name, namespace=namespace)
     
     if not success:
         raise typer.Exit(code=1)
@@ -2260,6 +2546,11 @@ def gke_deploy(
     region: Optional[str] = typer.Option(
         None, "--region", "-r", help="GKE cluster region"
     ),
+    namespace: Optional[str] = typer.Option(
+        None, "--namespace", "-n",
+        help="Kubernetes namespace to deploy into. Defaults to the default namespace. "
+        "Use the same namespace for MLflow and FastAPI so service DNS resolves."
+    ),
 ):
     """
     Deploy Kubernetes manifests to GKE cluster.
@@ -2268,23 +2559,184 @@ def gke_deploy(
     if not manifest_dir.exists():
         typer.echo(f"Directory not found: {manifest_dir}")
         raise typer.Exit(code=1)
-    
+
     if not zone and not region:
         typer.echo("Either --zone or --region must be provided")
         raise typer.Exit(code=1)
-    
+
     success = deploy_to_gke(
         manifest_dir=manifest_dir,
         cluster_name=cluster,
         project_id=project,
         zone=zone,
         region=region,
+        namespace=namespace,
     )
-    
+
     if not success:
         raise typer.Exit(code=1)
 
 
+@cli.command("gke-cluster-create")
+def gke_cluster_create(
+    cluster: str = typer.Option(..., "--cluster", "-c", help="Cluster name"),
+    project: str = typer.Option(..., "--project", "-p", help="GCP project ID"),
+    region: str = typer.Option("us-west1", "--region", "-r", help="Region for the cluster"),
+    autopilot: bool = typer.Option(
+        True, "--autopilot/--standard",
+        help="Use GKE Autopilot (default) or a standard regional cluster (1 node per zone).",
+    ),
+):
+    """
+    Create a GKE cluster. Thin wrapper around `gcloud container clusters create-auto`
+    (Autopilot, the default) or `create` (--standard). Exits with code 1 if gcloud fails.
+    """
+    if autopilot:
+        cmd = [
+            "gcloud", "container", "clusters", "create-auto", cluster,
+            "--region", region, "--project", project,
+        ]
+    else:
+        cmd = [
+            "gcloud", "container", "clusters", "create", cluster,
+            "--region", region, "--project", project,  # --region makes the cluster regional
+            "--num-nodes", "1", "--machine-type", "e2-medium",  # node count applies per zone
+        ]
+    typer.echo(f" Creating {'Autopilot' if autopilot else 'standard'} cluster {cluster}...")
+    typer.echo("   This typically takes 5 to 10 minutes.")
+    result = subprocess.run(cmd, capture_output=False, text=True)  # output is not captured
+    if result.returncode != 0:
+        raise typer.Exit(code=1)
+    typer.secho(f" Cluster {cluster} created.", fg=typer.colors.GREEN)
+    typer.echo(f"   Next: deployml gke-init --output-dir manifests --image gcr.io/{project}/... --project {project}")
+
+
+@cli.command("gke-destroy")
+def gke_destroy(
+    manifest_dir: Path = typer.Option(
+        ..., "--manifest-dir", "-d",
+        help="Directory containing deployment.yaml and service.yaml that were applied"
+    ),
+    cluster: str = typer.Option(
+        ..., "--cluster", "-c", help="GKE cluster name"
+    ),
+    project: str = typer.Option(
+        ..., "--project", "-p", help="GCP project ID"
+    ),
+    zone: Optional[str] = typer.Option(
+        None, "--zone", "-z", help="GKE cluster zone"
+    ),
+    region: Optional[str] = typer.Option(
+        None, "--region", "-r", help="GKE cluster region"
+    ),
+    namespace: Optional[str] = typer.Option(
+        None, "--namespace", "-n",
+        help="Namespace the manifests were applied to. Defaults to the default namespace."
+    ),
+    delete_cluster: bool = typer.Option(
+        False, "--delete-cluster",
+        help="Also delete the GKE cluster after removing manifests."
+    ),
+    keep_images: bool = typer.Option(
+        False, "--keep-images",
+        help="Keep the gcr.io image this workload used. By default it is deleted so "
+        "teardown is fully self-cleaning, matching the Cloud Run destroy behavior."
+    ),
+):
+    """
+    Remove deployml-managed manifests from a GKE cluster. Optionally delete the cluster.
+
+    Mirrors the Cloud Run `destroy` command for the GKE flow. Without `--delete-cluster`,
+    only the applied Services, Deployments, and PVC are removed; the cluster stays up. By
+    default the gcr.io image referenced by the deployment is also deleted; pass
+    --keep-images to keep it for a quick redeploy.
+    """
+    if not manifest_dir.exists():
+        typer.echo(f"Directory not found: {manifest_dir}")
+        raise typer.Exit(code=1)
+
+    if not zone and not region:
+        typer.echo("Either --zone or --region must be provided")
+        raise typer.Exit(code=1)
+
+    from deployml.utils.kubernetes_gke import connect_to_gke_cluster
+
+    if not connect_to_gke_cluster(project, cluster, zone, region):
+        raise typer.Exit(code=1)
+
+    # Delete in reverse order: service, then deployment, then PVC last. The PVC
+    # is deleted explicitly because its backing PersistentDisk bills even after
+    # the workload is gone (GKE's default storageclass reclaims on PVC delete).
+    ns = ["-n", namespace] if namespace and namespace != "default" else []
+    for fname in ["service.yaml", "deployment.yaml", "pvc.yaml"]:
+        f = manifest_dir / fname
+        if f.exists():
+            typer.echo(f" Deleting {fname}...")
+            result = subprocess.run(
+                ["kubectl", "delete", "-f", str(f), "--ignore-not-found"] + ns,
+                capture_output=True, text=True,
+            )
+            if result.returncode == 0:
+                typer.echo(f"   {result.stdout.strip() or 'deleted'}")
+            else:  # a failed delete is only reported; the remaining cleanup still runs
+                typer.secho(f"   {result.stderr.strip()}", fg=typer.colors.YELLOW)
+
+    # Remove the gcr.io image this workload referenced so it does not linger and
+    # bill, mirroring the Cloud Run destroy that removes the Artifact Registry repo.
+    # Best-effort; --keep-images opts out for iterative redeploys.
+    if not keep_images:
+        dep = manifest_dir / "deployment.yaml"
+        image = ""
+        if dep.exists():
+            try:
+                doc = yaml.safe_load(dep.read_text())
+                image = doc["spec"]["template"]["spec"]["containers"][0].get("image", "")
+            except Exception:  # unreadable or unexpected manifest: skip image cleanup
+                image = ""
+        if isinstance(image, str) and image.startswith("gcr.io/"):  # tolerate null/non-string image
+            typer.echo(f" Removing image {image}...")
+            subprocess.run(
+                ["gcloud", "container", "images", "delete", image,
+                 "--force-delete-tags", "--quiet", "--project", project],
+                capture_output=True,
+            )
+
+    if delete_cluster:
+        typer.echo(f"\n Deleting cluster {cluster}...")
+        cmd = ["gcloud", "container", "clusters", "delete", cluster,
+               "--project", project, "--quiet"]
+        if zone:
+            cmd += ["--zone", zone]
+        else:
+            cmd += ["--region", region]
+        # Deleting the Services above starts LoadBalancer teardown operations.
+        # GKE refuses a cluster delete while one is in flight with a 400
+        # "incompatible operation", which would otherwise leave the cluster
+        # billing. Retry until the in-flight operation clears.
+        loc = zone or region
+        for attempt in range(6):
+            result = subprocess.run(cmd, capture_output=True, text=True)
+            if result.returncode == 0:
+                typer.echo(f" Cluster {cluster} deleted")
+                break
+            if "incompatible operation" in (result.stderr or "").lower():
+                typer.echo("   Cluster busy with another operation, retrying in 20s...")
+                time.sleep(20)
+                continue
+            typer.secho(f" Cluster delete failed: {result.stderr}", fg=typer.colors.RED)
+            raise typer.Exit(code=1)
+        else:  # loop finished without break: every attempt hit the busy error
+            typer.secho(
+                f" Cluster delete still blocked after retries. Re-run: "
+                f"gcloud container clusters delete {cluster} --location {loc} "
+                f"--project {project} --quiet",
+                fg=typer.colors.RED,
+            )
+            raise typer.Exit(code=1)
+    else:
+        typer.echo("\n Cluster left running. Pass --delete-cluster to also remove it.")
+
+
 @cli.command()
 def gke_init(
     output_dir: Path = typer.Option(
@@ -2297,15 +2749,21 @@ def gke_init(
         ..., "--project", "-p", help="GCP project ID"
     ),
     service: str = typer.Option(
-        "mlflow", "--service", "-s", help="Service type: mlflow or fastapi"
+        "mlflow", "--service", "-s", help="Service type: mlflow, fastapi, or all"
     ),
     mlflow_uri: Optional[str] = typer.Option(
-        None, "--mlflow-uri", "-m", help="MLflow URI (for FastAPI)"
+        None, "--mlflow-uri", "-m", help="MLflow URI (for FastAPI). Only used when service is fastapi."
+    ),
+    mlflow_image: Optional[str] = typer.Option(
+        None, "--mlflow-image", help="MLflow image. Required when --service all. Defaults to --image when not provided."
     ),
 ):
     """
     Generate Kubernetes manifests for GKE.
-    Simple command: specify image, project, and service type.
+
+    With --service mlflow or --service fastapi, renders one set of manifests in
+    output_dir. With --service all, renders mlflow into output_dir/mlflow and
+    fastapi into output_dir/fastapi so you can deploy both halves of the stack.
     """
     if service == "mlflow":
         generate_mlflow_manifests_gke(
@@ -2324,8 +2782,30 @@ def gke_init(
             push_image=not image.startswith("gcr.io/"),
         )
         typer.echo(f"\nNext: deployml gke-deploy -d {output_dir} -c CLUSTER -p {project} -z ZONE")
+    elif service == "all":
+        ml_img = mlflow_image or image
+        ml_dir = output_dir / "mlflow"
+        fa_dir = output_dir / "fastapi"
+        generate_mlflow_manifests_gke(
+            output_dir=ml_dir,
+            image=ml_img,
+            project_id=project,
+            push_image=not ml_img.startswith("gcr.io/"),
+        )
+        # FastAPI will reach MLflow via the in-cluster service DNS.
+        in_cluster_mlflow = mlflow_uri or "http://mlflow-service:5000"
+        generate_fastapi_manifests_gke(
+            output_dir=fa_dir,
+            image=image,
+            project_id=project,
+            mlflow_tracking_uri=in_cluster_mlflow,
+            push_image=not image.startswith("gcr.io/"),
+        )
+        typer.echo("\nNext steps:")
+        typer.echo(f"  1. deployml gke-deploy -d {ml_dir} -c CLUSTER -p {project} -r REGION")
+        typer.echo(f"  2. deployml gke-deploy -d {fa_dir} -c CLUSTER -p {project} -r REGION")
     else:
-        typer.echo(f"Unknown service: {service}. Use 'mlflow' or 'fastapi'")
+        typer.echo(f"Unknown service: {service}. Use 'mlflow', 'fastapi', or 'all'")
         raise typer.Exit(code=1)
 
 
@@ -2346,7 +2826,7 @@ def gke_apply(
         typer.echo(f"Config file not found: {config_path}")
         raise typer.Exit(code=1)
 
-    config = yaml.safe_load(config_path.read_text())
+    config = _load_config_or_exit(config_path)
     
     # Validate deployment type
     deployment_type = config.get("deployment", {}).get("type")
@@ -2354,7 +2834,7 @@ def gke_apply(
         typer.echo(f"This command is only for GKE deployments. Found: {deployment_type}")
         raise typer.Exit(code=1)
     
-    workspace_name = config.get("name") or "development"
+    workspace_name = config.get("name") or "default"
     DEPLOYML_DIR = Path.cwd() / ".deployml" / workspace_name
     manifests_dir = DEPLOYML_DIR / "manifests"
     
@@ -2369,7 +2849,9 @@ def gke_apply(
     cluster_name = gke_config.get("cluster_name")
     zone = gke_config.get("zone")
     region_gke = gke_config.get("region")
-    
+    # Optional namespace; MLflow and FastAPI share it so service DNS resolves.
+    gke_namespace = gke_config.get("namespace")
+
     if not cluster_name:
         typer.echo("GKE cluster_name must be specified in config.gke.cluster_name")
         raise typer.Exit(code=1)
@@ -2413,11 +2895,12 @@ def gke_apply(
             project_id=project_id,
             zone=zone,
             region=region_gke,
+            namespace=gke_namespace,
         ):
             deployed_any = True
         else:
             raise typer.Exit(code=1)
-    
+
     if fastapi_manifest_dir.exists():
         typer.echo(f"\n Deploying FastAPI to GKE...")
         if deploy_to_gke(
@@ -2426,11 +2909,12 @@ def gke_apply(
             project_id=project_id,
             zone=zone,
             region=region_gke,
+            namespace=gke_namespace,
         ):
             deployed_any = True
         else:
             raise typer.Exit(code=1)
-    
+
     if deployed_any:
         typer.echo("\n GKE deployment complete!")
     else:
@@ -2468,11 +2952,11 @@ def build_images_command(
         "--repository",
         help="Artifact Registry repository name.",
     ),
-    tag: str = typer.Option(
-        "latest",
+    tag: Optional[str] = typer.Option(
+        None,
         "--tag",
         "-t",
-        help="Image tag to apply.",
+        help="Image tag to apply. Defaults to config.provider.image_tag or v{deployml_version}.",
     ),
     create_repo: bool = typer.Option(
         False,
@@ -2484,6 +2968,12 @@ def build_images_command(
         "--dry-run",
         help="Show what would be built without executing Docker or gcloud commands.",
     ),
+    platform: Optional[str] = typer.Option(
+        None,
+        "--platform",
+        help="Local build platform. Defaults to host arch so images run on a local "
+        "minikube node. Pass linux/amd64 only for a manual amd64 push. Ignored in GCP mode.",
+    ),
 ):
     """
     Build all Docker images found in subdirectories of the given folder.
@@ -2498,14 +2988,18 @@ def build_images_command(
     """
 
     if config_path and config_path.exists():
-        config = yaml.safe_load(config_path.read_text())
+        config = _load_config_or_exit(config_path)
         if not gcp_project:
             gcp_project = config.get("provider", {}).get("project_id")
         if not region:
             region = config.get("provider", {}).get("region", "us-central1")
+        if not tag:
+            tag = config.get("provider", {}).get("image_tag")
 
     if not region:
         region = "us-central1"
+    if not tag:
+        tag = f"v{get_version()}"
 
     if create_repo and not gcp_project:
         typer.secho(
@@ -2527,6 +3021,7 @@ def build_images_command(
             tag=tag,
             create_repo=create_repo,
             dry_run=dry_run,
+            platform=platform,
         )
 
         if not dry_run:
diff --git a/src/deployml/enum/cloud_provider.py b/src/deployml/enum/cloud_provider.py
index 5bf2613..468fa63 100644
--- a/src/deployml/enum/cloud_provider.py
+++ b/src/deployml/enum/cloud_provider.py
@@ -2,7 +2,10 @@
 
 class CloudProvider(Enum):
     """
-    TODO
+    Cloud provider identifiers used by config.provider.name and by the
+    interactive `deployml generate` flow. Only `gcp` is fully implemented.
+    `aws` and `azure` write skeleton configs from `deployml init`.
+    `local` is reserved for minikube and other local Kubernetes flows.
     """
     LOCAL = "local"
     AWS = "aws"
diff --git a/src/deployml/utils/constants.py b/src/deployml/utils/constants.py
index 75184ff..643d4f1 100644
--- a/src/deployml/utils/constants.py
+++ b/src/deployml/utils/constants.py
@@ -49,4 +49,18 @@
     "logging.googleapis.com",
     "artifactregistry.googleapis.com",
     "cloudbuild.googleapis.com",
+    "secretmanager.googleapis.com",
+    "container.googleapis.com",  # GKE clusters. Free to enable, only billed when a cluster exists.
+]
+
+# Minimum project-level roles to deploy the full stack. roles/owner satisfies all.
+REQUIRED_GCP_IAM_ROLES = [
+    "roles/serviceusage.serviceUsageAdmin",
+    "roles/artifactregistry.admin",
+    "roles/cloudsql.admin",
+    "roles/run.admin",
+    "roles/storage.admin",
+    "roles/bigquery.admin",
+    "roles/iam.serviceAccountAdmin",
+    "roles/iam.serviceAccountUser",
 ]
\ No newline at end of file
diff --git a/src/deployml/utils/helpers.py b/src/deployml/utils/helpers.py
index 97e8e39..4081e1e 100644
--- a/src/deployml/utils/helpers.py
+++ b/src/deployml/utils/helpers.py
@@ -1,5 +1,6 @@
 import shutil
 import subprocess
+import sys
 import importlib.resources as pkg_resources
 from pathlib import Path
 from typing import Optional
@@ -54,6 +55,139 @@ def check_gcp_auth() -> bool:
         return False
 
 
+def check_gcp_adc() -> bool:
+    """Return True if an ADC access token can be printed; Terraform and client libs need ADC."""
+    try:
+        result = subprocess.run(
+            ["gcloud", "auth", "application-default", "print-access-token"],
+            capture_output=True, text=True,
+        )
+        return result.returncode == 0
+    except Exception:
+        return False
+
+
+def check_bq() -> bool:
+    if not shutil.which("bq"):
+        return False
+    try:
+        result = subprocess.run(
+            ["bq", "version"], capture_output=True, text=True,
+        )
+        return result.returncode == 0
+    except Exception:
+        return False
+
+
+def get_terraform_version() -> Optional[tuple]:
+    """Return (major, minor, patch) or None."""
+    if not shutil.which("terraform"):
+        return None
+    try:
+        import json as _json
+        result = subprocess.run(
+            ["terraform", "version", "-json"],
+            capture_output=True, text=True,
+        )
+        if result.returncode != 0:
+            return None
+        data = _json.loads(result.stdout)
+        parts = data.get("terraform_version", "").split(".")
+        return tuple(int(p) for p in parts[:3])
+    except Exception:
+        return None
+
+
+def validate_gcp_project(project_id: str) -> bool:
+    """Verify project exists and active gcloud account can access it."""
+    try:
+        result = subprocess.run(
+            ["gcloud", "projects", "describe", project_id,
+             "--format=value(projectId)"],
+            capture_output=True, text=True,
+        )
+        return result.returncode == 0 and result.stdout.strip() == project_id
+    except Exception:
+        return False
+
+
+_GCP_REGIONS_CACHE: Optional[set] = None
+
+
+def validate_gcp_region(region: str, project_id: Optional[str] = None) -> bool:
+    """Check region exists. Cached. Returns True on lookup failure to avoid blocking."""
+    global _GCP_REGIONS_CACHE
+    if _GCP_REGIONS_CACHE is None:
+        cmd = ["gcloud", "compute", "regions", "list", "--format=value(name)"]
+        if project_id:
+            cmd += ["--project", project_id]
+        try:
+            result = subprocess.run(cmd, capture_output=True, text=True)
+            if result.returncode != 0:
+                print(
+                    f"Warning: could not verify region '{region}' "
+                    "(gcloud compute regions list failed). Proceeding unvalidated; "
+                    "a typo here surfaces later as a confusing Terraform error.",
+                    file=sys.stderr,
+                )
+                return True
+            _GCP_REGIONS_CACHE = set(result.stdout.strip().splitlines())
+        except Exception:
+            print(
+                f"Warning: could not verify region '{region}' "
+                "(gcloud unavailable). Proceeding unvalidated.",
+                file=sys.stderr,
+            )
+            return True
+    return region in _GCP_REGIONS_CACHE
+
+
+def get_missing_iam_roles(project_id: str, required_roles: list) -> list:
+    """Return required roles not bound directly to the active account; roles/owner gives []."""
+    try:
+        import json as _json
+        account_result = subprocess.run(
+            ["gcloud", "config", "get-value", "account"],
+            capture_output=True, text=True,
+        )
+        account = account_result.stdout.strip()
+        if not account:
+            return list(required_roles)
+
+        result = subprocess.run(
+            ["gcloud", "projects", "get-iam-policy", project_id, "--format=json"],
+            capture_output=True, text=True,
+        )
+        if result.returncode != 0:
+            return list(required_roles)
+
+        policy = _json.loads(result.stdout)
+        member_keys = {f"user:{account}", f"serviceAccount:{account}"}
+        held = set()
+        for binding in policy.get("bindings", []):
+            if any(m in member_keys for m in binding.get("members", [])):
+                held.add(binding["role"])
+
+        if "roles/owner" in held:
+            return []
+        return [r for r in required_roles if r not in held]
+    except Exception:
+        return list(required_roles)
+
+
+def check_docker_daemon() -> bool:
+    """Returns True if docker daemon is reachable (not just binary present)."""
+    if not shutil.which("docker"):
+        return False
+    try:
+        result = subprocess.run(
+            ["docker", "info"], capture_output=True, text=True,
+        )
+        return result.returncode == 0
+    except Exception:
+        return False
+
+
 def copy_modules_to_workspace(
     modules_dir: Path,
     stack: list | None = None,

From 17e1f5eaad98a8df6ebf718310da374aa34e68fb Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sat, 30 May 2026 17:16:34 -0700
Subject: [PATCH 06/31] kubernetes: MLflow persistence, namespace isolation,
 GKE LB targeting

Add a PersistentVolumeClaim with fsGroup and a Recreate strategy so MLflow data survives pod restarts on minikube and GKE, isolate deploys by namespace, target the named service when waiting for the GKE LoadBalancer IP, and harden image loading.
---
 .../mlflow-deployment.yaml.j2                 |  39 ++++-
 .../kubernetes_local/mlflow-pvc.yaml.j2       |  12 ++
 src/deployml/utils/kubernetes_gke.py          | 160 +++++++++++------
 src/deployml/utils/kubernetes_local.py        | 165 ++++++++++++------
 4 files changed, 266 insertions(+), 110 deletions(-)
 create mode 100644 src/deployml/templates/kubernetes_local/mlflow-pvc.yaml.j2

diff --git a/src/deployml/templates/kubernetes_local/mlflow-deployment.yaml.j2 b/src/deployml/templates/kubernetes_local/mlflow-deployment.yaml.j2
index 70f0296..48f9079 100644
--- a/src/deployml/templates/kubernetes_local/mlflow-deployment.yaml.j2
+++ b/src/deployml/templates/kubernetes_local/mlflow-deployment.yaml.j2
@@ -6,6 +6,13 @@ metadata:
     app: mlflow
 spec:
   replicas: {{ replicas }}
+  {% if use_pvc %}
+  # A ReadWriteOnce PVC (e.g. a GKE PersistentDisk) can attach to only one node
+  # at a time, so a RollingUpdate would deadlock: the new pod cannot mount until
+  # the old one releases. Recreate tears down the old pod first.
+  strategy:
+    type: Recreate
+  {% endif %}
   selector:
     matchLabels:
       app: mlflow
@@ -14,6 +21,12 @@ spec:
       labels:
         app: mlflow
     spec:
+      # The image runs as non-root user 'mlflow'. A freshly provisioned PVC is
+      # root-owned, so without fsGroup the server cannot write the sqlite db or
+      # artifacts and crashes with permission denied. fsGroup chowns the volume
+      # and is added as a supplementary group to the container process.
+      securityContext:
+        fsGroup: 1000
       containers:
       - name: mlflow
         image: {{ image }}
@@ -25,6 +38,10 @@ spec:
           value: "0.0.0.0"
         - name: MLFLOW_SERVER_PORT
           value: "{{ port }}"
+        # Allow Host header for service DNS, NodePort, LoadBalancer IP, etc.
+        # Without this MLflow 3.x rejects external requests with "Invalid Host header".
+        - name: MLFLOW_SERVER_ALLOWED_HOSTS
+          value: "*"
         {% if backend_store_uri %}
         - name: MLFLOW_BACKEND_STORE_URI
           value: "{{ backend_store_uri }}"
@@ -51,19 +68,33 @@ spec:
         volumeMounts:
         - name: mlflow-data
           mountPath: /mlflow-artifacts
-        livenessProbe:
+        startupProbe:
           httpGet:
             path: /health
             port: {{ port }}
-          initialDelaySeconds: 30
+          # MLflow 3.x boots huey consumers and other workers before /health responds.
+          # Allow up to 5 minutes (30 attempts * 10s) before declaring failure.
+          initialDelaySeconds: 10
           periodSeconds: 10
+          failureThreshold: 30
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: {{ port }}
+          periodSeconds: 30
+          timeoutSeconds: 5
         readinessProbe:
           httpGet:
             path: /health
             port: {{ port }}
-          initialDelaySeconds: 10
-          periodSeconds: 5
+          periodSeconds: 10
+          timeoutSeconds: 3
       volumes:
       - name: mlflow-data
+        {% if use_pvc %}
+        persistentVolumeClaim:
+          claimName: mlflow-data-pvc
+        {% else %}
         emptyDir: {}
+        {% endif %}
 
diff --git a/src/deployml/templates/kubernetes_local/mlflow-pvc.yaml.j2 b/src/deployml/templates/kubernetes_local/mlflow-pvc.yaml.j2
new file mode 100644
index 0000000..6e1d541
--- /dev/null
+++ b/src/deployml/templates/kubernetes_local/mlflow-pvc.yaml.j2
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: mlflow-data-pvc
+  labels:
+    app: mlflow
+spec:
+  accessModes:
+  - ReadWriteOnce
+  resources:
+    requests:
+      storage: {{ pvc_size | default('5Gi') }}
diff --git a/src/deployml/utils/kubernetes_gke.py b/src/deployml/utils/kubernetes_gke.py
index e5fc264..e7b90a4 100644
--- a/src/deployml/utils/kubernetes_gke.py
+++ b/src/deployml/utils/kubernetes_gke.py
@@ -1,13 +1,26 @@
 import subprocess
 import typer
 from pathlib import Path
-from typing import Optional, Dict
+from typing import Optional
 from jinja2 import Environment, FileSystemLoader
+
+try:
+    from importlib.metadata import version as _pkg_version
+    _DEPLOYML_VERSION = _pkg_version("deployml-core")
+except Exception:
+    _DEPLOYML_VERSION = "0.0.42"
+
 from deployml.utils.constants import TEMPLATE_DIR
+from deployml.utils.kubernetes_local import ensure_namespace, ns_args
 
 
 def check_gke_cluster_connection(cluster_name: str, zone: Optional[str] = None, region: Optional[str] = None) -> bool:
-    """Check if kubectl is connected to the GKE cluster."""
+    """Check if kubectl is connected to THIS specific GKE cluster.
+
+    Earlier the function returned True if kubectl was connected to any GKE
+    cluster, which silently applied manifests to the wrong cluster. Now it only
+    returns True if the current context name contains cluster_name (substring match).
+    """
     try:
         result = subprocess.run(
             ["kubectl", "cluster-info"],
@@ -15,13 +28,12 @@ def check_gke_cluster_connection(cluster_name: str, zone: Optional[str] = None,
             text=True
         )
         if result.returncode == 0:
-            # Check if cluster name is in context
             context_result = subprocess.run(
                 ["kubectl", "config", "current-context"],
                 capture_output=True,
                 text=True
             )
-            return cluster_name in context_result.stdout or "gke" in context_result.stdout.lower()
+            return cluster_name in context_result.stdout
         return False
     except Exception:
         return False
@@ -123,16 +135,16 @@ def generate_fastapi_manifests_gke(
     """
     output_dir.mkdir(parents=True, exist_ok=True)
     
-    # Convert local image to GCR format
+    # Convert local image to GCR format. Pin tag to the deployml version to
+    # avoid the :latest drift bug that bites the Cloud Run path the same way.
     if not image.startswith("gcr.io/"):
-        gcr_image = f"gcr.io/{project_id}/fastapi/fastapi:latest"
+        gcr_image = f"gcr.io/{project_id}/fastapi/fastapi:v{_DEPLOYML_VERSION}"
         if push_image:
             push_image_to_gcr(image, gcr_image, project_id)
         image = gcr_image
     else:
         gcr_image = image
-    
-    # Default values
+
     port = 8000
     replicas = 1
     cpu_request = "250m"
@@ -200,10 +212,12 @@ def generate_mlflow_manifests_gke(
     artifact_root: Optional[str] = None,
     service_type: str = "LoadBalancer",
     push_image: bool = True,
+    use_pvc: bool = True,
+    pvc_size: str = "5Gi",
 ) -> None:
     """
-    Generate deployment.yaml and service.yaml for MLflow on GKE.
-    
+    Generate deployment.yaml, service.yaml, and optionally pvc.yaml for MLflow on GKE.
+
     Args:
         output_dir: Directory where manifests will be created
         image: Docker image for MLflow (local name)
@@ -212,19 +226,22 @@ def generate_mlflow_manifests_gke(
         artifact_root: Optional artifact root path (GCS bucket)
         service_type: Kubernetes service type (LoadBalancer or ClusterIP)
         push_image: Whether to push image to GCR
+        use_pvc: When True, provision a PersistentVolumeClaim so experiment data
+          survives pod restarts. Without it MLflow stores sqlite in the container
+          filesystem and loses everything when the pod is rescheduled.
+        pvc_size: PVC size when use_pvc=True.
     """
     output_dir.mkdir(parents=True, exist_ok=True)
     
-    # Convert local image to GCR format
+    # Convert local image to GCR format. Pin tag to the deployml version.
     if not image.startswith("gcr.io/"):
-        gcr_image = f"gcr.io/{project_id}/mlflow/mlflow:latest"
+        gcr_image = f"gcr.io/{project_id}/mlflow/mlflow:v{_DEPLOYML_VERSION}"
         if push_image:
             push_image_to_gcr(image, gcr_image, project_id)
         image = gcr_image
     else:
         gcr_image = image
-    
-    # Default values
+
     port = 5000
     replicas = 1
     cpu_request = "250m"
@@ -233,9 +250,14 @@ def generate_mlflow_manifests_gke(
     memory_limit = "2Gi"  # Increased for GKE
     service_name = "mlflow-service"
     
-    # Defaults if not provided
+    # Defaults if not provided. Put sqlite on the mounted volume (4 slashes =
+    # absolute /mlflow-artifacts/mlflow.db) so the backend store persists with
+    # use_pvc. A relative sqlite:///mlflow.db would sit in the ephemeral
+    # container filesystem and be lost on restart.
     if not backend_store_uri:
-        backend_store_uri = "sqlite:///mlflow.db"
+        backend_store_uri = (
+            "sqlite:////mlflow-artifacts/mlflow.db" if use_pvc else "sqlite:///mlflow.db"
+        )
     if not artifact_root:
         artifact_root = "/mlflow-artifacts"
     
@@ -255,9 +277,10 @@ def generate_mlflow_manifests_gke(
         cpu_limit=cpu_limit,
         memory_limit=memory_limit,
         backend_store_uri=backend_store_uri,
-        artifact_root=artifact_root
+        artifact_root=artifact_root,
+        use_pvc=use_pvc,
     )
-    
+
     # Update imagePullPolicy for GCR images
     deployment_yaml = deployment_yaml.replace("imagePullPolicy: Never", "imagePullPolicy: IfNotPresent")
     
@@ -284,11 +307,18 @@ def generate_mlflow_manifests_gke(
     
     deployment_file.write_text(deployment_yaml)
     service_file.write_text(service_yaml)
-    
+
     typer.echo(f"Generated MLflow GKE manifests in {output_dir}")
     typer.echo(f"   - {deployment_file}")
     typer.echo(f"   - {service_file}")
 
+    if use_pvc:
+        pvc_template = env.get_template("mlflow-pvc.yaml.j2")
+        pvc_yaml = pvc_template.render(pvc_size=pvc_size)
+        pvc_file = output_dir / "pvc.yaml"
+        pvc_file.write_text(pvc_yaml)
+        typer.echo(f"   - {pvc_file}  (PersistentVolumeClaim, {pvc_size})")
+
 
 def deploy_to_gke(
     manifest_dir: Path,
@@ -296,16 +326,20 @@ def deploy_to_gke(
     project_id: str,
     zone: Optional[str] = None,
     region: Optional[str] = None,
+    namespace: Optional[str] = None,
 ) -> bool:
     """
     Deploy manifests to GKE cluster using kubectl apply.
-    
+
     Args:
         manifest_dir: Directory containing deployment.yaml and service.yaml
         cluster_name: GKE cluster name
         project_id: GCP project ID
         zone: GKE cluster zone (for zonal clusters)
         region: GKE cluster region (for regional clusters)
+        namespace: Target namespace. None or "default" targets the default
+          namespace; pass another value to isolate this stack. MLflow and FastAPI
+          must share a namespace for in-cluster service DNS to resolve.
     """
     if not manifest_dir.exists():
         typer.echo(f"Directory not found: {manifest_dir}")
@@ -324,20 +358,34 @@ def deploy_to_gke(
             return False
     
     typer.echo("🚀 Applying Kubernetes manifests to GKE...")
-    
+    ensure_namespace(namespace)
+    ns = ns_args(namespace)
+
     try:
+        # Apply the PVC first so the deployment can bind it on first schedule.
+        pvc_file = manifest_dir / "pvc.yaml"
+        if pvc_file.exists():
+            typer.echo(f"   Applying {pvc_file.name}...")
+            result = subprocess.run(
+                ["kubectl", "apply", "-f", str(pvc_file)] + ns,
+                check=True,
+                capture_output=True,
+                text=True
+            )
+            typer.echo(f"   {result.stdout.strip()}")
+
         typer.echo(f"   Applying {deployment_file.name}...")
         result = subprocess.run(
-            ["kubectl", "apply", "-f", str(deployment_file)],
+            ["kubectl", "apply", "-f", str(deployment_file)] + ns,
             check=True,
             capture_output=True,
             text=True
         )
         typer.echo(f"   {result.stdout.strip()}")
-        
+
         typer.echo(f"   Applying {service_file.name}...")
         result = subprocess.run(
-            ["kubectl", "apply", "-f", str(service_file)],
+            ["kubectl", "apply", "-f", str(service_file)] + ns,
             check=True,
             capture_output=True,
             text=True
@@ -348,41 +396,51 @@ def deploy_to_gke(
         typer.echo("\n⏳ Waiting for LoadBalancer IP...")
         typer.echo("   (This may take a few minutes)")
         
-        # Wait for external IP
-        import time
-        max_wait = 300  # 5 minutes
+        # Wait for external IP. Earlier code did service_file.stem.replace("service", "service")
+        # which is a no-op and then queried kubectl for service "service" which is wrong.
+        # Read the actual service name from the rendered manifest instead.
+        import time, yaml as _yaml
+        max_wait = 300
         waited = 0
-        service_name = service_file.stem.replace("service", "service")
-        
+        try:
+            svc_doc = _yaml.safe_load(service_file.read_text())
+            service_name = svc_doc.get("metadata", {}).get("name", "")
+        except Exception:
+            service_name = ""
+
+        # Without a concrete service name we cannot safely target one service.
+        # Querying every service and guessing an IP risks reporting the wrong
+        # endpoint, so bail out and let the user inspect manually instead.
+        if not service_name:
+            typer.echo("   Could not read the service name from service.yaml; "
+                       "skipping IP wait. Run: kubectl get svc")
+            waited = max_wait
+
         while waited < max_wait:
-            result = subprocess.run(
-                ["kubectl", "get", "svc", "-o", "jsonpath='{.items[?(@.spec.type==\"LoadBalancer\")].status.loadBalancer.ingress[0].ip}'"],
-                capture_output=True,
-                text=True
-            )
-            
-            if result.returncode == 0 and result.stdout.strip().strip("'"):
-                external_ip = result.stdout.strip().strip("'")
-                if external_ip and external_ip != "<none>":
-                    # Get port
-                    port_result = subprocess.run(
-                        ["kubectl", "get", "svc", service_name, "-o", "jsonpath='{.spec.ports[0].port}'"],
-                        capture_output=True,
-                        text=True
-                    )
-                    port = port_result.stdout.strip().strip("'") or "5000"
-                    typer.echo(f"\n✅ Service is available at: http://{external_ip}:{port}")
-                    break
-            
+            ip_query = "{.status.loadBalancer.ingress[0].ip}"
+            cmd = ["kubectl", "get", "svc", service_name,
+                   "-o", f"jsonpath={ip_query}"] + ns
+            result = subprocess.run(cmd, capture_output=True, text=True)
+
+            external_ip = result.stdout.strip().strip("'")
+            if result.returncode == 0 and external_ip and external_ip != "<none>":
+                port_query = "{.spec.ports[0].port}"
+                port_cmd = ["kubectl", "get", "svc", service_name,
+                            "-o", f"jsonpath={port_query}"] + ns
+                port_result = subprocess.run(port_cmd, capture_output=True, text=True)
+                port = port_result.stdout.strip().strip("'") or "5000"
+                typer.echo(f"\n Service is available at: http://{external_ip}:{port}")
+                break
+
             time.sleep(5)
             waited += 5
             if waited % 30 == 0:
                 typer.echo(f"   Still waiting... ({waited}s)")
         
         typer.echo("\n Deployment status:")
-        subprocess.run(["kubectl", "get", "pods"])
-        subprocess.run(["kubectl", "get", "svc"])
-        
+        subprocess.run(["kubectl", "get", "pods"] + ns)
+        subprocess.run(["kubectl", "get", "svc"] + ns)
+
         return True
         
     except subprocess.CalledProcessError as e:
diff --git a/src/deployml/utils/kubernetes_local.py b/src/deployml/utils/kubernetes_local.py
index c6a1937..da3a6a9 100644
--- a/src/deployml/utils/kubernetes_local.py
+++ b/src/deployml/utils/kubernetes_local.py
@@ -6,6 +6,27 @@
 from deployml.utils.constants import TEMPLATE_DIR
 
 
+def ns_args(namespace: Optional[str]) -> list:
+    """kubectl/minikube namespace flag. Empty for the default namespace so the
+    existing single-stack flow is unchanged."""
+    return ["-n", namespace] if namespace and namespace != "default" else []
+
+
+def ensure_namespace(namespace: Optional[str]) -> None:
+    """Create the namespace if it does not exist. No-op for the default
+    namespace. Idempotent via apply of a client-side dry-run manifest."""
+    if not namespace or namespace == "default":
+        return
+    rendered = subprocess.run(
+        ["kubectl", "create", "namespace", namespace, "--dry-run=client", "-o", "yaml"],
+        capture_output=True, text=True,
+    )
+    if rendered.returncode == 0:
+        subprocess.run(["kubectl", "apply", "-f", "-"], input=rendered.stdout,
+                       capture_output=True, text=True)
+        typer.echo(f"   Using namespace: {namespace}")
+
+
 def check_minikube_running() -> bool:
     """Check if minikube is currently running."""
     try:
@@ -113,47 +134,53 @@ def generate_mlflow_manifests(
     backend_store_uri: Optional[str] = None,
     artifact_root: Optional[str] = None,
     load_image: bool = True,
+    use_pvc: bool = True,
+    pvc_size: str = "5Gi",
 ) -> None:
     """
-    Generate deployment.yaml and service.yaml for MLflow in the specified directory.
-    
+    Generate deployment.yaml, service.yaml, and optionally pvc.yaml for MLflow.
+
     Args:
         output_dir: Directory where manifests will be created
         image: Docker image for MLflow
-        backend_store_uri: Optional backend store URI (defaults to SQLite if not provided)
-        artifact_root: Optional artifact root path (defaults to local storage if not provided)
-        load_image: Whether to automatically load image into minikube (default: True)
+        backend_store_uri: Optional backend store URI. Default puts sqlite on
+          the mounted volume so data survives pod restarts.
+        artifact_root: Optional artifact root path
+        load_image: Whether to automatically load image into minikube
+        use_pvc: When True, generate a PersistentVolumeClaim and mount it.
+          When False, use ephemeral emptyDir (legacy behavior).
+        pvc_size: PVC size when use_pvc=True. Default 5Gi.
     """
     output_dir.mkdir(parents=True, exist_ok=True)
-    
-    # Load image into minikube if requested
+
     if load_image:
         load_image_to_minikube(image)
-    
-    # Default values
+
     port = 5000
     node_port = 30050
     replicas = 1
-    cpu_request = "250m"
-    memory_request = "512Mi"
-    cpu_limit = "500m"
-    memory_limit = "1Gi"
+    # MLflow 3.x ships with huey consumers and other workers that push memory
+    # well past 1Gi on cold start. Earlier defaults caused OOMKilled (exit 137)
+    # within seconds of uvicorn starting. Bumped to 2Gi limit / 1Gi request.
+    cpu_request = "500m"
+    memory_request = "1Gi"
+    cpu_limit = "1000m"
+    memory_limit = "2Gi"
     service_name = "mlflow-service"
-    
-    # Defaults if not provided
+
     if not backend_store_uri:
-        backend_store_uri = "sqlite:///mlflow.db"
+        # Put sqlite on the mounted volume so data survives pod restarts
+        # when use_pvc=True. With emptyDir the file is still pod-local.
+        backend_store_uri = "sqlite:////mlflow-artifacts/mlflow.db"
     if not artifact_root:
         artifact_root = "/mlflow-artifacts"
-    
-    # Load templates from files
+
     template_dir = TEMPLATE_DIR / "kubernetes_local"
     env = Environment(loader=FileSystemLoader(str(template_dir)))
-    
+
     deployment_template = env.get_template("mlflow-deployment.yaml.j2")
     service_template = env.get_template("mlflow-service.yaml.j2")
-    
-    # Render templates
+
     deployment_yaml = deployment_template.render(
         image=image,
         port=port,
@@ -163,34 +190,44 @@ def generate_mlflow_manifests(
         cpu_limit=cpu_limit,
         memory_limit=memory_limit,
         backend_store_uri=backend_store_uri,
-        artifact_root=artifact_root
+        artifact_root=artifact_root,
+        use_pvc=use_pvc,
     )
-    
+
     service_yaml = service_template.render(
         service_name=service_name,
         port=port,
         node_port=node_port
     )
-    
-    # Write files
+
     deployment_file = output_dir / "deployment.yaml"
     service_file = output_dir / "service.yaml"
-    
+
     deployment_file.write_text(deployment_yaml)
     service_file.write_text(service_yaml)
-    
+
     typer.echo(f"Generated MLflow manifests in {output_dir}")
     typer.echo(f"   - {deployment_file}")
     typer.echo(f"   - {service_file}")
 
+    if use_pvc:
+        pvc_template = env.get_template("mlflow-pvc.yaml.j2")
+        pvc_yaml = pvc_template.render(pvc_size=pvc_size)
+        pvc_file = output_dir / "pvc.yaml"
+        pvc_file.write_text(pvc_yaml)
+        typer.echo(f"   - {pvc_file}  (PersistentVolumeClaim, {pvc_size})")
 
-def deploy_mlflow_to_minikube(manifest_dir: Path, image_name: Optional[str] = None) -> bool:
+
+def deploy_mlflow_to_minikube(manifest_dir: Path, image_name: Optional[str] = None,
+                              namespace: Optional[str] = None) -> bool:
     """
     Deploy MLflow to minikube using kubectl apply.
-    
+
     Args:
         manifest_dir: Directory containing deployment.yaml and service.yaml
         image_name: Optional image name to load if not already in minikube
+        namespace: Target Kubernetes namespace. Default (None/"default") keeps
+          everything in the default namespace; pass a value to isolate this stack.
     """
     if not manifest_dir.exists():
         typer.echo(f"Directory not found: {manifest_dir}")
@@ -219,39 +256,51 @@ def deploy_mlflow_to_minikube(manifest_dir: Path, image_name: Optional[str] = No
         load_image_to_minikube(image_name)
     
     typer.echo("Applying Kubernetes manifests...")
-    
+    ensure_namespace(namespace)
+    ns = ns_args(namespace)
+
     try:
+        # Apply PVC first so the deployment can reference it.
+        pvc_file = manifest_dir / "pvc.yaml"
+        if pvc_file.exists():
+            typer.echo(f"   Applying {pvc_file.name}...")
+            result = subprocess.run(
+                ["kubectl", "apply", "-f", str(pvc_file)] + ns,
+                check=True, capture_output=True, text=True,
+            )
+            typer.echo(f"{result.stdout.strip()}")
+
         typer.echo(f"   Applying {deployment_file.name}...")
         result = subprocess.run(
-            ["kubectl", "apply", "-f", str(deployment_file)],
+            ["kubectl", "apply", "-f", str(deployment_file)] + ns,
             check=True,
             capture_output=True,
             text=True
         )
         typer.echo(f"{result.stdout.strip()}")
-        
+
         # Apply service
         typer.echo(f"   Applying {service_file.name}...")
         result = subprocess.run(
-            ["kubectl", "apply", "-f", str(service_file)],
+            ["kubectl", "apply", "-f", str(service_file)] + ns,
             check=True,
             capture_output=True,
             text=True
         )
         typer.echo(f"{result.stdout.strip()}")
-        
+
         # Get service URL
         typer.echo("\n Getting service URL...")
-        
+
         # Try minikube service --url with timeout (can hang)
         try:
             result = subprocess.run(
-                ["minikube", "service", "mlflow-service", "--url"],
+                ["minikube", "service", "mlflow-service", "--url"] + ns,
                 capture_output=True,
                 text=True,
                 timeout=5  # 5 second timeout to prevent hanging
             )
-            
+
             if result.returncode == 0 and result.stdout.strip():
                 service_url = result.stdout.strip()
                 typer.echo(f"MLflow service is available at: {service_url}")
@@ -261,7 +310,7 @@ def deploy_mlflow_to_minikube(manifest_dir: Path, image_name: Optional[str] = No
             # Fallback: get NodePort manually (more reliable)
             typer.echo("   Getting NodePort...")
             result = subprocess.run(
-                ["kubectl", "get", "svc", "mlflow-service", "-o", "jsonpath='{.spec.ports[0].nodePort}'"],
+                ["kubectl", "get", "svc", "mlflow-service", "-o", "jsonpath='{.spec.ports[0].nodePort}'"] + ns,
                 capture_output=True,
                 text=True
             )
@@ -279,10 +328,10 @@ def deploy_mlflow_to_minikube(manifest_dir: Path, image_name: Optional[str] = No
                     typer.echo("   Could not determine minikube IP")
             else:
                 typer.echo("   Could not determine service URL. Check with: kubectl get svc mlflow-service")
-        
+
         typer.echo("\n Deployment status:")
-        subprocess.run(["kubectl", "get", "pods", "-l", "app=mlflow"])
-        subprocess.run(["kubectl", "get", "svc", "-l", "app=mlflow"])
+        subprocess.run(["kubectl", "get", "pods", "-l", "app=mlflow"] + ns)
+        subprocess.run(["kubectl", "get", "svc", "-l", "app=mlflow"] + ns)
         
         return True
         
@@ -294,13 +343,17 @@ def deploy_mlflow_to_minikube(manifest_dir: Path, image_name: Optional[str] = No
         return False
 
 
-def deploy_fastapi_to_minikube(manifest_dir: Path, image_name: Optional[str] = None) -> bool:
+def deploy_fastapi_to_minikube(manifest_dir: Path, image_name: Optional[str] = None,
+                               namespace: Optional[str] = None) -> bool:
     """
     Deploy FastAPI to minikube using kubectl apply.
-    
+
     Args:
         manifest_dir: Directory containing deployment.yaml and service.yaml
         image_name: Optional image name to load if not already in minikube
+        namespace: Target Kubernetes namespace. Default keeps the default
+          namespace; pass a value to isolate this stack. Must match the MLflow
+          namespace for in-cluster service DNS to resolve.
     """
     if not manifest_dir.exists():
         typer.echo(f"Directory not found: {manifest_dir}")
@@ -329,39 +382,41 @@ def deploy_fastapi_to_minikube(manifest_dir: Path, image_name: Optional[str] = N
         load_image_to_minikube(image_name)
     
     typer.echo("Applying Kubernetes manifests...")
-    
+    ensure_namespace(namespace)
+    ns = ns_args(namespace)
+
     try:
         typer.echo(f"   Applying {deployment_file.name}...")
         result = subprocess.run(
-            ["kubectl", "apply", "-f", str(deployment_file)],
+            ["kubectl", "apply", "-f", str(deployment_file)] + ns,
             check=True,
             capture_output=True,
             text=True
         )
         typer.echo(f"{result.stdout.strip()}")
-        
+
         # Apply service
         typer.echo(f"   Applying {service_file.name}...")
         result = subprocess.run(
-            ["kubectl", "apply", "-f", str(service_file)],
+            ["kubectl", "apply", "-f", str(service_file)] + ns,
             check=True,
             capture_output=True,
             text=True
         )
         typer.echo(f"{result.stdout.strip()}")
-        
+
         # Get service URL
         typer.echo("\n Getting service URL...")
-        
+
         # Try minikube service --url with timeout (can hang)
         try:
             result = subprocess.run(
-                ["minikube", "service", "fastapi-service", "--url"],
+                ["minikube", "service", "fastapi-service", "--url"] + ns,
                 capture_output=True,
                 text=True,
                 timeout=5  # 5 second timeout to prevent hanging
             )
-            
+
             if result.returncode == 0 and result.stdout.strip():
                 service_url = result.stdout.strip()
                 typer.echo(f"FastAPI service is available at: {service_url}")
@@ -371,7 +426,7 @@ def deploy_fastapi_to_minikube(manifest_dir: Path, image_name: Optional[str] = N
             # Fallback: get NodePort manually (more reliable)
             typer.echo("   Getting NodePort...")
             result = subprocess.run(
-                ["kubectl", "get", "svc", "fastapi-service", "-o", "jsonpath='{.spec.ports[0].nodePort}'"],
+                ["kubectl", "get", "svc", "fastapi-service", "-o", "jsonpath='{.spec.ports[0].nodePort}'"] + ns,
                 capture_output=True,
                 text=True
             )
@@ -389,10 +444,10 @@ def deploy_fastapi_to_minikube(manifest_dir: Path, image_name: Optional[str] = N
                     typer.echo("   Could not determine minikube IP")
             else:
                 typer.echo("   Could not determine service URL. Check with: kubectl get svc fastapi-service")
-        
+
         typer.echo("\n Deployment status:")
-        subprocess.run(["kubectl", "get", "pods", "-l", "app=fastapi"])
-        subprocess.run(["kubectl", "get", "svc", "-l", "app=fastapi"])
+        subprocess.run(["kubectl", "get", "pods", "-l", "app=fastapi"] + ns)
+        subprocess.run(["kubectl", "get", "svc", "-l", "app=fastapi"] + ns)
         
         return True
         

From 1092e39659e0d9ea36487a836315a500542a1bcc Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sat, 30 May 2026 17:16:34 -0700
Subject: [PATCH 07/31] docker build and FastAPI app robustness

Probe the Docker daemon before local builds and build for the host architecture by default with an optional platform override. In the FastAPI app, load the model in a background task so startup is fast and report a degraded but healthy status when MLflow is unreachable.
---
 src/deployml/docker/fastapi/main.py | 69 +++++++++++++++++++++--------
 src/deployml/notebook/docker.py     | 43 +++++++++++++++---
 2 files changed, 88 insertions(+), 24 deletions(-)

diff --git a/src/deployml/docker/fastapi/main.py b/src/deployml/docker/fastapi/main.py
index 7180477..2ecea2b 100644
--- a/src/deployml/docker/fastapi/main.py
+++ b/src/deployml/docker/fastapi/main.py
@@ -90,10 +90,27 @@ def log_prediction_to_bigquery(entity_id: str, predicted_value: float, model_ver
         print(f"⚠️  Could not log prediction to BigQuery: {e}")
 
 
+async def _model_load_retry_loop():
+    """Retry loading the model until it succeeds. Earlier code attempted
+    the load once at startup; if MLflow was unavailable then, the model
+    stayed None forever even after MLflow recovered."""
+    import asyncio
+    retry_interval = 30
+    while model is None:
+        ok = await asyncio.to_thread(load_model_from_mlflow)
+        if ok:
+            return
+        await asyncio.sleep(retry_interval)
+
+
 @app.on_event("startup")
 async def startup_event():
-    load_model_from_mlflow()
+    # Init BigQuery synchronously. Cheap, returns immediately if no project.
     init_bigquery()
+    # Load the model in a background task with retry so startup completes fast
+    # AND we keep trying if MLflow was unreachable at first.
+    import asyncio
+    asyncio.create_task(_model_load_retry_loop())
 
 
 @app.get("/")
@@ -111,11 +128,20 @@ async def root():
 @app.get("/health", response_model=HealthResponse)
 async def health():
     port = int(os.getenv("PORT", "8000"))
+    # Actually probe MLflow. Earlier code returned mlflow_connected=True
+    # hardcoded, which lied to users when MLflow was down.
+    mlflow_ok = False
+    try:
+        import requests as _rq
+        resp = _rq.get(f"{MLFLOW_TRACKING_URI.rstrip('/')}/health", timeout=2)
+        mlflow_ok = resp.status_code == 200
+    except Exception:
+        mlflow_ok = False
     return HealthResponse(
-        status="healthy",
-        timestamp=datetime.now().isoformat(),
+        status="healthy" if mlflow_ok else "degraded",
+        timestamp=datetime.now(timezone.utc).isoformat(),
         port=port,
-        mlflow_connected=True,
+        mlflow_connected=mlflow_ok,
         model_loaded=model is not None
     )
 
@@ -126,15 +152,18 @@ async def predict(request: PredictionRequest):
 
     entity_id = request.entity_id or str(uuid.uuid4())
 
+    # Do not lazy-load synchronously here. mlflow.pyfunc.load_model can hang
+    # for many seconds when MLflow is unreachable, blocking the request worker.
+    # The background loader started in startup_event handles loading. If the
+    # model is not yet loaded, return 503 fast.
     if model is None:
-        load_model_from_mlflow()
-
-    if model is None:
-        return PredictionResponse(
-            prediction=-1.0,
-            timestamp=datetime.now().isoformat(),
-            model_used=None,
-            entity_id=entity_id
+        return JSONResponse(
+            status_code=503,
+            content={
+                "error": "Model not available yet. MLflow may be unreachable or no model is registered in Production stage. Retry shortly.",
+                "entity_id": entity_id,
+                "timestamp": datetime.now(timezone.utc).isoformat(),
+            },
         )
 
     try:
@@ -148,16 +177,20 @@ async def predict(request: PredictionRequest):
 
         return PredictionResponse(
             prediction=prediction,
-            timestamp=datetime.now().isoformat(),
+            timestamp=datetime.now(timezone.utc).isoformat(),
             model_used=model_version,
             entity_id=entity_id
         )
     except Exception as e:
-        return PredictionResponse(
-            prediction=-1.0,
-            timestamp=datetime.now().isoformat(),
-            model_used=None,
-            entity_id=entity_id
+        # Earlier code returned prediction=-1.0 with HTTP 200 and no message,
+        # which silently hid model errors. Now we return 500 with the cause.
+        return JSONResponse(
+            status_code=500,
+            content={
+                "error": f"Prediction failed: {type(e).__name__}: {e}",
+                "entity_id": entity_id,
+                "timestamp": datetime.now(timezone.utc).isoformat(),
+            },
         )
 
 
diff --git a/src/deployml/notebook/docker.py b/src/deployml/notebook/docker.py
index de68598..b6fb6ac 100644
--- a/src/deployml/notebook/docker.py
+++ b/src/deployml/notebook/docker.py
@@ -2,6 +2,8 @@
 from pathlib import Path
 from typing import Optional
 
+from deployml.utils.helpers import check_docker_daemon
+
 class ImageBuildError(Exception):
     pass
 
@@ -13,6 +15,7 @@ def build_images(
     tag: str = "latest",
     create_repo: bool = False,
     dry_run: bool = False,
+    platform: Optional[str] = None,
 ) -> None:
     """
     Build all Docker images located in subdirectories of docker_root.
@@ -34,6 +37,11 @@ def build_images(
         tag: Docker image tag.
         create_repo: Whether to create Artifact Registry repository (GCP mode only).
         dry_run: If True, print commands without executing them.
+        platform: Local-mode docker build platform. Default None builds for the
+            host architecture so images run on the local minikube node (arm64 on
+            Apple Silicon). Pass "linux/amd64" only if you are building locally
+            to push to an amd64 target like Cloud Run by hand. The GCP Cloud Build
+            path always produces amd64 regardless of this flag.
     """
 
     docker_root = Path(docker_root)
@@ -41,6 +49,13 @@ def build_images(
     if not docker_root.exists():
         raise ValueError(f"Docker root does not exist: {docker_root}")
 
+    # Local mode needs docker daemon. GCP mode uses Cloud Build, no local docker needed.
+    if not gcp_project_id and not check_docker_daemon():
+        raise ImageBuildError(
+            "Docker daemon is not running or not reachable. Start Docker Desktop, "
+            "or pass --gcp-project-id to build via Cloud Build."
+        )
+
     # Discover services
     services = [
         d for d in docker_root.iterdir()
@@ -79,7 +94,19 @@ def build_images(
                 print()
             else:
                 print("Ensuring Artifact Registry repository exists...")
-                subprocess.run(create_cmd, check=False)  # safe if already exists
+                create_proc = subprocess.run(
+                    create_cmd, check=False,
+                    capture_output=True, text=True,
+                )
+                stderr_lower = (create_proc.stderr or "").lower()
+                if create_proc.returncode == 0:
+                    print(f"Created repository: {repository}")
+                elif "already exists" in stderr_lower or "alreadyexists" in stderr_lower:
+                    print(f"Repository {repository} already exists, reusing.")
+                else:
+                    raise ImageBuildError(
+                        f"Artifact Registry create failed: {create_proc.stderr.strip()}"
+                    )
                 print()
 
         # Build each service
@@ -112,11 +139,15 @@ def build_images(
             service_name = service_dir.name
             image_name = f"{service_name}:{tag}"
 
-            build_cmd = [
-                "docker", "build",
-                "-t", image_name,
-                str(service_dir),
-            ]
+            # Build for the host architecture by default so the image runs on the
+            # local minikube node (arm64 on Apple Silicon). Local mode feeds
+            # minikube; the Cloud Run path builds amd64 via Cloud Build above, so
+            # there is no Cloud Run use case for a forced amd64 local build. Pass
+            # platform explicitly only to override (e.g. a manual amd64 push).
+            build_cmd = ["docker", "build"]
+            if platform:
+                build_cmd += ["--platform", platform]
+            build_cmd += ["-t", image_name, str(service_dir)]
 
             if dry_run:
                 print("Would build locally:")

From 9b6e6785ef3d196d4c315fc5be89d64ad715e6b8 Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sat, 30 May 2026 17:16:34 -0700
Subject: [PATCH 08/31] docs: rewrite for the GCP Cloud Run path

Document the supported init, build-images, deploy, get-urls, destroy flow with auth, IAM, billing, and cost guidance, add config.example.yaml, and align the Python version and command references.
---
 README.md                       |  18 ++-
 config.example.yaml             |   1 +
 docs/api/cli-commands.md        | 116 +++++++++++++-----
 docs/installation.md            |  57 +++++++--
 docs/tutorials/example.md       |   6 +-
 docs/tutorials/gcp-cloud-run.md | 206 +++++++++++++++++++++++---------
 docs/tutorials/overview.md      |   2 +-
 example/README.md               |  16 ++-
 8 files changed, 312 insertions(+), 110 deletions(-)

diff --git a/README.md b/README.md
index f0cb2bf..9f9b60c 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ A CLI tool that deploys a production MLOps stack on GCP with a single command. B
 - **Grafana** — monitoring dashboard connected to your metrics database
 - **BigQuery** — `mlops` dataset with tables for features, predictions, ground truth, and drift metrics
 
-All running on GCP Cloud Run — no servers to manage, scales to zero when idle.
+All running on GCP Cloud Run. No servers to manage. Cloud Run services scale to zero when idle. Cloud SQL and BigQuery storage incur baseline cost. See Costs below.
 
 ## Quick Start
 
@@ -22,7 +22,7 @@ pip install deployml-core
 **2. Initialize your GCP project** (enables APIs, creates Artifact Registry)
 
 ```bash
-deployml init --provider gcp --project-id YOUR_PROJECT_ID
+deployml init --provider gcp --project-id YOUR_GCP_PROJECT_ID
 ```
 
 **3. Configure**
@@ -86,7 +86,13 @@ See [docs/tutorials/gcp-cloud-run.md](docs/tutorials/gcp-cloud-run.md) for a ste
 
 ## Requirements
 
-- Python 3.10+
-- `gcloud` CLI (authenticated)
-- Docker (running)
-- Terraform
+- Python 3.11 or newer
+- `gcloud` CLI, authenticated with `gcloud auth login`, `gcloud auth application-default login`, and `gcloud auth configure-docker us-west1-docker.pkg.dev`
+- Docker, running
+- Terraform 1.0 or newer
+
+Run `deployml doctor --project-id YOUR_GCP_PROJECT_ID` to verify auth, ADC, tool versions, enabled APIs, and IAM roles on your project.
+
+## Costs
+
+Cloud Run scales to zero when idle. Cloud SQL Postgres and BigQuery storage do not. Expect roughly $30 to $80 per month while the stack is up. MLflow runs with `min_instances = 1` by default for snappy UI, which adds about $5 per month. Set `min_instances` to 0 if you want zero idle cost in exchange for cold starts. Always run `deployml destroy` when done.
diff --git a/config.example.yaml b/config.example.yaml
index b1805e4..67caa00 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -3,6 +3,7 @@ provider:
   name: gcp
   project_id: YOUR_GCP_PROJECT_ID
   region: us-west1
+  image_tag: v0.0.42  # pinned. Override to your build tag if you rebuild images.
 deployment:
   type: cloud_run
 stack:
diff --git a/docs/api/cli-commands.md b/docs/api/cli-commands.md
index 61bd4c6..6eeddd5 100644
--- a/docs/api/cli-commands.md
+++ b/docs/api/cli-commands.md
@@ -1,88 +1,150 @@
 # CLI Commands Reference
 
+Commands you use most live at the top. Advanced and experimental commands are listed at the bottom.
+
 ## `deployml doctor`
 
-Check that all required local tools are installed and authenticated.
+Check that required tools are installed and authenticated. Optionally checks enabled APIs and IAM roles on a project.
 
 ```bash
 deployml doctor
+deployml doctor --project-id YOUR_GCP_PROJECT_ID
 ```
 
-Run this before anything else. Checks for `gcloud`, `docker`, `terraform`, and `bq`.
+**Options:**
+- `--project-id`, `-j`: GCP project to probe for enabled APIs and IAM role coverage.
+
+The doctor checks: tool versions for docker, terraform, gcloud, bq; gcloud auth and Application Default Credentials; Infracost (optional); and for a given project, required APIs and IAM roles.
 
 ---
 
 ## `deployml init`
 
-Enable required GCP APIs for a project and create the Artifact Registry repository. Run once per project.
+Enable required GCP APIs for a project and create a local `docker/` folder plus a runnable `config.yaml` starter. Run once per project.
 
 ```bash
-deployml init --provider gcp --project-id YOUR_PROJECT_ID
+deployml init --provider gcp --project-id YOUR_GCP_PROJECT_ID
 ```
 
 **Options:**
-- `--provider`, `-p`: Cloud provider — currently `gcp`
-- `--project-id`, `-j`: GCP project ID
+- `--provider`, `-p`: Cloud provider. Currently `gcp` is fully supported. `aws` and `azure` write skeleton configs.
+- `--project-id`, `-j`: GCP project ID.
+- `--path`: Directory where the project is initialized. Defaults to current directory.
+- `--overwrite`: Overwrite an existing `docker/` folder or `config.yaml`.
+
+The generated `config.yaml` includes a `provider.image_tag` field set to the deployml version. Override this to pin to a specific build tag.
 
 ---
 
 ## `deployml build-images`
 
-Build Docker images and push them to GCP Artifact Registry. Reads project ID and region from `config.yaml` by default.
+Build Docker images and push them to GCP Artifact Registry. Reads project ID, region, and image_tag from `config.yaml` by default.
 
 ```bash
-deployml build-images
+deployml build-images --create-repo
 ```
 
 **Options:**
-- `--config-path`, `-c`: Path to config YAML file (default: `config.yaml`)
-- `--docker-root`, `-d`: Path to folder containing Dockerfiles (default: built-in package images)
-- `--gcp-project`, `-p`: GCP project ID (default: inferred from config)
-- `--region`: GCP region (default: inferred from config)
-- `--repository`: Artifact Registry repository name (default: `mlops-images`)
-- `--tag`: Image tag (default: `latest`)
-- `--create-repo`: Create the Artifact Registry repository if it does not exist
+- `--config-path`, `-c`: Path to config YAML file. Default `config.yaml`.
+- `--docker-root`, `-d`: Folder containing subfolders with Dockerfiles. Default is the built-in deployml docker directory.
+- `--gcp-project`, `-p`: GCP project ID. Default inferred from config.
+- `--region`: GCP region. Default inferred from config.
+- `--repository`: Artifact Registry repository name. Default `mlops-images`.
+- `--tag`, `-t`: Image tag. Default reads `config.provider.image_tag`, falls back to `v{deployml_version}`.
+- `--create-repo`: Create the Artifact Registry repository on first run.
+- `--dry-run`: Print commands without executing.
+
+Builds run on Cloud Build, so a local Docker daemon is not required for GCP mode.
 
 ---
 
 ## `deployml deploy`
 
-Deploy infrastructure from a YAML config file.
+Deploy infrastructure from a YAML config file. Prompts for confirmation by default.
 
 ```bash
 deployml deploy --verbose
+deployml deploy --verbose --yes
 ```
 
 **Options:**
-- `--config-path`, `-c`: Path to config YAML file (default: `config.yaml`)
-- `--verbose`, `-v`: Stream Terraform logs instead of showing a progress bar
-- `--yes`, `-y`: Skip confirmation prompts
+- `--config-path`, `-c`: Path to config YAML. Default `config.yaml`.
+- `--verbose`, `-v`: Stream Terraform logs to stdout. Without this you get a progress bar.
+- `--yes`, `-y`: Skip the `[y/N]` deploy confirmation. Required for non-interactive scripts.
+- `--generate-only`, `-g`: Only generate manifests, no apply. For GKE flow.
+
+First-time deploy takes about 20 minutes because Cloud SQL Postgres provisioning is slow.
 
 ---
 
 ## `deployml get-urls`
 
-Print service URLs from the last deployment and write them to a `.env` file.
+Print service URLs from the last deployment and write them to a `.env` file. Database credentials are masked.
 
 ```bash
 deployml get-urls
+deployml get-urls --show-secrets
 ```
 
 **Options:**
-- `--config-path`, `-c`: Path to config YAML file (default: `config.yaml`)
-- `--env-path`: Where to write the `.env` file (default: `.env`)
+- `--config-path`, `-c`: Path to config YAML. Default `config.yaml`.
+- `--env-path`: Where to write the `.env`. Default `.env`.
+- `--show-secrets`: Additionally fetch and print the Grafana admin password and the Cloud SQL Auth Proxy connection command.
 
 ---
 
 ## `deployml destroy`
 
-Tear down all infrastructure for a given config.
+Tear down all infrastructure for a given config. Also removes the Artifact Registry repo created by build-images.
 
 ```bash
-deployml destroy
+deployml destroy --yes
 ```
 
 **Options:**
-- `--config-path`, `-c`: Path to config YAML file (default: `config.yaml`)
-- `--clean-workspace`: Delete the local workspace folder after destroy
-- `--yes`, `-y`: Skip confirmation prompts
+- `--config-path`, `-c`: Path to config YAML. Default `config.yaml`.
+- `--clean-workspace`: Remove the local `.deployml/` workspace folder after destroy.
+- `--yes`, `-y`: Skip both the destroy confirmation and the Terraform state cleanup prompt.
+- `--workspace`: Override the workspace name from config.
+
+On partial failure, Terraform state is preserved and the command prints recovery instructions including `gcloud asset search-all-resources` for finding residual resources.
+
+---
+
+## Config file reference
+
+Top-level fields used by deploy and destroy:
+
+```yaml
+name: string                       # workspace name. defaults to "default" if omitted
+provider:
+  name: gcp | aws | azure
+  project_id: string               # required for gcp
+  region: string                   # required for gcp
+  image_tag: string                # optional. default v{deployml_version}
+deployment:
+  type: cloud_run | cloud_vm | gke # required
+stack:
+  - <stage_name>:
+      name: mlflow | fastapi | grafana | feast | cron
+      params: {}                   # tool-specific
+```
+
+Supported stage names: `experiment_tracking`, `artifact_tracking`, `model_registry`, `model_serving`, `model_monitoring`, `feature_store`, `workflow_orchestration`.
+
+---
+
+## Advanced and experimental commands
+
+These exist in the CLI but are not part of the documented happy path. Use at your own risk and inspect the source.
+
+- `deployml generate`: Interactive YAML generator. Less useful since `init` now writes a runnable config.
+- `deployml status`: Stub. Reports deployment status of current workspace.
+- `deployml terraform`: Run raw terraform actions (plan, apply, destroy) on a rendered workspace.
+- `deployml teardown`: Manage scheduled auto-teardown jobs.
+- `deployml vm`: Placeholder for VM deployment.
+- `deployml mlflow-init`, `deployml mlflow-deploy`: MLflow-only flows.
+- `deployml minikube-init`, `deployml minikube-deploy`: Local Kubernetes flow for testing without GCP.
+- `deployml gke-init`, `deployml gke-deploy`, `deployml gke-apply`, `deployml gke-destroy`: GKE deployment path. `gke-destroy` removes the deployed manifests and, with `--delete-cluster`, the cluster itself. See the [GKE flow notes](../tutorials/gcp-cloud-run.md#gke-flow-notes) in the tutorial.
+
+The fully supported and tested path is GCP Cloud Run via `init`, `build-images`, `deploy`, `get-urls`, `destroy`.
diff --git a/docs/installation.md b/docs/installation.md
index c3271de..cdcc94f 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -2,12 +2,44 @@
 
 ## Prerequisites
 
-Before installing deployml, ensure you have:
+Install these tools first. `deployml doctor` checks all of them.
 
-- **Python 3.10+**
-- **Docker** (running)
-- **gcloud CLI** — authenticated with `gcloud auth login` and `gcloud auth application-default login`
-- **Terraform**
+- Python 3.11 or newer
+- Docker, running
+- Terraform 1.0 or newer
+- gcloud CLI
+
+## Project setup checklist
+
+Set up the project FIRST so the auth steps below can reference your project ID.
+
+1. Create a GCP project. Either in the [GCP Console](https://console.cloud.google.com) or via CLI:
+   ```bash
+   gcloud projects create YOUR_GCP_PROJECT_ID --name="Your Project Name"
+   ```
+2. Link a billing account. Verify with `gcloud billing projects describe YOUR_GCP_PROJECT_ID`. Expect `billingEnabled: true`.
+3. Confirm you have a sufficient IAM role on the project. `roles/owner` is the simplest. Or this explicit set:
+   - `roles/serviceusage.serviceUsageAdmin`
+   - `roles/artifactregistry.admin`
+   - `roles/cloudsql.admin`
+   - `roles/run.admin`
+   - `roles/storage.admin`
+   - `roles/bigquery.admin`
+   - `roles/iam.serviceAccountAdmin`
+   - `roles/iam.serviceAccountUser`
+
+## Authenticate gcloud
+
+Four commands. Run them after the project exists so you can pass its ID.
+
+```bash
+gcloud auth login                                                              # user auth
+gcloud auth application-default login                                          # ADC for Terraform and client libs
+gcloud auth application-default set-quota-project YOUR_GCP_PROJECT_ID          # bills BigQuery and client lib calls to the right project
+gcloud auth configure-docker us-west1-docker.pkg.dev                           # Docker push to Artifact Registry
+```
+
+Replace `us-west1` with the region you plan to deploy in. The third command is critical. Skipping it leaves ADC pointing at whatever project you used last, and the example scripts fail with `USER_PROJECT_DENIED` if that project was deleted. The fourth command lets Docker push to Artifact Registry. Skipping it makes `deployml build-images` fail with `denied: User cannot access repository`.
 
 ## Install deployml
 
@@ -15,12 +47,21 @@ Before installing deployml, ensure you have:
 pip install deployml-core
 ```
 
-## Verify Installation
+## Verify
 
 ```bash
-deployml doctor
+deployml doctor --project-id YOUR_GCP_PROJECT_ID
 ```
 
-This checks that all required tools (`gcloud`, `docker`, `terraform`, `bq`) are installed and authenticated. Install any missing tools and rerun until it passes.
+The doctor checks tool versions, authentication, ADC, the `bq` CLI, enabled APIs, and your IAM roles on the project. Install any missing tool and rerun until every line is green.
+
+## Platform notes
+
+deployml is tested on macOS and Linux. Windows users can run it with these caveats:
+
+- The auth commands above and all `deployml` CLI calls work the same in PowerShell, cmd, and WSL.
+- `export PATH=...` examples in the tutorials are bash. On PowerShell use `$env:PATH = "..." + $env:PATH`. On cmd use `set PATH=...;%PATH%`.
+- Docker Desktop on Windows uses the WSL2 backend by default. `deployml build-images` against Cloud Build does not need a local Docker daemon, so the safest path is to skip local builds and let Cloud Build do the work.
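+- If you clone the repo on Windows, the included `.gitattributes` forces shell scripts and Dockerfiles to LF line endings. Without this, scripts copied into images keep CRLF line endings and the containers fail at startup, typically with `no such file or directory` (from the mangled shebang line) or `exec format error`.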
 
 - [Get Started →](tutorials/overview.md)
diff --git a/docs/tutorials/example.md b/docs/tutorials/example.md
index 7c8fe50..046a8fd 100644
--- a/docs/tutorials/example.md
+++ b/docs/tutorials/example.md
@@ -20,7 +20,7 @@ All scripts read from the `.env` file written by `deployml get-urls`. It should
 MLFLOW_URL=https://...
 FASTAPI_URL=https://...
 GRAFANA_URL=https://...
-BIGQUERY_PROJECT=your-project-id
+BIGQUERY_PROJECT=YOUR_GCP_PROJECT_ID
 BIGQUERY_DATASET=mlops
 ```
 
@@ -38,7 +38,7 @@ Generates 500 rows of synthetic housing data and loads them into the `offline_fe
 
 Verify:
 ```bash
-bq query --use_legacy_sql=false 'SELECT COUNT(*) FROM `YOUR_PROJECT.mlops.offline_features`'
+bq query --use_legacy_sql=false 'SELECT COUNT(*) FROM `YOUR_GCP_PROJECT_ID.mlops.offline_features`'
 ```
 
 ### Step 2 — Train a model with MLflow
@@ -71,7 +71,7 @@ Pulls 50 rows from `offline_features` and sends each to FastAPI `/predict`. Fast
 
 Verify:
 ```bash
-bq query --use_legacy_sql=false 'SELECT COUNT(*) FROM `YOUR_PROJECT.mlops.predictions`'
+bq query --use_legacy_sql=false 'SELECT COUNT(*) FROM `'$BIGQUERY_PROJECT'.mlops.predictions`'
 ```
 
 Also check FastAPI is serving the model:
diff --git a/docs/tutorials/gcp-cloud-run.md b/docs/tutorials/gcp-cloud-run.md
index 9bae181..5f482fd 100644
--- a/docs/tutorials/gcp-cloud-run.md
+++ b/docs/tutorials/gcp-cloud-run.md
@@ -1,44 +1,36 @@
 # GCP Cloud Run Tutorial
 
-This tutorial walks you through deploying a full MLOps stack on GCP using Cloud Run. By the end you will have MLflow (experiment tracking, artifact storage, model registry), FastAPI (model serving), and Grafana (monitoring dashboard) running in the cloud.
+This tutorial walks you through deploying a full MLOps stack on GCP using Cloud Run. By the end you will have MLflow, FastAPI, and Grafana running in the cloud.
 
 ## Prerequisites
 
-Make sure `deployml doctor` passes before starting. You will need:
+Run `deployml doctor --project-id YOUR_GCP_PROJECT_ID` first. It checks everything in the [installation guide](../installation.md). Make sure the IAM and API checks pass before you continue.
 
-- `gcloud` CLI, authenticated (`gcloud auth login` and `gcloud auth application-default login`)
-- Docker (running)
-- Terraform
+## 1. Create and prepare your GCP project
 
-## 1. Create a GCP Project
+1. Create a project in the [GCP Console](https://console.cloud.google.com). Note the project ID.
+2. Link a billing account. Verify with `gcloud billing projects describe YOUR_GCP_PROJECT_ID`. The output should include `billingEnabled: true`.
+3. Confirm you have a sufficient IAM role on the project. `roles/owner` is the simplest. See the [installation guide](../installation.md) for the minimum explicit set.
 
-Create a new project in the [GCP Console](https://console.cloud.google.com) and enable billing. Note your project ID — you will use it throughout this tutorial.
+## 2. Initialize the project
 
-## 2. Initialize the Project
-
-`init` enables the required GCP APIs and creates the Artifact Registry repository that your Docker images will be pushed to.
+`init` enables the required GCP APIs and creates a local `docker/` folder and `config.yaml` template.
 
 ```bash
-deployml init --provider gcp --project-id YOUR_PROJECT_ID
+deployml init --provider gcp --project-id YOUR_GCP_PROJECT_ID
 ```
 
-This only needs to be run once per project. It takes a few minutes while GCP enables APIs.
-
-## 3. Create a Configuration File
-
-Copy the example config and fill in your project ID:
+This only needs to be run once per project. API enablement takes a few minutes.
 
-```bash
-cp config.example.yaml config.yaml
-```
+## 3. Review the configuration file
 
-Edit `config.yaml` and replace `YOUR_GCP_PROJECT_ID` with your actual project ID:
+`init` already wrote a runnable `config.yaml` with your project ID filled in. Inspect it and adjust service names, region, or `image_tag` if you need to. The default looks like this:
 
 ```yaml
 name: gcp-mlops-stack-mlflow
 provider:
   name: gcp
-  project_id: YOUR_PROJECT_ID
+  project_id: YOUR_GCP_PROJECT_ID
   region: us-west1
 deployment:
   type: cloud_run
@@ -50,7 +42,7 @@ stack:
   - artifact_tracking:
       name: mlflow
       params:
-        artifact_bucket: mlflow-artifacts-YOUR_PROJECT_ID
+        artifact_bucket: mlflow-artifacts-YOUR_GCP_PROJECT_ID
   - model_registry:
       name: mlflow
       params:
@@ -67,22 +59,20 @@ stack:
 
 **What each block does:**
 
-- `experiment_tracking` + `artifact_tracking` + `model_registry` — these three together deploy a single MLflow server backed by Cloud SQL (Postgres) for metadata and a GCS bucket for artifacts
-- `model_serving` — deploys a FastAPI container that pulls the latest registered model from MLflow on startup
-- `model_monitoring` — deploys Grafana connected to the Postgres `metrics` database
+- `experiment_tracking` + `artifact_tracking` + `model_registry` together deploy a single MLflow server backed by Cloud SQL Postgres for metadata and a GCS bucket for artifacts.
+- `model_serving` deploys a FastAPI container that pulls the latest registered model from MLflow on startup.
+- `model_monitoring` deploys Grafana connected to the Postgres `metrics` database.
 
-## 4. Build Docker Images
-
-Build and push the service images to Artifact Registry:
+## 4. Build Docker images
 
 ```bash
 deployml build-images --create-repo
 ```
 
 This reads your project ID and region from `config.yaml` and pushes images for MLflow, FastAPI, and Grafana to:
-`us-west1-docker.pkg.dev/YOUR_PROJECT_ID/mlops-images/`
+`us-west1-docker.pkg.dev/YOUR_GCP_PROJECT_ID/mlops-images/`
 
-Building takes a few minutes. You only need to rebuild if you change a Dockerfile or application code inside a container.
+The first build creates the Artifact Registry repo. Subsequent runs reuse it. Builds run on Cloud Build, so you do not need a local Docker daemon for this step.
 
 ## 5. Deploy
 
@@ -90,83 +80,116 @@ Building takes a few minutes. You only need to rebuild if you change a Dockerfil
 deployml deploy --verbose
 ```
 
-`--verbose` streams Terraform output directly so you can see what's being created. Without it you get a progress bar. The first deployment takes roughly 20 minutes — Cloud SQL Postgres takes 15-20 minutes to provision.
+Deploy prompts `Do you want to deploy the stack? [y/N]` before applying. Type `y` to proceed. Pass `--yes` to skip the prompt in scripts.
+
+`--verbose` streams Terraform output directly. The first deployment takes roughly 20 minutes because Cloud SQL Postgres provisioning is slow.
 
 What gets created:
 
-- Cloud SQL Postgres instance with `mlflow` and `metrics` databases
+- Cloud SQL Postgres instance with `mlflow` and `metrics` databases. The public IP is allocated but no client network is authorized, so the DB is reachable only from Cloud Run via the Cloud SQL Auth Proxy tunnel.
 - GCS bucket for MLflow artifacts
-- Cloud Run services for MLflow, FastAPI, and Grafana
+- Cloud Run services for MLflow, FastAPI, and Grafana. MLflow runs with `min_instances = 1` to avoid cold starts on the tracking server.
 - BigQuery `mlops` dataset with four tables
 - IAM service accounts and bindings
 
-## 6. Get Service URLs
+## 6. Costs
+
+While the stack is up, expect rough monthly cost in the $30 to $80 range depending on usage:
+
+- Cloud SQL Postgres `db-g1-small`: about $25 per month, even when idle.
+- MLflow Cloud Run with `min_instances = 1`: about $5 per month for the warm instance.
+- BigQuery storage: tiny until you load real data.
+- Cloud Run for FastAPI and Grafana: scales to zero when idle.
+
+Cloud Build minutes during `build-images` are pay-per-use and usually a few cents per cycle.
+
+**Run `deployml destroy` as soon as you are done.** Cloud SQL keeps billing while running.
+
+## 7. Get service URLs
 
 ```bash
 deployml get-urls
 ```
 
-This prints all service URLs and writes them to a `.env` file in the current directory. Example output:
+This prints all service URLs and writes them to a `.env` file in the current directory. Database credentials are masked so the `.env` is safe to share with your IDE or commit to a local notebook.
 
-```
-  experiment_tracking_mlflow_url: https://mlflow-server-xxxx-uw.a.run.app
-  model_serving_fastapi_url: https://fastapi-mlflow-server-xxxx-uw.a.run.app
-  model_monitoring_grafana_url: https://grafana-server-xxxx-uw.a.run.app
+Add `--show-secrets` to additionally print the Grafana admin password and the Cloud SQL Auth Proxy connection command:
 
- .env written to /your/project/.env
+```bash
+deployml get-urls --show-secrets
 ```
 
-## 7. Verify the Stack
+## 8. Verify the stack
 
 **MLflow**
 
-Open the MLflow URL in your browser — you should see the MLflow UI with no experiments yet.
-
 ```bash
-curl https://YOUR_MLFLOW_URL/health
+curl https://YOUR_MLFLOW_URL/health   # returns OK
 ```
 
+Open the MLflow URL in your browser to see the UI.
+
 **FastAPI**
 
 ```bash
 curl https://YOUR_FASTAPI_URL/health
 ```
 
-The `/docs` endpoint gives you the auto-generated OpenAPI UI.
+The `/docs` endpoint shows the auto-generated OpenAPI UI.
 
 **Grafana**
 
-Open the Grafana URL in your browser. Default credentials are `admin` / `admin`. You will be prompted to change the password on first login.
+Open the Grafana URL in your browser. Username is `admin`. The password is auto-generated and stored in Secret Manager. Fetch it with:
 
-**BigQuery**
+```bash
+deployml get-urls --show-secrets
+```
 
-Verify the `mlops` dataset and all four tables were created:
+The output prints the password directly and also gives the secret ID so you can rotate or re-fetch later.
+
+**BigQuery**
 
 ```bash
-bq ls --project_id=YOUR_PROJECT_ID mlops
+bq ls --project_id=YOUR_GCP_PROJECT_ID mlops
 ```
 
 You should see `offline_features`, `predictions`, `ground_truth`, and `drift_metrics`.
 
-## 8. Run the End-to-End Example
+## 9. Run the end-to-end example
 
 With the stack running, follow the [example walkthrough](example.md) to train a model, register it, serve predictions through FastAPI, and visualize drift metrics in Grafana.
 
-## 9. Teardown
+## 10. Teardown
 
-When you are done, destroy all infrastructure to avoid ongoing charges:
+When you are done, destroy all infrastructure to stop billing:
 
 ```bash
-deployml destroy
+deployml destroy --yes
 ```
 
-This deletes all Cloud Run services, Cloud SQL instance, GCS bucket contents, and Terraform state. It does not delete the Artifact Registry images or the GCP project itself.
+`--yes` skips both the destroy confirm and the workspace cleanup prompt. This deletes all Cloud Run services, Cloud SQL instance, GCS bucket contents, and Terraform state files. It does not delete the Artifact Registry images or the GCP project itself.
+
+To free up an active project slot, also run:
+
+```bash
+gcloud projects delete YOUR_GCP_PROJECT_ID
+```
 
 ## Troubleshooting
 
-**Terraform lock file error**
+**`build-images` fails with denied: User cannot access repository**
 
-If a previous deploy was interrupted, you may see a lock file error. Delete the lock file and retry:
+You skipped `gcloud auth configure-docker us-west1-docker.pkg.dev`. Run it and retry.
+
+**`init` says Project not found or not accessible**
+
+Either the project ID is wrong, or your gcloud account does not have access. Verify with `gcloud projects describe YOUR_GCP_PROJECT_ID`.
+
+**Deploy hangs at MLflow Cloud Run creation**
+
+Check the Cloud Run logs in the link the error prints. The most common cause is a misconfigured DB connection. If you are on a fork, confirm the template passes `connection_string_cloud_sql` to MLflow rather than `connection_string`.
+
+**Terraform lock file error after an interrupted deploy**
 
 ```bash
 rm .deployml/YOUR_CONFIG_NAME/terraform/.terraform.lock.hcl
@@ -176,9 +199,74 @@ deployml deploy --verbose
 **Service logs**
 
 ```bash
-gcloud run services logs read SERVICE_NAME --project YOUR_PROJECT_ID --region us-west1
+gcloud run services logs read SERVICE_NAME --project YOUR_GCP_PROJECT_ID --region us-west1
 ```
 
-**Cloud SQL connection issues**
+**`Your active project does not match the quota project` warning**
+
+Not cosmetic. If the ADC quota project points at a deleted or unrelated project, BigQuery and other Google client libraries fail with `403 USER_PROJECT_DENIED`. Always run this once per fresh project before using the example scripts:
+
+```bash
+gcloud auth application-default set-quota-project YOUR_GCP_PROJECT_ID
+```
+
+**`USER_PROJECT_DENIED` or `Project ... has been deleted` in example scripts**
+
+Same root cause as above. Run the ADC quota project command.
+
+**`The project cannot be created because you have exceeded your allotted project quota`**
+
+GCP limits how many projects you can create. Each deleted project still counts against the quota during a 30-day grace period. If you are iterating on fresh test projects, you will hit this. Workarounds:
+
+- Wait for the grace period to expire.
+- Request a higher project creation quota in the [GCP Console](https://console.cloud.google.com/iam-admin/quotas).
+- Reuse one project for multiple test runs, calling `deployml destroy` between them. The drift detection in `deployml deploy` makes this safe.
+
+## GKE flow notes
+
+The `deployml gke-*` commands are an alternative to Cloud Run for users who want a Kubernetes cluster. The flow is intentionally lighter than Cloud Run and has a few sharp edges:
+
+**1. GKE uses gcr.io. Cloud Run uses Artifact Registry.**
+
+`deployml build-images --create-repo` pushes to `{region}-docker.pkg.dev/{project}/mlops-images/` (Artifact Registry). The GKE flow expects images at `gcr.io/{project}/...`. These are different registries. If you ran `build-images` for the Cloud Run flow and then try `gke-init`, the GKE manifests reference an image that does not exist. For GKE, build and push manually:
+
+```bash
+docker build --platform linux/amd64 -t gcr.io/YOUR_GCP_PROJECT_ID/fastapi/fastapi:v0.0.42 ./docker/fastapi
+docker push gcr.io/YOUR_GCP_PROJECT_ID/fastapi/fastapi:v0.0.42
+```
+
+**2. kubectl needs `gke-gcloud-auth-plugin`.**
+
+`gcloud components install gke-gcloud-auth-plugin` installs the plugin but it may not be on PATH. Without it on PATH, `kubectl get nodes` fails with `executable gke-gcloud-auth-plugin not found`.
+
+macOS with Homebrew gcloud (bash or zsh):
+
+```bash
+export PATH="/opt/homebrew/share/google-cloud-sdk/bin:$PATH"
+```
+
+On Linux with gcloud installed system-wide, the plugin is usually already on PATH.
+
+Windows PowerShell:
+
+```powershell
+$env:PATH = "C:\Users\$env:USERNAME\AppData\Local\Google\Cloud SDK\google-cloud-sdk\bin;" + $env:PATH
+```
+
+Windows cmd:
+
+```cmd
+set PATH=C:\Users\%USERNAME%\AppData\Local\Google\Cloud SDK\google-cloud-sdk\bin;%PATH%
+```
+
+Adjust the path if your gcloud install is elsewhere.
+
+**3. You manage the cluster yourself.**
+
+`deployml` does not create or delete GKE clusters for you. Use `gcloud container clusters create-auto` for Autopilot or `gcloud container clusters create` for standard. To tear down:
+
+```bash
+deployml gke-destroy --manifest-dir manifests --cluster gke-test --project YOUR_GCP_PROJECT_ID --region us-west1 --delete-cluster
+```
 
-If destroy fails with an active connections error, the Cloud Run services may not have been fully shut down. Re-run destroy — it will retry the Cloud SQL cleanup.
+`--delete-cluster` also removes the cluster. Omit it to only delete the deployed manifests.
diff --git a/docs/tutorials/overview.md b/docs/tutorials/overview.md
index 4d358c1..f061b76 100644
--- a/docs/tutorials/overview.md
+++ b/docs/tutorials/overview.md
@@ -21,7 +21,7 @@ Once deployed, walk through a complete MLOps workflow using a synthetic housing
 deployml doctor
 
 # 2. Enable GCP APIs
-deployml init --provider gcp --project-id YOUR_PROJECT_ID
+deployml init --provider gcp --project-id YOUR_GCP_PROJECT_ID
 
 # 3. Build Docker images
 deployml build-images --create-repo
diff --git a/example/README.md b/example/README.md
index 5bcb181..7c17f84 100644
--- a/example/README.md
+++ b/example/README.md
@@ -5,7 +5,7 @@ This example walks through a complete MLOps workflow using a synthetic housing p
 ## Prerequisites
 
 1. Deploy the infrastructure following the [GCP Cloud Run tutorial](../docs/tutorials/gcp-cloud-run.md)
-2. Run `deployml get-urls --config-path config.yaml` to generate your `.env` file
+2. Run `deployml get-urls` to generate your `.env` file. The scripts read it from your current working directory.
 3. Install Python dependencies:
 
 ```bash
@@ -14,14 +14,18 @@ pip install mlflow scikit-learn pandas numpy google-cloud-bigquery db-dtypes pyt
 
 ## Environment
 
-All scripts read from the `.env` file written by `deployml get-urls`. It should contain:
+All scripts read from the `.env` file written by `deployml get-urls`. It contains:
 
 ```
 MLFLOW_URL=https://...
 FASTAPI_URL=https://...
 GRAFANA_URL=https://...
-BIGQUERY_PROJECT=your-project-id
+ARTIFACT_BUCKET=mlflow-artifacts-YOUR_GCP_PROJECT_ID
+BIGQUERY_PROJECT=YOUR_GCP_PROJECT_ID
 BIGQUERY_DATASET=mlops
+GRAFANA_ADMIN_PASSWORD_SECRET_ID=grafana-server-admin-password
+MLFLOW_DSN_SECRET_ID=mlflow-postgres-YOUR_GCP_PROJECT_ID-mlflow-dsn
+INSTANCE_CONNECTION_NAME=YOUR_GCP_PROJECT_ID:us-west1:mlflow-postgres-YOUR_GCP_PROJECT_ID
 ```
 
 ## Scripts
@@ -38,7 +42,7 @@ Generates 500 rows of synthetic housing data and loads them into the `offline_fe
 
 Verify:
 ```bash
-bq query --use_legacy_sql=false 'SELECT COUNT(*) FROM `YOUR_PROJECT.mlops.offline_features`'
+bq query --use_legacy_sql=false 'SELECT COUNT(*) FROM `'$BIGQUERY_PROJECT'.mlops.offline_features`'
 ```
 
 ### Step 2 — Train a model with MLflow
@@ -71,7 +75,7 @@ Pulls 50 rows from `offline_features` and sends each to FastAPI `/predict`. Fast
 
 Verify:
 ```bash
-bq query --use_legacy_sql=false 'SELECT COUNT(*) FROM `YOUR_PROJECT.mlops.predictions`'
+bq query --use_legacy_sql=false 'SELECT COUNT(*) FROM `'$BIGQUERY_PROJECT'.mlops.predictions`'
 ```
 
 Also check FastAPI is serving the model:
@@ -108,7 +112,7 @@ Provisions a monitoring dashboard in Grafana via the API showing:
 - Feature mean shift per feature
 - MAE over time
 
-Open `GRAFANA_URL` in your browser (login: `admin` / `admin`) to view the dashboard.
+Open `GRAFANA_URL` in your browser to view the dashboard. The username is `admin`. Fetch the password with `deployml get-urls --show-secrets`, which prints it directly and also gives you the secret ID for later. Script 07 itself fetches the password from Secret Manager automatically using `GRAFANA_ADMIN_PASSWORD_SECRET_ID` from `.env`.
 
 ## Dataset
 

From a2749d26d4e418aba1106f100cfceb8252f9a641 Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sat, 30 May 2026 17:16:35 -0700
Subject: [PATCH 09/31] example: guard env vars with actionable errors

Each script checks for the variables written by get-urls and exits with a clear message pointing to the right command instead of failing cryptically.
---
 example/scripts/01_load_training_data.py    | 12 +++++--
 example/scripts/02_train_model.py           | 10 ++++--
 example/scripts/03_register_model.py        |  7 +++--
 example/scripts/04_make_predictions.py      |  9 ++++--
 example/scripts/05_generate_ground_truth.py |  7 +++--
 example/scripts/06_compute_drift_metrics.py |  7 +++--
 example/scripts/07_setup_grafana.py         | 35 +++++++++++++++++----
 7 files changed, 67 insertions(+), 20 deletions(-)

diff --git a/example/scripts/01_load_training_data.py b/example/scripts/01_load_training_data.py
index db5802a..eec4891 100644
--- a/example/scripts/01_load_training_data.py
+++ b/example/scripts/01_load_training_data.py
@@ -6,12 +6,20 @@
 import numpy as np
 import pandas as pd
 from datetime import datetime, timezone
+from pathlib import Path
 from dotenv import load_dotenv
 from google.cloud import bigquery
 
-load_dotenv()
+# Look for .env in the user's working directory, not next to this script.
+# `deployml get-urls` writes .env into the directory the user runs it from.
+load_dotenv(Path.cwd() / ".env")
 
-PROJECT = os.environ["BIGQUERY_PROJECT"]
+PROJECT = os.environ.get("BIGQUERY_PROJECT")
+if not PROJECT:
+    raise SystemExit(
+        "BIGQUERY_PROJECT is not set. Run `deployml get-urls` after `deployml deploy` to write a .env "
+        "with BIGQUERY_PROJECT, MLFLOW_URL, and others. Then re-run this script from the same directory."
+    )
 DATASET = os.getenv("BIGQUERY_DATASET", "mlops")
 TABLE = f"{PROJECT}.{DATASET}.offline_features"
 N_ROWS = 500
diff --git a/example/scripts/02_train_model.py b/example/scripts/02_train_model.py
index c0ef147..a66f9b8 100644
--- a/example/scripts/02_train_model.py
+++ b/example/scripts/02_train_model.py
@@ -5,16 +5,20 @@
 import numpy as np
 import mlflow
 import mlflow.sklearn
+from pathlib import Path
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import root_mean_squared_error, r2_score
 from google.cloud import bigquery
 from dotenv import load_dotenv
 
-load_dotenv()
+# Read .env from the user's working dir, not the script's location.
+load_dotenv(Path.cwd() / ".env")
 
-MLFLOW_URL     = os.environ["MLFLOW_URL"]
-PROJECT        = os.environ["BIGQUERY_PROJECT"]
+MLFLOW_URL = os.environ.get("MLFLOW_URL")
+PROJECT    = os.environ.get("BIGQUERY_PROJECT")
+if not MLFLOW_URL or not PROJECT:
+    raise SystemExit("MLFLOW_URL or BIGQUERY_PROJECT missing. Run `deployml get-urls` to write .env.")
 DATASET        = os.getenv("BIGQUERY_DATASET", "mlops")
 EXPERIMENT     = "housing-price-prediction"
 FEATURE_COLS   = ["bedrooms", "bathrooms", "area_sqft", "lot_size", "year_built", "city", "state"]
diff --git a/example/scripts/03_register_model.py b/example/scripts/03_register_model.py
index c6a8c15..4d46be8 100644
--- a/example/scripts/03_register_model.py
+++ b/example/scripts/03_register_model.py
@@ -3,12 +3,15 @@
 """
 import os
 import mlflow
+from pathlib import Path
 from mlflow.tracking import MlflowClient
 from dotenv import load_dotenv
 
-load_dotenv()
+load_dotenv(Path.cwd() / ".env")
 
-MLFLOW_URL  = os.environ["MLFLOW_URL"]
+MLFLOW_URL = os.environ.get("MLFLOW_URL")
+if not MLFLOW_URL:
+    raise SystemExit("MLFLOW_URL missing. Run `deployml get-urls` to write .env.")
 MODEL_NAME  = "HousingPriceModel"
 EXPERIMENT  = "housing-price-prediction"
 
diff --git a/example/scripts/04_make_predictions.py b/example/scripts/04_make_predictions.py
index 4247156..b349b6a 100644
--- a/example/scripts/04_make_predictions.py
+++ b/example/scripts/04_make_predictions.py
@@ -4,13 +4,16 @@
 """
 import os
 import requests
+from pathlib import Path
 from google.cloud import bigquery
 from dotenv import load_dotenv
 
-load_dotenv()
+load_dotenv(Path.cwd() / ".env")
 
-FASTAPI_URL = os.environ["FASTAPI_URL"]
-PROJECT     = os.environ["BIGQUERY_PROJECT"]
+FASTAPI_URL = os.environ.get("FASTAPI_URL")
+PROJECT     = os.environ.get("BIGQUERY_PROJECT")
+if not FASTAPI_URL or not PROJECT:
+    raise SystemExit("FASTAPI_URL or BIGQUERY_PROJECT missing. Run `deployml get-urls` to write .env.")
 DATASET     = os.getenv("BIGQUERY_DATASET", "mlops")
 N_PREDICT   = 50
 
diff --git a/example/scripts/05_generate_ground_truth.py b/example/scripts/05_generate_ground_truth.py
index 175885e..6d51300 100644
--- a/example/scripts/05_generate_ground_truth.py
+++ b/example/scripts/05_generate_ground_truth.py
@@ -4,13 +4,16 @@
 """
 import os
 import numpy as np
+from pathlib import Path
 from datetime import datetime, timezone
 from google.cloud import bigquery
 from dotenv import load_dotenv
 
-load_dotenv()
+load_dotenv(Path.cwd() / ".env")
 
-PROJECT = os.environ["BIGQUERY_PROJECT"]
+PROJECT = os.environ.get("BIGQUERY_PROJECT")
+if not PROJECT:
+    raise SystemExit("BIGQUERY_PROJECT missing. Run `deployml get-urls` to write .env.")
 DATASET = os.getenv("BIGQUERY_DATASET", "mlops")
 NOISE   = 15000  # std dev of fake noise around predicted value
 
diff --git a/example/scripts/06_compute_drift_metrics.py b/example/scripts/06_compute_drift_metrics.py
index 1ccd831..3dfb85a 100644
--- a/example/scripts/06_compute_drift_metrics.py
+++ b/example/scripts/06_compute_drift_metrics.py
@@ -7,13 +7,16 @@
 """
 import os
 import numpy as np
+from pathlib import Path
 from datetime import datetime, timezone
 from google.cloud import bigquery
 from dotenv import load_dotenv
 
-load_dotenv()
+load_dotenv(Path.cwd() / ".env")
 
-PROJECT      = os.environ["BIGQUERY_PROJECT"]
+PROJECT = os.environ.get("BIGQUERY_PROJECT")
+if not PROJECT:
+    raise SystemExit("BIGQUERY_PROJECT missing. Run `deployml get-urls` to write .env.")
 DATASET      = os.getenv("BIGQUERY_DATASET", "mlops")
 FEATURE_COLS = ["bedrooms", "bathrooms", "area_sqft", "lot_size", "year_built", "city", "state"]
 
diff --git a/example/scripts/07_setup_grafana.py b/example/scripts/07_setup_grafana.py
index 3f7867b..af88f8e 100644
--- a/example/scripts/07_setup_grafana.py
+++ b/example/scripts/07_setup_grafana.py
@@ -8,20 +8,43 @@
 - MAE over time
 
 Prerequisites: Grafana must be running and accessible at GRAFANA_URL.
-Default credentials: admin / admin (change on first login).
+Auth: the admin password lives in Secret Manager. This script fetches it via
+gcloud using GRAFANA_ADMIN_PASSWORD_SECRET_ID from .env (run `deployml get-urls`).
+Override by exporting GRAFANA_PASSWORD if you prefer.
 """
 import os
 import json
+import subprocess
 import requests
+from pathlib import Path
 from dotenv import load_dotenv
 
-load_dotenv()
+load_dotenv(Path.cwd() / ".env")
 
-GRAFANA_URL  = os.environ["GRAFANA_URL"].rstrip("/")
+GRAFANA_URL = os.environ.get("GRAFANA_URL")
+PROJECT     = os.environ.get("BIGQUERY_PROJECT")
+if not GRAFANA_URL or not PROJECT:
+    raise SystemExit("GRAFANA_URL or BIGQUERY_PROJECT missing. Run `deployml get-urls` to write .env.")
+GRAFANA_URL = GRAFANA_URL.rstrip("/")
+DATASET = os.getenv("BIGQUERY_DATASET", "mlops")
 GRAFANA_USER = os.getenv("GRAFANA_USER", "admin")
-GRAFANA_PASS = os.getenv("GRAFANA_PASSWORD", "admin")
-PROJECT      = os.environ["BIGQUERY_PROJECT"]
-DATASET      = os.getenv("BIGQUERY_DATASET", "mlops")
+
+GRAFANA_PASS = os.getenv("GRAFANA_PASSWORD")
+if not GRAFANA_PASS:
+    secret_id = os.environ.get("GRAFANA_ADMIN_PASSWORD_SECRET_ID")
+    if not secret_id:
+        raise SystemExit(
+            "Grafana password not available. Either export GRAFANA_PASSWORD, "
+            "or run `deployml get-urls` so .env has GRAFANA_ADMIN_PASSWORD_SECRET_ID."
+        )
+    proc = subprocess.run(
+        ["gcloud", "secrets", "versions", "access", "latest",
+         "--secret", secret_id, "--project", PROJECT],
+        capture_output=True, text=True,
+    )
+    if proc.returncode != 0:
+        raise SystemExit(f"Failed to fetch Grafana password: {proc.stderr.strip()}")
+    GRAFANA_PASS = proc.stdout.strip()
 
 session = requests.Session()
 session.auth = (GRAFANA_USER, GRAFANA_PASS)

From d640b21dbcf14936f6401af1e86ea43ccc299eb3 Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sat, 30 May 2026 17:16:35 -0700
Subject: [PATCH 10/31] add unit tests and packaging config

Add offline unit tests for config validation, helper probes, and teardown cron timing. Update packaging metadata and normalize shell and Dockerfile line endings.
---
 .gitattributes        |  44 ++++--
 pyproject.toml        |  11 +-
 tests/README.md       |  28 ++++
 tests/__init__.py     |   0
 tests/test_helpers.py | 331 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 398 insertions(+), 16 deletions(-)
 create mode 100644 tests/README.md
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_helpers.py

diff --git a/.gitattributes b/.gitattributes
index 30bbf2c..e2bd0c7 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,21 +1,47 @@
-# Count Python and HCL only
-*.py linguist-detectable=true
+# Default: normalize line endings on commit, native on checkout
+* text=auto
 
-# Ignore notebooks
-*.ipynb linguist-documentation
+# Shell scripts MUST stay LF so they run in Linux containers.
+# Without this, a Windows clone produces CRLF which causes
+# "exec format error" inside Docker images.
+*.sh text eol=lf
+*.bash text eol=lf
+
+# Dockerfile content and configs that may run in Linux
+Dockerfile text eol=lf
+*.dockerfile text eol=lf
+.dockerignore text eol=lf
+
+# Templates rendered into Terraform / Kubernetes / YAML
+# also need stable LF since they may be consumed by Linux tools.
+*.tf text eol=lf
+*.tf.j2 text eol=lf
+*.tfvars text eol=lf
+*.yaml text eol=lf
+*.yml text eol=lf
+*.j2 text eol=lf
+*.tpl text eol=lf
+*.json text eol=lf
+
+# Python source: native auto handling (HCL / *.tf files are pinned to LF above)
+*.py text=auto
 
-# Ignore configs and data
+# Linguist hints for repo language stats
+*.py linguist-detectable=true
+*.ipynb linguist-documentation
 *.yaml linguist-documentation
 *.yml linguist-documentation
 *.json linguist-documentation
 *.csv linguist-documentation
 *.parquet linguist-documentation
-
-# Ignore templates
 *.jinja linguist-documentation
 *.j2 linguist-documentation
 *.tpl linguist-documentation
 *.tf linguist-documentation
-
-# Ignore misc
 *.md linguist-documentation
+
+# Binary files
+*.png binary
+*.jpg binary
+*.gif binary
+*.parquet binary
diff --git a/pyproject.toml b/pyproject.toml
index 51b9805..ef15cfb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,13 +29,10 @@ deployml = "deployml.cli.cli:main"
 
 [tool.poetry]
 packages = [{include = "deployml", from = "src"}]
-
-[tool.setuptools]
-include-package-data = true
-
-[tool.setuptools.package-data]
-mlops_infra = [
-  "docker/**"
+include = [
+  { path = "src/deployml/docker/**/*", format = ["sdist", "wheel"] },
+  { path = "src/deployml/templates/**/*", format = ["sdist", "wheel"] },
+  { path = "src/deployml/terraform/**/*", format = ["sdist", "wheel"] }
 ]
 
 [build-system]
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 0000000..600769e
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,28 @@
+# Tests
+
+Unit tests for the pure-Python helpers and validators in `deployml.cli` and
+`deployml.utils.helpers`. These tests do NOT touch GCP. Subprocess calls are
+mocked.
+
+Run with:
+
+```bash
+conda run -n ml pytest tests/ -v
+```
+
+Or from the project root:
+
+```bash
+pytest tests/
+```
+
+What is covered:
+- `_load_config_or_exit`: valid mapping, malformed YAML, non-mapping, empty.
+- `_validate_deploy_config_or_exit`: every documented error path.
+- `validate_gcp_project`, `validate_gcp_region`: subprocess mocked.
+- `get_missing_iam_roles`: owner short-circuit and the diff path.
+- `check_gcp_adc`, `check_bq`, `check_docker_daemon`, `get_terraform_version`: subprocess mocked.
+
+What is NOT covered:
+- Anything that requires a real GCP project (deploy, destroy, init API enable).
+  Those are validated by the end-to-end walkthrough in CLAUDE_INSTRUCTIONS.
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
new file mode 100644
index 0000000..33ec6cf
--- /dev/null
+++ b/tests/test_helpers.py
@@ -0,0 +1,331 @@
+"""Unit tests for deployml helpers and config validators. No GCP calls."""
+from unittest.mock import patch, MagicMock
+
+import pytest
+import typer
+
+import deployml.utils.helpers as helpers_mod
+from deployml.cli.cli import _load_config_or_exit, _validate_deploy_config_or_exit
+from deployml.utils.helpers import (
+    check_gcp_adc,
+    check_bq,
+    check_docker_daemon,
+    get_terraform_version,
+    validate_gcp_project,
+    validate_gcp_region,
+    get_missing_iam_roles,
+)
+
+
+# ---------- _load_config_or_exit ----------
+
+def test_load_config_valid_mapping(tmp_path):
+    f = tmp_path / "c.yaml"
+    f.write_text("foo: bar\nnested:\n  key: 1\n")
+    assert _load_config_or_exit(f) == {"foo": "bar", "nested": {"key": 1}}
+
+
+def test_load_config_malformed_yaml_exits(tmp_path):
+    f = tmp_path / "c.yaml"
+    f.write_text("this is: not valid: yaml: at all:\n")
+    with pytest.raises(typer.Exit):
+        _load_config_or_exit(f)
+
+
+def test_load_config_list_top_level_exits(tmp_path):
+    f = tmp_path / "c.yaml"
+    f.write_text("- item1\n- item2\n")
+    with pytest.raises(typer.Exit):
+        _load_config_or_exit(f)
+
+
+def test_load_config_empty_file_exits(tmp_path):
+    f = tmp_path / "c.yaml"
+    f.write_text("")
+    with pytest.raises(typer.Exit):
+        _load_config_or_exit(f)
+
+
+def test_load_config_scalar_top_level_exits(tmp_path):
+    f = tmp_path / "c.yaml"
+    f.write_text("just-a-string\n")
+    with pytest.raises(typer.Exit):
+        _load_config_or_exit(f)
+
+
+# ---------- _validate_deploy_config_or_exit ----------
+
+def test_validate_deploy_full_gcp():
+    cfg = {
+        "provider": {"name": "gcp", "project_id": "test-123"},
+        "deployment": {"type": "cloud_run"},
+    }
+    _validate_deploy_config_or_exit(cfg)
+
+
+def test_validate_deploy_full_aws_no_project_id_required():
+    cfg = {
+        "provider": {"name": "aws"},
+        "deployment": {"type": "eks"},
+    }
+    _validate_deploy_config_or_exit(cfg)
+
+
+def test_validate_deploy_missing_provider_exits():
+    with pytest.raises(typer.Exit):
+        _validate_deploy_config_or_exit({"deployment": {"type": "cloud_run"}})
+
+
+def test_validate_deploy_provider_not_a_dict_exits():
+    with pytest.raises(typer.Exit):
+        _validate_deploy_config_or_exit({"provider": "gcp", "deployment": {"type": "x"}})
+
+
+def test_validate_deploy_bad_provider_name_exits():
+    cfg = {"provider": {"name": "gpc"}, "deployment": {"type": "cloud_run"}}
+    with pytest.raises(typer.Exit):
+        _validate_deploy_config_or_exit(cfg)
+
+
+def test_validate_deploy_gcp_missing_project_id_exits():
+    cfg = {"provider": {"name": "gcp"}, "deployment": {"type": "cloud_run"}}
+    with pytest.raises(typer.Exit):
+        _validate_deploy_config_or_exit(cfg)
+
+
+def test_validate_deploy_missing_deployment_exits():
+    cfg = {"provider": {"name": "gcp", "project_id": "x"}}
+    with pytest.raises(typer.Exit):
+        _validate_deploy_config_or_exit(cfg)
+
+
+def test_validate_deploy_missing_deployment_type_exits():
+    cfg = {"provider": {"name": "gcp", "project_id": "x"}, "deployment": {}}
+    with pytest.raises(typer.Exit):
+        _validate_deploy_config_or_exit(cfg)
+
+
+# ---------- check_gcp_adc ----------
+
+@patch("deployml.utils.helpers.subprocess.run")
+def test_check_gcp_adc_returncode_zero_true(mock_run):
+    mock_run.return_value = MagicMock(returncode=0)
+    assert check_gcp_adc() is True
+
+
+@patch("deployml.utils.helpers.subprocess.run")
+def test_check_gcp_adc_returncode_nonzero_false(mock_run):
+    mock_run.return_value = MagicMock(returncode=1)
+    assert check_gcp_adc() is False
+
+
+@patch("deployml.utils.helpers.subprocess.run")
+def test_check_gcp_adc_exception_returns_false(mock_run):
+    mock_run.side_effect = OSError("boom")
+    assert check_gcp_adc() is False
+
+
+# ---------- check_bq ----------
+
+@patch("deployml.utils.helpers.shutil.which")
+def test_check_bq_binary_missing_false(mock_which):
+    mock_which.return_value = None
+    assert check_bq() is False
+
+
+@patch("deployml.utils.helpers.shutil.which")
+@patch("deployml.utils.helpers.subprocess.run")
+def test_check_bq_binary_present_and_runs(mock_run, mock_which):
+    mock_which.return_value = "/usr/bin/bq"
+    mock_run.return_value = MagicMock(returncode=0)
+    assert check_bq() is True
+
+
+@patch("deployml.utils.helpers.shutil.which")
+@patch("deployml.utils.helpers.subprocess.run")
+def test_check_bq_binary_present_but_errors(mock_run, mock_which):
+    mock_which.return_value = "/usr/bin/bq"
+    mock_run.return_value = MagicMock(returncode=2)
+    assert check_bq() is False
+
+
+# ---------- get_terraform_version ----------
+
+@patch("deployml.utils.helpers.shutil.which")
+def test_get_terraform_version_binary_missing(mock_which):
+    mock_which.return_value = None
+    assert get_terraform_version() is None
+
+
+@patch("deployml.utils.helpers.shutil.which")
+@patch("deployml.utils.helpers.subprocess.run")
+def test_get_terraform_version_parses_json(mock_run, mock_which):
+    mock_which.return_value = "/usr/bin/terraform"
+    mock_run.return_value = MagicMock(
+        returncode=0,
+        stdout='{"terraform_version": "1.15.0", "format_version": "1.2"}',
+    )
+    assert get_terraform_version() == (1, 15, 0)
+
+
+@patch("deployml.utils.helpers.shutil.which")
+@patch("deployml.utils.helpers.subprocess.run")
+def test_get_terraform_version_bad_json_returns_none(mock_run, mock_which):
+    mock_which.return_value = "/usr/bin/terraform"
+    mock_run.return_value = MagicMock(returncode=0, stdout="not json")
+    assert get_terraform_version() is None
+
+
+# ---------- validate_gcp_project ----------
+
+@patch("deployml.utils.helpers.subprocess.run")
+def test_validate_gcp_project_exists(mock_run):
+    mock_run.return_value = MagicMock(returncode=0, stdout="my-project\n")
+    assert validate_gcp_project("my-project") is True
+
+
+@patch("deployml.utils.helpers.subprocess.run")
+def test_validate_gcp_project_missing(mock_run):
+    mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="not found")
+    assert validate_gcp_project("ghost") is False
+
+
+@patch("deployml.utils.helpers.subprocess.run")
+def test_validate_gcp_project_stdout_mismatch(mock_run):
+    # returncode 0 but stdout does not match the project id we asked for
+    mock_run.return_value = MagicMock(returncode=0, stdout="other-project\n")
+    assert validate_gcp_project("my-project") is False
+
+
+# ---------- validate_gcp_region ----------
+
+@patch("deployml.utils.helpers.subprocess.run")
+def test_validate_gcp_region_in_list(mock_run):
+    helpers_mod._GCP_REGIONS_CACHE = None
+    mock_run.return_value = MagicMock(
+        returncode=0,
+        stdout="us-west1\nus-central1\neurope-west1\n",
+    )
+    assert validate_gcp_region("us-west1") is True
+
+
+@patch("deployml.utils.helpers.subprocess.run")
+def test_validate_gcp_region_not_in_list(mock_run):
+    helpers_mod._GCP_REGIONS_CACHE = None
+    mock_run.return_value = MagicMock(
+        returncode=0,
+        stdout="us-west1\nus-central1\n",
+    )
+    assert validate_gcp_region("mars-central1") is False
+
+
+@patch("deployml.utils.helpers.subprocess.run")
+def test_validate_gcp_region_lookup_failure_does_not_block(mock_run):
+    helpers_mod._GCP_REGIONS_CACHE = None
+    mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="oops")
+    assert validate_gcp_region("us-west1") is True
+
+
+# ---------- check_docker_daemon ----------
+
+@patch("deployml.utils.helpers.shutil.which")
+def test_check_docker_daemon_binary_missing(mock_which):
+    mock_which.return_value = None
+    assert check_docker_daemon() is False
+
+
+@patch("deployml.utils.helpers.shutil.which")
+@patch("deployml.utils.helpers.subprocess.run")
+def test_check_docker_daemon_up(mock_run, mock_which):
+    mock_which.return_value = "/usr/local/bin/docker"
+    mock_run.return_value = MagicMock(returncode=0)
+    assert check_docker_daemon() is True
+
+
+@patch("deployml.utils.helpers.shutil.which")
+@patch("deployml.utils.helpers.subprocess.run")
+def test_check_docker_daemon_down(mock_run, mock_which):
+    mock_which.return_value = "/usr/local/bin/docker"
+    mock_run.return_value = MagicMock(returncode=1, stderr="cannot connect")
+    assert check_docker_daemon() is False
+
+
+# ---------- get_missing_iam_roles ----------
+
+@patch("deployml.utils.helpers.subprocess.run")
+def test_get_missing_iam_roles_owner_short_circuits(mock_run):
+    mock_run.side_effect = [
+        MagicMock(returncode=0, stdout="me@example.com\n"),
+        MagicMock(
+            returncode=0,
+            stdout='{"bindings": [{"role": "roles/owner", "members": ["user:me@example.com"]}]}',
+        ),
+    ]
+    required = ["roles/cloudsql.admin", "roles/run.admin"]
+    assert get_missing_iam_roles("proj", required) == []
+
+
+@patch("deployml.utils.helpers.subprocess.run")
+def test_get_missing_iam_roles_some_missing(mock_run):
+    mock_run.side_effect = [
+        MagicMock(returncode=0, stdout="me@example.com\n"),
+        MagicMock(
+            returncode=0,
+            stdout='{"bindings": [{"role": "roles/cloudsql.admin", "members": ["user:me@example.com"]}]}',
+        ),
+    ]
+    required = ["roles/cloudsql.admin", "roles/run.admin"]
+    assert get_missing_iam_roles("proj", required) == ["roles/run.admin"]
+
+
+@patch("deployml.utils.helpers.subprocess.run")
+def test_get_missing_iam_roles_account_lookup_fails_returns_all(mock_run):
+    mock_run.return_value = MagicMock(returncode=0, stdout="")
+    assert get_missing_iam_roles("proj", ["roles/run.admin"]) == ["roles/run.admin"]
+
+
+@patch("deployml.utils.helpers.subprocess.run")
+def test_get_missing_iam_roles_policy_query_fails_returns_all(mock_run):
+    mock_run.side_effect = [
+        MagicMock(returncode=0, stdout="me@example.com\n"),
+        MagicMock(returncode=1, stdout="", stderr="permission denied"),
+    ]
+    required = ["roles/run.admin", "roles/storage.admin"]
+    assert get_missing_iam_roles("proj", required) == required
+
+
+# ---------- teardown cron timezone correctness ----------
+
+def test_calculate_cron_from_timestamp_round_trip():
+    """The cron string must reflect the UTC time of the timestamp, with no
+    TZ skew. Regression for the datetime.utcnow() bug that offset teardown
+    schedules by the local TZ offset."""
+    from datetime import datetime, timezone, timedelta
+    from deployml.utils.teardown import calculate_cron_from_timestamp
+
+    now = datetime.now(timezone.utc)
+    later = now + timedelta(hours=2)
+    cron = calculate_cron_from_timestamp(int(later.timestamp()))
+    expected = f"{later.minute} {later.hour} {later.day} {later.month} *"
+    assert cron == expected
+
+
+def test_deploy_path_timestamp_uses_timezone_aware_now():
+    """The deploy path used datetime.utcnow() which silently corrupts
+    .timestamp() by the local TZ offset. After the fix it uses
+    datetime.now(timezone.utc). This test simulates the exact computation
+    and confirms the cron lines up with what the user is told."""
+    from datetime import datetime, timezone, timedelta
+    from deployml.utils.teardown import calculate_cron_from_timestamp
+
+    deployed_at = datetime.now(timezone.utc)
+    teardown_at = deployed_at + timedelta(hours=24)
+    cron = calculate_cron_from_timestamp(int(teardown_at.timestamp()))
+
+    # Parse cron back. Expect minute, hour, day, month, *
+    parts = cron.split()
+    assert parts[0] == str(teardown_at.minute)
+    assert parts[1] == str(teardown_at.hour)
+    assert parts[2] == str(teardown_at.day)
+    assert parts[3] == str(teardown_at.month)
+    assert parts[4] == "*"

From 83ec86de450dabc77e16d2c45f80c9d4a1519bf1 Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sat, 30 May 2026 18:21:24 -0700
Subject: [PATCH 11/31] fix #54: preflight gcloud auth and ADC in deploy

deploy only checked 'gcloud auth list', so a user who was logged in but had not run application-default login passed the check and then failed opaquely at terraform apply with 'default credentials not found'. Add a GCP credentials preflight, mirroring init and doctor, that verifies both auth and ADC before any deploy work and exits with actionable guidance. Covers the Cloud Run and GKE deploy paths. Includes unit tests for the preflight.
---
 src/deployml/cli/cli.py | 34 +++++++++++++++++++++++++++-------
 tests/test_helpers.py   | 30 +++++++++++++++++++++++++++++-
 2 files changed, 56 insertions(+), 8 deletions(-)

diff --git a/src/deployml/cli/cli.py b/src/deployml/cli/cli.py
index 46da55b..f5145d6 100644
--- a/src/deployml/cli/cli.py
+++ b/src/deployml/cli/cli.py
@@ -507,6 +507,25 @@ def _validate_deploy_config_or_exit(config: dict) -> None:
         raise typer.Exit(code=1)
 
 
+def _gcp_credentials_preflight_or_exit() -> None:
+    """Verify gcloud auth and Application Default Credentials before any GCP deploy
+    work. ADC backs the Terraform google provider, so without it deploy fails
+    opaquely at apply with 'default credentials not found' (issue #54)."""
+    if not check_gcp_auth():
+        typer.secho(
+            " gcloud is not authenticated. Run: gcloud auth login",
+            fg=typer.colors.RED,
+        )
+        raise typer.Exit(code=1)
+    if not check_gcp_adc():
+        typer.secho(
+            " Application Default Credentials are missing. "
+            "Run: gcloud auth application-default login",
+            fg=typer.colors.RED,
+        )
+        raise typer.Exit(code=1)
+
+
 def get_version():
     """Get version from package metadata"""
     try:
@@ -949,6 +968,13 @@ def deploy(
     if cloud == "gcp" and not validate_gcp_region(region, project_id):
         typer.secho(f" Region '{region}' is not valid for GCP. Run: gcloud compute regions list", fg=typer.colors.RED)
         raise typer.Exit(code=1)
+
+    # ADC backs the Terraform google provider, so deploy must verify auth and ADC
+    # the way init and doctor do. Without this a logged-in user with no ADC passes
+    # the auth check and then fails opaquely at terraform apply (issue #54).
+    if cloud == "gcp":
+        _gcp_credentials_preflight_or_exit()
+
     deployment_type = config["deployment"]["type"]
     stack = config["stack"]
 
@@ -1284,15 +1310,9 @@ def deploy(
     (DEPLOYML_TERRAFORM_DIR / "terraform.tfvars").write_text(tfvars_content)
 
     # Deploy. Falls back to workspace_name when config has no top-level 'name'.
+    # Auth and ADC were already preflighted above, so just point gcloud at the project.
     typer.echo(f" Deploying {config.get('name', workspace_name)} to {cloud}...")
 
-    if not check_gcp_auth():
-        typer.echo(" Authenticating with GCP...")
-        subprocess.run(
-            ["gcloud", "auth", "application-default", "login"],
-            cwd=DEPLOYML_TERRAFORM_DIR,
-        )
-
     subprocess.run(
         ["gcloud", "config", "set", "project", project_id],
         cwd=DEPLOYML_TERRAFORM_DIR,
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index 33ec6cf..5947430 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -5,7 +5,11 @@
 import typer
 
 import deployml.utils.helpers as helpers_mod
-from deployml.cli.cli import _load_config_or_exit, _validate_deploy_config_or_exit
+from deployml.cli.cli import (
+    _load_config_or_exit,
+    _validate_deploy_config_or_exit,
+    _gcp_credentials_preflight_or_exit,
+)
 from deployml.utils.helpers import (
     check_gcp_adc,
     check_bq,
@@ -105,6 +109,30 @@ def test_validate_deploy_missing_deployment_type_exits():
         _validate_deploy_config_or_exit(cfg)
 
 
+# ---------- _gcp_credentials_preflight_or_exit (#54) ----------
+
+@patch("deployml.cli.cli.check_gcp_adc", return_value=True)
+@patch("deployml.cli.cli.check_gcp_auth", return_value=True)
+def test_gcp_preflight_passes_with_auth_and_adc(mock_auth, mock_adc):
+    # Both present: no exit.
+    _gcp_credentials_preflight_or_exit()
+
+
+@patch("deployml.cli.cli.check_gcp_adc", return_value=True)
+@patch("deployml.cli.cli.check_gcp_auth", return_value=False)
+def test_gcp_preflight_exits_when_not_authenticated(mock_auth, mock_adc):
+    with pytest.raises(typer.Exit):
+        _gcp_credentials_preflight_or_exit()
+
+
+@patch("deployml.cli.cli.check_gcp_adc", return_value=False)
+@patch("deployml.cli.cli.check_gcp_auth", return_value=True)
+def test_gcp_preflight_exits_when_adc_missing(mock_auth, mock_adc):
+    # Issue #54: logged in but no ADC must fail at preflight, not at terraform apply.
+    with pytest.raises(typer.Exit):
+        _gcp_credentials_preflight_or_exit()
+
+
 # ---------- check_gcp_adc ----------
 
 @patch("deployml.utils.helpers.subprocess.run")

From 44cf09a61afcdcca002cbb635064eb77bf96c85c Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sat, 30 May 2026 18:22:24 -0700
Subject: [PATCH 12/31] fix #53: validate config.stack shape to avoid
 set/non-dict crash

A stack tool value parsed as a set or any non-mapping crashed the deploy loop with 'set' object has no attribute 'get'. Validate in _validate_deploy_config_or_exit that stack is a list, each stage is a mapping, and each tool is a mapping, exiting with an actionable message that points at a stray !!set tag. Validation runs only when stack is present so existing configs are unaffected. Includes unit tests for set, non-list, and non-dict stage cases.
---
 src/deployml/cli/cli.py | 30 ++++++++++++++++++++++++
 tests/test_helpers.py   | 52 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

diff --git a/src/deployml/cli/cli.py b/src/deployml/cli/cli.py
index f5145d6..0f08c6a 100644
--- a/src/deployml/cli/cli.py
+++ b/src/deployml/cli/cli.py
@@ -506,6 +506,36 @@ def _validate_deploy_config_or_exit(config: dict) -> None:
         typer.secho(" Config is missing required field: deployment.type.", fg=typer.colors.RED)
         raise typer.Exit(code=1)
 
+    # Validate the stack shape so a malformed entry fails here with a clear message
+    # instead of crashing later with "'set' object has no attribute 'get'" when the
+    # deploy loop calls tool.get(...) (issue #53). A value tagged !!set, such as
+    # !!set {a, b}, parses as a Python set, not the mapping the stack expects.
+    stack = config.get("stack")
+    if stack is not None:
+        if not isinstance(stack, list):
+            typer.secho(
+                f" config.stack must be a list of stage mappings, got {type(stack).__name__}.",
+                fg=typer.colors.RED,
+            )
+            raise typer.Exit(code=1)
+        for i, stage in enumerate(stack):
+            if not isinstance(stage, dict):
+                typer.secho(
+                    f" config.stack[{i}] must be a mapping of stage name to tool config, "
+                    f"got {type(stage).__name__}.",
+                    fg=typer.colors.RED,
+                )
+                raise typer.Exit(code=1)
+            for stage_name, tool in stage.items():
+                if not isinstance(tool, dict):
+                    typer.secho(
+                        f" config.stack[{i}].{stage_name} must be a mapping with at least a "
+                        f"'name', got {type(tool).__name__}. Check for a stray !!set tag or a "
+                        f"list where a mapping is expected.",
+                        fg=typer.colors.RED,
+                    )
+                    raise typer.Exit(code=1)
+
 
 def _gcp_credentials_preflight_or_exit() -> None:
     """Verify gcloud auth and Application Default Credentials before any GCP deploy
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index 5947430..6b2ffb7 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -109,6 +109,58 @@ def test_validate_deploy_missing_deployment_type_exits():
         _validate_deploy_config_or_exit(cfg)
 
 
+# ---------- stack validation (#53) ----------
+
+def test_validate_deploy_valid_stack_ok():
+    cfg = {
+        "provider": {"name": "gcp", "project_id": "x"},
+        "deployment": {"type": "cloud_run"},
+        "stack": [
+            {"experiment_tracking": {"name": "mlflow", "params": {}}},
+            {"model_serving": {"name": "fastapi"}},
+        ],
+    }
+    _validate_deploy_config_or_exit(cfg)
+
+
+def test_validate_deploy_stack_tool_as_set_exits():
+    # Issue #53: a tool value parsed as a set crashed later with
+    # "'set' object has no attribute 'get'". It must exit cleanly here.
+    cfg = {
+        "provider": {"name": "gcp", "project_id": "x"},
+        "deployment": {"type": "cloud_run"},
+        "stack": [{"experiment_tracking": {"mlflow", "params"}}],
+    }
+    with pytest.raises(typer.Exit):
+        _validate_deploy_config_or_exit(cfg)
+
+
+def test_validate_deploy_stack_not_a_list_exits():
+    cfg = {
+        "provider": {"name": "gcp", "project_id": "x"},
+        "deployment": {"type": "cloud_run"},
+        "stack": {"experiment_tracking": {"name": "mlflow"}},
+    }
+    with pytest.raises(typer.Exit):
+        _validate_deploy_config_or_exit(cfg)
+
+
+def test_validate_deploy_stage_not_a_dict_exits():
+    cfg = {
+        "provider": {"name": "gcp", "project_id": "x"},
+        "deployment": {"type": "cloud_run"},
+        "stack": ["experiment_tracking"],
+    }
+    with pytest.raises(typer.Exit):
+        _validate_deploy_config_or_exit(cfg)
+
+
+def test_validate_deploy_no_stack_key_still_ok():
+    # Stack is validated only when present, so configs without it still pass.
+    cfg = {"provider": {"name": "gcp", "project_id": "x"}, "deployment": {"type": "cloud_run"}}
+    _validate_deploy_config_or_exit(cfg)
+
+
 # ---------- _gcp_credentials_preflight_or_exit (#54) ----------
 
 @patch("deployml.cli.cli.check_gcp_adc", return_value=True)

From 5ce4830145e5d847b68faf9373aa45f24d41aaf3 Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sat, 30 May 2026 18:41:52 -0700
Subject: [PATCH 13/31] docs: document new CLI flags and teardown behaviors

Document build-images --platform, the deploy auth/ADC preflight and config-shape validation, destroy removing the Cloud Build staging bucket, generate --force, gke-cluster-create, the --namespace flag on the minikube and GKE deploy commands, MLflow PVC persistence on minikube and GKE, and gke-destroy cleaning the PVC and gcr.io image with --keep-images to opt out. Fixes the stale note that deployml does not manage GKE clusters.
---
 docs/api/cli-commands.md        | 18 +++++++++++-------
 docs/tutorials/gcp-cloud-run.md |  6 +++---
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/docs/api/cli-commands.md b/docs/api/cli-commands.md
index 6eeddd5..5789780 100644
--- a/docs/api/cli-commands.md
+++ b/docs/api/cli-commands.md
@@ -53,8 +53,9 @@ deployml build-images --create-repo
 - `--tag`, `-t`: Image tag. Default reads `config.provider.image_tag`, falls back to `v{deployml_version}`.
 - `--create-repo`: Create the Artifact Registry repository on first run.
 - `--dry-run`: Print commands without executing.
+- `--platform`: Local build platform. Defaults to the host architecture so images run on a local minikube node, including arm64 Macs. Pass `linux/amd64` only when building locally for a manual amd64 push. Ignored in GCP Cloud Build mode.
 
-Builds run on Cloud Build, so a local Docker daemon is not required for GCP mode.
+Builds run on Cloud Build, so a local Docker daemon is not required for GCP mode. In local mode a daemon probe runs first and the build targets the host architecture.
 
 ---
 
@@ -71,7 +72,9 @@ deployml deploy --verbose --yes
 - `--config-path`, `-c`: Path to config YAML. Default `config.yaml`.
 - `--verbose`, `-v`: Stream Terraform logs to stdout. Without this you get a progress bar.
 - `--yes`, `-y`: Skip the `[y/N]` deploy confirmation. Required for non-interactive scripts.
-- `--generate-only`, `-g`: Only generate manifests, no apply. For GKE flow.
+- `--generate-only`, `-g`: For the GKE flow, render the Kubernetes manifests without applying them, so you can review and then apply with `gke-apply`.
+
+Before any cloud work, deploy validates the config shape and, for GCP, preflights both gcloud auth and Application Default Credentials. A missing ADC fails fast with `gcloud auth application-default login` guidance instead of an opaque "default credentials not found" at apply. A malformed `stack` entry also fails here with a clear message rather than crashing mid-deploy.
 
 First-time deploy takes about 20 minutes because Cloud SQL Postgres provisioning is slow.
 
@@ -95,7 +98,7 @@ deployml get-urls --show-secrets
 
 ## `deployml destroy`
 
-Tear down all infrastructure for a given config. Also removes the Artifact Registry repo created by build-images.
+Tear down all infrastructure for a given config. Also removes the Artifact Registry repo and the Cloud Build staging bucket created by build-images, so a destroyed project leaves no billing residue.
 
 ```bash
 deployml destroy --yes
@@ -138,13 +141,14 @@ Supported stage names: `experiment_tracking`, `artifact_tracking`, `model_regist
 
 These exist in the CLI but are not part of the documented happy path. Use at your own risk and inspect the source.
 
-- `deployml generate`: Interactive YAML generator. Less useful since `init` now writes a runnable config.
+- `deployml generate`: Interactive YAML generator. Less useful since `init` now writes a runnable config. Pass `--force`, `-f` to overwrite an existing `config.yaml` without the confirm prompt.
 - `deployml status`: Stub. Reports deployment status of current workspace.
 - `deployml terraform`: Run raw terraform actions (plan, apply, destroy) on a rendered workspace.
 - `deployml teardown`: Manage scheduled auto-teardown jobs.
 - `deployml vm`: Placeholder for VM deployment.
-- `deployml mlflow-init`, `deployml mlflow-deploy`: MLflow-only flows.
-- `deployml minikube-init`, `deployml minikube-deploy`: Local Kubernetes flow for testing without GCP.
-- `deployml gke-init`, `deployml gke-deploy`, `deployml gke-apply`, `deployml gke-destroy`: GKE deployment path. `gke-destroy` removes the deployed manifests and, with `--delete-cluster`, the cluster itself. See the [GKE flow notes](../tutorials/gcp-cloud-run.md#gke-flow-notes) in the tutorial.
+- `deployml mlflow-init`, `deployml mlflow-deploy`: MLflow-only minikube flow. `mlflow-init` provisions a PersistentVolumeClaim by default (`--persistent-storage`) so the SQLite backend and artifacts survive pod restarts; use `--pvc-size` to size it or `--ephemeral-storage` to opt out. `mlflow-deploy` takes `--namespace`, `-n` to isolate the stack.
+- `deployml minikube-init`, `deployml minikube-deploy`: Local Kubernetes flow for testing without GCP. `minikube-deploy` takes `--namespace`, `-n`. Build local images with `build-images` in local mode, which targets the host architecture so they run on the minikube node.
+- `deployml gke-cluster-create`: Create a GKE cluster. Autopilot by default with `--region`; pass `--standard` for a small zonal cluster.
+- `deployml gke-init`, `deployml gke-deploy`, `deployml gke-apply`, `deployml gke-destroy`: GKE deployment path. MLflow on GKE provisions a PersistentVolumeClaim by default so experiment data survives pod restarts. Deploy commands take `--namespace`, `-n`, and MLflow and FastAPI must share a namespace for in-cluster service DNS to resolve. `gke-destroy` removes the deployed manifests including the PVC and the referenced `gcr.io` image so nothing keeps billing, with `--keep-images` to retain images for a quick redeploy and `--delete-cluster` to remove the cluster. See the [GKE flow notes](../tutorials/gcp-cloud-run.md#gke-flow-notes) in the tutorial.
 
 The fully supported and tested path is GCP Cloud Run via `init`, `build-images`, `deploy`, `get-urls`, `destroy`.
diff --git a/docs/tutorials/gcp-cloud-run.md b/docs/tutorials/gcp-cloud-run.md
index 5f482fd..71f6bca 100644
--- a/docs/tutorials/gcp-cloud-run.md
+++ b/docs/tutorials/gcp-cloud-run.md
@@ -261,12 +261,12 @@ set PATH=C:\Users\%USERNAME%\AppData\Local\Google\Cloud SDK\google-cloud-sdk\bin
 
 Adjust the path if your gcloud install is elsewhere.
 
-**3. You manage the cluster yourself.**
+**3. Cluster lifecycle and persistence.**
 
-`deployml` does not create or delete GKE clusters for you. Use `gcloud container clusters create-auto` for Autopilot or `gcloud container clusters create` for standard. To tear down:
+Create a cluster with `deployml gke-cluster-create --cluster NAME --project YOUR_GCP_PROJECT_ID --region us-west1`, which uses Autopilot by default, or `--standard` for a small zonal cluster. You can also use `gcloud container clusters create-auto` directly. MLflow on GKE provisions a PersistentVolumeClaim by default, so experiment data survives pod restarts. MLflow and FastAPI must share a namespace for in-cluster service DNS to resolve, so pass the same `--namespace` to both deploys if you do not use the default namespace. To tear down:
 
 ```bash
 deployml gke-destroy --manifest-dir manifests --cluster gke-test --project YOUR_GCP_PROJECT_ID --region us-west1 --delete-cluster
 ```
 
-`--delete-cluster` also removes the cluster. Omit it to only delete the deployed manifests.
+`gke-destroy` deletes the manifests, the MLflow PVC so its PersistentDisk is reclaimed, and the referenced gcr.io image so nothing keeps billing. `--delete-cluster` also removes the cluster. `--keep-images` retains the image for a quick redeploy. Pass `--namespace` if you deployed into a non-default namespace.

From 1eefb2f8fa93f82af46815a7c69a8fa11aa8d304 Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sat, 30 May 2026 19:56:39 -0700
Subject: [PATCH 14/31] docs: correct destroy note in README and document the
 Kubernetes paths

destroy now removes the Artifact Registry repo and the Cloud Build staging bucket, so the old 'does not delete Artifact Registry images' line was inaccurate. Correct it and add an Other deployment targets section pointing to the local minikube and GKE flows.
---
 README.md | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9f9b60c..94a9d76 100644
--- a/README.md
+++ b/README.md
@@ -78,12 +78,21 @@ See [example/README.md](example/README.md) for details.
 deployml destroy
 ```
 
-Deletes all Cloud Run services, Cloud SQL, GCS bucket, and BigQuery dataset. Does not delete Artifact Registry images or the GCP project.
+Deletes all Cloud Run services, Cloud SQL, the GCS bucket, and the BigQuery dataset, and also removes the Artifact Registry repo and the Cloud Build staging bucket that `build-images` created, so a destroyed project leaves no billing residue. Does not delete the GCP project itself.
 
 ## Full Tutorial
 
 See [docs/tutorials/gcp-cloud-run.md](docs/tutorials/gcp-cloud-run.md) for a step-by-step walkthrough.
 
+## Other deployment targets
+
+Cloud Run is the primary, fully supported path. The CLI also supports Kubernetes for users who want a cluster:
+
+- **Local minikube**, for testing without GCP: `mlflow-init` and `mlflow-deploy`, or `minikube-init` and `minikube-deploy`.
+- **GKE** on GCP: `gke-cluster-create`, `gke-init`, then `gke-deploy` or `gke-apply`, torn down with `gke-destroy`.
+
+MLflow keeps its data on a PersistentVolumeClaim in both, so experiments survive pod restarts. See [CLI Commands](docs/api/cli-commands.md) and the [GKE flow notes](docs/tutorials/gcp-cloud-run.md#gke-flow-notes).
+
 ## Requirements
 
 - Python 3.11 or newer

From be6a5978290e276e23ff9ebd45223127777af0cc Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sat, 30 May 2026 19:56:39 -0700
Subject: [PATCH 15/31] docs: surface minikube and GKE across the site overview
 pages

api/overview lists the Kubernetes and GKE commands, features/overview documents gke and minikube as deployment targets selected by deployment.type, tutorials/overview adds a Kubernetes section, and costs notes the GKE PersistentDisk and ties the SQLite tip to the minikube and GKE paths. Brings the architecture and reference pages in line with the now-real Kubernetes functionality.
---
 docs/api/overview.md       | 15 ++++++++++++++-
 docs/features/costs.md     |  4 ++--
 docs/features/overview.md  | 10 ++++++++++
 docs/tutorials/overview.md |  6 ++++++
 4 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/docs/api/overview.md b/docs/api/overview.md
index ca346b7..7829ea9 100644
--- a/docs/api/overview.md
+++ b/docs/api/overview.md
@@ -13,4 +13,17 @@ deployml provides a command-line interface (CLI) for deploying and managing MLOp
 | `deployml get-urls` | Print service URLs and write `.env` file |
 | `deployml destroy` | Tear down all infrastructure |
 
-See [CLI Commands](cli-commands.md) for full usage details.
+### Kubernetes commands
+
+For the optional Kubernetes paths (local minikube and GKE):
+
+| Command | Description |
+|---|---|
+| `deployml minikube-init` / `minikube-deploy` | Generate and deploy FastAPI manifests to a local minikube cluster |
+| `deployml mlflow-init` / `mlflow-deploy` | Generate and deploy MLflow to minikube, with a PersistentVolumeClaim for data |
+| `deployml gke-cluster-create` | Create a GKE cluster, Autopilot by default |
+| `deployml gke-init` | Generate Kubernetes manifests for GKE |
+| `deployml gke-deploy` / `gke-apply` | Apply manifests to a GKE cluster |
+| `deployml gke-destroy` | Remove manifests, the PVC, and the gcr.io image, and optionally the cluster |
+
+See [CLI Commands](cli-commands.md) for full usage details and flags.
diff --git a/docs/features/costs.md b/docs/features/costs.md
index db732ec..f25f481 100644
--- a/docs/features/costs.md
+++ b/docs/features/costs.md
@@ -33,7 +33,7 @@ Here are estimated typical costs for several **GCP** services, but please do not
 - Google Cloud Storage costs approximately $0.020 per GB per month.   
 - BigQuery storage costs $0.020 per GB per month with query costs based on data scanned.  
 - Cloud VMs cost approximately $25 per month for medium instances.   
-- GKE clusters have no management fee, but you pay for VM instances and load balancers. Note that the GKE can get very expensive very quickly.
+- GKE clusters have no management fee, but you pay for VM instances and load balancers. MLflow on GKE also provisions a small PersistentDisk for its data, which costs a few cents per GB-month. Note that GKE can get expensive quickly.
 
 
 
@@ -41,6 +41,6 @@ Here are estimated typical costs for several **GCP** services, but please do not
 
 Here are some tips to keep the costs low while you are learning:  
 
-- Use SQLite instead of Cloud SQL whenever possible, particularly for development purposes and when your data is small.  
+- Use SQLite instead of Cloud SQL whenever possible, particularly for development purposes and when your data is small. The minikube and GKE MLflow paths already do this (SQLite on a PersistentVolumeClaim), so they avoid the always-on Cloud SQL cost.  
 - Enable auto-teardown to prevent forgotten deployments. 
 - Use Cloud Run for variable workloads to take advantage of scale-to-zero pricing.
\ No newline at end of file
diff --git a/docs/features/overview.md b/docs/features/overview.md
index 0e1d699..c2eeb8e 100644
--- a/docs/features/overview.md
+++ b/docs/features/overview.md
@@ -6,6 +6,16 @@ deployml is a Python library that deploys a complete MLOps infrastructure in GCP
 
 You define your stack in a YAML config file, run `deployml deploy`, and Terraform provisions everything in GCP. When you're done, `deployml destroy` tears it all down cleanly.
 
+## Deployment targets
+
+Cloud Run is the primary, fully supported target and is what the rest of this page describes. The same MLflow and FastAPI stack can also run on Kubernetes, selected by `deployment.type` in your config:
+
+- `cloud_run` — serverless on GCP Cloud Run, the default.
+- `gke` — a Google Kubernetes Engine cluster, where MLflow gets a PersistentVolumeClaim so experiment data survives pod restarts.
+- Local **minikube**, for testing without GCP, via the `minikube-*` and `mlflow-*` commands.
+
+See [CLI Commands](../api/cli-commands.md) and the [GKE flow notes](../tutorials/gcp-cloud-run.md#gke-flow-notes) for the Kubernetes paths.
+
 ## What gets deployed
 
 ### Experiment Tracking, Artifact Storage, and Model Registry — MLflow
diff --git a/docs/tutorials/overview.md b/docs/tutorials/overview.md
index f061b76..6391efb 100644
--- a/docs/tutorials/overview.md
+++ b/docs/tutorials/overview.md
@@ -14,6 +14,12 @@ Once deployed, walk through a complete MLOps workflow using a synthetic housing
 
 **[End-to-End Example →](example.md)**
 
+## Kubernetes: local minikube and GKE
+
+Prefer a Kubernetes cluster over Cloud Run? deployml can run the MLflow and FastAPI stack on a local minikube cluster for offline testing, or on GKE. MLflow keeps its data on a PersistentVolumeClaim so experiments survive pod restarts. The commands are `minikube-init` / `minikube-deploy`, `mlflow-init` / `mlflow-deploy`, and `gke-cluster-create`, `gke-init`, `gke-deploy` / `gke-apply`, `gke-destroy`.
+
+**[GKE flow notes →](gcp-cloud-run.md#gke-flow-notes)** and the [CLI Commands reference](../api/cli-commands.md).
+
 ## Quick Reference
 
 ```bash

From cdcaf9f3260745c35046bde631088e8bdf232333 Mon Sep 17 00:00:00 2001
From: jivanb7 <bal.jivan1@gmail.com>
Date: Sun, 31 May 2026 14:51:26 -0700
Subject: [PATCH 16/31] add platform_compat module for cross platform tool
 execution

Centralize OS awareness so CLI commands stay identical across Windows, macOS, and Linux. resolve_tool returns an executable tool path and prefers a real wrapper over the extensionless launcher scripts that ship beside gcloud.cmd, bq.cmd, gsutil.cmd, and docker.exe on Windows. run_tool launches the resolved path and falls back to the command interpreter only if a direct .cmd launch raises OSError, so batch wrappers work on any Windows build. configure_console_encoding forces UTF-8 with replacement to stop cp1252 UnicodeEncodeError crashes. robust_rmtree clears the read only bit and retries, supporting both the onerror and onexc rmtree callbacks.
---
 src/deployml/utils/platform_compat.py | 129 ++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100644 src/deployml/utils/platform_compat.py

diff --git a/src/deployml/utils/platform_compat.py b/src/deployml/utils/platform_compat.py
new file mode 100644
index 0000000..96a425c
--- /dev/null
+++ b/src/deployml/utils/platform_compat.py
@@ -0,0 +1,129 @@
+"""Cross platform helpers so the deployml CLI behaves identically on Windows,
+macOS, and Linux.
+
+All operating system awareness lives here. Command modules call run_tool instead
+of subprocess.run for external tools, call configure_console_encoding once at
+startup, and use robust_rmtree for workspace cleanup. This keeps every CLI command
+the same across operating systems while the engine adapts underneath.
+
+The Windows problems this module solves:
+
+- gcloud, bq, and gsutil ship as .cmd batch wrappers, not .exe files, so
+  subprocess.run with a bare name fails with FileNotFoundError, WinError 2.
+  resolve_tool finds the real wrapper and run_tool launches it.
+- The gcloud SDK and Docker also ship an extensionless launcher script beside the
+  real wrapper. subprocess cannot execute that script, so resolve_tool prefers an
+  executable extension and returns the bare launcher only if no wrapper exists.
+- The default console code page is cp1252, so printing emoji or box glyphs raises
+  UnicodeEncodeError. configure_console_encoding forces UTF-8 with replacement.
+- Workspace cleanup hits read only files and transient locks, so a plain rmtree
+  raises PermissionError. robust_rmtree clears the read only bit and retries.
+"""
+
+import os
+import shutil
+import stat
+import subprocess
+import sys
+import time
+
+IS_WINDOWS = os.name == "nt"
+
+# Extensions Windows treats as directly executable. resolve_tool uses these to
+# reject the extensionless launcher scripts that ship beside gcloud.cmd, bq.cmd,
+# gsutil.cmd, and docker.exe.
+_WINDOWS_EXEC_EXTS = (".exe", ".cmd", ".bat", ".com")
+
+
+def resolve_tool(name: str) -> str:
+    """Return the shutil.which path for a tool, preferring a real wrapper on Windows.
+
+    On Windows shutil.which can return an extensionless launcher script, such as
+    the gcloud bash script next to gcloud.cmd, that subprocess cannot launch. When
+    the first match has no extension from _WINDOWS_EXEC_EXTS, the .cmd, .exe, and
+    .bat wrappers are tried in order; if none is found the first match is returned.
+
+    Raises FileNotFoundError with an actionable message if the tool is missing.
+    """
+    path = shutil.which(name)
+    if path is None:
+        raise FileNotFoundError(
+            f"Required tool '{name}' was not found on PATH. "
+            f"Install it and reopen your shell, then retry."
+        )
+    if IS_WINDOWS and os.path.splitext(path)[1].lower() not in _WINDOWS_EXEC_EXTS:
+        for ext in (".cmd", ".exe", ".bat"):
+            candidate = shutil.which(name + ext)
+            if candidate:
+                return candidate
+    return path  # already launchable, or no wrapper was found on Windows
+
+
+def run_tool(name: str, args: list, **kwargs) -> subprocess.CompletedProcess:
+    """Run an external command the same way on every operating system.
+
+    Resolves name with resolve_tool, then calls subprocess.run on the resolved path
+    followed by args, the list you would have passed after the tool name. Every
+    keyword argument is forwarded to subprocess.run unchanged and its
+    CompletedProcess is returned. Raises FileNotFoundError when the tool is not on
+    PATH; any other OSError propagates unless the Windows batch fallback applies.
+    """
+    resolved = resolve_tool(name)
+    try:
+        return subprocess.run([resolved, *args], **kwargs)
+    except OSError:
+        # Some Windows Python builds cannot launch a .cmd or .bat directly and
+        # raise OSError, WinError 193, before the child process starts, so retrying
+        # through COMSPEC cannot duplicate side effects. NOTE(review): every OSError
+        # from a batch wrapper is retried here, not only WinError 193.
+        if IS_WINDOWS and resolved.lower().endswith((".cmd", ".bat")):
+            comspec = os.environ.get("COMSPEC", "cmd.exe")
+            return subprocess.run([comspec, "/c", resolved, *args], **kwargs)
+        raise
+
+
+def configure_console_encoding() -> None:
+    """Force UTF-8 on Windows stdout and stderr so non ASCII output never crashes.
+
+    Emoji and box drawing glyphs in CLI messages raise UnicodeEncodeError on a
+    legacy cp1252 console. Each stream that has a reconfigure method is switched
+    to encoding utf-8 with errors set to replace, so an unprintable glyph degrades
+    to a placeholder instead of crashing. Returns immediately when not on Windows.
+    """
+    if not IS_WINDOWS:
+        return
+    for stream in (sys.stdout, sys.stderr):
+        reconfigure = getattr(stream, "reconfigure", None)
+        if reconfigure is None:
+            # stdout or stderr was replaced by a plain object, for example under a
+            # test harness, so there is nothing to reconfigure.
+            continue
+        try:
+            reconfigure(encoding="utf-8", errors="replace")
+        except (ValueError, OSError):
+            pass  # best effort: leave the stream's current encoding in place
+
+
+def robust_rmtree(path) -> None:
+    """Remove a directory tree, surviving Windows read only files and brief locks.
+
+    Windows marks some files read only, which shutil.rmtree does not clear, and a
+    sync client or scanner can hold a file open for a moment, so a plain rmtree
+    raises PermissionError. The handler clears the read only bit and retries, then
+    sleeps 0.2 s and retries once more; a second failure propagates to the caller.
+    """
+    def _handle(func, target, _exc):  # serves both onexc and onerror; _exc unused
+        try:
+            os.chmod(target, stat.S_IWRITE)
+            func(target)
+        except Exception:  # still locked or read only: wait briefly, try once more
+            time.sleep(0.2)
+            func(target)
+
+    if not os.path.exists(path):
+        return
+    # onexc replaced onerror in Python 3.12; pick by version, not by TypeError.
+    if sys.version_info >= (3, 12):
+        shutil.rmtree(path, onexc=_handle)
+    else:
+        shutil.rmtree(path, onerror=_handle)

From 1fd28a6cb1025841b893159ebc2fc46ef38c0e6e Mon Sep 17 00:00:00 2001
From: jivanb7 <bal.jivan1@gmail.com>
Date: Sun, 31 May 2026 15:15:46 -0700
Subject: [PATCH 17/31] route external tool calls through run_tool (blocker 1)

On Windows gcloud, bq, and gsutil are .cmd batch wrappers that subprocess cannot launch by bare name, failing with WinError 2. Route every external tool invocation, gcloud, bq, gsutil, terraform, docker, kubectl, minikube, infracost, git, through run_tool from platform_compat, which resolves the real executable path and works for both .cmd wrappers and .exe binaries. Streaming Popen call sites resolve the tool with resolve_tool and keep Popen. Remove now-dead subprocess imports where only run calls remained. Behavior is preserved on macOS and Linux because resolve_tool returns the same path a bare name resolves to, and all subprocess kwargs pass through unchanged. Update the helpers unit tests to patch run_tool instead of subprocess.run to match the new call path; they assert only on return values, so coverage is unchanged.
---
 src/deployml/api.py                    |  25 ++--
 src/deployml/cli/cli.py                | 195 +++++++++++++------------
 src/deployml/diagnostics/doctor.py     |  20 +--
 src/deployml/notebook/docker.py        |  25 ++--
 src/deployml/notebook/stack.py         |  18 +--
 src/deployml/utils/helpers.py          |  53 ++++---
 src/deployml/utils/infracost.py        |  10 +-
 src/deployml/utils/kubernetes_gke.py   |  43 +++---
 src/deployml/utils/kubernetes_local.py |  81 +++++-----
 tests/test_helpers.py                  |  38 ++---
 10 files changed, 261 insertions(+), 247 deletions(-)

diff --git a/src/deployml/api.py b/src/deployml/api.py
index 6d322ed..7cd6d59 100644
--- a/src/deployml/api.py
+++ b/src/deployml/api.py
@@ -29,11 +29,11 @@
     )
 """
 import json
-import subprocess
 from pathlib import Path
 from datetime import datetime, timedelta, timezone
 from typing import Optional, Dict, Any
 
+from .utils.platform_compat import run_tool
 from .utils.teardown import (
     calculate_cron_from_timestamp,
     load_deployment_metadata,
@@ -68,9 +68,9 @@ def get_teardown_status(
     """
     scheduler_job_name = f"deployml-teardown-{workspace_name}"
     
-    result = subprocess.run(
-        ["gcloud", "scheduler", "jobs", "describe", scheduler_job_name,
-         "--project", project_id, "--location", region, "--format", "json"],
+    result = run_tool(
+        "gcloud", ["scheduler", "jobs", "describe", scheduler_job_name,
+                   "--project", project_id, "--location", region, "--format", "json"],
         capture_output=True,
         text=True,
     )
@@ -141,9 +141,9 @@ def update_teardown_schedule(
     scheduler_job_name = f"deployml-teardown-{workspace_name}"
     
     # Check if job exists and get current timezone
-    result = subprocess.run(
-        ["gcloud", "scheduler", "jobs", "describe", scheduler_job_name,
-         "--project", project_id, "--location", region, "--format", "json"],
+    result = run_tool(
+        "gcloud", ["scheduler", "jobs", "describe", scheduler_job_name,
+                   "--project", project_id, "--location", region, "--format", "json"],
         capture_output=True,
         text=True,
     )
@@ -168,9 +168,10 @@ def update_teardown_schedule(
     new_cron_schedule = calculate_cron_from_timestamp(teardown_scheduled_timestamp)
     
     # Update Cloud Scheduler job
-    update_result = subprocess.run(
+    update_result = run_tool(
+        "gcloud",
         [
-            "gcloud", "scheduler", "jobs", "update", "http", scheduler_job_name,
+            "scheduler", "jobs", "update", "http", scheduler_job_name,
             "--location", region,
             "--schedule", new_cron_schedule,
             "--time-zone", time_zone,
@@ -231,9 +232,9 @@ def cancel_teardown(
     """
     scheduler_job_name = f"deployml-teardown-{workspace_name}"
     
-    result = subprocess.run(
-        ["gcloud", "scheduler", "jobs", "delete", scheduler_job_name,
-         "--project", project_id, "--location", region, "--quiet"],
+    result = run_tool(
+        "gcloud", ["scheduler", "jobs", "delete", scheduler_job_name,
+                   "--project", project_id, "--location", region, "--quiet"],
         capture_output=True,
         text=True,
     )
diff --git a/src/deployml/cli/cli.py b/src/deployml/cli/cli.py
index 0f08c6a..bb73645 100644
--- a/src/deployml/cli/cli.py
+++ b/src/deployml/cli/cli.py
@@ -2,7 +2,6 @@
 import yaml
 import typer
 import shutil
-import subprocess
 import re
 import importlib.resources as pkg_resources
 from deployml.utils.banner import display_banner
@@ -47,6 +46,7 @@
     run_terraform_with_loading_bar,
     _create_docker_folder,
 )
+from deployml.utils.platform_compat import run_tool, resolve_tool
 from deployml.utils.infracost import (
     check_infracost_available,
     run_infracost_analysis,
@@ -81,8 +81,8 @@ def upload_terraform_files_to_gcs(terraform_dir: Path, project_id: str, workspac
     try:
         # Get terraform files bucket from Terraform state
         # The bucket is created by the teardown module
-        state_proc = subprocess.run(
-            ["terraform", "state", "list"],
+        state_proc = run_tool(
+            "terraform", ["state", "list"],
             cwd=terraform_dir,
             capture_output=True,
             text=True,
@@ -104,8 +104,8 @@ def upload_terraform_files_to_gcs(terraform_dir: Path, project_id: str, workspac
             return
         
         # Get bucket name from state
-        show_proc = subprocess.run(
-            ["terraform", "state", "show", bucket_resource],
+        show_proc = run_tool(
+            "terraform", ["state", "show", bucket_resource],
             cwd=terraform_dir,
             capture_output=True,
             text=True,
@@ -160,7 +160,6 @@ def extract_resource_manifest(terraform_dir: Path, project_id: str, workspace_na
     Returns a manifest dictionary with all resources that need to be deleted.
     """
     import json
-    import subprocess
     from urllib.parse import urlparse
     
     manifest = {
@@ -181,13 +180,13 @@ def extract_resource_manifest(terraform_dir: Path, project_id: str, workspace_na
     }
     
     # Get Terraform outputs
-    output_proc = subprocess.run(
-        ["terraform", "output", "-json"],
+    output_proc = run_tool(
+        "terraform", ["output", "-json"],
         cwd=terraform_dir,
         capture_output=True,
         text=True,
     )
-    
+
     if output_proc.returncode == 0:
         outputs = json.loads(output_proc.stdout)
         
@@ -217,8 +216,8 @@ def extract_resource_manifest(terraform_dir: Path, project_id: str, workspace_na
                     })
     
     # Query Terraform state for additional resources
-    state_proc = subprocess.run(
-        ["terraform", "state", "list"],
+    state_proc = run_tool(
+        "terraform", ["state", "list"],
         cwd=terraform_dir,
         capture_output=True,
         text=True,
@@ -231,8 +230,8 @@ def extract_resource_manifest(terraform_dir: Path, project_id: str, workspace_na
             try:
                 # Cloud Run services (v1 and v2)
                 if 'google_cloud_run_service' in resource and 'google_cloud_run_v2_job' not in resource:
-                    show_proc = subprocess.run(
-                        ["terraform", "state", "show", resource],
+                    show_proc = run_tool(
+                        "terraform", ["state", "show", resource],
                         cwd=terraform_dir,
                         capture_output=True,
                         text=True,
@@ -268,8 +267,8 @@ def extract_resource_manifest(terraform_dir: Path, project_id: str, workspace_na
                 
                 # Cloud Run Jobs
                 elif 'google_cloud_run_v2_job' in resource:
-                    show_proc = subprocess.run(
-                        ["terraform", "state", "show", resource],
+                    show_proc = run_tool(
+                        "terraform", ["state", "show", resource],
                         cwd=terraform_dir,
                         capture_output=True,
                         text=True,
@@ -287,8 +286,8 @@ def extract_resource_manifest(terraform_dir: Path, project_id: str, workspace_na
                 
                 # Cloud Scheduler jobs
                 elif 'google_cloud_scheduler_job' in resource:
-                    show_proc = subprocess.run(
-                        ["terraform", "state", "show", resource],
+                    show_proc = run_tool(
+                        "terraform", ["state", "show", resource],
                         cwd=terraform_dir,
                         capture_output=True,
                         text=True,
@@ -312,8 +311,8 @@ def extract_resource_manifest(terraform_dir: Path, project_id: str, workspace_na
                 
                 # Pub/Sub topics
                 elif 'google_pubsub_topic' in resource:
-                    show_proc = subprocess.run(
-                        ["terraform", "state", "show", resource],
+                    show_proc = run_tool(
+                        "terraform", ["state", "show", resource],
                         cwd=terraform_dir,
                         capture_output=True,
                         text=True,
@@ -330,8 +329,8 @@ def extract_resource_manifest(terraform_dir: Path, project_id: str, workspace_na
                 
                 # Secret Manager secrets
                 elif 'google_secret_manager_secret' in resource:
-                    show_proc = subprocess.run(
-                        ["terraform", "state", "show", resource],
+                    show_proc = run_tool(
+                        "terraform", ["state", "show", resource],
                         cwd=terraform_dir,
                         capture_output=True,
                         text=True,
@@ -348,8 +347,8 @@ def extract_resource_manifest(terraform_dir: Path, project_id: str, workspace_na
                 
                 # Service accounts (only teardown ones to avoid deleting user SAs)
                 elif 'google_service_account' in resource and 'teardown' in resource:
-                    show_proc = subprocess.run(
-                        ["terraform", "state", "show", resource],
+                    show_proc = run_tool(
+                        "terraform", ["state", "show", resource],
                         cwd=terraform_dir,
                         capture_output=True,
                         text=True,
@@ -366,8 +365,8 @@ def extract_resource_manifest(terraform_dir: Path, project_id: str, workspace_na
                 
                 # Cloud Build triggers
                 elif 'google_cloudbuild_trigger' in resource:
-                    show_proc = subprocess.run(
-                        ["terraform", "state", "show", resource],
+                    show_proc = run_tool(
+                        "terraform", ["state", "show", resource],
                         cwd=terraform_dir,
                         capture_output=True,
                         text=True,
@@ -411,8 +410,8 @@ def upload_resource_manifest(manifest: dict, terraform_dir: Path, project_id: st
     
     try:
         # Get bucket name from Terraform state (same logic as upload_terraform_files_to_gcs)
-        state_proc = subprocess.run(
-            ["terraform", "state", "list"],
+        state_proc = run_tool(
+            "terraform", ["state", "list"],
             cwd=terraform_dir,
             capture_output=True,
             text=True,
@@ -430,8 +429,8 @@ def upload_resource_manifest(manifest: dict, terraform_dir: Path, project_id: st
         if not bucket_resource:
             raise Exception("Teardown module bucket not found in state")
         
-        show_proc = subprocess.run(
-            ["terraform", "state", "show", bucket_resource],
+        show_proc = run_tool(
+            "terraform", ["state", "show", bucket_resource],
             cwd=terraform_dir,
             capture_output=True,
             text=True,
@@ -563,8 +562,8 @@ def get_version():
         return version("deployml-core")
     except Exception:
         try:
-            result = subprocess.run(
-                ["git", "describe", "--tags", "--abbrev=0"],
+            result = run_tool(
+                "git", ["describe", "--tags", "--abbrev=0"],
                 capture_output=True,
                 text=True,
                 cwd=Path(__file__).parent.parent.parent.parent
@@ -669,9 +668,9 @@ def doctor(
             typer.echo(
                 f"\n Checking enabled APIs for project: {project_id} ..."
             )
-            result = subprocess.run(
+            result = run_tool(
+                "gcloud",
                 [
-                    "gcloud",
                     "services",
                     "list",
                     "--enabled",
@@ -1343,16 +1342,16 @@ def deploy(
     # Auth and ADC were already preflighted above, so just point gcloud at the project.
     typer.echo(f" Deploying {config.get('name', workspace_name)} to {cloud}...")
 
-    subprocess.run(
-        ["gcloud", "config", "set", "project", project_id],
+    run_tool(
+        "gcloud", ["config", "set", "project", project_id],
         cwd=DEPLOYML_TERRAFORM_DIR,
     )
 
     typer.echo(" Initializing Terraform...")
     # Capture stderr so init failures (state lock, missing ADC, bucket perms)
     # surface a real message instead of a silent exit.
-    init_proc = subprocess.run(
-        ["terraform", "init"],
+    init_proc = run_tool(
+        "terraform", ["init"],
         cwd=DEPLOYML_TERRAFORM_DIR,
         capture_output=True,
         text=True,
@@ -1364,8 +1363,8 @@ def deploy(
         raise typer.Exit(code=1)
 
     typer.echo(" Planning deployment...")
-    result = subprocess.run(
-        ["terraform", "plan"],
+    result = run_tool(
+        "terraform", ["plan"],
         cwd=DEPLOYML_TERRAFORM_DIR,
         capture_output=True,
         text=True,
@@ -1443,8 +1442,8 @@ def deploy(
         estimated_time = estimate_terraform_time(result.stdout, "apply")
         typer.echo(f" Applying changes... (Estimated time: {estimated_time})")
         # Re-init before apply; capture stderr to surface failures
-        init_proc2 = subprocess.run(
-            ["terraform", "init"],
+        init_proc2 = run_tool(
+            "terraform", ["init"],
             cwd=DEPLOYML_TERRAFORM_DIR,
             capture_output=True,
             text=True,
@@ -1507,9 +1506,10 @@ def deploy(
                 scheduler_job_name = f"deployml-teardown-{workspace_name}"
                 try:
                     typer.echo(f" Updating teardown schedule to: {teardown_at.strftime('%Y-%m-%d %H:%M:%S UTC')}")
-                    update_result = subprocess.run(
+                    update_result = run_tool(
+                        "gcloud",
                         [
-                            "gcloud", "scheduler", "jobs", "update", "http", scheduler_job_name,
+                            "scheduler", "jobs", "update", "http", scheduler_job_name,
                             "--location", region,
                             "--schedule", correct_cron_schedule,
                             "--time-zone", time_zone,
@@ -1544,8 +1544,8 @@ def deploy(
                 typer.echo(f"   To cancel: deployml teardown cancel --config-path {config_path}")
             
             # Show all Terraform outputs in a user-friendly way
-            output_proc = subprocess.run(
-                ["terraform", "output", "-json"],
+            output_proc = run_tool(
+                "terraform", ["output", "-json"],
                 cwd=DEPLOYML_TERRAFORM_DIR,
                 capture_output=True,
                 text=True,
@@ -1662,8 +1662,8 @@ def get_urls(
         typer.echo(f" No deployment found at {terraform_dir}. Run 'deployml deploy' first.")
         raise typer.Exit(code=1)
 
-    output_proc = subprocess.run(
-        ["terraform", "output", "-json"],
+    output_proc = run_tool(
+        "terraform", ["output", "-json"],
         cwd=terraform_dir,
         capture_output=True,
         text=True,
@@ -1724,9 +1724,9 @@ def get_urls(
         # Grafana admin password
         grafana_secret = outputs.get("grafana_admin_password_secret_id", {}).get("value", "")
         if grafana_secret and project_id:
-            fetch = subprocess.run(
-                ["gcloud", "secrets", "versions", "access", "latest",
-                 "--secret", grafana_secret, "--project", project_id],
+            fetch = run_tool(
+                "gcloud", ["secrets", "versions", "access", "latest",
+                           "--secret", grafana_secret, "--project", project_id],
                 capture_output=True, text=True,
             )
             if fetch.returncode == 0:
@@ -1810,8 +1810,8 @@ def destroy(
         typer.echo(f" Destroying infrastructure...")
 
         # Set GCP project
-        subprocess.run(
-            ["gcloud", "config", "set", "project", project_id],
+        run_tool(
+            "gcloud", ["config", "set", "project", project_id],
             cwd=DEPLOYML_TERRAFORM_DIR,
         )
 
@@ -1819,30 +1819,30 @@ def destroy(
         # before attempting to destroy Cloud SQL — otherwise active connections
         # prevent database/user deletion and the destroy fails.
         region = config.get("provider", {}).get("region", "us-central1")
-        cr_result = subprocess.run(
-            ["gcloud", "run", "services", "list",
-             "--project", project_id,
-             "--region", region,
-             "--format", "value(metadata.name)"],
+        cr_result = run_tool(
+            "gcloud", ["run", "services", "list",
+                       "--project", project_id,
+                       "--region", region,
+                       "--format", "value(metadata.name)"],
             capture_output=True, text=True
         )
         if cr_result.returncode == 0:
             services = [s.strip() for s in cr_result.stdout.splitlines() if s.strip()]
             for service in services:
                 typer.echo(f" Deleting Cloud Run service: {service}")
-                subprocess.run(
-                    ["gcloud", "run", "services", "delete", service,
-                     "--project", project_id,
-                     "--region", region,
-                     "--quiet"],
+                run_tool(
+                    "gcloud", ["run", "services", "delete", service,
+                               "--project", project_id,
+                               "--region", region,
+                               "--quiet"],
                     capture_output=True,
                 )
 
         # Remove Cloud SQL databases and user from Terraform state so Terraform
         # doesn't try to delete them individually — the instance deletion handles
         # that automatically, avoiding active-connection errors on destroy.
-        state_result = subprocess.run(
-            ["terraform", "state", "list"],
+        state_result = run_tool(
+            "terraform", ["state", "list"],
             cwd=DEPLOYML_TERRAFORM_DIR,
             capture_output=True,
             text=True,
@@ -1857,8 +1857,8 @@ def destroy(
             ]
             for resource in resources_to_remove:
                 typer.echo(f" Removing from state: {resource}")
-                subprocess.run(
-                    ["terraform", "state", "rm", resource],
+                run_tool(
+                    "terraform", ["state", "rm", resource],
                     cwd=DEPLOYML_TERRAFORM_DIR,
                     capture_output=True,
                 )
@@ -1867,7 +1867,7 @@ def destroy(
         cmd = ["terraform", "destroy", "--auto-approve"]
 
         # Run destroy
-        result = subprocess.run(cmd, cwd=DEPLOYML_TERRAFORM_DIR, check=False)
+        result = run_tool(cmd[0], cmd[1:], cwd=DEPLOYML_TERRAFORM_DIR, check=False)
 
         if result.returncode == 0:
             typer.echo(" Infrastructure destroyed successfully!")
@@ -1877,9 +1877,9 @@ def destroy(
             region = config.get("provider", {}).get("region", "us-central1")
             ar_repo = "mlops-images"
             typer.echo(f" Removing Artifact Registry repo {ar_repo}...")
-            subprocess.run(
-                ["gcloud", "artifacts", "repositories", "delete", ar_repo,
-                 "--location", region, "--project", project_id, "--quiet"],
+            run_tool(
+                "gcloud", ["artifacts", "repositories", "delete", ar_repo,
+                           "--location", region, "--project", project_id, "--quiet"],
                 capture_output=True,
             )
 
@@ -1889,8 +1889,8 @@ def destroy(
             # recreates it on the next build if needed.
             cb_bucket = f"gs://{project_id}_cloudbuild"
             typer.echo(f" Removing Cloud Build staging bucket {cb_bucket}...")
-            subprocess.run(
-                ["gcloud", "storage", "rm", "--recursive", cb_bucket, "--quiet"],
+            run_tool(
+                "gcloud", ["storage", "rm", "--recursive", cb_bucket, "--quiet"],
                 capture_output=True,
             )
 
@@ -1945,8 +1945,8 @@ def status(
     marker = deployml_dir / ".project_id"
     if marker.exists():
         typer.echo(f"Project: {marker.read_text().strip()}")
-    out_proc = subprocess.run(
-        ["terraform", "output", "-json"],
+    out_proc = run_tool(
+        "terraform", ["output", "-json"],
         cwd=tf_dir, capture_output=True, text=True,
     )
     if out_proc.returncode == 0 and out_proc.stdout.strip():
@@ -2007,9 +2007,9 @@ def cancel_teardown(config: dict, deployml_dir: Path, workspace_name: str):
     # Delete Cloud Scheduler job. Cloud Scheduler uses --location, not --region.
     # Earlier code passed --region which gcloud rejects, so cancel silently failed.
     scheduler_job_name = f"deployml-teardown-{workspace_name}"
-    result = subprocess.run(
-        ["gcloud", "scheduler", "jobs", "delete", scheduler_job_name,
-         "--project", project_id, "--location", region, "--quiet"],
+    result = run_tool(
+        "gcloud", ["scheduler", "jobs", "delete", scheduler_job_name,
+                   "--project", project_id, "--location", region, "--quiet"],
         capture_output=True,
         text=True,
     )
@@ -2033,9 +2033,9 @@ def show_teardown_status(config: dict, deployml_dir: Path, workspace_name: str):
     scheduler_job_name = f"deployml-teardown-{workspace_name}"
     
     # Query Cloud Scheduler job
-    result = subprocess.run(
-        ["gcloud", "scheduler", "jobs", "describe", scheduler_job_name,
-         "--project", project_id, "--location", region, "--format", "json"],
+    result = run_tool(
+        "gcloud", ["scheduler", "jobs", "describe", scheduler_job_name,
+                   "--project", project_id, "--location", region, "--format", "json"],
         capture_output=True,
         text=True,
     )
@@ -2130,9 +2130,9 @@ def update_teardown_schedule(config: dict, deployml_dir: Path, workspace_name: s
     scheduler_job_name = f"deployml-teardown-{workspace_name}"
     
     # Check if Cloud Scheduler job exists
-    result = subprocess.run(
-        ["gcloud", "scheduler", "jobs", "describe", scheduler_job_name,
-         "--project", project_id, "--location", region, "--format", "json"],
+    result = run_tool(
+        "gcloud", ["scheduler", "jobs", "describe", scheduler_job_name,
+                   "--project", project_id, "--location", region, "--format", "json"],
         capture_output=True,
         text=True,
     )
@@ -2182,9 +2182,10 @@ def update_teardown_schedule(config: dict, deployml_dir: Path, workspace_name: s
     # Update Cloud Scheduler job
     typer.echo("\n Updating Cloud Scheduler job...")
     typer.echo(f"   Cron schedule: {new_cron_schedule}")
-    update_result = subprocess.run(
+    update_result = run_tool(
+        "gcloud",
         [
-            "gcloud", "scheduler", "jobs", "update", "http", scheduler_job_name,
+            "scheduler", "jobs", "update", "http", scheduler_job_name,
             "--location", region,
             "--schedule", new_cron_schedule,
             "--time-zone", time_zone,
@@ -2202,9 +2203,9 @@ def update_teardown_schedule(config: dict, deployml_dir: Path, workspace_name: s
         raise typer.Exit(code=1)
     
     # Verify the update by querying the job again
-    verify_result = subprocess.run(
-        ["gcloud", "scheduler", "jobs", "describe", scheduler_job_name,
-         "--project", project_id, "--location", region, "--format", "json"],
+    verify_result = run_tool(
+        "gcloud", ["scheduler", "jobs", "describe", scheduler_job_name,
+                   "--project", project_id, "--location", region, "--format", "json"],
         capture_output=True,
         text=True,
     )
@@ -2310,9 +2311,9 @@ def init(
         typer.echo(
             f" Enabling required GCP APIs for project: {project_id} ..."
         )
-        result = subprocess.run(
+        result = run_tool(
+            "gcloud",
             [
-                "gcloud",
                 "services",
                 "enable",
                 *REQUIRED_GCP_APIS,
@@ -2654,7 +2655,7 @@ def gke_cluster_create(
         ]
     typer.echo(f" Creating {'Autopilot' if autopilot else 'standard'} cluster {cluster}...")
     typer.echo("   This typically takes 5 to 10 minutes.")
-    result = subprocess.run(cmd, capture_output=False, text=True)
+    result = run_tool(cmd[0], cmd[1:], capture_output=False, text=True)
     if result.returncode != 0:
         raise typer.Exit(code=1)
     typer.secho(f" Cluster {cluster} created.", fg=typer.colors.GREEN)
@@ -2722,8 +2723,8 @@ def gke_destroy(
         f = manifest_dir / fname
         if f.exists():
             typer.echo(f" Deleting {fname}...")
-            result = subprocess.run(
-                ["kubectl", "delete", "-f", str(f), "--ignore-not-found"] + ns,
+            result = run_tool(
+                "kubectl", ["delete", "-f", str(f), "--ignore-not-found"] + ns,
                 capture_output=True, text=True,
             )
             if result.returncode == 0:
@@ -2745,9 +2746,9 @@ def gke_destroy(
                 image = ""
         if image.startswith("gcr.io/"):
             typer.echo(f" Removing image {image}...")
-            subprocess.run(
-                ["gcloud", "container", "images", "delete", image,
-                 "--force-delete-tags", "--quiet", "--project", project],
+            run_tool(
+                "gcloud", ["container", "images", "delete", image,
+                           "--force-delete-tags", "--quiet", "--project", project],
                 capture_output=True,
             )
 
@@ -2765,7 +2766,7 @@ def gke_destroy(
         # billing. Retry until the in-flight operation clears.
         loc = zone or region
         for attempt in range(6):
-            result = subprocess.run(cmd, capture_output=True, text=True)
+            result = run_tool(cmd[0], cmd[1:], capture_output=True, text=True)
             if result.returncode == 0:
                 typer.echo(f" Cluster {cluster} deleted")
                 break
diff --git a/src/deployml/diagnostics/doctor.py b/src/deployml/diagnostics/doctor.py
index 791ff97..f06ba60 100644
--- a/src/deployml/diagnostics/doctor.py
+++ b/src/deployml/diagnostics/doctor.py
@@ -1,4 +1,3 @@
-import subprocess
 import shutil
 import os
 import sys
@@ -7,6 +6,9 @@
 from typing import Dict, List, Optional, Tuple
 import json
 import importlib
+
+from deployml.utils.platform_compat import run_tool
+
 try:
     from importlib.metadata import version as get_package_version_metadata
 except ImportError:
@@ -216,7 +218,7 @@ def _check_docker(self):
             return
         
         try:
-            result = subprocess.run(['docker', '--version'], capture_output=True, text=True)
+            result = run_tool('docker', ['--version'], capture_output=True, text=True)
             if result.returncode == 0:
                 version = result.stdout.strip()
                 self._add_result(CheckResult(
@@ -251,7 +253,7 @@ def _check_terraform(self):
             return
         
         try:
-            result = subprocess.run(['terraform', 'version'], capture_output=True, text=True)
+            result = run_tool('terraform', ['version'], capture_output=True, text=True)
             if result.returncode == 0:
                 version_line = result.stdout.split('\n')[0]
                 self._add_result(CheckResult(
@@ -286,7 +288,7 @@ def _check_cloud_cli_tools(self):
             if shutil.which(tool):
                 try:
                     if tool == 'gcloud':
-                        result = subprocess.run(['gcloud', 'version'], capture_output=True, text=True)
+                        result = run_tool('gcloud', ['version'], capture_output=True, text=True)
                         if result.returncode == 0:
                             version = result.stdout.split('\n')[0]
                             self._add_result(CheckResult(
@@ -304,7 +306,7 @@ def _check_cloud_cli_tools(self):
                             ))
                     else:
                         # For AWS and Azure CLI
-                        result = subprocess.run([tool, '--version'], capture_output=True, text=True)
+                        result = run_tool(tool, ['--version'], capture_output=True, text=True)
                         if result.returncode == 0:
                             version = result.stdout.strip()
                             self._add_result(CheckResult(
@@ -342,7 +344,7 @@ def _check_git(self):
             return
         
         try:
-            result = subprocess.run(['git', '--version'], capture_output=True, text=True)
+            result = run_tool('git', ['--version'], capture_output=True, text=True)
             if result.returncode == 0:
                 version = result.stdout.strip()
                 self._add_result(CheckResult(
@@ -372,7 +374,7 @@ def _check_infracost(self):
             return
         
         try:
-            result = subprocess.run(['infracost', '--version'], capture_output=True, text=True)
+            result = run_tool('infracost', ['--version'], capture_output=True, text=True)
             if result.returncode == 0:
                 version = result.stdout.strip()
                 self._add_result(CheckResult(
@@ -392,7 +394,7 @@ def _check_infracost(self):
     def _check_docker_permissions(self):
         """Check Docker permissions"""
         try:
-            result = subprocess.run(['docker', 'ps'], capture_output=True, text=True)
+            result = run_tool('docker', ['ps'], capture_output=True, text=True)
             if result.returncode == 0:
                 self._add_result(CheckResult(
                     name="Docker Permissions",
@@ -427,7 +429,7 @@ def _check_cloud_authentication(self):
         # Check GCP authentication
         if shutil.which('gcloud'):
             try:
-                result = subprocess.run(['gcloud', 'auth', 'list'], capture_output=True, text=True)
+                result = run_tool('gcloud', ['auth', 'list'], capture_output=True, text=True)
                 if result.returncode == 0 and "ACTIVE" in result.stdout:
                     self._add_result(CheckResult(
                         name="GCP Authentication",
diff --git a/src/deployml/notebook/docker.py b/src/deployml/notebook/docker.py
index b6fb6ac..9d02f0f 100644
--- a/src/deployml/notebook/docker.py
+++ b/src/deployml/notebook/docker.py
@@ -3,6 +3,7 @@
 from typing import Optional
 
 from deployml.utils.helpers import check_docker_daemon
+from deployml.utils.platform_compat import run_tool
 
 class ImageBuildError(Exception):
     pass
@@ -94,8 +95,8 @@ def build_images(
                 print()
             else:
                 print("Ensuring Artifact Registry repository exists...")
-                create_proc = subprocess.run(
-                    create_cmd, check=False,
+                create_proc = run_tool(
+                    create_cmd[0], create_cmd[1:], check=False,
                     capture_output=True, text=True,
                 )
                 stderr_lower = (create_proc.stderr or "").lower()
@@ -127,7 +128,7 @@ def build_images(
                 print()
             else:
                 print(f"Building {service_name} via Cloud Build...")
-                subprocess.run(build_cmd, check=True)
+                run_tool(build_cmd[0], build_cmd[1:], check=True)
                 print(f"Pushed: {image_uri}")
                 print()
 
@@ -155,7 +156,7 @@ def build_images(
                 print()
             else:
                 print(f"Building {service_name} locally...")
-                subprocess.run(build_cmd, check=True)
+                run_tool(build_cmd[0], build_cmd[1:], check=True)
                 print(f"Built: {image_name}")
                 print()
 
@@ -168,8 +169,8 @@ def build_images(
 
 def _validate_docker():
     try:
-        subprocess.run(
-            ["docker", "--version"],
+        run_tool(
+            "docker", ["--version"],
             check=True,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
@@ -187,9 +188,9 @@ def _build_locally(service_dirs: list[Path], tag: str):
 
         print(f"Building {image_name} ...")
 
-        subprocess.run(
+        run_tool(
+            "docker",
             [
-                "docker",
                 "build",
                 "-t",
                 image_name,
@@ -207,8 +208,8 @@ def _build_locally(service_dirs: list[Path], tag: str):
 
 def _validate_gcloud():
     try:
-        subprocess.run(
-            ["gcloud", "--version"],
+        run_tool(
+            "gcloud", ["--version"],
             check=True,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
@@ -238,9 +239,9 @@ def _build_with_cloud_build(
 
         print(f"Submitting Cloud Build for {image_uri} ...")
 
-        subprocess.run(
+        run_tool(
+            "gcloud",
             [
-                "gcloud",
                 "builds",
                 "submit",
                 str(service_dir),
diff --git a/src/deployml/notebook/stack.py b/src/deployml/notebook/stack.py
index bb1319f..e6967a1 100644
--- a/src/deployml/notebook/stack.py
+++ b/src/deployml/notebook/stack.py
@@ -1,5 +1,4 @@
 import json
-import subprocess
 from pathlib import Path
 from typing import Dict, Any, TYPE_CHECKING
 from datetime import datetime, timezone
@@ -10,6 +9,7 @@
 
 from .urls import ServiceURLs
 from .display import display_services_table
+from deployml.utils.platform_compat import run_tool
 
 
 class DeploymentStack:
@@ -37,8 +37,8 @@ def _extract_urls(self) -> ServiceURLs:
         """Extract URLs from Terraform outputs"""
         try:
             terraform_dir = self.workspace_dir / "terraform"
-            result = subprocess.run(
-                ["terraform", "output", "-json"],
+            result = run_tool(
+                "terraform", ["output", "-json"],
                 cwd=terraform_dir,
                 capture_output=True,
                 text=True,
@@ -130,8 +130,8 @@ def get_postgresql_info(self, show_credentials: bool = False) -> Dict[str, str]:
         """
         try:
             terraform_dir = self.workspace_dir / "terraform"
-            result = subprocess.run(
-                ["terraform", "output", "-json"],
+            result = run_tool(
+                "terraform", ["output", "-json"],
                 cwd=terraform_dir,
                 capture_output=True,
                 text=True,
@@ -149,8 +149,8 @@ def get_postgresql_info(self, show_credentials: bool = False) -> Dict[str, str]:
                     if show_credentials:
                         # Try to get the actual sensitive value
                         try:
-                            sensitive_result = subprocess.run(
-                                ["terraform", "output", "-raw", key],
+                            sensitive_result = run_tool(
+                                "terraform", ["output", "-raw", key],
                                 cwd=terraform_dir,
                                 capture_output=True,
                                 text=True,
@@ -326,8 +326,8 @@ def get_cron_jobs_info(self) -> Dict[str, Any]:
         """Get detailed cron job information"""
         try:
             terraform_dir = self.workspace_dir / "terraform"
-            result = subprocess.run(
-                ["terraform", "output", "-json"],
+            result = run_tool(
+                "terraform", ["output", "-json"],
                 cwd=terraform_dir,
                 capture_output=True,
                 text=True,
diff --git a/src/deployml/utils/helpers.py b/src/deployml/utils/helpers.py
index 4081e1e..c43b4ab 100644
--- a/src/deployml/utils/helpers.py
+++ b/src/deployml/utils/helpers.py
@@ -8,6 +8,7 @@
 import random
 import string
 from deployml.utils.constants import ANIMAL_NAMES, FALLBACK_WORDS, TERRAFORM_DIR
+from deployml.utils.platform_compat import run_tool, resolve_tool
 import subprocess
 import time
 from rich.progress import (
@@ -47,8 +48,8 @@ def check_gcp_auth() -> bool:
         bool: True if authenticated, False otherwise.
     """
     try:
-        result = subprocess.run(
-            ["gcloud", "auth", "list"], capture_output=True, text=True
+        result = run_tool(
+            "gcloud", ["auth", "list"], capture_output=True, text=True
         )
         return "ACTIVE" in result.stdout
     except Exception:
@@ -58,8 +59,8 @@ def check_gcp_auth() -> bool:
 def check_gcp_adc() -> bool:
     """Application Default Credentials are required by Terraform and client libs."""
     try:
-        result = subprocess.run(
-            ["gcloud", "auth", "application-default", "print-access-token"],
+        result = run_tool(
+            "gcloud", ["auth", "application-default", "print-access-token"],
             capture_output=True, text=True,
         )
         return result.returncode == 0
@@ -71,8 +72,8 @@ def check_bq() -> bool:
     if not shutil.which("bq"):
         return False
     try:
-        result = subprocess.run(
-            ["bq", "version"], capture_output=True, text=True,
+        result = run_tool(
+            "bq", ["version"], capture_output=True, text=True,
         )
         return result.returncode == 0
     except Exception:
@@ -85,8 +86,8 @@ def get_terraform_version() -> Optional[tuple]:
         return None
     try:
         import json as _json
-        result = subprocess.run(
-            ["terraform", "version", "-json"],
+        result = run_tool(
+            "terraform", ["version", "-json"],
             capture_output=True, text=True,
         )
         if result.returncode != 0:
@@ -101,9 +102,9 @@ def get_terraform_version() -> Optional[tuple]:
 def validate_gcp_project(project_id: str) -> bool:
     """Verify project exists and active gcloud account can access it."""
     try:
-        result = subprocess.run(
-            ["gcloud", "projects", "describe", project_id,
-             "--format=value(projectId)"],
+        result = run_tool(
+            "gcloud", ["projects", "describe", project_id,
+                       "--format=value(projectId)"],
             capture_output=True, text=True,
         )
         return result.returncode == 0 and result.stdout.strip() == project_id
@@ -122,7 +123,7 @@ def validate_gcp_region(region: str, project_id: Optional[str] = None) -> bool:
         if project_id:
             cmd += ["--project", project_id]
         try:
-            result = subprocess.run(cmd, capture_output=True, text=True)
+            result = run_tool(cmd[0], cmd[1:], capture_output=True, text=True)
             if result.returncode != 0:
                 print(
                     f"Warning: could not verify region '{region}' "
@@ -146,16 +147,16 @@ def get_missing_iam_roles(project_id: str, required_roles: list) -> list:
     """Return roles the active account lacks. roles/owner short-circuits to empty."""
     try:
         import json as _json
-        account_result = subprocess.run(
-            ["gcloud", "config", "get-value", "account"],
+        account_result = run_tool(
+            "gcloud", ["config", "get-value", "account"],
             capture_output=True, text=True,
         )
         account = account_result.stdout.strip()
         if not account:
             return list(required_roles)
 
-        result = subprocess.run(
-            ["gcloud", "projects", "get-iam-policy", project_id, "--format=json"],
+        result = run_tool(
+            "gcloud", ["projects", "get-iam-policy", project_id, "--format=json"],
             capture_output=True, text=True,
         )
         if result.returncode != 0:
@@ -180,8 +181,8 @@ def check_docker_daemon() -> bool:
     if not shutil.which("docker"):
         return False
     try:
-        result = subprocess.run(
-            ["docker", "info"], capture_output=True, text=True,
+        result = run_tool(
+            "docker", ["info"], capture_output=True, text=True,
         )
         return result.returncode == 0
     except Exception:
@@ -399,12 +400,11 @@ def cleanup_cloud_sql_resources(terraform_dir: Path, project_id: str):
     delete databases and users. We just restart the instance — that kills all
     active connections — and let Terraform handle the actual resource deletion.
     """
-    import subprocess
     import time as _time
 
     try:
-        result = subprocess.run(
-            ["terraform", "output", "-raw", "instance_connection_name"],
+        result = run_tool(
+            "terraform", ["output", "-raw", "instance_connection_name"],
             cwd=terraform_dir,
             capture_output=True,
             text=True,
@@ -417,9 +417,9 @@ def cleanup_cloud_sql_resources(terraform_dir: Path, project_id: str):
         instance_name = parts[2] if len(parts) == 3 else instance_connection_name
 
         print(f"🗄️  Restarting Cloud SQL instance to close active connections: {instance_name}")
-        subprocess.run(
-            ["gcloud", "sql", "instances", "restart", instance_name,
-             "--project", project_id, "--quiet"],
+        run_tool(
+            "gcloud", ["sql", "instances", "restart", instance_name,
+                       "--project", project_id, "--quiet"],
             capture_output=True,
             text=True,
         )
@@ -468,6 +468,11 @@ def run_terraform_with_loading_bar(cmd, cwd, estimated_minutes, stack=None, verb
     Returns:
         int: The return code of the process.
     """
+    # Resolve the tool to its full path so the streaming Popen calls below work on
+    # Windows, where a tool installed as a .cmd wrapper fails when launched by bare
+    # name. terraform is a real .exe, but resolving keeps this robust if cmd[0] changes.
+    cmd = [resolve_tool(cmd[0]), *cmd[1:]]
+
     # Default messages if stack is not provided
     default_msgs = [
         "DeployML: Preparing your cloud environment...",
diff --git a/src/deployml/utils/infracost.py b/src/deployml/utils/infracost.py
index 5418ef2..ef9f1e5 100644
--- a/src/deployml/utils/infracost.py
+++ b/src/deployml/utils/infracost.py
@@ -5,6 +5,8 @@
 from typing import Dict, Optional, List
 from dataclasses import dataclass
 
+from deployml.utils.platform_compat import run_tool
+
 
 @dataclass
 class CostComponent:
@@ -48,8 +50,8 @@ def check_infracost_available() -> bool:
         bool: True if infracost is available, False otherwise.
     """
     try:
-        result = subprocess.run(
-            ["infracost", "--version"],
+        result = run_tool(
+            "infracost", ["--version"],
             capture_output=True,
             text=True,
             timeout=10,
@@ -89,8 +91,8 @@ def run_infracost_breakdown(terraform_dir: Path, usage_file: Optional[Path] = No
         if usage_file is not None:
             cmd.extend(["--usage-file", str(usage_file)])
 
-        result = subprocess.run(
-            cmd,
+        result = run_tool(
+            cmd[0], cmd[1:],
             cwd=terraform_dir,
             capture_output=True,
             text=True,
diff --git a/src/deployml/utils/kubernetes_gke.py b/src/deployml/utils/kubernetes_gke.py
index e7b90a4..54fc0a9 100644
--- a/src/deployml/utils/kubernetes_gke.py
+++ b/src/deployml/utils/kubernetes_gke.py
@@ -12,6 +12,7 @@
 
 from deployml.utils.constants import TEMPLATE_DIR
 from deployml.utils.kubernetes_local import ensure_namespace, ns_args
+from deployml.utils.platform_compat import run_tool
 
 
 def check_gke_cluster_connection(cluster_name: str, zone: Optional[str] = None, region: Optional[str] = None) -> bool:
@@ -22,14 +23,14 @@ def check_gke_cluster_connection(cluster_name: str, zone: Optional[str] = None,
     only return True if the current context contains the exact cluster name.
     """
     try:
-        result = subprocess.run(
-            ["kubectl", "cluster-info"],
+        result = run_tool(
+            "kubectl", ["cluster-info"],
             capture_output=True,
             text=True
         )
         if result.returncode == 0:
-            context_result = subprocess.run(
-                ["kubectl", "config", "current-context"],
+            context_result = run_tool(
+                "kubectl", ["config", "current-context"],
                 capture_output=True,
                 text=True
             )
@@ -67,8 +68,8 @@ def connect_to_gke_cluster(
             typer.echo("Either zone or region must be provided")
             return False
         
-        result = subprocess.run(
-            cmd,
+        result = run_tool(
+            cmd[0], cmd[1:],
             check=True,
             capture_output=True,
             text=True
@@ -89,16 +90,16 @@ def push_image_to_gcr(image_name: str, gcr_image: str, project_id: str) -> bool:
     
     try:
         # Tag image
-        tag_result = subprocess.run(
-            ["docker", "tag", image_name, gcr_image],
+        tag_result = run_tool(
+            "docker", ["tag", image_name, gcr_image],
             check=True,
             capture_output=True,
             text=True
         )
-        
+
         # Push image
-        push_result = subprocess.run(
-            ["docker", "push", gcr_image],
+        push_result = run_tool(
+            "docker", ["push", gcr_image],
             check=True,
             capture_output=True,
             text=True
@@ -366,8 +367,8 @@ def deploy_to_gke(
         pvc_file = manifest_dir / "pvc.yaml"
         if pvc_file.exists():
             typer.echo(f"   Applying {pvc_file.name}...")
-            result = subprocess.run(
-                ["kubectl", "apply", "-f", str(pvc_file)] + ns,
+            result = run_tool(
+                "kubectl", ["apply", "-f", str(pvc_file)] + ns,
                 check=True,
                 capture_output=True,
                 text=True
@@ -375,8 +376,8 @@ def deploy_to_gke(
             typer.echo(f"   {result.stdout.strip()}")
 
         typer.echo(f"   Applying {deployment_file.name}...")
-        result = subprocess.run(
-            ["kubectl", "apply", "-f", str(deployment_file)] + ns,
+        result = run_tool(
+            "kubectl", ["apply", "-f", str(deployment_file)] + ns,
             check=True,
             capture_output=True,
             text=True
@@ -384,8 +385,8 @@ def deploy_to_gke(
         typer.echo(f"   {result.stdout.strip()}")
 
         typer.echo(f"   Applying {service_file.name}...")
-        result = subprocess.run(
-            ["kubectl", "apply", "-f", str(service_file)] + ns,
+        result = run_tool(
+            "kubectl", ["apply", "-f", str(service_file)] + ns,
             check=True,
             capture_output=True,
             text=True
@@ -420,14 +421,14 @@ def deploy_to_gke(
             ip_query = "{.status.loadBalancer.ingress[0].ip}"
             cmd = ["kubectl", "get", "svc", service_name,
                    "-o", f"jsonpath={ip_query}"] + ns
-            result = subprocess.run(cmd, capture_output=True, text=True)
+            result = run_tool(cmd[0], cmd[1:], capture_output=True, text=True)
 
             external_ip = result.stdout.strip().strip("'")
             if result.returncode == 0 and external_ip and external_ip != "<none>":
                 port_query = "{.spec.ports[0].port}"
                 port_cmd = ["kubectl", "get", "svc", service_name,
                             "-o", f"jsonpath={port_query}"] + ns
-                port_result = subprocess.run(port_cmd, capture_output=True, text=True)
+                port_result = run_tool(port_cmd[0], port_cmd[1:], capture_output=True, text=True)
                 port = port_result.stdout.strip().strip("'") or "5000"
                 typer.echo(f"\n Service is available at: http://{external_ip}:{port}")
                 break
@@ -438,8 +439,8 @@ def deploy_to_gke(
                 typer.echo(f"   Still waiting... ({waited}s)")
         
         typer.echo("\n Deployment status:")
-        subprocess.run(["kubectl", "get", "pods"] + ns)
-        subprocess.run(["kubectl", "get", "svc"] + ns)
+        run_tool("kubectl", ["get", "pods"] + ns)
+        run_tool("kubectl", ["get", "svc"] + ns)
 
         return True
         
diff --git a/src/deployml/utils/kubernetes_local.py b/src/deployml/utils/kubernetes_local.py
index da3a6a9..b102b1b 100644
--- a/src/deployml/utils/kubernetes_local.py
+++ b/src/deployml/utils/kubernetes_local.py
@@ -4,6 +4,7 @@
 from typing import Optional, Dict
 from jinja2 import Environment, FileSystemLoader
 from deployml.utils.constants import TEMPLATE_DIR
+from deployml.utils.platform_compat import run_tool
 
 
 def ns_args(namespace: Optional[str]) -> list:
@@ -17,21 +18,21 @@ def ensure_namespace(namespace: Optional[str]) -> None:
     namespace. Idempotent via apply of a client-side dry-run manifest."""
     if not namespace or namespace == "default":
         return
-    rendered = subprocess.run(
-        ["kubectl", "create", "namespace", namespace, "--dry-run=client", "-o", "yaml"],
+    rendered = run_tool(
+        "kubectl", ["create", "namespace", namespace, "--dry-run=client", "-o", "yaml"],
         capture_output=True, text=True,
     )
     if rendered.returncode == 0:
-        subprocess.run(["kubectl", "apply", "-f", "-"], input=rendered.stdout,
-                       capture_output=True, text=True)
+        run_tool("kubectl", ["apply", "-f", "-"], input=rendered.stdout,
+                 capture_output=True, text=True)
         typer.echo(f"   Using namespace: {namespace}")
 
 
 def check_minikube_running() -> bool:
     """Check if minikube is currently running."""
     try:
-        result = subprocess.run(
-            ["minikube", "status"],
+        result = run_tool(
+            "minikube", ["status"],
             capture_output=True,
             text=True
         )
@@ -44,8 +45,8 @@ def start_minikube() -> bool:
     """Start minikube cluster."""
     typer.echo("Starting minikube...")
     try:
-        result = subprocess.run(
-            ["minikube", "start"],
+        result = run_tool(
+            "minikube", ["start"],
             check=True,
             capture_output=True,
             text=True
@@ -264,15 +265,15 @@ def deploy_mlflow_to_minikube(manifest_dir: Path, image_name: Optional[str] = No
         pvc_file = manifest_dir / "pvc.yaml"
         if pvc_file.exists():
             typer.echo(f"   Applying {pvc_file.name}...")
-            result = subprocess.run(
-                ["kubectl", "apply", "-f", str(pvc_file)] + ns,
+            result = run_tool(
+                "kubectl", ["apply", "-f", str(pvc_file)] + ns,
                 check=True, capture_output=True, text=True,
             )
             typer.echo(f"{result.stdout.strip()}")
 
         typer.echo(f"   Applying {deployment_file.name}...")
-        result = subprocess.run(
-            ["kubectl", "apply", "-f", str(deployment_file)] + ns,
+        result = run_tool(
+            "kubectl", ["apply", "-f", str(deployment_file)] + ns,
             check=True,
             capture_output=True,
             text=True
@@ -281,8 +282,8 @@ def deploy_mlflow_to_minikube(manifest_dir: Path, image_name: Optional[str] = No
 
         # Apply service
         typer.echo(f"   Applying {service_file.name}...")
-        result = subprocess.run(
-            ["kubectl", "apply", "-f", str(service_file)] + ns,
+        result = run_tool(
+            "kubectl", ["apply", "-f", str(service_file)] + ns,
             check=True,
             capture_output=True,
             text=True
@@ -294,8 +295,8 @@ def deploy_mlflow_to_minikube(manifest_dir: Path, image_name: Optional[str] = No
 
         # Try minikube service --url with timeout (can hang)
         try:
-            result = subprocess.run(
-                ["minikube", "service", "mlflow-service", "--url"] + ns,
+            result = run_tool(
+                "minikube", ["service", "mlflow-service", "--url"] + ns,
                 capture_output=True,
                 text=True,
                 timeout=5  # 5 second timeout to prevent hanging
@@ -309,15 +310,15 @@ def deploy_mlflow_to_minikube(manifest_dir: Path, image_name: Optional[str] = No
         except (subprocess.TimeoutExpired, subprocess.CalledProcessError):
             # Fallback: get NodePort manually (more reliable)
             typer.echo("   Getting NodePort...")
-            result = subprocess.run(
-                ["kubectl", "get", "svc", "mlflow-service", "-o", "jsonpath='{.spec.ports[0].nodePort}'"] + ns,
+            result = run_tool(
+                "kubectl", ["get", "svc", "mlflow-service", "-o", "jsonpath='{.spec.ports[0].nodePort}'"] + ns,
                 capture_output=True,
                 text=True
             )
             if result.returncode == 0:
                 node_port = result.stdout.strip().strip("'")
-                minikube_ip_result = subprocess.run(
-                    ["minikube", "ip"],
+                minikube_ip_result = run_tool(
+                    "minikube", ["ip"],
                     capture_output=True,
                     text=True
                 )
@@ -330,8 +331,8 @@ def deploy_mlflow_to_minikube(manifest_dir: Path, image_name: Optional[str] = No
                 typer.echo("   Could not determine service URL. Check with: kubectl get svc mlflow-service")
 
         typer.echo("\n Deployment status:")
-        subprocess.run(["kubectl", "get", "pods", "-l", "app=mlflow"] + ns)
-        subprocess.run(["kubectl", "get", "svc", "-l", "app=mlflow"] + ns)
+        run_tool("kubectl", ["get", "pods", "-l", "app=mlflow"] + ns)
+        run_tool("kubectl", ["get", "svc", "-l", "app=mlflow"] + ns)
         
         return True
         
@@ -387,8 +388,8 @@ def deploy_fastapi_to_minikube(manifest_dir: Path, image_name: Optional[str] = N
 
     try:
         typer.echo(f"   Applying {deployment_file.name}...")
-        result = subprocess.run(
-            ["kubectl", "apply", "-f", str(deployment_file)] + ns,
+        result = run_tool(
+            "kubectl", ["apply", "-f", str(deployment_file)] + ns,
             check=True,
             capture_output=True,
             text=True
@@ -397,8 +398,8 @@ def deploy_fastapi_to_minikube(manifest_dir: Path, image_name: Optional[str] = N
 
         # Apply service
         typer.echo(f"   Applying {service_file.name}...")
-        result = subprocess.run(
-            ["kubectl", "apply", "-f", str(service_file)] + ns,
+        result = run_tool(
+            "kubectl", ["apply", "-f", str(service_file)] + ns,
             check=True,
             capture_output=True,
             text=True
@@ -410,8 +411,8 @@ def deploy_fastapi_to_minikube(manifest_dir: Path, image_name: Optional[str] = N
 
         # Try minikube service --url with timeout (can hang)
         try:
-            result = subprocess.run(
-                ["minikube", "service", "fastapi-service", "--url"] + ns,
+            result = run_tool(
+                "minikube", ["service", "fastapi-service", "--url"] + ns,
                 capture_output=True,
                 text=True,
                 timeout=5  # 5 second timeout to prevent hanging
@@ -425,15 +426,15 @@ def deploy_fastapi_to_minikube(manifest_dir: Path, image_name: Optional[str] = N
         except (subprocess.TimeoutExpired, subprocess.CalledProcessError):
             # Fallback: get NodePort manually (more reliable)
             typer.echo("   Getting NodePort...")
-            result = subprocess.run(
-                ["kubectl", "get", "svc", "fastapi-service", "-o", "jsonpath='{.spec.ports[0].nodePort}'"] + ns,
+            result = run_tool(
+                "kubectl", ["get", "svc", "fastapi-service", "-o", "jsonpath='{.spec.ports[0].nodePort}'"] + ns,
                 capture_output=True,
                 text=True
             )
             if result.returncode == 0:
                 node_port = result.stdout.strip().strip("'")
-                minikube_ip_result = subprocess.run(
-                    ["minikube", "ip"],
+                minikube_ip_result = run_tool(
+                    "minikube", ["ip"],
                     capture_output=True,
                     text=True
                 )
@@ -446,8 +447,8 @@ def deploy_fastapi_to_minikube(manifest_dir: Path, image_name: Optional[str] = N
                 typer.echo("   Could not determine service URL. Check with: kubectl get svc fastapi-service")
 
         typer.echo("\n Deployment status:")
-        subprocess.run(["kubectl", "get", "pods", "-l", "app=fastapi"] + ns)
-        subprocess.run(["kubectl", "get", "svc", "-l", "app=fastapi"] + ns)
+        run_tool("kubectl", ["get", "pods", "-l", "app=fastapi"] + ns)
+        run_tool("kubectl", ["get", "svc", "-l", "app=fastapi"] + ns)
         
         return True
         
@@ -470,8 +471,8 @@ def load_image_to_minikube(image_name: str) -> bool:
         True if image was loaded or already exists, False otherwise
     """
     # Check if image exists locally
-    result = subprocess.run(
-        ["docker", "images", "-q", image_name],
+    result = run_tool(
+        "docker", ["images", "-q", image_name],
         capture_output=True,
         text=True
     )
@@ -482,8 +483,8 @@ def load_image_to_minikube(image_name: str) -> bool:
         return False
     
     # Check if image is already in minikube
-    result = subprocess.run(
-        ["minikube", "image", "ls"],
+    result = run_tool(
+        "minikube", ["image", "ls"],
         capture_output=True,
         text=True
     )
@@ -494,8 +495,8 @@ def load_image_to_minikube(image_name: str) -> bool:
     
     # Load image into minikube
     typer.echo(f"📦 Loading image '{image_name}' into minikube...")
-    result = subprocess.run(
-        ["minikube", "image", "load", image_name],
+    result = run_tool(
+        "minikube", ["image", "load", image_name],
         check=True,
         capture_output=True,
         text=True
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index 6b2ffb7..fcc2f41 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -187,19 +187,19 @@ def test_gcp_preflight_exits_when_adc_missing(mock_auth, mock_adc):
 
 # ---------- check_gcp_adc ----------
 
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_check_gcp_adc_returncode_zero_true(mock_run):
     mock_run.return_value = MagicMock(returncode=0)
     assert check_gcp_adc() is True
 
 
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_check_gcp_adc_returncode_nonzero_false(mock_run):
     mock_run.return_value = MagicMock(returncode=1)
     assert check_gcp_adc() is False
 
 
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_check_gcp_adc_exception_returns_false(mock_run):
     mock_run.side_effect = OSError("boom")
     assert check_gcp_adc() is False
@@ -214,7 +214,7 @@ def test_check_bq_binary_missing_false(mock_which):
 
 
 @patch("deployml.utils.helpers.shutil.which")
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_check_bq_binary_present_and_runs(mock_run, mock_which):
     mock_which.return_value = "/usr/bin/bq"
     mock_run.return_value = MagicMock(returncode=0)
@@ -222,7 +222,7 @@ def test_check_bq_binary_present_and_runs(mock_run, mock_which):
 
 
 @patch("deployml.utils.helpers.shutil.which")
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_check_bq_binary_present_but_errors(mock_run, mock_which):
     mock_which.return_value = "/usr/bin/bq"
     mock_run.return_value = MagicMock(returncode=2)
@@ -238,7 +238,7 @@ def test_get_terraform_version_binary_missing(mock_which):
 
 
 @patch("deployml.utils.helpers.shutil.which")
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_get_terraform_version_parses_json(mock_run, mock_which):
     mock_which.return_value = "/usr/bin/terraform"
     mock_run.return_value = MagicMock(
@@ -249,7 +249,7 @@ def test_get_terraform_version_parses_json(mock_run, mock_which):
 
 
 @patch("deployml.utils.helpers.shutil.which")
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_get_terraform_version_bad_json_returns_none(mock_run, mock_which):
     mock_which.return_value = "/usr/bin/terraform"
     mock_run.return_value = MagicMock(returncode=0, stdout="not json")
@@ -258,19 +258,19 @@ def test_get_terraform_version_bad_json_returns_none(mock_run, mock_which):
 
 # ---------- validate_gcp_project ----------
 
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_validate_gcp_project_exists(mock_run):
     mock_run.return_value = MagicMock(returncode=0, stdout="my-project\n")
     assert validate_gcp_project("my-project") is True
 
 
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_validate_gcp_project_missing(mock_run):
     mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="not found")
     assert validate_gcp_project("ghost") is False
 
 
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_validate_gcp_project_stdout_mismatch(mock_run):
     # returncode 0 but stdout does not match the project id we asked for
     mock_run.return_value = MagicMock(returncode=0, stdout="other-project\n")
@@ -279,7 +279,7 @@ def test_validate_gcp_project_stdout_mismatch(mock_run):
 
 # ---------- validate_gcp_region ----------
 
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_validate_gcp_region_in_list(mock_run):
     helpers_mod._GCP_REGIONS_CACHE = None
     mock_run.return_value = MagicMock(
@@ -289,7 +289,7 @@ def test_validate_gcp_region_in_list(mock_run):
     assert validate_gcp_region("us-west1") is True
 
 
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_validate_gcp_region_not_in_list(mock_run):
     helpers_mod._GCP_REGIONS_CACHE = None
     mock_run.return_value = MagicMock(
@@ -299,7 +299,7 @@ def test_validate_gcp_region_not_in_list(mock_run):
     assert validate_gcp_region("mars-central1") is False
 
 
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_validate_gcp_region_lookup_failure_does_not_block(mock_run):
     helpers_mod._GCP_REGIONS_CACHE = None
     mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="oops")
@@ -315,7 +315,7 @@ def test_check_docker_daemon_binary_missing(mock_which):
 
 
 @patch("deployml.utils.helpers.shutil.which")
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_check_docker_daemon_up(mock_run, mock_which):
     mock_which.return_value = "/usr/local/bin/docker"
     mock_run.return_value = MagicMock(returncode=0)
@@ -323,7 +323,7 @@ def test_check_docker_daemon_up(mock_run, mock_which):
 
 
 @patch("deployml.utils.helpers.shutil.which")
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_check_docker_daemon_down(mock_run, mock_which):
     mock_which.return_value = "/usr/local/bin/docker"
     mock_run.return_value = MagicMock(returncode=1, stderr="cannot connect")
@@ -332,7 +332,7 @@ def test_check_docker_daemon_down(mock_run, mock_which):
 
 # ---------- get_missing_iam_roles ----------
 
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_get_missing_iam_roles_owner_short_circuits(mock_run):
     mock_run.side_effect = [
         MagicMock(returncode=0, stdout="me@example.com\n"),
@@ -345,7 +345,7 @@ def test_get_missing_iam_roles_owner_short_circuits(mock_run):
     assert get_missing_iam_roles("proj", required) == []
 
 
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_get_missing_iam_roles_some_missing(mock_run):
     mock_run.side_effect = [
         MagicMock(returncode=0, stdout="me@example.com\n"),
@@ -358,13 +358,13 @@ def test_get_missing_iam_roles_some_missing(mock_run):
     assert get_missing_iam_roles("proj", required) == ["roles/run.admin"]
 
 
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_get_missing_iam_roles_account_lookup_fails_returns_all(mock_run):
     mock_run.return_value = MagicMock(returncode=0, stdout="")
     assert get_missing_iam_roles("proj", ["roles/run.admin"]) == ["roles/run.admin"]
 
 
-@patch("deployml.utils.helpers.subprocess.run")
+@patch("deployml.utils.helpers.run_tool")
 def test_get_missing_iam_roles_policy_query_fails_returns_all(mock_run):
     mock_run.side_effect = [
         MagicMock(returncode=0, stdout="me@example.com\n"),

From d5f8ed38b142b365479ae51a18a360bc93188da0 Mon Sep 17 00:00:00 2001
From: jivanb7 <bal.jivan1@gmail.com>
Date: Sun, 31 May 2026 15:19:18 -0700
Subject: [PATCH 18/31] force UTF-8 console output at CLI entry (blocker 2)

The default Windows console code page is cp1252, so emoji and box glyphs in CLI output raise UnicodeEncodeError and crash the command. Call configure_console_encoding first thing in main(), before cli(), to reconfigure stdout and stderr to UTF-8 with errors=replace. Proven: on a cp1252 stream the lightbulb glyph raised UnicodeEncodeError, and after the call stdout is UTF-8 and the glyph prints. deployml doctor now runs to completion with exit code 0 and no UnicodeEncodeError.
---
 src/deployml/cli/cli.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/deployml/cli/cli.py b/src/deployml/cli/cli.py
index bb73645..78821c3 100644
--- a/src/deployml/cli/cli.py
+++ b/src/deployml/cli/cli.py
@@ -46,7 +46,7 @@
     run_terraform_with_loading_bar,
     _create_docker_folder,
 )
-from deployml.utils.platform_compat import run_tool, resolve_tool
+from deployml.utils.platform_compat import run_tool, resolve_tool, configure_console_encoding
 from deployml.utils.infracost import (
     check_infracost_available,
     run_infracost_analysis,
@@ -3086,6 +3086,10 @@ def main():
     """
     Entry point for the DeployML CLI.
     """
+    # Force UTF-8 on the Windows console first so any emoji or box glyph in command
+    # output cannot raise UnicodeEncodeError on a legacy cp1252 console. No-op off
+    # Windows.
+    configure_console_encoding()
     cli()
 
 

From a875839800c360df34b3edd65766fdbf11297d80 Mon Sep 17 00:00:00 2001
From: jivanb7 <bal.jivan1@gmail.com>
Date: Sun, 31 May 2026 15:23:29 -0700
Subject: [PATCH 19/31] run Cloud SQL readiness provisioner under bash (blocker
 3)

The local-exec readiness script is written for a Unix shell and relies on bash-specific features: brace expansion {1..30} and the &> redirect, alongside set +e, command -v, POSIX test brackets, and sleep. On Windows the default local-exec interpreter is cmd.exe, which cannot parse this and fails the provisioner, killing the deploy. Add interpreter = ["bash", "-c"] so it always runs under bash, provided by Git for Windows or WSL. This is also a portability improvement on Ubuntu, whose /bin/sh is dash and does not expand {1..30}. The script self-guards with command -v gcloud and always exits 0, so it degrades gracefully regardless of which bash resolves.
---
 .../terraform/modules/cloud_sql_postgres/main.tf       | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/deployml/terraform/modules/cloud_sql_postgres/main.tf b/src/deployml/terraform/modules/cloud_sql_postgres/main.tf
index 6dd63db..32fd965 100644
--- a/src/deployml/terraform/modules/cloud_sql_postgres/main.tf
+++ b/src/deployml/terraform/modules/cloud_sql_postgres/main.tf
@@ -36,7 +36,15 @@ resource "null_resource" "verify_instance_running" {
   depends_on = [google_sql_database_instance.postgres]
   
   provisioner "local-exec" {
-    command = <<-EOT
+    # Run under bash explicitly. This script uses bash only syntax, set +e, brace
+    # expansion {1..30}, command -v, POSIX test brackets, sleep. On Windows the
+    # default local-exec shell is cmd.exe, which cannot parse any of it, so the
+    # provisioner would fail. bash is provided by Git for Windows or WSL. This is
+    # also a portability win on Ubuntu, where /bin/sh is dash and does not expand
+    # {1..30}. The script self-guards with command -v gcloud and always exits 0, so
+    # it degrades gracefully and never fails the deploy regardless of which bash.
+    interpreter = ["bash", "-c"]
+    command     = <<-EOT
       set +e
       echo "Checking Cloud SQL instance status..."
       if ! command -v gcloud &> /dev/null; then

From 61aa842cf80e433bde39c35794709fc1577db181 Mon Sep 17 00:00:00 2001
From: jivanb7 <bal.jivan1@gmail.com>
Date: Sun, 31 May 2026 15:24:56 -0700
Subject: [PATCH 20/31] use robust_rmtree for destroy workspace cleanup
 (blocker 4)

On Windows, shutil.rmtree of the .deployml workspace can fail with PermissionError when a file is marked read-only or briefly locked, for example by a sync client. Route the destroy workspace cleanup through robust_rmtree from platform_compat, which clears the read-only bit and retries. Removed the now-unused import shutil; this was its only use in cli.py. The Terraform module copy rmtree calls in helpers.py are left as-is per the brief, to be revisited only if they fail.
---
 src/deployml/cli/cli.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/deployml/cli/cli.py b/src/deployml/cli/cli.py
index 78821c3..33ab00a 100644
--- a/src/deployml/cli/cli.py
+++ b/src/deployml/cli/cli.py
@@ -1,7 +1,6 @@
 import sys
 import yaml
 import typer
-import shutil
 import re
 import importlib.resources as pkg_resources
 from deployml.utils.banner import display_banner
@@ -46,7 +45,7 @@
     run_terraform_with_loading_bar,
     _create_docker_folder,
 )
-from deployml.utils.platform_compat import run_tool, resolve_tool, configure_console_encoding
+from deployml.utils.platform_compat import run_tool, resolve_tool, configure_console_encoding, robust_rmtree
 from deployml.utils.infracost import (
     check_infracost_available,
     run_infracost_analysis,
@@ -1896,7 +1895,7 @@ def destroy(
 
             if clean_workspace:
                 typer.echo(" Cleaning workspace...")
-                shutil.rmtree(DEPLOYML_DIR)
+                robust_rmtree(DEPLOYML_DIR)
                 typer.echo(" Workspace cleaned")
             elif yes or typer.confirm("Clean up Terraform state files?"):
                 # --yes propagates to the cleanup confirm so scripted runs do not hang

From 5c1f872076ea36eea08423d21fc78bb77a353e10 Mon Sep 17 00:00:00 2001
From: jivanb7 <bal.jivan1@gmail.com>
Date: Sun, 31 May 2026 15:27:13 -0700
Subject: [PATCH 21/31] warn when gke-gcloud-auth-plugin is missing (blocker 5)

kubectl cannot authenticate to GKE without gke-gcloud-auth-plugin, and on Windows the gcloud SDK bin directory holding it is not always on the PATH a subprocess inherits. connect_to_gke_cluster now checks shutil.which for the plugin after connecting and, if absent, prints an actionable message: how to install it and, on Windows, the SDK bin directory to add to PATH. This replaces the cryptic kubectl executable-not-found failure with a clear fix, covering every GKE flow since deploy and destroy both connect through this function.
---
 src/deployml/utils/kubernetes_gke.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/deployml/utils/kubernetes_gke.py b/src/deployml/utils/kubernetes_gke.py
index 54fc0a9..3efdab9 100644
--- a/src/deployml/utils/kubernetes_gke.py
+++ b/src/deployml/utils/kubernetes_gke.py
@@ -1,3 +1,5 @@
+import os
+import shutil
 import subprocess
 import typer
 from pathlib import Path
@@ -40,6 +42,27 @@ def check_gke_cluster_connection(cluster_name: str, zone: Optional[str] = None,
         return False
 
 
+def warn_if_gke_auth_plugin_missing() -> None:
+    """kubectl needs gke-gcloud-auth-plugin to authenticate to GKE. The gcloud SDK
+    installs it into the SDK bin directory, which is not always on the PATH that a
+    subprocess inherits, especially on Windows. Warn with an actionable hint up
+    front instead of letting kubectl fail later with a cryptic
+    "executable gke-gcloud-auth-plugin not found"."""
+    if shutil.which("gke-gcloud-auth-plugin"):
+        return
+    typer.echo(
+        "WARNING: gke-gcloud-auth-plugin was not found on PATH. kubectl cannot "
+        "authenticate to GKE without it."
+    )
+    typer.echo("   Install it: gcloud components install gke-gcloud-auth-plugin")
+    if os.name == "nt":
+        typer.echo(
+            "   Then add the gcloud SDK bin directory to PATH, typically "
+            "%LOCALAPPDATA%\\Google\\Cloud SDK\\google-cloud-sdk\\bin or "
+            "C:\\Program Files (x86)\\Google\\Cloud SDK\\google-cloud-sdk\\bin."
+        )
+
+
 def connect_to_gke_cluster(
     project_id: str,
     cluster_name: str,
@@ -75,6 +98,8 @@ def connect_to_gke_cluster(
             text=True
         )
         typer.echo(f"Connected to cluster: {cluster_name}")
+        # kubectl will now need the GKE auth plugin; warn early if it is missing.
+        warn_if_gke_auth_plugin_missing()
         return True
     except subprocess.CalledProcessError as e:
         typer.echo(f"Failed to connect to cluster: {e.stderr}")

From 6bb37841c1f932b6853075d2c247aae59bf322d5 Mon Sep 17 00:00:00 2001
From: jivanb7 <bal.jivan1@gmail.com>
Date: Sun, 31 May 2026 15:30:30 -0700
Subject: [PATCH 22/31] docs: native Windows setup and platform notes

Expand the installation platform notes for native Windows: the required toolchain, using the py launcher to build the venv instead of the Microsoft Store python stub, Git for Windows as a hard requirement for the bash that the Cloud SQL readiness step needs, keeping the working directory off OneDrive to avoid workspace cleanup PermissionErrors, and the gcloud .cmd and PowerShell execution-policy note. The GKE auth plugin PATH note for Windows already exists in the Cloud Run tutorial. mkdocs build --strict passes.
---
 docs/installation.md | 46 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/docs/installation.md b/docs/installation.md
index cdcc94f..1a0288e 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -57,11 +57,45 @@ The doctor checks tool versions, authentication, ADC, the `bq` CLI, enabled APIs
 
 ## Platform notes
 
-deployml is tested on macOS and Linux. Windows users can run it with these caveats:
-
-- The auth commands above and all `deployml` CLI calls work the same in PowerShell, cmd, and WSL.
-- `export PATH=...` examples in the tutorials are bash. On PowerShell use `$env:PATH = "..." + $env:PATH`. On cmd use `set PATH=...;%PATH%`.
-- Docker Desktop on Windows uses the WSL2 backend by default. `deployml build-images` against Cloud Build does not need a local Docker daemon, so the safest path is to skip local builds and let Cloud Build do the work.
-- If you clone the repo on Windows, the included `.gitattributes` forces shell scripts and Dockerfiles to LF line endings. Without this, `docker build` would fail inside containers with `exec format error`.
+deployml runs on macOS, Linux, and native Windows. The CLI commands are identical
+across all three. The engine detects the operating system and adapts underneath, so
+you type the same `deployml` commands everywhere.
+
+### Windows
+
+deployml works on native Windows in PowerShell or cmd. A few setup notes keep it
+smooth and let it work out of the box:
+
+- Toolchain. Install native Windows builds of Python 3.11 or newer, Git for
+  Windows, the gcloud SDK, Terraform, and Docker Desktop. For the Kubernetes paths
+  also install minikube and run `gcloud components install gke-gcloud-auth-plugin`.
+  `deployml doctor` checks the core tools.
+- Python. Install from python.org, then create the virtual environment with the
+  launcher, `py -3.11 -m venv .venv`. A bare `python` on a fresh Windows often
+  resolves to the Microsoft Store stub, which is not a usable interpreter.
+- Git for Windows is required, not optional. It provides the `bash` that the Cloud
+  SQL readiness step runs under during `deployml deploy`. Confirm `bash --version`
+  resolves before you deploy.
+- Keep the project and its working directory off OneDrive. OneDrive holds file
+  handles open and can make workspace cleanup on `deployml destroy` fail with a
+  PermissionError. A path such as `C:\dev\your-project` avoids this.
+- gcloud, bq, and gsutil ship as `.cmd` wrappers on Windows. deployml resolves and
+  invokes them correctly for you. If you run gcloud yourself in PowerShell and see
+  "running scripts is disabled", call `gcloud.cmd` instead of `gcloud`, or run it
+  from cmd.
+
+### Path syntax across shells
+
+- The `export PATH=...` examples in the tutorials are bash. In PowerShell use
+  `$env:PATH = "...;" + $env:PATH`. In cmd use `set PATH=...;%PATH%`.
+
+### Docker and line endings
+
+- Docker Desktop on Windows uses the WSL2 backend by default. The Cloud Run path
+  builds images with Cloud Build and does not need a local Docker daemon, so for
+  Cloud Run you can skip local builds. Docker is needed only for the minikube path.
+- If you clone the repo on Windows, the included `.gitattributes` forces shell
+  scripts and Dockerfiles to LF line endings. Without this, `docker build` would
+  fail inside containers with `exec format error`.
 
 - [Get Started →](tutorials/overview.md)

From 1407ad0ef79723e4fe56fbb56f9473dcc411af47 Mon Sep 17 00:00:00 2001
From: jivanb7 <bal.jivan1@gmail.com>
Date: Sun, 31 May 2026 16:04:23 -0700
Subject: [PATCH 23/31] prefer Git bash over WSL bash for terraform local-exec
 (blocker 3)

The interpreter = ["bash", "-c"] fix is not enough on a Windows host that also has WSL: terraform resolves bare bash to C:\Windows\System32\bash.exe, the WSL launcher, which re-translates the Windows command line and strips embedded quotes. The gcloud --format="value(state)" argument in the Cloud SQL readiness script then reaches bash as an unquoted value(state) and fails with a syntax error on the parenthesis, killing the apply. Proven with subprocess: WSL bash errors on the quoted parens, Git bash returns value(state) cleanly. Fix in the engine: find_windows_bash locates a real Git for Windows bash and terraform_env prepends its directory to PATH for the terraform apply subprocess, so the bare bash interpreter resolves to Git bash. terraform_env returns None off Windows, so macOS and Linux are unchanged. Caught by the live W9 Cloud Run deploy.
---
 src/deployml/utils/helpers.py         | 13 +++++--
 src/deployml/utils/platform_compat.py | 55 +++++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/src/deployml/utils/helpers.py b/src/deployml/utils/helpers.py
index c43b4ab..be6a228 100644
--- a/src/deployml/utils/helpers.py
+++ b/src/deployml/utils/helpers.py
@@ -8,7 +8,7 @@
 import random
 import string
 from deployml.utils.constants import ANIMAL_NAMES, FALLBACK_WORDS, TERRAFORM_DIR
-from deployml.utils.platform_compat import run_tool, resolve_tool
+from deployml.utils.platform_compat import run_tool, resolve_tool, terraform_env
 import subprocess
 import time
 from rich.progress import (
@@ -473,6 +473,12 @@ def run_terraform_with_loading_bar(cmd, cwd, estimated_minutes, stack=None, verb
     # resolving keeps this robust if the front tool ever changes.
     cmd = [resolve_tool(cmd[0]), *cmd[1:]]
 
+    # On Windows, ensure terraform's local-exec bash interpreter resolves to Git
+    # bash, not the WSL launcher in System32 which mangles quoting and breaks the
+    # Cloud SQL readiness provisioner. None off Windows, so behavior is unchanged
+    # on macOS and Linux.
+    tf_env = terraform_env()
+
     # Default messages if stack is not provided
     default_msgs = [
         "DeployML: Preparing your cloud environment...",
@@ -501,7 +507,8 @@ def run_terraform_with_loading_bar(cmd, cwd, estimated_minutes, stack=None, verb
     if verbose:
         with open(log_file, "w") as f:
             process = subprocess.Popen(
-                cmd, cwd=cwd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
+                cmd, cwd=cwd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True,
+                env=tf_env,
             )
             for line in iter(process.stdout.readline, ""):
                 print(line, end="", flush=True)
@@ -522,7 +529,7 @@ def run_terraform_with_loading_bar(cmd, cwd, estimated_minutes, stack=None, verb
         f = open(log_file, "w")
         try:
             process = subprocess.Popen(
-                cmd, cwd=cwd, stdout=f, stderr=subprocess.STDOUT
+                cmd, cwd=cwd, stdout=f, stderr=subprocess.STDOUT, env=tf_env
             )
             start_time = time.time()
             estimated_seconds = estimated_minutes * 60
diff --git a/src/deployml/utils/platform_compat.py b/src/deployml/utils/platform_compat.py
index 96a425c..67ce13a 100644
--- a/src/deployml/utils/platform_compat.py
+++ b/src/deployml/utils/platform_compat.py
@@ -82,6 +82,61 @@ def run_tool(name: str, args: list, **kwargs) -> subprocess.CompletedProcess:
         raise
 
 
+def find_windows_bash() -> "str | None":
+    """Absolute path to a real Windows bash (Git for Windows), or None.
+
+    On Windows the bash found first on PATH is often C:\\Windows\\System32\\bash.exe,
+    the WSL launcher. When a Windows process such as terraform.exe invokes it, the
+    WSL launcher re-translates the command line and strips embedded quoting, which
+    breaks Terraform local-exec scripts, for example gcloud --format="value(state)"
+    becomes an unquoted value(state) and bash errors on the parenthesis. Git for
+    Windows ships a normal bash that receives arguments unchanged, so prefer it.
+    Returns None off Windows or if no Git bash is found.
+    """
+    if not IS_WINDOWS:
+        return None
+    candidates = []
+    try:
+        # resolve_tool rejects the extensionless System32\git stub and returns the
+        # real git.exe, for example C:\Program Files\Git\cmd\git.exe.
+        git = resolve_tool("git")
+        git_root = os.path.dirname(os.path.dirname(git))  # ...\Git\cmd -> ...\Git
+        candidates.append(os.path.join(git_root, "bin", "bash.exe"))
+        candidates.append(os.path.join(git_root, "usr", "bin", "bash.exe"))
+    except FileNotFoundError:
+        pass
+    program_files = os.environ.get("ProgramFiles", r"C:\Program Files")
+    program_files_x86 = os.environ.get("ProgramFiles(x86)", r"C:\Program Files (x86)")
+    local_appdata = os.environ.get("LOCALAPPDATA", "")
+    for base in (program_files, program_files_x86):
+        candidates.append(os.path.join(base, "Git", "bin", "bash.exe"))
+    if local_appdata:
+        candidates.append(os.path.join(local_appdata, "Programs", "Git", "bin", "bash.exe"))
+    for path in candidates:
+        if path and os.path.isfile(path):
+            return path
+    return None
+
+
+def terraform_env() -> "dict | None":
+    """Environment for running terraform so its local-exec provisioners resolve a
+    real Windows bash instead of the WSL launcher.
+
+    Returns None to mean "inherit the current environment unchanged", off Windows
+    or when no Git bash is found. On Windows with Git bash present, returns a copy
+    of the environment with the Git bash directory prepended to PATH, so terraform's
+    bare "bash" interpreter resolves there first, ahead of the WSL launcher in
+    System32.
+    """
+    bash = find_windows_bash()
+    if not bash:
+        return None
+    env = dict(os.environ)
+    bash_dir = os.path.dirname(bash)
+    env["PATH"] = bash_dir + os.pathsep + env.get("PATH", "")
+    return env
+
+
 def configure_console_encoding() -> None:
     """Force UTF-8 on Windows stdout and stderr so non ASCII output never crashes.
 

From 26a3d1848c17303362855cdad0d5a9708b1cef7a Mon Sep 17 00:00:00 2001
From: jivanb7 <bal.jivan1@gmail.com>
Date: Sun, 31 May 2026 17:32:41 -0700
Subject: [PATCH 24/31] decode subprocess output as UTF-8 on Windows (blocker 2
 read side)

configure_console_encoding fixed the write side, but captured subprocess output was still decoded with the legacy cp1252 code page, which raised UnicodeDecodeError on bytes invalid in cp1252, for example 0x9d emitted by minikube. run_tool now sets encoding=utf-8, errors=replace when the caller requests text mode on Windows, and the terraform streaming Popen and its log file in helpers use the same. Caught live by minikube-deploy, where minikube service --url output crashed a subprocess reader thread; after the fix minikube-deploy runs clean with no traceback. run_tool is unchanged off Windows, where UTF-8 is already the default; the helpers change is unconditional.
---
 src/deployml/utils/helpers.py         | 5 +++--
 src/deployml/utils/platform_compat.py | 9 +++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/deployml/utils/helpers.py b/src/deployml/utils/helpers.py
index be6a228..de0b51f 100644
--- a/src/deployml/utils/helpers.py
+++ b/src/deployml/utils/helpers.py
@@ -505,9 +505,10 @@ def run_terraform_with_loading_bar(cmd, cwd, estimated_minutes, stack=None, verb
     log_file = cwd / "terraform_apply.log"
 
     if verbose:
-        with open(log_file, "w") as f:
+        with open(log_file, "w", encoding="utf-8", errors="replace") as f:
             process = subprocess.Popen(
                 cmd, cwd=cwd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True,
+                encoding="utf-8", errors="replace",
                 env=tf_env,
             )
             for line in iter(process.stdout.readline, ""):
@@ -526,7 +527,7 @@ def run_terraform_with_loading_bar(cmd, cwd, estimated_minutes, stack=None, verb
         task = progress.add_task(resource_msgs[0], total=100)
 
         # Open log file and keep it open until process completes
-        f = open(log_file, "w")
+        f = open(log_file, "w", encoding="utf-8", errors="replace")
         try:
             process = subprocess.Popen(
                 cmd, cwd=cwd, stdout=f, stderr=subprocess.STDOUT, env=tf_env
diff --git a/src/deployml/utils/platform_compat.py b/src/deployml/utils/platform_compat.py
index 67ce13a..8feddb1 100644
--- a/src/deployml/utils/platform_compat.py
+++ b/src/deployml/utils/platform_compat.py
@@ -69,6 +69,15 @@ def run_tool(name: str, args: list, **kwargs) -> subprocess.CompletedProcess:
     as a direct subprocess.run call would.
     """
     resolved = resolve_tool(name)
+    # On Windows, when the caller captures output as text, subprocess decodes the
+    # child's bytes with the legacy cp1252 code page by default. Tools like minikube
+    # emit bytes that are invalid in cp1252, for example 0x9d, which raises
+    # UnicodeDecodeError. Decode as UTF-8 with replacement instead, the read side
+    # companion to configure_console_encoding. Only when no explicit encoding was
+    # requested, so callers keep full control.
+    if IS_WINDOWS and (kwargs.get("text") or kwargs.get("universal_newlines")):
+        kwargs.setdefault("encoding", "utf-8")
+        kwargs.setdefault("errors", "replace")
     try:
         return subprocess.run([resolved, *args], **kwargs)
     except OSError:

From 8099dec670706b21a3762d300a25993be8100ff4 Mon Sep 17 00:00:00 2001
From: jivanb7 <bal.jivan1@gmail.com>
Date: Sun, 31 May 2026 17:56:46 -0700
Subject: [PATCH 25/31] docs: minikube tunnel, memory, and gcloud component
 notes for Windows

Document the Windows minikube nuances found during live validation: the Docker Desktop driver service URL is not reachable from the host without minikube tunnel, minikube service --url, or kubectl port-forward; MLflow on minikube needs at least 4 GB which may not fit on an 8 GB machine; and gcloud components install may need CLOUDSDK_PYTHON via copy-bundled-python in a non interactive shell.
---
 docs/installation.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/docs/installation.md b/docs/installation.md
index 1a0288e..0fba6e7 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -83,6 +83,17 @@ smooth and let it work out of the box:
   invokes them correctly for you. If you run gcloud yourself in PowerShell and see
   "running scripts is disabled", call `gcloud.cmd` instead of `gcloud`, or run it
   from cmd.
+- minikube on Windows uses the Docker Desktop driver. The service URL deployml
+  prints sits on minikube's internal network and is not reachable from the Windows
+  host directly. Reach it with `minikube tunnel`, `minikube service <name> --url`,
+  or `kubectl port-forward svc/<name> <local>:<port>`. MLflow on minikube wants at
+  least 4 GB, so start with `minikube start --memory=4096 --cpus=2` on a machine
+  that can spare it; an 8 GB machine that is also running Docker Desktop and other
+  apps may not have room for the MLflow pod.
+- Installing a gcloud component such as the GKE auth plugin with
+  `gcloud components install` may, in a non-interactive shell, ask you to set
+  `CLOUDSDK_PYTHON` first; run `gcloud components copy-bundled-python` and set
+  the variable to the path it prints, or run the install from an interactive prompt.
 
 ### Path syntax across shells
 

From 7496629403dd7f86a7fc84310fa60775f17916a6 Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sun, 31 May 2026 21:34:07 -0700
Subject: [PATCH 26/31] remove duplicate subprocess and shutil imports in
 helpers

subprocess was imported twice and shutil was re-imported locally in cleanup_terraform_files; both already exist at module top. No behavior change.
---
 src/deployml/utils/helpers.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/deployml/utils/helpers.py b/src/deployml/utils/helpers.py
index de0b51f..6c5d69a 100644
--- a/src/deployml/utils/helpers.py
+++ b/src/deployml/utils/helpers.py
@@ -9,7 +9,6 @@
 import string
 from deployml.utils.constants import ANIMAL_NAMES, FALLBACK_WORDS, TERRAFORM_DIR
 from deployml.utils.platform_compat import run_tool, resolve_tool, terraform_env
-import subprocess
 import time
 from rich.progress import (
     Progress,
@@ -435,8 +434,6 @@ def cleanup_terraform_files(terraform_dir: Path):
     """
     Clean up Terraform state and lock files from the specified directory.
     """
-    import shutil
-
     cleanup_files = [
         ".terraform",
         "terraform.tfstate",

From 2cc4c30127f3c425bb79fa67b734da4580bde358 Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sun, 31 May 2026 21:34:07 -0700
Subject: [PATCH 27/31] skip docker permission check in doctor when docker is
 absent

Previously the check went straight to run_tool('docker', ['ps']); with docker missing, resolve_tool raised FileNotFoundError, which was caught and reported as a misleading FAIL on top of the real Docker-not-found FAIL. Guard with shutil.which and emit a clean SKIP, matching the other checks.
---
 src/deployml/diagnostics/doctor.py | 11 ++++++++++
 tests/test_doctor.py               | 33 ++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 tests/test_doctor.py

diff --git a/src/deployml/diagnostics/doctor.py b/src/deployml/diagnostics/doctor.py
index f06ba60..e04196c 100644
--- a/src/deployml/diagnostics/doctor.py
+++ b/src/deployml/diagnostics/doctor.py
@@ -393,6 +393,17 @@ def _check_infracost(self):
     
     def _check_docker_permissions(self):
         """Check Docker permissions"""
+        if not shutil.which('docker'):
+            # _check_docker already reports the missing binary. Skip here rather
+            # than emit a misleading "cannot run docker" permission failure.
+            self._add_result(CheckResult(
+                name="Docker Permissions",
+                status=CheckStatus.SKIP,
+                message="Docker not installed, skipping permission check",
+                required=False
+            ))
+            return
+
         try:
             result = run_tool('docker', ['ps'], capture_output=True, text=True)
             if result.returncode == 0:
diff --git a/tests/test_doctor.py b/tests/test_doctor.py
new file mode 100644
index 0000000..6596ff3
--- /dev/null
+++ b/tests/test_doctor.py
@@ -0,0 +1,33 @@
+"""Unit tests for the deployml doctor checks. No real Docker daemon or GCP calls."""
+from types import SimpleNamespace
+from unittest.mock import patch
+
+import deployml.diagnostics.doctor as doctor_mod
+from deployml.diagnostics.doctor import DeployMLDoctor, CheckStatus
+
+
+def test_docker_permissions_skips_when_docker_missing():
+    """When docker is not on PATH, the permissions check should SKIP cleanly
+    rather than emit a misleading FAIL. _check_docker already reports the
+    missing binary, so a second 'cannot run docker' failure is noise."""
+    d = DeployMLDoctor()
+    d.results.clear()
+    with patch.object(doctor_mod.shutil, "which", return_value=None), \
+         patch.object(doctor_mod, "run_tool", side_effect=FileNotFoundError("docker")):
+        d._check_docker_permissions()
+    assert len(d.results) == 1
+    result = d.results[0]
+    assert result.name == "Docker Permissions"
+    assert result.status == CheckStatus.SKIP
+
+
+def test_docker_permissions_pass_when_docker_runs():
+    """Regression lock: when docker is present and `docker ps` succeeds, the
+    check still reports PASS through the new guard."""
+    d = DeployMLDoctor()
+    d.results.clear()
+    ok = SimpleNamespace(returncode=0, stdout="", stderr="")
+    with patch.object(doctor_mod.shutil, "which", return_value="/usr/local/bin/docker"), \
+         patch.object(doctor_mod, "run_tool", return_value=ok):
+        d._check_docker_permissions()
+    assert d.results[0].status == CheckStatus.PASS

From 51d690ce8efa70874ed92bbafa49d5b7f1bc480c Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sun, 31 May 2026 21:34:07 -0700
Subject: [PATCH 28/31] merge duplicate markdown_extensions key in mkdocs
 config

Two markdown_extensions blocks meant YAML kept only the second, silently dropping attr_list and pymdownx.emoji. Merge into one block so all extensions load. mkdocs build --strict passes.
---
 mkdocs.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mkdocs.yml b/mkdocs.yml
index 1c66f55..328ff83 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -43,8 +43,6 @@ markdown_extensions:
   - pymdownx.emoji:
       emoji_index: !!python/name:material.extensions.emoji.twemoji
       emoji_generator: !!python/name:material.extensions.emoji.to_svg
-
-markdown_extensions:
   - pymdownx.highlight:
       anchor_linenums: true
       line_spans: __span

From 877d04578c8201a92de028f4dafa99449c095450 Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sun, 31 May 2026 21:34:07 -0700
Subject: [PATCH 29/31] add unit tests for platform_compat

Cover resolve_tool resolve and raise, run_tool kwarg passthrough and the Windows utf-8 decode branch, robust_rmtree, and configure_console_encoding. Closes the gap where mocking run_tool wholesale skipped resolve_tool.
---
 tests/test_platform_compat.py | 103 ++++++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 tests/test_platform_compat.py

diff --git a/tests/test_platform_compat.py b/tests/test_platform_compat.py
new file mode 100644
index 0000000..9dee129
--- /dev/null
+++ b/tests/test_platform_compat.py
@@ -0,0 +1,103 @@
+"""Unit tests for deployml.utils.platform_compat.
+
+These close the coverage gap noted in the macOS re-validation: the helper test
+suite mocks run_tool wholesale, so resolve_tool's resolve/raise contract and
+run_tool's kwarg passthrough were never exercised. No real external tools are
+launched; the only real side effect is rmtree on a pytest tmp_path.
+"""
+import os
+import stat
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+import deployml.utils.platform_compat as pc
+
+
+# ---------- resolve_tool ----------
+
+def test_resolve_tool_returns_path_when_found():
+    with patch.object(pc.shutil, "which", return_value="/usr/local/bin/gcloud"):
+        assert pc.resolve_tool("gcloud") == "/usr/local/bin/gcloud"
+
+
+def test_resolve_tool_raises_filenotfound_when_missing():
+    with patch.object(pc.shutil, "which", return_value=None):
+        with pytest.raises(FileNotFoundError) as exc:
+            pc.resolve_tool("nonexistent-tool-xyz")
+    assert "nonexistent-tool-xyz" in str(exc.value)
+
+
+# ---------- run_tool ----------
+
+def test_run_tool_forwards_resolved_path_args_and_kwargs():
+    """run_tool must prepend the resolved path and pass every kwarg straight
+    through to subprocess.run. The whole blocker 1 refactor depends on this."""
+    completed = SimpleNamespace(returncode=0)
+    with patch.object(pc, "resolve_tool", return_value="/usr/local/bin/gcloud"), \
+         patch.object(pc.subprocess, "run", return_value=completed) as mock_run:
+        result = pc.run_tool("gcloud", ["version", "--quiet"],
+                             capture_output=True, check=True)
+    assert result is completed
+    mock_run.assert_called_once_with(
+        ["/usr/local/bin/gcloud", "version", "--quiet"],
+        capture_output=True, check=True,
+    )
+
+
+def test_run_tool_decodes_utf8_in_windows_text_mode():
+    """On Windows, text-mode captures must be decoded as UTF-8 with replacement
+    to avoid the cp1252 UnicodeDecodeError. Off Windows this branch is skipped."""
+    completed = SimpleNamespace(returncode=0)
+    with patch.object(pc, "IS_WINDOWS", True), \
+         patch.object(pc, "resolve_tool", return_value=r"C:\sdk\bin\gcloud.cmd"), \
+         patch.object(pc.subprocess, "run", return_value=completed) as mock_run:
+        pc.run_tool("gcloud", ["version"], text=True)
+    _, kwargs = mock_run.call_args
+    assert kwargs.get("encoding") == "utf-8"
+    assert kwargs.get("errors") == "replace"
+
+
+# ---------- robust_rmtree ----------
+
+def test_robust_rmtree_removes_directory_tree(tmp_path):
+    target = tmp_path / "ws"
+    (target / "sub").mkdir(parents=True)
+    (target / "sub" / "f.txt").write_text("data")
+    pc.robust_rmtree(str(target))
+    assert not target.exists()
+
+
+def test_robust_rmtree_is_noop_on_missing_path(tmp_path):
+    missing = tmp_path / "does-not-exist"
+    pc.robust_rmtree(str(missing))  # must not raise
+    assert not missing.exists()
+
+
+def test_robust_rmtree_removes_tree_with_readonly_file(tmp_path):
+    target = tmp_path / "ws"
+    target.mkdir()
+    ro = target / "ro.txt"
+    ro.write_text("x")
+    os.chmod(ro, stat.S_IREAD)
+    pc.robust_rmtree(str(target))
+    assert not target.exists()
+
+
+# ---------- configure_console_encoding ----------
+
+def test_configure_console_encoding_is_noop_off_windows():
+    fake_sys = SimpleNamespace(stdout=MagicMock(), stderr=MagicMock())
+    with patch.object(pc, "IS_WINDOWS", False), patch.object(pc, "sys", fake_sys):
+        assert pc.configure_console_encoding() is None
+    fake_sys.stdout.reconfigure.assert_not_called()
+    fake_sys.stderr.reconfigure.assert_not_called()
+
+
+def test_configure_console_encoding_forces_utf8_on_windows():
+    fake_sys = SimpleNamespace(stdout=MagicMock(), stderr=MagicMock())
+    with patch.object(pc, "IS_WINDOWS", True), patch.object(pc, "sys", fake_sys):
+        pc.configure_console_encoding()
+    fake_sys.stdout.reconfigure.assert_called_once_with(encoding="utf-8", errors="replace")
+    fake_sys.stderr.reconfigure.assert_called_once_with(encoding="utf-8", errors="replace")

From 1bdd7bba9a8b0c8317812f4e1585b8de5a82a081 Mon Sep 17 00:00:00 2001
From: Jivan Bal <bal.jivan@yahoo.com>
Date: Sun, 31 May 2026 21:34:07 -0700
Subject: [PATCH 30/31] clean up orphaned PVC disk on gke-destroy
 --delete-cluster

Deleting the cluster tore down the in-cluster CSI driver before it reclaimed the PVC backing PersistentDisk, orphaning a billing disk. Capture the disk before teardown and remove it after the cluster is deleted, touching only the disk our own PVC created. Validated live on GKE: reproduced the orphan and the fix removed it, zero residual.
---
 src/deployml/cli/cli.py              | 28 ++++++++-
 src/deployml/utils/kubernetes_gke.py | 82 ++++++++++++++++++++++++++
 tests/test_gke_destroy.py            | 86 ++++++++++++++++++++++++++++
 3 files changed, 195 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_gke_destroy.py

diff --git a/src/deployml/cli/cli.py b/src/deployml/cli/cli.py
index 33ab00a..bb8b437 100644
--- a/src/deployml/cli/cli.py
+++ b/src/deployml/cli/cli.py
@@ -2709,11 +2709,32 @@ def gke_destroy(
         typer.echo("Either --zone or --region must be provided")
         raise typer.Exit(code=1)
 
-    from deployml.utils.kubernetes_gke import connect_to_gke_cluster
+    from deployml.utils.kubernetes_gke import (
+        connect_to_gke_cluster,
+        get_pvc_volume_handle,
+        disk_ref_from_volume_handle,
+        delete_gce_disk_if_exists,
+    )
 
     if not connect_to_gke_cluster(project, cluster, zone, region):
         raise typer.Exit(code=1)
 
+    # Capture the PVC's backing PersistentDisk BEFORE teardown. With
+    # --delete-cluster the in-cluster CSI driver can be removed before it finishes
+    # reclaiming the PD asynchronously, which orphans a billing disk (concern C12).
+    # We capture only the disk our own PVC created, then guarantee its removal
+    # after the cluster is gone.
+    pvc_disk_ref = None
+    if delete_cluster:
+        pvc_manifest = manifest_dir / "pvc.yaml"
+        if pvc_manifest.exists():
+            try:
+                pvc_name = yaml.safe_load(pvc_manifest.read_text())["metadata"]["name"]
+                handle = get_pvc_volume_handle(pvc_name, namespace)
+                pvc_disk_ref = disk_ref_from_volume_handle(handle) if handle else None
+            except Exception:
+                pvc_disk_ref = None
+
     # Delete in reverse order: service, then deployment, then PVC last. The PVC
     # is deleted explicitly because its backing PersistentDisk bills even after
     # the workload is gone (GKE's default storageclass reclaims on PVC delete).
@@ -2768,6 +2789,11 @@ def gke_destroy(
             result = run_tool(cmd[0], cmd[1:], capture_output=True, text=True)
             if result.returncode == 0:
                 typer.echo(f" Cluster {cluster} deleted")
+                # The cluster is gone, so the CSI driver can no longer reclaim the
+                # PVC's PersistentDisk. Guarantee that one disk is removed. No-op
+                # if the driver already reclaimed it before the cluster delete.
+                if pvc_disk_ref:
+                    delete_gce_disk_if_exists(project, pvc_disk_ref)
                 break
             if "incompatible operation" in (result.stderr or "").lower():
                 typer.echo("   Cluster busy with another operation, retrying in 20s...")
diff --git a/src/deployml/utils/kubernetes_gke.py b/src/deployml/utils/kubernetes_gke.py
index 3efdab9..bc10e37 100644
--- a/src/deployml/utils/kubernetes_gke.py
+++ b/src/deployml/utils/kubernetes_gke.py
@@ -63,6 +63,88 @@ def warn_if_gke_auth_plugin_missing() -> None:
         )
 
 
+def disk_ref_from_volume_handle(volume_handle):
+    """Parse a GCE PD CSI volume handle into (disk_name, location_flag, location).
+
+    Handles zonal "projects/P/zones/Z/disks/NAME" and regional
+    "projects/P/regions/R/disks/NAME" forms. Returns None when the string is not a
+    GCE PD handle, so callers can skip cleanup safely.
+    """
+    if not volume_handle:
+        return None
+    parts = volume_handle.strip().strip("/").split("/")
+    if "disks" not in parts:
+        return None
+    di = parts.index("disks")
+    if di + 1 >= len(parts):
+        return None
+    disk_name = parts[di + 1]
+    if "zones" in parts:
+        return (disk_name, "--zone", parts[parts.index("zones") + 1])
+    if "regions" in parts:
+        return (disk_name, "--region", parts[parts.index("regions") + 1])
+    return None
+
+
+def get_pvc_volume_handle(pvc_name, namespace=None):
+    """Return the CSI volume handle of the PV bound to pvc_name, or None.
+
+    Must be called while the PVC still exists, since it follows the PVC to its
+    bound PV and reads the PV's CSI volume handle. Used by gke-destroy to capture
+    the backing PersistentDisk before teardown.
+    """
+    ns = ["-n", namespace] if namespace and namespace != "default" else []
+    pv = run_tool(
+        "kubectl",
+        ["get", "pvc", pvc_name, "-o", "jsonpath={.spec.volumeName}"] + ns,
+        capture_output=True, text=True,
+    )
+    pv_name = (pv.stdout or "").strip()
+    if pv.returncode != 0 or not pv_name:
+        return None
+    handle = run_tool(
+        "kubectl",
+        ["get", "pv", pv_name, "-o", "jsonpath={.spec.csi.volumeHandle}"],
+        capture_output=True, text=True,
+    )
+    vh = (handle.stdout or "").strip()
+    return vh or None
+
+
+def delete_gce_disk_if_exists(project, disk_ref) -> bool:
+    """Best-effort delete of a specific GCE PersistentDisk.
+
+    disk_ref is (disk_name, location_flag, location) as returned by
+    disk_ref_from_volume_handle. If the first describe fails, for example because
+    the CSI driver already reclaimed the disk, no delete is issued. Only the one
+    disk passed in is touched. Returns True when a describe call fails, which is
+    read as the disk being absent; any gcloud failure is treated the same way.
+    """
+    disk_name, loc_flag, loc = disk_ref
+    describe = run_tool(
+        "gcloud",
+        ["compute", "disks", "describe", disk_name, loc_flag, loc,
+         "--project", project, "--format=value(name)"],
+        capture_output=True, text=True,
+    )
+    if describe.returncode != 0:
+        return True
+    typer.echo(f" Removing orphaned persistent disk {disk_name}...")
+    run_tool(
+        "gcloud",
+        ["compute", "disks", "delete", disk_name, loc_flag, loc,
+         "--project", project, "--quiet"],
+        capture_output=True, text=True,
+    )
+    verify = run_tool(
+        "gcloud",
+        ["compute", "disks", "describe", disk_name, loc_flag, loc,
+         "--project", project, "--format=value(name)"],
+        capture_output=True, text=True,
+    )
+    return verify.returncode != 0
+
+
 def connect_to_gke_cluster(
     project_id: str,
     cluster_name: str,
diff --git a/tests/test_gke_destroy.py b/tests/test_gke_destroy.py
new file mode 100644
index 0000000..94c76b7
--- /dev/null
+++ b/tests/test_gke_destroy.py
@@ -0,0 +1,86 @@
+"""Unit tests for the GKE teardown disk-reclaim logic.
+
+Covers the fix for the orphaned PersistentDisk left by gke-destroy
+--delete-cluster: the in-cluster CSI driver reclaims a PVC's backing PD
+asynchronously, and deleting the cluster too soon orphans a billing disk. No real
+GCP or kubectl calls; run_tool is mocked at the boundary.
+"""
+from types import SimpleNamespace
+from unittest.mock import patch
+
+import deployml.utils.kubernetes_gke as gke
+
+
+# ---------- disk_ref_from_volume_handle (pure parsing) ----------
+
+def test_disk_ref_from_zonal_volume_handle():
+    h = "projects/my-proj/zones/us-west1-a/disks/pvc-1234"
+    assert gke.disk_ref_from_volume_handle(h) == ("pvc-1234", "--zone", "us-west1-a")
+
+
+def test_disk_ref_from_regional_volume_handle():
+    h = "projects/my-proj/regions/us-west1/disks/pvc-abcd"
+    assert gke.disk_ref_from_volume_handle(h) == ("pvc-abcd", "--region", "us-west1")
+
+
+def test_disk_ref_from_volume_handle_none_for_garbage():
+    assert gke.disk_ref_from_volume_handle("") is None
+    assert gke.disk_ref_from_volume_handle("not-a-handle") is None
+    assert gke.disk_ref_from_volume_handle(None) is None
+
+
+# ---------- get_pvc_volume_handle (boundary mocked) ----------
+
+def test_get_pvc_volume_handle_returns_handle_when_bound():
+    def fake_run_tool(name, args, **kw):
+        if args[:2] == ["get", "pvc"]:
+            return SimpleNamespace(returncode=0, stdout="pv-xyz\n", stderr="")
+        if args[:2] == ["get", "pv"]:
+            return SimpleNamespace(
+                returncode=0, stdout="projects/p/zones/z/disks/pvc-1\n", stderr=""
+            )
+        raise AssertionError(f"unexpected call {args}")
+
+    with patch.object(gke, "run_tool", side_effect=fake_run_tool):
+        assert gke.get_pvc_volume_handle("mlflow-pvc") == "projects/p/zones/z/disks/pvc-1"
+
+
+def test_get_pvc_volume_handle_none_when_unbound():
+    with patch.object(gke, "run_tool",
+                      return_value=SimpleNamespace(returncode=0, stdout="", stderr="")):
+        assert gke.get_pvc_volume_handle("mlflow-pvc") is None
+
+
+# ---------- delete_gce_disk_if_exists (boundary mocked) ----------
+
+def test_delete_gce_disk_skips_when_already_gone():
+    """If the CSI driver already reclaimed the disk, describe fails and no delete
+    must be issued."""
+    calls = []
+
+    def fake_run_tool(name, args, **kw):
+        calls.append(args)
+        return SimpleNamespace(returncode=1, stdout="", stderr="NOT_FOUND")
+
+    with patch.object(gke, "run_tool", side_effect=fake_run_tool):
+        ok = gke.delete_gce_disk_if_exists("my-proj", ("pvc-1", "--zone", "us-west1-a"))
+    assert ok is True
+    assert all(a[:3] != ["compute", "disks", "delete"] for a in calls)
+
+
+def test_delete_gce_disk_deletes_when_present():
+    seq = [
+        SimpleNamespace(returncode=0, stdout="pvc-1\n", stderr=""),    # describe: found
+        SimpleNamespace(returncode=0, stdout="", stderr=""),           # delete
+        SimpleNamespace(returncode=1, stdout="", stderr="NOT_FOUND"),  # describe: gone
+    ]
+    calls = []
+
+    def fake_run_tool(name, args, **kw):
+        calls.append(args)
+        return seq.pop(0)
+
+    with patch.object(gke, "run_tool", side_effect=fake_run_tool):
+        ok = gke.delete_gce_disk_if_exists("my-proj", ("pvc-1", "--zone", "us-west1-a"))
+    assert ok is True
+    assert any(a[:3] == ["compute", "disks", "delete"] for a in calls)

From b13e8fe6900d59c1598b98af173b127098dd3f43 Mon Sep 17 00:00:00 2001
From: jivanb7 <bal.jivan1@gmail.com>
Date: Mon, 1 Jun 2026 00:03:40 -0700
Subject: [PATCH 31/31] warn when gke-init image push fails instead of silently
 producing a broken manifest

gke-init ignored the push_image_to_gcr return value, so a failed docker push (Docker not running, auth, or any error) silently produced a manifest referencing a gcr.io image that does not exist; the user only discovered it later as an ImagePullBackOff at deploy time. Now check the result and emit a clear, actionable warning naming the image and the consequence and how to fix it. Applies to both the fastapi and mlflow gke-init generators. 63 unit tests pass.
---
 src/deployml/utils/kubernetes_gke.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/deployml/utils/kubernetes_gke.py b/src/deployml/utils/kubernetes_gke.py
index bc10e37..72ec15f 100644
--- a/src/deployml/utils/kubernetes_gke.py
+++ b/src/deployml/utils/kubernetes_gke.py
@@ -247,8 +247,13 @@ def generate_fastapi_manifests_gke(
     # avoid the :latest drift bug that bites the Cloud Run path the same way.
     if not image.startswith("gcr.io/"):
         gcr_image = f"gcr.io/{project_id}/fastapi/fastapi:v{_DEPLOYML_VERSION}"
-        if push_image:
-            push_image_to_gcr(image, gcr_image, project_id)
+        if push_image and not push_image_to_gcr(image, gcr_image, project_id):
+            typer.echo(
+                f"WARNING: could not push the image to {gcr_image}. The manifest "
+                f"references that image, so the GKE deploy will fail with "
+                f"ImagePullBackOff until it exists. Start Docker and retry, or push "
+                f"the image to {gcr_image} yourself, then deploy."
+            )
         image = gcr_image
     else:
         gcr_image = image
@@ -344,8 +349,13 @@ def generate_mlflow_manifests_gke(
     # Convert local image to GCR format. Pin tag to the deployml version.
     if not image.startswith("gcr.io/"):
         gcr_image = f"gcr.io/{project_id}/mlflow/mlflow:v{_DEPLOYML_VERSION}"
-        if push_image:
-            push_image_to_gcr(image, gcr_image, project_id)
+        if push_image and not push_image_to_gcr(image, gcr_image, project_id):
+            typer.echo(
+                f"WARNING: could not push the image to {gcr_image}. The manifest "
+                f"references that image, so the GKE deploy will fail with "
+                f"ImagePullBackOff until it exists. Start Docker and retry, or push "
+                f"the image to {gcr_image} yourself, then deploy."
+            )
         image = gcr_image
     else:
         gcr_image = image