From d9a8569432176c4ce3369b3d0a9489d25a2910fb Mon Sep 17 00:00:00 2001 From: Dan Aschwanden Date: Mon, 1 Jun 2026 14:31:00 +0000 Subject: [PATCH 1/4] Adds IaC for GCP --- .gitignore | 10 +- README.md | 2 + hack/ate-dev-env.sh.gcp | 80 ++++++++ hack/iac/README.md | 117 ++++++++++++ hack/iac/main.tf | 400 ++++++++++++++++++++++++++++++++++++++++ hack/iac/outputs.tf | 38 ++++ hack/iac/variables.tf | 106 +++++++++++ 7 files changed, 752 insertions(+), 1 deletion(-) create mode 100644 hack/ate-dev-env.sh.gcp create mode 100644 hack/iac/README.md create mode 100644 hack/iac/main.tf create mode 100644 hack/iac/outputs.tf create mode 100644 hack/iac/variables.tf diff --git a/.gitignore b/.gitignore index d7afedf39..49405a6b3 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,12 @@ __pycache__/ *.pyc # Local environment variables -.ate-dev-env.sh \ No newline at end of file +.ate-dev-env.sh + +# Local .terraform directories +*.terraform/ +*.terraform.lock.* + +# .tfstate files +*.tfstate +*.tfstate.* diff --git a/README.md b/README.md index de0f7a498..07bd92c18 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,8 @@ curl -X POST -H "Host: my-counter-1.actors.resources.substrate.ate.dev" -i http: ### GKE Quickstart (Development) +> For a declarative, Terraform-based setup that starts from a vanilla Google Cloud project, see [hack/iac/README.md](hack/iac/README.md) (GKE Quickstart (Production)). + 1. Create and configure your environment file: ```bash cp hack/ate-dev-env.sh.example .ate-dev-env.sh diff --git a/hack/ate-dev-env.sh.gcp b/hack/ate-dev-env.sh.gcp new file mode 100644 index 000000000..6611b8ea9 --- /dev/null +++ b/hack/ate-dev-env.sh.gcp @@ -0,0 +1,80 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Environment variables for Substrate development on GCP when the underlying
+# resources are provisioned with Terraform (see hack/iac/).
+#
+# Copy this file to .ate-dev-env.sh and customize it for your environment.
+# The TF_VAR_* exports below feed Terraform; no terraform.tfvars is needed.
+#
+# Unlike hack/ate-dev-env.sh.example (which is consumed by the
+# `go run ./tools/setup-gcp` provisioner), the resources referenced below are
+# created by Terraform. Source this file, run `terraform apply` in hack/iac/,
+# then deploy with hack/install-ate.sh.
+
+export PROJECT_ID="${USER}-gke-dev"
+export PROJECT_NUMBER="$(gcloud projects describe "${PROJECT_ID}" --format="value(projectNumber)")"
+
+export GCE_REGION=us-central1
+export CLUSTER_LOCATION=us-central1-c
+
+# VPC and subnet are created by Terraform (hack/iac/main.tf) and are both
+# named "substrate".
+export NETWORK=substrate
+export SUBNETWORK=substrate
+
+export CLUSTER_NAME=substrate-poc
+export CLUSTER_VERSION=1.35.0-gke.2398000
+
+# The gVisor sandbox runtime runs on the "worker" node pool created by
+# Terraform (hack/iac/main.tf). The default pool keeps a single node for
+# system and non-gVisor workloads.
+export NODE_POOL_NAME=worker
+export NODE_POOL_VERSION=1.35.0-gke.2398000
+export DEFAULT_NODE_MACHINE_TYPE=e2-standard-2
+export GVISOR_NODE_MACHINE_TYPE=c3-standard-4
+
+# Set this if you are using an existing cluster with a different context name.
+export KUBECTL_CONTEXT=
+
+export BUCKET_NAME=snapshot-substrate-test-${PROJECT_ID}
+
+# Artifact Registry repository created by Terraform (hack/iac/artifactregistry.tf).
+# Cloud Build pushes images here; the cluster pulls from it.
+export AR_REPOSITORY_ID=substrate
+export KO_DOCKER_REPO="${GCE_REGION}-docker.pkg.dev/${PROJECT_ID}/${AR_REPOSITORY_ID}"
+
+# Set this if you want to override the default build platforms
+export KO_DEFAULTPLATFORMS=linux/amd64
+
+# ── Terraform inputs ──────────────────────────────────────────────────────────
+# Terraform automatically picks up any variable from a TF_VAR_ environment
+# variable, so we derive its inputs from the values above instead of maintaining
+# a separate terraform.tfvars. Source this file before running terraform in
+# hack/iac/ and the variables will be populated automatically.
+export TF_VAR_project_id=${PROJECT_ID}
+export TF_VAR_region=${GCE_REGION}
+export TF_VAR_cluster_location=${CLUSTER_LOCATION}
+export TF_VAR_cluster_name=${CLUSTER_NAME}
+export TF_VAR_cluster_version=${CLUSTER_VERSION}
+export TF_VAR_default_node_machine_type=${DEFAULT_NODE_MACHINE_TYPE}
+export TF_VAR_worker_node_machine_type=${GVISOR_NODE_MACHINE_TYPE}
+export TF_VAR_bucket_name=${BUCKET_NAME}
+export TF_VAR_ar_repository_id=${AR_REPOSITORY_ID}
+
+# Networking CIDRs use the defaults in hack/iac/variables.tf. Uncomment and set
+# these to override them.
+# export TF_VAR_subnet_cidr=10.0.0.0/20
+# export TF_VAR_pods_cidr=10.1.0.0/16
+# export TF_VAR_services_cidr=10.2.0.0/20
diff --git a/hack/iac/README.md b/hack/iac/README.md
new file mode 100644
index 000000000..9e861fea8
--- /dev/null
+++ b/hack/iac/README.md
@@ -0,0 +1,117 @@
+### GKE Quickstart (Production)
+
+This is the Terraform equivalent of the `go run ./tools/setup-gcp --all`
+provisioner described in the [GKE Quickstart (Development)](../../README.md)
+section.
It provisions the same GCP resources — a GKE cluster, snapshot bucket, +Artifact Registry repository, and IAM bindings — but does so declaratively and +starting from a **vanilla Google Cloud project**: no APIs enabled, no VPC, and +no subnets are assumed to exist beforehand. + +The configuration lives in [`hack/iac/`](.) and uses resources from the +[Terraform Google Cloud provider](https://registry.terraform.io/providers/hashicorp/google/latest/docs) +directly (no modules), so every resource is visible and easy to adapt. + +What gets created: + +- A dedicated `substrate` VPC and subnet (VPC-native, with secondary ranges for pods and services). +- A GKE cluster with Workload Identity and the required Kubernetes beta APIs enabled. +- A single-node default pool (for system and non-gVisor workloads) and a `worker` pool running the gVisor sandbox runtime. +- A GCS bucket for sandbox snapshots. +- An Artifact Registry repository, with Cloud Build granted write access and the cluster granted read access. +- All required Google Cloud APIs. + +#### Prerequisites + +1. Install [Terraform](https://developer.hashicorp.com/terraform/install) (>= 1.5) and the [`gcloud` CLI](https://cloud.google.com/sdk/docs/install). + +2. Create and source your environment file. This single file drives both + Terraform and the deployment scripts: it exports `TF_VAR_*` variables that + Terraform picks up automatically, so there is no separate `terraform.tfvars` + to keep in sync. Source it now, before any of the steps below — everything + that follows relies on the variables it exports: + ```bash + cp hack/ate-dev-env.sh.gcp .ate-dev-env.sh + + # Edit .ate-dev-env.sh to match your project and preferences, then source it: + source .ate-dev-env.sh + ``` + +3. Authenticate with application-default credentials: + ```bash + gcloud auth application-default login --project=${PROJECT_ID} + ``` + +4. Bootstrap the two APIs that Terraform itself depends on. 
Although this
+   configuration enables all required APIs via `google_project_service`, that
+   resource needs the **Service Usage API** to function, and the
+   `google_project` data source read during `terraform plan` needs the **Cloud
+   Resource Manager API**. Neither can be enabled by Terraform on a truly
+   vanilla project (chicken-and-egg), so enable them once up front with
+   `gcloud`:
+   ```bash
+   gcloud services enable \
+     serviceusage.googleapis.com \
+     cloudresourcemanager.googleapis.com \
+     --project=${PROJECT_ID}
+   ```
+   Terraform manages the remaining APIs from there.
+
+#### Provisioning
+
+1. Initialize Terraform and review the plan. Terraform reads its inputs from the
+   `TF_VAR_*` variables exported by the environment file you sourced in the
+   prerequisites, so make sure `.ate-dev-env.sh` is sourced in your current
+   shell:
+   ```bash
+   cd hack/iac
+   terraform init
+   terraform plan
+   ```
+
+2. Apply to provision all resources:
+   ```bash
+   terraform apply
+   ```
+
+3. Configure `kubectl` to talk to the new cluster. Terraform prints the exact
+   command as the `get_credentials_command` output:
+   ```bash
+   gcloud container clusters get-credentials ${CLUSTER_NAME} --location ${CLUSTER_LOCATION} --project ${PROJECT_ID}
+   ```
+
+4. Configure Docker authentication for Artifact Registry. The deployment scripts
+   build and push images locally (via `ko`/Docker) to `KO_DOCKER_REPO`, so the
+   human or CI principal running the deployment pushes directly — it does **not**
+   go through Cloud Build. Authenticate your local Docker client against the
+   registry host:
+   ```bash
+   gcloud auth configure-docker ${GCE_REGION}-docker.pkg.dev
+   ```
+   This Terraform configuration only grants `roles/artifactregistry.writer` to
+   the Cloud Build service account (see [`main.tf`](main.tf)).
If you deploy with
+   the local-push path, make sure the principal running it also has Artifact
+   Registry writer access on the repository, for example:
+   ```bash
+   gcloud artifacts repositories add-iam-policy-binding ${AR_REPOSITORY_ID} \
+     --location=${GCE_REGION} \
+     --project=${PROJECT_ID} \
+     --member="user:$(gcloud config get-value account)" \
+     --role="roles/artifactregistry.writer"
+   ```
+
+5. From the repository root (not `hack/iac`), deploy the Agent Substrate system
+   and demos exactly as in the development quickstart:
+   ```bash
+   ./hack/install-ate.sh --deploy-ate-system
+   ```
+
+#### Tearing down resources
+
+To delete everything Terraform created:
+```bash
+cd hack/iac
+terraform destroy
+```
+
+The GKE cluster sets `deletion_protection = false`, so `terraform destroy`
+removes it without any manual intervention.
diff --git a/hack/iac/main.tf b/hack/iac/main.tf
new file mode 100644
index 000000000..fd0195b34
--- /dev/null
+++ b/hack/iac/main.tf
@@ -0,0 +1,400 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+terraform {
+  required_version = ">= 1.5"
+
+  required_providers {
+    google      = { source = "hashicorp/google", version = ">= 5.0" }
+    google-beta = { source = "hashicorp/google-beta", version = ">= 5.0" }
+  }
+}
+
+provider "google" {
+  project = var.project_id
+  region  = var.region
+  zone    = var.zone
+}
+
+# google-beta is used by the GKE cluster, the worker node pool and the Cloud
+# Build service identity in this file (each sets `provider = google-beta`).
+provider "google-beta" {
+  project = var.project_id
+  region  = var.region
+  zone    = var.zone
+}
+
+##########################################################################
+# Enable the required Cloud APIs
+##########################################################################
+resource "google_project_service" "services" {
+  for_each = toset([
+    "aiplatform.googleapis.com",
+    "artifactregistry.googleapis.com",
+    "cloudbuild.googleapis.com",
+    "cloudresourcemanager.googleapis.com",
+    "cloudtrace.googleapis.com",
+    "compute.googleapis.com",
+    "container.googleapis.com",
+    "iam.googleapis.com",
+    "iamcredentials.googleapis.com",
+    "logging.googleapis.com",
+    "monitoring.googleapis.com",
+    "networkconnectivity.googleapis.com",
+    "serviceusage.googleapis.com",
+    "storage.googleapis.com",
+  ])
+
+  project            = var.project_id
+  service            = each.value
+  disable_on_destroy = false
+}
+
+##########################################################################
+# Fetch defaults
+##########################################################################
+data "google_project" "project" {
+}
+
+data "google_compute_default_service_account" "default" {
+  depends_on = [
+    google_project_service.services # whole resource: the old key named no for_each instance
+  ]
+}
+
+##########################################################################
+# Set up the VPC and subnet
+##########################################################################
+resource "google_compute_network" "substrate" {
+  name                    = "substrate"
+  project                 = var.project_id
+  auto_create_subnetworks = false
+
+  depends_on = [
+    google_project_service.services # whole resource: the old key was a typo ("networkingconnectivity")
+  ]
+}
+
+resource "google_compute_subnetwork" "substrate" {
+  name          = "substrate"
+
project       = var.project_id
+  region        = var.region
+  network       = google_compute_network.substrate.id
+  ip_cidr_range = var.subnet_cidr
+
+  secondary_ip_range {
+    range_name    = "substrate-pods"
+    ip_cidr_range = var.pods_cidr
+  }
+
+  secondary_ip_range {
+    range_name    = "substrate-services"
+    ip_cidr_range = var.services_cidr
+  }
+}
+
+# Allow the health-check source ranges to reach instances tagged allow-health-check on any TCP port.
+resource "google_compute_firewall" "allow_l7_xlb_fw_hc" {
+  name          = "allow-l7-xlb-fw-hc"
+  direction     = "INGRESS"
+  network       = google_compute_network.substrate.id
+  source_ranges = ["130.211.0.0/22", "35.191.0.0/16"]
+  allow {
+    protocol = "tcp"
+  }
+  target_tags = ["allow-health-check"]
+}
+
+# Allow SSH (TCP 22) from the IAP range; there are no target_tags, so it applies to every instance in the VPC.
+resource "google_compute_firewall" "allow_ssh_ingress_from_iap" {
+  name          = "allow-ssh-ingress-from-iap"
+  direction     = "INGRESS"
+  network       = google_compute_network.substrate.id
+  source_ranges = ["35.235.240.0/20"]
+  allow {
+    protocol = "tcp"
+    ports    = ["22"]
+  }
+}
+
+##########################################################################
+# Grant the Compute Engine default service account project-level Storage Admin and Service Usage Admin
+##########################################################################
+resource "google_project_iam_member" "default_storage_admin" {
+  project = var.project_id
+  role    = "roles/storage.admin"
+  member  = "serviceAccount:${data.google_compute_default_service_account.default.email}"
+}
+
+resource "google_project_iam_member" "default_service_usage_admin" {
+  project = var.project_id
+  role    = "roles/serviceusage.serviceUsageAdmin"
+  member  = "serviceAccount:${data.google_compute_default_service_account.default.email}"
+}
+
+##########################################################################
+# Set up the Cloud Router that hosts the NAT gateway
+##########################################################################
+resource "google_compute_router" "substrate_router" {
+  name    = "substrate-router"
+  region  = var.region
+  network = google_compute_network.substrate.id
+
+  bgp {
+    asn = 64514
+  }
+}
+
+resource
"google_compute_router_nat" "substrate_router_nat" {
+  name                               = "substrate-router-nat"
+  router                             = google_compute_router.substrate_router.name
+  region                             = google_compute_router.substrate_router.region
+  nat_ip_allocate_option             = "AUTO_ONLY"
+  source_subnetwork_ip_ranges_to_nat = "ALL_SUBNETWORKS_ALL_IP_RANGES"
+
+  log_config {
+    enable = true
+    filter = "ERRORS_ONLY"
+  }
+}
+
+##########################################################################
+# Set up the Artifact Registry (repository ID comes from var.ar_repository_id)
+##########################################################################
+resource "google_artifact_registry_repository" "ate_images" {
+  location      = var.region
+  repository_id = var.ar_repository_id
+  description   = "docker repository"
+  format        = "DOCKER"
+  depends_on = [
+    google_project_service.services["artifactregistry.googleapis.com"]
+  ]
+}
+
+resource "google_artifact_registry_repository_iam_member" "atelet_artifact_reader" {
+  location   = google_artifact_registry_repository.ate_images.location
+  repository = google_artifact_registry_repository.ate_images.name
+  role       = "roles/artifactregistry.reader"
+  member     = "principal://iam.googleapis.com/projects/${data.google_project.project.number}/locations/global/workloadIdentityPools/${var.project_id}.svc.id.goog/subject/ns/ate-system/sa/atelet"
+  depends_on = [
+    google_container_cluster.substrate
+  ]
+}
+
+resource "google_artifact_registry_repository_iam_member" "default_sa_artifact_reader" {
+  location   = google_artifact_registry_repository.ate_images.location
+  repository = google_artifact_registry_repository.ate_images.name
+  role       = "roles/artifactregistry.reader"
+  member     = "serviceAccount:${data.google_compute_default_service_account.default.email}"
+}
+
+##########################################################################
+# Set up the Snapshot bucket (name comes from var.bucket_name)
+##########################################################################
+resource "google_storage_bucket" "snapshots" {
+  name     = var.bucket_name
+  location
= "US"
+  depends_on = [google_project_service.services] # the Storage API may not be enabled yet on a vanilla project
+  force_destroy               = true
+  public_access_prevention    = "enforced"
+  uniform_bucket_level_access = true
+}
+
+resource "google_storage_bucket_iam_member" "atelet_snapshots_bucket_viewer" {
+  bucket = google_storage_bucket.snapshots.name
+  role   = "roles/storage.bucketViewer"
+  member = "principal://iam.googleapis.com/projects/${data.google_project.project.number}/locations/global/workloadIdentityPools/${var.project_id}.svc.id.goog/subject/ns/ate-system/sa/atelet"
+  depends_on = [
+    google_container_cluster.substrate
+  ]
+}
+
+resource "google_storage_bucket_iam_member" "atelet_snapshots_object_admin" {
+  bucket = google_storage_bucket.snapshots.name
+  role   = "roles/storage.objectAdmin"
+  member = "principal://iam.googleapis.com/projects/${data.google_project.project.number}/locations/global/workloadIdentityPools/${var.project_id}.svc.id.goog/subject/ns/ate-system/sa/atelet"
+  depends_on = [
+    google_container_cluster.substrate
+  ]
+}
+
+###########################################################################
+# Set up the GKE cluster
+##########################################################################
+resource "google_container_cluster" "substrate" {
+  name     = var.cluster_name
+  provider = google-beta
+  location = var.cluster_location
+
+  min_master_version = var.cluster_version
+
+  # Required by Google provider 5.0+ — must be false before terraform destroy
+  # will succeed. Appropriate for dev quickstart infra.
+  deletion_protection = false
+
+  network    = google_compute_network.substrate.name
+  subnetwork = google_compute_subnetwork.substrate.name
+
+  # VPC-native: pod and service IPs drawn from the subnet's secondary ranges.
+  ip_allocation_policy {
+    cluster_secondary_range_name  = "substrate-pods"
+    services_secondary_range_name = "substrate-services"
+  }
+
+  workload_identity_config {
+    workload_pool = "${var.project_id}.svc.id.goog"
+  }
+
+  # Beta Kubernetes APIs required by Agent Substrate.
+  enable_k8s_beta_apis {
+    enabled_apis = [
+      "certificates.k8s.io/v1beta1/podcertificaterequests",
+      "certificates.k8s.io/v1beta1/clustertrustbundles",
+    ]
+  }
+
+  # The default pool (var.default_pool_count nodes) gives system and non-gVisor
+  # workloads somewhere to run instead of forcing them onto the gVisor worker pool.
+  initial_node_count = var.default_pool_count
+
+  node_config {
+    machine_type = var.default_node_machine_type
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/cloud-platform",
+      "https://www.googleapis.com/auth/devstorage.read_only",
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+    ]
+
+    tags = ["default-pool-node", "allow-health-check"]
+    metadata = {
+      disable-legacy-endpoints = "true"
+    }
+
+    shielded_instance_config {
+      enable_secure_boot = true
+    }
+  }
+
+  addons_config {
+    http_load_balancing {
+      disabled = false
+    }
+
+    gcp_filestore_csi_driver_config {
+      enabled = true
+    }
+
+    pod_snapshot_config {
+      enabled = true
+    }
+  }
+
+  gateway_api_config {
+    channel = "CHANNEL_STANDARD"
+  }
+
+  monitoring_config {
+    managed_prometheus {
+      enabled = true
+    }
+
+    advanced_datapath_observability_config {
+      enable_metrics = true
+      enable_relay   = true
+    }
+  }
+
+  private_cluster_config {
+    enable_private_nodes = true
+  }
+
+  datapath_provider = "ADVANCED_DATAPATH"
+
+  enable_shielded_nodes = true
+
+  depends_on = [google_compute_subnetwork.substrate] # explicit ordering on the subnetwork
+}
+
+# Worker pool for sandboxed workloads: its node_config sets sandbox_config type
+# GVISOR and pins image_type to COS_CONTAINERD.
+
+resource "google_container_node_pool" "worker" {
+  name     = "worker"
+  provider = google-beta
+  cluster  = google_container_cluster.substrate.id
+  location = var.cluster_location
+
+  node_count = var.worker_pool_count
+
+  node_config {
+    machine_type = var.worker_node_machine_type
+    image_type   = "COS_CONTAINERD"
+
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/cloud-platform",
+      "https://www.googleapis.com/auth/devstorage.read_only",
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+    ]
+
+    tags = ["worker-pool-node", "allow-health-check"]
+    metadata = {
+      disable-legacy-endpoints = "true"
+    }
+
+    shielded_instance_config {
+      enable_secure_boot = true
+    }
+
+    sandbox_config {
+      type = "GVISOR"
+    }
+  }
+}
+
+###########################################################################
+# Set up permissions for Agents to access Cloud Trace (NOTE(review): no Vertex AI role is bound here; confirm)
+###########################################################################
+resource "google_project_iam_member" "cloud_trace" {
+  project = var.project_id
+  role    = "roles/cloudtrace.agent"
+  member  = "principal://iam.googleapis.com/projects/${data.google_project.project.number}/locations/global/workloadIdentityPools/${var.project_id}.svc.id.goog/subject/ns/ate-system/sa/atelet"
+  depends_on = [
+    google_container_cluster.substrate
+  ]
+}
+
+###########################################################################
+# Set up permissions for Cloud Build to deploy to GKE and push to AR
+###########################################################################
+# Explicitly pull the Cloud Build service identity (service = cloudbuild.googleapis.com)
+resource "google_project_service_identity" "cloudbuild_agent" {
+  provider = google-beta
+  project  = var.project_id
+  service  = "cloudbuild.googleapis.com"
+}
+
+resource "google_project_iam_member" "cloudbuild_gke_developer" {
+  project = var.project_id
+  role    = "roles/container.developer"
+  member  = google_project_service_identity.cloudbuild_agent.member
+}
+
+resource "google_artifact_registry_repository_iam_member" "cloudbuild_artifactregistry_writer" {
+  location   = google_artifact_registry_repository.ate_images.location
+  repository = google_artifact_registry_repository.ate_images.name
+  role       = "roles/artifactregistry.writer"
+  member     = google_project_service_identity.cloudbuild_agent.member
+}
diff --git a/hack/iac/outputs.tf b/hack/iac/outputs.tf
new file mode 100644
index 000000000..a1322d1bf
--- /dev/null
+++ b/hack/iac/outputs.tf
@@ -0,0 +1,38 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+output "cluster_name" {
+  description = "GKE cluster name"
+  value       = google_container_cluster.substrate.name
+}
+
+output "cluster_location" {
+  description = "GKE cluster location"
+  value       = google_container_cluster.substrate.location
+}
+
+output "bucket_name" {
+  description = "Snapshot GCS bucket name"
+  value       = google_storage_bucket.snapshots.name
+}
+
+output "artifact_registry_url" {
+  description = "Artifact Registry repository URL — use as KO_DOCKER_REPO"
+  value       = "${var.region}-docker.pkg.dev/${var.project_id}/${google_artifact_registry_repository.ate_images.repository_id}"
+}
+
+output "get_credentials_command" {
+  description = "gcloud command to configure kubectl for this cluster"
+  value       = "gcloud container clusters get-credentials ${google_container_cluster.substrate.name} --location ${google_container_cluster.substrate.location} --project ${var.project_id}"
+}
diff --git a/hack/iac/variables.tf b/hack/iac/variables.tf
new file mode 100644
index 000000000..639d71f03
--- /dev/null
+++ b/hack/iac/variables.tf
@@ -0,0 +1,110 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ── Project ───────────────────────────────────────────────────────────────────
+
+variable "project_id" {
+  description = "GCP project ID"
+  type        = string
+}
+
+# ── Regions & locations ───────────────────────────────────────────────────────
+
+variable "region" {
+  description = "GCP region for regional resources (subnet, router/NAT, Artifact Registry)"
+  type        = string
+  default     = "us-central1"
+}
+
+variable "zone" {
+  description = "Default zone for the Google providers (e.g. us-central1-c)"
+  type        = string
+  default     = "us-central1-c"
+}
+
+variable "cluster_location" {
+  description = "GKE cluster zone or region (e.g. us-central1-c)"
+  type        = string
+}
+
+# ── Networking ────────────────────────────────────────────────────────────────
+
+variable "subnet_cidr" {
+  description = "Primary IP CIDR range for the 'substrate' subnetwork"
+  type        = string
+  default     = "10.0.0.0/20"
+}
+
+variable "pods_cidr" {
+  description = "Secondary CIDR range for GKE pod IPs (VPC-native)"
+  type        = string
+  default     = "10.1.0.0/16"
+}
+
+variable "services_cidr" {
+  description = "Secondary CIDR range for GKE service IPs (VPC-native)"
+  type        = string
+  default     = "10.2.0.0/20"
+}
+
+# ── GKE cluster ───────────────────────────────────────────────────────────────
+
+variable "cluster_name" {
+  description = "GKE cluster name"
+  type        = string
+}
+
+variable "cluster_version" {
+  description = "GKE cluster version"
+  type        = string
+}
+
+variable "default_node_machine_type" {
+  description = "Machine type for the default (non-gVisor) node pool"
+  type        = string
+  default     = "e2-standard-2"
+}
+
+variable "default_pool_count" {
+  description = "Number of nodes in the default (non-gVisor) node pool"
+  type        = number
+  default     = 1
+}
+
+variable "worker_node_machine_type" {
+  description = "Machine type for the gVisor worker node pool"
+  type        = string
+  default     = "c3-standard-4"
+}
+
+variable "worker_pool_count" {
+  description = "Number of nodes in the gVisor worker node pool"
+  type        = number
+  default     = 2
+}
+
+# ── Storage ───────────────────────────────────────────────────────────────────
+
+variable "bucket_name" {
+  description = "GCS bucket name for sandbox snapshots"
+  type        = string
+}
+
+# ── Artifact Registry ───────────────────────────────────────────────────────── + +variable "ar_repository_id" { + description = "Artifact Registry repository ID" + type = string + default = "substrate" +} From b3c47c15aeb06e37b9e05eb6ffbd9e64e9d54546 Mon Sep 17 00:00:00 2001 From: daschwanden Date: Mon, 1 Jun 2026 17:07:08 +0200 Subject: [PATCH 2/4] Update AR_REPOSITORY_ID --- hack/ate-dev-env.sh.gcp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/ate-dev-env.sh.gcp b/hack/ate-dev-env.sh.gcp index 6611b8ea9..3e3d9b645 100644 --- a/hack/ate-dev-env.sh.gcp +++ b/hack/ate-dev-env.sh.gcp @@ -52,7 +52,7 @@ export BUCKET_NAME=snapshot-substrate-test-${PROJECT_ID} # Artifact Registry repository created by Terraform (hack/iac/artifactregistry.tf). # Cloud Build pushes images here; the cluster pulls from it. -export AR_REPOSITORY_ID=substrate +export AR_REPOSITORY_ID=ate-images export KO_DOCKER_REPO="${GCE_REGION}-docker.pkg.dev/${PROJECT_ID}/${AR_REPOSITORY_ID}" # Set this if you want to override the default build platforms From b015bb25298fb1ab31297f5f5e35be217e42f2e4 Mon Sep 17 00:00:00 2001 From: Dan Aschwanden Date: Tue, 2 Jun 2026 15:13:18 +0000 Subject: [PATCH 3/4] Adds optional Filestore --- .gitignore | 8 -------- hack/ate-dev-env.sh.gcp | 1 + hack/iac/.gitignore | 7 +++++++ hack/iac/main.tf | 28 ++++++++++++++++++++++++++++ hack/iac/variables.tf | 6 ++++++ 5 files changed, 42 insertions(+), 8 deletions(-) create mode 100644 hack/iac/.gitignore diff --git a/.gitignore b/.gitignore index 49405a6b3..0803db521 100644 --- a/.gitignore +++ b/.gitignore @@ -13,11 +13,3 @@ __pycache__/ # Local environment variables .ate-dev-env.sh - -# Local .terraform directories -*.terraform/ -*.terraform.lock.* - -# .tfstate files -*.tfstate -*.tfstate.* diff --git a/hack/ate-dev-env.sh.gcp b/hack/ate-dev-env.sh.gcp index 3e3d9b645..ddf07ef51 100644 --- a/hack/ate-dev-env.sh.gcp +++ b/hack/ate-dev-env.sh.gcp @@ -72,6 +72,7 @@ export 
TF_VAR_default_node_machine_type=${DEFAULT_NODE_MACHINE_TYPE} export TF_VAR_worker_node_machine_type=${GVISOR_NODE_MACHINE_TYPE} export TF_VAR_bucket_name=${BUCKET_NAME} export TF_VAR_ar_repository_id=${AR_REPOSITORY_ID} +export TF_VAR_filestore=false # Networking CIDRs use the defaults in hack/iac/variables.tf. Uncomment and set # these to override them. diff --git a/hack/iac/.gitignore b/hack/iac/.gitignore new file mode 100644 index 000000000..e79c84f5f --- /dev/null +++ b/hack/iac/.gitignore @@ -0,0 +1,7 @@ +# Local .terraform directories +*.terraform/ +*.terraform.lock.* + +# .tfstate files +*.tfstate +*.tfstate.* diff --git a/hack/iac/main.tf b/hack/iac/main.tf index fd0195b34..264f79000 100644 --- a/hack/iac/main.tf +++ b/hack/iac/main.tf @@ -46,6 +46,7 @@ resource "google_project_service" "services" { "cloudresourcemanager.googleapis.com", "cloudtrace.googleapis.com", "compute.googleapis.com", + "file.googleapis.com", "container.googleapis.com", "iam.googleapis.com", "iamcredentials.googleapis.com", @@ -398,3 +399,30 @@ resource "google_artifact_registry_repository_iam_member" "cloudbuild_artifactre role = "roles/artifactregistry.writer" member = google_project_service_identity.cloudbuild_agent.member } + +########################################################################## +# Set up the Filestore cluster +########################################################################## +resource "google_filestore_instance" "filestore" { + count = var.filestore ? 
1 : 0
+  name     = "filestore"
+  location = var.zone
+  tier     = "BASIC_SSD"
+
+  file_shares {
+    capacity_gb = 2560
+    name        = "vol1"
+
+    nfs_export_options {
+      ip_ranges   = [var.subnet_cidr, var.services_cidr] # follow the configurable CIDRs; NOTE(review): pod range stays excluded, as before; confirm
+      access_mode = "READ_WRITE"
+      squash_mode = "NO_ROOT_SQUASH"
+    }
+  }
+
+  networks {
+    network      = google_compute_network.substrate.name
+    modes        = ["MODE_IPV4"]
+    connect_mode = "DIRECT_PEERING"
+  }
+}
diff --git a/hack/iac/variables.tf b/hack/iac/variables.tf
index 639d71f03..52874f829 100644
--- a/hack/iac/variables.tf
+++ b/hack/iac/variables.tf
@@ -104,3 +104,9 @@ variable "ar_repository_id" {
   type        = string
   default     = "substrate"
 }
+
+variable "filestore" {
+  description = "Flag to add Filestore implementation"
+  type        = bool
+  default     = false
+}
From eb949cc5b685c7f7d1d5fde747c0b5df98a4477a Mon Sep 17 00:00:00 2001
From: Dan Aschwanden
Date: Tue, 2 Jun 2026 15:20:00 +0000
Subject: [PATCH 4/4] Moved files to dedicated gcp subfolder

---
 README.md | 2 +-
 hack/{ => gcp}/ate-dev-env.sh.gcp | 0
 hack/{ => gcp}/iac/.gitignore | 0
 hack/{ => gcp}/iac/README.md | 10 +++++-----
 hack/{ => gcp}/iac/main.tf | 0
 hack/{ => gcp}/iac/outputs.tf | 0
 hack/{ => gcp}/iac/variables.tf | 0
 7 files changed, 6 insertions(+), 6 deletions(-)
 rename hack/{ => gcp}/ate-dev-env.sh.gcp (100%)
 rename hack/{ => gcp}/iac/.gitignore (100%)
 rename hack/{ => gcp}/iac/README.md (96%)
 rename hack/{ => gcp}/iac/main.tf (100%)
 rename hack/{ => gcp}/iac/outputs.tf (100%)
 rename hack/{ => gcp}/iac/variables.tf (100%)

diff --git a/README.md b/README.md
index 07bd92c18..a96ab37d5 100644
--- a/README.md
+++ b/README.md
@@ -123,7 +123,7 @@ curl -X POST -H "Host: my-counter-1.actors.resources.substrate.ate.dev" -i http:
 
 ### GKE Quickstart (Development)
 
-> For a declarative, Terraform-based setup that starts from a vanilla Google Cloud project, see [hack/iac/README.md](hack/iac/README.md) (GKE Quickstart (Production)).
+> For a declarative, Terraform-based setup that starts from a vanilla Google Cloud project, see [hack/gcp/iac/README.md](hack/gcp/iac/README.md) (GKE Quickstart (Production)). 1. Create and configure your environment file: ```bash diff --git a/hack/ate-dev-env.sh.gcp b/hack/gcp/ate-dev-env.sh.gcp similarity index 100% rename from hack/ate-dev-env.sh.gcp rename to hack/gcp/ate-dev-env.sh.gcp diff --git a/hack/iac/.gitignore b/hack/gcp/iac/.gitignore similarity index 100% rename from hack/iac/.gitignore rename to hack/gcp/iac/.gitignore diff --git a/hack/iac/README.md b/hack/gcp/iac/README.md similarity index 96% rename from hack/iac/README.md rename to hack/gcp/iac/README.md index 9e861fea8..96f7d05ef 100644 --- a/hack/iac/README.md +++ b/hack/gcp/iac/README.md @@ -1,13 +1,13 @@ ### GKE Quickstart (Production) This is the Terraform equivalent of the `go run ./tools/setup-gcp --all` -provisioner described in the [GKE Quickstart (Development)](../../README.md) +provisioner described in the [GKE Quickstart (Development)](../../../README.md) section. It provisions the same GCP resources — a GKE cluster, snapshot bucket, Artifact Registry repository, and IAM bindings — but does so declaratively and starting from a **vanilla Google Cloud project**: no APIs enabled, no VPC, and no subnets are assumed to exist beforehand. -The configuration lives in [`hack/iac/`](.) and uses resources from the +The configuration lives in [`hack/gcp/iac/`](.) and uses resources from the [Terraform Google Cloud provider](https://registry.terraform.io/providers/hashicorp/google/latest/docs) directly (no modules), so every resource is visible and easy to adapt. @@ -30,7 +30,7 @@ What gets created: to keep in sync. 
Source it now, before any of the steps below — everything that follows relies on the variables it exports: ```bash - cp hack/ate-dev-env.sh.gcp .ate-dev-env.sh + cp hack/gcp/ate-dev-env.sh.gcp .ate-dev-env.sh # Edit .ate-dev-env.sh to match your project and preferences, then source it: source .ate-dev-env.sh @@ -63,7 +63,7 @@ What gets created: prerequisites, so make sure `.ate-dev-env.sh` is sourced in your current shell: ```bash - cd hack/iac + cd hack/gcp/iac terraform init terraform plan ``` @@ -109,7 +109,7 @@ What gets created: To delete everything Terraform created: ```bash -cd hack/iac +cd hack/gcp/iac terraform destroy ``` diff --git a/hack/iac/main.tf b/hack/gcp/iac/main.tf similarity index 100% rename from hack/iac/main.tf rename to hack/gcp/iac/main.tf diff --git a/hack/iac/outputs.tf b/hack/gcp/iac/outputs.tf similarity index 100% rename from hack/iac/outputs.tf rename to hack/gcp/iac/outputs.tf diff --git a/hack/iac/variables.tf b/hack/gcp/iac/variables.tf similarity index 100% rename from hack/iac/variables.tf rename to hack/gcp/iac/variables.tf