From 824adac92cadd17cc9b080ef0298c92345b46d31 Mon Sep 17 00:00:00 2001 From: Asad Iqbal Date: Fri, 12 Jun 2026 19:12:27 +0500 Subject: [PATCH 1/2] feat(egress-proxy): deploy-time egress-enforcement pre-flight (non-blocking) [#104] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the §8.2 lockdown is enabled (networkPolicy.training.allowExternalHttps=false) but the cluster's CNI doesn't actually enforce egress NetworkPolicy, the lockdown is a silent no-op (false sense of security — hit on EKS VPC-CNI during the #102 dev validation). This adds a post-install/post-upgrade Helm hook that, in that case, runs a tracebloc.io/workload=training-labelled probe which curls a canary host DIRECTLY: reachable => the CNI isn't enforcing => logs a loud WARNING. Always exits 0 (non-blocking), so it never fails an upgrade or the hourly auto-upgrade. - gated on enabled && !allowExternalHttps && enforcementProbeHost -> ships DORMANT (does nothing until a fleet flips the lockdown). - new values networkPolicy.training.enforcementProbeHost (default 1.1.1.1; "" disables, e.g. air-gapped); registered in values.schema.json. - curl probe image digest-pinned multi-arch, PSA-restricted, runAsUser 100. - helm-unittest egress_enforcement_check_test.yaml (gating both ways + shape); 248/248. - Chart 1.7.0 -> 1.7.1 (version + appVersion lockstep). Chart-only; no backend / client-runtime changes (enforcement treated as a per-fleet prerequisite, so no central audit needed). Refs tracebloc/client-runtime#104, #102. 
Co-Authored-By: Claude Opus 4.8 --- client/Chart.yaml | 4 +- .../templates/egress-enforcement-check.yaml | 85 ++++++++++++++ .../tests/egress_enforcement_check_test.yaml | 104 ++++++++++++++++++ client/values.schema.json | 4 + client/values.yaml | 5 + 5 files changed, 200 insertions(+), 2 deletions(-) create mode 100644 client/templates/egress-enforcement-check.yaml create mode 100644 client/tests/egress_enforcement_check_test.yaml diff --git a/client/Chart.yaml b/client/Chart.yaml index 9c8a3f7..0a09c80 100644 --- a/client/Chart.yaml +++ b/client/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: client description: A unified Helm chart for tracebloc on AKS, EKS, bare-metal, and OpenShift type: application -version: 1.7.0 -appVersion: "1.7.0" +version: 1.7.1 +appVersion: "1.7.1" keywords: - tracebloc - kubernetes diff --git a/client/templates/egress-enforcement-check.yaml b/client/templates/egress-enforcement-check.yaml new file mode 100644 index 0000000..f691966 --- /dev/null +++ b/client/templates/egress-enforcement-check.yaml @@ -0,0 +1,85 @@ +{{- if and (default dict .Values.networkPolicy.training).enabled (not (dig "allowExternalHttps" true .Values.networkPolicy.training)) (dig "enforcementProbeHost" "" .Values.networkPolicy.training) }} +{{- /* + Egress-lockdown enforcement pre-flight (SECURITY §8.2 / client-runtime#104). + Renders ONLY when the lockdown is enabled (allowExternalHttps=false) and a probe + host is set. A post-install/post-upgrade hook runs a `tracebloc.io/workload: + training`-labelled pod (so the training-egress NetworkPolicy governs it) that + curls a canary external host DIRECTLY: blocked => the CNI enforces egress (good); + reachable => NOT enforced => the lockdown is a silent no-op. NON-BLOCKING: the + probe always exits 0 and just logs a loud warning, so it never fails an + install/upgrade (including the hourly auto-upgrade). Set enforcementProbeHost="" + to disable (e.g. air-gapped clusters with no external host to test against). 
+*/ -}} +{{- $host := dig "enforcementProbeHost" "1.1.1.1" .Values.networkPolicy.training }} +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ .Release.Name }}-egress-enforcement-check + namespace: {{ .Release.Namespace }} + labels: + {{- include "tracebloc.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "5" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 120 + template: + metadata: + labels: + {{- include "tracebloc.selectorLabels" . | nindent 8 }} + tracebloc.io/workload: training + spec: + restartPolicy: Never + automountServiceAccountToken: false + securityContext: + runAsNonRoot: true + # curlimages/curl's default user is non-numeric (curl_user); runAsNonRoot + # can't verify that, so pin the image's uid explicitly. + runAsUser: 100 + seccompProfile: + type: RuntimeDefault + containers: + - name: probe + image: {{ include "tracebloc.image" (dict "repository" "curlimages/curl" "tag" "8.20.0" "digest" "sha256:b3f1fb2a51d923260350d21b8654bbc607164a987e2f7c84a0ac199a67df812a" "registry" "docker.io") | quote }} + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + readOnlyRootFilesystem: true + command: + - sh + - -c + - | + HOST={{ $host | quote }} + echo "[egress-enforcement-check] probing direct egress to https://$HOST (must be BLOCKED when the lockdown is enforced)..." + code=$(curl --noproxy '*' -s -m 5 -o /dev/null -w '%{http_code}' "https://$HOST" 2>/dev/null || true) + if [ -n "$code" ] && [ "$code" != "000" ]; then + echo "WARNING ================================================================" + echo "WARNING EGRESS LOCKDOWN NOT ENFORCED on this cluster." + echo "WARNING A tracebloc.io/workload=training pod reached https://$HOST" + echo "WARNING directly (HTTP $code). 
networkPolicy.training.allowExternalHttps" + echo "WARNING is false, but the CNI is NOT enforcing egress NetworkPolicy, so" + echo "WARNING the SECURITY §8.2 lockdown is INACTIVE and training pods can" + echo "WARNING still reach the open internet." + echo "WARNING Fix: enable egress NetworkPolicy on the CNI (Calico/Cilium, or" + echo "WARNING EKS VPC-CNI enableNetworkPolicy=true), then re-run the upgrade." + echo "WARNING ================================================================" + else + echo "OK egress lockdown verified: direct external egress is blocked (curl -> ${code:-blocked})." + fi + exit 0 + resources: + requests: + cpu: "10m" + memory: "32Mi" + limits: + cpu: "100m" + memory: "64Mi" + {{- if include "tracebloc.useImagePullSecrets" . }} + imagePullSecrets: + - name: {{ include "tracebloc.registrySecretName" . }} + {{- end }} +{{- end }} diff --git a/client/tests/egress_enforcement_check_test.yaml b/client/tests/egress_enforcement_check_test.yaml new file mode 100644 index 0000000..13d405a --- /dev/null +++ b/client/tests/egress_enforcement_check_test.yaml @@ -0,0 +1,104 @@ +suite: Egress-lockdown enforcement pre-flight hook +# SECURITY §8.2 / client-runtime#104. A post-install/post-upgrade hook that, when +# the lockdown is enabled (allowExternalHttps=false), runs a training-labelled probe +# to verify the CNI actually blocks egress — and warns (non-blocking) if it doesn't. +# Guards: renders ONLY when the lockdown is on + a probe host is set; never otherwise. 
+templates: + - templates/egress-enforcement-check.yaml +set: + clientId: "test-id" + clientPassword: "test" +tests: + - it: does NOT render by default (lockdown off — allowExternalHttps defaults true) + asserts: + - hasDocuments: + count: 0 + + - it: does NOT render when training NetworkPolicy is disabled + set: + networkPolicy: + training: + enabled: false + allowExternalHttps: false + asserts: + - hasDocuments: + count: 0 + + - it: does NOT render when the probe host is empty (disabled, e.g. air-gapped) + set: + networkPolicy: + training: + allowExternalHttps: false + enforcementProbeHost: "" + asserts: + - hasDocuments: + count: 0 + + - it: renders as a post-install/post-upgrade hook Job when the lockdown is enabled + set: + networkPolicy: + training: + allowExternalHttps: false + asserts: + - hasDocuments: + count: 1 + - isKind: + of: Job + - equal: + path: metadata.annotations["helm.sh/hook"] + value: post-install,post-upgrade + - equal: + path: metadata.annotations["helm.sh/hook-delete-policy"] + value: before-hook-creation,hook-succeeded + + - it: probe pod is training-labelled (so the lockdown netpol governs it) and PSA-restricted + set: + networkPolicy: + training: + allowExternalHttps: false + asserts: + - equal: + path: spec.template.metadata.labels["tracebloc.io/workload"] + value: training + - equal: + path: spec.template.spec.securityContext.runAsNonRoot + value: true + - equal: + path: spec.template.spec.securityContext.runAsUser + value: 100 + - equal: + path: spec.template.spec.containers[0].securityContext.readOnlyRootFilesystem + value: true + - contains: + path: spec.template.spec.containers[0].securityContext.capabilities.drop + content: "ALL" + - equal: + path: spec.template.spec.automountServiceAccountToken + value: false + + - it: probes the configured host directly (no proxy) and is non-blocking + set: + networkPolicy: + training: + allowExternalHttps: false + enforcementProbeHost: "canary.example.net" + asserts: + - matchRegex: + path: 
spec.template.spec.containers[0].command[2] + pattern: "HOST=.?canary\\.example\\.net" + - matchRegex: + path: spec.template.spec.containers[0].command[2] + pattern: "curl --noproxy '\\*'" + - matchRegex: + path: spec.template.spec.containers[0].command[2] + pattern: "exit 0" + + - it: pins the curl probe image by digest + set: + networkPolicy: + training: + allowExternalHttps: false + asserts: + - matchRegex: + path: spec.template.spec.containers[0].image + pattern: "^docker\\.io/curlimages/curl@sha256:[a-f0-9]{64}$" diff --git a/client/values.schema.json b/client/values.schema.json index 2eb42dc..68b684c 100644 --- a/client/values.schema.json +++ b/client/values.schema.json @@ -226,6 +226,10 @@ "default": true, "description": "When false, drop the 0.0.0.0/0:443 egress rule so training pods reach only DNS, MySQL, requests-proxy and the egress gateway (SECURITY §8.2 / client-runtime#102). Default true keeps existing behaviour; flip per-fleet after verifying the egress gateway works (G2)." }, + "enforcementProbeHost": { + "type": "string", + "description": "Host the enforcement pre-flight hook curls directly (when allowExternalHttps=false) to verify the CNI actually blocks egress; non-enforcement logs a non-blocking warning (client-runtime#104). Empty string disables the check (e.g. air-gapped clusters)." + }, "dnsNamespace": { "type": "string", "default": "kube-system", diff --git a/client/values.yaml b/client/values.yaml index 9565b1f..e655cee 100644 --- a/client/values.yaml +++ b/client/values.yaml @@ -176,6 +176,11 @@ networkPolicy: # nil-guards this key, so a `helm upgrade --reuse-values` from a release # predating it keeps the old behaviour (rule present). allowExternalHttps: true + # client-runtime#104: deploy-time enforcement pre-flight. 
When the lockdown is + # enabled (allowExternalHttps=false), a post-upgrade hook curls this host directly + # from a training-labelled pod; if it's reachable the CNI isn't enforcing egress + # and the hook logs a loud, non-blocking warning. Set "" to disable (air-gapped). + enforcementProbeHost: "1.1.1.1" dnsNamespace: kube-system # CoreDNS pod selector — varies per platform. Override in ci/-values.yaml. # When empty, the template falls back to {k8s-app: kube-dns}, which works From 00104c26bdf50eb53a2a14143067b9fc1988ebeb Mon Sep 17 00:00:00 2001 From: Asad Iqbal Date: Fri, 12 Jun 2026 20:09:27 +0500 Subject: [PATCH 2/2] =?UTF-8?q?fix(egress-proxy):=20address=20Bugbot=20?= =?UTF-8?q?=E2=80=94=20make=20the=20enforcement=20check=20a=20non-blocking?= =?UTF-8?q?=20helm=20test=20[#104]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Gate default for enforcementProbeHost "" -> "1.1.1.1" (matches values.yaml + $host), so `helm upgrade --reuse-values` from a release predating the key still runs the check instead of silently skipping it. Explicit "" still disables it. - Convert the post-install/post-upgrade hook to a `helm test` hook. A post-upgrade hook fails the release if the probe pod can't run (image pull / PSA / OOM) regardless of the script's exit 0 — which would block the hourly auto-upgrade (not truly non-blocking). As a test hook it never runs during install/upgrade, so it can't block them, and it FAILS (exit 1) on non-enforcement — run `helm test <release>` after flipping the lockdown for a clear pass/fail. helm-unittest 248/248; lint clean. 
Co-Authored-By: Claude Opus 4.8 --- .../templates/egress-enforcement-check.yaml | 28 +++++++++---------- .../tests/egress_enforcement_check_test.yaml | 8 +++--- client/values.schema.json | 2 +- client/values.yaml | 9 +++--- 4 files changed, 24 insertions(+), 23 deletions(-) diff --git a/client/templates/egress-enforcement-check.yaml b/client/templates/egress-enforcement-check.yaml index f691966..c106f4b 100644 --- a/client/templates/egress-enforcement-check.yaml +++ b/client/templates/egress-enforcement-check.yaml @@ -1,14 +1,15 @@ -{{- if and (default dict .Values.networkPolicy.training).enabled (not (dig "allowExternalHttps" true .Values.networkPolicy.training)) (dig "enforcementProbeHost" "" .Values.networkPolicy.training) }} +{{- if and (default dict .Values.networkPolicy.training).enabled (not (dig "allowExternalHttps" true .Values.networkPolicy.training)) (dig "enforcementProbeHost" "1.1.1.1" .Values.networkPolicy.training) }} {{- /* - Egress-lockdown enforcement pre-flight (SECURITY §8.2 / client-runtime#104). + Egress-lockdown enforcement check (SECURITY §8.2 / client-runtime#104). Renders ONLY when the lockdown is enabled (allowExternalHttps=false) and a probe - host is set. A post-install/post-upgrade hook runs a `tracebloc.io/workload: - training`-labelled pod (so the training-egress NetworkPolicy governs it) that - curls a canary external host DIRECTLY: blocked => the CNI enforces egress (good); - reachable => NOT enforced => the lockdown is a silent no-op. NON-BLOCKING: the - probe always exits 0 and just logs a loud warning, so it never fails an - install/upgrade (including the hourly auto-upgrade). Set enforcementProbeHost="" - to disable (e.g. air-gapped clusters with no external host to test against). + host is set. This is a `helm test` hook — run `helm test <release>` after flipping 
A `tracebloc.io/workload: training`-labelled pod (so the + training-egress NetworkPolicy governs it) curls a canary external host DIRECTLY: + blocked => the CNI enforces egress (test PASSES); reachable => NOT enforced => the + lockdown is a silent no-op (test FAILS, exit 1). Because it's a test hook it never + runs during install/upgrade, so it can NEVER block them or the hourly auto-upgrade. + The probe-host default is 1.1.1.1; set enforcementProbeHost="" to disable (e.g. + air-gapped clusters with no external host to test against). */ -}} {{- $host := dig "enforcementProbeHost" "1.1.1.1" .Values.networkPolicy.training }} apiVersion: batch/v1 @@ -19,8 +20,7 @@ metadata: labels: {{- include "tracebloc.labels" . | nindent 4 }} annotations: - "helm.sh/hook": post-install,post-upgrade - "helm.sh/hook-weight": "5" + "helm.sh/hook": test "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded spec: backoffLimit: 0 @@ -65,11 +65,11 @@ spec: echo "WARNING the SECURITY §8.2 lockdown is INACTIVE and training pods can" echo "WARNING still reach the open internet." echo "WARNING Fix: enable egress NetworkPolicy on the CNI (Calico/Cilium, or" - echo "WARNING EKS VPC-CNI enableNetworkPolicy=true), then re-run the upgrade." + echo "WARNING EKS VPC-CNI enableNetworkPolicy=true), then re-run 'helm test'." echo "WARNING ================================================================" - else - echo "OK egress lockdown verified: direct external egress is blocked (curl -> ${code:-blocked})." + exit 1 fi + echo "OK egress lockdown verified: direct external egress is blocked (curl -> ${code:-blocked})." 
exit 0 resources: requests: diff --git a/client/tests/egress_enforcement_check_test.yaml b/client/tests/egress_enforcement_check_test.yaml index 13d405a..90c6fb9 100644 --- a/client/tests/egress_enforcement_check_test.yaml +++ b/client/tests/egress_enforcement_check_test.yaml @@ -34,7 +34,7 @@ tests: - hasDocuments: count: 0 - - it: renders as a post-install/post-upgrade hook Job when the lockdown is enabled + - it: renders as a helm test hook Job when the lockdown is enabled set: networkPolicy: training: @@ -46,7 +46,7 @@ tests: of: Job - equal: path: metadata.annotations["helm.sh/hook"] - value: post-install,post-upgrade + value: test - equal: path: metadata.annotations["helm.sh/hook-delete-policy"] value: before-hook-creation,hook-succeeded @@ -76,7 +76,7 @@ tests: path: spec.template.spec.automountServiceAccountToken value: false - - it: probes the configured host directly (no proxy) and is non-blocking + - it: probes the configured host directly (no proxy) and fails the test on non-enforcement set: networkPolicy: training: @@ -91,7 +91,7 @@ tests: pattern: "curl --noproxy '\\*'" - matchRegex: path: spec.template.spec.containers[0].command[2] - pattern: "exit 0" + pattern: "exit 1" - it: pins the curl probe image by digest set: diff --git a/client/values.schema.json b/client/values.schema.json index 68b684c..9f4048c 100644 --- a/client/values.schema.json +++ b/client/values.schema.json @@ -228,7 +228,7 @@ }, "enforcementProbeHost": { "type": "string", - "description": "Host the enforcement pre-flight hook curls directly (when allowExternalHttps=false) to verify the CNI actually blocks egress; non-enforcement logs a non-blocking warning (client-runtime#104). Empty string disables the check (e.g. air-gapped clusters)." + "description": "Host the `helm test` enforcement check curls directly (when allowExternalHttps=false) to verify the CNI blocks egress; non-enforcement fails the test (a test hook never affects install/upgrade) — client-runtime#104. 
Empty string disables it (e.g. air-gapped clusters)." }, "dnsNamespace": { "type": "string", diff --git a/client/values.yaml b/client/values.yaml index e655cee..b9a07af 100644 --- a/client/values.yaml +++ b/client/values.yaml @@ -176,10 +176,11 @@ networkPolicy: # nil-guards this key, so a `helm upgrade --reuse-values` from a release # predating it keeps the old behaviour (rule present). allowExternalHttps: true - # client-runtime#104: deploy-time enforcement pre-flight. When the lockdown is - # enabled (allowExternalHttps=false), a post-upgrade hook curls this host directly - # from a training-labelled pod; if it's reachable the CNI isn't enforcing egress - # and the hook logs a loud, non-blocking warning. Set "" to disable (air-gapped). + # client-runtime#104: enforcement check, run via `helm test <release>` after + # flipping the lockdown (allowExternalHttps=false). The test curls this host + # directly from a training-labelled pod; reachable => the CNI isn't enforcing + # egress => the test FAILS (a test hook never affects install/upgrade). Set "" + # to disable (e.g. air-gapped clusters with no external host to test against). enforcementProbeHost: "1.1.1.1" dnsNamespace: kube-system # CoreDNS pod selector — varies per platform. Override in ci/-values.yaml.