diff --git a/client/Chart.yaml b/client/Chart.yaml index 9c8a3f7..0a09c80 100644 --- a/client/Chart.yaml +++ b/client/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: client description: A unified Helm chart for tracebloc on AKS, EKS, bare-metal, and OpenShift type: application -version: 1.7.0 -appVersion: "1.7.0" +version: 1.7.1 +appVersion: "1.7.1" keywords: - tracebloc - kubernetes diff --git a/client/templates/egress-enforcement-check.yaml b/client/templates/egress-enforcement-check.yaml new file mode 100644 index 0000000..c106f4b --- /dev/null +++ b/client/templates/egress-enforcement-check.yaml @@ -0,0 +1,88 @@ +{{- $training := default dict .Values.networkPolicy.training }} +{{- if and $training.enabled (not (dig "allowExternalHttps" true $training)) (dig "enforcementProbeHost" "1.1.1.1" $training) }} +{{- /* + Egress-lockdown enforcement check (SECURITY §8.2 / client-runtime#104). + Renders ONLY when the lockdown is enabled (allowExternalHttps=false) and a probe + host is set. This is a `helm test` hook — run `helm test <release>` after flipping + the lockdown to verify it. A `tracebloc.io/workload: training`-labelled pod (so the + training-egress NetworkPolicy governs it) curls a canary external host DIRECTLY: + blocked => the CNI enforces egress (test PASSES); reachable => NOT enforced => the + lockdown is a silent no-op (test FAILS, exit 1). Because it's a test hook it never + runs during install/upgrade, so it can NEVER block them or the hourly auto-upgrade. + The probe-host default is 1.1.1.1; set enforcementProbeHost="" to disable (e.g. + air-gapped clusters with no external host to test against). + Every lookup goes through $training (the nil-guarded networkPolicy.training map), + so a missing/null training block is never passed to `dig` as nil. +*/ -}} +{{- $host := dig "enforcementProbeHost" "1.1.1.1" $training }} +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ .Release.Name }}-egress-enforcement-check + namespace: {{ .Release.Namespace }} + labels: + {{- include "tracebloc.labels" .
| nindent 4 }} + annotations: + "helm.sh/hook": test + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 120 + template: + metadata: + labels: + {{- include "tracebloc.selectorLabels" . | nindent 8 }} + tracebloc.io/workload: training + spec: + restartPolicy: Never + automountServiceAccountToken: false + securityContext: + runAsNonRoot: true + # curlimages/curl's default user is non-numeric (curl_user); runAsNonRoot + # can't verify that, so pin the image's uid explicitly. + runAsUser: 100 + seccompProfile: + type: RuntimeDefault + containers: + - name: probe + image: {{ include "tracebloc.image" (dict "repository" "curlimages/curl" "tag" "8.20.0" "digest" "sha256:b3f1fb2a51d923260350d21b8654bbc607164a987e2f7c84a0ac199a67df812a" "registry" "docker.io") | quote }} + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + readOnlyRootFilesystem: true + command: + - sh + - -c + - | + HOST={{ $host | quote }} + echo "[egress-enforcement-check] probing direct egress to https://$HOST (must be BLOCKED when the lockdown is enforced)..." + code=$(curl --noproxy '*' -sk -m 5 -o /dev/null -w '%{http_code}' "https://$HOST" 2>/dev/null || true) # -k: reachability probe only; a cert-verification failure must not report 000 and pass as blocked + if [ -n "$code" ] && [ "$code" != "000" ]; then + echo "WARNING ================================================================" + echo "WARNING EGRESS LOCKDOWN NOT ENFORCED on this cluster." + echo "WARNING A tracebloc.io/workload=training pod reached https://$HOST" + echo "WARNING directly (HTTP $code). networkPolicy.training.allowExternalHttps" + echo "WARNING is false, but the CNI is NOT enforcing egress NetworkPolicy, so" + echo "WARNING the SECURITY §8.2 lockdown is INACTIVE and training pods can" + echo "WARNING still reach the open internet." + echo "WARNING Fix: enable egress NetworkPolicy on the CNI (Calico/Cilium, or" + echo "WARNING EKS VPC-CNI enableNetworkPolicy=true), then re-run 'helm test'."
+ echo "WARNING ================================================================" + exit 1 + fi + echo "OK egress lockdown verified: direct external egress is blocked (curl -> ${code:-blocked})." + exit 0 + resources: + requests: + cpu: "10m" + memory: "32Mi" + limits: + cpu: "100m" + memory: "64Mi" + {{- if include "tracebloc.useImagePullSecrets" . }} + imagePullSecrets: + - name: {{ include "tracebloc.registrySecretName" . }} + {{- end }} +{{- end }} diff --git a/client/tests/egress_enforcement_check_test.yaml b/client/tests/egress_enforcement_check_test.yaml new file mode 100644 index 0000000..90c6fb9 --- /dev/null +++ b/client/tests/egress_enforcement_check_test.yaml @@ -0,0 +1,104 @@ +suite: Egress-lockdown enforcement pre-flight hook +# SECURITY §8.2 / client-runtime#104. A `helm test` hook Job that, when the lockdown +# is enabled (allowExternalHttps=false), runs a training-labelled probe to verify the +# CNI actually blocks egress — the probe exits 1 (the test FAILS) if the host is reachable. +# Guards: renders ONLY when the lockdown is on + a probe host is set; never otherwise. +templates: + - templates/egress-enforcement-check.yaml +set: + clientId: "test-id" + clientPassword: "test" +tests: + - it: does NOT render by default (lockdown off — allowExternalHttps defaults true) + asserts: + - hasDocuments: + count: 0 + + - it: does NOT render when training NetworkPolicy is disabled + set: + networkPolicy: + training: + enabled: false + allowExternalHttps: false + asserts: + - hasDocuments: + count: 0 + + - it: does NOT render when the probe host is empty (disabled, e.g.
air-gapped) + set: + networkPolicy: + training: + allowExternalHttps: false + enforcementProbeHost: "" + asserts: + - hasDocuments: + count: 0 + + - it: renders as a helm test hook Job when the lockdown is enabled + set: + networkPolicy: + training: + allowExternalHttps: false + asserts: + - hasDocuments: + count: 1 + - isKind: + of: Job + - equal: + path: metadata.annotations["helm.sh/hook"] + value: test + - equal: + path: metadata.annotations["helm.sh/hook-delete-policy"] + value: before-hook-creation,hook-succeeded + + - it: probe pod is training-labelled (so the lockdown netpol governs it) and PSA-restricted + set: + networkPolicy: + training: + allowExternalHttps: false + asserts: + - equal: + path: spec.template.metadata.labels["tracebloc.io/workload"] + value: training + - equal: + path: spec.template.spec.securityContext.runAsNonRoot + value: true + - equal: + path: spec.template.spec.securityContext.runAsUser + value: 100 + - equal: + path: spec.template.spec.containers[0].securityContext.readOnlyRootFilesystem + value: true + - contains: + path: spec.template.spec.containers[0].securityContext.capabilities.drop + content: "ALL" + - equal: + path: spec.template.spec.automountServiceAccountToken + value: false + + - it: probes the configured host directly (no proxy) and fails the test on non-enforcement + set: + networkPolicy: + training: + allowExternalHttps: false + enforcementProbeHost: "canary.example.net" + asserts: + - matchRegex: + path: spec.template.spec.containers[0].command[2] + pattern: "HOST=.?canary\\.example\\.net" + - matchRegex: + path: spec.template.spec.containers[0].command[2] + pattern: "curl --noproxy '\\*'" + - matchRegex: + path: spec.template.spec.containers[0].command[2] + pattern: "exit 1" + + - it: pins the curl probe image by digest + set: + networkPolicy: + training: + allowExternalHttps: false + asserts: + - matchRegex: + path: spec.template.spec.containers[0].image + pattern:
"^docker\\.io/curlimages/curl@sha256:[a-f0-9]{64}$" diff --git a/client/values.schema.json b/client/values.schema.json index 2eb42dc..9f4048c 100644 --- a/client/values.schema.json +++ b/client/values.schema.json @@ -226,6 +226,11 @@ "default": true, "description": "When false, drop the 0.0.0.0/0:443 egress rule so training pods reach only DNS, MySQL, requests-proxy and the egress gateway (SECURITY §8.2 / client-runtime#102). Default true keeps existing behaviour; flip per-fleet after verifying the egress gateway works (G2)." }, + "enforcementProbeHost": { + "type": "string", + "default": "1.1.1.1", + "description": "Host the `helm test` enforcement check curls directly (when allowExternalHttps=false) to verify the CNI blocks egress; non-enforcement fails the test (a test hook never affects install/upgrade) — client-runtime#104. Empty string disables it (e.g. air-gapped clusters)." + }, "dnsNamespace": { "type": "string", "default": "kube-system", diff --git a/client/values.yaml b/client/values.yaml index 9565b1f..b9a07af 100644 --- a/client/values.yaml +++ b/client/values.yaml @@ -176,6 +176,12 @@ networkPolicy: # nil-guards this key, so a `helm upgrade --reuse-values` from a release # predating it keeps the old behaviour (rule present). allowExternalHttps: true + # client-runtime#104: enforcement check, run via `helm test <release>` after + # flipping the lockdown (allowExternalHttps=false). The test curls this host + # directly from a training-labelled pod; reachable => the CNI isn't enforcing + # egress => the test FAILS (a test hook never affects install/upgrade). Set "" + # to disable (e.g. air-gapped clusters with no external host to test against). + enforcementProbeHost: "1.1.1.1" dnsNamespace: kube-system # CoreDNS pod selector — varies per platform. Override in ci/-values.yaml. # When empty, the template falls back to {k8s-app: kube-dns}, which works