diff --git a/client/Chart.yaml b/client/Chart.yaml index 9c8a3f7..0a09c80 100644 --- a/client/Chart.yaml +++ b/client/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: client description: A unified Helm chart for tracebloc on AKS, EKS, bare-metal, and OpenShift type: application -version: 1.7.0 -appVersion: "1.7.0" +version: 1.7.1 +appVersion: "1.7.1" keywords: - tracebloc - kubernetes diff --git a/client/templates/egress-enforcement-check.yaml b/client/templates/egress-enforcement-check.yaml new file mode 100644 index 0000000..732c8d6 --- /dev/null +++ b/client/templates/egress-enforcement-check.yaml @@ -0,0 +1,106 @@ +{{- if and (default dict .Values.networkPolicy.training).enabled (not (dig "allowExternalHttps" true (default dict .Values.networkPolicy.training))) (dig "enforcementProbeHost" "1.1.1.1" (default dict .Values.networkPolicy.training)) }} +{{- /* + Egress-lockdown enforcement check (SECURITY §8.2 / client-runtime#104). + Renders ONLY when the lockdown is enabled (allowExternalHttps=false) and a probe + host is set (a nil networkPolicy.training is read as an empty map in every operand of the guard, so nothing renders and `dig` never sees nil). This is a `helm test` hook — run `helm test <release>` after flipping + the lockdown to verify it. A `tracebloc.io/workload: training`-labelled pod (so the + training-egress NetworkPolicy governs it) opens a DIRECT TCP connection to a canary + host's :443 (the TLS/HTTP outcome is ignored — we only test reachability): connection + blocked => the CNI enforces egress (test PASSES); connection established => NOT + enforced => the lockdown is a silent no-op (test FAILS, exit 1). Because it's a test + hook it never + runs during install/upgrade, so it can NEVER block them or the hourly auto-upgrade. + The probe-host default is 1.1.1.1; set enforcementProbeHost="" to disable (e.g. + air-gapped clusters with no external host to test against). 
+*/ -}} +{{- $host := dig "enforcementProbeHost" "1.1.1.1" .Values.networkPolicy.training }} +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ .Release.Name }}-egress-enforcement-check + namespace: {{ .Release.Namespace }} + labels: + {{- include "tracebloc.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 120 + template: + metadata: + labels: + {{- include "tracebloc.selectorLabels" . | nindent 8 }} + tracebloc.io/workload: training + spec: + restartPolicy: Never + automountServiceAccountToken: false + securityContext: + runAsNonRoot: true + # curlimages/curl's default user is non-numeric (curl_user); runAsNonRoot + # can't verify that, so pin the image's uid explicitly. + runAsUser: 100 + seccompProfile: + type: RuntimeDefault + containers: + - name: probe + image: {{ include "tracebloc.image" (dict "repository" "curlimages/curl" "tag" "8.20.0" "digest" "sha256:b3f1fb2a51d923260350d21b8654bbc607164a987e2f7c84a0ac199a67df812a" "registry" "docker.io") | quote }} + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + readOnlyRootFilesystem: true + command: + - sh + - -c + - | + HOST={{ $host | quote }} + echo "[egress-enforcement-check] probing direct TCP egress to ${HOST}:443 (must be BLOCKED when the lockdown is enforced)..." + # We only care whether a TCP connection to :443 can be ESTABLISHED — the + # TLS/HTTP outcome is irrelevant. -k disables cert verification so probing an + # IP (or any host whose cert doesn't match) is NOT misread as a block, and the + # verdict is keyed on curl's EXIT CODE, not the HTTP status: a blocked egress + # yields a timeout (28) or connect failure (7); a DNS failure (6) is inconclusive; + # any other outcome (incl. a TLS/cert error) means the TCP connect already succeeded. 
+ tc=$(curl --noproxy '*' -k -sS -m 5 -o /dev/null -w '%{time_connect}' "https://$HOST"); rc=$?; case "$tc" in *[1-9]*) connected=yes ;; *) connected=no ;; esac + case "$rc:$connected" in + 7:*|28:no) + # connect refused (7) / timed out before the TCP connect completed (28 with time_connect still 0 — -m bounds the WHOLE transfer, so a 28 that fires after the connect is NOT a block): the TCP handshake never completed => egress blocked. + echo "OK egress lockdown verified: TCP to ${HOST}:443 could not be established (curl exit $rc) — egress is blocked." + exit 0 + ;; + 6:*) + # DNS resolution failed: no TCP connection was attempted, so enforcement can't be judged. + echo "WARNING egress enforcement INCONCLUSIVE: could not resolve host '${HOST}' (curl exit 6)." + echo "WARNING No TCP connection was attempted, so the lockdown was NOT verified. Point" + echo "WARNING networkPolicy.training.enforcementProbeHost at an external IP (e.g. 1.1.1.1)" + echo "WARNING or fix in-cluster DNS, then re-run 'helm test'." + exit 1 + ;; + esac + # Any other outcome (0 success, a timeout that fired AFTER the connect, or a TLS-/HTTP-layer error such as 35/52/56/60 — all of + # which require the TCP connect to have already succeeded) => the host was reached. + echo "WARNING ================================================================" + echo "WARNING EGRESS LOCKDOWN NOT ENFORCED on this cluster." + echo "WARNING A tracebloc.io/workload=training pod reached ${HOST}:443 directly" + echo "WARNING (curl exit $rc; the TCP connect succeeded). networkPolicy.training." + echo "WARNING allowExternalHttps is false, but the CNI is NOT enforcing egress" + echo "WARNING NetworkPolicy, so the SECURITY §8.2 lockdown is INACTIVE and" + echo "WARNING training pods can still reach the open internet." + echo "WARNING Fix: enable egress NetworkPolicy on the CNI (Calico/Cilium, or" + echo "WARNING EKS VPC-CNI enableNetworkPolicy=true), then re-run 'helm test'." + echo "WARNING ================================================================" + exit 1 + resources: + requests: + cpu: "10m" + memory: "32Mi" + limits: + cpu: "100m" + memory: "64Mi" + {{- if include "tracebloc.useImagePullSecrets" . }} + imagePullSecrets: + - name: {{ include "tracebloc.registrySecretName" . 
}} + {{- end }} +{{- end }} diff --git a/client/tests/egress_enforcement_check_test.yaml b/client/tests/egress_enforcement_check_test.yaml new file mode 100644 index 0000000..d17b910 --- /dev/null +++ b/client/tests/egress_enforcement_check_test.yaml @@ -0,0 +1,120 @@ +suite: Egress-lockdown enforcement pre-flight hook +# SECURITY §8.2 / client-runtime#104. A `helm test` hook (never run on install/upgrade) that, when +# the lockdown is enabled (allowExternalHttps=false), runs a training-labelled probe +# to verify the CNI actually blocks egress — and FAILS the test (exit 1) if it doesn't. +# Guards: renders ONLY when the lockdown is on + a probe host is set; never otherwise. +templates: + - templates/egress-enforcement-check.yaml +set: + clientId: "test-id" + clientPassword: "test" +tests: + - it: does NOT render by default (lockdown off — allowExternalHttps defaults true) + asserts: + - hasDocuments: + count: 0 + + - it: does NOT render when training NetworkPolicy is disabled + set: + networkPolicy: + training: + enabled: false + allowExternalHttps: false + asserts: + - hasDocuments: + count: 0 + + - it: does NOT render when the probe host is empty (disabled, e.g. 
air-gapped) + set: + networkPolicy: + training: + allowExternalHttps: false + enforcementProbeHost: "" + asserts: + - hasDocuments: + count: 0 + + - it: renders as a helm test hook Job when the lockdown is enabled + set: + networkPolicy: + training: + allowExternalHttps: false + asserts: + - hasDocuments: + count: 1 + - isKind: + of: Job + - equal: + path: metadata.annotations["helm.sh/hook"] + value: test + - equal: + path: metadata.annotations["helm.sh/hook-delete-policy"] + value: before-hook-creation,hook-succeeded + + - it: probe pod is training-labelled (so the lockdown netpol governs it) and PSA-restricted + set: + networkPolicy: + training: + allowExternalHttps: false + asserts: + - equal: + path: spec.template.metadata.labels["tracebloc.io/workload"] + value: training + - equal: + path: spec.template.spec.securityContext.runAsNonRoot + value: true + - equal: + path: spec.template.spec.securityContext.runAsUser + value: 100 + - equal: + path: spec.template.spec.containers[0].securityContext.readOnlyRootFilesystem + value: true + - contains: + path: spec.template.spec.containers[0].securityContext.capabilities.drop + content: "ALL" + - equal: + path: spec.template.spec.automountServiceAccountToken + value: false + + - it: keys the verdict off TCP reachability (curl exit code), not the TLS/HTTP outcome, and fails on non-enforcement + set: + networkPolicy: + training: + allowExternalHttps: false + enforcementProbeHost: "canary.example.net" + asserts: + - matchRegex: + path: spec.template.spec.containers[0].command[2] + pattern: "HOST=.?canary\\.example\\.net" + # -k: a cert mismatch (e.g. probing an IP) must NOT be misread as a block + - matchRegex: + path: spec.template.spec.containers[0].command[2] + pattern: "curl --noproxy '\\*' -k" + # verdict comes from curl's exit code (TCP reachability)... + - matchRegex: + path: spec.template.spec.containers[0].command[2] + pattern: "rc=\\$\\?" 
+ # a DNS failure (curl exit 6) is reported as inconclusive, not as "TCP connect succeeded" + - matchRegex: + path: spec.template.spec.containers[0].command[2] + pattern: "could not resolve host" + # ...NOT the HTTP status — the old logic that conflated a TLS failure (code 000) with a block + - notMatchRegex: + path: spec.template.spec.containers[0].command[2] + pattern: "http_code" + - notMatchRegex: + path: spec.template.spec.containers[0].command[2] + pattern: '"000"' + - matchRegex: + path: spec.template.spec.containers[0].command[2] + pattern: "exit 1" + + - it: pins the curl probe image by digest + set: + networkPolicy: + training: + allowExternalHttps: false + asserts: + - matchRegex: + path: spec.template.spec.containers[0].image + pattern: "^docker\\.io/curlimages/curl@sha256:[a-f0-9]{64}$" diff --git a/client/tests/logs_pvc_test.yaml b/client/tests/logs_pvc_test.yaml new file mode 100644 index 0000000..5ac4fd2 --- /dev/null +++ b/client/tests/logs_pvc_test.yaml @@ -0,0 +1,197 @@ +suite: Client Logs PVC +templates: + - templates/logs-pvc.yaml +set: + clientId: "test-id" + clientPassword: "test" + dockerRegistry: + server: https://index.docker.io/v1/ + username: test + password: test + email: test@test.com +tests: + # --- default (dynamic provisioning, no hostPath) --- + - it: should render only the PVC when hostPath is disabled + set: + hostPath: + enabled: false + asserts: + - hasDocuments: + count: 1 + - isKind: + of: PersistentVolumeClaim + + - it: should name the PVC client-logs-pvc + set: + hostPath: + enabled: false + asserts: + - equal: + path: metadata.name + value: client-logs-pvc + + - it: should carry the keep resource-policy annotation on the PVC + set: + hostPath: + enabled: false + asserts: + - equal: + path: metadata.annotations["helm.sh/resource-policy"] + value: keep + + - it: should default the PVC access mode to ReadWriteMany + set: + hostPath: + enabled: false + asserts: + - equal: + path: spec.accessModes[0] + value: ReadWriteMany + + 
- it: should honour pvcAccessMode override on the PVC + set: + hostPath: + enabled: false + pvcAccessMode: ReadWriteOnce + asserts: + - equal: + path: spec.accessModes[0] + value: ReadWriteOnce + + - it: should request the default 10Gi of storage on the PVC + set: + hostPath: + enabled: false + asserts: + - equal: + path: spec.resources.requests.storage + value: 10Gi + + - it: should honour pvc.logs storage override on the PVC + set: + hostPath: + enabled: false + pvc: + logs: 25Gi + asserts: + - equal: + path: spec.resources.requests.storage + value: 25Gi + + - it: should use the release-unique storage class when storageClass.create is true + set: + hostPath: + enabled: false + storageClass: + create: true + release: + name: my-release + asserts: + - equal: + path: spec.storageClassName + value: my-release-storage-class + + - it: should use the provided storage class name when storageClass.create is false + set: + hostPath: + enabled: false + storageClass: + create: false + name: existing-sc + asserts: + - equal: + path: spec.storageClassName + value: existing-sc + + # --- hostPath enabled (bare-metal: PV + PVC) --- + - it: should render both PV and PVC when hostPath is enabled + set: + hostPath: + enabled: true + asserts: + - hasDocuments: + count: 2 + - isKind: + of: PersistentVolume + documentIndex: 0 + - isKind: + of: PersistentVolumeClaim + documentIndex: 1 + + - it: should name the PV -logs-pv + set: + hostPath: + enabled: true + release: + name: my-release + asserts: + - equal: + path: metadata.name + value: my-release-logs-pv + documentIndex: 0 + + - it: should bind the PV to the logs PVC via claimRef + set: + hostPath: + enabled: true + release: + name: my-release + namespace: tracebloc + asserts: + - equal: + path: spec.claimRef.name + value: client-logs-pvc + documentIndex: 0 + - equal: + path: spec.claimRef.namespace + value: tracebloc + documentIndex: 0 + + - it: should set the PV hostPath to a release-scoped logs directory created on demand + set: + 
hostPath: + enabled: true + release: + name: my-release + asserts: + - equal: + path: spec.hostPath.path + value: /tracebloc/my-release/logs + documentIndex: 0 + - equal: + path: spec.hostPath.type + value: DirectoryOrCreate + documentIndex: 0 + + - it: should expose the PV as ReadWriteOnce with matching capacity + set: + hostPath: + enabled: true + pvc: + logs: 15Gi + asserts: + - equal: + path: spec.accessModes[0] + value: ReadWriteOnce + documentIndex: 0 + - equal: + path: spec.capacity.storage + value: 15Gi + documentIndex: 0 + + - it: should share the storage class between the PV and PVC + set: + hostPath: + enabled: true + storageClass: + create: false + name: existing-sc + asserts: + - equal: + path: spec.storageClassName + value: existing-sc + documentIndex: 0 + - equal: + path: spec.storageClassName + value: existing-sc + documentIndex: 1 diff --git a/client/tests/mysql_storage_pvc_test.yaml b/client/tests/mysql_storage_pvc_test.yaml new file mode 100644 index 0000000..e69e884 --- /dev/null +++ b/client/tests/mysql_storage_pvc_test.yaml @@ -0,0 +1,177 @@ +suite: MySQL Storage PVC +# Covers templates/mysql-storage-pvc.yaml — the PersistentVolumeClaim (and, +# on hostPath/bare-metal, the paired PersistentVolume) backing the per-cluster +# MySQL state store. Previously untested. 
Locks down: +# - the dynamic-PVC-only path (hostPath.enabled=false, the managed default) +# - the hostPath PV+PVC pair (bare-metal) and its claimRef binding +# - the "helm.sh/resource-policy: keep" annotation that protects the state +# store from accidental deletion on helm uninstall/upgrade +# - access-mode defaulting and pvc size / storageClass wiring +templates: + - templates/mysql-storage-pvc.yaml +release: + name: stg + namespace: tracebloc +set: + clientId: "test-id" + clientPassword: "test" +tests: + # --- dynamic PVC only (managed clusters, the default) --- + - it: renders only the PVC when hostPath.enabled is false (default) + set: + hostPath: + enabled: false + asserts: + - hasDocuments: + count: 1 + - isKind: + of: PersistentVolumeClaim + + - it: PVC has the expected name and namespace + set: + hostPath: + enabled: false + asserts: + - equal: + path: metadata.name + value: mysql-pvc + - equal: + path: metadata.namespace + value: tracebloc + + - it: PVC defaults to ReadWriteMany access mode + set: + hostPath: + enabled: false + asserts: + - equal: + path: spec.accessModes[0] + value: ReadWriteMany + + - it: PVC requests the default 2Gi of storage + set: + hostPath: + enabled: false + asserts: + - equal: + path: spec.resources.requests.storage + value: 2Gi + + - it: PVC carries the resource-policy keep annotation so the state store survives uninstall + set: + hostPath: + enabled: false + asserts: + - equal: + path: metadata.annotations["helm.sh/resource-policy"] + value: keep + + - it: PVC uses the release-scoped StorageClass when storageClass.create is true (default) + set: + hostPath: + enabled: false + storageClass: + create: true + asserts: + - equal: + path: spec.storageClassName + value: stg-storage-class + + - it: PVC uses the provided StorageClass name when storageClass.create is false + set: + hostPath: + enabled: false + storageClass: + create: false + name: existing-sc + asserts: + - equal: + path: spec.storageClassName + value: existing-sc + + 
- it: PVC honours an explicit pvcAccessMode override + set: + hostPath: + enabled: false + pvcAccessMode: ReadWriteOnce + asserts: + - equal: + path: spec.accessModes[0] + value: ReadWriteOnce + + - it: PVC honours a custom mysql storage size + set: + hostPath: + enabled: false + pvc: + mysql: 5Gi + asserts: + - equal: + path: spec.resources.requests.storage + value: 5Gi + + # --- hostPath PV + PVC pair (bare-metal) --- + - it: renders PV + PVC when hostPath.enabled is true + set: + hostPath: + enabled: true + asserts: + - hasDocuments: + count: 2 + - isKind: + of: PersistentVolume + documentIndex: 0 + - isKind: + of: PersistentVolumeClaim + documentIndex: 1 + + - it: PV has the release-scoped name + set: + hostPath: + enabled: true + documentIndex: 0 + asserts: + - equal: + path: metadata.name + value: stg-mysql-pv + + - it: PV is backed by the fixed release-scoped hostPath with DirectoryOrCreate + set: + hostPath: + enabled: true + documentIndex: 0 + asserts: + - equal: + path: spec.hostPath.path + value: /tracebloc/stg/mysql + - equal: + path: spec.hostPath.type + value: DirectoryOrCreate + + - it: PV is hard-bound to the PVC via claimRef + set: + hostPath: + enabled: true + documentIndex: 0 + asserts: + - equal: + path: spec.claimRef.name + value: mysql-pvc + - equal: + path: spec.claimRef.namespace + value: tracebloc + + - it: PV uses ReadWriteOnce and advertises the configured capacity + set: + hostPath: + enabled: true + pvc: + mysql: 8Gi + documentIndex: 0 + asserts: + - equal: + path: spec.accessModes[0] + value: ReadWriteOnce + - equal: + path: spec.capacity.storage + value: 8Gi diff --git a/client/values.schema.json b/client/values.schema.json index 2eb42dc..9f4048c 100644 --- a/client/values.schema.json +++ b/client/values.schema.json @@ -226,6 +226,10 @@ "default": true, "description": "When false, drop the 0.0.0.0/0:443 egress rule so training pods reach only DNS, MySQL, requests-proxy and the egress gateway (SECURITY §8.2 / client-runtime#102). 
Default true keeps existing behaviour; flip per-fleet after verifying the egress gateway works (G2)." }, + "enforcementProbeHost": { + "type": "string", + "description": "Host the `helm test` enforcement check curls directly (when allowExternalHttps=false) to verify the CNI blocks egress; non-enforcement fails the test (a test hook never affects install/upgrade) — client-runtime#104. Empty string disables it (e.g. air-gapped clusters)." + }, "dnsNamespace": { "type": "string", "default": "kube-system", diff --git a/client/values.yaml b/client/values.yaml index 9565b1f..ac5cf7d 100644 --- a/client/values.yaml +++ b/client/values.yaml @@ -176,6 +176,12 @@ networkPolicy: # nil-guards this key, so a `helm upgrade --reuse-values` from a release # predating it keeps the old behaviour (rule present). allowExternalHttps: true + # client-runtime#104: enforcement check, run via `helm test <release>` after + # flipping the lockdown (allowExternalHttps=false). The test opens a direct TCP + # connection to this host's :443 from a training-labelled pod; if it connects => + # the CNI isn't enforcing egress => the test FAILS (a test hook never affects + # install/upgrade). Set "" to disable (e.g. air-gapped clusters). + enforcementProbeHost: "1.1.1.1" dnsNamespace: kube-system # CoreDNS pod selector — varies per platform. Override in ci/-values.yaml. # When empty, the template falls back to {k8s-app: kube-dns}, which works