From 028e9848716c8389acedba1c7b7ea9eaaf4b3ac4 Mon Sep 17 00:00:00 2001 From: Asad Iqbal Date: Fri, 12 Jun 2026 20:34:05 +0500 Subject: [PATCH] fix(egress-proxy): enforcement check probes TCP reachability, not HTTP code [#104] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bugbot (PR #254): the check treated curl HTTP code 000 as proof of a block, but a TCP connection that succeeds (egress OPEN) then fails TLS verification also yields 000 — so `helm test` could pass on a non-enforcing CNI when probing an IP whose cert doesn't validate (the default 1.1.1.1 without -k). It conflated "TLS failed" with "egress blocked", defeating the check. Key the verdict off TCP reachability via curl's exit code instead of the HTTP status, and add -k so TLS is explicitly irrelevant: exit 28 (timeout) / 7 (connect failed) => egress blocked => test PASSES (exit 0); any other outcome (0 success, or a TLS-/HTTP-layer error such as 35/52/60 — all of which require the TCP connect to have already succeeded) => egress reached => NOT enforced => test FAILS (exit 1). Tests assert the verdict keys on the exit code (rc=$?, -k) and guard against a regression to the old http_code/000 logic. helm-unittest 248/248; lint clean. Co-Authored-By: Claude Opus 4.8 --- .../templates/egress-enforcement-check.yaml | 46 +++++++++++-------- .../tests/egress_enforcement_check_test.yaml | 16 ++++++- client/values.yaml | 8 ++-- 3 files changed, 45 insertions(+), 25 deletions(-) diff --git a/client/templates/egress-enforcement-check.yaml b/client/templates/egress-enforcement-check.yaml index c106f4b..ab21b6b 100644 --- a/client/templates/egress-enforcement-check.yaml +++ b/client/templates/egress-enforcement-check.yaml @@ -4,9 +4,11 @@ Renders ONLY when the lockdown is enabled (allowExternalHttps=false) and a probe host is set. This is a `helm test` hook — run `helm test ` after flipping the lockdown to verify it. A `tracebloc.io/workload: training`-labelled pod (so the - training-egress NetworkPolicy governs it) curls a canary external host DIRECTLY: - blocked => the CNI enforces egress (test PASSES); reachable => NOT enforced => the - lockdown is a silent no-op (test FAILS, exit 1). Because it's a test hook it never + training-egress NetworkPolicy governs it) opens a DIRECT TCP connection to a canary + host's :443 (the TLS/HTTP outcome is ignored — we only test reachability): connection + blocked => the CNI enforces egress (test PASSES); connection established => NOT + enforced => the lockdown is a silent no-op (test FAILS, exit 1). Because it's a test + hook it never runs during install/upgrade, so it can NEVER block them or the hourly auto-upgrade. The probe-host default is 1.1.1.1; set enforcementProbeHost="" to disable (e.g. air-gapped clusters with no external host to test against). @@ -54,23 +56,29 @@ spec: - -c - | HOST={{ $host | quote }} - echo "[egress-enforcement-check] probing direct egress to https://$HOST (must be BLOCKED when the lockdown is enforced)..." - code=$(curl --noproxy '*' -s -m 5 -o /dev/null -w '%{http_code}' "https://$HOST" 2>/dev/null || true) - if [ -n "$code" ] && [ "$code" != "000" ]; then - echo "WARNING ================================================================" - echo "WARNING EGRESS LOCKDOWN NOT ENFORCED on this cluster." - echo "WARNING A tracebloc.io/workload=training pod reached https://$HOST" - echo "WARNING directly (HTTP $code). networkPolicy.training.allowExternalHttps" - echo "WARNING is false, but the CNI is NOT enforcing egress NetworkPolicy, so" - echo "WARNING the SECURITY §8.2 lockdown is INACTIVE and training pods can" - echo "WARNING still reach the open internet." - echo "WARNING Fix: enable egress NetworkPolicy on the CNI (Calico/Cilium, or" - echo "WARNING EKS VPC-CNI enableNetworkPolicy=true), then re-run 'helm test'." - echo "WARNING ================================================================" - exit 1 + echo "[egress-enforcement-check] probing direct TCP egress to ${HOST}:443 (must be BLOCKED when the lockdown is enforced)..." + # We only care whether a TCP connection to :443 can be ESTABLISHED — the + # TLS/HTTP outcome is irrelevant. -k disables cert verification so probing an + # IP (or any host whose cert doesn't match) is NOT misread as a block, and the + # verdict is keyed on curl's EXIT CODE, not the HTTP status: a blocked egress + # yields a timeout (28) or connect failure (7), whereas any TLS-/HTTP-layer + # outcome (incl. a cert error) means the TCP connect already succeeded. + curl --noproxy '*' -k -sS -m 5 -o /dev/null "https://$HOST"; rc=$? + if [ "$rc" = "28" ] || [ "$rc" = "7" ]; then + echo "OK egress lockdown verified: TCP to ${HOST}:443 could not be established (curl exit $rc) — egress is blocked." + exit 0 fi - echo "OK egress lockdown verified: direct external egress is blocked (curl -> ${code:-blocked})." - exit 0 + echo "WARNING ================================================================" + echo "WARNING EGRESS LOCKDOWN NOT ENFORCED on this cluster." + echo "WARNING A tracebloc.io/workload=training pod reached ${HOST}:443 directly" + echo "WARNING (curl exit $rc; the TCP connect succeeded). networkPolicy.training." + echo "WARNING allowExternalHttps is false, but the CNI is NOT enforcing egress" + echo "WARNING NetworkPolicy, so the SECURITY §8.2 lockdown is INACTIVE and" + echo "WARNING training pods can still reach the open internet." + echo "WARNING Fix: enable egress NetworkPolicy on the CNI (Calico/Cilium, or" + echo "WARNING EKS VPC-CNI enableNetworkPolicy=true), then re-run 'helm test'." + echo "WARNING ================================================================" + exit 1 resources: requests: cpu: "10m" diff --git a/client/tests/egress_enforcement_check_test.yaml b/client/tests/egress_enforcement_check_test.yaml index 90c6fb9..83841c3 100644 --- a/client/tests/egress_enforcement_check_test.yaml +++ b/client/tests/egress_enforcement_check_test.yaml @@ -76,7 +76,7 @@ tests: path: spec.template.spec.automountServiceAccountToken value: false - - it: probes the configured host directly (no proxy) and fails the test on non-enforcement + - it: keys the verdict off TCP reachability (curl exit code), not the TLS/HTTP outcome, and fails on non-enforcement set: networkPolicy: training: @@ -86,9 +86,21 @@ tests: - matchRegex: path: spec.template.spec.containers[0].command[2] pattern: "HOST=.?canary\\.example\\.net" + # -k: a cert mismatch (e.g. probing an IP) must NOT be misread as a block - matchRegex: path: spec.template.spec.containers[0].command[2] - pattern: "curl --noproxy '\\*'" + pattern: "curl --noproxy '\\*' -k" + # verdict comes from curl's exit code (TCP reachability)... + - matchRegex: + path: spec.template.spec.containers[0].command[2] + pattern: "rc=\\$\\?" + # ...NOT the HTTP status — the old logic that conflated a TLS failure (code 000) with a block + - notMatchRegex: + path: spec.template.spec.containers[0].command[2] + pattern: "http_code" + - notMatchRegex: + path: spec.template.spec.containers[0].command[2] + pattern: '"000"' - matchRegex: path: spec.template.spec.containers[0].command[2] pattern: "exit 1" diff --git a/client/values.yaml b/client/values.yaml index b9a07af..ac5cf7d 100644 --- a/client/values.yaml +++ b/client/values.yaml @@ -177,10 +177,10 @@ networkPolicy: # predating it keeps the old behaviour (rule present). allowExternalHttps: true # client-runtime#104: enforcement check, run via `helm test ` after - # flipping the lockdown (allowExternalHttps=false). The test curls this host - # directly from a training-labelled pod; reachable => the CNI isn't enforcing - # egress => the test FAILS (a test hook never affects install/upgrade). Set "" - # to disable (e.g. air-gapped clusters with no external host to test against). + # flipping the lockdown (allowExternalHttps=false). The test opens a direct TCP + # connection to this host's :443 from a training-labelled pod; if it connects => + # the CNI isn't enforcing egress => the test FAILS (a test hook never affects + # install/upgrade). Set "" to disable (e.g. air-gapped clusters). enforcementProbeHost: "1.1.1.1" dnsNamespace: kube-system # CoreDNS pod selector — varies per platform. Override in ci/-values.yaml.