From 2fb5b8fc1fcbc3f3354d3cb516e1110078100760 Mon Sep 17 00:00:00 2001
From: "Asad Iqbal (Saadi)" <asad.dsoft@gmail.com>
Date: Thu, 11 Jun 2026 13:33:47 +0500
Subject: [PATCH] Merge pull request #247 from
 tracebloc/feat/102-egress-gateway
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

feat(egress-proxy): training-pod egress lockdown — squid gateway, gated rollout (client-runtime#102)
---
 .github/workflows/helm-ci.yaml                |  19 ++
 .github/workflows/installer-tests.yaml        |   4 +-
 .github/workflows/standard-checks.yml         |   2 +-
 client/Chart.yaml                             |   4 +-
 client/templates/egress-proxy-configmap.yaml  |  52 ++++
 client/templates/egress-proxy-deployment.yaml |  88 +++++++
 client/templates/egress-proxy-service.yaml    |  23 ++
 client/templates/jobs-manager-deployment.yaml |   7 +
 client/templates/network-policy-training.yaml |  26 +-
 client/tests/egress_proxy_test.yaml           | 246 ++++++++++++++++++
 client/values.schema.json                     |  48 ++++
 client/values.yaml                            |  46 ++++
 docs/SECURITY.md                              |  20 +-
 scripts/tests/e2e-auto-upgrade.sh             | 147 +++++++++++
 14 files changed, 718 insertions(+), 14 deletions(-)
 create mode 100644 client/templates/egress-proxy-configmap.yaml
 create mode 100644 client/templates/egress-proxy-deployment.yaml
 create mode 100644 client/templates/egress-proxy-service.yaml
 create mode 100644 client/tests/egress_proxy_test.yaml
 create mode 100755 scripts/tests/e2e-auto-upgrade.sh

diff --git a/.github/workflows/helm-ci.yaml b/.github/workflows/helm-ci.yaml
index 1813eeb..cff0662 100644
--- a/.github/workflows/helm-ci.yaml
+++ b/.github/workflows/helm-ci.yaml
@@ -6,12 +6,14 @@ on:
     paths:
       - 'client/**'
       - 'ingestor/**'
+      - 'scripts/tests/e2e-auto-upgrade.sh'
       - '.github/workflows/helm-ci.yaml'
   pull_request:
     branches: [main, develop, openshift]
     paths:
       - 'client/**'
       - 'ingestor/**'
+      - 'scripts/tests/e2e-auto-upgrade.sh'
       - '.github/workflows/helm-ci.yaml'
 
 jobs:
@@ -161,6 +163,23 @@ jobs:
             echo "images.ingestor.digest empty (default) — spawning by floating tag; no pinned digest to check."
           fi
 
+  upgrade-e2e:
+    # Fleet auto-upgrade non-regression gate (client-runtime#102 / #245-class
+    # regressions): installs the LAST PUBLISHED chart from gh-pages on a real
+    # k3d cluster, then upgrades to THIS working tree via both
+    # `--reuse-values` (manual-operator habit; nil-guards must hold, lockdown
+    # must not engage) and `--reset-then-reuse-values` (the auto-upgrade
+    # cronjob's path; new defaults must flow in inert), then flips the #102
+    # egress-lockdown flags and proves the next auto-upgrade preserves them.
+    # Pods are never waited on — published images need real credentials; the
+    # regression class this guards lives in Helm templating/values semantics.
+    name: Fleet auto-upgrade E2E (k3d)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Upgrade from last published release through both flag paths
+        run: bash scripts/tests/e2e-auto-upgrade.sh
+
   # Installer script tests (bats + Pester) + the cross-distro prerequisite matrix
   # live in their own workflow: .github/workflows/installer-tests.yaml
   # (triggered on scripts/** changes).
diff --git a/.github/workflows/installer-tests.yaml b/.github/workflows/installer-tests.yaml
index 95518fe..581f9e2 100644
--- a/.github/workflows/installer-tests.yaml
+++ b/.github/workflows/installer-tests.yaml
@@ -55,11 +55,11 @@ jobs:
           # below for visibility but don't fail the gate.
           shellcheck --severity=error --shell=bash \
             scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \
-            scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh scripts/tests/check-drift.sh
+            scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh scripts/tests/e2e-auto-upgrade.sh scripts/tests/check-drift.sh
           echo "── shellcheck warnings (advisory, non-blocking) ──"
           shellcheck --severity=warning --shell=bash \
             scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \
-            scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh scripts/tests/check-drift.sh || true
+            scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh scripts/tests/e2e-auto-upgrade.sh scripts/tests/check-drift.sh || true
 
       - name: PSScriptAnalyzer (PowerShell installer)
         shell: pwsh
diff --git a/.github/workflows/standard-checks.yml b/.github/workflows/standard-checks.yml
index d5b449c..70ec98e 100644
--- a/.github/workflows/standard-checks.yml
+++ b/.github/workflows/standard-checks.yml
@@ -44,7 +44,7 @@ jobs:
           shellcheck --version | grep version
           shellcheck --severity=error --shell=bash \
             scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \
-            scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh
+            scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh scripts/tests/e2e-auto-upgrade.sh
 
   unit-tests:
     name: Unit tests
diff --git a/client/Chart.yaml b/client/Chart.yaml
index acf7b17..9c8a3f7 100644
--- a/client/Chart.yaml
+++ b/client/Chart.yaml
@@ -2,8 +2,8 @@ apiVersion: v2
 name: client
 description: A unified Helm chart for tracebloc on AKS, EKS, bare-metal, and OpenShift
 type: application
-version: 1.6.1
-appVersion: "1.6.1"
+version: 1.7.0
+appVersion: "1.7.0"
 keywords:
   - tracebloc
   - kubernetes
diff --git a/client/templates/egress-proxy-configmap.yaml b/client/templates/egress-proxy-configmap.yaml
new file mode 100644
index 0000000..5724406
--- /dev/null
+++ b/client/templates/egress-proxy-configmap.yaml
@@ -0,0 +1,52 @@
+{{- if (default dict .Values.egressProxy).enabled }}
+{{- /*
+  Egress gateway (squid) config — SECURITY §8.2 / client-runtime#102.
+  A forward proxy that ONLY permits HTTPS CONNECT to an FQDN allowlist, so that
+  once the training NetworkPolicy drops the 0.0.0.0/0:443 rule, a locked-down
+  training pod can still reach the tracebloc backend + App Insights through this
+  gateway — and nothing else. Service Bus is NOT here; it stays on the
+  requests-proxy:8888 path. The allowlist fails CLOSED: an empty list renders no
+  allow rule, so everything is denied.
+*/ -}}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ .Release.Name }}-egress-proxy
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "tracebloc.labels" . | nindent 4 }}
+data:
+  squid.conf: |
+    visible_hostname tracebloc-egress-proxy
+    http_port {{ .Values.egressProxy.port | default 3128 }}
+
+    # Tunnel-only: no caching, no pid file, logs to the container's std streams —
+    # squid needs no writable paths, so the root filesystem stays read-only.
+    cache deny all
+    cache_store_log none
+    pid_filename none
+    access_log stdio:/dev/stdout
+    cache_log stdio:/dev/stderr
+    # No peers/cache → the ICMP pinger (needs CAP_NET_RAW, which the pod drops) is
+    # just noise/errors. Disable it.
+    pinger_enable off
+
+    acl SSL_ports port 443
+    acl CONNECT method CONNECT
+    {{- with .Values.egressProxy.allowlist }}
+    # dstdomain: a leading dot matches subdomains (e.g. .in.applicationinsights.azure.com),
+    # a bare host is an exact match (e.g. api.tracebloc.io).
+    acl allowed_fqdns dstdomain {{ join " " . }}
+    http_access deny CONNECT !SSL_ports
+    http_access allow CONNECT allowed_fqdns
+    {{- end }}
+    # Fail closed — anything not explicitly allowed above is denied.
+    http_access deny all
+    {{- if .Values.env.HTTP_PROXY_HOST }}
+
+    # Corporate-proxy chaining: when the cluster sits behind a corporate proxy, forward
+    # upstream through it instead of going direct (mirrors tracebloc.proxyEnv). NOTE(review): login= below stores the proxy password in this ConfigMap in clear text.
+    cache_peer {{ .Values.env.HTTP_PROXY_HOST }} parent {{ .Values.env.HTTP_PROXY_PORT | default 8080 }} 0 no-query default{{ if .Values.env.HTTP_PROXY_USERNAME }} login={{ .Values.env.HTTP_PROXY_USERNAME }}:{{ .Values.env.HTTP_PROXY_PASSWORD }}{{ end }}
+    never_direct allow all
+    {{- end }}
+{{- end }}
diff --git a/client/templates/egress-proxy-deployment.yaml b/client/templates/egress-proxy-deployment.yaml
new file mode 100644
index 0000000..b811bf1
--- /dev/null
+++ b/client/templates/egress-proxy-deployment.yaml
@@ -0,0 +1,88 @@
+{{- if (default dict .Values.egressProxy).enabled }}
+{{- /*
+  Egress gateway (squid) — SECURITY §8.2 / client-runtime#102.
+  Carries label app=egress-proxy, NOT tracebloc.io/workload=training, so it is
+  deliberately OUTSIDE the training NetworkPolicy and can egress to the
+  allowlisted FQDNs on the locked-down training pod's behalf.
+  Nil-guards: a `helm upgrade --reuse-values` from a release predating
+  egressProxy leaves .Values.egressProxy nil → the outer `if` renders nothing
+  (no gateway, no behaviour change). image/resources use default-through-dict so
+  a partial --set can't nil-pointer.
+*/ -}}
+{{- $ep := default dict .Values.egressProxy }}
+{{- $img := default dict $ep.image }}
+{{- $epRes := default dict $ep.resources }}
+{{- $epReq := default dict $epRes.requests }}
+{{- $epLim := default dict $epRes.limits }}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ .Release.Name }}-egress-proxy
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "tracebloc.labels" . | nindent 4 }}
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: egress-proxy
+  template:
+    metadata:
+      labels:
+        app: egress-proxy
+      annotations:
+        checksum/config: {{ include (print $.Template.BasePath "/egress-proxy-configmap.yaml") . | sha256sum }}
+    spec:
+      automountServiceAccountToken: false
+      securityContext:
+        runAsNonRoot: true
+        # squid's non-root user (Ubuntu `proxy` uid). Configurable because it is
+        # image-specific — VERIFY against the chosen egressProxy.image.
+        runAsUser: {{ $ep.runAsUser | default 13 }}
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: squid
+          image: {{ include "tracebloc.image" (dict "repository" ($img.repository | default "ubuntu/squid") "tag" ($img.tag | default "6.6-24.04_beta") "digest" ($img.digest | default "") "registry" ($img.registry | default "docker.io")) | quote }}
+          imagePullPolicy: IfNotPresent
+          command: ["squid"]
+          # -N: no daemon (run in foreground). Logs go to the std streams via
+          # squid.conf (access_log/cache_log = stdio); omit -d to avoid duplicate lines.
+          args: ["-N", "-f", "/etc/squid/squid.conf"]
+          ports:
+            - containerPort: {{ $ep.port | default 3128 }}
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+            readOnlyRootFilesystem: true
+          resources:
+            requests:
+              cpu: {{ $epReq.cpu | default "50m" | quote }}
+              memory: {{ $epReq.memory | default "64Mi" | quote }}
+            limits:
+              cpu: {{ $epLim.cpu | default "500m" | quote }}
+              memory: {{ $epLim.memory | default "256Mi" | quote }}
+          volumeMounts:
+            - name: squid-config
+              mountPath: /etc/squid/squid.conf
+              subPath: squid.conf
+              readOnly: true
+            - name: tmp
+              mountPath: /tmp
+            - name: var-run
+              mountPath: /var/run
+      volumes:
+        - name: squid-config
+          configMap:
+            name: {{ .Release.Name }}-egress-proxy
+        - name: tmp
+          emptyDir: {}
+        - name: var-run
+          emptyDir: {}
+      {{- if include "tracebloc.useImagePullSecrets" . }}
+      imagePullSecrets:
+        - name: {{ include "tracebloc.registrySecretName" . }}
+      {{- end }}
+      restartPolicy: Always
+{{- end }}
diff --git a/client/templates/egress-proxy-service.yaml b/client/templates/egress-proxy-service.yaml
new file mode 100644
index 0000000..7043793
--- /dev/null
+++ b/client/templates/egress-proxy-service.yaml
@@ -0,0 +1,23 @@
+{{- if (default dict .Values.egressProxy).enabled }}
+{{- /*
+  ClusterIP for the egress gateway. Training pods reach it as egress-proxy-service:<port>
+  via the HTTPS_PROXY env that jobs-manager injects when egressProxy.routeWorkloads=true.
+*/ -}}
+apiVersion: v1
+kind: Service
+metadata:
+  name: egress-proxy-service
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "tracebloc.labels" . | nindent 4 }}
+    app: egress-proxy
+spec:
+  selector:
+    app: egress-proxy
+  ports:
+    - name: http-proxy
+      port: {{ .Values.egressProxy.port | default 3128 }}
+      targetPort: {{ .Values.egressProxy.port | default 3128 }}
+      protocol: TCP
+  type: ClusterIP
+{{- end }}
diff --git a/client/templates/jobs-manager-deployment.yaml b/client/templates/jobs-manager-deployment.yaml
index ff0f03c..911dd1f 100644
--- a/client/templates/jobs-manager-deployment.yaml
+++ b/client/templates/jobs-manager-deployment.yaml
@@ -127,6 +127,13 @@ spec:
           value: {{ (default dict .Values.images.ingestor).digest | default "" | quote }}
         - name: REQUESTS_PROXY_URL
           value: "http://requests-proxy-service:8888"
+        {{- if and (default dict .Values.egressProxy).enabled (default dict .Values.egressProxy).routeWorkloads }}
+        # client-runtime#102: when routing is enabled, jobs-manager injects
+        # HTTPS_PROXY=<gateway> into training pods (and suppresses raw HTTP_PROXY_HOST).
+        # Inert until egressProxy.routeWorkloads=true.
+        - name: EGRESS_PROXY_URL
+          value: "http://egress-proxy-service:{{ (default dict .Values.egressProxy).port | default 3128 }}"
+        {{- end }}
         - name: JOB_IMAGE_HOST
           value: "docker.io/"
         - name: CLIENT_ENV
diff --git a/client/templates/network-policy-training.yaml b/client/templates/network-policy-training.yaml
index 643d96a..4ad1131 100644
--- a/client/templates/network-policy-training.yaml
+++ b/client/templates/network-policy-training.yaml
@@ -61,10 +61,16 @@ spec:
           protocol: UDP
         - port: 53
           protocol: TCP
+    {{- /* Rule 2 (external HTTPS) is the egress hole SECURITY §8.2 closes. Gated on
+           networkPolicy.training.allowExternalHttps via `dig` with a default of TRUE:
+           an absent key (helm upgrade --reuse-values from a release predating it)
+           keeps the rule — old behaviour — so only an explicit `false` drops it, once
+           an operator has verified the egress gateway on that cluster (#102). */}}
+    {{- if dig "allowExternalHttps" true .Values.networkPolicy.training }}
     # 2. External HTTPS — everything NOT in the cluster's pod/service CIDRs.
-    #    Training pods call backend, Azure Service Bus, App Insights, etc.
-    #    This blocks pod-to-pod, ClusterIPs, jobs-manager, K8s API. MySQL is
-    #    explicitly re-permitted by the next rule.
+    #    Training pods reach the backend / Azure Service Bus / App Insights directly.
+    #    This blocks pod-to-pod, ClusterIPs, jobs-manager, K8s API. MySQL and the
+    #    in-cluster proxies are explicitly re-permitted by the rules below.
     - to:
         - ipBlock:
             cidr: 0.0.0.0/0
@@ -75,6 +81,7 @@ spec:
       ports:
         - port: 443
           protocol: TCP
+    {{- end }}
     # 3. MySQL — training pods read the training dataset from the
     #    in-namespace mysql-client pod. podSelector with no namespaceSelector
     #    matches pods in the same namespace as this NetworkPolicy.
@@ -100,4 +107,17 @@ spec:
       ports:
         - port: 8888
           protocol: TCP
+    {{- if (default dict .Values.egressProxy).enabled }}
+    # 5. egress gateway — training pods reach the in-cluster squid egress gateway
+    #    (egress-proxy-service) for allowlisted external HTTPS, used once the
+    #    external-HTTPS rule (rule 2) is dropped. Re-permitted explicitly like MySQL /
+    #    requests-proxy above (rule 2's `except` blocks ClusterIP egress).
+    - to:
+        - podSelector:
+            matchLabels:
+              app: egress-proxy
+      ports:
+        - port: {{ (default dict .Values.egressProxy).port | default 3128 }}
+          protocol: TCP
+    {{- end }}
 {{- end }}
diff --git a/client/tests/egress_proxy_test.yaml b/client/tests/egress_proxy_test.yaml
new file mode 100644
index 0000000..b0a972a
--- /dev/null
+++ b/client/tests/egress_proxy_test.yaml
@@ -0,0 +1,246 @@
+suite: Egress gateway (squid)
+# SECURITY §8.2 / client-runtime#102. The egress gateway lets a locked-down
+# training pod reach an FQDN allowlist (backend + App Insights) and nothing
+# else. These guards pin: the on/off flag, the fail-closed allowlist, the
+# PSA-restricted security context, the nil-guards against `helm upgrade
+# --reuse-values` from a release predating egressProxy, and — critically — that
+# the gateway is NOT labelled as a training workload (so the lockdown netpol
+# never selects it and it keeps its own egress).
+templates:
+  - templates/egress-proxy-deployment.yaml
+  - templates/egress-proxy-service.yaml
+  - templates/egress-proxy-configmap.yaml
+  - templates/jobs-manager-deployment.yaml
+  - templates/network-policy-training.yaml
+set:
+  clientId: "test-id"
+  clientPassword: "test"
+tests:
+  - it: renders the Deployment, Service and ConfigMap when enabled (default)
+    asserts:
+      - hasDocuments:
+          count: 1
+        template: templates/egress-proxy-deployment.yaml
+      - hasDocuments:
+          count: 1
+        template: templates/egress-proxy-service.yaml
+      - hasDocuments:
+          count: 1
+        template: templates/egress-proxy-configmap.yaml
+
+  - it: renders nothing when disabled
+    set:
+      egressProxy:
+        enabled: false
+    asserts:
+      - hasDocuments:
+          count: 0
+        template: templates/egress-proxy-deployment.yaml
+      - hasDocuments:
+          count: 0
+        template: templates/egress-proxy-service.yaml
+      - hasDocuments:
+          count: 0
+        template: templates/egress-proxy-configmap.yaml  # "nothing" must include the squid config
+
+  - it: renders nothing when egressProxy is absent (helm upgrade --reuse-values replay)
+    # A pre-#102 stored values set has no egressProxy key; the nil parent must
+    # render no gateway rather than crash with "nil pointer evaluating interface".
+    set:
+      egressProxy: null
+    asserts:
+      - hasDocuments:
+          count: 0
+        template: templates/egress-proxy-deployment.yaml
+
+  - it: is a ClusterIP service on the proxy port
+    template: templates/egress-proxy-service.yaml
+    asserts:
+      - equal:
+          path: metadata.name
+          value: egress-proxy-service
+      - equal:
+          path: spec.type
+          value: ClusterIP
+      - equal:
+          path: spec.ports[0].port
+          value: 3128
+
+  - it: is NOT labelled as a training workload (must stay outside the lockdown netpol)
+    template: templates/egress-proxy-deployment.yaml
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels.app
+          value: egress-proxy
+      - notExists:
+          path: spec.template.metadata.labels["tracebloc.io/workload"]
+
+  - it: enforces a PSA-restricted security context  # SA-token automount is pinned by its own test below
+    template: templates/egress-proxy-deployment.yaml
+    asserts:
+      - equal:
+          path: spec.template.spec.securityContext.runAsNonRoot
+          value: true
+      - equal:
+          path: spec.template.spec.securityContext.seccompProfile.type
+          value: RuntimeDefault
+      - equal:
+          path: spec.template.spec.containers[0].securityContext.allowPrivilegeEscalation
+          value: false
+      - equal:
+          path: spec.template.spec.containers[0].securityContext.readOnlyRootFilesystem
+          value: true
+      - contains:
+          path: spec.template.spec.containers[0].securityContext.capabilities.drop
+          content: "ALL"
+
+  - it: does not automount the service account token
+    template: templates/egress-proxy-deployment.yaml
+    asserts:
+      - equal:
+          path: spec.template.spec.automountServiceAccountToken
+          value: false
+
+  - it: renders default resources and honors an override through the nil-guard
+    template: templates/egress-proxy-deployment.yaml
+    set:
+      egressProxy:
+        resources:
+          limits:
+            memory: 1Gi
+    asserts:
+      - equal:
+          path: spec.template.spec.containers[0].resources.requests.cpu
+          value: 50m
+      - equal:
+          path: spec.template.spec.containers[0].resources.limits.memory
+          value: 1Gi
+
+  - it: allowlists the backend + App Insights and fails closed
+    template: templates/egress-proxy-configmap.yaml
+    asserts:
+      - matchRegex:
+          path: data["squid.conf"]
+          pattern: "acl allowed_fqdns dstdomain .*api\\.tracebloc\\.io"
+      - matchRegex:
+          path: data["squid.conf"]
+          pattern: "\\.in\\.applicationinsights\\.azure\\.com"
+      - matchRegex:
+          path: data["squid.conf"]
+          pattern: "http_access deny all"
+
+  - it: does not chain to a corporate proxy unless one is configured
+    template: templates/egress-proxy-configmap.yaml
+    asserts:
+      - notMatchRegex:
+          path: data["squid.conf"]
+          pattern: "cache_peer"
+
+  - it: chains to the corporate proxy when env.HTTP_PROXY_HOST is set
+    template: templates/egress-proxy-configmap.yaml
+    set:
+      env:
+        HTTP_PROXY_HOST: corp-proxy.internal
+        HTTP_PROXY_PORT: "8080"
+    asserts:
+      - matchRegex:
+          path: data["squid.conf"]
+          pattern: "cache_peer corp-proxy\\.internal parent 8080"
+      - matchRegex:
+          path: data["squid.conf"]
+          pattern: "never_direct allow all"
+
+  # --- Step 1: routing training pods through the gateway ---
+
+  - it: jobs-manager gets EGRESS_PROXY_URL only when routeWorkloads is enabled
+    template: templates/jobs-manager-deployment.yaml
+    documentIndex: 0
+    set:
+      egressProxy:
+        routeWorkloads: true
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].env
+          content:
+            name: EGRESS_PROXY_URL
+            value: "http://egress-proxy-service:3128"
+
+  - it: jobs-manager does NOT get EGRESS_PROXY_URL by default (routeWorkloads false)
+    template: templates/jobs-manager-deployment.yaml
+    documentIndex: 0
+    asserts:
+      - notContains:
+          path: spec.template.spec.containers[0].env
+          content:
+            name: EGRESS_PROXY_URL
+          any: true  # match on the name alone, so the var is rejected whatever its value
+
+  - it: training netpol permits egress to the gateway and keeps a stable training podSelector
+    template: templates/network-policy-training.yaml
+    asserts:
+      # auto-upgrade safety: the lockdown netpol must select ONLY training pods,
+      # never the auto-upgrade / image-refresh cronjobs.
+      - equal:
+          path: spec.podSelector.matchLabels
+          value:
+            tracebloc.io/workload: training
+      - contains:
+          path: spec.egress
+          content:
+            to:
+              - podSelector:
+                  matchLabels:
+                    app: egress-proxy
+            ports:
+              - port: 3128
+                protocol: TCP
+
+  # --- Step 3: the lockdown — drop the external 0.0.0.0/0:443 rule ---
+
+  - it: keeps the external 443 egress rule by default (allowExternalHttps true)
+    template: templates/network-policy-training.yaml
+    asserts:
+      - contains:
+          path: spec.egress
+          content:
+            to:
+              - ipBlock:
+                  cidr: 0.0.0.0/0
+                  except:
+                    - 10.0.0.0/8
+                    - 172.16.0.0/12
+                    - 192.168.0.0/16
+            ports:
+              - port: 443
+                protocol: TCP
+
+  - it: drops the external 443 rule but keeps the gateway path when allowExternalHttps is false
+    template: templates/network-policy-training.yaml
+    set:
+      networkPolicy:
+        training:
+          allowExternalHttps: false
+    asserts:
+      - notContains:
+          path: spec.egress
+          content:
+            to:
+              - ipBlock:
+                  cidr: 0.0.0.0/0
+                  except:
+                    - 10.0.0.0/8
+                    - 172.16.0.0/12
+                    - 192.168.0.0/16
+            ports:
+              - port: 443
+                protocol: TCP
+      - contains:
+          path: spec.egress
+          content:
+            to:
+              - podSelector:
+                  matchLabels:
+                    app: egress-proxy
+            ports:
+              - port: 3128
+                protocol: TCP
diff --git a/client/values.schema.json b/client/values.schema.json
index c5a16d3..2eb42dc 100644
--- a/client/values.schema.json
+++ b/client/values.schema.json
@@ -221,6 +221,11 @@
               "default": true,
               "description": "Create the training-egress NetworkPolicy. Set false on clusters without an enforcing CNI."
             },
+            "allowExternalHttps": {
+              "type": "boolean",
+              "default": true,
+              "description": "When false, drop the 0.0.0.0/0:443 egress rule so training pods reach only DNS, MySQL, requests-proxy and the egress gateway (SECURITY §8.2 / client-runtime#102). Default true keeps existing behaviour; flip per-fleet after verifying the egress gateway works (G2)."
+            },
             "dnsNamespace": {
               "type": "string",
               "default": "kube-system",
@@ -256,6 +261,49 @@
         }
       }
     },
+    "egressProxy": {
+      "type": "object",
+      "description": "In-cluster squid egress gateway (SECURITY §8.2 / client-runtime#102). Forward proxy that permits HTTPS CONNECT only to the allowlist, so a locked-down training pod can reach the backend + App Insights and nothing else.",
+      "properties": {
+        "enabled": { "type": "boolean", "default": true },
+        "routeWorkloads": { "type": "boolean", "default": false, "description": "Route training-pod outbound HTTPS through the gateway (jobs-manager injects HTTPS_PROXY). Default false — enable per-fleet, verify a run, then drop the direct egress rule (networkPolicy.training.allowExternalHttps=false)." },
+        "port": { "type": "integer", "minimum": 1, "maximum": 65535, "default": 3128 },
+        "runAsUser": { "type": "integer", "minimum": 1 },
+        "image": {
+          "type": "object",
+          "properties": {
+            "registry": { "type": "string" },
+            "repository": { "type": "string", "minLength": 1 },
+            "tag": { "type": "string", "not": { "const": "latest" } },
+            "digest": { "type": "string", "pattern": "^(sha256:[a-f0-9]{64})?$" }
+          }
+        },
+        "allowlist": {
+          "type": "array",
+          "description": "FQDNs the gateway permits HTTPS CONNECT to (squid dstdomain syntax: leading dot = subdomain match, bare host = exact).",
+          "items": { "type": "string", "minLength": 1 }
+        },
+        "resources": {
+          "type": "object",
+          "properties": {
+            "requests": {
+              "type": "object",
+              "properties": {
+                "cpu":    { "type": "string", "pattern": "^[0-9]+m?$" },
+                "memory": { "type": "string", "pattern": "^[0-9]+(Ki|Mi|Gi|Ti)$" }
+              }
+            },
+            "limits": {
+              "type": "object",
+              "properties": {
+                "cpu":    { "type": "string", "pattern": "^[0-9]+m?$" },
+                "memory": { "type": "string", "pattern": "^[0-9]+(Ki|Mi|Gi|Ti)$" }
+              }
+            }
+          }
+        }
+      }
+    },
     "images": {
       "type": "object",
       "description": "Container image pinning. Prefer digest over tag for immutability.",
diff --git a/client/values.yaml b/client/values.yaml
index 8115d62..9565b1f 100644
--- a/client/values.yaml
+++ b/client/values.yaml
@@ -168,6 +168,14 @@ namespace:
 networkPolicy:
   training:
     enabled: true
+    # Egress lockdown (SECURITY §8.2 / client-runtime#102). When false, the
+    # training NetworkPolicy DROPS the 0.0.0.0/0:443 rule, so training pods can
+    # reach only DNS, in-cluster MySQL, the requests-proxy, and the egress
+    # gateway. Default true keeps the fleet unchanged; flip OFF per-fleet AFTER
+    # verifying the egress gateway works on that cluster (G2). The template
+    # nil-guards this key, so a `helm upgrade --reuse-values` from a release
+    # predating it keeps the old behaviour (rule present).
+    allowExternalHttps: true
     dnsNamespace: kube-system
     # CoreDNS pod selector — varies per platform. Override in ci/<platform>-values.yaml.
     # When empty, the template falls back to {k8s-app: kube-dns}, which works
@@ -184,6 +192,44 @@ networkPolicy:
       - "172.16.0.0/12"
       - "192.168.0.0/16"
 
+# -- Egress gateway (squid) — SECURITY §8.2 / client-runtime#102.
+# In-cluster forward proxy that lets a locked-down training pod reach an FQDN
+# allowlist (backend + App Insights) and nothing else. Labelled app=egress-proxy
+# so the training NetworkPolicy never selects it (it keeps its own egress).
+egressProxy:
+  enabled: true
+  # Route training-pod outbound HTTPS through the gateway (Step 1 of #102). Default
+  # FALSE so the gateway ships inert; flip per-fleet to true, verify a real training
+  # run, THEN set networkPolicy.training.allowExternalHttps=false to drop direct egress.
+  routeWorkloads: false
+  # squid image, pinned by multi-arch (amd64+arm64) index digest — tracebloc pins
+  # all images by digest. `tag` stays for readability; the digest is authoritative.
+  # ubuntu/squid:6.6-24.04_beta (Ubuntu 24.04 LTS base), resolved 2026-06-10.
+  image:
+    registry: docker.io
+    repository: ubuntu/squid
+    tag: "6.6-24.04_beta"
+    digest: "sha256:6a097f68bae708cedbabd6188d68c7e2e7a38cedd05a176e1cc0ba29e3bbe029"
+  # squid's non-root uid (Ubuntu `proxy`). Image-specific — verify if you swap image.
+  runAsUser: 13
+  port: 3128
+  # FQDNs the gateway permits HTTPS CONNECT to. squid dstdomain syntax: a leading
+  # dot matches subdomains; a bare host is exact. Fails CLOSED when empty.
+  allowlist:
+    - dev-api.tracebloc.io
+    - stg-api.tracebloc.io
+    - api.tracebloc.io
+    - xray-backend.azurewebsites.net
+    - .in.applicationinsights.azure.com
+    - dc.services.visualstudio.com
+  resources:
+    requests:
+      cpu: "50m"
+      memory: "64Mi"
+    limits:
+      cpu: "500m"
+      memory: "256Mi"
+
 # -- Container image pinning.
 # For each image, set `digest` to a sha256 (e.g. "sha256:abc123...") to pin
 # the image by content hash. Digest pinning is strongly preferred: tags are
diff --git a/docs/SECURITY.md b/docs/SECURITY.md
index 5f70321..45d3732 100644
--- a/docs/SECURITY.md
+++ b/docs/SECURITY.md
@@ -147,9 +147,10 @@ spec:
 **What this still allows:**
 
 - DNS lookups (needed to resolve backend + Azure endpoints)
-- Outbound HTTPS/443 to the public internet (needed today for the training container to reach the tracebloc backend and Azure Service Bus; see §8.2)
+- In-cluster egress to MySQL (3306), the requests-proxy (8888), and the egress gateway (3128)
+- Outbound HTTPS/443 to the public internet — **only while `networkPolicy.training.allowExternalHttps: true` (the current default).** Set it to `false` and this rule is dropped, so training pods reach external services only through the in-cluster egress gateway (see §8.2).
 
-**Configuration:** `networkPolicy.training.enabled: true` (the default).
+**Configuration:** `networkPolicy.training.enabled: true` (the default). Egress lockdown: `networkPolicy.training.allowExternalHttps` + `egressProxy.*` (see §8.2).
 
 ### 4.3 Kubernetes API access (G3)
 
@@ -408,13 +409,20 @@ Known gaps between the current state and a fully-hardened setup, with the owner
 
 **Mitigation plan:** backend endpoint that mints short-TTL, entity-scoped, send-only SAS tokens per experiment. Backend team owns the design and implementation.
 
-**Interim mitigation:** the `NetworkPolicy` in §4.2 still allows outbound HTTPS, so a training pod can reach Azure Service Bus directly. The only way to hard-block forgery before backend support lands is to deny external egress entirely — not currently possible because training pods legitimately call the backend + App Insights + Service Bus. See §8.2.
+**Interim mitigation:** with the §8.2 egress lockdown enabled (`networkPolicy.training.allowExternalHttps: false`), a training pod can no longer reach Azure Service Bus directly — SB traffic goes through the in-cluster requests-proxy (which holds the connection strings), and the connection strings are no longer injected into the pod. Until a fleet enables the lockdown, the NetworkPolicy still allows direct outbound HTTPS. The scoped/short-TTL SAS-token plan above remains the durable fix. See §8.2.
 
-### 8.2 Training pods still have outbound HTTPS (G2) — **platform team**
+### 8.2 Training-pod outbound HTTPS (G2) — **mechanism shipped (1.7.0), gated rollout**
 
-The NetworkPolicy blocks in-cluster traffic and non-443 egress but must allow outbound HTTPS to let training pods function (backend API, Azure Service Bus, App Insights). A malicious pod can still `requests.post()` to an arbitrary endpoint.
+By default the NetworkPolicy still allows outbound HTTPS/443 so that training pods can reach the backend, Azure Service Bus, and App Insights. Until the lockdown is enabled, a malicious pod can therefore still `requests.post()` to an arbitrary endpoint.
 
-**Final fix:** route all training-pod ↔ tracebloc communication through the jobs-manager sidecar, so training pods egress only to a cluster-internal IP and hold no external-facing credentials. Medium-size architectural change; not scheduled for this quarter.
+**Mechanism (chart 1.7.0, client-runtime#102):** an in-cluster **egress gateway** (`egressProxy` — a squid forward proxy) permits HTTPS CONNECT only to an FQDN allowlist (backend + App Insights) and chains to a corporate proxy via `cache_peer`. With routing on, jobs-manager injects `HTTPS_PROXY=egress-proxy-service:3128` into each training pod (and drops the raw `HTTP_PROXY_HOST`), so backend + App-Insights traffic flows through the gateway; Service Bus already goes via the requests-proxy. The pod then needs no direct internet, and the external-443 rule can be dropped.
+
+**Rollout (per fleet, progressive — each step reversible):**
+1. Upgrade to ≥ 1.7.0 — the gateway deploys, inert (`egressProxy.routeWorkloads: false`).
+2. Set `egressProxy.routeWorkloads: true`; verify a training run completes via the gateway.
+3. Set `networkPolicy.training.allowExternalHttps: false` to drop the external-443 rule, and verify **G2** (a training pod cannot reach an arbitrary external host). Requires a NetworkPolicy-enforcing CNI (§4.2).
+
+**Residual:** the pod still holds `BACKEND_TOKEN` (it authenticates to the backend through the gateway). Scoping / short-TTL of that token is tracked under §8.1.
 
 ### 8.3 Backend tokens never expire — **backend team**
 
diff --git a/scripts/tests/e2e-auto-upgrade.sh b/scripts/tests/e2e-auto-upgrade.sh
new file mode 100755
index 0000000..47433e6
--- /dev/null
+++ b/scripts/tests/e2e-auto-upgrade.sh
@@ -0,0 +1,147 @@
+#!/usr/bin/env bash
+# =============================================================================
+#  e2e-auto-upgrade.sh — fleet auto-upgrade non-regression gate
+# -----------------------------------------------------------------------------
+#  The fleet self-upgrades hourly via auto-upgrade-cronjob.yaml:
+#      helm upgrade <rel> tracebloc/client --version <latest> --reset-then-reuse-values
+#  and operators habitually run `helm upgrade --reuse-values` by hand. Both
+#  replay OLD stored values against the NEW chart — the failure mode that has
+#  repeatedly bitten this chart (nil-pointer templating on keys the stored
+#  values predate; see requests_proxy_test.yaml / resource_monitor_test.yaml).
+#
+#  This gate installs the LAST PUBLISHED chart from gh-pages on a real k3d
+#  cluster, then upgrades to the LOCAL working-tree chart through both flag
+#  paths and asserts the contract that keeps the fleet safe:
+#    1. `--reuse-values`            -> upgrade succeeds (nil-guards hold) and the
+#                                      egress lockdown does NOT engage by accident.
+#    2. `--reset-then-reuse-values` -> upgrade succeeds, new defaults flow in
+#                                      (egress gateway deploys, inert), and
+#                                      out-of-band image-refresh annotations survive.
+#    3. flip the #102 lockdown flags -> the external-443 rule (rule 2) drops,
+#                                      jobs-manager routes pods at the gateway.
+#    4. the next plain auto-upgrade  -> the operator's flip PERSISTS.
+#
+#  Pods are NEVER waited on: the published images need real credentials to go
+#  healthy, and the regression class this guards lives entirely in Helm
+#  templating / values semantics. No secrets; stock GitHub runners.
+#
+#  Usage:  bash scripts/tests/e2e-auto-upgrade.sh
+# =============================================================================
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+LIB="$HERE/../lib"
+CHART_DIR="$HERE/../../client"
+
+# Isolated cluster + release so we never touch a real 'tracebloc' install; opt
+# out of autostart so we don't reconfigure docker.service on the host.
+export USER="${USER:-$(id -un)}"
+export CLUSTER_NAME="${CLUSTER_NAME:-tbupg}"
+export TRACEBLOC_NO_AUTOSTART=1
+NS="tbupg"
+REPO_NAME="tracebloc"
+REPO_URL="https://tracebloc.github.io/client"
+
+# shellcheck source=/dev/null
+source "$LIB/common.sh"
+# shellcheck source=/dev/null
+source "$LIB/setup-linux.sh"
+# shellcheck source=/dev/null
+source "$LIB/cluster.sh"
+# shellcheck source=/dev/null
+source "$LIB/preflight.sh"   # provides _pf_recheck_runtime_mem (called by create_cluster)
+
+cleanup() { k3d cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true; }  # best-effort teardown: output discarded, failure ignored
+trap cleanup EXIT  # runs on every exit path, including the `exit 1` in fail()
+
+fail() { echo "FAIL: $*" >&2; exit 1; }  # print "FAIL: <message>" to stderr and abort the gate with status 1
+
+# --- assertion helpers (read live cluster state, not helm output) -----------
+netpol_has_external_443() {  # status 0 iff the policy carries the 0.0.0.0/0 egress rule; aborts if the policy cannot be read
+  local np; np="$(kubectl get networkpolicy "${NS}-training-egress" -n "$NS" -o yaml)" || fail "could not read NetworkPolicy ${NS}-training-egress"
+  grep -q 'cidr: 0.0.0.0/0' <<<"$np"  # here-string, not a pipe: a lookup error or SIGPIPE under pipefail can no longer read as "rule absent"
+}
+
+jm_deploy() {  # prints the first `kubectl get deploy -o name` entry in $NS that contains "jobs-manager"
+  kubectl get deploy -n "$NS" -o name | grep -m1 'jobs-manager'
+}
+
+jm_egress_proxy_url() {  # prints the first container's EGRESS_PROXY_URL value: '' when unset, a sentinel when the lookup fails
+  kubectl get -n "$NS" "$(jm_deploy)" -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="EGRESS_PROXY_URL")].value}' \
+    || echo "<egress-proxy-url-lookup-failed>"  # fail closed: callers test for '', so a kubectl error must not read as "unset"
+}
+
+echo "═══════════════════════════════════════════════════════════════════════"
+echo "  E2E auto-upgrade gate   arch: $(uname -m)   kernel: $(uname -r)"
+echo "═══════════════════════════════════════════════════════════════════════"
+
+has docker || error "Docker is not available on this host."
+umask 022
+install_kubectl
+install_k3d
+install_helm
+
+echo "── create_cluster() — the installer's real cluster-bring-up path ──"
+create_cluster
+kubectl wait --for=condition=Ready nodes --all --timeout=180s
+
+echo "── install the LAST PUBLISHED chart (what the fleet runs today) ──"
+helm repo add "$REPO_NAME" "$REPO_URL" >/dev/null
+helm repo update >/dev/null
+# Same idiom the auto-upgrade cronjob uses to pick the newest version.
+PREV="$(helm search repo "${REPO_NAME}/client" -o yaml \
+  | awk '/^[[:space:]]*version:/ {print $2; exit}')"
+[ -n "$PREV" ] || fail "could not resolve the latest published chart version from $REPO_URL"
+LOCAL_VERSION="$(awk '/^version:/ {print $2; exit}' "$CHART_DIR/Chart.yaml")"
+echo "   published: $PREV   local working tree: $LOCAL_VERSION"
+
+helm install "$NS" "${REPO_NAME}/client" --version "$PREV" \
+  --namespace "$NS" --create-namespace \
+  --set clientId=ci-e2e-upgrade \
+  --set clientPassword=ci-e2e-upgrade \
+  --set storageClass.provisioner=rancher.io/local-path
+
+echo "── simulate an image-refresh-managed annotation (must survive upgrades) ──"
+kubectl annotate -n "$NS" "$(jm_deploy)" \
+  "tracebloc.io/last-refreshed-jobs-manager-digest=sha256:e2e-sentinel" --overwrite
+
+echo "── path 1: manual-operator habit — helm upgrade --reuse-values ──"
+# Old stored values replayed against the new chart: every new key is absent.
+# The nil-guards must hold, and the lockdown must NOT engage by accident.
+helm upgrade "$NS" "$CHART_DIR" --namespace "$NS" --reuse-values
+netpol_has_external_443 || fail "--reuse-values upgrade dropped the external 443 rule (lockdown engaged by accident)"
+[ -z "$(jm_egress_proxy_url)" ] || fail "--reuse-values upgrade injected EGRESS_PROXY_URL (routing engaged by accident)"
+echo "   OK: upgrade succeeded, lockdown stayed off"
+
+echo "── path 2: the fleet auto-upgrade — helm upgrade --reset-then-reuse-values ──"
+helm upgrade "$NS" "$CHART_DIR" --namespace "$NS" --reset-then-reuse-values  # no --set: only chart defaults + stored release values apply
+netpol_has_external_443 || fail "auto-upgrade dropped the external 443 rule (allowExternalHttps default did not flow)"
+[ -z "$(jm_egress_proxy_url)" ] || fail "auto-upgrade injected EGRESS_PROXY_URL (routeWorkloads should default false)"
+kubectl get deploy "${NS}-egress-proxy" -n "$NS" >/dev/null \
+  || fail "auto-upgrade did not deploy the egress gateway (new defaults did not flow)"
+ANNOT="$(kubectl get -n "$NS" "$(jm_deploy)" \
+  -o jsonpath='{.metadata.annotations.tracebloc\.io/last-refreshed-jobs-manager-digest}')"  # `\.` keeps the dotted annotation key as one jsonpath segment
+[ "$ANNOT" = "sha256:e2e-sentinel" ] || fail "image-refresh annotation was clobbered by the upgrade"
+DEPLOYED="$(helm list -n "$NS" --filter "^${NS}\$" -o yaml \
+  | awk '/^[[:space:]]*chart:/ {print $2; exit}')"  # first `chart:` field among releases matching the anchored name filter
+[ "$DEPLOYED" = "client-${LOCAL_VERSION}" ] || fail "deployed chart is $DEPLOYED, expected client-${LOCAL_VERSION}"
+echo "   OK: new defaults flowed in (gateway deployed, inert), annotations survived"
+
+echo "── path 3: operator flips the #102 lockdown ──"
+helm upgrade "$NS" "$CHART_DIR" --namespace "$NS" --reset-then-reuse-values \
+  --set egressProxy.routeWorkloads=true \
+  --set networkPolicy.training.allowExternalHttps=false
+netpol_has_external_443 && fail "lockdown flip did NOT drop the external 443 rule"  # errexit-safe: a false left side of `&&` does not abort under `set -e`
+[ "$(jm_egress_proxy_url)" = "http://egress-proxy-service:3128" ] \
+  || fail "lockdown flip did not point jobs-manager at the egress gateway"
+echo "   OK: rule 2 dropped, training pods route via the gateway"
+
+echo "── path 4: the NEXT hourly auto-upgrade must preserve the flip ──"
+helm upgrade "$NS" "$CHART_DIR" --namespace "$NS" --reset-then-reuse-values  # deliberately no --set: the path-3 overrides must survive via stored values
+netpol_has_external_443 && fail "auto-upgrade after the flip re-opened the external 443 rule (override lost)"  # errexit-safe: a false left side of `&&` does not abort
+[ "$(jm_egress_proxy_url)" = "http://egress-proxy-service:3128" ] \
+  || fail "auto-upgrade after the flip lost EGRESS_PROXY_URL (override lost)"
+echo "   OK: the operator's lockdown persists across auto-upgrades"
+
+echo ""
+echo "E2E PASS: ${PREV} -> ${LOCAL_VERSION} upgrades safe on both flag paths; #102 flip engages and persists."