Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/workflows/helm-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@ on:
paths:
- 'client/**'
- 'ingestor/**'
- 'scripts/tests/e2e-auto-upgrade.sh'
- '.github/workflows/helm-ci.yaml'
pull_request:
branches: [main, develop, openshift]
paths:
- 'client/**'
- 'ingestor/**'
- 'scripts/tests/e2e-auto-upgrade.sh'
- '.github/workflows/helm-ci.yaml'

jobs:
Expand Down Expand Up @@ -161,6 +163,23 @@ jobs:
echo "images.ingestor.digest empty (default) — spawning by floating tag; no pinned digest to check."
fi

upgrade-e2e:
  # Fleet auto-upgrade non-regression gate (client-runtime#102 / #245-class
  # regressions): installs the LAST PUBLISHED chart from gh-pages on a real
  # k3d cluster, then upgrades to THIS working tree via both
  # `--reuse-values` (manual-operator habit; nil-guards must hold, lockdown
  # must not engage) and `--reset-then-reuse-values` (the auto-upgrade
  # cronjob's path; new defaults must flow in inert), then flips the #102
  # egress-lockdown flags and proves the next auto-upgrade preserves them.
  # Pods are never waited on — published images need real credentials; the
  # regression class this guards lives in Helm templating/values semantics.
  name: Fleet auto-upgrade E2E (k3d)
  runs-on: ubuntu-latest
  # Cap the run explicitly: without this a hung cluster bring-up or helm call
  # would hold the runner until the GitHub-hosted default job timeout
  # (360 minutes) expires.
  timeout-minutes: 30
  steps:
    - uses: actions/checkout@v4
    - name: Upgrade from last published release through both flag paths
      run: bash scripts/tests/e2e-auto-upgrade.sh

# Installer script tests (bats + Pester) + the cross-distro prerequisite matrix
# live in their own workflow: .github/workflows/installer-tests.yaml
# (triggered on scripts/** changes).
4 changes: 2 additions & 2 deletions .github/workflows/installer-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,11 @@ jobs:
# below for visibility but don't fail the gate.
shellcheck --severity=error --shell=bash \
scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \
scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh scripts/tests/check-drift.sh
scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh scripts/tests/e2e-auto-upgrade.sh scripts/tests/check-drift.sh
echo "── shellcheck warnings (advisory, non-blocking) ──"
shellcheck --severity=warning --shell=bash \
scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \
scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh scripts/tests/check-drift.sh || true
scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh scripts/tests/e2e-auto-upgrade.sh scripts/tests/check-drift.sh || true

- name: PSScriptAnalyzer (PowerShell installer)
shell: pwsh
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/standard-checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ jobs:
shellcheck --version | grep version
shellcheck --severity=error --shell=bash \
scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \
scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh
scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh scripts/tests/e2e-auto-upgrade.sh

unit-tests:
name: Unit tests
Expand Down
4 changes: 2 additions & 2 deletions client/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ apiVersion: v2
name: client
description: A unified Helm chart for tracebloc on AKS, EKS, bare-metal, and OpenShift
type: application
version: 1.6.1
appVersion: "1.6.1"
version: 1.7.0
appVersion: "1.7.0"
keywords:
- tracebloc
- kubernetes
Expand Down
52 changes: 52 additions & 0 deletions client/templates/egress-proxy-configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{{- if (default dict .Values.egressProxy).enabled }}
{{- /*
squid.conf for the egress gateway — SECURITY §8.2 / client-runtime#102.

The gateway is a forward proxy that permits only HTTPS CONNECT to the FQDNs in
egressProxy.allowlist. Once the training NetworkPolicy drops its
0.0.0.0/0:443 rule, locked-down training pods can still reach the tracebloc
backend and App Insights through it, and nothing else. Service Bus is not
routed here; it stays on the requests-proxy:8888 path.

The allowlist fails closed: with an empty list no allow rule is rendered, so
the final `http_access deny all` rejects everything.

NOTE(review): when env.HTTP_PROXY_USERNAME is set, the upstream proxy
credentials are written into this ConfigMap in clear text — confirm that is
acceptable, or move them to a Secret.
*/ -}}
{{- $gateway := .Values.egressProxy -}}
{{- $upstream := .Values.env -}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ .Release.Name }}-egress-proxy
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "tracebloc.labels" . | nindent 4 }}
data:
  squid.conf: |
    visible_hostname tracebloc-egress-proxy
    http_port {{ $gateway.port | default 3128 }}

    # Tunnel-only: no caching, no pid file, logs to the container's std streams —
    # squid needs no writable paths, so the root filesystem stays read-only.
    cache deny all
    cache_store_log none
    pid_filename none
    access_log stdio:/dev/stdout
    cache_log stdio:/dev/stderr
    # No peers/cache → the ICMP pinger (needs CAP_NET_RAW, which the pod drops) is
    # just noise/errors. Disable it.
    pinger_enable off

    acl SSL_ports port 443
    acl CONNECT method CONNECT
    {{- if $gateway.allowlist }}
    # dstdomain: a leading dot matches subdomains (e.g. .in.applicationinsights.azure.com),
    # a bare host is an exact match (e.g. api.tracebloc.io).
    acl allowed_fqdns dstdomain {{ join " " $gateway.allowlist }}
    http_access deny CONNECT !SSL_ports
    http_access allow CONNECT allowed_fqdns
    {{- end }}
    # Fail closed — anything not explicitly allowed above is denied.
    http_access deny all
    {{- if $upstream.HTTP_PROXY_HOST }}

    # Corporate-proxy chaining: when the cluster sits behind a corporate proxy,
    # forward upstream through it instead of going direct (mirrors tracebloc.proxyEnv).
    cache_peer {{ $upstream.HTTP_PROXY_HOST }} parent {{ $upstream.HTTP_PROXY_PORT | default 8080 }} 0 no-query default{{ if $upstream.HTTP_PROXY_USERNAME }} login={{ $upstream.HTTP_PROXY_USERNAME }}:{{ $upstream.HTTP_PROXY_PASSWORD }}{{ end }}
    never_direct allow all
    {{- end }}
{{- end }}
88 changes: 88 additions & 0 deletions client/templates/egress-proxy-deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{{- if (default dict .Values.egressProxy).enabled }}
{{- /*
Egress gateway (squid) Deployment — SECURITY §8.2 / client-runtime#102.

The pod carries label app=egress-proxy and NOT tracebloc.io/workload=training,
so it is deliberately outside the training NetworkPolicy and can egress to the
allowlisted FQDNs on the locked-down training pod's behalf.

Nil-guards: a `helm upgrade --reuse-values` from a release predating
egressProxy leaves .Values.egressProxy nil, so the outer `if` renders nothing
(no gateway, no behaviour change). image/resources are read through
`default dict` so a partial --set cannot nil-pointer.

runAsUser is piped through int64 so it always renders as a plain integer:
a large UID (for example an OpenShift-range UID such as 1000680000) that Helm
holds as a float can otherwise print in scientific notation, which is not a
valid runAsUser.
*/ -}}
{{- $ep := default dict .Values.egressProxy }}
{{- $img := default dict $ep.image }}
{{- $epRes := default dict $ep.resources }}
{{- $epReq := default dict $epRes.requests }}
{{- $epLim := default dict $epRes.limits }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ .Release.Name }}-egress-proxy
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "tracebloc.labels" . | nindent 4 }}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: egress-proxy
  template:
    metadata:
      labels:
        app: egress-proxy
      annotations:
        # Changes whenever the rendered squid.conf ConfigMap changes.
        checksum/config: {{ include (print $.Template.BasePath "/egress-proxy-configmap.yaml") . | sha256sum }}
    spec:
      automountServiceAccountToken: false
      securityContext:
        runAsNonRoot: true
        # squid's non-root user (Ubuntu `proxy` uid). Configurable because it is
        # image-specific — VERIFY against the chosen egressProxy.image.
        runAsUser: {{ $ep.runAsUser | default 13 | int64 }}
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: squid
          image: {{ include "tracebloc.image" (dict "repository" ($img.repository | default "ubuntu/squid") "tag" ($img.tag | default "6.6-24.04_beta") "digest" ($img.digest | default "") "registry" ($img.registry | default "docker.io")) | quote }}
          imagePullPolicy: IfNotPresent
          command: ["squid"]
          # -N: no daemon (run in foreground). Logs go to the std streams via
          # squid.conf (access_log/cache_log = stdio); omit -d to avoid duplicate lines.
          args: ["-N", "-f", "/etc/squid/squid.conf"]
          ports:
            - containerPort: {{ $ep.port | default 3128 }}
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
            readOnlyRootFilesystem: true
          resources:
            requests:
              cpu: {{ $epReq.cpu | default "50m" | quote }}
              memory: {{ $epReq.memory | default "64Mi" | quote }}
            limits:
              cpu: {{ $epLim.cpu | default "500m" | quote }}
              memory: {{ $epLim.memory | default "256Mi" | quote }}
          volumeMounts:
            - name: squid-config
              mountPath: /etc/squid/squid.conf
              subPath: squid.conf
              readOnly: true
            # Writable scratch mounts; the root filesystem itself is read-only.
            - name: tmp
              mountPath: /tmp
            - name: var-run
              mountPath: /var/run
      volumes:
        - name: squid-config
          configMap:
            name: {{ .Release.Name }}-egress-proxy
        - name: tmp
          emptyDir: {}
        - name: var-run
          emptyDir: {}
      {{- if include "tracebloc.useImagePullSecrets" . }}
      imagePullSecrets:
        - name: {{ include "tracebloc.registrySecretName" . }}
      {{- end }}
      restartPolicy: Always
{{- end }}
23 changes: 23 additions & 0 deletions client/templates/egress-proxy-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{{- if (default dict .Values.egressProxy).enabled }}
{{- /*
ClusterIP Service in front of the egress gateway. Training pods reach it as
egress-proxy-service:<port> via their HTTPS_PROXY env (wired in Step 1).
The Service port and targetPort are both egressProxy.port, defaulting to 3128.
*/ -}}
{{- $proxyPort := .Values.egressProxy.port | default 3128 -}}
apiVersion: v1
kind: Service
metadata:
  name: egress-proxy-service
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "tracebloc.labels" . | nindent 4 }}
    app: egress-proxy
spec:
  selector:
    app: egress-proxy
  ports:
    - name: http-proxy
      port: {{ $proxyPort }}
      targetPort: {{ $proxyPort }}
      protocol: TCP
  type: ClusterIP
{{- end }}
7 changes: 7 additions & 0 deletions client/templates/jobs-manager-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,13 @@ spec:
value: {{ (default dict .Values.images.ingestor).digest | default "" | quote }}
- name: REQUESTS_PROXY_URL
value: "http://requests-proxy-service:8888"
{{- if and (default dict .Values.egressProxy).enabled (default dict .Values.egressProxy).routeWorkloads }}
# client-runtime#102: when routing is enabled, jobs-manager injects
# HTTPS_PROXY=<gateway> into training pods (and suppresses raw HTTP_PROXY_HOST).
# Inert until egressProxy.routeWorkloads=true.
- name: EGRESS_PROXY_URL
value: "http://egress-proxy-service:{{ (default dict .Values.egressProxy).port | default 3128 }}"
{{- end }}
- name: JOB_IMAGE_HOST
value: "docker.io/"
- name: CLIENT_ENV
Expand Down
26 changes: 23 additions & 3 deletions client/templates/network-policy-training.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,16 @@ spec:
protocol: UDP
- port: 53
protocol: TCP
{{- /* Rule 2 (external HTTPS) is the egress hole SECURITY §8.2 closes. Gated on
networkPolicy.training.allowExternalHttps via `dig` with a default of TRUE:
an absent key (helm upgrade --reuse-values from a release predating it)
keeps the rule — old behaviour — so only an explicit `false` drops it, once
an operator has verified the egress gateway on that cluster (#102). */}}
{{- if dig "allowExternalHttps" true .Values.networkPolicy.training }}
# 2. External HTTPS — everything NOT in the cluster's pod/service CIDRs.
# Training pods call backend, Azure Service Bus, App Insights, etc.
# This blocks pod-to-pod, ClusterIPs, jobs-manager, K8s API. MySQL is
# explicitly re-permitted by the next rule.
# Training pods reach the backend / Azure Service Bus / App Insights directly.
# This blocks pod-to-pod, ClusterIPs, jobs-manager, K8s API. MySQL and the
# in-cluster proxies are explicitly re-permitted by the rules below.
- to:
- ipBlock:
cidr: 0.0.0.0/0
Expand All @@ -75,6 +81,7 @@ spec:
ports:
- port: 443
protocol: TCP
{{- end }}
# 3. MySQL — training pods read the training dataset from the
# in-namespace mysql-client pod. podSelector with no namespaceSelector
# matches pods in the same namespace as this NetworkPolicy.
Expand All @@ -100,4 +107,17 @@ spec:
ports:
- port: 8888
protocol: TCP
{{- if (default dict .Values.egressProxy).enabled }}
# 5. egress gateway — training pods reach the in-cluster squid egress gateway
# (egress-proxy-service) for allowlisted external HTTPS, used once the
# external-HTTPS rule (rule 2) is dropped. Re-permitted explicitly like MySQL /
# requests-proxy above (rule 2's `except` blocks ClusterIP egress).
- to:
- podSelector:
matchLabels:
app: egress-proxy
ports:
- port: {{ (default dict .Values.egressProxy).port | default 3128 }}
protocol: TCP
{{- end }}
{{- end }}
Loading
Loading