From 66dc6e16656ec7657b56f2c17f197900425ce8b6 Mon Sep 17 00:00:00 2001 From: zoezhao Date: Mon, 15 Jun 2026 09:17:33 -0700 Subject: [PATCH 1/3] Repurposed the existing service dns signer to only sign certificates for ate-system pods. - atesystem.podcert.ate.dev/identity: Issues identity certificates for ate-system control-plane pods, backed by a local CA. Every certificate carries the pod's SPIFFE URI; projections with the atesystem.podcert.ate.dev/expects-service userAnnotation additionally receive DNS SANs for the Services that select the pod. - workerpool.podcert.ate.dev/identity: Issues SPIFFE identity certificates for worker pods backed by a local CA. --- benchmarking/locust/manifests/all.yaml | 2 +- benchmarking/locust/manifests/ate-api.yaml | 2 +- benchmarking/locust/manifests/workloads.yaml | 2 +- cmd/ateapi/main.go | 27 +- cmd/kubectl-ate/README.md | 4 +- cmd/podcertcontroller/main.go | 59 ++-- docs/dev/valkey-direct-access.md | 2 +- hack/install-ate.sh | 21 +- internal/atesystemsigner/atesystemsigner.go | 314 ++++++++++++++++++ internal/servicednssigner/servicednssigner.go | 210 ------------ .../workerpoolsigner.go} | 7 +- manifests/ate-install/ate-api-server.yaml | 33 +- manifests/ate-install/atenet-router.yaml | 14 +- .../pod-certificate-controller.yaml | 16 +- manifests/ate-install/valkey.yaml | 33 +- 15 files changed, 444 insertions(+), 302 deletions(-) create mode 100644 internal/atesystemsigner/atesystemsigner.go delete mode 100644 internal/servicednssigner/servicednssigner.go rename internal/{podidentitysigner/podidentitysigner.go => workerpoolsigner/workerpoolsigner.go} (96%) diff --git a/benchmarking/locust/manifests/all.yaml b/benchmarking/locust/manifests/all.yaml index 63bee19a3..f51f51ef8 100644 --- a/benchmarking/locust/manifests/all.yaml +++ b/benchmarking/locust/manifests/all.yaml @@ -66,7 +66,7 @@ spec: projected: sources: - clusterTrustBundle: - signerName: servicedns.podcert.ate.dev/identity + signerName: atesystem.podcert.ate.dev/identity labelSelector: matchLabels: podcert.ate.dev/canarying: live diff --git a/benchmarking/locust/manifests/ate-api.yaml b/benchmarking/locust/manifests/ate-api.yaml index 78186c87b..b0d5bd129 100644 --- a/benchmarking/locust/manifests/ate-api.yaml +++ b/benchmarking/locust/manifests/ate-api.yaml @@ -63,7 +63,7 @@ spec: projected: sources: - clusterTrustBundle: - signerName: servicedns.podcert.ate.dev/identity + signerName: atesystem.podcert.ate.dev/identity labelSelector: matchLabels: podcert.ate.dev/canarying: live diff --git a/benchmarking/locust/manifests/workloads.yaml b/benchmarking/locust/manifests/workloads.yaml index 8a637fc2e..01bb1df75 100644 --- a/benchmarking/locust/manifests/workloads.yaml +++ b/benchmarking/locust/manifests/workloads.yaml @@ -66,7 +66,7 @@ spec: projected: sources: - clusterTrustBundle: - signerName: servicedns.podcert.ate.dev/identity + signerName: atesystem.podcert.ate.dev/identity labelSelector: matchLabels: podcert.ate.dev/canarying: live diff --git a/cmd/ateapi/main.go b/cmd/ateapi/main.go index 87d5a7433..cc1f44ce9 100644 --- a/cmd/ateapi/main.go +++ b/cmd/ateapi/main.go @@ -62,7 +62,8 @@ var ( sessionIDJWTPoolFile = pflag.String("session-id-jwt-pool", "", "The file that contains the serialized JWT authority pool for signing session JWTs") sessionIDCAPoolFile = pflag.String("session-id-ca-pool", "", "The file that contains the CA pool for signing session JWTs") - workerpoolCACerts = pflag.String("workerpool-ca-certs", "", "The file that contains the CA for verifying workerpool client certificates.") + clientCACerts = pflag.StringSlice("client-ca-certs", nil, "Files containing CA bundles; client certificates are verified against the union of these CAs.") + workerpoolCACerts = pflag.String("workerpool-ca-certs", "", "The file that contains the CA for attesting workerpool client identities.") showVersion = pflag.Bool("version", false, "Print version and exit.") ) @@ -195,6 +196,7 @@ func logFlagValues(ctx context.Context) { slog.String("client-jwt-audience", *clientJWTAudience), slog.String("session-id-jwt-pool", *sessionIDJWTPoolFile), slog.String("session-id-ca-pool", *sessionIDCAPoolFile), + slog.Any("client-ca-certs", *clientCACerts), slog.String("workerpool-ca-certs", *workerpoolCACerts), ) } @@ -302,22 +304,27 @@ func newKubeClients() (*kubernetes.Clientset, versioned.Interface, error) { return clientset, ateClient, nil } -// buildServerCreds loads the workerpool CA pool (if configured) and +// buildServerCreds loads the client-verification CA pools (if configured) and // composes gRPC TransportCredentials over the server bundle + optional // client-cert verification. +// +// TODO: Verifying a client cert only proves it was signed by a trusted CA, not +// that the peer is authorized. func buildServerCreds(ctx context.Context) (credentials.TransportCredentials, error) { var clientCAs *x509.CertPool - if *workerpoolCACerts != "" { + if len(*clientCACerts) > 0 { // TODO: Periodically reload these to handle rotations. Consult with Tina to see how she did it for client-go. - ca, err := os.ReadFile(*workerpoolCACerts) - if err != nil { - return nil, fmt.Errorf("read workerpool CA: %w", err) - } clientCAs = x509.NewCertPool() - if !clientCAs.AppendCertsFromPEM(ca) { - return nil, fmt.Errorf("parse workerpool CA from %s", *workerpoolCACerts) + for _, path := range *clientCACerts { + ca, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read client CA from %s: %w", path, err) + } + if !clientCAs.AppendCertsFromPEM(ca) { + return nil, fmt.Errorf("parse client CA from %s", path) + } + slog.InfoContext(ctx, "Using custom CA for client verification", slog.String("path", path)) } - slog.InfoContext(ctx, "Using custom CA for workerpool clients", slog.String("path", *workerpoolCACerts)) } return credentials.NewTLS(&tls.Config{ GetCertificate: credbundle.Loader(*grpcServerCredBundle), diff --git a/cmd/kubectl-ate/README.md b/cmd/kubectl-ate/README.md index b45679ae2..e7eb5cabc 100644 --- a/cmd/kubectl-ate/README.md +++ b/cmd/kubectl-ate/README.md @@ -144,8 +144,8 @@ Commands for bootstrapping the Substrate control plane and debugging local envir ```bash # Generate a new CA pool and push it directly to a Kubernetes Secret kubectl ate admin make-ca-pool \ - --name workerpool-ca-certs \ - --secret-namespace ate-system \ + --name workerpool-ca-pool \ + --secret-namespace podcertificate-controller-system \ --ca-id "1" # Generate a new JWT authority pool and push it to a Kubernetes Secret diff --git a/cmd/podcertcontroller/main.go b/cmd/podcertcontroller/main.go index 98a321a6e..135c30291 100644 --- a/cmd/podcertcontroller/main.go +++ b/cmd/podcertcontroller/main.go @@ -13,9 +13,12 @@ // limitations under the License. // Command podcertcontroller is a pod certificate controller that implements two signers. -// - servicedns.ate.dev/identity: Issues certificate for Kubernetes service DNS names, backed by a -// local CA. -// - podid.ate.dev/identity: Issues certificates equivalent to KSA tokens, backed by a local CA. +// - atesystem.podcert.ate.dev/identity: Issues identity certificates for ate-system pods, +// backed by a local CA. Every certificate carries the pod's SPIFFE URI; projections with +// the atesystem.podcert.ate.dev/expects-service userAnnotation additionally receive DNS SANs for +// the Services that select the pod. +// - workerpool.podcert.ate.dev/identity: Issues SPIFFE identity certificates for worker pods +// backed by a local CA. // // These signers are not unique to Agent Substrate, and will eventually be replaced by signers that // are being developed as part of upstream Kubernetes. @@ -30,12 +33,12 @@ import ( "path/filepath" "syscall" + "github.com/agent-substrate/substrate/internal/atesystemsigner" "github.com/agent-substrate/substrate/internal/localca" - "github.com/agent-substrate/substrate/internal/podidentitysigner" "github.com/agent-substrate/substrate/internal/rendezvous" - "github.com/agent-substrate/substrate/internal/servicednssigner" "github.com/agent-substrate/substrate/internal/signercontroller" "github.com/agent-substrate/substrate/internal/version" + "github.com/agent-substrate/substrate/internal/workerpoolsigner" "github.com/spf13/pflag" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" @@ -62,16 +65,22 @@ var ( shardingPodUID = pflag.String("sharding-pod-uid", "", "(Work Sharding) The pod UID of the controller") shardingApplicationName = pflag.String("sharding-application-name", "", "(Work Sharding) The application name to disambiguate Leases") - serviceDNSCAPoolFile = pflag.String( - "service-dns-ca-pool", + atesystemCAPoolFile = pflag.String( + "atesystem-ca-pool", "", - "File that contains the CA pool state for "+servicednssigner.Name, + "File that contains the CA pool state for "+atesystemsigner.Name, ) - podCAPoolFile = pflag.String( - "pod-identity-ca-pool", + workerpoolCAPoolFile = pflag.String( + "workerpool-ca-pool", "", - "File that contains the CA pool state for "+podidentitysigner.Name, + "File that contains the CA pool state for "+workerpoolsigner.Name, + ) + + atesystemNamespace = pflag.String( + "atesystem-namespace", + "ate-system", + "Namespace the "+atesystemsigner.Name+" signer is allowed to issue certificates in.", ) showVersion = pflag.Bool("version", false, "Print version and exit.") @@ -120,33 +129,33 @@ func main() { ) go hasher.Run(ctx) - // Create a signer for servicedns.ate.dev/identity - serviceDNSCAPoolBytes, err := os.ReadFile(*serviceDNSCAPoolFile) + // Create a signer for atesystem.podcert.ate.dev/identity + atesystemCAPoolBytes, err := os.ReadFile(*atesystemCAPoolFile) if err != nil { - slog.ErrorContext(ctx, "Error reading servicedns.ate.dev/identity CA pool state", slog.Any("err", err)) + slog.ErrorContext(ctx, "Error reading "+atesystemsigner.Name+" CA pool state", slog.Any("err", err)) os.Exit(1) } - serviceDNSCAPool, err := localca.Unmarshal(serviceDNSCAPoolBytes) + atesystemCAPool, err := localca.Unmarshal(atesystemCAPoolBytes) if err != nil { - slog.ErrorContext(ctx, "Error unmarshing servicedns.ate.dev/identity CA pool state", slog.Any("err", err)) + slog.ErrorContext(ctx, "Error unmarshaling "+atesystemsigner.Name+" CA pool state", slog.Any("err", err)) os.Exit(1) } - serviceDNSSignerController := signercontroller.New(clock.RealClock{}, servicednssigner.NewImpl(kc, serviceDNSCAPool, clock.RealClock{}), kc, hasher) - go serviceDNSSignerController.Run(ctx) + atesystemSignerController := signercontroller.New(clock.RealClock{}, atesystemsigner.NewImpl(kc, atesystemCAPool, *atesystemNamespace, clock.RealClock{}), kc, hasher) + go atesystemSignerController.Run(ctx) - // Create a signer for podidentity.podcert.ate.dev/identity - podIdentityCAPoolBytes, err := os.ReadFile(*podCAPoolFile) + // Create a signer for workerpool.podcert.ate.dev/identity + workerpoolCAPoolBytes, err := os.ReadFile(*workerpoolCAPoolFile) if err != nil { - slog.ErrorContext(ctx, "Error reading podidentity.podcert.ate.dev/identity CA pool state", slog.Any("err", err)) + slog.ErrorContext(ctx, "Error reading "+workerpoolsigner.Name+" CA pool state", slog.Any("err", err)) os.Exit(1) } - podIdentityCAPool, err := localca.Unmarshal(podIdentityCAPoolBytes) + workerpoolCAPool, err := localca.Unmarshal(workerpoolCAPoolBytes) if err != nil { - slog.ErrorContext(ctx, "Error unmarshing podidentity.podcert.ate.dev/identity CA pool state", slog.Any("err", err)) + slog.ErrorContext(ctx, "Error unmarshaling "+workerpoolsigner.Name+" CA pool state", slog.Any("err", err)) os.Exit(1) } - podIdentitySignerController := signercontroller.New(clock.RealClock{}, podidentitysigner.NewImpl(kc, podIdentityCAPool, clock.RealClock{}), kc, hasher) - go podIdentitySignerController.Run(ctx) + workerpoolSignerController := signercontroller.New(clock.RealClock{}, workerpoolsigner.NewImpl(kc, workerpoolCAPool, clock.RealClock{}), kc, hasher) + go workerpoolSignerController.Run(ctx) // TODO: Reload when the file changes. diff --git a/docs/dev/valkey-direct-access.md b/docs/dev/valkey-direct-access.md index ba8e38169..318d09f8b 100644 --- a/docs/dev/valkey-direct-access.md +++ b/docs/dev/valkey-direct-access.md @@ -3,4 +3,4 @@ Sometimes you need to access the valkey instance directly, you can do this with one "simple" command: 1. `kubectl exec -n=ate-system -it valkey-cluster-0 -- valkey-cli -h valkey-cluster-service -c --tls --cacert /etc/valkey-ca/ca.crt --cer -t /run/servicedns.podcert.ate.dev/credential-bundle.pem --key /run/servicedns.podcert.ate.dev/credential-bundle.pem` +t /run/atesystem.podcert.ate.dev/credential-bundle.pem --key /run/atesystem.podcert.ate.dev/credential-bundle.pem` diff --git a/hack/install-ate.sh b/hack/install-ate.sh index 6c7f46056..edafa3fa2 100755 --- a/hack/install-ate.sh +++ b/hack/install-ate.sh @@ -119,9 +119,9 @@ run_ko() { create_valkey_ca_certs_secret() { log_step "create_valkey_ca_certs_secret" local ca_certs="" - # Extract from in-cluster service-dns-ca-pool secret (base64 json) + # Extract from in-cluster atesystem-identity-ca-pool secret (base64 json) local pool_json="" - pool_json=$(run_kubectl get secret -n podcertificate-controller-system service-dns-ca-pool -o jsonpath='{.data.pool}' | base64 --decode) + pool_json=$(run_kubectl get secret -n podcertificate-controller-system atesystem-identity-ca-pool -o jsonpath='{.data.pool}' | base64 --decode) # Extract RootCertificateDER base64 string local der_base64="" der_base64=$(echo "${pool_json}" | grep -o '"RootCertificateDER":"[^"]*' | sed 's/"RootCertificateDER":"//') @@ -156,11 +156,11 @@ create_podcertificate_controller_cas() { run_kubectl create namespace podcertificate-controller-system || true run_kubectl_ate admin make-ca-pool \ --ca-id="1" \ - --name="service-dns-ca-pool" \ + --name="atesystem-identity-ca-pool" \ --secret-namespace=podcertificate-controller-system run_kubectl_ate admin make-ca-pool \ --ca-id="1" \ - --name="pod-identity-ca-pool" \ + --name="workerpool-ca-pool" \ --secret-namespace=podcertificate-controller-system } @@ -176,7 +176,7 @@ create_api_server_env_vars() { redis_address="valkey-cluster.ate-system.svc:6379" use_iam_auth="false" tls_server_name="valkey-cluster.ate-system.svc" - client_cert="/run/servicedns.podcert.ate.dev/credential-bundle.pem" + client_cert="/run/atesystem.podcert.ate.dev/credential-bundle.pem" echo "REDIS_ADDRESS: ${redis_address}" @@ -230,10 +230,10 @@ deploy_ate_system() { # Wait for both ClusterTrustBundles to be created by the controller echo "Waiting for podcertificate ClusterTrustBundles to be ready..." - until run_kubectl get clustertrustbundles podidentity.podcert.ate.dev:identity:primary-bundle >/dev/null 2>&1; do + until run_kubectl get clustertrustbundles workerpool.podcert.ate.dev:identity:primary-bundle >/dev/null 2>&1; do sleep 1 done - until run_kubectl get clustertrustbundles servicedns.podcert.ate.dev:identity:primary-bundle >/dev/null 2>&1; do + until run_kubectl get clustertrustbundles atesystem.podcert.ate.dev:identity:primary-bundle >/dev/null 2>&1; do sleep 1 done @@ -248,10 +248,13 @@ deploy_ate_system() { echo "${manifests}" | run_kubectl apply -f - log_step "Waiting for ATE system components to be ready..." + # Wait for Valkey before ate-api-server: the api server pings the Valkey + # cluster on startup and waits until the Valkey cluster is initialized. + run_kubectl rollout status statefulset/valkey-cluster -n ate-system --timeout=180s + run_kubectl wait --for=condition=complete job/valkey-cluster-init -n ate-system --timeout=180s run_kubectl rollout status deployment/ate-api-server-deployment -n ate-system --timeout=120s run_kubectl rollout status deployment/ate-controller -n ate-system --timeout=120s run_kubectl rollout status deployment/atenet-router -n ate-system --timeout=120s - run_kubectl rollout status statefulset/valkey-cluster -n ate-system --timeout=120s run_kubectl rollout status daemonset/atelet -n ate-system --timeout=120s } @@ -262,7 +265,7 @@ ensure_apiserver_prerequisites() { || create_jwt_authority_pool_secret run_kubectl get secret -n ate-system session-id-ca-pool >/dev/null 2>&1 \ || create_session_id_ca_pool_secret - run_kubectl get secret -n podcertificate-controller-system service-dns-ca-pool >/dev/null 2>&1 \ + run_kubectl get secret -n podcertificate-controller-system atesystem-identity-ca-pool >/dev/null 2>&1 \ || create_podcertificate_controller_cas run_kubectl get secret -n ate-system valkey-ca-certs >/dev/null 2>&1 \ || create_valkey_ca_certs_secret diff --git a/internal/atesystemsigner/atesystemsigner.go b/internal/atesystemsigner/atesystemsigner.go new file mode 100644 index 000000000..ac3d37fb9 --- /dev/null +++ b/internal/atesystemsigner/atesystemsigner.go @@ -0,0 +1,314 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package atesystemsigner implements the identity signer for ate-system +// control-plane pods. Every issued certificate carries a SPIFFE URI derived +// from the pod's namespace and ServiceAccount. Pods that serve traffic behind a +// Service additionally receive DNS SANs, opted into via the +// atesystem.podcert.ate.dev/expects-service userAnnotation on the podCertificate +// projection. +package atesystemsigner + +import ( + "bytes" + "context" + "crypto/rand" + "crypto/x509" + "encoding/pem" + "fmt" + "log/slog" + "net/url" + "path" + "time" + + "github.com/agent-substrate/substrate/internal/localca" + "github.com/agent-substrate/substrate/internal/podcertificate" + "github.com/agent-substrate/substrate/internal/signercontroller" + certsv1beta1 "k8s.io/api/certificates/v1beta1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/kubernetes" + "k8s.io/utils/clock" + "k8s.io/utils/ptr" +) + +const Name = "atesystem.podcert.ate.dev/identity" + +const CTBPrefix = "atesystem.podcert.ate.dev:identity:" + +// ExpectsServiceAnnotation is a podCertificate projection userAnnotation key. +// When set to "true", it opts the pod into DNS SANs derived from the Services +// that select it. It is the sole switch for DNS SANs: a projection without it +// always yields a SPIFFE-only client certificate, even if a Service happens to +// select the pod. Kubelet copies projection userAnnotations verbatim into the +// PodCertificateRequest's spec.unverifiedUserAnnotations. +const ExpectsServiceAnnotation = "atesystem.podcert.ate.dev/expects-service" + +type Impl struct { + kc kubernetes.Interface + caPool *localca.Pool + + // allowedNamespace is the only namespace this signer issues certificates + // for. PCRs from pods in any other namespace are denied. + allowedNamespace string + + clock clock.PassiveClock +} + +func NewImpl(kc kubernetes.Interface, caPool *localca.Pool, allowedNamespace string, clock clock.PassiveClock) *Impl { + return &Impl{ + kc: kc, + caPool: caPool, + allowedNamespace: allowedNamespace, + clock: clock, + } +} + +var _ signercontroller.SignerImpl = (*Impl)(nil) + +func (h *Impl) SignerName() string { + return Name +} + +func (h *Impl) DesiredClusterTrustBundles() []*certsv1beta1.ClusterTrustBundle { + name := CTBPrefix + "primary-bundle" + + wantTrustBundle := bytes.Buffer{} + for _, ca := range h.caPool.CAs { + block := pem.EncodeToMemory(&pem.Block{ + Type: "CERTIFICATE", + Bytes: ca.RootCertificate.Raw, + }) + _, _ = wantTrustBundle.Write(block) + } + + wantCTB := &certsv1beta1.ClusterTrustBundle{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{ + "podcert.ate.dev/canarying": "live", + }, + }, + Spec: certsv1beta1.ClusterTrustBundleSpec{ + SignerName: Name, + TrustBundle: wantTrustBundle.String(), + }, + } + + return []*certsv1beta1.ClusterTrustBundle{ + wantCTB, + } +} + +func (h *Impl) MakeCert(ctx context.Context, pcr *certsv1beta1.PodCertificateRequest) error { + // Only issue for pods in the allowed namespace. Anything else is denied + // terminally so the requesting pod fails fast instead of retrying forever. + if pcr.ObjectMeta.Namespace != h.allowedNamespace { + return h.deny(ctx, pcr, "NamespaceNotAllowed", + fmt.Sprintf("signer %s only issues certificates for pods in namespace %q", Name, h.allowedNamespace)) + } + + // Reject userAnnotation keys we don't recognize. The PodCertificateRequest + // contract asks signers to deny requests carrying unknown keys so that a + // typo or an annotation meant for a different signer fails loudly instead + // of being silently ignored. + for key := range pcr.Spec.UnverifiedUserAnnotations { + if key != ExpectsServiceAnnotation { + return h.deny(ctx, pcr, certsv1beta1.PodCertificateRequestConditionInvalidUserConfig, + fmt.Sprintf("unrecognized userAnnotation key %q; this signer only supports %q", key, ExpectsServiceAnnotation)) + } + } + + // Fetch the pod to read its labels, and to confirm the PCR refers to the + // live pod. + pod, err := h.kc.CoreV1().Pods(pcr.ObjectMeta.Namespace).Get(ctx, pcr.Spec.PodName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("while getting pod %s/%s: %w", pcr.ObjectMeta.Namespace, pcr.Spec.PodName, err) + } + if pod.ObjectMeta.UID != pcr.Spec.PodUID { + return fmt.Errorf("pod UID mismatch: expected %s, got %s", pcr.Spec.PodUID, pod.ObjectMeta.UID) + } + + dnsNames, err := h.coveringServiceDNSNames(ctx, pod) + if err != nil { + return err + } + + // The projection's userAnnotation is the sole switch for DNS SANs. A pod + // that opts in but has no covering Service yet leaves the PCR pending (the + // controller retries with backoff) until its Service exists. A pod that + // does not opt in always gets a SPIFFE-only client certificate, even if a + // Service happens to select it -- otherwise the cert contents would depend + // on apply ordering. + expectsService := pcr.Spec.UnverifiedUserAnnotations[ExpectsServiceAnnotation] == "true" + switch { + case expectsService && len(dnsNames) == 0: + return fmt.Errorf("no services select pod %s/%s yet; refusing to issue a certificate with no DNS SANs", pcr.ObjectMeta.Namespace, pcr.Spec.PodName) + case !expectsService && len(dnsNames) > 0: + slog.WarnContext(ctx, "Pod is selected by Services but is not annotated; issuing a SPIFFE-only certificate", + slog.String("pod", pcr.ObjectMeta.Namespace+"/"+pcr.Spec.PodName), + slog.String("annotation", ExpectsServiceAnnotation), + ) + dnsNames = nil + } + + // TODO: Encode the OIDC issuer of the cluster into the certificate. + + subjectPublicKey, err := podcertificate.PublicKey(pcr) + if err != nil { + return err + } + + // If our signer had an opinion on which key types were allowable, it would + // check subjectPublicKey, and deny the PCR with a SuggestedKeyType + // condition on it. + + lifetime := 24 * time.Hour + requestedLifetime := time.Duration(*pcr.Spec.MaxExpirationSeconds) * time.Second + if requestedLifetime < lifetime { + lifetime = requestedLifetime + } + + notBefore := h.clock.Now().Add(-2 * time.Minute) + notAfter := notBefore.Add(lifetime) + beginRefreshAt := notAfter.Add(-30 * time.Minute) + + // Every certificate carries the pod's SPIFFE identity. Serving pods + // (those with DNS SANs) also get ServerAuth so the same bundle works for + // both serving and client connections. + spiffeURI := &url.URL{ + Scheme: "spiffe", + Host: "cluster.local", + Path: path.Join("ns", pcr.ObjectMeta.Namespace, "sa", pcr.Spec.ServiceAccountName), + } + extKeyUsage := []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth} + if len(dnsNames) > 0 { + extKeyUsage = append(extKeyUsage, x509.ExtKeyUsageServerAuth) + } + + template := &x509.Certificate{ + BasicConstraintsValid: true, + NotBefore: notBefore, + NotAfter: notAfter, + DNSNames: dnsNames, + URIs: []*url.URL{spiffeURI}, + KeyUsage: x509.KeyUsageDigitalSignature, + ExtKeyUsage: extKeyUsage, + } + + subjectCertDER, err := x509.CreateCertificate(rand.Reader, template, h.caPool.CAs[0].RootCertificate, subjectPublicKey, h.caPool.CAs[0].SigningKey) + if err != nil { + return fmt.Errorf("while signing subject cert: %w", err) + } + + chainDER := [][]byte{subjectCertDER} + for _, intermed := range h.caPool.CAs[0].IntermediateCertificates { + chainDER = append(chainDER, intermed.Raw) + } + + chainPEM := &bytes.Buffer{} + for _, certDER := range chainDER { + err = pem.Encode(chainPEM, &pem.Block{ + Type: "CERTIFICATE", + Bytes: certDER, + }) + if err != nil { + return fmt.Errorf("while encoding certificate to PEM: %w", err) + } + } + + pcr = pcr.DeepCopy() + pcr.Status.Conditions = []metav1.Condition{ + { + Type: certsv1beta1.PodCertificateRequestConditionTypeIssued, + Status: metav1.ConditionTrue, + Reason: "Reason", + Message: "Issued", + LastTransitionTime: metav1.NewTime(h.clock.Now()), + }, + } + pcr.Status.CertificateChain = chainPEM.String() + pcr.Status.NotBefore = ptr.To(metav1.NewTime(notBefore)) + pcr.Status.BeginRefreshAt = ptr.To(metav1.NewTime(beginRefreshAt)) + pcr.Status.NotAfter = ptr.To(metav1.NewTime(notAfter)) + + _, err = h.kc.CertificatesV1beta1().PodCertificateRequests(pcr.ObjectMeta.Namespace).UpdateStatus(ctx, pcr, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("while updating PodCertificateRequest: %w", err) + } + + return nil +} + +// coveringServiceDNSNames returns the ..svc DNS name for +// every Service in the pod's namespace whose label selector matches the pod. +func (h *Impl) coveringServiceDNSNames(ctx context.Context, pod *corev1.Pod) ([]string, error) { + // TODO: Looping over every service isn't great. Maintain an index of pod + // to covering services. + svcs, err := h.kc.CoreV1().Services(pod.ObjectMeta.Namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("while listing services: %w", err) + } + + dnsNames := []string{} + for _, svc := range svcs.Items { + switch svc.Spec.Type { + case corev1.ServiceTypeClusterIP, corev1.ServiceTypeNodePort, corev1.ServiceTypeLoadBalancer: + // ok + default: + // This service type doesn't select pods using a label selector. + continue + } + + // A Service with no selector does not select pods by labels; skip it. + // (An empty selector would otherwise match every pod.) + if len(svc.Spec.Selector) == 0 { + continue + } + + if labels.SelectorFromSet(svc.Spec.Selector).Matches(labels.Set(pod.ObjectMeta.Labels)) { + // TODO: I'm making some assumptions about the DNS names that + // resolve to a given Service. I know at least one configuration + // that I suspect doesn't match these assumptions --- GKE with + // VPC-scoped Cloud DNS [1]. + // + // [1] https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-dns#vpc_scope_dns + dnsNames = append(dnsNames, fmt.Sprintf("%s.%s.svc", svc.ObjectMeta.Name, svc.ObjectMeta.Namespace)) + } + } + + return dnsNames, nil +} + +// deny sets a terminal Denied condition on the PCR. The signer controller +// treats a denied PCR as final and stops retrying. +func (h *Impl) deny(ctx context.Context, pcr *certsv1beta1.PodCertificateRequest, reason, message string) error { + pcr = pcr.DeepCopy() + pcr.Status.Conditions = []metav1.Condition{ + { + Type: certsv1beta1.PodCertificateRequestConditionTypeDenied, + Status: metav1.ConditionTrue, + Reason: reason, + Message: message, + LastTransitionTime: metav1.NewTime(h.clock.Now()), + }, + } + + _, err := h.kc.CertificatesV1beta1().PodCertificateRequests(pcr.ObjectMeta.Namespace).UpdateStatus(ctx, pcr, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("while denying PodCertificateRequest: %w", err) + } + return nil +} diff --git a/internal/servicednssigner/servicednssigner.go b/internal/servicednssigner/servicednssigner.go deleted file mode 100644 index f44fbe50b..000000000 --- a/internal/servicednssigner/servicednssigner.go +++ /dev/null @@ -1,210 +0,0 @@ -// Copyright 2026 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package servicednssigner - -import ( - "bytes" - "context" - "crypto/rand" - "crypto/x509" - "encoding/pem" - "fmt" - "time" - - "github.com/agent-substrate/substrate/internal/localca" - "github.com/agent-substrate/substrate/internal/podcertificate" - "github.com/agent-substrate/substrate/internal/signercontroller" - certsv1beta1 "k8s.io/api/certificates/v1beta1" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes" - "k8s.io/utils/clock" - "k8s.io/utils/ptr" -) - -const Name = "servicedns.podcert.ate.dev/identity" - -const CTBPrefix = "servicedns.podcert.ate.dev:identity:" - -type Impl struct { - kc kubernetes.Interface - caPool *localca.Pool - - clock clock.PassiveClock -} - -func NewImpl(kc kubernetes.Interface, caPool *localca.Pool, clock clock.PassiveClock) *Impl { - return &Impl{ - kc: kc, - caPool: caPool, - clock: clock, - } -} - -var _ signercontroller.SignerImpl = (*Impl)(nil) - -func (h *Impl) SignerName() string { - return Name -} - -func (h *Impl) DesiredClusterTrustBundles() []*certsv1beta1.ClusterTrustBundle { - name := CTBPrefix + "primary-bundle" - - wantTrustBundle := bytes.Buffer{} - for _, ca := range h.caPool.CAs { - block := pem.EncodeToMemory(&pem.Block{ - Type: "CERTIFICATE", - Bytes: ca.RootCertificate.Raw, - }) - _, _ = wantTrustBundle.Write(block) - } - - wantCTB := &certsv1beta1.ClusterTrustBundle{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Labels: map[string]string{ - "podcert.ate.dev/canarying": "live", - }, - }, - Spec: certsv1beta1.ClusterTrustBundleSpec{ - SignerName: Name, - TrustBundle: wantTrustBundle.String(), - }, - } - - return []*certsv1beta1.ClusterTrustBundle{ - wantCTB, - } -} - -func (h *Impl) MakeCert(ctx context.Context, pcr *certsv1beta1.PodCertificateRequest) error { - // TODO: Switch from live reads to indexer - - // If our signer had a policy about which pods are allowed to request - // certificates, it would be implemented here. - - svcs, err := h.kc.CoreV1().Services(pcr.ObjectMeta.Namespace).List(ctx, metav1.ListOptions{}) - if err != nil { - return fmt.Errorf("while listing services: %w", err) - } - - // TODO: Looping over every service isn't great. Maintain an index of pod - // to covering services. - - dnsNames := []string{} - for _, svc := range svcs.Items { - switch svc.Spec.Type { - case corev1.ServiceTypeClusterIP, corev1.ServiceTypeNodePort, corev1.ServiceTypeLoadBalancer: - // ok - default: - // This service type doesn't select pods using a label selector. - continue - } - - // Find the set of pods that the service selects. - matchedPods, err := h.kc.CoreV1().Pods(pcr.ObjectMeta.Namespace).List(ctx, metav1.ListOptions{ - LabelSelector: metav1.FormatLabelSelector(&metav1.LabelSelector{MatchLabels: svc.Spec.Selector}), - }) - if err != nil { - return fmt.Errorf("while selecting pods for service %q: %w", pcr.ObjectMeta.Namespace+"/"+svc.ObjectMeta.Name, err) - } - - for _, matchedPod := range matchedPods.Items { - if matchedPod.ObjectMeta.Name == pcr.Spec.PodName && matchedPod.ObjectMeta.UID == pcr.Spec.PodUID { - // TODO: I'm making some assumptions about the DNS names that - // resolve to a given Service. I know at least one - // configuration that I suspect doesn't match these assumptions - // --- GKE with VPC-scoped Cloud DNS [1]. - // - // [1] https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-dns#vpc_scope_dns - name := fmt.Sprintf("%s.%s.svc", svc.ObjectMeta.Name, svc.ObjectMeta.Namespace) - dnsNames = append(dnsNames, name) - } - } - } - - // TODO: Encode the OIDC issuer of the cluster into the certificate. - - subjectPublicKey, err := podcertificate.PublicKey(pcr) - if err != nil { - return err - } - - // If our signer had an opinion on which key types were allowable, it would - // check subjectPublicKey, and deny the PCR with a SuggestedKeyType - // condition on it. - - lifetime := 24 * time.Hour - requestedLifetime := time.Duration(*pcr.Spec.MaxExpirationSeconds) * time.Second - if requestedLifetime < lifetime { - lifetime = requestedLifetime - } - - notBefore := h.clock.Now().Add(-2 * time.Minute) - notAfter := notBefore.Add(lifetime) - beginRefreshAt := notAfter.Add(-30 * time.Minute) - - template := &x509.Certificate{ - BasicConstraintsValid: true, - NotBefore: notBefore, - NotAfter: notAfter, - DNSNames: dnsNames, - KeyUsage: x509.KeyUsageDigitalSignature, - ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth, x509.ExtKeyUsageServerAuth}, - } - - subjectCertDER, err := x509.CreateCertificate(rand.Reader, template, h.caPool.CAs[0].RootCertificate, subjectPublicKey, h.caPool.CAs[0].SigningKey) - if err != nil { - return fmt.Errorf("while signing subject cert: %w", err) - } - - chainDER := [][]byte{subjectCertDER} - for _, intermed := range h.caPool.CAs[0].IntermediateCertificates { - chainDER = append(chainDER, intermed.Raw) - } - - chainPEM := &bytes.Buffer{} - for _, certDER := range chainDER { - err = pem.Encode(chainPEM, &pem.Block{ - Type: "CERTIFICATE", - Bytes: certDER, - }) - if err != nil { - return fmt.Errorf("while encoding certificate to PEM: %w", err) - } - } - - pcr = pcr.DeepCopy() - pcr.Status.Conditions = []metav1.Condition{ - { - Type: certsv1beta1.PodCertificateRequestConditionTypeIssued, - Status: metav1.ConditionTrue, - Reason: "Reason", - Message: "Issued", - LastTransitionTime: metav1.NewTime(h.clock.Now()), - }, - } - pcr.Status.CertificateChain = chainPEM.String() - pcr.Status.NotBefore = ptr.To(metav1.NewTime(notBefore)) - pcr.Status.BeginRefreshAt = ptr.To(metav1.NewTime(beginRefreshAt)) - pcr.Status.NotAfter = ptr.To(metav1.NewTime(notAfter)) - - _, err = h.kc.CertificatesV1beta1().PodCertificateRequests(pcr.ObjectMeta.Namespace).UpdateStatus(ctx, pcr, metav1.UpdateOptions{}) - if err != nil { - return fmt.Errorf("while updating PodCertificateRequest: %w", err) - } - - return nil -} diff --git a/internal/podidentitysigner/podidentitysigner.go b/internal/workerpoolsigner/workerpoolsigner.go similarity index 96% rename from internal/podidentitysigner/podidentitysigner.go rename to internal/workerpoolsigner/workerpoolsigner.go index 28f2e5d32..69a02b112 100644 --- a/internal/podidentitysigner/podidentitysigner.go +++ b/internal/workerpoolsigner/workerpoolsigner.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package podidentitysigner +package workerpoolsigner import ( "bytes" @@ -35,8 +35,8 @@ import ( "k8s.io/utils/ptr" ) -const Name = "podidentity.podcert.ate.dev/identity" -const CTBPrefix = "podidentity.podcert.ate.dev:identity:" +const Name = "workerpool.podcert.ate.dev/identity" +const CTBPrefix = "workerpool.podcert.ate.dev:identity:" type Impl struct { kc kubernetes.Interface @@ -89,6 +89,7 @@ func (h *Impl) DesiredClusterTrustBundles() []*certsv1beta1.ClusterTrustBundle { } } +// TODO: Only MakeCert if the Pod belongs to a Workerpool. func (h *Impl) MakeCert(ctx context.Context, pcr *certsv1beta1.PodCertificateRequest) error { // Fetch the pod to get its ServiceAccount pod, err := h.kc.CoreV1().Pods(pcr.ObjectMeta.Namespace).Get(ctx, pcr.Spec.PodName, metav1.GetOptions{}) diff --git a/manifests/ate-install/ate-api-server.yaml b/manifests/ate-install/ate-api-server.yaml index 8d3c17086..7fb840848 100644 --- a/manifests/ate-install/ate-api-server.yaml +++ b/manifests/ate-install/ate-api-server.yaml @@ -77,7 +77,7 @@ spec: image: ko://github.com/agent-substrate/substrate/cmd/ateapi args: - "--grpc-listen-addr=0.0.0.0:443" - - "--grpc-server-cred-bundle=/run/servicedns.podcert.ate.dev/credential-bundle.pem" + - "--grpc-server-cred-bundle=/run/atesystem.podcert.ate.dev/credential-bundle.pem" - --redis-cluster-address=@env - --redis-ca-certs=/etc/valkey-ca/ca.crt - --redis-use-iam-auth=@env @@ -87,7 +87,8 @@ spec: - --client-jwt-audience=api.ate-system.svc - --session-id-jwt-pool=/run/session-id-jwt-pool/pool.json - --session-id-ca-pool=/run/session-id-ca-pool/pool.json - - --workerpool-ca-certs=/run/workerpool-ca-certs/trust-bundle.pem + - --client-ca-certs=/run/client-ca-certs/workerpool-trust-bundle.pem,/run/client-ca-certs/atesystem-trust-bundle.pem + - --workerpool-ca-certs=/run/client-ca-certs/workerpool-trust-bundle.pem env: - name: POD_NAME valueFrom: @@ -114,8 +115,8 @@ spec: name: ate-api-server-envvars optional: true volumeMounts: - - name: "servicedns" - mountPath: "/run/servicedns.podcert.ate.dev" + - name: "atesystem-identity" + mountPath: "/run/atesystem.podcert.ate.dev" - name: "session-id-jwt-pool" mountPath: "/run/session-id-jwt-pool" # Note: See README.md for how to generate this secret. @@ -125,8 +126,8 @@ spec: - name: "session-id-ca-pool" mountPath: "/run/session-id-ca-pool" readOnly: true - - name: "workerpool-ca-certs" - mountPath: "/run/workerpool-ca-certs" + - name: "client-ca-certs" + mountPath: "/run/client-ca-certs" readOnly: true ports: - containerPort: 443 @@ -139,13 +140,15 @@ spec: initialDelaySeconds: 5 periodSeconds: 2 volumes: - - name: "servicedns" + - name: "atesystem-identity" projected: sources: - podCertificate: - signerName: servicedns.podcert.ate.dev/identity + signerName: atesystem.podcert.ate.dev/identity keyType: ECDSAP256 credentialBundlePath: credential-bundle.pem + userAnnotations: + atesystem.podcert.ate.dev/expects-service: "true" - name: "session-id-jwt-pool" projected: sources: @@ -170,17 +173,23 @@ spec: items: - key: "pool" path: "pool.json" - - name: "workerpool-ca-certs" + - name: "client-ca-certs" projected: sources: - clusterTrustBundle: - signerName: podidentity.podcert.ate.dev/identity + signerName: workerpool.podcert.ate.dev/identity labelSelector: matchLabels: podcert.ate.dev/canarying: live - path: trust-bundle.pem + path: workerpool-trust-bundle.pem + - clusterTrustBundle: + signerName: atesystem.podcert.ate.dev/identity + labelSelector: + matchLabels: + podcert.ate.dev/canarying: live + path: atesystem-trust-bundle.pem --- -# 6. Expose the Session Assigner +# 6. Expose the ate-api-server Service/ apiVersion: v1 kind: Service metadata: diff --git a/manifests/ate-install/atenet-router.yaml b/manifests/ate-install/atenet-router.yaml index 43f69ab5e..cc5e3baa6 100644 --- a/manifests/ate-install/atenet-router.yaml +++ b/manifests/ate-install/atenet-router.yaml @@ -134,7 +134,7 @@ spec: - "--ateapi-address=api.ate-system.svc:443" - "--status-port=4040" - "--port-https=8443" - - "--envoy-cert-path=/run/servicedns.podcert.ate.dev/credential-bundle.pem" + - "--envoy-cert-path=/run/atesystem.podcert.ate.dev/credential-bundle.pem" env: - name: POD_NAME valueFrom: @@ -179,19 +179,23 @@ spec: volumeMounts: - name: envoy-config mountPath: /etc/envoy - - name: "servicedns" - mountPath: "/run/servicedns.podcert.ate.dev" + - name: atesystem-identity + mountPath: "/run/atesystem.podcert.ate.dev" volumes: - name: envoy-config configMap: name: atenet-router-envoy-config - - name: "servicedns" + # Serving certificate presented by envoy on the https listener, issued by + # the merged atesystem identity signer. + - name: atesystem-identity projected: sources: - podCertificate: - signerName: servicedns.podcert.ate.dev/identity + signerName: atesystem.podcert.ate.dev/identity keyType: ECDSAP256 credentialBundlePath: credential-bundle.pem + userAnnotations: + atesystem.podcert.ate.dev/expects-service: "true" --- apiVersion: v1 kind: Service diff --git a/manifests/ate-install/pod-certificate-controller.yaml b/manifests/ate-install/pod-certificate-controller.yaml index 17c7b6fd2..4f67e8cb6 100644 --- a/manifests/ate-install/pod-certificate-controller.yaml +++ b/manifests/ate-install/pod-certificate-controller.yaml @@ -63,8 +63,8 @@ rules: resources: - signers resourceNames: - - servicedns.podcert.ate.dev/* - - podidentity.podcert.ate.dev/* + - atesystem.podcert.ate.dev/* + - workerpool.podcert.ate.dev/* verbs: - sign - attest @@ -146,8 +146,8 @@ spec: - --sharding-pod-name=$(POD_NAME) - --sharding-pod-uid=$(POD_UID) - --sharding-application-name=podcertificate-controller - - --service-dns-ca-pool=/run/ca-state/service-dns-pool.json - - --pod-identity-ca-pool=/run/ca-state/pod-identity-pool.json + - --atesystem-ca-pool=/run/ca-state/atesystem-pool.json + - --workerpool-ca-pool=/run/ca-state/workerpool-pool.json env: - name: POD_NAMESPACE valueFrom: @@ -177,15 +177,15 @@ spec: projected: sources: - secret: - name: "service-dns-ca-pool" + name: "atesystem-identity-ca-pool" items: - key: "pool" - path: "service-dns-pool.json" + path: "atesystem-pool.json" - secret: - name: "pod-identity-ca-pool" + name: "workerpool-ca-pool" items: - key: "pool" - path: "pod-identity-pool.json" + path: "workerpool-pool.json" dnsPolicy: Default nodeSelector: kubernetes.io/os: linux diff --git a/manifests/ate-install/valkey.yaml b/manifests/ate-install/valkey.yaml index ac649a555..dbeb75104 100644 --- a/manifests/ate-install/valkey.yaml +++ b/manifests/ate-install/valkey.yaml @@ -26,8 +26,8 @@ data: tls-replication yes # Load certificates from projected volume - tls-cert-file /run/servicedns.podcert.ate.dev/credential-bundle.pem - tls-key-file /run/servicedns.podcert.ate.dev/credential-bundle.pem + tls-cert-file /run/atesystem.podcert.ate.dev/credential-bundle.pem + tls-key-file /run/atesystem.podcert.ate.dev/credential-bundle.pem tls-ca-cert-file /etc/valkey-ca/ca.crt tls-auth-clients yes @@ -97,8 +97,8 @@ spec: volumeMounts: - name: config mountPath: /etc/valkey - - name: servicedns - mountPath: /run/servicedns.podcert.ate.dev + - name: atesystem-identity + mountPath: /run/atesystem.podcert.ate.dev - name: valkey-ca-certs mountPath: /etc/valkey-ca readOnly: true @@ -108,13 +108,15 @@ spec: - name: config configMap: name: valkey-config - - name: servicedns + - name: atesystem-identity projected: sources: - podCertificate: - signerName: servicedns.podcert.ate.dev/identity + signerName: atesystem.podcert.ate.dev/identity keyType: ECDSAP256 credentialBundlePath: credential-bundle.pem + userAnnotations: + atesystem.podcert.ate.dev/expects-service: "true" - name: valkey-ca-certs projected: sources: @@ -148,8 +150,8 @@ spec: - name: init image: valkey/valkey:8.0 volumeMounts: - - name: servicedns - mountPath: /run/servicedns.podcert.ate.dev + - name: atesystem-identity + mountPath: /run/atesystem.podcert.ate.dev - name: valkey-ca-certs mountPath: /etc/valkey-ca readOnly: true @@ -174,19 +176,19 @@ spec: done echo "Checking if Valkey cluster is already initialized..." - until valkey-cli --tls --cacert /etc/valkey-ca/ca.crt --cert /run/servicedns.podcert.ate.dev/credential-bundle.pem --key /run/servicedns.podcert.ate.dev/credential-bundle.pem -h valkey-cluster-0.valkey-cluster-service.ate-system.svc ping >/dev/null 2>&1; do + until valkey-cli --tls --cacert /etc/valkey-ca/ca.crt --cert /run/atesystem.podcert.ate.dev/credential-bundle.pem --key /run/atesystem.podcert.ate.dev/credential-bundle.pem -h valkey-cluster-0.valkey-cluster-service.ate-system.svc ping >/dev/null 2>&1; do echo "Waiting for valkey-cluster-0 to respond to ping..." sleep 2 done - INIT_STATUS=$(valkey-cli --tls --cacert /etc/valkey-ca/ca.crt --cert /run/servicedns.podcert.ate.dev/credential-bundle.pem --key /run/servicedns.podcert.ate.dev/credential-bundle.pem -h valkey-cluster-0.valkey-cluster-service.ate-system.svc cluster info 2>/dev/null | grep cluster_state || true) + INIT_STATUS=$(valkey-cli --tls --cacert /etc/valkey-ca/ca.crt --cert /run/atesystem.podcert.ate.dev/credential-bundle.pem --key /run/atesystem.podcert.ate.dev/credential-bundle.pem -h valkey-cluster-0.valkey-cluster-service.ate-system.svc cluster info 2>/dev/null | grep cluster_state || true) if [ -z "${INIT_STATUS}" ] || ! echo "${INIT_STATUS}" | grep -q "cluster_state:ok"; then echo "Initializing Valkey cluster..." valkey-cli --tls \ --cacert /etc/valkey-ca/ca.crt \ - --cert /run/servicedns.podcert.ate.dev/credential-bundle.pem \ - --key /run/servicedns.podcert.ate.dev/credential-bundle.pem \ + --cert /run/atesystem.podcert.ate.dev/credential-bundle.pem \ + --key /run/atesystem.podcert.ate.dev/credential-bundle.pem \ --cluster create ${POD_IPS} \ --cluster-replicas 1 \ --cluster-yes @@ -195,11 +197,14 @@ spec: echo "Cluster already initialized." fi volumes: - - name: servicedns + # No expects-service annotation: this Job pod is not behind any Service, + # so it receives a SPIFFE-only client certificate, which is all valkey-cli + # needs to authenticate to the cluster. + - name: atesystem-identity projected: sources: - podCertificate: - signerName: servicedns.podcert.ate.dev/identity + signerName: atesystem.podcert.ate.dev/identity keyType: ECDSAP256 credentialBundlePath: credential-bundle.pem - name: valkey-ca-certs From c8ee6eb6fedcc581437b3c70fc083994be9ece46 Mon Sep 17 00:00:00 2001 From: zoezhao Date: Mon, 15 Jun 2026 08:59:03 -0700 Subject: [PATCH 2/3] Establish mutual TLS between the atenet router and ate-apiserver --- cmd/atenet/internal/router.go | 7 +- cmd/atenet/internal/router/router.go | 104 +++++++++------------- cmd/atenet/internal/router/status_test.go | 52 +++++++++-- cmd/atenet/internal/router/xds.go | 33 ++----- cmd/atenet/internal/router/xds_test.go | 2 +- internal/credbundle/credbundle.go | 10 +++ manifests/ate-install/atenet-router.yaml | 13 ++- 7 files changed, 120 insertions(+), 101 deletions(-) diff --git a/cmd/atenet/internal/router.go b/cmd/atenet/internal/router.go index 5cff97233..7028e70d3 100644 --- a/cmd/atenet/internal/router.go +++ b/cmd/atenet/internal/router.go @@ -55,7 +55,10 @@ func NewRouterCmd() *cobra.Command { cmd.Flags().IntVar(&cfg.StatusPort, "status-port", 4040, "Port to serve /statusz on (set <= 0 to disable serving status)") cmd.Flags().DurationVar(&cfg.HealthInterval, "health-interval", 1*time.Second, "Interval for checking health of dependent services") cmd.Flags().IntVar(&cfg.HttpsPort, "port-https", 8443, "TCP port for HTTPS workload traffic entering through the Envoy Router") - cmd.Flags().StringVar(&cfg.EnvoyCertPath, "envoy-cert-path", "", "Path to the Envoy certificate file (if empty, a self-signed cert will be generated for testing)") - + cmd.Flags().StringVar(&cfg.EnvoyCertPath, "envoy-cert-path", "", "Path to the Envoy certificate file") + cmd.Flags().StringVar(&cfg.AteapiServerCAPath, "ate-apiserver-ca-path", "", "Path to the CA bundle used to verify the ateapi server certificate.") + cmd.Flags().StringVar(&cfg.AteapiClientCredPath, "ateapi-client-cred-bundle", "", "Path to the credential bundle presented as this router's client certificate when talking to the ateapi server.") + cmd.MarkFlagRequired("envoy-cert-path") + cmd.MarkFlagRequired("ate-apiserver-ca-path") return cmd } diff --git a/cmd/atenet/internal/router/router.go b/cmd/atenet/internal/router/router.go index 797ce36c0..844c2780a 100644 --- a/cmd/atenet/internal/router/router.go +++ b/cmd/atenet/internal/router/router.go @@ -16,16 +16,11 @@ package router import ( "context" - "crypto/rand" - "crypto/rsa" "crypto/tls" "crypto/x509" - "crypto/x509/pkix" - "encoding/pem" "errors" "fmt" "log/slog" - "math/big" "net" "net/http" "os" @@ -46,6 +41,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/config" + "github.com/agent-substrate/substrate/internal/credbundle" "github.com/agent-substrate/substrate/internal/serverboot" v1alpha1 "github.com/agent-substrate/substrate/pkg/api/v1alpha1" "github.com/agent-substrate/substrate/pkg/proto/ateapipb" @@ -62,22 +58,24 @@ func init() { // RouterConfig holds deployment setup and endpoint options for the router node instance. type RouterConfig struct { - Standalone bool - Namespace string - Kubeconfig string - AteapiAddr string - HttpPort int - XdsPort int - ExtprocPort int - ExtprocAddr string - EnvoyImage string - TemplatesFile string - StatusPort int - HealthInterval time.Duration - HttpsPort int - EnvoyCertPath string - LogLevel string - MetricsAddr string + Standalone bool + Namespace string + Kubeconfig string + AteapiAddr string + HttpPort int + XdsPort int + ExtprocPort int + ExtprocAddr string + EnvoyImage string + TemplatesFile string + StatusPort int + HealthInterval time.Duration + HttpsPort int + EnvoyCertPath string + LogLevel string + MetricsAddr string + AteapiServerCAPath string + AteapiClientCredPath string } // RouterServer instantiates and coordinates runtime threads executing system modules. @@ -125,7 +123,13 @@ func NewRouterServer(cfg RouterConfig) (*RouterServer, error) { } } - conn, err := grpc.NewClient(cfg.AteapiAddr, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{InsecureSkipVerify: true}))) + tlsCfg, err := buildTLSConfig(cfg) + if err != nil { + slog.Error("Failed to build TLS config", slog.String("err", err.Error())) + return nil, err + } + conn, err := grpc.NewClient(cfg.AteapiAddr, grpc.WithTransportCredentials(credentials.NewTLS(tlsCfg))) + if err != nil { return nil, fmt.Errorf("failed to establish grpc channel to ateapi client: %w", err) } @@ -188,17 +192,7 @@ func (s *RouterServer) Run(ctx context.Context) error { xdsSrv := NewXdsServer(s.cfg.XdsPort) xdsSrv.SetConfig(s.cfg.HttpPort, s.cfg.ExtprocPort, s.cfg.ExtprocAddr) - var certContent, keyContent string - if s.cfg.EnvoyCertPath == "" { - slog.InfoContext(ctx, "No Envoy certificate path provided, generating self-signed certificate for testing") - var err error - certContent, keyContent, err = generateSelfSignedCert() - if err != nil { - return fmt.Errorf("failed to generate self-signed cert: %w", err) - } - } - - xdsSrv.SetTlsConfig(s.cfg.HttpsPort, s.cfg.EnvoyCertPath, certContent, keyContent) + xdsSrv.SetTlsConfig(s.cfg.HttpsPort, s.cfg.EnvoyCertPath) if s.extprocSrv == nil { routeDuration, err := newRouteDurationHistogram() if err != nil { @@ -275,38 +269,22 @@ func (s *RouterServer) Run(ctx context.Context) error { return g.Wait() } -func generateSelfSignedCert() (string, string, error) { - priv, err := rsa.GenerateKey(rand.Reader, 2048) - if err != nil { - return "", "", err - } - - template := x509.Certificate{ - SerialNumber: big.NewInt(1), - Subject: pkix.Name{ - Organization: []string{"Substrate Local Test"}, - }, - NotBefore: time.Now(), - NotAfter: time.Now().Add(time.Hour * 24 * 365), - - KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, - ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, - BasicConstraintsValid: true, - DNSNames: []string{"localhost"}, +func buildTLSConfig(cfg RouterConfig) (*tls.Config, error) { + tlsCfg := &tls.Config{MinVersion: tls.VersionTLS12} + if cfg.AteapiServerCAPath == "" { + return nil, fmt.Errorf("ateapi server CA path not configured") } - - derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &priv.PublicKey, priv) + caBytes, err := os.ReadFile(cfg.AteapiServerCAPath) if err != nil { - return "", "", err + return nil, fmt.Errorf("failed to read ateapi server CA bundle: %w", err) } - - certPem := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes}) - - privBytes, err := x509.MarshalPKCS8PrivateKey(priv) - if err != nil { - return "", "", err + roots := x509.NewCertPool() + if ok := roots.AppendCertsFromPEM(caBytes); !ok { + return nil, fmt.Errorf("failed to parse ateapi server CA bundle from %s", cfg.AteapiServerCAPath) } - keyPem := pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: privBytes}) - - return string(certPem), string(keyPem), nil + tlsCfg.RootCAs = roots + if cfg.AteapiClientCredPath != "" { + tlsCfg.GetClientCertificate = credbundle.ClientLoader(cfg.AteapiClientCredPath) + } + return tlsCfg, nil } diff --git a/cmd/atenet/internal/router/status_test.go b/cmd/atenet/internal/router/status_test.go index 5ce878d72..67d8c9f9f 100644 --- a/cmd/atenet/internal/router/status_test.go +++ b/cmd/atenet/internal/router/status_test.go @@ -16,12 +16,20 @@ package router import ( "context" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/x509" + "crypto/x509/pkix" "encoding/json" + "encoding/pem" "fmt" "io" + "math/big" "net" "net/http" "os" + "path/filepath" "strings" "testing" "time" @@ -48,13 +56,14 @@ func TestStatuszEndpoint(t *testing.T) { tmpFile.Close() cfg := RouterConfig{ - Standalone: true, - Namespace: "default", - StatusPort: httpPort, - HttpPort: 8080, - XdsPort: 18000, - ExtprocPort: 50051, - TemplatesFile: tmpFile.Name(), + Standalone: true, + Namespace: "default", + StatusPort: httpPort, + HttpPort: 8080, + XdsPort: 18000, + ExtprocPort: 50051, + TemplatesFile: tmpFile.Name(), + AteapiServerCAPath: writeTestCA(t), } srv, err := NewRouterServer(cfg) @@ -140,3 +149,32 @@ func TestStatuszEndpoint(t *testing.T) { t.Errorf("Target parameters unassigned inside context payload context properties: found %s", dashboard.Queries[0].Target) } } + +// writeTestCA writes a self-signed CA certificate PEM to a temp file, for +// configuring AteapiServerCAPath in tests. +func writeTestCA(t *testing.T) string { + t.Helper() + key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + t.Fatalf("generating CA key: %v", err) + } + template := &x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{CommonName: "test-ca"}, + NotBefore: time.Now().Add(-time.Hour), + NotAfter: time.Now().Add(time.Hour), + IsCA: true, + BasicConstraintsValid: true, + KeyUsage: x509.KeyUsageCertSign, + } + der, err := x509.CreateCertificate(rand.Reader, template, template, &key.PublicKey, key) + if err != nil { + t.Fatalf("creating CA certificate: %v", err) + } + path := filepath.Join(t.TempDir(), "ca.pem") + pemBytes := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der}) + if err := os.WriteFile(path, pemBytes, 0o600); err != nil { + t.Fatalf("writing CA file: %v", err) + } + return path +} diff --git a/cmd/atenet/internal/router/xds.go b/cmd/atenet/internal/router/xds.go index 964fc5e92..02616754e 100644 --- a/cmd/atenet/internal/router/xds.go +++ b/cmd/atenet/internal/router/xds.go @@ -76,10 +76,8 @@ type XdsServer struct { mu sync.Mutex - httpsPort int - certPath string - certContent string - keyContent string + httpsPort int + certPath string } func NewXdsServer(xdsPort int) *XdsServer { @@ -104,13 +102,11 @@ func (x *XdsServer) SetConfig(ingressPort int, extprocPort int, extprocAddr stri x.extprocAddr = extprocAddr } -func (x *XdsServer) SetTlsConfig(httpsPort int, certPath string, certContent string, keyContent string) { +func (x *XdsServer) SetTlsConfig(httpsPort int, certPath string) { x.mu.Lock() defer x.mu.Unlock() x.httpsPort = httpsPort x.certPath = certPath - x.certContent = certContent - x.keyContent = keyContent } func (x *XdsServer) UpdateSnapshot() error { @@ -453,30 +449,15 @@ func (x *XdsServer) buildHttpsListener() *listenerv3.Listener { } func (x *XdsServer) buildTlsCertificate() *tlsv3.TlsCertificate { - if x.certPath != "" { - return &tlsv3.TlsCertificate{ - CertificateChain: &corev3.DataSource{ - Specifier: &corev3.DataSource_Filename{ - Filename: x.certPath, - }, - }, - PrivateKey: &corev3.DataSource{ - Specifier: &corev3.DataSource_Filename{ - Filename: x.certPath, // Assuming combined file - }, - }, - } - } - return &tlsv3.TlsCertificate{ CertificateChain: &corev3.DataSource{ - Specifier: &corev3.DataSource_InlineString{ - InlineString: x.certContent, + Specifier: &corev3.DataSource_Filename{ + Filename: x.certPath, }, }, PrivateKey: &corev3.DataSource{ - Specifier: &corev3.DataSource_InlineString{ - InlineString: x.keyContent, + Specifier: &corev3.DataSource_Filename{ + Filename: x.certPath, }, }, } diff --git a/cmd/atenet/internal/router/xds_test.go b/cmd/atenet/internal/router/xds_test.go index 92e347648..9a2c2460f 100644 --- a/cmd/atenet/internal/router/xds_test.go +++ b/cmd/atenet/internal/router/xds_test.go @@ -141,7 +141,7 @@ func TestXdsServer_UpdateSnapshot(t *testing.T) { func TestXdsServer_UpdateSnapshot_WithHttps(t *testing.T) { server := NewXdsServer(18000) server.SetConfig(8085, 50053, "127.0.0.1") - server.SetTlsConfig(8443, "", "dummy-cert", "dummy-key") + server.SetTlsConfig(8443, "") err := server.UpdateSnapshot() if err != nil { diff --git a/internal/credbundle/credbundle.go b/internal/credbundle/credbundle.go index 58d86df76..4d7a1dea7 100644 --- a/internal/credbundle/credbundle.go +++ b/internal/credbundle/credbundle.go @@ -38,6 +38,16 @@ func Loader(path string) func(*tls.ClientHelloInfo) (*tls.Certificate, error) { } } +// ClientLoader reads a private key and certificate chain from a credential bundle file as written +// by the Kubernetes Pod Certificates mechanism. +// +// Returns a function that can be used as GetClientCertificate in a tls.Config. +func ClientLoader(path string) func(*tls.CertificateRequestInfo) (*tls.Certificate, error) { + return func(_ *tls.CertificateRequestInfo) (*tls.Certificate, error) { + return Parse(path) + } +} + // Parse reads a private key and certificate chain from a credential bundle file as written by the // Kubernetes Pod Certificates mechanism. func Parse(bundlePath string) (*tls.Certificate, error) { diff --git a/manifests/ate-install/atenet-router.yaml b/manifests/ate-install/atenet-router.yaml index cc5e3baa6..2022931ca 100644 --- a/manifests/ate-install/atenet-router.yaml +++ b/manifests/ate-install/atenet-router.yaml @@ -135,6 +135,8 @@ spec: - "--status-port=4040" - "--port-https=8443" - "--envoy-cert-path=/run/atesystem.podcert.ate.dev/credential-bundle.pem" + - "--ate-apiserver-ca-path=/run/atesystem.podcert.ate.dev/ateapi-ca-trust-bundle.pem" + - "--ateapi-client-cred-bundle=/run/atesystem.podcert.ate.dev/credential-bundle.pem" env: - name: POD_NAME valueFrom: @@ -161,6 +163,9 @@ spec: containerPort: 4040 - name: metrics containerPort: 9090 + volumeMounts: + - name: atesystem-identity + mountPath: "/run/atesystem.podcert.ate.dev" - name: envoy image: envoyproxy/envoy:v1.30-latest command: @@ -185,8 +190,6 @@ spec: - name: envoy-config configMap: name: atenet-router-envoy-config - # Serving certificate presented by envoy on the https listener, issued by - # the merged atesystem identity signer. - name: atesystem-identity projected: sources: @@ -196,6 +199,12 @@ spec: credentialBundlePath: credential-bundle.pem userAnnotations: atesystem.podcert.ate.dev/expects-service: "true" + - clusterTrustBundle: + signerName: atesystem.podcert.ate.dev/identity + labelSelector: + matchLabels: + podcert.ate.dev/canarying: live + path: ateapi-ca-trust-bundle.pem --- apiVersion: v1 kind: Service From 88c6034cb230dcde3a84f1e30ab1d94d7b265215 Mon Sep 17 00:00:00 2001 From: zoezhao Date: Mon, 15 Jun 2026 08:59:37 -0700 Subject: [PATCH 3/3] Rename ate.dev/worker-pool label to ate.dev/workerpool. --- cmd/ateapi/internal/controlapi/functional_test.go | 2 +- cmd/ateapi/internal/controlapi/informer.go | 8 ++++---- internal/controllers/workerpool_controller.go | 4 ++-- internal/controllers/workerpool_controller_test.go | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cmd/ateapi/internal/controlapi/functional_test.go b/cmd/ateapi/internal/controlapi/functional_test.go index 1c49a26c7..314255770 100644 --- a/cmd/ateapi/internal/controlapi/functional_test.go +++ b/cmd/ateapi/internal/controlapi/functional_test.go @@ -420,7 +420,7 @@ func createWorkerPod(t *testing.T, tc *testContext, ns string, name string, node Name: name, Namespace: ns, Labels: map[string]string{ - "ate.dev/worker-pool": "pool1", + "ate.dev/workerpool": "pool1", }, }, Spec: corev1.PodSpec{ diff --git a/cmd/ateapi/internal/controlapi/informer.go b/cmd/ateapi/internal/controlapi/informer.go index 1f082cdd0..86216201e 100644 --- a/cmd/ateapi/internal/controlapi/informer.go +++ b/cmd/ateapi/internal/controlapi/informer.go @@ -27,9 +27,9 @@ import ( const ( ateletNamespace = "ate-system" byNamespaceAndName = "by-namespace-and-name" - byWorkerPool = "by-worker-pool" + byWorkerPool = "by-workerpool" byNode = "by-node" - workerPodLabel = "ate.dev/worker-pool" + workerPodLabel = "ate.dev/workerpool" ) // AteletInformer creates a SharedInformerFactory and SharedIndexInformer for Atelet pods. @@ -54,7 +54,7 @@ func AteletInformer(kc kubernetes.Interface) (informers.SharedInformerFactory, c func WorkerPodInformer(kc kubernetes.Interface) (informers.SharedInformerFactory, cache.SharedIndexInformer) { factory := informers.NewSharedInformerFactoryWithOptions(kc, 5*time.Minute, informers.WithTweakListOptions(func(options *metav1.ListOptions) { - options.LabelSelector = "ate.dev/worker-pool" + options.LabelSelector = "ate.dev/workerpool" }), ) workerPodInformer := factory.Core().V1().Pods().Informer() @@ -66,7 +66,7 @@ func WorkerPodInformer(kc kubernetes.Interface) (informers.SharedInformerFactory }, byWorkerPool: func(obj any) ([]string, error) { pod := obj.(*corev1.Pod) - workerPoolRef := pod.ObjectMeta.Namespace + "/" + pod.ObjectMeta.Labels["ate.dev/worker-pool"] + workerPoolRef := pod.ObjectMeta.Namespace + "/" + pod.ObjectMeta.Labels["ate.dev/workerpool"] return []string{workerPoolRef}, nil }, }) diff --git a/internal/controllers/workerpool_controller.go b/internal/controllers/workerpool_controller.go index e72d2befd..c0e70fcd8 100644 --- a/internal/controllers/workerpool_controller.go +++ b/internal/controllers/workerpool_controller.go @@ -131,10 +131,10 @@ func buildDeploymentApplyConfig(wp *atev1alpha1.WorkerPool) *appsv1ac.Deployment WithSpec(appsv1ac.DeploymentSpec(). WithReplicas(wp.Spec.Replicas). WithSelector(metav1ac.LabelSelector(). - WithMatchLabels(map[string]string{"ate.dev/worker-pool": wp.Name})). + WithMatchLabels(map[string]string{"ate.dev/workerpool": wp.Name})). WithTemplate(corev1ac.PodTemplateSpec(). WithLabels(map[string]string{ - "ate.dev/worker-pool": wp.Name, + "ate.dev/workerpool": wp.Name, }). WithSpec(corev1ac.PodSpec(). WithContainers(corev1ac.Container(). diff --git a/internal/controllers/workerpool_controller_test.go b/internal/controllers/workerpool_controller_test.go index 026d9eb52..91de788a2 100644 --- a/internal/controllers/workerpool_controller_test.go +++ b/internal/controllers/workerpool_controller_test.go @@ -132,7 +132,7 @@ func TestWorkerPoolCreatesDeployment(t *testing.T) { if container.Image != "ateom:v1" || container.Name != "ateom" { return false, nil } - if dep.Spec.Template.Labels["ate.dev/worker-pool"] != wp.Name { + if dep.Spec.Template.Labels["ate.dev/workerpool"] != wp.Name { return false, nil } if len(dep.OwnerReferences) == 0 || dep.OwnerReferences[0].Name != wp.Name {