diff --git a/chart/templates/deployment-operator-controller-manager.yaml b/chart/templates/deployment-operator-controller-manager.yaml
index e3d51af..d1e5e9d 100644
--- a/chart/templates/deployment-operator-controller-manager.yaml
+++ b/chart/templates/deployment-operator-controller-manager.yaml
@@ -38,6 +38,9 @@ spec:
         - --redirect-ingress-class={{ .Values.redirect.ingressClass }}
         - --redirect-cluster-issuer={{ .Values.redirect.clusterIssuer.name }}
         {{- end }}
+        {{- if .Values.redirect.blockedIPv6CIDRs }}
+        - --redirect-blocked-ipv6={{ join "," .Values.redirect.blockedIPv6CIDRs }}
+        {{- end }}
         {{- if .Values.redirect.namespace }}
         - --redirect-namespace={{ .Values.redirect.namespace }}
         {{- end }}
diff --git a/chart/values.yaml b/chart/values.yaml
index eb3be50..e83e90f 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -140,6 +140,7 @@ operatorApi:
 redirect:
   namespace: "deco-redirect-system"
   ingressClass: "" # set to enable DecoRedirect controller (e.g. "redirect-nginx")
+  blockedIPv6CIDRs: [] # IPv6 CIDRs that block cert issuance when present in AAAA records (e.g. ["2600:1901::/32"])
   clusterIssuer:
     enabled: false # set true to create the Let's Encrypt ClusterIssuer
     name: "" # ClusterIssuer name (e.g. "letsencrypt")
diff --git a/cmd/main.go b/cmd/main.go
index a937366..bd1e99c 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -21,6 +21,7 @@ import (
 	"crypto/tls"
 	"flag"
 	"fmt"
+	"net"
 	"os"
 	"path/filepath"
 	"strings"
@@ -135,6 +136,10 @@ func main() {
 	flag.StringVar(&redirectClusterIssuer, "redirect-cluster-issuer",
 		getEnvOrDefault("REDIRECT_CLUSTER_ISSUER", "letsencrypt"),
 		"cert-manager ClusterIssuer name (matches redirect.clusterIssuer.name in values).")
+	var redirectBlockedIPv6 string
+	flag.StringVar(&redirectBlockedIPv6, "redirect-blocked-ipv6",
+		getEnvOrDefault("REDIRECT_BLOCKED_IPV6", ""),
+		"Comma-separated IPv6 CIDRs that block cert issuance when present in a domain's AAAA records (e.g. 2600:1901::/32).")
 	var controllersFlag string
 	flag.StringVar(&controllersFlag, "controllers", "*",
 		"Comma-separated list of controllers to enable. Use \"*\" to enable all. Valid values: "+
@@ -371,11 +376,31 @@ func main() {
 	}
 
 	if enabled(controller.DecoRedirectControllerName) {
+		var blockedIPv6CIDRs []*net.IPNet
+		for _, cidr := range strings.Split(redirectBlockedIPv6, ",") {
+			cidr = strings.TrimSpace(cidr)
+			if cidr == "" {
+				continue
+			}
+			_, ipNet, cidrErr := net.ParseCIDR(cidr)
+			if cidrErr != nil {
+				setupLog.Error(cidrErr, "invalid CIDR in --redirect-blocked-ipv6", "cidr", cidr)
+				os.Exit(1)
+			}
+			// The reconciler only compares these ranges against non-IPv4 answers, so an
+			// IPv4 CIDR could never match; fail fast instead of silently ignoring it.
+			if ipNet.IP.To4() != nil {
+				setupLog.Error(fmt.Errorf("%q is not an IPv6 network", cidr), "non-IPv6 CIDR in --redirect-blocked-ipv6", "cidr", cidr)
+				os.Exit(1)
+			}
+			blockedIPv6CIDRs = append(blockedIPv6CIDRs, ipNet)
+		}
 		if err = (&controller.DecoRedirectReconciler{
-			Client:        mgr.GetClient(),
-			Scheme:        mgr.GetScheme(),
-			IngressClass:  redirectIngressClass,
-			ClusterIssuer: redirectClusterIssuer,
+			Client:           mgr.GetClient(),
+			Scheme:           mgr.GetScheme(),
+			IngressClass:     redirectIngressClass,
+			ClusterIssuer:    redirectClusterIssuer,
+			BlockedIPv6CIDRs: blockedIPv6CIDRs,
 		}).SetupWithManager(mgr); err != nil {
 			setupLog.Error(err, "unable to create controller", "controller", "DecoRedirect")
 			os.Exit(1)
diff --git a/hack/helm-generator/main.go b/hack/helm-generator/main.go
index 56c7cb8..c322040 100644
--- a/hack/helm-generator/main.go
+++ b/hack/helm-generator/main.go
@@ -434,6 +434,9 @@ func addRedirectControllerArgs(templatesDir string) error {
         - --redirect-ingress-class={{ .Values.redirect.ingressClass }}
         - --redirect-cluster-issuer={{ .Values.redirect.clusterIssuer.name }}
         {{- end }}
+        {{- if .Values.redirect.blockedIPv6CIDRs }}
+        - --redirect-blocked-ipv6={{ join "," .Values.redirect.blockedIPv6CIDRs }}
+        {{- end }}
         {{- if .Values.redirect.namespace }}
         - --redirect-namespace={{ .Values.redirect.namespace }}
         {{- end }}`
diff --git a/internal/controller/decoredirect_controller.go b/internal/controller/decoredirect_controller.go
index 0238ca0..16a4482 100644
--- a/internal/controller/decoredirect_controller.go
+++ b/internal/controller/decoredirect_controller.go
@@ -4,6 +4,8 @@ import (
 	"context"
 	"crypto/sha256"
 	"fmt"
+	"net"
+	"net/http"
 	"strconv"
 	"strings"
 	"time"
@@ -30,6 +32,14 @@ type DecoRedirectReconciler struct {
 	Scheme        *runtime.Scheme
 	IngressClass  string // nginx ingress class name, e.g. "nginx"
 	ClusterIssuer string // cert-manager ClusterIssuer name, e.g. "letsencrypt"
+	// BlockedIPv6CIDRs is a list of IPv6 CIDR ranges that, if present in a domain's
+	// AAAA records, indicate DNS is not ready for cert issuance. Typically legacy
+	// infrastructure addresses that intercept Let's Encrypt validation incorrectly.
+	// When empty, no AAAA check is performed.
+	BlockedIPv6CIDRs []*net.IPNet
+	// DNSReadyFunc checks if the domain DNS is correctly pointing to the redirect infrastructure.
+	// Defaults to isDNSReady. Injectable for testing.
+	DNSReadyFunc func(ctx context.Context, domain string) bool
 }
 
 // dummyBackendName satisfies the k8s Ingress API requirement for a backend on every path.
@@ -51,6 +61,15 @@ func (r *DecoRedirectReconciler) Reconcile(ctx context.Context, req ctrl.Request
 		return ctrl.Result{}, client.IgnoreNotFound(err)
 	}
 
+	// Auto-heal: if Certificate is stuck in Failed backoff and DNS is now correct, delete it
+	// so reconcileCertificate recreates it fresh and cert-manager retries without backoff.
+	if healed, err := r.maybeHealCertificate(ctx, rd); err != nil {
+		log.Error(err, "failed to heal Certificate")
+		return ctrl.Result{}, err
+	} else if healed {
+		return ctrl.Result{RequeueAfter: 2 * time.Second}, nil
+	}
+
 	if err := r.reconcileCertificate(ctx, rd); err != nil {
 		log.Error(err, "failed to reconcile Certificate")
 		return ctrl.Result{}, err
@@ -86,6 +105,10 @@ func (r *DecoRedirectReconciler) reconcileCertificate(ctx context.Context, rd *d
 	}
 
 	_, err := controllerutil.CreateOrUpdate(ctx, r.Client, cert, func() error {
+		// Skip mutation while the object is being deleted — the Watch will re-trigger once gone.
+		if cert.DeletionTimestamp != nil {
+			return nil
+		}
 		cert.Spec.SecretName = tlsSecretName(rd.Spec.From)
 		cert.Spec.DNSNames = []string{rd.Spec.From}
 		cert.Spec.IssuerRef = cmmeta.ObjectReference{
@@ -191,6 +214,111 @@ func (r *DecoRedirectReconciler) updateStatus(ctx context.Context, rd *decosites
 	return certReady, r.Status().Patch(ctx, patch, client.MergeFrom(rd))
 }
 
+// maybeHealCertificate deletes a Certificate that is stuck in Failed backoff when DNS
+// is already pointing correctly to the Deco redirect infrastructure. Returning true
+// means the Certificate was deleted and the caller should requeue before recreating it.
+//
+// It returns (false, nil) when the Certificate does not exist, is already being deleted,
+// is not in the Failed state, or DNS is not ready yet. A NotFound error from the delete
+// itself is also reported as (false, nil).
+func (r *DecoRedirectReconciler) maybeHealCertificate(ctx context.Context, rd *decositesv1alpha1.DecoRedirect) (bool, error) {
+	log := logf.FromContext(ctx)
+
+	cert := &cmv1.Certificate{}
+	if err := r.Get(ctx, types.NamespacedName{Name: resourceName(rd.Spec.From), Namespace: rd.Namespace}, cert); err != nil {
+		return false, client.IgnoreNotFound(err)
+	}
+
+	// Skip if already being deleted or not in the Failed backoff state.
+	if cert.DeletionTimestamp != nil || !isCertFailed(cert) {
+		return false, nil
+	}
+
+	// DNSReadyFunc is an injection point; when it is nil the real probe is used.
+	dnsReady := r.DNSReadyFunc
+	if dnsReady == nil {
+		dnsReady = r.isDNSReady
+	}
+	if !dnsReady(ctx, rd.Spec.From) {
+		log.Info("certificate in Failed backoff but DNS not ready yet", "domain", rd.Spec.From)
+		return false, nil
+	}
+
+	log.Info("certificate in Failed backoff and DNS is ready — deleting to trigger retry", "domain", rd.Spec.From)
+	if err := r.Delete(ctx, cert); err != nil {
+		return false, client.IgnoreNotFound(err)
+	}
+	return true, nil
+}
+
+// isCertFailed reports whether the Certificate is stuck in cert-manager's exponential
+// backoff after a failed issuance attempt (Issuing=False, Reason=Failed). Only the first
+// Issuing condition found is consulted; a Certificate without one is not considered failed.
+func isCertFailed(cert *cmv1.Certificate) bool {
+	for _, c := range cert.Status.Conditions {
+		if c.Type == cmv1.CertificateConditionIssuing {
+			return c.Status == cmmeta.ConditionFalse && c.Reason == "Failed"
+		}
+	}
+	return false
+}
+
+// isDNSReady checks that the domain is correctly pointing to the redirect infrastructure:
+//  1. A plain HTTP GET to the domain is answered with the "X-Redirect-By: deco" header
+//     set by the redirect nginx. Redirects are not followed and the status code is not
+//     inspected — only the header is.
+//  2. No AAAA record falls within any BlockedIPv6CIDRs range, which would cause
+//     Let's Encrypt's IPv6 validation to reach the wrong server and fail the challenge.
+//     This lookup is skipped when BlockedIPv6CIDRs is empty.
+//
+// Any request or lookup error is reported as "not ready".
+func (r *DecoRedirectReconciler) isDNSReady(ctx context.Context, domain string) bool {
+	// Each network probe gets the same explicit bound so that neither a slow web server
+	// nor a slow resolver can hold the reconcile worker when ctx carries no deadline.
+	const probeTimeout = 5 * time.Second
+
+	httpClient := &http.Client{
+		CheckRedirect: func(*http.Request, []*http.Request) error { return http.ErrUseLastResponse },
+		Timeout:       probeTimeout,
+	}
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://"+domain+"/", nil)
+	if err != nil {
+		return false
+	}
+	resp, err := httpClient.Do(req)
+	if err != nil {
+		return false
+	}
+	// Only the headers matter; the body is never read.
+	_ = resp.Body.Close()
+	if resp.Header.Get("X-Redirect-By") != "deco" {
+		return false
+	}
+
+	if len(r.BlockedIPv6CIDRs) == 0 {
+		return true
+	}
+
+	lookupCtx, cancel := context.WithTimeout(ctx, probeTimeout)
+	defer cancel()
+	addrs, err := net.DefaultResolver.LookupIPAddr(lookupCtx, domain)
+	if err != nil {
+		return false
+	}
+	for _, a := range addrs {
+		// IPv4 answers (including IPv4-mapped ones) are irrelevant to the AAAA check.
+		if a.IP.To4() != nil {
+			continue
+		}
+		for _, blocked := range r.BlockedIPv6CIDRs {
+			if blocked.Contains(a.IP) {
+				return false
+			}
+		}
+	}
+	return true
+}
+
 // resourceName returns a deterministic k8s-safe name for a domain, capped at 253 chars.
// "client.com" → "redirect-client-com" func resourceName(domain string) string { diff --git a/internal/controller/decoredirect_controller_test.go b/internal/controller/decoredirect_controller_test.go index a73890a..a3846db 100644 --- a/internal/controller/decoredirect_controller_test.go +++ b/internal/controller/decoredirect_controller_test.go @@ -184,4 +184,161 @@ var _ = Describe("DecoRedirect Controller", func() { Expect(ing.Annotations["nginx.ingress.kubernetes.io/permanent-redirect-code"]).To(Equal("301")) }) }) + + Context("Auto-healing: maybeHealCertificate", func() { + const healNS = "default" + ctx := context.Background() + + newReconciler := func(dnsReady bool) *DecoRedirectReconciler { + return &DecoRedirectReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + IngressClass: "nginx", + ClusterIssuer: "letsencrypt", + DNSReadyFunc: func(_ context.Context, _ string) bool { return dnsReady }, + } + } + + // Each test uses a unique name to avoid state sharing between tests. + setup := func(suffix string) (nn, certNN types.NamespacedName, cleanup func()) { + name := "heal-" + suffix + domain := name + ".com" + nn = types.NamespacedName{Name: name + "-com", Namespace: healNS} + certNN = types.NamespacedName{Name: "redirect-" + name + "-com", Namespace: healNS} + + rd := &decositesv1alpha1.DecoRedirect{ + ObjectMeta: metav1.ObjectMeta{Name: name + "-com", Namespace: healNS}, + Spec: decositesv1alpha1.DecoRedirectSpec{ + From: domain, + To: "https://www." 
+ domain, + }, + } + Expect(k8sClient.Create(ctx, rd)).To(Succeed()) + + cleanup = func() { + r := &decositesv1alpha1.DecoRedirect{} + if err := k8sClient.Get(ctx, nn, r); err == nil { + _ = k8sClient.Delete(ctx, r) + } + c := &cmv1.Certificate{} + if err := k8sClient.Get(ctx, certNN, c); err == nil { + _ = k8sClient.Delete(ctx, c) + } + } + return nn, certNN, cleanup + } + + patchCertFailed := func(certNN types.NamespacedName) { + cert := &cmv1.Certificate{} + Expect(k8sClient.Get(ctx, certNN, cert)).To(Succeed()) + patch := cert.DeepCopy() + patch.Status.Conditions = []cmv1.CertificateCondition{ + {Type: cmv1.CertificateConditionReady, Status: "False", Reason: "DoesNotExist", Message: "secret not found", LastTransitionTime: &[]metav1.Time{metav1.Now()}[0]}, + {Type: cmv1.CertificateConditionIssuing, Status: "False", Reason: "Failed", Message: "cert request failed", LastTransitionTime: &[]metav1.Time{metav1.Now()}[0]}, + } + Expect(k8sClient.Status().Patch(ctx, patch, client.MergeFrom(cert))).To(Succeed()) + } + + It("should delete the Certificate when it is in Failed backoff and DNS is ready", func() { + nn, certNN, cleanup := setup("delete") + DeferCleanup(cleanup) + + _, err := newReconciler(true).Reconcile(ctx, reconcile.Request{NamespacedName: nn}) + Expect(err).NotTo(HaveOccurred()) + + patchCertFailed(certNN) + + rd := &decositesv1alpha1.DecoRedirect{} + Expect(k8sClient.Get(ctx, nn, rd)).To(Succeed()) + + healed, err := newReconciler(true).maybeHealCertificate(ctx, rd) + Expect(err).NotTo(HaveOccurred()) + Expect(healed).To(BeTrue()) + + cert := &cmv1.Certificate{} + Expect(k8sClient.Get(ctx, certNN, cert)).To(MatchError(ContainSubstring("not found"))) + }) + + It("should NOT delete the Certificate when DNS is not ready", func() { + nn, certNN, cleanup := setup("dns-wrong") + DeferCleanup(cleanup) + + _, err := newReconciler(false).Reconcile(ctx, reconcile.Request{NamespacedName: nn}) + Expect(err).NotTo(HaveOccurred()) + + patchCertFailed(certNN) + + rd 
:= &decositesv1alpha1.DecoRedirect{} + Expect(k8sClient.Get(ctx, nn, rd)).To(Succeed()) + + healed, err := newReconciler(false).maybeHealCertificate(ctx, rd) + Expect(err).NotTo(HaveOccurred()) + Expect(healed).To(BeFalse()) + + cert := &cmv1.Certificate{} + Expect(k8sClient.Get(ctx, certNN, cert)).To(Succeed()) + }) + + It("should NOT delete the Certificate when it is Issuing (actively trying)", func() { + nn, certNN, cleanup := setup("issuing") + DeferCleanup(cleanup) + + _, err := newReconciler(true).Reconcile(ctx, reconcile.Request{NamespacedName: nn}) + Expect(err).NotTo(HaveOccurred()) + + cert := &cmv1.Certificate{} + Expect(k8sClient.Get(ctx, certNN, cert)).To(Succeed()) + patch := cert.DeepCopy() + patch.Status.Conditions = []cmv1.CertificateCondition{ + {Type: cmv1.CertificateConditionIssuing, Status: "True", Reason: "Issuing", LastTransitionTime: &[]metav1.Time{metav1.Now()}[0]}, + } + Expect(k8sClient.Status().Patch(ctx, patch, client.MergeFrom(cert))).To(Succeed()) + + rd := &decositesv1alpha1.DecoRedirect{} + Expect(k8sClient.Get(ctx, nn, rd)).To(Succeed()) + + healed, err := newReconciler(true).maybeHealCertificate(ctx, rd) + Expect(err).NotTo(HaveOccurred()) + Expect(healed).To(BeFalse()) + + Expect(k8sClient.Get(ctx, certNN, cert)).To(Succeed()) + }) + + It("should NOT delete the Certificate when it is Ready", func() { + nn, certNN, cleanup := setup("ready") + DeferCleanup(cleanup) + + _, err := newReconciler(true).Reconcile(ctx, reconcile.Request{NamespacedName: nn}) + Expect(err).NotTo(HaveOccurred()) + + cert := &cmv1.Certificate{} + Expect(k8sClient.Get(ctx, certNN, cert)).To(Succeed()) + patch := cert.DeepCopy() + patch.Status.Conditions = []cmv1.CertificateCondition{ + {Type: cmv1.CertificateConditionReady, Status: "True", Reason: "Ready", LastTransitionTime: &[]metav1.Time{metav1.Now()}[0]}, + } + Expect(k8sClient.Status().Patch(ctx, patch, client.MergeFrom(cert))).To(Succeed()) + + rd := &decositesv1alpha1.DecoRedirect{} + 
Expect(k8sClient.Get(ctx, nn, rd)).To(Succeed()) + + healed, err := newReconciler(true).maybeHealCertificate(ctx, rd) + Expect(err).NotTo(HaveOccurred()) + Expect(healed).To(BeFalse()) + + Expect(k8sClient.Get(ctx, certNN, cert)).To(Succeed()) + }) + + It("should do nothing when the Certificate does not exist yet", func() { + nn, _, cleanup := setup("no-cert") + DeferCleanup(cleanup) + + rd := &decositesv1alpha1.DecoRedirect{} + Expect(k8sClient.Get(ctx, nn, rd)).To(Succeed()) + + healed, err := newReconciler(true).maybeHealCertificate(ctx, rd) + Expect(err).NotTo(HaveOccurred()) + Expect(healed).To(BeFalse()) + }) + }) })