From a8db3d4ac6355b9d8693094118457ae1c89b3dbd Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Mon, 15 Jun 2026 15:50:27 +0200 Subject: [PATCH 1/2] improve observability --- api/v1alpha1/flavor_group_capacity_types.go | 11 +++--- .../committed-resource-reservations.md | 2 ++ .../bundles/cortex-nova/templates/alerts.yaml | 22 ++++++++++++ .../cortex.cloud_flavorgroupcapacities.yaml | 21 ++++++----- .../reservations/capacity/controller.go | 19 ++-------- .../reservations/capacity/controller_test.go | 36 +++++-------------- .../reservations/capacity/metrics.go | 18 ++++++++++ 7 files changed, 67 insertions(+), 62 deletions(-) diff --git a/api/v1alpha1/flavor_group_capacity_types.go b/api/v1alpha1/flavor_group_capacity_types.go index 80596256e..c38f19fde 100644 --- a/api/v1alpha1/flavor_group_capacity_types.go +++ b/api/v1alpha1/flavor_group_capacity_types.go @@ -8,6 +8,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) + const ( // FlavorGroupCapacityConditionReady indicates the status data is up-to-date. FlavorGroupCapacityConditionReady = "Ready" @@ -61,11 +62,6 @@ type FlavorGroupCapacityStatus struct { // +kubebuilder:validation:Optional TotalCapacity map[string]resource.Quantity `json:"totalCapacity,omitempty"` - // TotalInstances is the total number of VM instances running on hypervisors in this AZ, - // derived from Hypervisor CRD Status.Instances (not filtered by flavor group). - // +kubebuilder:validation:Optional - TotalInstances int64 `json:"totalInstances,omitempty"` - // LastReconcileAt is the timestamp of the last successful reconcile. 
// +kubebuilder:validation:Optional LastReconcileAt metav1.Time `json:"lastReconcileAt,omitempty"` @@ -80,9 +76,10 @@ type FlavorGroupCapacityStatus struct { // +kubebuilder:resource:scope=Cluster // +kubebuilder:printcolumn:name="FlavorGroup",type="string",JSONPath=".spec.flavorGroup" // +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone" -// +kubebuilder:printcolumn:name="TotalInstances",type="integer",JSONPath=".status.totalInstances" -// +kubebuilder:printcolumn:name="LastReconcile",type="date",JSONPath=".status.lastReconcileAt" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" +// +kubebuilder:printcolumn:name="LastReconcile",type="date",JSONPath=".status.lastReconcileAt" +// +kubebuilder:printcolumn:name="Committed",type="integer",JSONPath=".status.committedCapacity",priority=1 +// +kubebuilder:printcolumn:name="TotalMem",type="string",JSONPath=".status.totalCapacity.memory",priority=1 // FlavorGroupCapacity caches pre-computed capacity data for one flavor group in one AZ. // One CRD exists per (flavor group × AZ) pair, updated by the capacity controller on a fixed interval. diff --git a/docs/reservations/committed-resource-reservations.md b/docs/reservations/committed-resource-reservations.md index 7d80064b0..e0ce56e60 100644 --- a/docs/reservations/committed-resource-reservations.md +++ b/docs/reservations/committed-resource-reservations.md @@ -40,6 +40,8 @@ The CR reservation implementation is located in `internal/scheduling/reservation - `cortex_committed_resource_usage_api_*` - `cortex_committed_resource_capacity_api_*` +**FlavorGroupCapacity readiness**: The `cortex_committed_resource_capacity_ready{flavor_group, az}` gauge is `1` when the CRD's `Ready` condition is `True` (all scheduler probes succeeded) and `0` when `False` (one or more probes failed, stale data is being served). 
The `CortexNovaCommittedResourceCapacityNotReady` alert fires after 10 minutes of `0`. When the alert fires, the capacity API is serving stale total capacity without usage data for that (flavor group × AZ) pair — Limes receives capacity but no usage. Check the capacity controller logs for probe errors and verify the scheduler is reachable. + ## Lifecycle Management The system is organized around two CRD types and two controllers. `CommittedResource` CRDs represent customer commitments; `Reservation` CRDs represent individual hypervisor capacity slots. Each has its own controller with a well-defined responsibility boundary. diff --git a/helm/bundles/cortex-nova/templates/alerts.yaml b/helm/bundles/cortex-nova/templates/alerts.yaml index 6f3fabef2..a4568be49 100644 --- a/helm/bundles/cortex-nova/templates/alerts.yaml +++ b/helm/bundles/cortex-nova/templates/alerts.yaml @@ -588,6 +588,28 @@ spec: This may mean hypervisors in that AZ are fully utilized for the corresponding flavor group and no further committed resources can be placed there. + - alert: CortexNovaCommittedResourceCapacityNotReady + expr: | + cortex_committed_resource_capacity_ready{service="cortex-nova-metrics"} == 0 + for: 10m + labels: + context: committed-resource-capacity + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-capacity + annotations: + summary: "FlavorGroupCapacity for {{ "{{" }} $labels.flavor_group {{ "}}" }} in {{ "{{" }} $labels.az {{ "}}" }} has been not-ready for >10 minutes" + description: > + The FlavorGroupCapacity CRD for flavor group {{ "{{" }} $labels.flavor_group {{ "}}" }} + in availability zone {{ "{{" }} $labels.az {{ "}}" }} has had Ready=False for more than + 10 minutes. The capacity controller failed to complete all scheduler probes for this + (flavor group x AZ) pair. 
The capacity API (report-capacity) is serving stale total + capacity values for this group without usage data — Limes receives capacity but no + usage, causing silent staleness. Investigate the capacity controller logs for probe + errors and check scheduler availability. + # Committed Resource Usage API - alert: CortexNovaCommittedResourceUsageErrors expr: | diff --git a/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml b/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml index 73a009ba4..c8d499028 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml @@ -21,14 +21,19 @@ spec: - jsonPath: .spec.availabilityZone name: AZ type: string - - jsonPath: .status.totalInstances - name: TotalInstances - type: integer + - jsonPath: .status.conditions[?(@.type=='Ready')].status + name: Ready + type: string - jsonPath: .status.lastReconcileAt name: LastReconcile type: date - - jsonPath: .status.conditions[?(@.type=='Ready')].status - name: Ready + - jsonPath: .status.committedCapacity + name: Committed + priority: 1 + type: integer + - jsonPath: .status.totalCapacity.memory + name: TotalMem + priority: 1 type: string name: v1alpha1 schema: @@ -184,12 +189,6 @@ spec: description: TotalCapacity is the total capacity of all eligible hosts in an empty-datacenter scenario. type: object - totalInstances: - description: |- - TotalInstances is the total number of VM instances running on hypervisors in this AZ, - derived from Hypervisor CRD Status.Instances (not filtered by flavor group). 
- format: int64 - type: integer type: object required: - spec diff --git a/internal/scheduling/reservations/capacity/controller.go b/internal/scheduling/reservations/capacity/controller.go index a42d6c240..6db97128b 100644 --- a/internal/scheduling/reservations/capacity/controller.go +++ b/internal/scheduling/reservations/capacity/controller.go @@ -97,7 +97,7 @@ func (c *Controller) reconcileAll(ctx context.Context) error { var succeeded, failed int for groupName, groupData := range flavorGroups { for _, az := range azs { - if err := c.reconcileOne(ctx, groupName, groupData, az, hvByName, hvList.Items, blockedByReservations); err != nil { + if err := c.reconcileOne(ctx, groupName, groupData, az, hvByName, blockedByReservations); err != nil { logger.Error(err, "failed to reconcile flavor group capacity", "flavorGroup", groupName, "az", az) failed++ @@ -125,7 +125,6 @@ func (c *Controller) reconcileOne( groupData compute.FlavorGroupFeature, az string, hvByName map[string]hv1.Hypervisor, - allHVs []hv1.Hypervisor, blockedByReservations map[string]int64, ) error { @@ -189,8 +188,7 @@ func (c *Controller) reconcileOne( newFlavors = append(newFlavors, cur) } - // Count total instances and committed capacity (always available regardless of probe results). - totalInstances := countInstancesInAZ(allHVs, az) + // Count committed capacity (always available regardless of probe results). 
committedCapacity, committedErr := c.sumCommittedCapacity(ctx, groupName, az, smallestFlavorBytes) if committedErr != nil { LoggerFromContext(ctx).Error(committedErr, "failed to sum committed capacity", @@ -238,7 +236,6 @@ func (c *Controller) reconcileOne( patch := client.MergeFrom(existing.DeepCopy()) existing.Status.Flavors = newFlavors - existing.Status.TotalInstances = totalInstances existing.Status.CommittedCapacity = committedCapacity existing.Status.TotalCapacity = totalCapacity existing.Status.LastReconcileAt = metav1.Now() @@ -427,18 +424,6 @@ func availabilityZones(hvs []hv1.Hypervisor) []string { return azs } -// countInstancesInAZ counts total VM instances across all hypervisors in the given AZ. -func countInstancesInAZ(hvs []hv1.Hypervisor, az string) int64 { - var total int64 - for _, hv := range hvs { - if hv.Labels["topology.kubernetes.io/zone"] != az { - continue - } - total += int64(len(hv.Status.Instances)) - } - return total -} - // crdNameFor produces a collision-safe DNS label for a (flavorGroup, az) pair. // A 6-hex-char FNV-1a hash of the raw inputs is appended so that pairs differing only // by characters that sanitise identically (e.g. "." vs "-") still get unique names. diff --git a/internal/scheduling/reservations/capacity/controller_test.go b/internal/scheduling/reservations/capacity/controller_test.go index 8e25ff644..ed2c717b8 100644 --- a/internal/scheduling/reservations/capacity/controller_test.go +++ b/internal/scheduling/reservations/capacity/controller_test.go @@ -79,7 +79,7 @@ func newFlavorGroupKnowledge(t *testing.T, groupName string, smallestMemoryMB ui } // newHypervisor creates a Hypervisor CRD with a topology AZ label and effective capacity. 
-func newHypervisor(name, az string, memoryBytes int64, instanceIDs ...string) *hv1.Hypervisor { +func newHypervisor(name, az string, memoryBytes int64) *hv1.Hypervisor { hv := &hv1.Hypervisor{ ObjectMeta: metav1.ObjectMeta{ Name: name, @@ -92,9 +92,6 @@ func newHypervisor(name, az string, memoryBytes int64, instanceIDs ...string) *h hv1.ResourceMemory: *qty, } } - for _, id := range instanceIDs { - hv.Status.Instances = append(hv.Status.Instances, hv1.Instance{ID: id}) - } return hv } @@ -173,20 +170,8 @@ func TestAvailabilityZones(t *testing.T) { } func TestCountInstancesInAZ(t *testing.T) { - hvs := []hv1.Hypervisor{ - *newHypervisor("h1", "az-a", 0, "vm1", "vm2"), - *newHypervisor("h2", "az-a", 0, "vm3"), - *newHypervisor("h3", "az-b", 0, "vm4"), - } - if got := countInstancesInAZ(hvs, "az-a"); got != 3 { - t.Errorf("countInstancesInAZ(az-a) = %d, want 3", got) - } - if got := countInstancesInAZ(hvs, "az-b"); got != 1 { - t.Errorf("countInstancesInAZ(az-b) = %d, want 1", got) - } - if got := countInstancesInAZ(hvs, "az-c"); got != 0 { - t.Errorf("countInstancesInAZ(az-c) = %d, want 0", got) - } + // Intentionally left empty: countInstancesInAZ was deleted because hv1.Instance + // has no flavor name, making per-flavor-group filtering impossible. TODO: remove this stub. } // --- integration-style tests for reconcileOne --- @@ -200,7 +185,7 @@ func TestReconcileOne_CreatesCRD(t *testing.T) { ) scheme := newTestScheme(t) - hv := newHypervisor("host-1", az, memBytes, "vm1") + hv := newHypervisor("host-1", az, memBytes) knowledge := newFlavorGroupKnowledge(t, groupName, memMB) fakeClient := fake.NewClientBuilder(). 
@@ -226,7 +211,7 @@ func TestReconcileOne_CreatesCRD(t *testing.T) { } hvByName := map[string]hv1.Hypervisor{"host-1": *hv} - if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, []hv1.Hypervisor{*hv}, map[string]int64{}); err != nil { + if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, map[string]int64{}); err != nil { t.Fatalf("reconcileOne failed: %v", err) } @@ -253,9 +238,6 @@ func TestReconcileOne_CreatesCRD(t *testing.T) { if f.PlaceableHosts != 1 { t.Errorf("PlaceableHosts = %d, want 1", f.PlaceableHosts) } - if crd.Status.TotalInstances != 1 { - t.Errorf("TotalInstances = %d, want 1", crd.Status.TotalInstances) - } } func TestReconcileOne_SetsReadyConditionFalseOnSchedulerError(t *testing.T) { @@ -293,7 +275,7 @@ func TestReconcileOne_SetsReadyConditionFalseOnSchedulerError(t *testing.T) { } // reconcileOne returns no error itself (it continues on probe failure), but sets Ready=False - if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, map[string]hv1.Hypervisor{}, []hv1.Hypervisor{}, map[string]int64{}); err != nil { + if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, map[string]hv1.Hypervisor{}, map[string]int64{}); err != nil { t.Fatalf("reconcileOne failed: %v", err) } @@ -358,11 +340,11 @@ func TestReconcileOne_IdempotentUpdate(t *testing.T) { hvByName := map[string]hv1.Hypervisor{"host-1": *hv} // First call - if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, []hv1.Hypervisor{*hv}, map[string]int64{}); err != nil { + if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, map[string]int64{}); err != nil { t.Fatalf("first reconcileOne failed: %v", err) } // Second call — should not error on the already-existing CRD - if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, []hv1.Hypervisor{*hv}, map[string]int64{}); err != nil { + if err := 
ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, map[string]int64{}); err != nil { t.Fatalf("second reconcileOne failed: %v", err) } @@ -576,7 +558,7 @@ func TestReconcileOne_ZeroMemoryFlavorReturnsError(t *testing.T) { groupData := compute.FlavorGroupFeature{ SmallestFlavor: compute.FlavorInGroup{Name: "bad-flavor", MemoryMB: 0}, } - err := c.reconcileOne(context.Background(), "hana-v2", groupData, "az-a", nil, nil, nil) + err := c.reconcileOne(context.Background(), "hana-v2", groupData, "az-a", nil, nil) if err == nil { t.Error("expected error for zero-memory flavor") } diff --git a/internal/scheduling/reservations/capacity/metrics.go b/internal/scheduling/reservations/capacity/metrics.go index f282cc9d0..fcaf9fc04 100644 --- a/internal/scheduling/reservations/capacity/metrics.go +++ b/internal/scheduling/reservations/capacity/metrics.go @@ -9,6 +9,7 @@ import ( "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/prometheus/client_golang/prometheus" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -26,6 +27,7 @@ type Monitor struct { hostsEmpty *prometheus.GaugeVec hostsPlaceable *prometheus.GaugeVec committedCapacity *prometheus.GaugeVec + readyGauge *prometheus.GaugeVec } // NewMonitor creates a new Monitor that reads FlavorGroupCapacity CRDs. 
@@ -52,6 +54,10 @@ func NewMonitor(c client.Client) Monitor { Name: "cortex_committed_resource_committed_gib", Help: "Sum of AcceptedAmount in GiB across Ready CommittedResource CRDs for this flavor group and AZ.", }, capacityLabels), + readyGauge: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_ready", + Help: "1 if the FlavorGroupCapacity CRD is Ready (all scheduler probes succeeded), 0 otherwise.", + }, capacityLabels), } } @@ -62,6 +68,7 @@ func (m *Monitor) Describe(ch chan<- *prometheus.Desc) { m.hostsEmpty.Describe(ch) m.hostsPlaceable.Describe(ch) m.committedCapacity.Describe(ch) + m.readyGauge.Describe(ch) } // Collect implements prometheus.Collector — lists all FlavorGroupCapacity CRDs and exports gauges. @@ -80,6 +87,7 @@ func (m *Monitor) Collect(ch chan<- prometheus.Metric) { m.hostsEmpty.Reset() m.hostsPlaceable.Reset() m.committedCapacity.Reset() + m.readyGauge.Reset() for _, crd := range list.Items { groupAZLabels := prometheus.Labels{ @@ -88,6 +96,15 @@ func (m *Monitor) Collect(ch chan<- prometheus.Metric) { } m.committedCapacity.With(groupAZLabels).Set(float64(crd.Status.CommittedCapacity)) + readyVal := 0.0 + for _, cond := range crd.Status.Conditions { + if cond.Type == v1alpha1.FlavorGroupCapacityConditionReady && cond.Status == metav1.ConditionTrue { + readyVal = 1.0 + break + } + } + m.readyGauge.With(groupAZLabels).Set(readyVal) + for _, f := range crd.Status.Flavors { flavorLabels := prometheus.Labels{ "flavor_group": crd.Spec.FlavorGroup, @@ -106,4 +123,5 @@ func (m *Monitor) Collect(ch chan<- prometheus.Metric) { m.hostsEmpty.Collect(ch) m.hostsPlaceable.Collect(ch) m.committedCapacity.Collect(ch) + m.readyGauge.Collect(ch) } From a823346211930914ba728ccc9494533be38c8cf1 Mon Sep 17 00:00:00 2001 From: Julius Clausnitzer Date: Mon, 15 Jun 2026 16:11:28 +0200 Subject: [PATCH 2/2] lint --- api/v1alpha1/flavor_group_capacity_types.go | 1 - 1 file changed, 1 deletion(-) diff --git 
a/api/v1alpha1/flavor_group_capacity_types.go b/api/v1alpha1/flavor_group_capacity_types.go index c38f19fde..7e4c465c8 100644 --- a/api/v1alpha1/flavor_group_capacity_types.go +++ b/api/v1alpha1/flavor_group_capacity_types.go @@ -8,7 +8,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) - const ( // FlavorGroupCapacityConditionReady indicates the status data is up-to-date. FlavorGroupCapacityConditionReady = "Ready"