Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions api/v1alpha1/flavor_group_capacity_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,6 @@ type FlavorGroupCapacityStatus struct {
// +kubebuilder:validation:Optional
TotalCapacity map[string]resource.Quantity `json:"totalCapacity,omitempty"`

// TotalInstances is the total number of VM instances running on hypervisors in this AZ,
// derived from Hypervisor CRD Status.Instances (not filtered by flavor group).
// +kubebuilder:validation:Optional
TotalInstances int64 `json:"totalInstances,omitempty"`

// LastReconcileAt is the timestamp of the last successful reconcile.
// +kubebuilder:validation:Optional
LastReconcileAt metav1.Time `json:"lastReconcileAt,omitempty"`
Expand All @@ -80,9 +75,10 @@ type FlavorGroupCapacityStatus struct {
// +kubebuilder:resource:scope=Cluster
// +kubebuilder:printcolumn:name="FlavorGroup",type="string",JSONPath=".spec.flavorGroup"
// +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone"
// +kubebuilder:printcolumn:name="TotalInstances",type="integer",JSONPath=".status.totalInstances"
// +kubebuilder:printcolumn:name="LastReconcile",type="date",JSONPath=".status.lastReconcileAt"
// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status"
// +kubebuilder:printcolumn:name="LastReconcile",type="date",JSONPath=".status.lastReconcileAt"
// +kubebuilder:printcolumn:name="Committed",type="integer",JSONPath=".status.committedCapacity",priority=1
// +kubebuilder:printcolumn:name="TotalMem",type="string",JSONPath=".status.totalCapacity.memory",priority=1

// FlavorGroupCapacity caches pre-computed capacity data for one flavor group in one AZ.
// One CRD exists per (flavor group × AZ) pair, updated by the capacity controller on a fixed interval.
Expand Down
2 changes: 2 additions & 0 deletions docs/reservations/committed-resource-reservations.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ The CR reservation implementation is located in `internal/scheduling/reservation
- `cortex_committed_resource_usage_api_*`
- `cortex_committed_resource_capacity_api_*`

**FlavorGroupCapacity readiness**: The `cortex_committed_resource_capacity_ready{flavor_group, az}` gauge is `1` when the CRD's `Ready` condition is `True` (all scheduler probes succeeded) and `0` otherwise — that is, when the condition is `False` (one or more probes failed, stale data is being served), `Unknown`, or not yet set. The `CortexNovaCommittedResourceCapacityNotReady` alert fires after the gauge has been `0` for 10 minutes. When the alert fires, the capacity API is serving stale total capacity without usage data for that (flavor group × AZ) pair — Limes receives capacity but no usage. Check the capacity controller logs for probe errors and verify the scheduler is reachable.

## Lifecycle Management

The system is organized around two CRD types and two controllers. `CommittedResource` CRDs represent customer commitments; `Reservation` CRDs represent individual hypervisor capacity slots. Each has its own controller with a well-defined responsibility boundary.
Expand Down
22 changes: 22 additions & 0 deletions helm/bundles/cortex-nova/templates/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,28 @@ spec:
This may mean hypervisors in that AZ are fully utilized for the corresponding
flavor group and no further committed resources can be placed there.

- alert: CortexNovaCommittedResourceCapacityNotReady
expr: |
cortex_committed_resource_capacity_ready{service="cortex-nova-metrics"} == 0
for: 10m
labels:
context: committed-resource-capacity
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
playbook: docs/support/playbook/cortex/alerts/committed-resource-capacity
annotations:
summary: "FlavorGroupCapacity for {{ "{{" }} $labels.flavor_group {{ "}}" }} in {{ "{{" }} $labels.az {{ "}}" }} has been not-ready for >10 minutes"
description: >
The FlavorGroupCapacity CRD for flavor group {{ "{{" }} $labels.flavor_group {{ "}}" }}
in availability zone {{ "{{" }} $labels.az {{ "}}" }} has had Ready=False for more than
10 minutes. The capacity controller failed to complete all scheduler probes for this
(flavor group x AZ) pair. The capacity API (report-capacity) is serving stale total
capacity values for this group without usage data — Limes receives capacity but no
usage, causing silent staleness. Investigate the capacity controller logs for probe
errors and check scheduler availability.

# Committed Resource Usage API
- alert: CortexNovaCommittedResourceUsageErrors
expr: |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,19 @@ spec:
- jsonPath: .spec.availabilityZone
name: AZ
type: string
- jsonPath: .status.totalInstances
name: TotalInstances
type: integer
- jsonPath: .status.conditions[?(@.type=='Ready')].status
name: Ready
type: string
- jsonPath: .status.lastReconcileAt
name: LastReconcile
type: date
- jsonPath: .status.conditions[?(@.type=='Ready')].status
name: Ready
- jsonPath: .status.committedCapacity
name: Committed
priority: 1
type: integer
- jsonPath: .status.totalCapacity.memory
name: TotalMem
priority: 1
type: string
name: v1alpha1
schema:
Expand Down Expand Up @@ -184,12 +189,6 @@ spec:
description: TotalCapacity is the total capacity of all eligible hosts
in an empty-datacenter scenario.
type: object
totalInstances:
description: |-
TotalInstances is the total number of VM instances running on hypervisors in this AZ,
derived from Hypervisor CRD Status.Instances (not filtered by flavor group).
format: int64
type: integer
type: object
required:
- spec
Expand Down
19 changes: 2 additions & 17 deletions internal/scheduling/reservations/capacity/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ func (c *Controller) reconcileAll(ctx context.Context) error {
var succeeded, failed int
for groupName, groupData := range flavorGroups {
for _, az := range azs {
if err := c.reconcileOne(ctx, groupName, groupData, az, hvByName, hvList.Items, blockedByReservations); err != nil {
if err := c.reconcileOne(ctx, groupName, groupData, az, hvByName, blockedByReservations); err != nil {
logger.Error(err, "failed to reconcile flavor group capacity",
"flavorGroup", groupName, "az", az)
failed++
Expand Down Expand Up @@ -125,7 +125,6 @@ func (c *Controller) reconcileOne(
groupData compute.FlavorGroupFeature,
az string,
hvByName map[string]hv1.Hypervisor,
allHVs []hv1.Hypervisor,
blockedByReservations map[string]int64,
) error {

Expand Down Expand Up @@ -189,8 +188,7 @@ func (c *Controller) reconcileOne(
newFlavors = append(newFlavors, cur)
}

// Count total instances and committed capacity (always available regardless of probe results).
totalInstances := countInstancesInAZ(allHVs, az)
// Count committed capacity (always available regardless of probe results).
committedCapacity, committedErr := c.sumCommittedCapacity(ctx, groupName, az, smallestFlavorBytes)
if committedErr != nil {
LoggerFromContext(ctx).Error(committedErr, "failed to sum committed capacity",
Expand Down Expand Up @@ -238,7 +236,6 @@ func (c *Controller) reconcileOne(

patch := client.MergeFrom(existing.DeepCopy())
existing.Status.Flavors = newFlavors
existing.Status.TotalInstances = totalInstances
existing.Status.CommittedCapacity = committedCapacity
existing.Status.TotalCapacity = totalCapacity
existing.Status.LastReconcileAt = metav1.Now()
Expand Down Expand Up @@ -427,18 +424,6 @@ func availabilityZones(hvs []hv1.Hypervisor) []string {
return azs
}

// countInstancesInAZ sums the number of entries in Status.Instances over every
// hypervisor whose "topology.kubernetes.io/zone" label equals az. Hypervisors
// whose label value differs from az contribute nothing to the result.
func countInstancesInAZ(hvs []hv1.Hypervisor, az string) int64 {
	const zoneLabel = "topology.kubernetes.io/zone"

	count := int64(0)
	for i := range hvs {
		// Only hypervisors located in the requested zone are counted.
		if zone := hvs[i].Labels[zoneLabel]; zone == az {
			count += int64(len(hvs[i].Status.Instances))
		}
	}
	return count
}

// crdNameFor produces a collision-safe DNS label for a (flavorGroup, az) pair.
// A 6-hex-char FNV-1a hash of the raw inputs is appended so that pairs differing only
// by characters that sanitise identically (e.g. "." vs "-") still get unique names.
Expand Down
36 changes: 9 additions & 27 deletions internal/scheduling/reservations/capacity/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ func newFlavorGroupKnowledge(t *testing.T, groupName string, smallestMemoryMB ui
}

// newHypervisor creates a Hypervisor CRD with a topology AZ label and effective capacity.
func newHypervisor(name, az string, memoryBytes int64, instanceIDs ...string) *hv1.Hypervisor {
func newHypervisor(name, az string, memoryBytes int64) *hv1.Hypervisor {
hv := &hv1.Hypervisor{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Expand All @@ -92,9 +92,6 @@ func newHypervisor(name, az string, memoryBytes int64, instanceIDs ...string) *h
hv1.ResourceMemory: *qty,
}
}
for _, id := range instanceIDs {
hv.Status.Instances = append(hv.Status.Instances, hv1.Instance{ID: id})
}
return hv
}

Expand Down Expand Up @@ -173,20 +170,8 @@ func TestAvailabilityZones(t *testing.T) {
}

// TestCountInstancesInAZ exercises countInstancesInAZ against three hypervisors
// spread over two availability zones and checks the per-zone instance totals,
// including a zone that has no hypervisors at all.
//
// NOTE(review): the trailing comment in this function states that the helper
// was deleted, yet the assertions above it still call it. This view appears to
// interleave removed and added diff lines — confirm which version is current.
func TestCountInstancesInAZ(t *testing.T) {
// Fixture: az-a hosts two hypervisors with 2 + 1 instances, az-b hosts one with 1.
hvs := []hv1.Hypervisor{
*newHypervisor("h1", "az-a", 0, "vm1", "vm2"),
*newHypervisor("h2", "az-a", 0, "vm3"),
*newHypervisor("h3", "az-b", 0, "vm4"),
}
// Instances of all hypervisors in the same zone are summed.
if got := countInstancesInAZ(hvs, "az-a"); got != 3 {
t.Errorf("countInstancesInAZ(az-a) = %d, want 3", got)
}
if got := countInstancesInAZ(hvs, "az-b"); got != 1 {
t.Errorf("countInstancesInAZ(az-b) = %d, want 1", got)
}
// A zone with no matching hypervisors yields zero.
if got := countInstancesInAZ(hvs, "az-c"); got != 0 {
t.Errorf("countInstancesInAZ(az-c) = %d, want 0", got)
}
// TestCountInstancesInAZ has been removed — countInstancesInAZ was deleted
// because hv1.Instance has no flavor name, making per-flavor-group filtering impossible.
}

// --- integration-style tests for reconcileOne ---
Expand All @@ -200,7 +185,7 @@ func TestReconcileOne_CreatesCRD(t *testing.T) {
)

scheme := newTestScheme(t)
hv := newHypervisor("host-1", az, memBytes, "vm1")
hv := newHypervisor("host-1", az, memBytes)
knowledge := newFlavorGroupKnowledge(t, groupName, memMB)

fakeClient := fake.NewClientBuilder().
Expand All @@ -226,7 +211,7 @@ func TestReconcileOne_CreatesCRD(t *testing.T) {
}
hvByName := map[string]hv1.Hypervisor{"host-1": *hv}

if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, []hv1.Hypervisor{*hv}, map[string]int64{}); err != nil {
if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, map[string]int64{}); err != nil {
t.Fatalf("reconcileOne failed: %v", err)
}

Expand All @@ -253,9 +238,6 @@ func TestReconcileOne_CreatesCRD(t *testing.T) {
if f.PlaceableHosts != 1 {
t.Errorf("PlaceableHosts = %d, want 1", f.PlaceableHosts)
}
if crd.Status.TotalInstances != 1 {
t.Errorf("TotalInstances = %d, want 1", crd.Status.TotalInstances)
}
}

func TestReconcileOne_SetsReadyConditionFalseOnSchedulerError(t *testing.T) {
Expand Down Expand Up @@ -293,7 +275,7 @@ func TestReconcileOne_SetsReadyConditionFalseOnSchedulerError(t *testing.T) {
}

// reconcileOne returns no error itself (it continues on probe failure), but sets Ready=False
if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, map[string]hv1.Hypervisor{}, []hv1.Hypervisor{}, map[string]int64{}); err != nil {
if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, map[string]hv1.Hypervisor{}, map[string]int64{}); err != nil {
t.Fatalf("reconcileOne failed: %v", err)
}

Expand Down Expand Up @@ -358,11 +340,11 @@ func TestReconcileOne_IdempotentUpdate(t *testing.T) {
hvByName := map[string]hv1.Hypervisor{"host-1": *hv}

// First call
if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, []hv1.Hypervisor{*hv}, map[string]int64{}); err != nil {
if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, map[string]int64{}); err != nil {
t.Fatalf("first reconcileOne failed: %v", err)
}
// Second call — should not error on the already-existing CRD
if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, []hv1.Hypervisor{*hv}, map[string]int64{}); err != nil {
if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, map[string]int64{}); err != nil {
t.Fatalf("second reconcileOne failed: %v", err)
}

Expand Down Expand Up @@ -576,7 +558,7 @@ func TestReconcileOne_ZeroMemoryFlavorReturnsError(t *testing.T) {
groupData := compute.FlavorGroupFeature{
SmallestFlavor: compute.FlavorInGroup{Name: "bad-flavor", MemoryMB: 0},
}
err := c.reconcileOne(context.Background(), "hana-v2", groupData, "az-a", nil, nil, nil)
err := c.reconcileOne(context.Background(), "hana-v2", groupData, "az-a", nil, nil)
if err == nil {
t.Error("expected error for zero-memory flavor")
}
Expand Down
18 changes: 18 additions & 0 deletions internal/scheduling/reservations/capacity/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/cobaltcore-dev/cortex/api/v1alpha1"
"github.com/prometheus/client_golang/prometheus"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)

Expand All @@ -26,6 +27,7 @@ type Monitor struct {
hostsEmpty *prometheus.GaugeVec
hostsPlaceable *prometheus.GaugeVec
committedCapacity *prometheus.GaugeVec
readyGauge *prometheus.GaugeVec
}

// NewMonitor creates a new Monitor that reads FlavorGroupCapacity CRDs.
Expand All @@ -52,6 +54,10 @@ func NewMonitor(c client.Client) Monitor {
Name: "cortex_committed_resource_committed_gib",
Help: "Sum of AcceptedAmount in GiB across Ready CommittedResource CRDs for this flavor group and AZ.",
}, capacityLabels),
readyGauge: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "cortex_committed_resource_capacity_ready",
Help: "1 if the FlavorGroupCapacity CRD is Ready (all scheduler probes succeeded), 0 otherwise.",
}, capacityLabels),
}
}

Expand All @@ -62,6 +68,7 @@ func (m *Monitor) Describe(ch chan<- *prometheus.Desc) {
m.hostsEmpty.Describe(ch)
m.hostsPlaceable.Describe(ch)
m.committedCapacity.Describe(ch)
m.readyGauge.Describe(ch)
}

// Collect implements prometheus.Collector — lists all FlavorGroupCapacity CRDs and exports gauges.
Expand All @@ -80,6 +87,7 @@ func (m *Monitor) Collect(ch chan<- prometheus.Metric) {
m.hostsEmpty.Reset()
m.hostsPlaceable.Reset()
m.committedCapacity.Reset()
m.readyGauge.Reset()

for _, crd := range list.Items {
groupAZLabels := prometheus.Labels{
Expand All @@ -88,6 +96,15 @@ func (m *Monitor) Collect(ch chan<- prometheus.Metric) {
}
m.committedCapacity.With(groupAZLabels).Set(float64(crd.Status.CommittedCapacity))

readyVal := 0.0
for _, cond := range crd.Status.Conditions {
if cond.Type == v1alpha1.FlavorGroupCapacityConditionReady && cond.Status == metav1.ConditionTrue {
readyVal = 1.0
break
}
}
m.readyGauge.With(groupAZLabels).Set(readyVal)

for _, f := range crd.Status.Flavors {
flavorLabels := prometheus.Labels{
"flavor_group": crd.Spec.FlavorGroup,
Expand All @@ -106,4 +123,5 @@ func (m *Monitor) Collect(ch chan<- prometheus.Metric) {
m.hostsEmpty.Collect(ch)
m.hostsPlaceable.Collect(ch)
m.committedCapacity.Collect(ch)
m.readyGauge.Collect(ch)
}