From 7a14ff5ccd75ead8958e20431fedab911dab7476 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=80=B8=E5=87=A1?= Date: Fri, 12 Jun 2026 15:58:49 +0800 Subject: [PATCH 1/3] feat: Add workerpool type #0 --- .go-version | 1 + docs/api-guide.md | 45 +++ docs/architecture.md | 7 +- internal/controllers/workerpool_apply.go | 132 ++++++++ .../workerpool_apply_config_test.go | 102 ++++++ internal/controllers/workerpool_apply_test.go | 81 +++++ internal/controllers/workerpool_controller.go | 64 ++-- .../controllers/workerpool_controller_test.go | 171 ++++++++++ .../generated/ate.dev_workerpools.yaml | 315 ++++++++++++++++++ pkg/api/v1alpha1/workerpool_types.go | 39 +++ .../v1alpha1/workerpool_validation_test.go | 29 ++ pkg/api/v1alpha1/zz_generated.deepcopy.go | 47 ++- 12 files changed, 1000 insertions(+), 33 deletions(-) create mode 100644 .go-version create mode 100644 internal/controllers/workerpool_apply.go create mode 100644 internal/controllers/workerpool_apply_config_test.go create mode 100644 internal/controllers/workerpool_apply_test.go diff --git a/.go-version b/.go-version new file mode 100644 index 000000000..f8f738140 --- /dev/null +++ b/.go-version @@ -0,0 +1 @@ +1.26.3 diff --git a/docs/api-guide.md b/docs/api-guide.md index fe609e3ad..e755350f3 100644 --- a/docs/api-guide.md +++ b/docs/api-guide.md @@ -12,6 +12,17 @@ The `WorkerPool` defines the pool of physical "warm" compute capacity. It manage | :--- | :--- | :--- | | `replicas` | `int32` | **Required.** Number of physical standby pods to maintain in the cluster. | | `ateomImage` | `string` | **Required.** The container image for the `ateom` herder process (e.g. `ko://github.com/agent-substrate/substrate/cmd/ateom-gvisor`). | +| `template` | `WorkerPoolPodTemplate` | **Optional.** Pod scheduling and resource settings for worker pods. | + +#### `WorkerPoolPodTemplate` (`spec.template`) + +| Field | Type | Pod mapping | +| :--- | :--- | :--- | +| `resources` | `ResourceRequirements` | `containers[name=ateom].resources` | +| `nodeSelector` | `map[string]string` | `spec.nodeSelector` | +| `tolerations` | `[]Toleration` | `spec.tolerations` (max 16) | +| `priorityClassName` | `string` | `spec.priorityClassName` | +| `nodeAffinity` | `NodeAffinity` | `spec.affinity.nodeAffinity` | ### Example @@ -26,6 +37,40 @@ spec: ateomImage: ko://github.com/agent-substrate/substrate/cmd/ateom-gvisor ``` +### Example with GPU node scheduling + +```yaml +apiVersion: ate.dev/v1alpha1 +kind: WorkerPool +metadata: + name: gpu-pool + namespace: ate-demo +spec: + replicas: 5 + ateomImage: ko://github.com/agent-substrate/substrate/cmd/ateom-gvisor + template: + resources: + requests: + cpu: "4" + memory: 16Gi + limits: + memory: 16Gi + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-tesla-t4 + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + priorityClassName: substrate-workers + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload + operator: In + values: [substrate] +``` + --- ## 2. ActorTemplate: The Workload Blueprint diff --git a/docs/architecture.md b/docs/architecture.md index 883a3c7ed..f29491979 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -223,9 +223,10 @@ These resources define the intended state of the system and are managed via Kubernetes CRD APIs. They are used for administrative operations and actor environment definitions. - * **WorkerPool**: Defines a pool of "warm" compute capacity. It specifies the - hardware shape (CPU, memory, accelerators), manages a fleet of standby - worker pods initialized and ready to receive resumed actor states. + * **WorkerPool**: Defines a pool of "warm" compute capacity. It manages a + fleet of standby worker pods initialized and ready to receive resumed actor + states. Optional `spec.template` fields configure worker pod resources, + node selection, tolerations, priority class, and node affinity. * **ActorTemplate**: An immutable definition of an actor-version. It encapsulates the container image, configuration, and environment required diff --git a/internal/controllers/workerpool_apply.go b/internal/controllers/workerpool_apply.go new file mode 100644 index 000000000..c1be5c215 --- /dev/null +++ b/internal/controllers/workerpool_apply.go @@ -0,0 +1,132 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package controllers + +import ( + corev1 "k8s.io/api/core/v1" + corev1ac "k8s.io/client-go/applyconfigurations/core/v1" + + atev1alpha1 "github.com/agent-substrate/substrate/pkg/api/v1alpha1" +) + +func applyWorkerPoolPodTemplate( + podSpecAC *corev1ac.PodSpecApplyConfiguration, + containerAC *corev1ac.ContainerApplyConfiguration, + tmpl *atev1alpha1.WorkerPoolPodTemplate, +) { + if tmpl == nil { + podSpecAC.WithNodeSelector(map[string]string{}). + WithTolerations() + return + } + + nodeSelector := tmpl.NodeSelector + if nodeSelector == nil { + nodeSelector = map[string]string{} + } + podSpecAC.WithNodeSelector(nodeSelector) + podSpecAC.WithTolerations(tolerationsToApply(tmpl.Tolerations)...) + + podSpecAC.WithPriorityClassName(tmpl.PriorityClassName) + + if tmpl.NodeAffinity != nil { + podSpecAC.WithAffinity(corev1ac.Affinity().WithNodeAffinity(nodeAffinityToApply(tmpl.NodeAffinity))) + } + + if tmpl.Resources != nil { + containerAC.WithResources(resourceRequirementsToApply(tmpl.Resources)) + } +} + +func resourceRequirementsToApply(r *corev1.ResourceRequirements) *corev1ac.ResourceRequirementsApplyConfiguration { + ac := corev1ac.ResourceRequirements() + if len(r.Limits) > 0 { + ac.WithLimits(r.Limits) + } + if len(r.Requests) > 0 { + ac.WithRequests(r.Requests) + } + return ac +} + +func tolerationsToApply(tolerations []corev1.Toleration) []*corev1ac.TolerationApplyConfiguration { + out := make([]*corev1ac.TolerationApplyConfiguration, 0, len(tolerations)) + for i := range tolerations { + t := &tolerations[i] + ac := corev1ac.Toleration() + if t.Key != "" { + ac.WithKey(t.Key) + } + if t.Operator != "" { + ac.WithOperator(t.Operator) + } + if t.Value != "" { + ac.WithValue(t.Value) + } + if t.Effect != "" { + ac.WithEffect(t.Effect) + } + if t.TolerationSeconds != nil { + ac.WithTolerationSeconds(*t.TolerationSeconds) + } + out = append(out, ac) + } + return out +} + +func nodeAffinityToApply(na *corev1.NodeAffinity) *corev1ac.NodeAffinityApplyConfiguration { + ac := corev1ac.NodeAffinity() + if na.RequiredDuringSchedulingIgnoredDuringExecution != nil { + ac.WithRequiredDuringSchedulingIgnoredDuringExecution(nodeSelectorToApply(na.RequiredDuringSchedulingIgnoredDuringExecution)) + } + for i := range na.PreferredDuringSchedulingIgnoredDuringExecution { + term := &na.PreferredDuringSchedulingIgnoredDuringExecution[i] + ac.WithPreferredDuringSchedulingIgnoredDuringExecution(preferredSchedulingTermToApply(term)) + } + return ac +} + +func nodeSelectorToApply(ns *corev1.NodeSelector) *corev1ac.NodeSelectorApplyConfiguration { + ac := corev1ac.NodeSelector() + for i := range ns.NodeSelectorTerms { + ac.WithNodeSelectorTerms(nodeSelectorTermToApply(&ns.NodeSelectorTerms[i])) + } + return ac +} + +func preferredSchedulingTermToApply(term *corev1.PreferredSchedulingTerm) *corev1ac.PreferredSchedulingTermApplyConfiguration { + return corev1ac.PreferredSchedulingTerm(). + WithWeight(term.Weight). + WithPreference(nodeSelectorTermToApply(&term.Preference)) +} + +func nodeSelectorTermToApply(term *corev1.NodeSelectorTerm) *corev1ac.NodeSelectorTermApplyConfiguration { + ac := corev1ac.NodeSelectorTerm() + for i := range term.MatchExpressions { + ac.WithMatchExpressions(nodeSelectorRequirementToApply(&term.MatchExpressions[i])) + } + for i := range term.MatchFields { + ac.WithMatchFields(nodeSelectorRequirementToApply(&term.MatchFields[i])) + } + return ac +} + +func nodeSelectorRequirementToApply(req *corev1.NodeSelectorRequirement) *corev1ac.NodeSelectorRequirementApplyConfiguration { + ac := corev1ac.NodeSelectorRequirement().WithKey(req.Key).WithOperator(req.Operator) + if len(req.Values) > 0 { + ac.WithValues(req.Values...) + } + return ac +} diff --git a/internal/controllers/workerpool_apply_config_test.go b/internal/controllers/workerpool_apply_config_test.go new file mode 100644 index 000000000..275fa1747 --- /dev/null +++ b/internal/controllers/workerpool_apply_config_test.go @@ -0,0 +1,102 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package controllers + +import ( + "encoding/json" + "testing" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + atev1alpha1 "github.com/agent-substrate/substrate/pkg/api/v1alpha1" +) + +func TestBuildDeploymentApplyConfigPodTemplate(t *testing.T) { + wp := &atev1alpha1.WorkerPool{ + ObjectMeta: metav1.ObjectMeta{Name: "pool", Namespace: "default", UID: "uid"}, + Spec: atev1alpha1.WorkerPoolSpec{ + Replicas: 2, + AteomImage: "ateom:v1", + Template: sampleWorkerPoolPodTemplate(), + }, + } + + depAC := buildDeploymentApplyConfig(wp) + data, err := json.Marshal(depAC) + if err != nil { + t.Fatalf("marshal: %v", err) + } + + var dep map[string]any + if err := json.Unmarshal(data, &dep); err != nil { + t.Fatalf("unmarshal: %v", err) + } + + spec := dep["spec"].(map[string]any) + template := spec["template"].(map[string]any) + podSpec := template["spec"].(map[string]any) + + nodeSelector := podSpec["nodeSelector"].(map[string]any) + if nodeSelector["workload"] != "substrate" { + t.Fatalf("unexpected nodeSelector: %v", nodeSelector) + } + if podSpec["priorityClassName"] != "substrate-workers" { + t.Fatalf("unexpected priorityClassName: %v", podSpec["priorityClassName"]) + } + + containers := podSpec["containers"].([]any) + container := containers[0].(map[string]any) + resources := container["resources"].(map[string]any) + requests := resources["requests"].(map[string]any) + if requests["cpu"] != "2" { + t.Fatalf("unexpected cpu request: %v", requests["cpu"]) + } +} + +func TestBuildDeploymentApplyConfigClearsNodeSelector(t *testing.T) { + wp := &atev1alpha1.WorkerPool{ + ObjectMeta: metav1.ObjectMeta{Name: "pool", Namespace: "default", UID: "uid"}, + Spec: atev1alpha1.WorkerPoolSpec{ + Replicas: 1, + AteomImage: "ateom:v1", + Template: &atev1alpha1.WorkerPoolPodTemplate{ + Resources: &corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + }, + }, + }, + }, + } + + depAC := buildDeploymentApplyConfig(wp) + data, err := json.Marshal(depAC) + if err != nil { + t.Fatalf("marshal: %v", err) + } + + var dep map[string]any + if err := json.Unmarshal(data, &dep); err != nil { + t.Fatalf("unmarshal: %v", err) + } + + podSpec := dep["spec"].(map[string]any)["template"].(map[string]any)["spec"].(map[string]any) + nodeSelector, ok := podSpec["nodeSelector"].(map[string]any) + if ok && len(nodeSelector) != 0 { + t.Fatalf("expected empty nodeSelector, got %v", nodeSelector) + } +} diff --git a/internal/controllers/workerpool_apply_test.go b/internal/controllers/workerpool_apply_test.go new file mode 100644 index 000000000..c60a47833 --- /dev/null +++ b/internal/controllers/workerpool_apply_test.go @@ -0,0 +1,81 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package controllers + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" +) + +func TestTolerationsToApply(t *testing.T) { + tolerations := []corev1.Toleration{{ + Key: "gpu", + Operator: corev1.TolerationOpEqual, + Value: "true", + Effect: corev1.TaintEffectNoSchedule, + TolerationSeconds: ptrInt64(300), + }} + + got := tolerationsToApply(tolerations) + if len(got) != 1 { + t.Fatalf("expected 1 toleration, got %d", len(got)) + } + if got[0].Key == nil || *got[0].Key != "gpu" { + t.Fatalf("unexpected key: %v", got[0].Key) + } + if got[0].TolerationSeconds == nil || *got[0].TolerationSeconds != 300 { + t.Fatalf("unexpected tolerationSeconds: %v", got[0].TolerationSeconds) + } +} + +func TestNodeAffinityToApply(t *testing.T) { + na := &corev1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ + NodeSelectorTerms: []corev1.NodeSelectorTerm{{ + MatchExpressions: []corev1.NodeSelectorRequirement{{ + Key: "zone", + Operator: corev1.NodeSelectorOpIn, + Values: []string{"us-west1-a"}, + }}, + }}, + }, + PreferredDuringSchedulingIgnoredDuringExecution: []corev1.PreferredSchedulingTerm{{ + Weight: 50, + Preference: corev1.NodeSelectorTerm{ + MatchExpressions: []corev1.NodeSelectorRequirement{{ + Key: "disk", + Operator: corev1.NodeSelectorOpIn, + Values: []string{"ssd"}, + }}, + }, + }}, + } + + got := nodeAffinityToApply(na) + if got.RequiredDuringSchedulingIgnoredDuringExecution == nil { + t.Fatal("expected required node selector") + } + if len(got.PreferredDuringSchedulingIgnoredDuringExecution) != 1 { + t.Fatalf("expected 1 preferred term, got %d", len(got.PreferredDuringSchedulingIgnoredDuringExecution)) + } + if got.PreferredDuringSchedulingIgnoredDuringExecution[0].Weight == nil || *got.PreferredDuringSchedulingIgnoredDuringExecution[0].Weight != 50 { + t.Fatalf("unexpected weight: %v", got.PreferredDuringSchedulingIgnoredDuringExecution[0].Weight) + } +} + +func ptrInt64(v int64) *int64 { + return &v +} diff --git a/internal/controllers/workerpool_controller.go b/internal/controllers/workerpool_controller.go index e72d2befd..176be71f0 100644 --- a/internal/controllers/workerpool_controller.go +++ b/internal/controllers/workerpool_controller.go @@ -120,6 +120,40 @@ func (r *WorkerPoolReconciler) syncStatus(ctx context.Context, wp *atev1alpha1.W // Deployment managed by a WorkerPool. Only fields owned by this controller // are declared here. func buildDeploymentApplyConfig(wp *atev1alpha1.WorkerPool) *appsv1ac.DeploymentApplyConfiguration { + containerAC := corev1ac.Container(). + WithName("ateom"). + WithImage(wp.Spec.AteomImage). + WithArgs( + "--pod-uid=$(POD_UID)", + ). + WithSecurityContext(corev1ac.SecurityContext(). + WithPrivileged(true). + WithRunAsUser(0). + WithRunAsGroup(0)). + WithEnv( + corev1ac.EnvVar(). + WithName("POD_UID"). + WithValueFrom(corev1ac.EnvVarSource(). + WithFieldRef(corev1ac.ObjectFieldSelector(). + WithFieldPath("metadata.uid"))), + ). + WithVolumeMounts(corev1ac.VolumeMount(). + WithName("run-ateom"). + WithMountPath(ateompath.BasePath)) + + podSpecAC := corev1ac.PodSpec(). + WithSecurityContext(corev1ac.PodSecurityContext(). + WithRunAsUser(0). + WithRunAsGroup(0)). + WithVolumes(corev1ac.Volume(). + WithName("run-ateom"). + WithHostPath(corev1ac.HostPathVolumeSource(). + WithPath(ateompath.BasePath). + WithType(corev1.HostPathDirectoryOrCreate))) + + applyWorkerPoolPodTemplate(podSpecAC, containerAC, wp.Spec.Template) + podSpecAC.WithContainers(containerAC) + return appsv1ac.Deployment(deploymentName(wp.Name), wp.Namespace). WithOwnerReferences(metav1ac.OwnerReference(). WithAPIVersion(atev1alpha1.GroupVersion.String()). @@ -136,35 +170,7 @@ func buildDeploymentApplyConfig(wp *atev1alpha1.WorkerPool) *appsv1ac.Deployment WithLabels(map[string]string{ "ate.dev/worker-pool": wp.Name, }). - WithSpec(corev1ac.PodSpec(). - WithContainers(corev1ac.Container(). - WithName("ateom"). - WithImage(wp.Spec.AteomImage). - WithArgs( - "--pod-uid=$(POD_UID)", - ). - WithSecurityContext(corev1ac.SecurityContext(). - WithPrivileged(true). - WithRunAsUser(0). - WithRunAsGroup(0)). - WithEnv( - corev1ac.EnvVar(). - WithName("POD_UID"). - WithValueFrom(corev1ac.EnvVarSource(). - WithFieldRef(corev1ac.ObjectFieldSelector(). - WithFieldPath("metadata.uid"))), - ). - WithVolumeMounts(corev1ac.VolumeMount(). - WithName("run-ateom"). - WithMountPath(ateompath.BasePath))). - WithSecurityContext(corev1ac.PodSecurityContext(). - WithRunAsUser(0). - WithRunAsGroup(0)). - WithVolumes(corev1ac.Volume(). - WithName("run-ateom"). - WithHostPath(corev1ac.HostPathVolumeSource(). - WithPath(ateompath.BasePath). - WithType(corev1.HostPathDirectoryOrCreate)))))) + WithSpec(podSpecAC))) } // SetupWithManager sets up the controller with the Manager. diff --git a/internal/controllers/workerpool_controller_test.go b/internal/controllers/workerpool_controller_test.go index 026d9eb52..32017ba0d 100644 --- a/internal/controllers/workerpool_controller_test.go +++ b/internal/controllers/workerpool_controller_test.go @@ -24,7 +24,9 @@ import ( "time" appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" k8errors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -341,6 +343,175 @@ func TestStatusReplicasPropagation(t *testing.T) { }) } +func sampleWorkerPoolPodTemplate() *atev1alpha1.WorkerPoolPodTemplate { + return &atev1alpha1.WorkerPoolPodTemplate{ + Resources: &corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("4Gi"), + }, + }, + NodeSelector: map[string]string{ + "workload": "substrate", + }, + Tolerations: []corev1.Toleration{{ + Key: "nvidia.com/gpu", + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoSchedule, + }}, + PriorityClassName: "substrate-workers", + NodeAffinity: &corev1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ + NodeSelectorTerms: []corev1.NodeSelectorTerm{{ + MatchExpressions: []corev1.NodeSelectorRequirement{{ + Key: "workload", + Operator: corev1.NodeSelectorOpIn, + Values: []string{"substrate"}, + }}, + }}, + }, + }, + } +} + +// TestWorkerPoolPodTemplatePropagation verifies that template fields propagate +// to the managed Deployment pod template. +func TestWorkerPoolPodTemplatePropagation(t *testing.T) { + wp := makeWorkerPool("test-template-propagate", "default", 1, "ateom:v1") + wp.Spec.Template = sampleWorkerPoolPodTemplate() + if err := k8sClient.Create(testCtx, wp); err != nil { + t.Fatalf("create WorkerPool: %v", err) + } + t.Cleanup(func() { k8sClient.Delete(testCtx, wp) }) //nolint:errcheck + + eventually(t, func(ctx context.Context) (bool, error) { + dep, err := getDeployment(ctx, wp) + if err != nil || len(dep.Spec.Template.Spec.Containers) == 0 { + return false, nil + } + podSpec := dep.Spec.Template.Spec + container := podSpec.Containers[0] + + if podSpec.NodeSelector["workload"] != "substrate" { + return false, nil + } + if len(podSpec.Tolerations) != 1 || podSpec.Tolerations[0].Key != "nvidia.com/gpu" { + return false, nil + } + if podSpec.PriorityClassName != "substrate-workers" { + return false, nil + } + if podSpec.Affinity == nil || podSpec.Affinity.NodeAffinity == nil { + return false, nil + } + if container.Resources.Requests.Cpu().Cmp(resource.MustParse("2")) != 0 { + return false, nil + } + return container.Resources.Requests.Memory().Cmp(resource.MustParse("4Gi")) == 0, nil + }) +} + +// TestWorkerPoolPodTemplateUpdate verifies that changing template fields on a +// WorkerPool propagates to the managed Deployment. +func TestWorkerPoolPodTemplateUpdate(t *testing.T) { + wp := makeWorkerPool("test-template-update", "default", 1, "ateom:v1") + wp.Spec.Template = sampleWorkerPoolPodTemplate() + if err := k8sClient.Create(testCtx, wp); err != nil { + t.Fatalf("create WorkerPool: %v", err) + } + t.Cleanup(func() { k8sClient.Delete(testCtx, wp) }) //nolint:errcheck + + eventually(t, func(ctx context.Context) (bool, error) { + dep, err := getDeployment(ctx, wp) + return err == nil && dep.Spec.Template.Spec.NodeSelector["workload"] == "substrate", nil + }) + + if err := k8sClient.Get(testCtx, types.NamespacedName{Name: wp.Name, Namespace: wp.Namespace}, wp); err != nil { + t.Fatalf("re-fetch WorkerPool: %v", err) + } + wp.Spec.Template.NodeSelector = map[string]string{"workload": "updated"} + wp.Spec.Template.Resources.Requests[corev1.ResourceCPU] = resource.MustParse("4") + if err := k8sClient.Update(testCtx, wp); err != nil { + t.Fatalf("update WorkerPool template: %v", err) + } + + eventually(t, func(ctx context.Context) (bool, error) { + dep, err := getDeployment(ctx, wp) + if err != nil || len(dep.Spec.Template.Spec.Containers) == 0 { + return false, nil + } + podSpec := dep.Spec.Template.Spec + return podSpec.NodeSelector["workload"] == "updated" && + podSpec.Containers[0].Resources.Requests.Cpu().Cmp(resource.MustParse("4")) == 0, nil + }) +} + +// TestWorkerPoolPodTemplateClear verifies that clearing template.nodeSelector +// removes it from the managed Deployment. +func TestWorkerPoolPodTemplateClear(t *testing.T) { + wp := makeWorkerPool("test-template-clear", "default", 1, "ateom:v1") + wp.Spec.Template = sampleWorkerPoolPodTemplate() + if err := k8sClient.Create(testCtx, wp); err != nil { + t.Fatalf("create WorkerPool: %v", err) + } + t.Cleanup(func() { k8sClient.Delete(testCtx, wp) }) //nolint:errcheck + + eventually(t, func(ctx context.Context) (bool, error) { + dep, err := getDeployment(ctx, wp) + return err == nil && dep.Spec.Template.Spec.NodeSelector["workload"] == "substrate", nil + }) + + if err := k8sClient.Get(testCtx, types.NamespacedName{Name: wp.Name, Namespace: wp.Namespace}, wp); err != nil { + t.Fatalf("re-fetch WorkerPool: %v", err) + } + wp.Spec.Template.NodeSelector = nil + if err := k8sClient.Update(testCtx, wp); err != nil { + t.Fatalf("clear WorkerPool nodeSelector: %v", err) + } + + eventually(t, func(ctx context.Context) (bool, error) { + dep, err := getDeployment(ctx, wp) + if err != nil { + return false, nil + } + return len(dep.Spec.Template.Spec.NodeSelector) == 0, nil + }) +} + +// TestSSARevertsOwnedPodTemplateFields verifies that if an external actor +// changes pod template fields owned by the workerpool-controller, the +// controller reverts them on the next reconcile. +func TestSSARevertsOwnedPodTemplateFields(t *testing.T) { + wp := makeWorkerPool("test-ssa-template", "default", 1, "ateom:v1") + wp.Spec.Template = sampleWorkerPoolPodTemplate() + if err := k8sClient.Create(testCtx, wp); err != nil { + t.Fatalf("create WorkerPool: %v", err) + } + t.Cleanup(func() { k8sClient.Delete(testCtx, wp) }) //nolint:errcheck + + eventually(t, func(ctx context.Context) (bool, error) { + dep, err := getDeployment(ctx, wp) + return err == nil && dep.Spec.Template.Spec.NodeSelector["workload"] == "substrate", nil + }) + + dep, err := getDeployment(testCtx, wp) + if err != nil { + t.Fatalf("get Deployment: %v", err) + } + dep.Spec.Template.Spec.NodeSelector = map[string]string{"workload": "rogue"} + if err := k8sClient.Update(testCtx, dep); err != nil { + t.Fatalf("rogue update: %v", err) + } + + eventually(t, func(ctx context.Context) (bool, error) { + dep, err := getDeployment(ctx, wp) + if err != nil { + return false, nil + } + return dep.Spec.Template.Spec.NodeSelector["workload"] == "substrate", nil + }) +} + // TestReplicasValidationRejectsNegative verifies that the API server rejects a // WorkerPool whose spec.replicas is negative. func TestReplicasValidationRejectsNegative(t *testing.T) { diff --git a/manifests/ate-install/generated/ate.dev_workerpools.yaml b/manifests/ate-install/generated/ate.dev_workerpools.yaml index 634512b23..ff2e988a8 100644 --- a/manifests/ate-install/generated/ate.dev_workerpools.yaml +++ b/manifests/ate-install/generated/ate.dev_workerpools.yaml @@ -75,6 +75,321 @@ spec: format: int32 minimum: 0 type: integer + template: + description: Template holds optional pod scheduling and resource settings + for worker pods. + properties: + nodeAffinity: + description: |- + NodeAffinity scheduling rules for the worker pods. Mapped to + spec.affinity.nodeAffinity on the pod. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: |- + The scheduler will prefer to schedule pods to nodes that satisfy + the affinity expressions specified by this field, but it may choose + a node that violates one or more of the expressions. The node that is + most preferred is the one with the greatest sum of weights, i.e. + for each node that meets all of the scheduling requirements (resource + request, requiredDuringScheduling affinity expressions, etc.), + compute a sum by iterating through the elements of this field and adding + "weight" to the sum if the node matches the corresponding matchExpressions; the + node(s) with the highest sum are the most preferred. + items: + description: |- + An empty preferred scheduling term matches all objects with implicit weight 0 + (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op). + properties: + preference: + description: A node selector term, associated with the + corresponding weight. + properties: + matchExpressions: + description: A list of node selector requirements + by node's labels. + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchFields: + description: A list of node selector requirements + by node's fields. + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + type: object + x-kubernetes-map-type: atomic + weight: + description: Weight associated with matching the corresponding + nodeSelectorTerm, in the range 1-100. + format: int32 + type: integer + required: + - preference + - weight + type: object + type: array + x-kubernetes-list-type: atomic + requiredDuringSchedulingIgnoredDuringExecution: + description: |- + If the affinity requirements specified by this field are not met at + scheduling time, the pod will not be scheduled onto the node. + If the affinity requirements specified by this field cease to be met + at some point during pod execution (e.g. due to an update), the system + may or may not try to eventually evict the pod from its node. + properties: + nodeSelectorTerms: + description: Required. A list of node selector terms. + The terms are ORed. + items: + description: |- + A null or empty node selector term matches no objects. The requirements of + them are ANDed. + The TopologySelectorTerm type implements a subset of the NodeSelectorTerm. + properties: + matchExpressions: + description: A list of node selector requirements + by node's labels. + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchFields: + description: A list of node selector requirements + by node's fields. + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + type: object + x-kubernetes-map-type: atomic + type: array + x-kubernetes-list-type: atomic + required: + - nodeSelectorTerms + type: object + x-kubernetes-map-type: atomic + type: object + nodeSelector: + additionalProperties: + type: string + description: NodeSelector is a selector which must be true for + the pod to fit on a node. + type: object + priorityClassName: + description: PriorityClassName for the worker pods. + type: string + resources: + description: Resources for the ateom container. + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This field depends on the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + tolerations: + description: Tolerations for the worker pods. + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists, Equal, Lt, and Gt. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + Lt and Gt perform numeric comparisons (requires feature gate TaintTolerationComparisonOperators). + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + maxItems: 16 + type: array + x-kubernetes-list-type: atomic + type: object required: - ateomImage - replicas diff --git a/pkg/api/v1alpha1/workerpool_types.go b/pkg/api/v1alpha1/workerpool_types.go index 2350432b6..5e75b4207 100644 --- a/pkg/api/v1alpha1/workerpool_types.go +++ b/pkg/api/v1alpha1/workerpool_types.go @@ -15,9 +15,43 @@ package v1alpha1 import ( + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +// WorkerPoolPodTemplate defines optional scheduling and resource settings for +// worker pods. Resources apply to the fixed ateom container; nodeAffinity is +// mapped to spec.affinity.nodeAffinity on the pod. +type WorkerPoolPodTemplate struct { + // Resources for the ateom container. + // + // +optional + Resources *corev1.ResourceRequirements `json:"resources,omitempty"` + + // NodeSelector is a selector which must be true for the pod to fit on a node. + // + // +optional + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + + // Tolerations for the worker pods. + // + // +optional + // +kubebuilder:validation:MaxItems=16 + // +listType=atomic + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + + // PriorityClassName for the worker pods. + // + // +optional + PriorityClassName string `json:"priorityClassName,omitempty"` + + // NodeAffinity scheduling rules for the worker pods. Mapped to + // spec.affinity.nodeAffinity on the pod. + // + // +optional + NodeAffinity *corev1.NodeAffinity `json:"nodeAffinity,omitempty"` +} + type WorkerPoolSpec struct { // Replicas is the number of worker pods to run. // +required @@ -28,6 +62,11 @@ type WorkerPoolSpec struct { // +kubebuilder:validation:MinLength=1 // +required AteomImage string `json:"ateomImage"` + + // Template holds optional pod scheduling and resource settings for worker pods. + // + // +optional + Template *WorkerPoolPodTemplate `json:"template,omitempty"` } type WorkerPoolStatus struct { diff --git a/pkg/api/v1alpha1/workerpool_validation_test.go b/pkg/api/v1alpha1/workerpool_validation_test.go index bda091583..5c213bc1e 100644 --- a/pkg/api/v1alpha1/workerpool_validation_test.go +++ b/pkg/api/v1alpha1/workerpool_validation_test.go @@ -19,6 +19,7 @@ import ( "strings" "testing" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -59,6 +60,34 @@ func TestWorkerPoolValidation(t *testing.T) { }, wantErr: true, errMsg: "spec.ateomImage: Invalid value: \"\": spec.ateomImage in body should be at least 1 chars long", + }, { + name: "valid template", + mutate: func(wp *WorkerPool) { + wp.Spec.Template = &WorkerPoolPodTemplate{ + NodeSelector: map[string]string{"workload": "substrate"}, + Tolerations: []corev1.Toleration{{ + Key: "gpu", + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoSchedule, + }}, + } + }, + wantErr: false, + }, { + name: "too many tolerations", + mutate: func(wp *WorkerPool) { + tolerations := make([]corev1.Toleration, 17) + for i := range tolerations { + tolerations[i] = corev1.Toleration{ + Key: "key", + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoSchedule, + } + } + wp.Spec.Template = &WorkerPoolPodTemplate{Tolerations: tolerations} + }, + wantErr: true, + errMsg: "spec.template.tolerations: Too many", }} for _, tt := range tests { diff --git a/pkg/api/v1alpha1/zz_generated.deepcopy.go b/pkg/api/v1alpha1/zz_generated.deepcopy.go index 4bbfe02b6..fd3f7b223 100644 --- a/pkg/api/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/api/v1alpha1/zz_generated.deepcopy.go @@ -19,6 +19,7 @@ package v1alpha1 import ( + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" ) @@ -318,7 +319,7 @@ func (in *WorkerPool) DeepCopyInto(out *WorkerPool) { *out = *in out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - out.Spec = in.Spec + in.Spec.DeepCopyInto(&out.Spec) out.Status = in.Status } @@ -372,9 +373,53 @@ func (in *WorkerPoolList) DeepCopyObject() runtime.Object { return nil } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *WorkerPoolPodTemplate) DeepCopyInto(out *WorkerPoolPodTemplate) { + *out = *in + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = new(corev1.ResourceRequirements) + (*in).DeepCopyInto(*out) + } + if in.NodeSelector != nil { + in, out := &in.NodeSelector, &out.NodeSelector + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.Tolerations != nil { + in, out := &in.Tolerations, &out.Tolerations + *out = make([]corev1.Toleration, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.NodeAffinity != nil { + in, out := &in.NodeAffinity, &out.NodeAffinity + *out = new(corev1.NodeAffinity) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkerPoolPodTemplate. +func (in *WorkerPoolPodTemplate) DeepCopy() *WorkerPoolPodTemplate { + if in == nil { + return nil + } + out := new(WorkerPoolPodTemplate) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *WorkerPoolSpec) DeepCopyInto(out *WorkerPoolSpec) { *out = *in + if in.Template != nil { + in, out := &in.Template, &out.Template + *out = new(WorkerPoolPodTemplate) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkerPoolSpec. From 01b9acf655d70e9c5a15728bc177640bcfcd11c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=80=B8=E5=87=A1?= Date: Fri, 12 Jun 2026 15:59:10 +0800 Subject: [PATCH 2/3] feat: Add workerpool type #1 --- .go-version | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .go-version diff --git a/.go-version b/.go-version deleted file mode 100644 index f8f738140..000000000 --- a/.go-version +++ /dev/null @@ -1 +0,0 @@ -1.26.3 From 54764647746a7ca51deecef2fc8b0cb7aab7fd53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=80=B8=E5=87=A1?= Date: Fri, 12 Jun 2026 17:30:34 +0800 Subject: [PATCH 3/3] feat: Add workerpool type #2 --- docs/api-guide.md | 2 +- internal/controllers/workerpool_apply.go | 27 ++++++---- .../workerpool_apply_config_test.go | 2 +- .../controllers/workerpool_controller_test.go | 51 ++++++++++++++++++- .../generated/ate.dev_workerpools.yaml | 46 +++-------------- pkg/api/v1alpha1/workerpool_types.go | 19 ++++++- pkg/api/v1alpha1/zz_generated.deepcopy.go | 31 ++++++++++- 7 files changed, 123 insertions(+), 55 deletions(-) diff --git a/docs/api-guide.md b/docs/api-guide.md index e755350f3..4a7627360 100644 --- a/docs/api-guide.md +++ b/docs/api-guide.md @@ -18,7 +18,7 @@ The `WorkerPool` defines the pool of physical "warm" compute capacity. It manage | Field | Type | Pod mapping | | :--- | :--- | :--- | -| `resources` | `ResourceRequirements` | `containers[name=ateom].resources` | +| `resources` | `WorkerPoolResourceRequirements` | `containers[name=ateom].resources` (`requests` and `limits` only) | | `nodeSelector` | `map[string]string` | `spec.nodeSelector` | | `tolerations` | `[]Toleration` | `spec.tolerations` (max 16) | | `priorityClassName` | `string` | `spec.priorityClassName` | diff --git a/internal/controllers/workerpool_apply.go b/internal/controllers/workerpool_apply.go index c1be5c215..9bcb905aa 100644 --- a/internal/controllers/workerpool_apply.go +++ b/internal/controllers/workerpool_apply.go @@ -26,19 +26,20 @@ func applyWorkerPoolPodTemplate( containerAC *corev1ac.ContainerApplyConfiguration, tmpl *atev1alpha1.WorkerPoolPodTemplate, ) { + podSpecAC.NodeSelector = map[string]string{} + podSpecAC.Tolerations = []corev1ac.TolerationApplyConfiguration{} + podSpecAC.WithPriorityClassName("") + podSpecAC.WithAffinity(corev1ac.Affinity()) + containerAC.WithResources(corev1ac.ResourceRequirements()) + if tmpl == nil { - podSpecAC.WithNodeSelector(map[string]string{}). - WithTolerations() return } - nodeSelector := tmpl.NodeSelector - if nodeSelector == nil { - nodeSelector = map[string]string{} + if tmpl.NodeSelector != nil { + podSpecAC.WithNodeSelector(tmpl.NodeSelector) } - podSpecAC.WithNodeSelector(nodeSelector) - podSpecAC.WithTolerations(tolerationsToApply(tmpl.Tolerations)...) - + podSpecAC.Tolerations = tolerationApplyValues(tolerationsToApply(tmpl.Tolerations)) podSpecAC.WithPriorityClassName(tmpl.PriorityClassName) if tmpl.NodeAffinity != nil { @@ -50,7 +51,7 @@ func applyWorkerPoolPodTemplate( } } -func resourceRequirementsToApply(r *corev1.ResourceRequirements) *corev1ac.ResourceRequirementsApplyConfiguration { +func resourceRequirementsToApply(r *atev1alpha1.WorkerPoolResourceRequirements) *corev1ac.ResourceRequirementsApplyConfiguration { ac := corev1ac.ResourceRequirements() if len(r.Limits) > 0 { ac.WithLimits(r.Limits) @@ -61,6 +62,14 @@ func resourceRequirementsToApply(r *corev1.ResourceRequirements) *corev1ac.Resou return ac } +func tolerationApplyValues(tolerations []*corev1ac.TolerationApplyConfiguration) []corev1ac.TolerationApplyConfiguration { + out := make([]corev1ac.TolerationApplyConfiguration, 0, len(tolerations)) + for _, toleration := range tolerations { + out = append(out, *toleration) + } + return out +} + func tolerationsToApply(tolerations []corev1.Toleration) []*corev1ac.TolerationApplyConfiguration { out := make([]*corev1ac.TolerationApplyConfiguration, 0, len(tolerations)) for i := range tolerations { diff --git a/internal/controllers/workerpool_apply_config_test.go b/internal/controllers/workerpool_apply_config_test.go index 275fa1747..b8414a00d 100644 --- a/internal/controllers/workerpool_apply_config_test.go +++ b/internal/controllers/workerpool_apply_config_test.go @@ -74,7 +74,7 @@ func TestBuildDeploymentApplyConfigClearsNodeSelector(t *testing.T) { Replicas: 1, AteomImage: "ateom:v1", Template: &atev1alpha1.WorkerPoolPodTemplate{ - Resources: &corev1.ResourceRequirements{ + Resources: &atev1alpha1.WorkerPoolResourceRequirements{ Requests: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("1"), }, diff --git a/internal/controllers/workerpool_controller_test.go b/internal/controllers/workerpool_controller_test.go index 32017ba0d..5363c84bf 100644 --- a/internal/controllers/workerpool_controller_test.go +++ b/internal/controllers/workerpool_controller_test.go @@ -345,7 +345,7 @@ func TestStatusReplicasPropagation(t *testing.T) { func sampleWorkerPoolPodTemplate() *atev1alpha1.WorkerPoolPodTemplate { return &atev1alpha1.WorkerPoolPodTemplate{ - Resources: &corev1.ResourceRequirements{ + Resources: &atev1alpha1.WorkerPoolResourceRequirements{ Requests: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("2"), corev1.ResourceMemory: resource.MustParse("4Gi"), @@ -478,6 +478,55 @@ func TestWorkerPoolPodTemplateClear(t *testing.T) { }) } +// TestWorkerPoolPodTemplateClearAll verifies that removing spec.template clears +// all pod template fields owned by the workerpool-controller. +func TestWorkerPoolPodTemplateClearAll(t *testing.T) { + wp := makeWorkerPool("test-template-clear-all", "default", 1, "ateom:v1") + wp.Spec.Template = sampleWorkerPoolPodTemplate() + if err := k8sClient.Create(testCtx, wp); err != nil { + t.Fatalf("create WorkerPool: %v", err) + } + t.Cleanup(func() { k8sClient.Delete(testCtx, wp) }) //nolint:errcheck + + eventually(t, func(ctx context.Context) (bool, error) { + dep, err := getDeployment(ctx, wp) + if err != nil || len(dep.Spec.Template.Spec.Containers) == 0 { + return false, nil + } + podSpec := dep.Spec.Template.Spec + container := podSpec.Containers[0] + return podSpec.NodeSelector["workload"] == "substrate" && + len(podSpec.Tolerations) == 1 && + podSpec.PriorityClassName == "substrate-workers" && + podSpec.Affinity != nil && + podSpec.Affinity.NodeAffinity != nil && + len(container.Resources.Requests) > 0, nil + }) + + if err := k8sClient.Get(testCtx, types.NamespacedName{Name: wp.Name, Namespace: wp.Namespace}, wp); err != nil { + t.Fatalf("re-fetch WorkerPool: %v", err) + } + wp.Spec.Template = nil + if err := k8sClient.Update(testCtx, wp); err != nil { + t.Fatalf("clear WorkerPool template: %v", err) + } + + eventually(t, func(ctx context.Context) (bool, error) { + dep, err := getDeployment(ctx, wp) + if err != nil || len(dep.Spec.Template.Spec.Containers) == 0 { + return false, nil + } + podSpec := dep.Spec.Template.Spec + container := podSpec.Containers[0] + return len(podSpec.NodeSelector) == 0 && + len(podSpec.Tolerations) == 0 && + podSpec.PriorityClassName == "" && + (podSpec.Affinity == nil || podSpec.Affinity.NodeAffinity == nil) && + len(container.Resources.Limits) == 0 && + len(container.Resources.Requests) == 0, nil + }) +} + // TestSSARevertsOwnedPodTemplateFields verifies that if an external actor // changes pod template fields owned by the workerpool-controller, the // controller reverts them on the next reconcile. diff --git a/manifests/ate-install/generated/ate.dev_workerpools.yaml b/manifests/ate-install/generated/ate.dev_workerpools.yaml index ff2e988a8..ac5b161a4 100644 --- a/manifests/ate-install/generated/ate.dev_workerpools.yaml +++ b/manifests/ate-install/generated/ate.dev_workerpools.yaml @@ -289,39 +289,9 @@ spec: description: PriorityClassName for the worker pods. type: string resources: - description: Resources for the ateom container. + description: Resources for the ateom container. Only requests + and limits are supported. properties: - claims: - description: |- - Claims lists the names of resources, defined in spec.resourceClaims, - that are used by this container. - - This field depends on the - DynamicResourceAllocation feature gate. - - This field is immutable. It can only be set for containers. - items: - description: ResourceClaim references one entry in PodSpec.ResourceClaims. - properties: - name: - description: |- - Name must match the name of one entry in pod.spec.resourceClaims of - the Pod where this field is used. It makes that resource available - inside a container. - type: string - request: - description: |- - Request is the name chosen for a request in the referenced claim. - If empty, everything from the claim is made available, otherwise - only the result of this request. - type: string - required: - - name - type: object - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map limits: additionalProperties: anyOf: @@ -329,9 +299,8 @@ spec: - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true - description: |- - Limits describes the maximum amount of compute resources allowed. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + description: Limits describes the maximum amount of compute + resources allowed. type: object requests: additionalProperties: @@ -340,11 +309,8 @@ spec: - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true - description: |- - Requests describes the minimum amount of compute resources required. - If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, - otherwise to an implementation-defined value. Requests cannot exceed Limits. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + description: Requests describes the minimum amount of compute + resources required. type: object type: object tolerations: diff --git a/pkg/api/v1alpha1/workerpool_types.go b/pkg/api/v1alpha1/workerpool_types.go index 5e75b4207..39d8239a9 100644 --- a/pkg/api/v1alpha1/workerpool_types.go +++ b/pkg/api/v1alpha1/workerpool_types.go @@ -19,14 +19,29 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +// WorkerPoolResourceRequirements defines compute resources for the fixed ateom +// container. It intentionally exposes only requests and limits; resource claims +// require pod-level ResourceClaim wiring and are outside WorkerPool's shorthand. +type WorkerPoolResourceRequirements struct { + // Limits describes the maximum amount of compute resources allowed. + // + // +optional + Limits corev1.ResourceList `json:"limits,omitempty"` + + // Requests describes the minimum amount of compute resources required. + // + // +optional + Requests corev1.ResourceList `json:"requests,omitempty"` +} + // WorkerPoolPodTemplate defines optional scheduling and resource settings for // worker pods. Resources apply to the fixed ateom container; nodeAffinity is // mapped to spec.affinity.nodeAffinity on the pod. type WorkerPoolPodTemplate struct { - // Resources for the ateom container. + // Resources for the ateom container. Only requests and limits are supported. // // +optional - Resources *corev1.ResourceRequirements `json:"resources,omitempty"` + Resources *WorkerPoolResourceRequirements `json:"resources,omitempty"` // NodeSelector is a selector which must be true for the pod to fit on a node. // diff --git a/pkg/api/v1alpha1/zz_generated.deepcopy.go b/pkg/api/v1alpha1/zz_generated.deepcopy.go index fd3f7b223..3299bb1a5 100644 --- a/pkg/api/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/api/v1alpha1/zz_generated.deepcopy.go @@ -378,7 +378,7 @@ func (in *WorkerPoolPodTemplate) DeepCopyInto(out *WorkerPoolPodTemplate) { *out = *in if in.Resources != nil { in, out := &in.Resources, &out.Resources - *out = new(corev1.ResourceRequirements) + *out = new(WorkerPoolResourceRequirements) (*in).DeepCopyInto(*out) } if in.NodeSelector != nil { @@ -412,6 +412,35 @@ func (in *WorkerPoolPodTemplate) DeepCopy() *WorkerPoolPodTemplate { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *WorkerPoolResourceRequirements) DeepCopyInto(out *WorkerPoolResourceRequirements) { + *out = *in + if in.Limits != nil { + in, out := &in.Limits, &out.Limits + *out = make(corev1.ResourceList, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } + if in.Requests != nil { + in, out := &in.Requests, &out.Requests + *out = make(corev1.ResourceList, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkerPoolResourceRequirements. +func (in *WorkerPoolResourceRequirements) DeepCopy() *WorkerPoolResourceRequirements { + if in == nil { + return nil + } + out := new(WorkerPoolResourceRequirements) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *WorkerPoolSpec) DeepCopyInto(out *WorkerPoolSpec) { *out = *in