From 89b42f709ee7368310c43da9122bd1b79b5336b2 Mon Sep 17 00:00:00 2001 From: "g.beausire" Date: Tue, 18 Jun 2024 14:52:06 +0200 Subject: [PATCH 1/2] Refactor budget validation function to return statuses Returning status allow return a bool and a string with reason without using a Tuple. This refactor is consistent with nodeDisruption internal constraints checks that return status directly. --- .../applicationdisruptionbudget_controller.go | 45 +++++++++++++++---- internal/controller/budget.go | 6 +-- internal/controller/budget_test.go | 25 ++++++++--- .../controller/nodedisruption_controller.go | 27 +++-------- .../nodedisruption_controller_test.go | 4 +- .../nodedisruptionbudget_controller.go | 26 +++++++++-- 6 files changed, 90 insertions(+), 43 deletions(-) diff --git a/internal/controller/applicationdisruptionbudget_controller.go b/internal/controller/applicationdisruptionbudget_controller.go index 7e70ec9..661e015 100644 --- a/internal/controller/applicationdisruptionbudget_controller.go +++ b/internal/controller/applicationdisruptionbudget_controller.go @@ -199,8 +199,21 @@ func (r *ApplicationDisruptionBudgetResolver) IsImpacted(disruptedNodes resolver } // Return the number of disruption allowed considering a list of current node disruptions -func (r *ApplicationDisruptionBudgetResolver) TolerateDisruption(_ resolver.NodeSet) bool { - return r.ApplicationDisruptionBudget.Status.DisruptionsAllowed-1 >= 0 +func (r *ApplicationDisruptionBudgetResolver) TryValidateDisruptionFromBudgetConstraints(_ resolver.NodeSet) nodedisruptionv1alpha1.DisruptedBudgetStatus { + if r.ApplicationDisruptionBudget.Status.DisruptionsAllowed-1 >= 0 { + return nodedisruptionv1alpha1.DisruptedBudgetStatus{ + Reference: r.GetNamespacedName(), + Reason: "", + Ok: true, + } + } else { + return nodedisruptionv1alpha1.DisruptedBudgetStatus{ + Reference: r.GetNamespacedName(), + Reason: fmt.Sprintf("Number of allowed disruption exceeded (Remaining allowed disruptions: %d, current disruptions: %d)", + r.ApplicationDisruptionBudget.Status.DisruptionsAllowed, r.ApplicationDisruptionBudget.Status.CurrentDisruptions), + Ok: false, + } + } } func (r *ApplicationDisruptionBudgetResolver) UpdateStatus(ctx context.Context) error { @@ -215,8 +228,24 @@ func (r *ApplicationDisruptionBudgetResolver) GetNamespacedName() nodedisruption } } +func (r *ApplicationDisruptionBudgetResolver) TryValidateDisruptionFromHealthHook(ctx context.Context, nd nodedisruptionv1alpha1.NodeDisruption) nodedisruptionv1alpha1.DisruptedBudgetStatus { + err := r.callHealthHook(ctx, nd) + if err != nil { + return nodedisruptionv1alpha1.DisruptedBudgetStatus{ + Reference: r.GetNamespacedName(), + Reason: fmt.Sprintf("Failed to validate with healthHook: %s", err), + Ok: false, + } + } + return nodedisruptionv1alpha1.DisruptedBudgetStatus{ + Reference: r.GetNamespacedName(), + Reason: "", + Ok: true, + } +} + // Call a lifecycle hook in order to synchronously validate a Node Disruption -func (r *ApplicationDisruptionBudgetResolver) CallHealthHook(ctx context.Context, nd nodedisruptionv1alpha1.NodeDisruption) error { +func (r *ApplicationDisruptionBudgetResolver) callHealthHook(ctx context.Context, nd nodedisruptionv1alpha1.NodeDisruption) error { if r.ApplicationDisruptionBudget.Spec.HealthHook.URL == "" { return nil } @@ -226,7 +255,7 @@ func (r *ApplicationDisruptionBudgetResolver) CallHealthHook(ctx context.Context data, err := json.Marshal(nd) if err != nil { - return err + return fmt.Errorf("controller error: Failed to serialize node disruption: %w", err) } namespacedName := r.GetNamespacedName() @@ -234,7 +263,7 @@ func (r *ApplicationDisruptionBudgetResolver) CallHealthHook(ctx context.Context req, err := http.NewRequestWithContext(ctx, http.MethodPost, r.ApplicationDisruptionBudget.Spec.HealthHook.URL, bytes.NewReader(data)) if err != nil { DisruptionBudgetCheckHealthHookErrorTotal.WithLabelValues(namespacedName.Namespace, namespacedName.Name, namespacedName.Kind).Inc() - return err + return fmt.Errorf("controller error: Error while building request: %w", err) } headers["Content-Type"] = []string{"application/json"} @@ -244,13 +273,13 @@ func (r *ApplicationDisruptionBudgetResolver) CallHealthHook(ctx context.Context resp, err := client.Do(req) if err != nil { DisruptionBudgetCheckHealthHookErrorTotal.WithLabelValues(namespacedName.Namespace, namespacedName.Name, namespacedName.Kind).Inc() - return err + return fmt.Errorf("error while performing request on healthHook: %w", err) } body, err := io.ReadAll(resp.Body) if err != nil { DisruptionBudgetCheckHealthHookErrorTotal.WithLabelValues(namespacedName.Namespace, namespacedName.Name, namespacedName.Kind).Inc() - return err + return fmt.Errorf("error while reading response fron healthHook: %w", err) } DisruptionBudgetCheckHealthHookStatusCodeTotal.WithLabelValues(namespacedName.Namespace, namespacedName.Name, namespacedName.Kind, strconv.Itoa(resp.StatusCode)).Inc() @@ -258,7 +287,7 @@ func (r *ApplicationDisruptionBudgetResolver) CallHealthHook(ctx context.Context if resp.StatusCode >= 200 && resp.StatusCode < 300 { return nil } - return fmt.Errorf("http server responded with non 2XX status code: %s", string(body)) + return fmt.Errorf("HealthHook responded with non 2XX status code: %s", string(body)) } func (r *ApplicationDisruptionBudgetResolver) GetSelectedNodes(ctx context.Context) (resolver.NodeSet, error) { diff --git a/internal/controller/budget.go b/internal/controller/budget.go index df7b2dc..104df01 100644 --- a/internal/controller/budget.go +++ b/internal/controller/budget.go @@ -14,10 +14,10 @@ type Budget interface { Sync(context.Context) error // Check if the budget would be impacted by an operation on the provided set of nodes IsImpacted(resolver.NodeSet) bool - // Return the number of disruption allowed considering a list of current node disruptions - TolerateDisruption(resolver.NodeSet) bool + // Return the status of this budget when trying to compute the status for it + TryValidateDisruptionFromBudgetConstraints(resolver.NodeSet) nodedisruptionv1alpha1.DisruptedBudgetStatus // Call a lifecycle hook in order to synchronously validate a Node Disruption - CallHealthHook(context.Context, nodedisruptionv1alpha1.NodeDisruption) error + TryValidateDisruptionFromHealthHook(context.Context, nodedisruptionv1alpha1.NodeDisruption) nodedisruptionv1alpha1.DisruptedBudgetStatus // Apply the budget's status to Kubernetes UpdateStatus(context.Context) error // Get the name, namespace and kind of bduget diff --git a/internal/controller/budget_test.go b/internal/controller/budget_test.go index ef0319a..1af9b1f 100644 --- a/internal/controller/budget_test.go +++ b/internal/controller/budget_test.go @@ -31,14 +31,30 @@ func (m *MockBudget) IsImpacted(resolver.NodeSet) bool { } // Return the number of disruption allowed considering a list of current node disruptions -func (m *MockBudget) TolerateDisruption(resolver.NodeSet) bool { - return m.tolerate +func (m *MockBudget) TryValidateDisruptionFromBudgetConstraints(resolver.NodeSet) nodedisruptionv1alpha1.DisruptedBudgetStatus { + return nodedisruptionv1alpha1.DisruptedBudgetStatus{ + Reference: nodedisruptionv1alpha1.NamespacedName{}, + Reason: "foo", + Ok: m.tolerate, + } } // Check health make a synchronous health check on the underlying resource of a budget -func (m *MockBudget) CallHealthHook(context.Context, nodedisruptionv1alpha1.NodeDisruption) error { +func (m *MockBudget) TryValidateDisruptionFromHealthHook(context.Context, nodedisruptionv1alpha1.NodeDisruption) nodedisruptionv1alpha1.DisruptedBudgetStatus { m.healthChecked = true - return m.health + if m.health != nil { + return nodedisruptionv1alpha1.DisruptedBudgetStatus{ + Reference: nodedisruptionv1alpha1.NamespacedName{}, + Reason: m.health.Error(), + Ok: false, + } + } else { + return nodedisruptionv1alpha1.DisruptedBudgetStatus{ + Reference: nodedisruptionv1alpha1.NamespacedName{}, + Reason: "", + Ok: true, + } + } } // Apply the budget's status to Kubernetes @@ -108,7 +124,6 @@ func TestValidationImpactedAllOk(t *testing.T) { } budgets := []controller.Budget{&budget1, &budget2} - anyFailed, statuses := reconciler.ValidateWithBudgetConstraints(context.Background(), budgets) assert.False(t, anyFailed) assert.Equal(t, len(statuses), 2) diff --git a/internal/controller/nodedisruption_controller.go b/internal/controller/nodedisruption_controller.go index 80e565e..c3ca88c 100644 --- a/internal/controller/nodedisruption_controller.go +++ b/internal/controller/nodedisruption_controller.go @@ -346,17 +346,12 @@ func (ndr *SingleNodeDisruptionReconciler) ValidateWithBudgetConstraints(ctx con continue } - if !budget.TolerateDisruption(disruptedNodes) { + status := budget.TryValidateDisruptionFromBudgetConstraints(disruptedNodes) + if !status.Ok { anyFailed = true - ref := budget.GetNamespacedName() - status := nodedisruptionv1alpha1.DisruptedBudgetStatus{ - Reference: ref, - Reason: "No more disruption allowed", - Ok: false, - } statuses = append(statuses, status) logger.Info("Disruption rejected because: ", "status", status) - DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc() + DisruptionBudgetRejectedTotal.WithLabelValues(status.Reference.Namespace, status.Reference.Name, status.Reference.Kind).Inc() break } impactedBudgets = append(impactedBudgets, budget) @@ -367,26 +362,16 @@ func (ndr *SingleNodeDisruptionReconciler) ValidateWithBudgetConstraints(ctx con } for _, budget := range impactedBudgets { - err := budget.CallHealthHook(ctx, ndr.NodeDisruption) + status := budget.TryValidateDisruptionFromHealthHook(ctx, ndr.NodeDisruption) ref := budget.GetNamespacedName() - if err != nil { + statuses = append(statuses, status) + if !status.Ok { anyFailed = true - status := nodedisruptionv1alpha1.DisruptedBudgetStatus{ - Reference: ref, - Reason: fmt.Sprintf("Unhealthy: %s", err), - Ok: false, - } - statuses = append(statuses, status) logger.Info("Disruption rejected because: ", "status", status) DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc() break } DisruptionBudgetGrantedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc() - statuses = append(statuses, nodedisruptionv1alpha1.DisruptedBudgetStatus{ - Reference: budget.GetNamespacedName(), - Reason: "", - Ok: true, - }) } return anyFailed, statuses diff --git a/internal/controller/nodedisruption_controller_test.go b/internal/controller/nodedisruption_controller_test.go index b582b39..92b66e6 100644 --- a/internal/controller/nodedisruption_controller_test.go +++ b/internal/controller/nodedisruption_controller_test.go @@ -528,7 +528,7 @@ var _ = Describe("NodeDisruption controller", func() { Name: "test", Kind: "NodeDisruptionBudget", }, - Reason: "No more disruption allowed", + Reason: "Number of allowed disrupted nodes exceeded (Remaining disruptions allowed: 0, Number of current node disrupted: 0, Requested nodes to disrupt: 2)", Ok: false, }} Expect(disruption.Status.DisruptedDisruptionBudgets).Should(Equal(expectedStatus)) @@ -553,7 +553,7 @@ var _ = Describe("NodeDisruption controller", func() { Name: "test", Kind: "NodeDisruptionBudget", }, - Reason: "No more disruption allowed", + Reason: "Number of allowed disrupted nodes exceeded (Remaining disruptions allowed: 0, Number of current node disrupted: 0, Requested nodes to disrupt: 2)", Ok: false, }, { Reference: nodedisruptionv1alpha1.NamespacedName{ diff --git a/internal/controller/nodedisruptionbudget_controller.go b/internal/controller/nodedisruptionbudget_controller.go index cb677bb..5a61a31 100644 --- a/internal/controller/nodedisruptionbudget_controller.go +++ b/internal/controller/nodedisruptionbudget_controller.go @@ -18,6 +18,7 @@ package controller import ( "context" + "fmt" "math" "reflect" @@ -190,15 +191,32 @@ func (r *NodeDisruptionBudgetResolver) IsImpacted(disruptedNodes resolver.NodeSe } // Return the number of disruption allowed considering a list of current node disruptions -func (r *NodeDisruptionBudgetResolver) TolerateDisruption(disruptedNodes resolver.NodeSet) bool { +func (r *NodeDisruptionBudgetResolver) TryValidateDisruptionFromBudgetConstraints(disruptedNodes resolver.NodeSet) nodedisruptionv1alpha1.DisruptedBudgetStatus { watchedNodes := resolver.NewNodeSetFromStringList(r.NodeDisruptionBudget.Status.WatchedNodes) disruptedNodesCount := watchedNodes.Intersection(disruptedNodes).Len() - return r.NodeDisruptionBudget.Status.DisruptionsAllowed-disruptedNodesCount >= 0 + if r.NodeDisruptionBudget.Status.DisruptionsAllowed-disruptedNodesCount >= 0 { + return nodedisruptionv1alpha1.DisruptedBudgetStatus{ + Reference: r.GetNamespacedName(), + Reason: "", + Ok: true, + } + } else { + return nodedisruptionv1alpha1.DisruptedBudgetStatus{ + Reference: r.GetNamespacedName(), + Reason: fmt.Sprintf("Number of allowed disrupted nodes exceeded (Remaining disruptions allowed: %d, Number of current node disrupted: %d, Requested nodes to disrupt: %d)", + r.NodeDisruptionBudget.Status.DisruptionsAllowed, r.NodeDisruptionBudget.Status.CurrentDisruptions, disruptedNodesCount), + Ok: false, + } + } } // Call a lifecycle hook in order to synchronously validate a Node Disruption -func (r *NodeDisruptionBudgetResolver) CallHealthHook(_ context.Context, _ nodedisruptionv1alpha1.NodeDisruption) error { - return nil +func (r *NodeDisruptionBudgetResolver) TryValidateDisruptionFromHealthHook(_ context.Context, _ nodedisruptionv1alpha1.NodeDisruption) nodedisruptionv1alpha1.DisruptedBudgetStatus { + return nodedisruptionv1alpha1.DisruptedBudgetStatus{ + Reference: r.GetNamespacedName(), + Reason: "", + Ok: true, + } } func (r *NodeDisruptionBudgetResolver) UpdateStatus(ctx context.Context) error { From b0923f26b18d8d4da68d578d9ea5f2e9c2718d9e Mon Sep 17 00:00:00 2001 From: "g.beausire" Date: Tue, 18 Jun 2024 16:29:34 +0200 Subject: [PATCH 2/2] Add support for freeze in budgets Provide a new API to freeze budgets and metrics to track its usage as API --- DOC.md | 82 ++++++++++ Makefile | 2 +- README.md | 6 +- .../applicationdisruptionbudget_types.go | 10 ++ api/v1alpha1/nodedisruptionbudget_types.go | 2 + api/v1alpha1/zz_generated.deepcopy.go | 17 +++ .../applicationdisruptionbudget-crd.yaml | 11 ++ chart/templates/nodedisruptionbudget-crd.yaml | 11 ++ ...iteo.com_applicationdisruptionbudgets.yaml | 11 ++ ...tion.criteo.com_nodedisruptionbudgets.yaml | 11 ++ .../applicationdisruptionbudget_controller.go | 14 ++ internal/controller/metrics.go | 7 + .../nodedisruption_controller_test.go | 141 ++++++++++++++++++ .../nodedisruptionbudget_controller.go | 13 ++ 14 files changed, 336 insertions(+), 2 deletions(-) diff --git a/DOC.md b/DOC.md index fc56e71..da1cc45 100644 --- a/DOC.md +++ b/DOC.md @@ -94,6 +94,13 @@ ApplicationDisruptionBudgetSpec defines the desired state of ApplicationDisrupti A NodeDisruption is allowed if at most "maxDisruptions" nodes selected by selectors are unavailable after the disruption.
true + + freeze + object + + Define the freeze status of the budget. Frozen budget reject all disruptions ignoring any other constraints
+ + false healthHook object @@ -121,6 +128,40 @@ Maintenance will proceed only if the endpoint responds 2XX.
+### ApplicationDisruptionBudget.spec.freeze +[↩ Parent](#applicationdisruptionbudgetspec) + + + +Define the freeze status of the budget. Frozen budget reject all disruptions ignoring any other constraints + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
enabledboolean + Freeze the budget to prevent any disruptions
+
false
reasonstring + Reason of the freeze
+
false
+ + ### ApplicationDisruptionBudget.spec.healthHook [↩ Parent](#applicationdisruptionbudgetspec) @@ -492,6 +533,13 @@ NodeDisruptionBudgetSpec defines the desired state of NodeDisruptionBudget A NodeDisruption is allowed if at most "minUndisruptedNodes" nodes selected by selectors are unavailable after the disruption.
true + + freeze + object + + Define the freeze status of the budget. Frozen budget reject all disruptions ignoring any other constraints
+ + false nodeSelector object @@ -503,6 +551,40 @@ NodeDisruptionBudgetSpec defines the desired state of NodeDisruptionBudget +### NodeDisruptionBudget.spec.freeze +[↩ Parent](#nodedisruptionbudgetspec) + + + +Define the freeze status of the budget. Frozen budget reject all disruptions ignoring any other constraints + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
enabledboolean + Freeze the budget to prevent any disruptions
+
false
reasonstring + Reason of the freeze
+
false
+ + ### NodeDisruptionBudget.spec.nodeSelector [↩ Parent](#nodedisruptionbudgetspec) diff --git a/Makefile b/Makefile index 5b6ce47..8181796 100644 --- a/Makefile +++ b/Makefile @@ -75,7 +75,7 @@ test: manifests generate fmt vet envtest lint ## Run tests. ##@ Build .PHONY: build -build: manifests generate fmt vet ## Build manager binary. +build: manifests generate fmt vet gen-doc ## Build manager binary. CGO_ENABLED=0 go build -o bin/manager cmd/main.go .PHONY: run diff --git a/README.md b/README.md index cdd010a..31881d0 100644 --- a/README.md +++ b/README.md @@ -162,7 +162,6 @@ In some cases, an application can be unhealthy even if all its pods are running. You can select Pods and/or PVCs. - ##### PVC selector The main reason of using a PVC selector is to ensure that node that contains data don't enter maintenance @@ -182,6 +181,11 @@ The hook will be called with a POST method containing the JSON encoded NodeDisru Note: It is not a replacement for readiness probes but a complement. +##### Freeze + +Budgets support freezing disruptions. By setting `spec.Freeze.Enabled`, the budget will reject all disruptions and give the reason specified in `spec.Freeze.Reason`. +It is equivalent to setting 0 as the max disruptions but it provide better messages. + #### Sample object ```yaml diff --git a/api/v1alpha1/applicationdisruptionbudget_types.go b/api/v1alpha1/applicationdisruptionbudget_types.go index 6d1a740..34bd40e 100644 --- a/api/v1alpha1/applicationdisruptionbudget_types.go +++ b/api/v1alpha1/applicationdisruptionbudget_types.go @@ -43,6 +43,16 @@ type ApplicationDisruptionBudgetSpec struct { // Maintenance will proceed only if the endpoint responds 2XX. // +kubebuilder:validation:Optional HealthHook HealthHookSpec `json:"healthHook,omitempty"` + // Define the freeze status of the budget. Frozen budget reject all disruptions ignoring any other constraints + Freeze FreezeSpec `json:"freeze,omitempty"` +} + +// FreezeSpec defines the freeze status of the budget +type FreezeSpec struct { + // Freeze the budget to prevent any disruptions + Enabled bool `json:"enabled,omitempty"` + // Reason of the freeze + Reason string `json:"reason,omitempty"` } type HealthHookSpec struct { diff --git a/api/v1alpha1/nodedisruptionbudget_types.go b/api/v1alpha1/nodedisruptionbudget_types.go index a22da6c..df1df78 100644 --- a/api/v1alpha1/nodedisruptionbudget_types.go +++ b/api/v1alpha1/nodedisruptionbudget_types.go @@ -37,6 +37,8 @@ type NodeDisruptionBudgetSpec struct { MinUndisruptedNodes int `json:"minUndisruptedNodes"` // NodeSelector query over pods whose nodes are managed by the disruption budget. NodeSelector metav1.LabelSelector `json:"nodeSelector,omitempty"` + // Define the freeze status of the budget. Frozen budget reject all disruptions ignoring any other constraints + Freeze FreezeSpec `json:"freeze,omitempty"` } //+kubebuilder:object:root=true diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 8ffcaf9..04327a4 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -89,6 +89,7 @@ func (in *ApplicationDisruptionBudgetSpec) DeepCopyInto(out *ApplicationDisrupti in.PodSelector.DeepCopyInto(&out.PodSelector) in.PVCSelector.DeepCopyInto(&out.PVCSelector) out.HealthHook = in.HealthHook + out.Freeze = in.Freeze } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ApplicationDisruptionBudgetSpec. @@ -157,6 +158,21 @@ func (in *DisruptionBudgetStatus) DeepCopy() *DisruptionBudgetStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FreezeSpec) DeepCopyInto(out *FreezeSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FreezeSpec. +func (in *FreezeSpec) DeepCopy() *FreezeSpec { + if in == nil { + return nil + } + out := new(FreezeSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *HealthHookSpec) DeepCopyInto(out *HealthHookSpec) { *out = *in @@ -277,6 +293,7 @@ func (in *NodeDisruptionBudgetList) DeepCopyObject() runtime.Object { func (in *NodeDisruptionBudgetSpec) DeepCopyInto(out *NodeDisruptionBudgetSpec) { *out = *in in.NodeSelector.DeepCopyInto(&out.NodeSelector) + out.Freeze = in.Freeze } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeDisruptionBudgetSpec. diff --git a/chart/templates/applicationdisruptionbudget-crd.yaml b/chart/templates/applicationdisruptionbudget-crd.yaml index 83d2c9d..724eda5 100644 --- a/chart/templates/applicationdisruptionbudget-crd.yaml +++ b/chart/templates/applicationdisruptionbudget-crd.yaml @@ -54,6 +54,17 @@ spec: description: ApplicationDisruptionBudgetSpec defines the desired state of ApplicationDisruptionBudget properties: + freeze: + description: Define the freeze status of the budget. Frozen budget reject + all disruptions ignoring any other constraints + properties: + enabled: + description: Freeze the budget to prevent any disruptions + type: boolean + reason: + description: Reason of the freeze + type: string + type: object healthHook: description: |- Define a optional hook to call when validating a NodeDisruption. diff --git a/chart/templates/nodedisruptionbudget-crd.yaml b/chart/templates/nodedisruptionbudget-crd.yaml index cb9612f..5306384 100644 --- a/chart/templates/nodedisruptionbudget-crd.yaml +++ b/chart/templates/nodedisruptionbudget-crd.yaml @@ -56,6 +56,17 @@ spec: spec: description: NodeDisruptionBudgetSpec defines the desired state of NodeDisruptionBudget properties: + freeze: + description: Define the freeze status of the budget. Frozen budget reject + all disruptions ignoring any other constraints + properties: + enabled: + description: Freeze the budget to prevent any disruptions + type: boolean + reason: + description: Reason of the freeze + type: string + type: object maxDisruptedNodes: description: A NodeDisruption is allowed if at most "maxDisruptedNodes" nodes selected by selectors are unavailable after the disruption. diff --git a/config/crd/bases/nodedisruption.criteo.com_applicationdisruptionbudgets.yaml b/config/crd/bases/nodedisruption.criteo.com_applicationdisruptionbudgets.yaml index bd2f461..dbbaf7c 100644 --- a/config/crd/bases/nodedisruption.criteo.com_applicationdisruptionbudgets.yaml +++ b/config/crd/bases/nodedisruption.criteo.com_applicationdisruptionbudgets.yaml @@ -53,6 +53,17 @@ spec: description: ApplicationDisruptionBudgetSpec defines the desired state of ApplicationDisruptionBudget properties: + freeze: + description: Define the freeze status of the budget. Frozen budget + reject all disruptions ignoring any other constraints + properties: + enabled: + description: Freeze the budget to prevent any disruptions + type: boolean + reason: + description: Reason of the freeze + type: string + type: object healthHook: description: |- Define a optional hook to call when validating a NodeDisruption. diff --git a/config/crd/bases/nodedisruption.criteo.com_nodedisruptionbudgets.yaml b/config/crd/bases/nodedisruption.criteo.com_nodedisruptionbudgets.yaml index b418220..d804c59 100644 --- a/config/crd/bases/nodedisruption.criteo.com_nodedisruptionbudgets.yaml +++ b/config/crd/bases/nodedisruption.criteo.com_nodedisruptionbudgets.yaml @@ -55,6 +55,17 @@ spec: spec: description: NodeDisruptionBudgetSpec defines the desired state of NodeDisruptionBudget properties: + freeze: + description: Define the freeze status of the budget. Frozen budget + reject all disruptions ignoring any other constraints + properties: + enabled: + description: Freeze the budget to prevent any disruptions + type: boolean + reason: + description: Reason of the freeze + type: string + type: object maxDisruptedNodes: description: A NodeDisruption is allowed if at most "maxDisruptedNodes" nodes selected by selectors are unavailable after the disruption. diff --git a/internal/controller/applicationdisruptionbudget_controller.go b/internal/controller/applicationdisruptionbudget_controller.go index 661e015..bf64e61 100644 --- a/internal/controller/applicationdisruptionbudget_controller.go +++ b/internal/controller/applicationdisruptionbudget_controller.go @@ -115,6 +115,12 @@ func PruneADBMetrics(ref nodedisruptionv1alpha1.NamespacedName) { // UpdateADBMetrics update metrics for an ADB func UpdateADBMetrics(ref nodedisruptionv1alpha1.NamespacedName, adb *nodedisruptionv1alpha1.ApplicationDisruptionBudget) { DisruptionBudgetMaxDisruptions.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(float64(adb.Spec.MaxDisruptions)) + if adb.Spec.Freeze.Enabled { + DisruptionBudgetFrozen.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(1) + } else { + DisruptionBudgetFrozen.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(0) + } + UpdateBudgetStatusMetrics(ref, adb.Status) } @@ -200,6 +206,14 @@ func (r *ApplicationDisruptionBudgetResolver) IsImpacted(disruptedNodes resolver // Return the number of disruption allowed considering a list of current node disruptions func (r *ApplicationDisruptionBudgetResolver) TryValidateDisruptionFromBudgetConstraints(_ resolver.NodeSet) nodedisruptionv1alpha1.DisruptedBudgetStatus { + if r.ApplicationDisruptionBudget.Spec.Freeze.Enabled { + return nodedisruptionv1alpha1.DisruptedBudgetStatus{ + Reference: r.GetNamespacedName(), + Reason: fmt.Sprintf("Budget frozen: %s", r.ApplicationDisruptionBudget.Spec.Freeze.Reason), + Ok: false, + } + } + if r.ApplicationDisruptionBudget.Status.DisruptionsAllowed-1 >= 0 { return nodedisruptionv1alpha1.DisruptedBudgetStatus{ Reference: r.GetNamespacedName(), diff --git a/internal/controller/metrics.go b/internal/controller/metrics.go index 80597be..cddd879 100644 --- a/internal/controller/metrics.go +++ b/internal/controller/metrics.go @@ -139,4 +139,11 @@ var ( }, []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "node_disruption_name"}, ) + DisruptionBudgetFrozen = promauto.With(metrics.Registry).NewGaugeVec( + prometheus.GaugeOpts{ + Name: METIC_PREFIX + "budget_disruption_frozen", + Help: "Budget frozen", + }, + []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, + ) ) diff --git a/internal/controller/nodedisruption_controller_test.go b/internal/controller/nodedisruption_controller_test.go index 92b66e6..5ea79a0 100644 --- a/internal/controller/nodedisruption_controller_test.go +++ b/internal/controller/nodedisruption_controller_test.go @@ -372,6 +372,147 @@ var _ = Describe("NodeDisruption controller", func() { }) }) + When("there is a node disruption budget that is frozen", func() { + It("rejects the node disruption", func() { + By("creating a budget that accepts everything") + ndb := &nodedisruptionv1alpha1.NodeDisruptionBudget{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "nodedisruption.criteo.com/v1alpha1", + Kind: "NodeDisruptionBudget", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: nodedisruptionv1alpha1.NodeDisruptionBudgetSpec{ + NodeSelector: metav1.LabelSelector{MatchLabels: nodeLabels1}, + MaxDisruptedNodes: 99, + Freeze: nodedisruptionv1alpha1.FreezeSpec{ + Enabled: false, + }, + }, + } + Expect(k8sClient.Create(ctx, ndb.DeepCopy())).Should(Succeed()) + + By("creating a new NodeDisruption") + disruption := &nodedisruptionv1alpha1.NodeDisruption{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "nodedisruption.criteo.com/v1alpha1", + Kind: "NodeDisruption", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: NDName, + Namespace: NDNamespace, + }, + Spec: nodedisruptionv1alpha1.NodeDisruptionSpec{ + NodeSelector: metav1.LabelSelector{MatchLabels: nodeLabels1}, + }, + } + Expect(k8sClient.Create(ctx, disruption.DeepCopy())).Should(Succeed()) + + By("checking the NodeDisruption is being granted") + createdDisruption := &nodedisruptionv1alpha1.NodeDisruption{} + + Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState { + err := k8sClient.Get(ctx, NDLookupKey, createdDisruption) + if err != nil { + panic("should be able to get") + } + return createdDisruption.Status.State + }, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Granted)) + + By("creating a budget that accepts everything") + Expect(k8sClient.Delete(ctx, ndb)).Should(Succeed()) + ndb.Spec.Freeze.Enabled = true + ndb.Spec.Freeze.Reason = "foobar" + Expect(k8sClient.Create(ctx, ndb)).Should(Succeed()) + + By("Re-creating a new NodeDisruption") + Expect(k8sClient.Delete(ctx, disruption)).Should(Succeed()) + Expect(k8sClient.Create(ctx, disruption.DeepCopy())).Should(Succeed()) + + By("checking the NodeDisruption is being rejected") + Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState { + err := k8sClient.Get(ctx, NDLookupKey, createdDisruption) + if err != nil { + panic("should be able to get") + } + return createdDisruption.Status.State + }, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Rejected)) + + Expect(createdDisruption.Status.DisruptedDisruptionBudgets[0].Reason).Should(Equal("Budget frozen: foobar")) + }) + }) + + When("there is a application disruption budget that is frozen", func() { + It("rejects the node disruption", func() { + By("creating a budget that accepts everything") + adb := &nodedisruptionv1alpha1.ApplicationDisruptionBudget{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "nodedisruption.criteo.com/v1alpha1", + Kind: "ApplicationDisruptionBudget", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: NDNamespace, + }, + Spec: nodedisruptionv1alpha1.ApplicationDisruptionBudgetSpec{ + PodSelector: metav1.LabelSelector{MatchLabels: podLabels}, + PVCSelector: metav1.LabelSelector{MatchLabels: podLabels}, + MaxDisruptions: 1, + }, + } + Expect(k8sClient.Create(ctx, adb.DeepCopy())).Should(Succeed()) + + By("creating a new NodeDisruption") + disruption := &nodedisruptionv1alpha1.NodeDisruption{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "nodedisruption.criteo.com/v1alpha1", + Kind: "NodeDisruption", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: NDName, + Namespace: NDNamespace, + }, + Spec: nodedisruptionv1alpha1.NodeDisruptionSpec{ + NodeSelector: metav1.LabelSelector{MatchLabels: nodeLabels1}, + }, + } + Expect(k8sClient.Create(ctx, disruption.DeepCopy())).Should(Succeed()) + + By("checking the NodeDisruption is being granted") + createdDisruption := &nodedisruptionv1alpha1.NodeDisruption{} + + Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState { + err := k8sClient.Get(ctx, NDLookupKey, createdDisruption) + if err != nil { + panic("should be able to get") + } + return createdDisruption.Status.State + }, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Granted)) + + By("creating a budget that accepts everything") + Expect(k8sClient.Delete(ctx, adb)).Should(Succeed()) + adb.Spec.Freeze.Enabled = true + adb.Spec.Freeze.Reason = "foobar" + Expect(k8sClient.Create(ctx, adb)).Should(Succeed()) + + By("Re-creating a new NodeDisruption") + Expect(k8sClient.Delete(ctx, disruption)).Should(Succeed()) + Expect(k8sClient.Create(ctx, disruption.DeepCopy())).Should(Succeed()) + + By("checking the NodeDisruption is being rejected") + Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState { + err := k8sClient.Get(ctx, NDLookupKey, createdDisruption) + if err != nil { + panic("should be able to get") + } + return createdDisruption.Status.State + }, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Rejected)) + + Expect(createdDisruption.Status.DisruptedDisruptionBudgets[0].Reason).Should(Equal("Budget frozen: foobar")) + }) + }) + When("there is a budget that doesn't support any disruption and retry is activated", func() { It("rejects the node disruption", func() { By("creating a budget that rejects everything") diff --git a/internal/controller/nodedisruptionbudget_controller.go b/internal/controller/nodedisruptionbudget_controller.go index 5a61a31..5591fc9 100644 --- a/internal/controller/nodedisruptionbudget_controller.go +++ b/internal/controller/nodedisruptionbudget_controller.go @@ -109,6 +109,11 @@ func PruneNDBMetrics(ref nodedisruptionv1alpha1.NamespacedName) { func UpdateNDBMetrics(ref nodedisruptionv1alpha1.NamespacedName, ndb *nodedisruptionv1alpha1.NodeDisruptionBudget) { DisruptionBudgetMaxDisruptedNodes.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(float64(ndb.Spec.MaxDisruptedNodes)) DisruptionBudgetMinUndisruptedNodes.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(float64(ndb.Spec.MinUndisruptedNodes)) + if ndb.Spec.Freeze.Enabled { + DisruptionBudgetFrozen.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(1) + } else { + DisruptionBudgetFrozen.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(0) + } UpdateBudgetStatusMetrics(ref, ndb.Status) } @@ -192,6 +197,14 @@ func (r *NodeDisruptionBudgetResolver) IsImpacted(disruptedNodes resolver.NodeSe // Return the number of disruption allowed considering a list of current node disruptions func (r *NodeDisruptionBudgetResolver) TryValidateDisruptionFromBudgetConstraints(disruptedNodes resolver.NodeSet) nodedisruptionv1alpha1.DisruptedBudgetStatus { + if r.NodeDisruptionBudget.Spec.Freeze.Enabled { + return nodedisruptionv1alpha1.DisruptedBudgetStatus{ + Reference: r.GetNamespacedName(), + Reason: fmt.Sprintf("Budget frozen: %s", r.NodeDisruptionBudget.Spec.Freeze.Reason), + Ok: false, + } + } + watchedNodes := resolver.NewNodeSetFromStringList(r.NodeDisruptionBudget.Status.WatchedNodes) disruptedNodesCount := watchedNodes.Intersection(disruptedNodes).Len() if r.NodeDisruptionBudget.Status.DisruptionsAllowed-disruptedNodesCount >= 0 {