Skip to content

Commit 4400d80

Browse files
committed
feat: set IgnoreDaemonSetsUtilization per nodegroup
Signed-off-by: vadasambar <[email protected]> fix: test cases failing for actuator and scaledown/eligibility - abstract default values into `config` Signed-off-by: vadasambar <[email protected]> refactor: rename global `IgnoreDaemonSetsUtilization` -> `GlobalIgnoreDaemonSetsUtilization` in code - there is no change in the flag name - rename `thresholdGetter` -> `configGetter` and tweak it to accomodate `GetIgnoreDaemonSetsUtilization` Signed-off-by: vadasambar <[email protected]> refactor: reset help text for `ignore-daemonsets-utilization` flag - because per nodegroup override is supported only for AWS ASG tags as of now Signed-off-by: vadasambar <[email protected]> docs: add info about overriding `--ignore-daemonsets-utilization` per ASG - in AWS cloud provider README Signed-off-by: vadasambar <[email protected]> refactor: use a limiting interface in actuator in place of `NodeGroupConfigProcessor` interface - to limit the functions that can be used - since we need it only for `GetIgnoreDaemonSetsUtilization` Signed-off-by: vadasambar <[email protected]> fix: tests failing for actuator - rename `staticNodeGroupConfigProcessor` -> `MockNodeGroupConfigGetter` - move `MockNodeGroupConfigGetter` to test/common so that it can be used in different tests Signed-off-by: vadasambar <[email protected]> fix: go lint errors for `MockNodeGroupConfigGetter` Signed-off-by: vadasambar <[email protected]> test: add tests for `IgnoreDaemonSetsUtilization` in cloud provider dir Signed-off-by: vadasambar <[email protected]> test: update node group config processor tests for `IgnoreDaemonSetsUtilization` Signed-off-by: vadasambar <[email protected]> test: update eligibility test cases for `IgnoreDaemonSetsUtilization` Signed-off-by: vadasambar <[email protected]> test: run actuation tests for 2 NGS - one with `IgnoreDaemonSetsUtilization`: `false` - one with `IgnoreDaemonSetsUtilization`: `true` Signed-off-by: vadasambar <[email protected]> test: add tests for `IgnoreDaemonSetsUtilization` in actuator - add helper to generate multiple ds pods dynamically - get rid of mock config processor because it is not required Signed-off-by: vadasambar <[email protected]>
1 parent 299c963 commit 4400d80

File tree

16 files changed

+636
-332
lines changed

16 files changed

+636
-332
lines changed

cluster-autoscaler/cloudprovider/aws/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,8 @@ as string). Currently supported autoscaling options (and example values) are:
246246
(overrides `--scale-down-unneeded-time` value for that specific ASG)
247247
* `k8s.io/cluster-autoscaler/node-template/autoscaling-options/scaledownunreadytime`: `20m0s`
248248
(overrides `--scale-down-unready-time` value for that specific ASG)
249+
* `k8s.io/cluster-autoscaler/node-template/autoscaling-options/ignoredaemonsetsutilization`: `true`
250+
(overrides `--ignore-daemonsets-utilization` value for that specific ASG)
249251

250252
**NOTE:** It is your responsibility to ensure such labels and/or taints are
251253
applied via the node's kubelet configuration at startup. Cluster Autoscaler will not set the node taints for you.

cluster-autoscaler/cloudprovider/aws/aws_manager.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,15 @@ func (m *AwsManager) GetAsgOptions(asg asg, defaults config.NodeGroupAutoscaling
245245
}
246246
}
247247

248+
if stringOpt, found := options[config.DefaultIgnoreDaemonSetsUtilizationKey]; found {
249+
if opt, err := strconv.ParseBool(stringOpt); err != nil {
250+
klog.Warningf("failed to convert asg %s %s tag to bool: %v",
251+
asg.Name, config.DefaultScaleDownUnreadyTimeKey, err)
252+
} else {
253+
defaults.IgnoreDaemonSetsUtilization = opt
254+
}
255+
}
256+
248257
return &defaults
249258
}
250259

cluster-autoscaler/cloudprovider/aws/aws_manager_test.go

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ func TestGetAsgOptions(t *testing.T) {
109109
ScaleDownGpuUtilizationThreshold: 0.2,
110110
ScaleDownUnneededTime: time.Second,
111111
ScaleDownUnreadyTime: time.Minute,
112+
IgnoreDaemonSetsUtilization: false,
112113
}
113114

114115
tests := []struct {
@@ -124,39 +125,60 @@ func TestGetAsgOptions(t *testing.T) {
124125
{
125126
description: "keep defaults on invalid tags values",
126127
tags: map[string]string{
127-
"scaledownutilizationthreshold": "not-a-float",
128-
"scaledownunneededtime": "not-a-duration",
129-
"ScaleDownUnreadyTime": "",
128+
config.DefaultScaleDownUtilizationThresholdKey: "not-a-float",
129+
config.DefaultScaleDownUnneededTimeKey: "not-a-duration",
130+
"ScaleDownUnreadyTime": "",
131+
config.DefaultIgnoreDaemonSetsUtilizationKey: "not-a-bool",
130132
},
131133
expected: &defaultOptions,
132134
},
133135
{
134136
description: "use provided tags and fill missing with defaults",
135137
tags: map[string]string{
136-
"scaledownutilizationthreshold": "0.42",
137-
"scaledownunneededtime": "1h",
138+
config.DefaultScaleDownUtilizationThresholdKey: "0.42",
139+
config.DefaultScaleDownUnneededTimeKey: "1h",
140+
config.DefaultIgnoreDaemonSetsUtilizationKey: "true",
138141
},
139142
expected: &config.NodeGroupAutoscalingOptions{
140143
ScaleDownUtilizationThreshold: 0.42,
141144
ScaleDownGpuUtilizationThreshold: defaultOptions.ScaleDownGpuUtilizationThreshold,
142145
ScaleDownUnneededTime: time.Hour,
143146
ScaleDownUnreadyTime: defaultOptions.ScaleDownUnreadyTime,
147+
IgnoreDaemonSetsUtilization: true,
148+
},
149+
},
150+
{
151+
description: "use provided tags (happy path)",
152+
tags: map[string]string{
153+
config.DefaultScaleDownUtilizationThresholdKey: "0.42",
154+
config.DefaultScaleDownUnneededTimeKey: "1h",
155+
config.DefaultScaleDownGpuUtilizationThresholdKey: "0.7",
156+
config.DefaultScaleDownUnreadyTimeKey: "25m",
157+
config.DefaultIgnoreDaemonSetsUtilizationKey: "true",
158+
},
159+
expected: &config.NodeGroupAutoscalingOptions{
160+
ScaleDownUtilizationThreshold: 0.42,
161+
ScaleDownGpuUtilizationThreshold: 0.7,
162+
ScaleDownUnneededTime: time.Hour,
163+
ScaleDownUnreadyTime: 25 * time.Minute,
164+
IgnoreDaemonSetsUtilization: true,
144165
},
145166
},
146167
{
147168
description: "ignore unknown tags",
148169
tags: map[string]string{
149-
"scaledownutilizationthreshold": "0.6",
150-
"scaledowngpuutilizationthreshold": "0.7",
151-
"scaledownunneededtime": "1m",
152-
"scaledownunreadytime": "1h",
153-
"notyetspecified": "42",
170+
config.DefaultScaleDownUtilizationThresholdKey: "0.6",
171+
config.DefaultScaleDownGpuUtilizationThresholdKey: "0.7",
172+
config.DefaultScaleDownUnneededTimeKey: "1m",
173+
config.DefaultScaleDownUnreadyTimeKey: "1h",
174+
"notyetspecified": "42",
154175
},
155176
expected: &config.NodeGroupAutoscalingOptions{
156177
ScaleDownUtilizationThreshold: 0.6,
157178
ScaleDownGpuUtilizationThreshold: 0.7,
158179
ScaleDownUnneededTime: time.Minute,
159180
ScaleDownUnreadyTime: time.Hour,
181+
IgnoreDaemonSetsUtilization: false,
160182
},
161183
},
162184
}

cluster-autoscaler/config/autoscaling_options.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ type NodeGroupAutoscalingOptions struct {
4646
ScaleDownUnreadyTime time.Duration
4747
// Maximum time CA waits for node to be provisioned
4848
MaxNodeProvisionTime time.Duration
49+
// IgnoreDaemonSetsUtilization sets if daemonsets utilization should be considered during node scale-down
50+
IgnoreDaemonSetsUtilization bool
4951
}
5052

5153
// GCEOptions contain autoscaling options specific to GCE cloud provider.
@@ -115,8 +117,9 @@ type AutoscalingOptions struct {
115117
GRPCExpanderCert string
116118
// GRPCExpanderURL is the url of the gRPC server when using the gRPC expander
117119
GRPCExpanderURL string
118-
// IgnoreDaemonSetsUtilization is whether CA will ignore DaemonSet pods when calculating resource utilization for scaling down
119-
IgnoreDaemonSetsUtilization bool
120+
// GlobalIgnoreDaemonSetsUtilization is whether CA will ignore DaemonSet pods when calculating resource utilization for scaling down
121+
// for the entire cluster (unless per nodegroup IgnoreDaemonSetsUtilization is set by the user)
122+
GlobalIgnoreDaemonSetsUtilization bool
120123
// IgnoreMirrorPodsUtilization is whether CA will ignore Mirror pods when calculating resource utilization for scaling down
121124
IgnoreMirrorPodsUtilization bool
122125
// MaxGracefulTerminationSec is maximum number of seconds scale down waits for pods to terminate before

cluster-autoscaler/config/const.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ limitations under the License.
1616

1717
package config
1818

19+
import "time"
20+
1921
const (
2022
// DefaultMaxClusterCores is the default maximum number of cores in the cluster.
2123
DefaultMaxClusterCores = 5000 * 64
@@ -32,4 +34,14 @@ const (
3234
DefaultScaleDownUnreadyTimeKey = "scaledownunreadytime"
3335
// DefaultMaxNodeProvisionTimeKey identifies MaxNodeProvisionTime autoscaling option
3436
DefaultMaxNodeProvisionTimeKey = "maxnodeprovisiontime"
37+
// DefaultIgnoreDaemonSetsUtilizationKey identifies IgnoreDaemonSetsUtilization autoscaling option
38+
DefaultIgnoreDaemonSetsUtilizationKey = "ignoredaemonsetsutilization"
39+
// DefaultScaleDownUnneededTime identifies ScaleDownUnneededTime autoscaling option
40+
DefaultScaleDownUnneededTime = 10 * time.Minute
41+
// DefaultScaleDownUnreadyTime identifies ScaleDownUnreadyTime autoscaling option
42+
DefaultScaleDownUnreadyTime = 20 * time.Minute
43+
// DefaultScaleDownUtilizationThreshold identifies ScaleDownUtilizationThreshold autoscaling option
44+
DefaultScaleDownUtilizationThreshold = 0.5
45+
// DefaultScaleDownGpuUtilizationThreshold identifies ScaleDownGpuUtilizationThreshold autoscaling option
46+
DefaultScaleDownGpuUtilizationThreshold = 0.5
3547
)

cluster-autoscaler/core/scaledown/actuation/actuator.go

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status"
3434
"k8s.io/autoscaler/cluster-autoscaler/core/utils"
3535
"k8s.io/autoscaler/cluster-autoscaler/metrics"
36+
"k8s.io/autoscaler/cluster-autoscaler/processors"
3637
"k8s.io/autoscaler/cluster-autoscaler/simulator"
3738
"k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot"
3839
"k8s.io/autoscaler/cluster-autoscaler/simulator/utilization"
@@ -51,10 +52,18 @@ type Actuator struct {
5152
nodeDeletionBatcher *NodeDeletionBatcher
5253
evictor Evictor
5354
deleteOptions simulator.NodeDeleteOptions
55+
configGetter actuatorNodeGroupConfigGetter
56+
}
57+
58+
// actuatorNodeGroupConfigGetter is an interface to limit the functions that can be used
59+
// from NodeGroupConfigProcessor interface
60+
type actuatorNodeGroupConfigGetter interface {
61+
// GetIgnoreDaemonSetsUtilization returns IgnoreDaemonSetsUtilization value that should be used for a given NodeGroup.
62+
GetIgnoreDaemonSetsUtilization(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (bool, error)
5463
}
5564

5665
// NewActuator returns a new instance of Actuator.
57-
func NewActuator(ctx *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, ndt *deletiontracker.NodeDeletionTracker, deleteOptions simulator.NodeDeleteOptions) *Actuator {
66+
func NewActuator(ctx *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, ndt *deletiontracker.NodeDeletionTracker, deleteOptions simulator.NodeDeleteOptions, processors *processors.AutoscalingProcessors) *Actuator {
5867
nbd := NewNodeDeletionBatcher(ctx, csr, ndt, ctx.NodeDeletionBatcherInterval)
5968
return &Actuator{
6069
ctx: ctx,
@@ -63,6 +72,7 @@ func NewActuator(ctx *context.AutoscalingContext, csr *clusterstate.ClusterState
6372
nodeDeletionBatcher: nbd,
6473
evictor: NewDefaultEvictor(deleteOptions, ndt),
6574
deleteOptions: deleteOptions,
75+
configGetter: processors.NodeGroupConfigProcessor,
6676
}
6777
}
6878

@@ -306,8 +316,14 @@ func (a *Actuator) scaleDownNodeToReport(node *apiv1.Node, drain bool) (*status.
306316
if err != nil {
307317
return nil, err
308318
}
319+
320+
ignoreDaemonSetsUtilization, err := a.configGetter.GetIgnoreDaemonSetsUtilization(a.ctx, nodeGroup)
321+
if err != nil {
322+
return nil, err
323+
}
324+
309325
gpuConfig := a.ctx.CloudProvider.GetNodeGpuConfig(node)
310-
utilInfo, err := utilization.Calculate(nodeInfo, a.ctx.IgnoreDaemonSetsUtilization, a.ctx.IgnoreMirrorPodsUtilization, gpuConfig, time.Now())
326+
utilInfo, err := utilization.Calculate(nodeInfo, ignoreDaemonSetsUtilization, a.ctx.IgnoreMirrorPodsUtilization, gpuConfig, time.Now())
311327
if err != nil {
312328
return nil, err
313329
}

0 commit comments

Comments
 (0)