
Commit f14d91a

Add atomic scale down option for node groups
1 parent 1009797 commit f14d91a

File tree

8 files changed

+1065
-405
lines changed


cluster-autoscaler/cloudprovider/test/test_cloud_provider.go

Lines changed: 3 additions & 0 deletions
@@ -443,6 +443,9 @@ func (tng *TestNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	id := tng.id
 	tng.targetSize -= len(nodes)
 	tng.Unlock()
+	if tng.opts != nil && tng.opts.AtomicScaleDown && tng.targetSize != 0 {
+		return fmt.Errorf("TestNodeGroup: attempted to partially scale down a node group that should be scaled down atomically")
+	}
 	for _, node := range nodes {
 		err := tng.cloudProvider.onScaleDown(id, node.Name)
 		if err != nil {
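
The guard added to the test provider captures the atomic contract: a DeleteNodes call on a group with AtomicScaleDown set must empty the group, otherwise it fails. A minimal, self-contained sketch of that invariant (toy types, not the real TestNodeGroup):

package main

import "fmt"

// atomicGroup is a toy stand-in for TestNodeGroup: a deletion on an
// atomic group must bring its target size to zero or it is rejected.
type atomicGroup struct {
	targetSize int
	atomic     bool
}

func (g *atomicGroup) deleteNodes(n int) error {
	g.targetSize -= n
	if g.atomic && g.targetSize != 0 {
		return fmt.Errorf("attempted to partially scale down an atomic node group")
	}
	return nil
}

func main() {
	g := &atomicGroup{targetSize: 3, atomic: true}
	fmt.Println(g.deleteNodes(2)) // error: one node would remain
	g = &atomicGroup{targetSize: 3, atomic: true}
	fmt.Println(g.deleteNodes(3)) // <nil>: the whole group goes at once
}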

cluster-autoscaler/config/autoscaling_options.go

Lines changed: 2 additions & 0 deletions
@@ -44,6 +44,8 @@ type NodeGroupAutoscalingOptions struct {
 	ScaleDownUnneededTime time.Duration
 	// ScaleDownUnreadyTime represents how long an unready node should be unneeded before it is eligible for scale down
 	ScaleDownUnreadyTime time.Duration
+	// AtomicScaleDown means that all nodes should be brought down all at once instead of one-by-one
+	AtomicScaleDown bool
 }

 const (
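
Because AtomicScaleDown lives in the per-node-group options, a cloud provider opts a group into all-or-nothing deletion through NodeGroup.GetOptions, which the planner queries further down in this commit. A hedged sketch of the provider-side wiring; myNodeGroup is a hypothetical implementation, not part of this commit:

// Hypothetical: a provider-specific node group marking itself atomic.
func (ng *myNodeGroup) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
	opts := defaults            // start from the cluster-wide defaults
	opts.AtomicScaleDown = true // this group must be scaled down all at once
	return &opts, nil
}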

cluster-autoscaler/core/scaledown/actuation/actuator.go

Lines changed: 210 additions & 97 deletions
Large diffs are not rendered by default.

cluster-autoscaler/core/scaledown/actuation/actuator_test.go

Lines changed: 559 additions & 262 deletions
Large diffs are not rendered by default.

cluster-autoscaler/core/scaledown/actuation/delete_in_batch.go

Lines changed: 39 additions & 39 deletions
@@ -18,7 +18,6 @@ package actuation

 import (
 	"fmt"
-	"reflect"
 	"sync"
 	"time"

@@ -67,59 +66,68 @@ func NewNodeDeletionBatcher(ctx *context.AutoscalingContext, csr *clusterstate.C
 	}
 }

-// AddNode adds node to delete candidates and schedule deletion.
+// AddNode adds node to delete candidates and schedules deletion.
 func (d *NodeDeletionBatcher) AddNode(node *apiv1.Node, drain bool) error {
+	nodeGroup, err := d.ctx.CloudProvider.NodeGroupForNode(node)
+	if err != nil {
+		return err
+	}
+	return d.AddNodes([]*apiv1.Node{node}, nodeGroup, drain)
+}
+
+// AddNodes adds node list to delete candidates and schedules deletion.
+func (d *NodeDeletionBatcher) AddNodes(nodes []*apiv1.Node, nodeGroup cloudprovider.NodeGroup, drain bool) error {
 	// If delete interval is 0, than instantly start node deletion.
 	if d.deleteInterval == 0 {
-		nodeGroup, err := deleteNodesFromCloudProvider(d.ctx, []*apiv1.Node{node})
-		if err != nil {
-			result := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorFailedToDelete, Err: err}
-			CleanUpAndRecordFailedScaleDownEvent(d.ctx, node, nodeGroup.Id(), drain, d.nodeDeletionTracker, "", result)
-		} else {
-			RegisterAndRecordSuccessfulScaleDownEvent(d.ctx, d.clusterState, node, nodeGroup, drain, d.nodeDeletionTracker)
+		err := deleteNodesFromCloudProvider(d.ctx, nodes, nodeGroup)
+		for _, node := range nodes {
+			if err != nil {
+				result := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorFailedToDelete, Err: err}
+				CleanUpAndRecordFailedScaleDownEvent(d.ctx, node, nodeGroup.Id(), drain, d.nodeDeletionTracker, "", result)
+			} else {
+				RegisterAndRecordSuccessfulScaleDownEvent(d.ctx, d.clusterState, node, nodeGroup, drain, d.nodeDeletionTracker)
+			}
 		}
 		return nil
 	}
-	nodeGroupId, first, err := d.addNodeToBucket(node, drain)
+	first, err := d.addNodesToBucket(nodes, nodeGroup, drain)
 	if err != nil {
 		return err
 	}
 	if first {
-		go func(nodeGroupId string) {
+		go func(nodeGroup cloudprovider.NodeGroup) {
 			time.Sleep(d.deleteInterval)
-			d.remove(nodeGroupId)
-		}(nodeGroupId)
+			d.executeForBucket(nodeGroup)
+		}(nodeGroup)
 	}
 	return nil
 }

 // AddToBucket adds node to delete candidates and return if it's a first node in the group.
-func (d *NodeDeletionBatcher) addNodeToBucket(node *apiv1.Node, drain bool) (string, bool, error) {
+func (d *NodeDeletionBatcher) addNodesToBucket(nodes []*apiv1.Node, nodeGroup cloudprovider.NodeGroup, drain bool) (bool, error) {
 	d.Lock()
 	defer d.Unlock()
-	nodeGroup, err := d.ctx.CloudProvider.NodeGroupForNode(node)
-	if err != nil {
-		return "", false, err
+	for _, node := range nodes {
+		d.drainedNodeDeletions[node.Name] = drain
 	}
-	d.drainedNodeDeletions[node.Name] = drain
 	val, ok := d.deletionsPerNodeGroup[nodeGroup.Id()]
 	if !ok || len(val) == 0 {
-		d.deletionsPerNodeGroup[nodeGroup.Id()] = []*apiv1.Node{node}
-		return nodeGroup.Id(), true, nil
+		d.deletionsPerNodeGroup[nodeGroup.Id()] = nodes
+		return true, nil
 	}
-	d.deletionsPerNodeGroup[nodeGroup.Id()] = append(d.deletionsPerNodeGroup[nodeGroup.Id()], node)
-	return nodeGroup.Id(), false, nil
+	d.deletionsPerNodeGroup[nodeGroup.Id()] = append(d.deletionsPerNodeGroup[nodeGroup.Id()], nodes...)
+	return false, nil
 }

-// remove delete nodes of a given nodeGroup, if successful, the deletion is recorded in CSR, and an event is emitted on the node.
-func (d *NodeDeletionBatcher) remove(nodeGroupId string) error {
+// executeForBucket deletes nodes of a given nodeGroup, if successful, the deletion is recorded in CSR, and an event is emitted on the node.
+func (d *NodeDeletionBatcher) executeForBucket(nodeGroup cloudprovider.NodeGroup) error {
 	d.Lock()
 	defer d.Unlock()
-	nodes, ok := d.deletionsPerNodeGroup[nodeGroupId]
+	nodes, ok := d.deletionsPerNodeGroup[nodeGroup.Id()]
 	if !ok {
-		return fmt.Errorf("Node Group %s is not present in the batch deleter", nodeGroupId)
+		return fmt.Errorf("Node Group %s is not present in the batch deleter", nodeGroup.Id())
 	}
-	delete(d.deletionsPerNodeGroup, nodeGroupId)
+	delete(d.deletionsPerNodeGroup, nodeGroup.Id())
 	drainedNodeDeletions := make(map[string]bool)
 	for _, node := range nodes {
 		drainedNodeDeletions[node.Name] = d.drainedNodeDeletions[node.Name]

@@ -128,7 +136,7 @@ func (d *NodeDeletionBatcher) remove(nodeGroupId string) error {

 	go func(nodes []*apiv1.Node, drainedNodeDeletions map[string]bool) {
 		var result status.NodeDeleteResult
-		nodeGroup, err := deleteNodesFromCloudProvider(d.ctx, nodes)
+		err := deleteNodesFromCloudProvider(d.ctx, nodes, nodeGroup)
 		for _, node := range nodes {
 			drain := drainedNodeDeletions[node.Name]
 			if err != nil {

@@ -137,26 +145,18 @@ func (d *NodeDeletionBatcher) remove(nodeGroupId string) error {
 			} else {
 				RegisterAndRecordSuccessfulScaleDownEvent(d.ctx, d.clusterState, node, nodeGroup, drain, d.nodeDeletionTracker)
 			}
-
 		}
 	}(nodes, drainedNodeDeletions)
 	return nil
 }

 // deleteNodeFromCloudProvider removes the given nodes from cloud provider. No extra pre-deletion actions are executed on
 // the Kubernetes side.
-func deleteNodesFromCloudProvider(ctx *context.AutoscalingContext, nodes []*apiv1.Node) (cloudprovider.NodeGroup, error) {
-	nodeGroup, err := ctx.CloudProvider.NodeGroupForNode(nodes[0])
-	if err != nil {
-		return nodeGroup, errors.NewAutoscalerError(errors.CloudProviderError, "failed to find node group for %s: %v", nodes[0].Name, err)
-	}
-	if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
-		return nodeGroup, errors.NewAutoscalerError(errors.InternalError, "picked node that doesn't belong to a node group: %s", nodes[0].Name)
+func deleteNodesFromCloudProvider(ctx *context.AutoscalingContext, nodes []*apiv1.Node, nodeGroup cloudprovider.NodeGroup) error {
+	if err := nodeGroup.DeleteNodes(nodes); err != nil {
+		return errors.NewAutoscalerError(errors.CloudProviderError, "failed to delete nodes from group %s: %v", nodeGroup.Id(), err)
 	}
-	if err = nodeGroup.DeleteNodes(nodes); err != nil {
-		return nodeGroup, errors.NewAutoscalerError(errors.CloudProviderError, "failed to delete %s: %v", nodes[0].Name, err)
-	}
-	return nodeGroup, nil
+	return nil
 }

 func nodeScaleDownReason(node *apiv1.Node, drain bool) metrics.NodeScaleDownReason {
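
With the node group now resolved by the caller, a whole batch can be queued in one AddNodes call, and AddNode reduces to a single-node wrapper around it. A sketch of the calling pattern (assumes batcher, ctx, and nodes already exist in scope):

// Assumed in scope: batcher *NodeDeletionBatcher, ctx *context.AutoscalingContext,
// nodes []*apiv1.Node, all belonging to the same node group.
nodeGroup, err := ctx.CloudProvider.NodeGroupForNode(nodes[0])
if err != nil {
	return err
}
// The first AddNodes call for a group opens its bucket and schedules
// executeForBucket after deleteInterval; later calls just append to it.
if err := batcher.AddNodes(nodes, nodeGroup, false /* drain */); err != nil {
	return err
}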

cluster-autoscaler/core/scaledown/actuation/delete_in_batch_test.go

Lines changed: 11 additions & 6 deletions
@@ -43,8 +43,8 @@ func TestAddNodeToBucket(t *testing.T) {
 	}
 	nodeGroup1 := "ng-1"
 	nodeGroup2 := "ng-2"
-	nodes1 := generateNodes(5, "ng-1")
-	nodes2 := generateNodes(5, "ng-2")
+	nodes1 := generateNodes(0, 5, "ng-1")
+	nodes2 := generateNodes(0, 5, "ng-2")
 	provider.AddNodeGroup(nodeGroup1, 1, 10, 5)
 	provider.AddNodeGroup(nodeGroup2, 1, 10, 5)
 	for _, node := range nodes1 {

@@ -91,7 +91,11 @@ func TestAddNodeToBucket(t *testing.T) {
 		}
 		batchCount := 0
 		for _, node := range test.nodes {
-			_, first, err := d.addNodeToBucket(node, test.drained)
+			nodeGroup, err := provider.NodeGroupForNode(node)
+			if err != nil {
+				t.Errorf("couldn't get node info for node %s: %s", node.Name, err)
+			}
+			first, err := d.addNodesToBucket([]*apiv1.Node{node}, nodeGroup, test.drained)
 			if err != nil {
 				t.Errorf("addNodeToBucket return error %q when addidng node %v", err, node)
 			}

@@ -168,6 +172,7 @@ func TestRemove(t *testing.T) {

 			ng := "ng"
 			provider.AddNodeGroup(ng, 1, 10, test.numNodes)
+			nodeGroup := provider.GetNodeGroup(ng)

 			d := NodeDeletionBatcher{
 				ctx: &ctx,

@@ -176,7 +181,7 @@ func TestRemove(t *testing.T) {
 				deletionsPerNodeGroup: make(map[string][]*apiv1.Node),
 				drainedNodeDeletions:  make(map[string]bool),
 			}
-			nodes := generateNodes(test.numNodes, ng)
+			nodes := generateNodes(0, test.numNodes, ng)
 			failedDeletion := test.failedDeletion
 			for _, node := range nodes {
 				if failedDeletion > 0 {

@@ -191,14 +196,14 @@ func TestRemove(t *testing.T) {
 					Key:    taints.ToBeDeletedTaint,
 					Effect: apiv1.TaintEffectNoSchedule,
 				})
-				_, _, err := d.addNodeToBucket(node, true)
+				_, err = d.addNodesToBucket([]*apiv1.Node{node}, nodeGroup, true)
 				if err != nil {
 					t.Errorf("addNodeToBucket return error %q when addidng node %v", err, node)
 				}
 			}
 		}

-		err = d.remove(ng)
+		err = d.executeForBucket(nodeGroup)
 		if test.err {
 			if err == nil {
 				t.Errorf("remove() should return error, but return nil")

cluster-autoscaler/core/scaledown/planner/planner.go

Lines changed: 50 additions & 1 deletion
@@ -18,11 +18,13 @@ package planner

 import (
 	"fmt"
+	"math"
 	"time"

 	apiv1 "k8s.io/api/core/v1"
 	policyv1 "k8s.io/api/policy/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
 	"k8s.io/autoscaler/cluster-autoscaler/context"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown"
 	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/eligibility"

@@ -132,6 +134,7 @@ func (p *Planner) CleanUpUnneededNodes() {
 // NodesToDelete returns all Nodes that could be removed right now, according
 // to the Planner.
 func (p *Planner) NodesToDelete(_ time.Time) (empty, needDrain []*apiv1.Node) {
+	empty, needDrain = []*apiv1.Node{}, []*apiv1.Node{}
 	nodes, err := allNodes(p.context.ClusterSnapshot)
 	if err != nil {
 		klog.Errorf("Nothing will scale down, failed to list nodes from ClusterSnapshot: %v", err)

@@ -154,17 +157,63 @@ func (p *Planner) NodesToDelete(_ time.Time) (empty, needDrain []*apiv1.Node) {
 		// downs already in progress. If we pass the empty nodes first, they will be first
 		// to get deleted, thus we decrease chances of hitting the limit on non-empty scale down.
 		append(emptyRemovable, needDrainRemovable...),
-		p.context.AutoscalingOptions.MaxScaleDownParallelism)
+		// No need to limit the number of nodes, since it will happen later, in the actuation stage.
+		// It will make a more appropriate decision by using additional information about deletions
+		// in progress.
+		math.MaxInt)
 	for _, nodeToRemove := range nodesToRemove {
 		if len(nodeToRemove.PodsToReschedule) > 0 {
 			needDrain = append(needDrain, nodeToRemove.Node)
 		} else {
 			empty = append(empty, nodeToRemove.Node)
 		}
 	}
+
+	empty, filteredOut := p.filterOutIncompleteAtomicNodeGroups(empty)
+	needDrain, _ = p.filterOutIncompleteAtomicNodeGroups(append(needDrain, filteredOut...))
 	return empty, needDrain
 }

+func (p *Planner) filterOutIncompleteAtomicNodeGroups(nodes []*apiv1.Node) ([]*apiv1.Node, []*apiv1.Node) {
+	nodesByGroup := map[cloudprovider.NodeGroup][]*apiv1.Node{}
+	result := []*apiv1.Node{}
+	filteredOut := []*apiv1.Node{}
+	for _, node := range nodes {
+		nodeGroup, err := p.context.CloudProvider.NodeGroupForNode(node)
+		if err != nil {
+			klog.Errorf("Node %v will not scale down, failed to get node info: %s", node.Name, err)
+			continue
+		}
+		autoscalingOptions, err := nodeGroup.GetOptions(p.context.NodeGroupDefaults)
+		if err != nil {
+			klog.Errorf("Failed to get autoscaling options for node group %s: %v", nodeGroup.Id(), err)
+			continue
+		}
+		if autoscalingOptions != nil && autoscalingOptions.AtomicScaleDown {
+			klog.V(2).Infof("Considering node %s for atomic scale down", node.Name)
+			nodesByGroup[nodeGroup] = append(nodesByGroup[nodeGroup], node)
+		} else {
+			klog.V(2).Infof("Considering node %s for standard scale down", node.Name)
+			result = append(result, node)
+		}
+	}
+	for nodeGroup, nodes := range nodesByGroup {
+		ngSize, err := nodeGroup.TargetSize()
+		if err != nil {
+			klog.Errorf("Nodes from group %s will not scale down, failed to get target size: %s", nodeGroup.Id(), err)
+			continue
+		}
+		if ngSize == len(nodes) {
+			klog.V(2).Infof("Scheduling atomic scale down for all %v nodes from node group %s", len(nodes), nodeGroup.Id())
+			result = append(result, nodes...)
+		} else {
+			klog.V(2).Infof("Skipping scale down for %v nodes from node group %s, all %v nodes have to be scaled down atomically", len(nodes), nodeGroup.Id(), ngSize)
+			filteredOut = append(filteredOut, nodes...)
+		}
+	}
+	return result, filteredOut
+}
+
 func allNodes(s clustersnapshot.ClusterSnapshot) ([]*apiv1.Node, error) {
 	nodeInfos, err := s.NodeInfos().List()
 	if err != nil {
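
The filter's core rule is a size comparison: nodes from an atomic group pass through only when every node in the group is a removal candidate; otherwise all of them are held back. A self-contained sketch of just that decision (toy string-keyed maps instead of cloudprovider.NodeGroup):

package main

import "fmt"

// keepAtomicGroups mirrors the decision in filterOutIncompleteAtomicNodeGroups:
// an atomic group is scaled down only as a whole.
func keepAtomicGroups(candidates map[string][]string, targetSize map[string]int) (keep, filteredOut []string) {
	for group, nodes := range candidates {
		if len(nodes) == targetSize[group] {
			keep = append(keep, nodes...) // every node is unneeded: delete atomically
		} else {
			filteredOut = append(filteredOut, nodes...) // partial: wait until the whole group is unneeded
		}
	}
	return keep, filteredOut
}

func main() {
	candidates := map[string][]string{"ng-a": {"a1", "a2"}, "ng-b": {"b1"}}
	sizes := map[string]int{"ng-a": 2, "ng-b": 3}
	keep, skipped := keepAtomicGroups(candidates, sizes)
	fmt.Println(keep, skipped) // [a1 a2] [b1]
}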
