Skip to content

Commit 052e165

Browse files
committed
Code review comments addressed
1 parent 4de4a27 commit 052e165

File tree

4 files changed

+106
-76
lines changed

4 files changed

+106
-76
lines changed

cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker.go

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ type LatencyTracker interface {
3434
GetTrackedNodes() []string
3535
}
3636
type nodeInfo struct {
37-
UnneededSince time.Time
38-
Threshold time.Duration
37+
unneededSince time.Time
38+
threshold time.Duration
3939
}
4040

4141
// NodeLatencyTracker is a concrete implementation of LatencyTracker.
@@ -64,21 +64,21 @@ func (t *NodeLatencyTracker) UpdateStateWithUnneededList(
6464

6565
if _, exists := t.nodes[node.Name]; !exists {
6666
t.nodes[node.Name] = nodeInfo{
67-
UnneededSince: timestamp,
68-
Threshold: 0,
67+
unneededSince: timestamp,
68+
threshold: 0,
6969
}
70-
klog.V(2).Infof("Started tracking unneeded node %s at %v", node.Name, timestamp)
70+
klog.V(4).Infof("Started tracking unneeded node %s at %v", node.Name, timestamp)
7171
}
7272
}
7373

7474
for name, info := range t.nodes {
7575
if _, stillUnneeded := currentSet[name]; !stillUnneeded {
7676
if _, inDeletion := currentlyInDeletion[name]; !inDeletion {
77-
duration := timestamp.Sub(info.UnneededSince)
78-
metrics.UpdateScaleDownNodeDeletionDuration("false", duration-info.Threshold)
77+
duration := timestamp.Sub(info.unneededSince)
78+
metrics.UpdateScaleDownNodeDeletionLatency(false, duration-info.threshold)
7979
delete(t.nodes, name)
80-
klog.V(2).Infof("Node %q reported as deleted/missing (unneeded for %s, threshold %s)",
81-
name, duration, info.Threshold)
80+
klog.V(4).Infof("Node %q reported as deleted/missing (unneeded for %s, threshold %s)",
81+
name, duration, info.threshold)
8282
}
8383
}
8484
}
@@ -87,24 +87,24 @@ func (t *NodeLatencyTracker) UpdateStateWithUnneededList(
8787
// ObserveDeletion is called by the actuator just before node deletion.
8888
func (t *NodeLatencyTracker) ObserveDeletion(nodeName string, timestamp time.Time) {
8989
if info, exists := t.nodes[nodeName]; exists {
90-
duration := timestamp.Sub(info.UnneededSince)
90+
duration := timestamp.Sub(info.unneededSince)
9191

92-
klog.V(2).Infof(
92+
klog.V(4).Infof(
9393
"Observing deletion for node %s, unneeded for %s (threshold was %s).",
94-
nodeName, duration, info.Threshold,
94+
nodeName, duration, info.threshold,
9595
)
9696

97-
metrics.UpdateScaleDownNodeDeletionDuration("true", duration-info.Threshold)
97+
metrics.UpdateScaleDownNodeDeletionLatency(true, duration-info.threshold)
9898
delete(t.nodes, nodeName)
9999
}
100100
}
101101

102102
// UpdateThreshold updates the scale-down threshold for a tracked node.
103103
func (t *NodeLatencyTracker) UpdateThreshold(nodeName string, threshold time.Duration) {
104104
if info, exists := t.nodes[nodeName]; exists {
105-
info.Threshold = threshold
105+
info.threshold = threshold
106106
t.nodes[nodeName] = info
107-
klog.V(2).Infof("Updated threshold for node %q to %s", nodeName, threshold)
107+
klog.V(4).Infof("Updated threshold for node %q to %s", nodeName, threshold)
108108
} else {
109109
klog.Warningf("Attempted to update threshold for unknown node %q", nodeName)
110110
}

cluster-autoscaler/core/scaledown/latencytracker/node_latency_tracker_test.go

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -28,65 +28,65 @@ func TestNodeLatencyTracker(t *testing.T) {
2828
baseTime := time.Now()
2929

3030
tests := []struct {
31-
name string
32-
setupNodes map[string]nodeInfo
33-
unneededList []string
34-
currentlyInDeletion map[string]bool
35-
updateThresholds map[string]time.Duration
36-
observeDeletion []string
37-
expectedTrackedNodes []string
38-
expectedDeletionTimes map[string]time.Duration
31+
name string
32+
setupNodes map[string]nodeInfo
33+
unneededList []string
34+
currentlyInDeletion map[string]bool
35+
updateThresholds map[string]time.Duration
36+
observeDeletion []string
37+
wantTrackedNodes []string
38+
wantDeletionTimes map[string]time.Duration
3939
}{
4040
{
41-
name: "add new unneeded nodes",
42-
setupNodes: map[string]nodeInfo{},
43-
unneededList: []string{"node1", "node2"},
44-
currentlyInDeletion: map[string]bool{},
45-
updateThresholds: map[string]time.Duration{},
46-
observeDeletion: []string{},
47-
expectedTrackedNodes: []string{"node1", "node2"},
41+
name: "add new unneeded nodes",
42+
setupNodes: map[string]nodeInfo{},
43+
unneededList: []string{"node1", "node2"},
44+
currentlyInDeletion: map[string]bool{},
45+
updateThresholds: map[string]time.Duration{},
46+
observeDeletion: []string{},
47+
wantTrackedNodes: []string{"node1", "node2"},
4848
},
4949
{
5050
name: "observe deletion with threshold",
5151
setupNodes: map[string]nodeInfo{
52-
"node1": {UnneededSince: baseTime, Threshold: 2 * time.Second},
52+
"node1": {unneededSince: baseTime, threshold: 2 * time.Second},
5353
},
54-
unneededList: []string{},
55-
currentlyInDeletion: map[string]bool{},
56-
updateThresholds: map[string]time.Duration{},
57-
observeDeletion: []string{"node1"},
58-
expectedTrackedNodes: []string{},
59-
expectedDeletionTimes: map[string]time.Duration{
54+
unneededList: []string{},
55+
currentlyInDeletion: map[string]bool{},
56+
updateThresholds: map[string]time.Duration{},
57+
observeDeletion: []string{"node1"},
58+
wantTrackedNodes: []string{},
59+
wantDeletionTimes: map[string]time.Duration{
6060
"node1": 3 * time.Second, // simulate observation 5s after UnneededSince, threshold 2s
6161
},
6262
},
6363
{
6464
name: "remove unneeded node not in deletion",
6565
setupNodes: map[string]nodeInfo{
66-
"node1": {UnneededSince: baseTime, Threshold: 1 * time.Second},
67-
"node2": {UnneededSince: baseTime, Threshold: 0},
66+
"node1": {unneededSince: baseTime, threshold: 1 * time.Second},
67+
"node2": {unneededSince: baseTime, threshold: 0},
6868
},
69-
unneededList: []string{"node2"}, // node1 is removed from unneeded
70-
currentlyInDeletion: map[string]bool{},
71-
updateThresholds: map[string]time.Duration{},
72-
observeDeletion: []string{},
73-
expectedTrackedNodes: []string{"node2"},
74-
expectedDeletionTimes: map[string]time.Duration{
69+
unneededList: []string{"node2"}, // node1 is removed from unneeded
70+
currentlyInDeletion: map[string]bool{},
71+
updateThresholds: map[string]time.Duration{},
72+
observeDeletion: []string{},
73+
wantTrackedNodes: []string{"node2"},
74+
wantDeletionTimes: map[string]time.Duration{
7575
"node1": 5*time.Second - 1*time.Second, // assume current timestamp baseTime+5s
7676
},
7777
},
7878
{
7979
name: "update threshold",
8080
setupNodes: map[string]nodeInfo{
81-
"node1": {UnneededSince: baseTime, Threshold: 1 * time.Second},
81+
"node1": {unneededSince: baseTime, threshold: 1 * time.Second},
8282
},
8383
unneededList: []string{"node1"},
8484
currentlyInDeletion: map[string]bool{},
8585
updateThresholds: map[string]time.Duration{
8686
"node1": 4 * time.Second,
8787
},
88-
observeDeletion: []string{},
89-
expectedTrackedNodes: []string{"node1"},
88+
observeDeletion: []string{},
89+
wantTrackedNodes: []string{"node1"},
9090
},
9191
}
9292

@@ -116,7 +116,7 @@ func TestNodeLatencyTracker(t *testing.T) {
116116
// Check tracked nodes
117117
gotTracked := tracker.GetTrackedNodes()
118118
expectedMap := make(map[string]struct{})
119-
for _, n := range tt.expectedTrackedNodes {
119+
for _, n := range tt.wantTrackedNodes {
120120
expectedMap[n] = struct{}{}
121121
}
122122
for _, n := range gotTracked {
@@ -129,12 +129,12 @@ func TestNodeLatencyTracker(t *testing.T) {
129129
t.Errorf("expected node %q to be tracked, but was not", n)
130130
}
131131

132-
for node, expectedDuration := range tt.expectedDeletionTimes {
132+
for node, expectedDuration := range tt.wantDeletionTimes {
133133
info, ok := tt.setupNodes[node]
134134
if !ok {
135135
continue
136136
}
137-
duration := currentTime.Sub(info.UnneededSince) - info.Threshold
137+
duration := currentTime.Sub(info.unneededSince) - info.threshold
138138
if duration != expectedDuration {
139139
t.Errorf("node %q expected deletion duration %v, got %v", node, expectedDuration, duration)
140140
}

cluster-autoscaler/core/static_autoscaler_test.go

Lines changed: 47 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ func (m *onNodeGroupDeleteMock) Delete(id string) error {
165165

166166
func setUpScaleDownActuator(ctx *context.AutoscalingContext, autoscalingOptions config.AutoscalingOptions) {
167167
deleteOptions := options.NewNodeDeleteOptions(autoscalingOptions)
168-
ctx.ScaleDownActuator = actuation.NewActuator(ctx, nil, deletiontracker.NewNodeDeletionTracker(0*time.Second), latencytracker.NewNodeLatencyTracker(), deleteOptions, rules.Default(deleteOptions), processorstest.NewTestProcessors(ctx).NodeGroupConfigProcessor)
168+
ctx.ScaleDownActuator = actuation.NewActuator(ctx, nil, deletiontracker.NewNodeDeletionTracker(0*time.Second), nil, deleteOptions, rules.Default(deleteOptions), processorstest.NewTestProcessors(ctx).NodeGroupConfigProcessor)
169169
}
170170

171171
type nodeGroup struct {
@@ -211,7 +211,6 @@ type commonMocks struct {
211211
podDisruptionBudgetLister *podDisruptionBudgetListerMock
212212
daemonSetLister *daemonSetListerMock
213213
nodeDeletionTracker *deletiontracker.NodeDeletionTracker
214-
nodeLatencyTracker *latencytracker.NodeLatencyTracker
215214

216215
resourceClaimLister *fakeAllObjectsLister[*resourceapi.ResourceClaim]
217216
resourceSliceLister *fakeAllObjectsLister[*resourceapi.ResourceSlice]
@@ -322,12 +321,8 @@ func setupAutoscaler(config *autoscalerSetupConfig) (*StaticAutoscaler, error) {
322321
if nodeDeletionTracker == nil {
323322
nodeDeletionTracker = deletiontracker.NewNodeDeletionTracker(0 * time.Second)
324323
}
325-
nodeLatencyTracker := config.mocks.nodeLatencyTracker
326-
if nodeLatencyTracker == nil {
327-
nodeLatencyTracker = latencytracker.NewNodeLatencyTracker()
328-
}
329-
ctx.ScaleDownActuator = actuation.NewActuator(&ctx, clusterState, nodeDeletionTracker, nodeLatencyTracker, deleteOptions, drainabilityRules, processors.NodeGroupConfigProcessor)
330-
sdPlanner := planner.New(&ctx, processors, deleteOptions, drainabilityRules, nodeLatencyTracker)
324+
ctx.ScaleDownActuator = actuation.NewActuator(&ctx, clusterState, nodeDeletionTracker, nil, deleteOptions, drainabilityRules, processors.NodeGroupConfigProcessor)
325+
sdPlanner := planner.New(&ctx, processors, deleteOptions, drainabilityRules, nil)
331326

332327
processorCallbacks.scaleDownPlanner = sdPlanner
333328

@@ -383,6 +378,21 @@ func TestStaticAutoscalerRunOnce(t *testing.T) {
383378
ng1 := reflect.ValueOf(provider.GetNodeGroup("ng1")).Interface().(*testprovider.TestNodeGroup)
384379
assert.NotNil(t, ng1)
385380
assert.NotNil(t, provider)
381+
// NodeLatencyTracker mock
382+
nltMock := &latencytrackerMock{}
383+
nltMock.On("ObserveDeletion",
384+
"n2",
385+
mock.MatchedBy(func(t time.Time) bool { return !t.IsZero() }),
386+
).Return()
387+
nltMock.On("UpdateStateWithUnneededList",
388+
mock.MatchedBy(func(nodes []*apiv1.Node) bool { return true }),
389+
mock.MatchedBy(func(m map[string]bool) bool { return true }),
390+
mock.Anything,
391+
).Return()
392+
nltMock.On("UpdateThreshold",
393+
"n2",
394+
time.Minute,
395+
).Return()
386396

387397
// Create context with mocked lister registry.
388398
options := config.AutoscalingOptions{
@@ -415,7 +425,7 @@ func TestStaticAutoscalerRunOnce(t *testing.T) {
415425
}
416426
processors := processorstest.NewTestProcessors(&context)
417427
clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults), processors.AsyncNodeGroupStateChecker)
418-
sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil, nil)
428+
sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil, nltMock)
419429
suOrchestrator := orchestrator.New()
420430
suOrchestrator.Initialize(&context, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{})
421431

@@ -2472,7 +2482,7 @@ func TestStaticAutoscalerUpcomingScaleDownCandidates(t *testing.T) {
24722482
csr := clusterstate.NewClusterStateRegistry(provider, csrConfig, ctx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), processors.AsyncNodeGroupStateChecker)
24732483

24742484
// Setting the Actuator is necessary for testing any scale-down logic, it shouldn't have anything to do in this test.
2475-
actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), latencytracker.NewNodeLatencyTracker(), options.NodeDeleteOptions{}, nil, processorstest.NewTestProcessors(&ctx).NodeGroupConfigProcessor)
2485+
actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), nil, options.NodeDeleteOptions{}, nil, processorstest.NewTestProcessors(&ctx).NodeGroupConfigProcessor)
24762486
ctx.ScaleDownActuator = actuator
24772487

24782488
// Fake planner that keeps track of the scale-down candidates passed to UpdateClusterState.
@@ -3133,7 +3143,7 @@ func waitForDeleteToFinish(t *testing.T, deleteFinished <-chan bool) {
31333143
}
31343144
}
31353145

3136-
func newScaleDownPlannerAndActuator(ctx *context.AutoscalingContext, p *ca_processors.AutoscalingProcessors, cs *clusterstate.ClusterStateRegistry, nodeDeletionTracker *deletiontracker.NodeDeletionTracker, nodeLatencyTracker *latencytracker.NodeLatencyTracker) (scaledown.Planner, scaledown.Actuator) {
3146+
func newScaleDownPlannerAndActuator(ctx *context.AutoscalingContext, p *ca_processors.AutoscalingProcessors, cs *clusterstate.ClusterStateRegistry, nodeDeletionTracker *deletiontracker.NodeDeletionTracker, nodeDeletionLatencyTracker latencytracker.LatencyTracker) (scaledown.Planner, scaledown.Actuator) {
31373147
ctx.MaxScaleDownParallelism = 10
31383148
ctx.MaxDrainParallelism = 1
31393149
ctx.NodeDeletionBatcherInterval = 0 * time.Second
@@ -3148,11 +3158,8 @@ func newScaleDownPlannerAndActuator(ctx *context.AutoscalingContext, p *ca_proce
31483158
if nodeDeletionTracker == nil {
31493159
nodeDeletionTracker = deletiontracker.NewNodeDeletionTracker(0 * time.Second)
31503160
}
3151-
if nodeLatencyTracker == nil {
3152-
nodeLatencyTracker = latencytracker.NewNodeLatencyTracker()
3153-
}
3154-
planner := planner.New(ctx, p, deleteOptions, nil, nodeLatencyTracker)
3155-
actuator := actuation.NewActuator(ctx, cs, nodeDeletionTracker, nodeLatencyTracker, deleteOptions, nil, p.NodeGroupConfigProcessor)
3161+
planner := planner.New(ctx, p, deleteOptions, nil, nodeDeletionLatencyTracker)
3162+
actuator := actuation.NewActuator(ctx, cs, nodeDeletionTracker, nodeDeletionLatencyTracker, deleteOptions, nil, p.NodeGroupConfigProcessor)
31563163
return planner, actuator
31573164
}
31583165

@@ -3268,13 +3275,13 @@ func buildStaticAutoscaler(t *testing.T, provider cloudprovider.CloudProvider, a
32683275
processors.ScaleDownNodeProcessor = cp
32693276

32703277
csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{OkTotalUnreadyCount: 1}, ctx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), processors.AsyncNodeGroupStateChecker)
3271-
actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), latencytracker.NewNodeLatencyTracker(), options.NodeDeleteOptions{}, nil, processors.NodeGroupConfigProcessor)
3278+
actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), nil, options.NodeDeleteOptions{}, nil, processors.NodeGroupConfigProcessor)
32723279
ctx.ScaleDownActuator = actuator
32733280

32743281
deleteOptions := options.NewNodeDeleteOptions(ctx.AutoscalingOptions)
32753282
drainabilityRules := rules.Default(deleteOptions)
32763283

3277-
sdPlanner := planner.New(&ctx, processors, deleteOptions, drainabilityRules, latencytracker.NewNodeLatencyTracker())
3284+
sdPlanner := planner.New(&ctx, processors, deleteOptions, drainabilityRules, nil)
32783285

32793286
autoscaler := &StaticAutoscaler{
32803287
AutoscalingContext: &ctx,
@@ -3324,3 +3331,25 @@ func assertNodesSoftTaintsStatus(t *testing.T, fakeClient *fake.Clientset, nodes
33243331
assert.Equal(t, tainted, taints.HasDeletionCandidateTaint(newNode))
33253332
}
33263333
}
3334+
3335+
// latencytrackerMock implements LatencyTracker for mocking
3336+
type latencytrackerMock struct {
3337+
mock.Mock
3338+
}
3339+
3340+
func (m *latencytrackerMock) ObserveDeletion(nodeName string, timestamp time.Time) {
3341+
m.Called(nodeName, timestamp)
3342+
}
3343+
3344+
func (m *latencytrackerMock) UpdateStateWithUnneededList(list []*apiv1.Node, currentlyInDeletion map[string]bool, timestamp time.Time) {
3345+
m.Called(list, currentlyInDeletion, timestamp)
3346+
}
3347+
3348+
func (m *latencytrackerMock) UpdateThreshold(nodeName string, threshold time.Duration) {
3349+
m.Called(nodeName, threshold)
3350+
}
3351+
3352+
func (m *latencytrackerMock) GetTrackedNodes() []string {
3353+
args := m.Called()
3354+
return args.Get(0).([]string)
3355+
}

cluster-autoscaler/metrics/metrics.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package metrics
1818

1919
import (
2020
"fmt"
21+
"strconv"
2122
"time"
2223

2324
"k8s.io/autoscaler/cluster-autoscaler/simulator"
@@ -428,12 +429,12 @@ var (
428429
}, []string{"instance_type", "cpu_count", "namespace_count"},
429430
)
430431

431-
scaleDownNodeDeletionDuration = k8smetrics.NewHistogramVec(
432+
scaleDownNodeDeletionLatency = k8smetrics.NewHistogramVec(
432433
&k8smetrics.HistogramOpts{
433434
Namespace: caNamespace,
434-
Name: "node_deletion_duration_seconds",
435+
Name: "node_deletion_latency_seconds",
435436
Help: "Latency from planning (node marked) to final outcome (deleted, aborted, rescued).",
436-
Buckets: k8smetrics.ExponentialBuckets(1, 2, 12), //1, 2, 4, 8, ..., 2048
437+
Buckets: k8smetrics.ExponentialBuckets(1, 2, 18), //1, 2, 4, 8, ..., 131072 approx 1.5 days
437438
}, []string{"deleted"},
438439
)
439440
)
@@ -472,7 +473,7 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
472473
legacyregistry.MustRegister(nodeTaintsCount)
473474
legacyregistry.MustRegister(inconsistentInstancesMigsCount)
474475
legacyregistry.MustRegister(binpackingHeterogeneity)
475-
legacyregistry.MustRegister(scaleDownNodeDeletionDuration)
476+
legacyregistry.MustRegister(scaleDownNodeDeletionLatency)
476477

477478
if emitPerNodeGroupMetrics {
478479
legacyregistry.MustRegister(nodesGroupMinNodes)
@@ -761,8 +762,8 @@ func ObserveBinpackingHeterogeneity(instanceType, cpuCount, namespaceCount strin
761762
binpackingHeterogeneity.WithLabelValues(instanceType, cpuCount, namespaceCount).Observe(float64(pegCount))
762763
}
763764

764-
// UpdateScaleDownNodeDeletionDuration records the time after which node was deleted/needed
765+
// UpdateScaleDownNodeDeletionLatency records the time after which node was deleted/needed
765766
// again after being marked unneeded
766-
func UpdateScaleDownNodeDeletionDuration(deleted string, duration time.Duration) {
767-
scaleDownNodeDeletionDuration.WithLabelValues(deleted).Observe(duration.Seconds())
767+
func UpdateScaleDownNodeDeletionLatency(deleted bool, duration time.Duration) {
768+
scaleDownNodeDeletionLatency.WithLabelValues(strconv.FormatBool(deleted)).Observe(duration.Seconds())
768769
}

0 commit comments

Comments
 (0)