Skip to content

Commit a64cd41

Browse files
committed
Watch external objects for machine before deleting
This fixes a race condition in the machine controller when the controller is restarted after a machine has been marked for deletion but before the infra machine or bootstrap config are deleted. In that case, the controller doesn't have watches on those external objects, so the reconciliation is not triggered once they are deleted. This means that Machines stay in Deleting state for the resync period, the default is 10 minutes.
1 parent fc43a06 commit a64cd41

File tree

3 files changed

+241
-35
lines changed

3 files changed

+241
-35
lines changed

internal/controllers/machine/machine_controller.go

+16-8
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
333333
if err != nil {
334334
switch err {
335335
case errNoControlPlaneNodes, errLastControlPlaneNode, errNilNodeRef, errClusterIsBeingDeleted, errControlPlaneIsBeingDeleted:
336-
var nodeName = ""
336+
nodeName := ""
337337
if m.Status.NodeRef != nil {
338338
nodeName = m.Status.NodeRef.Name
339339
}
@@ -427,7 +427,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
427427
return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
428428
}
429429

430-
infrastructureDeleted, err := r.reconcileDeleteInfrastructure(ctx, m)
430+
infrastructureDeleted, err := r.reconcileDeleteInfrastructure(ctx, cluster, m)
431431
if err != nil {
432432
return ctrl.Result{}, err
433433
}
@@ -436,7 +436,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
436436
return ctrl.Result{}, nil
437437
}
438438

439-
bootstrapDeleted, err := r.reconcileDeleteBootstrap(ctx, m)
439+
bootstrapDeleted, err := r.reconcileDeleteBootstrap(ctx, cluster, m)
440440
if err != nil {
441441
return ctrl.Result{}, err
442442
}
@@ -723,8 +723,8 @@ func (r *Reconciler) deleteNode(ctx context.Context, cluster *clusterv1.Cluster,
723723
return nil
724724
}
725725

726-
func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, m *clusterv1.Machine) (bool, error) {
727-
obj, err := r.reconcileDeleteExternal(ctx, m, m.Spec.Bootstrap.ConfigRef)
726+
func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (bool, error) {
727+
obj, err := r.reconcileDeleteExternal(ctx, cluster, m, m.Spec.Bootstrap.ConfigRef)
728728
if err != nil {
729729
return false, err
730730
}
@@ -743,8 +743,8 @@ func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, m *clusterv1.
743743
return false, nil
744744
}
745745

746-
func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, m *clusterv1.Machine) (bool, error) {
747-
obj, err := r.reconcileDeleteExternal(ctx, m, &m.Spec.InfrastructureRef)
746+
func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (bool, error) {
747+
obj, err := r.reconcileDeleteExternal(ctx, cluster, m, &m.Spec.InfrastructureRef)
748748
if err != nil {
749749
return false, err
750750
}
@@ -764,7 +764,7 @@ func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, m *clust
764764
}
765765

766766
// reconcileDeleteExternal tries to delete external references.
767-
func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, m *clusterv1.Machine, ref *corev1.ObjectReference) (*unstructured.Unstructured, error) {
767+
func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (*unstructured.Unstructured, error) {
768768
if ref == nil {
769769
return nil, nil
770770
}
@@ -777,6 +777,14 @@ func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, m *clusterv1.M
777777
}
778778

779779
if obj != nil {
780+
// reconcileExternal ensures that we set the object's OwnerReferences correctly and watch the object.
781+
// The machine delete logic depends on reconciling the machine when the external objects are deleted.
782+
// This avoids a race condition where the machine is deleted before the external objects are ever reconciled
783+
// by this controller.
784+
if _, err := r.ensureExternalOwnershipAndWatch(ctx, cluster, m, ref); err != nil {
785+
return nil, err
786+
}
787+
780788
// Issue a delete request.
781789
if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) {
782790
return obj, errors.Wrapf(err,

internal/controllers/machine/machine_controller_phases.go

+35-21
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,7 @@ import (
4343
"sigs.k8s.io/cluster-api/util/patch"
4444
)
4545

46-
var (
47-
externalReadyWait = 30 * time.Second
48-
)
46+
var externalReadyWait = 30 * time.Second
4947

5048
func (r *Reconciler) reconcilePhase(_ context.Context, m *clusterv1.Machine) {
5149
originalPhase := m.Status.Phase
@@ -89,12 +87,44 @@ func (r *Reconciler) reconcilePhase(_ context.Context, m *clusterv1.Machine) {
8987

9088
// reconcileExternal handles generic unstructured objects referenced by a Machine.
9189
func (r *Reconciler) reconcileExternal(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (external.ReconcileOutput, error) {
92-
log := ctrl.LoggerFrom(ctx)
93-
9490
if err := utilconversion.UpdateReferenceAPIContract(ctx, r.Client, ref); err != nil {
9591
return external.ReconcileOutput{}, err
9692
}
9793

94+
result, err := r.ensureExternalOwnershipAndWatch(ctx, cluster, m, ref)
95+
if err != nil {
96+
return external.ReconcileOutput{}, err
97+
}
98+
if result.RequeueAfter > 0 || result.Paused {
99+
return result, nil
100+
}
101+
102+
obj := result.Result
103+
104+
// Set failure reason and message, if any.
105+
failureReason, failureMessage, err := external.FailuresFrom(obj)
106+
if err != nil {
107+
return external.ReconcileOutput{}, err
108+
}
109+
if failureReason != "" {
110+
machineStatusError := capierrors.MachineStatusError(failureReason)
111+
m.Status.FailureReason = &machineStatusError
112+
}
113+
if failureMessage != "" {
114+
m.Status.FailureMessage = pointer.String(
115+
fmt.Sprintf("Failure detected from referenced resource %v with name %q: %s",
116+
obj.GroupVersionKind(), obj.GetName(), failureMessage),
117+
)
118+
}
119+
120+
return external.ReconcileOutput{Result: obj}, nil
121+
}
122+
123+
// ensureExternalOwnershipAndWatch ensures that only the Machine owns the external object,
124+
// adds a watch to the external object if one does not already exist and adds the necessary labels.
125+
func (r *Reconciler) ensureExternalOwnershipAndWatch(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (external.ReconcileOutput, error) {
126+
log := ctrl.LoggerFrom(ctx)
127+
98128
obj, err := external.Get(ctx, r.UnstructuredCachingClient, ref, m.Namespace)
99129
if err != nil {
100130
if apierrors.IsNotFound(errors.Cause(err)) {
@@ -146,22 +176,6 @@ func (r *Reconciler) reconcileExternal(ctx context.Context, cluster *clusterv1.C
146176
return external.ReconcileOutput{}, err
147177
}
148178

149-
// Set failure reason and message, if any.
150-
failureReason, failureMessage, err := external.FailuresFrom(obj)
151-
if err != nil {
152-
return external.ReconcileOutput{}, err
153-
}
154-
if failureReason != "" {
155-
machineStatusError := capierrors.MachineStatusError(failureReason)
156-
m.Status.FailureReason = &machineStatusError
157-
}
158-
if failureMessage != "" {
159-
m.Status.FailureMessage = pointer.String(
160-
fmt.Sprintf("Failure detected from referenced resource %v with name %q: %s",
161-
obj.GroupVersionKind(), obj.GetName(), failureMessage),
162-
)
163-
}
164-
165179
return external.ReconcileOutput{Result: obj}, nil
166180
}
167181

0 commit comments

Comments
 (0)