Commit 2a02ffd

Wait for alertmanagers to complete state sync before becoming ACTIVE.
Note: Applies to sharding operation only.

When starting up, make sure that the initial state sync is completed for any users which were initially assigned to the instance. This will reduce the possibility that we lose state when [cleanly] scaling up or down, by inadvertently losing the replica which we need to obtain state from.

This change allows the state sync to progress concurrently for all tenants. The alternative would be to block for the service in `New()`, which is a slightly simpler code change, but would mean that start-up is potentially much slower.

Signed-off-by: Steve Simpson <[email protected]>
1 parent 5d55a28 commit 2a02ffd
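
The trade-off described in the commit message can be made concrete with a self-contained sketch. The types, timings, and tenant count below are hypothetical, not Cortex code: if each tenant's initial state sync takes about a second, waiting inside `New()` would cost roughly one second per tenant, while constructing every per-tenant Alertmanager first and only waiting afterwards lets the syncs overlap.

package main

import (
	"fmt"
	"time"
)

// tenantAM stands in for a per-tenant Alertmanager whose state replication
// runs in the background.
type tenantAM struct{ synced <-chan struct{} }

// newTenantAM kicks off the (simulated) initial state sync and returns
// immediately, mirroring how New() starts the replication service without
// waiting for it.
func newTenantAM() *tenantAM {
	done := make(chan struct{})
	go func() {
		time.Sleep(time.Second) // simulated initial state sync
		close(done)
	}()
	return &tenantAM{synced: done}
}

func (am *tenantAM) waitInitialStateSync() { <-am.synced }

func main() {
	start := time.Now()

	// Construct every tenant first; all syncs progress concurrently.
	ams := make([]*tenantAM, 0, 5)
	for i := 0; i < 5; i++ {
		ams = append(ams, newTenantAM())
	}

	// Then wait for all of them; the waits overlap, so this takes ~1s.
	// Waiting inside newTenantAM instead would have taken ~5s.
	for _, am := range ams {
		am.waitInitialStateSync()
	}

	fmt.Printf("all tenants synced after %v\n", time.Since(start).Round(time.Second))
}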

File tree

2 files changed: +34 −0 lines


pkg/alertmanager/alertmanager.go

Lines changed: 9 additions & 0 deletions
@@ -273,6 +273,15 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
 	return am, nil
 }
 
+func (am *Alertmanager) WaitInitialStateSync(ctx context.Context) error {
+	if service, ok := am.state.(services.Service); ok {
+		if err := service.AwaitRunning(ctx); err != nil {
+			return errors.Wrap(err, "failed to wait for ring-based replication service")
+		}
+	}
+	return nil
+}
+
 // clusterWait returns a function that inspects the current peer state and returns
 // a duration of one base timeout for each peer with a higher ID than ourselves.
 func clusterWait(position func() int, timeout time.Duration) func() time.Duration {
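
The type assertion on `am.state` is what scopes this wait to sharding: the wait only happens when the state implementation also runs as a `services.Service` (the ring-based replication named in the error message); otherwise the method is a no-op. Below is a minimal, self-contained sketch of that dispatch using stand-in types, not the real Cortex interfaces.

package sketch

import "context"

// Stand-ins for the real interfaces; names here are hypothetical.
type State interface {
	Position() int
}

type Service interface {
	AwaitRunning(ctx context.Context) error
}

// ringState models the sharding case: it is both a State and a Service.
type ringState struct{}

func (ringState) Position() int                          { return 0 }
func (ringState) AwaitRunning(ctx context.Context) error { return nil }

// gossipState models the non-sharding case: a plain State only.
type gossipState struct{}

func (gossipState) Position() int { return 0 }

// waitIfService mirrors WaitInitialStateSync: wait only when the state
// implementation is also a service, otherwise do nothing.
func waitIfService(ctx context.Context, s State) error {
	if svc, ok := s.(Service); ok {
		return svc.AwaitRunning(ctx)
	}
	return nil
}

Calling waitIfService with ringState{} blocks until AwaitRunning returns, while gossipState{} falls straight through, which matches the "sharding only" note in the commit message.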

pkg/alertmanager/multitenant.go

Lines changed: 25 additions & 0 deletions
@@ -478,6 +478,14 @@ func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) {
 	}
 
 	if am.cfg.ShardingEnabled {
+		// Make sure that all the alertmanagers we were initially configured with have
+		// fetched state from the replicas, before advertising as ACTIVE and letting
+		// them shut-down. This will reduce the possibility that we lose state when
+		// scaling up or down.
+		if err := am.waitInitialStateSync(ctx); err != nil {
+			return err
+		}
+
 		// With the initial sync now completed, we should have loaded all assigned alertmanager configurations to this instance. We can switch it to ACTIVE and start serving requests.
 		if err := am.ringLifecycler.ChangeState(ctx, ring.ACTIVE); err != nil {
 			return errors.Wrapf(err, "switch instance to %s in the ring", ring.ACTIVE)
@@ -663,6 +671,23 @@ func (am *MultitenantAlertmanager) loadAndSyncConfigs(ctx context.Context, syncR
 	return nil
 }
 
+func (am *MultitenantAlertmanager) waitInitialStateSync(ctx context.Context) error {
+	am.alertmanagersMtx.Lock()
+	ams := make([]*Alertmanager, 0, len(am.alertmanagers))
+	for _, userAM := range am.alertmanagers {
+		ams = append(ams, userAM)
+	}
+	am.alertmanagersMtx.Unlock()
+
+	for _, userAM := range ams {
+		if err := userAM.WaitInitialStateSync(ctx); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
 // stopping runs when MultitenantAlertmanager transitions to Stopping state.
 func (am *MultitenantAlertmanager) stopping(_ error) error {
 	am.alertmanagersMtx.Lock()
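
One detail worth calling out in `waitInitialStateSync` is that the per-tenant map is only snapshotted under `alertmanagersMtx`, and the blocking happens after the mutex is released, so a potentially long wait never stalls goroutines that need to add or remove tenants. A generic sketch of that snapshot-then-wait pattern follows, with hypothetical names rather than the Cortex types.

package sketch

import (
	"context"
	"sync"
)

// waiter is a stand-in for the per-tenant Alertmanager.
type waiter interface {
	WaitInitialStateSync(ctx context.Context) error
}

type registry struct {
	mtx    sync.Mutex
	byUser map[string]waiter
}

// waitAll copies the current entries while holding the lock, then blocks on
// each one with the lock released.
func (r *registry) waitAll(ctx context.Context) error {
	r.mtx.Lock()
	snapshot := make([]waiter, 0, len(r.byUser))
	for _, w := range r.byUser {
		snapshot = append(snapshot, w)
	}
	r.mtx.Unlock()

	for _, w := range snapshot {
		if err := w.WaitInitialStateSync(ctx); err != nil {
			return err
		}
	}
	return nil
}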
