cortexproject · pracucci · May 11, 2021 · May 6, 2021 · May 11, 2021
diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go
@@ -273,6 +273,15 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
 	return am, nil
 }
 
+// WaitInitialStateSync blocks in AwaitRunning if am.state is a services.Service, else no-ops.
+func (am *Alertmanager) WaitInitialStateSync(ctx context.Context) error {
+	svc, isService := am.state.(services.Service)
+	if !isService {
+		return nil
+	}
+	return errors.Wrap(svc.AwaitRunning(ctx), "failed to wait for ring-based replication service")
+}
+
 // clusterWait returns a function that inspects the current peer state and returns
 // a duration of one base timeout for each peer with a higher ID than ourselves.
 func clusterWait(position func() int, timeout time.Duration) func() time.Duration {

diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go
@@ -478,6 +478,15 @@ func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) {
 	}
 
 	if am.cfg.ShardingEnabled {
+		// Make sure that all the alertmanagers we were initially configured with have
+		// fetched state from the replicas, before advertising as ACTIVE. This will
+		// reduce the possibility that we lose state when new instances join/leave.
+		level.Info(am.logger).Log("msg", "waiting until initial state sync is complete for all users")
+		if err := am.waitInitialStateSync(ctx); err != nil {
+			return errors.Wrap(err, "failed to wait for initial state sync")
+		}
+		level.Info(am.logger).Log("msg", "initial state sync is complete")
+
 		// With the initial sync now completed, we should have loaded all assigned alertmanager configurations to this instance. We can switch it to ACTIVE and start serving requests.
 		if err := am.ringLifecycler.ChangeState(ctx, ring.ACTIVE); err != nil {
 			return errors.Wrapf(err, "switch instance to %s in the ring", ring.ACTIVE)
@@ -663,6 +672,23 @@ func (am *MultitenantAlertmanager) loadAndSyncConfigs(ctx context.Context, syncR
 	return nil
 }
 
+// waitInitialStateSync snapshots the per-user alertmanagers under the lock, then
+// (lock released) waits for each one's initial state sync; the first error wins.
+func (am *MultitenantAlertmanager) waitInitialStateSync(ctx context.Context) error {
+	am.alertmanagersMtx.Lock()
+	pending := make([]*Alertmanager, 0, len(am.alertmanagers))
+	for _, instance := range am.alertmanagers {
+		pending = append(pending, instance)
+	}
+	am.alertmanagersMtx.Unlock()
+	for i := range pending {
+		if err := pending[i].WaitInitialStateSync(ctx); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
 // stopping runs when MultitenantAlertmanager transitions to Stopping state.
 func (am *MultitenantAlertmanager) stopping(_ error) error {
 	am.alertmanagersMtx.Lock()