Commit 2a02ffd

Wait for alertmanagers to complete state sync before becoming ACTIVE.
Note: Applies to sharding operation only.

When starting up, make sure that the initial state sync is completed for any users which were initially assigned to the instance. This will reduce the possibility that we lose state when [cleanly] scaling up or down, by inadvertently losing the replica which we need to obtain state from.

This change allows the state sync to progress concurrently for all tenants. The alternative would be to block for the service in `New()`, which is a slightly simpler code change, but would mean that start-up is potentially much slower.

Signed-off-by: Steve Simpson <[email protected]>
1 parent 5d55a28 commit 2a02ffd
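
The trade-off described in the commit message can be made concrete with a self-contained sketch. The types, timings, and tenant count below are hypothetical, not Cortex code: if each tenant's initial state sync takes about a second, waiting inside `New()` would cost roughly one second per tenant, while constructing every per-tenant Alertmanager first and only waiting afterwards lets the syncs overlap.

package main

import (
	"fmt"
	"time"
)

// tenantAM stands in for a per-tenant Alertmanager whose state replication
// runs in the background.
type tenantAM struct{ synced <-chan struct{} }

// newTenantAM kicks off the (simulated) initial state sync and returns
// immediately, mirroring how New() starts the replication service without
// waiting for it.
func newTenantAM() *tenantAM {
	done := make(chan struct{})
	go func() {
		time.Sleep(time.Second) // simulated initial state sync
		close(done)
	}()
	return &tenantAM{synced: done}
}

func (am *tenantAM) waitInitialStateSync() { <-am.synced }

func main() {
	start := time.Now()

	// Construct every tenant first; all syncs progress concurrently.
	ams := make([]*tenantAM, 0, 5)
	for i := 0; i < 5; i++ {
		ams = append(ams, newTenantAM())
	}

	// Then wait for all of them; the waits overlap, so this takes ~1s.
	// Waiting inside newTenantAM instead would have taken ~5s.
	for _, am := range ams {
		am.waitInitialStateSync()
	}

	fmt.Printf("all tenants synced after %v\n", time.Since(start).Round(time.Second))
}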

File tree

2 files changed: +34 −0 lines


pkg/alertmanager/alertmanager.go

Lines changed: 9 additions & 0 deletions
@@ -273,6 +273,15 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
 	return am, nil
 }
 
+func (am *Alertmanager) WaitInitialStateSync(ctx context.Context) error {
+	if service, ok := am.state.(services.Service); ok {
+		if err := service.AwaitRunning(ctx); err != nil {
+			return errors.Wrap(err, "failed to wait for ring-based replication service")
+		}
+	}
+	return nil
+}
+
 // clusterWait returns a function that inspects the current peer state and returns
 // a duration of one base timeout for each peer with a higher ID than ourselves.
 func clusterWait(position func() int, timeout time.Duration) func() time.Duration {
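
The type assertion on `am.state` is what scopes this wait to sharding: the wait only happens when the state implementation also runs as a `services.Service` (the ring-based replication named in the error message); otherwise the method is a no-op. Below is a minimal, self-contained sketch of that dispatch using stand-in types, not the real Cortex interfaces.

package sketch

import "context"

// Stand-ins for the real interfaces; names here are hypothetical.
type State interface {
	Position() int
}

type Service interface {
	AwaitRunning(ctx context.Context) error
}

// ringState models the sharding case: it is both a State and a Service.
type ringState struct{}

func (ringState) Position() int                          { return 0 }
func (ringState) AwaitRunning(ctx context.Context) error { return nil }

// gossipState models the non-sharding case: a plain State only.
type gossipState struct{}

func (gossipState) Position() int { return 0 }

// waitIfService mirrors WaitInitialStateSync: wait only when the state
// implementation is also a service, otherwise do nothing.
func waitIfService(ctx context.Context, s State) error {
	if svc, ok := s.(Service); ok {
		return svc.AwaitRunning(ctx)
	}
	return nil
}

Calling waitIfService with ringState{} blocks until AwaitRunning returns, while gossipState{} falls straight through, which matches the "sharding only" note in the commit message.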

pkg/alertmanager/multitenant.go

Lines changed: 25 additions & 0 deletions
@@ -478,6 +478,14 @@ func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) {
 	}
 
 	if am.cfg.ShardingEnabled {
+		// Make sure that all the alertmanagers we were initially configured with have
+		// fetched state from the replicas, before advertising as ACTIVE and letting
+		// them shut-down. This will reduce the possibility that we lose state when
+		// scaling up or down.
+		if err := am.waitInitialStateSync(ctx); err != nil {
+			return err
+		}
+
 		// With the initial sync now completed, we should have loaded all assigned alertmanager configurations to this instance. We can switch it to ACTIVE and start serving requests.
 		if err := am.ringLifecycler.ChangeState(ctx, ring.ACTIVE); err != nil {
 			return errors.Wrapf(err, "switch instance to %s in the ring", ring.ACTIVE)
@@ -663,6 +671,23 @@ func (am *MultitenantAlertmanager) loadAndSyncConfigs(ctx context.Context, syncR
 	return nil
 }
 
+func (am *MultitenantAlertmanager) waitInitialStateSync(ctx context.Context) error {
+	am.alertmanagersMtx.Lock()
+	ams := make([]*Alertmanager, 0, len(am.alertmanagers))
+	for _, userAM := range am.alertmanagers {
+		ams = append(ams, userAM)
+	}
+	am.alertmanagersMtx.Unlock()
+
+	for _, userAM := range ams {
+		if err := userAM.WaitInitialStateSync(ctx); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
 // stopping runs when MultitenantAlertmanager transitions to Stopping state.
 func (am *MultitenantAlertmanager) stopping(_ error) error {
 	am.alertmanagersMtx.Lock()
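
One detail worth calling out in `waitInitialStateSync` is that the per-tenant map is only snapshotted under `alertmanagersMtx`, and the blocking happens after the mutex is released, so a potentially long wait never stalls goroutines that need to add or remove tenants. A generic sketch of that snapshot-then-wait pattern follows, with hypothetical names rather than the Cortex types.

package sketch

import (
	"context"
	"sync"
)

// waiter is a stand-in for the per-tenant Alertmanager.
type waiter interface {
	WaitInitialStateSync(ctx context.Context) error
}

type registry struct {
	mtx    sync.Mutex
	byUser map[string]waiter
}

// waitAll copies the current entries while holding the lock, then blocks on
// each one with the lock released.
func (r *registry) waitAll(ctx context.Context) error {
	r.mtx.Lock()
	snapshot := make([]waiter, 0, len(r.byUser))
	for _, w := range r.byUser {
		snapshot = append(snapshot, w)
	}
	r.mtx.Unlock()

	for _, w := range snapshot {
		if err := w.WaitInitialStateSync(ctx); err != nil {
			return err
		}
	}
	return nil
}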
