Skip to content

Commit cefa32d

Browse files
committed
Fix repeated resolved alerts
Signed-off-by: Xiaochao Dong (@damnever) <[email protected]>
1 parent da6de1f commit cefa32d

File tree

3 files changed

+83
-20
lines changed

3 files changed

+83
-20
lines changed

dispatch/dispatch.go

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ package dispatch
1515

1616
import (
1717
"context"
18+
"errors"
1819
"fmt"
1920
"sort"
2021
"sync"
@@ -474,8 +475,12 @@ func (ag *aggrGroup) stop() {
474475

475476
// insert inserts the alert into the aggregation group.
476477
func (ag *aggrGroup) insert(alert *types.Alert) {
477-
if err := ag.alerts.Set(alert); err != nil {
478-
level.Error(ag.logger).Log("msg", "error on set alert", "err", err)
478+
if err := ag.alerts.SetOrReplaceResolved(alert); err != nil {
479+
if errors.Is(err, store.ErrNotFound) {
480+
level.Warn(ag.logger).Log("msg", "ignore resolved alert since there is no corresponding record in the store")
481+
} else {
482+
level.Error(ag.logger).Log("msg", "error on set alert", "err", err)
483+
}
479484
}
480485

481486
// Immediately trigger a flush if the wait duration for this
@@ -516,17 +521,15 @@ func (ag *aggrGroup) flush(notify func(...*types.Alert) bool) {
516521

517522
if notify(alertsSlice...) {
518523
for _, a := range alertsSlice {
519-
// Only delete if the fingerprint has not been inserted
524+
// Only delete the resolved alert if the fingerprint has not been active
520525
// again since we notified about it.
521-
fp := a.Fingerprint()
522-
got, err := ag.alerts.Get(fp)
523-
if err != nil {
524-
// This should never happen.
525-
level.Error(ag.logger).Log("msg", "failed to get alert", "err", err, "alert", a.String())
526+
if !a.Resolved() {
526527
continue
527528
}
528-
if a.Resolved() && got.UpdatedAt == a.UpdatedAt {
529-
if err := ag.alerts.Delete(fp); err != nil {
529+
if err := ag.alerts.DeleteIfResolved(a.Fingerprint()); err != nil {
530+
if errors.Is(err, store.ErrNotResolved) {
531+
level.Debug(ag.logger).Log("msg", "resolved alert has been active again", "alert", a.String())
532+
} else {
530533
level.Error(ag.logger).Log("msg", "error on delete alert", "err", err, "alert", a.String())
531534
}
532535
}

store/store.go

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,12 @@ import (
2424
"github.com/prometheus/alertmanager/types"
2525
)
2626

27-
// ErrNotFound is returned if a Store cannot find the Alert.
28-
var ErrNotFound = errors.New("alert not found")
27+
var (
28+
// ErrNotFound is returned if a Store cannot find the Alert.
29+
ErrNotFound = errors.New("alert not found")
30+
// ErrNotResolved is returned if the alert is not resolved.
31+
ErrNotResolved = errors.New("alert is not resolved")
32+
)
2933

3034
// Alerts provides lock-coordinated to an in-memory map of alerts, keyed by
3135
// their fingerprint. Resolved alerts are removed from the map based on
@@ -98,10 +102,26 @@ func (a *Alerts) Get(fp model.Fingerprint) (*types.Alert, error) {
98102

99103
// Set unconditionally sets the alert in memory.
100104
func (a *Alerts) Set(alert *types.Alert) error {
105+
fp := alert.Fingerprint()
106+
101107
a.Lock()
102108
defer a.Unlock()
103109

104-
a.c[alert.Fingerprint()] = alert
110+
a.c[fp] = alert
111+
return nil
112+
}
113+
114+
// SetOrReplaceResolved returns ErrNotFound if the alert is resolved and
115+
// there is no corresponding record in the store.
116+
func (a *Alerts) SetOrReplaceResolved(alert *types.Alert) error {
117+
fp := alert.Fingerprint()
118+
119+
a.Lock()
120+
defer a.Unlock()
121+
if _, ok := a.c[fp]; !ok && alert.Resolved() {
122+
return ErrNotFound
123+
}
124+
a.c[fp] = alert
105125
return nil
106126
}
107127

@@ -114,6 +134,20 @@ func (a *Alerts) Delete(fp model.Fingerprint) error {
114134
return nil
115135
}
116136

137+
// DeleteIfResolved removes the Alert if it is resolved.
138+
func (a *Alerts) DeleteIfResolved(fp model.Fingerprint) error {
139+
a.Lock()
140+
defer a.Unlock()
141+
142+
if exist, ok := a.c[fp]; !ok {
143+
return ErrNotFound
144+
} else if !exist.Resolved() {
145+
return ErrNotResolved
146+
}
147+
delete(a.c, fp)
148+
return nil
149+
}
150+
117151
// List returns a slice of Alerts currently held in memory.
118152
func (a *Alerts) List() []*types.Alert {
119153
a.Lock()

store/store_test.go

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,33 @@ func TestDelete(t *testing.T) {
5454
require.Equal(t, ErrNotFound, err)
5555
}
5656

57+
func TestResolved(t *testing.T) {
58+
a := NewAlerts()
59+
60+
now := time.Now()
61+
require.NoError(t, a.SetOrReplaceResolved(makeAlert("a", now, -2, 10)))
62+
resolved := makeAlert("a", now, -2, -1)
63+
require.NoError(t, a.SetOrReplaceResolved(resolved))
64+
require.NoError(t, a.SetOrReplaceResolved(resolved))
65+
a.gc()
66+
require.ErrorIs(t, a.SetOrReplaceResolved(resolved), ErrNotFound)
67+
68+
require.ErrorIs(t, a.DeleteIfResolved(resolved.Fingerprint()), ErrNotFound)
69+
require.NoError(t, a.SetOrReplaceResolved(makeAlert("a", now, -2, 10)))
70+
require.ErrorIs(t, a.DeleteIfResolved(resolved.Fingerprint()), ErrNotResolved)
71+
require.NoError(t, a.SetOrReplaceResolved(resolved))
72+
require.NoError(t, a.DeleteIfResolved(resolved.Fingerprint()))
73+
_, err := a.Get(resolved.Fingerprint())
74+
require.ErrorIs(t, err, ErrNotFound)
75+
}
76+
5777
func TestGC(t *testing.T) {
5878
now := time.Now()
79+
5980
newAlert := func(key string, start, end time.Duration) *types.Alert {
60-
return &types.Alert{
61-
Alert: model.Alert{
62-
Labels: model.LabelSet{model.LabelName(key): "b"},
63-
StartsAt: now.Add(start * time.Minute),
64-
EndsAt: now.Add(end * time.Minute),
65-
},
66-
}
81+
return makeAlert(key, now, start, end)
6782
}
83+
6884
active := []*types.Alert{
6985
newAlert("b", 10, 20),
7086
newAlert("c", -10, 10),
@@ -111,3 +127,13 @@ func TestGC(t *testing.T) {
111127
}
112128
require.Equal(t, len(resolved), n)
113129
}
130+
131+
func makeAlert(key string, now time.Time, start, end time.Duration) *types.Alert {
132+
return &types.Alert{
133+
Alert: model.Alert{
134+
Labels: model.LabelSet{model.LabelName(key): "b"},
135+
StartsAt: now.Add(start * time.Minute),
136+
EndsAt: now.Add(end * time.Minute),
137+
},
138+
}
139+
}

0 commit comments

Comments
 (0)