Skip to content

Commit 1cfb657

Browse files
Fixes a number of bugs in silences (#8525)
This commit fixes the following bugs in silences: - prometheus/alertmanager#3877 - prometheus/alertmanager#3898 - prometheus/alertmanager#3897 which could cause an existing silence to be deleted/expired when updating the silence failed. This could be because the replacing silence exceeded limits or was invalid.
1 parent 3e68842 commit 1cfb657

File tree

10 files changed

+178
-121
lines changed

10 files changed

+178
-121
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
* [BUGFIX] Query scheduler: fix a panic in request queueing. #8451
3737
* [BUGFIX] Querier: fix issue where "context canceled" is logged for trace spans for requests to store-gateways that return no series when chunks streaming is enabled. #8510
3838
* [BUGFIX] Alertmanager: Fix per-tenant silence limits not reloaded during runtime. #8456
39+
* [BUGFIX] Alertmanager: Fixes a number of bugs in silences which could cause an existing silence to be deleted/expired when updating the silence failed. This could happen when the replacing silence was invalid or exceeded limits. #8525
3940

4041
### Mixin
4142

go.mod

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ require (
6363
github.com/google/go-github/v57 v57.0.0
6464
github.com/google/uuid v1.6.0
6565
github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc
66-
github.com/grafana/alerting v0.0.0-20240607182251-835aff588914
66+
github.com/grafana/alerting v0.0.0-20240626080128-8299cb22b8df
6767
github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc
6868
github.com/hashicorp/golang-lru/v2 v2.0.7
6969
github.com/hashicorp/vault/api v1.10.0
@@ -292,4 +292,4 @@ replace github.com/opentracing-contrib/go-stdlib => github.com/grafana/opentraci
292292
replace github.com/opentracing-contrib/go-grpc => github.com/charleskorn/go-grpc v0.0.0-20231024023642-e9298576254f
293293

294294
// Replacing prometheus/alertmanager with our fork.
295-
replace github.com/prometheus/alertmanager => github.com/grafana/prometheus-alertmanager v0.25.1-0.20240621085530-d75ea57467b1
295+
replace github.com/prometheus/alertmanager => github.com/grafana/prometheus-alertmanager v0.25.1-0.20240625192351-66ec17e3aa45

go.sum

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -505,8 +505,8 @@ github.com/gosimple/slug v1.1.1 h1:fRu/digW+NMwBIP+RmviTK97Ho/bEj/C9swrCspN3D4=
505505
github.com/gosimple/slug v1.1.1/go.mod h1:ER78kgg1Mv0NQGlXiDe57DpCyfbNywXXZ9mIorhxAf0=
506506
github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc h1:PXZQA2WCxe85Tnn+WEvr8fDpfwibmEPgfgFEaC87G24=
507507
github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc/go.mod h1:AHHlOEv1+GGQ3ktHMlhuTUwo3zljV3QJbC0+8o2kn+4=
508-
github.com/grafana/alerting v0.0.0-20240607182251-835aff588914 h1:WXLbSnnomltxdNcE20CI8RD8quZ/L0YpXP0WK+0S1BU=
509-
github.com/grafana/alerting v0.0.0-20240607182251-835aff588914/go.mod h1:U7Ta3K4T7jVgqGSYuPsfuPKHFiL2GbCZSHa3nHjmCos=
508+
github.com/grafana/alerting v0.0.0-20240626080128-8299cb22b8df h1:HgsjWyuUhJQ3jgu+mumqKuPl/KLLuOoIoCBaSliSZNY=
509+
github.com/grafana/alerting v0.0.0-20240626080128-8299cb22b8df/go.mod h1:hWW8UNwXZCRT7SfsBRGygGfBYj2QwsnTM+OEraRiov0=
510510
github.com/grafana/dskit v0.0.0-20240614072459-4beef469a18e h1:x2OBF2vc9dtvvOEcndIM+cFDAfZ7K3GnKe6gdL0FSIU=
511511
github.com/grafana/dskit v0.0.0-20240614072459-4beef469a18e/go.mod h1:HvSf3uf8Ps2vPpzHeAFyZTdUcbVr+Rxpq1xcx7J/muc=
512512
github.com/grafana/e2e v0.1.2-0.20240118170847-db90b84177fc h1:BW+LjKJDz0So5LI8UZfW5neWeKpSkWqhmGjQFzcFfLM=
@@ -521,8 +521,8 @@ github.com/grafana/mimir-prometheus v0.0.0-20240620082736-3d8577bc0dfb h1:RRarht
521521
github.com/grafana/mimir-prometheus v0.0.0-20240620082736-3d8577bc0dfb/go.mod h1:ZrBwbXc+KqeAQT4QXHKfi68+7vaOzoSdrkk90RRgdkE=
522522
github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956 h1:em1oddjXL8c1tL0iFdtVtPloq2hRPen2MJQKoAWpxu0=
523523
github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956/go.mod h1:qtI1ogk+2JhVPIXVc6q+NHziSmy2W5GbdQZFUHADCBU=
524-
github.com/grafana/prometheus-alertmanager v0.25.1-0.20240621085530-d75ea57467b1 h1:osfUzwxT9P3Iu5cIBLfAtyb5ybj/PB1rnFXLYPciPKE=
525-
github.com/grafana/prometheus-alertmanager v0.25.1-0.20240621085530-d75ea57467b1/go.mod h1:01sXtHoRwI8W324IPAzuxDFOmALqYLCOhvSC2fUHWXc=
524+
github.com/grafana/prometheus-alertmanager v0.25.1-0.20240625192351-66ec17e3aa45 h1:AJKOtDKAOg8XNFnIZSmqqqutoTSxVlRs6vekL2p2KEY=
525+
github.com/grafana/prometheus-alertmanager v0.25.1-0.20240625192351-66ec17e3aa45/go.mod h1:01sXtHoRwI8W324IPAzuxDFOmALqYLCOhvSC2fUHWXc=
526526
github.com/grafana/pyroscope-go/godeltaprof v0.1.6 h1:nEdZ8louGAplSvIJi1HVp7kWvFvdiiYg3COLlTwJiFo=
527527
github.com/grafana/pyroscope-go/godeltaprof v0.1.6/go.mod h1:Tk376Nbldo4Cha9RgiU7ik8WKFkNpfds98aUzS8omLE=
528528
github.com/grafana/regexp v0.0.0-20240531075221-3685f1377d7b h1:oMAq12GxTpwo9jxbnG/M4F/HdpwbibTaVoxNA0NZprY=

integration/alertmanager_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ func TestAlertmanagerClassicMode(t *testing.T) {
192192
StartsAt: time.Now(),
193193
EndsAt: time.Now().Add(time.Minute),
194194
})
195-
require.EqualError(t, err, "creating the silence failed with status 400 and error \"silence invalid: invalid label matcher 0: invalid label name \\\"bar🙂\\\"\"\n")
195+
require.EqualError(t, err, "creating the silence failed with status 400 and error \"invalid silence: invalid label matcher 0: invalid label name \\\"bar🙂\\\"\"\n")
196196
require.Empty(t, silenceID)
197197

198198
// Should be able to post alerts with classic labels but not UTF-8 labels.

pkg/alertmanager/alertmanager_test.go

Lines changed: 102 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/grafana/dskit/test"
2525
"github.com/prometheus/alertmanager/cluster/clusterpb"
2626
"github.com/prometheus/alertmanager/featurecontrol"
27+
"github.com/prometheus/alertmanager/silence"
2728
"github.com/prometheus/alertmanager/silence/silencepb"
2829
"github.com/prometheus/alertmanager/types"
2930
"github.com/prometheus/client_golang/prometheus"
@@ -330,17 +331,33 @@ func testLimiter(t *testing.T, limits Limits, ops []callbackOp) {
330331
}
331332
}
332333

334+
// cloneSilence returns a shallow copy of a silence. It is used in tests.
335+
func cloneSilence(t *testing.T, sil *silencepb.Silence) *silencepb.Silence {
336+
t.Helper()
337+
s := *sil
338+
return &s
339+
}
340+
341+
func toMeshSilence(t *testing.T, sil *silencepb.Silence, retention time.Duration) *silencepb.MeshSilence {
342+
t.Helper()
343+
return &silencepb.MeshSilence{
344+
Silence: sil,
345+
ExpiresAt: sil.EndsAt.Add(retention),
346+
}
347+
}
348+
333349
func TestSilenceLimits(t *testing.T) {
334350
user := "test"
335351

336352
r := prometheus.NewPedanticRegistry()
353+
limits := mockAlertManagerLimits{
354+
maxSilencesCount: 1,
355+
maxSilenceSizeBytes: 2 << 11, // 4KB,
356+
}
337357
am, err := New(&Config{
338-
UserID: user,
339-
Logger: log.NewNopLogger(),
340-
Limits: &mockAlertManagerLimits{
341-
maxSilencesCount: 1,
342-
maxSilenceSizeBytes: 2 << 11, // 4KB,
343-
},
358+
UserID: user,
359+
Logger: log.NewNopLogger(),
360+
Limits: &limits,
344361
Features: featurecontrol.NoopFlags{},
345362
TenantDataDir: t.TempDir(),
346363
ExternalURL: &url.URL{Path: "/am"},
@@ -364,38 +381,26 @@ func TestSilenceLimits(t *testing.T) {
364381
StartsAt: time.Now(),
365382
EndsAt: time.Now().Add(5 * time.Minute),
366383
}
367-
id1, err := am.silences.Set(sil1)
368-
require.NoError(t, err)
369-
require.NotEqual(t, "", id1)
384+
require.NoError(t, am.silences.Set(sil1))
370385

371-
// Insert sil2 should fail because maximum number of silences
372-
// has been exceeded.
386+
// Insert sil2 should fail because maximum number of silences has been
387+
// exceeded.
373388
sil2 := &silencepb.Silence{
374-
Matchers: []*silencepb.Matcher{{Name: "a", Pattern: "b"}},
389+
Matchers: []*silencepb.Matcher{{Name: "c", Pattern: "d"}},
375390
StartsAt: time.Now(),
376391
EndsAt: time.Now().Add(5 * time.Minute),
377392
}
378-
id2, err := am.silences.Set(sil2)
379-
require.EqualError(t, err, "exceeded maximum number of silences: 1 (limit: 1)")
380-
require.Equal(t, "", id2)
393+
require.EqualError(t, am.silences.Set(sil2), "exceeded maximum number of silences: 1 (limit: 1)")
381394

382-
// Expire sil1 and run the GC. This should allow sil2 to be
383-
// inserted.
384-
require.NoError(t, am.silences.Expire(id1))
395+
// Expire sil1 and run the GC. This should allow sil2 to be inserted.
396+
require.NoError(t, am.silences.Expire(sil1.Id))
385397
n, err := am.silences.GC()
386398
require.NoError(t, err)
387399
require.Equal(t, 1, n)
400+
require.NoError(t, am.silences.Set(sil2))
388401

389-
id2, err = am.silences.Set(sil2)
390-
require.NoError(t, err)
391-
require.NotEqual(t, "", id2)
392-
393-
// Should be able to update sil2 without hitting the limit.
394-
_, err = am.silences.Set(sil2)
395-
require.NoError(t, err)
396-
397-
// Expire sil2.
398-
require.NoError(t, am.silences.Expire(id2))
402+
// Expire sil2 and run the GC.
403+
require.NoError(t, am.silences.Expire(sil2.Id))
399404
n, err = am.silences.GC()
400405
require.NoError(t, err)
401406
require.Equal(t, 1, n)
@@ -404,25 +409,82 @@ func TestSilenceLimits(t *testing.T) {
404409
sil3 := &silencepb.Silence{
405410
Matchers: []*silencepb.Matcher{
406411
{
407-
Name: strings.Repeat("a", 2<<9),
408-
Pattern: strings.Repeat("b", 2<<9),
412+
Name: strings.Repeat("e", 2<<9),
413+
Pattern: strings.Repeat("f", 2<<9),
409414
},
410415
{
411-
Name: strings.Repeat("c", 2<<9),
412-
Pattern: strings.Repeat("d", 2<<9),
416+
Name: strings.Repeat("g", 2<<9),
417+
Pattern: strings.Repeat("h", 2<<9),
413418
},
414419
},
415-
CreatedBy: strings.Repeat("e", 2<<9),
416-
Comment: strings.Repeat("f", 2<<9),
420+
CreatedBy: strings.Repeat("i", 2<<9),
421+
Comment: strings.Repeat("j", 2<<9),
417422
StartsAt: time.Now(),
418423
EndsAt: time.Now().Add(5 * time.Minute),
419424
}
420-
id3, err := am.silences.Set(sil3)
421-
require.Error(t, err)
422-
// Do not check the exact size as it can change between consecutive runs
423-
// due to padding.
424-
require.Contains(t, err.Error(), "silence exceeded maximum size")
425-
require.Equal(t, "", id3)
425+
require.EqualError(t, am.silences.Set(sil3), fmt.Sprintf("silence exceeded maximum size: %d bytes (limit: 4096 bytes)", toMeshSilence(t, sil3, 0).Size()))
426+
427+
// Should be able to insert sil4.
428+
sil4 := &silencepb.Silence{
429+
Matchers: []*silencepb.Matcher{{Name: "k", Pattern: "l"}},
430+
StartsAt: time.Now(),
431+
EndsAt: time.Now().Add(5 * time.Minute),
432+
}
433+
require.NoError(t, am.silences.Set(sil4))
434+
435+
// Should be able to update sil4 without modifications. It is expected to
436+
// keep the same ID.
437+
sil5 := cloneSilence(t, sil4)
438+
require.NoError(t, am.silences.Set(sil5))
439+
require.Equal(t, sil4.Id, sil5.Id)
440+
441+
// Should be able to update the comment. It is also expected to keep the
442+
// same ID.
443+
sil6 := cloneSilence(t, sil5)
444+
sil6.Comment = "m"
445+
require.NoError(t, am.silences.Set(sil6))
446+
require.Equal(t, sil5.Id, sil6.Id)
447+
448+
// Should not be able to update the start and end time as this requires
449+
// sil6 to be expired and a new silence to be created. However, this would
450+
// exceed the maximum number of silences, which counts both active and
451+
// expired silences.
452+
sil7 := cloneSilence(t, sil6)
453+
sil7.StartsAt = time.Now().Add(5 * time.Minute)
454+
sil7.EndsAt = time.Now().Add(10 * time.Minute)
455+
require.EqualError(t, am.silences.Set(sil7), "exceeded maximum number of silences: 1 (limit: 1)")
456+
457+
// sil6 should not be expired because the update failed.
458+
sils, _, err := am.silences.Query(silence.QState(types.SilenceStateExpired))
459+
require.NoError(t, err)
460+
require.Len(t, sils, 0)
461+
462+
// Should not be able to update with a comment that exceeds maximum size.
463+
// Need to increase the maximum number of silences to test this.
464+
limits.maxSilencesCount = 2
465+
sil8 := cloneSilence(t, sil6)
466+
sil8.Comment = strings.Repeat("m", 2<<11)
467+
require.EqualError(t, am.silences.Set(sil8), fmt.Sprintf("silence exceeded maximum size: %d bytes (limit: 4096 bytes)", toMeshSilence(t, sil8, 0).Size()))
468+
469+
// sil6 should not be expired because the update failed.
470+
sils, _, err = am.silences.Query(silence.QState(types.SilenceStateExpired))
471+
require.NoError(t, err)
472+
require.Len(t, sils, 0)
473+
474+
// Should not be able to replace with a silence that exceeds maximum size.
475+
// This is different from the previous assertion as unlike when adding or
476+
// updating a comment, changing the matchers for a silence should expire
477+
// the existing silence, unless the silence that is replacing it exceeds
478+
// limits, in which case the operation should fail and the existing silence
479+
// should still be active.
480+
sil9 := cloneSilence(t, sil8)
481+
sil9.Matchers = []*silencepb.Matcher{{Name: "n", Pattern: "o"}}
482+
require.EqualError(t, am.silences.Set(sil9), fmt.Sprintf("silence exceeded maximum size: %d bytes (limit: 4096 bytes)", toMeshSilence(t, sil9, 0).Size()))
483+
484+
// sil6 should not be expired because the update failed.
485+
sils, _, err = am.silences.Query(silence.QState(types.SilenceStateExpired))
486+
require.NoError(t, err)
487+
require.Len(t, sils, 0)
426488
}
427489

428490
func TestExperimentalReceiversAPI(t *testing.T) {

vendor/github.com/grafana/alerting/notify/silences.go

Lines changed: 4 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/prometheus/alertmanager/api/v2/api.go

Lines changed: 2 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/prometheus/alertmanager/api/v2/testing.go

Lines changed: 2 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)