Skip to content

Add limits for silences #84

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions cmd/alertmanager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ func run() int {
dataDir = kingpin.Flag("storage.path", "Base path for data storage.").Default("data/").String()
retention = kingpin.Flag("data.retention", "How long to keep data for.").Default("120h").Duration()
maintenanceInterval = kingpin.Flag("data.maintenance-interval", "Interval between garbage collection and snapshotting to disk of the silences and the notification logs.").Default("15m").Duration()
maxSilences = kingpin.Flag("silences.max-silences", "Maximum number of active and pending silences, excluding expired silences. If negative or zero, no limit is set.").Default("0").Int()
maxPerSilenceBytes = kingpin.Flag("silences.max-per-silence-bytes", "Maximum per silence size in bytes. If negative or zero, no limit is set.").Default("0").Int()
alertGCInterval = kingpin.Flag("alerts.gc-interval", "Interval between alert GC.").Default("30m").Duration()

webConfig = webflag.AddFlags(kingpin.CommandLine, ":9093")
Expand Down Expand Up @@ -258,8 +260,12 @@ func run() int {
silenceOpts := silence.Options{
SnapshotFile: filepath.Join(*dataDir, "silences"),
Retention: *retention,
Logger: log.With(logger, "component", "silences"),
Metrics: prometheus.DefaultRegisterer,
Limits: silence.Limits{
MaxSilences: *maxSilences,
MaxPerSilenceBytes: *maxPerSilenceBytes,
},
Logger: log.With(logger, "component", "silences"),
Metrics: prometheus.DefaultRegisterer,
}

silences, err := silence.New(silenceOpts)
Expand Down
11 changes: 11 additions & 0 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,17 @@ is not well-formed, the changes will not be applied and an error is logged.
A configuration reload is triggered by sending a `SIGHUP` to the process or
sending an HTTP POST request to the `/-/reload` endpoint.

## Limits

Alertmanager supports a number of configurable limits via command-line flags.

To limit the maximum number of active and pending silences (expired silences
are not counted), use the `--silences.max-silences` flag.
To limit the maximum size of an individual silence, use the
`--silences.max-per-silence-bytes` flag; the value is given in bytes.

Both limits are disabled by default. Setting either flag to zero or a negative
value disables that limit.

## Configuration file introduction

To specify which configuration file to load, use the `--config.file` flag.
Expand Down
51 changes: 46 additions & 5 deletions silence/silence.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ type Silences struct {
logger log.Logger
metrics *metrics
retention time.Duration
limits Limits

mtx sync.RWMutex
st state
Expand All @@ -201,6 +202,16 @@ type Silences struct {
mc matcherCache
}

// Limits contains the limits that are enforced when silences are created
// or replaced. A limit whose value is zero or negative is not enforced.
type Limits struct {
// MaxSilences limits the maximum number of active and pending silences.
// It does not include expired silences.
MaxSilences int
// MaxPerSilenceBytes is the maximum size, in bytes, of an individual
// silence as stored on disk.
MaxPerSilenceBytes int
}

// MaintenanceFunc is the function that is run as part of the periodic
// maintenance for silences. It returns the size of the snapshot taken, or an
// error if it failed.
type MaintenanceFunc func() (int64, error)
Expand Down Expand Up @@ -318,6 +329,7 @@ type Options struct {
// Retention time for newly created Silences. Silences may be
// garbage collected after the given duration after they ended.
Retention time.Duration
Limits Limits

// A logger used by background processing.
Logger log.Logger
Expand All @@ -342,6 +354,7 @@ func New(o Options) (*Silences, error) {
mc: matcherCache{},
logger: log.NewNopLogger(),
retention: o.Retention,
limits: o.Limits,
broadcast: func([]byte) {},
st: state{},
}
Expand Down Expand Up @@ -569,6 +582,13 @@ func (s *Silences) setSilence(sil *pb.Silence, now time.Time, skipValidate bool)
return err
}

// Enforce the size limit only for silences that have not yet ended.
// Silences whose end time has passed are exempt so that an existing
// oversized silence can still be expired after the limit has been reduced.
if n := msil.Size(); s.limits.MaxPerSilenceBytes > 0 && n > s.limits.MaxPerSilenceBytes && sil.EndsAt.After(now) {
return fmt.Errorf("silence exceeded maximum size: %d bytes (limit: %d bytes)", n, s.limits.MaxPerSilenceBytes)
}

if s.st.merge(msil, now) {
s.version++
}
Expand Down Expand Up @@ -608,10 +628,10 @@ func (s *Silences) Set(sil *pb.Silence) (string, error) {
func (s *Silences) set(sil *pb.Silence) (string, error) {
now := s.nowUTC()
prev, ok := s.getSilence(sil.Id)

if sil.Id != "" && !ok {
return "", ErrNotFound
}

if ok {
if canUpdate(prev, sil, now) {
return sil.Id, s.setSilence(sil, now, false)
Expand All @@ -623,7 +643,24 @@ func (s *Silences) set(sil *pb.Silence) (string, error) {
}
}
}

// If we got here it's either a new silence or a replacing one.
if s.limits.MaxSilences > 0 {
// Get the number of active and pending silences to enforce limits.
q := &query{}
err := QState(types.SilenceStateActive, types.SilenceStatePending)(q)
if err != nil {
return "", fmt.Errorf("unable to query silences while checking limits: %w", err)
}
sils, _, err := s.query(q, s.nowUTC())
if err != nil {
return "", fmt.Errorf("unable to query silences while checking limits: %w", err)
}
if len(sils)+1 > s.limits.MaxSilences {
return "", fmt.Errorf("exceeded maximum number of silences: %d (limit: %d)", len(sils), s.limits.MaxSilences)
}
}

uid, err := uuid.NewV4()
if err != nil {
return "", fmt.Errorf("generate uuid: %w", err)
Expand All @@ -634,7 +671,11 @@ func (s *Silences) set(sil *pb.Silence) (string, error) {
sil.StartsAt = now
}

return sil.Id, s.setSilence(sil, now, false)
if err = s.setSilence(sil, now, false); err != nil {
return "", err
}

return sil.Id, nil
}

// canUpdate returns true if silence a can be updated to b without
Expand Down Expand Up @@ -778,6 +819,9 @@ func (s *Silences) QueryOne(params ...QueryParam) (*pb.Silence, error) {
// Query for silences based on the given query parameters. It returns the
// resulting silences and the state version the result is based on.
func (s *Silences) Query(params ...QueryParam) ([]*pb.Silence, int, error) {
s.mtx.Lock()
defer s.mtx.Unlock()

s.metrics.queriesTotal.Inc()
defer prometheus.NewTimer(s.metrics.queryDuration).ObserveDuration()

Expand Down Expand Up @@ -817,9 +861,6 @@ func (s *Silences) query(q *query, now time.Time) ([]*pb.Silence, int, error) {
// the use of post-filter functions is the trivial solution for now.
var res []*pb.Silence

s.mtx.Lock()
defer s.mtx.Unlock()

if q.ids != nil {
for _, id := range q.ids {
if s, ok := s.st[id]; ok {
Expand Down
69 changes: 69 additions & 0 deletions silence/silence_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"os"
"runtime"
"sort"
"strings"
"sync"
"testing"
"time"
Expand Down Expand Up @@ -458,6 +459,74 @@ func TestSilenceSet(t *testing.T) {
require.Equal(t, want, s.st, "unexpected state after silence creation")
}

func TestSilenceLimits(t *testing.T) {
s, err := New(Options{
Limits: Limits{
MaxSilences: 1,
MaxPerSilenceBytes: 2 << 11, // 4KB
},
})
require.NoError(t, err)

// Insert sil1 should succeed without error.
sil1 := &pb.Silence{
Matchers: []*pb.Matcher{{Name: "a", Pattern: "b"}},
StartsAt: time.Now(),
EndsAt: time.Now().Add(5 * time.Minute),
}
id1, err := s.Set(sil1)
require.NoError(t, err)
require.NotEqual(t, "", id1)

// Insert sil2 should fail because maximum number of silences
// has been exceeded.
sil2 := &pb.Silence{
Matchers: []*pb.Matcher{{Name: "a", Pattern: "b"}},
StartsAt: time.Now(),
EndsAt: time.Now().Add(5 * time.Minute),
}
id2, err := s.Set(sil2)
require.EqualError(t, err, "exceeded maximum number of silences: 1 (limit: 1)")
require.Equal(t, "", id2)

// Expire sil1. This should allow sil2 to be inserted.
require.NoError(t, s.Expire(id1))
id2, err = s.Set(sil2)
require.NoError(t, err)
require.NotEqual(t, "", id2)

// Should be able to update sil2 without hitting the limit.
_, err = s.Set(sil2)
require.NoError(t, err)

// Expire sil2.
require.NoError(t, s.Expire(id2))

// Insert sil3 should fail because it exceeds maximum size.
sil3 := &pb.Silence{
Matchers: []*pb.Matcher{
{
Name: strings.Repeat("a", 2<<9),
Pattern: strings.Repeat("b", 2<<9),
},
{
Name: strings.Repeat("c", 2<<9),
Pattern: strings.Repeat("d", 2<<9),
},
},
CreatedBy: strings.Repeat("e", 2<<9),
Comment: strings.Repeat("f", 2<<9),
StartsAt: time.Now(),
EndsAt: time.Now().Add(5 * time.Minute),
}
id3, err := s.Set(sil3)
require.Error(t, err)
// Do not check the exact size as it can change between consecutive runs
// due to padding.
require.Contains(t, err.Error(), "silence exceeded maximum size")
require.Equal(t, "", id3)
}

func TestSilenceUpsert(t *testing.T) {
s, err := New(Options{
Retention: time.Hour,
Expand Down
Loading