Skip to content

Commit 142b7a2

Browse files
56quarters, pracucci
authored and committed
Add per-user query metrics for series and bytes returned (cortexproject#4343)
* Add per-user query metrics for series and bytes returned Add stats included in query responses from the querier and distributor for measuring the number of series and bytes included in successful queries. These stats are emitted per-user as summaries from the query frontends. These stats are picked to add visibility into the same resources limited as part of cortexproject#4179 and cortexproject#4216. Fixes cortexproject#4259 Signed-off-by: Nick Pillitteri <[email protected]> * Formatting fix Signed-off-by: Nick Pillitteri <[email protected]> * Fix changelog to match actual changes Signed-off-by: Nick Pillitteri <[email protected]> * Typo Signed-off-by: Nick Pillitteri <[email protected]> * Code review changes, rename things for clarity Signed-off-by: Nick Pillitteri <[email protected]> * Apply suggestions from code review Co-authored-by: Marco Pracucci <[email protected]> Signed-off-by: Nick Pillitteri <[email protected]> * Code review changes, remove superfluous summaries Signed-off-by: Nick Pillitteri <[email protected]> Co-authored-by: Marco Pracucci <[email protected]> Signed-off-by: Alvin Lin <[email protected]>
1 parent 11a31e1 commit 142b7a2

File tree

9 files changed

+337
-28
lines changed

9 files changed

+337
-28
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
## master / unreleased
44
* [FEATURE] Ruler: Add new `-ruler.query-stats-enabled` which when enabled will report the `cortex_ruler_query_seconds_total` as a per-user metric that tracks the sum of the wall time of executing queries in the ruler in seconds. #4317
55
* [FEATURE] Add shuffle sharding grouper and planner within compactor to allow further work towards parallelizing compaction #4357
6+
* [FEATURE] Query Frontend: Add `cortex_query_fetched_series_total` and `cortex_query_fetched_chunks_bytes_total` per-user counters to expose the number of series and bytes fetched as part of queries. These metrics can be enabled with the `-frontend.query-stats-enabled` flag (or its respective YAML config option `query_stats_enabled`). #4343
67
* [CHANGE] Update Go version to 1.16.6. #4362
78
* [CHANGE] Querier / ruler: Change `-querier.max-fetched-chunks-per-query` configuration to limit the maximum number of chunks that can be fetched in a single query. The number of chunks fetched by ingesters AND long-term storage combined should not exceed the value configured on `-querier.max-fetched-chunks-per-query`. #4260
89
* [CHANGE] Memberlist: the `memberlist_kv_store_value_bytes` has been removed due to values no longer being stored in-memory as encoded bytes. #4345

pkg/distributor/query.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313

1414
"github.com/cortexproject/cortex/pkg/cortexpb"
1515
ingester_client "github.com/cortexproject/cortex/pkg/ingester/client"
16+
"github.com/cortexproject/cortex/pkg/querier/stats"
1617
"github.com/cortexproject/cortex/pkg/ring"
1718
"github.com/cortexproject/cortex/pkg/tenant"
1819
"github.com/cortexproject/cortex/pkg/util"
@@ -282,6 +283,7 @@ func (d *Distributor) queryIngestersExemplars(ctx context.Context, replicationSe
282283
func (d *Distributor) queryIngesterStream(ctx context.Context, replicationSet ring.ReplicationSet, req *ingester_client.QueryRequest) (*ingester_client.QueryStreamResponse, error) {
283284
var (
284285
queryLimiter = limiter.QueryLimiterFromContextWithFallback(ctx)
286+
reqStats = stats.FromContext(ctx)
285287
)
286288

287289
// Fetch samples from multiple ingesters
@@ -383,6 +385,9 @@ func (d *Distributor) queryIngesterStream(ctx context.Context, replicationSet ri
383385
resp.Timeseries = append(resp.Timeseries, series)
384386
}
385387

388+
reqStats.AddFetchedSeries(uint64(len(resp.Chunkseries) + len(resp.Timeseries)))
389+
reqStats.AddFetchedChunkBytes(uint64(resp.ChunksSize()))
390+
386391
return resp, nil
387392
}
388393

pkg/frontend/transport/handler.go

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ type Handler struct {
6060

6161
// Metrics.
6262
querySeconds *prometheus.CounterVec
63+
querySeries *prometheus.CounterVec
64+
queryBytes *prometheus.CounterVec
6365
activeUsers *util.ActiveUsersCleanupService
6466
}
6567

@@ -77,8 +79,20 @@ func NewHandler(cfg HandlerConfig, roundTripper http.RoundTripper, log log.Logge
7779
Help: "Total amount of wall clock time spend processing queries.",
7880
}, []string{"user"})
7981

82+
h.querySeries = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
83+
Name: "cortex_query_fetched_series_total",
84+
Help: "Number of series fetched to execute a query.",
85+
}, []string{"user"})
86+
87+
h.queryBytes = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
88+
Name: "cortex_query_fetched_chunks_bytes_total",
89+
Help: "Size of all chunks fetched to execute a query in bytes.",
90+
}, []string{"user"})
91+
8092
h.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(func(user string) {
8193
h.querySeconds.DeleteLabelValues(user)
94+
h.querySeries.DeleteLabelValues(user)
95+
h.queryBytes.DeleteLabelValues(user)
8296
})
8397
// If the cleaner stops or fails, we will simply not clean the metrics for inactive users.
8498
_ = h.activeUsers.StartAsync(context.Background())
@@ -165,9 +179,14 @@ func (f *Handler) reportQueryStats(r *http.Request, queryString url.Values, quer
165179
return
166180
}
167181
userID := tenant.JoinTenantIDs(tenantIDs)
182+
wallTime := stats.LoadWallTime()
183+
numSeries := stats.LoadFetchedSeries()
184+
numBytes := stats.LoadFetchedChunkBytes()
168185

169186
// Track stats.
170-
f.querySeconds.WithLabelValues(userID).Add(stats.LoadWallTime().Seconds())
187+
f.querySeconds.WithLabelValues(userID).Add(wallTime.Seconds())
188+
f.querySeries.WithLabelValues(userID).Add(float64(numSeries))
189+
f.queryBytes.WithLabelValues(userID).Add(float64(numBytes))
171190
f.activeUsers.UpdateUserTimestamp(userID, time.Now())
172191

173192
// Log stats.
@@ -177,7 +196,9 @@ func (f *Handler) reportQueryStats(r *http.Request, queryString url.Values, quer
177196
"method", r.Method,
178197
"path", r.URL.Path,
179198
"response_time", queryResponseTime,
180-
"query_wall_time_seconds", stats.LoadWallTime().Seconds(),
199+
"query_wall_time_seconds", wallTime.Seconds(),
200+
"fetched_series_count", numSeries,
201+
"fetched_chunks_bytes", numBytes,
181202
}, formatQueryString(queryString)...)
182203

183204
level.Info(util_log.WithContext(r.Context(), f.log)).Log(logMessage...)

pkg/frontend/transport/handler_test.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,28 @@ package transport
22

33
import (
44
"context"
5+
"io"
56
"net/http"
67
"net/http/httptest"
8+
"strings"
79
"testing"
810

11+
"github.com/go-kit/kit/log"
912
"github.com/pkg/errors"
13+
"github.com/prometheus/client_golang/prometheus"
14+
promtest "github.com/prometheus/client_golang/prometheus/testutil"
15+
"github.com/stretchr/testify/assert"
1016
"github.com/stretchr/testify/require"
1117
"github.com/weaveworks/common/httpgrpc"
18+
"github.com/weaveworks/common/user"
1219
)
1320

21+
// roundTripperFunc is a function type with the same signature as
// http.RoundTripper's RoundTrip method, so a plain function can be passed
// where a round tripper is required (used below as a stub in tests).
type roundTripperFunc func(*http.Request) (*http.Response, error)
22+
23+
// RoundTrip calls the underlying function f with the request and returns its
// result unchanged.
func (f roundTripperFunc) RoundTrip(r *http.Request) (*http.Response, error) {
24+
return f(r)
25+
}
26+
1427
func TestWriteError(t *testing.T) {
1528
for _, test := range []struct {
1629
status int
@@ -28,3 +41,53 @@ func TestWriteError(t *testing.T) {
2841
})
2942
}
3043
}
44+
45+
// TestHandler_ServeHTTP serves a single request through a Handler built with
// a stubbed round tripper and checks how many of the three per-user query
// metrics (cortex_query_seconds_total, cortex_query_fetched_series_total,
// cortex_query_fetched_chunks_bytes_total) are gathered from the registry:
// 3 when QueryStatsEnabled is true, 0 when it is false.
//
// NOTE(review): require.Equal below is called as (t, resp.Code, http.StatusOK);
// testify's convention is (t, expected, actual), so the arguments look swapped.
// This only affects the failure message, not the outcome — confirm and reorder.
func TestHandler_ServeHTTP(t *testing.T) {
46+
for _, tt := range []struct {
47+
name string
48+
cfg HandlerConfig
49+
expectedMetrics int
50+
}{
51+
{
52+
name: "test handler with stats enabled",
53+
cfg: HandlerConfig{QueryStatsEnabled: true},
54+
expectedMetrics: 3,
55+
},
56+
{
57+
name: "test handler with stats disabled",
58+
cfg: HandlerConfig{QueryStatsEnabled: false},
59+
expectedMetrics: 0,
60+
},
61+
} {
62+
t.Run(tt.name, func(t *testing.T) {
63+
// Stub downstream: always answers 200 with an empty JSON object body.
roundTripper := roundTripperFunc(func(req *http.Request) (*http.Response, error) {
64+
return &http.Response{
65+
StatusCode: http.StatusOK,
66+
Body: io.NopCloser(strings.NewReader("{}")),
67+
}, nil
68+
})
69+
70+
// A fresh registry per sub-test so metric counts do not leak between cases.
reg := prometheus.NewPedanticRegistry()
71+
handler := NewHandler(tt.cfg, roundTripper, log.NewNopLogger(), reg)
72+
73+
// The request context carries org ID "12345".
ctx := user.InjectOrgID(context.Background(), "12345")
74+
req := httptest.NewRequest("GET", "/", nil)
75+
req = req.WithContext(ctx)
76+
resp := httptest.NewRecorder()
77+
78+
handler.ServeHTTP(resp, req)
79+
_, _ = io.ReadAll(resp.Body)
80+
require.Equal(t, resp.Code, http.StatusOK)
81+
82+
// Count only the three query-stats metrics among everything gathered from reg.
count, err := promtest.GatherAndCount(
83+
reg,
84+
"cortex_query_seconds_total",
85+
"cortex_query_fetched_series_total",
86+
"cortex_query_fetched_chunks_bytes_total",
87+
)
88+
89+
assert.NoError(t, err)
90+
assert.Equal(t, tt.expectedMetrics, count)
91+
})
92+
}
93+
}

pkg/querier/blocks_store_queryable.go

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929

3030
"github.com/cortexproject/cortex/pkg/cortexpb"
3131
"github.com/cortexproject/cortex/pkg/querier/series"
32+
"github.com/cortexproject/cortex/pkg/querier/stats"
3233
"github.com/cortexproject/cortex/pkg/ring"
3334
"github.com/cortexproject/cortex/pkg/ring/kv"
3435
"github.com/cortexproject/cortex/pkg/storage/bucket"
@@ -565,6 +566,7 @@ func (q *blocksStoreQuerier) fetchSeriesFromStores(
565566
numChunks = atomic.NewInt32(0)
566567
spanLog = spanlogger.FromContext(ctx)
567568
queryLimiter = limiter.QueryLimiterFromContextWithFallback(ctx)
569+
reqStats = stats.FromContext(ctx)
568570
)
569571

570572
// Concurrently fetch series from all clients.
@@ -626,10 +628,7 @@ func (q *blocksStoreQuerier) fetchSeriesFromStores(
626628
return validation.LimitError(fmt.Sprintf(errMaxChunksPerQueryLimit, util.LabelMatchersToString(matchers), maxChunksLimit))
627629
}
628630
}
629-
chunksSize := 0
630-
for _, c := range s.Chunks {
631-
chunksSize += c.Size()
632-
}
631+
chunksSize := countChunkBytes(s)
633632
if chunkBytesLimitErr := queryLimiter.AddChunkBytes(chunksSize); chunkBytesLimitErr != nil {
634633
return validation.LimitError(chunkBytesLimitErr.Error())
635634
}
@@ -657,10 +656,16 @@ func (q *blocksStoreQuerier) fetchSeriesFromStores(
657656
}
658657
}
659658

659+
numSeries := len(mySeries)
660+
chunkBytes := countChunkBytes(mySeries...)
661+
662+
reqStats.AddFetchedSeries(uint64(numSeries))
663+
reqStats.AddFetchedChunkBytes(uint64(chunkBytes))
664+
660665
level.Debug(spanLog).Log("msg", "received series from store-gateway",
661666
"instance", c.RemoteAddress(),
662-
"num series", len(mySeries),
663-
"bytes series", countSeriesBytes(mySeries),
667+
"fetched series", numSeries,
668+
"fetched chunk bytes", chunkBytes,
664669
"requested blocks", strings.Join(convertULIDsToString(blockIDs), " "),
665670
"queried blocks", strings.Join(convertULIDsToString(myQueriedBlocks), " "))
666671

@@ -944,12 +949,11 @@ func convertBlockHintsToULIDs(hints []hintspb.Block) ([]ulid.ULID, error) {
944949
return res, nil
945950
}
946951

947-
func countSeriesBytes(series []*storepb.Series) (count uint64) {
952+
// countChunkBytes returns the size of the chunks making up the provided series in bytes
953+
func countChunkBytes(series ...*storepb.Series) (count int) {
948954
for _, s := range series {
949955
for _, c := range s.Chunks {
950-
if c.Raw != nil {
951-
count += uint64(len(c.Raw.Data))
952-
}
956+
count += c.Size()
953957
}
954958
}
955959

pkg/querier/stats/stats.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,47 @@ func (s *Stats) LoadWallTime() time.Duration {
5454
return time.Duration(atomic.LoadInt64((*int64)(&s.WallTime)))
5555
}
5656

57+
// AddFetchedSeries atomically adds series to the FetchedSeriesCount field.
// It does nothing when called on a nil *Stats.
func (s *Stats) AddFetchedSeries(series uint64) {
58+
if s == nil {
59+
return
60+
}
61+
62+
atomic.AddUint64(&s.FetchedSeriesCount, series)
63+
}
64+
65+
// LoadFetchedSeries atomically reads and returns the FetchedSeriesCount field.
// It returns 0 when called on a nil *Stats.
func (s *Stats) LoadFetchedSeries() uint64 {
66+
if s == nil {
67+
return 0
68+
}
69+
70+
return atomic.LoadUint64(&s.FetchedSeriesCount)
71+
}
72+
73+
// AddFetchedChunkBytes atomically adds bytes to the FetchedChunkBytes field.
// It does nothing when called on a nil *Stats.
func (s *Stats) AddFetchedChunkBytes(bytes uint64) {
74+
if s == nil {
75+
return
76+
}
77+
78+
atomic.AddUint64(&s.FetchedChunkBytes, bytes)
79+
}
80+
81+
// LoadFetchedChunkBytes atomically reads and returns the FetchedChunkBytes
// field. It returns 0 when called on a nil *Stats.
func (s *Stats) LoadFetchedChunkBytes() uint64 {
82+
if s == nil {
83+
return 0
84+
}
85+
86+
return atomic.LoadUint64(&s.FetchedChunkBytes)
87+
}
88+
5789
// Merge the provided Stats into this one by adding other's wall time, fetched
// series count and fetched chunk bytes to s. It does nothing when either s or
// other is nil.
5890
func (s *Stats) Merge(other *Stats) {
5991
if s == nil || other == nil {
6092
return
6193
}
6294

6395
s.AddWallTime(other.LoadWallTime())
96+
s.AddFetchedSeries(other.LoadFetchedSeries())
97+
s.AddFetchedChunkBytes(other.LoadFetchedChunkBytes())
6498
}
6599

66100
func ShouldTrackHTTPGRPCResponse(r *httpgrpc.HTTPResponse) bool {

0 commit comments

Comments
 (0)