Skip to content

Commit 303d84f

Browse files
maru-avacam-schultz
authored and committed
[ci] Move monitoring check from github action to code (#3766)
1 parent cc20dae commit 303d84f

File tree

6 files changed

+77
-40
lines changed

6 files changed

+77
-40
lines changed

.github/actions/run-monitored-tmpnet-cmd/action.yml

+1-27
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ runs:
6868
run: ${{ inputs.run_env }} nix develop --impure --command bash -x ${{ inputs.run }}
6969
env:
7070
TMPNET_START_COLLECTORS: ${{ inputs.prometheus_username != '' }}
71+
TMPNET_CHECK_MONITORING: ${{ inputs.prometheus_username != '' }}
7172
LOKI_USERNAME: ${{ inputs.loki_username }}
7273
LOKI_PASSWORD: ${{ inputs.loki_password }}
7374
PROMETHEUS_USERNAME: ${{ inputs.prometheus_username }}
@@ -92,30 +93,3 @@ runs:
9293
~/.tmpnet/prometheus/prometheus.log
9394
~/.tmpnet/promtail/promtail.log
9495
if-no-files-found: error
95-
# TODO(marun) Maybe optionally run these checks in an AfterSuite step?
96-
- name: Check that logs were collected
97-
if: (inputs.prometheus_username != '')
98-
shell: bash
99-
run: go run github.com/ava-labs/avalanchego/tests/fixture/tmpnet/cmd check-logs
100-
env:
101-
LOKI_USERNAME: ${{ inputs.loki_username }}
102-
LOKI_PASSWORD: ${{ inputs.loki_password }}
103-
GH_REPO: ${{ inputs.repository_owner }}/${{ inputs.repository_name }}
104-
GH_WORKFLOW: ${{ inputs.workflow }}
105-
GH_RUN_ID: ${{ inputs.run_id }}
106-
GH_RUN_NUMBER: ${{ inputs.run_number }}
107-
GH_RUN_ATTEMPT: ${{ inputs.run_attempt }}
108-
GH_JOB_ID: ${{ inputs.job }}
109-
- name: Check that metrics were collected
110-
if: (inputs.prometheus_username != '')
111-
shell: bash
112-
run: go run github.com/ava-labs/avalanchego/tests/fixture/tmpnet/cmd check-metrics
113-
env:
114-
PROMETHEUS_USERNAME: ${{ inputs.prometheus_username }}
115-
PROMETHEUS_PASSWORD: ${{ inputs.prometheus_password }}
116-
GH_REPO: ${{ inputs.repository_owner }}/${{ inputs.repository_name }}
117-
GH_WORKFLOW: ${{ inputs.workflow }}
118-
GH_RUN_ID: ${{ inputs.run_id }}
119-
GH_RUN_NUMBER: ${{ inputs.run_number }}
120-
GH_RUN_ATTEMPT: ${{ inputs.run_attempt }}
121-
GH_JOB_ID: ${{ inputs.job }}

tests/fixture/e2e/env.go

+20-4
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
package e2e
55

66
import (
7+
"context"
78
"encoding/json"
89
"errors"
910
"math/rand"
@@ -79,12 +80,27 @@ func (te *TestEnvironment) Marshal() []byte {
7980
func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork *tmpnet.Network) *TestEnvironment {
8081
require := require.New(tc)
8182

82-
// Start collectors for any command but stop
83-
if flagVars.StartCollectors() && !flagVars.StopNetwork() {
84-
require.NoError(tmpnet.StartCollectors(tc.DefaultContext(), tc.Log()))
83+
var network *tmpnet.Network
84+
85+
// Consider monitoring flags for any command but stop
86+
if !flagVars.StopNetwork() {
87+
if flagVars.StartCollectors() {
88+
require.NoError(tmpnet.StartCollectors(tc.DefaultContext(), tc.Log()))
89+
}
90+
if flagVars.CheckMonitoring() {
91+
// Register cleanup before network start to ensure it runs after the network is stopped (LIFO)
92+
tc.DeferCleanup(func() {
93+
if network == nil {
94+
tc.Log().Warn("unable to check that logs and metrics were collected from an uninitialized network")
95+
return
96+
}
97+
ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
98+
defer cancel()
99+
require.NoError(tmpnet.CheckMonitoring(ctx, tc.Log(), network.UUID))
100+
})
101+
}
85102
}
86103

87-
var network *tmpnet.Network
88104
// Need to load the network if it is being stopped or reused
89105
if flagVars.StopNetwork() || flagVars.ReuseNetwork() {
90106
networkDir := flagVars.NetworkDir()

tests/fixture/e2e/flags.go

+17-3
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ type FlagVars struct {
2121
networkDir string
2222
reuseNetwork bool
2323
startCollectors bool
24+
checkMonitoring bool
2425
startNetwork bool
2526
stopNetwork bool
2627
restartNetwork bool
@@ -77,6 +78,10 @@ func (v *FlagVars) StartCollectors() bool {
7778
return v.startCollectors
7879
}
7980

81+
func (v *FlagVars) CheckMonitoring() bool {
82+
return v.checkMonitoring
83+
}
84+
8085
func (v *FlagVars) NetworkShutdownDelay() time.Duration {
8186
if v.startCollectors {
8287
// Only return a non-zero value if we want to ensure the collectors have
@@ -140,7 +145,10 @@ func RegisterFlags() *FlagVars {
140145
false,
141146
"[optional] restart an existing network previously started with --reuse-network. Useful for ensuring a network is running with the current state of binaries on disk. Ignored if a network is not already running or --stop-network is provided.",
142147
)
143-
SetStartCollectorsFlag(&vars.startCollectors)
148+
SetMonitoringFlags(
149+
&vars.startCollectors,
150+
&vars.checkMonitoring,
151+
)
144152
flag.BoolVar(
145153
&vars.startNetwork,
146154
"start-network",
@@ -170,11 +178,17 @@ func RegisterFlags() *FlagVars {
170178
}
171179

172180
// Enable reuse by the upgrade job
173-
func SetStartCollectorsFlag(p *bool) {
181+
func SetMonitoringFlags(startCollectors *bool, checkMonitoring *bool) {
174182
flag.BoolVar(
175-
p,
183+
startCollectors,
176184
"start-collectors",
177185
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_START_COLLECTORS", "false")),
178186
"[optional] whether to start collectors of logs and metrics from nodes of the temporary network.",
179187
)
188+
flag.BoolVar(
189+
checkMonitoring,
190+
"check-monitoring",
191+
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_CHECK_MONITORING", "false")),
192+
"[optional] whether to check that logs and metrics have been collected from nodes of the temporary network.",
193+
)
180194
}

tests/fixture/tmpnet/check_monitoring.go

+23-5
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,16 @@ import (
2727

2828
type getCountFunc func() (int, error)
2929

30+
// CheckMonitoring checks if logs and metrics exist for the given network. If no network
31+
// UUID is provided, an attempt will be made to derive selectors from env vars (GH_*)
32+
// identifying a github actions run.
33+
func CheckMonitoring(ctx context.Context, log logging.Logger, networkUUID string) error {
34+
return errors.Join(
35+
CheckLogsExist(ctx, log, networkUUID),
36+
CheckMetricsExist(ctx, log, networkUUID),
37+
)
38+
}
39+
3040
// waitForCount waits until the provided function returns greater than zero.
3141
func waitForCount(ctx context.Context, log logging.Logger, name string, getCount getCountFunc) error {
3242
err := pollUntilContextCancel(
@@ -55,8 +65,9 @@ func waitForCount(ctx context.Context, log logging.Logger, name string, getCount
5565
return nil
5666
}
5767

58-
// CheckLogsExist checks if logs exist for the given network. Github labels are also
59-
// included if provided as env vars (GH_*).
68+
// CheckLogsExist checks if logs exist for the given network. If no network UUID is
69+
// provided, an attempt will be made to derive selectors from env vars (GH_*) identifying
70+
// a github actions run.
6071
func CheckLogsExist(ctx context.Context, log logging.Logger, networkUUID string) error {
6172
username, password, err := getCollectorCredentials(promtailCmd)
6273
if err != nil {
@@ -163,7 +174,7 @@ func queryLoki(
163174
}
164175

165176
// CheckMetricsExist checks if metrics exist for the given network. Github labels are also
166-
// included if provided as env vars (GH_*).
177+
// used as filters if provided as env vars (GH_*).
167178
func CheckMetricsExist(ctx context.Context, log logging.Logger, networkUUID string) error {
168179
username, password, err := getCollectorCredentials(prometheusCmd)
169180
if err != nil {
@@ -253,10 +264,13 @@ func (b *basicAuthRoundTripper) RoundTrip(req *http.Request) (*http.Response, er
253264

254265
// getSelectors returns the comma-separated list of selectors.
255266
func getSelectors(networkUUID string) (string, error) {
256-
selectors := []string{}
267+
// If network UUID is provided, use it as the only selector
257268
if len(networkUUID) > 0 {
258-
selectors = append(selectors, fmt.Sprintf(`network_uuid="%s"`, networkUUID))
269+
return fmt.Sprintf(`network_uuid="%s"`, networkUUID), nil
259270
}
271+
272+
// Fall back to using Github labels as selectors
273+
selectors := []string{}
260274
githubLabels := githubLabelsFromEnv()
261275
for label := range githubLabels {
262276
value, err := githubLabels.GetStringVal(label)
@@ -268,5 +282,9 @@ func getSelectors(networkUUID string) (string, error) {
268282
}
269283
selectors = append(selectors, fmt.Sprintf(`%s="%s"`, label, value))
270284
}
285+
if len(selectors) == 0 {
286+
return "", errors.New("no GH_* env vars set to use for selectors")
287+
}
288+
271289
return strings.Join(selectors, ","), nil
272290
}

tests/fixture/tmpnet/network.go

+1
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ type Network struct {
121121

122122
func NewDefaultNetwork(owner string) *Network {
123123
return &Network{
124+
UUID: uuid.NewString(),
124125
Owner: owner,
125126
Nodes: NewNodesOrPanic(DefaultNodeCount),
126127
}

tests/upgrade/upgrade_test.go

+15-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
package upgrade
55

66
import (
7+
"context"
78
"flag"
89
"fmt"
910
"testing"
@@ -24,6 +25,7 @@ var (
2425
avalancheGoExecPath string
2526
avalancheGoExecPathToUpgradeTo string
2627
startCollectors bool
28+
checkMonitoring bool
2729
)
2830

2931
func init() {
@@ -39,7 +41,10 @@ func init() {
3941
"",
4042
"avalanchego executable path to upgrade to",
4143
)
42-
e2e.SetStartCollectorsFlag(&startCollectors)
44+
e2e.SetMonitoringFlags(
45+
&startCollectors,
46+
&checkMonitoring,
47+
)
4348
}
4449

4550
var _ = ginkgo.Describe("[Upgrade]", func() {
@@ -59,6 +64,15 @@ var _ = ginkgo.Describe("[Upgrade]", func() {
5964
require.NoError(tmpnet.StartCollectors(tc.DefaultContext(), tc.Log()))
6065
shutdownDelay = tmpnet.NetworkShutdownDelay // Ensure a final metrics scrape
6166
}
67+
if checkMonitoring {
68+
// Since cleanups are run in LIFO order, adding this cleanup before
69+
// StartNetwork is called ensures network shutdown will be called first.
70+
tc.DeferCleanup(func() {
71+
ctx, cancel := context.WithTimeout(context.Background(), e2e.DefaultTimeout)
72+
defer cancel()
73+
require.NoError(tmpnet.CheckMonitoring(ctx, tc.Log(), network.UUID))
74+
})
75+
}
6276

6377
e2e.StartNetwork(
6478
tc,

0 commit comments

Comments
 (0)