Skip to content

Commit fba5f93

Browse files
committed
feat: add --tolerate-failures-status-check flag and deploy.tolerateFailures config for improved ci/cd usage
1 parent dfd6015 commit fba5f93

File tree

8 files changed

+148
-96
lines changed

8 files changed

+148
-96
lines changed

cmd/skaffold/app/cmd/flags.go

+9
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,15 @@ var flagRegistry = []Flag{
332332
DefinedOn: []string{"dev", "debug", "deploy", "run", "apply"},
333333
IsEnum: true,
334334
},
335+
{
336+
Name: "tolerate-failures-status-check",
337+
Usage: "Configures `status-check` to tolerate failures until Skaffold's statusCheckDeadline duration or the deployment's progressDeadlineSeconds. Otherwise deployment failures skaffold encounters will immediately fail the deployment. Defaults to 'false'",
338+
Value: &opts.TolerateFailuresStatusCheck,
339+
DefValue: false,
340+
FlagAddMethod: "BoolVar",
341+
DefinedOn: []string{"dev", "debug", "deploy", "run", "apply"},
342+
IsEnum: true,
343+
},
335344
{
336345
Name: "fast-fail-status-check",
337346
Usage: "Configures `status-check` to fail immediately if any error occurs. Otherwise `status-check` will attempt to check all resources once and only then report errors and possibly exit. Defaults to 'true'",

docs-v2/content/en/schemas/v4beta1.json

+7
Original file line numberDiff line numberDiff line change
@@ -1422,6 +1422,12 @@
14221422
"type": "integer",
14231423
"description": "*beta* deadline for deployments to stabilize in seconds.",
14241424
"x-intellij-html-description": "<em>beta</em> deadline for deployments to stabilize in seconds."
1425+
},
1426+
"tolerateFailures": {
1427+
"type": "boolean",
1428+
"description": "configures the Skaffold \"status-check\" to tolerate failures (flapping deployments, etc.) until the statusCheckDeadlineSeconds duration or k8s object timeouts such as progressDeadlineSeconds, etc.",
1429+
"x-intellij-html-description": "configures the Skaffold &quot;status-check&quot; to tolerate failures (flapping deployments, etc.) until the statusCheckDeadlineSeconds duration or k8s object timeouts such as progressDeadlineSeconds, etc.",
1430+
"default": "false"
14251431
}
14261432
},
14271433
"preferredOrder": [
@@ -1432,6 +1438,7 @@
14321438
"cloudrun",
14331439
"statusCheck",
14341440
"statusCheckDeadlineSeconds",
1441+
"tolerateFailures",
14351442
"kubeContext",
14361443
"logs"
14371444
],

docs/content/en/docs/references/cli/_index.md

+10
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ Options:
139139
--status-check=: Wait for deployed resources to stabilize
140140
--sync-remote-cache='always': Controls how Skaffold manages the remote config cache (see `remote-cache-dir`). One of `always` (default), `missing`, or `never`. `always` syncs remote repositories to latest on access. `missing` only clones remote repositories if they do not exist locally. `never` means the user takes responsibility for updating remote repositories.
141141
--tail=false: Stream logs from deployed objects
142+
--tolerate-failures-status-check=false: Configures `status-check` to tolerate failures until Skaffold's statusCheckDeadline duration or the deployment's progressDeadlineSeconds. Otherwise deployment failures skaffold encounters will immediately fail the deployment. Defaults to 'false'
142143
--wait-for-connection=false: Blocks ending execution of skaffold until the /v2/events gRPC/HTTP endpoint is hit
143144
144145
Usage:
@@ -168,6 +169,7 @@ Env vars:
168169
* `SKAFFOLD_STATUS_CHECK` (same as `--status-check`)
169170
* `SKAFFOLD_SYNC_REMOTE_CACHE` (same as `--sync-remote-cache`)
170171
* `SKAFFOLD_TAIL` (same as `--tail`)
172+
* `SKAFFOLD_TOLERATE_FAILURES_STATUS_CHECK` (same as `--tolerate-failures-status-check`)
171173
* `SKAFFOLD_WAIT_FOR_CONNECTION` (same as `--wait-for-connection`)
172174

173175
### skaffold build
@@ -453,6 +455,7 @@ Options:
453455
--sync-remote-cache='always': Controls how Skaffold manages the remote config cache (see `remote-cache-dir`). One of `always` (default), `missing`, or `never`. `always` syncs remote repositories to latest on access. `missing` only clones remote repositories if they do not exist locally. `never` means the user takes responsibility for updating remote repositories.
454456
-t, --tag='': The optional custom tag to use for images which overrides the current Tagger configuration
455457
--tail=true: Stream logs from deployed objects
458+
--tolerate-failures-status-check=false: Configures `status-check` to tolerate failures until Skaffold's statusCheckDeadline duration or the deployment's progressDeadlineSeconds. Otherwise deployment failures skaffold encounters will immediately fail the deployment. Defaults to 'false'
456459
--toot=false: Emit a terminal beep after the deploy is complete
457460
--trigger='notify': How is change detection triggered? (polling, notify, or manual)
458461
--wait-for-connection=false: Blocks ending execution of skaffold until the /v2/events gRPC/HTTP endpoint is hit
@@ -516,6 +519,7 @@ Env vars:
516519
* `SKAFFOLD_SYNC_REMOTE_CACHE` (same as `--sync-remote-cache`)
517520
* `SKAFFOLD_TAG` (same as `--tag`)
518521
* `SKAFFOLD_TAIL` (same as `--tail`)
522+
* `SKAFFOLD_TOLERATE_FAILURES_STATUS_CHECK` (same as `--tolerate-failures-status-check`)
519523
* `SKAFFOLD_TOOT` (same as `--toot`)
520524
* `SKAFFOLD_TRIGGER` (same as `--trigger`)
521525
* `SKAFFOLD_WAIT_FOR_CONNECTION` (same as `--wait-for-connection`)
@@ -633,6 +637,7 @@ Options:
633637
--sync-remote-cache='always': Controls how Skaffold manages the remote config cache (see `remote-cache-dir`). One of `always` (default), `missing`, or `never`. `always` syncs remote repositories to latest on access. `missing` only clones remote repositories if they do not exist locally. `never` means the user takes responsibility for updating remote repositories.
634638
-t, --tag='': The optional custom tag to use for images which overrides the current Tagger configuration
635639
--tail=false: Stream logs from deployed objects
640+
--tolerate-failures-status-check=false: Configures `status-check` to tolerate failures until Skaffold's statusCheckDeadline duration or the deployment's progressDeadlineSeconds. Otherwise deployment failures skaffold encounters will immediately fail the deployment. Defaults to 'false'
636641
--toot=false: Emit a terminal beep after the deploy is complete
637642
--wait-for-connection=false: Blocks ending execution of skaffold until the /v2/events gRPC/HTTP endpoint is hit
638643
--wait-for-deletions=true: Wait for pending deletions to complete before a deployment
@@ -681,6 +686,7 @@ Env vars:
681686
* `SKAFFOLD_SYNC_REMOTE_CACHE` (same as `--sync-remote-cache`)
682687
* `SKAFFOLD_TAG` (same as `--tag`)
683688
* `SKAFFOLD_TAIL` (same as `--tail`)
689+
* `SKAFFOLD_TOLERATE_FAILURES_STATUS_CHECK` (same as `--tolerate-failures-status-check`)
684690
* `SKAFFOLD_TOOT` (same as `--toot`)
685691
* `SKAFFOLD_WAIT_FOR_CONNECTION` (same as `--wait-for-connection`)
686692
* `SKAFFOLD_WAIT_FOR_DELETIONS` (same as `--wait-for-deletions`)
@@ -740,6 +746,7 @@ Options:
740746
--sync-remote-cache='always': Controls how Skaffold manages the remote config cache (see `remote-cache-dir`). One of `always` (default), `missing`, or `never`. `always` syncs remote repositories to latest on access. `missing` only clones remote repositories if they do not exist locally. `never` means the user takes responsibility for updating remote repositories.
741747
-t, --tag='': The optional custom tag to use for images which overrides the current Tagger configuration
742748
--tail=true: Stream logs from deployed objects
749+
--tolerate-failures-status-check=false: Configures `status-check` to tolerate failures until Skaffold's statusCheckDeadline duration or the deployment's progressDeadlineSeconds. Otherwise deployment failures skaffold encounters will immediately fail the deployment. Defaults to 'false'
743750
--toot=false: Emit a terminal beep after the deploy is complete
744751
--trigger='notify': How is change detection triggered? (polling, notify, or manual)
745752
--wait-for-connection=false: Blocks ending execution of skaffold until the /v2/events gRPC/HTTP endpoint is hit
@@ -803,6 +810,7 @@ Env vars:
803810
* `SKAFFOLD_SYNC_REMOTE_CACHE` (same as `--sync-remote-cache`)
804811
* `SKAFFOLD_TAG` (same as `--tag`)
805812
* `SKAFFOLD_TAIL` (same as `--tail`)
813+
* `SKAFFOLD_TOLERATE_FAILURES_STATUS_CHECK` (same as `--tolerate-failures-status-check`)
806814
* `SKAFFOLD_TOOT` (same as `--toot`)
807815
* `SKAFFOLD_TRIGGER` (same as `--trigger`)
808816
* `SKAFFOLD_WAIT_FOR_CONNECTION` (same as `--wait-for-connection`)
@@ -1095,6 +1103,7 @@ Options:
10951103
--sync-remote-cache='always': Controls how Skaffold manages the remote config cache (see `remote-cache-dir`). One of `always` (default), `missing`, or `never`. `always` syncs remote repositories to latest on access. `missing` only clones remote repositories if they do not exist locally. `never` means the user takes responsibility for updating remote repositories.
10961104
-t, --tag='': The optional custom tag to use for images which overrides the current Tagger configuration
10971105
--tail=false: Stream logs from deployed objects
1106+
--tolerate-failures-status-check=false: Configures `status-check` to tolerate failures until Skaffold's statusCheckDeadline duration or the deployment's progressDeadlineSeconds. Otherwise deployment failures skaffold encounters will immediately fail the deployment. Defaults to 'false'
10981107
--toot=false: Emit a terminal beep after the deploy is complete
10991108
--wait-for-connection=false: Blocks ending execution of skaffold until the /v2/events gRPC/HTTP endpoint is hit
11001109
--wait-for-deletions=true: Wait for pending deletions to complete before a deployment
@@ -1153,6 +1162,7 @@ Env vars:
11531162
* `SKAFFOLD_SYNC_REMOTE_CACHE` (same as `--sync-remote-cache`)
11541163
* `SKAFFOLD_TAG` (same as `--tag`)
11551164
* `SKAFFOLD_TAIL` (same as `--tail`)
1165+
* `SKAFFOLD_TOLERATE_FAILURES_STATUS_CHECK` (same as `--tolerate-failures-status-check`)
11561166
* `SKAFFOLD_TOOT` (same as `--toot`)
11571167
* `SKAFFOLD_WAIT_FOR_CONNECTION` (same as `--wait-for-connection`)
11581168
* `SKAFFOLD_WAIT_FOR_DELETIONS` (same as `--wait-for-deletions`)

pkg/skaffold/config/options.go

+75-74
Original file line numberDiff line numberDiff line change
@@ -32,80 +32,81 @@ type WaitForDeletions struct {
3232

3333
// SkaffoldOptions are options that are set by command line arguments not included in the config file itself
3434
type SkaffoldOptions struct {
35-
Apply bool
36-
AutoBuild bool
37-
AutoCreateConfig bool
38-
AutoDeploy bool
39-
AutoSync bool
40-
AssumeYes bool
41-
CacheArtifacts bool
42-
ContainerDebugging bool
43-
Cleanup bool
44-
DetectMinikube bool
45-
DryRun bool
46-
EnableRPC bool
47-
Force bool
48-
ForceLoadImages bool
49-
IterativeStatusCheck bool
50-
FastFailStatusCheck bool
51-
Notification bool
52-
NoPrune bool
53-
NoPruneChildren bool
54-
ProfileAutoActivation bool
55-
PropagateProfiles bool
56-
RenderOnly bool
57-
SkipTests bool
58-
SkipConfigDefaults bool
59-
Tail bool
60-
WaitForConnection bool
61-
EnablePlatformNodeAffinity bool
62-
EnableGKEARMNodeToleration bool
63-
DisableMultiPlatformBuild bool
64-
CheckClusterNodePlatforms bool
65-
MakePathsAbsolute *bool
66-
MultiLevelRepo *bool
67-
CloudRunProject string
68-
CloudRunLocation string
69-
ConfigurationFile string
70-
HydrationDir string
71-
InventoryNamespace string
72-
InventoryID string
73-
InventoryName string
74-
GlobalConfig string
75-
EventLogFile string
76-
RenderOutput string
77-
User string
78-
CustomTag string
79-
Namespace string
80-
CacheFile string
81-
Trigger string
82-
KubeContext string
83-
KubeConfig string
84-
LastLogFile string
85-
DigestSource string
86-
Command string
87-
MinikubeProfile string
88-
RepoCacheDir string
89-
TransformRulesFile string
90-
VerifyDockerNetwork string
91-
CustomLabels []string
92-
TargetImages []string
93-
Profiles []string
94-
InsecureRegistries []string
95-
ConfigurationFilter []string
96-
HydratedManifests []string
97-
Platforms []string
98-
BuildConcurrency int
99-
WatchPollInterval int
100-
StatusCheck BoolOrUndefined
101-
PushImages BoolOrUndefined
102-
RPCPort IntOrUndefined
103-
RPCHTTPPort IntOrUndefined
104-
Muted Muted
105-
PortForward PortForwardOptions
106-
DefaultRepo StringOrUndefined
107-
SyncRemoteCache SyncRemoteCacheOption
108-
WaitForDeletions WaitForDeletions
35+
Apply bool
36+
AutoBuild bool
37+
AutoCreateConfig bool
38+
AutoDeploy bool
39+
AutoSync bool
40+
AssumeYes bool
41+
CacheArtifacts bool
42+
ContainerDebugging bool
43+
Cleanup bool
44+
DetectMinikube bool
45+
DryRun bool
46+
EnableRPC bool
47+
Force bool
48+
ForceLoadImages bool
49+
IterativeStatusCheck bool
50+
FastFailStatusCheck bool
51+
TolerateFailuresStatusCheck bool
52+
Notification bool
53+
NoPrune bool
54+
NoPruneChildren bool
55+
ProfileAutoActivation bool
56+
PropagateProfiles bool
57+
RenderOnly bool
58+
SkipTests bool
59+
SkipConfigDefaults bool
60+
Tail bool
61+
WaitForConnection bool
62+
EnablePlatformNodeAffinity bool
63+
EnableGKEARMNodeToleration bool
64+
DisableMultiPlatformBuild bool
65+
CheckClusterNodePlatforms bool
66+
MakePathsAbsolute *bool
67+
MultiLevelRepo *bool
68+
CloudRunProject string
69+
CloudRunLocation string
70+
ConfigurationFile string
71+
HydrationDir string
72+
InventoryNamespace string
73+
InventoryID string
74+
InventoryName string
75+
GlobalConfig string
76+
EventLogFile string
77+
RenderOutput string
78+
User string
79+
CustomTag string
80+
Namespace string
81+
CacheFile string
82+
Trigger string
83+
KubeContext string
84+
KubeConfig string
85+
LastLogFile string
86+
DigestSource string
87+
Command string
88+
MinikubeProfile string
89+
RepoCacheDir string
90+
TransformRulesFile string
91+
VerifyDockerNetwork string
92+
CustomLabels []string
93+
TargetImages []string
94+
Profiles []string
95+
InsecureRegistries []string
96+
ConfigurationFilter []string
97+
HydratedManifests []string
98+
Platforms []string
99+
BuildConcurrency int
100+
WatchPollInterval int
101+
StatusCheck BoolOrUndefined
102+
PushImages BoolOrUndefined
103+
RPCPort IntOrUndefined
104+
RPCHTTPPort IntOrUndefined
105+
Muted Muted
106+
PortForward PortForwardOptions
107+
DefaultRepo StringOrUndefined
108+
SyncRemoteCache SyncRemoteCacheOption
109+
WaitForDeletions WaitForDeletions
109110
}
110111

111112
type RunMode string

pkg/skaffold/deploy/component/kubernetes/monitor_test.go

+2
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ func (m mockStatusConfig) GetKubeContext() string { return "" }
3838

3939
func (m mockStatusConfig) StatusCheckDeadlineSeconds() int { return 0 }
4040

41+
// StatusCheckTolerateFailures always reports false for this mock.
func (m mockStatusConfig) StatusCheckTolerateFailures() bool {
	return false
}
42+
4143
func (m mockStatusConfig) FastFailStatusCheck() bool { return true }
4244

4345
func (m mockStatusConfig) Muted() config.Muted { return config.Muted{} }

pkg/skaffold/kubernetes/status/status_check.go

+25-22
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ type Config interface {
7878

7979
StatusCheckDeadlineSeconds() int
8080
FastFailStatusCheck() bool
81+
StatusCheckTolerateFailures() bool
8182
Muted() config.Muted
8283
StatusCheck() *bool
8384
}
@@ -89,32 +90,34 @@ type Monitor interface {
8990
}
9091

9192
type monitor struct {
92-
cfg Config
93-
labeller *label.DefaultLabeller
94-
deadlineSeconds int
95-
muteLogs bool
96-
failFast bool
97-
seenResources resource.Group
98-
singleRun singleflight.Group
99-
namespaces *[]string
100-
kubeContext string
101-
manifests manifest.ManifestList
93+
cfg Config
94+
labeller *label.DefaultLabeller
95+
deadlineSeconds int
96+
muteLogs bool
97+
failFast bool
98+
tolerateFailures bool
99+
seenResources resource.Group
100+
singleRun singleflight.Group
101+
namespaces *[]string
102+
kubeContext string
103+
manifests manifest.ManifestList
102104
}
103105

104106
// NewStatusMonitor returns a status monitor which runs checks on selected resource rollouts.
105107
// Currently implemented for deployments and statefulsets.
106108
func NewStatusMonitor(cfg Config, labeller *label.DefaultLabeller, namespaces *[]string) Monitor {
107109
return &monitor{
108-
muteLogs: cfg.Muted().MuteStatusCheck(),
109-
cfg: cfg,
110-
labeller: labeller,
111-
deadlineSeconds: cfg.StatusCheckDeadlineSeconds(),
112-
seenResources: make(resource.Group),
113-
singleRun: singleflight.Group{},
114-
namespaces: namespaces,
115-
kubeContext: cfg.GetKubeContext(),
116-
manifests: make(manifest.ManifestList, 0),
117-
failFast: cfg.FastFailStatusCheck(),
110+
muteLogs: cfg.Muted().MuteStatusCheck(),
111+
cfg: cfg,
112+
labeller: labeller,
113+
deadlineSeconds: cfg.StatusCheckDeadlineSeconds(),
114+
seenResources: make(resource.Group),
115+
singleRun: singleflight.Group{},
116+
namespaces: namespaces,
117+
kubeContext: cfg.GetKubeContext(),
118+
manifests: make(manifest.ManifestList, 0),
119+
failFast: cfg.FastFailStatusCheck(),
120+
tolerateFailures: cfg.StatusCheckTolerateFailures(),
118121
}
119122
}
120123

@@ -353,7 +356,7 @@ func getStatefulSets(ctx context.Context, client kubernetes.Interface, ns string
353356
return resources, nil
354357
}
355358

356-
func pollResourceStatus(ctx context.Context, cfg kubectl.Config, r *resource.Resource) {
359+
func pollResourceStatus(ctx context.Context, cfg Config, r *resource.Resource) {
357360
pollDuration := time.Duration(defaultPollPeriodInMilliseconds) * time.Millisecond
358361
ticker := time.NewTicker(pollDuration)
359362
defer ticker.Stop()
@@ -387,7 +390,7 @@ func pollResourceStatus(ctx context.Context, cfg kubectl.Config, r *resource.Res
387390
// As any changes to build or deploy dependencies are not triggered, exit
388391
// immediately rather than waiting for statusCheckDeadlineSeconds
389392
// TODO: https://github.com/GoogleContainerTools/skaffold/pull/4591
390-
if r.HasEncounteredUnrecoverableError() {
393+
if r.HasEncounteredUnrecoverableError() && !cfg.StatusCheckTolerateFailures() {
391394
r.MarkComplete()
392395
return
393396
}

pkg/skaffold/runner/runcontext/context.go

+15
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,17 @@ func (ps Pipelines) TransformDenyList() []latest.ResourceFilter {
155155
return denylist
156156
}
157157

158+
func (ps Pipelines) StatusCheckTolerateFailures() bool {
159+
failureTolerance := false
160+
// set the group status check deadline to maximum of any individually specified value
161+
for _, p := range ps.pipelines {
162+
if p.Deploy.TolerateFailures {
163+
failureTolerance = true
164+
}
165+
}
166+
return failureTolerance
167+
}
168+
158169
func (ps Pipelines) StatusCheckDeadlineSeconds() int {
159170
c := 0
160171
// set the group status check deadline to maximum of any individually specified value
@@ -203,6 +214,10 @@ func (rc *RunContext) StatusCheckDeadlineSeconds() int {
203214
return rc.Pipelines.StatusCheckDeadlineSeconds()
204215
}
205216

217+
func (rc *RunContext) StatusCheckTolerateFailures() bool {
218+
return rc.Opts.TolerateFailuresStatusCheck || rc.Pipelines.StatusCheckTolerateFailures()
219+
}
220+
206221
func (rc *RunContext) SkipTests() bool {
207222
return rc.Opts.SkipTests
208223
}

0 commit comments

Comments
 (0)