Skip to content

Debug probes #5474

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion docs/content/en/docs/workflows/debug.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ syncing as it leads to users accidentally terminating debugging sessions by savi
These behaviours can be re-enabled with the `--auto-build`, `--auto-deploy`, and `--auto-sync`
flags.

## How It Works
## How It Works

`skaffold debug` examines the built artifacts to determine the underlying language runtime technology.
Kubernetes manifests that reference these artifacts are transformed on-the-fly to enable the
Expand All @@ -31,6 +31,32 @@ each of the appropriate containers. These images are hosted at
`gcr.io/k8s-skaffold/skaffold-debug-support`; alternative locations can be
specified in [Skaffold's global configuration]({{< relref "/docs/design/global-config.md" >}}).

`debug` makes some other adjustments to simplify the debug experience:

- *Replica Counts*: `debug` rewrites the replica counts to 1 for
deployments, replica sets, and stateful sets. This results in
requests being considered one at a time.

- *Kubernetes Probes*: `debug` changes the timeouts on HTTP-based
[liveness, readiness, and startup probes](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/)
to 600 seconds (10 minutes) from the default of 1 second.
This change allows probes to be debugged, and avoids negative
consequences from blocked probes when the app is already suspended
during a debugging session.
Failed liveness probes in particular result in the container
being terminated and restarted.

The probe timeout value can be set on a per-podspec basis by setting
a `debug.cloud.google.com/probe/timeouts` annotation on the podspec's metadata
with a valid duration (see [Go's time.ParseDuration()](https://pkg.go.dev/time#ParseDuration)).
This probe timeout-rewriting can be skipped entirely by using `skip`. For example:
```yaml
metadata:
  annotations:
    debug.cloud.google.com/probe/timeouts: skip
spec: ...
```

### Supported Language Runtimes

Debugging is currently supported for:
Expand Down
4 changes: 2 additions & 2 deletions pkg/skaffold/debug/debug_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -639,9 +639,9 @@ func TestArtifactImage(t *testing.T) {
testutil.CheckDeepEqual(t, true, strings.Contains(debugConfig, `"artifact":"gcr.io/random/image"`))
}

// TestTransformPodSpecSkips verifies that transformPodSpec skips podspecs that have a
// TestSkipAnnotatedPodSpec verifies that transformPodSpec skips podspecs that have a
// `debug.cloud.google.com/config` annotation.
func TestTransformPodSpecSkips(t *testing.T) {
func TestSkipAnnotatedPodSpec(t *testing.T) {
defer func(c []containerTransformer) { containerTransforms = c }(containerTransforms)
containerTransforms = append(containerTransforms, testTransformer{})

Expand Down
63 changes: 63 additions & 0 deletions pkg/skaffold/debug/transform.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ import (
"encoding/json"
"fmt"
"strings"
"time"

shell "github.com/kballard/go-shellquote"
"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -116,7 +117,12 @@ const (
debuggingSupportFilesVolume = "debugging-support-files"

// DebugConfigAnnotation is the name of the podspec annotation that records debugging configuration information.
// The annotation should be a JSON-encoded map of container-name to a `ContainerDebugConfiguration` object.
DebugConfigAnnotation = "debug.cloud.google.com/config"

// DebugProbeTimeoutsAnnotation is the name of the podspec annotation that controls rewriting of probe timeouts.
// The annotation value should be `skip` to disable rewriting, or a duration accepted by time.ParseDuration.
DebugProbeTimeoutsAnnotation = "debug.cloud.google.com/probe/timeouts"
)

// containerTransforms are the set of configured transformers
Expand Down Expand Up @@ -199,6 +205,63 @@ func transformManifest(obj runtime.Object, retrieveImageConfiguration configurat
// transformPodSpec attempts to configure a podspec for debugging by rewriting
// its containers and then its probe timeouts.
// Returns true if either step changed the podspec, false otherwise.
func transformPodSpec(metadata *metav1.ObjectMeta, podSpec *v1.PodSpec, retrieveImageConfiguration configurationRetriever, debugHelpersRegistry string) bool {
	// Containers must be rewritten first: rewriteProbes only touches containers
	// that have been marked for debugging.
	changed := rewriteContainers(metadata, podSpec, retrieveImageConfiguration, debugHelpersRegistry)
	// Always run the probe rewrite, even if the containers were already changed.
	if rewriteProbes(metadata, podSpec) {
		changed = true
	}
	return changed
}

// rewriteProbes raises the timeouts of the HTTP-GET liveness, readiness, and
// startup probes of containers listed in the podspec's debug configuration, so
// that probes can be debugged. The minimum timeout defaults to 10 minutes and
// may be overridden with the DebugProbeTimeoutsAnnotation annotation; the value
// `skip` disables the rewrite altogether.
// Returns true if any probe was changed, false otherwise.
func rewriteProbes(metadata *metav1.ObjectMeta, podSpec *v1.PodSpec) bool {
	minTimeout := 10 * time.Minute
	if value, ok := metadata.Annotations[DebugProbeTimeoutsAnnotation]; ok {
		if value == "skip" {
			logrus.Debugf("skipping probe rewrite on %q by request", metadata.Name)
			return false
		}
		parsed, err := time.ParseDuration(value)
		if err == nil {
			minTimeout = parsed
		} else {
			// An unparseable value is reported and the default timeout is kept.
			logrus.Warnf("invalid probe timeout value for %q: %q: %v", metadata.Name, value, err)
		}
	}

	encoded, ok := metadata.Annotations[DebugConfigAnnotation]
	if !ok {
		logrus.Debugf("skipping probe rewrite on %q: not configured for debugging", metadata.Name)
		return false
	}
	var debugConfigs map[string]ContainerDebugConfiguration
	if err := json.Unmarshal([]byte(encoded), &debugConfigs); err != nil {
		logrus.Warnf("error unmarshalling debugging configuration for %q: %v", metadata.Name, err)
		return false
	}

	anyRewritten := false
	for idx := range podSpec.Containers {
		container := &podSpec.Containers[idx]
		// Containers absent from the debug configuration are left alone.
		if _, debugged := debugConfigs[container.Name]; !debugged {
			continue
		}
		rewritten := false
		for _, probe := range []*v1.Probe{container.LivenessProbe, container.ReadinessProbe, container.StartupProbe} {
			if rewriteHTTPGetProbe(probe, minTimeout) {
				rewritten = true
			}
		}
		if rewritten {
			logrus.Infof("Updated probe timeouts for %s/%s", metadata.Name, container.Name)
			anyRewritten = true
		}
	}
	return anyRewritten
}

// rewriteHTTPGetProbe raises the timeout of an HTTP-GET probe to minTimeout.
// Probes that are nil, that are not HTTP-GET based, or whose timeout already
// meets or exceeds minTimeout are left untouched.
// Returns true if the probe was changed, false otherwise.
func rewriteHTTPGetProbe(probe *v1.Probe, minTimeout time.Duration) bool {
	if probe == nil || probe.HTTPGet == nil {
		return false
	}
	// TimeoutSeconds is an int32: clamp so that a very large duration cannot
	// overflow into a bogus (possibly negative) timeout.
	const maxTimeoutSeconds = 1<<31 - 1
	seconds := int64(minTimeout / time.Second)
	if seconds > maxTimeoutSeconds {
		seconds = maxTimeoutSeconds
	}
	timeout := int32(seconds)
	// A probe already at the minimum needs no rewrite and must not be reported
	// as changed.
	if timeout <= probe.TimeoutSeconds {
		return false
	}
	probe.TimeoutSeconds = timeout
	return true
}

func rewriteContainers(metadata *metav1.ObjectMeta, podSpec *v1.PodSpec, retrieveImageConfiguration configurationRetriever, debugHelpersRegistry string) bool {
// skip annotated podspecs — allows users to customize their own image
if _, found := metadata.Annotations[DebugConfigAnnotation]; found {
return false
Expand Down
149 changes: 149 additions & 0 deletions pkg/skaffold/debug/transform_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@ import (
"reflect"
"strings"
"testing"
"time"

"github.com/google/go-cmp/cmp"
appsv1 "k8s.io/api/apps/v1"
batchv1 "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/intstr"

"github.com/GoogleContainerTools/skaffold/testutil"
)
Expand Down Expand Up @@ -332,3 +334,150 @@ func TestUpdateForShDashC(t *testing.T) {
})
}
}

// TestRewriteHTTPGetProbe verifies that rewriteHTTPGetProbe only raises the
// timeout of HTTP-GET probes whose timeout is below the requested minimum,
// and that its return value reports whether the probe was changed.
func TestRewriteHTTPGetProbe(t *testing.T) {
	execHandler := v1.Handler{Exec: &v1.ExecAction{Command: []string{"echo"}}}
	httpHandler := v1.Handler{HTTPGet: &v1.HTTPGetAction{Path: "/", Port: intstr.FromInt(8080)}}

	tests := []struct {
		description string
		input       v1.Probe
		minTimeout  time.Duration
		changed     bool
		expected    v1.Probe
	}{
		{
			description: "non-http probe should be skipped",
			input:       v1.Probe{Handler: execHandler, TimeoutSeconds: 10},
			minTimeout:  20 * time.Second,
			changed:     false,
		},
		{
			description: "http probe with big timeout should be skipped",
			input:       v1.Probe{Handler: httpHandler, TimeoutSeconds: 100 * 60},
			minTimeout:  20 * time.Second,
			changed:     false,
		},
		{
			description: "http probe with no timeout",
			input:       v1.Probe{Handler: httpHandler},
			minTimeout:  20 * time.Second,
			changed:     true,
			expected:    v1.Probe{Handler: httpHandler, TimeoutSeconds: 20},
		},
		{
			description: "http probe with small timeout",
			input:       v1.Probe{Handler: httpHandler, TimeoutSeconds: 60},
			minTimeout:  100 * time.Second,
			changed:     true,
			expected:    v1.Probe{Handler: httpHandler, TimeoutSeconds: 100},
		},
	}
	for _, test := range tests {
		testutil.Run(t, test.description, func(t *testutil.T) {
			// Copying the struct is enough here: only the TimeoutSeconds value
			// field is modified by rewriteHTTPGetProbe.
			p := test.input
			changed := rewriteHTTPGetProbe(&p, test.minTimeout)
			t.CheckDeepEqual(test.changed, changed)
			if test.changed {
				t.CheckDeepEqual(test.expected, p)
			} else {
				t.CheckDeepEqual(test.input, p) // should not have changed
			}
		})
	}
}

// TestRewriteProbes verifies that rewriteProbes only rewrites probe timeouts on
// podspecs that carry a `debug.cloud.google.com/config` annotation, and that it
// honours the `debug.cloud.google.com/probe/timeouts` annotation (`skip`, an
// explicit duration, or an invalid value that falls back to the default).
func TestRewriteProbes(t *testing.T) {
	const debugConfig = `{"name1":{"runtime":"test"}}`

	// newPod builds a single-container pod whose container has an HTTP-GET
	// liveness probe with the given timeout.
	newPod := func(annotations map[string]string, timeoutSeconds int32) v1.Pod {
		return v1.Pod{
			TypeMeta:   metav1.TypeMeta{APIVersion: v1.SchemeGroupVersion.Version, Kind: "Pod"},
			ObjectMeta: metav1.ObjectMeta{Name: "podname", Annotations: annotations},
			Spec: v1.PodSpec{Containers: []v1.Container{{
				Name:          "name1",
				Image:         "image1",
				LivenessProbe: &v1.Probe{Handler: v1.Handler{HTTPGet: &v1.HTTPGetAction{Path: "/", Port: intstr.FromInt(8080)}}, TimeoutSeconds: timeoutSeconds},
			}}},
		}
	}

	tests := []struct {
		name    string
		input   v1.Pod
		changed bool
		result  v1.Pod
	}{
		{
			name:    "skips pod missing debug annotation",
			input:   newPod(nil, 1),
			changed: false,
		},
		{
			name:    "processes pod with debug annotation and uses default timeout",
			input:   newPod(map[string]string{"debug.cloud.google.com/config": debugConfig}, 1),
			changed: true,
			result:  newPod(map[string]string{"debug.cloud.google.com/config": debugConfig}, 600),
		},
		{
			// The debug annotation is present so that only the skip request
			// can be responsible for the pod being left untouched.
			name:    "skips pod with skip-probes annotation",
			input:   newPod(map[string]string{"debug.cloud.google.com/config": debugConfig, "debug.cloud.google.com/probe/timeouts": `skip`}, 1),
			changed: false,
		},
		{
			name:    "processes pod with probes annotation with explicit timeout",
			input:   newPod(map[string]string{"debug.cloud.google.com/config": debugConfig, "debug.cloud.google.com/probe/timeouts": `1m`}, 1),
			changed: true,
			result:  newPod(map[string]string{"debug.cloud.google.com/config": debugConfig, "debug.cloud.google.com/probe/timeouts": `1m`}, 60),
		},
		{
			// An unparseable duration falls back to the default timeout.
			name:    "processes pod with probes annotation with invalid timeout",
			input:   newPod(map[string]string{"debug.cloud.google.com/config": debugConfig, "debug.cloud.google.com/probe/timeouts": `on`}, 1),
			changed: true,
			result:  newPod(map[string]string{"debug.cloud.google.com/config": debugConfig, "debug.cloud.google.com/probe/timeouts": `on`}, 600),
		},
	}
	for _, test := range tests {
		testutil.Run(t, test.name, func(t *testutil.T) {
			// Deep-copy the input: a plain struct copy would share the probe
			// pointer, so an unwanted rewrite would also alter test.input and
			// go unnoticed by the "unchanged" comparison below.
			pod := test.input.DeepCopy()
			result := rewriteProbes(&pod.ObjectMeta, &pod.Spec)
			t.CheckDeepEqual(test.changed, result)
			if test.changed {
				t.CheckDeepEqual(test.result, *pod)
			} else {
				t.CheckDeepEqual(test.input, *pod)
			}
		})
	}
}