Skip to content

Commit 3e3ec18

Browse files
authored
Merge pull request #1936 from rksharma95/snitch-conditional-mounts
fix(operator): snitch conditional mounts
2 parents 59ac302 + 95be886 commit 3e3ec18

File tree

3 files changed

+183
-16
lines changed

3 files changed

+183
-16
lines changed

deployments/helm/KubeArmorOperator/templates/clusterrole-rbac.yaml

+28
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,34 @@ rules:
246246
---
247247
apiVersion: rbac.authorization.k8s.io/v1
248248
kind: Role
249+
metadata:
250+
name: {{ .Values.kubearmorOperator.name }}-manage-snitch-job
251+
namespace: {{ .Release.Namespace }}
252+
rules:
253+
# to handle snitch mounts dynamically
254+
- apiGroups:
255+
- ""
256+
resources:
257+
- events
258+
verbs:
259+
- list
260+
- apiGroups:
261+
- ""
262+
resources:
263+
- pods
264+
verbs:
265+
- list
266+
- apiGroups:
267+
- batch
268+
resources:
269+
- jobs
270+
verbs:
271+
- get
272+
- create
273+
- delete
274+
---
275+
apiVersion: rbac.authorization.k8s.io/v1
276+
kind: Role
249277
metadata:
250278
name: {{ .Values.kubearmorOperator.name }}-tls-secrets-role
251279
namespace: {{ .Release.Namespace }}

pkg/KubeArmorOperator/internal/controller/cluster.go

+154-15
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"context"
99
"fmt"
1010
"reflect"
11+
"regexp"
1112
"slices"
1213
"sort"
1314
"strconv"
@@ -103,6 +104,142 @@ func NewClusterWatcher(client *kubernetes.Clientset, log *zap.SugaredLogger, ext
103104
}
104105
}
105106

107+
// volumeNameRe captures the volume name quoted after the word "volume" in a
// kubelet event message, e.g.:
//   MountVolume.SetUp failed for volume "notexists-path"
// Compiled once at package scope so per-event calls don't recompile it.
var volumeNameRe = regexp.MustCompile(`volume\s*"([^"]+)"`)

// extractVolumeFromMessage returns the volume name quoted in a FailedMount-style
// event message. The second return value reports whether a name was found.
func extractVolumeFromMessage(message string) (string, bool) {
	matches := volumeNameRe.FindStringSubmatch(message)
	if len(matches) > 1 {
		return matches[1], true
	}
	return "", false
}
118+
119+
// mkdirPathRe captures the directory path quoted after "mkdir" in a container
// failure message, e.g.:
//   failed to mkdir "/etc/apparmor.d/": mkdir /etc/apparmor.d/: read-only file system
// Compiled once at package scope so per-event calls don't recompile it.
var mkdirPathRe = regexp.MustCompile(`mkdir\s+"([^"]+)"`)

// extractPathFromMessage returns the mount path quoted after "mkdir" in a
// failed-container event message (typically a read-only filesystem error).
// The second return value reports whether a path was found.
func extractPathFromMessage(message string) (string, bool) {
	matches := mkdirPathRe.FindStringSubmatch(message)
	if len(matches) > 1 {
		return matches[1], true
	}
	return "", false
}
130+
131+
func (clusterWatcher *ClusterWatcher) checkJobStatus(job, runtime, nodename string) {
132+
defer func() {
133+
clusterWatcher.Log.Infof("checkJobStatus completed for job: %s", job)
134+
}()
135+
136+
for {
137+
select {
138+
case <-time.After(5 * time.Minute):
139+
clusterWatcher.Log.Infof("watcher exit after timeout for job: %s", job)
140+
return
141+
default:
142+
clusterWatcher.Log.Infof("watching status for job: %s", job)
143+
144+
j, err := clusterWatcher.Client.BatchV1().Jobs(common.Namespace).Get(context.TODO(), job, v1.GetOptions{})
145+
if err != nil {
146+
clusterWatcher.Log.Warnf("cannot get job: %s", job)
147+
return
148+
}
149+
150+
if j.Status.Succeeded > 0 {
151+
return
152+
}
153+
154+
podsList, err := clusterWatcher.Client.CoreV1().Pods(common.Namespace).List(context.TODO(), v1.ListOptions{
155+
LabelSelector: fmt.Sprintf("job-name=%s", job),
156+
})
157+
158+
if err != nil {
159+
clusterWatcher.Log.Warnf("Cannot get job pod: %s", job)
160+
return
161+
}
162+
163+
for _, pod := range podsList.Items {
164+
mountFailure := false
165+
failedMount := ""
166+
events, err := clusterWatcher.Client.CoreV1().Events(common.Namespace).List(context.TODO(), v1.ListOptions{
167+
FieldSelector: fmt.Sprintf("involvedObject.name=%s", pod.Name),
168+
})
169+
if err != nil {
170+
clusterWatcher.Log.Warnf("cannot get pod events for pod: %s", pod.Name)
171+
return
172+
}
173+
174+
for _, event := range events.Items {
175+
if event.Type == "Warning" && (event.Reason == "FailedMount" ||
176+
event.Reason == "FailedAttachVolume" ||
177+
event.Reason == "VolumeMountsFailed") {
178+
clusterWatcher.Log.Infof("Got Failed Event for job pod: %v", event.Message)
179+
mountFailure = true
180+
failedMount, _ = extractVolumeFromMessage(event.Message)
181+
clusterWatcher.Log.Infof("FailedMount: %s", failedMount)
182+
break
183+
}
184+
185+
if event.Type == "Warning" && event.Reason == "Failed" && strings.Contains(event.Message, "mkdir") {
186+
clusterWatcher.Log.Infof("Got Failed Event for job pod: %v", event.Message)
187+
if path, readOnly := extractPathFromMessage(event.Message); readOnly {
188+
failedMount = path
189+
mountFailure = true
190+
clusterWatcher.Log.Infof("ReadOnly FS: %s", failedMount)
191+
break
192+
}
193+
}
194+
}
195+
196+
if mountFailure {
197+
propogatePodDeletion := v1.DeletePropagationBackground
198+
err := clusterWatcher.Client.BatchV1().Jobs(common.Namespace).Delete(context.TODO(), job, v1.DeleteOptions{
199+
PropagationPolicy: &propogatePodDeletion,
200+
})
201+
if err != nil {
202+
clusterWatcher.Log.Warnf("Cannot delete job: %s, err=%s", job, err)
203+
return
204+
}
205+
206+
newJob := deploySnitch(nodename, runtime)
207+
208+
volumeToDelete := ""
209+
for _, vol := range newJob.Spec.Template.Spec.Volumes {
210+
if vol.HostPath.Path == failedMount || vol.Name == failedMount {
211+
volumeToDelete = vol.Name
212+
break
213+
}
214+
}
215+
216+
newJob.Spec.Template.Spec.Volumes = slices.DeleteFunc(newJob.Spec.Template.Spec.Volumes, func(vol corev1.Volume) bool {
217+
if vol.Name == volumeToDelete {
218+
return true
219+
}
220+
return false
221+
})
222+
223+
newJob.Spec.Template.Spec.Containers[0].VolumeMounts = slices.DeleteFunc(newJob.Spec.Template.Spec.Containers[0].VolumeMounts, func(volMount corev1.VolumeMount) bool {
224+
if volMount.Name == volumeToDelete {
225+
return true
226+
}
227+
return false
228+
})
229+
230+
newJ, err := clusterWatcher.Client.BatchV1().Jobs(common.Namespace).Create(context.TODO(), newJob, v1.CreateOptions{})
231+
if err != nil {
232+
clusterWatcher.Log.Warnf("Cannot create job: %s, error=%s", newJob.Name, err)
233+
return
234+
}
235+
job = newJ.Name
236+
break
237+
}
238+
}
239+
}
240+
}
241+
}
242+
106243
func (clusterWatcher *ClusterWatcher) WatchNodes() {
107244
log := clusterWatcher.Log
108245
nodeInformer := informer.Core().V1().Nodes().Informer()
@@ -113,12 +250,13 @@ func (clusterWatcher *ClusterWatcher) WatchNodes() {
113250
runtime = strings.Split(runtime, ":")[0]
114251
if val, ok := node.Labels[common.OsLabel]; ok && val == "linux" {
115252
log.Infof("Installing snitch on node %s", node.Name)
116-
_, err := clusterWatcher.Client.BatchV1().Jobs(common.Namespace).Create(context.Background(), deploySnitch(node.Name, runtime), v1.CreateOptions{})
253+
snitchJob, err := clusterWatcher.Client.BatchV1().Jobs(common.Namespace).Create(context.Background(), deploySnitch(node.Name, runtime), v1.CreateOptions{})
117254
if err != nil {
118255
log.Errorf("Cannot run snitch on node %s, error=%s", node.Name, err.Error())
119256
return
120257
}
121258
log.Infof("Snitch was installed on node %s", node.Name)
259+
go clusterWatcher.checkJobStatus(snitchJob.Name, runtime, node.Name)
122260
}
123261
}
124262
},
@@ -136,12 +274,13 @@ func (clusterWatcher *ClusterWatcher) WatchNodes() {
136274
clusterWatcher.Log.Infof("Node might have been restarted, redeploying snitch ")
137275
if val, ok := node.Labels[common.OsLabel]; ok && val == "linux" {
138276
log.Infof("Installing snitch on node %s", node.Name)
139-
_, err := clusterWatcher.Client.BatchV1().Jobs(common.Namespace).Create(context.Background(), deploySnitch(node.Name, runtime), v1.CreateOptions{})
277+
snitchJob, err := clusterWatcher.Client.BatchV1().Jobs(common.Namespace).Create(context.Background(), deploySnitch(node.Name, runtime), v1.CreateOptions{})
140278
if err != nil {
141279
log.Errorf("Cannot run snitch on node %s, error=%s", node.Name, err.Error())
142280
return
143281
}
144282
log.Infof("Snitch was installed on node %s", node.Name)
283+
go clusterWatcher.checkJobStatus(snitchJob.Name, runtime, node.Name)
145284
}
146285
}
147286
}
@@ -788,14 +927,14 @@ func (clusterWatcher *ClusterWatcher) UpdateCrdStatus(cfg, phase, message string
788927
// retry the update
789928
return false, nil
790929
}
930+
clusterWatcher.Log.Info("Config CR Status Updated Successfully")
791931
}
792932
return true, nil
793933
})
794934
if err != nil {
795935
clusterWatcher.Log.Errorf("Error updating the ConfigCR status %s", err)
796936
return
797937
}
798-
clusterWatcher.Log.Info("Config CR Status Updated Successfully")
799938
}
800939

801940
func (clusterWatcher *ClusterWatcher) UpdateKubeArmorConfigMap(cfg *opv1.KubeArmorConfig) {
@@ -1002,19 +1141,19 @@ func (clusterWatcher *ClusterWatcher) WatchRecommendedPolicies() error {
10021141
var yamlBytes []byte
10031142
policies, err := recommend.CRDFs.ReadDir(".")
10041143
if err != nil {
1005-
clusterWatcher.Log.Warnf("error reading policies FS", err)
1144+
clusterWatcher.Log.Warnf("error reading policies FS %s", err)
10061145
return err
10071146
}
10081147
for _, policy := range policies {
10091148
csp := &secv1.KubeArmorClusterPolicy{}
10101149
if !policy.IsDir() {
10111150
yamlBytes, err = recommend.CRDFs.ReadFile(policy.Name())
10121151
if err != nil {
1013-
clusterWatcher.Log.Warnf("error reading csp", policy.Name())
1152+
clusterWatcher.Log.Warnf("error reading csp %s", policy.Name())
10141153
continue
10151154
}
10161155
if err := runtime.DecodeInto(scheme.Codecs.UniversalDeserializer(), yamlBytes, csp); err != nil {
1017-
clusterWatcher.Log.Warnf("error decoding csp", policy.Name())
1156+
clusterWatcher.Log.Warnf("error decoding csp %s", policy.Name())
10181157
continue
10191158
}
10201159
}
@@ -1024,31 +1163,31 @@ func (clusterWatcher *ClusterWatcher) WatchRecommendedPolicies() error {
10241163
clusterWatcher.Log.Infof("excluding csp ", csp.Name)
10251164
err = clusterWatcher.Secv1Client.SecurityV1().KubeArmorClusterPolicies().Delete(context.Background(), csp.GetName(), metav1.DeleteOptions{})
10261165
if err != nil && !metav1errors.IsNotFound(err) {
1027-
clusterWatcher.Log.Warnf("error deleting csp", csp.GetName())
1166+
clusterWatcher.Log.Warnf("error deleting csp %s", csp.GetName())
10281167
} else if err == nil {
1029-
clusterWatcher.Log.Infof("deleted csp", csp.GetName())
1168+
clusterWatcher.Log.Infof("deleted csp :%s", csp.GetName())
10301169
}
10311170
continue
10321171
}
10331172
csp.Spec.Selector.MatchExpressions = common.RecommendedPolicies.MatchExpressions
10341173
_, err = clusterWatcher.Secv1Client.SecurityV1().KubeArmorClusterPolicies().Create(context.Background(), csp, metav1.CreateOptions{})
10351174
if err != nil && !metav1errors.IsAlreadyExists(err) {
1036-
clusterWatcher.Log.Warnf("error creating csp", csp.GetName())
1175+
clusterWatcher.Log.Warnf("error creating csp %s", csp.GetName())
10371176
continue
10381177
} else if metav1errors.IsAlreadyExists(err) {
10391178
pol, err := clusterWatcher.Secv1Client.SecurityV1().KubeArmorClusterPolicies().Get(context.Background(), csp.GetName(), metav1.GetOptions{})
10401179
if err != nil {
1041-
clusterWatcher.Log.Warnf("error getting csp", csp.GetName())
1180+
clusterWatcher.Log.Warnf("error getting csp %s", csp.GetName())
10421181
continue
10431182
}
10441183
if !reflect.DeepEqual(pol.Spec.Selector.MatchExpressions, common.RecommendedPolicies.MatchExpressions) {
10451184
pol.Spec.Selector.MatchExpressions = common.RecommendedPolicies.MatchExpressions
10461185
_, err := clusterWatcher.Secv1Client.SecurityV1().KubeArmorClusterPolicies().Update(context.Background(), pol, metav1.UpdateOptions{})
10471186
if err != nil {
1048-
clusterWatcher.Log.Warnf("error updating csp", csp.GetName())
1187+
clusterWatcher.Log.Warnf("error updating csp %s", csp.GetName())
10491188
continue
10501189
} else {
1051-
clusterWatcher.Log.Info("updated csp", csp.GetName())
1190+
clusterWatcher.Log.Infof("updated csp %s", csp.GetName())
10521191
}
10531192
}
10541193
} else {
@@ -1058,10 +1197,10 @@ func (clusterWatcher *ClusterWatcher) WatchRecommendedPolicies() error {
10581197
if !policy.IsDir() {
10591198
err = clusterWatcher.Secv1Client.SecurityV1().KubeArmorClusterPolicies().Delete(context.Background(), csp.GetName(), metav1.DeleteOptions{})
10601199
if err != nil && !metav1errors.IsNotFound(err) {
1061-
clusterWatcher.Log.Warnf("error deleting csp", csp.GetName())
1200+
clusterWatcher.Log.Warnf("error deleting csp %s", csp.GetName())
10621201
continue
1063-
} else {
1064-
clusterWatcher.Log.Info("deleted csp", csp.GetName())
1202+
} else if err == nil {
1203+
clusterWatcher.Log.Info("deleted csp %s", csp.GetName())
10651204
}
10661205
}
10671206
}

pkg/KubeArmorOperator/internal/controller/resources.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@ func deploySnitch(nodename string, runtime string) *batchv1.Job {
322322
VolumeSource: corev1.VolumeSource{
323323
HostPath: &corev1.HostPathVolumeSource{
324324
Path: "/etc/apparmor.d/",
325-
Type: &common.HostPathDirectoryOrCreate,
325+
Type: &common.HostPathDirectory,
326326
},
327327
},
328328
},

0 commit comments

Comments
 (0)