Skip to content

Commit 25dd95e

Browse files
committed
Get allocated nvidia gpus from pod namespace
Signed-off-by: Cyclinder Kuo <[email protected]>
1 parent 4ca67bd commit 25dd95e

22 files changed

+861
-185
lines changed

go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ require (
4545
go.uber.org/zap v1.27.0
4646
golang.org/x/net v0.35.0
4747
golang.org/x/sync v0.11.0 // indirect
48-
golang.org/x/sys v0.30.0
48+
golang.org/x/sys v0.33.0
4949
golang.org/x/tools v0.29.0
5050
gopkg.in/natefinch/lumberjack.v2 v2.2.1
5151
gopkg.in/yaml.v2 v2.4.0

go.sum

+2-2
Original file line numberDiff line numberDiff line change
@@ -805,8 +805,8 @@ golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
805805
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
806806
golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
807807
golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
808-
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
809-
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
808+
golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
809+
golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
810810
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
811811
golang.org/x/term v0.0.0-20210615171337-6886f2dfbf5b/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
812812
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=

pkg/dra/config.go

-27
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,6 @@ package dra
33
import (
44
"encoding/json"
55
"fmt"
6-
"regexp"
7-
"strings"
8-
"unicode"
96

107
resourcev1beta1 "k8s.io/api/resource/v1beta1"
118
)
@@ -111,27 +108,3 @@ func (d *NetworkConfig) GetResourceNames() []string {
111108
func MultusAnnotationValue(namespace, name string) string {
112109
return fmt.Sprintf("%s/%s", namespace, name)
113110
}
114-
115-
// NormalizedDNS1123Label normalizes the interface name to a valid DNS1123 label
116-
func NormalizedDNS1123Label(iface string) string {
117-
// Convert to lowercase
118-
normalized := strings.ToLower(iface)
119-
// Replace invalid chars with hyphen
120-
reg := regexp.MustCompile("[^a-z0-9-]")
121-
normalized = reg.ReplaceAllString(normalized, "-")
122-
// Remove leading and trailing hyphens
123-
normalized = strings.Trim(normalized, "-")
124-
// Replace multiple consecutive hyphens with a single one
125-
reg = regexp.MustCompile("-+")
126-
normalized = reg.ReplaceAllString(normalized, "-")
127-
128-
// If the string is empty after normalization, use a default name
129-
if normalized == "" {
130-
normalized = "iface"
131-
}
132-
// If it starts with a number, prefix it
133-
if unicode.IsDigit(rune(normalized[0])) {
134-
normalized = "iface-" + normalized
135-
}
136-
return normalized
137-
}

pkg/dra/driver.go

+2-3
Original file line numberDiff line numberDiff line change
@@ -161,10 +161,11 @@ func (d *Driver) prepareMultusConfigs(pod *corev1.Pod, configs []resourcev1beta1
161161

162162
// PublishResources periodically publishes the available SR-IOV resources
163163
func (d *Driver) PublishResources(ctx context.Context) {
164-
d.logger.Info("Starting to publish resources")
165164
devices := d.state.GetNetDevices()
166165
if err := d.draPlugin.PublishResources(ctx, kubeletplugin.Resources{Devices: devices}); err != nil {
167166
d.logger.Error("failed to publish resources", zap.Error(err))
167+
} else {
168+
d.logger.Info("Published DRA resources")
168169
}
169170

170171
ticker := time.NewTicker(time.Minute)
@@ -183,8 +184,6 @@ func (d *Driver) PublishResources(ctx context.Context) {
183184
devices := d.state.GetNetDevices()
184185
if err := d.draPlugin.PublishResources(ctx, kubeletplugin.Resources{Devices: devices}); err != nil {
185186
d.logger.Error("failed to publish resources", zap.Error(err))
186-
} else {
187-
d.logger.Info("published resources")
188187
}
189188
}
190189
}

pkg/dra/nri/gpu.go

+33-17
Original file line numberDiff line numberDiff line change
@@ -2,43 +2,59 @@ package nri
22

33
import (
44
"context"
5-
"strings"
65

76
"github.com/containerd/nri/pkg/api"
87
"go.uber.org/zap"
98
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
109
)
1110

1211
const (
12+
NvidiaGPU = iota
13+
// More GPU vendor
14+
)
15+
16+
var (
1317
NvidiaGPUResourceName = "nvidia.com/gpu"
18+
NvidiaDriverGPUPath = "/proc/driver/nvidia/gpus"
1419
)
1520

1621
func (n *nriPlugin) getAllocatedGpusForPodSandbox(ctx context.Context, pod *api.PodSandbox) (gpus []string, err error) {
1722
n.logger.Info("Getting allocated GPUs for pod", zap.String("podID", pod.GetId()))
1823

19-
podResources, err := n.kubeletClient.Get(ctx, &podresourcesapi.GetPodResourcesRequest{
20-
PodName: pod.GetName(),
21-
PodNamespace: pod.GetNamespace(),
22-
})
24+
// It shoule be better use Get function here, but we should enable the kubelet feature-gate
25+
// "KubeletPodResourcesGetAllocatable"(alpha in 1.27).
26+
// podResources, err := n.kubeletClient.Get(ctx, &podresourcesapi.GetPodResourcesRequest{
27+
// PodName: pod.GetName(),
28+
// PodNamespace: pod.GetNamespace(),
29+
// })
30+
31+
resp, err := n.kubeletClient.List(ctx, &podresourcesapi.ListPodResourcesRequest{})
2332
if err != nil {
2433
n.logger.Error("Failed to get pod resource map", zap.Error(err))
2534
return
2635
}
2736

28-
return GetPodAllocatedGpuResources(NvidiaGPUResourceName, podResources), nil
29-
}
37+
for _, r := range resp.PodResources {
38+
if r.Name == pod.Name && r.Namespace == pod.Namespace {
39+
return n.getPodAllocatedGpuResources(pod, r)
40+
}
41+
}
3042

31-
func (n *nriPlugin) getAllocatedGpusForContainer(contaienr *api.Container) (gpus []string, err error) {
32-
n.logger.Info("Debug GPU in CreateContainer", zap.Any("container", contaienr))
33-
return nil, nil
43+
// return if no any resources allocated
44+
return
3445
}
3546

36-
// 从设备路径中提取设备ID
37-
func extractDeviceIDFromPath(path string) string {
38-
// 假设路径格式为 /dev/nvidia0
39-
parts := strings.Split(path, "nvidia")
40-
if len(parts) < 2 {
41-
return ""
47+
func (n *nriPlugin) getPodAllocatedGpuResources(sandbox *api.PodSandbox, PodResources *podresourcesapi.PodResources) ([]string, error) {
48+
var gpuDevices []string
49+
for _, c := range PodResources.Containers {
50+
// device plugin resouresouces
51+
for _, dev := range c.Devices {
52+
if dev.ResourceName == NvidiaGPUResourceName {
53+
// get allocated nvidia gpus from the pod
54+
return GetAllocatedNvidiaGpusFromPodNamespace(int(sandbox.Pid), NvidiaDriverGPUPath)
55+
}
56+
}
4257
}
43-
return parts[1]
58+
59+
return gpuDevices, nil
4460
}

pkg/dra/nri/kubelet_resources.go

-81
This file was deleted.

pkg/dra/nri/nri.go

+17-15
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,17 @@ var (
2222
)
2323

2424
type nriPlugin struct {
25-
logger *zap.Logger
26-
nri stub.Stub
27-
kubeletClient podresourcesapi.PodResourcesListerClient
28-
conn *grpc.ClientConn
25+
gpuResourceNames map[string]struct{}
26+
logger *zap.Logger
27+
nri stub.Stub
28+
kubeletClient podresourcesapi.PodResourcesListerClient
29+
conn *grpc.ClientConn
2930
}
3031

3132
func Run(ctx context.Context) error {
3233
n := &nriPlugin{
33-
logger: logutils.Logger.Named("nri"),
34+
logger: logutils.Logger.Named("nri"),
35+
gpuResourceNames: make(map[string]struct{}),
3436
}
3537
// register the NRI plugin
3638
nriOpts := []stub.Option{
@@ -43,13 +45,13 @@ func Run(ctx context.Context) error {
4345
}
4446
n.nri = stub
4547

46-
kubeletClient, conn, err := GetKubeletResourceClient()
48+
n.kubeletClient, n.conn, err = GetKubeletResourceClient()
4749
if err != nil {
4850
return err
4951
}
5052

51-
n.kubeletClient = kubeletClient
52-
n.conn = conn
53+
// TODO: make it configuiretable
54+
n.gpuResourceNames[NvidiaGPUResourceName] = struct{}{}
5355

5456
go func() {
5557
if err = n.nri.Run(ctx); err != nil {
@@ -64,10 +66,11 @@ func Run(ctx context.Context) error {
6466

6567
func (n *nriPlugin) RunPodSandbox(ctx context.Context, pod *api.PodSandbox) error {
6668
n.logger.Info("RunPodSandbox is called", zap.Any("pod", pod))
67-
// 1. get pod net namespace
68-
// 2. get multus config
69-
// 3. get
70-
gpus, _ := n.getAllocatedGpusForPodSandbox(ctx, pod)
69+
gpus, err := n.getAllocatedGpusForPodSandbox(ctx, pod)
70+
if err != nil {
71+
n.logger.Error(err.Error())
72+
return err
73+
}
7174
n.logger.Info("Allocated GPUs for pod", zap.Strings("gpus", gpus))
7275
return nil
7376
}
@@ -81,8 +84,7 @@ func (n *nriPlugin) Configure(ctx context.Context, config, runtime, version stri
8184
return api.EventMask(
8285
api.Event_RUN_POD_SANDBOX |
8386
api.Event_STOP_POD_SANDBOX |
84-
api.Event_REMOVE_POD_SANDBOX |
85-
api.Event_CREATE_CONTAINER), nil
87+
api.Event_REMOVE_POD_SANDBOX), nil
8688
}
8789

8890
func (n *nriPlugin) StopPodSandbox(ctx context.Context, pod *api.PodSandbox) error {
@@ -91,12 +93,12 @@ func (n *nriPlugin) StopPodSandbox(ctx context.Context, pod *api.PodSandbox) err
9193
}
9294

9395
func (n *nriPlugin) CreateContainer(ctx context.Context, pod *api.PodSandbox, container *api.Container) (*api.ContainerAdjustment, []*api.ContainerUpdate, error) {
94-
n.getAllocatedGpusForContainer(container)
9596
n.logger.Info("CreateContainer is called", zap.Any("container", container))
9697
return nil, nil, nil
9798
}
9899

99100
func (n *nriPlugin) RemovePodSandbox(ctx context.Context, pod *api.PodSandbox) error {
101+
n.logger.Info("RemovePodSandbox is called", zap.Any("pod", pod))
100102
return nil
101103
}
102104

0 commit comments

Comments
 (0)