@@ -2,10 +2,12 @@ package nri
2
2
3
3
import (
4
4
"context"
5
+ "sort"
5
6
"strings"
6
7
7
8
"github.com/containerd/nri/pkg/api"
8
9
"go.uber.org/zap"
10
+ resourcev1beta1 "k8s.io/api/resource/v1beta1"
9
11
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
10
12
)
11
13
19
21
NvidiaDriverGPUPath = "/proc/driver/nvidia/gpus"
20
22
)
21
23
24
// networkSupport describes one candidate network, identified by its CNI
// configuration string, together with the requested GPUs it has affinity with.
type networkSupport struct {
	// gpus is the set of requested GPU identifiers this network can serve.
	gpus map[string]struct{}
	// gpuCount caches len(gpus) so candidates can be ordered by coverage.
	gpuCount int
	// cniConfig is the CNI configuration string advertised by the network device.
	cniConfig string
}
29
+
22
30
func (n * nriPlugin ) getAllocatedGpusForPodSandbox (ctx context.Context , pod * api.PodSandbox ) (gpus []string , err error ) {
23
31
n .logger .Debug ("Getting allocated GPUs for pod" , zap .String ("podID" , pod .GetId ()))
24
32
@@ -88,3 +96,158 @@ func (n *nriPlugin) getPodAllocatedGpuResources(sandbox *api.PodSandbox, PodReso
88
96
89
97
return gpusDevicePciAddr , nil
90
98
}
99
+
100
+ func filterCniConfigsWithGpuAffinity (gpus []string , resourceSlice * resourcev1beta1.ResourceSlice ) []string {
101
+ if len (gpus ) == 0 {
102
+ return nil
103
+ }
104
+
105
+ // Map to track network configurations found for each GPU
106
+ gpuNetworkMap := make (map [string ][]string )
107
+ // Map to track which GPUs each network interface supports
108
+ networkGpuMap := make (map [string ]map [string ]struct {})
109
+
110
+ // Step 1: Collect all available network interface CNI configurations for each GPU
111
+ for _ , dev := range resourceSlice .Spec .Devices {
112
+ if dev .Basic == nil || dev .Basic .Attributes == nil {
113
+ continue
114
+ }
115
+
116
+ if ! IsReadyRdmaResourceDevice (dev .Basic ) {
117
+ continue
118
+ }
119
+
120
+ // Get CNI configuration for this network interface
121
+ cniConfigsStr := GetStringValueForAttributes ("cniConfigs" , dev .Basic .Attributes )
122
+ if cniConfigsStr == "" {
123
+ continue
124
+ }
125
+
126
+ // Get GPU affinity for this network interface
127
+ gpusInAttribute := GetStringValueForAttributes ("gdrAffinityGpus" , dev .Basic .Attributes )
128
+ if gpusInAttribute == "" {
129
+ continue
130
+ }
131
+
132
+ // Initialize the map for this network interface if not already done
133
+ if _ , exists := networkGpuMap [cniConfigsStr ]; ! exists {
134
+ networkGpuMap [cniConfigsStr ] = make (map [string ]struct {})
135
+ }
136
+
137
+ // Check if each requested GPU has affinity with this network interface
138
+ for _ , gpu := range gpus {
139
+ if strings .Contains (gpusInAttribute , gpu ) {
140
+ // Add this network interface's CNI config to the corresponding GPU's config list
141
+ gpuNetworkMap [gpu ] = append (gpuNetworkMap [gpu ], cniConfigsStr )
142
+ // Record that this network interface supports this GPU
143
+ networkGpuMap [cniConfigsStr ][gpu ] = struct {}{}
144
+ }
145
+ }
146
+ }
147
+
148
+ // Step 2: Check if any network interface supports all GPUs
149
+ for cniConfig , supportedGpus := range networkGpuMap {
150
+ if len (supportedGpus ) == len (gpus ) {
151
+ // This network interface supports all GPUs
152
+ allGpusSupported := true
153
+ for _ , gpu := range gpus {
154
+ if _ , exists := supportedGpus [gpu ]; ! exists {
155
+ allGpusSupported = false
156
+ break
157
+ }
158
+ }
159
+ if allGpusSupported {
160
+ return []string {cniConfig }
161
+ }
162
+ }
163
+ }
164
+
165
+ // Step 3: If no network interface supports all GPUs, we need to find a combination of networks
166
+ // that can cover all GPUs with minimal number of networks
167
+
168
+ // First, try to find networks that support multiple GPUs
169
+ var coveredGpus = make (map [string ]struct {})
170
+ var selectedConfigs []string
171
+ configsMap := make (map [string ]struct {})
172
+
173
+ // Sort networks by the number of GPUs they support (descending)
174
+ var networkSupports []networkSupport
175
+ for cniConfig , supportedGpus := range networkGpuMap {
176
+ networkSupports = append (networkSupports , networkSupport {
177
+ cniConfig : cniConfig ,
178
+ gpuCount : len (supportedGpus ),
179
+ gpus : supportedGpus ,
180
+ })
181
+ }
182
+
183
+ // Sort by GPU count descending
184
+ sort .Slice (networkSupports , func (i , j int ) bool {
185
+ return networkSupports [i ].gpuCount > networkSupports [j ].gpuCount
186
+ })
187
+
188
+ // Greedily select networks that cover the most uncovered GPUs
189
+ for len (coveredGpus ) < len (gpus ) && len (networkSupports ) > 0 {
190
+ // Find the network that covers the most uncovered GPUs
191
+ bestIdx := - 1
192
+ bestNewCoverage := 0
193
+
194
+ for i , ns := range networkSupports {
195
+ // Count how many new GPUs this network would cover
196
+ newCoverage := 0
197
+ for gpu := range ns .gpus {
198
+ if _ , covered := coveredGpus [gpu ]; ! covered {
199
+ newCoverage ++
200
+ }
201
+ }
202
+
203
+ if newCoverage > bestNewCoverage {
204
+ bestNewCoverage = newCoverage
205
+ bestIdx = i
206
+ }
207
+ }
208
+
209
+ // If we couldn't find a network that covers new GPUs, break
210
+ if bestIdx == - 1 || bestNewCoverage == 0 {
211
+ break
212
+ }
213
+
214
+ // Add the selected network
215
+ selected := networkSupports [bestIdx ]
216
+ if _ , exists := configsMap [selected .cniConfig ]; ! exists {
217
+ configsMap [selected .cniConfig ] = struct {}{}
218
+ selectedConfigs = append (selectedConfigs , selected .cniConfig )
219
+ }
220
+
221
+ // Mark the GPUs as covered
222
+ for gpu := range selected .gpus {
223
+ coveredGpus [gpu ] = struct {}{}
224
+ }
225
+
226
+ // Remove the selected network from consideration
227
+ networkSupports = append (networkSupports [:bestIdx ], networkSupports [bestIdx + 1 :]... )
228
+ }
229
+
230
+ // If we've covered all GPUs, return the selected configs
231
+ if len (coveredGpus ) == len (gpus ) {
232
+ return selectedConfigs
233
+ }
234
+
235
+ // Step 4: If no single network interface can support all GPUs, select one for each GPU
236
+ // Reuse the existing map but create a new slice
237
+ var selectedCniConfigs []string
238
+
239
+ for _ , gpu := range gpus {
240
+ configs , found := gpuNetworkMap [gpu ]
241
+ if ! found || len (configs ) == 0 {
242
+ continue
243
+ }
244
+
245
+ // Add the first configuration if not already added
246
+ if _ , exists := configsMap [configs [0 ]]; ! exists {
247
+ configsMap [configs [0 ]] = struct {}{}
248
+ selectedCniConfigs = append (selectedCniConfigs , configs [0 ])
249
+ }
250
+ }
251
+
252
+ return selectedCniConfigs
253
+ }
0 commit comments