/*
+ * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
 * Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
 * Copyright (c) 2014      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2014      Mellanox Technologies, Inc.
 *                         All rights reserved.
 * Copyright (c) Amazon.com, Inc. or its affiliates.
 *                         All Rights reserved.
- * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
 * Copyright (c) 2024      The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
@@ -154,9 +154,75 @@ static int accelerator_cuda_get_device_id(CUcontext mem_ctx) {
    return dev_id;
}

+static int accelerator_cuda_check_vmm(CUdeviceptr dbuf, CUmemorytype *mem_type,
+                                      int *dev_id)
+{
+#if OPAL_CUDA_VMM_SUPPORT
+    static int device_count = -1;
+    CUmemAllocationProp prop;
+    CUmemLocation location;
+    CUresult result;
+    unsigned long long flags;
+    CUmemGenericAllocationHandle alloc_handle;
+
+    if (device_count == -1) {
+        result = cuDeviceGetCount(&device_count);
+        if (result != CUDA_SUCCESS) {
+            return 0;
+        }
+    }
+
+    result = cuMemRetainAllocationHandle(&alloc_handle, (void *) dbuf);
+    if (result != CUDA_SUCCESS) {
+        return 0;
+    }
+
+    result = cuMemGetAllocationPropertiesFromHandle(&prop, alloc_handle);
+    if (result != CUDA_SUCCESS) {
+        cuMemRelease(alloc_handle);
+        return 0;
+    }
+
+    if (prop.location.type == CU_MEM_LOCATION_TYPE_DEVICE) {
+        *mem_type = CU_MEMORYTYPE_DEVICE;
+        *dev_id = prop.location.id;
+        cuMemRelease(alloc_handle);
+        return 1;
+    }
+
+    if (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA) {
+        /* check if device has access */
+        for (int i = 0; i < device_count; i++) {
+            location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+            location.id = i;
+            result = cuMemGetAccess(&flags, &location, dbuf);
+            if ((CUDA_SUCCESS == result) &&
+                (CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags)) {
+                *mem_type = CU_MEMORYTYPE_DEVICE;
+                *dev_id = i;
+                cuMemRelease(alloc_handle);
+                return 1;
+            }
+        }
+    }
+
+    /* host must have access as device access possibility is exhausted */
+    *mem_type = CU_MEMORYTYPE_HOST;
+    *dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+    cuMemRelease(alloc_handle);
+    return 1;
+
+#endif
+
+    return 0;
+}
+
static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags)
{
    CUresult result;
+    int is_vmm = 0;
+    int vmm_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+    CUmemorytype vmm_mem_type = 0;
    CUmemorytype mem_type = 0;
    CUdeviceptr dbuf = (CUdeviceptr) addr;
    CUcontext ctx = NULL, mem_ctx = NULL;
@@ -168,6 +234,8 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *

    *flags = 0;

+    is_vmm = accelerator_cuda_check_vmm(dbuf, &vmm_mem_type, &vmm_dev_id);
+
#if OPAL_CUDA_GET_ATTRIBUTES
    uint32_t is_managed = 0;
    /* With CUDA 7.0, we can get multiple attributes with a single call */
@@ -197,17 +265,24 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
            return OPAL_ERROR;
        }
    } else if (CU_MEMORYTYPE_HOST == mem_type) {
-        /* Host memory, nothing to do here */
-        return 0;
+        if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) {
+            mem_type = CU_MEMORYTYPE_DEVICE;
+            *dev_id = vmm_dev_id;
+        } else {
+            /* Host memory, nothing to do here */
+            return 0;
+        }
    } else if (0 == mem_type) {
        /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */
        return 0;
    } else {
-        /* query the device from the context */
-        *dev_id = accelerator_cuda_get_device_id(mem_ctx);
+        if (is_vmm) {
+            *dev_id = vmm_dev_id;
+        } else {
+            /* query the device from the context */
+            *dev_id = accelerator_cuda_get_device_id(mem_ctx);
+        }
    }
-    /* Must be a device pointer */
-    assert(CU_MEMORYTYPE_DEVICE == mem_type);
#else /* OPAL_CUDA_GET_ATTRIBUTES */
    result = cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf);
    if (CUDA_SUCCESS != result) {
@@ -218,16 +293,27 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
            return OPAL_ERROR;
        }
    } else if (CU_MEMORYTYPE_HOST == mem_type) {
-        /* Host memory, nothing to do here */
-        return 0;
+        if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) {
+            mem_type = CU_MEMORYTYPE_DEVICE;
+            *dev_id = vmm_dev_id;
+        } else {
+            /* Host memory, nothing to do here */
+            return 0;
+        }
    } else {
-        result = cuPointerGetAttribute(&mem_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dbuf);
-        /* query the device from the context */
-        *dev_id = accelerator_cuda_get_device_id(mem_ctx);
+        if (is_vmm) {
+            *dev_id = vmm_dev_id;
+        } else {
+            result = cuPointerGetAttribute(&mem_ctx,
+                                           CU_POINTER_ATTRIBUTE_CONTEXT, dbuf);
+            /* query the device from the context */
+            *dev_id = accelerator_cuda_get_device_id(mem_ctx);
+        }
    }
+#endif /* OPAL_CUDA_GET_ATTRIBUTES */
+
    /* Must be a device pointer */
    assert(CU_MEMORYTYPE_DEVICE == mem_type);
-#endif /* OPAL_CUDA_GET_ATTRIBUTES */

    /* This piece of code was added in to handle in a case involving
     * OMP threads. The user had initialized CUDA and then spawned
@@ -250,6 +336,16 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
        return OPAL_ERROR;
    }
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
+    if (is_vmm) {
+        /* This function is expected to set the context if the pointer is
+         * device accessible, but VMM allocations have a NULL context
+         * associated with them which cannot be set against the calling thread */
+        opal_output(0,
+                    "CUDA: unable to set context with the given pointer "
+                    "ptr=%p aborting...", addr);
+        return OPAL_ERROR;
+    }
+
    result = cuCtxSetCurrent(mem_ctx);
    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
        opal_output(0,
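
For reference (not part of this change): the buffers the new accelerator_cuda_check_vmm() path targets are created with the CUDA virtual memory management (VMM) driver API (cuMemCreate/cuMemAddressReserve/cuMemMap/cuMemSetAccess) rather than cuMemAlloc. The standalone sketch below is hypothetical and only illustrative: it assumes device 0, maps a single granularity-sized chunk, and omits error checking.

/* Hypothetical illustration of a VMM allocation -- not part of this patch. */
#include <cuda.h>
#include <stdio.h>

int main(void)
{
    CUdevice dev;
    CUcontext ctx;
    CUdeviceptr ptr;
    CUmemGenericAllocationHandle handle;
    CUmemAllocationProp prop = {0};
    CUmemAccessDesc access = {0};
    size_t granularity, size;

    cuInit(0);
    cuDeviceGet(&dev, 0);      /* assumption: device 0 is present */
    cuCtxCreate(&ctx, 0, dev);

    /* Describe a pinned allocation that physically resides on device 0 */
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = 0;
    cuMemGetAllocationGranularity(&granularity, &prop,
                                  CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    size = granularity;

    /* VMM sequence: physical allocation, VA reservation, mapping, access grant */
    cuMemCreate(&handle, size, &prop, 0);
    cuMemAddressReserve(&ptr, size, 0, 0, 0);
    cuMemMap(ptr, size, 0, handle, 0);
    access.location = prop.location;
    access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    cuMemSetAccess(ptr, size, &access, 1);

    /* A pointer like this is what accelerator_cuda_check_addr() now has to
     * classify: cuMemRetainAllocationHandle() succeeds for it, while (per the
     * comment added in this patch) it has no CUDA context to switch to. */
    printf("VMM-backed buffer at %p\n", (void *) ptr);

    /* Teardown in reverse order */
    cuMemUnmap(ptr, size);
    cuMemAddressFree(ptr, size);
    cuMemRelease(handle);
    cuCtxDestroy(ctx);
    return 0;
}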