Skip to content

Commit 71097d1

Browse files
authored
cuda.parallel: Check compiled code for LDL/STL instructions in tests (#4472)
* cuda.parallel: Check compiled code for LDL/STL instructions in tests
* No need for global & use monkeypatch
---------
Co-authored-by: Ashwin Srinath <[email protected]>
1 parent 8da9b8f commit 71097d1

File tree

3 files changed

+76
-8
lines changed

3 files changed

+76
-8
lines changed

python/cuda_parallel/cuda/parallel/experimental/_bindings.pyx

+34-6
Original file line numberDiff line numberDiff line change
@@ -963,7 +963,8 @@ cdef class CommonData:
963963

964964
cdef extern from "cccl/c/reduce.h":
965965
cdef struct cccl_device_reduce_build_result_t 'cccl_device_reduce_build_result_t':
966-
pass
966+
const char* cubin
967+
size_t cubin_size
967968

968969
cdef CUresult cccl_device_reduce_build(
969970
cccl_device_reduce_build_result_t*,
@@ -1071,6 +1072,8 @@ cdef class DeviceReduceBuildResult:
10711072
)
10721073
return storage_sz
10731074

1075+
def _get_cubin(self):
1076+
return self.build_data.cubin[:self.build_data.cubin_size]
10741077

10751078
# ------------
10761079
# DeviceScan
@@ -1081,7 +1084,8 @@ cdef extern from "cccl/c/scan.h":
10811084
ctypedef bint _Bool
10821085

10831086
cdef struct cccl_device_scan_build_result_t 'cccl_device_scan_build_result_t':
1084-
pass
1087+
const char* cubin
1088+
size_t cubin_size
10851089

10861090
cdef CUresult cccl_device_scan_build(
10871091
cccl_device_scan_build_result_t*,
@@ -1236,6 +1240,8 @@ cdef class DeviceScanBuildResult:
12361240
)
12371241
return storage_sz
12381242

1243+
def _get_cubin(self):
1244+
return self.build_data.cubin[:self.build_data.cubin_size]
12391245

12401246
# -----------------------
12411247
# DeviceSegmentedReduce
@@ -1244,7 +1250,8 @@ cdef class DeviceScanBuildResult:
12441250

12451251
cdef extern from "cccl/c/segmented_reduce.h":
12461252
cdef struct cccl_device_segmented_reduce_build_result_t 'cccl_device_segmented_reduce_build_result_t':
1247-
pass
1253+
const char* cubin
1254+
size_t cubin_size
12481255

12491256
cdef CUresult cccl_device_segmented_reduce_build(
12501257
cccl_device_segmented_reduce_build_result_t*,
@@ -1364,14 +1371,17 @@ cdef class DeviceSegmentedReduceBuildResult:
13641371
)
13651372
return storage_sz
13661373

1374+
def _get_cubin(self):
1375+
return self.build_data.cubin[:self.build_data.cubin_size]
13671376
# -----------------
13681377
# DeviceMergeSort
13691378
# -----------------
13701379

13711380

13721381
cdef extern from "cccl/c/merge_sort.h":
13731382
cdef struct cccl_device_merge_sort_build_result_t 'cccl_device_merge_sort_build_result_t':
1374-
pass
1383+
const char* cubin
1384+
size_t cubin_size
13751385

13761386
cdef CUresult cccl_device_merge_sort_build(
13771387
cccl_device_merge_sort_build_result_t *bld_ptr,
@@ -1484,13 +1494,20 @@ cdef class DeviceMergeSortBuildResult:
14841494
)
14851495
return storage_sz
14861496

1497+
1498+
def _get_cubin(self):
1499+
return self.build_data.cubin[:self.build_data.cubin_size]
1500+
1501+
14871502
# -------------------
14881503
# DeviceUniqueByKey
14891504
# -------------------
14901505

14911506
cdef extern from "cccl/c/unique_by_key.h":
14921507
cdef struct cccl_device_unique_by_key_build_result_t 'cccl_device_unique_by_key_build_result_t':
1493-
pass
1508+
const char* cubin
1509+
size_t cubin_size
1510+
14941511

14951512
cdef CUresult cccl_device_unique_by_key_build(
14961513
cccl_device_unique_by_key_build_result_t *build_ptr,
@@ -1611,12 +1628,16 @@ cdef class DeviceUniqueByKeyBuildResult:
16111628
return storage_sz
16121629

16131630

1631+
def _get_cubin(self):
1632+
return self.build_data.cubin[:self.build_data.cubin_size]
1633+
16141634
# --------------------------------------------
16151635
# DeviceUnaryTransform/DeviceBinaryTransform
16161636
# --------------------------------------------
16171637
cdef extern from "cccl/c/transform.h":
16181638
cdef struct cccl_device_transform_build_result_t:
1619-
pass
1639+
const char* cubin
1640+
size_t cubin_size
16201641

16211642
cdef CUresult cccl_device_unary_transform_build(
16221643
cccl_device_transform_build_result_t *build_ptr,
@@ -1723,6 +1744,10 @@ cdef class DeviceUnaryTransform:
17231744
raise RuntimeError("Failed to compute unary transform")
17241745

17251746

1747+
def _get_cubin(self):
1748+
return self.build_data.cubin[:self.build_data.cubin_size]
1749+
1750+
17261751
cdef class DeviceBinaryTransform:
17271752
cdef cccl_device_transform_build_result_t build_data
17281753

@@ -1791,3 +1816,6 @@ cdef class DeviceBinaryTransform:
17911816
)
17921817
if (status != 0):
17931818
raise RuntimeError("Failed to compute binary transform")
1819+
1820+
def _get_cubin(self):
1821+
return self.build_data.cubin[:self.build_data.cubin_size]

python/cuda_parallel/cuda/parallel/experimental/_cccl_interop.py

+31-2
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
from __future__ import annotations
66

77
import functools
8+
import os
9+
import subprocess
10+
import tempfile
811
from typing import Callable, List
912

1013
import numba
@@ -220,6 +223,29 @@ def get_paths() -> List[str]:
220223
return paths
221224

222225

226+
def _check_compile_result(cubin: bytes):
    """Disassemble ``cubin`` with ``nvdisasm`` and reject LDL/STL instructions.

    The cubin image is written to a temporary file, ``nvdisasm -gi`` is run
    on that file, and the captured SASS text is searched for the substrings
    ``"LDL"`` and ``"STL"``.

    Args:
        cubin: Raw bytes of the compiled cubin image.

    Raises:
        RuntimeError: If ``nvdisasm`` exits with a non-zero return code.
        AssertionError: If ``"LDL"`` or ``"STL"`` occurs in the SASS output.
    """
    # delete=False: the file has to survive being closed so that nvdisasm can
    # open it by name; it is removed explicitly in the ``finally`` clause.
    temp_cubin_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        # Close (and thereby flush) the file *before* launching nvdisasm.
        # Otherwise part of the cubin may still sit in Python's write buffer
        # and the tool would read a truncated or empty file.
        with temp_cubin_file:
            temp_cubin_file.write(cubin)
        out = subprocess.run(
            ["nvdisasm", "-gi", temp_cubin_file.name], capture_output=True
        )
        if out.returncode != 0:
            raise RuntimeError("nvdisasm failed")
        sass = out.stdout.decode("utf-8")
    finally:
        os.unlink(temp_cubin_file.name)

    # check compiled code for LDL/STL instructions
    assert "LDL" not in sass, "LDL instruction found in SASS"
    assert "STL" not in sass, "STL instruction found in SASS"
242+
243+
244+
# this global variable controls whether the compile result is checked
245+
# for LDL/STL instructions. Should be set to `True` for testing only.
246+
_check_sass: bool = False
247+
248+
223249
def call_build(build_impl_fn: Callable, *args, **kwargs):
224250
"""Calls given build_impl_fn callable while providing compute capability and paths
225251
@@ -230,9 +256,12 @@ def call_build(build_impl_fn: Callable, *args, **kwargs):
230256
common_data = CommonData(
231257
cc_major, cc_minor, cub_path, thrust_path, libcudacxx_path, cuda_include_path
232258
)
233-
error = build_impl_fn(
259+
result = build_impl_fn(
234260
*args,
235261
common_data,
236262
**kwargs,
237263
)
238-
return error
264+
if _check_sass:
265+
cubin = result._get_cubin()
266+
_check_compile_result(cubin)
267+
return result

python/cuda_parallel/tests/conftest.py

+11
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,14 @@ def ptr(self):
6767
@pytest.fixture(scope="function")
6868
def cuda_stream() -> Stream:
6969
return Stream(cp.cuda.Stream())
70+
71+
72+
@pytest.fixture(scope="function", autouse=True)
def verify_sass(monkeypatch):
    """Patch the ``_check_sass`` flag of ``_cccl_interop`` for every test.

    The flag is set through ``monkeypatch``, so pytest restores the previous
    value when the test finishes. The value applied is currently ``False``.
    """
    import cuda.parallel.experimental._cccl_interop as interop_module

    # todo: change to True
    sass_check_enabled = False
    monkeypatch.setattr(interop_module, "_check_sass", sass_check_enabled)

0 commit comments

Comments
 (0)