|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +import unittest |
| 4 | + |
| 5 | +from expecttest import TestCase |
| 6 | +import torch |
| 7 | + |
| 8 | +import helion |
| 9 | +from helion._testing import DEVICE |
| 10 | +from helion._testing import code_and_output |
| 11 | +import helion.language as hl |
| 12 | + |
| 13 | + |
@helion.kernel()
def atomic_add_kernel(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Test basic atomic_add functionality."""
    # NOTE: the docstring above is reproduced verbatim in the generated code
    # that test_basic_atomic_add pins with assertExpectedInline, so editing it
    # requires updating that expected string as well.
    #
    # Tile over x's first dimension and atomically add the matching slice of y
    # into x. Assumes y has at least x.size(0) elements -- TODO confirm with
    # callers; the tests in this file always pass equal-length tensors.
    for i in hl.tile(x.size(0)):
        hl.atomic_add(x, [i], y[i])
    # x itself is the accumulation target and is returned to the caller.
    return x
| 20 | + |
| 21 | + |
@helion.kernel(static_shapes=True)
def atomic_add_overlap_kernel(
    x: torch.Tensor, y: torch.Tensor, indices: torch.Tensor
) -> torch.Tensor:
    """Test atomic_add with overlapping indices."""
    # NOTE: both the docstring above and the local name `idx` show up verbatim
    # in the generated code pinned by test_overlapping_atomic_add; changing
    # either requires regenerating that expected string.
    #
    # The pinned output for this kernel has the extent (10) and unit strides
    # written as literals instead of kernel arguments -- presumably the effect
    # of static_shapes=True; verify against the Helion documentation.
    for i in hl.tile([y.size(0)]):
        # Destination positions in x come from `indices`, so several elements
        # of y may target the same slot of x and must be combined atomically.
        idx = indices[i]
        hl.atomic_add(x, [idx], y[i])
    return x
| 31 | + |
| 32 | + |
@helion.kernel()
def atomic_add_2d_kernel(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Test atomic_add with 2D indexing."""
    # Tile over both dimensions of y and atomically add each y[i, j] tile into
    # the same position of x. Assumes x is at least as large as y in both
    # dimensions -- TODO confirm; the test in this file uses equal (3, 4) shapes.
    for i, j in hl.tile([y.size(0), y.size(1)]):
        hl.atomic_add(x, [i, j], y[i, j])
    # x itself is the accumulation target and is returned to the caller.
    return x
| 39 | + |
| 40 | + |
@helion.kernel()
def atomic_add_float_kernel(x: torch.Tensor, indices: torch.Tensor) -> torch.Tensor:
    """Test atomic_add with a float constant value and reading from lookup"""
    # For every entry of `indices`, atomically add the Python float constant
    # 2.0 to x at that position. An index that occurs k times is expected to
    # receive 2.0 * k (see the expected tensor in test_atomic_add_float).
    for i in hl.tile(indices.size(0)):
        idx = indices[i]
        hl.atomic_add(x, [idx], 2.0)
    # x itself is the accumulation target and is returned to the caller.
    return x
| 48 | + |
| 49 | + |
class TestAtomicOperations(TestCase):
    """Tests for hl.atomic_add: numerical results and generated Triton code.

    Each test runs a kernel through code_and_output, which returns the
    generated source together with the kernel result. Results are compared
    with torch.testing.assert_close; generated code is either pinned exactly
    with assertExpectedInline or checked for a tl.atomic_add call.
    """

    # Large limit so long generated-code diffs are shown in full on failure.
    maxDiff = 16384

    def test_basic_atomic_add(self):
        """Add a vector of ones into zeros and pin the generated code."""
        x = torch.zeros(10, device=DEVICE)
        y = torch.ones(10, device=DEVICE)
        args = (x, y)

        code, result = code_and_output(
            atomic_add_kernel,
            args,
            block_sizes=[32],
        )

        expected = torch.ones(10, device=DEVICE)
        torch.testing.assert_close(result, expected)
        self.assertExpectedInline(
            code,
            """\
from __future__ import annotations

import torch
import triton
import triton.language as tl

@triton.jit
def _atomic_add_kernel_kernel(x, y, x_size_0, x_stride_0, y_stride_0, _BLOCK_SIZE_0: tl.constexpr):
    pid_0 = tl.program_id(0)
    offset_0 = pid_0 * _BLOCK_SIZE_0
    indices_0 = offset_0 + tl.arange(0, _BLOCK_SIZE_0).to(tl.int32)
    mask_0 = indices_0 < x_size_0
    load = tl.load(y + indices_0 * y_stride_0, mask_0, other=0)
    tl.atomic_add(x + indices_0 * x_stride_0, load, mask=mask_0, sem='relaxed')

def atomic_add_kernel(x: torch.Tensor, y: torch.Tensor):
    \"\"\"Test basic atomic_add functionality.\"\"\"
    _BLOCK_SIZE_0 = 32
    _atomic_add_kernel_kernel[triton.cdiv(x.size(0), _BLOCK_SIZE_0),](x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
    return x

def _atomic_add_kernel_make_precompiler(x: torch.Tensor, y: torch.Tensor):
    \"\"\"Test basic atomic_add functionality.\"\"\"
    _BLOCK_SIZE_0 = 32
    from helion.runtime.precompile_shim import make_precompiler
    return make_precompiler(_atomic_add_kernel_kernel)(x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)""",
        )

    def test_overlapping_atomic_add(self):
        """Scatter-add ones through repeated indices and pin the generated code."""
        # Every destination index 0..4 appears twice, so each slot of x must
        # end up at 2 if the colliding updates are combined correctly.
        x = torch.zeros(5, device=DEVICE)
        y = torch.ones(10, device=DEVICE)
        indices = torch.tensor([0, 1, 2, 3, 4, 0, 1, 2, 3, 4], device=DEVICE)
        args = (x, y, indices)

        code, result = code_and_output(
            atomic_add_overlap_kernel,
            args,
            block_sizes=[32],
        )

        expected = torch.ones(5, device=DEVICE) * 2
        torch.testing.assert_close(result, expected)
        self.assertExpectedInline(
            code,
            """\
from __future__ import annotations

import torch
import triton
import triton.language as tl

@triton.jit
def _atomic_add_overlap_kernel_kernel(indices, y, x, _BLOCK_SIZE_0: tl.constexpr):
    pid_0 = tl.program_id(0)
    offset_0 = pid_0 * _BLOCK_SIZE_0
    indices_0 = offset_0 + tl.arange(0, _BLOCK_SIZE_0).to(tl.int32)
    mask_0 = indices_0 < 10
    idx = tl.load(indices + indices_0 * 1, mask_0, other=0)
    load_1 = tl.load(y + indices_0 * 1, mask_0, other=0)
    tl.atomic_add(x + idx * 1, load_1, mask=mask_0, sem='relaxed')

def atomic_add_overlap_kernel(x: torch.Tensor, y: torch.Tensor, indices: torch.Tensor):
    \"\"\"Test atomic_add with overlapping indices.\"\"\"
    _BLOCK_SIZE_0 = 32
    _atomic_add_overlap_kernel_kernel[triton.cdiv(10, _BLOCK_SIZE_0),](indices, y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
    return x

def _atomic_add_overlap_kernel_make_precompiler(x: torch.Tensor, y: torch.Tensor, indices: torch.Tensor):
    \"\"\"Test atomic_add with overlapping indices.\"\"\"
    _BLOCK_SIZE_0 = 32
    from helion.runtime.precompile_shim import make_precompiler
    return make_precompiler(_atomic_add_overlap_kernel_kernel)(indices, y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=3)""",
        )

    def test_2d_atomic_add(self):
        """Test atomic_add with 2D tensor indexing."""
        x = torch.zeros(3, 4, device=DEVICE)
        y = torch.ones(3, 4, device=DEVICE)
        args = (x, y)

        code, result = code_and_output(
            atomic_add_2d_kernel,
            args,
            block_sizes=[8, 8],
        )

        expected = torch.ones(3, 4, device=DEVICE)
        torch.testing.assert_close(result, expected)
        # Look for the actual Triton call: the bare substring "atomic_add" is
        # already part of the kernel's own name, so it would match even if no
        # atomic instruction were emitted.
        self.assertIn("tl.atomic_add(", code)

    def test_atomic_add_code_generation(self):
        """Test that the generated code contains a tl.atomic_add call."""
        x = torch.zeros(10, device=DEVICE)
        y = torch.ones(10, device=DEVICE)
        args = (x, y)

        code, _ = code_and_output(atomic_add_kernel, args)
        # "atomic_add" alone also matches the kernel name; require the call.
        self.assertIn("tl.atomic_add(", code)

    def test_atomic_add_float(self):
        """Test that atomic_add works with float constants."""
        x = torch.zeros(5, device=DEVICE, dtype=torch.float32)

        # Index 2 occurs twice and index 3 three times, so those slots are
        # expected to accumulate 2.0 per occurrence.
        indices = torch.tensor([0, 1, 2, 2, 3, 3, 3, 4], device=DEVICE)
        expected = torch.tensor(
            [2.0, 2.0, 4.0, 6.0, 2.0], device=DEVICE, dtype=torch.float32
        )

        args = (x, indices)
        code, result = code_and_output(
            atomic_add_float_kernel,
            args,
            block_sizes=[32],
        )

        torch.testing.assert_close(result, expected)
        # Same codegen check as the other tests; previously `code` was unused.
        self.assertIn("tl.atomic_add(", code)

    def test_atomic_add_invalid_sem(self):
        """Test that atomic_add raises with an invalid sem value."""
        x = torch.zeros(10, device=DEVICE)
        y = torch.ones(10, device=DEVICE)

        @helion.kernel()
        def bad_atomic_add_kernel(x: torch.Tensor, y: torch.Tensor):
            for i in hl.tile(x.size(0)):
                hl.atomic_add(x, [i], y[i], sem="ERROR")
            return x

        with self.assertRaises(helion.exc.InternalError) as ctx:
            code_and_output(
                bad_atomic_add_kernel,
                (x, y),
                block_sizes=[32],
            )
        # Checked after the with-block: ctx.exception is only set on exit.
        self.assertIn("Invalid memory semantic 'ERROR'", str(ctx.exception))
| 206 | + |
# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
0 commit comments