
Commit 0fb0b67

Prototyping an hl.atomic opp
stack-info: PR: #63, branch: drisspg/stack/5
1 parent e76907d commit 0fb0b67

File tree

7 files changed: +254 -3 lines

helion/_compiler/device_ir.py

Lines changed: 19 additions & 0 deletions
@@ -693,6 +693,25 @@ def visit_Attribute(self, node: ast.Attribute) -> object:
             raise exc.CantReadOnDevice(type_info) from None
         return getattr(self.visit(node.value), node.attr)

+    def visit_Expr(self, node):
+        approved_ops = ["atomic_add"]
+
+        # Check if there is an inner call to an approved op
+        if isinstance(node.value, ast.Call):
+            func = node.value.func
+            op_name = None
+
+            if isinstance(func, ast.Name):
+                op_name = func.id
+
+            elif isinstance(func, ast.Attribute):
+                op_name = func.attr
+
+            if op_name in approved_ops:
+                return self.visit(node.value)
+
+        raise exc.StatementNotSupported(type(node).__name__)
+
     def visit_Constant(self, node: ast.Constant) -> object:
         return node.value
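The new visit_Expr hook only lets a bare expression statement through when it wraps a call to an op on the approved list; anything else still raises StatementNotSupported. A small illustration of the two call shapes it recognizes (this sketch uses only Python's standard ast module, not Helion itself):

# Illustration only (not part of the diff): how the two call shapes handled by
# visit_Expr look in Python's ast module. Both resolve to the name "atomic_add".
import ast

for src in ("hl.atomic_add(x, [i], y[i])", "atomic_add(x, [i], y[i])"):
    stmt = ast.parse(src).body[0]          # an ast.Expr statement node
    call = stmt.value                      # the inner ast.Call
    func = call.func
    if isinstance(func, ast.Name):         # bare name: atomic_add(...)
        op_name = func.id
    elif isinstance(func, ast.Attribute):  # attribute access: hl.atomic_add(...)
        op_name = func.attr
    print(src, "->", op_name)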

helion/_compiler/indexing_strategy.py

Lines changed: 0 additions & 1 deletion
@@ -254,7 +254,6 @@ def create(
                 raise exc.InvalidIndexingType(k)
         assert len(output_size) == output_idx
         assert len(index_values) == fake_value.ndim
-
         index_expr = []
         for i, idx in enumerate(index_values):
             if fake_value.size(i) != 1:

helion/language/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@
 from .creation_ops import zeros as zeros
 from .loops import register_block_size as register_block_size
 from .loops import tile as tile
+from .memory_ops import atomic_add as atomic_add
 from .memory_ops import load as load
 from .memory_ops import store as store
 from .view_ops import subscript as subscript

helion/language/_decorators.py

Lines changed: 26 additions & 0 deletions
@@ -38,6 +38,32 @@ def __call__(self, fn: Callable[..., _T]) -> object: ...
 
 
 class APIFunc(Protocol):
+    """Protocol for Helion API functions that define operations within kernel code.
+
+    This protocol defines the interface for functions decorated with @api. These functions
+    represent operations that can be called in Helion kernel code and are compiled
+    into the final device code.
+
+    Attributes:
+        __qualname__: The qualified name of the function.
+        _helion_api: A literal True marker indicating this is a Helion API function.
+        _is_device_loop: Whether this API function can transition between host and device code.
+            When True, the function can contain both host and device code sections.
+        _is_device_only: Whether this API function is intended for device code only.
+            When True, the function can only be used within device code sections.
+        _tiles_as_sizes: Whether tile indices should be converted to sizes automatically.
+            Used primarily with tiling operations to transform indices to dimensions.
+        _cache_type: Whether to cache the type information for repeated calls.
+        _type_function: A callable that determines the return type of this function
+            during type propagation phase.
+        _codegen: A callable that generates the device code for this function.
+        _fake_fn: A callable that provides a "fake" implementation used during
+            tracing and compilation.
+        _prepare_args: A callable that preprocesses the arguments before they're
+            passed to the actual function implementation.
+        _signature: The function signature for binding and validating arguments.
+    """
+
     __qualname__: str
     _helion_api: Literal[True]
     # a device loop can transition between host and device code
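The attributes documented above are attached by the @_decorators.api machinery; atomic_add in memory_ops.py below is registered with tiles_as_sizes=True, so, assuming the decorated function object carries these markers as the protocol describes, they can be inspected directly. A hypothetical inspection sketch:

# Hypothetical sketch, not part of the diff: reading the protocol markers off the
# decorated hl.atomic_add object. The attribute names come from the docstring above;
# the expected values are assumptions based on the decorators applied in memory_ops.py.
import helion.language as hl

fn = hl.atomic_add
print(getattr(fn, "_helion_api", None))      # expected: True for @api-decorated functions
print(getattr(fn, "_tiles_as_sizes", None))  # atomic_add is declared with tiles_as_sizes=True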

helion/language/memory_ops.py

Lines changed: 66 additions & 1 deletion
@@ -14,7 +14,7 @@
 
     from .._compiler.inductor_lowering import CodegenState
 
-__all__ = ["load", "store"]
+__all__ = ["atomic_add", "load", "store"]
 
 
 @has_side_effect
@@ -53,6 +53,14 @@ def _(state: CodegenState) -> ast.AST:
 
 @_decorators.api(tiles_as_sizes=True)
 def load(tensor: torch.Tensor, index: list[object]) -> torch.Tensor:
+    """Load a value from a tensor using a list of indices.
+
+    Args:
+        tensor: The tensor to load from
+        index: The indices to use to index into the tensor
+    Returns:
+        torch.Tensor: The loaded value
+    """
     raise exc.NotInsideKernel
 
 
@@ -70,3 +78,60 @@ def _(state: CodegenState) -> ast.AST:
     return state.device_function.indexing_strategy.codegen_load(
         state, tensor, [*subscript]
     )
+
+
+@has_side_effect
+@_decorators.api(tiles_as_sizes=True)
+def atomic_add(target: torch.Tensor, index: list[object], value: torch.Tensor) -> None:
+    """
+    Atomically add a value to a target tensor.
+
+    Args:
+        target: The tensor to add to
+        index: Indices into target for way to accumulate values
+        value: The value to add
+
+    Returns:
+        None
+    """
+    raise exc.NotInsideKernel
+
+
+@_decorators.prepare_args(atomic_add)
+def _(
+    target: torch.Tensor, index: list[object], value: torch.Tensor
+) -> tuple[torch.Tensor, object, torch.Tensor]:
+    from helion._compiler.tile_index_proxy import TileIndexProxy
+
+    assert value.dtype == target.dtype, (
+        f"Expected value dtype {target.dtype}, got {value.dtype}"
+    )
+    index = TileIndexProxy.prepare_index(index)
+    index = TileIndexProxy.tiles_to_sizes(index)
+    return (target, index, value)
+
+
+@_decorators.register_fake(atomic_add)
+def _(target: torch.Tensor, index: list[object], value: torch.Tensor) -> None:
+    return None
+
+
+@_decorators.codegen(atomic_add)
+def _(state: CodegenState) -> ast.AST:
+    from .._compiler.ast_extension import expr_from_string
+
+    target = state.proxy_arg(0)
+    index = state.proxy_arg(1)
+    value = state.proxy_arg(2)
+    assert isinstance(target, torch.Tensor)
+    assert isinstance(value, torch.Tensor)
+
+    indices = SubscriptIndexing.create(state, target, index)
+    name = state.device_function.tensor_arg(target).name
+    return expr_from_string(
+        f"tl.atomic_add({name} + offset, value, mask=mask, sem=sem)",
+        value=state.ast_args[2],
+        offset=indices.index_expr,
+        mask=indices.mask_expr,
+        sem=expr_from_string("'relaxed'"),
+    )
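On the host side, the user-facing entry point is the hl.atomic_add call inside a tiled loop. The kernel below mirrors the basic test added in test/test_atomic_add.py further down in this commit, shown here as a usage sketch:

# Usage sketch mirroring test/test_atomic_add.py below: atomically accumulate y into x.
import torch
import helion
import helion.language as hl

@helion.kernel()
def atomic_add_kernel(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    for i in hl.tile([x.size(0)]):
        hl.atomic_add(x, [i], y[i])  # lowered to tl.atomic_add(..., sem='relaxed')
    return x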

pyproject.toml

Lines changed: 0 additions & 1 deletion
@@ -87,4 +87,3 @@ exclude = [
 
 [tool.hatch.metadata]
 allow-direct-references = true
-

test/test_atomic_add.py

Lines changed: 142 additions & 0 deletions
@@ -0,0 +1,142 @@ (new file)
from __future__ import annotations

import unittest

from expecttest import TestCase
import torch

import helion
from helion._testing import DEVICE
from helion._testing import code_and_output
import helion.language as hl


@helion.kernel()
def atomic_add_kernel(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Test basic atomic_add functionality."""
    for i in hl.tile([x.size(0)]):
        hl.atomic_add(x, [i], y[i])
    return x


@helion.kernel()
def atomic_add_overlap_kernel(
    x: torch.Tensor, y: torch.Tensor, indices: torch.Tensor
) -> torch.Tensor:
    """Test atomic_add with overlapping indices."""
    for i in hl.tile([y.size(0)]):
        idx = indices[i]
        hl.atomic_add(x, [idx], y[i])
    return x


class TestAtomicOperations(TestCase):
    maxDiff = 16384

    def test_basic_atomic_add(self):
        # Basic test with sequential indices
        x = torch.zeros(10, device=DEVICE)
        y = torch.ones(10, device=DEVICE)
        args = (x, y)

        code, result = code_and_output(
            atomic_add_kernel,
            args,
            block_sizes=[32],
        )

        expected = torch.ones(10, device=DEVICE)
        torch.testing.assert_close(result, expected)
        self.assertExpectedInline(
            code,
            """\
from __future__ import annotations

import torch
import triton
import triton.language as tl

@triton.jit
def _atomic_add_kernel_kernel(x, y, x_size_0, x_stride_0, y_stride_0, _BLOCK_SIZE_0: tl.constexpr):
    pid_0 = tl.program_id(0)
    offset_0 = pid_0 * _BLOCK_SIZE_0
    indices_0 = offset_0 + tl.arange(0, _BLOCK_SIZE_0).to(tl.int32)
    mask_0 = indices_0 < x_size_0
    load = tl.load(y + indices_0 * y_stride_0, mask_0, other=0)
    tl.atomic_add(x + indices_0 * x_stride_0, load, mask=mask_0, sem='relaxed')

def atomic_add_kernel(x: torch.Tensor, y: torch.Tensor):
    \"\"\"Test basic atomic_add functionality.\"\"\"
    _BLOCK_SIZE_0 = 32
    _atomic_add_kernel_kernel[triton.cdiv(x.size(0), _BLOCK_SIZE_0),](x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
    return x

def _atomic_add_kernel_make_precompiler(x: torch.Tensor, y: torch.Tensor):
    \"\"\"Test basic atomic_add functionality.\"\"\"
    _BLOCK_SIZE_0 = 32
    from helion.runtime.precompile_shim import make_precompiler
    return make_precompiler(_atomic_add_kernel_kernel)(x, y, x.size(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)""",
        )

    def test_overlapping_atomic_add(self):
        # Test with overlapping indices
        x = torch.zeros(5, device=DEVICE)
        y = torch.ones(10, device=DEVICE)
        indices = torch.tensor([0, 1, 2, 3, 4, 0, 1, 2, 3, 4], device=DEVICE)
        args = (x, y, indices)

        code, result = code_and_output(
            atomic_add_overlap_kernel,
            args,
            block_sizes=[32],
        )

        expected = torch.ones(5, device=DEVICE) * 2
        torch.testing.assert_close(result, expected)
        self.assertExpectedInline(
            code,
            """\
from __future__ import annotations

import torch
import triton
import triton.language as tl

@triton.jit
def _atomic_add_overlap_kernel_kernel(y, indices, x, y_size_0, indices_stride_0, x_stride_0, y_stride_0, _BLOCK_SIZE_0: tl.constexpr):
    pid_0 = tl.program_id(0)
    offset_0 = pid_0 * _BLOCK_SIZE_0
    indices_0 = offset_0 + tl.arange(0, _BLOCK_SIZE_0).to(tl.int32)
    mask_0 = indices_0 < y_size_0
    idx = tl.load(indices + indices_0 * indices_stride_0, mask_0, other=0)
    load_1 = tl.load(y + indices_0 * y_stride_0, mask_0, other=0)
    tl.atomic_add(x + idx * x_stride_0, load_1, mask=mask_0, sem='relaxed')

def atomic_add_overlap_kernel(x: torch.Tensor, y: torch.Tensor, indices: torch.Tensor):
    \"\"\"Test atomic_add with overlapping indices.\"\"\"
    _BLOCK_SIZE_0 = 32
    _atomic_add_overlap_kernel_kernel[triton.cdiv(y.size(0), _BLOCK_SIZE_0),](y, indices, x, y.size(0), indices.stride(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
    return x

def _atomic_add_overlap_kernel_make_precompiler(x: torch.Tensor, y: torch.Tensor, indices: torch.Tensor):
    \"\"\"Test atomic_add with overlapping indices.\"\"\"
    _BLOCK_SIZE_0 = 32
    from helion.runtime.precompile_shim import make_precompiler
    return make_precompiler(_atomic_add_overlap_kernel_kernel)(y, indices, x, y.size(0), indices.stride(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)""",
        )

    def test_atomic_add_code_generation(self):
        """Test that the generated code contains atomic_add."""
        x = torch.zeros(10, device=DEVICE)
        y = torch.ones(10, device=DEVICE)
        args = (x, y)

        code, _ = code_and_output(atomic_add_kernel, args)
        # Verify atomic_add appears in the generated code
        self.assertIn("atomic_add", code)
        # Verify the new signature format (using the target tensor and indices list)
        self.assertIn("tl.atomic_add(x + offset_0", code)


if __name__ == "__main__":
    unittest.main()
