Remove 'first_non_grid_index' for hl.grid index

jansel · jansel · commit 821784e3fcd7 · 2025-05-31T16:41:05.000-07:00
I don't think this was correct, since the grid index shouldn't get expanded. stack-info: PR: #113, branch: jansel/stack/13
diff --git a/README.md b/README.md
@@ -23,8 +23,8 @@ refers to hydrogen-3.
 Helion can be viewed either as *PyTorch with tiles* or as *a higher-level Triton*. Compared to
 Triton, Helion reduces manual coding effort through autotuning. Helion spends more time (approx
 10 min) autotuning as it evaluates hundreds of potential Triton implementations generated
-from a single Helion kernel.  This larger search space also makes kernels more performance
-portable between different hardware.  Helion automates and autotunes over:
+from a single Helion kernel. This larger search space also makes kernels more performance
+portable between different hardware. Helion automates and autotunes over:
 
 1. **Tensor Indexing:**
 
@@ -37,7 +37,7 @@ portable between different hardware.  Helion automates and autotunes over:
 
 3. **Grid Sizes and PID Calculations:**
 
-   * Automatically determines  grid sizes.
+   * Automatically determines grid sizes.
    * Autotunes multiple mappings from Program IDs (PIDs) to data tiles.
 
 4. **Implicit Search Space Definition:**
diff --git a/helion/_compiler/indexing_strategy.py b/helion/_compiler/indexing_strategy.py
@@ -259,8 +259,8 @@ def create(
         index_values = []
         mask_values = {}
         output_size = SubscriptIndexing.compute_shape(fake_value, index)
-        dtype = CompileEnvironment.current().triton_index_type()
-        first_non_grid_index = 0
+        env = CompileEnvironment.current()
+        dtype = env.triton_index_type()
         for n, k in enumerate(index):
             if k is None:
                 output_idx += 1
@@ -272,18 +272,11 @@ def create(
                 if isinstance(symbol, sympy.Symbol):
                     origin = HostFunction.current().expr_to_origin.get(symbol)
                 if origin and isinstance(origin.origin, BlockSizeOrigin):
-                    if (
-                        CompileEnvironment.current()
-                        .block_sizes[origin.origin.block_size_idx]
-                        .is_grid()
-                    ):
-                        first_non_grid_index = n + 1
-                        expand = tile_strategy.expand_str(output_size, output_idx)
-                    else:
-                        expand = tile_strategy.expand_str(
-                            output_size, output_idx - first_non_grid_index
-                        )
                     index_var = state.codegen.index_var(origin.origin.block_size_idx)
+                    if env.block_sizes[origin.origin.block_size_idx].is_grid():
+                        index_values.append(index_var)
+                        continue
+                    expand = tile_strategy.expand_str(output_size, output_idx)
                     i = len(index_values)
                     index_values.append(f"({index_var}){expand}")
                     if (
@@ -292,18 +285,13 @@ def create(
                         mask_values.setdefault(f"({mask}){expand}")
                     output_idx += 1
                 else:
-                    expand = tile_strategy.expand_str(
-                        output_size, output_idx - first_non_grid_index
-                    )
+                    expand = tile_strategy.expand_str(output_size, output_idx)
                     val = state.device_function.literal_expr(k)
                     index_values.append(f"tl.full([1], {val}, {dtype}){expand}")
             elif isinstance(k, slice) and str(k) == "slice(None, None, None)":
-                expand = tile_strategy.expand_str(
-                    output_size, output_idx - first_non_grid_index
-                )
+                expand = tile_strategy.expand_str(output_size, output_idx)
                 size = fake_value.size(len(index_values))
                 if size != 1:
-                    env = CompileEnvironment.current()
                     rdim = env.allocate_reduction_dimension(size)
                     block_idx = rdim.block_size_idx
                     index_var = state.codegen.index_var(block_idx)
@@ -314,18 +302,14 @@ def create(
                     index_values.append(f"tl.zeros([1], {dtype}){expand}")
                 output_idx += 1
             elif isinstance(k, torch.Tensor) and k.ndim == 1:
-                expand = tile_strategy.expand_str(
-                    output_size, output_idx - first_non_grid_index
-                )
+                expand = tile_strategy.expand_str(output_size, output_idx)
                 ast_index = state.ast_args[1]
                 assert isinstance(ast_index, (list, tuple))
                 assert len(ast_index) == len(index)
                 index_var = state.codegen.lift(ast_index[n]).id
                 index_values.append(f"({index_var}){expand}")
                 if (
-                    block_idx := TileStrategy.get_block_index(
-                        output_size[output_idx - first_non_grid_index]
-                    )
+                    block_idx := TileStrategy.get_block_index(output_size[output_idx])
                 ) is not None:
                     if mask := state.codegen.mask_var(block_idx):
                         mask_values.setdefault(f"({mask}){expand}")
@@ -349,7 +333,7 @@ def create(
                         )
             else:
                 raise exc.InvalidIndexingType(type(k))
-        assert len(output_size) == output_idx - first_non_grid_index
+        assert len(output_size) == output_idx
         assert len(index_values) == fake_value.ndim
         index_expr = []
         for i, idx in enumerate(index_values):
diff --git a/test/test_loops.py b/test/test_loops.py
@@ -517,11 +517,63 @@ def _grid_1d_kernel(x, y, out, _BLOCK_SIZE_2: tl.constexpr, _BLOCK_SIZE_1: tl.co
             for offset_3 in range(0, 32, _BLOCK_SIZE_3):
                 indices_3 = offset_3 + tl.arange(0, _BLOCK_SIZE_3).to(tl.int32)
                 acc_copy = acc
-                load = tl.load(x + (indices_0[:, None] * 512 + indices_1[:, None] * 32 + indices_3[None, :] * 1), None)
+                load = tl.load(x + (indices_0 * 512 + indices_1[:, None] * 32 + indices_3[None, :] * 1), None)
                 load_1 = tl.load(y + (indices_3[:, None] * 4 + indices_2[None, :] * 1), mask_2[None, :], other=0)
                 acc = tl.dot(load, load_1, acc=acc_copy, input_precision='tf32')
             v_0 = acc.to(tl.float16)
-            tl.store(out + (indices_0[:, None] * 64 + indices_1[:, None] * 4 + indices_2[None, :] * 1), v_0, mask_2[None, :])
+            tl.store(out + (indices_0 * 64 + indices_1[:, None] * 4 + indices_2[None, :] * 1), v_0, mask_2[None, :])
+
+def grid_1d(x: torch.Tensor, y: torch.Tensor):
+    b, m, k = x.size()
+    k2, n = y.size()
+    assert k == k2, f'size mismatch {k} != {k2}'
+    out = torch.empty(b, m, n, dtype=torch.promote_types(x.dtype, y.dtype), device=x.device)
+    _BLOCK_SIZE_2 = 16
+    _BLOCK_SIZE_1 = 16
+    _BLOCK_SIZE_3 = 16
+    _grid_1d_kernel[8,](x, y, out, _BLOCK_SIZE_2, _BLOCK_SIZE_1, _BLOCK_SIZE_3, num_warps=4, num_stages=3)
+    return out
+
+def _grid_1d_make_precompiler(x: torch.Tensor, y: torch.Tensor):
+    b, m, k = x.size()
+    k2, n = y.size()
+    assert k == k2, f'size mismatch {k} != {k2}'
+    out = torch.empty(b, m, n, dtype=torch.promote_types(x.dtype, y.dtype), device=x.device)
+    _BLOCK_SIZE_2 = 16
+    _BLOCK_SIZE_1 = 16
+    _BLOCK_SIZE_3 = 16
+    from helion.runtime.precompile_shim import make_precompiler
+    return make_precompiler(_grid_1d_kernel)(x, y, out, _BLOCK_SIZE_2, _BLOCK_SIZE_1, _BLOCK_SIZE_3, num_warps=4, num_stages=3)""",
+        )
+
+        # test again with block_ptr indexing
+        code, result = code_and_output(
+            grid_1d, args, block_sizes=[[16, 16], 16], indexing="block_ptr"
+        )
+        torch.testing.assert_close(result, grid_1d_pytorch(args[0], args[1]))
+        self.assertExpectedInline(
+            code,
+            """\
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+
+@triton.jit
+def _grid_1d_kernel(x, y, out, _BLOCK_SIZE_2: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_3: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0
+    for offset_1 in range(0, 16, _BLOCK_SIZE_1):
+        for offset_2 in range(0, 4, _BLOCK_SIZE_2):
+            acc = tl.full([_BLOCK_SIZE_1, _BLOCK_SIZE_2], 0.0, tl.float32)
+            for offset_3 in range(0, 32, _BLOCK_SIZE_3):
+                acc_copy = acc
+                load = tl.reshape(tl.load(tl.make_block_ptr(x, [8, 16, 32], [512, 32, 1], [offset_0, offset_1, offset_3], [1, _BLOCK_SIZE_1, _BLOCK_SIZE_3], [2, 1, 0]), boundary_check=[0, 1, 2], padding_option='zero'), [_BLOCK_SIZE_1, _BLOCK_SIZE_3])
+                load_1 = tl.load(tl.make_block_ptr(y, [32, 4], [4, 1], [offset_3, offset_2], [_BLOCK_SIZE_3, _BLOCK_SIZE_2], [1, 0]), boundary_check=[0, 1], padding_option='zero')
+                acc = tl.dot(load, load_1, acc=acc_copy, input_precision='tf32')
+            v_0 = acc.to(tl.float16)
+            tl.store(tl.make_block_ptr(out, [8, 16, 4], [64, 4, 1], [offset_0, offset_1, offset_2], [1, _BLOCK_SIZE_1, _BLOCK_SIZE_2], [2, 1, 0]), tl.reshape(v_0, [1, _BLOCK_SIZE_1, _BLOCK_SIZE_2]), boundary_check=[0, 1, 2])
 
 def grid_1d(x: torch.Tensor, y: torch.Tensor):
     b, m, k = x.size()
@@ -603,11 +655,11 @@ def _grid_2d_idx_list_kernel(x, y, out, _BLOCK_SIZE_3: tl.constexpr, _BLOCK_SIZE
             for offset_4 in range(0, 32, _BLOCK_SIZE_4):
                 indices_4 = offset_4 + tl.arange(0, _BLOCK_SIZE_4).to(tl.int32)
                 acc_copy = acc
-                load = tl.load(x + (indices_0[:, None] * 8192 + indices_1[None, :] * 2048 + indices_2[:, None] * 32 + indices_4[None, :] * 1), None)
+                load = tl.load(x + (indices_0 * 8192 + indices_1 * 2048 + indices_2[:, None] * 32 + indices_4[None, :] * 1), None)
                 load_1 = tl.load(y + (indices_4[:, None] * 16 + indices_3[None, :] * 1), None)
                 acc = tl.dot(load, load_1, acc=acc_copy, input_precision='tf32')
             v_0 = acc.to(tl.float16)
-            tl.store(out + (indices_0[:, None] * 4096 + indices_1[None, :] * 1024 + indices_2[:, None] * 16 + indices_3[None, :] * 1), v_0, None)
+            tl.store(out + (indices_0 * 4096 + indices_1 * 1024 + indices_2[:, None] * 16 + indices_3[None, :] * 1), v_0, None)
 
 def grid_2d_idx_list(x: torch.Tensor, y: torch.Tensor):
     bi, bj, m, k = x.size()
@@ -741,11 +793,11 @@ def _grid_2d_idx_nested_kernel(x, y, out, _BLOCK_SIZE_3: tl.constexpr, _BLOCK_SI
                 for offset_4 in range(0, 32, _BLOCK_SIZE_4):
                     indices_4 = offset_4 + tl.arange(0, _BLOCK_SIZE_4).to(tl.int32)
                     acc_copy = acc
-                    load = tl.load(x + (indices_0[:, None] * 8192 + indices_1[None, :] * 2048 + indices_2[:, None] * 32 + indices_4[None, :] * 1), None)
+                    load = tl.load(x + (indices_0 * 8192 + indices_1 * 2048 + indices_2[:, None] * 32 + indices_4[None, :] * 1), None)
                     load_1 = tl.load(y + (indices_4[:, None] * 16 + indices_3[None, :] * 1), None)
                     acc = tl.dot(load, load_1, acc=acc_copy, input_precision='tf32')
                 v_0 = acc.to(tl.float16)
-                tl.store(out + (indices_0[:, None] * 4096 + indices_1[None, :] * 1024 + indices_2[:, None] * 16 + indices_3[None, :] * 1), v_0, None)
+                tl.store(out + (indices_0 * 4096 + indices_1 * 1024 + indices_2[:, None] * 16 + indices_3[None, :] * 1), v_0, None)
 
 def grid_2d_idx_nested(x: torch.Tensor, y: torch.Tensor):
     bi, bj, m, k = x.size()