
Commit fc21a87

Add support for multiple top level for loops
ghstack-source-id: 056d55f
Pull Request resolved: #52
1 parent 59c707c commit fc21a87
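
For context, a minimal sketch of the kind of kernel this commit targets: two top-level grid loops in one Helion kernel. The kernel body, the hl.tile usage, and all names are illustrative assumptions rather than code from this PR; before this change a second top-level grid loop raised MultipleDeviceLoops (see helion/exc.py below).

    import torch
    import helion
    import helion.language as hl

    @helion.kernel()
    def two_top_level_loops(x: torch.Tensor, y: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        out_x = torch.empty_like(x)
        out_y = torch.empty_like(y)
        for tile in hl.tile(x.size(0)):  # first top-level grid loop -> _root_id 0
            out_x[tile] = x[tile] + 1
        for tile in hl.tile(y.size(0)):  # second top-level grid loop -> _root_id 1
            out_y[tile] = y[tile] * 2
        return out_x, out_y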

File tree

9 files changed: +379 −29 lines changed

helion/_compiler/ast_extension.py

Lines changed: 3 additions & 0 deletions
@@ -49,13 +49,15 @@ def __init__(
         _type_info: TypeInfo | None = None,
         _loop_type: LoopType = LoopType.UNSET,
         _is_kernel_call: bool = False,
+        _root_id: int | None = None,
         **kwargs: object,
     ) -> None:
         super().__init__(**kwargs)
         self._type_info: TypeInfo | None = _type_info
         self._location: SourceLocation = _location
         self._loop_type: LoopType = _loop_type
         self._is_kernel_call: bool = _is_kernel_call
+        self._root_id: int | None = _root_id
 
     def new(self, fields: dict[str, object]) -> ExtendedAST:
         result = self.__class__(
@@ -64,6 +66,7 @@ def new(self, fields: dict[str, object]) -> ExtendedAST:
             _type_info=self._type_info,
             _loop_type=self._loop_type,
             _is_kernel_call=self._is_kernel_call,
+            _root_id=self._root_id,
         )
         return self._location.to_ast(result)

helion/_compiler/device_function.py

Lines changed: 6 additions & 1 deletion
@@ -36,6 +36,7 @@
 
 if TYPE_CHECKING:
     from ..runtime.config import Config
+    from .program_id import SharedProgramIDs
 
 _P = TypeVar("_P", bound="TensorPropertyArg")
 
@@ -158,6 +159,8 @@ def __init__(self, name: str, config: Config) -> None:
         self.tile_strategy: TileStrategyDispatch = TileStrategyDispatch(self, config)
         self.indexing_strategy: IndexingStrategy = IndexingStrategy.select(config)
 
+        self.shared_pid: SharedProgramIDs | None = None
+
     def block_size_var(self, block_size_idx: int) -> str | None:
         return self.block_size_var_cache.get((block_size_idx,))
 
@@ -170,7 +173,9 @@ def merge_variable_names(self, a: str, b: str) -> None:
             self._variable_renames[n] = name_group
 
     def set_grid_expr(self, grid_expr: ast.AST) -> None:
-        assert self.grid_expr is None, "grid_expr already set"
+        if not self.shared_pid:
+            # For shared pid, its OK to set grid expr multiple times, just use the last one
+            assert self.grid_expr is None, "grid_expr already set"
         self.grid_expr = grid_expr
 
     def sympy_expr(self, expr: sympy.Expr) -> str:

helion/_compiler/device_ir.py

Lines changed: 8 additions & 8 deletions
@@ -211,22 +211,23 @@ class DeviceIR:
     def __init__(self) -> None:
         super().__init__()
         self.graphs: list[GraphInfo] = []
-        self.root_id: int | None = None
+        self.root_ids: list[int] = []
         self.rolled_reductions: list[RolledReductionInfo] = []
         self.grid_block_indices: list[list[int]] = []
 
-    def get_root(self, config: Config) -> torch.fx.GraphModule:
+    def get_root(self, config: Config, root_id: int) -> torch.fx.GraphModule:
         """ " If we are using a rolled reduction, return the rolled reduction graph otherwise
         return the root graph."""
-        if (root_id := self.root_id) is None:
-            raise AssertionError("No root graph")
+        if root_id >= len(self.root_ids):
+            raise AssertionError("Invalid root graph")
+        rid = self.root_ids[root_id]
         reduction_loops = config.reduction_loops
         if len(reduction_loops) > 1:
             raise NotImplementedError("Multiple reduction loops not implemented")
         if len(reduction_loops) == 0 or reduction_loops[0] is None:
-            return self.graphs[root_id].graph
+            return self.graphs[rid].graph
         for info in reversed(self.rolled_reductions):
-            if info.original_graph_id == root_id:
+            if info.original_graph_id == rid:
                 assert info.new_graph_id is not None
                 return self.graphs[info.new_graph_id].graph
         raise AssertionError("No rolled reduction graph found")
@@ -259,8 +260,7 @@ def add_reduction_loop_graph(
         )
 
     def add_root_graph(self, graph: torch.fx.GraphModule) -> None:
-        assert self.root_id is None
-        self.root_id = self.add_graph(graph, graph_info_cls=RootGraphInfo)
+        self.root_ids.append(self.add_graph(graph, graph_info_cls=RootGraphInfo))
 
     def build_rolled_reductions(self) -> None:
         env = CompileEnvironment.current()
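
The indirection between a loop's positional root_id and the stored graph index is easy to misread; the following self-contained toy (a stand-in, not the real DeviceIR class) shows the bookkeeping the hunk above introduces:

    # Toy model of the new root-graph bookkeeping (illustrative only).
    class ToyDeviceIR:
        def __init__(self) -> None:
            self.graphs: list[str] = []
            self.root_ids: list[int] = []

        def add_graph(self, graph: str) -> int:
            self.graphs.append(graph)
            return len(self.graphs) - 1

        def add_root_graph(self, graph: str) -> None:
            # Each top-level grid loop appends its graph id, in source order.
            self.root_ids.append(self.add_graph(graph))

        def get_root(self, root_id: int) -> str:
            # root_id is the loop's position among top-level loops, not a graph id.
            if root_id >= len(self.root_ids):
                raise AssertionError("Invalid root graph")
            return self.graphs[self.root_ids[root_id]]

    ir = ToyDeviceIR()
    ir.add_graph("helper graph")     # non-root graphs may be interleaved
    ir.add_root_graph("loop 0 body")  # becomes root_id 0
    ir.add_root_graph("loop 1 body")  # becomes root_id 1
    assert ir.get_root(0) == "loop 0 body"
    assert ir.get_root(1) == "loop 1 body"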

helion/_compiler/generate_ast.py

Lines changed: 59 additions & 12 deletions
@@ -18,6 +18,7 @@
 from .compile_environment import CompileEnvironment
 from .device_function import DeviceFunction
 from .inductor_lowering import codegen_call_with_graph
+from .program_id import SharedProgramIDs
 from .variable_origin import ArgumentOrigin
 
 if TYPE_CHECKING:
@@ -41,6 +42,7 @@ def __init__(self, func: HostFunction, config: Config) -> None:
         self.active_device_loops: dict[int, list[DeviceLoopOrGridState]] = (
             collections.defaultdict(list)
         )
+        self.next_else_block: list[ast.AST] | None = None
 
     def offset_var(self, block_idx: int) -> str:
         return self.active_device_loops[block_idx][-1].strategy.offset_var(block_idx)
@@ -51,7 +53,9 @@ def index_var(self, block_idx: int) -> str:
     def mask_var(self, block_idx: int) -> str | None:
         return self.active_device_loops[block_idx][-1].strategy.mask_var(block_idx)
 
-    def add_statement(self, stmt: ast.AST | str) -> None:
+    def add_statement(self, stmt: ast.AST | str | None) -> None:
+        if stmt is None:
+            return
         if isinstance(stmt, str):
             stmt = statement_from_string(stmt)
         self.statements_stack[-1].append(stmt)
@@ -131,13 +135,34 @@ def generic_visit(self, node: ast.AST) -> ast.AST:
                 fields[field] = old_value
         return node.new(fields)
 
-    def visit_For(self, node: ast.For) -> ast.AST:
+    def visit_For(self, node: ast.For) -> ast.AST | None:
         assert isinstance(node, ExtendedAST)
         if node._loop_type == LoopType.GRID:
             assert not node.orelse
+
+            if len(self.host_fn.device_ir.root_ids) == 1:
+                body = self.device_function.body
+            else:
+                assert len(self.host_fn.device_ir.root_ids) > 1
+                assert node._root_id is not None
+                # Multiple top level for loops
+
+                if node._root_id == 0:
+                    self.device_function.shared_pid = SharedProgramIDs(
+                        self.device_function.new_var("pid_shared", dce=False)
+                    )
+                    self.device_function.body.append(
+                        self.device_function.shared_pid.codegen_pid_init()
+                    )
+                if node._root_id < len(self.host_fn.device_ir.root_ids) - 1:
+                    body = []
+                else:
+                    # This is the last top level for, dont emit more if statements
+                    assert self.next_else_block is not None
+                    body = self.next_else_block
             with (
                 self.set_on_device(),
-                self.set_statements(self.device_function.body),
+                self.set_statements(body),
             ):
                 iter_node = node.iter
                 assert isinstance(iter_node, ExtendedAST)
@@ -163,21 +188,43 @@ def visit_For(self, node: ast.For) -> ast.AST:
 
                 from .inductor_lowering import CodegenState
 
-                fn._codegen(
-                    CodegenState(
-                        self,
-                        fx_node=None,
-                        proxy_args=[*bound.arguments.values()],
-                        ast_args=None,
-                    ),
+                state = CodegenState(
+                    self,
+                    fx_node=None,
+                    proxy_args=[*bound.arguments.values()],
+                    ast_args=None,
                 )
+
+                fn._codegen(state)
+                assert node._root_id is not None
                 codegen_call_with_graph(
                     self,
-                    self.host_fn.device_ir.get_root(self.device_function.config),
+                    self.host_fn.device_ir.get_root(
+                        self.device_function.config, node._root_id
+                    ),
                     [],
                 )
+                # If we are in a multi top level loop, for all loops except for the last one
+                # emit ifthenelse blocks
+                if node._root_id < len(self.host_fn.device_ir.root_ids) - 1:
+                    block = (
+                        self.device_function.body
+                        if self.next_else_block is None
+                        else self.next_else_block
+                    )
+                    self.next_else_block = []
+                    block.append(
+                        create(
+                            ast.If,
+                            test=self.device_function.shared_pid.codegen_test(state),
+                            body=body,
+                            orelse=self.next_else_block,
+                        )
+                    )
             self.device_function.dead_code_elimination()
-            return self.device_function.codegen_function_call()
+            if node._root_id == len(self.host_fn.device_ir.root_ids) - 1:
+                return self.device_function.codegen_function_call()
+            return None
         return self.generic_visit(node)
 
     def visit_Name(self, node: ast.Name) -> ast.AST:
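
The if/else chaining above is the heart of the change: every root except the last gets an ast.If whose orelse list becomes the insertion point for the next root, and the last root writes straight into that pending else. A minimal plain-Python sketch of that threading (not the real visitor; names and the dict-based "If" nodes are illustrative):

    def chain_roots(root_bodies: list[list[str]], tests: list[str]) -> list[object]:
        device_body: list[object] = []   # stands in for device_function.body
        next_else_block: list[object] | None = None
        for root_id, body in enumerate(root_bodies):
            if root_id == len(root_bodies) - 1:
                # Last root: no test is emitted; its statements go into the pending
                # else block (or straight into the body when there is only one root).
                (device_body if next_else_block is None else next_else_block).extend(body)
                break
            target = device_body if next_else_block is None else next_else_block
            next_else_block = []
            target.append({"test": tests[root_id], "body": body, "orelse": next_else_block})
        return device_body

    print(chain_roots([["loop0"], ["loop1"], ["loop2"]], ["test_root0", "test_root1"]))
    # [{'test': 'test_root0', 'body': ['loop0'],
    #   'orelse': [{'test': 'test_root1', 'body': ['loop1'], 'orelse': ['loop2']}]}]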

helion/_compiler/program_id.py

Lines changed: 45 additions & 0 deletions
@@ -9,6 +9,8 @@
 from helion._compiler.host_function import HostFunction
 
 if TYPE_CHECKING:
+    import ast
+
     import sympy
 
     from helion._compiler.inductor_lowering import CodegenState
@@ -57,6 +59,49 @@ def codegen(self, state: CodegenState) -> None:
         state.device_function.set_grid_expr(expr_from_string(f"({', '.join(grid)},)"))
 
 
+class SharedProgramIDs(ProgramIDs):
+    """
+    Use the same PID for all blocks
+    TODO(oulgen): Currently only supports 1 dimension
+    """
+
+    def __init__(self, shared_pid_var: str) -> None:
+        super().__init__()
+        self.shared_pid_var = shared_pid_var
+
+    def codegen_pid_init(
+        self,
+    ) -> ast.stmt:
+        return statement_from_string(f"{self.shared_pid_var} = tl.program_id(0)")
+
+    def codegen_test(self, state: CodegenState) -> ast.AST:
+        blocks = []
+        for pid in self.pids:
+            blocks.append(pid.device_cdiv(state))
+
+        assert len(blocks) > 0
+        return expr_from_string(f"{self.shared_pid_var} < ({'+ '.join(blocks)})")
+
+    def codegen(self, state: CodegenState) -> None:
+        # TODO(oulgen): We need CSE between codegen_test and codegen for shared device cdivs
+        blocks = []
+        for pid in self.pids[:-1]:
+            blocks.append(pid.device_cdiv(state))
+
+        if blocks:
+            state.codegen.statements_stack[-1].insert(
+                0,
+                statement_from_string(
+                    f"{self.shared_pid_var} -= ({'+ '.join(blocks)})"
+                ),
+            )
+
+        grid = []
+        for pid in self.pids:
+            grid.append(pid.host_cdiv())
+        state.device_function.set_grid_expr(expr_from_string(f"({'+ '.join(grid)},)"))
+
+
 class VirtualProgramIDs(ProgramIDs):
     """Only use the x grid and compute other dimensions"""
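
To make SharedProgramIDs concrete, here is a hand-written Triton sketch of the kernel shape these pieces aim to produce for two simple top-level loops. This is an assumed illustration, not actual Helion output: the elementwise bodies, argument names, and block sizes are made up. It mirrors the class above: one shared pid is initialized (codegen_pid_init), the first loop is guarded by a range test over its cdiv (codegen_test), the remaining programs are rebased by subtracting the earlier cdivs (codegen), and the host grid is the sum of per-loop cdivs (set_grid_expr).

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def _two_loop_kernel(x_ptr, y_ptr, out_x_ptr, out_y_ptr, x_len, y_len,
                         BLOCK_X: tl.constexpr, BLOCK_Y: tl.constexpr):
        pid_shared = tl.program_id(0)
        if pid_shared < tl.cdiv(x_len, BLOCK_X):
            # Programs in the first range execute the first top-level loop.
            offs = pid_shared * BLOCK_X + tl.arange(0, BLOCK_X)
            mask = offs < x_len
            tl.store(out_x_ptr + offs, tl.load(x_ptr + offs, mask=mask) + 1, mask=mask)
        else:
            # Remaining programs are rebased and execute the second loop.
            pid_shared -= tl.cdiv(x_len, BLOCK_X)
            offs = pid_shared * BLOCK_Y + tl.arange(0, BLOCK_Y)
            mask = offs < y_len
            tl.store(out_y_ptr + offs, tl.load(y_ptr + offs, mask=mask) * 2, mask=mask)

    def two_loop(x: torch.Tensor, y: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        out_x, out_y = torch.empty_like(x), torch.empty_like(y)
        BLOCK_X = BLOCK_Y = 128
        # Host grid is the sum of the per-loop cdivs, matching the '+'.join in codegen().
        grid = (triton.cdiv(x.numel(), BLOCK_X) + triton.cdiv(y.numel(), BLOCK_Y),)
        _two_loop_kernel[grid](x, y, out_x, out_y, x.numel(), y.numel(),
                               BLOCK_X=BLOCK_X, BLOCK_Y=BLOCK_Y)
        return out_x, out_y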

helion/_compiler/tile_strategy.py

Lines changed: 13 additions & 3 deletions
@@ -24,8 +24,10 @@
 from .program_id import L2GroupingProgramIDs
 from .program_id import ProgramID
 from .program_id import ProgramIDs
+from .program_id import SharedProgramIDs
 from .program_id import VirtualProgramIDs
 from .variable_origin import BlockSizeOrigin
+from helion import exc
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -379,14 +381,20 @@ def codegen_grid(self, state: CodegenState) -> None:
         dtype = env.triton_index_type()
         block_sizes = self.block_size
         assert len(block_sizes) == len(block_indices)
-        pids = self.select_pid_strategy()
+        pids = self.select_pid_strategy(state)
+        if isinstance(pids, SharedProgramIDs) and len(block_sizes) > 1:
+            # TODO(oulgen): Support this
+            raise exc.MultipleDeviceLoopBlocks
         for i, (block_idx, block_size) in enumerate(
             reversed(self._reorder([*zip(block_indices, block_sizes, strict=True)]))
         ):
            numel = env.block_sizes[block_idx].numel
            offset_var = self.offset_var(block_idx)
            index_var = self.index_var(block_idx)
-            pid_var = device_fn.new_var(f"pid_{i}", dce=True)
+            if isinstance(pids, SharedProgramIDs):
+                pid_var = pids.shared_pid_var
+            else:
+                pid_var = device_fn.new_var(f"pid_{i}", dce=True)
             if block_size != 1:
                 block_size_var = self.block_size_var(block_idx)
                 assert block_size_var is not None
@@ -433,7 +441,9 @@ def _setup_mask(
             f"{mask_var} = ({index_var} < ({state.device_function.sympy_expr(numel)}))"
         )
 
-    def select_pid_strategy(self) -> ProgramIDs:
+    def select_pid_strategy(self, state: CodegenState) -> ProgramIDs:
+        if (shared_pid := state.device_function.shared_pid) is not None:
+            return shared_pid
         if self.l2_grouping > 1:
             return L2GroupingProgramIDs(group_size=self.l2_grouping)
         if 1 < len(self.block_indices) <= 3 and self.fn.config.use_yz_grid:

helion/_compiler/type_propagation.py

Lines changed: 2 additions & 3 deletions
@@ -1901,12 +1901,11 @@ def visit_For(self, node: ast.For) -> TypeInfo:
         if node.orelse:
             raise exc.DeviceLoopElseBlock(fn.__qualname__)
 
-        self.device_loop_count += 1
         if self.device_loop_depth == 0:
             self.func.set_local_types(parent_scope.extract_locals())
             node._loop_type = LoopType.GRID
-            if self.device_loop_count != 1:
-                raise exc.MultipleDeviceLoops
+            node._root_id = self.device_loop_count
+            self.device_loop_count += 1
             if len(ExtendedAST.current()) != 1:
                 raise exc.NestedGridLoop
 

helion/exc.py

Lines changed: 2 additions & 2 deletions
@@ -73,8 +73,8 @@ class DeviceLoopElseBlock(BaseError):
     message = "for...else block is not allowed in a {0} device loop."
 
 
-class MultipleDeviceLoops(BaseError):
-    message = "Multiple grid loops are not allowed. Support for this may be added in the future."
+class MultipleDeviceLoopBlocks(BaseError):
+    message = "Multiple blocks for multiple top level grid loops are not yet allowed. Support for this may be added in the future."
 
 
 class NestedGridLoop(BaseError):
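
The renamed error pairs with the guard added in tile_strategy.py: the shared-pid path currently handles only one block dimension per grid loop. A hypothetical kernel that would hit it (hl.tile list form and all names are assumptions, not from this PR's tests) might look like this:

    import torch
    import helion
    import helion.language as hl

    @helion.kernel()
    def two_loops_2d(x: torch.Tensor, y: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        out_x, out_y = torch.empty_like(x), torch.empty_like(y)
        for tile_i, tile_j in hl.tile([x.size(0), x.size(1)]):  # two block dims per grid loop
            out_x[tile_i, tile_j] = x[tile_i, tile_j] + 1
        for tile_i, tile_j in hl.tile([y.size(0), y.size(1)]):
            out_y[tile_i, tile_j] = y[tile_i, tile_j] * 2
        return out_x, out_y
    # Expected for now: compilation raises helion.exc.MultipleDeviceLoopBlocks,
    # since multiple top-level loops combined with multi-block grids are not yet supported.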
