Commit d648076

Improve mask optimization to cover control flow and inductor ops

stack-info: PR: #111, branch: jansel/stack/11
1 parent: 2c55636

9 files changed: +367 −37 lines changed


helion/_compiler/device_ir.py

Lines changed: 71 additions & 6 deletions
@@ -8,9 +8,12 @@
 import operator
 import re
 import textwrap
+import threading
 from typing import TYPE_CHECKING
 from typing import Iterator
 from typing import NamedTuple
+from typing import Protocol
+from typing import cast
 from unittest.mock import patch
 
 import torch
@@ -36,6 +39,7 @@
 from .inductor_lowering import CodegenState
 from .inductor_lowering import codegen_call_with_graph
 from .inductor_lowering import prepare_graph_lowerings
+from .node_masking import remove_unnecessary_masking
 from .roll_reduction import ReductionRoller
 from .source_location import current_location
 from .tile_index_proxy import CheckForIndexCalls
@@ -55,6 +59,12 @@
     from collections.abc import Callable
     from collections.abc import Sequence
 
+
+class _TLS(Protocol):
+    device_irs: list[DeviceIR]
+
+
+tls: _TLS = cast("_TLS", threading.local())
 
 def _make_fx(fn: Callable[..., object], *args: object) -> torch.fx.GraphModule:
     """
@@ -151,7 +161,31 @@ def name(self) -> str:
 
 
 @dataclasses.dataclass
-class ForLoopGraphInfo(GraphInfo):
+class NodeArgsGraphInfo(GraphInfo):
+    """Common base class for graphs that have arguments from another graph."""
+
+    node_args: list[torch.fx.Node]
+
+    def placeholder_to_outer_arg(self, node: torch.fx.Node) -> torch.fx.Node:
+        assert node.op == "placeholder"
+        for placeholder, outer_node in zip(
+            node.graph.find_nodes(op="placeholder"),
+            self.node_args,
+            strict=True,
+        ):
+            if placeholder is node:
+                return outer_node
+        raise KeyError("Placeholder not found in node_args")
+
+    def kwargs(self) -> dict[str, object]:
+        # TODO(jansel): do we need to map these to the new graph in the case of a copy?
+        return {
+            "node_args": [*self.node_args],
+        }
+
+
+@dataclasses.dataclass
+class ForLoopGraphInfo(NodeArgsGraphInfo):
     block_indices: list[int]
 
     @property
@@ -160,6 +194,7 @@ def name(self) -> str:
 
     def kwargs(self) -> dict[str, object]:
         return {
+            **super().kwargs(),
             "block_indices": [*self.block_indices],
         }
 
@@ -179,14 +214,13 @@ def codegen(self, state: CodegenState) -> list[object]:
         )
 
 
-@dataclasses.dataclass
 class ReductionLoopGraphInfo(ForLoopGraphInfo):
     @property
     def name(self) -> str:
         return f"reduction_loop_{self.graph_id}"
 
 
-class IfGraphInfo(GraphInfo):
+class IfGraphInfo(NodeArgsGraphInfo):
     @property
     def name(self) -> str:
         return f"if_else_graph_{self.graph_id}"
@@ -252,12 +286,16 @@ def add_graph(
         return graph_id
 
     def add_reduction_loop_graph(
-        self, graph: torch.fx.GraphModule, block_index: int
+        self,
+        graph: torch.fx.GraphModule,
+        block_index: int,
+        node_args: list[torch.fx.Node],
     ) -> int:
         return self.add_graph(
             graph,
             graph_info_cls=ReductionLoopGraphInfo,
             block_indices=[block_index],
+            node_args=node_args,
         )
 
     def add_root_graph(self, graph: torch.fx.GraphModule) -> None:
@@ -302,6 +340,19 @@ def build_rolled_reductions(self) -> None:
             )
             first = False
 
+    def __enter__(self) -> None:
+        try:
+            tls.device_irs.append(self)
+        except AttributeError:
+            tls.device_irs = [self]
+
+    def __exit__(self, *args: object) -> None:
+        tls.device_irs.pop()
+
+    @staticmethod
+    def current() -> DeviceIR:
+        return tls.device_irs[-1]
+
 
 class WalkDeviceAST(NodeVisitor):
     def __init__(self, device_ir: DeviceIR) -> None:
@@ -494,6 +545,7 @@ def run_subgraph(*args: object) -> list[object]:
             graph,
             ForLoopGraphInfo,
             block_indices=[x.block_size_idx for x in iter_vars],
+            node_args=inputs.get_node_args(tracer),
         )
         args = (
             graph_idx,
@@ -576,6 +628,7 @@ def run_body(*args: object) -> list[object]:
         graph_idx = self.device_ir.add_graph(
             body_graph,
             IfGraphInfo,
+            node_args=inputs.get_node_args(tracer),
         )
         args = (
             test_proxy,
@@ -746,6 +799,16 @@ def replace_tensor_args(self, args: Sequence[object]) -> dict[str, object]:
     def get_tensor_args(self) -> list[object]:
         return [self.flat_values[i] for i in self.tensor_indices]
 
+    def get_node_args(
+        self, tracer: proxy_tensor.PythonKeyTracer
+    ) -> list[torch.fx.Node]:
+        proxy_args = args_to_proxies(tracer, self.get_tensor_args())[0]
+        result = []
+        for proxy in proxy_args:
+            assert isinstance(proxy, torch.fx.Proxy)
+            result.append(proxy.node)
+        return result
+
 
 class WalkHostAST(NodeVisitor):
     def __init__(self, device_ir: DeviceIR) -> None:
@@ -771,13 +834,15 @@ def visit_For(self, node: ast.For) -> None:
 
 
 def lower_to_device_ir(func: HostFunction) -> DeviceIR:
-    with func, compile_lock:
-        device_ir = DeviceIR()
+    device_ir = DeviceIR()
+    with func, device_ir, compile_lock:
         visitor = WalkHostAST(device_ir)
         for stmt in func.body:
             visitor.visit(stmt)
         CompileEnvironment.current().errors.raise_if_errors()
         for graph in device_ir.graphs:
             prepare_graph_lowerings(graph.graph)
+        for graph in device_ir.graphs:
+            remove_unnecessary_masking(graph.graph.graph)
         device_ir.build_rolled_reductions()
     return device_ir
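
Note on the thread-local additions above: the `_TLS` protocol, the module-level `tls` object, and the `__enter__`/`__exit__`/`current()` trio on `DeviceIR` form the usual thread-local "current context" stack, so code deep inside tracing can find the active `DeviceIR` without threading it through every call. Below is a minimal standalone sketch of the same pattern; the names (`Context`, `tls.stack`) are illustrative only and are not Helion's API.

import threading
from typing import Protocol, cast


class _TLS(Protocol):
    stack: list["Context"]


# threading.local() starts with no attributes on each new thread,
# which is why __enter__ below needs the AttributeError fallback.
tls: _TLS = cast("_TLS", threading.local())


class Context:
    def __enter__(self) -> "Context":
        try:
            tls.stack.append(self)
        except AttributeError:
            tls.stack = [self]
        return self

    def __exit__(self, *args: object) -> None:
        tls.stack.pop()

    @staticmethod
    def current() -> "Context":
        # the innermost (most recently entered) context wins
        return tls.stack[-1]


with Context() as outer:
    assert Context.current() is outer
    with Context() as inner:
        assert Context.current() is inner
    assert Context.current() is outer

Using a stack rather than a single slot keeps the pattern correct under nesting, which matters here because `lower_to_device_ir` now enters the `DeviceIR` itself as a context manager around the whole lowering pass.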

helion/_compiler/inductor_lowering.py

Lines changed: 16 additions & 6 deletions
@@ -46,6 +46,8 @@
 from .compile_environment import CompileEnvironment
 from .node_masking import apply_masking
 from .node_masking import cached_masked_value
+from .node_masking import getitem_masked_value
+from .node_masking import inductor_masked_value
 from .node_masking import mask_node_inputs
 from .tile_strategy import TileStrategy
 
@@ -372,9 +374,7 @@ def codegen(self, ctx: GraphInterpreter, node: torch.fx.Node) -> object:
         return expr_from_string(output_name)
 
     def get_masked_value(self, node: torch.fx.Node) -> float | bool | None:
-        """Get the masked value for this node."""
-        # TODO(jansel): use valueranges to determine masked value
-        return None
+        return inductor_masked_value(self, node)
 
 
 @dataclasses.dataclass
@@ -465,10 +465,20 @@ def codegen(self, ctx: GraphInterpreter, node: torch.fx.Node) -> object:
             node.meta["val"],
         )
 
+    def get_masked_value(self, node: torch.fx.Node) -> float | bool | None:
+        # reduction types that preserve zeroness
+        if self.reduction_type in {"sum", "prod", "min", "max"}:
+            value = inductor_masked_value(self, node)
+            if value == 0:
+                return value
+        return None
+
 
-@dataclasses.dataclass
 class APIFuncLowering(Lowering):
-    api_func: APIFunc
+    def __init__(self, api_func: object) -> None:
+        super().__init__()
+        assert is_api_func(api_func)
+        self.api_func: APIFunc = api_func
 
     def codegen(self, ctx: GraphInterpreter, node: torch.fx.Node) -> object:
         assert not node.kwargs
@@ -580,7 +590,7 @@ def codegen_sym_size(ctx: GraphInterpreter, node: torch.fx.Node) -> object:
     return val
 
 
-@register_lowering(getitem)
+@register_lowering(getitem, masked_value_fn=getitem_masked_value)
 def codegen_getitem(ctx: GraphInterpreter, node: torch.fx.Node) -> object:
     assert not node.kwargs, "getitem kwargs not supported"
     lhs, rhs = map_arg(node.args, lambda arg: ctx.env[arg])
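
Note on the new `get_masked_value` for reductions above: it relies on the fact that if every masked-out lane of the reduction's input already holds 0, then an output lane that reduces only padding is reducing an all-zero block, and sum, prod, min, and max of an all-zero block are again 0. The output therefore already carries the mask value 0 and no extra masking is needed. A small illustration in plain PyTorch of just that arithmetic fact (not Helion's codegen path):

import torch

# A block whose lanes are entirely "out of bounds" and masked to 0.
masked_block = torch.zeros(8)

# Zeroness is preserved by these reduction types: reducing the all-zero
# block yields 0 again, i.e. the value the mask already implies.
for reduce in (torch.sum, torch.prod, torch.amin, torch.amax):
    assert reduce(masked_block).item() == 0.0

Reduction types without this property, or inputs whose masked value is not 0, fall through to `return None`, which presumably leaves the conservative path (no assumption about the masked value) in place.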
