@@ -19,6 +19,7 @@
import torch
from torch._dynamo.convert_frame import compile_lock
from torch._inductor.decomposition import select_decomp_table
+from torch.fx._lazy_graph_module import _LazyGraphModule
from torch.fx.experimental import proxy_tensor
from torch.fx.traceback import preserve_node_meta
from torch.utils import _pytree as pytree
@@ -66,7 +67,7 @@ class _TLS(Protocol):
tls: _TLS = cast("_TLS", threading.local())


-def _make_fx(fn: Callable[..., object], *args: object) -> torch.fx.GraphModule:
+def _make_fx(fn: Callable[..., object], *args: object) -> torch.fx.Graph:
    """
    We monkey patch get_proxy_slot to support Tensor/SymInt/SymFloat/SymBool in the
    graph without any origin for them. We instead insert _host_tensor(), _get_symnode()
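A note on the docstring above: the `get_proxy_slot` patch it mentions is installed around tracing. A minimal sketch of that pattern, assuming `unittest.mock.patch.object` is used, with the real replacement logic elided (the actual wrapper is the `_get_proxy_slot` referenced in the next hunk):

```python
# Hedged sketch of the get_proxy_slot monkey patch the docstring describes.
# The wrapper passed in is illustrative; the real logic (inserting
# _host_tensor()/_get_symnode() placeholder nodes) lives in this file.
import contextlib
from unittest.mock import patch

from torch.fx.experimental import proxy_tensor


@contextlib.contextmanager
def patched_get_proxy_slot(wrapper):
    # Swap the tracer's proxy lookup for one that tolerates values with
    # no recorded origin, restoring the original afterwards.
    with patch.object(proxy_tensor, "get_proxy_slot", wrapper):
        yield
```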
@@ -122,14 +123,13 @@ def _get_proxy_slot(
        current_location().set_fx_location()
        return proxy_tensor.make_fx(fn, decomposition_table=select_decomp_table())(
            *args
-        )
+        ).graph


@dataclasses.dataclass
class GraphInfo:
    graph_id: int
-    # TODO(jansel): GraphModule -> Graph to avoid fx compile
-    graph: torch.fx.GraphModule
+    graph: torch.fx.Graph

    @property
    def name(self) -> str:
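For context on the `.graph` change (not part of the diff): `make_fx` produces a `GraphModule`, and its `.graph` attribute is the underlying `torch.fx.Graph`, which is all `GraphInfo` needs to store:

```python
# make_fx(...) yields a GraphModule; keeping only .graph avoids carrying
# (and later re-generating) the compiled Python forward around.
import torch
from torch.fx.experimental import proxy_tensor


def f(x: torch.Tensor) -> torch.Tensor:
    return x.sin() + 1


gm = proxy_tensor.make_fx(f)(torch.randn(4))
assert isinstance(gm, torch.fx.GraphModule)
assert isinstance(gm.graph, torch.fx.Graph)  # what _make_fx now returns
```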
@@ -140,7 +140,9 @@ def kwargs(self) -> dict[str, object]:
        return {}

    def __str__(self) -> str:
-        output = self.graph.print_readable(print_output=False).strip()
+        output = (
+            _LazyGraphModule({}, self.graph).print_readable(print_output=False).strip()
+        )
        return textwrap.dedent(
            re.sub(
                r"forward\(self,? ?([^)]*)\)",
@@ -251,7 +253,7 @@ def __init__(self) -> None:
        self.rolled_reductions: list[RolledReductionInfo] = []
        self.grid_block_indices: list[list[int]] = []

-    def get_root(self, config: Config) -> torch.fx.GraphModule:
+    def get_root(self, config: Config) -> torch.fx.Graph:
        """If we are using a rolled reduction, return the rolled reduction graph,
        otherwise return the root graph."""
        if (root_id := self.root_id) is None:
@@ -276,18 +278,18 @@ def debug_str(self) -> str:

    def add_graph(
        self,
-        graph: torch.fx.GraphModule,
+        graph: torch.fx.Graph,
        graph_info_cls: type[GraphInfo] = GraphInfo,
        **kwargs: object,
    ) -> int:
-        graph.graph.eliminate_dead_code()
+        graph.eliminate_dead_code()
        graph_id = len(self.graphs)
        self.graphs.append(graph_info_cls(graph_id=graph_id, graph=graph, **kwargs))
        return graph_id

    def add_reduction_loop_graph(
        self,
-        graph: torch.fx.GraphModule,
+        graph: torch.fx.Graph,
        block_index: int,
        node_args: list[torch.fx.Node],
    ) -> int:
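The dropped `.graph` hop in `add_graph` works because `eliminate_dead_code` is defined on `torch.fx.Graph` itself; the `GraphModule` wrapper was only being traversed to reach it. A self-contained check:

```python
import operator

import torch

g = torch.fx.Graph()
x = g.placeholder("x")
g.call_function(operator.add, (x, x))  # dead node: result never used
g.output(x)
g.eliminate_dead_code()  # available directly on Graph
assert all(n.op != "call_function" for n in g.nodes)
```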
@@ -298,7 +300,7 @@ def add_reduction_loop_graph(
            node_args=node_args,
        )

-    def add_root_graph(self, graph: torch.fx.GraphModule) -> None:
+    def add_root_graph(self, graph: torch.fx.Graph) -> None:
        assert self.root_id is None
        self.root_id = self.add_graph(graph, graph_info_cls=RootGraphInfo)
@@ -314,9 +316,7 @@ def build_rolled_reductions(self) -> None:
            for graph_id, graph_info in enumerate([*self.graphs]):
                assert graph_id == graph_info.graph_id
                roller = ReductionRoller(self, rdim, graph_to_info)
-                new_graph = torch.fx.GraphModule(
-                    {}, roller.process(graph_info.graph.graph)
-                )
+                new_graph = roller.process(graph_info.graph)
                new_graph_id = self.add_graph(
                    new_graph, type(graph_info), **graph_info.kwargs()
                )
@@ -540,7 +540,7 @@ def run_subgraph(*args: object) -> list[object]:
        with self.disable_tracing() as tracer:
            graph = proxy_tensor.make_fx(
                run_subgraph, decomposition_table=select_decomp_table()
-            )(*inputs.get_tensor_args())
+            )(*inputs.get_tensor_args()).graph
        graph_idx = self.device_ir.add_graph(
            graph,
            ForLoopGraphInfo,
@@ -623,7 +623,7 @@ def run_body(*args: object) -> list[object]:
        with self.disable_tracing() as tracer:
            body_graph = proxy_tensor.make_fx(
                run_body, decomposition_table=select_decomp_table()
-            )(*inputs.get_tensor_args())
+            )(*inputs.get_tensor_args()).graph
        assert outputs is not None
        graph_idx = self.device_ir.add_graph(
            body_graph,
@@ -843,8 +843,8 @@ def lower_to_device_ir(func: HostFunction) -> DeviceIR:
    for graph in device_ir.graphs:
        prepare_graph_lowerings(graph.graph)
    for graph in device_ir.graphs:
-        remove_unnecessary_tile_index(graph.graph.graph)
-        remove_unnecessary_masking(graph.graph.graph)
+        remove_unnecessary_tile_index(graph.graph)
+        remove_unnecessary_masking(graph.graph)
    device_ir.build_rolled_reductions()
    return device_ir
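Taken together, the diff makes `DeviceIR` store bare `torch.fx.Graph` objects and only wraps one in a (lazy) module when a printable form is needed. If a callable module were ever required again, a graph could be rehydrated on demand; a hypothetical helper, not part of the PR:

```python
# Pay the fx compile cost only at the point a callable module is
# actually required, rather than at every add_graph() call.
import torch


def as_callable(graph: torch.fx.Graph) -> torch.fx.GraphModule:
    return torch.fx.GraphModule({}, graph)
```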