Commit 245ea94

feat: Add ATen lowering pass system

1 parent 692921e commit 245ea94

File tree

8 files changed: +188 -39 lines changed

py/torch_tensorrt/dynamo/backend/backends.py

+4 -34

@@ -9,11 +9,10 @@
 import torch.utils._pytree as pytree
 from torch._dynamo.utils import detect_fake_mode
 from torch._functorch.aot_autograd import _aot_export_function
-from torch._inductor.constant_folding import ConstantFolder, replace_node_with_constant
 from torch._ops import OpOverload
 from torch_tensorrt.dynamo import CompilationSettings
 from torch_tensorrt.dynamo.compile import compile_module
-from torch_tensorrt.dynamo.lowering._decompositions import get_decompositions
+from torch_tensorrt.dynamo.lowering import apply_lowering_passes, get_decompositions
 from torch_tensorrt.dynamo.lowering._pre_aot_lowering import pre_aot_substitutions
 from torch_tensorrt.dynamo.utils import parse_dynamo_kwargs

@@ -75,7 +74,7 @@ def _pretraced_backend(
             fake_mode, "allow_non_fake_inputs", True
         ), fake_mode:
             # Invoke AOTAutograd to translate operators to aten
-            graph_module = aot_export_for_compile(
+            gm = aot_export_for_compile(
                 gm,
                 sample_inputs,
                 decompositions=get_decompositions(
@@ -85,10 +84,10 @@ def _pretraced_backend(

             logger.debug("Post-AOT Autograd graph:\n" + str(gm.graph))

-            constant_fold(graph_module)
+            gm = apply_lowering_passes(gm)

             trt_compiled = compile_module(
-                graph_module,
+                gm,
                 sample_inputs,
                 settings=settings,
             )
@@ -112,35 +111,6 @@ def _pretraced_backend(
         raise


-@torch.utils._python_dispatch._disable_current_modes()  # type: ignore
-def constant_fold(gm: torch.fx.GraphModule) -> Any:
-    """Adapted from:
-        https://github.com/pytorch/pytorch/blob/3a79621c9dce17f77fbddc06aab21f6bc477f313/torch/_inductor/freezing.py#L178-L197
-
-    Folds constants in the graph module, not skipping constructors
-
-    Modifies the graph in-place and replaces node with constants
-    """
-    cf = ConstantFolder(gm, skip_constructors=False)
-    cf.run()
-
-    for node, constant in cf.node_replacements.items():
-        replace_node_with_constant(gm, node, constant)
-
-    erased_params = []
-    for node in gm.graph.nodes:
-        if node.op == "get_attr" and len(node.users) == 0:
-            delattr(gm, node.target)
-            erased_params.append(node)
-
-    for node in erased_params:
-        gm.graph.erase_node(node)
-
-    gm.graph.eliminate_dead_code()
-    gm.graph.lint()
-    gm.recompile()
-
-
 def aot_export_for_compile(
     func: torch.fx.GraphModule,
     args: Sequence[torch.Tensor],
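
Net effect in this file: the inline constant_fold helper is removed and the freshly exported aten graph is routed through the new lowering pass manager before TRT compilation. A condensed sketch of the resulting flow (simplified; the fake-mode context and error handling are omitted, and get_decompositions is shown without its arguments):

    # aten export, then the registered ATen lowering passes, then TRT compilation
    gm = aot_export_for_compile(gm, sample_inputs, decompositions=get_decompositions())
    gm = apply_lowering_passes(gm)  # constant folding + input-as-output repair
    trt_compiled = compile_module(gm, sample_inputs, settings=settings)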

py/torch_tensorrt/dynamo/lowering/__init__.py

+1 -0

@@ -2,4 +2,5 @@
 from ._fusers import *  # noqa: F401
 from ._pre_aot_lowering import SUBSTITUTION_REGISTRY  # noqa: F401
 from ._pre_aot_lowering import register_substitution  # noqa: F401
+from .passes import add_lowering_pass, apply_lowering_passes
 from .substitutions import *  # noqa: F401

py/torch_tensorrt/dynamo/lowering/passes/__init__.py (new file; the path is not shown in the extracted page and is inferred from the `from .passes import ...` line above and the setup.py package list below)

+27 -0

@@ -0,0 +1,27 @@
+from typing import Callable
+
+import torch
+from torch.fx.passes.pass_manager import PassManager
+
+from .constant_folding import constant_fold
+from .repair_input_as_output import repair_input_as_output
+
+ATEN_LOWERING_PASSES = PassManager.build_from_passlist(
+    [
+        constant_fold,
+        repair_input_as_output,
+    ]
+)
+
+
+def add_lowering_pass(
+    lowering_pass: Callable[[torch.fx.GraphModule], torch.fx.GraphModule]
+) -> None:
+    """Adds a lowering pass to the registry"""
+    ATEN_LOWERING_PASSES.add_pass(lowering_pass)
+    return
+
+
+def apply_lowering_passes(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    """Applies the lowering passes to a graph module, returns the modified GraphModule"""
+    return ATEN_LOWERING_PASSES(gm)
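
The PassManager above makes the ATen lowering stage extensible: apply_lowering_passes is what the backend now calls on the post-AOT graph, and add_lowering_pass lets users append their own GraphModule-to-GraphModule passes. A minimal usage sketch (the pass remove_detach below is a hypothetical example, not part of this commit):

    import torch
    from torch_tensorrt.dynamo.lowering import add_lowering_pass, apply_lowering_passes

    def remove_detach(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
        # Hypothetical pass: drop aten.detach nodes, which are no-ops for inference
        for node in list(gm.graph.nodes):
            if node.op == "call_function" and node.target == torch.ops.aten.detach.default:
                node.replace_all_uses_with(node.args[0])
                gm.graph.erase_node(node)
        gm.graph.lint()
        gm.recompile()
        return gm

    add_lowering_pass(remove_detach)  # appended after the built-in passes
    # The backend later runs: gm = apply_lowering_passes(gm)
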
py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py (new file; the path is not shown in the extracted page and is inferred from the `from .constant_folding import constant_fold` line above)

+39 -0

@@ -0,0 +1,39 @@
+import logging
+
+import torch
+from torch._inductor.constant_folding import ConstantFolder, replace_node_with_constant
+
+logger = logging.getLogger(__name__)
+
+
+@torch.utils._python_dispatch._disable_current_modes()  # type: ignore
+def constant_fold(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    """Adapted from:
+        https://github.com/pytorch/pytorch/blob/3a79621c9dce17f77fbddc06aab21f6bc477f313/torch/_inductor/freezing.py#L178-L197
+
+    Folds constants in the graph module, not skipping constructors
+
+    Modifies the graph in-place and replaces node with constants
+    """
+    cf = ConstantFolder(gm, skip_constructors=False)
+    cf.run()
+
+    for node, constant in cf.node_replacements.items():
+        replace_node_with_constant(gm, node, constant)
+
+    erased_params = []
+    for node in gm.graph.nodes:
+        if node.op == "get_attr" and len(node.users) == 0:
+            delattr(gm, node.target)
+            erased_params.append(node)
+
+    for node in erased_params:
+        gm.graph.erase_node(node)
+
+    gm.graph.eliminate_dead_code()
+    gm.graph.lint()
+    gm.recompile()
+
+    logger.debug(f"Graph after constant folding:\n{gm.graph}")
+
+    return gm
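
To illustrate the effect of this pass: any subgraph with no dependence on the graph inputs is evaluated once and baked in as a frozen attribute, so it never reaches the converter or the runtime. A minimal sketch using constant_fold as defined above (the toy module is illustrative only):

    import torch

    class Toy(torch.nn.Module):
        def forward(self, x):
            # This subexpression does not depend on x, so ConstantFolder can
            # precompute it and replace it with a single get_attr constant
            scale = torch.ones(5, 7) * 3
            return x + scale

    gm = torch.fx.symbolic_trace(Toy())
    gm = constant_fold(gm)  # the ones/mul subgraph becomes a frozen constant
    print(gm.graph)
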
py/torch_tensorrt/dynamo/lowering/passes/repair_input_as_output.py (new file; the path is not shown in the extracted page and is inferred from the `from .repair_input_as_output import repair_input_as_output` line above)

+45 -0

@@ -0,0 +1,45 @@
+import logging
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+def repair_input_as_output(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    """Repair scenarios where inputs are also outputs of the graph
+
+    TRT does not allow such cases, so we insert a `clone` (identity) layer
+    """
+    modified_graph = False
+
+    # Extract graph placeholders
+    placeholders = [node for node in gm.graph.nodes if node.op == "placeholder"]
+
+    for placeholder in placeholders:
+        # If any placeholder has any users which are direct graph outputs
+        if len(placeholder.users) >= 1 and any(
+            user.op == "output" for user in placeholder.users
+        ):
+            modified_graph = True
+
+            # Get direct graph outputs which are direct uses of placeholders
+            direct_outputs = [user for user in placeholder.users if user.op == "output"]
+
+            # Insert clone node for placeholder to ensure placeholder is not a direct output
+            with gm.graph.inserting_after(placeholder):
+                cloned_placeholder = gm.graph.call_function(
+                    torch.ops.aten.clone.default,
+                    args=(placeholder,),
+                )
+
+            # Replace placeholder as output with cloned version
+            for output in direct_outputs:
+                output.replace_input_with(placeholder, cloned_placeholder)
+
+    if modified_graph:
+        gm.graph.eliminate_dead_code()
+        gm.graph.lint()
+        gm.recompile()
+        logger.debug(f"Graph after repair_input_as_output:\n{gm.graph}")
+
+    return gm
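
For context, the situation this pass repairs arises whenever a forward method returns one of its inputs untouched: the corresponding placeholder node is then a direct graph output, which TensorRT rejects. A minimal sketch of the scenario, using repair_input_as_output as defined above (the toy module is illustrative only):

    import torch

    class PassThrough(torch.nn.Module):
        def forward(self, x, y):
            # x is returned unchanged, so its placeholder feeds the output node directly
            return x, y + 1

    gm = torch.fx.symbolic_trace(PassThrough())
    gm = repair_input_as_output(gm)
    # The graph output now reads from aten.clone(x) instead of the x placeholder itself
    print(gm.graph)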

setup.py

+2 -0

@@ -392,6 +392,7 @@ def run(self):
         "torch_tensorrt.dynamo.conversion.impl.unary",
         "torch_tensorrt.dynamo.lowering",
         "torch_tensorrt.dynamo.lowering.substitutions",
+        "torch_tensorrt.dynamo.lowering.passes",
         "torch_tensorrt.dynamo.partitioning",
         "torch_tensorrt.dynamo.runtime",
         "torch_tensorrt.dynamo.tools",
@@ -419,6 +420,7 @@ def run(self):
         "torch_tensorrt.dynamo.conversion.impl.unary": "py/torch_tensorrt/dynamo/conversion/impl/unary",
         "torch_tensorrt.dynamo.lowering": "py/torch_tensorrt/dynamo/lowering",
         "torch_tensorrt.dynamo.lowering.substitutions": "py/torch_tensorrt/dynamo/lowering/substitutions",
+        "torch_tensorrt.dynamo.lowering.passes": "py/torch_tensorrt/dynamo/lowering/passes",
         "torch_tensorrt.dynamo.partitioning": "py/torch_tensorrt/dynamo/partitioning",
         "torch_tensorrt.dynamo.runtime": "py/torch_tensorrt/dynamo/runtime",
         "torch_tensorrt.dynamo.tools": "py/torch_tensorrt/dynamo/tools",

New test file (path not shown in the extracted page; the relative import of `..testing_utilities` places it one package level below tests/py/dynamo/)

+65 -0

@@ -0,0 +1,65 @@
+import torch
+import torch_tensorrt
+from torch.testing._internal.common_utils import TestCase, run_tests
+
+from ..testing_utilities import lower_graph_testing
+
+
+class TestInputAsOutput(TestCase):
+    def test_input_as_output(self):
+        class InputAsOutput(torch.nn.Module):
+            def forward(self, x, y):
+                y_new = y + 1
+                y_new = y_new * 7
+                return (y_new, (x, y_new))
+
+        # Operations expected to be included in the traced graph after decompositions
+        expected_ops = {torch.ops.aten.clone.default}
+
+        inputs = [
+            torch.rand(
+                5,
+                7,
+            ).cuda(),
+            torch.rand(
+                5,
+                7,
+            ).cuda(),
+        ]
+
+        fx_graph = torch.fx.symbolic_trace(InputAsOutput())
+        _, expected_ops_unseen = lower_graph_testing(
+            fx_graph, inputs, expected_ops=expected_ops, min_block_size=1
+        )
+
+        self.assertEquals(
+            len(expected_ops_unseen),
+            0,
+            f"The following expected ops were not encountered: {expected_ops_unseen}",
+        )
+        torch._dynamo.reset()
+
+        # Validate that the results between Torch and Torch-TRT are similar
+        optimized_model = torch_tensorrt.compile(
+            fx_graph,
+            "torch_compile",
+            inputs,
+            min_block_size=1,
+            pass_through_build_failures=True,
+        )
+        optimized_model_results = optimized_model(*inputs).detach().cpu()
+        torch_model_results = fx_graph(*inputs).detach().cpu()
+
+        max_diff = float(
+            torch.max(torch.abs(optimized_model_results - torch_model_results))
+        )
+        self.assertAlmostEqual(
+            max_diff,
+            0,
+            msg=f"InputAsOutput TRT outputs don't match with the original model.",
+        )
+        torch._dynamo.reset()
+
+
+if __name__ == "__main__":
+    run_tests()

tests/py/dynamo/testing_utilities.py

+5 -5

@@ -6,8 +6,8 @@
 import torch
 from torch._dynamo.utils import detect_fake_mode
 from torch_tensorrt.dynamo import partitioning
-from torch_tensorrt.dynamo.backend.backends import aot_export_for_compile, constant_fold
-from torch_tensorrt.dynamo.lowering._decompositions import get_decompositions
+from torch_tensorrt.dynamo.backend.backends import aot_export_for_compile
+from torch_tensorrt.dynamo.lowering import apply_lowering_passes, get_decompositions
 from torch_tensorrt.dynamo.lowering._pre_aot_lowering import pre_aot_substitutions

 DECIMALS_OF_AGREEMENT = 4

@@ -40,16 +40,16 @@ def fx_dynamo_testing_backend(
         fake_mode, "allow_non_fake_inputs", True
     ), fake_mode:
         # Invoke AOTAutograd to translate operators to aten
-        graph_module = aot_export_for_compile(
+        gm = aot_export_for_compile(
             gm,
             sample_inputs,
             decompositions=get_decompositions(),
         )

-        constant_fold(graph_module)
+        gm = apply_lowering_passes(gm)

         trt_compiled = custom_backend(
-            graph_module,
+            gm,
             sample_inputs,
         )
         return trt_compiled
