
Commit dc0803c

Update
[ghstack-poisoned]
1 parent dd36237 commit dc0803c

4 files changed: +81 -38 lines changed


test/prototype/mx_formats/test_mx_dtensor.py

Lines changed: 15 additions & 1 deletion
@@ -69,12 +69,25 @@ def _test_dtensor_cast_to_mxfp8(mesh: DeviceMesh, size=4):
 
 
 def _test_mxfp8_mlp_tensor_parallelism(mesh: DeviceMesh, size=128):
+    config = MXLinearConfig.from_recipe_name("mxfp8_emulated")
+    config.block_size = 32
+    _test_lowp_mlp_tensor_parallelism_base(
+        mesh, config, size, compile=False, allgather_in_lowp=False
+    )
+    _test_lowp_mlp_tensor_parallelism_base(
+        mesh, config, size, compile=True, allgather_in_lowp=False
+    )
+
+
+def _test_mxfp8_mlp_tensor_parallelism_dim1_triton(mesh: DeviceMesh, size=128):
     config = MXLinearConfig.from_recipe_name("mxfp8_emulated")
     config.block_size = 32
     config.use_fp8_dim1_cast_triton_kernel = True
     _test_lowp_mlp_tensor_parallelism_base(
         mesh, config, size, compile=False, allgather_in_lowp=False
     )
+    # TODO(future PR): enable compile here, currently seeing
+    # https://www.internalfb.com/phabricator/paste/view/P1851219639
     # _test_lowp_mlp_tensor_parallelism_base(
     #     mesh, config, size, compile=True, allgather_in_lowp=False
     # )
@@ -83,8 +96,9 @@ def _test_mxfp8_mlp_tensor_parallelism(mesh: DeviceMesh, size=128):
 if __name__ == "__main__":
     device_mesh = setup_distributed()
     tests = [
-        # _test_dtensor_cast_to_mxfp8,
+        _test_dtensor_cast_to_mxfp8,
         _test_mxfp8_mlp_tensor_parallelism,
+        _test_mxfp8_mlp_tensor_parallelism_dim1_triton,
     ]
 
     for test in tqdm(tests, desc="Running tests"):
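
Note: these are DTensor tests, so they must be launched with one process per device (for example via torchrun). As a rough, illustrative sketch of the harness they rely on, setup_distributed() typically builds a 1-D CUDA DeviceMesh from the launcher-provided world size; the actual helper in the test file may differ from this approximation:

# Illustrative sketch only; the real setup_distributed() in this test file
# may differ. Assumes the script is launched with torchrun.
import os

import torch
from torch.distributed.device_mesh import init_device_mesh


def setup_distributed():
    world_size = int(os.environ["WORLD_SIZE"])
    device_mesh = init_device_mesh("cuda", (world_size,))
    # use the same seed on every rank so all processes build identical inputs
    torch.manual_seed(1)
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    return device_mesh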

torchao/prototype/mx_formats/kernels.py

Lines changed: 15 additions & 4 deletions
@@ -1363,10 +1363,21 @@ def triton_to_mxfp8_dim1(
             output_col_major.t(),
             col_scale.view(torch.float8_e8m0fnu),
         )
-
-    print('ASDFASDFASDF')
-    from torchao import triton_to_mxfp8_dim1
-    print(triton_to_mxfp8_dim1)
+
+    # print(torch.ops.torchao.triton_to_mxfp8_dim1.default)
+
+    from torch.distributed.tensor import Replicate, Shard
+    from torch.distributed.tensor.experimental import register_sharding
+
+    @register_sharding(torch.ops.torchao.triton_to_mxfp8_dim1.default)
+    def custom_triton_to_mxfp8_dim1_sharding(x, inner_block_size=32):
+        replicate = ([Replicate(), Replicate()], [Replicate(), None])
+        # Note that the data is returned transposed, which is why
+        # we flip the sharding dim below
+        shard_dim0 = ([Shard(1), Shard(1)], [Shard(0), None])
+        shard_dim1 = ([Shard(0), Shard(0)], [Shard(1), None])
+        acceptable_shardings = [replicate, shard_dim0, shard_dim1]
+        return acceptable_shardings
 
     def triton_to_mxfp8_dim1_reference(
         x_hp: torch.Tensor, block_size
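
For context, register_sharding is an experimental DTensor API that tells the sharding propagator which output/input placement combinations a custom op supports, so that triton_to_mxfp8_dim1 can accept DTensor inputs in the tensor-parallel path. A minimal usage sketch under assumed conditions (torchrun launch, a 1-D CUDA mesh, the registration above already imported); the shapes are illustrative:

# Minimal sketch: call the custom op on a DTensor once the sharding rule above
# is registered. Assumes a torchrun launch and that torchao's triton kernels
# are available on this GPU.
import os

import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Shard, distribute_tensor

mesh = init_device_mesh("cuda", (int(os.environ["WORLD_SIZE"]),))
x_hp = torch.randn(256, 128, device="cuda", dtype=torch.bfloat16)
x_dt = distribute_tensor(x_hp, mesh, [Shard(0)])

# Per the shard_dim0 rule above, a dim-0-sharded input maps to outputs sharded
# on dim 1 (the kernel returns the data transposed).
data_dt, scale_dt = torch.ops.torchao.triton_to_mxfp8_dim1.default(x_dt, 32)
print(data_dt.placements, scale_dt.placements)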

torchao/prototype/mx_formats/mx_linear.py

Lines changed: 49 additions & 31 deletions
@@ -12,6 +12,7 @@
 
 import torch
 import torch.nn.functional as F
+from torch.distributed._tensor import DTensor
 
 from torchao.prototype.mx_formats.config import (
     MXGemmKernelChoice,
@@ -25,6 +26,46 @@
 )
 
 
+def _triton_to_mxfp8_dim1_wrapper(
+    a, block_size, elem_dtype, hp_dtype, gemm_kernel_choice
+):
+    a_data, a_scale = triton_to_mxfp8_dim1(a, block_size)
+    if isinstance(a_data, DTensor):
+        assert isinstance(a_scale, DTensor)
+        a_data_local = a_data.to_local()
+        a_scale_local = a_scale.to_local()
+        inner = MXTensor(
+            a_scale_local,
+            a_data_local.t(),
+            elem_dtype,
+            block_size,
+            hp_dtype,
+            False,
+            gemm_kernel_choice,
+            False,
+        )
+        mx_tensor = DTensor.from_local(
+            inner,
+            a_data.device_mesh,
+            a_data.placements,
+            run_check=False,
+            shape=a_data.t().size(),
+            stride=a_data.t().stride(),
+        )
+    else:
+        mx_tensor = MXTensor(
+            a_scale,
+            a_data.t(),
+            elem_dtype,
+            block_size,
+            hp_dtype,
+            False,
+            gemm_kernel_choice,
+            False,
+        )
+    return mx_tensor
+
+
 @torch._dynamo.allow_in_graph
 class mx_mm(torch.autograd.Function):
     # There are three gemms in a forward + backward of a Linear layer:
@@ -95,20 +136,9 @@ def backward(ctx, grad_output_hp: torch.Tensor):
         )
 
         if use_fp8_dim1_cast_triton_kernel:
-            weight_mx_dim1_data, weight_mx_dim1_scale = triton_to_mxfp8_dim1(
-                weight_hp, block_size
+            weight_mx_dim1 = _triton_to_mxfp8_dim1_wrapper(
+                weight_hp, block_size, w_elem_dtype, weight_hp.dtype, gemm_kernel_choice
             )
-            weight_mx_dim1 = MXTensor(
-                weight_mx_dim1_scale.reshape(-1),
-                weight_mx_dim1_data.t(),
-                w_elem_dtype,
-                block_size,
-                weight_hp.dtype,
-                False,
-                gemm_kernel_choice,
-                False,
-            )
-
         else:
             weight_hp_t_c = weight_hp.t().contiguous()
             weight_mx_dim1 = MXTensor.to_mx(
@@ -124,18 +154,12 @@ def backward(ctx, grad_output_hp: torch.Tensor):
 
         # input_t @ grad_output = grad_weight
         if use_fp8_dim1_cast_triton_kernel:
-            grad_output_mx_dim1_data, grad_output_mx_dim1_scale = triton_to_mxfp8_dim1(
-                grad_output_hp_r, block_size
-            )
-            grad_output_mx_dim1 = MXTensor(
-                grad_output_mx_dim1_scale.reshape(-1),
-                grad_output_mx_dim1_data.t(),
-                grad_elem_dtype,
+            grad_output_mx_dim1 = _triton_to_mxfp8_dim1_wrapper(
+                grad_output_hp_r,
                 block_size,
+                grad_elem_dtype,
                 grad_output_hp_r.dtype,
-                False,
                 gemm_kernel_choice,
-                False,
             )
         else:
             grad_output_mx_dim1 = MXTensor.to_mx(
@@ -146,18 +170,12 @@ def backward(ctx, grad_output_hp: torch.Tensor):
             )
 
         if use_fp8_dim1_cast_triton_kernel:
-            input_t_mx_dim0_tmp_data, input_t_mx_dim0_tmp_scale = triton_to_mxfp8_dim1(
-                input_hp_r, block_size
-            )
-            input_t_mx_dim0_tmp = MXTensor(
-                input_t_mx_dim0_tmp_scale.reshape(-1),
-                input_t_mx_dim0_tmp_data.t(),
-                in_elem_dtype,
+            input_t_mx_dim0_tmp = _triton_to_mxfp8_dim1_wrapper(
+                input_hp_r,
                 block_size,
+                in_elem_dtype,
                 input_hp_r.dtype,
-                False,
                 gemm_kernel_choice,
-                False,
             )
             input_t_mx_dim0 = input_t_mx_dim0_tmp.t()
         else:
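
_triton_to_mxfp8_dim1_wrapper follows the standard DTensor pattern for kernels that only understand plain tensors: unwrap to the local shard with to_local(), run the local computation, then re-wrap with DTensor.from_local while preserving the original device mesh and placements (here also fixing up shape and stride because the data comes back transposed). A stripped-down sketch of that pattern under assumed conditions (torchrun launch, 1-D CUDA mesh); my_local_op is a placeholder, not a torchao function:

# Stripped-down sketch of the unwrap / compute-locally / re-wrap pattern used
# by _triton_to_mxfp8_dim1_wrapper. `my_local_op` is a placeholder for a kernel
# that only accepts plain torch.Tensor inputs.
import os

import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Shard, distribute_tensor


def my_local_op(t: torch.Tensor) -> torch.Tensor:
    return t * 2  # stand-in for the local cast/quantization kernel


mesh = init_device_mesh("cuda", (int(os.environ["WORLD_SIZE"]),))
x = distribute_tensor(torch.randn(128, 64, device="cuda"), mesh, [Shard(0)])

y_local = my_local_op(x.to_local())  # run the kernel on the local shard only
y = DTensor.from_local(  # re-wrap, keeping the original mesh and placements
    y_local,
    x.device_mesh,
    x.placements,
    run_check=False,
)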

torchao/testing/training/dtensor_utils.py

Lines changed: 2 additions & 2 deletions
@@ -151,8 +151,8 @@ def _test_lowp_mlp_tensor_parallelism_base(
         sp_model = torch.compile(sp_model)
         sp_model2 = torch.compile(sp_model2)
 
-    x_fp32 = torch.rand(size, size * 2, size, device=device, requires_grad=False)
-    go_fp32 = torch.rand(size, size * 2, size, device=device, requires_grad=False)
+    x_fp32 = torch.rand(1, size * 2, size, device=device, requires_grad=False)
+    go_fp32 = torch.rand(1, size * 2, size, device=device, requires_grad=False)
     x_fp32_tp_input = x_fp32.clone()
     go_fp32_tp = go_fp32.clone()
     x_fp32_sp_input = distribute_tensor(x_fp32.clone(), mesh, [Shard(0)])
