
Commit ce8cd8e

Register choose_qparams_affine_float8 as custom op

1 parent 6dfba04
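
Why this matters: a plain Python helper gets inlined when the model is traced, so torch.export would only see the decomposed aten ops (abs, max, div); registering the helper as a custom op keeps a single opaque torch.ops.torchao.choose_qparams_affine_float8.default node in the exported graph that downstream passes can pattern-match. Below is a minimal sketch of the underlying PyTorch mechanism, using a hypothetical mylib::choose_scale_f8 op; torchao wires this up through its own register_custom_op helper, not this exact decorator.

import torch
from typing import List

# Hypothetical op name, for illustration only (requires PyTorch >= 2.4).
@torch.library.custom_op("mylib::choose_scale_f8", mutates_args=())
def choose_scale_f8(tensor: torch.Tensor, block_size: List[int]) -> torch.Tensor:
    # Tensorwise float8 scale: max(|t|) / largest representable float8 value.
    quant_max = torch.finfo(torch.float8_e4m3fn).max
    return (tensor.abs().max() / quant_max).to(torch.float32)

# Fake (meta) kernel so the op can be traced and exported without real data.
@choose_scale_f8.register_fake
def _(tensor, block_size):
    return torch.empty((), dtype=torch.float32, device=tensor.device)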

File tree

- test/dtypes/test_affine_quantized_float.py
- test/integration/test_integration.py
- torchao/quantization/quant_primitives.py

3 files changed: +29 -3 lines changed

test/dtypes/test_affine_quantized_float.py
Lines changed: 1 addition & 1 deletion

@@ -356,7 +356,7 @@ def test_mm_float8dq_per_row(
     )
     @common_utils.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
     @common_utils.parametrize("output_dtype", [torch.float32, torch.bfloat16])
-    @common_utils.parametrize("block_size", [None, (1, 32), (2, 16), (4, 8)])
+    @common_utils.parametrize("block_size", [(), (1, 32), (2, 16), (4, 8)])
     def test_dequantize_affine_float8(self, float8_dtype, output_dtype, block_size):
         """Test _dequantize_affine_float8 with various configurations"""

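The parametrization swaps the None sentinel for an empty tuple: once the helper is registered as a custom op, its schema takes a required List[int] block_size (see the quant_primitives.py hunk below), and an empty size list now signals tensorwise scaling. Illustrative call shapes, assuming the private helper is invoked directly (internal API; exact behavior of the blockwise branch is not shown in this diff):

import torch
from torchao.quantization.quant_primitives import _choose_qparams_affine_float8

x = torch.randn(4, 8)
scale = _choose_qparams_affine_float8(x, block_size=[])     # tensorwise, formerly block_size=None
scale_b = _choose_qparams_affine_float8(x, block_size=[2, 4])  # blockwise else-branch
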
test/integration/test_integration.py
Lines changed: 25 additions & 0 deletions

@@ -14,6 +14,7 @@

 import torch
 import torch.nn as nn
+from torch.testing import FileCheck
 from parameterized import parameterized
 from torch._dynamo import config
 from torch._inductor.utils import run_and_get_code
@@ -41,6 +42,7 @@
     change_linear_weights_to_int4_woqtensors,
     change_linear_weights_to_int8_dqtensors,
     change_linear_weights_to_int8_woqtensors,
+    Float8DynamicActivationFloat8WeightConfig,
     int4_weight_only,
     int8_dynamic_activation_int4_weight,
     int8_dynamic_activation_int8_weight,
@@ -2077,6 +2079,29 @@ def forward(self, x):
         self.assertTrue(torch.ops.torchao.quantize_affine.default in targets)
         self.assertFalse(torch.ops.aten.narrow.default in targets)

+    @unittest.skipIf(
+        not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
+    )
+    def test_export_float8(self):
+        class SimpleNetwork(torch.nn.Module):
+            def __init__(self):
+                super(SimpleNetwork, self).__init__()
+                self.linear = torch.nn.Linear(in_features=32, out_features=16, bias=False)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        model = SimpleNetwork().eval().cuda()
+        inp = torch.randn(2, 32).cuda()
+        config = Float8DynamicActivationFloat8WeightConfig()
+        quantize_(model, config)
+
+        ep = torch.export.export(model, (inp,))
+        print(ep)
+        FileCheck().check_count("torch.ops.torchao.choose_qparams_affine_float8.default", 1, exactly=True).run(
+            str(ep.graph)
+        )
+

 class TestUtils(unittest.TestCase):
     @parameterized.expand(
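
The new test quantizes a small linear model with Float8DynamicActivationFloat8WeightConfig, exports it, and uses FileCheck to assert that exactly one choose_qparams_affine_float8 node survives in the exported graph (presumably the runtime activation-scale computation) instead of being decomposed away. A standalone sketch of the same workflow outside unittest; it assumes a CUDA GPU with compute capability >= 8.9 (e.g. RTX 4090, L4, H100), and import paths may differ across torchao versions:

import torch
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, quantize_

model = torch.nn.Sequential(torch.nn.Linear(32, 16, bias=False)).eval().cuda()
quantize_(model, Float8DynamicActivationFloat8WeightConfig())

ep = torch.export.export(model, (torch.randn(2, 32).cuda(),))
targets = {n.target for n in ep.graph.nodes if n.op == "call_function"}
assert torch.ops.torchao.choose_qparams_affine_float8.default in targets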

torchao/quantization/quant_primitives.py
Lines changed: 3 additions & 2 deletions

@@ -2178,11 +2178,12 @@ def _dequantize_affine_floatx(
     return tensor


+@register_custom_op
 def _choose_qparams_affine_float8(
     tensor: torch.Tensor,
+    block_size: List[int],
     float8_dtype: torch.dtype = torch.float8_e4m3fn,
     scale_dtype: torch.dtype = torch.float32,
-    block_size: Optional[Tuple[int, ...]] = None,
 ) -> torch.Tensor:
     """
     Calculates float8 scaling factor for the given high precision tensor, using tensorwise granularity.
@@ -2195,7 +2196,7 @@ def _choose_qparams_affine_float8(
     """
     quant_max = torch.finfo(float8_dtype).max
     # only tensorwise scaling is supported for now:
-    if block_size is None:
+    if len(block_size) == 0:
         max_abs = tensor.abs().max()
         scale = max_abs / quant_max
     else:

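For the tensorwise branch shown above, the scale is simply max(|t|) / finfo(float8_dtype).max, i.e. the largest observed magnitude is mapped onto the largest representable float8 value. A self-contained sketch of that arithmetic (torchao's real implementation also applies scale_dtype and has a blockwise else-branch not shown in the hunk):

import torch

def tensorwise_float8_scale(
    t: torch.Tensor,
    float8_dtype: torch.dtype = torch.float8_e4m3fn,
    scale_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    # finfo(float8_e4m3fn).max == 448.0; finfo(float8_e5m2).max == 57344.0
    quant_max = torch.finfo(float8_dtype).max
    return (t.abs().max() / quant_max).to(scale_dtype)

x = torch.randn(32, 32)
s = tensorwise_float8_scale(x)
x_f8 = (x / s).to(torch.float8_e4m3fn)  # the quantize step, conceptually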