 )
 def test_fp8_scaled_mm(output_dtype, m, k_n):
     # Skip specific problematic case
-    if m == 228 and k_n == (28672, 8192):
-        pytest.skip("Skipping problematic case with m=228, k=28672, n=8192")
 
     k, n = k_n
     torch.random.manual_seed(0)
@@ -50,6 +48,10 @@ def test_fp8_scaled_mm(output_dtype, m, k_n):
     w = torch.rand(shape_w, device="cuda").to(torch.float8_e4m3fn)
     scale_x = torch.rand(1, device="cuda")
     scale_w = torch.rand(1, device="cuda")
+    if (m == 12 or m == 228) and k_n == (28672, 8192):
+        from torch.profiler import ProfilerActivity, profile
+        p = profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA
+                                ]).__enter__()
     output = torch.ops.trtllm.cublas_scaled_mm(
         x,
         w.t(),
@@ -60,7 +62,7 @@ def test_fp8_scaled_mm(output_dtype, m, k_n):
     )
     # set pytorch's cublas workspace size to 32MB to be aligned with trtllm
     old_env = os.environ.get("CUBLASLT_WORKSPACE_SIZE", "")
-    os.environ["CUBLASLT_WORKSPACE_SIZE"] = f"{32 * 1024 * 1024}"
+    os.environ["CUBLASLT_WORKSPACE_SIZE"] = f"{32 * 1024}"
     ref = torch._scaled_mm(
         x,
         w.t(),
@@ -69,6 +71,10 @@ def test_fp8_scaled_mm(output_dtype, m, k_n):
         scale_b=scale_w,
         use_fast_accum=True,
     )
+    if (m == 12 or m == 228) and k_n == (28672, 8192):
+        p.__exit__(None, None, None)
+        warn(p.key_averages().table(sort_by="self_cuda_time_total",
+                                    row_limit=-1))
     os.environ["CUBLASLT_WORKSPACE_SIZE"] = old_env
     np.testing.assert_allclose(ref.float().cpu(), output.float().cpu())
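For reference, the lines added above drive torch.profiler through explicit __enter__()/__exit__() so that the profiled region can span both the trtllm op and the torch._scaled_mm reference without re-indenting the test body. Below is a minimal standalone sketch of the same profiling pattern in the usual context-manager form; the shapes come from the problematic case in the test, and torch._scaled_mm stands in for trtllm's cublas_scaled_mm, which is not assumed to be available outside the test environment.

    # Minimal sketch of the profiling added in the diff, using the
    # context-manager form of torch.profiler. Assumptions: a CUDA build of
    # PyTorch with fp8 support; torch._scaled_mm stands in for the trtllm
    # cublas_scaled_mm op.
    import torch
    from warnings import warn
    from torch.profiler import ProfilerActivity, profile

    m, k, n = 228, 28672, 8192  # the problematic shape from the test
    x = torch.rand((m, k), device="cuda").to(torch.float8_e4m3fn)
    w = torch.rand((n, k), device="cuda").to(torch.float8_e4m3fn)
    scale_x = torch.rand(1, device="cuda")
    scale_w = torch.rand(1, device="cuda")

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as p:
        out = torch._scaled_mm(x, w.t(),
                               scale_a=scale_x, scale_b=scale_w,
                               out_dtype=torch.bfloat16,
                               use_fast_accum=True)
        torch.cuda.synchronize()  # let the kernels finish inside the profiled region

    # Same reporting call as the diff: dump per-kernel CUDA time as a warning.
    warn(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))

One difference worth noting: the context-manager form always stops the profiler when the block exits, whereas the explicit __exit__ call in the diff only runs if no exception is raised between the two GEMM calls.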