@@ -32,7 +32,6 @@
 
 import torch
 import torch.nn.functional as F
-from examples.infinitebench import args
 import triton
 import triton.language as tl
 from torch import nn
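(The import removed above is the name the final hunk's `**args` bug was referencing; it reads like an accidental IDE auto-import, and once the call site is fixed to `**kwargs` below, nothing uses it.)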
@@ -269,13 +268,9 @@ def __init__(
         topk_group: int,
         routed_scaling_factor: float,
         dtype: Optional[torch.dtype] = None,
-<<<<<<< HEAD
         fuse_routing_kernel: bool = True,
         apply_routing: bool = False,
-=======
-        is_thop: bool = True,
         moe_backend: str = 'CUTLASS',
->>>>>>> 14626789cf (Add TRT-LLM Gen MOE to Deepseek)
     ):
         super().__init__()
         self.weight = nn.Parameter(torch.empty((num_experts, hidden_size),
@@ -358,12 +353,9 @@ def __init__(self,
             topk_group=config.topk_group,
             routed_scaling_factor=config.routed_scaling_factor,
             dtype=dtype,
-<<<<<<< HEAD
             fuse_routing_kernel=True,
-            apply_routing=False)
-=======
+            apply_routing=False,
             moe_backend=model_config.moe_backend)
->>>>>>> 14626789cf (Add TRT-LLM Gen MOE to Deepseek)
         self.experts = FusedMoE(
             num_experts=num_experts,
             routing_method=self.gate.routing_method,
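For readability, a minimal sketch of the gate after the two routing hunks above are resolved: both HEAD parameters (`fuse_routing_kernel`, `apply_routing`) and the incoming branch's `moe_backend` survive, while `is_thop` is dropped. Only what the diff shows is reliable; the parameters ahead of `topk_group`, the attribute assignments, and the rest of the class body are assumptions here.

from typing import Optional

import torch
from torch import nn


class DeepseekV3Gate(nn.Module):
    """Sketch of the merged signature; the body beyond self.weight is assumed."""

    def __init__(
        self,
        hidden_size: int,
        num_experts: int,
        topk_group: int,
        routed_scaling_factor: float,
        dtype: Optional[torch.dtype] = None,
        fuse_routing_kernel: bool = True,  # kept from HEAD
        apply_routing: bool = False,       # kept from HEAD
        moe_backend: str = 'CUTLASS',      # kept from the incoming branch
    ):
        super().__init__()
        # One row of routing weights per expert, as in the hunk above.
        self.weight = nn.Parameter(
            torch.empty((num_experts, hidden_size), dtype=dtype))
        self.moe_backend = moe_backend

Note the small mechanical detail at the call site (second hunk): HEAD closed the call with `apply_routing=False)`, so the resolution rewrites that line with a trailing comma, letting the kept `moe_backend=model_config.moe_backend)` close the call instead; that is exactly what the single `+` line does.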
@@ -602,7 +594,7 @@ def forward(
             attn_metadata=attn_metadata,
             all_reduce_params=AllReduceParams(
                 enable_allreduce=not self.disable_attn_allreduce),
-            **args,
+            **kwargs,
         )
 
         # deepseek allreduce kernel is better when m < 512, two shot(128~512) has acc bug, waive
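The `**args` → `**kwargs` fix is the counterpart of the import removal in the first hunk: whatever `from examples.infinitebench import args` bound (a module or an argparse namespace), it is not the mapping `**` requires, so unpacking it would raise a TypeError at call time rather than forwarding the caller's keyword arguments. A self-contained sketch of the intended forwarding pattern; all names here are illustrative, not the file's:

import torch
from torch import nn


class Inner(nn.Module):

    def forward(self, x: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
        return x * scale


class Outer(nn.Module):

    def __init__(self):
        super().__init__()
        self.inner = Inner()

    def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
        # Forward the caller's keyword arguments untouched; writing **args
        # with the removed module-level import in scope would instead raise
        # "TypeError: ... argument after ** must be a mapping".
        return self.inner(x, **kwargs)


print(Outer()(torch.ones(2), scale=2.0))  # tensor([2., 2.])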