
Commit b46fde8

Merge branch 'main' into yunhsuanc/offload_ptable_draft
2 parents 27e4969 + 5bdf997 commit b46fde8

22 files changed: +766 −102 lines

cpp/tensorrt_llm/common/cudaFp8Utils.cu (+40 −9)

@@ -16,6 +16,7 @@
 #include "tensorrt_llm/common/cudaFp8Utils.h"
 #include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/common/envUtils.h"
 #include "tensorrt_llm/common/reduceKernelUtils.cuh"
 #include <algorithm>
 #include <cstdio>
@@ -40,6 +41,10 @@ __inline__ __device__ float scale(float a, float b)
 template <QuantizeMode QUANTIZE_MODE, bool QUANTIZE, typename T_OUT, typename T_S, typename T_IN>
 __global__ void scaleMatrix(T_OUT* output, T_S const* input_scale, T_IN const* input, int64_t numel, int64_t lda)
 {
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+    asm volatile("griddepcontrol.wait;");
+#endif
+
     for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < numel; i += blockDim.x * gridDim.x)
     {

@@ -56,6 +61,9 @@ __global__ void scaleMatrix(T_OUT* output, T_S const* input_scale, T_IN const* i
             output[i] = T_OUT(scale<QUANTIZE>(static_cast<float>(input[i]), static_cast<float>(input_scale[0])));
         }
     }
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+    asm volatile("griddepcontrol.launch_dependents;");
+#endif
 }

 template <typename T_OUT, typename T_S, typename T_IN>
@@ -64,18 +72,30 @@ void invokeQuantizeMatrix(T_OUT* output, T_S const* input_scale, T_IN const* inp
 {
     dim3 grid(1024);
     dim3 block(CTA_SIZE);
+    cudaLaunchConfig_t config;
+    config.gridDim = grid;
+    config.blockDim = block;
+    config.dynamicSmemBytes = 0;
+    config.stream = stream;
+    cudaLaunchAttribute attrs[1];
+    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+    attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
+    config.numAttrs = 1;
+    config.attrs = attrs;
     if (quantize_mode == QuantizeMode::PER_CHANNEL)
     {
-        scaleMatrix<QuantizeMode::PER_CHANNEL, true>
-            <<<grid, block, 0, stream>>>(output, input_scale, input, numel, lda);
+        cudaLaunchKernelEx(&config, scaleMatrix<QuantizeMode::PER_CHANNEL, true, T_OUT, T_S, T_IN>, output, input_scale,
+            input, numel, lda);
     }
     else if (quantize_mode == QuantizeMode::PER_TOKEN)
     {
-        scaleMatrix<QuantizeMode::PER_TOKEN, true><<<grid, block, 0, stream>>>(output, input_scale, input, numel, lda);
+        cudaLaunchKernelEx(&config, scaleMatrix<QuantizeMode::PER_TOKEN, true, T_OUT, T_S, T_IN>, output, input_scale,
+            input, numel, lda);
     }
     else if (quantize_mode == QuantizeMode::PER_TENSOR)
    {
-        scaleMatrix<QuantizeMode::PER_TENSOR, true><<<grid, block, 0, stream>>>(output, input_scale, input, numel, lda);
+        cudaLaunchKernelEx(&config, scaleMatrix<QuantizeMode::PER_TENSOR, true, T_OUT, T_S, T_IN>, output, input_scale,
+            input, numel, lda);
     }
     sync_check_cuda_error(stream);
 }
@@ -86,19 +106,30 @@ void invokeDequantizeMatrix(T_OUT* output, T_S const* input_scale, T_IN const* i
 {
     dim3 grid(1024);
     dim3 block(CTA_SIZE);
+    cudaLaunchConfig_t config;
+    config.gridDim = grid;
+    config.blockDim = block;
+    config.dynamicSmemBytes = 0;
+    config.stream = stream;
+    cudaLaunchAttribute attrs[1];
+    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+    attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
+    config.numAttrs = 1;
+    config.attrs = attrs;
     if (quantize_mode == QuantizeMode::PER_CHANNEL)
     {
-        scaleMatrix<QuantizeMode::PER_CHANNEL, false>
-            <<<grid, block, 0, stream>>>(output, input_scale, input, numel, lda);
+        cudaLaunchKernelEx(&config, scaleMatrix<QuantizeMode::PER_CHANNEL, false, T_OUT, T_S, T_IN>, output,
+            input_scale, input, numel, lda);
     }
     else if (quantize_mode == QuantizeMode::PER_TOKEN)
     {
-        scaleMatrix<QuantizeMode::PER_TOKEN, false><<<grid, block, 0, stream>>>(output, input_scale, input, numel, lda);
+        cudaLaunchKernelEx(&config, scaleMatrix<QuantizeMode::PER_TOKEN, false, T_OUT, T_S, T_IN>, output, input_scale,
+            input, numel, lda);
     }
     else if (quantize_mode == QuantizeMode::PER_TENSOR)
     {
-        scaleMatrix<QuantizeMode::PER_TENSOR, false>
-            <<<grid, block, 0, stream>>>(output, input_scale, input, numel, lda);
+        cudaLaunchKernelEx(&config, scaleMatrix<QuantizeMode::PER_TENSOR, false, T_OUT, T_S, T_IN>, output, input_scale,
+            input, numel, lda);
     }
     sync_check_cuda_error(stream);
 }

cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h (+29 −6)

@@ -20,6 +20,7 @@
 #include "tensorrt_llm/common/assert.h"
 #include "tensorrt_llm/common/cudaTypeUtils.cuh"
 #include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/common/envUtils.h"
 #include "tensorrt_llm/common/reduceKernelUtils.cuh"
 #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h"
 #include "tensorrt_llm/kernels/gptKernels.h"
@@ -778,6 +779,9 @@ __global__ void applyBiasRopeUpdateKVCacheV2(QKVPreprocessingParams<T, KVCacheBu

     // Head idx.
     int const head_idx = blockIdx.y;
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+    asm volatile("griddepcontrol.wait;");
+#endif

     // Variable sequence length.
     bool const variable_sequence_length = params.tokens_info != nullptr && params.cu_seq_lens != nullptr;
@@ -1093,6 +1097,9 @@ __global__ void applyBiasRopeUpdateKVCacheV2(QKVPreprocessingParams<T, KVCacheBu
             params.fmha_bmm2_scale[0] = o_scale_orig_quant * kv_scale_quant_orig;
         }
     }
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+    asm volatile("griddepcontrol.launch_dependents;");
+#endif
 }

 // Use more blocks for the batch dimension in the generation phase.
@@ -1255,22 +1262,38 @@ void kernelV1Dispatch(QKVPreprocessingParams<T, KVCacheBuffer> params, cudaStrea
     dim3 grid(1, params.head_num); \
     int num_blocks_for_tokens = int(divUp(params.token_num, tokens_per_cuda_block)); \
     calGridSizeWithBestEfficiency(block, grid, num_blocks_for_tokens, params.multi_processor_count, 1024); \
+    cudaLaunchConfig_t config; \
+    config.gridDim = grid; \
+    config.blockDim = block; \
+    config.dynamicSmemBytes = 0; \
+    config.stream = stream; \
+    cudaLaunchAttribute attrs[1]; \
+    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; \
+    attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL(); \
+    config.numAttrs = 1; \
+    config.attrs = attrs; \
     if (params.position_embedding_type == PositionEmbeddingType::kROPE_GPT_NEOX \
         || params.position_embedding_type == PositionEmbeddingType::kLONG_ROPE \
         || params.position_embedding_type == PositionEmbeddingType::kROPE_M) \
     { \
-        applyBiasRopeUpdateKVCacheV2<T, TCache, BLOCK_SIZE, Dh, ADD_BIAS, STORE_QKV, FP8_OUTPUT, GEN_PHASE, \
-            KVCacheBuffer, RotaryPositionEmbeddingType::GPT_NEOX><<<grid, block, 0, stream>>>(params); \
+        cudaLaunchKernelEx(&config, \
+            applyBiasRopeUpdateKVCacheV2<T, TCache, BLOCK_SIZE, Dh, ADD_BIAS, STORE_QKV, FP8_OUTPUT, GEN_PHASE, \
+                KVCacheBuffer, RotaryPositionEmbeddingType::GPT_NEOX>, \
+            params); \
     } \
     else if (params.position_embedding_type == PositionEmbeddingType::kROPE_GPTJ) \
     { \
-        applyBiasRopeUpdateKVCacheV2<T, TCache, BLOCK_SIZE, Dh, ADD_BIAS, STORE_QKV, FP8_OUTPUT, GEN_PHASE, \
-            KVCacheBuffer, RotaryPositionEmbeddingType::GPTJ><<<grid, block, 0, stream>>>(params); \
+        cudaLaunchKernelEx(&config, \
+            applyBiasRopeUpdateKVCacheV2<T, TCache, BLOCK_SIZE, Dh, ADD_BIAS, STORE_QKV, FP8_OUTPUT, GEN_PHASE, \
+                KVCacheBuffer, RotaryPositionEmbeddingType::GPTJ>, \
+            params); \
     } \
     else \
    { \
-        applyBiasRopeUpdateKVCacheV2<T, TCache, BLOCK_SIZE, Dh, ADD_BIAS, STORE_QKV, FP8_OUTPUT, GEN_PHASE, \
-            KVCacheBuffer, RotaryPositionEmbeddingType::NONE><<<grid, block, 0, stream>>>(params); \
+        cudaLaunchKernelEx(&config, \
+            applyBiasRopeUpdateKVCacheV2<T, TCache, BLOCK_SIZE, Dh, ADD_BIAS, STORE_QKV, FP8_OUTPUT, GEN_PHASE, \
+                KVCacheBuffer, RotaryPositionEmbeddingType::NONE>, \
+            params); \
     }

 #define STORE_QKV_AND_FP8_OUTPUT_DISPATCH(ADD_BIAS, GEN_PHASE) \
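
Both files above apply the same Programmatic Dependent Launch (PDL) pattern: the kernel brackets its body with griddepcontrol.wait / griddepcontrol.launch_dependents on sm_90+, and the host launches through cudaLaunchKernelEx with the programmatic-stream-serialization attribute gated on tensorrt_llm::common::getEnvEnablePDL(). For reference, a minimal standalone sketch of that pattern follows; the kernel, launch dimensions, and enablePDL flag here are illustrative assumptions, not code from this commit.

#include <cuda_runtime.h>
#include <cstdint>

// Illustrative kernel: scales n floats by s, bracketed by PDL grid-dependency controls.
__global__ void scaleKernelPDL(float* out, float const* in, float s, int64_t n)
{
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    // Block until the upstream kernel signals that its results are visible.
    asm volatile("griddepcontrol.wait;");
#endif
    for (int64_t i = threadIdx.x + blockIdx.x * static_cast<int64_t>(blockDim.x); i < n;
         i += static_cast<int64_t>(blockDim.x) * gridDim.x)
    {
        out[i] = in[i] * s;
    }
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    // Let the next kernel on the stream start launching before this grid fully retires.
    asm volatile("griddepcontrol.launch_dependents;");
#endif
}

// Host-side launch mirroring the cudaLaunchKernelEx setup used in this commit.
void launchScaleWithPDL(float* out, float const* in, float s, int64_t n, cudaStream_t stream, bool enablePDL)
{
    cudaLaunchConfig_t config = {};
    config.gridDim = dim3(1024);
    config.blockDim = dim3(256);
    config.dynamicSmemBytes = 0;
    config.stream = stream;
    cudaLaunchAttribute attrs[1];
    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
    attrs[0].val.programmaticStreamSerializationAllowed = enablePDL ? 1 : 0;
    config.numAttrs = 1;
    config.attrs = attrs;
    cudaLaunchKernelEx(&config, scaleKernelPDL, out, in, s, n);
}

When the attribute is left at 0, cudaLaunchKernelEx behaves like an ordinary stream launch, and on pre-sm_90 architectures the griddepcontrol instructions are compiled out, so the same code path remains correct everywhere.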

cpp/tensorrt_llm/thop/CMakeLists.txt (+1)

@@ -52,6 +52,7 @@ add_library(
   fp4BatchedQuantize.cpp
   fp8BlockScalingGemm.cpp
   fp8Quantize.cpp
+  fusedTopkSoftmax.cpp
   gatherTreeOp.cpp
   logitsBitmaskOp.cpp
   mambaConv1dOp.cpp
cpp/tensorrt_llm/thop/fusedTopkSoftmax.cpp (new file, +72)

@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/common/workspace.h"
+#include "tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_gemm_kernels.h"
+#include "tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_kernels.h"
+#include "tensorrt_llm/runtime/torchUtils.h"
+#include "tensorrt_llm/thop/thUtils.h"
+
+#include <ATen/cuda/EmptyTensor.h>
+
+#include <cuda_fp16.h>
+
+#include <cstdint>
+
+namespace torch_ext
+{
+
+std::tuple<torch::Tensor, torch::Tensor> fused_topk_softmax(torch::Tensor const& router_logits, int64_t const top_k,
+    int64_t const num_experts_total, int64_t const start_expert, int64_t const end_expert)
+{
+    // TODO: enable once the kernel has been added to the internal CUTLASS library.
+    TLLM_CHECK_WITH_INFO(false, "Fused topk/softmax op has not been enabled yet.");
+
+    CHECK_INPUT(router_logits, torch::kBFloat16);
+
+    auto const& router_logits_shape = router_logits.sizes();
+    auto const& rank = router_logits_shape.size();
+
+    TORCH_CHECK(rank == 2, "router_logits should be 2D tensor.");
+    int64_t const num_rows = router_logits_shape[0];
+
+    auto token_final_scales
+        = torch::empty({num_rows, top_k}, torch::dtype(torch::kFloat32).device(router_logits.device()));
+    auto token_selected_experts
+        = torch::empty({num_rows, top_k}, torch::dtype(torch::kInt32).device(router_logits.device()));
+
+    // auto stream = at::cuda::getCurrentCUDAStream(router_logits.get_device());
+    // tensorrt_llm::kernels::topkGatingSoftmaxKernelLauncher(
+    //     static_cast<__nv_bfloat16 const*>(router_logits.const_data_ptr()),
+    //     static_cast<float*>(token_final_scales.data_ptr()), static_cast<int*>(token_selected_experts.data_ptr()),
+    //     num_rows, top_k, num_experts_total, start_expert, end_expert, stream);
+    return {token_final_scales, token_selected_experts};
+}
+} // namespace torch_ext
+
+TORCH_LIBRARY_FRAGMENT(trtllm, m)
+{
+    m.def(
+        "fused_topk_softmax(Tensor router_logits, int top_k, "
+        "int num_experts_total, int start_expert, "
+        "int end_expert) -> (Tensor, Tensor) ");
+}
+
+TORCH_LIBRARY_IMPL(trtllm, CUDA, m)
+{
+    m.impl("fused_topk_softmax", &torch_ext::fused_topk_softmax);
+}
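
The new op is registered in the trtllm Torch library but is currently stubbed out: the TLLM_CHECK_WITH_INFO(false, ...) at the top fires before the commented-out kernel launcher, pending the kernel landing in the internal CUTLASS library. Below is a hedged sketch of how a caller might use it once enabled; the wrapper function, the top_k value, and the single-rank expert range are assumptions for illustration, not part of this commit.

#include <torch/torch.h>
#include <cstdint>
#include <tuple>

namespace torch_ext
{
// Declaration matching the definition in fusedTopkSoftmax.cpp above.
std::tuple<torch::Tensor, torch::Tensor> fused_topk_softmax(torch::Tensor const& router_logits, int64_t top_k,
    int64_t num_experts_total, int64_t start_expert, int64_t end_expert);
} // namespace torch_ext

std::tuple<torch::Tensor, torch::Tensor> routeTokens(torch::Tensor const& router_logits)
{
    // router_logits: [num_tokens, num_experts] bfloat16 tensor on a CUDA device.
    int64_t const top_k = 2;                          // illustrative value
    int64_t const num_experts_total = router_logits.size(1);
    int64_t const start_expert = 0;                   // single-rank case: this rank owns all experts
    int64_t const end_expert = num_experts_total;
    // Returns (token_final_scales [num_tokens, top_k] fp32, token_selected_experts [num_tokens, top_k] int32).
    return torch_ext::fused_topk_softmax(router_logits, top_k, num_experts_total, start_expert, end_expert);
}

From Python, the same entry point would surface as torch.ops.trtllm.fused_topk_softmax once the extension library is loaded.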

examples/mllama/requirements.txt (+1)

@@ -1 +1,2 @@
 nvidia-modelopt[torch]~=0.21.0
+transformers==4.48.3

examples/pytorch/README.md (+1)

@@ -55,6 +55,7 @@ python3 quickstart_multimodal.py --model_dir Efficient-Large-Model/NVILA-8B --mo
 | `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B` | L |
 | `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B` | L |
 | `Qwen2VLForConditionalGeneration` | Qwen2-VL | `Qwen/Qwen2-VL-7B-Instruct` | L + V |
+| `Llama4ForConditionalGeneration` | Llama 4 | `meta-llama/Llama-4-Scout-17B-16E-Instruct` | L |

 Note:
 - L: Language only

requirements.txt (+2 −2)

@@ -23,10 +23,10 @@ tensorrt~=10.8.0
 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-01.html#rel-25-01 uses 2.6.0a0.
 torch>=2.6.0a0,<=2.6.0
 torchvision
-nvidia-modelopt[torch]~=0.25.0
+nvidia-modelopt[torch]~=0.27.0
 nvidia-nccl-cu12
 nvidia-cuda-nvrtc-cu12
-transformers==4.48.3
+transformers==4.51.0
 pydantic>=2.9.1
 pillow==10.3.0
 wheel

tensorrt_llm/_torch/model_config.py (+2 −2)

@@ -89,8 +89,8 @@ def from_pretrained(cls,
         # Find the cache path by looking for the config.json file which should be in all
         # huggingface models
         model_dir = Path(
-            transformers.file_utils.get_file_from_repo(checkpoint_dir,
-                                                       'config.json')).parent
+            transformers.utils.hub.cached_file(checkpoint_dir,
+                                               'config.json')).parent
         quant_config = QuantConfig()
         layer_quant_config = None
         # quantized ckpt in modelopt format
