
Commit dc9937c

pamelap-nvidia authored and schetlur-nv committed
feat: Add FP8 support for SM 120 (NVIDIA#3248)
* Allow FP8 on SM120
* fix sm121
* fix
* fix pre-commit
* review update

---------

Signed-off-by: Pamela Peng <[email protected]>
Co-authored-by: Sharan Chetlur <[email protected]>
Signed-off-by: Luis Vega <[email protected]>
1 parent a09aa4f commit dc9937c

26 files changed (+70, -60 lines)

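Every file below applies the same gating rule: SM 120/121 (GB20x, GeForce Blackwell) is routed onto the Ada (SM 89) FP8 code paths, while Hopper- and SM 100-only features (FP4, programmatic dependent launch, the fused lamport all-reduce) are fenced off from it. A minimal sketch of that rule, assuming only an SM-version query such as tensorrt_llm::common::getSMVersion(); the helper names are illustrative and are not part of this commit, which writes the conditions inline:

// Illustrative helpers only; sm is the compute capability as an integer (89, 90, 100, 120, ...).
inline bool useAdaStyleFp8Kernels(int sm)
{
    // GeForce Blackwell reuses the Ada (SM 89) FP8 GEMM/FMHA kernels.
    return sm == 89 || sm >= 120;
}

inline bool hasHopperClassFeatures(int sm)
{
    // PDL intrinsics, lamport fused all-reduce, FP4, etc. remain limited to SM 90..11x.
    return sm >= 90 && sm < 120;
}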

cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h

+1-1
@@ -383,7 +383,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
 static_assert(!FP4, "FP4 Tests enabled on unsupported CUDA version");
 #endif
 bool should_skip_unsupported_fp8 = getSMVersion() < 89 && FP8;
-bool should_skip_unsupported_fp4 = getSMVersion() < 100 && FP4;
+bool should_skip_unsupported_fp4 = (getSMVersion() < 100 || getSMVersion() >= 120) && FP4;
 return should_skip_unsupported_fp8 || should_skip_unsupported_fp4;
 }

cpp/tensorrt_llm/common/attentionOp.cpp

+4-4
@@ -209,7 +209,7 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams&
 xqaParams.kv_cache_data_type = xqaParams.data_type;
 }
 if (xqaParams.kv_cache_data_type == DATA_TYPE_INT8
-|| (xqaParams.kv_cache_data_type == DATA_TYPE_E4M3 && mSM < kSM_90))
+|| (xqaParams.kv_cache_data_type == DATA_TYPE_E4M3 && (mSM < kSM_90 || mSM >= kSM_120)))
 {
 xqaParams.multi_block_mode = false;
 }
@@ -2276,8 +2276,8 @@ int AttentionOp::initialize() noexcept
 if (mFP8ContextFMHA)
 {
 TLLM_CHECK_WITH_INFO(mEnableContextFMHA, "FP8 FMHA cannot be enabled because Context FMHA is not supported.");
-TLLM_CHECK_WITH_INFO(
-mSM == 89 || mSM == 90 || mSM == 100, "FP8 FMHA can only be enabled on sm_89, sm_90 or sm_100.");
+TLLM_CHECK_WITH_INFO(mSM == 89 || mSM == 90 || mSM == 100 || mSM == 120,
+"FP8 FMHA can only be enabled on sm_89, sm_90, sm_100 or sm_120.");
 }

 // Pre-Check of FP8 Generation MLA.
@@ -2290,7 +2290,7 @@

 // Check requirements for FP4 output.
 TLLM_CHECK_WITH_INFO(!mFuseFp4Quant || mEnableContextFMHA, "Context FMHA must enable if fuse_fp4_quant is enabled");
-TLLM_CHECK_WITH_INFO(!mFuseFp4Quant || (mSM >= 100), "fuse_fp4_quant only supports SM100 and later devices.");
+TLLM_CHECK_WITH_INFO(!mFuseFp4Quant || mSM == 100, "fuse_fp4_quant only supports SM100 devices.");

 TLLM_CHECK(isRoPE() == (mRotaryEmbeddingDim != 0));
 TLLM_CHECK_WITH_INFO((mSM >= 80) || (mType != nvinfer1::DataType::kBF16),
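Taken together, the two attention changes above amount to the following predicates; these are hypothetical free functions for illustration only, while AttentionOp keeps the conditions inline:

// Illustrative only. sm is the device compute capability (kSM_90 == 90, kSM_120 == 120).
inline bool fp8FmhaSupported(int sm)
{
    return sm == 89 || sm == 90 || sm == 100 || sm == 120;
}

inline bool xqaFp8KvCacheMultiBlockSupported(int sm)
{
    // Multi-block XQA with an E4M3 KV cache stays disabled outside SM 90..11x.
    return sm >= 90 && sm < 120;
}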

cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp

+2-2
@@ -415,7 +415,7 @@ void FusedMHARunnerV2::setupLaunchParams(MHARunnerParams runnerParams)
 mLaunchParams.kernel_s = 0;
 mLaunchParams.force_unroll = true;
 // enable tiled kernels on Ampere/Ada
-if (isSm89 && mFixedParams.dataType == DATA_TYPE_E4M3)
+if ((isSm89 || isSm120) && mFixedParams.dataType == DATA_TYPE_E4M3)
 {
 // so far Ada QMMA only supports non-tiled kernels.
 mLaunchParams.granular_tiling = false;
@@ -427,7 +427,7 @@
 // can suffer from tile quantization loss therefore use flash attention non-tiled instead
 mLaunchParams.granular_tiling = false;
 }
-else if (isSm8x && mFixedParams.headSize < 256)
+else if ((isSm8x || isSm120) && mFixedParams.headSize < 256)
 {
 // flash attention tiled kernel is faster on Ada and Ampere derivatives when head_size>=256
 mLaunchParams.granular_tiling = false;

cpp/tensorrt_llm/kernels/customAllReduceKernels.cu

+14-14
@@ -266,7 +266,7 @@ __global__ void rms_norm_kernel(AllReduceParams params)
 local_final_output_buffer += block_offset;
 intermediate_buffer += block_offset;

-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
 cudaGridDependencySynchronize();
 #endif

@@ -309,7 +309,7 @@ __global__ void rms_norm_kernel(AllReduceParams params)
 inter_vec.packed = rms_norm<T, Affine>(denom, inter_vec, weight_vec);
 *reinterpret_cast<int4*>(&local_final_output_buffer[offset]) = inter_vec.packed;
 }
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
 cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
@@ -340,7 +340,7 @@ __global__ void rms_pre_post_norm_kernel(AllReduceParams params) // for gemma2 p
 local_final_output_buffer += block_offset;
 intermediate_buffer += block_offset;

-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
 cudaGridDependencySynchronize();
 #endif

@@ -393,7 +393,7 @@ __global__ void rms_pre_post_norm_kernel(AllReduceParams params) // for gemma2 p
 inter_vec.packed = rms_norm<T, Affine>(denom, inter_vec, weight_vec);
 *reinterpret_cast<int4*>(&local_final_output_buffer[offset]) = inter_vec.packed;
 }
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
 cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
@@ -744,7 +744,7 @@ struct Reducer<T, RanksPerNode, false>
 template <int ClusterSize, typename T, int RanksPerNode, bool Bias = false, bool Affine = false, bool PushMode = true>
 static __global__ void lamport_style_one_shot_all_reduce_norm_kernel(AllReduceParams params)
 {
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
 namespace cg = cooperative_groups;
 static_assert(RanksPerNode <= MAX_RANKS_PER_NODE);
 static constexpr int kPackedSize = details::kBytesPerAccess / sizeof(T);
@@ -937,7 +937,7 @@ static __global__ void __launch_bounds__(1024, 1) one_shot_all_reduce_norm_kerne
 buffers[ii] = reinterpret_cast<T*>(params.peer_comm_buffer_ptrs[rank]);
 }

-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
 cudaGridDependencySynchronize();
 #endif

@@ -1001,7 +1001,7 @@ static __global__ void __launch_bounds__(1024, 1) one_shot_all_reduce_norm_kerne
 *reinterpret_cast<int4*>(&local_final_output_buffer[norm_offset + offset]) = sum_vec.packed;
 }
 }
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
 cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
@@ -1044,7 +1044,7 @@ static __global__ void __launch_bounds__(1024, 1) one_shot_prenorm_all_reduce_no
 buffers[ii] = reinterpret_cast<T*>(params.peer_comm_buffer_ptrs[rank]);
 }

-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
 cudaGridDependencySynchronize();
 #endif

@@ -1114,7 +1114,7 @@ static __global__ void __launch_bounds__(1024, 1) one_shot_prenorm_all_reduce_no
 sum_vec.packed = rms_norm<T, Affine>(denom, sum_vec, weight_vec);
 *reinterpret_cast<int4*>(&local_final_output_buffer[norm_offset + thread_offset]) = sum_vec.packed;
 }
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
 cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
@@ -1128,7 +1128,7 @@ bool is_lamport_supported(int token_num, int hidden_size)
 if (disableLamportReduceNormFusion)
 return false;
 static int sm = tensorrt_llm::common::getSMVersion();
-if (sm < 90)
+if (sm < 90 || sm >= 120)
 {
 return false;
 }
@@ -1355,7 +1355,7 @@ static __global__ void oneShotAllReduceKernel(AllReduceParams params)
 buffers[ii] = reinterpret_cast<T*>(params.peer_comm_buffer_ptrs[rank]);
 }

-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
 cudaGridDependencySynchronize();
 #endif

@@ -1424,7 +1424,7 @@ static __global__ void oneShotAllReduceKernel(AllReduceParams params)
 *reinterpret_cast<int4*>(&local_output_buffer[iter_offset]) = sums.packed;
 }

-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
 cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
@@ -1497,7 +1497,7 @@ static __global__ void __launch_bounds__(512, 1) twoShotAllReduceKernel(AllReduc
 buffers[ii] = reinterpret_cast<T*>(params.peer_comm_buffer_ptrs[rank]);
 }

-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
 cudaGridDependencySynchronize();
 #endif

@@ -1631,7 +1631,7 @@ static __global__ void __launch_bounds__(512, 1) twoShotAllReduceKernel(AllReduc
 }
 }

-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
 cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
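All of the guards above follow one pattern: the programmatic dependent launch (PDL) intrinsics are compiled only for sm_90 through sm_11x, so GB20x builds fall back to ordinary kernel boundaries. A minimal sketch of that pattern, using a hypothetical kernel and macro name (the file writes the preprocessor condition out inline each time):

#include <cuda_runtime.h>

// Hypothetical convenience macro mirroring the guard added in this commit.
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDA_ARCH__ < 1200))
#define SKETCH_HAS_PDL 1
#else
#define SKETCH_HAS_PDL 0
#endif

__global__ void sketchKernel(float* data)
{
#if SKETCH_HAS_PDL
    // Wait for the upstream grid to reach its completion point.
    cudaGridDependencySynchronize();
#endif
    data[threadIdx.x] += 1.0f;
#if SKETCH_HAS_PDL
    // Allow a dependent grid to begin its prologue early.
    cudaTriggerProgrammaticLaunchCompletion();
#endif
}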

cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp

+3-3
@@ -175,7 +175,7 @@ std::vector<CutlassTileConfig> get_candidate_tiles(
 case CutlassGemmType::Fp8:
 if (config_type_param & CutlassGemmConfig::GROUPED_GEMM)
 {
-if (sm == 89)
+if (sm == 89 || sm >= 120)
 {
 return {CutlassTileConfig::CtaShape16x256x128_WarpShape16x64x128,
 CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64,
@@ -193,7 +193,7 @@ std::vector<CutlassTileConfig> get_candidate_tiles(
 }
 else
 {
-if (sm == 89)
+if (sm == 89 || sm >= 120)
 {
 return {CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64,
 CutlassTileConfig::CtaShape64x128x64_WarpShape32x64x64,
@@ -414,7 +414,7 @@ std::vector<CutlassGemmConfig> get_candidate_configs(
 {
 return get_candidate_configs_sm90(config_type_param);
 }
-if (sm >= 100 && sm != 120 && (config_type_param & CutlassGemmConfig::BLACKWELL))
+if (sm >= 100 && sm < 120 && (config_type_param & CutlassGemmConfig::BLACKWELL))
 {
 return get_candidate_configs_sm100(config_type_param);
 }

cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.cpp

+1-1
@@ -571,7 +571,7 @@ void preprocess_weights_for_mixed_gemm(int8_t* preprocessed_quantized_weight, in
 arch = 80;
 }
 // Force use sm80 kernel for GB20x.
-if (arch == 120)
+if (arch >= 120)
 {
 arch = 80;
 }

cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_template.h

+2-2
@@ -508,7 +508,7 @@ size_t CutlassFp8RowwiseGemmRunner<T>::dispatchToArch(void* D, void const* A, vo
 return dispatchGemmToCutlassSm90<T>(D, A, B, C_bias, quantOption, m, n, k, scale_d0, scale_d1, gemmConfig,
 workspace, workspaceBytes, stream, occupancy);
 }
-else if (mSm == 89)
+else if (mSm == 89 || mSm >= 120)
 {
 return dispatchGemmToCutlassSm89<T>(D, A, B, C_bias, quantOption, m, n, k, scale_d0, scale_d1, gemmConfig,
 workspace, workspaceBytes, stream, occupancy);
@@ -574,7 +574,7 @@ std::vector<tkc::CutlassGemmConfig> CutlassFp8RowwiseGemmRunner<T>::getConfigs()
 }
 }
 }
-else if (mSm == 89)
+else if (mSm == 89 || mSm >= 120)
 {
 tkc::CutlassGemmConfig::CandidateConfigTypeParam config_type_param
 = tkc::CutlassGemmConfig::CandidateConfigTypeParam::FP8_ONLY;
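The dispatch above reduces to a small architecture selection, sketched below with illustrative names; the real runner forwards many more arguments and handles additional architectures not shown in these hunks:

// Illustrative only: which kernel family the FP8 rowwise GEMM picks per arch after this change.
enum class Fp8RowwiseKernelArch { Sm90, Sm89, Unsupported };

inline Fp8RowwiseKernelArch selectFp8RowwiseArch(int sm)
{
    if (sm == 90)
        return Fp8RowwiseKernelArch::Sm90;   // Hopper kernels
    if (sm == 89 || sm >= 120)
        return Fp8RowwiseKernelArch::Sm89;   // Ada kernels, reused on GB20x
    return Fp8RowwiseKernelArch::Unsupported; // other branches omitted in this sketch
}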

cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h

+1-1
@@ -334,7 +334,7 @@ void CutlassInt8GemmRunner<T>::dispatchToArch(int8_t const* A, int8_t const* B,
 dispatchGemmToCutlass<T, cutlass::arch::Sm75>(A, B, quantOption, alphaCol, alphaRow, C, m, n, k, workspacePtr,
 workspaceBytes, gemmConfig, stream, occupancy);
 }
-else if (mSm >= 80 && mSm <= 90)
+else if (mSm >= 80 && mSm <= 90 || mSm >= 120)
 {
 dispatchGemmToCutlass<T, cutlass::arch::Sm80>(A, B, quantOption, alphaCol, alphaRow, C, m, n, k, workspacePtr,
 workspaceBytes, gemmConfig, stream, occupancy);

cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py

+3-2
@@ -393,7 +393,7 @@ def is_grouped_gemm_op_valid(op):


 def is_op_valid(op):
-if op.arch >= 100:
+if op.arch >= 100 and op.arch < 120:
 return is_gemm_op_valid_sm100(op)

 if op.gemm_kind == GemmKind.Gemm:
@@ -666,7 +666,8 @@ def has_arch(sm):
 operations = []
 operations += generate_sm100_operations(has_arch(100))
 operations += generate_sm90_operations(has_arch(90))
-operations += generate_sm80_operations(has_arch(80) or has_arch(89))
+operations += generate_sm80_operations(
+has_arch(80) or has_arch(89) or has_arch(120))

 def should_skip(op):
 is_internal = op.gemm_kind == GemmKind.Grouped

cpp/tensorrt_llm/kernels/mambaConv1dKernels.cu

+1-1
@@ -793,7 +793,7 @@ void invokeMambaConv1dContext(MambaConv1dParamsBase& params, cudaStream_t stream

 if (std::is_same_v<input_t, float>)
 {
-if (tensorrt_llm::common::getSMVersion() >= 90)
+if (tensorrt_llm::common::getSMVersion() >= 90 && tensorrt_llm::common::getSMVersion() < 120)
 {
 if (B * L * D <= 262144)
 {

cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h

+1-1
@@ -53,7 +53,7 @@ inline void kernel_launcher(int arch, Params& params, cudaStream_t s)
 }
 else if ((arch >= 80 && arch < 90) || arch >= 100)
 {
-if (arch == 89)
+if (arch == 89 || arch >= 120)
 {
 EXEC_W4A8(KernelType::FP16Int4Groupwise, FP16DetailsA, Int4DetailsW, ColumnMajorInterleaved, true);
 EXEC_W4A8(KernelType::BF16Int4Groupwise, BF16DetailsA, Int4DetailsW, ColumnMajorInterleaved, true);

cpp/tensorrt_llm/plugins/fp4GemmPlugin/fp4GemmPlugin.cpp

+2-1
@@ -116,7 +116,8 @@ Fp4GemmPlugin::Fp4GemmPlugin(void const* data, size_t length, Fp4GemmPlugin::Plu

 void Fp4GemmPlugin::init(nvinfer1::DataType type)
 {
-TLLM_CHECK_WITH_INFO((getSMVersion() >= 100), "FP4 Gemm not supported before Blackwell");
+TLLM_CHECK_WITH_INFO((getSMVersion() >= 100 && getSMVersion() < 120),
+"FP4 Gemm not supported before Blackwell, nor GeForce Blackwell");
 TLLM_CHECK_WITH_INFO(
 (mOutputType == DataType::kBF16) || (mOutputType == DataType::kFLOAT) || (mOutputType == DataType::kHALF),
 "Only support float, half, bfloat16, got %d.", (int) mOutputType);

cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp

+4-3
@@ -389,17 +389,18 @@ int GemmPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::P
 }

 bool cudaKernelFinished = false;
+bool isArch90or100 = mArch >= 90 && mArch < 120;
 // TODO: sub tensor matmul is not supported in fp8 gemm cuda kernel
-if (mArch < 90 && M <= 4 && N <= 128000 && mUseFp8 && noPadDim && cudaKernelSupportType)
+if (!isArch90or100 && M <= 4 && N <= 128000 && mUseFp8 && noPadDim && cudaKernelSupportType)
 {
 tensorrt_llm::common::QuantMode quantMode = tensorrt_llm::common::QuantMode::fromQuantAlgo("FP8");
 tensorrt_llm::kernels::cuda_core_gemm::Params params(reinterpret_cast<void const*>(inputs[0]),
 reinterpret_cast<void const*>(inputs[1]), mAlpha, reinterpret_cast<void*>(outputs[0]), M, N, K, quantMode,
 nvinfer1::DataType::kFP8, mOutputType);
 cudaKernelFinished = tensorrt_llm::kernels::cuda_core_gemm::cudaCoreGemmDispatcher(params, stream);
 }
-else if (((mArch < 90 && M <= 6) || (mArch >= 90 && M <= 2)) && N <= 128000 && !mUseFp8 && noPadDim
-&& cudaKernelSupportType)
+else if (!isArch90or100 && ((mArch < 90 && M <= 6) || (isArch90or100 && M <= 2)) && N <= 128000 && !mUseFp8
+&& noPadDim && cudaKernelSupportType)
 {
 tensorrt_llm::common::QuantMode quantMode;
 tensorrt_llm::kernels::cuda_core_gemm::Params params(reinterpret_cast<void const*>(inputs[0]),

cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp

+3-2
@@ -257,8 +257,9 @@ void MixtureOfExpertsPlugin::init()
 "MOE plugin only supports a different output type for FP4/FP8");
 TLLM_CHECK_WITH_INFO(mType != DataType::kFP8 || tensorrt_llm::common::getSMVersion() >= 89,
 "MoE FP8 is not supported for architectures less than SM89");
-TLLM_CHECK_WITH_INFO(mType != DataType::kFP4 || tensorrt_llm::common::getSMVersion() >= 100,
-"MoE FP4 is not supported for architectures less than SM100");
+TLLM_CHECK_WITH_INFO(mType != DataType::kFP4
+|| (tensorrt_llm::common::getSMVersion() >= 100 && tensorrt_llm::common::getSMVersion() < 120),
+"MoE FP4 is only supported on architecture SM100");

 TLLM_CHECK_WITH_INFO(!hasLora() || mLoraType == mOutputType, "The LoraType need to keep same with moe OutputType.");

cpp/tests/unit_tests/kernels/mixtureOfExpertsTest.cu

+10-6
@@ -161,7 +161,7 @@ protected:
 #endif
 bool should_skip_no_device = mDeviceCount <= 0;
 bool should_skip_unsupported_fp8 = getSMVersion() < 89 && FP8;
-bool should_skip_unsupported_fp4 = getSMVersion() < 100 && FP4;
+bool should_skip_unsupported_fp4 = (getSMVersion() < 100 || getSMVersion() >= 120) && FP4;
 return should_skip_no_device || should_skip_unsupported_fp8 || should_skip_unsupported_fp4;
 }

@@ -862,7 +862,7 @@ protected:
 auto getFilteredConfigs(int sm)
 {
 auto tactics = mMoERunner.getTactics();
-if (sm == 89)
+if (sm == 89 || sm >= 120)
 {
 // Filter some unsupported configs for L40S
 auto it = std::remove_if(tactics.begin(), tactics.end(),
@@ -1308,7 +1308,8 @@ void MixtureOfExpertsTest<TypeParam_>::BasicPermuteTest(
 auto [expected_experts, token_final_scales] = populateRouting(num_experts, num_tokens, k);

 runMoEPermute(hidden_input, expected_experts, token_final_scales, hidden_size, num_experts, k);
-bool should_be_deterministic = mUseDeterminsiticHopperReduce || mK < 3 || getSMVersion() < 90;
+bool should_be_deterministic
+= mUseDeterminsiticHopperReduce || mK < 3 || getSMVersion() < 90 || getSMVersion() >= 120;
 if (should_be_deterministic && !mIsLongTest)
 {
 auto first_iter = getDataFromDevice(mFinalOutput, mTotalTokens * mHiddenSize);
@@ -1546,7 +1547,8 @@
 // Only need to init the inputs on the first iteration
 runMoEPermute(hidden_input, expected_experts, token_final_scales, hidden_size, num_experts, k,
 MOEParallelismConfig{tp_size, i, ep_size, j});
-bool should_be_deterministic = mUseDeterminsiticHopperReduce || mK < 3 || getSMVersion() < 90;
+bool should_be_deterministic
+= mUseDeterminsiticHopperReduce || mK < 3 || getSMVersion() < 90 || getSMVersion() >= 120;
 if (should_be_deterministic && !mIsLongTest)
 {
 auto first_iter = getDataFromDevice(mFinalOutput, mTotalTokens * mHiddenSize);
@@ -1560,7 +1562,8 @@
 else
 {
 runMoEPermute(MOEParallelismConfig{tp_size, i, ep_size, j});
-bool should_be_deterministic = mUseDeterminsiticHopperReduce || mK < 3 || getSMVersion() < 90;
+bool should_be_deterministic
+= mUseDeterminsiticHopperReduce || mK < 3 || getSMVersion() < 90 || getSMVersion() >= 120;
 if (should_be_deterministic && !mIsLongTest)
 {
 auto first_iter = getDataFromDevice(mFinalOutput, mTotalTokens * mHiddenSize);
@@ -1866,7 +1869,8 @@ TEST_F(MixtureOfExpertsProfilerTest, TestGeneratedProfilerDistribution)

 backend.prepare(num_tokens, workspace, mStream->get());

-auto getNext = backend.getWorkspacePointerGenerator(workspace, num_tokens, getSMVersion() >= 90);
+auto getNext = backend.getWorkspacePointerGenerator(
+workspace, num_tokens, getSMVersion() >= 90 && getSMVersion() < 120);
 auto const* expert_first_token_offset_size = reinterpret_cast<int64_t*>(getNext());
 auto const* source_to_dest_map = reinterpret_cast<int*>(getNext());
 auto const* dest_to_source_map = reinterpret_cast<int*>(getNext());
cpp/tests/unit_tests/kernels/ropeTest.cu

+1-1
Original file line numberDiff line numberDiff line change
@@ -498,7 +498,7 @@ protected:
498498
if constexpr (std::is_same_v<KVCacheType, __nv_fp4_e2m1>)
499499
{
500500
// Quant helper functions will not work on lower SM versions.
501-
return getSMVersion() < 100;
501+
return getSMVersion() < 100 || getSMVersion() >= 120;
502502
}
503503
#endif
504504
return false;
