|
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,6 @@
 #include "gemmList.h"
 #include "runner.h"
 #include "trtllmGenSrc/DevKernel.h"
-#include "trtllmGenSrc/MixtureOfExpertsInterface.h"
 #include "trtllmGenSrc/RoutingKernel.h"
 #include <iostream>
 
|
@@ -135,43 +134,6 @@ void Runner::run(void* hidden_state, void* hidden_state_scale, void* weight, voi
     TLLM_CHECK_WITH_INFO(selectedIndex.size() == 1, "Multiple kernels found for the given element type");
     auto const& kernelInfo = gemmList[*selectedIndex.begin()];
 
-    // TODO: remove this once we find the way to shuffle the weights offline
-    // if (kernelInfo.shuffledMatrixA || kernelInfo.useFusedAct)
-    // {
-    //     // Allocate temporary buffers for shuffled weights using vectors
-    //     auto numBitsPerElt = trtllm::gen::dtypeGetNumBits(mDtypeElt);
-    //     const size_t numBytesA = num_experts * hidden_size * intermediate_size * 2 * numBitsPerElt / /* bits */ 8;
-    //     std::vector<uint8_t> hShuffledA(numBytesA);
-
-    //     auto numBitsPerSf = trtllm::gen::dtypeGetNumBits(tg::dtypeBlockSfType(mDtypeElt));
-    //     const size_t numSfBytes = num_experts * hidden_size * intermediate_size / 16 * 2 * numBitsPerSf / /* bits */
-    //     8; std::vector<uint8_t> hShuffledASf(numSfBytes);
-
-    //     // Copy weights to host
-    //     cudaMemcpy(hShuffledA.data(), weight, numBytesA, cudaMemcpyDeviceToHost);
-    //     cudaMemcpy(hShuffledASf.data(), weight_scale, numSfBytes, cudaMemcpyDeviceToHost);
-
-    //     // Prepare and shuffle the weights
-    //     prepareBatchWeightsOnHost(hShuffledA.data(), // wIn
-    //         hShuffledASf.data(),                     // wSfIn
-    //         hShuffledA.data(),                       // wOut (in-place)
-    //         hShuffledASf.data(),                     // wSfOut (in-place)
-    //         mDtypeElt,                               // dtypeElt
-    //         intermediate_size * 2,                   // m (2x for gated activation)
-    //         hidden_size,                             // k
-    //         kernelInfo.epilogueTileM,                // epilogueTileM (from tileN)
-    //         num_experts,                             // numBatches
-    //         kernelInfo.shuffledMatrixA,              // useShuffleMatrix
-    //         kernelInfo.useFusedAct,                  // useFusedAct (for gated activation)
-    //         mDtypeElt == tg::Dtype::E2m1,            // useBlockScaling
-    //         16                                       // numEltsPerSf (for E2m1)
-    //     );
-
-    //     // Copy shuffled weights back to device
-    //     // cudaMemcpy(weight, hShuffledA.data(), numBytesA, cudaMemcpyHostToDevice);
-    //     // cudaMemcpy(weight_scale, hShuffledASf.data(), numSfBytes, cudaMemcpyHostToDevice);
-    // }
-
     gemmCommon::MyOptions options;
     options.mTopK = top_k;
     options.mBatchM = false;
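
The block removed above staged the gemm1 expert weights through the host on every call: copy to host, reorder with `prepareBatchWeightsOnHost`, copy back. Per the TODO, the intent is to perform this shuffle offline instead. Below is a minimal sketch of what that one-time preprocessing step could look like, reusing only the calls and arguments from the removed block; the function name `shuffleGemm1WeightsOffline` and the `int32_t` parameter types are hypothetical, and `prepareBatchWeightsOnHost`, `trtllm::gen::dtypeGetNumBits`, and `tg::dtypeBlockSfType` are assumed available via the headers this file already includes.

```cpp
// Sketch only: a one-time, offline version of the shuffle that the disabled
// block performed at runtime. Hypothetical names are marked in the lead-in.
#include <cstdint>
#include <vector>
#include <cuda_runtime.h>

void shuffleGemm1WeightsOffline(void* weight, void* weight_scale, tg::Dtype dtypeElt, int32_t num_experts,
    int32_t hidden_size, int32_t intermediate_size, int32_t epilogueTileM, bool shuffledMatrixA, bool useFusedAct)
{
    auto numBitsPerElt = trtllm::gen::dtypeGetNumBits(dtypeElt);
    // 2x intermediate_size: gemm1 holds both halves of the gated activation.
    size_t const numBytesA = size_t(num_experts) * hidden_size * intermediate_size * 2 * numBitsPerElt / /* bits */ 8;
    std::vector<uint8_t> hA(numBytesA);

    auto numBitsPerSf = trtllm::gen::dtypeGetNumBits(tg::dtypeBlockSfType(dtypeElt));
    size_t const numSfBytes = size_t(num_experts) * hidden_size * intermediate_size / 16 * 2 * numBitsPerSf / 8;
    std::vector<uint8_t> hSf(numSfBytes);

    // Stage the device-resident weights on the host.
    cudaMemcpy(hA.data(), weight, numBytesA, cudaMemcpyDeviceToHost);
    cudaMemcpy(hSf.data(), weight_scale, numSfBytes, cudaMemcpyDeviceToHost);

    // Reorder in place with the same arguments the disabled block used.
    prepareBatchWeightsOnHost(hA.data(), hSf.data(), hA.data(), hSf.data(), dtypeElt,
        intermediate_size * 2,       // m (2x for gated activation)
        hidden_size,                 // k
        epilogueTileM,               // epilogueTileM (from tileN)
        num_experts,                 // numBatches
        shuffledMatrixA,             // useShuffleMatrix
        useFusedAct,                 // useFusedAct (for gated activation)
        dtypeElt == tg::Dtype::E2m1, // useBlockScaling
        16);                         // numEltsPerSf (for E2m1)

    // Write the shuffled layout back once, before any inference call.
    cudaMemcpy(weight, hA.data(), numBytesA, cudaMemcpyHostToDevice);
    cudaMemcpy(weight_scale, hSf.data(), numSfBytes, cudaMemcpyHostToDevice);
}
```

Unlike the removed runtime path, this sketch writes the shuffled layout back to the device, so it must run exactly once, before the first inference call.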
|
@@ -239,43 +201,6 @@ void Runner::run(void* permuted_hidden_state, void* permuted_hidden_state_scale,
     TLLM_CHECK_WITH_INFO(selectedIndex.size() == 1, "Multiple kernels found for the given element and output types");
     auto const& kernelInfo = gemmList[*selectedIndex.begin()];
 
-    // TODO: remove this once we find the way to shuffle the weights offline
-    // if (kernelInfo.shuffledMatrixA)
-    // {
-    //     // Allocate temporary buffers for shuffled weights using vectors
-    //     auto numBitsPerElt = trtllm::gen::dtypeGetNumBits(mDtypeElt);
-    //     const size_t numBytesA = num_experts * hidden_size * intermediate_size * numBitsPerElt / /* bits */ 8;
-    //     std::vector<uint8_t> hShuffledA(numBytesA);
-
-    //     auto numBitsPerSf = trtllm::gen::dtypeGetNumBits(tg::dtypeBlockSfType(mDtypeElt));
-    //     const size_t numSfBytes = num_experts * hidden_size * intermediate_size / 16 * numBitsPerSf / /* bits */ 8;
-    //     std::vector<uint8_t> hShuffledASf(numSfBytes);
-
-    //     // Copy weights to host
-    //     cudaMemcpy(hShuffledA.data(), weight, numBytesA, cudaMemcpyDeviceToHost);
-    //     cudaMemcpy(hShuffledASf.data(), weight_scale, numSfBytes, cudaMemcpyDeviceToHost);
-
-    //     // Prepare and shuffle the weights
-    //     prepareBatchWeightsOnHost(hShuffledA.data(), // wIn
-    //         hShuffledASf.data(),                     // wSfIn
-    //         hShuffledA.data(),                       // wOut (in-place)
-    //         hShuffledASf.data(),                     // wSfOut (in-place)
-    //         mDtypeElt,                               // dtypeElt
-    //         hidden_size,                             // m
-    //         intermediate_size,                       // k
-    //         kernelInfo.epilogueTileM,                // epilogueTileM (from tileN)
-    //         num_experts,                             // numBatches
-    //         kernelInfo.shuffledMatrixA,              // useShuffleMatrix
-    //         false,                                   // useFusedAct (for gated activation)
-    //         mDtypeElt == tg::Dtype::E2m1,            // useBlockScaling
-    //         16                                       // numEltsPerSf (for E2m1)
-    //     );
-
-    //     // Copy shuffled weights back to device
-    //     // cudaMemcpy(weight, hShuffledA.data(), numBytesA, cudaMemcpyHostToDevice);
-    //     // cudaMemcpy(weight_scale, hShuffledASf.data(), numSfBytes, cudaMemcpyHostToDevice);
-    // }
-
     gemmCommon::MyOptions options;
     options.mTopK = top_k;
     options.mBatchM = false;
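
The block removed here is the same host-staging pattern, applied to the gemm2 weights. The only differences visible in the diff are the GEMM shape (`m = hidden_size`, `k = intermediate_size`, with no 2x factor because there is no gated activation on this GEMM) and `useFusedAct = false`. The offline-shuffle sketch above covers this case by substituting those values.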
|
@@ -373,16 +298,7 @@ void Runner::run(MoERunnerArgs const& args, MoEWorkspace const& workspace, cudaS
 
     setOpsData(args, workspace, convertSfData, activationData, finalizeData);
 
-    // Calling routing outside to properly allocate workspace
-    // moe::dev::routing::run(routingData, stream);
-
     void* hidden_states_scale_linear{args.hidden_states_scale};
-    // FIXME check that we receive r128c4 sf layout
-    // if (args.mDtypeElt == tg::Dtype::E2m1)
-    // {
-    //     hidden_states_scale_linear = workspace.hidden_states_scale_linear;
-    //     moe::dev::convertsf::run(convertSfData, stream);
-    // }
 
     PermuteGemm1::Runner permuteGemm1(args.mDtypeElt);
     permuteGemm1.run(args.hidden_states, hidden_states_scale_linear, args.gemm1_weights, args.gemm1_weights_scale,
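
Two behaviors leave this overload: routing now runs outside the runner (its comment says this is so the workspace can be allocated properly), and the E2m1 scale-factor conversion is dropped, with the FIXME noting the runner should verify it receives the r128c4 scale-factor layout. A hedged sketch of the corresponding caller-side sequence, assuming `routingData` and `convertSfData` are populated the same way `setOpsData` populates them; the `needsSfConversion` flag is hypothetical:

```cpp
// Hypothetical call site for the logic removed from the runner: launch
// routing first so its results can size the MoE workspace.
moe::dev::routing::run(routingData, stream);
// ... allocate MoEWorkspace from the routing results ...

// Mirror of the disabled conversion: only convert E2m1 scale factors into
// the workspace buffer if they are not already in the expected layout.
void* hidden_states_scale = args.hidden_states_scale;
if (args.mDtypeElt == tg::Dtype::E2m1 && needsSfConversion)
{
    hidden_states_scale = workspace.hidden_states_scale_linear;
    moe::dev::convertsf::run(convertSfData, stream);
}
```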
|
@@ -411,36 +327,6 @@ void Runner::run(MoERunnerArgs const& args, MoEWorkspace const& workspace, cudaS
 
     // Run finalize
     moe::dev::finalize::run(finalizeData, stream);
-
-    // std::vector<uint8_t> gemm1_output_fp8(64 * args.intermediate_size / 2);
-    // printf("array addr 0x%lx\n", &gemm1_output_fp8[0]);
-    // printf("local_num_experts addr 0x%lx\n", &args.local_num_experts);
-    // cudaMemcpy(gemm1_output_fp8.data(), workspace.gemm1_output, gemm1_output_fp8.size() * sizeof(uint8_t),
-    //     cudaMemcpyDeviceToHost);
-    // std::cout << "args.local_num_experts: " << args.local_num_experts << std::endl;
-    // std::cout << "gemm1 output (hex):" << std::endl;
-    // for (int offset = 0; offset < 8; offset++)
-    // {
-    //     int base = offset * 2048;
-    //     for (int i = 0; i < args.num_tokens; i++)
-    //     {
-    //         for (int j = 0; j < args.intermediate_size / 2; j += 16)
-    //         {
-    //             std::cout << "Token " << i << " [" << std::dec << base + j << "]: ";
-    //             for (int k = 0; k < 16 && (j + k) < args.intermediate_size / 2; k++)
-    //             {
-    //                 // std::cout << "offset: " << std::dec << base + i * args.intermediate_size / 2 + j + k <<
-    //                 // std::endl;
-    //                 std::cout << "0x" << std::hex << std::setw(2) << std::setfill('0')
-    //                           << static_cast<uint>(gemm1_output_fp8[base + i * args.intermediate_size / 2 + j + k])
-    //                           << " ";
-    //             }
-    //             std::cout << std::endl;
-    //         }
-    //         std::cout << std::endl;
-    //     }
-    // }
-    // std::cout << "args.local_num_experts: " << args.local_num_experts << std::endl;
 }
 } // namespace MoE
 
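
The last removed block was an ad-hoc hex dump of `workspace.gemm1_output` used for debugging. If that kind of inspection is still needed occasionally, a small self-contained helper keeps it out of the runner. A sketch using only standard host and CUDA runtime calls; the helper name `dumpDeviceBufferHex` is hypothetical:

```cpp
// Sketch: copy a device buffer to the host and print it as hex rows,
// replacing the hand-rolled loops in the removed debug block.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <vector>
#include <cuda_runtime.h>

void dumpDeviceBufferHex(void const* devPtr, size_t numBytes, size_t bytesPerRow = 16)
{
    std::vector<uint8_t> host(numBytes);
    cudaMemcpy(host.data(), devPtr, numBytes, cudaMemcpyDeviceToHost);
    for (size_t i = 0; i < numBytes; i += bytesPerRow)
    {
        // Row header: byte offset in decimal, then the bytes in hex.
        std::cout << "[" << std::dec << std::setw(6) << i << "]:";
        for (size_t j = i; j < std::min(numBytes, i + bytesPerRow); ++j)
        {
            std::cout << " 0x" << std::hex << std::setw(2) << std::setfill('0')
                      << static_cast<unsigned>(host[j]);
        }
        std::cout << std::setfill(' ') << std::endl;
    }
}
```

For example, `dumpDeviceBufferHex(workspace.gemm1_output, 64 * args.intermediate_size / 2);` matches the buffer and size the removed code inspected.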
|
|