
Commit f9959ac

ZhanruiSunCh authored and DomBrown committed
chore: bump version to 0.19.0 (NVIDIA#3598)
Signed-off-by: ZhanruiSunCh

Squashed changes included in this commit:

* test: add test cases for 0.19 release (NVIDIA#3608) — fix test name; add quickstart test for nemotron-ultra; add rcca multi-node test case for deepseek-v3; add rcca info. Signed-off-by: Ivy Zhang
* squash (NVIDIA#3642). Signed-off-by: Enwei Zhu
* fix: nvbugs/5187237: fix deterministic mode crash (NVIDIA#3448) — fix deterministic mode error (nvbugs/5187237, nvbugs/5112075); remove waive, then revert "remove waive" (reverts commit 0bf5486); revert ar fusion. Signed-off-by: Xiwen Yu
* update fp8 doc (NVIDIA#3647). Signed-off-by: taoli; Co-authored-by: taoli
* tests: change qa perf test to trtllm-bench (NVIDIA#3619). Signed-off-by: Ruodi; Co-authored-by: Larry
* fix: FP8 quantized lm_head (NvBug 5214229) (NVIDIA#3567). Signed-off-by: Enwei Zhu
* infra: Add PR approval protection for the release branch (NVIDIA#3634). Signed-off-by: Yanchao Lu
* fix: nvbugs/5231298: pytorch allreduce issue (NVIDIA#3673). Signed-off-by: Xiwen Yu
* Fix: nvbugs/5222698 variable not defined (NVIDIA#3630) — fix undefined variable; tidy code. Signed-off-by: Zongfei Jing
* test: sync waives.txt from main branch by disabling test_perf/gpt_350m-cppmanager case (NVIDIA#3685). Signed-off-by: nv-guomingz
* test: restore fp8 kv cache testing for L0 (NVIDIA#3671). Signed-off-by: nv-guomingz
* doc: Update DeepSeek perf docs (NVIDIA#3693) — update DeepSeek perf docs; apply suggestions from code review. Signed-off-by: Kaiyu Xie; Co-authored-by: Copilot
* tests: waive test_llm_multi_node (NVIDIA#3664). Signed-off-by: junq
* fix: update test_user_buffers_mm_add_prologue atol (NVIDIA#3711). Signed-off-by: Jin Li
* Fix: cherry-pick hmac encryption from main branch (NVIDIA#3635) — security fix cherry-picked from main; fix hmac in remote mpi session (NVIDIA#3649). Signed-off-by: Yibin Li, Yan Chunwei; Co-authored-by: Yan Chunwei
* Un-waive DS-V3-Lite tests. (NVIDIA#3621). Signed-off-by: Tracin
* fix: FP8 kv accuracy (NVIDIA#3675) — fix FP8 kv accuracy; update doc. Signed-off-by: Dylan Chen
* Fix script options for engines. (NVIDIA#3622). Signed-off-by: Tracin
* unwaive multi-node test (NVIDIA#3721). Signed-off-by: Superjomn
* chore: Split more tests out of gpt tests (NVIDIA#3524) (NVIDIA#3674). Signed-off-by: peaceh
* doc: add torch examples link into torch backend documentation (NVIDIA#3749). Signed-off-by: nv-guomingz; Co-authored-by: nv-guomingz
* test: Get Eagle tests working (NVIDIA#3593) (NVIDIA#3722). Signed-off-by: Balaram Buddharaju; Co-authored-by: brb-nv
* Waive L0 test (NVIDIA#3756). Signed-off-by: Yiqing Yan
* waive failed case in perf test, change default max_batch_size to 512 and write config.json to output log (NVIDIA#3656). Signed-off-by: Ruodi, Larry; Co-authored-by: Larry
* Update ds v3 parameters in stress test. (NVIDIA#3676)
* waive gemma on L20 (NVIDIA#3766). Signed-off-by: Ivy Zhang
* https://nvbugs/5141291: Fix convert.py script for Qwen model. (NVIDIA#3758) — include Qwen2VLDecoderLayer in the smooth_qwen2_model function. Signed-off-by: Yukun He
* fix: PP4 fixes and cleanup (NVIDIA#3688). Signed-off-by: Anurag Mukkara; Co-authored-by: Sharan Chetlur
* remove benchmark test list (NVIDIA#3643). Signed-off-by: Ivy Zhang
* skip disagg deepseek test if sm!=90 (NVIDIA#3720). Signed-off-by: Chuang Zhu
* test: skip failed cases on B200 (NVIDIA#3710) — add skip condition to tests; fix error. Signed-off-by: xinhe-nv
* test: [nvbug: 5234494] skip_pre_ada for fp8 cases (NVIDIA#3718) — skip_pre_ada for fp8 cases; update after rebase. Signed-off-by: Ivy Zhang
* add known issue to deepseek doc. (NVIDIA#3800). Signed-off-by: Fanrong Li
* Fix ModelOpt Mixtral AWQ OOM (NVIDIA#3714) (NVIDIA#3761). Signed-off-by: Barry Kang; Co-authored-by: Larry
* Waive L0 tests (NVIDIA#3826). Signed-off-by: Yiqing Yan
* fix: Reduce memory usage in fused moe op associated with AutoTuning and fix moe fallback issue (NVIDIA#3793) — reduce memory usage in the fused moe op associated with AutoTuning; replace the pre-defined bucket size strategy with a generating function based on tune_max_num_tokens; add free_memory logic for the workspace in the min_latency_mode fused moe path; fix the fused_moe fallback issue (NVIDIA#3652): min_latency_mode is only set to False during the warmup phase, so when it becomes True during inference all tactics fall back to the default one, causing a perf regression. Signed-off-by: Yukun He
* [doc] Better document for Draft-Target-Model (DTM) speculative decoding (NVIDIA#3797). Signed-off-by: wili-65535

Signed-off-by: Dom Brown
1 parent 1d51788 commit f9959ac

46 files changed: +474 -190 lines

.github/CODEOWNERS (+5)

@@ -0,0 +1,5 @@
+# This file defines code ownership rules for the repository.
+# The rule below requires that any PR to release/**/* branches must be approved by at least one member
+# of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
+# Without approval from a member of this team, PRs cannot be merged to release branches.
+* @NVIDIA/trt-llm-release-branch-approval

cpp/tensorrt_llm/common/attentionOp.cpp (+9 -4)

@@ -915,7 +915,8 @@ int AttentionOp::mlaGeneration(
     params.quant_scale_kv = generation_params.kv_scale_orig_quant;
     params.dequant_scale_q = generation_params.kv_scale_quant_orig;
     params.dequant_scale_kv = generation_params.kv_scale_quant_orig;
-    params.host_bmm1_scale = 1 / (sqrt((float) (mMLAParams.qk_nope_head_dim + mMLAParams.qk_rope_head_dim)));
+    params.host_bmm1_scale
+        = 1 / (mQScaling * sqrt((float) (mMLAParams.qk_nope_head_dim + mMLAParams.qk_rope_head_dim)));

     invokeMLARopeGeneration<T>(params, kv_cache_buffer, stream);
     sync_check_cuda_error(stream);

@@ -1001,9 +1002,13 @@ int AttentionOp::mlaGeneration(
     tllmRunnerParams.mSfStartTokenIdx = generation_params.start_token_idx_sf;

     // Scales for quantization
-    static constexpr int bmm1_scale_offset = 1;
-    tllmRunnerParams.outputScalePtr = reinterpret_cast<float const*>(params.bmm2_scale);
-    tllmRunnerParams.scaleSoftmaxLog2Ptr = reinterpret_cast<float const*>(params.bmm1_scale) + bmm1_scale_offset;
+    if (mFP8GenerationMLA)
+    {
+        static constexpr int bmm1_scale_offset = 1;
+        tllmRunnerParams.outputScalePtr = reinterpret_cast<float const*>(params.bmm2_scale);
+        tllmRunnerParams.scaleSoftmaxLog2Ptr
+            = reinterpret_cast<float const*>(params.bmm1_scale) + bmm1_scale_offset;
+    }

     TLLM_CHECK_WITH_INFO(mTllmGenFMHARunner.get(), "mTllmGenFMHARunner not initialized.");
     mTllmGenFMHARunner->run(tllmRunnerParams);
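For reference, the first hunk changes the host-side BMM1 (softmax) scale used by MLA generation from 1/sqrt(d) to 1/(q_scaling * sqrt(d)), where d is the sum of the nope and rope head dimensions; the second hunk only wires the quantization scale pointers when FP8 MLA generation is enabled. Below is a minimal sketch of the corrected scale computation; the free-standing function and the example dimensions are illustrative, not the class's actual API.

```cuda
#include <cmath>

// BMM1 / softmax scale for MLA generation: fold the model's q_scaling
// factor into the usual 1/sqrt(head_dim) attention scale.
inline float mlaHostBmm1Scale(float q_scaling, int qk_nope_head_dim, int qk_rope_head_dim)
{
    float const head_dim = static_cast<float>(qk_nope_head_dim + qk_rope_head_dim);
    return 1.0f / (q_scaling * std::sqrt(head_dim));
}

// Example with illustrative DeepSeek-style MLA dimensions:
// mlaHostBmm1Scale(1.0f, 128, 64) gives 1/sqrt(192), matching the old
// behavior only when q_scaling == 1; any other q_scaling now changes the scale.
```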

cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.cu (+2 -1)

@@ -28,7 +28,8 @@ __global__ void lamport_initialize_kernel(float* ptr, int size)

 void lamport_initialize(void* ptr, int bytes, cudaStream_t stream)
 {
-    lamport_initialize_kernel<<<bytes / 128, 128, 0, stream>>>(reinterpret_cast<float*>(ptr), bytes / sizeof(float));
+    int grid_size = (bytes + 127) / 128;
+    lamport_initialize_kernel<<<grid_size, 128, 0, stream>>>(reinterpret_cast<float*>(ptr), bytes / sizeof(float));
 }

 Workspace::Workspace(int rank, int tp_size, int max_token_num, int hidden_dim,
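The one-line fix here replaces truncating division with a round-up when sizing the launch grid, so byte counts that are not a multiple of 128 still get a block for the tail, and counts below 128 no longer produce a zero-block launch. A minimal sketch of the arithmetic; the ceilDiv helper name is illustrative:

```cuda
// Ceiling division: number of blocks of `block` units needed to cover `n` units.
inline int ceilDiv(int n, int block)
{
    return (n + block - 1) / block;
}

// bytes / 128 truncates:       200 bytes -> 1 block, 64 bytes -> 0 blocks.
// ceilDiv(bytes, 128) rounds up: 200 bytes -> 2 blocks, 64 bytes -> 1 block.
```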

cpp/tensorrt_llm/kernels/customAllReduceKernels.cu (+4)

@@ -1989,6 +1989,10 @@ void residualRmsNorm(
 void lamportInitialize(void* buffer, size_t size, nvinfer1::DataType dataType, cudaStream_t stream)
 {
     sync_check_cuda_error(stream);
+    if (size == 0)
+    {
+        return;
+    }
     switch (dataType)
     {
     case nvinfer1::DataType::kFLOAT:
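Even with a rounded-up grid, a zero-sized buffer is a separate edge case: a grid dimension of 0 is an invalid CUDA launch configuration, which is presumably why lamportInitialize now returns early when size == 0 before dispatching on the data type. A minimal sketch of that guard pattern around a generic element-wise kernel (kernel and function names are illustrative, not the repo's actual kernels):

```cuda
#include <cuda_runtime.h>

__global__ void fillKernel(float* ptr, size_t n, float value)
{
    size_t const idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < n) // bounds check covers the last, partially used block
    {
        ptr[idx] = value;
    }
}

void fill(float* ptr, size_t n, float value, cudaStream_t stream)
{
    if (n == 0)
    {
        return; // skip the launch: a 0-block grid is an invalid configuration
    }
    int const blockSize = 128;
    int const gridSize = static_cast<int>((n + blockSize - 1) / blockSize);
    fillKernel<<<gridSize, blockSize, 0, stream>>>(ptr, n, value);
}
```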

cpp/tensorrt_llm/thop/moeOp.cpp (+17 -14)

@@ -163,17 +163,12 @@ class FusedMoeRunner : public torch::CustomClassHolder
         torch::optional<c10::ArrayRef<int64_t>> profile_ids)
     {
         // Free the profile workspace to save memory
-        if (mProfileWorkspace != nullptr)
-        {
-            auto const cu_free_status = cudaFree(mProfileWorkspace);
-            TORCH_CHECK(
-                cu_free_status == cudaSuccess, "Can't free profile workspace for MoE GEMM profile before runMoe.");
-            mProfileWorkspace = nullptr;
-        }
+        freeProfileWorkspace();

         std::lock_guard<std::mutex> lock(mMutex);

         TORCH_CHECK(cluster_size == 1 && cluster_rank == 0, "smart_router is supported in min_latency mode");
+
         CHECK_INPUT(input, mActivationDtype)
         CHECK_INPUT(token_selected_experts, at::ScalarType::Int)
         if (token_final_scales)

@@ -251,6 +246,9 @@ class FusedMoeRunner : public torch::CustomClassHolder
     {
         std::lock_guard<std::mutex> lock(mMutex);

+        // Free the profile workspace to save memory
+        freeProfileWorkspace();
+
         CHECK_INPUT(input, mActivationDtype)
         CHECK_INPUT(token_selected_experts, at::ScalarType::Int)
         if (token_final_scales)

@@ -381,13 +379,7 @@ class FusedMoeRunner : public torch::CustomClassHolder
             hidden_size, inter_size, GROUP_SIZE, tensorrt_llm::ActivationType::Swiglu, USE_BIAS, USE_LORA,
             min_latency_mode, parallelism_config);

-        if (mProfileWorkspace != nullptr)
-        {
-            auto const cu_free_status = cudaFree(mProfileWorkspace);
-            TORCH_CHECK(cu_free_status == cudaSuccess,
-                "Can't free profile workspace for MoE GEMM profile during memory reallocation.");
-            mProfileWorkspace = nullptr;
-        }
+        freeProfileWorkspace();
         size_t profile_workspace_size = mProfiler->getWorkspaceSize(num_rows);
         auto const cu_malloc_status = cudaMalloc(&mProfileWorkspace, profile_workspace_size);
         TORCH_CHECK(cu_malloc_status == cudaSuccess, "Can't allocate profile workspace for MoE GEMM profile.");

@@ -422,6 +414,17 @@ class FusedMoeRunner : public torch::CustomClassHolder
     using Profile = tensorrt_llm::cutlass_extensions::CutlassGemmConfig;
     std::vector<Profile> mAllProfiles;

+    void freeProfileWorkspace()
+    {
+        if (mProfileWorkspace != nullptr)
+        {
+            auto const cu_free_status = cudaFree(mProfileWorkspace);
+            TORCH_CHECK(cu_free_status == cudaSuccess,
+                "Can't free profile workspace for MoE GEMM profile during memory reallocation.");
+            mProfileWorkspace = nullptr;
+        }
+    }
+
     void setRunnerProfiles(torch::optional<c10::ArrayRef<int64_t>> profile_ids)
     {
         if (mUseFp8BlockScaling)
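The refactor in this file hoists the repeated "check, cudaFree, TORCH_CHECK, reset to nullptr" sequence into a single freeProfileWorkspace() helper called from the three sites. An alternative way to express the same "free exactly once, from several call sites" intent is an RAII holder that frees in its destructor; the sketch below is only that alternative, not what the diff does, and it silently ignores cudaFree/cudaMalloc errors instead of surfacing them through TORCH_CHECK:

```cuda
#include <cuda_runtime.h>
#include <cstddef>

// Owns a cudaMalloc'd profiling workspace and guarantees it is freed at most once.
class ProfileWorkspace
{
public:
    ProfileWorkspace() = default;
    ProfileWorkspace(ProfileWorkspace const&) = delete;
    ProfileWorkspace& operator=(ProfileWorkspace const&) = delete;

    ~ProfileWorkspace()
    {
        release();
    }

    // Free any current allocation, then optionally allocate a new one.
    void reset(size_t newSize = 0)
    {
        release();
        if (newSize > 0)
        {
            cudaMalloc(&mPtr, newSize); // error handling omitted in this sketch
        }
    }

    void* get() const
    {
        return mPtr;
    }

private:
    void release()
    {
        if (mPtr != nullptr)
        {
            cudaFree(mPtr); // error handling omitted in this sketch
            mPtr = nullptr;
        }
    }

    void* mPtr = nullptr;
};
```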

docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md (+49)

@@ -4,6 +4,33 @@ NVIDIA has announced world-record DeepSeek-R1 inference performance at NVIDIA GT

 In this blog, we share the configurations and procedures about how to reproduce the number on both B200 and H200 with PyTorch workflow.

+## Table of Contents
+
+- [How to get best performance on DeepSeek-R1 in TensorRT-LLM](#how-to-get-best-performance-on-deepseek-r1-in-tensorrt-llm)
+  - [Table of Contents](#table-of-contents)
+  - [Prerequisites: Install TensorRT-LLM and download models](#prerequisites-install-tensorrt-llm-and-download-models)
+    - [1. Download TensorRT-LLM](#1-download-tensorrt-llm)
+    - [2. Download the DeepSeek R1 models](#2-download-the-deepseek-r1-models)
+    - [3. Build and run TensorRT-LLM container](#3-build-and-run-tensorrt-llm-container)
+    - [4. Compile and Install TensorRT-LLM](#4-compile-and-install-tensorrt-llm)
+    - [5. Optional: Tune GPU clocks](#5-optional-tune-gpu-clocks)
+    - [6. Dataset preparation](#6-dataset-preparation)
+  - [Reproducing steps](#reproducing-steps)
+    - [B200 min-latency](#b200-min-latency)
+      - [Expected Results](#expected-results)
+    - [B200 max-throughput](#b200-max-throughput)
+      - [Benchmark](#benchmark)
+      - [Expected Result Format](#expected-result-format)
+    - [H200 min-latency](#h200-min-latency)
+      - [Expected Result Format](#expected-result-format-1)
+    - [H200 max-throughput](#h200-max-throughput)
+      - [Expected Result Format](#expected-result-format-2)
+  - [Exploring more ISL/OSL combinations](#exploring-more-islosl-combinations)
+    - [WIP: Enable more features by default](#wip-enable-more-features-by-default)
+    - [WIP: Chunked context support on DeepSeek models](#wip-chunked-context-support-on-deepseek-models)
+    - [Out of memory issues](#out-of-memory-issues)
+
 ## Prerequisites: Install TensorRT-LLM and download models

 This section can be skipped if you already have TensorRT-LLM installed and have already downloaded the DeepSeek R1 model checkpoint.

@@ -324,3 +351,25 @@ Total Token Throughput (tokens/sec): 15707.0888
 Total Latency (ms): 993548.8470
 Average request latency (ms): 197768.0434
 ```
+
+## Exploring more ISL/OSL combinations
+
+To benchmark TensorRT-LLM on DeepSeek models with more ISL/OSL combinations, you can use `prepare_dataset.py` to generate the dataset and use commands similar to those in the previous section. TensorRT-LLM is working on enhancements to make the benchmarking process smoother.
+
+### WIP: Enable more features by default
+
+Currently, some features need to be enabled through a user-defined file `extra-llm-api-config.yml`, such as CUDA graph, overlap scheduler, and attention DP. We're working on enabling those features by default so that users get good out-of-the-box performance on DeepSeek models.
+
+Note that `max_batch_size` and `max_num_tokens` can easily affect performance. Their default values are carefully chosen and should deliver good performance in most cases; however, you may still need to tune them for peak performance.
+
+Generally, make sure that `max_batch_size` is not so low that it bottlenecks throughput, and that `max_num_tokens` is large enough to cover the maximum input sequence length of the samples in the dataset, as mentioned in the section "WIP: Chunked context support on DeepSeek models" below.
+
+For more details on `max_batch_size` and `max_num_tokens`, refer to [Tuning Max Batch Size and Max Num Tokens](../performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.md).
+
+### WIP: Chunked context support on DeepSeek models
+
+The TensorRT-LLM team is actively working on chunked context support for DeepSeek models. Until that feature lands, there is a limitation that `max_num_tokens` has to be at least the maximum input sequence length of the samples in the dataset.
+For more details on `max_num_tokens`, refer to [Tuning Max Batch Size and Max Num Tokens](../performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.md).
+
+### Out of memory issues
+
+It is possible to hit OOM issues in some cases. Consider reducing `kv_cache_free_gpu_mem_fraction` to a smaller value as a workaround. We're investigating and working to address the problem.

docs/source/torch.md (+2 -1)

@@ -41,6 +41,7 @@ scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8 --ex

 - [Architecture Overview](./torch/arch_overview.md)
 - [Adding a New Model](./torch/adding_new_model.md)
+- [Examples](../../examples/pytorch/README.md)

 ## Key Components

@@ -50,4 +51,4 @@

 ## Known Issues

-- The PyTorch workflow on SBSA is incompatible with bare metal environments like Ubuntu 24.04. Please use the [PyTorch NGC Container (https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) for optimal support on SBSA platforms.
+- The PyTorch workflow on SBSA is incompatible with bare metal environments like Ubuntu 24.04. Please use the [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) for optimal support on SBSA platforms.
