
Commit 2cbbdc9

[Deepseek] Redesign multi-stream API
Signed-off-by: Hao Lu <[email protected]>
1 parent c51e90d commit 2cbbdc9

File tree

3 files changed (+125, -73 lines)

  tensorrt_llm/_torch/models/modeling_deepseekv3.py
  tensorrt_llm/_torch/modules/attention.py
  tensorrt_llm/_torch/modules/multi_stream_utils.py

tensorrt_llm/_torch/models/modeling_deepseekv3.py  (+16 -18)
@@ -23,10 +23,10 @@
 from ..modules.fused_moe import BaseMoeRoutingMethod, FusedMoE
 from ..modules.gated_mlp import GatedMLP
 from ..modules.linear import Linear
+from ..modules.multi_stream_utils import maybe_execute_in_parallel
 from ..modules.rms_norm import RMSNorm
 from ..modules.rotary_embedding import RotaryEmbedding
 from ..pipeline_interface import PipelineInterface
-from ..pyexecutor.cuda_graph_runner import is_graph_capturing
 from ..speculative import MTPEagleWorker, MTPSpecMetadata, MTPWorker
 from ..utils import (AuxStreamType, EventType, Fp4QuantizedTensor,
                      disable_fp4_allgather)
@@ -351,27 +351,25 @@ def forward(
     ) -> torch.Tensor:
         if min_latency_mode:
             assert not self.use_dp
-        # Only enable multi-stream for cuda graph since switch stream has extra host overhead
-        # This design is mainly for low latency use case. Need to improve for max throughput use case.
-        do_multi_stream = is_graph_capturing()
-        if do_multi_stream:
-            self.event_dict[EventType.Main].record()
-        shared_output = self.shared_experts(hidden_states)
-        if self.shared_output_scale is not None:
-            shared_output *= self.shared_output_scale
-        if do_multi_stream:
-            with torch.cuda.stream(self.aux_stream):
-                self.event_dict[EventType.Main].wait()
-                routed_output = self.compute_routed_output(
-                    hidden_states, hidden_states_fp4, all_rank_num_tokens,
-                    min_latency_mode)
-                self.event_dict[EventType.MoeShared].record()
-            self.event_dict[EventType.MoeShared].wait()
-        else:
+
+        def _compute_shared_output():
+            shared_output = self.shared_experts(hidden_states)
+            if self.shared_output_scale is not None:
+                shared_output *= self.shared_output_scale
+            return shared_output
+
+        def _compute_routed_output():
             routed_output = self.compute_routed_output(hidden_states,
                                                        hidden_states_fp4,
                                                        all_rank_num_tokens,
                                                        min_latency_mode)
+            return routed_output
+
+        shared_output, routed_output = maybe_execute_in_parallel(
+            _compute_shared_output, _compute_routed_output,
+            self.event_dict[EventType.Main],
+            self.event_dict[EventType.MoeShared], self.aux_stream)
+
         if min_latency_mode:
             return [shared_output, *routed_output]
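To make the refactor easier to follow, the sketch below spells out in plain PyTorch the fork/join event pattern that the old forward() wrote inline and that maybe_execute_in_parallel now hides behind a single call. The helper run_two_branches and its argument names are illustrative only; they are not part of the commit.

import torch

# Illustrative sketch (not part of this commit) of the fork/join pattern that
# maybe_execute_in_parallel encapsulates: record an event on the main stream,
# run one branch there, run the other branch on an auxiliary stream that waits
# on that event, then make the main stream wait for the auxiliary branch.
def run_two_branches(fn_main, fn_aux, aux_stream: torch.cuda.Stream):
    fork_event = torch.cuda.Event()   # main stream reached the fork point
    join_event = torch.cuda.Event()   # auxiliary branch has finished

    fork_event.record()               # recorded on the current (main) stream
    out_main = fn_main()              # e.g. the shared experts

    with torch.cuda.stream(aux_stream):
        fork_event.wait()             # aux stream starts only after the fork point
        out_aux = fn_aux()            # e.g. the routed experts
        join_event.record()
    join_event.wait()                 # main stream waits for the auxiliary branch

    return out_main, out_aux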

tensorrt_llm/_torch/modules/attention.py  (+62 -55)
@@ -15,6 +15,7 @@
 from ..model_config import ModelConfig
 from ..peft.lora.layer import LoraLayer, LoraModuleType
 from .linear import Linear, TensorParallelMode, WeightMode, WeightsLoadingConfig
+from .multi_stream_utils import maybe_execute_in_parallel
 from .rms_norm import RMSNorm
 from .rotary_embedding import RotaryEmbedding
@@ -517,19 +518,14 @@ def forward(
             q, compressed_kv, k_pe = self.fused_a(hidden_states).split(
                 [self.q_lora_rank, self.kv_lora_rank, self.qk_rope_head_dim],
                 -1)
-            do_multi_stream = torch.cuda.is_current_stream_capturing(
-            ) and self.aux_stream is not None
-            if do_multi_stream:
-                self.ln_events[0].record()
-                compressed_kv = self.kv_a_layernorm(compressed_kv)
-                with torch.cuda.stream(self.aux_stream):
-                    self.ln_events[0].wait()
-                    q = self.q_a_layernorm(q)
-                    self.ln_events[1].record()
-                self.ln_events[1].wait()
-            else:
-                q = self.q_a_layernorm(q)
-                compressed_kv = self.kv_a_layernorm(compressed_kv)
+
+            q, compressed_kv = maybe_execute_in_parallel(
+                lambda: self.q_a_layernorm(q),
+                lambda: self.kv_a_layernorm(compressed_kv),
+                self.ln_events[0],
+                self.ln_events[1],
+                self.aux_stream,
+            )

             q = self.q_b_proj(q)
@@ -641,54 +637,65 @@ def forward_generation(
         attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
         num_tokens = q.shape[0]
-        latent_cache = torch.concat([compressed_kv, k_pe], dim=-1)
-
         q_nope, q_pe = q.view([
             -1, self.num_heads, self.qk_nope_head_dim + self.qk_rope_head_dim
         ]).split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)

-        # fused_q contains 1) the result of the following bmm with shape [num_tokens, num_heads, kv_lora_rank]
-        # 2) rope(q_pe) with shape [num_tokens, num_heads, qk_rope_head_dim]. rope is applied inside AttentionOp
-        fused_q = torch.empty(
-            [
-                num_tokens, self.num_heads,
-                (self.kv_lora_rank + self.qk_rope_head_dim)
-            ],
-            dtype=q.dtype,
-            device=q.device,
+        def _run_bmm():
+            # fused_q contains 1) the result of the following bmm with shape [num_tokens, num_heads, kv_lora_rank]
+            # 2) rope(q_pe) with shape [num_tokens, num_heads, qk_rope_head_dim]. rope is applied inside AttentionOp
+            fused_q = torch.empty(
+                [
+                    num_tokens, self.num_heads,
+                    (self.kv_lora_rank + self.qk_rope_head_dim)
+                ],
+                dtype=q.dtype,
+                device=q.device,
+            )
+            if self.k_b_proj_trans.dtype == torch.bfloat16:
+                # [num_heads, num_tokens, self.qk_nope_head_dim]
+                q_nope_t = q_nope.transpose(0, 1)
+                # [num_heads, num_tokens, self.kv_lora_rank]
+                q_nope_out = fused_q[..., :self.kv_lora_rank].transpose(0, 1)
+
+                # [num_heads, num_tokens, self.qk_nope_head_dim] x [num_heads, kv_lora_rank, qk_nope_head_dim]
+                # -> [num_heads, num_tokens, kv_lora_rank] -> [num_tokens, num_heads, kv_lora_rank]
+                # The output of bmm is written directly into fused_q
+                torch.ops.trtllm.bmm_out(q_nope_t,
+                                         self.k_b_proj_trans.transpose(1, 2),
+                                         q_nope_out)
+            elif self.k_b_proj_trans.dtype == torch.float8_e4m3fn:
+                q_nope_fp8, q_nope_scales = torch.ops.trtllm.fp8_batched_quantize_1x128_permute102(
+                    q_nope)
+                # [num_heads, num_tokens, self.kv_lora_rank]
+                q_nope_out = fused_q[..., :self.kv_lora_rank].transpose(0, 1)
+
+                torch.ops.trtllm.fp8_block_scaling_bmm_out(
+                    q_nope_fp8, self.k_b_proj_trans, q_nope_scales,
+                    self.k_b_proj_trans_scale, q_nope_out)
+                q_nope_scales = None
+            else:
+                raise NotImplementedError(
+                    f"Missing bmm impl for dtype: {self.k_b_proj_trans.dtype}.")
+
+            fused_q = fused_q.view([
+                num_tokens,
+                self.num_heads * (self.kv_lora_rank + self.qk_rope_head_dim)
+            ])
+            return fused_q
+
+        def _concat_kv_cache():
+            latent_cache = torch.concat([compressed_kv, k_pe], dim=-1)
+            return latent_cache
+
+        fused_q, latent_cache = maybe_execute_in_parallel(
+            _run_bmm,
+            _concat_kv_cache,
+            self.ln_events[0],
+            self.ln_events[1],
+            self.aux_stream,
         )

-        if self.k_b_proj_trans.dtype == torch.bfloat16:
-            # [num_heads, num_tokens, self.qk_nope_head_dim]
-            q_nope = q_nope.transpose(0, 1)
-            # [num_heads, num_tokens, self.kv_lora_rank]
-            q_nope_out = fused_q[..., :self.kv_lora_rank].transpose(0, 1)
-
-            # [num_heads, num_tokens, self.qk_nope_head_dim] x [num_heads, kv_lora_rank, qk_nope_head_dim]
-            # -> [num_heads, num_tokens, kv_lora_rank] -> [num_tokens, num_heads, kv_lora_rank]
-            # The output of bmm is written directly into fused_q
-            torch.ops.trtllm.bmm_out(q_nope,
-                                     self.k_b_proj_trans.transpose(1, 2),
-                                     q_nope_out)
-        elif self.k_b_proj_trans.dtype == torch.float8_e4m3fn:
-            q_nope, q_nope_scales = torch.ops.trtllm.fp8_batched_quantize_1x128_permute102(
-                q_nope)
-            # [num_heads, num_tokens, self.kv_lora_rank]
-            q_nope_out = fused_q[..., :self.kv_lora_rank].transpose(0, 1)
-
-            torch.ops.trtllm.fp8_block_scaling_bmm_out(
-                q_nope, self.k_b_proj_trans, q_nope_scales,
-                self.k_b_proj_trans_scale, q_nope_out)
-            q_nope_scales = None
-        else:
-            raise NotImplementedError(
-                f"Missing bmm impl for dtype: {self.k_b_proj_trans.dtype}.")
-
-        fused_q = fused_q.view([
-            num_tokens,
-            self.num_heads * (self.kv_lora_rank + self.qk_rope_head_dim)
-        ])
-
         # out_scale = getattr(self.o_proj, "inv_input_scale", None)
         out_scale = None # Although we use FP8 MLA for generation phase, the output is still in BF16
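The _run_bmm closure above relies on an out-variant batched matmul that writes straight into a strided view of the preallocated fused_q buffer. The sketch below reproduces the same shapes with stock PyTorch ops; the sizes are made-up placeholders, and plain torch.bmm plus copy_ stands in for the custom torch.ops.trtllm.bmm_out kernel, which avoids the intermediate tensor by writing into the view directly.

import torch

# Placeholder sizes, chosen only to illustrate the shapes in the diff comments above.
num_tokens, num_heads = 4, 8
qk_nope_head_dim, kv_lora_rank, qk_rope_head_dim = 128, 512, 64

q_nope = torch.randn(num_tokens, num_heads, qk_nope_head_dim)
# Stand-in for self.k_b_proj_trans: [num_heads, kv_lora_rank, qk_nope_head_dim]
k_b_proj_trans = torch.randn(num_heads, kv_lora_rank, qk_nope_head_dim)

# Preallocate fused_q; rope(q_pe) later fills the trailing qk_rope_head_dim slice.
fused_q = torch.empty(num_tokens, num_heads, kv_lora_rank + qk_rope_head_dim)

# [num_heads, num_tokens, qk_nope_head_dim] x [num_heads, qk_nope_head_dim, kv_lora_rank]
# -> [num_heads, num_tokens, kv_lora_rank]
q_nope_out = torch.bmm(q_nope.transpose(0, 1), k_b_proj_trans.transpose(1, 2))

# Copy the result into the leading kv_lora_rank slice of fused_q; the fused
# trtllm.bmm_out op skips this copy by writing into the strided view in place.
fused_q[..., :kv_lora_rank].copy_(q_nope_out.transpose(0, 1))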

tensorrt_llm/_torch/modules/multi_stream_utils.py  (+47 -0, new file)

@@ -0,0 +1,47 @@
+from typing import Any, Callable, Optional
+
+import torch
+
+from ..pyexecutor.cuda_graph_runner import is_graph_capturing
+
+
+def maybe_execute_in_parallel(
+        fn0: Callable,
+        fn1: Callable,
+        event0: torch.cuda.Event,
+        event1: torch.cuda.Event,
+        aux_stream: Optional[torch.cuda.Stream] = None) -> tuple[Any, Any]:
+    """Utility function to run two functions in two cuda streams in parallel. Multi-stream is
+    only enabled when cuda graph is turned on because switching streams has extra host overhead.
+
+    This design is mainly for the low-latency use case. It needs to be improved for the
+    max-throughput use case.
+    For simplicity, fn0 and fn1 do not support inputs.
+
+    Args:
+        fn0 (Callable): callable for the default stream
+        fn1 (Callable): callable for the second stream, aux_stream
+        event0 (torch.cuda.Event): cuda event for fn0
+        event1 (torch.cuda.Event): cuda event for fn1
+        aux_stream (Optional[torch.cuda.Stream]): the second cuda stream for fn1.
+            Multi-stream is disabled when aux_stream is None.
+
+    Returns:
+        tuple[Any, Any]: the return values of fn0() and fn1()
+    """
+
+    do_multi_stream = is_graph_capturing() and aux_stream is not None
+
+    if do_multi_stream:
+        event0.record()
+        result0 = fn0()
+
+        with torch.cuda.stream(aux_stream):
+            event0.wait()
+            result1 = fn1()
+            event1.record()
+        event1.wait()
+    else:
+        result0 = fn0()
+        result1 = fn1()
+    return (result0, result1)
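A minimal usage sketch of the new helper follows, assuming the import path tensorrt_llm._torch.modules.multi_stream_utils and a CUDA device. The _ParallelBranches module and its layer names are hypothetical; they only illustrate the calling convention used by the two call sites above (one aux stream plus a pair of events owned by the module, and two zero-argument closures).

import torch
import torch.nn as nn

# Assumed import path for the module added by this commit.
from tensorrt_llm._torch.modules.multi_stream_utils import maybe_execute_in_parallel


class _ParallelBranches(nn.Module):
    """Hypothetical module mirroring how the MoE and MLA call sites use the helper."""

    def __init__(self, hidden_size: int, aux_stream: torch.cuda.Stream):
        super().__init__()
        self.branch_a = nn.Linear(hidden_size, hidden_size)
        self.branch_b = nn.Linear(hidden_size, hidden_size)
        self.aux_stream = aux_stream
        # One event per branch, analogous to EventType.Main / EventType.MoeShared.
        self.events = [torch.cuda.Event(), torch.cuda.Event()]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # The closures take no arguments and capture x, matching the helper's contract.
        out_a, out_b = maybe_execute_in_parallel(
            lambda: self.branch_a(x),  # default stream
            lambda: self.branch_b(x),  # aux_stream, only while capturing a CUDA graph
            self.events[0],
            self.events[1],
            self.aux_stream,
        )
        return out_a + out_b

Outside of CUDA-graph capture, or when aux_stream is None, the helper simply runs the two closures sequentially on the current stream, so the call site is identical in both modes.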
