add wip flex dispatcher

ForFishes · ForFishes · commit 1e2882c2ed80 · 2025-03-04T11:16:40.000+08:00
diff --git a/paddlenlp/transformers/fused_a2a.py b/paddlenlp/transformers/fused_a2a.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Portions of this code are from DeepSeek DeepEP project
+# Copyright (c) 2025 DeepSeek
+# Licensed under the MIT License - https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE
+
+try:
+    from paddle.distributed import deep_ep
+
+    HAVE_DEEP_EP = True
+except ImportError:
+    HAVE_DEEP_EP = False
+
+import paddle
+from paddle.distributed.communication.group import Group
+from paddle.autograd import PyLayer
+
+_buffer = None
+
+
+def get_hidden_bytes(x: paddle.Tensor) -> int:
+    """Return ``x.shape[1]`` multiplied by the per-element byte width of ``x``.
+
+    The element width comes from ``x.element_size()`` and is never taken to be
+    smaller than two bytes.
+    """
+    hidden_size = x.shape[1]
+    # Elements narrower than 2 bytes are counted as 2 bytes wide.
+    bytes_per_element = max(x.element_size(), 2)
+    return hidden_size * bytes_per_element
+
+
+def get_buffer(group: Group, hidden_bytes: int):
+    """Return the module-wide DeepEP buffer, allocating a new one when needed.
+
+    The buffer is cached in the module global ``_buffer``. A new
+    ``deep_ep.Buffer`` replaces it when none exists yet, when the cached one
+    belongs to a different ``group``, or when its NVL/RDMA byte capacity is
+    below the sizes requested here.
+
+    Args:
+        group (Group): Communication group passed to ``deep_ep.Buffer``.
+        hidden_bytes (int): Accepted for the planned size-hint computation;
+            not read yet because the sizes below are fixed.
+
+    Returns:
+        deep_ep.Buffer: The cached buffer, possibly created by this call.
+    """
+    global _buffer
+    # TODO(hongqing): replace the fixed sizes with the maximum of
+    # get_nvl_buffer_size_hint(hidden_bytes, group.world_size) and
+    # get_rdma_buffer_size_hint(hidden_bytes, group.world_size) taken over
+    # deep_ep.Buffer.get_dispatch_config(group.world_size) and
+    # deep_ep.Buffer.get_combine_config(group.world_size).
+    nvl_bytes_needed = int(1e9)
+    rdma_bytes_needed = 0
+
+    # NOTES: the adaptive routing configuration of the network **must be off**
+    cached = _buffer
+    if cached is not None and not (
+        cached.group != group
+        or cached.num_nvl_bytes < nvl_bytes_needed
+        or cached.num_rdma_bytes < rdma_bytes_needed
+    ):
+        # The existing buffer matches the group and is large enough.
+        return cached
+    _buffer = deep_ep.Buffer(group, nvl_bytes_needed, rdma_bytes_needed)
+    return _buffer
+
+
+class FusedDispatch(PyLayer):
+    """PyLayer that routes MoE tokens with DeepEP dispatch; backward runs combine."""
+
+    @staticmethod
+    def forward(ctx, x, token_indices, token_probs, num_experts, group, previous_event=None):
+        """Compute the dispatch layout, then dispatch ``x`` through the buffer.
+
+        Returns ``(recv_x, recv_token_indices, recv_token_probs,
+        tokens_per_expert, handle)``; ``group``, ``handle`` and the dispatch
+        event are stored on ``ctx`` for the backward pass.
+        """
+        # NOTE(review): the ``previous_event`` argument is never forwarded;
+        # both buffer calls below receive ``previous_event=None``.
+        comm_buffer = get_buffer(group, get_hidden_bytes(x))
+
+        # Step 1: layout computation that precedes the actual dispatch.
+        layout = comm_buffer.get_dispatch_layout(
+            token_indices,
+            num_experts,
+            previous_event=None,
+            async_finish=False,
+            allocate_on_comm_stream=False,
+        )
+        # The fifth element (the layout event) is not used afterwards.
+        per_rank, per_rdma_rank, per_expert, in_rank_mask, _layout_event = layout
+
+        # Step 2: MoE dispatch.
+        # NOTES: the CPU will wait for GPU's signal to arrive,
+        # so this is not compatible with CUDA graph
+        dispatched = comm_buffer.dispatch(
+            x,
+            topk_idx=token_indices,
+            topk_weights=token_probs.cast(paddle.float32),
+            num_tokens_per_rank=per_rank,
+            num_tokens_per_rdma_rank=per_rdma_rank,
+            is_token_in_rank=in_rank_mask,
+            num_tokens_per_expert=per_expert,
+            previous_event=None,
+            async_finish=False,
+            allocate_on_comm_stream=False,
+        )
+        recv_x, recv_token_indices, recv_token_probs, recv_counts, handle, event = dispatched
+
+        ctx.group = group
+        ctx.handle = handle
+        ctx.event = event
+        # Convert the per-expert receive counts into a tensor.
+        tokens_per_expert = paddle.to_tensor(recv_counts)
+        return (recv_x, recv_token_indices, recv_token_probs, tokens_per_expert, handle)
+
+    @staticmethod
+    def backward(
+        ctx, grad_output, grad_token_indices, grad_token_probs, grad_tokens_per_expert, grad_handle
+    ):
+        """Combine the incoming gradients using the handle saved in ``forward``."""
+        # NOTE(review): six values are returned, one per ``forward`` argument
+        # after ``ctx``; confirm this matches what PyLayer expects here.
+        comm_buffer = get_buffer(ctx.group, get_hidden_bytes(grad_output))
+        combined = comm_buffer.combine(
+            grad_output.contiguous(),
+            ctx.handle,
+            topk_weights=grad_token_probs.cast(paddle.float32),
+            previous_event=None,
+            async_finish=False,
+            allocate_on_comm_stream=False,
+        )
+        # Only the first two results are returned; the trailing event is dropped.
+        grad_x, grad_probs, _event = combined
+        return grad_x, None, grad_probs, None, None, None
+
+
+class FusedCombine(PyLayer):
+    """PyLayer that merges MoE outputs with DeepEP combine; backward runs dispatch."""
+
+    @staticmethod
+    def forward(ctx, x, group, handle, previous_event=None):
+        """Combine ``x`` through the buffer and return only the combined tensor."""
+        comm_buffer = get_buffer(group, get_hidden_bytes(x))
+        # NOTE(review): ``previous_event`` is only saved for ``backward``; combine gets None.
+        outputs = comm_buffer.combine(
+            x, handle=handle, async_finish=False, previous_event=None, allocate_on_comm_stream=False
+        )
+        combined_x, _weights, _event = outputs
+        ctx.handle, ctx.group, ctx.previous_event = handle, group, previous_event
+        return combined_x
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Dispatch ``grad_output`` with the saved handle to produce the input gradient."""
+        comm_buffer = get_buffer(ctx.group, get_hidden_bytes(grad_output))
+        results = comm_buffer.dispatch(
+            grad_output.contiguous(),
+            handle=ctx.handle,
+            previous_event=ctx.previous_event,
+            async_finish=False,
+            allocate_on_comm_stream=False,
+        )
+        grad_x, _, _, _, _, _event = results
+        # NOTE(review): the second value is a 6-tuple of None; confirm PyLayer expects it.
+        return grad_x, (None,) * 6
+
+
+if HAVE_DEEP_EP:
+
+    def fused_dispatch(x, token_indices, token_probs, num_experts, group, previous_event=None):
+        """Run the fused MoE dispatch through ``FusedDispatch``.
+
+        Args:
+            x: Input tensor [num_tokens, hidden_size]; made contiguous first.
+            token_indices: Token routing indices [num_tokens, topk]
+            token_probs: Token routing probabilities [num_tokens, topk]
+            num_experts: Number of experts
+            group: Process group
+            previous_event: Previous CUDA event
+
+        Returns:
+            Whatever ``FusedDispatch.apply`` returns.
+        """
+        return FusedDispatch.apply(
+            x.contiguous(), token_indices, token_probs, num_experts, group, previous_event
+        )
+
+    def fused_combine(x, group, handle, previous_event=None):
+        """Run the fused MoE combine through ``FusedCombine``.
+
+        Args:
+            x: Input tensor
+            group: Process group
+            handle: Communication handle
+            previous_event: Previous CUDA event
+
+        Returns:
+            Whatever ``FusedCombine.apply`` returns.
+        """
+        return FusedCombine.apply(x, group, handle, previous_event)
+
+else:
+    # HAVE_DEEP_EP is false: the fused entry points are disabled.
+    fused_dispatch = None
+    fused_combine = None
diff --git a/paddlenlp/transformers/moe_utils.py b/paddlenlp/transformers/moe_utils.py
@@ -0,0 +1,133 @@
+import paddle
+from typing import Optional
+
+def permute(
+    tokens,
+    routing_map,
+    num_out_tokens: Optional[int] = None,
+    fused: bool = False,
+    drop_and_pad: bool = False,
+):
+    """Permute the tokens based on the mask.
+
+    Tokens with the same designated expert will be grouped together.
+    The shape of mask is [tokens, num_experts], it indicates which experts were
+    selected by each token.
+
+    When drop_and_pad=True, the number of non-zeros in each column of routing_map
+    is expected to equal the expert capacity, so a stable descending argsort cut
+    to `capacity` entries yields the token indices selected for every expert.
+
+    Args:
+        tokens (paddle.Tensor): The input token tensor, [num_tokens, hidden].
+        routing_map (paddle.Tensor): The sparse token to expert mapping,
+            [num_tokens, num_experts].
+        num_out_tokens (int, optional): The number of output tokens. Used to derive
+            the expert capacity when drop_and_pad is set.
+        fused (bool, optional): Whether use the fused permute function.
+        drop_and_pad (bool, optional): Whether or not the token dispatcher uses
+            token-drop and pads the number of tokens to the expert capacity.
+
+    Returns:
+        tuple: ``(permuted_input, sorted_indices)``.
+
+    Raises:
+        ValueError: If ``fused`` is set but no fused implementation is available.
+    """
+    if fused:
+        # Optional module globals: a missing name must raise ValueError, not NameError.
+        fused_impl = globals().get("fused_permute")
+        if not globals().get("HAVE_TE", False) or fused_impl is None:
+            raise ValueError("fused_permute is not available. Please install TE >= 2.1.0.")
+        return fused_impl(tokens, routing_map, num_out_tokens)
+
+    num_tokens, _ = tokens.shape
+    num_experts = routing_map.shape[1]
+    if drop_and_pad and num_out_tokens is not None:
+        capacity = num_out_tokens // num_experts
+        assert routing_map.stop_gradient, "routing_map must not require gradients"
+        # mask [num_tokens, num_experts] -> [num_experts, num_tokens]; cast to int32,
+        # an integer dtype accepted by paddle.argsort.
+        expert_major = routing_map.cast(paddle.int32).T.contiguous()
+        # Stable descending argsort puts the non-zero entries first; keep the
+        # first `capacity` indices per expert and flatten to 1D.
+        order = paddle.argsort(expert_major, axis=-1, descending=True, stable=True)
+        sorted_indices = order[:, :capacity].contiguous().reshape([-1])
+    else:
+        # Dense [num_experts, num_tokens] grid of token ids, filtered by the mask.
+        expert_major = routing_map.cast(paddle.bool).T.contiguous()
+        token_ids = paddle.arange(num_tokens).unsqueeze(0).expand([num_experts, -1])
+        sorted_indices = token_ids.masked_select(expert_major)
+
+    return tokens.index_select(axis=0, index=sorted_indices), sorted_indices
+
+
+def unpermute(
+    permuted_tokens: paddle.Tensor,
+    sorted_indices: paddle.Tensor,
+    restore_shape: list,
+    probs: paddle.Tensor = None,
+    routing_map: paddle.Tensor = None,
+    fused: bool = False,
+    drop_and_pad: bool = False,
+):
+    """Restore the original order of tokens after permutation.
+
+    If probs are provided, the permuted tokens are first scaled by the prob of
+    their (token, expert) pair. Rows that share an index in sorted_indices are
+    summed into the same output row.
+
+    When drop_and_pad=True, sorted_indices is expected to hold num_experts *
+    capacity entries, each split of `capacity` containing the indices of the
+    tokens routed to one expert.
+
+    Args:
+        permuted_tokens (paddle.Tensor): The permuted token tensor.
+        sorted_indices (paddle.Tensor): The indices used to sort the tokens.
+        restore_shape (list): Shape [num_tokens, hidden] of the unpermuted tensor.
+        probs (paddle.Tensor, optional): The unpermuted probs tensor,
+            [num_tokens, num_experts].
+        routing_map (paddle.Tensor, optional): Token to expert mapping, shape
+            [num_tokens, num_experts]. Required when probs is given.
+        fused (bool, optional): Whether use the fused unpermute function.
+        drop_and_pad (bool, optional): Whether or not the token dispatcher uses
+            token-drop and pads the number of tokens to the expert capacity.
+
+    Returns:
+        paddle.Tensor: The tokens restored to their original order.
+
+    Raises:
+        ValueError: If ``fused`` is set but no fused implementation is available.
+    """
+    if fused:
+        # Optional module globals: a missing name must raise ValueError, not NameError.
+        fused_impl = globals().get("fused_unpermute")
+        if not globals().get("HAVE_TE", False) or fused_impl is None:
+            raise ValueError("fused_unpermute is not available. Please install TE >= 2.1.0.")
+        return fused_impl(permuted_tokens, sorted_indices, probs, restore_shape)
+
+    _, hidden = restore_shape
+    if probs is not None:
+        assert routing_map is not None, "Mask must be provided to permute the probs."
+        # [num_tokens, num_experts] -> [num_experts, num_tokens]
+        probs_t = probs.T.contiguous()
+        if drop_and_pad:
+            num_experts = routing_map.shape[1]
+            capacity = sorted_indices.shape[0] // num_experts
+            # Offset each expert's token indices by its row start in the
+            # flattened [num_experts * num_tokens] probs.
+            row_start = paddle.arange(num_experts).unsqueeze(-1) * probs.shape[0]
+            flat_index = (row_start + sorted_indices.reshape([num_experts, capacity])).reshape([-1])
+            permuted_probs = probs_t.reshape([-1]).index_select(axis=0, index=flat_index)
+        else:
+            mask_t = routing_map.cast(paddle.bool).T.contiguous()
+            permuted_probs = probs_t.masked_select(mask_t)
+        permuted_tokens = permuted_tokens * permuted_probs.unsqueeze(-1)
+
+    # Scatter-add the permuted rows back onto a zero tensor of the restored shape.
+    output_tokens = paddle.zeros(restore_shape, dtype=permuted_tokens.dtype)
+    scatter_index = sorted_indices.unsqueeze(1).expand([-1, hidden])
+    output_tokens.put_along_axis_(
+        axis=0, indices=scatter_index, values=permuted_tokens, reduce="add", include_self=True
+    )
+    return output_tokens
diff --git a/paddlenlp/transformers/token_dispatcher.py b/paddlenlp/transformers/token_dispatcher.py