Commit 10c024b
Implement split_and_pack, as a building block for sort-shuffling
Implementing a distributed sort has the following components (more complex schemes may exist, but I think this should be OK):

    LocalSort -> SelectEquidistant/SplitPointValues -> BroadcastToAll

(Split points are selected based on the number of partitions.) The BroadcastToAll result then contains:

1. The original data by which we sorted (only the selected split-point rows are needed).
2. Two additional columns with (partition_id, local_row_id) to establish a global order.

The BroadcastToAll result can then be sorted again and used to figure out how to move each split around. For that step, one needs something like `split_and_pack()` to replace the `partition_and_pack()` currently used for hash partitioning.

With the above, one should be able to implement a ShuffleForSort(LocalSort, BroadcastToAll); the final result is then another local sort after shuffling. (The needed broadcast is small, so I assume there is no need to do it with rapidsmpf initially.)

This approach guarantees that the result partitions are at most a factor of two less balanced than the input (I can look up the reference). The sketch below illustrates the split-point selection step.

Signed-off-by: Sebastian Berg <[email protected]>
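As a rough illustration of the SelectEquidistant/SplitPointValues step described above, here is a minimal Python sketch (not part of this commit; all names are hypothetical, and plain lists stand in for locally sorted cuDF partitions):

def select_equidistant_split_candidates(local_sorted, num_partitions, partition_id):
    # Pick one candidate per internal split point (num_partitions - 1 in
    # total), spaced equidistantly through the locally sorted rows. Each
    # candidate is tagged with (partition_id, local_row_id) so ties can be
    # broken and a global order established after the broadcast.
    n = len(local_sorted)
    candidates = []
    for k in range(1, num_partitions):
        row = min(k * n // num_partitions, n - 1)
        candidates.append((local_sorted[row], partition_id, row))
    return candidates

# Example: 7 locally sorted rows, 3 result partitions -> 2 candidates.
print(select_equidistant_split_candidates([1, 3, 4, 8, 9, 12, 15], 3, 0))
# [(4, 0, 2), (9, 0, 4)]

Each rank would broadcast its candidates to all ranks; sorting the union of the candidates then yields the global split values that the shuffle step routes by.
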
1 parent 0317f38 commit 10c024b

14 files changed, +214 -20 lines

cpp/include/rapidsmpf/shuffler/partition.hpp (+26)
@@ -85,6 +85,32 @@ partition_and_split(
     rmm::device_async_resource_ref mr
 );
 
+
+/**
+ * @brief Splits rows from the input table into multiple packed (serialized) tables.
+ *
+ * @param table The table to split and pack into partitions.
+ * @param splits The split points, equivalent to cudf::split(), i.e. one less than
+ * the number of result partitions.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned table's device memory.
+ *
+ * @return A map of partition IDs and their packed tables.
+ *
+ * @throw std::invalid_argument if the input table is empty.
+ *
+ * @see unpack_and_concat
+ * @see cudf::split
+ * @see partition_and_pack
+ */
+[[nodiscard]] std::unordered_map<PartID, PackedData> split_and_pack(
+    cudf::table_view const& table,
+    std::vector<cudf::size_type> const& splits,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr
+);
+
+
 /**
  * @brief Unpack (deserialize) input tables and concatenate them.
  *

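To make the cudf::split()-style `splits` convention in the new doc comment concrete, here is a tiny illustrative snippet (Python for brevity; the values are made up): a table with 10 rows and splits = [3, 7] yields len(splits) + 1 = 3 partitions covering the half-open row ranges below.

num_rows = 10
splits = [3, 7]
bounds = [0, *splits, num_rows]
# Consecutive bounds delimit the partitions' row ranges.
partitions = list(zip(bounds[:-1], bounds[1:]))
assert partitions == [(0, 3), (3, 7), (7, 10)]
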
cpp/src/shuffler/partition.cpp (+21)
@@ -79,6 +79,27 @@ std::unordered_map<PartID, PackedData> partition_and_pack(
     return ret;
 }
 
+std::unordered_map<PartID, PackedData> split_and_pack(
+    cudf::table_view const& table,
+    std::vector<cudf::size_type> const& splits,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr
+) {
+    RAPIDSMPF_NVTX_FUNC_RANGE();
+    // Can't split empty tables (0 is out of bounds), so raise.
+    RAPIDSMPF_EXPECTS(
+        table.num_rows() > 0, "the input table cannot be empty", std::invalid_argument
+    );
+
+    auto tables = cudf::split(table, splits, stream);
+    std::unordered_map<PartID, PackedData> ret;
+    for (PartID i = 0; static_cast<std::size_t>(i) < tables.size(); ++i) {
+        auto pack = cudf::detail::pack(tables[i], stream, mr);
+        ret.emplace(i, PackedData(std::move(pack.metadata), std::move(pack.gpu_data)));
+    }
+    return ret;
+}
+
 std::unique_ptr<cudf::table> unpack_and_concat(
     std::vector<PackedData>&& partitions,
     rmm::cuda_stream_view stream,

cpp/tests/test_shuffler.cpp (+30)
@@ -67,6 +67,36 @@ TEST_P(NumOfPartitions, partition_and_pack) {
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(sort_table(expect), sort_table(result));
 }
 
+TEST_P(NumOfPartitions, split_and_pack) {
+    int const num_partitions = std::get<0>(GetParam());
+    int const num_rows = std::get<1>(GetParam());
+    std::int64_t const seed = 42;
+    auto stream = cudf::get_default_stream();
+    auto mr = cudf::get_current_device_resource_ref();
+
+    cudf::table expect = random_table_with_index(seed, num_rows, 0, 10);
+
+    std::vector<cudf::size_type> splits;
+    for (int i = 1; i < num_partitions; ++i) {
+        splits.emplace_back(i * num_rows / num_partitions);
+    }
+
+    auto chunks = rapidsmpf::shuffler::split_and_pack(expect, splits, stream, mr);
+
+    // Convert to a vector (restoring the original order).
+    std::vector<rapidsmpf::PackedData> chunks_vector;
+    for (int i = 0; i < num_partitions; ++i) {
+        chunks_vector.emplace_back(std::move(chunks.at(i)));
+    }
+    EXPECT_EQ(chunks_vector.size(), num_partitions);
+
+    auto result =
+        rapidsmpf::shuffler::unpack_and_concat(std::move(chunks_vector), stream, mr);
+
+    // Compare the input table with the result.
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expect, *result);
+}
+
 TEST(MetadataMessage, round_trip) {
     auto metadata = iota_vector<uint8_t>(100);
 
python/rapidsmpf/rapidsmpf/buffer/packed_data.pyx (-1)
@@ -3,7 +3,6 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-
 from rapidsmpf.buffer.packed_data cimport cpp_PackedData
 
 
python/rapidsmpf/rapidsmpf/buffer/resource.pxd (+3, -4)
@@ -8,15 +8,14 @@ from libcpp cimport bool
 from libcpp.memory cimport shared_ptr
 from libcpp.optional cimport optional
 from libcpp.unordered_map cimport unordered_map
+from rapidsmpf.buffer.buffer cimport MemoryType
+from rapidsmpf.buffer.spill_manager cimport SpillManager, cpp_SpillManager
+from rapidsmpf.utils.time cimport cpp_Duration
 from rmm.librmm.memory_resource cimport (device_memory_resource,
                                          statistics_resource_adaptor)
 from rmm.pylibrmm.memory_resource cimport (DeviceMemoryResource,
                                            StatisticsResourceAdaptor)
 
-from rapidsmpf.buffer.buffer cimport MemoryType
-from rapidsmpf.buffer.spill_manager cimport SpillManager, cpp_SpillManager
-from rapidsmpf.utils.time cimport cpp_Duration
-
 
 cdef extern from "<functional>" nogil:
     cdef cppclass cpp_MemoryAvailable "std::function<std::int64_t()>":

python/rapidsmpf/rapidsmpf/buffer/spill_manager.pyx (-1)
@@ -3,7 +3,6 @@
 
 from cython.operator cimport dereference as deref
 from libc.stddef cimport size_t
-
 from rapidsmpf.buffer.resource cimport BufferResource
 from rapidsmpf.exception_handling cimport (CppExcept,
                                            throw_py_as_cpp_exception,

python/rapidsmpf/rapidsmpf/communicator/mpi.pyx (-1)
@@ -4,7 +4,6 @@
 from libcpp.memory cimport make_shared
 from mpi4py cimport libmpi
 from mpi4py.MPI cimport Intracomm
-
 from rapidsmpf.communicator.communicator cimport Communicator
 
 
python/rapidsmpf/rapidsmpf/communicator/ucxx.pyx (+1, -2)
@@ -10,10 +10,9 @@ from libcpp.optional cimport nullopt, nullopt_t
 from libcpp.pair cimport pair
 from libcpp.string cimport string
 from libcpp.utility cimport move
-from ucxx._lib.libucxx cimport Address, UCXAddress, UCXWorker, Worker
-
 from rapidsmpf.communicator.communicator cimport *
 from rapidsmpf.communicator.ucxx cimport *
+from ucxx._lib.libucxx cimport Address, UCXAddress, UCXWorker, Worker
 
 
 cdef extern from "<variant>" namespace "std" nogil:

python/rapidsmpf/rapidsmpf/progress_thread.pxd (-1)
@@ -7,7 +7,6 @@ from libc.stdint cimport uint64_t, uintptr_t
 from libcpp.functional cimport function
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.utility cimport move
-
 from rapidsmpf.communicator.communicator cimport cpp_Logger
 from rapidsmpf.statistics cimport cpp_Statistics
 
python/rapidsmpf/rapidsmpf/progress_thread.pyx (-1)
@@ -4,7 +4,6 @@
 
 from cython.operator cimport dereference as deref
 from libcpp.memory cimport make_shared
-
 from rapidsmpf.communicator.communicator cimport Communicator
 from rapidsmpf.statistics cimport Statistics
 
python/rapidsmpf/rapidsmpf/shuffler.pxd (+3, -4)
@@ -8,15 +8,14 @@ from libcpp.string cimport string
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
 from pylibcudf.table cimport Table
-from rmm.librmm.cuda_stream_view cimport cuda_stream_view
-from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource
-from rmm.pylibrmm.stream cimport Stream
-
 from rapidsmpf.buffer.packed_data cimport cpp_PackedData
 from rapidsmpf.buffer.resource cimport BufferResource, cpp_BufferResource
 from rapidsmpf.communicator.communicator cimport Communicator, cpp_Communicator
 from rapidsmpf.progress_thread cimport cpp_ProgressThread
 from rapidsmpf.statistics cimport cpp_Statistics
+from rmm.librmm.cuda_stream_view cimport cuda_stream_view
+from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource
+from rmm.pylibrmm.stream cimport Stream
 
 
 cpdef dict partition_and_pack(

python/rapidsmpf/rapidsmpf/shuffler.pyi (+6)
@@ -21,6 +21,12 @@ def partition_and_pack(
     stream: Stream,
     device_mr: DeviceMemoryResource,
 ) -> dict[int, PackedData]: ...
+def split_and_pack(
+    table: Table,
+    splits: Iterable[int],
+    stream: Stream,
+    device_mr: DeviceMemoryResource,
+) -> dict[int, PackedData]: ...
 def unpack_and_concat(
     partitions: Iterable[PackedData],
     stream: Stream,

python/rapidsmpf/rapidsmpf/shuffler.pyx (+73, -4)
@@ -13,15 +13,14 @@ from pylibcudf.libcudf.table.table cimport table as cpp_table
 from pylibcudf.libcudf.table.table_view cimport table_view
 from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.table cimport Table
+from rapidsmpf.buffer.packed_data cimport PackedData, cpp_PackedData
+from rapidsmpf.progress_thread cimport ProgressThread
+from rapidsmpf.statistics cimport Statistics
 from rmm.librmm.cuda_stream_view cimport cuda_stream_view
 from rmm.librmm.memory_resource cimport device_memory_resource
 from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource
 from rmm.pylibrmm.stream cimport Stream
 
-from rapidsmpf.buffer.packed_data cimport PackedData, cpp_PackedData
-from rapidsmpf.progress_thread cimport ProgressThread
-from rapidsmpf.statistics cimport Statistics
-
 
 cdef extern from "<rapidsmpf/shuffler/partition.hpp>" nogil:
     int cpp_HASH_MURMUR3"cudf::hash_id::HASH_MURMUR3"

@@ -38,6 +37,14 @@ cdef extern from "<rapidsmpf/shuffler/partition.hpp>" nogil:
         device_memory_resource *mr,
     ) except +
 
+    cdef unordered_map[uint32_t, cpp_PackedData] cpp_split_and_pack \
+        "rapidsmpf::shuffler::split_and_pack"(
+            const table_view& table,
+            const vector[size_type] &splits,
+            cuda_stream_view stream,
+            device_memory_resource *mr,
+    ) except +
+
 
 cpdef dict partition_and_pack(
     Table table,

@@ -76,6 +83,7 @@ cpdef dict partition_and_pack(
     rapidsmpf.shuffler.unpack_and_concat
     pylibcudf.partitioning.hash_partition
     pylibcudf.contiguous_split.pack
+    rapidsmpf.shuffler.split_and_pack
     """
     cdef vector[size_type] _columns_to_hash = tuple(columns_to_hash)
     cdef unordered_map[uint32_t, cpp_PackedData] _ret

@@ -103,6 +111,67 @@ cpdef dict partition_and_pack(
     return ret
 
 
+cpdef dict split_and_pack(
+    Table table,
+    splits,
+    stream,
+    DeviceMemoryResource device_mr,
+):
+    """
+    Splits rows from the input table into multiple packed (serialized) tables.
+
+    Parameters
+    ----------
+    table
+        The input table to split and pack. The table cannot be empty (the
+        split points would not be valid).
+    splits
+        The split points, equivalent to cudf::split(), i.e. one less than
+        the number of result partitions.
+    stream
+        The CUDA stream used for memory operations.
+    device_mr
+        Reference to the RMM device memory resource used for device allocations.
+
+    Returns
+    -------
+    A dictionary where the keys are partition IDs and the values are packed tables.
+
+    Raises
+    ------
+    ValueError
+        If the input table is empty.
+
+    See Also
+    --------
+    rapidsmpf.shuffler.unpack_and_concat
+    pylibcudf.copying.split
+    rapidsmpf.shuffler.partition_and_pack
+    """
+    cdef vector[size_type] _splits = tuple(splits)
+    cdef unordered_map[uint32_t, cpp_PackedData] _ret
+    cdef table_view tbl = table.view()
+    if stream is None:
+        raise ValueError("stream cannot be None")
+    cdef cuda_stream_view _stream = Stream(stream).view()
+
+    with nogil:
+        _ret = cpp_split_and_pack(
+            tbl,
+            _splits,
+            _stream,
+            device_mr.get_mr()
+        )
+    ret = {}
+    cdef unordered_map[uint32_t, cpp_PackedData].iterator it = _ret.begin()
+    while it != _ret.end():
+        ret[deref(it).first] = PackedData.from_librapidsmpf(
+            make_unique[cpp_PackedData](move(deref(it).second))
+        )
+        postincrement(it)
+    return ret
+
+
 cdef extern from "<rapidsmpf/shuffler/partition.hpp>" nogil:
     cdef unique_ptr[cpp_table] cpp_unpack_and_concat \
         "rapidsmpf::shuffler::unpack_and_concat"(

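For reference, here is a usage sketch of the new Python binding (not part of this commit), mirroring the round trip exercised by the tests below; it assumes a pylibcudf Table plus a valid stream and device memory resource are already at hand:

import numpy as np

from rapidsmpf.shuffler import split_and_pack, unpack_and_concat

def split_roundtrip(table, num_partitions, stream, device_mr):
    # Equidistant row offsets; one fewer split point than partitions.
    splits = np.linspace(
        0, table.num_rows(), num_partitions, endpoint=False
    )[1:].astype(int)
    chunks = split_and_pack(
        table, splits=splits, stream=stream, device_mr=device_mr
    )
    # Unpacking partitions 0..num_partitions-1 in ID order restores the
    # original row order.
    return unpack_and_concat(
        tuple(chunks[i] for i in range(num_partitions)),
        stream=stream,
        device_mr=device_mr,
    )
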
python/rapidsmpf/rapidsmpf/tests/test_shuffler.py (+51, -1)
@@ -13,7 +13,12 @@
 
 from rapidsmpf.buffer.resource import BufferResource
 from rapidsmpf.progress_thread import ProgressThread
-from rapidsmpf.shuffler import Shuffler, partition_and_pack, unpack_and_concat
+from rapidsmpf.shuffler import (
+    Shuffler,
+    partition_and_pack,
+    split_and_pack,
+    unpack_and_concat,
+)
 from rapidsmpf.testing import assert_eq
 from rapidsmpf.utils.cudf import (
     cudf_to_pylibcudf_table,

@@ -50,6 +55,51 @@ def test_partition_and_pack_unpack(
     assert_eq(expect, got, sort_rows="0")
 
 
+@pytest.mark.parametrize(
+    "df",
+    [
+        {"0": [1, 2, 3], "1": [2, 2, 1]},
+        {"0": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
+    ],
+)
+@pytest.mark.parametrize("num_partitions", [1, 2, 3, 10])
+def test_split_and_pack_unpack(
+    device_mr: rmm.mr.CudaMemoryResource, df: dict[str, list[int]], num_partitions: int
+) -> None:
+    expect = cudf.DataFrame(df)
+    splits = np.linspace(0, len(expect), num_partitions, endpoint=False)[1:].astype(int)
+    partitions = split_and_pack(
+        cudf_to_pylibcudf_table(expect),
+        splits=splits,
+        stream=DEFAULT_STREAM,
+        device_mr=device_mr,
+    )
+    got = pylibcudf_to_cudf_dataframe(
+        unpack_and_concat(
+            tuple(partitions[i] for i in range(num_partitions)),
+            stream=DEFAULT_STREAM,
+            device_mr=device_mr,
+        )
+    )
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize("num_partitions", [1, 2, 3, 10])
+def test_split_and_pack_unpack_empty_table(
+    device_mr: rmm.mr.CudaMemoryResource, num_partitions: int
+) -> None:
+    expect = cudf.DataFrame({"0": [], "1": []})
+    splits = np.linspace(0, len(expect), num_partitions, endpoint=False)[1:].astype(int)
+    with pytest.raises(ValueError, match=".*the input table cannot be empty"):
+        split_and_pack(
+            cudf_to_pylibcudf_table(expect),
+            splits=splits,
+            stream=DEFAULT_STREAM,
+            device_mr=device_mr,
+        )
+
+
 @pytest.mark.parametrize("wait_on", [False, True])
 @pytest.mark.parametrize("total_num_partitions", [1, 2, 3, 10])
 def test_shuffler_single_nonempty_partition(
