Commit d4c738b

Implement split_and_pack, as a building block for sort-shuffling (#249)
Implement the one missing piece for sorting in rapidsmpf. More could be done here, but I think this is sufficient. This is almost identical to `partition_and_pack` (we could also pass in the already-split tables, making it even more of a shared building block with `partition_and_pack`). `split_and_pack` should then be able to replace `partition_and_pack` in the `Shuffle` for a new `SortShuffle`(?); I assumed it is OK to skip a fully empty part there. (More on how sorting can work in the details below.)

~~EDIT: Hmmm, my pre-commit borked some formatting... (different computer, correct result)~~

<details>

The basic steps for sorting are the following:

1. Local sorting.
2. From the local result, extract evenly spaced points `[0, step, ..., step*(N-1)]` (roughly).
   * (Only from the columns actually being sorted.)
3. Continue with these "split candidates" (a sketch of this step follows below):
   * Also attach the `(partition_id, row)`, i.e. the row we split at in global coordinates.
   * Broadcast all split candidates to all parts. (*I am assuming this is OK to do with Dask for now, as it is small.*)
   * Do a local sort of all candidates (from all parts).
   * Use these to find which slice of our local chunk needs to go to which node, i.e. the input for `split_and_pack`.
4. Use the shuffler, but with `split_and_pack(local_sorted_result, split_points)` (found in steps 1 and 3).
5. Do another local sort after gathering.

For stable sorting, care needs to be taken, and I am not sure how the shuffler behaves there. I had implemented this for [legate-df](https://github.com/rapidsai/legate-dataframe/blob/main/cpp/src/sort.cpp), which uses the `libcudf` API, so it should translate pretty well to `pylibcudf` code. (For me, the slow part right now is figuring out the exact graph building, etc.)

</details>

Authors:
  - Sebastian Berg (https://github.com/seberg)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Richard (Rick) Zamora (https://github.com/rjzamora)

URL: #249
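As a sketch of step 3 above (not code from this PR): given the locally sorted sort column and the globally gathered, sorted split candidates, the `splits` argument for `split_and_pack` can be found with a binary search. The names `local_keys` and `candidates` are illustrative stand-ins, and ties are ignored here (the `(partition_id, row)` tiebreak mentioned above would handle stability):

```python
import numpy as np

def local_split_points(
    local_keys: np.ndarray, candidates: np.ndarray, num_partitions: int
) -> np.ndarray:
    """Sketch of step 3: derive cudf::split()-style split points.

    local_keys: this worker's sort column, already locally sorted (step 1).
    candidates: all workers' split candidates, gathered and sorted (step 3).
    """
    # Pick num_partitions - 1 evenly spaced boundary values from the
    # sorted candidates; each one separates two output partitions.
    step = len(candidates) / num_partitions
    bounds = candidates[(np.arange(1, num_partitions) * step).astype(int)]
    # For each boundary, find the first local row with key >= boundary.
    # These indices are exactly the `splits` input for split_and_pack.
    return np.searchsorted(local_keys, bounds, side="left")

# E.g. local_keys=[1, 3, 3, 7, 9] with boundaries 3 and 8 gives
# splits=[1, 4]: rows [0:1) -> part 0, [1:4) -> part 1, [4:5) -> part 2.
```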
1 parent 94d5257 commit d4c738b

File tree: 6 files changed, +223 −6 lines changed

cpp/include/rapidsmpf/shuffler/partition.hpp (+26)

@@ -85,6 +85,32 @@ partition_and_split(
     rmm::device_async_resource_ref mr
 );
 
+
+/**
+ * @brief Splits rows from the input table into multiple packed (serialized) tables.
+ *
+ * @param table The table to split and pack into partitions.
+ * @param splits The split points, equivalent to cudf::split(), i.e. one less than
+ * the number of result partitions.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned table's device memory.
+ *
+ * @return A map of partition IDs and their packed tables.
+ *
+ * @throw std::out_of_range if the splits are invalid.
+ *
+ * @see unpack_and_concat
+ * @see cudf::split
+ * @see partition_and_pack
+ */
+[[nodiscard]] std::unordered_map<PartID, PackedData> split_and_pack(
+    cudf::table_view const& table,
+    std::vector<cudf::size_type> const& splits,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr
+);
+
+
 /**
  * @brief Unpack (deserialize) input tables and concatenate them.
  *
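For intuition on the `splits` argument documented above: N split points define N + 1 contiguous (possibly empty) partitions, exactly as in `cudf::split()`. A plain-Python illustration of the semantics only, with list slicing standing in for the device-side split (not rapidsmpf code):

```python
def split_rows(rows: list, splits: list[int]) -> list[list]:
    # cudf::split()-style semantics: boundaries [0, *splits, len(rows)]
    # define len(splits) + 1 contiguous, possibly empty, partitions.
    bounds = [0, *splits, len(rows)]
    return [rows[a:b] for a, b in zip(bounds, bounds[1:])]

assert split_rows([10, 20, 30, 40, 50], [2, 4]) == [[10, 20], [30, 40], [50]]
assert split_rows([10, 20, 30], [0, 3]) == [[], [10, 20, 30], []]
```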

cpp/src/shuffler/partition.cpp (+39 −5)

@@ -58,6 +58,20 @@ partition_and_split(
     return std::make_pair(std::move(tbl_partitioned), std::move(partition_table));
 }
 
+static std::unordered_map<PartID, PackedData> pack_tables(
+    std::vector<cudf::table_view> const& tables,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr
+) {
+    std::unordered_map<PartID, PackedData> ret;
+    ret.reserve(tables.size());
+    for (PartID i = 0; static_cast<std::size_t>(i) < tables.size(); ++i) {
+        auto pack = cudf::detail::pack(tables[i], stream, mr);
+        ret.emplace(i, PackedData(std::move(pack.metadata), std::move(pack.gpu_data)));
+    }
+    return ret;
+}
+
 std::unordered_map<PartID, PackedData> partition_and_pack(
     cudf::table_view const& table,
     std::vector<cudf::size_type> const& columns_to_hash,
@@ -71,12 +85,32 @@ std::unordered_map<PartID, PackedData> partition_and_pack(
     auto [tables, owner] = partition_and_split(
         table, columns_to_hash, num_partitions, hash_function, seed, stream, mr
     );
-    std::unordered_map<PartID, PackedData> ret;
-    for (PartID i = 0; static_cast<std::size_t>(i) < tables.size(); ++i) {
-        auto pack = cudf::detail::pack(tables[i], stream, mr);
-        ret.emplace(i, PackedData(std::move(pack.metadata), std::move(pack.gpu_data)));
+    return pack_tables(tables, stream, mr);
+}
+
+std::unordered_map<PartID, PackedData> split_and_pack(
+    cudf::table_view const& table,
+    std::vector<cudf::size_type> const& splits,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr
+) {
+    RAPIDSMPF_NVTX_FUNC_RANGE();
+    std::vector<cudf::table_view> tables;
+
+    if (table.num_rows() == 0) {
+        // Work around cudf::split() not supporting empty tables.
+        RAPIDSMPF_EXPECTS(
+            std::all_of(splits.begin(), splits.end(), [](auto val) { return val == 0; }),
+            "split point != 0 is invalid for empty table",
+            std::out_of_range
+        );
+        tables = std::vector<cudf::table_view>(
+            static_cast<std::size_t>(splits.size() + 1), table
+        );
+    } else {
+        tables = cudf::split(table, splits, stream);
     }
-    return ret;
+    return pack_tables(tables, stream, mr);
 }
 
 std::unique_ptr<cudf::table> unpack_and_concat(
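The empty-table branch above exists because `cudf::split()` rejects empty tables; `split_and_pack` instead accepts all-zero split points and returns `splits.size() + 1` empty partitions. A Python model of that contract, as a sketch only (not rapidsmpf code; the C++ `std::out_of_range` surfaces as `IndexError` in the Python binding):

```python
def split_empty(splits: list[int]) -> list[list]:
    # Mirrors the C++ workaround: an empty table can only be split at 0,
    # and the result is len(splits) + 1 empty partitions.
    if any(s != 0 for s in splits):
        raise IndexError("split point != 0 is invalid for empty table")
    return [[] for _ in range(len(splits) + 1)]

assert split_empty([0, 0]) == [[], [], []]
```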

cpp/tests/test_shuffler.cpp (+30)

@@ -67,6 +67,36 @@ TEST_P(NumOfPartitions, partition_and_pack) {
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(sort_table(expect), sort_table(result));
 }
 
+TEST_P(NumOfPartitions, split_and_pack) {
+    int const num_partitions = std::get<0>(GetParam());
+    int const num_rows = std::get<1>(GetParam());
+    std::int64_t const seed = 42;
+    auto stream = cudf::get_default_stream();
+    auto mr = cudf::get_current_device_resource_ref();
+
+    cudf::table expect = random_table_with_index(seed, num_rows, 0, 10);
+
+    std::vector<cudf::size_type> splits;
+    for (int i = 1; i < num_partitions; ++i) {
+        splits.emplace_back(i * num_rows / num_partitions);
+    }
+
+    auto chunks = rapidsmpf::shuffler::split_and_pack(expect, splits, stream, mr);
+
+    // Convert to a vector (restoring the original order).
+    std::vector<rapidsmpf::PackedData> chunks_vector;
+    for (int i = 0; i < num_partitions; ++i) {
+        chunks_vector.emplace_back(std::move(chunks.at(i)));
+    }
+    EXPECT_EQ(chunks_vector.size(), num_partitions);
+
+    auto result =
+        rapidsmpf::shuffler::unpack_and_concat(std::move(chunks_vector), stream, mr);
+
+    // Compare the input table with the result.
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expect, *result);
+}
+
 TEST(MetadataMessage, round_trip) {
     auto metadata = iota_vector<uint8_t>(100);
python/rapidsmpf/rapidsmpf/shuffler.pyi (+6)

@@ -21,6 +21,12 @@ def partition_and_pack(
     stream: Stream,
     device_mr: DeviceMemoryResource,
 ) -> dict[int, PackedData]: ...
+def split_and_pack(
+    table: Table,
+    splits: Iterable[int],
+    stream: Stream,
+    device_mr: DeviceMemoryResource,
+) -> dict[int, PackedData]: ...
 def unpack_and_concat(
     partitions: Iterable[PackedData],
     stream: Stream,

python/rapidsmpf/rapidsmpf/shuffler.pyx (+70)

@@ -38,6 +38,14 @@ cdef extern from "<rapidsmpf/shuffler/partition.hpp>" nogil:
         device_memory_resource *mr,
     ) except +
 
+    cdef unordered_map[uint32_t, cpp_PackedData] cpp_split_and_pack \
+        "rapidsmpf::shuffler::split_and_pack"(
+            const table_view& table,
+            const vector[size_type] &splits,
+            cuda_stream_view stream,
+            device_memory_resource *mr,
+        ) except +
+
 
 cpdef dict partition_and_pack(
     Table table,
@@ -76,6 +84,7 @@ cpdef dict partition_and_pack(
     rapidsmpf.shuffler.unpack_and_concat
     pylibcudf.partitioning.hash_partition
     pylibcudf.contiguous_split.pack
+    rapidsmpf.shuffler.split_and_pack
     """
     cdef vector[size_type] _columns_to_hash = tuple(columns_to_hash)
     cdef unordered_map[uint32_t, cpp_PackedData] _ret
@@ -103,6 +112,67 @@ cpdef dict partition_and_pack(
     return ret
 
 
+cpdef dict split_and_pack(
+    Table table,
+    splits,
+    stream,
+    DeviceMemoryResource device_mr,
+):
+    """
+    Splits rows from the input table into multiple packed (serialized) tables.
+
+    Parameters
+    ----------
+    table
+        The input table to split and pack. An empty table is only valid
+        if all split points are zero.
+    splits
+        The split points, equivalent to cudf::split(), i.e. one less than
+        the number of result partitions.
+    stream
+        The CUDA stream used for memory operations.
+    device_mr
+        Reference to the RMM device memory resource used for device allocations.
+
+    Returns
+    -------
+    A dictionary where the keys are partition IDs and the values are packed tables.
+
+    Raises
+    ------
+    IndexError
+        If the splits are out of range for ``[0, len(table)]``.
+
+    See Also
+    --------
+    rapidsmpf.shuffler.unpack_and_concat
+    pylibcudf.copying.split
+    rapidsmpf.shuffler.partition_and_pack
+    """
+    cdef vector[size_type] _splits = tuple(splits)
+    cdef unordered_map[uint32_t, cpp_PackedData] _ret
+    cdef table_view tbl = table.view()
+    if stream is None:
+        raise ValueError("stream cannot be None")
+    cdef cuda_stream_view _stream = Stream(stream).view()
+
+    with nogil:
+        _ret = cpp_split_and_pack(
+            tbl,
+            _splits,
+            _stream,
+            device_mr.get_mr()
+        )
+    ret = {}
+    cdef unordered_map[uint32_t, cpp_PackedData].iterator it = _ret.begin()
+    while it != _ret.end():
+        ret[deref(it).first] = PackedData.from_librapidsmpf(
+            make_unique[cpp_PackedData](move(deref(it).second))
+        )
+        postincrement(it)
+    return ret
+
+
 cdef extern from "<rapidsmpf/shuffler/partition.hpp>" nogil:
     cdef unique_ptr[cpp_table] cpp_unpack_and_concat \
         "rapidsmpf::shuffler::unpack_and_concat"(

python/rapidsmpf/rapidsmpf/tests/test_shuffler.py (+52 −1)

@@ -13,7 +13,12 @@
 
 from rapidsmpf.buffer.resource import BufferResource
 from rapidsmpf.progress_thread import ProgressThread
-from rapidsmpf.shuffler import Shuffler, partition_and_pack, unpack_and_concat
+from rapidsmpf.shuffler import (
+    Shuffler,
+    partition_and_pack,
+    split_and_pack,
+    unpack_and_concat,
+)
 from rapidsmpf.testing import assert_eq
 from rapidsmpf.utils.cudf import (
     cudf_to_pylibcudf_table,
@@ -50,6 +55,52 @@ def test_partition_and_pack_unpack(
     assert_eq(expect, got, sort_rows="0")
 
 
+@pytest.mark.parametrize(
+    "df",
+    [
+        {"0": [1, 2, 3], "1": [2, 2, 1]},
+        {"0": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
+        {"0": [], "1": []},
+    ],
+)
+@pytest.mark.parametrize("num_partitions", [1, 2, 3, 10])
+def test_split_and_pack_unpack(
+    device_mr: rmm.mr.CudaMemoryResource, df: dict[str, list[int]], num_partitions: int
+) -> None:
+    expect = cudf.DataFrame(df)
+    splits = np.linspace(0, len(expect), num_partitions, endpoint=False)[1:].astype(int)
+    partitions = split_and_pack(
+        cudf_to_pylibcudf_table(expect),
+        splits=splits,
+        stream=DEFAULT_STREAM,
+        device_mr=device_mr,
+    )
+    got = pylibcudf_to_cudf_dataframe(
+        unpack_and_concat(
+            tuple(partitions[i] for i in range(num_partitions)),
+            stream=DEFAULT_STREAM,
+            device_mr=device_mr,
+        )
+    )
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize("df", [{"0": [1, 2, 3], "1": [2, 2, 1]}, {"0": [], "1": []}])
+@pytest.mark.parametrize("num_partitions", [1, 2, 3, 10])
+def test_split_and_pack_unpack_out_of_range(
+    device_mr: rmm.mr.CudaMemoryResource, df: dict[str, list[int]], num_partitions: int
+) -> None:
+    expect = cudf.DataFrame(df)
+    with pytest.raises(IndexError):
+        split_and_pack(
+            cudf_to_pylibcudf_table(expect),
+            splits=[100],
+            stream=DEFAULT_STREAM,
+            device_mr=device_mr,
+        )
+
+
 @pytest.mark.parametrize("wait_on", [False, True])
 @pytest.mark.parametrize("total_num_partitions", [1, 2, 3, 10])
 def test_shuffler_single_nonempty_partition(
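The `np.linspace(...)` expression in `test_split_and_pack_unpack` is just a compact way to compute evenly spaced split points; spelled out (values follow directly from the test parameters):

```python
import numpy as np

# 10 rows into 3 partitions -> split points [3, 6], i.e. row ranges
# [0:3), [3:6), [6:10).
splits = np.linspace(0, 10, 3, endpoint=False)[1:].astype(int)
assert splits.tolist() == [3, 6]

# For the empty-table case every split point collapses to 0, which
# split_and_pack accepts and turns into empty partitions.
splits = np.linspace(0, 0, 10, endpoint=False)[1:].astype(int)
assert splits.tolist() == [0] * 9
```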
