Skip to content

Commit 59896c1

Browse files
authored
[libc] Remove the 'rpc_reset' routine from the RPC implementation (#66700)
Summary: This patch removes the `rpc_reset` function. This was previously used to initialize the RPC client on the device by setting up the pointers to communicate with the server. The purpose of this was to make it easier to initialize the device for testing. However, this prevented us from enforcing an invariant that the buffers are all read-only from the client side. The expected way to initialize the server is now to copy it from the host runtime. This will allow us to maintain that the RPC client is in the constant address space on the GPU, potentially through inference, and improving caching behaviour.
1 parent 88800f7 commit 59896c1

File tree

14 files changed

+115
-151
lines changed

14 files changed

+115
-151
lines changed

libc/config/gpu/entrypoints.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,6 @@ set(TARGET_LIBC_ENTRYPOINTS
117117
libc.src.time.nanosleep
118118

119119
# gpu/rpc.h entrypoints
120-
libc.src.gpu.rpc_reset
121120
libc.src.gpu.rpc_host_call
122121
)
123122

libc/spec/gpu_ext.td

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,6 @@ def GPUExtensions : StandardSpec<"GPUExtensions"> {
55
[], // Types
66
[], // Enumerations
77
[
8-
FunctionSpec<
9-
"rpc_reset",
10-
RetValSpec<VoidType>,
11-
[ArgSpec<UnsignedIntType>, ArgSpec<VoidPtr>]
12-
>,
138
FunctionSpec<
149
"rpc_host_call",
1510
RetValSpec<VoidType>,

libc/src/__support/RPC/rpc.h

Lines changed: 15 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -88,20 +88,13 @@ template <bool Invert, typename Packet> struct Process {
8888
static constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8;
8989
cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
9090

91-
/// Initialize the communication channels.
92-
LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
93-
this->port_count = port_count;
94-
this->inbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
95-
advance(buffer, inbox_offset(port_count)));
96-
this->outbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
97-
advance(buffer, outbox_offset(port_count)));
98-
this->packet =
99-
reinterpret_cast<Packet *>(advance(buffer, buffer_offset(port_count)));
100-
}
101-
102-
/// Returns the beginning of the unified buffer. Intended for initializing the
103-
/// client after the server has been started.
104-
LIBC_INLINE void *get_buffer_start() const { return Invert ? outbox : inbox; }
91+
LIBC_INLINE Process(uint32_t port_count, void *buffer)
92+
: port_count(port_count), inbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
93+
advance(buffer, inbox_offset(port_count)))),
94+
outbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
95+
advance(buffer, outbox_offset(port_count)))),
96+
packet(reinterpret_cast<Packet *>(
97+
advance(buffer, buffer_offset(port_count)))) {}
10598

10699
/// Allocate a memory buffer sufficient to store the following equivalent
107100
/// representation in memory.
@@ -116,13 +109,13 @@ template <bool Invert, typename Packet> struct Process {
116109
}
117110

118111
/// Retrieve the inbox state from memory shared between processes.
119-
LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) {
112+
LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const {
120113
return gpu::broadcast_value(lane_mask,
121114
inbox[index].load(cpp::MemoryOrder::RELAXED));
122115
}
123116

124117
/// Retrieve the outbox state from memory shared between processes.
125-
LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) {
118+
LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const {
126119
return gpu::broadcast_value(lane_mask,
127120
outbox[index].load(cpp::MemoryOrder::RELAXED));
128121
}
@@ -349,13 +342,12 @@ struct Client {
349342
LIBC_INLINE Client &operator=(const Client &) = delete;
350343
LIBC_INLINE ~Client() = default;
351344

345+
LIBC_INLINE Client(uint32_t port_count, void *buffer)
346+
: process(port_count, buffer) {}
347+
352348
using Port = rpc::Port<false, Packet<gpu::LANE_SIZE>>;
353349
template <uint16_t opcode> LIBC_INLINE Port open();
354350

355-
LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
356-
process.reset(port_count, buffer);
357-
}
358-
359351
private:
360352
Process<false, Packet<gpu::LANE_SIZE>> process;
361353
};
@@ -371,18 +363,13 @@ template <uint32_t lane_size> struct Server {
371363
LIBC_INLINE Server &operator=(const Server &) = delete;
372364
LIBC_INLINE ~Server() = default;
373365

366+
LIBC_INLINE Server(uint32_t port_count, void *buffer)
367+
: process(port_count, buffer) {}
368+
374369
using Port = rpc::Port<true, Packet<lane_size>>;
375370
LIBC_INLINE cpp::optional<Port> try_open();
376371
LIBC_INLINE Port open();
377372

378-
LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
379-
process.reset(port_count, buffer);
380-
}
381-
382-
LIBC_INLINE void *get_buffer_start() const {
383-
return process.get_buffer_start();
384-
}
385-
386373
LIBC_INLINE static uint64_t allocation_size(uint32_t port_count) {
387374
return Process<true, Packet<lane_size>>::allocation_size(port_count);
388375
}

libc/src/gpu/CMakeLists.txt

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,3 @@
1-
add_entrypoint_object(
2-
rpc_reset
3-
SRCS
4-
rpc_reset.cpp
5-
HDRS
6-
rpc_reset.h
7-
DEPENDS
8-
libc.src.__support.RPC.rpc_client
9-
libc.src.__support.GPU.utils
10-
)
11-
121
add_entrypoint_object(
132
rpc_host_call
143
SRCS

libc/src/gpu/rpc_reset.cpp

Lines changed: 0 additions & 24 deletions
This file was deleted.

libc/src/gpu/rpc_reset.h

Lines changed: 0 additions & 18 deletions
This file was deleted.

libc/startup/gpu/amdgpu/start.cpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,7 @@ static void call_fini_array_callbacks() {
4444
} // namespace __llvm_libc
4545

4646
extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
47-
_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
48-
// We need to set up the RPC client first in case any of the constructors
49-
// require it.
50-
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
51-
rpc_shared_buffer);
52-
47+
_begin(int argc, char **argv, char **env) {
5348
// We want the fini array callbacks to be run after other atexit
5449
// callbacks are run. So, we register them before running the init
5550
// array callbacks as they can potentially register their own atexit

libc/startup/gpu/nvptx/start.cpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,12 +42,7 @@ static void call_fini_array_callbacks() {
4242
} // namespace __llvm_libc
4343

4444
extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
45-
_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
46-
// We need to set up the RPC client first in case any of the constructors
47-
// require it.
48-
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
49-
rpc_shared_buffer);
50-
45+
_begin(int argc, char **argv, char **env) {
5146
// We want the fini array callbacks to be run after other atexit
5247
// callbacks are run. So, we register them before running the init
5348
// array callbacks as they can potentially register their own atexit

libc/test/src/__support/RPC/rpc_smoke_test.cpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,8 @@ alignas(64) char buffer[alloc_size] = {0};
3333

3434
TEST(LlvmLibcRPCSmoke, SanityCheck) {
3535

36-
ProcAType ProcA;
37-
ProcBType ProcB;
38-
39-
ProcA.reset(port_count, buffer);
40-
ProcB.reset(port_count, buffer);
41-
42-
EXPECT_EQ(ProcA.get_buffer_start(), ProcB.get_buffer_start());
36+
ProcAType ProcA(port_count, buffer);
37+
ProcBType ProcB(port_count, buffer);
4338

4439
uint64_t index = 0; // any < port_count
4540
uint64_t lane_mask = 1;

libc/utils/gpu/loader/Loader.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ struct begin_args_t {
3434
int argc;
3535
void *argv;
3636
void *envp;
37-
void *rpc_shared_buffer;
3837
};
3938

4039
/// The arguments to the '_start' kernel.

libc/utils/gpu/loader/amdgpu/Loader.cpp

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,49 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
430430
else
431431
handle_error("Invalid wavefront size");
432432

433+
// Initialize the RPC client on the device by copying the local data to the
434+
// device's internal pointer.
435+
hsa_executable_symbol_t rpc_client_sym;
436+
if (hsa_status_t err = hsa_executable_get_symbol_by_name(
437+
executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym))
438+
handle_error(err);
439+
440+
void *rpc_client_host;
441+
if (hsa_status_t err =
442+
hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(void *),
443+
/*flags=*/0, &rpc_client_host))
444+
handle_error(err);
445+
446+
void *rpc_client_dev;
447+
if (hsa_status_t err = hsa_executable_symbol_get_info(
448+
rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
449+
&rpc_client_dev))
450+
handle_error(err);
451+
452+
// Copy the address of the client buffer from the device to the host.
453+
if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev,
454+
dev_agent, sizeof(void *)))
455+
handle_error(err);
456+
457+
void *rpc_client_buffer;
458+
if (hsa_status_t err = hsa_amd_memory_pool_allocate(
459+
coarsegrained_pool, rpc_get_client_size(),
460+
/*flags=*/0, &rpc_client_buffer))
461+
handle_error(err);
462+
std::memcpy(rpc_client_buffer, rpc_get_client_buffer(device_id),
463+
rpc_get_client_size());
464+
465+
// Copy the RPC client buffer to the address pointed to by the symbol.
466+
if (hsa_status_t err =
467+
hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
468+
rpc_client_buffer, host_agent, rpc_get_client_size()))
469+
handle_error(err);
470+
471+
if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_buffer))
472+
handle_error(err);
473+
if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
474+
handle_error(err);
475+
433476
// Obtain the GPU's fixed-frequency clock rate and copy it to the GPU.
434477
// If the clock_freq symbol is missing, no work to do.
435478
hsa_executable_symbol_t freq_sym;
@@ -474,8 +517,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
474517
handle_error(err);
475518

476519
LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
477-
begin_args_t init_args = {argc, dev_argv, dev_envp,
478-
rpc_get_buffer(device_id)};
520+
begin_args_t init_args = {argc, dev_argv, dev_envp};
479521
if (hsa_status_t err = launch_kernel(
480522
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
481523
single_threaded_params, "_begin.kd", init_args))

libc/utils/gpu/loader/nvptx/Loader.cpp

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -309,10 +309,25 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
309309
warp_size, rpc_alloc, nullptr))
310310
handle_error(err);
311311

312+
// Initialize the RPC client on the device by copying the local data to the
313+
// device's internal pointer.
314+
CUdeviceptr rpc_client_dev = 0;
315+
uint64_t client_ptr_size = sizeof(void *);
316+
if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
317+
binary, rpc_client_symbol_name))
318+
handle_error(err);
319+
320+
CUdeviceptr rpc_client_host = 0;
321+
if (CUresult err =
322+
cuMemcpyDtoH(&rpc_client_host, rpc_client_dev, sizeof(void *)))
323+
handle_error(err);
324+
if (CUresult err =
325+
cuMemcpyHtoD(rpc_client_host, rpc_get_client_buffer(device_id),
326+
rpc_get_client_size()))
327+
handle_error(err);
328+
312329
LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
313-
// Call the kernel to
314-
begin_args_t init_args = {argc, dev_argv, dev_envp,
315-
rpc_get_buffer(device_id)};
330+
begin_args_t init_args = {argc, dev_argv, dev_envp};
316331
if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
317332
"_begin", init_args))
318333
handle_error(err);

0 commit comments

Comments
 (0)