Skip to content

Commit 450086a

Browse files
committed
OSHMEM/MCA/SPML/UCX: WIP
1 parent cff7c93 commit 450086a

File tree

2 files changed

+45
-40
lines changed

2 files changed

+45
-40
lines changed

oshmem/mca/spml/ucx/spml_ucx.c

Lines changed: 45 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,17 @@ static void mca_spml_ucx_rkey_store_put(mca_spml_ucx_rkey_store_t *store,
292292
ucp_rkey_destroy(rkey);
293293
}
294294

295+
/*
 * Create the SHMEM_TEAM_WORLD team.
 *
 * Calls mca_spml_ucx_team_split_strided() with a NULL parent team, start 0,
 * stride 1 and size oshmem_num_procs(), with no team config (NULL config,
 * config mask 0), and stores the resulting team in SHMEM_TEAM_WORLD.
 *
 * Takes no arguments and returns nothing. If the split returns anything
 * other than OSHMEM_SUCCESS, the failure is logged through SPML_UCX_ERROR
 * and oshmem_shmem_abort(-1) is called.
 *
 * NOTE(review): this commit makes the function static and removes its call
 * from mca_spml_ucx_ctx_create(); no remaining call site is visible in the
 * hunks shown here -- confirm it is still invoked somewhere in this file.
 * NOTE(review): an empty parameter list `()` does not declare a prototype
 * before C23; `(void)` would be the stricter spelling.
 * NOTE(review): the bare `NNN+` lines below are diff gutter markers from the
 * rendered commit page, not part of the C source.
 */
static void mca_spml_ucx_team_world_init()
296+
{
297+
int rc = mca_spml_ucx_team_split_strided(NULL, 0, 1, oshmem_num_procs(), NULL, 0,
298+
&SHMEM_TEAM_WORLD);
299+
300+
if (rc != OSHMEM_SUCCESS) {
301+
SPML_UCX_ERROR("mca_spml_ucx_team_split_strided failed (SHMEM_TEAM_WORLD creation)");
302+
oshmem_shmem_abort(-1);
303+
}
304+
}
305+
295306
int mca_spml_ucx_enable(bool enable)
296307
{
297308
SPML_UCX_VERBOSE(50, "*** ucx ENABLED ****");
@@ -315,8 +326,7 @@ void mca_spml_ucx_peer_mkey_cache_init(mca_spml_ucx_ctx_t *ucx_ctx, int pe)
315326
int mca_spml_ucx_peer_mkey_cache_add(ucp_peer_t *ucp_peer, int index)
316327
{
317328
/* Allocate an array to hold the pointers to the ucx_cached_mkey */
318-
if (index >= (int)ucp_peer->mkeys_cnt){
319-
int old_size = ucp_peer->mkeys_cnt;
329+
if (index >= (int)ucp_peer->mkeys_cnt){ int old_size = ucp_peer->mkeys_cnt;
320330
ucp_peer->mkeys_cnt = index + 1;
321331
ucp_peer->mkeys = realloc(ucp_peer->mkeys, sizeof(ucp_peer->mkeys[0]) * ucp_peer->mkeys_cnt);
322332
if (NULL == ucp_peer->mkeys) {
@@ -451,6 +461,14 @@ int mca_spml_ucx_ctx_mkey_del(mca_spml_ucx_ctx_t *ucx_ctx, int pe, uint32_t segn
451461
return OSHMEM_SUCCESS;
452462
}
453463

464+
/*
 * Tear down the SHMEM_TEAM_WORLD team if one is currently set.
 *
 * When SHMEM_TEAM_WORLD is non-NULL it is passed to
 * mca_spml_ucx_team_destroy() and the global is then reset to NULL, so a
 * second call finds NULL and does nothing. Takes no arguments and returns
 * nothing.
 *
 * In this commit the call site moves from mca_spml_ucx_ctx_destroy() to
 * mca_spml_ucx_del_procs(), right after oshmem_shmem_barrier().
 *
 * NOTE(review): an empty parameter list `()` does not declare a prototype
 * before C23; `(void)` would be the stricter spelling.
 * NOTE(review): the bare `NNN+` lines below are diff gutter markers from the
 * rendered commit page, not part of the C source.
 */
static void mca_spml_ucx_team_world_destroy()
465+
{
466+
if (SHMEM_TEAM_WORLD != NULL) {
467+
mca_spml_ucx_team_destroy(SHMEM_TEAM_WORLD);
468+
SHMEM_TEAM_WORLD = NULL;
469+
}
470+
}
471+
454472
int mca_spml_ucx_del_procs(oshmem_group_t* group, size_t nprocs)
455473
{
456474
size_t ucp_workers = mca_spml_ucx.ucp_workers;
@@ -460,6 +478,8 @@ int mca_spml_ucx_del_procs(oshmem_group_t* group, size_t nprocs)
460478

461479
oshmem_shmem_barrier();
462480

481+
mca_spml_ucx_team_world_destroy();
482+
463483
if (!mca_spml_ucx_ctx_default.ucp_peers) {
464484
return OSHMEM_SUCCESS;
465485
}
@@ -1163,8 +1183,6 @@ int mca_spml_ucx_ctx_create(long options, shmem_ctx_t *ctx)
11631183
}
11641184
SHMEM_MUTEX_UNLOCK(mca_spml_ucx.internal_mutex);
11651185
}
1166-
1167-
mca_spml_ucx_team_world_init();
11681186

11691187
(*ctx) = (shmem_ctx_t)ucx_ctx;
11701188
return OSHMEM_SUCCESS;
@@ -1183,8 +1201,6 @@ void mca_spml_ucx_ctx_destroy(shmem_ctx_t ctx)
11831201
opal_progress_unregister(spml_ucx_ctx_progress);
11841202
}
11851203

1186-
mca_spml_ucx_team_world_destroy();
1187-
11881204
SHMEM_MUTEX_UNLOCK(mca_spml_ucx.internal_mutex);
11891205
}
11901206

@@ -1754,25 +1770,6 @@ size_t mca_spml_ucx_test_some_vector(void *ivars, int cmp,
17541770
RUNTIME_SHMEM_NOT_IMPLEMENTED_API_ABORT_RET_SIZE_T();
17551771
}
17561772

1757-
void mca_spml_ucx_team_world_init()
1758-
{
1759-
int rc = mca_spml_ucx_team_split_strided(NULL, 0, 1, oshmem_num_procs(), NULL, 0,
1760-
&SHMEM_TEAM_WORLD);
1761-
1762-
if (rc != OSHMEM_SUCCESS) {
1763-
SPML_UCX_ERROR("mca_spml_ucx_team_split_strided failed (SHMEM_TEAM_WORLD creation)");
1764-
oshmem_shmem_abort(-1);
1765-
}
1766-
}
1767-
1768-
void mca_spml_ucx_team_world_destroy()
1769-
{
1770-
if (SHMEM_TEAM_WORLD != NULL) {
1771-
mca_spml_ucx_team_destroy(SHMEM_TEAM_WORLD);
1772-
SHMEM_TEAM_WORLD = NULL;
1773-
}
1774-
}
1775-
17761773
/* This routine is not implemented */
17771774
int mca_spml_ucx_team_sync(shmem_team_t team)
17781775
{
@@ -1820,16 +1817,16 @@ int mca_spml_ucx_team_translate_pe(shmem_team_t src_team, int src_pe,
18201817
mca_spml_ucx_team_t *ucx_dest_team = (mca_spml_ucx_team_t*) dest_team;
18211818
int global_pe;
18221819

1823-
if (src_pe == SPML_UCX_PE_NOT_IN_TEAM || (src_team == dest_team)) {
1820+
if ((src_pe == SPML_UCX_PE_NOT_IN_TEAM) || (src_team == dest_team)) {
18241821
return src_pe;
18251822
}
18261823

1827-
if (src_team == dest_team) {
1828-
return src_pe;
1829-
}
1830-
18311824
global_pe = ucx_src_team->start + src_pe * ucx_src_team->stride;
18321825

1826+
SPML_UCX_WARN("team_translate_pe(src_team=%p, src_pe=%d, dest_team=%p), global pe: %d, "
1827+
"src_team->start: %d, src pe: %d, src_team->stride: %d",
1828+
src_team, src_pe, dest_team, global_pe, ucx_src_team->start, src_pe, ucx_src_team->stride);
1829+
18331830
if (dest_team == SHMEM_TEAM_WORLD) {
18341831
return global_pe;
18351832
}
@@ -1849,32 +1846,44 @@ int mca_spml_ucx_team_split_strided(shmem_team_t parent_team, int start, int
18491846
mca_spml_ucx_team_t *ucx_parent_team;
18501847
mca_spml_ucx_team_t *ucx_new_team;
18511848
int my_pe;
1852-
int n_pes;
18531849

18541850
SPML_UCX_ASSERT(((start + size * stride) <= oshmem_num_procs()) && (start < size) && (stride > 0) && (size > 0));
18551851

1852+
SPML_UCX_WARN("team_split_strided(parent_team=%p, start=%d, stride=%d, size=%d, config=%p, "
1853+
"config_mask=%ld, new_team=%p)",
1854+
parent_team, start, stride, size, config, config_mask, new_team);
1855+
1856+
ucx_new_team = (mca_spml_ucx_team_t *)malloc(sizeof(mca_spml_ucx_team_t));
1857+
ucx_new_team->start = start;
1858+
ucx_new_team->stride = stride;
1859+
18561860
if (parent_team == NULL) {
18571861
my_pe = shmem_my_pe();
18581862
} else {
18591863
ucx_parent_team = (mca_spml_ucx_team_t*) parent_team;
1864+
18601865
SPML_UCX_VALIDATE_TEAM(parent_team);
18611866
if (mca_spml_ucx_is_pe_in_strided_team(ucx_parent_team->my_pe, start, stride, size)) {
18621867
my_pe = (ucx_parent_team->my_pe - start) / stride;
1868+
SPML_UCX_WARN("split: my_pe at parent team: %d, start: %d, stride: %d, size: %d, "
1869+
"my_pe at new team: %d", ucx_parent_team->my_pe, start, stride, size, my_pe);
18631870
} else {
18641871
/* not in team, according to spec it should be SHMEM_TEAM_INVALID but its value is NULL which
18651872
can be also interpreted as 0 (first pe), therefore -1 is used */
1873+
1874+
SPML_UCX_WARN("pe #%d is not part of the new team", ucx_parent_team->my_pe);
18661875
my_pe = SPML_UCX_PE_NOT_IN_TEAM;
18671876
}
1877+
1878+
/* In order to simplify pe translations start and stride are calculated with respect to
1879+
* world_team */
1880+
ucx_new_team->start += ucx_parent_team->start;
1881+
ucx_new_team->stride *= ucx_parent_team->stride;
18681882
}
18691883

1870-
ucx_new_team = (mca_spml_ucx_team_t *)malloc(sizeof(mca_spml_ucx_team_t));
18711884
ucx_new_team->n_pes = size;
18721885
ucx_new_team->my_pe = my_pe;
18731886

1874-
/* In order to simplify pe translations start and stride are calculated with respect to
1875-
* world_team */
1876-
ucx_new_team->start = ucx_parent_team->start + start;
1877-
ucx_new_team->stride = ucx_parent_team->stride * stride;
18781887
ucx_new_team->config = calloc(1, sizeof(mca_spml_ucx_team_config_t));
18791888

18801889
if (config != NULL) {

oshmem/mca/spml/ucx/spml_ucx.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ typedef struct mca_spml_ucx_team_config {
130130
} mca_spml_ucx_team_config_t;
131131

132132
typedef struct mca_spml_ucx_team {
133-
shmem_team_t super;
134133
int n_pes;
135134
int my_pe;
136135
int stride;
@@ -324,9 +323,6 @@ mca_spml_ucx_mem_map_flags_symmetric_rkey(struct mca_spml_ucx *spml_ucx);
324323
extern void mca_spml_ucx_rkey_store_init(mca_spml_ucx_rkey_store_t *store);
325324
extern void mca_spml_ucx_rkey_store_cleanup(mca_spml_ucx_rkey_store_t *store);
326325

327-
void mca_spml_ucx_team_world_init();
328-
void mca_spml_ucx_team_world_destroy();
329-
330326
static inline int
331327
mca_spml_ucx_peer_mkey_get(ucp_peer_t *ucp_peer, int index, spml_ucx_cached_mkey_t **out_rmkey)
332328
{

0 commit comments

Comments
 (0)