@@ -292,6 +292,17 @@ static void mca_spml_ucx_rkey_store_put(mca_spml_ucx_rkey_store_t *store,
292
292
ucp_rkey_destroy (rkey );
293
293
}
294
294
295
+ static void mca_spml_ucx_team_world_init ()
296
+ {
297
+ int rc = mca_spml_ucx_team_split_strided (NULL , 0 , 1 , oshmem_num_procs (), NULL , 0 ,
298
+ & SHMEM_TEAM_WORLD );
299
+
300
+ if (rc != OSHMEM_SUCCESS ) {
301
+ SPML_UCX_ERROR ("mca_spml_ucx_team_split_strided failed (SHMEM_TEAM_WORLD creation)" );
302
+ oshmem_shmem_abort (-1 );
303
+ }
304
+ }
305
+
295
306
int mca_spml_ucx_enable (bool enable )
296
307
{
297
308
SPML_UCX_VERBOSE (50 , "*** ucx ENABLED ****" );
@@ -315,8 +326,7 @@ void mca_spml_ucx_peer_mkey_cache_init(mca_spml_ucx_ctx_t *ucx_ctx, int pe)
315
326
int mca_spml_ucx_peer_mkey_cache_add (ucp_peer_t * ucp_peer , int index )
316
327
{
317
328
/* Allocate an array to hold the pointers to the ucx_cached_mkey */
318
- if (index >= (int )ucp_peer -> mkeys_cnt ){
319
- int old_size = ucp_peer -> mkeys_cnt ;
329
+ if (index >= (int )ucp_peer -> mkeys_cnt ){ int old_size = ucp_peer -> mkeys_cnt ;
320
330
ucp_peer -> mkeys_cnt = index + 1 ;
321
331
ucp_peer -> mkeys = realloc (ucp_peer -> mkeys , sizeof (ucp_peer -> mkeys [0 ]) * ucp_peer -> mkeys_cnt );
322
332
if (NULL == ucp_peer -> mkeys ) {
@@ -451,6 +461,14 @@ int mca_spml_ucx_ctx_mkey_del(mca_spml_ucx_ctx_t *ucx_ctx, int pe, uint32_t segn
451
461
return OSHMEM_SUCCESS ;
452
462
}
453
463
464
+ static void mca_spml_ucx_team_world_destroy ()
465
+ {
466
+ if (SHMEM_TEAM_WORLD != NULL ) {
467
+ mca_spml_ucx_team_destroy (SHMEM_TEAM_WORLD );
468
+ SHMEM_TEAM_WORLD = NULL ;
469
+ }
470
+ }
471
+
454
472
int mca_spml_ucx_del_procs (oshmem_group_t * group , size_t nprocs )
455
473
{
456
474
size_t ucp_workers = mca_spml_ucx .ucp_workers ;
@@ -460,6 +478,8 @@ int mca_spml_ucx_del_procs(oshmem_group_t* group, size_t nprocs)
460
478
461
479
oshmem_shmem_barrier ();
462
480
481
+ mca_spml_ucx_team_world_destroy ();
482
+
463
483
if (!mca_spml_ucx_ctx_default .ucp_peers ) {
464
484
return OSHMEM_SUCCESS ;
465
485
}
@@ -1163,8 +1183,6 @@ int mca_spml_ucx_ctx_create(long options, shmem_ctx_t *ctx)
1163
1183
}
1164
1184
SHMEM_MUTEX_UNLOCK (mca_spml_ucx .internal_mutex );
1165
1185
}
1166
-
1167
- mca_spml_ucx_team_world_init ();
1168
1186
1169
1187
(* ctx ) = (shmem_ctx_t )ucx_ctx ;
1170
1188
return OSHMEM_SUCCESS ;
@@ -1183,8 +1201,6 @@ void mca_spml_ucx_ctx_destroy(shmem_ctx_t ctx)
1183
1201
opal_progress_unregister (spml_ucx_ctx_progress );
1184
1202
}
1185
1203
1186
- mca_spml_ucx_team_world_destroy ();
1187
-
1188
1204
SHMEM_MUTEX_UNLOCK (mca_spml_ucx .internal_mutex );
1189
1205
}
1190
1206
@@ -1754,25 +1770,6 @@ size_t mca_spml_ucx_test_some_vector(void *ivars, int cmp,
1754
1770
RUNTIME_SHMEM_NOT_IMPLEMENTED_API_ABORT_RET_SIZE_T ();
1755
1771
}
1756
1772
1757
- void mca_spml_ucx_team_world_init ()
1758
- {
1759
- int rc = mca_spml_ucx_team_split_strided (NULL , 0 , 1 , oshmem_num_procs (), NULL , 0 ,
1760
- & SHMEM_TEAM_WORLD );
1761
-
1762
- if (rc != OSHMEM_SUCCESS ) {
1763
- SPML_UCX_ERROR ("mca_spml_ucx_team_split_strided failed (SHMEM_TEAM_WORLD creation)" );
1764
- oshmem_shmem_abort (-1 );
1765
- }
1766
- }
1767
-
1768
- void mca_spml_ucx_team_world_destroy ()
1769
- {
1770
- if (SHMEM_TEAM_WORLD != NULL ) {
1771
- mca_spml_ucx_team_destroy (SHMEM_TEAM_WORLD );
1772
- SHMEM_TEAM_WORLD = NULL ;
1773
- }
1774
- }
1775
-
1776
1773
/* This routine is not implemented */
1777
1774
int mca_spml_ucx_team_sync (shmem_team_t team )
1778
1775
{
@@ -1820,16 +1817,16 @@ int mca_spml_ucx_team_translate_pe(shmem_team_t src_team, int src_pe,
1820
1817
mca_spml_ucx_team_t * ucx_dest_team = (mca_spml_ucx_team_t * ) dest_team ;
1821
1818
int global_pe ;
1822
1819
1823
- if (src_pe == SPML_UCX_PE_NOT_IN_TEAM || (src_team == dest_team )) {
1820
+ if (( src_pe == SPML_UCX_PE_NOT_IN_TEAM ) || (src_team == dest_team )) {
1824
1821
return src_pe ;
1825
1822
}
1826
1823
1827
- if (src_team == dest_team ) {
1828
- return src_pe ;
1829
- }
1830
-
1831
1824
global_pe = ucx_src_team -> start + src_pe * ucx_src_team -> stride ;
1832
1825
1826
+ SPML_UCX_WARN ("team_translate_pe(src_team=%p, src_pe=%d, dest_team=%p), global pe: %d, "
1827
+ "src_team->start: %d, src pe: %d, src_team->stride: %d" ,
1828
+ src_team , src_pe , dest_team , global_pe , ucx_src_team -> start , src_pe , ucx_src_team -> stride );
1829
+
1833
1830
if (dest_team == SHMEM_TEAM_WORLD ) {
1834
1831
return global_pe ;
1835
1832
}
@@ -1849,32 +1846,44 @@ int mca_spml_ucx_team_split_strided(shmem_team_t parent_team, int start, int
1849
1846
mca_spml_ucx_team_t * ucx_parent_team ;
1850
1847
mca_spml_ucx_team_t * ucx_new_team ;
1851
1848
int my_pe ;
1852
- int n_pes ;
1853
1849
1854
1850
SPML_UCX_ASSERT (((start + size * stride ) <= oshmem_num_procs ()) && (start < size ) && (stride > 0 ) && (size > 0 ));
1855
1851
1852
+ SPML_UCX_WARN ("team_split_strided(parent_team=%p, start=%d, stride=%d, size=%d, config=%p, "
1853
+ "config_mask=%ld, new_team=%p)" ,
1854
+ parent_team , start , stride , size , config , config_mask , new_team );
1855
+
1856
+ ucx_new_team = (mca_spml_ucx_team_t * )malloc (sizeof (mca_spml_ucx_team_t ));
1857
+ ucx_new_team -> start = start ;
1858
+ ucx_new_team -> stride = stride ;
1859
+
1856
1860
if (parent_team == NULL ) {
1857
1861
my_pe = shmem_my_pe ();
1858
1862
} else {
1859
1863
ucx_parent_team = (mca_spml_ucx_team_t * ) parent_team ;
1864
+
1860
1865
SPML_UCX_VALIDATE_TEAM (parent_team );
1861
1866
if (mca_spml_ucx_is_pe_in_strided_team (ucx_parent_team -> my_pe , start , stride , size )) {
1862
1867
my_pe = (ucx_parent_team -> my_pe - start ) / stride ;
1868
+ SPML_UCX_WARN ("split: my_pe at parent team: %d, start: %d, stride: %d, size: %d, "
1869
+ "my_pe at new team: %d" , ucx_parent_team -> my_pe , start , stride , size , my_pe );
1863
1870
} else {
1864
1871
/* not in team, according to spec it should be SHMEM_TEAM_INVALID but its value is NULL which
1865
1872
can be also interpreted as 0 (first pe), therefore -1 is used */
1873
+
1874
+ SPML_UCX_WARN ("pe #%d is not part of the new team" , ucx_parent_team -> my_pe );
1866
1875
my_pe = SPML_UCX_PE_NOT_IN_TEAM ;
1867
1876
}
1877
+
1878
+ /* In order to simplify pe translations start and stride are calculated with respect to
1879
+ * world_team */
1880
+ ucx_new_team -> start += ucx_parent_team -> start ;
1881
+ ucx_new_team -> stride *= ucx_parent_team -> stride ;
1868
1882
}
1869
1883
1870
- ucx_new_team = (mca_spml_ucx_team_t * )malloc (sizeof (mca_spml_ucx_team_t ));
1871
1884
ucx_new_team -> n_pes = size ;
1872
1885
ucx_new_team -> my_pe = my_pe ;
1873
1886
1874
- /* In order to simplify pe translations start and stride are calculated with respect to
1875
- * world_team */
1876
- ucx_new_team -> start = ucx_parent_team -> start + start ;
1877
- ucx_new_team -> stride = ucx_parent_team -> stride * stride ;
1878
1887
ucx_new_team -> config = calloc (1 , sizeof (mca_spml_ucx_team_config_t ));
1879
1888
1880
1889
if (config != NULL ) {
0 commit comments