@@ -132,9 +132,6 @@ def spmd(
132
132
j: {nnodes}x{nproc_per_node}. For GPU hosts, if nproc_per_node is omitted it is inferred from the GPU count on the host
133
133
env: environment variables to be passed to the run (e.g. ENV1=v1,ENV2=v2,ENV3=v3)
134
134
max_retries: the number of scheduler retries allowed
135
- rdzv_port: the port on rank0's host to use for hosting the c10d store used for rendezvous.
136
- Only takes effect when running multi-node. When running single node, this parameter
137
- is ignored and a random free port is chosen.
138
135
mounts: (for docker based runs only) mounts to mount into the worker environment/container
139
136
(ex. type=<bind/volume>,src=/host,dst=/job[,readonly]).
140
137
debug: whether to run with preset debug flags enabled
@@ -174,6 +171,7 @@ def ddp(
174
171
max_retries : int = 0 ,
175
172
rdzv_port : int = 29500 ,
176
173
rdzv_backend : str = "c10d" ,
174
+ rdzv_conf : Optional [str ] = None ,
177
175
mounts : Optional [List [str ]] = None ,
178
176
debug : bool = False ,
179
177
tee : int = 3 ,
@@ -208,6 +206,7 @@ def ddp(
208
206
Only takes effect when running multi-node. When running single node, this parameter
209
207
is ignored and a random free port is chosen.
210
208
rdzv_backend: the rendezvous backend to use. Only takes effect when running multi-node.
209
+ rdzv_conf: additional rendezvous configuration, given as comma-separated key=value pairs and passed to torchrun's --rdzv_conf (ex. join_timeout=600,close_timeout=600,timeout=600).
211
210
mounts: mounts to mount into the worker environment/container (ex. type=<bind/volume>,src=/host,dst=/job[,readonly]).
212
211
See scheduler documentation for more info.
213
212
debug: whether to run with preset debug flags enabled
@@ -258,6 +257,7 @@ def ddp(
258
257
"torchrun" ,
259
258
"--rdzv_backend" ,
260
259
rdzv_backend ,
260
+ * (["--rdzv_conf" , rdzv_conf ] if rdzv_conf is not None else []),
261
261
"--rdzv_endpoint" ,
262
262
rdzv_endpoint ,
263
263
"--rdzv_id" ,
0 commit comments