See dmlc/xgboost#6232 for setup details.
The last thing in the logs before the hang is:
[1603364135.700317] [mr-dl10:19153:0] sock.c:344 UCX ERROR recv(fd=145) failed: Connection reset by peer
If I poke Python's faulthandler to see where things are hung for a process using 100% of one core (how I enable it is sketched after the traceback), I see:
Thread 0x000014b686414700 (most recent call first):
File "/home/jon/minicondadai/lib/python3.6/multiprocessing/popen_fork.py", line 28 in poll
File "/home/jon/minicondadai/lib/python3.6/multiprocessing/popen_fork.py", line 50 in wait
File "/home/jon/minicondadai/lib/python3.6/multiprocessing/process.py", line 124 in join
File "/home/jon/minicondadai/lib/python3.6/site-packages/distributed/process.py", line 232 in _watch_process
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 864 in run
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 884 in _bootstrap
Thread 0x000014b671fff700 (most recent call first):
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 295 in wait
File "/home/jon/minicondadai/lib/python3.6/queue.py", line 164 in get
File "/home/jon/minicondadai/lib/python3.6/site-packages/distributed/process.py", line 217 in _watch_message_queue
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 864 in run
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 884 in _bootstrap
Thread 0x000014b671dfe700 (most recent call first):
File "/home/jon/minicondadai/lib/python3.6/multiprocessing/popen_fork.py", line 28 in poll
File "/home/jon/minicondadai/lib/python3.6/multiprocessing/popen_fork.py", line 50 in wait
File "/home/jon/minicondadai/lib/python3.6/multiprocessing/process.py", line 124 in join
File "/home/jon/minicondadai/lib/python3.6/site-packages/distributed/process.py", line 232 in _watch_process
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 864 in run
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 884 in _bootstrap
Thread 0x000014b6717fb700 (most recent call first):
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 295 in wait
File "/home/jon/minicondadai/lib/python3.6/queue.py", line 164 in get
File "/home/jon/minicondadai/lib/python3.6/site-packages/distributed/process.py", line 217 in _watch_message_queue
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 864 in run
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 884 in _bootstrap
Thread 0x000014b6719fc700 (most recent call first):
File "/home/jon/minicondadai/lib/python3.6/site-packages/distributed/profile.py", line 269 in _watch
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 864 in run
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 884 in _bootstrap
Thread 0x000014b671bfd700 (most recent call first):
File "/home/jon/minicondadai/lib/python3.6/contextlib.py", line 157 in helper
File "/home/jon/minicondadai/lib/python3.6/contextlib.py", line 52 in inner
File "/home/jon/minicondadai/lib/python3.6/site-packages/ucp/continuous_ucx_progress.py", line 81 in _fd_reader_callback
File "/home/jon/minicondadai/lib/python3.6/asyncio/events.py", line 145 in _run
File "/home/jon/minicondadai/lib/python3.6/asyncio/base_events.py", line 1462 in _run_once
File "/home/jon/minicondadai/lib/python3.6/asyncio/base_events.py", line 442 in run_forever
File "/home/jon/minicondadai/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 149 in start
File "/home/jon/minicondadai/lib/python3.6/site-packages/distributed/utils.py", line 418 in run_loop
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 864 in run
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 884 in _bootstrap
Thread 0x000014b686dd5700 (most recent call first):
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 295 in wait
File "/home/jon/minicondadai/lib/python3.6/queue.py", line 164 in get
File "/home/jon/minicondadai/lib/python3.6/concurrent/futures/thread.py", line 67 in _worker
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 864 in run
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 884 in _bootstrap
Thread 0x000014b7948bd700 (most recent call first):
File "/home/jon/minicondadai/lib/python3.6/site-packages/distributed/profile.py", line 269 in _watch
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 864 in run
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 884 in _bootstrap
Thread 0x000014b78662c700 (most recent call first):
File "/home/jon/minicondadai/lib/python3.6/contextlib.py", line 52 in inner
File "/home/jon/minicondadai/lib/python3.6/site-packages/ucp/continuous_ucx_progress.py", line 81 in _fd_reader_callback
File "/home/jon/minicondadai/lib/python3.6/asyncio/events.py", line 145 in _run
File "/home/jon/minicondadai/lib/python3.6/asyncio/base_events.py", line 1462 in _run_once
File "/home/jon/minicondadai/lib/python3.6/asyncio/base_events.py", line 442 in run_forever
File "/home/jon/minicondadai/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 149 in start
File "/home/jon/minicondadai/lib/python3.6/site-packages/distributed/utils.py", line 418 in run_loop
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 864 in run
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 884 in _bootstrap
Current thread 0x000014b8a3dfe700 (most recent call first):
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 299 in wait
File "/home/jon/minicondadai/lib/python3.6/threading.py", line 551 in wait
File "/home/jon/minicondadai/lib/python3.6/site-packages/distributed/utils.py", line 336 in sync
File "/home/jon/minicondadai/lib/python3.6/site-packages/distributed/deploy/cluster.py", line 163 in sync
File "/home/jon/minicondadai/lib/python3.6/site-packages/distributed/deploy/cluster.py", line 364 in __exit__
File "/home/jon/minicondadai/lib/python3.6/site-packages/distributed/deploy/spec.py", line 409 in __exit__
File "/data/jon/h2oai.fullcondatest3/h2oaicore/train.py", line 1428 in predict
I run dask_cudf like this:
import os
import pandas as pd

os.environ["DASK_RMM__POOL_SIZE"] = "1GB"
os.environ["DASK_UCX__CUDA_COPY"] = "True"  # os.environ values must be strings, not Python True/False
os.environ["DASK_UCX__TCP"] = "True"
os.environ["DASK_UCX__NVLINK"] = "False"
os.environ["DASK_UCX__INFINIBAND"] = "False"

def fit():
    from dask.distributed import Client, wait
    from dask_cuda import LocalCUDACluster
    kwargs = dict(n_workers=None, threads_per_worker=1, processes=True,
                  memory_limit='auto', device_memory_limit=None,
                  CUDA_VISIBLE_DEVICES=None, data=None, local_directory=None,
                  protocol='ucx', enable_tcp_over_ucx=True,
                  enable_infiniband=False, enable_nvlink=False,
                  enable_rdmacm=False, ucx_net_devices='auto',
                  rmm_pool_size='1GB')
    with LocalCUDACluster(**kwargs) as cluster:
        with Client(cluster) as client:
            import xgboost as xgb
            import dask_cudf
            target = "default payment next month"
            Xpd = pd.read_csv("creditcard.csv")
            Xpd = Xpd[['AGE', target]]
            Xpd.to_csv("creditcard.csv")
            X = dask_cudf.read_csv("creditcard.csv")
            y = X[target]
            X = X.drop(target, axis=1)
            kwargs_fit = {}
            kwargs_cudf_fit = kwargs_fit.copy()
            valid_X = dask_cudf.read_csv("creditcard.csv")
            valid_y = valid_X[target]
            valid_X = valid_X.drop(target, axis=1)
            kwargs_cudf_fit['eval_set'] = [(valid_X, valid_y)]
            params = {}  # copy.deepcopy(self.model.get_params())
            params['tree_method'] = 'gpu_hist'
            dask_model = xgb.dask.DaskXGBClassifier(**params)
            dask_model.fit(X, y, verbose=True)
            return dask_model

def predict():
    from dask.distributed import Client, wait
    from dask_cuda import LocalCUDACluster
    kwargs = dict(n_workers=None, threads_per_worker=1, processes=True,
                  memory_limit='auto', device_memory_limit=None,
                  CUDA_VISIBLE_DEVICES=None, data=None, local_directory=None,
                  protocol='ucx', enable_tcp_over_ucx=True,
                  enable_infiniband=False, enable_nvlink=False,
                  enable_rdmacm=False, ucx_net_devices='auto',
                  rmm_pool_size='1GB')
    with LocalCUDACluster(**kwargs) as cluster:
        with Client(cluster) as client:
            import xgboost as xgb
            import dask_cudf
            target = "default payment next month"
            Xpd = pd.read_csv("creditcard.csv")
            Xpd = Xpd[['AGE', target]]
            Xpd.to_csv("creditcard.csv")
            X = dask_cudf.read_csv("creditcard.csv")
            y = X[target]
            X = X.drop(target, axis=1)
            kwargs_fit = {}
            kwargs_cudf_fit = kwargs_fit.copy()
            valid_X = dask_cudf.read_csv("creditcard.csv")
            valid_y = valid_X[target]
            valid_X = valid_X.drop(target, axis=1)
            kwargs_cudf_fit['eval_set'] = [(valid_X, valid_y)]
            params = {}  # copy.deepcopy(self.model.get_params())
            params['tree_method'] = 'gpu_hist'
            dask_model = xgb.dask.DaskXGBClassifier(**params)  # schematic: the real code uses the fitted model here
            return dask_model.predict_proba(X)

if __name__ == '__main__':
    model = fit()
    preds = predict()
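As a sanity check that the DASK_* environment variables above are actually picked up, I can dump the resulting config. This is just a sketch, assuming the env vars map onto the top-level ucx and rmm config namespaces:

import dask

dask.config.refresh()                # re-read the DASK_* environment variables
print(dask.config.get("ucx", {}))    # expect the tcp/nvlink/infiniband/cuda-copy flags
print(dask.config.get("rmm", {}))    # expect the 1GB pool size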
I'm only giving a schematic of the code; it is not an MRE yet. But for me, the hang is during predict.
I'm new to using UCX and hadn't seen this kind of hang before when using the default options.
Any ideas?
Thanks!