You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Current batch/total batch num: 0/1243
2019-09-18 09:37:40.102211: E tensorflow/core/grappler/optimizers/dependency_optimizer.cc:666] Iteration = 0, topological sort failed with message: The graph couldn't be sorted in topological order.
2019-09-18 09:37:40.303894: E tensorflow/core/grappler/optimizers/dependency_optimizer.cc:666] Iteration = 1, topological sort failed with message: The graph couldn't be sorted in topological order.
2019-09-18 09:37:42.000486: E tensorflow/core/grappler/optimizers/dependency_optimizer.cc:666] Iteration = 0, topological sort failed with message: The graph couldn't be sorted in topological order.
2019-09-18 09:37:42.117158: E tensorflow/core/grappler/optimizers/dependency_optimizer.cc:666] Iteration = 1, topological sort failed with message: The graph couldn't be sorted in topological order.
2019-09-18 09:37:44.299611: E tensorflow/stream_executor/cuda/cuda_blas.cc:652] failed to run cuBLAS routine cublasSgemmBatched: CUBLAS_STATUS_EXECUTION_FAILED
2019-09-18 09:37:44.299648: E tensorflow/stream_executor/cuda/cuda_blas.cc:2574] Internal: failed BLAS call, see log for details
Traceback (most recent call last):
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1334, in _do_call
return fn(*args)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1319, in _run_fn
options, feed_dict, fetch_list, target_list, run_metadata)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1407, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.InternalError: Blas xGEMMBatched launch failed : a.shape=[8,4096,3], b.shape=[8,3,4096], m=4096, n=4096, k=3, batch_size=8
[[{{node tower_0/MatMul}} = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](tower_0/Squeeze, tower_0/transpose)]]
[[{{node tower_1/adj_conv_27/bn/cond/add/_4375}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:1", send_device_incarnation=1, tensor_name="edge_25108_tower_1/adj_conv_27/bn/cond/add", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "train.py", line 327, in <module>
train()
File "train.py", line 270, in train
train_one_epoch(sess, ops, train_writer)
File "train.py", line 314, in train_one_epoch
feed_dict=feed_dict)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 929, in run
run_metadata_ptr)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1152, in _run
feed_dict_tensor, options, run_metadata)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1328, in _do_run
run_metadata)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1348, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InternalError: Blas xGEMMBatched launch failed : a.shape=[8,4096,3], b.shape=[8,3,4096], m=4096, n=4096, k=3, batch_size=8
[[node tower_0/MatMul (defined at /disk/tia/tia/deep_gcns/utils/tf_util.py:655) = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](tower_0/Squeeze, tower_0/transpose)]]
[[{{node tower_1/adj_conv_27/bn/cond/add/_4375}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:1", send_device_incarnation=1, tensor_name="edge_25108_tower_1/adj_conv_27/bn/cond/add", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Caused by op 'tower_0/MatMul', defined at:
File "train.py", line 327, in <module>
train()
File "train.py", line 203, in train
skip_connect=SKIP_CONNECT)
File "/disk/tia/tia/deep_gcns/sem_seg/model.py", line 49, in __init__
skip_connect)
File "/disk/tia/tia/deep_gcns/sem_seg/model.py", line 82, in build_gcn_backbone_block
is_training=self.is_training)
File "/disk/tia/tia/deep_gcns/gcn_lib/gcn_utils.py", line 50, in build
is_training=is_training)
File "/disk/tia/tia/deep_gcns/gcn_lib/tf_edge.py", line 40, in dilated_knn_graph
dists = distance_metric(vertex_features)
File "/disk/tia/tia/deep_gcns/utils/tf_util.py", line 655, in pairwise_distance
point_cloud_inner = tf.matmul(point_cloud, point_cloud_transpose)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 2019, in matmul
a, b, adj_x=adjoint_a, adj_y=adjoint_b, name=name)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 1245, in batch_mat_mul
"BatchMatMul", x=x, y=y, adj_x=adj_x, adj_y=adj_y, name=name)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
return func(*args, **kwargs)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
op_def=op_def)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
self._traceback = tf_stack.extract_stack()
I have tried both of the solutions above, but neither of them works... I am dispirited.
The error is still here: InternalError (see above for traceback): Blas xGEMMBatched launch failed : a.shape=[8,4096,3], b.shape=[8,3,4096], m=4096, n=4096, k=3, batch_size=8 [[node tower_1/MatMul (defined at /disk/tia/tia/deep_gcns/utils/tf_util.py:655) = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:1"](tower_1/Squeeze, tower_1/transpose)]] [[{{node tower_0/cond_4/strided_slice_1/_1129}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_3329_tower_0/cond_4/strided_slice_1", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
**** EPOCH 001 ****
Current batch/total batch num: 0/1243
2019-09-18 09:37:40.102211: E tensorflow/core/grappler/optimizers/dependency_optimizer.cc:666] Iteration = 0, topological sort failed with message: The graph couldn't be sorted in topological order.
2019-09-18 09:37:40.303894: E tensorflow/core/grappler/optimizers/dependency_optimizer.cc:666] Iteration = 1, topological sort failed with message: The graph couldn't be sorted in topological order.
2019-09-18 09:37:42.000486: E tensorflow/core/grappler/optimizers/dependency_optimizer.cc:666] Iteration = 0, topological sort failed with message: The graph couldn't be sorted in topological order.
2019-09-18 09:37:42.117158: E tensorflow/core/grappler/optimizers/dependency_optimizer.cc:666] Iteration = 1, topological sort failed with message: The graph couldn't be sorted in topological order.
2019-09-18 09:37:44.299611: E tensorflow/stream_executor/cuda/cuda_blas.cc:652] failed to run cuBLAS routine cublasSgemmBatched: CUBLAS_STATUS_EXECUTION_FAILED
2019-09-18 09:37:44.299648: E tensorflow/stream_executor/cuda/cuda_blas.cc:2574] Internal: failed BLAS call, see log for details
Traceback (most recent call last):
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1334, in _do_call
return fn(*args)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1319, in _run_fn
options, feed_dict, fetch_list, target_list, run_metadata)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1407, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.InternalError: Blas xGEMMBatched launch failed : a.shape=[8,4096,3], b.shape=[8,3,4096], m=4096, n=4096, k=3, batch_size=8
[[{{node tower_0/MatMul}} = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](tower_0/Squeeze, tower_0/transpose)]]
[[{{node tower_1/adj_conv_27/bn/cond/add/_4375}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:1", send_device_incarnation=1, tensor_name="edge_25108_tower_1/adj_conv_27/bn/cond/add", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "train.py", line 327, in <module>
train()
File "train.py", line 270, in train
train_one_epoch(sess, ops, train_writer)
File "train.py", line 314, in train_one_epoch
feed_dict=feed_dict)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 929, in run
run_metadata_ptr)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1152, in _run
feed_dict_tensor, options, run_metadata)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1328, in _do_run
run_metadata)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1348, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InternalError: Blas xGEMMBatched launch failed : a.shape=[8,4096,3], b.shape=[8,3,4096], m=4096, n=4096, k=3, batch_size=8
[[node tower_0/MatMul (defined at /disk/tia/tia/deep_gcns/utils/tf_util.py:655) = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](tower_0/Squeeze, tower_0/transpose)]]
[[{{node tower_1/adj_conv_27/bn/cond/add/_4375}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:1", send_device_incarnation=1, tensor_name="edge_25108_tower_1/adj_conv_27/bn/cond/add", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Caused by op 'tower_0/MatMul', defined at:
File "train.py", line 327, in <module>
train()
File "train.py", line 203, in train
skip_connect=SKIP_CONNECT)
File "/disk/tia/tia/deep_gcns/sem_seg/model.py", line 49, in __init__
skip_connect)
File "/disk/tia/tia/deep_gcns/sem_seg/model.py", line 82, in build_gcn_backbone_block
is_training=self.is_training)
File "/disk/tia/tia/deep_gcns/gcn_lib/gcn_utils.py", line 50, in build
is_training=is_training)
File "/disk/tia/tia/deep_gcns/gcn_lib/tf_edge.py", line 40, in dilated_knn_graph
dists = distance_metric(vertex_features)
File "/disk/tia/tia/deep_gcns/utils/tf_util.py", line 655, in pairwise_distance
point_cloud_inner = tf.matmul(point_cloud, point_cloud_transpose)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 2019, in matmul
a, b, adj_x=adjoint_a, adj_y=adjoint_b, name=name)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 1245, in batch_mat_mul
"BatchMatMul", x=x, y=y, adj_x=adj_x, adj_y=adj_y, name=name)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
return func(*args, **kwargs)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
op_def=op_def)
File "/home/data/anaconda3/envs/lmt36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
self._traceback = tf_stack.extract_stack()
InternalError (see above for traceback): Blas xGEMMBatched launch failed : a.shape=[8,4096,3], b.shape=[8,3,4096], m=4096, n=4096, k=3, batch_size=8
[[node tower_0/MatMul (defined at /disk/tia/tia/deep_gcns/utils/tf_util.py:655) = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](tower_0/Squeeze, tower_0/transpose)]]
[[{{node tower_1/adj_conv_27/bn/cond/add/_4375}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:1", send_device_incarnation=1, tensor_name="edge_25108_tower_1/adj_conv_27/bn/cond/add", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
My environment is CUDA 9.0, cuDNN 7.4, and tensorflow-gpu 1.12.0, with two 2080 Ti GPUs.
Can you help me with this problem?
The text was updated successfully, but these errors were encountered: