You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Describe the bug
Getting an error when running distributed LightGBM training on a Ray cluster.
To Reproduce
To help us reproduce this bug, please provide the information below:
Your Python version: 3.8.13
The version of Mars you use: master
Versions of crucial packages: numpy==1.23.2, pandas==1.4.2
Full stack of the error.
Minimized code to reproduce the error.
Here is my code.
` from sklearn.datasets import load_boston
import mars.dataframe as md
from mars.learn.model_selection import train_test_split
from mars.learn.contrib import lightgbm as lgb
# connecting to exist ray cluster
ray.init(address=HEAD_ADDRESS)
sess = mars.new_session(backend="ray")
boston = load_boston()
data = md.DataFrame(boston.data, columns=boston.feature_names)
X_train, X_test, y_train, y_test = train_test_split(data, boston.target, train_size=0.7, random_state=0)
lg_reg = lgb.LGBMRegressor(colsample_bytree=0.3, learning_rate=0.1, max_depth=5, reg_alpha=10, n_estimators=10)
lg_reg.fit(X_train, y_train, session=sess)`
Here are the error logs.
Traceback (most recent call last):
File "/home/xxxx/test_in_cluster.py", line 146, in
lg_reg.fit(X_train, y_train, session=sess)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/learn/contrib/lightgbm/regressor.py", line 46, in fit
model = train(
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/learn/contrib/lightgbm/_train.py", line 454, in train
ret = op().execute(session=session, **run_kwargs).fetch(session=session)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/core/entity/executable.py", line 144, in execute
return execute(self, session=session, **kw)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/deploy/oscar/session.py", line 1890, in execute
return session.execute(
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/deploy/oscar/session.py", line 1684, in execute
execution_info: ExecutionInfo = fut.result(
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/concurrent/futures/_base.py", line 444, in result
return self.__get_result()
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
raise self._exception
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/deploy/oscar/session.py", line 1870, in _execute
await execution_info
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/deploy/oscar/session.py", line 105, in wait
return await self._aio_task
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/deploy/oscar/session.py", line 953, in _run_in_background
raise task_result.error.with_traceback(task_result.traceback)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/services/task/supervisor/processor.py", line 368, in run
async for stage_args in self._iter_stage_chunk_graph():
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/services/task/supervisor/processor.py", line 158, in _iter_stage_chunk_graph
chunk_graph = await self._get_next_chunk_graph(chunk_graph_iter)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/services/task/supervisor/processor.py", line 149, in _get_next_chunk_graph
chunk_graph = await fut
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/lib/aio/_threads.py", line 36, in to_thread
return await loop.run_in_executor(None, func_call)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/concurrent/futures/thread.py", line 57, in run
result = self.fn(*self.args, **self.kwargs)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/services/task/supervisor/processor.py", line 144, in next_chunk_graph
return next(chunk_graph_iter)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/services/task/supervisor/preprocessor.py", line 194, in tile
for chunk_graph in chunk_graph_builder.build():
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/core/graph/builder/chunk.py", line 440, in build
yield from self._build()
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/core/graph/builder/chunk.py", line 434, in _build
graph = next(tile_iterator)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/services/task/supervisor/preprocessor.py", line 74, in _iter_without_check
to_update_tileables = self._iter()
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/core/graph/builder/chunk.py", line 317, in _iter
self._tile(
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/core/graph/builder/chunk.py", line 211, in _tile
need_process = next(tile_handler)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/core/graph/builder/chunk.py", line 183, in _tile_handler
tiled_tileables = yield from handler.tile(tiled_tileables)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/core/entity/tileables.py", line 79, in tile
tiled_result = yield from tile_handler(op)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/learn/contrib/lightgbm/_train.py", line 233, in tile
workers = cls._get_data_chunks_workers(ctx, data)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/learn/contrib/lightgbm/_train.py", line 214, in _get_data_chunks_workers
metas = ctx.get_chunks_meta([c.key for c in data.chunks], fields=["bands"])
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/services/task/execution/ray/context.py", line 141, in get_chunks_meta
chunk_meta = self._task_chunks_meta[key]
KeyError: 'c5f54378896913ff524cde6a8e1ea1f6_0'
The text was updated successfully, but these errors were encountered:
Thanks for reporting this bug. Currently, the advanced third-party frameworks of Mars learn (the libraries in /mars/learn/contrib/) are not supported by the Ray backend.
You can try the Mars backend, or use Mars -> to_ray_dataset and then do the training in Ray.
Thanks for reporting this bug. Currently, the advanced third-party frameworks of Mars learn (the libraries in /mars/learn/contrib/) are not supported by the Ray backend.
You can try the Mars backend, or use Mars -> to_ray_dataset and then do the training in Ray.
Thank you! I will try both methods and see if they work.
Describe the bug
Getting an error when running distributed LightGBM training on a Ray cluster.
To Reproduce
To help us reproduce this bug, please provide the information below:
Here is my code.
` from sklearn.datasets import load_boston
import mars.dataframe as md
from mars.learn.model_selection import train_test_split
from mars.learn.contrib import lightgbm as lgb
Here are the error logs.
Traceback (most recent call last):
File "/home/xxxx/test_in_cluster.py", line 146, in
lg_reg.fit(X_train, y_train, session=sess)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/learn/contrib/lightgbm/regressor.py", line 46, in fit
model = train(
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/learn/contrib/lightgbm/_train.py", line 454, in train
ret = op().execute(session=session, **run_kwargs).fetch(session=session)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/core/entity/executable.py", line 144, in execute
return execute(self, session=session, **kw)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/deploy/oscar/session.py", line 1890, in execute
return session.execute(
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/deploy/oscar/session.py", line 1684, in execute
execution_info: ExecutionInfo = fut.result(
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/concurrent/futures/_base.py", line 444, in result
return self.__get_result()
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
raise self._exception
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/deploy/oscar/session.py", line 1870, in _execute
await execution_info
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/deploy/oscar/session.py", line 105, in wait
return await self._aio_task
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/deploy/oscar/session.py", line 953, in _run_in_background
raise task_result.error.with_traceback(task_result.traceback)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/services/task/supervisor/processor.py", line 368, in run
async for stage_args in self._iter_stage_chunk_graph():
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/services/task/supervisor/processor.py", line 158, in _iter_stage_chunk_graph
chunk_graph = await self._get_next_chunk_graph(chunk_graph_iter)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/services/task/supervisor/processor.py", line 149, in _get_next_chunk_graph
chunk_graph = await fut
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/lib/aio/_threads.py", line 36, in to_thread
return await loop.run_in_executor(None, func_call)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/concurrent/futures/thread.py", line 57, in run
result = self.fn(*self.args, **self.kwargs)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/services/task/supervisor/processor.py", line 144, in next_chunk_graph
return next(chunk_graph_iter)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/services/task/supervisor/preprocessor.py", line 194, in tile
for chunk_graph in chunk_graph_builder.build():
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/core/graph/builder/chunk.py", line 440, in build
yield from self._build()
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/core/graph/builder/chunk.py", line 434, in _build
graph = next(tile_iterator)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/services/task/supervisor/preprocessor.py", line 74, in _iter_without_check
to_update_tileables = self._iter()
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/core/graph/builder/chunk.py", line 317, in _iter
self._tile(
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/core/graph/builder/chunk.py", line 211, in _tile
need_process = next(tile_handler)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/core/graph/builder/chunk.py", line 183, in _tile_handler
tiled_tileables = yield from handler.tile(tiled_tileables)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/core/entity/tileables.py", line 79, in tile
tiled_result = yield from tile_handler(op)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/learn/contrib/lightgbm/_train.py", line 233, in tile
workers = cls._get_data_chunks_workers(ctx, data)
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/learn/contrib/lightgbm/_train.py", line 214, in _get_data_chunks_workers
metas = ctx.get_chunks_meta([c.key for c in data.chunks], fields=["bands"])
File "/home/xxxx/anaconda3/envs/xxxx/lib/python3.8/site-packages/mars/services/task/execution/ray/context.py", line 141, in get_chunks_meta
chunk_meta = self._task_chunks_meta[key]
KeyError: 'c5f54378896913ff524cde6a8e1ea1f6_0'
The text was updated successfully, but these errors were encountered: