You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I have read the README and searched the existing issues.
System Info
10/23/2024 16:41:05 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_0.5M_CN...
Traceback (most recent call last):
File "/pingchuan/miniconda3/envs/python3.11/bin/torchrun", line 8, in
sys.exit(main())
^^^^^^
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 348, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/run.py", line 901, in main
run(args)
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/run.py", line 892, in run
elastic_launch(
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 133, in call
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
10/23/2024 16:41:05 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_0.5M_CN...
Traceback (most recent call last):
File "/pingchuan/miniconda3/envs/python3.11/bin/torchrun", line 8, in
sys.exit(main())
^^^^^^
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 348, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/run.py", line 901, in main
run(args)
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/run.py", line 892, in run
elastic_launch(
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 133, in call
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
Reminder
System Info
10/23/2024 16:41:05 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_0.5M_CN...
Traceback (most recent call last):
File "/pingchuan/miniconda3/envs/python3.11/bin/torchrun", line 8, in
sys.exit(main())
^^^^^^
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 348, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/run.py", line 901, in main
run(args)
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/run.py", line 892, in run
elastic_launch(
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 133, in call
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
/pingchuan/LLaMA-Factory/src/llamafactory/launcher.py FAILED
Failures:
<NO_OTHER_FAILURES>
Root Cause (first observed failure):
[0]:
time : 2024-10-23_16:41:16
host : dgx12
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 2648370)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
python:3.11
linux:ubuntu
llamafactory: latest version (最新版)
Reproduction
10/23/2024 16:41:05 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_0.5M_CN...
Traceback (most recent call last):
File "/pingchuan/miniconda3/envs/python3.11/bin/torchrun", line 8, in
sys.exit(main())
^^^^^^
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 348, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/run.py", line 901, in main
run(args)
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/run.py", line 892, in run
elastic_launch(
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 133, in call
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/pingchuan/miniconda3/envs/python3.11/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
/pingchuan/LLaMA-Factory/src/llamafactory/launcher.py FAILED
Failures:
<NO_OTHER_FAILURES>
Root Cause (first observed failure):
[0]:
time : 2024-10-23_16:41:16
host : dgx12
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 2648370)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
python:3.11
linux:ubuntu
llamafactory: latest version (最新版)
Expected behavior
No response
Others
above
The text was updated successfully, but these errors were encountered: