\n",
+ "== Status == Current time: 2022-07-20 21:48:52 (running for 00:00:39.66) Memory usage on this node: 33.1/64.0 GiB Using FIFO scheduling algorithm. Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.14 GiB heap, 0.0/2.0 GiB objects Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-48-13 Number of trials: 1/1 (1 TERMINATED)
\n",
"\n",
- "
Trial name
status
loc
\n",
+ "
Trial name
status
loc
iter
total time (s)
loss
_timestamp
_time_this_iter_s
\n",
"\n",
"\n",
- "
TorchTrainer_a8585_00000
TERMINATED
172.28.0.2:2126
\n",
+ "
TorchTrainer_53c58_00000
TERMINATED
127.0.0.1:39548
4
36.4582
824.229
1658378932
6.46339
\n",
"\n",
"
"
],
@@ -800,125 +714,102 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[2m\u001b[36m(_map_block_nosplit pid=2159)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n",
- "\u001b[2m\u001b[36m(_map_block_nosplit pid=2159)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m 2022-05-25 22:26:19,944\tINFO torch.py:347 -- Setting up process group for: env:// [rank=0, world_size=1]\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m 2022-05-25 22:26:20,033\tINFO torch.py:98 -- Moving model to device: cuda:0\n"
+ "2022-07-20 21:48:13,244\tINFO plugin_schema_manager.py:52 -- Loading the default runtime env schemas: ['/Users/jiaodong/Workspace/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/working_dir_schema.json', '/Users/jiaodong/Workspace/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/pip_schema.json'].\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 2.315190, epoch: 0, iteration: 0\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 1.464406, epoch: 0, iteration: 500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 1.279081, epoch: 0, iteration: 1000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 1.052461, epoch: 0, iteration: 1500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.816213, epoch: 1, iteration: 0\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 1.019127, epoch: 1, iteration: 500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.525613, epoch: 1, iteration: 1000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.570595, epoch: 1, iteration: 1500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.572004, epoch: 2, iteration: 0\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.543432, epoch: 2, iteration: 500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.350156, epoch: 2, iteration: 1000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.443743, epoch: 2, iteration: 1500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.438318, epoch: 3, iteration: 0\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.342512, epoch: 3, iteration: 500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.302048, epoch: 3, iteration: 1000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.414025, epoch: 3, iteration: 1500\n"
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 2.282040, epoch: 0, iteration: 0\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "2022-05-25 22:27:16,013\tERROR checkpoint_manager.py:193 -- Result dict has no key: training_iteration. checkpoint_score_attr must be set to a key of the result dict. Valid keys are ['trial_id', 'experiment_id', 'date', 'timestamp', 'pid', 'hostname', 'node_ip', 'config', 'done']\n"
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m 2022-07-20 21:48:26,772\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Trial TorchTrainer_a8585_00000 completed. Last result: \n"
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 1.521038, epoch: 0, iteration: 500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 1.169452, epoch: 0, iteration: 1000\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.856338, epoch: 0, iteration: 1500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.788410, epoch: 1, iteration: 0\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.854239, epoch: 1, iteration: 500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.533351, epoch: 1, iteration: 1000\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.591339, epoch: 1, iteration: 1500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.457057, epoch: 2, iteration: 0\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.594715, epoch: 2, iteration: 500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.477588, epoch: 2, iteration: 1000\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.235412, epoch: 2, iteration: 1500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.507374, epoch: 3, iteration: 0\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.447128, epoch: 3, iteration: 500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.381943, epoch: 3, iteration: 1000\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.347877, epoch: 3, iteration: 1500\n",
+ "Result for TorchTrainer_53c58_00000:\n",
+ " _time_this_iter_s: 6.463389873504639\n",
+ " _timestamp: 1658378932\n",
+ " _training_iteration: 4\n",
+ " date: 2022-07-20_21-48-52\n",
+ " done: true\n",
+ " experiment_id: abc531ef544440268933d8221addeb9d\n",
+ " experiment_tag: '0'\n",
+ " hostname: Jiaos-MacBook-Pro-16-inch-2019\n",
+ " iterations_since_restore: 4\n",
+ " loss: 824.2287287414074\n",
+ " node_ip: 127.0.0.1\n",
+ " pid: 39548\n",
+ " should_checkpoint: true\n",
+ " time_since_restore: 36.45815992355347\n",
+ " time_this_iter_s: 6.464020013809204\n",
+ " time_total_s: 36.45815992355347\n",
+ " timestamp: 1658378932\n",
+ " timesteps_since_restore: 0\n",
+ " training_iteration: 4\n",
+ " trial_id: 53c58_00000\n",
+ " warmup_time: 0.003597259521484375\n",
+ " \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "2022-05-25 22:27:16,138\tINFO tune.py:753 -- Total run time: 74.68 seconds (74.45 seconds for the tuning loop).\n",
- "Map Progress (1 actors 1 pending): 0%| | 0/1 [00:01, ?it/s]\u001b[2m\u001b[36m(BlockWorker pid=2267)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n",
- "\u001b[2m\u001b[36m(BlockWorker pid=2267)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n",
- "Map Progress (2 actors 1 pending): 100%|██████████| 1/1 [00:04<00:00, 4.18s/it]\n",
- "Map_Batches: 100%|██████████| 1/1 [00:01<00:00, 1.63s/it]\n",
- "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 13.60it/s]\n",
- "Shuffle Map: 100%|██████████| 1/1 [00:00<00:00, 24.76it/s]\n",
- "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 49.17it/s]\n"
+ "2022-07-20 21:48:52,891\tINFO tune.py:738 -- Total run time: 39.80 seconds (39.66 seconds for the tuning loop).\n",
+ "Map Progress (1 actors 1 pending): 0%| | 0/1 [00:01, ?it/s]\u001b[2m\u001b[36m(BlockWorker pid=39601)\u001b[0m /Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/torchvision/transforms/functional.py:150: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n",
+ "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:03<00:00, 3.01s/it]\n",
+ "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 8.70it/s]\n",
+ "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 76.13it/s]\n",
+ "Shuffle Map: 100%|██████████| 1/1 [00:00<00:00, 82.57it/s]\n",
+ "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 134.32it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Accuracy for task 1: 0.946\n"
+ "Accuracy for task 1: 0.3767\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[2m\u001b[36m(ServeController pid=2382)\u001b[0m INFO 2022-05-25 22:27:23,467 controller 2382 checkpoint_path.py:17 - Using RayInternalKVStore for controller checkpoint and recovery.\n",
- "\u001b[2m\u001b[36m(ServeController pid=2382)\u001b[0m INFO 2022-05-25 22:27:23,470 controller 2382 http_state.py:115 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:XnXlnS:SERVE_PROXY_ACTOR-node:172.28.0.2-0' on node 'node:172.28.0.2-0' listening on '127.0.0.1:8000'\n",
- "Shuffle Map: 0%| | 0/1 [00:00, ?it/s]\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO: Started server process [2415]\n",
- "Shuffle Map: 100%|██████████| 1/1 [00:01<00:00, 1.40s/it]\n",
- "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 7.72it/s]\n",
- "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.18s/it]\n",
- "\u001b[2m\u001b[36m(ServeController pid=2382)\u001b[0m INFO 2022-05-25 22:27:28,825 controller 2382 deployment_state.py:1219 - Adding 2 replicas to deployment 'mnist_model'.\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:32,954 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 4.8ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:32,977 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 21.4ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:32,985 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 4.4ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2573)\u001b[0m INFO 2022-05-25 22:27:32,976 mnist_model mnist_model#vDEhSp replica.py:483 - HANDLE __call__ OK 15.5ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2573)\u001b[0m INFO 2022-05-25 22:27:32,992 mnist_model mnist_model#vDEhSp replica.py:483 - HANDLE __call__ OK 5.6ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2575)\u001b[0m INFO 2022-05-25 22:27:32,952 mnist_model mnist_model#QdDxIB replica.py:483 - HANDLE __call__ OK 0.4ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2575)\u001b[0m INFO 2022-05-25 22:27:32,982 mnist_model mnist_model#QdDxIB replica.py:483 - HANDLE __call__ OK 0.3ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:32,997 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 11.8ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:33,008 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 6.1ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:33,017 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 7.1ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:33,022 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 2.3ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:33,031 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 7.3ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:33,036 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 2.3ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:33,044 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 6.4ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:33,048 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 2.3ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:33,057 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 6.9ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:33,061 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 2.2ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:33,070 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 6.4ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:33,074 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 2.3ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:33,082 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 6.4ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:33,088 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 3.3ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2573)\u001b[0m INFO 2022-05-25 22:27:33,016 mnist_model mnist_model#vDEhSp replica.py:483 - HANDLE __call__ OK 4.6ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2573)\u001b[0m INFO 2022-05-25 22:27:33,029 mnist_model mnist_model#vDEhSp replica.py:483 - HANDLE __call__ OK 4.4ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2573)\u001b[0m INFO 2022-05-25 22:27:33,043 mnist_model mnist_model#vDEhSp replica.py:483 - HANDLE __call__ OK 4.3ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2573)\u001b[0m INFO 2022-05-25 22:27:33,056 mnist_model mnist_model#vDEhSp replica.py:483 - HANDLE __call__ OK 4.6ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2573)\u001b[0m INFO 2022-05-25 22:27:33,068 mnist_model mnist_model#vDEhSp replica.py:483 - HANDLE __call__ OK 4.3ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2573)\u001b[0m INFO 2022-05-25 22:27:33,081 mnist_model mnist_model#vDEhSp replica.py:483 - HANDLE __call__ OK 4.2ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2575)\u001b[0m INFO 2022-05-25 22:27:33,007 mnist_model mnist_model#QdDxIB replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2575)\u001b[0m INFO 2022-05-25 22:27:33,021 mnist_model mnist_model#QdDxIB replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2575)\u001b[0m INFO 2022-05-25 22:27:33,035 mnist_model mnist_model#QdDxIB replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2575)\u001b[0m INFO 2022-05-25 22:27:33,047 mnist_model mnist_model#QdDxIB replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2575)\u001b[0m INFO 2022-05-25 22:27:33,060 mnist_model mnist_model#QdDxIB replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2575)\u001b[0m INFO 2022-05-25 22:27:33,073 mnist_model mnist_model#QdDxIB replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2575)\u001b[0m INFO 2022-05-25 22:27:33,086 mnist_model mnist_model#QdDxIB replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:33,122 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 25.8ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:33,134 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 3.0ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=2415)\u001b[0m INFO 2022-05-25 22:27:33,142 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 6.6ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2573)\u001b[0m INFO 2022-05-25 22:27:33,117 mnist_model mnist_model#vDEhSp replica.py:483 - HANDLE __call__ OK 14.4ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2573)\u001b[0m INFO 2022-05-25 22:27:33,141 mnist_model mnist_model#vDEhSp replica.py:483 - HANDLE __call__ OK 4.4ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=2575)\u001b[0m INFO 2022-05-25 22:27:33,133 mnist_model mnist_model#QdDxIB replica.py:483 - HANDLE __call__ OK 0.4ms\n",
- "\u001b[2m\u001b[36m(ServeController pid=2382)\u001b[0m INFO 2022-05-25 22:27:33,225 controller 2382 deployment_state.py:1243 - Removing 2 replicas from deployment 'mnist_model'.\n",
- "Map Progress (2 actors 1 pending): 100%|██████████| 1/1 [00:02<00:00, 2.58s/it]\n",
- "\u001b[2m\u001b[36m(_prepare_read pid=2726)\u001b[0m 2022-05-25 22:27:40,353\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n",
- "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.20it/s]\n",
- "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.41s/it]"
+ "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:48:57,458 controller 39625 checkpoint_path.py:17 - Using RayInternalKVStore for controller checkpoint and recovery.\n",
+ "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:48:57,460 controller 39625 http_state.py:126 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:oEzsmU:SERVE_PROXY_ACTOR-db68eafa3bbe9042df574f3c9974b40ce8d97728db90282feefb4690' on node 'db68eafa3bbe9042df574f3c9974b40ce8d97728db90282feefb4690' listening on '127.0.0.1:8000'\n",
+ "Shuffle Map: 0%| | 0/1 [00:00, ?it/s]\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO: Started server process [39628]\n",
+ "Shuffle Map: 100%|██████████| 1/1 [00:00<00:00, 8.12it/s]\n",
+ "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 5.80it/s]\n",
+ "Map Progress (1 actors 0 pending): 100%|██████████| 1/1 [00:01<00:00, 1.16s/it]\n",
+ "/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/ipykernel_launcher.py:25: UserWarning: From /var/folders/1s/wy6f3ytn3q726p5hl8fw8d780000gn/T/ipykernel_39344/1249059442.py:25: deploy (from ray.serve.deployment) is deprecated and will be removed in a future version Please see https://docs.ray.io/en/latest/serve/index.html\n",
+ "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:49:00,913 controller 39625 deployment_state.py:1281 - Adding 2 replicas to deployment 'mnist_model'.\n",
+ "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:02<00:00, 2.39s/it]\n",
+ "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.39it/s]\n",
+ "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.37s/it]\n"
]
},
{
@@ -928,22 +819,15 @@
"Starting training for task: 1\n"
]
},
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
{
"data": {
"text/html": [
- "== Status == Current time: 2022-05-25 22:28:52 (running for 00:01:09.00) Memory usage on this node: 5.0/12.7 GiB Using FIFO scheduling algorithm. Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.31 GiB heap, 0.0/3.66 GiB objects (0.0/1.0 accelerator_type:T4) Result logdir: /root/ray_results/TorchTrainer_2022-05-25_22-27-43 Number of trials: 1/1 (1 TERMINATED)
\n",
+ "== Status == Current time: 2022-07-20 21:50:36 (running for 00:00:37.98) Memory usage on this node: 33.7/64.0 GiB Using FIFO scheduling algorithm. Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.14 GiB heap, 0.0/2.0 GiB objects Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-49-58 Number of trials: 1/1 (1 TERMINATED)
\n",
"\n",
- "
Trial name
status
loc
\n",
+ "
Trial name
status
loc
iter
total time (s)
loss
_timestamp
_time_this_iter_s
\n",
"\n",
"\n",
- "
TorchTrainer_e4f66_00000
TERMINATED
172.28.0.2:2875
\n",
+ "
TorchTrainer_92bcd_00000
TERMINATED
127.0.0.1:39736
4
34.1132
707.634
1658379035
6.45643
\n",
"\n",
"
"
],
@@ -958,125 +842,105 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[2m\u001b[36m(_map_block_nosplit pid=2909)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n",
- "\u001b[2m\u001b[36m(_map_block_nosplit pid=2909)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m 2022-05-25 22:28:01,917\tINFO torch.py:347 -- Setting up process group for: env:// [rank=0, world_size=1]\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m 2022-05-25 22:28:02,063\tINFO torch.py:98 -- Moving model to device: cuda:0\n"
+ "\u001b[2m\u001b[36m(TorchTrainer pid=39736)\u001b[0m 2022-07-20 21:50:01,936\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m 2022-07-20 21:50:09,489\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 3.347775, epoch: 0, iteration: 0\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 1.343975, epoch: 0, iteration: 500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.768560, epoch: 0, iteration: 1000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.607410, epoch: 0, iteration: 1500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.578952, epoch: 1, iteration: 0\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.473788, epoch: 1, iteration: 500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.609530, epoch: 1, iteration: 1000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.741895, epoch: 1, iteration: 1500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.417272, epoch: 2, iteration: 0\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.510404, epoch: 2, iteration: 500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.422137, epoch: 2, iteration: 1000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.403623, epoch: 2, iteration: 1500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.384720, epoch: 3, iteration: 0\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.414567, epoch: 3, iteration: 500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.274302, epoch: 3, iteration: 1000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.348169, epoch: 3, iteration: 1500\n"
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 3.301114, epoch: 0, iteration: 0\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "2022-05-25 22:28:52,221\tERROR checkpoint_manager.py:193 -- Result dict has no key: training_iteration. checkpoint_score_attr must be set to a key of the result dict. Valid keys are ['trial_id', 'experiment_id', 'date', 'timestamp', 'pid', 'hostname', 'node_ip', 'config', 'done']\n"
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m 2022-07-20 21:50:09,795\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Trial TorchTrainer_e4f66_00000 completed. Last result: \n"
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 1.075076, epoch: 0, iteration: 500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.536976, epoch: 0, iteration: 1000\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.600182, epoch: 0, iteration: 1500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.546070, epoch: 1, iteration: 0\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.448120, epoch: 1, iteration: 500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.392481, epoch: 1, iteration: 1000\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.371981, epoch: 1, iteration: 1500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.521735, epoch: 2, iteration: 0\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.635850, epoch: 2, iteration: 500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.395862, epoch: 2, iteration: 1000\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.402500, epoch: 2, iteration: 1500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.236922, epoch: 3, iteration: 0\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.528482, epoch: 3, iteration: 500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.372242, epoch: 3, iteration: 1000\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.355759, epoch: 3, iteration: 1500\n",
+ "Result for TorchTrainer_92bcd_00000:\n",
+ " _time_this_iter_s: 6.456433057785034\n",
+ " _timestamp: 1658379035\n",
+ " _training_iteration: 4\n",
+ " date: 2022-07-20_21-50-36\n",
+ " done: true\n",
+ " experiment_id: 21820161d0a245428cf75b0b9b17fe6e\n",
+ " experiment_tag: '0'\n",
+ " hostname: Jiaos-MacBook-Pro-16-inch-2019\n",
+ " iterations_since_restore: 4\n",
+ " loss: 707.6341038495302\n",
+ " node_ip: 127.0.0.1\n",
+ " pid: 39736\n",
+ " should_checkpoint: true\n",
+ " time_since_restore: 34.11321783065796\n",
+ " time_this_iter_s: 6.463765859603882\n",
+ " time_total_s: 34.11321783065796\n",
+ " timestamp: 1658379036\n",
+ " timesteps_since_restore: 0\n",
+ " training_iteration: 4\n",
+ " trial_id: 92bcd_00000\n",
+ " warmup_time: 0.005189180374145508\n",
+ " \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "2022-05-25 22:28:52,344\tINFO tune.py:753 -- Total run time: 69.20 seconds (68.99 seconds for the tuning loop).\n",
- "Map Progress (1 actors 1 pending): 0%| | 0/2 [00:01, ?it/s]\u001b[2m\u001b[36m(BlockWorker pid=3027)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n",
- "\u001b[2m\u001b[36m(BlockWorker pid=3027)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n",
- "Map Progress (2 actors 1 pending): 100%|██████████| 2/2 [00:05<00:00, 2.64s/it]\n",
- "Map_Batches: 100%|██████████| 2/2 [00:01<00:00, 1.07it/s]\n",
- "Map_Batches: 100%|██████████| 2/2 [00:01<00:00, 1.55it/s]\n",
- "Shuffle Map: 100%|██████████| 2/2 [00:00<00:00, 3.78it/s]\n",
- "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 72.95it/s]\n"
+ "2022-07-20 21:50:36,835\tINFO tune.py:738 -- Total run time: 38.13 seconds (37.98 seconds for the tuning loop).\n",
+ "Map Progress (1 actors 1 pending): 0%| | 0/2 [00:01, ?it/s]\u001b[2m\u001b[36m(BlockWorker pid=39801)\u001b[0m /Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/torchvision/transforms/functional.py:150: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n",
+ "Map Progress (2 actors 1 pending): 100%|██████████| 2/2 [00:03<00:00, 1.96s/it]\n",
+ "Map_Batches: 100%|██████████| 2/2 [00:00<00:00, 5.28it/s]\n",
+ "Map_Batches: 100%|██████████| 2/2 [00:00<00:00, 114.72it/s]\n",
+ "Shuffle Map: 100%|██████████| 2/2 [00:00<00:00, 162.16it/s]\n",
+ "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 140.57it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Accuracy for task 2: 0.9261\n"
+ "Accuracy for task 2: 0.36795\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[2m\u001b[36m(ServeController pid=3209)\u001b[0m INFO 2022-05-25 22:29:02,797 controller 3209 checkpoint_path.py:17 - Using RayInternalKVStore for controller checkpoint and recovery.\n",
- "\u001b[2m\u001b[36m(ServeController pid=3209)\u001b[0m INFO 2022-05-25 22:29:02,802 controller 3209 http_state.py:115 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:lsPTvu:SERVE_PROXY_ACTOR-node:172.28.0.2-0' on node 'node:172.28.0.2-0' listening on '127.0.0.1:8000'\n",
- "Shuffle Map: 0%| | 0/1 [00:00, ?it/s]\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO: Started server process [3241]\n",
- "Shuffle Map: 100%|██████████| 1/1 [00:01<00:00, 1.54s/it]\n",
- "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 8.17it/s]\n",
- "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.15s/it]\n",
- "\u001b[2m\u001b[36m(ServeController pid=3209)\u001b[0m INFO 2022-05-25 22:29:08,327 controller 3209 deployment_state.py:1219 - Adding 2 replicas to deployment 'mnist_model'.\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,440 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 5.3ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3402)\u001b[0m INFO 2022-05-25 22:29:12,438 mnist_model mnist_model#Egafuf replica.py:483 - HANDLE __call__ OK 0.3ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3401)\u001b[0m INFO 2022-05-25 22:29:12,460 mnist_model mnist_model#uumYOV replica.py:483 - HANDLE __call__ OK 15.4ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,466 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 24.3ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,471 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 2.8ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,481 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 7.6ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,487 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 3.9ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,496 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 6.9ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,501 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 2.3ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,509 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 6.8ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,514 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 2.6ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,523 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 6.8ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,528 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 2.3ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,537 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 7.2ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,542 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 2.5ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,550 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 6.7ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,556 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 3.7ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,564 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 7.0ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3401)\u001b[0m INFO 2022-05-25 22:29:12,480 mnist_model mnist_model#uumYOV replica.py:483 - HANDLE __call__ OK 5.1ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3401)\u001b[0m INFO 2022-05-25 22:29:12,495 mnist_model mnist_model#uumYOV replica.py:483 - HANDLE __call__ OK 4.5ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3401)\u001b[0m INFO 2022-05-25 22:29:12,508 mnist_model mnist_model#uumYOV replica.py:483 - HANDLE __call__ OK 4.6ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3401)\u001b[0m INFO 2022-05-25 22:29:12,522 mnist_model mnist_model#uumYOV replica.py:483 - HANDLE __call__ OK 4.6ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3401)\u001b[0m INFO 2022-05-25 22:29:12,536 mnist_model mnist_model#uumYOV replica.py:483 - HANDLE __call__ OK 4.7ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3401)\u001b[0m INFO 2022-05-25 22:29:12,549 mnist_model mnist_model#uumYOV replica.py:483 - HANDLE __call__ OK 4.4ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3401)\u001b[0m INFO 2022-05-25 22:29:12,563 mnist_model mnist_model#uumYOV replica.py:483 - HANDLE __call__ OK 4.7ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3402)\u001b[0m INFO 2022-05-25 22:29:12,470 mnist_model mnist_model#Egafuf replica.py:483 - HANDLE __call__ OK 0.3ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3402)\u001b[0m INFO 2022-05-25 22:29:12,485 mnist_model mnist_model#Egafuf replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3402)\u001b[0m INFO 2022-05-25 22:29:12,500 mnist_model mnist_model#Egafuf replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3402)\u001b[0m INFO 2022-05-25 22:29:12,513 mnist_model mnist_model#Egafuf replica.py:483 - HANDLE __call__ OK 0.3ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3402)\u001b[0m INFO 2022-05-25 22:29:12,527 mnist_model mnist_model#Egafuf replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3402)\u001b[0m INFO 2022-05-25 22:29:12,540 mnist_model mnist_model#Egafuf replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3402)\u001b[0m INFO 2022-05-25 22:29:12,554 mnist_model mnist_model#Egafuf replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,586 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 4.6ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,596 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 9.3ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,601 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 2.7ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=3241)\u001b[0m INFO 2022-05-25 22:29:12,610 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 7.0ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3401)\u001b[0m INFO 2022-05-25 22:29:12,594 mnist_model mnist_model#uumYOV replica.py:483 - HANDLE __call__ OK 6.3ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3401)\u001b[0m INFO 2022-05-25 22:29:12,609 mnist_model mnist_model#uumYOV replica.py:483 - HANDLE __call__ OK 4.6ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3402)\u001b[0m INFO 2022-05-25 22:29:12,583 mnist_model mnist_model#Egafuf replica.py:483 - HANDLE __call__ OK 0.3ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=3402)\u001b[0m INFO 2022-05-25 22:29:12,600 mnist_model mnist_model#Egafuf replica.py:483 - HANDLE __call__ OK 0.3ms\n",
- "\u001b[2m\u001b[36m(ServeController pid=3209)\u001b[0m INFO 2022-05-25 22:29:12,699 controller 3209 deployment_state.py:1243 - Removing 2 replicas from deployment 'mnist_model'.\n",
- "Map Progress (2 actors 1 pending): 100%|██████████| 1/1 [00:02<00:00, 2.56s/it]\n",
- "\u001b[2m\u001b[36m(_prepare_read pid=3556)\u001b[0m 2022-05-25 22:29:19,825\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n",
- "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.44it/s]\n",
- "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.41s/it]"
+ "Shuffle Map: 100%|██████████| 1/1 [00:00<00:00, 6.24it/s]\n",
+ "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 6.19it/s]\n",
+ "Map Progress (1 actors 0 pending): 100%|██████████| 1/1 [00:01<00:00, 1.18s/it]\n",
+ "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:50:42,924 controller 39625 deployment_state.py:1240 - Stopping 1 replicas of deployment 'mnist_model' with outdated versions.\n",
+ "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:50:45,044 controller 39625 deployment_state.py:1281 - Adding 1 replicas to deployment 'mnist_model'.\n",
+ "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:50:47,377 controller 39625 deployment_state.py:1240 - Stopping 1 replicas of deployment 'mnist_model' with outdated versions.\n",
+ "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:50:49,504 controller 39625 deployment_state.py:1281 - Adding 1 replicas to deployment 'mnist_model'.\n",
+ "Map Progress (2 actors 0 pending): 100%|██████████| 1/1 [00:02<00:00, 2.36s/it]\n",
+ "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.04it/s]\n",
+ "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.37s/it]\n"
]
},
{
@@ -1086,22 +950,15 @@
"Starting training for task: 2\n"
]
},
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
{
"data": {
"text/html": [
- "== Status == Current time: 2022-05-25 22:30:31 (running for 00:01:09.12) Memory usage on this node: 5.0/12.7 GiB Using FIFO scheduling algorithm. Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.31 GiB heap, 0.0/3.66 GiB objects (0.0/1.0 accelerator_type:T4) Result logdir: /root/ray_results/TorchTrainer_2022-05-25_22-29-22 Number of trials: 1/1 (1 TERMINATED)
\n",
+ "== Status == Current time: 2022-07-20 21:52:25 (running for 00:00:37.97) Memory usage on this node: 34.0/64.0 GiB Using FIFO scheduling algorithm. Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.14 GiB heap, 0.0/2.0 GiB objects Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-51-47 Number of trials: 1/1 (1 TERMINATED)
\n",
"\n",
- "
Trial name
status
loc
\n",
+ "
Trial name
status
loc
iter
total time (s)
loss
_timestamp
_time_this_iter_s
\n",
"\n",
"\n",
- "
TorchTrainer_2040e_00000
TERMINATED
172.28.0.2:3703
\n",
+ "
TorchTrainer_d37db_00000
TERMINATED
127.0.0.1:39948
4
34.0141
671.998
1658379144
6.59292
\n",
"\n",
"
"
],
@@ -1116,123 +973,89 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[2m\u001b[36m(_map_block_nosplit pid=3738)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n",
- "\u001b[2m\u001b[36m(_map_block_nosplit pid=3738)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m 2022-05-25 22:29:41,392\tINFO torch.py:347 -- Setting up process group for: env:// [rank=0, world_size=1]\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m 2022-05-25 22:29:41,549\tINFO torch.py:98 -- Moving model to device: cuda:0\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 4.353125, epoch: 0, iteration: 0\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 1.147782, epoch: 0, iteration: 500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.609233, epoch: 0, iteration: 1000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.606812, epoch: 0, iteration: 1500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.494777, epoch: 1, iteration: 0\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.776362, epoch: 1, iteration: 500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.376833, epoch: 1, iteration: 1000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.478181, epoch: 1, iteration: 1500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.413856, epoch: 2, iteration: 0\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.668218, epoch: 2, iteration: 500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.318078, epoch: 2, iteration: 1000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.427121, epoch: 2, iteration: 1500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.369263, epoch: 3, iteration: 0\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.479945, epoch: 3, iteration: 500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.457482, epoch: 3, iteration: 1000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.318416, epoch: 3, iteration: 1500\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2022-05-25 22:30:31,831\tERROR checkpoint_manager.py:193 -- Result dict has no key: training_iteration. checkpoint_score_attr must be set to a key of the result dict. Valid keys are ['trial_id', 'experiment_id', 'date', 'timestamp', 'pid', 'hostname', 'node_ip', 'config', 'done']\n"
+ "\u001b[2m\u001b[36m(TorchTrainer pid=39948)\u001b[0m 2022-07-20 21:51:50,596\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m 2022-07-20 21:51:58,118\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m 2022-07-20 21:51:58,367\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Trial TorchTrainer_2040e_00000 completed. Last result: \n"
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 4.062408, epoch: 0, iteration: 0\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.970063, epoch: 0, iteration: 500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.658269, epoch: 0, iteration: 1000\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.442650, epoch: 0, iteration: 1500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.603212, epoch: 1, iteration: 0\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.534739, epoch: 1, iteration: 500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.420072, epoch: 1, iteration: 1000\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.351545, epoch: 1, iteration: 1500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.347010, epoch: 2, iteration: 0\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.419703, epoch: 2, iteration: 500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.350773, epoch: 2, iteration: 1000\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.231652, epoch: 2, iteration: 1500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.343125, epoch: 3, iteration: 0\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.547853, epoch: 3, iteration: 500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.353915, epoch: 3, iteration: 1000\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.260028, epoch: 3, iteration: 1500\n",
+ "Result for TorchTrainer_d37db_00000:\n",
+ " _time_this_iter_s: 6.5929179191589355\n",
+ " _timestamp: 1658379144\n",
+ " _training_iteration: 4\n",
+ " date: 2022-07-20_21-52-24\n",
+ " done: true\n",
+ " experiment_id: 5d41bf13ba524c528faac8f64b13c7cc\n",
+ " experiment_tag: '0'\n",
+ " hostname: Jiaos-MacBook-Pro-16-inch-2019\n",
+ " iterations_since_restore: 4\n",
+ " loss: 671.9976235236973\n",
+ " node_ip: 127.0.0.1\n",
+ " pid: 39948\n",
+ " should_checkpoint: true\n",
+ " time_since_restore: 34.01405596733093\n",
+ " time_this_iter_s: 6.590774774551392\n",
+ " time_total_s: 34.01405596733093\n",
+ " timestamp: 1658379144\n",
+ " timesteps_since_restore: 0\n",
+ " training_iteration: 4\n",
+ " trial_id: d37db_00000\n",
+ " warmup_time: 0.005116939544677734\n",
+ " \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "2022-05-25 22:30:31,953\tINFO tune.py:753 -- Total run time: 69.33 seconds (69.12 seconds for the tuning loop).\n",
- "Map Progress (1 actors 1 pending): 0%| | 0/3 [00:01, ?it/s]\u001b[2m\u001b[36m(BlockWorker pid=3857)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n",
- "\u001b[2m\u001b[36m(BlockWorker pid=3857)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n",
- "Map Progress (2 actors 1 pending): 33%|███▎ | 1/3 [00:04<00:08, 4.24s/it]\u001b[2m\u001b[36m(BlockWorker pid=3886)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n",
- "\u001b[2m\u001b[36m(BlockWorker pid=3886)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n",
- "Map Progress (2 actors 1 pending): 100%|██████████| 3/3 [00:06<00:00, 2.16s/it]\n",
- "Map_Batches: 100%|██████████| 3/3 [00:01<00:00, 1.53it/s]\n",
- "Map_Batches: 100%|██████████| 3/3 [00:00<00:00, 19.25it/s]\n",
- "Shuffle Map: 100%|██████████| 3/3 [00:00<00:00, 97.56it/s]\n",
- "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 64.24it/s]\n"
+ "2022-07-20 21:52:25,471\tINFO tune.py:738 -- Total run time: 38.13 seconds (37.97 seconds for the tuning loop).\n",
+ "Map Progress (1 actors 1 pending): 0%| | 0/3 [00:01, ?it/s]\u001b[2m\u001b[36m(BlockWorker pid=40038)\u001b[0m /Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/torchvision/transforms/functional.py:150: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n",
+ "Map Progress (2 actors 1 pending): 100%|██████████| 3/3 [00:04<00:00, 1.62s/it]\n",
+ "Map_Batches: 100%|██████████| 3/3 [00:00<00:00, 7.77it/s]\n",
+ "Map_Batches: 100%|██████████| 3/3 [00:00<00:00, 136.51it/s]\n",
+ "Shuffle Map: 100%|██████████| 3/3 [00:00<00:00, 216.98it/s]\n",
+ "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 135.98it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Accuracy for task 3: 0.9001333333333333\n"
+ "Accuracy for task 3: 0.3590333333333333\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[2m\u001b[36m(ServeController pid=4011)\u001b[0m INFO 2022-05-25 22:30:43,081 controller 4011 checkpoint_path.py:17 - Using RayInternalKVStore for controller checkpoint and recovery.\n",
- "\u001b[2m\u001b[36m(ServeController pid=4011)\u001b[0m INFO 2022-05-25 22:30:43,084 controller 4011 http_state.py:115 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:viEsyL:SERVE_PROXY_ACTOR-node:172.28.0.2-0' on node 'node:172.28.0.2-0' listening on '127.0.0.1:8000'\n",
- "Shuffle Map: 0%| | 0/1 [00:00, ?it/s]\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO: Started server process [4043]\n",
- "Shuffle Map: 100%|██████████| 1/1 [00:01<00:00, 1.61s/it]\n",
- "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 7.16it/s]\n",
- "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.36s/it]\n",
- "\u001b[2m\u001b[36m(ServeController pid=4011)\u001b[0m INFO 2022-05-25 22:30:48,663 controller 4011 deployment_state.py:1219 - Adding 2 replicas to deployment 'mnist_model'.\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:52,754 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 5.0ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:52,771 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 15.8ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:52,777 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 3.1ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:52,788 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 9.0ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:52,794 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 3.5ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:52,803 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 7.0ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:52,808 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 2.5ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:52,817 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 7.2ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:52,822 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 2.6ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4199)\u001b[0m INFO 2022-05-25 22:30:52,770 mnist_model mnist_model#kzOVuE replica.py:483 - HANDLE __call__ OK 11.5ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4199)\u001b[0m INFO 2022-05-25 22:30:52,787 mnist_model mnist_model#kzOVuE replica.py:483 - HANDLE __call__ OK 6.1ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4199)\u001b[0m INFO 2022-05-25 22:30:52,802 mnist_model mnist_model#kzOVuE replica.py:483 - HANDLE __call__ OK 4.8ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4199)\u001b[0m INFO 2022-05-25 22:30:52,815 mnist_model mnist_model#kzOVuE replica.py:483 - HANDLE __call__ OK 4.6ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4200)\u001b[0m INFO 2022-05-25 22:30:52,752 mnist_model mnist_model#QFllkk replica.py:483 - HANDLE __call__ OK 0.3ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4200)\u001b[0m INFO 2022-05-25 22:30:52,776 mnist_model mnist_model#QFllkk replica.py:483 - HANDLE __call__ OK 0.3ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4200)\u001b[0m INFO 2022-05-25 22:30:52,793 mnist_model mnist_model#QFllkk replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4200)\u001b[0m INFO 2022-05-25 22:30:52,807 mnist_model mnist_model#QFllkk replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4200)\u001b[0m INFO 2022-05-25 22:30:52,821 mnist_model mnist_model#QFllkk replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:52,848 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 24.9ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:52,853 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 2.6ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:52,869 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 13.4ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4199)\u001b[0m INFO 2022-05-25 22:30:52,847 mnist_model mnist_model#kzOVuE replica.py:483 - HANDLE __call__ OK 8.4ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4199)\u001b[0m INFO 2022-05-25 22:30:52,867 mnist_model mnist_model#kzOVuE replica.py:483 - HANDLE __call__ OK 6.6ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4200)\u001b[0m INFO 2022-05-25 22:30:52,852 mnist_model mnist_model#QFllkk replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:52,984 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 3.5ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:52,995 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 9.0ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:53,001 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 3.4ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:53,011 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 8.1ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:53,016 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 2.7ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:53,025 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 7.4ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:53,030 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 307 2.5ms\n",
- "\u001b[2m\u001b[36m(HTTPProxyActor pid=4043)\u001b[0m INFO 2022-05-25 22:30:53,045 http_proxy 172.28.0.2 http_proxy.py:320 - POST /mnist_predict 200 11.9ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4199)\u001b[0m INFO 2022-05-25 22:30:52,993 mnist_model mnist_model#kzOVuE replica.py:483 - HANDLE __call__ OK 5.9ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4199)\u001b[0m INFO 2022-05-25 22:30:53,010 mnist_model mnist_model#kzOVuE replica.py:483 - HANDLE __call__ OK 5.2ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4199)\u001b[0m INFO 2022-05-25 22:30:53,024 mnist_model mnist_model#kzOVuE replica.py:483 - HANDLE __call__ OK 4.9ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4199)\u001b[0m INFO 2022-05-25 22:30:53,043 mnist_model mnist_model#kzOVuE replica.py:483 - HANDLE __call__ OK 4.9ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4200)\u001b[0m INFO 2022-05-25 22:30:52,982 mnist_model mnist_model#QFllkk replica.py:483 - HANDLE __call__ OK 0.3ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4200)\u001b[0m INFO 2022-05-25 22:30:52,999 mnist_model mnist_model#QFllkk replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4200)\u001b[0m INFO 2022-05-25 22:30:53,015 mnist_model mnist_model#QFllkk replica.py:483 - HANDLE __call__ OK 0.2ms\n",
- "\u001b[2m\u001b[36m(mnist_model pid=4200)\u001b[0m INFO 2022-05-25 22:30:53,029 mnist_model mnist_model#QFllkk replica.py:483 - HANDLE __call__ OK 0.3ms\n",
- "\u001b[2m\u001b[36m(ServeController pid=4011)\u001b[0m INFO 2022-05-25 22:30:53,125 controller 4011 deployment_state.py:1243 - Removing 2 replicas from deployment 'mnist_model'.\n"
+ "Shuffle Map: 100%|██████████| 1/1 [00:00<00:00, 6.01it/s]\n",
+ "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 6.26it/s]\n",
+ "Map Progress (1 actors 0 pending): 100%|██████████| 1/1 [00:01<00:00, 1.17s/it]\n",
+ "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:52:32,498 controller 39625 deployment_state.py:1240 - Stopping 1 replicas of deployment 'mnist_model' with outdated versions.\n",
+ "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:52:34,634 controller 39625 deployment_state.py:1281 - Adding 1 replicas to deployment 'mnist_model'.\n",
+ "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:52:36,956 controller 39625 deployment_state.py:1240 - Stopping 1 replicas of deployment 'mnist_model' with outdated versions.\n",
+ "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:52:39,078 controller 39625 deployment_state.py:1281 - Adding 1 replicas to deployment 'mnist_model'.\n",
+ "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:53:31,642 controller 39625 deployment_state.py:1304 - Removing 2 replicas from deployment 'mnist_model'.\n"
]
}
],
@@ -1258,7 +1081,7 @@
"# Number of data parallel workers to use for training.\n",
"num_workers = 1\n",
"# Whether to use GPU or not.\n",
- "use_gpu = True\n",
+ "use_gpu = False\n",
"\n",
"permuted_mnist = PermutedMNISTStream(n_tasks=n_tasks)\n",
"train_stream = permuted_mnist.generate_train_stream()\n",
@@ -1352,7 +1175,7 @@
{
"data": {
"text/plain": [
- "[0.946, 0.9261, 0.9001333333333333]"
+ "[0.3767, 0.36795, 0.3590333333333333]"
]
},
"execution_count": 11,
@@ -1388,7 +1211,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1401,26 +1224,11 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Map Progress (2 actors 1 pending): 100%|██████████| 1/1 [00:02<00:00, 2.93s/it]\n",
- "Map Progress (2 actors 1 pending): 100%|██████████| 1/1 [00:03<00:00, 3.11s/it]\n",
- "Map Progress: 0%| | 0/1 [00:00, ?it/s]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[2m\u001b[1m\u001b[36m(scheduler +8m58s)\u001b[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.\n",
- "\u001b[2m\u001b[1m\u001b[33m(scheduler +8m58s)\u001b[0m Warning: The following resource request cannot be scheduled right now: {'CPU': 1.0}. This is likely due to all cluster resources being claimed by actors. Consider creating fewer actors or adding more nodes to this Ray cluster.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Map Progress (2 actors 1 pending): 100%|██████████| 1/1 [00:03<00:00, 3.06s/it]\n",
- "Shuffle Map: 100%|██████████| 3/3 [00:04<00:00, 1.64s/it]\n",
- "Shuffle Reduce: 100%|██████████| 3/3 [00:02<00:00, 1.07it/s]\n"
+ "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:02<00:00, 2.33s/it]\n",
+ "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:02<00:00, 2.32s/it]\n",
+ "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:02<00:00, 2.31s/it]\n",
+ "Shuffle Map: 100%|██████████| 3/3 [00:01<00:00, 2.55it/s]\n",
+ "Shuffle Reduce: 100%|██████████| 3/3 [00:01<00:00, 2.55it/s]\n"
]
}
],
@@ -1448,7 +1256,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -1458,15 +1266,22 @@
"outputId": "653b4dfc-ed47-4307-fa84-e4c4ea3ec354"
},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2022-07-20 21:53:44,223\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n"
+ ]
+ },
{
"data": {
"text/html": [
- "== Status == Current time: 2022-05-18 23:52:49 (running for 00:03:27.40) Memory usage on this node: 7.0/12.7 GiB Using FIFO scheduling algorithm. Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.34 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:T4) Result logdir: /root/ray_results/TorchTrainer_2022-05-18_23-49-22 Number of trials: 1/1 (1 TERMINATED)
\n",
+ "== Status == Current time: 2022-07-20 21:55:10 (running for 00:01:25.89) Memory usage on this node: 34.4/64.0 GiB Using FIFO scheduling algorithm. Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.14 GiB heap, 0.0/2.0 GiB objects Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-53-44 Number of trials: 1/1 (1 TERMINATED)
\n",
"\n",
- "
Trial name
status
loc
\n",
+ "
Trial name
status
loc
iter
total time (s)
loss
_timestamp
_time_this_iter_s
\n",
"\n",
"\n",
- "
TorchTrainer_24496_00000
TERMINATED
172.28.0.2:4630
\n",
+ "
TorchTrainer_1923b_00000
TERMINATED
127.0.0.1:40228
4
82.7285
2328.8
1658379309
17.0239
\n",
"\n",
"
"
],
@@ -1481,85 +1296,59 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[2m\u001b[36m(_map_block_nosplit pid=4666)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n",
- "\u001b[2m\u001b[36m(_map_block_nosplit pid=4666)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m 2022-05-18 23:50:06,950\tINFO torch.py:347 -- Setting up process group for: env:// [rank=0, world_size=1]\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m 2022-05-18 23:50:07,011\tINFO torch.py:98 -- Moving model to device: cuda:0\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 2.373475, epoch: 0, iteration: 0\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 1.699985, epoch: 0, iteration: 500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 1.636039, epoch: 0, iteration: 1000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 1.334987, epoch: 0, iteration: 1500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 1.152312, epoch: 0, iteration: 2000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.998297, epoch: 0, iteration: 2500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 1.434949, epoch: 0, iteration: 3000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.971171, epoch: 0, iteration: 3500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.796480, epoch: 0, iteration: 4000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.802282, epoch: 0, iteration: 4500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.731363, epoch: 0, iteration: 5000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.847772, epoch: 0, iteration: 5500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.879676, epoch: 1, iteration: 0\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.564319, epoch: 1, iteration: 500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.714444, epoch: 1, iteration: 1000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.565163, epoch: 1, iteration: 1500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.739525, epoch: 1, iteration: 2000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.510878, epoch: 1, iteration: 2500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.814798, epoch: 1, iteration: 3000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.473765, epoch: 1, iteration: 3500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.557866, epoch: 1, iteration: 4000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.674371, epoch: 1, iteration: 4500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.532800, epoch: 1, iteration: 5000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.832442, epoch: 1, iteration: 5500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.557547, epoch: 2, iteration: 0\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.355255, epoch: 2, iteration: 500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.426749, epoch: 2, iteration: 1000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.484543, epoch: 2, iteration: 1500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.360856, epoch: 2, iteration: 2000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.444718, epoch: 2, iteration: 2500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.596777, epoch: 2, iteration: 3000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.289816, epoch: 2, iteration: 3500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.407941, epoch: 2, iteration: 4000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.438239, epoch: 2, iteration: 4500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.379983, epoch: 2, iteration: 5000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.527786, epoch: 2, iteration: 5500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.598584, epoch: 3, iteration: 0\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.355202, epoch: 3, iteration: 500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.392683, epoch: 3, iteration: 1000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.415264, epoch: 3, iteration: 1500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.417230, epoch: 3, iteration: 2000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.289974, epoch: 3, iteration: 2500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.648514, epoch: 3, iteration: 3000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.369468, epoch: 3, iteration: 3500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.378548, epoch: 3, iteration: 4000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.392761, epoch: 3, iteration: 4500\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.555575, epoch: 3, iteration: 5000\n",
- "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.394487, epoch: 3, iteration: 5500\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2022-05-18 23:52:49,915\tERROR checkpoint_manager.py:193 -- Result dict has no key: training_iteration. checkpoint_score_attr must be set to a key of the result dict. Valid keys are ['trial_id', 'experiment_id', 'date', 'timestamp', 'pid', 'hostname', 'node_ip', 'config', 'done']\n"
+ "\u001b[2m\u001b[36m(TorchTrainer pid=40228)\u001b[0m 2022-07-20 21:53:47,328\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Trial TorchTrainer_24496_00000 completed. Last result: \n"
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 2.305423, epoch: 0, iteration: 0\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 1.935424, epoch: 0, iteration: 500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 1.174222, epoch: 0, iteration: 5000\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.776577, epoch: 0, iteration: 5500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.674814, epoch: 1, iteration: 0\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.699747, epoch: 1, iteration: 500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.795673, epoch: 1, iteration: 5000\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.651217, epoch: 1, iteration: 5500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.743072, epoch: 2, iteration: 0\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.745054, epoch: 2, iteration: 500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.639829, epoch: 2, iteration: 5000\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.682482, epoch: 2, iteration: 5500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.553197, epoch: 3, iteration: 0\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.471037, epoch: 3, iteration: 500\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.538055, epoch: 3, iteration: 5000\n",
+ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.534079, epoch: 3, iteration: 5500\n",
+ "Result for TorchTrainer_1923b_00000:\n",
+ " _time_this_iter_s: 17.023871898651123\n",
+ " _timestamp: 1658379309\n",
+ " _training_iteration: 4\n",
+ " date: 2022-07-20_21-55-10\n",
+ " done: true\n",
+ " experiment_id: d304983bfe3f4e269118f8618aa9b02f\n",
+ " experiment_tag: '0'\n",
+ " hostname: Jiaos-MacBook-Pro-16-inch-2019\n",
+ " iterations_since_restore: 4\n",
+ " loss: 2328.8038033917546\n",
+ " node_ip: 127.0.0.1\n",
+ " pid: 40228\n",
+ " should_checkpoint: true\n",
+ " time_since_restore: 82.72845268249512\n",
+ " time_this_iter_s: 17.024354696273804\n",
+ " time_total_s: 82.72845268249512\n",
+ " timestamp: 1658379310\n",
+ " timesteps_since_restore: 0\n",
+ " training_iteration: 4\n",
+ " trial_id: 1923b_00000\n",
+ " warmup_time: 0.004433870315551758\n",
+ " \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "2022-05-18 23:52:50,042\tINFO tune.py:753 -- Total run time: 207.53 seconds (207.39 seconds for the tuning loop).\n"
+ "2022-07-20 21:55:10,233\tINFO tune.py:738 -- Total run time: 86.00 seconds (85.88 seconds for the tuning loop).\n"
]
}
],
@@ -1593,7 +1382,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1606,13 +1395,12 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Map Progress (1 actors 1 pending): 0%| | 0/3 [00:01, ?it/s]\u001b[2m\u001b[36m(BlockWorker pid=4840)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n",
- "\u001b[2m\u001b[36m(BlockWorker pid=4840)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n",
- "Map Progress (2 actors 1 pending): 100%|██████████| 3/3 [00:06<00:00, 2.25s/it]\n",
- "Map Progress: 100%|██████████| 3/3 [00:01<00:00, 1.51it/s]\n",
- "Map Progress: 100%|██████████| 3/3 [00:01<00:00, 1.94it/s]\n",
- "Shuffle Map: 100%|██████████| 3/3 [00:00<00:00, 5.53it/s]\n",
- "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 65.42it/s]\n"
+ "Map Progress (1 actors 1 pending): 0%| | 0/3 [00:01, ?it/s]\u001b[2m\u001b[36m(BlockWorker pid=40400)\u001b[0m /Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/torchvision/transforms/functional.py:150: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n",
+ "Map Progress (2 actors 1 pending): 100%|██████████| 3/3 [00:04<00:00, 1.62s/it]\n",
+ "Map_Batches: 100%|██████████| 3/3 [00:00<00:00, 63.30it/s]\n",
+ "Map_Batches: 100%|██████████| 3/3 [00:00<00:00, 129.65it/s]\n",
+ "Shuffle Map: 100%|██████████| 3/3 [00:00<00:00, 312.18it/s]\n",
+ "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 149.25it/s]\n"
]
}
],
@@ -1634,7 +1422,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1647,8 +1435,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Fully trained model accuracy: 0.9468\n",
- "Incrementally trained model accuracy: 0.9207666666666666\n"
+ "Fully trained model accuracy: 0.38016666666666665\n",
+ "Incrementally trained model accuracy: 0.3590333333333333\n"
]
}
],
@@ -1671,15 +1459,6 @@
"\n",
"\n"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "2GdLZD4od3oI"
- },
- "outputs": [],
- "source": []
}
],
"metadata": {
@@ -1690,7 +1469,7 @@
"provenance": []
},
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "Python 3.7.10 ('ray3.7')",
"language": "python",
"name": "python3"
},
@@ -1704,7 +1483,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.6"
+ "version": "3.7.10"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "99d89bfe98f3aa2d7facda0d08d31ff2a0af9559e5330d719288ce64a1966273"
+ }
}
},
"nbformat": 4,
diff --git a/python/ray/data/extensions/__init__.py b/python/ray/data/extensions/__init__.py
index a9ae87faeb02..70b4daedd328 100644
--- a/python/ray/data/extensions/__init__.py
+++ b/python/ray/data/extensions/__init__.py
@@ -1,6 +1,7 @@
from ray.data.extensions.tensor_extension import (
TensorDtype,
TensorArray,
+ TensorArrayElement,
ArrowTensorType,
ArrowTensorArray,
)
@@ -9,6 +10,7 @@
# Tensor array extension.
"TensorDtype",
"TensorArray",
+ "TensorArrayElement",
"ArrowTensorType",
"ArrowTensorArray",
]
diff --git a/python/ray/data/extensions/tensor_extension.py b/python/ray/data/extensions/tensor_extension.py
index 0fa577250a83..96b868bb47f5 100644
--- a/python/ray/data/extensions/tensor_extension.py
+++ b/python/ray/data/extensions/tensor_extension.py
@@ -1,6 +1,7 @@
from ray.air.util.tensor_extensions.pandas import ( # noqa: F401
TensorDtype,
TensorArray,
+ TensorArrayElement,
)
from ray.air.util.tensor_extensions.arrow import ( # noqa: F401
ArrowTensorType,
diff --git a/python/ray/serve/air_integrations.py b/python/ray/serve/air_integrations.py
index d3868763d683..ddf87c268c0a 100644
--- a/python/ray/serve/air_integrations.py
+++ b/python/ray/serve/air_integrations.py
@@ -45,6 +45,23 @@ def _load_predictor_cls(
return predictor_cls
+def _unpack_tensorarray_from_pandas(output_df: "pd.DataFrame") -> "pd.DataFrame":
+    """Convert TensorArray columns of a predictor's output to NumPy arrays.
+
+    In dl_predictor.py we return a pd.DataFrame that could have multiple
+    columns, each backed by a TensorArray. Every such column is replaced by
+    ``output_df[col].to_numpy()`` with the goal of making the output JSON
+    serializable as an HTTP response. Mutates and returns ``output_df``.
+    """
+    from ray.data.extensions import TensorArray, TensorArrayElement
+
+    for col in output_df:
+        if isinstance(output_df[col].values, (TensorArray, TensorArrayElement)):
+            output_df[col] = output_df[col].to_numpy()
+
+    return output_df
+
+
class BatchingManager:
"""A collection of utilities for batching and splitting data."""
@@ -91,6 +108,9 @@ def split_dataframe(
f"The output dataframe should have length divisible by {batch_size}, "
f"but Serve got length {len(output_df)}."
)
+
+ output_df = _unpack_tensorarray_from_pandas(output_df)
+
return [df.reset_index(drop=True) for df in np.split(output_df, batch_size)]
@staticmethod
@@ -200,6 +220,8 @@ async def predict_impl(inp: Union[np.ndarray, "pd.DataFrame"]):
out = self.model.predict(inp, **predict_kwargs)
if isinstance(out, ray.ObjectRef):
out = await out
+ elif pd is not None and isinstance(out, pd.DataFrame):
+ out = _unpack_tensorarray_from_pandas(out)
return out
else:
diff --git a/python/ray/serve/tests/test_air_integrations.py b/python/ray/serve/tests/test_air_integrations.py
index 26bd6f8a39c9..b0a222a69879 100644
--- a/python/ray/serve/tests/test_air_integrations.py
+++ b/python/ray/serve/tests/test_air_integrations.py
@@ -16,6 +16,7 @@
from ray.serve.deployment_graph_build import build
from ray.serve.http_adapters import json_to_ndarray
from ray.train.predictor import DataBatchType, Predictor
+from ray.data.extensions import TensorArray
class TestBatchingFunctionFunctions:
@@ -73,6 +74,25 @@ def test_dataframe(self):
for i, j in zip(unpacked_list, list_of_dfs):
assert i.equals(j)
+    def test_dataframe_with_tensorarray(self):  # TensorArray columns get unpacked
+        batched_df = pd.DataFrame(  # input: both columns are TensorArray-backed
+            {
+                "a": TensorArray([1, 2, 3, 4]),
+                "b": TensorArray([5, 6, 7, 8]),
+            }
+        )
+        split_df = pd.DataFrame(  # expected values, built from plain lists
+            {
+                "a": [1, 2, 3, 4],
+                "b": [5, 6, 7, 8],
+            }
+        )
+
+        unpacked_list = BatchingManager.split_dataframe(batched_df, 1)
+        assert len(unpacked_list) == 1  # batch_size=1 -> one frame holding all rows
+        assert unpacked_list[0]["a"].equals(split_df["a"])
+        assert unpacked_list[0]["b"].equals(split_df["b"])
+
class AdderPredictor(Predictor):
def __init__(self, increment: int, do_double: bool) -> None: