From 1931cf1df75be2b9c9e9ed821f3b6d50687c3227 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 22 Jun 2022 17:33:50 +0000 Subject: [PATCH 1/2] [AIR] Remove unnecessary pandas from examples --- .../ray-air/doc_code/air_key_concepts.py | 6 +- doc/source/ray-air/doc_code/preprocessors.py | 22 +- doc/source/ray-air/doc_code/tf_starter.py | 4 +- .../ray-air/doc_code/xgboost_starter.py | 14 +- ...ert_existing_pytorch_code_to_ray_air.ipynb | 883 ++++++++++++------ .../huggingface_text_classification.ipynb | 3 +- .../ray-air/examples/lightgbm_example.ipynb | 389 ++++---- .../ray-air/examples/sklearn_example.ipynb | 277 +++--- .../ray-air/examples/upload_to_comet_ml.ipynb | 10 +- .../ray-air/examples/upload_to_wandb.ipynb | 10 +- .../ray-air/examples/xgboost_example.ipynb | 351 +++---- ...ingface_basic_language_modeling_example.py | 3 +- .../tf/tensorflow_linear_dataset_example.py | 5 +- 13 files changed, 1119 insertions(+), 858 deletions(-) diff --git a/doc/source/ray-air/doc_code/air_key_concepts.py b/doc/source/ray-air/doc_code/air_key_concepts.py index 6d4e29798432..42727433751b 100644 --- a/doc/source/ray-air/doc_code/air_key_concepts.py +++ b/doc/source/ray-air/doc_code/air_key_concepts.py @@ -75,10 +75,8 @@ batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, XGBoostPredictor) # Bulk batch prediction. -predicted_labels = ( - batch_predictor.predict(test_dataset) - .map_batches(lambda df: (df > 0.5).astype(int), batch_format="pandas") - .to_pandas(limit=float("inf")) +predicted_labels = batch_predictor.predict(test_dataset).map_batches( + lambda df: (df > 0.5).astype(int), batch_format="pandas" ) # Pipelined batch prediction: instead of processing the data in bulk, process it diff --git a/doc/source/ray-air/doc_code/preprocessors.py b/doc/source/ray-air/doc_code/preprocessors.py index d5eb3d86b164..1c19f08d2b8f 100644 --- a/doc/source/ray-air/doc_code/preprocessors.py +++ b/doc/source/ray-air/doc_code/preprocessors.py @@ -89,18 +89,16 @@ batch_predictor = BatchPredictor.from_checkpoint(checkpoint, XGBoostPredictor) predicted_labels = batch_predictor.predict(test_dataset) -print(predicted_labels.to_pandas()) -# predictions -# 0 0.098437 -# 1 5.604667 -# 2 11.405312 -# 3 15.684700 -# 4 23.990948 -# 5 29.900211 -# 6 34.599442 -# 7 40.696899 -# 8 45.681076 -# 9 50.290031 +predicted_labels.show() +# {'predictions': 0.09843720495700836} +# {'predictions': 5.604666709899902} +# {'predictions': 11.405311584472656} +# {'predictions': 15.684700012207031} +# {'predictions': 23.990947723388672} +# {'predictions': 29.900211334228516} +# {'predictions': 34.59944152832031} +# {'predictions': 40.6968994140625} +# {'predictions': 45.68107604980469} # __predictor_end__ diff --git a/doc/source/ray-air/doc_code/tf_starter.py b/doc/source/ray-air/doc_code/tf_starter.py index a121bedae7f5..360503294fdc 100644 --- a/doc/source/ray-air/doc_code/tf_starter.py +++ b/doc/source/ray-air/doc_code/tf_starter.py @@ -105,7 +105,7 @@ def train_func(config: dict): predictions = batch_predictor.predict(prediction_dataset, dtype=tf.float32) -pandas_predictions = predictions.to_pandas(float("inf")) +print(f"PREDICTIONS") +predictions.show() -print(f"PREDICTIONS\n{pandas_predictions}") # __air_tf_batchpred_end__ diff --git a/doc/source/ray-air/doc_code/xgboost_starter.py b/doc/source/ray-air/doc_code/xgboost_starter.py index 94616fe77582..f2e7a3793d95 100644 --- a/doc/source/ray-air/doc_code/xgboost_starter.py +++ b/doc/source/ray-air/doc_code/xgboost_starter.py @@ -59,17 +59,13 @@ batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, XGBoostPredictor) -predicted_labels = ( - batch_predictor.predict(test_dataset) - .map_batches(lambda df: (df > 0.5).astype(int), batch_format="pandas") - .to_pandas(limit=float("inf")) +predicted_labels = batch_predictor.predict(test_dataset).map_batches( + lambda df: (df > 0.5).astype(int), batch_format="pandas" ) print("PREDICTED LABELS") -print(f"{predicted_labels}") +predicted_labels.show() -shap_values = batch_predictor.predict(test_dataset, pred_contribs=True).to_pandas( - limit=float("inf") -) +shap_values = batch_predictor.predict(test_dataset, pred_contribs=True) print("SHAP VALUES") -print(f"{shap_values}") +shap_values.show() # __air_xgb_batchpred_end__ diff --git a/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb b/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb index 6e0379973ff3..a94a9e4cfc81 100644 --- a/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb +++ b/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb @@ -73,11 +73,105 @@ "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n", + "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "763ad47190b1461285dc3ea3a8177e6f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/26421880 [00:00Current time: 2022-06-14 13:03:47 (running for 00:00:49.63)
Memory usage on this node: 10.3/16.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/3.63 GiB heap, 0.0/1.81 GiB objects
Result logdir: /Users/kai/ray_results/TorchTrainer_2022-06-14_13-02-55
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-06-22 16:29:30 (running for 00:00:56.32)
Memory usage on this node: 7.5/31.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/15.32 GiB heap, 0.0/7.66 GiB objects
Result logdir: /home/ubuntu/ray_results/TorchTrainer_2022-06-22_16-28-33
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
TorchTrainer_8bcc7_00000TERMINATED127.0.0.1:7443 4 42.56151.24926 1655204626 9.67353
TorchTrainer_5c84a_00000TERMINATED172.31.43.110:1481731 4 47.56351.2631 1655915369 11.0948


" ], @@ -793,126 +895,126 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-06-14 13:03:00,221\tWARNING worker.py:1737 -- Warning: The actor TrainTrainable is very large (52 MiB). Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use ray.put() to put large objects in the Ray object store.\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7448)\u001b[0m 2022-06-14 13:03:06,880\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=2]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7449)\u001b[0m 2022-06-14 13:03:06,879\tINFO config.py:71 -- Setting up process group for: env:// [rank=1, world_size=2]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7448)\u001b[0m 2022-06-14 13:03:08,303\tINFO train_loop_utils.py:293 -- Moving model to device: cpu\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7448)\u001b[0m 2022-06-14 13:03:08,303\tINFO train_loop_utils.py:331 -- Wrapping provided model in DDP.\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7449)\u001b[0m 2022-06-14 13:03:08,303\tINFO train_loop_utils.py:293 -- Moving model to device: cpu\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7449)\u001b[0m 2022-06-14 13:03:08,303\tINFO train_loop_utils.py:331 -- Wrapping provided model in DDP.\n" + "2022-06-22 16:28:38,581\tWARNING worker.py:1726 -- Warning: The actor TrainTrainable is very large (52 MiB). Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use ray.put() to put large objects in the Ray object store.\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=1481763)\u001b[0m 2022-06-22 16:28:44,894\tINFO config.py:70 -- Setting up process group for: env:// [rank=0, world_size=2]\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=1481764)\u001b[0m 2022-06-22 16:28:44,891\tINFO config.py:70 -- Setting up process group for: env:// [rank=1, world_size=2]\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=1481763)\u001b[0m 2022-06-22 16:28:46,425\tINFO train_loop_utils.py:293 -- Moving model to device: cpu\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=1481763)\u001b[0m 2022-06-22 16:28:46,425\tINFO train_loop_utils.py:331 -- Wrapping provided model in DDP.\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=1481764)\u001b[0m 2022-06-22 16:28:46,425\tINFO train_loop_utils.py:293 -- Moving model to device: cpu\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=1481764)\u001b[0m 2022-06-22 16:28:46,425\tINFO train_loop_utils.py:331 -- Wrapping provided model in DDP.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Result for TorchTrainer_8bcc7_00000:\n", - " _time_this_iter_s: 9.377830982208252\n", - " _timestamp: 1655204597\n", + "Result for TorchTrainer_5c84a_00000:\n", + " _time_this_iter_s: 10.956670761108398\n", + " _timestamp: 1655915337\n", " _training_iteration: 1\n", - " date: 2022-06-14_13-03-17\n", + " date: 2022-06-22_16-28-57\n", " done: false\n", - " experiment_id: d262dbae86774c4fb809871401db393d\n", - " hostname: Kais-MacBook-Pro.fritz.box\n", + " experiment_id: dd2810ff95f74b1a8390f918b6c122fe\n", + " hostname: ip-172-31-43-110\n", " iterations_since_restore: 1\n", - " loss: 2.1573975238071124\n", - " node_ip: 127.0.0.1\n", - " pid: 7443\n", - " time_since_restore: 13.348651885986328\n", - " time_this_iter_s: 13.348651885986328\n", - " time_total_s: 13.348651885986328\n", - " timestamp: 1655204597\n", + " loss: 2.1705087840936748\n", + " node_ip: 172.31.43.110\n", + " pid: 1481731\n", + " time_since_restore: 14.807097911834717\n", + " time_this_iter_s: 14.807097911834717\n", + " time_total_s: 14.807097911834717\n", + " timestamp: 1655915337\n", " timesteps_since_restore: 0\n", " training_iteration: 1\n", - " trial_id: 8bcc7_00000\n", - " warmup_time: 0.0038008689880371094\n", + " trial_id: 5c84a_00000\n", + " warmup_time: 0.0042934417724609375\n", " \n", - "Result for TorchTrainer_8bcc7_00000:\n", - " _time_this_iter_s: 9.486207962036133\n", - " _timestamp: 1655204607\n", + "Result for TorchTrainer_5c84a_00000:\n", + " _time_this_iter_s: 10.683637142181396\n", + " _timestamp: 1655915347\n", " _training_iteration: 2\n", - " date: 2022-06-14_13-03-27\n", + " date: 2022-06-22_16-29-07\n", " done: false\n", - " experiment_id: d262dbae86774c4fb809871401db393d\n", - " hostname: Kais-MacBook-Pro.fritz.box\n", + " experiment_id: dd2810ff95f74b1a8390f918b6c122fe\n", + " hostname: ip-172-31-43-110\n", " iterations_since_restore: 2\n", - " loss: 1.88913657179304\n", - " node_ip: 127.0.0.1\n", - " pid: 7443\n", - " time_since_restore: 22.830953121185303\n", - " time_this_iter_s: 9.482301235198975\n", - " time_total_s: 22.830953121185303\n", - " timestamp: 1655204607\n", + " loss: 1.918477459318319\n", + " node_ip: 172.31.43.110\n", + " pid: 1481731\n", + " time_since_restore: 25.498638153076172\n", + " time_this_iter_s: 10.691540241241455\n", + " time_total_s: 25.498638153076172\n", + " timestamp: 1655915347\n", " timesteps_since_restore: 0\n", " training_iteration: 2\n", - " trial_id: 8bcc7_00000\n", - " warmup_time: 0.0038008689880371094\n", + " trial_id: 5c84a_00000\n", + " warmup_time: 0.0042934417724609375\n", " \n", - "Result for TorchTrainer_8bcc7_00000:\n", - " _time_this_iter_s: 10.05704402923584\n", - " _timestamp: 1655204617\n", + "Result for TorchTrainer_5c84a_00000:\n", + " _time_this_iter_s: 10.996578216552734\n", + " _timestamp: 1655915358\n", " _training_iteration: 3\n", - " date: 2022-06-14_13-03-37\n", + " date: 2022-06-22_16-29-18\n", " done: false\n", - " experiment_id: d262dbae86774c4fb809871401db393d\n", - " hostname: Kais-MacBook-Pro.fritz.box\n", + " experiment_id: dd2810ff95f74b1a8390f918b6c122fe\n", + " hostname: ip-172-31-43-110\n", " iterations_since_restore: 3\n", - " loss: 1.5208747804544533\n", - " node_ip: 127.0.0.1\n", - " pid: 7443\n", - " time_since_restore: 32.88802194595337\n", - " time_this_iter_s: 10.057068824768066\n", - " time_total_s: 32.88802194595337\n", - " timestamp: 1655204617\n", + " loss: 1.54556822397147\n", + " node_ip: 172.31.43.110\n", + " pid: 1481731\n", + " time_since_restore: 36.48866558074951\n", + " time_this_iter_s: 10.99002742767334\n", + " time_total_s: 36.48866558074951\n", + " timestamp: 1655915358\n", " timesteps_since_restore: 0\n", " training_iteration: 3\n", - " trial_id: 8bcc7_00000\n", - " warmup_time: 0.0038008689880371094\n", + " trial_id: 5c84a_00000\n", + " warmup_time: 0.0042934417724609375\n", " \n", - "Result for TorchTrainer_8bcc7_00000:\n", - " _time_this_iter_s: 9.673533201217651\n", - " _timestamp: 1655204626\n", + "Result for TorchTrainer_5c84a_00000:\n", + " _time_this_iter_s: 11.09483027458191\n", + " _timestamp: 1655915369\n", " _training_iteration: 4\n", - " date: 2022-06-14_13-03-46\n", + " date: 2022-06-22_16-29-29\n", " done: false\n", - " experiment_id: d262dbae86774c4fb809871401db393d\n", - " hostname: Kais-MacBook-Pro.fritz.box\n", + " experiment_id: dd2810ff95f74b1a8390f918b6c122fe\n", + " hostname: ip-172-31-43-110\n", " iterations_since_restore: 4\n", - " loss: 1.2492616913121217\n", - " node_ip: 127.0.0.1\n", - " pid: 7443\n", - " time_since_restore: 42.56152319908142\n", - " time_this_iter_s: 9.673501253128052\n", - " time_total_s: 42.56152319908142\n", - " timestamp: 1655204626\n", + " loss: 1.263096342800529\n", + " node_ip: 172.31.43.110\n", + " pid: 1481731\n", + " time_since_restore: 47.56349587440491\n", + " time_this_iter_s: 11.074830293655396\n", + " time_total_s: 47.56349587440491\n", + " timestamp: 1655915369\n", " timesteps_since_restore: 0\n", " training_iteration: 4\n", - " trial_id: 8bcc7_00000\n", - " warmup_time: 0.0038008689880371094\n", + " trial_id: 5c84a_00000\n", + " warmup_time: 0.0042934417724609375\n", " \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7449)\u001b[0m Done!\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7448)\u001b[0m Done!\n", - "Result for TorchTrainer_8bcc7_00000:\n", - " _time_this_iter_s: 9.673533201217651\n", - " _timestamp: 1655204626\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=1481763)\u001b[0m Done!\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=1481764)\u001b[0m Done!\n", + "Result for TorchTrainer_5c84a_00000:\n", + " _time_this_iter_s: 11.09483027458191\n", + " _timestamp: 1655915369\n", " _training_iteration: 4\n", - " date: 2022-06-14_13-03-46\n", + " date: 2022-06-22_16-29-29\n", " done: true\n", - " experiment_id: d262dbae86774c4fb809871401db393d\n", + " experiment_id: dd2810ff95f74b1a8390f918b6c122fe\n", " experiment_tag: '0'\n", - " hostname: Kais-MacBook-Pro.fritz.box\n", + " hostname: ip-172-31-43-110\n", " iterations_since_restore: 4\n", - " loss: 1.2492616913121217\n", - " node_ip: 127.0.0.1\n", - " pid: 7443\n", - " time_since_restore: 42.56152319908142\n", - " time_this_iter_s: 9.673501253128052\n", - " time_total_s: 42.56152319908142\n", - " timestamp: 1655204626\n", + " loss: 1.263096342800529\n", + " node_ip: 172.31.43.110\n", + " pid: 1481731\n", + " time_since_restore: 47.56349587440491\n", + " time_this_iter_s: 11.074830293655396\n", + " time_total_s: 47.56349587440491\n", + " timestamp: 1655915369\n", " timesteps_since_restore: 0\n", " training_iteration: 4\n", - " trial_id: 8bcc7_00000\n", - " warmup_time: 0.0038008689880371094\n", + " trial_id: 5c84a_00000\n", + " warmup_time: 0.0042934417724609375\n", " \n" ] }, @@ -920,14 +1022,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-06-14 13:03:47,154\tINFO tune.py:742 -- Total run time: 51.21 seconds (49.63 seconds for the tuning loop).\n" + "2022-06-22 16:29:31,024\tINFO tune.py:734 -- Total run time: 57.58 seconds (56.31 seconds for the tuning loop).\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Last result: {'loss': 1.2492616913121217, '_timestamp': 1655204626, '_time_this_iter_s': 9.673533201217651, '_training_iteration': 4, 'time_this_iter_s': 9.673501253128052, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 4, 'trial_id': '8bcc7_00000', 'experiment_id': 'd262dbae86774c4fb809871401db393d', 'date': '2022-06-14_13-03-46', 'timestamp': 1655204626, 'time_total_s': 42.56152319908142, 'pid': 7443, 'hostname': 'Kais-MacBook-Pro.fritz.box', 'node_ip': '127.0.0.1', 'config': {}, 'time_since_restore': 42.56152319908142, 'timesteps_since_restore': 0, 'iterations_since_restore': 4, 'warmup_time': 0.0038008689880371094, 'experiment_tag': '0'}\n" + "Last result: {'loss': 1.263096342800529, '_timestamp': 1655915369, '_time_this_iter_s': 11.09483027458191, '_training_iteration': 4, 'time_this_iter_s': 11.074830293655396, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 4, 'trial_id': '5c84a_00000', 'experiment_id': 'dd2810ff95f74b1a8390f918b6c122fe', 'date': '2022-06-22_16-29-29', 'timestamp': 1655915369, 'time_total_s': 47.56349587440491, 'pid': 1481731, 'hostname': 'ip-172-31-43-110', 'node_ip': '172.31.43.110', 'config': {}, 'time_since_restore': 47.56349587440491, 'timesteps_since_restore': 0, 'iterations_since_restore': 4, 'warmup_time': 0.0042934417724609375, 'experiment_tag': '0'}\n" ] } ], @@ -1051,12 +1153,12 @@ { "data": { "text/html": [ - "== Status ==
Current time: 2022-06-14 13:04:34 (running for 00:00:47.20)
Memory usage on this node: 11.4/16.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/3.63 GiB heap, 0.0/1.81 GiB objects
Result logdir: /Users/kai/ray_results/TorchTrainer_2022-06-14_13-03-47
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-06-22 16:30:41 (running for 00:00:56.46)
Memory usage on this node: 7.2/31.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/15.32 GiB heap, 0.0/7.66 GiB objects
Result logdir: /home/ubuntu/ray_results/TorchTrainer_2022-06-22_16-29-44
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
TorchTrainer_a9dda_00000TERMINATED127.0.0.1:7485 4 41.98631.22261 1655204673 9.94109
TorchTrainer_86514_00000TERMINATED172.31.43.110:1481879 4 53.10381.24844 1655915440 11.4238


" ], @@ -1071,134 +1173,280 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7491)\u001b[0m 2022-06-14 13:03:54,234\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=2]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7492)\u001b[0m 2022-06-14 13:03:54,234\tINFO config.py:71 -- Setting up process group for: env:// [rank=1, world_size=2]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7491)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/torchvision/datasets/mnist.py:498: UserWarning: The given NumPy array is not writeable, and PyTorch does not support non-writeable tensors. This means you can write to the underlying (supposedly non-writeable) NumPy array using the tensor. You may want to copy the array to protect its data or make it writeable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:180.)\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7491)\u001b[0m return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7491)\u001b[0m 2022-06-14 13:03:55,404\tINFO train_loop_utils.py:293 -- Moving model to device: cpu\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7491)\u001b[0m 2022-06-14 13:03:55,404\tINFO train_loop_utils.py:331 -- Wrapping provided model in DDP.\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7492)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/torchvision/datasets/mnist.py:498: UserWarning: The given NumPy array is not writeable, and PyTorch does not support non-writeable tensors. This means you can write to the underlying (supposedly non-writeable) NumPy array using the tensor. You may want to copy the array to protect its data or make it writeable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:180.)\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7492)\u001b[0m return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7492)\u001b[0m 2022-06-14 13:03:55,404\tINFO train_loop_utils.py:293 -- Moving model to device: cpu\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=7492)\u001b[0m 2022-06-14 13:03:55,404\tINFO train_loop_utils.py:331 -- Wrapping provided model in DDP.\n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=1481912)\u001b[0m 2022-06-22 16:29:50,060\tINFO config.py:70 -- Setting up process group for: env:// [rank=1, world_size=2]\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=1481911)\u001b[0m 2022-06-22 16:29:50,039\tINFO config.py:70 -- Setting up process group for: env:// [rank=0, world_size=2]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=1481912)\u001b[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=1481911)\u001b[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=1481912)\u001b[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=1481911)\u001b[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/26421880 [00:00\n" + "Last result: {'loss': 1.2484390530616614, '_timestamp': 1655915440, '_time_this_iter_s': 11.423810482025146, '_training_iteration': 4, 'time_this_iter_s': 11.438615560531616, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 4, 'trial_id': '86514_00000', 'experiment_id': '1e7954bef1c6432785374780fb0da29e', 'date': '2022-06-22_16-30-40', 'timestamp': 1655915440, 'time_total_s': 53.103771924972534, 'pid': 1481879, 'hostname': 'ip-172-31-43-110', 'node_ip': '172.31.43.110', 'config': {}, 'time_since_restore': 53.103771924972534, 'timesteps_since_restore': 0, 'iterations_since_restore': 4, 'warmup_time': 0.004637956619262695, 'experiment_tag': '0'}\n", + "Checkpoint: \n" ] } ], @@ -1393,7 +1641,19 @@ "execution_count": 23, "id": "4d8b0f50", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ImportError", + "evalue": "cannot import name 'BatchPredictor' from 'ray.air' (/home/ubuntu/ray/python/ray/air/__init__.py)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/home/ubuntu/ray/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb Cell 49'\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mray\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mair\u001b[39;00m \u001b[39mimport\u001b[39;00m BatchPredictor\n\u001b[1;32m 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mray\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mair\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpredictors\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mintegrations\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtorch\u001b[39;00m \u001b[39mimport\u001b[39;00m TorchPredictor\n\u001b[1;32m 4\u001b[0m batch_predictor \u001b[39m=\u001b[39m BatchPredictor\u001b[39m.\u001b[39mfrom_checkpoint(result\u001b[39m.\u001b[39mcheckpoint, TorchPredictor, model\u001b[39m=\u001b[39mNeuralNetwork())\n", + "\u001b[0;31mImportError\u001b[0m: cannot import name 'BatchPredictor' from 'ray.air' (/home/ubuntu/ray/python/ray/air/__init__.py)" + ] + } + ], "source": [ "from ray.air import BatchPredictor\n", "from ray.air.predictors.integrations.torch import TorchPredictor\n", @@ -1411,7 +1671,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "id": "8cb0556f", "metadata": {}, "outputs": [], @@ -1431,7 +1691,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "id": "8a823f7a", "metadata": {}, "outputs": [ @@ -1452,12 +1712,12 @@ "id": "41094a55", "metadata": {}, "source": [ - "`results` is another Ray Dataset. We can use `results.to_pandas()` to see our prediction results:" + "`results` is another Ray Dataset. We can use `results.show()` to see our prediction results:" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "d3dce40d", "metadata": {}, "outputs": [ @@ -1558,7 +1818,7 @@ } ], "source": [ - "results.to_pandas()" + "results.show()" ] }, { @@ -1571,7 +1831,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "f17b5c10", "metadata": {}, "outputs": [ @@ -1599,7 +1859,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "207e13b9", "metadata": {}, "outputs": [], @@ -1618,7 +1878,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "id": "2b2decc6", "metadata": {}, "outputs": [ @@ -1719,7 +1979,7 @@ } ], "source": [ - "merged.to_pandas()" + "merged.show()" ] }, { @@ -1744,7 +2004,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3.8.10 ('venv': venv)", "language": "python", "name": "python3" }, @@ -1758,7 +2018,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f" + } } }, "nbformat": 4, diff --git a/doc/source/ray-air/examples/huggingface_text_classification.ipynb b/doc/source/ray-air/examples/huggingface_text_classification.ipynb index e75ed1e95cdb..529e5f236a99 100644 --- a/doc/source/ray-air/examples/huggingface_text_classification.ipynb +++ b/doc/source/ray-air/examples/huggingface_text_classification.ipynb @@ -1384,8 +1384,7 @@ ")\n", "data = ray.data.from_pandas(pd.DataFrame(sentences, columns=[\"sentence\"]))\n", "prediction = predictor.predict(data)\n", - "prediction = prediction.to_pandas()\n", - "prediction" + "prediction.show()" ] }, { diff --git a/doc/source/ray-air/examples/lightgbm_example.ipynb b/doc/source/ray-air/examples/lightgbm_example.ipynb index c59ba7262f61..4083be1dfe80 100644 --- a/doc/source/ray-air/examples/lightgbm_example.ipynb +++ b/doc/source/ray-air/examples/lightgbm_example.ipynb @@ -37,17 +37,13 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "id": "102ef1ac", "metadata": {}, "outputs": [], "source": [ - "import argparse\n", - "import math\n", "from typing import Tuple\n", "\n", - "import pandas as pd\n", - "\n", "import ray\n", "from ray.train.batch_predictor import BatchPredictor\n", "from ray.train.lightgbm import LightGBMPredictor\n", @@ -56,9 +52,8 @@ "from ray.train.lightgbm import LightGBMTrainer\n", "from ray.data.dataset import Dataset\n", "from ray.air.result import Result\n", - "from ray.data.preprocessors import StandardScaler\n", - "from sklearn.datasets import load_breast_cancer\n", - "from sklearn.model_selection import train_test_split" + "from ray.air.util.datasets import train_test_split\n", + "from ray.data.preprocessors import StandardScaler" ] }, { @@ -71,24 +66,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 13, "id": "f1f35cd7", "metadata": {}, "outputs": [], "source": [ "def prepare_data() -> Tuple[Dataset, Dataset, Dataset]:\n", - " data_raw = load_breast_cancer()\n", - " dataset_df = pd.DataFrame(data_raw[\"data\"], columns=data_raw[\"feature_names\"])\n", - " dataset_df[\"target\"] = data_raw[\"target\"]\n", - " # add a random categorical column\n", - " num_samples = len(dataset_df)\n", - " dataset_df[\"categorical_column\"] = pd.Series(\n", - " ([\"A\", \"B\"] * math.ceil(num_samples / 2))[:num_samples]\n", - " )\n", - " train_df, test_df = train_test_split(dataset_df, test_size=0.3)\n", - " train_dataset = ray.data.from_pandas(train_df)\n", - " valid_dataset = ray.data.from_pandas(test_df)\n", - " test_dataset = ray.data.from_pandas(test_df.drop(\"target\", axis=1))\n", + " dataset = ray.data.read_csv(\"s3://air-example-data/breast_cancer_with_categorical.csv\")\n", + " train_dataset, valid_dataset = train_test_split(dataset, test_size=0.3)\n", + " test_dataset = valid_dataset.map_batches(lambda df: df.drop(\"target\", axis=1), batch_format=\"pandas\")\n", " return train_dataset, valid_dataset, test_dataset" ] }, @@ -102,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 14, "id": "fefcbc8a", "metadata": {}, "outputs": [], @@ -150,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 15, "id": "3f1d0c19", "metadata": {}, "outputs": [], @@ -164,14 +150,13 @@ " predicted_labels = (\n", " batch_predictor.predict(test_dataset)\n", " .map_batches(lambda df: (df > 0.5).astype(int), batch_format=\"pandas\")\n", - " .to_pandas(limit=float(\"inf\"))\n", " )\n", - " print(f\"PREDICTED LABELS\\n{predicted_labels}\")\n", + " print(f\"PREDICTED LABELS\")\n", + " predicted_labels.show()\n", "\n", - " shap_values = batch_predictor.predict(test_dataset, pred_contrib=True).to_pandas(\n", - " limit=float(\"inf\")\n", - " )\n", - " print(f\"SHAP VALUES\\n{shap_values}\")" + " shap_values = batch_predictor.predict(test_dataset, pred_contrib=True)\n", + " print(f\"SHAP VALUES\")\n", + " shap_values.show()" ] }, { @@ -184,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 16, "id": "8244ff3c", "metadata": {}, "outputs": [ @@ -192,18 +177,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-05-19 11:18:27,652\tINFO services.py:1483 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" + "2022-06-22 17:26:41,346\tWARNING read_api.py:260 -- The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", + "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 46.26it/s]\n" ] }, { "data": { "text/html": [ - "== Status ==
Current time: 2022-05-19 11:18:47 (running for 00:00:15.19)
Memory usage on this node: 10.2/16.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/4.86 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/kai/ray_results/LightGBMTrainer_2022-05-19_11-18-30
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-06-22 17:26:56 (running for 00:00:14.07)
Memory usage on this node: 10.0/31.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/13.32 GiB heap, 0.0/6.66 GiB objects
Result logdir: /home/ubuntu/ray_results/LightGBMTrainer_2022-06-22_17-26-41
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc iter total time (s) train-binary_logloss train-binary_error valid-binary_logloss
Trial name status loc iter total time (s) train-binary_logloss train-binary_error valid-binary_logloss
LightGBMTrainer_07bf3_00000TERMINATED127.0.0.1:9219 100 10.4622 0.000197893 0 0.289033
LightGBMTrainer_7b049_00000TERMINATED172.31.43.110:1491578 100 10.9726 0.000574522 0 0.171898


" ], @@ -219,96 +205,139 @@ "output_type": "stream", "text": [ "UserWarning: cpus_per_actor is set to less than 2. Distributed LightGBM needs at least 2 CPUs per actor to train efficiently. This may lead to a degradation of performance during training.\n", - "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 11:18:32,940\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=51840 --object-store-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=56443 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58688 --redis-password=5241590000000000 --startup-token=16 --runtime-env-hash=-2010331134\n", - "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 11:18:36,664\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=51840 --object-store-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=56443 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58688 --redis-password=5241590000000000 --startup-token=17 --runtime-env-hash=-2010331069\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=9219)\u001b[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=9219)\u001b[0m UserWarning: Dataset 'valid' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=9219)\u001b[0m UserWarning: cpus_per_actor is set to less than 2. Distributed LightGBM needs at least 2 CPUs per actor to train efficiently. This may lead to a degradation of performance during training.\n", - "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 11:18:38,980\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=51840 --object-store-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=56443 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58688 --redis-password=5241590000000000 --startup-token=18 --runtime-env-hash=-2010331069\n", - "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 11:18:38,997\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=51840 --object-store-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=56443 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58688 --redis-password=5241590000000000 --startup-token=19 --runtime-env-hash=-2010331069\n", - "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 11:18:39,091\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=51840 --object-store-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=56443 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58688 --redis-password=5241590000000000 --startup-token=21 --runtime-env-hash=-2010331134\n", - "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 11:18:39,095\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=51840 --object-store-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=56443 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58688 --redis-password=5241590000000000 --startup-token=20 --runtime-env-hash=-2010331134\n", - "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 11:18:39,107\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=51840 --object-store-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=56443 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58688 --redis-password=5241590000000000 --startup-token=23 --runtime-env-hash=-2010331134\n", - "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 11:18:39,107\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=51840 --object-store-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=56443 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58688 --redis-password=5241590000000000 --startup-token=22 --runtime-env-hash=-2010331134\n" + "\u001b[2m\u001b[36m(pid=1491578)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/dask/dataframe/backends.py:181: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1491578)\u001b[0m _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)\n", + "\u001b[2m\u001b[36m(pid=1491578)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/dask/dataframe/backends.py:181: FutureWarning: pandas.Float64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1491578)\u001b[0m _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)\n", + "\u001b[2m\u001b[36m(pid=1491578)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/dask/dataframe/backends.py:181: FutureWarning: pandas.UInt64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1491578)\u001b[0m _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)\n", + "\u001b[2m\u001b[36m(pid=1491578)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1491578)\u001b[0m from pandas import MultiIndex, Int64Index\n", + "\u001b[2m\u001b[36m(LightGBMTrainer pid=1491578)\u001b[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001b[2m\u001b[36m(LightGBMTrainer pid=1491578)\u001b[0m UserWarning: Dataset 'valid' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001b[2m\u001b[36m(LightGBMTrainer pid=1491578)\u001b[0m UserWarning: cpus_per_actor is set to less than 2. Distributed LightGBM needs at least 2 CPUs per actor to train efficiently. This may lead to a degradation of performance during training.\n", + "\u001b[2m\u001b[36m(pid=1491651)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1491651)\u001b[0m from pandas import MultiIndex, Int64Index\n", + "\u001b[2m\u001b[36m(pid=1491651)\u001b[0m FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1491651)\u001b[0m FutureWarning: pandas.Float64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1491651)\u001b[0m FutureWarning: pandas.UInt64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1491653)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/dask/dataframe/backends.py:181: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1491653)\u001b[0m _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)\n", + "\u001b[2m\u001b[36m(pid=1491653)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/dask/dataframe/backends.py:181: FutureWarning: pandas.Float64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1491653)\u001b[0m _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)\n", + "\u001b[2m\u001b[36m(pid=1491653)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/dask/dataframe/backends.py:181: FutureWarning: pandas.UInt64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1491653)\u001b[0m _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)\n", + "\u001b[2m\u001b[36m(pid=1491653)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1491653)\u001b[0m from pandas import MultiIndex, Int64Index\n", + "\u001b[2m\u001b[36m(pid=1491652)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/dask/dataframe/backends.py:181: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1491652)\u001b[0m _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)\n", + "\u001b[2m\u001b[36m(pid=1491652)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/dask/dataframe/backends.py:181: FutureWarning: pandas.Float64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1491652)\u001b[0m _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)\n", + "\u001b[2m\u001b[36m(pid=1491652)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/dask/dataframe/backends.py:181: FutureWarning: pandas.UInt64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1491652)\u001b[0m _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)\n", + "\u001b[2m\u001b[36m(pid=1491652)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1491652)\u001b[0m from pandas import MultiIndex, Int64Index\n", + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491653)\u001b[0m 2022-06-22 17:26:50,509\tWARNING __init__.py:190 -- DeprecationWarning: `ray.worker.get_resource_ids` is a private attribute and access will be removed in a future Ray version.\n", + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491652)\u001b[0m 2022-06-22 17:26:50,658\tWARNING __init__.py:190 -- DeprecationWarning: `ray.worker.get_resource_ids` is a private attribute and access will be removed in a future Ray version.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491653)\u001b[0m [LightGBM] [Info] Trying to bind port 59039...\n", + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491653)\u001b[0m [LightGBM] [Info] Binding port 59039 succeeded\n", + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491653)\u001b[0m [LightGBM] [Info] Listening...\n", + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491652)\u001b[0m [LightGBM] [Info] Trying to bind port 46955...\n", + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491652)\u001b[0m [LightGBM] [Info] Binding port 46955 succeeded\n", + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491652)\u001b[0m [LightGBM] [Info] Listening...\n", + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491652)\u001b[0m [LightGBM] [Warning] Connecting to rank 1 failed, waiting for 200 milliseconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491653)\u001b[0m UserWarning: Overriding the parameters from Reference Dataset.\n", + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491653)\u001b[0m UserWarning: categorical_column in param dict is overridden.\n", + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491652)\u001b[0m UserWarning: Overriding the parameters from Reference Dataset.\n", + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491652)\u001b[0m UserWarning: categorical_column in param dict is overridden.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9242)\u001b[0m [LightGBM] [Info] Trying to bind port 52127...\n", - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9242)\u001b[0m [LightGBM] [Info] Binding port 52127 succeeded\n", - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9242)\u001b[0m [LightGBM] [Info] Listening...\n", - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9242)\u001b[0m [LightGBM] [Warning] Connecting to rank 1 failed, waiting for 200 milliseconds\n", - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9243)\u001b[0m [LightGBM] [Info] Trying to bind port 52128...\n", - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9243)\u001b[0m [LightGBM] [Info] Binding port 52128 succeeded\n", - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9243)\u001b[0m [LightGBM] [Info] Listening...\n", - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9242)\u001b[0m [LightGBM] [Info] Connected to rank 1\n", - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9242)\u001b[0m [LightGBM] [Info] Local rank: 0, total number of machines: 2\n", - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9242)\u001b[0m [LightGBM] [Warning] num_threads is set=1, n_jobs=-1 will be ignored. Current value: num_threads=1\n", - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9243)\u001b[0m [LightGBM] [Info] Connected to rank 0\n", - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9243)\u001b[0m [LightGBM] [Info] Local rank: 1, total number of machines: 2\n", - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9243)\u001b[0m [LightGBM] [Warning] num_threads is set=1, n_jobs=-1 will be ignored. Current value: num_threads=1\n" + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491653)\u001b[0m [LightGBM] [Info] Connected to rank 0\n", + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491653)\u001b[0m [LightGBM] [Info] Local rank: 1, total number of machines: 2\n", + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491653)\u001b[0m [LightGBM] [Warning] num_threads is set=1, n_jobs=-1 will be ignored. Current value: num_threads=1\n", + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491652)\u001b[0m [LightGBM] [Info] Connected to rank 1\n", + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491652)\u001b[0m [LightGBM] [Info] Local rank: 0, total number of machines: 2\n", + "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=1491652)\u001b[0m [LightGBM] [Warning] num_threads is set=1, n_jobs=-1 will be ignored. Current value: num_threads=1\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9243)\u001b[0m UserWarning: Overriding the parameters from Reference Dataset.\n", - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9243)\u001b[0m UserWarning: categorical_column in param dict is overridden.\n", - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9242)\u001b[0m UserWarning: Overriding the parameters from Reference Dataset.\n", - "\u001b[2m\u001b[36m(_RemoteRayLightGBMActor pid=9242)\u001b[0m UserWarning: categorical_column in param dict is overridden.\n" + "\u001b[2m\u001b[36m(_QueueActor pid=1491650)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/dask/dataframe/backends.py:181: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(_QueueActor pid=1491650)\u001b[0m _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)\n", + "\u001b[2m\u001b[36m(_QueueActor pid=1491650)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/dask/dataframe/backends.py:181: FutureWarning: pandas.Float64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(_QueueActor pid=1491650)\u001b[0m _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)\n", + "\u001b[2m\u001b[36m(_QueueActor pid=1491650)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/dask/dataframe/backends.py:181: FutureWarning: pandas.UInt64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(_QueueActor pid=1491650)\u001b[0m _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)\n", + "\u001b[2m\u001b[36m(_QueueActor pid=1491650)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(_QueueActor pid=1491650)\u001b[0m from pandas import MultiIndex, Int64Index\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Result for LightGBMTrainer_07bf3_00000:\n", - " date: 2022-05-19_11-18-44\n", + "Result for LightGBMTrainer_7b049_00000:\n", + " date: 2022-06-22_17-26-53\n", " done: false\n", - " experiment_id: 1d3640d1c3a743aeae7274a0ce253107\n", - " hostname: Kais-MacBook-Pro.local\n", + " experiment_id: b4a87c26a7604a43baf895755d4f16b3\n", + " hostname: ip-172-31-43-110\n", " iterations_since_restore: 1\n", - " node_ip: 127.0.0.1\n", - " pid: 9219\n", + " node_ip: 172.31.43.110\n", + " pid: 1491578\n", " should_checkpoint: true\n", - " time_since_restore: 8.41084909439087\n", - " time_this_iter_s: 8.41084909439087\n", - " time_total_s: 8.41084909439087\n", - " timestamp: 1652955524\n", + " time_since_restore: 8.369545459747314\n", + " time_this_iter_s: 8.369545459747314\n", + " time_total_s: 8.369545459747314\n", + " timestamp: 1655918813\n", " timesteps_since_restore: 0\n", - " train-binary_error: 0.36683417085427134\n", - " train-binary_logloss: 0.5804693664919086\n", + " train-binary_error: 0.5175879396984925\n", + " train-binary_logloss: 0.6302848981539763\n", " training_iteration: 1\n", - " trial_id: 07bf3_00000\n", - " valid-binary_error: 0.36470588235294116\n", - " valid-binary_logloss: 0.5868466345817073\n", - " warmup_time: 0.004106044769287109\n", + " trial_id: 7b049_00000\n", + " valid-binary_error: 0.2\n", + " valid-binary_logloss: 0.558752017793943\n", + " warmup_time: 0.008721590042114258\n", " \n", - "Result for LightGBMTrainer_07bf3_00000:\n", - " date: 2022-05-19_11-18-46\n", + "Result for LightGBMTrainer_7b049_00000:\n", + " date: 2022-06-22_17-26-56\n", " done: true\n", - " experiment_id: 1d3640d1c3a743aeae7274a0ce253107\n", + " experiment_id: b4a87c26a7604a43baf895755d4f16b3\n", " experiment_tag: '0'\n", - " hostname: Kais-MacBook-Pro.local\n", + " hostname: ip-172-31-43-110\n", " iterations_since_restore: 100\n", - " node_ip: 127.0.0.1\n", - " pid: 9219\n", + " node_ip: 172.31.43.110\n", + " pid: 1491578\n", " should_checkpoint: true\n", - " time_since_restore: 10.46218204498291\n", - " time_this_iter_s: 0.025421857833862305\n", - " time_total_s: 10.46218204498291\n", - " timestamp: 1652955526\n", + " time_since_restore: 10.972588300704956\n", + " time_this_iter_s: 0.027977466583251953\n", + " time_total_s: 10.972588300704956\n", + " timestamp: 1655918816\n", " timesteps_since_restore: 0\n", " train-binary_error: 0.0\n", - " train-binary_logloss: 0.00019789273681613937\n", + " train-binary_logloss: 0.0005745220956391456\n", " training_iteration: 100\n", - " trial_id: 07bf3_00000\n", + " trial_id: 7b049_00000\n", " valid-binary_error: 0.058823529411764705\n", - " valid-binary_logloss: 0.2890328865004496\n", - " warmup_time: 0.004106044769287109\n", + " valid-binary_logloss: 0.17189847605331432\n", + " warmup_time: 0.008721590042114258\n", " \n" ] }, @@ -316,14 +345,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-05-19 11:18:47,218\tINFO tune.py:753 -- Total run time: 16.87 seconds (15.17 seconds for the tuning loop).\n" + "2022-06-22 17:26:56,406\tINFO tune.py:734 -- Total run time: 14.73 seconds (14.06 seconds for the tuning loop).\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "{'train-binary_logloss': 0.00019789273681613937, 'train-binary_error': 0.0, 'valid-binary_logloss': 0.2890328865004496, 'valid-binary_error': 0.058823529411764705, 'time_this_iter_s': 0.025421857833862305, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 100, 'trial_id': '07bf3_00000', 'experiment_id': '1d3640d1c3a743aeae7274a0ce253107', 'date': '2022-05-19_11-18-46', 'timestamp': 1652955526, 'time_total_s': 10.46218204498291, 'pid': 9219, 'hostname': 'Kais-MacBook-Pro.local', 'node_ip': '127.0.0.1', 'config': {}, 'time_since_restore': 10.46218204498291, 'timesteps_since_restore': 0, 'iterations_since_restore': 100, 'warmup_time': 0.004106044769287109, 'experiment_tag': '0'}\n" + "{'train-binary_logloss': 0.0005745220956391456, 'train-binary_error': 0.0, 'valid-binary_logloss': 0.17189847605331432, 'valid-binary_error': 0.058823529411764705, 'time_this_iter_s': 0.027977466583251953, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 100, 'trial_id': '7b049_00000', 'experiment_id': 'b4a87c26a7604a43baf895755d4f16b3', 'date': '2022-06-22_17-26-56', 'timestamp': 1655918816, 'time_total_s': 10.972588300704956, 'pid': 1491578, 'hostname': 'ip-172-31-43-110', 'node_ip': '172.31.43.110', 'config': {}, 'time_since_restore': 10.972588300704956, 'timesteps_since_restore': 0, 'iterations_since_restore': 100, 'warmup_time': 0.008721590042114258, 'experiment_tag': '0'}\n" ] } ], @@ -341,7 +370,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 17, "id": "871c9be6", "metadata": {}, "outputs": [ @@ -349,8 +378,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "Map Progress (1 actors 1 pending): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00, 2.21s/it]\n", - "Map_Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 93.04it/s]\n" + "2022-06-22 17:26:57,517\tWARNING read_api.py:260 -- The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", + "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 50.96it/s]\n", + "Map_Batches: 0%| | 0/1 [00:00 Tuple[Dataset, Dataset, Dataset]:\n", - " data_raw = load_breast_cancer()\n", - " dataset_df = pd.DataFrame(data_raw[\"data\"], columns=data_raw[\"feature_names\"])\n", - " dataset_df[\"target\"] = data_raw[\"target\"]\n", - " # add a random categorical column\n", - " num_samples = len(dataset_df)\n", - " dataset_df[\"categorical_column\"] = pd.Series(\n", - " ([\"A\", \"B\"] * math.ceil(num_samples / 2))[:num_samples]\n", - " )\n", - " train_df, test_df = train_test_split(dataset_df, test_size=0.3)\n", - " train_dataset = ray.data.from_pandas(train_df)\n", - " valid_dataset = ray.data.from_pandas(test_df)\n", - " test_dataset = ray.data.from_pandas(test_df.drop(\"target\", axis=1))\n", + " dataset = ray.data.read_csv(\"s3://air-example-data/breast_cancer_with_categorical.csv\")\n", + " train_dataset, valid_dataset = train_test_split(dataset, test_size=0.3)\n", + " test_dataset = valid_dataset.map_batches(lambda df: df.drop(\"target\", axis=1), batch_format=\"pandas\")\n", " return train_dataset, valid_dataset, test_dataset" ] }, @@ -112,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 11, "id": "0fd39e42", "metadata": {}, "outputs": [], @@ -162,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 12, "id": "59eeadd8", "metadata": {}, "outputs": [], @@ -180,9 +166,9 @@ " num_gpus_per_worker=int(use_gpu),\n", " )\n", " .map_batches(lambda df: (df > 0.5).astype(int), batch_format=\"pandas\")\n", - " .to_pandas(limit=float(\"inf\"))\n", " )\n", - " print(f\"PREDICTED LABELS\\n{predicted_labels}\")" + " print(f\"PREDICTED LABELS\")\n", + " predicted_labels.show()" ] }, { @@ -195,7 +181,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 13, "id": "43f9170a", "metadata": {}, "outputs": [ @@ -203,18 +189,20 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-05-19 11:56:26,664\tINFO services.py:1483 -- View the Ray dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8266\u001B[39m\u001B[22m\n" + "2022-06-22 17:27:37,741\tINFO services.py:1477 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8269\u001b[39m\u001b[22m\n", + "2022-06-22 17:27:39,822\tWARNING read_api.py:260 -- The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", + "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 44.05it/s]\n" ] }, { "data": { "text/html": [ - "== Status ==
Current time: 2022-05-19 11:56:51 (running for 00:00:20.56)
Memory usage on this node: 10.1/16.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/4.64 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/kai/ray_results/SklearnTrainer_2022-05-19_11-56-29
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-06-22 17:27:59 (running for 00:00:18.31)
Memory usage on this node: 10.7/31.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/12.9 GiB heap, 0.0/6.45 GiB objects
Result logdir: /home/ubuntu/ray_results/SklearnTrainer_2022-06-22_17-27-40
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc iter total time (s) fit_time
Trial name status loc iter total time (s) fit_time
SklearnTrainer_564d9_00000TERMINATED127.0.0.1:12221 1 17.1905 2.48662
SklearnTrainer_9dec8_00000TERMINATED172.31.43.110:1492629 1 15.6842 2.31571


" ], @@ -229,127 +217,107 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:56:31,837\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=59341 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58305 --redis-password=5241590000000000 --startup-token=16 --runtime-env-hash=-2010331134\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:56:34,848\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=59341 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58305 --redis-password=5241590000000000 --startup-token=17 --runtime-env-hash=-2010331069\n", - "\u001B[2m\u001B[36m(TrainTrainable pid=12221)\u001B[0m 2022-05-19 11:56:36,385\tWARNING pool.py:591 -- The 'context' argument is not supported using ray. Please refer to the documentation for how to control ray initialization.\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:56:37,344\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=59341 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58305 --redis-password=5241590000000000 --startup-token=19 --runtime-env-hash=-2010331134\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:56:37,344\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=59341 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58305 --redis-password=5241590000000000 --startup-token=18 --runtime-env-hash=-2010331134\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:56:39,843\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=59341 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58305 --redis-password=5241590000000000 --startup-token=21 --runtime-env-hash=-2010331134\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:56:39,845\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=59341 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58305 --redis-password=5241590000000000 --startup-token=20 --runtime-env-hash=-2010331134\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:56:42,324\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=59341 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58305 --redis-password=5241590000000000 --startup-token=23 --runtime-env-hash=-2010331134\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:56:42,324\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=59341 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58305 --redis-password=5241590000000000 --startup-token=22 --runtime-env-hash=-2010331134\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:56:44,748\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=59341 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58305 --redis-password=5241590000000000 --startup-token=24 --runtime-env-hash=-2010331134\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:56:44,749\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=59341 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58305 --redis-password=5241590000000000 --startup-token=25 --runtime-env-hash=-2010331134\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:56:47,193\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=59341 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58305 --redis-password=5241590000000000 --startup-token=27 --runtime-env-hash=-2010331134\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:56:47,193\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=59341 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58305 --redis-password=5241590000000000 --startup-token=26 --runtime-env-hash=-2010331134\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:56:49,612\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=59341 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58305 --redis-password=5241590000000000 --startup-token=28 --runtime-env-hash=-2010331134\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:56:49,612\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=59341 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58305 --redis-password=5241590000000000 --startup-token=29 --runtime-env-hash=-2010331134\n" + "\u001b[2m\u001b[36m(SklearnTrainer pid=1492629)\u001b[0m 2022-06-22 17:27:45,647\tWARNING pool.py:591 -- The 'context' argument is not supported using ray. Please refer to the documentation for how to control ray initialization.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Result for SklearnTrainer_564d9_00000:\n", + "Result for SklearnTrainer_9dec8_00000:\n", " cv:\n", " fit_time:\n", - " - 2.402121067047119\n", - " - 2.312839984893799\n", - " - 2.3265390396118164\n", - " - 2.325679063796997\n", - " - 2.3602960109710693\n", - " fit_time_mean: 2.34549503326416\n", - " fit_time_std: 0.032384969255539235\n", + " - 2.221003770828247\n", + " - 2.215489387512207\n", + " - 2.2075674533843994\n", + " - 2.222351312637329\n", + " - 2.312389612197876\n", + " fit_time_mean: 2.235760307312012\n", + " fit_time_std: 0.03866614559685742\n", " score_time:\n", - " - 0.10820889472961426\n", - " - 0.10829401016235352\n", - " - 0.10703587532043457\n", - " - 0.10512709617614746\n", - " - 0.10840892791748047\n", - " score_time_mean: 0.10741496086120605\n", - " score_time_std: 0.0012465199424455708\n", + " - 0.022464990615844727\n", + " - 0.0230865478515625\n", + " - 0.02564835548400879\n", + " - 0.029137849807739258\n", + " - 0.021221637725830078\n", + " score_time_mean: 0.02431187629699707\n", + " score_time_std: 0.0028120522003997595\n", " test_score:\n", " - 0.9625\n", - " - 0.8875\n", + " - 0.9125\n", + " - 0.9875\n", " - 1.0\n", - " - 0.9493670886075949\n", - " - 0.9240506329113924\n", - " test_score_mean: 0.9446835443037976\n", - " test_score_std: 0.03766947497186954\n", - " date: 2022-05-19_11-56-51\n", + " - 0.9367088607594937\n", + " test_score_mean: 0.9598417721518986\n", + " test_score_std: 0.032128186960552516\n", + " date: 2022-06-22_17-27-59\n", " done: false\n", - " experiment_id: 200cbc1e2b84434882732d2053ec45c2\n", - " fit_time: 2.4866180419921875\n", - " hostname: Kais-MacBook-Pro.local\n", + " experiment_id: f8215019c10e4a81ba2187c38e875365\n", + " fit_time: 2.3157050609588623\n", + " hostname: ip-172-31-43-110\n", " iterations_since_restore: 1\n", - " node_ip: 127.0.0.1\n", - " pid: 12221\n", + " node_ip: 172.31.43.110\n", + " pid: 1492629\n", " should_checkpoint: true\n", - " time_since_restore: 17.19045615196228\n", - " time_this_iter_s: 17.19045615196228\n", - " time_total_s: 17.19045615196228\n", - " timestamp: 1652957811\n", + " time_since_restore: 15.684244871139526\n", + " time_this_iter_s: 15.684244871139526\n", + " time_total_s: 15.684244871139526\n", + " timestamp: 1655918879\n", " timesteps_since_restore: 0\n", " training_iteration: 1\n", - " trial_id: 564d9_00000\n", + " trial_id: 9dec8_00000\n", " valid:\n", - " score_time: 0.10993409156799316\n", - " test_score: 0.9473684210526315\n", - " warmup_time: 0.0039539337158203125\n", + " score_time: 0.03549623489379883\n", + " test_score: 0.9532163742690059\n", + " warmup_time: 0.0057866573333740234\n", " \n", - "Result for SklearnTrainer_564d9_00000:\n", + "Result for SklearnTrainer_9dec8_00000:\n", " cv:\n", " fit_time:\n", - " - 2.402121067047119\n", - " - 2.312839984893799\n", - " - 2.3265390396118164\n", - " - 2.325679063796997\n", - " - 2.3602960109710693\n", - " fit_time_mean: 2.34549503326416\n", - " fit_time_std: 0.032384969255539235\n", + " - 2.221003770828247\n", + " - 2.215489387512207\n", + " - 2.2075674533843994\n", + " - 2.222351312637329\n", + " - 2.312389612197876\n", + " fit_time_mean: 2.235760307312012\n", + " fit_time_std: 0.03866614559685742\n", " score_time:\n", - " - 0.10820889472961426\n", - " - 0.10829401016235352\n", - " - 0.10703587532043457\n", - " - 0.10512709617614746\n", - " - 0.10840892791748047\n", - " score_time_mean: 0.10741496086120605\n", - " score_time_std: 0.0012465199424455708\n", + " - 0.022464990615844727\n", + " - 0.0230865478515625\n", + " - 0.02564835548400879\n", + " - 0.029137849807739258\n", + " - 0.021221637725830078\n", + " score_time_mean: 0.02431187629699707\n", + " score_time_std: 0.0028120522003997595\n", " test_score:\n", " - 0.9625\n", - " - 0.8875\n", + " - 0.9125\n", + " - 0.9875\n", " - 1.0\n", - " - 0.9493670886075949\n", - " - 0.9240506329113924\n", - " test_score_mean: 0.9446835443037976\n", - " test_score_std: 0.03766947497186954\n", - " date: 2022-05-19_11-56-51\n", + " - 0.9367088607594937\n", + " test_score_mean: 0.9598417721518986\n", + " test_score_std: 0.032128186960552516\n", + " date: 2022-06-22_17-27-59\n", " done: true\n", - " experiment_id: 200cbc1e2b84434882732d2053ec45c2\n", + " experiment_id: f8215019c10e4a81ba2187c38e875365\n", " experiment_tag: '0'\n", - " fit_time: 2.4866180419921875\n", - " hostname: Kais-MacBook-Pro.local\n", + " fit_time: 2.3157050609588623\n", + " hostname: ip-172-31-43-110\n", " iterations_since_restore: 1\n", - " node_ip: 127.0.0.1\n", - " pid: 12221\n", + " node_ip: 172.31.43.110\n", + " pid: 1492629\n", " should_checkpoint: true\n", - " time_since_restore: 17.19045615196228\n", - " time_this_iter_s: 17.19045615196228\n", - " time_total_s: 17.19045615196228\n", - " timestamp: 1652957811\n", + " time_since_restore: 15.684244871139526\n", + " time_this_iter_s: 15.684244871139526\n", + " time_total_s: 15.684244871139526\n", + " timestamp: 1655918879\n", " timesteps_since_restore: 0\n", " training_iteration: 1\n", - " trial_id: 564d9_00000\n", + " trial_id: 9dec8_00000\n", " valid:\n", - " score_time: 0.10993409156799316\n", - " test_score: 0.9473684210526315\n", - " warmup_time: 0.0039539337158203125\n", + " score_time: 0.03549623489379883\n", + " test_score: 0.9532163742690059\n", + " warmup_time: 0.0057866573333740234\n", " \n" ] }, @@ -357,28 +325,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001B[2m\u001B[36m(TrainTrainable pid=12221)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/joblib/externals/loky/backend/resource_tracker.py:320: UserWarning: resource_tracker: There appear to be 6 leaked folder objects to clean up at shutdown\n", - "\u001B[2m\u001B[36m(TrainTrainable pid=12221)\u001B[0m (len(rtype_registry), rtype))\n", - "\u001B[2m\u001B[36m(TrainTrainable pid=12221)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/joblib/externals/loky/backend/resource_tracker.py:333: UserWarning: resource_tracker: /var/folders/b2/0_91bd757rz02lrmr920v0gw0000gn/T/joblib_memmapping_folder_12221_5f6216ae1e6a46ba9d419e794af5d6af_23c04cd6260143c0ac6f5dbe654ee805: FileNotFoundError(2, 'No such file or directory')\n", - "\u001B[2m\u001B[36m(TrainTrainable pid=12221)\u001B[0m warnings.warn('resource_tracker: %s: %r' % (name, e))\n", - "\u001B[2m\u001B[36m(TrainTrainable pid=12221)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/joblib/externals/loky/backend/resource_tracker.py:333: UserWarning: resource_tracker: /var/folders/b2/0_91bd757rz02lrmr920v0gw0000gn/T/joblib_memmapping_folder_12221_a9bc9a60f53a487e91b551aaace31955_1d562711c03e42ff9f97698134ab33f7: FileNotFoundError(2, 'No such file or directory')\n", - "\u001B[2m\u001B[36m(TrainTrainable pid=12221)\u001B[0m warnings.warn('resource_tracker: %s: %r' % (name, e))\n", - "\u001B[2m\u001B[36m(TrainTrainable pid=12221)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/joblib/externals/loky/backend/resource_tracker.py:333: UserWarning: resource_tracker: /var/folders/b2/0_91bd757rz02lrmr920v0gw0000gn/T/joblib_memmapping_folder_12221_4130f87b8a7a41d4bb44d3ff87c47d73_d3df48add59547d89737f42c03172fa5: FileNotFoundError(2, 'No such file or directory')\n", - "\u001B[2m\u001B[36m(TrainTrainable pid=12221)\u001B[0m warnings.warn('resource_tracker: %s: %r' % (name, e))\n", - "\u001B[2m\u001B[36m(TrainTrainable pid=12221)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/joblib/externals/loky/backend/resource_tracker.py:333: UserWarning: resource_tracker: /var/folders/b2/0_91bd757rz02lrmr920v0gw0000gn/T/joblib_memmapping_folder_12221_28d4366efda3422c93d8ad3a8d66986e_9d1ab8d6a92146829caf48550752190d: FileNotFoundError(2, 'No such file or directory')\n", - "\u001B[2m\u001B[36m(TrainTrainable pid=12221)\u001B[0m warnings.warn('resource_tracker: %s: %r' % (name, e))\n", - "\u001B[2m\u001B[36m(TrainTrainable pid=12221)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/joblib/externals/loky/backend/resource_tracker.py:333: UserWarning: resource_tracker: /var/folders/b2/0_91bd757rz02lrmr920v0gw0000gn/T/joblib_memmapping_folder_12221_4dc9b4c717294776b8162f30cc5eb4fe_068611691a404ca18d46ab1be089bc5a: FileNotFoundError(2, 'No such file or directory')\n", - "\u001B[2m\u001B[36m(TrainTrainable pid=12221)\u001B[0m warnings.warn('resource_tracker: %s: %r' % (name, e))\n", - "\u001B[2m\u001B[36m(TrainTrainable pid=12221)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/joblib/externals/loky/backend/resource_tracker.py:333: UserWarning: resource_tracker: /var/folders/b2/0_91bd757rz02lrmr920v0gw0000gn/T/joblib_memmapping_folder_12221_0b60850fd8704b0e83f6c2758d9c1f2a_6ae1cfa0a68741b8b71f28a262bd7f7a: FileNotFoundError(2, 'No such file or directory')\n", - "\u001B[2m\u001B[36m(TrainTrainable pid=12221)\u001B[0m warnings.warn('resource_tracker: %s: %r' % (name, e))\n", - "2022-05-19 11:56:51,305\tINFO tune.py:753 -- Total run time: 21.67 seconds (20.55 seconds for the tuning loop).\n" + "2022-06-22 17:27:59,333\tINFO tune.py:734 -- Total run time: 19.09 seconds (18.31 seconds for the tuning loop).\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "{'valid': {'score_time': 0.10993409156799316, 'test_score': 0.9473684210526315}, 'cv': {'fit_time': array([2.40212107, 2.31283998, 2.32653904, 2.32567906, 2.36029601]), 'score_time': array([0.10820889, 0.10829401, 0.10703588, 0.1051271 , 0.10840893]), 'test_score': array([0.9625 , 0.8875 , 1. , 0.94936709, 0.92405063]), 'fit_time_mean': 2.34549503326416, 'fit_time_std': 0.032384969255539235, 'score_time_mean': 0.10741496086120605, 'score_time_std': 0.0012465199424455708, 'test_score_mean': 0.9446835443037976, 'test_score_std': 0.03766947497186954}, 'fit_time': 2.4866180419921875, 'time_this_iter_s': 17.19045615196228, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 1, 'trial_id': '564d9_00000', 'experiment_id': '200cbc1e2b84434882732d2053ec45c2', 'date': '2022-05-19_11-56-51', 'timestamp': 1652957811, 'time_total_s': 17.19045615196228, 'pid': 12221, 'hostname': 'Kais-MacBook-Pro.local', 'node_ip': '127.0.0.1', 'config': {}, 'time_since_restore': 17.19045615196228, 'timesteps_since_restore': 0, 'iterations_since_restore': 1, 'warmup_time': 0.0039539337158203125, 'experiment_tag': '0'}\n" + "{'valid': {'score_time': 0.03549623489379883, 'test_score': 0.9532163742690059}, 'cv': {'fit_time': array([2.22100377, 2.21548939, 2.20756745, 2.22235131, 2.31238961]), 'score_time': array([0.02246499, 0.02308655, 0.02564836, 0.02913785, 0.02122164]), 'test_score': array([0.9625 , 0.9125 , 0.9875 , 1. , 0.93670886]), 'fit_time_mean': 2.235760307312012, 'fit_time_std': 0.03866614559685742, 'score_time_mean': 0.02431187629699707, 'score_time_std': 0.0028120522003997595, 'test_score_mean': 0.9598417721518986, 'test_score_std': 0.032128186960552516}, 'fit_time': 2.3157050609588623, 'time_this_iter_s': 15.684244871139526, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 1, 'trial_id': '9dec8_00000', 'experiment_id': 'f8215019c10e4a81ba2187c38e875365', 'date': '2022-06-22_17-27-59', 'timestamp': 1655918879, 'time_total_s': 15.684244871139526, 'pid': 1492629, 'hostname': 'ip-172-31-43-110', 'node_ip': '172.31.43.110', 'config': {}, 'time_since_restore': 15.684244871139526, 'timesteps_since_restore': 0, 'iterations_since_restore': 1, 'warmup_time': 0.0057866573333740234, 'experiment_tag': '0'}\n" ] } ], @@ -396,7 +350,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 14, "id": "24b16ede", "metadata": { "pycharm": { @@ -408,8 +362,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "Map Progress (1 actors 1 pending): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00, 1.59s/it]\n", - "Map_Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 95.33it/s]" + "2022-06-22 17:27:59,658\tWARNING read_api.py:260 -- The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", + "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 64.73it/s]\n", + "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.60s/it]\n", + "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 71.41it/s]" ] }, { @@ -417,20 +373,26 @@ "output_type": "stream", "text": [ "PREDICTED LABELS\n", - " predictions\n", - "0 1\n", - "1 1\n", - "2 1\n", - "3 1\n", - "4 1\n", - ".. ...\n", - "166 1\n", - "167 1\n", - "168 0\n", - "169 0\n", - "170 1\n", - "\n", - "[171 rows x 1 columns]\n" + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n" ] }, { @@ -453,7 +415,7 @@ "notebook_metadata_filter": "-all" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3.8.10 ('venv': venv)", "language": "python", "name": "python3" }, @@ -467,9 +429,14 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f" + } } }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/doc/source/ray-air/examples/upload_to_comet_ml.ipynb b/doc/source/ray-air/examples/upload_to_comet_ml.ipynb index 934368789a6a..35cfec99eaaa 100644 --- a/doc/source/ray-air/examples/upload_to_comet_ml.ipynb +++ b/doc/source/ray-air/examples/upload_to_comet_ml.ipynb @@ -49,8 +49,7 @@ "from ray.air import RunConfig\n", "from ray.air.result import Result\n", "from ray.train.xgboost import XGBoostTrainer\n", - "from ray.tune.integration.comet import CometLoggerCallback\n", - "from sklearn.datasets import load_breast_cancer" + "from ray.tune.integration.comet import CometLoggerCallback" ] }, { @@ -69,11 +68,8 @@ "outputs": [], "source": [ "def get_train_dataset() -> ray.data.Dataset:\n", - " \"\"\"Return the \"Breast cancer\" dataset as a Ray dataset.\"\"\"\n", - " data_raw = load_breast_cancer(as_frame=True)\n", - " df = data_raw[\"data\"]\n", - " df[\"target\"] = data_raw[\"target\"]\n", - " return ray.data.from_pandas(df)" + " dataset = ray.data.read_csv(\"s3://air-example-data/breast_cancer.csv\")\n", + " return dataset" ] }, { diff --git a/doc/source/ray-air/examples/upload_to_wandb.ipynb b/doc/source/ray-air/examples/upload_to_wandb.ipynb index 48cb51065b5d..58e8463b13da 100644 --- a/doc/source/ray-air/examples/upload_to_wandb.ipynb +++ b/doc/source/ray-air/examples/upload_to_wandb.ipynb @@ -49,8 +49,7 @@ "from ray.air import RunConfig\n", "from ray.air.result import Result\n", "from ray.train.xgboost import XGBoostTrainer\n", - "from ray.tune.integration.wandb import WandbLoggerCallback\n", - "from sklearn.datasets import load_breast_cancer" + "from ray.tune.integration.wandb import WandbLoggerCallback" ] }, { @@ -69,11 +68,8 @@ "outputs": [], "source": [ "def get_train_dataset() -> ray.data.Dataset:\n", - " \"\"\"Return the \"Breast cancer\" dataset as a Ray dataset.\"\"\"\n", - " data_raw = load_breast_cancer(as_frame=True)\n", - " df = data_raw[\"data\"]\n", - " df[\"target\"] = data_raw[\"target\"]\n", - " return ray.data.from_pandas(df)" + " dataset = ray.data.read_csv(\"s3://air-example-data/breast_cancer.csv\")\n", + " return dataset" ] }, { diff --git a/doc/source/ray-air/examples/xgboost_example.ipynb b/doc/source/ray-air/examples/xgboost_example.ipynb index 601bb3db6de6..892647f6d9c3 100644 --- a/doc/source/ray-air/examples/xgboost_example.ipynb +++ b/doc/source/ray-air/examples/xgboost_example.ipynb @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "41f20cc1", "metadata": { "pycharm": { @@ -53,25 +53,33 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "7232303d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ubuntu/ray/venv/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + " from pandas import MultiIndex, Int64Index\n", + "FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "FutureWarning: pandas.Float64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "FutureWarning: pandas.UInt64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n" + ] + } + ], "source": [ - "import argparse\n", "from typing import Tuple\n", "\n", - "import pandas as pd\n", - "\n", "import ray\n", "from ray.train.batch_predictor import BatchPredictor\n", "from ray.train.xgboost import XGBoostPredictor\n", "from ray.train.xgboost import XGBoostTrainer\n", "from ray.data.dataset import Dataset\n", "from ray.air.result import Result\n", - "from ray.data.preprocessors import StandardScaler\n", - "from sklearn.datasets import load_breast_cancer\n", - "from sklearn.model_selection import train_test_split" + "from ray.air.util.datasets import train_test_split\n", + "from ray.data.preprocessors import StandardScaler" ] }, { @@ -88,19 +96,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "37c4f38f", "metadata": {}, "outputs": [], "source": [ "def prepare_data() -> Tuple[Dataset, Dataset, Dataset]:\n", - " data_raw = load_breast_cancer()\n", - " dataset_df = pd.DataFrame(data_raw[\"data\"], columns=data_raw[\"feature_names\"])\n", - " dataset_df[\"target\"] = data_raw[\"target\"]\n", - " train_df, test_df = train_test_split(dataset_df, test_size=0.3)\n", - " train_dataset = ray.data.from_pandas(train_df)\n", - " valid_dataset = ray.data.from_pandas(test_df)\n", - " test_dataset = ray.data.from_pandas(test_df.drop(\"target\", axis=1))\n", + " dataset = ray.data.read_csv(\"s3://air-example-data/breast_cancer.csv\")\n", + " train_dataset, valid_dataset = train_test_split(dataset, test_size=0.3)\n", + " test_dataset = valid_dataset.map_batches(lambda df: df.drop(\"target\", axis=1), batch_format=\"pandas\")\n", " return train_dataset, valid_dataset, test_dataset" ] }, @@ -118,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "dae8998d", "metadata": { "pycharm": { @@ -168,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "5b8076d3", "metadata": { "pycharm": { @@ -187,14 +191,13 @@ " predicted_labels = (\n", " batch_predictor.predict(test_dataset)\n", " .map_batches(lambda df: (df > 0.5).astype(int), batch_format=\"pandas\")\n", - " .to_pandas(limit=float(\"inf\"))\n", " )\n", - " print(f\"PREDICTED LABELS\\n{predicted_labels}\")\n", + " print(f\"PREDICTED LABELS\")\n", + " predicted_labels.show()\n", "\n", - " shap_values = batch_predictor.predict(test_dataset, pred_contribs=True).to_pandas(\n", - " limit=float(\"inf\")\n", - " )\n", - " print(f\"SHAP VALUES\\n{shap_values}\")\n" + " shap_values = batch_predictor.predict(test_dataset, pred_contribs=True)\n", + " print(f\"SHAP VALUES\")\n", + " shap_values.show()\n" ] }, { @@ -207,7 +210,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "0f96d62b", "metadata": { "pycharm": { @@ -219,18 +222,20 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-05-19 11:44:42,413\tINFO services.py:1483 -- View the Ray dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8265\u001B[39m\u001B[22m\n" + "2022-06-22 17:28:55,841\tINFO services.py:1477 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8270\u001b[39m\u001b[22m\n", + "2022-06-22 17:28:58,044\tWARNING read_api.py:260 -- The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", + "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 40.28it/s]\n" ] }, { "data": { "text/html": [ - "== Status ==
Current time: 2022-05-19 11:45:00 (running for 00:00:13.93)
Memory usage on this node: 10.3/16.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/4.5 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/kai/ray_results/XGBoostTrainer_2022-05-19_11-44-45
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-06-22 17:29:15 (running for 00:00:16.11)
Memory usage on this node: 11.5/31.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/12.35 GiB heap, 0.0/6.18 GiB objects
Result logdir: /home/ubuntu/ray_results/XGBoostTrainer_2022-06-22_17-28-58
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc iter total time (s) train-logloss train-error valid-logloss
Trial name status loc iter total time (s) train-logloss train-error valid-logloss
XGBoostTrainer_b273b_00000TERMINATED127.0.0.1:11036 100 9.03935 0.005949 0 0.07483
XGBoostTrainer_cc863_00000TERMINATED172.31.43.110:1493910 100 12.5164 0.005874 0 0.078188


" ], @@ -245,47 +250,68 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:44:47,554\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=54067 --object-store-name=/tmp/ray/session_2022-05-19_11-44-39_813259_10959/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-44-39_813259_10959/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=61242 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61017 --redis-password=5241590000000000 --startup-token=16 --runtime-env-hash=-2010331134\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:44:51,603\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=54067 --object-store-name=/tmp/ray/session_2022-05-19_11-44-39_813259_10959/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-44-39_813259_10959/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=61242 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61017 --redis-password=5241590000000000 --startup-token=17 --runtime-env-hash=-2010331069\n", - "\u001B[2m\u001B[36m(GBDTTrainable pid=11036)\u001B[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", - "\u001B[2m\u001B[36m(GBDTTrainable pid=11036)\u001B[0m UserWarning: Dataset 'valid' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", - "\u001B[2m\u001B[36m(GBDTTrainable pid=11036)\u001B[0m 2022-05-19 11:44:53,035\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:44:54,085\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=54067 --object-store-name=/tmp/ray/session_2022-05-19_11-44-39_813259_10959/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-44-39_813259_10959/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=61242 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61017 --redis-password=5241590000000000 --startup-token=18 --runtime-env-hash=-2010331069\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:44:54,106\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=54067 --object-store-name=/tmp/ray/session_2022-05-19_11-44-39_813259_10959/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-44-39_813259_10959/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=61242 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61017 --redis-password=5241590000000000 --startup-token=19 --runtime-env-hash=-2010331069\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:44:54,252\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=54067 --object-store-name=/tmp/ray/session_2022-05-19_11-44-39_813259_10959/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-44-39_813259_10959/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=61242 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61017 --redis-password=5241590000000000 --startup-token=21 --runtime-env-hash=-2010331134\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:44:54,266\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=54067 --object-store-name=/tmp/ray/session_2022-05-19_11-44-39_813259_10959/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-44-39_813259_10959/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=61242 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61017 --redis-password=5241590000000000 --startup-token=23 --runtime-env-hash=-2010331134\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:44:54,266\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=54067 --object-store-name=/tmp/ray/session_2022-05-19_11-44-39_813259_10959/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-44-39_813259_10959/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=61242 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61017 --redis-password=5241590000000000 --startup-token=20 --runtime-env-hash=-2010331134\n", - "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 11:44:54,271\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=54067 --object-store-name=/tmp/ray/session_2022-05-19_11-44-39_813259_10959/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-44-39_813259_10959/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=61242 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61017 --redis-password=5241590000000000 --startup-token=22 --runtime-env-hash=-2010331134\n", - "\u001B[2m\u001B[36m(GBDTTrainable pid=11036)\u001B[0m 2022-05-19 11:44:56,874\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", - "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=11104)\u001B[0m [11:44:56] task [xgboost.ray]:4517180944 got new rank 1\n", - "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=11103)\u001B[0m [11:44:56] task [xgboost.ray]:4655847056 got new rank 0\n" + "\u001b[2m\u001b[36m(pid=1493910)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1493910)\u001b[0m from pandas import MultiIndex, Int64Index\n", + "\u001b[2m\u001b[36m(pid=1493910)\u001b[0m FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1493910)\u001b[0m FutureWarning: pandas.Float64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1493910)\u001b[0m FutureWarning: pandas.UInt64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(XGBoostTrainer pid=1493910)\u001b[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001b[2m\u001b[36m(XGBoostTrainer pid=1493910)\u001b[0m UserWarning: Dataset 'valid' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001b[2m\u001b[36m(XGBoostTrainer pid=1493910)\u001b[0m 2022-06-22 17:29:04,073\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", + "\u001b[2m\u001b[36m(pid=1494007)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1494007)\u001b[0m from pandas import MultiIndex, Int64Index\n", + "\u001b[2m\u001b[36m(pid=1494008)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1494008)\u001b[0m from pandas import MultiIndex, Int64Index\n", + "\u001b[2m\u001b[36m(pid=1494009)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1494009)\u001b[0m from pandas import MultiIndex, Int64Index\n", + "\u001b[2m\u001b[36m(pid=1494007)\u001b[0m FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1494007)\u001b[0m FutureWarning: pandas.Float64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1494007)\u001b[0m FutureWarning: pandas.UInt64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1494008)\u001b[0m FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1494008)\u001b[0m FutureWarning: pandas.Float64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1494008)\u001b[0m FutureWarning: pandas.UInt64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=1494008)\u001b[0m 2022-06-22 17:29:07,324\tWARNING __init__.py:190 -- DeprecationWarning: `ray.worker.get_resource_ids` is a private attribute and access will be removed in a future Ray version.\n", + "\u001b[2m\u001b[36m(pid=1494009)\u001b[0m FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1494009)\u001b[0m FutureWarning: pandas.Float64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(pid=1494009)\u001b[0m FutureWarning: pandas.UInt64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=1494009)\u001b[0m 2022-06-22 17:29:07,421\tWARNING __init__.py:190 -- DeprecationWarning: `ray.worker.get_resource_ids` is a private attribute and access will be removed in a future Ray version.\n", + "\u001b[2m\u001b[36m(XGBoostTrainer pid=1493910)\u001b[0m 2022-06-22 17:29:07,874\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", + "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=1494008)\u001b[0m [17:29:07] task [xgboost.ray]:139731353900128 got new rank 0\n", + "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=1494008)\u001b[0m FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=1494009)\u001b[0m [17:29:07] task [xgboost.ray]:140076138558608 got new rank 1\n", + "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=1494009)\u001b[0m FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(_QueueActor pid=1494006)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(_QueueActor pid=1494006)\u001b[0m from pandas import MultiIndex, Int64Index\n", + "\u001b[2m\u001b[36m(_QueueActor pid=1494006)\u001b[0m FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(_QueueActor pid=1494006)\u001b[0m FutureWarning: pandas.Float64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001b[2m\u001b[36m(_QueueActor pid=1494006)\u001b[0m FutureWarning: pandas.UInt64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Result for XGBoostTrainer_b273b_00000:\n", - " date: 2022-05-19_11-44-57\n", + "Result for XGBoostTrainer_cc863_00000:\n", + " date: 2022-06-22_17-29-09\n", " done: false\n", - " experiment_id: 991235d8b76649398688695ca70a08e4\n", - " hostname: Kais-MacBook-Pro.local\n", + " experiment_id: dc3dac01a34043cfb5751907e2bc648e\n", + " hostname: ip-172-31-43-110\n", " iterations_since_restore: 1\n", - " node_ip: 127.0.0.1\n", - " pid: 11036\n", + " node_ip: 172.31.43.110\n", + " pid: 1493910\n", " should_checkpoint: true\n", - " time_since_restore: 7.17207407951355\n", - " time_this_iter_s: 7.17207407951355\n", - " time_total_s: 7.17207407951355\n", - " timestamp: 1652957097\n", + " time_since_restore: 7.967940330505371\n", + " time_this_iter_s: 7.967940330505371\n", + " time_total_s: 7.967940330505371\n", + " timestamp: 1655918949\n", " timesteps_since_restore: 0\n", - " train-error: 0.020101\n", - " train-logloss: 0.465715\n", + " train-error: 0.017588\n", + " train-logloss: 0.464648\n", " training_iteration: 1\n", - " trial_id: b273b_00000\n", - " valid-error: 0.052632\n", - " valid-logloss: 0.480831\n", - " warmup_time: 0.003935098648071289\n", + " trial_id: cc863_00000\n", + " valid-error: 0.081871\n", + " valid-logloss: 0.496374\n", + " warmup_time: 0.004768848419189453\n", " \n" ] }, @@ -293,35 +319,35 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001B[2m\u001B[36m(GBDTTrainable pid=11036)\u001B[0m 2022-05-19 11:44:59,796\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=398 in 6.80 seconds (2.92 pure XGBoost training time).\n" + "\u001b[2m\u001b[36m(XGBoostTrainer pid=1493910)\u001b[0m 2022-06-22 17:29:14,546\tINFO main.py:1516 -- [RayXGBoost] Finished XGBoost training on training data with total N=398 in 10.52 seconds (6.66 pure XGBoost training time).\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Result for XGBoostTrainer_b273b_00000:\n", - " date: 2022-05-19_11-44-59\n", + "Result for XGBoostTrainer_cc863_00000:\n", + " date: 2022-06-22_17-29-14\n", " done: true\n", - " experiment_id: 991235d8b76649398688695ca70a08e4\n", + " experiment_id: dc3dac01a34043cfb5751907e2bc648e\n", " experiment_tag: '0'\n", - " hostname: Kais-MacBook-Pro.local\n", + " hostname: ip-172-31-43-110\n", " iterations_since_restore: 100\n", - " node_ip: 127.0.0.1\n", - " pid: 11036\n", + " node_ip: 172.31.43.110\n", + " pid: 1493910\n", " should_checkpoint: true\n", - " time_since_restore: 9.03934907913208\n", - " time_this_iter_s: 0.018042802810668945\n", - " time_total_s: 9.03934907913208\n", - " timestamp: 1652957099\n", + " time_since_restore: 12.516392230987549\n", + " time_this_iter_s: 0.03008890151977539\n", + " time_total_s: 12.516392230987549\n", + " timestamp: 1655918954\n", " timesteps_since_restore: 0\n", " train-error: 0.0\n", - " train-logloss: 0.005949\n", + " train-logloss: 0.005874\n", " training_iteration: 100\n", - " trial_id: b273b_00000\n", - " valid-error: 0.017544\n", - " valid-logloss: 0.07483\n", - " warmup_time: 0.003935098648071289\n", + " trial_id: cc863_00000\n", + " valid-error: 0.040936\n", + " valid-logloss: 0.078188\n", + " warmup_time: 0.004768848419189453\n", " \n" ] }, @@ -329,14 +355,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-05-19 11:45:00,535\tINFO tune.py:753 -- Total run time: 15.30 seconds (13.91 seconds for the tuning loop).\n" + "2022-06-22 17:29:15,362\tINFO tune.py:734 -- Total run time: 16.94 seconds (16.08 seconds for the tuning loop).\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "{'train-logloss': 0.005949, 'train-error': 0.0, 'valid-logloss': 0.07483, 'valid-error': 0.017544, 'time_this_iter_s': 0.018042802810668945, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 100, 'trial_id': 'b273b_00000', 'experiment_id': '991235d8b76649398688695ca70a08e4', 'date': '2022-05-19_11-44-59', 'timestamp': 1652957099, 'time_total_s': 9.03934907913208, 'pid': 11036, 'hostname': 'Kais-MacBook-Pro.local', 'node_ip': '127.0.0.1', 'config': {}, 'time_since_restore': 9.03934907913208, 'timesteps_since_restore': 0, 'iterations_since_restore': 100, 'warmup_time': 0.003935098648071289, 'experiment_tag': '0'}\n" + "{'train-logloss': 0.005874, 'train-error': 0.0, 'valid-logloss': 0.078188, 'valid-error': 0.040936, 'time_this_iter_s': 0.03008890151977539, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 100, 'trial_id': 'cc863_00000', 'experiment_id': 'dc3dac01a34043cfb5751907e2bc648e', 'date': '2022-06-22_17-29-14', 'timestamp': 1655918954, 'time_total_s': 12.516392230987549, 'pid': 1493910, 'hostname': 'ip-172-31-43-110', 'node_ip': '172.31.43.110', 'config': {}, 'time_since_restore': 12.516392230987549, 'timesteps_since_restore': 0, 'iterations_since_restore': 100, 'warmup_time': 0.004768848419189453, 'experiment_tag': '0'}\n" ] } ], @@ -354,7 +380,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "283b1dba", "metadata": { "pycharm": { @@ -366,8 +392,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "Map Progress (1 actors 1 pending): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00, 1.96s/it]\n", - "Map_Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 87.81it/s]\n" + "2022-06-22 17:29:16,463\tWARNING read_api.py:260 -- The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", + "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 46.14it/s]\n", + "Map_Batches: 0%| | 0/1 [00:00 Dataset: predictions = batch_predictor.predict(prediction_dataset, dtype=tf.float32) - pandas_predictions = predictions.to_pandas(float("inf")) - - print(f"PREDICTIONS\n{pandas_predictions}") + print("PREDICTIONS") + predictions.show() return predictions From 2c8de938e6e8e501a1c0dbde3d0098a3ffd74405 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Fri, 24 Jun 2022 15:34:36 +0000 Subject: [PATCH 2/2] Implement feedback from code review --- doc/source/ray-air/doc_code/air_key_concepts.py | 4 +--- doc/source/ray-air/doc_code/preprocessors.py | 4 ++-- doc/source/ray-air/doc_code/tf_starter.py | 2 +- doc/source/ray-air/doc_code/xgboost_starter.py | 8 +++----- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/doc/source/ray-air/doc_code/air_key_concepts.py b/doc/source/ray-air/doc_code/air_key_concepts.py index 42727433751b..df71a98dea2a 100644 --- a/doc/source/ray-air/doc_code/air_key_concepts.py +++ b/doc/source/ray-air/doc_code/air_key_concepts.py @@ -75,9 +75,7 @@ batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, XGBoostPredictor) # Bulk batch prediction. -predicted_labels = batch_predictor.predict(test_dataset).map_batches( - lambda df: (df > 0.5).astype(int), batch_format="pandas" -) +predicted_probabilities = batch_predictor.predict(test_dataset) # Pipelined batch prediction: instead of processing the data in bulk, process it # incrementally in windows of the given size. diff --git a/doc/source/ray-air/doc_code/preprocessors.py b/doc/source/ray-air/doc_code/preprocessors.py index 1c19f08d2b8f..bd533f46a23c 100644 --- a/doc/source/ray-air/doc_code/preprocessors.py +++ b/doc/source/ray-air/doc_code/preprocessors.py @@ -88,8 +88,8 @@ test_dataset = ray.data.from_items([{"x": x} for x in range(2, 32, 3)]) batch_predictor = BatchPredictor.from_checkpoint(checkpoint, XGBoostPredictor) -predicted_labels = batch_predictor.predict(test_dataset) -predicted_labels.show() +predicted_probabilities = batch_predictor.predict(test_dataset) +predicted_probabilities.show() # {'predictions': 0.09843720495700836} # {'predictions': 5.604666709899902} # {'predictions': 11.405311584472656} diff --git a/doc/source/ray-air/doc_code/tf_starter.py b/doc/source/ray-air/doc_code/tf_starter.py index 360503294fdc..4116ff5dba58 100644 --- a/doc/source/ray-air/doc_code/tf_starter.py +++ b/doc/source/ray-air/doc_code/tf_starter.py @@ -105,7 +105,7 @@ def train_func(config: dict): predictions = batch_predictor.predict(prediction_dataset, dtype=tf.float32) -print(f"PREDICTIONS") +print("PREDICTIONS") predictions.show() # __air_tf_batchpred_end__ diff --git a/doc/source/ray-air/doc_code/xgboost_starter.py b/doc/source/ray-air/doc_code/xgboost_starter.py index f2e7a3793d95..1d917178e691 100644 --- a/doc/source/ray-air/doc_code/xgboost_starter.py +++ b/doc/source/ray-air/doc_code/xgboost_starter.py @@ -59,11 +59,9 @@ batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, XGBoostPredictor) -predicted_labels = batch_predictor.predict(test_dataset).map_batches( - lambda df: (df > 0.5).astype(int), batch_format="pandas" -) -print("PREDICTED LABELS") -predicted_labels.show() +predicted_probabilities = batch_predictor.predict(test_dataset) +print("PREDICTED PROBABILITIES") +predicted_probabilities.show() shap_values = batch_predictor.predict(test_dataset, pred_contribs=True) print("SHAP VALUES")