From 71a919c21dfa9f3d1ed83fd7ac861d0876d3add4 Mon Sep 17 00:00:00 2001 From: Jiao Dong Date: Wed, 20 Jul 2022 20:10:00 -0700 Subject: [PATCH 01/10] working solution for tabular train to serve --- doc/source/ray-air/examples/BUILD | 1 - .../examples/tfx_tabular_train_to_serve.ipynb | 383 ++++++++++++++++-- python/ray/data/extensions/__init__.py | 2 + .../ray/data/extensions/tensor_extension.py | 1 + python/ray/serve/air_integrations.py | 11 + .../ray/serve/tests/test_air_integrations.py | 20 + 6 files changed, 386 insertions(+), 32 deletions(-) diff --git a/doc/source/ray-air/examples/BUILD b/doc/source/ray-air/examples/BUILD index b356a741519e..c78ae78ce2e4 100644 --- a/doc/source/ray-air/examples/BUILD +++ b/doc/source/ray-air/examples/BUILD @@ -35,7 +35,6 @@ py_test_run_all_notebooks( "feast_example.ipynb", # REGRESSION "rl_offline_example.ipynb", # REGRESSION "rl_online_example.ipynb", # REGRESSION - "tfx_tabular_train_to_serve.ipynb", # REGRESSION ], data = ["//doc/source/ray-air/examples:air_examples"], tags = ["exclusive", "team:ml", "ray_air"], diff --git a/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb b/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb index 78b43111b761..ece0a92d89cc 100644 --- a/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb +++ b/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -73,7 +73,68 @@ "id": "MOsHUjgdIrIW", "outputId": "8a21ead5-bb2d-4a3d-ae41-17a313688b24" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-07-20 18:45:28,814\tINFO services.py:1483 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8266\u001b[39m\u001b[22m\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "
\n", + "

Ray

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "
Python version:3.7.10
Ray version: 3.0.0.dev0
Dashboard:http://127.0.0.1:8266
\n", + "
\n", + "
\n" + ], + "text/plain": [ + "RayContext(dashboard_url='127.0.0.1:8266', python_version='3.7.10', ray_version='3.0.0.dev0', ray_commit='{{RAY_COMMIT_SHA}}', address_info={'node_ip_address': '127.0.0.1', 'raylet_ip_address': '127.0.0.1', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-07-20_18-45-26_127581_21006/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-07-20_18-45-26_127581_21006/sockets/raylet', 'webui_url': '127.0.0.1:8266', 'session_dir': '/tmp/ray/session_2022-07-20_18-45-26_127581_21006', 'metrics_export_port': 63884, 'gcs_address': '127.0.0.1:63685', 'address': '127.0.0.1:63685', 'dashboard_agent_listen_port': 52365, 'node_id': 'c21f810137e56bd967ab3f246c66aadc5262e00bdbe19c34c23456e7'})" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from pprint import pprint\n", "import ray\n", @@ -93,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -101,7 +162,18 @@ "id": "KlMz0dt9hYbS", "outputId": "e7234b52-08b4-49fc-e14c-72f283b893f2" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'CPU': 16.0,\n", + " 'memory': 30436675994.0,\n", + " 'node:127.0.0.1': 1.0,\n", + " 'object_store_memory': 2147483648.0}\n" + ] + } + ], "source": [ "pprint(ray.cluster_resources())" ] @@ -126,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "id": "gAbhv9OqhYbT" }, @@ -160,7 +232,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "id": "FbeYf1aF8ISK" }, @@ -180,7 +252,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -189,7 +261,157 @@ "id": "8tugpr5S8gPq", "outputId": "3c57a348-12a7-4b6c-f9b2-fabdcb7a7c88" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pickup_community_areatrip_start_monthtrip_start_hourtrip_start_daytrip_start_timestamptrip_milesdropoff_census_tractpayment_typecompanytrip_secondsdropoff_community_areais_big_tip
0NaN519614002695000.0NaNCredit CardChicago Elite Cab Corp. (Chicago Carriag0.0NaNFalse
1NaN319513626837000.0NaNUnknownChicago Elite Cab Corp.300.0NaNFalse
260.01023138059370012.6NaNCashTaxi Affiliation Services1380.0NaNFalse
310.0101213823190000.0NaNCashTaxi Affiliation Services180.0NaNFalse
414.057513698972000.0NaNCashDispatch Taxi Affiliation1080.0NaNFalse
\n", + "
" + ], + "text/plain": [ + " pickup_community_area trip_start_month trip_start_hour trip_start_day \\\n", + "0 NaN 5 19 6 \n", + "1 NaN 3 19 5 \n", + "2 60.0 10 2 3 \n", + "3 10.0 10 1 2 \n", + "4 14.0 5 7 5 \n", + "\n", + " trip_start_timestamp trip_miles dropoff_census_tract payment_type \\\n", + "0 1400269500 0.0 NaN Credit Card \n", + "1 1362683700 0.0 NaN Unknown \n", + "2 1380593700 12.6 NaN Cash \n", + "3 1382319000 0.0 NaN Cash \n", + "4 1369897200 0.0 NaN Cash \n", + "\n", + " company trip_seconds \\\n", + "0 Chicago Elite Cab Corp. (Chicago Carriag 0.0 \n", + "1 Chicago Elite Cab Corp. 300.0 \n", + "2 Taxi Affiliation Services 1380.0 \n", + "3 Taxi Affiliation Services 180.0 \n", + "4 Dispatch Taxi Affiliation 1080.0 \n", + "\n", + " dropoff_community_area is_big_tip \n", + "0 NaN False \n", + "1 NaN False \n", + "2 NaN False \n", + "3 NaN False \n", + "4 NaN False " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data.head(5)" ] @@ -206,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "id": "YSLvrBMC9aRv" }, @@ -239,7 +461,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -247,7 +469,15 @@ "id": "xfhRl7eO981w", "outputId": "f80d90ff-fc8a-4a7d-b544-31633823d596" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 11251 samples for training and 3751 samples for testing.\n" + ] + } + ], "source": [ "print(f\"There are {train_ds.count()} samples for training and {test_df.shape[0]} samples for testing.\")" ] @@ -286,7 +516,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "id": "zVvslsfMIrIh" }, @@ -345,7 +575,8 @@ " result = {}\n", " feature_cols = [col for col in dataframe.columns if col != LABEL]\n", " result[\"input\"] = TensorArray(dataframe[feature_cols].to_numpy(dtype=np.float32))\n", - " result[LABEL] = dataframe[LABEL]\n", + " if LABEL in dataframe.columns:\n", + " result[LABEL] = dataframe[LABEL]\n", " return pd.DataFrame(result)\n", "\n", " chained_pp = Chain(\n", @@ -372,7 +603,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "id": "ejGVU-uN_dVP" }, @@ -414,7 +645,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "id": "MwhAeEOuhYbV" }, @@ -557,7 +788,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "id": "BBbcMwc9Rz66" }, @@ -582,7 +813,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": { "id": "ujmwT8ZhScq1" }, @@ -608,7 +839,7 @@ " # This is due to a current limitation on Serve that's\n", " # being addressed.\n", " # TODO(xwjiang): Change to True.\n", - " batching_params=False,\n", + " batching_params=dict(max_batch_size=2, batch_wait_timeout_s=5),\n", " model_definition=model_definition,\n", " http_adapter=adapter,\n", " )\n", @@ -617,12 +848,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": { "id": "uRe9a8947pl9" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-07-20 18:46:11,759\tINFO services.py:1483 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8266\u001b[39m\u001b[22m\n", + "\u001b[2m\u001b[36m(ServeController pid=21308)\u001b[0m INFO 2022-07-20 
18:46:15,348 controller 21308 checkpoint_path.py:17 - Using RayInternalKVStore for controller checkpoint and recovery.\n", + "\u001b[2m\u001b[36m(ServeController pid=21308)\u001b[0m INFO 2022-07-20 18:46:15,350 controller 21308 http_state.py:126 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:SERVE_PROXY_ACTOR-58fb3ee046cdce5c602369291de78f60c65dcbd7c5c5a8af57ec3a26' on node '58fb3ee046cdce5c602369291de78f60c65dcbd7c5c5a8af57ec3a26' listening on '127.0.0.1:8000'\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=21311)\u001b[0m INFO: Started server process [21311]\n", + "/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/ipykernel_launcher.py:23: UserWarning: From /var/folders/1s/wy6f3ytn3q726p5hl8fw8d780000gn/T/ipykernel_21006/609683685.py:23: deploy (from ray.serve.deployment) is deprecated and will be removed in a future version Please see https://docs.ray.io/en/latest/serve/index.html\n", + "\u001b[2m\u001b[36m(ServeController pid=21308)\u001b[0m INFO 2022-07-20 18:46:17,658 controller 21308 deployment_state.py:1281 - Adding 1 replicas to deployment 'Model'.\n", + "\u001b[2m\u001b[36m(ServeReplica:Model pid=21314)\u001b[0m 2022-07-20 18:46:23,199\tWARNING compression.py:18 -- lz4 not available, disabling sample compression. This will significantly impact RLlib performance. To install lz4, run `pip install lz4`.\n" + ] + } + ], "source": [ + "import ray\n", "# Generally speaking, training and serving are done in totally different ray clusters.\n", "# To simulate that, let's shutdown the old ray cluster in preparation for serving.\n", "ray.shutdown()\n", @@ -641,33 +887,106 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": { "id": "E9m80HDmSz66" }, "outputs": [], "source": [ "import requests\n", + "import pandas as pd\n", + "import numpy as np\n", "\n", - "NUM_SERVE_REQUESTS = 100\n", + "NUM_SERVE_REQUESTS = 10\n", "\n", "def send_requests(df: pd.DataFrame, label: np.array):\n", " for i in range(NUM_SERVE_REQUESTS):\n", " one_row = df.iloc[[i]].to_dict()\n", " serve_result = requests.post(endpoint_uri, json=one_row).json()\n", " print(\n", - " f\"request[{i}] prediction: {serve_result['predictions']['0']} \"\n", + " f\"request{i} prediction: {serve_result[0]['predictions']} \"\n", " f\"- label: {str(label[i])}\"\n", " )" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": { "id": "GFPwKc5JTgnI" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request0 prediction: 0.004963837098330259 - label: True\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request1 prediction: 6.652726733591408e-05 - label: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request2 prediction: 0.00018405025184620172 - label: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request3 prediction: 0.00016512417641934007 - label: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request4 prediction: 0.00015515758423134685 - label: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request5 prediction: 5.948602483840659e-05 - label: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request6 prediction: 9.51739348238334e-05 - label: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request7 prediction: 3.4787988170137396e-06 - label: False\n" 
+ ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request8 prediction: 0.00010751552326837555 - label: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request9 prediction: 0.060329731553792953 - label: True\n" + ] + } + ], "source": [ "send_requests(test_df, test_label)" ] @@ -682,11 +1001,8 @@ "name": "tfx (1) (1) (1).ipynb", "provenance": [] }, - "interpreter": { - "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f" - }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3.7.10 ('ray3.7')", "language": "python", "name": "python3" }, @@ -700,7 +1016,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.6" + "version": "3.7.10" + }, + "vscode": { + "interpreter": { + "hash": "99d89bfe98f3aa2d7facda0d08d31ff2a0af9559e5330d719288ce64a1966273" + } } }, "nbformat": 4, diff --git a/python/ray/data/extensions/__init__.py b/python/ray/data/extensions/__init__.py index a9ae87faeb02..70b4daedd328 100644 --- a/python/ray/data/extensions/__init__.py +++ b/python/ray/data/extensions/__init__.py @@ -1,6 +1,7 @@ from ray.data.extensions.tensor_extension import ( TensorDtype, TensorArray, + TensorArrayElement, ArrowTensorType, ArrowTensorArray, ) @@ -9,6 +10,7 @@ # Tensor array extension. "TensorDtype", "TensorArray", + "TensorArrayElement", "ArrowTensorType", "ArrowTensorArray", ] diff --git a/python/ray/data/extensions/tensor_extension.py b/python/ray/data/extensions/tensor_extension.py index 0fa577250a83..96b868bb47f5 100644 --- a/python/ray/data/extensions/tensor_extension.py +++ b/python/ray/data/extensions/tensor_extension.py @@ -1,6 +1,7 @@ from ray.air.util.tensor_extensions.pandas import ( # noqa: F401 TensorDtype, TensorArray, + TensorArrayElement, ) from ray.air.util.tensor_extensions.arrow import ( # noqa: F401 ArrowTensorType, diff --git a/python/ray/serve/air_integrations.py b/python/ray/serve/air_integrations.py index d3868763d683..0fbd5a2dd472 100644 --- a/python/ray/serve/air_integrations.py +++ b/python/ray/serve/air_integrations.py @@ -81,6 +81,8 @@ def batch_dataframe(input_list: List["pd.DataFrame"]) -> "pd.DataFrame": def split_dataframe( output_df: "pd.DataFrame", batch_size: int ) -> List["pd.DataFrame"]: + from ray.data.extensions import TensorArray, TensorArrayElement + if not isinstance(output_df, pd.DataFrame): raise TypeError( "The output should be a Pandas DataFrame but Serve got " @@ -91,6 +93,15 @@ def split_dataframe( f"The output dataframe should have length divisible by {batch_size}, " f"but Serve got length {len(output_df)}." ) + + # In dl_predictor.py we return a pd.DataFrame that could have multiple + # columns but value of each column is a TensorArray. Flatten the + # TensorArray to list to ensure output is json serializable as http + # response. 
+ for col in output_df: + if isinstance(output_df[col].values, (TensorArray, TensorArrayElement)): + output_df[col] = output_df[col].to_numpy() + return [df.reset_index(drop=True) for df in np.split(output_df, batch_size)] @staticmethod diff --git a/python/ray/serve/tests/test_air_integrations.py b/python/ray/serve/tests/test_air_integrations.py index 26bd6f8a39c9..b0a222a69879 100644 --- a/python/ray/serve/tests/test_air_integrations.py +++ b/python/ray/serve/tests/test_air_integrations.py @@ -16,6 +16,7 @@ from ray.serve.deployment_graph_build import build from ray.serve.http_adapters import json_to_ndarray from ray.train.predictor import DataBatchType, Predictor +from ray.data.extensions import TensorArray class TestBatchingFunctionFunctions: @@ -73,6 +74,25 @@ def test_dataframe(self): for i, j in zip(unpacked_list, list_of_dfs): assert i.equals(j) + def test_dataframe_with_tensorarray(self): + batched_df = pd.DataFrame( + { + "a": TensorArray([1, 2, 3, 4]), + "b": TensorArray([5, 6, 7, 8]), + } + ) + split_df = pd.DataFrame( + { + "a": [1, 2, 3, 4], + "b": [5, 6, 7, 8], + } + ) + + unpacked_list = BatchingManager.split_dataframe(batched_df, 1) + assert len(unpacked_list) == 1 + assert unpacked_list[0]["a"].equals(split_df["a"]) + assert unpacked_list[0]["b"].equals(split_df["b"]) + class AdderPredictor(Predictor): def __init__(self, increment: int, do_double: bool) -> None: From b7db375b2112dc1f753bcab45ff799a04b040693 Mon Sep 17 00:00:00 2001 From: Jiao Dong Date: Wed, 20 Jul 2022 21:56:31 -0700 Subject: [PATCH 02/10] fix torch incremental learning notebook --- doc/source/ray-air/examples/BUILD | 1 - .../examples/torch_incremental_learning.ipynb | 2607 +++++++++++++---- 2 files changed, 2071 insertions(+), 537 deletions(-) diff --git a/doc/source/ray-air/examples/BUILD b/doc/source/ray-air/examples/BUILD index c78ae78ce2e4..f072ba92a86a 100644 --- a/doc/source/ray-air/examples/BUILD +++ b/doc/source/ray-air/examples/BUILD @@ -31,7 +31,6 @@ py_test_run_all_notebooks( include = ["*.ipynb"], exclude = [ "huggingface_text_classification.ipynb", - "torch_incremental_learning.ipynb", "feast_example.ipynb", # REGRESSION "rl_offline_example.ipynb", # REGRESSION "rl_online_example.ipynb", # REGRESSION diff --git a/doc/source/ray-air/examples/torch_incremental_learning.ipynb b/doc/source/ray-air/examples/torch_incremental_learning.ipynb index b7fb2c72aa27..8b090822b69f 100644 --- a/doc/source/ray-air/examples/torch_incremental_learning.ipynb +++ b/doc/source/ray-air/examples/torch_incremental_learning.ipynb @@ -75,133 +75,7 @@ "id": "kWr6BRMk1Y1j", "outputId": "dad49a31-a602-4e44-b5fe-932de603925e" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Requirement already satisfied: ray[data,serve,tune] in /usr/local/lib/python3.7/dist-packages (2.0.0.dev0)\n", - "Requirement already satisfied: numpy>=1.16 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (1.21.6)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (3.7.0)\n", - "Requirement already satisfied: grpcio!=1.44.0,>=1.28.1 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (1.43.0)\n", - "Requirement already satisfied: msgpack<2.0.0,>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (1.0.3)\n", - "Requirement already satisfied: protobuf>=3.15.3 in 
/usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (3.17.3)\n", - "Requirement already satisfied: aiosignal in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (1.2.0)\n", - "Requirement already satisfied: frozenlist in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (1.3.0)\n", - "Requirement already satisfied: virtualenv in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (20.14.1)\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (3.13)\n", - "Requirement already satisfied: attrs in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (21.4.0)\n", - "Requirement already satisfied: click<=8.0.4,>=7.0 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (7.1.2)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (2.23.0)\n", - "Requirement already satisfied: jsonschema in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (4.3.3)\n", - "Requirement already satisfied: pyarrow<7.0.0,>=6.0.1 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (6.0.1)\n", - "Requirement already satisfied: fsspec in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (2022.5.0)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (1.3.5)\n", - "Requirement already satisfied: tensorboardX>=1.9 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (2.5)\n", - "Requirement already satisfied: tabulate in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.8.9)\n", - "Requirement already satisfied: aiorwlock in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (1.3.0)\n", - "Requirement already satisfied: starlette in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.19.1)\n", - "Requirement already satisfied: prometheus-client<0.14.0,>=0.7.1 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.13.1)\n", - "Requirement already satisfied: py-spy>=0.2.0 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.3.12)\n", - "Requirement already satisfied: smart-open in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (6.0.0)\n", - "Requirement already satisfied: gpustat>=1.0.0b1 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (1.0.0b1)\n", - "Requirement already satisfied: colorful in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.5.4)\n", - "Requirement already satisfied: aiohttp>=3.7 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (3.8.1)\n", - "Requirement already satisfied: fastapi in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.78.0)\n", - "Requirement already satisfied: aiohttp-cors in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.7.0)\n", - "Requirement already satisfied: uvicorn==0.16.0 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.16.0)\n", - "Requirement already satisfied: opencensus in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.9.0)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from uvicorn==0.16.0->ray[data,serve,tune]) (4.2.0)\n", - "Requirement already satisfied: asgiref>=3.4.0 in /usr/local/lib/python3.7/dist-packages (from 
uvicorn==0.16.0->ray[data,serve,tune]) (3.5.2)\n", - "Requirement already satisfied: h11>=0.8 in /usr/local/lib/python3.7/dist-packages (from uvicorn==0.16.0->ray[data,serve,tune]) (0.13.0)\n", - "Requirement already satisfied: asynctest==0.13.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp>=3.7->ray[data,serve,tune]) (0.13.0)\n", - "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp>=3.7->ray[data,serve,tune]) (2.0.12)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp>=3.7->ray[data,serve,tune]) (6.0.2)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.7/dist-packages (from aiohttp>=3.7->ray[data,serve,tune]) (4.0.2)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp>=3.7->ray[data,serve,tune]) (1.7.2)\n", - "Requirement already satisfied: psutil in /usr/local/lib/python3.7/dist-packages (from gpustat>=1.0.0b1->ray[data,serve,tune]) (5.4.8)\n", - "Requirement already satisfied: six>=1.7 in /usr/local/lib/python3.7/dist-packages (from gpustat>=1.0.0b1->ray[data,serve,tune]) (1.15.0)\n", - "Requirement already satisfied: blessed>=1.17.1 in /usr/local/lib/python3.7/dist-packages (from gpustat>=1.0.0b1->ray[data,serve,tune]) (1.19.1)\n", - "Requirement already satisfied: nvidia-ml-py3>=7.352.0 in /usr/local/lib/python3.7/dist-packages (from gpustat>=1.0.0b1->ray[data,serve,tune]) (7.352.0)\n", - "Requirement already satisfied: wcwidth>=0.1.4 in /usr/local/lib/python3.7/dist-packages (from blessed>=1.17.1->gpustat>=1.0.0b1->ray[data,serve,tune]) (0.2.5)\n", - "Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.7/dist-packages (from yarl<2.0,>=1.0->aiohttp>=3.7->ray[data,serve,tune]) (2.10)\n", - "Requirement already satisfied: pydantic!=1.7,!=1.7.1,!=1.7.2,!=1.7.3,!=1.8,!=1.8.1,<2.0.0,>=1.6.2 in /usr/local/lib/python3.7/dist-packages (from fastapi->ray[data,serve,tune]) (1.9.1)\n", - "Requirement already satisfied: anyio<5,>=3.4.0 in /usr/local/lib/python3.7/dist-packages (from starlette->ray[data,serve,tune]) (3.6.1)\n", - "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.7/dist-packages (from anyio<5,>=3.4.0->starlette->ray[data,serve,tune]) (1.2.0)\n", - "Requirement already satisfied: importlib-resources>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema->ray[data,serve,tune]) (5.7.1)\n", - "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema->ray[data,serve,tune]) (0.18.1)\n", - "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from jsonschema->ray[data,serve,tune]) (4.11.3)\n", - "Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.7/dist-packages (from importlib-resources>=1.4.0->jsonschema->ray[data,serve,tune]) (3.8.0)\n", - "Requirement already satisfied: google-api-core<3.0.0,>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from opencensus->ray[data,serve,tune]) (1.31.5)\n", - "Requirement already satisfied: opencensus-context>=0.1.2 in /usr/local/lib/python3.7/dist-packages (from opencensus->ray[data,serve,tune]) (0.1.2)\n", - "Requirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (2022.1)\n", - "Requirement already satisfied: google-auth<2.0dev,>=1.25.0 in 
/usr/local/lib/python3.7/dist-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (1.35.0)\n", - "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (1.56.1)\n", - "Requirement already satisfied: setuptools>=40.3.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (57.4.0)\n", - "Requirement already satisfied: packaging>=14.3 in /usr/local/lib/python3.7/dist-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (21.3)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from google-auth<2.0dev,>=1.25.0->google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (0.2.8)\n", - "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.7/dist-packages (from google-auth<2.0dev,>=1.25.0->google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (4.8)\n", - "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from google-auth<2.0dev,>=1.25.0->google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (4.2.4)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=14.3->google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (3.0.9)\n", - "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.7/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2.0dev,>=1.25.0->google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (0.4.8)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->ray[data,serve,tune]) (1.24.3)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->ray[data,serve,tune]) (3.0.4)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->ray[data,serve,tune]) (2022.5.18.1)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->ray[data,serve,tune]) (2.8.2)\n", - "Requirement already satisfied: distlib<1,>=0.3.1 in /usr/local/lib/python3.7/dist-packages (from virtualenv->ray[data,serve,tune]) (0.3.4)\n", - "Requirement already satisfied: platformdirs<3,>=2 in /usr/local/lib/python3.7/dist-packages (from virtualenv->ray[data,serve,tune]) (2.5.2)\n", - "Found existing installation: ray 2.0.0.dev0\n", - "Uninstalling ray-2.0.0.dev0:\n", - " Successfully uninstalled ray-2.0.0.dev0\n", - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Collecting ray==3.0.0.dev0\n", - " Downloading https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl (54.9 MB)\n", - "\u001b[K |████████████████████████████████| 54.9 MB 74.4 MB/s \n", - "\u001b[?25hRequirement already satisfied: msgpack<2.0.0,>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (1.0.3)\n", - "Requirement already satisfied: virtualenv in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (20.14.1)\n", - "Requirement already satisfied: protobuf>=3.15.3 in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (3.17.3)\n", - "Requirement already satisfied: pyyaml in 
/usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (3.13)\n", - "Requirement already satisfied: click<=8.0.4,>=7.0 in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (7.1.2)\n", - "Requirement already satisfied: attrs in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (21.4.0)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (2.23.0)\n", - "Requirement already satisfied: frozenlist in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (1.3.0)\n", - "Requirement already satisfied: aiosignal in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (1.2.0)\n", - "Requirement already satisfied: numpy>=1.16 in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (1.21.6)\n", - "Requirement already satisfied: jsonschema in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (4.3.3)\n", - "Requirement already satisfied: grpcio<=1.43.0,>=1.28.1 in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (1.43.0)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (3.7.0)\n", - "Requirement already satisfied: six>=1.5.2 in /usr/local/lib/python3.7/dist-packages (from grpcio<=1.43.0,>=1.28.1->ray==3.0.0.dev0) (1.15.0)\n", - "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema->ray==3.0.0.dev0) (0.18.1)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from jsonschema->ray==3.0.0.dev0) (4.2.0)\n", - "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from jsonschema->ray==3.0.0.dev0) (4.11.3)\n", - "Requirement already satisfied: importlib-resources>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema->ray==3.0.0.dev0) (5.7.1)\n", - "Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.7/dist-packages (from importlib-resources>=1.4.0->jsonschema->ray==3.0.0.dev0) (3.8.0)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->ray==3.0.0.dev0) (2.10)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->ray==3.0.0.dev0) (3.0.4)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->ray==3.0.0.dev0) (1.24.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->ray==3.0.0.dev0) (2022.5.18.1)\n", - "Requirement already satisfied: platformdirs<3,>=2 in /usr/local/lib/python3.7/dist-packages (from virtualenv->ray==3.0.0.dev0) (2.5.2)\n", - "Requirement already satisfied: distlib<1,>=0.3.1 in /usr/local/lib/python3.7/dist-packages (from virtualenv->ray==3.0.0.dev0) (0.3.4)\n", - "Installing collected packages: ray\n", - "Successfully installed ray-3.0.0.dev0\n", - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (1.11.0+cu113)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch) (4.2.0)\n", - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Requirement already satisfied: torchvision in 
/usr/local/lib/python3.7/dist-packages (0.12.0+cu113)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from torchvision) (2.23.0)\n", - "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.7/dist-packages (from torchvision) (7.1.2)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from torchvision) (1.21.6)\n", - "Requirement already satisfied: torch==1.11.0 in /usr/local/lib/python3.7/dist-packages (from torchvision) (1.11.0+cu113)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torchvision) (4.2.0)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->torchvision) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->torchvision) (2022.5.18.1)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->torchvision) (3.0.4)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->torchvision) (1.24.3)\n" - ] - } - ], + "outputs": [], "source": [ "# !pip install -q \"ray[air]\"\n", "# !pip install -q torch\n", @@ -232,13 +106,56 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-05-25 22:25:31,150\tINFO services.py:1483 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" + "2022-07-20 21:23:11,138\tINFO services.py:1483 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" ] }, { "data": { + "text/html": [ + "
\n", + "
\n", + "

Ray

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "
Python version:3.7.10
Ray version: 3.0.0.dev0
Dashboard:http://127.0.0.1:8265
\n", + "
\n", + "
\n" + ], "text/plain": [ - "RayContext(dashboard_url='127.0.0.1:8265', python_version='3.7.13', ray_version='3.0.0.dev0', ray_commit='ac620aeec0c0f68c92328ace0b2a5835f5b14b26', address_info={'node_ip_address': '172.28.0.2', 'raylet_ip_address': '172.28.0.2', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-05-25_22-25-28_641559_1518/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-05-25_22-25-28_641559_1518/sockets/raylet', 'webui_url': '127.0.0.1:8265', 'session_dir': '/tmp/ray/session_2022-05-25_22-25-28_641559_1518', 'metrics_export_port': 61030, 'gcs_address': '172.28.0.2:62940', 'address': '172.28.0.2:62940', 'node_id': '97455d0de12f3393126427ed2b1ef0a009f0bd3fb97177cb86b42d92'})" + "RayContext(dashboard_url='127.0.0.1:8265', python_version='3.7.10', ray_version='3.0.0.dev0', ray_commit='{{RAY_COMMIT_SHA}}', address_info={'node_ip_address': '127.0.0.1', 'raylet_ip_address': '127.0.0.1', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-07-20_21-23-08_582230_36728/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-07-20_21-23-08_582230_36728/sockets/raylet', 'webui_url': '127.0.0.1:8265', 'session_dir': '/tmp/ray/session_2022-07-20_21-23-08_582230_36728', 'metrics_export_port': 65519, 'gcs_address': '127.0.0.1:63150', 'address': '127.0.0.1:63150', 'dashboard_agent_listen_port': 52365, 'node_id': '1fe76f557e913c317f0ba10b2c21a49a382ea1434302babe8f56c8b5'})" ] }, "execution_count": 2, @@ -270,7 +187,16 @@ "metadata": { "id": "3TVkSmFFCHhI" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "import torch.nn as nn\n", "\n", @@ -595,6 +521,7 @@ "outputs": [], "source": [ "from ray.data.preprocessors import BatchMapper\n", + "from ray.data.extensions import TensorArray\n", "\n", "from torchvision import transforms\n", "\n", @@ -605,7 +532,7 @@ " [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]\n", " )\n", "\n", - " df[\"image\"] = df[\"image\"].map(torchvision_transforms)\n", + " df[\"image\"] = TensorArray([torchvision_transforms(image) for image in df[\"image\"]])\n", " return df\n", "\n", "mnist_normalize_preprocessor = BatchMapper(fn=preprocess_images)" @@ -704,7 +631,13 @@ " return df\n", "\n", " deployment = PredictorDeployment.options(name=\"mnist_model\", route_prefix=\"/mnist_predict\", version=f\"v{task_idx}\", num_replicas=2)\n", - " deployment.deploy(batching_params=False, http_adapter=json_to_pandas, predictor_cls=TorchPredictor, checkpoint=latest_checkpoint, model=SimpleMLP(num_classes=10))\n", + " deployment.deploy(\n", + " batching_params=dict(max_batch_size=10, batch_wait_timeout_s=5),\n", + " http_adapter=json_to_pandas, \n", + " predictor_cls=TorchPredictor, \n", + " checkpoint=latest_checkpoint, \n", + " model=SimpleMLP(num_classes=10)\n", + " )\n", " return deployment.url\n", "\n", "# Function that queries our deployed model\n", @@ -753,14 +686,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(_prepare_read pid=1772)\u001b[0m 2022-05-25 22:25:35,236\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. 
The `parallelism` argument will be ignored.\n", - "Read->Map_Batches: 100%|██████████| 1/1 [00:05<00:00, 5.92s/it]\n", - "\u001b[2m\u001b[36m(_prepare_read pid=1772)\u001b[0m 2022-05-25 22:25:53,593\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n", - "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.51it/s]\n", - "Map Progress (2 actors 1 pending): 100%|██████████| 1/1 [00:02<00:00, 2.72s/it]\n", - "\u001b[2m\u001b[36m(_prepare_read pid=1978)\u001b[0m 2022-05-25 22:25:58,761\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n", - "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.41it/s]\n", - "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.37s/it]" + "2022-07-20 21:23:15,792\tWARNING read_api.py:291 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", + "\u001b[2m\u001b[36m(_get_read_tasks pid=36810)\u001b[0m 2022-07-20 21:23:15,789\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n", + "Read->Map_Batches: 100%|██████████| 1/1 [00:07<00:00, 7.10s/it]\n", + "2022-07-20 21:23:29,813\tWARNING read_api.py:291 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", + "\u001b[2m\u001b[36m(_get_read_tasks pid=36810)\u001b[0m 2022-07-20 21:23:29,811\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n", + "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.13it/s]\n", + "Map Progress (2 actors 0 pending): 100%|██████████| 1/1 [00:02<00:00, 2.37s/it]\n", + "2022-07-20 21:23:33,089\tWARNING read_api.py:291 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", + "\u001b[2m\u001b[36m(_get_read_tasks pid=36810)\u001b[0m 2022-07-20 21:23:33,086\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n", + "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.05it/s]\n", + "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.33s/it]\n", + "2022-07-20 21:23:35,331\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n" ] }, { @@ -770,22 +707,15 @@ "Starting training for task: 0\n" ] }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, { "data": { "text/html": [ - "== Status ==
Current time: 2022-05-25 22:27:16 (running for 00:01:14.46)
Memory usage on this node: 4.7/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.31 GiB heap, 0.0/3.66 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/TorchTrainer_2022-05-25_22-26-01
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-07-20 21:24:15 (running for 00:00:39.94)
Memory usage on this node: 33.0/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.02 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-23-35
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
TorchTrainer_a8585_00000TERMINATED172.28.0.2:2126
TorchTrainer_e2f7a_00000TERMINATED127.0.0.1:36887 4 36.469820.125 1658377454 6.31464


" ], @@ -800,125 +730,625 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(_map_block_nosplit pid=2159)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(_map_block_nosplit pid=2159)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m 2022-05-25 22:26:19,944\tINFO torch.py:347 -- Setting up process group for: env:// [rank=0, world_size=1]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m 2022-05-25 22:26:20,033\tINFO torch.py:98 -- Moving model to device: cuda:0\n" + "2022-07-20 21:23:35,495\tINFO plugin_schema_manager.py:52 -- Loading the default runtime env schemas: ['/Users/jiaodong/Workspace/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/working_dir_schema.json', '/Users/jiaodong/Workspace/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/pip_schema.json'].\n", + "\u001b[2m\u001b[36m(TorchTrainer pid=36887)\u001b[0m 2022-07-20 21:23:38,548\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", + "\u001b[2m\u001b[36m(_map_block_nosplit pid=36896)\u001b[0m /Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/torchvision/transforms/functional.py:150: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", + "\u001b[2m\u001b[36m(_map_block_nosplit pid=36896)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m 2022-07-20 21:23:49,104\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m 2022-07-20 21:23:49,297\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. 
You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 2.315190, epoch: 0, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 1.464406, epoch: 0, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 1.279081, epoch: 0, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 1.052461, epoch: 0, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.816213, epoch: 1, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 1.019127, epoch: 1, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.525613, epoch: 1, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.570595, epoch: 1, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.572004, epoch: 2, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.543432, epoch: 2, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.350156, epoch: 2, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.443743, epoch: 2, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.438318, epoch: 3, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.342512, epoch: 3, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.302048, epoch: 3, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.414025, epoch: 3, iteration: 1500\n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 2.361676, epoch: 0, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 1.813834, epoch: 0, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 1.215798, epoch: 0, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.784447, epoch: 0, iteration: 1500\n", + "Result for TorchTrainer_e2f7a_00000:\n", + " _time_this_iter_s: 6.658920049667358\n", + " _timestamp: 1658377435\n", + " _training_iteration: 1\n", + " date: 2022-07-20_21-23-56\n", + " done: false\n", + " experiment_id: 19d3571dcd7f4d2b9193be27d191305c\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 1\n", + " loss: 2443.240612268448\n", + " node_ip: 127.0.0.1\n", + " pid: 36887\n", + " should_checkpoint: true\n", + " time_since_restore: 17.467722177505493\n", + " time_this_iter_s: 17.467722177505493\n", + " time_total_s: 17.467722177505493\n", + " timestamp: 1658377436\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: e2f7a_00000\n", + " warmup_time: 0.003819704055786133\n", + " \n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.788217, epoch: 1, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.849067, epoch: 1, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 
0.771431, epoch: 1, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.774948, epoch: 1, iteration: 1500\n", + "Result for TorchTrainer_e2f7a_00000:\n", + " _time_this_iter_s: 6.489426851272583\n", + " _timestamp: 1658377442\n", + " _training_iteration: 2\n", + " date: 2022-07-20_21-24-02\n", + " done: false\n", + " experiment_id: 19d3571dcd7f4d2b9193be27d191305c\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 2\n", + " loss: 1298.5548962950706\n", + " node_ip: 127.0.0.1\n", + " pid: 36887\n", + " should_checkpoint: true\n", + " time_since_restore: 23.950473070144653\n", + " time_this_iter_s: 6.48275089263916\n", + " time_total_s: 23.950473070144653\n", + " timestamp: 1658377442\n", + " timesteps_since_restore: 0\n", + " training_iteration: 2\n", + " trial_id: e2f7a_00000\n", + " warmup_time: 0.003819704055786133\n", + " \n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.543702, epoch: 2, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.656774, epoch: 2, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.542017, epoch: 2, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.437269, epoch: 2, iteration: 1500\n", + "Result for TorchTrainer_e2f7a_00000:\n", + " _time_this_iter_s: 6.202568054199219\n", + " _timestamp: 1658377448\n", + " _training_iteration: 3\n", + " date: 2022-07-20_21-24-08\n", + " done: false\n", + " experiment_id: 19d3571dcd7f4d2b9193be27d191305c\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 3\n", + " loss: 976.985466003418\n", + " node_ip: 127.0.0.1\n", + " pid: 36887\n", + " should_checkpoint: true\n", + " time_since_restore: 30.153619050979614\n", + " time_this_iter_s: 6.203145980834961\n", + " time_total_s: 30.153619050979614\n", + " timestamp: 1658377448\n", + " timesteps_since_restore: 0\n", + " training_iteration: 3\n", + " trial_id: e2f7a_00000\n", + " warmup_time: 0.003819704055786133\n", + " \n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.354292, epoch: 3, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.544945, epoch: 3, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.494318, epoch: 3, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.518876, epoch: 3, iteration: 1500\n", + "Result for TorchTrainer_e2f7a_00000:\n", + " _time_this_iter_s: 6.314639091491699\n", + " _timestamp: 1658377454\n", + " _training_iteration: 4\n", + " date: 2022-07-20_21-24-15\n", + " done: false\n", + " experiment_id: 19d3571dcd7f4d2b9193be27d191305c\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 4\n", + " loss: 820.125178091228\n", + " node_ip: 127.0.0.1\n", + " pid: 36887\n", + " should_checkpoint: true\n", + " time_since_restore: 36.46902513504028\n", + " time_this_iter_s: 6.315406084060669\n", + " time_total_s: 36.46902513504028\n", + " timestamp: 1658377455\n", + " timesteps_since_restore: 0\n", + " training_iteration: 4\n", + " trial_id: e2f7a_00000\n", + " warmup_time: 0.003819704055786133\n", + " \n", + "Result for TorchTrainer_e2f7a_00000:\n", + " _time_this_iter_s: 6.314639091491699\n", + " _timestamp: 1658377454\n", + " _training_iteration: 4\n", + " date: 2022-07-20_21-24-15\n", + " done: true\n", + " experiment_id: 19d3571dcd7f4d2b9193be27d191305c\n", + " experiment_tag: '0'\n", + 
" hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 4\n", + " loss: 820.125178091228\n", + " node_ip: 127.0.0.1\n", + " pid: 36887\n", + " should_checkpoint: true\n", + " time_since_restore: 36.46902513504028\n", + " time_this_iter_s: 6.315406084060669\n", + " time_total_s: 36.46902513504028\n", + " timestamp: 1658377455\n", + " timesteps_since_restore: 0\n", + " training_iteration: 4\n", + " trial_id: e2f7a_00000\n", + " warmup_time: 0.003819704055786133\n", + " \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-05-25 22:27:16,013\tERROR checkpoint_manager.py:193 -- Result dict has no key: training_iteration. checkpoint_score_attr must be set to a key of the result dict. Valid keys are ['trial_id', 'experiment_id', 'date', 'timestamp', 'pid', 'hostname', 'node_ip', 'config', 'done']\n" + "2022-07-20 21:24:15,426\tINFO tune.py:738 -- Total run time: 40.08 seconds (39.93 seconds for the tuning loop).\n", + "Map Progress (1 actors 1 pending): 0%| | 0/1 [00:01Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.20it/s]\n", - "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.41s/it]" + "\u001b[2m\u001b[36m(ServeController pid=37002)\u001b[0m INFO 2022-07-20 21:24:19,927 controller 37002 checkpoint_path.py:17 - Using RayInternalKVStore for controller checkpoint and recovery.\n", + "\u001b[2m\u001b[36m(ServeController pid=37002)\u001b[0m INFO 2022-07-20 21:24:19,929 controller 37002 http_state.py:126 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:kNlRSH:SERVE_PROXY_ACTOR-1fe76f557e913c317f0ba10b2c21a49a382ea1434302babe8f56c8b5' on node '1fe76f557e913c317f0ba10b2c21a49a382ea1434302babe8f56c8b5' listening on '127.0.0.1:8000'\n", + "Shuffle Map: 0%| | 0/1 [00:00Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.32it/s]\n", + "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.34s/it]\n", + "2022-07-20 21:24:31,067\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n" ] }, { @@ -928,22 +1358,15 @@ "Starting training for task: 1\n" ] }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, { "data": { "text/html": [ - "== Status ==
Current time: 2022-05-25 22:28:52 (running for 00:01:09.00)
Memory usage on this node: 5.0/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.31 GiB heap, 0.0/3.66 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/TorchTrainer_2022-05-25_22-27-43
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-07-20 21:25:08 (running for 00:00:37.85)
Memory usage on this node: 33.7/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.02 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-24-31
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
TorchTrainer_e4f66_00000TERMINATED172.28.0.2:2875
TorchTrainer_0435d_00000TERMINATED127.0.0.1:37029 4 34.0956706.09 1658377508 6.57723


" ], @@ -958,125 +1381,633 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(_map_block_nosplit pid=2909)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(_map_block_nosplit pid=2909)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m 2022-05-25 22:28:01,917\tINFO torch.py:347 -- Setting up process group for: env:// [rank=0, world_size=1]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m 2022-05-25 22:28:02,063\tINFO torch.py:98 -- Moving model to device: cuda:0\n" + "\u001b[2m\u001b[36m(TorchTrainer pid=37029)\u001b[0m 2022-07-20 21:24:34,273\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m 2022-07-20 21:24:41,746\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. 
(function operator())\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 3.347775, epoch: 0, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 1.343975, epoch: 0, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.768560, epoch: 0, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.607410, epoch: 0, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.578952, epoch: 1, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.473788, epoch: 1, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.609530, epoch: 1, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.741895, epoch: 1, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.417272, epoch: 2, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.510404, epoch: 2, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.422137, epoch: 2, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.403623, epoch: 2, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.384720, epoch: 3, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.414567, epoch: 3, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.274302, epoch: 3, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.348169, epoch: 3, iteration: 1500\n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 3.112837, epoch: 0, iteration: 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-05-25 22:28:52,221\tERROR checkpoint_manager.py:193 -- Result dict has no key: training_iteration. checkpoint_score_attr must be set to a key of the result dict. Valid keys are ['trial_id', 'experiment_id', 'date', 'timestamp', 'pid', 'hostname', 'node_ip', 'config', 'done']\n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m 2022-07-20 21:24:42,046\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Trial TorchTrainer_e4f66_00000 completed. 
Last result: \n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 1.363313, epoch: 0, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.807654, epoch: 0, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.658998, epoch: 0, iteration: 1500\n", + "Result for TorchTrainer_0435d_00000:\n", + " _time_this_iter_s: 6.752341032028198\n", + " _timestamp: 1658377488\n", + " _training_iteration: 1\n", + " date: 2022-07-20_21-24-48\n", + " done: false\n", + " experiment_id: d2d862df890d4f719f253db293ed8057\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 1\n", + " loss: 1784.0176878273487\n", + " node_ip: 127.0.0.1\n", + " pid: 37029\n", + " should_checkpoint: true\n", + " time_since_restore: 14.582754850387573\n", + " time_this_iter_s: 14.582754850387573\n", + " time_total_s: 14.582754850387573\n", + " timestamp: 1658377488\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: 0435d_00000\n", + " warmup_time: 0.005182027816772461\n", + " \n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.479975, epoch: 1, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.664035, epoch: 1, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.700321, epoch: 1, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.530712, epoch: 1, iteration: 1500\n", + "Result for TorchTrainer_0435d_00000:\n", + " _time_this_iter_s: 6.524201154708862\n", + " _timestamp: 1658377495\n", + " _training_iteration: 2\n", + " date: 2022-07-20_21-24-55\n", + " done: false\n", + " experiment_id: d2d862df890d4f719f253db293ed8057\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 2\n", + " loss: 996.3850839287043\n", + " node_ip: 127.0.0.1\n", + " pid: 37029\n", + " should_checkpoint: true\n", + " time_since_restore: 21.103912830352783\n", + " time_this_iter_s: 6.52115797996521\n", + " time_total_s: 21.103912830352783\n", + " timestamp: 1658377495\n", + " timesteps_since_restore: 0\n", + " training_iteration: 2\n", + " trial_id: 0435d_00000\n", + " warmup_time: 0.005182027816772461\n", + " \n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.523517, epoch: 2, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.680489, epoch: 2, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.451373, epoch: 2, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.406104, epoch: 2, iteration: 1500\n", + "Result for TorchTrainer_0435d_00000:\n", + " _time_this_iter_s: 6.414251804351807\n", + " _timestamp: 1658377501\n", + " _training_iteration: 3\n", + " date: 2022-07-20_21-25-01\n", + " done: false\n", + " experiment_id: d2d862df890d4f719f253db293ed8057\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 3\n", + " loss: 808.2543668076396\n", + " node_ip: 127.0.0.1\n", + " pid: 37029\n", + " should_checkpoint: true\n", + " time_since_restore: 27.517575979232788\n", + " time_this_iter_s: 6.413663148880005\n", + " time_total_s: 27.517575979232788\n", + " timestamp: 1658377501\n", + " timesteps_since_restore: 0\n", + " training_iteration: 3\n", + " trial_id: 0435d_00000\n", + " warmup_time: 0.005182027816772461\n", + " \n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.415548, epoch: 3, iteration: 
0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.341548, epoch: 3, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.467579, epoch: 3, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.341880, epoch: 3, iteration: 1500\n", + "Result for TorchTrainer_0435d_00000:\n", + " _time_this_iter_s: 6.577230215072632\n", + " _timestamp: 1658377508\n", + " _training_iteration: 4\n", + " date: 2022-07-20_21-25-08\n", + " done: false\n", + " experiment_id: d2d862df890d4f719f253db293ed8057\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 4\n", + " loss: 706.0895065665245\n", + " node_ip: 127.0.0.1\n", + " pid: 37029\n", + " should_checkpoint: true\n", + " time_since_restore: 34.09562683105469\n", + " time_this_iter_s: 6.578050851821899\n", + " time_total_s: 34.09562683105469\n", + " timestamp: 1658377508\n", + " timesteps_since_restore: 0\n", + " training_iteration: 4\n", + " trial_id: 0435d_00000\n", + " warmup_time: 0.005182027816772461\n", + " \n", + "Result for TorchTrainer_0435d_00000:\n", + " _time_this_iter_s: 6.577230215072632\n", + " _timestamp: 1658377508\n", + " _training_iteration: 4\n", + " date: 2022-07-20_21-25-08\n", + " done: true\n", + " experiment_id: d2d862df890d4f719f253db293ed8057\n", + " experiment_tag: '0'\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 4\n", + " loss: 706.0895065665245\n", + " node_ip: 127.0.0.1\n", + " pid: 37029\n", + " should_checkpoint: true\n", + " time_since_restore: 34.09562683105469\n", + " time_this_iter_s: 6.578050851821899\n", + " time_total_s: 34.09562683105469\n", + " timestamp: 1658377508\n", + " timesteps_since_restore: 0\n", + " training_iteration: 4\n", + " trial_id: 0435d_00000\n", + " warmup_time: 0.005182027816772461\n", + " \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-05-25 22:28:52,344\tINFO tune.py:753 -- Total run time: 69.20 seconds (68.99 seconds for the tuning loop).\n", - "Map Progress (1 actors 1 pending): 0%| | 0/2 [00:01Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.44it/s]\n", - "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.41s/it]" + "Shuffle Map: 100%|██████████| 1/1 [00:00<00:00, 6.11it/s]\n", + "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 6.15it/s]\n", + "Map Progress (1 actors 0 pending): 100%|██████████| 1/1 [00:01<00:00, 1.17s/it]\n", + "\u001b[2m\u001b[36m(ServeController pid=37002)\u001b[0m INFO 2022-07-20 21:25:15,141 controller 37002 deployment_state.py:1240 - Stopping 1 replicas of deployment 'mnist_model' with outdated versions.\n", + "\u001b[2m\u001b[36m(ServeController pid=37002)\u001b[0m INFO 2022-07-20 21:25:17,254 controller 37002 deployment_state.py:1281 - Adding 1 replicas to deployment 'mnist_model'.\n", + "\u001b[2m\u001b[36m(ServeController pid=37002)\u001b[0m INFO 2022-07-20 21:25:19,566 controller 37002 deployment_state.py:1240 - Stopping 1 replicas of deployment 'mnist_model' with outdated versions.\n", + "\u001b[2m\u001b[36m(ServeController pid=37002)\u001b[0m INFO 2022-07-20 21:25:21,680 controller 37002 deployment_state.py:1281 - Adding 1 replicas to deployment 'mnist_model'.\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=37005)\u001b[0m INFO 2022-07-20 21:25:24,165 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 307 3.8ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=37005)\u001b[0m INFO 2022-07-20 21:25:24,181 http_proxy 127.0.0.1 http_proxy.py:320 - POST 
/mnist_predict 500 13.2ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=37005)\u001b[0m INFO 2022-07-20 21:25:24,187 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 307 2.5ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=37005)\u001b[0m INFO 2022-07-20 21:25:24,199 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 500 9.2ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=37005)\u001b[0m INFO 2022-07-20 21:25:24,204 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 307 2.1ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=37005)\u001b[0m INFO 2022-07-20 21:25:24,213 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 500 7.3ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m ERROR 2022-07-20 21:25:24,177 mnist_model mnist_model#mWswJO replica.py:434 - Request failed due to TypeError:\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m Traceback (most recent call last):\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/Workspace/ray/python/ray/serve/replica.py\", line 416, in invoke_single\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m result = await method_to_call(*args, **kwargs)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/Workspace/ray/python/ray/serve/drivers.py\", line 78, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await self.app(request.scope, receive=request.receive, send=sender)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/fastapi/applications.py\", line 181, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await super().__call__(scope, receive, send)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/applications.py\", line 102, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await self.middleware_stack(scope, receive, send)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/middleware/errors.py\", line 181, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m raise exc from None\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/middleware/errors.py\", line 159, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await self.app(scope, receive, _send)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/exceptions.py\", line 82, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m raise exc from None\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/exceptions.py\", line 71, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await self.app(scope, receive, sender)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/routing.py\", line 550, in 
__call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await route.handle(scope, receive, send)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/routing.py\", line 227, in handle\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await self.app(scope, receive, send)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/routing.py\", line 41, in app\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m response = await func(request)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/fastapi/routing.py\", line 218, in app\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m background=background_tasks,\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/responses.py\", line 45, in __init__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m self.body = self.render(content)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/responses.py\", line 162, in render\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m separators=(\",\", \":\"),\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/json/__init__.py\", line 238, in dumps\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m **kw).encode(obj)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/json/encoder.py\", line 199, in encode\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m chunks = self.iterencode(o, _one_shot=True)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/json/encoder.py\", line 257, in iterencode\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m return _iterencode(o, 0)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/json/encoder.py\", line 179, in default\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m raise TypeError(f'Object of type {o.__class__.__name__} '\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m TypeError: Object of type TensorArrayElement is not JSON serializable\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m INFO 2022-07-20 21:25:24,180 mnist_model mnist_model#mWswJO replica.py:483 - HANDLE __call__ ERROR 9.5ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m ERROR 2022-07-20 21:25:24,197 mnist_model mnist_model#mWswJO replica.py:434 - Request failed due to TypeError:\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m Traceback (most recent call last):\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/Workspace/ray/python/ray/serve/replica.py\", line 416, in invoke_single\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m result = 
await method_to_call(*args, **kwargs)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/Workspace/ray/python/ray/serve/drivers.py\", line 78, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await self.app(request.scope, receive=request.receive, send=sender)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/fastapi/applications.py\", line 181, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await super().__call__(scope, receive, send)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/applications.py\", line 102, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await self.middleware_stack(scope, receive, send)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/middleware/errors.py\", line 181, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m raise exc from None\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/middleware/errors.py\", line 159, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await self.app(scope, receive, _send)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/exceptions.py\", line 82, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m raise exc from None\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/exceptions.py\", line 71, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await self.app(scope, receive, sender)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/routing.py\", line 550, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await route.handle(scope, receive, send)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/routing.py\", line 227, in handle\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await self.app(scope, receive, send)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/routing.py\", line 41, in app\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m response = await func(request)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/fastapi/routing.py\", line 218, in app\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m background=background_tasks,\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/responses.py\", line 45, in __init__\n", + 
"\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m self.body = self.render(content)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/responses.py\", line 162, in render\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m separators=(\",\", \":\"),\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/json/__init__.py\", line 238, in dumps\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m **kw).encode(obj)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/json/encoder.py\", line 199, in encode\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m chunks = self.iterencode(o, _one_shot=True)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/json/encoder.py\", line 257, in iterencode\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m return _iterencode(o, 0)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/json/encoder.py\", line 179, in default\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m raise TypeError(f'Object of type {o.__class__.__name__} '\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m TypeError: Object of type TensorArrayElement is not JSON serializable\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m INFO 2022-07-20 21:25:24,198 mnist_model mnist_model#mWswJO replica.py:483 - HANDLE __call__ ERROR 6.5ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m ERROR 2022-07-20 21:25:24,212 mnist_model mnist_model#mWswJO replica.py:434 - Request failed due to TypeError:\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m Traceback (most recent call last):\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/Workspace/ray/python/ray/serve/replica.py\", line 416, in invoke_single\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m result = await method_to_call(*args, **kwargs)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/Workspace/ray/python/ray/serve/drivers.py\", line 78, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await self.app(request.scope, receive=request.receive, send=sender)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/fastapi/applications.py\", line 181, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await super().__call__(scope, receive, send)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/applications.py\", line 102, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await self.middleware_stack(scope, receive, send)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/middleware/errors.py\", line 181, in __call__\n", + 
"\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m raise exc from None\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/middleware/errors.py\", line 159, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await self.app(scope, receive, _send)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/exceptions.py\", line 82, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m raise exc from None\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/exceptions.py\", line 71, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await self.app(scope, receive, sender)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/routing.py\", line 550, in __call__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await route.handle(scope, receive, send)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37148)\u001b[0m INFO 2022-07-20 21:25:24,165 mnist_model mnist_model#iaJZOy replica.py:483 - HANDLE __call__ OK 0.3ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37148)\u001b[0m INFO 2022-07-20 21:25:24,186 mnist_model mnist_model#iaJZOy replica.py:483 - HANDLE __call__ OK 0.2ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37148)\u001b[0m INFO 2022-07-20 21:25:24,203 mnist_model mnist_model#iaJZOy replica.py:483 - HANDLE __call__ OK 0.2ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=37005)\u001b[0m INFO 2022-07-20 21:25:24,218 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 307 2.1ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/routing.py\", line 227, in handle\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m await self.app(scope, receive, send)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/routing.py\", line 41, in app\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m response = await func(request)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/fastapi/routing.py\", line 218, in app\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m background=background_tasks,\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/responses.py\", line 45, in __init__\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m self.body = self.render(content)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/starlette/responses.py\", line 162, in render\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m separators=(\",\", \":\"),\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File 
\"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/json/__init__.py\", line 238, in dumps\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m **kw).encode(obj)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/json/encoder.py\", line 199, in encode\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m chunks = self.iterencode(o, _one_shot=True)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/json/encoder.py\", line 257, in iterencode\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m return _iterencode(o, 0)\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m File \"/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/json/encoder.py\", line 179, in default\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m raise TypeError(f'Object of type {o.__class__.__name__} '\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m TypeError: Object of type TensorArrayElement is not JSON serializable\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37145)\u001b[0m INFO 2022-07-20 21:25:24,212 mnist_model mnist_model#mWswJO replica.py:483 - HANDLE __call__ ERROR 5.1ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=37148)\u001b[0m INFO 2022-07-20 21:25:24,218 mnist_model mnist_model#iaJZOy replica.py:483 - HANDLE __call__ OK 0.2ms\n", + "Map_Batches: 0%| | 0/1 [00:00Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.09it/s]\n", + "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.37s/it]\n", + "2022-07-20 21:25:29,661\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n" ] }, { @@ -1086,22 +2017,15 @@ "Starting training for task: 2\n" ] }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, { "data": { "text/html": [ - "== Status ==
Current time: 2022-05-25 22:30:31 (running for 00:01:09.12)
Memory usage on this node: 5.0/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.31 GiB heap, 0.0/3.66 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/TorchTrainer_2022-05-25_22-29-22
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-07-20 21:26:07 (running for 00:00:37.91)
Memory usage on this node: 33.9/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.02 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-25-29
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
TorchTrainer_2040e_00000TERMINATED172.28.0.2:3703
TorchTrainer_2722b_00000TERMINATED127.0.0.1:37163 4 34.389674.492 1658377567 6.59337


" ], @@ -1116,123 +2040,628 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(_map_block_nosplit pid=3738)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(_map_block_nosplit pid=3738)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m 2022-05-25 22:29:41,392\tINFO torch.py:347 -- Setting up process group for: env:// [rank=0, world_size=1]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m 2022-05-25 22:29:41,549\tINFO torch.py:98 -- Moving model to device: cuda:0\n" + "\u001b[2m\u001b[36m(TorchTrainer pid=37163)\u001b[0m 2022-07-20 21:25:32,879\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m 2022-07-20 21:25:40,394\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. 
(function operator())\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 4.353125, epoch: 0, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 1.147782, epoch: 0, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.609233, epoch: 0, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.606812, epoch: 0, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.494777, epoch: 1, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.776362, epoch: 1, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.376833, epoch: 1, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.478181, epoch: 1, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.413856, epoch: 2, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.668218, epoch: 2, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.318078, epoch: 2, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.427121, epoch: 2, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.369263, epoch: 3, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.479945, epoch: 3, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.457482, epoch: 3, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.318416, epoch: 3, iteration: 1500\n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 3.695508, epoch: 0, iteration: 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-05-25 22:30:31,831\tERROR checkpoint_manager.py:193 -- Result dict has no key: training_iteration. checkpoint_score_attr must be set to a key of the result dict. Valid keys are ['trial_id', 'experiment_id', 'date', 'timestamp', 'pid', 'hostname', 'node_ip', 'config', 'done']\n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m 2022-07-20 21:25:40,627\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Trial TorchTrainer_2040e_00000 completed. 
Last result: \n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 1.271604, epoch: 0, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.773141, epoch: 0, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.778621, epoch: 0, iteration: 1500\n", + "Result for TorchTrainer_2722b_00000:\n", + " _time_this_iter_s: 6.677475929260254\n", + " _timestamp: 1658377547\n", + " _training_iteration: 1\n", + " date: 2022-07-20_21-25-47\n", + " done: false\n", + " experiment_id: 16c67c09690144f791d7400292b4ecea\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 1\n", + " loss: 1716.6645558029413\n", + " node_ip: 127.0.0.1\n", + " pid: 37163\n", + " should_checkpoint: true\n", + " time_since_restore: 14.479393005371094\n", + " time_this_iter_s: 14.479393005371094\n", + " time_total_s: 14.479393005371094\n", + " timestamp: 1658377547\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: 2722b_00000\n", + " warmup_time: 0.004639148712158203\n", + " \n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.640197, epoch: 1, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.708120, epoch: 1, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.539896, epoch: 1, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.414441, epoch: 1, iteration: 1500\n", + "Result for TorchTrainer_2722b_00000:\n", + " _time_this_iter_s: 6.717606067657471\n", + " _timestamp: 1658377554\n", + " _training_iteration: 2\n", + " date: 2022-07-20_21-25-54\n", + " done: false\n", + " experiment_id: 16c67c09690144f791d7400292b4ecea\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 2\n", + " loss: 947.4811083450913\n", + " node_ip: 127.0.0.1\n", + " pid: 37163\n", + " should_checkpoint: true\n", + " time_since_restore: 21.194233894348145\n", + " time_this_iter_s: 6.714840888977051\n", + " time_total_s: 21.194233894348145\n", + " timestamp: 1658377554\n", + " timesteps_since_restore: 0\n", + " training_iteration: 2\n", + " trial_id: 2722b_00000\n", + " warmup_time: 0.004639148712158203\n", + " \n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.340714, epoch: 2, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.561278, epoch: 2, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.547913, epoch: 2, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.511139, epoch: 2, iteration: 1500\n", + "Result for TorchTrainer_2722b_00000:\n", + " _time_this_iter_s: 6.598260879516602\n", + " _timestamp: 1658377560\n", + " _training_iteration: 3\n", + " date: 2022-07-20_21-26-00\n", + " done: false\n", + " experiment_id: 16c67c09690144f791d7400292b4ecea\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 3\n", + " loss: 771.773955013603\n", + " node_ip: 127.0.0.1\n", + " pid: 37163\n", + " should_checkpoint: true\n", + " time_since_restore: 27.794183015823364\n", + " time_this_iter_s: 6.59994912147522\n", + " time_total_s: 27.794183015823364\n", + " timestamp: 1658377560\n", + " timesteps_since_restore: 0\n", + " training_iteration: 3\n", + " trial_id: 2722b_00000\n", + " warmup_time: 0.004639148712158203\n", + " \n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.297863, epoch: 3, iteration: 
0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.295025, epoch: 3, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.364217, epoch: 3, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.306977, epoch: 3, iteration: 1500\n", + "Result for TorchTrainer_2722b_00000:\n", + " _time_this_iter_s: 6.593369245529175\n", + " _timestamp: 1658377567\n", + " _training_iteration: 4\n", + " date: 2022-07-20_21-26-07\n", + " done: false\n", + " experiment_id: 16c67c09690144f791d7400292b4ecea\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 4\n", + " loss: 674.491991031915\n", + " node_ip: 127.0.0.1\n", + " pid: 37163\n", + " should_checkpoint: true\n", + " time_since_restore: 34.38901996612549\n", + " time_this_iter_s: 6.594836950302124\n", + " time_total_s: 34.38901996612549\n", + " timestamp: 1658377567\n", + " timesteps_since_restore: 0\n", + " training_iteration: 4\n", + " trial_id: 2722b_00000\n", + " warmup_time: 0.004639148712158203\n", + " \n", + "Result for TorchTrainer_2722b_00000:\n", + " _time_this_iter_s: 6.593369245529175\n", + " _timestamp: 1658377567\n", + " _training_iteration: 4\n", + " date: 2022-07-20_21-26-07\n", + " done: true\n", + " experiment_id: 16c67c09690144f791d7400292b4ecea\n", + " experiment_tag: '0'\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 4\n", + " loss: 674.491991031915\n", + " node_ip: 127.0.0.1\n", + " pid: 37163\n", + " should_checkpoint: true\n", + " time_since_restore: 34.38901996612549\n", + " time_this_iter_s: 6.594836950302124\n", + " time_total_s: 34.38901996612549\n", + " timestamp: 1658377567\n", + " timesteps_since_restore: 0\n", + " training_iteration: 4\n", + " trial_id: 2722b_00000\n", + " warmup_time: 0.004639148712158203\n", + " \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-05-25 22:30:31,953\tINFO tune.py:753 -- Total run time: 69.33 seconds (69.12 seconds for the tuning loop).\n", - "Map Progress (1 actors 1 pending): 0%| | 0/3 [00:01Current time: 2022-05-18 23:52:49 (running for 00:03:27.40)
Memory usage on this node: 7.0/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.34 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/TorchTrainer_2022-05-18_23-49-22
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-07-20 21:28:07 (running for 00:01:31.34)
Memory usage on this node: 34.3/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.02 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-26-36
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
TorchTrainer_24496_00000TERMINATED172.28.0.2:4630
TorchTrainer_4ec24_00000TERMINATED127.0.0.1:37313 4 87.33652329.82 1658377686 19.1051


" ], @@ -1481,85 +2902,188 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(_map_block_nosplit pid=4666)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(_map_block_nosplit pid=4666)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m 2022-05-18 23:50:06,950\tINFO torch.py:347 -- Setting up process group for: env:// [rank=0, world_size=1]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m 2022-05-18 23:50:07,011\tINFO torch.py:98 -- Moving model to device: cuda:0\n" + "\u001b[2m\u001b[36m(TorchTrainer pid=37313)\u001b[0m 2022-07-20 21:26:39,346\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", + "\u001b[2m\u001b[36m(_map_block_nosplit pid=37323)\u001b[0m /Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/torchvision/transforms/functional.py:150: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", + "\u001b[2m\u001b[36m(_map_block_nosplit pid=37323)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", + "\u001b[2m\u001b[36m(_map_block_nosplit pid=37322)\u001b[0m /Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/torchvision/transforms/functional.py:150: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", + "\u001b[2m\u001b[36m(_map_block_nosplit pid=37322)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m 2022-07-20 21:26:50,228\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. 
(function operator())\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m 2022-07-20 21:26:50,458\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 2.373475, epoch: 0, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 1.699985, epoch: 0, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 1.636039, epoch: 0, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 1.334987, epoch: 0, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 1.152312, epoch: 0, iteration: 2000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.998297, epoch: 0, iteration: 2500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 1.434949, epoch: 0, iteration: 3000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.971171, epoch: 0, iteration: 3500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.796480, epoch: 0, iteration: 4000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.802282, epoch: 0, iteration: 4500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.731363, epoch: 0, iteration: 5000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.847772, epoch: 0, iteration: 5500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.879676, epoch: 1, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.564319, epoch: 1, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.714444, epoch: 1, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.565163, epoch: 1, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.739525, epoch: 1, iteration: 2000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.510878, epoch: 1, iteration: 2500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.814798, epoch: 1, iteration: 3000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.473765, epoch: 1, iteration: 3500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.557866, epoch: 1, iteration: 4000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.674371, epoch: 1, iteration: 4500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.532800, epoch: 1, iteration: 5000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.832442, epoch: 1, iteration: 5500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.557547, epoch: 2, iteration: 0\n", - 
"\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.355255, epoch: 2, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.426749, epoch: 2, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.484543, epoch: 2, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.360856, epoch: 2, iteration: 2000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.444718, epoch: 2, iteration: 2500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.596777, epoch: 2, iteration: 3000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.289816, epoch: 2, iteration: 3500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.407941, epoch: 2, iteration: 4000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.438239, epoch: 2, iteration: 4500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.379983, epoch: 2, iteration: 5000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.527786, epoch: 2, iteration: 5500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.598584, epoch: 3, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.355202, epoch: 3, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.392683, epoch: 3, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.415264, epoch: 3, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.417230, epoch: 3, iteration: 2000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.289974, epoch: 3, iteration: 2500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.648514, epoch: 3, iteration: 3000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.369468, epoch: 3, iteration: 3500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.378548, epoch: 3, iteration: 4000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.392761, epoch: 3, iteration: 4500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.555575, epoch: 3, iteration: 5000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.394487, epoch: 3, iteration: 5500\n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 2.340395, epoch: 0, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 1.700694, epoch: 0, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 1.428385, epoch: 0, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 1.446490, epoch: 0, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 1.130699, epoch: 0, iteration: 2000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 1.275075, epoch: 0, iteration: 2500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 1.094344, epoch: 0, iteration: 3000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 1.151832, epoch: 0, iteration: 3500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.763781, epoch: 0, iteration: 4000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.623614, epoch: 0, iteration: 4500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.584415, epoch: 0, iteration: 5000\n", + 
"\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.908524, epoch: 0, iteration: 5500\n", + "Result for TorchTrainer_4ec24_00000:\n", + " _time_this_iter_s: 19.464027881622314\n", + " _timestamp: 1658377629\n", + " _training_iteration: 1\n", + " date: 2022-07-20_21-27-09\n", + " done: false\n", + " experiment_id: 26b75d3177b149f4a3193f17ba5171a2\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 1\n", + " loss: 6849.163370370865\n", + " node_ip: 127.0.0.1\n", + " pid: 37313\n", + " should_checkpoint: true\n", + " time_since_restore: 30.637123107910156\n", + " time_this_iter_s: 30.637123107910156\n", + " time_total_s: 30.637123107910156\n", + " timestamp: 1658377629\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: 4ec24_00000\n", + " warmup_time: 0.01962900161743164\n", + " \n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.597300, epoch: 1, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.673263, epoch: 1, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.692797, epoch: 1, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.748966, epoch: 1, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.493257, epoch: 1, iteration: 2000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.538625, epoch: 1, iteration: 2500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.523347, epoch: 1, iteration: 3000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.762524, epoch: 1, iteration: 3500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.556533, epoch: 1, iteration: 4000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.533555, epoch: 1, iteration: 4500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.425929, epoch: 1, iteration: 5000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.717063, epoch: 1, iteration: 5500\n", + "Result for TorchTrainer_4ec24_00000:\n", + " _time_this_iter_s: 18.69218420982361\n", + " _timestamp: 1658377648\n", + " _training_iteration: 2\n", + " date: 2022-07-20_21-27-28\n", + " done: false\n", + " experiment_id: 26b75d3177b149f4a3193f17ba5171a2\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 2\n", + " loss: 3535.0757773667574\n", + " node_ip: 127.0.0.1\n", + " pid: 37313\n", + " should_checkpoint: true\n", + " time_since_restore: 49.321765184402466\n", + " time_this_iter_s: 18.68464207649231\n", + " time_total_s: 49.321765184402466\n", + " timestamp: 1658377648\n", + " timesteps_since_restore: 0\n", + " training_iteration: 2\n", + " trial_id: 4ec24_00000\n", + " warmup_time: 0.01962900161743164\n", + " \n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.471813, epoch: 2, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.777419, epoch: 2, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.494634, epoch: 2, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.509953, epoch: 2, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.402481, epoch: 2, iteration: 2000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.339183, epoch: 2, iteration: 2500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin 
pid=37335)\u001b[0m loss: 0.422910, epoch: 2, iteration: 3000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.505549, epoch: 2, iteration: 3500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.379204, epoch: 2, iteration: 4000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.453385, epoch: 2, iteration: 4500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.291901, epoch: 2, iteration: 5000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.505701, epoch: 2, iteration: 5500\n", + "Result for TorchTrainer_4ec24_00000:\n", + " _time_this_iter_s: 18.91029191017151\n", + " _timestamp: 1658377667\n", + " _training_iteration: 3\n", + " date: 2022-07-20_21-27-47\n", + " done: false\n", + " experiment_id: 26b75d3177b149f4a3193f17ba5171a2\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 3\n", + " loss: 2748.917506597936\n", + " node_ip: 127.0.0.1\n", + " pid: 37313\n", + " should_checkpoint: true\n", + " time_since_restore: 68.22985124588013\n", + " time_this_iter_s: 18.90808606147766\n", + " time_total_s: 68.22985124588013\n", + " timestamp: 1658377667\n", + " timesteps_since_restore: 0\n", + " training_iteration: 3\n", + " trial_id: 4ec24_00000\n", + " warmup_time: 0.01962900161743164\n", + " \n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.323353, epoch: 3, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.490277, epoch: 3, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.325247, epoch: 3, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.448500, epoch: 3, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.288957, epoch: 3, iteration: 2000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.344484, epoch: 3, iteration: 2500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.361682, epoch: 3, iteration: 3000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.556559, epoch: 3, iteration: 3500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.271744, epoch: 3, iteration: 4000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.277652, epoch: 3, iteration: 4500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.368515, epoch: 3, iteration: 5000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.538925, epoch: 3, iteration: 5500\n", + "Result for TorchTrainer_4ec24_00000:\n", + " _time_this_iter_s: 19.10509490966797\n", + " _timestamp: 1658377686\n", + " _training_iteration: 4\n", + " date: 2022-07-20_21-28-06\n", + " done: false\n", + " experiment_id: 26b75d3177b149f4a3193f17ba5171a2\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 4\n", + " loss: 2329.821715295315\n", + " node_ip: 127.0.0.1\n", + " pid: 37313\n", + " should_checkpoint: true\n", + " time_since_restore: 87.33653211593628\n", + " time_this_iter_s: 19.106680870056152\n", + " time_total_s: 87.33653211593628\n", + " timestamp: 1658377686\n", + " timesteps_since_restore: 0\n", + " training_iteration: 4\n", + " trial_id: 4ec24_00000\n", + " warmup_time: 0.01962900161743164\n", + " \n", + "Result for TorchTrainer_4ec24_00000:\n", + " _time_this_iter_s: 19.10509490966797\n", + " _timestamp: 1658377686\n", + " _training_iteration: 4\n", + " date: 2022-07-20_21-28-06\n", 
+ " done: true\n", + " experiment_id: 26b75d3177b149f4a3193f17ba5171a2\n", + " experiment_tag: '0'\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 4\n", + " loss: 2329.821715295315\n", + " node_ip: 127.0.0.1\n", + " pid: 37313\n", + " should_checkpoint: true\n", + " time_since_restore: 87.33653211593628\n", + " time_this_iter_s: 19.106680870056152\n", + " time_total_s: 87.33653211593628\n", + " timestamp: 1658377686\n", + " timesteps_since_restore: 0\n", + " training_iteration: 4\n", + " trial_id: 4ec24_00000\n", + " warmup_time: 0.01962900161743164\n", + " \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-05-18 23:52:49,915\tERROR checkpoint_manager.py:193 -- Result dict has no key: training_iteration. checkpoint_score_attr must be set to a key of the result dict. Valid keys are ['trial_id', 'experiment_id', 'date', 'timestamp', 'pid', 'hostname', 'node_ip', 'config', 'done']\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Trial TorchTrainer_24496_00000 completed. Last result: \n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-05-18 23:52:50,042\tINFO tune.py:753 -- Total run time: 207.53 seconds (207.39 seconds for the tuning loop).\n" + "2022-07-20 21:28:07,659\tINFO tune.py:738 -- Total run time: 91.46 seconds (91.34 seconds for the tuning loop).\n" ] } ], @@ -1593,7 +3117,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1606,13 +3130,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "Map Progress (1 actors 1 pending): 0%| | 0/3 [00:01 Date: Wed, 20 Jul 2022 22:03:16 -0700 Subject: [PATCH 03/10] working incremental learning --- .../examples/torch_incremental_learning.ipynb | 2480 +++++------------ 1 file changed, 646 insertions(+), 1834 deletions(-) diff --git a/doc/source/ray-air/examples/torch_incremental_learning.ipynb b/doc/source/ray-air/examples/torch_incremental_learning.ipynb index 8b090822b69f..991634e26b61 100644 --- a/doc/source/ray-air/examples/torch_incremental_learning.ipynb +++ b/doc/source/ray-air/examples/torch_incremental_learning.ipynb @@ -106,7 +106,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-07-20 21:23:11,138\tINFO services.py:1483 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" + "2022-07-20 21:47:49,873\tINFO services.py:1483 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" ] }, { @@ -155,7 +155,7 @@ "\n" ], "text/plain": [ - "RayContext(dashboard_url='127.0.0.1:8265', python_version='3.7.10', ray_version='3.0.0.dev0', ray_commit='{{RAY_COMMIT_SHA}}', address_info={'node_ip_address': '127.0.0.1', 'raylet_ip_address': '127.0.0.1', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-07-20_21-23-08_582230_36728/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-07-20_21-23-08_582230_36728/sockets/raylet', 'webui_url': '127.0.0.1:8265', 'session_dir': '/tmp/ray/session_2022-07-20_21-23-08_582230_36728', 'metrics_export_port': 65519, 'gcs_address': '127.0.0.1:63150', 'address': '127.0.0.1:63150', 'dashboard_agent_listen_port': 52365, 'node_id': '1fe76f557e913c317f0ba10b2c21a49a382ea1434302babe8f56c8b5'})" + "RayContext(dashboard_url='127.0.0.1:8265', python_version='3.7.10', ray_version='3.0.0.dev0', ray_commit='{{RAY_COMMIT_SHA}}', address_info={'node_ip_address': '127.0.0.1', 
'raylet_ip_address': '127.0.0.1', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-07-20_21-47-47_297236_39344/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-07-20_21-47-47_297236_39344/sockets/raylet', 'webui_url': '127.0.0.1:8265', 'session_dir': '/tmp/ray/session_2022-07-20_21-47-47_297236_39344', 'metrics_export_port': 62008, 'gcs_address': '127.0.0.1:57307', 'address': '127.0.0.1:57307', 'dashboard_agent_listen_port': 52365, 'node_id': 'db68eafa3bbe9042df574f3c9974b40ce8d97728db90282feefb4690'})" ] }, "execution_count": 2, @@ -183,20 +183,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "id": "3TVkSmFFCHhI" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "import torch.nn as nn\n", "\n", @@ -686,18 +677,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-07-20 21:23:15,792\tWARNING read_api.py:291 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", - "\u001b[2m\u001b[36m(_get_read_tasks pid=36810)\u001b[0m 2022-07-20 21:23:15,789\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n", - "Read->Map_Batches: 100%|██████████| 1/1 [00:07<00:00, 7.10s/it]\n", - "2022-07-20 21:23:29,813\tWARNING read_api.py:291 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", - "\u001b[2m\u001b[36m(_get_read_tasks pid=36810)\u001b[0m 2022-07-20 21:23:29,811\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n", - "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.13it/s]\n", - "Map Progress (2 actors 0 pending): 100%|██████████| 1/1 [00:02<00:00, 2.37s/it]\n", - "2022-07-20 21:23:33,089\tWARNING read_api.py:291 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", - "\u001b[2m\u001b[36m(_get_read_tasks pid=36810)\u001b[0m 2022-07-20 21:23:33,086\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n", - "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.05it/s]\n", + "2022-07-20 21:47:54,492\tWARNING read_api.py:291 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", + "\u001b[2m\u001b[36m(_get_read_tasks pid=39493)\u001b[0m 2022-07-20 21:47:54,489\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. 
The `parallelism` argument will be ignored.\n", + "Read->Map_Batches: 100%|██████████| 1/1 [00:06<00:00, 6.40s/it]\n", + "2022-07-20 21:48:07,601\tWARNING read_api.py:291 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", + "\u001b[2m\u001b[36m(_get_read_tasks pid=39493)\u001b[0m 2022-07-20 21:48:07,599\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n", + "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.12it/s]\n", + "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:02<00:00, 2.34s/it]\n", + "2022-07-20 21:48:10,858\tWARNING read_api.py:291 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", + "\u001b[2m\u001b[36m(_get_read_tasks pid=39493)\u001b[0m 2022-07-20 21:48:10,856\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n", + "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.29it/s]\n", "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.33s/it]\n", - "2022-07-20 21:23:35,331\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n" + "2022-07-20 21:48:13,075\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n" ] }, { @@ -710,12 +701,12 @@ { "data": { "text/html": [ - "== Status ==
Current time: 2022-07-20 21:24:15 (running for 00:00:39.94)
Memory usage on this node: 33.0/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.02 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-23-35
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-07-20 21:48:52 (running for 00:00:39.66)
Memory usage on this node: 33.1/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.14 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-48-13
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
TorchTrainer_e2f7a_00000   TERMINATED   127.0.0.1:36887        4            36.469    820.125    1658377454             6.31464
TorchTrainer_53c58_00000   TERMINATED   127.0.0.1:39548        4            36.4582   824.229    1658378932             6.46339


" ], @@ -730,147 +721,159 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-07-20 21:23:35,495\tINFO plugin_schema_manager.py:52 -- Loading the default runtime env schemas: ['/Users/jiaodong/Workspace/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/working_dir_schema.json', '/Users/jiaodong/Workspace/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/pip_schema.json'].\n", - "\u001b[2m\u001b[36m(TorchTrainer pid=36887)\u001b[0m 2022-07-20 21:23:38,548\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", - "\u001b[2m\u001b[36m(_map_block_nosplit pid=36896)\u001b[0m /Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/torchvision/transforms/functional.py:150: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(_map_block_nosplit pid=36896)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m 2022-07-20 21:23:49,104\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m 2022-07-20 21:23:49,297\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. 
(Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n" + "2022-07-20 21:48:13,244\tINFO plugin_schema_manager.py:52 -- Loading the default runtime env schemas: ['/Users/jiaodong/Workspace/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/working_dir_schema.json', '/Users/jiaodong/Workspace/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/pip_schema.json'].\n", + "\u001b[2m\u001b[36m(TorchTrainer pid=39548)\u001b[0m 2022-07-20 21:48:16,191\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", + "\u001b[2m\u001b[36m(_map_block_nosplit pid=39557)\u001b[0m /Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/torchvision/transforms/functional.py:150: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", + "\u001b[2m\u001b[36m(_map_block_nosplit pid=39557)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m 2022-07-20 21:48:26,547\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 2.361676, epoch: 0, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 1.813834, epoch: 0, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 1.215798, epoch: 0, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.784447, epoch: 0, iteration: 1500\n", - "Result for TorchTrainer_e2f7a_00000:\n", - " _time_this_iter_s: 6.658920049667358\n", - " _timestamp: 1658377435\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 2.282040, epoch: 0, iteration: 0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m 2022-07-20 21:48:26,772\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. 
This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 1.521038, epoch: 0, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 1.169452, epoch: 0, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.856338, epoch: 0, iteration: 1500\n", + "Result for TorchTrainer_53c58_00000:\n", + " _time_this_iter_s: 6.627551078796387\n", + " _timestamp: 1658378913\n", " _training_iteration: 1\n", - " date: 2022-07-20_21-23-56\n", + " date: 2022-07-20_21-48-33\n", " done: false\n", - " experiment_id: 19d3571dcd7f4d2b9193be27d191305c\n", + " experiment_id: abc531ef544440268933d8221addeb9d\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 1\n", - " loss: 2443.240612268448\n", + " loss: 2453.753049403429\n", " node_ip: 127.0.0.1\n", - " pid: 36887\n", + " pid: 39548\n", " should_checkpoint: true\n", - " time_since_restore: 17.467722177505493\n", - " time_this_iter_s: 17.467722177505493\n", - " time_total_s: 17.467722177505493\n", - " timestamp: 1658377436\n", + " time_since_restore: 17.27033305168152\n", + " time_this_iter_s: 17.27033305168152\n", + " time_total_s: 17.27033305168152\n", + " timestamp: 1658378913\n", " timesteps_since_restore: 0\n", " training_iteration: 1\n", - " trial_id: e2f7a_00000\n", - " warmup_time: 0.003819704055786133\n", + " trial_id: 53c58_00000\n", + " warmup_time: 0.003597259521484375\n", " \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.788217, epoch: 1, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.849067, epoch: 1, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.771431, epoch: 1, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.774948, epoch: 1, iteration: 1500\n", - "Result for TorchTrainer_e2f7a_00000:\n", - " _time_this_iter_s: 6.489426851272583\n", - " _timestamp: 1658377442\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.788410, epoch: 1, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.854239, epoch: 1, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.533351, epoch: 1, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.591339, epoch: 1, iteration: 1500\n", + "Result for TorchTrainer_53c58_00000:\n", + " _time_this_iter_s: 6.389349937438965\n", + " _timestamp: 1658378919\n", " _training_iteration: 2\n", - " date: 2022-07-20_21-24-02\n", + " date: 2022-07-20_21-48-39\n", " done: false\n", - " experiment_id: 19d3571dcd7f4d2b9193be27d191305c\n", + " experiment_id: abc531ef544440268933d8221addeb9d\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 2\n", - " loss: 1298.5548962950706\n", + " loss: 1297.6828406900167\n", " node_ip: 127.0.0.1\n", - " pid: 36887\n", + " pid: 39548\n", " should_checkpoint: true\n", - " time_since_restore: 23.950473070144653\n", - " time_this_iter_s: 6.48275089263916\n", - " time_total_s: 23.950473070144653\n", - " timestamp: 1658377442\n", + " time_since_restore: 23.65428590774536\n", + " 
time_this_iter_s: 6.383952856063843\n", + " time_total_s: 23.65428590774536\n", + " timestamp: 1658378919\n", " timesteps_since_restore: 0\n", " training_iteration: 2\n", - " trial_id: e2f7a_00000\n", - " warmup_time: 0.003819704055786133\n", + " trial_id: 53c58_00000\n", + " warmup_time: 0.003597259521484375\n", " \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.543702, epoch: 2, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.656774, epoch: 2, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.542017, epoch: 2, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.437269, epoch: 2, iteration: 1500\n", - "Result for TorchTrainer_e2f7a_00000:\n", - " _time_this_iter_s: 6.202568054199219\n", - " _timestamp: 1658377448\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.457057, epoch: 2, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.594715, epoch: 2, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.477588, epoch: 2, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.235412, epoch: 2, iteration: 1500\n", + "Result for TorchTrainer_53c58_00000:\n", + " _time_this_iter_s: 6.340294122695923\n", + " _timestamp: 1658378926\n", " _training_iteration: 3\n", - " date: 2022-07-20_21-24-08\n", + " date: 2022-07-20_21-48-46\n", " done: false\n", - " experiment_id: 19d3571dcd7f4d2b9193be27d191305c\n", + " experiment_id: abc531ef544440268933d8221addeb9d\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 3\n", - " loss: 976.985466003418\n", + " loss: 983.3285144269466\n", " node_ip: 127.0.0.1\n", - " pid: 36887\n", + " pid: 39548\n", " should_checkpoint: true\n", - " time_since_restore: 30.153619050979614\n", - " time_this_iter_s: 6.203145980834961\n", - " time_total_s: 30.153619050979614\n", - " timestamp: 1658377448\n", + " time_since_restore: 29.994139909744263\n", + " time_this_iter_s: 6.339854001998901\n", + " time_total_s: 29.994139909744263\n", + " timestamp: 1658378926\n", " timesteps_since_restore: 0\n", " training_iteration: 3\n", - " trial_id: e2f7a_00000\n", - " warmup_time: 0.003819704055786133\n", + " trial_id: 53c58_00000\n", + " warmup_time: 0.003597259521484375\n", " \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.354292, epoch: 3, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.544945, epoch: 3, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.494318, epoch: 3, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=36903)\u001b[0m loss: 0.518876, epoch: 3, iteration: 1500\n", - "Result for TorchTrainer_e2f7a_00000:\n", - " _time_this_iter_s: 6.314639091491699\n", - " _timestamp: 1658377454\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.507374, epoch: 3, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.447128, epoch: 3, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.381943, epoch: 3, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.347877, epoch: 3, iteration: 1500\n", + "Result for TorchTrainer_53c58_00000:\n", + " _time_this_iter_s: 6.463389873504639\n", + " _timestamp: 1658378932\n", " _training_iteration: 4\n", - " date: 2022-07-20_21-24-15\n", + " date: 
2022-07-20_21-48-52\n", " done: false\n", - " experiment_id: 19d3571dcd7f4d2b9193be27d191305c\n", + " experiment_id: abc531ef544440268933d8221addeb9d\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 4\n", - " loss: 820.125178091228\n", + " loss: 824.2287287414074\n", " node_ip: 127.0.0.1\n", - " pid: 36887\n", + " pid: 39548\n", " should_checkpoint: true\n", - " time_since_restore: 36.46902513504028\n", - " time_this_iter_s: 6.315406084060669\n", - " time_total_s: 36.46902513504028\n", - " timestamp: 1658377455\n", + " time_since_restore: 36.45815992355347\n", + " time_this_iter_s: 6.464020013809204\n", + " time_total_s: 36.45815992355347\n", + " timestamp: 1658378932\n", " timesteps_since_restore: 0\n", " training_iteration: 4\n", - " trial_id: e2f7a_00000\n", - " warmup_time: 0.003819704055786133\n", + " trial_id: 53c58_00000\n", + " warmup_time: 0.003597259521484375\n", " \n", - "Result for TorchTrainer_e2f7a_00000:\n", - " _time_this_iter_s: 6.314639091491699\n", - " _timestamp: 1658377454\n", + "Result for TorchTrainer_53c58_00000:\n", + " _time_this_iter_s: 6.463389873504639\n", + " _timestamp: 1658378932\n", " _training_iteration: 4\n", - " date: 2022-07-20_21-24-15\n", + " date: 2022-07-20_21-48-52\n", " done: true\n", - " experiment_id: 19d3571dcd7f4d2b9193be27d191305c\n", + " experiment_id: abc531ef544440268933d8221addeb9d\n", " experiment_tag: '0'\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 4\n", - " loss: 820.125178091228\n", + " loss: 824.2287287414074\n", " node_ip: 127.0.0.1\n", - " pid: 36887\n", + " pid: 39548\n", " should_checkpoint: true\n", - " time_since_restore: 36.46902513504028\n", - " time_this_iter_s: 6.315406084060669\n", - " time_total_s: 36.46902513504028\n", - " timestamp: 1658377455\n", + " time_since_restore: 36.45815992355347\n", + " time_this_iter_s: 6.464020013809204\n", + " time_total_s: 36.45815992355347\n", + " timestamp: 1658378932\n", " timesteps_since_restore: 0\n", " training_iteration: 4\n", - " trial_id: e2f7a_00000\n", - " warmup_time: 0.003819704055786133\n", + " trial_id: 53c58_00000\n", + " warmup_time: 0.003597259521484375\n", " \n" ] }, @@ -878,477 +881,87 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-07-20 21:24:15,426\tINFO tune.py:738 -- Total run time: 40.08 seconds (39.93 seconds for the tuning loop).\n", - "Map Progress (1 actors 1 pending): 0%| | 0/1 [00:01Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.32it/s]\n", - "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.34s/it]\n", - "2022-07-20 21:24:31,067\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. 
See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n" + "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:48:57,458 controller 39625 checkpoint_path.py:17 - Using RayInternalKVStore for controller checkpoint and recovery.\n", + "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:48:57,460 controller 39625 http_state.py:126 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:oEzsmU:SERVE_PROXY_ACTOR-db68eafa3bbe9042df574f3c9974b40ce8d97728db90282feefb4690' on node 'db68eafa3bbe9042df574f3c9974b40ce8d97728db90282feefb4690' listening on '127.0.0.1:8000'\n", + "Shuffle Map: 0%| | 0/1 [00:00Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.39it/s]\n", + "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.37s/it]\n", + "2022-07-20 21:49:58,678\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n" ] }, { @@ -1361,12 +974,12 @@ { "data": { "text/html": [ - "== Status ==
Current time: 2022-07-20 21:25:08 (running for 00:00:37.85)
Memory usage on this node: 33.7/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.02 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-24-31
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-07-20 21:50:36 (running for 00:00:37.98)
Memory usage on this node: 33.7/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.14 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-49-58
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
TorchTrainer_0435d_00000   TERMINATED   127.0.0.1:37029        4            34.0956   706.09     1658377508             6.57723
TorchTrainer_92bcd_00000   TERMINATED   127.0.0.1:39736        4            34.1132   707.634    1658379035             6.45643


" ], @@ -1381,156 +994,156 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(TorchTrainer pid=37029)\u001b[0m 2022-07-20 21:24:34,273\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m 2022-07-20 21:24:41,746\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n" + "\u001b[2m\u001b[36m(TorchTrainer pid=39736)\u001b[0m 2022-07-20 21:50:01,936\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m 2022-07-20 21:50:09,489\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 3.112837, epoch: 0, iteration: 0\n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 3.301114, epoch: 0, iteration: 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m 2022-07-20 21:24:42,046\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m 2022-07-20 21:50:09,795\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. 
You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 1.363313, epoch: 0, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.807654, epoch: 0, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.658998, epoch: 0, iteration: 1500\n", - "Result for TorchTrainer_0435d_00000:\n", - " _time_this_iter_s: 6.752341032028198\n", - " _timestamp: 1658377488\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 1.075076, epoch: 0, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.536976, epoch: 0, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.600182, epoch: 0, iteration: 1500\n", + "Result for TorchTrainer_92bcd_00000:\n", + " _time_this_iter_s: 6.920917987823486\n", + " _timestamp: 1658379016\n", " _training_iteration: 1\n", - " date: 2022-07-20_21-24-48\n", + " date: 2022-07-20_21-50-16\n", " done: false\n", - " experiment_id: d2d862df890d4f719f253db293ed8057\n", + " experiment_id: 21820161d0a245428cf75b0b9b17fe6e\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 1\n", - " loss: 1784.0176878273487\n", + " loss: 1779.816628113389\n", " node_ip: 127.0.0.1\n", - " pid: 37029\n", + " pid: 39736\n", " should_checkpoint: true\n", - " time_since_restore: 14.582754850387573\n", - " time_this_iter_s: 14.582754850387573\n", - " time_total_s: 14.582754850387573\n", - " timestamp: 1658377488\n", + " time_since_restore: 14.83579397201538\n", + " time_this_iter_s: 14.83579397201538\n", + " time_total_s: 14.83579397201538\n", + " timestamp: 1658379016\n", " timesteps_since_restore: 0\n", " training_iteration: 1\n", - " trial_id: 0435d_00000\n", - " warmup_time: 0.005182027816772461\n", + " trial_id: 92bcd_00000\n", + " warmup_time: 0.005189180374145508\n", " \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.479975, epoch: 1, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.664035, epoch: 1, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.700321, epoch: 1, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.530712, epoch: 1, iteration: 1500\n", - "Result for TorchTrainer_0435d_00000:\n", - " _time_this_iter_s: 6.524201154708862\n", - " _timestamp: 1658377495\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.546070, epoch: 1, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.448120, epoch: 1, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.392481, epoch: 1, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.371981, epoch: 1, iteration: 1500\n", + "Result for TorchTrainer_92bcd_00000:\n", + " _time_this_iter_s: 6.480100154876709\n", + " _timestamp: 1658379023\n", " _training_iteration: 2\n", - " date: 2022-07-20_21-24-55\n", + " date: 2022-07-20_21-50-23\n", " done: false\n", - " experiment_id: 
d2d862df890d4f719f253db293ed8057\n", + " experiment_id: 21820161d0a245428cf75b0b9b17fe6e\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 2\n", - " loss: 996.3850839287043\n", + " loss: 1000.9026035964489\n", " node_ip: 127.0.0.1\n", - " pid: 37029\n", + " pid: 39736\n", " should_checkpoint: true\n", - " time_since_restore: 21.103912830352783\n", - " time_this_iter_s: 6.52115797996521\n", - " time_total_s: 21.103912830352783\n", - " timestamp: 1658377495\n", + " time_since_restore: 21.316986083984375\n", + " time_this_iter_s: 6.481192111968994\n", + " time_total_s: 21.316986083984375\n", + " timestamp: 1658379023\n", " timesteps_since_restore: 0\n", " training_iteration: 2\n", - " trial_id: 0435d_00000\n", - " warmup_time: 0.005182027816772461\n", + " trial_id: 92bcd_00000\n", + " warmup_time: 0.005189180374145508\n", " \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.523517, epoch: 2, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.680489, epoch: 2, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.451373, epoch: 2, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.406104, epoch: 2, iteration: 1500\n", - "Result for TorchTrainer_0435d_00000:\n", - " _time_this_iter_s: 6.414251804351807\n", - " _timestamp: 1658377501\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.521735, epoch: 2, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.635850, epoch: 2, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.395862, epoch: 2, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.402500, epoch: 2, iteration: 1500\n", + "Result for TorchTrainer_92bcd_00000:\n", + " _time_this_iter_s: 6.334350824356079\n", + " _timestamp: 1658379029\n", " _training_iteration: 3\n", - " date: 2022-07-20_21-25-01\n", + " date: 2022-07-20_21-50-29\n", " done: false\n", - " experiment_id: d2d862df890d4f719f253db293ed8057\n", + " experiment_id: 21820161d0a245428cf75b0b9b17fe6e\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 3\n", - " loss: 808.2543668076396\n", + " loss: 810.5124998539686\n", " node_ip: 127.0.0.1\n", - " pid: 37029\n", + " pid: 39736\n", " should_checkpoint: true\n", - " time_since_restore: 27.517575979232788\n", - " time_this_iter_s: 6.413663148880005\n", - " time_total_s: 27.517575979232788\n", - " timestamp: 1658377501\n", + " time_since_restore: 27.649451971054077\n", + " time_this_iter_s: 6.332465887069702\n", + " time_total_s: 27.649451971054077\n", + " timestamp: 1658379029\n", " timesteps_since_restore: 0\n", " training_iteration: 3\n", - " trial_id: 0435d_00000\n", - " warmup_time: 0.005182027816772461\n", + " trial_id: 92bcd_00000\n", + " warmup_time: 0.005189180374145508\n", " \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.415548, epoch: 3, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.341548, epoch: 3, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.467579, epoch: 3, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37039)\u001b[0m loss: 0.341880, epoch: 3, iteration: 1500\n", - "Result for TorchTrainer_0435d_00000:\n", - " _time_this_iter_s: 6.577230215072632\n", - " _timestamp: 1658377508\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.236922, 
epoch: 3, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.528482, epoch: 3, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.372242, epoch: 3, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.355759, epoch: 3, iteration: 1500\n", + "Result for TorchTrainer_92bcd_00000:\n", + " _time_this_iter_s: 6.456433057785034\n", + " _timestamp: 1658379035\n", " _training_iteration: 4\n", - " date: 2022-07-20_21-25-08\n", + " date: 2022-07-20_21-50-36\n", " done: false\n", - " experiment_id: d2d862df890d4f719f253db293ed8057\n", + " experiment_id: 21820161d0a245428cf75b0b9b17fe6e\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 4\n", - " loss: 706.0895065665245\n", + " loss: 707.6341038495302\n", " node_ip: 127.0.0.1\n", - " pid: 37029\n", + " pid: 39736\n", " should_checkpoint: true\n", - " time_since_restore: 34.09562683105469\n", - " time_this_iter_s: 6.578050851821899\n", - " time_total_s: 34.09562683105469\n", - " timestamp: 1658377508\n", + " time_since_restore: 34.11321783065796\n", + " time_this_iter_s: 6.463765859603882\n", + " time_total_s: 34.11321783065796\n", + " timestamp: 1658379036\n", " timesteps_since_restore: 0\n", " training_iteration: 4\n", - " trial_id: 0435d_00000\n", - " warmup_time: 0.005182027816772461\n", + " trial_id: 92bcd_00000\n", + " warmup_time: 0.005189180374145508\n", " \n", - "Result for TorchTrainer_0435d_00000:\n", - " _time_this_iter_s: 6.577230215072632\n", - " _timestamp: 1658377508\n", + "Result for TorchTrainer_92bcd_00000:\n", + " _time_this_iter_s: 6.456433057785034\n", + " _timestamp: 1658379035\n", " _training_iteration: 4\n", - " date: 2022-07-20_21-25-08\n", + " date: 2022-07-20_21-50-36\n", " done: true\n", - " experiment_id: d2d862df890d4f719f253db293ed8057\n", + " experiment_id: 21820161d0a245428cf75b0b9b17fe6e\n", " experiment_tag: '0'\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 4\n", - " loss: 706.0895065665245\n", + " loss: 707.6341038495302\n", " node_ip: 127.0.0.1\n", - " pid: 37029\n", + " pid: 39736\n", " should_checkpoint: true\n", - " time_since_restore: 34.09562683105469\n", - " time_this_iter_s: 6.578050851821899\n", - " time_total_s: 34.09562683105469\n", - " timestamp: 1658377508\n", + " time_since_restore: 34.11321783065796\n", + " time_this_iter_s: 6.463765859603882\n", + " time_total_s: 34.11321783065796\n", + " timestamp: 1658379036\n", " timesteps_since_restore: 0\n", " training_iteration: 4\n", - " trial_id: 0435d_00000\n", - " warmup_time: 0.005182027816772461\n", + " trial_id: 92bcd_00000\n", + " warmup_time: 0.005189180374145508\n", " \n" ] }, @@ -1538,476 +1151,86 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-07-20 21:25:09,090\tINFO tune.py:738 -- Total run time: 38.00 seconds (37.84 seconds for the tuning loop).\n", - "Map Progress (1 actors 1 pending): 0%| | 0/2 [00:01Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.09it/s]\n", + "Shuffle Map: 100%|██████████| 1/1 [00:00<00:00, 6.24it/s]\n", + "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 6.19it/s]\n", + "Map Progress (1 actors 0 pending): 100%|██████████| 1/1 [00:01<00:00, 1.18s/it]\n", + "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:50:42,924 controller 39625 deployment_state.py:1240 - Stopping 1 replicas of deployment 'mnist_model' with outdated versions.\n", + "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:50:45,044 
controller 39625 deployment_state.py:1281 - Adding 1 replicas to deployment 'mnist_model'.\n", + "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:50:47,377 controller 39625 deployment_state.py:1240 - Stopping 1 replicas of deployment 'mnist_model' with outdated versions.\n", + "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:50:49,504 controller 39625 deployment_state.py:1281 - Adding 1 replicas to deployment 'mnist_model'.\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:50:51,941 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 307 4.3ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39843)\u001b[0m INFO 2022-07-20 21:50:51,940 mnist_model mnist_model#ywyHkr replica.py:483 - HANDLE __call__ OK 0.3ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:50:56,959 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 200 5014.3ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:50:56,964 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 307 2.4ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39834)\u001b[0m INFO 2022-07-20 21:50:56,958 mnist_model mnist_model#GcMrER replica.py:483 - HANDLE __call__ OK 5010.7ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39843)\u001b[0m INFO 2022-07-20 21:50:56,964 mnist_model mnist_model#ywyHkr replica.py:483 - HANDLE __call__ OK 0.2ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:51:01,968 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 200 5000.3ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:51:01,974 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 307 2.6ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39834)\u001b[0m INFO 2022-07-20 21:51:01,966 mnist_model mnist_model#GcMrER replica.py:483 - HANDLE __call__ OK 4997.7ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39843)\u001b[0m INFO 2022-07-20 21:51:01,973 mnist_model mnist_model#ywyHkr replica.py:483 - HANDLE __call__ OK 0.2ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:51:06,973 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 200 4996.3ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:51:06,979 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 307 2.5ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39834)\u001b[0m INFO 2022-07-20 21:51:06,972 mnist_model mnist_model#GcMrER replica.py:483 - HANDLE __call__ OK 4993.5ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39843)\u001b[0m INFO 2022-07-20 21:51:06,978 mnist_model mnist_model#ywyHkr replica.py:483 - HANDLE __call__ OK 0.2ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:51:11,980 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 200 4997.4ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:51:11,986 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 307 2.6ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39834)\u001b[0m INFO 2022-07-20 21:51:11,978 mnist_model mnist_model#GcMrER replica.py:483 - HANDLE __call__ OK 4994.8ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39843)\u001b[0m INFO 2022-07-20 21:51:11,985 mnist_model mnist_model#ywyHkr replica.py:483 - HANDLE __call__ OK 0.2ms\n", + 
"\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:51:16,984 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 200 4994.3ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:51:16,989 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 307 2.4ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39834)\u001b[0m INFO 2022-07-20 21:51:16,982 mnist_model mnist_model#GcMrER replica.py:483 - HANDLE __call__ OK 4991.8ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39843)\u001b[0m INFO 2022-07-20 21:51:16,989 mnist_model mnist_model#ywyHkr replica.py:483 - HANDLE __call__ OK 0.2ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:51:21,991 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 200 4999.1ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:51:21,997 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 307 2.5ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39834)\u001b[0m INFO 2022-07-20 21:51:21,990 mnist_model mnist_model#GcMrER replica.py:483 - HANDLE __call__ OK 4996.7ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39843)\u001b[0m INFO 2022-07-20 21:51:21,996 mnist_model mnist_model#ywyHkr replica.py:483 - HANDLE __call__ OK 0.2ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:51:26,998 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 200 4997.6ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:51:27,004 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 307 2.8ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39834)\u001b[0m INFO 2022-07-20 21:51:26,997 mnist_model mnist_model#GcMrER replica.py:483 - HANDLE __call__ OK 4994.7ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39843)\u001b[0m INFO 2022-07-20 21:51:27,003 mnist_model mnist_model#ywyHkr replica.py:483 - HANDLE __call__ OK 0.2ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:51:32,007 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 200 4999.8ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:51:32,013 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 307 2.8ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39834)\u001b[0m INFO 2022-07-20 21:51:32,006 mnist_model mnist_model#GcMrER replica.py:483 - HANDLE __call__ OK 4997.2ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39843)\u001b[0m INFO 2022-07-20 21:51:32,012 mnist_model mnist_model#ywyHkr replica.py:483 - HANDLE __call__ OK 0.2ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:51:37,012 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 200 4995.8ms\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=39628)\u001b[0m INFO 2022-07-20 21:51:37,019 http_proxy 127.0.0.1 http_proxy.py:320 - POST /mnist_predict 307 2.8ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39834)\u001b[0m INFO 2022-07-20 21:51:37,011 mnist_model mnist_model#GcMrER replica.py:483 - HANDLE __call__ OK 4993.4ms\n", + "\u001b[2m\u001b[36m(ServeReplica:mnist_model pid=39843)\u001b[0m INFO 2022-07-20 21:51:37,018 mnist_model mnist_model#ywyHkr replica.py:483 - HANDLE __call__ OK 0.2ms\n", + "Map_Batches: 0%| | 0/1 [00:00Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.04it/s]\n", "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 
1.37s/it]\n", - "2022-07-20 21:25:29,661\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n" + "2022-07-20 21:51:47,324\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n" ] }, { @@ -2020,12 +1243,12 @@ { "data": { "text/html": [ - "== Status ==
Current time: 2022-07-20 21:26:07 (running for 00:00:37.91)
Memory usage on this node: 33.9/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.02 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-25-29
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-07-20 21:52:25 (running for 00:00:37.97)
Memory usage on this node: 34.0/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.14 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-51-47
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
TorchTrainer_2722b_00000   TERMINATED   127.0.0.1:37163        4            34.389    674.492    1658377567             6.59337
TorchTrainer_d37db_00000   TERMINATED   127.0.0.1:39948        4            34.0141   671.998    1658379144             6.59292


" ], @@ -2040,156 +1263,144 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(TorchTrainer pid=37163)\u001b[0m 2022-07-20 21:25:32,879\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m 2022-07-20 21:25:40,394\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n" + "\u001b[2m\u001b[36m(TorchTrainer pid=39948)\u001b[0m 2022-07-20 21:51:50,596\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m 2022-07-20 21:51:58,118\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m 2022-07-20 21:51:58,367\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 3.695508, epoch: 0, iteration: 0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m 2022-07-20 21:25:40,627\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. 
This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 1.271604, epoch: 0, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.773141, epoch: 0, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.778621, epoch: 0, iteration: 1500\n", - "Result for TorchTrainer_2722b_00000:\n", - " _time_this_iter_s: 6.677475929260254\n", - " _timestamp: 1658377547\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 4.062408, epoch: 0, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.970063, epoch: 0, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.658269, epoch: 0, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.442650, epoch: 0, iteration: 1500\n", + "Result for TorchTrainer_d37db_00000:\n", + " _time_this_iter_s: 6.64359712600708\n", + " _timestamp: 1658379125\n", " _training_iteration: 1\n", - " date: 2022-07-20_21-25-47\n", + " date: 2022-07-20_21-52-05\n", " done: false\n", - " experiment_id: 16c67c09690144f791d7400292b4ecea\n", + " experiment_id: 5d41bf13ba524c528faac8f64b13c7cc\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 1\n", - " loss: 1716.6645558029413\n", + " loss: 1683.766377851367\n", " node_ip: 127.0.0.1\n", - " pid: 37163\n", + " pid: 39948\n", " should_checkpoint: true\n", - " time_since_restore: 14.479393005371094\n", - " time_this_iter_s: 14.479393005371094\n", - " time_total_s: 14.479393005371094\n", - " timestamp: 1658377547\n", + " time_since_restore: 14.471015930175781\n", + " time_this_iter_s: 14.471015930175781\n", + " time_total_s: 14.471015930175781\n", + " timestamp: 1658379125\n", " timesteps_since_restore: 0\n", " training_iteration: 1\n", - " trial_id: 2722b_00000\n", - " warmup_time: 0.004639148712158203\n", + " trial_id: d37db_00000\n", + " warmup_time: 0.005116939544677734\n", " \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.640197, epoch: 1, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.708120, epoch: 1, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.539896, epoch: 1, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.414441, epoch: 1, iteration: 1500\n", - "Result for TorchTrainer_2722b_00000:\n", - " _time_this_iter_s: 6.717606067657471\n", - " _timestamp: 1658377554\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.603212, epoch: 1, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.534739, epoch: 1, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.420072, epoch: 1, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.351545, epoch: 1, iteration: 1500\n", + "Result for TorchTrainer_d37db_00000:\n", + " _time_this_iter_s: 6.489015102386475\n", + " _timestamp: 1658379131\n", " _training_iteration: 2\n", - " date: 2022-07-20_21-25-54\n", + " date: 2022-07-20_21-52-11\n", " done: false\n", - " experiment_id: 
16c67c09690144f791d7400292b4ecea\n", + " experiment_id: 5d41bf13ba524c528faac8f64b13c7cc\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 2\n", - " loss: 947.4811083450913\n", + " loss: 929.9579524248838\n", " node_ip: 127.0.0.1\n", - " pid: 37163\n", + " pid: 39948\n", " should_checkpoint: true\n", - " time_since_restore: 21.194233894348145\n", - " time_this_iter_s: 6.714840888977051\n", - " time_total_s: 21.194233894348145\n", - " timestamp: 1658377554\n", + " time_since_restore: 20.958564043045044\n", + " time_this_iter_s: 6.487548112869263\n", + " time_total_s: 20.958564043045044\n", + " timestamp: 1658379131\n", " timesteps_since_restore: 0\n", " training_iteration: 2\n", - " trial_id: 2722b_00000\n", - " warmup_time: 0.004639148712158203\n", + " trial_id: d37db_00000\n", + " warmup_time: 0.005116939544677734\n", " \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.340714, epoch: 2, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.561278, epoch: 2, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.547913, epoch: 2, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.511139, epoch: 2, iteration: 1500\n", - "Result for TorchTrainer_2722b_00000:\n", - " _time_this_iter_s: 6.598260879516602\n", - " _timestamp: 1658377560\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.347010, epoch: 2, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.419703, epoch: 2, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.350773, epoch: 2, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.231652, epoch: 2, iteration: 1500\n", + "Result for TorchTrainer_d37db_00000:\n", + " _time_this_iter_s: 6.464617967605591\n", + " _timestamp: 1658379137\n", " _training_iteration: 3\n", - " date: 2022-07-20_21-26-00\n", + " date: 2022-07-20_21-52-18\n", " done: false\n", - " experiment_id: 16c67c09690144f791d7400292b4ecea\n", + " experiment_id: 5d41bf13ba524c528faac8f64b13c7cc\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 3\n", - " loss: 771.773955013603\n", + " loss: 772.555232591927\n", " node_ip: 127.0.0.1\n", - " pid: 37163\n", + " pid: 39948\n", " should_checkpoint: true\n", - " time_since_restore: 27.794183015823364\n", - " time_this_iter_s: 6.59994912147522\n", - " time_total_s: 27.794183015823364\n", - " timestamp: 1658377560\n", + " time_since_restore: 27.42328119277954\n", + " time_this_iter_s: 6.464717149734497\n", + " time_total_s: 27.42328119277954\n", + " timestamp: 1658379138\n", " timesteps_since_restore: 0\n", " training_iteration: 3\n", - " trial_id: 2722b_00000\n", - " warmup_time: 0.004639148712158203\n", + " trial_id: d37db_00000\n", + " warmup_time: 0.005116939544677734\n", " \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.297863, epoch: 3, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.295025, epoch: 3, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.364217, epoch: 3, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37171)\u001b[0m loss: 0.306977, epoch: 3, iteration: 1500\n", - "Result for TorchTrainer_2722b_00000:\n", - " _time_this_iter_s: 6.593369245529175\n", - " _timestamp: 1658377567\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.343125, epoch: 
3, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.547853, epoch: 3, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.353915, epoch: 3, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.260028, epoch: 3, iteration: 1500\n", + "Result for TorchTrainer_d37db_00000:\n", + " _time_this_iter_s: 6.5929179191589355\n", + " _timestamp: 1658379144\n", " _training_iteration: 4\n", - " date: 2022-07-20_21-26-07\n", + " date: 2022-07-20_21-52-24\n", " done: false\n", - " experiment_id: 16c67c09690144f791d7400292b4ecea\n", + " experiment_id: 5d41bf13ba524c528faac8f64b13c7cc\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 4\n", - " loss: 674.491991031915\n", + " loss: 671.9976235236973\n", " node_ip: 127.0.0.1\n", - " pid: 37163\n", + " pid: 39948\n", " should_checkpoint: true\n", - " time_since_restore: 34.38901996612549\n", - " time_this_iter_s: 6.594836950302124\n", - " time_total_s: 34.38901996612549\n", - " timestamp: 1658377567\n", + " time_since_restore: 34.01405596733093\n", + " time_this_iter_s: 6.590774774551392\n", + " time_total_s: 34.01405596733093\n", + " timestamp: 1658379144\n", " timesteps_since_restore: 0\n", " training_iteration: 4\n", - " trial_id: 2722b_00000\n", - " warmup_time: 0.004639148712158203\n", + " trial_id: d37db_00000\n", + " warmup_time: 0.005116939544677734\n", " \n", - "Result for TorchTrainer_2722b_00000:\n", - " _time_this_iter_s: 6.593369245529175\n", - " _timestamp: 1658377567\n", + "Result for TorchTrainer_d37db_00000:\n", + " _time_this_iter_s: 6.5929179191589355\n", + " _timestamp: 1658379144\n", " _training_iteration: 4\n", - " date: 2022-07-20_21-26-07\n", + " date: 2022-07-20_21-52-24\n", " done: true\n", - " experiment_id: 16c67c09690144f791d7400292b4ecea\n", + " experiment_id: 5d41bf13ba524c528faac8f64b13c7cc\n", " experiment_tag: '0'\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 4\n", - " loss: 674.491991031915\n", + " loss: 671.9976235236973\n", " node_ip: 127.0.0.1\n", - " pid: 37163\n", + " pid: 39948\n", " should_checkpoint: true\n", - " time_since_restore: 34.38901996612549\n", - " time_this_iter_s: 6.594836950302124\n", - " time_total_s: 34.38901996612549\n", - " timestamp: 1658377567\n", + " time_since_restore: 34.01405596733093\n", + " time_this_iter_s: 6.590774774551392\n", + " time_total_s: 34.01405596733093\n", + " timestamp: 1658379144\n", " timesteps_since_restore: 0\n", " training_iteration: 4\n", - " trial_id: 2722b_00000\n", - " warmup_time: 0.004639148712158203\n", + " trial_id: d37db_00000\n", + " warmup_time: 0.005116939544677734\n", " \n" ] }, @@ -2197,471 +1408,81 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-07-20 21:26:07,750\tINFO tune.py:738 -- Total run time: 38.07 seconds (37.91 seconds for the tuning loop).\n", - "Map Progress (1 actors 1 pending): 0%| | 0/3 [00:01Current time: 2022-07-20 21:28:07 (running for 00:01:31.34)
Memory usage on this node: 34.3/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.02 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-26-36
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-07-20 21:55:10 (running for 00:01:25.89)
Memory usage on this node: 34.4/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.14 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-53-44
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
TorchTrainer_4ec24_00000TERMINATED127.0.0.1:37313 4 87.33652329.82 1658377686 19.1051
TorchTrainer_1923b_00000TERMINATED127.0.0.1:40228 4 82.72852328.8 1658379309 17.0239


" ], @@ -2902,180 +1723,180 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(TorchTrainer pid=37313)\u001b[0m 2022-07-20 21:26:39,346\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", - "\u001b[2m\u001b[36m(_map_block_nosplit pid=37323)\u001b[0m /Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/torchvision/transforms/functional.py:150: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(_map_block_nosplit pid=37323)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", - "\u001b[2m\u001b[36m(_map_block_nosplit pid=37322)\u001b[0m /Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/torchvision/transforms/functional.py:150: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(_map_block_nosplit pid=37322)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m 2022-07-20 21:26:50,228\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m 2022-07-20 21:26:50,458\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. 
(Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n" + "\u001b[2m\u001b[36m(TorchTrainer pid=40228)\u001b[0m 2022-07-20 21:53:47,328\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", + "\u001b[2m\u001b[36m(_map_block_nosplit pid=40256)\u001b[0m /Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/torchvision/transforms/functional.py:150: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", + "\u001b[2m\u001b[36m(_map_block_nosplit pid=40256)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", + "\u001b[2m\u001b[36m(_map_block_nosplit pid=40257)\u001b[0m /Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/torchvision/transforms/functional.py:150: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", + "\u001b[2m\u001b[36m(_map_block_nosplit pid=40257)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m 2022-07-20 21:53:58,782\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m 2022-07-20 21:53:59,042\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. 
(Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 2.340395, epoch: 0, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 1.700694, epoch: 0, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 1.428385, epoch: 0, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 1.446490, epoch: 0, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 1.130699, epoch: 0, iteration: 2000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 1.275075, epoch: 0, iteration: 2500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 1.094344, epoch: 0, iteration: 3000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 1.151832, epoch: 0, iteration: 3500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.763781, epoch: 0, iteration: 4000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.623614, epoch: 0, iteration: 4500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.584415, epoch: 0, iteration: 5000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.908524, epoch: 0, iteration: 5500\n", - "Result for TorchTrainer_4ec24_00000:\n", - " _time_this_iter_s: 19.464027881622314\n", - " _timestamp: 1658377629\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 2.305423, epoch: 0, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 1.935424, epoch: 0, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 1.380047, epoch: 0, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 1.376004, epoch: 0, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 1.388083, epoch: 0, iteration: 2000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.863035, epoch: 0, iteration: 2500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 1.177578, epoch: 0, iteration: 3000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 1.235104, epoch: 0, iteration: 3500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.841955, epoch: 0, iteration: 4000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.943087, epoch: 0, iteration: 4500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 1.174222, epoch: 0, iteration: 5000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.776577, epoch: 0, iteration: 5500\n", + "Result for TorchTrainer_1923b_00000:\n", + " _time_this_iter_s: 18.870364904403687\n", + " _timestamp: 1658379257\n", " _training_iteration: 1\n", - " date: 2022-07-20_21-27-09\n", + " date: 2022-07-20_21-54-17\n", " done: false\n", - " experiment_id: 26b75d3177b149f4a3193f17ba5171a2\n", + " experiment_id: d304983bfe3f4e269118f8618aa9b02f\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 1\n", - " loss: 6849.163370370865\n", + " loss: 6798.259970664978\n", " node_ip: 127.0.0.1\n", - " pid: 37313\n", + " pid: 40228\n", " should_checkpoint: true\n", - " time_since_restore: 30.637123107910156\n", - " 
time_this_iter_s: 30.637123107910156\n", - " time_total_s: 30.637123107910156\n", - " timestamp: 1658377629\n", + " time_since_restore: 30.64481782913208\n", + " time_this_iter_s: 30.64481782913208\n", + " time_total_s: 30.64481782913208\n", + " timestamp: 1658379257\n", " timesteps_since_restore: 0\n", " training_iteration: 1\n", - " trial_id: 4ec24_00000\n", - " warmup_time: 0.01962900161743164\n", + " trial_id: 1923b_00000\n", + " warmup_time: 0.004433870315551758\n", " \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.597300, epoch: 1, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.673263, epoch: 1, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.692797, epoch: 1, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.748966, epoch: 1, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.493257, epoch: 1, iteration: 2000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.538625, epoch: 1, iteration: 2500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.523347, epoch: 1, iteration: 3000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.762524, epoch: 1, iteration: 3500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.556533, epoch: 1, iteration: 4000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.533555, epoch: 1, iteration: 4500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.425929, epoch: 1, iteration: 5000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.717063, epoch: 1, iteration: 5500\n", - "Result for TorchTrainer_4ec24_00000:\n", - " _time_this_iter_s: 18.69218420982361\n", - " _timestamp: 1658377648\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.674814, epoch: 1, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.699747, epoch: 1, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.623052, epoch: 1, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.896888, epoch: 1, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.559495, epoch: 1, iteration: 2000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.346765, epoch: 1, iteration: 2500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.710671, epoch: 1, iteration: 3000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 1.046087, epoch: 1, iteration: 3500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.412118, epoch: 1, iteration: 4000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.568381, epoch: 1, iteration: 4500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.795673, epoch: 1, iteration: 5000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.651217, epoch: 1, iteration: 5500\n", + "Result for TorchTrainer_1923b_00000:\n", + " _time_this_iter_s: 18.252465963363647\n", + " _timestamp: 1658379276\n", " _training_iteration: 2\n", - " date: 2022-07-20_21-27-28\n", + " date: 2022-07-20_21-54-36\n", " done: false\n", - " experiment_id: 26b75d3177b149f4a3193f17ba5171a2\n", + " experiment_id: d304983bfe3f4e269118f8618aa9b02f\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 2\n", - " loss: 
3535.0757773667574\n", + " loss: 3529.048624396324\n", " node_ip: 127.0.0.1\n", - " pid: 37313\n", + " pid: 40228\n", " should_checkpoint: true\n", - " time_since_restore: 49.321765184402466\n", - " time_this_iter_s: 18.68464207649231\n", - " time_total_s: 49.321765184402466\n", - " timestamp: 1658377648\n", + " time_since_restore: 48.89374089241028\n", + " time_this_iter_s: 18.2489230632782\n", + " time_total_s: 48.89374089241028\n", + " timestamp: 1658379276\n", " timesteps_since_restore: 0\n", " training_iteration: 2\n", - " trial_id: 4ec24_00000\n", - " warmup_time: 0.01962900161743164\n", + " trial_id: 1923b_00000\n", + " warmup_time: 0.004433870315551758\n", " \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.471813, epoch: 2, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.777419, epoch: 2, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.494634, epoch: 2, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.509953, epoch: 2, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.402481, epoch: 2, iteration: 2000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.339183, epoch: 2, iteration: 2500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.422910, epoch: 2, iteration: 3000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.505549, epoch: 2, iteration: 3500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.379204, epoch: 2, iteration: 4000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.453385, epoch: 2, iteration: 4500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.291901, epoch: 2, iteration: 5000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.505701, epoch: 2, iteration: 5500\n", - "Result for TorchTrainer_4ec24_00000:\n", - " _time_this_iter_s: 18.91029191017151\n", - " _timestamp: 1658377667\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.743072, epoch: 2, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.745054, epoch: 2, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.670639, epoch: 2, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.742653, epoch: 2, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.666471, epoch: 2, iteration: 2000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.223321, epoch: 2, iteration: 2500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.541566, epoch: 2, iteration: 3000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.854896, epoch: 2, iteration: 3500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.393249, epoch: 2, iteration: 4000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.392985, epoch: 2, iteration: 4500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.639829, epoch: 2, iteration: 5000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.682482, epoch: 2, iteration: 5500\n", + "Result for TorchTrainer_1923b_00000:\n", + " _time_this_iter_s: 16.812592267990112\n", + " _timestamp: 1658379292\n", " _training_iteration: 3\n", - " date: 2022-07-20_21-27-47\n", + " date: 2022-07-20_21-54-53\n", " done: false\n", - " 
experiment_id: 26b75d3177b149f4a3193f17ba5171a2\n", + " experiment_id: d304983bfe3f4e269118f8618aa9b02f\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 3\n", - " loss: 2748.917506597936\n", + " loss: 2729.02725020051\n", " node_ip: 127.0.0.1\n", - " pid: 37313\n", + " pid: 40228\n", " should_checkpoint: true\n", - " time_since_restore: 68.22985124588013\n", - " time_this_iter_s: 18.90808606147766\n", - " time_total_s: 68.22985124588013\n", - " timestamp: 1658377667\n", + " time_since_restore: 65.70409798622131\n", + " time_this_iter_s: 16.810357093811035\n", + " time_total_s: 65.70409798622131\n", + " timestamp: 1658379293\n", " timesteps_since_restore: 0\n", " training_iteration: 3\n", - " trial_id: 4ec24_00000\n", - " warmup_time: 0.01962900161743164\n", + " trial_id: 1923b_00000\n", + " warmup_time: 0.004433870315551758\n", " \n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.323353, epoch: 3, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.490277, epoch: 3, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.325247, epoch: 3, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.448500, epoch: 3, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.288957, epoch: 3, iteration: 2000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.344484, epoch: 3, iteration: 2500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.361682, epoch: 3, iteration: 3000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.556559, epoch: 3, iteration: 3500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.271744, epoch: 3, iteration: 4000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.277652, epoch: 3, iteration: 4500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.368515, epoch: 3, iteration: 5000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=37335)\u001b[0m loss: 0.538925, epoch: 3, iteration: 5500\n", - "Result for TorchTrainer_4ec24_00000:\n", - " _time_this_iter_s: 19.10509490966797\n", - " _timestamp: 1658377686\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.553197, epoch: 3, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.471037, epoch: 3, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.398391, epoch: 3, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.544436, epoch: 3, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.437728, epoch: 3, iteration: 2000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.129666, epoch: 3, iteration: 2500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.620640, epoch: 3, iteration: 3000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.930044, epoch: 3, iteration: 3500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.365749, epoch: 3, iteration: 4000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.445916, epoch: 3, iteration: 4500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.538055, epoch: 3, iteration: 5000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.534079, epoch: 3, iteration: 5500\n", + "Result for TorchTrainer_1923b_00000:\n", + 
" _time_this_iter_s: 17.023871898651123\n", + " _timestamp: 1658379309\n", " _training_iteration: 4\n", - " date: 2022-07-20_21-28-06\n", + " date: 2022-07-20_21-55-10\n", " done: false\n", - " experiment_id: 26b75d3177b149f4a3193f17ba5171a2\n", + " experiment_id: d304983bfe3f4e269118f8618aa9b02f\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 4\n", - " loss: 2329.821715295315\n", + " loss: 2328.8038033917546\n", " node_ip: 127.0.0.1\n", - " pid: 37313\n", + " pid: 40228\n", " should_checkpoint: true\n", - " time_since_restore: 87.33653211593628\n", - " time_this_iter_s: 19.106680870056152\n", - " time_total_s: 87.33653211593628\n", - " timestamp: 1658377686\n", + " time_since_restore: 82.72845268249512\n", + " time_this_iter_s: 17.024354696273804\n", + " time_total_s: 82.72845268249512\n", + " timestamp: 1658379310\n", " timesteps_since_restore: 0\n", " training_iteration: 4\n", - " trial_id: 4ec24_00000\n", - " warmup_time: 0.01962900161743164\n", + " trial_id: 1923b_00000\n", + " warmup_time: 0.004433870315551758\n", " \n", - "Result for TorchTrainer_4ec24_00000:\n", - " _time_this_iter_s: 19.10509490966797\n", - " _timestamp: 1658377686\n", + "Result for TorchTrainer_1923b_00000:\n", + " _time_this_iter_s: 17.023871898651123\n", + " _timestamp: 1658379309\n", " _training_iteration: 4\n", - " date: 2022-07-20_21-28-06\n", + " date: 2022-07-20_21-55-10\n", " done: true\n", - " experiment_id: 26b75d3177b149f4a3193f17ba5171a2\n", + " experiment_id: d304983bfe3f4e269118f8618aa9b02f\n", " experiment_tag: '0'\n", " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", " iterations_since_restore: 4\n", - " loss: 2329.821715295315\n", + " loss: 2328.8038033917546\n", " node_ip: 127.0.0.1\n", - " pid: 37313\n", + " pid: 40228\n", " should_checkpoint: true\n", - " time_since_restore: 87.33653211593628\n", - " time_this_iter_s: 19.106680870056152\n", - " time_total_s: 87.33653211593628\n", - " timestamp: 1658377686\n", + " time_since_restore: 82.72845268249512\n", + " time_this_iter_s: 17.024354696273804\n", + " time_total_s: 82.72845268249512\n", + " timestamp: 1658379310\n", " timesteps_since_restore: 0\n", " training_iteration: 4\n", - " trial_id: 4ec24_00000\n", - " warmup_time: 0.01962900161743164\n", + " trial_id: 1923b_00000\n", + " warmup_time: 0.004433870315551758\n", " \n" ] }, @@ -3083,7 +1904,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-07-20 21:28:07,659\tINFO tune.py:738 -- Total run time: 91.46 seconds (91.34 seconds for the tuning loop).\n" + "2022-07-20 21:55:10,233\tINFO tune.py:738 -- Total run time: 86.00 seconds (85.88 seconds for the tuning loop).\n" ] } ], @@ -3130,19 +1951,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "Map Progress (1 actors 1 pending): 0%| | 0/3 [00:01 Date: Wed, 20 Jul 2022 22:11:24 -0700 Subject: [PATCH 04/10] remove outputs --- .../examples/torch_incremental_learning.ipynb | 158 +----------------- 1 file changed, 3 insertions(+), 155 deletions(-) diff --git a/doc/source/ray-air/examples/torch_incremental_learning.ipynb b/doc/source/ray-air/examples/torch_incremental_learning.ipynb index 991634e26b61..5359dbdc00e0 100644 --- a/doc/source/ray-air/examples/torch_incremental_learning.ipynb +++ b/doc/source/ray-air/examples/torch_incremental_learning.ipynb @@ -677,18 +677,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-07-20 21:47:54,492\tWARNING read_api.py:291 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. 
This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", - "\u001b[2m\u001b[36m(_get_read_tasks pid=39493)\u001b[0m 2022-07-20 21:47:54,489\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n", "Read->Map_Batches: 100%|██████████| 1/1 [00:06<00:00, 6.40s/it]\n", - "2022-07-20 21:48:07,601\tWARNING read_api.py:291 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", - "\u001b[2m\u001b[36m(_get_read_tasks pid=39493)\u001b[0m 2022-07-20 21:48:07,599\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n", "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.12it/s]\n", "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:02<00:00, 2.34s/it]\n", - "2022-07-20 21:48:10,858\tWARNING read_api.py:291 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", - "\u001b[2m\u001b[36m(_get_read_tasks pid=39493)\u001b[0m 2022-07-20 21:48:10,856\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n", "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.29it/s]\n", - "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.33s/it]\n", - "2022-07-20 21:48:13,075\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n" + "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.33s/it]\n" ] }, { @@ -721,12 +714,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-07-20 21:48:13,244\tINFO plugin_schema_manager.py:52 -- Loading the default runtime env schemas: ['/Users/jiaodong/Workspace/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/working_dir_schema.json', '/Users/jiaodong/Workspace/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/pip_schema.json'].\n", - "\u001b[2m\u001b[36m(TorchTrainer pid=39548)\u001b[0m 2022-07-20 21:48:16,191\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", - "\u001b[2m\u001b[36m(_map_block_nosplit pid=39557)\u001b[0m /Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/torchvision/transforms/functional.py:150: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. 
You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(_map_block_nosplit pid=39557)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m 2022-07-20 21:48:26,547\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n" + "2022-07-20 21:48:13,244\tINFO plugin_schema_manager.py:52 -- Loading the default runtime env schemas: ['/Users/jiaodong/Workspace/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/working_dir_schema.json', '/Users/jiaodong/Workspace/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/pip_schema.json'].\n" ] }, { @@ -741,8 +729,6 @@ "output_type": "stream", "text": [ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m 2022-07-20 21:48:26,772\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. 
(Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n" ] }, { @@ -752,80 +738,14 @@ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 1.521038, epoch: 0, iteration: 500\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 1.169452, epoch: 0, iteration: 1000\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.856338, epoch: 0, iteration: 1500\n", - "Result for TorchTrainer_53c58_00000:\n", - " _time_this_iter_s: 6.627551078796387\n", - " _timestamp: 1658378913\n", - " _training_iteration: 1\n", - " date: 2022-07-20_21-48-33\n", - " done: false\n", - " experiment_id: abc531ef544440268933d8221addeb9d\n", - " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", - " iterations_since_restore: 1\n", - " loss: 2453.753049403429\n", - " node_ip: 127.0.0.1\n", - " pid: 39548\n", - " should_checkpoint: true\n", - " time_since_restore: 17.27033305168152\n", - " time_this_iter_s: 17.27033305168152\n", - " time_total_s: 17.27033305168152\n", - " timestamp: 1658378913\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: 53c58_00000\n", - " warmup_time: 0.003597259521484375\n", - " \n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.788410, epoch: 1, iteration: 0\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.854239, epoch: 1, iteration: 500\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.533351, epoch: 1, iteration: 1000\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.591339, epoch: 1, iteration: 1500\n", - "Result for TorchTrainer_53c58_00000:\n", - " _time_this_iter_s: 6.389349937438965\n", - " _timestamp: 1658378919\n", - " _training_iteration: 2\n", - " date: 2022-07-20_21-48-39\n", - " done: false\n", - " experiment_id: abc531ef544440268933d8221addeb9d\n", - " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", - " iterations_since_restore: 2\n", - " loss: 1297.6828406900167\n", - " node_ip: 127.0.0.1\n", - " pid: 39548\n", - " should_checkpoint: true\n", - " time_since_restore: 23.65428590774536\n", - " time_this_iter_s: 6.383952856063843\n", - " time_total_s: 23.65428590774536\n", - " timestamp: 1658378919\n", - " timesteps_since_restore: 0\n", - " training_iteration: 2\n", - " trial_id: 53c58_00000\n", - " warmup_time: 0.003597259521484375\n", - " \n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.457057, epoch: 2, iteration: 0\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.594715, epoch: 2, iteration: 500\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.477588, epoch: 2, iteration: 1000\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.235412, epoch: 2, iteration: 1500\n", - "Result for TorchTrainer_53c58_00000:\n", - " _time_this_iter_s: 6.340294122695923\n", - " _timestamp: 1658378926\n", - " _training_iteration: 3\n", - " date: 2022-07-20_21-48-46\n", - " done: false\n", - " experiment_id: abc531ef544440268933d8221addeb9d\n", - " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", - " iterations_since_restore: 3\n", - " loss: 983.3285144269466\n", - " node_ip: 127.0.0.1\n", - " pid: 39548\n", - " should_checkpoint: true\n", - " time_since_restore: 29.994139909744263\n", - " time_this_iter_s: 6.339854001998901\n", - " time_total_s: 29.994139909744263\n", - " timestamp: 1658378926\n", - " timesteps_since_restore: 0\n", - " 
training_iteration: 3\n", - " trial_id: 53c58_00000\n", - " warmup_time: 0.003597259521484375\n", - " \n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.507374, epoch: 3, iteration: 0\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.447128, epoch: 3, iteration: 500\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.381943, epoch: 3, iteration: 1000\n", @@ -835,28 +755,6 @@ " _timestamp: 1658378932\n", " _training_iteration: 4\n", " date: 2022-07-20_21-48-52\n", - " done: false\n", - " experiment_id: abc531ef544440268933d8221addeb9d\n", - " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", - " iterations_since_restore: 4\n", - " loss: 824.2287287414074\n", - " node_ip: 127.0.0.1\n", - " pid: 39548\n", - " should_checkpoint: true\n", - " time_since_restore: 36.45815992355347\n", - " time_this_iter_s: 6.464020013809204\n", - " time_total_s: 36.45815992355347\n", - " timestamp: 1658378932\n", - " timesteps_since_restore: 0\n", - " training_iteration: 4\n", - " trial_id: 53c58_00000\n", - " warmup_time: 0.003597259521484375\n", - " \n", - "Result for TorchTrainer_53c58_00000:\n", - " _time_this_iter_s: 6.463389873504639\n", - " _timestamp: 1658378932\n", - " _training_iteration: 4\n", - " date: 2022-07-20_21-48-52\n", " done: true\n", " experiment_id: abc531ef544440268933d8221addeb9d\n", " experiment_tag: '0'\n", @@ -883,13 +781,6 @@ "text": [ "2022-07-20 21:48:52,891\tINFO tune.py:738 -- Total run time: 39.80 seconds (39.66 seconds for the tuning loop).\n", "Map Progress (1 actors 1 pending): 0%| | 0/1 [00:01Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.39it/s]\n", - "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.37s/it]\n", - "2022-07-20 21:49:58,678\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n" + "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.37s/it]\n" ] }, { From a42c3a2922a4742718b5b10273803d94087dfbe3 Mon Sep 17 00:00:00 2001 From: Jiao Dong Date: Wed, 20 Jul 2022 22:13:56 -0700 Subject: [PATCH 05/10] remove logs --- .../examples/torch_incremental_learning.ipynb | 290 +----------------- 1 file changed, 4 insertions(+), 286 deletions(-) diff --git a/doc/source/ray-air/examples/torch_incremental_learning.ipynb b/doc/source/ray-air/examples/torch_incremental_learning.ipynb index 5359dbdc00e0..7a973c62d8eb 100644 --- a/doc/source/ray-air/examples/torch_incremental_learning.ipynb +++ b/doc/source/ray-air/examples/torch_incremental_learning.ipynb @@ -841,11 +841,7 @@ { "name": "stderr", "output_type": "stream", - "text": [ - "\u001b[2m\u001b[36m(TorchTrainer pid=39736)\u001b[0m 2022-07-20 21:50:01,936\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. 
See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m 2022-07-20 21:50:09,489\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n" - ] + "text": [] }, { "name": "stdout", @@ -858,9 +854,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m 2022-07-20 21:50:09,795\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m 2022-07-20 21:50:09,795\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n" ] }, { @@ -870,80 +864,14 @@ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 1.075076, epoch: 0, iteration: 500\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.536976, epoch: 0, iteration: 1000\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.600182, epoch: 0, iteration: 1500\n", - "Result for TorchTrainer_92bcd_00000:\n", - " _time_this_iter_s: 6.920917987823486\n", - " _timestamp: 1658379016\n", - " _training_iteration: 1\n", - " date: 2022-07-20_21-50-16\n", - " done: false\n", - " experiment_id: 21820161d0a245428cf75b0b9b17fe6e\n", - " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", - " iterations_since_restore: 1\n", - " loss: 1779.816628113389\n", - " node_ip: 127.0.0.1\n", - " pid: 39736\n", - " should_checkpoint: true\n", - " time_since_restore: 14.83579397201538\n", - " time_this_iter_s: 14.83579397201538\n", - " time_total_s: 14.83579397201538\n", - " timestamp: 1658379016\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: 92bcd_00000\n", - " warmup_time: 0.005189180374145508\n", - " \n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.546070, epoch: 1, iteration: 0\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.448120, epoch: 1, iteration: 500\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.392481, epoch: 1, iteration: 1000\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.371981, epoch: 1, iteration: 1500\n", - "Result for TorchTrainer_92bcd_00000:\n", - " _time_this_iter_s: 6.480100154876709\n", - " _timestamp: 1658379023\n", - " _training_iteration: 2\n", - " date: 2022-07-20_21-50-23\n", - " done: false\n", - " experiment_id: 21820161d0a245428cf75b0b9b17fe6e\n", - " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", - " iterations_since_restore: 2\n", - " loss: 1000.9026035964489\n", - " node_ip: 
127.0.0.1\n", - " pid: 39736\n", - " should_checkpoint: true\n", - " time_since_restore: 21.316986083984375\n", - " time_this_iter_s: 6.481192111968994\n", - " time_total_s: 21.316986083984375\n", - " timestamp: 1658379023\n", - " timesteps_since_restore: 0\n", - " training_iteration: 2\n", - " trial_id: 92bcd_00000\n", - " warmup_time: 0.005189180374145508\n", - " \n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.521735, epoch: 2, iteration: 0\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.635850, epoch: 2, iteration: 500\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.395862, epoch: 2, iteration: 1000\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.402500, epoch: 2, iteration: 1500\n", - "Result for TorchTrainer_92bcd_00000:\n", - " _time_this_iter_s: 6.334350824356079\n", - " _timestamp: 1658379029\n", - " _training_iteration: 3\n", - " date: 2022-07-20_21-50-29\n", - " done: false\n", - " experiment_id: 21820161d0a245428cf75b0b9b17fe6e\n", - " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", - " iterations_since_restore: 3\n", - " loss: 810.5124998539686\n", - " node_ip: 127.0.0.1\n", - " pid: 39736\n", - " should_checkpoint: true\n", - " time_since_restore: 27.649451971054077\n", - " time_this_iter_s: 6.332465887069702\n", - " time_total_s: 27.649451971054077\n", - " timestamp: 1658379029\n", - " timesteps_since_restore: 0\n", - " training_iteration: 3\n", - " trial_id: 92bcd_00000\n", - " warmup_time: 0.005189180374145508\n", - " \n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.236922, epoch: 3, iteration: 0\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.528482, epoch: 3, iteration: 500\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.372242, epoch: 3, iteration: 1000\n", @@ -953,28 +881,6 @@ " _timestamp: 1658379035\n", " _training_iteration: 4\n", " date: 2022-07-20_21-50-36\n", - " done: false\n", - " experiment_id: 21820161d0a245428cf75b0b9b17fe6e\n", - " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", - " iterations_since_restore: 4\n", - " loss: 707.6341038495302\n", - " node_ip: 127.0.0.1\n", - " pid: 39736\n", - " should_checkpoint: true\n", - " time_since_restore: 34.11321783065796\n", - " time_this_iter_s: 6.463765859603882\n", - " time_total_s: 34.11321783065796\n", - " timestamp: 1658379036\n", - " timesteps_since_restore: 0\n", - " training_iteration: 4\n", - " trial_id: 92bcd_00000\n", - " warmup_time: 0.005189180374145508\n", - " \n", - "Result for TorchTrainer_92bcd_00000:\n", - " _time_this_iter_s: 6.456433057785034\n", - " _timestamp: 1658379035\n", - " _training_iteration: 4\n", - " date: 2022-07-20_21-50-36\n", " done: true\n", " experiment_id: 21820161d0a245428cf75b0b9b17fe6e\n", " experiment_tag: '0'\n", @@ -1001,13 +907,6 @@ "text": [ "2022-07-20 21:50:36,835\tINFO tune.py:738 -- Total run time: 38.13 seconds (37.98 seconds for the tuning loop).\n", "Map Progress (1 actors 1 pending): 0%| | 0/2 [00:01Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.04it/s]\n", - "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.37s/it]\n", - "2022-07-20 21:51:47,324\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. 
See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n" + "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.37s/it]\n" ] }, { @@ -1113,10 +969,7 @@ "text": [ "\u001b[2m\u001b[36m(TorchTrainer pid=39948)\u001b[0m 2022-07-20 21:51:50,596\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m 2022-07-20 21:51:58,118\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m 2022-07-20 21:51:58,367\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m 2022-07-20 21:51:58,367\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n" ] }, { @@ -1127,80 +980,14 @@ "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.970063, epoch: 0, iteration: 500\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.658269, epoch: 0, iteration: 1000\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.442650, epoch: 0, iteration: 1500\n", - "Result for TorchTrainer_d37db_00000:\n", - " _time_this_iter_s: 6.64359712600708\n", - " _timestamp: 1658379125\n", - " _training_iteration: 1\n", - " date: 2022-07-20_21-52-05\n", - " done: false\n", - " experiment_id: 5d41bf13ba524c528faac8f64b13c7cc\n", - " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", - " iterations_since_restore: 1\n", - " loss: 1683.766377851367\n", - " node_ip: 127.0.0.1\n", - " pid: 39948\n", - " should_checkpoint: true\n", - " time_since_restore: 14.471015930175781\n", - " time_this_iter_s: 14.471015930175781\n", - " time_total_s: 14.471015930175781\n", - " timestamp: 1658379125\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: d37db_00000\n", - " warmup_time: 0.005116939544677734\n", - " \n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.603212, epoch: 1, iteration: 0\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.534739, epoch: 1, iteration: 500\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.420072, epoch: 1, iteration: 1000\n", 
"\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.351545, epoch: 1, iteration: 1500\n", - "Result for TorchTrainer_d37db_00000:\n", - " _time_this_iter_s: 6.489015102386475\n", - " _timestamp: 1658379131\n", - " _training_iteration: 2\n", - " date: 2022-07-20_21-52-11\n", - " done: false\n", - " experiment_id: 5d41bf13ba524c528faac8f64b13c7cc\n", - " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", - " iterations_since_restore: 2\n", - " loss: 929.9579524248838\n", - " node_ip: 127.0.0.1\n", - " pid: 39948\n", - " should_checkpoint: true\n", - " time_since_restore: 20.958564043045044\n", - " time_this_iter_s: 6.487548112869263\n", - " time_total_s: 20.958564043045044\n", - " timestamp: 1658379131\n", - " timesteps_since_restore: 0\n", - " training_iteration: 2\n", - " trial_id: d37db_00000\n", - " warmup_time: 0.005116939544677734\n", - " \n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.347010, epoch: 2, iteration: 0\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.419703, epoch: 2, iteration: 500\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.350773, epoch: 2, iteration: 1000\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.231652, epoch: 2, iteration: 1500\n", - "Result for TorchTrainer_d37db_00000:\n", - " _time_this_iter_s: 6.464617967605591\n", - " _timestamp: 1658379137\n", - " _training_iteration: 3\n", - " date: 2022-07-20_21-52-18\n", - " done: false\n", - " experiment_id: 5d41bf13ba524c528faac8f64b13c7cc\n", - " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", - " iterations_since_restore: 3\n", - " loss: 772.555232591927\n", - " node_ip: 127.0.0.1\n", - " pid: 39948\n", - " should_checkpoint: true\n", - " time_since_restore: 27.42328119277954\n", - " time_this_iter_s: 6.464717149734497\n", - " time_total_s: 27.42328119277954\n", - " timestamp: 1658379138\n", - " timesteps_since_restore: 0\n", - " training_iteration: 3\n", - " trial_id: d37db_00000\n", - " warmup_time: 0.005116939544677734\n", - " \n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.343125, epoch: 3, iteration: 0\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.547853, epoch: 3, iteration: 500\n", "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.353915, epoch: 3, iteration: 1000\n", @@ -1210,28 +997,6 @@ " _timestamp: 1658379144\n", " _training_iteration: 4\n", " date: 2022-07-20_21-52-24\n", - " done: false\n", - " experiment_id: 5d41bf13ba524c528faac8f64b13c7cc\n", - " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", - " iterations_since_restore: 4\n", - " loss: 671.9976235236973\n", - " node_ip: 127.0.0.1\n", - " pid: 39948\n", - " should_checkpoint: true\n", - " time_since_restore: 34.01405596733093\n", - " time_this_iter_s: 6.590774774551392\n", - " time_total_s: 34.01405596733093\n", - " timestamp: 1658379144\n", - " timesteps_since_restore: 0\n", - " training_iteration: 4\n", - " trial_id: d37db_00000\n", - " warmup_time: 0.005116939544677734\n", - " \n", - "Result for TorchTrainer_d37db_00000:\n", - " _time_this_iter_s: 6.5929179191589355\n", - " _timestamp: 1658379144\n", - " _training_iteration: 4\n", - " date: 2022-07-20_21-52-24\n", " done: true\n", " experiment_id: 5d41bf13ba524c528faac8f64b13c7cc\n", " experiment_tag: '0'\n", @@ -1258,13 +1023,6 @@ "text": [ "2022-07-20 21:52:25,471\tINFO tune.py:738 -- Total run time: 38.13 seconds (37.97 seconds for the tuning loop).\n", "Map Progress (1 actors 1 pending): 0%| | 0/3 [00:01 Date: Wed, 20 Jul 2022 22:18:11 -0700 
Subject: [PATCH 06/10] remove lines --- .../examples/torch_incremental_learning.ipynb | 148 +----------------- 1 file changed, 5 insertions(+), 143 deletions(-) diff --git a/doc/source/ray-air/examples/torch_incremental_learning.ipynb b/doc/source/ray-air/examples/torch_incremental_learning.ipynb index 7a973c62d8eb..f8d5210855b3 100644 --- a/doc/source/ray-air/examples/torch_incremental_learning.ipynb +++ b/doc/source/ray-air/examples/torch_incremental_learning.ipynb @@ -780,7 +780,7 @@ "output_type": "stream", "text": [ "2022-07-20 21:48:52,891\tINFO tune.py:738 -- Total run time: 39.80 seconds (39.66 seconds for the tuning loop).\n", - "Map Progress (1 actors 1 pending): 0%| | 0/1 [00:01 Date: Wed, 20 Jul 2022 22:24:43 -0700 Subject: [PATCH 07/10] fix notebook --- .../examples/torch_incremental_learning.ipynb | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/doc/source/ray-air/examples/torch_incremental_learning.ipynb b/doc/source/ray-air/examples/torch_incremental_learning.ipynb index f8d5210855b3..6dc76215bfaf 100644 --- a/doc/source/ray-air/examples/torch_incremental_learning.ipynb +++ b/doc/source/ray-air/examples/torch_incremental_learning.ipynb @@ -728,7 +728,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m 2022-07-20 21:48:26,772\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m 2022-07-20 21:48:26,772\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n" ] }, { @@ -780,7 +780,7 @@ "output_type": "stream", "text": [ "2022-07-20 21:48:52,891\tINFO tune.py:738 -- Total run time: 39.80 seconds (39.66 seconds for the tuning loop).\n", - "Map Progress (1 actors 1 pending): 0%| | 0/1 [00:01 Date: Wed, 20 Jul 2022 22:32:08 -0700 Subject: [PATCH 08/10] apply convert to non-batching case too --- python/ray/serve/air_integrations.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/python/ray/serve/air_integrations.py b/python/ray/serve/air_integrations.py index 0fbd5a2dd472..5e131d6ba1f5 100644 --- a/python/ray/serve/air_integrations.py +++ b/python/ray/serve/air_integrations.py @@ -45,6 +45,22 @@ def _load_predictor_cls( return predictor_cls +def _unpack_tensorarray_from_pandas(output_df: pd.DataFrame) -> pd.DataFrame: + """ + In dl_predictor.py we return a pd.DataFrame that could have multiple + columns but value of each column is a TensorArray. Flatten the + TensorArray to list to ensure output is json serializable as http + response. + """ + from ray.data.extensions import TensorArray, TensorArrayElement + + for col in output_df: + if isinstance(output_df[col].values, (TensorArray, TensorArrayElement)): + output_df[col] = output_df[col].to_numpy() + + return output_df + + class BatchingManager: """A collection of utilities for batching and splitting data.""" @@ -81,8 +97,6 @@ def batch_dataframe(input_list: List["pd.DataFrame"]) -> "pd.DataFrame": def split_dataframe( output_df: "pd.DataFrame", batch_size: int ) -> List["pd.DataFrame"]: - from ray.data.extensions import TensorArray, TensorArrayElement - if not isinstance(output_df, pd.DataFrame): raise TypeError( "The output should be a Pandas DataFrame but Serve got " @@ -94,13 +108,7 @@ def split_dataframe( f"but Serve got length {len(output_df)}." ) - # In dl_predictor.py we return a pd.DataFrame that could have multiple - # columns but value of each column is a TensorArray. 
Flatten the - # TensorArray to list to ensure output is json serializable as http - # response. - for col in output_df: - if isinstance(output_df[col].values, (TensorArray, TensorArrayElement)): - output_df[col] = output_df[col].to_numpy() + output_df = _unpack_tensorarray_from_pandas(output_df) return [df.reset_index(drop=True) for df in np.split(output_df, batch_size)] @@ -211,6 +219,8 @@ async def predict_impl(inp: Union[np.ndarray, "pd.DataFrame"]): out = self.model.predict(inp, **predict_kwargs) if isinstance(out, ray.ObjectRef): out = await out + elif pd is not None and isinstance(out, pd.DataFrame): + out = _unpack_tensorarray_from_pandas(out) return out else: From 5da298d2865f1ab481b430f5bb0c6209c979d3c2 Mon Sep 17 00:00:00 2001 From: Jiao Date: Thu, 21 Jul 2022 10:49:38 -0700 Subject: [PATCH 09/10] Update python/ray/serve/air_integrations.py Co-authored-by: Simon Mo --- python/ray/serve/air_integrations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/serve/air_integrations.py b/python/ray/serve/air_integrations.py index 5e131d6ba1f5..4b63674a5299 100644 --- a/python/ray/serve/air_integrations.py +++ b/python/ray/serve/air_integrations.py @@ -45,7 +45,7 @@ def _load_predictor_cls( return predictor_cls -def _unpack_tensorarray_from_pandas(output_df: pd.DataFrame) -> pd.DataFrame: +def _unpack_tensorarray_from_pandas(output_df: "pd.DataFrame") -> "pd.DataFrame": """ In dl_predictor.py we return a pd.DataFrame that could have multiple columns but value of each column is a TensorArray. Flatten the From f0bc29d63579640dca307eec27dc172acbe1dd4f Mon Sep 17 00:00:00 2001 From: Jiao Dong Date: Thu, 21 Jul 2022 11:05:02 -0700 Subject: [PATCH 10/10] docstring --- python/ray/serve/air_integrations.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/ray/serve/air_integrations.py b/python/ray/serve/air_integrations.py index 4b63674a5299..ddf87c268c0a 100644 --- a/python/ray/serve/air_integrations.py +++ b/python/ray/serve/air_integrations.py @@ -46,7 +46,8 @@ def _load_predictor_cls( def _unpack_tensorarray_from_pandas(output_df: "pd.DataFrame") -> "pd.DataFrame": - """ + """Unpack predictor's return value with TensorArray into numpy. + In dl_predictor.py we return a pd.DataFrame that could have multiple columns but value of each column is a TensorArray. Flatten the TensorArray to list to ensure output is json serializable as http
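For reference, a minimal sketch of the flattening that `_unpack_tensorarray_from_pandas` performs, assuming Ray's TensorArray pandas extension is available; the column name and sample values below are illustrative only, not taken from the patch's tests.

import numpy as np
import pandas as pd
from ray.data.extensions import TensorArray, TensorArrayElement

# Predictor-style output: the column is backed by a TensorArray, which is not
# directly JSON-serializable as an HTTP response body.
output_df = pd.DataFrame({"predictions": TensorArray(np.array([0.1, 0.9, 0.4]))})

# Same loop as the helper above: convert TensorArray-backed columns to plain
# numpy values so the frame can be serialized.
for col in output_df:
    if isinstance(output_df[col].values, (TensorArray, TensorArrayElement)):
        output_df[col] = output_df[col].to_numpy()

print(output_df.to_dict(orient="records"))  # plain floats, JSON-serializable

Applying the same conversion in the non-batching path (PATCH 08/10) keeps the batched and unbatched predict paths returning the same plain-numpy layout.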