diff --git a/doc/source/ray-air/examples/BUILD b/doc/source/ray-air/examples/BUILD index 37895a1d851f..1beb742e708b 100644 --- a/doc/source/ray-air/examples/BUILD +++ b/doc/source/ray-air/examples/BUILD @@ -31,9 +31,7 @@ py_test_run_all_notebooks( include = ["*.ipynb"], exclude = [ "huggingface_text_classification.ipynb", - "torch_incremental_learning.ipynb", "feast_example.ipynb", # REGRESSION - "tfx_tabular_train_to_serve.ipynb", # REGRESSION ], data = ["//doc/source/ray-air/examples:air_examples"], tags = ["exclusive", "team:ml", "ray_air"], diff --git a/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb b/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb index 78b43111b761..ece0a92d89cc 100644 --- a/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb +++ b/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -73,7 +73,68 @@ "id": "MOsHUjgdIrIW", "outputId": "8a21ead5-bb2d-4a3d-ae41-17a313688b24" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-07-20 18:45:28,814\tINFO services.py:1483 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8266\u001b[39m\u001b[22m\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "
\n", + "

Ray

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "
Python version:3.7.10
Ray version: 3.0.0.dev0
Dashboard:http://127.0.0.1:8266
\n", + "
\n", + "
\n" + ], + "text/plain": [ + "RayContext(dashboard_url='127.0.0.1:8266', python_version='3.7.10', ray_version='3.0.0.dev0', ray_commit='{{RAY_COMMIT_SHA}}', address_info={'node_ip_address': '127.0.0.1', 'raylet_ip_address': '127.0.0.1', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-07-20_18-45-26_127581_21006/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-07-20_18-45-26_127581_21006/sockets/raylet', 'webui_url': '127.0.0.1:8266', 'session_dir': '/tmp/ray/session_2022-07-20_18-45-26_127581_21006', 'metrics_export_port': 63884, 'gcs_address': '127.0.0.1:63685', 'address': '127.0.0.1:63685', 'dashboard_agent_listen_port': 52365, 'node_id': 'c21f810137e56bd967ab3f246c66aadc5262e00bdbe19c34c23456e7'})" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from pprint import pprint\n", "import ray\n", @@ -93,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -101,7 +162,18 @@ "id": "KlMz0dt9hYbS", "outputId": "e7234b52-08b4-49fc-e14c-72f283b893f2" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'CPU': 16.0,\n", + " 'memory': 30436675994.0,\n", + " 'node:127.0.0.1': 1.0,\n", + " 'object_store_memory': 2147483648.0}\n" + ] + } + ], "source": [ "pprint(ray.cluster_resources())" ] @@ -126,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "id": "gAbhv9OqhYbT" }, @@ -160,7 +232,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "id": "FbeYf1aF8ISK" }, @@ -180,7 +252,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -189,7 +261,157 @@ "id": "8tugpr5S8gPq", "outputId": "3c57a348-12a7-4b6c-f9b2-fabdcb7a7c88" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pickup_community_areatrip_start_monthtrip_start_hourtrip_start_daytrip_start_timestamptrip_milesdropoff_census_tractpayment_typecompanytrip_secondsdropoff_community_areais_big_tip
0NaN519614002695000.0NaNCredit CardChicago Elite Cab Corp. (Chicago Carriag0.0NaNFalse
1NaN319513626837000.0NaNUnknownChicago Elite Cab Corp.300.0NaNFalse
260.01023138059370012.6NaNCashTaxi Affiliation Services1380.0NaNFalse
310.0101213823190000.0NaNCashTaxi Affiliation Services180.0NaNFalse
414.057513698972000.0NaNCashDispatch Taxi Affiliation1080.0NaNFalse
\n", + "
" + ], + "text/plain": [ + " pickup_community_area trip_start_month trip_start_hour trip_start_day \\\n", + "0 NaN 5 19 6 \n", + "1 NaN 3 19 5 \n", + "2 60.0 10 2 3 \n", + "3 10.0 10 1 2 \n", + "4 14.0 5 7 5 \n", + "\n", + " trip_start_timestamp trip_miles dropoff_census_tract payment_type \\\n", + "0 1400269500 0.0 NaN Credit Card \n", + "1 1362683700 0.0 NaN Unknown \n", + "2 1380593700 12.6 NaN Cash \n", + "3 1382319000 0.0 NaN Cash \n", + "4 1369897200 0.0 NaN Cash \n", + "\n", + " company trip_seconds \\\n", + "0 Chicago Elite Cab Corp. (Chicago Carriag 0.0 \n", + "1 Chicago Elite Cab Corp. 300.0 \n", + "2 Taxi Affiliation Services 1380.0 \n", + "3 Taxi Affiliation Services 180.0 \n", + "4 Dispatch Taxi Affiliation 1080.0 \n", + "\n", + " dropoff_community_area is_big_tip \n", + "0 NaN False \n", + "1 NaN False \n", + "2 NaN False \n", + "3 NaN False \n", + "4 NaN False " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data.head(5)" ] @@ -206,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "id": "YSLvrBMC9aRv" }, @@ -239,7 +461,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -247,7 +469,15 @@ "id": "xfhRl7eO981w", "outputId": "f80d90ff-fc8a-4a7d-b544-31633823d596" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 11251 samples for training and 3751 samples for testing.\n" + ] + } + ], "source": [ "print(f\"There are {train_ds.count()} samples for training and {test_df.shape[0]} samples for testing.\")" ] @@ -286,7 +516,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "id": "zVvslsfMIrIh" }, @@ -345,7 +575,8 @@ " result = {}\n", " feature_cols = [col for col in dataframe.columns if col != LABEL]\n", " result[\"input\"] = TensorArray(dataframe[feature_cols].to_numpy(dtype=np.float32))\n", - " result[LABEL] = dataframe[LABEL]\n", + " if LABEL in dataframe.columns:\n", + " result[LABEL] = dataframe[LABEL]\n", " return pd.DataFrame(result)\n", "\n", " chained_pp = Chain(\n", @@ -372,7 +603,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "id": "ejGVU-uN_dVP" }, @@ -414,7 +645,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "id": "MwhAeEOuhYbV" }, @@ -557,7 +788,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "id": "BBbcMwc9Rz66" }, @@ -582,7 +813,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": { "id": "ujmwT8ZhScq1" }, @@ -608,7 +839,7 @@ " # This is due to a current limitation on Serve that's\n", " # being addressed.\n", " # TODO(xwjiang): Change to True.\n", - " batching_params=False,\n", + " batching_params=dict(max_batch_size=2, batch_wait_timeout_s=5),\n", " model_definition=model_definition,\n", " http_adapter=adapter,\n", " )\n", @@ -617,12 +848,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": { "id": "uRe9a8947pl9" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-07-20 18:46:11,759\tINFO services.py:1483 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8266\u001b[39m\u001b[22m\n", + "\u001b[2m\u001b[36m(ServeController pid=21308)\u001b[0m INFO 2022-07-20 18:46:15,348 controller 21308 checkpoint_path.py:17 - Using RayInternalKVStore for controller checkpoint and recovery.\n", + "\u001b[2m\u001b[36m(ServeController pid=21308)\u001b[0m INFO 2022-07-20 18:46:15,350 controller 21308 http_state.py:126 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:SERVE_PROXY_ACTOR-58fb3ee046cdce5c602369291de78f60c65dcbd7c5c5a8af57ec3a26' on node '58fb3ee046cdce5c602369291de78f60c65dcbd7c5c5a8af57ec3a26' listening on '127.0.0.1:8000'\n", + "\u001b[2m\u001b[36m(HTTPProxyActor pid=21311)\u001b[0m INFO: Started server process [21311]\n", + "/Users/jiaodong/anaconda3/envs/ray3.7/lib/python3.7/site-packages/ipykernel_launcher.py:23: UserWarning: From /var/folders/1s/wy6f3ytn3q726p5hl8fw8d780000gn/T/ipykernel_21006/609683685.py:23: deploy (from ray.serve.deployment) is deprecated and will be removed in a future version Please see https://docs.ray.io/en/latest/serve/index.html\n", + "\u001b[2m\u001b[36m(ServeController pid=21308)\u001b[0m INFO 2022-07-20 18:46:17,658 controller 21308 deployment_state.py:1281 - Adding 1 replicas to deployment 'Model'.\n", + "\u001b[2m\u001b[36m(ServeReplica:Model pid=21314)\u001b[0m 2022-07-20 18:46:23,199\tWARNING compression.py:18 -- lz4 not available, disabling sample compression. This will significantly impact RLlib performance. To install lz4, run `pip install lz4`.\n" + ] + } + ], "source": [ + "import ray\n", "# Generally speaking, training and serving are done in totally different ray clusters.\n", "# To simulate that, let's shutdown the old ray cluster in preparation for serving.\n", "ray.shutdown()\n", @@ -641,33 +887,106 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": { "id": "E9m80HDmSz66" }, "outputs": [], "source": [ "import requests\n", + "import pandas as pd\n", + "import numpy as np\n", "\n", - "NUM_SERVE_REQUESTS = 100\n", + "NUM_SERVE_REQUESTS = 10\n", "\n", "def send_requests(df: pd.DataFrame, label: np.array):\n", " for i in range(NUM_SERVE_REQUESTS):\n", " one_row = df.iloc[[i]].to_dict()\n", " serve_result = requests.post(endpoint_uri, json=one_row).json()\n", " print(\n", - " f\"request[{i}] prediction: {serve_result['predictions']['0']} \"\n", + " f\"request{i} prediction: {serve_result[0]['predictions']} \"\n", " f\"- label: {str(label[i])}\"\n", " )" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": { "id": "GFPwKc5JTgnI" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request0 prediction: 0.004963837098330259 - label: True\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request1 prediction: 6.652726733591408e-05 - label: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request2 prediction: 0.00018405025184620172 - label: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request3 prediction: 0.00016512417641934007 - label: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request4 prediction: 0.00015515758423134685 - label: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request5 prediction: 5.948602483840659e-05 - label: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request6 prediction: 9.51739348238334e-05 - label: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request7 prediction: 3.4787988170137396e-06 - label: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request8 prediction: 0.00010751552326837555 - label: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "request9 prediction: 0.060329731553792953 - label: True\n" + ] + } + ], "source": [ "send_requests(test_df, test_label)" ] @@ -682,11 +1001,8 @@ "name": "tfx (1) (1) (1).ipynb", "provenance": [] }, - "interpreter": { - "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f" - }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3.7.10 ('ray3.7')", "language": "python", "name": "python3" }, @@ -700,7 +1016,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.6" + "version": "3.7.10" + }, + "vscode": { + "interpreter": { + "hash": "99d89bfe98f3aa2d7facda0d08d31ff2a0af9559e5330d719288ce64a1966273" + } } }, "nbformat": 4, diff --git a/doc/source/ray-air/examples/torch_incremental_learning.ipynb b/doc/source/ray-air/examples/torch_incremental_learning.ipynb index b7fb2c72aa27..6dc76215bfaf 100644 --- a/doc/source/ray-air/examples/torch_incremental_learning.ipynb +++ b/doc/source/ray-air/examples/torch_incremental_learning.ipynb @@ -75,133 +75,7 @@ "id": "kWr6BRMk1Y1j", "outputId": "dad49a31-a602-4e44-b5fe-932de603925e" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Requirement already satisfied: ray[data,serve,tune] in /usr/local/lib/python3.7/dist-packages (2.0.0.dev0)\n", - "Requirement already satisfied: numpy>=1.16 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (1.21.6)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (3.7.0)\n", - "Requirement already satisfied: grpcio!=1.44.0,>=1.28.1 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (1.43.0)\n", - "Requirement already satisfied: msgpack<2.0.0,>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (1.0.3)\n", - "Requirement already satisfied: protobuf>=3.15.3 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (3.17.3)\n", - "Requirement already satisfied: aiosignal in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (1.2.0)\n", - "Requirement already satisfied: frozenlist in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (1.3.0)\n", - "Requirement already satisfied: virtualenv in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (20.14.1)\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (3.13)\n", - "Requirement already satisfied: attrs in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (21.4.0)\n", - "Requirement already satisfied: click<=8.0.4,>=7.0 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (7.1.2)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (2.23.0)\n", - "Requirement already satisfied: jsonschema in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (4.3.3)\n", - "Requirement already satisfied: pyarrow<7.0.0,>=6.0.1 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (6.0.1)\n", - "Requirement already satisfied: fsspec in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (2022.5.0)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (1.3.5)\n", - "Requirement already satisfied: tensorboardX>=1.9 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (2.5)\n", - "Requirement already satisfied: tabulate in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.8.9)\n", - "Requirement already satisfied: aiorwlock in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (1.3.0)\n", - "Requirement already satisfied: starlette in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.19.1)\n", - "Requirement already satisfied: prometheus-client<0.14.0,>=0.7.1 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.13.1)\n", - "Requirement already satisfied: py-spy>=0.2.0 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.3.12)\n", - "Requirement already satisfied: smart-open in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (6.0.0)\n", - "Requirement already satisfied: gpustat>=1.0.0b1 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (1.0.0b1)\n", - "Requirement already satisfied: colorful in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.5.4)\n", - "Requirement already satisfied: aiohttp>=3.7 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (3.8.1)\n", - "Requirement already satisfied: fastapi in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.78.0)\n", - "Requirement already satisfied: aiohttp-cors in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.7.0)\n", - "Requirement already satisfied: uvicorn==0.16.0 in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.16.0)\n", - "Requirement already satisfied: opencensus in /usr/local/lib/python3.7/dist-packages (from ray[data,serve,tune]) (0.9.0)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from uvicorn==0.16.0->ray[data,serve,tune]) (4.2.0)\n", - "Requirement already satisfied: asgiref>=3.4.0 in /usr/local/lib/python3.7/dist-packages (from uvicorn==0.16.0->ray[data,serve,tune]) (3.5.2)\n", - "Requirement already satisfied: h11>=0.8 in /usr/local/lib/python3.7/dist-packages (from uvicorn==0.16.0->ray[data,serve,tune]) (0.13.0)\n", - "Requirement already satisfied: asynctest==0.13.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp>=3.7->ray[data,serve,tune]) (0.13.0)\n", - "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp>=3.7->ray[data,serve,tune]) (2.0.12)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp>=3.7->ray[data,serve,tune]) (6.0.2)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.7/dist-packages (from aiohttp>=3.7->ray[data,serve,tune]) (4.0.2)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp>=3.7->ray[data,serve,tune]) (1.7.2)\n", - "Requirement already satisfied: psutil in /usr/local/lib/python3.7/dist-packages (from gpustat>=1.0.0b1->ray[data,serve,tune]) (5.4.8)\n", - "Requirement already satisfied: six>=1.7 in /usr/local/lib/python3.7/dist-packages (from gpustat>=1.0.0b1->ray[data,serve,tune]) (1.15.0)\n", - "Requirement already satisfied: blessed>=1.17.1 in /usr/local/lib/python3.7/dist-packages (from gpustat>=1.0.0b1->ray[data,serve,tune]) (1.19.1)\n", - "Requirement already satisfied: nvidia-ml-py3>=7.352.0 in /usr/local/lib/python3.7/dist-packages (from gpustat>=1.0.0b1->ray[data,serve,tune]) (7.352.0)\n", - "Requirement already satisfied: wcwidth>=0.1.4 in /usr/local/lib/python3.7/dist-packages (from blessed>=1.17.1->gpustat>=1.0.0b1->ray[data,serve,tune]) (0.2.5)\n", - "Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.7/dist-packages (from yarl<2.0,>=1.0->aiohttp>=3.7->ray[data,serve,tune]) (2.10)\n", - "Requirement already satisfied: pydantic!=1.7,!=1.7.1,!=1.7.2,!=1.7.3,!=1.8,!=1.8.1,<2.0.0,>=1.6.2 in /usr/local/lib/python3.7/dist-packages (from fastapi->ray[data,serve,tune]) (1.9.1)\n", - "Requirement already satisfied: anyio<5,>=3.4.0 in /usr/local/lib/python3.7/dist-packages (from starlette->ray[data,serve,tune]) (3.6.1)\n", - "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.7/dist-packages (from anyio<5,>=3.4.0->starlette->ray[data,serve,tune]) (1.2.0)\n", - "Requirement already satisfied: importlib-resources>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema->ray[data,serve,tune]) (5.7.1)\n", - "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema->ray[data,serve,tune]) (0.18.1)\n", - "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from jsonschema->ray[data,serve,tune]) (4.11.3)\n", - "Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.7/dist-packages (from importlib-resources>=1.4.0->jsonschema->ray[data,serve,tune]) (3.8.0)\n", - "Requirement already satisfied: google-api-core<3.0.0,>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from opencensus->ray[data,serve,tune]) (1.31.5)\n", - "Requirement already satisfied: opencensus-context>=0.1.2 in /usr/local/lib/python3.7/dist-packages (from opencensus->ray[data,serve,tune]) (0.1.2)\n", - "Requirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (2022.1)\n", - "Requirement already satisfied: google-auth<2.0dev,>=1.25.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (1.35.0)\n", - "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (1.56.1)\n", - "Requirement already satisfied: setuptools>=40.3.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (57.4.0)\n", - "Requirement already satisfied: packaging>=14.3 in /usr/local/lib/python3.7/dist-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (21.3)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from google-auth<2.0dev,>=1.25.0->google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (0.2.8)\n", - "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.7/dist-packages (from google-auth<2.0dev,>=1.25.0->google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (4.8)\n", - "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from google-auth<2.0dev,>=1.25.0->google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (4.2.4)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=14.3->google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (3.0.9)\n", - "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.7/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2.0dev,>=1.25.0->google-api-core<3.0.0,>=1.0.0->opencensus->ray[data,serve,tune]) (0.4.8)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->ray[data,serve,tune]) (1.24.3)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->ray[data,serve,tune]) (3.0.4)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->ray[data,serve,tune]) (2022.5.18.1)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->ray[data,serve,tune]) (2.8.2)\n", - "Requirement already satisfied: distlib<1,>=0.3.1 in /usr/local/lib/python3.7/dist-packages (from virtualenv->ray[data,serve,tune]) (0.3.4)\n", - "Requirement already satisfied: platformdirs<3,>=2 in /usr/local/lib/python3.7/dist-packages (from virtualenv->ray[data,serve,tune]) (2.5.2)\n", - "Found existing installation: ray 2.0.0.dev0\n", - "Uninstalling ray-2.0.0.dev0:\n", - " Successfully uninstalled ray-2.0.0.dev0\n", - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Collecting ray==3.0.0.dev0\n", - " Downloading https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl (54.9 MB)\n", - "\u001b[K |████████████████████████████████| 54.9 MB 74.4 MB/s \n", - "\u001b[?25hRequirement already satisfied: msgpack<2.0.0,>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (1.0.3)\n", - "Requirement already satisfied: virtualenv in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (20.14.1)\n", - "Requirement already satisfied: protobuf>=3.15.3 in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (3.17.3)\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (3.13)\n", - "Requirement already satisfied: click<=8.0.4,>=7.0 in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (7.1.2)\n", - "Requirement already satisfied: attrs in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (21.4.0)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (2.23.0)\n", - "Requirement already satisfied: frozenlist in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (1.3.0)\n", - "Requirement already satisfied: aiosignal in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (1.2.0)\n", - "Requirement already satisfied: numpy>=1.16 in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (1.21.6)\n", - "Requirement already satisfied: jsonschema in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (4.3.3)\n", - "Requirement already satisfied: grpcio<=1.43.0,>=1.28.1 in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (1.43.0)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from ray==3.0.0.dev0) (3.7.0)\n", - "Requirement already satisfied: six>=1.5.2 in /usr/local/lib/python3.7/dist-packages (from grpcio<=1.43.0,>=1.28.1->ray==3.0.0.dev0) (1.15.0)\n", - "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema->ray==3.0.0.dev0) (0.18.1)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from jsonschema->ray==3.0.0.dev0) (4.2.0)\n", - "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from jsonschema->ray==3.0.0.dev0) (4.11.3)\n", - "Requirement already satisfied: importlib-resources>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema->ray==3.0.0.dev0) (5.7.1)\n", - "Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.7/dist-packages (from importlib-resources>=1.4.0->jsonschema->ray==3.0.0.dev0) (3.8.0)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->ray==3.0.0.dev0) (2.10)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->ray==3.0.0.dev0) (3.0.4)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->ray==3.0.0.dev0) (1.24.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->ray==3.0.0.dev0) (2022.5.18.1)\n", - "Requirement already satisfied: platformdirs<3,>=2 in /usr/local/lib/python3.7/dist-packages (from virtualenv->ray==3.0.0.dev0) (2.5.2)\n", - "Requirement already satisfied: distlib<1,>=0.3.1 in /usr/local/lib/python3.7/dist-packages (from virtualenv->ray==3.0.0.dev0) (0.3.4)\n", - "Installing collected packages: ray\n", - "Successfully installed ray-3.0.0.dev0\n", - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (1.11.0+cu113)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch) (4.2.0)\n", - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Requirement already satisfied: torchvision in /usr/local/lib/python3.7/dist-packages (0.12.0+cu113)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from torchvision) (2.23.0)\n", - "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.7/dist-packages (from torchvision) (7.1.2)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from torchvision) (1.21.6)\n", - "Requirement already satisfied: torch==1.11.0 in /usr/local/lib/python3.7/dist-packages (from torchvision) (1.11.0+cu113)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torchvision) (4.2.0)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->torchvision) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->torchvision) (2022.5.18.1)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->torchvision) (3.0.4)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->torchvision) (1.24.3)\n" - ] - } - ], + "outputs": [], "source": [ "# !pip install -q \"ray[air]\"\n", "# !pip install -q torch\n", @@ -232,13 +106,56 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-05-25 22:25:31,150\tINFO services.py:1483 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" + "2022-07-20 21:47:49,873\tINFO services.py:1483 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" ] }, { "data": { + "text/html": [ + "
\n", + "
\n", + "

Ray

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "
Python version:3.7.10
Ray version: 3.0.0.dev0
Dashboard:http://127.0.0.1:8265
\n", + "
\n", + "
\n" + ], "text/plain": [ - "RayContext(dashboard_url='127.0.0.1:8265', python_version='3.7.13', ray_version='3.0.0.dev0', ray_commit='ac620aeec0c0f68c92328ace0b2a5835f5b14b26', address_info={'node_ip_address': '172.28.0.2', 'raylet_ip_address': '172.28.0.2', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-05-25_22-25-28_641559_1518/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-05-25_22-25-28_641559_1518/sockets/raylet', 'webui_url': '127.0.0.1:8265', 'session_dir': '/tmp/ray/session_2022-05-25_22-25-28_641559_1518', 'metrics_export_port': 61030, 'gcs_address': '172.28.0.2:62940', 'address': '172.28.0.2:62940', 'node_id': '97455d0de12f3393126427ed2b1ef0a009f0bd3fb97177cb86b42d92'})" + "RayContext(dashboard_url='127.0.0.1:8265', python_version='3.7.10', ray_version='3.0.0.dev0', ray_commit='{{RAY_COMMIT_SHA}}', address_info={'node_ip_address': '127.0.0.1', 'raylet_ip_address': '127.0.0.1', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-07-20_21-47-47_297236_39344/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-07-20_21-47-47_297236_39344/sockets/raylet', 'webui_url': '127.0.0.1:8265', 'session_dir': '/tmp/ray/session_2022-07-20_21-47-47_297236_39344', 'metrics_export_port': 62008, 'gcs_address': '127.0.0.1:57307', 'address': '127.0.0.1:57307', 'dashboard_agent_listen_port': 52365, 'node_id': 'db68eafa3bbe9042df574f3c9974b40ce8d97728db90282feefb4690'})" ] }, "execution_count": 2, @@ -266,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "id": "3TVkSmFFCHhI" }, @@ -595,6 +512,7 @@ "outputs": [], "source": [ "from ray.data.preprocessors import BatchMapper\n", + "from ray.data.extensions import TensorArray\n", "\n", "from torchvision import transforms\n", "\n", @@ -605,7 +523,7 @@ " [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]\n", " )\n", "\n", - " df[\"image\"] = df[\"image\"].map(torchvision_transforms)\n", + " df[\"image\"] = TensorArray([torchvision_transforms(image) for image in df[\"image\"]])\n", " return df\n", "\n", "mnist_normalize_preprocessor = BatchMapper(fn=preprocess_images)" @@ -704,7 +622,13 @@ " return df\n", "\n", " deployment = PredictorDeployment.options(name=\"mnist_model\", route_prefix=\"/mnist_predict\", version=f\"v{task_idx}\", num_replicas=2)\n", - " deployment.deploy(batching_params=False, http_adapter=json_to_pandas, predictor_cls=TorchPredictor, checkpoint=latest_checkpoint, model=SimpleMLP(num_classes=10))\n", + " deployment.deploy(\n", + " batching_params=dict(max_batch_size=10, batch_wait_timeout_s=5),\n", + " http_adapter=json_to_pandas, \n", + " predictor_cls=TorchPredictor, \n", + " checkpoint=latest_checkpoint, \n", + " model=SimpleMLP(num_classes=10)\n", + " )\n", " return deployment.url\n", "\n", "# Function that queries our deployed model\n", @@ -753,14 +677,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(_prepare_read pid=1772)\u001b[0m 2022-05-25 22:25:35,236\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n", - "Read->Map_Batches: 100%|██████████| 1/1 [00:05<00:00, 5.92s/it]\n", - "\u001b[2m\u001b[36m(_prepare_read pid=1772)\u001b[0m 2022-05-25 22:25:53,593\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n", - "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.51it/s]\n", - "Map Progress (2 actors 1 pending): 100%|██████████| 1/1 [00:02<00:00, 2.72s/it]\n", - "\u001b[2m\u001b[36m(_prepare_read pid=1978)\u001b[0m 2022-05-25 22:25:58,761\tWARNING torch_datasource.py:56 -- `SimpleTorchDatasource` doesn't support parallel reads. The `parallelism` argument will be ignored.\n", - "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.41it/s]\n", - "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.37s/it]" + "Read->Map_Batches: 100%|██████████| 1/1 [00:06<00:00, 6.40s/it]\n", + "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.12it/s]\n", + "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:02<00:00, 2.34s/it]\n", + "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.29it/s]\n", + "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.33s/it]\n" ] }, { @@ -770,22 +691,15 @@ "Starting training for task: 0\n" ] }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, { "data": { "text/html": [ - "== Status ==
Current time: 2022-05-25 22:27:16 (running for 00:01:14.46)
Memory usage on this node: 4.7/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.31 GiB heap, 0.0/3.66 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/TorchTrainer_2022-05-25_22-26-01
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-07-20 21:48:52 (running for 00:00:39.66)
Memory usage on this node: 33.1/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.14 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-48-13
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
TorchTrainer_a8585_00000TERMINATED172.28.0.2:2126
TorchTrainer_53c58_00000TERMINATED127.0.0.1:39548 4 36.4582824.229 1658378932 6.46339


" ], @@ -800,125 +714,102 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(_map_block_nosplit pid=2159)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(_map_block_nosplit pid=2159)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m 2022-05-25 22:26:19,944\tINFO torch.py:347 -- Setting up process group for: env:// [rank=0, world_size=1]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m 2022-05-25 22:26:20,033\tINFO torch.py:98 -- Moving model to device: cuda:0\n" + "2022-07-20 21:48:13,244\tINFO plugin_schema_manager.py:52 -- Loading the default runtime env schemas: ['/Users/jiaodong/Workspace/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/working_dir_schema.json', '/Users/jiaodong/Workspace/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/pip_schema.json'].\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 2.315190, epoch: 0, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 1.464406, epoch: 0, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 1.279081, epoch: 0, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 1.052461, epoch: 0, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.816213, epoch: 1, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 1.019127, epoch: 1, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.525613, epoch: 1, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.570595, epoch: 1, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.572004, epoch: 2, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.543432, epoch: 2, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.350156, epoch: 2, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.443743, epoch: 2, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.438318, epoch: 3, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.342512, epoch: 3, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.302048, epoch: 3, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2197)\u001b[0m loss: 0.414025, epoch: 3, iteration: 1500\n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 2.282040, epoch: 0, iteration: 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-05-25 22:27:16,013\tERROR checkpoint_manager.py:193 -- Result dict has no key: training_iteration. checkpoint_score_attr must be set to a key of the result dict. Valid keys are ['trial_id', 'experiment_id', 'date', 'timestamp', 'pid', 'hostname', 'node_ip', 'config', 'done']\n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m 2022-07-20 21:48:26,772\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Trial TorchTrainer_a8585_00000 completed. Last result: \n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 1.521038, epoch: 0, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 1.169452, epoch: 0, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.856338, epoch: 0, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.788410, epoch: 1, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.854239, epoch: 1, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.533351, epoch: 1, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.591339, epoch: 1, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.457057, epoch: 2, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.594715, epoch: 2, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.477588, epoch: 2, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.235412, epoch: 2, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.507374, epoch: 3, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.447128, epoch: 3, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.381943, epoch: 3, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39562)\u001b[0m loss: 0.347877, epoch: 3, iteration: 1500\n", + "Result for TorchTrainer_53c58_00000:\n", + " _time_this_iter_s: 6.463389873504639\n", + " _timestamp: 1658378932\n", + " _training_iteration: 4\n", + " date: 2022-07-20_21-48-52\n", + " done: true\n", + " experiment_id: abc531ef544440268933d8221addeb9d\n", + " experiment_tag: '0'\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 4\n", + " loss: 824.2287287414074\n", + " node_ip: 127.0.0.1\n", + " pid: 39548\n", + " should_checkpoint: true\n", + " time_since_restore: 36.45815992355347\n", + " time_this_iter_s: 6.464020013809204\n", + " time_total_s: 36.45815992355347\n", + " timestamp: 1658378932\n", + " timesteps_since_restore: 0\n", + " training_iteration: 4\n", + " trial_id: 53c58_00000\n", + " warmup_time: 0.003597259521484375\n", + " \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-05-25 22:27:16,138\tINFO tune.py:753 -- Total run time: 74.68 seconds (74.45 seconds for the tuning loop).\n", - "Map Progress (1 actors 1 pending): 0%| | 0/1 [00:01Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.20it/s]\n", - "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.41s/it]" + "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:48:57,458 controller 39625 checkpoint_path.py:17 - Using RayInternalKVStore for controller checkpoint and recovery.\n", + "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:48:57,460 controller 39625 http_state.py:126 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:oEzsmU:SERVE_PROXY_ACTOR-db68eafa3bbe9042df574f3c9974b40ce8d97728db90282feefb4690' on node 'db68eafa3bbe9042df574f3c9974b40ce8d97728db90282feefb4690' listening on '127.0.0.1:8000'\n", + "Shuffle Map: 0%| | 0/1 [00:00Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.39it/s]\n", + "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.37s/it]\n" ] }, { @@ -928,22 +819,15 @@ "Starting training for task: 1\n" ] }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, { "data": { "text/html": [ - "== Status ==
Current time: 2022-05-25 22:28:52 (running for 00:01:09.00)
Memory usage on this node: 5.0/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.31 GiB heap, 0.0/3.66 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/TorchTrainer_2022-05-25_22-27-43
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-07-20 21:50:36 (running for 00:00:37.98)
Memory usage on this node: 33.7/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.14 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-49-58
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
TorchTrainer_e4f66_00000TERMINATED172.28.0.2:2875
TorchTrainer_92bcd_00000TERMINATED127.0.0.1:39736 4 34.1132707.634 1658379035 6.45643


" ], @@ -958,125 +842,105 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(_map_block_nosplit pid=2909)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(_map_block_nosplit pid=2909)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m 2022-05-25 22:28:01,917\tINFO torch.py:347 -- Setting up process group for: env:// [rank=0, world_size=1]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m 2022-05-25 22:28:02,063\tINFO torch.py:98 -- Moving model to device: cuda:0\n" + "\u001b[2m\u001b[36m(TorchTrainer pid=39736)\u001b[0m 2022-07-20 21:50:01,936\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m 2022-07-20 21:50:09,489\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 3.347775, epoch: 0, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 1.343975, epoch: 0, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.768560, epoch: 0, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.607410, epoch: 0, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.578952, epoch: 1, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.473788, epoch: 1, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.609530, epoch: 1, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.741895, epoch: 1, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.417272, epoch: 2, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.510404, epoch: 2, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.422137, epoch: 2, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.403623, epoch: 2, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.384720, epoch: 3, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.414567, epoch: 3, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.274302, epoch: 3, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=2948)\u001b[0m loss: 0.348169, epoch: 3, iteration: 1500\n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 3.301114, epoch: 0, iteration: 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-05-25 22:28:52,221\tERROR checkpoint_manager.py:193 -- Result dict has no key: training_iteration. checkpoint_score_attr must be set to a key of the result dict. Valid keys are ['trial_id', 'experiment_id', 'date', 'timestamp', 'pid', 'hostname', 'node_ip', 'config', 'done']\n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m 2022-07-20 21:50:09,795\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m /Users/jiaodong/Workspace/ray/python/ray/air/_internal/torch_utils.py:64: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:178.)\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m return torch.as_tensor(vals, dtype=dtype)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Trial TorchTrainer_e4f66_00000 completed. Last result: \n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 1.075076, epoch: 0, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.536976, epoch: 0, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.600182, epoch: 0, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.546070, epoch: 1, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.448120, epoch: 1, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.392481, epoch: 1, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.371981, epoch: 1, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.521735, epoch: 2, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.635850, epoch: 2, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.395862, epoch: 2, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.402500, epoch: 2, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.236922, epoch: 3, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.528482, epoch: 3, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.372242, epoch: 3, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39752)\u001b[0m loss: 0.355759, epoch: 3, iteration: 1500\n", + "Result for TorchTrainer_92bcd_00000:\n", + " _time_this_iter_s: 6.456433057785034\n", + " _timestamp: 1658379035\n", + " _training_iteration: 4\n", + " date: 2022-07-20_21-50-36\n", + " done: true\n", + " experiment_id: 21820161d0a245428cf75b0b9b17fe6e\n", + " experiment_tag: '0'\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 4\n", + " loss: 707.6341038495302\n", + " node_ip: 127.0.0.1\n", + " pid: 39736\n", + " should_checkpoint: true\n", + " time_since_restore: 34.11321783065796\n", + " time_this_iter_s: 6.463765859603882\n", + " time_total_s: 34.11321783065796\n", + " timestamp: 1658379036\n", + " timesteps_since_restore: 0\n", + " training_iteration: 4\n", + " trial_id: 92bcd_00000\n", + " warmup_time: 0.005189180374145508\n", + " \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-05-25 22:28:52,344\tINFO tune.py:753 -- Total run time: 69.20 seconds (68.99 seconds for the tuning loop).\n", - "Map Progress (1 actors 1 pending): 0%| | 0/2 [00:01Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.44it/s]\n", - "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.41s/it]" + "Shuffle Map: 100%|██████████| 1/1 [00:00<00:00, 6.24it/s]\n", + "Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 6.19it/s]\n", + "Map Progress (1 actors 0 pending): 100%|██████████| 1/1 [00:01<00:00, 1.18s/it]\n", + "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:50:42,924 controller 39625 deployment_state.py:1240 - Stopping 1 replicas of deployment 'mnist_model' with outdated versions.\n", + "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:50:45,044 controller 39625 deployment_state.py:1281 - Adding 1 replicas to deployment 'mnist_model'.\n", + "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:50:47,377 controller 39625 deployment_state.py:1240 - Stopping 1 replicas of deployment 'mnist_model' with outdated versions.\n", + "\u001b[2m\u001b[36m(ServeController pid=39625)\u001b[0m INFO 2022-07-20 21:50:49,504 controller 39625 deployment_state.py:1281 - Adding 1 replicas to deployment 'mnist_model'.\n", + "Map Progress (2 actors 0 pending): 100%|██████████| 1/1 [00:02<00:00, 2.36s/it]\n", + "Read->Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 2.04it/s]\n", + "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00, 1.37s/it]\n" ] }, { @@ -1086,22 +950,15 @@ "Starting training for task: 2\n" ] }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, { "data": { "text/html": [ - "== Status ==
Current time: 2022-05-25 22:30:31 (running for 00:01:09.12)
Memory usage on this node: 5.0/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.31 GiB heap, 0.0/3.66 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/TorchTrainer_2022-05-25_22-29-22
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-07-20 21:52:25 (running for 00:00:37.97)
Memory usage on this node: 34.0/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.14 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-51-47
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
TorchTrainer_2040e_00000TERMINATED172.28.0.2:3703
TorchTrainer_d37db_00000TERMINATED127.0.0.1:39948 4 34.0141671.998 1658379144 6.59292


" ], @@ -1116,123 +973,89 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(_map_block_nosplit pid=3738)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(_map_block_nosplit pid=3738)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m 2022-05-25 22:29:41,392\tINFO torch.py:347 -- Setting up process group for: env:// [rank=0, world_size=1]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m 2022-05-25 22:29:41,549\tINFO torch.py:98 -- Moving model to device: cuda:0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 4.353125, epoch: 0, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 1.147782, epoch: 0, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.609233, epoch: 0, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.606812, epoch: 0, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.494777, epoch: 1, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.776362, epoch: 1, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.376833, epoch: 1, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.478181, epoch: 1, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.413856, epoch: 2, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.668218, epoch: 2, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.318078, epoch: 2, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.427121, epoch: 2, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.369263, epoch: 3, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.479945, epoch: 3, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.457482, epoch: 3, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=3778)\u001b[0m loss: 0.318416, epoch: 3, iteration: 1500\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-05-25 22:30:31,831\tERROR checkpoint_manager.py:193 -- Result dict has no key: training_iteration. checkpoint_score_attr must be set to a key of the result dict. Valid keys are ['trial_id', 'experiment_id', 'date', 'timestamp', 'pid', 'hostname', 'node_ip', 'config', 'done']\n" + "\u001b[2m\u001b[36m(TorchTrainer pid=39948)\u001b[0m 2022-07-20 21:51:50,596\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m 2022-07-20 21:51:58,118\tINFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m [W ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m 2022-07-20 21:51:58,367\tINFO train_loop_utils.py:298 -- Moving model to device: cpu\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Trial TorchTrainer_2040e_00000 completed. Last result: \n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 4.062408, epoch: 0, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.970063, epoch: 0, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.658269, epoch: 0, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.442650, epoch: 0, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.603212, epoch: 1, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.534739, epoch: 1, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.420072, epoch: 1, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.351545, epoch: 1, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.347010, epoch: 2, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.419703, epoch: 2, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.350773, epoch: 2, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.231652, epoch: 2, iteration: 1500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.343125, epoch: 3, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.547853, epoch: 3, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.353915, epoch: 3, iteration: 1000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=39968)\u001b[0m loss: 0.260028, epoch: 3, iteration: 1500\n", + "Result for TorchTrainer_d37db_00000:\n", + " _time_this_iter_s: 6.5929179191589355\n", + " _timestamp: 1658379144\n", + " _training_iteration: 4\n", + " date: 2022-07-20_21-52-24\n", + " done: true\n", + " experiment_id: 5d41bf13ba524c528faac8f64b13c7cc\n", + " experiment_tag: '0'\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 4\n", + " loss: 671.9976235236973\n", + " node_ip: 127.0.0.1\n", + " pid: 39948\n", + " should_checkpoint: true\n", + " time_since_restore: 34.01405596733093\n", + " time_this_iter_s: 6.590774774551392\n", + " time_total_s: 34.01405596733093\n", + " timestamp: 1658379144\n", + " timesteps_since_restore: 0\n", + " training_iteration: 4\n", + " trial_id: d37db_00000\n", + " warmup_time: 0.005116939544677734\n", + " \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-05-25 22:30:31,953\tINFO tune.py:753 -- Total run time: 69.33 seconds (69.12 seconds for the tuning loop).\n", - "Map Progress (1 actors 1 pending): 0%| | 0/3 [00:01Current time: 2022-05-18 23:52:49 (running for 00:03:27.40)
Memory usage on this node: 7.0/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.34 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/TorchTrainer_2022-05-18_23-49-22
Number of trials: 1/1 (1 TERMINATED)
\n", + "== Status ==
Current time: 2022-07-20 21:55:10 (running for 00:01:25.89)
Memory usage on this node: 34.4/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/28.14 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jiaodong/ray_results/TorchTrainer_2022-07-20_21-53-44
Number of trials: 1/1 (1 TERMINATED)
\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc
Trial name status loc iter total time (s) loss _timestamp _time_this_iter_s
TorchTrainer_24496_00000TERMINATED172.28.0.2:4630
TorchTrainer_1923b_00000TERMINATED127.0.0.1:40228 4 82.72852328.8 1658379309 17.0239


" ], @@ -1481,85 +1296,59 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(_map_block_nosplit pid=4666)\u001b[0m /usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py:133: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n", - "\u001b[2m\u001b[36m(_map_block_nosplit pid=4666)\u001b[0m img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m 2022-05-18 23:50:06,950\tINFO torch.py:347 -- Setting up process group for: env:// [rank=0, world_size=1]\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m 2022-05-18 23:50:07,011\tINFO torch.py:98 -- Moving model to device: cuda:0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 2.373475, epoch: 0, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 1.699985, epoch: 0, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 1.636039, epoch: 0, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 1.334987, epoch: 0, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 1.152312, epoch: 0, iteration: 2000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.998297, epoch: 0, iteration: 2500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 1.434949, epoch: 0, iteration: 3000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.971171, epoch: 0, iteration: 3500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.796480, epoch: 0, iteration: 4000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.802282, epoch: 0, iteration: 4500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.731363, epoch: 0, iteration: 5000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.847772, epoch: 0, iteration: 5500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.879676, epoch: 1, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.564319, epoch: 1, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.714444, epoch: 1, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.565163, epoch: 1, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.739525, epoch: 1, iteration: 2000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.510878, epoch: 1, iteration: 2500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.814798, epoch: 1, iteration: 3000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.473765, epoch: 1, iteration: 3500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.557866, epoch: 1, iteration: 4000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.674371, epoch: 1, iteration: 4500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.532800, epoch: 1, iteration: 5000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.832442, epoch: 1, iteration: 5500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.557547, epoch: 2, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.355255, epoch: 2, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.426749, epoch: 2, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.484543, epoch: 2, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.360856, epoch: 2, iteration: 2000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.444718, epoch: 2, iteration: 2500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.596777, epoch: 2, iteration: 3000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.289816, epoch: 2, iteration: 3500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.407941, epoch: 2, iteration: 4000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.438239, epoch: 2, iteration: 4500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.379983, epoch: 2, iteration: 5000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.527786, epoch: 2, iteration: 5500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.598584, epoch: 3, iteration: 0\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.355202, epoch: 3, iteration: 500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.392683, epoch: 3, iteration: 1000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.415264, epoch: 3, iteration: 1500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.417230, epoch: 3, iteration: 2000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.289974, epoch: 3, iteration: 2500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.648514, epoch: 3, iteration: 3000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.369468, epoch: 3, iteration: 3500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.378548, epoch: 3, iteration: 4000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.392761, epoch: 3, iteration: 4500\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.555575, epoch: 3, iteration: 5000\n", - "\u001b[2m\u001b[36m(BaseWorkerMixin pid=4709)\u001b[0m loss: 0.394487, epoch: 3, iteration: 5500\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-05-18 23:52:49,915\tERROR checkpoint_manager.py:193 -- Result dict has no key: training_iteration. checkpoint_score_attr must be set to a key of the result dict. Valid keys are ['trial_id', 'experiment_id', 'date', 'timestamp', 'pid', 'hostname', 'node_ip', 'config', 'done']\n" + "\u001b[2m\u001b[36m(TorchTrainer pid=40228)\u001b[0m 2022-07-20 21:53:47,328\tWARNING base_trainer.py:167 -- When passing `datasets` to a Trainer, it is recommended to reserve at least 20% of node CPUs for Dataset execution by setting `_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. Not doing so can lead to resource contention or hangs. See https://docs.ray.io/en/master/data/key-concepts.html#example-datasets-in-tune for more info.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Trial TorchTrainer_24496_00000 completed. Last result: \n" + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 2.305423, epoch: 0, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 1.935424, epoch: 0, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 1.174222, epoch: 0, iteration: 5000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.776577, epoch: 0, iteration: 5500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.674814, epoch: 1, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.699747, epoch: 1, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.795673, epoch: 1, iteration: 5000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.651217, epoch: 1, iteration: 5500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.743072, epoch: 2, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.745054, epoch: 2, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.639829, epoch: 2, iteration: 5000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.682482, epoch: 2, iteration: 5500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.553197, epoch: 3, iteration: 0\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.471037, epoch: 3, iteration: 500\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.538055, epoch: 3, iteration: 5000\n", + "\u001b[2m\u001b[36m(BaseWorkerMixin pid=40276)\u001b[0m loss: 0.534079, epoch: 3, iteration: 5500\n", + "Result for TorchTrainer_1923b_00000:\n", + " _time_this_iter_s: 17.023871898651123\n", + " _timestamp: 1658379309\n", + " _training_iteration: 4\n", + " date: 2022-07-20_21-55-10\n", + " done: true\n", + " experiment_id: d304983bfe3f4e269118f8618aa9b02f\n", + " experiment_tag: '0'\n", + " hostname: Jiaos-MacBook-Pro-16-inch-2019\n", + " iterations_since_restore: 4\n", + " loss: 2328.8038033917546\n", + " node_ip: 127.0.0.1\n", + " pid: 40228\n", + " should_checkpoint: true\n", + " time_since_restore: 82.72845268249512\n", + " time_this_iter_s: 17.024354696273804\n", + " time_total_s: 82.72845268249512\n", + " timestamp: 1658379310\n", + " timesteps_since_restore: 0\n", + " training_iteration: 4\n", + " trial_id: 1923b_00000\n", + " warmup_time: 0.004433870315551758\n", + " \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-05-18 23:52:50,042\tINFO tune.py:753 -- Total run time: 207.53 seconds (207.39 seconds for the tuning loop).\n" + "2022-07-20 21:55:10,233\tINFO tune.py:738 -- Total run time: 86.00 seconds (85.88 seconds for the tuning loop).\n" ] } ], @@ -1593,7 +1382,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1606,13 +1395,12 @@ "name": "stderr", "output_type": "stream", "text": [ - "Map Progress (1 actors 1 pending): 0%| | 0/3 [00:01 "pd.DataFrame": + """Unpack predictor's return value with TensorArray into numpy. + + In dl_predictor.py we return a pd.DataFrame that could have multiple + columns but value of each column is a TensorArray. Flatten the + TensorArray to list to ensure output is json serializable as http + response. + """ + from ray.data.extensions import TensorArray, TensorArrayElement + + for col in output_df: + if isinstance(output_df[col].values, (TensorArray, TensorArrayElement)): + output_df[col] = output_df[col].to_numpy() + + return output_df + + class BatchingManager: """A collection of utilities for batching and splitting data.""" @@ -91,6 +108,9 @@ def split_dataframe( f"The output dataframe should have length divisible by {batch_size}, " f"but Serve got length {len(output_df)}." ) + + output_df = _unpack_tensorarray_from_pandas(output_df) + return [df.reset_index(drop=True) for df in np.split(output_df, batch_size)] @staticmethod @@ -200,6 +220,8 @@ async def predict_impl(inp: Union[np.ndarray, "pd.DataFrame"]): out = self.model.predict(inp, **predict_kwargs) if isinstance(out, ray.ObjectRef): out = await out + elif pd is not None and isinstance(out, pd.DataFrame): + out = _unpack_tensorarray_from_pandas(out) return out else: diff --git a/python/ray/serve/tests/test_air_integrations.py b/python/ray/serve/tests/test_air_integrations.py index 26bd6f8a39c9..b0a222a69879 100644 --- a/python/ray/serve/tests/test_air_integrations.py +++ b/python/ray/serve/tests/test_air_integrations.py @@ -16,6 +16,7 @@ from ray.serve.deployment_graph_build import build from ray.serve.http_adapters import json_to_ndarray from ray.train.predictor import DataBatchType, Predictor +from ray.data.extensions import TensorArray class TestBatchingFunctionFunctions: @@ -73,6 +74,25 @@ def test_dataframe(self): for i, j in zip(unpacked_list, list_of_dfs): assert i.equals(j) + def test_dataframe_with_tensorarray(self): + batched_df = pd.DataFrame( + { + "a": TensorArray([1, 2, 3, 4]), + "b": TensorArray([5, 6, 7, 8]), + } + ) + split_df = pd.DataFrame( + { + "a": [1, 2, 3, 4], + "b": [5, 6, 7, 8], + } + ) + + unpacked_list = BatchingManager.split_dataframe(batched_df, 1) + assert len(unpacked_list) == 1 + assert unpacked_list[0]["a"].equals(split_df["a"]) + assert unpacked_list[0]["b"].equals(split_df["b"]) + class AdderPredictor(Predictor): def __init__(self, increment: int, do_double: bool) -> None: