Revert "Revert "Global logging format changes" (ray-project#34126)" (r…

…ay-project#34182) Attempts to consolidate logging configuration by introducing reasonable defaults in ray/log.py. This new logging configuration is done once in ray/__init__.py at the top of the module. Subsequent calls to the configuration are ignored. A logger for ray.rllib is configured at the WARN level, to address Revert "Simplify logging configuration. (ray-project#30863)" ray-project#31858. With this change, Revert "Simplify logging configuration. (ray-project#30863)" ray-project#31858 can be reverted, again simplifying and consolidating logging configuration. Modified test_output.py::test_logger_config to test only the logger config, not launch a ray cluster. The test was failing intermittently, I think due to a race condition between the launch of the cluster and the reading of the subprocess's stdout, and anyway it wasn't necessary to call ray.init here to check that logging was configured correctly. Modified python/ray/tune/tests/test_commands.py::test_ls_with_cfg to test the underlying data, not what gets printed to stdout (which has changed with the new logging system). Modified a logging message in ray.tune.automl.search_policy.AutoMLSearcher.on_trial_complete, which in certain cases emits a logging message which tries to format a NoneType into a %f during log message formatting. This was a previously-undetected bug which showed up because the default log level is now INFO. This fixes a test that was failing in test_automl_searcher.py::AutoMLSearcherTest. Signed-off-by: e428265 <[email protected]>
lmco · Aug 31, 2023 · 360f405 · 360f405
1 parent b745551
commit 360f405
Show file tree

Hide file tree

Showing 34 changed files with 537 additions and 192 deletions.
diff --git a/.gitignore b/.gitignore
@@ -16,6 +16,9 @@
 .jar
 /dashboard/client/build
 
+# Kuberay config lives in a separate repository
+python/ray/autoscaler/kuberay/config
+
 # Files generated by flatc should be ignored
 /src/ray/gcs/format/*_generated.h
 /src/ray/object_manager/format/*_generated.h

diff --git a/ci/lint/check_api_annotations.py b/ci/lint/check_api_annotations.py
@@ -99,7 +99,13 @@ def verify(symbol, scanned, ok, output, prefix=None, ignore=None):
     verify(ray.air, set(), ok, output)
     verify(ray.train, set(), ok, output)
     verify(ray.tune, set(), ok, output)
-    verify(ray, set(), ok, output, ignore=["ray.workflow", "ray.tune", "ray.serve"])
+    verify(
+        ray,
+        set(),
+        ok,
+        output,
+        ignore=["ray.workflow", "ray.tune", "ray.serve"],
+    )
     verify(ray.serve, set(), ok, output)
     assert len(ok) >= 500, len(ok)
     # TODO(ekl) enable it for all modules.

diff --git a/dashboard/agent.py b/dashboard/agent.py
@@ -5,6 +5,7 @@
 import logging
 import logging.handlers
 import os
+import pathlib
 import sys
 import signal
 
@@ -18,7 +19,10 @@
 from ray._private.gcs_pubsub import GcsAioPublisher
 from ray._raylet import GcsClient
 from ray._private.gcs_utils import GcsAioClient
-from ray._private.ray_logging import setup_component_logger
+from ray._private.ray_logging import (
+    setup_component_logger,
+    configure_log_file,
+)
 from ray.core.generated import agent_manager_pb2, agent_manager_pb2_grpc
 from ray.experimental.internal_kv import (
     _initialize_internal_kv,
@@ -341,6 +345,14 @@ async def _check_parent():
             await self.http_server.cleanup()
 
 
+def open_capture_files(log_dir):
+    filename = f"agent-{args.agent_id}"
+    return (
+        ray._private.utils.open_log(pathlib.Path(log_dir) / f"{filename}.out"),
+        ray._private.utils.open_log(pathlib.Path(log_dir) / f"{filename}.err"),
+    )
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Dashboard agent.")
     parser.add_argument(
@@ -507,6 +519,10 @@ async def _check_parent():
         # w.r.t grpc server init in the DashboardAgent initializer.
         loop = ray._private.utils.get_or_create_event_loop()
 
+        # Setup stdout/stderr redirect files
+        out_file, err_file = open_capture_files(args.log_dir)
+        configure_log_file(out_file, err_file)
+
         agent = DashboardAgent(
             args.node_ip_address,
             args.dashboard_agent_port,

diff --git a/doc/source/ray-observability/ray-logging.rst b/doc/source/ray-observability/ray-logging.rst
@@ -2,7 +2,43 @@
 
 Logging
 =======
-This document will explain Ray's logging system and its best practices.
+This document explains Ray's logging system and related best practices.
+
+Internal Ray Logging Configuration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When ``import ray`` is executed, Ray's logger is initialized, generating a sensible configuration given in ``python/ray/_private/log.py``. The default logging level is ``logging.INFO``.
+
+All ray loggers are automatically configured in ``ray._private.ray_logging``. To change the Ray library logging configuration:
+
+.. code-block:: python
+
+   import logging
+
+   logger = logging.getLogger("ray")
+   logger # Modify the ray logging config
+
+Similarly, to modify the logging configuration for any Ray subcomponent, specify the appropriate logger name:
+
+.. code-block:: python
+
+   import logging
+
+   # First, get the handle for the logger you want to modify
+   ray_data_logger = logging.getLogger("ray.data")
+   ray_tune_logger = logging.getLogger("ray.tune")
+   ray_rllib_logger = logging.getLogger("ray.rllib")
+   ray_air_logger = logging.getLogger("ray.air")
+   ray_train_logger = logging.getLogger("ray.train")
+   ray_workflow_logger = logging.getLogger("ray.workflow")
+
+   # Modify the ray.data logging level
+   ray_data_logger.setLevel(logging.WARNING)
+
+   # Other loggers can be modified similarly.
+   # Here's how to add an aditional file handler for ray tune:
+   ray_tune_logger.addHandler(logging.FileHandler("extra_ray_tune_log.log"))
+
+For more information about logging in workers, see :ref:`Customizing worker loggers`.
 
 Driver logs
 ~~~~~~~~~~~
@@ -16,12 +52,12 @@ The log file consists of the stdout of the entrypoint command of the job.  For t
 
 .. _ray-worker-logs:
 
-Worker logs
+Worker stdout and stderr
 ~~~~~~~~~~~
-Ray's tasks or actors are executed remotely within Ray's worker processes. Ray has special support to improve the visibility of logs produced by workers.
+Ray's tasks or actors are executed remotely within Ray's worker processes. Ray has special support to improve the visibility of stdout and stderr produced by workers.
 
-- By default, all of the tasks/actors stdout and stderr are redirected to the worker log files. Check out :ref:`Logging directory structure <logging-directory-structure>` to learn how Ray's logging directory is structured.
-- By default, all of the tasks/actors stdout and stderr that is redirected to worker log files are published to the driver. Drivers display logs generated from its tasks/actors to its stdout and stderr.
+- By default, stdout and stderr from all tasks and actors are redirected to the worker log files, including any log messages generated by the worker. See :ref:`Logging directory structure <logging-directory-structure>` to understand the structure of the Ray logging directory.
+- By default, the driver reads the worker log files to which the stdout and stderr for all tasks and actors are redirected. Drivers display all stdout and stderr generated from their tasks or actors to their own stdout and stderr.
 
 Let's look at a code example to see how this works.
 
@@ -37,7 +73,7 @@ Let's look at a code example to see how this works.
 
     ray.get(task.remote())
 
-You should be able to see the string `task` from your driver stdout. 
+You should be able to see the string `task` from your driver stdout.
 
 When logs are printed, the process id (pid) and an IP address of the node that executes tasks/actors are printed together. Check out the output below.
 
@@ -139,10 +175,9 @@ Limitations:
 By default, the builtin print will also be patched to use `ray.experimental.tqdm_ray.safe_print` when `tqdm_ray` is used.
 This avoids progress bar corruption on driver print statements. To disable this, set `RAY_TQDM_PATCH_PRINT=0`.
 
-How to set up loggers
+Customizing Worker Loggers
 ~~~~~~~~~~~~~~~~~~~~~
-When using ray, all of the tasks and actors are executed remotely in Ray's worker processes. 
-Since Python logger module creates a singleton logger per process, loggers should be configured on per task/actor basis. 
+When using Ray, all tasks and actors are executed remotely in Ray's worker processes.
 
 .. note::
 
@@ -164,22 +199,24 @@ Since Python logger module creates a singleton logger per process, loggers shoul
             logging.basicConfig(level=logging.INFO)
 
         def log(self, msg):
-            logging.info(msg)
+            logger = logging.getLogger(__name__)
+            logger.info(msg)
 
     actor = Actor.remote()
     ray.get(actor.log.remote("A log message for an actor."))
 
     @ray.remote
     def f(msg):
         logging.basicConfig(level=logging.INFO)
-        logging.info(msg)
+        logger = logging.getLogger(__name__)
+        logger.info(msg)
 
-    ray.get(f.remote("A log message for a task"))
+    ray.get(f.remote("A log message for a task."))
 
 .. code-block:: bash
 
-    (pid=95193) INFO:root:A log message for a task
-    (pid=95192) INFO:root:A log message for an actor.
+    (Actor pid=179641) INFO:__main__:A log message for an actor.
+    (f pid=177572) INFO:__main__:A log message for a task.
 
 How to use structured logging
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -206,11 +243,11 @@ Logging directory structure
 ---------------------------
 .. _logging-directory-structure:
 
-By default, Ray logs are stored in a ``/tmp/ray/session_*/logs`` directory. 
+By default, Ray logs are stored in a ``/tmp/ray/session_*/logs`` directory.
 
 .. note::
 
-    The default temp directory is ``/tmp/ray`` (for Linux and Mac OS). If you'd like to change the temp directory, you can specify it when ``ray start`` or ``ray.init()`` is called. 
+    The default temp directory is ``/tmp/ray`` (for Linux and MacOS). To change the temp directory, specify it when you call ``ray start`` or ``ray.init()``.
 
 A new Ray instance creates a new session ID to the temp directory. The latest session ID is symlinked to ``/tmp/ray/session_latest``.
 
@@ -235,7 +272,7 @@ Here's a Ray log directory structure. Note that ``.out`` is logs from stdout/std
   For the logs of the actual installations (including e.g. ``pip install`` logs), see the ``runtime_env_setup-[job_id].log`` file (see below).
 - ``runtime_env_setup-[job_id].log``: Logs from installing :ref:`runtime environments <runtime-environments>` for a task, actor or job.  This file will only be present if a runtime environment is installed.
 - ``runtime_env_setup-ray_client_server_[port].log``: Logs from installing :ref:`runtime environments <runtime-environments>` for a job when connecting via :ref:`Ray Client <ray-client-ref>`.
-- ``worker-[worker_id]-[job_id]-[pid].[out|err]``: Python/Java part of Ray drivers and workers. All of stdout and stderr from tasks/actors are streamed here. Note that job_id is an id of the driver.- 
+- ``worker-[worker_id]-[job_id]-[pid].[out|err]``: Python or Java part of Ray drivers and workers. All of stdout and stderr from tasks or actors are streamed here. Note that job_id is an id of the driver.-
 
 .. _ray-log-rotation:
 

diff --git a/python/ray/__init__.py b/python/ray/__init__.py
@@ -1,8 +1,10 @@
 # isort: skip_file
+from ray._private import log  # isort: skip # noqa: F401
 import logging
 import os
 import sys
 
+log.generate_logging_config()
 logger = logging.getLogger(__name__)
 
 

diff --git a/python/ray/_private/log.py b/python/ray/_private/log.py
@@ -0,0 +1,136 @@
+import logging
+import re
+from logging.config import dictConfig
+import threading
+
+
+class ContextFilter(logging.Filter):
+    """A filter that adds ray context info to log records.
+
+    This filter adds a package name to append to the message as well as information
+    about what worker emitted the message, if applicable.
+    """
+
+    logger_regex = re.compile(r"ray(\.(?P<subpackage>\w+))?(\..*)?")
+    package_message_names = {
+        "air": "AIR",
+        "data": "Data",
+        "rllib": "RLlib",
+        "serve": "Serve",
+        "train": "Train",
+        "tune": "Tune",
+        "workflow": "Workflow",
+    }
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def filter(self, record: logging.LogRecord) -> bool:
+        """Add context information to the log record.
+
+        This filter adds a package name from where the message was generated as
+        well as the worker IP address, if applicable.
+
+        Args:
+            record: Record to be filtered
+
+        Returns:
+            True if the record is to be logged, False otherwise. (This filter only
+            adds context, so records are always logged.)
+        """
+        match = self.logger_regex.search(record.name)
+        if match and match["subpackage"] in self.package_message_names:
+            record.package = f"[Ray {self.package_message_names[match['subpackage']]}]"
+        else:
+            record.package = ""
+
+        return True
+
+
+class PlainRayHandler(logging.StreamHandler):
+    """A plain log handler.
+
+    This handler writes to whatever sys.stderr points to at emit-time,
+    not at instantiation time. See docs for logging._StderrHandler.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.plain_handler = logging._StderrHandler()
+        self.plain_handler.level = self.level
+        self.plain_handler.formatter = logging.Formatter(fmt="%(message)s")
+
+    def emit(self, record: logging.LogRecord):
+        """Emit the log message.
+
+        If this is a worker, bypass fancy logging and just emit the log record.
+        If this is the driver, emit the message using the appropriate console handler.
+
+        Args:
+            record: Log record to be emitted
+        """
+        import ray
+
+        if (
+            hasattr(ray, "_private")
+            and ray._private.worker.global_worker.mode
+            == ray._private.worker.WORKER_MODE
+        ):
+            self.plain_handler.emit(record)
+        else:
+            logging._StderrHandler.emit(self, record)
+
+
+logger_initialized = False
+logging_config_lock = threading.Lock()
+
+
+def generate_logging_config():
+    """Generate the default Ray logging configuration."""
+    with logging_config_lock:
+        global logger_initialized
+        if logger_initialized:
+            return
+        logger_initialized = True
+
+        formatters = {
+            "plain": {
+                "datefmt": "[%Y-%m-%d %H:%M:%S]",
+                "format": "%(asctime)s %(package)s %(levelname)s %(name)s::%(message)s",
+            },
+        }
+        filters = {"context_filter": {"()": ContextFilter}}
+        handlers = {
+            "default": {
+                "()": PlainRayHandler,
+                "formatter": "plain",
+                "filters": ["context_filter"],
+            }
+        }
+
+        loggers = {
+            # Default ray logger; any log message that gets propagated here will be
+            # logged to the console. Disable propagation, as many users will use
+            # basicConfig to set up a default handler. If so, logs will be
+            # printed twice unless we prevent propagation here.
+            "ray": {
+                "level": "INFO",
+                "handlers": ["default"],
+                "propagate": False,
+            },
+            # Special handling for ray.rllib: only warning-level messages passed through
+            # See https://github.com/ray-project/ray/pull/31858 for related PR
+            "ray.rllib": {
+                "level": "WARN",
+            },
+        }
+
+        dictConfig(
+            {
+                "version": 1,
+                "formatters": formatters,
+                "filters": filters,
+                "handlers": handlers,
+                "loggers": loggers,
+            }
+        )