[Datasets] Persist Datasets statistics to log file (ray-project#30557)
Currently, when we print Dataset stats after execution, there is no way to retrieve this information if the job fails or crashes. By persisting the logs to a separate file, we can access the stats later, which is helpful for debugging. By default, this file is `logs/ray-data.log` under the Ray session directory. The new logger, `DatasetLogger`, always writes logs to the `ray-data.log` file, and optionally also writes to stdout (enabled by default). The motivation is that users can easily use the dedicated log file to filter for Dataset logs, while still keeping console logs for those who rely on them.

Signed-off-by: tmynn <[email protected]>
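As a concrete illustration of the persistence (not part of the commit), the sketch below reads the log back after a run; it assumes `ray.init()` has been called and that some Dataset work has instantiated the logger, and it reuses the same private session-dir accessor that the new module relies on:

```
import os

import ray
from ray.data._internal.dataset_logger import DatasetLogger

ray.init()
# ... run Dataset operations here; their stats get logged via DatasetLogger ...

# The log file lives under the per-run Ray session directory, at
# DatasetLogger.DEFAULT_DATASET_LOG_PATH ("logs/ray-data.log").
session_dir = ray._private.worker._global_node.get_session_dir_path()
log_path = os.path.join(session_dir, DatasetLogger.DEFAULT_DATASET_LOG_PATH)
with open(log_path) as f:
    print(f.read())  # Dataset logs persist here even if stdout output is lost
```

Since the file sits in the per-run session directory alongside the other Ray logs, it remains available even if the driver later crashes.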
1 parent d15342b · commit 912530e · Showing 4 changed files with 195 additions and 62 deletions.
New file: `dataset_logger.py` (imported as `ray.data._internal.dataset_logger`)
@@ -0,0 +1,75 @@
import logging
import os

import ray
from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL


class DatasetLogger:
    """Logger for Ray Datasets which writes logs to a separate log file
    at `DatasetLogger.DEFAULT_DATASET_LOG_PATH`. Can optionally turn off
    logging to stdout to reduce clutter (but always logs to the aforementioned
    Datasets-specific log file).

    After initialization, always use the `get_logger()` method to correctly
    set whether to log to stdout. Example usage:

    ```
    logger = DatasetLogger(__name__)
    logger.get_logger().info("This logs to file and stdout")
    logger.get_logger(log_to_stdout=False).info("This logs to file only")
    logger.get_logger().warning("Can call the usual Logger methods")
    ```
    """

    DEFAULT_DATASET_LOG_PATH = "logs/ray-data.log"

    def __init__(self, log_name: str):
        """Initialize DatasetLogger for a given `log_name`.

        Args:
            log_name: Name of logger (usually passed into `logging.getLogger(...)`)
        """
        # Logger used to write to the log file (in addition to the root logger,
        # which logs to stdout as normal). For logging calls made with the
        # parameter `log_to_stdout = False`, `_logger.propagate` will be set
        # to `False` in order to prevent the root logger from writing the log
        # to stdout.
        self._logger = logging.getLogger(f"{log_name}.logfile")
        # We need to set the log level again when explicitly
        # initializing a new logger (otherwise it can have an undesirable level).
        self._logger.setLevel(LOGGER_LEVEL.upper())

        # Add a log handler which writes to a separate Datasets log file
        # at `DatasetLogger.DEFAULT_DATASET_LOG_PATH`.
        global_node = ray._private.worker._global_node
        if global_node is not None:
            # With the current implementation, we can only get the session_dir
            # after ray.init() is called. A less hacky way could potentially fix this.
            session_dir = global_node.get_session_dir_path()
            self.datasets_log_path = os.path.join(
                session_dir,
                DatasetLogger.DEFAULT_DATASET_LOG_PATH,
            )
            # Add a FileHandler to write to the specific Ray Datasets log file,
            # using the standard default logger format used by the root logger.
            file_log_handler = logging.FileHandler(self.datasets_log_path)
            file_log_formatter = logging.Formatter(fmt=LOGGER_FORMAT)
            file_log_handler.setFormatter(file_log_formatter)
            self._logger.addHandler(file_log_handler)

    def get_logger(self, log_to_stdout: bool = True):
        """
        Returns the underlying Logger, with the `propagate` attribute set
        to the same value as `log_to_stdout`. For example, when
        `log_to_stdout = False`, we do not want the `DatasetLogger` to
        propagate up to the base Logger, which writes to stdout.

        This is a workaround needed because the DatasetLogger wrapper object
        does not have access to the log caller's scope in Python <3.8.
        In the future, with Python 3.8 support, we can use the `stacklevel` arg,
        which allows the logger to fetch the correct calling file/line and
        also removes the need for this getter method:
        `logger.info(msg="Hello world", stacklevel=2)`
        """
        self._logger.propagate = log_to_stdout
        return self._logger
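Both `get_logger()` and the `log_to_stdout` toggle lean entirely on the standard library's `Logger.propagate` flag rather than any Ray-specific machinery. A stdlib-only sketch of that mechanism (the names `demo` and `demo.log` are illustrative, not part of this change):

```
# Stdlib-only illustration of the propagate mechanism used by DatasetLogger.
import logging

logging.basicConfig(level=logging.INFO)  # gives the root logger a console handler

child = logging.getLogger("demo.logfile")
child.setLevel(logging.INFO)
child.addHandler(logging.FileHandler("demo.log"))

child.propagate = True
child.info("written to demo.log AND propagated to the root (console) handler")

child.propagate = False
child.info("written to demo.log only; the root handler never sees this record")
```

With `propagate = False`, records are still handled by the child logger's own `FileHandler` but are never passed up to the root logger's console handler, which is exactly the behavior `get_logger(log_to_stdout=False)` relies on.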
New file: `test_dataset_logger.py`
@@ -0,0 +1,50 @@
import pytest
from ray.tests.conftest import *  # noqa

import os
import re
import logging

from datetime import datetime

import ray
from ray.data._internal.dataset_logger import DatasetLogger


def test_dataset_logger(shutdown_only):
    ray.init()
    log_name, msg = "test_name", "test_message_1234"
    logger = DatasetLogger(log_name)
    logger.get_logger().info(msg)

    # Read from log file, and parse each component of the emitted log row
    session_dir = ray._private.worker._global_node.get_session_dir_path()
    log_file_path = os.path.join(session_dir, DatasetLogger.DEFAULT_DATASET_LOG_PATH)
    with open(log_file_path, "r") as f:
        raw_logged_msg = f.read()
    (
        logged_ds,
        logged_ts,
        logged_level,
        logged_filepath,
        sep,
        logged_msg,
    ) = raw_logged_msg.split()

    # Could not use freezegun to test exact timestamp value
    # (values off by some milliseconds), so instead we check
    # for correct timestamp format.
    try:
        datetime.strptime(f"{logged_ds} {logged_ts}", "%Y-%m-%d %H:%M:%S,%f")
    except ValueError:
        raise Exception(f"Invalid log timestamp: {logged_ds} {logged_ts}")

    assert logged_level == logging.getLevelName(logging.INFO)
    assert re.match(r"test_dataset_logger.py:\d+", logged_filepath)
    assert logged_msg == msg


if __name__ == "__main__":
    import sys

    sys.exit(pytest.main(["-v", __file__]))
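For reference, the six-way unpack in the test assumes each record looks like `date time LEVEL file.py:line -- message`, i.e. the shape produced by Ray's default `LOGGER_FORMAT`. The exact format string is not shown in this diff, so the stand-alone sketch below uses an assumed, look-alike format purely to illustrate the parsing; only the overall shape matters.

```
# Stand-alone sketch of the log-line parsing done in the test above.
# ASSUMPTION: the format string here only mirrors the *shape* of Ray's default
# LOGGER_FORMAT; the real value lives in ray._private.ray_constants.
import io
import logging

stream = io.StringIO()
handler = logging.StreamHandler(stream)
handler.setFormatter(
    logging.Formatter("%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s")
)
log = logging.getLogger("parsing_sketch")
log.setLevel(logging.INFO)
log.addHandler(handler)
log.info("test_message_1234")

# Example record: "2022-11-22 17:45:00,123  INFO <file>.py:17 -- test_message_1234"
logged_ds, logged_ts, logged_level, logged_filepath, sep, logged_msg = (
    stream.getvalue().split()
)
assert logged_level == "INFO"
assert logged_msg == "test_message_1234"
```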