From 1a4dde005d75cff4f3b140158db25157c6b1ef57 Mon Sep 17 00:00:00 2001 From: Vandana Kannan Date: Thu, 2 Jul 2020 17:23:37 -0700 Subject: [PATCH] Changes to horovod file parser (#46) --- smdebug/profiler/algorithm_metrics_reader.py | 2 +- smdebug/profiler/hvd_trace_file_rotation.py | 4 +- smdebug/profiler/tf_profiler_parser.py | 35 +---------- smdebug/profiler/trace_event_file_parser.py | 51 ++++++++-------- .../core/test_algorithm_metric_readers.py | 11 ++++ .../core/test_horovodprofiler_events.py | 24 ++++---- ...6be41.ant.amazon.com_horovod_timeline.json | 5 ++ ...6be41.ant.amazon.com_horovod_timeline.json | 60 +++++++++++++++++++ 8 files changed, 122 insertions(+), 70 deletions(-) create mode 100644 tests/profiler/resources/horovod_timeline_traces/framework/pevents/2020070206/1593673051472361_88359-8c859046be41.ant.amazon.com_horovod_timeline.json create mode 100644 tests/profiler/resources/horovod_timeline_traces/framework/pevents/2020070206/1593673051473228_88359-8c859046be41.ant.amazon.com_horovod_timeline.json diff --git a/smdebug/profiler/algorithm_metrics_reader.py b/smdebug/profiler/algorithm_metrics_reader.py index fba208d7d..56cf86b2c 100644 --- a/smdebug/profiler/algorithm_metrics_reader.py +++ b/smdebug/profiler/algorithm_metrics_reader.py @@ -112,7 +112,7 @@ def _get_event_files_in_the_range( 1. For Filename containing 'pythontimeline.json' -> SMEventsParser 2. For Filename containing 'model_timeline.json' -> SMEventsParser 3. For Filename containing 'tensorboard' (TBD) -> TensorboardProfilerEvents - 4. For Filename containing 'Horovod' (TBD) -> 'HorovodProfilerEvents + 4. For Filename containing 'horovod_timeline.json' -> 'HorovodProfilerEvents """ def _get_event_parser(self, filename): diff --git a/smdebug/profiler/hvd_trace_file_rotation.py b/smdebug/profiler/hvd_trace_file_rotation.py index 6bd994746..8c3e214cd 100644 --- a/smdebug/profiler/hvd_trace_file_rotation.py +++ b/smdebug/profiler/hvd_trace_file_rotation.py @@ -71,7 +71,7 @@ def _parse_trace_event(self, event): ) # make a note of duration if present - duration = event.get("dur", 0) + duration_in_secs = event.get("dur", 0) / CONVERT_TO_MICROSECS # make a note of args if present args = event.get("args", {}) @@ -83,7 +83,7 @@ def _parse_trace_event(self, event): # get the operation name from the event string op_name = event.get("name", "") - return op_name, timestamp_in_secs, duration, args + return op_name, timestamp_in_secs, duration_in_secs, args def _convert_monotonic_to_epoch_time(self, timestamp_in_us): """ diff --git a/smdebug/profiler/tf_profiler_parser.py b/smdebug/profiler/tf_profiler_parser.py index a96438661..b68f5cb04 100644 --- a/smdebug/profiler/tf_profiler_parser.py +++ b/smdebug/profiler/tf_profiler_parser.py @@ -6,38 +6,12 @@ from smdebug.core.logger import get_logger from smdebug.profiler.profiler_constants import TF_METRICS_PREFIX from smdebug.profiler.trace_event_file_parser import TraceEventParser -from smdebug.profiler.utils import TimeUnits, convert_utc_datetime_to_nanoseconds class SMProfilerEvents(TraceEventParser): def __init__(self): super().__init__() - def _populate_start_time(self, event): - event_args = event["args"] if "args" in event else None - if self._start_time_known is False: - if event_args is None: - return - if "start_time_since_epoch_in_micros" in event_args: - self._start_timestamp = event_args["start_time_since_epoch_in_micros"] - self._start_time_known = True - self.logger.info(f"Start time for events in uSeconds = {self._start_timestamp}") - - """ - Return the events that have started and completed within the given start and end time boundaries. - The start and end time can be specified datetime objects. - The events that are in progress during these boundaries are not included. - """ - - def get_events_within_range(self, start_time: datetime, end_time: datetime): - if start_time.__class__ is datetime: - start_time_nanoseconds = convert_utc_datetime_to_nanoseconds(start_time) - if end_time.__class__ is datetime: - end_time_nanoseconds = convert_utc_datetime_to_nanoseconds(end_time) - return self.get_events_within_time_range( - start_time_nanoseconds, end_time_nanoseconds, unit=TimeUnits.NANOSECONDS - ) - class TensorboardProfilerEvents(TraceEventParser): def __init__(self): @@ -62,6 +36,9 @@ def read_events_from_file(self, tracefile): for event in trace_events_json: self._read_event(event) + def get_events_within_range(self, start_time: datetime, end_time: datetime): + return None + def _get_event_phase(self, event): if not event.event_name or not event.event_name.startswith(TF_METRICS_PREFIX): return @@ -189,9 +166,3 @@ def parse_tf_native_profiler_trace_json(log_dir): class HorovodProfilerEvents(TraceEventParser): def __init__(self): super().__init__() - self._base_timestamp_initialized = False - - def _populate_start_time(self, event): - # TODO, populate the self._start_timestamp when we make changes to horovod to record the unix epoch based - # timestamp at the start of tracing. - return diff --git a/smdebug/profiler/trace_event_file_parser.py b/smdebug/profiler/trace_event_file_parser.py index 541bea0a0..d330e4903 100644 --- a/smdebug/profiler/trace_event_file_parser.py +++ b/smdebug/profiler/trace_event_file_parser.py @@ -1,11 +1,12 @@ # First Party # Standard Library -import datetime import json +from datetime import datetime from smdebug.core.logger import get_logger from smdebug.profiler.utils import ( TimeUnits, + convert_utc_datetime_to_nanoseconds, convert_utc_timestamp_to_nanoseconds, get_node_id_from_tracefilename, ) @@ -53,15 +54,6 @@ def __init__(self): self._pid_stacks = dict() self._start_timestamp = 0 self._start_time_known = False - """ - In horovod trace, the 'ts' timestamps for events are relative to the first 'ts' timestamp included in the - first event. We will consider this timestamp as base_timestamp and subtract it from the 'ts' values read - from the subsequent events. It will give us 0-based timestamps for rest of the events. Please note that - this base_timestamp is not related to unix epoch based timestamp. We would still have to add absolute start time - (self._start_timestamp) to obtain the absolute start time of any event. - """ - self._base_timestamp = 0 - self._base_timestamp_initialized = True # The timestamp in trace events are in micro seconds, we multiply by 1000 to convert to ns self._timescale_multiplier_for_ns = 1000 self.logger = get_logger("smdebug-profiler") @@ -88,27 +80,26 @@ def _populate_thread_info_for_metaevent(self, event): self._processes[pid].add_thread(t_id, name) def _populate_start_time(self, event): - pass + event_args = event["args"] if "args" in event else None + if self._start_time_known is False: + if event_args is None: + return + if "start_time_since_epoch_in_micros" in event_args: + self._start_timestamp = event_args["start_time_since_epoch_in_micros"] + self._start_time_known = True + self.logger.info(f"Start time for events in uSeconds = {self._start_timestamp}") def _read_event(self, event, node_id=""): if "ph" not in event: return phase_type = event["ph"] - if "ts" in event and not self._base_timestamp_initialized: - self._base_timestamp = event["ts"] - self.logger.info( - f"The base timestamp in horovod trace file for future events is {self._base_timestamp}" - ) - self._base_timestamp_initialized = True if phase_type == "M": self._populate_process_info_for_metaevent(event) self._populate_thread_info_for_metaevent(event) self._populate_start_time(event) if phase_type == "X": # In nano seconds - start_time = ( - event["ts"] - self._base_timestamp + self._start_timestamp - ) * self._timescale_multiplier_for_ns + start_time = (event["ts"] + self._start_timestamp) * self._timescale_multiplier_for_ns # In nano seconds dur = event["dur"] * self._timescale_multiplier_for_ns name = event["name"] @@ -130,11 +121,9 @@ def _read_event(self, event, node_id=""): b_event = self._pid_stacks[pid][-1] self._pid_stacks[pid].pop() start_time = ( - b_event["ts"] - self._base_timestamp + self._start_timestamp - ) * self._timescale_multiplier_for_ns - end_time = ( - event["ts"] - self._base_timestamp + self._start_timestamp + b_event["ts"] + self._start_timestamp ) * self._timescale_multiplier_for_ns + end_time = (event["ts"] + self._start_timestamp) * self._timescale_multiplier_for_ns duration = end_time - start_time if duration < 0: self.logger.error( @@ -192,7 +181,19 @@ def get_events_within_time_range(self, start_time, end_time, unit=TimeUnits.MICR """ def get_events_within_range(self, start_time: datetime, end_time: datetime): - return None + """ + Return the events that have started and completed within the given start and end time boundaries. + The start and end time can be specified datetime objects. + The events that are in progress during these boundaries are not included. + """ + start_time_nanoseconds = end_time_nanoseconds = 0 + if start_time.__class__ is datetime: + start_time_nanoseconds = convert_utc_datetime_to_nanoseconds(start_time) + if end_time.__class__ is datetime: + end_time_nanoseconds = convert_utc_datetime_to_nanoseconds(end_time) + return self.get_events_within_time_range( + start_time_nanoseconds, end_time_nanoseconds, unit=TimeUnits.NANOSECONDS + ) def get_process_info(self, process_id): return self._processes[process_id] diff --git a/tests/profiler/core/test_algorithm_metric_readers.py b/tests/profiler/core/test_algorithm_metric_readers.py index 87ac2d68d..f958cbf0e 100644 --- a/tests/profiler/core/test_algorithm_metric_readers.py +++ b/tests/profiler/core/test_algorithm_metric_readers.py @@ -37,3 +37,14 @@ def test_LocalMetricsReader_Model_timeline( print(f"Number of events {len(events)}") assert len(events) == 54 assert events[0].node_id == "0001" + + +@pytest.mark.parametrize("use_in_memory_cache", [True, False]) +def test_LocalMetricsReader_Horovod_timeline( + use_in_memory_cache, tracefolder="./tests/profiler/resources/horovod_timeline_traces" +): + lt = LocalAlgorithmMetricsReader(tracefolder, use_in_memory_cache=use_in_memory_cache) + events = lt.get_events(1593673051472800, 1593673051473100) + + print(f"Number of events {len(events)}") + assert len(events) == 15 diff --git a/tests/profiler/core/test_horovodprofiler_events.py b/tests/profiler/core/test_horovodprofiler_events.py index 77e0aff0d..744586078 100644 --- a/tests/profiler/core/test_horovodprofiler_events.py +++ b/tests/profiler/core/test_horovodprofiler_events.py @@ -13,7 +13,7 @@ def test_horovodprofiler_events( - trace_file="./tests/profiler/resources/horovod_timeline_small.json" + trace_file="./tests/profiler/resources/horovod_timeline_traces/framework/pevents/2020070206/1593673051473228_88359-8c859046be41.ant.amazon.com_horovod_timeline.json" ): trace_json_file = trace_file print(f"Reading the trace file {trace_json_file}") @@ -24,21 +24,25 @@ def test_horovodprofiler_events( num_trace_events = len(all_trace_events) print(f"Number of events read = {num_trace_events}") - assert num_trace_events == 21 + assert num_trace_events == 19 - completed_event_list = t_events.get_events_within_time_range(0, 8990000) # microseconds - print(f"Number of events occurred between 0 and 8990000000 are {len(completed_event_list)}") - assert len(completed_event_list) == 20 + completed_event_list = t_events.get_events_within_time_range( + 1593673051473000, 1593673051473228 + ) # microseconds + print( + f"Number of events occurred between 1593673051473000 and 1593673051473228 are {len(completed_event_list)}" + ) + assert len(completed_event_list) == 12 start_time_sorted = t_events.get_events_start_time_sorted() start_time_for_first_event = start_time_sorted[0].start_time print(f"The first event started at {start_time_for_first_event}") - assert start_time_for_first_event == 8440608000 + assert start_time_for_first_event == 1592860696000713000 end_time_sorted = t_events.get_events_end_time_sorted() end_time_for_last_event = end_time_sorted[-1].end_time - print(f"The first event started at {end_time_for_last_event}") - assert end_time_for_last_event == 9296331000 + print(f"The last event ended at {end_time_for_last_event}") + assert end_time_for_last_event == 1593673051473228000 def test_steady_clock_to_epoch_time_conversion( @@ -72,8 +76,8 @@ def test_steady_clock_to_epoch_time_conversion( # assuming that the test is being run the same day as system boot or # at the maximum one day after system boot. - assert boot_time.tm_year == gmtime.tm_year - assert boot_time.tm_mon == gmtime.tm_mon + assert pytest.approx(boot_time.tm_year, 1) == gmtime.tm_year + assert pytest.approx(boot_time.tm_mon, 1) == gmtime.tm_mon assert pytest.approx(boot_time.tm_mday, 1) == gmtime.tm_mday hvd_file_reader.close() diff --git a/tests/profiler/resources/horovod_timeline_traces/framework/pevents/2020070206/1593673051472361_88359-8c859046be41.ant.amazon.com_horovod_timeline.json b/tests/profiler/resources/horovod_timeline_traces/framework/pevents/2020070206/1593673051472361_88359-8c859046be41.ant.amazon.com_horovod_timeline.json new file mode 100644 index 000000000..7e3fd103c --- /dev/null +++ b/tests/profiler/resources/horovod_timeline_traces/framework/pevents/2020070206/1593673051472361_88359-8c859046be41.ant.amazon.com_horovod_timeline.json @@ -0,0 +1,5 @@ +[ +{"name": "process_name", "ph": "M", "pid": 0, "args": {"start_time_since_epoch_in_micros": 1592860687560103}}, +{"name": "process_sort_index", "ph": "M", "pid": 0, "args": {"sort_index": 0}}, +{"name": "CYCLE_START", "pid": 1, "ph": "i", "ts": 0, "s": "g"} +] diff --git a/tests/profiler/resources/horovod_timeline_traces/framework/pevents/2020070206/1593673051473228_88359-8c859046be41.ant.amazon.com_horovod_timeline.json b/tests/profiler/resources/horovod_timeline_traces/framework/pevents/2020070206/1593673051473228_88359-8c859046be41.ant.amazon.com_horovod_timeline.json new file mode 100644 index 000000000..615158752 --- /dev/null +++ b/tests/profiler/resources/horovod_timeline_traces/framework/pevents/2020070206/1593673051473228_88359-8c859046be41.ant.amazon.com_horovod_timeline.json @@ -0,0 +1,60 @@ +[ +{"name": "process_name", "ph": "M", "pid": 0, "args": {"start_time_since_epoch_in_micros": 1592860687560103}}, +{"name": "process_sort_index", "ph": "M", "pid": 0, "args": {"sort_index": 0}}, +{"name": "process_name", "ph": "M", "pid": 1, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_dense_1_BiasAdd_BiasAddGrad_0"}}, +{"name": "process_sort_index", "ph": "M", "pid": 1, "args": {"sort_index": 1}}, +{"name": "NEGOTIATE_ALLREDUCE", "pid": 1, "ph": "B", "ts": 8440608}, +{"name": "4", "pid": 1, "ph": "X", "ts": 8440610, "dur": 812355471858}, +{"name": "process_name", "ph": "M", "pid": 2, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_dense_1_MatMul_1_0"}}, +{"name": "process_sort_index", "ph": "M", "pid": 2, "args": {"sort_index": 2}}, +{"name": "NEGOTIATE_ALLREDUCE", "pid": 2, "ph": "B", "ts": 8440614}, +{"name": "4", "pid": 2, "ph": "X", "ts": 8440616, "dur": 812355471927}, +{"name": "process_name", "ph": "M", "pid": 3, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_dense_BiasAdd_BiasAddGrad_0"}}, +{"name": "process_sort_index", "ph": "M", "pid": 3, "args": {"sort_index": 3}}, +{"name": "4", "pid": 3, "ph": "X", "ts": 8440621, "dur": 812355471970}, +{"name": "process_name", "ph": "M", "pid": 4, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_dense_MatMul_1_0"}}, +{"name": "process_sort_index", "ph": "M", "pid": 4, "args": {"sort_index": 4}}, +{"name": "4", "pid": 4, "ph": "X", "ts": 8440631, "dur": 812355472022}, +{"name": "process_name", "ph": "M", "pid": 5, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_conv2d_22_Conv2DBackpropFilter_0"}}, +{"name": "process_sort_index", "ph": "M", "pid": 5, "args": {"sort_index": 5}}, +{"name": "4", "pid": 5, "ph": "X", "ts": 8461066, "dur": 812355451642}, +{"name": "process_name", "ph": "M", "pid": 6, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_conv2d_22_BiasAdd_BiasAddGrad_0"}}, +{"name": "process_sort_index", "ph": "M", "pid": 6, "args": {"sort_index": 6}}, +{"name": "4", "pid": 6, "ph": "X", "ts": 8461072, "dur": 812355451685}, +{"name": "process_name", "ph": "M", "pid": 7, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_batch_normalization_19_FusedBatchNormGradV3_1"}}, +{"name": "process_sort_index", "ph": "M", "pid": 7, "args": {"sort_index": 7}}, +{"name": "4", "pid": 7, "ph": "X", "ts": 8486503, "dur": 812355426374}, +{"name": "process_name", "ph": "M", "pid": 8, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_batch_normalization_19_FusedBatchNormGradV3_2"}}, +{"name": "process_sort_index", "ph": "M", "pid": 8, "args": {"sort_index": 8}}, +{"name": "4", "pid": 8, "ph": "X", "ts": 8486509, "dur": 812355426399}, +{"name": "process_name", "ph": "M", "pid": 9, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_conv2d_21_Conv2DBackpropFilter_0"}}, +{"name": "process_sort_index", "ph": "M", "pid": 9, "args": {"sort_index": 9}}, +{"name": "4", "pid": 9, "ph": "X", "ts": 8486513, "dur": 812355426415}, +{"name": "process_name", "ph": "M", "pid": 10, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_conv2d_21_BiasAdd_BiasAddGrad_0"}}, +{"name": "process_sort_index", "ph": "M", "pid": 10, "args": {"sort_index": 10}}, +{"name": "4", "pid": 10, "ph": "X", "ts": 8486518, "dur": 812355426428}, +{"name": "process_name", "ph": "M", "pid": 11, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_batch_normalization_18_FusedBatchNormGradV3_1"}}, +{"name": "process_sort_index", "ph": "M", "pid": 11, "args": {"sort_index": 11}}, +{"name": "4", "pid": 11, "ph": "X", "ts": 8486527, "dur": 812355426437}, +{"name": "CYCLE_START", "pid": 11, "ph": "i", "ts": 8491490, "s": "g"}, +{"name": "process_name", "ph": "M", "pid": 12, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_batch_normalization_18_FusedBatchNormGradV3_2"}}, +{"name": "process_sort_index", "ph": "M", "pid": 12, "args": {"sort_index": 12}}, +{"name": "NEGOTIATE_ALLREDUCE", "pid": 12, "ph": "B", "ts": 8508311}, +{"name": "4", "pid": 12, "ph": "X", "ts": 8508330, "dur": 812355404674}, +{"name": "process_name", "ph": "M", "pid": 13, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_conv2d_20_Conv2DBackpropFilter_0"}}, +{"name": "process_sort_index", "ph": "M", "pid": 13, "args": {"sort_index": 13}}, +{"name": "NEGOTIATE_ALLREDUCE", "pid": 13, "ph": "B", "ts": 8508350}, +{"name": "4", "pid": 13, "ph": "X", "ts": 8508351, "dur": 812355404681}, +{"name": "process_name", "ph": "M", "pid": 14, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_conv2d_20_BiasAdd_BiasAddGrad_0"}}, +{"name": "process_sort_index", "ph": "M", "pid": 14, "args": {"sort_index": 14}}, +{"name": "", "pid": 14, "ph": "E", "ts": 8781513}, +{"name": "5", "pid": 14, "ph": "X", "ts": 8781514, "dur": 812355131544}, +{"name": "", "pid": 14, "ph": "E", "ts": 8781515}, +{"name": "4", "pid": 14, "ph": "X", "ts": 8508359, "dur": 812355404718}, +{"name": "process_name", "ph": "M", "pid": 15, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_batch_normalization_17_FusedBatchNormGradV3_1"}}, +{"name": "process_sort_index", "ph": "M", "pid": 15, "args": {"sort_index": 15}}, +{"name": "5", "pid": 15, "ph": "X", "ts": 8781511, "dur": 812355131585}, +{"name": "7", "pid": 15, "ph": "X", "ts": 8781518, "dur": 812355131587}, +{"name": "7", "pid": 15, "ph": "X", "ts": 8781520, "dur": 812355131595}, +{"name": "7", "pid": 15, "ph": "X", "ts": 9296331, "dur": 812354616794} +]