Skip to content

Commit

Permalink
Changes to horovod file parser (#46)
Browse files · Browse the repository at this point in the history
  • Loading branch information
vandanavk authored Jul 3, 2020
1 parent f073952 commit 1a4dde0
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 70 deletions.
2 changes: 1 addition & 1 deletion smdebug/profiler/algorithm_metrics_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def _get_event_files_in_the_range(
1. For Filename containing 'pythontimeline.json' -> SMEventsParser
2. For Filename containing 'model_timeline.json' -> SMEventsParser
3. For Filename containing 'tensorboard' (TBD) -> TensorboardProfilerEvents
4. For Filename containing 'Horovod' (TBD) -> 'HorovodProfilerEvents
4. For Filename containing 'horovod_timeline.json' -> 'HorovodProfilerEvents'
"""

def _get_event_parser(self, filename):
Expand Down
4 changes: 2 additions & 2 deletions smdebug/profiler/hvd_trace_file_rotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def _parse_trace_event(self, event):
)

# make a note of duration if present
duration = event.get("dur", 0)
duration_in_secs = event.get("dur", 0) / CONVERT_TO_MICROSECS

# make a note of args if present
args = event.get("args", {})
Expand All @@ -83,7 +83,7 @@ def _parse_trace_event(self, event):
# get the operation name from the event string
op_name = event.get("name", "")

return op_name, timestamp_in_secs, duration, args
return op_name, timestamp_in_secs, duration_in_secs, args

def _convert_monotonic_to_epoch_time(self, timestamp_in_us):
"""
Expand Down
35 changes: 3 additions & 32 deletions smdebug/profiler/tf_profiler_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,38 +6,12 @@
from smdebug.core.logger import get_logger
from smdebug.profiler.profiler_constants import TF_METRICS_PREFIX
from smdebug.profiler.trace_event_file_parser import TraceEventParser
from smdebug.profiler.utils import TimeUnits, convert_utc_datetime_to_nanoseconds


class SMProfilerEvents(TraceEventParser):
def __init__(self):
super().__init__()

def _populate_start_time(self, event):
event_args = event["args"] if "args" in event else None
if self._start_time_known is False:
if event_args is None:
return
if "start_time_since_epoch_in_micros" in event_args:
self._start_timestamp = event_args["start_time_since_epoch_in_micros"]
self._start_time_known = True
self.logger.info(f"Start time for events in uSeconds = {self._start_timestamp}")

"""
Return the events that have started and completed within the given start and end time boundaries.
The start and end time can be specified datetime objects.
The events that are in progress during these boundaries are not included.
"""

def get_events_within_range(self, start_time: datetime, end_time: datetime):
if start_time.__class__ is datetime:
start_time_nanoseconds = convert_utc_datetime_to_nanoseconds(start_time)
if end_time.__class__ is datetime:
end_time_nanoseconds = convert_utc_datetime_to_nanoseconds(end_time)
return self.get_events_within_time_range(
start_time_nanoseconds, end_time_nanoseconds, unit=TimeUnits.NANOSECONDS
)


class TensorboardProfilerEvents(TraceEventParser):
def __init__(self):
Expand All @@ -62,6 +36,9 @@ def read_events_from_file(self, tracefile):
for event in trace_events_json:
self._read_event(event)

def get_events_within_range(self, start_time: datetime, end_time: datetime):
return None

def _get_event_phase(self, event):
if not event.event_name or not event.event_name.startswith(TF_METRICS_PREFIX):
return
Expand Down Expand Up @@ -189,9 +166,3 @@ def parse_tf_native_profiler_trace_json(log_dir):
class HorovodProfilerEvents(TraceEventParser):
def __init__(self):
super().__init__()
self._base_timestamp_initialized = False

def _populate_start_time(self, event):
# TODO, populate the self._start_timestamp when we make changes to horovod to record the unix epoch based
# timestamp at the start of tracing.
return
51 changes: 26 additions & 25 deletions smdebug/profiler/trace_event_file_parser.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# First Party
# Standard Library
import datetime
import json
from datetime import datetime

from smdebug.core.logger import get_logger
from smdebug.profiler.utils import (
TimeUnits,
convert_utc_datetime_to_nanoseconds,
convert_utc_timestamp_to_nanoseconds,
get_node_id_from_tracefilename,
)
Expand Down Expand Up @@ -53,15 +54,6 @@ def __init__(self):
self._pid_stacks = dict()
self._start_timestamp = 0
self._start_time_known = False
"""
In horovod trace, the 'ts' timestamps for events are relative to the first 'ts' timestamp included in the
first event. We will consider this timestamp as base_timestamp and subtract it from the 'ts' values read
from the subsequent events. It will give us 0-based timestamps for rest of the events. Please note that
this base_timestamp is not related to unix epoch based timestamp. We would still have to add absolute start time
(self._start_timestamp) to obtain the absolute start time of any event.
"""
self._base_timestamp = 0
self._base_timestamp_initialized = True
# The timestamp in trace events are in micro seconds, we multiply by 1000 to convert to ns
self._timescale_multiplier_for_ns = 1000
self.logger = get_logger("smdebug-profiler")
Expand All @@ -88,27 +80,26 @@ def _populate_thread_info_for_metaevent(self, event):
self._processes[pid].add_thread(t_id, name)

def _populate_start_time(self, event):
pass
event_args = event["args"] if "args" in event else None
if self._start_time_known is False:
if event_args is None:
return
if "start_time_since_epoch_in_micros" in event_args:
self._start_timestamp = event_args["start_time_since_epoch_in_micros"]
self._start_time_known = True
self.logger.info(f"Start time for events in uSeconds = {self._start_timestamp}")

def _read_event(self, event, node_id=""):
if "ph" not in event:
return
phase_type = event["ph"]
if "ts" in event and not self._base_timestamp_initialized:
self._base_timestamp = event["ts"]
self.logger.info(
f"The base timestamp in horovod trace file for future events is {self._base_timestamp}"
)
self._base_timestamp_initialized = True
if phase_type == "M":
self._populate_process_info_for_metaevent(event)
self._populate_thread_info_for_metaevent(event)
self._populate_start_time(event)
if phase_type == "X":
# In nano seconds
start_time = (
event["ts"] - self._base_timestamp + self._start_timestamp
) * self._timescale_multiplier_for_ns
start_time = (event["ts"] + self._start_timestamp) * self._timescale_multiplier_for_ns
# In nano seconds
dur = event["dur"] * self._timescale_multiplier_for_ns
name = event["name"]
Expand All @@ -130,11 +121,9 @@ def _read_event(self, event, node_id=""):
b_event = self._pid_stacks[pid][-1]
self._pid_stacks[pid].pop()
start_time = (
b_event["ts"] - self._base_timestamp + self._start_timestamp
) * self._timescale_multiplier_for_ns
end_time = (
event["ts"] - self._base_timestamp + self._start_timestamp
b_event["ts"] + self._start_timestamp
) * self._timescale_multiplier_for_ns
end_time = (event["ts"] + self._start_timestamp) * self._timescale_multiplier_for_ns
duration = end_time - start_time
if duration < 0:
self.logger.error(
Expand Down Expand Up @@ -192,7 +181,19 @@ def get_events_within_time_range(self, start_time, end_time, unit=TimeUnits.MICR
"""

def get_events_within_range(self, start_time: datetime, end_time: datetime):
return None
"""
Return the events that have started and completed within the given start and end time boundaries.
The start and end time can be specified datetime objects.
The events that are in progress during these boundaries are not included.
"""
start_time_nanoseconds = end_time_nanoseconds = 0
if start_time.__class__ is datetime:
start_time_nanoseconds = convert_utc_datetime_to_nanoseconds(start_time)
if end_time.__class__ is datetime:
end_time_nanoseconds = convert_utc_datetime_to_nanoseconds(end_time)
return self.get_events_within_time_range(
start_time_nanoseconds, end_time_nanoseconds, unit=TimeUnits.NANOSECONDS
)

def get_process_info(self, process_id):
return self._processes[process_id]
Expand Down
11 changes: 11 additions & 0 deletions tests/profiler/core/test_algorithm_metric_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,14 @@ def test_LocalMetricsReader_Model_timeline(
print(f"Number of events {len(events)}")
assert len(events) == 54
assert events[0].node_id == "0001"


@pytest.mark.parametrize("use_in_memory_cache", [True, False])
def test_LocalMetricsReader_Horovod_timeline(
use_in_memory_cache, tracefolder="./tests/profiler/resources/horovod_timeline_traces"
):
lt = LocalAlgorithmMetricsReader(tracefolder, use_in_memory_cache=use_in_memory_cache)
events = lt.get_events(1593673051472800, 1593673051473100)

print(f"Number of events {len(events)}")
assert len(events) == 15
24 changes: 14 additions & 10 deletions tests/profiler/core/test_horovodprofiler_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@


def test_horovodprofiler_events(
trace_file="./tests/profiler/resources/horovod_timeline_small.json"
trace_file="./tests/profiler/resources/horovod_timeline_traces/framework/pevents/2020070206/1593673051473228_88359-8c859046be41.ant.amazon.com_horovod_timeline.json"
):
trace_json_file = trace_file
print(f"Reading the trace file {trace_json_file}")
Expand All @@ -24,21 +24,25 @@ def test_horovodprofiler_events(
num_trace_events = len(all_trace_events)

print(f"Number of events read = {num_trace_events}")
assert num_trace_events == 21
assert num_trace_events == 19

completed_event_list = t_events.get_events_within_time_range(0, 8990000) # microseconds
print(f"Number of events occurred between 0 and 8990000000 are {len(completed_event_list)}")
assert len(completed_event_list) == 20
completed_event_list = t_events.get_events_within_time_range(
1593673051473000, 1593673051473228
) # microseconds
print(
f"Number of events occurred between 1593673051473000 and 1593673051473228 are {len(completed_event_list)}"
)
assert len(completed_event_list) == 12

start_time_sorted = t_events.get_events_start_time_sorted()
start_time_for_first_event = start_time_sorted[0].start_time
print(f"The first event started at {start_time_for_first_event}")
assert start_time_for_first_event == 8440608000
assert start_time_for_first_event == 1592860696000713000

end_time_sorted = t_events.get_events_end_time_sorted()
end_time_for_last_event = end_time_sorted[-1].end_time
print(f"The first event started at {end_time_for_last_event}")
assert end_time_for_last_event == 9296331000
print(f"The last event ended at {end_time_for_last_event}")
assert end_time_for_last_event == 1593673051473228000


def test_steady_clock_to_epoch_time_conversion(
Expand Down Expand Up @@ -72,8 +76,8 @@ def test_steady_clock_to_epoch_time_conversion(

# assuming that the test is being run the same day as system boot or
# at the maximum one day after system boot.
assert boot_time.tm_year == gmtime.tm_year
assert boot_time.tm_mon == gmtime.tm_mon
assert pytest.approx(boot_time.tm_year, 1) == gmtime.tm_year
assert pytest.approx(boot_time.tm_mon, 1) == gmtime.tm_mon
assert pytest.approx(boot_time.tm_mday, 1) == gmtime.tm_mday

hvd_file_reader.close()
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[
{"name": "process_name", "ph": "M", "pid": 0, "args": {"start_time_since_epoch_in_micros": 1592860687560103}},
{"name": "process_sort_index", "ph": "M", "pid": 0, "args": {"sort_index": 0}},
{"name": "CYCLE_START", "pid": 1, "ph": "i", "ts": 0, "s": "g"}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
[
{"name": "process_name", "ph": "M", "pid": 0, "args": {"start_time_since_epoch_in_micros": 1592860687560103}},
{"name": "process_sort_index", "ph": "M", "pid": 0, "args": {"sort_index": 0}},
{"name": "process_name", "ph": "M", "pid": 1, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_dense_1_BiasAdd_BiasAddGrad_0"}},
{"name": "process_sort_index", "ph": "M", "pid": 1, "args": {"sort_index": 1}},
{"name": "NEGOTIATE_ALLREDUCE", "pid": 1, "ph": "B", "ts": 8440608},
{"name": "4", "pid": 1, "ph": "X", "ts": 8440610, "dur": 812355471858},
{"name": "process_name", "ph": "M", "pid": 2, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_dense_1_MatMul_1_0"}},
{"name": "process_sort_index", "ph": "M", "pid": 2, "args": {"sort_index": 2}},
{"name": "NEGOTIATE_ALLREDUCE", "pid": 2, "ph": "B", "ts": 8440614},
{"name": "4", "pid": 2, "ph": "X", "ts": 8440616, "dur": 812355471927},
{"name": "process_name", "ph": "M", "pid": 3, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_dense_BiasAdd_BiasAddGrad_0"}},
{"name": "process_sort_index", "ph": "M", "pid": 3, "args": {"sort_index": 3}},
{"name": "4", "pid": 3, "ph": "X", "ts": 8440621, "dur": 812355471970},
{"name": "process_name", "ph": "M", "pid": 4, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_dense_MatMul_1_0"}},
{"name": "process_sort_index", "ph": "M", "pid": 4, "args": {"sort_index": 4}},
{"name": "4", "pid": 4, "ph": "X", "ts": 8440631, "dur": 812355472022},
{"name": "process_name", "ph": "M", "pid": 5, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_conv2d_22_Conv2DBackpropFilter_0"}},
{"name": "process_sort_index", "ph": "M", "pid": 5, "args": {"sort_index": 5}},
{"name": "4", "pid": 5, "ph": "X", "ts": 8461066, "dur": 812355451642},
{"name": "process_name", "ph": "M", "pid": 6, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_conv2d_22_BiasAdd_BiasAddGrad_0"}},
{"name": "process_sort_index", "ph": "M", "pid": 6, "args": {"sort_index": 6}},
{"name": "4", "pid": 6, "ph": "X", "ts": 8461072, "dur": 812355451685},
{"name": "process_name", "ph": "M", "pid": 7, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_batch_normalization_19_FusedBatchNormGradV3_1"}},
{"name": "process_sort_index", "ph": "M", "pid": 7, "args": {"sort_index": 7}},
{"name": "4", "pid": 7, "ph": "X", "ts": 8486503, "dur": 812355426374},
{"name": "process_name", "ph": "M", "pid": 8, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_batch_normalization_19_FusedBatchNormGradV3_2"}},
{"name": "process_sort_index", "ph": "M", "pid": 8, "args": {"sort_index": 8}},
{"name": "4", "pid": 8, "ph": "X", "ts": 8486509, "dur": 812355426399},
{"name": "process_name", "ph": "M", "pid": 9, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_conv2d_21_Conv2DBackpropFilter_0"}},
{"name": "process_sort_index", "ph": "M", "pid": 9, "args": {"sort_index": 9}},
{"name": "4", "pid": 9, "ph": "X", "ts": 8486513, "dur": 812355426415},
{"name": "process_name", "ph": "M", "pid": 10, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_conv2d_21_BiasAdd_BiasAddGrad_0"}},
{"name": "process_sort_index", "ph": "M", "pid": 10, "args": {"sort_index": 10}},
{"name": "4", "pid": 10, "ph": "X", "ts": 8486518, "dur": 812355426428},
{"name": "process_name", "ph": "M", "pid": 11, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_batch_normalization_18_FusedBatchNormGradV3_1"}},
{"name": "process_sort_index", "ph": "M", "pid": 11, "args": {"sort_index": 11}},
{"name": "4", "pid": 11, "ph": "X", "ts": 8486527, "dur": 812355426437},
{"name": "CYCLE_START", "pid": 11, "ph": "i", "ts": 8491490, "s": "g"},
{"name": "process_name", "ph": "M", "pid": 12, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_batch_normalization_18_FusedBatchNormGradV3_2"}},
{"name": "process_sort_index", "ph": "M", "pid": 12, "args": {"sort_index": 12}},
{"name": "NEGOTIATE_ALLREDUCE", "pid": 12, "ph": "B", "ts": 8508311},
{"name": "4", "pid": 12, "ph": "X", "ts": 8508330, "dur": 812355404674},
{"name": "process_name", "ph": "M", "pid": 13, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_conv2d_20_Conv2DBackpropFilter_0"}},
{"name": "process_sort_index", "ph": "M", "pid": 13, "args": {"sort_index": 13}},
{"name": "NEGOTIATE_ALLREDUCE", "pid": 13, "ph": "B", "ts": 8508350},
{"name": "4", "pid": 13, "ph": "X", "ts": 8508351, "dur": 812355404681},
{"name": "process_name", "ph": "M", "pid": 14, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_conv2d_20_BiasAdd_BiasAddGrad_0"}},
{"name": "process_sort_index", "ph": "M", "pid": 14, "args": {"sort_index": 14}},
{"name": "", "pid": 14, "ph": "E", "ts": 8781513},
{"name": "5", "pid": 14, "ph": "X", "ts": 8781514, "dur": 812355131544},
{"name": "", "pid": 14, "ph": "E", "ts": 8781515},
{"name": "4", "pid": 14, "ph": "X", "ts": 8508359, "dur": 812355404718},
{"name": "process_name", "ph": "M", "pid": 15, "args": {"name": "Adam_Allreduce/HorovodAllreduce_gradient_tape_model_batch_normalization_17_FusedBatchNormGradV3_1"}},
{"name": "process_sort_index", "ph": "M", "pid": 15, "args": {"sort_index": 15}},
{"name": "5", "pid": 15, "ph": "X", "ts": 8781511, "dur": 812355131585},
{"name": "7", "pid": 15, "ph": "X", "ts": 8781518, "dur": 812355131587},
{"name": "7", "pid": 15, "ph": "X", "ts": 8781520, "dur": 812355131595},
{"name": "7", "pid": 15, "ph": "X", "ts": 9296331, "dur": 812354616794}
]

0 comments on commit 1a4dde0

Please sign in to comment.