dump memory snapshot to analyze OOMs #395

Merged (9 commits, Jun 19, 2024)
Changes from 4 commits
68 changes: 68 additions & 0 deletions torchtitan/profiling.py
@@ -6,6 +6,7 @@

import contextlib
import os
import pickle
import time

import torch
@@ -15,6 +16,14 @@
# the number of warmup steps before the active step in each profiling cycle
WARMUP = 3

# how many memory allocation/free ops to record in memory snapshots
MEMORY_SNAPSHOT_MAX_ENTRIES = 100000
Contributor Author (@weifengpy), Jun 13, 2024:

MEMORY_SNAPSHOT_MAX_ENTRIES controls how large the .pickle file can get. Right now it's 36 MB.
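
For anyone analyzing the dumps: the resulting .pickle can be dragged into pytorch.org/memory_viz, or inspected directly with plain pickle. A minimal sketch, assuming the snapshot dict produced by torch.cuda.memory._snapshot() keeps its current "segments"/"device_traces" layout (the path in the commented call is illustrative, not from this PR):

import os
import pickle

def inspect_snapshot(path: str) -> None:
    # report file size and how many allocator events were captured
    size_mb = os.path.getsize(path) / 1e6
    with open(path, "rb") as f:
        snapshot = pickle.load(f)
    # _snapshot() returns a dict with "segments" (allocator state) and
    # "device_traces" (per-device lists of alloc/free events), whose length
    # is capped by MEMORY_SNAPSHOT_MAX_ENTRIES
    num_events = sum(len(trace) for trace in snapshot.get("device_traces", []))
    print(f"{path}: {size_mb:.1f} MB, {num_events} recorded events")

# inspect_snapshot("outputs/memory_snapshot/iteration_10/rank0_memory_snapshot.pickle")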


# default memory snapshot folder
ENABLE_MEMORY_SNAPSHOT_KEY = "enable_memory_snapshot"
MEMORY_SNAPSHOT_FOLDER_KEY = "memory_snapshot_folder"
MEMORY_SNAPSHOT_FOLDER_DEFAULT_VALUE = "memory_snapshot"
Contributor (reviewer):

We should make these into configs. Please refer to how torch_profiler does this, e.g. put them into config_manager.py.

Contributor Author (@weifengpy):

Good to know about config_manager.py. I will move the defaults into config_manager.
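
For reference, a minimal sketch of what registering these options in config_manager.py could look like, assuming it follows the same argparse pattern used for the existing torch_profiler options; the argument names mirror the keys in this diff, but the defaults and help strings are illustrative, not the final PR code:

import argparse

parser = argparse.ArgumentParser()
# sketch only: in config_manager.py these would be added to the JobConfig parser
parser.add_argument(
    "--profiling.enable_memory_snapshot",
    action="store_true",
    default=False,
    help="Whether to dump memory snapshots to analyze OOMs",
)
parser.add_argument(
    "--profiling.memory_snapshot_folder",
    type=str,
    default="memory_snapshot",
    help="Folder under job.dump_folder to write memory snapshots to",
)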



@contextlib.contextmanager
def maybe_enable_profiling(config: JobConfig, *, global_step: int = 0):
@@ -70,3 +79,62 @@ def trace_handler(prof):
    else:
        torch_profiler = contextlib.nullcontext()
        yield None


@contextlib.contextmanager
def maybe_enable_memory_snapshot(config: JobConfig, *, global_step: int = 0):
    enable_snapshot = getattr(config.profiling, ENABLE_MEMORY_SNAPSHOT_KEY, False)
    if enable_snapshot:
        snapshot_folder = getattr(
            config.profiling,
            MEMORY_SNAPSHOT_FOLDER_KEY,
            MEMORY_SNAPSHOT_FOLDER_DEFAULT_VALUE,
        )
        snapshot_dir = os.path.join(config.job.dump_folder, snapshot_folder)
        if not os.path.exists(snapshot_dir):
            os.makedirs(snapshot_dir, exist_ok=True)
        rank = torch.distributed.get_rank()

        class MemoryProfiler:
            def __init__(self, step_num: int, freq: int):
                torch.cuda.memory._record_memory_history(
                    max_entries=MEMORY_SNAPSHOT_MAX_ENTRIES
                )
                # when resuming training, we start from the last step
                self.step_num = step_num
                self.freq = freq

            def step(self, exit_ctx: bool = False):
                if not exit_ctx and self.step_num % self.freq != 0:
                    self.step_num += 1
                    return
                if not exit_ctx:
                    curr_step = self.step_num
                    self.step_num += 1
                    dir_name = f"iteration_{curr_step}"
Contributor (reviewer):

torch.profiler starts from step 0, whereas train.py starts from step 1. In order to make things work as expected, I suggest we do the following, so that if we set profile_freq=10 and run training for 10 steps, there will be memory snapshots for iteration_10 (similar to torch.profiler) and iteration_10_exit. I've tested this offline.

Suggested change

Remove:
                if not exit_ctx and self.step_num % self.freq != 0:
                    self.step_num += 1
                    return
                if not exit_ctx:
                    curr_step = self.step_num
                    self.step_num += 1
                    dir_name = f"iteration_{curr_step}"

Replace with:
                self.step_num += 1
                if not exit_ctx and self.step_num % self.freq != 0:
                    return
                if not exit_ctx:
                    curr_step = self.step_num
                    dir_name = f"iteration_{curr_step}"

Contributor Author (@weifengpy):

Thanks for pointing out the difference. Updated accordingly.
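
To make the off-by-one concrete, here is a small standalone simulation (illustrative only, not code from the PR) of the two step() variants with profile_freq = 10 over 10 training steps plus the final exit call:

# Standalone illustration: which snapshot folders each step() variant produces
# for profile_freq = 10 over 10 training steps, plus the exit-time call.
def simulate(variant: str, freq: int = 10, train_steps: int = 10):
    step_num = 0
    dumps = []

    def step(exit_ctx: bool = False):
        nonlocal step_num
        if variant == "original":
            if not exit_ctx and step_num % freq != 0:
                step_num += 1
                return
            if not exit_ctx:
                dumps.append(f"iteration_{step_num}")
                step_num += 1
            else:
                dumps.append(f"iteration_{step_num - 1}_exit")
        else:  # "suggested"
            step_num += 1
            if not exit_ctx and step_num % freq != 0:
                return
            if not exit_ctx:
                dumps.append(f"iteration_{step_num}")
            else:
                dumps.append(f"iteration_{step_num - 1}_exit")

    for _ in range(train_steps):
        step()           # called once per training step, as in train.py
    step(exit_ctx=True)  # the context manager's finally-block call
    return dumps

print(simulate("original"))   # ['iteration_0', 'iteration_9_exit']
print(simulate("suggested"))  # ['iteration_10', 'iteration_10_exit']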

                else:
                    curr_step = self.step_num - 1
                    dir_name = f"iteration_{curr_step}_exit"
                curr_snapshot_dir = os.path.join(snapshot_dir, dir_name)
                if not os.path.exists(curr_snapshot_dir):
                    os.makedirs(curr_snapshot_dir, exist_ok=True)
                logger.info(f"Dumping memory snapshot at step {curr_step}")
                begin = time.monotonic()
                with open(
                    f"{curr_snapshot_dir}/rank{rank}_memory_snapshot.pickle", "wb"
                ) as output:
                    pickle.dump(torch.cuda.memory._snapshot(), output)
Contributor (reviewer):

Maybe add a threshold so the memory snapshot is only dumped when memory usage exceeds it, to avoid overwhelming amounts of data?

Contributor Author (@weifengpy), Jun 13, 2024:

Do you mean a threshold in MB? Right now it's bounded by the number of free/allocate ops via MEMORY_SNAPSHOT_MAX_ENTRIES. For an MB-based threshold, I can google around.

Contributor Author (@weifengpy):

Googled for an MB threshold but did not find anything useful. Currently MEMORY_SNAPSHOT_MAX_ENTRIES=100000 keeps the file size at 36 MB. Let me know if this is still a blocker.
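
For illustration, an MB-based gate along the lines of the reviewer's suggestion could check current allocator usage before dumping. This is a hypothetical sketch, not code from this PR; the helper name and threshold value are made up:

import torch

def should_dump_snapshot(threshold_mb: float = 40 * 1024) -> bool:
    # hypothetical gate: dump only when allocated CUDA memory on the current
    # device exceeds threshold_mb megabytes
    if not torch.cuda.is_available():
        return False
    allocated_mb = torch.cuda.memory_allocated() / (1024 * 1024)
    return allocated_mb >= threshold_mb

The step() method could then guard the pickle.dump call with this check if the overhead of unconditional dumps ever becomes a problem.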

                logger.info(
                    f"Finished dumping memory snapshot in {time.monotonic() - begin:.2f} seconds"
                )
                torch.distributed.barrier()

        logger.info(f"Memory profiler active. Snapshot will be saved at {snapshot_dir}")
        profiler = MemoryProfiler(global_step, config.profiling.profile_freq)
        try:
            yield profiler
        finally:
            # dump snapshot when CUDA OOMs
            profiler.step(exit_ctx=True)
    else:
        yield None
9 changes: 7 additions & 2 deletions train.py
@@ -38,7 +38,7 @@
    ParallelDims,
)
from torchtitan.parallelisms.pipelining_utils import build_pipeline_schedule
from torchtitan.profiling import maybe_enable_profiling
from torchtitan.profiling import maybe_enable_memory_snapshot, maybe_enable_profiling
from torchtitan.utils import (
    Color,
    dist_max,
@@ -301,7 +301,9 @@ def loss_fn(pred, labels):
logger.info(f"Training starts at step {train_state.step + 1}")
with maybe_enable_profiling(
job_config, global_step=train_state.step
) as torch_profiler:
) as torch_profiler, maybe_enable_memory_snapshot(
job_config, global_step=train_state.step
) as memory_profiler:
checkpoint.reset()

# variables used to keep info for metrics logging
@@ -447,6 +449,9 @@ def loss_fn(pred, labels):
            if torch_profiler:
                torch_profiler.step()

            if memory_profiler:
                memory_profiler.step()

            # Reduce timeout after first train step for faster signal (assumes lazy init, compile are finished)
            if train_state.step == 1:
                set_pg_timeouts(
1 change: 1 addition & 0 deletions train_configs/debug_model.toml
@@ -9,6 +9,7 @@ use_for_integration_test = true
enable_profiling = true
save_traces_folder = "profile_trace"
profile_freq = 10
enable_memory_snapshot = false
Contributor Author (@weifengpy), Jun 13, 2024:

Existing .toml files without enable_memory_snapshot still work, since the option is read optionally via getattr(config.profiling, 'enable_memory_snapshot', False). I am just adding it here so people can start toggling it.

Contributor (reviewer):

Ditto: we should put the default value False into config_manager, and remove this option from all the toml config files. Maybe only set it to True in debug_model.


[metrics]
log_freq = 1
1 change: 1 addition & 0 deletions train_configs/llama2_13b.toml
@@ -9,6 +9,7 @@ description = "Llama2 13B training"
enable_profiling = true
save_traces_folder = "profile_trace"
profile_freq = 100
enable_memory_snapshot = false

[metrics]
log_freq = 10
1 change: 1 addition & 0 deletions train_configs/llama2_70b.toml
@@ -9,6 +9,7 @@ description = "Llama2 70B training"
enable_profiling = true
save_traces_folder = "profile_trace"
profile_freq = 100
enable_memory_snapshot = false

[metrics]
log_freq = 10
1 change: 1 addition & 0 deletions train_configs/llama2_7b.toml
@@ -8,6 +8,7 @@ description = "Llama2 7B training"
enable_profiling = true
save_traces_folder = "profile_trace"
profile_freq = 100
enable_memory_snapshot = false

[metrics]
log_freq = 10
2 changes: 2 additions & 0 deletions train_configs/llama3_70b.toml
@@ -9,6 +9,8 @@ description = "Llama 3 70B training"
enable_profiling = true
save_traces_folder = "profile_trace"
profile_freq = 100
enable_memory_snapshot = false


[metrics]
log_freq = 10
1 change: 1 addition & 0 deletions train_configs/llama3_8b.toml
@@ -9,6 +9,7 @@ description = "Llama 3 8B training"
enable_profiling = true
save_traces_folder = "profile_trace"
profile_freq = 100
enable_memory_snapshot = false

[metrics]
log_freq = 10