Schedule visualization upgrade (#54)
* update Plotly visualization to display energy breakdown and spatial utilization in hover

* update main files to load CostModelEvaluationLUT to display extra performance info in visualization

* fix small typo in simd parser: capitalization of K dimension

* allow 'all' as tiling dimension size in mapping

* update tpu mapping intra_core_tiling and inter_core_tiling

* rename node_hw_performances to cost_lut and rename all variables; restructure output saving paths

* add required and used link bandwidth in schedule visualization

* change print in memory usage visualization to logging INFO statement

* ignore all .pkl and .pickle files

* remove output pickle files
asyms authored Oct 25, 2024
1 parent 6355f08 commit 89f35da
Showing 25 changed files with 305 additions and 222 deletions.
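The changes below rename node_hw_performances to CostModelEvaluationLUT / cost_lut and move outputs into a per-experiment directory. A minimal sketch of the resulting workflow, pieced together from the main_stream_ga.py diff in this commit (the experiment_id string and the example input paths are illustrative assumptions; keyword names and output paths are as they appear in the diff):

from stream.api import optimize_allocation_ga
from stream.utils import CostModelEvaluationLUT
from stream.visualization.memory_usage import plot_memory_usage
from stream.visualization.schedule import visualize_timeline_plotly

# Illustrative id; main_stream_ga.py derives it from the input file names.
experiment_id = "tpu_like_quad_core-resnet18-fused-genetic_algorithm"

# Run (or reload) the genetic-algorithm allocation; outputs now live under outputs/<experiment_id>/.
scme = optimize_allocation_ga(
    hardware="stream/inputs/examples/hardware/tpu_like_quad_core.yaml",
    workload="stream/inputs/examples/workload/resnet18.yaml",
    mapping="stream/inputs/examples/mapping/tpu_like_quad_core.yaml",
    mode="fused",
    layer_stacks=[tuple(range(0, 11)), tuple(range(11, 22))] + [(i,) for i in range(22, 49)],
    nb_ga_generations=4,
    nb_ga_individuals=4,
    experiment_id=experiment_id,
    output_path="outputs",
    skip_if_exists=True,
)

# Load the saved CostModelEvaluationLUT and pass it to the schedule plot, which now
# shows the energy breakdown, spatial utilization and link bandwidth in the hover.
cost_lut = CostModelEvaluationLUT(f"outputs/{experiment_id}/cost_lut.pickle")
visualize_timeline_plotly(
    scme,
    draw_dependencies=True,
    draw_communication=True,
    fig_path=f"outputs/{experiment_id}/schedule.html",
    cost_lut=cost_lut,
)
plot_memory_usage(scme, (0,), (100,), fig_path=f"outputs/{experiment_id}/memory.png")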
4 changes: 2 additions & 2 deletions .gitignore
@@ -136,8 +136,8 @@ dmypy.json
.vscode/

# result pickle files
output*/*.pkl
output*/*.pickle
*.pkl
*.pickle

# result json files
outputs*/*.json
4 changes: 2 additions & 2 deletions docs/source/stages.rst
@@ -27,7 +27,7 @@ Stages within Stream are used to modularly and easily adapt the functionality of
loma_lpf_limit=6, # required by LomaEngine
nb_ga_individuals=32, # number of individuals in each genetic algorithm generation
nb_ga_generations=100, # number of genetic algorithm generations
node_hw_performances_path=node_hw_performances_path, # saved node_hw_performances to skip re-computation
cost_lut_path=cost_lut_path, # saved CostModelEvaluationLUT to skip re-computation
plot_hof=True, # Save schedule and memory usage plot of each individual in the Genetic Algorithm hall of fame
plot_file_name=plot_file_name,
plot_full_schedule=plot_full_schedule,
@@ -74,7 +74,7 @@ Multiple modes are applicable through the `cn_define_mode` parameter in conjunct
`InterCoreMappingStage <https://github.com/KULeuven-MICAS/stream/blob/master/stream/classes/stages/InterCoreMappingStage.py#L17>`_
----------------------------------------------------------------------------------------------------------------------------------

Stage that finds the best inter-core mapping using a genetic algorithm. From the IntraCoreMappingStage we receive the `node_hw_performances`, containing for each node and its valid core allocations the best CME. We then initialize the genetic algorithm.
Stage that finds the best inter-core mapping using a genetic algorithm. From the IntraCoreMappingStage we receive the `CostModelEvaluationLUT`, containing for each node and its valid core allocations the best CME. We then initialize the genetic algorithm.
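The data handed over to the InterCoreMappingStage can be illustrated with a small stand-in: a table mapping every node to its valid core allocations and the best cost model evaluation (CME) found for each, which the genetic algorithm then uses to score candidate allocations. The sketch below is not Stream's actual CostModelEvaluationLUT API; it uses plain dicts and made-up latency/energy numbers purely to show the lookup pattern.

from dataclasses import dataclass

@dataclass
class MiniCME:
    """Hypothetical stand-in for a cost model evaluation (CME)."""
    latency: int   # cycles
    energy: float  # pJ

# For each node: the best CME per valid core allocation
# (the kind of information the IntraCoreMappingStage hands over).
cost_lut: dict[str, dict[int, MiniCME]] = {
    "conv1": {0: MiniCME(1200, 3.1e6), 1: MiniCME(1350, 2.9e6)},
    "conv2": {0: MiniCME(900, 2.2e6), 2: MiniCME(880, 2.5e6)},
}

def fitness(allocation: dict[str, int]) -> tuple[int, float]:
    """Toy fitness of a candidate node->core allocation: summed latency and energy.
    The real genetic algorithm additionally models scheduling and communication."""
    latency = sum(cost_lut[node][core].latency for node, core in allocation.items())
    energy = sum(cost_lut[node][core].energy for node, core in allocation.items())
    return latency, energy

print(fitness({"conv1": 0, "conv2": 2}))  # (2080, 5600000.0)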

`IntraCoreMappingStage <https://github.com/KULeuven-MICAS/stream/blob/master/stream/classes/stages/IntraCoreMappingStage.py#L22/>`_
-----------------------------------------------------------------------------------------------------------------------------------
34 changes: 20 additions & 14 deletions main_stream_co.py
@@ -2,6 +2,7 @@
import re

from stream.api import optimize_allocation_co
from stream.utils import CostModelEvaluationLUT
from stream.visualization.memory_usage import plot_memory_usage
from stream.visualization.schedule import (
visualize_timeline_plotly,
@@ -27,37 +28,42 @@
experiment_id = f"{hw_name}-{wl_name}-{mode}-constraint_optimization"
######################################################################

scme = optimize_allocation_co(
hardware=accelerator,
workload=workload_path,
mapping=mapping_path,
mode=mode,
layer_stacks=layer_stacks,
experiment_id=experiment_id,
output_path="outputs",
skip_if_exists=False,
)

############PLOTTING#############
plot_file_name = f"-{experiment_id}-"
plot_full_schedule = True
draw_dependencies = True
plot_data_transfer = True
section_start_percent = (0,)
percent_shown = (100,)
#################################


################################PATHS################################
timeline_fig_path_plotly = f"outputs/{experiment_id}-schedule.html"
memory_fig_path = f"outputs/{experiment_id}-memory.png"
#########################PLOTTING PATHS##############################
timeline_fig_path_plotly = f"outputs/{experiment_id}/schedule.html"
memory_fig_path = f"outputs/{experiment_id}/memory.png"
#####################################################################

scme = optimize_allocation_co(
hardware=accelerator,
workload=workload_path,
mapping=mapping_path,
mode=mode,
layer_stacks=layer_stacks,
experiment_id=experiment_id,
output_path="outputs",
)
#####################CostModelEvaluationLUT LOAD#############################
cost_lut_path = f"outputs/{experiment_id}/cost_lut_post_co.pickle"
cost_lut = CostModelEvaluationLUT(cost_lut_path)
#############################################################################

# Plotting schedule timeline of best SCME
visualize_timeline_plotly(
scme,
draw_dependencies=draw_dependencies,
draw_communication=plot_data_transfer,
fig_path=timeline_fig_path_plotly,
cost_lut=cost_lut,
)
# Plotting memory usage of best SCME
plot_memory_usage(scme, section_start_percent, percent_shown, fig_path=memory_fig_path)
16 changes: 11 additions & 5 deletions main_stream_ga.py
@@ -2,6 +2,7 @@
import re

from stream.api import optimize_allocation_ga
from stream.utils import CostModelEvaluationLUT
from stream.visualization.memory_usage import plot_memory_usage
from stream.visualization.schedule import (
visualize_timeline_plotly,
@@ -17,8 +18,8 @@
mapping_path = "stream/inputs/examples/mapping/tpu_like_quad_core.yaml"
mode = "fused"
layer_stacks = [tuple(range(0, 11)), tuple(range(11, 22))] + list((i,) for i in range(22, 49))
nb_ga_generations = 16
nb_ga_individuals = 16
nb_ga_generations = 4
nb_ga_individuals = 4
##############################################################################################

################################PARSING###############################
@@ -40,8 +41,8 @@


################################PATHS################################
timeline_fig_path_plotly = f"outputs/{experiment_id}-schedule.html"
memory_fig_path = f"outputs/{experiment_id}-memory.png"
timeline_fig_path_plotly = f"outputs/{experiment_id}/schedule.html"
memory_fig_path = f"outputs/{experiment_id}/memory.png"
#####################################################################

scme = optimize_allocation_ga(
@@ -54,15 +55,20 @@
nb_ga_individuals=nb_ga_individuals,
experiment_id=experiment_id,
output_path="outputs",
skip_if_exists=False,
skip_if_exists=True,
)

# Load in the CostModelEvaluationLUT from the run
cost_lut_path = f"outputs/{experiment_id}/cost_lut.pickle"
cost_lut = CostModelEvaluationLUT(cost_lut_path)

# Plotting schedule timeline of best SCME
visualize_timeline_plotly(
scme,
draw_dependencies=draw_dependencies,
draw_communication=plot_data_transfer,
fig_path=timeline_fig_path_plotly,
cost_lut=cost_lut,
)
# Plotting memory usage of best SCME
plot_memory_usage(scme, section_start_percent, percent_shown, fig_path=memory_fig_path)
79 changes: 24 additions & 55 deletions stream/api.py
@@ -40,6 +40,7 @@ def _sanity_check_gurobi_license():
try:
# Try to create a simple optimization model
model = gp.Model()
model.setParam("OutputFlag", 0)
# Check if the model was successfully created (license check)
model.optimize()
# If model.optimize() runs without a license issue, return
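The hunk above adds model.setParam("OutputFlag", 0) so the Gurobi license sanity check runs without printing the solver banner. A self-contained sketch of that pattern, assuming gurobipy is installed; the error handling shown here is generic and not Stream's exact code:

import gurobipy as gp

def sanity_check_gurobi_license() -> None:
    """Fail fast if no usable Gurobi license is available, without solver output."""
    try:
        model = gp.Model()
        model.setParam("OutputFlag", 0)  # suppress solver output during the check
        model.optimize()                 # trivial empty model; only the license matters here
    except gp.GurobiError as exc:
        raise RuntimeError(f"Gurobi license check failed: {exc}") from exc

# Example use:
sanity_check_gurobi_license()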
@@ -67,12 +68,17 @@ def optimize_allocation_ga(
) -> StreamCostModelEvaluation:
_sanity_check_inputs(hardware, workload, mapping, mode, output_path)

logger = _logging.getLogger(__name__)
# Create experiment_id path
os.makedirs(f"{output_path}/{experiment_id}", exist_ok=True)

# Output paths
node_hw_performances_path = f"{output_path}/{experiment_id}-saved_cn_hw_cost.pickle"
scme_path = f"{output_path}/{experiment_id}-scme.pickle"
cost_lut_path = f"{output_path}/{experiment_id}/cost_lut.pickle"
scme_path = f"{output_path}/{experiment_id}/scme.pickle"

# Get logger
logger = _logging.getLogger(__name__)

# Load SCME if it exists and skip_if_exists is True
if os.path.exists(scme_path) and skip_if_exists:
scme = pickle_load(scme_path)
logger.info(f"Loaded SCME from {scme_path}")
@@ -93,11 +99,11 @@
workload_path=workload, # required by ModelParserStage
mapping_path=mapping, # required by ModelParserStage
loma_lpf_limit=6, # required by LomaEngine
nb_ga_generations=nb_ga_generations, # number of genetic algorithm generations
nb_ga_individuals=nb_ga_individuals, # number of individuals in each genetic algorithm generation
nb_ga_generations=nb_ga_generations, # number of genetic algorithm (ga) generations
nb_ga_individuals=nb_ga_individuals, # number of individuals in each ga generation
mode=mode,
layer_stacks=layer_stacks,
node_hw_performances_path=node_hw_performances_path,
cost_lut_path=cost_lut_path,
operands_to_prefetch=[], # required by GeneticAlgorithmAllocationStage
)
# Launch the MainStage
@@ -120,14 +126,19 @@
_sanity_check_inputs(hardware, workload, mapping, mode, output_path)
_sanity_check_gurobi_license()

# Create experiment_id path
os.makedirs(f"{output_path}/{experiment_id}", exist_ok=True)

# Output paths
node_hw_performances_path = f"{output_path}/{experiment_id}-saved_cn_hw_cost.pickle"
scme_path = f"{output_path}/{experiment_id}-scme.pickle"
# After constraint optimization paths
node_hw_performances_path_with_split = f"outputs/{experiment_id}-saved_cn_hw_cost-with_split.pickle"
cost_lut_path = f"{output_path}/{experiment_id}/cost_lut.pickle"
allocations_path = f"{output_path}/{experiment_id}/waco/"
cost_lut_post_co_path = f"outputs/{experiment_id}/cost_lut_post_co.pickle"
scme_path = f"{output_path}/{experiment_id}/scme.pickle"

# Get logger
logger = _logging.getLogger(__name__)

# Load SCME if it exists and skip_if_exists is True
if os.path.exists(scme_path) and skip_if_exists:
scme = pickle_load(scme_path)
logger.info(f"Loaded SCME from {scme_path}")
@@ -150,55 +161,13 @@
loma_lpf_limit=6, # required by LomaEngine
mode=mode,
layer_stacks=layer_stacks,
node_hw_performances_path=node_hw_performances_path,
node_hw_performances_path_with_split=node_hw_performances_path_with_split,
cost_lut_path=cost_lut_path,
allocations_path=allocations_path,
cost_lut_post_co_path=cost_lut_post_co_path,
operands_to_prefetch=[], # required by ConstraintOptimizationAllocationStage
)
# Launch the MainStage
answers = mainstage.run()
scme = answers[0][0]
pickle_save(scme, scme_path)
return scme


if __name__ == "__main__":
from stream.visualization.memory_usage import plot_memory_usage
from stream.visualization.schedule import visualize_timeline_plotly

accelerator = "stream/inputs/examples/hardware/tpu_like_quad_core.yaml"
workload = "stream/inputs/examples/workload/resnet18.yaml"
mapping = "stream/inputs/examples/mapping/tpu_like_quad_core.yaml"

hw_name = "tpu_like_quad_core"
wl_name = "resnet18"
mode = "fused"
experiment_id = f"{hw_name}-{wl_name}"
output_path = "outputs"
layer_stacks = [tuple(range(0, 11)), tuple(range(11, 22))] + list((i,) for i in range(22, 49))

scme, _ = optimize_allocation_ga(
accelerator,
workload,
mapping,
mode,
layer_stacks,
experiment_id,
output_path,
)

plot_full_schedule = True
draw_dependencies = True
plot_data_transfer = True
section_start_percent = (0,)
percent_shown = (100,)
schedule_fig_path = f"{output_path}/schedule_plot.png"
memory_fig_path = f"{output_path}/memory_plot.png"
energy_fig_path = f"{output_path}/energy_plot.png"
visualize_timeline_plotly(
scme=scme,
draw_dependencies=draw_dependencies,
draw_communication=True,
fig_path=schedule_fig_path,
)
plot_memory_usage(scme.accelerator.memory_manager, fig_path=memory_fig_path)
# bar_plot_stream_cost_model_evaluations_breakdown([scme], fig_path=energy_fig_path)
9 changes: 4 additions & 5 deletions stream/cost_model/communication_manager.py
@@ -46,13 +46,11 @@ class CommunicationLinkEvent:
- a list of tensors relevant for the event:
* the tensor being transferred
* the tensor(s) for which we are blocking
- an activity percentage:
* the percentage of the link bandwidth used
- an activity:
* the link bandwidth used, in bits per clock cycle
"""

def __init__(
self, type: str, start: int, end: int, tensors: list[Tensor], energy: float, activity: float = 100
) -> None:
def __init__(self, type: str, start: int, end: int, tensors: list[Tensor], energy: float, activity: float) -> None:
self.type = type
self.start = start
self.end = end
@@ -163,6 +161,7 @@ def update_links(
end=end_timestep,
tensors=[tensor],
energy=duration * link.unit_energy_cost,
activity=link.bandwidth,
)
for link in links
]
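The CommunicationLinkEvent change above replaces a percentage with an absolute activity in bits per clock cycle; the transfer created in update_links now simply passes link.bandwidth. A hedged, self-contained sketch of the arithmetic, using a stand-in event class and assumed link parameters rather than Stream's real Tensor and link objects:

from dataclasses import dataclass

@dataclass
class LinkEventSketch:
    """Simplified stand-in for CommunicationLinkEvent (no Tensor objects)."""
    type: str
    start: int       # cycle
    end: int         # cycle
    energy: float
    activity: float  # bits per clock cycle used on the link (no longer a percentage)

link_bandwidth = 64.0        # bits per cycle, assumed for this example
link_unit_energy_cost = 0.5  # energy per active cycle, assumed

duration = 100  # cycles
event = LinkEventSketch(
    type="transfer",
    start=0,
    end=duration,
    energy=duration * link_unit_energy_cost,  # mirrors duration * link.unit_energy_cost
    activity=link_bandwidth,                  # full-bandwidth transfer, mirrors activity=link.bandwidth
)

# The old percentage view can still be derived when needed, e.g. for visualization:
utilization_percent = 100 * event.activity / link_bandwidth
print(utilization_percent)  # 100.0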
28 changes: 26 additions & 2 deletions stream/inputs/examples/mapping/tpu_like_quad_core.yaml
@@ -1,31 +1,55 @@
- name: default
core_allocation: [0, 1, 2, 3]
intra_core_tiling:
- D, 64
- D, all
inter_core_tiling:
- K, *

- name: Conv
core_allocation: [0, 1, 2, 3]
intra_core_tiling:
- K, 8
- OY, all
inter_core_tiling:
- K, *

- name: Gemm
core_allocation: [0, 1, 2, 3]
intra_core_tiling:
- D, all
inter_core_tiling:
- H, *

- name: Pool
core_allocation: [4]
intra_core_tiling:
- OY, all
inter_core_tiling:
- K, *

- name: MaxPool
core_allocation: [4]
intra_core_tiling:
- OY, all
inter_core_tiling:
- K, *

- name: AveragePool
core_allocation: [4]
intra_core_tiling:
- OY, all
inter_core_tiling:
- K, *

- name: GlobalAveragePool
core_allocation: [4]
intra_core_tiling:
- OY, all
inter_core_tiling:
- K, *

- name: Add
core_allocation: [5]
intra_core_tiling:
- D, all
inter_core_tiling:
- H, *
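The mapping entries above use all as an intra-core tiling size and * for inter-core tiling (the commit message notes that 'all' is newly allowed as a tiling dimension size). How Stream resolves these tokens is not shown in this excerpt; the sketch below is only a guess at one plausible interpretation, where all expands to the full layer dimension and * to the number of cores in the allocation, and is not Stream's actual parser:

def resolve_tiling_size(token: str, dim_size: int, nb_cores: int) -> int:
    """Hypothetical resolution of a tiling-size token from the mapping file.

    'all' -> the full dimension size, '*' -> the core count,
    anything else -> a literal integer. Illustration only.
    """
    if token == "all":
        return dim_size
    if token == "*":
        return nb_cores
    return int(token)

# Example: "- OY, all" on a layer with OY=56, and "- K, *" over a 4-core allocation.
print(resolve_tiling_size("all", dim_size=56, nb_cores=4))  # 56
print(resolve_tiling_size("*", dim_size=64, nb_cores=4))    # 4
print(resolve_tiling_size("8", dim_size=64, nb_cores=4))    # 8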
6 changes: 3 additions & 3 deletions stream/opt/allocation/constraint_optimization/allocation.py
@@ -22,7 +22,7 @@
def get_optimal_allocations(
workload: ComputationNodeWorkload,
accelerator: Accelerator,
node_hw_performances: CostModelEvaluationLUT,
cost_lut: CostModelEvaluationLUT,
iterations: int,
gap: float = 0.5,
time_limit: int = 600,
@@ -34,9 +34,9 @@ def get_optimal_allocations(
ids = convert_ids(nodes)

latencies, possible_allocation_splits = get_latencies(
nodes, core_ids, accelerator, node_hw_performances, impossible_lat=0, ids=ids
nodes, core_ids, accelerator, cost_lut, impossible_lat=0, ids=ids
)
energies = get_energies(nodes, core_ids, accelerator, node_hw_performances, impossible_energy=0, ids=ids)
energies = get_energies(nodes, core_ids, accelerator, cost_lut, impossible_energy=0, ids=ids)
output_operand = LayerOperand("O")
dependencies = {
(ids[p], ids[c]): p.operand_size_bit[output_operand] for p, c in workload.edges() if p in nodes and c in nodes
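The dependencies dict built in get_optimal_allocations above keys each (producer id, consumer id) pair to the size in bits of the producer's output operand "O". A toy stand-in showing just that construction with a networkx graph and made-up node names and sizes, not Stream's real ComputationNodeWorkload:

import networkx as nx

# Toy workload graph: each node carries the size (in bits) of its output operand "O";
# edges are producer -> consumer dependencies.
workload = nx.DiGraph()
workload.add_node("conv1", operand_size_bit={"O": 802816})
workload.add_node("conv2", operand_size_bit={"O": 401408})
workload.add_edge("conv1", "conv2")

ids = {node: i for i, node in enumerate(workload.nodes())}  # plays the role of convert_ids(nodes)

# Mirrors the dependencies dict: for every edge, the amount of data (bits)
# the consumer needs from the producer.
dependencies = {
    (ids[p], ids[c]): workload.nodes[p]["operand_size_bit"]["O"] for p, c in workload.edges()
}
print(dependencies)  # {(0, 1): 802816}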