Schedule visualization upgrade (#54)
* update Plotly visualization to display energy breakdown and spatial utilization in hover

* update main files to load CostModelEvaluationLUT to display extra performance info in visualization

* fix small typo in simd parser: capitalization of K dimension

* allow 'all' as tiling dimension size in mapping

* update tpu mapping intra_core_tiling and inter_core_tiling

* rename node_hw_performances to cost_lut and rename all variables; restructure output saving paths

* add required and used link bandwidth in schedule visualization

* change print in memory usage visualization to logging INFO statement

* ignore all .pkl and .pickle files

* remove output pickle files
asyms authored Oct 25, 2024
1 parent 6355f08 commit 89f35da
Showing 25 changed files with 305 additions and 222 deletions.
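The changes below rename node_hw_performances to CostModelEvaluationLUT / cost_lut and move outputs into a per-experiment directory. A minimal sketch of the resulting workflow, pieced together from the main_stream_ga.py diff in this commit (the experiment_id string and the example input paths are illustrative assumptions; keyword names and output paths are as they appear in the diff):

from stream.api import optimize_allocation_ga
from stream.utils import CostModelEvaluationLUT
from stream.visualization.memory_usage import plot_memory_usage
from stream.visualization.schedule import visualize_timeline_plotly

# Illustrative id; main_stream_ga.py derives it from the input file names.
experiment_id = "tpu_like_quad_core-resnet18-fused-genetic_algorithm"

# Run (or reload) the genetic-algorithm allocation; outputs now live under outputs/<experiment_id>/.
scme = optimize_allocation_ga(
    hardware="stream/inputs/examples/hardware/tpu_like_quad_core.yaml",
    workload="stream/inputs/examples/workload/resnet18.yaml",
    mapping="stream/inputs/examples/mapping/tpu_like_quad_core.yaml",
    mode="fused",
    layer_stacks=[tuple(range(0, 11)), tuple(range(11, 22))] + [(i,) for i in range(22, 49)],
    nb_ga_generations=4,
    nb_ga_individuals=4,
    experiment_id=experiment_id,
    output_path="outputs",
    skip_if_exists=True,
)

# Load the saved CostModelEvaluationLUT and pass it to the schedule plot, which now
# shows the energy breakdown, spatial utilization and link bandwidth in the hover.
cost_lut = CostModelEvaluationLUT(f"outputs/{experiment_id}/cost_lut.pickle")
visualize_timeline_plotly(
    scme,
    draw_dependencies=True,
    draw_communication=True,
    fig_path=f"outputs/{experiment_id}/schedule.html",
    cost_lut=cost_lut,
)
plot_memory_usage(scme, (0,), (100,), fig_path=f"outputs/{experiment_id}/memory.png")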
4 changes: 2 additions & 2 deletions .gitignore
@@ -136,8 +136,8 @@ dmypy.json
.vscode/

# result pickle files
output*/*.pkl
output*/*.pickle
*.pkl
*.pickle

# result json files
outputs*/*.json
4 changes: 2 additions & 2 deletions docs/source/stages.rst
@@ -27,7 +27,7 @@ Stages within Stream are used to modularly and easily adapt the functionality of
loma_lpf_limit=6, # required by LomaEngine
nb_ga_individuals=32, # number of individuals in each genetic algorithm generation
nb_ga_generations=100, # number of genetic algorithm generations
node_hw_performances_path=node_hw_performances_path, # saved node_hw_performances to skip re-computation
cost_lut_path=cost_lut_path, # saved CostModelEvaluationLUT to skip re-computation
plot_hof=True, # Save schedule and memory usage plot of each individual in the Genetic Algorithm hall of fame
plot_file_name=plot_file_name,
plot_full_schedule=plot_full_schedule,
@@ -74,7 +74,7 @@ Multiple modes are applicable through the `cn_define_mode` parameter in conjunct
`InterCoreMappingStage <https://github.com/KULeuven-MICAS/stream/blob/master/stream/classes/stages/InterCoreMappingStage.py#L17>`_
----------------------------------------------------------------------------------------------------------------------------------

Stage that finds the best inter-core mapping using a genetic algorithm. From the IntraCoreMappingStage we receive the `node_hw_performances`, containing for each node and its valid core allocations the best CME. We then initialize the genetic algorithm.
Stage that finds the best inter-core mapping using a genetic algorithm. From the IntraCoreMappingStage we receive the `CostModelEvaluationLUT`, containing for each node and its valid core allocations the best CME. We then initialize the genetic algorithm.
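The data handed over to the InterCoreMappingStage can be illustrated with a small stand-in: a table mapping every node to its valid core allocations and the best cost model evaluation (CME) found for each, which the genetic algorithm then uses to score candidate allocations. The sketch below is not Stream's actual CostModelEvaluationLUT API; it uses plain dicts and made-up latency/energy numbers purely to show the lookup pattern.

from dataclasses import dataclass

@dataclass
class MiniCME:
    """Hypothetical stand-in for a cost model evaluation (CME)."""
    latency: int   # cycles
    energy: float  # pJ

# For each node: the best CME per valid core allocation
# (the kind of information the IntraCoreMappingStage hands over).
cost_lut: dict[str, dict[int, MiniCME]] = {
    "conv1": {0: MiniCME(1200, 3.1e6), 1: MiniCME(1350, 2.9e6)},
    "conv2": {0: MiniCME(900, 2.2e6), 2: MiniCME(880, 2.5e6)},
}

def fitness(allocation: dict[str, int]) -> tuple[int, float]:
    """Toy fitness of a candidate node->core allocation: summed latency and energy.
    The real genetic algorithm additionally models scheduling and communication."""
    latency = sum(cost_lut[node][core].latency for node, core in allocation.items())
    energy = sum(cost_lut[node][core].energy for node, core in allocation.items())
    return latency, energy

print(fitness({"conv1": 0, "conv2": 2}))  # (2080, 5600000.0)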

`IntraCoreMappingStage <https://github.com/KULeuven-MICAS/stream/blob/master/stream/classes/stages/IntraCoreMappingStage.py#L22/>`_
-----------------------------------------------------------------------------------------------------------------------------------
34 changes: 20 additions & 14 deletions main_stream_co.py
@@ -2,6 +2,7 @@
import re

from stream.api import optimize_allocation_co
from stream.utils import CostModelEvaluationLUT
from stream.visualization.memory_usage import plot_memory_usage
from stream.visualization.schedule import (
visualize_timeline_plotly,
@@ -27,37 +28,42 @@
experiment_id = f"{hw_name}-{wl_name}-{mode}-constraint_optimization"
######################################################################

scme = optimize_allocation_co(
hardware=accelerator,
workload=workload_path,
mapping=mapping_path,
mode=mode,
layer_stacks=layer_stacks,
experiment_id=experiment_id,
output_path="outputs",
skip_if_exists=False,
)

############PLOTTING#############
plot_file_name = f"-{experiment_id}-"
plot_full_schedule = True
draw_dependencies = True
plot_data_transfer = True
section_start_percent = (0,)
percent_shown = (100,)
#################################


################################PATHS################################
timeline_fig_path_plotly = f"outputs/{experiment_id}-schedule.html"
memory_fig_path = f"outputs/{experiment_id}-memory.png"
#########################PLOTTING PATHS##############################
timeline_fig_path_plotly = f"outputs/{experiment_id}/schedule.html"
memory_fig_path = f"outputs/{experiment_id}/memory.png"
#####################################################################

scme = optimize_allocation_co(
hardware=accelerator,
workload=workload_path,
mapping=mapping_path,
mode=mode,
layer_stacks=layer_stacks,
experiment_id=experiment_id,
output_path="outputs",
)
#####################CostModelEvaluationLUT LOAD#############################
cost_lut_path = f"outputs/{experiment_id}/cost_lut_post_co.pickle"
cost_lut = CostModelEvaluationLUT(cost_lut_path)
#############################################################################

# Plotting schedule timeline of best SCME
visualize_timeline_plotly(
scme,
draw_dependencies=draw_dependencies,
draw_communication=plot_data_transfer,
fig_path=timeline_fig_path_plotly,
cost_lut=cost_lut,
)
# Plotting memory usage of best SCME
plot_memory_usage(scme, section_start_percent, percent_shown, fig_path=memory_fig_path)
16 changes: 11 additions & 5 deletions main_stream_ga.py
@@ -2,6 +2,7 @@
import re

from stream.api import optimize_allocation_ga
from stream.utils import CostModelEvaluationLUT
from stream.visualization.memory_usage import plot_memory_usage
from stream.visualization.schedule import (
visualize_timeline_plotly,
@@ -17,8 +18,8 @@
mapping_path = "stream/inputs/examples/mapping/tpu_like_quad_core.yaml"
mode = "fused"
layer_stacks = [tuple(range(0, 11)), tuple(range(11, 22))] + list((i,) for i in range(22, 49))
nb_ga_generations = 16
nb_ga_individuals = 16
nb_ga_generations = 4
nb_ga_individuals = 4
##############################################################################################

################################PARSING###############################
@@ -40,8 +41,8 @@


################################PATHS################################
timeline_fig_path_plotly = f"outputs/{experiment_id}-schedule.html"
memory_fig_path = f"outputs/{experiment_id}-memory.png"
timeline_fig_path_plotly = f"outputs/{experiment_id}/schedule.html"
memory_fig_path = f"outputs/{experiment_id}/memory.png"
#####################################################################

scme = optimize_allocation_ga(
@@ -54,15 +55,20 @@
nb_ga_individuals=nb_ga_individuals,
experiment_id=experiment_id,
output_path="outputs",
skip_if_exists=False,
skip_if_exists=True,
)

# Load in the CostModelEvaluationLUT from the run
cost_lut_path = f"outputs/{experiment_id}/cost_lut.pickle"
cost_lut = CostModelEvaluationLUT(cost_lut_path)

# Plotting schedule timeline of best SCME
visualize_timeline_plotly(
scme,
draw_dependencies=draw_dependencies,
draw_communication=plot_data_transfer,
fig_path=timeline_fig_path_plotly,
cost_lut=cost_lut,
)
# Plotting memory usage of best SCME
plot_memory_usage(scme, section_start_percent, percent_shown, fig_path=memory_fig_path)
79 changes: 24 additions & 55 deletions stream/api.py
@@ -40,6 +40,7 @@ def _sanity_check_gurobi_license():
try:
# Try to create a simple optimization model
model = gp.Model()
model.setParam("OutputFlag", 0)
# Check if the model was successfully created (license check)
model.optimize()
# If model.optimize() runs without a license issue, return
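The hunk above adds model.setParam("OutputFlag", 0) so the Gurobi license sanity check runs without printing the solver banner. A self-contained sketch of that pattern, assuming gurobipy is installed; the error handling shown here is generic and not Stream's exact code:

import gurobipy as gp

def sanity_check_gurobi_license() -> None:
    """Fail fast if no usable Gurobi license is available, without solver output."""
    try:
        model = gp.Model()
        model.setParam("OutputFlag", 0)  # suppress solver output during the check
        model.optimize()                 # trivial empty model; only the license matters here
    except gp.GurobiError as exc:
        raise RuntimeError(f"Gurobi license check failed: {exc}") from exc

# Example use:
sanity_check_gurobi_license()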
@@ -67,12 +68,17 @@ def optimize_allocation_ga(
) -> StreamCostModelEvaluation:
_sanity_check_inputs(hardware, workload, mapping, mode, output_path)

logger = _logging.getLogger(__name__)
# Create experiment_id path
os.makedirs(f"{output_path}/{experiment_id}", exist_ok=True)

# Output paths
node_hw_performances_path = f"{output_path}/{experiment_id}-saved_cn_hw_cost.pickle"
scme_path = f"{output_path}/{experiment_id}-scme.pickle"
cost_lut_path = f"{output_path}/{experiment_id}/cost_lut.pickle"
scme_path = f"{output_path}/{experiment_id}/scme.pickle"

# Get logger
logger = _logging.getLogger(__name__)

# Load SCME if it exists and skip_if_exists is True
if os.path.exists(scme_path) and skip_if_exists:
scme = pickle_load(scme_path)
logger.info(f"Loaded SCME from {scme_path}")
@@ -93,11 +99,11 @@
workload_path=workload, # required by ModelParserStage
mapping_path=mapping, # required by ModelParserStage
loma_lpf_limit=6, # required by LomaEngine
nb_ga_generations=nb_ga_generations, # number of genetic algorithm generations
nb_ga_individuals=nb_ga_individuals, # number of individuals in each genetic algorithm generation
nb_ga_generations=nb_ga_generations, # number of genetic algorithm (ga) generations
nb_ga_individuals=nb_ga_individuals, # number of individuals in each ga generation
mode=mode,
layer_stacks=layer_stacks,
node_hw_performances_path=node_hw_performances_path,
cost_lut_path=cost_lut_path,
operands_to_prefetch=[], # required by GeneticAlgorithmAllocationStage
)
# Launch the MainStage
@@ -120,14 +126,19 @@
_sanity_check_inputs(hardware, workload, mapping, mode, output_path)
_sanity_check_gurobi_license()

# Create experiment_id path
os.makedirs(f"{output_path}/{experiment_id}", exist_ok=True)

# Output paths
node_hw_performances_path = f"{output_path}/{experiment_id}-saved_cn_hw_cost.pickle"
scme_path = f"{output_path}/{experiment_id}-scme.pickle"
# After constraint optimization paths
node_hw_performances_path_with_split = f"outputs/{experiment_id}-saved_cn_hw_cost-with_split.pickle"
cost_lut_path = f"{output_path}/{experiment_id}/cost_lut.pickle"
allocations_path = f"{output_path}/{experiment_id}/waco/"
cost_lut_post_co_path = f"outputs/{experiment_id}/cost_lut_post_co.pickle"
scme_path = f"{output_path}/{experiment_id}/scme.pickle"

# Get logger
logger = _logging.getLogger(__name__)

# Load SCME if it exists and skip_if_exists is True
if os.path.exists(scme_path) and skip_if_exists:
scme = pickle_load(scme_path)
logger.info(f"Loaded SCME from {scme_path}")
@@ -150,55 +161,13 @@
loma_lpf_limit=6, # required by LomaEngine
mode=mode,
layer_stacks=layer_stacks,
node_hw_performances_path=node_hw_performances_path,
node_hw_performances_path_with_split=node_hw_performances_path_with_split,
cost_lut_path=cost_lut_path,
allocations_path=allocations_path,
cost_lut_post_co_path=cost_lut_post_co_path,
operands_to_prefetch=[], # required by ConstraintOptimizationAllocationStage
)
# Launch the MainStage
answers = mainstage.run()
scme = answers[0][0]
pickle_save(scme, scme_path)
return scme


if __name__ == "__main__":
from stream.visualization.memory_usage import plot_memory_usage
from stream.visualization.schedule import visualize_timeline_plotly

accelerator = "stream/inputs/examples/hardware/tpu_like_quad_core.yaml"
workload = "stream/inputs/examples/workload/resnet18.yaml"
mapping = "stream/inputs/examples/mapping/tpu_like_quad_core.yaml"

hw_name = "tpu_like_quad_core"
wl_name = "resnet18"
mode = "fused"
experiment_id = f"{hw_name}-{wl_name}"
output_path = "outputs"
layer_stacks = [tuple(range(0, 11)), tuple(range(11, 22))] + list((i,) for i in range(22, 49))

scme, _ = optimize_allocation_ga(
accelerator,
workload,
mapping,
mode,
layer_stacks,
experiment_id,
output_path,
)

plot_full_schedule = True
draw_dependencies = True
plot_data_transfer = True
section_start_percent = (0,)
percent_shown = (100,)
schedule_fig_path = f"{output_path}/schedule_plot.png"
memory_fig_path = f"{output_path}/memory_plot.png"
energy_fig_path = f"{output_path}/energy_plot.png"
visualize_timeline_plotly(
scme=scme,
draw_dependencies=draw_dependencies,
draw_communication=True,
fig_path=schedule_fig_path,
)
plot_memory_usage(scme.accelerator.memory_manager, fig_path=memory_fig_path)
# bar_plot_stream_cost_model_evaluations_breakdown([scme], fig_path=energy_fig_path)
9 changes: 4 additions & 5 deletions stream/cost_model/communication_manager.py
@@ -46,13 +46,11 @@ class CommunicationLinkEvent:
- a list of tensors relevant for the event:
* the tensor being transferred
* the tensor(s) for which we are blocking
- an activity percentage:
* the percentage of the link bandwidth used
- an activity:
* the link bandwidth used, in bits per clock cycle
"""

def __init__(
self, type: str, start: int, end: int, tensors: list[Tensor], energy: float, activity: float = 100
) -> None:
def __init__(self, type: str, start: int, end: int, tensors: list[Tensor], energy: float, activity: float) -> None:
self.type = type
self.start = start
self.end = end
@@ -163,6 +161,7 @@ def update_links(
end=end_timestep,
tensors=[tensor],
energy=duration * link.unit_energy_cost,
activity=link.bandwidth,
)
for link in links
]
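The CommunicationLinkEvent change above replaces a percentage with an absolute activity in bits per clock cycle; the transfer created in update_links now simply passes link.bandwidth. A hedged, self-contained sketch of the arithmetic, using a stand-in event class and assumed link parameters rather than Stream's real Tensor and link objects:

from dataclasses import dataclass

@dataclass
class LinkEventSketch:
    """Simplified stand-in for CommunicationLinkEvent (no Tensor objects)."""
    type: str
    start: int       # cycle
    end: int         # cycle
    energy: float
    activity: float  # bits per clock cycle used on the link (no longer a percentage)

link_bandwidth = 64.0        # bits per cycle, assumed for this example
link_unit_energy_cost = 0.5  # energy per active cycle, assumed

duration = 100  # cycles
event = LinkEventSketch(
    type="transfer",
    start=0,
    end=duration,
    energy=duration * link_unit_energy_cost,  # mirrors duration * link.unit_energy_cost
    activity=link_bandwidth,                  # full-bandwidth transfer, mirrors activity=link.bandwidth
)

# The old percentage view can still be derived when needed, e.g. for visualization:
utilization_percent = 100 * event.activity / link_bandwidth
print(utilization_percent)  # 100.0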
28 changes: 26 additions & 2 deletions stream/inputs/examples/mapping/tpu_like_quad_core.yaml
@@ -1,31 +1,55 @@
- name: default
core_allocation: [0, 1, 2, 3]
intra_core_tiling:
- D, 64
- D, all
inter_core_tiling:
- K, *

- name: Conv
core_allocation: [0, 1, 2, 3]
intra_core_tiling:
- K, 8
- OY, all
inter_core_tiling:
- K, *

- name: Gemm
core_allocation: [0, 1, 2, 3]
intra_core_tiling:
- D, all
inter_core_tiling:
- H, *

- name: Pool
core_allocation: [4]
intra_core_tiling:
- OY, all
inter_core_tiling:
- K, *

- name: MaxPool
core_allocation: [4]
intra_core_tiling:
- OY, all
inter_core_tiling:
- K, *

- name: AveragePool
core_allocation: [4]
intra_core_tiling:
- OY, all
inter_core_tiling:
- K, *

- name: GlobalAveragePool
core_allocation: [4]
intra_core_tiling:
- OY, all
inter_core_tiling:
- K, *

- name: Add
core_allocation: [5]
intra_core_tiling:
- D, all
inter_core_tiling:
- H, *
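The mapping entries above use all as an intra-core tiling size and * for inter-core tiling (the commit message notes that 'all' is newly allowed as a tiling dimension size). How Stream resolves these tokens is not shown in this excerpt; the sketch below is only a guess at one plausible interpretation, where all expands to the full layer dimension and * to the number of cores in the allocation, and is not Stream's actual parser:

def resolve_tiling_size(token: str, dim_size: int, nb_cores: int) -> int:
    """Hypothetical resolution of a tiling-size token from the mapping file.

    'all' -> the full dimension size, '*' -> the core count,
    anything else -> a literal integer. Illustration only.
    """
    if token == "all":
        return dim_size
    if token == "*":
        return nb_cores
    return int(token)

# Example: "- OY, all" on a layer with OY=56, and "- K, *" over a 4-core allocation.
print(resolve_tiling_size("all", dim_size=56, nb_cores=4))  # 56
print(resolve_tiling_size("*", dim_size=64, nb_cores=4))    # 4
print(resolve_tiling_size("8", dim_size=64, nb_cores=4))    # 8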
6 changes: 3 additions & 3 deletions stream/opt/allocation/constraint_optimization/allocation.py
@@ -22,7 +22,7 @@
def get_optimal_allocations(
workload: ComputationNodeWorkload,
accelerator: Accelerator,
node_hw_performances: CostModelEvaluationLUT,
cost_lut: CostModelEvaluationLUT,
iterations: int,
gap: float = 0.5,
time_limit: int = 600,
@@ -34,9 +34,9 @@ def get_optimal_allocations(
ids = convert_ids(nodes)

latencies, possible_allocation_splits = get_latencies(
nodes, core_ids, accelerator, node_hw_performances, impossible_lat=0, ids=ids
nodes, core_ids, accelerator, cost_lut, impossible_lat=0, ids=ids
)
energies = get_energies(nodes, core_ids, accelerator, node_hw_performances, impossible_energy=0, ids=ids)
energies = get_energies(nodes, core_ids, accelerator, cost_lut, impossible_energy=0, ids=ids)
output_operand = LayerOperand("O")
dependencies = {
(ids[p], ids[c]): p.operand_size_bit[output_operand] for p, c in workload.edges() if p in nodes and c in nodes
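The dependencies dict built in get_optimal_allocations above keys each (producer id, consumer id) pair to the size in bits of the producer's output operand "O". A toy stand-in showing just that construction with a networkx graph and made-up node names and sizes, not Stream's real ComputationNodeWorkload:

import networkx as nx

# Toy workload graph: each node carries the size (in bits) of its output operand "O";
# edges are producer -> consumer dependencies.
workload = nx.DiGraph()
workload.add_node("conv1", operand_size_bit={"O": 802816})
workload.add_node("conv2", operand_size_bit={"O": 401408})
workload.add_edge("conv1", "conv2")

ids = {node: i for i, node in enumerate(workload.nodes())}  # plays the role of convert_ids(nodes)

# Mirrors the dependencies dict: for every edge, the amount of data (bits)
# the consumer needs from the producer.
dependencies = {
    (ids[p], ids[c]): workload.nodes[p]["operand_size_bit"]["O"] for p, c in workload.edges()
}
print(dependencies)  # {(0, 1): 802816}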