Transformer support (#34)
* incorporate changes in ZigZag's Workload class, re-enable support for SIMD, transpose and reshape node parsing

* support for gather ONNX nodes

* fix bug in gather node

* LomaStage -> TemporalMappingGeneratorStage

* remove old input files (.py)

* fix core allocation of some nodes

* Show shortened layer name in html schedule

* allow for 1st node with non-constant operands + typehints

* typehint CN workload

* fix bug in core_ids in scheduler

* more typehint fixes

* tensor node: n-D tensor of sets -> (n+1)-D tensor

* upgrade zigzag version

* fix bug in nb_unique_data_seen

* optimize get_inter_edges

* pre-allocate tensor_cns

* add Concat node

* add dataflow to tpu_like core

* fix testing files

* use proper equality for Cn in IntraNodeMapping

* fix user-defined workload for testing
RobinGeens authored Aug 30, 2024
1 parent 6ac8877 commit 5055d7e
Showing 97 changed files with 1,427 additions and 2,876 deletions.
3 changes: 1 addition & 2 deletions .gitignore
@@ -1,5 +1,4 @@
-zigzag_repo
-zigzag
+*.out

 # Byte-compiled / optimized / DLL files
 __pycache__/
2 changes: 1 addition & 1 deletion docs/source/stages.rst
@@ -24,7 +24,7 @@ Stages within Stream are used to modularly and easily adapt the functionality of
     accelerator=accelerator, # required by AcceleratorParserStage
     workload_path=workload_path, # required by ModelParserStage
     mapping_path=mapping_path, # required by ModelParserStage
-    loma_lpf_limit=6, # required by LomaStage
+    loma_lpf_limit=6, # required by LomaEngine
     nb_ga_individuals=32, # number of individuals in each genetic algorithm generation
     nb_ga_generations=100, # number of genetic algorithm generations
     node_hw_performances_path=node_hw_performances_path, # saved node_hw_performances to skip re-computation
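For context on the kwargs above: in Stream they are passed once to a MainStage that chains the individual stages, each consuming the arguments it needs. The sketch below is illustrative only; the import paths and exact stage list are assumptions, and only the AcceleratorParserStage/ModelParserStage names, the TemporalMappingGeneratorStage rename, and the kwargs shown appear in this commit.

# Hedged sketch of a Stream stage pipeline; import paths and the exact
# stage list are assumptions, not part of this commit.
from stream.classes.stages import (  # assumed module layout
    AcceleratorParserStage,
    StreamONNXModelParserStage,
    TemporalMappingGeneratorStage,  # formerly LomaStage
)
from zigzag.stages.MainStage import MainStage  # assumed import path

mainstage = MainStage(
    [AcceleratorParserStage, StreamONNXModelParserStage, TemporalMappingGeneratorStage],
    accelerator=accelerator,  # path or object accepted by AcceleratorParserStage
    workload_path=workload_path,
    mapping_path=mapping_path,
    loma_lpf_limit=6,  # consumed by the LomaEngine inside TemporalMappingGeneratorStage
)
scme, _ = mainstage.run()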
2 changes: 1 addition & 1 deletion main_stream.py
@@ -78,7 +78,7 @@
     accelerator=accelerator, # required by AcceleratorParserStage
     workload_path=workload_path, # required by ModelParserStage
     mapping_path=mapping_path, # required by ModelParserStage
-    loma_lpf_limit=6, # required by LomaStage
+    loma_lpf_limit=6, # required by LomaEngine
     nb_ga_individuals=nb_ga_individuals, # number of individuals in each genetic algorithm generation
     nb_ga_generations=nb_ga_generations, # number of genetic algorithm generations
     node_hw_performances_path=node_hw_performances_path, # saved node_hw_performances to skip re-computation
2 changes: 1 addition & 1 deletion main_stream_layer_splitting.py
@@ -76,7 +76,7 @@
     accelerator=accelerator, # required by AcceleratorParserStage
     workload_path=workload_path, # required by ModelParserStage
     mapping_path=mapping_path, # required by ModelParserStage
-    loma_lpf_limit=6, # required by LomaStage
+    loma_lpf_limit=6, # required by LomaEngine
     nb_ga_individuals=nb_ga_individuals, # number of individuals in each genetic algorithm generation
     nb_ga_generations=nb_ga_generations, # number of genetic algorithm generations
     node_hw_performances_path=node_hw_performances_path, # saved node_hw_performances to skip re-computation
2 changes: 1 addition & 1 deletion main_stream_mode_3.py
@@ -83,7 +83,7 @@
     accelerator=accelerator, # required by AcceleratorParserStage
     workload_path=workload_path, # required by ModelParserStage
     mapping_path=mapping_path, # required by ModelParserStage
-    loma_lpf_limit=6, # required by LomaStage
+    loma_lpf_limit=6, # required by LomaEngine
     nb_ga_individuals=nb_ga_individuals, # number of individuals in each genetic algorithm generation
     nb_ga_generations=nb_ga_generations, # number of genetic algorithm generations
     node_hw_performances_path=node_hw_performances_path, # saved node_hw_performances to skip re-computation
2 changes: 1 addition & 1 deletion main_stream_mode_4.py
@@ -82,7 +82,7 @@
     accelerator=accelerator, # required by AcceleratorParserStage
     workload_path=workload_path, # required by ModelParserStage
     mapping_path=mapping_path, # required by ModelParserStage
-    loma_lpf_limit=6, # required by LomaStage
+    loma_lpf_limit=6, # required by LomaEngine
     nb_ga_individuals=nb_ga_individuals, # number of individuals in each genetic algorithm generation
     nb_ga_generations=nb_ga_generations, # number of genetic algorithm generations
     node_hw_performances_path=node_hw_performances_path, # saved node_hw_performances to skip re-computation
2 changes: 1 addition & 1 deletion main_testing_1_core_with_testing_workload.py
@@ -46,7 +46,7 @@
     accelerator=accelerator, # required by AcceleratorParserStage
     workload_path=workload_path, # required by ModelParserStage
     mapping_path=mapping_path, # required by ModelParserStage
-    loma_lpf_limit=6, # required by LomaStage
+    loma_lpf_limit=6, # required by LomaEngine
     nb_ga_individuals=4, # number of individuals in each genetic algorithm generation
     nb_ga_generations=1, # number of genetic algorithm generations
     node_hw_performances_path=f"outputs/{node_hw_cost_pkl_name}.pickle", # saved results to skip re-computation
2 changes: 1 addition & 1 deletion main_testing_2_cores_shared.py
@@ -55,7 +55,7 @@
     accelerator=accelerator, # required by AcceleratorParserStage
     workload_path=workload_path, # required by ModelParserStage
     mapping_path=mapping_path, # required by ModelParserStage
-    loma_lpf_limit=6, # required by LomaStage
+    loma_lpf_limit=6, # required by LomaEngine
     nb_ga_individuals=nb_ga_individuals, # number of individuals in each genetic algorithm generation
     nb_ga_generations=nb_ga_generations, # number of genetic algorithm generations
     node_hw_performances_path=node_hw_performances_path, # save node_hw_performances to skip re-computation
2 changes: 1 addition & 1 deletion main_testing_2_cores_with_testing_workload.py
@@ -47,7 +47,7 @@
     accelerator=accelerator, # required by AcceleratorParserStage
     workload_path=workload_path, # required by ModelParserStage
     mapping_path=mapping_path, # required by ModelParserStage
-    loma_lpf_limit=6, # required by LomaStage
+    loma_lpf_limit=6, # required by LomaEngine
     nb_ga_individuals=4, # number of individuals in each genetic algorithm generation
     nb_ga_generations=1, # number of genetic algorithm generations
     # node_hw_performances_path=f"outputs/{node_hw_cost_pkl_name}.pickle", # saved results to skip re-computation
2 changes: 1 addition & 1 deletion main_testing_2_cores_with_testing_workload_3_layers.py
@@ -46,7 +46,7 @@
     accelerator=accelerator, # required by AcceleratorParserStage
     workload_path=workload_path, # required by ModelParserStage
     mapping_path=mapping_path, # required by ModelParserStage
-    loma_lpf_limit=6, # required by LomaStage
+    loma_lpf_limit=6, # required by LomaEngine
     nb_ga_individuals=4, # number of individuals in each genetic algorithm generation
     nb_ga_generations=1, # number of genetic algorithm generations
     # node_hw_performances_path=f"outputs/{node_hw_cost_pkl_name}.pickle", # saved results to skip re-computation
2 changes: 1 addition & 1 deletion main_testing_4_cores_with_testing_workload.py
@@ -46,7 +46,7 @@
     accelerator=accelerator, # required by AcceleratorParserStage
     workload_path=workload_path, # required by ModelParserStage
     mapping_path=mapping_path, # required by ModelParserStage
-    loma_lpf_limit=6, # required by LomaStage
+    loma_lpf_limit=6, # required by LomaEngine
     nb_ga_individuals=4, # number of individuals in each genetic algorithm generation
     nb_ga_generations=1, # number of genetic algorithm generations
     node_hw_performances_path=f"outputs/{node_hw_cost_pkl_name}.pickle", # saves results to skip re-computation
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
-zigzag-dse==3.1.3
+zigzag-dse==3.6.1
 rtree
 deap
 matplotlib
2 changes: 1 addition & 1 deletion stream/api.py
@@ -27,7 +27,7 @@ def get_hardware_performance_stream(hardware, workload, mapping, CN_define_mode,
     accelerator=hardware, # required by AcceleratorParserStage
     workload_path=workload, # required by ModelParserStage
     mapping_path=mapping, # required by ModelParserStage
-    loma_lpf_limit=6, # required by LomaStage
+    loma_lpf_limit=6, # required by LomaEngine
     nb_ga_individuals=128, # number of individuals in each genetic algorithm generation
     nb_ga_generations=100, # number of genetic algorithm generations
     node_hw_performances_path=f"outputs/{node_hw_cost_pkl_name}.pickle", # saves results to skip re-computation
34 changes: 18 additions & 16 deletions stream/classes/cost_model/communication_manager.py
@@ -1,8 +1,7 @@
 import itertools
 from math import ceil
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any

-import networkx as nx
 from zigzag.datatypes import Constants, MemoryOperand
 from zigzag.hardware.architecture.Core import Core

@@ -12,6 +11,7 @@

 if TYPE_CHECKING:
     from stream.classes.hardware.architecture.accelerator import Accelerator
+    from stream.classes.hardware.architecture.noc.communication_link import CommunicationLink


 class CommunicationEvent:
@@ -50,7 +50,9 @@ class CommunicationLinkEvent:
     * the percentage of the link bandwidth used
     """

-    def __init__(self, type, start, end, tensors, energy, activity=100) -> None:
+    def __init__(
+        self, type: str, start: int, end: int, tensors: list[Tensor], energy: float, activity: float = 100
+    ) -> None:
         self.type = type
         self.start = start
         self.end = end
@@ -89,17 +91,15 @@ def __init__(self, accelerator: "Accelerator") -> None:

     def get_shortest_paths(self):
         # For each core pair save a shortest path
-        shortest_paths = {}
-        for producer_core, consumer_core in itertools.product(
-            self.accelerator.cores.nodes(), self.accelerator.cores.nodes()
-        ):
-            shortest_paths[(producer_core, consumer_core)] = nx.shortest_path(
-                self.accelerator.cores, producer_core, consumer_core
+        shortest_paths: dict[tuple[Core, Core], list[Core]] = {}
+        for producer_core, consumer_core in itertools.product(self.accelerator.core_list, self.accelerator.core_list):
+            shortest_paths[(producer_core, consumer_core)] = self.accelerator.cores.shortest_path(
+                producer_core, consumer_core
             )
         return shortest_paths

     def get_links_for_all_core_pairs(self):
-        communication_links = {}
+        communication_links: dict[tuple[Core, Core], Any] = {}
         for pair, path in self.shortest_paths.items():
             traversed_edges = [(i, j) for i, j in zip(path, path[1:])]
             communication_links[pair] = [
@@ -137,7 +137,7 @@ def update_links(
         tensor: Tensor,
         sender: Core | int,
         receiver: Core | int,
-        receiver_memory_operand: str,
+        receiver_memory_operand: MemoryOperand,
         start_timestep: int,
         duration: int,
     ) -> tuple[int, int, float, float]:
@@ -163,7 +163,7 @@
         receiver = self.accelerator.get_core(receiver)
         links = self.get_links_for_pair(sender, receiver)
         if not links: # When sender == receiver
-            return 0, 0
+            return 0, 0, 0, 0

         cles = [
             CommunicationLinkEvent(
@@ -214,7 +214,7 @@ def block_offchip_links(
             duration (int): The duration of the blocking in cycles.
             cn (ComputationNode): The computational node for which we are blocking the links.
         """
-        links_to_block = dict()
+        links_to_block: dict["CommunicationLink", int] = {}
         core = self.accelerator.get_core(core_id)
         offchip_core = self.accelerator.get_core(self.accelerator.offchip_core_id)
         if Constants.OUTPUT_MEM_OP in too_large_operands:
@@ -230,7 +230,7 @@
         if not too_large_operands:
             return start_timestep
         # Get the tensors for which we are blocking based on the operands
-        tensors = []
+        tensors: list[Tensor] = []
         for mem_op in too_large_operands:
             layer_op = cn.memory_operand_links.mem_to_layer_op(mem_op)
             tensors.append(cn.operand_tensors[layer_op])
@@ -242,7 +242,9 @@
                 link.block(block_start, duration, tensors, activity=req_bw)
         return block_start

-    def get_links_idle_window(self, links: dict, best_case_start: int, duration: int, tensors: list[Tensor]) -> int:
+    def get_links_idle_window(
+        self, links: dict["CommunicationLink", int], best_case_start: int, duration: int, tensors: list[Tensor]
+    ) -> int:
         """Return the timestep at which tensor can be transfered across the links.
         Both links must have an idle window large enough for the transfer.
         The timestep must be greater than or equal to best_case_start.
@@ -254,7 +256,7 @@ def get_links_idle_window(self, links: dict, best_case_start: int, duration: int
             tensors (list): The tensors to be transferred. Used to broadcast from previous transfer.
         """
         assert len(links) > 0
-        idle_intersections = []
+        idle_intersections: list[tuple[int, int]] = []
         for i, (link, req_bw) in enumerate(links.items()):
             req_bw = min(req_bw, link.bandwidth)  # ceil the bw
             windows = link.get_idle_window(req_bw, duration, best_case_start, tensors)
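The idle-window logic above intersects per-link idle windows. The following is a self-contained conceptual sketch of that idea, not Stream's actual implementation (which additionally handles required bandwidth and broadcast reuse):

# Conceptual sketch of get_links_idle_window: intersect the idle windows of all
# links, then return the earliest start >= best_case_start that fits `duration`.
def earliest_common_start(
    windows_per_link: list[list[tuple[int, int]]], best_case_start: int, duration: int
) -> int:
    common = windows_per_link[0]
    for windows in windows_per_link[1:]:
        # pairwise-intersect the running set of windows with the next link's windows
        common = [
            (max(s1, s2), min(e1, e2))
            for (s1, e1) in common
            for (s2, e2) in windows
            if min(e1, e2) > max(s1, s2)
        ]
    starts = [max(s, best_case_start) for (s, e) in common if max(s, best_case_start) + duration <= e]
    if not starts:
        raise ValueError("no common idle window large enough for the transfer")
    return min(starts)

# Link 0 is idle during [0, 10) and [25, 60); link 1 during [30, 100).
assert earliest_common_start([[(0, 10), (25, 60)], [(30, 100)]], best_case_start=5, duration=20) == 30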
46 changes: 22 additions & 24 deletions stream/classes/cost_model/cost_model.py
@@ -1,7 +1,6 @@
-from zigzag.workload.Workload import Workload
-
 from stream.classes.cost_model.scheduler import schedule_graph
 from stream.classes.hardware.architecture.accelerator import Accelerator
+from stream.classes.workload.onnx_workload import ComputationNodeWorkload
 from stream.visualization.memory_usage import plot_memory_usage
 from stream.visualization.schedule import plot_timeline_brokenaxes

@@ -14,26 +13,26 @@ class StreamCostModelEvaluation:

     def __init__(
         self,
-        workload: Workload,
+        workload: ComputationNodeWorkload,
         accelerator: Accelerator,
         operands_to_prefetch: list[str],
-        scheduling_order: list[int],
+        scheduling_order: list[tuple[int, int]],
     ) -> None:
         # Initialize the SCME by setting the workload graph to be scheduled
         self.workload = workload
         self.accelerator = accelerator
-        self.energy = None
-        self.total_cn_onchip_energy = None
-        self.total_cn_offchip_link_energy = None
-        self.total_cn_offchip_memory_energy = None
-        self.total_eviction_to_offchip_link_energy = None
-        self.total_eviction_to_offchip_memory_energy = None
-        self.total_sink_layer_output_offchip_link_energy = None
-        self.total_sink_layer_output_offchip_memory_energy = None
-        self.total_core_to_core_link_energy = None
-        self.total_core_to_core_memory_energy = None
+        self.energy: float | None = None
+        self.total_cn_onchip_energy: float | None = None
+        self.total_cn_offchip_link_energy: float | None = None
+        self.total_cn_offchip_memory_energy: float | None = None
+        self.total_eviction_to_offchip_link_energy: float | None = None
+        self.total_eviction_to_offchip_memory_energy: float | None = None
+        self.total_sink_layer_output_offchip_link_energy: float | None = None
+        self.total_sink_layer_output_offchip_memory_energy: float | None = None
+        self.total_core_to_core_link_energy: float | None = None
+        self.total_core_to_core_memory_energy: float | None = None

-        self.latency = None
+        self.latency: int | None = None
         self.max_memory_usage = None
         self.core_timesteps_delta_cumsums = None
         self.operands_to_prefetch = operands_to_prefetch
@@ -79,27 +78,26 @@ def run(self):

     def plot_schedule(
         self,
-        plot_full_schedule=False,
-        draw_dependencies=True,
-        plot_data_transfer=False,
-        section_start_percent=(0, 50, 95),
-        percent_shown=(5, 5, 5),
-        fig_path="outputs/schedule_plot.png",
+        plot_full_schedule: bool = False,
+        draw_dependencies: bool = True,
+        plot_data_transfer: bool = False,
+        section_start_percent: tuple[int, ...] = (0, 50, 95),
+        percent_shown: tuple[int, ...] = (5, 5, 5),
+        fig_path: str = "outputs/schedule_plot.png",
     ):
         """Plot the schedule of this SCME."""
         if plot_full_schedule:
             section_start_percent = (0,)
             percent_shown = (100,)
         plot_timeline_brokenaxes(
-            self.workload,
-            self.accelerator,
+            self,
             draw_dependencies,
             section_start_percent,
             percent_shown,
             plot_data_transfer,
             fig_path,
         )

-    def plot_memory_usage(self, fig_path="outputs/memory_usage_plot.png"):
+    def plot_memory_usage(self, fig_path: str = "outputs/memory_usage_plot.png"):
         """Plot the memory usage of this SCME."""
         plot_memory_usage(self.accelerator.memory_manager, fig_path)
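A hedged usage sketch of the class above, using the typed constructor arguments this commit introduces. The cn_workload, accelerator, and scheduling_order values are placeholders produced by earlier stages (not shown), and "W" as a prefetch operand is an assumed example:

# Illustrative only: the inputs come from the parser/partitioning stages.
scme = StreamCostModelEvaluation(
    workload=cn_workload,  # ComputationNodeWorkload, per the new type hint
    accelerator=accelerator,
    operands_to_prefetch=["W"],  # assumed example operand
    scheduling_order=scheduling_order,  # list[tuple[int, int]] after this commit
)
scme.run()
print(f"latency = {scme.latency}, energy = {scme.energy}")
scme.plot_schedule(plot_full_schedule=True)
scme.plot_memory_usage(fig_path="outputs/memory_usage_plot.png")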
50 changes: 50 additions & 0 deletions stream/classes/cost_model/group_allocation.py
@@ -0,0 +1,50 @@
+import logging
+from typing import TypeAlias
+
+from stream.classes.workload.computation_node import ComputationNode, LoopRanges
+
+logger = logging.getLogger(__name__)
+
+GroupAllocation: TypeAlias = dict[tuple[tuple[int, int], ...], int]
+
+
+class GroupIdManager:
+    def __init__(self):
+        self.__id_count = 0
+        self.groups: GroupAllocation = {}
+
+    def __get_and_raise_id(self):
+        curr_id = self.__id_count
+        self.__id_count += 1
+        return curr_id
+
+    def get_group_id(self, node: ComputationNode, loop_ranges: LoopRanges) -> int:
+        """Return the group id for the given loop ranges.
+        The group id is determined based on the relevant constant operand dimension loop ranges.
+        If there is no constant operand, we return 0.
+        If there is more than one constant operand, we only consider the last one's loop ranges.
+        If those loop ranges are already contained within 'groups' we return that group id.
+        Else we add it to the groups dict with an incremented group id.
+
+        Args:
+            node (ComputationNode): The original (layer) CN.
+            loop_ranges (dict): A dictionary containing the loop range for each dimension
+
+        Returns:
+            int: The group id for the given loop ranges
+        """
+        # No constant operand
+        if not node.constant_operands:
+            return self.__get_and_raise_id()
+
+        # Constant operand and known ranges
+        constant_operand = node.constant_operands[-1]
+        relevant_dims = node.loop_relevancy_info.get_r_layer_dims(constant_operand)
+        relevant_ranges = tuple([loop_ranges[dim] for dim in relevant_dims])
+        if relevant_ranges in self.groups:
+            return self.groups[relevant_ranges]
+
+        # Constant operand and new ranges
+        new_group_id = self.__get_and_raise_id()
+        self.groups[relevant_ranges] = new_group_id
+        return new_group_id
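A small, self-contained sketch of what get_group_id guarantees. The stub classes below only mimic the two ComputationNode attributes the method touches; they are not Stream API, and the dimension names are made up for illustration:

from stream.classes.cost_model.group_allocation import GroupIdManager

# Stub objects standing in for a real ComputationNode (illustration only).
class _StubRelevancy:
    def get_r_layer_dims(self, operand):
        return ["K", "C"]  # dims relevant to the constant operand "W"

class _StubNode:
    constant_operands = ["W"]
    loop_relevancy_info = _StubRelevancy()

manager = GroupIdManager()
node = _StubNode()

# Slices that differ only in a dimension irrelevant to "W" share a group id...
a = manager.get_group_id(node, {"K": (0, 16), "C": (0, 64), "OX": (0, 8)})
b = manager.get_group_id(node, {"K": (0, 16), "C": (0, 64), "OX": (8, 16)})
assert a == b

# ...while a new relevant range ("K") gets a fresh, incremented group id.
c = manager.get_group_id(node, {"K": (16, 32), "C": (0, 64), "OX": (0, 8)})
assert c == a + 1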
22 changes: 8 additions & 14 deletions stream/classes/cost_model/memory_manager.py
@@ -28,7 +28,7 @@ def __init__(self, accelerator: "Accelerator") -> None:
         self.top_instances: dict[Core, list[MemoryInstance]] = {}
         # memory operand stored by every top level memory
         self.memory_operands: dict[Core, list[list[MemoryOperand]]] = {}
-        for _, core in sorted([(core.id, core) for core in self.accelerator.core_iterator]):
+        for _, core in sorted([(core.id, core) for core in self.accelerator.core_list]):
             top_levels: list[MemoryLevel] = list(
                 (level for level, out_degree in core.memory_hierarchy.out_degree() if out_degree == 0)
             )
@@ -101,13 +101,10 @@ def find_tensor_in_top_instances(self, tensor: Tensor):
         return instances_storing_tensor, available_since_timesteps

     def find_tensor(self, tensor: Tensor):
-        (
-            instances_storing_tensor,
-            available_since_timesteps,
-        ) = self.find_tensor_in_top_instances(tensor)
-        cores_storing_tensor = []
-        top_instance_idxs = []
-        available_since = []
+        instances_storing_tensor, available_since_timesteps = self.find_tensor_in_top_instances(tensor)
+        cores_storing_tensor: list[int] = []
+        top_instance_idxs: list[int] = []
+        available_since: list[int] = []
         # Find which cores have these instances as their top instance
         for core, top_instances in self.top_instances.items():
             for top_instance_idx, top_instance in enumerate(top_instances):
@@ -297,12 +294,9 @@ def remove_tensor_from_top_instance(
                 )
             )
         except StopIteration:
-            # raise ValueError(
-            #     f"No tensor found equal to {tensor} in top instance {top_instance}."
-            # )
-            # If the tensor is not present, we don't have to remove it.
-            # This is possible because in "Accelerator.transfer_tensor_to_core(...)"
-            # it removes a tensor on a sender core if detects it's no longer needed there.
+            # If the tensor is not present, we don't have to remove it. This is possible because in
+            # `Accelerator.transfer_tensor_to_core(...)` it removes a tensor on a sender core if detects it's no longer
+            # needed there.
             return
         self.top_instance_stored_tensors[top_instance].remove(equivalent_tensor)
         del self.top_instance_available_since_timestep[top_instance][tensor.equality_hash()]