diff --git a/models/experimental/functional_yolov4/reference/downsample1.py b/models/experimental/functional_yolov4/reference/downsample1.py
new file mode 100644
index 00000000000..2addb26824d
--- /dev/null
+++ b/models/experimental/functional_yolov4/reference/downsample1.py
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+import torch
+import torch.nn as nn
+
+
+class DownSample1(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.c1 = nn.Conv2d(3, 32, 3, 1, 1, bias=False)
+        self.b1 = nn.BatchNorm2d(32)
+        self.relu = nn.ReLU(inplace=True)
+
+        self.c2 = nn.Conv2d(32, 64, 3, 2, 1, bias=False)
+        self.b2 = nn.BatchNorm2d(64)
+
+        self.c3 = nn.Conv2d(64, 64, 1, 1, 0, bias=False)
+        self.b3 = nn.BatchNorm2d(64)
+
+        self.c4 = nn.Conv2d(64, 64, 1, 1, 0, bias=False)
+        self.b4 = nn.BatchNorm2d(64)
+
+        self.c5 = nn.Conv2d(64, 32, 1, 1, 0, bias=False)
+        self.b5 = nn.BatchNorm2d(32)
+
+        self.c6 = nn.Conv2d(32, 64, 3, 1, 1, bias=False)
+        self.b6 = nn.BatchNorm2d(64)
+
+        self.c7 = nn.Conv2d(64, 64, 1, 1, 0, bias=False)
+        self.b7 = nn.BatchNorm2d(64)
+
+        self.c8 = nn.Conv2d(128, 64, 1, 1, 0, bias=False)
+        self.b8 = nn.BatchNorm2d(64)
+
+    def forward(self, input: torch.Tensor):
+        x1 = self.c1(input)
+        x1_b = self.b1(x1)
+        x1_m = self.relu(x1_b)
+
+        x2 = self.c2(x1_m)
+        x2_b = self.b2(x2)
+        x2_m = self.relu(x2_b)
+
+        x3 = self.c3(x2_m)
+        x3_b = self.b3(x3)
+        x3_m = self.relu(x3_b)
+
+        x4 = self.c4(x2_m)
+        x4_b = self.b4(x4)
+        x4_m = self.relu(x4_b)
+
+        x5 = self.c5(x4_m)
+        x5_b = self.b5(x5)
+        x5_m = self.relu(x5_b)
+
+        x6 = self.c6(x5_m)
+        x6_b = self.b6(x6)
+        x6_m = self.relu(x6_b)
+        x6_m = x6_m + x4_m
+
+        x7 = self.c7(x6_m)
+        x7_b = self.b7(x7)
+        x7_m = self.relu(x7_b)
+        x7_m = torch.cat([x7_m, x3_m], dim=1)
+
+        x8 = self.c8(x7_m)
+        x8_b = self.b8(x8)
+        x8_m = self.relu(x8_b)
+
+        return x8_m
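A quick shape sanity check for this reference module may help reviewers; it is not part of the diff, and the 320x320 resolution is an assumption (YOLOv4 inputs are typically square multiples of 32). Note that this functional reference uses ReLU throughout, whereas the original darknet YOLOv4 backbone uses Mish, so numerics will differ from pretrained darknet weights.

```python
import torch

from models.experimental.functional_yolov4.reference.downsample1 import DownSample1

model = DownSample1().eval()
with torch.no_grad():
    out = model(torch.rand(1, 3, 320, 320))
# c2 is the only strided conv, so spatial dims halve and channels go 3 -> 64.
assert out.shape == (1, 64, 160, 160)
```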
diff --git a/models/experimental/functional_yolov4/reference/downsample2.py b/models/experimental/functional_yolov4/reference/downsample2.py
new file mode 100644
index 00000000000..39f9fd4b52d
--- /dev/null
+++ b/models/experimental/functional_yolov4/reference/downsample2.py
@@ -0,0 +1,57 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+import torch
+import torch.nn as nn
+from models.experimental.functional_yolov4.reference.resblock import ResBlock
+
+
+class DownSample2(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.c1 = nn.Conv2d(64, 128, 3, 2, 1, bias=False)
+        self.b1 = nn.BatchNorm2d(128)
+        self.relu = nn.ReLU(inplace=True)
+
+        self.c2 = nn.Conv2d(128, 64, 1, 1, bias=False)
+        self.b2 = nn.BatchNorm2d(64)
+
+        self.c3 = nn.Conv2d(128, 64, 1, 1, bias=False)
+        self.b3 = nn.BatchNorm2d(64)
+
+        self.res = ResBlock(64, 2)
+
+        self.c4 = nn.Conv2d(64, 64, 1, 1, bias=False)
+        self.b4 = nn.BatchNorm2d(64)
+
+        self.c5 = nn.Conv2d(128, 128, 1, 1, bias=False)
+        self.b5 = nn.BatchNorm2d(128)
+
+    def forward(self, input: torch.Tensor):
+        x1 = self.c1(input)
+        x1_b = self.b1(x1)
+        x1_m = self.relu(x1_b)
+
+        x2 = self.c2(x1_m)
+        x2_b = self.b2(x2)
+        x2_m = self.relu(x2_b)
+
+        x3 = self.c3(x1_m)
+        x3_b = self.b3(x3)
+        x3_m = self.relu(x3_b)
+
+        r1 = self.res(x3_m)
+
+        x4 = self.c4(r1)
+        x4_b = self.b4(x4)
+        x4_m = self.relu(x4_b)
+
+        x4_m = torch.cat([x4_m, x2_m], dim=1)
+
+        x5 = self.c5(x4_m)
+        x5_b = self.b5(x5)
+        x5_m = self.relu(x5_b)
+
+        return x5_m
diff --git a/models/experimental/functional_yolov4/reference/downsample3.py b/models/experimental/functional_yolov4/reference/downsample3.py
new file mode 100644
index 00000000000..8a8a15ea162
--- /dev/null
+++ b/models/experimental/functional_yolov4/reference/downsample3.py
@@ -0,0 +1,56 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+import torch
+import torch.nn as nn
+from models.experimental.functional_yolov4.reference.resblock import ResBlock
+
+
+class DownSample3(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.c1 = nn.Conv2d(128, 256, 3, 2, 1, bias=False)
+        self.b1 = nn.BatchNorm2d(256)
+        self.relu = nn.ReLU(inplace=True)
+
+        self.c2 = nn.Conv2d(256, 128, 1, 1, bias=False)
+        self.b2 = nn.BatchNorm2d(128)
+
+        self.c3 = nn.Conv2d(256, 128, 1, 1, bias=False)
+        self.b3 = nn.BatchNorm2d(128)
+
+        self.res = ResBlock(128, 8)
+
+        self.c4 = nn.Conv2d(128, 128, 1, 1, bias=False)
+        self.b4 = nn.BatchNorm2d(128)
+
+        self.c5 = nn.Conv2d(256, 256, 1, 1, bias=False)
+        self.b5 = nn.BatchNorm2d(256)
+
+    def forward(self, input: torch.Tensor):
+        x1 = self.c1(input)
+        x1_b = self.b1(x1)
+        x1_m = self.relu(x1_b)
+
+        x2 = self.c2(x1_m)
+        x2_b = self.b2(x2)
+        x2_m = self.relu(x2_b)
+
+        x3 = self.c3(x1_m)
+        x3_b = self.b3(x3)
+        x3_m = self.relu(x3_b)
+
+        r1 = self.res(x3_m)
+
+        x4 = self.c4(r1)
+        x4_b = self.b4(x4)
+        x4_m = self.relu(x4_b)
+
+        x4_m = torch.cat([x4_m, x2_m], dim=1)
+
+        x5 = self.c5(x4_m)
+        x5_b = self.b5(x5)
+        x5_m = self.relu(x5_b)
+        return x5_m
diff --git a/models/experimental/functional_yolov4/reference/downsample4.py b/models/experimental/functional_yolov4/reference/downsample4.py
new file mode 100644
index 00000000000..4538c9293c5
--- /dev/null
+++ b/models/experimental/functional_yolov4/reference/downsample4.py
@@ -0,0 +1,58 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+import torch
+import torch.nn as nn
+from models.experimental.functional_yolov4.reference.resblock import ResBlock
+
+
+class DownSample4(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.c1 = nn.Conv2d(256, 512, 3, 2, 1, bias=False)
+        self.b1 = nn.BatchNorm2d(512)
+        self.relu = nn.ReLU(inplace=True)
+
+        self.c2 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
+        self.b2 = nn.BatchNorm2d(256)
+
+        self.c3 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
+        self.b3 = nn.BatchNorm2d(256)
+
+        self.res = ResBlock(256, 8)
+
+        self.c4 = nn.Conv2d(256, 256, 1, 1, 0, bias=False)
+        self.b4 = nn.BatchNorm2d(256)
+
+        self.c5 = nn.Conv2d(512, 512, 1, 1, 0, bias=False)
+        self.b5 = nn.BatchNorm2d(512)
+
+    def forward(self, input: torch.Tensor):
+        x1 = self.c1(input)
+        x1_b = self.b1(x1)
+        x1_m = self.relu(x1_b)
+
+        x2 = self.c2(x1_m)
+        x2_b = self.b2(x2)
+        x2_m = self.relu(x2_b)
+
+        x3 = self.c3(x1_m)
+        x3_b = self.b3(x3)
+        x3_m = self.relu(x3_b)
+
+        # resblock
+        r = self.res(x3_m)
+
+        x4 = self.c4(r)
+        x4_b = self.b4(x4)
+        x4_m = self.relu(x4_b)
+
+        x4_m = torch.cat([x4_m, x2_m], dim=1)
+
+        x5 = self.c5(x4_m)
+        x5_b = self.b5(x5)
+        x5_m = self.relu(x5_b)
+
+        return x5_m
diff --git a/models/experimental/functional_yolov4/reference/downsample5.py b/models/experimental/functional_yolov4/reference/downsample5.py
new file mode 100644
index 00000000000..5e2200346ac
--- /dev/null
+++ b/models/experimental/functional_yolov4/reference/downsample5.py
@@ -0,0 +1,58 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+import torch
+import torch.nn as nn
+from models.experimental.functional_yolov4.reference.resblock import ResBlock
+
+
+class DownSample5(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.c1 = nn.Conv2d(512, 1024, 3, 2, 1, bias=False)
+        self.b1 = nn.BatchNorm2d(1024)
+        self.relu = nn.ReLU(inplace=True)
+
+        self.c2 = nn.Conv2d(1024, 512, 1, 1, bias=False)
+        self.b2 = nn.BatchNorm2d(512)
+
+        self.c3 = nn.Conv2d(1024, 512, 1, 1, bias=False)
+        self.b3 = nn.BatchNorm2d(512)
+
+        self.res = ResBlock(512, 4)
+
+        self.c4 = nn.Conv2d(512, 512, 1, 1, bias=False)
+        self.b4 = nn.BatchNorm2d(512)
+
+        self.c5 = nn.Conv2d(1024, 1024, 1, 1, bias=False)
+        self.b5 = nn.BatchNorm2d(1024)
+
+    def forward(self, input: torch.Tensor):
+        x1 = self.c1(input)
+        x1_b = self.b1(x1)
+        x1_m = self.relu(x1_b)
+
+        x2 = self.c2(x1_m)
+        x2_b = self.b2(x2)
+        x2_m = self.relu(x2_b)
+
+        x3 = self.c3(x1_m)
+        x3_b = self.b3(x3)
+        x3_m = self.relu(x3_b)
+
+        # resblock
+        r = self.res(x3_m)
+
+        x4 = self.c4(r)
+        x4_b = self.b4(x4)
+        x4_m = self.relu(x4_b)
+
+        x4_m = torch.cat([x4_m, x2_m], dim=1)
+
+        x5 = self.c5(x4_m)
+        x5_b = self.b5(x5)
+        x5_m = self.relu(x5_b)
+
+        return x5_m
diff --git a/models/experimental/functional_yolov4/reference/head.py b/models/experimental/functional_yolov4/reference/head.py
new file mode 100644
index 00000000000..d8f7277c7ac
--- /dev/null
+++ b/models/experimental/functional_yolov4/reference/head.py
@@ -0,0 +1,147 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.nn as nn
+
+
+class Head(nn.Module):
+    def __init__(self):
+        super().__init__()
+        # left side of graph
+        # in_chan, out_chan, kernel, stride,
+        output_ch = 255
+
+        self.c1 = nn.Conv2d(128, 256, 3, 1, 1, bias=False)
+        self.b1 = nn.BatchNorm2d(256)
+        self.relu = nn.ReLU(inplace=True)
+
+        self.c2 = nn.Conv2d(256, output_ch, 1, 1, 0, bias=True)
+
+        # R -4
+        self.c3 = nn.Conv2d(128, 256, 3, 2, 1, bias=False)
+        self.b3 = nn.BatchNorm2d(256)
+
+        # R -1 -16
+        self.c4 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
+        self.b4 = nn.BatchNorm2d(256)
+
+        self.c5 = nn.Conv2d(256, 512, 3, 1, 1, bias=False)
+        self.b5 = nn.BatchNorm2d(512)
+
+        self.c6 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
+        self.b6 = nn.BatchNorm2d(256)
+
+        self.c7 = nn.Conv2d(256, 512, 3, 1, 1, bias=False)
+        self.b7 = nn.BatchNorm2d(512)
+
+        self.c8 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
+        self.b8 = nn.BatchNorm2d(256)
+
+        self.c9 = nn.Conv2d(256, 512, 3, 1, 1, bias=False)
+        self.b9 = nn.BatchNorm2d(512)
+
+        self.c10 = nn.Conv2d(512, output_ch, 1, 1, 0, bias=True)
+
+        # R -4
+        self.c11 = nn.Conv2d(256, 512, 3, 2, 1, bias=False)
+        self.b11 = nn.BatchNorm2d(512)
+
+        self.c12 = nn.Conv2d(1024, 512, 1, 1, 0, bias=False)
+        self.b12 = nn.BatchNorm2d(512)
+
+        self.c13 = nn.Conv2d(512, 1024, 3, 1, 1, bias=False)
+        self.b13 = nn.BatchNorm2d(1024)
+
+        self.c14 = nn.Conv2d(1024, 512, 1, 1, 0, bias=False)
+        self.b14 = nn.BatchNorm2d(512)
+
+        self.c15 = nn.Conv2d(512, 1024, 3, 1, 1, bias=False)
+        self.b15 = nn.BatchNorm2d(1024)
+
+        self.c16 = nn.Conv2d(1024, 512, 1, 1, 0, bias=False)
+        self.b16 = nn.BatchNorm2d(512)
+
+        self.c17 = nn.Conv2d(512, 1024, 3, 1, 1, bias=False)
+        self.b17 = nn.BatchNorm2d(1024)
+
+        self.c18 = nn.Conv2d(1024, output_ch, 1, 1, 0, bias=True)
+
+    def forward(self, inputs):
+        x1 = self.c1(inputs[0])
+        x1 = self.b1(x1)
+        x1 = self.relu(x1)
+
+        x2 = self.c2(x1)
+
+        x3 = self.c3(inputs[0])
+        x3 = self.b3(x3)
+        x3 = self.relu(x3)
+
+        # R -1 -16
+        outfromNeck1 = inputs[2]  # torch.ones(1, 256, 20, 20)
+        conc1 = torch.cat([x3, outfromNeck1], dim=1)
+
+        x4 = self.c4(conc1)
+        x4 = self.b4(x4)
+        x4 = self.relu(x4)
+
+        x5 = self.c5(x4)
+        x5 = self.b5(x5)
+        x5 = self.relu(x5)
+
+        x6 = self.c6(x5)
+        x6 = self.b6(x6)
+        x6 = self.relu(x6)
+
+        x7 = self.c7(x6)
+        x7 = self.b7(x7)
+        x7 = self.relu(x7)
+
+        x8 = self.c8(x7)
+        x8 = self.b8(x8)
+        x8 = self.relu(x8)
+
+        x9 = self.c9(x8)
+        x9 = self.b9(x9)
+        x9 = self.relu(x9)
+
+        x10 = self.c10(x9)
+
+        # R -4
+        x11 = self.c11(x8)
+        x11 = self.b11(x11)
+        x11 = self.relu(x11)
+
+        # R -1 -37
+        outfromNeck2 = inputs[1]  # torch.ones(1, 512, 10, 10)
+        conc2 = torch.cat([x11, outfromNeck2], dim=1)
+
+        x12 = self.c12(conc2)
+        x12 = self.b12(x12)
+        x12 = self.relu(x12)
+
+        x13 = self.c13(x12)
+        x13 = self.b13(x13)
+        x13 = self.relu(x13)
+
+        x14 = self.c14(x13)
+        x14 = self.b14(x14)
+        x14 = self.relu(x14)
+
+        x15 = self.c15(x14)
+        x15 = self.b15(x15)
+        x15 = self.relu(x15)
+
+        x16 = self.c16(x15)
+        x16 = self.b16(x16)
+        x16 = self.relu(x16)
+
+        x17 = self.c17(x16)
+        x17 = self.b17(x17)
+        x17 = self.relu(x17)
+
+        x18 = self.c18(x17)
+
+        return x2, x10, x18
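For reviewers unfamiliar with YOLO heads: the hard-coded `output_ch = 255` matches a COCO-style configuration, 3 anchor boxes per grid cell, each predicting (x, y, w, h, objectness) plus per-class scores. The class count is not stated in this diff, so 80 (COCO) is an assumption here:

```python
num_anchors = 3
num_classes = 80  # assumed: COCO
output_ch = num_anchors * (5 + num_classes)  # 5 = x, y, w, h, objectness
assert output_ch == 255
```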
diff --git a/models/experimental/functional_yolov4/reference/neck.py b/models/experimental/functional_yolov4/reference/neck.py
new file mode 100644
index 00000000000..b02eac68f64
--- /dev/null
+++ b/models/experimental/functional_yolov4/reference/neck.py
@@ -0,0 +1,205 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+import torch
+import torch.nn as nn
+
+
+class Neck(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.c1 = nn.Conv2d(1024, 512, 1, 1, 0, bias=False)
+        self.b1 = nn.BatchNorm2d(512)
+        self.relu = nn.ReLU(inplace=True)
+
+        self.c2 = nn.Conv2d(512, 1024, 3, 1, 1, bias=False)
+        self.b2 = nn.BatchNorm2d(1024)
+
+        self.c3 = nn.Conv2d(1024, 512, 1, 1, 0, bias=False)
+        self.b3 = nn.BatchNorm2d(512)
+
+        # 3 maxpools
+        self.p1 = nn.MaxPool2d(kernel_size=5, stride=1, padding=2, dilation=1, ceil_mode=False)
+        self.p2 = nn.MaxPool2d(kernel_size=9, stride=1, padding=4, dilation=1, ceil_mode=False)
+        self.p3 = nn.MaxPool2d(kernel_size=13, stride=1, padding=6, dilation=1, ceil_mode=False)
+        ####
+
+        self.c4 = nn.Conv2d(2048, 512, 1, 1, 0, bias=False)
+        self.b4 = nn.BatchNorm2d(512)
+
+        self.c5 = nn.Conv2d(512, 1024, 3, 1, 1, bias=False)
+        self.b5 = nn.BatchNorm2d(1024)
+
+        self.c6 = nn.Conv2d(1024, 512, 1, 1, 0, bias=False)
+        self.b6 = nn.BatchNorm2d(512)
+
+        self.c7 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
+        self.b7 = nn.BatchNorm2d(256)
+
+        # 2 upsample2d
+        self.u = nn.Upsample(scale_factor=(2, 2), mode="nearest")
+
+        self.c7_2 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
+        self.b7_2 = nn.BatchNorm2d(256)
+
+        self.c7_3 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
+        self.b7_3 = nn.BatchNorm2d(256)
+
+        self.c8 = nn.Conv2d(256, 512, 3, 1, 1, bias=False)
+        self.b8 = nn.BatchNorm2d(512)
+
+        self.c7_4 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
+        self.b7_4 = nn.BatchNorm2d(256)
+
+        self.c8_2 = nn.Conv2d(256, 512, 3, 1, 1, bias=False)
+        self.b8_2 = nn.BatchNorm2d(512)
+
+        self.c7_5 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
+        self.b7_5 = nn.BatchNorm2d(256)
+
+        self.c9 = nn.Conv2d(256, 128, 1, 1, 0, bias=False)
+        self.b9 = nn.BatchNorm2d(128)
+
+        self.c9_2 = nn.Conv2d(256, 128, 1, 1, 0, bias=False)
+        self.b9_2 = nn.BatchNorm2d(128)
+        self.c9_3 = nn.Conv2d(256, 128, 1, 1, 0, bias=False)
+        self.b9_3 = nn.BatchNorm2d(128)
+
+        self.c10 = nn.Conv2d(128, 256, 3, 1, 1, bias=False)
+        self.b10 = nn.BatchNorm2d(256)
+
+        self.c9_4 = nn.Conv2d(256, 128, 1, 1, 0, bias=False)
+        self.b9_4 = nn.BatchNorm2d(128)
+        self.c10_2 = nn.Conv2d(128, 256, 3, 1, 1, bias=False)
+        self.b10_2 = nn.BatchNorm2d(256)
+        self.c9_5 = nn.Conv2d(256, 128, 1, 1, 0, bias=False)
+        self.b9_5 = nn.BatchNorm2d(128)
+
+    def forward(self, inputs):
+        # 3 CBN blocks
+        x1 = self.c1(inputs[0])
+        x1_b = self.b1(x1)
+        x1_m = self.relu(x1_b)
+
+        x2 = self.c2(x1_m)
+        x2_b = self.b2(x2)
+        x2_m = self.relu(x2_b)
+
+        x3 = self.c3(x2_m)
+        x3_b = self.b3(x3)
+        x3_m = self.relu(x3_b)
+        # maxpools
+        x4 = self.p1(x3_m)
+        x5 = self.p2(x3_m)
+        x6 = self.p3(x3_m)
+
+        # concat the outputs of maxpool and x3_m
+        conc1 = torch.cat([x4, x5, x6, x3_m], dim=1)
+
+        # 4 back2back CBRs
+        # CBR4-1
+        x7 = self.c4(conc1)
+        x7_b = self.b4(x7)
+        x7_m = self.relu(x7_b)
+
+        # CBR4-2
+        x8 = self.c5(x7_m)
+        x8_b = self.b5(x8)
+        x8_m = self.relu(x8_b)
+
+        # CBR4-3
+        x9 = self.c6(x8_m)
+        x9_b = self.b6(x9)
+        x9_m = self.relu(x9_b)
+
+        # CBR4-4
+        x10 = self.c7(x9_m)
+        x10_b = self.b7(x10)
+        x10_m = self.relu(x10_b)
+
+        # upsample
+        u1 = self.u(x10_m)
+
+        # Next CBR block, to be concatenated with the output of u1;
+        # it takes the output of the downsample4 module, of shape [1, 512, 20, 20]; a random tensor of that shape can stand in when running the neck unit test stand-alone
+        outDownSample4 = inputs[1]
+        # CBR block for conc2
+        x11 = self.c7_2(outDownSample4)
+        x11_b = self.b7_2(x11)
+        x11_m = self.relu(x11_b)
+
+        # concat CBR output with output from u1
+        conc2 = torch.cat([u1, x11_m], dim=1)
+
+        # 6 back2back CBRs
+        # CBR6_1
+        x12 = self.c7_3(conc2)
+        x12_b = self.b7_3(x12)
+        x12_m = self.relu(x12_b)
+
+        # CBR6_2
+        x13 = self.c8(x12_m)
+        x13_b = self.b8(x13)
+        x13_m = self.relu(x13_b)
+
+        # CBR6_3
+        x14 = self.c7_4(x13_m)
+        x14_b = self.b7_4(x14)
+        x14_m = self.relu(x14_b)
+
+        # CBR6_4
+        x15 = self.c8_2(x14_m)
+        x15_b = self.b8_2(x15)
+        x15_m = self.relu(x15_b)
+
+        # CBR6_5
+        x16 = self.c7_5(x15_m)
+        x16_b = self.b7_5(x16)
+        x16_m = self.relu(x16_b)
+
+        # CBR6_6
+        x17 = self.c9(x16_m)
+        x17_b = self.b9(x17)
+        x17_m = self.relu(x17_b)
+
+        # upsample
+        u2 = self.u(x17_m)
+
+        # CBR block for conc3
+        outDownSample3 = inputs[2]
+        x18 = self.c9_2(outDownSample3)
+        x18_b = self.b9_2(x18)
+        x18_m = self.relu(x18_b)
+
+        # concat CBR output with output from u2
+        conc3 = torch.cat([u2, x18_m], dim=1)
+
+        # 5 CBR blocks
+        # CBR5_1
+        x19 = self.c9_3(conc3)
+        x19_b = self.b9_3(x19)
+        x19_m = self.relu(x19_b)
+
+        # CBR5_2
+        x20 = self.c10(x19_m)
+        x20_b = self.b10(x20)
+        x20_m = self.relu(x20_b)
+
+        # CBR5_3
+        x21 = self.c9_4(x20_m)
+        x21_b = self.b9_4(x21)
+        x21_m = self.relu(x21_b)
+
+        # CBR5_4
+        x22 = self.c10_2(x21_m)
+        x22_b = self.b10_2(x22)
+        x22_m = self.relu(x22_b)
+
+        # CBR5_5
+        x23 = self.c9_5(x22_m)
+        x23_b = self.b9_5(x23)
+        x23_m = self.relu(x23_b)
+        # return [x4, x4, x4]
+        return x23_m, x9_m, x16_m
diff --git a/models/experimental/functional_yolov4/reference/resblock.py b/models/experimental/functional_yolov4/reference/resblock.py
new file mode 100644
index 00000000000..dbd65796983
--- /dev/null
+++ b/models/experimental/functional_yolov4/reference/resblock.py
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+import torch.nn as nn
+
+
+class ResBlock(nn.Module):
+    def __init__(self, ch, nblocks=1, shortcut=True):
+        super().__init__()
+        self.shortcut = shortcut
+        self.module_list = nn.ModuleList()
+        for i in range(nblocks):
+            conv1 = nn.Conv2d(ch, ch, 1, 1, 0, bias=False)
+            bn1 = nn.BatchNorm2d(ch)
+            relu1 = nn.ReLU(inplace=True)
+            conv2 = nn.Conv2d(ch, ch, 3, 1, 1, bias=False)
+            bn2 = nn.BatchNorm2d(ch)
+            relu2 = nn.ReLU(inplace=True)
+            resblock_one = nn.ModuleList([conv1, bn1, relu1, conv2, bn2, relu2])
+            self.module_list.append(resblock_one)
+
+    def forward(self, x):
+        for module in self.module_list:
+            h = x
+            for res in module:
+                h = res(h)
+            x = x + h if self.shortcut else h
+        return x
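Since every inner 1x1 and 3x3 conv in `ResBlock` preserves both channel count and spatial size, the residual add is always shape-compatible and the whole block is shape-preserving. A small check (not part of the diff; the 40x40 size is arbitrary):

```python
import torch

from models.experimental.functional_yolov4.reference.resblock import ResBlock

block = ResBlock(ch=64, nblocks=2).eval()
x = torch.rand(1, 64, 40, 40)  # channel count must match `ch`
with torch.no_grad():
    y = block(x)
assert y.shape == x.shape  # residual blocks leave the shape unchanged
```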
diff --git a/models/experimental/functional_yolov4/reference/yolov4.py b/models/experimental/functional_yolov4/reference/yolov4.py
new file mode 100644
index 00000000000..8f1e573edec
--- /dev/null
+++ b/models/experimental/functional_yolov4/reference/yolov4.py
@@ -0,0 +1,38 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+from models.experimental.functional_yolov4.reference.downsample1 import DownSample1
+from models.experimental.functional_yolov4.reference.downsample2 import DownSample2
+from models.experimental.functional_yolov4.reference.downsample3 import DownSample3
+from models.experimental.functional_yolov4.reference.downsample4 import DownSample4
+from models.experimental.functional_yolov4.reference.downsample5 import DownSample5
+from models.experimental.functional_yolov4.reference.neck import Neck
+from models.experimental.functional_yolov4.reference.head import Head
+
+import torch
+import torch.nn as nn
+
+
+class Yolov4(nn.Module):
+    def __init__(self):
+        super(Yolov4, self).__init__()
+        self.downsample1 = DownSample1()
+        self.downsample2 = DownSample2()
+        self.downsample3 = DownSample3()
+        self.downsample4 = DownSample4()
+        self.downsample5 = DownSample5()
+        self.neck = Neck()
+        self.head = Head()
+
+    def forward(self, input: torch.Tensor):
+        d1 = self.downsample1(input)
+        d2 = self.downsample2(d1)
+        d3 = self.downsample3(d2)
+        d4 = self.downsample4(d3)
+        d5 = self.downsample5(d4)
+        x20, x13, x6 = self.neck([d5, d4, d3])
+        x4, x5, x6 = self.head([x20, x13, x6])
+
+        return x4, x5, x6
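An end-to-end shape check for the assembled reference model may be useful when reviewing; it is not part of this diff, the model is randomly initialized (no pretrained weights), and 320x320 input is an assumption:

```python
import torch

from models.experimental.functional_yolov4.reference.yolov4 import Yolov4

model = Yolov4().eval()
with torch.no_grad():
    y1, y2, y3 = model(torch.rand(1, 3, 320, 320))
# Three detection scales at strides 8, 16 and 32 of the 320x320 input,
# each with 255 output channels (see the head's output_ch).
assert y1.shape == (1, 255, 40, 40)
assert y2.shape == (1, 255, 20, 20)
assert y3.shape == (1, 255, 10, 10)
```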
diff --git a/models/experimental/functional_yolov4/tt/ttnn_downsample1.py b/models/experimental/functional_yolov4/tt/ttnn_downsample1.py
new file mode 100644
index 00000000000..7e1ea1dc18b
--- /dev/null
+++ b/models/experimental/functional_yolov4/tt/ttnn_downsample1.py
@@ -0,0 +1,53 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+import torch
+import torch.nn as nn
+
+import ttnn
+import tt_lib
+from ttnn.model_preprocessing import preprocess_model
+
+
+class TtDownSample1:
+    def __init__(
+        self,
+        parameters,
+    ) -> None:
+        self.c1 = parameters.c1
+        self.c2 = parameters.c2
+        self.c3 = parameters.c3
+        self.c4 = parameters.c4
+        self.c5 = parameters.c5
+        self.c6 = parameters.c6
+        self.c7 = parameters.c7
+        self.c8 = parameters.c8
+
+    def __call__(self, device, input_tensor):
+        input_tensor = input_tensor.to(device, self.c1.conv.input_sharded_memory_config)
+
+        output_tensor = self.c1(input_tensor)
+        output_tensor = self.c2(output_tensor)
+        output_tensor_c2 = output_tensor
+        output_tensor = self.c3(output_tensor)
+
+        output_tensor_c3 = output_tensor
+        output_tensor = self.c4(output_tensor_c2)
+
+        output_tensor_c4 = output_tensor
+        output_tensor = self.c5(output_tensor)
+        output_tensor = self.c6(output_tensor)
+
+        output_tensor = output_tensor + output_tensor_c4
+        output_tensor = self.c7(output_tensor)
+
+        output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
+        output_tensor = ttnn.concat([output_tensor, output_tensor_c3], dim=3)
+
+        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c8.conv.input_sharded_memory_config)
+        output_tensor = self.c8(output_tensor)
+
+        return ttnn.from_device(output_tensor)
diff --git a/models/experimental/functional_yolov4/tt/ttnn_downsample2.py b/models/experimental/functional_yolov4/tt/ttnn_downsample2.py
new file mode 100644
index 00000000000..8065ee6f9a9
--- /dev/null
+++ b/models/experimental/functional_yolov4/tt/ttnn_downsample2.py
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+import torch
+import torch.nn as nn
+
+import ttnn
+import tt_lib
+from ttnn.model_preprocessing import preprocess_model
+from models.experimental.functional_yolov4.tt.ttnn_resblock import TtResBlock
+
+
+class TtDownSample2:
+    def __init__(
+        self,
+        parameters,
+    ) -> None:
+        self.c1 = parameters.c1
+        self.c2 = parameters.c2
+        self.c3 = parameters.c3
+        self.res = TtResBlock(parameters.res, 2, True)
+        self.c4 = parameters.c4
+        self.c5 = parameters.c5
+
+    def __call__(self, device, input_tensor):
+        input_tensor = input_tensor.to(device, self.c1.conv.input_sharded_memory_config)
+
+        output_tensor = self.c1(input_tensor)
+        output_tensor_c1 = output_tensor
+        output_tensor = self.c2(output_tensor)
+        output_tensor_c2 = output_tensor
+        output_tensor = self.c3(output_tensor_c1)
+        output_tensor = self.res(device, output_tensor)
+
+        output_tensor = output_tensor.to(device, self.c4.conv.input_sharded_memory_config)
+        output_tensor = self.c4(output_tensor)
+        output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
+        output_tensor = ttnn.concat([output_tensor, output_tensor_c2], dim=3)
+
+        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c5.conv.input_sharded_memory_config)
+        output_tensor = self.c5(output_tensor)
+        return ttnn.from_device(output_tensor)
diff --git a/models/experimental/functional_yolov4/tt/ttnn_downsample3.py b/models/experimental/functional_yolov4/tt/ttnn_downsample3.py
new file mode 100644
index 00000000000..625370da430
--- /dev/null
+++ b/models/experimental/functional_yolov4/tt/ttnn_downsample3.py
@@ -0,0 +1,44 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+import torch
+import torch.nn as nn
+
+import ttnn
+import tt_lib
+from ttnn.model_preprocessing import preprocess_model
+from models.experimental.functional_yolov4.tt.ttnn_resblock import TtResBlock
+
+
+class TtDownSample3:
+    def __init__(
+        self,
+        parameters,
+    ) -> None:
+        self.c1 = parameters.c1
+        self.c2 = parameters.c2
+        self.c3 = parameters.c3
+        self.res = TtResBlock(parameters.res, 8, True)
+        self.c4 = parameters.c4
+        self.c5 = parameters.c5
+
+    def __call__(self, device, input_tensor):
+        input_tensor = input_tensor.to(device, self.c1.conv.input_sharded_memory_config)
+
+        output_tensor_c1 = self.c1(input_tensor)
+        output_tensor_c2 = self.c2(output_tensor_c1)
+        output_tensor = self.c3(output_tensor_c1)
+
+        output_tensor = self.res(device, output_tensor)
+        output_tensor = output_tensor.to(device, self.c4.conv.input_sharded_memory_config)
+
+        output_tensor = self.c4(output_tensor)
+        output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
+        output_tensor = ttnn.concat([output_tensor, output_tensor_c2], dim=3)
+        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c5.conv.input_sharded_memory_config)
+        output_tensor = self.c5(output_tensor)
+
+        return ttnn.from_device(output_tensor)
diff --git a/models/experimental/functional_yolov4/tt/ttnn_downsample4.py b/models/experimental/functional_yolov4/tt/ttnn_downsample4.py
new file mode 100644
index 00000000000..04458b0c950
--- /dev/null
+++ b/models/experimental/functional_yolov4/tt/ttnn_downsample4.py
@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+import torch
+import torch.nn as nn
+
+import ttnn
+import tt_lib
+from ttnn.model_preprocessing import preprocess_model
+from models.experimental.functional_yolov4.tt.ttnn_resblock import TtResBlock
+
+
+class TtDownSample4:
+    def __init__(
+        self,
+        parameters,
+    ) -> None:
+        self.c1 = parameters.c1
+        self.c2 = parameters.c2
+        self.c3 = parameters.c3
+        self.res = TtResBlock(parameters.res, 8, True)
+        self.c4 = parameters.c4
+        self.c5 = parameters.c5
+
+    def __call__(self, device, input_tensor):
+        input_tensor = input_tensor.to(device, self.c1.conv.input_sharded_memory_config)
+        output_tensor = self.c1(input_tensor)
+        output_tensor_c1 = output_tensor
+        output_tensor = self.c2(output_tensor)
+        output_tensor_c2 = output_tensor
+        output_tensor = self.c3(output_tensor_c1)
+        output_tensor = self.res(device, output_tensor)
+        output_tensor = output_tensor.to(device, self.c4.conv.input_sharded_memory_config)
+        output_tensor = self.c4(output_tensor)
+
+        output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
+        output_tensor = ttnn.concat([output_tensor, output_tensor_c2], dim=3)
+        output_tensor = output_tensor.to(device, self.c5.conv.input_sharded_memory_config)
+
+        output_tensor = self.c5(output_tensor)
+        output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
+        return ttnn.from_device(output_tensor)
diff --git a/models/experimental/functional_yolov4/tt/ttnn_downsample5.py b/models/experimental/functional_yolov4/tt/ttnn_downsample5.py
new file mode 100644
index 00000000000..02c284621d1
--- /dev/null
+++ b/models/experimental/functional_yolov4/tt/ttnn_downsample5.py
@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+import torch
+import torch.nn as nn
+
+import ttnn
+import tt_lib
+from ttnn.model_preprocessing import preprocess_model
+from models.experimental.functional_yolov4.tt.ttnn_resblock import TtResBlock
+
+
+class TtDownSample5:
+    def __init__(
+        self,
+        parameters,
+    ) -> None:
+        self.c1 = parameters.c1
+        self.c2 = parameters.c2
+        self.c3 = parameters.c3
+        self.res = TtResBlock(parameters.res, 4, True)
+        self.c4 = parameters.c4
+        self.c5 = parameters.c5
+
+    def __call__(self, device, input_tensor):
+        input_tensor = input_tensor.to(device, self.c1.conv.input_sharded_memory_config)
+        output_tensor = self.c1(input_tensor)
+        output_tensor_c1 = output_tensor
+        output_tensor = self.c2(output_tensor)
+        output_tensor_c2 = output_tensor
+        output_tensor = self.c3(output_tensor_c1)
+        output_tensor = self.res(device, output_tensor)
+        output_tensor = output_tensor.to(device, self.c4.conv.input_sharded_memory_config)
+        output_tensor = self.c4(output_tensor)
+
+        output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
+        output_tensor = ttnn.concat([output_tensor, output_tensor_c2], dim=3)
+        output_tensor = output_tensor.to(device, self.c5.conv.input_sharded_memory_config)
+
+        output_tensor = self.c5(output_tensor)
+        output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
+        return ttnn.from_device(output_tensor)
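All five TT downsample modules repeat the same sequence around each concat: convert the sharded conv output to an interleaved L1 layout, tile it, concatenate on dim 3 (channels in the NHWC-style layout used here), then re-shard for the next conv. A helper could factor this out; the sketch below mirrors the literal call sequence used above, and assumes the second tensor is already in a concat-compatible interleaved tile layout, which the surrounding code appears to rely on:

```python
import ttnn
import tt_lib


def _concat_channels_and_reshard(tensor_a, tensor_b, next_conv):
    # Sharded conv outputs cannot be concatenated directly: move to an
    # interleaved L1 memory config and tile layout first.
    tensor_a = tt_lib.tensor.sharded_to_interleaved(tensor_a, ttnn.L1_MEMORY_CONFIG)
    tensor_a = ttnn.to_layout(tensor_a, layout=ttnn.TILE_LAYOUT)
    output = ttnn.concat([tensor_a, tensor_b], dim=3)  # dim 3 holds channels
    # Re-shard to the memory config the next conv expects as input.
    return tt_lib.tensor.interleaved_to_sharded(output, next_conv.conv.input_sharded_memory_config)
```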
diff --git a/models/experimental/functional_yolov4/tt/ttnn_head.py b/models/experimental/functional_yolov4/tt/ttnn_head.py
new file mode 100644
index 00000000000..59c551c87f1
--- /dev/null
+++ b/models/experimental/functional_yolov4/tt/ttnn_head.py
@@ -0,0 +1,118 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.nn as nn
+
+import ttnn
+import tt_lib
+from tt_lib import fallback_ops
+from ttnn.model_preprocessing import preprocess_model
+
+from models.utility_functions import (
+    torch_to_tt_tensor_rm,
+    torch_to_tt_tensor,
+    torch2tt_tensor,
+    tt_to_torch_tensor,
+)
+
+
+class TtHead:
+    def output_preprocessing(self, output_tensor, device):
+        output_tensor = ttnn.to_torch(output_tensor)
+        output_tensor = torch.permute(output_tensor, (0, 3, 1, 2))
+        output_tensor = torch_to_tt_tensor_rm(output_tensor, device, put_on_device=True)
+        return output_tensor
+
+    def __init__(self, device, parameters) -> None:
+        self.device = device
+        print("keys in parameters in TtHead are: ", parameters.keys())
+        self.c1 = parameters.c1
+        self.c2 = fallback_ops.Conv2d(parameters.c2["weight"], parameters.c2["bias"], 256, 255, 1, 1, 0, bias=True)
+        self.c3 = parameters.c3
+        self.c4 = parameters.c4
+        self.c5 = parameters.c5
+        self.c6 = parameters.c6
+        self.c7 = parameters.c7
+        self.c8 = parameters.c8
+        self.c9 = parameters.c9
+        self.c10 = fallback_ops.Conv2d(parameters.c10["weight"], parameters.c10["bias"], 512, 255, 1, 1, 0, bias=True)
+        self.c11 = parameters.c11
+        self.c12 = parameters.c12
+        self.c13 = parameters.c13
+        self.c14 = parameters.c14
+        self.c15 = parameters.c15
+        self.c16 = parameters.c16
+        self.c17 = parameters.c17
+        self.c18 = fallback_ops.Conv2d(parameters.c18["weight"], parameters.c18["bias"], 1024, 255, 1, 1, 0, bias=True)
+
+    def __call__(self, device, input_tensors):
+        input_tensor = input_tensors[0].to(device, self.c1.conv.input_sharded_memory_config)
+        output_tensor = self.c1(input_tensor)
+        output_tensor = self.output_preprocessing(output_tensor, device)
+        output_tensor = self.c2(output_tensor)
+        output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
+        output_tensor = ttnn.permute(output_tensor, (0, 2, 3, 1))
+
+        output_tensor_res1 = output_tensor
+
+        input_tensor = input_tensors[0].to(device, self.c3.conv.input_sharded_memory_config)
+        output_tensor = self.c3(input_tensor)
+
+        # outNeck1 = torch.ones(1, 256, 20, 20)
+        # outNeck1 = torch.permute(outNeck1, (0, 2, 3, 1))
+        # outNeck1 = outNeck1.reshape(outNeck1.shape[0], 1, outNeck1.shape[1] * outNeck1.shape[2], outNeck1.shape[3])
+        # outNeck1 = ttnn.from_torch(outNeck1, dtype=ttnn.bfloat8_b, layout=ttnn.TILE_LAYOUT)
+        # outNeck1 = outNeck1.to(device)
+        outNeck1 = input_tensors[2].to(device)
+
+        output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
+        output_tensor = ttnn.concat([output_tensor, outNeck1], dim=3)
+
+        output_tensor = self.c4(output_tensor)
+        output_tensor = self.c5(output_tensor)
+        output_tensor = self.c6(output_tensor)
+        output_tensor = self.c7(output_tensor)
+        output_tensor = self.c8(output_tensor)
+        output_tensor2 = output_tensor
+
+        output_tensor = self.c9(output_tensor)
+        output_tensor = self.output_preprocessing(output_tensor, device)
+        output_tensor = self.c10(output_tensor)
+        output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
+        output_tensor = ttnn.permute(output_tensor, (0, 2, 3, 1))
+        output_tensor_res2 = output_tensor
+
+        output_tensor = self.c11(output_tensor2)
+
+        # outNeck2 = torch.ones([1, 512, 10, 10])
+        # outNeck2 = torch.permute(outNeck2, (0, 2, 3, 1))
+        # outNeck2 = outNeck2.reshape(outNeck2.shape[0], 1, outNeck2.shape[1] * outNeck2.shape[2], outNeck2.shape[3])
+        # outNeck2 = ttnn.from_torch(outNeck2, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT)
+        # outNeck2 = outNeck2.to(device)  # , self.c11.conv.input_sharded_memory_config
+        outNeck2 = input_tensors[1].to(device)
+        output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
+        output_tensor = ttnn.concat([output_tensor, outNeck2], dim=3)
+
+        output_tensor = self.c12(output_tensor)
+        output_tensor = self.c13(output_tensor)
+        output_tensor = self.c14(output_tensor)
+        output_tensor = self.c15(output_tensor)
+        output_tensor = self.c16(output_tensor)
+        output_tensor = self.c17(output_tensor)
+
+        output_tensor = self.output_preprocessing(output_tensor, device)
+        output_tensor = self.c18(output_tensor)
+        output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
+        output_tensor = ttnn.permute(output_tensor, (0, 2, 3, 1))
+
+        output_tensor_res3 = output_tensor
+
+        return (
+            ttnn.from_device(output_tensor_res1),
+            ttnn.from_device(output_tensor_res2),
+            ttnn.from_device(output_tensor_res3),
+        )
diff --git a/models/experimental/functional_yolov4/tt/ttnn_neck.py b/models/experimental/functional_yolov4/tt/ttnn_neck.py
new file mode 100644
index 00000000000..119d2b7198b
--- /dev/null
+++ b/models/experimental/functional_yolov4/tt/ttnn_neck.py
@@ -0,0 +1,454 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+import torch
+import torch.nn as nn
+
+import ttnn
+import tt_lib
+from tt_lib import fallback_ops
+from ttnn.model_preprocessing import preprocess_model
+
+
+class TtNeck:
+    def __init__(
+        self,
+        device,
+        parameters,
+    ) -> None:
+        self.device = device
+        self.c1 = parameters.c1
+        self.c2 = parameters.c2
+        self.c3 = parameters.c3
+        # print("\n\n\nattributes of parameters.c3: ", parameters.c3.__dict__)
+        self.c4 = parameters.c4
+        self.c5 = parameters.c5
+        self.c6 = parameters.c6
+        self.c7 = parameters.c7
+        self.c7_2 = parameters.c7_2
+        self.c7_3 = parameters.c7_3
+        self.c7_4 = parameters.c7_4
+        self.c7_5 = parameters.c7_5
+        self.c8 = parameters.c8
+        self.c8_2 = parameters.c8_2
+        self.c9 = parameters.c9
+        self.c9_2 = parameters.c9_2
+        self.c9_3 = parameters.c9_3
+        self.c9_4 = parameters.c9_4
+        self.c9_5 = parameters.c9_5
+        self.c10 = parameters.c10
+        self.c10_2 = parameters.c10_2
+        # self.p1 = parameters.p1
+        # self.p2 = parameters.p2
+        # self.p3 = parameters.p3
+
+        #########conv3###############
+        # self.c3 = ttnn.Conv2d(
+        #     in_channels=1024,
+        #     out_channels=512,
+        #     kernel_size=(1, 1),
+        #     stride=(1, 1),
+        #     padding=(0, 0),
+        #     dtype=ttnn.bfloat8_b,
+        #     device=device,
+        #     use_1d_systolic_array=True,
+        #     batch_size=1,
+        #     input_height=10,
+        #     input_width=10,
+        #     reader_patterns_cache={},
+        #     weight=parameters.c3.weight,
+        #     # bias=parameters.c3.bias,
+        #     math_fidelity=ttnn.MathFidelity.LoFi,
+        #     weights_dtype=ttnn.bfloat8_b,
+        #     use_shallow_conv_variant=False,
+        #     deallocate_activation=True,
+        #     # padded_input_channels=32,
+        #     activation="relu",
+        #     conv_blocking_and_parallelization_config_override=None,
+        #     # compute_kernel_config=compute_kernel_config,
+        # )
+
+        self.max_pool_reader_patterns_cache = {}
+        max_pool_parallel_config_override = {}
+
+        max_pool_parallel_config_override["grid_size"] = self.c3.conv.grid_size
+        max_pool_parallel_config_override["num_cores_nhw"] = self.c3.conv.sliding_window_op_params.num_cores_nhw
+        print(max_pool_parallel_config_override)
+        print(max_pool_parallel_config_override["num_cores_nhw"])
+
+        # self.p1 = fallback_ops.MaxPool2d(
+        #     kernel_size=(5, 5),
+        #     stride=(1, 1),
+        #     padding=(2, 2),
+        #     dilation=(1, 1),
+        #     channels_last=True
+        # )
+        # self.p2 = fallback_ops.MaxPool2d(
+        #     kernel_size=(9, 9),
+        #     stride=(1, 1),
+        #     padding=(4, 4),
+        #     dilation=(1, 1),
+        #     channels_last=True
+        # )
+        # self.p3 = fallback_ops.MaxPool2d(
+        #     kernel_size=(13, 13),
+        #     stride=(1, 1),
+        #     padding=(6, 6),
+        #     dilation=(1, 1),
+        #     channels_last=True
+        # )
+
+        self.p1 = nn.MaxPool2d(kernel_size=5, stride=1, padding=2, dilation=1, ceil_mode=False)
+        self.p2 = nn.MaxPool2d(kernel_size=9, stride=1, padding=4, dilation=1, ceil_mode=False)
+        self.p3 = nn.MaxPool2d(kernel_size=13, stride=1, padding=6, dilation=1, ceil_mode=False)
+
+        # self.p1 = ttnn.MaxPool2d(
+        #     kernel_size=(5, 5),
+        #     stride=(1, 1),
+        #     padding=(2, 2),
+        #     dilation=(1, 1),
+        #     dtype=ttnn.bfloat16,
+        #     device=self.device,
+        #     batch_size=1,
+        #     input_height=10,
+        #     input_width=10,
+        #     reader_patterns_cache=self.max_pool_reader_patterns_cache,
+        #     deallocate_activation=True,
+        #     # parallel_config_override=max_pool_parallel_config_override,
+        #     channels=512,
+        # )
+        # self.p2 = ttnn.MaxPool2d(
+        #     kernel_size=(9, 9),
+        #     stride=(1, 1),
+        #     padding=(4, 4),
+        #     dilation=(1, 1),
+        #     dtype=ttnn.bfloat16,
+        #     device=self.device,
+        #     batch_size=1,
+        #     input_height=10,
+        #     input_width=10,
+        #     reader_patterns_cache=self.max_pool_reader_patterns_cache,
+        #     deallocate_activation=True,
+        #     # parallel_config_override=max_pool_parallel_config_override,
+        #     channels=512,
+        # )
+        # self.p3 = ttnn.MaxPool2d(
+        #     kernel_size=(13, 13),
+        #     stride=(1, 1),
+        #     padding=(6, 6),
+        #     dilation=(1, 1),
+        #     dtype=ttnn.bfloat16,
+        #     device=self.device,
+        #     batch_size=1,
+        #     input_height=10,
+        #     input_width=10,
+        #     reader_patterns_cache=self.max_pool_reader_patterns_cache,
+        #     deallocate_activation=True,
+        #     # parallel_config_override=max_pool_parallel_config_override,
+        #     channels=512,
+        # )
+        #
+    def __call__(self, device, input_tensors):
+        input_tensor0 = input_tensors[0].to(device, self.c1.conv.input_sharded_memory_config)
+
+        #######
+
+        # # 3 CBN blocks
+        # x1 = self.c1(input_tensor)
+        # x1_b = self.b1(x1)
+        # x1_m = self.relu(x1_b)
+        #
+        # x2 = self.c2(x1_m)
+        # x2_b = self.b2(x2)
+        # x2_m = self.relu(x2_b)
+        #
+        # x3 = self.c3(x2_m)
+        # x3_b = self.b3(x3)
+        # x3_m = self.relu(x3_b)
+        #
+        # # maxpools
+        # x4 = self.p1(x3_m)
+        # x5 = self.p2(x3_m)
+        # x6 = self.p3(x3_m)
+        #
+        # # concat the outputs of maxpool and x3_m
+        # conc1 = torch.cat([x4, x5, x6, x3_m], dim=1)
+        #
+        # # 4 back2back CBRs
+        # # CBR4-1
+        # x7 = self.c4(conc1)
+        # x7_b = self.b4(x7)
+        # x7_m = self.relu(x7_b)
+        #
+        # # CBR4-2
+        # x8 = self.c5(x7_m)
+        # x8_b = self.b5(x8)
+        # x8_m = self.relu(x8_b)
+        #
+        # # CBR4-3
+        # x9 = self.c6(x8_m)
+        # x9_b = self.b6(x9)
+        # x9_m = self.relu(x9_b)
+        #
+        # # CBR4-4
+        # x10 = self.c7(x9_m)
+        # x10_b = self.b7(x10)
+        # x10_m = self.relu(x10_b)
+        #
+        # # upsample
+        # u1 = self.u(x10_m)
+        #
+        # # Next CBR block, to be concatenated with the output of u1;
+        # # it takes the output of the downsample4 module, of shape [1, 512, 20, 20]; a random tensor of that shape can stand in when running the neck unit test stand-alone
+        # outDownSample4 = torch.rand([1, 512, 20, 20])
+        # # CBR block for conc2
+        # x11 = self.c7(outDownSample4)
+        # x11_b = self.b7(x11)
+        # x11_m = self.relu(x11_b)
+        #
+        # # concat CBR output with output from u1
+        # conc2 = torch.cat([u1, x11_m], dim=1)
+        #
+        # # 6 back2back CBRs
+        # # CBR6_1
+        # x12 = self.c7(conc2)
+        # x12_b = self.b7(x12)
+        # x12_m = self.relu(x12_b)
+        #
+        # # CBR6_2
+        # x13 = self.c8(x12_m)
+        # x13_b = self.b8(x13)
+        # x13_m = self.relu(x13_b)
+        #
+        # # CBR6_3
+        # x14 = self.c7(x13_m)
+        # x14_b = self.b7(x14)
+        # x14_m = self.relu(x14_b)
+        #
+        # # CBR6_4
+        # x15 = self.c8(x14_m)
+        # x15_b = self.b8(x15)
+        # x15_m = self.relu(x15_b)
+        #
+        # # CBR6_5
+        # x16 = self.c7(x15_m)
+        # x16_b = self.b7(x16)
+        # x16_m = self.relu(x16_b)
+        #
+        # # CBR6_6
+        # x17 = self.c9(x16_m)
+        # x17_b = self.b9(x17)
+        # x17_m = self.relu(x17_b)
+        #
+        # # upsample
+        # u2 = self.u(x17_m)
+        #
+        # # CBR block for conc3
+        # outDownSample3 = torch.rand([1, 256, 40, 40])
+        # x18 = self.c9(outDownSample3)
+        # x18_b = self.b9(x18)
+        # x18_m = self.relu(x18_b)
+        #
+        # # concat CBR output with output from u2
+        # conc3 = torch.cat([u2, x18_m], dim=1)
+        #
+        # # 5 CBR blocks
+        # # CBR5_1
+        # x19 = self.c9(conc3)
+        # x19_b = self.b9(x19)
+        # x19_m = self.relu(x19_b)
+        #
+        # # CBR5_2
+        # x20 = self.c10(x19_m)
+        # x20_b = self.b10(x20)
+        # x20_m = self.relu(x20_b)
+        #
+        # # CBR5_3
+        # x21 = self.c9(x20_m)
+        # x21_b = self.b9(x21)
+        # x21_m = self.relu(x21_b)
+        #
+        # # CBR5_4
+        # x22 = self.c10(x21_m)
+        # x22_b = self.b10(x22)
+        # x22_m = self.relu(x22_b)
+        #
+        # # CBR5_5
+        # x23 = self.c9(x22_m)
+        # x23_b = self.b9(x23)
+        # x23_m = self.relu(x23_b)
+        #
+        # return x23_m, x9_m, x16_m
+        #
+        # #######
+        output_tensor = self.c1(input_tensor0)
+        output_tensor = self.c2(output_tensor)
+        output_tensor = self.c3(output_tensor)
+        output_tensorc3 = output_tensor
+
+        output_tensorc3 = tt_lib.tensor.sharded_to_interleaved(output_tensorc3, ttnn.L1_MEMORY_CONFIG)
+        # output_tensorc3 = ttnn.to_layout(output_tensorc3, layout=ttnn.TILE_LAYOUT)
+        custom_sharded_memory_config = ttnn.experimental.tensor.MemoryConfig(
+            memory_layout=ttnn.experimental.tensor.TensorMemoryLayout.HEIGHT_SHARDED,
+            buffer_type=ttnn.experimental.tensor.BufferType.L1,
+        )
+        # output_tensorc3 = tt_lib.tensor.interleaved_to_sharded(output_tensorc3, self.p1.max_pool.input_sharded_memory_config)
+        # ouptut_tensorc3 = ttnn.to_memory_config(output_tensorc3, self.p1.max_pool.input_sharded_memory_config)
+        # input_tensor.to(device, mem_config=custom_sharded_memory_config)
+        # output_tensorc3 = output_tensorc3.to(device, self.p1.max_pool.input_sharded_memory_config)
+        # input_tensor = ttnn.to_layout(input_tensor, layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED)
+
+        # reproduces maxpool padding error
+        output_tensorc3 = ttnn.to_layout(output_tensorc3, ttnn.ROW_MAJOR_LAYOUT)
+        # output_tensorc3 = tt_lib.tensor.interleaved_to_sharded(
+        #     output_tensorc3, self.p1.max_pool.input_sharded_memory_config
+        # )
+        print("C3 sharding: ", self.c3.conv.input_sharded_memory_config)
+        # print("P1 sharding: ", self.p1.max_pool.output_sharded_memory_config)
+        # input_tensor.memory_config().memory_layout = ttnn.TensorMemoryLayout.HEIGHT_SHARDED
+        print("Input sharding: ", output_tensorc3.memory_config().memory_layout)
+        # return [output_tensorc3, output_tensorc3, output_tensorc3]
+
+        output_tensorc3 = ttnn.from_device(output_tensorc3)
+        output_tensorc3 = ttnn.to_torch(output_tensorc3)
+        output_tensorc3 = torch.reshape(output_tensorc3, (1, 10, 10, 512))
+        output_tensorc3 = torch.permute(output_tensorc3, (0, 3, 1, 2))
+        # print("p1 inp: ", output_tensorc3.shape)
+
+        # output_tensorc3 = ttnn.reshape(output_tensorc3, (1, 10, 10, 512))
+        # output_tensorc3 = ttnn.to_torch(output_tensorc3)
+        # output_tensorc3 = torch.permute(output_tensorc3, (0, 3, 1, 2))
+        # from models.utility_functions import torch_to_tt_tensor_rm
+        # output_tensorc3 = torch_to_tt_tensor_rm(output_tensorc3, device, put_on_device=True)
+        output_tensor = self.p1(output_tensorc3)
+        output_tensorp1 = output_tensor
+        output_tensor = self.p2(output_tensorc3)
+        output_tensorp2 = output_tensor
+        output_tensor = self.p3(output_tensorc3)
+        output_tensorp3 = output_tensor
+        print("p3 shape: ", output_tensorp1.shape)
+        # output_tensorp1 = ttnn.to_layout(output_tensorp1, layout=ttnn.TILE_LAYOUT)
+        # output_tensorp1 = ttnn.permute(output_tensorp1, (0, 2, 3, 1))
+        # output_tensorp1 = ttnn.reshape(output_tensorp1, (1, 1, 100, 500))
+        # output_tensorp2 = ttnn.to_layout(output_tensorp2, layout=ttnn.TILE_LAYOUT)
+        # output_tensorp2 = ttnn.permute(output_tensorp2, (0, 2, 3, 1))
+        # output_tensorp2 = ttnn.reshape(output_tensorp2, (1, 1, 100, 500))
+        # output_tensorp3 = ttnn.to_layout(output_tensorp3, layout=ttnn.TILE_LAYOUT)
+        # output_tensorp3 = ttnn.permute(output_tensorp3, (0, 2, 3, 1))
+        # output_tensorp3 = ttnn.reshape(output_tensorp3, (1, 1, 100, 500))
+        # output_tensorc3 = ttnn.to_layout(output_tensorc3, layout=ttnn.TILE_LAYOUT)
+        # output_tensorc3 = ttnn.permute(output_tensorc3, (0, 2, 3, 1))
+        # output_tensorc3 = ttnn.reshape(output_tensorc3, (1, 1, 100, 500))
+        # output_tensorc3 = ttnn.permute(output_tensorc3, (0, 2, 3, 1))
+        output_tensorp1 = torch.reshape(output_tensorp1, (1, 512, 1, 100))
+        output_tensorp2 = torch.reshape(output_tensorp2, (1, 512, 1, 100))
+        output_tensorp3 = torch.reshape(output_tensorp3, (1, 512, 1, 100))
+        output_tensorc3 = torch.reshape(output_tensorc3, (1, 512, 1, 100))
+        output_tensorp1 = torch.permute(output_tensorp1, (0, 2, 3, 1))
+        output_tensorp2 = torch.permute(output_tensorp2, (0, 2, 3, 1))
+        output_tensorp3 = torch.permute(output_tensorp3, (0, 2, 3, 1))
+        output_tensorc3 = torch.permute(output_tensorc3, (0, 2, 3, 1))
+
+        output_tensorp1 = ttnn.from_torch(output_tensorp1, dtype=ttnn.bfloat8_b, layout=ttnn.TILE_LAYOUT)
+        output_tensorp2 = ttnn.from_torch(output_tensorp2, dtype=ttnn.bfloat8_b, layout=ttnn.TILE_LAYOUT)
+        output_tensorp3 = ttnn.from_torch(output_tensorp3, dtype=ttnn.bfloat8_b, layout=ttnn.TILE_LAYOUT)
+        output_tensorc3 = ttnn.from_torch(output_tensorc3, dtype=ttnn.bfloat8_b, layout=ttnn.TILE_LAYOUT)
+        output_tensorp1 = output_tensorp1.to(device)
+        output_tensorp2 = output_tensorp2.to(device)
+        output_tensorp3 = output_tensorp3.to(device)
+        output_tensorc3 = output_tensorc3.to(device)
+        # output_tensorp1 = tt_lib.tensor.sharded_to_interleaved(output_tensorp1, ttnn.L1_MEMORY_CONFIG)
+        # output_tensorp1 = ttnn.to_layout(output_tensorp1, layout=ttnn.TILE_LAYOUT)
+        output_tensor = ttnn.concat([output_tensorp1, output_tensorp2, output_tensorp3, output_tensorc3], dim=3)
+        # output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT)
+        output_tensor = output_tensor.to(device, self.c4.conv.input_sharded_memory_config)
+        # print("DEBUG:", output_tensor.memory_config())
+        output_tensor = self.c4(output_tensor)
+        output_tensor = self.c5(output_tensor)
+        output_tensor = self.c6(output_tensor)
+        output_tensor_9m = output_tensor
+        output_tensor = self.c7(output_tensor)
+        output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        output_tensor = ttnn.to_layout(output_tensor, ttnn.ROW_MAJOR_LAYOUT)
+        output_tensor = ttnn.upsample(output_tensor, (1, 4, 1), memory_config=output_tensor.memory_config())
+
+        # TODO add ttnn tensor here for testing
+        # input_shape = torch_input_tensor.shape
+        # input_tensor = torch.permute(torch_input_tensor, (0, 2, 3, 1))
+        #
+        # input_tensor = input_tensor.reshape(
+        #     input_tensor.shape[0], 1, input_tensor.shape[1] * input_tensor.shape[2], input_tensor.shape[3]
+        # )
+
+        outDownSample4 = input_tensors[1].to(device, self.c7_2.conv.input_sharded_memory_config)
+        # CBR block for conc2
+        outDownSample4_c7 = self.c7_2(outDownSample4)
+        # outDownSample4_b7 = self.b7(outDownSample4_c7)
+        # outDownSample4_r7 = self.relu(outDownSample4_b7)
+        #
+        # output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        # output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
+        outDownSample4_c7 = tt_lib.tensor.sharded_to_interleaved(outDownSample4_c7, ttnn.L1_MEMORY_CONFIG)
+        outDownSample4_c7 = ttnn.to_layout(outDownSample4_c7, layout=ttnn.TILE_LAYOUT)
+        print(outDownSample4_c7.memory_config())
+        print(output_tensor.memory_config())
+        output_tensor = ttnn.concat([output_tensor, outDownSample4_c7], dim=3)
+
+        output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT)
+        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c7_3.conv.input_sharded_memory_config)
+        output_tensor = self.c7_3(output_tensor)
+        output_tensor = self.c8(output_tensor)
+        output_tensor = self.c7_4(output_tensor)
+        output_tensor = self.c8_2(output_tensor)
+        output_tensor = self.c7_5(output_tensor)
+        output_tensor_16m = output_tensor
+        print(output_tensor.shape)
+        output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT)
+        output_tensor = output_tensor.to(device, self.c9.conv.input_sharded_memory_config)
+
+        print(self.c9.conv.input_sharded_memory_config)
+        print("Last config:", output_tensor.memory_config())
+        output_tensor = self.c9(output_tensor)
+        output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        output_tensor = ttnn.to_layout(output_tensor, ttnn.ROW_MAJOR_LAYOUT)
+        output_tensor = ttnn.upsample(output_tensor, (1, 4, 1), memory_config=output_tensor.memory_config())
+        # output_tensor = self.u(output_tensor)
+        # # CBR block for conc3
+        # # TODO add ttnn random tensor here
+        outDownSample3 = input_tensors[2].to(device, self.c9_2.conv.input_sharded_memory_config)
+        outDownSample3_c9 = self.c9_2(outDownSample3)
+        # outDownSample3_b9 = self.b9(outDownSample3_c9)
+        # outDownSample3_r9 = self.relu(outDownSample3_b9)
+        output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT)
+        output_tensor = ttnn.concat([output_tensor, outDownSample3_c9], dim=3)
+        output_tensor = output_tensor.to(device, self.c9_3.conv.input_sharded_memory_config)
+        output_tensor = self.c9_3(output_tensor)
+        output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT)
+        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c10.conv.input_sharded_memory_config)
+        print("out: ", output_tensor.layout)
+        # print("c10: ", self.c10.output_layout)
+        output_tensor = self.c10(output_tensor)
+        output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT)
+        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c9_4.conv.input_sharded_memory_config)
+        output_tensor = self.c9_4(output_tensor)
+        output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT)
+        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c10_2.conv.input_sharded_memory_config)
+        output_tensor = self.c10_2(output_tensor)
+        output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT)
+        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c9_5.conv.input_sharded_memory_config)
+        output_tensor = self.c9_5(output_tensor)
+        # # output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+        # # output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
+        # # output_tensor = ttnn.concat([output_tensor, output_tensor_c3], dim=3)
+        #
+        # # output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c8.conv.input_sharded_memory_config)
+        # # output_tensor = self.c8(output_tensor)
+        #
+        return ttnn.from_device(output_tensor), ttnn.from_device(output_tensor_9m), ttnn.from_device(output_tensor_16m)
diff --git a/models/experimental/functional_yolov4/tt/ttnn_resblock.py b/models/experimental/functional_yolov4/tt/ttnn_resblock.py
new file mode 100644
index 00000000000..6ad0d955e89
--- /dev/null
+++ b/models/experimental/functional_yolov4/tt/ttnn_resblock.py
@@ -0,0 +1,33 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+import ttnn
+import tt_lib
+
+
+class TtResBlock:
+    def __init__(self, parameters, nblocks, shortcut) -> None:
+        self.shortcut = shortcut
+        self.nblocks = nblocks
+        self.module_list = []
+        for i in range(nblocks):
+            conv1 = parameters[f"resblock_{i}_conv1"]
+            conv2 = parameters[f"resblock_{i}_conv2"]
+            resblock_one = [conv1, conv2]
+            self.module_list.append(resblock_one)
+
+    def __call__(self, device, input_tensor):
+        input_tensor = tt_lib.tensor.sharded_to_interleaved(input_tensor, ttnn.L1_MEMORY_CONFIG)
+        input_tensor = ttnn.to_layout(input_tensor, layout=ttnn.TILE_LAYOUT)
+        for i in range(self.nblocks):
+            output_tensor_h = input_tensor
+            output_tensor_h = output_tensor_h.to(device, self.module_list[i][0].conv.input_sharded_memory_config)
+            output_tensor_1 = self.module_list[i][0](output_tensor_h)
+            output_tensor_h = self.module_list[i][1](output_tensor_1)
+            output_tensor_h = tt_lib.tensor.sharded_to_interleaved(output_tensor_h, ttnn.L1_MEMORY_CONFIG)
+            output_tensor_h = ttnn.to_layout(output_tensor_h, layout=ttnn.TILE_LAYOUT)
+
+            input_tensor = (input_tensor + output_tensor_h) if self.shortcut else output_tensor_h
+        return ttnn.from_device(input_tensor)
diff --git a/models/experimental/functional_yolov4/tt/ttnn_yolov4.py b/models/experimental/functional_yolov4/tt/ttnn_yolov4.py
new file mode 100644
index 00000000000..51374d3aaa9
--- /dev/null
+++ b/models/experimental/functional_yolov4/tt/ttnn_yolov4.py
@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+from models.experimental.functional_yolov4.tt.ttnn_downsample1 import TtDownSample1
+from models.experimental.functional_yolov4.tt.ttnn_downsample2 import TtDownSample2
+from models.experimental.functional_yolov4.tt.ttnn_downsample3 import TtDownSample3
+from models.experimental.functional_yolov4.tt.ttnn_downsample4 import TtDownSample4
+from models.experimental.functional_yolov4.tt.ttnn_downsample5 import TtDownSample5
+from models.experimental.functional_yolov4.tt.ttnn_neck import TtNeck
+from models.experimental.functional_yolov4.tt.ttnn_head import TtHead
+import ttnn
+
+
+class TtYolov4:
+    def __init__(self, device, parameters) -> None:
+        self.downsample1 = TtDownSample1(parameters["downsample1"])
+        self.downsample2 = TtDownSample2(parameters["downsample2"])
+        self.downsample3 = TtDownSample3(parameters["downsample3"])
+        self.downsample4 = TtDownSample4(parameters["downsample4"])
+        self.downsample5 = TtDownSample5(parameters["downsample5"])
+        self.neck = TtNeck(device, parameters["neck"])
+        self.head = TtHead(device, parameters["head"])
+
+    def __call__(self, device, input_tensor):
+        d1 = self.downsample1(device, input_tensor)
+        d2 = self.downsample2(device, d1)
+        d3 = self.downsample3(device, d2)
+        d4 = self.downsample4(device, d3)
+        d5 = self.downsample5(device, d4)
+        x20, x13, x6 = self.neck(device, [d5, d4, d3])
+        x4, x5, x6 = self.head(device, [x20, x13, x6])
+        print(x4.shape, x5.shape)
+        return x4, x5, x6
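The preprocessors below rely on `fold_batch_norm2d_into_conv2d` to fuse each BatchNorm into the preceding bias-free conv before conversion. The standard folding identity scales the conv weight by gamma / sqrt(running_var + eps) per output channel and absorbs the BN shift into a new bias. The sketch below is an independent torch illustration of that identity, not the ttnn function's implementation:

```python
import torch


def fold_bn_into_conv(conv: torch.nn.Conv2d, bn: torch.nn.BatchNorm2d):
    # w' = w * gamma / sqrt(var + eps), applied per output channel.
    scale = bn.weight / torch.sqrt(bn.running_var + bn.eps)
    weight = conv.weight * scale[:, None, None, None]
    # b' = beta - mu * scale; if the conv had a bias, it is scaled too.
    bias = bn.bias - bn.running_mean * scale
    if conv.bias is not None:
        bias = bias + conv.bias * scale
    return weight, bias
```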
diff --git a/tests/ttnn/integration_tests/yolov4/custom_preprocessor_d1.py b/tests/ttnn/integration_tests/yolov4/custom_preprocessor_d1.py
new file mode 100644
index 00000000000..ec418bac032
--- /dev/null
+++ b/tests/ttnn/integration_tests/yolov4/custom_preprocessor_d1.py
@@ -0,0 +1,147 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+from ttnn.model_preprocessing import preprocess_conv2d, fold_batch_norm2d_into_conv2d
+
+from models.experimental.functional_yolov4.reference.downsample1 import DownSample1
+from models.experimental.functional_yolov4.tt.ttnn_downsample1 import TtDownSample1
+
+import ttnn
+import tt_lib
+
+
+def update_ttnn_module_args(ttnn_module_args):
+    ttnn_module_args["use_1d_systolic_array"] = ttnn_module_args.in_channels < 256
+
+
+def custom_preprocessor(device, model, name, ttnn_module_args):
+    parameters = {}
+    if isinstance(model, DownSample1):
+        ttnn_module_args.c1["math_fidelity"] = ttnn.MathFidelity.LoFi
+        ttnn_module_args.c1["dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c1["weights_dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c1["activation"] = "relu"  # Fuse relu with conv1
+        ttnn_module_args.c1["deallocate_activation"] = True
+        ttnn_module_args.c1["conv_blocking_and_parallelization_config_override"] = None
+
+        conv1_weight, conv1_bias = fold_batch_norm2d_into_conv2d(model.c1, model.b1)
+        update_ttnn_module_args(ttnn_module_args.c1)
+        parameters["c1"], c1_parallel_config = preprocess_conv2d(
+            conv1_weight, conv1_bias, ttnn_module_args.c1, return_parallel_config=True
+        )
+
+        ttnn_module_args.c2["math_fidelity"] = ttnn.MathFidelity.LoFi
+        ttnn_module_args.c2["use_shallow_conv_variant"] = (
+            False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True
+        )
+        ttnn_module_args.c2["dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c2["weights_dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c2["activation"] = "relu"  # Fuse relu with conv2
+        ttnn_module_args.c2["deallocate_activation"] = True
+        ttnn_module_args.c2["conv_blocking_and_parallelization_config_override"] = None
+
+        conv2_weight, conv2_bias = fold_batch_norm2d_into_conv2d(model.c2, model.b2)
+        update_ttnn_module_args(ttnn_module_args.c2)
+        parameters["c2"], c2_parallel_config = preprocess_conv2d(
+            conv2_weight, conv2_bias, ttnn_module_args.c2, return_parallel_config=True
+        )
+
+        ttnn_module_args.c3["math_fidelity"] = ttnn.MathFidelity.LoFi
+        ttnn_module_args.c3["use_shallow_conv_variant"] = (
+            False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True
+        )
+        ttnn_module_args.c3["dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c3["weights_dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c3["activation"] = "relu"  # Fuse relu with conv3
+        ttnn_module_args.c3["deallocate_activation"] = True
+        ttnn_module_args.c3["conv_blocking_and_parallelization_config_override"] = None
+
+        conv3_weight, conv3_bias = fold_batch_norm2d_into_conv2d(model.c3, model.b3)
+        update_ttnn_module_args(ttnn_module_args.c3)
+        parameters["c3"], c3_parallel_config = preprocess_conv2d(
+            conv3_weight, conv3_bias, ttnn_module_args.c3, return_parallel_config=True
+        )
+
+        ttnn_module_args.c4["math_fidelity"] = ttnn.MathFidelity.LoFi
+        ttnn_module_args.c4["use_shallow_conv_variant"] = (
+            False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True
+        )
+        ttnn_module_args.c4["dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c4["weights_dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c4["activation"] = "relu"  # Fuse relu with conv4
+        ttnn_module_args.c4["deallocate_activation"] = True
+        ttnn_module_args.c4["conv_blocking_and_parallelization_config_override"] = None
+
+        conv4_weight, conv4_bias = fold_batch_norm2d_into_conv2d(model.c4, model.b4)
+        update_ttnn_module_args(ttnn_module_args.c4)
+        parameters["c4"], c4_parallel_config = preprocess_conv2d(
+            conv4_weight, conv4_bias, ttnn_module_args.c4, return_parallel_config=True
+        )
+
ttnn_module_args.c5["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c5["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c5["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c5["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c5["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c5["deallocate_activation"] = True + ttnn_module_args.c5["conv_blocking_and_parallelization_config_override"] = None + + conv5_weight, conv5_bias = fold_batch_norm2d_into_conv2d(model.c5, model.b5) + update_ttnn_module_args(ttnn_module_args.c5) + parameters["c5"], c5_parallel_config = preprocess_conv2d( + conv5_weight, conv5_bias, ttnn_module_args.c5, return_parallel_config=True + ) + + ttnn_module_args.c6["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c6["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c6["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c6["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c6["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c6["deallocate_activation"] = True + ttnn_module_args.c6["conv_blocking_and_parallelization_config_override"] = None + + conv6_weight, conv6_bias = fold_batch_norm2d_into_conv2d(model.c6, model.b6) + update_ttnn_module_args(ttnn_module_args.c6) + parameters["c6"], c6_parallel_config = preprocess_conv2d( + conv6_weight, conv6_bias, ttnn_module_args.c6, return_parallel_config=True + ) + + ttnn_module_args.c7["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c7["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c7["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c7["deallocate_activation"] = True + ttnn_module_args.c7["conv_blocking_and_parallelization_config_override"] = None + + conv7_weight, conv7_bias = fold_batch_norm2d_into_conv2d(model.c7, model.b7) + update_ttnn_module_args(ttnn_module_args.c7) + parameters["c7"], c7_parallel_config = preprocess_conv2d( + conv7_weight, conv7_bias, ttnn_module_args.c7, return_parallel_config=True + ) + + ttnn_module_args.c8["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c8["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c8["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c8["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c8["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c8["deallocate_activation"] = True + ttnn_module_args.c8["conv_blocking_and_parallelization_config_override"] = None + + conv8_weight, conv8_bias = fold_batch_norm2d_into_conv2d(model.c8, model.b8) + update_ttnn_module_args(ttnn_module_args.c8) + parameters["c8"], c8_parallel_config = preprocess_conv2d( + conv8_weight, conv8_bias, ttnn_module_args.c8, return_parallel_config=True + ) + + return parameters diff --git a/tests/ttnn/integration_tests/yolov4/custom_preprocessor_d2.py b/tests/ttnn/integration_tests/yolov4/custom_preprocessor_d2.py new file mode 100644 index 00000000000..ef06765cf04 --- /dev/null +++ b/tests/ttnn/integration_tests/yolov4/custom_preprocessor_d2.py @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + + +from ttnn.model_preprocessing import preprocess_conv2d, fold_batch_norm2d_into_conv2d + +from models.experimental.functional_yolov4.reference.downsample2 import DownSample2 +from models.experimental.functional_yolov4.tt.ttnn_downsample2 import TtDownSample2 + +import ttnn +import tt_lib + + +def update_ttnn_module_args(ttnn_module_args): + ttnn_module_args["use_1d_systolic_array"] = ttnn_module_args.in_channels <= 256 + ttnn_module_args["dtype"] = ttnn.bfloat8_b + ttnn_module_args["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args["deallocate_activation"] = True + ttnn_module_args["conv_blocking_and_parallelization_config_override"] = None + ttnn_module_args["activation"] = "relu" + + +def custom_preprocessor(device, model, name, ttnn_module_args): + parameters = {} + if isinstance(model, DownSample2): + ttnn_module_args.c1["weights_dtype"] = ttnn.bfloat8_b + conv1_weight, conv1_bias = fold_batch_norm2d_into_conv2d(model.c1, model.b1) + update_ttnn_module_args(ttnn_module_args.c1) + parameters["c1"], c1_parallel_config = preprocess_conv2d( + conv1_weight, conv1_bias, ttnn_module_args.c1, return_parallel_config=True + ) + + ttnn_module_args.c2["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c2["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + conv2_weight, conv2_bias = fold_batch_norm2d_into_conv2d(model.c2, model.b2) + update_ttnn_module_args(ttnn_module_args.c2) + parameters["c2"], c2_parallel_config = preprocess_conv2d( + conv2_weight, conv2_bias, ttnn_module_args.c2, return_parallel_config=True + ) + + ttnn_module_args.c3["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c3["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + conv3_weight, conv3_bias = fold_batch_norm2d_into_conv2d(model.c3, model.b3) + update_ttnn_module_args(ttnn_module_args.c3) + parameters["c3"], c3_parallel_config = preprocess_conv2d( + conv3_weight, conv3_bias, ttnn_module_args.c3, return_parallel_config=True + ) + + parameters["res"] = {} + for i, block in enumerate(model.res.module_list): + conv1 = block[0] + bn1 = block[1] + conv2 = block[3] + bn2 = block[4] + + ttnn_module_args["res"][f"resblock_{i}_conv1"] = ttnn_module_args["res"]["0"] + ttnn_module_args["res"][f"resblock_{i}_conv1"]["weights_dtype"] = ttnn.bfloat8_b + weight1, bias1 = fold_batch_norm2d_into_conv2d(conv1, bn1) + update_ttnn_module_args(ttnn_module_args["res"][f"resblock_{i}_conv1"]) + parameters["res"][f"resblock_{i}_conv1"], _ = preprocess_conv2d( + weight1, bias1, ttnn_module_args["res"][f"resblock_{i}_conv1"], return_parallel_config=True + ) + + ttnn_module_args["res"][f"resblock_{i}_conv2"] = ttnn_module_args["res"]["3"] + ttnn_module_args["res"][f"resblock_{i}_conv2"]["weights_dtype"] = ttnn.bfloat8_b + weight2, bias2 = fold_batch_norm2d_into_conv2d(conv2, bn2) + update_ttnn_module_args(ttnn_module_args["res"][f"resblock_{i}_conv2"]) + parameters["res"][f"resblock_{i}_conv2"], _ = preprocess_conv2d( + weight2, bias2, ttnn_module_args["res"][f"resblock_{i}_conv2"], return_parallel_config=True + ) + + ttnn_module_args.c4["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c4["weights_dtype"] = ttnn.bfloat8_b + conv4_weight, conv4_bias = fold_batch_norm2d_into_conv2d(model.c4, model.b4) + update_ttnn_module_args(ttnn_module_args.c4) + parameters["c4"], c4_parallel_config = preprocess_conv2d( + 
conv4_weight, conv4_bias, ttnn_module_args.c4, return_parallel_config=True + ) + + ttnn_module_args.c5["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c5["weights_dtype"] = ttnn.bfloat8_b + conv5_weight, conv5_bias = fold_batch_norm2d_into_conv2d(model.c5, model.b5) + update_ttnn_module_args(ttnn_module_args.c5) + parameters["c5"], c5_parallel_config = preprocess_conv2d( + conv5_weight, conv5_bias, ttnn_module_args.c5, return_parallel_config=True + ) + return parameters diff --git a/tests/ttnn/integration_tests/yolov4/custom_preprocessor_d3.py b/tests/ttnn/integration_tests/yolov4/custom_preprocessor_d3.py new file mode 100644 index 00000000000..d8573c30ce3 --- /dev/null +++ b/tests/ttnn/integration_tests/yolov4/custom_preprocessor_d3.py @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + + +from ttnn.model_preprocessing import preprocess_conv2d, fold_batch_norm2d_into_conv2d + +from models.experimental.functional_yolov4.reference.downsample3 import DownSample3 +from models.experimental.functional_yolov4.tt.ttnn_downsample3 import TtDownSample3 + +import ttnn +import tt_lib + + +def update_ttnn_module_args(ttnn_module_args): + ttnn_module_args["use_1d_systolic_array"] = ttnn_module_args.in_channels <= 256 + ttnn_module_args["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args["dtype"] = ttnn.bfloat8_b + ttnn_module_args["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args["deallocate_activation"] = True + ttnn_module_args["conv_blocking_and_parallelization_config_override"] = None + ttnn_module_args["activation"] = "relu" + + +def custom_preprocessor(device, model, name, ttnn_module_args): + parameters = {} + if isinstance(model, DownSample3): + conv1_weight, conv1_bias = fold_batch_norm2d_into_conv2d(model.c1, model.b1) + update_ttnn_module_args(ttnn_module_args.c1) + parameters["c1"], c1_parallel_config = preprocess_conv2d( + conv1_weight, conv1_bias, ttnn_module_args.c1, return_parallel_config=True + ) + + ttnn_module_args.c2["use_shallow_conv_variant"] = False + ttnn_module_args.c2["weights_dtype"] = ttnn.bfloat8_b + conv2_weight, conv2_bias = fold_batch_norm2d_into_conv2d(model.c2, model.b2) + update_ttnn_module_args(ttnn_module_args.c2) + parameters["c2"], c2_parallel_config = preprocess_conv2d( + conv2_weight, conv2_bias, ttnn_module_args.c2, return_parallel_config=True + ) + + ttnn_module_args.c3["use_shallow_conv_variant"] = False + ttnn_module_args.c3["weights_dtype"] = ttnn.bfloat8_b + conv3_weight, conv3_bias = fold_batch_norm2d_into_conv2d(model.c3, model.b3) + update_ttnn_module_args(ttnn_module_args.c3) + parameters["c3"], c3_parallel_config = preprocess_conv2d( + conv3_weight, conv3_bias, ttnn_module_args.c3, return_parallel_config=True + ) + + parameters["res"] = {} + for i, block in enumerate(model.res.module_list): + conv1 = block[0] + bn1 = block[1] + conv2 = block[3] + bn2 = block[4] + + ttnn_module_args["res"][f"resblock_{i}_conv1"] = ttnn_module_args["res"]["0"] + ttnn_module_args["res"][f"resblock_{i}_conv1"]["weights_dtype"] = ttnn.bfloat8_b + weight1, bias1 = fold_batch_norm2d_into_conv2d(conv1, bn1) + update_ttnn_module_args(ttnn_module_args["res"][f"resblock_{i}_conv1"]) + parameters["res"][f"resblock_{i}_conv1"], _ = preprocess_conv2d( + weight1, bias1, ttnn_module_args["res"][f"resblock_{i}_conv1"], return_parallel_config=True + ) + + ttnn_module_args["res"][f"resblock_{i}_conv2"] = ttnn_module_args["res"]["3"] + 
ttnn_module_args["res"][f"resblock_{i}_conv2"]["weights_dtype"] = ttnn.bfloat8_b + weight2, bias2 = fold_batch_norm2d_into_conv2d(conv2, bn2) + update_ttnn_module_args(ttnn_module_args["res"][f"resblock_{i}_conv2"]) + parameters["res"][f"resblock_{i}_conv2"], _ = preprocess_conv2d( + weight2, bias2, ttnn_module_args["res"][f"resblock_{i}_conv2"], return_parallel_config=True + ) + + ttnn_module_args.c4["use_shallow_conv_variant"] = False + ttnn_module_args.c4["weights_dtype"] = ttnn.bfloat8_b + conv20_weight, conv20_bias = fold_batch_norm2d_into_conv2d(model.c4, model.b4) + update_ttnn_module_args(ttnn_module_args.c4) + parameters["c4"], c4_parallel_config = preprocess_conv2d( + conv20_weight, conv20_bias, ttnn_module_args.c4, return_parallel_config=True + ) + + ttnn_module_args.c5["use_shallow_conv_variant"] = False + ttnn_module_args.c5["weights_dtype"] = ttnn.bfloat8_b + conv5_weight, conv5_bias = fold_batch_norm2d_into_conv2d(model.c5, model.b5) + update_ttnn_module_args(ttnn_module_args.c5) + parameters["c5"], c5_parallel_config = preprocess_conv2d( + conv5_weight, conv5_bias, ttnn_module_args.c5, return_parallel_config=True + ) + + return parameters diff --git a/tests/ttnn/integration_tests/yolov4/custom_preprocessor_d4.py b/tests/ttnn/integration_tests/yolov4/custom_preprocessor_d4.py new file mode 100644 index 00000000000..ae04c070dc1 --- /dev/null +++ b/tests/ttnn/integration_tests/yolov4/custom_preprocessor_d4.py @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + + +from ttnn.model_preprocessing import preprocess_conv2d, fold_batch_norm2d_into_conv2d +from models.experimental.functional_yolov4.reference.downsample4 import DownSample4 +from models.experimental.functional_yolov4.tt.ttnn_downsample4 import TtDownSample4 + +import ttnn +import tt_lib + + +def update_ttnn_module_args(ttnn_module_args): + ttnn_module_args["use_1d_systolic_array"] = True # ttnn_module_args.in_channels <= 256 + ttnn_module_args["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args["dtype"] = ttnn.bfloat8_b + ttnn_module_args["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args["deallocate_activation"] = True + ttnn_module_args["conv_blocking_and_parallelization_config_override"] = None + ttnn_module_args["activation"] = "relu" + + +def custom_preprocessor(device, model, name, ttnn_module_args): + parameters = {} + if isinstance(model, DownSample4): + ttnn_module_args.c1["use_shallow_conv_variant"] = False + conv1_weight, conv1_bias = fold_batch_norm2d_into_conv2d(model.c1, model.b1) + update_ttnn_module_args(ttnn_module_args.c1) + parameters["c1"], c1_parallel_config = preprocess_conv2d( + conv1_weight, conv1_bias, ttnn_module_args.c1, return_parallel_config=True + ) + + ttnn_module_args.c2["use_shallow_conv_variant"] = False + ttnn_module_args.c2["weights_dtype"] = ttnn.bfloat8_b + conv2_weight, conv2_bias = fold_batch_norm2d_into_conv2d(model.c2, model.b2) + update_ttnn_module_args(ttnn_module_args.c2) + parameters["c2"], c2_parallel_config = preprocess_conv2d( + conv2_weight, conv2_bias, ttnn_module_args.c2, return_parallel_config=True + ) + + ttnn_module_args.c3["use_shallow_conv_variant"] = False + ttnn_module_args.c3["weights_dtype"] = ttnn.bfloat8_b + conv3_weight, conv3_bias = fold_batch_norm2d_into_conv2d(model.c3, model.b3) + update_ttnn_module_args(ttnn_module_args.c3) + parameters["c3"], c3_parallel_config = preprocess_conv2d( + conv3_weight, conv3_bias, ttnn_module_args.c3, return_parallel_config=True + ) + + parameters["res"] 
= {} + for i, block in enumerate(model.res.module_list): + conv1 = block[0] + bn1 = block[1] + conv2 = block[3] + bn2 = block[4] + + ttnn_module_args["res"][f"resblock_{i}_conv1"] = ttnn_module_args["res"]["0"] + ttnn_module_args["res"][f"resblock_{i}_conv1"]["weights_dtype"] = ttnn.bfloat8_b + weight1, bias1 = fold_batch_norm2d_into_conv2d(conv1, bn1) + update_ttnn_module_args(ttnn_module_args["res"][f"resblock_{i}_conv1"]) + parameters["res"][f"resblock_{i}_conv1"], _ = preprocess_conv2d( + weight1, bias1, ttnn_module_args["res"][f"resblock_{i}_conv1"], return_parallel_config=True + ) + + ttnn_module_args["res"][f"resblock_{i}_conv2"] = ttnn_module_args["res"]["3"] + ttnn_module_args["res"][f"resblock_{i}_conv2"]["weights_dtype"] = ttnn.bfloat8_b + weight2, bias2 = fold_batch_norm2d_into_conv2d(conv2, bn2) + update_ttnn_module_args(ttnn_module_args["res"][f"resblock_{i}_conv2"]) + parameters["res"][f"resblock_{i}_conv2"], _ = preprocess_conv2d( + weight2, bias2, ttnn_module_args["res"][f"resblock_{i}_conv2"], return_parallel_config=True + ) + + ttnn_module_args.c4["use_shallow_conv_variant"] = False + ttnn_module_args.c4["weights_dtype"] = ttnn.bfloat8_b + conv4_weight, conv4_bias = fold_batch_norm2d_into_conv2d(model.c4, model.b4) + update_ttnn_module_args(ttnn_module_args.c4) + parameters["c4"], c4_parallel_config = preprocess_conv2d( + conv4_weight, conv4_bias, ttnn_module_args.c4, return_parallel_config=True + ) + + ttnn_module_args.c5["use_shallow_conv_variant"] = False + ttnn_module_args.c5["weights_dtype"] = ttnn.bfloat8_b + conv5_weight, conv5_bias = fold_batch_norm2d_into_conv2d(model.c5, model.b5) + update_ttnn_module_args(ttnn_module_args.c5) + parameters["c5"], c5_parallel_config = preprocess_conv2d( + conv5_weight, conv5_bias, ttnn_module_args.c5, return_parallel_config=True + ) + + return parameters diff --git a/tests/ttnn/integration_tests/yolov4/custom_preprocessor_d5.py b/tests/ttnn/integration_tests/yolov4/custom_preprocessor_d5.py new file mode 100644 index 00000000000..614d16f2533 --- /dev/null +++ b/tests/ttnn/integration_tests/yolov4/custom_preprocessor_d5.py @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + + +from ttnn.model_preprocessing import preprocess_conv2d, fold_batch_norm2d_into_conv2d +from models.experimental.functional_yolov4.reference.downsample5 import DownSample5 +from models.experimental.functional_yolov4.tt.ttnn_downsample5 import TtDownSample5 + +import ttnn +import tt_lib + + +def update_ttnn_module_args(ttnn_module_args): + ttnn_module_args["use_1d_systolic_array"] = False # ttnn_module_args.in_channels <= 256 + ttnn_module_args["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args["dtype"] = ttnn.bfloat8_b + ttnn_module_args["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args["deallocate_activation"] = True + ttnn_module_args["conv_blocking_and_parallelization_config_override"] = None + ttnn_module_args["activation"] = "relu" + + +def custom_preprocessor(device, model, name, ttnn_module_args): + parameters = {} + if isinstance(model, DownSample5): + ttnn_module_args.c1["use_shallow_conv_variant"] = False + conv1_weight, conv1_bias = fold_batch_norm2d_into_conv2d(model.c1, model.b1) + update_ttnn_module_args(ttnn_module_args.c1) + parameters["c1"], c1_parallel_config = preprocess_conv2d( + conv1_weight, conv1_bias, ttnn_module_args.c1, return_parallel_config=True + ) + + ttnn_module_args.c2["use_shallow_conv_variant"] = False + ttnn_module_args.c2["weights_dtype"] = ttnn.bfloat8_b + conv2_weight, conv2_bias = fold_batch_norm2d_into_conv2d(model.c2, model.b2) + update_ttnn_module_args(ttnn_module_args.c2) + parameters["c2"], c2_parallel_config = preprocess_conv2d( + conv2_weight, conv2_bias, ttnn_module_args.c2, return_parallel_config=True + ) + + ttnn_module_args.c3["use_shallow_conv_variant"] = False + ttnn_module_args.c3["weights_dtype"] = ttnn.bfloat8_b + conv3_weight, conv3_bias = fold_batch_norm2d_into_conv2d(model.c3, model.b3) + update_ttnn_module_args(ttnn_module_args.c3) + parameters["c3"], c3_parallel_config = preprocess_conv2d( + conv3_weight, conv3_bias, ttnn_module_args.c3, return_parallel_config=True + ) + + parameters["res"] = {} + for i, block in enumerate(model.res.module_list): + conv1 = block[0] + bn1 = block[1] + conv2 = block[3] + bn2 = block[4] + + ttnn_module_args["res"][f"resblock_{i}_conv1"] = ttnn_module_args["res"]["0"] + ttnn_module_args["res"][f"resblock_{i}_conv1"]["weights_dtype"] = ttnn.bfloat8_b + weight1, bias1 = fold_batch_norm2d_into_conv2d(conv1, bn1) + update_ttnn_module_args(ttnn_module_args["res"][f"resblock_{i}_conv1"]) + parameters["res"][f"resblock_{i}_conv1"], _ = preprocess_conv2d( + weight1, bias1, ttnn_module_args["res"][f"resblock_{i}_conv1"], return_parallel_config=True + ) + + ttnn_module_args["res"][f"resblock_{i}_conv2"] = ttnn_module_args["res"]["3"] + ttnn_module_args["res"][f"resblock_{i}_conv2"]["weights_dtype"] = ttnn.bfloat8_b + weight2, bias2 = fold_batch_norm2d_into_conv2d(conv2, bn2) + update_ttnn_module_args(ttnn_module_args["res"][f"resblock_{i}_conv2"]) + parameters["res"][f"resblock_{i}_conv2"], _ = preprocess_conv2d( + weight2, bias2, ttnn_module_args["res"][f"resblock_{i}_conv2"], return_parallel_config=True + ) + + ttnn_module_args.c4["use_shallow_conv_variant"] = False + ttnn_module_args.c4["weights_dtype"] = ttnn.bfloat8_b + conv4_weight, conv4_bias = fold_batch_norm2d_into_conv2d(model.c4, model.b4) + update_ttnn_module_args(ttnn_module_args.c4) + parameters["c4"], c4_parallel_config = preprocess_conv2d( + conv4_weight, conv4_bias, ttnn_module_args.c4, return_parallel_config=True + ) + + ttnn_module_args.c5["use_shallow_conv_variant"] = False + 
ttnn_module_args.c5["weights_dtype"] = ttnn.bfloat8_b + conv5_weight, conv5_bias = fold_batch_norm2d_into_conv2d(model.c5, model.b5) + update_ttnn_module_args(ttnn_module_args.c5) + parameters["c5"], c5_parallel_config = preprocess_conv2d( + conv5_weight, conv5_bias, ttnn_module_args.c5, return_parallel_config=True + ) + + return parameters diff --git a/tests/ttnn/integration_tests/yolov4/custom_preprocessor_head.py b/tests/ttnn/integration_tests/yolov4/custom_preprocessor_head.py new file mode 100644 index 00000000000..e29d3827d93 --- /dev/null +++ b/tests/ttnn/integration_tests/yolov4/custom_preprocessor_head.py @@ -0,0 +1,282 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + + +from ttnn.model_preprocessing import preprocess_conv2d, fold_batch_norm2d_into_conv2d + +from models.experimental.functional_yolov4.reference.head import Head +from models.experimental.functional_yolov4.tt.ttnn_head import TtHead + +import ttnn +import tt_lib + + +def update_ttnn_module_args(ttnn_module_args): + ttnn_module_args["use_1d_systolic_array"] = ttnn_module_args.in_channels < 256 + ttnn_module_args["use_shallow_conv_variant"] = False + + +def custom_preprocessor(device, model, name, ttnn_module_args): + print("We do reach here!") + parameters = {} + if isinstance(model, Head): + ttnn_module_args.c1["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c1["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c1["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c1["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c1["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c1["deallocate_activation"] = False + ttnn_module_args.c1["conv_blocking_and_parallelization_config_override"] = None + conv1_weight, conv1_bias = fold_batch_norm2d_into_conv2d(model.c1, model.b1) + update_ttnn_module_args(ttnn_module_args.c1) + parameters["c1"], c1_parallel_config = preprocess_conv2d( + conv1_weight, conv1_bias, ttnn_module_args.c1, return_parallel_config=True + ) + + conv2_weight = model.c2.weight.detach() + conv2_bias = model.c2.bias + parameters["c2"] = {} + parameters["c2"]["weight"] = conv2_weight + parameters["c2"]["bias"] = conv2_bias + + ttnn_module_args.c3["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c3["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c3["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c3["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c3["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c3["deallocate_activation"] = True + ttnn_module_args.c3["conv_blocking_and_parallelization_config_override"] = None + + conv3_weight, conv3_bias = fold_batch_norm2d_into_conv2d(model.c3, model.b3) + update_ttnn_module_args(ttnn_module_args.c3) + parameters["c3"], c3_parallel_config = preprocess_conv2d( + conv3_weight, conv3_bias, ttnn_module_args.c3, return_parallel_config=True + ) + + ttnn_module_args.c4["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c4["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c4["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c4["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c4["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c4["deallocate_activation"] = True + ttnn_module_args.c4["conv_blocking_and_parallelization_config_override"] = None + + 
conv4_weight, conv4_bias = fold_batch_norm2d_into_conv2d(model.c4, model.b4) + update_ttnn_module_args(ttnn_module_args.c4) + parameters["c4"], c4_parallel_config = preprocess_conv2d( + conv4_weight, conv4_bias, ttnn_module_args.c4, return_parallel_config=True + ) + + ttnn_module_args.c5["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c5["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c5["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c5["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c5["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c5["deallocate_activation"] = True + ttnn_module_args.c5["conv_blocking_and_parallelization_config_override"] = None + + conv5_weight, conv5_bias = fold_batch_norm2d_into_conv2d(model.c5, model.b5) + update_ttnn_module_args(ttnn_module_args.c5) + parameters["c5"], c5_parallel_config = preprocess_conv2d( + conv5_weight, conv5_bias, ttnn_module_args.c5, return_parallel_config=True + ) + + ttnn_module_args.c6["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c6["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c6["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c6["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c6["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c6["deallocate_activation"] = True + ttnn_module_args.c6["conv_blocking_and_parallelization_config_override"] = None + + conv6_weight, conv6_bias = fold_batch_norm2d_into_conv2d(model.c6, model.b6) + update_ttnn_module_args(ttnn_module_args.c6) + parameters["c6"], c6_parallel_config = preprocess_conv2d( + conv6_weight, conv6_bias, ttnn_module_args.c6, return_parallel_config=True + ) + + ttnn_module_args.c7["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c7["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c7["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c7["deallocate_activation"] = True + ttnn_module_args.c7["conv_blocking_and_parallelization_config_override"] = None + + conv7_weight, conv7_bias = fold_batch_norm2d_into_conv2d(model.c7, model.b7) + update_ttnn_module_args(ttnn_module_args.c7) + parameters["c7"], c7_parallel_config = preprocess_conv2d( + conv7_weight, conv7_bias, ttnn_module_args.c7, return_parallel_config=True + ) + + ttnn_module_args.c8["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c8["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c8["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c8["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c8["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c8["deallocate_activation"] = False + ttnn_module_args.c8["conv_blocking_and_parallelization_config_override"] = None + + conv8_weight, conv8_bias = fold_batch_norm2d_into_conv2d(model.c8, model.b8) + update_ttnn_module_args(ttnn_module_args.c8) + parameters["c8"], c8_parallel_config = preprocess_conv2d( + conv8_weight, conv8_bias, ttnn_module_args.c8, return_parallel_config=True + ) + + ttnn_module_args.c9["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c9["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + 
ttnn_module_args.c9["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c9["deallocate_activation"] = False + ttnn_module_args.c9["conv_blocking_and_parallelization_config_override"] = None + + conv9_weight, conv9_bias = fold_batch_norm2d_into_conv2d(model.c9, model.b9) + update_ttnn_module_args(ttnn_module_args.c9) + ttnn_module_args.c9["use_1d_systolic_array"] = False + parameters["c9"], c9_parallel_config = preprocess_conv2d( + conv9_weight, conv9_bias, ttnn_module_args.c9, return_parallel_config=True + ) + + conv10_weight = model.c10.weight + conv10_bias = model.c10.bias + # conv10_bias = None + parameters["c10"] = {} + parameters["c10"]["weight"] = conv10_weight + parameters["c10"]["bias"] = conv10_bias + + ttnn_module_args.c11["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c11["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c11["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c11["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c11["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c11["deallocate_activation"] = False + ttnn_module_args.c11["conv_blocking_and_parallelization_config_override"] = None + + conv11_weight, conv11_bias = fold_batch_norm2d_into_conv2d(model.c11, model.b11) + update_ttnn_module_args(ttnn_module_args.c11) + parameters["c11"], c11_parallel_config = preprocess_conv2d( + conv11_weight, conv11_bias, ttnn_module_args.c11, return_parallel_config=True + ) + + ttnn_module_args.c12["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c12["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c12["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c12["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c12["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c12["deallocate_activation"] = True + ttnn_module_args.c12["conv_blocking_and_parallelization_config_override"] = None + + conv12_weight, conv12_bias = fold_batch_norm2d_into_conv2d(model.c12, model.b12) + update_ttnn_module_args(ttnn_module_args.c12) + parameters["c12"], c12_parallel_config = preprocess_conv2d( + conv12_weight, conv12_bias, ttnn_module_args.c12, return_parallel_config=True + ) + + ttnn_module_args.c13["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c13["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c13["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c13["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c13["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c13["deallocate_activation"] = True + ttnn_module_args.c13["conv_blocking_and_parallelization_config_override"] = None + + conv13_weight, conv13_bias = fold_batch_norm2d_into_conv2d(model.c13, model.b13) + update_ttnn_module_args(ttnn_module_args.c13) + parameters["c13"], c13_parallel_config = preprocess_conv2d( + conv13_weight, conv13_bias, ttnn_module_args.c13, return_parallel_config=True + ) + + ttnn_module_args.c14["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c14["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c14["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c14["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c14["activation"] = "relu" # Fuse 
relu with conv14
+        ttnn_module_args.c14["deallocate_activation"] = True
+        ttnn_module_args.c14["conv_blocking_and_parallelization_config_override"] = None
+
+        conv14_weight, conv14_bias = fold_batch_norm2d_into_conv2d(model.c14, model.b14)
+        update_ttnn_module_args(ttnn_module_args.c14)
+        parameters["c14"], c14_parallel_config = preprocess_conv2d(
+            conv14_weight, conv14_bias, ttnn_module_args.c14, return_parallel_config=True
+        )
+
+        ttnn_module_args.c15["math_fidelity"] = ttnn.MathFidelity.LoFi
+        ttnn_module_args.c15["use_shallow_conv_variant"] = (
+            False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True
+        )
+        ttnn_module_args.c15["dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c15["weights_dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c15["activation"] = "relu"  # Fuse relu with conv15
+        ttnn_module_args.c15["deallocate_activation"] = True
+        ttnn_module_args.c15["conv_blocking_and_parallelization_config_override"] = None
+
+        conv15_weight, conv15_bias = fold_batch_norm2d_into_conv2d(model.c15, model.b15)
+        update_ttnn_module_args(ttnn_module_args.c15)
+        parameters["c15"], c15_parallel_config = preprocess_conv2d(
+            conv15_weight, conv15_bias, ttnn_module_args.c15, return_parallel_config=True
+        )
+
+        ttnn_module_args.c16["math_fidelity"] = ttnn.MathFidelity.LoFi
+        ttnn_module_args.c16["use_shallow_conv_variant"] = (
+            False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True
+        )
+        ttnn_module_args.c16["dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c16["weights_dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c16["activation"] = "relu"  # Fuse relu with conv16
+        ttnn_module_args.c16["deallocate_activation"] = True
+        ttnn_module_args.c16["conv_blocking_and_parallelization_config_override"] = None
+
+        conv16_weight, conv16_bias = fold_batch_norm2d_into_conv2d(model.c16, model.b16)
+        update_ttnn_module_args(ttnn_module_args.c16)
+        parameters["c16"], c16_parallel_config = preprocess_conv2d(
+            conv16_weight, conv16_bias, ttnn_module_args.c16, return_parallel_config=True
+        )
+
+        ttnn_module_args.c17["math_fidelity"] = ttnn.MathFidelity.LoFi
+        ttnn_module_args.c17["use_shallow_conv_variant"] = (
+            False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True
+        )
+        ttnn_module_args.c17["dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c17["weights_dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c17["deallocate_activation"] = True
+        ttnn_module_args.c17["conv_blocking_and_parallelization_config_override"] = None
+        conv17_weight, conv17_bias = fold_batch_norm2d_into_conv2d(model.c17, model.b17)
+        update_ttnn_module_args(ttnn_module_args.c17)
+        parameters["c17"], c17_parallel_config = preprocess_conv2d(
+            conv17_weight, conv17_bias, ttnn_module_args.c17, return_parallel_config=True
+        )
+
+        conv18_weight = model.c18.weight
+        conv18_bias = model.c18.bias
+        parameters["c18"] = {}
+        parameters["c18"]["weight"] = conv18_weight
+        parameters["c18"]["bias"] = conv18_bias
+
+    return parameters
diff --git a/tests/ttnn/integration_tests/yolov4/custom_preprocessor_neck.py b/tests/ttnn/integration_tests/yolov4/custom_preprocessor_neck.py
new file mode 100644
index 00000000000..5b11e829a28
--- /dev/null
+++ b/tests/ttnn/integration_tests/yolov4/custom_preprocessor_neck.py
@@ -0,0 +1,365 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+from ttnn.model_preprocessing import preprocess_conv2d, fold_batch_norm2d_into_conv2d
+
+from models.experimental.functional_yolov4.reference.neck import Neck
+from models.experimental.functional_yolov4.tt.ttnn_neck import TtNeck
+
+import ttnn
+import tt_lib
+
+
+def update_ttnn_module_args(ttnn_module_args):
+    ttnn_module_args["use_1d_systolic_array"] = ttnn_module_args.in_channels < 256
+
+
+def update_ttnn_module_argsc3(ttnn_module_args):
+    ttnn_module_args["use_1d_systolic_array"] = True
+
+
+def custom_preprocessor(device, model, name, ttnn_module_args):
+    parameters = {}
+    if isinstance(model, Neck):
+        ttnn_module_args.c1["math_fidelity"] = ttnn.MathFidelity.LoFi
+        ttnn_module_args.c1["use_shallow_conv_variant"] = False
+        ttnn_module_args.c1["dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c1["weights_dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c1["activation"] = "relu"  # Fuse relu with conv1
+        ttnn_module_args.c1["deallocate_activation"] = True
+        ttnn_module_args.c1["conv_blocking_and_parallelization_config_override"] = None
+        conv1_weight, conv1_bias = fold_batch_norm2d_into_conv2d(model.c1, model.b1)
+        update_ttnn_module_args(ttnn_module_args.c1)
+        parameters["c1"], c1_parallel_config = preprocess_conv2d(
+            conv1_weight, conv1_bias, ttnn_module_args.c1, return_parallel_config=True
+        )
+
+        ttnn_module_args.c2["math_fidelity"] = ttnn.MathFidelity.LoFi
+        ttnn_module_args.c2["use_shallow_conv_variant"] = False
+        ttnn_module_args.c2["dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c2["weights_dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c2["activation"] = "relu"  # Fuse relu with conv2
+        ttnn_module_args.c2["deallocate_activation"] = True
+        ttnn_module_args.c2["conv_blocking_and_parallelization_config_override"] = None
+
+        conv2_weight, conv2_bias = fold_batch_norm2d_into_conv2d(model.c2, model.b2)
+        update_ttnn_module_args(ttnn_module_args.c2)
+        parameters["c2"], c2_parallel_config = preprocess_conv2d(
+            conv2_weight, conv2_bias, ttnn_module_args.c2, return_parallel_config=True
+        )
+
+        ttnn_module_args.c3["math_fidelity"] = ttnn.MathFidelity.LoFi
+        ttnn_module_args.c3["use_shallow_conv_variant"] = False
+        ttnn_module_args.c3["dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c3["weights_dtype"] = ttnn.bfloat8_b
+        ttnn_module_args.c3["activation"] = "relu"  # Fuse relu with conv3
+        ttnn_module_args.c3["deallocate_activation"] = True
+        ttnn_module_args.c3["conv_blocking_and_parallelization_config_override"] = None
+        update_ttnn_module_argsc3(ttnn_module_args.c3)
+        # NOTE: update_ttnn_module_args below recomputes use_1d_systolic_array
+        # from in_channels, overriding the value set by update_ttnn_module_argsc3.
+        conv3_weight, conv3_bias = fold_batch_norm2d_into_conv2d(model.c3, model.b3)
+        update_ttnn_module_args(ttnn_module_args.c3)
+        parameters["c3"], c3_parallel_config = preprocess_conv2d(
+            conv3_weight, conv3_bias, ttnn_module_args.c3, return_parallel_config=True
+        )
+        # parameters["c3"] = {}
+        # parameters["c3"]["weight"] = ttnn.from_torch(conv3_weight)
+        # ttnn_module_args.p1["deallocate_activation"] = False
+        # parameters["p1"] = {}
+        # ttnn_module_args.p1["parallel_config_override"] = {
+        #     "grid_size": (c3_parallel_config.grid_size.x, c3_parallel_config.grid_size.y),
+        #     "num_cores_nhw": c3_parallel_config.num_cores_nhw,
+        # }
+        # 
ttnn_module_args.p2["deallocate_activation"] = False + # parameters["p2"] = {} + # ttnn_module_args.p2["parallel_config_override"] = { + # "grid_size": (c3_parallel_config.grid_size.x, c3_parallel_config.grid_size.y), + # "num_cores_nhw": c3_parallel_config.num_cores_nhw, + # } + # ttnn_module_args.p3["deallocate_activation"] = False + # parameters["p3"] = {} + # ttnn_module_args.p3["parallel_config_override"] = { + # "grid_size": (c3_parallel_config.grid_size.x, c3_parallel_config.grid_size.y), + # "num_cores_nhw": c3_parallel_config.num_cores_nhw, + # } + ttnn_module_args.c4["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c4["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c4["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c4["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c4["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c4["deallocate_activation"] = True + ttnn_module_args.c4["conv_blocking_and_parallelization_config_override"] = None + + conv4_weight, conv4_bias = fold_batch_norm2d_into_conv2d(model.c4, model.b4) + update_ttnn_module_args(ttnn_module_args.c4) + parameters["c4"], c4_parallel_config = preprocess_conv2d( + conv4_weight, conv4_bias, ttnn_module_args.c4, return_parallel_config=True + ) + + ttnn_module_args.c5["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c5["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c5["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c5["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c5["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c5["deallocate_activation"] = True + ttnn_module_args.c5["conv_blocking_and_parallelization_config_override"] = None + + conv5_weight, conv5_bias = fold_batch_norm2d_into_conv2d(model.c5, model.b5) + update_ttnn_module_args(ttnn_module_args.c5) + parameters["c5"], c5_parallel_config = preprocess_conv2d( + conv5_weight, conv5_bias, ttnn_module_args.c5, return_parallel_config=True + ) + + ttnn_module_args.c6["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c6["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c6["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c6["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c6["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c6["deallocate_activation"] = True + ttnn_module_args.c6["conv_blocking_and_parallelization_config_override"] = None + + conv6_weight, conv6_bias = fold_batch_norm2d_into_conv2d(model.c6, model.b6) + update_ttnn_module_args(ttnn_module_args.c6) + parameters["c6"], c6_parallel_config = preprocess_conv2d( + conv6_weight, conv6_bias, ttnn_module_args.c6, return_parallel_config=True + ) + + ttnn_module_args.c7["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c7["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c7["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c7["deallocate_activation"] = True + ttnn_module_args.c7["conv_blocking_and_parallelization_config_override"] = None + + conv7_weight, conv7_bias = fold_batch_norm2d_into_conv2d(model.c7, model.b7) + update_ttnn_module_args(ttnn_module_args.c7) + 
parameters["c7"], c7_parallel_config = preprocess_conv2d( + conv7_weight, conv7_bias, ttnn_module_args.c7, return_parallel_config=True + ) + + ttnn_module_args.c7_2["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c7_2["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c7_2["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7_2["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7_2["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c7_2["deallocate_activation"] = True + ttnn_module_args.c7_2["conv_blocking_and_parallelization_config_override"] = None + + conv7_2_weight, conv7_2_bias = fold_batch_norm2d_into_conv2d(model.c7_2, model.b7_2) + update_ttnn_module_args(ttnn_module_args.c7_2) + parameters["c7_2"], c7_2_parallel_config = preprocess_conv2d( + conv7_2_weight, conv7_2_bias, ttnn_module_args.c7_2, return_parallel_config=True + ) + + ttnn_module_args.c7_3["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c7_3["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c7_3["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7_3["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7_3["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c7_3["deallocate_activation"] = True + ttnn_module_args.c7_3["conv_blocking_and_parallelization_config_override"] = None + + conv7_3_weight, conv7_3_bias = fold_batch_norm2d_into_conv2d(model.c7_3, model.b7_3) + update_ttnn_module_args(ttnn_module_args.c7_3) + parameters["c7_3"], c7_3_parallel_config = preprocess_conv2d( + conv7_3_weight, conv7_3_bias, ttnn_module_args.c7_3, return_parallel_config=True + ) + + ttnn_module_args.c7_4["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c7_4["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c7_4["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7_4["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7_4["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c7_4["deallocate_activation"] = True + ttnn_module_args.c7_4["conv_blocking_and_parallelization_config_override"] = None + + conv7_4_weight, conv7_4_bias = fold_batch_norm2d_into_conv2d(model.c7_4, model.b7_4) + update_ttnn_module_args(ttnn_module_args.c7_4) + parameters["c7_4"], c7_4_parallel_config = preprocess_conv2d( + conv7_4_weight, conv7_4_bias, ttnn_module_args.c7_4, return_parallel_config=True + ) + + ttnn_module_args.c7_5["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c7_5["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c7_5["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7_5["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7_5["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c7_5["deallocate_activation"] = True + ttnn_module_args.c7_5["conv_blocking_and_parallelization_config_override"] = None + + conv7_5_weight, conv7_5_bias = fold_batch_norm2d_into_conv2d(model.c7_5, model.b7_5) + update_ttnn_module_args(ttnn_module_args.c7_5) + parameters["c7_5"], c7_5_parallel_config = preprocess_conv2d( + conv7_5_weight, conv7_5_bias, ttnn_module_args.c7_5, return_parallel_config=True + ) + + ttnn_module_args.c8["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c8["use_shallow_conv_variant"] = False # ( + # False if 
device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c8["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c8["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c8["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c8["deallocate_activation"] = True + ttnn_module_args.c8["conv_blocking_and_parallelization_config_override"] = None + + conv8_weight, conv8_bias = fold_batch_norm2d_into_conv2d(model.c8, model.b8) + update_ttnn_module_args(ttnn_module_args.c8) + parameters["c8"], c8_parallel_config = preprocess_conv2d( + conv8_weight, conv8_bias, ttnn_module_args.c8, return_parallel_config=True + ) + + ttnn_module_args.c8_2["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c8_2["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c8_2["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c8_2["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c8_2["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c8_2["deallocate_activation"] = True + ttnn_module_args.c8_2["conv_blocking_and_parallelization_config_override"] = None + + conv8_2_weight, conv8_2_bias = fold_batch_norm2d_into_conv2d(model.c8_2, model.b8_2) + update_ttnn_module_args(ttnn_module_args.c8_2) + parameters["c8_2"], c8_2_parallel_config = preprocess_conv2d( + conv8_2_weight, conv8_2_bias, ttnn_module_args.c8_2, return_parallel_config=True + ) + + ttnn_module_args.c9["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c9["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c9["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c9["deallocate_activation"] = True + ttnn_module_args.c9["conv_blocking_and_parallelization_config_override"] = None + + conv9_weight, conv9_bias = fold_batch_norm2d_into_conv2d(model.c9, model.b9) + update_ttnn_module_argsc3(ttnn_module_args.c9) + parameters["c9"], c9_parallel_config = preprocess_conv2d( + conv9_weight, conv9_bias, ttnn_module_args.c9, return_parallel_config=True + ) + + ttnn_module_args.c9_2["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c9_2["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c9_2["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9_2["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9_2["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c9_2["deallocate_activation"] = True + ttnn_module_args.c9_2["conv_blocking_and_parallelization_config_override"] = None + + conv9_2_weight, conv9_2_bias = fold_batch_norm2d_into_conv2d(model.c9_2, model.b9_2) + update_ttnn_module_args(ttnn_module_args.c9_2) + parameters["c9_2"], c9_2_parallel_config = preprocess_conv2d( + conv9_2_weight, conv9_2_bias, ttnn_module_args.c9_2, return_parallel_config=True + ) + + ttnn_module_args.c9_3["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c9_3["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c9_3["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9_3["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9_3["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c9_3["deallocate_activation"] = True + 
ttnn_module_args.c9_3["conv_blocking_and_parallelization_config_override"] = None + + conv9_3_weight, conv9_3_bias = fold_batch_norm2d_into_conv2d(model.c9_3, model.b9_3) + update_ttnn_module_args(ttnn_module_args.c9_3) + parameters["c9_3"], c9_3_parallel_config = preprocess_conv2d( + conv9_3_weight, conv9_3_bias, ttnn_module_args.c9_3, return_parallel_config=True + ) + + ttnn_module_args.c9_4["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c9_4["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c9_4["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9_4["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9_4["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c9_4["deallocate_activation"] = True + ttnn_module_args.c9_4["conv_blocking_and_parallelization_config_override"] = None + + conv9_4_weight, conv9_4_bias = fold_batch_norm2d_into_conv2d(model.c9_4, model.b9_4) + update_ttnn_module_args(ttnn_module_args.c9_4) + parameters["c9_4"], c9_4_parallel_config = preprocess_conv2d( + conv9_4_weight, conv9_4_bias, ttnn_module_args.c9_4, return_parallel_config=True + ) + + ttnn_module_args.c9_5["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c9_5["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c9_5["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9_5["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9_5["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c9_5["deallocate_activation"] = True + ttnn_module_args.c9_5["conv_blocking_and_parallelization_config_override"] = None + + conv9_5_weight, conv9_5_bias = fold_batch_norm2d_into_conv2d(model.c9_5, model.b9_5) + update_ttnn_module_args(ttnn_module_args.c9_5) + parameters["c9_5"], c9_5_parallel_config = preprocess_conv2d( + conv9_5_weight, conv9_5_bias, ttnn_module_args.c9_5, return_parallel_config=True + ) + + ttnn_module_args.c10["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c10["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c10["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c10["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c10["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c10["deallocate_activation"] = True + ttnn_module_args.c10["conv_blocking_and_parallelization_config_override"] = None + + conv10_weight, conv10_bias = fold_batch_norm2d_into_conv2d(model.c10, model.b10) + update_ttnn_module_args(ttnn_module_args.c10) + parameters["c10"], c10_parallel_config = preprocess_conv2d( + conv10_weight, conv10_bias, ttnn_module_args.c10, return_parallel_config=True + ) + + ttnn_module_args.c10_2["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c10_2["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c10_2["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c10_2["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c10_2["activation"] = "relu" # Fuse relu with conv1 + ttnn_module_args.c10_2["deallocate_activation"] = True + ttnn_module_args.c10_2["conv_blocking_and_parallelization_config_override"] = None + + conv10_weight, conv10_bias = fold_batch_norm2d_into_conv2d(model.c10_2, model.b10_2) + update_ttnn_module_args(ttnn_module_args.c10_2) + parameters["c10_2"], c10_2_parallel_config = preprocess_conv2d( + 
conv10_weight, conv10_bias, ttnn_module_args.c10_2, return_parallel_config=True
+        )
+
+    return parameters
diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py
new file mode 100644
index 00000000000..54469d89847
--- /dev/null
+++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py
@@ -0,0 +1,121 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+import pytest
+import torch
+import ttnn
+import tt_lib
+
+from ttnn.model_preprocessing import preprocess_model, preprocess_conv2d, fold_batch_norm2d_into_conv2d
+
+from tests.ttnn.utils_for_testing import assert_with_pcc
+from models.utility_functions import skip_for_wormhole_b0
+
+from models.experimental.functional_yolov4.reference.yolov4 import Yolov4
+from models.experimental.functional_yolov4.tt.ttnn_yolov4 import TtYolov4
+
+import tests.ttnn.integration_tests.yolov4.custom_preprocessor_d1 as D1
+import tests.ttnn.integration_tests.yolov4.custom_preprocessor_d2 as D2
+import tests.ttnn.integration_tests.yolov4.custom_preprocessor_d3 as D3
+import tests.ttnn.integration_tests.yolov4.custom_preprocessor_d4 as D4
+import tests.ttnn.integration_tests.yolov4.custom_preprocessor_d5 as D5
+import tests.ttnn.integration_tests.yolov4.custom_preprocessor_neck as neck
+import tests.ttnn.integration_tests.yolov4.custom_preprocessor_head as head
+
+
+def create_custom_preprocessor(device):
+    def custom_preprocessor(model, name, ttnn_module_args):
+        parameters = {}
+        parameters["downsample1"] = D1.custom_preprocessor(
+            device, model.downsample1, name, ttnn_module_args["downsample1"]
+        )
+        parameters["downsample2"] = D2.custom_preprocessor(
+            device, model.downsample2, name, ttnn_module_args["downsample2"]
+        )
+        parameters["downsample3"] = D3.custom_preprocessor(
+            device, model.downsample3, name, ttnn_module_args["downsample3"]
+        )
+        parameters["downsample4"] = D4.custom_preprocessor(
+            device, model.downsample4, name, ttnn_module_args["downsample4"]
+        )
+        parameters["downsample5"] = D5.custom_preprocessor(
+            device, model.downsample5, name, ttnn_module_args["downsample5"]
+        )
+        parameters["neck"] = neck.custom_preprocessor(device, model.neck, name, ttnn_module_args["neck"])
+        parameters["head"] = head.custom_preprocessor(device, model.head, name, ttnn_module_args["head"])
+        return parameters
+
+    return custom_preprocessor
+
+
+@pytest.mark.parametrize("device_l1_small_size", [32768], indirect=True)
+@skip_for_wormhole_b0()
+def test_yolov4(device, reset_seeds):
+    state_dict = torch.load("tests/ttnn/integration_tests/yolov4/yolov4.pth")
+    ds_state_dict = {
+        k: v
+        for k, v in state_dict.items()
+        if (k.startswith(("down1.", "down2.", "down3.", "down4.", "down5.", "neek.", "head.")))
+    }
+    torch_model = Yolov4()
+
+    new_state_dict = {}
+    keys = [name for name, parameter in torch_model.state_dict().items()]
+    values = [parameter for name, parameter in ds_state_dict.items()]
+
+    for i in range(len(keys)):
+        new_state_dict[keys[i]] = values[i]
+
+    torch_model.load_state_dict(new_state_dict)
+    torch_model.eval()
+
+    torch_input_tensor = torch.randn(1, 3, 320, 320)  # Batch size of 1, 3 input channels, 320x320 height and width
+    torch_output_tensor1, torch_output_tensor2, torch_output_tensor3 = torch_model(torch_input_tensor)
+    reader_patterns_cache = {}
+    parameters = preprocess_model(
+        initialize_model=lambda: torch_model,
+        run_model=lambda model: 
model(torch_input_tensor), + custom_preprocessor=create_custom_preprocessor(device), + reader_patterns_cache=reader_patterns_cache, + device=device, + ) + + ttnn_model = TtYolov4(device, parameters) + + # Tensor Preprocessing + # + input_shape = torch_input_tensor.shape + input_tensor = torch.permute(torch_input_tensor, (0, 2, 3, 1)) + + input_tensor = input_tensor.reshape( + input_tensor.shape[0], 1, input_tensor.shape[1] * input_tensor.shape[2], input_tensor.shape[3] + ) + input_tensor = ttnn.from_torch(input_tensor, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT) + output_tensor1, output_tensor2, output_tensor3 = ttnn_model(device, input_tensor) + + # + # Tensor Postprocessing + # + output_tensor1 = ttnn.to_torch(output_tensor1) + output_tensor1 = output_tensor1.reshape(1, 40, 40, 255) + output_tensor1 = torch.permute(output_tensor1, (0, 3, 1, 2)) + output_tensor1 = output_tensor1.to(torch_input_tensor.dtype) + + output_tensor2 = ttnn.to_torch(output_tensor2) + output_tensor2 = output_tensor2.reshape(1, 20, 20, 255) + output_tensor2 = torch.permute(output_tensor2, (0, 3, 1, 2)) + output_tensor2 = output_tensor2.to(torch_input_tensor.dtype) + + output_tensor3 = ttnn.to_torch(output_tensor3) + output_tensor3 = output_tensor3.reshape(1, 10, 10, 255) + output_tensor3 = torch.permute(output_tensor3, (0, 3, 1, 2)) + output_tensor3 = output_tensor3.to(torch_input_tensor.dtype) + + assert_with_pcc(torch_output_tensor1, output_tensor1, pcc=0.99) # PCC = 0.9997609248813403 + assert_with_pcc(torch_output_tensor2, output_tensor2, pcc=0.99) # PCC = 0.0 as nan + assert_with_pcc(torch_output_tensor3, output_tensor3, pcc=0.99) # PCC = -0.010033306229600303 diff --git a/tests/ttnn/integration_tests/yolov4/test_yolov4.py b/tests/ttnn/integration_tests/yolov4/test_yolov4.py new file mode 100644 index 00000000000..8f15d3c086c --- /dev/null +++ b/tests/ttnn/integration_tests/yolov4/test_yolov4.py @@ -0,0 +1,850 @@ +# -*- coding: utf-8 -*- +"""yolov4.ipynb + +Automatically generated by Colab. 
diff --git a/tests/ttnn/integration_tests/yolov4/test_yolov4.py b/tests/ttnn/integration_tests/yolov4/test_yolov4.py
new file mode 100644
index 00000000000..8f15d3c086c
--- /dev/null
+++ b/tests/ttnn/integration_tests/yolov4/test_yolov4.py
@@ -0,0 +1,850 @@
+# -*- coding: utf-8 -*-
+"""yolov4.ipynb
+
+Automatically generated by Colab.
+
+Original file is located at
+    https://colab.research.google.com/drive/1FdesVZbnDd2N_H8Sgd148LWWwpykJCxl
+"""
+
+import torch
+from torch import nn
+import cv2
+import time
+import numpy as np
+import math
+import pytest
+from models.utility_functions import skip_for_wormhole_b0
+from tests.ttnn.utils_for_testing import assert_with_pcc
+
+from models.experimental.functional_yolov4.reference.yolov4 import Yolov4
+from models.experimental.functional_yolov4.tt.ttnn_yolov4 import TtYolov4
+
+import tests.ttnn.integration_tests.yolov4.custom_preprocessor_d1 as D1
+import tests.ttnn.integration_tests.yolov4.custom_preprocessor_d2 as D2
+import tests.ttnn.integration_tests.yolov4.custom_preprocessor_d3 as D3
+import tests.ttnn.integration_tests.yolov4.custom_preprocessor_d4 as D4
+import tests.ttnn.integration_tests.yolov4.custom_preprocessor_d5 as D5
+import tests.ttnn.integration_tests.yolov4.custom_preprocessor_neck as neck
+import tests.ttnn.integration_tests.yolov4.custom_preprocessor_head as head
+import ttnn
+from ttnn.model_preprocessing import preprocess_model
+
+
+def create_custom_preprocessor(device):
+    def custom_preprocessor(model, name, ttnn_module_args):
+        parameters = {}
+        parameters["downsample1"] = D1.custom_preprocessor(
+            device, model.downsample1, name, ttnn_module_args["downsample1"]
+        )
+        parameters["downsample2"] = D2.custom_preprocessor(
+            device, model.downsample2, name, ttnn_module_args["downsample2"]
+        )
+        parameters["downsample3"] = D3.custom_preprocessor(
+            device, model.downsample3, name, ttnn_module_args["downsample3"]
+        )
+        parameters["downsample4"] = D4.custom_preprocessor(
+            device, model.downsample4, name, ttnn_module_args["downsample4"]
+        )
+        parameters["downsample5"] = D5.custom_preprocessor(
+            device, model.downsample5, name, ttnn_module_args["downsample5"]
+        )
+        parameters["neck"] = neck.custom_preprocessor(device, model.neck, name, ttnn_module_args["neck"])
+        parameters["head"] = head.custom_preprocessor(device, model.head, name, ttnn_module_args["head"])
+        return parameters
+
+    return custom_preprocessor
+
+
+def yolo_forward_dynamic(
+    output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1, validation=False
+):
+    # The output is only valid if it satisfies:
+    # assert output.size(1) == (5 + num_classes) * num_anchors
+
+    # Slice the second (channel) dimension of output into:
+    # [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ]
+    # and then into:
+    # bxy = [ 6 ]  bwh = [ 6 ]  det_conf = [ 3 ]  cls_conf = [ num_classes * 3 ]
+    bxy_list = []
+    bwh_list = []
+    det_confs_list = []
+    cls_confs_list = []
+
+    for i in range(num_anchors):
+        begin = i * (5 + num_classes)
+        end = (i + 1) * (5 + num_classes)
+        bxy_list.append(output[:, begin : begin + 2])
+        bwh_list.append(output[:, begin + 2 : begin + 4])
+        det_confs_list.append(output[:, begin + 4 : begin + 5])
+        cls_confs_list.append(output[:, begin + 5 : end])
+
+    # Shape: [batch, num_anchors * 2, H, W]
+    bxy = torch.cat(bxy_list, dim=1)
+    # Shape: [batch, num_anchors * 2, H, W]
+    bwh = torch.cat(bwh_list, dim=1)
+
+    # Shape: [batch, num_anchors, H, W]
+    det_confs = torch.cat(det_confs_list, dim=1)
+    # Shape: [batch, num_anchors * H * W]
+    det_confs = det_confs.reshape(output.size(0), num_anchors * output.size(2) * output.size(3))
+    # Shape: [batch, num_anchors * num_classes, H, W]
+    cls_confs = torch.cat(cls_confs_list, dim=1)
+    # Shape: [batch, num_anchors, num_classes, H * W]
+    cls_confs = cls_confs.view(output.size(0), num_anchors, num_classes, output.size(2) * output.size(3))
+    # Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes]
+    cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(
+        output.size(0), num_anchors * output.size(2) * output.size(3), num_classes
+    )
+
+    # Apply sigmoid() and exp() to the slices (no softmax is used here)
+    bxy = torch.sigmoid(bxy) * scale_x_y - 0.5 * (scale_x_y - 1)
+    bwh = torch.exp(bwh)
+    det_confs = torch.sigmoid(det_confs)
+    cls_confs = torch.sigmoid(cls_confs)
+
+    # Prepare the grid offsets C_x, C_y and anchor priors P_w, P_h (plain NumPy, not torch)
+    grid_x = np.expand_dims(
+        np.expand_dims(
+            np.expand_dims(np.linspace(0, output.size(3) - 1, output.size(3)), axis=0).repeat(output.size(2), 0), axis=0
+        ),
+        axis=0,
+    )
+    grid_y = np.expand_dims(
+        np.expand_dims(
+            np.expand_dims(np.linspace(0, output.size(2) - 1, output.size(2)), axis=1).repeat(output.size(3), 1), axis=0
+        ),
+        axis=0,
+    )
+    anchor_w = []
+    anchor_h = []
+    for i in range(num_anchors):
+        anchor_w.append(anchors[i * 2])
+        anchor_h.append(anchors[i * 2 + 1])
+
+    device = None
+    cuda_check = output.is_cuda
+    if cuda_check:
+        device = output.get_device()
+
+    bx_list = []
+    by_list = []
+    bw_list = []
+    bh_list = []
+
+    # Apply C_x, C_y, P_w, P_h
+    for i in range(num_anchors):
+        ii = i * 2
+        # Shape: [batch, 1, H, W]
+        bx = bxy[:, ii : ii + 1] + torch.tensor(grid_x, device=device, dtype=torch.float32)
+        # Shape: [batch, 1, H, W]
+        by = bxy[:, ii + 1 : ii + 2] + torch.tensor(grid_y, device=device, dtype=torch.float32)
+        # Shape: [batch, 1, H, W]
+        bw = bwh[:, ii : ii + 1] * anchor_w[i]
+        # Shape: [batch, 1, H, W]
+        bh = bwh[:, ii + 1 : ii + 2] * anchor_h[i]
+
+        bx_list.append(bx)
+        by_list.append(by)
+        bw_list.append(bw)
+        bh_list.append(bh)
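+    # A worked example of the decode above for a single cell (illustrative
+    # numbers, not from the test): with scale_x_y = 1, sigmoid(t_x) = 0.6 at
+    # grid column 4 gives b_x = 0.6 + 4 = 4.6 cells; with t_w = 0.2 and a masked
+    # anchor width of 1.5 cells, b_w = 1.5 * exp(0.2) ≈ 1.83 cells. Both are
+    # divided by the grid size below to land in [0, 1].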
+
+    ########################################
+    #   Figure out bboxes from slices      #
+    ########################################
+    # Shape: [batch, num_anchors, H, W]
+    bx = torch.cat(bx_list, dim=1)
+    by = torch.cat(by_list, dim=1)
+    bw = torch.cat(bw_list, dim=1)
+    bh = torch.cat(bh_list, dim=1)
+    # Shape: [batch, 2 * num_anchors, H, W]
+    bx_bw = torch.cat((bx, bw), dim=1)
+    by_bh = torch.cat((by, bh), dim=1)
+
+    # Normalize coordinates to [0, 1]
+    bx_bw /= output.size(3)
+    by_bh /= output.size(2)
+
+    # Shape: [batch, num_anchors * H * W, 1]
+    bx = bx_bw[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
+    by = by_bh[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
+    bw = bx_bw[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
+    bh = by_bh[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
+
+    bx1 = bx - bw * 0.5
+    by1 = by - bh * 0.5
+    bx2 = bx1 + bw
+    by2 = by1 + bh
+    # Shape: [batch, num_anchors * H * W, 4] -> [batch, num_anchors * H * W, 1, 4]
+    boxes = torch.cat((bx1, by1, bx2, by2), dim=2).view(
+        output.size(0), num_anchors * output.size(2) * output.size(3), 1, 4
+    )
+
+    det_confs = det_confs.view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
+    confs = cls_confs * det_confs
+
+    # boxes: [batch, num_anchors * H * W, 1, 4]
+    # confs: [batch, num_anchors * H * W, num_classes]
+    return boxes, confs
+
+
+class YoloLayer(nn.Module):
+    """Yolo layer.
+
+    model_out: during inference, selects whether post-processing happens inside
+    or outside the model (True: outside).
+    """
+
+    def __init__(self, anchor_mask=[], num_classes=0, anchors=[], num_anchors=1, stride=32, model_out=False):
+        super(YoloLayer, self).__init__()
+        self.anchor_mask = anchor_mask
+        self.num_classes = num_classes
+        self.anchors = anchors
+        self.num_anchors = num_anchors
+        self.anchor_step = len(anchors) // num_anchors
+        self.coord_scale = 1
+        self.noobject_scale = 1
+        self.object_scale = 5
+        self.class_scale = 1
+        self.thresh = 0.6
+        self.stride = stride
+        self.seen = 0
+        self.scale_x_y = 1
+
+        self.model_out = model_out
+
+    def forward(self, output, target=None):
+        masked_anchors = []
+        for m in self.anchor_mask:
+            masked_anchors += self.anchors[m * self.anchor_step : (m + 1) * self.anchor_step]
+        # Anchors are given in pixels; convert them to grid-cell units.
+        masked_anchors = [anchor / self.stride for anchor in masked_anchors]
+        return yolo_forward_dynamic(
+            output, self.thresh, self.num_classes, masked_anchors, len(self.anchor_mask), scale_x_y=self.scale_x_y
+        )
+
+
+def get_region_boxes(boxes_and_confs):
+    print("Getting boxes from boxes and confs ...")
+    boxes_list = []
+    confs_list = []
+
+    for item in boxes_and_confs:
+        boxes_list.append(item[0])
+        confs_list.append(item[1])
+
+    # boxes: [batch, num1 + num2 + num3, 1, 4]
+    # confs: [batch, num1 + num2 + num3, num_classes]
+    boxes = torch.cat(boxes_list, dim=1)
+    confs = torch.cat(confs_list, dim=1)
+
+    return [boxes, confs]
+
+
+def plot_boxes_cv2(img, boxes, savename=None, class_names=None, color=None):
+    img = np.copy(img)
+    colors = np.array([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]], dtype=np.float32)
+
+    def get_color(c, x, max_val):
+        ratio = float(x) / max_val * 5
+        i = int(math.floor(ratio))
+        j = int(math.ceil(ratio))
+        ratio = ratio - i
+        r = (1 - ratio) * colors[i][c] + ratio * colors[j][c]
+        return int(r * 255)
+
+    width = img.shape[1]
+    height = img.shape[0]
+    for i in range(len(boxes)):
+        box = boxes[i]
+        x1 = int(box[0] * width)
+        y1 = int(box[1] * height)
+        x2 = int(box[2] * width)
+        y2 = int(box[3] * height)
+        bbox_thick = int(0.6 * (height + width) / 600)
+        if color:
+            rgb = color
+        else:
+            rgb = (255, 0, 0)
+        if len(box) >= 7 and class_names:
+            cls_conf = box[5]
+            cls_id = box[6]
+            print("%s: %f" % (class_names[cls_id], cls_conf))
+            classes = len(class_names)
+            offset = cls_id * 123457 % classes
+            red = get_color(2, offset, classes)
+            green = get_color(1, offset, classes)
+            blue = get_color(0, offset, classes)
+            if color is None:
+                rgb = (red, green, blue)
+            msg = str(class_names[cls_id]) + " " + str(round(cls_conf, 3))
+            t_size = cv2.getTextSize(msg, 0, 0.7, thickness=bbox_thick // 2)[0]
+            c1, c2 = (x1, y1), (x2, y2)
+            c3 = (c1[0] + t_size[0], c1[1] - t_size[1] - 3)
+            cv2.rectangle(img, (x1, y1), (int(np.float32(c3[0])), int(np.float32(c3[1]))), rgb, -1)
+            img = cv2.putText(
+                img,
+                msg,
+                (c1[0], int(np.float32(c1[1] - 2))),
+                cv2.FONT_HERSHEY_SIMPLEX,
+                0.7,
+                (0, 0, 0),
+                bbox_thick // 2,
+                lineType=cv2.LINE_AA,
+            )
+
+        img = cv2.rectangle(img, (x1, y1), (int(x2), int(y2)), rgb, bbox_thick)
+    if savename:
+        print("save plot results to %s" % savename)
+        cv2.imwrite(savename, img)
+    return img
+
+
+def load_class_names(namesfile):
+    class_names = []
+    with open(namesfile, "r") as fp:
+        lines = fp.readlines()
+    for line in lines:
+        line = line.rstrip()
+        class_names.append(line)
+    return class_names
+
+
+def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False):
+    x1 = boxes[:, 0]
+    y1 = boxes[:, 1]
+    x2 = boxes[:, 2]
+    y2 = boxes[:, 3]
+
+    areas = (x2 - x1) * (y2 - y1)
+    order = confs.argsort()[::-1]
+
+    keep = []
+    while order.size > 0:
+        idx_self = order[0]
+        idx_other = order[1:]
+
+        keep.append(idx_self)
+
+        xx1 = np.maximum(x1[idx_self], x1[idx_other])
+        yy1 = np.maximum(y1[idx_self], y1[idx_other])
+        xx2 = np.minimum(x2[idx_self], x2[idx_other])
+        yy2 = np.minimum(y2[idx_self], y2[idx_other])
+
+        w = np.maximum(0.0, xx2 - xx1)
+        h = np.maximum(0.0, yy2 - yy1)
+        inter = w * h
+
+        if min_mode:
+            over = inter / np.minimum(areas[order[0]], areas[order[1:]])
+        else:
+            over = inter / (areas[order[0]] + areas[order[1:]] - inter)
+
+        inds = np.where(over <= nms_thresh)[0]
+        order = order[inds + 1]
+
+    return np.array(keep)
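+
+
+# A quick worked example of nms_cpu (assumed inputs, not part of the test):
+# two heavily overlapping boxes (IoU = 0.95) keep only the higher-scoring one:
+#   boxes = np.array([[0.10, 0.1, 0.5, 0.5], [0.12, 0.1, 0.5, 0.5]])
+#   confs = np.array([0.9, 0.8])
+#   nms_cpu(boxes, confs, nms_thresh=0.5)  # -> array([0])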
+
+
+def post_processing(img, conf_thresh, nms_thresh, output):
+    # output: [boxes, confs] where
+    #   boxes: [batch, num, 1, 4]
+    #   confs: [batch, num, num_classes]
+    box_array = output[0]
+    confs = output[1].float()
+
+    t1 = time.time()
+
+    if type(box_array).__name__ != "ndarray":
+        box_array = box_array.cpu().detach().numpy()
+        confs = confs.cpu().detach().numpy()
+
+    num_classes = confs.shape[2]
+
+    # [batch, num, 4]
+    box_array = box_array[:, :, 0]
+
+    # [batch, num, num_classes] --> [batch, num]
+    max_conf = np.max(confs, axis=2)
+    max_id = np.argmax(confs, axis=2)
+
+    t2 = time.time()
+
+    bboxes_batch = []
+    for i in range(box_array.shape[0]):
+        argwhere = max_conf[i] > conf_thresh
+        l_box_array = box_array[i, argwhere, :]
+        l_max_conf = max_conf[i, argwhere]
+        l_max_id = max_id[i, argwhere]
+
+        bboxes = []
+        # nms for each class
+        for j in range(num_classes):
+            cls_argwhere = l_max_id == j
+            ll_box_array = l_box_array[cls_argwhere, :]
+            ll_max_conf = l_max_conf[cls_argwhere]
+            ll_max_id = l_max_id[cls_argwhere]
+
+            keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh)
+
+            if keep.size > 0:
+                ll_box_array = ll_box_array[keep, :]
+                ll_max_conf = ll_max_conf[keep]
+                ll_max_id = ll_max_id[keep]
+
+                for k in range(ll_box_array.shape[0]):
+                    bboxes.append(
+                        [
+                            ll_box_array[k, 0],
+                            ll_box_array[k, 1],
+                            ll_box_array[k, 2],
+                            ll_box_array[k, 3],
+                            ll_max_conf[k],
+                            ll_max_conf[k],
+                            ll_max_id[k],
+                        ]
+                    )
+
+        bboxes_batch.append(bboxes)
+
+    t3 = time.time()
+
+    print("-----------------------------------")
+    print("       max and argmax : %f" % (t2 - t1))
+    print("                  nms : %f" % (t3 - t2))
+    print("Post processing total : %f" % (t3 - t1))
+    print("-----------------------------------")
+
+    return bboxes_batch
+
+
+def do_detect(model, img, conf_thresh, nms_thresh, n_classes, device=None):
+    with torch.no_grad():
+        if type(img) == np.ndarray and len(img.shape) == 3:  # cv2 image
+            img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0)
+        elif type(img) == np.ndarray and len(img.shape) == 4:
+            img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0)
+        else:
+            print("unknown image type")
+            exit(-1)
+
+        img = torch.autograd.Variable(img)
+
+        output = model(img)
+        torch.save(output, "tests/ttnn/integration_tests/yolov4/ref.pt")
+
+        yolo1 = YoloLayer(
+            anchor_mask=[0, 1, 2],
+            num_classes=n_classes,
+            anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401],
+            num_anchors=9,
+            stride=8,
+        )
+
+        yolo2 = YoloLayer(
+            anchor_mask=[3, 4, 5],
+            num_classes=n_classes,
+            anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401],
+            num_anchors=9,
+            stride=16,
+        )
+
+        yolo3 = YoloLayer(
+            anchor_mask=[6, 7, 8],
+            num_classes=n_classes,
+            anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401],
+            num_anchors=9,
+            stride=32,
+        )
+
+        y1 = yolo1(output[0])
+        y2 = yolo2(output[1])
+        y3 = yolo3(output[2])
+
+        # The decoded (boxes, confs) pairs are returned directly so the test can
+        # compare them against the ttnn pipeline with PCC; the full
+        # get_region_boxes/post_processing path is intentionally skipped here.
+        return y1, y2, y3
+
+
+def do_detect_ttnn(model, img, conf_thresh, nms_thresh, n_classes, device=None):
+    if type(img) == np.ndarray and len(img.shape) == 3:  # cv2 image
+        img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0)
+    elif type(img) == np.ndarray and len(img.shape) == 4:
+        img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0)
+    else:
+        print("unknown image type")
+        exit(-1)
+
+    img = torch.autograd.Variable(img)
+
+    # Flatten NCHW -> [N, 1, H*W, C] for ttnn
+    input_tensor = torch.permute(img, (0, 2, 3, 1))
+    input_tensor = input_tensor.reshape(
+        input_tensor.shape[0], 1, input_tensor.shape[1] * input_tensor.shape[2], input_tensor.shape[3]
+    )
+    input_tensor = ttnn.from_torch(input_tensor, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT)
+    img = input_tensor
+
+    output = model(device, img)
+
+    output_tensor1 = ttnn.to_torch(output[0])
+    output_tensor1 = output_tensor1.reshape(1, 40, 40, 255)
+    output_tensor1 = torch.permute(output_tensor1, (0, 3, 1, 2))
+
+    output_tensor2 = ttnn.to_torch(output[1])
+    output_tensor2 = output_tensor2.reshape(1, 20, 20, 255)
+    output_tensor2 = torch.permute(output_tensor2, (0, 3, 1, 2))
+
+    output_tensor3 = ttnn.to_torch(output[2])
+    output_tensor3 = output_tensor3.reshape(1, 10, 10, 255)
+    output_tensor3 = torch.permute(output_tensor3, (0, 3, 1, 2))
+
+    yolo1 = YoloLayer(
+        anchor_mask=[0, 1, 2],
+        num_classes=n_classes,
+        anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401],
+        num_anchors=9,
+        stride=8,
+    )
+
+    yolo2 = YoloLayer(
+        anchor_mask=[3, 4, 5],
+        num_classes=n_classes,
+        anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401],
+        num_anchors=9,
+        stride=16,
+    )
+
+    yolo3 = YoloLayer(
+        anchor_mask=[6, 7, 8],
+        num_classes=n_classes,
+        anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401],
+        num_anchors=9,
+        stride=32,
+    )
+
+    y1 = yolo1(output_tensor1)
+    y2 = yolo2(output_tensor2)
+    y3 = yolo3(output_tensor3)
+
+    # As in do_detect(), return the decoded (boxes, confs) pairs for PCC
+    # comparison instead of running the full post-processing path.
+    return y1, y2, y3
+
+
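+# Editorial note: assert_with_pcc compares two tensors by their Pearson
+# correlation coefficient. Conceptually (a sketch of the idea, not the actual
+# helper implementation):
+#   a = torch_out.flatten().float()
+#   b = ttnn_out.flatten().float()
+#   pcc = torch.corrcoef(torch.stack([a, b]))[0, 1]
+#   assert pcc >= 0.99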
+
+
+@pytest.mark.parametrize("device_l1_small_size", [32768], indirect=True)
+@skip_for_wormhole_b0()
+def test_yolov4_model(device, reset_seeds):
+    state_dict = torch.load("tests/ttnn/integration_tests/yolov4/yolov4.pth")
+    ds_state_dict = {
+        k: v
+        for k, v in state_dict.items()
+        if (k.startswith(("down1.", "down2.", "down3.", "down4.", "down5.", "neek.", "head.")))
+    }
+    torch_model = Yolov4()
+
+    # Remap the checkpoint's keys onto the reference model's keys by position.
+    new_state_dict = {}
+    keys = [name for name, parameter in torch_model.state_dict().items()]
+    values = [parameter for name, parameter in ds_state_dict.items()]
+
+    for i in range(len(keys)):
+        new_state_dict[keys[i]] = values[i]
+
+    torch_model.load_state_dict(new_state_dict)
+    torch_model.eval()
+
+    n_classes = 80
+    namesfile = "tests/ttnn/integration_tests/yolov4/coco.names"
+    imgfile = "tests/ttnn/integration_tests/yolov4/giraffe_320.jpg"
+    width = 320
+    height = 320
+
+    img = cv2.imread(imgfile)
+
+    # An inference size of 416x416 does not mean the model was trained at the
+    # same size; training could have used 608x608 or another resolution.
+    # Valid inference sizes:
+    #   height in {320, 416, 512, 608, ..., 320 + 96 * n}
+    #   width  in {320, 416, 512, 608, ..., 320 + 96 * m}
+    sized = cv2.resize(img, (width, height))
+    sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB)
+
+    img = sized.copy()
+    if type(img) == np.ndarray and len(img.shape) == 3:  # cv2 image
+        img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0)
+    elif type(img) == np.ndarray and len(img.shape) == 4:
+        img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0)
+    else:
+        print("unknown image type")
+        exit(-1)
+
+    img = torch.autograd.Variable(img)
+    reader_patterns_cache = {}
+    parameters = preprocess_model(
+        initialize_model=lambda: torch_model,
+        run_model=lambda model: model(img),
+        custom_preprocessor=create_custom_preprocessor(device),
+        reader_patterns_cache=reader_patterns_cache,
+        device=device,
+    )
+
+    ttnn_model = TtYolov4(device, parameters)
+
+    for i in range(2):  # run twice for a speed check; the first iteration is usually slower
+        boxes = do_detect(torch_model, sized, 0.0, 0.2, n_classes)
+
+    for i in range(2):  # run twice for a speed check; the first iteration is usually slower
+        boxes2 = do_detect_ttnn(ttnn_model, sized, 0.2, 0.4, n_classes, device)
+
+    assert_with_pcc(boxes[0][0], boxes2[0][0], pcc=0.99)
+    assert_with_pcc(boxes[0][1], boxes2[0][1], pcc=0.99)
+    assert_with_pcc(boxes[1][0], boxes2[1][0], pcc=0.99)
+    assert_with_pcc(boxes[1][1], boxes2[1][1], pcc=0.99)
+    assert_with_pcc(boxes[2][0], boxes2[2][0], pcc=0.99)
+    assert_with_pcc(boxes[2][1], boxes2[2][1], pcc=0.99)
+
+    class_names = load_class_names(namesfile)
+    # plot_boxes_cv2(img, boxes[0], 'predictions_ref_relu.jpg', class_names)
+    # plot_boxes_cv2(img, boxes2[0], 'predictions_ttnn_relu.jpg', class_names)
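Editor's aside (not part of the patch): the commented-out plotting calls above would be driven by the full post-processing path. A sketch of that usage, assuming the (boxes, confs) pairs returned by do_detect in this file:

    y1, y2, y3 = do_detect(torch_model, sized, 0.0, 0.2, n_classes)
    bboxes_batch = post_processing(sized, 0.4, 0.6, get_region_boxes([y1, y2, y3]))
    class_names = load_class_names(namesfile)
    plot_boxes_cv2(sized, bboxes_batch[0], savename="predictions.jpg", class_names=class_names)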