From 4001928ec71f11d0f65b196d37a1d62d2f1926a3 Mon Sep 17 00:00:00 2001 From: keerthana-r-mcw Date: Tue, 23 Apr 2024 10:13:10 +0000 Subject: [PATCH] #7743: yolov4 integration --- .../reference/downsample1.py | 81 +++ .../reference/downsample2.py | 63 +++ .../reference/downsample3.py | 64 +++ .../reference/downsample4.py | 64 +++ .../reference/downsample5.py | 64 +++ .../functional_yolov4/reference/head.py | 146 +++++ .../functional_yolov4/reference/neck.py | 212 +++++++ .../functional_yolov4/reference/resblock.py | 13 +- .../functional_yolov4/reference/yolov4.py | 37 ++ .../functional_yolov4/tt/ttnn_downsample1.py | 87 +++ .../functional_yolov4/tt/ttnn_downsample2.py | 64 +++ .../functional_yolov4/tt/ttnn_downsample3.py | 64 +++ .../functional_yolov4/tt/ttnn_downsample4.py | 64 +++ .../functional_yolov4/tt/ttnn_downsample5.py | 68 +++ .../functional_yolov4/tt/ttnn_head.py | 186 +++++++ .../functional_yolov4/tt/ttnn_neck.py | 525 ++++++++++++++++++ .../functional_yolov4/tt/ttnn_resblock.py | 25 +- .../functional_yolov4/tt/ttnn_yolov4.py | 27 + .../yolov4/custom_preprocessor_d1.py | 159 ++++++ .../yolov4/custom_preprocessor_d2.py | 107 ++++ .../yolov4/custom_preprocessor_d3.py | 101 ++++ .../yolov4/custom_preprocessor_d4.py | 100 ++++ .../yolov4/custom_preprocessor_d5.py | 90 +++ .../yolov4/custom_preprocessor_head.py | 293 ++++++++++ .../yolov4/custom_preprocessor_neck.py | 376 +++++++++++++ .../yolov4/test_ttnn_yolov4.py | 120 ++++ 26 files changed, 3197 insertions(+), 3 deletions(-) create mode 100755 models/experimental/functional_yolov4/reference/downsample1.py create mode 100755 models/experimental/functional_yolov4/reference/downsample2.py create mode 100755 models/experimental/functional_yolov4/reference/downsample3.py create mode 100755 models/experimental/functional_yolov4/reference/downsample4.py create mode 100755 models/experimental/functional_yolov4/reference/downsample5.py create mode 100755 models/experimental/functional_yolov4/reference/head.py create 
mode 100755 models/experimental/functional_yolov4/reference/neck.py create mode 100755 models/experimental/functional_yolov4/reference/yolov4.py create mode 100755 models/experimental/functional_yolov4/tt/ttnn_downsample1.py create mode 100755 models/experimental/functional_yolov4/tt/ttnn_downsample2.py create mode 100755 models/experimental/functional_yolov4/tt/ttnn_downsample3.py create mode 100755 models/experimental/functional_yolov4/tt/ttnn_downsample4.py create mode 100755 models/experimental/functional_yolov4/tt/ttnn_downsample5.py create mode 100755 models/experimental/functional_yolov4/tt/ttnn_head.py create mode 100755 models/experimental/functional_yolov4/tt/ttnn_neck.py create mode 100755 models/experimental/functional_yolov4/tt/ttnn_yolov4.py create mode 100755 tests/ttnn/integration_tests/yolov4/custom_preprocessor_d1.py create mode 100755 tests/ttnn/integration_tests/yolov4/custom_preprocessor_d2.py create mode 100755 tests/ttnn/integration_tests/yolov4/custom_preprocessor_d3.py create mode 100755 tests/ttnn/integration_tests/yolov4/custom_preprocessor_d4.py create mode 100755 tests/ttnn/integration_tests/yolov4/custom_preprocessor_d5.py create mode 100755 tests/ttnn/integration_tests/yolov4/custom_preprocessor_head.py create mode 100755 tests/ttnn/integration_tests/yolov4/custom_preprocessor_neck.py create mode 100755 tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py diff --git a/models/experimental/functional_yolov4/reference/downsample1.py b/models/experimental/functional_yolov4/reference/downsample1.py new file mode 100755 index 00000000000..0551b646164 --- /dev/null +++ b/models/experimental/functional_yolov4/reference/downsample1.py @@ -0,0 +1,81 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0


import torch
import torch.nn as nn
import torch.nn.functional as F


class Mish(torch.nn.Module):
    """Mish activation: ``x * tanh(softplus(x))``."""

    def forward(self, x):
        return x * torch.tanh(F.softplus(x))


class DownSample1(nn.Module):
    """First CSPDarknet53 stage of YOLOv4.

    Takes a 3-channel image, produces a 64-channel map at half the input
    resolution. Layer attribute names (c1..c8, b1..b8, relu) are part of the
    state-dict contract and must not change.
    """

    def __init__(self):
        super().__init__()
        # Stem: 3 -> 32 at full resolution, then 32 -> 64 with stride 2.
        self.c1 = nn.Conv2d(3, 32, 3, 1, 1, bias=False)
        self.b1 = nn.BatchNorm2d(32)
        self.relu = Mish()

        self.c2 = nn.Conv2d(32, 64, 3, 2, 1, bias=False)
        self.b2 = nn.BatchNorm2d(64)

        # CSP split: two parallel 1x1 projections of the downsampled map.
        self.c3 = nn.Conv2d(64, 64, 1, 1, 0, bias=False)
        self.b3 = nn.BatchNorm2d(64)

        self.c4 = nn.Conv2d(64, 64, 1, 1, 0, bias=False)
        self.b4 = nn.BatchNorm2d(64)

        # Bottleneck residual pair: 1x1 squeeze to 32, 3x3 expand back to 64.
        self.c5 = nn.Conv2d(64, 32, 1, 1, 0, bias=False)
        self.b5 = nn.BatchNorm2d(32)

        self.c6 = nn.Conv2d(32, 64, 3, 1, 1, bias=False)
        self.b6 = nn.BatchNorm2d(64)

        self.c7 = nn.Conv2d(64, 64, 1, 1, 0, bias=False)
        self.b7 = nn.BatchNorm2d(64)

        # Final 1x1 fuse of the concatenated 64+64 branches back to 64.
        self.c8 = nn.Conv2d(128, 64, 1, 1, 0, bias=False)
        self.b8 = nn.BatchNorm2d(64)

    def forward(self, input: torch.Tensor):
        """Return the stage output; spatial size is half of ``input``'s."""
        stem = self.relu(self.b1(self.c1(input)))
        down = self.relu(self.b2(self.c2(stem)))

        # Both 1x1 projections branch off the same downsampled map.
        route = self.relu(self.b3(self.c3(down)))
        trunk = self.relu(self.b4(self.c4(down)))

        # Residual bottleneck added back onto the trunk branch.
        squeezed = self.relu(self.b5(self.c5(trunk)))
        expanded = self.relu(self.b6(self.c6(squeezed))) + trunk

        merged = self.relu(self.b7(self.c7(expanded)))
        merged = torch.cat([merged, route], dim=1)

        return self.relu(self.b8(self.c8(merged)))
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0


import torch
import torch.nn as nn
import torch.nn.functional as F
from models.experimental.functional_yolov4.reference.resblock import ResBlock


class Mish(torch.nn.Module):
    """Mish activation: ``x * tanh(softplus(x))``."""

    def forward(self, x):
        return x * torch.tanh(F.softplus(x))


class DownSample2(nn.Module):
    """Second CSPDarknet53 stage: 64 -> 128 channels at half resolution.

    Attribute names (c1..c5, b1..b5, res, relu) are part of the state-dict
    contract and must not change.
    """

    def __init__(self):
        super().__init__()
        # Strided 3x3 downsampling conv.
        self.c1 = nn.Conv2d(64, 128, 3, 2, 1, bias=False)
        self.b1 = nn.BatchNorm2d(128)
        self.relu = Mish()

        # CSP split: two parallel 1x1 projections of the downsampled map.
        self.c2 = nn.Conv2d(128, 64, 1, 1, 0, bias=False)
        self.b2 = nn.BatchNorm2d(64)

        self.c3 = nn.Conv2d(128, 64, 1, 1, 0, bias=False)
        self.b3 = nn.BatchNorm2d(64)

        # Two residual bottleneck blocks on the trunk branch.
        self.res = ResBlock(ch=64, nblocks=2)

        self.c4 = nn.Conv2d(64, 64, 1, 1, 0, bias=False)
        self.b4 = nn.BatchNorm2d(64)

        # Fuse the concatenated 64+64 branches back to 128.
        self.c5 = nn.Conv2d(128, 128, 1, 1, 0, bias=False)
        self.b5 = nn.BatchNorm2d(128)

    def forward(self, input: torch.Tensor):
        """Return the stage output; spatial size is half of ``input``'s."""
        down = self.relu(self.b1(self.c1(input)))

        route = self.relu(self.b2(self.c2(down)))
        trunk = self.relu(self.b3(self.c3(down)))

        trunk = self.relu(self.b4(self.c4(self.res(trunk))))

        fused = torch.cat([trunk, route], dim=1)
        return self.relu(self.b5(self.c5(fused)))
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0


import torch
import torch.nn as nn
import torch.nn.functional as F
from models.experimental.functional_yolov4.reference.resblock import ResBlock


class Mish(torch.nn.Module):
    """Mish activation: ``x * tanh(softplus(x))``."""

    def forward(self, x):
        return x * torch.tanh(F.softplus(x))


class DownSample3(nn.Module):
    """Third CSPDarknet53 stage: 128 -> 256 channels at half resolution.

    Attribute names (c1..c5, b1..b5, res, relu) are part of the state-dict
    contract and must not change.
    """

    def __init__(self):
        super().__init__()
        # Strided 3x3 downsampling conv.
        self.c1 = nn.Conv2d(128, 256, 3, 2, 1, bias=False)
        self.b1 = nn.BatchNorm2d(256)
        self.relu = Mish()

        # CSP split: two parallel 1x1 projections of the downsampled map.
        self.c2 = nn.Conv2d(256, 128, 1, 1, 0, bias=False)
        self.b2 = nn.BatchNorm2d(128)

        self.c3 = nn.Conv2d(256, 128, 1, 1, 0, bias=False)
        self.b3 = nn.BatchNorm2d(128)

        # Eight residual bottleneck blocks on the trunk branch.
        self.res = ResBlock(ch=128, nblocks=8)

        self.c4 = nn.Conv2d(128, 128, 1, 1, 0, bias=False)
        self.b4 = nn.BatchNorm2d(128)

        # Fuse the concatenated 128+128 branches back to 256.
        self.c5 = nn.Conv2d(256, 256, 1, 1, 0, bias=False)
        self.b5 = nn.BatchNorm2d(256)

    def forward(self, input: torch.Tensor):
        """Return the stage output; spatial size is half of ``input``'s."""
        down = self.relu(self.b1(self.c1(input)))

        route = self.relu(self.b2(self.c2(down)))
        trunk = self.relu(self.b3(self.c3(down)))

        trunk = self.relu(self.b4(self.c4(self.res(trunk))))

        fused = torch.cat([trunk, route], dim=1)
        return self.relu(self.b5(self.c5(fused)))
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0


import torch
import torch.nn as nn
import torch.nn.functional as F
from models.experimental.functional_yolov4.reference.resblock import ResBlock


class Mish(torch.nn.Module):
    """Mish activation: ``x * tanh(softplus(x))``."""

    def forward(self, x):
        return x * torch.tanh(F.softplus(x))


class DownSample4(nn.Module):
    """Fourth CSPDarknet53 stage: 256 -> 512 channels at half resolution.

    Attribute names (c1..c5, b1..b5, res, relu) are part of the state-dict
    contract and must not change.
    """

    def __init__(self):
        super().__init__()
        # Strided 3x3 downsampling conv.
        self.c1 = nn.Conv2d(256, 512, 3, 2, 1, bias=False)
        self.b1 = nn.BatchNorm2d(512)
        self.relu = Mish()

        # CSP split: two parallel 1x1 projections of the downsampled map.
        self.c2 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
        self.b2 = nn.BatchNorm2d(256)

        self.c3 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
        self.b3 = nn.BatchNorm2d(256)

        # Eight residual bottleneck blocks on the trunk branch.
        self.res = ResBlock(ch=256, nblocks=8)

        self.c4 = nn.Conv2d(256, 256, 1, 1, 0, bias=False)
        self.b4 = nn.BatchNorm2d(256)

        # Fuse the concatenated 256+256 branches back to 512.
        self.c5 = nn.Conv2d(512, 512, 1, 1, 0, bias=False)
        self.b5 = nn.BatchNorm2d(512)

    def forward(self, input: torch.Tensor):
        """Return the stage output; spatial size is half of ``input``'s."""
        down = self.relu(self.b1(self.c1(input)))

        route = self.relu(self.b2(self.c2(down)))
        trunk = self.relu(self.b3(self.c3(down)))

        trunk = self.relu(self.b4(self.c4(self.res(trunk))))

        fused = torch.cat([trunk, route], dim=1)
        return self.relu(self.b5(self.c5(fused)))
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0


import torch
import torch.nn as nn
import torch.nn.functional as F
from models.experimental.functional_yolov4.reference.resblock import ResBlock


class Mish(torch.nn.Module):
    """Mish activation: ``x * tanh(softplus(x))``."""

    def forward(self, x):
        return x * torch.tanh(F.softplus(x))


class DownSample5(nn.Module):
    """Fifth (deepest) CSPDarknet53 stage: 512 -> 1024 channels, half res.

    Attribute names (c1..c5, b1..b5, res, relu) are part of the state-dict
    contract and must not change.
    """

    def __init__(self):
        super().__init__()
        # Strided 3x3 downsampling conv.
        self.c1 = nn.Conv2d(512, 1024, 3, 2, 1, bias=False)
        self.b1 = nn.BatchNorm2d(1024)
        self.relu = Mish()

        # CSP split: two parallel 1x1 projections of the downsampled map.
        self.c2 = nn.Conv2d(1024, 512, 1, 1, 0, bias=False)
        self.b2 = nn.BatchNorm2d(512)

        self.c3 = nn.Conv2d(1024, 512, 1, 1, 0, bias=False)
        self.b3 = nn.BatchNorm2d(512)

        # Four residual bottleneck blocks on the trunk branch.
        self.res = ResBlock(ch=512, nblocks=4)

        self.c4 = nn.Conv2d(512, 512, 1, 1, 0, bias=False)
        self.b4 = nn.BatchNorm2d(512)

        # Fuse the concatenated 512+512 branches back to 1024.
        self.c5 = nn.Conv2d(1024, 1024, 1, 1, 0, bias=False)
        self.b5 = nn.BatchNorm2d(1024)

    def forward(self, input: torch.Tensor):
        """Return the stage output; spatial size is half of ``input``'s."""
        down = self.relu(self.b1(self.c1(input)))

        route = self.relu(self.b2(self.c2(down)))
        trunk = self.relu(self.b3(self.c3(down)))

        trunk = self.relu(self.b4(self.c4(self.res(trunk))))

        fused = torch.cat([trunk, route], dim=1)
        return self.relu(self.b5(self.c5(fused)))
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

import torch
import torch.nn as nn


class Head(nn.Module):
    """YOLOv4 detection head.

    Consumes the three neck feature maps and emits three raw prediction
    maps, each with 255 channels (3 anchors x (4 box + 1 obj + 80 classes)).
    Attribute names (c1..c18, b*, relu) are part of the state-dict contract
    and must not change.
    """

    def __init__(self):
        super().__init__()
        output_ch = 255

        # Finest-scale branch (operates on the 128-channel neck map).
        self.c1 = nn.Conv2d(128, 256, 3, 1, 1, bias=False)
        self.b1 = nn.BatchNorm2d(256)
        self.relu = nn.LeakyReLU(0.1, inplace=True)
        self.c2 = nn.Conv2d(256, output_ch, 1, 1, 0, bias=True)

        # Stride-2 conv feeding the mid-scale branch (route "-4" in darknet).
        self.c3 = nn.Conv2d(128, 256, 3, 2, 1, bias=False)
        self.b3 = nn.BatchNorm2d(256)

        # Mid-scale branch: alternating 1x1 / 3x3 convs after concat.
        self.c4 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
        self.b4 = nn.BatchNorm2d(256)
        self.c5 = nn.Conv2d(256, 512, 3, 1, 1, bias=False)
        self.b5 = nn.BatchNorm2d(512)
        self.c6 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
        self.b6 = nn.BatchNorm2d(256)
        self.c7 = nn.Conv2d(256, 512, 3, 1, 1, bias=False)
        self.b7 = nn.BatchNorm2d(512)
        self.c8 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
        self.b8 = nn.BatchNorm2d(256)
        self.c9 = nn.Conv2d(256, 512, 3, 1, 1, bias=False)
        self.b9 = nn.BatchNorm2d(512)
        self.c10 = nn.Conv2d(512, output_ch, 1, 1, 0, bias=True)

        # Stride-2 conv feeding the coarsest branch (route "-4").
        self.c11 = nn.Conv2d(256, 512, 3, 2, 1, bias=False)
        self.b11 = nn.BatchNorm2d(512)

        # Coarsest-scale branch.
        self.c12 = nn.Conv2d(1024, 512, 1, 1, 0, bias=False)
        self.b12 = nn.BatchNorm2d(512)
        self.c13 = nn.Conv2d(512, 1024, 3, 1, 1, bias=False)
        self.b13 = nn.BatchNorm2d(1024)
        self.c14 = nn.Conv2d(1024, 512, 1, 1, 0, bias=False)
        self.b14 = nn.BatchNorm2d(512)
        self.c15 = nn.Conv2d(512, 1024, 3, 1, 1, bias=False)
        self.b15 = nn.BatchNorm2d(1024)
        self.c16 = nn.Conv2d(1024, 512, 1, 1, 0, bias=False)
        self.b16 = nn.BatchNorm2d(512)
        self.c17 = nn.Conv2d(512, 1024, 3, 1, 1, bias=False)
        self.b17 = nn.BatchNorm2d(1024)
        self.c18 = nn.Conv2d(1024, output_ch, 1, 1, 0, bias=True)

    def forward(self, inputs):
        """inputs: [finest neck map (128ch), coarsest neck map (512ch),
        mid neck map (256ch)]. Returns three raw prediction maps, finest
        to coarsest."""
        fine, coarse, mid = inputs[0], inputs[1], inputs[2]

        # Finest-scale prediction.
        t = self.relu(self.b1(self.c1(fine)))
        out_fine = self.c2(t)

        # Downsample the fine map and merge with the mid-scale neck output.
        t = self.relu(self.b3(self.c3(fine)))
        t = torch.cat([t, mid], dim=1)
        t = self.relu(self.b4(self.c4(t)))
        t = self.relu(self.b5(self.c5(t)))
        t = self.relu(self.b6(self.c6(t)))
        t = self.relu(self.b7(self.c7(t)))
        # This activation is reused as the route into the coarsest branch.
        branch = self.relu(self.b8(self.c8(t)))
        t = self.relu(self.b9(self.c9(branch)))
        out_mid = self.c10(t)

        # Downsample the mid branch and merge with the coarsest neck output.
        t = self.relu(self.b11(self.c11(branch)))
        t = torch.cat([t, coarse], dim=1)
        t = self.relu(self.b12(self.c12(t)))
        t = self.relu(self.b13(self.c13(t)))
        t = self.relu(self.b14(self.c14(t)))
        t = self.relu(self.b15(self.c15(t)))
        t = self.relu(self.b16(self.c16(t)))
        t = self.relu(self.b17(self.c17(t)))
        out_coarse = self.c18(t)

        return out_fine, out_mid, out_coarse
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0


import torch
import torch.nn as nn


class Neck(nn.Module):
    """YOLOv4 neck: SPP block on the deepest backbone map followed by a
    top-down PAN path with two nearest-neighbour upsamples.

    Attribute names (c*, b*, p1..p3, u, relu) are part of the state-dict
    contract and must not change.
    """

    def __init__(self):
        super().__init__()
        # SPP entry: three conv-bn-leaky blocks on the 1024-channel map.
        self.c1 = nn.Conv2d(1024, 512, 1, 1, 0, bias=False)
        self.b1 = nn.BatchNorm2d(512)
        self.relu = nn.LeakyReLU(0.1, inplace=True)

        self.c2 = nn.Conv2d(512, 1024, 3, 1, 1, bias=False)
        self.b2 = nn.BatchNorm2d(1024)

        self.c3 = nn.Conv2d(1024, 512, 1, 1, 0, bias=False)
        self.b3 = nn.BatchNorm2d(512)

        # Spatial pyramid pooling: stride-1 max-pools with 5/9/13 kernels.
        self.p1 = nn.MaxPool2d(kernel_size=5, stride=1, padding=2, dilation=1, ceil_mode=False)
        self.p2 = nn.MaxPool2d(kernel_size=9, stride=1, padding=4, dilation=1, ceil_mode=False)
        self.p3 = nn.MaxPool2d(kernel_size=13, stride=1, padding=6, dilation=1, ceil_mode=False)

        # Post-SPP stack (2048 = 4 x 512 concatenated maps).
        self.c4 = nn.Conv2d(2048, 512, 1, 1, 0, bias=False)
        self.b4 = nn.BatchNorm2d(512)

        self.c5 = nn.Conv2d(512, 1024, 3, 1, 1, bias=False)
        self.b5 = nn.BatchNorm2d(1024)

        self.c6 = nn.Conv2d(1024, 512, 1, 1, 0, bias=False)
        self.b6 = nn.BatchNorm2d(512)

        self.c7 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
        self.b7 = nn.BatchNorm2d(256)

        # Shared nearest-neighbour x2 upsample (used twice).
        self.u = nn.Upsample(scale_factor=(2, 2), mode="nearest")

        # Lateral conv for the downsample4 feature map.
        self.c7_2 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
        self.b7_2 = nn.BatchNorm2d(256)

        # First top-down fusion stack.
        self.c7_3 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
        self.b7_3 = nn.BatchNorm2d(256)

        self.c8 = nn.Conv2d(256, 512, 3, 1, 1, bias=False)
        self.b8 = nn.BatchNorm2d(512)

        self.c7_4 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
        self.b7_4 = nn.BatchNorm2d(256)

        self.c8_2 = nn.Conv2d(256, 512, 3, 1, 1, bias=False)
        self.b8_2 = nn.BatchNorm2d(512)

        self.c7_5 = nn.Conv2d(512, 256, 1, 1, 0, bias=False)
        self.b7_5 = nn.BatchNorm2d(256)

        self.c9 = nn.Conv2d(256, 128, 1, 1, 0, bias=False)
        self.b9 = nn.BatchNorm2d(128)

        # Lateral conv for the downsample3 feature map.
        self.c9_2 = nn.Conv2d(256, 128, 1, 1, 0, bias=False)
        self.b9_2 = nn.BatchNorm2d(128)

        # Second top-down fusion stack.
        self.c9_3 = nn.Conv2d(256, 128, 1, 1, 0, bias=False)
        self.b9_3 = nn.BatchNorm2d(128)

        self.c10 = nn.Conv2d(128, 256, 3, 1, 1, bias=False)
        self.b10 = nn.BatchNorm2d(256)

        self.c9_4 = nn.Conv2d(256, 128, 1, 1, 0, bias=False)
        self.b9_4 = nn.BatchNorm2d(128)

        self.c10_2 = nn.Conv2d(128, 256, 3, 1, 1, bias=False)
        self.b10_2 = nn.BatchNorm2d(256)

        self.c9_5 = nn.Conv2d(256, 128, 1, 1, 0, bias=False)
        self.b9_5 = nn.BatchNorm2d(128)

    def forward(self, inputs):
        """inputs: [downsample5 (1024ch), downsample4 (512ch),
        downsample3 (256ch)]. Returns (finest 128ch map, deepest 512ch map,
        mid 256ch map) for the head."""
        d5, d4, d3 = inputs[0], inputs[1], inputs[2]

        # SPP entry stack.
        t = self.relu(self.b1(self.c1(d5)))
        t = self.relu(self.b2(self.c2(t)))
        spp_in = self.relu(self.b3(self.c3(t)))

        # SPP: concat the three pooled maps with the un-pooled input
        # (largest kernel first, matching the original ordering).
        spp = torch.cat([self.p3(spp_in), self.p2(spp_in), self.p1(spp_in), spp_in], dim=1)

        t = self.relu(self.b4(self.c4(spp)))
        t = self.relu(self.b5(self.c5(t)))
        deep = self.relu(self.b6(self.c6(t)))  # returned deepest output
        t = self.relu(self.b7(self.c7(deep)))
        up1 = self.u(t)

        # First lateral fusion with the downsample4 map.
        lateral4 = self.relu(self.b7_2(self.c7_2(d4)))
        t = torch.cat([lateral4, up1], dim=1)
        t = self.relu(self.b7_3(self.c7_3(t)))
        t = self.relu(self.b8(self.c8(t)))
        t = self.relu(self.b7_4(self.c7_4(t)))
        t = self.relu(self.b8_2(self.c8_2(t)))
        mid = self.relu(self.b7_5(self.c7_5(t)))  # returned mid output
        t = self.relu(self.b9(self.c9(mid)))
        up2 = self.u(t)

        # Second lateral fusion with the downsample3 map.
        lateral3 = self.relu(self.b9_2(self.c9_2(d3)))
        t = torch.cat([lateral3, up2], dim=1)
        t = self.relu(self.b9_3(self.c9_3(t)))
        t = self.relu(self.b10(self.c10(t)))
        t = self.relu(self.b9_4(self.c9_4(t)))
        t = self.relu(self.b10_2(self.c10_2(t)))
        fine = self.relu(self.b9_5(self.c9_5(t)))  # returned finest output

        return fine, deep, mid
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

from models.experimental.functional_yolov4.reference.downsample1 import DownSample1
from models.experimental.functional_yolov4.reference.downsample2 import DownSample2
from models.experimental.functional_yolov4.reference.downsample3 import DownSample3
from models.experimental.functional_yolov4.reference.downsample4 import DownSample4
from models.experimental.functional_yolov4.reference.downsample5 import DownSample5
from models.experimental.functional_yolov4.reference.neck import Neck
from models.experimental.functional_yolov4.reference.head import Head

import torch
import torch.nn as nn


class Yolov4(nn.Module):
    """Full YOLOv4 reference model: five CSPDarknet53 downsample stages,
    SPP/PAN neck, and the three-scale detection head."""

    def __init__(self):
        super().__init__()
        self.downsample1 = DownSample1()
        self.downsample2 = DownSample2()
        self.downsample3 = DownSample3()
        self.downsample4 = DownSample4()
        self.downsample5 = DownSample5()
        self.neck = Neck()
        self.head = Head()

    def forward(self, input: torch.Tensor):
        """Run the whole network; returns the head's three raw prediction
        maps (finest to coarsest scale)."""
        # Backbone: each stage halves the spatial resolution.
        d1 = self.downsample1(input)
        d2 = self.downsample2(d1)
        d3 = self.downsample3(d2)
        d4 = self.downsample4(d3)
        d5 = self.downsample5(d4)
        # Neck fuses the three deepest maps; head consumes them in the
        # same (fine, deep, mid) order the neck returns them.
        neck_fine, neck_deep, neck_mid = self.neck([d5, d4, d3])
        return self.head([neck_fine, neck_deep, neck_mid])
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0


import ttnn
import tt_lib


class TtDownSample1:
    # TT-NN port of the reference DownSample1 backbone stage.  `parameters`
    # supplies one pre-built conv op per reference conv layer (c1..c8);
    # presumably batch-norm is folded into those convs by the weight
    # preprocessor -- TODO confirm against custom_preprocessor_d1.
    def __init__(
        self,
        parameters,
    ) -> None:
        self.c1 = parameters.c1
        self.c2 = parameters.c2
        self.c3 = parameters.c3
        self.c4 = parameters.c4
        self.c5 = parameters.c5
        self.c6 = parameters.c6
        self.c7 = parameters.c7
        self.c8 = parameters.c8

    def __call__(self, device, input_tensor):
        # Move the host tensor onto the device in c1's expected sharded config.
        input_tensor = input_tensor.to(device, self.c1.conv.input_sharded_memory_config)

        # Recurring pattern after each conv below: round-trip through a host
        # torch tensor to obtain an interleaved TILE_LAYOUT tensor, apply
        # Mish on device, then re-shard for the next conv.  NOTE(review):
        # the host round-trip looks like a layout workaround and is costly;
        # confirm whether a device-side ttnn.to_layout would suffice.
        output_tensor = self.c1(input_tensor)
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = ttnn.from_torch(output_tensor,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = ttnn.mish(output_tensor)
        # NOTE(review): re-shards to c1's *input* config even though the
        # result feeds c2 -- presumably the two configs match; verify.
        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c1.conv.input_sharded_memory_config)


        output_tensor = self.c2(output_tensor)
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = ttnn.from_torch(output_tensor,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = ttnn.mish(output_tensor)
        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c3.conv.input_sharded_memory_config)

        # CSP split: c2's activation feeds both c3 (route branch, saved as
        # output_tensor_c2) and c4 (trunk branch), mirroring the reference.
        output_tensor_c2 = output_tensor
        output_tensor = self.c3(output_tensor)
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = ttnn.from_torch(output_tensor,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = ttnn.mish(output_tensor)
        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c3.conv.input_sharded_memory_config)

        # c3's activation is kept for the final concat.
        output_tensor_c3 = output_tensor

        output_tensor_c2 = ttnn.to_torch(output_tensor_c2)
        output_tensor_c2 = ttnn.from_torch(output_tensor_c2,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = self.c4(output_tensor_c2)
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = ttnn.from_torch(output_tensor,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = ttnn.mish(output_tensor)
        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c3.conv.input_sharded_memory_config)


        # c4's activation is the residual input for the add after c6.
        output_tensor_c4 = output_tensor
        output_tensor = self.c5(output_tensor)
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = ttnn.from_torch(output_tensor,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = ttnn.mish(output_tensor)
        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c6.conv.input_sharded_memory_config)


        output_tensor = self.c6(output_tensor)
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = ttnn.from_torch(output_tensor,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = ttnn.mish(output_tensor)
        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c7.conv.input_sharded_memory_config)


        # Residual add (reference: x6_m + x4_m).
        output_tensor = output_tensor + output_tensor_c4
        output_tensor = self.c7(output_tensor)
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = ttnn.from_torch(output_tensor,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = ttnn.mish(output_tensor)

        # Concat with the saved c3 branch; dim=3 is presumably the channel
        # dim of ttnn's channels-last layout (reference uses dim=1 in NCHW)
        # -- TODO confirm.
        output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
        output_tensor = ttnn.concat([output_tensor, output_tensor_c3], dim=3)

        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c8.conv.input_sharded_memory_config)
        output_tensor = self.c8(output_tensor)
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = ttnn.from_torch(output_tensor,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = ttnn.mish(output_tensor)
        # Result is handed back on host-accessible form for the next stage.
        return ttnn.from_device(output_tensor)
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0



import ttnn
import tt_lib
from models.experimental.functional_yolov4.tt.ttnn_resblock import TtResBlock


class TtDownSample2:
    # TT-NN port of the reference DownSample2 stage.  `parameters` supplies
    # the pre-built conv ops (c1..c5) plus the ResBlock parameters;
    # presumably batch-norm is folded into the convs by the preprocessor --
    # TODO confirm against custom_preprocessor_d2.
    def __init__(
        self,
        parameters,
    ) -> None:
        self.c1 = parameters.c1
        self.c2 = parameters.c2
        self.c3 = parameters.c3
        # Two residual blocks, matching ResBlock(ch=64, nblocks=2) in the
        # reference model.
        self.res = TtResBlock(parameters.res, 2, True)
        self.c4 = parameters.c4
        self.c5 = parameters.c5

    def __call__(self, device, input_tensor):
        # Move the host tensor onto the device in c1's expected sharded config.
        input_tensor = input_tensor.to(device, self.c1.conv.input_sharded_memory_config)

        # Recurring pattern: round-trip to host for an interleaved
        # TILE_LAYOUT tensor, apply Mish on device, re-shard for the next
        # conv.  NOTE(review): host round-trips are costly; confirm whether
        # device-side ttnn.to_layout could replace them.
        output_tensor = self.c1(input_tensor)
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = ttnn.from_torch(output_tensor,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = ttnn.mish(output_tensor)
        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c2.conv.input_sharded_memory_config)

        # CSP split: c1's activation feeds both c2 (route) and c3 (trunk).
        output_tensor_c1 = output_tensor
        output_tensor = self.c2(output_tensor)
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = ttnn.from_torch(output_tensor,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = ttnn.mish(output_tensor)
        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c4.conv.input_sharded_memory_config)

        # c2's activation is kept for the concat after c4.
        output_tensor_c2 = output_tensor
        output_tensor = self.c3(output_tensor_c1)
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = ttnn.from_torch(output_tensor,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = ttnn.mish(output_tensor)
        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c4.conv.input_sharded_memory_config)

        # Residual blocks on the trunk branch.
        output_tensor = self.res(device, output_tensor)

        output_tensor = output_tensor.to(device, self.c4.conv.input_sharded_memory_config)
        output_tensor = self.c4(output_tensor)
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = ttnn.from_torch(output_tensor,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = ttnn.mish(output_tensor)

        # Concat trunk with the saved route branch; dim=3 is presumably the
        # channel dim in ttnn's channels-last layout (reference uses dim=1
        # in NCHW) -- TODO confirm.
        output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
        output_tensor = ttnn.concat([output_tensor, output_tensor_c2], dim=3)

        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c5.conv.input_sharded_memory_config)
        output_tensor = self.c5(output_tensor)
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = ttnn.from_torch(output_tensor,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = ttnn.mish(output_tensor)
        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c5.conv.input_sharded_memory_config)
        return ttnn.from_device(output_tensor)
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0


from models.experimental.functional_yolov4.tt.ttnn_resblock import TtResBlock


import ttnn
import tt_lib


class TtDownSample3:
    # TT-NN port of the reference DownSample3 stage.  `parameters` supplies
    # the pre-built conv ops (c1..c5) plus the ResBlock parameters;
    # presumably batch-norm is folded into the convs by the preprocessor --
    # TODO confirm against custom_preprocessor_d3.
    def __init__(
        self,
        parameters,
    ) -> None:
        self.c1 = parameters.c1
        self.c2 = parameters.c2
        self.c3 = parameters.c3
        # Eight residual blocks, matching ResBlock(ch=128, nblocks=8) in
        # the reference model.
        self.res = TtResBlock(parameters.res, 8, True)
        self.c4 = parameters.c4
        self.c5 = parameters.c5

    def __call__(self, device, input_tensor):
        # Move the host tensor onto the device in c1's expected sharded config.
        input_tensor = input_tensor.to(device, self.c1.conv.input_sharded_memory_config)

        # Recurring pattern: round-trip to host for an interleaved
        # TILE_LAYOUT tensor, apply Mish on device, re-shard for the next
        # conv.  NOTE(review): host round-trips are costly; confirm whether
        # device-side ttnn.to_layout could replace them.
        output_tensor_c1 = self.c1(input_tensor)
        output_tensor_c1 = ttnn.to_torch(output_tensor_c1)
        output_tensor_c1 = ttnn.from_torch(output_tensor_c1,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor_c1 = ttnn.mish(output_tensor_c1)
        output_tensor_c1 = tt_lib.tensor.interleaved_to_sharded(output_tensor_c1, self.c3.conv.input_sharded_memory_config)


        # CSP route branch; kept for the concat after c4.
        output_tensor_c2 = self.c2(output_tensor_c1)
        output_tensor_c2 = ttnn.to_torch(output_tensor_c2)
        output_tensor_c2 = ttnn.from_torch(output_tensor_c2,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor_c2 = ttnn.mish(output_tensor_c2)
        output_tensor_c2 = tt_lib.tensor.interleaved_to_sharded(output_tensor_c2, self.c4.conv.input_sharded_memory_config)

        # CSP trunk branch.
        output_tensor = self.c3(output_tensor_c1)
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = ttnn.from_torch(output_tensor,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = ttnn.mish(output_tensor)
        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c4.conv.input_sharded_memory_config)

        # Residual blocks on the trunk branch.
        output_tensor = self.res(device, output_tensor)
        output_tensor = output_tensor.to(device, self.c4.conv.input_sharded_memory_config)

        output_tensor = self.c4(output_tensor)
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = ttnn.from_torch(output_tensor,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = ttnn.mish(output_tensor)

        # Concat trunk with the saved route branch; dim=3 is presumably the
        # channel dim in ttnn's channels-last layout (reference uses dim=1
        # in NCHW) -- TODO confirm.
        output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT)
        output_tensor = ttnn.concat([output_tensor, output_tensor_c2], dim=3)
        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c5.conv.input_sharded_memory_config)

        output_tensor = self.c5(output_tensor)
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = ttnn.from_torch(output_tensor,dtype = ttnn.bfloat16, device=device, layout = ttnn.TILE_LAYOUT)
        output_tensor = ttnn.mish(output_tensor)
        output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c5.conv.input_sharded_memory_config)
        return ttnn.from_device(output_tensor)
class TtDownSample4:
    """Fourth YOLOv4 downsample stage (ttnn).

    CSP block: c1 -> split into c2 (shortcut) and c3 -> 8x ResBlock -> c4
    (main path), channel-concat (dim 3, NHWC), then fuse with c5.
    """

    def __init__(
        self,
        parameters,
    ) -> None:
        self.c1 = parameters.c1
        self.c2 = parameters.c2
        self.c3 = parameters.c3
        # 8 residual blocks, shortcut enabled.
        self.res = TtResBlock(parameters.res, 8, True)
        self.c4 = parameters.c4
        self.c5 = parameters.c5

    def _mish_fallback(self, tensor, device):
        """Apply Mish after re-uploading the tensor as interleaved bfloat16.

        Host round-trip before ``ttnn.mish`` — presumably because mish
        cannot consume the sharded conv output directly (TODO confirm).
        """
        tensor = ttnn.to_torch(tensor)
        tensor = ttnn.from_torch(tensor, dtype=ttnn.bfloat16, device=device, layout=ttnn.TILE_LAYOUT)
        return ttnn.mish(tensor)

    def __call__(self, device, input_tensor):
        input_tensor = input_tensor.to(device, self.c1.conv.input_sharded_memory_config)

        out = self._mish_fallback(self.c1(input_tensor), device)
        out = tt_lib.tensor.interleaved_to_sharded(out, self.c2.conv.input_sharded_memory_config)
        out_c1 = out

        # Shortcut path.
        out = self._mish_fallback(self.c2(out), device)
        out = tt_lib.tensor.interleaved_to_sharded(out, self.c4.conv.input_sharded_memory_config)
        out_c2 = out

        # Main path: c3 -> residual stack -> c4.
        out = self._mish_fallback(self.c3(out_c1), device)
        out = tt_lib.tensor.interleaved_to_sharded(out, self.c4.conv.input_sharded_memory_config)

        out = self.res(device, out)
        out = out.to(device, self.c4.conv.input_sharded_memory_config)
        out = self._mish_fallback(self.c4(out), device)

        # Merge both paths on the channel axis (NHWC), then fuse with c5.
        out = ttnn.to_layout(out, layout=ttnn.TILE_LAYOUT)
        out = ttnn.concat([out, out_c2], dim=3)
        out = out.to(device, self.c5.conv.input_sharded_memory_config)

        out = self._mish_fallback(self.c5(out), device)

        out = ttnn.to_layout(out, layout=ttnn.TILE_LAYOUT)
        return ttnn.from_device(out)
class TtDownSample5:
    """Fifth (last) YOLOv4 downsample stage (ttnn).

    CSP block: c1 -> split into c2 (shortcut) and c3 -> 4x ResBlock -> c4
    (main path), channel-concat (dim 3, NHWC), then fuse with c5.
    """

    def __init__(
        self,
        parameters,
    ) -> None:
        self.c1 = parameters.c1
        self.c2 = parameters.c2
        self.c3 = parameters.c3
        # 4 residual blocks, shortcut enabled (stages 3/4 use 8).
        self.res = TtResBlock(parameters.res, 4, True)
        self.c4 = parameters.c4
        self.c5 = parameters.c5

    def _mish_fallback(self, tensor, device):
        """Apply Mish after re-uploading the tensor as interleaved bfloat16.

        Host round-trip before ``ttnn.mish`` — presumably because mish
        cannot consume the sharded conv output directly (TODO confirm).
        """
        tensor = ttnn.to_torch(tensor)
        tensor = ttnn.from_torch(tensor, dtype=ttnn.bfloat16, device=device, layout=ttnn.TILE_LAYOUT)
        return ttnn.mish(tensor)

    def __call__(self, device, input_tensor):
        input_tensor = input_tensor.to(device, self.c1.conv.input_sharded_memory_config)

        out = self._mish_fallback(self.c1(input_tensor), device)
        out = tt_lib.tensor.interleaved_to_sharded(out, self.c2.conv.input_sharded_memory_config)
        out_c1 = out

        # Shortcut path.
        out = self._mish_fallback(self.c2(out), device)
        out = tt_lib.tensor.interleaved_to_sharded(out, self.c3.conv.input_sharded_memory_config)
        out_c2 = out

        # Main path: c3 -> residual stack -> c4.
        out = self._mish_fallback(self.c3(out_c1), device)
        out = tt_lib.tensor.interleaved_to_sharded(out, self.c4.conv.input_sharded_memory_config)

        out = self.res(device, out)
        out = out.to(device, self.c4.conv.input_sharded_memory_config)

        out = self._mish_fallback(self.c4(out), device)
        # NOTE(review): shard then immediately unshard back to L1 — the pair
        # may only serve to move the tensor into L1; confirm both are needed.
        out = tt_lib.tensor.interleaved_to_sharded(out, self.c5.conv.input_sharded_memory_config)
        out = tt_lib.tensor.sharded_to_interleaved(out, ttnn.L1_MEMORY_CONFIG)

        # Merge both paths on the channel axis (NHWC), then fuse with c5.
        out = ttnn.to_layout(out, layout=ttnn.TILE_LAYOUT)
        out = ttnn.concat([out, out_c2], dim=3)
        out = out.to(device, self.c5.conv.input_sharded_memory_config)

        out = self._mish_fallback(self.c5(out), device)
        out = tt_lib.tensor.interleaved_to_sharded(out, self.c5.conv.input_sharded_memory_config)
        out = tt_lib.tensor.sharded_to_interleaved(out, ttnn.L1_MEMORY_CONFIG)
        out = ttnn.to_layout(out, layout=ttnn.TILE_LAYOUT)
        return ttnn.from_device(out)
class TtHead:
    """YOLOv4 detection head (ttnn).

    Consumes the three neck feature maps (``input_tensors[0..2]``) and
    produces the three raw 255-channel detection maps.  The intermediate
    convs come pre-built in ``parameters`` and are activated with
    LeakyReLU via a host round-trip; the final 1x1 255-channel convs
    (c2, c10, c18) run on host through ``tt_lib.fallback_ops.Conv2d``.
    """

    def output_preprocessing(self, output_tensor, device):
        # Convert the NHWC ttnn tensor to an NCHW tt tensor so the
        # fallback (torch-based) conv can consume it.
        output_tensor = ttnn.to_torch(output_tensor)
        output_tensor = torch.permute(output_tensor, (0, 3, 1, 2))
        output_tensor = torch_to_tt_tensor_rm(output_tensor, device, put_on_device=True)
        return output_tensor

    def __init__(self, device, parameters) -> None:
        self.device = device
        self.c1 = parameters.c1
        self.c2 = tt_lib.fallback_ops.Conv2d(
            parameters.c2["weight"], parameters.c2["bias"], 256, 255, 1, 1, 0, bias=True
        )
        self.c3 = parameters.c3
        self.c4 = parameters.c4
        self.c5 = parameters.c5
        self.c6 = parameters.c6
        self.c7 = parameters.c7
        self.c8 = parameters.c8
        self.c9 = parameters.c9
        # bias=True: a bias tensor is supplied and c2/c18 use bias=True.
        # The previous code passed bias=False here, silently dropping the
        # learned bias of this detection conv.
        self.c10 = tt_lib.fallback_ops.Conv2d(
            parameters.c10["weight"], parameters.c10["bias"], 512, 255, 1, 1, 0, bias=True
        )
        self.c11 = parameters.c11
        self.c12 = parameters.c12
        self.c13 = parameters.c13
        self.c14 = parameters.c14
        self.c15 = parameters.c15
        self.c16 = parameters.c16
        self.c17 = parameters.c17
        self.c18 = tt_lib.fallback_ops.Conv2d(
            parameters.c18["weight"], parameters.c18["bias"], 1024, 255, 1, 1, 0, bias=True
        )
        self.leaky_relu = nn.LeakyReLU(0.1, inplace=True)

    def _leaky_fallback(self, tensor, device):
        """Apply LeakyReLU(0.1) on host and re-upload as bfloat16 TILE_LAYOUT."""
        tensor = ttnn.to_torch(tensor)
        tensor = self.leaky_relu(tensor)
        return ttnn.from_torch(tensor, device=device, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT)

    def _detect(self, tensor, conv, device):
        """Run a final 255-channel fallback conv and return the NHWC result.

        NOTE(review): LeakyReLU is applied to the raw detection map here —
        the reference YOLO head usually leaves the final conv linear;
        verify against the reference implementation.
        """
        tensor = self.output_preprocessing(tensor, device)
        tensor = conv(tensor)
        tensor = ttnn.to_layout(tensor, layout=ttnn.TILE_LAYOUT)
        tensor = ttnn.permute(tensor, (0, 2, 3, 1))
        return self._leaky_fallback(tensor, device)

    def __call__(self, device, input_tensors):
        input_tensor = input_tensors[0].to(device, self.c1.conv.input_sharded_memory_config)

        # ---- branch 1: c1 -> c2 detection map --------------------------
        x = self._leaky_fallback(self.c1(input_tensor), device)
        output_tensor_res1 = self._detect(x, self.c2, device)

        # ---- branch 2: concat with neck output, c4..c9 -> c10 ----------
        x = self._leaky_fallback(self.c3(input_tensor), device)

        out_neck1 = input_tensors[2].to(device)
        x = ttnn.to_layout(x, layout=ttnn.TILE_LAYOUT)
        x = ttnn.concat([x, out_neck1], dim=3)

        x = self._leaky_fallback(self.c4(x), device)
        x = tt_lib.tensor.interleaved_to_sharded(x, self.c5.conv.input_sharded_memory_config)

        x = self._leaky_fallback(self.c5(x), device)

        x = self._leaky_fallback(self.c6(x), device)
        x = tt_lib.tensor.interleaved_to_sharded(x, self.c7.conv.input_sharded_memory_config)

        x = self._leaky_fallback(self.c7(x), device)
        x = tt_lib.tensor.interleaved_to_sharded(x, self.c8.conv.input_sharded_memory_config)

        x = self._leaky_fallback(self.c8(x), device)
        x = tt_lib.tensor.interleaved_to_sharded(x, self.c9.conv.input_sharded_memory_config)

        branch2 = x  # re-used below as the input of c11

        x = self._leaky_fallback(self.c9(x), device)
        x = self._detect(x, self.c10, device)
        # NOTE(review): sharding the 255-channel detection output with c11's
        # input config looks suspicious (c11 consumes branch2, not this) —
        # confirm intended.
        x = tt_lib.tensor.interleaved_to_sharded(x, self.c11.conv.input_sharded_memory_config)
        output_tensor_res2 = x

        # ---- branch 3: concat with neck output, c12..c17 -> c18 --------
        x = self._leaky_fallback(self.c11(branch2), device)

        out_neck2 = input_tensors[1].to(device)
        x = ttnn.to_layout(x, layout=ttnn.TILE_LAYOUT)
        x = ttnn.concat([x, out_neck2], dim=3)

        x = self._leaky_fallback(self.c12(x), device)
        x = tt_lib.tensor.interleaved_to_sharded(x, self.c13.conv.input_sharded_memory_config)

        x = self._leaky_fallback(self.c13(x), device)
        x = tt_lib.tensor.interleaved_to_sharded(x, self.c14.conv.input_sharded_memory_config)

        x = self._leaky_fallback(self.c14(x), device)
        x = tt_lib.tensor.interleaved_to_sharded(x, self.c15.conv.input_sharded_memory_config)

        x = self._leaky_fallback(self.c15(x), device)
        x = tt_lib.tensor.interleaved_to_sharded(x, self.c16.conv.input_sharded_memory_config)

        x = self._leaky_fallback(self.c16(x), device)
        x = tt_lib.tensor.interleaved_to_sharded(x, self.c17.conv.input_sharded_memory_config)

        x = self._leaky_fallback(self.c17(x), device)

        output_tensor_res3 = self._detect(x, self.c18, device)

        return (
            ttnn.from_device(output_tensor_res1),
            ttnn.from_device(output_tensor_res2),
            ttnn.from_device(output_tensor_res3),
        )
ttnn.Conv2d( + # in_channels=1024, + # out_channels=512, + # kernel_size=(1, 1), + # stride=(1, 1), + # padding=(0, 0), + # dtype=ttnn.bfloat8_b, + # device=device, + # use_1d_systolic_array=True, + # batch_size=1, + # input_height=10, + # input_width=10, + # reader_patterns_cache={}, + # weight=parameters.c3.weight, + # # bias=parameters.c3.bias, + # math_fidelity=ttnn.MathFidelity.LoFi, + # weights_dtype=ttnn.bfloat8_b, + # use_shallow_conv_variant=False, + # deallocate_activation=True, + # # padded_input_channels=32, + # activation="relu", + # conv_blocking_and_parallelization_config_override=None, + # # compute_kernel_config=compute_kernel_config, + # ) + + self.max_pool_reader_patterns_cache = {} + max_pool_parallel_config_override = {} + + max_pool_parallel_config_override["grid_size"] = self.c3.conv.grid_size + max_pool_parallel_config_override["num_cores_nhw"] = self.c3.conv.sliding_window_op_params.num_cores_nhw + print(max_pool_parallel_config_override) + print(max_pool_parallel_config_override["num_cores_nhw"]) + + # self.p1 = tt_lib.fallback_ops.MaxPool2d( + # kernel_size=(5, 5), + # stride=(1, 1), + # padding=(2, 2), + # dilation=(1, 1), + # channels_last=True + # ) + # self.p2 = tt_lib.fallback_ops.MaxPool2d( + # kernel_size=(9, 9), + # stride=(1, 1), + # padding=(4, 4), + # dilation=(1, 1), + # channels_last=True + # ) + # self.p3 = tt_lib.fallback_ops.MaxPool2d( + # kernel_size=(13, 13), + # stride=(1, 1), + # padding=(6, 6), + # dilation=(1, 1), + # channels_last=True + # ) + + self.p1 = nn.MaxPool2d(kernel_size=5, stride=1, padding=2, dilation=1, ceil_mode=False) + self.p2 = nn.MaxPool2d(kernel_size=9, stride=1, padding=4, dilation=1, ceil_mode=False) + self.p3 = nn.MaxPool2d(kernel_size=13, stride=1, padding=6, dilation=1, ceil_mode=False) + + # self.p1 = ttnn.MaxPool2d( + # kernel_size=(5, 5), + # stride=(1, 1), + # padding=(2, 2), + # dilation=(1, 1), + # dtype=ttnn.bfloat16, + # device=self.device, + # batch_size=1, + # input_height=10, + # 
input_width=10, + # reader_patterns_cache=self.max_pool_reader_patterns_cache, + # deallocate_activation=True, + # # parallel_config_override=max_pool_parallel_config_override, + # channels=512, + # ) + # self.p2 = ttnn.MaxPool2d( + # kernel_size=(9, 9), + # stride=(1, 1), + # padding=(4, 4), + # dilation=(1, 1), + # dtype=ttnn.bfloat16, + # device=self.device, + # batch_size=1, + # input_height=10, + # input_width=10, + # reader_patterns_cache=self.max_pool_reader_patterns_cache, + # deallocate_activation=True, + # # parallel_config_override=max_pool_parallel_config_override, + # channels=512, + # ) + # self.p3 = ttnn.MaxPool2d( + # kernel_size=(13, 13), + # stride=(1, 1), + # padding=(6, 6), + # dilation=(1, 1), + # dtype=ttnn.bfloat16, + # device=self.device, + # batch_size=1, + # input_height=10, + # input_width=10, + # reader_patterns_cache=self.max_pool_reader_patterns_cache, + # deallocate_activation=True, + # # parallel_config_override=max_pool_parallel_config_override, + # channels=512, + # ) + # + def __call__(self, device, input_tensors): + input_tensor0 = input_tensors[0].to(device, self.c1.conv.input_sharded_memory_config) + + ####### + + # # 3 CBN blocks + # x1 = self.c1(input_tensor) + # x1_b = self.b1(x1) + # x1_m = self.relu(x1_b) + # + # x2 = self.c2(x1_m) + # x2_b = self.b2(x2) + # x2_m = self.relu(x2_b) + # + # x3 = self.c3(x2_m) + # x3_b = self.b3(x3) + # x3_m = self.relu(x3_b) + # + # # maxpools + # x4 = self.p1(x3_m) + # x5 = self.p2(x3_m) + # x6 = self.p3(x3_m) + # + # # concat the outputs of maxpool and x3_m + # conc1 = torch.cat([x4, x5, x6, x3_m], dim=1) + # + # # 4 back2back CBRs + # # CBR4-1 + # x7 = self.c4(conc1) + # x7_b = self.b4(x7) + # x7_m = self.relu(x7_b) + # + # # CBR4-2 + # x8 = self.c5(x7_m) + # x8_b = self.b5(x8) + # x8_m = self.relu(x8_b) + # + # # CBR4-3 + # x9 = self.c6(x8_m) + # x9_b = self.b6(x9) + # x9_m = self.relu(x9_b) + # + # # CBR4-4 + # x10 = self.c7(x9_m) + # x10_b = self.b7(x10) + # x10_m = self.relu(x10_b) + 
# + # # upsample + # u1 = self.u(x10_m) + # + # # Next CBR block to be concatinated with output of u1 + # # gets the output of downsample4 module which is dimensions: [1, 512, 20, 20] - make a random tensor with that shape for the purpose of running the neck unit test stand-alone + # outDownSample4 = torch.rand([1, 512, 20, 20]) + # # CBR block for conc2 + # x11 = self.c7(outDownSample4) + # x11_b = self.b7(x11) + # x11_m = self.relu(x11_b) + # + # # concat CBR output with output from u1 + # conc2 = torch.cat([u1, x11_m], dim=1) + # + # # 6 back2back CBRs + # # CBR6_1 + # x12 = self.c7(conc2) + # x12_b = self.b7(x12) + # x12_m = self.relu(x12_b) + # + # # CBR6_2 + # x13 = self.c8(x12_m) + # x13_b = self.b8(x13) + # x13_m = self.relu(x13_b) + # + # # CBR6_3 + # x14 = self.c7(x13_m) + # x14_b = self.b7(x14) + # x14_m = self.relu(x14_b) + # + # # CBR6_4 + # x15 = self.c8(x14_m) + # x15_b = self.b8(x15) + # x15_m = self.relu(x15_b) + # + # # CBR6_5 + # x16 = self.c7(x15_m) + # x16_b = self.b7(x16) + # x16_m = self.relu(x16_b) + # + # # CBR6_6 + # x17 = self.c9(x16_m) + # x17_b = self.b9(x17) + # x17_m = self.relu(x17_b) + # + # # upsample + # u2 = self.u(x17_m) + # + # # CBR block for conc3 + # outDownSample3 = torch.rand([1, 256, 40, 40]) + # x18 = self.c9(outDownSample3) + # x18_b = self.b9(x18) + # x18_m = self.relu(x18_b) + # + # # concat CBR output with output from u2 + # conc3 = torch.cat([u2, x18_m], dim=1) + # + # # 5 CBR blocks + # # CBR5_1 + # x19 = self.c9(conc3) + # x19_b = self.b9(x19) + # x19_m = self.relu(x19_b) + # + # # CBR5_2 + # x20 = self.c10(x19_m) + # x20_b = self.b10(x20) + # x20_m = self.relu(x20_b) + # + # # CBR5_3 + # x21 = self.c9(x20_m) + # x21_b = self.b9(x21) + # x21_m = self.relu(x21_b) + # + # # CBR5_4 + # x22 = self.c10(x21_m) + # x22_b = self.b10(x22) + # x22_m = self.relu(x22_b) + # + # # CBR5_5 + # x23 = self.c9(x22_m) + # x23_b = self.b9(x23) + # x23_m = self.relu(x23_b) + # + # return x23_m, x9_m, x16_m + # + # ####### + 
output_tensor = self.c1(input_tensor0) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c2.conv.input_sharded_memory_config) + + output_tensor = self.c2(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c3.conv.input_sharded_memory_config) + + output_tensor = self.c3(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c7.conv.input_sharded_memory_config) + + output_tensorc3 = output_tensor + + output_tensorc3 = tt_lib.tensor.sharded_to_interleaved(output_tensorc3, ttnn.L1_MEMORY_CONFIG) + # output_tensorc3 = ttnn.to_layout(output_tensorc3, layout=ttnn.TILE_LAYOUT) + custom_sharded_memory_config = ttnn.experimental.tensor.MemoryConfig( + memory_layout=ttnn.experimental.tensor.TensorMemoryLayout.HEIGHT_SHARDED, + buffer_type=ttnn.experimental.tensor.BufferType.L1, + ) + # output_tensorc3 = tt_lib.tensor.interleaved_to_sharded(output_tensorc3, self.p1.max_pool.input_sharded_memory_config) + # ouptut_tensorc3=ttnn.to_memory_config(output_tensorc3, self.p1.max_pool.input_sharded_memory_config) + # input_tensor.to(device, mem_config = custom_sharded_memory_config) + # output_tensorc3 = output_tensorc3.to(device, self.p1.max_pool.input_sharded_memory_config) + # input_tensor = ttnn.to_layout(input_tensor, layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED) + + 
# reproduces maxpool padding error + output_tensorc3 = ttnn.to_layout(output_tensorc3, ttnn.ROW_MAJOR_LAYOUT) + # output_tensorc3 = tt_lib.tensor.interleaved_to_sharded( + # output_tensorc3, self.p1.max_pool.input_sharded_memory_config + # ) + print("C3 sharding: ", self.c3.conv.input_sharded_memory_config) + # print("P1 sharding: ", self.p1.max_pool.output_sharded_memory_config) + # input_tensor.memory_config().memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED + print("Input sharding: ", output_tensorc3.memory_config().memory_layout) + # return [output_tensorc3, output_tensorc3, output_tensorc3] + + output_tensorc3 = ttnn.from_device(output_tensorc3) + output_tensorc3 = ttnn.to_torch(output_tensorc3) + output_tensorc3 = torch.reshape(output_tensorc3, (1, 10, 10, 512)) + output_tensorc3 = torch.permute(output_tensorc3, (0, 3, 1, 2)) + # print("p1 inp: ",output_tensorc3.shape) + + # output_tensorc3 = ttnn.reshape(output_tensorc3, (1, 10, 10, 512)) + # output_tensorc3 = ttnn.to_torch(output_tensorc3) + # output_tensorc3 = torch.permute(output_tensorc3, (0, 3, 1, 2)) + # from models.utility_functions import torch_to_tt_tensor_rm + # output_tensorc3 = torch_to_tt_tensor_rm(output_tensorc3, device, put_on_device=True) + output_tensor = self.p1(output_tensorc3) + output_tensorp1 = output_tensor + output_tensor = self.p2(output_tensorc3) + + output_tensorp2 = output_tensor + output_tensor = self.p3(output_tensorc3) + + output_tensorp3 = output_tensor + output_tensorp1 = torch.reshape(output_tensorp1, (1, 512, 1, 100)) + output_tensorp2 = torch.reshape(output_tensorp2, (1, 512, 1, 100)) + output_tensorp3 = torch.reshape(output_tensorp3, (1, 512, 1, 100)) + output_tensorc3 = torch.reshape(output_tensorc3, (1, 512, 1, 100)) + output_tensorp1 = torch.permute(output_tensorp1, (0, 2, 3, 1)) + output_tensorp2 = torch.permute(output_tensorp2, (0, 2, 3, 1)) + output_tensorp3 = torch.permute(output_tensorp3, (0, 2, 3, 1)) + output_tensorc3 = torch.permute(output_tensorc3, (0, 2, 
3, 1)) + + output_tensorp1 = ttnn.from_torch(output_tensorp1, dtype=ttnn.bfloat8_b, layout=ttnn.TILE_LAYOUT) + output_tensorp2 = ttnn.from_torch(output_tensorp2, dtype=ttnn.bfloat8_b, layout=ttnn.TILE_LAYOUT) + output_tensorp3 = ttnn.from_torch(output_tensorp3, dtype=ttnn.bfloat8_b, layout=ttnn.TILE_LAYOUT) + output_tensorc3 = ttnn.from_torch(output_tensorc3, dtype=ttnn.bfloat8_b, layout=ttnn.TILE_LAYOUT) + output_tensorp1 = output_tensorp1.to(device) + output_tensorp2 = output_tensorp2.to(device) + output_tensorp3 = output_tensorp3.to(device) + output_tensorc3 = output_tensorc3.to(device) + # output_tensorp1 = tt_lib.tensor.sharded_to_interleaved(output_tensorp1, ttnn.L1_MEMORY_CONFIG) + # output_tensorp1 = ttnn.to_layout(output_tensorp1, layout=ttnn.TILE_LAYOUT) + output_tensor = ttnn.concat([output_tensorp1, output_tensorp2, output_tensorp3, output_tensorc3], dim=3) + # output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT) + output_tensor = output_tensor.to(device, self.c4.conv.input_sharded_memory_config) + # print("DEBUG:", output_tensor.memory_config()) + output_tensor = self.c4(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c5.conv.input_sharded_memory_config) + + + output_tensor = self.c5(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c6.conv.input_sharded_memory_config) + + + output_tensor = self.c6(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = 
ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c7.conv.input_sharded_memory_config) + + + output_tensor_9m = output_tensor + output_tensor = self.c7(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + + # output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG) + output_tensor = ttnn.to_layout(output_tensor, ttnn.ROW_MAJOR_LAYOUT) + output_tensor = ttnn.upsample(output_tensor, (1, 4, 1), memory_config=output_tensor.memory_config()) + + # TODO add ttnn tensor here for testing + # input_shape = torch_input_tensor.shape + # input_tensor = torch.permute(torch_input_tensor, (0, 2, 3, 1)) + # + # input_tensor = input_tensor.reshape( + # input_tensor.shape[0], 1, input_tensor.shape[1] * input_tensor.shape[2], input_tensor.shape[3] + # ) + + outDownSample4 = input_tensors[1].to(device, self.c7_2.conv.input_sharded_memory_config) + # CBR block for conc2 + outDownSample4_c7 = self.c7_2(outDownSample4) + outDownSample4_c7 = ttnn.to_torch(outDownSample4_c7) + outDownSample4_c7= self.leaky_relu(outDownSample4_c7) + outDownSample4_c7 = ttnn.from_torch(outDownSample4_c7, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + # outDownSample4_b7 = self.b7(outDownSample4_c7) + # outDownSample4_r7 = self.relu(outDownSample4_b7) + # + # output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG) + # output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.TILE_LAYOUT) + # outDownSample4_c7 = tt_lib.tensor.sharded_to_interleaved(outDownSample4_c7, ttnn.L1_MEMORY_CONFIG) + outDownSample4_c7 = ttnn.to_layout(outDownSample4_c7, layout=ttnn.TILE_LAYOUT) + print(outDownSample4_c7.memory_config()) + 
print(output_tensor.memory_config()) + output_tensor = ttnn.concat([outDownSample4_c7, output_tensor], dim=3) + + output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c7_3.conv.input_sharded_memory_config) + output_tensor = self.c7_3(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c8.conv.input_sharded_memory_config) + + output_tensor = self.c8(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c7_4.conv.input_sharded_memory_config) + + + output_tensor = self.c7_4(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c8_2.conv.input_sharded_memory_config) + + output_tensor = self.c8_2(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c7_5.conv.input_sharded_memory_config) + + + output_tensor = self.c7_5(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = 
ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c9.conv.input_sharded_memory_config) + + + output_tensor_16m = output_tensor + output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG) + output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT) + output_tensor = output_tensor.to(device, self.c9.conv.input_sharded_memory_config) + + output_tensor = self.c9(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c9_2.conv.input_sharded_memory_config) + + + output_tensor = tt_lib.tensor.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG) + output_tensor = ttnn.to_layout(output_tensor, ttnn.ROW_MAJOR_LAYOUT) + output_tensor = ttnn.upsample(output_tensor, (1, 4, 1), memory_config=output_tensor.memory_config()) + + # output_tensor = self.u(output_tensor) + # # CBR block for conc3 + # # TODO add ttnn random tensor here + outDownSample3 = input_tensors[2].to(device, self.c9_2.conv.input_sharded_memory_config) + outDownSample3_c9 = self.c9_2(outDownSample3) + outDownSample3_c9 = ttnn.to_torch(outDownSample3_c9) + outDownSample3_c9= self.leaky_relu(outDownSample3_c9) + outDownSample3_c9 = ttnn.from_torch(outDownSample3_c9, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + # outDownSample3_b9 = self.b9(outDownSample3_c9) + # outDownSample3_r9 = self.relu(outDownSample3_b9) + output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT) + output_tensor = ttnn.concat([output_tensor, outDownSample3_c9], dim=3) + output_tensor = output_tensor.to(device, self.c9_3.conv.input_sharded_memory_config) + output_tensor = self.c9_3(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) 
+ output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + + output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c10.conv.input_sharded_memory_config) + output_tensor = self.c10(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + + output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c9_4.conv.input_sharded_memory_config) + output_tensor = self.c9_4(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + + output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c10_2.conv.input_sharded_memory_config) + output_tensor = self.c10_2(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + + output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT) + output_tensor = tt_lib.tensor.interleaved_to_sharded(output_tensor, self.c9_5.conv.input_sharded_memory_config) + output_tensor = self.c9_5(output_tensor) + output_tensor = ttnn.to_torch(output_tensor) + output_tensor= self.leaky_relu(output_tensor) + output_tensor = ttnn.from_torch(output_tensor, device=device ,dtype = ttnn.bfloat16, layout = ttnn.TILE_LAYOUT) + return ttnn.from_device(output_tensor), ttnn.from_device(output_tensor_9m), ttnn.from_device(output_tensor_16m) diff --git 
a/models/experimental/functional_yolov4/tt/ttnn_resblock.py b/models/experimental/functional_yolov4/tt/ttnn_resblock.py index 1ff61facfb0..cbf89920a12 100644 --- a/models/experimental/functional_yolov4/tt/ttnn_resblock.py +++ b/models/experimental/functional_yolov4/tt/ttnn_resblock.py @@ -2,7 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 + import ttnn +import tt_lib class TtResBlock: @@ -17,10 +19,29 @@ def __init__(self, parameters, nblocks, shortcut) -> None: self.module_list.append(resblock_one) def __call__(self, device, input_tensor): - input_tensor = input_tensor.to(device, self.module_list[0][0].conv.input_sharded_memory_config) + input_tensor = tt_lib.tensor.sharded_to_interleaved(input_tensor, ttnn.L1_MEMORY_CONFIG) + input_tensor = ttnn.to_layout(input_tensor, layout=ttnn.TILE_LAYOUT) for i in range(self.nblocks): output_tensor_h = input_tensor + output_tensor_h = output_tensor_h.to(device, self.module_list[i][0].conv.input_sharded_memory_config) output_tensor_1 = self.module_list[i][0](output_tensor_h) + output_tensor_1 = ttnn.to_torch(output_tensor_1) + output_tensor_1 = ttnn.from_torch( + output_tensor_1, dtype=ttnn.bfloat16, device=device, layout=ttnn.TILE_LAYOUT + ) + output_tensor_1 = ttnn.mish(output_tensor_1) + output_tensor_1 = tt_lib.tensor.interleaved_to_sharded( + output_tensor_1, self.module_list[i][1].conv.input_sharded_memory_config + ) + output_tensor_h = self.module_list[i][1](output_tensor_1) - input_tensor = ttnn.add(input_tensor, output_tensor_h) if self.shortcut else output_tensor_h + output_tensor_h = ttnn.to_torch(output_tensor_h) + output_tensor_h = ttnn.from_torch( + output_tensor_h, dtype=ttnn.bfloat16, device=device, layout=ttnn.TILE_LAYOUT + ) + output_tensor_h = ttnn.mish(output_tensor_h) + + output_tensor_h = ttnn.to_layout(output_tensor_h, layout=ttnn.TILE_LAYOUT) + + input_tensor = (input_tensor + output_tensor_h) if self.shortcut else output_tensor_h return ttnn.from_device(input_tensor) diff --git 
class TtYolov4:
    """TTNN YOLOv4: five CSPDarknet downsample stages feeding the neck and head."""

    def __init__(self, device, parameters) -> None:
        # Build the five backbone stages; each consumes only its own
        # parameter sub-tree from the preprocessed `parameters` dict.
        stage_classes = (TtDownSample1, TtDownSample2, TtDownSample3, TtDownSample4, TtDownSample5)
        for stage_idx, stage_cls in enumerate(stage_classes, start=1):
            setattr(self, f"downsample{stage_idx}", stage_cls(parameters[f"downsample{stage_idx}"]))
        self.neck = TtNeck(device, parameters["neck"])
        self.head = TtHead(device, parameters["head"])

    def __call__(self, device, input_tensor):
        """Run backbone -> neck -> head; returns the head's three outputs."""
        d1 = self.downsample1(device, input_tensor)
        d2 = self.downsample2(device, d1)
        d3 = self.downsample3(device, d2)
        d4 = self.downsample4(device, d3)
        d5 = self.downsample5(device, d4)
        # Neck consumes the three deepest feature maps (deepest first) and
        # yields the three pyramid levels the head expects, in order.
        neck_outputs = self.neck(device, [d5, d4, d3])
        return self.head(device, list(neck_outputs))
# --- tests/ttnn/integration_tests/yolov4/custom_preprocessor_d1.py ---
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
# SPDX-License-Identifier: Apache-2.0

import ttnn
import tt_lib
from ttnn.model_preprocessing import preprocess_conv2d, fold_batch_norm2d_into_conv2d

from models.experimental.functional_yolov4.reference.downsample1 import DownSample1


def update_ttnn_module_args(ttnn_module_args):
    """Pick the systolic-array layout from the conv's input channel count."""
    ttnn_module_args["use_1d_systolic_array"] = ttnn_module_args.in_channels < 256


def custom_preprocessor(device, model, name, ttnn_module_args):
    """Fold every DownSample1 conv+bn pair and preprocess it for TTNN.

    Returns a dict mapping "c1".."c8" to preprocessed conv parameters.

    The original hand-unrolled this identical configuration eight times
    (with comments claiming a fused relu on convs whose activation was in
    fact None); the per-conv differences are limited to the two cases
    marked below.
    """
    parameters = {}
    if isinstance(model, DownSample1):
        use_shallow = device.arch() != tt_lib.device.Arch.WORMHOLE_B0
        for idx in range(1, 9):
            conv_args = ttnn_module_args[f"c{idx}"]
            conv_args["math_fidelity"] = ttnn.MathFidelity.LoFi
            conv_args["dtype"] = ttnn.bfloat8_b
            conv_args["weights_dtype"] = ttnn.bfloat8_b
            # Only c4 fuses a relu into the conv; every other conv runs its
            # activation outside the conv op.
            conv_args["activation"] = "relu" if idx == 4 else None
            conv_args["deallocate_activation"] = True
            conv_args["conv_blocking_and_parallelization_config_override"] = None
            if idx != 1:
                # c1 keeps the framework default for the shallow-conv variant.
                conv_args["use_shallow_conv_variant"] = use_shallow
            weight, bias = fold_batch_norm2d_into_conv2d(
                getattr(model, f"c{idx}"), getattr(model, f"b{idx}")
            )
            update_ttnn_module_args(conv_args)
            parameters[f"c{idx}"], _ = preprocess_conv2d(
                weight, bias, conv_args, return_parallel_config=True
            )
    return parameters
# --- tests/ttnn/integration_tests/yolov4/custom_preprocessor_d2.py ---
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
# SPDX-License-Identifier: Apache-2.0

import ttnn
import tt_lib
from ttnn.model_preprocessing import preprocess_conv2d, fold_batch_norm2d_into_conv2d

from models.experimental.functional_yolov4.reference.downsample2 import DownSample2


def update_ttnn_module_args(ttnn_module_args):
    """Apply the conv settings shared by every DownSample2 conv.

    Note: this overwrites weights_dtype, so per-conv weights_dtype
    assignments made before calling it (as the original code did) are
    dead stores and were removed.
    """
    ttnn_module_args["use_1d_systolic_array"] = ttnn_module_args.in_channels <= 256
    ttnn_module_args["dtype"] = ttnn.bfloat8_b
    ttnn_module_args["weights_dtype"] = ttnn.bfloat8_b
    ttnn_module_args["deallocate_activation"] = True
    ttnn_module_args["conv_blocking_and_parallelization_config_override"] = None
    ttnn_module_args["activation"] = None


def _fold_and_preprocess(conv, bn, conv_args):
    """Fold bn into conv, apply the shared settings and preprocess for TTNN."""
    weight, bias = fold_batch_norm2d_into_conv2d(conv, bn)
    update_ttnn_module_args(conv_args)
    return preprocess_conv2d(weight, bias, conv_args, return_parallel_config=True)


def _preprocess_resblocks(res, res_args):
    """Preprocess both convs of every residual block in ``res.module_list``."""
    parameters = {}
    for i, block in enumerate(res.module_list):
        for key, template, conv, bn in (
            (f"resblock_{i}_conv1", "0", block[0], block[1]),
            (f"resblock_{i}_conv2", "3", block[3], block[4]),
        ):
            # NOTE(review): like the original, this aliases the same template
            # args dict ("0"/"3") for every resblock iteration — confirm
            # preprocess_conv2d does not mutate it between iterations.
            res_args[key] = res_args[template]
            parameters[key], _ = _fold_and_preprocess(conv, bn, res_args[key])
    return parameters


def custom_preprocessor(device, model, name, ttnn_module_args):
    """Preprocess all DownSample2 convs; returns {"c1".."c5", "res": {...}}."""
    parameters = {}
    if isinstance(model, DownSample2):
        use_shallow = device.arch() != tt_lib.device.Arch.WORMHOLE_B0
        # c1 keeps the framework default for use_shallow_conv_variant.
        parameters["c1"], _ = _fold_and_preprocess(model.c1, model.b1, ttnn_module_args.c1)
        for idx in (2, 3):
            ttnn_module_args[f"c{idx}"]["use_shallow_conv_variant"] = use_shallow
            parameters[f"c{idx}"], _ = _fold_and_preprocess(
                getattr(model, f"c{idx}"), getattr(model, f"b{idx}"), ttnn_module_args[f"c{idx}"]
            )
        parameters["res"] = _preprocess_resblocks(model.res, ttnn_module_args["res"])
        for idx in (4, 5):
            ttnn_module_args[f"c{idx}"]["use_shallow_conv_variant"] = use_shallow
            parameters[f"c{idx}"], _ = _fold_and_preprocess(
                getattr(model, f"c{idx}"), getattr(model, f"b{idx}"), ttnn_module_args[f"c{idx}"]
            )
    return parameters


# --- tests/ttnn/integration_tests/yolov4/custom_preprocessor_d3.py ---
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
# SPDX-License-Identifier: Apache-2.0

import ttnn
from ttnn.model_preprocessing import preprocess_conv2d, fold_batch_norm2d_into_conv2d

from models.experimental.functional_yolov4.reference.downsample3 import DownSample3


def update_ttnn_module_args(ttnn_module_args):
    """Apply the conv settings shared by every DownSample3 conv."""
    ttnn_module_args["use_1d_systolic_array"] = ttnn_module_args.in_channels <= 256
    ttnn_module_args["math_fidelity"] = ttnn.MathFidelity.LoFi
    ttnn_module_args["dtype"] = ttnn.bfloat8_b
    ttnn_module_args["weights_dtype"] = ttnn.bfloat8_b
    ttnn_module_args["deallocate_activation"] = True
    ttnn_module_args["conv_blocking_and_parallelization_config_override"] = None
    ttnn_module_args["activation"] = None


def _fold_and_preprocess(conv, bn, conv_args):
    """Fold bn into conv, apply the shared settings and preprocess for TTNN."""
    weight, bias = fold_batch_norm2d_into_conv2d(conv, bn)
    update_ttnn_module_args(conv_args)
    return preprocess_conv2d(weight, bias, conv_args, return_parallel_config=True)


def _preprocess_resblocks(res, res_args):
    """Preprocess both convs of every residual block in ``res.module_list``."""
    parameters = {}
    for i, block in enumerate(res.module_list):
        for key, template, conv, bn in (
            (f"resblock_{i}_conv1", "0", block[0], block[1]),
            (f"resblock_{i}_conv2", "3", block[3], block[4]),
        ):
            # NOTE(review): same template-dict aliasing as the original —
            # confirm preprocess_conv2d does not mutate it between iterations.
            res_args[key] = res_args[template]
            parameters[key], _ = _fold_and_preprocess(conv, bn, res_args[key])
    return parameters


def custom_preprocessor(device, model, name, ttnn_module_args):
    """Preprocess all DownSample3 convs; returns {"c1".."c5", "res": {...}}."""
    parameters = {}
    if isinstance(model, DownSample3):
        # c1 keeps the framework default for use_shallow_conv_variant; the
        # others disable it unconditionally (unlike d2, no arch check here).
        parameters["c1"], _ = _fold_and_preprocess(model.c1, model.b1, ttnn_module_args.c1)
        for idx in (2, 3):
            ttnn_module_args[f"c{idx}"]["use_shallow_conv_variant"] = False
            parameters[f"c{idx}"], _ = _fold_and_preprocess(
                getattr(model, f"c{idx}"), getattr(model, f"b{idx}"), ttnn_module_args[f"c{idx}"]
            )
        parameters["res"] = _preprocess_resblocks(model.res, ttnn_module_args["res"])
        for idx in (4, 5):
            ttnn_module_args[f"c{idx}"]["use_shallow_conv_variant"] = False
            parameters[f"c{idx}"], _ = _fold_and_preprocess(
                getattr(model, f"c{idx}"), getattr(model, f"b{idx}"), ttnn_module_args[f"c{idx}"]
            )
    return parameters
# --- tests/ttnn/integration_tests/yolov4/custom_preprocessor_d4.py ---
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
# SPDX-License-Identifier: Apache-2.0

import ttnn
from ttnn.model_preprocessing import preprocess_conv2d, fold_batch_norm2d_into_conv2d

from models.experimental.functional_yolov4.reference.downsample4 import DownSample4


def update_ttnn_module_args(ttnn_module_args):
    """Apply the conv settings shared by every DownSample4 conv."""
    ttnn_module_args["use_1d_systolic_array"] = True  # ttnn_module_args.in_channels <= 256
    ttnn_module_args["math_fidelity"] = ttnn.MathFidelity.LoFi
    ttnn_module_args["dtype"] = ttnn.bfloat8_b
    ttnn_module_args["weights_dtype"] = ttnn.bfloat8_b
    ttnn_module_args["deallocate_activation"] = True
    ttnn_module_args["conv_blocking_and_parallelization_config_override"] = None
    ttnn_module_args["activation"] = None


def _fold_and_preprocess(conv, bn, conv_args):
    """Fold bn into conv, apply the shared settings and preprocess for TTNN.

    The original set weights_dtype per conv immediately before
    update_ttnn_module_args overwrote it with the same value; those dead
    stores were removed.
    """
    weight, bias = fold_batch_norm2d_into_conv2d(conv, bn)
    update_ttnn_module_args(conv_args)
    return preprocess_conv2d(weight, bias, conv_args, return_parallel_config=True)


def _preprocess_resblocks(res, res_args):
    """Preprocess both convs of every residual block in ``res.module_list``."""
    parameters = {}
    for i, block in enumerate(res.module_list):
        for key, template, conv, bn in (
            (f"resblock_{i}_conv1", "0", block[0], block[1]),
            (f"resblock_{i}_conv2", "3", block[3], block[4]),
        ):
            # NOTE(review): same template-dict aliasing as the original —
            # confirm preprocess_conv2d does not mutate it between iterations.
            res_args[key] = res_args[template]
            parameters[key], _ = _fold_and_preprocess(conv, bn, res_args[key])
    return parameters


def custom_preprocessor(device, model, name, ttnn_module_args):
    """Preprocess all DownSample4 convs; returns {"c1".."c5", "res": {...}}."""
    parameters = {}
    if isinstance(model, DownSample4):
        for idx in (1, 2, 3):
            ttnn_module_args[f"c{idx}"]["use_shallow_conv_variant"] = False
            parameters[f"c{idx}"], _ = _fold_and_preprocess(
                getattr(model, f"c{idx}"), getattr(model, f"b{idx}"), ttnn_module_args[f"c{idx}"]
            )
        parameters["res"] = _preprocess_resblocks(model.res, ttnn_module_args["res"])
        for idx in (4, 5):
            ttnn_module_args[f"c{idx}"]["use_shallow_conv_variant"] = False
            parameters[f"c{idx}"], _ = _fold_and_preprocess(
                getattr(model, f"c{idx}"), getattr(model, f"b{idx}"), ttnn_module_args[f"c{idx}"]
            )
    return parameters


# --- tests/ttnn/integration_tests/yolov4/custom_preprocessor_d5.py ---
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
# SPDX-License-Identifier: Apache-2.0

import ttnn
from ttnn.model_preprocessing import preprocess_conv2d, fold_batch_norm2d_into_conv2d

from models.experimental.functional_yolov4.reference.downsample5 import DownSample5


def update_ttnn_module_args(ttnn_module_args):
    """Apply the conv settings shared by every DownSample5 conv."""
    ttnn_module_args["use_1d_systolic_array"] = False  # ttnn_module_args.in_channels <= 256
    ttnn_module_args["math_fidelity"] = ttnn.MathFidelity.LoFi
    ttnn_module_args["dtype"] = ttnn.bfloat8_b
    ttnn_module_args["weights_dtype"] = ttnn.bfloat8_b
    ttnn_module_args["deallocate_activation"] = True
    ttnn_module_args["conv_blocking_and_parallelization_config_override"] = None
    ttnn_module_args["activation"] = None


def _fold_and_preprocess(conv, bn, conv_args):
    """Fold bn into conv, apply the shared settings and preprocess for TTNN."""
    weight, bias = fold_batch_norm2d_into_conv2d(conv, bn)
    update_ttnn_module_args(conv_args)
    return preprocess_conv2d(weight, bias, conv_args, return_parallel_config=True)


def _preprocess_resblocks(res, res_args):
    """Preprocess both convs of every residual block in ``res.module_list``."""
    parameters = {}
    for i, block in enumerate(res.module_list):
        for key, template, conv, bn in (
            (f"resblock_{i}_conv1", "0", block[0], block[1]),
            (f"resblock_{i}_conv2", "3", block[3], block[4]),
        ):
            # NOTE(review): same template-dict aliasing as the original —
            # confirm preprocess_conv2d does not mutate it between iterations.
            res_args[key] = res_args[template]
            parameters[key], _ = _fold_and_preprocess(conv, bn, res_args[key])
    return parameters


def custom_preprocessor(device, model, name, ttnn_module_args):
    """Preprocess all DownSample5 convs; returns {"c1".."c5", "res": {...}}."""
    parameters = {}
    if isinstance(model, DownSample5):
        for idx in (1, 2, 3):
            ttnn_module_args[f"c{idx}"]["use_shallow_conv_variant"] = False
            parameters[f"c{idx}"], _ = _fold_and_preprocess(
                getattr(model, f"c{idx}"), getattr(model, f"b{idx}"), ttnn_module_args[f"c{idx}"]
            )
        parameters["res"] = _preprocess_resblocks(model.res, ttnn_module_args["res"])
        for idx in (4, 5):
            ttnn_module_args[f"c{idx}"]["use_shallow_conv_variant"] = False
            parameters[f"c{idx}"], _ = _fold_and_preprocess(
                getattr(model, f"c{idx}"), getattr(model, f"b{idx}"), ttnn_module_args[f"c{idx}"]
            )
    return parameters
tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c1["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c1["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c1["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c1["deallocate_activation"] = False + ttnn_module_args.c1["conv_blocking_and_parallelization_config_override"] = None + conv1_weight, conv1_bias = fold_batch_norm2d_into_conv2d(model.c1, model.b1) + update_ttnn_module_args(ttnn_module_args.c1) + parameters["c1"], c1_parallel_config = preprocess_conv2d( + conv1_weight, conv1_bias, ttnn_module_args.c1, return_parallel_config=True + ) + + conv2_weight = model.c2.weight.detach() + conv2_bias = model.c2.bias + parameters["c2"] = {} + parameters["c2"]["weight"] = conv2_weight + parameters["c2"]["bias"] = conv2_bias + + ttnn_module_args.c3["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c3["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c3["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c3["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c3["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c3["deallocate_activation"] = True + ttnn_module_args.c3["conv_blocking_and_parallelization_config_override"] = None + + conv3_weight, conv3_bias = fold_batch_norm2d_into_conv2d(model.c3, model.b3) + update_ttnn_module_args(ttnn_module_args.c3) + parameters["c3"], c3_parallel_config = preprocess_conv2d( + conv3_weight, conv3_bias, ttnn_module_args.c3, return_parallel_config=True + ) + + ttnn_module_args.c4["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c4["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c4["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c4["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c4["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c4["deallocate_activation"] = True + 
ttnn_module_args.c4["conv_blocking_and_parallelization_config_override"] = None + + conv4_weight, conv4_bias = fold_batch_norm2d_into_conv2d(model.c4, model.b4) + update_ttnn_module_args(ttnn_module_args.c4) + parameters["c4"], c4_parallel_config = preprocess_conv2d( + conv4_weight, conv4_bias, ttnn_module_args.c4, return_parallel_config=True + ) + + ttnn_module_args.c5["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c5["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c5["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c5["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c5["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c5["deallocate_activation"] = True + ttnn_module_args.c5["conv_blocking_and_parallelization_config_override"] = None + + conv5_weight, conv5_bias = fold_batch_norm2d_into_conv2d(model.c5, model.b5) + update_ttnn_module_args(ttnn_module_args.c5) + parameters["c5"], c5_parallel_config = preprocess_conv2d( + conv5_weight, conv5_bias, ttnn_module_args.c5, return_parallel_config=True + ) + + ttnn_module_args.c6["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c6["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c6["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c6["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c6["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c6["deallocate_activation"] = True + ttnn_module_args.c6["conv_blocking_and_parallelization_config_override"] = None + + conv6_weight, conv6_bias = fold_batch_norm2d_into_conv2d(model.c6, model.b6) + update_ttnn_module_args(ttnn_module_args.c6) + parameters["c6"], c6_parallel_config = preprocess_conv2d( + conv6_weight, conv6_bias, ttnn_module_args.c6, return_parallel_config=True + ) + + ttnn_module_args.c7["math_fidelity"] = ttnn.MathFidelity.LoFi + 
ttnn_module_args.c7["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c7["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c7["deallocate_activation"] = True + ttnn_module_args.c7["conv_blocking_and_parallelization_config_override"] = None + + conv7_weight, conv7_bias = fold_batch_norm2d_into_conv2d(model.c7, model.b7) + update_ttnn_module_args(ttnn_module_args.c7) + parameters["c7"], c7_parallel_config = preprocess_conv2d( + conv7_weight, conv7_bias, ttnn_module_args.c7, return_parallel_config=True + ) + + ttnn_module_args.c8["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c8["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c8["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c8["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c8["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c8["deallocate_activation"] = False + ttnn_module_args.c8["conv_blocking_and_parallelization_config_override"] = None + + conv8_weight, conv8_bias = fold_batch_norm2d_into_conv2d(model.c8, model.b8) + update_ttnn_module_args(ttnn_module_args.c8) + parameters["c8"], c8_parallel_config = preprocess_conv2d( + conv8_weight, conv8_bias, ttnn_module_args.c8, return_parallel_config=True + ) + + ttnn_module_args.c9["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c9["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c9["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c9["deallocate_activation"] = False + ttnn_module_args.c9["conv_blocking_and_parallelization_config_override"] = None + + conv9_weight, conv9_bias = 
fold_batch_norm2d_into_conv2d(model.c9, model.b9) + update_ttnn_module_args(ttnn_module_args.c9) + ttnn_module_args.c9["use_1d_systolic_array"] = False + parameters["c9"], c9_parallel_config = preprocess_conv2d( + conv9_weight, conv9_bias, ttnn_module_args.c9, return_parallel_config=True + ) + + conv10_weight = model.c10.weight + conv10_bias = model.c10.bias + # conv10_bias = None + parameters["c10"] = {} + parameters["c10"]["weight"] = conv10_weight + parameters["c10"]["bias"] = conv10_bias + + ttnn_module_args.c11["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c11["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c11["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c11["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c11["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c11["deallocate_activation"] = False + ttnn_module_args.c11["conv_blocking_and_parallelization_config_override"] = None + + conv11_weight, conv11_bias = fold_batch_norm2d_into_conv2d(model.c11, model.b11) + update_ttnn_module_args(ttnn_module_args.c11) + parameters["c11"], c11_parallel_config = preprocess_conv2d( + conv11_weight, conv11_bias, ttnn_module_args.c11, return_parallel_config=True + ) + + ttnn_module_args.c12["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c12["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c12["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c12["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c12["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c12["deallocate_activation"] = True + ttnn_module_args.c12["conv_blocking_and_parallelization_config_override"] = None + + conv12_weight, conv12_bias = fold_batch_norm2d_into_conv2d(model.c12, model.b12) + update_ttnn_module_args(ttnn_module_args.c12) + parameters["c12"], c12_parallel_config = preprocess_conv2d( + 
conv12_weight, conv12_bias, ttnn_module_args.c12, return_parallel_config=True + ) + + ttnn_module_args.c13["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c13["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c13["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c13["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c13["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c13["deallocate_activation"] = True + ttnn_module_args.c13["conv_blocking_and_parallelization_config_override"] = None + + conv13_weight, conv13_bias = fold_batch_norm2d_into_conv2d(model.c13, model.b13) + update_ttnn_module_args(ttnn_module_args.c13) + parameters["c13"], c13_parallel_config = preprocess_conv2d( + conv13_weight, conv13_bias, ttnn_module_args.c13, return_parallel_config=True + ) + + ttnn_module_args.c14["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c14["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c14["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c14["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c14["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c14["deallocate_activation"] = True + ttnn_module_args.c14["conv_blocking_and_parallelization_config_override"] = None + + conv14_weight, conv14_bias = fold_batch_norm2d_into_conv2d(model.c14, model.b14) + update_ttnn_module_args(ttnn_module_args.c14) + parameters["c14"], c14_parallel_config = preprocess_conv2d( + conv14_weight, conv14_bias, ttnn_module_args.c14, return_parallel_config=True + ) + + ttnn_module_args.c15["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c15["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c15["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c15["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c15["activation"] = None # 
Fuse relu with conv1 + ttnn_module_args.c15["deallocate_activation"] = True + ttnn_module_args.c15["conv_blocking_and_parallelization_config_override"] = None + + conv15_weight, conv15_bias = fold_batch_norm2d_into_conv2d(model.c15, model.b15) + update_ttnn_module_args(ttnn_module_args.c15) + parameters["c15"], c15_parallel_config = preprocess_conv2d( + conv15_weight, conv15_bias, ttnn_module_args.c15, return_parallel_config=True + ) + + ttnn_module_args.c16["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c16["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c16["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c16["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c16["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c16["deallocate_activation"] = True + ttnn_module_args.c16["conv_blocking_and_parallelization_config_override"] = None + + conv16_weight, conv16_bias = fold_batch_norm2d_into_conv2d(model.c16, model.b16) + update_ttnn_module_args(ttnn_module_args.c16) + parameters["c16"], c16_parallel_config = preprocess_conv2d( + conv16_weight, conv16_bias, ttnn_module_args.c16, return_parallel_config=True + ) + + ttnn_module_args.c17["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c17["use_shallow_conv_variant"] = ( + False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + ) + ttnn_module_args.c17["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c17["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c17["deallocate_activation"] = True + ttnn_module_args.c17["conv_blocking_and_parallelization_config_override"] = None + # conv17_weight, conv17_bias = model.c17, model.b17 + conv17_weight, conv17_bias = fold_batch_norm2d_into_conv2d(model.c17, model.b17) + update_ttnn_module_args(ttnn_module_args.c17) + parameters["c17"], c17_parallel_config = preprocess_conv2d( + conv17_weight, conv17_bias, ttnn_module_args.c17, return_parallel_config=True + ) + + 
conv18_weight = model.c18.weight + conv18_bias = model.c18.bias + parameters["c18"] = {} + parameters["c18"]["weight"] = conv18_weight + parameters["c18"]["bias"] = conv18_bias + + return parameters diff --git a/tests/ttnn/integration_tests/yolov4/custom_preprocessor_neck.py b/tests/ttnn/integration_tests/yolov4/custom_preprocessor_neck.py new file mode 100755 index 00000000000..cb0fc2b2838 --- /dev/null +++ b/tests/ttnn/integration_tests/yolov4/custom_preprocessor_neck.py @@ -0,0 +1,376 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + + +import torch +import torch.nn as nn + +from ttnn.model_preprocessing import preprocess_model, preprocess_conv2d, fold_batch_norm2d_into_conv2d + +from tests.ttnn.utils_for_testing import assert_with_pcc +from models.utility_functions import skip_for_wormhole_b0 +from models.experimental.functional_yolov4.reference.neck import Neck +from models.experimental.functional_yolov4.tt.ttnn_neck import TtNeck + +import time +import tt_lib as ttl +import tt_lib.profiler as profiler + +import ttnn +import tt_lib +from ttnn.model_preprocessing import preprocess_conv2d, fold_batch_norm2d_into_conv2d +import ttnn + + +def update_ttnn_module_args(ttnn_module_args): + ttnn_module_args["use_1d_systolic_array"] = ttnn_module_args.in_channels < 256 + + +def update_ttnn_module_argsc3(ttnn_module_args): + ttnn_module_args["use_1d_systolic_array"] = True + + +def custom_preprocessor(device, model, name, ttnn_module_args): + parameters = {} + print("ttnn_module_args: ", ttnn_module_args) + if isinstance(model, Neck): + ttnn_module_args.c1["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c1["use_shallow_conv_variant"] = False # ( + ttnn_module_args.c1["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c1["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c1["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c1["deallocate_activation"] = True + 
ttnn_module_args.c1["conv_blocking_and_parallelization_config_override"] = None + # ttnn_module_args.c1["use_1d_systolic_array"] = True + conv1_weight, conv1_bias = fold_batch_norm2d_into_conv2d(model.c1, model.b1) + update_ttnn_module_args(ttnn_module_args.c1) + parameters["c1"], c1_parallel_config = preprocess_conv2d( + conv1_weight, conv1_bias, ttnn_module_args.c1, return_parallel_config=True + ) + + ttnn_module_args.c2["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c2["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c2["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c2["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c2["activation"] = None # Fuse relu with conv2 + ttnn_module_args.c2["deallocate_activation"] = True + ttnn_module_args.c2["conv_blocking_and_parallelization_config_override"] = None + + conv2_weight, conv2_bias = fold_batch_norm2d_into_conv2d(model.c2, model.b2) + update_ttnn_module_args(ttnn_module_args.c2) + parameters["c2"], c2_parallel_config = preprocess_conv2d( + conv2_weight, conv2_bias, ttnn_module_args.c2, return_parallel_config=True + ) + + ttnn_module_args.c3["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c3["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c3["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c3["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c3["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c3["deallocate_activation"] = True + ttnn_module_args.c3["conv_blocking_and_parallelization_config_override"] = None + update_ttnn_module_argsc3(ttnn_module_args.c3) + print("\n\n\n\nchecking here!: ", ttnn_module_args.c3["use_1d_systolic_array"]) + conv3_weight, conv3_bias = fold_batch_norm2d_into_conv2d(model.c3, model.b3) + update_ttnn_module_args(ttnn_module_args.c3) + parameters["c3"], c3_parallel_config = 
preprocess_conv2d( + conv3_weight, conv3_bias, ttnn_module_args.c3, return_parallel_config=True + ) + # parameters["c3"] = {} + # parameters["c3"]["weight"] = ttnn.from_torch(conv3_weight) + # ttnn_module_args.p1["deallocate_activation"] = False + # parameters["p1"] = {} + # ttnn_module_args.p1["parallel_config_override"] = { + # "grid_size": (c3_parallel_config.grid_size.x, c3_parallel_config.grid_size.y), + # "num_cores_nhw": c3_parallel_config.num_cores_nhw, + # } + # ttnn_module_args.p2["deallocate_activation"] = False + # parameters["p2"] = {} + # ttnn_module_args.p2["parallel_config_override"] = { + # "grid_size": (c3_parallel_config.grid_size.x, c3_parallel_config.grid_size.y), + # "num_cores_nhw": c3_parallel_config.num_cores_nhw, + # } + # ttnn_module_args.p3["deallocate_activation"] = False + # parameters["p3"] = {} + # ttnn_module_args.p3["parallel_config_override"] = { + # "grid_size": (c3_parallel_config.grid_size.x, c3_parallel_config.grid_size.y), + # "num_cores_nhw": c3_parallel_config.num_cores_nhw, + # } + ttnn_module_args.c4["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c4["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c4["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c4["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c4["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c4["deallocate_activation"] = True + ttnn_module_args.c4["conv_blocking_and_parallelization_config_override"] = None + + conv4_weight, conv4_bias = fold_batch_norm2d_into_conv2d(model.c4, model.b4) + update_ttnn_module_args(ttnn_module_args.c4) + parameters["c4"], c4_parallel_config = preprocess_conv2d( + conv4_weight, conv4_bias, ttnn_module_args.c4, return_parallel_config=True + ) + + ttnn_module_args.c5["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c5["use_shallow_conv_variant"] = False # ( + # False if device.arch() == 
tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c5["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c5["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c5["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c5["deallocate_activation"] = True + ttnn_module_args.c5["conv_blocking_and_parallelization_config_override"] = None + + conv5_weight, conv5_bias = fold_batch_norm2d_into_conv2d(model.c5, model.b5) + update_ttnn_module_args(ttnn_module_args.c5) + parameters["c5"], c5_parallel_config = preprocess_conv2d( + conv5_weight, conv5_bias, ttnn_module_args.c5, return_parallel_config=True + ) + + ttnn_module_args.c6["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c6["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c6["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c6["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c6["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c6["deallocate_activation"] = True + ttnn_module_args.c6["conv_blocking_and_parallelization_config_override"] = None + + conv6_weight, conv6_bias = fold_batch_norm2d_into_conv2d(model.c6, model.b6) + update_ttnn_module_args(ttnn_module_args.c6) + parameters["c6"], c6_parallel_config = preprocess_conv2d( + conv6_weight, conv6_bias, ttnn_module_args.c6, return_parallel_config=True + ) + + ttnn_module_args.c7["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c7["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c7["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c7["deallocate_activation"] = True + ttnn_module_args.c7["conv_blocking_and_parallelization_config_override"] = None + + conv7_weight, conv7_bias = fold_batch_norm2d_into_conv2d(model.c7, model.b7) + 
update_ttnn_module_args(ttnn_module_args.c7) + parameters["c7"], c7_parallel_config = preprocess_conv2d( + conv7_weight, conv7_bias, ttnn_module_args.c7, return_parallel_config=True + ) + + ttnn_module_args.c7_2["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c7_2["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c7_2["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7_2["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7_2["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c7_2["deallocate_activation"] = True + ttnn_module_args.c7_2["conv_blocking_and_parallelization_config_override"] = None + + conv7_2_weight, conv7_2_bias = fold_batch_norm2d_into_conv2d(model.c7_2, model.b7_2) + update_ttnn_module_args(ttnn_module_args.c7_2) + parameters["c7_2"], c7_2_parallel_config = preprocess_conv2d( + conv7_2_weight, conv7_2_bias, ttnn_module_args.c7_2, return_parallel_config=True + ) + + ttnn_module_args.c7_3["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c7_3["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c7_3["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7_3["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7_3["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c7_3["deallocate_activation"] = True + ttnn_module_args.c7_3["conv_blocking_and_parallelization_config_override"] = None + + conv7_3_weight, conv7_3_bias = fold_batch_norm2d_into_conv2d(model.c7_3, model.b7_3) + update_ttnn_module_args(ttnn_module_args.c7_3) + parameters["c7_3"], c7_3_parallel_config = preprocess_conv2d( + conv7_3_weight, conv7_3_bias, ttnn_module_args.c7_3, return_parallel_config=True + ) + + ttnn_module_args.c7_4["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c7_4["use_shallow_conv_variant"] = False # ( + # False if device.arch() == 
tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c7_4["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7_4["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7_4["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c7_4["deallocate_activation"] = True + ttnn_module_args.c7_4["conv_blocking_and_parallelization_config_override"] = None + + conv7_4_weight, conv7_4_bias = fold_batch_norm2d_into_conv2d(model.c7_4, model.b7_4) + update_ttnn_module_args(ttnn_module_args.c7_4) + parameters["c7_4"], c7_4_parallel_config = preprocess_conv2d( + conv7_4_weight, conv7_4_bias, ttnn_module_args.c7_4, return_parallel_config=True + ) + + ttnn_module_args.c7_5["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c7_5["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c7_5["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7_5["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c7_5["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c7_5["deallocate_activation"] = True + ttnn_module_args.c7_5["conv_blocking_and_parallelization_config_override"] = None + + conv7_5_weight, conv7_5_bias = fold_batch_norm2d_into_conv2d(model.c7_5, model.b7_5) + update_ttnn_module_args(ttnn_module_args.c7_5) + parameters["c7_5"], c7_5_parallel_config = preprocess_conv2d( + conv7_5_weight, conv7_5_bias, ttnn_module_args.c7_5, return_parallel_config=True + ) + + ttnn_module_args.c8["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c8["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c8["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c8["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c8["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c8["deallocate_activation"] = True + ttnn_module_args.c8["conv_blocking_and_parallelization_config_override"] = None + + conv8_weight, 
conv8_bias = fold_batch_norm2d_into_conv2d(model.c8, model.b8) + update_ttnn_module_args(ttnn_module_args.c8) + parameters["c8"], c8_parallel_config = preprocess_conv2d( + conv8_weight, conv8_bias, ttnn_module_args.c8, return_parallel_config=True + ) + + ttnn_module_args.c8_2["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c8_2["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c8_2["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c8_2["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c8_2["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c8_2["deallocate_activation"] = True + ttnn_module_args.c8_2["conv_blocking_and_parallelization_config_override"] = None + + conv8_2_weight, conv8_2_bias = fold_batch_norm2d_into_conv2d(model.c8_2, model.b8_2) + update_ttnn_module_args(ttnn_module_args.c8_2) + parameters["c8_2"], c8_2_parallel_config = preprocess_conv2d( + conv8_2_weight, conv8_2_bias, ttnn_module_args.c8_2, return_parallel_config=True + ) + + ttnn_module_args.c9["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c9["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c9["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c9["deallocate_activation"] = True + ttnn_module_args.c9["conv_blocking_and_parallelization_config_override"] = None + + conv9_weight, conv9_bias = fold_batch_norm2d_into_conv2d(model.c9, model.b9) + update_ttnn_module_argsc3(ttnn_module_args.c9) + parameters["c9"], c9_parallel_config = preprocess_conv2d( + conv9_weight, conv9_bias, ttnn_module_args.c9, return_parallel_config=True + ) + + ttnn_module_args.c9_2["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c9_2["use_shallow_conv_variant"] = False # ( + # False if 
device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c9_2["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9_2["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9_2["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c9_2["deallocate_activation"] = True + ttnn_module_args.c9_2["conv_blocking_and_parallelization_config_override"] = None + + conv9_2_weight, conv9_2_bias = fold_batch_norm2d_into_conv2d(model.c9_2, model.b9_2) + update_ttnn_module_args(ttnn_module_args.c9_2) + parameters["c9_2"], c9_2_parallel_config = preprocess_conv2d( + conv9_2_weight, conv9_2_bias, ttnn_module_args.c9_2, return_parallel_config=True + ) + + ttnn_module_args.c9_3["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c9_3["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c9_3["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9_3["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9_3["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c9_3["deallocate_activation"] = True + ttnn_module_args.c9_3["conv_blocking_and_parallelization_config_override"] = None + + conv9_3_weight, conv9_3_bias = fold_batch_norm2d_into_conv2d(model.c9_3, model.b9_3) + update_ttnn_module_args(ttnn_module_args.c9_3) + parameters["c9_3"], c9_3_parallel_config = preprocess_conv2d( + conv9_3_weight, conv9_3_bias, ttnn_module_args.c9_3, return_parallel_config=True + ) + + ttnn_module_args.c9_4["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c9_4["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c9_4["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9_4["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9_4["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c9_4["deallocate_activation"] = True + 
ttnn_module_args.c9_4["conv_blocking_and_parallelization_config_override"] = None + + conv9_4_weight, conv9_4_bias = fold_batch_norm2d_into_conv2d(model.c9_4, model.b9_4) + update_ttnn_module_args(ttnn_module_args.c9_4) + parameters["c9_4"], c9_4_parallel_config = preprocess_conv2d( + conv9_4_weight, conv9_4_bias, ttnn_module_args.c9_4, return_parallel_config=True + ) + + ttnn_module_args.c9_5["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c9_5["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c9_5["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9_5["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c9_5["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c9_5["deallocate_activation"] = True + ttnn_module_args.c9_5["conv_blocking_and_parallelization_config_override"] = None + + conv9_5_weight, conv9_5_bias = fold_batch_norm2d_into_conv2d(model.c9_5, model.b9_5) + update_ttnn_module_args(ttnn_module_args.c9_5) + parameters["c9_5"], c9_5_parallel_config = preprocess_conv2d( + conv9_5_weight, conv9_5_bias, ttnn_module_args.c9_5, return_parallel_config=True + ) + + ttnn_module_args.c10["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c10["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c10["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c10["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c10["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c10["deallocate_activation"] = True + ttnn_module_args.c10["conv_blocking_and_parallelization_config_override"] = None + + conv10_weight, conv10_bias = fold_batch_norm2d_into_conv2d(model.c10, model.b10) + update_ttnn_module_args(ttnn_module_args.c10) + parameters["c10"], c10_parallel_config = preprocess_conv2d( + conv10_weight, conv10_bias, ttnn_module_args.c10, return_parallel_config=True + ) + + 
ttnn_module_args.c10_2["math_fidelity"] = ttnn.MathFidelity.LoFi + ttnn_module_args.c10_2["use_shallow_conv_variant"] = False # ( + # False if device.arch() == tt_lib.device.Arch.WORMHOLE_B0 else True + # ) + ttnn_module_args.c10_2["dtype"] = ttnn.bfloat8_b + ttnn_module_args.c10_2["weights_dtype"] = ttnn.bfloat8_b + ttnn_module_args.c10_2["activation"] = None # Fuse relu with conv1 + ttnn_module_args.c10_2["deallocate_activation"] = True + ttnn_module_args.c10_2["conv_blocking_and_parallelization_config_override"] = None + + conv10_weight, conv10_bias = fold_batch_norm2d_into_conv2d(model.c10_2, model.b10_2) + update_ttnn_module_args(ttnn_module_args.c10_2) + parameters["c10_2"], c10_2_parallel_config = preprocess_conv2d( + conv10_weight, conv10_bias, ttnn_module_args.c10_2, return_parallel_config=True + ) + + return parameters diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py new file mode 100755 index 00000000000..d2fc5995972 --- /dev/null +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py @@ -0,0 +1,120 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + + +import torch + +from ttnn.model_preprocessing import preprocess_model + +from tests.ttnn.utils_for_testing import assert_with_pcc +from models.utility_functions import skip_for_wormhole_b0 + + +from models.experimental.functional_yolov4.reference.yolov4 import Yolov4 +from models.experimental.functional_yolov4.tt.ttnn_yolov4 import TtYolov4 + +import tests.ttnn.integration_tests.yolov4.custom_preprocessor_d1 as D1 +import tests.ttnn.integration_tests.yolov4.custom_preprocessor_d2 as D2 +import tests.ttnn.integration_tests.yolov4.custom_preprocessor_d3 as D3 +import tests.ttnn.integration_tests.yolov4.custom_preprocessor_d4 as D4 +import tests.ttnn.integration_tests.yolov4.custom_preprocessor_d5 as D5 +import tests.ttnn.integration_tests.yolov4.custom_preprocessor_neck as neck +import tests.ttnn.integration_tests.yolov4.custom_preprocessor_head as head +import ttnn + + +def create_custom_preprocessor(device): + def custom_preprocessor(model, name, ttnn_module_args): + parameters = {} + parameters["downsample1"] = D1.custom_preprocessor( + device, model.downsample1, name, ttnn_module_args["downsample1"] + ) + parameters["downsample2"] = D2.custom_preprocessor( + device, model.downsample2, name, ttnn_module_args["downsample2"] + ) + parameters["downsample3"] = D3.custom_preprocessor( + device, model.downsample3, name, ttnn_module_args["downsample3"] + ) + parameters["downsample4"] = D4.custom_preprocessor( + device, model.downsample4, name, ttnn_module_args["downsample4"] + ) + parameters["downsample5"] = D5.custom_preprocessor( + device, model.downsample5, name, ttnn_module_args["downsample5"] + ) + parameters["neck"] = neck.custom_preprocessor(device, model.neck, name, ttnn_module_args["neck"]) + parameters["head"] = head.custom_preprocessor(device, model.head, name, ttnn_module_args["head"]) + return parameters + + return custom_preprocessor + + +import pytest + + +@pytest.mark.parametrize("device_l1_small_size", 
[32768], indirect=True) +@skip_for_wormhole_b0() +def test_downsample1(device, reset_seeds, model_location_generator): + model_path = model_location_generator("models", model_subdir="Yolo") + weights_pth = str(model_path / "yolov4.pth") + state_dict = torch.load(weights_pth) + ds_state_dict = { + k: v + for k, v in state_dict.items() + if (k.startswith(("down1.", "down2.", "down3.", "down4.", "down5.", "neek.", "head."))) + } + torch_model = Yolov4() + new_state_dict = {} + keys = [name for name, parameter in torch_model.state_dict().items()] + values = [parameter for name, parameter in ds_state_dict.items()] + for i in range(len(keys)): + new_state_dict[keys[i]] = values[i] + + torch_model.load_state_dict(new_state_dict) + torch_model.eval() + + torch_input_tensor = torch.randn(1, 3, 320, 320) # Batch size of 1, 128 input channels, 160x160 height and width + torch_output_tensor1, torch_output_tensor2, torch_output_tensor3 = torch_model(torch_input_tensor) + reader_patterns_cache = {} + parameters = preprocess_model( + initialize_model=lambda: torch_model, + run_model=lambda model: model(torch_input_tensor), + custom_preprocessor=create_custom_preprocessor(device), + reader_patterns_cache=reader_patterns_cache, + device=device, + ) + + ttnn_model = TtYolov4(device, parameters) + + # Tensor Preprocessing + # + input_shape = torch_input_tensor.shape + input_tensor = torch.permute(torch_input_tensor, (0, 2, 3, 1)) + + input_tensor = input_tensor.reshape( + input_tensor.shape[0], 1, input_tensor.shape[1] * input_tensor.shape[2], input_tensor.shape[3] + ) + input_tensor = ttnn.from_torch(input_tensor, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT) + output_tensor1, output_tensor2, output_tensor3 = ttnn_model(device, input_tensor) + + # + # Tensor Postprocessing + # + output_tensor1 = ttnn.to_torch(output_tensor1) + output_tensor1 = output_tensor1.reshape(1, 40, 40, 255) + output_tensor1 = torch.permute(output_tensor1, (0, 3, 1, 2)) + output_tensor1 = 
output_tensor1.to(torch_input_tensor.dtype) + + output_tensor2 = ttnn.to_torch(output_tensor2) + output_tensor2 = output_tensor2.reshape(1, 20, 20, 255) + output_tensor2 = torch.permute(output_tensor2, (0, 3, 1, 2)) + output_tensor2 = output_tensor2.to(torch_input_tensor.dtype) + + output_tensor3 = ttnn.to_torch(output_tensor3) + output_tensor3 = output_tensor3.reshape(1, 10, 10, 255) + output_tensor3 = torch.permute(output_tensor3, (0, 3, 1, 2)) + output_tensor3 = output_tensor3.to(torch_input_tensor.dtype) + + assert_with_pcc(torch_output_tensor1, output_tensor1, pcc=0.99) # PCC = 0.3981625206948637 + assert_with_pcc(torch_output_tensor2, output_tensor2, pcc=0.99) # PCC = 0.8124473470847513 + assert_with_pcc(torch_output_tensor3, output_tensor3, pcc=0.99) # PCC = 0.7684023245417583