diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index a1459da..0000000
Binary files a/.DS_Store and /dev/null differ
diff --git a/.github/verilog_ci.yml b/.github/workflows/verilog_ci.yml
similarity index 80%
rename from .github/verilog_ci.yml
rename to .github/workflows/verilog_ci.yml
index 5f66dd5..1bdf3a0 100644
--- a/.github/verilog_ci.yml
+++ b/.github/workflows/verilog_ci.yml
@@ -12,9 +12,9 @@ jobs:
     - name: Setup Icarus Verilog
       run: sudo apt-get update && sudo apt-get install -y iverilog
     
+    - name: Install virtualenv
+      run: sudo apt-get install -y python3-virtualenv
+    
     - name: Test Verilog Code
       run: |
         make test
-    
-    - name: Run Tests
-      run: vvp output_name.vvp
diff --git a/.gitignore b/.gitignore
index 7f2684e..10ef8af 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,9 @@
 *.vcd
 *.vvp
 *.DS_Store
+**/results.xml
+**/sim_build/
+**/build/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/makefile b/Makefile
similarity index 100%
rename from makefile
rename to Makefile
diff --git a/build/block.vpp b/build/block.vpp
deleted file mode 100755
index 2988285..0000000
--- a/build/block.vpp
+++ /dev/null
@@ -1,460 +0,0 @@
-#! /usr/local/Cellar/icarus-verilog/12.0/bin/vvp
-:ivl_version "12.0 (stable)";
-:ivl_delay_selection "TYPICAL";
-:vpi_time_precision - 12;
-:vpi_module "/usr/local/Cellar/icarus-verilog/12.0/lib/ivl/system.vpi";
-:vpi_module "/usr/local/Cellar/icarus-verilog/12.0/lib/ivl/vhdl_sys.vpi";
-:vpi_module "/usr/local/Cellar/icarus-verilog/12.0/lib/ivl/vhdl_textio.vpi";
-:vpi_module "/usr/local/Cellar/icarus-verilog/12.0/lib/ivl/v2005_math.vpi";
-:vpi_module "/usr/local/Cellar/icarus-verilog/12.0/lib/ivl/va_math.vpi";
-S_0x7fdb61b04160 .scope module, "block_tb" "block_tb" 2 4;
- .timescale -9 -12;
-v0x7fdb61914950_0 .var "clk", 0 0;
-v0x7fdb61914a10_0 .var "compute", 0 0;
-v0x7fdb61914ac0_0 .var "inp_north", 31 0;
-v0x7fdb61914bb0_0 .var "inp_west", 31 0;
-v0x7fdb61914c80_0 .net "outp_east", 31 0, v0x7fdb61914450_0;  1 drivers
-v0x7fdb61914d50_0 .net "outp_south", 31 0, v0x7fdb61914500_0;  1 drivers
-v0x7fdb61914de0_0 .var "rst", 0 0;
-v0x7fdb61914e70_0 .var "weight_en", 0 0;
-v0x7fdb61914f20_0 .var "weight_in", 31 0;
-S_0x7fdb61b042e0 .scope module, "uut" "block" 2 11, 3 1 0, S_0x7fdb61b04160;
- .timescale -9 -12;
-    .port_info 0 /INPUT 32 "inp_north";
-    .port_info 1 /INPUT 32 "inp_west";
-    .port_info 2 /INPUT 32 "weight_in";
-    .port_info 3 /OUTPUT 32 "outp_south";
-    .port_info 4 /OUTPUT 32 "outp_east";
-    .port_info 5 /INPUT 1 "clk";
-    .port_info 6 /INPUT 1 "rst";
-    .port_info 7 /INPUT 1 "compute";
-    .port_info 8 /INPUT 1 "weight_en";
-v0x7fdb61914000_0 .net "add_result", 31 0, L_0x7fdb61918740;  1 drivers
-v0x7fdb619140b0_0 .net "clk", 0 0, v0x7fdb61914950_0;  1 drivers
-v0x7fdb61914140_0 .net "compute", 0 0, v0x7fdb61914a10_0;  1 drivers
-v0x7fdb619141f0_0 .net "inp_north", 31 0, v0x7fdb61914ac0_0;  1 drivers
-v0x7fdb619142b0_0 .net "inp_west", 31 0, v0x7fdb61914bb0_0;  1 drivers
-v0x7fdb61914380_0 .net "mul_result", 31 0, L_0x7fdb61915610;  1 drivers
-v0x7fdb61914450_0 .var "outp_east", 31 0;
-v0x7fdb61914500_0 .var "outp_south", 31 0;
-v0x7fdb619145b0_0 .net "rst", 0 0, v0x7fdb61914de0_0;  1 drivers
-v0x7fdb619146c0_0 .var "weight", 31 0;
-v0x7fdb61914770_0 .net "weight_en", 0 0, v0x7fdb61914e70_0;  1 drivers
-v0x7fdb61914800_0 .net "weight_in", 31 0, v0x7fdb61914f20_0;  1 drivers
-E_0x7fdb61b045e0 .event posedge, v0x7fdb619140b0_0, v0x7fdb619145b0_0;
-S_0x7fdb61b04640 .scope module, "add_instance" "fadd" 3 19, 4 4 0, S_0x7fdb61b042e0;
- .timescale -9 -12;
-    .port_info 0 /INPUT 32 "a_operand";
-    .port_info 1 /INPUT 32 "b_operand";
-    .port_info 2 /OUTPUT 32 "result";
-L_0x7fdb61916690 .functor OR 1, L_0x7fdb61916430, L_0x7fdb619165f0, C4<0>, C4<0>;
-L_0x7fdb61916b10 .functor XOR 1, L_0x7fdb61916930, L_0x7fdb619169d0, C4<0>, C4<0>;
-L_0x7fdb61916b80 .functor NOT 1, L_0x7fdb61916b10, C4<0>, C4<0>, C4<0>;
-v0x7fdb61b04860_0 .net "Exception", 0 0, L_0x7fdb61916690;  1 drivers
-v0x7fdb619108b0_0 .net *"_ivl_100", 7 0, L_0x7fdb61918360;  1 drivers
-v0x7fdb61910970_0 .net *"_ivl_103", 7 0, L_0x7fdb61918120;  1 drivers
-v0x7fdb61910a10_0 .net *"_ivl_104", 7 0, L_0x7fdb619181c0;  1 drivers
-L_0x7fdb61863248 .functor BUFT 1, C4<00000000000000000000000000000000>, C4<0>, C4<0>, C4<0>;
-v0x7fdb61910ac0_0 .net/2u *"_ivl_106", 31 0, L_0x7fdb61863248;  1 drivers
-v0x7fdb61910bb0_0 .net *"_ivl_108", 31 0, L_0x7fdb61918660;  1 drivers
-v0x7fdb61910c60_0 .net *"_ivl_11", 63 0, L_0x7fdb61915d50;  1 drivers
-v0x7fdb61910d10_0 .net *"_ivl_13", 63 0, L_0x7fdb61915e10;  1 drivers
-v0x7fdb61910dc0_0 .net *"_ivl_16", 7 0, L_0x7fdb61915f70;  1 drivers
-v0x7fdb61910ed0_0 .net *"_ivl_20", 7 0, L_0x7fdb61916140;  1 drivers
-v0x7fdb61910f80_0 .net *"_ivl_24", 7 0, L_0x7fdb61916320;  1 drivers
-v0x7fdb61911030_0 .net *"_ivl_26", 0 0, L_0x7fdb61916430;  1 drivers
-v0x7fdb619110d0_0 .net *"_ivl_28", 7 0, L_0x7fdb619164d0;  1 drivers
-v0x7fdb61911180_0 .net *"_ivl_30", 0 0, L_0x7fdb619165f0;  1 drivers
-v0x7fdb61911220_0 .net *"_ivl_36", 0 0, L_0x7fdb61916930;  1 drivers
-v0x7fdb619112d0_0 .net *"_ivl_38", 0 0, L_0x7fdb619169d0;  1 drivers
-v0x7fdb61911380_0 .net *"_ivl_39", 0 0, L_0x7fdb61916b10;  1 drivers
-v0x7fdb61911510_0 .net *"_ivl_4", 30 0, L_0x7fdb61915920;  1 drivers
-L_0x7fdb61863098 .functor BUFT 1, C4<1>, C4<0>, C4<0>, C4<0>;
-v0x7fdb619115a0_0 .net/2u *"_ivl_43", 0 0, L_0x7fdb61863098;  1 drivers
-v0x7fdb61911650_0 .net *"_ivl_46", 22 0, L_0x7fdb61916bf0;  1 drivers
-L_0x7fdb618630e0 .functor BUFT 1, C4<1>, C4<0>, C4<0>, C4<0>;
-v0x7fdb61911700_0 .net/2u *"_ivl_49", 0 0, L_0x7fdb618630e0;  1 drivers
-v0x7fdb619117b0_0 .net *"_ivl_52", 22 0, L_0x7fdb61916a70;  1 drivers
-v0x7fdb61911860_0 .net *"_ivl_56", 7 0, L_0x7fdb619170e0;  1 drivers
-v0x7fdb61911910_0 .net *"_ivl_58", 7 0, L_0x7fdb61916e10;  1 drivers
-v0x7fdb619119c0_0 .net *"_ivl_6", 30 0, L_0x7fdb619159c0;  1 drivers
-v0x7fdb61911a70_0 .net *"_ivl_64", 7 0, L_0x7fdb61917510;  1 drivers
-v0x7fdb61911b20_0 .net *"_ivl_67", 24 0, L_0x7fdb61917700;  1 drivers
-v0x7fdb61911bd0_0 .net *"_ivl_7", 0 0, L_0x7fdb61915b00;  1 drivers
-L_0x7fdb61863128 .functor BUFT 1, C4<0>, C4<0>, C4<0>, C4<0>;
-v0x7fdb61911c70_0 .net *"_ivl_70", 0 0, L_0x7fdb61863128;  1 drivers
-v0x7fdb61911d20_0 .net *"_ivl_71", 24 0, L_0x7fdb619177a0;  1 drivers
-L_0x7fdb61863170 .functor BUFT 1, C4<0>, C4<0>, C4<0>, C4<0>;
-v0x7fdb61911dd0_0 .net *"_ivl_74", 0 0, L_0x7fdb61863170;  1 drivers
-v0x7fdb61911e80_0 .net *"_ivl_75", 24 0, L_0x7fdb61917610;  1 drivers
-L_0x7fdb618631b8 .functor BUFT 1, C4<0000000000000000000000000>, C4<0>, C4<0>, C4<0>;
-v0x7fdb61911f30_0 .net/2u *"_ivl_77", 24 0, L_0x7fdb618631b8;  1 drivers
-v0x7fdb61911430_0 .net *"_ivl_84", 0 0, L_0x7fdb61917bd0;  1 drivers
-v0x7fdb619121c0_0 .net *"_ivl_86", 22 0, L_0x7fdb61917c70;  1 drivers
-v0x7fdb61912250_0 .net *"_ivl_88", 22 0, L_0x7fdb61917ac0;  1 drivers
-v0x7fdb619122f0_0 .net *"_ivl_89", 22 0, L_0x7fdb61917eb0;  1 drivers
-v0x7fdb619123a0_0 .net *"_ivl_9", 63 0, L_0x7fdb61915c30;  1 drivers
-v0x7fdb61912450_0 .net *"_ivl_95", 0 0, L_0x7fdb61918080;  1 drivers
-L_0x7fdb61863200 .functor BUFT 1, C4<00000001>, C4<0>, C4<0>, C4<0>;
-v0x7fdb61912500_0 .net/2u *"_ivl_96", 7 0, L_0x7fdb61863200;  1 drivers
-v0x7fdb619125b0_0 .net *"_ivl_99", 7 0, L_0x7fdb61917f50;  1 drivers
-v0x7fdb61912660_0 .net "a_operand", 31 0, v0x7fdb61914ac0_0;  alias, 1 drivers
-v0x7fdb61912710_0 .net "add_sum", 30 0, L_0x7fdb61917d50;  1 drivers
-v0x7fdb619127c0_0 .net "b_operand", 31 0, L_0x7fdb61915610;  alias, 1 drivers
-v0x7fdb61912870_0 .net "exp_a", 0 0, L_0x7fdb619160a0;  1 drivers
-v0x7fdb61912910_0 .net "exp_b", 0 0, L_0x7fdb61916280;  1 drivers
-v0x7fdb619129b0_0 .net "exponent_b_add", 7 0, L_0x7fdb61917180;  1 drivers
-v0x7fdb61912a60_0 .net "exponent_diff", 7 0, L_0x7fdb61917250;  1 drivers
-v0x7fdb61912b10_0 .net "operand_a", 31 0, L_0x7fdb619157a0;  1 drivers
-v0x7fdb61912bc0_0 .net "operand_b", 31 0, L_0x7fdb61915840;  1 drivers
-v0x7fdb61912c70_0 .net "operation_sub_addBar", 0 0, L_0x7fdb61916b80;  1 drivers
-v0x7fdb61912d10_0 .net "output_sign", 0 0, L_0x7fdb61916780;  1 drivers
-v0x7fdb61912db0_0 .net "result", 31 0, L_0x7fdb61918740;  alias, 1 drivers
-v0x7fdb61912e60_0 .net "significand_a", 23 0, L_0x7fdb61916cb0;  1 drivers
-v0x7fdb61912f10_0 .net "significand_add", 24 0, L_0x7fdb61917a20;  1 drivers
-v0x7fdb61912fc0_0 .net "significand_b", 23 0, L_0x7fdb61916ec0;  1 drivers
-v0x7fdb61913070_0 .net "significand_b_add", 23 0, L_0x7fdb61917350;  1 drivers
-L_0x7fdb619157a0 .part L_0x7fdb61915e10, 32, 32;
-L_0x7fdb61915840 .part L_0x7fdb61915e10, 0, 32;
-L_0x7fdb61915920 .part v0x7fdb61914ac0_0, 0, 31;
-L_0x7fdb619159c0 .part L_0x7fdb61915610, 0, 31;
-L_0x7fdb61915b00 .cmp/gt 31, L_0x7fdb619159c0, L_0x7fdb61915920;
-L_0x7fdb61915c30 .concat [ 32 32 0 0], v0x7fdb61914ac0_0, L_0x7fdb61915610;
-L_0x7fdb61915d50 .concat [ 32 32 0 0], L_0x7fdb61915610, v0x7fdb61914ac0_0;
-L_0x7fdb61915e10 .functor MUXZ 64, L_0x7fdb61915d50, L_0x7fdb61915c30, L_0x7fdb61915b00, C4<>;
-L_0x7fdb61915f70 .part L_0x7fdb619157a0, 23, 8;
-L_0x7fdb619160a0 .part L_0x7fdb61915f70, 0, 1;
-L_0x7fdb61916140 .part L_0x7fdb61915840, 23, 8;
-L_0x7fdb61916280 .part L_0x7fdb61916140, 0, 1;
-L_0x7fdb61916320 .part L_0x7fdb619157a0, 23, 8;
-L_0x7fdb61916430 .reduce/and L_0x7fdb61916320;
-L_0x7fdb619164d0 .part L_0x7fdb61915840, 23, 8;
-L_0x7fdb619165f0 .reduce/and L_0x7fdb619164d0;
-L_0x7fdb61916780 .part L_0x7fdb619157a0, 31, 1;
-L_0x7fdb61916930 .part L_0x7fdb619157a0, 31, 1;
-L_0x7fdb619169d0 .part L_0x7fdb61915840, 31, 1;
-L_0x7fdb61916bf0 .part L_0x7fdb619157a0, 0, 23;
-L_0x7fdb61916cb0 .concat [ 23 1 0 0], L_0x7fdb61916bf0, L_0x7fdb61863098;
-L_0x7fdb61916a70 .part L_0x7fdb61915840, 0, 23;
-L_0x7fdb61916ec0 .concat [ 23 1 0 0], L_0x7fdb61916a70, L_0x7fdb618630e0;
-L_0x7fdb619170e0 .part L_0x7fdb619157a0, 23, 8;
-L_0x7fdb61916e10 .part L_0x7fdb61915840, 23, 8;
-L_0x7fdb61917250 .arith/sub 8, L_0x7fdb619170e0, L_0x7fdb61916e10;
-L_0x7fdb61917350 .shift/r 24, L_0x7fdb61916ec0, L_0x7fdb61917250;
-L_0x7fdb61917510 .part L_0x7fdb61915840, 23, 8;
-L_0x7fdb61917180 .arith/sum 8, L_0x7fdb61917510, L_0x7fdb61917250;
-L_0x7fdb61917700 .concat [ 24 1 0 0], L_0x7fdb61916cb0, L_0x7fdb61863128;
-L_0x7fdb619177a0 .concat [ 24 1 0 0], L_0x7fdb61917350, L_0x7fdb61863170;
-L_0x7fdb61917610 .arith/sum 25, L_0x7fdb61917700, L_0x7fdb619177a0;
-L_0x7fdb61917a20 .functor MUXZ 25, L_0x7fdb618631b8, L_0x7fdb61917610, L_0x7fdb61916b80, C4<>;
-L_0x7fdb61917bd0 .part L_0x7fdb61917a20, 24, 1;
-L_0x7fdb61917c70 .part L_0x7fdb61917a20, 1, 23;
-L_0x7fdb61917ac0 .part L_0x7fdb61917a20, 0, 23;
-L_0x7fdb61917eb0 .functor MUXZ 23, L_0x7fdb61917ac0, L_0x7fdb61917c70, L_0x7fdb61917bd0, C4<>;
-L_0x7fdb61917d50 .concat8 [ 23 8 0 0], L_0x7fdb61917eb0, L_0x7fdb619181c0;
-L_0x7fdb61918080 .part L_0x7fdb61917a20, 24, 1;
-L_0x7fdb61917f50 .part L_0x7fdb619157a0, 23, 8;
-L_0x7fdb61918360 .arith/sum 8, L_0x7fdb61863200, L_0x7fdb61917f50;
-L_0x7fdb61918120 .part L_0x7fdb619157a0, 23, 8;
-L_0x7fdb619181c0 .functor MUXZ 8, L_0x7fdb61918120, L_0x7fdb61918360, L_0x7fdb61918080, C4<>;
-L_0x7fdb61918660 .concat [ 31 1 0 0], L_0x7fdb61917d50, L_0x7fdb61916780;
-L_0x7fdb61918740 .functor MUXZ 32, L_0x7fdb61918660, L_0x7fdb61863248, L_0x7fdb61916690, C4<>;
-S_0x7fdb61913170 .scope module, "mul_instance" "fmul" 3 13, 5 4 0, S_0x7fdb61b042e0;
- .timescale -9 -12;
-    .port_info 0 /INPUT 32 "a_in";
-    .port_info 1 /INPUT 32 "b_in";
-    .port_info 2 /OUTPUT 32 "result";
-L_0x7fdb619152b0 .functor XOR 1, L_0x7fdb619150f0, L_0x7fdb619151b0, C4<0>, C4<0>;
-L_0x7fdb61915560 .functor AND 23, L_0x7fdb61915400, v0x7fdb619134e0_0, C4<11111111111111111111111>, C4<11111111111111111111111>;
-v0x7fdb619134e0_0 .var "M_result", 22 0;
-L_0x7fdb61863050 .functor BUFT 1, C4<11111111111111111111111>, C4<0>, C4<0>, C4<0>;
-v0x7fdb61913570_0 .net/2u *"_ivl_10", 22 0, L_0x7fdb61863050;  1 drivers
-v0x7fdb61913600_0 .net *"_ivl_14", 22 0, L_0x7fdb61915560;  1 drivers
-v0x7fdb61913690_0 .net *"_ivl_3", 0 0, L_0x7fdb619150f0;  1 drivers
-v0x7fdb61913740_0 .net *"_ivl_5", 0 0, L_0x7fdb619151b0;  1 drivers
-L_0x7fdb61863008 .functor BUFT 1, C4<00000000000000000000000>, C4<0>, C4<0>, C4<0>;
-v0x7fdb61913830_0 .net/2u *"_ivl_8", 22 0, L_0x7fdb61863008;  1 drivers
-v0x7fdb619138e0_0 .net "a_in", 31 0, v0x7fdb61914bb0_0;  alias, 1 drivers
-v0x7fdb61913990_0 .net "b_in", 31 0, v0x7fdb619146c0_0;  1 drivers
-v0x7fdb61913a40_0 .net "e_result", 7 0, L_0x7fdb61915050;  1 drivers
-v0x7fdb61913b50_0 .var "e_result0", 8 0;
-v0x7fdb61913c00_0 .var "mul_fix_out", 47 0;
-v0x7fdb61913cb0_0 .var "overflow", 0 0;
-v0x7fdb61913d50_0 .net "overflow_mask", 22 0, L_0x7fdb61915400;  1 drivers
-v0x7fdb61913e00_0 .net "result", 31 0, L_0x7fdb61915610;  alias, 1 drivers
-v0x7fdb61913ec0_0 .net "sign", 0 0, L_0x7fdb619152b0;  1 drivers
-v0x7fdb61913f50_0 .var "zero_check", 0 0;
-E_0x7fdb619133b0/0 .event anyedge, v0x7fdb61913f50_0, v0x7fdb619138e0_0, v0x7fdb61913990_0, v0x7fdb61913c00_0;
-E_0x7fdb619133b0/1 .event anyedge, v0x7fdb61913cb0_0;
-E_0x7fdb619133b0 .event/or E_0x7fdb619133b0/0, E_0x7fdb619133b0/1;
-E_0x7fdb61913430 .event anyedge, v0x7fdb61913c00_0;
-E_0x7fdb61913480 .event anyedge, v0x7fdb619138e0_0, v0x7fdb61913990_0;
-L_0x7fdb61915050 .part v0x7fdb61913b50_0, 0, 8;
-L_0x7fdb619150f0 .part v0x7fdb61914bb0_0, 31, 1;
-L_0x7fdb619151b0 .part v0x7fdb619146c0_0, 31, 1;
-L_0x7fdb61915400 .functor MUXZ 23, L_0x7fdb61863050, L_0x7fdb61863008, v0x7fdb61913cb0_0, C4<>;
-L_0x7fdb61915610 .concat [ 23 8 1 0], L_0x7fdb61915560, L_0x7fdb61915050, L_0x7fdb619152b0;
-    .scope S_0x7fdb61913170;
-T_0 ;
-    %wait E_0x7fdb61913480;
-    %pushi/vec4 1, 0, 1;
-    %load/vec4 v0x7fdb619138e0_0;
-    %parti/s 23, 0, 2;
-    %concat/vec4; draw_concat_vec4
-    %pad/u 48;
-    %pushi/vec4 1, 0, 1;
-    %load/vec4 v0x7fdb61913990_0;
-    %parti/s 23, 0, 2;
-    %concat/vec4; draw_concat_vec4
-    %pad/u 48;
-    %mul;
-    %store/vec4 v0x7fdb61913c00_0, 0, 48;
-    %jmp T_0;
-    .thread T_0, $push;
-    .scope S_0x7fdb61913170;
-T_1 ;
-    %wait E_0x7fdb61913480;
-    %load/vec4 v0x7fdb619138e0_0;
-    %parti/s 8, 23, 6;
-    %pad/u 32;
-    %cmpi/e 0, 0, 32;
-    %jmp/1 T_1.2, 4;
-    %flag_mov 8, 4;
-    %load/vec4 v0x7fdb61913990_0;
-    %parti/s 8, 23, 6;
-    %pad/u 32;
-    %cmpi/e 0, 0, 32;
-    %flag_or 4, 8;
-T_1.2;
-    %jmp/0xz  T_1.0, 4;
-    %pushi/vec4 1, 0, 1;
-    %store/vec4 v0x7fdb61913f50_0, 0, 1;
-    %jmp T_1.1;
-T_1.0 ;
-    %pushi/vec4 0, 0, 1;
-    %store/vec4 v0x7fdb61913f50_0, 0, 1;
-T_1.1 ;
-    %jmp T_1;
-    .thread T_1, $push;
-    .scope S_0x7fdb61913170;
-T_2 ;
-    %wait E_0x7fdb61913430;
-    %load/vec4 v0x7fdb61913c00_0;
-    %parti/s 2, 46, 7;
-    %dup/vec4;
-    %pushi/vec4 1, 0, 2;
-    %cmp/u;
-    %jmp/1 T_2.0, 6;
-    %dup/vec4;
-    %pushi/vec4 2, 0, 2;
-    %cmp/u;
-    %jmp/1 T_2.1, 6;
-    %dup/vec4;
-    %pushi/vec4 3, 0, 2;
-    %cmp/u;
-    %jmp/1 T_2.2, 6;
-    %load/vec4 v0x7fdb61913c00_0;
-    %parti/s 23, 24, 6;
-    %store/vec4 v0x7fdb619134e0_0, 0, 23;
-    %jmp T_2.4;
-T_2.0 ;
-    %load/vec4 v0x7fdb61913c00_0;
-    %parti/s 23, 23, 6;
-    %store/vec4 v0x7fdb619134e0_0, 0, 23;
-    %jmp T_2.4;
-T_2.1 ;
-    %load/vec4 v0x7fdb61913c00_0;
-    %parti/s 23, 24, 6;
-    %store/vec4 v0x7fdb619134e0_0, 0, 23;
-    %jmp T_2.4;
-T_2.2 ;
-    %load/vec4 v0x7fdb61913c00_0;
-    %parti/s 23, 24, 6;
-    %store/vec4 v0x7fdb619134e0_0, 0, 23;
-    %jmp T_2.4;
-T_2.4 ;
-    %pop/vec4 1;
-    %jmp T_2;
-    .thread T_2, $push;
-    .scope S_0x7fdb61913170;
-T_3 ;
-    %wait E_0x7fdb619133b0;
-    %load/vec4 v0x7fdb61913f50_0;
-    %flag_set/vec4 8;
-    %jmp/1 T_3.1, 8;
-    %pushi/vec4 0, 0, 1;
-    %load/vec4 v0x7fdb619138e0_0;
-    %parti/s 8, 23, 6;
-    %concat/vec4; draw_concat_vec4
-    %pushi/vec4 0, 0, 1;
-    %load/vec4 v0x7fdb61913990_0;
-    %parti/s 8, 23, 6;
-    %concat/vec4; draw_concat_vec4
-    %add;
-    %pushi/vec4 0, 0, 8;
-    %load/vec4 v0x7fdb61913c00_0;
-    %parti/s 1, 47, 7;
-    %concat/vec4; draw_concat_vec4
-    %add;
-    %cmpi/u 127, 0, 9;
-    %flag_or 8, 5;
-T_3.1;
-    %flag_get/vec4 8;
-    %jmp/1 T_3.0, 8;
-    %pushi/vec4 381, 0, 32;
-    %pushi/vec4 0, 0, 1;
-    %load/vec4 v0x7fdb619138e0_0;
-    %parti/s 8, 23, 6;
-    %concat/vec4; draw_concat_vec4
-    %pad/u 32;
-    %pushi/vec4 0, 0, 1;
-    %load/vec4 v0x7fdb61913990_0;
-    %parti/s 8, 23, 6;
-    %concat/vec4; draw_concat_vec4
-    %pad/u 32;
-    %add;
-    %pushi/vec4 0, 0, 8;
-    %load/vec4 v0x7fdb61913c00_0;
-    %parti/s 1, 47, 7;
-    %concat/vec4; draw_concat_vec4
-    %pad/u 32;
-    %add;
-    %cmp/u;
-    %flag_get/vec4 5;
-    %or;
-T_3.0;
-    %store/vec4 v0x7fdb61913cb0_0, 0, 1;
-    %load/vec4 v0x7fdb61913f50_0;
-    %inv;
-    %flag_set/vec4 8;
-    %jmp/0xz  T_3.2, 8;
-    %load/vec4 v0x7fdb61913cb0_0;
-    %flag_set/vec4 8;
-    %jmp/0xz  T_3.4, 8;
-    %pushi/vec4 511, 0, 9;
-    %store/vec4 v0x7fdb61913b50_0, 0, 9;
-    %jmp T_3.5;
-T_3.4 ;
-    %pushi/vec4 0, 0, 1;
-    %load/vec4 v0x7fdb619138e0_0;
-    %parti/s 8, 23, 6;
-    %concat/vec4; draw_concat_vec4
-    %pushi/vec4 0, 0, 1;
-    %load/vec4 v0x7fdb61913990_0;
-    %parti/s 8, 23, 6;
-    %concat/vec4; draw_concat_vec4
-    %add;
-    %pushi/vec4 0, 0, 8;
-    %load/vec4 v0x7fdb61913c00_0;
-    %parti/s 1, 47, 7;
-    %concat/vec4; draw_concat_vec4
-    %add;
-    %subi 127, 0, 9;
-    %store/vec4 v0x7fdb61913b50_0, 0, 9;
-T_3.5 ;
-    %jmp T_3.3;
-T_3.2 ;
-    %pushi/vec4 0, 0, 9;
-    %store/vec4 v0x7fdb61913b50_0, 0, 9;
-T_3.3 ;
-    %jmp T_3;
-    .thread T_3, $push;
-    .scope S_0x7fdb61b042e0;
-T_4 ;
-    %wait E_0x7fdb61b045e0;
-    %load/vec4 v0x7fdb619145b0_0;
-    %flag_set/vec4 8;
-    %jmp/0xz  T_4.0, 8;
-    %pushi/vec4 0, 0, 32;
-    %assign/vec4 v0x7fdb61914450_0, 0;
-    %pushi/vec4 0, 0, 32;
-    %assign/vec4 v0x7fdb61914500_0, 0;
-    %pushi/vec4 0, 0, 32;
-    %assign/vec4 v0x7fdb619146c0_0, 0;
-    %jmp T_4.1;
-T_4.0 ;
-    %load/vec4 v0x7fdb61914770_0;
-    %flag_set/vec4 8;
-    %jmp/0xz  T_4.2, 8;
-    %load/vec4 v0x7fdb61914800_0;
-    %assign/vec4 v0x7fdb619146c0_0, 0;
-T_4.2 ;
-    %load/vec4 v0x7fdb61914140_0;
-    %flag_set/vec4 8;
-    %jmp/0xz  T_4.4, 8;
-    %load/vec4 v0x7fdb619142b0_0;
-    %assign/vec4 v0x7fdb61914450_0, 0;
-    %load/vec4 v0x7fdb61914000_0;
-    %assign/vec4 v0x7fdb61914500_0, 0;
-T_4.4 ;
-T_4.1 ;
-    %jmp T_4;
-    .thread T_4;
-    .scope S_0x7fdb61b04160;
-T_5 ;
-    %pushi/vec4 0, 0, 1;
-    %store/vec4 v0x7fdb61914950_0, 0, 1;
-T_5.0 ;
-    %delay 5000, 0;
-    %load/vec4 v0x7fdb61914950_0;
-    %inv;
-    %store/vec4 v0x7fdb61914950_0, 0, 1;
-    %jmp T_5.0;
-    %end;
-    .thread T_5;
-    .scope S_0x7fdb61b04160;
-T_6 ;
-    %pushi/vec4 1, 0, 1;
-    %store/vec4 v0x7fdb61914de0_0, 0, 1;
-    %pushi/vec4 0, 0, 1;
-    %store/vec4 v0x7fdb61914e70_0, 0, 1;
-    %pushi/vec4 0, 0, 1;
-    %store/vec4 v0x7fdb61914a10_0, 0, 1;
-    %pushi/vec4 0, 0, 32;
-    %store/vec4 v0x7fdb61914ac0_0, 0, 32;
-    %pushi/vec4 0, 0, 32;
-    %store/vec4 v0x7fdb61914bb0_0, 0, 32;
-    %pushi/vec4 0, 0, 32;
-    %store/vec4 v0x7fdb61914f20_0, 0, 32;
-    %delay 20000, 0;
-    %pushi/vec4 0, 0, 1;
-    %store/vec4 v0x7fdb61914de0_0, 0, 1;
-    %pushi/vec4 1, 0, 1;
-    %store/vec4 v0x7fdb61914e70_0, 0, 1;
-    %pushi/vec4 13, 0, 32;
-    %store/vec4 v0x7fdb61914f20_0, 0, 32;
-    %pushi/vec4 2, 0, 32;
-    %store/vec4 v0x7fdb61914bb0_0, 0, 32;
-    %delay 10000, 0;
-    %pushi/vec4 0, 0, 1;
-    %store/vec4 v0x7fdb61914e70_0, 0, 1;
-    %pushi/vec4 1, 0, 1;
-    %store/vec4 v0x7fdb61914a10_0, 0, 1;
-    %delay 10000, 0;
-    %pushi/vec4 3, 0, 32;
-    %store/vec4 v0x7fdb61914bb0_0, 0, 32;
-    %pushi/vec4 5, 0, 32;
-    %store/vec4 v0x7fdb61914ac0_0, 0, 32;
-    %delay 100000, 0;
-    %vpi_call 2 63 "$finish" {0 0 0};
-    %end;
-    .thread T_6;
-    .scope S_0x7fdb61b04160;
-T_7 ;
-    %vpi_call 2 68 "$monitor", "Time = %t, rst = %b, compute = %b, weight_en = %b, inp_north = %h, inp_west = %h, weight_in = %h, outp_south = %h, outp_east = %h", $time, v0x7fdb61914de0_0, v0x7fdb61914a10_0, v0x7fdb61914e70_0, v0x7fdb61914ac0_0, v0x7fdb61914bb0_0, v0x7fdb61914f20_0, v0x7fdb61914d50_0, v0x7fdb61914c80_0 {0 0 0};
-    %end;
-    .thread T_7;
-    .scope S_0x7fdb61b04160;
-T_8 ;
-    %vpi_call 2 73 "$dumpfile", "block_wave.vcd" {0 0 0};
-    %vpi_call 2 74 "$dumpvars", 32'sb00000000000000000000000000000000, S_0x7fdb61b04160 {0 0 0};
-    %end;
-    .thread T_8;
-# The file index is used to find the file name in the following table.
-:file_names 6;
-    "N/A";
-    "<interactive>";
-    "tb/block_tb.v";
-    "src/block.v";
-    "src/fadd.v";
-    "src/fmul.v";
diff --git a/src/block.v b/src/block.v
index 8fa353f..72fde27 100644
--- a/src/block.v
+++ b/src/block.v
@@ -17,8 +17,8 @@ module block(inp_north, inp_west, weight_in, outp_south, outp_east,  clk, rst, c
     );
     wire [31:0] add_result;
     fadd add_instance (
-        .a_operand(inp_north),
-        .b_operand(mul_result),
+        .a_in(inp_north),
+        .b_in(mul_result),
         .result(add_result)
     );
 
diff --git a/src/fadd.v b/src/fadd.v
index 89d75dc..6592b36 100644
--- a/src/fadd.v
+++ b/src/fadd.v
@@ -2,7 +2,7 @@
 
 
 module fadd(
-    input [`BIT_W-1:0] a_operand, b_operand, // Inputs in the format of IEEE-`EXP_W-154 Representation.
+    input [`BIT_W-1:0] a_in, b_in, // Inputs in the format of IEEE-`EXP_W-154 Representation.
     output [`BIT_W-1:0] result // Outputs in the format of IEEE-`EXP_W-154 Representation.
 );
 
@@ -20,38 +20,47 @@ wire [`EXP_W-1:0] exponent_b_add;
 wire [`M_W+1:0] significand_add;
 wire [`BIT_W-2:0] add_sum;
 
+wire [`EXP_W-1:0] exp_a, exp_b;
 
-//for operations always operand_a must not be less than b_operand
-assign {operand_a,operand_b} = (a_operand[`BIT_W-2:0] < b_operand[`BIT_W-2:0]) ? {b_operand,a_operand} : {a_operand,b_operand};
 
-assign exp_a = operand_a[`BIT_W-2:`M_W];
-assign exp_b = operand_b[`BIT_W-2:`M_W];
+//for operations always operand_a must not be less than b_in
+assign {operand_a,operand_b} = (a_in[`BIT_W-2:0] < b_in[`BIT_W-2:0]) ? {b_in,a_in} : {a_in,b_in};
+
+assign exp_a = operand_a[`BIT_W-2:`M_W]; // extract exponent from operand_a
+assign exp_b = operand_b[`BIT_W-2:`M_W]; // extract exponent from operand_b
 
 //Exception flag sets 1 if either one of the exponent is 255.
 assign Exception = (&operand_a[`BIT_W-2:`M_W]) | (&operand_b[`BIT_W-2:`M_W]);
 
-assign output_sign = operand_a[`BIT_W-1] ;
+assign output_sign = operand_a[`BIT_W-1] ; // since the operand_a is always greater than operand_b, the sign of the result will be same as operand_a.
 
+//operation_sub_addBar is 1 if we are doing subtraction else 0.
 assign operation_sub_addBar =  ~(operand_a[`BIT_W-1] ^ operand_b[`BIT_W-1]);
 
 //Assigining significand values according to Hidden Bit.
-assign significand_a = {1'b1,operand_a[`M_W-1:0]};
-assign significand_b = {1'b1,operand_b[`M_W-1:0]};
+assign significand_a = {1'b1,operand_a[`M_W-1:0]}; // expand the mantissa by 1 bit before multiplication since its always implied
+assign significand_b = {1'b1,operand_b[`M_W-1:0]}; // same as above
 
 //Evaluating Exponent Difference
 assign exponent_diff = operand_a[`BIT_W-2:`M_W] - operand_b[`BIT_W-2:`M_W];
 
-//Shifting significand_b according to exponent_diff
+//Shifting significand_b to the right according to exponent_diff. Exapmle: if we have 1.0101 >> 2 = 0.0101 then exponent_diff = 2 and significand_b_add = significand_b >> exponent_diff
 assign significand_b_add = significand_b >> exponent_diff;
 
+//Adding exponent_diff to exponent_b. Exapmle: if we have 1.0101 << 2 = 101.01 then exponent_diff = 2 and exponent_b_add = exponent_b + exponent_diff
 assign exponent_b_add = operand_b[`BIT_W-2:`M_W] + exponent_diff; 
 
 //------------------------------------------------ADD BLOCK------------------------------------------//
+//if we are adding(operation_sub_addBar=1) need to add significand_b_add to significand_a. 
+//Or sets the significand to zero if the signs are different(this means we are doing subtraction), effectively determining the core operation of the floating-point addition based on the sign of the operands.
 assign significand_add = ( operation_sub_addBar) ? (significand_a + significand_b_add) : {(`M_W+2){1'b0}}; 
 
-//Result will be equal to Most `M_W bits if carry generates else it will be Least `M_W-1 bits.
+//Taking care of the resulting mantissa. 
+//If there is a carry, then the result is normalized by shifting the significand right by one bit(because its implied) and incrementing the exponent by one.
+//If there is no carry, we just use the result of the addition, and we have `M_W-1:0 due to the fact that we are using the hidden bit(implied 1).
 assign add_sum[`M_W-1:0] = significand_add[`M_W+1] ? significand_add[`M_W:1] : significand_add[`M_W-1:0];
 
+// Taking care of the resulting exponent.
 //If carry generates in sum value then exponent must be added with 1 else feed as it is.
 assign add_sum[`BIT_W-2:`M_W] = significand_add[`M_W+1] ? (1'b1 + operand_a[`BIT_W-2:`M_W]) : operand_a[`BIT_W-2:`M_W];
 
diff --git a/src/fmul.v b/src/fmul.v
index 1dc253f..dd91d24 100644
--- a/src/fmul.v
+++ b/src/fmul.v
@@ -17,7 +17,7 @@ module fmul(
     
     // Multiplication logic
     always @* begin
-        mul_fix_out = {1'b1, a_in[`M_W-1:0]} * {1'b1, b_in[`M_W-1:0]};
+        mul_fix_out = {1'b1, a_in[`M_W-1:0]} * {1'b1, b_in[`M_W-1:0]}; //extend the mantissa by 1 bit before multiplication
     end
 
     // Zero check
@@ -29,24 +29,34 @@ module fmul(
         end
     end
 
-    // Generate M
+    // Generate Mantissa. We are only considering the most significat bits of the product to generate the mantissa.
     always @* begin
+        //select two MSBs of the product
         case(mul_fix_out[`MULT_W-1:`MULT_W-2])
-            2'b01: M_result = mul_fix_out[`MULT_W-3:`M_W];
-            2'b10: M_result = mul_fix_out[`MULT_W-2:`M_W+1];
-            2'b11: M_result = mul_fix_out[`MULT_W-2:`M_W+1];
-            default: M_result = mul_fix_out[`MULT_W-2:`M_W+1];
+           //Example: If mul_fix_out is 8 bits wide and represents 01xxxxxx (binary), it extracts xxxxxx, assuming the MSBs are 01
+            2'b01: M_result = mul_fix_out[`MULT_W-3:`M_W]; //MSB is dropped(as it is always 1)
+            //In 2'b10 or 2'b11 case: 10yyyyyy → Shift → 0yyyyyy (Extract yyyyyy)
+            2'b10: M_result = mul_fix_out[`MULT_W-2:`M_W+1]; // Between two and just under 4. product larger than normalized range, so we need to shift right 
+            2'b11: M_result = mul_fix_out[`MULT_W-2:`M_W+1]; // same as line above. 
+            default: M_result = mul_fix_out[`MULT_W-2:`M_W+1]; // default same as two lines above
         endcase
     end
 
     // Overflow check
     always @* begin
+        //Different cases for overflow:
+        //1. If either of the inputs is zero, then the result is zero and there is no overflow.
+        //2. Underflow check: If the sum of the exponents is less than the minimum exponent, then the result is zero and there is no overflow. {2'b0,{(EXP_W-1){1'b1}}} is the minimum exponent(001111111 in case of 32bit float)
+        //3. Overflow check: If the sum of the exponents is greater than the maximum exponent, then the result is infinity and there is overflow. EXP_MAX is the maximum exponent.
         overflow = (zero_check || ({1'b0, a_in[`BIT_W-2:`M_W]} + {1'b0, b_in[`BIT_W-2:`M_W]} + {{`EXP_W{1'b0}}, mul_fix_out[`MULT_W-1]}) < {2'b0,{(`EXP_W-1){1'b1}}} || ({1'b0, a_in[`BIT_W-2:`M_W]} + {1'b0, b_in[`BIT_W-2:`M_W]} + {8'd0, mul_fix_out[`MULT_W-1]}) > `EXP_MAX);
 
         if (~zero_check) begin
             if (overflow) begin
                 e_result0 = {(`EXP_W+1){1'b1}};
             end else begin
+                //1. We extend the exponent by 1 bit because the result of addition of two exponents can be 1 bit larger than the exponent itself.
+                //2. We add the MSB of the mantissa multiplication(before normalization) to the exponent sum to account for the shifting of the mantissa.
+                //3. We subtract the bias from the exponent sum to get the final exponent because just adding two exponents would give us exp1 + exp2 + 2 x bias.
                 e_result0 = ({1'b0, a_in[`BIT_W-2:`M_W]} + {1'b0, b_in[`BIT_W-2:`M_W]} + {{`EXP_W{1'b0}}, mul_fix_out[`MULT_W-1]}) - {2'b0,{(`EXP_W-1){1'b1}}};
             end
         end else begin
diff --git a/tb/block_tb.v b/tb/block_tb.v
deleted file mode 100644
index 7bd07c2..0000000
--- a/tb/block_tb.v
+++ /dev/null
@@ -1,77 +0,0 @@
-
-`timescale 1ns/1ps
-
-module block_tb;
-
-    reg [31:0] inp_north, inp_west, weight_in;
-    reg clk, rst, compute, weight_en;
-    wire [31:0] outp_south, outp_east;
-    
-    // Instantiate the block module
-    block uut (
-        .inp_north(inp_north), 
-        .inp_west(inp_west), 
-        .weight_in(weight_in), 
-        .outp_south(outp_south), 
-        .outp_east(outp_east), 
-        .clk(clk), 
-        .rst(rst), 
-        .compute(compute), 
-        .weight_en(weight_en)
-    );
-
-    // Clock generation
-    initial begin
-        clk = 0;
-        forever #5 clk = ~clk;  // Generate a clock with 10ns period (100MHz)
-    end
-
-    // Test stimulus
-    initial begin
-        // Initialize inputs
-        rst = 1;  // Assert reset
-        weight_en = 0;
-        compute = 0;
-        inp_north = 0;
-        inp_west = 0;
-        weight_in = 0;
-        
-        // Wait for a few clock cycles
-        #(20);
-        
-        // Deassert reset
-        rst = 0;  
-        weight_en = 1;
-        weight_in = 32'd13;  // Load a sample weight
-        inp_west = 32'd2;   // Load a sample activation
-
-        // Wait for the weight to be loaded
-        #(10);
-        weight_en = 0;
-        
-        // Start computation
-        compute = 1;
-        #(10);
-        inp_west = 32'd3;
-        inp_north = 32'd5;  // Load a sample partial sum
-
-        // Continue the simulation for several cycles to observe behavior
-        // This should be expanded with specific cases and assertions as needed
-        #(100);
-        
-        // Finish the simulation
-        $finish;
-    end
-
-    // Optional: Monitor outputs and important signal transitions
-    initial begin
-        $monitor("Time = %t, rst = %b, compute = %b, weight_en = %b, inp_north = %h, inp_west = %h, weight_in = %h, outp_south = %h, outp_east = %h",
-                 $time, rst, compute, weight_en, inp_north, inp_west, weight_in, outp_south, outp_east);
-    end
-
-    initial begin
-        $dumpfile("block_wave.vcd");
-        $dumpvars(0, block_tb);
-    end
-    
-endmodule
diff --git a/tb/fmul_tb.v b/tb/fmul_tb.v
deleted file mode 100644
index 345bca2..0000000
--- a/tb/fmul_tb.v
+++ /dev/null
@@ -1,52 +0,0 @@
-`timescale 1ns / 1ps
-
-module fmul_tb;
-
-    `define BIT_W 32
-
-    // Inputs
-    reg [`BIT_W-1:0] a_in;
-    reg [`BIT_W-1:0] b_in;
-
-    // Outputs
-    wire [`BIT_W-1:0] result;
-
-    // Instantiate the Unit Under Test (UUT)
-    fmul uut (
-        .a_in(a_in), 
-        .b_in(b_in), 
-        .result(result)
-    );
-
-    initial begin
-        // Initialize Inputs
-        a_in = 0;
-        b_in = 0;
-
-        // Wait 100 ns for global reset to finish
-        #100;
-
-        // Apply test cases
-        // Note: You will need to replace these with actual test values that are meaningful for your application.
-        // These are just placeholders to demonstrate the structure of the testbench.
-
-        a_in = 32'b01000000110000000000000000000000; // Replace 'xxxxxxxx' with actual test data
-        b_in = 32'b01000000111000000000000000000000; // Replace 'xxxxxxxx' with actual test data
-        #10; // Wait for some time
-
-        // a_in = 32'hxxxxxxxx; // Next test data
-        // b_in = 32'hxxxxxxxx;
-        // #10;
-
-        // Add as many test cases as needed to thoroughly test your module
-
-        // Finish the simulation
-        $finish;
-    end
-
-    initial begin
-        $dumpfile("fmul_wave.vcd");
-        $dumpvars(0, fmul_tb);
-    end
-      
-endmodule
diff --git a/tb/spi_slave_tb.v b/tb/spi_slave_tb.v
deleted file mode 100644
index 7580b7d..0000000
--- a/tb/spi_slave_tb.v
+++ /dev/null
@@ -1,79 +0,0 @@
-`timescale 1ns / 1ps
-
-module spi_slave_tb;
-
-  reg clk;
-  reg rst;
-  reg ss;
-  reg mosi;
-  wire miso;
-  reg sck;
-  wire done;
-  reg [7:0] din;
-  wire [7:0] dout;
-
-  // Instantiate the Unit Under Test (UUT)
-  spi_slave uut (
-    .clk(clk),
-    .rst(rst),
-    .ss(ss),
-    .mosi(mosi),
-    .miso(miso),
-    .sck(sck),
-    .done(done),
-    .din(din),
-    .dout(dout)
-  );
-
-  // Clock generation
-  always #5 clk = (clk === 1'b0); // 100MHz clock
-
-  // Task to upload data
-  task upload_data;
-    input [7:0] data;
-    integer i;
-    begin
-      ss <= 1'b0;  // Assert slave select
-      for (i=7; i>=0; i=i-1) begin
-        mosi <= data[i];  // Set MOSI to the current bit of data
-        #10;  // Wait half period for stability
-        sck <= 1'b1;  // Clock high
-        #10;  // Complete the clock period
-        sck <= 1'b0;  // Clock low
-      end
-      ss <= 1'b1;  // Deassert slave select
-      #20;  // Wait some time after data upload
-    end
-  endtask
-
-  // Test sequence
-  initial begin
-    // Initialize inputs
-    clk <= 0;
-    rst <= 1;
-    ss <= 1;
-    mosi <= 0;
-    sck <= 0;
-    din <= 8'hAA;  // Example data to load into the slave
-
-    // Dump file
-    $dumpfile("spi_slave.vcd");
-    $dumpvars(0, spi_slave_tb);
-
-    // Reset the system
-    #15;
-    rst <= 0;
-    #10;
-    rst <= 1;
-    #20;
-
-    // Upload data
-    upload_data(8'h55);  // Send 0x55 as an example
-    upload_data(8'h3C);  // Send another byte
-
-    // Finish simulation
-    #100;
-    $finish;
-  end
-
-endmodule
diff --git a/tb/systolic_array_tb.v b/tb/systolic_array_tb.v
deleted file mode 100644
index 7b7077a..0000000
--- a/tb/systolic_array_tb.v
+++ /dev/null
@@ -1,92 +0,0 @@
-`timescale 1ns / 1ps
-
-module systolic_array_tb;
-
-    reg [31:0] inp_west0, inp_west3, inp_west6;
-    reg [31:0] inp_weight0, inp_weight1, inp_weight2, inp_weight3,
-               inp_weight4, inp_weight5, inp_weight6, inp_weight7, inp_weight8;
-    reg clk, rst, compute, weight_en;
-
-    // Instantiate the Unit Under Test (UUT)
-    systolic_array uut (
-        .inp_west0(inp_west0), .inp_west3(inp_west3), .inp_west6(inp_west6),
-        .inp_weight0(inp_weight0), .inp_weight1(inp_weight1), .inp_weight2(inp_weight2),
-        .inp_weight3(inp_weight3), .inp_weight4(inp_weight4), .inp_weight5(inp_weight5),
-        .inp_weight6(inp_weight6), .inp_weight7(inp_weight7), .inp_weight8(inp_weight8),
-        .clk(clk), .rst(rst), .compute(compute), .weight_en(weight_en)
-    );
-
-    // Clock generation
-    initial begin
-        clk = 0;
-        forever #10 clk = ~clk; // 50 MHz clock
-    end
-
-    // Test sequence
-    initial begin
-        // Initialize Inputs
-        rst = 1;
-        compute = 0;
-        weight_en = 0;
-        // Reset the system
-        #100;
-        rst = 0;
-        
-        // Load the weight matrix
-        weight_en = 1;
-        inp_weight0 = 32'd1;
-        inp_weight1 = 32'd2;
-        inp_weight2 = 32'd3;
-        inp_weight3 = 32'd4;
-        inp_weight4 = 32'd5;
-        inp_weight5 = 32'd6;
-        inp_weight6 = 32'd7;
-        inp_weight7 = 32'd8;
-        inp_weight8 = 32'd9;
-        #20;
-        weight_en = 0; // Disable weight loading
-
-        // Input activation matrix
-        inp_west0 = 32'd1;
-        inp_west3 = 32'd0;
-        inp_west6 = 32'd0;
-        compute = 1; // Start computation
-        #20;
-
-        inp_west0 = 32'd4;
-        inp_west3 = 32'd2;
-        inp_west6 = 32'd0;
-        #20;
-        inp_west0 = 32'd7;
-        inp_west3 = 32'd5;
-        inp_west6 = 32'd3;
-        #20;
-        inp_west0 = 32'd0;
-        inp_west3 = 32'd8;
-        inp_west6 = 32'd6;
-        #20;
-        inp_west0 = 32'd0;
-        inp_west3 = 32'd0;
-        inp_west6 = 32'd9;
-        // Observe the output for a few cycles
-        // Note: You would need additional logic to read out the final results from the array
-        #100;
-
-        compute = 0; // Stop computation
-        #20;
-
-        // Add your own checks to validate the outputs
-        // This will depend on how you decide to capture and observe the outputs
-        // from the systolic array.
-
-        // Finish simulation
-        $finish;
-    end
-
-
-    initial begin
-        $dumpfile("systolic_array_wave.vcd");
-        $dumpvars(0, systolic_array_tb);
-    end
-    
-endmodule
diff --git a/tests/Makefile b/tests/Makefile
index 87acaa0..d7b01e3 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -10,10 +10,9 @@ VERILOG_SOURCES = ../src/define.v \
                   ../src/systolic_array.v
 
 # You will have to create separate test rules for each module
-# For example, to test fmul.v
 TOPLEVEL_LANG = verilog
-TOPLEVEL = fmul
-MODULE = test_fmul
+TOPLEVEL = systolic_array
+MODULE = test_systolic_array
 
 
 include $(shell cocotb-config --makefiles)/Makefile.sim
diff --git a/tests/test_block.py b/tests/test_block.py
index e69de29..2b5ba52 100644
--- a/tests/test_block.py
+++ b/tests/test_block.py
@@ -0,0 +1,42 @@
+import cocotb
+from cocotb.triggers import Timer, RisingEdge
+from cocotb.clock import Clock
+from cocotb.binary import BinaryValue
+
+@cocotb.test()
+async def block_test(dut):
+    # Clock generation
+    clock = Clock(dut.clk, 10, units="ns")  # 100 MHz clock
+    cocotb.fork(clock.start())
+
+    # Initialize inputs
+    dut.rst.value = 1  # Assert reset
+    dut.weight_en.value = 0
+    dut.compute.value = 0
+    dut.inp_north.value = 0
+    dut.inp_west.value = 0
+    dut.weight_in.value = 0
+
+    # Wait for a few clock cycles
+    await Timer(20, units="ns")
+
+    # Deassert reset and set initial test values
+    dut.rst.value = 0
+    dut.weight_en.value = 1
+    dut.weight_in.value = BinaryValue("01000001010100000000000000000000")  # Sample weight
+    dut.inp_west.value = BinaryValue("01000000000000000000000000000000")    # Sample activation
+
+    # Wait for the weight to be loaded
+    await Timer(10, units="ns")
+    dut.weight_en.value = 0
+
+    # Start computation
+    dut.compute.value = 1
+    
+    await Timer(10, units="ns")
+    assert dut.outp_east.value == BinaryValue("01000000000000000000000000000000"), "Mismatch in outp_east"
+    assert dut.outp_south.value == BinaryValue("01000001110100000000000000000000"), "Mismatch in outp_south"
+
+
+    # Finish simulation
+    # (Cocotb automatically closes the simulation after the test completes)
diff --git a/tests/test_fadd.py b/tests/test_fadd.py
index e69de29..3849b3b 100644
--- a/tests/test_fadd.py
+++ b/tests/test_fadd.py
@@ -0,0 +1,51 @@
+import cocotb
+from cocotb.triggers import Timer
+from cocotb.binary import BinaryValue
+
+@cocotb.test()
+async def fmul_tb(dut):
+    """ Test for floating point addition """
+
+    # Define the bit width
+    BIT_W = 32
+
+    # Initialize Inputs
+    dut.a_in.value = 0
+    dut.b_in.value = 0
+
+    # Wait 100 ns for global reset to finish
+    await Timer(10, units='ns')
+
+    # Test two positive. 6 +7 = 13
+    dut.a_in.value = BinaryValue("01000000110000000000000000000000")  
+    dut.b_in.value = BinaryValue("01000000111000000000000000000000")  
+    await Timer(1, units='ns')
+    assert dut.result.value == BinaryValue("01000001010100000000000000000000") 
+    await Timer(9, units='ns')
+
+    # Test one positive, and one negative number. 16 -5 = 11
+    dut.a_in.value = BinaryValue("01000001100000000000000000000000")  
+    dut.b_in.value = BinaryValue("11000000101000000000000000000000")  
+    await Timer(1, units='ns')
+    assert dut.result.value == BinaryValue("01000001001100000000000000000000") 
+    await Timer(9, units='ns')
+
+    # Test one positive, and one negative number. 0.25 + 0.3 = 0.55
+    dut.a_in.value = BinaryValue("00111110100000000000000000000000")  
+    dut.b_in.value = BinaryValue("00111110100110011001100110011010") 
+    await Timer(1, units='ns')
+    assert dut.result.value == BinaryValue("00111111000011001100110011001101") 
+    await Timer(9, units='ns')
+
+    dut.a_in.value = BinaryValue("00000000000000000000000000000000") 
+    dut.b_in.value = BinaryValue("00000000000000000000000000000000")  
+    await Timer(1, units='ns')
+    assert dut.result.value == BinaryValue("00000000000000000000000000000000") 
+    await Timer(9, units='ns')
+
+    dut.a_in.value = BinaryValue("11111111111111111111111111111111")  
+    dut.b_in.value = BinaryValue("11111111111111111111111111111111")  
+    await Timer(10, units='ns')
+
+    # Finish the simulation
+    dut._log.info("Test completed")
diff --git a/tests/test_fmul.py b/tests/test_fmul.py
index 395d7f0..977cfd6 100644
--- a/tests/test_fmul.py
+++ b/tests/test_fmul.py
@@ -14,7 +14,7 @@ async def fmul_tb(dut):
     dut.b_in.value = 0
 
     # Wait 100 ns for global reset to finish
-    await Timer(100, units='ns')
+    await Timer(10, units='ns')
 
     # Apply test cases
     dut.a_in.value = BinaryValue("01000000110000000000000000000000")  # Replace with actual test data
diff --git a/tests/test_systolic_array.py b/tests/test_systolic_array.py
index e69de29..2620d9d 100644
--- a/tests/test_systolic_array.py
+++ b/tests/test_systolic_array.py
@@ -0,0 +1,71 @@
+import cocotb
+from cocotb.triggers import RisingEdge, Timer
+from cocotb.clock import Clock
+from cocotb.binary import BinaryValue
+
+@cocotb.test()
+async def systolic_array_test(dut):
+    # Clock generation
+    clock = Clock(dut.clk, 20, units="ns")  # 50 MHz clock
+    cocotb.fork(clock.start())
+
+    # Initialize Inputs
+    dut.rst.value = 1
+    dut.compute.value = 0
+    dut.weight_en.value = 0
+
+    # Reset the system
+    await Timer(100, units="ns")
+    dut.rst.value = 0
+    
+    # Load the weight matrix
+    dut.weight_en.value = 1
+    dut.inp_weight0.value = BinaryValue("00111111100000000000000000000000")  # 1
+    dut.inp_weight1.value = BinaryValue("01000000000000000000000000000000")  # 2
+    dut.inp_weight2.value = BinaryValue("01000000010000000000000000000000")  # 3
+    dut.inp_weight3.value = BinaryValue("01000000100000000000000000000000")  # 4
+    dut.inp_weight4.value = BinaryValue("01000000101000000000000000000000")  # 5
+    dut.inp_weight5.value = BinaryValue("01000000110000000000000000000000")  # 6
+    dut.inp_weight6.value = BinaryValue("01000000111000000000000000000000")  # 7
+    dut.inp_weight7.value = BinaryValue("01000001000000000000000000000000")  # 8
+    dut.inp_weight8.value = BinaryValue("01000001000100000000000000000000")  # 9
+    await Timer(20, units="ns")
+    dut.weight_en.value = 0  # Disable weight loading
+
+    # Input activation matrix
+    dut.inp_west0.value = BinaryValue("00111111100000000000000000000000")  # 1
+    dut.inp_west3.value = BinaryValue("00000000000000000000000000000000")  # 0
+    dut.inp_west6.value = BinaryValue("00000000000000000000000000000000")  # 0
+    dut.compute.value = 1  # Start computation
+    await Timer(20, units="ns")
+
+    # Continuing the input activation matrix sequence
+    dut.inp_west0.value = BinaryValue("01000000100000000000000000000000")
+    dut.inp_west3.value = BinaryValue("01000000000000000000000000000000")
+    dut.inp_west6.value = BinaryValue("00000000000000000000000000000000")
+    await Timer(20, units="ns")
+
+    dut.inp_west0.value = BinaryValue("01000000111000000000000000000000")
+    dut.inp_west3.value = BinaryValue("01000000101000000000000000000000")
+    dut.inp_west6.value = BinaryValue("01000000010000000000000000000000")
+    await Timer(20, units="ns")
+
+    dut.inp_west0.value = BinaryValue("00000000000000000000000000000000")
+    dut.inp_west3.value = BinaryValue("01000001000000000000000000000000")
+    dut.inp_west6.value = BinaryValue("01000000110000000000000000000000")
+    await Timer(20, units="ns")
+
+    dut.inp_west0.value = BinaryValue("00000000000000000000000000000000")
+    dut.inp_west3.value = BinaryValue("00000000000000000000000000000000")
+    dut.inp_west6.value = BinaryValue("01000001000100000000000000000000")
+    await Timer(20, units="ns")
+
+    # Observe the output for a few cycles
+    # Note: Need additional logic to read out the final results from the array
+    # This can involve checking the output signals of the systolic array
+    # For example: assert dut.output_signal.value == expected_value, "Mismatch in output"
+    await Timer(100, units="ns")
+
+    # Stop computation
+    dut.compute.value = 0
+    await Timer(20, units="ns")
\ No newline at end of file