From 81636a44ad19601f01bd340317fad93a3426b756 Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Wed, 25 Sep 2024 09:03:07 +0200 Subject: [PATCH] [software] Parametrize apps with hjson file --- software/.gitignore | 2 +- software/apps/baremetal/Makefile | 4 +- software/apps/baremetal/axpy_i32/main.c | 113 ++--------- .../apps/baremetal/cfft_radix2_q16/data.args | 1 - .../apps/baremetal/cfft_radix2_q16/main.c | 1 + .../apps/baremetal/cfft_radix4_q16/data.args | 1 - .../apps/baremetal/cfft_radix4_q16/main.c | 2 + software/apps/baremetal/chest_q16/data.args | 3 - software/apps/baremetal/matmul_f16/main.c | 8 +- software/apps/baremetal/matmul_f32/main.c | 9 +- software/apps/baremetal/matmul_i32/main.c | 133 +++--------- software/apps/baremetal/matmul_i8/main.c | 139 +++---------- software/data/data_matmul_f16.h.tpl | 26 --- software/data/data_matmul_f32.h.tpl | 26 --- software/data/data_matmulf16.py | 111 ---------- software/data/data_matmulf32.py | 112 ---------- software/data/gendata_header.py | 192 ++++++++++++++++++ software/data/gendata_params.hjson | 132 ++++++++++++ software/data/gendatalib_blas.py | 67 ++++++ .../{generate_cfft.py => gendatalib_cfft.py} | 100 ++++----- ...{generate_chest.py => gendatalib_chest.py} | 21 +- software/data/print_header.py | 161 --------------- software/runtime/runtime.mk | 4 +- 23 files changed, 535 insertions(+), 833 deletions(-) delete mode 100644 software/apps/baremetal/cfft_radix2_q16/data.args delete mode 100644 software/apps/baremetal/cfft_radix4_q16/data.args delete mode 100644 software/apps/baremetal/chest_q16/data.args delete mode 100644 software/data/data_matmul_f16.h.tpl delete mode 100644 software/data/data_matmul_f32.h.tpl delete mode 100644 software/data/data_matmulf16.py delete mode 100644 software/data/data_matmulf32.py create mode 100644 software/data/gendata_header.py create mode 100644 software/data/gendata_params.hjson create mode 100644 software/data/gendatalib_blas.py rename software/data/{generate_cfft.py => 
gendatalib_cfft.py} (80%) rename software/data/{generate_chest.py => gendatalib_chest.py} (76%) delete mode 100644 software/data/print_header.py diff --git a/software/.gitignore b/software/.gitignore index dce9d8683..35dccde4a 100644 --- a/software/.gitignore +++ b/software/.gitignore @@ -26,5 +26,5 @@ runtime/arch.ld # Generated data files data.h -apps/*/*/data*.h +data/data*.h data/__pyc* diff --git a/software/apps/baremetal/Makefile b/software/apps/baremetal/Makefile index 14e02456e..c4a2a40a3 100644 --- a/software/apps/baremetal/Makefile +++ b/software/apps/baremetal/Makefile @@ -17,7 +17,6 @@ RUNTIME_DIR := $(abspath $(SOFTWARE_DIR)/runtime) include $(RUNTIME_DIR)/runtime.mk APPS := $(patsubst $(APPS_DIR)/%/main.c,%,$(shell find $(APPS_DIR) -name "main.c")) -DATA := $(patsubst %.args,%.h,$(shell find $(APPS_DIR) -name "data.args")) BINARIES := $(addprefix $(BIN_DIR)/,$(APPS)) ALL := $(APPS) @@ -33,7 +32,7 @@ all_llvm: $(ALL_LLVM) $(APPS): % : $(BIN_DIR)/% $(APPS_DIR)/Makefile $(shell find $(RUNTIME_DIR)/**.{S,c,h,ld} -type f) .PHONY: $(BINARIES) -$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) $(DATA) update_opcodes +$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) data_%.h update_opcodes mkdir -p $(dir $@) $(RISCV_CC) -Iinclude -o $@ $< $(RUNTIME) $(RISCV_LDFLAGS) -T$(RUNTIME_DIR)/link.ld $(RISCV_OBJDUMP) $(RISCV_OBJDUMP_FLAGS) -D $@ > $@.dump @@ -49,5 +48,6 @@ clean: rm -vf $(addsuffix /main.c.o,$(APPS)) rm -vf $(RUNTIME) rm -vf $(LINKER_SCRIPT) + rm -vf $(wildcard $(DATA_DIR)/data_*.h) .INTERMEDIATE: $(addsuffix /main.c.o,$(APPS)) diff --git a/software/apps/baremetal/axpy_i32/main.c b/software/apps/baremetal/axpy_i32/main.c index a9354796e..aa91733ea 100644 --- a/software/apps/baremetal/axpy_i32/main.c +++ b/software/apps/baremetal/axpy_i32/main.c @@ -5,125 +5,50 @@ // Author: Yichao Zhang, ETH Zurich #include +#include #include -#include "baremetal/mempool_axpy_i32p.h" +/* Mempool runtime libraries */ +#include 
"builtins_v2.h" +#include "dma.h" #include "encoding.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" -#include - -#if NUM_CORES > 32 -#define size_M 64 -#define size_N 64 -#else -#define size_M (NUM_CORES) -#define size_N (NUM_CORES) -#endif - -#define ALPHA 2 -#if NUM_CORES > 32 -int32_t data_x[size_M * size_N] - __attribute__((aligned(64 * 1024), section(".l1"))); -int32_t data_y[size_M * size_N] - __attribute__((aligned(64 * 1024), section(".l1"))); -int32_t data_y_copy[size_M * size_N] - __attribute__((aligned(64 * 1024), section(".l1"))); -#else -int32_t data_x[size_M * size_N] __attribute__((aligned(32), section(".l1"))); -int32_t data_y[size_M * size_N] __attribute__((aligned(32), section(".l1"))); -int32_t data_y_copy[size_M * size_N] - __attribute__((aligned(32), section(".l1"))); -#endif +#include "baremetal/mempool_axpy_i32p.h" +#include "baremetal/mempool_checks.h" +#include "data_axpy_i32.h" +int32_t l1_X[array_N] + __attribute__((aligned(NUM_CORES * sizeof(uint32_t)), section(".l1"))); +int32_t l1_Y[array_N] + __attribute__((aligned(NUM_CORES * sizeof(uint32_t)), section(".l1"))); int volatile error __attribute__((section(".l1"))); -void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - int32_t a, int32_t b, int32_t c, uint32_t core_id, - uint32_t num_cores) { - // How many rows/columns to split the matrix into - uint32_t const split = 8; - if (num_columns > num_rows) { - // Parallelize over columns - uint32_t const c_start = (num_rows / split) * (core_id % split); - uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1); - for (uint32_t j = (core_id / split); j < num_columns; - j += (num_cores / split)) { - for (uint32_t i = c_start; i < c_end; ++i) { - matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c; - } - } - } else { - // Parallelize over rows - uint32_t const c_start = (num_columns / split) * (core_id % split); - uint32_t const c_end = (num_columns / split) * ((core_id 
% split) + 1); - for (uint32_t i = (core_id / split); i < num_rows; - i += (num_cores / split)) { - for (uint32_t j = c_start; j < c_end; ++j) { - matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c; - } - } - } -} - -int verify_axpy(int32_t *matrix_X, int32_t *matrix_Y, int32_t *matrix_Y_COPY, - int32_t alpha, uint32_t elements) { - for (uint32_t i = 0; i < elements; i++) { - if (matrix_Y[i] != matrix_X[i] * alpha + matrix_Y_COPY[i]) { - return 1; - } - } - return 0; -} - int main() { uint32_t const core_id = mempool_get_core_id(); uint32_t const num_cores = mempool_get_core_count(); - uint32_t const total_elements = size_M * size_N; - - // Seed for create element matrix - int32_t const A_a = 1; - int32_t const A_b = 1; - int32_t const A_c = -32; - int32_t const B_a = 2; - int32_t const B_b = 1; - int32_t const B_c = 16; - - // Initialize synchronization variables mempool_barrier_init(core_id); + + // Initialize data if (core_id == 0) { - printf("Initialize %3d cores\n", num_cores); + dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int32_t)); + dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int32_t)); error = 0; } - - // init_elements; - init_matrix(data_x, size_M, size_N, A_a, A_b, A_c, core_id, num_cores); - init_matrix(data_y, size_M, size_N, B_a, B_b, B_c, core_id, num_cores); - init_matrix(data_y_copy, size_M, size_N, B_a, B_b, B_c, core_id, num_cores); mempool_barrier(num_cores); - // start kernel testing + // Benchmark mempool_start_benchmark(); - calc_axpy_unloop_x4_localbank(data_x, data_y, ALPHA, total_elements, core_id, - num_cores); + calc_axpy_unloop_x4_localbank(l1_X, l1_Y, ALPHA, array_N, core_id, num_cores); mempool_barrier(num_cores); mempool_stop_benchmark(); - // end kernel testing // Verify results - if (core_id == 0) { - printf("START CHECKING RESULTS\n"); - if (verify_axpy(data_x, data_y, data_y_copy, ALPHA, total_elements)) { - printf("RESULTS ERROR\n"); - error = 1; - } else { - printf("RESULTS CORRECT\n"); - } - } + 
mempool_check_q32(l1_Y, l2_Z, array_N, 0, 0); mempool_barrier(num_cores); - return error; + return 0; } diff --git a/software/apps/baremetal/cfft_radix2_q16/data.args b/software/apps/baremetal/cfft_radix2_q16/data.args deleted file mode 100644 index a7d20d682..000000000 --- a/software/apps/baremetal/cfft_radix2_q16/data.args +++ /dev/null @@ -1 +0,0 @@ -LEN 64 diff --git a/software/apps/baremetal/cfft_radix2_q16/main.c b/software/apps/baremetal/cfft_radix2_q16/main.c index 105cf6370..5a66c37d3 100644 --- a/software/apps/baremetal/cfft_radix2_q16/main.c +++ b/software/apps/baremetal/cfft_radix2_q16/main.c @@ -19,6 +19,7 @@ #include "synchronization.h" #include "data_cfft_radix2_q16.h" +#define N_BANKS (NUM_CORES * BANKING_FACTOR) /* CFFT mempool libraries */ #include "baremetal/mempool_cfft_q16_bitreversal.h" diff --git a/software/apps/baremetal/cfft_radix4_q16/data.args b/software/apps/baremetal/cfft_radix4_q16/data.args deleted file mode 100644 index a7d20d682..000000000 --- a/software/apps/baremetal/cfft_radix4_q16/data.args +++ /dev/null @@ -1 +0,0 @@ -LEN 64 diff --git a/software/apps/baremetal/cfft_radix4_q16/main.c b/software/apps/baremetal/cfft_radix4_q16/main.c index 88d7182fa..a4d9f887d 100644 --- a/software/apps/baremetal/cfft_radix4_q16/main.c +++ b/software/apps/baremetal/cfft_radix4_q16/main.c @@ -19,6 +19,8 @@ /* CFFT data libraries */ #include "data_cfft_radix4_q16.h" +#define N_BANKS (NUM_CORES * BANKING_FACTOR) +#define MAX_COL (N_BANKS / (N_CSAMPLES / 4)) /* CHOOSE ONE */ //#define SINGLE // Single core FFT. 
diff --git a/software/apps/baremetal/chest_q16/data.args b/software/apps/baremetal/chest_q16/data.args deleted file mode 100644 index d8d3acb68..000000000 --- a/software/apps/baremetal/chest_q16/data.args +++ /dev/null @@ -1,3 +0,0 @@ -N_TX 4 -N_RX 32 -N_SAMPLES 32 diff --git a/software/apps/baremetal/matmul_f16/main.c b/software/apps/baremetal/matmul_f16/main.c index b3b474b1d..99a0269cc 100644 --- a/software/apps/baremetal/matmul_f16/main.c +++ b/software/apps/baremetal/matmul_f16/main.c @@ -34,8 +34,10 @@ int main() { // Initialize Matrices 1 if (core_id == 0) { - dma_memcpy_blocking(matrix_a, A, (matrix_M * matrix_N) * sizeof(int16_t)); - dma_memcpy_blocking(matrix_b, B, (matrix_N * matrix_P) * sizeof(int16_t)); + dma_memcpy_blocking(matrix_a, l2_A, + (matrix_M * matrix_N) * sizeof(int16_t)); + dma_memcpy_blocking(matrix_b, l2_B, + (matrix_N * matrix_P) * sizeof(int16_t)); } mempool_barrier(num_cores); @@ -59,7 +61,7 @@ int main() { mempool_stop_benchmark(); #endif - mempool_check_f16(matrix_c, C, matrix_M * matrix_P, 0.5f, 0); + mempool_check_f16(matrix_c, l2_C, matrix_M * matrix_P, 0.5f, 0); mempool_barrier(num_cores); return 0; } diff --git a/software/apps/baremetal/matmul_f32/main.c b/software/apps/baremetal/matmul_f32/main.c index bc391200f..d3d7622db 100644 --- a/software/apps/baremetal/matmul_f32/main.c +++ b/software/apps/baremetal/matmul_f32/main.c @@ -30,13 +30,14 @@ int main() { uint32_t num_cores = mempool_get_core_count(); mempool_barrier_init(core_id); - // Initialize Matrices + // Initialize data if (core_id == 0) { - dma_memcpy_blocking(matrix_a, A, matrix_M * matrix_N * sizeof(int32_t)); - dma_memcpy_blocking(matrix_b, B, matrix_N * matrix_P * sizeof(int32_t)); + dma_memcpy_blocking(matrix_a, l2_A, matrix_M * matrix_N * sizeof(int32_t)); + dma_memcpy_blocking(matrix_b, l2_B, matrix_N * matrix_P * sizeof(int32_t)); } mempool_barrier(num_cores); + // Benchmark #if defined(SINGLE) if (core_id == 0) { // Execute function to test. 
@@ -57,7 +58,7 @@ int main() { mempool_stop_benchmark(); #endif - mempool_check_f32(matrix_c, C, matrix_M * matrix_P, 0.01f, 0); + mempool_check_f32(matrix_c, l2_C, matrix_M * matrix_P, 0.01f, 0); mempool_barrier(num_cores); return 0; } diff --git a/software/apps/baremetal/matmul_i32/main.c b/software/apps/baremetal/matmul_i32/main.c index 65e2b82f1..94b250306 100644 --- a/software/apps/baremetal/matmul_i32/main.c +++ b/software/apps/baremetal/matmul_i32/main.c @@ -7,131 +7,46 @@ #include #include -#include "baremetal/mempool_matmul_i32p.h" +#include "dma.h" #include "encoding.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" -// Define Matrix dimensions: -// C = AB with A=[MxN], B=[NxP], C=[MxP] -#define matrix_M 64 -#define matrix_N 32 -#define matrix_P 64 - -int32_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio"))); -int32_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio"))); -int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio"))); +#include "baremetal/mempool_checks.h" +#include "baremetal/mempool_matmul_i32p.h" +#include "data_matmul_i32.h" -int volatile error __attribute__((section(".l1"))); +int32_t l1_A[matrix_M * matrix_N] __attribute__((section(".l1_prio"))); +int32_t l1_B[matrix_N * matrix_P] __attribute__((section(".l1_prio"))); +int32_t l1_C[matrix_M * matrix_P] __attribute__((section(".l1_prio"))); -void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - int32_t a, int32_t b, int32_t c, uint32_t core_id, - uint32_t num_cores) { - uint32_t const split = 8; // How many rows/columns to split the matrix into - if (num_columns > num_rows) { - // Parallelize over columns - uint32_t const c_start = (num_rows / split) * (core_id % split); - uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1); - for (uint32_t j = (core_id / split); j < num_columns; - j += (num_cores / split)) { - for (uint32_t i = c_start; i < c_end; ++i) { - matrix[i * num_columns 
+ j] = a * (int32_t)i + b * (int32_t)j + c; - } - } - } else { - // Parallelize over rows - uint32_t const c_start = (num_columns / split) * (core_id % split); - uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1); - for (uint32_t i = (core_id / split); i < num_rows; - i += (num_cores / split)) { - for (uint32_t j = c_start; j < c_end; ++j) { - matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c; - } - } - } -} +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier_init(core_id); -// Initialize the matrices in parallel -int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - uint32_t inner_dim, int32_t aa, int32_t ab, int32_t ac, - int32_t ba, int32_t bb, int32_t bc, uint32_t core_id, - uint32_t num_cores) { - // Convert to signed - int32_t n = (int32_t)inner_dim; - // Parallelize over rows - for (uint32_t i = core_id; i < num_rows; i += num_cores) { - for (uint32_t j = 0; j < num_columns; ++j) { - int32_t ii = (int32_t)i; - int32_t jj = (int32_t)j; - int32_t lin = - (aa * bb * ii * jj + aa * bc * ii + ac * bb * jj + ac * bc) * n; - int32_t qua = - ((aa * ba * ii + ab * bb * jj + ab * bc + ba * ac) * (n * (n - 1))) / - 2; - int32_t cub = ((ab * ba) * (n * (n - 1) * (2 * n - 1))) / 6; - int32_t golden = lin + qua + cub; - if (matrix[i * num_columns + j] != golden) { - return (i + j) == 0 ? 
-1 : (int)(i * num_columns + j); - } - matrix[i * num_columns + j] = 0; - } + // Initialize data + if (core_id == 0) { + dma_memcpy_blocking(l1_A, l2_A, matrix_M * matrix_N * sizeof(int32_t)); + dma_memcpy_blocking(l1_B, l2_B, matrix_N * matrix_P * sizeof(int32_t)); } - return 0; -} - -int test_matrix_multiplication(int32_t *__restrict__ A, int32_t *__restrict__ B, - int32_t *__restrict__ C, uint32_t M, uint32_t N, - uint32_t P, uint32_t core_id, - uint32_t num_cores) { - int32_t const A_a = 1; - int32_t const A_b = 1; - int32_t const A_c = -32; - int32_t const B_a = 2; - int32_t const B_b = 1; - int32_t const B_c = 16; - - // Initialize Matrices - init_matrix(A, M, N, A_a, A_b, A_c, core_id, num_cores); - init_matrix(B, N, P, B_a, B_b, B_c, core_id, num_cores); - // Wait at barrier until everyone is ready mempool_barrier(num_cores); - // Execute function to test. - mempool_start_benchmark(); + // Benchmark + mempool_start_benchmark(); #ifdef __XPULPIMG - matmul_unrolled_2x2_parallel_i32_xpulpv2(A, B, C, M, N, P, core_id, - num_cores); + matmul_unrolled_2x2_parallel_i32_xpulpv2(l1_A, l1_B, l1_C, matrix_M, matrix_N, + matrix_P, core_id, num_cores); #else - matmul_unrolled_2x2_parallel_i32_rv32im(A, B, C, M, N, P, core_id, num_cores); + matmul_unrolled_2x2_parallel_i32_rv32im(l1_A, l1_B, l1_C, matrix_M, matrix_N, + matrix_P, core_id, num_cores); #endif - mempool_stop_benchmark(); - // Wait at barrier befor checking mempool_barrier(num_cores); - if (verify_matrix(C, M, P, N, A_a, A_b, A_c, B_a, B_b, B_c, core_id, - num_cores)) { - error = 1; - return -1; - } - return 0; -} -int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - // Initialize barrier and synchronize - mempool_barrier_init(core_id); - - if (core_id == 0) { - error = 0; - } - - // Test the Matrix multiplication - test_matrix_multiplication(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N, - matrix_P, core_id, num_cores); - // wait until all cores have 
finished + // Verify results + mempool_check_q32(l1_C, l2_C, matrix_M * matrix_P, 0, 0); mempool_barrier(num_cores); - - return error; + return 0; } diff --git a/software/apps/baremetal/matmul_i8/main.c b/software/apps/baremetal/matmul_i8/main.c index 4fb557f2c..63a24418e 100644 --- a/software/apps/baremetal/matmul_i8/main.c +++ b/software/apps/baremetal/matmul_i8/main.c @@ -7,137 +7,46 @@ #include #include -#include "baremetal/mempool_matmul_i8p.h" +#include "dma.h" #include "encoding.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" -// Define Matrix dimensions: -// C = AB with A=[MxN], B=[NxP], C=[MxP] -#define matrix_M 64 -#define matrix_N 64 -#define matrix_P 64 - -int8_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio"))); -int8_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio"))); -int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio"))); +#include "baremetal/mempool_checks.h" +#include "baremetal/mempool_matmul_i8p.h" +#include "data_matmul_i8.h" -int volatile error __attribute__((section(".l1"))); +int8_t l1_A[matrix_M * matrix_N] __attribute__((section(".l1_prio"))); +int8_t l1_B[matrix_N * matrix_P] __attribute__((section(".l1_prio"))); +int32_t l1_C[matrix_M * matrix_P] __attribute__((section(".l1_prio"))); -void init_matrix(int8_t *matrix, uint32_t num_rows, uint32_t num_columns, - int8_t a, int8_t b, int8_t c, uint32_t core_id, - uint32_t num_cores) { - uint32_t const split = 8; // How many rows/columns to split the matrix into - if (num_columns > num_rows) { - // Parallelize over columns - uint32_t const c_start = (num_rows / split) * (core_id % split); - uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1); - for (uint32_t j = (core_id / split); j < num_columns; - j += (num_cores / split)) { - for (uint32_t i = c_start; i < c_end; ++i) { - matrix[i * num_columns + j] = - (int8_t)(a * (int8_t)i + b * (int8_t)j + c); - } - } - } else { - // Parallelize over rows - 
uint32_t const c_start = (num_columns / split) * (core_id % split); - uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1); - for (uint32_t i = (core_id / split); i < num_rows; - i += (num_cores / split)) { - for (uint32_t j = c_start; j < c_end; ++j) { - matrix[i * num_columns + j] = - (int8_t)(a * (int8_t)i + b * (int8_t)j + c); - } - } - } -} +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier_init(core_id); -// Initialize the matrices in parallel -int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - uint32_t inner_dim, int8_t aa, int8_t ab, int8_t ac, - int8_t ba, int8_t bb, int8_t bc, uint32_t core_id, - uint32_t num_cores) { - // Convert to signed - int32_t n = (int32_t)inner_dim; - // Parallelize over rows - for (uint32_t i = core_id; i < num_rows; i += num_cores) { - for (uint32_t j = 0; j < num_columns; ++j) { - int32_t ii = (int32_t)i; - int32_t jj = (int32_t)j; - int32_t lin = ((int32_t)aa * bb * ii * jj + aa * bc * ii + ac * bb * jj + - (int32_t)ac * bc) * - n; - int32_t qua = - (((int32_t)aa * ba * ii + ab * bb * jj + ab * bc + (int32_t)ba * ac) * - (n * (n - 1))) / - 2; - int32_t cub = (((int32_t)ab * ba) * (n * (n - 1) * (2 * n - 1))) / 6; - int32_t golden = lin + qua + cub; - if (matrix[i * num_columns + j] != golden) { - return (i + j) == 0 ? 
-1 : (int)(i * num_columns + j); - } - matrix[i * num_columns + j] = 0; - } + // Initialize data + if (core_id == 0) { + dma_memcpy_blocking(l1_A, l2_A, matrix_M * matrix_N * sizeof(int8_t)); + dma_memcpy_blocking(l1_B, l2_B, matrix_N * matrix_P * sizeof(int8_t)); } - return 0; -} - -int test_matrix_multiplication(int8_t *__restrict__ A, int8_t *__restrict__ B, - int32_t *__restrict__ C, uint32_t M, uint32_t N, - uint32_t P, uint32_t core_id, - uint32_t num_cores) { - int8_t const A_a = 1; - int8_t const A_b = 1; - int8_t const A_c = -40; - int8_t const B_a = 0; - int8_t const B_b = 1; - int8_t const B_c = 19; - - // Initialize Matrices - init_matrix(A, M, N, A_a, A_b, A_c, core_id, num_cores); - init_matrix(B, N, P, B_a, B_b, B_c, core_id, num_cores); - // Wait at barrier until everyone is ready mempool_barrier(num_cores); - // Execute function to test. - mempool_start_benchmark(); + // Benchmark + mempool_start_benchmark(); #ifdef __XPULPIMG - matmul_unrolled_2x4_pincr_asm_parallel_i8_xpulpv2(A, B, C, M, N, P, core_id, - num_cores); - // matmul_unrolled_2x4_parallel_i8_xpulpv2(A, B, C, M, N, P, core_id, - // num_cores); + matmul_unrolled_2x4_pincr_asm_parallel_i8_xpulpv2( + l1_A, l1_B, l1_C, matrix_M, matrix_N, matrix_P, core_id, num_cores); #else - matmul_unrolled_2x2_parallel_i8_rv32im(A, B, C, M, N, P, core_id, num_cores); + matmul_unrolled_2x2_parallel_i8_rv32im(l1_A, l1_B, l1_C, matrix_M, matrix_N, + matrix_P, core_id, num_cores); #endif - mempool_stop_benchmark(); - // Wait at barrier befor checking mempool_barrier(num_cores); - if (verify_matrix(C, M, P, N, A_a, A_b, A_c, B_a, B_b, B_c, core_id, - num_cores)) { - error = 1; - return -1; - } - return 0; -} -int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - // Initialize barrier and synchronize - mempool_barrier_init(core_id); - - if (core_id == 0) { - error = 0; - } - - // Test the Matrix multiplication - test_matrix_multiplication(matrix_a, 
matrix_b, matrix_c, matrix_M, matrix_N, - matrix_P, core_id, num_cores); - // wait until all cores have finished + // Verify results + mempool_check_q32(l1_C, l2_C, matrix_M * matrix_P, 0, 0); mempool_barrier(num_cores); - - return error; + return 0; } diff --git a/software/data/data_matmul_f16.h.tpl b/software/data/data_matmul_f16.h.tpl deleted file mode 100644 index 96aa738a3..000000000 --- a/software/data/data_matmul_f16.h.tpl +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(__fp16){:.4f}f, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define matrix_M (${matrix_M}) -#define matrix_N (${matrix_N}) -#define matrix_P (${matrix_P}) - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) A[${matrix_M * matrix_N}] = ${array_to_cstr(A)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) B[${matrix_N * matrix_P}] = ${array_to_cstr(B)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) C[${matrix_M * matrix_P}] = ${array_to_cstr(C)}; diff --git a/software/data/data_matmul_f32.h.tpl b/software/data/data_matmul_f32.h.tpl deleted file mode 100644 index 4e9e6a4d6..000000000 --- a/software/data/data_matmul_f32.h.tpl +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '{}f, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define matrix_M (${matrix_M}) -#define matrix_N (${matrix_N}) -#define matrix_P (${matrix_P}) - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) A[${matrix_M * matrix_N}] = ${array_to_cstr(A)}; - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) B[${matrix_N * matrix_P}] = ${array_to_cstr(B)}; - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) C[${matrix_M * matrix_P}] = ${array_to_cstr(C)}; diff --git a/software/data/data_matmulf16.py b/software/data/data_matmulf16.py deleted file mode 100644 index 2c362208b..000000000 --- a/software/data/data_matmulf16.py +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp16 matmul. 
-# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from mako.template import Template - - -################## -# compute_result # -################## - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_matmul_f16.h.tpl", - help='Path to mako template' - ) - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-m", - "--dim_m", - type=int, - required=False, - default=16, - help='First dimension.' - ) - parser.add_argument( - "-n", - "--dim_n", - type=int, - required=False, - default=16, - help='Second dimension.' - ) - parser.add_argument( - "-p", - "--dim_p", - type=int, - required=False, - default=16, - help='Third dimension.' 
- ) - - args = parser.parse_args() - - matrix_M = args.dim_m - matrix_N = args.dim_n - matrix_P = args.dim_p - - # Create matrix - A = (np.random.rand(matrix_M, matrix_N) - 0.5).astype(np.float16) - B = (np.random.rand(matrix_N, matrix_P) - 0.5).astype(np.float16) - C = np.matmul(A, B) - - A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(np.float16) - B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(np.float16) - C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.float16) - - kwargs = { - 'name': 'data_matmul_f16', - 'A': A, - 'B': B, - 'C': C, - 'matrix_M': matrix_M, - 'matrix_N': matrix_N, - 'matrix_P': matrix_P} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_matmulf32.py b/software/data/data_matmulf32.py deleted file mode 100644 index 15086d0fc..000000000 --- a/software/data/data_matmulf32.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp32 matmul. 
-# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from mako.template import Template - - -################## -# compute_result # -################## - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_matmul_f32.h.tpl", - help='Path to mako template' - ) - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - - parser.add_argument( - "-m", - "--dim_m", - type=int, - required=False, - default=16, - help='First dimension.' - ) - parser.add_argument( - "-n", - "--dim_n", - type=int, - required=False, - default=16, - help='Second dimension.' - ) - parser.add_argument( - "-p", - "--dim_p", - type=int, - required=False, - default=16, - help='Third dimension.' 
- ) - - args = parser.parse_args() - - matrix_M = args.dim_m - matrix_N = args.dim_n - matrix_P = args.dim_p - - # Create matrix - A = np.random.rand(matrix_M, matrix_N) - B = np.random.rand(matrix_N, matrix_P) - C = np.matmul(A, B) - - A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(np.float32) - B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(np.float32) - C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.float32) - - kwargs = { - 'name': 'data_matmul_f32', - 'A': A, - 'B': B, - 'C': C, - 'matrix_M': matrix_M, - 'matrix_N': matrix_N, - 'matrix_P': matrix_P} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main()
diff --git a/software/data/gendata_header.py b/software/data/gendata_header.py
new file mode 100644
index 000000000..a064ce63b
--- /dev/null
+++ b/software/data/gendata_header.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+
+# Copyright 2022 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+# This script generates the per-app data_*.h header files.
+# Author: Marco Bertuletti
+
+import argparse
+import os
+import math
+import numpy as np
+import hjson
+import ast
+
+import gendatalib_cfft as cfft
+import gendatalib_chest as chest
+import gendatalib_blas as blas
+
+
+header = """\
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// File generated with software/data/gendata_header.py
+// Author: Marco Bertuletti\n\n
+"""
+
+
+def print_array(arr, typ, name):
+    """
+    Formats a flat array as a C array definition placed in section ".l2".
+
+    :param arr: Flat numpy array holding the values to print.
+    :param typ: C type of the elements: int8_t, uint8_t, int16_t, uint16_t,
+                int32_t, uint32_t, float or __fp16.
+    :param name: Name of the generated C array.
+    :return: The C definition as a string, four elements per line.
+    :raises Exception: If typ is not one of the supported types.
+    """
+
+    # Print format and bit mask of each supported integer type
+    int_formats = {
+        "int32_t": ('0X{:08X}', 0xffffffff),
+        "uint32_t": ('0X{:08X}', 0xffffffff),
+        "int16_t": ('0X{:04X}', 0x0000ffff),
+        "uint16_t": ('0X{:04X}', 0x0000ffff),
+        "int8_t": ('0X{:02X}', 0x000000ff),
+        "uint8_t": ('0X{:02X}', 0x000000ff),
+    }
+    float_formats = {'float': '{:+.8f}', '__fp16': '{:+.4f}'}
+
+    elements = []
+    for value in arr:
+        if typ in int_formats:
+            fmt, mask = int_formats[typ]
+            # Mask a Python int: masking the numpy scalar directly overflows
+            # on numpy >= 2, because the mask does not fit the element type.
+            literal = fmt.format(int(value) & mask)
+        elif typ in float_formats:
+            literal = float_formats[typ].format(value)
+        else:
+            raise Exception("ERROR: Unsupported data type!!!")
+        elements.append('({}) {}'.format(typ, literal))
+
+    # Four elements per line. Joining (instead of trimming the last three
+    # characters) keeps the final value intact for any array length.
+    rows = [', '.join(elements[i:i + 4]) for i in range(0, len(elements), 4)]
+    output_string = typ
+    output_string += " __attribute__((aligned(sizeof(int32_t)), section(\".l2\"))) "
+    output_string += name + '[{}] = {{\n'.format(arr.size)
+    output_string += ', \n'.join(rows)
+    output_string += "};\n\n"
+    return output_string
+
+
+def print_file(header, defines, arrays, filename):
+    """
+    Writes defines and arrays to a file.
+
+    :param header: Header of the printed file
+    :param defines: A list of (define_name, define_value) tuples, one per
+                    #define directive.
+    :param arrays: A list of (array_values, array_type, array_name) tuples,
+                   one per array.
+    :param filename: The output file to write to.
+    """
+
+    # Initialize the output string
+    output_string = header
+
+    # Write the defines
+    for define_name, define_value in defines:
+        output_string += "#define {} ({})\n".format(define_name, define_value)
+    output_string += "\n"  # Add space between defines and arrays
+
+    # Write the arrays using print_array
+    for array_values, array_type, array_name in arrays:
+        output_string += print_array(array_values, array_type, array_name)
+
+    # Write everything to the file
+    with open(filename, "w") as file:
+        file.write(output_string)
+
+    print("Generate {}".format(filename))
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(
+        description='Generate data.h header files.')
+    parser.add_argument('--app_name', type=str, help='Name of the app')
+    parser.add_argument('--params', type=str,
+                        help='Path to the hjson file with the app parameters')
+
+    # Generators called with the defines only
+    untyped_generators = {
+        "axpy_i32": blas.generate_iaxpy,
+        "cfft_radix4_q16": cfft.generate_cfft_q16,
+        "cfft_radix2_q16": cfft.generate_cfft_q16,
+        "chest_q16": chest.generate_chest_q16,
+    }
+    # Generators called with the defines and the "type" of the app
+    typed_generators = {
+        "conv2d_i32": blas.generate_iconv,
+        "matmul_f16": blas.generate_fmatmul,
+        "matmul_f32": blas.generate_fmatmul,
+        "matmul_i32": blas.generate_imatmul,
+        "matmul_i8": blas.generate_imatmul,
+    }
+
+    # Parse the command-line arguments
+    args = parser.parse_args()
+    app_name = args.app_name
+    with open(args.params, 'r') as hjson_file:
+        config_data = hjson.load(hjson_file)
+    data_args = config_data.get(app_name)
+
+    # Apps without an entry in the parameter file (or without a generator)
+    # do not need a data header: report it instead of failing on None.
+    if data_args is None or (app_name not in untyped_generators
+                             and app_name not in typed_generators):
+        print("No need for data generation.")
+    else:
+        my_type = data_args.get("type")
+        defnes = [ast.literal_eval(defne)
+                  for defne in data_args.get("defines")]
+        arrays = [ast.literal_eval(array)
+                  for array in data_args.get("arrays")]
+        params = dict(defnes)
+
+        # Determine output file name
+        filename = os.path.dirname(os.path.abspath(__file__))
+        filename = os.path.join(filename, "data_{}.h".format(app_name))
+
+        # Generate the data
+        if app_name in typed_generators:
+            result = typed_generators[app_name](**params, my_type=my_type)
+        else:
+            result = untyped_generators[app_name](**params)
+
+        if app_name in ("cfft_radix4_q16", "cfft_radix2_q16"):
+            # The cfft kernels need defines derived from the FFT size; the
+            # generator returns the tolerance after the four arrays.
+            N = params["N_CSAMPLES"]
+            defnes += [
+                ("LOG2", int(math.log2(N))),
+                ("N_TWIDDLES", 3 * N // 4),
+                ("BITREVINDEXTABLE_LENGTH", len(result[3])),
+                ("TOLERANCE", result[4]),
+            ]
+            result = result[0:4]
+
+        # Generate data header file
+        arrays = [(result[i], *arrays[i]) for i in range(len(arrays))]
+        print_file(header, defnes, arrays, filename)
diff --git a/software/data/gendata_params.hjson b/software/data/gendata_params.hjson
new file mode 100644
index 000000000..5953c9710
--- /dev/null
+++ b/software/data/gendata_params.hjson
@@ -0,0 +1,134 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Parameters of the data headers generated by gendata_header.py.
+// Each "defines"/"arrays" entry is a quoteless string holding a Python tuple:
+// do not append a comma to it, Hjson would make it part of the string.
+// Author: Marco Bertuletti
+
+{
+  "axpy_i32": {
+    "type": "int32",
+    "defines": [
+      ("ALPHA", 6)
+      ("array_N", 1024)
+    ]
+    "arrays": [
+      ("int32_t", "l2_X")
+      ("int32_t", "l2_Y")
+      ("int32_t", "l2_Z")
+    ]
+  },
+
+  "cfft_radix4_q16": {
+    "type": "int16",
+    "defines": [
+      ("N_CSAMPLES", 1024)
+    ]
+    "arrays": [
+      ("int16_t", "l2_pSrc")
+      ("int16_t", "l2_pRes")
+      ("int16_t", "l2_twiddleCoef_q16")
+      ("int16_t", "l2_BitRevIndexTable")
+    ]
+  },
+
+  "cfft_radix2_q16": {
+    "type": "int16",
+    "defines": [
+      ("N_CSAMPLES", 1024)
+    ]
+    "arrays": [
+      ("int16_t", "l2_pSrc")
+      ("int16_t", "l2_pRes")
+      ("int16_t", "l2_twiddleCoef_q16")
+      ("int16_t", "l2_BitRevIndexTable")
+    ]
+  },
+
+  "chest_q16": {
+    "type": "int32",
+    "defines": [
+      ("N_TX", 4)
+      ("N_RX", 4)
+      ("N_SAMPLES", 512)
+    ]
+    "arrays": [
+      ("int16_t", "l2_PilotTX")
+      ("int16_t", "l2_PilotRX")
+      ("int16_t", "l2_HEST")
+    ]
+  },
+
+  "conv2d_i32": {
+    "type": "int32",
+    "defines": [
+      ("matrix_M", 20)
+      ("matrix_N", 1024)
+      ("kernel_N", 3)
+    ]
+    "arrays": [
+      ("int32_t", "l2_X")
+      ("int32_t", "l2_K")
+      ("int32_t", "l2_Y")
+    ]
+  },
+
+  "matmul_f16": {
+    "type": "float16",
+    "defines": [
+      ("matrix_M", 32)
+      ("matrix_N", 32)
+      ("matrix_P", 32)
+    ]
+    "arrays": [
+      ("__fp16", "l2_A")
+      ("__fp16", "l2_B")
+      ("__fp16", "l2_C")
+    ]
+  },
+
+  "matmul_f32": {
+    "type": "float32",
+    "defines": [
+      ("matrix_M", 16)
+      ("matrix_N", 16)
+      ("matrix_P", 16)
+    ]
+    "arrays": [
+      ("float", "l2_A")
+      ("float", "l2_B")
+      ("float", "l2_C")
+    ]
+  },
+
+  "matmul_i32": {
+    "type": "int32",
+    "defines": [
+      ("matrix_M", 32)
+      ("matrix_N", 32)
+      ("matrix_P", 32)
+    ]
+    "arrays": [
+      ("int32_t", "l2_A")
+      ("int32_t", "l2_B")
+      ("int32_t", "l2_C")
+    ]
+  },
+
+  "matmul_i8": {
+    "type": "int8",
+    "defines": [
+      ("matrix_M", 64)
+      ("matrix_N", 64)
+      ("matrix_P", 64)
+    ]
+    "arrays": [
+      ("int8_t", "l2_A")
+      ("int8_t", "l2_B")
+      ("int32_t", "l2_C")
+    ]
+  }
+
+}
diff --git a/software/data/gendatalib_blas.py b/software/data/gendatalib_blas.py
new file mode 100644
index 000000000..0fd1cf780
--- /dev/null
+++ b/software/data/gendatalib_blas.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+
+# Copyright 2022 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+# This script generates data for the matmul, axpy and conv2d kernels.
+# Author: Marco Bertuletti
+
+import numpy as np
+from scipy import signal
+
+
+def generate_fmatmul(matrix_M=16, matrix_N=16,
+                     matrix_P=16, my_type=np.float32):
+    """
+    Generates the operands and the result of a floating-point matmul.
+
+    :param matrix_M: Number of rows of A and C.
+    :param matrix_N: Number of columns of A and rows of B.
+    :param matrix_P: Number of columns of B and C.
+    :param my_type: Numpy type of the generated data.
+    :return: A, B and C = A * B, flattened in row-major order.
+    """
+
+    # Create matrix
+    A = (np.random.rand(matrix_M, matrix_N) - 0.5).astype(my_type)
+    B = (np.random.rand(matrix_N, matrix_P) - 0.5).astype(my_type)
+    C = np.matmul(A, B)
+
+    A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(my_type)
+    B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(my_type)
+    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(my_type)
+
+    return A, B, C
+
+
+def generate_imatmul(matrix_M=16, matrix_N=16, matrix_P=16, my_type=np.int32):
+    """
+    Generates the operands and the result of an integer matmul.
+
+    :param matrix_M: Number of rows of A and C.
+    :param matrix_N: Number of columns of A and rows of B.
+    :param matrix_P: Number of columns of B and C.
+    :param my_type: Numpy type of A and B; C is always int32.
+    :return: A, B and C = A * B, flattened in row-major order.
+    """
+
+    # Create matrix
+    MAX = 2**6
+    A = np.random.randint(-MAX, MAX - 1, size=(matrix_M, matrix_N))
+    # B is N x P, so that non-square problems have matching inner dimensions
+    B = np.random.randint(-MAX, MAX - 1, size=(matrix_N, matrix_P))
+    C = np.matmul(A, B)
+
+    A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(my_type)
+    B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(my_type)
+    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.int32)
+
+    return A, B, C
+
+
+def generate_iaxpy(ALPHA=6, array_N=1024, my_type=np.int32):
+    """
+    Generates the operands and the result of an integer axpy.
+
+    :param ALPHA: Scalar multiplying X.
+    :param array_N: Number of elements of each vector.
+    :param my_type: Numpy type of the generated data.
+    :return: X, Y and Z = Y + ALPHA * X.
+    """
+
+    # Create vectors
+    MAX = 32
+    X = np.random.randint(-MAX, MAX, size=(array_N)).astype(my_type)
+    Y = np.random.randint(-MAX, MAX, size=(array_N)).astype(my_type)
+    Z = (Y + X * ALPHA).astype(my_type)
+
+    return X, Y, Z
+
+
+def generate_iconv(matrix_M=32, matrix_N=32, kernel_N=3, my_type=np.int32):
+    """
+    Generates input, kernel and result of an integer 2D convolution.
+
+    :param matrix_M: Number of rows of the input matrix.
+    :param matrix_N: Number of columns of the input matrix.
+    :param kernel_N: Number of rows and columns of the kernel.
+    :param my_type: Numpy type of the generated data.
+    :return: X, K and Y (zero-padded convolution, same size as X), flattened.
+    """
+
+    # Create matrix
+    MAX = 32
+    # X is M x N: the input is not square in general
+    X = np.random.randint(-MAX, MAX, size=(matrix_M, matrix_N)).astype(my_type)
+    K = np.random.randint(-MAX, MAX, size=(kernel_N, kernel_N)).astype(my_type)
+    Y = signal.convolve2d(X, K, mode="same", boundary='fill')
+
+    X = X.flatten().astype(my_type)
+    K = K.flatten().astype(my_type)
+    Y = Y.flatten().astype(my_type)
+
+    return X, K, Y
diff --git a/software/data/generate_cfft.py b/software/data/gendatalib_cfft.py similarity index 80% rename from software/data/generate_cfft.py rename to software/data/gendatalib_cfft.py index 584233f03..6916a532d 100644 --- a/software/data/generate_cfft.py +++ b/software/data/gendatalib_cfft.py @@ -11,13 +11,46 @@ import math as M from sympy.combinatorics import Permutation -__all__ = [ - 'generate_cfft_q16', - 'generate_twiddleCoefq15', - 'generate_bitreversal'] + +def generate_twiddleCoefq15(N): + PI = 3.14159265358979 + twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16) + for i in range(0, (int)(3 * N / 4)): + twiddleCoefq15_cos = M.cos(i * 2 * PI / N) + twiddleCoefq15_sin = M.sin(i * 2 * PI / N) + twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1))) + twiddleCoefq15[2 * i + + 1] = int(round(twiddleCoefq15_sin * (2**15 - 1))) + return twiddleCoefq15 + + +def generate_bitreversal(N, R): + # Decompose + logR2 = [] + idx = N + while (idx >= R): + logR2.append(int(M.log2(R))) + idx = idx // R + if (idx > 1): + logR2.append(int(M.log2(idx))) + # Bitreversal + indexes = [] + for x in range(N): + result = 0 + for bits in logR2: + mask = (0xffffffff >> (32 - bits)) + result = (result << bits) | (x & mask) + x = x >> bits + indexes.append(result) + # Create transpositions table + tps = [] + for c in Permutation.from_sequence(indexes).cyclic_form: + for i in range(len(c) - 1): +
tps.append([c[i] * 8, c[-1] * 8]) + return np.ndarray.flatten(np.array(tps)) -def generate_cfft_q16(N): +def generate_cfft_q16(N_CSAMPLES): # Q16: # len=16: Q1.15 -> Q5.11 # len=32: Q1.15 -> Q6.10 @@ -28,8 +61,9 @@ def generate_cfft_q16(N): # len=1024: Q1.15 -> Q11.5 # len=2048: Q1.15 -> Q12.4 # len=4096: Q1.15 -> Q13.3 - src = (np.random.randint(-2**(15), 2**(15) - 1, - 2 * N, dtype=np.int16)).astype(np.int16) + MAX = 2**(15) + src = (np.random.randint(-MAX, MAX - 1, 2 * + N_CSAMPLES, dtype=np.int16)).astype(np.int16) tolerance = { 16: 16, 32: 20, @@ -51,54 +85,20 @@ def generate_cfft_q16(N): 2048: 4, 4096: 3} my_fixpoint = 15 - dst = np.zeros(2 * N, dtype=np.int16) - complex_src = np.zeros(N, dtype=np.csingle) - complex_dst = np.zeros(N, dtype=np.csingle) - for i in range(N): + dst = np.zeros(2 * N_CSAMPLES, dtype=np.int16) + complex_src = np.zeros(N_CSAMPLES, dtype=np.csingle) + complex_dst = np.zeros(N_CSAMPLES, dtype=np.csingle) + for i in range(N_CSAMPLES): shift = 2**(my_fixpoint) complex_src[i] = (src[2 * i].astype(np.csingle) / shift) + \ 1j * (src[2 * i + 1].astype(np.csingle) / shift) complex_dst = np.fft.fft(complex_src) - for i in range(N): - shift = 2**(bit_shift_dict_q16[N]) + for i in range(N_CSAMPLES): + shift = 2**(bit_shift_dict_q16[N_CSAMPLES]) dst[2 * i] = (np.real(complex_dst[i]) * shift).astype(np.int16) dst[2 * i + 1] = (np.imag(complex_dst[i]) * shift).astype(np.int16) - return src, dst, tolerance[N] + twiddles = generate_twiddleCoefq15(N_CSAMPLES) + bitrever = generate_bitreversal(N_CSAMPLES, 2) -def generate_twiddleCoefq15(N): - PI = 3.14159265358979 - twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16) - for i in range(0, (int)(3 * N / 4)): - twiddleCoefq15_cos = M.cos(i * 2 * PI / N) - twiddleCoefq15_sin = M.sin(i * 2 * PI / N) - twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1))) - twiddleCoefq15[2 * i + - 1] = int(round(twiddleCoefq15_sin * (2**15 - 1))) - return twiddleCoefq15 - - -def generate_bitreversal(N, 
R): - # Decompose - logR2 = [] - idx = N - while (idx >= R): - logR2.append(int(M.log2(R))) - idx = idx // R - if (idx > 1): - logR2.append(int(M.log2(idx))) - # Bitreversal - indexes = [] - for x in range(N): - result = 0 - for bits in logR2: - mask = (0xffffffff >> (32 - bits)) - result = (result << bits) | (x & mask) - x = x >> bits - indexes.append(result) - # Create transpositions table - tps = [] - for c in Permutation.from_sequence(indexes).cyclic_form: - for i in range(len(c) - 1): - tps.append([c[i] * 8, c[-1] * 8]) - return np.ndarray.flatten(np.array(tps)) + return src, dst, twiddles, bitrever, tolerance[N_CSAMPLES] diff --git a/software/data/generate_chest.py b/software/data/gendatalib_chest.py similarity index 76% rename from software/data/generate_chest.py rename to software/data/gendatalib_chest.py index d95b9748e..ae197723b 100755 --- a/software/data/generate_chest.py +++ b/software/data/gendatalib_chest.py @@ -10,9 +10,6 @@ import numpy as np -__all__ = ['generate_chest_q16'] - - def q_sat(x): if x > 2**15 - 1: return x - 2**16 @@ -48,19 +45,19 @@ def compute_chest_q16(in_rx, in_tx, p): return result -def generate_chest_q16(nb_tx, nb_rx, nb_samples): +def generate_chest_q16(N_TX, N_RX, N_SAMPLES): FIXED_POINT = 8 MAX = 2**7 qvector_pilot_tx = [] qvector_pilot_rx = [] qvector_Hest = [] - for k in range(nb_samples): + for k in range(N_SAMPLES): # Create pilots - pilot_rx = np.random.randint(-MAX, MAX - 1, size=nb_rx) + 1j * \ - np.random.randint(-MAX, MAX - 1, size=nb_rx) - pilot_tx = np.random.randint(-MAX, MAX - 1, size=nb_tx) + 1j * \ - np.random.randint(-MAX, MAX - 1, size=nb_tx) + pilot_rx = np.random.randint(-MAX, MAX - 1, size=N_RX) + 1j * \ + np.random.randint(-MAX, MAX - 1, size=N_RX) + pilot_tx = np.random.randint(-MAX, MAX - 1, size=N_TX) + 1j * \ + np.random.randint(-MAX, MAX - 1, size=N_TX) # Compute Hest Hest = compute_chest_q16(pilot_rx, pilot_tx, FIXED_POINT) @@ -74,7 +71,7 @@ def generate_chest_q16(nb_tx, nb_rx, nb_samples): 
qvector_pilot_rx.append(pilot_rx) qvector_Hest.append(Hest) - qvector_pilot_tx = np.reshape(qvector_pilot_tx, [2 * nb_tx * nb_samples]) - qvector_pilot_rx = np.reshape(qvector_pilot_rx, [2 * nb_rx * nb_samples]) - qvector_Hest = np.reshape(qvector_Hest, [2 * nb_tx * nb_rx * nb_samples]) + qvector_pilot_tx = np.reshape(qvector_pilot_tx, [2 * N_TX * N_SAMPLES]) + qvector_pilot_rx = np.reshape(qvector_pilot_rx, [2 * N_RX * N_SAMPLES]) + qvector_Hest = np.reshape(qvector_Hest, [2 * N_TX * N_RX * N_SAMPLES]) return qvector_pilot_tx, qvector_pilot_rx, qvector_Hest diff --git a/software/data/print_header.py b/software/data/print_header.py deleted file mode 100644 index 8d2575d33..000000000 --- a/software/data/print_header.py +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the Channel estimation. 
-# Author: Marco Bertuletti - -import argparse -import os -import math -import generate_cfft as cfft -import generate_chest as chest - - -def extract_data_args(filename): - # Define a dictionary to store numerical values for each flag - args = {} - - # Open the file for reading - with open(filename, 'r') as file: - # Iterate through each line in the file - for line in file: - # Split the line into words - words = line.split() - # Iterate through each word in the line - for i in range(len(words)): - flag = words[i] # Get the flag name - # Check if the next word exists and is a numerical value - if i + 1 < len(words) and words[i + 1].isdigit(): - # Convert the numerical value to an integer - numerical_value = int(words[i + 1]) - # Store the numerical value in the structure - args[flag] = numerical_value - - # Return the structure containing numerical values for each flag - return args - - -class dot_dict: - def __init__(self, data): - self.data = data - - def __getattr__(self, attr): - if attr in self.data: - return self.data[attr] - else: - raise AttributeError(f"Object has no attribute '{attr}'") - - -def print_array(arr, typ, name, str): - count = 0 - output_string = typ - output_string += " __attribute__((aligned(sizeof(int32_t)), \ - section(\".l2\"))) " - output_string += name + '[{}] = {{\n'.format(arr.size) - for value in arr: - output_string += '(int16_t) 0X{:04X}, '.format(value & 0xffff) - count += 1 - if count % 8 == 0: - output_string += '\n' - output_string = output_string[:-3] - output_string += "};\n" - return output_string - - -def print_file(string, filename): - with open(filename, "w") as file: - # Write the string to the file - file.write(string + '\n') - return file - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser( - description='Generate data.h header files.') - parser.add_argument('--params', type=str, help='Name of the app') - # Parse the command-line arguments - args = parser.parse_args() - params = args.params - # Read 
arguments from data.args file - data_args = extract_data_args(params) - (app_path, _) = os.path.split(params) - (_, app_name) = os.path.split(app_path) - - if data_args != {}: - string = "// Copyright 2022 ETH Zurich and University of \ - Bologna.\n // Licensed under the Apache License, \ - Version 2.0, see LICENSE for details.\n \ - // SPDX-License-Identifier: Apache-2.0\n\n \ - // File generated with .data/print_header.py\n" - - data_args = dot_dict(data_args) # Access args with .notation - - if app_name == "cfft_radix4_q16": - # cfft_radix4_q16 - src_cfft_q16, dst_cfft_q16, tolerance_q16 = cfft.generate_cfft_q16( - data_args.LEN) - brv_cfft_q16 = cfft.generate_bitreversal(data_args.LEN, 2) - twi_cfft_q16 = cfft.generate_twiddleCoefq15(data_args.LEN) - string += "#define LOG2 ({})\n".format( - int(math.log2(data_args.LEN))) - string += "#define N_CSAMPLES ({})\n".format(data_args.LEN) - string += "#define N_TWIDDLES ({})\n".format(3 * - data_args.LEN // 4) - string += "#define BITREVINDEXTABLE_LENGTH ({})\n".format( - len(brv_cfft_q16)) - string += "#define TOLERANCE ({})\n".format(tolerance_q16) - string += "#define N_BANKS (NUM_CORES * BANKING_FACTOR)\n" - string += "#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))\n" - string += print_array(src_cfft_q16, "int16_t", "l2_pSrc", string) - string += print_array(dst_cfft_q16, "int16_t", "l2_pRes", string) - string += print_array(twi_cfft_q16, "int16_t", - "l2_twiddleCoef_q16", string) - string += print_array(brv_cfft_q16, "int16_t", - "l2_BitRevIndexTable", string) - filename = app_path + "/data_cfft_radix4_q16.h" - - elif app_name == "cfft_radix2_q16": - # cfft_radix2_q16 - src_cfft_q16, dst_cfft_q16, tolerance_q16 = cfft.generate_cfft_q16( - data_args.LEN) - brv_cfft_q16 = cfft.generate_bitreversal(data_args.LEN, 2) - twi_cfft_q16 = cfft.generate_twiddleCoefq15(data_args.LEN) - string += "#define LOG2 ({})\n".format( - int(math.log2(data_args.LEN))) - string += "#define N_CSAMPLES ({})\n".format(data_args.LEN) - 
string += "#define N_TWIDDLES ({})\n".format(3 * - data_args.LEN // 4) - string += "#define BITREVINDEXTABLE_LENGTH ({})\n".format( - len(brv_cfft_q16)) - string += "#define TOLERANCE ({})\n".format(tolerance_q16) - string += "#define N_BANKS (NUM_CORES * BANKING_FACTOR)\n" - string += print_array(src_cfft_q16, "int16_t", "l2_pSrc", string) - string += print_array(dst_cfft_q16, "int16_t", "l2_pRes", string) - string += print_array(twi_cfft_q16, "int16_t", - "l2_twiddleCoef_q16", string) - string += print_array(brv_cfft_q16, "int16_t", - "l2_BitRevIndexTable", string) - filename = app_path + "/data_cfft_radix2_q16.h" - - elif app_name == "chest_q16": - src1_chest_q16, src2_chest_q16, dst_chest_q16 = \ - chest.generate_chest_q16(data_args.N_TX, data_args.N_RX, - data_args.N_SAMPLES) - string += "#define N_TX ({})\n".format(data_args.N_TX) - string += "#define N_RX ({})\n".format(data_args.N_RX) - string += "#define N_SAMPLES ({})\n".format(data_args.N_SAMPLES) - string += print_array(src1_chest_q16, - "int16_t", "l2_PilotTX", string) - string += print_array(src2_chest_q16, - "int16_t", "l2_PilotRX", string) - string += print_array(dst_chest_q16, "int16_t", "l2_HEST", string) - filename = app_path + "/data_chest_q16.h" - - else: - raise Exception("ERROR: no app with such name!!!") - - print_file(string, filename) diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk index 82268ba07..52d86c6d1 100644 --- a/software/runtime/runtime.mk +++ b/software/runtime/runtime.mk @@ -172,8 +172,8 @@ OMP_RUNTIME := $(addsuffix .o,$(shell find $(OMP_DIR) -name "*.c")) %.ld: %.ld.c $(RISCV_CC) -P -E $(DEFINES) $< -o $@ -%.h: %.args - $(python) $(MEMPOOL_DIR)/software/data/print_header.py --params $< +data_%.h: $(DATA_DIR)/gendata_params.hjson + $(python) $(DATA_DIR)/gendata_header.py --app_name $* --params $(DATA_DIR)/gendata_params.hjson # Bootrom %.elf: %.S $(ROOT_DIR)/bootrom.ld $(LINKER_SCRIPT)