diff --git a/include/tvm/runtime/crt/error_codes.h b/include/tvm/runtime/crt/error_codes.h index 4cbfb0aab8e2..d1a8619e8233 100644 --- a/include/tvm/runtime/crt/error_codes.h +++ b/include/tvm/runtime/crt/error_codes.h @@ -79,6 +79,7 @@ typedef enum { kTvmErrorPlatformShutdown = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 2), kTvmErrorPlatformNoMemory = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 3), kTvmErrorPlatformTimerBadState = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 4), + kTvmErrorPlatformStackAllocBadFree = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 5), // Common error codes returned from generated functions. kTvmErrorGeneratedInvalidStorageId = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGenerated, 0), diff --git a/include/tvm/runtime/crt/stack_allocator.h b/include/tvm/runtime/crt/stack_allocator.h index daa403cb2764..4184dff7510e 100644 --- a/include/tvm/runtime/crt/stack_allocator.h +++ b/include/tvm/runtime/crt/stack_allocator.h @@ -45,14 +45,61 @@ typedef struct { size_t workspace_size; // Total number of bytes in the workspace } tvm_workspace_t; +/*! + * \brief Initialize the stack-based memory manager + * + * \param tvm_runtime_workspace The tvm_workspace_t struct containing state + * \param g_aot_memory The memory buffer used to allocate within + * \param workspace_size The total size of the workspace buffer + */ tvm_crt_error_t StackMemoryManager_Init(tvm_workspace_t* tvm_runtime_workspace, uint8_t* g_aot_memory, size_t workspace_size); +/*! + * \brief The intended user-facing function to allocate within the buffer. It wraps + * StackMemoryManager_Allocate_Body to enable and disable the LIFO check that is useful for debugging + * the AoT codegen. 
+ * + * \param tvm_runtime_workspace The tvm_workspace_t struct containing state + * \param nbytes The number of bytes required for the allocation + * \param current_alloc The pointer-to-pointer to be populated with the allocated address + */ tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes, - void**); + void** current_alloc); + +/*! + * \brief The internal function that accepts allocate inputs and an extra byte to say to enable the + * LIFO check that is useful in debugging the AoT codegen. + * + * \param tvm_runtime_workspace The tvm_workspace_t struct containing state + * \param nbytes The number of bytes required for the allocation + * \param current_alloc The pointer-to-pointer to be populated with the allocated address + * \param do_lifo_check This being non-zero indicates to perform a check of the LIFO pattern of Allocs/Frees + */ +tvm_crt_error_t StackMemoryManager_Allocate_Body(tvm_workspace_t* tvm_runtime_workspace, + int32_t nbytes, void** current_alloc, + uint8_t do_lifo_check); +/*! + * \brief The intended user-facing function to free the tensor within the buffer. It wraps + * StackMemoryManager_Free_Body to enable and disable the LIFO check + * + * \param tvm_runtime_workspace The tvm_workspace_t struct containing state + * \param ptr The base pointer of the tensor to be free'd + */ tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr); +/*! + * \brief The internal function that accepts free inputs and an extra byte to say to enable the LIFO + * check that is useful in debugging the AoT codegen. 
+ * + * \param tvm_runtime_workspace The tvm_workspace_t struct containing state + * \param ptr The base pointer of the tensor to be free'd within the workspace buffer + * \param do_lifo_check This being non-zero indicates to perform a check of the LIFO pattern of Allocs/Frees + */ +tvm_crt_error_t StackMemoryManager_Free_Body(tvm_workspace_t* tvm_runtime_workspace, void* ptr, + uint8_t do_lifo_check); + #ifdef __cplusplus } // extern "C" #endif diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index c3527d87fbf7..a493469a333d 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -60,6 +60,10 @@ constexpr int kTempAllocaAlignment = 128; /*! \brief Maximum size that can be allocated on stack */ constexpr int kMaxStackAlloca = 1024; +/*! \brief Number of bytes each allocation must align to by default in the workspace buffer to + * service intermediate tensors */ +constexpr int kDefaultWorkspaceAlignment = 1; + /*! * \brief TVM Runtime Device API, abstracts the device * specific interface for memory management. diff --git a/include/tvm/tir/analysis.h b/include/tvm/tir/analysis.h index c2b3148e5eb9..3f2fdcecc70f 100644 --- a/include/tvm/tir/analysis.h +++ b/include/tvm/tir/analysis.h @@ -181,8 +181,11 @@ TVM_DLL size_t CalculateExprComplexity(const PrimExpr& expr); /*! * \brief Calculate the workspace size in bytes needed by the TIR allocates inside the TIR PrimFunc * \param func The TIR PrimFunc for which the workspace size to be calculated + * \param workspace_byte_alignment The byte alignment required for each tensor allocated in this + * workspace */ -TVM_DLL size_t CalculateWorkspaceBytes(const PrimFunc& func); +TVM_DLL size_t CalculateWorkspaceBytes(const PrimFunc& func, + const Integer& workspace_byte_alignment); /*! 
* \brief Detect the lowest common ancestor(LCA) of buffer access, including both high-level diff --git a/python/tvm/tir/analysis/analysis.py b/python/tvm/tir/analysis/analysis.py index 1d2e316095ac..030918a5e18e 100644 --- a/python/tvm/tir/analysis/analysis.py +++ b/python/tvm/tir/analysis/analysis.py @@ -133,7 +133,7 @@ def get_block_access_region(block, buffer_var_map): return _ffi_api.get_block_access_region(block, buffer_var_map) -def calculate_workspace_bytes(func: PrimFunc): +def calculate_workspace_bytes(func: PrimFunc, workspace_byte_alignment: int): """Calculate the workspace size in bytes needed by the TIR allocates inside the TIR PrimFunc. @@ -141,13 +141,15 @@ def calculate_workspace_bytes(func: PrimFunc): ---------- func: tvm.tir.PrimFunc The function to be detected. + workspace_byte_alignment : int + The byte alignment required for each tensor Returns ------- result : int Workspace size in bytes. """ - return _ffi_api.calculate_workspace_bytes(func) + return _ffi_api.calculate_workspace_bytes(func, workspace_byte_alignment) def detect_buffer_access_lca(func: PrimFunc) -> Dict[Buffer, Stmt]: diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index ef188b9df175..573206c782f6 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -279,7 +279,9 @@ class AOTExecutorCodegen : public ExprVisitor { * \param func The main function that contains calls to operator tir primitive functions */ void UpdateMainWorkspaceSize(const tir::PrimFunc& primfunc, const relay::Function& func) { - Integer workspace_size = CalculateWorkspaceBytes(primfunc); + auto workspace_byte_alignment = target_host_->GetAttr("workspace-byte-alignment") + .value_or(tvm::runtime::kDefaultWorkspaceAlignment); + Integer workspace_size = CalculateWorkspaceBytes(primfunc, workspace_byte_alignment); // Populate FunctionInfo auto fi_node = make_object(); // Initialize all target workspaces to zero @@ 
-318,7 +320,9 @@ class AOTExecutorCodegen : public ExprVisitor { auto fi_node = make_object(); for (const auto& kv : cfunc->funcs->functions) { auto primfunc = Downcast(kv.second); - Integer workspace_size = CalculateWorkspaceBytes(primfunc); + auto workspace_byte_alignment = + target_host_->GetAttr("workspace-byte-alignment").value_or(16); + Integer workspace_size = CalculateWorkspaceBytes(primfunc, workspace_byte_alignment); Target primfunc_target = relay_target; if (primfunc->attrs->dict.count("target")) { primfunc_target = Downcast(primfunc->attrs->dict["target"]); diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index ddcdeaac5d61..d92d4d2077f7 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -483,7 +483,9 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator(); for (const auto& kv : cfunc->funcs->functions) { auto primfunc = Downcast(kv.second); - Integer workspace_size = CalculateWorkspaceBytes(primfunc); + auto workspace_byte_alignment = relay_target->GetAttr("workspace-byte-alignment") + .value_or(tvm::runtime::kDefaultWorkspaceAlignment); + Integer workspace_size = CalculateWorkspaceBytes(primfunc, workspace_byte_alignment); Target primfunc_target = relay_target; if (primfunc->attrs->dict.count("target")) { primfunc_target = Downcast(primfunc->attrs->dict["target"]); diff --git a/src/runtime/crt/host/crt_config.h b/src/runtime/crt/host/crt_config.h index b0a68c939070..b81a74eb4ae6 100644 --- a/src/runtime/crt/host/crt_config.h +++ b/src/runtime/crt/host/crt_config.h @@ -51,9 +51,6 @@ /*! \brief Maximum length of a PackedFunc function name. */ #define TVM_CRT_MAX_FUNCTION_NAME_LENGTH_BYTES 30 -/*! \brief Enable checks to enforce the stack allocator with a FIFO ordering. 
*/ -#define TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK - // #define TVM_CRT_FRAMER_ENABLE_LOGS #endif // TVM_RUNTIME_CRT_HOST_CRT_CONFIG_H_ diff --git a/src/runtime/crt/memory/stack_allocator.c b/src/runtime/crt/memory/stack_allocator.c index 6722816ec538..7a41ca4241ab 100644 --- a/src/runtime/crt/memory/stack_allocator.c +++ b/src/runtime/crt/memory/stack_allocator.c @@ -18,12 +18,10 @@ */ // LINT_C_FILE #include -#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK -#include -#endif -tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes, - void** current_alloc) { +tvm_crt_error_t StackMemoryManager_Allocate_Body(tvm_workspace_t* tvm_runtime_workspace, + int32_t nbytes, void** current_alloc, + uint8_t do_lifo_check) { // reserve bytes at the end of the allocation such that // next_alloc % TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES == 0. uint32_t offset_bytes = @@ -34,30 +32,51 @@ tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspa } (*current_alloc) = tvm_runtime_workspace->next_alloc; uint8_t* next_alloc = tvm_runtime_workspace->next_alloc + nbytes + offset_bytes; -#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK - if (next_alloc + STACK_ALLOCATOR_TAG_SIZE_BYTES > workspace_end) { - return kTvmErrorPlatformNoMemory; + if (do_lifo_check != 0) { + if (next_alloc + STACK_ALLOCATOR_TAG_SIZE_BYTES > workspace_end) { + return kTvmErrorPlatformNoMemory; + } + const uint32_t total_size = (nbytes + offset_bytes + STACK_ALLOCATOR_TAG_SIZE_BYTES); + *((uint32_t*)next_alloc) = total_size ^ STACK_ALLOCATOR_TAG; + next_alloc += STACK_ALLOCATOR_TAG_SIZE_BYTES; } - const uint32_t total_size = (nbytes + offset_bytes + STACK_ALLOCATOR_TAG_SIZE_BYTES); - *((uint32_t*)next_alloc) = total_size ^ STACK_ALLOCATOR_TAG; - next_alloc += STACK_ALLOCATOR_TAG_SIZE_BYTES; -#endif tvm_runtime_workspace->next_alloc = next_alloc; return kTvmErrorNoError; } -tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, 
void* ptr) { -#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK - uint32_t tag = *(((uint32_t*)tvm_runtime_workspace->next_alloc) - 1); - uint32_t actual_size = (tvm_runtime_workspace->next_alloc - (uint8_t*)ptr); - uint32_t expected_size = tag ^ STACK_ALLOCATOR_TAG; - CHECK_EQ(expected_size, actual_size, "Deallocation not in FIFO ordering"); +tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes, + void** current_alloc) { + uint8_t do_lifo_check = 0; +#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK + do_lifo_check = 1; #endif - tvm_runtime_workspace->next_alloc = ptr; + return StackMemoryManager_Allocate_Body(tvm_runtime_workspace, nbytes, current_alloc, + do_lifo_check); +} + +tvm_crt_error_t StackMemoryManager_Free_Body(tvm_workspace_t* tvm_runtime_workspace, void* ptr, + uint8_t do_lifo_check) { + if (do_lifo_check != 0) { + uint32_t tag = *(((uint32_t*)tvm_runtime_workspace->next_alloc) - 1); + uint32_t actual_size = (tvm_runtime_workspace->next_alloc - (uint8_t*)ptr); + uint32_t expected_size = tag ^ STACK_ALLOCATOR_TAG; + if (expected_size != actual_size) { + return kTvmErrorPlatformStackAllocBadFree; + } + } + tvm_runtime_workspace->next_alloc = (uint8_t*)ptr; return kTvmErrorNoError; } +tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr) { + uint8_t do_lifo_check = 0; +#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK + do_lifo_check = 1; +#endif + return StackMemoryManager_Free_Body(tvm_runtime_workspace, ptr, do_lifo_check); +} + tvm_crt_error_t StackMemoryManager_Init(tvm_workspace_t* tvm_runtime_workspace, uint8_t* g_aot_memory, size_t workspace_size) { tvm_runtime_workspace->next_alloc = g_aot_memory; diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index cc493b984d16..c2ab299bbf1f 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -228,6 +228,7 @@ TVM_REGISTER_TARGET_KIND("c", kDLCPU) .add_attr_option("mcpu") 
.add_attr_option("march") .add_attr_option("executor") + .add_attr_option("workspace-byte-alignment") .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("cuda", kDLCUDA) diff --git a/src/tir/analysis/calculate_workspace.cc b/src/tir/analysis/calculate_workspace.cc index 2f5f5e3a671c..49ddaf613c6d 100644 --- a/src/tir/analysis/calculate_workspace.cc +++ b/src/tir/analysis/calculate_workspace.cc @@ -22,6 +22,7 @@ * \brief Calculate any intermediary memory required by PrimFuncs. */ #include +#include #include #include #include @@ -33,10 +34,12 @@ class WorkspaceCalculator : public StmtExprVisitor { public: WorkspaceCalculator() = default; size_t operator()(const PrimFunc& func); + size_t byte_alignment = tvm::runtime::kDefaultWorkspaceAlignment; private: void VisitStmt_(const AllocateNode* op) override; size_t CalculateExtentsSize(const AllocateNode* op); + size_t GetByteAlignedSize(size_t non_aligned_size); size_t current_size = 0; size_t max_size = 0; }; @@ -46,6 +49,10 @@ size_t WorkspaceCalculator::operator()(const PrimFunc& func) { return this->max_size; } +size_t WorkspaceCalculator::GetByteAlignedSize(size_t non_aligned_size) { + return ((non_aligned_size + byte_alignment - 1) / byte_alignment) * byte_alignment; +} + size_t WorkspaceCalculator::CalculateExtentsSize(const AllocateNode* op) { size_t element_size_bytes = op->dtype.bytes(); size_t num_elements = 1; @@ -57,7 +64,7 @@ size_t WorkspaceCalculator::CalculateExtentsSize(const AllocateNode* op) { num_elements = 0; } } - return num_elements * element_size_bytes; + return GetByteAlignedSize(num_elements * element_size_bytes); } void WorkspaceCalculator::VisitStmt_(const AllocateNode* op) { @@ -70,14 +77,16 @@ void WorkspaceCalculator::VisitStmt_(const AllocateNode* op) { current_size -= size; } -size_t CalculateWorkspaceBytes(const PrimFunc& func) { +size_t CalculateWorkspaceBytes(const PrimFunc& func, const Integer& workspace_byte_alignment) { WorkspaceCalculator wc; + wc.byte_alignment = 
workspace_byte_alignment->value; return wc(func); } -TVM_REGISTER_GLOBAL("tir.analysis.calculate_workspace_bytes").set_body_typed([](PrimFunc func) { - return static_cast(CalculateWorkspaceBytes(func)); -}); +TVM_REGISTER_GLOBAL("tir.analysis.calculate_workspace_bytes") + .set_body_typed([](PrimFunc func, Integer workspace_byte_alignment) { + return static_cast(CalculateWorkspaceBytes(func, workspace_byte_alignment)); + }); } // namespace tir } // namespace tvm diff --git a/tests/crt/aot_memory_test.cc b/tests/crt/aot_memory_test.cc index ecae2ef52f59..abda7bebf766 100644 --- a/tests/crt/aot_memory_test.cc +++ b/tests/crt/aot_memory_test.cc @@ -19,7 +19,11 @@ #include #include +#include "../../src/runtime/crt/memory/stack_allocator.c" #include "platform.cc" + +// Check with LIFO checks enabled for stack allocator +#define TVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK /* * Tests allocations are properly aligned when allocated */ @@ -29,19 +33,23 @@ TEST(AOTMemory, Allocate) { ASSERT_EQ(StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 96), kTvmErrorNoError); void* block_one = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_one, 1), + kTvmErrorNoError); ASSERT_EQ(block_one, &model_memory[0]); void* block_two = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 2, &block_two), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 2, &block_two, 1), + kTvmErrorNoError); ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); void* two_blocks = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 24, &two_blocks), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 24, &two_blocks, 1), + kTvmErrorNoError); ASSERT_EQ(two_blocks, &model_memory[32 + 2 * STACK_ALLOCATOR_TAG_SIZE_BYTES]); void* block_three = 
NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_three), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_three, 1), + kTvmErrorNoError); ASSERT_EQ(block_three, &model_memory[64 + 3 * STACK_ALLOCATOR_TAG_SIZE_BYTES]); } @@ -54,21 +62,25 @@ TEST(AOTMemory, Free) { ASSERT_EQ(StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80), kTvmErrorNoError); void* block_one = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_one, 1), + kTvmErrorNoError); ASSERT_EQ(block_one, &model_memory[0]); void* block_two = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_two), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_two, 1), + kTvmErrorNoError); ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); - ASSERT_EQ(kTvmErrorNoError, StackMemoryManager_Free(&tvm_runtime_workspace, block_two)); + ASSERT_EQ(kTvmErrorNoError, StackMemoryManager_Free_Body(&tvm_runtime_workspace, block_two, 1)); void* two_blocks = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 2, &two_blocks), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 2, &two_blocks, 1), + kTvmErrorNoError); ASSERT_EQ(two_blocks, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); - ASSERT_EQ(kTvmErrorNoError, StackMemoryManager_Free(&tvm_runtime_workspace, two_blocks)); + ASSERT_EQ(kTvmErrorNoError, StackMemoryManager_Free_Body(&tvm_runtime_workspace, two_blocks, 1)); void* block_three = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_three), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_three, 1), + kTvmErrorNoError); ASSERT_EQ(block_three, &model_memory[16 + 
STACK_ALLOCATOR_TAG_SIZE_BYTES]); } @@ -81,15 +93,17 @@ TEST(AOTMemory, OverAllocate) { ASSERT_EQ(StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80), kTvmErrorNoError); void* block_one = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_one, 1), + kTvmErrorNoError); ASSERT_EQ(block_one, &model_memory[0]); void* block_two = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_two), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_two, 1), + kTvmErrorNoError); ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); void* two_blocks = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 64, &two_blocks), + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 64, &two_blocks, 1), kTvmErrorPlatformNoMemory); ASSERT_EQ(two_blocks, (void*)NULL); } @@ -103,15 +117,17 @@ TEST(AOTMemory, FreeOutOfOrder) { ASSERT_EQ(StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80), kTvmErrorNoError); void* block_one = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_one, 1), + kTvmErrorNoError); ASSERT_EQ(block_one, &model_memory[0]); void* block_two = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_two), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_two, 1), + kTvmErrorNoError); ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); - ASSERT_EXIT(StackMemoryManager_Free(&tvm_runtime_workspace, block_one), - ::testing::ExitedWithCode(2), ""); + ASSERT_EQ(StackMemoryManager_Free_Body(&tvm_runtime_workspace, block_one, 1), + kTvmErrorPlatformStackAllocBadFree); } int main(int 
argc, char** argv) { diff --git a/tests/python/relay/aot/aot_test.mk b/tests/python/relay/aot/aot_test.mk index ae8389561459..793a8b1ea69a 100644 --- a/tests/python/relay/aot/aot_test.mk +++ b/tests/python/relay/aot/aot_test.mk @@ -52,23 +52,23 @@ lib_objs =$(source_libs:.c=.o) $(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/aot_executor.o $(source_libs) $(build_dir)/stack_allocator.o $(build_dir)/crt_backend_api.o $(QUIET)mkdir -p $(@D) - $(QUIET)$(CC) $(PKG_CFLAGS) -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) -lm + $(QUIET)$(CC) $(CFLAGS) $(PKG_CFLAGS) -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) -lm $(build_dir)/%.o: $(build_dir)/../codegen/host/src/%.c $(QUIET)mkdir -p $(@D) - $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) + $(QUIET)$(CC) $(CFLAGS) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) $(build_dir)/aot_executor.o: $(TVM_ROOT)/src/runtime/crt/aot_executor/aot_executor.c $(QUIET)mkdir -p $(@D) - $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) + $(QUIET)$(CC) $(CFLAGS) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) $(build_dir)/stack_allocator.o: $(TVM_ROOT)/src/runtime/crt/memory/stack_allocator.c $(QUIET)mkdir -p $(@D) - $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) + $(QUIET)$(CC) $(CFLAGS) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) $(build_dir)/crt_backend_api.o: $(TVM_ROOT)/src/runtime/crt/common/crt_backend_api.c $(QUIET)mkdir -p $(@D) - $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) + $(QUIET)$(CC) $(CFLAGS) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) clean: $(QUIET)rm -rf $(build_dir)/crt diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 8273d3954d3b..8c7aefe70d09 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -24,6 +24,7 @@ import subprocess import tempfile import tarfile +import json import tvm @@ -55,7 +56,7 @@ def 
subprocess_with_stdout_and_log(cmd, cwd, logfile, stdout): print(text, end="") -def create_main(test_name, input_list, output_list, output_path): +def create_main(test_name, input_list, output_list, output_path, workspace_bytes): file_path = pathlib.Path(f"{output_path}/" + test_name).resolve() # create header file raw_path = file_path.with_suffix(".c").resolve() @@ -64,7 +65,7 @@ def create_main(test_name, input_list, output_list, output_path): main_file.write("#include \n") main_file.write('#include "tvm/runtime/crt/internal/aot_executor/aot_executor.h"\n') main_file.write('#include "tvm/runtime/crt/stack_allocator.h"\n') - main_file.write("#define WORKSPACE_SIZE (16384*1024)\n") + main_file.write(f"#define WORKSPACE_SIZE ({workspace_bytes})\n") main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n") for i in range(0, len(input_list)): @@ -157,11 +158,24 @@ def create_header_file(tensor_name, npy_data, output_path): header_file.write("};\n\n") -def compile_and_run(mod, input_list, output_list, params=None): +def extract_main_workspace_sizebytes(extract_dir): + with open(os.path.join(extract_dir, "metadata.json")) as json_f: + metadata = json.load(json_f) + return metadata["memory"]["functions"]["main"][0]["workspace_size_bytes"] + + +def compile_and_run( + mod, input_list, output_list, use_calculated_workspaces, params=None, workspace_byte_alignment=8 +): """ This method verifies the generated source """ - target = "c -runtime=c --link-params --executor=aot" + target = f"c -runtime=c --link-params --executor=aot --workspace-byte-alignment={workspace_byte_alignment}" + cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} " + + # The calculated workspaces will not account for stack allocator tags used for debugging + if not use_calculated_workspaces: + cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK " with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): lib = tvm.relay.build(mod, target, 
target_host=target, params=params) @@ -177,6 +191,10 @@ def compile_and_run(mod, input_list, output_list, params=None): export_model_library_format(lib, tar_file) t = tarfile.open(tar_file) t.extractall(base_path) + if use_calculated_workspaces: + workspace_bytes = extract_main_workspace_sizebytes(base_path) + else: + workspace_bytes = 16384 * 1024 for i in range(len(input_list)): create_header_file((f"input_data{i}"), input_list[i], build_path) @@ -189,12 +207,16 @@ def compile_and_run(mod, input_list, output_list, params=None): ) create_header_file((f"expected_output_data{i}"), output_list[i], build_path) - create_main("test.c", input_list, output_list, build_path) + create_main("test.c", input_list, output_list, build_path, workspace_bytes) # Verify that compiles fine file_dir = os.path.dirname(os.path.abspath(__file__)) makefile = os.path.join(file_dir, "aot_test.mk") - make_cmd = f"make -f {makefile} build_dir=" + build_path + f" TVM_ROOT={file_dir}/../../../.." + make_cmd = ( + f"make CFLAGS='{cflags}' -f {makefile} build_dir=" + + build_path + + f" TVM_ROOT={file_dir}/../../../.." 
+ ) compile_log_path = os.path.join(build_path, "test_compile.log") ret = subprocess_with_stdout_and_log(make_cmd, ".", compile_log_path, False) diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 0f1f2ad369e7..02b4de3a64f3 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -43,7 +43,8 @@ from aot_test_utils import * -def test_conv_with_params(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_conv_with_params(use_calculated_workspaces): RELAY_MODEL = """ #[version = "0.0.5"] def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), int8]) { @@ -72,10 +73,11 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), output_list = generate_ref_data(mod, inputs, params) input_list = [input_data] - compile_and_run(mod, input_list, output_list, params) + compile_and_run(mod, input_list, output_list, use_calculated_workspaces, params) -def test_add_with_params(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_add_with_params(use_calculated_workspaces): x = relay.var("x", shape=(1, 10)) y = relay.var("y", shape=(1, 10)) z = relay.add(x, y) @@ -89,10 +91,11 @@ def test_add_with_params(): output_list = generate_ref_data(func, inputs, params) input_list = [y_in] - compile_and_run(func, input_list, output_list, params) + compile_and_run(func, input_list, output_list, use_calculated_workspaces, params) -def test_conv2d(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_conv2d(use_calculated_workspaces): """Test a subgraph with a single conv2d operator.""" def conv2d_direct(): @@ -134,10 +137,11 @@ def group_conv2d(): for mod, inputs, out_shape in [conv2d_direct(), group_conv2d()]: output_list = generate_ref_data(mod, inputs) input_list = [inputs["data"], inputs["weight"]] - compile_and_run(mod, input_list, output_list) + compile_and_run(mod, 
input_list, output_list, use_calculated_workspaces) -def test_concatenate(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_concatenate(use_calculated_workspaces): dtype = "float32" x = relay.var("x", shape=(10, 5), dtype=dtype) y = relay.var("y", shape=(10, 5), dtype=dtype) @@ -153,10 +157,11 @@ def test_concatenate(): output_list = generate_ref_data(func, inputs) input_list = [inputs["x"], inputs["y"], inputs["z"]] - compile_and_run(func, input_list, output_list) + compile_and_run(func, input_list, output_list, use_calculated_workspaces) -def test_nested_tuples(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_nested_tuples(use_calculated_workspaces): x = relay.var("x", shape=(10,)) x1 = x + relay.const(1.0) x2 = x1 + relay.const(1.0) @@ -169,35 +174,39 @@ def test_nested_tuples(): inputs = {"x": x_data} output_list = generate_ref_data(func, inputs) input_list = [x_data] - compile_and_run(func, input_list, output_list) + compile_and_run(func, input_list, output_list, use_calculated_workspaces) -def test_tuple_getitem(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_tuple_getitem(use_calculated_workspaces): func = relay.Function([], relay.TupleGetItem(relay.Tuple([relay.const(1), relay.const(2)]), 0)) output_list = generate_ref_data(func, {}) input_list = [] - compile_and_run(func, input_list, output_list) + compile_and_run(func, input_list, output_list, use_calculated_workspaces) -def test_id(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_id(use_calculated_workspaces): x = relay.var("x", "float32") ident = relay.Function([x], x) one = np.array(1.0, "float32") inputs = {"x": one} output_list = generate_ref_data(ident, inputs) input_list = [one] - compile_and_run(ident, input_list, output_list) + compile_and_run(ident, input_list, output_list, use_calculated_workspaces) -def test_add_const(): 
+@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_add_const(use_calculated_workspaces): two = relay.add(relay.const(1), relay.const(1)) func = relay.Function([], two) output_list = generate_ref_data(func, {}) input_list = [] - compile_and_run(func, input_list, output_list) + compile_and_run(func, input_list, output_list, use_calculated_workspaces) -def test_mul_param(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_mul_param(use_calculated_workspaces): x = relay.var("x", shape=(10, 10)) y = relay.var("y", shape=(1, 10)) func = relay.Function([x, y], relay.multiply(x, y)) @@ -206,10 +215,11 @@ def test_mul_param(): inputs = {"x": x_data, "y": y_data} output_list = generate_ref_data(func, inputs) input_list = [inputs["x"], inputs["y"]] - compile_and_run(func, input_list, output_list) + compile_and_run(func, input_list, output_list, use_calculated_workspaces) -def test_subtract(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_subtract(use_calculated_workspaces): i = relay.var("i", shape=[], dtype="int32") sub = relay.subtract(i, relay.const(1, dtype="int32")) func = relay.Function([i], sub, ret_type=relay.TensorType([], "int32")) @@ -217,10 +227,11 @@ def test_subtract(): inputs = {"i": i_data} output_list = generate_ref_data(func, inputs) input_list = [inputs["i"]] - compile_and_run(func, input_list, output_list) + compile_and_run(func, input_list, output_list, use_calculated_workspaces) -def test_tuple_output(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_tuple_output(use_calculated_workspaces): x = relay.var("x", shape=(6, 9)) y = relay.split(x, 3).astuple() a = relay.TupleGetItem(y, 0) @@ -232,17 +243,24 @@ def test_tuple_output(): inputs = {"x": x_data} output_list = generate_ref_data(func, inputs) input_list = [inputs["x"]] - compile_and_run(func, input_list, output_list) + compile_and_run(func, input_list, output_list, 
use_calculated_workspaces) -def test_mobilenet(): +@pytest.mark.parametrize( + "use_calculated_workspaces_and_alignment", [(True, 1), (True, 16), (False, 1)] +) +def test_mobilenet(use_calculated_workspaces_and_alignment): + use_calculated_workspaces = use_calculated_workspaces_and_alignment[0] + workspace_byte_alignment = use_calculated_workspaces_and_alignment[1] mod, params = testing.mobilenet.get_workload(batch_size=1) data_shape = [int(x) for x in mod["main"].checked_type.arg_types[0].shape] data = np.random.uniform(size=data_shape).astype("float32") inputs = {"data": data} output_list = generate_ref_data(mod, inputs, params) input_list = [inputs["data"]] - compile_and_run(mod, input_list, output_list, params) + compile_and_run( + mod, input_list, output_list, use_calculated_workspaces, params, workspace_byte_alignment + ) class CcompilerAnnotator(ExprMutator): @@ -299,7 +317,8 @@ def visit_call(self, call): return super().visit_call(call) -def test_byoc_utvm(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_byoc_utvm(use_calculated_workspaces): """This is a simple test case to check BYOC capabilities of AOT""" x = relay.var("x", shape=(10, 10)) w0 = relay.var("w0", shape=(10, 10)) @@ -342,7 +361,7 @@ def test_byoc_utvm(): output_list = generate_ref_data(mod, map_inputs) input_list = [map_inputs["x"]] input_list.extend([map_inputs["w{}".format(i)] for i in range(8)]) - compile_and_run(mod, input_list, output_list) + compile_and_run(mod, input_list, output_list, use_calculated_workspaces) if __name__ == "__main__": diff --git a/tests/python/unittest/test_tir_analysis_calculate_workspace.py b/tests/python/unittest/test_tir_analysis_calculate_workspace.py index 284ba633f2b8..190d1820c1f4 100644 --- a/tests/python/unittest/test_tir_analysis_calculate_workspace.py +++ b/tests/python/unittest/test_tir_analysis_calculate_workspace.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
import numpy as np +import pytest import tvm from tvm import tir, script @@ -91,14 +92,20 @@ def primfunc_local_allocates(placeholder_162: ty.handle, placeholder_163: ty.han # fmt: on -def test_global_allocates(): +@pytest.mark.parametrize("alignment_and_size", [(1, 663552), (10, 663560)]) +def test_global_allocates(alignment_and_size): + alignment = alignment_and_size[0] + size = alignment_and_size[1] primfunc = primfunc_global_allocates - assert tvm.tir.analysis.calculate_workspace_bytes(primfunc) == 663552 + assert tvm.tir.analysis.calculate_workspace_bytes(primfunc, alignment) == size -def test_local_allocates(): +@pytest.mark.parametrize("alignment_and_size", [(1, 1566720), (100, 1567100)]) +def test_local_allocates(alignment_and_size): + alignment = alignment_and_size[0] + size = alignment_and_size[1] primfunc = primfunc_local_allocates - assert tvm.tir.analysis.calculate_workspace_bytes(primfunc) == 1566720 + assert tvm.tir.analysis.calculate_workspace_bytes(primfunc, alignment) == size if __name__ == "__main__":