diff --git a/include/tvm/runtime/crt/error_codes.h b/include/tvm/runtime/crt/error_codes.h index 4cbfb0aab8e2..d1a8619e8233 100644 --- a/include/tvm/runtime/crt/error_codes.h +++ b/include/tvm/runtime/crt/error_codes.h @@ -79,6 +79,7 @@ typedef enum { kTvmErrorPlatformShutdown = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 2), kTvmErrorPlatformNoMemory = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 3), kTvmErrorPlatformTimerBadState = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 4), + kTvmErrorPlatformStackAllocBadFree = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 5), // Common error codes returned from generated functions. kTvmErrorGeneratedInvalidStorageId = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGenerated, 0), diff --git a/include/tvm/runtime/crt/stack_allocator.h b/include/tvm/runtime/crt/stack_allocator.h index daa403cb2764..4184dff7510e 100644 --- a/include/tvm/runtime/crt/stack_allocator.h +++ b/include/tvm/runtime/crt/stack_allocator.h @@ -45,14 +45,61 @@ typedef struct { size_t workspace_size; // Total number of bytes in the workspace } tvm_workspace_t; +/*! + * \brief Initialize the stack-based memory manager + * + * \param tvm_runtime_workspace The tvm_workspace_t struct containing state + * \param g_aot_memory The memory buffer used to allocate within + * \param workspace_size The total size of the workspace buffer + */ tvm_crt_error_t StackMemoryManager_Init(tvm_workspace_t* tvm_runtime_workspace, uint8_t* g_aot_memory, size_t workspace_size); +/*! + * \brief The intended user-facing function to allocate within the buffer. It wraps + * StackMemoryManager_Allocate_Body to enable and disable the LIFO check that is useful for debugging + * the AoT codegen. 
+ * + * \param tvm_runtime_workspace The tvm_workspace_t struct containing state + * \param nbytes The number of bytes required for the allocation + * \param current_alloc The pointer-to-pointer to be populated with the allocated address + */ tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes, - void**); + void** current_alloc); + +/*! + * \brief The internal function that accepts allocate inputs and an extra byte to say to enable the + * LIFO check that is useful in debugging the AoT codegen. + * + * \param tvm_runtime_workspace The tvm_workspace_t struct containing state + * \param nbytes The number of bytes required for the allocation + * \param current_alloc The pointer-to-pointer to be populated with the allocated address + * \param do_lifo_check This being non-zero indicates to perform a check of the LIFO pattern of Allocs/Frees + */ +tvm_crt_error_t StackMemoryManager_Allocate_Body(tvm_workspace_t* tvm_runtime_workspace, + int32_t nbytes, void** current_alloc, + uint8_t do_lifo_check); +/*! + * \brief The intended user-facing function to free the tensor within the buffer. It wraps + * StackMemoryManager_Free_Body to enable and disable the LIFO check + * + * \param tvm_runtime_workspace The tvm_workspace_t struct containing state + * \param ptr The base pointer of the tensor to be free'd + */ tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr); +/*! + * \brief The internal function that accepts free inputs and an extra byte to say to enable the LIFO + * check that is useful in debugging the AoT codegen. 
+ * + * \param tvm_runtime_workspace The tvm_workspace_t struct containing state + * \param ptr The base pointer of the tensor to be free'd within the workspace buffer + * \param do_lifo_check This being non-zero indicates to perform a check of the LIFO pattern of Allocs/Frees + */ +tvm_crt_error_t StackMemoryManager_Free_Body(tvm_workspace_t* tvm_runtime_workspace, void* ptr, + uint8_t do_lifo_check); + #ifdef __cplusplus } // extern "C" #endif diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index c3527d87fbf7..a493469a333d 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -60,6 +60,10 @@ constexpr int kTempAllocaAlignment = 128; /*! \brief Maximum size that can be allocated on stack */ constexpr int kMaxStackAlloca = 1024; +/*! \brief Number of bytes each allocation must align to by default in the workspace buffer to + * service intermediate tensors */ +constexpr int kDefaultWorkspaceAlignment = 1; + /*! * \brief TVM Runtime Device API, abstracts the device * specific interface for memory management. diff --git a/include/tvm/tir/analysis.h b/include/tvm/tir/analysis.h index c2b3148e5eb9..3f2fdcecc70f 100644 --- a/include/tvm/tir/analysis.h +++ b/include/tvm/tir/analysis.h @@ -181,8 +181,11 @@ TVM_DLL size_t CalculateExprComplexity(const PrimExpr& expr); /*! * \brief Calculate the workspace size in bytes needed by the TIR allocates inside the TIR PrimFunc * \param func The TIR PrimFunc for which the workspace size to be calculated + * \param workspace_byte_alignment The byte alignment required for each tensor allocated in this + * workspace */ -TVM_DLL size_t CalculateWorkspaceBytes(const PrimFunc& func); +TVM_DLL size_t CalculateWorkspaceBytes(const PrimFunc& func, + const Integer& workspace_byte_alignment); /*! 
* \brief Detect the lowest common ancestor(LCA) of buffer access, including both high-level diff --git a/python/tvm/tir/analysis/analysis.py b/python/tvm/tir/analysis/analysis.py index 1d2e316095ac..030918a5e18e 100644 --- a/python/tvm/tir/analysis/analysis.py +++ b/python/tvm/tir/analysis/analysis.py @@ -133,7 +133,7 @@ def get_block_access_region(block, buffer_var_map): return _ffi_api.get_block_access_region(block, buffer_var_map) -def calculate_workspace_bytes(func: PrimFunc): +def calculate_workspace_bytes(func: PrimFunc, workspace_byte_alignment: int): """Calculate the workspace size in bytes needed by the TIR allocates inside the TIR PrimFunc. @@ -141,13 +141,15 @@ def calculate_workspace_bytes(func: PrimFunc): ---------- func: tvm.tir.PrimFunc The function to be detected. + workspace_byte_alignment : int + The byte alignment required for each tensor Returns ------- result : int Workspace size in bytes. """ - return _ffi_api.calculate_workspace_bytes(func) + return _ffi_api.calculate_workspace_bytes(func, workspace_byte_alignment) def detect_buffer_access_lca(func: PrimFunc) -> Dict[Buffer, Stmt]: diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index ef188b9df175..573206c782f6 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -279,7 +279,9 @@ class AOTExecutorCodegen : public ExprVisitor { * \param func The main function that contains calls to operator tir primitive functions */ void UpdateMainWorkspaceSize(const tir::PrimFunc& primfunc, const relay::Function& func) { - Integer workspace_size = CalculateWorkspaceBytes(primfunc); + auto workspace_byte_alignment = target_host_->GetAttr("workspace-byte-alignment") + .value_or(tvm::runtime::kDefaultWorkspaceAlignment); + Integer workspace_size = CalculateWorkspaceBytes(primfunc, workspace_byte_alignment); // Populate FunctionInfo auto fi_node = make_object(); // Initialize all target workspaces to zero @@ 
-318,7 +320,9 @@ class AOTExecutorCodegen : public ExprVisitor { auto fi_node = make_object(); for (const auto& kv : cfunc->funcs->functions) { auto primfunc = Downcast(kv.second); - Integer workspace_size = CalculateWorkspaceBytes(primfunc); + auto workspace_byte_alignment = + target_host_->GetAttr("workspace-byte-alignment").value_or(16); + Integer workspace_size = CalculateWorkspaceBytes(primfunc, workspace_byte_alignment); Target primfunc_target = relay_target; if (primfunc->attrs->dict.count("target")) { primfunc_target = Downcast(primfunc->attrs->dict["target"]); diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index ddcdeaac5d61..d92d4d2077f7 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -483,7 +483,9 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator(); for (const auto& kv : cfunc->funcs->functions) { auto primfunc = Downcast(kv.second); - Integer workspace_size = CalculateWorkspaceBytes(primfunc); + auto workspace_byte_alignment = relay_target->GetAttr("workspace-byte-alignment") + .value_or(tvm::runtime::kDefaultWorkspaceAlignment); + Integer workspace_size = CalculateWorkspaceBytes(primfunc, workspace_byte_alignment); Target primfunc_target = relay_target; if (primfunc->attrs->dict.count("target")) { primfunc_target = Downcast(primfunc->attrs->dict["target"]); diff --git a/src/runtime/crt/host/crt_config.h b/src/runtime/crt/host/crt_config.h index b0a68c939070..b81a74eb4ae6 100644 --- a/src/runtime/crt/host/crt_config.h +++ b/src/runtime/crt/host/crt_config.h @@ -51,9 +51,6 @@ /*! \brief Maximum length of a PackedFunc function name. */ #define TVM_CRT_MAX_FUNCTION_NAME_LENGTH_BYTES 30 -/*! \brief Enable checks to enforce the stack allocator with a FIFO ordering. 
*/ -#define TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK - // #define TVM_CRT_FRAMER_ENABLE_LOGS #endif // TVM_RUNTIME_CRT_HOST_CRT_CONFIG_H_ diff --git a/src/runtime/crt/memory/stack_allocator.c b/src/runtime/crt/memory/stack_allocator.c index 6722816ec538..7a41ca4241ab 100644 --- a/src/runtime/crt/memory/stack_allocator.c +++ b/src/runtime/crt/memory/stack_allocator.c @@ -18,12 +18,10 @@ */ // LINT_C_FILE #include -#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK -#include -#endif -tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes, - void** current_alloc) { +tvm_crt_error_t StackMemoryManager_Allocate_Body(tvm_workspace_t* tvm_runtime_workspace, + int32_t nbytes, void** current_alloc, + uint8_t do_lifo_check) { // reserve bytes at the end of the allocation such that // next_alloc % TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES == 0. uint32_t offset_bytes = @@ -34,30 +32,51 @@ tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspa } (*current_alloc) = tvm_runtime_workspace->next_alloc; uint8_t* next_alloc = tvm_runtime_workspace->next_alloc + nbytes + offset_bytes; -#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK - if (next_alloc + STACK_ALLOCATOR_TAG_SIZE_BYTES > workspace_end) { - return kTvmErrorPlatformNoMemory; + if (do_lifo_check != 0) { + if (next_alloc + STACK_ALLOCATOR_TAG_SIZE_BYTES > workspace_end) { + return kTvmErrorPlatformNoMemory; + } + const uint32_t total_size = (nbytes + offset_bytes + STACK_ALLOCATOR_TAG_SIZE_BYTES); + *((uint32_t*)next_alloc) = total_size ^ STACK_ALLOCATOR_TAG; + next_alloc += STACK_ALLOCATOR_TAG_SIZE_BYTES; } - const uint32_t total_size = (nbytes + offset_bytes + STACK_ALLOCATOR_TAG_SIZE_BYTES); - *((uint32_t*)next_alloc) = total_size ^ STACK_ALLOCATOR_TAG; - next_alloc += STACK_ALLOCATOR_TAG_SIZE_BYTES; -#endif tvm_runtime_workspace->next_alloc = next_alloc; return kTvmErrorNoError; } -tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, 
void* ptr) { -#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK - uint32_t tag = *(((uint32_t*)tvm_runtime_workspace->next_alloc) - 1); - uint32_t actual_size = (tvm_runtime_workspace->next_alloc - (uint8_t*)ptr); - uint32_t expected_size = tag ^ STACK_ALLOCATOR_TAG; - CHECK_EQ(expected_size, actual_size, "Deallocation not in FIFO ordering"); +tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes, + void** current_alloc) { + uint8_t do_lifo_check = 0; +#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK + do_lifo_check = 1; #endif - tvm_runtime_workspace->next_alloc = ptr; + return StackMemoryManager_Allocate_Body(tvm_runtime_workspace, nbytes, current_alloc, + do_lifo_check); +} + +tvm_crt_error_t StackMemoryManager_Free_Body(tvm_workspace_t* tvm_runtime_workspace, void* ptr, + uint8_t do_lifo_check) { + if (do_lifo_check != 0) { + uint32_t tag = *(((uint32_t*)tvm_runtime_workspace->next_alloc) - 1); + uint32_t actual_size = (tvm_runtime_workspace->next_alloc - (uint8_t*)ptr); + uint32_t expected_size = tag ^ STACK_ALLOCATOR_TAG; + if (expected_size != actual_size) { + return kTvmErrorPlatformStackAllocBadFree; + } + } + tvm_runtime_workspace->next_alloc = (uint8_t*)ptr; return kTvmErrorNoError; } +tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr) { + uint8_t do_lifo_check = 0; +#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK + do_lifo_check = 1; +#endif + return StackMemoryManager_Free_Body(tvm_runtime_workspace, ptr, do_lifo_check); +} + tvm_crt_error_t StackMemoryManager_Init(tvm_workspace_t* tvm_runtime_workspace, uint8_t* g_aot_memory, size_t workspace_size) { tvm_runtime_workspace->next_alloc = g_aot_memory; diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index cc493b984d16..c2ab299bbf1f 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -228,6 +228,7 @@ TVM_REGISTER_TARGET_KIND("c", kDLCPU) .add_attr_option("mcpu") 
.add_attr_option("march") .add_attr_option("executor") + .add_attr_option("workspace-byte-alignment") .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("cuda", kDLCUDA) diff --git a/src/tir/analysis/calculate_workspace.cc b/src/tir/analysis/calculate_workspace.cc index 2f5f5e3a671c..49ddaf613c6d 100644 --- a/src/tir/analysis/calculate_workspace.cc +++ b/src/tir/analysis/calculate_workspace.cc @@ -22,6 +22,7 @@ * \brief Calculate any intermediary memory required by PrimFuncs. */ #include +#include #include #include #include @@ -33,10 +34,12 @@ class WorkspaceCalculator : public StmtExprVisitor { public: WorkspaceCalculator() = default; size_t operator()(const PrimFunc& func); + size_t byte_alignment = tvm::runtime::kDefaultWorkspaceAlignment; private: void VisitStmt_(const AllocateNode* op) override; size_t CalculateExtentsSize(const AllocateNode* op); + size_t GetByteAlignedSize(size_t non_aligned_size); size_t current_size = 0; size_t max_size = 0; }; @@ -46,6 +49,10 @@ size_t WorkspaceCalculator::operator()(const PrimFunc& func) { return this->max_size; } +size_t WorkspaceCalculator::GetByteAlignedSize(size_t non_aligned_size) { + return ((non_aligned_size + byte_alignment - 1) / byte_alignment) * byte_alignment; +} + size_t WorkspaceCalculator::CalculateExtentsSize(const AllocateNode* op) { size_t element_size_bytes = op->dtype.bytes(); size_t num_elements = 1; @@ -57,7 +64,7 @@ size_t WorkspaceCalculator::CalculateExtentsSize(const AllocateNode* op) { num_elements = 0; } } - return num_elements * element_size_bytes; + return GetByteAlignedSize(num_elements * element_size_bytes); } void WorkspaceCalculator::VisitStmt_(const AllocateNode* op) { @@ -70,14 +77,16 @@ void WorkspaceCalculator::VisitStmt_(const AllocateNode* op) { current_size -= size; } -size_t CalculateWorkspaceBytes(const PrimFunc& func) { +size_t CalculateWorkspaceBytes(const PrimFunc& func, const Integer& workspace_byte_alignment) { WorkspaceCalculator wc; + wc.byte_alignment = 
workspace_byte_alignment->value; return wc(func); } -TVM_REGISTER_GLOBAL("tir.analysis.calculate_workspace_bytes").set_body_typed([](PrimFunc func) { - return static_cast(CalculateWorkspaceBytes(func)); -}); +TVM_REGISTER_GLOBAL("tir.analysis.calculate_workspace_bytes") + .set_body_typed([](PrimFunc func, Integer workspace_byte_alignment) { + return static_cast(CalculateWorkspaceBytes(func, workspace_byte_alignment)); + }); } // namespace tir } // namespace tvm diff --git a/tests/crt/aot_memory_test.cc b/tests/crt/aot_memory_test.cc index ecae2ef52f59..abda7bebf766 100644 --- a/tests/crt/aot_memory_test.cc +++ b/tests/crt/aot_memory_test.cc @@ -19,7 +19,11 @@ #include #include +#include "../../src/runtime/crt/memory/stack_allocator.c" #include "platform.cc" + +// Check with LIFO checks enabled for stack allocator +#define TVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK /* * Tests allocations are properly aligned when allocated */ @@ -29,19 +33,23 @@ TEST(AOTMemory, Allocate) { ASSERT_EQ(StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 96), kTvmErrorNoError); void* block_one = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_one, 1), + kTvmErrorNoError); ASSERT_EQ(block_one, &model_memory[0]); void* block_two = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 2, &block_two), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 2, &block_two, 1), + kTvmErrorNoError); ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); void* two_blocks = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 24, &two_blocks), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 24, &two_blocks, 1), + kTvmErrorNoError); ASSERT_EQ(two_blocks, &model_memory[32 + 2 * STACK_ALLOCATOR_TAG_SIZE_BYTES]); void* block_three = 
NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_three), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_three, 1), + kTvmErrorNoError); ASSERT_EQ(block_three, &model_memory[64 + 3 * STACK_ALLOCATOR_TAG_SIZE_BYTES]); } @@ -54,21 +62,25 @@ TEST(AOTMemory, Free) { ASSERT_EQ(StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80), kTvmErrorNoError); void* block_one = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_one, 1), + kTvmErrorNoError); ASSERT_EQ(block_one, &model_memory[0]); void* block_two = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_two), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_two, 1), + kTvmErrorNoError); ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); - ASSERT_EQ(kTvmErrorNoError, StackMemoryManager_Free(&tvm_runtime_workspace, block_two)); + ASSERT_EQ(kTvmErrorNoError, StackMemoryManager_Free_Body(&tvm_runtime_workspace, block_two, 1)); void* two_blocks = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 2, &two_blocks), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 2, &two_blocks, 1), + kTvmErrorNoError); ASSERT_EQ(two_blocks, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); - ASSERT_EQ(kTvmErrorNoError, StackMemoryManager_Free(&tvm_runtime_workspace, two_blocks)); + ASSERT_EQ(kTvmErrorNoError, StackMemoryManager_Free_Body(&tvm_runtime_workspace, two_blocks, 1)); void* block_three = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_three), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_three, 1), + kTvmErrorNoError); ASSERT_EQ(block_three, &model_memory[16 + 
STACK_ALLOCATOR_TAG_SIZE_BYTES]); } @@ -81,15 +93,17 @@ TEST(AOTMemory, OverAllocate) { ASSERT_EQ(StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80), kTvmErrorNoError); void* block_one = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_one, 1), + kTvmErrorNoError); ASSERT_EQ(block_one, &model_memory[0]); void* block_two = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_two), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_two, 1), + kTvmErrorNoError); ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); void* two_blocks = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 64, &two_blocks), + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 64, &two_blocks, 1), kTvmErrorPlatformNoMemory); ASSERT_EQ(two_blocks, (void*)NULL); } @@ -103,15 +117,17 @@ TEST(AOTMemory, FreeOutOfOrder) { ASSERT_EQ(StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80), kTvmErrorNoError); void* block_one = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_one, 1), + kTvmErrorNoError); ASSERT_EQ(block_one, &model_memory[0]); void* block_two = NULL; - ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_two), kTvmErrorNoError); + ASSERT_EQ(StackMemoryManager_Allocate_Body(&tvm_runtime_workspace, 1, &block_two, 1), + kTvmErrorNoError); ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); - ASSERT_EXIT(StackMemoryManager_Free(&tvm_runtime_workspace, block_one), - ::testing::ExitedWithCode(2), ""); + ASSERT_EQ(StackMemoryManager_Free_Body(&tvm_runtime_workspace, block_one, 1), + kTvmErrorPlatformStackAllocBadFree); } int main(int 
argc, char** argv) { diff --git a/tests/python/relay/aot/aot_test.mk b/tests/python/relay/aot/aot_test.mk index ae8389561459..793a8b1ea69a 100644 --- a/tests/python/relay/aot/aot_test.mk +++ b/tests/python/relay/aot/aot_test.mk @@ -52,23 +52,23 @@ lib_objs =$(source_libs:.c=.o) $(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/aot_executor.o $(source_libs) $(build_dir)/stack_allocator.o $(build_dir)/crt_backend_api.o $(QUIET)mkdir -p $(@D) - $(QUIET)$(CC) $(PKG_CFLAGS) -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) -lm + $(QUIET)$(CC) $(CFLAGS) $(PKG_CFLAGS) -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) -lm $(build_dir)/%.o: $(build_dir)/../codegen/host/src/%.c $(QUIET)mkdir -p $(@D) - $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) + $(QUIET)$(CC) $(CFLAGS) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) $(build_dir)/aot_executor.o: $(TVM_ROOT)/src/runtime/crt/aot_executor/aot_executor.c $(QUIET)mkdir -p $(@D) - $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) + $(QUIET)$(CC) $(CFLAGS) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) $(build_dir)/stack_allocator.o: $(TVM_ROOT)/src/runtime/crt/memory/stack_allocator.c $(QUIET)mkdir -p $(@D) - $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) + $(QUIET)$(CC) $(CFLAGS) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) $(build_dir)/crt_backend_api.o: $(TVM_ROOT)/src/runtime/crt/common/crt_backend_api.c $(QUIET)mkdir -p $(@D) - $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) + $(QUIET)$(CC) $(CFLAGS) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) clean: $(QUIET)rm -rf $(build_dir)/crt diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 8273d3954d3b..8c7aefe70d09 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -24,6 +24,7 @@ import subprocess import tempfile import tarfile +import json import tvm @@ -55,7 +56,7 @@ def 
subprocess_with_stdout_and_log(cmd, cwd, logfile, stdout): print(text, end="") -def create_main(test_name, input_list, output_list, output_path): +def create_main(test_name, input_list, output_list, output_path, workspace_bytes): file_path = pathlib.Path(f"{output_path}/" + test_name).resolve() # create header file raw_path = file_path.with_suffix(".c").resolve() @@ -64,7 +65,7 @@ def create_main(test_name, input_list, output_list, output_path): main_file.write("#include \n") main_file.write('#include "tvm/runtime/crt/internal/aot_executor/aot_executor.h"\n') main_file.write('#include "tvm/runtime/crt/stack_allocator.h"\n') - main_file.write("#define WORKSPACE_SIZE (16384*1024)\n") + main_file.write(f"#define WORKSPACE_SIZE ({workspace_bytes})\n") main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n") for i in range(0, len(input_list)): @@ -157,11 +158,24 @@ def create_header_file(tensor_name, npy_data, output_path): header_file.write("};\n\n") -def compile_and_run(mod, input_list, output_list, params=None): +def extract_main_workspace_sizebytes(extract_dir): + with open(os.path.join(extract_dir, "metadata.json")) as json_f: + metadata = json.load(json_f) + return metadata["memory"]["functions"]["main"][0]["workspace_size_bytes"] + + +def compile_and_run( + mod, input_list, output_list, use_calculated_workspaces, params=None, workspace_byte_alignment=8 +): """ This method verifies the generated source """ - target = "c -runtime=c --link-params --executor=aot" + target = f"c -runtime=c --link-params --executor=aot --workspace-byte-alignment={workspace_byte_alignment}" + cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} " + + # The calculated workspaces will not account for stack allocator tags used for debugging + if not use_calculated_workspaces: + cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK " with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): lib = tvm.relay.build(mod, target, 
target_host=target, params=params) @@ -177,6 +191,10 @@ def compile_and_run(mod, input_list, output_list, params=None): export_model_library_format(lib, tar_file) t = tarfile.open(tar_file) t.extractall(base_path) + if use_calculated_workspaces: + workspace_bytes = extract_main_workspace_sizebytes(base_path) + else: + workspace_bytes = 16384 * 1024 for i in range(len(input_list)): create_header_file((f"input_data{i}"), input_list[i], build_path) @@ -189,12 +207,16 @@ def compile_and_run(mod, input_list, output_list, params=None): ) create_header_file((f"expected_output_data{i}"), output_list[i], build_path) - create_main("test.c", input_list, output_list, build_path) + create_main("test.c", input_list, output_list, build_path, workspace_bytes) # Verify that compiles fine file_dir = os.path.dirname(os.path.abspath(__file__)) makefile = os.path.join(file_dir, "aot_test.mk") - make_cmd = f"make -f {makefile} build_dir=" + build_path + f" TVM_ROOT={file_dir}/../../../.." + make_cmd = ( + f"make CFLAGS='{cflags}' -f {makefile} build_dir=" + + build_path + + f" TVM_ROOT={file_dir}/../../../.." 
+ ) compile_log_path = os.path.join(build_path, "test_compile.log") ret = subprocess_with_stdout_and_log(make_cmd, ".", compile_log_path, False) diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 0f1f2ad369e7..02b4de3a64f3 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -43,7 +43,8 @@ from aot_test_utils import * -def test_conv_with_params(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_conv_with_params(use_calculated_workspaces): RELAY_MODEL = """ #[version = "0.0.5"] def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), int8]) { @@ -72,10 +73,11 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), output_list = generate_ref_data(mod, inputs, params) input_list = [input_data] - compile_and_run(mod, input_list, output_list, params) + compile_and_run(mod, input_list, output_list, use_calculated_workspaces, params) -def test_add_with_params(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_add_with_params(use_calculated_workspaces): x = relay.var("x", shape=(1, 10)) y = relay.var("y", shape=(1, 10)) z = relay.add(x, y) @@ -89,10 +91,11 @@ def test_add_with_params(): output_list = generate_ref_data(func, inputs, params) input_list = [y_in] - compile_and_run(func, input_list, output_list, params) + compile_and_run(func, input_list, output_list, use_calculated_workspaces, params) -def test_conv2d(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_conv2d(use_calculated_workspaces): """Test a subgraph with a single conv2d operator.""" def conv2d_direct(): @@ -134,10 +137,11 @@ def group_conv2d(): for mod, inputs, out_shape in [conv2d_direct(), group_conv2d()]: output_list = generate_ref_data(mod, inputs) input_list = [inputs["data"], inputs["weight"]] - compile_and_run(mod, input_list, output_list) + compile_and_run(mod, 
input_list, output_list, use_calculated_workspaces) -def test_concatenate(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_concatenate(use_calculated_workspaces): dtype = "float32" x = relay.var("x", shape=(10, 5), dtype=dtype) y = relay.var("y", shape=(10, 5), dtype=dtype) @@ -153,10 +157,11 @@ def test_concatenate(): output_list = generate_ref_data(func, inputs) input_list = [inputs["x"], inputs["y"], inputs["z"]] - compile_and_run(func, input_list, output_list) + compile_and_run(func, input_list, output_list, use_calculated_workspaces) -def test_nested_tuples(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_nested_tuples(use_calculated_workspaces): x = relay.var("x", shape=(10,)) x1 = x + relay.const(1.0) x2 = x1 + relay.const(1.0) @@ -169,35 +174,39 @@ def test_nested_tuples(): inputs = {"x": x_data} output_list = generate_ref_data(func, inputs) input_list = [x_data] - compile_and_run(func, input_list, output_list) + compile_and_run(func, input_list, output_list, use_calculated_workspaces) -def test_tuple_getitem(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_tuple_getitem(use_calculated_workspaces): func = relay.Function([], relay.TupleGetItem(relay.Tuple([relay.const(1), relay.const(2)]), 0)) output_list = generate_ref_data(func, {}) input_list = [] - compile_and_run(func, input_list, output_list) + compile_and_run(func, input_list, output_list, use_calculated_workspaces) -def test_id(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_id(use_calculated_workspaces): x = relay.var("x", "float32") ident = relay.Function([x], x) one = np.array(1.0, "float32") inputs = {"x": one} output_list = generate_ref_data(ident, inputs) input_list = [one] - compile_and_run(ident, input_list, output_list) + compile_and_run(ident, input_list, output_list, use_calculated_workspaces) -def test_add_const(): 
+@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_add_const(use_calculated_workspaces): two = relay.add(relay.const(1), relay.const(1)) func = relay.Function([], two) output_list = generate_ref_data(func, {}) input_list = [] - compile_and_run(func, input_list, output_list) + compile_and_run(func, input_list, output_list, use_calculated_workspaces) -def test_mul_param(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_mul_param(use_calculated_workspaces): x = relay.var("x", shape=(10, 10)) y = relay.var("y", shape=(1, 10)) func = relay.Function([x, y], relay.multiply(x, y)) @@ -206,10 +215,11 @@ def test_mul_param(): inputs = {"x": x_data, "y": y_data} output_list = generate_ref_data(func, inputs) input_list = [inputs["x"], inputs["y"]] - compile_and_run(func, input_list, output_list) + compile_and_run(func, input_list, output_list, use_calculated_workspaces) -def test_subtract(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_subtract(use_calculated_workspaces): i = relay.var("i", shape=[], dtype="int32") sub = relay.subtract(i, relay.const(1, dtype="int32")) func = relay.Function([i], sub, ret_type=relay.TensorType([], "int32")) @@ -217,10 +227,11 @@ def test_subtract(): inputs = {"i": i_data} output_list = generate_ref_data(func, inputs) input_list = [inputs["i"]] - compile_and_run(func, input_list, output_list) + compile_and_run(func, input_list, output_list, use_calculated_workspaces) -def test_tuple_output(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_tuple_output(use_calculated_workspaces): x = relay.var("x", shape=(6, 9)) y = relay.split(x, 3).astuple() a = relay.TupleGetItem(y, 0) @@ -232,17 +243,24 @@ def test_tuple_output(): inputs = {"x": x_data} output_list = generate_ref_data(func, inputs) input_list = [inputs["x"]] - compile_and_run(func, input_list, output_list) + compile_and_run(func, input_list, output_list, 
use_calculated_workspaces) -def test_mobilenet(): +@pytest.mark.parametrize( + "use_calculated_workspaces_and_alignment", [(True, 1), (True, 16), (False, 1)] +) +def test_mobilenet(use_calculated_workspaces_and_alignment): + use_calculated_workspaces = use_calculated_workspaces_and_alignment[0] + workspace_byte_alignment = use_calculated_workspaces_and_alignment[1] mod, params = testing.mobilenet.get_workload(batch_size=1) data_shape = [int(x) for x in mod["main"].checked_type.arg_types[0].shape] data = np.random.uniform(size=data_shape).astype("float32") inputs = {"data": data} output_list = generate_ref_data(mod, inputs, params) input_list = [inputs["data"]] - compile_and_run(mod, input_list, output_list, params) + compile_and_run( + mod, input_list, output_list, use_calculated_workspaces, params, workspace_byte_alignment + ) class CcompilerAnnotator(ExprMutator): @@ -299,7 +317,8 @@ def visit_call(self, call): return super().visit_call(call) -def test_byoc_utvm(): +@pytest.mark.parametrize("use_calculated_workspaces", [True, False]) +def test_byoc_utvm(use_calculated_workspaces): """This is a simple test case to check BYOC capabilities of AOT""" x = relay.var("x", shape=(10, 10)) w0 = relay.var("w0", shape=(10, 10)) @@ -342,7 +361,7 @@ def test_byoc_utvm(): output_list = generate_ref_data(mod, map_inputs) input_list = [map_inputs["x"]] input_list.extend([map_inputs["w{}".format(i)] for i in range(8)]) - compile_and_run(mod, input_list, output_list) + compile_and_run(mod, input_list, output_list, use_calculated_workspaces) if __name__ == "__main__": diff --git a/tests/python/unittest/test_tir_analysis_calculate_workspace.py b/tests/python/unittest/test_tir_analysis_calculate_workspace.py index 284ba633f2b8..190d1820c1f4 100644 --- a/tests/python/unittest/test_tir_analysis_calculate_workspace.py +++ b/tests/python/unittest/test_tir_analysis_calculate_workspace.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
import numpy as np +import pytest import tvm from tvm import tir, script @@ -91,14 +92,20 @@ def primfunc_local_allocates(placeholder_162: ty.handle, placeholder_163: ty.han # fmt: on -def test_global_allocates(): +@pytest.mark.parametrize("alignment_and_size", [(1, 663552), (10, 663560)]) +def test_global_allocates(alignment_and_size): + alignment = alignment_and_size[0] + size = alignment_and_size[1] primfunc = primfunc_global_allocates - assert tvm.tir.analysis.calculate_workspace_bytes(primfunc) == 663552 + assert tvm.tir.analysis.calculate_workspace_bytes(primfunc, alignment) == size -def test_local_allocates(): +@pytest.mark.parametrize("alignment_and_size", [(1, 1566720), (100, 1567100)]) +def test_local_allocates(alignment_and_size): + alignment = alignment_and_size[0] + size = alignment_and_size[1] primfunc = primfunc_local_allocates - assert tvm.tir.analysis.calculate_workspace_bytes(primfunc) == 1566720 + assert tvm.tir.analysis.calculate_workspace_bytes(primfunc, alignment) == size if __name__ == "__main__":