diff --git a/Makefile b/Makefile
index b9dec82ad88f1..b92d8910fceaa 100644
--- a/Makefile
+++ b/Makefile
@@ -186,8 +186,10 @@ ifdef LLAMA_HIPBLAS
 	CC         := $(ROCM_PATH)/llvm/bin/clang
 	CXX        := $(ROCM_PATH)/llvm/bin/clang++
 	GPU_TARGETS = gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100
-	LLAMA_CUDA_DMMV_X ?= 64
-	LLAMA_CUDA_MMV_Y ?= 2
+	# HIPBLAS-build defaults: each ?= yields to a value the user already set. Keep them free of trailing spaces (make preserves them).
+	LLAMA_CUDA_DMMV_X ?= 128
+	LLAMA_CUDA_MMV_Y ?= 1
+	LLAMA_CUDA_KQUANTS_ITER ?= 1
 	LLAMA_CUDA_FORCE_DMMV = true
 	CFLAGS     += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
 	CXXFLAGS   += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
diff --git a/ggml.c b/ggml.c
index f98cc229783b9..f6c397adb4cf3 100644
--- a/ggml.c
+++ b/ggml.c
@@ -246,12 +246,12 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
 #include "ggml-opencl.h"
 #endif
-#elif defined(GGML_USE_OPENBLAS)
-#if defined(GGML_BLAS_USE_MKL)
-#include <mkl.h>
-#else
+#endif // end of the outer conditional enclosing the CLBlast include above
+#if defined(GGML_USE_OPENBLAS) // evaluated on its own, not as an #elif of the conditional above
 #include <cblas.h>
 #endif
+#if defined(GGML_BLAS_USE_MKL) // independent of GGML_USE_OPENBLAS: with both defined, cblas.h and mkl.h are both included
+#include <mkl.h>
 #endif
 #if defined(GGML_USE_CUBLAS)
 #include "ggml-cuda.h"