From 56b8922c8b183226abf07b3369caf926757d5e25 Mon Sep 17 00:00:00 2001 From: Masuda Masahiro Date: Tue, 2 Jan 2018 15:15:43 +0900 Subject: [PATCH 1/3] rocblas integration --- make/config.mk | 3 ++ make/contrib/rocblas.mk | 8 +++ python/tvm/contrib/cblas.py | 2 +- python/tvm/contrib/rocblas.py | 32 ++++++++++++ src/contrib/rocblas/rocblas.cc | 76 ++++++++++++++++++++++++++++ tests/python/contrib/test_rocblas.py | 38 ++++++++++++++ topi/python/topi/rocm/__init__.py | 1 + topi/python/topi/rocm/dense.py | 66 ++++++++++++++++++++++++ 8 files changed, 225 insertions(+), 1 deletion(-) create mode 100644 make/contrib/rocblas.mk create mode 100644 python/tvm/contrib/rocblas.py create mode 100644 src/contrib/rocblas/rocblas.cc create mode 100644 tests/python/contrib/test_rocblas.py create mode 100644 topi/python/topi/rocm/dense.py diff --git a/make/config.mk b/make/config.mk index 778d52025f92..256771ac3220 100644 --- a/make/config.mk +++ b/make/config.mk @@ -80,3 +80,6 @@ USE_MPS = 0 # Whether use cuBLAS USE_CUBLAS = 0 + +# Whether use rocBlas +USE_ROCBLAS = 0 diff --git a/make/contrib/rocblas.mk b/make/contrib/rocblas.mk new file mode 100644 index 000000000000..ae5663099d53 --- /dev/null +++ b/make/contrib/rocblas.mk @@ -0,0 +1,8 @@ +ROCBLAS_CONTRIB_SRC = $(wildcard src/contrib/rocblas/*.cc) +ROCBLAS_CONTRIB_OBJ = $(patsubst src/%.cc, build/%.o, $(ROCBLAS_CONTRIB_SRC)) + +ifeq ($(USE_ROCBLAS), 1) +CFLAGS += -DTVM_USE_ROCBLAS=1 +ADD_LDFLAGS += -lrocblas +RUNTIME_DEP += $(ROCBLAS_CONTRIB_OBJ) +endif diff --git a/python/tvm/contrib/cblas.py b/python/tvm/contrib/cblas.py index 17af941449ea..eb32cc490347 100644 --- a/python/tvm/contrib/cblas.py +++ b/python/tvm/contrib/cblas.py @@ -7,7 +7,7 @@ def matmul(lhs, rhs, transa=False, transb=False): """Create an extern op that compute matrix mult of A and rhs with CrhsLAS - This function serves as an example on how to calle external libraries. + This function serves as an example on how to call external libraries. Parameters ---------- diff --git a/python/tvm/contrib/rocblas.py b/python/tvm/contrib/rocblas.py new file mode 100644 index 000000000000..470cff662c4c --- /dev/null +++ b/python/tvm/contrib/rocblas.py @@ -0,0 +1,32 @@ +"""External function interface to rocBLAS libraries.""" +from __future__ import absolute_import as _abs + +from .. import api as _api +from .. import intrin as _intrin + +def matmul(lhs, rhs, transa=False, transb=False): + """Create an extern op that compute matrix mult of A and rhs with rocBLAS + + Parameters + ---------- + lhs : Tensor + The left matrix operand + rhs : Tensor + The right matrix operand + transa : bool + Whether transpose lhs + transb : bool + Whether transpose rhs + + Returns + ------- + C : Tensor + The result tensor. + """ + n = lhs.shape[1] if transa else lhs.shape[0] + m = rhs.shape[0] if transb else rhs.shape[1] + return _api.extern( + (n, m), [lhs, rhs], + lambda ins, outs: _intrin.call_packed( + "tvm.contrib.rocblas.matmul", + ins[0], ins[1], outs[0], transa, transb), name="C") diff --git a/src/contrib/rocblas/rocblas.cc b/src/contrib/rocblas/rocblas.cc new file mode 100644 index 000000000000..1d3a3104d1a9 --- /dev/null +++ b/src/contrib/rocblas/rocblas.cc @@ -0,0 +1,76 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file Use external cblas library call. + */ +#include +#include +#include + +extern "C" { +#include +} + +namespace tvm { +namespace contrib { + +using namespace runtime; + +#ifndef CHECK_ROCBLAS_ERROR +#define CHECK_ROCBLAS_ERROR(error) \ +if (error != rocblas_status_success) { \ + fprintf(stderr, "rocBLAS error: "); \ + if(error == rocblas_status_invalid_handle)fprintf(stderr, "rocblas_status_invalid_handle"); \ + if(error == rocblas_status_not_implemented )fprintf(stderr, " rocblas_status_not_implemented"); \ + if(error == rocblas_status_invalid_pointer)fprintf(stderr, "rocblas_status_invalid_pointer"); \ + if(error == rocblas_status_invalid_size)fprintf(stderr, "rocblas_status_invalid_size"); \ + if(error == rocblas_status_memory_error)fprintf(stderr, "rocblas_status_memory_error"); \ + if(error == rocblas_status_internal_error)fprintf(stderr, "rocblas_status_internal_error"); \ + fprintf(stderr, "\n"); \ + exit(EXIT_FAILURE); \ +} +#endif + + +// matrix multiplication for row major +TVM_REGISTER_GLOBAL("tvm.contrib.rocblas.matmul") +.set_body([](TVMArgs args, TVMRetValue *ret) { + DLTensor* A = args[0]; + DLTensor* B = args[1]; + DLTensor* C = args[2]; + bool transa = args[3]; + bool transb = args[4]; + // call gemm for simple compact code. + CHECK_EQ(A->ndim, 2); + CHECK_EQ(B->ndim, 2); + CHECK_EQ(C->ndim, 2); + CHECK(C->strides == nullptr); + CHECK(B->strides == nullptr); + CHECK(A->strides == nullptr); + CHECK(TypeMatch(A->dtype, kDLFloat, 32)); + CHECK(TypeMatch(B->dtype, kDLFloat, 32)); + CHECK(TypeMatch(C->dtype, kDLFloat, 32)); + + rocblas_handle handle; + CHECK_ROCBLAS_ERROR(rocblas_create_handle(&handle)); + float alpha = 1.0; + float beta = 0.0; + + CHECK_ROCBLAS_ERROR(rocblas_sgemm(handle, + transb ? rocblas_operation_transpose : rocblas_operation_none, + transa ? rocblas_operation_transpose : rocblas_operation_none, + transb ? B->shape[0] : B->shape[1], + transa ? A->shape[1] : A->shape[0], + transb ? B->shape[1] : B->shape[0], + &alpha, + reinterpret_cast(static_cast(B->data) + B->byte_offset), + B->shape[1], + reinterpret_cast(static_cast(A->data) + A->byte_offset), + A->shape[1], + &beta, + reinterpret_cast(static_cast(C->data) + C->byte_offset), + C->shape[1])); + + CHECK_ROCBLAS_ERROR(rocblas_destroy_handle(handle)); +}); +} // namespace contrib +} // namespace tvm diff --git a/tests/python/contrib/test_rocblas.py b/tests/python/contrib/test_rocblas.py new file mode 100644 index 000000000000..03e723c2b1af --- /dev/null +++ b/tests/python/contrib/test_rocblas.py @@ -0,0 +1,38 @@ +import tvm +import numpy as np +from tvm.contrib import rocblas + +def test_matmul_add(): + n = 1024 + l = 128 + m = 235 + bias = tvm.var('bias', dtype=tvm.float32) + A = tvm.placeholder((n, l), name='A') + B = tvm.placeholder((l, m), name='B') + C = rocblas.matmul(A, B) + D = tvm.compute(C.shape, lambda i, j: C[i,j] + bias, name="D") + import topi + with tvm.target.create("rocm -libs=rocblas"): + s = topi.generic.schedule_extern(D) + + def verify(target="rocm"): + if not tvm.module.enabled(target): + print("skip because %s is not enabled..." % target) + return + if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True): + print("skip because extern function is not avalable") + return + ctx = tvm.rocm(0) + f = tvm.build(s, [A, B, D, bias], target) + a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx) + d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx) + bb = 10.0 + f(a, b, d, bb) + np.testing.assert_allclose( + d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + bb, rtol=1e-5) + verify() + + +if __name__ == "__main__": + test_matmul_add() diff --git a/topi/python/topi/rocm/__init__.py b/topi/python/topi/rocm/__init__.py index d2d7aaf0fd3d..3fddd53a3b36 100644 --- a/topi/python/topi/rocm/__init__.py +++ b/topi/python/topi/rocm/__init__.py @@ -3,3 +3,4 @@ from __future__ import absolute_import as _abs from .conv2d import * +from .dense import * diff --git a/topi/python/topi/rocm/dense.py b/topi/python/topi/rocm/dense.py new file mode 100644 index 000000000000..0221fb6be850 --- /dev/null +++ b/topi/python/topi/rocm/dense.py @@ -0,0 +1,66 @@ +# pylint: disable=invalid-name, unused-variable +"""Schedule for dense operator""" +from __future__ import absolute_import as _abs +import tvm +import topi +from tvm.contrib import rocblas +from ..nn.dense import dense, dense_default +from .. import tag +from .. import generic + +@dense.register("rocm") +def dense_rocm(data, weight, bias=None): + """Dense operator for rocm backend. + + Parameters + ---------- + data : tvm.Tensor + 2-D with shape [batch, in_dim] + + weight : tvm.Tensor + 2-D with shape [out_dim, in_dim] + + bias : tvm.Tensor, optional + 1-D with shape [out_dim] + + Returns + ------- + output : tvm.Tensor + 2-D with shape [batch, out_dim] + """ + assert len(data.shape) == 2 and len(weight.shape) == 2, \ + "only support 2-dim dense" + if bias is not None: + assert len(bias.shape) == 1 + batch, in_dim = data.shape + out_dim, _ = weight.shape + target = tvm.target.current_target() + if "rocblas" in target.libs: + matmul = rocblas.matmul(data, weight, False, True) + if bias is not None: + matmul = tvm.compute((batch, out_dim), \ + lambda i, j: matmul[i, j] + bias[j], \ + tag=tag.BROADCAST) + return matmul + return dense_default(data, weight, bias) + + +@generic.schedule_dense.register(["rocm"]) +def schedule_dense(outs): + """Schedule for dense operator. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of dense + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for dense. + """ + target = tvm.target.current_target() + if target.target_name == "rocm" and "rocblas" in target.libs: + return generic.schedule_extern(outs) + return topi.cuda.schedule_dense(outs) From 9f9d60d49f1e9a8d0a104657ebfbefcb4c76a5dd Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 2 Jan 2018 18:25:53 +0900 Subject: [PATCH 2/3] fix include --- Makefile | 1 + src/contrib/rocblas/rocblas.cc | 51 ++++++++++++++++------------------ 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/Makefile b/Makefile index 875c99b8657d..7f612f450cb0 100644 --- a/Makefile +++ b/Makefile @@ -139,6 +139,7 @@ include make/contrib/cudnn.mk include make/contrib/miopen.mk include make/contrib/mps.mk include make/contrib/cublas.mk +include make/contrib/rocblas.mk ifdef ADD_CFLAGS CFLAGS += $(ADD_CFLAGS) diff --git a/src/contrib/rocblas/rocblas.cc b/src/contrib/rocblas/rocblas.cc index 1d3a3104d1a9..0d41e6920df9 100644 --- a/src/contrib/rocblas/rocblas.cc +++ b/src/contrib/rocblas/rocblas.cc @@ -1,14 +1,11 @@ /*! * Copyright (c) 2017 by Contributors - * \file Use external cblas library call. + * \file Use external rocblas library call. */ #include #include #include - -extern "C" { -#include -} +#include "rocblas.h" namespace tvm { namespace contrib { @@ -18,15 +15,15 @@ using namespace runtime; #ifndef CHECK_ROCBLAS_ERROR #define CHECK_ROCBLAS_ERROR(error) \ if (error != rocblas_status_success) { \ - fprintf(stderr, "rocBLAS error: "); \ - if(error == rocblas_status_invalid_handle)fprintf(stderr, "rocblas_status_invalid_handle"); \ - if(error == rocblas_status_not_implemented )fprintf(stderr, " rocblas_status_not_implemented"); \ - if(error == rocblas_status_invalid_pointer)fprintf(stderr, "rocblas_status_invalid_pointer"); \ - if(error == rocblas_status_invalid_size)fprintf(stderr, "rocblas_status_invalid_size"); \ - if(error == rocblas_status_memory_error)fprintf(stderr, "rocblas_status_memory_error"); \ - if(error == rocblas_status_internal_error)fprintf(stderr, "rocblas_status_internal_error"); \ - fprintf(stderr, "\n"); \ - exit(EXIT_FAILURE); \ + fprintf(stderr, "rocBLAS error: "); \ + if(error == rocblas_status_invalid_handle)fprintf(stderr, "rocblas_status_invalid_handle"); \ + if(error == rocblas_status_not_implemented )fprintf(stderr, " rocblas_status_not_implemented"); \ + if(error == rocblas_status_invalid_pointer)fprintf(stderr, "rocblas_status_invalid_pointer"); \ + if(error == rocblas_status_invalid_size)fprintf(stderr, "rocblas_status_invalid_size"); \ + if(error == rocblas_status_memory_error)fprintf(stderr, "rocblas_status_memory_error"); \ + if(error == rocblas_status_internal_error)fprintf(stderr, "rocblas_status_internal_error"); \ + fprintf(stderr, "\n"); \ + exit(EXIT_FAILURE); \ } #endif @@ -56,19 +53,19 @@ TVM_REGISTER_GLOBAL("tvm.contrib.rocblas.matmul") float beta = 0.0; CHECK_ROCBLAS_ERROR(rocblas_sgemm(handle, - transb ? rocblas_operation_transpose : rocblas_operation_none, - transa ? rocblas_operation_transpose : rocblas_operation_none, - transb ? B->shape[0] : B->shape[1], - transa ? A->shape[1] : A->shape[0], - transb ? B->shape[1] : B->shape[0], - &alpha, - reinterpret_cast(static_cast(B->data) + B->byte_offset), - B->shape[1], - reinterpret_cast(static_cast(A->data) + A->byte_offset), - A->shape[1], - &beta, - reinterpret_cast(static_cast(C->data) + C->byte_offset), - C->shape[1])); + transb ? rocblas_operation_transpose : rocblas_operation_none, + transa ? rocblas_operation_transpose : rocblas_operation_none, + transb ? B->shape[0] : B->shape[1], + transa ? A->shape[1] : A->shape[0], + transb ? B->shape[1] : B->shape[0], + &alpha, + reinterpret_cast(static_cast(B->data) + B->byte_offset), + B->shape[1], + reinterpret_cast(static_cast(A->data) + A->byte_offset), + A->shape[1], + &beta, + reinterpret_cast(static_cast(C->data) + C->byte_offset), + C->shape[1])); CHECK_ROCBLAS_ERROR(rocblas_destroy_handle(handle)); }); From 67633f9d18d3c531eab761dc2f1ccd11c45a94b1 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Wed, 3 Jan 2018 14:09:22 +0900 Subject: [PATCH 3/3] fix lint --- src/contrib/rocblas/rocblas.cc | 21 ++++++++++++--------- tests/python/contrib/test_rocblas.py | 15 +++++---------- topi/python/topi/rocm/dense.py | 2 +- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/src/contrib/rocblas/rocblas.cc b/src/contrib/rocblas/rocblas.cc index 0d41e6920df9..1dbf429461eb 100644 --- a/src/contrib/rocblas/rocblas.cc +++ b/src/contrib/rocblas/rocblas.cc @@ -16,12 +16,12 @@ using namespace runtime; #define CHECK_ROCBLAS_ERROR(error) \ if (error != rocblas_status_success) { \ fprintf(stderr, "rocBLAS error: "); \ - if(error == rocblas_status_invalid_handle)fprintf(stderr, "rocblas_status_invalid_handle"); \ - if(error == rocblas_status_not_implemented )fprintf(stderr, " rocblas_status_not_implemented"); \ - if(error == rocblas_status_invalid_pointer)fprintf(stderr, "rocblas_status_invalid_pointer"); \ - if(error == rocblas_status_invalid_size)fprintf(stderr, "rocblas_status_invalid_size"); \ - if(error == rocblas_status_memory_error)fprintf(stderr, "rocblas_status_memory_error"); \ - if(error == rocblas_status_internal_error)fprintf(stderr, "rocblas_status_internal_error"); \ + if (error == rocblas_status_invalid_handle) fprintf(stderr, "rocblas_status_invalid_handle"); \ + if (error == rocblas_status_not_implemented) fprintf(stderr, " rocblas_status_not_implemented"); \ + if (error == rocblas_status_invalid_pointer) fprintf(stderr, "rocblas_status_invalid_pointer"); \ + if (error == rocblas_status_invalid_size) fprintf(stderr, "rocblas_status_invalid_size"); \ + if (error == rocblas_status_memory_error) fprintf(stderr, "rocblas_status_memory_error"); \ + if (error == rocblas_status_internal_error) fprintf(stderr, "rocblas_status_internal_error"); \ fprintf(stderr, "\n"); \ exit(EXIT_FAILURE); \ } @@ -51,6 +51,9 @@ TVM_REGISTER_GLOBAL("tvm.contrib.rocblas.matmul") CHECK_ROCBLAS_ERROR(rocblas_create_handle(&handle)); float alpha = 1.0; float beta = 0.0; + float *A_ptr = reinterpret_cast(static_cast(B->data) + B->byte_offset); + float *B_ptr = reinterpret_cast(static_cast(A->data) + A->byte_offset); + float *C_ptr = reinterpret_cast(static_cast(C->data) + C->byte_offset); CHECK_ROCBLAS_ERROR(rocblas_sgemm(handle, transb ? rocblas_operation_transpose : rocblas_operation_none, @@ -59,12 +62,12 @@ TVM_REGISTER_GLOBAL("tvm.contrib.rocblas.matmul") transa ? A->shape[1] : A->shape[0], transb ? B->shape[1] : B->shape[0], &alpha, - reinterpret_cast(static_cast(B->data) + B->byte_offset), + A_ptr, B->shape[1], - reinterpret_cast(static_cast(A->data) + A->byte_offset), + B_ptr, A->shape[1], &beta, - reinterpret_cast(static_cast(C->data) + C->byte_offset), + C_ptr, C->shape[1])); CHECK_ROCBLAS_ERROR(rocblas_destroy_handle(handle)); diff --git a/tests/python/contrib/test_rocblas.py b/tests/python/contrib/test_rocblas.py index 03e723c2b1af..46350f4d6625 100644 --- a/tests/python/contrib/test_rocblas.py +++ b/tests/python/contrib/test_rocblas.py @@ -6,14 +6,10 @@ def test_matmul_add(): n = 1024 l = 128 m = 235 - bias = tvm.var('bias', dtype=tvm.float32) A = tvm.placeholder((n, l), name='A') B = tvm.placeholder((l, m), name='B') C = rocblas.matmul(A, B) - D = tvm.compute(C.shape, lambda i, j: C[i,j] + bias, name="D") - import topi - with tvm.target.create("rocm -libs=rocblas"): - s = topi.generic.schedule_extern(D) + s = tvm.create_schedule(C.op) def verify(target="rocm"): if not tvm.module.enabled(target): @@ -23,14 +19,13 @@ def verify(target="rocm"): print("skip because extern function is not avalable") return ctx = tvm.rocm(0) - f = tvm.build(s, [A, B, D, bias], target) + f = tvm.build(s, [A, B, C], target) a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx) - d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx) - bb = 10.0 - f(a, b, d, bb) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) + f(a, b, c) np.testing.assert_allclose( - d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + bb, rtol=1e-5) + c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5) verify() diff --git a/topi/python/topi/rocm/dense.py b/topi/python/topi/rocm/dense.py index 0221fb6be850..cfeed247a4a1 100644 --- a/topi/python/topi/rocm/dense.py +++ b/topi/python/topi/rocm/dense.py @@ -2,8 +2,8 @@ """Schedule for dense operator""" from __future__ import absolute_import as _abs import tvm -import topi from tvm.contrib import rocblas +import topi from ..nn.dense import dense, dense_default from .. import tag from .. import generic