diff --git a/docs/source/reference.rst b/docs/source/reference.rst
index c8e59ccc..c3ef351a 100644
--- a/docs/source/reference.rst
+++ b/docs/source/reference.rst
@@ -13,6 +13,7 @@ Library Wrapper Routines
    reference_cusolver
    reference_cula
    reference_pcula
+   reference_cusparse
 
 High-Level Routines
 -------------------
diff --git a/docs/source/reference_cusparse.rst b/docs/source/reference_cusparse.rst
new file mode 100644
index 00000000..e14a5ab3
--- /dev/null
+++ b/docs/source/reference_cusparse.rst
@@ -0,0 +1,43 @@
+.. -*- rst -*-
+
+.. currentmodule:: skcuda.cusparse
+
+CUSPARSE Routines
+=================
+
+Helper Routines
+---------------
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    cusparseCreate
+    cusparseDestroy
+    cusparseGetVersion
+    cusparseSetStream
+    cusparseGetStream
+
+Wrapper Routines
+----------------
+
+Single Precision Routines
+^^^^^^^^^^^^^^^^^^^^^^^^^
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    cusparseSgtsv2StridedBatch_bufferSizeExt
+    cusparseSgtsv2StridedBatch
+    cusparseSgtsvInterleavedBatch_bufferSizeExt
+    cusparseSgtsvInterleavedBatch
+
+Double Precision Routines
+^^^^^^^^^^^^^^^^^^^^^^^^^
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    cusparseDgtsv2StridedBatch_bufferSizeExt
+    cusparseDgtsv2StridedBatch
+    cusparseDgtsvInterleavedBatch_bufferSizeExt
+    cusparseDgtsvInterleavedBatch
diff --git a/skcuda/cusparse.py b/skcuda/cusparse.py
index db2c7888..a8442b6e 100644
--- a/skcuda/cusparse.py
+++ b/skcuda/cusparse.py
@@ -6,19 +6,17 @@
 Note: this module does not explicitly depend on PyCUDA.
 """
 
-import atexit
-import ctypes.util
+from __future__ import absolute_import
+
+import ctypes
 import platform
 from string import Template
 import sys
-import warnings
-
-import numpy as np
 
-import cuda
+from . import cuda
 
 # Load library:
-_version_list = [10.1, 10.0, 9.2, 9.1, 9.0, 8.0, 7.5, 7.0, 6.5, 6.0, 5.5, 5.0, 4.0]
+_linux_version_list = [10.2, 10.1, 10.0, 9.2, 9.1, 9.0, 8.0, 7.5, 7.0, 6.5, 6.0, 5.5, 5.0, 4.0]
 _win32_version_list = [10, 10, 100, 92, 91, 90, 80, 75, 70, 65, 60, 55, 50, 40]
 if 'linux' in sys.platform:
     _libcusparse_libname_list = ['libcusparse.so'] + \
@@ -86,6 +84,7 @@ class cusparseStatusMatrixTypeNotSupported(cusparseError):
     """The matrix type is not supported by this function"""
     pass
 
+# TODO: Check that this is a complete list of exceptions and that the numbers are correct.
 cusparseExceptions = {
     1: cusparseStatusNotInitialized,
     2: cusparseStatusAllocFailed,
@@ -97,41 +96,6 @@ class cusparseStatusMatrixTypeNotSupported(cusparseError):
     8: cusparseStatusMatrixTypeNotSupported,
 }
 
-# Matrix types:
-CUSPARSE_MATRIX_TYPE_GENERAL = 0
-CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1
-CUSPARSE_MATRIX_TYPE_HERMITIAN = 2
-CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3
-
-CUSPARSE_FILL_MODE_LOWER = 0
-CUSPARSE_FILL_MODE_UPPER = 1
-
-# Whether or not a matrix' diagonal entries are unity:
-CUSPARSE_DIAG_TYPE_NON_UNIT = 0
-CUSPARSE_DIAG_TYPE_UNIT = 1
-
-# Matrix index bases:
-CUSPARSE_INDEX_BASE_ZERO = 0
-CUSPARSE_INDEX_BASE_ONE = 1
-
-# Operation types:
-CUSPARSE_OPERATION_NON_TRANSPOSE = 0
-CUSPARSE_OPERATION_TRANSPOSE = 1
-CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2
-
-# Whether or not to parse elements of a dense matrix row or column-wise.
-CUSPARSE_DIRECTION_ROW = 0
-CUSPARSE_DIRECTION_COLUMN = 1
-
-# Helper functions:
-class cusparseMatDescr(ctypes.Structure):
-    _fields_ = [
-        ('MatrixType', ctypes.c_int),
-        ('FillMode', ctypes.c_int),
-        ('DiagType', ctypes.c_int),
-        ('IndexBase', ctypes.c_int)
-        ]
-
 def cusparseCheckStatus(status):
     """
     Raise CUSPARSE exception
@@ -147,9 +111,7 @@ def cusparseCheckStatus(status):
     See Also
     --------
     cusparseExceptions
-
     """
-
     if status != 0:
         try:
             raise cusparseExceptions[status]
@@ -169,35 +131,30 @@ def cusparseCreate():
     -------
     handle : int
         CUSPARSE library context.
-
     """
-
-    handle = ctypes.c_int()
+    handle = ctypes.c_void_p()
     status = _libcusparse.cusparseCreate(ctypes.byref(handle))
     cusparseCheckStatus(status)
     return handle.value
 
 _libcusparse.cusparseDestroy.restype = int
-_libcusparse.cusparseDestroy.argtypes = [ctypes.c_int]
+_libcusparse.cusparseDestroy.argtypes = [ctypes.c_void_p]
 def cusparseDestroy(handle):
     """
     Release CUSPARSE resources.
 
-    Releases hardware resources used by CUSPARSE
+    Releases hardware resources used by CUSPARSE.
 
     Parameters
     ----------
     handle : int
         CUSPARSE library context.
-
     """
-
     status = _libcusparse.cusparseDestroy(handle)
     cusparseCheckStatus(status)
 
 _libcusparse.cusparseGetVersion.restype = int
-_libcusparse.cusparseGetVersion.argtypes = [ctypes.c_int,
-                                            ctypes.c_void_p]
+_libcusparse.cusparseGetVersion.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
 def cusparseGetVersion(handle):
     """
     Return CUSPARSE library version.
@@ -213,9 +170,7 @@ def cusparseGetVersion(handle):
     -------
     version : int
         CUSPARSE library version number.
-
     """
-
     version = ctypes.c_int()
     status = _libcusparse.cusparseGetVersion(handle,
                                              ctypes.byref(version))
@@ -223,8 +178,7 @@ def cusparseGetVersion(handle):
     return version.value
 
 _libcusparse.cusparseSetStream.restype = int
-_libcusparse.cusparseSetStream.argtypes = [ctypes.c_int,
-                                           ctypes.c_int]
+_libcusparse.cusparseSetStream.argtypes = [ctypes.c_void_p, ctypes.c_int]
 def cusparseSetStream(handle, id):
     """
     Sets the CUSPARSE stream in which kernels will run.
@@ -235,154 +189,346 @@ def cusparseSetStream(handle, id):
         CUSPARSE library context.
     id : int
         Stream ID.
-
     """
-
     status = _libcusparse.cusparseSetStream(handle, id)
     cusparseCheckStatus(status)
 
-_libcusparse.cusparseCreateMatDescr.restype = int
-_libcusparse.cusparseCreateMatDescr.argtypes = [cusparseMatDescr]
-def cusparseCreateMatDescr():
+_libcusparse.cusparseGetStream.restype = int
+_libcusparse.cusparseGetStream.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
+def cusparseGetStream(handle):
     """
-    Initialize a sparse matrix descriptor.
+    Gets the CUSPARSE stream in which kernels will run.
 
-    Initializes the `MatrixType` and `IndexBase` fields of the matrix
-    descriptor to the default values `CUSPARSE_MATRIX_TYPE_GENERAL`
-    and `CUSPARSE_INDEX_BASE_ZERO`.
+    Parameters
+    ----------
+    handle : int
+        CUSPARSE library context.
 
     Returns
     -------
-    desc : cusparseMatDescr
-        Matrix descriptor.
-
+    id : int
+        Stream ID.
     """
-
-    desc = cusparseMatrixDesc()
-    status = _libcusparse.cusparseCreateMatDescr(ctypes.byref(desc))
+    id = ctypes.c_int()
+    status = _libcusparse.cusparseGetStream(handle, ctypes.byref(id))
     cusparseCheckStatus(status)
-    return desc
+    return id.value
 
-_libcusparse.cusparseDestroyMatDescr.restype = int
-_libcusparse.cusparseDestroyMatDescr.argtypes = [ctypes.c_int]
-def cusparseDestroyMatDescr(desc):
+gtsv2StridedBatch_bufferSizeExt_doc = Template(
     """
-    Releases the memory allocated for the matrix descriptor.
+    Calculate size of work buffer used by cusparsegtsv2StridedBatch.
 
     Parameters
     ----------
-    desc : cusparseMatDescr
-        Matrix descriptor.
+    handle : int
+        cuSPARSE context
+    m : int
+        Size of the linear system (must be >= 3)
+    dl : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the lower
+        diagonal of the tri-diagonal linear system. The lower diagonal dl(i)
+        that corresponds to the ith linear system starts at location
+        dl+batchStride*i in memory. Also, the first element of each lower
+        diagonal must be zero.
+    d : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the main
+        diagonal of the tri-diagonal linear system. The main diagonal d(i)
+        that corresponds to the ith linear system starts at location
+        d+batchStride*i in memory.
+    du : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the upper
+        diagonal of the tri-diagonal linear system. The upper diagonal du(i)
+        that corresponds to the ith linear system starts at location
+        du+batchStride*i in memory. Also, the last element of each upper
+        diagonal must be zero.
+    x : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array that contains the
+        right-hand-side of the tri-diagonal linear system. The
+        right-hand-side x(i) that corresponds to the ith linear system
+        starts at location x+batchStride*i in memory.
+    batchCount : int
+        Number of systems to solve.
+    batchStride : int
+        Stride (number of elements) that separates the vectors of every
+        system (must be at least m).
 
-    """
+    Returns
+    -------
+    bufferSizeInBytes : int
+        Number of bytes of the buffer used in gtsv2StridedBatch.
 
-    status = _libcusparse.cusparseDestroyMatDescr(desc)
+    References
+    ----------
+    `cusparsegtsv2StridedBatch_bufferSizeExt `_
+    """
+)
+
+_libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt.restype = int
+_libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt.argtypes =\
+    [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p,
+     ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
+     ctypes.c_void_p
+    ]
+def cusparseSgtsv2StridedBatch_bufferSizeExt(handle, m, dl, d, du, x, batchCount, batchStride):
+    # The C API writes a size_t, so use c_size_t rather than c_int here.
+    bufferSizeInBytes = ctypes.c_size_t()
+    status = _libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt(
+        handle, m, int(dl), int(d), int(du), int(x), batchCount, batchStride,
+        ctypes.byref(bufferSizeInBytes))
+    cusparseCheckStatus(status)
+    return bufferSizeInBytes.value
+cusparseSgtsv2StridedBatch_bufferSizeExt.__doc__ = \
+    gtsv2StridedBatch_bufferSizeExt_doc.substitute(precision='single precision', real='real')
+
+_libcusparse.cusparseDgtsv2StridedBatch_bufferSizeExt.restype = int
+_libcusparse.cusparseDgtsv2StridedBatch_bufferSizeExt.argtypes =\
+    _libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt.argtypes
+def cusparseDgtsv2StridedBatch_bufferSizeExt(handle, m, dl, d, du, x, batchCount, batchStride):
+    bufferSizeInBytes = ctypes.c_size_t()
+    status = _libcusparse.cusparseDgtsv2StridedBatch_bufferSizeExt(
+        handle, m, int(dl), int(d), int(du), int(x), batchCount, batchStride,
+        ctypes.byref(bufferSizeInBytes))
+    cusparseCheckStatus(status)
+    return bufferSizeInBytes.value
+cusparseDgtsv2StridedBatch_bufferSizeExt.__doc__ = \
+    gtsv2StridedBatch_bufferSizeExt_doc.substitute(precision='double precision', real='real')
 
-_libcusparse.cusparseSetMatType.restype = int
-_libcusparse.cusparseSetMatType.argtypes = [cusparseMatDescr,
-                                            ctypes.c_int]
-def cusparseSetMatType(desc, type):
+gtsv2StridedBatch_doc = Template(
     """
-    Sets the matrix type of the specified matrix.
+    Compute the solution of multiple tridiagonal linear systems.
+
+    Solves multiple tridiagonal linear systems, for i=0,…,batchCount:
+        A(i) ∗ y(i) = x(i)
+    The coefficient matrix A of each of these tri-diagonal linear systems is
+    defined with three vectors corresponding to its lower (dl), main (d), and
+    upper (du) matrix diagonals; the right-hand sides are stored in the dense
+    matrix X. Notice that the solution Y overwrites the right-hand-side matrix
+    X on exit. The different matrices are assumed to be of the same size and
+    are stored with a fixed batchStride in memory.
+
+    The routine does not perform any pivoting and uses a combination of the
+    Cyclic Reduction (CR) and the Parallel Cyclic Reduction (PCR) algorithms to
+    find the solution. It achieves better performance when m is a power of 2.
 
     Parameters
     ----------
-    desc : cusparseMatDescr
-        Matrix descriptor.
-    type : int
-        Matrix type.
-
+    handle : int
+        cuSPARSE context
+    m : int
+        Size of the linear system (must be >= 3)
+    dl : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the lower
+        diagonal of the tri-diagonal linear system. The lower diagonal dl(i)
+        that corresponds to the ith linear system starts at location
+        dl+batchStride*i in memory. Also, the first element of each lower
+        diagonal must be zero.
+    d : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the main
+        diagonal of the tri-diagonal linear system. The main diagonal d(i)
+        that corresponds to the ith linear system starts at location
+        d+batchStride*i in memory.
+    du : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the upper
+        diagonal of the tri-diagonal linear system. The upper diagonal du(i)
+        that corresponds to the ith linear system starts at location
+        du+batchStride*i in memory. Also, the last element of each upper
+        diagonal must be zero.
+    x : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array that contains the
+        right-hand-side of the tri-diagonal linear system. The
+        right-hand-side x(i) that corresponds to the ith linear system
+        starts at location x+batchStride*i in memory.
+    batchCount : int
+        Number of systems to solve.
+    batchStride : int
+        Stride (number of elements) that separates the vectors of every
+        system (must be at least m).
+    pBuffer : ctypes.c_void_p
+        Buffer allocated by the user; the size is returned by
+        gtsv2StridedBatch_bufferSizeExt.
+
+    References
+    ----------
+    `cusparsegtsv2StridedBatch `_
     """
-
-    status = _libcusparse.cusparseSetMatType(desc, type)
+)
+
+_libcusparse.cusparseSgtsv2StridedBatch.restype = int
+_libcusparse.cusparseSgtsv2StridedBatch.argtypes =\
+    [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p,
+     ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
+     ctypes.c_void_p
+    ]
+def cusparseSgtsv2StridedBatch(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer):
+    status = _libcusparse.cusparseSgtsv2StridedBatch(
+        handle, m, int(dl), int(d), int(du), int(x), batchCount, batchStride, int(pBuffer))
     cusparseCheckStatus(status)
+cusparseSgtsv2StridedBatch.__doc__ = \
+    gtsv2StridedBatch_doc.substitute(precision='single precision', real='real')
+
+_libcusparse.cusparseDgtsv2StridedBatch.restype = int
+_libcusparse.cusparseDgtsv2StridedBatch.argtypes =\
+    _libcusparse.cusparseSgtsv2StridedBatch.argtypes
+def cusparseDgtsv2StridedBatch(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer):
+    status = _libcusparse.cusparseDgtsv2StridedBatch(
+        handle, m, int(dl), int(d), int(du), int(x), batchCount, batchStride, int(pBuffer))
+    cusparseCheckStatus(status)
+cusparseDgtsv2StridedBatch.__doc__ = \
+    gtsv2StridedBatch_doc.substitute(precision='double precision', real='real')
 
-_libcusparse.cusparseGetMatType.restype = int
-_libcusparse.cusparseGetMatType.argtypes = [cusparseMatDescr]
-def cusparseGetMatType(desc):
+gtsv2InterleavedBatch_bufferSizeExt_doc = Template(
     """
-    Gets the matrix type of the specified matrix.
+    Calculate size of work buffer used by cusparsegtsvInterleavedBatch.
 
     Parameters
     ----------
-    desc : cusparseMatDescr
-        Matrix descriptor.
-
-    Returns
-    -------
-    type : int
-        Matrix type.
-
+    handle : int
+        cuSPARSE context
+    algo : int
+        algo = 0: cuThomas (unstable algorithm); algo = 1: LU with pivoting
+        (stable algorithm); algo = 2: QR (stable algorithm)
+    m : int
+        Size of the linear system
+    dl : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the lower
+        diagonal of the tri-diagonal linear system. The first element of each
+        lower diagonal must be zero.
+    d : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the main
+        diagonal of the tri-diagonal linear system.
+    du : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the upper
+        diagonal of the tri-diagonal linear system. The last element of each
+        upper diagonal must be zero.
+    x : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array that contains the
+        right-hand-side of the tri-diagonal linear system.
+    batchCount : int
+        Number of systems to solve.
+
+    Returns
+    -------
+    pBufferSizeInBytes : int
+        Number of bytes of the buffer used in gtsvInterleavedBatch.
+
+    References
+    ----------
+    `cusparsegtsvInterleavedBatch `_
     """
+)
+
+_libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt.restype = int
+_libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt.argtypes =\
+    [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p,
+     ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int,
+     ctypes.c_void_p
+    ]
+def cusparseSgtsvInterleavedBatch_bufferSizeExt(handle, algo, m, dl, d, du, x, batchCount):
+    # The C API writes a size_t, so use c_size_t rather than c_int here.
+    pBufferSizeInBytes = ctypes.c_size_t()
+    status = _libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt(
+        handle, algo, m, int(dl), int(d), int(du), int(x), batchCount,
+        ctypes.byref(pBufferSizeInBytes))
+    cusparseCheckStatus(status)
+    return pBufferSizeInBytes.value
+cusparseSgtsvInterleavedBatch_bufferSizeExt.__doc__ = \
+    gtsv2InterleavedBatch_bufferSizeExt_doc.substitute(precision='single precision', real='real')
+
+_libcusparse.cusparseDgtsvInterleavedBatch_bufferSizeExt.restype = int
+_libcusparse.cusparseDgtsvInterleavedBatch_bufferSizeExt.argtypes =\
+    _libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt.argtypes
+def cusparseDgtsvInterleavedBatch_bufferSizeExt(handle, algo, m, dl, d, du, x, batchCount):
+    pBufferSizeInBytes = ctypes.c_size_t()
+    status = _libcusparse.cusparseDgtsvInterleavedBatch_bufferSizeExt(
+        handle, algo, m, int(dl), int(d), int(du), int(x), batchCount,
+        ctypes.byref(pBufferSizeInBytes))
+    cusparseCheckStatus(status)
+    return pBufferSizeInBytes.value
+cusparseDgtsvInterleavedBatch_bufferSizeExt.__doc__ = \
+    gtsv2InterleavedBatch_bufferSizeExt_doc.substitute(precision='double precision', real='real')
 
-    return _libcusparse.cusparseGetMatType(desc)
-
-# Format conversion functions:
-_libcusparse.cusparseSnnz.restype = int
-_libcusparse.cusparseSnnz.argtypes = [ctypes.c_int,
-                                      ctypes.c_int,
-                                      ctypes.c_int,
-                                      ctypes.c_int,
-                                      cusparseMatDescr,
-                                      ctypes.c_void_p,
-                                      ctypes.c_int,
-                                      ctypes.c_void_p,
-                                      ctypes.c_void_p]
-def cusparseSnnz(handle, dirA, m, n, descrA, A, lda,
-                 nnzPerRowColumn, nnzTotalDevHostPtr):
+gtsvInterleavedBatch_doc = Template(
     """
-    Compute number of non-zero elements per row, column, or dense matrix.
+    Compute the solution of multiple tridiagonal linear systems.
+
+    Solves multiple tridiagonal linear systems, for i=0,…,batchCount:
+        A(i) ∗ y(i) = x(i)
+    The coefficient matrix A of each of these tri-diagonal linear systems is
+    defined with three vectors corresponding to its lower (dl), main (d), and
+    upper (du) matrix diagonals; the right-hand sides are stored in the dense
+    matrix X. Notice that the solution Y overwrites the right-hand-side matrix
+    X on exit. The different matrices are assumed to be of the same size and
+    are stored interleaved in memory, as described below.
+
+    Assuming A is of size m with base-1 indexing, dl, d and du are defined by
+    the following formulas:
+        dl(i) := A(i, i-1) for i=1,2,...,m
+    The first element of dl is out-of-bound (dl(1) := A(1,0)), so dl(1) = 0.
+        d(i) = A(i,i) for i=1,2,...,m
+        du(i) = A(i,i+1) for i=1,2,...,m
+    The last element of du is out-of-bound (du(m) := A(m,m+1)), so du(m) = 0.
+
+    The data layout is different from gtsvStridedBatch, which aggregates all
+    matrices one after another. Instead, gtsvInterleavedBatch gathers the
+    elements with the same index from the different matrices in a continuous
+    manner: if dl is regarded as a 2-D array of size m-by-batchCount, dl(:,j)
+    stores the j-th matrix. gtsvStridedBatch uses column-major layout while
+    gtsvInterleavedBatch uses row-major.
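+
+    As an illustration of the difference (notation only: dl_j(i) below means
+    element i of the lower diagonal of system j), for m = 3 and batchCount = 2,
+        gtsvStridedBatch stores     dl = [dl_0(1), dl_0(2), dl_0(3),
+                                          dl_1(1), dl_1(2), dl_1(3)]
+        gtsvInterleavedBatch stores dl = [dl_0(1), dl_1(1), dl_0(2),
+                                          dl_1(2), dl_0(3), dl_1(3)]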
+
+    The routine provides three different algorithms, selected by parameter
+    algo. The first algorithm is cuThomas, provided by the Barcelona
+    Supercomputing Center. The second algorithm is LU with partial pivoting,
+    and the last algorithm is QR. From a stability perspective, cuThomas is
+    not numerically stable because it does not have pivoting; LU with partial
+    pivoting and QR are stable. From a performance perspective, LU with
+    partial pivoting and QR are about 10% to 20% slower than cuThomas.
 
     Parameters
     ----------
     handle : int
-        CUSPARSE library context.
-    dirA : int
-        Data direction of elements.
+        cuSPARSE context
+    algo : int
+        algo = 0: cuThomas (unstable algorithm); algo = 1: LU with pivoting
+        (stable algorithm); algo = 2: QR (stable algorithm)
     m : int
-        Rows in A.
-    n : int
-        Columns in A.
-    descrA : cusparseMatDescr
-        Matrix descriptor.
-    A : pycuda.gpuarray.GPUArray
-        Dense matrix of dimensions (lda, n).
-    lda : int
-        Leading dimension of A.
-
-    Returns
-    -------
-    nnzPerRowColumn : pycuda.gpuarray.GPUArray
-        Array of length m or n containing the number of
-        non-zero elements per row or column, respectively.
-    nnzTotalDevHostPtr : pycuda.gpuarray.GPUArray
-        Total number of non-zero elements in device or host memory.
-
+        Size of the linear system
+    dl : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the lower
+        diagonal of the tri-diagonal linear system. The first element of each
+        lower diagonal must be zero.
+    d : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the main
+        diagonal of the tri-diagonal linear system.
+    du : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the upper
+        diagonal of the tri-diagonal linear system. The last element of each
+        upper diagonal must be zero.
+    x : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array that contains the
+        right-hand-side of the tri-diagonal linear system.
+    batchCount : int
+        Number of systems to solve.
+    pBuffer : ctypes.c_void_p
+        Buffer allocated by the user; the size is returned by
+        gtsvInterleavedBatch_bufferSizeExt.
+
+    References
+    ----------
+    `cusparsegtsvInterleavedBatch `_
     """
-
-    # Unfinished:
-    nnzPerRowColumn = gpuarray.empty()
-    nnzTotalDevHostPtr = gpuarray.empty()
-
-    status = _libcusparse.cusparseSnnz(handle, dirA, m, n,
-                                       descrA, int(A), lda,
-                                       int(nnzPerRowColumn), int(nnzTotalDevHostPtr))
+)
+
+_libcusparse.cusparseSgtsvInterleavedBatch.restype = int
+_libcusparse.cusparseSgtsvInterleavedBatch.argtypes =\
+    [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p,
+     ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int,
+     ctypes.c_void_p
+    ]
+def cusparseSgtsvInterleavedBatch(handle, algo, m, dl, d, du, x, batchCount, pBuffer):
+    status = _libcusparse.cusparseSgtsvInterleavedBatch(
+        handle, algo, m, int(dl), int(d), int(du), int(x), batchCount, int(pBuffer))
     cusparseCheckStatus(status)
-    return nnzPerVector, nnzHost
-
-_libcusparse.cusparseSdense2csr.restype = int
-_libcusparse.cusparseSdense2csr.argtypes = [ctypes.c_int,
-                                            ctypes.c_int,
-                                            ctypes.c_int,
-                                            cusparseMatDescr,
-                                            ctypes.c_void_p,
-                                            ctypes.c_int,
-                                            ctypes.c_void_p,
-                                            ctypes.c_void_p,
-                                            ctypes.c_void_p,
-                                            ctypes.c_void_p]
-def cusparseSdense2csr(handle, m, n, descrA, A, lda,
-                       nnzPerRow, csrValA, csrRowPtrA, csrColIndA):
-    # Unfinished
-    pass
+cusparseSgtsvInterleavedBatch.__doc__ = \
+    gtsvInterleavedBatch_doc.substitute(precision='single precision', real='real')
+
+_libcusparse.cusparseDgtsvInterleavedBatch.restype = int
+_libcusparse.cusparseDgtsvInterleavedBatch.argtypes =\
+    _libcusparse.cusparseSgtsvInterleavedBatch.argtypes
+def cusparseDgtsvInterleavedBatch(handle, algo, m, dl, d, du, x, batchCount, pBuffer):
+    status = _libcusparse.cusparseDgtsvInterleavedBatch(
+        handle, algo, m, int(dl), int(d), int(du), int(x), batchCount, int(pBuffer))
    cusparseCheckStatus(status)
+cusparseDgtsvInterleavedBatch.__doc__ = \
+    gtsvInterleavedBatch_doc.substitute(precision='double precision', real='real')
diff --git a/tests/test_cusparse.py b/tests/test_cusparse.py
new file mode 100644
index 00000000..469bfc68
--- /dev/null
+++ b/tests/test_cusparse.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python
+
+"""
+Unit tests for skcuda.cusparse
+"""
+
+from unittest import main, TestCase, TestSuite
+
+import pycuda.driver
+import pycuda.gpuarray as gpuarray
+from pycuda.tools import clear_context_caches, make_default_context
+import numpy as np
+
+import skcuda.cusparse as cusparse
+
+def check_batch_tridiagonal(dl, d, du, x, y, m, batchCount, batchStride=None,
+                            atol=1e-8):
+    """
+    Check all solutions from a batched tridiagonal routine.
+    """
+    if batchStride is None:
+        batchStride = m
+
+    for ii in range(batchCount):
+        A_sys = np.diagflat(dl[ii*batchStride+1:ii*batchStride+m], -1) +\
+                np.diagflat(d[ii*batchStride:ii*batchStride+m], 0) + \
+                np.diagflat(du[ii*batchStride:ii*batchStride+m-1], 1)
+        x_sys = x[ii*batchStride:ii*batchStride+m]
+        y_sys = y[ii*batchStride:ii*batchStride+m]
+        assert(np.allclose(np.dot(A_sys, y_sys), x_sys, atol=atol))
+
+def tridiagonal_system(m, batchCount, batchStride=None, seed=None,
+                       dtype=np.float32):
+    """
+    Create a batch of random tridiagonal systems of a given size.
+    """
+    if batchStride is None:
+        batchStride = m
+    if seed is not None:
+        np.random.seed(seed)
+
+    dl = np.zeros(batchStride*batchCount).astype(dtype)
+    d = np.zeros(batchStride*batchCount).astype(dtype)
+    du = np.zeros(batchStride*batchCount).astype(dtype)
+    x = np.zeros(batchStride*batchCount).astype(dtype)
+
+    for ii in range(batchCount):
+        dl[ii*batchStride+1:ii*batchStride+m] = np.random.rand(m-1)
+        d[ii*batchStride:ii*batchStride+m] = np.random.rand(m)
+        du[ii*batchStride:ii*batchStride+m-1] = np.random.rand(m-1)
+        x[ii*batchStride:ii*batchStride+m] = np.random.rand(m)
+
+    return dl, d, du, x
+
+class test_cusparse(TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.ctx = make_default_context()
+        cls.cusparse_handle = cusparse.cusparseCreate()
+
+    @classmethod
+    def tearDownClass(cls):
+        cusparse.cusparseDestroy(cls.cusparse_handle)
+        cls.ctx.pop()
+        clear_context_caches()
+
+    def setUp(self):
+        np.random.seed(23)    # For reproducible tests.
+
+    def test_cusparseSgtsv2StridedBatch(self):
+        m = 6
+        batchCount = 9
+        batchStride = 11
+
+        dl, d, du, x = tridiagonal_system(m, batchCount, batchStride, seed=23)
+
+        dl_gpu = gpuarray.to_gpu(dl)
+        d_gpu = gpuarray.to_gpu(d)
+        du_gpu = gpuarray.to_gpu(du)
+        x_gpu = gpuarray.to_gpu(x)
+
+        bufferSizeInBytes = cusparse.cusparseSgtsv2StridedBatch_bufferSizeExt(
+            self.cusparse_handle, m, dl_gpu.gpudata, d_gpu.gpudata,
+            du_gpu.gpudata, x_gpu.gpudata, batchCount, batchStride)
+        pBuffer = pycuda.driver.mem_alloc(bufferSizeInBytes)
+
+        cusparse.cusparseSgtsv2StridedBatch(self.cusparse_handle, m,
+            dl_gpu.gpudata, d_gpu.gpudata, du_gpu.gpudata, x_gpu.gpudata,
+            batchCount, batchStride, pBuffer)
+
+        sln = x_gpu.get()
+        # Loosen atol for single precision.
+        check_batch_tridiagonal(dl, d, du, x, sln, m, batchCount, batchStride,
+                                atol=1e-4)
+
+    def test_cusparseDgtsv2StridedBatch(self):
+        m = 6
+        batchCount = 9
+        batchStride = 11
+
+        dl, d, du, x = tridiagonal_system(m, batchCount, batchStride, seed=23,
+                                          dtype=np.float64)
+
+        dl_gpu = gpuarray.to_gpu(dl)
+        d_gpu = gpuarray.to_gpu(d)
+        du_gpu = gpuarray.to_gpu(du)
+        x_gpu = gpuarray.to_gpu(x)
+
+        bufferSizeInBytes = cusparse.cusparseDgtsv2StridedBatch_bufferSizeExt(
+            self.cusparse_handle, m, dl_gpu.gpudata, d_gpu.gpudata,
+            du_gpu.gpudata, x_gpu.gpudata, batchCount, batchStride)
+        pBuffer = pycuda.driver.mem_alloc(bufferSizeInBytes)
+
+        cusparse.cusparseDgtsv2StridedBatch(self.cusparse_handle, m,
+            dl_gpu.gpudata, d_gpu.gpudata, du_gpu.gpudata, x_gpu.gpudata,
+            batchCount, batchStride, pBuffer)
+
+        sln = x_gpu.get()
+        # Double precision is accurate enough for the default atol.
+        check_batch_tridiagonal(dl, d, du, x, sln, m, batchCount, batchStride)
+
+    def test_cusparseSgtsvInterleavedBatch(self):
+        m = 6
+        batchCount = 9
+
+        dl, d, du, x = tridiagonal_system(m, batchCount, seed=23)
+
+        # Convert to interleaved format by switching from row-major order
+        # (numpy default) to column-major.
+        dl_int = np.reshape(dl, (batchCount, m)).ravel('F')
+        d_int = np.reshape(d, (batchCount, m)).ravel('F')
+        du_int = np.reshape(du, (batchCount, m)).ravel('F')
+        x_int = np.reshape(x, (batchCount, m)).ravel('F')
+
+        for algo in range(3):
+            dl_int_gpu = gpuarray.to_gpu(dl_int)
+            d_int_gpu = gpuarray.to_gpu(d_int)
+            du_int_gpu = gpuarray.to_gpu(du_int)
+            x_int_gpu = gpuarray.to_gpu(x_int)
+
+            pBufferSizeInBytes = cusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt(
+                self.cusparse_handle, algo, m, dl_int_gpu.gpudata,
+                d_int_gpu.gpudata, du_int_gpu.gpudata, x_int_gpu.gpudata,
+                batchCount)
+
+            pBuffer = pycuda.driver.mem_alloc(pBufferSizeInBytes)
+
+            cusparse.cusparseSgtsvInterleavedBatch(self.cusparse_handle, algo, m,
+                dl_int_gpu.gpudata, d_int_gpu.gpudata, du_int_gpu.gpudata,
+                x_int_gpu.gpudata, batchCount, pBuffer)
+
+            sln_int = x_int_gpu.get()
+            # Convert back from interleaved format.
+            sln = np.reshape(sln_int, (m, batchCount)).ravel('F')
+            # cuThomas (algo 0) is not numerically stable, so loosen atol.
+            check_batch_tridiagonal(dl, d, du, x, sln, m, batchCount,
+                                    atol=1e-6)
+
+    def test_cusparseDgtsvInterleavedBatch(self):
+        m = 6
+        batchCount = 9
+
+        dl, d, du, x = tridiagonal_system(m, batchCount, seed=23,
+                                          dtype=np.float64)
+
+        # Convert to interleaved format by switching from row-major order
+        # (numpy default) to column-major.
+        dl_int = np.reshape(dl, (batchCount, m)).ravel('F')
+        d_int = np.reshape(d, (batchCount, m)).ravel('F')
+        du_int = np.reshape(du, (batchCount, m)).ravel('F')
+        x_int = np.reshape(x, (batchCount, m)).ravel('F')
+
+        for algo in range(3):
+            dl_int_gpu = gpuarray.to_gpu(dl_int)
+            d_int_gpu = gpuarray.to_gpu(d_int)
+            du_int_gpu = gpuarray.to_gpu(du_int)
+            x_int_gpu = gpuarray.to_gpu(x_int)
+
+            pBufferSizeInBytes = cusparse.cusparseDgtsvInterleavedBatch_bufferSizeExt(
+                self.cusparse_handle, algo, m, dl_int_gpu.gpudata,
+                d_int_gpu.gpudata, du_int_gpu.gpudata, x_int_gpu.gpudata,
+                batchCount)
+
+            pBuffer = pycuda.driver.mem_alloc(pBufferSizeInBytes)
+
+            cusparse.cusparseDgtsvInterleavedBatch(self.cusparse_handle, algo, m,
+                dl_int_gpu.gpudata, d_int_gpu.gpudata, du_int_gpu.gpudata,
+                x_int_gpu.gpudata, batchCount, pBuffer)
+
+            sln_int = x_int_gpu.get()
+            # Convert back from interleaved format.
+            sln = np.reshape(sln_int, (m, batchCount)).ravel('F')
+            check_batch_tridiagonal(dl, d, du, x, sln, m, batchCount)
+
+    def test_cusparseGetSetStream(self):
+        initial_stream = cusparse.cusparseGetStream(self.cusparse_handle)
+        # Switch stream
+        cusparse.cusparseSetStream(self.cusparse_handle, initial_stream+1)
+        final_stream = cusparse.cusparseGetStream(self.cusparse_handle)
+        assert(final_stream == initial_stream+1)
+
+    def test_cusparseGetVersion(self):
+        version = cusparse.cusparseGetVersion(self.cusparse_handle)
+        assert(version > 0)
+
+def suite():
+    s = TestSuite()
+    s.addTest(test_cusparse('test_cusparseSgtsv2StridedBatch'))
+    s.addTest(test_cusparse('test_cusparseDgtsv2StridedBatch'))
+    s.addTest(test_cusparse('test_cusparseSgtsvInterleavedBatch'))
+    s.addTest(test_cusparse('test_cusparseDgtsvInterleavedBatch'))
+
+    s.addTest(test_cusparse('test_cusparseGetSetStream'))
+    s.addTest(test_cusparse('test_cusparseGetVersion'))
+    return s
+
+if __name__ == '__main__':
+    main(defaultTest='suite')
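
For reference, a minimal end-to-end sketch of the gtsv2StridedBatch workflow added by this patch, modeled on the unit tests above (the sizes and diagonal values are arbitrary, and `pycuda.autoinit` is assumed to supply the CUDA context):

import numpy as np
import pycuda.autoinit                 # creates a CUDA context on import
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import skcuda.cusparse as cusparse

m, batchCount = 8, 4                   # system size and number of systems
# batchCount diagonally dominant systems stored back to back (batchStride == m).
# The first element of each dl and the last element of each du must be zero.
dl = np.tile(np.r_[0.0, -np.ones(m - 1)], batchCount).astype(np.float32)
d = 4.0*np.ones(m*batchCount, dtype=np.float32)
du = np.tile(np.r_[-np.ones(m - 1), 0.0], batchCount).astype(np.float32)
x = np.ones(m*batchCount, dtype=np.float32)  # right-hand sides; solved in place

dl_gpu, d_gpu, du_gpu, x_gpu = [gpuarray.to_gpu(v) for v in (dl, d, du, x)]

handle = cusparse.cusparseCreate()
nbytes = cusparse.cusparseSgtsv2StridedBatch_bufferSizeExt(
    handle, m, dl_gpu.gpudata, d_gpu.gpudata, du_gpu.gpudata, x_gpu.gpudata,
    batchCount, m)
pBuffer = drv.mem_alloc(max(nbytes, 1))      # work buffer (mem_alloc(0) may fail)
cusparse.cusparseSgtsv2StridedBatch(
    handle, m, dl_gpu.gpudata, d_gpu.gpudata, du_gpu.gpudata, x_gpu.gpudata,
    batchCount, m, pBuffer)
y = x_gpu.get()                              # solutions, one system per m entries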
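Continuing directly from the sketch above, the interleaved variant differs only in the algo parameter and the element-interleaved layout, produced here with the same reshape/ravel trick the tests use:

# Interleave so that element i of every system is contiguous in memory.
dl_i, d_i, du_i, x_i = [v.reshape(batchCount, m).ravel('F')
                        for v in (dl, d, du, x)]
dl_gpu, d_gpu, du_gpu, x_gpu = [gpuarray.to_gpu(v) for v in (dl_i, d_i, du_i, x_i)]

algo = 2                               # 0: cuThomas, 1: LU with pivoting, 2: QR
nbytes = cusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt(
    handle, algo, m, dl_gpu.gpudata, d_gpu.gpudata, du_gpu.gpudata,
    x_gpu.gpudata, batchCount)
pBuffer = drv.mem_alloc(max(nbytes, 1))
cusparse.cusparseSgtsvInterleavedBatch(
    handle, algo, m, dl_gpu.gpudata, d_gpu.gpudata, du_gpu.gpudata,
    x_gpu.gpudata, batchCount, pBuffer)
# De-interleave: back to one contiguous system per m entries.
y = x_gpu.get().reshape(m, batchCount).ravel('F')
cusparse.cusparseDestroy(handle)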