diff --git a/docs/source/reference.rst b/docs/source/reference.rst
index c8e59ccc..c3ef351a 100644
--- a/docs/source/reference.rst
+++ b/docs/source/reference.rst
@@ -13,6 +13,7 @@ Library Wrapper Routines
    reference_cusolver
    reference_cula
    reference_pcula
+   reference_cusparse
 
 High-Level Routines
 -------------------
diff --git a/docs/source/reference_cusparse.rst b/docs/source/reference_cusparse.rst
new file mode 100644
index 00000000..e14a5ab3
--- /dev/null
+++ b/docs/source/reference_cusparse.rst
@@ -0,0 +1,43 @@
+.. -*- rst -*-
+
+.. currentmodule:: skcuda.cusparse
+
+CUSPARSE Routines
+=================
+
+Helper Routines
+---------------
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    cusparseCreate
+    cusparseDestroy
+    cusparseGetVersion
+    cusparseSetStream
+    cusparseGetStream
+
+Wrapper Routines
+----------------
+
+Single Precision Routines
+^^^^^^^^^^^^^^^^^^^^^^^^^
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    cusparseSgtsv2StridedBatch_bufferSizeExt
+    cusparseSgtsv2StridedBatch
+    cusparseSgtsvInterleavedBatch_bufferSizeExt
+    cusparseSgtsvInterleavedBatch
+
+Double Precision Routines
+^^^^^^^^^^^^^^^^^^^^^^^^^
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    cusparseDgtsv2StridedBatch_bufferSizeExt
+    cusparseDgtsv2StridedBatch
+    cusparseDgtsvInterleavedBatch_bufferSizeExt
+    cusparseDgtsvInterleavedBatch
diff --git a/skcuda/cusparse.py b/skcuda/cusparse.py
index db2c7888..a8442b6e 100644
--- a/skcuda/cusparse.py
+++ b/skcuda/cusparse.py
@@ -6,19 +6,17 @@
 Note: this module does not explicitly depend on PyCUDA.
 """
 
-import atexit
-import ctypes.util
+from __future__ import absolute_import
+
+import ctypes
 import platform
 from string import Template
 import sys
-import warnings
-
-import numpy as np
 
-import cuda
+from . import cuda
 
 # Load library:
-_version_list = [10.1, 10.0, 9.2, 9.1, 9.0, 8.0, 7.5, 7.0, 6.5, 6.0, 5.5, 5.0, 4.0]
+_linux_version_list = [10.2, 10.1, 10.0, 9.2, 9.1, 9.0, 8.0, 7.5, 7.0, 6.5, 6.0, 5.5, 5.0, 4.0]
 _win32_version_list = [10, 10, 100, 92, 91, 90, 80, 75, 70, 65, 60, 55, 50, 40]
 if 'linux' in sys.platform:
     _libcusparse_libname_list = ['libcusparse.so'] + \
@@ -86,6 +84,7 @@ class cusparseStatusMatrixTypeNotSupported(cusparseError):
     """The matrix type is not supported by this function"""
     pass
 
+# TODO: Check that this is a complete list of exceptions and that the numbers are correct.
 cusparseExceptions = {
     1: cusparseStatusNotInitialized,
     2: cusparseStatusAllocFailed,
@@ -97,41 +96,6 @@ class cusparseStatusMatrixTypeNotSupported(cusparseError):
     8: cusparseStatusMatrixTypeNotSupported,
 }
 
-# Matrix types:
-CUSPARSE_MATRIX_TYPE_GENERAL = 0
-CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1
-CUSPARSE_MATRIX_TYPE_HERMITIAN = 2
-CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3
-
-CUSPARSE_FILL_MODE_LOWER = 0
-CUSPARSE_FILL_MODE_UPPER = 1
-
-# Whether or not a matrix' diagonal entries are unity:
-CUSPARSE_DIAG_TYPE_NON_UNIT = 0
-CUSPARSE_DIAG_TYPE_UNIT = 1
-
-# Matrix index bases:
-CUSPARSE_INDEX_BASE_ZERO = 0
-CUSPARSE_INDEX_BASE_ONE = 1
-
-# Operation types:
-CUSPARSE_OPERATION_NON_TRANSPOSE = 0
-CUSPARSE_OPERATION_TRANSPOSE = 1
-CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2
-
-# Whether or not to parse elements of a dense matrix row or column-wise.
-CUSPARSE_DIRECTION_ROW = 0
-CUSPARSE_DIRECTION_COLUMN = 1
-
-# Helper functions:
-class cusparseMatDescr(ctypes.Structure):
-    _fields_ = [
-        ('MatrixType', ctypes.c_int),
-        ('FillMode', ctypes.c_int),
-        ('DiagType', ctypes.c_int),
-        ('IndexBase', ctypes.c_int)
-        ]
-
 def cusparseCheckStatus(status):
     """
     Raise CUSPARSE exception
@@ -147,9 +111,7 @@ def cusparseCheckStatus(status):
     See Also
     --------
     cusparseExceptions
-
     """
-
     if status != 0:
         try:
             raise cusparseExceptions[status]
@@ -169,35 +131,30 @@ def cusparseCreate():
     -------
     handle : int
         CUSPARSE library context.
-
     """
-
-    handle = ctypes.c_int()
+    handle = ctypes.c_void_p()
     status = _libcusparse.cusparseCreate(ctypes.byref(handle))
     cusparseCheckStatus(status)
     return handle.value
 
 _libcusparse.cusparseDestroy.restype = int
-_libcusparse.cusparseDestroy.argtypes = [ctypes.c_int]
+_libcusparse.cusparseDestroy.argtypes = [ctypes.c_void_p]
 def cusparseDestroy(handle):
     """
     Release CUSPARSE resources.
 
-    Releases hardware resources used by CUSPARSE
+    Releases hardware resources used by CUSPARSE.
 
     Parameters
     ----------
     handle : int
         CUSPARSE library context.
-
     """
-
     status = _libcusparse.cusparseDestroy(handle)
     cusparseCheckStatus(status)
 
 _libcusparse.cusparseGetVersion.restype = int
-_libcusparse.cusparseGetVersion.argtypes = [ctypes.c_int,
-                                            ctypes.c_void_p]
+_libcusparse.cusparseGetVersion.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
 def cusparseGetVersion(handle):
     """
     Return CUSPARSE library version.
@@ -213,9 +170,7 @@ def cusparseGetVersion(handle):
     -------
     version : int
         CUSPARSE library version number.
-
     """
-
     version = ctypes.c_int()
     status = _libcusparse.cusparseGetVersion(handle,
                                              ctypes.byref(version))
@@ -223,8 +178,7 @@ def cusparseGetVersion(handle):
     return version.value
 
 _libcusparse.cusparseSetStream.restype = int
-_libcusparse.cusparseSetStream.argtypes = [ctypes.c_int,
-                                           ctypes.c_int]
+_libcusparse.cusparseSetStream.argtypes = [ctypes.c_void_p, ctypes.c_int]
 def cusparseSetStream(handle, id):
     """
     Sets the CUSPARSE stream in which kernels will run.
@@ -235,154 +189,346 @@ def cusparseSetStream(handle, id):
         CUSPARSE library context.
     id : int
         Stream ID.
-
     """
-
     status = _libcusparse.cusparseSetStream(handle, id)
     cusparseCheckStatus(status)
 
-_libcusparse.cusparseCreateMatDescr.restype = int
-_libcusparse.cusparseCreateMatDescr.argtypes = [cusparseMatDescr]
-def cusparseCreateMatDescr():
+_libcusparse.cusparseGetStream.restype = int
+_libcusparse.cusparseGetStream.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
+def cusparseGetStream(handle):
     """
-    Initialize a sparse matrix descriptor.
+    Gets the CUSPARSE stream in which kernels will run.
 
-    Initializes the `MatrixType` and `IndexBase` fields of the matrix
-    descriptor to the default values `CUSPARSE_MATRIX_TYPE_GENERAL`
-    and `CUSPARSE_INDEX_BASE_ZERO`.
+    Parameters
+    ----------
+    handle : int
+        CUSPARSE library context.
 
     Returns
     -------
-    desc : cusparseMatDescr
-        Matrix descriptor.
-
+    id : int
+        Stream ID.
     """
-
-    desc = cusparseMatrixDesc()
-    status = _libcusparse.cusparseCreateMatDescr(ctypes.byref(desc))
+    id = ctypes.c_int()
+    status = _libcusparse.cusparseGetStream(handle, ctypes.byref(id))
     cusparseCheckStatus(status)
-    return desc
+    return id.value
 
-_libcusparse.cusparseDestroyMatDescr.restype = int
-_libcusparse.cusparseDestroyMatDescr.argtypes = [ctypes.c_int]
-def cusparseDestroyMatDescr(desc):
+gtsv2StridedBatch_bufferSizeExt_doc = Template(
     """
-    Releases the memory allocated for the matrix descriptor.
+    Calculate size of work buffer used by cusparsegtsv2StridedBatch.
 
     Parameters
     ----------
-    desc : cusparseMatDescr
-        Matrix descriptor.
+    handle : int
+        cuSPARSE context
+    m : int
+        Size of the linear system (must be >= 3)
+    dl : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the lower
+        diagonal of the tri-diagonal linear system. The lower diagonal dl(i)
+        that corresponds to the ith linear system starts at location
+        dl+batchStride*i in memory. Also, the first element of each lower
+        diagonal must be zero.
+    d : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the main
+        diagonal of the tri-diagonal linear system. The main diagonal d(i)
+        that corresponds to the ith linear system starts at location
+        d+batchStride*i in memory.
+    du : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the upper
+        diagonal of the tri-diagonal linear system. The upper diagonal du(i)
+        that corresponds to the ith linear system starts at location
+        du+batchStride*i in memory. Also, the last element of each upper
+        diagonal must be zero.
+    x : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array that contains the
+        right-hand-side of the tri-diagonal linear system. The
+        right-hand-side x(i) that corresponds to the ith linear system
+        starts at location x+batchStride*i in memory.
+    batchCount : int
+        Number of systems to solve.
+    batchStride : int
+        Stride (number of elements) that separates the vectors of every
+        system (must be at least m).
 
-    """
+    Returns
+    -------
+    bufferSizeInBytes : int
+        Number of bytes of the buffer used in gtsv2StridedBatch.
 
-    status = _libcusparse.cusparseDestroyMatDescr(desc)
+    References
+    ----------
+    `cusparsegtsv2StridedBatch_bufferSizeExt `_
+    """
+)
+
+_libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt.restype = int
+_libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt.argtypes =\
+    [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p,
+     ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
+     ctypes.c_void_p
+    ]
+def cusparseSgtsv2StridedBatch_bufferSizeExt(handle, m, dl, d, du, x, batchCount, batchStride):
+    # The C API writes a size_t, so use c_size_t rather than c_int here.
+    bufferSizeInBytes = ctypes.c_size_t()
+    status = _libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt(
+        handle, m, int(dl), int(d), int(du), int(x), batchCount, batchStride,
+        ctypes.byref(bufferSizeInBytes))
+    cusparseCheckStatus(status)
+    return bufferSizeInBytes.value
+cusparseSgtsv2StridedBatch_bufferSizeExt.__doc__ = \
+    gtsv2StridedBatch_bufferSizeExt_doc.substitute(precision='single precision', real='real')
+
+_libcusparse.cusparseDgtsv2StridedBatch_bufferSizeExt.restype = int
+_libcusparse.cusparseDgtsv2StridedBatch_bufferSizeExt.argtypes =\
+    _libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt.argtypes
+def cusparseDgtsv2StridedBatch_bufferSizeExt(handle, m, dl, d, du, x, batchCount, batchStride):
+    bufferSizeInBytes = ctypes.c_size_t()
+    status = _libcusparse.cusparseDgtsv2StridedBatch_bufferSizeExt(
+        handle, m, int(dl), int(d), int(du), int(x), batchCount, batchStride,
+        ctypes.byref(bufferSizeInBytes))
+    cusparseCheckStatus(status)
+    return bufferSizeInBytes.value
+cusparseDgtsv2StridedBatch_bufferSizeExt.__doc__ = \
+    gtsv2StridedBatch_bufferSizeExt_doc.substitute(precision='double precision', real='real')
 
-_libcusparse.cusparseSetMatType.restype = int
-_libcusparse.cusparseSetMatType.argtypes = [cusparseMatDescr,
-                                            ctypes.c_int]
-def cusparseSetMatType(desc, type):
+gtsv2StridedBatch_doc = Template(
     """
-    Sets the matrix type of the specified matrix.
+    Compute the solution of multiple tridiagonal linear systems.
+
+    Solves multiple tridiagonal linear systems, for i=0,…,batchCount:
+        A(i) ∗ y(i) = x(i)
+    The coefficient matrix A of each of these tri-diagonal linear systems is
+    defined with three vectors corresponding to its lower (dl), main (d), and
+    upper (du) matrix diagonals; the right-hand sides are stored in the dense
+    matrix X. Notice that the solution Y overwrites the right-hand-side matrix
+    X on exit. The different matrices are assumed to be of the same size and
+    are stored with a fixed batchStride in memory.
+
+    The routine does not perform any pivoting and uses a combination of the
+    Cyclic Reduction (CR) and the Parallel Cyclic Reduction (PCR) algorithms to
+    find the solution. It achieves better performance when m is a power of 2.
 
     Parameters
     ----------
-    desc : cusparseMatDescr
-        Matrix descriptor.
-    type : int
-        Matrix type.
-
+    handle : int
+        cuSPARSE context
+    m : int
+        Size of the linear system (must be >= 3)
+    dl : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the lower
+        diagonal of the tri-diagonal linear system. The lower diagonal dl(i)
+        that corresponds to the ith linear system starts at location
+        dl+batchStride*i in memory. Also, the first element of each lower
+        diagonal must be zero.
+    d : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the main
+        diagonal of the tri-diagonal linear system. The main diagonal d(i)
+        that corresponds to the ith linear system starts at location
+        d+batchStride*i in memory.
+    du : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the upper
+        diagonal of the tri-diagonal linear system. The upper diagonal du(i)
+        that corresponds to the ith linear system starts at location
+        du+batchStride*i in memory. Also, the last element of each upper
+        diagonal must be zero.
+    x : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array that contains the
+        right-hand-side of the tri-diagonal linear system. The
+        right-hand-side x(i) that corresponds to the ith linear system
+        starts at location x+batchStride*i in memory.
+    batchCount : int
+        Number of systems to solve.
+    batchStride : int
+        Stride (number of elements) that separates the vectors of every
+        system (must be at least m).
+    pBuffer : ctypes.c_void_p
+        Buffer allocated by the user; the size is returned by
+        gtsv2StridedBatch_bufferSizeExt.
+
+    References
+    ----------
+    `cusparsegtsv2StridedBatch `_
     """
-
-    status = _libcusparse.cusparseSetMatType(desc, type)
+)
+
+_libcusparse.cusparseSgtsv2StridedBatch.restype = int
+_libcusparse.cusparseSgtsv2StridedBatch.argtypes =\
+    [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p,
+     ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
+     ctypes.c_void_p
+    ]
+def cusparseSgtsv2StridedBatch(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer):
+    status = _libcusparse.cusparseSgtsv2StridedBatch(
+        handle, m, int(dl), int(d), int(du), int(x), batchCount, batchStride, int(pBuffer))
     cusparseCheckStatus(status)
+cusparseSgtsv2StridedBatch.__doc__ = \
+    gtsv2StridedBatch_doc.substitute(precision='single precision', real='real')
+
+_libcusparse.cusparseDgtsv2StridedBatch.restype = int
+_libcusparse.cusparseDgtsv2StridedBatch.argtypes =\
+    _libcusparse.cusparseSgtsv2StridedBatch.argtypes
+def cusparseDgtsv2StridedBatch(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer):
+    status = _libcusparse.cusparseDgtsv2StridedBatch(
+        handle, m, int(dl), int(d), int(du), int(x), batchCount, batchStride, int(pBuffer))
+    cusparseCheckStatus(status)
+cusparseDgtsv2StridedBatch.__doc__ = \
+    gtsv2StridedBatch_doc.substitute(precision='double precision', real='real')
 
-_libcusparse.cusparseGetMatType.restype = int
-_libcusparse.cusparseGetMatType.argtypes = [cusparseMatDescr]
-def cusparseGetMatType(desc):
+gtsv2InterleavedBatch_bufferSizeExt_doc = Template(
     """
-    Gets the matrix type of the specified matrix.
+    Calculate size of work buffer used by cusparsegtsvInterleavedBatch.
 
     Parameters
     ----------
-    desc : cusparseMatDescr
-        Matrix descriptor.
-
-    Returns
-    -------
-    type : int
-        Matrix type.
-
+    handle : int
+        cuSPARSE context
+    algo : int
+        algo = 0: cuThomas (unstable algorithm); algo = 1: LU with pivoting
+        (stable algorithm); algo = 2: QR (stable algorithm)
+    m : int
+        Size of the linear system
+    dl : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the lower
+        diagonal of the tri-diagonal linear system. The first element of each
+        lower diagonal must be zero.
+    d : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the main
+        diagonal of the tri-diagonal linear system.
+    du : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the upper
+        diagonal of the tri-diagonal linear system. The last element of each
+        upper diagonal must be zero.
+    x : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array that contains the
+        right-hand-side of the tri-diagonal linear system.
+    batchCount : int
+        Number of systems to solve.
+
+    Returns
+    -------
+    pBufferSizeInBytes : int
+        Number of bytes of the buffer used in gtsvInterleavedBatch.
+
+    References
+    ----------
+    `cusparsegtsvInterleavedBatch `_
     """
+)
+
+_libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt.restype = int
+_libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt.argtypes =\
+    [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p,
+     ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int,
+     ctypes.c_void_p
+    ]
+def cusparseSgtsvInterleavedBatch_bufferSizeExt(handle, algo, m, dl, d, du, x, batchCount):
+    # The C API writes a size_t, so use c_size_t rather than c_int here.
+    pBufferSizeInBytes = ctypes.c_size_t()
+    status = _libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt(
+        handle, algo, m, int(dl), int(d), int(du), int(x), batchCount,
+        ctypes.byref(pBufferSizeInBytes))
+    cusparseCheckStatus(status)
+    return pBufferSizeInBytes.value
+cusparseSgtsvInterleavedBatch_bufferSizeExt.__doc__ = \
+    gtsv2InterleavedBatch_bufferSizeExt_doc.substitute(precision='single precision', real='real')
+
+_libcusparse.cusparseDgtsvInterleavedBatch_bufferSizeExt.restype = int
+_libcusparse.cusparseDgtsvInterleavedBatch_bufferSizeExt.argtypes =\
+    _libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt.argtypes
+def cusparseDgtsvInterleavedBatch_bufferSizeExt(handle, algo, m, dl, d, du, x, batchCount):
+    pBufferSizeInBytes = ctypes.c_size_t()
+    status = _libcusparse.cusparseDgtsvInterleavedBatch_bufferSizeExt(
+        handle, algo, m, int(dl), int(d), int(du), int(x), batchCount,
+        ctypes.byref(pBufferSizeInBytes))
+    cusparseCheckStatus(status)
+    return pBufferSizeInBytes.value
+cusparseDgtsvInterleavedBatch_bufferSizeExt.__doc__ = \
+    gtsv2InterleavedBatch_bufferSizeExt_doc.substitute(precision='double precision', real='real')
 
-    return _libcusparse.cusparseGetMatType(desc)
-
-# Format conversion functions:
-_libcusparse.cusparseSnnz.restype = int
-_libcusparse.cusparseSnnz.argtypes = [ctypes.c_int,
-                                      ctypes.c_int,
-                                      ctypes.c_int,
-                                      ctypes.c_int,
-                                      cusparseMatDescr,
-                                      ctypes.c_void_p,
-                                      ctypes.c_int,
-                                      ctypes.c_void_p,
-                                      ctypes.c_void_p]
-def cusparseSnnz(handle, dirA, m, n, descrA, A, lda,
-                 nnzPerRowColumn, nnzTotalDevHostPtr):
+gtsvInterleavedBatch_doc = Template(
     """
-    Compute number of non-zero elements per row, column, or dense matrix.
+    Compute the solution of multiple tridiagonal linear systems.
+
+    Solves multiple tridiagonal linear systems, for i=0,…,batchCount:
+        A(i) ∗ y(i) = x(i)
+    The coefficient matrix A of each of these tri-diagonal linear systems is
+    defined with three vectors corresponding to its lower (dl), main (d), and
+    upper (du) matrix diagonals; the right-hand sides are stored in the dense
+    matrix X. Notice that the solution Y overwrites the right-hand-side matrix
+    X on exit. The different matrices are assumed to be of the same size and
+    are stored interleaved in memory, as described below.
+
+    Assuming A is of size m with base-1 indexing, dl, d and du are defined by
+    the following formulas:
+        dl(i) := A(i, i-1) for i=1,2,...,m
+    The first element of dl is out-of-bound (dl(1) := A(1,0)), so dl(1) = 0.
+        d(i) = A(i,i) for i=1,2,...,m
+        du(i) = A(i,i+1) for i=1,2,...,m
+    The last element of du is out-of-bound (du(m) := A(m,m+1)), so du(m) = 0.
+
+    The data layout is different from gtsvStridedBatch, which aggregates all
+    matrices one after another. Instead, gtsvInterleavedBatch gathers the
+    elements with the same index from the different matrices in a continuous
+    manner: if dl is regarded as a 2-D array of size m-by-batchCount, dl(:,j)
+    stores the j-th matrix. gtsvStridedBatch uses column-major layout while
+    gtsvInterleavedBatch uses row-major.
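+
+    As an illustration of the difference (notation only: dl_j(i) below means
+    element i of the lower diagonal of system j), for m = 3 and batchCount = 2,
+        gtsvStridedBatch stores     dl = [dl_0(1), dl_0(2), dl_0(3),
+                                          dl_1(1), dl_1(2), dl_1(3)]
+        gtsvInterleavedBatch stores dl = [dl_0(1), dl_1(1), dl_0(2),
+                                          dl_1(2), dl_0(3), dl_1(3)]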
+
+    The routine provides three different algorithms, selected by parameter
+    algo. The first algorithm is cuThomas, provided by the Barcelona
+    Supercomputing Center. The second algorithm is LU with partial pivoting,
+    and the last algorithm is QR. From a stability perspective, cuThomas is
+    not numerically stable because it does not have pivoting; LU with partial
+    pivoting and QR are stable. From a performance perspective, LU with
+    partial pivoting and QR are about 10% to 20% slower than cuThomas.
 
     Parameters
     ----------
     handle : int
-        CUSPARSE library context.
-    dirA : int
-        Data direction of elements.
+        cuSPARSE context
+    algo : int
+        algo = 0: cuThomas (unstable algorithm); algo = 1: LU with pivoting
+        (stable algorithm); algo = 2: QR (stable algorithm)
     m : int
-        Rows in A.
-    n : int
-        Columns in A.
-    descrA : cusparseMatDescr
-        Matrix descriptor.
-    A : pycuda.gpuarray.GPUArray
-        Dense matrix of dimensions (lda, n).
-    lda : int
-        Leading dimension of A.
-
-    Returns
-    -------
-    nnzPerRowColumn : pycuda.gpuarray.GPUArray
-        Array of length m or n containing the number of
-        non-zero elements per row or column, respectively.
-    nnzTotalDevHostPtr : pycuda.gpuarray.GPUArray
-        Total number of non-zero elements in device or host memory.
-
+        Size of the linear system
+    dl : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the lower
+        diagonal of the tri-diagonal linear system. The first element of each
+        lower diagonal must be zero.
+    d : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the main
+        diagonal of the tri-diagonal linear system.
+    du : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array containing the upper
+        diagonal of the tri-diagonal linear system. The last element of each
+        upper diagonal must be zero.
+    x : ctypes.c_void_p
+        Pointer to ${precision} ${real} dense array that contains the
+        right-hand-side of the tri-diagonal linear system.
+    batchCount : int
+        Number of systems to solve.
+    pBuffer : ctypes.c_void_p
+        Buffer allocated by the user; the size is returned by
+        gtsvInterleavedBatch_bufferSizeExt.
+
+    References
+    ----------
+    `cusparsegtsvInterleavedBatch `_
     """
-
-    # Unfinished:
-    nnzPerRowColumn = gpuarray.empty()
-    nnzTotalDevHostPtr = gpuarray.empty()
-
-    status = _libcusparse.cusparseSnnz(handle, dirA, m, n,
-                                       descrA, int(A), lda,
-                                       int(nnzPerRowColumn), int(nnzTotalDevHostPtr))
+)
+
+_libcusparse.cusparseSgtsvInterleavedBatch.restype = int
+_libcusparse.cusparseSgtsvInterleavedBatch.argtypes =\
+    [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p,
+     ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int,
+     ctypes.c_void_p
+    ]
+def cusparseSgtsvInterleavedBatch(handle, algo, m, dl, d, du, x, batchCount, pBuffer):
+    status = _libcusparse.cusparseSgtsvInterleavedBatch(
+        handle, algo, m, int(dl), int(d), int(du), int(x), batchCount, int(pBuffer))
     cusparseCheckStatus(status)
-    return nnzPerVector, nnzHost
-
-_libcusparse.cusparseSdense2csr.restype = int
-_libcusparse.cusparseSdense2csr.argtypes = [ctypes.c_int,
-                                            ctypes.c_int,
-                                            ctypes.c_int,
-                                            cusparseMatDescr,
-                                            ctypes.c_void_p,
-                                            ctypes.c_int,
-                                            ctypes.c_void_p,
-                                            ctypes.c_void_p,
-                                            ctypes.c_void_p,
-                                            ctypes.c_void_p]
-def cusparseSdense2csr(handle, m, n, descrA, A, lda,
-                       nnzPerRow, csrValA, csrRowPtrA, csrColIndA):
-    # Unfinished
-    pass
+cusparseSgtsvInterleavedBatch.__doc__ = \
+    gtsvInterleavedBatch_doc.substitute(precision='single precision', real='real')
+
+_libcusparse.cusparseDgtsvInterleavedBatch.restype = int
+_libcusparse.cusparseDgtsvInterleavedBatch.argtypes =\
+    _libcusparse.cusparseSgtsvInterleavedBatch.argtypes
+def cusparseDgtsvInterleavedBatch(handle, algo, m, dl, d, du, x, batchCount, pBuffer):
+    status = _libcusparse.cusparseDgtsvInterleavedBatch(
+        handle, algo, m, int(dl), int(d), int(du), int(x), batchCount, int(pBuffer))
    cusparseCheckStatus(status)
+cusparseDgtsvInterleavedBatch.__doc__ = \
+    gtsvInterleavedBatch_doc.substitute(precision='double precision', real='real')
diff --git a/tests/test_cusparse.py b/tests/test_cusparse.py
new file mode 100644
index 00000000..469bfc68
--- /dev/null
+++ b/tests/test_cusparse.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python
+
+"""
+Unit tests for skcuda.cusparse
+"""
+
+from unittest import main, TestCase, TestSuite
+
+import pycuda.driver
+import pycuda.gpuarray as gpuarray
+from pycuda.tools import clear_context_caches, make_default_context
+import numpy as np
+
+import skcuda.cusparse as cusparse
+
+def check_batch_tridiagonal(dl, d, du, x, y, m, batchCount, batchStride=None,
+                            atol=1e-8):
+    """
+    Check all solutions from a batched tridiagonal routine.
+    """
+    if batchStride is None:
+        batchStride = m
+
+    for ii in range(batchCount):
+        A_sys = np.diagflat(dl[ii*batchStride+1:ii*batchStride+m], -1) +\
+                np.diagflat(d[ii*batchStride:ii*batchStride+m], 0) + \
+                np.diagflat(du[ii*batchStride:ii*batchStride+m-1], 1)
+        x_sys = x[ii*batchStride:ii*batchStride+m]
+        y_sys = y[ii*batchStride:ii*batchStride+m]
+        assert(np.allclose(np.dot(A_sys, y_sys), x_sys, atol=atol))
+
+def tridiagonal_system(m, batchCount, batchStride=None, seed=None,
+                       dtype=np.float32):
+    """
+    Create a batch of random tridiagonal systems of a given size.
+    """
+    if batchStride is None:
+        batchStride = m
+    if seed is not None:
+        np.random.seed(seed)
+
+    dl = np.zeros(batchStride*batchCount).astype(dtype)
+    d = np.zeros(batchStride*batchCount).astype(dtype)
+    du = np.zeros(batchStride*batchCount).astype(dtype)
+    x = np.zeros(batchStride*batchCount).astype(dtype)
+
+    for ii in range(batchCount):
+        dl[ii*batchStride+1:ii*batchStride+m] = np.random.rand(m-1)
+        d[ii*batchStride:ii*batchStride+m] = np.random.rand(m)
+        du[ii*batchStride:ii*batchStride+m-1] = np.random.rand(m-1)
+        x[ii*batchStride:ii*batchStride+m] = np.random.rand(m)
+
+    return dl, d, du, x
+
+class test_cusparse(TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.ctx = make_default_context()
+        cls.cusparse_handle = cusparse.cusparseCreate()
+
+    @classmethod
+    def tearDownClass(cls):
+        cusparse.cusparseDestroy(cls.cusparse_handle)
+        cls.ctx.pop()
+        clear_context_caches()
+
+    def setUp(self):
+        np.random.seed(23)    # For reproducible tests.
+
+    def test_cusparseSgtsv2StridedBatch(self):
+        m = 6
+        batchCount = 9
+        batchStride = 11
+
+        dl, d, du, x = tridiagonal_system(m, batchCount, batchStride, seed=23)
+
+        dl_gpu = gpuarray.to_gpu(dl)
+        d_gpu = gpuarray.to_gpu(d)
+        du_gpu = gpuarray.to_gpu(du)
+        x_gpu = gpuarray.to_gpu(x)
+
+        bufferSizeInBytes = cusparse.cusparseSgtsv2StridedBatch_bufferSizeExt(
+            self.cusparse_handle, m, dl_gpu.gpudata, d_gpu.gpudata,
+            du_gpu.gpudata, x_gpu.gpudata, batchCount, batchStride)
+        pBuffer = pycuda.driver.mem_alloc(bufferSizeInBytes)
+
+        cusparse.cusparseSgtsv2StridedBatch(self.cusparse_handle, m,
+            dl_gpu.gpudata, d_gpu.gpudata, du_gpu.gpudata, x_gpu.gpudata,
+            batchCount, batchStride, pBuffer)
+
+        sln = x_gpu.get()
+        # Loosen atol for single precision.
+        check_batch_tridiagonal(dl, d, du, x, sln, m, batchCount, batchStride,
+                                atol=1e-4)
+
+    def test_cusparseDgtsv2StridedBatch(self):
+        m = 6
+        batchCount = 9
+        batchStride = 11
+
+        dl, d, du, x = tridiagonal_system(m, batchCount, batchStride, seed=23,
+                                          dtype=np.float64)
+
+        dl_gpu = gpuarray.to_gpu(dl)
+        d_gpu = gpuarray.to_gpu(d)
+        du_gpu = gpuarray.to_gpu(du)
+        x_gpu = gpuarray.to_gpu(x)
+
+        bufferSizeInBytes = cusparse.cusparseDgtsv2StridedBatch_bufferSizeExt(
+            self.cusparse_handle, m, dl_gpu.gpudata, d_gpu.gpudata,
+            du_gpu.gpudata, x_gpu.gpudata, batchCount, batchStride)
+        pBuffer = pycuda.driver.mem_alloc(bufferSizeInBytes)
+
+        cusparse.cusparseDgtsv2StridedBatch(self.cusparse_handle, m,
+            dl_gpu.gpudata, d_gpu.gpudata, du_gpu.gpudata, x_gpu.gpudata,
+            batchCount, batchStride, pBuffer)
+
+        sln = x_gpu.get()
+        # Double precision is accurate enough for the default atol.
+        check_batch_tridiagonal(dl, d, du, x, sln, m, batchCount, batchStride)
+
+    def test_cusparseSgtsvInterleavedBatch(self):
+        m = 6
+        batchCount = 9
+
+        dl, d, du, x = tridiagonal_system(m, batchCount, seed=23)
+
+        # Convert to interleaved format by switching from row-major order
+        # (numpy default) to column-major.
+        dl_int = np.reshape(dl, (batchCount, m)).ravel('F')
+        d_int = np.reshape(d, (batchCount, m)).ravel('F')
+        du_int = np.reshape(du, (batchCount, m)).ravel('F')
+        x_int = np.reshape(x, (batchCount, m)).ravel('F')
+
+        for algo in range(3):
+            dl_int_gpu = gpuarray.to_gpu(dl_int)
+            d_int_gpu = gpuarray.to_gpu(d_int)
+            du_int_gpu = gpuarray.to_gpu(du_int)
+            x_int_gpu = gpuarray.to_gpu(x_int)
+
+            pBufferSizeInBytes = cusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt(
+                self.cusparse_handle, algo, m, dl_int_gpu.gpudata,
+                d_int_gpu.gpudata, du_int_gpu.gpudata, x_int_gpu.gpudata,
+                batchCount)
+
+            pBuffer = pycuda.driver.mem_alloc(pBufferSizeInBytes)
+
+            cusparse.cusparseSgtsvInterleavedBatch(self.cusparse_handle, algo, m,
+                dl_int_gpu.gpudata, d_int_gpu.gpudata, du_int_gpu.gpudata,
+                x_int_gpu.gpudata, batchCount, pBuffer)
+
+            sln_int = x_int_gpu.get()
+            # Convert back from interleaved format.
+            sln = np.reshape(sln_int, (m, batchCount)).ravel('F')
+            # cuThomas (algo 0) is not numerically stable, so loosen atol.
+            check_batch_tridiagonal(dl, d, du, x, sln, m, batchCount,
+                                    atol=1e-6)
+
+    def test_cusparseDgtsvInterleavedBatch(self):
+        m = 6
+        batchCount = 9
+
+        dl, d, du, x = tridiagonal_system(m, batchCount, seed=23,
+                                          dtype=np.float64)
+
+        # Convert to interleaved format by switching from row-major order
+        # (numpy default) to column-major.
+        dl_int = np.reshape(dl, (batchCount, m)).ravel('F')
+        d_int = np.reshape(d, (batchCount, m)).ravel('F')
+        du_int = np.reshape(du, (batchCount, m)).ravel('F')
+        x_int = np.reshape(x, (batchCount, m)).ravel('F')
+
+        for algo in range(3):
+            dl_int_gpu = gpuarray.to_gpu(dl_int)
+            d_int_gpu = gpuarray.to_gpu(d_int)
+            du_int_gpu = gpuarray.to_gpu(du_int)
+            x_int_gpu = gpuarray.to_gpu(x_int)
+
+            pBufferSizeInBytes = cusparse.cusparseDgtsvInterleavedBatch_bufferSizeExt(
+                self.cusparse_handle, algo, m, dl_int_gpu.gpudata,
+                d_int_gpu.gpudata, du_int_gpu.gpudata, x_int_gpu.gpudata,
+                batchCount)
+
+            pBuffer = pycuda.driver.mem_alloc(pBufferSizeInBytes)
+
+            cusparse.cusparseDgtsvInterleavedBatch(self.cusparse_handle, algo, m,
+                dl_int_gpu.gpudata, d_int_gpu.gpudata, du_int_gpu.gpudata,
+                x_int_gpu.gpudata, batchCount, pBuffer)
+
+            sln_int = x_int_gpu.get()
+            # Convert back from interleaved format.
+            sln = np.reshape(sln_int, (m, batchCount)).ravel('F')
+            check_batch_tridiagonal(dl, d, du, x, sln, m, batchCount)
+
+    def test_cusparseGetSetStream(self):
+        initial_stream = cusparse.cusparseGetStream(self.cusparse_handle)
+        # Switch stream
+        cusparse.cusparseSetStream(self.cusparse_handle, initial_stream+1)
+        final_stream = cusparse.cusparseGetStream(self.cusparse_handle)
+        assert(final_stream == initial_stream+1)
+
+    def test_cusparseGetVersion(self):
+        version = cusparse.cusparseGetVersion(self.cusparse_handle)
+        assert(version > 0)
+
+def suite():
+    s = TestSuite()
+    s.addTest(test_cusparse('test_cusparseSgtsv2StridedBatch'))
+    s.addTest(test_cusparse('test_cusparseDgtsv2StridedBatch'))
+    s.addTest(test_cusparse('test_cusparseSgtsvInterleavedBatch'))
+    s.addTest(test_cusparse('test_cusparseDgtsvInterleavedBatch'))
+
+    s.addTest(test_cusparse('test_cusparseGetSetStream'))
+    s.addTest(test_cusparse('test_cusparseGetVersion'))
+    return s
+
+if __name__ == '__main__':
+    main(defaultTest='suite')
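
For reference, a minimal end-to-end sketch of the gtsv2StridedBatch workflow added by this patch, modeled on the unit tests above (the sizes and diagonal values are arbitrary, and `pycuda.autoinit` is assumed to supply the CUDA context):

import numpy as np
import pycuda.autoinit                 # creates a CUDA context on import
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import skcuda.cusparse as cusparse

m, batchCount = 8, 4                   # system size and number of systems
# batchCount diagonally dominant systems stored back to back (batchStride == m).
# The first element of each dl and the last element of each du must be zero.
dl = np.tile(np.r_[0.0, -np.ones(m - 1)], batchCount).astype(np.float32)
d = 4.0*np.ones(m*batchCount, dtype=np.float32)
du = np.tile(np.r_[-np.ones(m - 1), 0.0], batchCount).astype(np.float32)
x = np.ones(m*batchCount, dtype=np.float32)  # right-hand sides; solved in place

dl_gpu, d_gpu, du_gpu, x_gpu = [gpuarray.to_gpu(v) for v in (dl, d, du, x)]

handle = cusparse.cusparseCreate()
nbytes = cusparse.cusparseSgtsv2StridedBatch_bufferSizeExt(
    handle, m, dl_gpu.gpudata, d_gpu.gpudata, du_gpu.gpudata, x_gpu.gpudata,
    batchCount, m)
pBuffer = drv.mem_alloc(max(nbytes, 1))      # work buffer (mem_alloc(0) may fail)
cusparse.cusparseSgtsv2StridedBatch(
    handle, m, dl_gpu.gpudata, d_gpu.gpudata, du_gpu.gpudata, x_gpu.gpudata,
    batchCount, m, pBuffer)
y = x_gpu.get()                              # solutions, one system per m entries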
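Continuing directly from the sketch above, the interleaved variant differs only in the algo parameter and the element-interleaved layout, produced here with the same reshape/ravel trick the tests use:

# Interleave so that element i of every system is contiguous in memory.
dl_i, d_i, du_i, x_i = [v.reshape(batchCount, m).ravel('F')
                        for v in (dl, d, du, x)]
dl_gpu, d_gpu, du_gpu, x_gpu = [gpuarray.to_gpu(v) for v in (dl_i, d_i, du_i, x_i)]

algo = 2                               # 0: cuThomas, 1: LU with pivoting, 2: QR
nbytes = cusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt(
    handle, algo, m, dl_gpu.gpudata, d_gpu.gpudata, du_gpu.gpudata,
    x_gpu.gpudata, batchCount)
pBuffer = drv.mem_alloc(max(nbytes, 1))
cusparse.cusparseSgtsvInterleavedBatch(
    handle, algo, m, dl_gpu.gpudata, d_gpu.gpudata, du_gpu.gpudata,
    x_gpu.gpudata, batchCount, pBuffer)
# De-interleave: back to one contiguous system per m entries.
y = x_gpu.get().reshape(m, batchCount).ravel('F')
cusparse.cusparseDestroy(handle)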