From 7180fe51872f2ac1c71fb98a02a4997ad83b22e7 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Fri, 2 Feb 2024 09:10:52 +0100
Subject: [PATCH 01/29] Unmuted tests from test_mathematical.py scope (#1668)

* Unmuted tests from test_mathematical.py scope

* Removed fallback fixture for dpnp.copysign

* Tests for dpnp.clip with NaN edges require numpy>=1.25.0

* Muted again power tests with complex types (until 2024.1 release)
---
 tests/test_mathematical.py | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/tests/test_mathematical.py b/tests/test_mathematical.py
index 75735e89bc9..1faa0620f7d 100644
--- a/tests/test_mathematical.py
+++ b/tests/test_mathematical.py
@@ -15,6 +15,7 @@
 
 import dpnp
 from dpnp.dpnp_array import dpnp_array
+from tests.third_party.cupy import testing
 
 from .helper import (
     assert_dtype_allclose,
@@ -25,7 +26,6 @@
     get_integer_dtypes,
     has_support_aspect64,
     is_cpu_device,
-    is_win_platform,
 )
 
 
@@ -125,8 +125,7 @@ def test_input_nan(self):
         expected = numpy.clip(np_a, -1, 1)
         assert_array_equal(result, expected)
 
-    # TODO: unmute the test once dpctl resolves the issue
-    @pytest.mark.skip(reason="dpctl-1489 issue")
+    @testing.with_requires("numpy>=1.25.0")
     @pytest.mark.parametrize(
         "kwargs",
         [
@@ -138,7 +137,7 @@ def test_input_nan(self):
         ],
     )
     def test_nan_edges(self, kwargs):
-        np_a = numpy.arange(7)
+        np_a = numpy.arange(7.0)
         dp_a = dpnp.asarray(np_a)
 
         result = dp_a.clip(**kwargs)
@@ -424,7 +423,6 @@ def test_add(self, dtype, lhs, rhs):
     def test_arctan2(self, dtype, lhs, rhs):
         self._test_mathematical("arctan2", dtype, lhs, rhs)
 
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @pytest.mark.parametrize(
         "dtype", get_all_dtypes(no_bool=True, no_complex=True)
     )
@@ -564,7 +562,7 @@ def test_op_with_scalar(array, val, func, data_type, val_type):
             pytest.skip(
                 "(0j ** 0) is different: (NaN + NaNj) in dpnp and (1 + 0j) in numpy"
             )
-        # TODO: Remove when #1378 (dpctl) is solved
+        # TODO: Remove when #1378 (dpctl) is solved and 2024.1 is released (coverage is failing otherwise)
         elif (
             is_cpu_device()
             and dpnp_a.dtype == dpnp.complex128
@@ -1002,7 +1000,7 @@ def test_power(array, val, data_type, val_type):
     dpnp_a = dpnp.array(array, dtype=data_type)
     val_ = val_type(val)
 
-    # TODO: Remove when #1378 (dpctl) is solved
+    # TODO: Remove when #1378 (dpctl) is solved and 2024.1 is released (coverage is failing otherwise)
     if (
         is_cpu_device()
         and (
@@ -2306,12 +2304,6 @@ def test_complex_values(self):
         dp_arr = dpnp.array(np_arr)
         func = lambda x: x**2
 
-        # TODO: unmute the test once it's available
-        if is_win_platform():
-            pytest.skip(
-                "Until the latest dpctl is available on internal channel"
-            )
-
         assert_dtype_allclose(func(dp_arr), func(np_arr))
 
     @pytest.mark.parametrize("val", [0, 1], ids=["0", "1"])
@@ -2696,9 +2688,6 @@ def test_matmul_dtype_matrix_inputs(self, dtype1, dtype2, shape_pair):
             with pytest.raises(TypeError):
                 dpnp.matmul(b1, b2, dtype=dtype2)
 
-    # TODO: Temporary skipping the test, until Internal CI is updated with
-    # recent changed in dpctl regarding dpt.result_type function
-    @pytest.mark.skip("Temporary skipping the test")
     @pytest.mark.parametrize("dtype1", get_all_dtypes(no_bool=True))
     @pytest.mark.parametrize("dtype2", get_all_dtypes(no_bool=True))
     @pytest.mark.parametrize(
@@ -2846,9 +2835,6 @@ def test_exe_q(self):
         with pytest.raises(ValueError):
             dpnp.matmul(x1, x2)
 
-    # TODO: Temporary skipping the test, until Internal CI is updated with
-    # recent changed in dpctl regarding dpt.result_type function
-    @pytest.mark.skip("Temporary skipping the test")
     def test_matmul_casting(self):
         a1 = dpnp.arange(2 * 4, dtype=dpnp.float32).reshape(2, 4)
         a2 = dpnp.arange(4 * 3).reshape(4, 3)

From c7770fd63036b810849ec7807fa20f63e2821f1e Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 2 Feb 2024 14:21:14 +0100
Subject: [PATCH 02/29] Update dpnp.linalg.svd() function (#1604)

* Draft commit of dpnp.linalg.svd impl

* Pass empty arrays if compute_uv=False

* Add logic for the input array n < m

* Add a new cupy test_decomposition

* Rename gesvd input parameters

* Correspondence of passed parameters to gesvd signature

* Correct initialization of result variables in dpnp_svd

* Update test_decomposition

* Add implementation of _dpnp_svd_batch

* Add test_decomposition to the scope of public CI

* Improve error handling for mkl_lapack::gesvd function

* Declare detail variable

* Use a_usm_type and a_sycl_queue variables

* Add additional checks for gesvd function

* Remove old dpnp_svd backend

* Refresh test_svd in test_linalg

* Add detailed comments for gesvd arguments

* gesvd returns pair of events and uses dpctl.utils.keep_args_alive

* Keep a lexicographical order

* Update docstrings for svd

* Add test_svd to test_usm_type

* Add a new impl to get s_type

* Add a description for _stacked_identity

* Simplify dpnp_svd_batch

* Update tests for dpnp.linalg.svd

* Add hermitian argument support

* Add test_svd_hermitian

* Update svd docstrings

* Tune tolerance

* Update test_svd_errors

* Update _common_type and _common_inexact_type

* Remove passing n and m parameters to _gesvd

* Simplify results return logic for dpnp_svd_batch

* Update condition and random files in cupy/testing to use fix_random and repeat decorators

* Rename cupy/testing/condition.py to .../_condition.py

* Use self._tol in TestSvd

* Update gesvd error handler

* dpnp_svd works with F contiguous arrays

* Add additional checks for output arrays

* Impl parallel calculation in dpnp_svd_batch

* Skip using @_condition.repeat in cupy tests

* Add additional checks for output arrays

* Update docstrings for svd

* Use dpctl.SyclEvent.wait_for in dpnp_svd_batch

* Add TODO : matching the order of returned arrays

* Skip cupy tests on windows

* Rename condition to _condition

* Set setUpClass to skip cupy tests on cpu
---
 dpnp/backend/extensions/lapack/CMakeLists.txt |   1 +
 dpnp/backend/extensions/lapack/gesvd.cpp      | 359 +++++++++++++
 dpnp/backend/extensions/lapack/gesvd.hpp      |  55 ++
 dpnp/backend/extensions/lapack/lapack_py.cpp  |   9 +
 .../extensions/lapack/types_matrix.hpp        |  22 +
 dpnp/backend/include/dpnp_iface_fptr.hpp      |   2 -
 dpnp/backend/kernels/dpnp_krnl_linalg.cpp     |  44 --
 dpnp/dpnp_algo/dpnp_algo.pxd                  |   2 -
 dpnp/linalg/dpnp_algo_linalg.pyx              |  55 --
 dpnp/linalg/dpnp_iface_linalg.py              |  74 ++-
 dpnp/linalg/dpnp_utils_linalg.py              | 481 +++++++++++++++---
 tests/test_linalg.py                          | 205 +++++---
 tests/test_sycl_queue.py                      |  91 ++--
 tests/test_usm_type.py                        |  50 ++
 .../cupy/linalg_tests/test_decomposition.py   | 250 +++++++++
 .../cupy/linalg_tests/test_solve.py           |   4 +-
 .../cupy/random_tests/test_sample.py          |  24 +-
 tests/third_party/cupy/testing/__init__.py    |   4 +-
 .../testing/{condition.py => _condition.py}   |   2 +-
 tests/third_party/cupy/testing/random.py      |  17 +-
 20 files changed, 1425 insertions(+), 326 deletions(-)
 create mode 100644 dpnp/backend/extensions/lapack/gesvd.cpp
 create mode 100644 dpnp/backend/extensions/lapack/gesvd.hpp
 rename tests/third_party/cupy/testing/{condition.py => _condition.py} (98%)

diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt
index 626615e3e53..28fa2072d7d 100644
--- a/dpnp/backend/extensions/lapack/CMakeLists.txt
+++ b/dpnp/backend/extensions/lapack/CMakeLists.txt
@@ -28,6 +28,7 @@ set(python_module_name _lapack_impl)
 set(_module_src
     ${CMAKE_CURRENT_SOURCE_DIR}/lapack_py.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/gesv.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/gesvd.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/getrf.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/getrf_batch.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/getri_batch.cpp
diff --git a/dpnp/backend/extensions/lapack/gesvd.cpp b/dpnp/backend/extensions/lapack/gesvd.cpp
new file mode 100644
index 00000000000..27734f4492b
--- /dev/null
+++ b/dpnp/backend/extensions/lapack/gesvd.cpp
@@ -0,0 +1,359 @@
+//*****************************************************************************
+// Copyright (c) 2023, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "gesvd.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace lapack
+{
+namespace mkl_lapack = oneapi::mkl::lapack;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+typedef sycl::event (*gesvd_impl_fn_ptr_t)(sycl::queue,
+                                           const oneapi::mkl::jobsvd,
+                                           const oneapi::mkl::jobsvd,
+                                           const std::int64_t,
+                                           const std::int64_t,
+                                           char *,
+                                           const std::int64_t,
+                                           char *,
+                                           char *,
+                                           const std::int64_t,
+                                           char *,
+                                           const std::int64_t,
+                                           std::vector<sycl::event> &,
+                                           const std::vector<sycl::event> &);
+
+static gesvd_impl_fn_ptr_t gesvd_dispatch_table[dpctl_td_ns::num_types]
+                                               [dpctl_td_ns::num_types];
+
+// Converts a given character code (ord) to the corresponding
+// oneapi::mkl::jobsvd enumeration value
+static oneapi::mkl::jobsvd process_job(std::int8_t job_val)
+{
+    switch (job_val) {
+    case 'A':
+        return oneapi::mkl::jobsvd::vectors;
+    case 'S':
+        return oneapi::mkl::jobsvd::somevec;
+    case 'O':
+        return oneapi::mkl::jobsvd::vectorsina;
+    case 'N':
+        return oneapi::mkl::jobsvd::novec;
+    default:
+        throw std::invalid_argument("Unknown value for job");
+    }
+}
+
+template <typename T, typename RealT>
+static sycl::event gesvd_impl(sycl::queue exec_q,
+                              const oneapi::mkl::jobsvd jobu,
+                              const oneapi::mkl::jobsvd jobvt,
+                              const std::int64_t m,
+                              const std::int64_t n,
+                              char *in_a,
+                              const std::int64_t lda,
+                              char *out_s,
+                              char *out_u,
+                              const std::int64_t ldu,
+                              char *out_vt,
+                              const std::int64_t ldvt,
+                              std::vector<sycl::event> &host_task_events,
+                              const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<T>(exec_q);
+    type_utils::validate_type_for_device<RealT>(exec_q);
+
+    T *a = reinterpret_cast<T *>(in_a);
+    RealT *s = reinterpret_cast<RealT *>(out_s);
+    T *u = reinterpret_cast<T *>(out_u);
+    T *vt = reinterpret_cast<T *>(out_vt);
+
+    const std::int64_t scratchpad_size = mkl_lapack::gesvd_scratchpad_size<T>(
+        exec_q, jobu, jobvt, m, n, lda, ldu, ldvt);
+    T *scratchpad = nullptr;
+
+    std::stringstream error_msg;
+    std::int64_t info = 0;
+    bool is_exception_caught = false;
+
+    sycl::event gesvd_event;
+    try {
+        scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
+
+        gesvd_event = mkl_lapack::gesvd(
+            exec_q,
+            jobu,  // Character specifying how to compute the matrix U:
+                   // 'A' computes all columns of U,
+                   // 'S' computes the first min(m,n) columns of U,
+                   // 'O' overwrites A with the columns of U,
+                   // 'N' does not compute U.
+            jobvt, // Character specifying how to compute the matrix VT:
+                   // 'A' computes all rows of VT,
+                   // 'S' computes the first min(m,n) rows of VT,
+                   // 'O' overwrites A with the rows of VT,
+                   // 'N' does not compute VT.
+            m,     // The number of rows in the input matrix A (0 <= m).
+            n,     // The number of columns in the input matrix A (0 <= n).
+            a,     // Pointer to the input matrix A of size (m x n).
+            lda,   // The leading dimension of A, must be at least max(1, m).
+            s,     // Pointer to the array containing the singular values.
+            u,   // Pointer to the matrix U in the singular value decomposition.
+            ldu, // The leading dimension of U, must be at least max(1, m).
+            vt, // Pointer to the matrix VT in the singular value decomposition.
+            ldvt, // The leading dimension of VT, must be at least max(1, n).
+            scratchpad, // Pointer to scratchpad memory to be used by MKL
+                        // routine for storing intermediate results.
+            scratchpad_size, depends);
+    } catch (mkl_lapack::exception const &e) {
+        is_exception_caught = true;
+        info = e.info();
+        if (info < 0) {
+            error_msg << "Parameter number " << -info
+                      << " had an illegal value.";
+        }
+        else if (info == scratchpad_size && e.detail() != 0) {
+            error_msg
+                << "Insufficient scratchpad size. Required size is at least "
+                << e.detail();
+        }
+        else if (info > 0) {
+            error_msg << "The algorithm computing SVD failed to converge; "
+                      << info << " off-diagonal elements of an intermediate "
+                      << "bidiagonal form did not converge to zero.\n";
+        }
+        else {
+            error_msg << "Unexpected MKL exception caught during gesvd() "
+                         "call:\nreason: "
+                      << e.what() << "\ninfo: " << e.info();
+        }
+    } catch (sycl::exception const &e) {
+        is_exception_caught = true;
+        error_msg << "Unexpected SYCL exception caught during gesvd() call:\n"
+                  << e.what();
+    }
+
+    if (is_exception_caught) // an unexpected error occurs
+    {
+        if (scratchpad != nullptr) {
+            sycl::free(scratchpad, exec_q);
+        }
+        throw std::runtime_error(error_msg.str());
+    }
+
+    sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(gesvd_event);
+        auto ctx = exec_q.get_context();
+        cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); });
+    });
+    host_task_events.push_back(clean_up_event);
+    return gesvd_event;
+}
+
+std::pair<sycl::event, sycl::event>
+    gesvd(sycl::queue exec_q,
+          const std::int8_t jobu_val,
+          const std::int8_t jobvt_val,
+          dpctl::tensor::usm_ndarray a_array,
+          dpctl::tensor::usm_ndarray out_s,
+          dpctl::tensor::usm_ndarray out_u,
+          dpctl::tensor::usm_ndarray out_vt,
+          const std::vector<sycl::event> &depends)
+{
+    const int a_array_nd = a_array.get_ndim();
+    const int out_u_array_nd = out_u.get_ndim();
+    const int out_s_array_nd = out_s.get_ndim();
+    const int out_vt_array_nd = out_vt.get_ndim();
+
+    if (a_array_nd != 2) {
+        throw py::value_error(
+            "The input array has ndim=" + std::to_string(a_array_nd) +
+            ", but a 2-dimensional array is expected.");
+    }
+
+    if (out_s_array_nd != 1) {
+        throw py::value_error("The output array of singular values has ndim=" +
+                              std::to_string(out_s_array_nd) +
+                              ", but a 1-dimensional array is expected.");
+    }
+
+    if (jobu_val == 'N' && jobvt_val == 'N') {
+        if (out_u_array_nd != 0) {
+            throw py::value_error(
+                "The output array of the left singular vectors has ndim=" +
+                std::to_string(out_u_array_nd) +
+                ", but it is not used and should have ndim=0.");
+        }
+        if (out_vt_array_nd != 0) {
+            throw py::value_error(
+                "The output array of the right singular vectors has ndim=" +
+                std::to_string(out_vt_array_nd) +
+                ", but it is not used and should have ndim=0.");
+        }
+    }
+    else {
+        if (out_u_array_nd != 2) {
+            throw py::value_error(
+                "The output array of the left singular vectors has ndim=" +
+                std::to_string(out_u_array_nd) +
+                ", but a 2-dimensional array is expected.");
+        }
+        if (out_vt_array_nd != 2) {
+            throw py::value_error(
+                "The output array of the right singular vectors has ndim=" +
+                std::to_string(out_vt_array_nd) +
+                ", but a 2-dimensional array is expected.");
+        }
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(
+            exec_q, {a_array.get_queue(), out_s.get_queue(), out_u.get_queue(),
+                     out_vt.get_queue()}))
+    {
+        throw std::runtime_error(
+            "USM allocations are not compatible with the execution queue.");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(a_array, out_s) || overlap(a_array, out_u) ||
+        overlap(a_array, out_vt) || overlap(out_s, out_u) ||
+        overlap(out_s, out_vt) || overlap(out_u, out_vt))
+    {
+        throw py::value_error("Arrays have overlapping segments of memory");
+    }
+
+    bool is_a_array_f_contig = a_array.is_f_contiguous();
+    if (!is_a_array_f_contig) {
+        throw py::value_error("The input array must be F-contiguous");
+    }
+
+    bool is_out_u_array_f_contig = out_u.is_f_contiguous();
+    bool is_out_vt_array_f_contig = out_vt.is_f_contiguous();
+
+    if (!is_out_u_array_f_contig || !is_out_vt_array_f_contig) {
+        throw py::value_error("The output arrays of the left and right "
+                              "singular vectors must be F-contiguous");
+    }
+
+    bool is_out_s_array_c_contig = out_s.is_c_contiguous();
+    bool is_out_s_array_f_contig = out_s.is_f_contiguous();
+
+    if (!is_out_s_array_c_contig || !is_out_s_array_f_contig) {
+        throw py::value_error("The output array of singular values "
+                              "must be contiguous");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int a_array_type_id =
+        array_types.typenum_to_lookup_id(a_array.get_typenum());
+    int out_u_type_id = array_types.typenum_to_lookup_id(out_u.get_typenum());
+    int out_s_type_id = array_types.typenum_to_lookup_id(out_s.get_typenum());
+    int out_vt_type_id = array_types.typenum_to_lookup_id(out_vt.get_typenum());
+
+    if (a_array_type_id != out_u_type_id || a_array_type_id != out_vt_type_id) {
+        throw py::type_error(
+            "Input array, output left singular vectors array, "
+            "and output right singular vectors array must have "
+            "the same data type");
+    }
+
+    gesvd_impl_fn_ptr_t gesvd_fn =
+        gesvd_dispatch_table[a_array_type_id][out_s_type_id];
+    if (gesvd_fn == nullptr) {
+        throw py::value_error(
+            "No gesvd implementation is defined for the given pair "
+            "of array type and output singular values type.");
+    }
+
+    char *a_array_data = a_array.get_data();
+    char *out_s_data = out_s.get_data();
+    char *out_u_data = out_u.get_data();
+    char *out_vt_data = out_vt.get_data();
+
+    const py::ssize_t *a_array_shape = a_array.get_shape_raw();
+    const std::int64_t m = a_array_shape[0];
+    const std::int64_t n = a_array_shape[1];
+
+    const std::int64_t lda = std::max<size_t>(1UL, m);
+    const std::int64_t ldu = std::max<size_t>(1UL, m);
+    const std::int64_t ldvt =
+        std::max<std::size_t>(1UL, jobvt_val == 'S' ? (m > n ? n : m) : n);
+
+    const oneapi::mkl::jobsvd jobu = process_job(jobu_val);
+    const oneapi::mkl::jobsvd jobvt = process_job(jobvt_val);
+
+    std::vector<sycl::event> host_task_events;
+    sycl::event gesvd_ev =
+        gesvd_fn(exec_q, jobu, jobvt, m, n, a_array_data, lda, out_s_data,
+                 out_u_data, ldu, out_vt_data, ldvt, host_task_events, depends);
+
+    sycl::event args_ev = dpctl::utils::keep_args_alive(
+        exec_q, {a_array, out_s, out_u, out_vt}, host_task_events);
+
+    return std::make_pair(args_ev, gesvd_ev);
+}
+
+template <typename fnT, typename T, typename RealT>
+struct GesvdContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::GesvdTypePairSupportFactory<T, RealT>::is_defined)
+        {
+            return gesvd_impl<T, RealT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_gesvd_dispatch_table(void)
+{
+    dpctl_td_ns::DispatchTableBuilder<gesvd_impl_fn_ptr_t, GesvdContigFactory,
+                                      dpctl_td_ns::num_types>
+        contig;
+    contig.populate_dispatch_table(gesvd_dispatch_table);
+}
+} // namespace lapack
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/lapack/gesvd.hpp b/dpnp/backend/extensions/lapack/gesvd.hpp
new file mode 100644
index 00000000000..17ebd0edbe7
--- /dev/null
+++ b/dpnp/backend/extensions/lapack/gesvd.hpp
@@ -0,0 +1,55 @@
+//*****************************************************************************
+// Copyright (c) 2023, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <CL/sycl.hpp>
+#include <oneapi/mkl.hpp>
+
+#include <dpctl4pybind11.hpp>
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace lapack
+{
+extern std::pair<sycl::event, sycl::event>
+    gesvd(sycl::queue exec_q,
+          const std::int8_t jobu_val,
+          const std::int8_t jobvt_val,
+          dpctl::tensor::usm_ndarray a_array,
+          dpctl::tensor::usm_ndarray out_s,
+          dpctl::tensor::usm_ndarray out_u,
+          dpctl::tensor::usm_ndarray out_vt,
+          const std::vector<sycl::event> &depends);
+
+extern void init_gesvd_dispatch_table(void);
+} // namespace lapack
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp
index 71991be3652..0c76d0fc096 100644
--- a/dpnp/backend/extensions/lapack/lapack_py.cpp
+++ b/dpnp/backend/extensions/lapack/lapack_py.cpp
@@ -31,6 +31,7 @@
 #include <pybind11/stl.h>
 
 #include "gesv.hpp"
+#include "gesvd.hpp"
 #include "getrf.hpp"
 #include "getri.hpp"
 #include "heevd.hpp"
@@ -56,6 +57,7 @@ void init_dispatch_vectors(void)
 // populate dispatch tables
 void init_dispatch_tables(void)
 {
+    lapack_ext::init_gesvd_dispatch_table();
     lapack_ext::init_heevd_dispatch_table();
 }
 
@@ -76,6 +78,13 @@ PYBIND11_MODULE(_lapack_impl, m)
           py::arg("sycl_queue"), py::arg("coeff_matrix"),
           py::arg("dependent_vals"), py::arg("depends") = py::list());
 
+    m.def("_gesvd", &lapack_ext::gesvd,
+          "Call `gesvd` from OneMKL LAPACK library to return "
+          "the singular value decomposition of a general rectangular matrix",
+          py::arg("sycl_queue"), py::arg("jobu_val"), py::arg("jobvt_val"),
+          py::arg("a_array"), py::arg("res_s"), py::arg("res_u"),
+          py::arg("res_vt"), py::arg("depends") = py::list());
+
     m.def("_getrf", &lapack_ext::getrf,
           "Call `getrf` from OneMKL LAPACK library to return "
           "the LU factorization of a general n x n matrix",
diff --git a/dpnp/backend/extensions/lapack/types_matrix.hpp b/dpnp/backend/extensions/lapack/types_matrix.hpp
index 7e5413b84c8..893619e6afb 100644
--- a/dpnp/backend/extensions/lapack/types_matrix.hpp
+++ b/dpnp/backend/extensions/lapack/types_matrix.hpp
@@ -70,6 +70,28 @@ struct GesvTypePairSupportFactory
         dpctl_td_ns::NotDefinedEntry>::is_defined;
 };
 
+/**
+ * @brief A factory to define pairs of supported types for which
+ * MKL LAPACK library provides support in oneapi::mkl::lapack::gesvd<T, RealT>
+ * function.
+ *
+ * @tparam T Type of array containing input matrix A and output matrices U and
+ * VT of singular vectors.
+ * @tparam RealT Type of output array containing singular values of A.
+ */
+template <typename T, typename RealT>
+struct GesvdTypePairSupportFactory
+{
+    static constexpr bool is_defined = std::disjunction<
+        dpctl_td_ns::TypePairDefinedEntry<T, float, RealT, float>,
+        dpctl_td_ns::TypePairDefinedEntry<T, double, RealT, double>,
+        dpctl_td_ns::TypePairDefinedEntry<T, std::complex<float>, RealT, float>,
+        dpctl_td_ns::
+            TypePairDefinedEntry<T, std::complex<double>, RealT, double>,
+        // fall-through
+        dpctl_td_ns::NotDefinedEntry>::is_defined;
+};
+
 /**
  * @brief A factory to define pairs of supported types for which
  * MKL LAPACK library provides support in oneapi::mkl::lapack::getrf<T>
diff --git a/dpnp/backend/include/dpnp_iface_fptr.hpp b/dpnp/backend/include/dpnp_iface_fptr.hpp
index 2e2ce5ab144..3061bb01f29 100644
--- a/dpnp/backend/include/dpnp_iface_fptr.hpp
+++ b/dpnp/backend/include/dpnp_iface_fptr.hpp
@@ -363,8 +363,6 @@ enum class DPNPFuncName : size_t
                              parameters */
     DPNP_FN_SUM,          /**< Used in numpy.sum() impl  */
     DPNP_FN_SVD,          /**< Used in numpy.linalg.svd() impl  */
-    DPNP_FN_SVD_EXT,      /**< Used in numpy.linalg.svd() impl, requires extra
-                             parameters */
     DPNP_FN_TAKE,         /**< Used in numpy.take() impl  */
     DPNP_FN_TAN,          /**< Used in numpy.tan() impl  */
     DPNP_FN_TANH,         /**< Used in numpy.tanh() impl  */
diff --git a/dpnp/backend/kernels/dpnp_krnl_linalg.cpp b/dpnp/backend/kernels/dpnp_krnl_linalg.cpp
index e0b6de5b1b6..610da8fda3c 100644
--- a/dpnp/backend/kernels/dpnp_krnl_linalg.cpp
+++ b/dpnp/backend/kernels/dpnp_krnl_linalg.cpp
@@ -824,17 +824,6 @@ template <typename _InputDT, typename _ComputeDT, typename _SVDT>
 void (*dpnp_svd_default_c)(void *, void *, void *, void *, size_t, size_t) =
     dpnp_svd_c<_InputDT, _ComputeDT, _SVDT>;
 
-template <typename _InputDT, typename _ComputeDT, typename _SVDT>
-DPCTLSyclEventRef (*dpnp_svd_ext_c)(DPCTLSyclQueueRef,
-                                    void *,
-                                    void *,
-                                    void *,
-                                    void *,
-                                    size_t,
-                                    size_t,
-                                    const DPCTLEventVectorRef) =
-    dpnp_svd_c<_InputDT, _ComputeDT, _SVDT>;
-
 void func_map_init_linalg_func(func_map_t &fmap)
 {
     fmap[DPNPFuncName::DPNP_FN_CHOLESKY][eft_FLT][eft_FLT] = {
@@ -1046,38 +1035,5 @@ void func_map_init_linalg_func(func_map_t &fmap)
         eft_C128, (void *)dpnp_svd_default_c<std::complex<double>,
                                              std::complex<double>, double>};
 
-    fmap[DPNPFuncName::DPNP_FN_SVD_EXT][eft_INT][eft_INT] = {
-        get_default_floating_type(),
-        (void *)dpnp_svd_ext_c<
-            int32_t, func_type_map_t::find_type<get_default_floating_type()>,
-            func_type_map_t::find_type<get_default_floating_type()>>,
-        get_default_floating_type<std::false_type>(),
-        (void *)
-            dpnp_svd_ext_c<int32_t,
-                           func_type_map_t::find_type<
-                               get_default_floating_type<std::false_type>()>,
-                           func_type_map_t::find_type<
-                               get_default_floating_type<std::false_type>()>>};
-    fmap[DPNPFuncName::DPNP_FN_SVD_EXT][eft_LNG][eft_LNG] = {
-        get_default_floating_type(),
-        (void *)dpnp_svd_ext_c<
-            int64_t, func_type_map_t::find_type<get_default_floating_type()>,
-            func_type_map_t::find_type<get_default_floating_type()>>,
-        get_default_floating_type<std::false_type>(),
-        (void *)
-            dpnp_svd_ext_c<int64_t,
-                           func_type_map_t::find_type<
-                               get_default_floating_type<std::false_type>()>,
-                           func_type_map_t::find_type<
-                               get_default_floating_type<std::false_type>()>>};
-    fmap[DPNPFuncName::DPNP_FN_SVD_EXT][eft_FLT][eft_FLT] = {
-        eft_FLT, (void *)dpnp_svd_ext_c<float, float, float>};
-    fmap[DPNPFuncName::DPNP_FN_SVD_EXT][eft_DBL][eft_DBL] = {
-        eft_DBL, (void *)dpnp_svd_ext_c<double, double, double>};
-    fmap[DPNPFuncName::DPNP_FN_SVD_EXT][eft_C128][eft_C128] = {
-        eft_C128,
-        (void *)
-            dpnp_svd_ext_c<std::complex<double>, std::complex<double>, double>};
-
     return;
 }
diff --git a/dpnp/dpnp_algo/dpnp_algo.pxd b/dpnp/dpnp_algo/dpnp_algo.pxd
index 895b393aeff..28e21340647 100644
--- a/dpnp/dpnp_algo/dpnp_algo.pxd
+++ b/dpnp/dpnp_algo/dpnp_algo.pxd
@@ -171,8 +171,6 @@ cdef extern from "dpnp_iface_fptr.hpp" namespace "DPNPFuncName":  # need this na
         DPNP_FN_RNG_ZIPF_EXT
         DPNP_FN_SEARCHSORTED
         DPNP_FN_SEARCHSORTED_EXT
-        DPNP_FN_SVD
-        DPNP_FN_SVD_EXT
         DPNP_FN_TRACE
         DPNP_FN_TRACE_EXT
         DPNP_FN_TRANSPOSE
diff --git a/dpnp/linalg/dpnp_algo_linalg.pyx b/dpnp/linalg/dpnp_algo_linalg.pyx
index 1d94a893fff..3bf6dad3ee8 100644
--- a/dpnp/linalg/dpnp_algo_linalg.pyx
+++ b/dpnp/linalg/dpnp_algo_linalg.pyx
@@ -51,7 +51,6 @@ __all__ = [
     "dpnp_matrix_rank",
     "dpnp_norm",
     "dpnp_qr",
-    "dpnp_svd",
 ]
 
 
@@ -379,57 +378,3 @@ cpdef tuple dpnp_qr(utils.dpnp_descriptor x1, str mode):
     c_dpctl.DPCTLEvent_Delete(event_ref)
 
     return (res_q.get_pyobj(), res_r.get_pyobj())
-
-
-cpdef tuple dpnp_svd(utils.dpnp_descriptor x1, cpp_bool full_matrices, cpp_bool compute_uv, cpp_bool hermitian):
-    cdef size_t size_m = x1.shape[0]
-    cdef size_t size_n = x1.shape[1]
-    cdef size_t size_s = min(size_m, size_n)
-
-    cdef DPNPFuncType param1_type = dpnp_dtype_to_DPNPFuncType(x1.dtype)
-    cdef DPNPFuncData kernel_data = get_dpnp_function_ptr(DPNP_FN_SVD_EXT, param1_type, param1_type)
-
-    x1_obj = x1.get_array()
-
-    cdef (DPNPFuncType, void *) ret_type_and_func = utils.get_ret_type_and_func(kernel_data,
-                                                                                x1_obj.sycl_device.has_aspect_fp64)
-    cdef DPNPFuncType return_type = ret_type_and_func[0]
-    cdef custom_linalg_1in_3out_shape_t func = < custom_linalg_1in_3out_shape_t > ret_type_and_func[1]
-
-    cdef utils.dpnp_descriptor res_u = utils.create_output_descriptor((size_m, size_m),
-                                                                       return_type,
-                                                                       None,
-                                                                       device=x1_obj.sycl_device,
-                                                                       usm_type=x1_obj.usm_type,
-                                                                       sycl_queue=x1_obj.sycl_queue)
-    cdef utils.dpnp_descriptor res_s = utils.create_output_descriptor((size_s, ),
-                                                                       return_type,
-                                                                       None,
-                                                                       device=x1_obj.sycl_device,
-                                                                       usm_type=x1_obj.usm_type,
-                                                                       sycl_queue=x1_obj.sycl_queue)
-    cdef utils.dpnp_descriptor res_vt = utils.create_output_descriptor((size_n, size_n),
-                                                                       return_type,
-                                                                       None,
-                                                                       device=x1_obj.sycl_device,
-                                                                       usm_type=x1_obj.usm_type,
-                                                                       sycl_queue=x1_obj.sycl_queue)
-
-    result_sycl_queue = res_u.get_array().sycl_queue
-
-    cdef c_dpctl.SyclQueue q = <c_dpctl.SyclQueue> result_sycl_queue
-    cdef c_dpctl.DPCTLSyclQueueRef q_ref = q.get_queue_ref()
-
-    cdef c_dpctl.DPCTLSyclEventRef event_ref = func(q_ref,
-                                                    x1.get_data(),
-                                                    res_u.get_data(),
-                                                    res_s.get_data(),
-                                                    res_vt.get_data(),
-                                                    size_m,
-                                                    size_n,
-                                                    NULL)  # dep_events_ref
-
-    with nogil: c_dpctl.DPCTLEvent_WaitAndThrow(event_ref)
-    c_dpctl.DPCTLEvent_Delete(event_ref)
-
-    return (res_u.get_pyobj(), res_s.get_pyobj(), res_vt.get_pyobj())
diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py
index 800aa8de1bb..2b8506130ad 100644
--- a/dpnp/linalg/dpnp_iface_linalg.py
+++ b/dpnp/linalg/dpnp_iface_linalg.py
@@ -53,6 +53,7 @@
     dpnp_inv,
     dpnp_slogdet,
     dpnp_solve,
+    dpnp_svd,
 )
 
 __all__ = [
@@ -611,12 +612,47 @@ def solve(a, b):
     return dpnp_solve(a, b)
 
 
-def svd(x1, full_matrices=True, compute_uv=True, hermitian=False):
+def svd(a, full_matrices=True, compute_uv=True, hermitian=False):
     """
     Singular Value Decomposition.
 
     For full documentation refer to :obj:`numpy.linalg.svd`.
 
+    Parameters
+    ----------
+    a : (..., M, N) {dpnp.ndarray, usm_ndarray}
+        Input array with ``a.ndim >= 2``.
+    full_matrices : bool, optional
+        If ``True``, it returns `u` and `Vh` with full-sized matrices.
+        If ``False``, the matrices are reduced in size.
+        Default: ``True``.
+    compute_uv : bool, optional
+        If ``False``, it only returns singular values.
+        Default: ``True``.
+    hermitian : bool, optional
+        If ``True``, `a` is assumed to be Hermitian (symmetric if real-valued),
+        enabling a more efficient method for finding singular values.
+        Default: ``False``.
+
+    Returns
+    -------
+    u : { (…, M, M), (…, M, K) } dpnp.ndarray
+        Unitary matrix, where M is the number of rows of the input array `a`.
+        The shape of the matrix `u` depends on the value of `full_matrices`.
+        If `full_matrices` is ``True``, `u` has the shape (…, M, M).
+        If `full_matrices` is ``False``, `u` has the shape (…, M, K),
+        where K = min(M, N), and N is the number of columns of the input array `a`.
+        If `compute_uv` is ``False``, neither `u` nor `Vh` is computed.
+    s : (…, K) dpnp.ndarray
+        Vector containing the singular values of `a`, sorted in descending order.
+        The length of `s` is min(M, N).
+    Vh : { (…, N, N), (…, K, N) } dpnp.ndarray
+        Unitary matrix, where N is the number of columns of the input array `a`.
+        The shape of the matrix `Vh` depends on the value of `full_matrices`.
+        If `full_matrices` is ``True``, `Vh` has the shape (…, N, N).
+        If `full_matrices` is ``False``, `Vh` has the shape (…, K, N).
+        If `compute_uv` is ``False``, neither `u` nor `Vh` is computed.
+
     Examples
     --------
     >>> import dpnp as np
@@ -629,11 +665,11 @@ def svd(x1, full_matrices=True, compute_uv=True, hermitian=False):
     >>> u.shape, s.shape, vh.shape
     ((9, 9), (6,), (6, 6))
     >>> np.allclose(a, np.dot(u[:, :6] * s, vh))
-    True
+    array([ True])
     >>> smat = np.zeros((9, 6), dtype=complex)
     >>> smat[:6, :6] = np.diag(s)
     >>> np.allclose(a, np.dot(u, np.dot(smat, vh)))
-    True
+    array([ True])
 
     Reconstruction based on reduced SVD, 2D case:
 
@@ -641,10 +677,10 @@ def svd(x1, full_matrices=True, compute_uv=True, hermitian=False):
     >>> u.shape, s.shape, vh.shape
     ((9, 6), (6,), (6, 6))
     >>> np.allclose(a, np.dot(u * s, vh))
-    True
+    array([ True])
     >>> smat = np.diag(s)
     >>> np.allclose(a, np.dot(u, np.dot(smat, vh)))
-    True
+    array([ True])
 
     Reconstruction based on full SVD, 4D case:
 
@@ -652,9 +688,9 @@ def svd(x1, full_matrices=True, compute_uv=True, hermitian=False):
     >>> u.shape, s.shape, vh.shape
     ((2, 7, 8, 8), (2, 7, 3), (2, 7, 3, 3))
     >>> np.allclose(b, np.matmul(u[..., :3] * s[..., None, :], vh))
-    True
+    array([ True])
     >>> np.allclose(b, np.matmul(u[..., :3], s[..., None] * vh))
-    True
+    array([ True])
 
     Reconstruction based on reduced SVD, 4D case:
 
@@ -662,30 +698,16 @@ def svd(x1, full_matrices=True, compute_uv=True, hermitian=False):
     >>> u.shape, s.shape, vh.shape
     ((2, 7, 8, 3), (2, 7, 3), (2, 7, 3, 3))
     >>> np.allclose(b, np.matmul(u * s[..., None, :], vh))
-    True
+    array([ True])
     >>> np.allclose(b, np.matmul(u, s[..., None] * vh))
-    True
+    array([ True])
 
     """
 
-    x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_nondefault_queue=False)
-    if x1_desc:
-        if not x1_desc.ndim == 2:
-            pass
-        elif full_matrices is not True:
-            pass
-        elif compute_uv is not True:
-            pass
-        elif hermitian is not False:
-            pass
-        else:
-            result_tup = dpnp_svd(x1_desc, full_matrices, compute_uv, hermitian)
-
-            return result_tup
+    dpnp.check_supported_arrays_type(a)
+    check_stacked_2d(a)
 
-    return call_origin(
-        numpy.linalg.svd, x1, full_matrices, compute_uv, hermitian
-    )
+    return dpnp_svd(a, full_matrices, compute_uv, hermitian)
 
 
 def slogdet(a):
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index f2632b5b6a4..93f41883133 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -24,8 +24,9 @@
 # *****************************************************************************
 
 
+import dpctl
 import dpctl.tensor._tensor_impl as ti
-from numpy import issubdtype
+from numpy import prod
 
 import dpnp
 import dpnp.backend.extensions.lapack._lapack_impl as li
@@ -40,6 +41,7 @@
     "dpnp_inv",
     "dpnp_slogdet",
     "dpnp_solve",
+    "dpnp_svd",
 ]
 
 _jobz = {"N": 0, "V": 1}
@@ -147,76 +149,6 @@ def _real_type(dtype, device=None):
     return dpnp.dtype(real_type)
 
 
-def check_stacked_2d(*arrays):
-    """
-    Return ``True`` if each array in `arrays` has at least two dimensions.
-
-    If any array is less than two-dimensional, `dpnp.linalg.LinAlgError` will be raised.
-
-    Parameters
-    ----------
-    arrays : {dpnp.ndarray, usm_ndarray}
-        A sequence of input arrays to check for dimensionality.
-
-    Returns
-    -------
-    out : bool
-        ``True`` if each array in `arrays` is at least two-dimensional.
-
-    Raises
-    ------
-    dpnp.linalg.LinAlgError
-        If any array in `arrays` is less than two-dimensional.
-
-    """
-
-    for a in arrays:
-        if a.ndim < 2:
-            raise dpnp.linalg.LinAlgError(
-                f"{a.ndim}-dimensional array given. The input "
-                "array must be at least two-dimensional"
-            )
-
-
-def check_stacked_square(*arrays):
-    """
-    Return ``True`` if each array in `arrays` is a square matrix.
-
-    If any array does not form a square matrix, `dpnp.linalg.LinAlgError` will be raised.
-
-    Precondition: `arrays` are at least 2d. The caller should assert it
-    beforehand. For example,
-
-    >>> def solve(a):
-    ...     check_stacked_2d(a)
-    ...     check_stacked_square(a)
-    ...     ...
-
-    Parameters
-    ----------
-    arrays : {dpnp.ndarray, usm_ndarray}
-        A sequence of input arrays to check for square matrix shape.
-
-    Returns
-    -------
-    out : bool
-        ``True`` if each array in `arrays` forms a square matrix.
-
-    Raises
-    ------
-    dpnp.linalg.LinAlgError
-        If any array in `arrays` does not form a square matrix.
-
-    """
-
-    for a in arrays:
-        m, n = a.shape[-2:]
-        if m != n:
-            raise dpnp.linalg.LinAlgError(
-                "Last 2 dimensions of the input array must be square"
-            )
-
-
 def _common_type(*arrays):
     """
     Common type for linear algebra operations.
@@ -245,7 +177,8 @@ def _common_type(*arrays):
 
     dtypes = [arr.dtype for arr in arrays]
 
-    default = dpnp.default_float_type(device=arrays[0].device)
+    _, sycl_queue = get_usm_allocations(arrays)
+    default = dpnp.default_float_type(sycl_queue=sycl_queue)
     dtype_common = _common_inexact_type(default, *dtypes)
 
     return dtype_common
@@ -275,7 +208,8 @@ def _common_inexact_type(default_dtype, *dtypes):
     """
 
     inexact_dtypes = [
-        dt if issubdtype(dt, dpnp.inexact) else default_dtype for dt in dtypes
+        dt if dpnp.issubdtype(dt, dpnp.inexact) else default_dtype
+        for dt in dtypes
     ]
     return dpnp.result_type(*inexact_dtypes)
 
@@ -469,6 +403,120 @@ def _lu_factor(a, res_type):
         return (a_h, ipiv_h, dev_info_array)
 
 
+def _stacked_identity(
+    batch_shape, n, dtype, usm_type="device", sycl_queue=None
+):
+    """
+    Create stacked identity matrices of size `n x n`.
+
+    The result has shape ``batch_shape + (n, n)`` with ones on each diagonal.
+
+    Parameters
+    ----------
+    batch_shape : tuple
+        Shape of the batch determining the stacking of identity matrices.
+    n : int
+        Dimension of each identity matrix.
+    dtype : dtype
+        Data type of the matrix element.
+    usm_type : {"device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
+    Returns
+    -------
+    out : dpnp.ndarray
+        Array of stacked `n x n` identity matrices as per `batch_shape`.
+
+    Examples
+    --------
+    >>> _stacked_identity((2,), 2, dtype=dpnp.int64)
+    array([[[1, 0],
+            [0, 1]],
+
+           [[1, 0],
+            [0, 1]]])
+
+    """
+
+    shape = batch_shape + (n, n)
+    idx = dpnp.arange(n, usm_type=usm_type, sycl_queue=sycl_queue)
+    x = dpnp.zeros(shape, dtype=dtype, usm_type=usm_type, sycl_queue=sycl_queue)
+    x[..., idx, idx] = 1
+    return x
+
+
+def check_stacked_2d(*arrays):
+    """
+    Check that each array in `arrays` has at least two dimensions.
+
+    If any array is less than two-dimensional, `dpnp.linalg.LinAlgError` will be raised.
+
+    Parameters
+    ----------
+    arrays : {dpnp.ndarray, usm_ndarray}
+        A sequence of input arrays to check for dimensionality.
+
+    Returns
+    -------
+    None
+        Nothing is returned; the arrays are only validated.
+
+    Raises
+    ------
+    dpnp.linalg.LinAlgError
+        If any array in `arrays` is less than two-dimensional.
+
+    """
+
+    for a in arrays:
+        if a.ndim < 2:
+            raise dpnp.linalg.LinAlgError(
+                f"{a.ndim}-dimensional array given. The input "
+                "array must be at least two-dimensional"
+            )
+
+
+def check_stacked_square(*arrays):
+    """
+    Check that the last two dimensions of each array in `arrays` are equal.
+
+    If any array does not form a square matrix, `dpnp.linalg.LinAlgError` will be raised.
+
+    Precondition: `arrays` are at least 2d. The caller should assert it
+    beforehand. For example,
+
+    >>> def solve(a):
+    ...     check_stacked_2d(a)
+    ...     check_stacked_square(a)
+    ...     ...
+
+    Parameters
+    ----------
+    arrays : {dpnp.ndarray, usm_ndarray}
+        A sequence of input arrays to check for square matrix shape.
+
+    Returns
+    -------
+    None
+        Nothing is returned; the arrays are only validated.
+
+    Raises
+    ------
+    dpnp.linalg.LinAlgError
+        If any array in `arrays` does not form a square matrix.
+
+    """
+
+    for a in arrays:
+        m, n = a.shape[-2:]
+        if m != n:
+            raise dpnp.linalg.LinAlgError(
+                "Last 2 dimensions of the input array must be square"
+            )
+
+
 def dpnp_cholesky_batch(a, upper_lower, res_type):
     """
     dpnp_cholesky_batch(a, upper_lower, res_type)
@@ -1088,3 +1136,290 @@ def dpnp_slogdet(a):
         dpnp.where(singular, res_type.type(0), sign).reshape(shape),
         dpnp.where(singular, logdet_dtype.type("-inf"), logdet).reshape(shape),
     )
+
+
+def dpnp_svd_batch(a, uv_type, s_type, full_matrices=True, compute_uv=True):
+    """
+    Return the SVD of each matrix in the stack `a`, called with ``a.ndim > 2``.
+
+    Batch dimensions are flattened, every matrix is passed to `dpnp_svd`, and
+    ``(u, s, vt)`` (only ``s`` if not `compute_uv`) get the batch shape back.
+    """
+
+    a_usm_type = a.usm_type
+    a_sycl_queue = a.sycl_queue
+    reshape = False
+    batch_shape_orig = a.shape[:-2]
+
+    if a.ndim > 3:
+        # collapse all leading (batch) dimensions into one to get a 3d array
+        a = a.reshape(prod(a.shape[:-2]), a.shape[-2], a.shape[-1])
+        reshape = True
+
+    batch_size = a.shape[0]
+    m, n = a.shape[-2:]
+
+    if batch_size == 0:
+        k = min(m, n)
+        s = dpnp.empty(
+            batch_shape_orig + (k,),
+            dtype=s_type,
+            usm_type=a_usm_type,
+            sycl_queue=a_sycl_queue,
+        )
+        if compute_uv:
+            if full_matrices:
+                u_shape = batch_shape_orig + (m, m)
+                vt_shape = batch_shape_orig + (n, n)
+            else:
+                u_shape = batch_shape_orig + (m, k)
+                vt_shape = batch_shape_orig + (k, n)
+
+            u = dpnp.empty(
+                u_shape,
+                dtype=uv_type,
+                usm_type=a_usm_type,
+                sycl_queue=a_sycl_queue,
+            )
+            vt = dpnp.empty(
+                vt_shape,
+                dtype=uv_type,
+                usm_type=a_usm_type,
+                sycl_queue=a_sycl_queue,
+            )
+            return u, s, vt
+        else:
+            return s
+    elif m == 0 or n == 0:
+        s = dpnp.empty(
+            batch_shape_orig + (0,),
+            dtype=s_type,
+            usm_type=a_usm_type,
+            sycl_queue=a_sycl_queue,
+        )
+        if compute_uv:
+            if full_matrices:
+                u = _stacked_identity(
+                    batch_shape_orig,
+                    m,
+                    dtype=uv_type,
+                    usm_type=a_usm_type,
+                    sycl_queue=a_sycl_queue,
+                )
+                vt = _stacked_identity(
+                    batch_shape_orig,
+                    n,
+                    dtype=uv_type,
+                    usm_type=a_usm_type,
+                    sycl_queue=a_sycl_queue,
+                )
+            else:
+                u = dpnp.empty(
+                    batch_shape_orig + (m, 0),
+                    dtype=uv_type,
+                    usm_type=a_usm_type,
+                    sycl_queue=a_sycl_queue,
+                )
+                vt = dpnp.empty(
+                    batch_shape_orig + (0, n),
+                    dtype=uv_type,
+                    usm_type=a_usm_type,
+                    sycl_queue=a_sycl_queue,
+                )
+            return u, s, vt
+        else:
+            return s
+
+    u_matrices = [None] * batch_size
+    s_matrices = [None] * batch_size
+    vt_matrices = [None] * batch_size
+    ht_list_ev = [None] * batch_size * 2
+    for i in range(batch_size):
+        if compute_uv:
+            (
+                u_matrices[i],
+                s_matrices[i],
+                vt_matrices[i],
+                ht_list_ev[2 * i],
+                ht_list_ev[2 * i + 1],
+            ) = dpnp_svd(a[i], full_matrices, compute_uv=True, batch_call=True)
+        else:
+            s_matrices[i], ht_list_ev[2 * i], ht_list_ev[2 * i + 1] = dpnp_svd(
+                a[i], full_matrices, compute_uv=False, batch_call=True
+            )
+
+    dpctl.SyclEvent.wait_for(ht_list_ev)
+
+    # TODO: Need to return C-contiguous array to match the output of numpy.linalg.svd
+    # Allocate 'F' order memory for dpnp output arrays to be aligned with dpnp_svd
+    out_s = dpnp.array(s_matrices, order="F")
+    if reshape:
+        out_s = out_s.reshape(batch_shape_orig + out_s.shape[-1:])
+
+    if compute_uv:
+        out_u = dpnp.array(u_matrices, order="F")
+        out_vt = dpnp.array(vt_matrices, order="F")
+        if reshape:
+            return (
+                out_u.reshape(batch_shape_orig + out_u.shape[-2:]),
+                out_s,
+                out_vt.reshape(batch_shape_orig + out_vt.shape[-2:]),
+            )
+        else:
+            return out_u, out_s, out_vt
+    else:
+        return out_s
+
+
+def dpnp_svd(
+    a, full_matrices=True, compute_uv=True, hermitian=False, batch_call=False
+):
+    """
+    Return the singular value decomposition (SVD) of `a`.
+
+    Input with ``a.ndim > 2`` is delegated to `dpnp_svd_batch`, unless
+    `hermitian` is set; `batch_call=True` also returns the host events.
+    """
+
+    if hermitian:
+        check_stacked_square(a)
+
+        # _gesvd returns eigenvalues with s ** 2 sorted descending,
+        # but dpnp.linalg.eigh returns s sorted ascending so we re-order the eigenvalues
+        # and related arrays to have the correct order
+        if compute_uv:
+            s, u = dpnp.linalg.eigh(a)
+            sgn = dpnp.sign(s)
+            s = dpnp.absolute(s)
+            sidx = dpnp.argsort(s)[..., ::-1]
+            # Rearrange the signs according to sorted indices
+            sgn = dpnp.take_along_axis(sgn, sidx, axis=-1)
+            # Sort the singular values in descending order
+            s = dpnp.take_along_axis(s, sidx, axis=-1)
+            # Rearrange the eigenvectors according to sorted indices
+            u = dpnp.take_along_axis(u, sidx[..., None, :], axis=-1)
+            # Singular values are unsigned, move the sign into v
+            # Compute V^H: swap only the last two axes so stacked input works
+            vt = dpnp.swapaxes(u * sgn[..., None, :], -2, -1).conjugate()
+            return u, s, vt
+        else:
+            # TODO: use dpnp.linalg.eigvalsh when it is updated
+            s, _ = dpnp.linalg.eigh(a)
+            s = dpnp.abs(s)
+            return dpnp.sort(s)[..., ::-1]
+
+    uv_type = _common_type(a)
+    s_type = _real_type(uv_type)
+
+    if a.ndim > 2:
+        return dpnp_svd_batch(a, uv_type, s_type, full_matrices, compute_uv)
+
+    a_usm_type = a.usm_type
+    a_sycl_queue = a.sycl_queue
+    m, n = a.shape
+
+    if m == 0 or n == 0:
+        s = dpnp.empty(
+            (0,),
+            dtype=s_type,
+            usm_type=a_usm_type,
+            sycl_queue=a_sycl_queue,
+        )
+        if compute_uv:
+            if full_matrices:
+                u_shape = (m,)
+                vt_shape = (n,)
+            else:
+                u_shape = (m, 0)
+                vt_shape = (0, n)
+
+            u = dpnp.eye(
+                *u_shape,
+                dtype=uv_type,
+                usm_type=a_usm_type,
+                sycl_queue=a_sycl_queue,
+            )
+            vt = dpnp.eye(
+                *vt_shape,
+                dtype=uv_type,
+                usm_type=a_usm_type,
+                sycl_queue=a_sycl_queue,
+            )
+            return u, s, vt
+        else:
+            return s
+
+    # oneMKL LAPACK gesvd destroys `a` and assumes fortran-like array as input.
+    # Allocate 'F' order memory for dpnp arrays to comply with these requirements.
+    a_h = dpnp.empty_like(a, order="F", dtype=uv_type)
+
+    a_usm_arr = dpnp.get_usm_ndarray(a)
+
+    # use DPCTL tensor function to fill the copy of the input array
+    # from the input array
+    a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr, dst=a_h.get_array(), sycl_queue=a_sycl_queue
+    )
+
+    k = min(m, n)
+    if compute_uv:
+        if full_matrices:
+            u_shape = (m, m)
+            vt_shape = (n, n)
+            jobu = ord("A")
+            jobvt = ord("A")
+        else:
+            u_shape = (m, k)
+            vt_shape = (k, n)
+            jobu = ord("S")
+            jobvt = ord("S")
+    else:
+        u_shape = vt_shape = ()
+        jobu = ord("N")
+        jobvt = ord("N")
+
+    # oneMKL LAPACK assumes fortran-like array as input.
+    # Allocate 'F' order memory for dpnp output arrays to comply with these requirements.
+    u_h = dpnp.empty(
+        u_shape,
+        dtype=uv_type,
+        order="F",
+        usm_type=a_usm_type,
+        sycl_queue=a_sycl_queue,
+    )
+    vt_h = dpnp.empty(
+        vt_shape,
+        dtype=uv_type,
+        order="F",
+        usm_type=a_usm_type,
+        sycl_queue=a_sycl_queue,
+    )
+    s_h = dpnp.empty(
+        k, dtype=s_type, usm_type=a_usm_type, sycl_queue=a_sycl_queue
+    )
+
+    ht_lapack_ev, _ = li._gesvd(
+        a_sycl_queue,
+        jobu,
+        jobvt,
+        a_h.get_array(),
+        s_h.get_array(),
+        u_h.get_array(),
+        vt_h.get_array(),
+        [a_copy_ev],
+    )
+
+    if batch_call:
+        if compute_uv:
+            return u_h, s_h, vt_h, ht_lapack_ev, a_ht_copy_ev
+        else:
+            return s_h, ht_lapack_ev, a_ht_copy_ev
+
+    ht_lapack_ev.wait()
+    a_ht_copy_ev.wait()
+
+    # TODO: Need to return C-contiguous array to match the output of numpy.linalg.svd
+    if compute_uv:
+        return u_h, s_h, vt_h
+    else:
+        return s_h
diff --git a/tests/test_linalg.py b/tests/test_linalg.py
index 5ea536c2887..85206bad5ba 100644
--- a/tests/test_linalg.py
+++ b/tests/test_linalg.py
@@ -9,6 +9,7 @@
 from .helper import (
     assert_dtype_allclose,
     get_all_dtypes,
+    get_complex_dtypes,
     has_support_aspect64,
     is_cpu_device,
 )
@@ -755,64 +756,6 @@ def test_qr_not_2D():
     assert_allclose(ia, inp.matmul(dpnp_q, dpnp_r))
 
 
-@pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True))
-@pytest.mark.parametrize(
-    "shape",
-    [(2, 2), (3, 4), (5, 3), (16, 16)],
-    ids=["(2,2)", "(3,4)", "(5,3)", "(16,16)"],
-)
-def test_svd(type, shape):
-    a = numpy.arange(shape[0] * shape[1], dtype=type).reshape(shape)
-    ia = inp.array(a)
-
-    np_u, np_s, np_vt = numpy.linalg.svd(a)
-    dpnp_u, dpnp_s, dpnp_vt = inp.linalg.svd(ia)
-
-    support_aspect64 = has_support_aspect64()
-
-    if support_aspect64:
-        assert dpnp_u.dtype == np_u.dtype
-        assert dpnp_s.dtype == np_s.dtype
-        assert dpnp_vt.dtype == np_vt.dtype
-    assert dpnp_u.shape == np_u.shape
-    assert dpnp_s.shape == np_s.shape
-    assert dpnp_vt.shape == np_vt.shape
-
-    tol = 1e-12
-    if type == inp.float32:
-        tol = 1e-03
-    elif not support_aspect64 and type in (inp.int32, inp.int64, None):
-        tol = 1e-03
-
-    # check decomposition
-    dpnp_diag_s = inp.zeros(shape, dtype=dpnp_s.dtype)
-    for i in range(dpnp_s.size):
-        dpnp_diag_s[i, i] = dpnp_s[i]
-
-    # check decomposition
-    assert_allclose(
-        ia, inp.dot(dpnp_u, inp.dot(dpnp_diag_s, dpnp_vt)), rtol=tol, atol=tol
-    )
-
-    # compare singular values
-    # assert_allclose(dpnp_s, np_s, rtol=tol, atol=tol)
-
-    # change sign of vectors
-    for i in range(min(shape[0], shape[1])):
-        if np_u[0, i] * dpnp_u[0, i] < 0:
-            np_u[:, i] = -np_u[:, i]
-            np_vt[i, :] = -np_vt[i, :]
-
-    # compare vectors for non-zero values
-    for i in range(numpy.count_nonzero(np_s > tol)):
-        assert_allclose(
-            inp.asnumpy(dpnp_u)[:, i], np_u[:, i], rtol=tol, atol=tol
-        )
-        assert_allclose(
-            inp.asnumpy(dpnp_vt)[i, :], np_vt[i, :], rtol=tol, atol=tol
-        )
-
-
 class TestSolve:
     @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
     def test_solve(self, dtype):
@@ -1028,3 +971,149 @@ def test_slogdet_errors(self):
         # unsupported type
         a_np = inp.asnumpy(a_dp)
         assert_raises(TypeError, inp.linalg.slogdet, a_np)
+
+
+class TestSvd:
+    def get_tol(self, dtype):
+        tol = 1e-06
+        if dtype in (inp.float32, inp.complex64):
+            tol = 1e-04
+        elif not has_support_aspect64() and dtype in (
+            inp.int32,
+            inp.int64,
+            None,
+        ):
+            tol = 1e-04
+        self._tol = tol
+
+    def check_types_shapes(
+        self, dp_u, dp_s, dp_vt, np_u, np_s, np_vt, compute_vt=True
+    ):
+        if has_support_aspect64():
+            if compute_vt:
+                assert dp_u.dtype == np_u.dtype
+                assert dp_vt.dtype == np_vt.dtype
+            assert dp_s.dtype == np_s.dtype
+        else:
+            if compute_vt:
+                assert dp_u.dtype.kind == np_u.dtype.kind
+                assert dp_vt.dtype.kind == np_vt.dtype.kind
+            assert dp_s.dtype.kind == np_s.dtype.kind
+
+        if compute_vt:
+            assert dp_u.shape == np_u.shape
+            assert dp_vt.shape == np_vt.shape
+        assert dp_s.shape == np_s.shape
+
+    # Checks the accuracy of singular value decomposition (SVD).
+    # Compares the reconstructed matrix from the decomposed components
+    # with the original matrix.
+    # Additionally checks for equality of singular values
+    # between dpnp and numpy decompositions
+    def check_decomposition(
+        self, dp_a, dp_u, dp_s, dp_vt, np_u, np_s, np_vt, compute_vt
+    ):
+        tol = self._tol
+        if compute_vt:
+            dpnp_diag_s = inp.zeros_like(dp_a, dtype=dp_s.dtype)
+            for i in range(min(dp_a.shape[-2], dp_a.shape[-1])):
+                dpnp_diag_s[..., i, i] = dp_s[..., i]
+            # TODO: remove it when dpnp.dot is updated
+            # dpnp.dot does not support complex type
+            if inp.issubdtype(dp_a.dtype, inp.complexfloating):
+                reconstructed = numpy.dot(
+                    inp.asnumpy(dp_u),
+                    numpy.dot(inp.asnumpy(dpnp_diag_s), inp.asnumpy(dp_vt)),
+                )
+            else:
+                reconstructed = inp.dot(dp_u, inp.dot(dpnp_diag_s, dp_vt))
+            # TODO: use assert dpnp.allclose() inside check_decomposition()
+            # when it will support complex dtypes
+            assert_allclose(dp_a, reconstructed, rtol=tol, atol=1e-4)
+
+        assert_allclose(dp_s, np_s, rtol=tol, atol=1e-03)
+
+        if compute_vt:
+            for i in range(min(dp_a.shape[-2], dp_a.shape[-1])):
+                if np_u[..., 0, i] * dp_u[..., 0, i] < 0:
+                    np_u[..., :, i] = -np_u[..., :, i]
+                    np_vt[..., i, :] = -np_vt[..., i, :]
+            for i in range(numpy.count_nonzero(np_s > tol)):
+                assert_allclose(
+                    inp.asnumpy(dp_u[..., :, i]),
+                    np_u[..., :, i],
+                    rtol=tol,
+                    atol=tol,
+                )
+                assert_allclose(
+                    inp.asnumpy(dp_vt[..., i, :]),
+                    np_vt[..., i, :],
+                    rtol=tol,
+                    atol=tol,
+                )
+
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize(
+        "shape",
+        [(2, 2), (3, 4), (5, 3), (16, 16)],
+        ids=["(2,2)", "(3,4)", "(5,3)", "(16,16)"],
+    )
+    def test_svd(self, dtype, shape):
+        a = numpy.arange(shape[0] * shape[1], dtype=dtype).reshape(shape)
+        dp_a = inp.array(a)
+
+        np_u, np_s, np_vt = numpy.linalg.svd(a)
+        dp_u, dp_s, dp_vt = inp.linalg.svd(dp_a)
+
+        self.check_types_shapes(dp_u, dp_s, dp_vt, np_u, np_s, np_vt)
+        self.get_tol(dtype)
+        self.check_decomposition(
+            dp_a, dp_u, dp_s, dp_vt, np_u, np_s, np_vt, True
+        )
+
+    @pytest.mark.parametrize("dtype", get_complex_dtypes())
+    @pytest.mark.parametrize("compute_vt", [True, False], ids=["True", "False"])
+    @pytest.mark.parametrize(
+        "shape",
+        [(2, 2), (16, 16)],
+        ids=["(2,2)", "(16, 16)"],
+    )
+    def test_svd_hermitian(self, dtype, compute_vt, shape):
+        a = numpy.random.randn(*shape) + 1j * numpy.random.randn(*shape)
+        a = numpy.conj(a.T) @ a
+
+        a = a.astype(dtype)
+        dp_a = inp.array(a)
+
+        if compute_vt:
+            np_u, np_s, np_vt = numpy.linalg.svd(
+                a, compute_uv=compute_vt, hermitian=True
+            )
+            dp_u, dp_s, dp_vt = inp.linalg.svd(
+                dp_a, compute_uv=compute_vt, hermitian=True
+            )
+        else:
+            np_s = numpy.linalg.svd(a, compute_uv=compute_vt, hermitian=True)
+            dp_s = inp.linalg.svd(dp_a, compute_uv=compute_vt, hermitian=True)
+            np_u = np_vt = dp_u = dp_vt = None
+
+        self.check_types_shapes(
+            dp_u, dp_s, dp_vt, np_u, np_s, np_vt, compute_vt
+        )
+
+        self.get_tol(dtype)
+
+        self.check_decomposition(
+            dp_a, dp_u, dp_s, dp_vt, np_u, np_s, np_vt, compute_vt
+        )
+
+    def test_svd_errors(self):
+        a_dp = inp.array([[1, 2], [3, 4]], dtype="float32")
+
+        # unsupported type
+        a_np = inp.asnumpy(a_dp)
+        assert_raises(TypeError, inp.linalg.svd, a_np)
+
+        # a.ndim < 2
+        a_dp_ndim_1 = a_dp.flatten()
+        assert_raises(inp.linalg.LinAlgError, inp.linalg.svd, a_dp_ndim_1)
diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py
index 7a7bcd53e0b..205d4efb572 100644
--- a/tests/test_sycl_queue.py
+++ b/tests/test_sycl_queue.py
@@ -1230,53 +1230,62 @@ def test_qr(device):
     valid_devices,
     ids=[device.filter_string for device in valid_devices],
 )
-def test_svd(device):
-    shape = (2, 2)
+@pytest.mark.parametrize("full_matrices", [True, False], ids=["True", "False"])
+@pytest.mark.parametrize("compute_uv", [True, False], ids=["True", "False"])
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (1, 4),
+        (3, 2),
+        (4, 4),
+        (2, 0),
+        (0, 2),
+        (2, 2, 3),
+        (3, 3, 0),
+        (0, 2, 3),
+        (1, 0, 3),
+    ],
+    ids=[
+        "(1, 4)",
+        "(3, 2)",
+        "(4, 4)",
+        "(2, 0)",
+        "(0, 2)",
+        "(2, 2, 3)",
+        "(3, 3, 0)",
+        "(0, 2, 3)",
+        "(1, 0, 3)",
+    ],
+)
+def test_svd(shape, full_matrices, compute_uv, device):
     dtype = dpnp.default_float_type(device)
-    numpy_data = numpy.arange(shape[0] * shape[1], dtype=dtype).reshape(shape)
-    dpnp_data = dpnp.arange(
-        shape[0] * shape[1], dtype=dtype, device=device
-    ).reshape(shape)
-
-    np_u, np_s, np_vt = numpy.linalg.svd(numpy_data)
-    dpnp_u, dpnp_s, dpnp_vt = dpnp.linalg.svd(dpnp_data)
-
-    assert dpnp_u.dtype == np_u.dtype
-    assert dpnp_s.dtype == np_s.dtype
-    assert dpnp_vt.dtype == np_vt.dtype
-    assert dpnp_u.shape == np_u.shape
-    assert dpnp_s.shape == np_s.shape
-    assert dpnp_vt.shape == np_vt.shape
-
-    # check decomposition
-    dpnp_diag_s = dpnp.zeros(shape, dtype=dpnp_s.dtype, device=device)
-    for i in range(dpnp_s.size):
-        dpnp_diag_s[i, i] = dpnp_s[i]
-
-    # check decomposition
-    assert_dtype_allclose(
-        dpnp_data, dpnp.dot(dpnp_u, dpnp.dot(dpnp_diag_s, dpnp_vt))
+
+    count_elems = numpy.prod(shape)
+    dpnp_data = dpnp.arange(count_elems, dtype=dtype, device=device).reshape(
+        shape
     )
+    expected_queue = dpnp_data.get_array().sycl_queue
 
-    for i in range(min(shape[0], shape[1])):
-        if np_u[0, i] * dpnp_u[0, i] < 0:
-            np_u[:, i] = -np_u[:, i]
-            np_vt[i, :] = -np_vt[i, :]
+    if compute_uv:
+        dpnp_u, dpnp_s, dpnp_vt = dpnp.linalg.svd(
+            dpnp_data, full_matrices=full_matrices, compute_uv=compute_uv
+        )
 
-    # compare vectors for non-zero values
-    for i in range(numpy.count_nonzero(np_s)):
-        assert_dtype_allclose(dpnp_u[:, i], np_u[:, i])
-        assert_dtype_allclose(dpnp_vt[i, :], np_vt[i, :])
+        dpnp_u_queue = dpnp_u.get_array().sycl_queue
+        dpnp_vt_queue = dpnp_vt.get_array().sycl_queue
+        dpnp_s_queue = dpnp_s.get_array().sycl_queue
 
-    expected_queue = dpnp_data.get_array().sycl_queue
-    dpnp_u_queue = dpnp_u.get_array().sycl_queue
-    dpnp_s_queue = dpnp_s.get_array().sycl_queue
-    dpnp_vt_queue = dpnp_vt.get_array().sycl_queue
+        assert_sycl_queue_equal(dpnp_u_queue, expected_queue)
+        assert_sycl_queue_equal(dpnp_vt_queue, expected_queue)
+        assert_sycl_queue_equal(dpnp_s_queue, expected_queue)
 
-    # compare queue and device
-    assert_sycl_queue_equal(dpnp_u_queue, expected_queue)
-    assert_sycl_queue_equal(dpnp_s_queue, expected_queue)
-    assert_sycl_queue_equal(dpnp_vt_queue, expected_queue)
+    else:
+        dpnp_s = dpnp.linalg.svd(
+            dpnp_data, full_matrices=full_matrices, compute_uv=compute_uv
+        )
+        dpnp_s_queue = dpnp_s.get_array().sycl_queue
+
+        assert_sycl_queue_equal(dpnp_s_queue, expected_queue)
 
 
 @pytest.mark.parametrize(
diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py
index ada68ebfa6c..bff548a90d0 100644
--- a/tests/test_usm_type.py
+++ b/tests/test_usm_type.py
@@ -740,3 +740,53 @@ def test_inv(shape, is_empty, usm_type):
     result = dp.linalg.inv(x)
 
     assert x.usm_type == result.usm_type
+
+
+@pytest.mark.parametrize("usm_type", list_of_usm_types, ids=list_of_usm_types)
+@pytest.mark.parametrize(
+    "full_matrices_param", [True, False], ids=["True", "False"]
+)
+@pytest.mark.parametrize(
+    "compute_uv_param", [True, False], ids=["True", "False"]
+)
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (1, 4),
+        (3, 2),
+        (4, 4),
+        (2, 0),
+        (0, 2),
+        (2, 2, 3),
+        (3, 3, 0),
+        (0, 2, 3),
+        (1, 0, 3),
+    ],
+    ids=[
+        "(1, 4)",
+        "(3, 2)",
+        "(4, 4)",
+        "(2, 0)",
+        "(0, 2)",
+        "(2, 2, 3)",
+        "(3, 3, 0)",
+        "(0, 2, 3)",
+        "(1, 0, 3)",
+    ],
+)
+def test_svd(usm_type, shape, full_matrices_param, compute_uv_param):
+    x = dp.ones(shape, usm_type=usm_type)
+
+    if compute_uv_param:
+        u, s, vt = dp.linalg.svd(
+            x, full_matrices=full_matrices_param, compute_uv=compute_uv_param
+        )
+
+        assert x.usm_type == u.usm_type
+        assert x.usm_type == vt.usm_type
+    else:
+        s = dp.linalg.svd(
+            x, full_matrices=full_matrices_param, compute_uv=compute_uv_param
+        )
+
+    assert x.usm_type == s.usm_type
diff --git a/tests/third_party/cupy/linalg_tests/test_decomposition.py b/tests/third_party/cupy/linalg_tests/test_decomposition.py
index 42bcf122ff4..fd887c16e6c 100644
--- a/tests/third_party/cupy/linalg_tests/test_decomposition.py
+++ b/tests/third_party/cupy/linalg_tests/test_decomposition.py
@@ -6,6 +6,7 @@
 import dpnp as cupy
 from tests.helper import has_support_aspect64, is_cpu_device
 from tests.third_party.cupy import testing
+from tests.third_party.cupy.testing import _condition
 
 
 def random_matrix(shape, dtype, scale, sym=False):
@@ -44,6 +45,14 @@ def random_matrix(shape, dtype, scale, sym=False):
     return new_a.astype(dtype)
 
 
+def stacked_identity(xp, batch_shape, n, dtype):
+    shape = batch_shape + (n, n)
+    idx = xp.arange(n)
+    x = xp.zeros(shape, dtype=dtype)
+    x[..., idx, idx] = 1
+    return x
+
+
 class TestCholeskyDecomposition:
     @testing.numpy_cupy_allclose(atol=1e-3, type_check=has_support_aspect64())
     def check_L(self, array, xp):
@@ -135,3 +144,244 @@ def check_L(self, array):
     def test_decomposition(self, dtype):
         A = numpy.array([[1, -2], [-2, 1]]).astype(dtype)
         self.check_L(A)
+
+
+@testing.parameterize(
+    *testing.product(
+        {
+            "full_matrices": [True, False],
+        }
+    )
+)
+@testing.fix_random()
+class TestSVD(unittest.TestCase):
+    # TODO: New packages that fix issue CMPLRLLVM-53771 are only available in internal CI.
+    # Skip the tests on cpu until these packages are available for the external CI.
+    # Specifically dpcpp_linux-64>=2024.1.0
+    @classmethod
+    def setUpClass(cls):
+        if is_cpu_device():
+            raise unittest.SkipTest("CMPLRLLVM-53771")
+
+    def setUp(self):
+        self.seed = testing.generate_seed()
+
+    @testing.for_dtypes(
+        [
+            numpy.int32,
+            numpy.int64,
+            numpy.uint32,
+            numpy.uint64,
+            numpy.float32,
+            numpy.float64,
+            numpy.complex64,
+            numpy.complex128,
+        ]
+    )
+    def check_usv(self, shape, dtype):
+        array = testing.shaped_random(shape, numpy, dtype=dtype, seed=self.seed)
+        a_cpu = numpy.asarray(array, dtype=dtype)
+        a_gpu = cupy.asarray(array, dtype=dtype)
+        result_cpu = numpy.linalg.svd(a_cpu, full_matrices=self.full_matrices)
+        result_gpu = cupy.linalg.svd(a_gpu, full_matrices=self.full_matrices)
+        # Check if the input matrix is not broken
+        testing.assert_allclose(a_gpu, a_cpu)
+
+        assert len(result_gpu) == 3
+        for i in range(3):
+            assert result_gpu[i].shape == result_cpu[i].shape
+            if has_support_aspect64():
+                assert result_gpu[i].dtype == result_cpu[i].dtype
+            else:
+                assert result_gpu[i].dtype.kind == result_cpu[i].dtype.kind
+        u_cpu, s_cpu, vh_cpu = result_cpu
+        u_gpu, s_gpu, vh_gpu = result_gpu
+        testing.assert_allclose(s_gpu, s_cpu, rtol=1e-5, atol=1e-4)
+
+        # reconstruct the matrix
+        k = s_cpu.shape[-1]
+
+        # dpnp.dot/matmul does not support complex types and is unstable on CPU
+        # TODO: remove it and use xp.dot/matmul when dpnp.dot/matmul is updated
+        u_gpu = u_gpu.asnumpy()
+        vh_gpu = vh_gpu.asnumpy()
+        s_gpu = s_gpu.asnumpy()
+        xp = numpy
+
+        if len(shape) == 2:
+            if self.full_matrices:
+                a_gpu_usv = numpy.dot(u_gpu[:, :k] * s_gpu, vh_gpu[:k, :])
+            else:
+                a_gpu_usv = numpy.dot(u_gpu * s_gpu, vh_gpu)
+        else:
+            if self.full_matrices:
+                a_gpu_usv = numpy.matmul(
+                    u_gpu[..., :k] * s_gpu[..., None, :], vh_gpu[..., :k, :]
+                )
+            else:
+                a_gpu_usv = numpy.matmul(u_gpu * s_gpu[..., None, :], vh_gpu)
+        testing.assert_allclose(a_gpu, a_gpu_usv, rtol=1e-4, atol=1e-4)
+
+        # assert unitary
+        u_len = u_gpu.shape[-1]
+        vh_len = vh_gpu.shape[-2]
+        testing.assert_allclose(
+            xp.matmul(u_gpu.swapaxes(-1, -2).conj(), u_gpu),
+            stacked_identity(xp, shape[:-2], u_len, dtype),
+            atol=1e-4,
+        )
+        testing.assert_allclose(
+            xp.matmul(vh_gpu, vh_gpu.swapaxes(-1, -2).conj()),
+            stacked_identity(xp, shape[:-2], vh_len, dtype),
+            atol=1e-4,
+        )
+
+    @testing.for_dtypes(
+        [
+            numpy.int32,
+            numpy.int64,
+            numpy.uint32,
+            numpy.uint64,
+            numpy.float32,
+            numpy.float64,
+            numpy.complex64,
+            numpy.complex128,
+        ]
+    )
+    # dpnp.linalg.svd() returns results as F-contiguous
+    # while numpy.linalg.svd() returns as C-contiguous
+    @testing.numpy_cupy_allclose(
+        rtol=1e-5,
+        atol=1e-4,
+        type_check=has_support_aspect64(),
+        contiguous_check=False,
+    )
+    def check_singular(self, shape, xp, dtype):
+        array = testing.shaped_random(shape, xp, dtype=dtype, seed=self.seed)
+        a = xp.asarray(array, dtype=dtype)
+        a_copy = a.copy()
+        result = xp.linalg.svd(
+            a, full_matrices=self.full_matrices, compute_uv=False
+        )
+        # Check if the input matrix is not broken
+        assert (a == a_copy).all()
+        return result
+
+    @_condition.repeat(3, 10)
+    def test_svd_rank2(self):
+        self.check_usv((3, 7))
+        self.check_usv((2, 2))
+        self.check_usv((7, 3))
+
+    @_condition.repeat(3, 10)
+    def test_svd_rank2_no_uv(self):
+        self.check_singular((3, 7))
+        self.check_singular((2, 2))
+        self.check_singular((7, 3))
+
+    @testing.with_requires("numpy>=1.16")
+    def test_svd_rank2_empty_array(self):
+        self.check_usv((0, 3))
+        self.check_usv((3, 0))
+        self.check_usv((1, 0))
+
+    @testing.with_requires("numpy>=1.16")
+    @testing.numpy_cupy_array_equal(type_check=has_support_aspect64())
+    def test_svd_rank2_empty_array_compute_uv_false(self, xp):
+        array = xp.empty((3, 0))
+        return xp.linalg.svd(
+            array, full_matrices=self.full_matrices, compute_uv=False
+        )
+
+    @_condition.repeat(3, 10)
+    def test_svd_rank3(self):
+        self.check_usv((2, 3, 4))
+        self.check_usv((2, 3, 7))
+        self.check_usv((2, 4, 4))
+        self.check_usv((2, 7, 3))
+        self.check_usv((2, 4, 3))
+        self.check_usv((2, 32, 32))
+
+    @_condition.repeat(3, 10)
+    def test_svd_rank3_loop(self):
+        # This tests the loop-based batched gesvd on CUDA (_gesvd_batched)
+        self.check_usv((2, 64, 64))
+        self.check_usv((2, 64, 32))
+        self.check_usv((2, 32, 64))
+
+    @_condition.repeat(3, 10)
+    def test_svd_rank3_no_uv(self):
+        self.check_singular((2, 3, 4))
+        self.check_singular((2, 3, 7))
+        self.check_singular((2, 4, 4))
+        self.check_singular((2, 7, 3))
+        self.check_singular((2, 4, 3))
+
+    @_condition.repeat(3, 10)
+    def test_svd_rank3_no_uv_loop(self):
+        # This tests the loop-based batched gesvd on CUDA (_gesvd_batched)
+        self.check_singular((2, 64, 64))
+        self.check_singular((2, 64, 32))
+        self.check_singular((2, 32, 64))
+
+    @testing.with_requires("numpy>=1.16")
+    def test_svd_rank3_empty_array(self):
+        self.check_usv((0, 3, 4))
+        self.check_usv((3, 0, 4))
+        self.check_usv((3, 4, 0))
+        self.check_usv((3, 0, 0))
+        self.check_usv((0, 3, 0))
+        self.check_usv((0, 0, 3))
+
+    @testing.with_requires("numpy>=1.16")
+    @testing.numpy_cupy_array_equal(type_check=has_support_aspect64())
+    def test_svd_rank3_empty_array_compute_uv_false1(self, xp):
+        array = xp.empty((3, 0, 4))
+        return xp.linalg.svd(
+            array, full_matrices=self.full_matrices, compute_uv=False
+        )
+
+    @testing.with_requires("numpy>=1.16")
+    @testing.numpy_cupy_array_equal(type_check=has_support_aspect64())
+    def test_svd_rank3_empty_array_compute_uv_false2(self, xp):
+        array = xp.empty((0, 3, 4))
+        return xp.linalg.svd(
+            array, full_matrices=self.full_matrices, compute_uv=False
+        )
+
+    @_condition.repeat(3, 10)
+    def test_svd_rank4(self):
+        self.check_usv((2, 2, 3, 4))
+        self.check_usv((2, 2, 3, 7))
+        self.check_usv((2, 2, 4, 4))
+        self.check_usv((2, 2, 7, 3))
+        self.check_usv((2, 2, 4, 3))
+        self.check_usv((2, 2, 32, 32))
+
+    @_condition.repeat(3, 10)
+    def test_svd_rank4_loop(self):
+        # This tests the loop-based batched gesvd on CUDA (_gesvd_batched)
+        self.check_usv((3, 2, 64, 64))
+        self.check_usv((3, 2, 64, 32))
+        self.check_usv((3, 2, 32, 64))
+
+    @_condition.repeat(3, 10)
+    def test_svd_rank4_no_uv(self):
+        self.check_singular((2, 2, 3, 4))
+        self.check_singular((2, 2, 3, 7))
+        self.check_singular((2, 2, 4, 4))
+        self.check_singular((2, 2, 7, 3))
+        self.check_singular((2, 2, 4, 3))
+
+    @_condition.repeat(3, 10)
+    def test_svd_rank4_no_uv_loop(self):
+        # This tests the loop-based batched gesvd on CUDA (_gesvd_batched)
+        self.check_singular((3, 2, 64, 64))
+        self.check_singular((3, 2, 64, 32))
+        self.check_singular((3, 2, 32, 64))
+
+    @testing.with_requires("numpy>=1.16")
+    def test_svd_rank4_empty_array(self):
+        self.check_usv((0, 2, 3, 4))
+        self.check_usv((1, 2, 0, 4))
+        self.check_usv((1, 2, 3, 0))
diff --git a/tests/third_party/cupy/linalg_tests/test_solve.py b/tests/third_party/cupy/linalg_tests/test_solve.py
index b31082c8e84..cd397f6c9e1 100644
--- a/tests/third_party/cupy/linalg_tests/test_solve.py
+++ b/tests/third_party/cupy/linalg_tests/test_solve.py
@@ -10,7 +10,7 @@
     is_cpu_device,
 )
 from tests.third_party.cupy import testing
-from tests.third_party.cupy.testing import condition
+from tests.third_party.cupy.testing import _condition
 
 
 @testing.parameterize(
@@ -104,7 +104,7 @@ def test_invalid_shape(self):
 )
 class TestInv(unittest.TestCase):
     @testing.for_dtypes("ifdFD")
-    @condition.retry(10)
+    @_condition.retry(10)
     def check_x(self, a_shape, dtype):
         a_cpu = numpy.random.randint(0, 10, size=a_shape)
         a_cpu = a_cpu.astype(dtype, order=self.order)
diff --git a/tests/third_party/cupy/random_tests/test_sample.py b/tests/third_party/cupy/random_tests/test_sample.py
index f95f3e42710..79e2370ad05 100644
--- a/tests/third_party/cupy/random_tests/test_sample.py
+++ b/tests/third_party/cupy/random_tests/test_sample.py
@@ -7,7 +7,7 @@
 import dpnp as cupy
 from dpnp import random
 from tests.third_party.cupy import testing
-from tests.third_party.cupy.testing import condition, hypothesis
+from tests.third_party.cupy.testing import _condition, hypothesis
 
 
 @testing.gpu
@@ -43,7 +43,7 @@ def test_zero_sizes(self):
 @testing.gpu
 class TestRandint2(unittest.TestCase):
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
-    @condition.repeat(3, 10)
+    @_condition.repeat(3, 10)
     def test_bound_1(self):
         vals = [random.randint(0, 10, (2, 3)) for _ in range(10)]
         for val in vals:
@@ -52,7 +52,7 @@ def test_bound_1(self):
         self.assertEqual(max(_.max() for _ in vals), 9)
 
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
-    @condition.repeat(3, 10)
+    @_condition.repeat(3, 10)
     def test_bound_2(self):
         vals = [random.randint(0, 2) for _ in range(20)]
         for val in vals:
@@ -61,7 +61,7 @@ def test_bound_2(self):
         self.assertEqual(max(_.max() for _ in vals), 1)
 
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
-    @condition.repeat(3, 10)
+    @_condition.repeat(3, 10)
     def test_bound_overflow(self):
         # 100 - (-100) exceeds the range of int8
         val = random.randint(numpy.int8(-100), numpy.int8(100), size=20)
@@ -70,7 +70,7 @@ def test_bound_overflow(self):
         self.assertLess(val.max(), 100)
 
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
-    @condition.repeat(3, 10)
+    @_condition.repeat(3, 10)
     def test_bound_float1(self):
         # generate floats s.t. int(low) < int(high)
         low, high = sorted(numpy.random.uniform(-5, 5, size=2))
@@ -90,7 +90,7 @@ def test_bound_float2(self):
         self.assertEqual(min(_.min() for _ in vals), -1)
         self.assertEqual(max(_.max() for _ in vals), 0)
 
-    @condition.repeat(3, 10)
+    @_condition.repeat(3, 10)
     def test_goodness_of_fit(self):
         mx = 5
         trial = 100
@@ -99,7 +99,7 @@ def test_goodness_of_fit(self):
         expected = numpy.array([float(trial) / mx] * mx)
         self.assertTrue(hypothesis.chi_square_test(counts, expected))
 
-    @condition.repeat(3, 10)
+    @_condition.repeat(3, 10)
     def test_goodness_of_fit_2(self):
         mx = 5
         vals = random.randint(mx, size=(5, 20))
@@ -169,7 +169,7 @@ def test_size_is_not_none(self):
 @testing.fix_random()
 @testing.gpu
 class TestRandomIntegers2(unittest.TestCase):
-    @condition.repeat(3, 10)
+    @_condition.repeat(3, 10)
     def test_bound_1(self):
         vals = [random.random_integers(0, 10, (2, 3)).get() for _ in range(10)]
         for val in vals:
@@ -177,7 +177,7 @@ def test_bound_1(self):
         self.assertEqual(min(_.min() for _ in vals), 0)
         self.assertEqual(max(_.max() for _ in vals), 10)
 
-    @condition.repeat(3, 10)
+    @_condition.repeat(3, 10)
     def test_bound_2(self):
         vals = [random.random_integers(0, 2).get() for _ in range(20)]
         for val in vals:
@@ -185,7 +185,7 @@ def test_bound_2(self):
         self.assertEqual(min(vals), 0)
         self.assertEqual(max(vals), 2)
 
-    @condition.repeat(3, 10)
+    @_condition.repeat(3, 10)
     def test_goodness_of_fit(self):
         mx = 5
         trial = 100
@@ -194,7 +194,7 @@ def test_goodness_of_fit(self):
         expected = numpy.array([float(trial) / mx] * mx)
         self.assertTrue(hypothesis.chi_square_test(counts, expected))
 
-    @condition.repeat(3, 10)
+    @_condition.repeat(3, 10)
     def test_goodness_of_fit_2(self):
         mx = 5
         vals = random.randint(0, mx, (5, 20)).get()
@@ -289,7 +289,7 @@ def test_randn_invalid_argument(self):
 @testing.fix_random()
 @testing.gpu
 class TestMultinomial(unittest.TestCase):
-    @condition.repeat(3, 10)
+    @_condition.repeat(3, 10)
     @testing.for_float_dtypes()
     @testing.numpy_cupy_allclose(rtol=0.05)
     def test_multinomial(self, xp, dtype):
diff --git a/tests/third_party/cupy/testing/__init__.py b/tests/third_party/cupy/testing/__init__.py
index 701c381e2f3..aa6c113706b 100644
--- a/tests/third_party/cupy/testing/__init__.py
+++ b/tests/third_party/cupy/testing/__init__.py
@@ -60,6 +60,4 @@
     product,
     product_dict,
 )
-from tests.third_party.cupy.testing.random import fix_random
-
-# from tests.third_party.cupy.testing.random import generate_seed
+from tests.third_party.cupy.testing.random import fix_random, generate_seed
diff --git a/tests/third_party/cupy/testing/condition.py b/tests/third_party/cupy/testing/_condition.py
similarity index 98%
rename from tests/third_party/cupy/testing/condition.py
rename to tests/third_party/cupy/testing/_condition.py
index 4465dc3d0ee..3533ef8b84d 100644
--- a/tests/third_party/cupy/testing/condition.py
+++ b/tests/third_party/cupy/testing/_condition.py
@@ -106,7 +106,7 @@ def repeat(times, intensive_times=None):
     if intensive_times is None:
         return repeat_with_success_at_least(times, times)
 
-    casual_test = bool(int(os.environ.get("CUPY_TEST_CASUAL", "0")))
+    casual_test = bool(int(os.environ.get("CUPY_TEST_CASUAL", "1")))
     times_ = times if casual_test else intensive_times
     return repeat_with_success_at_least(times_, times_)
 
diff --git a/tests/third_party/cupy/testing/random.py b/tests/third_party/cupy/testing/random.py
index 444f2b3352c..ecc299737c0 100644
--- a/tests/third_party/cupy/testing/random.py
+++ b/tests/third_party/cupy/testing/random.py
@@ -20,12 +20,15 @@ def do_setup(deterministic=True):
     global _old_cupy_random_states
     _old_python_random_state = random.getstate()
     _old_numpy_random_state = numpy.random.get_state()
-    _old_cupy_random_states = cupy.random.generator._random_states
-    cupy.random.reset_states()
+    _old_cupy_random_states = cupy.random.dpnp_iface_random._dpnp_random_states
+    cupy.random.dpnp_iface_random._dpnp_random_states = {}
     # Check that _random_state has been recreated in
     # cupy.random.reset_states(). Otherwise the contents of
     # _old_cupy_random_states would be overwritten.
-    assert cupy.random.generator._random_states is not _old_cupy_random_states
+    assert (
+        cupy.random.dpnp_iface_random._dpnp_random_states
+        is not _old_cupy_random_states
+    )
 
     if not deterministic:
         random.seed()
@@ -43,7 +46,7 @@ def do_teardown():
     global _old_cupy_random_states
     random.setstate(_old_python_random_state)
     numpy.random.set_state(_old_numpy_random_state)
-    cupy.random.generator._random_states = _old_cupy_random_states
+    cupy.random.dpnp_iface_random._dpnp_random_states = _old_cupy_random_states
     _old_python_random_state = None
     _old_numpy_random_state = None
     _old_cupy_random_states = None
@@ -91,12 +94,12 @@ def fix_random():
     """Decorator that fixes random numbers in a test.
 
     This decorator can be applied to either a test case class or a test method.
-    It should not be applied within ``condition.retry`` or
-    ``condition.repeat``.
+    It should not be applied within ``_condition.retry`` or
+    ``_condition.repeat``.
     """
 
     # TODO(niboshi): Prevent this decorator from being applied within
-    #    condition.repeat or condition.retry decorators. That would repeat
+    #    _condition.repeat or _condition.retry decorators. That would repeat
     #    tests with the same random seeds. It's okay to apply this outside
     #    these decorators.
 

From b46e0f62ca8d90ee03c267d9559e33c1b02c7736 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Sat, 3 Feb 2024 00:34:18 +0100
Subject: [PATCH 03/29] Pin conda-build to `3.28.4` version in GitHub action
 (#1678)

* Add extra pre-step to free more memory on Ubuntu runners

* Exclude Windows runners

* Enable conda verbosity

* Pin conda-build to 3.28.4

* Pinned conda-build in test jobs
---
 .github/workflows/conda-package.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index 32f62306ae5..d9072c26a65 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -96,7 +96,7 @@ jobs:
           (echo CONDA_BLD=%CONDA_PREFIX%\conda-bld\win-64\) >> %GITHUB_ENV%
 
       - name: Install conda-build
-        run: conda install conda-build
+        run: conda install conda-build=3.28.4
 
       - name: Cache conda packages
         uses: actions/cache@v4
@@ -167,7 +167,7 @@ jobs:
 
       # Needed to be able to run conda index
       - name: Install conda-build
-        run: conda install conda-build
+        run: conda install conda-build=3.28.4
 
       - name: Create conda channel
         run: conda index ${{ env.channel-path }}
@@ -283,7 +283,7 @@ jobs:
 
       # Needed to be able to run conda index
       - name: Install conda-build
-        run: conda install conda-build
+        run: conda install conda-build=3.28.4
 
       - name: Create conda channel
         run: conda index ${{ env.channel-path }}

From 38a7ca8fdef58595d0ddc6909568a0a7ce1a95bd Mon Sep 17 00:00:00 2001
From: vtavana <120411540+vtavana@users.noreply.github.com>
Date: Fri, 2 Feb 2024 22:35:49 -0600
Subject: [PATCH 04/29] update `build_locally.py` (#1677)

* update build_locally.py

* fix pre-commit

* add comments

---------

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 scripts/build_locally.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/scripts/build_locally.py b/scripts/build_locally.py
index c8ff30d34ce..a0e5c55edc9 100644
--- a/scripts/build_locally.py
+++ b/scripts/build_locally.py
@@ -58,12 +58,31 @@ def run(
         cmake_args += [
             "--cmake-executable=" + cmake_executable,
         ]
+
+    # If dpctl was built locally with `scripts/build_locally.py`, CMake must
+    # be given -DDpctl_ROOT=$(python -m dpctl --cmakedir) to locate it.
+    # If dpctl was installed via conda, passing this parameter is optional.
+    process = subprocess.Popen(
+        ["python", "-m", "dpctl", "--cmakedir"],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+    output, error = process.communicate()
+    if process.returncode == 0:
+        cmake_dir = output.decode("utf-8").strip()
+    else:
+        raise RuntimeError(
+            "Failed to retrieve dpctl cmake directory: "
+            + error.decode("utf-8").strip()
+        )
+
     cmake_args += [
         "--build-type=" + build_type,
         "--generator=" + build_system,
         "--",
         "-DCMAKE_C_COMPILER:PATH=" + c_compiler,
         "-DCMAKE_CXX_COMPILER:PATH=" + cxx_compiler,
+        "-DDpctl_ROOT=" + cmake_dir,
     ]
     if verbose:
         cmake_args += [

From 7c4b39ac9681790b71c866dac74485548a67c510 Mon Sep 17 00:00:00 2001
From: Natalia Polina <natalia.polina@intel.com>
Date: Sat, 3 Feb 2024 06:29:08 -0800
Subject: [PATCH 05/29] Implement sparse and copy arguments for dpnp.meshgrid
 function (#1675)

* Implement sparse and copy arguments for dpnp.meshgrid function

* address comments

* Removed limitation block from the description

* added tests

---------

Co-authored-by: Anton Volkov <antonwolfy@gmail.com>
Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/dpnp_container.py           | 12 ------
 dpnp/dpnp_iface_arraycreation.py | 64 +++++++++++++++++++++++---------
 tests/skipped_tests.tbl          | 24 ------------
 tests/skipped_tests_gpu.tbl      | 24 ------------
 tests/test_arraycreation.py      |  9 +++++
 5 files changed, 56 insertions(+), 77 deletions(-)

diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py
index 3aa4478baa0..243899bee80 100644
--- a/dpnp/dpnp_container.py
+++ b/dpnp/dpnp_container.py
@@ -271,18 +271,6 @@ def linspace(
     return dpnp_array(array_obj.shape, buffer=array_obj)
 
 
-def meshgrid(*xi, indexing="xy"):
-    """Creates list of `dpnp_array` coordinate matrices from vectors."""
-    if len(xi) == 0:
-        return []
-    arrays = tuple(dpnp.get_usm_ndarray(x) for x in xi)
-    arrays_obj = dpt.meshgrid(*arrays, indexing=indexing)
-    return [
-        dpnp_array._create_from_usm_ndarray(array_obj)
-        for array_obj in arrays_obj
-    ]
-
-
 def ones(
     shape,
     *,
diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py
index 067eb3fbb52..851ef119975 100644
--- a/dpnp/dpnp_iface_arraycreation.py
+++ b/dpnp/dpnp_iface_arraycreation.py
@@ -1394,12 +1394,28 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"):
 
     For full documentation refer to :obj:`numpy.meshgrid`.
 
-    Limitations
-    -----------
-    Each array instance from `xi` is supported as either :class:`dpnp.dpnp.ndarray` or :class:`dpctl.tensor.usm_ndarray`.
-    Parameter `copy` is supported only with default value ``True``.
-    Parameter `sparse` is supported only with default value ``False``.
-    Otherwise the function will be executed sequentially on CPU.
+    Parameters
+    ----------
+    x1, x2,..., xn : {dpnp.ndarray, usm_ndarray}
+        1-D arrays representing the coordinates of a grid.
+    indexing : {'xy', 'ij'}, optional
+        Cartesian ('xy', default) or matrix ('ij') indexing of output.
+    sparse : bool, optional
+        If True the shape of the returned coordinate array for dimension `i`
+        is reduced from ``(N1, ..., Ni, ... Nn)`` to
+        ``(1, ..., 1, Ni, 1, ..., 1)``. Default is False.
+    copy : bool, optional
+        If False, views into the original arrays are returned in order to
+        conserve memory.  Default is True.
+
+    Returns
+    -------
+    X1, X2,..., XN : tuple of dpnp.ndarrays
+        For vectors `x1`, `x2`,..., `xn` with lengths ``Ni=len(xi)``,
+        returns ``(N1, N2, N3,..., Nn)`` shaped arrays if indexing='ij'
+        or ``(N2, N1, N3,..., Nn)`` shaped arrays if indexing='xy'
+        with the elements of `xi` repeated to fill the matrix along
+        the first dimension for `x1`, the second for `x2` and so on.
 
     Examples
     --------
@@ -1433,18 +1449,32 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"):
 
     """
 
-    if not all((isinstance(x, (dpnp.ndarray, dpt.usm_ndarray)) for x in xi)):
-        pass
-    elif indexing not in ["ij", "xy"]:
-        pass
-    elif copy is not True:
-        pass
-    elif sparse is not False:
-        pass
-    else:
-        return dpnp_container.meshgrid(*xi, indexing=indexing)
+    if not dpnp.check_supported_arrays_type(*xi):
+        raise TypeError("Each input array must be any of supported type")
+
+    ndim = len(xi)
+
+    if indexing not in ["xy", "ij"]:
+        raise ValueError(
+            "Unrecognized indexing keyword value, expecting 'xy' or 'ij'."
+        )
+
+    s0 = (1,) * ndim
+    output = [
+        dpnp.reshape(x, s0[:i] + (-1,) + s0[i + 1 :]) for i, x in enumerate(xi)
+    ]
+
+    if indexing == "xy" and ndim > 1:
+        output[0] = output[0].reshape((1, -1) + s0[2:])
+        output[1] = output[1].reshape((-1, 1) + s0[2:])
+
+    if not sparse:
+        output = dpnp.broadcast_arrays(*output)
+
+    if copy:
+        output = [x.copy() for x in output]
 
-    return call_origin(numpy.meshgrid, xi, copy, sparse, indexing)
+    return output
 
 
 class MGridClass:
diff --git a/tests/skipped_tests.tbl b/tests/skipped_tests.tbl
index 8eb46d3c983..018255c1e40 100644
--- a/tests/skipped_tests.tbl
+++ b/tests/skipped_tests.tbl
@@ -151,30 +151,6 @@ tests/third_party/cupy/creation_tests/test_basic.py::TestBasic::test_ones_like_s
 tests/third_party/cupy/creation_tests/test_basic.py::TestBasic::test_zeros_like_subok
 tests/third_party/cupy/creation_tests/test_basic.py::TestBasic::test_zeros_strides
 
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_0_{copy=False, indexing='xy', sparse=False}::test_meshgrid0
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_0_{copy=False, indexing='xy', sparse=False}::test_meshgrid1
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_0_{copy=False, indexing='xy', sparse=False}::test_meshgrid2
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_0_{copy=False, indexing='xy', sparse=False}::test_meshgrid3
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_1_{copy=False, indexing='xy', sparse=True}::test_meshgrid0
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_1_{copy=False, indexing='xy', sparse=True}::test_meshgrid1
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_1_{copy=False, indexing='xy', sparse=True}::test_meshgrid2
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_1_{copy=False, indexing='xy', sparse=True}::test_meshgrid3
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_2_{copy=False, indexing='ij', sparse=False}::test_meshgrid0
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_2_{copy=False, indexing='ij', sparse=False}::test_meshgrid1
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_2_{copy=False, indexing='ij', sparse=False}::test_meshgrid2
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_2_{copy=False, indexing='ij', sparse=False}::test_meshgrid3
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_3_{copy=False, indexing='ij', sparse=True}::test_meshgrid0
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_3_{copy=False, indexing='ij', sparse=True}::test_meshgrid1
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_3_{copy=False, indexing='ij', sparse=True}::test_meshgrid2
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_3_{copy=False, indexing='ij', sparse=True}::test_meshgrid3
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_5_{copy=True, indexing='xy', sparse=True}::test_meshgrid0
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_5_{copy=True, indexing='xy', sparse=True}::test_meshgrid1
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_5_{copy=True, indexing='xy', sparse=True}::test_meshgrid2
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_5_{copy=True, indexing='xy', sparse=True}::test_meshgrid3
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_7_{copy=True, indexing='ij', sparse=True}::test_meshgrid0
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_7_{copy=True, indexing='ij', sparse=True}::test_meshgrid1
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_7_{copy=True, indexing='ij', sparse=True}::test_meshgrid2
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_7_{copy=True, indexing='ij', sparse=True}::test_meshgrid3
 tests/third_party/cupy/indexing_tests/test_generate.py::TestAxisConcatenator::test_AxisConcatenator_init1
 tests/third_party/cupy/indexing_tests/test_generate.py::TestAxisConcatenator::test_len
 tests/third_party/cupy/indexing_tests/test_generate.py::TestC_::test_c_1
diff --git a/tests/skipped_tests_gpu.tbl b/tests/skipped_tests_gpu.tbl
index b8c195b9861..fe3671ecf7f 100644
--- a/tests/skipped_tests_gpu.tbl
+++ b/tests/skipped_tests_gpu.tbl
@@ -230,30 +230,6 @@ tests/third_party/cupy/creation_tests/test_basic.py::TestBasic::test_ones_like_s
 tests/third_party/cupy/creation_tests/test_basic.py::TestBasic::test_zeros_like_subok
 tests/third_party/cupy/creation_tests/test_basic.py::TestBasic::test_zeros_strides
 
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_0_{copy=False, indexing='xy', sparse=False}::test_meshgrid0
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_0_{copy=False, indexing='xy', sparse=False}::test_meshgrid1
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_0_{copy=False, indexing='xy', sparse=False}::test_meshgrid2
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_0_{copy=False, indexing='xy', sparse=False}::test_meshgrid3
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_1_{copy=False, indexing='xy', sparse=True}::test_meshgrid0
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_1_{copy=False, indexing='xy', sparse=True}::test_meshgrid1
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_1_{copy=False, indexing='xy', sparse=True}::test_meshgrid2
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_1_{copy=False, indexing='xy', sparse=True}::test_meshgrid3
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_2_{copy=False, indexing='ij', sparse=False}::test_meshgrid0
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_2_{copy=False, indexing='ij', sparse=False}::test_meshgrid1
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_2_{copy=False, indexing='ij', sparse=False}::test_meshgrid2
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_2_{copy=False, indexing='ij', sparse=False}::test_meshgrid3
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_3_{copy=False, indexing='ij', sparse=True}::test_meshgrid0
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_3_{copy=False, indexing='ij', sparse=True}::test_meshgrid1
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_3_{copy=False, indexing='ij', sparse=True}::test_meshgrid2
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_3_{copy=False, indexing='ij', sparse=True}::test_meshgrid3
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_5_{copy=True, indexing='xy', sparse=True}::test_meshgrid0
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_5_{copy=True, indexing='xy', sparse=True}::test_meshgrid1
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_5_{copy=True, indexing='xy', sparse=True}::test_meshgrid2
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_5_{copy=True, indexing='xy', sparse=True}::test_meshgrid3
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_7_{copy=True, indexing='ij', sparse=True}::test_meshgrid0
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_7_{copy=True, indexing='ij', sparse=True}::test_meshgrid1
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_7_{copy=True, indexing='ij', sparse=True}::test_meshgrid2
-tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_7_{copy=True, indexing='ij', sparse=True}::test_meshgrid3
 tests/third_party/cupy/creation_tests/test_ranges.py::TestRanges::test_arange_negative_size
 tests/third_party/cupy/creation_tests/test_ranges.py::TestRanges::test_arange_no_dtype_int
 
diff --git a/tests/test_arraycreation.py b/tests/test_arraycreation.py
index 0a4ce206337..f7b06ffc9be 100644
--- a/tests/test_arraycreation.py
+++ b/tests/test_arraycreation.py
@@ -878,3 +878,12 @@ def test_logspace_axis(axis):
         [2, 3], [20, 15], num=2, base=[[1, 3], [5, 7]], axis=axis
     )
     assert_dtype_allclose(func(dpnp), func(numpy))
+
+
+def test_meshgrid_raise_error():
+    a = numpy.array([1, 2, 3, 4])  # plain NumPy array, not a dpnp array
+    with pytest.raises(TypeError):  # unsupported input type is rejected
+        dpnp.meshgrid(a)
+    b = dpnp.array([1, 2, 3, 4])
+    with pytest.raises(ValueError):  # indexing must be "xy" or "ij"
+        dpnp.meshgrid(b, indexing="ab")

From d504d7dcb147568994ebd8205b7d74083adce86c Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Sat, 3 Feb 2024 20:11:43 +0100
Subject: [PATCH 06/29] Specify permissions in GH Action workflows (#1679)

* Specify permissions in GH Action workflows

* Added contents write permission needed to deploy static files to GitHub Pages

* Added permissions required by actions
---
 .github/workflows/build-sphinx.yml       | 10 ++++++++++
 .github/workflows/conda-package.yml      |  6 ++++++
 .github/workflows/generate_coverage.yaml |  6 ++++++
 .github/workflows/pre-commit.yml         |  2 ++
 4 files changed, 24 insertions(+)

diff --git a/.github/workflows/build-sphinx.yml b/.github/workflows/build-sphinx.yml
index 6246ee13e12..a547efec727 100644
--- a/.github/workflows/build-sphinx.yml
+++ b/.github/workflows/build-sphinx.yml
@@ -6,6 +6,8 @@ on:
   pull_request:
     types: [opened, synchronize, reopened, closed]
 
+permissions: read-all
+
 env:
   GH_BOT_NAME: 'github-actions[bot]'
   GH_BOT_EMAIL: 'github-actions[bot]@users.noreply.github.com'
@@ -25,6 +27,14 @@ jobs:
 
     runs-on: ubuntu-20.04
 
+    permissions:
+      # Needed to cancel any previous runs that are not completed for a given workflow
+      actions: write
+      # Needed to deploy static files to GitHub Pages
+      contents: write
+      # Needed to add a comment to a pull request's issue
+      pull-requests: write
+
     env:
       python-ver: '3.9'
       CHANNELS: '-c dppy/label/dev -c intel -c conda-forge --override-channels'
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index d9072c26a65..ddbd9191287 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -6,6 +6,8 @@ on:
       - master
   pull_request:
 
+permissions: read-all
+
 env:
   PACKAGE_NAME: dpnp
   MODULE_NAME: dpnp
@@ -58,6 +60,10 @@ jobs:
         python: ['3.9', '3.10', '3.11']
         os: [ubuntu-20.04, windows-latest]
 
+    permissions:
+      # Needed to cancel any previous runs that are not completed for a given workflow
+      actions: write
+
     runs-on: ${{ matrix.os }}
 
     defaults:
diff --git a/.github/workflows/generate_coverage.yaml b/.github/workflows/generate_coverage.yaml
index e7479d445ea..b5b0e4a40b9 100644
--- a/.github/workflows/generate_coverage.yaml
+++ b/.github/workflows/generate_coverage.yaml
@@ -4,11 +4,17 @@ on:
   push:
     branches: [master]
 
+permissions: read-all
+
 jobs:
   generate-coverage:
     name: Generate coverage and push to Coveralls.io
     runs-on: ubuntu-20.04
 
+    permissions:
+      # Needed to cancel any previous runs that are not completed for a given workflow
+      actions: write
+
     defaults:
       run:
         shell: bash -l {0}
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index dd5047f22b1..aa17c7696df 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -5,6 +5,8 @@ on:
   push:
     branches: [master]
 
+permissions: read-all
+
 jobs:
   pre-commit:
     runs-on: ubuntu-latest

From 22c2367d9ce82b4d2792db5dd9e5d1cbc717bda9 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Sat, 3 Feb 2024 23:01:21 +0100
Subject: [PATCH 07/29] Add OpenSSF Scorecard badge to README (#1680)

* Add OpenSSF Scorecard badge to README

* Add permissions for clean job
---
 .github/workflows/build-sphinx.yml      |  4 ++
 .github/workflows/openssf-scorecard.yml | 73 +++++++++++++++++++++++++
 README.md                               |  1 +
 3 files changed, 78 insertions(+)
 create mode 100644 .github/workflows/openssf-scorecard.yml

diff --git a/.github/workflows/build-sphinx.yml b/.github/workflows/build-sphinx.yml
index a547efec727..fe82e96b2f0 100644
--- a/.github/workflows/build-sphinx.yml
+++ b/.github/workflows/build-sphinx.yml
@@ -190,6 +190,10 @@ jobs:
 
     needs: build-and-deploy
 
+    permissions:
+      # Needed to remove docs for closed pull request from the repo
+      contents: write
+
     runs-on: ubuntu-20.04
 
     steps:
diff --git a/.github/workflows/openssf-scorecard.yml b/.github/workflows/openssf-scorecard.yml
new file mode 100644
index 00000000000..b43795eaebe
--- /dev/null
+++ b/.github/workflows/openssf-scorecard.yml
@@ -0,0 +1,73 @@
+# This workflow uses actions that are not certified by GitHub. They are provided
+# by a third-party and are governed by separate terms of service, privacy
+# policy, and support documentation.
+
+name: Scorecard supply-chain security
+on:
+  # For Branch-Protection check. Only the default branch is supported. See
+  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
+  branch_protection_rule:
+  # To guarantee Maintained check is occasionally updated. See
+  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
+  schedule:
+    - cron: '28 2 * * 1'
+    - cron: '28 2 * * 4'
+  push:
+    branches: [ "master" ]
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+  analysis:
+    name: Scorecard analysis
+    runs-on: ubuntu-latest
+    permissions:
+      # Needed to upload the results to code-scanning dashboard.
+      security-events: write
+      # Needed to publish results and get a badge (see publish_results below).
+      id-token: write
+      # Uncomment the permissions below if installing in a private repository.
+      # contents: read
+      # actions: read
+
+    steps:
+      - name: "Checkout code"
+        uses: actions/checkout@v4.1.1
+        with:
+          persist-credentials: false
+
+      - name: "Run analysis"
+        uses: ossf/scorecard-action@v2.3.1
+        with:
+          results_file: results.sarif
+          results_format: sarif
+          # (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
+          # - you want to enable the Branch-Protection check on a *public* repository, or
+          # - you are installing Scorecard on a *private* repository
+          # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat.
+          # repo_token: ${{ secrets.SCORECARD_TOKEN }}
+
+          # Public repositories:
+          #   - Publish results to OpenSSF REST API for easy access by consumers
+          #   - Allows the repository to include the Scorecard badge.
+          #   - See https://github.com/ossf/scorecard-action#publishing-results.
+          # For private repositories:
+          #   - `publish_results` will always be set to `false`, regardless
+          #     of the value entered here.
+          publish_results: true
+
+      # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
+      # format to the repository Actions tab.
+      - name: "Upload artifact"
+        uses: actions/upload-artifact@v4.3.0
+        with:
+          name: SARIF file
+          path: results.sarif
+          retention-days: 14
+
+      # Upload the results to GitHub's code scanning dashboard.
+      - name: "Upload to code-scanning"
+        uses: github/codeql-action/upload-sarif@v3.23.2
+        with:
+          sarif_file: results.sarif
diff --git a/README.md b/README.md
index b19e902ece1..086f33c895a 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,7 @@
 [![Conda package](https://github.com/IntelPython/dpnp/actions/workflows/conda-package.yml/badge.svg?branch=master&event=push)](https://github.com/IntelPython/dpnp/actions/workflows/conda-package.yml)
 [![Coverage Status](https://coveralls.io/repos/github/IntelPython/dpnp/badge.svg?branch=master)](https://coveralls.io/github/IntelPython/dpnp?branch=master)
 [![Build Sphinx](https://github.com/IntelPython/dpnp/workflows/Build%20Sphinx/badge.svg)](https://intelpython.github.io/dpnp)
+[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/IntelPython/dpnp/badge)](https://securityscorecards.dev/viewer/?uri=github.com/IntelPython/dpnp)
 
 # DPNP - Data Parallel Extension for NumPy*
 [API coverage summary](https://intelpython.github.io/dpnp/reference/comparison.html#summary)

From 3c676e75f8bf226865e582446425c8e9c30d1f5d Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Sun, 4 Feb 2024 11:58:08 +0100
Subject: [PATCH 08/29] Adding dependabot file to update GH action versions
 (#1681)

---
 .github/dependabot.yml             | 6 ++++++
 .github/workflows/build-sphinx.yml | 2 ++
 2 files changed, 8 insertions(+)
 create mode 100644 .github/dependabot.yml

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 00000000000..5ace4600a1f
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
diff --git a/.github/workflows/build-sphinx.yml b/.github/workflows/build-sphinx.yml
index fe82e96b2f0..b4484a02988 100644
--- a/.github/workflows/build-sphinx.yml
+++ b/.github/workflows/build-sphinx.yml
@@ -193,6 +193,8 @@ jobs:
     permissions:
       # Needed to remove docs for closed pull request from the repo
       contents: write
+      # Needed to modify a comment in the pull request's issue
+      pull-requests: write
 
     runs-on: ubuntu-20.04
 

From da4df675e714de648ea1c16c4637bfb8baef20fc Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Sun, 4 Feb 2024 14:02:27 +0100
Subject: [PATCH 09/29] Add recipe-maintainers list (#1682)

---
 conda-recipe/meta.yaml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index 2ff93e0a0a8..99e50c706c0 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -58,6 +58,7 @@ about:
     home: https://github.com/IntelPython/dpnp
     license: BSD-2-Clause
     license_file: LICENSE.txt
+    summary: 'Data Parallel Extension for NumPy'
     description: |
         <strong>LEGAL NOTICE: Use of this software package is subject to the
         software license agreement (as set forth above, in the license section of
@@ -67,3 +68,11 @@ about:
         <br/><br/>
         EULA: <a href="https://opensource.org/licenses/BSD-2-Clause" target="_blank">BSD-2-Clause</a>
         <br/><br/>
+
+extra:
+    recipe-maintainers:
+        - oleksandr-pavlyk
+        - antonwolfy
+        - npolina4
+        - vtavana
+        - vlad-perevezentsev

From afd84fb4948dc1d1ee9bd2418b8396ae0e19ae45 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 4 Feb 2024 15:16:56 +0100
Subject: [PATCH 10/29] Bump mshick/add-pr-comment from 2.8.1 to 2.8.2 (#1683)

Bumps [mshick/add-pr-comment](https://github.com/mshick/add-pr-comment) from 2.8.1 to 2.8.2.
- [Release notes](https://github.com/mshick/add-pr-comment/releases)
- [Commits](https://github.com/mshick/add-pr-comment/compare/v2.8.1...v2.8.2)

---
updated-dependencies:
- dependency-name: mshick/add-pr-comment
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 .github/workflows/build-sphinx.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-sphinx.yml b/.github/workflows/build-sphinx.yml
index b4484a02988..6f7bc30562b 100644
--- a/.github/workflows/build-sphinx.yml
+++ b/.github/workflows/build-sphinx.yml
@@ -176,7 +176,7 @@ jobs:
         if: env.GH_EVENT_OPEN_PR_UPSTREAM
         env:
           PR_NUM: ${{ github.event.number }}
-        uses: mshick/add-pr-comment@v2.8.1
+        uses: mshick/add-pr-comment@v2.8.2
         with:
           message: |
             View rendered docs @ https://intelpython.github.io/dpnp/pull/${{ env.PR_NUM }}/index.html
@@ -218,7 +218,7 @@ jobs:
           git push tokened_docs gh-pages
 
       - name: Modify the comment with URL to official documentation
-        uses: mshick/add-pr-comment@v2.8.1
+        uses: mshick/add-pr-comment@v2.8.2
         with:
           find: |
             View rendered docs @.+

From 1528fc676d0c2b909d79648094939e24e662fcb9 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 4 Feb 2024 16:25:56 +0100
Subject: [PATCH 11/29] Bump styfle/cancel-workflow-action from 0.12.0 to
 0.12.1 (#1684)

Bumps [styfle/cancel-workflow-action](https://github.com/styfle/cancel-workflow-action) from 0.12.0 to 0.12.1.
- [Release notes](https://github.com/styfle/cancel-workflow-action/releases)
- [Commits](https://github.com/styfle/cancel-workflow-action/compare/0.12.0...0.12.1)

---
updated-dependencies:
- dependency-name: styfle/cancel-workflow-action
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 .github/workflows/build-sphinx.yml       | 2 +-
 .github/workflows/conda-package.yml      | 2 +-
 .github/workflows/generate_coverage.yaml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-sphinx.yml b/.github/workflows/build-sphinx.yml
index 6f7bc30562b..af0c2243368 100644
--- a/.github/workflows/build-sphinx.yml
+++ b/.github/workflows/build-sphinx.yml
@@ -41,7 +41,7 @@ jobs:
 
     steps:
       - name: Cancel Previous Runs
-        uses: styfle/cancel-workflow-action@0.12.0
+        uses: styfle/cancel-workflow-action@0.12.1
         with:
           access_token: ${{ github.token }}
 
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index ddbd9191287..47db3e1850e 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -74,7 +74,7 @@ jobs:
 
     steps:
       - name: Cancel Previous Runs
-        uses: styfle/cancel-workflow-action@0.12.0
+        uses: styfle/cancel-workflow-action@0.12.1
         with:
           access_token: ${{ github.token }}
 
diff --git a/.github/workflows/generate_coverage.yaml b/.github/workflows/generate_coverage.yaml
index b5b0e4a40b9..ceafb5390a1 100644
--- a/.github/workflows/generate_coverage.yaml
+++ b/.github/workflows/generate_coverage.yaml
@@ -25,7 +25,7 @@ jobs:
 
     steps:
       - name: Cancel Previous Runs
-        uses: styfle/cancel-workflow-action@0.12.0
+        uses: styfle/cancel-workflow-action@0.12.1
         with:
           access_token: ${{ github.token }}
 

From fcf3fa0f76f603d05b3913019bf9d8485536a668 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 4 Feb 2024 17:38:37 +0100
Subject: [PATCH 12/29] Bump nick-fields/retry from 2.9.0 to 3.0.0 (#1687)

Bumps [nick-fields/retry](https://github.com/nick-fields/retry) from 2.9.0 to 3.0.0.
- [Release notes](https://github.com/nick-fields/retry/releases)
- [Changelog](https://github.com/nick-fields/retry/blob/master/.releaserc.js)
- [Commits](https://github.com/nick-fields/retry/compare/v2.9.0...v3.0.0)

---
updated-dependencies:
- dependency-name: nick-fields/retry
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 .github/workflows/generate_coverage.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/generate_coverage.yaml b/.github/workflows/generate_coverage.yaml
index ceafb5390a1..009947c9c98 100644
--- a/.github/workflows/generate_coverage.yaml
+++ b/.github/workflows/generate_coverage.yaml
@@ -60,7 +60,7 @@ jobs:
 
       - name: Build dpnp with coverage
         id: build_coverage
-        uses: nick-fields/retry@v2.9.0
+        uses: nick-fields/retry@v3.0.0
         with:
           shell: bash
           timeout_minutes: 60

From 10357cbc70fedd55397610bfe072078d18ea24cd Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 4 Feb 2024 18:35:13 +0100
Subject: [PATCH 13/29] Bump mattnotmitt/doxygen-action from 1.9.5 to 1.9.8
 (#1685)

Bumps [mattnotmitt/doxygen-action](https://github.com/mattnotmitt/doxygen-action) from 1.9.5 to 1.9.8.
- [Release notes](https://github.com/mattnotmitt/doxygen-action/releases)
- [Commits](https://github.com/mattnotmitt/doxygen-action/compare/v1.9.5...v1.9.8)

---
updated-dependencies:
- dependency-name: mattnotmitt/doxygen-action
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 .github/workflows/build-sphinx.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-sphinx.yml b/.github/workflows/build-sphinx.yml
index af0c2243368..a7ed392a3a5 100644
--- a/.github/workflows/build-sphinx.yml
+++ b/.github/workflows/build-sphinx.yml
@@ -135,7 +135,7 @@ jobs:
 
       # https://github.com/marketplace/actions/doxygen-action
       - name: Build backend docs
-        uses: mattnotmitt/doxygen-action@v1.9.5
+        uses: mattnotmitt/doxygen-action@v1.9.8
         with:
             working-directory: 'dpnp/backend/doc'
 

From ac30e215440b17a311c7d710faa5c03ef0f30227 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 4 Feb 2024 20:27:23 +0100
Subject: [PATCH 14/29] Bump github/codeql-action from 3.23.2 to 3.24.0 (#1686)

Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.23.2 to 3.24.0.
- [Release notes](https://github.com/github/codeql-action/releases)
- [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md)
- [Commits](https://github.com/github/codeql-action/compare/v3.23.2...v3.24.0)

---
updated-dependencies:
- dependency-name: github/codeql-action
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/openssf-scorecard.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/openssf-scorecard.yml b/.github/workflows/openssf-scorecard.yml
index b43795eaebe..903d1cb12cb 100644
--- a/.github/workflows/openssf-scorecard.yml
+++ b/.github/workflows/openssf-scorecard.yml
@@ -68,6 +68,6 @@ jobs:
 
       # Upload the results to GitHub's code scanning dashboard.
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@v3.23.2
+        uses: github/codeql-action/upload-sarif@v3.24.0
         with:
           sarif_file: results.sarif

From e7f7a7cbee9ee8036373614551fe9fd2a40e1006 Mon Sep 17 00:00:00 2001
From: StepSecurity Bot <bot@stepsecurity.io>
Date: Sun, 4 Feb 2024 12:59:46 -0800
Subject: [PATCH 15/29] [StepSecurity] ci: Harden GitHub Actions (#1688)

Signed-off-by: StepSecurity Bot <bot@stepsecurity.io>
---
 .github/workflows/build-sphinx.yml       | 20 ++++++++--------
 .github/workflows/conda-package.yml      | 30 ++++++++++++------------
 .github/workflows/generate_coverage.yaml |  8 +++----
 .github/workflows/openssf-scorecard.yml  |  8 +++----
 .github/workflows/pre-commit.yml         |  6 ++---
 5 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/.github/workflows/build-sphinx.yml b/.github/workflows/build-sphinx.yml
index a7ed392a3a5..9de0097e120 100644
--- a/.github/workflows/build-sphinx.yml
+++ b/.github/workflows/build-sphinx.yml
@@ -41,7 +41,7 @@ jobs:
 
     steps:
       - name: Cancel Previous Runs
-        uses: styfle/cancel-workflow-action@0.12.1
+        uses: styfle/cancel-workflow-action@85880fa0301c86cca9da44039ee3bb12d3bedbfa # 0.12.1
         with:
           access_token: ${{ github.token }}
 
@@ -52,7 +52,7 @@ jobs:
           echo "$GITHUB_CONTEXT"
 
       - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@v1.3.1
+        uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
         with:
           docker-images: false
 
@@ -86,13 +86,13 @@ jobs:
           sudo apt-get install -y nvidia-cuda-toolkit clinfo
 
       - name: Checkout repo
-        uses: actions/checkout@v4.1.1
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
         with:
           fetch-depth: 0
 
       # https://github.com/marketplace/actions/setup-miniconda
       - name: Setup miniconda
-        uses: conda-incubator/setup-miniconda@v3.0.1
+        uses: conda-incubator/setup-miniconda@11b562958363ec5770fef326fe8ef0366f8cbf8a # v3.0.1
         with:
           auto-update-conda: true
           python-version: ${{ env.python-ver }}
@@ -135,7 +135,7 @@ jobs:
 
       # https://github.com/marketplace/actions/doxygen-action
       - name: Build backend docs
-        uses: mattnotmitt/doxygen-action@v1.9.8
+        uses: mattnotmitt/doxygen-action@cbe72c8e402e8a3faa1f0b247ef90aa6c8e4ce74 # v1.9.8
         with:
             working-directory: 'dpnp/backend/doc'
 
@@ -146,7 +146,7 @@ jobs:
       # The step is only used to build docs while pushing a PR to "master"
       - name: Deploy docs
         if: env.GH_EVENT_PUSH_UPSTREAM
-        uses: peaceiris/actions-gh-pages@v3.9.3
+        uses: peaceiris/actions-gh-pages@373f7f263a76c20808c831209c920827a82a2847 # v3.9.3
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           publish_dir: ${{ env.PUBLISH_DIR }}
@@ -159,7 +159,7 @@ jobs:
       # The step is only used to build docs while pushing to PR branch
       - name: Publish pull-request docs
         if: env.GH_EVENT_OPEN_PR_UPSTREAM
-        uses: peaceiris/actions-gh-pages@v3.9.3
+        uses: peaceiris/actions-gh-pages@373f7f263a76c20808c831209c920827a82a2847 # v3.9.3
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           publish_dir: ${{ env.PUBLISH_DIR }}
@@ -176,7 +176,7 @@ jobs:
         if: env.GH_EVENT_OPEN_PR_UPSTREAM
         env:
           PR_NUM: ${{ github.event.number }}
-        uses: mshick/add-pr-comment@v2.8.2
+        uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2.8.2
         with:
           message: |
             View rendered docs @ https://intelpython.github.io/dpnp/pull/${{ env.PR_NUM }}/index.html
@@ -199,7 +199,7 @@ jobs:
     runs-on: ubuntu-20.04
 
     steps:
-      - uses: actions/checkout@v4.1.1
+      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
         with:
           fetch-depth: 0
 
@@ -218,7 +218,7 @@ jobs:
           git push tokened_docs gh-pages
 
       - name: Modify the comment with URL to official documentation
-        uses: mshick/add-pr-comment@v2.8.2
+        uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2.8.2
         with:
           find: |
             View rendered docs @.+
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index 47db3e1850e..5ec377bd740 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -74,17 +74,17 @@ jobs:
 
     steps:
       - name: Cancel Previous Runs
-        uses: styfle/cancel-workflow-action@0.12.1
+        uses: styfle/cancel-workflow-action@85880fa0301c86cca9da44039ee3bb12d3bedbfa # 0.12.1
         with:
           access_token: ${{ github.token }}
 
       - name: Checkout DPNP repo
-        uses: actions/checkout@v4.1.1
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
         with:
           fetch-depth: 0
 
       - name: Setup miniconda
-        uses: conda-incubator/setup-miniconda@v3.0.1
+        uses: conda-incubator/setup-miniconda@11b562958363ec5770fef326fe8ef0366f8cbf8a # v3.0.1
         with:
           auto-update-conda: true
           python-version: ${{ matrix.python }}
@@ -105,7 +105,7 @@ jobs:
         run: conda install conda-build=3.28.4
 
       - name: Cache conda packages
-        uses: actions/cache@v4
+        uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0
         env:
           CACHE_NUMBER: 1  # Increase to reset cache
         with:
@@ -120,7 +120,7 @@ jobs:
         run: conda build --no-test --python ${{ matrix.python }} ${{ env.CHANNELS }} conda-recipe
 
       - name: Upload artifact
-        uses: actions/upload-artifact@v4.3.0
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
         with:
           name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
           path: ${{ env.CONDA_BLD }}${{ env.PACKAGE_NAME }}-*.tar.bz2
@@ -153,7 +153,7 @@ jobs:
 
     steps:
       - name: Download artifact
-        uses: actions/download-artifact@v4.1.1
+        uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
         with:
           name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
           path: ${{ env.pkg-path-in-channel }}
@@ -164,7 +164,7 @@ jobs:
           tar -xvf ${{ env.pkg-path-in-channel }}/${{ env.PACKAGE_NAME }}-*.tar.bz2 -C ${{ env.extracted-pkg-path }}
 
       - name: Setup miniconda
-        uses: conda-incubator/setup-miniconda@v3.0.1
+        uses: conda-incubator/setup-miniconda@11b562958363ec5770fef326fe8ef0366f8cbf8a # v3.0.1
         with:
           auto-update-conda: true
           python-version: ${{ matrix.python }}
@@ -196,7 +196,7 @@ jobs:
           TEST_CHANNELS: '-c ${{ env.channel-path }} ${{ env.CHANNELS }}'
 
       - name: Cache conda packages
-        uses: actions/cache@v4
+        uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0
         env:
           CACHE_NUMBER: 1 # Increase to reset cache
         with:
@@ -254,7 +254,7 @@ jobs:
 
     steps:
       - name: Download artifact
-        uses: actions/download-artifact@v4.1.1
+        uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
         with:
           name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
           path: ${{ env.pkg-path-in-channel }}
@@ -274,7 +274,7 @@ jobs:
           dir ${{ env.extracted-pkg-path }}
 
       - name: Setup miniconda
-        uses: conda-incubator/setup-miniconda@v3.0.1
+        uses: conda-incubator/setup-miniconda@11b562958363ec5770fef326fe8ef0366f8cbf8a # v3.0.1
         with:
           auto-update-conda: true
           python-version: ${{ matrix.python }}
@@ -320,7 +320,7 @@ jobs:
         run: more lockfile
 
       - name: Cache conda packages
-        uses: actions/cache@v4
+        uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0
         env:
           CACHE_NUMBER: 1  # Increase to reset cache
         with:
@@ -388,12 +388,12 @@ jobs:
 
     steps:
       - name: Download artifact
-        uses: actions/download-artifact@v4.1.1
+        uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
         with:
           name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
 
       - name: Setup miniconda
-        uses: conda-incubator/setup-miniconda@v3.0.1
+        uses: conda-incubator/setup-miniconda@11b562958363ec5770fef326fe8ef0366f8cbf8a # v3.0.1
         with:
           auto-update-conda: true
           python-version: ${{ matrix.python }}
@@ -416,7 +416,7 @@ jobs:
       run:
         shell: bash -el {0}
     steps:
-      - uses: conda-incubator/setup-miniconda@v3.0.1
+      - uses: conda-incubator/setup-miniconda@11b562958363ec5770fef326fe8ef0366f8cbf8a # v3.0.1
         with:
           run-post: false
           channel-priority: "disabled"
@@ -427,7 +427,7 @@ jobs:
         run: conda install anaconda-client
 
       - name: Checkout repo
-        uses: actions/checkout@v4.1.1
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
         with:
           repository: IntelPython/devops-tools
           fetch-depth: 0
diff --git a/.github/workflows/generate_coverage.yaml b/.github/workflows/generate_coverage.yaml
index 009947c9c98..432377ce10c 100644
--- a/.github/workflows/generate_coverage.yaml
+++ b/.github/workflows/generate_coverage.yaml
@@ -25,17 +25,17 @@ jobs:
 
     steps:
       - name: Cancel Previous Runs
-        uses: styfle/cancel-workflow-action@0.12.1
+        uses: styfle/cancel-workflow-action@85880fa0301c86cca9da44039ee3bb12d3bedbfa # 0.12.1
         with:
           access_token: ${{ github.token }}
 
       - name: Checkout repo
-        uses: actions/checkout@v4.1.1
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
         with:
           fetch-depth: 0
 
       - name: Setup miniconda
-        uses: conda-incubator/setup-miniconda@v3.0.1
+        uses: conda-incubator/setup-miniconda@11b562958363ec5770fef326fe8ef0366f8cbf8a # v3.0.1
         with:
           auto-update-conda: true
           python-version: ${{ env.python-ver }}
@@ -60,7 +60,7 @@ jobs:
 
       - name: Build dpnp with coverage
         id: build_coverage
-        uses: nick-fields/retry@v3.0.0
+        uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0
         with:
           shell: bash
           timeout_minutes: 60
diff --git a/.github/workflows/openssf-scorecard.yml b/.github/workflows/openssf-scorecard.yml
index 903d1cb12cb..ee124d3a9a1 100644
--- a/.github/workflows/openssf-scorecard.yml
+++ b/.github/workflows/openssf-scorecard.yml
@@ -33,12 +33,12 @@ jobs:
 
     steps:
       - name: "Checkout code"
-        uses: actions/checkout@v4.1.1
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
         with:
           persist-credentials: false
 
       - name: "Run analysis"
-        uses: ossf/scorecard-action@v2.3.1
+        uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1
         with:
           results_file: results.sarif
           results_format: sarif
@@ -60,7 +60,7 @@ jobs:
       # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
       # format to the repository Actions tab.
       - name: "Upload artifact"
-        uses: actions/upload-artifact@v4.3.0
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
         with:
           name: SARIF file
           path: results.sarif
@@ -68,6 +68,6 @@ jobs:
 
       # Upload the results to GitHub's code scanning dashboard.
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@v3.24.0
+        uses: github/codeql-action/upload-sarif@e8893c57a1f3a2b659b6b55564fdfdbbd2982911 # v3.24.0
         with:
           sarif_file: results.sarif
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index aa17c7696df..7aea59b3977 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -18,8 +18,8 @@ jobs:
           sudo ln -s /usr/bin/clang-format-12 /usr/bin/clang-format
           clang-format --version
 
-      - uses: actions/checkout@v4.1.1
-      - uses: actions/setup-python@v5
+      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+      - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
         with:
           python-version: '3.11'
-      - uses: pre-commit/action@v3.0.0
+      - uses: pre-commit/action@646c83fcd040023954eafda54b4db0192ce70507 # v3.0.0

From a002bdeff3b1038ce7af51ba805bc41cd06b35e1 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Mon, 5 Feb 2024 19:12:54 +0100
Subject: [PATCH 16/29] Add `gitleaks` and `shellcheck` to pre-commit
 configuration (#1689)

* Add gitleaks and shellcheck to pre-commit configuration

* Pin gitleaks to the latest version

* Use an array to build scikit-build arguments

* Use an array to build wheel arguments
---
 .pre-commit-config.yaml                       |   8 +
 benchmarks/pytest_benchmark/README.md         |  76 +-
 benchmarks/pytest_benchmark/test_random.py    | 234 +++---
 conda-recipe/build.sh                         |  22 +-
 conda-recipe/run_test.sh                      |   9 +-
 doc/0.builddoc.sh                             |   6 +-
 doc/make.bat                                  |  72 +-
 dpnp/backend/examples/example11.cpp           | 170 ++--
 dpnp/dpnp_algo/dpnp_arraycreation.py          | 784 +++++++++---------
 scripts/build_deps_dpctl.sh                   |   8 +-
 scripts/install_cmake_lin.sh                  |   7 +-
 scripts/install_python_deps.sh                |   4 +-
 scripts/install_system_deps.sh                |   2 -
 scripts/install_system_deps_intelpython.sh    |   2 -
 scripts/set_ci_env.sh                         |  18 +-
 tests/test_histograms.py                      | 178 ++--
 .../cupy/manipulation_tests/test_kind.py      | 286 +++----
 .../third_party/intel/test_zero_copy_test1.py |  72 +-
 tests/third_party/intel/zero-copy-test1.py    | 168 ++--
 19 files changed, 1066 insertions(+), 1060 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b3787c3833c..3289990f4a4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -71,3 +71,11 @@ repos:
     hooks:
     -   id: clang-format
         args: ["-i"]
+-   repo: https://github.com/gitleaks/gitleaks
+    rev: v8.18.2
+    hooks:
+    -   id: gitleaks
+-   repo: https://github.com/jumanjihouse/pre-commit-hooks
+    rev: 3.0.0
+    hooks:
+    -   id: shellcheck
diff --git a/benchmarks/pytest_benchmark/README.md b/benchmarks/pytest_benchmark/README.md
index d3c7478509a..77015a089ef 100644
--- a/benchmarks/pytest_benchmark/README.md
+++ b/benchmarks/pytest_benchmark/README.md
@@ -1,38 +1,38 @@
-# dpnp/benchmarks/pytest_benchmark/
-
-## Prerequisites
-* pytest >= 6.1.1
-* pytest-benchmark >= 3.4.1
-
-
-## Running benchmark tests
-```bash
-pytest benchmarks/ --benchmark-json=results.json
-```
-Running tests and saving the current run into `STORAGE`, see [1]
-```bash
-pytest benchmarks/ --benchmark-json=results.json --benchmark-autosave
-```
-
-## Creating `.csv` report
-```bash
-pytest-benchmark compare results.json --csv=results.csv --group-by='name'
-```
-
-## Optional: creating histogram
-Note: make sure that `pytest-benchmark[histogram]` installed
-```bash
-# example
-pip install pytest-benchmark[histogram]
-pytest -vv benchmarks/ --benchmark-autosave --benchmark-histogram
-pytest-benchmark compare .benchmarks/Linux-CPython-3.7-64bit/* --histogram
-```
-
-## Advanced running example
-```
-pytest benchmarks/ --benchmark-columns='min, max, mean, stddev, median, rounds, iterations' --benchmark-json=results.json --benchmark-autosave
-pytest-benchmark compare results.json --csv=results.csv --group-by='name'
-```
-
-
-[1] https://pytest-benchmark.readthedocs.io/en/latest/usage.html
+# dpnp/benchmarks/pytest_benchmark/
+
+## Prerequisites
+* pytest >= 6.1.1
+* pytest-benchmark >= 3.4.1
+
+
+## Running benchmark tests
+```bash
+pytest benchmarks/ --benchmark-json=results.json
+```
+Running tests and saving the current run into `STORAGE`, see [1]
+```bash
+pytest benchmarks/ --benchmark-json=results.json --benchmark-autosave
+```
+
+## Creating `.csv` report
+```bash
+pytest-benchmark compare results.json --csv=results.csv --group-by='name'
+```
+
+## Optional: creating histogram
+Note: make sure that `pytest-benchmark[histogram]` is installed
+```bash
+# example
+pip install pytest-benchmark[histogram]
+pytest -vv benchmarks/ --benchmark-autosave --benchmark-histogram
+pytest-benchmark compare .benchmarks/Linux-CPython-3.7-64bit/* --histogram
+```
+
+## Advanced running example
+```
+pytest benchmarks/ --benchmark-columns='min, max, mean, stddev, median, rounds, iterations' --benchmark-json=results.json --benchmark-autosave
+pytest-benchmark compare results.json --csv=results.csv --group-by='name'
+```
+
+
+[1] https://pytest-benchmark.readthedocs.io/en/latest/usage.html
diff --git a/benchmarks/pytest_benchmark/test_random.py b/benchmarks/pytest_benchmark/test_random.py
index 7c083d20009..ce0f374fb1e 100644
--- a/benchmarks/pytest_benchmark/test_random.py
+++ b/benchmarks/pytest_benchmark/test_random.py
@@ -1,117 +1,117 @@
-# cython: language_level=3
-# -*- coding: utf-8 -*-
-# *****************************************************************************
-# Copyright (c) 2016-2024, Intel Corporation
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# - Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the following disclaimer.
-# - Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
-# THE POSSIBILITY OF SUCH DAMAGE.
-# *****************************************************************************
-
-import numpy as np
-import pytest
-
-import dpnp
-
-ROUNDS = 30
-ITERATIONS = 4
-
-NNUMBERS = 2**26
-
-
-@pytest.mark.parametrize(
-    "function", [dpnp.random.beta, np.random.beta], ids=["dpnp", "numpy"]
-)
-def test_beta(benchmark, function):
-    result = benchmark.pedantic(
-        target=function,
-        args=(
-            4.0,
-            5.0,
-            NNUMBERS,
-        ),
-        rounds=ROUNDS,
-        iterations=ITERATIONS,
-    )
-
-
-@pytest.mark.parametrize(
-    "function",
-    [dpnp.random.exponential, np.random.exponential],
-    ids=["dpnp", "numpy"],
-)
-def test_exponential(benchmark, function):
-    result = benchmark.pedantic(
-        target=function,
-        args=(
-            4.0,
-            NNUMBERS,
-        ),
-        rounds=ROUNDS,
-        iterations=ITERATIONS,
-    )
-
-
-@pytest.mark.parametrize(
-    "function", [dpnp.random.gamma, np.random.gamma], ids=["dpnp", "numpy"]
-)
-def test_gamma(benchmark, function):
-    result = benchmark.pedantic(
-        target=function,
-        args=(
-            2.0,
-            4.0,
-            NNUMBERS,
-        ),
-        rounds=ROUNDS,
-        iterations=ITERATIONS,
-    )
-
-
-@pytest.mark.parametrize(
-    "function", [dpnp.random.normal, np.random.normal], ids=["dpnp", "numpy"]
-)
-def test_normal(benchmark, function):
-    result = benchmark.pedantic(
-        target=function,
-        args=(
-            0.0,
-            1.0,
-            NNUMBERS,
-        ),
-        rounds=ROUNDS,
-        iterations=ITERATIONS,
-    )
-
-
-@pytest.mark.parametrize(
-    "function", [dpnp.random.uniform, np.random.uniform], ids=["dpnp", "numpy"]
-)
-def test_uniform(benchmark, function):
-    result = benchmark.pedantic(
-        target=function,
-        args=(
-            0.0,
-            1.0,
-            NNUMBERS,
-        ),
-        rounds=ROUNDS,
-        iterations=ITERATIONS,
-    )
+# cython: language_level=3
+# -*- coding: utf-8 -*-
+# *****************************************************************************
+# Copyright (c) 2016-2024, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+
+import dpnp
+
+ROUNDS = 30
+ITERATIONS = 4
+
+NNUMBERS = 2**26
+
+
+@pytest.mark.parametrize(
+    "function", [dpnp.random.beta, np.random.beta], ids=["dpnp", "numpy"]
+)
+def test_beta(benchmark, function):
+    """Benchmark ``function(4.0, 5.0, NNUMBERS)`` for dpnp/numpy beta."""
+    # pedantic() runs ROUNDS rounds of ITERATIONS calls each. Only the timings
+    # recorded by the fixture matter, so the value returned by pedantic() is
+    # deliberately discarded.
+    benchmark.pedantic(
+        target=function,
+        args=(4.0, 5.0, NNUMBERS),
+        rounds=ROUNDS,
+        iterations=ITERATIONS,
+    )
+
+
+@pytest.mark.parametrize(
+    "function",
+    [dpnp.random.exponential, np.random.exponential],
+    ids=["dpnp", "numpy"],
+)
+def test_exponential(benchmark, function):
+    """Benchmark ``function(4.0, NNUMBERS)`` for dpnp/numpy exponential."""
+    # Only the timings recorded by the fixture matter, so the value returned
+    # by pedantic() is deliberately discarded.
+    benchmark.pedantic(
+        target=function,
+        args=(4.0, NNUMBERS),
+        rounds=ROUNDS,
+        iterations=ITERATIONS,
+    )
+
+
+@pytest.mark.parametrize(
+    "function", [dpnp.random.gamma, np.random.gamma], ids=["dpnp", "numpy"]
+)
+def test_gamma(benchmark, function):
+    """Benchmark ``function(2.0, 4.0, NNUMBERS)`` for dpnp/numpy gamma."""
+    # pedantic() runs ROUNDS rounds of ITERATIONS calls each. Only the timings
+    # recorded by the fixture matter, so the value returned by pedantic() is
+    # deliberately discarded.
+    benchmark.pedantic(
+        target=function,
+        args=(2.0, 4.0, NNUMBERS),
+        rounds=ROUNDS,
+        iterations=ITERATIONS,
+    )
+
+
+@pytest.mark.parametrize(
+    "function", [dpnp.random.normal, np.random.normal], ids=["dpnp", "numpy"]
+)
+def test_normal(benchmark, function):
+    """Benchmark ``function(0.0, 1.0, NNUMBERS)`` for dpnp/numpy normal."""
+    # pedantic() runs ROUNDS rounds of ITERATIONS calls each. Only the timings
+    # recorded by the fixture matter, so the value returned by pedantic() is
+    # deliberately discarded.
+    benchmark.pedantic(
+        target=function,
+        args=(0.0, 1.0, NNUMBERS),
+        rounds=ROUNDS,
+        iterations=ITERATIONS,
+    )
+
+
+@pytest.mark.parametrize(
+    "function", [dpnp.random.uniform, np.random.uniform], ids=["dpnp", "numpy"]
+)
+def test_uniform(benchmark, function):
+    """Benchmark ``function(0.0, 1.0, NNUMBERS)`` for dpnp/numpy uniform."""
+    # pedantic() runs ROUNDS rounds of ITERATIONS calls each. Only the timings
+    # recorded by the fixture matter, so the value returned by pedantic() is
+    # deliberately discarded.
+    benchmark.pedantic(
+        target=function,
+        args=(0.0, 1.0, NNUMBERS),
+        rounds=ROUNDS,
+        iterations=ITERATIONS,
+    )
diff --git a/conda-recipe/build.sh b/conda-recipe/build.sh
index b4ea4c44cb2..b0a266be3ea 100755
--- a/conda-recipe/build.sh
+++ b/conda-recipe/build.sh
@@ -5,25 +5,29 @@ export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${BUILD_PREFIX}/lib"
 
 # Intel LLVM must cooperate with compiler and sysroot from conda
 echo "--gcc-toolchain=${BUILD_PREFIX} --sysroot=${BUILD_PREFIX}/${HOST}/sysroot -target ${HOST}" > icpx_for_conda.cfg
-export ICPXCFG="$(pwd)/icpx_for_conda.cfg"
-export ICXCFG="$(pwd)/icpx_for_conda.cfg"
+
+ICPXCFG="$(pwd)/icpx_for_conda.cfg"
+export ICPXCFG
+
+ICXCFG="$(pwd)/icpx_for_conda.cfg"
+export ICXCFG
 
 export CMAKE_GENERATOR="Ninja"
 export TBB_ROOT_HINT=$PREFIX
 export DPL_ROOT_HINT=$PREFIX
 export MKL_ROOT_HINT=$PREFIX
-SKBUILD_ARGS="-- -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_CXX_COMPILER:PATH=icpx -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON"
-SKBUILD_ARGS="${SKBUILD_ARGS} -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON"
+SKBUILD_ARGS=(-- "-DCMAKE_C_COMPILER:PATH=icx" "-DCMAKE_CXX_COMPILER:PATH=icpx" "-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON")
+SKBUILD_ARGS=("${SKBUILD_ARGS[@]}" "-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON")
 
 # Build wheel package
 if [ "$CONDA_PY" == "36" ]; then
-    WHEELS_BUILD_ARGS="-p manylinux1_x86_64"
+    WHEELS_BUILD_ARGS=("-p" "manylinux1_x86_64")
 else
-    WHEELS_BUILD_ARGS="-p manylinux2014_x86_64"
+    WHEELS_BUILD_ARGS=("-p" "manylinux2014_x86_64")
 fi
 if [ -n "${WHEELS_OUTPUT_FOLDER}" ]; then
-    $PYTHON setup.py install bdist_wheel ${WHEELS_BUILD_ARGS} ${SKBUILD_ARGS}
-    cp dist/dpnp*.whl ${WHEELS_OUTPUT_FOLDER}
+    $PYTHON setup.py install bdist_wheel "${WHEELS_BUILD_ARGS[@]}" "${SKBUILD_ARGS[@]}"
+    cp dist/dpnp*.whl "${WHEELS_OUTPUT_FOLDER}"
 else
-    $PYTHON setup.py install ${SKBUILD_ARGS}
+    $PYTHON setup.py install "${SKBUILD_ARGS[@]}"
 fi
diff --git a/conda-recipe/run_test.sh b/conda-recipe/run_test.sh
index 7780c9b98d1..c67e538e90f 100755
--- a/conda-recipe/run_test.sh
+++ b/conda-recipe/run_test.sh
@@ -10,16 +10,19 @@ fi
 
 # if DPCPPROOT is specified (work with custom DPCPP)
 if [ -n "${DPCPPROOT}" ]; then
-    . ${DPCPPROOT}/env/vars.sh
+    # shellcheck source=/dev/null
+    . "${DPCPPROOT}"/env/vars.sh
 fi
 
 # if MKLROOT is specified (work with custom math library)
 if [ -n "${MKLROOT}" ]; then
-    . ${MKLROOT}/env/vars.sh
+    # shellcheck source=/dev/null
+    . "${MKLROOT}"/env/vars.sh
 fi
 
 # have to activate while SYCL CPU device/driver needs paths
 # if TBBROOT is specified
 if [ -n "${TBBROOT}" ]; then
-    . ${TBBROOT}/env/vars.sh
+    # shellcheck source=/dev/null
+    . "${TBBROOT}"/env/vars.sh
 fi
diff --git a/doc/0.builddoc.sh b/doc/0.builddoc.sh
index 5dd034ac667..f10b4a5cc22 100755
--- a/doc/0.builddoc.sh
+++ b/doc/0.builddoc.sh
@@ -1,11 +1,11 @@
 #!/bin/bash
 
-BUILDDOCDIR=$(dirname $(readlink -e ${BASH_SOURCE[0]}))
+BUILDDOCDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
 ROOTDIR=$BUILDDOCDIR/..
 
-cd $ROOTDIR
+cd "$ROOTDIR" || exit 1
 python setup.py develop
 
-cd $BUILDDOCDIR
+cd "$BUILDDOCDIR" || exit 2
 make clean
 make html
diff --git a/doc/make.bat b/doc/make.bat
index 0bd6076d3b2..3382907d015 100644
--- a/doc/make.bat
+++ b/doc/make.bat
@@ -1,36 +1,36 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=.
-set BUILDDIR=_build
-set SPHINXPROJ=dpnp
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
-	echo.installed, then set the SPHINXBUILD environment variable to point
-	echo.to the full path of the 'sphinx-build' executable. Alternatively you
-	echo.may add the Sphinx directory to PATH.
-	echo.
-	echo.If you don't have Sphinx installed, grab it from
-	echo.http://sphinx-doc.org/
-	exit /b 1
-)
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
-
-:end
-popd
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+set SPHINXPROJ=dpnp
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+
+:end
+popd
diff --git a/dpnp/backend/examples/example11.cpp b/dpnp/backend/examples/example11.cpp
index 52fce9beb3b..3a16991bae6 100644
--- a/dpnp/backend/examples/example11.cpp
+++ b/dpnp/backend/examples/example11.cpp
@@ -1,85 +1,85 @@
-//*****************************************************************************
-// Copyright (c) 2016-2024, Intel Corporation
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
-// THE POSSIBILITY OF SUCH DAMAGE.
-//*****************************************************************************
-
-/**
- * Example 11.
- *
- * This example shows simple usage of the DPNP C++ Backend library RNG shuffle
- * function for one and ndim arrays.
- *
- * Possible compile line:
- * g++ -g dpnp/backend/examples/example11.cpp -Idpnp -Idpnp/backend/include
- * -Ldpnp -Wl,-rpath='$ORIGIN'/dpnp -ldpnp_backend_c -o example11
- *
- */
-
-#include <iostream>
-
-#include <dpnp_iface.hpp>
-
-template <typename T>
-void print_dpnp_array(T *arr, size_t size)
-{
-    std::cout << std::endl;
-    for (size_t i = 0; i < size; ++i) {
-        std::cout << arr[i] << ", ";
-    }
-    std::cout << std::endl;
-}
-
-int main(int, char **)
-{
-    // Two cases:
-    // 1) array size = 100, ndim = 1, high_dim_size = 10 (aka ndarray with shape
-    // (100,) ) 2) array size = 100, ndim = 2, high_dim_size = 20 (e.g. ndarray
-    // with shape (20, 5) and len(array) = 20 )
-    const size_t ndim_cases = 2;
-    const size_t itemsize = sizeof(double);
-    const size_t ndim[ndim_cases] = {1, 2};
-    const size_t high_dim_size[ndim_cases] = {100, 20};
-    const size_t size = 100;
-    const size_t seed = 1234;
-
-    // DPNPC dpnp_rng_shuffle_c
-    // DPNPC interface
-    double *array_1 =
-        reinterpret_cast<double *>(dpnp_memory_alloc_c(size * sizeof(double)));
-    for (size_t i = 0; i < ndim_cases; i++) {
-        std::cout << "\nREPRODUCE: DPNPC dpnp_rng_shuffle_c:";
-        std::cout << "\nDIMS: " << ndim[i] << std::endl;
-        // init array 0, 1, 2, 3, 4, 5, 6, ....
-        dpnp_arange_c<double>(0, 1, array_1, size);
-        // print before shuffle
-        std::cout << "\nINPUT array:";
-        print_dpnp_array(array_1, size);
-        dpnp_rng_srand_c(seed);
-        dpnp_rng_shuffle_c<double>(array_1, itemsize, ndim[i], high_dim_size[i],
-                                   size);
-        // print shuffle result
-        std::cout << "\nSHUFFLE INPUT array:";
-        print_dpnp_array(array_1, size);
-    }
-    dpnp_memory_free_c(array_1);
-}
+//*****************************************************************************
+// Copyright (c) 2016-2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+/**
+ * Example 11.
+ *
+ * This example shows simple usage of the DPNP C++ Backend library RNG shuffle
+ * function for one and ndim arrays.
+ *
+ * Possible compile line:
+ * g++ -g dpnp/backend/examples/example11.cpp -Idpnp -Idpnp/backend/include
+ * -Ldpnp -Wl,-rpath='$ORIGIN'/dpnp -ldpnp_backend_c -o example11
+ *
+ */
+
+#include <iostream>
+
+#include <dpnp_iface.hpp>
+
+template <typename T>
+void print_dpnp_array(T *arr, size_t size)
+{
+    std::cout << std::endl;
+    for (size_t i = 0; i < size; ++i) {
+        std::cout << arr[i] << ", ";
+    }
+    std::cout << std::endl;
+}
+
+int main(int, char **)
+{
+    // Two shuffle cases over the same 100-element buffer:
+    // 1) ndim = 1, high_dim_size = 100 (i.e. an ndarray of shape (100,))
+    // 2) ndim = 2, high_dim_size = 20 (e.g. an ndarray of shape (20, 5),
+    //    so len(array) = 20)
+    const size_t ndim_cases = 2;
+    const size_t itemsize = sizeof(double);
+    const size_t ndim[ndim_cases] = {1, 2};
+    const size_t high_dim_size[ndim_cases] = {100, 20};
+    const size_t size = 100;
+    const size_t seed = 1234;
+
+    // One buffer is allocated through the DPNPC memory interface and reused
+    // by every dpnp_rng_shuffle_c case below; it is released after the loop.
+    double *array_1 =
+        reinterpret_cast<double *>(dpnp_memory_alloc_c(size * sizeof(double)));
+    for (size_t i = 0; i < ndim_cases; i++) {
+        std::cout << "\nREPRODUCE: DPNPC dpnp_rng_shuffle_c:";
+        std::cout << "\nDIMS: " << ndim[i] << std::endl;
+        // reset the array to 0, 1, 2, 3, ... before each case
+        dpnp_arange_c<double>(0, 1, array_1, size);
+        // print the array before shuffling
+        std::cout << "\nINPUT array:";
+        print_dpnp_array(array_1, size);
+        dpnp_rng_srand_c(seed); // same seed for every case
+        dpnp_rng_shuffle_c<double>(array_1, itemsize, ndim[i], high_dim_size[i],
+                                   size);
+        // print the shuffled array
+        std::cout << "\nSHUFFLE INPUT array:";
+        print_dpnp_array(array_1, size);
+    }
+    dpnp_memory_free_c(array_1);
+}
diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py
index 0399deea254..d140f412e6a 100644
--- a/dpnp/dpnp_algo/dpnp_arraycreation.py
+++ b/dpnp/dpnp_algo/dpnp_arraycreation.py
@@ -1,392 +1,392 @@
-import math
-import operator
-
-import dpctl.utils as dpu
-import numpy
-
-import dpnp
-import dpnp.dpnp_container as dpnp_container
-import dpnp.dpnp_utils as utils
-
-__all__ = [
-    "dpnp_geomspace",
-    "dpnp_linspace",
-    "dpnp_logspace",
-    "dpnp_nd_grid",
-]
-
-
-def dpnp_geomspace(
-    start,
-    stop,
-    num,
-    dtype=None,
-    device=None,
-    usm_type=None,
-    sycl_queue=None,
-    endpoint=True,
-    axis=0,
-):
-    usm_type_alloc, sycl_queue_alloc = utils.get_usm_allocations([start, stop])
-
-    if sycl_queue is None and device is None:
-        sycl_queue = sycl_queue_alloc
-    sycl_queue_normalized = dpnp.get_normalized_queue_device(
-        sycl_queue=sycl_queue, device=device
-    )
-
-    if usm_type is None:
-        _usm_type = "device" if usm_type_alloc is None else usm_type_alloc
-    else:
-        _usm_type = usm_type
-
-    if not dpnp.is_supported_array_type(start):
-        start = dpnp.asarray(
-            start, usm_type=_usm_type, sycl_queue=sycl_queue_normalized
-        )
-    if not dpnp.is_supported_array_type(stop):
-        stop = dpnp.asarray(
-            stop, usm_type=_usm_type, sycl_queue=sycl_queue_normalized
-        )
-
-    dt = numpy.result_type(start, stop, float(num))
-    dt = utils.map_dtype_to_device(dt, sycl_queue_normalized.sycl_device)
-    if dtype is None:
-        dtype = dt
-
-    if dpnp.any(start == 0) or dpnp.any(stop == 0):
-        raise ValueError("Geometric sequence cannot include zero")
-
-    out_sign = dpnp.ones(
-        dpnp.broadcast_arrays(start, stop)[0].shape,
-        dtype=dt,
-        usm_type=_usm_type,
-        sycl_queue=sycl_queue_normalized,
-    )
-    # Avoid negligible real or imaginary parts in output by rotating to
-    # positive real, calculating, then undoing rotation
-    if dpnp.issubdtype(dt, dpnp.complexfloating):
-        all_imag = (start.real == 0.0) & (stop.real == 0.0)
-        if dpnp.any(all_imag):
-            start[all_imag] = start[all_imag].imag
-            stop[all_imag] = stop[all_imag].imag
-            out_sign[all_imag] = 1j
-
-    both_negative = (dpnp.sign(start) == -1) & (dpnp.sign(stop) == -1)
-    if dpnp.any(both_negative):
-        dpnp.negative(start[both_negative], out=start[both_negative])
-        dpnp.negative(stop[both_negative], out=stop[both_negative])
-        dpnp.negative(out_sign[both_negative], out=out_sign[both_negative])
-
-    log_start = dpnp.log10(start)
-    log_stop = dpnp.log10(stop)
-    result = dpnp_logspace(
-        log_start,
-        log_stop,
-        num=num,
-        endpoint=endpoint,
-        base=10.0,
-        dtype=dtype,
-        usm_type=_usm_type,
-        sycl_queue=sycl_queue_normalized,
-    )
-
-    if num > 0:
-        result[0] = start
-        if num > 1 and endpoint:
-            result[-1] = stop
-
-    result = out_sign * result
-
-    if axis != 0:
-        result = dpnp.moveaxis(result, 0, axis)
-
-    return result.astype(dtype, copy=False)
-
-
-def dpnp_linspace(
-    start,
-    stop,
-    num,
-    dtype=None,
-    device=None,
-    usm_type=None,
-    sycl_queue=None,
-    endpoint=True,
-    retstep=False,
-    axis=0,
-):
-    usm_type_alloc, sycl_queue_alloc = utils.get_usm_allocations([start, stop])
-
-    if sycl_queue is None and device is None:
-        sycl_queue = sycl_queue_alloc
-    sycl_queue_normalized = dpnp.get_normalized_queue_device(
-        sycl_queue=sycl_queue, device=device
-    )
-
-    if usm_type is None:
-        _usm_type = "device" if usm_type_alloc is None else usm_type_alloc
-    else:
-        _usm_type = usm_type
-
-    if not hasattr(start, "dtype") and not dpnp.isscalar(start):
-        start = dpnp.asarray(
-            start, usm_type=_usm_type, sycl_queue=sycl_queue_normalized
-        )
-    if not hasattr(stop, "dtype") and not dpnp.isscalar(stop):
-        stop = dpnp.asarray(
-            stop, usm_type=_usm_type, sycl_queue=sycl_queue_normalized
-        )
-
-    dt = numpy.result_type(start, stop, float(num))
-    dt = utils.map_dtype_to_device(dt, sycl_queue_normalized.sycl_device)
-    if dtype is None:
-        dtype = dt
-
-    num = operator.index(num)
-    if num < 0:
-        raise ValueError("Number of points must be non-negative")
-    step_num = (num - 1) if endpoint else num
-
-    step_nan = False
-    if step_num == 0:
-        step_nan = True
-        step = dpnp.nan
-
-    if dpnp.isscalar(start) and dpnp.isscalar(stop):
-        # Call linspace() function for scalars.
-        res = dpnp_container.linspace(
-            start,
-            stop,
-            num,
-            dtype=dt,
-            usm_type=_usm_type,
-            sycl_queue=sycl_queue_normalized,
-            endpoint=endpoint,
-        )
-        if retstep is True and step_nan is False:
-            step = (stop - start) / step_num
-    else:
-        _start = dpnp.asarray(
-            start,
-            dtype=dt,
-            usm_type=_usm_type,
-            sycl_queue=sycl_queue_normalized,
-        )
-        _stop = dpnp.asarray(
-            stop, dtype=dt, usm_type=_usm_type, sycl_queue=sycl_queue_normalized
-        )
-
-        res = dpnp_container.arange(
-            0,
-            stop=num,
-            step=1,
-            dtype=dt,
-            usm_type=_usm_type,
-            sycl_queue=sycl_queue_normalized,
-        )
-
-        if step_nan is False:
-            step = (_stop - _start) / step_num
-            res = res.reshape((-1,) + (1,) * step.ndim)
-            res = res * step + _start
-
-        if endpoint and num > 1:
-            res[-1] = dpnp_container.full(step.shape, _stop)
-
-    if axis != 0:
-        res = dpnp.moveaxis(res, 0, axis)
-
-    if numpy.issubdtype(dtype, dpnp.integer):
-        dpnp.floor(res, out=res)
-
-    res = res.astype(dtype, copy=False)
-
-    if retstep is True:
-        if dpnp.isscalar(step):
-            step = dpnp.asarray(
-                step, usm_type=res.usm_type, sycl_queue=res.sycl_queue
-            )
-        return (res, step)
-
-    return res
-
-
-def dpnp_logspace(
-    start,
-    stop,
-    num=50,
-    device=None,
-    usm_type=None,
-    sycl_queue=None,
-    endpoint=True,
-    base=10.0,
-    dtype=None,
-    axis=0,
-):
-    if not dpnp.isscalar(base):
-        usm_type_alloc, sycl_queue_alloc = utils.get_usm_allocations(
-            [start, stop, base]
-        )
-
-        if sycl_queue is None and device is None:
-            sycl_queue = sycl_queue_alloc
-        sycl_queue = dpnp.get_normalized_queue_device(
-            sycl_queue=sycl_queue, device=device
-        )
-
-        if usm_type is None:
-            usm_type = "device" if usm_type_alloc is None else usm_type_alloc
-        else:
-            usm_type = usm_type
-        start = dpnp.asarray(start, usm_type=usm_type, sycl_queue=sycl_queue)
-        stop = dpnp.asarray(stop, usm_type=usm_type, sycl_queue=sycl_queue)
-        base = dpnp.asarray(base, usm_type=usm_type, sycl_queue=sycl_queue)
-        [start, stop, base] = dpnp.broadcast_arrays(start, stop, base)
-        base = dpnp.expand_dims(base, axis=axis)
-
-    res = dpnp_linspace(
-        start,
-        stop,
-        num=num,
-        device=device,
-        usm_type=usm_type,
-        sycl_queue=sycl_queue,
-        endpoint=endpoint,
-        axis=axis,
-    )
-
-    if dtype is None:
-        return dpnp.power(base, res)
-    return dpnp.power(base, res).astype(dtype, copy=False)
-
-
-class dpnp_nd_grid:
-    """
-    Construct a multi-dimensional "meshgrid".
-
-    ``grid = dpnp_nd_grid()`` creates an instance which will return a mesh-grid
-    when indexed. The dimension and number of the output arrays are equal
-    to the number of indexing dimensions.  If the step length is not a
-    complex number, then the stop is not inclusive.
-
-    However, if the step length is a complex number (e.g. 5j), then the
-    integer part of its magnitude is interpreted as specifying the
-    number of points to create between the start and stop values, where
-    the stop value is inclusive.
-
-    If instantiated with an argument of ``sparse=True``, the mesh-grid is
-    open (or not fleshed out) so that only one-dimension of each returned
-    argument is greater than 1.
-
-    Parameters
-    ----------
-    sparse : bool, optional
-        Whether the grid is sparse or not. Default is False.
-
-    """
-
-    def __init__(
-        self, sparse=False, device=None, usm_type="device", sycl_queue=None
-    ):
-        dpu.validate_usm_type(usm_type, allow_none=False)
-        self.sparse = sparse
-        self.usm_type = usm_type
-        self.sycl_queue_normalized = dpnp.get_normalized_queue_device(
-            sycl_queue=sycl_queue, device=device
-        )
-
-    def __getitem__(self, key):
-        if isinstance(key, slice):
-            step = key.step
-            stop = key.stop
-            start = key.start
-            if start is None:
-                start = 0
-            if isinstance(step, complex):
-                step = abs(step)
-                length = int(step)
-                if step != 1:
-                    step = (stop - start) / float(step - 1)
-                stop = stop + step
-                return (
-                    dpnp.arange(
-                        0,
-                        length,
-                        1,
-                        dtype=dpnp.default_float_type(),
-                        usm_type=self.usm_type,
-                        sycl_queue=self.sycl_queue_normalized,
-                    )
-                    * step
-                    + start
-                )
-            else:
-                return dpnp.arange(
-                    start,
-                    stop,
-                    step,
-                    usm_type=self.usm_type,
-                    sycl_queue=self.sycl_queue_normalized,
-                )
-
-        size = []
-        dtype = int
-        for k in range(len(key)):
-            step = key[k].step
-            start = key[k].start
-            stop = key[k].stop
-            if start is None:
-                start = 0
-            if step is None:
-                step = 1
-            if isinstance(step, complex):
-                size.append(int(abs(step)))
-                dtype = dpnp.default_float_type()
-            else:
-                size.append(
-                    int(math.ceil((key[k].stop - start) / (step * 1.0)))
-                )
-            if (
-                isinstance(step, float)
-                or isinstance(start, float)
-                or isinstance(stop, float)
-            ):
-                dtype = dpnp.default_float_type()
-        if self.sparse:
-            nn = [
-                dpnp.arange(
-                    _x,
-                    dtype=_t,
-                    usm_type=self.usm_type,
-                    sycl_queue=self.sycl_queue_normalized,
-                )
-                for _x, _t in zip(size, (dtype,) * len(size))
-            ]
-        else:
-            nn = dpnp.indices(
-                size,
-                dtype,
-                usm_type=self.usm_type,
-                sycl_queue=self.sycl_queue_normalized,
-            )
-        for k in range(len(size)):
-            step = key[k].step
-            start = key[k].start
-            stop = key[k].stop
-            if start is None:
-                start = 0
-            if step is None:
-                step = 1
-            if isinstance(step, complex):
-                step = int(abs(step))
-                if step != 1:
-                    step = (stop - start) / float(step - 1)
-            nn[k] = nn[k] * step + start
-        if self.sparse:
-            slobj = [dpnp.newaxis] * len(size)
-            for k in range(len(size)):
-                slobj[k] = slice(None, None)
-                nn[k] = nn[k][tuple(slobj)]
-                slobj[k] = dpnp.newaxis
-        return nn
+import math
+import operator
+
+import dpctl.utils as dpu
+import numpy
+
+import dpnp
+import dpnp.dpnp_container as dpnp_container
+import dpnp.dpnp_utils as utils
+
+__all__ = [
+    "dpnp_geomspace",
+    "dpnp_linspace",
+    "dpnp_logspace",
+    "dpnp_nd_grid",
+]
+
+
+def dpnp_geomspace(
+    start,
+    stop,
+    num,
+    dtype=None,
+    device=None,
+    usm_type=None,
+    sycl_queue=None,
+    endpoint=True,
+    axis=0,
+):
+    usm_type_alloc, sycl_queue_alloc = utils.get_usm_allocations([start, stop])
+
+    if sycl_queue is None and device is None:
+        sycl_queue = sycl_queue_alloc
+    sycl_queue_normalized = dpnp.get_normalized_queue_device(
+        sycl_queue=sycl_queue, device=device
+    )
+
+    if usm_type is None:
+        _usm_type = "device" if usm_type_alloc is None else usm_type_alloc
+    else:
+        _usm_type = usm_type
+
+    if not dpnp.is_supported_array_type(start):
+        start = dpnp.asarray(
+            start, usm_type=_usm_type, sycl_queue=sycl_queue_normalized
+        )
+    if not dpnp.is_supported_array_type(stop):
+        stop = dpnp.asarray(
+            stop, usm_type=_usm_type, sycl_queue=sycl_queue_normalized
+        )
+
+    dt = numpy.result_type(start, stop, float(num))
+    dt = utils.map_dtype_to_device(dt, sycl_queue_normalized.sycl_device)
+    if dtype is None:
+        dtype = dt
+
+    if dpnp.any(start == 0) or dpnp.any(stop == 0):
+        raise ValueError("Geometric sequence cannot include zero")
+
+    out_sign = dpnp.ones(
+        dpnp.broadcast_arrays(start, stop)[0].shape,
+        dtype=dt,
+        usm_type=_usm_type,
+        sycl_queue=sycl_queue_normalized,
+    )
+    # Avoid negligible real or imaginary parts in output by rotating to
+    # positive real, calculating, then undoing rotation
+    if dpnp.issubdtype(dt, dpnp.complexfloating):
+        all_imag = (start.real == 0.0) & (stop.real == 0.0)
+        if dpnp.any(all_imag):
+            start[all_imag] = start[all_imag].imag
+            stop[all_imag] = stop[all_imag].imag
+            out_sign[all_imag] = 1j
+
+    both_negative = (dpnp.sign(start) == -1) & (dpnp.sign(stop) == -1)
+    if dpnp.any(both_negative):
+        dpnp.negative(start[both_negative], out=start[both_negative])
+        dpnp.negative(stop[both_negative], out=stop[both_negative])
+        dpnp.negative(out_sign[both_negative], out=out_sign[both_negative])
+
+    log_start = dpnp.log10(start)
+    log_stop = dpnp.log10(stop)
+    result = dpnp_logspace(
+        log_start,
+        log_stop,
+        num=num,
+        endpoint=endpoint,
+        base=10.0,
+        dtype=dtype,
+        usm_type=_usm_type,
+        sycl_queue=sycl_queue_normalized,
+    )
+
+    if num > 0:
+        result[0] = start
+        if num > 1 and endpoint:
+            result[-1] = stop
+
+    result = out_sign * result
+
+    if axis != 0:
+        result = dpnp.moveaxis(result, 0, axis)
+
+    return result.astype(dtype, copy=False)
+
+
+def dpnp_linspace(
+    start,
+    stop,
+    num,
+    dtype=None,
+    device=None,
+    usm_type=None,
+    sycl_queue=None,
+    endpoint=True,
+    retstep=False,
+    axis=0,
+):
+    usm_type_alloc, sycl_queue_alloc = utils.get_usm_allocations([start, stop])
+
+    if sycl_queue is None and device is None:
+        sycl_queue = sycl_queue_alloc
+    sycl_queue_normalized = dpnp.get_normalized_queue_device(
+        sycl_queue=sycl_queue, device=device
+    )
+
+    if usm_type is None:
+        _usm_type = "device" if usm_type_alloc is None else usm_type_alloc
+    else:
+        _usm_type = usm_type
+
+    if not hasattr(start, "dtype") and not dpnp.isscalar(start):
+        start = dpnp.asarray(
+            start, usm_type=_usm_type, sycl_queue=sycl_queue_normalized
+        )
+    if not hasattr(stop, "dtype") and not dpnp.isscalar(stop):
+        stop = dpnp.asarray(
+            stop, usm_type=_usm_type, sycl_queue=sycl_queue_normalized
+        )
+
+    dt = numpy.result_type(start, stop, float(num))
+    dt = utils.map_dtype_to_device(dt, sycl_queue_normalized.sycl_device)
+    if dtype is None:
+        dtype = dt
+
+    num = operator.index(num)
+    if num < 0:
+        raise ValueError("Number of points must be non-negative")
+    step_num = (num - 1) if endpoint else num
+
+    step_nan = False
+    if step_num == 0:
+        step_nan = True
+        step = dpnp.nan
+
+    if dpnp.isscalar(start) and dpnp.isscalar(stop):
+        # Call linspace() function for scalars.
+        res = dpnp_container.linspace(
+            start,
+            stop,
+            num,
+            dtype=dt,
+            usm_type=_usm_type,
+            sycl_queue=sycl_queue_normalized,
+            endpoint=endpoint,
+        )
+        if retstep is True and step_nan is False:
+            step = (stop - start) / step_num
+    else:
+        _start = dpnp.asarray(
+            start,
+            dtype=dt,
+            usm_type=_usm_type,
+            sycl_queue=sycl_queue_normalized,
+        )
+        _stop = dpnp.asarray(
+            stop, dtype=dt, usm_type=_usm_type, sycl_queue=sycl_queue_normalized
+        )
+
+        res = dpnp_container.arange(
+            0,
+            stop=num,
+            step=1,
+            dtype=dt,
+            usm_type=_usm_type,
+            sycl_queue=sycl_queue_normalized,
+        )
+
+        if step_nan is False:
+            step = (_stop - _start) / step_num
+            res = res.reshape((-1,) + (1,) * step.ndim)
+            res = res * step + _start
+
+        if endpoint and num > 1:
+            res[-1] = dpnp_container.full(step.shape, _stop)
+
+    if axis != 0:
+        res = dpnp.moveaxis(res, 0, axis)
+
+    if numpy.issubdtype(dtype, dpnp.integer):
+        dpnp.floor(res, out=res)
+
+    res = res.astype(dtype, copy=False)
+
+    if retstep is True:
+        if dpnp.isscalar(step):
+            step = dpnp.asarray(
+                step, usm_type=res.usm_type, sycl_queue=res.sycl_queue
+            )
+        return (res, step)
+
+    return res
+
+
+def dpnp_logspace(
+    start,
+    stop,
+    num=50,
+    device=None,
+    usm_type=None,
+    sycl_queue=None,
+    endpoint=True,
+    base=10.0,
+    dtype=None,
+    axis=0,
+):
+    if not dpnp.isscalar(base):
+        usm_type_alloc, sycl_queue_alloc = utils.get_usm_allocations(
+            [start, stop, base]
+        )
+
+        if sycl_queue is None and device is None:
+            sycl_queue = sycl_queue_alloc
+        sycl_queue = dpnp.get_normalized_queue_device(
+            sycl_queue=sycl_queue, device=device
+        )
+
+        if usm_type is None:
+            usm_type = "device" if usm_type_alloc is None else usm_type_alloc
+        else:
+            usm_type = usm_type
+        start = dpnp.asarray(start, usm_type=usm_type, sycl_queue=sycl_queue)
+        stop = dpnp.asarray(stop, usm_type=usm_type, sycl_queue=sycl_queue)
+        base = dpnp.asarray(base, usm_type=usm_type, sycl_queue=sycl_queue)
+        [start, stop, base] = dpnp.broadcast_arrays(start, stop, base)
+        base = dpnp.expand_dims(base, axis=axis)
+
+    res = dpnp_linspace(
+        start,
+        stop,
+        num=num,
+        device=device,
+        usm_type=usm_type,
+        sycl_queue=sycl_queue,
+        endpoint=endpoint,
+        axis=axis,
+    )
+
+    if dtype is None:
+        return dpnp.power(base, res)
+    return dpnp.power(base, res).astype(dtype, copy=False)
+
+
+class dpnp_nd_grid:
+    """
+    Construct a multi-dimensional "meshgrid".
+
+    ``grid = dpnp_nd_grid()`` creates an instance which will return a mesh-grid
+    when indexed. The dimension and number of the output arrays are equal
+    to the number of indexing dimensions.  If the step length is not a
+    complex number, then the stop is not inclusive.
+
+    However, if the step length is a complex number (e.g. 5j), then the
+    integer part of its magnitude is interpreted as specifying the
+    number of points to create between the start and stop values, where
+    the stop value is inclusive.
+
+    If instantiated with an argument of ``sparse=True``, the mesh-grid is
+    open (or not fleshed out) so that only one-dimension of each returned
+    argument is greater than 1.
+
+    Parameters
+    ----------
+    sparse : bool, optional
+        Whether the grid is sparse or not. Default is False.
+
+    """
+
+    def __init__(
+        self, sparse=False, device=None, usm_type="device", sycl_queue=None
+    ):
+        dpu.validate_usm_type(usm_type, allow_none=False)
+        self.sparse = sparse
+        self.usm_type = usm_type
+        self.sycl_queue_normalized = dpnp.get_normalized_queue_device(
+            sycl_queue=sycl_queue, device=device
+        )
+
+    def __getitem__(self, key):
+        if isinstance(key, slice):
+            step = key.step
+            stop = key.stop
+            start = key.start
+            if start is None:
+                start = 0
+            if isinstance(step, complex):
+                step = abs(step)
+                length = int(step)
+                if step != 1:
+                    step = (stop - start) / float(step - 1)
+                stop = stop + step
+                return (
+                    dpnp.arange(
+                        0,
+                        length,
+                        1,
+                        dtype=dpnp.default_float_type(),
+                        usm_type=self.usm_type,
+                        sycl_queue=self.sycl_queue_normalized,
+                    )
+                    * step
+                    + start
+                )
+            else:
+                return dpnp.arange(
+                    start,
+                    stop,
+                    step,
+                    usm_type=self.usm_type,
+                    sycl_queue=self.sycl_queue_normalized,
+                )
+
+        size = []
+        dtype = int
+        for k in range(len(key)):
+            step = key[k].step
+            start = key[k].start
+            stop = key[k].stop
+            if start is None:
+                start = 0
+            if step is None:
+                step = 1
+            if isinstance(step, complex):
+                size.append(int(abs(step)))
+                dtype = dpnp.default_float_type()
+            else:
+                size.append(
+                    int(math.ceil((key[k].stop - start) / (step * 1.0)))
+                )
+            if (
+                isinstance(step, float)
+                or isinstance(start, float)
+                or isinstance(stop, float)
+            ):
+                dtype = dpnp.default_float_type()
+        if self.sparse:
+            nn = [
+                dpnp.arange(
+                    _x,
+                    dtype=_t,
+                    usm_type=self.usm_type,
+                    sycl_queue=self.sycl_queue_normalized,
+                )
+                for _x, _t in zip(size, (dtype,) * len(size))
+            ]
+        else:
+            nn = dpnp.indices(
+                size,
+                dtype,
+                usm_type=self.usm_type,
+                sycl_queue=self.sycl_queue_normalized,
+            )
+        for k in range(len(size)):
+            step = key[k].step
+            start = key[k].start
+            stop = key[k].stop
+            if start is None:
+                start = 0
+            if step is None:
+                step = 1
+            if isinstance(step, complex):
+                step = int(abs(step))
+                if step != 1:
+                    step = (stop - start) / float(step - 1)
+            nn[k] = nn[k] * step + start
+        if self.sparse:
+            slobj = [dpnp.newaxis] * len(size)
+            for k in range(len(size)):
+                slobj[k] = slice(None, None)
+                nn[k] = nn[k][tuple(slobj)]
+                slobj[k] = dpnp.newaxis
+        return nn
diff --git a/scripts/build_deps_dpctl.sh b/scripts/build_deps_dpctl.sh
index 3d5331bbdfb..dd85846a9d5 100755
--- a/scripts/build_deps_dpctl.sh
+++ b/scripts/build_deps_dpctl.sh
@@ -1,21 +1,19 @@
 #!/bin/bash
 
-THEDIR=$(dirname $(readlink -e ${BASH_SOURCE[0]}))
-
 DPCTL_TARGET_VERSION=0.5.0rc2
 echo ++++++++++++++++++ Build DPCTL ${DPCTL_TARGET_VERSION} +++++++++++++++++++
 git clone --branch ${DPCTL_TARGET_VERSION} https://github.com/IntelPython/dpctl.git
 
-cd dpctl
+cd dpctl || exit 1
 
 # didn't find better way to set required version
-git tag -d $(git tag -l)
+git tag -l | xargs -r git tag -d  # one tag per argument; a quoted "$(git tag -l)" would be one bogus multi-line name
 git tag ${DPCTL_TARGET_VERSION}
 
 # python ./setup.py develop
 # python ./setup.py install
 
-conda build conda-recipe/ --no-test -c ${ONEAPI_ROOT}/conda_channel
+conda build conda-recipe/ --no-test -c "${ONEAPI_ROOT}"/conda_channel
 
 # ls -lR /opt/intel/oneapi/intelpython/latest/conda-bld
 
diff --git a/scripts/install_cmake_lin.sh b/scripts/install_cmake_lin.sh
index 966a22c617b..63ee19b0fdf 100755
--- a/scripts/install_cmake_lin.sh
+++ b/scripts/install_cmake_lin.sh
@@ -1,7 +1,5 @@
 #!/bin/bash
 
-THEDIR=$(dirname $(readlink -e ${BASH_SOURCE[0]}))
-
 echo ========================= install cmake ==================================
 curl --output cmake_webimage.tar.gz \
   --url https://github.com/Kitware/CMake/releases/download/v3.26.2/cmake-3.26.2-linux-x86_64.tar.gz \
@@ -10,7 +8,8 @@ curl --output cmake_webimage.tar.gz \
 tar -xzf cmake_webimage.tar.gz
 rm -f cmake_webimage.tar.gz
 
-export PATH=`pwd`/cmake-3.26.2-linux-x86_64/bin:$PATH
+PATH=$(pwd)/cmake-3.26.2-linux-x86_64/bin:$PATH
+export PATH
 
-which cmake
+command -v cmake
 cmake --version
diff --git a/scripts/install_python_deps.sh b/scripts/install_python_deps.sh
index bcb005403ba..e40d9a5b34a 100755
--- a/scripts/install_python_deps.sh
+++ b/scripts/install_python_deps.sh
@@ -1,7 +1,5 @@
 #!/bin/bash
 
-THEDIR=$(dirname $(readlink -e ${BASH_SOURCE[0]}))
-
 echo +++++++++++++++++++++++++ Python prerequisites +++++++++++++++++++++++++++++++++
 
 echo ========================= Conda: install prerequisites =========================
@@ -19,7 +17,7 @@ echo ========================= SW versions =====================================
 conda list
 
 python --version
-which python
+command -v python
 
 python -c "import numpy as sw; print(f\"sw.__version__={sw.__version__}\nsw.get_include={sw.get_include()}\")"
 python -c "import dpctl as sw; print(f\"sw.__version__={sw.__version__}\nsw.get_include={sw.get_include()}\")"
diff --git a/scripts/install_system_deps.sh b/scripts/install_system_deps.sh
index a0bd07a040c..591bb025e85 100755
--- a/scripts/install_system_deps.sh
+++ b/scripts/install_system_deps.sh
@@ -1,7 +1,5 @@
 #!/bin/bash
 
-THEDIR=$(dirname $(readlink -e ${BASH_SOURCE[0]}))
-
 # echo +++++++++++++++++++++++++ System prerequisites +++++++++++++++++++++++++++
 # sudo apt-get install -f
 # sudo dpkg --configure -a
diff --git a/scripts/install_system_deps_intelpython.sh b/scripts/install_system_deps_intelpython.sh
index 8c38d41c385..5dbef56ff7e 100755
--- a/scripts/install_system_deps_intelpython.sh
+++ b/scripts/install_system_deps_intelpython.sh
@@ -1,7 +1,5 @@
 #!/bin/bash
 
-THEDIR=$(dirname $(readlink -e ${BASH_SOURCE[0]}))
-
 echo +++++++++++++++++++++++++ Intel OneAPI Python ++++++++++++++++++++++++++++
 
 sudo apt-get install intel-oneapi-python
diff --git a/scripts/set_ci_env.sh b/scripts/set_ci_env.sh
index afc3cc1b6a2..d8e6c2b93e8 100755
--- a/scripts/set_ci_env.sh
+++ b/scripts/set_ci_env.sh
@@ -1,30 +1,30 @@
 #!/bin/bash
 
-THEDIR=$(dirname $(readlink -e ${BASH_SOURCE[0]}))
+THEDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
 
 echo
 echo ========================= Set DPNP environment ===========================
-echo SHELL=${SHELL}
-echo PWD=${PWD}
-echo HOME=${HOME}
+echo SHELL="${SHELL}"
+echo PWD="${PWD}"
+echo HOME="${HOME}"
 ls -l
 echo ========================= current machine kernel =========================
 uname -a
 
-${THEDIR}/install_system_deps.sh
+"${THEDIR}"/install_system_deps.sh
 . ./scripts/install_cmake_lin.sh
 
 echo ========================= setup Intel OneAPI python changed to Intel OneAPI ====
 . /opt/intel/oneapi/setvars.sh
 
-${THEDIR}/install_python_deps.sh
+"${THEDIR}"/install_python_deps.sh
 
 echo ========================= SW versions ===============================
 g++ --version
-which g++
+command -v g++
 
 clang++ --version
-which clang++
+command -v clang++
 
 dpcpp --version
-which dpcpp
+command -v dpcpp
diff --git a/tests/test_histograms.py b/tests/test_histograms.py
index 2fb4cd71fa6..a283c5547cc 100644
--- a/tests/test_histograms.py
+++ b/tests/test_histograms.py
@@ -1,89 +1,89 @@
-import numpy
-import pytest
-
-import dpnp
-
-from .helper import has_support_aspect64
-
-
-class TestHistogram:
-    def setup(self):
-        pass
-
-    def teardown(self):
-        pass
-
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
-    def test_simple(self):
-        n = 100
-        v = dpnp.random.rand(n)
-        a, _ = dpnp.histogram(v)
-        # check if the sum of the bins equals the number of samples
-        numpy.testing.assert_equal(dpnp.sum(a, axis=0), n)
-        # check that the bin counts are evenly spaced when the data is from
-        # a linear function
-        a, _ = dpnp.histogram(
-            numpy.linspace(
-                0,
-                10,
-                100,
-                dtype="float64" if has_support_aspect64() else "float32",
-            )
-        )
-        numpy.testing.assert_array_equal(a, 10)
-
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
-    def test_one_bin(self):
-        # Ticket 632
-        hist, edges = dpnp.histogram([1, 2, 3, 4], [1, 2])
-        numpy.testing.assert_array_equal(
-            hist,
-            [
-                2,
-            ],
-        )
-        numpy.testing.assert_array_equal(edges, [1, 2])
-        numpy.testing.assert_raises(ValueError, dpnp.histogram, [1, 2], bins=0)
-        h, e = dpnp.histogram([1, 2], bins=1)
-        numpy.testing.assert_equal(h, dpnp.array([2]))
-        numpy.testing.assert_allclose(e, dpnp.array([1.0, 2.0]))
-
-    def test_density(self):
-        # Check that the integral of the density equals 1.
-        n = 100
-        v = dpnp.random.rand(n)
-        a, b = dpnp.histogram(v, density=True)
-        area = dpnp.sum(a * dpnp.diff(b)[0])[0]
-        numpy.testing.assert_almost_equal(area, 1)
-
-        # Check with non-constant bin widths
-        v = dpnp.arange(10)
-        bins = [0, 1, 3, 6, 10]
-        a, b = dpnp.histogram(v, bins, density=True)
-        numpy.testing.assert_array_equal(a, 0.1)
-        numpy.testing.assert_equal(dpnp.sum(a * dpnp.diff(b))[0], 1)
-
-        # Test that passing False works too
-        a, b = dpnp.histogram(v, bins, density=False)
-        numpy.testing.assert_array_equal(a, [1, 2, 3, 4])
-
-        # Variable bin widths are especially useful to deal with
-        # infinities.
-        v = dpnp.arange(10)
-        bins = [0, 1, 3, 6, numpy.inf]
-        a, b = dpnp.histogram(v, bins, density=True)
-        numpy.testing.assert_array_equal(a, [0.1, 0.1, 0.1, 0.0])
-
-        # Taken from a bug report from N. Becker on the numpy-discussion
-        # mailing list Aug. 6, 2010.
-        counts, _ = dpnp.histogram(
-            [1, 2, 3, 4], [0.5, 1.5, numpy.inf], density=True
-        )
-        numpy.testing.assert_equal(counts, [0.25, 0])
-
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
-    def test_arr_weights_mismatch(self):
-        a = dpnp.arange(10) + 0.5
-        w = dpnp.arange(11) + 0.5
-        with numpy.testing.assert_raises_regex(ValueError, "same shape as"):
-            h, b = dpnp.histogram(a, range=[1, 9], weights=w, density=True)
+import numpy
+import pytest
+
+import dpnp
+
+from .helper import has_support_aspect64
+
+
+class TestHistogram:
+    def setup(self):
+        pass
+
+    def teardown(self):
+        pass
+
+    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    def test_simple(self):
+        n = 100
+        v = dpnp.random.rand(n)
+        a, _ = dpnp.histogram(v)
+        # check if the sum of the bins equals the number of samples
+        numpy.testing.assert_equal(dpnp.sum(a, axis=0), n)
+        # check that the bin counts are evenly spaced when the data is from
+        # a linear function
+        a, _ = dpnp.histogram(
+            numpy.linspace(
+                0,
+                10,
+                100,
+                dtype="float64" if has_support_aspect64() else "float32",
+            )
+        )
+        numpy.testing.assert_array_equal(a, 10)
+
+    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    def test_one_bin(self):
+        # Ticket 632
+        hist, edges = dpnp.histogram([1, 2, 3, 4], [1, 2])
+        numpy.testing.assert_array_equal(
+            hist,
+            [
+                2,
+            ],
+        )
+        numpy.testing.assert_array_equal(edges, [1, 2])
+        numpy.testing.assert_raises(ValueError, dpnp.histogram, [1, 2], bins=0)
+        h, e = dpnp.histogram([1, 2], bins=1)
+        numpy.testing.assert_equal(h, dpnp.array([2]))
+        numpy.testing.assert_allclose(e, dpnp.array([1.0, 2.0]))
+
+    def test_density(self):
+        # Check that the integral of the density equals 1.
+        n = 100
+        v = dpnp.random.rand(n)
+        a, b = dpnp.histogram(v, density=True)
+        area = dpnp.sum(a * dpnp.diff(b)[0])[0]
+        numpy.testing.assert_almost_equal(area, 1)
+
+        # Check with non-constant bin widths
+        v = dpnp.arange(10)
+        bins = [0, 1, 3, 6, 10]
+        a, b = dpnp.histogram(v, bins, density=True)
+        numpy.testing.assert_array_equal(a, 0.1)
+        numpy.testing.assert_equal(dpnp.sum(a * dpnp.diff(b))[0], 1)
+
+        # Test that passing False works too
+        a, b = dpnp.histogram(v, bins, density=False)
+        numpy.testing.assert_array_equal(a, [1, 2, 3, 4])
+
+        # Variable bin widths are especially useful to deal with
+        # infinities.
+        v = dpnp.arange(10)
+        bins = [0, 1, 3, 6, numpy.inf]
+        a, b = dpnp.histogram(v, bins, density=True)
+        numpy.testing.assert_array_equal(a, [0.1, 0.1, 0.1, 0.0])
+
+        # Taken from a bug report from N. Becker on the numpy-discussion
+        # mailing list Aug. 6, 2010.
+        counts, _ = dpnp.histogram(
+            [1, 2, 3, 4], [0.5, 1.5, numpy.inf], density=True
+        )
+        numpy.testing.assert_equal(counts, [0.25, 0])
+
+    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    def test_arr_weights_mismatch(self):
+        a = dpnp.arange(10) + 0.5
+        w = dpnp.arange(11) + 0.5
+        with numpy.testing.assert_raises_regex(ValueError, "same shape as"):
+            h, b = dpnp.histogram(a, range=[1, 9], weights=w, density=True)
diff --git a/tests/third_party/cupy/manipulation_tests/test_kind.py b/tests/third_party/cupy/manipulation_tests/test_kind.py
index 1812d77c0af..7cc61f722f8 100644
--- a/tests/third_party/cupy/manipulation_tests/test_kind.py
+++ b/tests/third_party/cupy/manipulation_tests/test_kind.py
@@ -1,143 +1,143 @@
-import unittest
-
-import numpy
-import pytest
-
-import dpnp as cupy
-from tests.helper import has_support_aspect64
-from tests.third_party.cupy import testing
-
-
-class TestKind(unittest.TestCase):
-    @pytest.mark.skip("dpnp.asarray_chkfinite() is not implemented yet")
-    @testing.for_orders("CFAK")
-    @testing.for_all_dtypes()
-    @testing.numpy_cupy_array_equal()
-    def test_asarray_chkfinite(self, xp, dtype, order):
-        a = [0, 4, 0, 5]
-        return xp.asarray_chkfinite(a, dtype=dtype, order=order)
-
-    @pytest.mark.skip("dpnp.asarray_chkfinite() is not implemented yet")
-    @testing.for_orders("CFAK")
-    @testing.for_all_dtypes(no_bool=True)
-    def test_asarray_chkfinite_non_finite_vals(self, dtype, order):
-        a = [-numpy.inf, 0.0, numpy.inf, numpy.nan]
-        for xp in (numpy, cupy):
-            if xp.issubdtype(dtype, xp.integer):
-                error = OverflowError
-            else:
-                error = ValueError
-            with pytest.raises(error):
-                xp.asarray_chkfinite(a, dtype=dtype, order=order)
-
-    @testing.for_all_dtypes()
-    def test_asfarray(self, dtype):
-        a = cupy.asarray([1, 2, 3])
-        a_gpu = cupy.asfarray(a, dtype)
-        a_cpu = numpy.asfarray(a, dtype)
-        if (
-            has_support_aspect64()
-            or cupy.issubdtype(dtype, cupy.complexfloating)
-            or cupy.issubdtype(dtype, cupy.floating)
-        ):
-            assert a_cpu.dtype == a_gpu.dtype
-        else:
-            assert a_cpu.dtype == cupy.float64
-            assert a_gpu.dtype == cupy.float32
-
-    @testing.for_all_dtypes()
-    def test_asfortranarray1(self, dtype):
-        def func(xp):
-            x = xp.zeros((2, 3), dtype=dtype)
-            ret = xp.asfortranarray(x)
-            assert x.flags.c_contiguous
-            assert ret.flags.f_contiguous
-
-        assert func(numpy) == func(cupy)
-
-    @testing.for_all_dtypes()
-    def test_asfortranarray2(self, dtype):
-        def func(xp):
-            x = xp.zeros((2, 3, 4), dtype=dtype)
-            ret = xp.asfortranarray(x)
-            assert x.flags.c_contiguous
-            assert ret.flags.f_contiguous
-
-        assert func(numpy) == func(cupy)
-
-    @testing.for_all_dtypes()
-    def test_asfortranarray3(self, dtype):
-        def func(xp):
-            x = xp.zeros((2, 3, 4), dtype=dtype)
-            ret = xp.asfortranarray(xp.asfortranarray(x))
-            assert x.flags.c_contiguous
-            assert ret.flags.f_contiguous
-
-        assert func(numpy) == func(cupy)
-
-    @testing.for_all_dtypes()
-    def test_asfortranarray4(self, dtype):
-        def func(xp):
-            x = xp.zeros((2, 3), dtype=dtype)
-            x = xp.transpose(x, (1, 0))
-            ret = xp.asfortranarray(x)
-            assert ret.flags.f_contiguous
-
-        assert func(numpy) == func(cupy)
-
-    @testing.for_all_dtypes()
-    def test_asfortranarray5(self, dtype):
-        def func(xp):
-            x = testing.shaped_arange((2, 3), xp, dtype)
-            ret = xp.asfortranarray(x)
-            assert x.flags.c_contiguous
-            assert ret.flags.f_contiguous
-
-        assert func(numpy) == func(cupy)
-
-    @pytest.mark.skip("dpnp.require() is not implemented yet")
-    @testing.for_all_dtypes()
-    def test_require_flag_check(self, dtype):
-        possible_flags = [["C_CONTIGUOUS"], ["F_CONTIGUOUS"]]
-        x = cupy.zeros((2, 3, 4), dtype=dtype)
-        for flags in possible_flags:
-            arr = cupy.require(x, dtype, flags)
-            for parameter in flags:
-                assert arr.flags[parameter]
-                assert arr.dtype == dtype
-
-    @pytest.mark.skip("dpnp.require() is not implemented yet")
-    @testing.for_all_dtypes()
-    def test_require_owndata(self, dtype):
-        x = cupy.zeros((2, 3, 4), dtype=dtype)
-        arr = x.view()
-        arr = cupy.require(arr, dtype, ["O"])
-        assert arr.flags["OWNDATA"]
-
-    @pytest.mark.skip("dpnp.require() is not implemented yet")
-    @testing.for_all_dtypes()
-    def test_require_C_and_F_flags(self, dtype):
-        x = cupy.zeros((2, 3, 4), dtype=dtype)
-        with pytest.raises(ValueError):
-            cupy.require(x, dtype, ["C", "F"])
-
-    @pytest.mark.skip("dpnp.require() is not implemented yet")
-    @testing.for_all_dtypes()
-    def test_require_incorrect_requirments(self, dtype):
-        x = cupy.zeros((2, 3, 4), dtype=dtype)
-        with pytest.raises(ValueError):
-            cupy.require(x, dtype, ["W"])
-
-    @pytest.mark.skip("dpnp.require() is not implemented yet")
-    @testing.for_all_dtypes()
-    def test_require_incorrect_dtype(self, dtype):
-        x = cupy.zeros((2, 3, 4), dtype=dtype)
-        with pytest.raises(ValueError):
-            cupy.require(x, "random", "C")
-
-    @pytest.mark.skip("dpnp.require() is not implemented yet")
-    @testing.for_all_dtypes()
-    def test_require_empty_requirements(self, dtype):
-        x = cupy.zeros((2, 3, 4), dtype=dtype)
-        x = cupy.require(x, dtype, [])
-        assert x.flags["C_CONTIGUOUS"]
+import unittest
+
+import numpy
+import pytest
+
+import dpnp as cupy
+from tests.helper import has_support_aspect64
+from tests.third_party.cupy import testing
+
+
+class TestKind(unittest.TestCase):
+    @pytest.mark.skip("dpnp.asarray_chkfinite() is not implemented yet")
+    @testing.for_orders("CFAK")
+    @testing.for_all_dtypes()
+    @testing.numpy_cupy_array_equal()
+    def test_asarray_chkfinite(self, xp, dtype, order):
+        a = [0, 4, 0, 5]
+        return xp.asarray_chkfinite(a, dtype=dtype, order=order)
+
+    @pytest.mark.skip("dpnp.asarray_chkfinite() is not implemented yet")
+    @testing.for_orders("CFAK")
+    @testing.for_all_dtypes(no_bool=True)
+    def test_asarray_chkfinite_non_finite_vals(self, dtype, order):
+        a = [-numpy.inf, 0.0, numpy.inf, numpy.nan]
+        for xp in (numpy, cupy):
+            if xp.issubdtype(dtype, xp.integer):
+                error = OverflowError
+            else:
+                error = ValueError
+            with pytest.raises(error):
+                xp.asarray_chkfinite(a, dtype=dtype, order=order)
+
+    @testing.for_all_dtypes()
+    def test_asfarray(self, dtype):
+        a = cupy.asarray([1, 2, 3])
+        a_gpu = cupy.asfarray(a, dtype)
+        a_cpu = numpy.asfarray(a, dtype)
+        if (
+            has_support_aspect64()
+            or cupy.issubdtype(dtype, cupy.complexfloating)
+            or cupy.issubdtype(dtype, cupy.floating)
+        ):
+            assert a_cpu.dtype == a_gpu.dtype
+        else:
+            assert a_cpu.dtype == cupy.float64
+            assert a_gpu.dtype == cupy.float32
+
+    @testing.for_all_dtypes()
+    def test_asfortranarray1(self, dtype):
+        def func(xp):
+            x = xp.zeros((2, 3), dtype=dtype)
+            ret = xp.asfortranarray(x)
+            assert x.flags.c_contiguous
+            assert ret.flags.f_contiguous
+
+        assert func(numpy) == func(cupy)
+
+    @testing.for_all_dtypes()
+    def test_asfortranarray2(self, dtype):
+        def func(xp):
+            x = xp.zeros((2, 3, 4), dtype=dtype)
+            ret = xp.asfortranarray(x)
+            assert x.flags.c_contiguous
+            assert ret.flags.f_contiguous
+
+        assert func(numpy) == func(cupy)
+
+    @testing.for_all_dtypes()
+    def test_asfortranarray3(self, dtype):
+        def func(xp):
+            x = xp.zeros((2, 3, 4), dtype=dtype)
+            ret = xp.asfortranarray(xp.asfortranarray(x))
+            assert x.flags.c_contiguous
+            assert ret.flags.f_contiguous
+
+        assert func(numpy) == func(cupy)
+
+    @testing.for_all_dtypes()
+    def test_asfortranarray4(self, dtype):
+        def func(xp):
+            x = xp.zeros((2, 3), dtype=dtype)
+            x = xp.transpose(x, (1, 0))
+            ret = xp.asfortranarray(x)
+            assert ret.flags.f_contiguous
+
+        assert func(numpy) == func(cupy)
+
+    @testing.for_all_dtypes()
+    def test_asfortranarray5(self, dtype):
+        def func(xp):
+            x = testing.shaped_arange((2, 3), xp, dtype)
+            ret = xp.asfortranarray(x)
+            assert x.flags.c_contiguous
+            assert ret.flags.f_contiguous
+
+        assert func(numpy) == func(cupy)
+
+    @pytest.mark.skip("dpnp.require() is not implemented yet")
+    @testing.for_all_dtypes()
+    def test_require_flag_check(self, dtype):
+        possible_flags = [["C_CONTIGUOUS"], ["F_CONTIGUOUS"]]
+        x = cupy.zeros((2, 3, 4), dtype=dtype)
+        for flags in possible_flags:
+            arr = cupy.require(x, dtype, flags)
+            for parameter in flags:
+                assert arr.flags[parameter]
+                assert arr.dtype == dtype
+
+    @pytest.mark.skip("dpnp.require() is not implemented yet")
+    @testing.for_all_dtypes()
+    def test_require_owndata(self, dtype):
+        x = cupy.zeros((2, 3, 4), dtype=dtype)
+        arr = x.view()
+        arr = cupy.require(arr, dtype, ["O"])
+        assert arr.flags["OWNDATA"]
+
+    @pytest.mark.skip("dpnp.require() is not implemented yet")
+    @testing.for_all_dtypes()
+    def test_require_C_and_F_flags(self, dtype):
+        x = cupy.zeros((2, 3, 4), dtype=dtype)
+        with pytest.raises(ValueError):
+            cupy.require(x, dtype, ["C", "F"])
+
+    @pytest.mark.skip("dpnp.require() is not implemented yet")
+    @testing.for_all_dtypes()
+    def test_require_incorrect_requirments(self, dtype):
+        x = cupy.zeros((2, 3, 4), dtype=dtype)
+        with pytest.raises(ValueError):
+            cupy.require(x, dtype, ["W"])
+
+    @pytest.mark.skip("dpnp.require() is not implemented yet")
+    @testing.for_all_dtypes()
+    def test_require_incorrect_dtype(self, dtype):
+        x = cupy.zeros((2, 3, 4), dtype=dtype)
+        with pytest.raises(ValueError):
+            cupy.require(x, "random", "C")
+
+    @pytest.mark.skip("dpnp.require() is not implemented yet")
+    @testing.for_all_dtypes()
+    def test_require_empty_requirements(self, dtype):
+        x = cupy.zeros((2, 3, 4), dtype=dtype)
+        x = cupy.require(x, dtype, [])
+        assert x.flags["C_CONTIGUOUS"]
diff --git a/tests/third_party/intel/test_zero_copy_test1.py b/tests/third_party/intel/test_zero_copy_test1.py
index 9c9d0fa9dba..c59cd5b3188 100644
--- a/tests/third_party/intel/test_zero_copy_test1.py
+++ b/tests/third_party/intel/test_zero_copy_test1.py
@@ -1,36 +1,36 @@
-import importlib
-import sys
-
-import pytest
-
-
-class dummymodule:
-    pass
-
-
-sys.modules["numba_dppy"] = dummymodule
-
-module_not_found = False
-
-reason = ""
-
-try:
-    zero_copy_test1 = importlib.import_module("zero-copy-test1")
-except ModuleNotFoundError as e:
-    module_not_found = True
-    reason = str(e)
-
-
-@pytest.mark.skipif(module_not_found, reason=reason)
-def test_dpnp_interaction_with_dpctl_memory():
-    return zero_copy_test1.test_dpnp_interaction_with_dpctl_memory()
-
-
-@pytest.mark.skipif(module_not_found, reason=reason)
-def test_dpnp_array_has_iface():
-    return zero_copy_test1.test_dpnp_array_has_iface()
-
-
-@pytest.mark.skipif(module_not_found, reason=reason)
-def test_dpctl_dparray_has_iface():
-    return zero_copy_test1.test_dpctl_dparray_has_iface()
+import importlib
+import sys
+
+import pytest
+
+
+class dummymodule:
+    pass
+
+
+sys.modules["numba_dppy"] = dummymodule
+
+module_not_found = False
+
+reason = ""
+
+try:
+    zero_copy_test1 = importlib.import_module("zero-copy-test1")
+except ModuleNotFoundError as e:
+    module_not_found = True
+    reason = str(e)
+
+
+@pytest.mark.skipif(module_not_found, reason=reason)
+def test_dpnp_interaction_with_dpctl_memory():
+    return zero_copy_test1.test_dpnp_interaction_with_dpctl_memory()
+
+
+@pytest.mark.skipif(module_not_found, reason=reason)
+def test_dpnp_array_has_iface():
+    return zero_copy_test1.test_dpnp_array_has_iface()
+
+
+@pytest.mark.skipif(module_not_found, reason=reason)
+def test_dpctl_dparray_has_iface():
+    return zero_copy_test1.test_dpctl_dparray_has_iface()
diff --git a/tests/third_party/intel/zero-copy-test1.py b/tests/third_party/intel/zero-copy-test1.py
index 4e7b110c669..44d2d776e9b 100644
--- a/tests/third_party/intel/zero-copy-test1.py
+++ b/tests/third_party/intel/zero-copy-test1.py
@@ -1,84 +1,84 @@
-import dpctl
-import dpctl.memory as dpmem
-import dpctl.tensor.numpy_usm_shared as usmarray
-import numba_dppy as dppy
-import numpy as np
-import pytest
-
-import dpnp
-
-
-class DuckUSMArray:
-    def __init__(self, shape, dtype="d", host_buffer=None):
-        nelems = np.prod(shape)
-        bytes = nelems * np.dtype(dtype).itemsize
-        shmem = dpmem.MemoryUSMShared(bytes)
-        if isinstance(host_buffer, np.ndarray):
-            shmem.copy_from_host(host_buffer.view(dtype="|u1"))
-        self.arr = np.ndarray(shape, dtype=dtype, buffer=shmem)
-
-    def __getitem__(self, indx):
-        return self.arr[indx]
-
-    def __setitem__(self, indx, val):
-        self.arr.__setitem__(indx, val)
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        iface = self.arr.__array_interface__
-        b = self.arr.base
-        iface["syclobj"] = b.__sycl_usm_array_interface__["syclobj"]
-        iface["version"] = 1
-        return iface
-
-
-def test_dpnp_interaction_with_dpctl_memory():
-    """Tests if dpnp supports zero-copy data exchange with another Python
-    object that defines `__sycl_usm_array_interface__`
-    """
-    hb = np.arange(0, 100, dtype=np.int64)
-    da = DuckUSMArray(hb.shape, dtype=hb.dtype, host_buffer=hb)
-
-    Y = dpnp.asarray(da)
-    # dpnp array must infer dimensions/dtype from input object
-    assert Y.dtype == hb.dtype
-    assert Y.shape == hb.shape
-
-    Y[0] = 10
-    assert da[0] == 10  # check zero copy
-
-
-def test_dppy_array_pass():
-    """Tests if dppy supports passing an array-like object DuckArray that defines `__sycl_usm_array_interface__`
-    to a dppy.kernel
-    """
-
-    @dppy.kernel
-    def dppy_f(array_like_obj):
-        i = dppy.get_global_id(0)
-        array_like_obj[i] = 10
-
-    global_size = 100
-    hb = np.arange(0, global_size, dtype="i4")
-    da = DuckUSMArray(hb.shape, dtype=hb.dtype, host_buffer=hb)
-
-    if dpctl.has_gpu_queues(dpctl.backend_type.level_zero):
-        print("\nScheduling on OpenCL GPU\n")
-        with dpctl.device_context("opencl:gpu") as gpu_queue:
-            dppy_f[global_size, dppy.DEFAULT_LOCAL_SIZE](da)
-    else:
-        print("\nSkip scheduling on OpenCL GPU\n")
-
-    assert da[0] == 10
-
-
-def test_dpctl_dparray_has_iface():
-    """Tests if dpctl.dptensor.numpy_usm_shared defines '__sycl_usm_array_interface__'"""
-    X = usmarray.ones(10)
-    assert type(getattr(X, "__sycl_usm_array_interface__", None) is dict)
-
-
-def test_dpnp_array_has_iface():
-    """Tests if dpnp.ndarray defines '__sycl_usm_array_interface__'"""
-    X = dpnp.array([1])
-    assert type(getattr(X, "__sycl_usm_array_interface__", None) is dict)
+import dpctl
+import dpctl.memory as dpmem
+import dpctl.tensor.numpy_usm_shared as usmarray
+import numba_dppy as dppy
+import numpy as np
+import pytest
+
+import dpnp
+
+
+class DuckUSMArray:
+    def __init__(self, shape, dtype="d", host_buffer=None):
+        nelems = np.prod(shape)
+        bytes = nelems * np.dtype(dtype).itemsize
+        shmem = dpmem.MemoryUSMShared(bytes)
+        if isinstance(host_buffer, np.ndarray):
+            shmem.copy_from_host(host_buffer.view(dtype="|u1"))
+        self.arr = np.ndarray(shape, dtype=dtype, buffer=shmem)
+
+    def __getitem__(self, indx):
+        return self.arr[indx]
+
+    def __setitem__(self, indx, val):
+        self.arr.__setitem__(indx, val)
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        iface = self.arr.__array_interface__
+        b = self.arr.base
+        iface["syclobj"] = b.__sycl_usm_array_interface__["syclobj"]
+        iface["version"] = 1
+        return iface
+
+
+def test_dpnp_interaction_with_dpctl_memory():
+    """Tests if dpnp supports zero-copy data exchange with another Python
+    object that defines `__sycl_usm_array_interface__`
+    """
+    hb = np.arange(0, 100, dtype=np.int64)
+    da = DuckUSMArray(hb.shape, dtype=hb.dtype, host_buffer=hb)
+
+    Y = dpnp.asarray(da)
+    # dpnp array must infer dimensions/dtype from input object
+    assert Y.dtype == hb.dtype
+    assert Y.shape == hb.shape
+
+    Y[0] = 10
+    assert da[0] == 10  # check zero copy
+
+
+def test_dppy_array_pass():
+    """Tests if dppy supports passing an array-like object DuckArray that defines `__sycl_usm_array_interface__`
+    to a dppy.kernel
+    """
+
+    @dppy.kernel
+    def dppy_f(array_like_obj):
+        i = dppy.get_global_id(0)
+        array_like_obj[i] = 10
+
+    global_size = 100
+    hb = np.arange(0, global_size, dtype="i4")
+    da = DuckUSMArray(hb.shape, dtype=hb.dtype, host_buffer=hb)
+
+    if dpctl.has_gpu_queues(dpctl.backend_type.level_zero):
+        print("\nScheduling on OpenCL GPU\n")
+        with dpctl.device_context("opencl:gpu") as gpu_queue:
+            dppy_f[global_size, dppy.DEFAULT_LOCAL_SIZE](da)
+    else:
+        print("\nSkip scheduling on OpenCL GPU\n")
+
+    assert da[0] == 10
+
+
+def test_dpctl_dparray_has_iface():
+    """Tests if dpctl.tensor.numpy_usm_shared arrays define '__sycl_usm_array_interface__'"""
+    X = usmarray.ones(10)
+    assert isinstance(getattr(X, "__sycl_usm_array_interface__", None), dict)
+
+
+def test_dpnp_array_has_iface():
+    """Tests if dpnp.ndarray defines '__sycl_usm_array_interface__'"""
+    X = dpnp.array([1])
+    assert isinstance(getattr(X, "__sycl_usm_array_interface__", None), dict)

From 554bcddcb26fae9f9c97884aa7dcd7ae83767b89 Mon Sep 17 00:00:00 2001
From: Natalia Polina <natalia.polina@intel.com>
Date: Mon, 5 Feb 2024 15:46:29 -0800
Subject: [PATCH 17/29] Update docs for array creation functions (#1674)

* Update docs for array creation functions

* fix pre-commit

* address comments

* address comments

* Add CFD examples and update CFD check for dpnp.copy

* Fix dpnp.asfortranarray and dpnp.ascontiguousarray functions for not array input (#1691)

* Fix dpnp.asfortranarray and dpnp.ascontiguousarray functions for not array input

* Fix tests
---
 dpnp/dpnp_iface_arraycreation.py | 1398 +++++++++++++++++++++++++-----
 tests/test_arraycreation.py      |   36 +
 tests/test_sycl_queue.py         |    2 +
 tests/test_usm_type.py           |    1 +
 4 files changed, 1237 insertions(+), 200 deletions(-)

diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py
index 851ef119975..26464d40256 100644
--- a/dpnp/dpnp_iface_arraycreation.py
+++ b/dpnp/dpnp_iface_arraycreation.py
@@ -112,6 +112,30 @@ def arange(
 
     For full documentation refer to :obj:`numpy.arange`.
 
+    Parameters
+    ----------
+    start : {int, real}, optional
+        Start of interval. The interval includes this value. The default start value is 0.
+    stop : {int, real}
+        End of interval. The interval does not include this value, except in some cases
+        where `step` is not an integer and floating point round-off affects the length of `out`.
+    step : {int, real}, optional
+        Spacing between values. The default `step` size is 1. If `step` is specified as
+        a positional argument, `start` must also be given.
+    dtype : dtype, optional
+        The desired dtype for the array. If not given, a default dtype will be used that can represent
+        the values (by considering type promotion rules and device capabilities when necessary).
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {"device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is "device".
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
     Returns
     -------
     out : dpnp.ndarray
@@ -129,12 +153,26 @@ def arange(
     Examples
     --------
     >>> import dpnp as np
-    >>> [i for i in np.arange(3)]
-    [0, 1, 2]
-    >>> [i for i in np.arange(3, 7)]
-    [3, 4, 5, 6]
-    >>> [i for i in np.arange(3, 7, 2)]
-    [3, 5]
+    >>> np.arange(3)
+    array([0, 1, 2])
+    >>> np.arange(3, 7)
+    array([3, 4, 5, 6])
+    >>> np.arange(3, 7, 2)
+    array([3, 5])
+
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.arange(3)  # default case
+    >>> x, x.device, x.usm_type
+    (array([0, 1, 2]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.arange(3, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([0, 1, 2]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.arange(3, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([0, 1, 2]), Device(level_zero:gpu:0), 'host')
 
     """
 
@@ -172,6 +210,29 @@ def array(
 
     For full documentation refer to :obj:`numpy.array`.
 
+    Parameters
+    ----------
+    a : array_like
+        Input data, in any form that can be converted to an array. This includes scalars,
+        lists, lists of tuples, tuples, tuples of tuples, tuples of lists, and ndarrays.
+    dtype : dtype, optional
+        The desired dtype for the array. If not given, a default dtype will be used that can represent
+        the values (by considering type promotion rules and device capabilities when necessary).
+    copy : bool, optional
+        If ``True`` (default), then the object is copied.
+    order : {"C", "F", "A", "K"}, optional
+        Memory layout of the newly created output array. Default: "K".
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
     Returns
     -------
     out : dpnp.ndarray
@@ -201,17 +262,31 @@ def array(
     >>> x = np.array([1, 2, 3])
     >>> x.ndim, x.size, x.shape
     (1, 3, (3,))
-    >>> print(x)
-    [1 2 3]
+    >>> x
+    array([1, 2, 3])
 
     More than one dimension:
 
     >>> x2 = np.array([[1, 2], [3, 4]])
     >>> x2.ndim, x2.size, x2.shape
     (2, 4, (2, 2))
-    >>> print(x2)
-    [[1 2]
-     [3 4]]
+    >>> x2
+    array([[1, 2],
+           [3, 4]])
+
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.array([1, 2, 3]) # default case
+    >>> x, x.device, x.usm_type
+    (array([1, 2, 3]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.array([1, 2, 3], device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([1, 2, 3]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.array([1, 2, 3], usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([1, 2, 3]), Device(level_zero:gpu:0), 'host')
 
     """
 
@@ -262,6 +337,27 @@ def asanyarray(
 
     For full documentation refer to :obj:`numpy.asanyarray`.
 
+    Parameters
+    ----------
+    a : array_like
+        Input data, in any form that can be converted to an array. This includes scalars,
+        lists, lists of tuples, tuples, tuples of tuples, tuples of lists, and ndarrays.
+    dtype : dtype, optional
+        The desired dtype for the array. If not given, a default dtype will be used that can represent
+        the values (by considering type promotion rules and device capabilities when necessary).
+    order : {"C", "F", "A", "K"}, optional
+        Memory layout of the newly created output array. Default: "K".
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
     Returns
     -------
     out : dpnp.ndarray
@@ -291,6 +387,20 @@ def asanyarray(
     >>> np.asanyarray([1, 2, 3])
     array([1, 2, 3])
 
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.asanyarray([1, 2, 3]) # default case
+    >>> x, x.device, x.usm_type
+    (array([1, 2, 3]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.asanyarray([1, 2, 3], device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([1, 2, 3]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.asanyarray([1, 2, 3], usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([1, 2, 3]), Device(level_zero:gpu:0), 'host')
+
     """
 
     if like is not None:
@@ -323,6 +433,27 @@ def asarray(
 
     For full documentation refer to :obj:`numpy.asarray`.
 
+    Parameters
+    ----------
+    a : array_like
+        Input data, in any form that can be converted to an array. This includes scalars,
+        lists, lists of tuples, tuples, tuples of tuples, tuples of lists, and ndarrays.
+    dtype : dtype, optional
+        The desired dtype for the array. If not given, a default dtype will be used that can represent
+        the values (by considering Promotion Type Rule and device capabilities when necessary.)
+    order : {"C", "F", "A", "K"}, optional
+        Memory layout of the newly output array. Default: "K".
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
     Returns
     -------
     out : dpnp.ndarray
@@ -353,6 +484,20 @@ def asarray(
     >>> np.asarray([1, 2, 3])
     array([1, 2, 3])
 
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.asarray([1, 2, 3]) # default case
+    >>> x, x.device, x.usm_type
+    (array([1, 2, 3]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.asarray([1, 2, 3], device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([1, 2, 3]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.asarray([1, 2, 3], usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([1, 2, 3]), Device(level_zero:gpu:0), 'host')
+
     """
 
     if like is not None:
@@ -379,6 +524,25 @@ def ascontiguousarray(
 
     For full documentation refer to :obj:`numpy.ascontiguousarray`.
 
+    Parameters
+    ----------
+    a : array_like
+        Input data, in any form that can be converted to an array. This includes scalars,
+        lists, lists of tuples, tuples, tuples of tuples, tuples of lists, and ndarrays.
+    dtype : dtype, optional
+        The desired dtype for the array. If not given, a default dtype will be used that can represent
+        the values (by considering Promotion Type Rule and device capabilities when necessary.)
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
     Returns
     -------
     out : dpnp.ndarray
@@ -424,6 +588,21 @@ def ascontiguousarray(
     >>> x is y
     True
 
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x0 = np.asarray([1, 2, 3])
+    >>> x = np.ascontiguousarray(x0) # default case
+    >>> x, x.device, x.usm_type
+    (array([1, 2, 3]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.ascontiguousarray(x0, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([1, 2, 3]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.ascontiguousarray(x0, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([1, 2, 3]), Device(level_zero:gpu:0), 'host')
+
     """
 
     if like is not None:
@@ -433,7 +612,7 @@ def ascontiguousarray(
         )
 
     # at least 1-d array has to be returned
-    if a.ndim == 0:
+    if dpnp.isscalar(a) or hasattr(a, "ndim") and a.ndim == 0:
         a = [a]
 
     return asarray(
@@ -454,6 +633,25 @@ def asfortranarray(
 
     For full documentation refer to :obj:`numpy.asfortranarray`.
 
+    Parameters
+    ----------
+    a : array_like
+        Input data, in any form that can be converted to an array. This includes scalars,
+        lists, lists of tuples, tuples, tuples of tuples, tuples of lists, and ndarrays.
+    dtype : dtype, optional
+        The desired dtype for the array. If not given, a default dtype will be used that can represent
+        the values (by considering Promotion Type Rule and device capabilities when necessary.)
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
     Returns
     -------
     out : dpnp.ndarray
@@ -501,6 +699,21 @@ def asfortranarray(
     >>> x is y
     True
 
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x0 = np.asarray([1, 2, 3])
+    >>> x = np.asfortranarray(x0) # default case
+    >>> x, x.device, x.usm_type
+    (array([1, 2, 3]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.asfortranarray(x0, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([1, 2, 3]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.asfortranarray(x0, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([1, 2, 3]), Device(level_zero:gpu:0), 'host')
+
     """
 
     if like is not None:
@@ -510,7 +723,7 @@ def asfortranarray(
         )
 
     # at least 1-d array has to be returned
-    if a.ndim == 0:
+    if dpnp.isscalar(a) or hasattr(a, "ndim") and a.ndim == 0:
         a = [a]
 
     return asarray(
@@ -523,12 +736,32 @@ def asfortranarray(
     )
 
 
-def copy(a, order="K", subok=False):
+def copy(
+    a, order="K", subok=False, device=None, usm_type=None, sycl_queue=None
+):
     """
     Return an array copy of the given object.
 
     For full documentation refer to :obj:`numpy.copy`.
 
+    Parameters
+    ----------
+    a : array_like
+        Input data, in any form that can be converted to an array. This includes scalars,
+        lists, lists of tuples, tuples, tuples of tuples, tuples of lists, and ndarrays.
+    order : {"C", "F", "A", "K"}, optional
+        Memory layout of the newly output array. Default: "K".
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
     Limitations
     -----------
     Parameter `subok` is supported only with default value ``False``.
@@ -566,6 +799,21 @@ def copy(a, order="K", subok=False):
     >>> x[0] == z[0]
     array(False)
 
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x0 = np.array([1, 2, 3])
+    >>> x = np.copy(x0) # default case
+    >>> x, x.device, x.usm_type
+    (array([1, 2, 3]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.copy(x0, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([1, 2, 3]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.copy(x0, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([1, 2, 3]), Device(level_zero:gpu:0), 'host')
+
     """
 
     if subok is not False:
@@ -575,9 +823,23 @@ def copy(a, order="K", subok=False):
         )
 
     if dpnp.is_supported_array_type(a):
-        return dpnp_container.copy(a, order=order)
+        sycl_queue_normalized = dpnp.get_normalized_queue_device(
+            a, device=device, sycl_queue=sycl_queue
+        )
+        if (
+            usm_type is None or usm_type == a.usm_type
+        ) and sycl_queue_normalized == a.sycl_queue:
+            return dpnp_container.copy(a, order=order)
 
-    return array(a, order=order, subok=subok, copy=True)
+    return array(
+        a,
+        order=order,
+        subok=subok,
+        copy=True,
+        device=device,
+        usm_type=usm_type,
+        sycl_queue=sycl_queue,
+    )
 
 
 def diag(v, /, k=0, *, device=None, usm_type=None, sycl_queue=None):
@@ -586,16 +848,32 @@ def diag(v, /, k=0, *, device=None, usm_type=None, sycl_queue=None):
 
     For full documentation refer to :obj:`numpy.diag`.
 
+    Parameters
+    ----------
+    v : array_like
+        Input data, in any form that can be converted to an array. This includes scalars,
+        lists, lists of tuples, tuples, tuples of tuples, tuples of lists, and ndarrays.
+        If `v` is a 2-D array, return a copy of its k-th diagonal. If `v` is a 1-D array,
+        return a 2-D array with `v` on the k-th diagonal.
+    k : int, optional
+        Diagonal in question. The default is 0. Use k > 0 for diagonals above the main diagonal,
+        and k < 0 for diagonals below the main diagonal.
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
     Returns
     -------
     out : dpnp.ndarray
         The extracted diagonal or constructed diagonal array.
 
-    Limitations
-    -----------
-    Parameter `k` is only supported as integer data type.
-    Otherwise ``TypeError`` exception will be raised.
-
     See Also
     --------
     :obj:`diagonal` : Return specified diagonals.
@@ -607,24 +885,38 @@ def diag(v, /, k=0, *, device=None, usm_type=None, sycl_queue=None):
     Examples
     --------
     >>> import dpnp as np
-    >>> x = np.arange(9).reshape((3,3))
-    >>> x
+    >>> x0 = np.arange(9).reshape((3, 3))
+    >>> x0
     array([[0, 1, 2],
            [3, 4, 5],
            [6, 7, 8]])
 
-    >>> np.diag(x)
+    >>> np.diag(x0)
     array([0, 4, 8])
-    >>> np.diag(x, k=1)
+    >>> np.diag(x0, k=1)
     array([1, 5])
-    >>> np.diag(x, k=-1)
+    >>> np.diag(x0, k=-1)
     array([3, 7])
 
-    >>> np.diag(np.diag(x))
+    >>> np.diag(np.diag(x0))
     array([[0, 0, 0],
            [0, 4, 0],
            [0, 0, 8]])
 
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.diag(x0) # default case
+    >>> x, x.device, x.usm_type
+    (array([0, 4, 8]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.diag(x0, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([0, 4, 8]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.diag(x0, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([0, 4, 8]), Device(level_zero:gpu:0), 'host')
+
     """
 
     if not isinstance(k, int):
@@ -670,6 +962,26 @@ def diagflat(v, /, k=0, *, device=None, usm_type=None, sycl_queue=None):
 
     For full documentation refer to :obj:`numpy.diagflat`.
 
+    Parameters
+    ----------
+    v : array_like
+        Input data, which is flattened and set as the k-th diagonal of the output,
+        in any form that can be converted to an array. This includes scalars,
+        lists, lists of tuples, tuples, tuples of tuples, tuples of lists, and ndarrays.
+    k : int, optional
+        Diagonal to set; 0, the default, corresponds to the "main" diagonal,
+        a positive (negative) k giving the number of the diagonal above (below) the main.
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
     Returns
     -------
     out : dpnp.ndarray
@@ -689,19 +1001,42 @@ def diagflat(v, /, k=0, *, device=None, usm_type=None, sycl_queue=None):
     Examples
     --------
     >>> import dpnp as np
-    >>> x = np.array([[1,2], [3,4]])
-    >>> np.diagflat(x)
+    >>> x0 = np.array([[1, 2], [3, 4]])
+    >>> np.diagflat(x0)
     array([[1, 0, 0, 0],
            [0, 2, 0, 0],
            [0, 0, 3, 0],
            [0, 0, 0, 4]])
 
-    >>> np.diagflat(x, 1)
+    >>> np.diagflat(x0, 1)
     array([[0, 1, 0, 0, 0],
-        [0, 0, 2, 0, 0],
-        [0, 0, 0, 3, 0],
-        [0, 0, 0, 0, 4],
-        [0, 0, 0, 0, 0]])
+           [0, 0, 2, 0, 0],
+           [0, 0, 0, 3, 0],
+           [0, 0, 0, 0, 4],
+           [0, 0, 0, 0, 0]])
+
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.diagflat(x0) # default case
+    >>> x, x.device, x.usm_type
+    (array([[1, 0, 0, 0],
+            [0, 2, 0, 0],
+            [0, 0, 3, 0],
+            [0, 0, 0, 4]]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.diagflat(x0, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([[1, 0, 0, 0],
+            [0, 2, 0, 0],
+            [0, 0, 3, 0],
+            [0, 0, 0, 4]]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.diagflat(x0, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([[1, 0, 0, 0],
+            [0, 2, 0, 0],
+            [0, 0, 3, 0],
+            [0, 0, 0, 4]]), Device(level_zero:gpu:0), 'host')
 
     """
 
@@ -730,9 +1065,33 @@ def empty(
 
     For full documentation refer to :obj:`numpy.empty`.
 
+    Parameters
+    ----------
+    shape : {int, sequence of ints}
+        Shape of the new array, e.g., (2, 3) or 2.
+    dtype : dtype, optional
+        The desired dtype for the array, e.g., dpnp.int32. Default is the default floating point
+        data type for the device where the output array is allocated.
+    order : {"C", "F"}, optional
+        Memory layout of the newly output array. Default: "C".
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {"device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is "device".
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
+    Returns
+    -------
+    out : dpnp.ndarray
+        Array of uninitialized data of the given shape, dtype, and order.
+
     Limitations
     -----------
-    Parameter `order` is supported only with values ``"C"`` and ``"F"``.
     Parameter `like` is supported only with default value ``None``.
     Otherwise the function will be executed sequentially on CPU.
 
@@ -746,9 +1105,22 @@ def empty(
     Examples
     --------
     >>> import dpnp as np
-    >>> x = np.empty(4)
-    >>> print(x)
-    [0. 0. 0. 0.]
+    >>> np.empty(4)
+    array([9.03088525e-312, 9.03088525e-312, 9.03088525e-312, 9.03088525e-312])
+
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.empty((3, 3)) # default case
+    >>> x.shape, x.device, x.usm_type
+    ((3, 3), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.empty((3, 3), device="cpu")
+    >>> y.shape, y.device, y.usm_type
+    ((3, 3), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.empty((3, 3), usm_type="host")
+    >>> z.shape, z.device, z.usm_type
+    ((3, 3), Device(level_zero:gpu:0), 'host')
 
     """
 
@@ -770,7 +1142,7 @@ def empty(
 
 
 def empty_like(
-    x1,
+    a,
     /,
     *,
     dtype=None,
@@ -786,10 +1158,35 @@ def empty_like(
 
     For full documentation refer to :obj:`numpy.empty_like`.
 
+    Parameters
+    ----------
+    a : {dpnp_array, usm_ndarray}
+        The shape and dtype of `a` define these same attributes of the returned array.
+    dtype : dtype, optional
+        The desired dtype for the array, e.g., dpnp.int32. Default is the data type
+        of the input array `a`.
+    order : {"C", "F"}, optional
+        Memory layout of the newly output array. Default: "C".
+    shape : {None, int, sequence of ints}, optional
+        Overrides the shape of the result. If ``None`` (the default), the shape of `a` is used.
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
+    Returns
+    -------
+    out : dpnp.ndarray
+        Array of uninitialized data with the same shape and type as `a`.
+
     Limitations
     -----------
-    Parameter `x1` is supported as :class:`dpnp.dpnp.ndarray` or :class:`dpctl.tensor.usm_ndarray`
-    Parameter `order` is supported with values ``"C"`` or ``"F"``.
     Parameter `subok` is supported only with default value ``False``.
     Otherwise the function will be executed sequentially on CPU.
 
@@ -804,24 +1201,37 @@ def empty_like(
     --------
     >>> import dpnp as np
     >>> a = np.array([1, 2, 3])
-    >>> x = np.empty_like(a)
-    >>> [i for i in x]
-    [0, 0, 0]
+    >>> np.empty_like(a)
+    array([1, 2, 3])
+
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.empty_like(a) # default case
+    >>> x.shape, x.device, x.usm_type
+    ((3,), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.empty_like(a, device="cpu")
+    >>> y.shape, y.device, y.usm_type
+    ((3,), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.empty_like(a, usm_type="host")
+    >>> z.shape, z.device, z.usm_type
+    ((3,), Device(level_zero:gpu:0), 'host')
 
     """
 
-    if not isinstance(x1, (dpnp.ndarray, dpt.usm_ndarray)):
+    if not isinstance(a, (dpnp.ndarray, dpt.usm_ndarray)):
         pass
     elif order not in ("C", "c", "F", "f", None):
         pass
     elif subok is not False:
         pass
     else:
-        _shape = x1.shape if shape is None else shape
-        _dtype = x1.dtype if dtype is None else dtype
-        _usm_type = x1.usm_type if usm_type is None else usm_type
+        _shape = a.shape if shape is None else shape
+        _dtype = a.dtype if dtype is None else dtype
+        _usm_type = a.usm_type if usm_type is None else usm_type
         _sycl_queue = dpnp.get_normalized_queue_device(
-            x1, sycl_queue=sycl_queue, device=device
+            a, sycl_queue=sycl_queue, device=device
         )
         return dpnp_container.empty(
             _shape,
@@ -831,7 +1241,7 @@ def empty_like(
             sycl_queue=_sycl_queue,
         )
 
-    return call_origin(numpy.empty_like, x1, dtype, order, subok, shape)
+    return call_origin(numpy.empty_like, a, dtype, order, subok, shape)
 
 
 def eye(
@@ -852,6 +1262,37 @@ def eye(
 
     For full documentation refer to :obj:`numpy.eye`.
 
+    Parameters
+    ----------
+    N : int
+        Number of rows in the output.
+    M : int, optional
+        Number of columns in the output. If None, defaults to `N`.
+    k : int, optional
+        Index of the diagonal: 0 (the default) refers to the main diagonal,
+        a positive value refers to an upper diagonal, and a negative value to a lower diagonal.
+    dtype : dtype, optional
+        The desired dtype for the array, e.g., dpnp.int32. Default is the default floating point
+        data type for the device where the output array is allocated.
+    order : {"C", "F"}, optional
+        Memory layout of the newly output array. Default: "C".
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {"device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is "device".
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
+    Returns
+    -------
+    out : dpnp.ndarray
+        An array where all elements are equal to zero, except for the k-th diagonal,
+        whose values are equal to one.
+
     Limitations
     -----------
     Parameter `order` is supported only with values ``"C"`` and ``"F"``.
@@ -870,6 +1311,23 @@ def eye(
            [0.,  0.,  1.],
            [0.,  0.,  0.]])
 
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.eye(2, dtype=int) # default case
+    >>> x, x.device, x.usm_type
+    (array([[1, 0],
+            [0, 1]]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.eye(2, dtype=int, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([[1, 0],
+            [0, 1]]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.eye(2, dtype=int, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([[1, 0],
+            [0, 1]]), Device(level_zero:gpu:0), 'host')
+
     """
     if order not in ("C", "c", "F", "f", None):
         pass
@@ -990,6 +1448,34 @@ def full(
 
     For full documentation refer to :obj:`numpy.full`.
 
+    Parameters
+    ----------
+    shape : {int, sequence of ints}
+        Shape of the new array, e.g., (2, 3) or 2.
+    fill_value : {scalar, array_like}
+        Fill value, in any form that can be converted to an array. This includes scalars,
+        lists, lists of tuples, tuples, tuples of tuples, tuples of lists, and ndarrays.
+    dtype : dtype, optional
+        The desired dtype for the array, e.g., dpnp.int32. Default is the default floating point
+        data type for the device where input array is allocated.
+    order : {"C", "F"}, optional
+        Memory layout of the newly output array. Default: "C".
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
+    Returns
+    -------
+    out : dpnp.ndarray
+        Array of `fill_value` with the given shape, dtype, and order.
+
     Limitations
     -----------
     Parameter `order` is supported only with values ``"C"`` and ``"F"``.
@@ -1006,9 +1492,22 @@ def full(
     Examples
     --------
     >>> import dpnp as np
-    >>> x = np.full(4, 10)
-    >>> [i for i in x]
-    [10, 10, 10, 10]
+    >>> np.full(4, 10)
+    array([10, 10, 10, 10])
+
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.full(4, 10) # default case
+    >>> x, x.device, x.usm_type
+    (array([10, 10, 10, 10]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.full(4, 10, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([10, 10, 10, 10]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.full(4, 10, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([10, 10, 10, 10]), Device(level_zero:gpu:0), 'host')
 
     """
 
@@ -1031,7 +1530,7 @@ def full(
 
 
 def full_like(
-    x1,
+    a,
     /,
     fill_value,
     *,
@@ -1048,9 +1547,38 @@ def full_like(
 
     For full documentation refer to :obj:`numpy.full_like`.
 
+    Parameters
+    ----------
+    a : {dpnp_array, usm_ndarray}
+        The shape and dtype of `a` define these same attributes of the returned array.
+    fill_value : {scalar, array_like}
+        Fill value, in any form that can be converted to an array. This includes scalars,
+        lists, lists of tuples, tuples, tuples of tuples, tuples of lists, and ndarrays.
+    dtype : dtype, optional
+        The desired dtype for the array, e.g., dpnp.int32. Default is the data type
+        of the input array `a`.
+    order : {"C", "F"}, optional
+        Memory layout of the newly output array. Default: "C".
+    shape : {None, int, sequence of ints}, optional
+        Overrides the shape of the result. If ``None`` (the default), the shape of `a` is used.
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
+    Returns
+    -------
+    out : dpnp.ndarray
+        Array of `fill_value` with the same shape and type as `a`.
+
     Limitations
     -----------
-    Parameter `x1` is supported as :class:`dpnp.dpnp.ndarray` or :class:`dpctl.tensor.usm_ndarray`
     Parameter `order` is supported only with values ``"C"`` and ``"F"``.
     Parameter `subok` is supported only with default value ``False``.
     Otherwise the function will be executed sequentially on CPU.
@@ -1066,23 +1594,36 @@ def full_like(
     --------
     >>> import dpnp as np
     >>> a = np.arange(6)
-    >>> x = np.full_like(a, 1)
-    >>> [i for i in x]
-    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
+    >>> np.full_like(a, 1)
+    array([1, 1, 1, 1, 1, 1])
+
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.full_like(a, 1) # default case
+    >>> x, x.device, x.usm_type
+    (array([1, 1, 1, 1, 1, 1]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.full_like(a, 1, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([1, 1, 1, 1, 1, 1]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.full_like(a, 1, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([1, 1, 1, 1, 1, 1]), Device(level_zero:gpu:0), 'host')
 
     """
-    if not isinstance(x1, (dpnp.ndarray, dpt.usm_ndarray)):
+    if not isinstance(a, (dpnp.ndarray, dpt.usm_ndarray)):
         pass
     elif order not in ("C", "c", "F", "f", None):
         pass
     elif subok is not False:
         pass
     else:
-        _shape = x1.shape if shape is None else shape
-        _dtype = x1.dtype if dtype is None else dtype
-        _usm_type = x1.usm_type if usm_type is None else usm_type
+        _shape = a.shape if shape is None else shape
+        _dtype = a.dtype if dtype is None else dtype
+        _usm_type = a.usm_type if usm_type is None else usm_type
         _sycl_queue = dpnp.get_normalized_queue_device(
-            x1, sycl_queue=sycl_queue, device=device
+            a, sycl_queue=sycl_queue, device=device
         )
 
         return dpnp_container.full(
@@ -1093,14 +1634,14 @@ def full_like(
             usm_type=_usm_type,
             sycl_queue=_sycl_queue,
         )
-    return numpy.full_like(x1, fill_value, dtype, order, subok, shape)
+    return numpy.full_like(a, fill_value, dtype, order, subok, shape)
 
 
 def geomspace(
     start,
     stop,
     /,
-    num,
+    num=50,
     *,
     dtype=None,
     device=None,
@@ -1114,6 +1655,40 @@ def geomspace(
 
     For full documentation refer to :obj:`numpy.geomspace`.
 
+    Parameters
+    ----------
+    start : array_like
+        The starting value of the sequence, in any form that can be converted to an array.
+        This includes scalars, lists, lists of tuples, tuples, tuples of tuples,
+        tuples of lists, and ndarrays.
+    stop : array_like
+        The final value of the sequence, in any form that can be converted to an array.
+        This includes scalars, lists, lists of tuples, tuples, tuples of tuples,
+        tuples of lists, and ndarrays. If `endpoint` is ``False``, `num` + 1 values
+        are spaced over the interval in log-space, of which all but the last
+        (a sequence of length `num`) are returned.
+    num : int, optional
+        Number of samples to generate. Default is 50.
+    dtype : dtype, optional
+        The desired dtype for the array. If not given, a default dtype will be used that can represent
+        the values (by considering Promotion Type Rule and device capabilities when necessary.)
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), an OneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+    endpoint : bool, optional
+        If ``True``, `stop` is the last sample. Otherwise, it is not included. Default is ``True``.
+    axis : int, optional
+        The axis in the result to store the samples. Relevant only if start or stop are array-like.
+        By default (0), the samples will be along a new axis inserted at the beginning.
+        Use -1 to get an axis at the end.
+
     Returns
     -------
     out : dpnp.ndarray
@@ -1150,6 +1725,20 @@ def geomspace(
     >>> np.geomspace(-1000, -1, num=4)
     array([-1000.,  -100.,   -10.,    -1.])
 
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.geomspace(1000, 1, num=4) # default case
+    >>> x, x.device, x.usm_type
+    (array([1000.,  100.,   10.,    1.]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.geomspace(1000, 1, num=4, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([1000.,  100.,   10.,    1.]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.geomspace(1000, 1, num=4, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([1000.,  100.,   10.,    1.]), Device(level_zero:gpu:0), 'host')
+
     """
 
     return dpnp_geomspace(
@@ -1182,6 +1771,24 @@ def identity(
 
     For full documentation refer to :obj:`numpy.identity`.
 
+    Parameters
+    ----------
+    n : int
+        Number of rows (and columns) in `n` x `n` output.
+    dtype : dtype, optional
+        The desired dtype for the array, e.g., dpnp.int32. Default is the default floating point
+        data type for the device where input array is allocated.
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), an OneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {"device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is "device".
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
     Returns
     -------
     out : dpnp.ndarray
@@ -1207,21 +1814,41 @@ def identity(
            [0.,  1.,  0.],
            [0.,  0.,  1.]])
 
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.identity(3) # default case
+    >>> x, x.device, x.usm_type
+    (array([[1.,  0.,  0.],
+            [0.,  1.,  0.],
+            [0.,  0.,  1.]]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.identity(3, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([[1.,  0.,  0.],
+            [0.,  1.,  0.],
+            [0.,  0.,  1.]]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.identity(3, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([[1.,  0.,  0.],
+            [0.,  1.,  0.],
+            [0.,  0.,  1.]]), Device(level_zero:gpu:0), 'host')
+
     """
-    if not use_origin_backend():
-        if like is not None:
-            pass
-        elif n < 0:
-            raise ValueError("negative dimensions are not allowed")
-        else:
-            _dtype = dpnp.default_float_type() if dtype is None else dtype
-            return dpnp.eye(
-                n,
-                dtype=_dtype,
-                device=device,
-                usm_type=usm_type,
-                sycl_queue=sycl_queue,
-            )
+
+    if like is not None:
+        pass
+    elif n < 0:
+        raise ValueError("negative dimensions are not allowed")
+    else:
+        _dtype = dpnp.default_float_type() if dtype is None else dtype
+        return dpnp.eye(
+            n,
+            dtype=_dtype,
+            device=device,
+            usm_type=usm_type,
+            sycl_queue=sycl_queue,
+        )
     return call_origin(numpy.identity, n, dtype=dtype, like=like)
 
 
@@ -1244,6 +1871,39 @@ def linspace(
 
     For full documentation refer to :obj:`numpy.linspace`.
 
+    Parameters
+    ----------
+    start : array_like
+        The starting value of the sequence, in any form that can be converted to an array.
+        This includes scalars, lists, lists of tuples, tuples, tuples of tuples,
+        tuples of lists, and ndarrays.
+    stop : array_like
+        The end value of the sequence, in any form that can be converted to an array.
+        This includes scalars, lists, lists of tuples, tuples, tuples of tuples,
+        tuples of lists, and ndarrays. If `endpoint` is set to ``False``, the sequence consists
+        of all but the last of `num` + 1 evenly spaced samples, so that `stop` is excluded.
+    dtype : dtype, optional
+        The desired dtype for the array. If not given, a default dtype will be used that can represent
+        the values (by considering Promotion Type Rule and device capabilities when necessary.)
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), an OneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+    endpoint : bool, optional
+        If ``True``, `stop` is the last sample. Otherwise, it is not included. Default is ``True``.
+    retstep : bool, optional
+        If ``True``, return (samples, step), where step is the spacing between samples.
+    axis : int, optional
+        The axis in the result to store the samples. Relevant only if start or stop are array-like.
+        By default (0), the samples will be along a new axis inserted at the beginning.
+        Use -1 to get an axis at the end.
+
     Returns
     -------
     out : dpnp.ndarray
@@ -1275,6 +1935,20 @@ def linspace(
     >>> np.linspace(2.0, 3.0, num=5, retstep=True)
     (array([2.  , 2.25, 2.5 , 2.75, 3.  ]), array(0.25))
 
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.linspace(2.0, 3.0, num=3) # default case
+    >>> x, x.device, x.usm_type
+    (array([2. , 2.5, 3. ]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.linspace(2.0, 3.0, num=3, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([2. , 2.5, 3. ]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.linspace(2.0, 3.0, num=3, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([2. , 2.5, 3. ]), Device(level_zero:gpu:0), 'host')
+
     """
 
     return dpnp_linspace(
@@ -1336,6 +2010,47 @@ def logspace(
 
     For full documentation refer to :obj:`numpy.logspace`.
 
+    Parameters
+    ----------
+    start : array_like
+        Input data, in any form that can be converted to an array. This includes scalars,
+        lists, lists of tuples, tuples, tuples of tuples, tuples of lists, and ndarrays.
+        `base` ** `start` is the starting value of the sequence.
+    stop : array_like
+        Input data, in any form that can be converted to an array. This includes scalars,
+        lists, lists of tuples, tuples, tuples of tuples, tuples of lists, and ndarrays.
+        `base` ** `stop` is the final value of the sequence, unless `endpoint` is ``False``.
+        In that case, num + 1 values are spaced over the interval in log-space,
+        of which all but the last (a sequence of length num) are returned.
+    num : int, optional
+        Number of samples to generate. Default is 50.
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), an OneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+    endpoint : bool, optional
+        If ``True``, `stop` is the last sample. Otherwise, it is not included. Default is ``True``.
+    base : array_like, optional
+        The base of the log space, in any form that can be converted to an array.
+        This includes scalars, lists, lists of tuples, tuples, tuples of tuples,
+        tuples of lists, and ndarrays.
+        The `step` size between the elements in ln(samples) / ln(base)
+        (or log_base(samples)) is uniform.
+        Default is 10.0.
+    dtype : dtype, optional
+        The desired dtype for the array. If not given, a default dtype will be used that can represent
+        the values (by considering Promotion Type Rule and device capabilities when necessary.)
+    axis : int, optional
+        The axis in the result to store the samples. Relevant only if start, stop,
+        or base are array-like. By default (0), the samples will be along a new axis inserted
+        at the beginning. Use -1 to get an axis at the end.
+
     Returns
     -------
     out: dpnp.ndarray
@@ -1368,6 +2083,20 @@ def logspace(
     array([[ 4.        ,  5.0396842 ,  6.34960421,  8.        ],
            [ 9.        , 12.98024613, 18.72075441, 27.        ]])
 
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.logspace(1.0, 3.0, num=3) # default case
+    >>> x, x.device, x.usm_type
+    (array([  10.,  100., 1000.]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.logspace(1.0, 3.0, num=3, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([  10.,  100., 1000.]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.logspace(1.0, 3.0, num=3, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([  10.,  100., 1000.]), Device(level_zero:gpu:0), 'host')
+
     """
 
     return dpnp_logspace(
@@ -1492,7 +2221,7 @@ class MGridClass:
         an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
         :obj:`dpnp.dpnp_array.dpnp_array.device` property.
     usm_type : {"device", "shared", "host"}, optional
-        The type of SYCL USM allocation for the output array.
+        The type of SYCL USM allocation for the output array. Default is "device".
     sycl_queue : {None, SyclQueue}, optional
         A SYCL queue to use for output array allocation and copying.
 
@@ -1516,17 +2245,19 @@ class MGridClass:
             [0, 1, 2, 3, 4],
             [0, 1, 2, 3, 4]]])
 
-    >>> x = np.mgrid[-1:1:5j]
-    >>> x
-    array([-1. , -0.5,  0. ,  0.5,  1. ])
-    >>> x.usm_type
-    'device'
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.mgrid[-1:1:5j] # default case
+    >>> x, x.device, x.usm_type
+    (array([-1. , -0.5,  0. ,  0.5,  1. ]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.mgrid(device="cpu")[-1:1:5j]
+    >>> y, y.device, y.usm_type
+    (array([-1. , -0.5,  0. ,  0.5,  1. ]), Device(opencl:cpu:0), 'device')
 
-    >>> y = np.mgrid(usm_type="host")[-1:1:5j]
-    >>> y
-    array([-1. , -0.5,  0. ,  0.5,  1. ])
-    >>> x.usm_type
-    'host'
+    >>> z = np.mgrid(usm_type="host")[-1:1:5j]
+    >>> z, z.device, z.usm_type
+    (array([-1. , -0.5,  0. ,  0.5,  1. ]), Device(level_zero:gpu:0), 'host')
 
     """
 
@@ -1560,7 +2291,7 @@ class OGridClass:
         an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
         :obj:`dpnp.dpnp_array.dpnp_array.device` property.
     usm_type : {"device", "shared", "host"}, optional
-        The type of SYCL USM allocation for the output array.
+        The type of SYCL USM allocation for the output array. Default is "device".
     sycl_queue : {None, SyclQueue}, optional
         A SYCL queue to use for output array allocation and copying.
 
@@ -1580,17 +2311,19 @@ class OGridClass:
             [3],
             [4]]), array([[0, 1, 2, 3, 4]])]
 
-    >>> x = np.ogrid[-1:1:5j]
-    >>> x
-    array([-1. , -0.5,  0. ,  0.5,  1. ])
-    >>> x.usm_type
-    'device'
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.ogrid[-1:1:5j] # default case
+    >>> x, x.device, x.usm_type
+    (array([-1. , -0.5,  0. ,  0.5,  1. ]), Device(level_zero:gpu:0), 'device')
 
-    >>> y = np.ogrid(usm_type="host")[-1:1:5j]
-    >>> y
-    array([-1. , -0.5,  0. ,  0.5,  1. ])
-    >>> x.usm_type
-    'host'
+    >>> y = np.ogrid(device="cpu")[-1:1:5j]
+    >>> y, y.device, y.usm_type
+    (array([-1. , -0.5,  0. ,  0.5,  1. ]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.ogrid(usm_type="host")[-1:1:5j]
+    >>> z, z.device, z.usm_type
+    (array([-1. , -0.5,  0. ,  0.5,  1. ]), Device(level_zero:gpu:0), 'host')
 
     """
 
@@ -1621,6 +2354,31 @@ def ones(
 
     For full documentation refer to :obj:`numpy.ones`.
 
+    Parameters
+    ----------
+    shape : {int, sequence of ints}
+        Shape of the new array, e.g., (2, 3) or 2.
+    dtype : dtype, optional
+        The desired dtype for the array, e.g., dpnp.int32. Default is the default floating point
+        data type for the device where input array is allocated.
+    order : {"C", "F"}, optional
+        Memory layout of the newly output array. Default: "C".
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), an OneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {"device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is "device".
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
+    Returns
+    -------
+    out : dpnp.ndarray
+        Array of ones with the given shape, dtype, and order.
+
     Limitations
     -----------
     Parameter `order` is supported only with values ``"C"`` and ``"F"``.
@@ -1637,13 +2395,28 @@ def ones(
     Examples
     --------
     >>> import dpnp as np
-    >>> [i for i in np.ones(5)]
-    [1.0, 1.0, 1.0, 1.0, 1.0]
+    >>> np.ones(5)
+    array([1., 1., 1., 1., 1.])
     >>> x = np.ones((2, 1))
     >>> x.ndim, x.size, x.shape
     (2, 2, (2, 1))
-    >>> [i for i in x]
-    [1.0, 1.0]
+    >>> x
+    array([[1.],
+           [1.]])
+
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.ones(3) # default case
+    >>> x, x.device, x.usm_type
+    (array([1., 1., 1.]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.ones(3, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([1., 1., 1.]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.ones(3, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([1., 1., 1.]), Device(level_zero:gpu:0), 'host')
 
     """
 
@@ -1665,7 +2438,7 @@ def ones(
 
 
 def ones_like(
-    x1,
+    a,
     /,
     *,
     dtype=None,
@@ -1681,10 +2454,35 @@ def ones_like(
 
     For full documentation refer to :obj:`numpy.ones_like`.
 
+    Parameters
+    ----------
+    a : {dpnp_array, usm_ndarray}
+        The shape and dtype of `a` define these same attributes of the returned array.
+    dtype : dtype, optional
+        The desired dtype for the array, e.g., dpnp.int32.
+        Default is the data type of `a`.
+    order : {"C", "F"}, optional
+        Memory layout of the newly output array. Default: "C".
+    shape : {None, int, sequence of ints}, optional
+        Overrides the shape of the result.
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), an OneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
+    Returns
+    -------
+    out : dpnp.ndarray
+        Array of ones with the same shape and type as `a`.
+
     Limitations
     -----------
-    Parameter `x1` is supported as :class:`dpnp.dpnp.ndarray` or :class:`dpctl.tensor.usm_ndarray`
-    Parameter `order` is supported with values ``"C"`` or ``"F"``.
     Parameter `subok` is supported only with default value ``False``.
     Otherwise the function will be executed sequentially on CPU.
 
@@ -1698,25 +2496,39 @@ def ones_like(
     Examples
     --------
     >>> import dpnp as np
-    >>> x = np.arange(6)
-    >>> [i for i in x]
-    [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
-    >>> [i for i in np.ones_like(x)]
-    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
+    >>> x0 = np.arange(6)
+    >>> x0
+    array([0, 1, 2, 3, 4, 5])
+    >>> np.ones_like(x0)
+    array([1, 1, 1, 1, 1, 1])
+
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.ones_like(x0) # default case
+    >>> x, x.device, x.usm_type
+    (array([1, 1, 1, 1, 1, 1]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.ones_like(x0, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([1, 1, 1, 1, 1, 1]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.ones_like(x0, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([1, 1, 1, 1, 1, 1]), Device(level_zero:gpu:0), 'host')
 
     """
-    if not isinstance(x1, (dpnp.ndarray, dpt.usm_ndarray)):
+    if not isinstance(a, (dpnp.ndarray, dpt.usm_ndarray)):
         pass
     elif order not in ("C", "c", "F", "f", None):
         pass
     elif subok is not False:
         pass
     else:
-        _shape = x1.shape if shape is None else shape
-        _dtype = x1.dtype if dtype is None else dtype
-        _usm_type = x1.usm_type if usm_type is None else usm_type
+        _shape = a.shape if shape is None else shape
+        _dtype = a.dtype if dtype is None else dtype
+        _usm_type = a.usm_type if usm_type is None else usm_type
         _sycl_queue = dpnp.get_normalized_queue_device(
-            x1, sycl_queue=sycl_queue, device=device
+            a, sycl_queue=sycl_queue, device=device
         )
         return dpnp_container.ones(
             _shape,
@@ -1726,7 +2538,7 @@ def ones_like(
             sycl_queue=_sycl_queue,
         )
 
-    return call_origin(numpy.ones_like, x1, dtype, order, subok, shape)
+    return call_origin(numpy.ones_like, a, dtype, order, subok, shape)
 
 
 def trace(x1, offset=0, axis1=0, axis2=1, dtype=None, out=None):
@@ -1778,9 +2590,32 @@ def tri(
 
     For full documentation refer to :obj:`numpy.tri`.
 
+    Parameters
+    ----------
+    N : int
+        Number of rows in the array.
+    M : int, optional
+        Number of columns in the array. By default, `M` is taken equal to `N`.
+    k : int, optional
+        The sub-diagonal at and below which the array is filled. k = 0 is the main diagonal,
+        while k < 0 is below it, and k > 0 is above. The default is 0.
+    dtype : dtype, optional
+        The desired dtype for the array, e.g., dpnp.int32. Default is the default floating point
+        data type for the device where input array is allocated.
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), an OneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {"device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is "device".
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
     Returns
     -------
-    out : ndarray of shape (N, M)
+    out : dpnp.ndarray of shape (N, M)
         Array with its lower triangle filled with ones and zeros elsewhere.
 
     Limitations
@@ -1807,43 +2642,60 @@ def tri(
            [1.,  0.,  0.,  0.,  0.],
            [1.,  1.,  0.,  0.,  0.]])
 
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.tri(3, 2) # default case
+    >>> x, x.device, x.usm_type
+    (array([[1., 0.],
+            [1., 1.],
+            [1., 1.]]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.tri(3, 2, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([[1., 0.],
+            [1., 1.],
+            [1., 1.]]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.tri(3, 2, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([[1., 0.],
+            [1., 1.],
+            [1., 1.]]), Device(level_zero:gpu:0), 'host')
+
     """
 
-    if not use_origin_backend():
-        if len(kwargs) != 0:
-            pass
-        elif not isinstance(N, int):
-            pass
-        elif N < 0:
-            pass
-        elif M is not None and not isinstance(M, int):
-            pass
-        elif M is not None and M < 0:
-            pass
-        elif not isinstance(k, int):
-            pass
-        else:
-            _dtype = (
-                dpnp.default_float_type()
-                if dtype in (dpnp.float, None)
-                else dtype
-            )
-            if M is None:
-                M = N
-
-            m = dpnp.ones(
-                (N, M),
-                dtype=_dtype,
-                device=device,
-                usm_type=usm_type,
-                sycl_queue=sycl_queue,
-            )
-            return dpnp.tril(m, k=k)
+    if len(kwargs) != 0:
+        pass
+    elif not isinstance(N, int):
+        pass
+    elif N < 0:
+        pass
+    elif M is not None and not isinstance(M, int):
+        pass
+    elif M is not None and M < 0:
+        pass
+    elif not isinstance(k, int):
+        pass
+    else:
+        _dtype = (
+            dpnp.default_float_type() if dtype in (dpnp.float, None) else dtype
+        )
+        if M is None:
+            M = N
+
+        m = dpnp.ones(
+            (N, M),
+            dtype=_dtype,
+            device=device,
+            usm_type=usm_type,
+            sycl_queue=sycl_queue,
+        )
+        return dpnp.tril(m, k=k)
 
     return call_origin(numpy.tri, N, M, k, dtype, **kwargs)
 
 
-def tril(x1, /, *, k=0):
+def tril(m, /, *, k=0):
     """
     Lower triangle of an array.
 
@@ -1851,16 +2703,29 @@ def tril(x1, /, *, k=0):
 
     For full documentation refer to :obj:`numpy.tril`.
 
+    Parameters
+    ----------
+    m : {dpnp_array, usm_ndarray}, shape (…, M, N)
+        Input array.
+    k : int, optional
+        Diagonal above which to zero elements. k = 0 (the default) is the main diagonal,
+        k < 0 is below it and k > 0 is above.
+
+    Returns
+    -------
+    out : dpnp.ndarray of shape (…, M, N)
+        Lower triangle of `m`, of same shape and dtype as `m`.
+
     Limitations
     -----------
-    Parameter `x1` is supported as :class:`dpnp.dpnp.ndarray` or :class:`dpctl.tensor.usm_ndarray` with two or more dimensions.
     Parameter `k` is supported only of integer data type.
     Otherwise the function will be executed sequentially on CPU.
 
     Examples
     --------
     >>> import dpnp as np
-    >>> np.tril([[1,2,3],[4,5,6],[7,8,9],[10,11,12]], -1)
+    >>> m = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
+    >>> np.tril(m, k=-1)
     array([[ 0,  0,  0],
            [ 4,  0,  0],
            [ 7,  8,  0],
@@ -1874,19 +2739,19 @@ def tril(x1, /, *, k=0):
     except TypeError:
         pass
 
-    if not isinstance(x1, (dpnp.ndarray, dpt.usm_ndarray)):
+    if not isinstance(m, (dpnp.ndarray, dpt.usm_ndarray)):
         pass
-    elif x1.ndim < 2:
+    elif m.ndim < 2:
         pass
     elif _k is None:
         pass
     else:
-        return dpnp_container.tril(x1, k=_k)
+        return dpnp_container.tril(m, k=_k)
 
-    return call_origin(numpy.tril, x1, k)
+    return call_origin(numpy.tril, m, k)
 
 
-def triu(x1, /, *, k=0):
+def triu(m, /, *, k=0):
     """
     Upper triangle of an array.
 
@@ -1895,16 +2760,29 @@ def triu(x1, /, *, k=0):
 
     For full documentation refer to :obj:`numpy.triu`.
 
+    Parameters
+    ----------
+    m : {dpnp_array, usm_ndarray}, shape (…, M, N)
+        Input array.
+    k : int, optional
+        Diagonal below which to zero elements. k = 0 (the default) is the main diagonal,
+        k < 0 is below it and k > 0 is above.
+
+    Returns
+    -------
+    out : dpnp.ndarray of shape (…, M, N)
+        Upper triangle of `m`, of same shape and dtype as `m`.
+
     Limitations
     -----------
-    Parameter `x1` is supported as :class:`dpnp.dpnp.ndarray` or :class:`dpctl.tensor.usm_ndarray` with two or more dimensions.
     Parameter `k` is supported only of integer data type.
     Otherwise the function will be executed sequentially on CPU.
 
     Examples
     --------
     >>> import dpnp as np
-    >>> np.triu([[1,2,3],[4,5,6],[7,8,9],[10,11,12]], -1)
+    >>> m = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
+    >>> np.triu(m, k=-1)
     array([[ 1,  2,  3],
            [ 4,  5,  6],
            [ 0,  8,  9],
@@ -1918,20 +2796,20 @@ def triu(x1, /, *, k=0):
     except TypeError:
         pass
 
-    if not isinstance(x1, (dpnp.ndarray, dpt.usm_ndarray)):
+    if not isinstance(m, (dpnp.ndarray, dpt.usm_ndarray)):
         pass
-    elif x1.ndim < 2:
+    elif m.ndim < 2:
         pass
     elif _k is None:
         pass
     else:
-        return dpnp_container.triu(x1, k=_k)
+        return dpnp_container.triu(m, k=_k)
 
-    return call_origin(numpy.triu, x1, k)
+    return call_origin(numpy.triu, m, k)
 
 
 def vander(
-    x1,
+    x,
     /,
     N=None,
     increasing=False,
@@ -1945,6 +2823,27 @@ def vander(
 
     For full documentation refer to :obj:`numpy.vander`.
 
+    Parameters
+    ----------
+    x : array_like
+        1-D input array, in any form that can be converted to an array. This includes scalars,
+        lists, lists of tuples, tuples, tuples of tuples, tuples of lists, and ndarrays.
+    N : int, optional
+        Number of columns in the output. If `N` is not specified, a square array is returned (N = len(x)).
+    increasing : bool, optional
+        Order of the powers of the columns. If ``True``, the powers increase from left to right,
+        if ``False`` (the default) they are reversed.
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), an OneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
     Returns
     -------
     out : dpnp.ndarray
@@ -1958,51 +2857,71 @@ def vander(
     Examples
     --------
     >>> import dpnp as np
-    >>> x = np.array([1, 2, 3, 5])
+    >>> x0 = np.array([1, 2, 3, 5])
     >>> N = 3
-    >>> np.vander(x, N)
+    >>> np.vander(x0, N)
     array([[ 1,  1,  1],
            [ 4,  2,  1],
            [ 9,  3,  1],
            [25,  5,  1]])
 
-    >>> x = np.array([1, 2, 3, 5])
-    >>> np.vander(x)
+    >>> np.vander(x0)
     array([[  1,   1,   1,   1],
            [  8,   4,   2,   1],
            [ 27,   9,   3,   1],
            [125,  25,   5,   1]])
 
-    >>> np.vander(x, increasing=True)
+    >>> np.vander(x0, increasing=True)
     array([[  1,   1,   1,   1],
            [  1,   2,   4,   8],
            [  1,   3,   9,  27],
            [  1,   5,  25, 125]])
+
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.vander(x0) # default case
+    >>> x, x.device, x.usm_type
+    (array([[  1,   1,   1,   1],
+            [  8,   4,   2,   1],
+            [ 27,   9,   3,   1],
+            [125,  25,   5,   1]]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.vander(x0, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([[  1,   1,   1,   1],
+            [  8,   4,   2,   1],
+            [ 27,   9,   3,   1],
+            [125,  25,   5,   1]]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.vander(x0, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([[  1,   1,   1,   1],
+            [  8,   4,   2,   1],
+            [ 27,   9,   3,   1],
+            [125,  25,   5,   1]]), Device(level_zero:gpu:0), 'host')
     """
 
-    x1 = dpnp.asarray(
-        x1, device=device, usm_type=usm_type, sycl_queue=sycl_queue
-    )
+    x = dpnp.asarray(x, device=device, usm_type=usm_type, sycl_queue=sycl_queue)
 
     if N is not None and not isinstance(N, int):
         raise TypeError("An integer is required, but got {}".format(type(N)))
-    elif x1.ndim != 1:
-        raise ValueError("x1 must be a one-dimensional array or sequence.")
+    elif x.ndim != 1:
+        raise ValueError("`x` must be a one-dimensional array or sequence.")
     else:
         if N is None:
-            N = x1.size
+            N = x.size
 
-        _dtype = int if x1.dtype == bool else x1.dtype
+        _dtype = int if x.dtype == bool else x.dtype
         m = empty(
-            (x1.size, N),
+            (x.size, N),
             dtype=_dtype,
-            usm_type=x1.usm_type,
-            sycl_queue=x1.sycl_queue,
+            usm_type=x.usm_type,
+            sycl_queue=x.sycl_queue,
         )
         tmp = m[:, ::-1] if not increasing else m
         dpnp.power(
-            x1.reshape(-1, 1),
-            dpnp.arange(N, dtype=_dtype, sycl_queue=x1.sycl_queue),
+            x.reshape(-1, 1),
+            dpnp.arange(N, dtype=_dtype, sycl_queue=x.sycl_queue),
             out=tmp,
         )
 
@@ -2024,6 +2943,31 @@ def zeros(
 
     For full documentation refer to :obj:`numpy.zeros`.
 
+    Parameters
+    ----------
+    shape : {int, sequence of ints}
+        Shape of the new array, e.g., (2, 3) or 2.
+    dtype : dtype, optional
+        The desired dtype for the array, e.g., dpnp.int32. Default is the default floating point
+        data type for the device where the output array is allocated.
+    order : {"C", "F"}, optional
+        Memory layout of the newly output array. Default: "C".
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {"device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is "device".
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
+    Returns
+    -------
+    out : dpnp.ndarray
+        Array of zeros with the given shape, dtype, and order.
+
     Limitations
     -----------
     Parameter `order` is supported only with values ``"C"`` and ``"F"``.
@@ -2040,13 +2984,28 @@ def zeros(
     Examples
     --------
     >>> import dpnp as np
-    >>> [i for i in np.zeros(5)]
-    [0.0, 0.0, 0.0, 0.0, 0.0]
+    >>> np.zeros(5)
+    array([0., 0., 0., 0., 0.])
     >>> x = np.zeros((2, 1))
     >>> x.ndim, x.size, x.shape
     (2, 2, (2, 1))
-    >>> [i for i in x]
-    [0.0, 0.0]
+    >>> x
+    array([[0.],
+           [0.]])
+
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.zeros(3) # default case
+    >>> x, x.device, x.usm_type
+    (array([0., 0., 0.]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.zeros(3, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([0., 0., 0.]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.zeros(3, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([0., 0., 0.]), Device(level_zero:gpu:0), 'host')
 
     """
     if like is not None:
@@ -2067,7 +3026,7 @@ def zeros(
 
 
 def zeros_like(
-    x1,
+    a,
     /,
     *,
     dtype=None,
@@ -2083,10 +3042,35 @@ def zeros_like(
 
     For full documentation refer to :obj:`numpy.zeros_like`.
 
+    Parameters
+    ----------
+    a : {dpnp_array, usm_ndarray}
+        The shape and dtype of `a` define these same attributes of the returned array.
+    dtype : dtype, optional
+        The desired dtype for the array, e.g., dpnp.int32. Default is ``None``, in which case
+        the dtype of `a` is used.
+    order : {"C", "F"}, optional
+        Memory layout of the newly output array. Default: "C".
+    shape : {None, int, sequence of ints}, optional
+        Overrides the shape of the result. When ``None``, the shape of `a` is used.
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where the output array is created.
+        The `device` can be ``None`` (the default), a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device,
+        an instance of :class:`dpctl.SyclQueue`, or a `Device` object returned by
+        :obj:`dpnp.dpnp_array.dpnp_array.device` property.
+    usm_type : {None, "device", "shared", "host"}, optional
+        The type of SYCL USM allocation for the output array. Default is ``None``, in which case the usm_type of `a` is used.
+    sycl_queue : {None, SyclQueue}, optional
+        A SYCL queue to use for output array allocation and copying.
+
+    Returns
+    -------
+    out : dpnp.ndarray
+        Array of zeros with the same shape and type as `a`.
+
     Limitations
     -----------
-    Parameter `x1` is supported as :class:`dpnp.dpnp.ndarray` or :class:`dpctl.tensor.usm_ndarray`
-    Parameter `order` is supported with values ``"C"`` or ``"F"``.
     Parameter `subok` is supported only with default value ``False``.
     Otherwise the function will be executed sequentially on CPU.
 
@@ -2100,25 +3084,39 @@ def zeros_like(
     Examples
     --------
     >>> import dpnp as np
-    >>> x = np.arange(6)
-    >>> [i for i in x]
-    [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
-    >>> [i for i in np.zeros_like(x)]
-    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+    >>> x0 = np.arange(6)
+    >>> x0
+    array([0, 1, 2, 3, 4, 5])
+    >>> np.zeros_like(x0)
+    array([0, 0, 0, 0, 0, 0])
+
+    Creating an array on a different device or with a specified usm_type
+
+    >>> x = np.zeros_like(x0) # default case
+    >>> x, x.device, x.usm_type
+    (array([0, 0, 0, 0, 0, 0]), Device(level_zero:gpu:0), 'device')
+
+    >>> y = np.zeros_like(x0, device="cpu")
+    >>> y, y.device, y.usm_type
+    (array([0, 0, 0, 0, 0, 0]), Device(opencl:cpu:0), 'device')
+
+    >>> z = np.zeros_like(x0, usm_type="host")
+    >>> z, z.device, z.usm_type
+    (array([0, 0, 0, 0, 0, 0]), Device(level_zero:gpu:0), 'host')
 
     """
-    if not isinstance(x1, (dpnp.ndarray, dpt.usm_ndarray)):
+    if not isinstance(a, (dpnp.ndarray, dpt.usm_ndarray)):
         pass
     elif order not in ("C", "c", "F", "f", None):
         pass
     elif subok is not False:
         pass
     else:
-        _shape = x1.shape if shape is None else shape
-        _dtype = x1.dtype if dtype is None else dtype
-        _usm_type = x1.usm_type if usm_type is None else usm_type
+        _shape = a.shape if shape is None else shape
+        _dtype = a.dtype if dtype is None else dtype
+        _usm_type = a.usm_type if usm_type is None else usm_type
         _sycl_queue = dpnp.get_normalized_queue_device(
-            x1, sycl_queue=sycl_queue, device=device
+            a, sycl_queue=sycl_queue, device=device
         )
         return dpnp_container.zeros(
             _shape,
@@ -2128,4 +3126,4 @@ def zeros_like(
             sycl_queue=_sycl_queue,
         )
 
-    return call_origin(numpy.zeros_like, x1, dtype, order, subok, shape)
+    return call_origin(numpy.zeros_like, a, dtype, order, subok, shape)
diff --git a/tests/test_arraycreation.py b/tests/test_arraycreation.py
index f7b06ffc9be..267842c749c 100644
--- a/tests/test_arraycreation.py
+++ b/tests/test_arraycreation.py
@@ -880,6 +880,42 @@ def test_logspace_axis(axis):
     assert_dtype_allclose(func(dpnp), func(numpy))
 
 
+@pytest.mark.parametrize(
+    "data", [(), 1, (2, 3), [4], numpy.array(5), numpy.array([6, 7])]
+)
+def test_ascontiguousarray(data):
+    result = dpnp.ascontiguousarray(data)
+    expected = numpy.ascontiguousarray(data)
+    assert_dtype_allclose(result, expected)
+    assert result.shape == expected.shape
+
+
+@pytest.mark.parametrize("data", [(), 1, (2, 3), [4]])
+def test_ascontiguousarray1(data):
+    result = dpnp.ascontiguousarray(dpnp.array(data))
+    expected = numpy.ascontiguousarray(numpy.array(data))
+    assert_dtype_allclose(result, expected)
+    assert result.shape == expected.shape
+
+
+@pytest.mark.parametrize(
+    "data", [(), 1, (2, 3), [4], numpy.array(5), numpy.array([6, 7])]
+)
+def test_asfortranarray(data):
+    result = dpnp.asfortranarray(data)
+    expected = numpy.asfortranarray(data)
+    assert_dtype_allclose(result, expected)
+    assert result.shape == expected.shape
+
+
+@pytest.mark.parametrize("data", [(), 1, (2, 3), [4]])
+def test_asfortranarray1(data):
+    result = dpnp.asfortranarray(dpnp.array(data))
+    expected = numpy.asfortranarray(numpy.array(data))
+    assert_dtype_allclose(result, expected)
+    assert result.shape == expected.shape
+
+
 def test_meshgrid_raise_error():
     a = numpy.array([1, 2, 3, 4])
     with pytest.raises(TypeError):
diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py
index 205d4efb572..78a869fac9d 100644
--- a/tests/test_sycl_queue.py
+++ b/tests/test_sycl_queue.py
@@ -141,6 +141,7 @@ def test_empty_like(device_x, device_y):
 @pytest.mark.parametrize(
     "func, args, kwargs",
     [
+        pytest.param("copy", ["x0"], {}),
         pytest.param("diag", ["x0"], {}),
         pytest.param("full_like", ["x0"], {"fill_value": 5}),
         pytest.param("geomspace", ["x0[0:3]", "8", "4"], {}),
@@ -225,6 +226,7 @@ def test_array_creation_follow_device_2d_array(func, args, kwargs, device):
 @pytest.mark.parametrize(
     "func, args, kwargs",
     [
+        pytest.param("copy", ["x0"], {}),
         pytest.param("diag", ["x0"], {}),
         pytest.param("full", ["10", "x0[3]"], {}),
         pytest.param("full_like", ["x0"], {"fill_value": 5}),
diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py
index bff548a90d0..5a29e677747 100644
--- a/tests/test_usm_type.py
+++ b/tests/test_usm_type.py
@@ -140,6 +140,7 @@ def test_coerced_usm_types_power(usm_type_x, usm_type_y):
 @pytest.mark.parametrize(
     "func, args",
     [
+        pytest.param("copy", ["x0"]),
         pytest.param("diag", ["x0"]),
         pytest.param("empty_like", ["x0"]),
         pytest.param("full", ["10", "x0[3]"]),

From ac1fca74c6bfbf6d65a9f0f80309cc0e89c0f8f5 Mon Sep 17 00:00:00 2001
From: vtavana <120411540+vtavana@users.noreply.github.com>
Date: Tue, 6 Feb 2024 06:06:22 -0600
Subject: [PATCH 18/29] update `dpnp.dot` implementation (#1669)

* dot_func

* using mkl::dotu instead of mkl::dotc for complex

* fix a test

* fix negative strides

* add a temporary workaround

* address comments

* add a TODO comment

* call dpt.vecdot for integer data types

* update doc string

* pass argument by reference

* update doc to add boolean dtype

---------

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/backend/extensions/blas/CMakeLists.txt   |   4 +-
 dpnp/backend/extensions/blas/blas_py.cpp      |  21 +-
 dpnp/backend/extensions/blas/dot.cpp          | 238 +++++++++++
 dpnp/backend/extensions/blas/dot.hpp          |  60 +++
 dpnp/backend/extensions/blas/dotu.cpp         | 241 +++++++++++
 dpnp/backend/extensions/blas/gemm.cpp         |   2 +-
 dpnp/backend/extensions/blas/gemm.hpp         |   2 +-
 dpnp/backend/extensions/blas/gemm_batch.cpp   |   2 +-
 dpnp/backend/extensions/blas/types_matrix.hpp |  45 ++-
 dpnp/backend/kernels/dpnp_krnl_common.cpp     |   1 +
 dpnp/dpnp_algo/dpnp_algo.pxd                  |   7 -
 dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi    | 100 -----
 dpnp/dpnp_array.py                            |  25 +-
 dpnp/dpnp_iface_linearalgebra.py              | 139 ++++---
 dpnp/dpnp_utils/dpnp_utils_linearalgebra.py   | 262 ++++++++----
 tests/skipped_tests.tbl                       |   3 +-
 tests/skipped_tests_gpu.tbl                   |   6 +-
 tests/skipped_tests_gpu_no_fp64.tbl           |  85 ----
 tests/test_dot.py                             | 379 ++++++++++++++++--
 tests/test_mathematical.py                    |   5 +-
 tests/test_sycl_queue.py                      |   8 +-
 tests/test_usm_type.py                        |   4 +-
 .../cupy/linalg_tests/test_eigenvalue.py      |  25 +-
 .../cupy/linalg_tests/test_product.py         |  35 +-
 .../cupy/math_tests/test_matmul.py            |  55 +++
 25 files changed, 1329 insertions(+), 425 deletions(-)
 create mode 100644 dpnp/backend/extensions/blas/dot.cpp
 create mode 100644 dpnp/backend/extensions/blas/dot.hpp
 create mode 100644 dpnp/backend/extensions/blas/dotu.cpp

diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt
index d19f60c9792..fe3a92d2181 100644
--- a/dpnp/backend/extensions/blas/CMakeLists.txt
+++ b/dpnp/backend/extensions/blas/CMakeLists.txt
@@ -1,5 +1,5 @@
 # *****************************************************************************
-# Copyright (c) 2016-2023, Intel Corporation
+# Copyright (c) 2024, Intel Corporation
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -27,6 +27,8 @@
 set(python_module_name _blas_impl)
 set(_module_src
     ${CMAKE_CURRENT_SOURCE_DIR}/blas_py.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dot.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dotu.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/gemm.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/gemm_batch.cpp
 )
diff --git a/dpnp/backend/extensions/blas/blas_py.cpp b/dpnp/backend/extensions/blas/blas_py.cpp
index 524f16fcc7d..7d5237381b1 100644
--- a/dpnp/backend/extensions/blas/blas_py.cpp
+++ b/dpnp/backend/extensions/blas/blas_py.cpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2023, Intel Corporation
+// Copyright (c) 2024, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -30,6 +30,7 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
+#include "dot.hpp"
 #include "gemm.hpp"
 
 namespace blas_ext = dpnp::backend::ext::blas;
@@ -38,6 +39,8 @@ namespace py = pybind11;
 // populate dispatch tables
 void init_dispatch_tables(void)
 {
+    blas_ext::init_dot_dispatch_table();
+    blas_ext::init_dotu_dispatch_table();
     blas_ext::init_gemm_batch_dispatch_table();
     blas_ext::init_gemm_dispatch_table();
 }
@@ -46,6 +49,22 @@ PYBIND11_MODULE(_blas_impl, m)
 {
     init_dispatch_tables();
 
+    {
+        m.def("_dot", &blas_ext::dot,
+              "Call `dot` from OneMKL BLAS library to return "
+              "the dot product of two real-valued vectors.",
+              py::arg("sycl_queue"), py::arg("vectorA"), py::arg("vectorB"),
+              py::arg("result"), py::arg("depends") = py::list());
+    }
+
+    {
+        m.def("_dotu", &blas_ext::dotu,
+              "Call `dotu` from OneMKL BLAS library to return "
+              "the dot product of two complex vectors.",
+              py::arg("sycl_queue"), py::arg("vectorA"), py::arg("vectorB"),
+              py::arg("result"), py::arg("depends") = py::list());
+    }
+
     {
         m.def("_gemm", &blas_ext::gemm,
               "Call `gemm` from OneMKL LAPACK library to return "
diff --git a/dpnp/backend/extensions/blas/dot.cpp b/dpnp/backend/extensions/blas/dot.cpp
new file mode 100644
index 00000000000..048738f57a9
--- /dev/null
+++ b/dpnp/backend/extensions/blas/dot.cpp
@@ -0,0 +1,238 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "dot.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace blas
+{
+namespace mkl_blas = oneapi::mkl::blas;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+typedef sycl::event (*dot_impl_fn_ptr_t)(sycl::queue &,
+                                         const std::int64_t,
+                                         char *,
+                                         const std::int64_t,
+                                         char *,
+                                         const std::int64_t,
+                                         char *,
+                                         const std::vector<sycl::event> &);
+
+static dot_impl_fn_ptr_t dot_dispatch_table[dpctl_td_ns::num_types]
+                                           [dpctl_td_ns::num_types];
+
+template <typename Tab, typename Tc>
+static sycl::event dot_impl(sycl::queue &exec_q,
+                            const std::int64_t n,
+                            char *vectorA,
+                            const std::int64_t stride_a,
+                            char *vectorB,
+                            const std::int64_t stride_b,
+                            char *result,
+                            const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<Tab>(exec_q);
+    type_utils::validate_type_for_device<Tc>(exec_q);
+
+    Tab *a = reinterpret_cast<Tab *>(vectorA);
+    Tab *b = reinterpret_cast<Tab *>(vectorB);
+    Tc *res = reinterpret_cast<Tc *>(result);
+
+    std::stringstream error_msg;
+    bool is_exception_caught = false;
+
+    sycl::event dot_event;
+    try {
+        dot_event = mkl_blas::row_major::dot(exec_q,
+                                             n, // size of the input vectors
+                                             a, // Pointer to vector a.
+                                             stride_a, // Stride of vector a.
+                                             b,        // Pointer to vector b.
+                                             stride_b, // Stride of vector b.
+                                             res,      // Pointer to result.
+                                             depends);
+    } catch (oneapi::mkl::exception const &e) {
+        error_msg
+            << "Unexpected MKL exception caught during dot() call:\nreason: "
+            << e.what();
+        is_exception_caught = true;
+    } catch (sycl::exception const &e) {
+        error_msg << "Unexpected SYCL exception caught during dot() call:\n"
+                  << e.what();
+        is_exception_caught = true;
+    }
+
+    if (is_exception_caught) // an unexpected error occurs
+    {
+        throw std::runtime_error(error_msg.str());
+    }
+
+    return dot_event;
+}
+
+std::pair<sycl::event, sycl::event> dot(sycl::queue &exec_q,
+                                        dpctl::tensor::usm_ndarray vectorA,
+                                        dpctl::tensor::usm_ndarray vectorB,
+                                        dpctl::tensor::usm_ndarray result,
+                                        const std::vector<sycl::event> &depends)
+{
+    const int vectorA_nd = vectorA.get_ndim();
+    const int vectorB_nd = vectorB.get_ndim();
+    const int result_nd = result.get_ndim();
+
+    if ((vectorA_nd != 1)) {
+        throw py::value_error(
+            "The first input array has ndim=" + std::to_string(vectorA_nd) +
+            ", but a 1-dimensional array is expected.");
+    }
+
+    if ((vectorB_nd != 1)) {
+        throw py::value_error(
+            "The second input array has ndim=" + std::to_string(vectorB_nd) +
+            ", but a 1-dimensional array is expected.");
+    }
+
+    if ((result_nd != 0)) {
+        throw py::value_error(
+            "The output array has ndim=" + std::to_string(result_nd) +
+            ", but a 0-dimensional array is expected.");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(vectorA, result)) {
+        throw py::value_error(
+            "The first input array and output array are overlapping "
+            "segments of memory");
+    }
+    if (overlap(vectorB, result)) {
+        throw py::value_error(
+            "The second input array and output array are overlapping "
+            "segments of memory");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(
+            exec_q,
+            {vectorA.get_queue(), vectorB.get_queue(), result.get_queue()}))
+    {
+        throw py::value_error(
+            "USM allocations are not compatible with the execution queue.");
+    }
+
+    py::ssize_t a_size = vectorA.get_size();
+    py::ssize_t b_size = vectorB.get_size();
+    if (a_size != b_size) {
+        throw py::value_error("The size of the first input array must be "
+                              "equal to the size of the second input array.");
+    }
+
+    std::vector<py::ssize_t> a_stride = vectorA.get_strides_vector();
+    std::vector<py::ssize_t> b_stride = vectorB.get_strides_vector();
+
+    const std::int64_t n = a_size;
+    const std::int64_t str_a = a_stride[0];
+    const std::int64_t str_b = b_stride[0];
+
+    int vectorA_typenum = vectorA.get_typenum();
+    int vectorB_typenum = vectorB.get_typenum();
+    int result_typenum = result.get_typenum();
+
+    if (vectorA_typenum != vectorB_typenum) {
+        throw py::value_error("vectorA and vectorB must be of the same type.");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int vectorAB_type_id = array_types.typenum_to_lookup_id(vectorA_typenum);
+    int result_type_id = array_types.typenum_to_lookup_id(result_typenum);
+
+    dot_impl_fn_ptr_t dot_fn =
+        dot_dispatch_table[vectorAB_type_id][result_type_id];
+    if (dot_fn == nullptr) {
+        throw py::value_error(
+            "Types of input vectors and result array are mismatched.");
+    }
+
+    char *a_typeless_ptr = vectorA.get_data();
+    char *b_typeless_ptr = vectorB.get_data();
+    char *r_typeless_ptr = result.get_data();
+
+    const int a_elemsize = vectorA.get_elemsize();
+    const int b_elemsize = vectorB.get_elemsize();
+    if (str_a < 0) {
+        a_typeless_ptr -= (n - 1) * std::abs(str_a) * a_elemsize;
+    }
+    if (str_b < 0) {
+        b_typeless_ptr -= (n - 1) * std::abs(str_b) * b_elemsize;
+    }
+
+    sycl::event dot_ev = dot_fn(exec_q, n, a_typeless_ptr, str_a,
+                                b_typeless_ptr, str_b, r_typeless_ptr, depends);
+
+    sycl::event args_ev = dpctl::utils::keep_args_alive(
+        exec_q, {vectorA, vectorB, result}, {dot_ev});
+
+    return std::make_pair(args_ev, dot_ev);
+}
+
+template <typename fnT, typename Tab, typename Tc>
+struct DotContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::DotTypePairSupportFactory<Tab, Tc>::is_defined) {
+            return dot_impl<Tab, Tc>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_dot_dispatch_table(void)
+{
+    dpctl_td_ns::DispatchTableBuilder<dot_impl_fn_ptr_t, DotContigFactory,
+                                      dpctl_td_ns::num_types>
+        contig;
+    contig.populate_dispatch_table(dot_dispatch_table);
+}
+} // namespace blas
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/blas/dot.hpp b/dpnp/backend/extensions/blas/dot.hpp
new file mode 100644
index 00000000000..3468196f760
--- /dev/null
+++ b/dpnp/backend/extensions/blas/dot.hpp
@@ -0,0 +1,60 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <CL/sycl.hpp>
+#include <oneapi/mkl.hpp>
+
+#include <dpctl4pybind11.hpp>
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace blas
+{
+extern std::pair<sycl::event, sycl::event>
+    dot(sycl::queue &exec_q,
+        dpctl::tensor::usm_ndarray vectorA,
+        dpctl::tensor::usm_ndarray vectorB,
+        dpctl::tensor::usm_ndarray result,
+        const std::vector<sycl::event> &depends);
+
+extern std::pair<sycl::event, sycl::event>
+    dotu(sycl::queue &exec_q,
+         dpctl::tensor::usm_ndarray vectorA,
+         dpctl::tensor::usm_ndarray vectorB,
+         dpctl::tensor::usm_ndarray result,
+         const std::vector<sycl::event> &depends);
+
+extern void init_dot_dispatch_table(void);
+extern void init_dotu_dispatch_table(void);
+} // namespace blas
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/blas/dotu.cpp b/dpnp/backend/extensions/blas/dotu.cpp
new file mode 100644
index 00000000000..8c4b43f8034
--- /dev/null
+++ b/dpnp/backend/extensions/blas/dotu.cpp
@@ -0,0 +1,241 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "dot.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace blas
+{
+namespace mkl_blas = oneapi::mkl::blas;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+typedef sycl::event (*dotu_impl_fn_ptr_t)(sycl::queue &,
+                                          const std::int64_t,
+                                          char *,
+                                          const std::int64_t,
+                                          char *,
+                                          const std::int64_t,
+                                          char *,
+                                          const std::vector<sycl::event> &);
+
+static dotu_impl_fn_ptr_t dotu_dispatch_table[dpctl_td_ns::num_types]
+                                             [dpctl_td_ns::num_types];
+
+template <typename Tab, typename Tc>
+static sycl::event dotu_impl(sycl::queue &exec_q,
+                             const std::int64_t n,
+                             char *vectorA,
+                             const std::int64_t stride_a,
+                             char *vectorB,
+                             const std::int64_t stride_b,
+                             char *result,
+                             const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<Tab>(exec_q);
+    type_utils::validate_type_for_device<Tc>(exec_q);
+
+    Tab *a = reinterpret_cast<Tab *>(vectorA);
+    Tab *b = reinterpret_cast<Tab *>(vectorB);
+    Tc *res = reinterpret_cast<Tc *>(result);
+
+    std::stringstream error_msg;
+    bool is_exception_caught = false;
+
+    sycl::event dotu_event;
+    try {
+        dotu_event = mkl_blas::row_major::dotu(exec_q,
+                                               n, // size of the input vectors
+                                               a, // Pointer to vector a.
+                                               stride_a, // Stride of vector a.
+                                               b,        // Pointer to vector b.
+                                               stride_b, // Stride of vector b.
+                                               res,      // Pointer to result.
+                                               depends);
+    } catch (oneapi::mkl::exception const &e) {
+        error_msg
+            << "Unexpected MKL exception caught during dotu() call:\nreason: "
+            << e.what();
+        is_exception_caught = true;
+    } catch (sycl::exception const &e) {
+        error_msg << "Unexpected SYCL exception caught during dotu() call:\n"
+                  << e.what();
+        is_exception_caught = true;
+    }
+
+    if (is_exception_caught) // an unexpected error occurs
+    {
+        throw std::runtime_error(error_msg.str());
+    }
+
+    return dotu_event;
+}
+
+std::pair<sycl::event, sycl::event>
+    dotu(sycl::queue &exec_q,
+         dpctl::tensor::usm_ndarray vectorA,
+         dpctl::tensor::usm_ndarray vectorB,
+         dpctl::tensor::usm_ndarray result,
+         const std::vector<sycl::event> &depends)
+{
+    const int vectorA_nd = vectorA.get_ndim();
+    const int vectorB_nd = vectorB.get_ndim();
+    const int result_nd = result.get_ndim();
+
+    if ((vectorA_nd != 1)) {
+        throw py::value_error(
+            "The first input array has ndim=" + std::to_string(vectorA_nd) +
+            ", but a 1-dimensional array is expected.");
+    }
+
+    if ((vectorB_nd != 1)) {
+        throw py::value_error(
+            "The second input array has ndim=" + std::to_string(vectorB_nd) +
+            ", but a 1-dimensional array is expected.");
+    }
+
+    if ((result_nd != 0)) {
+        throw py::value_error(
+            "The output array has ndim=" + std::to_string(result_nd) +
+            ", but a 0-dimensional array is expected.");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(vectorA, result)) {
+        throw py::value_error(
+            "The first input array and output array are overlapping "
+            "segments of memory");
+    }
+    if (overlap(vectorB, result)) {
+        throw py::value_error(
+            "The second input array and output array are overlapping "
+            "segments of memory");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(
+            exec_q,
+            {vectorA.get_queue(), vectorB.get_queue(), result.get_queue()}))
+    {
+        throw py::value_error(
+            "USM allocations are not compatible with the execution queue.");
+    }
+
+    py::ssize_t a_size = vectorA.get_size();
+    py::ssize_t b_size = vectorB.get_size();
+    if (a_size != b_size) {
+        throw py::value_error("The size of the first input array must be "
+                              "equal to the size of the second input array.");
+    }
+
+    std::vector<py::ssize_t> a_stride = vectorA.get_strides_vector();
+    std::vector<py::ssize_t> b_stride = vectorB.get_strides_vector();
+
+    const std::int64_t n = a_size;
+    const std::int64_t str_a = a_stride[0];
+    const std::int64_t str_b = b_stride[0];
+
+    int vectorA_typenum = vectorA.get_typenum();
+    int vectorB_typenum = vectorB.get_typenum();
+    int result_typenum = result.get_typenum();
+
+    if (vectorA_typenum != vectorB_typenum) {
+        throw py::value_error(
+            "Input arrays must be of the same type.");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int vectorAB_type_id = array_types.typenum_to_lookup_id(vectorA_typenum);
+    int result_type_id = array_types.typenum_to_lookup_id(result_typenum);
+
+    dotu_impl_fn_ptr_t dotu_fn =
+        dotu_dispatch_table[vectorAB_type_id][result_type_id];
+    if (dotu_fn == nullptr) {
+        throw py::value_error(
+            "Types of input vectors and result array are mismatched.");
+    }
+
+    char *a_typeless_ptr = vectorA.get_data();
+    char *b_typeless_ptr = vectorB.get_data();
+    char *r_typeless_ptr = result.get_data();
+
+    const int a_elemsize = vectorA.get_elemsize();
+    const int b_elemsize = vectorB.get_elemsize();
+    if (str_a < 0) {
+        a_typeless_ptr -= (n - 1) * std::abs(str_a) * a_elemsize;
+    }
+    if (str_b < 0) {
+        b_typeless_ptr -= (n - 1) * std::abs(str_b) * b_elemsize;
+    }
+
+    sycl::event dotu_ev =
+        dotu_fn(exec_q, n, a_typeless_ptr, str_a, b_typeless_ptr, str_b,
+                r_typeless_ptr, depends);
+
+    sycl::event args_ev = dpctl::utils::keep_args_alive(
+        exec_q, {vectorA, vectorB, result}, {dotu_ev});
+
+    return std::make_pair(args_ev, dotu_ev);
+}
+
+template <typename fnT, typename Tab, typename Tc>
+struct DotuContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::DotuTypePairSupportFactory<Tab, Tc>::is_defined) {
+            return dotu_impl<Tab, Tc>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_dotu_dispatch_table(void)
+{
+    dpctl_td_ns::DispatchTableBuilder<dotu_impl_fn_ptr_t, DotuContigFactory,
+                                      dpctl_td_ns::num_types>
+        contig;
+    contig.populate_dispatch_table(dotu_dispatch_table);
+}
+} // namespace blas
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/blas/gemm.cpp b/dpnp/backend/extensions/blas/gemm.cpp
index 5526ecd3c1b..a26420f49b3 100644
--- a/dpnp/backend/extensions/blas/gemm.cpp
+++ b/dpnp/backend/extensions/blas/gemm.cpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2023, Intel Corporation
+// Copyright (c) 2024, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
diff --git a/dpnp/backend/extensions/blas/gemm.hpp b/dpnp/backend/extensions/blas/gemm.hpp
index 25f78b5b850..3f1ec6e745a 100644
--- a/dpnp/backend/extensions/blas/gemm.hpp
+++ b/dpnp/backend/extensions/blas/gemm.hpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2023, Intel Corporation
+// Copyright (c) 2024, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
diff --git a/dpnp/backend/extensions/blas/gemm_batch.cpp b/dpnp/backend/extensions/blas/gemm_batch.cpp
index 32f592f6b8a..9359901edd8 100644
--- a/dpnp/backend/extensions/blas/gemm_batch.cpp
+++ b/dpnp/backend/extensions/blas/gemm_batch.cpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2023, Intel Corporation
+// Copyright (c) 2024, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
diff --git a/dpnp/backend/extensions/blas/types_matrix.hpp b/dpnp/backend/extensions/blas/types_matrix.hpp
index 49154df03c4..c36ae0e2045 100644
--- a/dpnp/backend/extensions/blas/types_matrix.hpp
+++ b/dpnp/backend/extensions/blas/types_matrix.hpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2023, Intel Corporation
+// Copyright (c) 2024, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -43,6 +43,49 @@ namespace blas
 {
 namespace types
 {
+/**
+ * @brief A factory to define pairs of supported types for which
+ * MKL BLAS library provides support in oneapi::mkl::blas::dot<Tab, Tc>
+ * function.
+ *
+ * @tparam Tab Type of arrays containing input vectors A and B.
+ * @tparam Tc Type of array containing output.
+ */
+template <typename Tab, typename Tc>
+struct DotTypePairSupportFactory
+{
+    static constexpr bool is_defined = std::disjunction<
+        dpctl_td_ns::TypePairDefinedEntry<Tab, float, Tc, float>,
+        dpctl_td_ns::TypePairDefinedEntry<Tab, float, Tc, double>,
+        dpctl_td_ns::TypePairDefinedEntry<Tab, double, Tc, double>,
+        // fall-through
+        dpctl_td_ns::NotDefinedEntry>::is_defined;
+};
+
+/**
+ * @brief A factory to define pairs of supported types for which
+ * MKL BLAS library provides support in oneapi::mkl::blas::dotu<Tab, Tc>
+ * function.
+ *
+ * @tparam Tab Type of arrays containing input vectors A and B.
+ * @tparam Tc Type of array containing output.
+ */
+template <typename Tab, typename Tc>
+struct DotuTypePairSupportFactory
+{
+    static constexpr bool is_defined = std::disjunction<
+        dpctl_td_ns::TypePairDefinedEntry<Tab,
+                                          std::complex<float>,
+                                          Tc,
+                                          std::complex<float>>,
+        dpctl_td_ns::TypePairDefinedEntry<Tab,
+                                          std::complex<double>,
+                                          Tc,
+                                          std::complex<double>>,
+        // fall-through
+        dpctl_td_ns::NotDefinedEntry>::is_defined;
+};
+
 /**
  * @brief A factory to define pairs of supported types for which
  * MKL BLAS library provides support in oneapi::mkl::blas::gemm<Tab, Tc>
diff --git a/dpnp/backend/kernels/dpnp_krnl_common.cpp b/dpnp/backend/kernels/dpnp_krnl_common.cpp
index e664c30b848..04eac54310d 100644
--- a/dpnp/backend/kernels/dpnp_krnl_common.cpp
+++ b/dpnp/backend/kernels/dpnp_krnl_common.cpp
@@ -1040,6 +1040,7 @@ void func_map_init_linalg(func_map_t &fmap)
     fmap[DPNPFuncName::DPNP_FN_DOT][eft_DBL][eft_DBL] = {
         eft_DBL, (void *)dpnp_dot_default_c<double, double, double>};
 
+    // needed for "dpnp_correlate_c" function in dpnp_krnl_statistics.cpp
     fmap[DPNPFuncName::DPNP_FN_DOT_EXT][eft_INT][eft_INT] = {
         eft_INT, (void *)dpnp_dot_ext_c<int32_t, int32_t, int32_t>};
     fmap[DPNPFuncName::DPNP_FN_DOT_EXT][eft_INT][eft_LNG] = {
diff --git a/dpnp/dpnp_algo/dpnp_algo.pxd b/dpnp/dpnp_algo/dpnp_algo.pxd
index 28e21340647..2fc7e1b4a3b 100644
--- a/dpnp/dpnp_algo/dpnp_algo.pxd
+++ b/dpnp/dpnp_algo/dpnp_algo.pxd
@@ -54,8 +54,6 @@ cdef extern from "dpnp_iface_fptr.hpp" namespace "DPNPFuncName":  # need this na
         DPNP_FN_DIAG_INDICES_EXT
         DPNP_FN_DIAGONAL
         DPNP_FN_DIAGONAL_EXT
-        DPNP_FN_DOT
-        DPNP_FN_DOT_EXT
         DPNP_FN_EDIFF1D
         DPNP_FN_EDIFF1D_EXT
         DPNP_FN_EIG
@@ -282,11 +280,6 @@ cpdef dpnp_descriptor dpnp_isclose(dpnp_descriptor input1, dpnp_descriptor input
                                    double rtol=*, double atol=*, cpp_bool equal_nan=*)
 
 
-"""
-Linear algebra
-"""
-cpdef dpnp_descriptor dpnp_dot(dpnp_descriptor in_array1, dpnp_descriptor in_array2)
-
 """
 Array creation routines
 """
diff --git a/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi b/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi
index 9b4faf2a1b5..09336b5aaa3 100644
--- a/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi
+++ b/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi
@@ -36,7 +36,6 @@ and the rest of the library
 # NO IMPORTs here. All imports must be placed into main "dpnp_algo.pyx" file
 
 __all__ += [
-    "dpnp_dot",
     "dpnp_inner",
     "dpnp_kron",
 ]
@@ -47,105 +46,6 @@ ctypedef c_dpctl.DPCTLSyclEventRef(*fptr_2in_1out_shapes_t)(c_dpctl.DPCTLSyclQue
                                                             void *, void * , void * , shape_elem_type * ,
                                                             shape_elem_type *, shape_elem_type * , size_t,
                                                             const c_dpctl.DPCTLEventVectorRef)
-ctypedef c_dpctl.DPCTLSyclEventRef(*fptr_2in_1out_dot_t)(c_dpctl.DPCTLSyclQueueRef,
-                                                         void * , const size_t, const size_t,
-                                                         const shape_elem_type *, const shape_elem_type * ,
-                                                         void * , const size_t, const size_t,
-                                                         const shape_elem_type *, const shape_elem_type * ,
-                                                         void * , const size_t, const size_t,
-                                                         const shape_elem_type *, const shape_elem_type * ,
-                                                         const c_dpctl.DPCTLEventVectorRef) except +
-
-cpdef utils.dpnp_descriptor dpnp_dot(utils.dpnp_descriptor in_array1,
-                                     utils.dpnp_descriptor in_array2,
-                                     utils.dpnp_descriptor out=None):
-    cdef shape_type_c shape1, shape2
-
-    shape1 = in_array1.shape
-    shape2 = in_array2.shape
-
-    # convert string type names (array.dtype) to C enum DPNPFuncType
-    cdef DPNPFuncType param1_type = dpnp_dtype_to_DPNPFuncType(in_array1.dtype)
-    cdef DPNPFuncType param2_type = dpnp_dtype_to_DPNPFuncType(in_array2.dtype)
-
-    # get the FPTR data structure
-    cdef DPNPFuncData kernel_data = get_dpnp_function_ptr(DPNP_FN_DOT_EXT, param1_type, param2_type)
-    cdef utils.dpnp_descriptor result
-
-    ndim1 = in_array1.ndim
-    ndim2 = in_array2.ndim
-    cdef shape_type_c result_shape
-    if ndim1 == 0:
-        result_shape = shape2
-    elif ndim2 == 0:
-        result_shape = shape1
-    elif ndim1 == 1 and ndim2 == 1:
-        result_shape = ()
-    elif ndim1 == 1:  # ndim2 > 1
-        result_shape = shape2[::-2] if ndim2 == 2 else shape2[::2]
-    elif ndim2 == 1:  # ndim1 > 1
-        result_shape = shape1[:-1]
-    else:
-        if ndim1 == 1:
-            shape1 = (1, shape1[0])
-        if ndim2 == 1:
-            shape2 = (shape1[0], 1)
-        result_shape = shape1[:-1] + shape2[:-2] + shape2[-1:]
-
-    result_sycl_device, result_usm_type, result_sycl_queue = utils.get_common_usm_allocation(in_array1, in_array2)
-
-    if out is None:
-        # create result array with type given by FPTR data
-        result = utils.create_output_descriptor(result_shape,
-                                                kernel_data.return_type,
-                                                None,
-                                                device=result_sycl_device,
-                                                usm_type=result_usm_type,
-                                                sycl_queue=result_sycl_queue)
-    else:
-        result_type = dpnp_DPNPFuncType_to_dtype(< size_t > kernel_data.return_type)
-        if out.dtype != result_type:
-            utils.checker_throw_value_error('dot', 'out.dtype', out.dtype, result_type)
-        if out.shape != result_shape:
-            utils.checker_throw_value_error('dot', 'out.shape', out.shape, result_shape)
-
-        result = out
-
-        utils.get_common_usm_allocation(in_array1, result)  # check USM allocation is common
-
-    cdef shape_type_c result_strides = utils.strides_to_vector(result.strides, result.shape)
-    cdef shape_type_c in_array1_shape = in_array1.shape
-    cdef shape_type_c in_array1_strides = utils.strides_to_vector(in_array1.strides, in_array1.shape)
-    cdef shape_type_c in_array2_shape = in_array2.shape
-    cdef shape_type_c in_array2_strides = utils.strides_to_vector(in_array2.strides, in_array2.shape)
-
-    cdef c_dpctl.SyclQueue q = <c_dpctl.SyclQueue> result_sycl_queue
-    cdef c_dpctl.DPCTLSyclQueueRef q_ref = q.get_queue_ref()
-
-    cdef fptr_2in_1out_dot_t func = <fptr_2in_1out_dot_t > kernel_data.ptr
-    # call FPTR function
-    cdef c_dpctl.DPCTLSyclEventRef event_ref = func(q_ref,
-                                                    result.get_data(),
-                                                    result.size,
-                                                    result.ndim,
-                                                    result_shape.data(),
-                                                    result_strides.data(),
-                                                    in_array1.get_data(),
-                                                    in_array1.size,
-                                                    in_array1.ndim,
-                                                    in_array1_shape.data(),
-                                                    in_array1_strides.data(),
-                                                    in_array2.get_data(),
-                                                    in_array2.size,
-                                                    in_array2.ndim,
-                                                    in_array2_shape.data(),
-                                                    in_array2_strides.data(),
-                                                    NULL)  # dep_events_ref
-
-    with nogil: c_dpctl.DPCTLEvent_WaitAndThrow(event_ref)
-    c_dpctl.DPCTLEvent_Delete(event_ref)
-
-    return result
 
 
 cpdef utils.dpnp_descriptor dpnp_inner(dpnp_descriptor array1, dpnp_descriptor array2):
diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py
index cf848b50690..b5e75dde07c 100644
--- a/dpnp/dpnp_array.py
+++ b/dpnp/dpnp_array.py
@@ -704,8 +704,29 @@ def diagonal(input, offset=0, axis1=0, axis2=1):
 
         return dpnp.diagonal(input, offset, axis1, axis2)
 
-    def dot(self, other, out=None):
-        return dpnp.dot(self, other, out)
+    def dot(self, b, out=None):
+        """
+        Dot product of two arrays.
+
+        For full documentation refer to :obj:`dpnp.dot`.
+
+        Examples
+        --------
+        >>> import dpnp as np
+        >>> a = np.eye(2)
+        >>> b = np.ones((2, 2)) * 2
+        >>> a.dot(b)
+        array([[2., 2.],
+               [2., 2.]])
+
+        This array method can be conveniently chained:
+
+        >>> a.dot(b).dot(b)
+        array([[8., 8.],
+               [8., 8.]])
+        """
+
+        return dpnp.dot(self, b, out)
 
     @property
     def dtype(self):
diff --git a/dpnp/dpnp_iface_linearalgebra.py b/dpnp/dpnp_iface_linearalgebra.py
index d39b84a50ec..9d63f7f8c3d 100644
--- a/dpnp/dpnp_iface_linearalgebra.py
+++ b/dpnp/dpnp_iface_linearalgebra.py
@@ -38,13 +38,12 @@
 """
 
 
-import dpctl.tensor as dpt
 import numpy
 
 import dpnp
 from dpnp.dpnp_algo import *
 from dpnp.dpnp_utils import *
-from dpnp.dpnp_utils.dpnp_utils_linearalgebra import dpnp_matmul
+from dpnp.dpnp_utils.dpnp_utils_linearalgebra import dpnp_dot, dpnp_matmul
 
 __all__ = [
     "dot",
@@ -59,87 +58,99 @@
 ]
 
 
-def dot(x1, x2, out=None, **kwargs):
+def dot(a, b, out=None):
     """
-    Dot product of `x1` and `x2`.
+    Dot product of `a` and `b`.
 
     For full documentation refer to :obj:`numpy.dot`.
 
+    Parameters
+    ----------
+    a : {dpnp.ndarray, usm_ndarray, scalar}
+        First input array. Both inputs `a` and `b` cannot be scalars at the same time.
+    b : {dpnp.ndarray, usm_ndarray, scalar}
+        Second input array. Both inputs `a` and `b` cannot be scalars at the same time.
+    out : {dpnp.ndarray, usm_ndarray}, optional
+        Alternative output array in which to place the result. It must have
+        the same shape and data type as the expected output and should be
+        C-contiguous. If these conditions are not met, an exception is
+        raised, instead of attempting to be flexible.
+
     Returns
     -------
-    y : dpnp.ndarray
-        Returns the dot product of `x1` and `x2`.
+    out : dpnp.ndarray
+        Returns the dot product of `a` and `b`.
         If `out` is given, then it is returned.
 
-    Limitations
-    -----------
-    Parameters `x1` and `x2` are supported as either scalar, :class:`dpnp.ndarray`
-    or :class:`dpctl.tensor.usm_ndarray`, but both `x1` and `x2` can not be scalars at the same time.
-    Keyword argument ``kwargs`` is currently unsupported.
-    Otherwise the functions will be executed sequentially on CPU.
-    Input array data types are limited by supported DPNP :ref:`Data types`.
-
     See Also
     --------
+    :obj:`dpnp.ndarray.dot` : Equivalent method.
     :obj:`dpnp.tensordot` : Sum products over arbitrary axes.
     :obj:`dpnp.vdot` : Complex-conjugating dot product.
+    :obj:`dpnp.einsum` : Einstein summation convention.
+    :obj:`dpnp.matmul` : Matrix product of two arrays.
+    :obj:`dpnp.linalg.multi_dot` : Chained dot product.
 
     Examples
     --------
-    >>> import dpnp as dp
-    >>> a = dp.array([1, 2, 3])
-    >>> b = dp.array([1, 2, 3])
-    >>> dp.dot(a, b)
-    14
+    >>> import dpnp as np
+    >>> a = np.array([1, 2, 3])
+    >>> b = np.array([1, 2, 3])
+    >>> np.dot(a, b)
+    array(14)
+
+    Neither argument is complex-conjugated:
+
+    >>> np.dot(np.array([2j, 3j]), np.array([2j, 3j]))
+    array(-13+0j)
+
+    For 2-D arrays it is the matrix product:
+
+    >>> a = np.array([[1, 0], [0, 1]])
+    >>> b = np.array([[4, 1], [2, 2]])
+    >>> np.dot(a, b)
+    array([[4, 1],
+           [2, 2]])
+
+    >>> a = np.arange(3*4*5*6).reshape((3,4,5,6))
+    >>> b = np.arange(3*4*5*6)[::-1].reshape((5,4,6,3))
+    >>> np.dot(a, b)[2,3,2,1,2,2]
+    array(499128)
+    >>> sum(a[2,3,2,:] * b[1,2,:,2])
+    array(499128)
 
     """
 
-    if kwargs:
-        pass
-    elif dpnp.isscalar(x1) and dpnp.isscalar(x2):
-        # at least either x1 or x2 has to be an array
-        pass
+    dpnp.check_supported_arrays_type(a, scalar_type=True)
+    dpnp.check_supported_arrays_type(b, scalar_type=True)
+
+    if out is not None:
+        dpnp.check_supported_arrays_type(out)
+        if not out.flags.c_contiguous:
+            raise ValueError("Only C-contiguous array is acceptable.")
+
+    if dpnp.isscalar(a) or dpnp.isscalar(b):
+        # TODO: investigate usage of axpy (axpy_batch) or scal
+        # functions from BLAS here instead of dpnp.multiply
+        return dpnp.multiply(a, b, out=out)
+    elif a.ndim == 0 or b.ndim == 0:
+        # TODO: investigate usage of axpy (axpy_batch) or scal
+        # functions from BLAS here instead of dpnp.multiply
+        return dpnp.multiply(a, b, out=out)
+    elif a.ndim == 1 and b.ndim == 1:
+        return dpnp_dot(a, b, out=out)
+    elif a.ndim == 2 and b.ndim == 2:
+        # NumPy does not allow casting even if it is safe
+        return dpnp.matmul(a, b, out=out, casting="no")
+    elif a.ndim == 1 or b.ndim == 1:
+        # NumPy does not allow casting even if it is safe
+        return dpnp.matmul(a, b, out=out, casting="no")
     else:
-        # get USM type and queue to copy scalar from the host memory into a USM allocation
-        usm_type, queue = (
-            get_usm_allocations([x1, x2])
-            if dpnp.isscalar(x1) or dpnp.isscalar(x2)
-            else (None, None)
-        )
-
-        x1_desc = dpnp.get_dpnp_descriptor(
-            x1,
-            copy_when_strides=False,
-            copy_when_nondefault_queue=False,
-            alloc_usm_type=usm_type,
-            alloc_queue=queue,
-        )
-        x2_desc = dpnp.get_dpnp_descriptor(
-            x2,
-            copy_when_strides=False,
-            copy_when_nondefault_queue=False,
-            alloc_usm_type=usm_type,
-            alloc_queue=queue,
-        )
-        if x1_desc and x2_desc:
-            if out is not None:
-                if not isinstance(out, (dpnp.ndarray, dpt.usm_ndarray)):
-                    raise TypeError(
-                        "return array must be of supported array type"
-                    )
-                out_desc = (
-                    dpnp.get_dpnp_descriptor(
-                        out,
-                        copy_when_strides=False,
-                        copy_when_nondefault_queue=False,
-                    )
-                    or None
-                )
-            else:
-                out_desc = None
-            return dpnp_dot(x1_desc, x2_desc, out=out_desc).get_pyobj()
-
-    return call_origin(numpy.dot, x1, x2, out=out, **kwargs)
+        # TODO: investigate usage of matmul for some possible
+        # use cases instead of dpnp.tensordot
+        result = dpnp.tensordot(a, b, axes=(-1, -2))
+        # NumPy does not allow casting even if it is safe
+        return dpnp.get_result_array(result, out, casting="no")
 
 
 def einsum(*args, **kwargs):
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index d0add55eee3..65d97befa98 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -24,69 +24,46 @@
 # *****************************************************************************
 
 import dpctl
+import dpctl.tensor as dpt
 import dpctl.tensor._tensor_impl as ti
 import numpy
 
 import dpnp
 import dpnp.backend.extensions.blas._blas_impl as bi
+from dpnp.dpnp_array import dpnp_array
 from dpnp.dpnp_utils import get_usm_allocations
 
-__all__ = ["dpnp_matmul"]
+__all__ = ["dpnp_dot", "dpnp_matmul"]
 
 
-def _gemm_res_dtype(*arrays, dtype, casting, sycl_queue):
+def _copy_array(x, dep_events, host_events, contig_copy=False, dtype=None):
     """
-    Determines the output array data type and the intermediate data type.
-
-    If dtype is ``None``, the output array data type is determined based on
-    the Promotion Type Rule and device capabilities. Otherwise, `dtype` is
-    used as output array dtype if input arrays can cast to it according to
-    the casting rule determined. If casting cannot be done, a ``TypeError``
-    is raised.
-    The intermediate data type is the data type used for performing matmul
-    operation calculations. If output array dtype is a floating-point data type,
-    it is also used for the intermediate data type. If output array dtype is an
-    integral data type, the default floating point data type of the device where
-    input arrays are allocated on are used for intermediate data type.
+    Create a copy of the input array if needed.
 
-    Parameters
-    ----------
-    arrays : {dpnp.ndarray, usm_ndarray}
-        Input arrays.
-    dtype : dtype
-        If not ``None``, data type of the output array.
-    casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
-        Controls what kind of data casting may occur.
-    sycl_queue : {SyclQueue}
-        A SYCL queue to use for determining default floating point datat type.
-
-    Returns
-    -------
-    gemm_dtype, res_dtype :
-        `gemm_dtype` is the data type used in performing matmul calculations.
-        The input arrays of matmul function are cast to `gemm_dtype` and then
-        the calculations are performed.
-        `res_dtype` is the output data type. When the result is obtained, it is cast
-        to `res_dtype`.
+    If `contig_copy` is ``True``, a C-contiguous copy of input array is returned.
+    In this case, the copy array has the input array data type unless `dtype` is
+    specified.
+    If `contig_copy` is ``False`` and input array data type is different than `dtype`,
+    a C-contiguous copy of input array with specified `dtype` is returned.
 
     """
 
-    res_dtype = dpnp.result_type(*arrays)
-    default_dtype = dpnp.default_float_type(sycl_queue=sycl_queue)
-
-    if dtype is not None:
-        if dpnp.can_cast(res_dtype, dtype, casting=casting):
-            res_dtype = dtype
-        else:
-            raise TypeError(
-                f"Cannot cast ufunc 'matmul' output from dtype({res_dtype}) to dtype({dtype}) with casting rule {casting}"
-            )
-
-    gemm_dtype = (
-        res_dtype if dpnp.issubdtype(res_dtype, dpnp.inexact) else default_dtype
-    )
+    if contig_copy:
+        copy = contig_copy
+    else:
+        copy = x.dtype != dtype if dtype is not None else False
 
-    return gemm_dtype, res_dtype
+    if copy:
+        x_copy = dpnp.empty_like(x, dtype=dtype, order="C")
+        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=dpnp.get_usm_ndarray(x),
+            dst=x_copy.get_array(),
+            sycl_queue=x.sycl_queue,
+        )
+        dep_events.append(copy_ev)
+        host_events.append(ht_copy_ev)
+        return x_copy
+    return x
 
 
 def _gemm_batch_matmul(exec_q, x1, x2, res, x1_is_2D, x2_is_2D, dev_tasks_list):
@@ -95,8 +72,10 @@ def _gemm_batch_matmul(exec_q, x1, x2, res, x1_is_2D, x2_is_2D, dev_tasks_list):
     # when the input array is F-contiguous, the data of 2D array
     # that needs to be called in mkl::gemm_batch are not contiguous.
     ht_tasks_list = []
-    x1 = _get_gemm_contig_array(x1, dev_tasks_list, ht_tasks_list)
-    x2 = _get_gemm_contig_array(x2, dev_tasks_list, ht_tasks_list)
+    contig_copy = not x1.flags.c_contiguous
+    x1 = _copy_array(x1, dev_tasks_list, ht_tasks_list, contig_copy=contig_copy)
+    contig_copy = not x2.flags.c_contiguous
+    x2 = _copy_array(x2, dev_tasks_list, ht_tasks_list, contig_copy=contig_copy)
 
     x1_strides = x1.strides
     x2_strides = x2.strides
@@ -149,41 +128,133 @@ def _gemm_batch_matmul(exec_q, x1, x2, res, x1_is_2D, x2_is_2D, dev_tasks_list):
     return ht_blas_ev, ht_tasks_list, res
 
 
-def _get_gemm_contig_array(x, dep_events, host_events, dtype=None):
+def _op_res_dtype(*arrays, dtype, casting, sycl_queue):
+    """
+    _op_res_dtype(*arrays, dtype, casting, sycl_queue)
+
+    Determines the output array data type and an intermediate data type
+    used in performing calculations related to a specific math function.
+    If dtype is ``None``, the output array data type of the operation is
+    determined based on the Promotion Type Rule and device capabilities.
+    Otherwise, `dtype` is used as output array dtype, if input arrays
+    can be cast to it according to the specified casting rule. If casting
+    cannot be done, a ``TypeError`` is raised.
+    The intermediate data type is the data type used for performing the math
+    function calculations. If output array dtype is a floating-point data type,
+    it is also used for the intermediate data type. If output array dtype is an
+    integral data type, the default floating point data type of the device on
+    which the input arrays are allocated is used as the intermediate data type.
+
+    Parameters
+    ----------
+    arrays : {dpnp.ndarray, usm_ndarray}
+        Input arrays.
+    dtype : dtype
+        If not ``None``, data type of the output array.
+    casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
+        Controls what kind of data casting may occur.
+    sycl_queue : {SyclQueue}
+        A SYCL queue to use for determining default floating point data type.
+
+    Returns
+    -------
+    op_dtype, res_dtype :
+        `op_dtype` is the data type used in performing math function calculations.
+        The input arrays of the math function are cast to `op_dtype` and then
+        the calculations are performed.
+        `res_dtype` is the output data type. When the result is obtained, it is cast
+        to `res_dtype`.
+
     """
-    Creating a copy of input array if needed.
 
-    This function has two use cases. In the first use case, which is more general,
-    if the input array is not c-contiguous or f-contiguous, we ensure it becomes
-    c-contiguous. Additionally, if the input array has an integral dtype, we
-    convert it to an appropriate floating-point data type specified by `dtype`.
-    In the second use case, which is for N-dimensional arrays with N>2, we need
-    to ensure c-contiguity. This is crucial because the implementation of the
-    `gemm_batch` function in dpnp only works for C-contiguous arrays. This use case
-    is essential when the input array is f-contiguous with floating point dtype for
-    which the array is not modified in the first use case.
+    res_dtype = dpnp.result_type(*arrays)
+    default_dtype = dpnp.default_float_type(sycl_queue=sycl_queue)
 
+    if dtype is not None:
+        if dpnp.can_cast(res_dtype, dtype, casting=casting):
+            res_dtype = dtype
+        else:
+            raise TypeError(
+                f"Cannot cast ufunc 'matmul' output from dtype({res_dtype}) to dtype({dtype}) with casting rule {casting}"
+            )
+
+    op_dtype = (
+        res_dtype if dpnp.issubdtype(res_dtype, dpnp.inexact) else default_dtype
+    )
+
+    return op_dtype, res_dtype
+
+
+def dpnp_dot(a, b, /, out=None):
     """
+    Return the dot product of two arrays.
 
-    if dtype is None:
-        copy = not x.flags.c_contiguous
-    else:
-        copy = (
-            not (x.flags.c_contiguous or x.flags.f_contiguous)
-            or x.dtype != dtype
-        )
+    The routine that is used to perform the main calculation
+    depends on input array data types: 1) For integer and boolean data types,
+    `dpctl.tensor.vecdot` from the Data Parallel Control library is used,
+    2) For floating point real-valued data types, the `dot` routine from
+    the BLAS library of OneMKL is used, and 3) For complex data types,
+    the `dotu` routine from the BLAS library of OneMKL is used.
 
-    if copy:
-        x_copy = dpnp.empty_like(x, dtype=dtype, order="C")
-        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=dpnp.get_usm_ndarray(x),
-            dst=x_copy.get_array(),
-            sycl_queue=x.sycl_queue,
+    """
+
+    if a.size != b.size:
+        raise ValueError(
+            "Input arrays have a mismatch in their size. "
+            f"(size {a.size} is different from {b.size})"
         )
-        dep_events.append(copy_ev)
-        host_events.append(ht_copy_ev)
-        return x_copy
-    return x
+
+    res_usm_type, exec_q = get_usm_allocations([a, b])
+
+    # Determine the appropriate data types
+    # casting is irrelevant here since dtype is `None`
+    dot_dtype, res_dtype = _op_res_dtype(
+        a, b, dtype=None, casting="no", sycl_queue=exec_q
+    )
+
+    # create result array
+    result = dpnp.empty(
+        (),
+        dtype=dot_dtype,
+        usm_type=res_usm_type,
+        sycl_queue=exec_q,
+    )
+
+    # input arrays should have the proper data type
+    dep_events_list = []
+    host_tasks_list = []
+    if dpnp.issubdtype(res_dtype, dpnp.inexact):
+        # copying is needed if dtypes of input arrays are different
+        a = _copy_array(a, dep_events_list, host_tasks_list, dtype=dot_dtype)
+        b = _copy_array(b, dep_events_list, host_tasks_list, dtype=dot_dtype)
+        if dpnp.issubdtype(res_dtype, dpnp.complexfloating):
+            ht_ev, _ = bi._dotu(
+                exec_q,
+                dpnp.get_usm_ndarray(a),
+                dpnp.get_usm_ndarray(b),
+                dpnp.get_usm_ndarray(result),
+                dep_events_list,
+            )
+        else:
+            ht_ev, _ = bi._dot(
+                exec_q,
+                dpnp.get_usm_ndarray(a),
+                dpnp.get_usm_ndarray(b),
+                dpnp.get_usm_ndarray(result),
+                dep_events_list,
+            )
+        host_tasks_list.append(ht_ev)
+        dpctl.SyclEvent.wait_for(host_tasks_list)
+    else:
+        dpt_a = dpnp.get_usm_ndarray(a)
+        dpt_b = dpnp.get_usm_ndarray(b)
+        result = dpnp_array._create_from_usm_ndarray(dpt.vecdot(dpt_a, dpt_b))
+
+    if dot_dtype != res_dtype:
+        result = result.astype(res_dtype, copy=False)
+
+    # NumPy does not allow casting even if it is safe
+    return dpnp.get_result_array(result, out, casting="no")
 
 
 def dpnp_matmul(
@@ -197,8 +268,6 @@ def dpnp_matmul(
     dtype=None,
 ):
     """
-    dpnp_matmul(x1, x2, out=None, casting="same_kind", order="K", dtype=None)
-
     Return the matrix product of two arrays.
 
     The main calculation is done by calling an extension function
@@ -222,14 +291,16 @@ def dpnp_matmul(
 
     res_usm_type, exec_q = get_usm_allocations([x1, x2])
 
-    squeeze_flag = x1_ndim == 1 or x2_ndim == 1
+    appended_axes = []
     if x1_ndim == 1:
         x1 = x1[dpnp.newaxis, :]
         x1_ndim = x1.ndim
+        appended_axes.append(-2)
 
     if x2_ndim == 1:
         x2 = x2[:, dpnp.newaxis]
         x2_ndim = x2.ndim
+        appended_axes.append(-1)
 
     x1_shape = x1.shape
     x2_shape = x2.shape
@@ -241,7 +312,7 @@ def dpnp_matmul(
         )
 
     # Determine the appropriate data types
-    gemm_dtype, res_dtype = _gemm_res_dtype(
+    gemm_dtype, res_dtype = _op_res_dtype(
         x1, x2, dtype=dtype, casting=casting, sycl_queue=exec_q
     )
 
@@ -306,13 +377,28 @@ def dpnp_matmul(
         # and be C_CONTIGUOUS or F_CONTIGUOUS
         dep_events_list = []
         host_tasks_list = []
-        x1 = _get_gemm_contig_array(
-            x1, dep_events_list, host_tasks_list, gemm_dtype
+        contig_copy = not (x1.flags.c_contiguous or x1.flags.f_contiguous)
+        x1 = _copy_array(
+            x1,
+            dep_events_list,
+            host_tasks_list,
+            contig_copy=contig_copy,
+            dtype=gemm_dtype,
         )
-        x2 = _get_gemm_contig_array(
-            x2, dep_events_list, host_tasks_list, gemm_dtype
+        contig_copy = not (x2.flags.c_contiguous or x2.flags.f_contiguous)
+        x2 = _copy_array(
+            x2,
+            dep_events_list,
+            host_tasks_list,
+            contig_copy=contig_copy,
+            dtype=gemm_dtype,
         )
 
+        # TODO: investigate usage of gemv (gemv_batch) function
+        # from BLAS when one of the inputs is a vector to
+        # gain performance.
+        # TODO: investigate usage of syrk function from BLAS in
+        # case of a.T @ a and a @ a.T to gain performance.
         if x1_is_2D and x2_is_2D:
             ht_blas_ev, _ = bi._gemm(
                 exec_q,
@@ -340,8 +426,8 @@ def dpnp_matmul(
         host_tasks_list.append(ht_blas_ev)
         dpctl.SyclEvent.wait_for(host_tasks_list)
 
-    if squeeze_flag:
-        result = dpnp.squeeze(result)
+    if appended_axes:
+        result = dpnp.squeeze(result, tuple(appended_axes))
 
     if x1_is_2D and x2_is_2D:
         # add new axes only if one of the input arrays
diff --git a/tests/skipped_tests.tbl b/tests/skipped_tests.tbl
index 018255c1e40..f91a4f23289 100644
--- a/tests/skipped_tests.tbl
+++ b/tests/skipped_tests.tbl
@@ -331,13 +331,12 @@ tests/third_party/cupy/linalg_tests/test_einsum.py::TestEinSumUnaryOperationWith
 tests/third_party/cupy/linalg_tests/test_einsum.py::TestListArgEinSumError::test_dim_mismatch3
 tests/third_party/cupy/linalg_tests/test_einsum.py::TestListArgEinSumError::test_invalid_sub1
 tests/third_party/cupy/linalg_tests/test_einsum.py::TestListArgEinSumError::test_too_many_dims3
+
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_invlarge
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_large
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_of_two
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_dot_vec2
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_multidim_vdot
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_tensordot_zero_dim
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_dot_with_out_f_contiguous
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_multidim_vdot
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot_with_int_axes
diff --git a/tests/skipped_tests_gpu.tbl b/tests/skipped_tests_gpu.tbl
index fe3671ecf7f..c3464096085 100644
--- a/tests/skipped_tests_gpu.tbl
+++ b/tests/skipped_tests_gpu.tbl
@@ -151,8 +151,6 @@ tests/third_party/cupy/linalg_tests/test_einsum.py::TestEinSumError::test_too_ma
 tests/third_party/cupy/linalg_tests/test_einsum.py::TestListArgEinSumError::test_dim_mismatch3
 tests/third_party/cupy/linalg_tests/test_einsum.py::TestListArgEinSumError::test_too_many_dims3
 
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_reversed_vdot
-
 tests/third_party/cupy/random_tests/test_distributions.py::TestDistributionsMultivariateNormal_param_0_{d=2, shape=(4, 3, 2)}::test_normal
 tests/third_party/cupy/random_tests/test_distributions.py::TestDistributionsMultivariateNormal_param_1_{d=2, shape=(3, 2)}::test_normal
 tests/third_party/cupy/random_tests/test_distributions.py::TestDistributionsMultivariateNormal_param_2_{d=4, shape=(4, 3, 2)}::test_normal
@@ -435,17 +433,17 @@ tests/third_party/cupy/linalg_tests/test_einsum.py::TestEinSumLarge_param_9_{opt
 tests/third_party/cupy/linalg_tests/test_einsum.py::TestEinSumUnaryOperationWithScalar::test_scalar_float
 tests/third_party/cupy/linalg_tests/test_einsum.py::TestEinSumUnaryOperationWithScalar::test_scalar_int
 tests/third_party/cupy/linalg_tests/test_einsum.py::TestListArgEinSumError::test_invalid_sub1
+
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_invlarge
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_large
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_of_two
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_dot_vec2
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_multidim_vdot
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot_with_int_axes
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot_with_list_axes
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_tensordot_zero_dim
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_dot_with_out_f_contiguous
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_multidim_vdot
+tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_reversed_vdot
 
 tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_broadcast_not_allowed
 tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_diff_dtypes_is_equal
diff --git a/tests/skipped_tests_gpu_no_fp64.tbl b/tests/skipped_tests_gpu_no_fp64.tbl
index 26e11a70062..d724a6043e5 100644
--- a/tests/skipped_tests_gpu_no_fp64.tbl
+++ b/tests/skipped_tests_gpu_no_fp64.tbl
@@ -30,91 +30,6 @@ tests/test_umath.py::test_umaths[('floor_divide', 'ff')]
 tests/third_party/cupy/linalg_tests/test_eigenvalue.py::TestEigenvalue_param_0_{UPLO='U'}::test_eigh_batched
 tests/third_party/cupy/linalg_tests/test_eigenvalue.py::TestEigenvalue_param_1_{UPLO='L'}::test_eigh_batched
 
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_0_{shape=((2, 3, 4), (3, 4, 2)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_1_{shape=((2, 3, 4), (3, 4, 2)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_2_{shape=((2, 3, 4), (3, 4, 2)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_3_{shape=((2, 3, 4), (3, 4, 2)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_4_{shape=((1, 1), (1, 1)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_5_{shape=((1, 1), (1, 1)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_6_{shape=((1, 1), (1, 1)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_7_{shape=((1, 1), (1, 1)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_8_{shape=((1, 1), (1, 2)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_9_{shape=((1, 1), (1, 2)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_10_{shape=((1, 1), (1, 2)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_11_{shape=((1, 1), (1, 2)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_12_{shape=((1, 2), (2, 1)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_13_{shape=((1, 2), (2, 1)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_14_{shape=((1, 2), (2, 1)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_15_{shape=((1, 2), (2, 1)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_16_{shape=((2, 1), (1, 1)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_17_{shape=((2, 1), (1, 1)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_18_{shape=((2, 1), (1, 1)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_19_{shape=((2, 1), (1, 1)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_20_{shape=((1, 2), (2, 3)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_21_{shape=((1, 2), (2, 3)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_22_{shape=((1, 2), (2, 3)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_23_{shape=((1, 2), (2, 3)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_24_{shape=((2, 1), (1, 3)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_25_{shape=((2, 1), (1, 3)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_26_{shape=((2, 1), (1, 3)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_27_{shape=((2, 1), (1, 3)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_28_{shape=((2, 3), (3, 1)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_29_{shape=((2, 3), (3, 1)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_30_{shape=((2, 3), (3, 1)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_31_{shape=((2, 3), (3, 1)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_32_{shape=((2, 3), (3, 4)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_33_{shape=((2, 3), (3, 4)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_34_{shape=((2, 3), (3, 4)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_35_{shape=((2, 3), (3, 4)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_36_{shape=((0, 3), (3, 4)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_37_{shape=((0, 3), (3, 4)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_38_{shape=((0, 3), (3, 4)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_39_{shape=((0, 3), (3, 4)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_40_{shape=((2, 3), (3, 0)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_41_{shape=((2, 3), (3, 0)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_42_{shape=((2, 3), (3, 0)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_43_{shape=((2, 3), (3, 0)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_44_{shape=((0, 3), (3, 0)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_45_{shape=((0, 3), (3, 0)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_46_{shape=((0, 3), (3, 0)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_47_{shape=((0, 3), (3, 0)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_48_{shape=((3, 0), (0, 4)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_49_{shape=((3, 0), (0, 4)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_50_{shape=((3, 0), (0, 4)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_51_{shape=((3, 0), (0, 4)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_52_{shape=((2, 3, 0), (3, 0, 2)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_53_{shape=((2, 3, 0), (3, 0, 2)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_54_{shape=((2, 3, 0), (3, 0, 2)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_55_{shape=((2, 3, 0), (3, 0, 2)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_56_{shape=((0, 0), (0, 0)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_57_{shape=((0, 0), (0, 0)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_58_{shape=((0, 0), (0, 0)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_59_{shape=((0, 0), (0, 0)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_60_{shape=((3,), (3,)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_61_{shape=((3,), (3,)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_62_{shape=((3,), (3,)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_63_{shape=((3,), (3,)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_64_{shape=((2,), (2, 4)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_65_{shape=((2,), (2, 4)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_66_{shape=((2,), (2, 4)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_67_{shape=((2,), (2, 4)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_68_{shape=((4, 2), (2,)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_69_{shape=((4, 2), (2,)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_70_{shape=((4, 2), (2,)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_71_{shape=((4, 2), (2,)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_0_{shape=((), ()), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_1_{shape=((), ()), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_2_{shape=((), ()), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_3_{shape=((), ()), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_4_{shape=((), (2, 4)), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_5_{shape=((), (2, 4)), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_6_{shape=((), (2, 4)), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_7_{shape=((), (2, 4)), trans_a=False, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_8_{shape=((4, 2), ()), trans_a=True, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_9_{shape=((4, 2), ()), trans_a=True, trans_b=False}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_10_{shape=((4, 2), ()), trans_a=False, trans_b=True}::test_dot
-tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_11_{shape=((4, 2), ()), trans_a=False, trans_b=False}::test_dot
-
 tests/third_party/cupy/random_tests/test_distributions.py::TestDistributionsBeta_param_6_{a_shape=(3, 2), b_shape=(3, 2), shape=(4, 3, 2)}::test_beta
 tests/third_party/cupy/random_tests/test_distributions.py::TestDistributionsBeta_param_7_{a_shape=(3, 2), b_shape=(3, 2), shape=(3, 2)}::test_beta
 tests/third_party/cupy/random_tests/test_distributions.py::TestDistributionsChisquare_param_0_{df_shape=(), shape=(4, 3, 2)}::test_chisquare
diff --git a/tests/test_dot.py b/tests/test_dot.py
index 80da5090e1b..55884b00cd3 100644
--- a/tests/test_dot.py
+++ b/tests/test_dot.py
@@ -1,52 +1,373 @@
+import dpctl
 import numpy
 import pytest
 from numpy.testing import assert_allclose, assert_array_equal
 
-import dpnp as inp
+import dpnp
 
-from .helper import get_all_dtypes
+from .helper import assert_dtype_allclose, get_all_dtypes, get_complex_dtypes
 
 
-@pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True))
-def test_dot_ones(type):
-    n = 10**5
-    a = numpy.ones(n, dtype=type)
-    b = numpy.ones(n, dtype=type)
-    ia = inp.array(a)
-    ib = inp.array(b)
-
-    result = inp.dot(ia, ib)
-    expected = numpy.dot(a, b)
-    assert_array_equal(expected, result)
+class Testdot:
+    @pytest.mark.parametrize("dtype", get_all_dtypes())
+    def test_dot_ones(self, dtype):
+        n = 10**5
+        a = numpy.ones(n, dtype=dtype)
+        b = numpy.ones(n, dtype=dtype)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+
+        result = dpnp.dot(ia, ib)
+        expected = numpy.dot(a, b)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    def test_dot_arange(self, dtype):
+        n = 10**2
+        m = 10**3 if dtype is not dpnp.float32 else 10**2
+        a = numpy.hstack((numpy.arange(n, dtype=dtype),) * m)
+        b = numpy.flipud(a)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+
+        result = dpnp.dot(ia, ib)
+        expected = numpy.dot(a, b)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype", get_all_dtypes())
+    def test_dot_scalar(self, dtype):
+        a = 2
+        b = numpy.array(numpy.random.uniform(-5, 5, 10), dtype=dtype)
+        ib = dpnp.array(b)
+
+        result = dpnp.dot(a, ib)
+        expected = numpy.dot(a, b)
+        assert_allclose(result, expected)
+
+    # TODO: get rid of the fallback on NumPy when tensordot
+    # is implemented using OneMKL
+    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_complex=True))
+    @pytest.mark.parametrize(
+        "array_info",
+        [
+            (1, 10, (), (10,)),
+            (10, 1, (10,), ()),
+            (1, 1, (), ()),
+            (10, 10, (10,), (10,)),
+            (12, 6, (4, 3), (3, 2)),
+            (12, 3, (4, 3), (3,)),
+            (60, 3, (5, 4, 3), (3,)),
+            (4, 8, (4,), (4, 2)),
+            (60, 48, (5, 3, 4), (6, 4, 2)),
+        ],
+        ids=[
+            "0d_1d",
+            "1d_0d",
+            "0d_0d",
+            "1d_1d",
+            "2d_2d",
+            "2d_1d",
+            "3d_1d",
+            "1d_2d",
+            "3d_3d",
+        ],
+    )
+    def test_dot(self, dtype, array_info):
+        size1, size2, shape1, shape2 = array_info
+        a = numpy.array(
+            numpy.random.uniform(-5, 5, size1), dtype=dtype
+        ).reshape(shape1)
+        b = numpy.array(
+            numpy.random.uniform(-5, 5, size2), dtype=dtype
+        ).reshape(shape2)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+
+        result = dpnp.dot(ia, ib)
+        expected = numpy.dot(a, b)
+        assert_dtype_allclose(result, expected)
+
+    # TODO: get rid of the fallback on NumPy when tensordot
+    # is implemented using OneMKL
+    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize("dtype", get_complex_dtypes())
+    @pytest.mark.parametrize(
+        "array_info",
+        [
+            (1, 10, (), (10,)),
+            (10, 1, (10,), ()),
+            (1, 1, (), ()),
+            (10, 10, (10,), (10,)),
+            (12, 6, (4, 3), (3, 2)),
+            (12, 3, (4, 3), (3,)),
+            (60, 3, (5, 4, 3), (3,)),
+            (4, 8, (4,), (4, 2)),
+            (60, 48, (5, 3, 4), (6, 4, 2)),
+        ],
+        ids=[
+            "0d_1d",
+            "1d_0d",
+            "0d_0d",
+            "1d_1d",
+            "2d_2d",
+            "2d_1d",
+            "3d_1d",
+            "1d_2d",
+            "3d_3d",
+        ],
+    )
+    def test_dot_complex(self, dtype, array_info):
+        size1, size2, shape1, shape2 = array_info
+        x11 = numpy.random.uniform(-5, 5, size1)
+        x12 = numpy.random.uniform(-5, 5, size1)
+        x21 = numpy.random.uniform(-5, 5, size2)
+        x22 = numpy.random.uniform(-5, 5, size2)
+        a = numpy.array(x11 + 1j * x12, dtype=dtype).reshape(shape1)
+        b = numpy.array(x21 + 1j * x22, dtype=dtype).reshape(shape2)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+
+        result = dpnp.dot(ia, ib)
+        expected = numpy.dot(a, b)
+        assert_dtype_allclose(result, expected)
+
+    # TODO: get rid of the fallback on NumPy when tensordot
+    # is implemented using OneMKL
+    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize("dtype", get_all_dtypes())
+    @pytest.mark.parametrize(
+        "array_info",
+        [
+            (1, 10, (), (10,)),
+            (10, 1, (10,), ()),
+            (1, 1, (), ()),
+            (10, 10, (10,), (10,)),
+            (12, 6, (4, 3), (3, 2)),
+            (12, 3, (4, 3), (3,)),
+            (60, 3, (5, 4, 3), (3,)),
+            (4, 8, (4,), (4, 2)),
+            (60, 48, (5, 3, 4), (6, 4, 2)),
+        ],
+        ids=[
+            "0d_1d",
+            "1d_0d",
+            "0d_0d",
+            "1d_1d",
+            "2d_2d",
+            "2d_1d",
+            "3d_1d",
+            "1d_2d",
+            "3d_3d",
+        ],
+    )
+    def test_dot_ndarray(self, dtype, array_info):
+        size1, size2, shape1, shape2 = array_info
+        a = numpy.array(
+            numpy.random.uniform(-5, 5, size1), dtype=dtype
+        ).reshape(shape1)
+        b = numpy.array(
+            numpy.random.uniform(-5, 5, size2), dtype=dtype
+        ).reshape(shape2)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+
+        result = ia.dot(ib)
+        expected = a.dot(b)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    def test_dot_strided(self, dtype):
+        a = numpy.arange(25, dtype=dtype)
+        b = numpy.arange(25, dtype=dtype)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+
+        result = dpnp.dot(ia[::3], ib[::3])
+        expected = numpy.dot(a[::3], b[::3])
+        assert_dtype_allclose(result, expected)
+
+        result = dpnp.dot(ia, ib[::-1])
+        expected = numpy.dot(a, b[::-1])
+        assert_dtype_allclose(result, expected)
+
+        result = dpnp.dot(ia[::-2], ib[::-2])
+        expected = numpy.dot(a[::-2], b[::-2])
+        assert_dtype_allclose(result, expected)
+
+        result = dpnp.dot(ia[::-5], ib[::-5])
+        expected = numpy.dot(a[::-5], b[::-5])
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    def test_dot_out_scalar(self, dtype):
+        size = 10
+        a = 2
+        b = numpy.array(numpy.random.uniform(-5, 5, size), dtype=dtype)
+        ia = 2
+        ib = dpnp.array(b)
+
+        dp_out = dpnp.empty((size,), dtype=dtype)
+        result = dpnp.dot(ia, ib, out=dp_out)
+        expected = numpy.dot(a, b)
+
+        assert result is dp_out
+        assert_allclose(result, expected)
+
+    # TODO: get rid of the fallback on NumPy when tensordot
+    # is implemented using OneMKL
+    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize("dtype", get_all_dtypes())
+    @pytest.mark.parametrize(
+        "array_info",
+        [
+            (1, 10, (), (10,), (10,)),
+            (10, 1, (10,), (), (10,)),
+            (1, 1, (), (), ()),
+            (10, 10, (10,), (10,), ()),
+            (12, 6, (4, 3), (3, 2), (4, 2)),
+            (12, 3, (4, 3), (3,), (4,)),
+            (60, 3, (5, 4, 3), (3,), (5, 4)),
+            (4, 8, (4,), (4, 2), (2,)),
+            (60, 48, (5, 3, 4), (6, 4, 2), (5, 3, 6, 2)),
+        ],
+        ids=[
+            "0d_1d",
+            "1d_0d",
+            "0d_0d",
+            "1d_1d",
+            "2d_2d",
+            "2d_1d",
+            "3d_1d",
+            "1d_2d",
+            "3d_3d",
+        ],
+    )
+    def test_dot_out(self, dtype, array_info):
+        size1, size2, shape1, shape2, out_shape = array_info
+        a = numpy.array(
+            numpy.random.uniform(-5, 5, size1), dtype=dtype
+        ).reshape(shape1)
+        b = numpy.array(
+            numpy.random.uniform(-5, 5, size2), dtype=dtype
+        ).reshape(shape2)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+
+        dp_out = dpnp.empty(out_shape, dtype=dtype)
+        result = dpnp.dot(ia, ib, out=dp_out)
+        expected = numpy.dot(a, b)
+
+        assert result is dp_out
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype1", get_all_dtypes())
+    @pytest.mark.parametrize("dtype2", get_all_dtypes())
+    def test_dot_input_dtype_matrix(self, dtype1, dtype2):
+        a = numpy.array(numpy.random.uniform(-5, 5, 10), dtype=dtype1)
+        b = numpy.array(numpy.random.uniform(-5, 5, 10), dtype=dtype2)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+
+        result = dpnp.dot(ia, ib)
+        expected = numpy.dot(a, b)
+        assert_dtype_allclose(result, expected)
+
+    def test_dot_1d_error(self):
+        a = dpnp.ones(25)
+        b = dpnp.ones(24)
+        # sizes of the input arrays differ
+        with pytest.raises(ValueError):
+            dpnp.dot(a, b)
+
+    def test_dot_sycl_queue_error(self):
+        a = dpnp.ones((5,), sycl_queue=dpctl.SyclQueue())
+        b = dpnp.ones((5,), sycl_queue=dpctl.SyclQueue())
+        with pytest.raises(ValueError):
+            dpnp.dot(a, b)
+
+    # NumPy does not raise an error for the following test;
+    # it just does not update the out keyword if it is not properly defined
+    @pytest.mark.parametrize("ia", [1, dpnp.ones((), dtype=dpnp.int32)])
+    def test_dot_out_error_scalar(self, ia):
+        ib = dpnp.ones(10, dtype=dpnp.int32)
+
+        # output data type is incorrect
+        dp_out = dpnp.empty((10,), dtype=dpnp.int64)
+        # TODO: change it to ValueError once an updated
+        # dpctl is used in internal CI
+        with pytest.raises((ValueError, TypeError)):
+            dpnp.dot(ia, ib, out=dp_out)
+
+        # output shape is incorrect
+        dp_out = dpnp.empty((2,), dtype=dpnp.int32)
+        # TODO: change it to ValueError once an updated
+        # dpctl is used in internal CI
+        with pytest.raises((ValueError, TypeError)):
+            dpnp.dot(ia, ib, out=dp_out)
+
+    # TODO: get rid of the fallback on NumPy when tensordot
+    # is implemented using OneMKL
+    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize(
+        "shape_pair",
+        [
+            ((10,), (10,), ()),
+            ((3, 4), (4, 2), (3, 2)),
+            ((3, 4), (4,), (3,)),
+            ((5, 4, 3), (3,), (5, 4)),
+            ((4,), (3, 4, 2), (3, 2)),
+            ((5, 3, 4), (6, 4, 2), (5, 3, 6, 2)),
+        ],
+        ids=["1d_1d", "2d_2d", "2d_1d", "3d_1d", "1d_3d", "3d_3d"],
+    )
+    def test_dot_out_error(self, shape_pair):
+        shape1, shape2, shape_out = shape_pair
+        a = numpy.ones(shape1, dtype=numpy.int32)
+        b = numpy.ones(shape2, dtype=numpy.int32)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
 
+        # output data type is incorrect
+        np_out = numpy.empty(shape_out, dtype=numpy.int64)
+        dp_out = dpnp.empty(shape_out, dtype=dpnp.int64)
+        with pytest.raises(TypeError):
+            dpnp.dot(ia, ib, out=dp_out)
+        with pytest.raises(ValueError):
+            numpy.dot(a, b, out=np_out)
 
-@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
-def test_dot_arange(dtype):
-    n = 10**2
-    m = 10**3 if dtype is not inp.float32 else 10**2
-    a = numpy.hstack((numpy.arange(n, dtype=dtype),) * m)
-    b = numpy.flipud(a)
-    ia = inp.array(a)
-    ib = inp.array(b)
+        # output shape is incorrect
+        np_out = numpy.empty((2, 3), dtype=numpy.int32)
+        dp_out = dpnp.empty((2, 3), dtype=dpnp.int32)
+        with pytest.raises(ValueError):
+            dpnp.dot(ia, ib, out=dp_out)
+        with pytest.raises(ValueError):
+            numpy.dot(a, b, out=np_out)
 
-    result = inp.dot(ia, ib)
-    expected = numpy.dot(a, b)
-    assert_allclose(expected, result)
+        # "F" or "C" is irrelevant for 0d or 1d arrays
+        if not (len(shape_out) in [0, 1]):
+            # output should be C-contiguous
+            np_out = numpy.empty(shape_out, dtype=numpy.int32, order="F")
+            dp_out = dpnp.empty(shape_out, dtype=dpnp.int32, order="F")
+            with pytest.raises(ValueError):
+                dpnp.dot(ia, ib, out=dp_out)
+            with pytest.raises(ValueError):
+                numpy.dot(a, b, out=np_out)
 
 
 @pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True))
 def test_multi_dot(type):
     n = 16
-    a = inp.reshape(inp.arange(n, dtype=type), (4, 4))
-    b = inp.reshape(inp.arange(n, dtype=type), (4, 4))
-    c = inp.reshape(inp.arange(n, dtype=type), (4, 4))
-    d = inp.reshape(inp.arange(n, dtype=type), (4, 4))
+    a = dpnp.reshape(dpnp.arange(n, dtype=type), (4, 4))
+    b = dpnp.reshape(dpnp.arange(n, dtype=type), (4, 4))
+    c = dpnp.reshape(dpnp.arange(n, dtype=type), (4, 4))
+    d = dpnp.reshape(dpnp.arange(n, dtype=type), (4, 4))
 
     a1 = numpy.arange(n, dtype=type).reshape((4, 4))
     b1 = numpy.arange(n, dtype=type).reshape((4, 4))
     c1 = numpy.arange(n, dtype=type).reshape((4, 4))
     d1 = numpy.arange(n, dtype=type).reshape((4, 4))
 
-    result = inp.linalg.multi_dot([a, b, c, d])
+    result = dpnp.linalg.multi_dot([a, b, c, d])
     expected = numpy.linalg.multi_dot([a1, b1, c1, d1])
     assert_array_equal(expected, result)
diff --git a/tests/test_mathematical.py b/tests/test_mathematical.py
index 1faa0620f7d..56be3db6d92 100644
--- a/tests/test_mathematical.py
+++ b/tests/test_mathematical.py
@@ -2517,6 +2517,7 @@ class TestMatmul:
             ((4,), (4,)),
             ((4,), (4, 2)),
             ((2, 4), (4,)),
+            ((1, 4), (4,)),  # output should be 1-d not 0-d
             ((2, 4), (4, 3)),
             ((1, 2, 3), (1, 3, 5)),
             ((4, 2, 3), (4, 3, 5)),
@@ -2672,7 +2673,7 @@ def test_matmul_dtype(self, dtype, shape_pair):
             "((6, 7, 4, 3), (6, 7, 3, 5))",
         ],
     )
-    def test_matmul_dtype_matrix_inputs(self, dtype1, dtype2, shape_pair):
+    def test_matmul_dtype_matrix_inout(self, dtype1, dtype2, shape_pair):
         shape1, shape2 = shape_pair
         a1 = numpy.arange(numpy.prod(shape1), dtype=dtype1).reshape(shape1)
         a2 = numpy.arange(numpy.prod(shape2), dtype=dtype1).reshape(shape2)
@@ -2703,7 +2704,7 @@ def test_matmul_dtype_matrix_inputs(self, dtype1, dtype2, shape_pair):
             "((6, 7, 4, 3), (6, 7, 3, 5))",
         ],
     )
-    def test_matmul_dtype_matrix_inout(self, dtype1, dtype2, shape_pair):
+    def test_matmul_dtype_matrix_inputs(self, dtype1, dtype2, shape_pair):
         shape1, shape2 = shape_pair
         a1 = numpy.arange(numpy.prod(shape1), dtype=dtype1).reshape(shape1)
         a2 = numpy.arange(numpy.prod(shape2), dtype=dtype2).reshape(shape2)
diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py
index 78a869fac9d..a8b8be52009 100644
--- a/tests/test_sycl_queue.py
+++ b/tests/test_sycl_queue.py
@@ -534,8 +534,8 @@ def test_reduce_hypot(device):
         ),
         pytest.param(
             "dot",
-            [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]],
-            [[4.0, 4.0], [4.0, 4.0], [4.0, 4.0]],
+            [3.0, 4.0, 5.0],
+            [1.0, 2.0, 3.0],
         ),
         pytest.param(
             "floor_divide", [1.0, 2.0, 3.0, 4.0], [2.5, 2.5, 2.5, 2.5]
@@ -842,8 +842,8 @@ def test_out_1in_1out(func, data, device):
         ),
         pytest.param(
             "dot",
-            [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]],
-            [[4.0, 4.0], [4.0, 4.0], [4.0, 4.0]],
+            [3.0, 4.0, 5.0],
+            [1.0, 2.0, 3.0],
         ),
         pytest.param(
             "floor_divide", [1.0, 2.0, 3.0, 4.0], [2.5, 2.5, 2.5, 2.5]
diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py
index 5a29e677747..171e979facf 100644
--- a/tests/test_usm_type.py
+++ b/tests/test_usm_type.py
@@ -494,8 +494,8 @@ def test_1in_1out(func, data, usm_type):
         pytest.param("copysign", [0.0, 1.0, 2.0], [-1.0, 0.0, 1.0]),
         pytest.param(
             "dot",
-            [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]],
-            [[4.0, 4.0], [4.0, 4.0], [4.0, 4.0]],
+            [3.0, 4.0, 5.0],
+            [1.0, 2.0, 3.0],
         ),
         pytest.param("fmax", [[0.0, 1.0, 2.0]], [[3.0, 4.0, 5.0]]),
         pytest.param("fmin", [[0.0, 1.0, 2.0]], [[3.0, 4.0, 5.0]]),
diff --git a/tests/third_party/cupy/linalg_tests/test_eigenvalue.py b/tests/third_party/cupy/linalg_tests/test_eigenvalue.py
index 99dcfb2127c..b620bd39e98 100644
--- a/tests/third_party/cupy/linalg_tests/test_eigenvalue.py
+++ b/tests/third_party/cupy/linalg_tests/test_eigenvalue.py
@@ -15,12 +15,6 @@ def _get_hermitian(xp, a, UPLO):
         return xp.tril(a) + xp.tril(a, k=-1).swapaxes(-2, -1).conj()
 
 
-# TODO:
-# remove once dpnp.dot and dpnp.matmul support complex types
-def _wrap_as_numpy_array(xp, a):
-    return a.asnumpy() if xp is cupy else a
-
-
 @testing.parameterize(
     *testing.product(
         {
@@ -57,20 +51,12 @@ def test_eigh(self, xp, dtype):
         else:
             tol = 1e-5
 
-        # TODO: remove _wrap_as_numpy_array() once @ support complex types
-        testing.assert_allclose(
-            _wrap_as_numpy_array(xp, A) @ _wrap_as_numpy_array(xp, v),
-            _wrap_as_numpy_array(xp, v)
-            @ numpy.diag(_wrap_as_numpy_array(xp, w)),
-            atol=tol,
-            rtol=tol,
-        )
+        testing.assert_allclose(A @ v, v @ xp.diag(w), atol=tol, rtol=tol)
 
         # Check if v @ vt is an identity matrix
         testing.assert_allclose(
-            _wrap_as_numpy_array(xp, v)
-            @ _wrap_as_numpy_array(xp, v).swapaxes(-2, -1).conj(),
-            numpy.identity(_wrap_as_numpy_array(xp, A).shape[-1], _dtype),
+            v @ v.swapaxes(-2, -1).conj(),
+            xp.identity(A.shape[-1], _dtype),
             atol=tol,
             rtol=tol,
         )
@@ -121,11 +107,6 @@ def test_eigh_complex_batched(self, xp, dtype):
         # them through the eigen equation A*v=w*v.
         A = _get_hermitian(xp, a, self.UPLO)
 
-        # TODO: remove _wrap_as_numpy_array() once dpnp.dot() support complex types
-        A = _wrap_as_numpy_array(xp, A)
-        v = _wrap_as_numpy_array(xp, v)
-        w = _wrap_as_numpy_array(xp, w)
-
         for i in range(a.shape[0]):
             testing.assert_allclose(
                 A[i].dot(v[i]), w[i] * v[i], rtol=1e-5, atol=1e-5
diff --git a/tests/third_party/cupy/linalg_tests/test_product.py b/tests/third_party/cupy/linalg_tests/test_product.py
index 93b13c93e87..1fd048356b4 100644
--- a/tests/third_party/cupy/linalg_tests/test_product.py
+++ b/tests/third_party/cupy/linalg_tests/test_product.py
@@ -36,10 +36,12 @@
         }
     )
 )
-@testing.gpu
+# TODO: remove the fallback to NumPy once tensordot
+# is implemented using OneMKL
+@pytest.mark.usefixtures("allow_fall_back_on_numpy")
 class TestDot(unittest.TestCase):
     @testing.for_all_dtypes_combination(["dtype_a", "dtype_b"])
-    @testing.numpy_cupy_allclose()
+    @testing.numpy_cupy_allclose(type_check=has_support_aspect64())
     def test_dot(self, xp, dtype_a, dtype_b):
         shape_a, shape_b = self.shape
         if self.trans_a:
@@ -71,8 +73,13 @@ def test_dot_with_out(self, xp, dtype_a, dtype_b, dtype_c):
         else:
             shape_c = shape_a[:-1] + shape_b[:-2] + shape_b[-1:]
         c = xp.empty(shape_c, dtype=dtype_c)
-        out = xp.dot(a, b, out=c)
-        self.assertIs(out, c)
+        try:
+            out = xp.dot(a, b, out=c)
+        except TypeError:
+            # When the output dtype is incorrect, NumPy raises ValueError
+            # while DPNP raises TypeError, so it is re-raised as ValueError
+            raise ValueError
+        assert out is c
         return c
 
 
@@ -128,10 +135,11 @@ def test_cross(self, xp, dtype_a, dtype_b):
         }
     )
 )
-@testing.gpu
 class TestDotFor0Dim(unittest.TestCase):
     @testing.for_all_dtypes_combination(["dtype_a", "dtype_b"])
-    @testing.numpy_cupy_allclose(contiguous_check=False)
+    @testing.numpy_cupy_allclose(
+        type_check=has_support_aspect64(), contiguous_check=False
+    )
     def test_dot(self, xp, dtype_a, dtype_b):
         shape_a, shape_b = self.shape
         if self.trans_a:
@@ -145,8 +153,7 @@ def test_dot(self, xp, dtype_a, dtype_b):
         return xp.dot(a, b)
 
 
-@testing.gpu
-class TestProduct(unittest.TestCase):
+class TestProduct:
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
     def test_dot_vec1(self, xp, dtype):
@@ -154,6 +161,9 @@ def test_dot_vec1(self, xp, dtype):
         b = testing.shaped_arange((2,), xp, dtype)
         return xp.dot(a, b)
 
+    # TODO: remove the fallback to NumPy once tensordot
+    # is implemented using OneMKL
+    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
     def test_dot_vec2(self, xp, dtype):
@@ -168,6 +178,9 @@ def test_dot_vec3(self, xp, dtype):
         b = testing.shaped_arange((2,), xp, dtype)
         return xp.dot(a, b)
 
+    # TODO: remove the fallback to NumPy once tensordot
+    # is implemented using OneMKL
+    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
     def test_transposed_dot(self, xp, dtype):
@@ -175,6 +188,9 @@ def test_transposed_dot(self, xp, dtype):
         b = testing.shaped_arange((2, 3, 4), xp, dtype).transpose(0, 2, 1)
         return xp.dot(a, b)
 
+    # TODO: remove the fallback to NumPy once tensordot
+    # is implemented using OneMKL
+    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
     def test_transposed_dot_with_out(self, xp, dtype):
@@ -184,6 +200,9 @@ def test_transposed_dot_with_out(self, xp, dtype):
         xp.dot(a, b, out=c)
         return c
 
+    # TODO: remove the fallback to NumPy once tensordot
+    # is implemented using OneMKL
+    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @testing.for_all_dtypes()
     def test_transposed_dot_with_out_f_contiguous(self, dtype):
         for xp in (numpy, cupy):
diff --git a/tests/third_party/cupy/math_tests/test_matmul.py b/tests/third_party/cupy/math_tests/test_matmul.py
index d21ec7a2d68..887ed9ae1b9 100644
--- a/tests/third_party/cupy/math_tests/test_matmul.py
+++ b/tests/third_party/cupy/math_tests/test_matmul.py
@@ -73,6 +73,61 @@ def test_cupy_matmul(self, xp, dtype1):
         return xp.matmul(x1, x2)
 
 
+@testing.parameterize(
+    *testing.product(
+        {
+            "shape_pair": [
+                # dot test
+                ((2, 3), (3, 4), (2, 4)),
+                # ((0,), (0,), (0,)),
+                # matmul test
+                ((5, 3, 2), (5, 2, 4), (5, 3, 4)),
+                ((0, 3, 2), (0, 2, 4), (0, 3, 4)),
+            ],
+        }
+    )
+)
+class TestMatmulOut(unittest.TestCase):
+    @testing.for_all_dtypes(name="dtype1")
+    @testing.for_all_dtypes(name="dtype2")
+    @testing.numpy_cupy_allclose(
+        rtol=1e-3, atol=1e-3, accept_error=TypeError  # required for uint8
+    )
+    def test_cupy_matmul_noncontiguous(self, xp, dtype1, dtype2):
+        x1 = testing.shaped_arange(self.shape_pair[0], xp, dtype1)
+        x2 = testing.shaped_arange(self.shape_pair[1], xp, dtype2)
+        out = xp.zeros(self.shape_pair[2], dtype=dtype1)[::-1]
+        ret = xp.matmul(x1, x2, out=out)
+        assert ret is out
+        return ret
+
+    @testing.for_all_dtypes(name="dtype1")
+    @testing.for_all_dtypes(name="dtype2")
+    @testing.numpy_cupy_allclose(rtol=1e-3, atol=1e-3)  # required for uint8
+    def test_cupy_matmul_out_cast(self, xp, dtype1, dtype2):
+        x1 = testing.shaped_arange(self.shape_pair[0], xp, dtype1)
+        x2 = testing.shaped_arange(self.shape_pair[1], xp, dtype2)
+        out = xp.zeros(self.shape_pair[2], dtype=bool)
+        ret = xp.matmul(x1, x2, out=out, casting="unsafe")
+        assert ret is out
+        return ret
+
+
+class TestMatmulOutOverlap:
+    @pytest.mark.parametrize(
+        "shape",
+        [
+            (900, 900),
+            (2, 600, 600),
+        ],
+    )
+    @testing.for_dtypes([numpy.int32, numpy.float64])
+    @testing.numpy_cupy_allclose(rtol=1e-5, atol=1e-5)
+    def test_overlap_both(self, xp, dtype, shape):
+        a = xp.ones(shape, dtype=dtype)
+        return xp.matmul(a, a, out=a)
+
+
 @testing.parameterize(
     *testing.product(
         {

From b03261258eabf2d688decd16b874e22cebbf4de3 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Wed, 7 Feb 2024 17:18:45 +0100
Subject: [PATCH 19/29] Add support of numpy 1.26.3 (#1690)

---
 conda-recipe/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index 99e50c706c0..7c9a10c4ff7 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -13,7 +13,7 @@ requirements:
     host:
       - python
       - setuptools
-      - numpy >=1.19,<1.25a0
+      - numpy >=1.19,<1.27a0
       - cython
       - cmake >=3.21
       - ninja

From 666486f5ae168b60ee6fe668cbe0977759b10557 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Wed, 7 Feb 2024 19:38:29 +0100
Subject: [PATCH 20/29] Pin version of packages installed with pip command
 (#1696)

---
 .github/workflows/build-sphinx.yml       | 2 +-
 .github/workflows/generate_coverage.yaml | 6 +++---
 scripts/install_python_deps.sh           | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/build-sphinx.yml b/.github/workflows/build-sphinx.yml
index 9de0097e120..e1719d01ae1 100644
--- a/.github/workflows/build-sphinx.yml
+++ b/.github/workflows/build-sphinx.yml
@@ -103,7 +103,7 @@ jobs:
       - name: Install sphinx dependencies
         run: |
           conda install sphinx sphinx_rtd_theme
-          pip install sphinxcontrib-googleanalytics
+          pip install sphinxcontrib-googleanalytics==0.4
 
       - name: Install dpnp dependencies
         run: |
diff --git a/.github/workflows/generate_coverage.yaml b/.github/workflows/generate_coverage.yaml
index 432377ce10c..e0faec5b567 100644
--- a/.github/workflows/generate_coverage.yaml
+++ b/.github/workflows/generate_coverage.yaml
@@ -79,7 +79,7 @@ jobs:
       - name: Install coverall dependencies
         run: |
           sudo gem install coveralls-lcov
-          pip install coveralls==3.2.0
+          pip install coveralls==3.3.1
 
       - name: Upload coverage data to coveralls.io
         run: |
@@ -102,7 +102,7 @@ jobs:
     steps:
     - name: Finished
       run: |
-        pip3 install --upgrade coveralls
-        coveralls --finish
+        pip3 install --upgrade coveralls==3.3.1
+        coveralls --service=github --finish
       env:
         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/scripts/install_python_deps.sh b/scripts/install_python_deps.sh
index e40d9a5b34a..f538e28446c 100755
--- a/scripts/install_python_deps.sh
+++ b/scripts/install_python_deps.sh
@@ -11,7 +11,7 @@ echo ========================= Conda: remove mkl ===============================
 conda remove mkl --force -y || true
 
 echo ========================= PIP3: install prerequisites ==========================
-pip3 install pytest-valgrind
+pip3 install pytest-valgrind==0.2.0
 
 echo ========================= SW versions ==========================================
 conda list

From 1a3866e494bc48ff060eae5328e640699d9082a6 Mon Sep 17 00:00:00 2001
From: vtavana <120411540+vtavana@users.noreply.github.com>
Date: Wed, 7 Feb 2024 17:38:32 -0600
Subject: [PATCH 21/29] update `dpnp.vdot` implementation (#1692)

* update dpnp_vdot

* address comments

* address more comments
---
 dpnp/backend/extensions/blas/CMakeLists.txt   |   1 +
 dpnp/backend/extensions/blas/blas_py.cpp      |  10 +
 dpnp/backend/extensions/blas/dot.hpp          |   8 +
 dpnp/backend/extensions/blas/dotc.cpp         | 241 ++++++++++++++++++
 dpnp/backend/extensions/blas/types_matrix.hpp |  24 ++
 dpnp/dpnp_iface.py                            |  17 +-
 dpnp/dpnp_iface_linearalgebra.py              |  67 ++++-
 dpnp/dpnp_utils/dpnp_utils_linearalgebra.py   |  22 +-
 tests/skipped_tests.tbl                       |   3 -
 tests/skipped_tests_gpu.tbl                   |   3 -
 tests/test_dot.py                             | 144 ++++++++++-
 tests/test_sycl_queue.py                      |  25 +-
 tests/test_usm_type.py                        |  15 +-
 13 files changed, 537 insertions(+), 43 deletions(-)
 create mode 100644 dpnp/backend/extensions/blas/dotc.cpp

diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt
index fe3a92d2181..692c1c0ec59 100644
--- a/dpnp/backend/extensions/blas/CMakeLists.txt
+++ b/dpnp/backend/extensions/blas/CMakeLists.txt
@@ -28,6 +28,7 @@ set(python_module_name _blas_impl)
 set(_module_src
     ${CMAKE_CURRENT_SOURCE_DIR}/blas_py.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dot.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dotc.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dotu.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/gemm.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/gemm_batch.cpp
diff --git a/dpnp/backend/extensions/blas/blas_py.cpp b/dpnp/backend/extensions/blas/blas_py.cpp
index 7d5237381b1..d091923e63e 100644
--- a/dpnp/backend/extensions/blas/blas_py.cpp
+++ b/dpnp/backend/extensions/blas/blas_py.cpp
@@ -40,6 +40,7 @@ namespace py = pybind11;
 void init_dispatch_tables(void)
 {
     blas_ext::init_dot_dispatch_table();
+    blas_ext::init_dotc_dispatch_table();
     blas_ext::init_dotu_dispatch_table();
     blas_ext::init_gemm_batch_dispatch_table();
     blas_ext::init_gemm_dispatch_table();
@@ -57,6 +58,15 @@ PYBIND11_MODULE(_blas_impl, m)
               py::arg("result"), py::arg("depends") = py::list());
     }
 
+    {
+        m.def("_dotc", &blas_ext::dotc,
+              "Call `dotc` from OneMKL LAPACK library to return "
+              "the dot product of two complex vectors, "
+              "conjugating the first vector.",
+              py::arg("sycl_queue"), py::arg("vectorA"), py::arg("vectorB"),
+              py::arg("result"), py::arg("depends") = py::list());
+    }
+
     {
         m.def("_dotu", &blas_ext::dotu,
               "Call `dotu` from OneMKL LAPACK library to return "
diff --git a/dpnp/backend/extensions/blas/dot.hpp b/dpnp/backend/extensions/blas/dot.hpp
index 3468196f760..914355b7f1e 100644
--- a/dpnp/backend/extensions/blas/dot.hpp
+++ b/dpnp/backend/extensions/blas/dot.hpp
@@ -45,6 +45,13 @@ extern std::pair<sycl::event, sycl::event>
         dpctl::tensor::usm_ndarray result,
         const std::vector<sycl::event> &depends);
 
+extern std::pair<sycl::event, sycl::event>
+    dotc(sycl::queue &exec_q,
+         dpctl::tensor::usm_ndarray vectorA,
+         dpctl::tensor::usm_ndarray vectorB,
+         dpctl::tensor::usm_ndarray result,
+         const std::vector<sycl::event> &depends);
+
 extern std::pair<sycl::event, sycl::event>
     dotu(sycl::queue &exec_q,
          dpctl::tensor::usm_ndarray vectorA,
@@ -53,6 +60,7 @@ extern std::pair<sycl::event, sycl::event>
          const std::vector<sycl::event> &depends);
 
 extern void init_dot_dispatch_table(void);
+extern void init_dotc_dispatch_table(void);
 extern void init_dotu_dispatch_table(void);
 } // namespace blas
 } // namespace ext
diff --git a/dpnp/backend/extensions/blas/dotc.cpp b/dpnp/backend/extensions/blas/dotc.cpp
new file mode 100644
index 00000000000..3f9e7e17b4c
--- /dev/null
+++ b/dpnp/backend/extensions/blas/dotc.cpp
@@ -0,0 +1,241 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "dot.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace blas
+{
+namespace mkl_blas = oneapi::mkl::blas;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+typedef sycl::event (*dotc_impl_fn_ptr_t)(sycl::queue &,
+                                          const std::int64_t,
+                                          char *,
+                                          const std::int64_t,
+                                          char *,
+                                          const std::int64_t,
+                                          char *,
+                                          const std::vector<sycl::event> &);
+
+static dotc_impl_fn_ptr_t dotc_dispatch_table[dpctl_td_ns::num_types]
+                                             [dpctl_td_ns::num_types];
+
+template <typename Tab, typename Tc>
+static sycl::event dotc_impl(sycl::queue &exec_q,
+                             const std::int64_t n,
+                             char *vectorA,
+                             const std::int64_t stride_a,
+                             char *vectorB,
+                             const std::int64_t stride_b,
+                             char *result,
+                             const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<Tab>(exec_q);
+    type_utils::validate_type_for_device<Tc>(exec_q);
+
+    Tab *a = reinterpret_cast<Tab *>(vectorA);
+    Tab *b = reinterpret_cast<Tab *>(vectorB);
+    Tc *res = reinterpret_cast<Tc *>(result);
+
+    std::stringstream error_msg;
+    bool is_exception_caught = false;
+
+    sycl::event dotc_event;
+    try {
+        dotc_event = mkl_blas::row_major::dotc(exec_q,
+                                               n, // size of the input vectors
+                                               a, // Pointer to vector a.
+                                               stride_a, // Stride of vector a.
+                                               b,        // Pointer to vector b.
+                                               stride_b, // Stride of vector b.
+                                               res,      // Pointer to result.
+                                               depends);
+    } catch (oneapi::mkl::exception const &e) {
+        error_msg
+            << "Unexpected MKL exception caught during dotc() call:\nreason: "
+            << e.what();
+        is_exception_caught = true;
+    } catch (sycl::exception const &e) {
+        error_msg << "Unexpected SYCL exception caught during dotc() call:\n"
+                  << e.what();
+        is_exception_caught = true;
+    }
+
+    if (is_exception_caught) // an unexpected error occurs
+    {
+        throw std::runtime_error(error_msg.str());
+    }
+
+    return dotc_event;
+}
+
+std::pair<sycl::event, sycl::event>
+    dotc(sycl::queue &exec_q,
+         dpctl::tensor::usm_ndarray vectorA,
+         dpctl::tensor::usm_ndarray vectorB,
+         dpctl::tensor::usm_ndarray result,
+         const std::vector<sycl::event> &depends)
+{
+    const int vectorA_nd = vectorA.get_ndim();
+    const int vectorB_nd = vectorB.get_ndim();
+    const int result_nd = result.get_ndim();
+
+    if ((vectorA_nd != 1)) {
+        throw py::value_error(
+            "The first input array has ndim=" + std::to_string(vectorA_nd) +
+            ", but a 1-dimensional array is expected.");
+    }
+
+    if ((vectorB_nd != 1)) {
+        throw py::value_error(
+            "The second input array has ndim=" + std::to_string(vectorB_nd) +
+            ", but a 1-dimensional array is expected.");
+    }
+
+    if ((result_nd != 0)) {
+        throw py::value_error(
+            "The output array has ndim=" + std::to_string(result_nd) +
+            ", but a 0-dimensional array is expected.");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(vectorA, result)) {
+        throw py::value_error(
+            "The first input array and output array are overlapping "
+            "segments of memory");
+    }
+    if (overlap(vectorB, result)) {
+        throw py::value_error(
+            "The second input array and output array are overlapping "
+            "segments of memory");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(
+            exec_q,
+            {vectorA.get_queue(), vectorB.get_queue(), result.get_queue()}))
+    {
+        throw py::value_error(
+            "USM allocations are not compatible with the execution queue.");
+    }
+
+    py::ssize_t a_size = vectorA.get_size();
+    py::ssize_t b_size = vectorB.get_size();
+    if (a_size != b_size) {
+        throw py::value_error("The size of the first input array must be "
+                              "equal to the size of the second input array.");
+    }
+
+    std::vector<py::ssize_t> a_stride = vectorA.get_strides_vector();
+    std::vector<py::ssize_t> b_stride = vectorB.get_strides_vector();
+
+    const std::int64_t n = a_size;
+    const std::int64_t str_a = a_stride[0];
+    const std::int64_t str_b = b_stride[0];
+
+    int vectorA_typenum = vectorA.get_typenum();
+    int vectorB_typenum = vectorB.get_typenum();
+    int result_typenum = result.get_typenum();
+
+    if (vectorA_typenum != vectorB_typenum) {
+        throw py::value_error(
+            "Input arrays must be of must be of the same type.");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int vectorAB_type_id = array_types.typenum_to_lookup_id(vectorA_typenum);
+    int result_type_id = array_types.typenum_to_lookup_id(result_typenum);
+
+    dotc_impl_fn_ptr_t dotc_fn =
+        dotc_dispatch_table[vectorAB_type_id][result_type_id];
+    if (dotc_fn == nullptr) {
+        throw py::value_error(
+            "Types of input vectors and result array are mismatched.");
+    }
+
+    char *a_typeless_ptr = vectorA.get_data();
+    char *b_typeless_ptr = vectorB.get_data();
+    char *r_typeless_ptr = result.get_data();
+
+    const int a_elemsize = vectorA.get_elemsize();
+    const int b_elemsize = vectorB.get_elemsize();
+    if (str_a < 0) {
+        a_typeless_ptr -= (n - 1) * std::abs(str_a) * a_elemsize;
+    }
+    if (str_b < 0) {
+        b_typeless_ptr -= (n - 1) * std::abs(str_b) * b_elemsize;
+    }
+
+    sycl::event dotc_ev =
+        dotc_fn(exec_q, n, a_typeless_ptr, str_a, b_typeless_ptr, str_b,
+                r_typeless_ptr, depends);
+
+    sycl::event args_ev = dpctl::utils::keep_args_alive(
+        exec_q, {vectorA, vectorB, result}, {dotc_ev});
+
+    return std::make_pair(args_ev, dotc_ev);
+}
+
+template <typename fnT, typename Tab, typename Tc>
+struct DotcContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::DotcTypePairSupportFactory<Tab, Tc>::is_defined) {
+            return dotc_impl<Tab, Tc>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_dotc_dispatch_table(void)
+{
+    dpctl_td_ns::DispatchTableBuilder<dotc_impl_fn_ptr_t, DotcContigFactory,
+                                      dpctl_td_ns::num_types>
+        contig;
+    contig.populate_dispatch_table(dotc_dispatch_table);
+}
+} // namespace blas
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/blas/types_matrix.hpp b/dpnp/backend/extensions/blas/types_matrix.hpp
index c36ae0e2045..44e297d47e7 100644
--- a/dpnp/backend/extensions/blas/types_matrix.hpp
+++ b/dpnp/backend/extensions/blas/types_matrix.hpp
@@ -62,6 +62,30 @@ struct DotTypePairSupportFactory
         dpctl_td_ns::NotDefinedEntry>::is_defined;
 };
 
+/**
+ * @brief A factory to define pairs of supported types for which
+ * MKL BLAS library provides support in oneapi::mkl::blas::dotc<Tab, Tc>
+ * function.
+ *
+ * @tparam Tab Type of arrays containing input vectors A and B.
+ * @tparam Tc Type of array containing output.
+ */
+template <typename Tab, typename Tc>
+struct DotcTypePairSupportFactory
+{
+    static constexpr bool is_defined = std::disjunction<
+        dpctl_td_ns::TypePairDefinedEntry<Tab,
+                                          std::complex<float>,
+                                          Tc,
+                                          std::complex<float>>,
+        dpctl_td_ns::TypePairDefinedEntry<Tab,
+                                          std::complex<double>,
+                                          Tc,
+                                          std::complex<double>>,
+        // fall-through
+        dpctl_td_ns::NotDefinedEntry>::is_defined;
+};
+
 /**
  * @brief A factory to define pairs of supported types for which
  * MKL BLAS library provides support in oneapi::mkl::blas::dotu<Tab, Tc>
diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py
index 9aee27b73bc..e37c2e090a6 100644
--- a/dpnp/dpnp_iface.py
+++ b/dpnp/dpnp_iface.py
@@ -205,7 +205,7 @@ def astype(x1, dtype, order="K", casting="unsafe", copy=True):
     return dpnp_array._create_from_usm_ndarray(array_obj)
 
 
-def check_supported_arrays_type(*arrays, scalar_type=False):
+def check_supported_arrays_type(*arrays, scalar_type=False, all_scalars=False):
     """
     Return ``True`` if each array has either type of scalar,
     :class:`dpnp.ndarray` or :class:`dpctl.tensor.usm_ndarray`.
@@ -216,7 +216,9 @@ def check_supported_arrays_type(*arrays, scalar_type=False):
     arrays : {dpnp_array, usm_ndarray}
         Input arrays to check for supported types.
     scalar_type : {bool}, optional
-        A scalar type is also considered as supported if flag is True.
+        A scalar type is also considered as supported if flag is ``True``.
+    all_scalars : {bool}, optional
+        All the input arrays can be scalar if flag is ``True``.
 
     Returns
     -------
@@ -231,13 +233,22 @@ def check_supported_arrays_type(*arrays, scalar_type=False):
 
     """
 
+    any_is_array = False
     for a in arrays:
-        if scalar_type and dpnp.isscalar(a) or is_supported_array_type(a):
+        if is_supported_array_type(a):
+            any_is_array = True
+            continue
+        elif scalar_type and dpnp.isscalar(a):
             continue
 
         raise TypeError(
             "An array must be any of supported type, but got {}".format(type(a))
         )
+
+    if len(arrays) > 1 and not (all_scalars or any_is_array):
+        raise TypeError(
+            "At least one input must be of supported array type, but got all scalars."
+        )
     return True
 
 
diff --git a/dpnp/dpnp_iface_linearalgebra.py b/dpnp/dpnp_iface_linearalgebra.py
index 9d63f7f8c3d..bffe881b626 100644
--- a/dpnp/dpnp_iface_linearalgebra.py
+++ b/dpnp/dpnp_iface_linearalgebra.py
@@ -121,8 +121,7 @@ def dot(a, b, out=None):
 
     """
 
-    dpnp.check_supported_arrays_type(a, scalar_type=True)
-    dpnp.check_supported_arrays_type(b, scalar_type=True)
+    dpnp.check_supported_arrays_type(a, b, scalar_type=True)
 
     if out is not None:
         dpnp.check_supported_arrays_type(out)
@@ -333,8 +332,7 @@ def matmul(
 
     """
 
-    dpnp.check_supported_arrays_type(x1)
-    dpnp.check_supported_arrays_type(x2)
+    dpnp.check_supported_arrays_type(x1, x2)
     if subok is False:
         raise NotImplementedError(
             "subok keyword argument is only supported by its default value."
@@ -444,19 +442,68 @@ def tensordot(x1, x2, axes=2):
     return call_origin(numpy.tensordot, x1, x2, axes)
 
 
-def vdot(*args, **kwargs):
+def vdot(a, b):
     """
     Return the dot product of two vectors.
 
-    For full documentation refer to :obj:`numpy.vdot`.
+    For full documentation refer to :obj:`numpy.vdot`.
+
+    Parameters
+    ----------
+    a : {dpnp_array, usm_ndarray, scalar}
+        First input array. Both inputs `a` and `b` can not be
+        scalars at the same time. If `a` is complex, the complex
+        conjugate is taken before the calculation of the dot product.
+    b : {dpnp_array, usm_ndarray, scalar}
+        Second input array. Both inputs `a` and `b` can not be
+        scalars at the same time.
+
+    Returns
+    -------
+    out : dpnp.ndarray
+        Returns the dot product of `a` and `b`.
 
     See Also
     --------
     :obj:`dpnp.dot` : Returns the dot product.
+    :obj:`dpnp.matmul` : Returns the matrix product.
+
+    Examples
+    --------
+    >>> import dpnp as np
+    >>> a = np.array([1+2j,3+4j])
+    >>> b = np.array([5+6j,7+8j])
+    >>> np.vdot(a, b)
+    array(70-8j)
+    >>> np.vdot(b, a)
+    array(70+8j)
 
-    Notes
-    -----
-    This function works the same as :obj:`dpnp.dot`.
+    Note that higher-dimensional arrays are flattened!
+
+    >>> a = np.array([[1, 4], [5, 6]])
+    >>> b = np.array([[4, 1], [2, 2]])
+    >>> np.vdot(a, b)
+    array(30)
+    >>> np.vdot(b, a)
+    array(30)
+    >>> 1*4 + 4*1 + 5*2 + 6*2
+    30
 
     """
-    return dpnp.dot(*args, **kwargs)
+
+    dpnp.check_supported_arrays_type(a, b, scalar_type=True)
+
+    if dpnp.isscalar(a) or dpnp.isscalar(b):
+        if dpnp.isscalar(b) and a.size != 1:
+            raise ValueError("The first array should be of size one.")
+        if dpnp.isscalar(a) and b.size != 1:
+            raise ValueError("The second array should be of size one.")
+        a_conj = numpy.conj(a) if dpnp.isscalar(a) else dpnp.conj(a)
+        # TODO: investigate usage of axpy (axpy_batch) or scal
+        # functions from BLAS here instead of dpnp.multiply
+        return dpnp.multiply(a_conj, b)
+    elif a.ndim == 1 and b.ndim == 1:
+        return dpnp_dot(a, b, out=None, conjugate=True)
+    else:
+        # dot product of flattened arrays
+        return dpnp_dot(dpnp.ravel(a), dpnp.ravel(b), out=None, conjugate=True)
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index 65d97befa98..bf1a3417704 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -175,7 +175,7 @@ def _op_res_dtype(*arrays, dtype, casting, sycl_queue):
             res_dtype = dtype
         else:
             raise TypeError(
-                f"Cannot cast ufunc 'matmul' output from dtype({res_dtype}) to dtype({dtype}) with casting rule {casting}"
+                f"Cannot cast from dtype({res_dtype}) to dtype({dtype}) with casting rule {casting}"
             )
 
     op_dtype = (
@@ -185,16 +185,18 @@ def _op_res_dtype(*arrays, dtype, casting, sycl_queue):
     return op_dtype, res_dtype
 
 
-def dpnp_dot(a, b, /, out=None):
+def dpnp_dot(a, b, /, out=None, *, conjugate=False):
     """
     Return the dot product of two arrays.
 
     The routine that is used to perform the main calculation
-    depends on input array data types: 1) For integer and boolean data types,
+    depends on input arrays data type: 1) For integer and boolean data types,
     `dpctl.tensor.vecdot` form the Data Parallel Control library is used,
-    2) For floating point real-valued data types, `dot` routines from
-    BLAS library of OneMKL is used, and 3) For complex data types,
-    `dotu` routines from BLAS library of OneMKL is used.
+    2) For real-valued floating point data types, `dot` routines from
+    BLAS library of OneMKL are used, and 3) For complex data types,
+    `dotu` or `dotc` routines from BLAS library of OneMKL are used.
+    If `conjugate` is ``False``, `dotu` is used. Otherwise, `dotc` is used,
+    for which the first array is conjugated before calculating the dot product.
 
     """
 
@@ -228,7 +230,11 @@ def dpnp_dot(a, b, /, out=None):
         a = _copy_array(a, dep_events_list, host_tasks_list, dtype=dot_dtype)
         b = _copy_array(b, dep_events_list, host_tasks_list, dtype=dot_dtype)
         if dpnp.issubdtype(res_dtype, dpnp.complexfloating):
-            ht_ev, _ = bi._dotu(
+            if conjugate:
+                dot_func = "_dotc"
+            else:
+                dot_func = "_dotu"
+            ht_ev, _ = getattr(bi, dot_func)(
                 exec_q,
                 dpnp.get_usm_ndarray(a),
                 dpnp.get_usm_ndarray(b),
@@ -253,7 +259,7 @@ def dpnp_dot(a, b, /, out=None):
     if dot_dtype != res_dtype:
         result = result.astype(res_dtype, copy=False)
 
-    # NumPy does not allow casting even if it is safe
+    # numpy.dot does not allow casting even if it is safe
     return dpnp.get_result_array(result, out, casting="no")
 
 
diff --git a/tests/skipped_tests.tbl b/tests/skipped_tests.tbl
index f91a4f23289..a38624e3757 100644
--- a/tests/skipped_tests.tbl
+++ b/tests/skipped_tests.tbl
@@ -335,13 +335,10 @@ tests/third_party/cupy/linalg_tests/test_einsum.py::TestListArgEinSumError::test
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_invlarge
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_large
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_of_two
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_multidim_vdot
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_tensordot_zero_dim
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_multidim_vdot
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot_with_int_axes
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot_with_list_axes
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_reversed_vdot
 
 tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_broadcast_not_allowed
 tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_diff_dtypes_is_equal
diff --git a/tests/skipped_tests_gpu.tbl b/tests/skipped_tests_gpu.tbl
index c3464096085..ce6f6aef984 100644
--- a/tests/skipped_tests_gpu.tbl
+++ b/tests/skipped_tests_gpu.tbl
@@ -437,13 +437,10 @@ tests/third_party/cupy/linalg_tests/test_einsum.py::TestListArgEinSumError::test
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_invlarge
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_large
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_of_two
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_multidim_vdot
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot_with_int_axes
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot_with_list_axes
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_tensordot_zero_dim
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_multidim_vdot
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_reversed_vdot
 
 tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_broadcast_not_allowed
 tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_diff_dtypes_is_equal
diff --git a/tests/test_dot.py b/tests/test_dot.py
index 55884b00cd3..42478db9634 100644
--- a/tests/test_dot.py
+++ b/tests/test_dot.py
@@ -8,7 +8,7 @@
 from .helper import assert_dtype_allclose, get_all_dtypes, get_complex_dtypes
 
 
-class Testdot:
+class TestDot:
     @pytest.mark.parametrize("dtype", get_all_dtypes())
     def test_dot_ones(self, dtype):
         n = 10**5
@@ -371,3 +371,145 @@ def test_multi_dot(type):
     result = dpnp.linalg.multi_dot([a, b, c, d])
     expected = numpy.linalg.multi_dot([a1, b1, c1, d1])
     assert_array_equal(expected, result)
+
+
+class TestVdot:
+    @pytest.mark.parametrize("dtype", get_all_dtypes())
+    def test_vdot_scalar(self, dtype):
+        a = numpy.array([3.5], dtype=dtype)
+        ia = dpnp.array(a)
+        b = 2 + 3j
+
+        result = dpnp.vdot(ia, b)
+        expected = numpy.vdot(a, b)
+        assert_allclose(result, expected)
+
+        result = dpnp.vdot(b, ia)
+        expected = numpy.vdot(b, a)
+        assert_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_complex=True))
+    @pytest.mark.parametrize(
+        "array_info",
+        [
+            (1, 1, (), ()),
+            (10, 10, (10,), (10,)),
+            (12, 12, (4, 3), (3, 4)),
+            (12, 12, (4, 3), (12,)),
+            (60, 60, (5, 4, 3), (60,)),
+            (8, 8, (8,), (4, 2)),
+            (60, 60, (5, 3, 4), (3, 4, 5)),
+        ],
+        ids=[
+            "0d_0d",
+            "1d_1d",
+            "2d_2d",
+            "2d_1d",
+            "3d_1d",
+            "1d_2d",
+            "3d_3d",
+        ],
+    )
+    def test_vdot(self, dtype, array_info):
+        size1, size2, shape1, shape2 = array_info
+        a = numpy.array(
+            numpy.random.uniform(-5, 5, size1), dtype=dtype
+        ).reshape(shape1)
+        b = numpy.array(
+            numpy.random.uniform(-5, 5, size2), dtype=dtype
+        ).reshape(shape2)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+
+        result = dpnp.vdot(ia, ib)
+        expected = numpy.vdot(a, b)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype", get_complex_dtypes())
+    @pytest.mark.parametrize(
+        "array_info",
+        [
+            (1, 1, (), ()),
+            (10, 10, (10,), (10,)),
+            (12, 12, (4, 3), (3, 4)),
+            (12, 12, (4, 3), (12,)),
+            (60, 60, (5, 4, 3), (60,)),
+            (8, 8, (8,), (4, 2)),
+            (60, 60, (5, 3, 4), (3, 4, 5)),
+        ],
+        ids=[
+            "0d_0d",
+            "1d_1d",
+            "2d_2d",
+            "2d_1d",
+            "3d_1d",
+            "1d_2d",
+            "3d_3d",
+        ],
+    )
+    def test_vdot_complex(self, dtype, array_info):
+        size1, size2, shape1, shape2 = array_info
+        x11 = numpy.random.uniform(-5, 5, size1)
+        x12 = numpy.random.uniform(-5, 5, size1)
+        x21 = numpy.random.uniform(-5, 5, size2)
+        x22 = numpy.random.uniform(-5, 5, size2)
+        a = numpy.array(x11 + 1j * x12, dtype=dtype).reshape(shape1)
+        b = numpy.array(x21 + 1j * x22, dtype=dtype).reshape(shape2)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+
+        result = dpnp.vdot(ia, ib)
+        expected = numpy.vdot(a, b)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    def test_vdot_strided(self, dtype):
+        a = numpy.arange(25, dtype=dtype)
+        b = numpy.arange(25, dtype=dtype)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+
+        result = dpnp.vdot(ia[::3], ib[::3])
+        expected = numpy.vdot(a[::3], b[::3])
+        assert_dtype_allclose(result, expected)
+
+        result = dpnp.vdot(ia, ib[::-1])
+        expected = numpy.vdot(a, b[::-1])
+        assert_dtype_allclose(result, expected)
+
+        result = dpnp.vdot(ia[::-2], ib[::-2])
+        expected = numpy.vdot(a[::-2], b[::-2])
+        assert_dtype_allclose(result, expected)
+
+        result = dpnp.vdot(ia[::-5], ib[::-5])
+        expected = numpy.vdot(a[::-5], b[::-5])
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype1", get_all_dtypes())
+    @pytest.mark.parametrize("dtype2", get_all_dtypes())
+    def test_vdot_input_dtype_matrix(self, dtype1, dtype2):
+        a = numpy.array(numpy.random.uniform(-5, 5, 10), dtype=dtype1)
+        b = numpy.array(numpy.random.uniform(-5, 5, 10), dtype=dtype2)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+
+        result = dpnp.vdot(ia, ib)
+        expected = numpy.vdot(a, b)
+        assert_dtype_allclose(result, expected)
+
+    def test_vdot_error(self):
+        a = dpnp.ones(25)
+        b = dpnp.ones(24)
+        # size of input arrays differ
+        with pytest.raises(ValueError):
+            dpnp.vdot(a, b)
+
+        a = dpnp.ones(25)
+        b = 2
+        # The first array should be of size one
+        with pytest.raises(ValueError):
+            dpnp.vdot(a, b)
+
+        # The second array should be of size one
+        with pytest.raises(ValueError):
+            dpnp.vdot(b, a)
diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py
index a8b8be52009..f6329d8f216 100644
--- a/tests/test_sycl_queue.py
+++ b/tests/test_sycl_queue.py
@@ -532,11 +532,11 @@ def test_reduce_hypot(device):
         pytest.param(
             "divide", [0.0, 1.0, 2.0, 3.0, 4.0], [4.0, 4.0, 4.0, 4.0, 4.0]
         ),
-        pytest.param(
-            "dot",
-            [3.0, 4.0, 5.0],
-            [1.0, 2.0, 3.0],
-        ),
+        # dpnp.dot has 3 different implementations based on input arrays dtype
+        # checking all of them
+        pytest.param("dot", [3.0, 4.0, 5.0], [1.0, 2.0, 3.0]),
+        pytest.param("dot", [3, 4, 5], [1, 2, 3]),
+        pytest.param("dot", [3 + 2j, 4 + 1j, 5], [1, 2 + 3j, 3]),
         pytest.param(
             "floor_divide", [1.0, 2.0, 3.0, 4.0], [2.5, 2.5, 2.5, 2.5]
         ),
@@ -579,6 +579,11 @@ def test_reduce_hypot(device):
             [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
             [0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0],
         ),
+        # dpnp.vdot has 3 different implementations based on input arrays dtype
+        # checking all of them
+        pytest.param("vdot", [3.0, 4.0, 5.0], [1.0, 2.0, 3.0]),
+        pytest.param("vdot", [3, 4, 5], [1, 2, 3]),
+        pytest.param("vdot", [3 + 2j, 4 + 1j, 5], [1, 2 + 3j, 3]),
     ],
 )
 @pytest.mark.parametrize(
@@ -840,11 +845,11 @@ def test_out_1in_1out(func, data, device):
         pytest.param(
             "divide", [0.0, 1.0, 2.0, 3.0, 4.0], [4.0, 4.0, 4.0, 4.0, 4.0]
         ),
-        pytest.param(
-            "dot",
-            [3.0, 4.0, 5.0],
-            [1.0, 2.0, 3.0],
-        ),
+        # dpnp.dot has 3 different implementations based on input arrays dtype
+        # checking all of them
+        pytest.param("dot", [3.0, 4.0, 5.0], [1.0, 2.0, 3.0]),
+        pytest.param("dot", [3, 4, 5], [1, 2, 3]),
+        pytest.param("dot", [3 + 2j, 4 + 1j, 5], [1, 2 + 3j, 3]),
         pytest.param(
             "floor_divide", [1.0, 2.0, 3.0, 4.0], [2.5, 2.5, 2.5, 2.5]
         ),
diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py
index 171e979facf..29101cf9f48 100644
--- a/tests/test_usm_type.py
+++ b/tests/test_usm_type.py
@@ -492,11 +492,11 @@ def test_1in_1out(func, data, usm_type):
         ),
         pytest.param("arctan2", [[-1, +1, +1, -1]], [[-1, -1, +1, +1]]),
         pytest.param("copysign", [0.0, 1.0, 2.0], [-1.0, 0.0, 1.0]),
-        pytest.param(
-            "dot",
-            [3.0, 4.0, 5.0],
-            [1.0, 2.0, 3.0],
-        ),
+        # dpnp.dot has 3 different implementations based on input arrays dtype
+        # checking all of them
+        pytest.param("dot", [3.0, 4.0, 5.0], [1.0, 2.0, 3.0]),
+        pytest.param("dot", [3, 4, 5], [1, 2, 3]),
+        pytest.param("dot", [3 + 2j, 4 + 1j, 5], [1, 2 + 3j, 3]),
         pytest.param("fmax", [[0.0, 1.0, 2.0]], [[3.0, 4.0, 5.0]]),
         pytest.param("fmin", [[0.0, 1.0, 2.0]], [[3.0, 4.0, 5.0]]),
         pytest.param(
@@ -505,6 +505,11 @@ def test_1in_1out(func, data, usm_type):
         pytest.param("logaddexp", [[-1, 2, 5, 9]], [[4, -3, 2, -8]]),
         pytest.param("maximum", [[0.0, 1.0, 2.0]], [[3.0, 4.0, 5.0]]),
         pytest.param("minimum", [[0.0, 1.0, 2.0]], [[3.0, 4.0, 5.0]]),
+        # dpnp.vdot has 3 different implementations based on input arrays dtype
+        # checking all of them
+        pytest.param("vdot", [3.0, 4.0, 5.0], [1.0, 2.0, 3.0]),
+        pytest.param("vdot", [3, 4, 5], [1, 2, 3]),
+        pytest.param("vdot", [3 + 2j, 4 + 1j, 5], [1, 2 + 3j, 3]),
     ],
 )
 @pytest.mark.parametrize("usm_type_x", list_of_usm_types, ids=list_of_usm_types)

From d45bb24d58dcbdb7229a776bc4d5b7768293d824 Mon Sep 17 00:00:00 2001
From: vtavana <120411540+vtavana@users.noreply.github.com>
Date: Wed, 7 Feb 2024 22:58:58 -0600
Subject: [PATCH 22/29] Improve performance of `dpnp.matmul` and `dpnp.dot`
 with `out` keyword (#1694)

* use out keyword for result

* fix strided or overlapping out

* address comments

* fix typo

* remove additional check
---
 dpnp/dpnp_iface.py                          | 21 ++++----
 dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 53 ++++++++++++++++-----
 2 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py
index e37c2e090a6..d8838e67c8d 100644
--- a/dpnp/dpnp_iface.py
+++ b/dpnp/dpnp_iface.py
@@ -495,17 +495,20 @@ def get_result_array(a, out=None, casting="safe"):
     if out is None:
         return a
     else:
-        dpnp.check_supported_arrays_type(out)
-        if out.shape != a.shape:
-            raise ValueError(
-                f"Output array of shape {a.shape} is needed, got {out.shape}."
-            )
-        elif isinstance(out, dpt.usm_ndarray):
-            out = dpnp_array._create_from_usm_ndarray(out)
+        if a is out:
+            return out
+        else:
+            dpnp.check_supported_arrays_type(out)
+            if out.shape != a.shape:
+                raise ValueError(
+                    f"Output array of shape {a.shape} is needed, got {out.shape}."
+                )
+            elif isinstance(out, dpt.usm_ndarray):
+                out = dpnp_array._create_from_usm_ndarray(out)
 
-        dpnp.copyto(out, a, casting=casting)
+            dpnp.copyto(out, a, casting=casting)
 
-        return out
+            return out
 
 
 def get_usm_ndarray(a):
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index bf1a3417704..3c36eda042d 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -36,6 +36,41 @@
 __all__ = ["dpnp_dot", "dpnp_matmul"]
 
 
+def _create_result_array(x1, x2, out, shape, dtype, usm_type, sycl_queue):
+    """
+    Create the result array.
+
+    If `out` is not ``None`` and its features match the specified `shape`,
+    `dtype`, `usm_type`, and `sycl_queue`, and it is C-contiguous or
+    F-contiguous and has no memory overlap with `x1` or `x2`, `out` itself is
+    returned. If these conditions are not satisfied, an empty array is returned
+    with the specified `shape`, `dtype`, `usm_type`, and `sycl_queue`.
+    """
+
+    if out is not None:
+        x1_usm = dpnp.get_usm_ndarray(x1)
+        x2_usm = dpnp.get_usm_ndarray(x2)
+        out_usm = dpnp.get_usm_ndarray(out)
+
+        if (
+            out.dtype == dtype
+            and out.shape == shape
+            and out.usm_type == usm_type
+            and out.sycl_queue == sycl_queue
+            and (out.flags.c_contiguous or out.flags.f_contiguous)
+            and not ti._array_overlap(x1_usm, out_usm)
+            and not ti._array_overlap(x2_usm, out_usm)
+        ):
+            return out
+
+    return dpnp.empty(
+        shape,
+        dtype=dtype,
+        usm_type=usm_type,
+        sycl_queue=sycl_queue,
+    )
+
+
 def _copy_array(x, dep_events, host_events, contig_copy=False, dtype=None):
     """
     Creating a copy of input array if needed.
@@ -214,14 +249,9 @@ def dpnp_dot(a, b, /, out=None, *, conjugate=False):
         a, b, dtype=None, casting="no", sycl_queue=exec_q
     )
 
-    # create result array
-    result = dpnp.empty(
-        (),
-        dtype=dot_dtype,
-        usm_type=res_usm_type,
-        sycl_queue=exec_q,
+    result = _create_result_array(
+        a, b, out, (), dot_dtype, res_usm_type, exec_q
     )
-
     # input arrays should have the proper data type
     dep_events_list = []
     host_tasks_list = []
@@ -367,13 +397,10 @@ def dpnp_matmul(
         x2_shape = x2.shape
         res_shape = tuple(tmp_shape) + (x1_shape[-2], x2_shape[-1])
 
-    # calculate results
-    result = dpnp.empty(
-        res_shape,
-        dtype=gemm_dtype,
-        usm_type=res_usm_type,
-        sycl_queue=exec_q,
+    result = _create_result_array(
+        x1, x2, out, res_shape, gemm_dtype, res_usm_type, exec_q
     )
+    # calculate result
     if result.size == 0:
         pass
     elif x1.size == 0 or x2.size == 0:

From 1e8675368d598faf34d120ef5260a386159cd810 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Thu, 8 Feb 2024 13:52:36 +0100
Subject: [PATCH 23/29] Update dpnp.linalg.qr() function (#1673)

* Impl dpnp.linalg.qr for 2d array

* Add cupy tests for dpnp.linalg.qr

* Add batch implementation of dpnp.linalg.qr

* Remove an old impl of dpnp_qr

* Update test_qr in test_sycl_queue

* Add test_qr in test_usm_type

* Use _real_type for _orgqr

* Use _real_type for _orgqr_batch

* Update dpnp tests for dpnp.linalg.qr

* Pass scratchpad_size to the error message test

* Add additional checks

* Extend error handler for mkl batch funcs

* Add ungqr mkl extension to support complex dtype

* Update tau array size check for orgqr

* Add ungqr_batch mkl extension to support complex dtype

* Add arrays type check

* Fix test_det_singular_matrix

* Expand tests for dpnp.linalg.qr with complex types

* Update examples

* Remove astype for output arrays

* Use empty_like instead of empty

* Use ht_list_ev with dpctl.SyclEvent.wait_for

* Add _triu_inplace func

* Use copy_usm for a_t array overwritten by geqrf/geqrf_batch


---------

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/backend/extensions/lapack/CMakeLists.txt |   6 +
 dpnp/backend/extensions/lapack/geqrf.cpp      | 262 +++++++++++
 dpnp/backend/extensions/lapack/geqrf.hpp      |  63 +++
 .../backend/extensions/lapack/geqrf_batch.cpp | 273 +++++++++++
 dpnp/backend/extensions/lapack/lapack_py.cpp  |  55 +++
 dpnp/backend/extensions/lapack/orgqr.cpp      | 263 +++++++++++
 dpnp/backend/extensions/lapack/orgqr.hpp      |  67 +++
 .../backend/extensions/lapack/orgqr_batch.cpp | 278 ++++++++++++
 .../extensions/lapack/types_matrix.hpp        | 147 ++++++
 dpnp/backend/extensions/lapack/ungqr.cpp      | 263 +++++++++++
 dpnp/backend/extensions/lapack/ungqr.hpp      |  67 +++
 .../backend/extensions/lapack/ungqr_batch.cpp | 278 ++++++++++++
 dpnp/backend/include/dpnp_iface_fptr.hpp      |   2 -
 dpnp/backend/kernels/dpnp_krnl_linalg.cpp     |  34 --
 dpnp/dpnp_algo/dpnp_algo.pxd                  |   2 -
 dpnp/linalg/dpnp_algo_linalg.pyx              |  56 ---
 dpnp/linalg/dpnp_iface_linalg.py              |  70 ++-
 dpnp/linalg/dpnp_utils_linalg.py              | 427 +++++++++++++++++-
 tests/test_linalg.py                          | 210 +++++----
 tests/test_sycl_queue.py                      |  56 ++-
 tests/test_usm_type.py                        |  37 ++
 .../cupy/linalg_tests/test_decomposition.py   |  97 +++-
 22 files changed, 2767 insertions(+), 246 deletions(-)
 create mode 100644 dpnp/backend/extensions/lapack/geqrf.cpp
 create mode 100644 dpnp/backend/extensions/lapack/geqrf.hpp
 create mode 100644 dpnp/backend/extensions/lapack/geqrf_batch.cpp
 create mode 100644 dpnp/backend/extensions/lapack/orgqr.cpp
 create mode 100644 dpnp/backend/extensions/lapack/orgqr.hpp
 create mode 100644 dpnp/backend/extensions/lapack/orgqr_batch.cpp
 create mode 100644 dpnp/backend/extensions/lapack/ungqr.cpp
 create mode 100644 dpnp/backend/extensions/lapack/ungqr.hpp
 create mode 100644 dpnp/backend/extensions/lapack/ungqr_batch.cpp

diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt
index 28fa2072d7d..8f4b35f20ed 100644
--- a/dpnp/backend/extensions/lapack/CMakeLists.txt
+++ b/dpnp/backend/extensions/lapack/CMakeLists.txt
@@ -27,15 +27,21 @@
 set(python_module_name _lapack_impl)
 set(_module_src
     ${CMAKE_CURRENT_SOURCE_DIR}/lapack_py.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/geqrf.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/geqrf_batch.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/gesv.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/gesvd.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/getrf.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/getrf_batch.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/getri_batch.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/heevd.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/orgqr.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/orgqr_batch.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/potrf.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/potrf_batch.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/syevd.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/ungqr.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/ungqr_batch.cpp
 )
 
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
diff --git a/dpnp/backend/extensions/lapack/geqrf.cpp b/dpnp/backend/extensions/lapack/geqrf.cpp
new file mode 100644
index 00000000000..a91f689d503
--- /dev/null
+++ b/dpnp/backend/extensions/lapack/geqrf.cpp
@@ -0,0 +1,262 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "geqrf.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace lapack
+{
+namespace mkl_lapack = oneapi::mkl::lapack;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+typedef sycl::event (*geqrf_impl_fn_ptr_t)(sycl::queue,
+                                           const std::int64_t,
+                                           const std::int64_t,
+                                           char *,
+                                           std::int64_t,
+                                           char *,
+                                           std::vector<sycl::event> &,
+                                           const std::vector<sycl::event> &);
+
+static geqrf_impl_fn_ptr_t geqrf_dispatch_vector[dpctl_td_ns::num_types];
+
+template <typename T>
+static sycl::event geqrf_impl(sycl::queue exec_q,
+                              const std::int64_t m,
+                              const std::int64_t n,
+                              char *in_a,
+                              std::int64_t lda,
+                              char *in_tau,
+                              std::vector<sycl::event> &host_task_events,
+                              const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<T>(exec_q);
+
+    T *a = reinterpret_cast<T *>(in_a);
+    T *tau = reinterpret_cast<T *>(in_tau);
+
+    const std::int64_t scratchpad_size =
+        mkl_lapack::geqrf_scratchpad_size<T>(exec_q, m, n, lda);
+    T *scratchpad = nullptr;
+
+    std::stringstream error_msg;
+    std::int64_t info = 0;
+    bool is_exception_caught = false;
+
+    sycl::event geqrf_event;
+    try {
+        scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
+
+        geqrf_event = mkl_lapack::geqrf(
+            exec_q,
+            m,          // The number of rows in the matrix; (0 ≤ m).
+            n,          // The number of columns in the matrix; (0 ≤ n).
+            a,          // Pointer to the m-by-n matrix.
+            lda,        // The leading dimension of `a`; (max(1, m) ≤ lda).
+            tau,        // Pointer to the array of scalar factors of the
+                        // elementary reflectors.
+            scratchpad, // Pointer to scratchpad memory to be used by MKL
+                        // routine for storing intermediate results.
+            scratchpad_size, depends);
+    } catch (mkl_lapack::exception const &e) {
+        is_exception_caught = true;
+        info = e.info();
+
+        if (info < 0) {
+            error_msg << "Parameter number " << -info
+                      << " had an illegal value.";
+        }
+        else if (info == scratchpad_size && e.detail() != 0) {
+            error_msg
+                << "Insufficient scratchpad size. Required size is at least "
+                << e.detail() << ", but current size is " << scratchpad_size
+                << ".";
+        }
+        else {
+            error_msg << "Unexpected MKL exception caught during geqrf() "
+                         "call:\nreason: "
+                      << e.what() << "\ninfo: " << info;
+        }
+    } catch (sycl::exception const &e) {
+        is_exception_caught = true;
+        error_msg << "Unexpected SYCL exception caught during geqrf() call:\n"
+                  << e.what();
+    }
+
+    if (is_exception_caught) // an unexpected error occurs
+    {
+        if (scratchpad != nullptr) {
+            sycl::free(scratchpad, exec_q);
+        }
+        throw std::runtime_error(error_msg.str());
+    }
+
+    sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(geqrf_event);
+        auto ctx = exec_q.get_context();
+        cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); });
+    });
+    host_task_events.push_back(clean_up_event);
+
+    return geqrf_event;
+}
+
+std::pair<sycl::event, sycl::event>
+    geqrf(sycl::queue q,
+          dpctl::tensor::usm_ndarray a_array,
+          dpctl::tensor::usm_ndarray tau_array,
+          const std::vector<sycl::event> &depends)
+{
+    const int a_array_nd = a_array.get_ndim();
+    const int tau_array_nd = tau_array.get_ndim();
+
+    if (a_array_nd != 2) {
+        throw py::value_error(
+            "The input array has ndim=" + std::to_string(a_array_nd) +
+            ", but a 2-dimensional array is expected.");
+    }
+
+    if (tau_array_nd != 1) {
+        throw py::value_error("The array of Householder scalars has ndim=" +
+                              std::to_string(tau_array_nd) +
+                              ", but a 1-dimensional array is expected.");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(a_array, tau_array)) {
+        throw py::value_error(
+            "The input array and the array of Householder scalars "
+            "are overlapping segments of memory");
+    }
+
+    bool is_a_array_c_contig = a_array.is_c_contiguous();
+    if (!is_a_array_c_contig) {
+        throw py::value_error("The input array "
+                              "must be C-contiguous");
+    }
+
+    bool is_tau_array_c_contig = tau_array.is_c_contiguous();
+    bool is_tau_array_f_contig = tau_array.is_f_contiguous();
+
+    if (!is_tau_array_c_contig || !is_tau_array_f_contig) {
+        throw py::value_error("The array of Householder scalars "
+                              "must be contiguous");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int a_array_type_id =
+        array_types.typenum_to_lookup_id(a_array.get_typenum());
+    int tau_array_type_id =
+        array_types.typenum_to_lookup_id(tau_array.get_typenum());
+
+    if (a_array_type_id != tau_array_type_id) {
+        throw py::value_error(
+            "The types of the input array and "
+            "the array of Householder scalars are mismatched");
+    }
+
+    geqrf_impl_fn_ptr_t geqrf_fn = geqrf_dispatch_vector[a_array_type_id];
+    if (geqrf_fn == nullptr) {
+        throw py::value_error(
+            "No geqrf implementation defined for the provided type "
+            "of the input matrix.");
+    }
+
+    char *a_array_data = a_array.get_data();
+    char *tau_array_data = tau_array.get_data();
+
+    const py::ssize_t *a_array_shape = a_array.get_shape_raw();
+
+    // The input array is transposed
+    // Change the order of getting m, n
+    const std::int64_t m = a_array_shape[1];
+    const std::int64_t n = a_array_shape[0];
+    const std::int64_t lda = std::max<size_t>(1UL, m);
+
+    const size_t tau_array_size = tau_array.get_size();
+    const size_t min_m_n = std::max<size_t>(1UL, std::min<size_t>(m, n));
+
+    if (tau_array_size != min_m_n) {
+        throw py::value_error("The array of Householder scalars has size=" +
+                              std::to_string(tau_array_size) + ", but a size=" +
+                              std::to_string(min_m_n) + " array is expected.");
+    }
+
+    std::vector<sycl::event> host_task_events;
+    sycl::event geqrf_ev = geqrf_fn(q, m, n, a_array_data, lda, tau_array_data,
+                                    host_task_events, depends);
+
+    sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array},
+                                                        host_task_events);
+
+    return std::make_pair(args_ev, geqrf_ev);
+}
+
+template <typename fnT, typename T>
+struct GeqrfContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::GeqrfTypePairSupportFactory<T>::is_defined) {
+            return geqrf_impl<T>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_geqrf_dispatch_vector(void)
+{
+    dpctl_td_ns::DispatchVectorBuilder<geqrf_impl_fn_ptr_t, GeqrfContigFactory,
+                                       dpctl_td_ns::num_types>
+        contig;
+    contig.populate_dispatch_vector(geqrf_dispatch_vector);
+}
+} // namespace lapack
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/lapack/geqrf.hpp b/dpnp/backend/extensions/lapack/geqrf.hpp
new file mode 100644
index 00000000000..4ab65286b29
--- /dev/null
+++ b/dpnp/backend/extensions/lapack/geqrf.hpp
@@ -0,0 +1,63 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <CL/sycl.hpp>
+#include <oneapi/mkl.hpp>
+
+#include <dpctl4pybind11.hpp>
+
+// Entry points that compute the QR factorization (geqrf) in the LAPACK
+// extension.
+namespace dpnp::backend::ext::lapack
+{
+// Factorizes the matrix stored in `a_array` and writes the Householder
+// scalars to `tau_array`. The returned pair holds the event that keeps the
+// arguments alive followed by the event of the computation itself.
+std::pair<sycl::event, sycl::event>
+    geqrf(sycl::queue exec_q,
+          dpctl::tensor::usm_ndarray a_array,
+          dpctl::tensor::usm_ndarray tau_array,
+          const std::vector<sycl::event> &depends = {});
+
+// Batched variant: `batch_size` matrices of m x n elements that lie
+// `stride_a` apart in `a_array`, with their Householder scalars lying
+// `stride_tau` apart in `tau_array`. Returns the same pair of events.
+std::pair<sycl::event, sycl::event>
+    geqrf_batch(sycl::queue exec_q,
+                dpctl::tensor::usm_ndarray a_array,
+                dpctl::tensor::usm_ndarray tau_array,
+                std::int64_t m,
+                std::int64_t n,
+                std::int64_t stride_a,
+                std::int64_t stride_tau,
+                std::int64_t batch_size,
+                const std::vector<sycl::event> &depends = {});
+
+// Populate the per-type dispatch vectors used by the functions above.
+void init_geqrf_batch_dispatch_vector(void);
+void init_geqrf_dispatch_vector(void);
+} // namespace dpnp::backend::ext::lapack
diff --git a/dpnp/backend/extensions/lapack/geqrf_batch.cpp b/dpnp/backend/extensions/lapack/geqrf_batch.cpp
new file mode 100644
index 00000000000..a4fe980a539
--- /dev/null
+++ b/dpnp/backend/extensions/lapack/geqrf_batch.cpp
@@ -0,0 +1,273 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "geqrf.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace lapack
+{
+namespace mkl_lapack = oneapi::mkl::lapack;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+using geqrf_batch_impl_fn_ptr_t = sycl::event (*)(
+    sycl::queue,                       // exec_q
+    std::int64_t,                      // m
+    std::int64_t,                      // n
+    char *,                            // in_a
+    std::int64_t,                      // lda
+    std::int64_t,                      // stride_a
+    char *,                            // in_tau
+    std::int64_t,                      // stride_tau
+    std::int64_t,                      // batch_size
+    std::vector<sycl::event> &,        // host_task_events
+    const std::vector<sycl::event> &); // depends
+// Workers indexed by dpctl type lookup id; nullptr marks an unsupported type.
+static geqrf_batch_impl_fn_ptr_t
+    geqrf_batch_dispatch_vector[dpctl_td_ns::num_types];
+
+template <typename T> // T: element type of the matrices and of tau
+static sycl::event geqrf_batch_impl(sycl::queue exec_q,
+                                    std::int64_t m,
+                                    std::int64_t n,
+                                    char *in_a,
+                                    std::int64_t lda,
+                                    std::int64_t stride_a,
+                                    char *in_tau,
+                                    std::int64_t stride_tau,
+                                    std::int64_t batch_size,
+                                    std::vector<sycl::event> &host_task_events,
+                                    const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<T>(exec_q);
+    // The type-erased buffers are accessed as arrays of T from here on.
+    T *a = reinterpret_cast<T *>(in_a);
+    T *tau = reinterpret_cast<T *>(in_tau);
+    // Query oneMKL for the scratchpad length required by this batch.
+    const std::int64_t scratchpad_size =
+        mkl_lapack::geqrf_batch_scratchpad_size<T>(exec_q, m, n, lda, stride_a,
+                                                   stride_tau, batch_size);
+    T *scratchpad = nullptr;
+
+    std::stringstream error_msg;
+    std::int64_t info = 0;
+    bool is_exception_caught = false;
+
+    sycl::event geqrf_batch_event;
+    try {
+        scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
+
+        geqrf_batch_event = mkl_lapack::geqrf_batch(
+            exec_q,
+            m, // The number of rows in each matrix in the batch; (0 ≤ m).
+               // It must be a non-negative integer.
+            n, // The number of columns in each matrix in the batch; (0 ≤ n).
+               // It must be a non-negative integer.
+            a, // Pointer to the batch of matrices, each of size (m x n).
+            lda,      // The leading dimension of each matrix in the batch.
+                      // For row major layout, lda ≥ max(1, m).
+            stride_a, // Stride between consecutive matrices in the batch.
+            tau, // Pointer to the array of scalar factors of the elementary
+                 // reflectors for each matrix in the batch.
+            stride_tau, // Stride between arrays of scalar factors in the batch.
+            batch_size, // The number of matrices in the batch.
+            scratchpad, // Pointer to scratchpad memory to be used by MKL
+                        // routine for storing intermediate results.
+            scratchpad_size, depends);
+    } catch (mkl_lapack::exception const &e) {
+        is_exception_caught = true;
+        info = e.info();
+
+        if (info < 0) {
+            error_msg << "Parameter number " << -info
+                      << " had an illegal value.";
+        }
+        else if (info == scratchpad_size && e.detail() != 0) {
+            error_msg
+                << "Insufficient scratchpad size. Required size is at least "
+                << e.detail() << ", but current size is " << scratchpad_size
+                << ".";
+        }
+        else if (info != 0 && e.detail() == 0) {
+            error_msg << "Error in batch processing. "
+                         "Number of failed calculations: "
+                      << info;
+        }
+        else {
+            error_msg << "Unexpected MKL exception caught during geqrf_batch() "
+                         "call:\nreason: "
+                      << e.what() << "\ninfo: " << e.info();
+        }
+    } catch (sycl::exception const &e) {
+        is_exception_caught = true;
+        error_msg
+            << "Unexpected SYCL exception caught during geqrf_batch() call:\n"
+            << e.what();
+    }
+    // On failure release the scratchpad immediately and report the error.
+    if (is_exception_caught) // an unexpected error occurs
+    {
+        if (scratchpad != nullptr) {
+            sycl::free(scratchpad, exec_q);
+        }
+
+        throw std::runtime_error(error_msg.str());
+    }
+    // On success a host task frees the scratchpad after geqrf_batch_event.
+    sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(geqrf_batch_event);
+        auto ctx = exec_q.get_context();
+        cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); });
+    });
+    host_task_events.push_back(clean_up_event);
+    return geqrf_batch_event;
+}
+
+std::pair<sycl::event, sycl::event> // (args-alive event, compute event)
+    geqrf_batch(sycl::queue q,
+                dpctl::tensor::usm_ndarray a_array,
+                dpctl::tensor::usm_ndarray tau_array,
+                std::int64_t m,
+                std::int64_t n,
+                std::int64_t stride_a,
+                std::int64_t stride_tau,
+                std::int64_t batch_size,
+                const std::vector<sycl::event> &depends)
+{
+    const int a_array_nd = a_array.get_ndim();
+    const int tau_array_nd = tau_array.get_ndim();
+
+    if (a_array_nd < 3) {
+        throw py::value_error(
+            "The input array has ndim=" + std::to_string(a_array_nd) +
+            ", but an array with ndim >= 3 is expected.");
+    }
+
+    if (tau_array_nd != 2) {
+        throw py::value_error("The array of Householder scalars has ndim=" +
+                              std::to_string(tau_array_nd) +
+                              ", but a 2-dimensional array is expected.");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(a_array, tau_array)) {
+        throw py::value_error(
+            "The input array and the array of Householder scalars "
+            "are overlapping segments of memory");
+    }
+
+    bool is_a_array_c_contig = a_array.is_c_contiguous();
+    bool is_tau_array_c_contig = tau_array.is_c_contiguous();
+    if (!is_a_array_c_contig) {
+        throw py::value_error("The input array "
+                              "must be C-contiguous");
+    }
+    if (!is_tau_array_c_contig) {
+        throw py::value_error("The array of Householder scalars "
+                              "must be C-contiguous");
+    }
+    // Both arrays must share one element type; it selects the worker below.
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int a_array_type_id =
+        array_types.typenum_to_lookup_id(a_array.get_typenum());
+    int tau_array_type_id =
+        array_types.typenum_to_lookup_id(tau_array.get_typenum());
+
+    if (a_array_type_id != tau_array_type_id) {
+        throw py::value_error(
+            "The types of the input array and "
+            "the array of Householder scalars are mismatched");
+    }
+
+    geqrf_batch_impl_fn_ptr_t geqrf_batch_fn =
+        geqrf_batch_dispatch_vector[a_array_type_id];
+    if (geqrf_batch_fn == nullptr) {
+        throw py::value_error(
+            "No geqrf_batch implementation defined for the provided type "
+            "of the input matrix.");
+    }
+
+    char *a_array_data = a_array.get_data();
+    char *tau_array_data = tau_array.get_data();
+    // NOTE: m, n, strides and batch_size are not checked against the shapes.
+    const std::int64_t lda = std::max<size_t>(1UL, m);
+
+    std::vector<sycl::event> host_task_events;
+    sycl::event geqrf_batch_ev =
+        geqrf_batch_fn(q, m, n, a_array_data, lda, stride_a, tau_array_data,
+                       stride_tau, batch_size, host_task_events, depends);
+    // Tie the lifetime of both arrays to the collected host-task events.
+    sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array},
+                                                        host_task_events);
+
+    return std::make_pair(args_ev, geqrf_batch_ev);
+}
+
+// Factory for the geqrf_batch dispatch vector: geqrf_batch_impl<T> or nullptr.
+template <typename fnT, typename T>
+struct GeqrfBatchContigFactory
+{
+    fnT get()
+    {
+        using support_t = types::GeqrfBatchTypePairSupportFactory<T>;
+        if constexpr (support_t::is_defined) {
+            return geqrf_batch_impl<T>;
+        }
+        return nullptr; // no geqrf_batch implementation is defined for T
+    }
+};
+
+void init_geqrf_batch_dispatch_vector(void)
+{
+    // Let GeqrfBatchContigFactory supply the entries of the dispatch vector.
+    using builder_t = dpctl_td_ns::DispatchVectorBuilder<
+        geqrf_batch_impl_fn_ptr_t, GeqrfBatchContigFactory,
+        dpctl_td_ns::num_types>;
+    builder_t().populate_dispatch_vector(geqrf_batch_dispatch_vector);
+}
+} // namespace lapack
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp
index 0c76d0fc096..eb815ac9f6b 100644
--- a/dpnp/backend/extensions/lapack/lapack_py.cpp
+++ b/dpnp/backend/extensions/lapack/lapack_py.cpp
@@ -30,14 +30,17 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
+#include "geqrf.hpp"
 #include "gesv.hpp"
 #include "gesvd.hpp"
 #include "getrf.hpp"
 #include "getri.hpp"
 #include "heevd.hpp"
 #include "linalg_exceptions.hpp"
+#include "orgqr.hpp"
 #include "potrf.hpp"
 #include "syevd.hpp"
+#include "ungqr.hpp"
 
 namespace lapack_ext = dpnp::backend::ext::lapack;
 namespace py = pybind11;
@@ -45,13 +48,19 @@ namespace py = pybind11;
 // populate dispatch vectors
 void init_dispatch_vectors(void)
 {
+    lapack_ext::init_geqrf_batch_dispatch_vector();
+    lapack_ext::init_geqrf_dispatch_vector();
     lapack_ext::init_gesv_dispatch_vector();
     lapack_ext::init_getrf_batch_dispatch_vector();
     lapack_ext::init_getrf_dispatch_vector();
     lapack_ext::init_getri_batch_dispatch_vector();
+    lapack_ext::init_orgqr_batch_dispatch_vector();
+    lapack_ext::init_orgqr_dispatch_vector();
     lapack_ext::init_potrf_batch_dispatch_vector();
     lapack_ext::init_potrf_dispatch_vector();
     lapack_ext::init_syevd_dispatch_vector();
+    lapack_ext::init_ungqr_batch_dispatch_vector();
+    lapack_ext::init_ungqr_dispatch_vector();
 }
 
 // populate dispatch tables
@@ -71,6 +80,20 @@ PYBIND11_MODULE(_lapack_impl, m)
     init_dispatch_vectors();
     init_dispatch_tables();
 
+    m.def("_geqrf_batch", &lapack_ext::geqrf_batch,
+          "Call `geqrf_batch` from OneMKL LAPACK library to return "
+          "the QR factorization of a batch general matrix ",
+          py::arg("sycl_queue"), py::arg("a_array"), py::arg("tau_array"),
+          py::arg("m"), py::arg("n"), py::arg("stride_a"),
+          py::arg("stride_tau"), py::arg("batch_size"),
+          py::arg("depends") = py::list());
+
+    m.def("_geqrf", &lapack_ext::geqrf,
+          "Call `geqrf` from OneMKL LAPACK library to return "
+          "the QR factorization of a general m x n matrix ",
+          py::arg("sycl_queue"), py::arg("a_array"), py::arg("tau_array"),
+          py::arg("depends") = py::list());
+
     m.def("_gesv", &lapack_ext::gesv,
           "Call `gesv` from OneMKL LAPACK library to return "
           "the solution of a system of linear equations with "
@@ -114,6 +137,22 @@ PYBIND11_MODULE(_lapack_impl, m)
           py::arg("eig_vecs"), py::arg("eig_vals"),
           py::arg("depends") = py::list());
 
+    m.def("_orgqr_batch", &lapack_ext::orgqr_batch,
+          "Call `_orgqr_batch` from OneMKL LAPACK library to return "
+          "the real orthogonal matrix Qi of the QR factorization "
+          "for a batch of general matrices",
+          py::arg("sycl_queue"), py::arg("a_array"), py::arg("tau_array"),
+          py::arg("m"), py::arg("n"), py::arg("k"), py::arg("stride_a"),
+          py::arg("stride_tau"), py::arg("batch_size"),
+          py::arg("depends") = py::list());
+
+    m.def("_orgqr", &lapack_ext::orgqr,
+          "Call `orgqr` from OneMKL LAPACK library to return "
+          "the real orthogonal matrix Q of the QR factorization",
+          py::arg("sycl_queue"), py::arg("m"), py::arg("n"), py::arg("k"),
+          py::arg("a_array"), py::arg("tau_array"),
+          py::arg("depends") = py::list());
+
     m.def("_potrf", &lapack_ext::potrf,
           "Call `potrf` from OneMKL LAPACK library to return "
           "the Cholesky factorization of a symmetric positive-definite matrix",
@@ -134,4 +173,20 @@ PYBIND11_MODULE(_lapack_impl, m)
           py::arg("sycl_queue"), py::arg("jobz"), py::arg("upper_lower"),
           py::arg("eig_vecs"), py::arg("eig_vals"),
           py::arg("depends") = py::list());
+
+    m.def("_ungqr_batch", &lapack_ext::ungqr_batch,
+          "Call `_ungqr_batch` from OneMKL LAPACK library to return "
+          "the complex unitary matrices matrix Qi of the QR factorization "
+          "for a batch of general matrices",
+          py::arg("sycl_queue"), py::arg("a_array"), py::arg("tau_array"),
+          py::arg("m"), py::arg("n"), py::arg("k"), py::arg("stride_a"),
+          py::arg("stride_tau"), py::arg("batch_size"),
+          py::arg("depends") = py::list());
+
+    m.def("_ungqr", &lapack_ext::ungqr,
+          "Call `ungqr` from OneMKL LAPACK library to return "
+          "the complex unitary matrix Q of the QR factorization",
+          py::arg("sycl_queue"), py::arg("m"), py::arg("n"), py::arg("k"),
+          py::arg("a_array"), py::arg("tau_array"),
+          py::arg("depends") = py::list());
 }
diff --git a/dpnp/backend/extensions/lapack/orgqr.cpp b/dpnp/backend/extensions/lapack/orgqr.cpp
new file mode 100644
index 00000000000..22cbbe05bee
--- /dev/null
+++ b/dpnp/backend/extensions/lapack/orgqr.cpp
@@ -0,0 +1,263 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "orgqr.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace lapack
+{
+namespace mkl_lapack = oneapi::mkl::lapack;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+using orgqr_impl_fn_ptr_t = sycl::event (*)(
+    sycl::queue,                       // exec_q
+    const std::int64_t,                // m
+    const std::int64_t,                // n
+    const std::int64_t,                // k
+    char *,                            // in_a
+    std::int64_t,                      // lda
+    char *,                            // in_tau
+    std::vector<sycl::event> &,        // host_task_events
+    const std::vector<sycl::event> &); // depends
+static orgqr_impl_fn_ptr_t orgqr_dispatch_vector[dpctl_td_ns::num_types];
+
+// Type-specialized worker: runs oneMKL orgqr on the matrix behind `in_a` with
+// the reflector scalars behind `in_tau` and returns the computation event.
+template <typename T>
+static sycl::event orgqr_impl(sycl::queue exec_q,
+                              const std::int64_t m,
+                              const std::int64_t n,
+                              const std::int64_t k,
+                              char *in_a,
+                              std::int64_t lda,
+                              char *in_tau,
+                              std::vector<sycl::event> &host_task_events,
+                              const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<T>(exec_q);
+    T *a = reinterpret_cast<T *>(in_a);
+    T *tau = reinterpret_cast<T *>(in_tau);
+
+    const std::int64_t scratchpad_size =
+        mkl_lapack::orgqr_scratchpad_size<T>(exec_q, m, n, k, lda);
+    T *scratchpad = nullptr;
+
+    std::stringstream error_msg;
+    std::int64_t info = 0;
+    bool is_exception_caught = false;
+
+    sycl::event orgqr_event;
+    try {
+        scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
+
+        orgqr_event = mkl_lapack::orgqr(
+            exec_q,
+            m,          // The number of rows in the matrix; (0 ≤ m).
+            n,          // The number of columns in the matrix; (0 ≤ n).
+            k,          // The number of elementary reflectors
+                        // whose product defines the matrix Q; (0 ≤ k ≤ n).
+            a,          // Pointer to the m-by-n matrix.
+            lda,        // The leading dimension of `a`; (max(1, m) ≤ lda).
+            tau,        // Pointer to the array of scalar factors of the
+                        // elementary reflectors.
+            scratchpad, // Pointer to scratchpad memory to be used by MKL
+                        // routine for storing intermediate results.
+            scratchpad_size, depends);
+    } catch (mkl_lapack::exception const &e) {
+        is_exception_caught = true;
+        info = e.info();
+
+        if (info < 0) {
+            error_msg << "Parameter number " << -info
+                      << " had an illegal value.";
+        }
+        else if (info == scratchpad_size && e.detail() != 0) {
+            error_msg
+                << "Insufficient scratchpad size. Required size is at least "
+                << e.detail() << ", but current size is " << scratchpad_size
+                << ".";
+        }
+        else {
+            error_msg << "Unexpected MKL exception caught during orgqr() "
+                         "call:\nreason: "
+                      << e.what() << "\ninfo: " << info;
+        }
+    } catch (sycl::exception const &e) {
+        is_exception_caught = true;
+        error_msg << "Unexpected SYCL exception caught during orgqr() call:\n"
+                  << e.what();
+    }
+    // On failure release the scratchpad immediately and report the error.
+    if (is_exception_caught) {
+        if (scratchpad != nullptr) {
+            sycl::free(scratchpad, exec_q);
+        }
+        throw std::runtime_error(error_msg.str());
+    }
+    // On success a host task frees the scratchpad after orgqr_event.
+    sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(orgqr_event);
+        auto ctx = exec_q.get_context();
+        cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); });
+    });
+    host_task_events.push_back(clean_up_event);
+
+    return orgqr_event;
+}
+
+std::pair<sycl::event, sycl::event> // (args-alive event, compute event)
+    orgqr(sycl::queue q,
+          const std::int64_t m,
+          const std::int64_t n,
+          const std::int64_t k,
+          dpctl::tensor::usm_ndarray a_array,
+          dpctl::tensor::usm_ndarray tau_array,
+          const std::vector<sycl::event> &depends)
+{
+    const int a_array_nd = a_array.get_ndim();
+    const int tau_array_nd = tau_array.get_ndim();
+
+    if (a_array_nd != 2) {
+        throw py::value_error(
+            "The input array has ndim=" + std::to_string(a_array_nd) +
+            ", but a 2-dimensional array is expected.");
+    }
+
+    if (tau_array_nd != 1) {
+        throw py::value_error("The array of Householder scalars has ndim=" +
+                              std::to_string(tau_array_nd) +
+                              ", but a 1-dimensional array is expected.");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(a_array, tau_array)) {
+        throw py::value_error(
+            "The input array and the array of Householder scalars "
+            "are overlapping segments of memory");
+    }
+
+    bool is_a_array_c_contig = a_array.is_c_contiguous();
+    if (!is_a_array_c_contig) {
+        throw py::value_error("The input array "
+                              "must be C-contiguous");
+    }
+
+    bool is_tau_array_c_contig = tau_array.is_c_contiguous();
+    bool is_tau_array_f_contig = tau_array.is_f_contiguous();
+    // tau is 1-D at this point, so both flags are expected to agree.
+    if (!is_tau_array_c_contig || !is_tau_array_f_contig) {
+        throw py::value_error("The array of Householder scalars "
+                              "must be contiguous");
+    }
+
+    const size_t tau_array_size = tau_array.get_size();
+    // Exactly k Householder scalars are required.
+    if (static_cast<std::int64_t>(tau_array_size) != k) {
+        throw py::value_error("The array of Householder scalars has size=" +
+                              std::to_string(tau_array_size) +
+                              ", but an array of size=" + std::to_string(k) +
+                              " is expected.");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int a_array_type_id =
+        array_types.typenum_to_lookup_id(a_array.get_typenum());
+    int tau_array_type_id =
+        array_types.typenum_to_lookup_id(tau_array.get_typenum());
+
+    if (a_array_type_id != tau_array_type_id) {
+        throw py::value_error(
+            "The types of the input array and "
+            "the array of Householder scalars are mismatched");
+    }
+
+    orgqr_impl_fn_ptr_t orgqr_fn = orgqr_dispatch_vector[a_array_type_id];
+    if (orgqr_fn == nullptr) {
+        throw py::value_error(
+            "No orgqr implementation defined for the provided type "
+            "of the input matrix.");
+    }
+    // NOTE: m and n are not checked against the shape of a_array here.
+    char *a_array_data = a_array.get_data();
+    const std::int64_t lda = std::max<size_t>(1UL, m);
+
+    char *tau_array_data = tau_array.get_data();
+
+    std::vector<sycl::event> host_task_events;
+    sycl::event orgqr_ev = orgqr_fn(q, m, n, k, a_array_data, lda,
+                                    tau_array_data, host_task_events, depends);
+    // Tie the lifetime of both arrays to the collected host-task events.
+    sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array},
+                                                        host_task_events);
+
+    return std::make_pair(args_ev, orgqr_ev);
+}
+
+// Factory used to fill the orgqr dispatch vector: orgqr_impl<T> or nullptr.
+template <typename fnT, typename T>
+struct OrgqrContigFactory
+{
+    fnT get()
+    {
+        using support_t = types::OrgqrTypePairSupportFactory<T>;
+        if constexpr (support_t::is_defined) {
+            return orgqr_impl<T>;
+        }
+        return nullptr; // no orgqr implementation is defined for T
+    }
+};
+
+void init_orgqr_dispatch_vector(void)
+{
+    // Let OrgqrContigFactory supply the entries of orgqr_dispatch_vector.
+    using builder_t = dpctl_td_ns::DispatchVectorBuilder<
+        orgqr_impl_fn_ptr_t, OrgqrContigFactory, dpctl_td_ns::num_types>;
+    builder_t().populate_dispatch_vector(orgqr_dispatch_vector);
+}
+} // namespace lapack
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/lapack/orgqr.hpp b/dpnp/backend/extensions/lapack/orgqr.hpp
new file mode 100644
index 00000000000..9cc4f530d03
--- /dev/null
+++ b/dpnp/backend/extensions/lapack/orgqr.hpp
@@ -0,0 +1,67 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <CL/sycl.hpp>
+#include <oneapi/mkl.hpp>
+
+#include <dpctl4pybind11.hpp>
+
+// Entry points that generate the real orthogonal matrix Q of a QR
+// factorization (orgqr) in the LAPACK extension.
+namespace dpnp::backend::ext::lapack
+{
+// Runs orgqr on the m x n matrix in `a_array` using the `k` Householder
+// scalars in `tau_array`. The returned pair holds the event that keeps the
+// arguments alive followed by the event of the computation itself.
+std::pair<sycl::event, sycl::event>
+    orgqr(sycl::queue exec_q,
+          const std::int64_t m,
+          const std::int64_t n,
+          const std::int64_t k,
+          dpctl::tensor::usm_ndarray a_array,
+          dpctl::tensor::usm_ndarray tau_array,
+          const std::vector<sycl::event> &depends = {});
+
+// Batched counterpart: `batch_size` matrices, `stride_a` elements apart in
+// `a_array`, with scalars `stride_tau` apart in `tau_array` (semantics of
+// the strides are inferred from the parameter names; confirm in the source).
+std::pair<sycl::event, sycl::event>
+    orgqr_batch(sycl::queue exec_q,
+                dpctl::tensor::usm_ndarray a_array,
+                dpctl::tensor::usm_ndarray tau_array,
+                std::int64_t m,
+                std::int64_t n,
+                std::int64_t k,
+                std::int64_t stride_a,
+                std::int64_t stride_tau,
+                std::int64_t batch_size,
+                const std::vector<sycl::event> &depends = {});
+
+// Populate the per-type dispatch vectors used by the functions above.
+void init_orgqr_batch_dispatch_vector(void);
+void init_orgqr_dispatch_vector(void);
+} // namespace dpnp::backend::ext::lapack
diff --git a/dpnp/backend/extensions/lapack/orgqr_batch.cpp b/dpnp/backend/extensions/lapack/orgqr_batch.cpp
new file mode 100644
index 00000000000..dfa9932a8e0
--- /dev/null
+++ b/dpnp/backend/extensions/lapack/orgqr_batch.cpp
@@ -0,0 +1,278 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "orgqr.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace lapack
+{
+namespace mkl_lapack = oneapi::mkl::lapack;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+// Type-erased signature shared by every orgqr_batch_impl<T> instantiation.
+typedef sycl::event (*orgqr_batch_impl_fn_ptr_t)(
+    sycl::queue, // exec_q
+    std::int64_t, // m
+    std::int64_t, // n
+    std::int64_t, // k
+    char *, // in_a
+    std::int64_t, // lda
+    std::int64_t, // stride_a
+    char *, // in_tau
+    std::int64_t, // stride_tau
+    std::int64_t, // batch_size
+    std::vector<sycl::event> &, // host_task_events
+    const std::vector<sycl::event> &); // depends
+// Indexed by dpctl type lookup id; nullptr marks an unsupported element type.
+static orgqr_batch_impl_fn_ptr_t
+    orgqr_batch_dispatch_vector[dpctl_td_ns::num_types];
+// Runs oneMKL orgqr_batch for element type T; returns the computation event.
+template <typename T>
+static sycl::event orgqr_batch_impl(sycl::queue exec_q,
+                                    std::int64_t m,
+                                    std::int64_t n,
+                                    std::int64_t k,
+                                    char *in_a, // type-erased batch of matrices
+                                    std::int64_t lda,
+                                    std::int64_t stride_a,
+                                    char *in_tau, // type-erased tau scalars
+                                    std::int64_t stride_tau,
+                                    std::int64_t batch_size,
+                                    std::vector<sycl::event> &host_task_events,
+                                    const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<T>(exec_q);
+    // in_a / in_tau arrive type-erased; view them as arrays of T.
+    T *a = reinterpret_cast<T *>(in_a);
+    T *tau = reinterpret_cast<T *>(in_tau);
+    // Ask MKL how many T elements of scratch space this call needs.
+    const std::int64_t scratchpad_size =
+        mkl_lapack::orgqr_batch_scratchpad_size<T>(
+            exec_q, m, n, k, lda, stride_a, stride_tau, batch_size);
+    T *scratchpad = nullptr;
+
+    std::stringstream error_msg;
+    std::int64_t info = 0;
+    bool is_exception_caught = false;
+
+    sycl::event orgqr_batch_event;
+    try {
+        scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
+
+        orgqr_batch_event = mkl_lapack::orgqr_batch(
+            exec_q,
+            m, // The number of rows in each matrix in the batch; (0 ≤ m).
+               // It must be a non-negative integer.
+            n, // The number of columns in each matrix in the batch; (0 ≤ n).
+               // It must be a non-negative integer.
+            k, // The number of elementary reflectors
+               // whose product defines the matrices Qi; (0 ≤ k ≤ n).
+            a, // Pointer to the batch of matrices, each of size (m x n).
+            lda,      // The leading dimension of each matrix in the batch.
+                      // For row major layout, lda ≥ max(1, m).
+            stride_a, // Stride between consecutive matrices in the batch.
+            tau, // Pointer to the array of scalar factors of the elementary
+                 // reflectors for each matrix in the batch.
+            stride_tau, // Stride between arrays of scalar factors in the batch.
+            batch_size, // The number of matrices in the batch.
+            scratchpad, // Pointer to scratchpad memory to be used by MKL
+                        // routine for storing intermediate results.
+            scratchpad_size, depends);
+    } catch (mkl_lapack::exception const &e) {
+        is_exception_caught = true;
+        info = e.info();
+
+        if (info < 0) {
+            error_msg << "Parameter number " << -info
+                      << " had an illegal value.";
+        }
+        else if (info == scratchpad_size && e.detail() != 0) {
+            error_msg
+                << "Insufficient scratchpad size. Required size is at least "
+                << e.detail() << ", but current size is " << scratchpad_size
+                << ".";
+        }
+        else if (info != 0 && e.detail() == 0) {
+            error_msg << "Error in batch processing. "
+                         "Number of failed calculations: "
+                      << info;
+        }
+        else {
+            error_msg << "Unexpected MKL exception caught during orgqr_batch() "
+                         "call:\nreason: "
+                      << e.what() << "\ninfo: " << e.info();
+        }
+    } catch (sycl::exception const &e) {
+        is_exception_caught = true;
+        error_msg
+            << "Unexpected SYCL exception caught during orgqr_batch() call:\n"
+            << e.what();
+    }
+    // Any MKL or SYCL exception caught above is rethrown as std::runtime_error.
+    if (is_exception_caught) // an unexpected error occurs
+    {
+        if (scratchpad != nullptr) {
+            sycl::free(scratchpad, exec_q);
+        }
+
+        throw std::runtime_error(error_msg.str());
+    }
+    // Free the scratchpad from a host task once the MKL call has completed.
+    sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(orgqr_batch_event);
+        auto ctx = exec_q.get_context();
+        cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); });
+    });
+    host_task_events.push_back(clean_up_event); // handed back to the caller
+    return orgqr_batch_event;
+}
+// Validates the arrays, dispatches on element type and submits orgqr_batch.
+std::pair<sycl::event, sycl::event>
+    orgqr_batch(sycl::queue q,
+                dpctl::tensor::usm_ndarray a_array,
+                dpctl::tensor::usm_ndarray tau_array,
+                std::int64_t m,
+                std::int64_t n,
+                std::int64_t k,
+                std::int64_t stride_a,
+                std::int64_t stride_tau,
+                std::int64_t batch_size,
+                const std::vector<sycl::event> &depends)
+{
+    const int a_array_nd = a_array.get_ndim();
+    const int tau_array_nd = tau_array.get_ndim();
+    // Every failed check below is reported as py::value_error.
+    if (a_array_nd < 3) {
+        throw py::value_error(
+            "The input array has ndim=" + std::to_string(a_array_nd) +
+            ", but an array with ndim >= 3 is expected.");
+    }
+
+    if (tau_array_nd != 2) {
+        throw py::value_error("The array of Householder scalars has ndim=" +
+                              std::to_string(tau_array_nd) +
+                              ", but a 2-dimensional array is expected.");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(a_array, tau_array)) {
+        throw py::value_error(
+            "The input array and the array of Householder scalars "
+            "are overlapping segments of memory");
+    }
+    // Raw data pointers are handed to MKL, so both arrays must be C-contiguous.
+    bool is_a_array_c_contig = a_array.is_c_contiguous();
+    bool is_tau_array_c_contig = tau_array.is_c_contiguous();
+    if (!is_a_array_c_contig) {
+        throw py::value_error("The input array "
+                              "must be C-contiguous");
+    }
+    if (!is_tau_array_c_contig) {
+        throw py::value_error("The array of Householder scalars "
+                              "must be C-contiguous");
+    }
+    // A single element type is dispatched on, so both dtypes must match.
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int a_array_type_id =
+        array_types.typenum_to_lookup_id(a_array.get_typenum());
+    int tau_array_type_id =
+        array_types.typenum_to_lookup_id(tau_array.get_typenum());
+
+    if (a_array_type_id != tau_array_type_id) {
+        throw py::value_error(
+            "The types of the input array and "
+            "the array of Householder scalars are mismatched");
+    }
+
+    orgqr_batch_impl_fn_ptr_t orgqr_batch_fn =
+        orgqr_batch_dispatch_vector[a_array_type_id];
+    if (orgqr_batch_fn == nullptr) {
+        throw py::value_error(
+            "No orgqr_batch implementation defined for the provided type "
+            "of the input matrix.");
+    }
+
+    char *a_array_data = a_array.get_data();
+    char *tau_array_data = tau_array.get_data();
+    // lda is max(1, m); the comparison itself is carried out in size_t.
+    const std::int64_t lda = std::max<size_t>(1UL, m);
+
+    std::vector<sycl::event> host_task_events;
+    sycl::event orgqr_batch_ev =
+        orgqr_batch_fn(q, m, n, k, a_array_data, lda, stride_a, tau_array_data,
+                       stride_tau, batch_size, host_task_events, depends);
+    // Presumably holds both arrays until host_task_events finish; confirm.
+    sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array},
+                                                        host_task_events);
+    // first: keep-args-alive event, second: computation event.
+    return std::make_pair(args_ev, orgqr_batch_ev);
+}
+// Maps element type T to orgqr_batch_impl<T>, or nullptr if T is unsupported.
+template <typename fnT, typename T>
+struct OrgqrBatchContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::OrgqrBatchTypePairSupportFactory<T>::is_defined) {
+            return orgqr_batch_impl<T>;
+        }
+        else {
+            return nullptr; // looked up and rejected by orgqr_batch()
+        }
+    }
+};
+// Fills orgqr_batch_dispatch_vector using OrgqrBatchContigFactory per type.
+void init_orgqr_batch_dispatch_vector(void)
+{
+    dpctl_td_ns::DispatchVectorBuilder<orgqr_batch_impl_fn_ptr_t,
+                                       OrgqrBatchContigFactory,
+                                       dpctl_td_ns::num_types>
+        contig;
+    contig.populate_dispatch_vector(orgqr_batch_dispatch_vector);
+}
+} // namespace lapack
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/lapack/types_matrix.hpp b/dpnp/backend/extensions/lapack/types_matrix.hpp
index 893619e6afb..9a0ab36c8a4 100644
--- a/dpnp/backend/extensions/lapack/types_matrix.hpp
+++ b/dpnp/backend/extensions/lapack/types_matrix.hpp
@@ -43,6 +43,61 @@ namespace lapack
 {
 namespace types
 {
+/**
+ * @brief A factory to define pairs of supported types for which
+ * MKL LAPACK library provides support in oneapi::mkl::lapack::geqrf_batch<T>
+ * function.
+ *
+ * @tparam T Type of array containing the input matrices to be QR factorized in
+ * batch mode. Upon execution, each matrix in the batch is transformed to output
+ * arrays representing their respective orthogonal matrix Q and upper triangular
+ * matrix R.
+ */
+template <typename T>
+struct GeqrfBatchTypePairSupportFactory
+{
+    static constexpr bool is_defined = std::disjunction<
+        dpctl_td_ns::TypePairDefinedEntry<T, double, T, double>,
+        dpctl_td_ns::TypePairDefinedEntry<T, float, T, float>,
+        dpctl_td_ns::TypePairDefinedEntry<T,
+                                          std::complex<float>,
+                                          T,
+                                          std::complex<float>>,
+        dpctl_td_ns::TypePairDefinedEntry<T,
+                                          std::complex<double>,
+                                          T,
+                                          std::complex<double>>,
+        // fall-through: picked when none of the entries above match
+        dpctl_td_ns::NotDefinedEntry>::is_defined;
+};
+
+/**
+ * @brief A factory to define pairs of supported types for which
+ * MKL LAPACK library provides support in oneapi::mkl::lapack::geqrf<T>
+ * function.
+ *
+ * @tparam T Type of array containing the input matrix to be QR factorized.
+ * Upon execution, this matrix is transformed to output arrays representing
+ * the orthogonal matrix Q and the upper triangular matrix R.
+ */
+template <typename T>
+struct GeqrfTypePairSupportFactory
+{
+    static constexpr bool is_defined = std::disjunction<
+        dpctl_td_ns::TypePairDefinedEntry<T, double, T, double>,
+        dpctl_td_ns::TypePairDefinedEntry<T, float, T, float>,
+        dpctl_td_ns::TypePairDefinedEntry<T,
+                                          std::complex<float>,
+                                          T,
+                                          std::complex<float>>,
+        dpctl_td_ns::TypePairDefinedEntry<T,
+                                          std::complex<double>,
+                                          T,
+                                          std::complex<double>>,
+        // fall-through: picked when none of the entries above match
+        dpctl_td_ns::NotDefinedEntry>::is_defined;
+};
+
 /**
  * @brief A factory to define pairs of supported types for which
  * MKL LAPACK library provides support in oneapi::mkl::lapack::gesv<T>
@@ -190,6 +245,46 @@ struct HeevdTypePairSupportFactory
         dpctl_td_ns::NotDefinedEntry>::is_defined;
 };
 
+/**
+ * @brief A factory to define pairs of supported types for which
+ * MKL LAPACK library provides support in oneapi::mkl::lapack::orgqr_batch<T>
+ * function. Only real entries (double, float) are listed below.
+ *
+ * @tparam T Type of array containing the matrix A,
+ * each from a separate instance in the batch, from which the
+ * elementary reflectors were generated (as in QR factorization).
+ * Upon execution, each array in the batch is overwritten with
+ * its respective orthonormal matrix Q.
+ */
+template <typename T>
+struct OrgqrBatchTypePairSupportFactory
+{
+    static constexpr bool is_defined = std::disjunction<
+        dpctl_td_ns::TypePairDefinedEntry<T, double, T, double>,
+        dpctl_td_ns::TypePairDefinedEntry<T, float, T, float>,
+        // fall-through: picked when none of the entries above match
+        dpctl_td_ns::NotDefinedEntry>::is_defined;
+};
+
+/**
+ * @brief A factory to define pairs of supported types for which
+ * MKL LAPACK library provides support in oneapi::mkl::lapack::orgqr<T>
+ * function. Only real entries (double, float) are listed below.
+ *
+ * @tparam T Type of array containing the matrix A from which the
+ * elementary reflectors were generated (as in QR factorization).
+ * Upon execution, the array is overwritten with the orthonormal matrix Q.
+ */
+template <typename T>
+struct OrgqrTypePairSupportFactory
+{
+    static constexpr bool is_defined = std::disjunction<
+        dpctl_td_ns::TypePairDefinedEntry<T, double, T, double>,
+        dpctl_td_ns::TypePairDefinedEntry<T, float, T, float>,
+        // fall-through: picked when none of the entries above match
+        dpctl_td_ns::NotDefinedEntry>::is_defined;
+};
+
 /**
  * @brief A factory to define pairs of supported types for which
  * MKL LAPACK library provides support in oneapi::mkl::lapack::potrf<T>
@@ -259,6 +354,58 @@ struct SyevdTypePairSupportFactory
         // fall-through
         dpctl_td_ns::NotDefinedEntry>::is_defined;
 };
+
+/**
+ * @brief A factory to define pairs of supported types for which
+ * MKL LAPACK library provides support in oneapi::mkl::lapack::ungqr_batch<T>
+ * function. Only complex entries are listed below.
+ *
+ * @tparam T Type of array containing the matrix A,
+ * each from a separate instance in the batch, from which the
+ * elementary reflectors were generated (as in QR factorization).
+ * Upon execution, each array in the batch is overwritten with
+ * its respective complex unitary matrix Q.
+ */
+template <typename T>
+struct UngqrBatchTypePairSupportFactory
+{
+    static constexpr bool is_defined = std::disjunction<
+        dpctl_td_ns::TypePairDefinedEntry<T,
+                                          std::complex<float>,
+                                          T,
+                                          std::complex<float>>,
+        dpctl_td_ns::TypePairDefinedEntry<T,
+                                          std::complex<double>,
+                                          T,
+                                          std::complex<double>>,
+        // fall-through: picked when none of the entries above match
+        dpctl_td_ns::NotDefinedEntry>::is_defined;
+};
+
+/**
+ * @brief A factory to define pairs of supported types for which
+ * MKL LAPACK library provides support in oneapi::mkl::lapack::ungqr<T>
+ * function. Only complex entries are listed below.
+ *
+ * @tparam T Type of array containing the matrix A from which the
+ * elementary reflectors were generated (as in QR factorization).
+ * Upon execution, the array is overwritten with the complex unitary matrix Q.
+ */
+template <typename T>
+struct UngqrTypePairSupportFactory
+{
+    static constexpr bool is_defined = std::disjunction<
+        dpctl_td_ns::TypePairDefinedEntry<T,
+                                          std::complex<float>,
+                                          T,
+                                          std::complex<float>>,
+        dpctl_td_ns::TypePairDefinedEntry<T,
+                                          std::complex<double>,
+                                          T,
+                                          std::complex<double>>,
+        // fall-through: picked when none of the entries above match
+        dpctl_td_ns::NotDefinedEntry>::is_defined;
+};
 } // namespace types
 } // namespace lapack
 } // namespace ext
diff --git a/dpnp/backend/extensions/lapack/ungqr.cpp b/dpnp/backend/extensions/lapack/ungqr.cpp
new file mode 100644
index 00000000000..7c8dea4e950
--- /dev/null
+++ b/dpnp/backend/extensions/lapack/ungqr.cpp
@@ -0,0 +1,263 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "types_matrix.hpp"
+#include "ungqr.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace lapack
+{
+namespace mkl_lapack = oneapi::mkl::lapack;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+// Type-erased signature shared by every ungqr_impl<T> instantiation.
+typedef sycl::event (*ungqr_impl_fn_ptr_t)(sycl::queue, // exec_q
+                                           const std::int64_t, // m
+                                           const std::int64_t, // n
+                                           const std::int64_t, // k
+                                           char *, // in_a
+                                           std::int64_t, // lda
+                                           char *, // in_tau
+                                           std::vector<sycl::event> &,
+                                           const std::vector<sycl::event> &);
+// Indexed by dpctl type lookup id; nullptr marks an unsupported element type.
+static ungqr_impl_fn_ptr_t ungqr_dispatch_vector[dpctl_td_ns::num_types];
+// Runs oneMKL ungqr for element type T; returns the computation event.
+template <typename T>
+static sycl::event ungqr_impl(sycl::queue exec_q,
+                              const std::int64_t m,
+                              const std::int64_t n,
+                              const std::int64_t k,
+                              char *in_a, // type-erased m-by-n matrix
+                              std::int64_t lda,
+                              char *in_tau, // type-erased tau scalars
+                              std::vector<sycl::event> &host_task_events,
+                              const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<T>(exec_q);
+    // in_a / in_tau arrive type-erased; view them as arrays of T.
+    T *a = reinterpret_cast<T *>(in_a);
+    T *tau = reinterpret_cast<T *>(in_tau);
+    // Ask MKL how many T elements of scratch space this call needs.
+    const std::int64_t scratchpad_size =
+        mkl_lapack::ungqr_scratchpad_size<T>(exec_q, m, n, k, lda);
+    T *scratchpad = nullptr;
+
+    std::stringstream error_msg;
+    std::int64_t info = 0;
+    bool is_exception_caught = false;
+
+    sycl::event ungqr_event;
+    try {
+        scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
+
+        ungqr_event = mkl_lapack::ungqr(
+            exec_q,
+            m,          // The number of rows in the matrix; (0 ≤ m).
+            n,          // The number of columns in the matrix; (0 ≤ n).
+            k,          // The number of elementary reflectors
+                        // whose product defines the matrix Q; (0 ≤ k ≤ n).
+            a,          // Pointer to the m-by-n matrix.
+            lda,        // The leading dimension of `a`; (max(1, m) ≤ lda).
+            tau,        // Pointer to the array of scalar factors of the
+                        // elementary reflectors.
+            scratchpad, // Pointer to scratchpad memory to be used by MKL
+                        // routine for storing intermediate results.
+            scratchpad_size, depends);
+    } catch (mkl_lapack::exception const &e) {
+        is_exception_caught = true;
+        info = e.info();
+
+        if (info < 0) {
+            error_msg << "Parameter number " << -info
+                      << " had an illegal value.";
+        }
+        else if (info == scratchpad_size && e.detail() != 0) {
+            error_msg
+                << "Insufficient scratchpad size. Required size is at least "
+                << e.detail() << ", but current size is " << scratchpad_size
+                << ".";
+        }
+        else {
+            error_msg << "Unexpected MKL exception caught during ungqr() "
+                         "call:\nreason: "
+                      << e.what() << "\ninfo: " << info;
+        }
+    } catch (sycl::exception const &e) {
+        is_exception_caught = true;
+        error_msg << "Unexpected SYCL exception caught during ungqr() call:\n"
+                  << e.what();
+    }
+    // Any MKL or SYCL exception caught above is rethrown as std::runtime_error.
+    if (is_exception_caught) // an unexpected error occurs
+    {
+        if (scratchpad != nullptr) {
+            sycl::free(scratchpad, exec_q);
+        }
+        throw std::runtime_error(error_msg.str());
+    }
+    // Free the scratchpad from a host task once the MKL call has completed.
+    sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(ungqr_event);
+        auto ctx = exec_q.get_context();
+        cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); });
+    });
+    host_task_events.push_back(clean_up_event); // handed back to the caller
+
+    return ungqr_event;
+}
+// Validates the arrays, dispatches on element type and submits ungqr.
+std::pair<sycl::event, sycl::event>
+    ungqr(sycl::queue q,
+          const std::int64_t m,
+          const std::int64_t n,
+          const std::int64_t k,
+          dpctl::tensor::usm_ndarray a_array,
+          dpctl::tensor::usm_ndarray tau_array,
+          const std::vector<sycl::event> &depends)
+{
+    const int a_array_nd = a_array.get_ndim();
+    const int tau_array_nd = tau_array.get_ndim();
+    // Every failed check below is reported as py::value_error.
+    if (a_array_nd != 2) {
+        throw py::value_error(
+            "The input array has ndim=" + std::to_string(a_array_nd) +
+            ", but a 2-dimensional array is expected.");
+    }
+
+    if (tau_array_nd != 1) {
+        throw py::value_error("The array of Householder scalars has ndim=" +
+                              std::to_string(tau_array_nd) +
+                              ", but a 1-dimensional array is expected.");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(a_array, tau_array)) {
+        throw py::value_error(
+            "The input array and the array of Householder scalars "
+            "are overlapping segments of memory");
+    }
+
+    bool is_a_array_c_contig = a_array.is_c_contiguous();
+    if (!is_a_array_c_contig) {
+        throw py::value_error("The input array "
+                              "must be C-contiguous");
+    }
+    // tau was checked to be 1-D above; it must report both contiguity flags.
+    bool is_tau_array_c_contig = tau_array.is_c_contiguous();
+    bool is_tau_array_f_contig = tau_array.is_f_contiguous();
+
+    if (!is_tau_array_c_contig || !is_tau_array_f_contig) {
+        throw py::value_error("The array of Householder scalars "
+                              "must be contiguous");
+    }
+    // tau must hold exactly k elements (k is the reflector count given to MKL).
+    const size_t tau_array_size = tau_array.get_size();
+
+    if (static_cast<std::int64_t>(tau_array_size) != k) {
+        throw py::value_error("The array of Householder scalars has size=" +
+                              std::to_string(tau_array_size) +
+                              ", but an array of size=" + std::to_string(k) +
+                              " is expected.");
+    }
+    // A single element type is dispatched on, so both dtypes must match.
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int a_array_type_id =
+        array_types.typenum_to_lookup_id(a_array.get_typenum());
+    int tau_array_type_id =
+        array_types.typenum_to_lookup_id(tau_array.get_typenum());
+
+    if (a_array_type_id != tau_array_type_id) {
+        throw py::value_error(
+            "The types of the input array and "
+            "the array of Householder scalars are mismatched");
+    }
+
+    ungqr_impl_fn_ptr_t ungqr_fn = ungqr_dispatch_vector[a_array_type_id];
+    if (ungqr_fn == nullptr) {
+        throw py::value_error(
+            "No ungqr implementation defined for the provided type "
+            "of the input matrix.");
+    }
+
+    char *a_array_data = a_array.get_data();
+    const std::int64_t lda = std::max<size_t>(1UL, m); // max(1, m) via size_t
+
+    char *tau_array_data = tau_array.get_data();
+
+    std::vector<sycl::event> host_task_events;
+    sycl::event ungqr_ev = ungqr_fn(q, m, n, k, a_array_data, lda,
+                                    tau_array_data, host_task_events, depends);
+    // Presumably holds both arrays until host_task_events finish; confirm.
+    sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array},
+                                                        host_task_events);
+    // first: keep-args-alive event, second: computation event.
+    return std::make_pair(args_ev, ungqr_ev);
+}
+// Maps element type T to ungqr_impl<T>, or nullptr if T is unsupported.
+template <typename fnT, typename T>
+struct UngqrContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::UngqrTypePairSupportFactory<T>::is_defined) {
+            return ungqr_impl<T>;
+        }
+        else {
+            return nullptr; // looked up and rejected by ungqr()
+        }
+    }
+};
+// Fills ungqr_dispatch_vector using UngqrContigFactory for each type.
+void init_ungqr_dispatch_vector(void)
+{
+    dpctl_td_ns::DispatchVectorBuilder<ungqr_impl_fn_ptr_t, UngqrContigFactory,
+                                       dpctl_td_ns::num_types>
+        contig;
+    contig.populate_dispatch_vector(ungqr_dispatch_vector);
+}
+} // namespace lapack
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/lapack/ungqr.hpp b/dpnp/backend/extensions/lapack/ungqr.hpp
new file mode 100644
index 00000000000..1a9b68e94f9
--- /dev/null
+++ b/dpnp/backend/extensions/lapack/ungqr.hpp
@@ -0,0 +1,67 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <CL/sycl.hpp>
+#include <oneapi/mkl.hpp>
+
+#include <dpctl4pybind11.hpp>
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace lapack
+{
+extern std::pair<sycl::event, sycl::event> // (keep-alive ev, compute ev)
+    ungqr(sycl::queue exec_q,
+          const std::int64_t m, // rows of the matrix
+          const std::int64_t n, // columns of the matrix
+          const std::int64_t k, // elementary reflectors; must equal tau's size
+          dpctl::tensor::usm_ndarray a_array, // 2-D, C-contiguous
+          dpctl::tensor::usm_ndarray tau_array, // 1-D, contiguous
+          const std::vector<sycl::event> &depends = {});
+// Batched variant of ungqr (m, n, k presumably apply to each matrix).
+extern std::pair<sycl::event, sycl::event>
+    ungqr_batch(sycl::queue exec_q,
+                dpctl::tensor::usm_ndarray a_array,
+                dpctl::tensor::usm_ndarray tau_array,
+                std::int64_t m,
+                std::int64_t n,
+                std::int64_t k,
+                std::int64_t stride_a,
+                std::int64_t stride_tau,
+                std::int64_t batch_size,
+                const std::vector<sycl::event> &depends = {});
+// Initializers for the type-dispatch tables (presumably one per call above).
+extern void init_ungqr_batch_dispatch_vector(void);
+extern void init_ungqr_dispatch_vector(void);
+} // namespace lapack
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/lapack/ungqr_batch.cpp b/dpnp/backend/extensions/lapack/ungqr_batch.cpp
new file mode 100644
index 00000000000..c07eaf150fc
--- /dev/null
+++ b/dpnp/backend/extensions/lapack/ungqr_batch.cpp
@@ -0,0 +1,278 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "types_matrix.hpp"
+#include "ungqr.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace lapack
+{
+namespace mkl_lapack = oneapi::mkl::lapack;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+typedef sycl::event (*ungqr_batch_impl_fn_ptr_t)(
+    sycl::queue,
+    std::int64_t,
+    std::int64_t,
+    std::int64_t,
+    char *,
+    std::int64_t,
+    std::int64_t,
+    char *,
+    std::int64_t,
+    std::int64_t,
+    std::vector<sycl::event> &,
+    const std::vector<sycl::event> &);
+
+static ungqr_batch_impl_fn_ptr_t
+    ungqr_batch_dispatch_vector[dpctl_td_ns::num_types];
+
+template <typename T>
+static sycl::event ungqr_batch_impl(sycl::queue exec_q,
+                                    std::int64_t m,
+                                    std::int64_t n,
+                                    std::int64_t k,
+                                    char *in_a,
+                                    std::int64_t lda,
+                                    std::int64_t stride_a,
+                                    char *in_tau,
+                                    std::int64_t stride_tau,
+                                    std::int64_t batch_size,
+                                    std::vector<sycl::event> &host_task_events,
+                                    const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<T>(exec_q);
+
+    T *a = reinterpret_cast<T *>(in_a);
+    T *tau = reinterpret_cast<T *>(in_tau);
+
+    const std::int64_t scratchpad_size =
+        mkl_lapack::ungqr_batch_scratchpad_size<T>(
+            exec_q, m, n, k, lda, stride_a, stride_tau, batch_size);
+    T *scratchpad = nullptr;
+
+    std::stringstream error_msg;
+    std::int64_t info = 0;
+    bool is_exception_caught = false;
+
+    sycl::event ungqr_batch_event;
+    try {
+        scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);
+
+        ungqr_batch_event = mkl_lapack::ungqr_batch(
+            exec_q,
+            m, // The number of rows in each matrix in the batch; (0 ≤ m).
+               // It must be a non-negative integer.
+            n, // The number of columns in each matrix in the batch; (0 ≤ n).
+               // It must be a non-negative integer.
+            k, // The number of elementary reflectors
+               // whose product defines the matrices Qi; (0 ≤ k ≤ n).
+            a, // Pointer to the batch of matrices, each of size (m x n).
+            lda,      // The leading dimension of each matrix in the batch.
+                      // For column major layout, lda ≥ max(1, m).
+            stride_a, // Stride between consecutive matrices in the batch.
+            tau, // Pointer to the array of scalar factors of the elementary
+                 // reflectors for each matrix in the batch.
+            stride_tau, // Stride between arrays of scalar factors in the batch.
+            batch_size, // The number of matrices in the batch.
+            scratchpad, // Pointer to scratchpad memory to be used by MKL
+                        // routine for storing intermediate results.
+            scratchpad_size, depends);
+    } catch (mkl_lapack::exception const &e) {
+        is_exception_caught = true;
+        info = e.info();
+
+        if (info < 0) {
+            error_msg << "Parameter number " << -info
+                      << " had an illegal value.";
+        }
+        else if (info == scratchpad_size && e.detail() != 0) {
+            error_msg
+                << "Insufficient scratchpad size. Required size is at least "
+                << e.detail() << ", but current size is " << scratchpad_size
+                << ".";
+        }
+        else if (info != 0 && e.detail() == 0) {
+            error_msg << "Error in batch processing. "
+                         "Number of failed calculations: "
+                      << info;
+        }
+        else {
+            error_msg << "Unexpected MKL exception caught during ungqr_batch() "
+                         "call:\nreason: "
+                      << e.what() << "\ninfo: " << e.info();
+        }
+    } catch (sycl::exception const &e) {
+        is_exception_caught = true;
+        error_msg
+            << "Unexpected SYCL exception caught during ungqr_batch() call:\n"
+            << e.what();
+    }
+
+    if (is_exception_caught) // an unexpected error occurs
+    {
+        if (scratchpad != nullptr) {
+            sycl::free(scratchpad, exec_q);
+        }
+
+        throw std::runtime_error(error_msg.str());
+    }
+
+    sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(ungqr_batch_event);
+        auto ctx = exec_q.get_context();
+        cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); });
+    });
+    host_task_events.push_back(clean_up_event);
+    return ungqr_batch_event;
+}
+
+std::pair<sycl::event, sycl::event>
+    ungqr_batch(sycl::queue q,
+                dpctl::tensor::usm_ndarray a_array,
+                dpctl::tensor::usm_ndarray tau_array,
+                std::int64_t m,
+                std::int64_t n,
+                std::int64_t k,
+                std::int64_t stride_a,
+                std::int64_t stride_tau,
+                std::int64_t batch_size,
+                const std::vector<sycl::event> &depends)
+{
+    const int a_array_nd = a_array.get_ndim();
+    const int tau_array_nd = tau_array.get_ndim();
+
+    if (a_array_nd < 3) {
+        throw py::value_error(
+            "The input array has ndim=" + std::to_string(a_array_nd) +
+            ", but an array with ndim >= 3 is expected.");
+    }
+
+    if (tau_array_nd != 2) {
+        throw py::value_error("The array of Householder scalars has ndim=" +
+                              std::to_string(tau_array_nd) +
+                              ", but a 2-dimensional array is expected.");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(a_array, tau_array)) {
+        throw py::value_error(
+            "The input array and the array of Householder scalars "
+            "are overlapping segments of memory");
+    }
+
+    bool is_a_array_c_contig = a_array.is_c_contiguous();
+    bool is_tau_array_c_contig = tau_array.is_c_contiguous();
+    if (!is_a_array_c_contig) {
+        throw py::value_error("The input array "
+                              "must be C-contiguous");
+    }
+    if (!is_tau_array_c_contig) {
+        throw py::value_error("The array of Householder scalars "
+                              "must be C-contiguous");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int a_array_type_id =
+        array_types.typenum_to_lookup_id(a_array.get_typenum());
+    int tau_array_type_id =
+        array_types.typenum_to_lookup_id(tau_array.get_typenum());
+
+    if (a_array_type_id != tau_array_type_id) {
+        throw py::value_error(
+            "The types of the input array and "
+            "the array of Householder scalars are mismatched");
+    }
+
+    ungqr_batch_impl_fn_ptr_t ungqr_batch_fn =
+        ungqr_batch_dispatch_vector[a_array_type_id];
+    if (ungqr_batch_fn == nullptr) {
+        throw py::value_error(
+            "No ungqr_batch implementation defined for the provided type "
+            "of the input matrix.");
+    }
+
+    char *a_array_data = a_array.get_data();
+    char *tau_array_data = tau_array.get_data();
+
+    const std::int64_t lda = std::max<size_t>(1UL, m);
+
+    std::vector<sycl::event> host_task_events;
+    sycl::event ungqr_batch_ev =
+        ungqr_batch_fn(q, m, n, k, a_array_data, lda, stride_a, tau_array_data,
+                       stride_tau, batch_size, host_task_events, depends);
+
+    sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array},
+                                                        host_task_events);
+
+    return std::make_pair(args_ev, ungqr_batch_ev);
+}
+
+template <typename fnT, typename T>
+struct UngqrBatchContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::UngqrBatchTypePairSupportFactory<T>::is_defined) {
+            return ungqr_batch_impl<T>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_ungqr_batch_dispatch_vector(void)
+{
+    dpctl_td_ns::DispatchVectorBuilder<ungqr_batch_impl_fn_ptr_t,
+                                       UngqrBatchContigFactory,
+                                       dpctl_td_ns::num_types>
+        contig;
+    contig.populate_dispatch_vector(ungqr_batch_dispatch_vector);
+}
+} // namespace lapack
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/include/dpnp_iface_fptr.hpp b/dpnp/backend/include/dpnp_iface_fptr.hpp
index 3061bb01f29..e9a3458f84a 100644
--- a/dpnp/backend/include/dpnp_iface_fptr.hpp
+++ b/dpnp/backend/include/dpnp_iface_fptr.hpp
@@ -220,8 +220,6 @@ enum class DPNPFuncName : size_t
     DPNP_FN_PUT,            /**< Used in numpy.put() impl  */
     DPNP_FN_PUT_ALONG_AXIS, /**< Used in numpy.put_along_axis() impl  */
     DPNP_FN_QR,             /**< Used in numpy.linalg.qr() impl  */
-    DPNP_FN_QR_EXT,         /**< Used in numpy.linalg.qr() impl, requires extra
-                               parameters */
     DPNP_FN_RADIANS,        /**< Used in numpy.radians() impl  */
     DPNP_FN_RADIANS_EXT,    /**< Used in numpy.radians() impl, requires extra
                                parameters */
diff --git a/dpnp/backend/kernels/dpnp_krnl_linalg.cpp b/dpnp/backend/kernels/dpnp_krnl_linalg.cpp
index 610da8fda3c..d74c593115e 100644
--- a/dpnp/backend/kernels/dpnp_krnl_linalg.cpp
+++ b/dpnp/backend/kernels/dpnp_krnl_linalg.cpp
@@ -722,17 +722,6 @@ template <typename _InputDT, typename _ComputeDT>
 void (*dpnp_qr_default_c)(void *, void *, void *, void *, size_t, size_t) =
     dpnp_qr_c<_InputDT, _ComputeDT>;
 
-template <typename _InputDT, typename _ComputeDT>
-DPCTLSyclEventRef (*dpnp_qr_ext_c)(DPCTLSyclQueueRef,
-                                   void *,
-                                   void *,
-                                   void *,
-                                   void *,
-                                   size_t,
-                                   size_t,
-                                   const DPCTLEventVectorRef) =
-    dpnp_qr_c<_InputDT, _ComputeDT>;
-
 template <typename _InputDT, typename _ComputeDT, typename _SVDT>
 DPCTLSyclEventRef dpnp_svd_c(DPCTLSyclQueueRef q_ref,
                              void *array1_in,
@@ -1000,29 +989,6 @@ void func_map_init_linalg_func(func_map_t &fmap)
     // fmap[DPNPFuncName::DPNP_FN_QR][eft_C128][eft_C128] = {
     // eft_C128, (void*)dpnp_qr_c<std::complex<double>, std::complex<double>>};
 
-    fmap[DPNPFuncName::DPNP_FN_QR_EXT][eft_INT][eft_INT] = {
-        get_default_floating_type(),
-        (void *)dpnp_qr_ext_c<
-            int32_t, func_type_map_t::find_type<get_default_floating_type()>>,
-        get_default_floating_type<std::false_type>(),
-        (void *)dpnp_qr_ext_c<
-            int32_t, func_type_map_t::find_type<
-                         get_default_floating_type<std::false_type>()>>};
-    fmap[DPNPFuncName::DPNP_FN_QR_EXT][eft_LNG][eft_LNG] = {
-        get_default_floating_type(),
-        (void *)dpnp_qr_ext_c<
-            int64_t, func_type_map_t::find_type<get_default_floating_type()>>,
-        get_default_floating_type<std::false_type>(),
-        (void *)dpnp_qr_ext_c<
-            int64_t, func_type_map_t::find_type<
-                         get_default_floating_type<std::false_type>()>>};
-    fmap[DPNPFuncName::DPNP_FN_QR_EXT][eft_FLT][eft_FLT] = {
-        eft_FLT, (void *)dpnp_qr_ext_c<float, float>};
-    fmap[DPNPFuncName::DPNP_FN_QR_EXT][eft_DBL][eft_DBL] = {
-        eft_DBL, (void *)dpnp_qr_ext_c<double, double>};
-    // fmap[DPNPFuncName::DPNP_FN_QR_EXT][eft_C128][eft_C128] = {
-    // eft_C128, (void*)dpnp_qr_c<std::complex<double>, std::complex<double>>};
-
     fmap[DPNPFuncName::DPNP_FN_SVD][eft_INT][eft_INT] = {
         eft_DBL, (void *)dpnp_svd_default_c<int32_t, double, double>};
     fmap[DPNPFuncName::DPNP_FN_SVD][eft_LNG][eft_LNG] = {
diff --git a/dpnp/dpnp_algo/dpnp_algo.pxd b/dpnp/dpnp_algo/dpnp_algo.pxd
index 2fc7e1b4a3b..71382d38f26 100644
--- a/dpnp/dpnp_algo/dpnp_algo.pxd
+++ b/dpnp/dpnp_algo/dpnp_algo.pxd
@@ -94,8 +94,6 @@ cdef extern from "dpnp_iface_fptr.hpp" namespace "DPNPFuncName":  # need this na
         DPNP_FN_PARTITION
         DPNP_FN_PARTITION_EXT
         DPNP_FN_PLACE
-        DPNP_FN_QR
-        DPNP_FN_QR_EXT
         DPNP_FN_RADIANS
         DPNP_FN_RADIANS_EXT
         DPNP_FN_RNG_BETA
diff --git a/dpnp/linalg/dpnp_algo_linalg.pyx b/dpnp/linalg/dpnp_algo_linalg.pyx
index 3bf6dad3ee8..67cd5d93034 100644
--- a/dpnp/linalg/dpnp_algo_linalg.pyx
+++ b/dpnp/linalg/dpnp_algo_linalg.pyx
@@ -50,7 +50,6 @@ __all__ = [
     "dpnp_eigvals",
     "dpnp_matrix_rank",
     "dpnp_norm",
-    "dpnp_qr",
 ]
 
 
@@ -323,58 +322,3 @@ cpdef object dpnp_norm(object input, ord=None, axis=None):
         return ret
     else:
         raise ValueError("Improper number of dimensions to norm.")
-
-
-cpdef tuple dpnp_qr(utils.dpnp_descriptor x1, str mode):
-    cdef size_t size_m = x1.shape[0]
-    cdef size_t size_n = x1.shape[1]
-    cdef size_t min_m_n = min(size_m, size_n)
-    cdef size_t size_tau = min_m_n
-
-    cdef DPNPFuncType param1_type = dpnp_dtype_to_DPNPFuncType(x1.dtype)
-    cdef DPNPFuncData kernel_data = get_dpnp_function_ptr(DPNP_FN_QR_EXT, param1_type, param1_type)
-
-    x1_obj = x1.get_array()
-
-    cdef (DPNPFuncType, void *) ret_type_and_func = utils.get_ret_type_and_func(kernel_data,
-                                                                                x1_obj.sycl_device.has_aspect_fp64)
-    cdef DPNPFuncType return_type = ret_type_and_func[0]
-    cdef custom_linalg_1in_3out_shape_t func = < custom_linalg_1in_3out_shape_t > ret_type_and_func[1]
-
-    cdef utils.dpnp_descriptor res_q = utils.create_output_descriptor((size_m, min_m_n),
-                                                                       return_type,
-                                                                       None,
-                                                                       device=x1_obj.sycl_device,
-                                                                       usm_type=x1_obj.usm_type,
-                                                                       sycl_queue=x1_obj.sycl_queue)
-    cdef utils.dpnp_descriptor res_r = utils.create_output_descriptor((min_m_n, size_n),
-                                                                       return_type,
-                                                                       None,
-                                                                       device=x1_obj.sycl_device,
-                                                                       usm_type=x1_obj.usm_type,
-                                                                       sycl_queue=x1_obj.sycl_queue)
-    cdef utils.dpnp_descriptor tau = utils.create_output_descriptor((size_tau, ),
-                                                                     return_type,
-                                                                     None,
-                                                                     device=x1_obj.sycl_device,
-                                                                     usm_type=x1_obj.usm_type,
-                                                                     sycl_queue=x1_obj.sycl_queue)
-
-    result_sycl_queue = res_q.get_array().sycl_queue
-
-    cdef c_dpctl.SyclQueue q = <c_dpctl.SyclQueue> result_sycl_queue
-    cdef c_dpctl.DPCTLSyclQueueRef q_ref = q.get_queue_ref()
-
-    cdef c_dpctl.DPCTLSyclEventRef event_ref = func(q_ref,
-                                                    x1.get_data(),
-                                                    res_q.get_data(),
-                                                    res_r.get_data(),
-                                                    tau.get_data(),
-                                                    size_m,
-                                                    size_n,
-                                                    NULL)  # dep_events_ref
-
-    with nogil: c_dpctl.DPCTLEvent_WaitAndThrow(event_ref)
-    c_dpctl.DPCTLEvent_Delete(event_ref)
-
-    return (res_q.get_pyobj(), res_r.get_pyobj())
diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py
index 2b8506130ad..88a904b3c3c 100644
--- a/dpnp/linalg/dpnp_iface_linalg.py
+++ b/dpnp/linalg/dpnp_iface_linalg.py
@@ -51,6 +51,7 @@
     dpnp_det,
     dpnp_eigh,
     dpnp_inv,
+    dpnp_qr,
     dpnp_slogdet,
     dpnp_solve,
     dpnp_svd,
@@ -529,7 +530,7 @@ def norm(x1, ord=None, axis=None, keepdims=False):
     return call_origin(numpy.linalg.norm, x1, ord, axis, keepdims)
 
 
-def qr(x1, mode="reduced"):
+def qr(a, mode="reduced"):
     """
     Compute the qr factorization of a matrix.
 
@@ -538,25 +539,64 @@ def qr(x1, mode="reduced"):
 
     For full documentation refer to :obj:`numpy.linalg.qr`.
 
-    Limitations
-    -----------
-    Input array is supported as :obj:`dpnp.ndarray`.
-    Parameter mode='reduced' is supported.
+    Parameters
+    ----------
+    a : {dpnp.ndarray, usm_ndarray}
+        The input array with the dimensionality of at least 2.
+    mode : {"reduced", "complete", "r", "raw"}, optional
+        If K = min(M, N), then
+        - "reduced" : returns Q, R with dimensions (…, M, K), (…, K, N)
+        - "complete" : returns Q, R with dimensions (…, M, M), (…, M, N)
+        - "r" : returns R only with dimensions (…, K, N)
+        - "raw" : returns h, tau with dimensions (…, N, M), (…, K,)
+        Default: "reduced".
+
+    Returns
+    -------
+    When mode is "reduced" or "complete", the result will be a tuple
+    of the arrays Q and R.
+    Q : dpnp.ndarray
+        A matrix with orthonormal columns.
+        When mode = "complete" the result is an orthogonal/unitary matrix
+        depending on whether or not a is real/complex.
+        The determinant may be either +/- 1 in that case.
+        In case the number of dimensions in the input array is greater
+        than 2 then a stack of the matrices with above properties is returned.
+    R : dpnp.ndarray
+        The upper-triangular matrix or a stack of upper-triangular matrices
+        if the number of dimensions in the input array is greater than 2.
+    (h, tau) : tuple of dpnp.ndarray
+        The h array contains the Householder reflectors that generate Q along with R.
+        The tau array contains scaling factors for the reflectors.
+
+    Examples
+    --------
+    >>> import dpnp as np
+    >>> a = np.random.randn(9, 6)
+    >>> Q, R = np.linalg.qr(a)
+    >>> np.allclose(a, np.dot(Q, R))  # a does equal QR
+    array([ True])
+    >>> R2 = np.linalg.qr(a, mode='r')
+    >>> np.allclose(R, R2)  # mode='r' returns the same R as mode='reduced'
+    array([ True])
+    >>> a = np.random.normal(size=(3, 2, 2)) # Stack of 2 x 2 matrices as input
+    >>> Q, R = np.linalg.qr(a)
+    >>> Q.shape
+    (3, 2, 2)
+    >>> R.shape
+    (3, 2, 2)
+    >>> np.allclose(a, np.matmul(Q, R))
+    array([ True])
 
     """
 
-    x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_nondefault_queue=False)
-    if x1_desc:
-        if x1_desc.ndim != 2:
-            pass
-        elif mode != "reduced":
-            pass
-        else:
-            result_tup = dpnp_qr(x1_desc, mode)
+    dpnp.check_supported_arrays_type(a)
+    check_stacked_2d(a)
 
-            return result_tup
+    if mode not in ("reduced", "complete", "r", "raw"):
+        raise ValueError(f"Unrecognized mode {mode}")
 
-    return call_origin(numpy.linalg.qr, x1, mode)
+    return dpnp_qr(a, mode)
 
 
 def solve(a, b):
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 93f41883133..a6dcfbf0c2b 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -39,6 +39,7 @@
     "dpnp_det",
     "dpnp_eigh",
     "dpnp_inv",
+    "dpnp_qr",
     "dpnp_slogdet",
     "dpnp_solve",
     "dpnp_svd",
@@ -126,29 +127,6 @@ def _check_lapack_dev_info(dev_info, error_msg=None):
         raise dpnp.linalg.LinAlgError(error_msg)
 
 
-def _real_type(dtype, device=None):
-    """
-    Returns the real data type corresponding to a given dpnp data type.
-
-    Parameters
-    ----------
-    dtype : dpnp.dtype
-        The dtype for which to find the corresponding real data type.
-    device : {None, string, SyclDevice, SyclQueue}, optional
-        An array API concept of device where an array of default floating type might be created.
-
-    Returns
-    -------
-    out : str
-        The name of the real data type.
-
-    """
-
-    default = dpnp.default_float_type(device)
-    real_type = _real_types_map.get(dtype.name, default)
-    return dpnp.dtype(real_type)
-
-
 def _common_type(*arrays):
     """
     Common type for linear algebra operations.
@@ -403,6 +381,29 @@ def _lu_factor(a, res_type):
         return (a_h, ipiv_h, dev_info_array)
 
 
+def _real_type(dtype, device=None):
+    """
+    Returns the real data type corresponding to a given dpnp data type.
+
+    Parameters
+    ----------
+    dtype : dpnp.dtype
+        The dtype for which to find the corresponding real data type.
+    device : {None, string, SyclDevice, SyclQueue}, optional
+        An array API concept of device where an array of default floating type might be created.
+
+    Returns
+    -------
+    out : dpnp.dtype
+        The real data type corresponding to `dtype`.
+
+    """
+
+    default = dpnp.default_float_type(device)
+    real_type = _real_types_map.get(dtype.name, default)
+    return dpnp.dtype(real_type)
+
+
 def _stacked_identity(
     batch_shape, n, dtype, usm_type="device", sycl_queue=None
 ):
@@ -447,6 +448,48 @@ def _stacked_identity(
     return x
 
 
+def _triu_inplace(a, host_tasks, depends=None):
+    """
+    _triu_inplace(a, host_tasks, depends=None)
+
+    Computes the upper triangular part of an array in-place,
+    but currently allocates extra memory for the result.
+
+    Parameters
+    ----------
+    a : {dpnp.ndarray, usm_ndarray}
+        Input array from which the upper triangular part is to be extracted.
+    host_tasks : list
+        A list to which the function appends the host event corresponding to the computation.
+        This allows for dependency management and synchronization with other tasks.
+    depends : list, optional
+        A list of events that the triangular operation depends on.
+        These tasks are completed before the triangular computation starts.
+        If ``None``, defaults to an empty list.
+
+    Returns
+    -------
+    out : dpnp.ndarray
+        A new array containing the upper triangular part of the input array `a`.
+
+    """
+
+    # TODO: implement a dedicated kernel for in-place triu instead of
+    # extra memory allocation for result
+    if depends is None:
+        depends = []
+    out = dpnp.empty_like(a, order="C")
+    ht_triu_ev, _ = ti._triu(
+        src=a.get_array(),
+        dst=out.get_array(),
+        k=0,
+        sycl_queue=a.sycl_queue,
+        depends=depends,
+    )
+    host_tasks.append(ht_triu_ev)
+    return out
+
+
 def check_stacked_2d(*arrays):
     """
     Return ``True`` if each array in `arrays` has at least two dimensions.
@@ -955,6 +998,344 @@ def dpnp_inv(a):
     return b_f
 
 
+def dpnp_qr_batch(a, mode="reduced"):
+    """
+    dpnp_qr_batch(a, mode="reduced")
+
+    Return the batched qr factorization of `a` matrix.
+
+    """
+
+    a_sycl_queue = a.sycl_queue
+    a_usm_type = a.usm_type
+
+    m, n = a.shape[-2:]
+    k = min(m, n)
+
+    batch_shape = a.shape[:-2]
+    batch_size = prod(batch_shape)
+
+    res_type = _common_type(a)
+
+    if batch_size == 0 or k == 0:
+        if mode == "reduced":
+            return (
+                dpnp.empty_like(
+                    a,
+                    shape=batch_shape + (m, k),
+                    dtype=res_type,
+                ),
+                dpnp.empty_like(
+                    a,
+                    shape=batch_shape + (k, n),
+                    dtype=res_type,
+                ),
+            )
+        elif mode == "complete":
+            q = _stacked_identity(
+                batch_shape,
+                m,
+                dtype=res_type,
+                usm_type=a_usm_type,
+                sycl_queue=a_sycl_queue,
+            )
+            return (
+                q,
+                dpnp.empty_like(
+                    a,
+                    shape=batch_shape + (m, n),
+                    dtype=res_type,
+                ),
+            )
+        elif mode == "r":
+            return dpnp.empty_like(
+                a,
+                shape=batch_shape + (k, n),
+                dtype=res_type,
+            )
+        else:  # mode=="raw"
+            return (
+                dpnp.empty_like(
+                    a,
+                    shape=batch_shape + (n, m),
+                    dtype=res_type,
+                ),
+                dpnp.empty_like(
+                    a,
+                    shape=batch_shape + (k,),
+                    dtype=res_type,
+                ),
+            )
+
+    # get 3d input arrays by reshape
+    a = a.reshape(-1, m, n)
+
+    a = a.swapaxes(-2, -1)
+    a_usm_arr = dpnp.get_usm_ndarray(a)
+
+    a_t = dpnp.empty_like(a, order="C", dtype=res_type)
+
+    # use DPCTL tensor function to fill the matrix array
+    # with content from the input array `a`
+    a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr, dst=a_t.get_array(), sycl_queue=a_sycl_queue
+    )
+
+    tau_h = dpnp.empty_like(
+        a_t,
+        shape=(batch_size, k),
+        dtype=res_type,
+    )
+
+    a_stride = a_t.strides[0]
+    tau_stride = tau_h.strides[0]
+
+    # Call the LAPACK extension function _geqrf_batch to compute the QR factorization
+    # of each general m x n matrix in the batch.
+    ht_geqrf_batch_ev, geqrf_batch_ev = li._geqrf_batch(
+        a_sycl_queue,
+        a_t.get_array(),
+        tau_h.get_array(),
+        m,
+        n,
+        a_stride,
+        tau_stride,
+        batch_size,
+        [a_copy_ev],
+    )
+
+    ht_list_ev = [ht_geqrf_batch_ev, a_ht_copy_ev]
+
+    if mode in ["r", "raw"]:
+        if mode == "r":
+            r = a_t[..., :k].swapaxes(-2, -1)
+            r = _triu_inplace(r, ht_list_ev, [geqrf_batch_ev])
+            dpctl.SyclEvent.wait_for(ht_list_ev)
+            return r.reshape(batch_shape + r.shape[-2:])
+
+        # mode=="raw"
+        dpctl.SyclEvent.wait_for(ht_list_ev)
+        q = a_t.reshape(batch_shape + a_t.shape[-2:])
+        r = tau_h.reshape(batch_shape + tau_h.shape[-1:])
+        return (q, r)
+
+    if mode == "complete" and m > n:
+        mc = m
+        q = dpnp.empty_like(
+            a_t,
+            shape=(batch_size, m, m),
+            dtype=res_type,
+        )
+    else:
+        mc = k
+        q = dpnp.empty_like(
+            a_t,
+            shape=(batch_size, n, m),
+            dtype=res_type,
+        )
+
+    # use DPCTL tensor function to fill the matrix array `q[..., :n, :]`
+    # with content from the array `a_t` overwritten by geqrf_batch
+    a_t_ht_copy_ev, a_t_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_t.get_array(),
+        dst=q[..., :n, :].get_array(),
+        sycl_queue=a_sycl_queue,
+        depends=[geqrf_batch_ev],
+    )
+
+    ht_list_ev.append(a_t_ht_copy_ev)
+
+    q_stride = q.strides[0]
+    tau_stride = tau_h.strides[0]
+
+    # Get LAPACK function (_orgqr_batch for real or _ungqr_batch for complex data types)
+    # for generating the matrices Q of the QR factorization
+    lapack_func = (
+        "_ungqr_batch"
+        if dpnp.issubdtype(res_type, dpnp.complexfloating)
+        else "_orgqr_batch"
+    )
+
+    # Call the LAPACK extension function _orgqr_batch/_ungqr_batch to generate
+    # the real orthogonal/complex unitary matrices `Qi` of the QR factorization
+    # for a batch of general matrices.
+    ht_lapack_ev, lapack_ev = getattr(li, lapack_func)(
+        a_sycl_queue,
+        q.get_array(),
+        tau_h.get_array(),
+        m,
+        mc,
+        k,
+        q_stride,
+        tau_stride,
+        batch_size,
+        [a_t_copy_ev],
+    )
+
+    ht_list_ev.append(ht_lapack_ev)
+
+    q = q[..., :mc, :].swapaxes(-2, -1)
+    r = a_t[..., :mc].swapaxes(-2, -1)
+
+    ht_list_ev.append(ht_lapack_ev)
+
+    r = _triu_inplace(r, ht_list_ev, [lapack_ev])
+    dpctl.SyclEvent.wait_for(ht_list_ev)
+
+    return (
+        q.reshape(batch_shape + q.shape[-2:]),
+        r.reshape(batch_shape + r.shape[-2:]),
+    )
+
+
+def dpnp_qr(a, mode="reduced"):
+    """
+    dpnp_qr(a, mode="reduced")
+
+    Return the qr factorization of `a` matrix.
+
+    """
+
+    if a.ndim > 2:
+        return dpnp_qr_batch(a, mode=mode)
+
+    a_usm_arr = dpnp.get_usm_ndarray(a)
+    a_sycl_queue = a.sycl_queue
+    a_usm_type = a.usm_type
+
+    res_type = _common_type(a)
+
+    m, n = a.shape
+    k = min(m, n)
+    if k == 0:
+        if mode == "reduced":
+            return dpnp.empty_like(
+                a,
+                shape=(m, 0),
+                dtype=res_type,
+            ), dpnp.empty_like(
+                a,
+                shape=(0, n),
+                dtype=res_type,
+            )
+        elif mode == "complete":
+            return dpnp.identity(
+                m, dtype=res_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type
+            ), dpnp.empty_like(
+                a,
+                shape=(m, n),
+                dtype=res_type,
+            )
+        elif mode == "r":
+            return dpnp.empty_like(
+                a,
+                shape=(0, n),
+                dtype=res_type,
+            )
+        else:  # mode == "raw"
+            return dpnp.empty_like(
+                a,
+                shape=(n, m),
+                dtype=res_type,
+            ), dpnp.empty_like(
+                a,
+                shape=(0,),
+                dtype=res_type,
+            )
+
+    # Transpose the input matrix to convert from row-major to column-major order.
+    # This adjustment is necessary for compatibility with OneMKL LAPACK routines,
+    # which expect matrices in column-major format.
+    # This allows data to be handled efficiently without the need for additional conversion.
+    a = a.T
+    a_usm_arr = dpnp.get_usm_ndarray(a)
+    a_t = dpnp.empty_like(a, order="C", dtype=res_type)
+
+    # use DPCTL tensor function to fill the matrix array
+    # with content from the input array `a`
+    a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr, dst=a_t.get_array(), sycl_queue=a_sycl_queue
+    )
+
+    tau_h = dpnp.empty_like(
+        a,
+        shape=(k,),
+        dtype=res_type,
+    )
+
+    # Call the LAPACK extension function _geqrf to compute the QR factorization
+    # of a general m x n matrix.
+    ht_geqrf_ev, geqrf_ev = li._geqrf(
+        a_sycl_queue, a_t.get_array(), tau_h.get_array(), [a_copy_ev]
+    )
+
+    ht_list_ev = [ht_geqrf_ev, a_ht_copy_ev]
+
+    if mode in ["r", "raw"]:
+        if mode == "r":
+            r = a_t[:, :k].transpose()
+            r = _triu_inplace(r, ht_list_ev, [geqrf_ev])
+            dpctl.SyclEvent.wait_for(ht_list_ev)
+            return r
+
+        # mode == "raw":
+        dpctl.SyclEvent.wait_for(ht_list_ev)
+        return (a_t, tau_h)
+
+    # mc is the total number of columns in the q matrix.
+    # In `complete` mode, mc equals the number of rows.
+    # In `reduced` mode, mc is the lesser of the row count or column count.
+    if mode == "complete" and m > n:
+        mc = m
+        q = dpnp.empty_like(
+            a_t,
+            shape=(m, m),
+            dtype=res_type,
+        )
+    else:
+        mc = k
+        q = dpnp.empty_like(
+            a_t,
+            shape=(n, m),
+            dtype=res_type,
+        )
+
+    # use DPCTL tensor function to fill the matrix array `q[:n]`
+    # with content from the array `a_t` overwritten by geqrf
+    a_t_ht_copy_ev, a_t_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_t.get_array(),
+        dst=q[:n].get_array(),
+        sycl_queue=a_sycl_queue,
+        depends=[geqrf_ev],
+    )
+
+    ht_list_ev.append(a_t_ht_copy_ev)
+
+    # Get LAPACK function (_orgqr for real or _ungqr for complex data types)
+    # for generating the matrix Q of the QR factorization
+    lapack_func = (
+        "_ungqr"
+        if dpnp.issubdtype(res_type, dpnp.complexfloating)
+        else "_orgqr"
+    )
+
+    # Call the LAPACK extension function _orgqr/_ungqr to generate the real orthogonal/
+    # complex unitary matrix `Q` of the QR factorization
+    ht_lapack_ev, lapack_ev = getattr(li, lapack_func)(
+        a_sycl_queue, m, mc, k, q.get_array(), tau_h.get_array(), [a_t_copy_ev]
+    )
+
+    q = q[:mc].transpose()
+    r = a_t[:, :mc].transpose()
+
+    ht_list_ev.append(ht_lapack_ev)
+
+    r = _triu_inplace(r, ht_list_ev, [lapack_ev])
+    dpctl.SyclEvent.wait_for(ht_list_ev)
+
+    return (q, r)
+
+
 def dpnp_solve(a, b):
     """
     dpnp_solve(a, b)
diff --git a/tests/test_linalg.py b/tests/test_linalg.py
index 85206bad5ba..8e32b867b85 100644
--- a/tests/test_linalg.py
+++ b/tests/test_linalg.py
@@ -1,7 +1,12 @@
 import dpctl
 import numpy
 import pytest
-from numpy.testing import assert_allclose, assert_array_equal, assert_raises
+from numpy.testing import (
+    assert_allclose,
+    assert_almost_equal,
+    assert_array_equal,
+    assert_raises,
+)
 
 import dpnp as inp
 from tests.third_party.cupy import testing
@@ -308,8 +313,8 @@ def test_det_singular_matrix(self, matrix):
         a_np = numpy.array(matrix, dtype="float32")
         a_dp = inp.array(a_np)
 
-        expected = numpy.linalg.slogdet(a_np)
-        result = inp.linalg.slogdet(a_dp)
+        expected = numpy.linalg.det(a_np)
+        result = inp.linalg.det(a_dp)
 
         assert_allclose(expected, result, rtol=1e-3, atol=1e-4)
 
@@ -672,88 +677,141 @@ def test_norm3(array, ord, axis):
     assert_allclose(expected, result)
 
 
-@pytest.mark.usefixtures("allow_fall_back_on_numpy")
-@pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True))
-@pytest.mark.parametrize(
-    "shape",
-    [(2, 2), (3, 4), (5, 3), (16, 16), (0, 0), (0, 2), (2, 0)],
-    ids=["(2,2)", "(3,4)", "(5,3)", "(16,16)", "(0,0)", "(0,2)", "(2,0)"],
-)
-@pytest.mark.parametrize(
-    "mode", ["complete", "reduced"], ids=["complete", "reduced"]
-)
-def test_qr(type, shape, mode):
-    a = numpy.arange(shape[0] * shape[1], dtype=type).reshape(shape)
-    ia = inp.array(a)
+class TestQr:
+    # TODO: New packages that fix issue CMPLRLLVM-53771 are only available in internal CI.
+    # Skip the tests on cpu until these packages are available for the external CI.
+    # Specifically dpcpp_linux-64>=2024.1.0
+    @pytest.mark.skipif(is_cpu_device(), reason="CMPLRLLVM-53771")
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize(
+        "shape",
+        [(2, 2), (3, 4), (5, 3), (16, 16), (2, 2, 2), (2, 4, 2), (2, 2, 4)],
+        ids=[
+            "(2, 2)",
+            "(3, 4)",
+            "(5, 3)",
+            "(16, 16)",
+            "(2, 2, 2)",
+            "(2, 4, 2)",
+            "(2, 2, 4)",
+        ],
+    )
+    @pytest.mark.parametrize(
+        "mode",
+        ["r", "raw", "complete", "reduced"],
+        ids=["r", "raw", "complete", "reduced"],
+    )
+    def test_qr(self, dtype, shape, mode):
+        a = numpy.random.rand(*shape).astype(dtype)
+        ia = inp.array(a)
+
+        if mode == "r":
+            np_r = numpy.linalg.qr(a, mode)
+            dpnp_r = inp.linalg.qr(ia, mode)
+        else:
+            np_q, np_r = numpy.linalg.qr(a, mode)
+            dpnp_q, dpnp_r = inp.linalg.qr(ia, mode)
+
+            # check decomposition
+            if mode in ("complete", "reduced"):
+                if a.ndim == 2:
+                    assert_almost_equal(
+                        inp.dot(dpnp_q, dpnp_r),
+                        a,
+                        decimal=5,
+                    )
+                else:  # a.ndim > 2
+                    assert_almost_equal(
+                        inp.matmul(dpnp_q, dpnp_r),
+                        a,
+                        decimal=5,
+                    )
+            else:  # mode=="raw"
+                assert_dtype_allclose(dpnp_q, np_q)
 
-    np_q, np_r = numpy.linalg.qr(a, mode)
-    dpnp_q, dpnp_r = inp.linalg.qr(ia, mode)
-
-    support_aspect64 = has_support_aspect64()
-
-    if support_aspect64:
-        assert dpnp_q.dtype == np_q.dtype
-        assert dpnp_r.dtype == np_r.dtype
-    assert dpnp_q.shape == np_q.shape
-    assert dpnp_r.shape == np_r.shape
-
-    tol = 1e-6
-    if type == inp.float32:
-        tol = 1e-02
-    elif not support_aspect64 and type in (inp.int32, inp.int64, None):
-        tol = 1e-02
-
-    # check decomposition
-    assert_allclose(
-        ia,
-        inp.dot(dpnp_q, dpnp_r),
-        rtol=tol,
-        atol=tol,
+        if mode in ("raw", "r"):
+            assert_dtype_allclose(dpnp_r, np_r)
+
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize(
+        "shape",
+        [(0, 0), (0, 2), (2, 0), (2, 0, 3), (2, 3, 0), (0, 2, 3)],
+        ids=[
+            "(0, 0)",
+            "(0, 2)",
+            "(2 ,0)",
+            "(2, 0, 3)",
+            "(2, 3, 0)",
+            "(0, 2, 3)",
+        ],
+    )
+    @pytest.mark.parametrize(
+        "mode",
+        ["r", "raw", "complete", "reduced"],
+        ids=["r", "raw", "complete", "reduced"],
     )
+    def test_qr_empty(self, dtype, shape, mode):
+        a = numpy.empty(shape, dtype=dtype)
+        ia = inp.array(a)
 
-    # NP change sign for comparison
-    ncols = min(a.shape[0], a.shape[1])
-    for i in range(ncols):
-        j = numpy.where(numpy.abs(np_q[:, i]) > tol)[0][0]
-        if np_q[j, i] * dpnp_q[j, i] < 0:
-            np_q[:, i] = -np_q[:, i]
-            np_r[i, :] = -np_r[i, :]
-
-        if numpy.any(numpy.abs(np_r[i, :]) > tol):
-            assert_allclose(
-                inp.asnumpy(dpnp_q)[:, i], np_q[:, i], rtol=tol, atol=tol
-            )
+        if mode == "r":
+            np_r = numpy.linalg.qr(a, mode)
+            dpnp_r = inp.linalg.qr(ia, mode)
+        else:
+            np_q, np_r = numpy.linalg.qr(a, mode)
+            dpnp_q, dpnp_r = inp.linalg.qr(ia, mode)
 
-    assert_allclose(dpnp_r, np_r, rtol=tol, atol=tol)
+            assert_dtype_allclose(dpnp_q, np_q)
 
+        assert_dtype_allclose(dpnp_r, np_r)
 
-@pytest.mark.usefixtures("allow_fall_back_on_numpy")
-def test_qr_not_2D():
-    a = numpy.arange(12, dtype=numpy.float32).reshape((3, 2, 2))
-    ia = inp.array(a)
+    @pytest.mark.skipif(is_cpu_device(), reason="CMPLRLLVM-53771")
+    @pytest.mark.parametrize(
+        "mode",
+        ["r", "raw", "complete", "reduced"],
+        ids=["r", "raw", "complete", "reduced"],
+    )
+    def test_qr_strides(self, mode):
+        a = numpy.random.rand(5, 5)
+        ia = inp.array(a)
 
-    np_q, np_r = numpy.linalg.qr(a)
-    dpnp_q, dpnp_r = inp.linalg.qr(ia)
+        # positive strides
+        if mode == "r":
+            np_r = numpy.linalg.qr(a[::2, ::2], mode)
+            dpnp_r = inp.linalg.qr(ia[::2, ::2], mode)
+        else:
+            np_q, np_r = numpy.linalg.qr(a[::2, ::2], mode)
+            dpnp_q, dpnp_r = inp.linalg.qr(ia[::2, ::2], mode)
 
-    assert dpnp_q.dtype == np_q.dtype
-    assert dpnp_r.dtype == np_r.dtype
-    assert dpnp_q.shape == np_q.shape
-    assert dpnp_r.shape == np_r.shape
+            assert_dtype_allclose(dpnp_q, np_q)
 
-    assert_allclose(ia, inp.matmul(dpnp_q, dpnp_r))
+        assert_dtype_allclose(dpnp_r, np_r)
 
-    a = numpy.empty((0, 3, 2), dtype=numpy.float32)
-    ia = inp.array(a)
+        # negative strides
+        if mode == "r":
+            np_r = numpy.linalg.qr(a[::-2, ::-2], mode)
+            dpnp_r = inp.linalg.qr(ia[::-2, ::-2], mode)
+        else:
+            np_q, np_r = numpy.linalg.qr(a[::-2, ::-2], mode)
+            dpnp_q, dpnp_r = inp.linalg.qr(ia[::-2, ::-2], mode)
 
-    np_q, np_r = numpy.linalg.qr(a)
-    dpnp_q, dpnp_r = inp.linalg.qr(ia)
+            assert_dtype_allclose(dpnp_q, np_q)
 
-    assert dpnp_q.dtype == np_q.dtype
-    assert dpnp_r.dtype == np_r.dtype
-    assert dpnp_q.shape == np_q.shape
-    assert dpnp_r.shape == np_r.shape
+        assert_dtype_allclose(dpnp_r, np_r)
 
-    assert_allclose(ia, inp.matmul(dpnp_q, dpnp_r))
+    def test_qr_errors(self):
+        a_dp = inp.array([[1, 2], [3, 5]], dtype="float32")
+
+        # unsupported type
+        a_np = inp.asnumpy(a_dp)
+        assert_raises(TypeError, inp.linalg.qr, a_np)
+
+        # a.ndim < 2
+        a_dp_ndim_1 = a_dp.flatten()
+        assert_raises(inp.linalg.LinAlgError, inp.linalg.qr, a_dp_ndim_1)
+
+        # invalid mode
+        assert_raises(ValueError, inp.linalg.qr, a_dp, "c")
 
 
 class TestSolve:
@@ -1018,14 +1076,6 @@ def check_decomposition(
             dpnp_diag_s = inp.zeros_like(dp_a, dtype=dp_s.dtype)
             for i in range(min(dp_a.shape[-2], dp_a.shape[-1])):
                 dpnp_diag_s[..., i, i] = dp_s[..., i]
-            # TODO: remove it when dpnp.dot is updated
-            # dpnp.dot does not support complex type
-            if inp.issubdtype(dp_a.dtype, inp.complexfloating):
-                reconstructed = numpy.dot(
-                    inp.asnumpy(dp_u),
-                    numpy.dot(inp.asnumpy(dpnp_diag_s), inp.asnumpy(dp_vt)),
-                )
-            else:
                 reconstructed = inp.dot(dp_u, inp.dot(dpnp_diag_s, dp_vt))
             # TODO: use assert dpnp.allclose() inside check_decomposition()
             # when it will support complex dtypes
diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py
index f6329d8f216..de243744403 100644
--- a/tests/test_sycl_queue.py
+++ b/tests/test_sycl_queue.py
@@ -1202,34 +1202,52 @@ def test_matrix_rank(device):
     assert_array_equal(expected, result)
 
 
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (4, 4),
+        (2, 0),
+        (2, 2, 3),
+        (0, 2, 3),
+        (1, 0, 3),
+    ],
+    ids=[
+        "(4, 4)",
+        "(2, 0)",
+        "(2, 2, 3)",
+        "(0, 2, 3)",
+        "(1, 0, 3)",
+    ],
+)
+@pytest.mark.parametrize(
+    "mode",
+    ["r", "raw", "complete", "reduced"],
+    ids=["r", "raw", "complete", "reduced"],
+)
 @pytest.mark.parametrize(
     "device",
     valid_devices,
     ids=[device.filter_string for device in valid_devices],
 )
-def test_qr(device):
-    data = [[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]]
-    dpnp_data = dpnp.array(data, device=device)
-    numpy_data = numpy.array(data, dtype=dpnp_data.dtype)
-
-    np_q, np_r = numpy.linalg.qr(numpy_data, "reduced")
-    dpnp_q, dpnp_r = dpnp.linalg.qr(dpnp_data, "reduced")
+def test_qr(shape, mode, device):
+    dtype = dpnp.default_float_type(device)
+    count_elems = numpy.prod(shape)
+    a = dpnp.arange(count_elems, dtype=dtype, device=device).reshape(shape)
 
-    assert dpnp_q.dtype == np_q.dtype
-    assert dpnp_r.dtype == np_r.dtype
-    assert dpnp_q.shape == np_q.shape
-    assert dpnp_r.shape == np_r.shape
+    expected_queue = a.get_array().sycl_queue
 
-    assert_dtype_allclose(dpnp_q, np_q)
-    assert_dtype_allclose(dpnp_r, np_r)
+    if mode == "r":
+        dp_r = dpnp.linalg.qr(a, mode=mode)
+        dp_r_queue = dp_r.get_array().sycl_queue
+        assert_sycl_queue_equal(dp_r_queue, expected_queue)
+    else:
+        dp_q, dp_r = dpnp.linalg.qr(a, mode=mode)
 
-    expected_queue = dpnp_data.get_array().sycl_queue
-    dpnp_q_queue = dpnp_q.get_array().sycl_queue
-    dpnp_r_queue = dpnp_r.get_array().sycl_queue
+        dp_q_queue = dp_q.get_array().sycl_queue
+        dp_r_queue = dp_r.get_array().sycl_queue
 
-    # compare queue and device
-    assert_sycl_queue_equal(dpnp_q_queue, expected_queue)
-    assert_sycl_queue_equal(dpnp_r_queue, expected_queue)
+        assert_sycl_queue_equal(dp_q_queue, expected_queue)
+        assert_sycl_queue_equal(dp_r_queue, expected_queue)
 
 
 @pytest.mark.parametrize(
diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py
index 29101cf9f48..56e2a68756a 100644
--- a/tests/test_usm_type.py
+++ b/tests/test_usm_type.py
@@ -796,3 +796,40 @@ def test_svd(usm_type, shape, full_matrices_param, compute_uv_param):
         )
 
     assert x.usm_type == s.usm_type
+
+
+@pytest.mark.parametrize("usm_type", list_of_usm_types, ids=list_of_usm_types)
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (4, 4),
+        (2, 0),
+        (2, 2, 3),
+        (0, 2, 3),
+        (1, 0, 3),
+    ],
+    ids=[
+        "(4, 4)",
+        "(2, 0)",
+        "(2, 2, 3)",
+        "(0, 2, 3)",
+        "(1, 0, 3)",
+    ],
+)
+@pytest.mark.parametrize(
+    "mode",
+    ["r", "raw", "complete", "reduced"],
+    ids=["r", "raw", "complete", "reduced"],
+)
+def test_qr(shape, mode, usm_type):
+    count_elems = numpy.prod(shape)
+    a = dp.arange(count_elems, usm_type=usm_type).reshape(shape)
+
+    if mode == "r":
+        dp_r = dp.linalg.qr(a, mode=mode)
+        assert a.usm_type == dp_r.usm_type
+    else:
+        dp_q, dp_r = dp.linalg.qr(a, mode=mode)
+
+        assert a.usm_type == dp_q.usm_type
+        assert a.usm_type == dp_r.usm_type
diff --git a/tests/third_party/cupy/linalg_tests/test_decomposition.py b/tests/third_party/cupy/linalg_tests/test_decomposition.py
index fd887c16e6c..234a2e0e381 100644
--- a/tests/third_party/cupy/linalg_tests/test_decomposition.py
+++ b/tests/third_party/cupy/linalg_tests/test_decomposition.py
@@ -201,38 +201,31 @@ def check_usv(self, shape, dtype):
         # reconstruct the matrix
         k = s_cpu.shape[-1]
 
-        # dpnp.dot/matmul does not support complex type and unstable on cpu
-        # TODO: remove it and use xp.dot/matmul when dpnp.dot/matmul is updated
-        u_gpu = u_gpu.asnumpy()
-        vh_gpu = vh_gpu.asnumpy()
-        s_gpu = s_gpu.asnumpy()
-        xp = numpy
-
         if len(shape) == 2:
             if self.full_matrices:
-                a_gpu_usv = numpy.dot(u_gpu[:, :k] * s_gpu, vh_gpu[:k, :])
+                a_gpu_usv = cupy.dot(u_gpu[:, :k] * s_gpu, vh_gpu[:k, :])
             else:
-                a_gpu_usv = numpy.dot(u_gpu * s_gpu, vh_gpu)
+                a_gpu_usv = cupy.dot(u_gpu * s_gpu, vh_gpu)
         else:
             if self.full_matrices:
-                a_gpu_usv = numpy.matmul(
+                a_gpu_usv = cupy.matmul(
                     u_gpu[..., :k] * s_gpu[..., None, :], vh_gpu[..., :k, :]
                 )
             else:
-                a_gpu_usv = numpy.matmul(u_gpu * s_gpu[..., None, :], vh_gpu)
+                a_gpu_usv = cupy.matmul(u_gpu * s_gpu[..., None, :], vh_gpu)
         testing.assert_allclose(a_gpu, a_gpu_usv, rtol=1e-4, atol=1e-4)
 
         # assert unitary
         u_len = u_gpu.shape[-1]
         vh_len = vh_gpu.shape[-2]
         testing.assert_allclose(
-            xp.matmul(u_gpu.swapaxes(-1, -2).conj(), u_gpu),
-            stacked_identity(xp, shape[:-2], u_len, dtype),
+            cupy.matmul(u_gpu.swapaxes(-1, -2).conj(), u_gpu),
+            stacked_identity(cupy, shape[:-2], u_len, dtype),
             atol=1e-4,
         )
         testing.assert_allclose(
-            xp.matmul(vh_gpu, vh_gpu.swapaxes(-1, -2).conj()),
-            stacked_identity(xp, shape[:-2], vh_len, dtype),
+            cupy.matmul(vh_gpu, vh_gpu.swapaxes(-1, -2).conj()),
+            stacked_identity(cupy, shape[:-2], vh_len, dtype),
             atol=1e-4,
         )
 
@@ -385,3 +378,77 @@ def test_svd_rank4_empty_array(self):
         self.check_usv((0, 2, 3, 4))
         self.check_usv((1, 2, 0, 4))
         self.check_usv((1, 2, 3, 0))
+
+
+@testing.parameterize(
+    *testing.product(
+        {
+            "mode": ["r", "raw", "complete", "reduced"],
+        }
+    )
+)
+class TestQRDecomposition(unittest.TestCase):
+    @testing.for_dtypes("fdFD")
+    def check_mode(self, array, mode, dtype):
+        a_cpu = numpy.asarray(array, dtype=dtype)
+        a_gpu = cupy.asarray(array, dtype=dtype)
+        result_gpu = cupy.linalg.qr(a_gpu, mode=mode)
+        if (
+            mode != "raw"
+            or numpy.lib.NumpyVersion(numpy.__version__) >= "1.22.0rc1"
+        ):
+            result_cpu = numpy.linalg.qr(a_cpu, mode=mode)
+            self._check_result(result_cpu, result_gpu)
+
+    def _check_result(self, result_cpu, result_gpu):
+        if isinstance(result_cpu, tuple):
+            for b_cpu, b_gpu in zip(result_cpu, result_gpu):
+                assert b_cpu.dtype == b_gpu.dtype
+                testing.assert_allclose(b_cpu, b_gpu, atol=1e-4)
+        else:
+            assert result_cpu.dtype == result_gpu.dtype
+            testing.assert_allclose(result_cpu, result_gpu, atol=1e-4)
+
+    # TODO: New packages that fix issue CMPLRLLVM-53771 are only available in internal CI.
+    # Skip the tests on cpu until these packages are available for the external CI.
+    # Specifically dpcpp_linux-64>=2024.1.0
+    @pytest.mark.skipif(is_cpu_device(), reason="CMPLRLLVM-53771")
+    @testing.fix_random()
+    @_condition.repeat(3, 10)
+    def test_mode(self):
+        self.check_mode(numpy.random.randn(2, 4), mode=self.mode)
+        self.check_mode(numpy.random.randn(3, 3), mode=self.mode)
+        self.check_mode(numpy.random.randn(5, 4), mode=self.mode)
+
+    @pytest.mark.skipif(is_cpu_device(), reason="CMPLRLLVM-53771")
+    @testing.with_requires("numpy>=1.22")
+    @testing.fix_random()
+    def test_mode_rank3(self):
+        self.check_mode(numpy.random.randn(3, 2, 4), mode=self.mode)
+        self.check_mode(numpy.random.randn(4, 3, 3), mode=self.mode)
+        self.check_mode(numpy.random.randn(2, 5, 4), mode=self.mode)
+
+    @pytest.mark.skipif(is_cpu_device(), reason="CMPLRLLVM-53771")
+    @testing.with_requires("numpy>=1.22")
+    @testing.fix_random()
+    def test_mode_rank4(self):
+        self.check_mode(numpy.random.randn(2, 3, 2, 4), mode=self.mode)
+        self.check_mode(numpy.random.randn(2, 4, 3, 3), mode=self.mode)
+        self.check_mode(numpy.random.randn(2, 2, 5, 4), mode=self.mode)
+
+    @testing.with_requires("numpy>=1.16")
+    def test_empty_array(self):
+        self.check_mode(numpy.empty((0, 3)), mode=self.mode)
+        self.check_mode(numpy.empty((3, 0)), mode=self.mode)
+
+    @testing.with_requires("numpy>=1.22")
+    def test_empty_array_rank3(self):
+        self.check_mode(numpy.empty((0, 3, 2)), mode=self.mode)
+        self.check_mode(numpy.empty((3, 0, 2)), mode=self.mode)
+        self.check_mode(numpy.empty((3, 2, 0)), mode=self.mode)
+        self.check_mode(numpy.empty((0, 3, 3)), mode=self.mode)
+        self.check_mode(numpy.empty((3, 0, 3)), mode=self.mode)
+        self.check_mode(numpy.empty((3, 3, 0)), mode=self.mode)
+        self.check_mode(numpy.empty((0, 2, 3)), mode=self.mode)
+        self.check_mode(numpy.empty((2, 0, 3)), mode=self.mode)
+        self.check_mode(numpy.empty((2, 3, 0)), mode=self.mode)

From 6c99e65187e43c8ec072eca179413baf57fa195e Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Thu, 8 Feb 2024 19:05:54 +0100
Subject: [PATCH 24/29] Implement a helper alias template for complex types
 (#1644)

---
 dpnp/backend/kernels/dpnp_krnl_fft.cpp |  4 ++--
 dpnp/backend/src/dpnp_fptr.hpp         | 12 ++++++++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/dpnp/backend/kernels/dpnp_krnl_fft.cpp b/dpnp/backend/kernels/dpnp_krnl_fft.cpp
index 027f3343178..aec669a8699 100644
--- a/dpnp/backend/kernels/dpnp_krnl_fft.cpp
+++ b/dpnp/backend/kernels/dpnp_krnl_fft.cpp
@@ -414,7 +414,7 @@ DPCTLSyclEventRef dpnp_fft_fft_c(DPCTLSyclQueueRef q_ref,
                                  const size_t norm,
                                  const DPCTLEventVectorRef dep_event_vec_ref)
 {
-    static_assert(sycl::detail::is_complex<_DataType_output>::value,
+    static_assert(is_complex<_DataType_output>::value,
                   "Output data type must be a complex type.");
 
     DPCTLSyclEventRef event_ref = nullptr;
@@ -584,7 +584,7 @@ DPCTLSyclEventRef dpnp_fft_rfft_c(DPCTLSyclQueueRef q_ref,
                                   const size_t norm,
                                   const DPCTLEventVectorRef dep_event_vec_ref)
 {
-    static_assert(sycl::detail::is_complex<_DataType_output>::value,
+    static_assert(is_complex<_DataType_output>::value,
                   "Output data type must be a complex type.");
     DPCTLSyclEventRef event_ref = nullptr;
 
diff --git a/dpnp/backend/src/dpnp_fptr.hpp b/dpnp/backend/src/dpnp_fptr.hpp
index a46f3a7d35d..022e844319d 100644
--- a/dpnp/backend/src/dpnp_fptr.hpp
+++ b/dpnp/backend/src/dpnp_fptr.hpp
@@ -219,6 +219,18 @@ template <typename _Tp>
 using dpnp_remove_cvref_t =
     typename std::remove_cv_t<typename std::remove_reference_t<_Tp>>;
 
+/**
+ * A helper type trait: `value` is true for std::complex<float> and
+ * std::complex<double>, and false otherwise.
+ */
+template <typename _Tp>
+struct is_complex : public std::integral_constant<
+                        bool,
+                        std::is_same_v<_Tp, std::complex<float>> ||
+                            std::is_same_v<_Tp, std::complex<double>>>
+{
+};
+
 /**
  * @brief "<" comparison with complex types support.
  *

From 2ce997db0b4bbd9d28f0f224a4f59a78e34451f8 Mon Sep 17 00:00:00 2001
From: vtavana <120411540+vtavana@users.noreply.github.com>
Date: Thu, 8 Feb 2024 17:40:54 -0600
Subject: [PATCH 25/29] Unmute result type tests and modify
 TypeError/ValueError tests (#1663)

* unmute result type tests

* update TypeError and ValueError tests

* update TestResultType
---
 tests/test_manipulation.py                   |  3 --
 tests/test_mathematical.py                   | 44 +++++---------------
 tests/test_umath.py                          | 28 ++++---------
 tests/third_party/cupy/test_type_routines.py | 29 +++++++++++--
 4 files changed, 44 insertions(+), 60 deletions(-)

diff --git a/tests/test_manipulation.py b/tests/test_manipulation.py
index bb5533b0e62..0c830950197 100644
--- a/tests/test_manipulation.py
+++ b/tests/test_manipulation.py
@@ -72,9 +72,6 @@ def test_repeat(arr):
     assert_array_equal(expected, result)
 
 
-# TODO: Temporary skipping the test, until Internal CI is updated with
-# recent changed in dpctl regarding dpt.result_type function
-@pytest.mark.skip("Temporary skipping the test")
 def test_result_type():
     X = [dpnp.ones((2), dtype=dpnp.int64), dpnp.int32, "float32"]
     X_np = [numpy.ones((2), dtype=numpy.int64), numpy.int32, "float32"]
diff --git a/tests/test_mathematical.py b/tests/test_mathematical.py
index 56be3db6d92..80fe09c61b8 100644
--- a/tests/test_mathematical.py
+++ b/tests/test_mathematical.py
@@ -1199,9 +1199,7 @@ def test_invalid_dtype(self, dtype):
         dp_array = dpnp.arange(10, dtype=dpnp_dtype)
         dp_out = dpnp.empty(10, dtype=dtype)
 
-        # TODO: change it to ValueError, when dpctl
-        # is being used in internal CI
-        with pytest.raises((TypeError, ValueError)):
+        with pytest.raises(ValueError):
             dpnp.ceil(dp_array, out=dp_out)
 
     @pytest.mark.parametrize("dtype", get_float_dtypes())
@@ -1241,9 +1239,7 @@ def test_invalid_dtype(self, dtype):
         dp_array = dpnp.arange(10, dtype=dpnp_dtype)
         dp_out = dpnp.empty(10, dtype=dtype)
 
-        # TODO: change it to ValueError, when dpctl
-        # is being used in internal CI
-        with pytest.raises((TypeError, ValueError)):
+        with pytest.raises(ValueError):
             dpnp.floor(dp_array, out=dp_out)
 
     @pytest.mark.parametrize("dtype", get_float_dtypes())
@@ -1283,9 +1279,7 @@ def test_invalid_dtype(self, dtype):
         dp_array = dpnp.arange(10, dtype=dpnp_dtype)
         dp_out = dpnp.empty(10, dtype=dtype)
 
-        # TODO: change it to ValueError, when dpctl
-        # is being used in internal CI
-        with pytest.raises((TypeError, ValueError)):
+        with pytest.raises(ValueError):
             dpnp.trunc(dp_array, out=dp_out)
 
     @pytest.mark.parametrize("dtype", get_float_dtypes())
@@ -1336,9 +1330,7 @@ def test_out_dtypes(self, dtype):
         dp_out = dpnp.empty(size, dtype=dpnp.complex64)
         if dtype != dpnp.complex64:
             # dtype of out mismatches types of input arrays
-            # TODO: change it to ValueError, when dpctl
-            # is being used in internal CI
-            with pytest.raises((TypeError, ValueError)):
+            with pytest.raises(ValueError):
                 dpnp.add(dp_array1, dp_array2, out=dp_out)
 
             # allocate new out with expected type
@@ -1435,9 +1427,7 @@ def test_out_dtypes(self, dtype):
         check_dtype = True
         if dtype != dpnp.complex64:
             # dtype of out mismatches types of input arrays
-            # TODO: change it to ValueError, when dpctl
-            # is being used in internal CI
-            with pytest.raises((TypeError, ValueError)):
+            with pytest.raises(ValueError):
                 dpnp.divide(dp_array1, dp_array2, out=dp_out)
 
             # allocate new out with expected type
@@ -1538,9 +1528,7 @@ def test_out_dtypes(self, dtype):
         dp_out = dpnp.empty(size, dtype=dpnp.complex64)
         if dtype != dpnp.complex64:
             # dtype of out mismatches types of input arrays
-            # TODO: change it to ValueError, when dpctl
-            # is being used in internal CI
-            with pytest.raises((TypeError, ValueError)):
+            with pytest.raises(ValueError):
                 dpnp.floor_divide(dp_array1, dp_array2, out=dp_out)
 
             # allocate new out with expected type
@@ -1800,9 +1788,7 @@ def test_out_dtypes(self, dtype):
         dp_out = dpnp.empty(size, dtype=dpnp.float32)
         if dtype != dpnp.float32:
             # dtype of out mismatches types of input arrays
-            # TODO: change it to ValueError, when dpctl
-            # is being used in internal CI
-            with pytest.raises((TypeError, ValueError)):
+            with pytest.raises(ValueError):
                 dpnp.hypot(dp_array1, dp_array2, out=dp_out)
 
             # allocate new out with expected type
@@ -1970,9 +1956,7 @@ def test_out_dtypes(self, dtype):
         dp_out = dpnp.empty(size, dtype=dpnp.complex64)
         if dtype != dpnp.complex64:
             # dtype of out mismatches types of input arrays
-            # TODO: change it to ValueError, when dpctl
-            # is being used in internal CI
-            with pytest.raises((TypeError, ValueError)):
+            with pytest.raises(ValueError):
                 dpnp.maximum(dp_array1, dp_array2, out=dp_out)
 
             # allocate new out with expected type
@@ -2053,9 +2037,7 @@ def test_out_dtypes(self, dtype):
         dp_out = dpnp.empty(size, dtype=dpnp.complex64)
         if dtype != dpnp.complex64:
             # dtype of out mismatches types of input arrays
-            # TODO: change it to ValueError, when dpctl
-            # is being used in internal CI
-            with pytest.raises((TypeError, ValueError)):
+            with pytest.raises(ValueError):
                 dpnp.minimum(dp_array1, dp_array2, out=dp_out)
 
             # allocate new out with expected type
@@ -2136,9 +2118,7 @@ def test_out_dtypes(self, dtype):
         dp_out = dpnp.empty(size, dtype=dpnp.complex64)
         if dtype != dpnp.complex64:
             # dtype of out mismatches types of input arrays
-            # TODO: change it to ValueError, when dpctl
-            # is being used in internal CI
-            with pytest.raises((TypeError, ValueError)):
+            with pytest.raises(ValueError):
                 dpnp.multiply(dp_array1, dp_array2, out=dp_out)
 
             # allocate new out with expected type
@@ -2233,9 +2213,7 @@ def test_out_dtypes(self, dtype):
         dp_out = dpnp.empty(size, dtype=dpnp.complex64)
         if dtype != dpnp.complex64:
             # dtype of out mismatches types of input arrays
-            # TODO: change it to ValueError, when dpctl
-            # is being used in internal CI
-            with pytest.raises((TypeError, ValueError)):
+            with pytest.raises(ValueError):
                 dpnp.power(dp_array1, dp_array2, out=dp_out)
 
             # allocate new out with expected type
diff --git a/tests/test_umath.py b/tests/test_umath.py
index 8e04a439bc9..2f792c0ab3c 100644
--- a/tests/test_umath.py
+++ b/tests/test_umath.py
@@ -209,9 +209,7 @@ def test_invalid_dtype(self, func_params, dtype):
         dp_array = dpnp.arange(10, dtype=dpnp_dtype)
         dp_out = dpnp.empty(10, dtype=dtype)
 
-        # TODO: change it to ValueError, when dpctl
-        # is being used in internal CI
-        with pytest.raises((TypeError, ValueError)):
+        with pytest.raises(ValueError):
             getattr(dpnp, func_name)(dp_array, out=dp_out)
 
     @pytest.mark.parametrize(
@@ -256,9 +254,7 @@ def test_invalid_dtype(self, dtype):
         dp_array = dpnp.arange(10, dtype=dpnp_dtype)
         dp_out = dpnp.empty(10, dtype=dtype)
 
-        # TODO: change it to ValueError, when dpctl
-        # is being used in internal CI
-        with pytest.raises((TypeError, ValueError)):
+        with pytest.raises(ValueError):
             dpnp.cbrt(dp_array, out=dp_out)
 
     @pytest.mark.parametrize(
@@ -295,9 +291,7 @@ def test_invalid_dtype(self, dtype):
         dp_array = dpnp.arange(10, dtype=dpnp_dtype)
         dp_out = dpnp.empty(10, dtype=dtype)
 
-        # TODO: change it to ValueError, when dpctl
-        # is being used in internal CI
-        with pytest.raises((TypeError, ValueError)):
+        with pytest.raises(ValueError):
             dpnp.rsqrt(dp_array, out=dp_out)
 
     @pytest.mark.parametrize(
@@ -338,9 +332,7 @@ def test_invalid_dtype(self, dtype):
         dp_array = dpnp.arange(10, dtype=dpnp_dtype)
         dp_out = dpnp.empty(10, dtype=dtype)
 
-        # TODO: change it to ValueError, when dpctl
-        # is being used in internal CI
-        with pytest.raises((TypeError, ValueError)):
+        with pytest.raises(ValueError):
             dpnp.square(dp_array, out=dp_out)
 
     @pytest.mark.parametrize(
@@ -423,9 +415,7 @@ def test_invalid_dtype(self, dtype):
         dp_array = dpnp.arange(10, dtype=dpnp_dtype)
         dp_out = dpnp.empty(10, dtype=dtype)
 
-        # TODO: change it to ValueError, when dpctl
-        # is being used in internal CI
-        with pytest.raises((TypeError, ValueError)):
+        with pytest.raises(ValueError):
             dpnp.arctan2(dp_array, dp_array, out=dp_out)
 
     @pytest.mark.parametrize(
@@ -461,9 +451,7 @@ def test_invalid_dtype(self, dtype):
         dpnp_dtype = get_all_dtypes(no_complex=True, no_none=True)[-1]
         dp_array = dpnp.arange(10, dtype=dpnp_dtype)
         dp_out = dpnp.empty(10, dtype=dtype)
-        # TODO: change it to ValueError, when dpctl
-        # is being used in internal CI
-        with pytest.raises((TypeError, ValueError)):
+        with pytest.raises(ValueError):
             dpnp.copysign(dp_array, dp_array, out=dp_out)
 
     @pytest.mark.parametrize(
@@ -499,9 +487,7 @@ def test_invalid_dtype(self, dtype):
         dpnp_dtype = get_all_dtypes(no_complex=True, no_none=True)[-1]
         dp_array = dpnp.arange(10, dtype=dpnp_dtype)
         dp_out = dpnp.empty(10, dtype=dtype)
-        # TODO: change it to ValueError, when dpctl
-        # is being used in internal CI
-        with pytest.raises((TypeError, ValueError)):
+        with pytest.raises(ValueError):
             dpnp.logaddexp(dp_array, dp_array, out=dp_out)
 
     @pytest.mark.parametrize(
diff --git a/tests/third_party/cupy/test_type_routines.py b/tests/third_party/cupy/test_type_routines.py
index e6fd09c7419..ebfe56d6d42 100644
--- a/tests/third_party/cupy/test_type_routines.py
+++ b/tests/third_party/cupy/test_type_routines.py
@@ -4,6 +4,7 @@
 import pytest
 
 import dpnp as cupy
+from tests.helper import has_support_aspect64
 from tests.third_party.cupy import testing
 
 
@@ -87,9 +88,6 @@ def test_common_type_bool(self, dtype):
         }
     )
 )
-# TODO: Temporary skipping the test, until Internal CI is updated with
-# recent changed in dpctl regarding dpt.result_type function
-@pytest.mark.skip("Temporary skipping the test")
 class TestResultType(unittest.TestCase):
     @testing.for_all_dtypes_combination(names=("dtype1", "dtype2"))
     @testing.numpy_cupy_equal()
@@ -100,6 +98,31 @@ def test_result_type(self, xp, dtype1, dtype2):
         input1 = _generate_type_routines_input(xp, dtype1, self.obj_type1)
 
         input2 = _generate_type_routines_input(xp, dtype2, self.obj_type2)
+
+        flag1 = isinstance(input1, (numpy.ndarray, cupy.ndarray))
+        flag2 = isinstance(input2, (numpy.ndarray, cupy.ndarray))
+        dt1 = cupy.dtype(input1) if not flag1 else None
+        dt2 = cupy.dtype(input2) if not flag2 else None
+        # dpnp takes device capabilities into account only if one of the
+        # inputs is an array. In that case, if the other dtype is not
+        # supported by the device, dpnp raises ValueError, so skip the test.
+        if flag1 or flag2:
+            if (
+                dt1 in [cupy.float64, cupy.complex128]
+                or dt2 in [cupy.float64, cupy.complex128]
+            ) and not has_support_aspect64():
+                pytest.skip("No fp64 support by device.")
+
         ret = xp.result_type(input1, input2)
+
+        # dpnp takes device capabilities into account if one of the inputs
+        # is an array. In that case, the NumPy result has to be downcast
+        # (float64 -> float32, complex128 -> complex64) to match the device.
+        if (flag1 or flag2) and xp == numpy and not has_support_aspect64():
+            ret = numpy.dtype(numpy.float32) if ret == numpy.float64 else ret
+            ret = (
+                numpy.dtype(numpy.complex64) if ret == numpy.complex128 else ret
+            )
+
         assert isinstance(ret, numpy.dtype)
         return ret

From 0957dddc19ff819cfe3bd686308c5a6107adec00 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 9 Feb 2024 13:15:54 +0100
Subject: [PATCH 26/29] Fix memory leak in dpnp_algo_random (#1700)

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/random/dpnp_algo_random.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpnp/random/dpnp_algo_random.pyx b/dpnp/random/dpnp_algo_random.pyx
index 432d3a3294c..3d2a10c51a9 100644
--- a/dpnp/random/dpnp_algo_random.pyx
+++ b/dpnp/random/dpnp_algo_random.pyx
@@ -442,7 +442,7 @@ cdef class MT19937(_Engine):
                 try:
                     for i in range(vector_seed_len):
                         vector_seed[i] = <uint32_t> seed[i]
-                except (ValueError, TypeError) as e:
+                except Exception as e:
                     free(vector_seed)
                     raise e
         else:

From 4c7859b5dae41600377eea665ccf763218d0a226 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 9 Feb 2024 18:36:22 +0100
Subject: [PATCH 27/29] Fix ExecutionPlacementError for dpnp.take_along_axis
 (#1702)

* Follow compute follows data to fill fancy_index

* Update take_along_axis tests to cover the issue

* Update test_take_along_axis
---
 dpnp/dpnp_iface_indexing.py |  7 ++++++-
 tests/test_sycl_queue.py    | 36 ++++++++++++++++++++++++++++++++----
 tests/test_usm_type.py      | 26 ++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py
index 8f973ed1f1a..a930b3c56d9 100644
--- a/dpnp/dpnp_iface_indexing.py
+++ b/dpnp/dpnp_iface_indexing.py
@@ -111,7 +111,12 @@ def _build_along_axis_index(a, indices, axis):
         else:
             ind_shape = shape_ones[:dim] + (-1,) + shape_ones[dim + 1 :]
             fancy_index.append(
-                dpnp.arange(n, dtype=indices.dtype).reshape(ind_shape)
+                dpnp.arange(
+                    n,
+                    dtype=indices.dtype,
+                    usm_type=indices.usm_type,
+                    sycl_queue=indices.sycl_queue,
+                ).reshape(ind_shape)
             )
 
     return tuple(fancy_index)
diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py
index de243744403..6bc24af6c7d 100644
--- a/tests/test_sycl_queue.py
+++ b/tests/test_sycl_queue.py
@@ -1520,21 +1520,20 @@ def test_clip(device):
     assert_sycl_queue_equal(x.sycl_queue, y.sycl_queue)
 
 
-@pytest.mark.parametrize("func", ["take", "take_along_axis"])
 @pytest.mark.parametrize(
     "device",
     valid_devices,
     ids=[device.filter_string for device in valid_devices],
 )
-def test_take(func, device):
+def test_take(device):
     numpy_data = numpy.arange(5)
     dpnp_data = dpnp.array(numpy_data, device=device)
 
     dpnp_ind = dpnp.array([0, 2, 4], device=device)
     np_ind = dpnp_ind.asnumpy()
 
-    result = getattr(dpnp, func)(dpnp_data, dpnp_ind, axis=None)
-    expected = getattr(numpy, func)(numpy_data, np_ind, axis=None)
+    result = dpnp.take(dpnp_data, dpnp_ind, axis=None)
+    expected = numpy.take(numpy_data, np_ind, axis=None)
     assert_allclose(expected, result)
 
     expected_queue = dpnp_data.get_array().sycl_queue
@@ -1542,6 +1541,35 @@ def test_take(func, device):
     assert_sycl_queue_equal(result_queue, expected_queue)
 
 
+@pytest.mark.parametrize(
+    "data, ind, axis",
+    [
+        (numpy.arange(6), numpy.array([0, 2, 4]), None),
+        (
+            numpy.arange(6).reshape((2, 3)),
+            numpy.array([0, 1]).reshape((2, 1)),
+            1,
+        ),
+    ],
+)
+@pytest.mark.parametrize(
+    "device",
+    valid_devices,
+    ids=[device.filter_string for device in valid_devices],
+)
+def test_take_along_axis(data, ind, axis, device):
+    dp_data = dpnp.array(data, device=device)
+    dp_ind = dpnp.array(ind, device=device)
+
+    result = dpnp.take_along_axis(dp_data, dp_ind, axis=axis)
+    expected = numpy.take_along_axis(data, ind, axis=axis)
+    assert_allclose(expected, result)
+
+    expected_queue = dp_data.get_array().sycl_queue
+    result_queue = result.get_array().sycl_queue
+    assert_sycl_queue_equal(result_queue, expected_queue)
+
+
 @pytest.mark.parametrize(
     "device",
     valid_devices,
diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py
index 56e2a68756a..e188cdb1c47 100644
--- a/tests/test_usm_type.py
+++ b/tests/test_usm_type.py
@@ -570,6 +570,32 @@ def test_take(func, usm_type_x, usm_type_ind):
     assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_ind])
 
 
+@pytest.mark.parametrize(
+    "data, ind, axis",
+    [
+        (numpy.arange(6), numpy.array([0, 2, 4]), None),
+        (
+            numpy.arange(6).reshape((2, 3)),
+            numpy.array([0, 1]).reshape((2, 1)),
+            1,
+        ),
+    ],
+)
+@pytest.mark.parametrize("usm_type_x", list_of_usm_types, ids=list_of_usm_types)
+@pytest.mark.parametrize(
+    "usm_type_ind", list_of_usm_types, ids=list_of_usm_types
+)
+def test_take_along_axis(data, ind, axis, usm_type_x, usm_type_ind):
+    x = dp.array(data, usm_type=usm_type_x)
+    ind = dp.array(ind, usm_type=usm_type_ind)
+
+    z = dp.take_along_axis(x, ind, axis=axis)
+
+    assert x.usm_type == usm_type_x
+    assert ind.usm_type == usm_type_ind
+    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_ind])
+
+
 @pytest.mark.parametrize(
     "data, is_empty",
     [

From 0457fe174dbfe760bfe5633e8e7d12183a4d6166 Mon Sep 17 00:00:00 2001
From: vtavana <120411540+vtavana@users.noreply.github.com>
Date: Fri, 9 Feb 2024 13:48:00 -0600
Subject: [PATCH 28/29] implement `dpnp.tensordot` (#1699)

* implement dpnp.tensordot

* update doc string

* address comments

* fix doc string

* update scaling factor

* add TODO comment

---------

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/dpnp_iface_linearalgebra.py              | 153 +++++++++++++---
 dpnp/dpnp_iface_sorting.py                    |   2 -
 tests/helper.py                               |   9 +-
 tests/skipped_tests.tbl                       |   4 -
 tests/skipped_tests_gpu.tbl                   |   4 -
 tests/test_dot.py                             | 168 +++++++++++++++---
 tests/test_mathematical.py                    |   4 +-
 tests/test_sycl_queue.py                      |   5 +
 tests/test_usm_type.py                        |   5 +
 .../cupy/linalg_tests/test_product.py         |  20 ---
 10 files changed, 298 insertions(+), 76 deletions(-)

diff --git a/dpnp/dpnp_iface_linearalgebra.py b/dpnp/dpnp_iface_linearalgebra.py
index bffe881b626..7baca14c93b 100644
--- a/dpnp/dpnp_iface_linearalgebra.py
+++ b/dpnp/dpnp_iface_linearalgebra.py
@@ -39,6 +39,7 @@
 
 
 import numpy
+from numpy.core.numeric import normalize_axis_tuple
 
 import dpnp
 from dpnp.dpnp_algo import *
@@ -66,9 +67,9 @@ def dot(a, b, out=None):
 
     Parameters
     ----------
-    a : {dpnp_array, usm_ndarray, scalar}
+    a : {dpnp.ndarray, usm_ndarray, scalar}
         First input array. Both inputs `a` and `b` can not be scalars at the same time.
-    b : {dpnp_array, usm_ndarray, scalar}
+    b : {dpnp.ndarray, usm_ndarray, scalar}
         Second input array. Both inputs `a` and `b` can not be scalars at the same time.
     out : {dpnp.ndarray, usm_ndarray}, optional
         Alternative output array in which to place the result. It must have
@@ -404,42 +405,152 @@ def outer(x1, x2, out=None):
     return call_origin(numpy.outer, x1, x2, out=out)
 
 
-def tensordot(x1, x2, axes=2):
-    """
+def tensordot(a, b, axes=2):
+    r"""
     Compute tensor dot product along specified axes.
 
     For full documentation refer to :obj:`numpy.tensordot`.
 
-    Limitations
-    -----------
-    Parameters `x1` and `x2` are supported as :obj:`dpnp.ndarray`.
-    Keyword argument `kwargs` is currently unsupported.
-    Parameter `axes` is supported only with value ``1``.
-    Otherwise the functions will be executed sequentially on CPU.
-    Input array data types are limited by supported DPNP :ref:`Data types`.
+    Parameters
+    ----------
+    a : {dpnp.ndarray, usm_ndarray, scalar}
+        First input array. Both inputs `a` and `b` can not be scalars at the same time.
+    b : {dpnp.ndarray, usm_ndarray, scalar}
+        Second input array. Both inputs `a` and `b` can not be scalars at the same time.
+    axes : int or (2,) array_like
+        * integer_like
+          If an int `N`, sum over the last `N` axes of `a` and the first `N` axes
+          of `b` in order. The sizes of the corresponding axes must match.
+        * (2,) array_like
+          Or, a list of axes to be summed over, first sequence applying to `a`,
+          second to `b`. Both array_like elements must be of the same length.
+
+    Returns
+    -------
+    out : dpnp.ndarray
+        Returns the tensordot product of `a` and `b`.
 
     See Also
     --------
     :obj:`dpnp.dot` : Returns the dot product.
     :obj:`dpnp.einsum` : Evaluates the Einstein summation convention on the operands.
 
+    Notes
+    -----
+    Three common use cases are:
+        * ``axes = 0`` : tensor product :math:`a \otimes b`
+        * ``axes = 1`` : tensor dot product :math:`a \cdot b`
+        * ``axes = 2`` : (default) tensor double contraction :math:`a:b`
+
+    When `axes` is integer, the sequence for evaluation will be: first
+    the -Nth axis in `a` and 0th axis in `b`, and the -1th axis in `a` and
+    (N-1)th axis in `b` last.
+
+    When there is more than one axis to sum over - and they are not the last
+    (first) axes of `a` (`b`) - the argument `axes` should consist of
+    two sequences of the same length, with the first axis to sum over given
+    first in both sequences, the second axis second, and so forth.
+
+    The shape of the result consists of the non-contracted axes of the
+    first tensor, followed by the non-contracted axes of the second.
+
     Examples
     --------
     >>> import dpnp as np
     >>> a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
     >>> b = np.array([1, 2, 3])
-    >>> result = np.tensordot(a, b, 1)
-    >>> [x for x in result]
-    [14, 32, 50]
+    >>> np.tensordot(a, b, 1)
+    array([14, 32, 50])
+
+    >>> a = np.arange(60.).reshape(3,4,5)
+    >>> b = np.arange(24.).reshape(4,3,2)
+    >>> c = np.tensordot(a,b, axes=([1,0],[0,1]))
+    >>> c.shape
+    (5, 2)
+    >>> c
+    array([[4400., 4730.],
+           [4532., 4874.],
+           [4664., 5018.],
+           [4796., 5162.],
+           [4928., 5306.]])
+
+    A slower but equivalent way of computing the same...
+
+    >>> d = np.zeros((5,2))
+    >>> for i in range(5):
+    ...   for j in range(2):
+    ...     for k in range(3):
+    ...       for n in range(4):
+    ...         d[i,j] += a[k,n,i] * b[n,k,j]
+    >>> c == d
+    array([[ True,  True],
+           [ True,  True],
+           [ True,  True],
+           [ True,  True],
+           [ True,  True]])
 
     """
 
-    x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_nondefault_queue=False)
-    x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_nondefault_queue=False)
-    if x1_desc and x2_desc and (axes == 1):
-        return dpnp_tensordot_not_implemented(x1_desc, x2_desc)  # dpnp_matmul
+    dpnp.check_supported_arrays_type(a, b, scalar_type=True)
 
-    return call_origin(numpy.tensordot, x1, x2, axes)
+    if dpnp.isscalar(a):
+        a = dpnp.array(a, sycl_queue=b.sycl_queue, usm_type=b.usm_type)
+    elif dpnp.isscalar(b):
+        b = dpnp.array(b, sycl_queue=a.sycl_queue, usm_type=a.usm_type)
+
+    try:
+        iter(axes)
+    except Exception:
+        if not isinstance(axes, int):
+            raise TypeError("Axes must be an integer.")
+        axes_a = tuple(range(-axes, 0))
+        axes_b = tuple(range(0, axes))
+    else:
+        if len(axes) != 2:
+            raise ValueError("Axes must consist of two sequences.")
+
+        axes_a, axes_b = axes
+        axes_a = (axes_a,) if dpnp.isscalar(axes_a) else axes_a
+        axes_b = (axes_b,) if dpnp.isscalar(axes_b) else axes_b
+
+        if len(axes_a) != len(axes_b):
+            raise ValueError("Axes length mismatch.")
+
+    a_shape = a.shape
+    b_shape = b.shape
+    for axis_a, axis_b in zip(axes_a, axes_b):
+        if a_shape[axis_a] != b_shape[axis_b]:
+            raise ValueError(
+                "shape of input arrays is not similar at requested axes."
+            )
+
+    # Make the axes non-negative
+    a_ndim = a.ndim
+    b_ndim = b.ndim
+    axes_a = normalize_axis_tuple(axes_a, a_ndim, "axis")
+    axes_b = normalize_axis_tuple(axes_b, b_ndim, "axis")
+
+    # Move the axes to sum over, to the end of "a"
+    notin = tuple(k for k in range(a_ndim) if k not in axes_a)
+    newaxes_a = notin + axes_a
+    N1 = int(numpy.prod([a_shape[ax] for ax in notin]))
+    N2 = int(numpy.prod([a_shape[ax] for ax in axes_a]))
+    newshape_a = (N1, N2)
+    olda = [a_shape[axis] for axis in notin]
+
+    # Move the axes to sum over, to the front of "b"
+    notin = tuple(k for k in range(b_ndim) if k not in axes_b)
+    newaxes_b = tuple(axes_b + notin)
+    N1 = int(numpy.prod([b_shape[ax] for ax in axes_b]))
+    N2 = int(numpy.prod([b_shape[ax] for ax in notin]))
+    newshape_b = (N1, N2)
+    oldb = [b_shape[axis] for axis in notin]
+
+    at = a.transpose(newaxes_a).reshape(newshape_a)
+    bt = b.transpose(newaxes_b).reshape(newshape_b)
+    res = dpnp.matmul(at, bt)
+
+    return res.reshape(olda + oldb)
 
 
 def vdot(a, b):
@@ -450,11 +561,11 @@ def vdot(a, b):
 
     Parameters
     ----------
-    a : {dpnp_array, usm_ndarray, scalar}
+    a : {dpnp.ndarray, usm_ndarray, scalar}
         First input array. Both inputs `a` and `b` can not be
         scalars at the same time. If `a` is complex, the complex
         conjugate is taken before the calculation of the dot product.
-    b : {dpnp_array, usm_ndarray, scalar}
+    b : {dpnp.ndarray, usm_ndarray, scalar}
         Second input array. Both inputs `a` and `b` can not be
         scalars at the same time.
 
diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py
index 6a3db20e74c..93e8db2172b 100644
--- a/dpnp/dpnp_iface_sorting.py
+++ b/dpnp/dpnp_iface_sorting.py
@@ -1,5 +1,3 @@
-# cython: language_level=3
-# distutils: language = c++
 # -*- coding: utf-8 -*-
 # *****************************************************************************
 # Copyright (c) 2016-2024, Intel Corporation
diff --git a/tests/helper.py b/tests/helper.py
index aac6b51a1c6..2a2873afdce 100644
--- a/tests/helper.py
+++ b/tests/helper.py
@@ -8,7 +8,11 @@
 
 
 def assert_dtype_allclose(
-    dpnp_arr, numpy_arr, check_type=True, check_only_type_kind=False
+    dpnp_arr,
+    numpy_arr,
+    check_type=True,
+    check_only_type_kind=False,
+    factor=8,
 ):
     """
     Assert DPNP and NumPy array based on maximum dtype resolution of input arrays
@@ -28,6 +32,7 @@ def assert_dtype_allclose(
     The 'check_only_type_kind' parameter (False by default) asserts only equal type kinds
     for all data types supported by DPNP when set to True.
     It is effective only when 'check_type' is also set to True.
+    The parameter `factor` scales the resolution used for comparing the arrays.
 
     """
 
@@ -44,7 +49,7 @@ def assert_dtype_allclose(
             if is_inexact(numpy_arr)
             else -dpnp.inf
         )
-        tol = 8 * max(tol_dpnp, tol_numpy)
+        tol = factor * max(tol_dpnp, tol_numpy)
         assert_allclose(dpnp_arr.asnumpy(), numpy_arr, atol=tol, rtol=tol)
         if check_type:
             numpy_arr_dtype = numpy_arr.dtype
diff --git a/tests/skipped_tests.tbl b/tests/skipped_tests.tbl
index a38624e3757..182eaf8877a 100644
--- a/tests/skipped_tests.tbl
+++ b/tests/skipped_tests.tbl
@@ -335,10 +335,6 @@ tests/third_party/cupy/linalg_tests/test_einsum.py::TestListArgEinSumError::test
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_invlarge
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_large
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_of_two
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_tensordot_zero_dim
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot_with_int_axes
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot_with_list_axes
 
 tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_broadcast_not_allowed
 tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_diff_dtypes_is_equal
diff --git a/tests/skipped_tests_gpu.tbl b/tests/skipped_tests_gpu.tbl
index ce6f6aef984..d6fd43e1887 100644
--- a/tests/skipped_tests_gpu.tbl
+++ b/tests/skipped_tests_gpu.tbl
@@ -437,10 +437,6 @@ tests/third_party/cupy/linalg_tests/test_einsum.py::TestListArgEinSumError::test
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_invlarge
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_large
 tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_of_two
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot_with_int_axes
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot_with_list_axes
-tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_tensordot_zero_dim
 
 tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_broadcast_not_allowed
 tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_diff_dtypes_is_equal
diff --git a/tests/test_dot.py b/tests/test_dot.py
index 42478db9634..03045f002a8 100644
--- a/tests/test_dot.py
+++ b/tests/test_dot.py
@@ -44,9 +44,6 @@ def test_dot_scalar(self, dtype):
         expected = numpy.dot(a, b)
         assert_allclose(result, expected)
 
-    # TODO: get rid of falls back on NumPy when tensordot
-    # is implemented using OneMKL
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @pytest.mark.parametrize("dtype", get_all_dtypes(no_complex=True))
     @pytest.mark.parametrize(
         "array_info",
@@ -88,9 +85,6 @@ def test_dot(self, dtype, array_info):
         expected = numpy.dot(a, b)
         assert_dtype_allclose(result, expected)
 
-    # TODO: get rid of falls back on NumPy when tensordot
-    # is implemented using OneMKL
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @pytest.mark.parametrize("dtype", get_complex_dtypes())
     @pytest.mark.parametrize(
         "array_info",
@@ -132,9 +126,6 @@ def test_dot_complex(self, dtype, array_info):
         expected = numpy.dot(a, b)
         assert_dtype_allclose(result, expected)
 
-    # TODO: get rid of falls back on NumPy when tensordot
-    # is implemented using OneMKL
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @pytest.mark.parametrize("dtype", get_all_dtypes())
     @pytest.mark.parametrize(
         "array_info",
@@ -214,9 +205,6 @@ def test_dot_out_scalar(self, dtype):
         assert result is dp_out
         assert_allclose(result, expected)
 
-    # TODO: get rid of falls back on NumPy when tensordot
-    # is implemented using OneMKL
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @pytest.mark.parametrize("dtype", get_all_dtypes())
     @pytest.mark.parametrize(
         "array_info",
@@ -294,21 +282,14 @@ def test_dot_out_error_scalar(self, ia):
 
         # output data type is incorrect
         dp_out = dpnp.empty((10,), dtype=dpnp.int64)
-        # TODO: change it to ValueError, when updated
-        # dpctl is being used in internal CI
-        with pytest.raises((ValueError, TypeError)):
+        with pytest.raises(ValueError):
             dpnp.dot(ia, ib, out=dp_out)
 
         # output shape is incorrect
         dp_out = dpnp.empty((2,), dtype=dpnp.int32)
-        # TODO: change it to ValueError, when updated
-        # dpctl is being used in internal CI
-        with pytest.raises((ValueError, TypeError)):
+        with pytest.raises(ValueError):
             dpnp.dot(ia, ib, out=dp_out)
 
-    # TODO: get rid of falls back on NumPy when tensordot
-    # is implemented using OneMKL
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @pytest.mark.parametrize(
         "shape_pair",
         [
@@ -373,6 +354,151 @@ def test_multi_dot(type):
     assert_array_equal(expected, result)
 
 
+class TestTensordot:
+    @pytest.mark.parametrize("dtype", get_all_dtypes())
+    def test_tensordot_scalar(self, dtype):
+        a = 2
+        b = numpy.array(numpy.random.uniform(-5, 5, 10), dtype=dtype)
+        ib = dpnp.array(b)
+
+        result = dpnp.tensordot(a, ib, axes=0)
+        expected = numpy.tensordot(a, b, axes=0)
+        assert_allclose(result, expected)
+
+        result = dpnp.tensordot(ib, a, axes=0)
+        expected = numpy.tensordot(b, a, axes=0)
+        assert_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_complex=True))
+    @pytest.mark.parametrize("axes", [-3, -2, -1, 0, 1, 2])
+    def test_tensordot(self, dtype, axes):
+        a = numpy.array(numpy.random.uniform(-10, 10, 64), dtype=dtype).reshape(
+            4, 4, 4
+        )
+        b = numpy.array(numpy.random.uniform(-10, 10, 64), dtype=dtype).reshape(
+            4, 4, 4
+        )
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+
+        result = dpnp.tensordot(ia, ib, axes=axes)
+        expected = numpy.tensordot(a, b, axes=axes)
+        # TODO: investigate the effect of factor, see SAT-6700
+        assert_dtype_allclose(result, expected, factor=24)
+
+    @pytest.mark.parametrize("dtype", get_complex_dtypes())
+    @pytest.mark.parametrize("axes", [-3, -2, -1, 0, 1, 2])
+    def test_tensordot_complex(self, dtype, axes):
+        x11 = numpy.random.uniform(-10, 10, 64)
+        x12 = numpy.random.uniform(-10, 10, 64)
+        x21 = numpy.random.uniform(-10, 10, 64)
+        x22 = numpy.random.uniform(-10, 10, 64)
+        a = numpy.array(x11 + 1j * x12, dtype=dtype).reshape(4, 4, 4)
+        b = numpy.array(x21 + 1j * x22, dtype=dtype).reshape(4, 4, 4)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+
+        result = dpnp.tensordot(ia, ib, axes=axes)
+        expected = numpy.tensordot(a, b, axes=axes)
+        # TODO: investigate the effect of factor, see SAT-6700
+        assert_dtype_allclose(result, expected, factor=24)
+
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize(
+        "axes",
+        [
+            ([0, 1]),
+            ([0, 1], [1, 2]),
+            (2, 3),
+            ([-2, -3], [3, 2]),
+            ((3, 1), (0, 2)),
+        ],
+    )
+    def test_tensordot_axes(self, dtype, axes):
+        a = numpy.array(
+            numpy.random.uniform(-10, 10, 120), dtype=dtype
+        ).reshape(2, 5, 3, 4)
+        b = numpy.array(
+            numpy.random.uniform(-10, 10, 120), dtype=dtype
+        ).reshape(4, 2, 5, 3)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+
+        result = dpnp.tensordot(ia, ib, axes=axes)
+        expected = numpy.tensordot(a, b, axes=axes)
+        # TODO: investigate the effect of factor, see SAT-6700
+        assert_dtype_allclose(result, expected, factor=24)
+
+    @pytest.mark.parametrize("dtype1", get_all_dtypes())
+    @pytest.mark.parametrize("dtype2", get_all_dtypes())
+    def test_tensordot_input_dtype_matrix(self, dtype1, dtype2):
+        a = numpy.array(
+            numpy.random.uniform(-10, 10, 60), dtype=dtype1
+        ).reshape(3, 4, 5)
+        b = numpy.array(
+            numpy.random.uniform(-10, 10, 40), dtype=dtype2
+        ).reshape(4, 5, 2)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+
+        result = dpnp.tensordot(ia, ib)
+        expected = numpy.tensordot(a, b)
+        # TODO: investigate the effect of factor, see SAT-6700
+        assert_dtype_allclose(result, expected, factor=24)
+
+    def test_tensordot_strided(self):
+        for dim in [1, 2, 3, 4]:
+            axes = 1 if dim == 1 else 2
+            A = numpy.random.rand(*([10] * dim))
+            B = dpnp.asarray(A)
+            # positive stride
+            slices = tuple(slice(None, None, 2) for _ in range(dim))
+            a = A[slices]
+            b = B[slices]
+
+            result = dpnp.tensordot(b, b, axes=axes)
+            expected = numpy.tensordot(a, a, axes=axes)
+            assert_dtype_allclose(result, expected)
+
+            # negative stride
+            slices = tuple(slice(None, None, -2) for _ in range(dim))
+            a = A[slices]
+            b = B[slices]
+
+            result = dpnp.tensordot(b, b, axes=axes)
+            expected = numpy.tensordot(a, a, axes=axes)
+            assert_dtype_allclose(result, expected)
+
+    def test_tensordot_error(self):
+        a = 5
+        b = 2
+        # both inputs are scalar
+        with pytest.raises(TypeError):
+            dpnp.tensordot(a, b, axes=0)
+
+        a = dpnp.arange(24).reshape(2, 3, 4)
+        b = dpnp.arange(24).reshape(3, 4, 2)
+        # axes should be an integer
+        with pytest.raises(TypeError):
+            dpnp.tensordot(a, b, axes=2.0)
+
+        # Axes must consist of two sequences
+        with pytest.raises(ValueError):
+            dpnp.tensordot(a, b, axes=([0, 2],))
+
+        # Axes length mismatch
+        with pytest.raises(ValueError):
+            dpnp.tensordot(a, b, axes=([0, 2], [2]))
+
+        # shape of input arrays is not similar at requested axes
+        with pytest.raises(ValueError):
+            dpnp.tensordot(a, b, axes=([0, 2], [2, 0]))
+
+        # out of range index
+        with pytest.raises(IndexError):
+            dpnp.tensordot(a, b, axes=([0, 3], [2, 0]))
+
+
 class TestVdot:
     @pytest.mark.parametrize("dtype", get_all_dtypes())
     def test_vdot_scalar(self, dtype):
diff --git a/tests/test_mathematical.py b/tests/test_mathematical.py
index 80fe09c61b8..12115b5256c 100644
--- a/tests/test_mathematical.py
+++ b/tests/test_mathematical.py
@@ -2726,7 +2726,7 @@ def test_matmul_strided(self):
         for dim in [1, 2, 3, 4]:
             A = numpy.random.rand(*([20] * dim))
             B = dpnp.asarray(A)
-            # positive strides
+            # positive stride
             slices = tuple(slice(None, None, 2) for _ in range(dim))
             a = A[slices]
             b = B[slices]
@@ -2735,7 +2735,7 @@ def test_matmul_strided(self):
             expected = numpy.matmul(a, a)
             assert_dtype_allclose(result, expected)
 
-            # negative strides
+            # negative stride
             slices = tuple(slice(None, None, -2) for _ in range(dim))
             a = A[slices]
             b = B[slices]
diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py
index 6bc24af6c7d..479e96e0229 100644
--- a/tests/test_sycl_queue.py
+++ b/tests/test_sycl_queue.py
@@ -579,6 +579,11 @@ def test_reduce_hypot(device):
             [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
             [0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0],
         ),
+        pytest.param(
+            "tensordot",
+            [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]],
+            [[4.0, 4.0, 4.0], [4.0, 4.0, 4.0]],
+        ),
         # dpnp.vdot has 3 different implementations based on input arrays dtype
         # checking all of them
         pytest.param("vdot", [3.0, 4.0, 5.0], [1.0, 2.0, 3.0]),
diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py
index e188cdb1c47..21dfb3cde67 100644
--- a/tests/test_usm_type.py
+++ b/tests/test_usm_type.py
@@ -505,6 +505,11 @@ def test_1in_1out(func, data, usm_type):
         pytest.param("logaddexp", [[-1, 2, 5, 9]], [[4, -3, 2, -8]]),
         pytest.param("maximum", [[0.0, 1.0, 2.0]], [[3.0, 4.0, 5.0]]),
         pytest.param("minimum", [[0.0, 1.0, 2.0]], [[3.0, 4.0, 5.0]]),
+        pytest.param(
+            "tensordot",
+            [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]],
+            [[4.0, 4.0, 4.0], [4.0, 4.0, 4.0]],
+        ),
         # dpnp.vdot has 3 different implementations based on input arrays dtype
         # checking all of them
         pytest.param("vdot", [3.0, 4.0, 5.0], [1.0, 2.0, 3.0]),
diff --git a/tests/third_party/cupy/linalg_tests/test_product.py b/tests/third_party/cupy/linalg_tests/test_product.py
index 1fd048356b4..e59b30dcd6e 100644
--- a/tests/third_party/cupy/linalg_tests/test_product.py
+++ b/tests/third_party/cupy/linalg_tests/test_product.py
@@ -36,9 +36,6 @@
         }
     )
 )
-# TODO: get rid of falls back on NumPy when tensordot
-# is implemented using OneMKL
-@pytest.mark.usefixtures("allow_fall_back_on_numpy")
 class TestDot(unittest.TestCase):
     @testing.for_all_dtypes_combination(["dtype_a", "dtype_b"])
     @testing.numpy_cupy_allclose(type_check=has_support_aspect64())
@@ -161,9 +158,6 @@ def test_dot_vec1(self, xp, dtype):
         b = testing.shaped_arange((2,), xp, dtype)
         return xp.dot(a, b)
 
-    # TODO: get rid of falls back on NumPy when tensordot
-    # is implemented using OneMKL
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
     def test_dot_vec2(self, xp, dtype):
@@ -178,9 +172,6 @@ def test_dot_vec3(self, xp, dtype):
         b = testing.shaped_arange((2,), xp, dtype)
         return xp.dot(a, b)
 
-    # TODO: get rid of falls back on NumPy when tensordot
-    # is implemented using OneMKL
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
     def test_transposed_dot(self, xp, dtype):
@@ -188,9 +179,6 @@ def test_transposed_dot(self, xp, dtype):
         b = testing.shaped_arange((2, 3, 4), xp, dtype).transpose(0, 2, 1)
         return xp.dot(a, b)
 
-    # TODO: get rid of falls back on NumPy when tensordot
-    # is implemented using OneMKL
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
     def test_transposed_dot_with_out(self, xp, dtype):
@@ -200,9 +188,6 @@ def test_transposed_dot_with_out(self, xp, dtype):
         xp.dot(a, b, out=c)
         return c
 
-    # TODO: get rid of falls back on NumPy when tensordot
-    # is implemented using OneMKL
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @testing.for_all_dtypes()
     def test_transposed_dot_with_out_f_contiguous(self, dtype):
         for xp in (numpy, cupy):
@@ -307,7 +292,6 @@ def test_multidim_outer(self, xp, dtype):
         b = testing.shaped_arange((4, 5), xp, dtype)
         return xp.outer(a, b)
 
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
     def test_tensordot(self, xp, dtype):
@@ -322,7 +306,6 @@ def test_transposed_tensordot(self, xp, dtype):
         b = testing.shaped_arange((4, 3, 2), xp, dtype).transpose(2, 0, 1)
         return xp.tensordot(a, b)
 
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
     def test_tensordot_with_int_axes(self, xp, dtype):
@@ -352,7 +335,6 @@ def test_transposed_tensordot_with_int_axes(self, xp, dtype):
             )
             return xp.tensordot(a, b, axes=3)
 
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()
     def test_tensordot_with_list_axes(self, xp, dtype):
@@ -433,8 +415,6 @@ def test_zerodim_kron(self, xp, dtype):
         }
     )
 )
-@pytest.mark.usefixtures("allow_fall_back_on_numpy")
-@testing.gpu
 class TestProductZeroLength(unittest.TestCase):
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose()

From a27200b0cee911cc657d3b580c81341f9fed5deb Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Sat, 10 Feb 2024 12:37:52 +0100
Subject: [PATCH 29/29] Updated CHANGELOG.md for 0.14.0 release (#1703)

Co-authored-by: vtavana <120411540+vtavana@users.noreply.github.com>
---
 CHANGELOG.md | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 77 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a15f81807cc..97278d7c719 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,10 +4,85 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [0.13.0] - TBA
+## [0.14.0] - MM/DD/2024
 
 ### Added
 
+* Added implementation of `dpnp.nanmean` and `dpnp.nanstd` functions [#1654](https://github.com/IntelPython/dpnp/pull/1654)
+* Added implementation of `dpnp.angle` function [#1650](https://github.com/IntelPython/dpnp/pull/1650)
+* Added implementation of `dpnp.logsumexp` and `dpnp.reduce_hypot` functions [#1648](https://github.com/IntelPython/dpnp/pull/1648)
+* Added implementation of `dpnp.column_stack`, `dpnp.dstack` and `dpnp.row_stack` functions [#1647](https://github.com/IntelPython/dpnp/pull/1647)
+* Added implementation of `dpnp.nanargmax`, `dpnp.nanargmin`, `dpnp.nanmax` and `dpnp.nanmin` functions [#1646](https://github.com/IntelPython/dpnp/pull/1646)
+* Added implementation of `dpnp.clip` function, available as well as a method of dpnp array [#1645](https://github.com/IntelPython/dpnp/pull/1645)
+* Added implementation of `dpnp.copysign` and `dpnp.rsqrt` functions [#1624](https://github.com/IntelPython/dpnp/pull/1624)
+* Added implementation of `dpnp.linalg.slogdet` function [#1607](https://github.com/IntelPython/dpnp/pull/1607)
+* Added implementation of `dpnp.can_cast` function [#1600](https://github.com/IntelPython/dpnp/pull/1600)
+* Added implementation of `dpnp.linalg.solve` function [#1598](https://github.com/IntelPython/dpnp/pull/1598)
+* Added implementation of `dpnp.broadcast_arrays` function [#1594](https://github.com/IntelPython/dpnp/pull/1594)
+* Added implementation of `dpnp.tile` function [#1586](https://github.com/IntelPython/dpnp/pull/1586)
+* Added implementation of `dpnp.iinfo` and `dpnp.finfo` functions [#1582](https://github.com/IntelPython/dpnp/pull/1582)
+* Added implementation of `dpnp.logaddexp` function [#1561](https://github.com/IntelPython/dpnp/pull/1561)
+* Added implementation of `dpnp.positive` function [#1559](https://github.com/IntelPython/dpnp/pull/1559)
+
+### Changed
+
+* Enabled compatibility support against numpy `1.26.4` [#1690](https://github.com/IntelPython/dpnp/pull/1690)
+* Implemented `dpnp.true_divide` as an alias of the `dpnp.divide` function [#1641](https://github.com/IntelPython/dpnp/pull/1641)
+* Added support for more data types and dimensions for input array in `dpnp.vdot` function [#1692](https://github.com/IntelPython/dpnp/pull/1692)
+* Added support for more data types and dimensions for input array in `dpnp.linalg.qr` function [#1673](https://github.com/IntelPython/dpnp/pull/1673)
+* Added support for more data types and dimensions for input array in `dpnp.dot` function [#1669](https://github.com/IntelPython/dpnp/pull/1669)
+* Added support for more data types and dimensions for input array in `dpnp.linalg.inv` function [#1665](https://github.com/IntelPython/dpnp/pull/1665)
+* Added support for more data types for input array in `dpnp.sort` and `dpnp.argsort` functions, as well as implementing support of `axis` keyword [#1660](https://github.com/IntelPython/dpnp/pull/1660)
+* Added support for more data types and dimensions for input array in `dpnp.linalg.cholesky` function, as well as implementing support of `upper` keyword [#1638](https://github.com/IntelPython/dpnp/pull/1638)
+* Added support for more data types and dimensions for input array in `dpnp.diff`, as well as implementing support of `prepend` and `append` keywords [#1637](https://github.com/IntelPython/dpnp/pull/1637)
+* Added support for more data types and dimensions for input array in `dpnp.matmul` function [#1616](https://github.com/IntelPython/dpnp/pull/1616)
+* Added support for more data types and dimensions for input array in `dpnp.linalg.det` function [#1607](https://github.com/IntelPython/dpnp/pull/1607)
+* Added support for more data types and dimensions for input array in `dpnp.linalg.svd` function, as well as implementing support of `full_matrices`, `compute_uv` and `hermitian` keywords [#1604](https://github.com/IntelPython/dpnp/pull/1604)
+* Accepted different data types and dimensions of input arrays in `dpnp.put_along_axis` and `dpnp.take_along_axis` functions, as well as available values of `axis` keyword [#1636](https://github.com/IntelPython/dpnp/pull/1636)
+* Added `keepdims`, `initial` and `where` keywords to `dpnp.amax` and `dpnp.amin` functions [#1639](https://github.com/IntelPython/dpnp/pull/1639)
+* Extended `dpnp.meshgrid` function to support `sparse` and `copy` keyword arguments [#1675](https://github.com/IntelPython/dpnp/pull/1675)
+* Extended `dpnp.average` function to support `axis`, `weights`, `returned` and `keepdims` keywords and `dpnp.nansum` function with `axis`, `dtype`, `keepdims` and `out` keyword arguments [#1654](https://github.com/IntelPython/dpnp/pull/1654)
+* Extended `dpnp.std`, `dpnp.var` and `dpnp.nanvar` functions to support `axis`, `dtype`, `out` and `keepdims` keyword arguments [#1635](https://github.com/IntelPython/dpnp/pull/1635)
+* Extended `dpnp.ogrid` and `dpnp.mgrid` functions with support of device-aware keywords of compute follows data paradigm [#1622](https://github.com/IntelPython/dpnp/pull/1622)
+* Extended `dpnp.indices` function to support `dtype` and `sparse` keyword arguments, as well as device-aware keywords of compute follows data paradigm [#1622](https://github.com/IntelPython/dpnp/pull/1622)
+* Extended `dpnp.count_nonzero` function to support `axis` and `keepdims` keyword arguments [#1615](https://github.com/IntelPython/dpnp/pull/1615)
+* Extended `dpnp.put_along_axis` and `dpnp.take_along_axis` functions to support `out`, `dtype` and `casting` keyword arguments [#1608](https://github.com/IntelPython/dpnp/pull/1608)
+* Extended `dpnp.stack` and `dpnp.concatenate` functions to support `out`, `dtype` and `casting` keyword arguments [#1608](https://github.com/IntelPython/dpnp/pull/1608)
+* Extended `dpnp.vstack` function to support `dtype` and `casting` keyword arguments [#1595](https://github.com/IntelPython/dpnp/pull/1595)
+* Extended `dpnp.diag`, `dpnp.diagflat`, `dpnp.ptp` and `dpnp.vander` functions with support of extra keywords to align with compute follows data paradigm [#1579](https://github.com/IntelPython/dpnp/pull/1579)
+* Extended `dpnp.tri` and `dpnp.identity` functions with support of device-aware keywords of compute follows data paradigm [#1577](https://github.com/IntelPython/dpnp/pull/1577)
+* Added dedicated in-place kernels to `dpnp.divide` and `dpnp.floor_divide` functions [#1587](https://github.com/IntelPython/dpnp/pull/1587)
+* Redesigned `dpnp.cbrt` and `dpnp.exp2` functions through pybind11 extension of OneMKL call where possible or leveraging on `dpctl.tensor` implementation [#1624](https://github.com/IntelPython/dpnp/pull/1624)
+* Redesigned `dpnp.exp`, `dpnp.expm1`, `dpnp.log10`, `dpnp.log1p` and `dpnp.log2` functions through pybind11 extension of OneMKL call where possible or leveraging on `dpctl.tensor` implementation [#1576](https://github.com/IntelPython/dpnp/pull/1576)
+* Redesigned `dpnp.abs` function through pybind11 extension of OneMKL call where possible or leveraging on `dpctl.tensor` implementation [#1575](https://github.com/IntelPython/dpnp/pull/1575)
+* Redesigned `dpnp.hypot` function through pybind11 extension of OneMKL call where possible or leveraging on `dpctl.tensor` implementation [#1560](https://github.com/IntelPython/dpnp/pull/1560)
+* Leveraged `dpctl.tensor` implementation for `dpnp.reciprocal` function [#1650](https://github.com/IntelPython/dpnp/pull/1650)
+* Leveraged `dpctl.tensor` implementation for `dpnp.mean` function [#1632](https://github.com/IntelPython/dpnp/pull/1632)
+* Leveraged `dpctl.tensor` implementation for `dpnp.repeat` function [#1614](https://github.com/IntelPython/dpnp/pull/1614)
+* Leveraged `dpctl.tensor` implementation for `dpnp.argmax` and `dpnp.argmin` functions [#1610](https://github.com/IntelPython/dpnp/pull/1610)
+* Leveraged `dpctl.tensor` implementation for `dpnp.geomspace` and `dpnp.logspace` functions [#1603](https://github.com/IntelPython/dpnp/pull/1603)
+* Leveraged `dpctl.tensor` implementation for `dpnp.max` and `dpnp.min` functions [#1602](https://github.com/IntelPython/dpnp/pull/1602)
+* Leveraged `dpctl.tensor` implementation for `dpnp.astype` function [#1597](https://github.com/IntelPython/dpnp/pull/1597)
+* Leveraged `dpctl.tensor` implementation for `dpnp.maximum` and `dpnp.minimum` functions [#1558](https://github.com/IntelPython/dpnp/pull/1558)
+
+### Fixed
+
+* Resolved potential raising of execution placement error from `dpnp.take_along_axis` and `dpnp.put_along_axis` functions [#1702](https://github.com/IntelPython/dpnp/pull/1702)
+* Improved performance of `dpnp.matmul` and `dpnp.dot` functions when `out` keyword is passed [#1694](https://github.com/IntelPython/dpnp/pull/1694)
+* Completed documentation for all array creation functions [#1674](https://github.com/IntelPython/dpnp/pull/1674)
+* Aligned `dpnp.clip` where both `min` and `max` keywords have `None` value with NumPy implementation [#1670](https://github.com/IntelPython/dpnp/pull/1670)
+* Fixed a bug related to `out` keyword in elementwise functions [#1656](https://github.com/IntelPython/dpnp/pull/1656)
+* Resolved compilation warnings due to `-Wvla-extension` option enabled by default [#1651](https://github.com/IntelPython/dpnp/pull/1651)
+* Replaced deprecated `IntelDPCPPConfig.cmake` script with vendored `IntelSYCLConfig.cmake` [#1611](https://github.com/IntelPython/dpnp/pull/1611)
+* Improved coverage report to include code of pybind11 extensions [#1609](https://github.com/IntelPython/dpnp/pull/1609)
+* Improved performance of `dpnp.atleast_2d` and `dpnp.atleast_3d` functions and fixed to return a correct shape of resulting array [#1560](https://github.com/IntelPython/dpnp/pull/1560)
+
+
+## [0.13.0] - 09/29/2023
+
+### Added
+
+* Added implementation of `dpnp.imag` and `dpnp.real` functions, as well as the corresponding properties and setters of dpnp array [#1557](https://github.com/IntelPython/dpnp/pull/1557)
 * Added implementation of flipping functions: `dpnp.flip`, `dpnp.fliplr` and `dpnp.flipud` [#1543](https://github.com/IntelPython/dpnp/pull/1543)
 * Added implementation of `dpnp.rint` function through `dpnp.round` call [#1537](https://github.com/IntelPython/dpnp/pull/1537)
 * Added in-place support for arithmetic operators [#1530](https://github.com/IntelPython/dpnp/pull/1530)
@@ -64,6 +139,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * Resolved issues with running random functions on a device without fp64 support [#1498](https://github.com/IntelPython/dpnp/pull/1498)
 * Resolved issues with running statistics functions on a device without fp64 support [#1494](https://github.com/IntelPython/dpnp/pull/1494)
 
+
 ## [0.12.1] - 07/18/2023
 
 ### Added