Adding ability to skip final host memcopy of eigenvectors. Updating R…

…EADME.md to include reference to real eigensolver.
NVIDIA · Oct 3, 2017 · d835843 · d835843
1 parent 27af9bd
commit d835843
Show file tree

Hide file tree

Showing 3 changed files with 37 additions and 21 deletions.
diff --git a/README.md b/README.md
@@ -2,24 +2,24 @@
 GPU Eigensolver for Quantum ESPRESSO package
 
 ###
-This library implements a generalized eigensolver for hermetian-definite eigenproblems with functionality similar to
-the ZHEGVD/X functions available within LAPACK/MAGMA. This solver has less dependencies on CPU computation 
+This library implements a generalized eigensolver for symmetric/hermetian-definite eigenproblems with functionality similar to
+the DSYGVD/X or ZHEGVD/X functions available within LAPACK/MAGMA. This solver has less dependencies on CPU computation 
 than comparable implementations within MAGMA, which may be of benefit to systems with limited CPU resources or to 
 users without access to high-performing CPU LAPACK libraries. 
 
 ###
 This implementation can be considered as a "proof of concept" and has been written to target the Quantum ESPRESSO
-code. As such, this implementation is built only to handle one problem configuration of ZHEGVD/X. Specifically, this
+code. As such, this implementation is built only to handle one problem configuration of DSYGVD/X or ZHEGVD/X. Specifically, this
 solver computes eigenvalues and associated eigenvectors over a specified integer range for a 
-hermetian-definite eigenproblem in the following form: 
+symmetric/hermetian-definite eigenproblem in the following form: 
 
 	A * x = lambda * B * x
 
-where `A` and `B` are hermetian-matrices and `B` is positive definite. The solver expects the upper-triangular parts of the 
-input `A` and `B` arguments to be populated. This configuration corresponds to calling ZHEGVX within LAPACK with the configuration 
+where `A` and `B` are symmetric/hermetian-matrices and `B` is positive definite. The solver expects the upper-triangular parts of the 
+input `A` and `B` arguments to be populated. This configuration corresponds to calling DSYGVX/ZHEGVX within LAPACK with the configuration 
 arguments `ITYPE = 1`, `JOBZ = 'V'`, `RANGE = 'I'`, and `UPLO = 'U'`. 
 
-See comments within `zhegvdx_gpu.F90` for specific details on usage.
+See comments within `dsygvdx_gpu.F90` or `zhegvdx_gpu.F90` for specific details on usage.
 
 For additional information about the solver with some performance results, see presentation at the following link: (will be added
 once available publically on the GTC On-Demand website)

diff --git a/lib_eigsolve/dsygvdx_gpu.F90 b/lib_eigsolve/dsygvdx_gpu.F90
@@ -48,6 +48,8 @@ module dsygvdx_gpu
     !   -  lwork_h is an integer specifying length of work_h. lwork_h >= 1 + 6*N + 2*N*N
     !   -  iwork_h is a integer array for integer workspace of length liwork_h. 
     !   -  liwork_h is an integer specifying length of iwork_h. liwork_h >= 3 + 5*N
+    !   -  (optional) _skip_host_copy is an optional logical argument. If .TRUE., memcopy of final updated eigenvectors from 
+    !      device to host will be skipped.
     !
     ! Output:
     ! On device:
@@ -61,13 +63,13 @@ module dsygvdx_gpu
     ! On host:
     !   - Z_h(ldz, iu - il + 1) is a real(8) matrix on the host. On exit, the first iu - il + 1 columns of Z
     !     contains normalized eigenvectors corresponding to eigenvalues in the range [il, iu]. This is a copy of the Z
-    !     matrix on the device
+    !     matrix on the device. This is only true if optional argument _skip_host_copy is not provided or is set to .FALSE.
     !   - w(iu - il + 1) is a real(8) array on the host. On exit, the first iu - il + 1 values of w contain the computed
     !     eigenvalues. This is a copy of the w array on the host.
     !   - info is an integer. info will equal zero if the function completes succesfully. Otherwise, there was an error.
     !
     subroutine dsygvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, &
-                          work_h, lwork_h, iwork_h, liwork_h, Z_h, ldz_h, w_h, info)
+                          work_h, lwork_h, iwork_h, liwork_h, Z_h, ldz_h, w_h, info, _skip_host_copy)
       use eigsolve_vars
       use nvtx_inters
       use dsygst_gpu
@@ -78,6 +80,7 @@ subroutine dsygvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, &
       real(8), dimension(1:lwork), device         :: work
       real(8), dimension(1:lwork_h), pinned       :: work_h
       integer, dimension(1:liwork_h), pinned      :: iwork_h
+      logical, optional                           :: _skip_host_copy
 
       real(8), dimension(1:lda, 1:N), device      :: A
       real(8), dimension(1:ldb, 1:N), device      :: B
@@ -88,8 +91,11 @@ subroutine dsygvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, &
 
       real(8), parameter :: one = 1.d0
       integer :: i, j
+      logical :: skip_host_copy
 
       info = 0
+      skip_host_copy = .FALSE.
+      if(present(_skip_host_copy)) skip_host_copy = _skip_host_copy
 
       ! Check workspace sizes
       if (lwork < 2*64*64 + 66*N) then
@@ -150,11 +156,13 @@ subroutine dsygvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, &
       call nvtxEndRange
 
       ! Copy final eigenvectors to host
-      istat = cudaMemcpy2D(Z_h, ldz_h, Z, ldz, N, m)
-      if (istat .ne. 0) then
-        print*, "dsygvdx_gpu error: cudaMemcpy2D failed!"
-        info = -1
-        return
+      if (not(skip_host_copy)) then
+        istat = cudaMemcpy2D(Z_h, ldz_h, Z, ldz, N, m)
+        if (istat .ne. 0) then
+          print*, "dsygvdx_gpu error: cudaMemcpy2D failed!"
+          info = -1
+          return
+        endif
       endif
 
     end subroutine dsygvdx_gpu

diff --git a/lib_eigsolve/zhegvdx_gpu.F90 b/lib_eigsolve/zhegvdx_gpu.F90
@@ -52,6 +52,8 @@ module zhegvdx_gpu
     !   -  lrwork_h is an integer specifying length of rwork_h. lrwork_h >= 1 + 5*N + 2*N*N
     !   -  iwork_h is a integer array for integer workspace of length liwork_h. 
     !   -  liwork_h is an integer specifying length of iwork_h. liwork_h >= 3 + 5*N
+    !   -  (optional) _skip_host_copy is an optional logical argument. If .TRUE., memcopy of final updated eigenvectors from 
+    !      device to host will be skipped.
     !
     ! Output:
     ! On device:
@@ -65,13 +67,13 @@ module zhegvdx_gpu
     ! On host:
     !   - Z_h(ldz, iu - il + 1) is a double-complex matrix on the host. On exit, the first iu - il + 1 columns of Z
     !     contains normalized eigenvectors corresponding to eigenvalues in the range [il, iu]. This is a copy of the Z
-    !     matrix on the device
+    !     matrix on the device. This is only true if optional argument _skip_host_copy is not provided or is set to .FALSE.
     !   - w(iu - il + 1) is a real(8) array on the host. On exit, the first iu - il + 1 values of w contain the computed
     !     eigenvalues. This is a copy of the w array on the host.
     !   - info is an integer. info will equal zero if the function completes succesfully. Otherwise, there was an error.
     !
     subroutine zhegvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, rwork, lrwork, &
-                          work_h, lwork_h, rwork_h, lrwork_h, iwork_h, liwork_h, Z_h, ldz_h, w_h, info)
+                          work_h, lwork_h, rwork_h, lrwork_h, iwork_h, liwork_h, Z_h, ldz_h, w_h, info, _skip_host_copy)
       use eigsolve_vars
       use nvtx_inters
       use zhegst_gpu
@@ -84,6 +86,7 @@ subroutine zhegvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, rwork,
       complex(8), dimension(1:lwork), device      :: work
       complex(8), dimension(1:lwork_h), pinned    :: work_h
       integer, dimension(1:liwork_h), pinned      :: iwork_h
+      logical, optional                           :: _skip_host_copy
 
       complex(8), dimension(1:lda, 1:N), device   :: A
       complex(8), dimension(1:ldb, 1:N), device   :: B
@@ -94,8 +97,11 @@ subroutine zhegvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, rwork,
 
       complex(8), parameter :: cone = cmplx(1,0,8)
       integer :: i, j
+      logical :: skip_host_copy
 
       info = 0
+      skip_host_copy = .FALSE.
+      if(present(_skip_host_copy)) skip_host_copy = _skip_host_copy
 
       ! Check workspace sizes
       if (lwork < 2*64*64 + 65*N) then
@@ -164,11 +170,13 @@ subroutine zhegvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, rwork,
       call nvtxEndRange
 
       ! Copy final eigenvectors to host
-      istat = cudaMemcpy2D(Z_h, ldz_h, Z, ldz, N, m)
-      if (istat .ne. 0) then
-        print*, "zhegvdx_gpu error: cudaMemcpy2D failed!"
-        info = -1
-        return
+      if (not(skip_host_copy)) then
+        istat = cudaMemcpy2D(Z_h, ldz_h, Z, ldz, N, m)
+        if (istat .ne. 0) then
+          print*, "zhegvdx_gpu error: cudaMemcpy2D failed!"
+          info = -1
+          return
+        endif
       endif
 
     end subroutine zhegvdx_gpu