diff --git a/README.md b/README.md index 8dce23c..2d091c6 100644 --- a/README.md +++ b/README.md @@ -2,24 +2,24 @@ GPU Eigensolver for Quantum ESPRESSO package ### -This library implements a generalized eigensolver for hermetian-definite eigenproblems with functionality similar to -the ZHEGVD/X functions available within LAPACK/MAGMA. This solver has less dependencies on CPU computation +This library implements a generalized eigensolver for symmetric/hermetian-definite eigenproblems with functionality similar to +the DSYGVD/X or ZHEGVD/X functions available within LAPACK/MAGMA. This solver has less dependencies on CPU computation than comparable implementations within MAGMA, which may be of benefit to systems with limited CPU resources or to users without access to high-performing CPU LAPACK libraries. ### This implementation can be considered as a "proof of concept" and has been written to target the Quantum ESPRESSO -code. As such, this implementation is built only to handle one problem configuration of ZHEGVD/X. Specifically, this +code. As such, this implementation is built only to handle one problem configuration of DSYGVD/X or ZHEGVD/X. Specifically, this solver computes eigenvalues and associated eigenvectors over a specified integer range for a -hermetian-definite eigenproblem in the following form: +symmetric/hermetian-definite eigenproblem in the following form: A * x = lambda * B * x -where `A` and `B` are hermetian-matrices and `B` is positive definite. The solver expects the upper-triangular parts of the -input `A` and `B` arguments to be populated. This configuration corresponds to calling ZHEGVX within LAPACK with the configuration +where `A` and `B` are symmetric/hermetian-matrices and `B` is positive definite. The solver expects the upper-triangular parts of the +input `A` and `B` arguments to be populated. This configuration corresponds to calling DSYGVX/ZHEGVX within LAPACK with the configuration arguments `ITYPE = 1`, `JOBZ = 'V'`, `RANGE = 'I'`, and `UPLO = 'U'`. -See comments within `zhegvdx_gpu.F90` for specific details on usage. +See comments within `dsygvdx_gpu.F90` or `zhegvdx_gpu.F90` for specific details on usage. For additional information about the solver with some performance results, see presentation at the following link: (will be added once available publically on the GTC On-Demand website) diff --git a/lib_eigsolve/dsygvdx_gpu.F90 b/lib_eigsolve/dsygvdx_gpu.F90 index d432578..e38838e 100644 --- a/lib_eigsolve/dsygvdx_gpu.F90 +++ b/lib_eigsolve/dsygvdx_gpu.F90 @@ -48,6 +48,8 @@ module dsygvdx_gpu ! - lwork_h is an integer specifying length of work_h. lwork_h >= 1 + 6*N + 2*N*N ! - iwork_h is a integer array for integer workspace of length liwork_h. ! - liwork_h is an integer specifying length of iwork_h. liwork_h >= 3 + 5*N + ! - (optional) _skip_host_copy is an optional logical argument. If .TRUE., memcopy of final updated eigenvectors from + ! device to host will be skipped. ! ! Output: ! On device: @@ -61,13 +63,13 @@ module dsygvdx_gpu ! On host: ! - Z_h(ldz, iu - il + 1) is a real(8) matrix on the host. On exit, the first iu - il + 1 columns of Z ! contains normalized eigenvectors corresponding to eigenvalues in the range [il, iu]. This is a copy of the Z - ! matrix on the device + ! matrix on the device. This is only true if optional argument _skip_host_copy is not provided or is set to .FALSE. ! - w(iu - il + 1) is a real(8) array on the host. On exit, the first iu - il + 1 values of w contain the computed ! eigenvalues. This is a copy of the w array on the host. ! - info is an integer. info will equal zero if the function completes succesfully. Otherwise, there was an error. ! subroutine dsygvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, & - work_h, lwork_h, iwork_h, liwork_h, Z_h, ldz_h, w_h, info) + work_h, lwork_h, iwork_h, liwork_h, Z_h, ldz_h, w_h, info, _skip_host_copy) use eigsolve_vars use nvtx_inters use dsygst_gpu @@ -78,6 +80,7 @@ subroutine dsygvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, & real(8), dimension(1:lwork), device :: work real(8), dimension(1:lwork_h), pinned :: work_h integer, dimension(1:liwork_h), pinned :: iwork_h + logical, optional :: _skip_host_copy real(8), dimension(1:lda, 1:N), device :: A real(8), dimension(1:ldb, 1:N), device :: B @@ -88,8 +91,11 @@ subroutine dsygvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, & real(8), parameter :: one = 1.d0 integer :: i, j + logical :: skip_host_copy info = 0 + skip_host_copy = .FALSE. + if(present(_skip_host_copy)) skip_host_copy = _skip_host_copy ! Check workspace sizes if (lwork < 2*64*64 + 66*N) then @@ -150,11 +156,13 @@ subroutine dsygvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, & call nvtxEndRange ! Copy final eigenvectors to host - istat = cudaMemcpy2D(Z_h, ldz_h, Z, ldz, N, m) - if (istat .ne. 0) then - print*, "dsygvdx_gpu error: cudaMemcpy2D failed!" - info = -1 - return + if (not(skip_host_copy)) then + istat = cudaMemcpy2D(Z_h, ldz_h, Z, ldz, N, m) + if (istat .ne. 0) then + print*, "dsygvdx_gpu error: cudaMemcpy2D failed!" + info = -1 + return + endif endif end subroutine dsygvdx_gpu diff --git a/lib_eigsolve/zhegvdx_gpu.F90 b/lib_eigsolve/zhegvdx_gpu.F90 index eecc1c0..f8a4edd 100644 --- a/lib_eigsolve/zhegvdx_gpu.F90 +++ b/lib_eigsolve/zhegvdx_gpu.F90 @@ -52,6 +52,8 @@ module zhegvdx_gpu ! - lrwork_h is an integer specifying length of rwork_h. lrwork_h >= 1 + 5*N + 2*N*N ! - iwork_h is a integer array for integer workspace of length liwork_h. ! - liwork_h is an integer specifying length of iwork_h. liwork_h >= 3 + 5*N + ! - (optional) _skip_host_copy is an optional logical argument. If .TRUE., memcopy of final updated eigenvectors from + ! device to host will be skipped. ! ! Output: ! On device: @@ -65,13 +67,13 @@ module zhegvdx_gpu ! On host: ! - Z_h(ldz, iu - il + 1) is a double-complex matrix on the host. On exit, the first iu - il + 1 columns of Z ! contains normalized eigenvectors corresponding to eigenvalues in the range [il, iu]. This is a copy of the Z - ! matrix on the device + ! matrix on the device. This is only true if optional argument _skip_host_copy is not provided or is set to .FALSE. ! - w(iu - il + 1) is a real(8) array on the host. On exit, the first iu - il + 1 values of w contain the computed ! eigenvalues. This is a copy of the w array on the host. ! - info is an integer. info will equal zero if the function completes succesfully. Otherwise, there was an error. ! subroutine zhegvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, rwork, lrwork, & - work_h, lwork_h, rwork_h, lrwork_h, iwork_h, liwork_h, Z_h, ldz_h, w_h, info) + work_h, lwork_h, rwork_h, lrwork_h, iwork_h, liwork_h, Z_h, ldz_h, w_h, info, _skip_host_copy) use eigsolve_vars use nvtx_inters use zhegst_gpu @@ -84,6 +86,7 @@ subroutine zhegvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, rwork, complex(8), dimension(1:lwork), device :: work complex(8), dimension(1:lwork_h), pinned :: work_h integer, dimension(1:liwork_h), pinned :: iwork_h + logical, optional :: _skip_host_copy complex(8), dimension(1:lda, 1:N), device :: A complex(8), dimension(1:ldb, 1:N), device :: B @@ -94,8 +97,11 @@ subroutine zhegvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, rwork, complex(8), parameter :: cone = cmplx(1,0,8) integer :: i, j + logical :: skip_host_copy info = 0 + skip_host_copy = .FALSE. + if(present(_skip_host_copy)) skip_host_copy = _skip_host_copy ! Check workspace sizes if (lwork < 2*64*64 + 65*N) then @@ -164,11 +170,13 @@ subroutine zhegvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, rwork, call nvtxEndRange ! Copy final eigenvectors to host - istat = cudaMemcpy2D(Z_h, ldz_h, Z, ldz, N, m) - if (istat .ne. 0) then - print*, "zhegvdx_gpu error: cudaMemcpy2D failed!" - info = -1 - return + if (not(skip_host_copy)) then + istat = cudaMemcpy2D(Z_h, ldz_h, Z, ldz, N, m) + if (istat .ne. 0) then + print*, "zhegvdx_gpu error: cudaMemcpy2D failed!" + info = -1 + return + endif endif end subroutine zhegvdx_gpu