Make LOBPCG GPU-compatible #711

Merged (19 commits, Sep 28, 2022)
Changes from 8 commits
4 changes: 2 additions & 2 deletions Project.toml
@@ -6,14 +6,15 @@ version = "0.5.8"
[deps]
AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
Brillouin = "23470ee3-d0df-4052-8b1a-8cbd6363e7f0"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Collaborator: Would be good if we can avoid this direct dependency.

Contributor (author): Don't we need it because of the functions in workarounds/gpu_arrays.jl? For the eigen workaround, for example, we explicitly use CUDA functions, so if I understand correctly we need to have CUDA as a dependency for DFTK. As soon as the workarounds are implemented in CUDA, we could remove this.

(A sketch of an optional-dependency alternative follows this file's diff.)

ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
DftFunctionals = "6bd331d2-b28d-4fd3-880e-1a1c7f37947f"
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
InteratomicPotentials = "a9efe35a-c65d-452d-b8a8-82646cd5cb04"
Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
@@ -48,7 +49,6 @@ spglib_jll = "ac4a9f1e-bdb2-5204-990c-47c8b2f70d4e"
[compat]
AbstractFFTs = "1"
AtomsBase = "0.2.2"
BlockArrays = "0.16.2"
Brillouin = "0.5"
ChainRulesCore = "1.15"
Conda = "1"
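One way to avoid the direct dependency discussed in the thread above would be to load the GPU workarounds lazily via Requires.jl. This is only a hypothetical sketch, not something this PR does; the UUID is CUDA.jl's, as listed in [deps] above.

# Hypothetical sketch (not part of this PR): make CUDA an optional dependency.
using Requires

function __init__()
    # Only load the CUDA-specific workarounds once the user loads CUDA.jl.
    @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" include("workarounds/gpu_arrays.jl")
end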
152 changes: 116 additions & 36 deletions src/eigen/lobpcg_hyper_impl.jl
@@ -43,19 +43,96 @@
vprintln(args...) = nothing

using LinearAlgebra
using BlockArrays # used for the `mortar` command which makes block matrices
import Base: *
import Base.size, Base.adjoint, Base.Array

# when X or Y are BlockArrays, this makes the return value be a proper array (not a BlockArray)
function array_mul(X::AbstractArray{T}, Y) where {T}
Z = Array{T}(undef, size(X, 1), size(Y, 2))
mul!(Z, X, Y)
include("../workarounds/gpu_arrays.jl")

# For now, BlockMatrix can store arrays of different types (for example, a view
# and a Matrix). Maybe for performance reasons it should only store arrays of
# the same type?
Member: I think that's fine, why would it be an issue?

Contributor (author): I don't really know how type conversion can affect the computations. The way I see it, we can either code the struct like a Tuple (can hold different types of arrays, possibly with different types of elements in them) or like a Vector (everything gets converted to a common type). I was wondering if mixing all the types together (i.e. coding the struct like a Tuple) wouldn't induce computational errors, as we keep converting things back and forth.

Member: There's no conversion at all here. You just use the tuple elements in mul! calls, which dispatch to the correct method. I don't think there's any need to change what you have now. (A small sketch of this dispatch behaviour follows.)
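An illustrative sketch of the dispatch behaviour described above (not part of the diff): iterating a Tuple preserves each block's concrete type, so every mul! call specializes on that type and no element-type conversion takes place.

using LinearAlgebra

M = rand(10, 4)
blocks = (view(M, :, 1:2), rand(10, 3))   # a SubArray and a Matrix in one Tuple
ret = zeros(5, 5)
# each call dispatches on the concrete block type; nothing is converted
mul!(view(ret, 1:2, 1:2), blocks[1]', blocks[1])
mul!(view(ret, 3:5, 3:5), blocks[2]', blocks[2])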


struct BlockMatrix{T <: Number, D <: Tuple} <: AbstractMatrix{T}
Member: Document it here instead of at the constructor.

Member: In particular it would be nice if it were clearer that these are column blocks, but I don't find a good name, so just make it clear from a comment that it is a horizontal concatenation of blocks.

Member: Note that these are relatively simple wrappers only supporting some multiplication routines, and in particular operations with them materialize to plain arrays.

Member: Also explain the relationship to BlockArrays (it's a lightweight subset of functionality, it materializes to plain arrays, it supports GPU).

Member: Re name: LazyHcat? (because it's basically equivalent to hcat but better optimized)

Contributor (author): I think LazyHcat is not too bad: it's clearer to have "hcat" than just the generic "block", which doesn't give a lot of information. And we probably never will have to implement a true block-matrix structure for LOBPCG. (A usage sketch follows the struct definition.)

blocks::D
end
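As a usage sketch (assuming the DFTK-internal names defined in this diff): the struct is a lazy horizontal concatenation, so materializing it with Array recovers plain hcat.

A1, A2 = rand(10, 3), rand(10, 2)
B = DFTK.BlockMatrix(A1, A2)     # stores the blocks, never copies them
size(B)                          # (10, 5)
Array(B) == hcat(A1, A2)         # true: materialization concatenates the blocks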

"""
Build a BlockMatrix containing the given arrays, from left to right.
This function will fail (for now) if:
-the arrays do not all have the same "height" (ie size[1] must match).
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're beginning an enumeration with one item

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

height -> number of rows

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I rewrote the documentation, all of this has been summed up when defining the struct.

"""
function BlockMatrix(arrays::AbstractArray...)
length(arrays) == 0 && error("Empty BlockMatrix is not currently implemented")
n_ref = size(arrays[1], 1)
for array in arrays
n_i = size(array, 1)
n_ref != n_i && error("The given arrays do not have matching 'height': "*
"cannot build a BlockMatrix out of them.")
end

Member: this is very internal, no need to have a nice error message, just say @assert n_ref == n_i

Member: or just @assert all(size.(arrays, 1) .== n_ref)

T = promote_type(map(eltype, arrays)...)

BlockMatrix{T, typeof(arrays)}(arrays)
end

function Base.size(A::BlockMatrix)
n = size(A.blocks[1], 1)
m = sum(size(block, 2) for block in A.blocks)
(n, m)
end

Base.Array(A::BlockMatrix) = hcat(A.blocks...)

Base.adjoint(A::BlockMatrix) = Adjoint(A)

"""
Given A and B as two BlockMatrices [A1, A2, A3], [B1, B2, B3] form the matrix
A'B. Return an array, not a BlockMatrix.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove docstring (* does what * does)

"""
@views function Base.:*(Aadj::Adjoint{T, <: BlockMatrix}, B::BlockMatrix) where {T}
A = Aadj.parent
rows = size(A)[2]
cols = size(B)[2]
ret = similar(A.blocks[1], rows, cols)

orow = 0 # row offset
for (iA, blA) in enumerate(A.blocks)
ocol = 0 # column offset
for (iB, blB) in enumerate(B.blocks)
ret[orow .+ (1:size(blA, 2)), ocol .+ (1:size(blB, 2))] = blA' * blB

Member: use .= for this type of stuff (possibly equivalent here, but better practice for this type of operations)

ocol += size(blB, 2)
end
orow += size(blA, 2)
end
ret
end

Base.:*(Aadj::Adjoint{T, <: BlockMatrix}, B::AbstractMatrix) where {T} = Aadj * BlockMatrix(B)
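For illustration (a sketch, not part of the diff): the nested loop above fills the result one (row-block, column-block) tile at a time, so tile (i, j) equals Ai' * Bj and the whole product agrees with the dense computation.

A1, A2 = rand(10, 2), rand(10, 3)
B1, B2 = rand(10, 4), rand(10, 1)
Ab, Bb = DFTK.BlockMatrix(A1, A2), DFTK.BlockMatrix(B1, B2)
P = Ab' * Bb                 # 5x5 plain array
P[1:2, 1:4] ≈ A1' * B1       # tile (1, 1)
P[3:5, 5:5] ≈ A2' * B2       # tile (2, 2)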

"""
Given A as a BlockMatrix [A1, A2, A3] and B a Matrix, compute the matrix-matrix product
A * B avoiding a concatenation of A's blocks to a dense array.
"""
Member: remove docstring (see above)

@views function *(Ablock::BlockMatrix, B::AbstractMatrix)
res = Ablock.blocks[1] * B[1:size(Ablock.blocks[1], 2), :] # First multiplication
offset = size(Ablock.blocks[1], 2)
for block in Ablock.blocks[2:end]
mul!(res, block, B[offset .+ (1:size(block, 2)), :], 1, 1)
offset += size(block, 2)
end
res
end

function LinearAlgebra.mul!(res::AbstractMatrix, Ablock::BlockMatrix,
B::AbstractVecOrMat, α::Number, β::Number)
# Has slightly better performance than a naive res = α*A*B + β*res

Member: really? even with dots? I find that surprising

Contributor (author): I did some more testing, and you are right: it is indeed the same when using dots. I removed the comment.

mul!(res, Ablock*B, I, α, β)
end
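For reference, the dotted form discussed in the thread is equivalent here (a sketch; 5-arg mul! computes res = α·(Ablock*B) + β·res):

# at the call site, mul!(res, Ablock*B, I, α, β) is equivalent to:
tmp = Ablock * B
res .= α .* tmp .+ β .* res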

# Perform a Rayleigh-Ritz for the N first eigenvectors.
@timing function rayleigh_ritz(X, AX, N)
XAX = array_mul(X', AX)
@assert all(!isnan, XAX)
F = eigen(Hermitian(XAX))
@timing function rayleigh_ritz(X::BlockMatrix, AX::BlockMatrix, N)
# Multiplying two BlockMatrices yields an array, not a BlockMatrix

Member: remove comment, remove typing of variables

F = eigen(Hermitian(X' * AX))
F.vectors[:,1:N], F.values[1:N]
end
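For background (an illustrative sketch with plain arrays, not part of the diff): Rayleigh-Ritz projects A onto the trial subspace spanned by X and diagonalizes the small projected matrix; the lowest Ritz pairs approximate the lowest eigenpairs of A.

using LinearAlgebra
n, m, N = 100, 8, 3
A = Symmetric(randn(n, n))
X = Matrix(qr(randn(n, m)).Q)       # orthonormal trial subspace, n x m
F = eigen(Hermitian(X' * (A * X)))  # small m x m projected problem
cX, λ = F.vectors[:, 1:N], F.values[1:N]
ritz_vectors = X * cX               # approximations to eigenvectors of A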

@@ -174,16 +251,16 @@ end
niter = 1
ninners = zeros(Int,0)
while true
BYX = BY'X
# XXX the one(T) instead of plain old 1 is because of https://github.com/JuliaArrays/BlockArrays.jl/issues/176
mul!(X, Y, BYX, -one(T), one(T)) # X -= Y*BY'X
BYX = BY' * X
mul!(X, Y, BYX, -one(T), one(T)) # X -= Y*BY'X
# If the orthogonalization has produced results below 2eps, we drop them
# This is to be able to orthogonalize eg [1;0] against [e^iθ;0],
# as can happen in extreme cases in the ortho!(cP, cX)
dropped = drop!(X)
if dropped != []
@views mul!(X[:, dropped], Y, BY' * (X[:, dropped]), -one(T), one(T)) # X -= Y*BY'X
X[:, dropped] .-= Y * (BY' * X[:, dropped]) # X = X - Y*BY'*X

Member: remove comment

Member: technically here we should probably do as above, but who cares

end

if norm(BYX) < tol && niter > 1
push!(ninners, 0)
break
@@ -219,11 +296,9 @@ function final_retval(X, AX, resid_history, niter, n_matvec)
residuals = AX .- X*Diagonal(λ)
(λ=λ, X=X,
residual_norms=[norm(residuals[:, i]) for i in 1:size(residuals, 2)],
residual_history=resid_history[:, 1:niter+1],
n_matvec=n_matvec)
residual_history=resid_history[:, 1:niter+1], n_matvec=n_matvec)
end


### The algorithm is Xn+1 = rayleigh_ritz(hcat(Xn, A*Xn, Xn-Xn-1))
### We follow the strategy of Hetmaniuk and Lehoucq, and maintain a B-orthonormal basis Y = (X,R,P)
### After each rayleigh_ritz step, the B-orthonormal X and P are deduced by an orthogonal rotation from Y
@@ -234,6 +309,7 @@ end
miniter=1, ortho_tol=2eps(real(eltype(X))),
n_conv_check=nothing, display_progress=false)
N, M = size(X)

# If N is too small, we will likely get in trouble
error_message(verb) = "The eigenproblem is too small, and the iterative " *
"eigensolver $verb fail; increase the number of " *
@@ -252,7 +328,7 @@ end
B_ortho!(X, BX)
end

n_matvec = M # Count number of matrix-vector products
n_matvec = M # Count number of matrix-vector products
AX = similar(X)
AX = mul!(AX, A, X)
@assert all(!isnan, AX)
@@ -275,6 +351,7 @@ end
nlocked = 0
niter = 0 # the first iteration is fake
λs = @views [(X[:,n]'*AX[:,n]) / (X[:,n]'BX[:,n]) for n=1:M]
λs = oftype(X[:,1], λs) # Offload to GPU if needed
new_X = X
new_AX = AX
new_BX = BX
@@ -290,23 +367,23 @@ end

# Form Rayleigh-Ritz subspace
if niter > 1
Y = mortar((X, R, P))
AY = mortar((AX, AR, AP))
BY = mortar((BX, BR, BP)) # data shared with (X, R, P) in non-general case
Y = BlockMatrix(X, R, P)
AY = BlockMatrix(AX, AR, AP)
BY = BlockMatrix(BX, BR, BP) # data shared with (X, R, P) in non-general case
else
Y = mortar((X, R))
AY = mortar((AX, AR))
BY = mortar((BX, BR)) # data shared with (X, R) in non-general case
Y = BlockMatrix(X, R)
AY = BlockMatrix(AX, AR)
BY = BlockMatrix(BX, BR) # data shared with (X, R) in non-general case
end
cX, λs = rayleigh_ritz(Y, AY, M-nlocked)

# Update X. By contrast to some other implementations, we
# wait on updating P because we have to know which vectors
# to lock (and therefore the residuals) before computing P
# only for the unlocked vectors. This results in better convergence.
new_X = array_mul(Y, cX)
new_AX = array_mul(AY, cX) # no accuracy loss, since cX orthogonal
new_BX = (B == I) ? new_X : array_mul(BY, cX)
new_X = Y * cX
new_AX = AY * cX # no accuracy loss, since cX orthogonal
new_BX = (B == I) ? new_X : BY * cX
end

### Compute new residuals
Expand All @@ -320,7 +397,7 @@ end
vprintln(niter, " ", resid_history[:, niter+1])
if precon !== I
@timing "preconditioning" begin
precondprep!(precon, X) # update preconditioner if needed; defaults to noop
precondprep!(precon, X) # update preconditioner if needed; defaults to noop
ldiv!(precon, new_R)
end
end
@@ -360,20 +437,23 @@ end
# orthogonalization, see Hetmaniuk & Lehoucq, and Duersch et. al.
# cP = copy(cX)
# cP[Xn_indices,:] .= 0
e = zeros(eltype(X), size(cX, 1), M - prev_nlocked)
for i in 1:length(Xn_indices)
e[Xn_indices[i], i] = 1
end

lenXn = length(Xn_indices)
e = zero(similar(X, size(cX, 1), M - prev_nlocked))

Member: zero(similar(...)) allocates twice. Should we have a zeros_like function or something, that could fall back to zero(similar(...)) but be better optimized in specific cases?

Member: ping @vchuravy who might have an opinion

Collaborator:

e = similar(X, size(cX, 1), M - prev_nlocked)
e .= 0

But yeah, it's annoying that there is no similar(...; init=0).

Member: Hm. OK, then let's just have our own zeros_like function that does this, and later, as we figure out the patterns we use, we can design our own mini-API (e.g. something to replace comprehensions). Sucks there's no standard way to do this.
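
A minimal sketch of the zeros_like helper discussed above (hypothetical name and signature): allocate once with similar, then zero in place, so GPU array types are preserved.

zeros_like(X::AbstractArray, dims::Integer...) = fill!(similar(X, dims...), zero(eltype(X)))
zeros_like(X::AbstractArray) = zeros_like(X, size(X)...)

# the call site above would then read:
# e = zeros_like(X, size(cX, 1), M - prev_nlocked)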

lower_diag = one(similar(X, lenXn, lenXn))
# e is zero everywhere except for an identity block on one of its lower diagonals
e[Xn_indices[1] : last(Xn_indices), 1 : lenXn] = lower_diag

cP = cX .- e
cP = cP[:, Xn_indices]
# orthogonalize against all Xn (including newly locked)
ortho!(cP, cX, cX, tol=ortho_tol)

# Get new P
new_P = array_mul( Y, cP)
new_AP = array_mul(AY, cP)
new_P = Y * cP
new_AP = AY * cP
if B != I
new_BP = array_mul(BY, cP)
new_BP = BY * cP
else
new_BP = new_P
end
@@ -418,8 +498,8 @@ end

# Orthogonalize R wrt all X, newly active P
if niter > 0
Z = mortar((full_X, P))
BZ = mortar((full_BX, BP)) # data shared with (full_X, P) in non-general case
Z = BlockMatrix(full_X, P)
BZ = BlockMatrix(full_BX, BP) # data shared with (full_X, P) in non-general case
else
Z = full_X
BZ = full_BX
19 changes: 19 additions & 0 deletions src/workarounds/gpu_arrays.jl
@@ -0,0 +1,19 @@
# TODO: remove this when it is implemented in GPUArrays and CUDA
import LinearAlgebra.dot, LinearAlgebra.eigen
using LinearAlgebra
using GPUArrays
using CUDA

# https://github.com/JuliaGPU/CUDA.jl/issues/1565
LinearAlgebra.dot(x::AbstractGPUArray, D::Diagonal, y::AbstractGPUArray) = x' * (D * y)

# https://github.com/JuliaGPU/CUDA.jl/issues/1572
function LinearAlgebra.eigen(A::Hermitian{T,AT}) where {T <: Complex,AT <: CuArray}

Collaborator: We should open a PR to CUDA.jl to implement this there.

vals, vects = CUDA.CUSOLVER.heevd!('V','U', A.data)
(vectors = vects, values = vals)
end

function LinearAlgebra.eigen(A::Hermitian{T,AT}) where {T <: Real,AT <: CuArray}
vals, vects = CUDA.CUSOLVER.syevd!('V','U', A.data)
(vectors = vects, values = vals)
end
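A usage sketch for these workarounds (requires a functional CUDA device; illustrative, not part of the diff):

using CUDA, LinearAlgebra

A = CUDA.randn(Float64, 16, 16)
H = Hermitian(A + A')          # Hermitian wrapper around a CuArray
F = eigen(H)                   # hits the syevd! workaround above
F.values, F.vectors            # named-tuple fields mirror LinearAlgebra.Eigen

x = CUDA.rand(16)
D = Diagonal(CUDA.rand(16))
dot(x, D, x)                   # hits the three-argument dot workaround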
20 changes: 20 additions & 0 deletions test/lobpcg.jl
@@ -122,3 +122,23 @@ end
@test res1.λ[ik] ≈ res2.λ[ik] atol=1e-6
end
end

@testset "LOBPCG Internal data structures" begin
a1 = rand(10, 5)
a2 = rand(10, 2)
a3 = rand(10, 7)
b1 = rand(10, 6)
b2 = rand(10, 2)
A = hcat(a1, a2, a3)
B = hcat(b1, b2)
Ablock = DFTK.BlockMatrix(a1, a2, a3)
Bblock = DFTK.BlockMatrix(b1, b2)
@test Ablock' * Bblock ≈ A' * B
@test Ablock' * B ≈ A' * B

C = rand(14, 4)
@test Ablock * C ≈ A * C

D = rand(10, 4)
@test mul!(D, Ablock, C, 1, 0) ≈ A * C
end
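
A hypothetical GPU counterpart of this testset (a sketch only: it assumes a functional CUDA device and is not part of the PR):

using CUDA

@testset "LOBPCG internal data structures on GPU (sketch)" begin
    if CUDA.functional()
        a1 = CUDA.rand(10, 5)
        a2 = CUDA.rand(10, 2)
        A  = hcat(a1, a2)
        Ablock = DFTK.BlockMatrix(a1, a2)
        @test Array(Ablock' * Ablock) ≈ Array(A' * A)

        C = CUDA.rand(7, 4)
        @test Array(Ablock * C) ≈ Array(A * C)
    end
end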