Skip to content

Commit

Permalink
Merge pull request #443 from JuliaHealth/optimize-precession
Browse files Browse the repository at this point in the history
Optimize run_spin_precession! and run_spin_excitation! for CPU
  • Loading branch information
cncastillo authored Jul 19, 2024
2 parents 1de7627 + 34a3a03 commit e09ba44
Show file tree
Hide file tree
Showing 8 changed files with 251 additions and 53 deletions.
1 change: 1 addition & 0 deletions KomaMRICore/src/datatypes/Spinor.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ Spinor(α::Complex{T}, β::Complex{T}) where {T<:Real} = Spinor([α], [β])
Spinor::T, β::T) where {T<:Real} = Spinor([complex(α)], [complex(β)])
one(T::Spinor) = Spinor(1.,0.)
Base.getindex(s::Spinor, i) = Spinor(s.α[i], s.β[i])
Base.view(s::Spinor, i::UnitRange) = @views Spinor(s.α[i], s.β[i])
"""
str = show(io::IO, s::Spinor)
Expand Down
151 changes: 151 additions & 0 deletions KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
"""Stores preallocated structs for use in Bloch CPU run_spin_precession function."""
struct BlochCPUPrealloc{T} <: PreallocResult{T}
M::Mag{T} # Mag{T}
Bz_old::AbstractVector{T} # Vector{T}(Nspins x 1)
Bz_new::AbstractVector{T} # Vector{T}(Nspins x 1)
ϕ::AbstractVector{T} # Vector{T}(Nspins x 1)
φ::AbstractVector{T} # Vector{T}(Nspins x 1)
Rot::Spinor{T} # Spinor{T}
end

Base.view(p::BlochCPUPrealloc, i::UnitRange) = begin
@views BlochCPUPrealloc(
p.M[i],
p.Bz_old[i],
p.Bz_new[i],
p.ϕ[i],
p.φ[i],
p.Rot[i]
)
end

"""Preallocates arrays for use in run_spin_precession."""
function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}) where {T<:Real}
return BlochCPUPrealloc(
Mag(
similar(M.xy),
similar(M.z)
),
similar(obj.x),
similar(obj.x),
similar(obj.x),
similar(obj.x),
Spinor(
similar(M.xy),
similar(M.xy)
)
)
end

"""
run_spin_precession(obj, seq, Xt, sig, M, sim_method, backend, prealloc)
Alternate implementation of the run_spin_precession! function in BlochSimpleSimulationMethod.jl
optimized for the CPU. Uses a loop to step through time instead of allocating a matrix of size
NSpins x seq.t. The Bz_old, Bz_new, ϕ, and Mxy arrays are pre-allocated in run_sim_time_iter! so
that they can be re-used from block to block.
"""
function run_spin_precession!(
p::Phantom{T},
seq::DiscreteSequence{T},
sig::AbstractArray{Complex{T}},
M::Mag{T},
sim_method::Bloch,
backend::KA.CPU,
prealloc::BlochCPUPrealloc
) where {T<:Real}
#Simulation
#Motion
x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t[1,:]')

#Initialize arrays
Bz_old = prealloc.Bz_old
Bz_new = prealloc.Bz_new
ϕ = prealloc.ϕ
Mxy = prealloc.M.xy
fill!(ϕ, zero(T))
@. Bz_old = x[:,1] * seq.Gx[1] + y[:,1] * seq.Gy[1] + z[:,1] * seq.Gz[1] + p.Δw / T(2π * γ)

# Fill sig[1] if needed
ADC_idx = 1
if (seq.ADC[1])
sig[1] = sum(M.xy)
ADC_idx += 1
end

t_seq = zero(T) # Time
for seq_idx=2:length(seq.t)
x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t[seq_idx,:]')
t_seq += seq.Δt[seq_idx-1]

#Effective Field
@. Bz_new = x * seq.Gx[seq_idx] + y * seq.Gy[seq_idx] + z * seq.Gz[seq_idx] + p.Δw / T(2π * γ)

#Rotation
@. ϕ += (Bz_old + Bz_new) * T(-π * γ) * seq.Δt[seq_idx-1]

#Acquired Signal
if seq_idx <= length(seq.ADC) && seq.ADC[seq_idx]
@. Mxy = exp(-t_seq / p.T2) * M.xy * cis(ϕ)
sig[ADC_idx] = sum(Mxy)
ADC_idx += 1
end

Bz_old, Bz_new = Bz_new, Bz_old
end

#Final Spin-State
@. M.xy = M.xy * exp(-t_seq / p.T2) * cis(ϕ)
@. M.z = M.z * exp(-t_seq / p.T1) + p.ρ * (T(1) - exp(-t_seq / p.T1))

return nothing
end

"""
run_spin_excitation!(obj, seq, sig, M, sim_method, backend, prealloc)
Alternate implementation of the run_spin_excitation! function in BlochSimpleSimulationMethod.jl
optimized for the CPU. Uses preallocation for all arrays to reduce memory usage.
"""
function run_spin_excitation!(
p::Phantom{T},
seq::DiscreteSequence{T},
sig::AbstractArray{Complex{T}},
M::Mag{T},
sim_method::Bloch,
backend::KA.CPU,
prealloc::BlochCPUPrealloc
) where {T<:Real}
ΔBz = prealloc.Bz_old
Bz = prealloc.Bz_new
B = prealloc.ϕ
φ = prealloc.φ
α = prealloc.Rot.α
β = prealloc.Rot.β
Maux_xy = prealloc.M.xy
Maux_z = prealloc.M.z

#Can be calculated outside of loop
@. ΔBz = p.Δw / T(2π * γ)

#Simulation
for s in seq #This iterates over seq, "s = seq[i,:]"
#Motion
x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, s.t)
#Effective field
@. Bz = (s.Gx * x + s.Gy * y + s.Gz * z) + ΔBz - s.Δf / T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
@. B = sqrt(abs(s.B1)^2 + abs(Bz)^2)
@. B[B == 0] = eps(T)
#Spinor Rotation
@. φ = T(-π * γ) * (B * s.Δt) # TODO: Use trapezoidal integration here (?), this is just Forward Euler
@. α = cos(φ) - Complex{T}(im) * (Bz / B) * sin(φ)
@. β = -Complex{T}(im) * (s.B1 / B) * sin(φ)
mul!(Spinor(α, β), M, Maux_xy, Maux_z)
#Relaxation
@. M.xy = M.xy * exp(-s.Δt / p.T2)
@. M.z = M.z * exp(-s.Δt / p.T1) + p.ρ * (T(1) - exp(-s.Δt / p.T1))
end
#Acquired signal
#sig .= -1.4im #<-- This was to test if an ADC point was inside an RF block
return nothing
end
Original file line number Diff line number Diff line change
Expand Up @@ -35,23 +35,24 @@ function run_spin_precession!(
sig::AbstractArray{Complex{T}},
M::Mag{T},
sim_method::BlochDict,
backend::KA.Backend
backend::KA.Backend,
prealloc::PreallocResult
) where {T<:Real}
#Simulation
#Motion
x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
#Effective field
Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw / T(2π * γ)
Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw ./ T(2π .* γ)
#Rotation
if is_ADC_on(seq)
ϕ = T(-2π * γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
ϕ = T(-2π .* γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
else
ϕ = T(-2π * γ) .* trapz(seq.Δt', Bz)
ϕ = T(-2π .* γ) .* trapz(seq.Δt', Bz)
end
#Mxy precession and relaxation, and Mz relaxation
tp = cumsum(seq.Δt) # t' = t - t0
dur = sum(seq.Δt) # Total length, used for signal relaxation
Mxy = [M.xy M.xy .* exp.(-tp' ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))] #This assumes Δw and T2 are constant in time
Mxy = [M.xy M.xy .* exp.(-tp' ./ p.T2) .* (cos.(ϕ) .+ im .* sin.(ϕ))] #This assumes Δw and T2 are constant in time
M.xy .= Mxy[:, end]
#Acquired signal
sig[:, :, 1] .= transpose(Mxy[:, findall(seq.ADC)])
Expand All @@ -64,4 +65,4 @@ function run_spin_precession!(
M.z .= M.z .* exp.(-dur ./ p.T1) .+ p.ρ .* (1 .- exp.(-dur ./ p.T1)) #Jump to the last point
end
return nothing
end
end
Original file line number Diff line number Diff line change
@@ -1,26 +1,10 @@
struct Bloch <: SimulationMethod end
#Simplest sim method, works for GPU and CPU but not optimized for either. Although Bloch()
#is the simulation method chosen if none is passed, the run_spin_precession! and
#run_spin_excitation! functions in this file are dispatched to at the most abstract level,
#so new simulation methods will start by using these functions.
struct BlochSimple <: SimulationMethod end

export Bloch

include("Magnetization.jl") #Defines Mag <: SpinStateRepresentation
@functor Mag #Gives gpu acceleration capabilities, see GPUFunctions.jl

function sim_output_dim(
obj::Phantom{T}, seq::Sequence, sys::Scanner, sim_method::SimulationMethod
) where {T<:Real}
return (sum(seq.ADC.N), 1) #Nt x Ncoils, This should consider the coil info from sys
end

"""Magnetization initialization for Bloch simulation method."""
function initialize_spins_state(
obj::Phantom{T}, sim_method::SimulationMethod
) where {T<:Real}
Nspins = length(obj)
Mxy = zeros(T, Nspins)
Mz = obj.ρ
Xt = Mag{T}(Mxy, Mz)
return Xt, obj
end
export BlochSimple

"""
run_spin_precession(obj, seq, Xt, sig)
Expand All @@ -43,23 +27,24 @@ function run_spin_precession!(
sig::AbstractArray{Complex{T}},
M::Mag{T},
sim_method::SimulationMethod,
backend::KA.Backend
backend::KA.Backend,
prealloc::PreallocResult
) where {T<:Real}
#Simulation
#Motion
x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
#Effective field
Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw / T(2π * γ)
Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw ./ T(2π .* γ)
#Rotation
if is_ADC_on(seq)
ϕ = T(-2π * γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
ϕ = T(-2π .* γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
else
ϕ = T(-2π * γ) .* trapz(seq.Δt', Bz)
ϕ = T(-2π .* γ) .* trapz(seq.Δt', Bz)
end
#Mxy precession and relaxation, and Mz relaxation
tp = cumsum(seq.Δt) # t' = t - t0
dur = sum(seq.Δt) # Total length, used for signal relaxation
Mxy = [M.xy M.xy .* exp.(-tp' ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))] #This assumes Δw and T2 are constant in time
Mxy = [M.xy M.xy .* exp.(-tp' ./ p.T2) .* (cos.(ϕ) .+ im .* sin.(ϕ))] #This assumes Δw and T2 are constant in time
M.xy .= Mxy[:, end]
M.z .= M.z .* exp.(-dur ./ p.T1) .+ p.ρ .* (1 .- exp.(-dur ./ p.T1))
#Acquired signal
Expand Down Expand Up @@ -89,18 +74,20 @@ function run_spin_excitation!(
sig::AbstractArray{Complex{T}},
M::Mag{T},
sim_method::SimulationMethod,
backend::KA.Backend,
prealloc::PreallocResult
) where {T<:Real}
#Simulation
for s in seq #This iterates over seq, "s = seq[i,:]"
#Motion
x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, s.t)
#Effective field
ΔBz = p.Δw ./ T(2π * γ) .- s.Δf ./ T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
ΔBz = p.Δw ./ T(2π .* γ) .- s.Δf ./ T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
Bz = (s.Gx .* x .+ s.Gy .* y .+ s.Gz .* z) .+ ΔBz
B = sqrt.(abs.(s.B1) .^ 2 .+ abs.(Bz) .^ 2)
B[B .== 0] .= eps(T)
#Spinor Rotation
φ = T(-2π * γ) * (B .* s.Δt) # TODO: Use trapezoidal integration here (?), this is just Forward Euler
φ = T(-2π .* γ) .* (B .* s.Δt) # TODO: Use trapezoidal integration here (?), this is just Forward Euler
mul!(Q(φ, s.B1 ./ B, Bz ./ B), M)
#Relaxation
M.xy .= M.xy .* exp.(-s.Δt ./ p.T2)
Expand All @@ -109,4 +96,4 @@ function run_spin_excitation!(
#Acquired signal
#sig .= -1.4im #<-- This was to test if an ADC point was inside an RF block
return nothing
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,23 @@ Parameter relations for the Shinnar-Le Roux selective excitation pulse design al
(NMR imaging).
IEEE Transactions on Medical Imaging, 10(1), 53-65. doi:10.1109/42.75611
"""
mul!(s::Spinor, M::Mag) = begin
mul!(s::Spinor{T}, M::Mag) where {T<:Real} = begin
M_aux = Mag(
2*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy),
(abs.(s.α).^2 .-abs.(s.β).^2).*M.z.-2*real.(s.α.*s.β.*conj.(M.xy))
T(2) .*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy),
(abs.(s.α).^2 .-abs.(s.β).^2).*M.z.-T(2) .*real.(s.α.*s.β.*conj.(M.xy))
)
M.xy .= M_aux.xy
M.z .= M_aux.z
end
*(s::Spinor, M::Mag) = begin
mul!(s::Spinor{T}, M::Mag, Maux_xy, Maux_z) where {T<:Real} = begin
@. Maux_xy = T(2)*conj(s.α)*s.β*M.z+conj(s.α)^2*M.xy-s.β^2*conj(M.xy)
@. Maux_z = (abs(s.α)^2 -abs(s.β)^2)*M.z-T(2) *real(s.α*s.β*conj(M.xy))
@. M.xy = Maux_xy
@. M.z = Maux_z
end
*(s::Spinor{T}, M::Mag) where {T<:Real} = begin
Mag(
2*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy),
(abs.(s.α).^2 .-abs.(s.β).^2).*M.z.-2*real.(s.α.*s.β.*conj.(M.xy))
T(2) .*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy),
(abs.(s.α).^2 .-abs.(s.β).^2).*M.z.-T(2) .*real.(s.α.*s.β.*conj.(M.xy))
)
end
40 changes: 40 additions & 0 deletions KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#Bloch is the default simulation method
struct Bloch <: SimulationMethod end

export Bloch

include("Magnetization.jl")

@functor Mag #Gives gpu acceleration capabilities, see GPUFunctions.jl

function sim_output_dim(
obj::Phantom{T}, seq::Sequence, sys::Scanner, sim_method::SimulationMethod
) where {T<:Real}
return (sum(seq.ADC.N), 1) #Nt x Ncoils, This should consider the coil info from sys
end

"""Magnetization initialization for Bloch simulation method."""
function initialize_spins_state(
obj::Phantom{T}, sim_method::SimulationMethod
) where {T<:Real}
Nspins = length(obj)
Mxy = zeros(T, Nspins)
Mz = obj.ρ
Xt = Mag{T}(Mxy, Mz)
return Xt, obj
end

abstract type PreallocResult{T<:Real} end

"""Default preallocation struct, stores nothing."""
struct DefaultPreAlloc{T} <: PreallocResult{T} end

Base.view(p::DefaultPreAlloc, i::UnitRange) = p

"""Default preallocation function."""
prealloc(sim_method::SimulationMethod, backend::KA.Backend, obj::Phantom{T}, M::Mag{T}) where {T<:Real} = DefaultPreAlloc{T}()

include("KernelFunctions.jl")
include("BlochSimple/BlochSimple.jl")
include("Bloch/BlochCPU.jl")
include("BlochDict/BlochDict.jl")
Loading

1 comment on commit e09ba44

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

KomaMRI Benchmarks

Benchmark suite Current: e09ba44 Previous: 1de7627 Ratio
MRI Lab/Bloch/CPU/2 thread(s) 244928608.5 ns 1866110763 ns 0.13
MRI Lab/Bloch/CPU/4 thread(s) 140758946 ns 945776837 ns 0.15
MRI Lab/Bloch/CPU/8 thread(s) 171370827 ns 911378432 ns 0.19
MRI Lab/Bloch/CPU/1 thread(s) 415982190.5 ns 3387409372.5 ns 0.12
MRI Lab/Bloch/GPU/CUDA 138386450.5 ns 153551114 ns 0.90
MRI Lab/Bloch/GPU/oneAPI 18715897786.5 ns 26913270216 ns 0.70
MRI Lab/Bloch/GPU/Metal 2988417291 ns 3261239667 ns 0.92
MRI Lab/Bloch/GPU/AMDGPU 1806396508 ns 1817370709.5 ns 0.99
Slice Selection 3D/Bloch/CPU/2 thread(s) 1141348868.5 ns 3756525851 ns 0.30
Slice Selection 3D/Bloch/CPU/4 thread(s) 622082227 ns 1843011618 ns 0.34
Slice Selection 3D/Bloch/CPU/8 thread(s) 462315010 ns 1330616070 ns 0.35
Slice Selection 3D/Bloch/CPU/1 thread(s) 2273113325 ns 6715606353 ns 0.34
Slice Selection 3D/Bloch/GPU/CUDA 255739261 ns 265940605.5 ns 0.96
Slice Selection 3D/Bloch/GPU/oneAPI 1709103436 ns 2149205885 ns 0.80
Slice Selection 3D/Bloch/GPU/Metal 1203673021 ns 1409920896 ns 0.85
Slice Selection 3D/Bloch/GPU/AMDGPU 688213296 ns 712193120 ns 0.97

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.