FFTW Poisson solver: fixes for non-trivial tile geometry

Improve the handling of cases where the Poisson solver data is unevenly distributed over the tiles (because of divisibility, jtot not divisible by nprocx or itot not divisible by nprocy). * p201_flat was declared with wrong size (tripped debug build) * pe,qe may be less on the last tile in x or y, if so the tridiagonal solver may operate on uninitialized data (tripped debug build) * before FFT, zero the unused data on the last tile, to avoid operating on uninitialized data (tripped debug build) * increase size of xrt,yrt and pad with zeroes. Last tile may access outside the old bounds when the data is unevenly distributed (tripped debug build) * when filling xrt, yrt, don't assume symmetry around center. This will be needed for open boundaries, and handles odd sizes correctly.
dalesteam · Mar 5, 2024 · 8f2c585 · v1kko · Mar 5, 2024 · fjansson
1 parent 58edad3
commit 8f2c585
Showing 1 changed file with 25 additions and 15 deletions.
diff --git a/src/modfftw.f90 b/src/modfftw.f90
@@ -41,6 +41,8 @@ module modfftw
   integer                         :: method
   integer                         :: konx, kony
   integer                         :: iony, jonx
+  integer                         :: konx_last, kony_last
+  integer                         :: iony_last, jonx_last
   real(pois_r), dimension(:), allocatable :: bufin, bufout
   interface fftw_plan_many_r2r_if
     procedure :: d_fftw_plan_many_r2r
@@ -134,14 +136,20 @@ subroutine fftwinit(p, Fp, d, xyrt, ps,pe,qs,qe)
       jonx = jonx + 1
     endif
 
+    konx_last = kmax - konx*(nprocx-1)
+    kony_last = kmax - kony*(nprocy-1)
+    iony_last = itot - iony*(nprocy-1)
+    jonx_last = jtot - jonx*(nprocx-1)
+
 ! Allocate communication buffers for the transpose functions
     sz = max( imax * jmax * konx * nprocx, & ! transpose a1
               iony * jmax * konx * nprocy, & ! transpose a2
               iony * jonx * konx * nprocx  ) ! transpose a3
 
     allocate(bufin (sz))
     allocate(bufout(sz))
-
+    bufin = 0
+    bufout = 0
 ! Allocate temporary arrays to hold transposed matrix
 
     ! calculate memory size as an long int (size_t)
@@ -164,7 +172,7 @@ subroutine fftwinit(p, Fp, d, xyrt, ps,pe,qs,qe)
     p210(1:itot,1:jmax,1:konx) => fptr(1:itot*jmax*konx)
     p210_flat(1:itot*jmax*konx)=> fptr(1:itot*jmax*konx)
     p201(1:jtot,1:konx,1:iony) => fptr(1:jtot*konx*iony)
-    p201_flat(1:itot*konx*iony)=> fptr(1:itot*konx*iony)
+    p201_flat(1:jtot*konx*iony)=> fptr(1:jtot*konx*iony)
     Fp(1:iony,1:jonx,1:kmax) => fptr(1:iony*jonx*kmax)
 
     ! Prepare 1d FFT transforms
@@ -254,6 +262,10 @@ subroutine fftwinit(p, Fp, d, xyrt, ps,pe,qs,qe)
     qs = 1
     qe = jonx
 
+    ! special case for final task in x or y, which may have fewer elements
+    if (myidx == nprocx-1) qe = jonx_last
+    if (myidy == nprocy-1) pe = iony_last
+
     else if (method == 2) then
 
     ! Make an array with the halo zone sliced off on the top and the left
@@ -647,6 +659,10 @@ subroutine fftwf(p, Fp)
       call fftw_execute_r2r_if(planx, p210_flat, p210_flat)
 
       call transpose_a2(p210, p201)
+      ! zero the unused part, avoinds SIGFPE from the FFT (Debug mode)
+      ! p201(jtot,konx,iony)
+      if (myidx == nprocx-1) p201(:,konx_last+1:, :) = 0
+      if (myidy == nprocy-1) p201(:,:,iony_last+1:)  = 0
       call fftw_execute_r2r_if(plany, p201_flat, p201_flat)
 
       call transpose_a3(p201, Fp)
@@ -692,7 +708,9 @@ subroutine fftwinit_factors(xyrt)
 
     integer :: i,j,iv,jv
     real(pois_r)    :: fac
-    real(pois_r)    :: xrt(itot), yrt(jtot)
+    real(pois_r)    :: xrt(nprocy*iony), yrt(nprocx*jonx)
+    xrt = 0
+    yrt = 0
 
   ! Generate Eigenvalues xrt and yrt resulting from d**2/dx**2 F = a**2 F
 
@@ -712,33 +730,25 @@ subroutine fftwinit_factors(xyrt)
 
   ! I --> direction
     fac = 1./(2.*itot)
-    do i=2,(itot/2)
+    do i=2,itot
       xrt(i)=-4.*dxi*dxi*(sin(float(2*(i-1))*pi*fac))**2
-      xrt(itot - i + 2) = xrt(i)
     end do
     xrt(1) = 0.
-    if (mod(itot,2) == 0) then
-      ! Nyquist frequency
-      xrt(1 + itot/2) = -4.*dxi*dxi
-    endif
 
   ! J --> direction
     fac = 1./(2.*jtot)
-    do j=2,(jtot/2)
+    do j=2,jtot
       yrt(j)=-4.*dyi*dyi*(sin(float(2*(j-1))*pi*fac))**2
-      yrt(jtot - j + 2) = yrt(j)
     end do
     yrt(1) = 0.
-    if (mod(jtot,2) == 0) then
-      ! Nyquist frequency
-      yrt(1 + jtot/2) = -4.*dyi*dyi
-    endif
 
   ! Combine I and J directions
   ! Note that:
   ! 1. MPI starts counting at 0 so it should be myidy * jmax
   ! 2. real data, ie. no halo, starts at index 2 in the array xyrt(2,2) <-> xrt(1), yrt(1)
 
+  ! the last tile in x or y may have fewer elements than the rest
+  ! filling xyrt will then access xrt or yrt beyond itot or jtot. xrt and yrt are padded above with zeroes.
     if (method == 1) then
       ! nprocx /= 1, nprocy /= 1
       ! we will be working on matrix Fp(1:iony,1:jonx,1:kmax)