From feb8e1567aa75a30a5d570d7de18f54d448e9835 Mon Sep 17 00:00:00 2001
From: Zhi Liang <Zhi.Liang@noaa.gov>
Date: Tue, 8 Apr 2014 10:21:20 -0400
Subject: [PATCH 01/14] make visc_bound_rem to be private to solve slowdown
 issue

---
 src/parameterizations/lateral/MOM_hor_visc.F90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/parameterizations/lateral/MOM_hor_visc.F90 b/src/parameterizations/lateral/MOM_hor_visc.F90
index 28f67244d2..e00cfa801b 100644
--- a/src/parameterizations/lateral/MOM_hor_visc.F90
+++ b/src/parameterizations/lateral/MOM_hor_visc.F90
@@ -296,7 +296,7 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, CS, OBC)
         "VarMix%Res_fn_q both need to be associated with Resoln_scaled_Kh.")
   endif
 
-!$OMP parallel do default(shared) private(i, j, k, u0, v0, sh_xx, str_xx, &
+!$OMP parallel do default(shared) private(i, j, k, u0, v0, sh_xx, str_xx, visc_bound_rem,&
 !$OMP                                     sh_xy, str_xy, Ah, Kh, AhSm, KhSm, &
 !$OMP                                     Shear_mag, huq, hvq, hq, Kh_scale, hrat_min)
   do k=1,nz

From 5c741f6bc07be589177f89b087ea9332a9311bb0 Mon Sep 17 00:00:00 2001
From: Zhi Liang <Zhi.Liang@noaa.gov>
Date: Tue, 8 Apr 2014 10:22:43 -0400
Subject: [PATCH 02/14] implement openmp

---
 src/tracer/MOM_tracer_advect.F90 | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/tracer/MOM_tracer_advect.F90 b/src/tracer/MOM_tracer_advect.F90
index c5c1d8ef47..b95d7dc311 100644
--- a/src/tracer/MOM_tracer_advect.F90
+++ b/src/tracer/MOM_tracer_advect.F90
@@ -136,9 +136,11 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, CS, Reg)
   integer :: nsten_halo         ! The number of stensils that fit in the halos.
   integer :: i, j, k, m, is, ie, js, je, isd, ied, jsd, jed, nz, itt, ntr, do_any
   integer :: isv, iev, jsv, jev ! The valid range of the indices.
+  integer :: IsdB, IedB, JsdB, JedB
 
   is = G%isc ; ie = G%iec ; js = G%jsc ; je = G%jec ; nz = G%ke
   isd = G%isd ; ied = G%ied ; jsd = G%jsd ; jed = G%jed
+  IsdB = G%IsdB ; IedB = G%IedB ; JsdB = G%JsdB ; JedB = G%JedB
   landvolfill = 1.0e-20         ! This is arbitrary, but must be positive.
   stensil = 2                   ! The scheme's stensil; 2 for PLM.
 
@@ -156,12 +158,15 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, CS, Reg)
 
   max_iter = 2*INT(CEILING(dt/CS%dt)) + 1
 
+!$OMP parallel default(shared)
+
 ! This initializes the halos of uhr and vhr because pass_vector might do
 ! calculations on them, even though they are never used.
-  uhr(:,:,:) = 0.0 ; vhr(:,:,:) = 0.0
-  hprev(:,:,:) = landvolfill
-
-  do k=1,nz
+!$OMP do
+  do k = 1, nz
+    do j = jsd,  jed;  do i = IsdB, IedB; uhr(i,j,k) = 0.0; enddo ; enddo
+    do j = jsdB, jedB; do i = Isd,  Ied;  vhr(i,j,k) = 0.0; enddo ; enddo
+    do j = jsd,  jed;  do i = Isd,  Ied;  hprev(i,j,k) = 0.0; enddo ; enddo
     domore_k(k)=1
 !  Put the remaining (total) thickness fluxes into uhr and vhr.
     do j=js,je ; do I=is-1,ie ; uhr(I,j,k) = uhtr(I,j,k) ; enddo ; enddo
@@ -179,13 +184,16 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, CS, Reg)
                      max(0.0, 1.0e-13*hprev(i,j,k) - G%areaT(i,j)*h_end(i,j,k))
     enddo ; enddo
   enddo
+
+!$OMP do
   do j=jsd,jed ; do I=isd,ied-1
     uh_neglect(I,j) = G%H_subroundoff*MIN(G%areaT(i,j),G%areaT(i+1,j))
   enddo ; enddo
+!$OMP do
   do J=jsd,jed-1 ; do i=isd,ied
     vh_neglect(i,J) = G%H_subroundoff*MIN(G%areaT(i,j),G%areaT(i,j+1))
   enddo ; enddo
-
+!$OMP do
   do m=1,ntr
     if (associated(Tr(m)%ad_x)) then
       do k=1,nz ; do j=jsd,jed ; do i=isd,ied
@@ -204,6 +212,7 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, CS, Reg)
       do J=jsd,jed ; do i=isd,ied ; Tr(m)%ad2d_y(i,J) = 0.0 ; enddo ; enddo
     endif
   enddo
+!$OMP end parallel
 
   isv = is ; iev = ie ; jsv = js ; jev = je
 
@@ -222,6 +231,7 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, CS, Reg)
       ! Reevaluate domore_u & domore_v unless the valid range is the same size as
       ! before.  Also, do this if there is Strang splitting.
       if ((nsten_halo > 1) .or. (itt==1)) then
+!$OMP parallel do default(shared)
         do k=1,nz ; if (domore_k(k) > 0) then
           do j=jsv,jev ; if (.not.domore_u(j,k)) then
             do i=isv+stensil-1,iev-stensil; if (uhr(I,j,k) /= 0.0) then
@@ -248,6 +258,7 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, CS, Reg)
     isv = isv + stensil ; iev = iev - stensil
     jsv = jsv + stensil ; jev = jev - stensil
 
+!$OMP parallel do default(shared)
     do k=1,nz ; if (domore_k(k) > 0) then
 !    To ensure positive definiteness of the thickness at each iteration, the
 !  mass fluxes out of each layer are checked each step, and limited to keep

From b70a603139b1a5742e09c240244a1d89e1a55953 Mon Sep 17 00:00:00 2001
From: Zhi Liang <Zhi.Liang@noaa.gov>
Date: Tue, 8 Apr 2014 10:23:44 -0400
Subject: [PATCH 03/14] implement openmp

---
 src/tracer/MOM_tracer_registry.F90 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/tracer/MOM_tracer_registry.F90 b/src/tracer/MOM_tracer_registry.F90
index 0edef9262e..8faa2ad779 100644
--- a/src/tracer/MOM_tracer_registry.F90
+++ b/src/tracer/MOM_tracer_registry.F90
@@ -308,6 +308,7 @@ subroutine tracer_vertdiff(h_old, ea, eb, dt, tr, G, &
 
 
   if (present(sink_rate)) then
+!$OMP parallel do default(shared) private(sink,h_minus_dsink,b_denom_1,b1,d1,h_tr,c1)
     do j=js,je
       ! Find the sinking rates at all interfaces, limiting them if necesary
       ! so that the characteristics do not cross within a timestep.
@@ -375,6 +376,7 @@ subroutine tracer_vertdiff(h_old, ea, eb, dt, tr, G, &
       endif ; enddo ; enddo
     enddo
   else
+!$OMP parallel do default(shared) private(h_tr,b_denom_1,b1,d1,c1)
     do j=js,je
       do i=is,ie ; if (G%mask2dT(i,j) > 0.5) then
         h_tr = h_old(i,j,1) + h_neglect

From afd1b07911940a59e9e964e494ef070e30accab2 Mon Sep 17 00:00:00 2001
From: Zhi Liang <Zhi.Liang@noaa.gov>
Date: Wed, 9 Apr 2014 10:32:16 -0400
Subject: [PATCH 04/14] implement openmp

---
 .../vertical/MOM_geothermal.F90               | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/parameterizations/vertical/MOM_geothermal.F90 b/src/parameterizations/vertical/MOM_geothermal.F90
index d8631aae36..5995d70b7c 100644
--- a/src/parameterizations/vertical/MOM_geothermal.F90
+++ b/src/parameterizations/vertical/MOM_geothermal.F90
@@ -115,7 +115,7 @@ subroutine geothermal(h, tv, dt, ea, eb, G, CS)
 !  (in)      CS - The control structure returned by a previous call to
 !                 geothermal_init.
 
-  real :: resid(SZI_(G),SZJ_(G))
+!  real :: resid(SZI_(G),SZJ_(G)) !z1l: never been used.
   real, dimension(SZI_(G)) :: &
     heat_rem, & !   The remaining heat, in H degC.
     h_geo_rem, &!   The remaining thickness to which to apply the geothermal 
@@ -173,11 +173,12 @@ subroutine geothermal(h, tv, dt, ea, eb, G, CS)
   if (.not.ASSOCIATED(tv%T)) call MOM_error(FATAL, "MOM geothermal: "//&
       "Geothermal heating can only be applied if T & S are state variables.")
 
-  do i=is,ie ; do j=js,je
-    resid(i,j) = tv%internal_heat(i,j)
-  enddo ; enddo
-
+!  do i=is,ie ; do j=js,je
+!    resid(i,j) = tv%internal_heat(i,j)
+!  enddo ; enddo
 
+!$OMP parallel do default(private) shared(is,ie,js,je,G,CS,dt,Irho_cp,nkmb,tv,p_Ref, &
+!$OMP                                     h,Angstrom,nz,H_neglect,eb) 
   do j=js,je
     ! 1. Only work on columns that are being heated.
     ! 2. Find the deepest layer with any mass.
@@ -345,10 +346,10 @@ subroutine geothermal(h, tv, dt, ea, eb, G, CS)
     enddo ; endif
   enddo ! j-loop
 
-  do i=is,ie ; do j=js,je
-    resid(i,j) = tv%internal_heat(i,j) - resid(i,j) - G%H_to_kg_m2 * &
-           (G%mask2dT(i,j) * (CS%geo_heat(i,j) * (dt*Irho_cp)))
-  enddo ; enddo
+!  do i=is,ie ; do j=js,je
+!    resid(i,j) = tv%internal_heat(i,j) - resid(i,j) - G%H_to_kg_m2 * &
+!           (G%mask2dT(i,j) * (CS%geo_heat(i,j) * (dt*Irho_cp)))
+!  enddo ; enddo
 
 end subroutine geothermal
 

From d93cacdeb59e55b78ec2bb0807f474cefd6756e6 Mon Sep 17 00:00:00 2001
From: Zhi Liang <Zhi.Liang@noaa.gov>
Date: Wed, 9 Apr 2014 10:44:35 -0400
Subject: [PATCH 05/14] change loop structure and implement openmp

---
 src/tracer/MOM_tracer_hor_diff.F90 | 120 ++++++++++++++++++-----------
 1 file changed, 76 insertions(+), 44 deletions(-)

diff --git a/src/tracer/MOM_tracer_hor_diff.F90 b/src/tracer/MOM_tracer_hor_diff.F90
index bd08f694dc..d4ea69ea4a 100644
--- a/src/tracer/MOM_tracer_hor_diff.F90
+++ b/src/tracer/MOM_tracer_hor_diff.F90
@@ -445,6 +445,7 @@ subroutine tracer_epipycnal_ML_diff(h, dt, Tr, ntr, khdt_epi_x, khdt_epi_y, G, &
 
   real, dimension(SZI_(G), SZJ_(G), SZK_(G)) :: &
     tr_flux_conv  ! The flux convergence of tracers, in TR m3 or TR kg.
+  real, dimension(SZI_(G), SZJ_(G), SZK_(G)) :: Tr_flux_3d, Tr_adj_vert_L, Tr_adj_vert_R
  
   real, dimension(SZI_(G), SZK_(G), SZJ_(G)) :: &
     rho_srt, & ! The density of each layer of the sorted columns, in kg m-3.
@@ -526,12 +527,14 @@ subroutine tracer_epipycnal_ML_diff(h, dt, Tr, ntr, khdt_epi_x, khdt_epi_y, G, &
   enddo
   call pass_var(Tr(ntr)%t(:,:,:), G%Domain)
   call cpu_clock_end(id_clock_pass)
-
   ! Determine which layers the mixed- and buffer-layers map into...
-  do k=1,nkmb ; do j=js-2,je+2
-    call calculate_density(tv%T(:,j,k),tv%S(:,j,k), p_ref_cv, &
+!$OMP parallel do default(shared) 
+  do k=1,nkmb 
+    do j=js-2,je+2
+       call calculate_density(tv%T(:,j,k),tv%S(:,j,k), p_ref_cv, &
                            rho_coord(:,j,k), is-2, ie-is+5, tv%eqn_of_state)
-  enddo ; enddo
+    enddo 
+ enddo
 
   do j=js-2,je+2 ; do i=is-2,ie+2
     Rml_max(i,j) = rho_coord(i,j,1)
@@ -540,10 +543,10 @@ subroutine tracer_epipycnal_ML_diff(h, dt, Tr, ntr, khdt_epi_x, khdt_epi_y, G, &
   do k=2,nkmb ; do j=js-2,je+2 ; do i=is-2,ie+2
     if (Rml_max(i,j) < rho_coord(i,j,k)) Rml_max(i,j) = rho_coord(i,j,k)
   enddo ; enddo ; enddo
-
   !   Use bracketing and bisection to find the k-level that the densest of the
   ! mixed and buffer layer corresponds to, such that:
   !     G%Rlay(max_kRho-1) < Rml_max <= G%Rlay(max_kRho)
+!$OMP parallel do default(shared) private(k_min,k_max,k_test)
   do j=js-2,je+2 ; do i=is-2,ie+2 ; if (G%mask2dT(i,j) > 0.5) then
     if (Rml_max(i,j) > G%Rlay(nz)) then ; max_kRho(i,j) = nz+1
     elseif (Rml_max(i,j) <= G%Rlay(nkmb+1)) then ; max_kRho(i,j) = nkmb+1
@@ -569,27 +572,30 @@ subroutine tracer_epipycnal_ML_diff(h, dt, Tr, ntr, khdt_epi_x, khdt_epi_y, G, &
   if (PEmax_kRho > nz) PEmax_kRho = nz ! PEmax_kRho could have been nz+1.
 
   h_exclude = 10.0*(G%Angstrom + G%H_subroundoff)
-
-  do k=1,nkmb ; do j=js-1,je+1 ; do i=is-1,ie+1 ; if (G%mask2dT(i,j) > 0.5) then
-    if (h(i,j,k) > h_exclude) then
-      num_srt(i,j) = num_srt(i,j) + 1 ; ns = num_srt(i,j)
-      k0_srt(i,ns,j) = k
-      rho_srt(i,ns,j) = rho_coord(i,j,k)
-      h_srt(i,ns,j) = h(i,j,k)
-    endif
-  endif ; enddo ; enddo ; enddo
-  do k=nkmb+1,PEmax_kRho ; do j=js-1,je+1 ; do i=is-1,ie+1 ; if (G%mask2dT(i,j) > 0.5) then
-    if ((k<=k_end_srt(i,j)) .and. (h(i,j,k) > h_exclude)) then
-      num_srt(i,j) = num_srt(i,j) + 1 ; ns = num_srt(i,j)
-      k0_srt(i,ns,j) = k
-      rho_srt(i,ns,j) = G%Rlay(k)
-      h_srt(i,ns,j) = h(i,j,k)
-    endif
-  endif ; enddo ; enddo ; enddo
-
+!$OMP parallel default(shared) private(ns,tmp,itmp)
+!$OMP do
+  do j=js-1,je+1 
+    do k=1,nkmb ; do i=is-1,ie+1 ; if (G%mask2dT(i,j) > 0.5) then
+      if (h(i,j,k) > h_exclude) then
+        num_srt(i,j) = num_srt(i,j) + 1 ; ns = num_srt(i,j)
+        k0_srt(i,ns,j) = k
+        rho_srt(i,ns,j) = rho_coord(i,j,k)
+        h_srt(i,ns,j) = h(i,j,k)
+      endif
+    endif ; enddo ; enddo
+    do k=nkmb+1,PEmax_kRho ; do i=is-1,ie+1 ; if (G%mask2dT(i,j) > 0.5) then
+      if ((k<=k_end_srt(i,j)) .and. (h(i,j,k) > h_exclude)) then
+        num_srt(i,j) = num_srt(i,j) + 1 ; ns = num_srt(i,j)
+        k0_srt(i,ns,j) = k
+        rho_srt(i,ns,j) = G%Rlay(k)
+        h_srt(i,ns,j) = h(i,j,k)
+      endif
+    endif ; enddo ; enddo
+  enddo
   ! Sort each column by increasing density.  This should already be close, 
   ! and the size of the arrays are small, so straight insertion is used.
-  do j=js-1,je+1 ; do i=is-1,ie+1
+!$OMP do
+   do j=js-1,je+1; do i=is-1,ie+1
     do k=2,num_srt(i,j) ; if (rho_srt(i,k,j) < rho_srt(i,k-1,j)) then
       ! The last segment needs to be shuffled earlier in the list.
       do k2 = k,2,-1 ; if (rho_srt(i,k2,j) >= rho_srt(i,k2-1,j)) exit
@@ -598,12 +604,13 @@ subroutine tracer_epipycnal_ML_diff(h, dt, Tr, ntr, khdt_epi_x, khdt_epi_y, G, &
         tmp = h_srt(i,k2-1,j) ; h_srt(i,k2-1,j) = h_srt(i,k2,j) ; h_srt(i,k2,j) = tmp
       enddo
     endif ; enddo
-  enddo ; enddo
-
+  enddo; enddo
+!$OMP do
   do j=js-1,je+1
     max_srt(j) = 0
     do i=is-1,ie+1 ; max_srt(j) = max(max_srt(j), num_srt(i,j)) ; enddo
   enddo
+!$OMP end parallel
 
   do j=js,je
     k_size = max(2*max_srt(j),1)
@@ -617,6 +624,9 @@ subroutine tracer_epipycnal_ML_diff(h, dt, Tr, ntr, khdt_epi_x, khdt_epi_y, G, &
     allocate(k0b_Ru(j)%p(IsdB:IedB,k_size))
   enddo
 
+!$OMP parallel do default(private) shared(is,ie,js,je,G,num_srt,rho_srt,k0b_Lu,k0_srt, &
+!$OMP                                     k0b_Ru,k0a_Lu,k0a_Ru,deep_wt_Lu,deep_wt_Ru,  &
+!$OMP                                     h_srt,nkmb,nPu,hP_Lu,hP_Ru) 
   do j=js,je ; do I=is-1,ie ; if (G%mask2dCu(I,j) > 0.5) then
     ! Set up the pairings for fluxes through the zonal faces.
 
@@ -763,6 +773,9 @@ subroutine tracer_epipycnal_ML_diff(h, dt, Tr, ntr, khdt_epi_x, khdt_epi_y, G, &
     allocate(k0b_Rv(J)%p(isd:ied,k_size))
   enddo
 
+!$OMP parallel do default(private) shared(is,ie,js,je,G,num_srt,rho_srt,k0b_Lv,k0b_Rv, &
+!$OMP                                     k0_srt,k0a_Lv,k0a_Rv,deep_wt_Lv,deep_wt_Rv,  &
+!$OMP                                     h_srt,nkmb,nPv,hP_Lv,hP_Rv)   
   do J=js-1,je ; do i=is,ie ; if (G%mask2dCv(i,J) > 0.5) then
     ! Set up the pairings for fluxes through the meridional faces.
 
@@ -917,7 +930,9 @@ subroutine tracer_epipycnal_ML_diff(h, dt, Tr, ntr, khdt_epi_x, khdt_epi_y, G, &
     endif
 
     do m=1,ntr
-    
+!$OMP parallel do default(shared) private(Tr_min_face,Tr_max_face,kLa,kLb,kRa,kRb,Tr_La, &
+!$OMP                                     Tr_Lb,Tr_Ra,Tr_Rb,Tr_av_L,wt_b,Tr_av_R,h_L,h_R, &
+!$OMP                                     Tr_flux,Tr_adj_vert,wt_a,vol)
       do j=js,je ; do I=is-1,ie ; if (G%mask2dCu(I,j) > 0.5) then
         ! Determine the fluxes through the zonal faces.
 
@@ -1051,6 +1066,9 @@ subroutine tracer_epipycnal_ML_diff(h, dt, Tr, ntr, khdt_epi_x, khdt_epi_y, G, &
         enddo ! Loop over pairings at faces.
       endif ; enddo ; enddo ! i- & j- loops over zonal faces.
 
+!$OMP parallel do default(shared) private(Tr_min_face,Tr_max_face,kLa,kLb,kRa,kRb, &
+!$OMP                                     Tr_La,Tr_Lb,Tr_Ra,Tr_Rb,Tr_av_L,wt_b,Tr_av_R, &
+!$OMP                                     h_L,h_R,Tr_flux,Tr_adj_vert,wt_a,vol)
       do J=js-1,je ; do i=is,ie ; if (G%mask2dCv(i,J) > 0.5) then
         ! Determine the fluxes through the meridional faces.
 
@@ -1105,10 +1123,9 @@ subroutine tracer_epipycnal_ML_diff(h, dt, Tr, ntr, khdt_epi_x, khdt_epi_y, G, &
           h_L = hP_Lv(J)%p(i,k) ; h_R = hP_Rv(J)%p(i,k)
           Tr_flux = I_maxitt * ((2.0 * h_L * h_R) / (h_L + h_R)) * &
                     khdt_epi_y(i,J) * (Tr_av_L - Tr_av_R)
-                    
-          if (deep_wt_Lv(J)%p(i,k) >= 1.0) then
-            tr_flux_conv(i,j,kLb) = tr_flux_conv(i,j,kLb) - Tr_flux
-          else
+          Tr_flux_3d(i,j,k) = Tr_flux
+
+          if (deep_wt_Lv(J)%p(i,k) < 1.0) then
             Tr_adj_vert = 0.0
             wt_b = deep_wt_Lv(J)%p(i,k) ; wt_a = 1.0 - wt_b
             vol = hP_Lv(J)%p(i,k) * G%areaT(i,j)
@@ -1132,14 +1149,10 @@ subroutine tracer_epipycnal_ML_diff(h, dt, Tr, ntr, khdt_epi_x, khdt_epi_y, G, &
                                           (vol*wt_a)*(Tr_Lb - Tr_La))
               endif
             endif
-
-            tr_flux_conv(i,j,kLa) = tr_flux_conv(i,j,kLa) - (wt_a*Tr_flux + Tr_adj_vert)
-            tr_flux_conv(i,j,kLb) = tr_flux_conv(i,j,kLb) - (wt_b*Tr_flux - Tr_adj_vert)
+            Tr_adj_vert_L(i,j,k) = Tr_adj_vert            
           endif
 
-          if (deep_wt_Rv(J)%p(i,k) >= 1.0) then
-            tr_flux_conv(i,j+1,kRb) = tr_flux_conv(i,j+1,kRb) + Tr_flux
-          else
+          if (deep_wt_Rv(J)%p(i,k) < 1.0) then
             Tr_adj_vert = 0.0
             wt_b = deep_wt_Rv(J)%p(i,k) ; wt_a = 1.0 - wt_b
             vol = hP_Rv(J)%p(i,k) * G%areaT(i,j+1)
@@ -1163,18 +1176,37 @@ subroutine tracer_epipycnal_ML_diff(h, dt, Tr, ntr, khdt_epi_x, khdt_epi_y, G, &
                                           (vol*wt_a)*(Tr_Rb - Tr_Ra))
               endif
             endif
-
-            tr_flux_conv(i,j+1,kRa) = tr_flux_conv(i,j+1,kRa) + &
-                                            (wt_a*Tr_flux - Tr_adj_vert)
-            tr_flux_conv(i,j+1,kRb) = tr_flux_conv(i,j+1,kRb) + &
-                                            (wt_b*Tr_flux + Tr_adj_vert)
+            Tr_adj_vert_R(i,j,k) = Tr_adj_vert
           endif
           if (associated(Tr(m)%df2d_y)) &
             Tr(m)%df2d_y(i,J) = Tr(m)%df2d_y(i,J) + Tr_flux * Idt
         enddo ! Loop over pairings at faces.
       endif ; enddo ; enddo ! i- & j- loops over meridional faces.
-
-
+!$OMP parallel do default(shared) private(kLa,kLb,kRa,kRb,wt_b,wt_a)
+      do i=is,ie ; do J=js-1,je ; if (G%mask2dCv(i,J) > 0.5) then
+        do k=1,nPv(i,J)
+          kLb = k0b_Lv(J)%p(i,k); kRb = k0b_Rv(J)%p(i,k)
+          if (deep_wt_Lv(J)%p(i,k) >= 1.0) then
+            tr_flux_conv(i,j,kLb) = tr_flux_conv(i,j,kLb) - Tr_flux_3d(i,j,k)
+          else
+            kLa = k0a_Lv(J)%p(i,k)
+            wt_b = deep_wt_Lv(J)%p(i,k) ; wt_a = 1.0 - wt_b 
+            tr_flux_conv(i,j,kLa) = tr_flux_conv(i,j,kLa) - (wt_a*Tr_flux_3d(i,j,k) + Tr_adj_vert_L(i,j,k))
+            tr_flux_conv(i,j,kLb) = tr_flux_conv(i,j,kLb) - (wt_b*Tr_flux_3d(i,j,k) - Tr_adj_vert_L(i,j,k))
+          endif            
+          if (deep_wt_Rv(J)%p(i,k) >= 1.0) then
+            tr_flux_conv(i,j+1,kRb) = tr_flux_conv(i,j+1,kRb) + tr_flux_3d(i,j,k)
+          else          
+            kRa = k0a_Rv(J)%p(i,k) 
+            wt_b = deep_wt_Rv(J)%p(i,k) ; wt_a = 1.0 - wt_b
+            tr_flux_conv(i,j+1,kRa) = tr_flux_conv(i,j+1,kRa) + &
+                                            (wt_a*Tr_flux_3d(i,j,k) - Tr_adj_vert_R(i,j,k))
+            tr_flux_conv(i,j+1,kRb) = tr_flux_conv(i,j+1,kRb) + &
+                                            (wt_b*Tr_flux_3d(i,j,k) + Tr_adj_vert_R(i,j,k))
+          endif
+        enddo
+      endif ; enddo ; enddo
+!$OMP parallel do default(shared)
       do k=1,PEmax_kRho ; do j=js,je ; do i=is,ie
         if ((G%mask2dT(i,j) > 0.5) .and. (h(i,j,k) > 0.0)) then
           Tr(m)%t(i,j,k) = Tr(m)%t(i,j,k) + tr_flux_conv(i,j,k) / &

From 287ac5335a4fc628014fd9029e2facbbae05a0b2 Mon Sep 17 00:00:00 2001
From: Zhi Liang <Zhi.Liang@noaa.gov>
Date: Wed, 9 Apr 2014 11:00:55 -0400
Subject: [PATCH 06/14] rearrange loop structure and implement openmp

---
 .../lateral/MOM_thickness_diffuse.F90         | 180 ++++++++++--------
 1 file changed, 100 insertions(+), 80 deletions(-)

diff --git a/src/parameterizations/lateral/MOM_thickness_diffuse.F90 b/src/parameterizations/lateral/MOM_thickness_diffuse.F90
index daa11a939b..513fe441d4 100644
--- a/src/parameterizations/lateral/MOM_thickness_diffuse.F90
+++ b/src/parameterizations/lateral/MOM_thickness_diffuse.F90
@@ -408,10 +408,10 @@ subroutine thickness_diffuse_full(h, e, Kh_u, Kh_v, tv, uhD, vhD, dt, G, MEKE, &
   real, dimension(SZI_(G), SZJ_(G), SZK_(G)+1) :: &
     pres, &       ! The pressure at an interface, in Pa.
     h_avail_rsum  ! The running sum of h_avail above an interface, in m3 s-1.
-  real, dimension(SZIB_(G), SZJ_(G)) :: &
+  real, dimension(SZIB_(G)) :: &
     drho_dT_u, &  ! The derivatives of density with temperature and
     drho_dS_u     ! salinity at u points, in kg m-3 K-1 and kg m-3 psu-1.
-  real, dimension(SZI_(G), SZJB_(G)) :: &
+  real, dimension(SZI_(G)) :: &
     drho_dT_v, &  ! The derivatives of density with temperature and
     drho_dS_v     ! salinity at v points, in kg m-3 K-1 and kg m-3 psu-1.
   real :: uhtot(SZIB_(G), SZJ_(G))  ! The vertical sum of uhD, in m3 s-1.
@@ -464,7 +464,7 @@ subroutine thickness_diffuse_full(h, e, Kh_u, Kh_v, tv, uhD, vhD, dt, G, MEKE, &
 ! Diagnostics that should be eliminated altogether later...
  ! real, dimension(SZIB_(G), SZJ_(G), SZK_(G)+1) :: sfn_x, sfn_slope_x
  ! real, dimension(SZI_(G), SZJB_(G), SZK_(G)+1) :: sfn_y, sfn_slope_y
-  logical :: MEKE_not_null
+  logical :: MEKE_not_null, present_int_slope_u, present_int_slope_v
   integer :: is, ie, js, je, nz, IsdB
   integer :: i, j, k
   is = G%isc ; ie = G%iec ; js = G%jsc ; je = G%jec ; nz = G%ke ; IsdB = G%IsdB
@@ -477,6 +477,8 @@ subroutine thickness_diffuse_full(h, e, Kh_u, Kh_v, tv, uhD, vhD, dt, G, MEKE, &
 
   use_EOS = associated(tv%eqn_of_state)
   MEKE_not_null = (LOC(MEKE) .NE. 0)
+  present_int_slope_u = PRESENT(int_slope_u)
+  present_int_slope_u = PRESENT(int_slope_v)
 
   nk_linear = max(G%nkml, 1)
 
@@ -524,40 +526,42 @@ subroutine thickness_diffuse_full(h, e, Kh_u, Kh_v, tv, uhD, vhD, dt, G, MEKE, &
   enddo ; enddo
 !$OMP end parallel
 
-!!$OMP parallel do default(none) 
-  do K=nz,2,-1
+!$OMP parallel do default(private) shared(nz,is,ie,js,je,find_work,use_EOS,G,pres,T,S, &
+!$OMP                                     nk_linear,IsdB,tv,h,h_neglect,e,dz_neglect, &
+!$OMP                                     I_slope_max2,h_neglect2,present_int_slope_u, &
+!$OMP                                     int_slope_u,KH_u,uhtot,h_frac,h_avail_rsum,  &
+!$OMP                                     uhD,h_avail,G_scale,work_u) 
+  do j = js,je ; do K=nz,2,-1
     if (find_work .and. .not.(use_EOS)) then
-      drdiA = 0.0 ; drdiB = 0.0 ; drdjA = 0.0 ; drdjB = 0.0
+      drdiA = 0.0 ; drdiB = 0.0
 !       drdkL = G%g_prime(k) ; drdkR = G%g_prime(k)
       drdkL = G%Rlay(k)-G%Rlay(k-1) ; drdkR = G%Rlay(k)-G%Rlay(k-1)
     endif
 
     ! Calculate the zonal fluxes and gradients.
     if (use_EOS .and. ((k > nk_linear) .or. find_work)) then
-      do j=js,je
-        do I=is-1,ie
-          pres_u(I) = 0.5*(pres(i,j,K) + pres(i+1,j,K))
-          T_u(I) = 0.25*((T(i,j,k) + T(i+1,j,k)) + (T(i,j,k-1) + T(i+1,j,k-1)))
-          S_u(I) = 0.25*((S(i,j,k) + S(i+1,j,k)) + (S(i,j,k-1) + S(i+1,j,k-1)))
-        enddo
-        call calculate_density_derivs(T_u, S_u, pres_u, drho_dT_u(:,j), &
-                     drho_dS_u(:,j), (is-IsdB+1)-1, ie-is+2, tv%eqn_of_state)
+      do I=is-1,ie
+        pres_u(I) = 0.5*(pres(i,j,K) + pres(i+1,j,K))
+        T_u(I) = 0.25*((T(i,j,k) + T(i+1,j,k)) + (T(i,j,k-1) + T(i+1,j,k-1)))
+        S_u(I) = 0.25*((S(i,j,k) + S(i+1,j,k)) + (S(i,j,k-1) + S(i+1,j,k-1)))
       enddo
+      call calculate_density_derivs(T_u, S_u, pres_u, drho_dT_u, &
+                   drho_dS_u, (is-IsdB+1)-1, ie-is+2, tv%eqn_of_state)
     endif
 
-    do j=js,je ; do I=is-1,ie
+    do I=is-1,ie
       if (use_EOS .and. ((k > nk_linear) .or. find_work)) then
         ! Estimate the horizontal density gradients along layers.
-        drdiA = drho_dT_u(I,j) * (T(i+1,j,k-1)-T(i,j,k-1)) + &
-                drho_dS_u(I,j) * (S(i+1,j,k-1)-S(i,j,k-1))
-        drdiB = drho_dT_u(I,j) * (T(i+1,j,k)-T(i,j,k)) + &
-                drho_dS_u(I,j) * (S(i+1,j,k)-S(i,j,k))
+        drdiA = drho_dT_u(I) * (T(i+1,j,k-1)-T(i,j,k-1)) + &
+                drho_dS_u(I) * (S(i+1,j,k-1)-S(i,j,k-1))
+        drdiB = drho_dT_u(I) * (T(i+1,j,k)-T(i,j,k)) + &
+                drho_dS_u(I) * (S(i+1,j,k)-S(i,j,k))
 
         ! Estimate the vertical density gradients times the grid spacing.
-        drdkL = (drho_dT_u(I,j) * (T(i,j,k)-T(i,j,k-1)) + &
-                 drho_dS_u(I,j) * (S(i,j,k)-S(i,j,k-1)))
-        drdkR = (drho_dT_u(I,j) * (T(i+1,j,k)-T(i+1,j,k-1)) + &
-                 drho_dS_u(I,j) * (S(i+1,j,k)-S(i+1,j,k-1)))
+        drdkL = (drho_dT_u(I) * (T(i,j,k)-T(i,j,k-1)) + &
+                 drho_dS_u(I) * (S(i,j,k)-S(i,j,k-1)))
+        drdkR = (drho_dT_u(I) * (T(i+1,j,k)-T(i+1,j,k-1)) + &
+                 drho_dS_u(I) * (S(i+1,j,k)-S(i+1,j,k-1)))
       endif
 
 
@@ -601,7 +605,7 @@ subroutine thickness_diffuse_full(h, e, Kh_u, Kh_v, tv, uhD, vhD, dt, G, MEKE, &
             slope2_Ratio = 1.0e20  ! Force the use of the safe streamfunction.
           endif
 
-          if (present(int_slope_u)) then
+          if (present_int_slope_u) then
             Slope = (1.0 - int_slope_u(I,j,K)) * Slope + &
                     int_slope_u(I,j,K) * ((e(i+1,j,K)-e(i,j,K)) * G%IdxCu(I,j))
             slope2_Ratio = (1.0 - int_slope_u(I,j,K)) * slope2_Ratio
@@ -690,34 +694,45 @@ subroutine thickness_diffuse_full(h, e, Kh_u, Kh_v, tv, uhD, vhD, dt, G, MEKE, &
             (uhD(I,j,K) * drdiB) * 0.25 * &
             ((e(i,j,K) + e(i,j,K+1)) + (e(i+1,j,K) + e(i+1,j,K+1))) )
       endif
-
-    enddo ; enddo
+      
+    enddo
+  enddo ; enddo ! end of j-loop
 
     ! Calculate the meridional fluxes and gradients.
+!$OMP parallel do default(private) shared(nz,is,ie,js,je,find_work,use_EOS,G,pres,T,S, &
+!$OMP                                     nk_linear,IsdB,tv,h,h_neglect,e,dz_neglect, &
+!$OMP                                     I_slope_max2,h_neglect2,present_int_slope_v, &
+!$OMP                                     int_slope_v,KH_v,vhtot,h_frac,h_avail_rsum,  &
+!$OMP                                     vhD,h_avail,G_scale,Work_v) 
+  do j = js-1,je ; do K=nz,2,-1
+    if (find_work .and. .not.(use_EOS)) then
+      drdjA = 0.0 ; drdjB = 0.0
+!       drdkL = G%g_prime(k) ; drdkR = G%g_prime(k)
+      drdkL = G%Rlay(k)-G%Rlay(k-1) ; drdkR = G%Rlay(k)-G%Rlay(k-1)
+    endif
+
     if (use_EOS .and. ((k > nk_linear) .or. find_work)) then
-      do J=js-1,je
-        do i=is,ie
-          pres_v(i) = 0.5*(pres(i,j,K) + pres(i,j+1,K))
-          T_v(i) = 0.25*((T(i,j,k) + T(i,j+1,k)) + (T(i,j,k-1) + T(i,j+1,k-1)))
-          S_v(i) = 0.25*((S(i,j,k) + S(i,j+1,k)) + (S(i,j,k-1) + S(i,j+1,k-1)))
-        enddo
-        call calculate_density_derivs(T_v, S_v, pres_v, drho_dT_v(:,J), &
-                     drho_dS_v(:,J), is, ie-is+1, tv%eqn_of_state)
+      do i=is,ie
+        pres_v(i) = 0.5*(pres(i,j,K) + pres(i,j+1,K))
+        T_v(i) = 0.25*((T(i,j,k) + T(i,j+1,k)) + (T(i,j,k-1) + T(i,j+1,k-1)))
+        S_v(i) = 0.25*((S(i,j,k) + S(i,j+1,k)) + (S(i,j,k-1) + S(i,j+1,k-1)))
       enddo
+      call calculate_density_derivs(T_v, S_v, pres_v, drho_dT_v, &
+                   drho_dS_v, is, ie-is+1, tv%eqn_of_state)
     endif
-    do J=js-1,je ; do i=is,ie
+    do i=is,ie
       if (use_EOS .and. ((k > nk_linear) .or. find_work)) then
         ! Estimate the horizontal density gradients along layers.
-        drdjA = drho_dT_v(i,J) * (T(i,j+1,k-1)-T(i,j,k-1)) + &
-                drho_dS_v(i,J) * (S(i,j+1,k-1)-S(i,j,k-1))
-        drdjB = drho_dT_v(i,J) * (T(i,j+1,k)-T(i,j,k)) + &
-                drho_dS_v(i,J) * (S(i,j+1,k)-S(i,j,k))
+        drdjA = drho_dT_v(i) * (T(i,j+1,k-1)-T(i,j,k-1)) + &
+                drho_dS_v(i) * (S(i,j+1,k-1)-S(i,j,k-1))
+        drdjB = drho_dT_v(i) * (T(i,j+1,k)-T(i,j,k)) + &
+                drho_dS_v(i) * (S(i,j+1,k)-S(i,j,k))
 
         ! Estimate the vertical density gradients times the grid spacing.
-        drdkL = (drho_dT_v(i,J) * (T(i,j,k)-T(i,j,k-1)) + &
-                 drho_dS_v(i,J) * (S(i,j,k)-S(i,j,k-1)))
-        drdkR = (drho_dT_v(i,J) * (T(i,j+1,k)-T(i,j+1,k-1)) + &
-                 drho_dS_v(i,J) * (S(i,j+1,k)-S(i,j+1,k-1)))
+        drdkL = (drho_dT_v(i) * (T(i,j,k)-T(i,j,k-1)) + &
+                 drho_dS_v(i) * (S(i,j,k)-S(i,j,k-1)))
+        drdkR = (drho_dT_v(i) * (T(i,j+1,k)-T(i,j+1,k-1)) + &
+                 drho_dS_v(i) * (S(i,j+1,k)-S(i,j+1,k-1)))
       endif
 
       if (k > nk_linear) then
@@ -760,7 +775,7 @@ subroutine thickness_diffuse_full(h, e, Kh_u, Kh_v, tv, uhD, vhD, dt, G, MEKE, &
             slope2_Ratio = 1.0e20  ! Force the use of the safe streamfunction.
           endif
 
-          if (present(int_slope_v)) then
+          if (present_int_slope_v) then
             Slope = (1.0 - int_slope_v(i,J,K)) * Slope + &
                     int_slope_v(i,J,K) * ((e(i,j+1,K)-e(i,j,K)) * G%IdyCv(i,J))
             slope2_Ratio = (1.0 - int_slope_v(i,J,K)) * slope2_Ratio
@@ -847,54 +862,59 @@ subroutine thickness_diffuse_full(h, e, Kh_u, Kh_v, tv, uhD, vhD, dt, G, MEKE, &
            (vhD(i,J,K) * drdjB) * 0.25 * &
            ((e(i,j,K) + e(i,j,K+1)) + (e(i,j+1,K) + e(i+1,j,K+1))) )
       endif
-    enddo ; enddo
-  enddo ! k-loop
+    enddo
+  enddo ; enddo! j-loop
 
   ! In layer 1, enforce the boundary conditions that Sfn(z=0) = 0.0
   if (.not.find_work .or. .not.(use_EOS)) then
     do j=js,je ; do I=is-1,ie ; uhD(I,j,1) = -uhtot(I,j) ; enddo ; enddo
     do J=js-1,je ; do i=is,ie ; vhD(i,J,1) = -vhtot(i,J) ; enddo ; enddo
   else
-    if (use_EOS) then ; do j=js,je
-      do I=is-1,ie
-        pres_u(I) = 0.5*(pres(i,j,1) + pres(i+1,j,1))
-        T_u(I) = 0.5*(T(i,j,1) + T(i+1,j,1))
-        S_u(I) = 0.5*(S(i,j,1) + S(i+1,j,1))
-      enddo
-      call calculate_density_derivs(T_u, S_u, pres_u, drho_dT_u(:,j), &
-                   drho_dS_u(:,j), (is-IsdB+1)-1, ie-is+2, tv%eqn_of_state)
-    enddo ; endif
-    do j=js,je ; do I=is-1,ie
-      uhD(I,j,1) = -uhtot(I,j)
-
-      if (use_EOS) then
-        drdiB = drho_dT_u(I,j) * (T(i+1,j,1)-T(i,j,1)) + &
-                drho_dS_u(I,j) * (S(i+1,j,1)-S(i,j,1))
+!$OMP parallel do default(shared) private(pres_u,T_u,S_u,drho_dT_u,drho_dS_u,drdiB)
+    do j=js,je
+      if (use_EOS) then 
+        do I=is-1,ie
+          pres_u(I) = 0.5*(pres(i,j,1) + pres(i+1,j,1))
+          T_u(I) = 0.5*(T(i,j,1) + T(i+1,j,1))
+          S_u(I) = 0.5*(S(i,j,1) + S(i+1,j,1))
+        enddo
+        call calculate_density_derivs(T_u, S_u, pres_u, drho_dT_u, &
+                   drho_dS_u, (is-IsdB+1)-1, ie-is+2, tv%eqn_of_state)
       endif
-      Work_u(I,j) = Work_u(I,j) + G_scale * ( (uhD(I,j,1) * drdiB) * 0.25 * &
-          ((e(i,j,1) + e(i,j,2)) + (e(i+1,j,1) + e(i+1,j,2))) )
+      do I=is-1,ie
+        uhD(I,j,1) = -uhtot(I,j)
 
-    enddo ; enddo
+        if (use_EOS) then
+          drdiB = drho_dT_u(I) * (T(i+1,j,1)-T(i,j,1)) + &
+                  drho_dS_u(I) * (S(i+1,j,1)-S(i,j,1))
+        endif
+        Work_u(I,j) = Work_u(I,j) + G_scale * ( (uhD(I,j,1) * drdiB) * 0.25 * &
+            ((e(i,j,1) + e(i,j,2)) + (e(i+1,j,1) + e(i+1,j,2))) )
 
-    if (use_EOS) then ; do J=js-1,je
-      do i=is,ie
-        pres_v(i) = 0.5*(pres(i,j,1) + pres(i,j+1,1))
-        T_v(i) = 0.5*(T(i,j,1) + T(i,j+1,1))
-        S_v(i) = 0.5*(S(i,j,1) + S(i,j+1,1))
-      enddo
-      call calculate_density_derivs(T_v, S_v, pres_v, drho_dT_v(:,J), &
-                   drho_dS_v(:,J), is, ie-is+1, tv%eqn_of_state)
-    enddo ; endif
-    do J=js-1,je ; do i=is,ie
-      vhD(i,J,1) = -vhtot(i,J)
+      enddo 
+    enddo
 
+    do J=js-1,je
       if (use_EOS) then
-        drdjB = drho_dT_v(i,J) * (T(i,j+1,1)-T(i,j,1)) + &
-                drho_dS_v(i,J) * (S(i,j+1,1)-S(i,j,1))
+        do i=is,ie
+          pres_v(i) = 0.5*(pres(i,j,1) + pres(i,j+1,1))
+          T_v(i) = 0.5*(T(i,j,1) + T(i,j+1,1))
+          S_v(i) = 0.5*(S(i,j,1) + S(i,j+1,1))
+        enddo
+        call calculate_density_derivs(T_v, S_v, pres_v, drho_dT_v, &
+                   drho_dS_v, is, ie-is+1, tv%eqn_of_state)
       endif
-      Work_v(i,J) = Work_v(i,J) - G_scale * ( (vhD(i,J,1) * drdjB) * 0.25 * &
-          ((e(i,j,1) + e(i,j,2)) + (e(i,j+1,1) + e(i,j+1,2))) )
-    enddo ; enddo
+      do i=is,ie
+        vhD(i,J,1) = -vhtot(i,J)
+
+        if (use_EOS) then
+          drdjB = drho_dT_v(i) * (T(i,j+1,1)-T(i,j,1)) + &
+                  drho_dS_v(i) * (S(i,j+1,1)-S(i,j,1))
+        endif
+        Work_v(i,J) = Work_v(i,J) - G_scale * ( (vhD(i,J,1) * drdjB) * 0.25 * &
+            ((e(i,j,1) + e(i,j,2)) + (e(i,j+1,1) + e(i,j+1,2))) )
+      enddo 
+    enddo
   endif
 
   if (find_work) then ; do j=js,je ; do i=is,ie

From 93dd435d65b2df8e44d4d6e82cbab202388a0883 Mon Sep 17 00:00:00 2001
From: Zhi Liang <Zhi.Liang@noaa.gov>
Date: Mon, 14 Apr 2014 09:10:13 -0400
Subject: [PATCH 07/14] Add more OMP directives

---
 .../vertical/MOM_set_diffusivity.F90          | 36 +++++++++++++------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/src/parameterizations/vertical/MOM_set_diffusivity.F90 b/src/parameterizations/vertical/MOM_set_diffusivity.F90
index b2d332ec06..26c2db59d4 100644
--- a/src/parameterizations/vertical/MOM_set_diffusivity.F90
+++ b/src/parameterizations/vertical/MOM_set_diffusivity.F90
@@ -522,19 +522,27 @@ subroutine set_diffusivity(u, v, h, u_h, v_h, tv, fluxes, optics, visc, dt, G, C
 ! the appropriate place to add a depth-dependent parameterization or
 ! another explicit parameterization of Kd.
 
-  if (CS%Bryan_Lewis_diffusivity) then ; do j=js,je ; do i=is,ie
-    Kd_sfc(i,j) = CS%Kd_Bryan_Lewis_surface
-  enddo ; enddo ; else ; do j=js,je ; do i=is,ie
-    Kd_sfc(i,j) = CS%Kd
-  enddo ; enddo ; endif
+  if (CS%Bryan_Lewis_diffusivity) then 
+!$OMP parallel do default(shared)
+    do j=js,je ; do i=is,ie
+      Kd_sfc(i,j) = CS%Kd_Bryan_Lewis_surface
+    enddo ; enddo 
+  else 
+!$OMP parallel do default(shared)
+    do j=js,je ; do i=is,ie
+      Kd_sfc(i,j) = CS%Kd
+    enddo ; enddo 
+  endif
   if (CS%Henyey_IGW_background) then
     I_x30 = 2.0 / invcosh(CS%N0_2Omega*2.0) ! This is evaluated at 30 deg.
+!$OMP parallel do default(shared) private(abs_sin)
     do j=js,je ; do i=is,ie
       abs_sin = abs(sin(G%geoLatT(i,j)*deg_to_rad))
       Kd_sfc(i,j) = max(CS%Kd_min, Kd_sfc(i,j) * &
            ((abs_sin * invcosh(CS%N0_2Omega/max(epsilon,abs_sin))) * I_x30) )
     enddo ; enddo
   elseif (CS%Kd_tanh_lat_fn) then
+!$OMP parallel do default(shared) 
     do j=js,je ; do i=is,ie
       !   The transition latitude and latitude range are hard-scaled here, since
       ! this is not really intended for wide-spread use, but rather for
@@ -728,12 +736,18 @@ subroutine set_diffusivity(u, v, h, u_h, v_h, tv, fluxes, optics, visc, dt, G, C
   endif
 
   if (CS%Kd_add > 0.0) then
-    if (present(Kd_int)) then ; do k=1,nz ; do j=js,je ; do i=is,ie
-      Kd_int(i,j,K) = Kd_int(i,j,K) + CS%Kd_add
-      Kd(i,j,k) = Kd(i,j,k) + CS%Kd_add
-    enddo ; enddo ; enddo ; else ; do k=1,nz ; do j=js,je ; do i=is,ie
-      Kd(i,j,k) = Kd(i,j,k) + CS%Kd_add
-    enddo ; enddo ; enddo ; endif
+    if (present(Kd_int)) then 
+!$OMP parallel do default(shared)
+      do k=1,nz ; do j=js,je ; do i=is,ie
+        Kd_int(i,j,K) = Kd_int(i,j,K) + CS%Kd_add
+        Kd(i,j,k) = Kd(i,j,k) + CS%Kd_add
+      enddo ; enddo ; enddo 
+    else 
+!$OMP parallel do default(shared)
+      do k=1,nz ; do j=js,je ; do i=is,ie
+        Kd(i,j,k) = Kd(i,j,k) + CS%Kd_add
+      enddo ; enddo ; enddo 
+    endif
   endif
 
   if (CS%user_change_diff) then

From 3f6a0c38bd7b0f5ff4d94e100700c50fd4fb81a0 Mon Sep 17 00:00:00 2001
From: Zhi Liang <Zhi.Liang@noaa.gov>
Date: Mon, 14 Apr 2014 09:52:55 -0400
Subject: [PATCH 08/14] implement openmp

---
 .../vertical/MOM_diabatic_driver.F90          | 107 +++++++++++++-----
 1 file changed, 79 insertions(+), 28 deletions(-)

diff --git a/src/parameterizations/vertical/MOM_diabatic_driver.F90 b/src/parameterizations/vertical/MOM_diabatic_driver.F90
index 9d5c615198..b53b3c0c33 100644
--- a/src/parameterizations/vertical/MOM_diabatic_driver.F90
+++ b/src/parameterizations/vertical/MOM_diabatic_driver.F90
@@ -351,6 +351,7 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
   if (CS%debugConservation) call MOM_state_stats('1st make_frazil', u, v, h, T, S, G)
 
   if ((CS%ML_mix_first > 0.0) .or. CS%use_geothermal) then
+!$OMP parallel do default(shared)
     do k=1,nz ; do j=js,je ; do i=is,ie
       h_orig(i,j,k) = h(i,j,k) ; eaml(i,j,k) = 0.0 ; ebml(i,j,k) = 0.0
     enddo ; enddo ; enddo
@@ -368,7 +369,7 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
   ! biological properties and layer thicknesses.
   if (associated(CS%optics)) &
     call set_opacity(CS%optics, fluxes, G, CS%opacity_CSp)
-  
+
   if (CS%bulkmixedlayer) then
     if (CS%ML_mix_first > 0.0) then
 !  This subroutine (1)  Cools the mixed layer.
@@ -460,28 +461,56 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
     ! Thus, KPP is the last contribution to Kd.
     ! Changes: Kd_int. Sets: KPP_NLTheat, KPP_NLTscalar
     if (associated(visc%Kd_turb) .and. CS%matchKPPwithoutKappaShear) then
-      Kd_salt(is:ie,js:je,:) = Kd_int(is:ie,js:je,:) - visc%Kd_turb(is:ie,js:je,:) ! Temporarily remove part due to Kappa-shear
-      Kd_heat(is:ie,js:je,:) = Kd_int(is:ie,js:je,:) - visc%Kd_turb(is:ie,js:je,:) ! Temporarily remove part due to Kappa-shear
+!$OMP parallel do default(shared)
+      do k=1,nz+1 ; do j=js,je ; do i=is,ie
+        Kd_salt(i,j,k) = Kd_int(i,j,k) - visc%Kd_turb(i,j,k) ! Temporarily remove part due to Kappa-shear
+        Kd_heat(i,j,k) = Kd_int(i,j,k) - visc%Kd_turb(i,j,k) ! Temporarily remove part due to Kappa-shear
+      enddo ; enddo ; enddo
     else
-      Kd_salt(is:ie,js:je,:) = Kd_int(is:ie,js:je,:)
-      Kd_heat(is:ie,js:je,:) = Kd_int(is:ie,js:je,:)
+!$OMP parallel do default(shared)
+      do k=1,nz+1 ; do j=js,je ; do i=is,ie
+        Kd_salt(i,j,k) = Kd_int(i,j,k)
+        Kd_heat(i,j,k) = Kd_int(i,j,k)
+      enddo ; enddo ; enddo
+    endif
+    if (associated(visc%Kd_extra_S)) then
+!$OMP parallel do default(shared)
+      do k=1,nz+1 ; do j=js,je ; do i=is,ie
+        Kd_salt(i,j,k) = Kd_salt(i,j,k) + visc%Kd_extra_S(i,j,k)
+      enddo ; enddo ; enddo
+    endif
+    if (associated(visc%Kd_extra_T)) then
+!$OMP parallel do default(shared)
+      do k=1,nz+1 ; do j=js,je ; do i=is,ie
+        Kd_heat(i,j,k) = Kd_heat(i,j,k) + visc%Kd_extra_T(i,j,k)
+      enddo ; enddo ; enddo
     endif
-    if (associated(visc%Kd_extra_S)) &
-        Kd_salt(is:ie,js:je,:) = Kd_salt(is:ie,js:je,:) + visc%Kd_extra_S(is:ie,js:je,:)
-    if (associated(visc%Kd_extra_T)) &
-        Kd_heat(is:ie,js:je,:) = Kd_heat(is:ie,js:je,:) + visc%Kd_extra_T(is:ie,js:je,:)
     call KPP_calculate(CS%KPP_CSp, G, h, tv%T, tv%S, u, v, tv%eqn_of_state, &
            fluxes%ustar, CS%buoyancyFlux, Kd_heat, Kd_salt, visc%Kv_turb, CS%KPP_NLTheat, CS%KPP_NLTscalar)
     if (.not. CS%KPPisPassive) then
       if (associated(visc%Kd_turb) .and. CS%matchKPPwithoutKappaShear) then
-        Kd_salt(is:ie,js:je,:) = ( Kd_salt(is:ie,js:je,:) + visc%Kd_turb(is:ie,js:je,:) )  ! Put back part due to Kappa-shear
-        Kd_heat(is:ie,js:je,:) = ( Kd_heat(is:ie,js:je,:) + visc%Kd_turb(is:ie,js:je,:) )  ! Put back part due to Kappa-shear
+!$OMP parallel do default(shared)
+        do k=1,nz+1 ; do j=js,je ; do i=is,ie
+          Kd_salt(i,j,k) = ( Kd_salt(i,j,k) + visc%Kd_turb(i,j,k) )  ! Put back part due to Kappa-shear
+          Kd_heat(i,j,k) = ( Kd_heat(i,j,k) + visc%Kd_turb(i,j,k) )  ! Put back part due to Kappa-shear
+        enddo ; enddo ; enddo
+      endif
+!$OMP parallel do default(shared)
+      do k=1,nz+1 ; do j=js,je ; do i=is,ie
+        Kd_int(i,j,k) = min( Kd_salt(i,j,k),  Kd_heat(i,j,k) )
+      enddo ; enddo ; enddo
+      if (associated(visc%Kd_extra_S)) then
+!$OMP parallel do default(shared)
+        do k=1,nz+1 ; do j=js,je ; do i=is,ie
+          visc%Kd_extra_S(i,j,k) = Kd_salt(i,j,k) - Kd_int(i,j,k)
+        enddo ; enddo ; enddo
+      endif
+      if (associated(visc%Kd_extra_T)) then
+!$OMP parallel do default(shared)
+        do k=1,nz+1 ; do j=js,je ; do i=is,ie
+          visc%Kd_extra_T(i,j,k) = Kd_heat(i,j,k) - Kd_int(i,j,k)
+        enddo ; enddo ; enddo
       endif
-      Kd_int(is:ie,js:je,:) = min( Kd_salt(is:ie,js:je,:),  Kd_heat(is:ie,js:je,:) )
-      if (associated(visc%Kd_extra_S)) &
-          visc%Kd_extra_S(is:ie,js:je,:) = Kd_salt(is:ie,js:je,:) - Kd_int(is:ie,js:je,:)
-      if (associated(visc%Kd_extra_T)) &
-          visc%Kd_extra_T(is:ie,js:je,:) = Kd_heat(is:ie,js:je,:) - Kd_int(is:ie,js:je,:)
     endif ! not passive
     call cpu_clock_end(id_clock_kpp)
     if (showCallTree) call callTree_waypoint("done with KPP_calculate (diabatic)")
@@ -531,6 +560,7 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
     do j=js,je ; do i=is,ie
       ea(i,j,1) = 0.
     enddo ; enddo
+!$OMP parallel do default(shared) private(hval)
     do k=2,nz ; do j=js,je ; do i=is,ie
       hval=1.0/(h_neglect + 0.5*(h(i,j,k-1) + h(i,j,k)))
       ea(i,j,k) = (G%m_to_H**2) * dt * hval * Kd_int(i,j,k)
@@ -583,6 +613,8 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
   ! enough iterations are permitted in Calculate_Entrainment, and
   ! even if too few iterations are allowed, it is still guarded
   ! against.  In other words the checks are probably unnecessary.
+
+!$OMP parallel do default(shared)
   do j=js,je
     do i=is,ie
       hold(i,j,1) = h(i,j,1)
@@ -618,6 +650,7 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
       ! are lighter than the mixed layer have temperatures and salinities
       ! that correspond to their prescribed densities.
       if (CS%massless_match_targets) then
+!$OMP parallel do default (shared) private(h_tr,b1,d1,c1,b_denom_1)
         do j=js,je
           do i=is,ie
             h_tr = hold(i,j,1) + h_neglect
@@ -693,6 +726,7 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
     if ((CS%ML_mix_first > 0.0) .or. CS%use_geothermal) then
       ! The mixed layer code has already been called, but there is some needed
       ! bookkeeping.
+!$OMP parallel do default(shared)
       do k=1,nz ; do j=js,je ; do i=is,ie
         hold(i,j,k) = h_orig(i,j,k)
         ea(i,j,k) = ea(i,j,k) + eaml(i,j,k)
@@ -769,6 +803,7 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
       Tdif_flx(i,j,1) = 0.0 ; Tdif_flx(i,j,nz+1) = 0.0
       Tadv_flx(i,j,1) = 0.0 ; Tadv_flx(i,j,nz+1) = 0.0
     enddo ; enddo
+!$OMP parallel do default(shared)
     do K=2,nz ; do j=js,je ; do i=is,ie
       Tdif_flx(i,j,K) = (Idt * 0.5*(ea(i,j,k) + eb(i,j,k-1))) * &
                         (T(i,j,k-1) - T(i,j,k))
@@ -782,6 +817,7 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
       Sdif_flx(i,j,1) = 0.0 ; Sdif_flx(i,j,nz+1) = 0.0
       Sadv_flx(i,j,1) = 0.0 ; Sadv_flx(i,j,nz+1) = 0.0
     enddo ; enddo
+!$OMP parallel do default(shared)
     do K=2,nz ; do j=js,je ; do i=is,ie
       Sdif_flx(i,j,K) = (Idt * 0.5*(ea(i,j,k) + eb(i,j,k-1))) * &
                         (S(i,j,k-1) - S(i,j,k))
@@ -793,7 +829,7 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
   call cpu_clock_begin(id_clock_tracers)
   if (CS%mix_boundary_tracers) then
     Tr_ea_BBL = sqrt(dt*CS%Kd_BBL_tr)
-
+!$OMP parallel do default(shared) private(htot,in_boundary,add_ent)
     do j=js,je
       do i=is,ie
         ebtr(i,j,nz) = eb(i,j,nz)
@@ -843,6 +879,7 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
     do j=js,je ; do i=is,ie
       ebtr(i,j,nz) = eb(i,j,nz) ; eatr(i,j,1) = ea(i,j,1)
     enddo ; enddo
+!$OMP parallel do default(shared) private(add_ent)
     do k=nz,2,-1 ; do j=js,je ; do i=is,ie
       if (visc%Kd_extra_S(i,j,k) > 0.0) then
         add_ent = ((dt * visc%Kd_extra_S(i,j,k)) * G%m_to_H**2) / &
@@ -866,6 +903,7 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
     call cpu_clock_begin(id_clock_sponge)
     if (CS%bulkmixedlayer .and. ASSOCIATED(tv%eqn_of_state)) then
       do i=is,ie ; p_ref_cv(i) = tv%P_Ref ; enddo
+!$OMP parallel do default(shared)
       do j=js,je
         call calculate_density(T(:,j,1), S(:,j,1), p_ref_cv, Rcv_ml(:,j), &
                                is, ie-is+1, tv%eqn_of_state)
@@ -881,21 +919,26 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
     endif
   endif
 
+!$OMP parallel default(shared)
 !   Save the diapycnal mass fluxes as a diagnostic field.
   if (ASSOCIATED(CDp%diapyc_vel)) then
-    do K=2,nz ; do j=js,je ; do i=is,ie
-      CDp%diapyc_vel(i,j,K) = Idt * (G%H_to_m * (ea(i,j,k) - eb(i,j,k-1)))
-    enddo ; enddo ; enddo
-    do j=js,je ; do i=is,ie
-      CDp%diapyc_vel(i,j,1) = 0.0
-      CDp%diapyc_vel(i,j,nz+1) = 0.0
-    enddo ; enddo
+!$OMP do
+    do j=js,je
+      do K=2,nz ;  do i=is,ie
+        CDp%diapyc_vel(i,j,K) = Idt * (G%H_to_m * (ea(i,j,k) - eb(i,j,k-1)))
+      enddo ; enddo 
+      do i=is,ie
+        CDp%diapyc_vel(i,j,1) = 0.0
+        CDp%diapyc_vel(i,j,nz+1) = 0.0
+      enddo
+    enddo
   endif
 
 !   For momentum, it is only the net flux that homogenizes within
 !  the mixed layer.  Vertical viscosity that is proportional to the
 !  mixed layer turbulence is applied elsewhere.
   if (CS%bulkmixedlayer) then
+!$OMP do
     do k=2,G%nkml ; do j=js,je ; do i=is,ie
       if (ea(i,j,k) >= eb(i,j,k-1)) then
         ea(i,j,k) = ea(i,j,k) - eb(i,j,k-1)
@@ -909,6 +952,7 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
 
 
 !   This initializes the halo regions of ea, eb, and hold to default values.
+!$OMP do
   do k=1,nz
     do i=is-1,ie+1
       hold(i,js-1,k) = G%Angstrom ; ea(i,js-1,k) = 0.0 ; eb(i,js-1,k) = 0.0
@@ -919,7 +963,8 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
       hold(ie+1,j,k) = G%Angstrom ; ea(ie+1,j,k) = 0.0 ; eb(ie+1,j,k) = 0.0
     enddo
   enddo
-  
+!$OMP end parallel
+
   call cpu_clock_begin(id_clock_pass)
   if (G%symmetric) then
     call pass_var(hold,G%Domain,complete=.false.)
@@ -940,6 +985,7 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
     call MOM_state_chksum("before u/v tridiag ", u, v, h, G)
   endif
   call cpu_clock_begin(id_clock_tridiag)
+!$OMP parallel do default(shared) private(hval,b1,d1,c1,eaval)
   do j=js,je
     do I=Isq,Ieq
       if (ASSOCIATED(ADp%du_dt_dia)) ADp%du_dt_dia(I,j,1) = u(I,j,1)
@@ -971,6 +1017,7 @@ subroutine diabatic(u, v, h, tv, fluxes, visc, ADp, CDp, dt, G, CS)
   if (CS%debug) then
     call MOM_state_chksum("aft 1st loop tridiag ", u, v, h, G)
   endif
+!$OMP parallel do default(shared) private(hval,b1,d1,c1,eaval)
   do J=Jsq,Jeq
     do i=is,ie
       if (ASSOCIATED(ADp%dv_dt_dia)) ADp%dv_dt_dia(i,J,1) = v(i,J,1)
@@ -1112,7 +1159,8 @@ subroutine make_frazil(h, tv, G, CS)
   if (.not.CS%pressure_dependent_frazil) then
     do k=1,nz ; do i=is,ie ; pressure(i,k) = 0.0 ; enddo ; enddo
   endif
-
+!$OMP parallel do default(shared) firstprivate(pressure) &
+!$OMP                                  private(fraz_col,T_fr_set,T_freeze,hc)
   do j=js,je
     do i=is,ie ; fraz_col(:) = 0.0 ; enddo
 
@@ -1239,6 +1287,8 @@ subroutine differential_diffuse_T_S(h, tv, visc, dt, G)
 
   T => tv%T ; S => tv%S
   Kd_T => visc%Kd_extra_T ; Kd_S => visc%Kd_extra_S
+!$OMP parallel do default(shared) private(I_h_int,mix_T,mix_S,h_tr,b1_T,b1_S, &
+!$OMP                                     d1_T,d1_S,c1_T,c1_S,b_denom_T,b_denom_S)
   do j=js,je
     do i=is,ie
       I_h_int = 1.0 / (0.5 * (h(i,j,1) + h(i,j,2)) + h_neglect)
@@ -1320,7 +1370,8 @@ subroutine adjust_salt(h, tv, G, CS)
   
   salt_add_col(:,:) = 0.0
 
-  do k=nz,1,-1 ; do j=js,je ; do i=is,ie
+  
+  do j=js,je ; do k=nz,1,-1 ; do i=is,ie
     if ((G%mask2dT(i,j) > 0.0) .and. &
          ((tv%S(i,j,k) < S_min) .or. (salt_add_col(i,j) > 0.0))) then
       mc = G%H_to_kg_m2 * h(i,j,k)
@@ -1752,7 +1803,7 @@ subroutine find_uv_at_h(u, v, h, u_h, v_h, G, ea, eb)
   if (present(ea) .neqv. present(eb)) call MOM_error(FATAL, &
       "find_uv_at_h: Either both ea and eb or neither one must be present "// &
       "in call to find_uv_at_h.")
-
+!$OMP parallel do default(shared) private(s,Idenom,a_w,a_e,a_s,a_n,b_denom_1,b1,d1,c1)
   do j=js,je
     do i=is,ie
       s = G%areaCu(i-1,j)+G%areaCu(i,j)

From 662d47ea5766bf399f05567a207b9a5cd3fca8fa Mon Sep 17 00:00:00 2001
From: Zhi Liang <Zhi.Liang@noaa.gov>
Date: Mon, 14 Apr 2014 09:54:58 -0400
Subject: [PATCH 09/14] implement openmp

---
 .../vertical/MOM_opacity.F90                  | 156 ++++++++++--------
 1 file changed, 88 insertions(+), 68 deletions(-)

diff --git a/src/parameterizations/vertical/MOM_opacity.F90 b/src/parameterizations/vertical/MOM_opacity.F90
index 7b7df16acd..d7d6d871ef 100644
--- a/src/parameterizations/vertical/MOM_opacity.F90
+++ b/src/parameterizations/vertical/MOM_opacity.F90
@@ -152,15 +152,17 @@ subroutine set_opacity(optics, fluxes, G, CS)
 
     ! Make sure there is no division by 0.
     inv_sw_pen_scale = 1.0 / max(CS%pen_sw_scale, 0.1*G%Angstrom_z, G%H_to_m*G%H_subroundoff)
-
+!$OMP parallel do default(shared)
     do k=1,nz ; do j=js,je ; do i=is,ie  ; do n=1,optics%nbands
       optics%opacity_band(n,i,j,k) = inv_sw_pen_scale
     enddo ; enddo ; enddo ; enddo
     if (.not.associated(fluxes%sw) .or. (CS%pen_SW_scale <= 0.0)) then
+!$OMP parallel do default(shared)
       do j=js,je ; do i=is,ie ; do n=1,optics%nbands
         optics%sw_pen_band(n,i,j) = 0.0
       enddo ; enddo ; enddo
     else
+!$OMP parallel do default(shared)
       do j=js,je ; do i=is,ie ; do n=1,optics%nbands
         optics%sw_pen_band(n,i,j) = CS%pen_SW_frac * Inv_nbands * fluxes%sw(i,j)
       enddo ; enddo ; enddo
@@ -169,6 +171,7 @@ subroutine set_opacity(optics, fluxes, G, CS)
   
   if (query_averaging_enabled(CS%diag)) then
     if (CS%id_sw_pen > 0) then
+!$OMP parallel do default(shared)
       do j=js,je ; do i=is,ie
         Pen_SW_tot(i,j) = 0.0
         do n=1,optics%nbands
@@ -179,6 +182,7 @@ subroutine set_opacity(optics, fluxes, G, CS)
     endif
     if (CS%id_sw_vis_pen > 0) then
       if (CS%opacity_scheme == MANIZZA_05) then
+!$OMP parallel do default(shared)
         do j=js,je ; do i=is,ie
           Pen_SW_tot(i,j) = 0.0
           do n=1,min(optics%nbands,2)
@@ -186,6 +190,7 @@ subroutine set_opacity(optics, fluxes, G, CS)
           enddo
         enddo ; enddo
       else
+!$OMP parallel do default(shared)
         do j=js,je ; do i=is,ie
           Pen_SW_tot(i,j) = 0.0
           do n=1,optics%nbands
@@ -196,6 +201,7 @@ subroutine set_opacity(optics, fluxes, G, CS)
       call post_data(CS%id_sw_vis_pen, Pen_SW_tot, CS%diag)
     endif
     do n=1,optics%nbands ; if (CS%id_opacity(n) > 0) then
+!$OMP parallel do default(shared)
       do k=1,nz ; do j=js,je ; do i=is,ie
         tmp(i,j,k) = optics%opacity_band(n,i,j,k)
       enddo ; enddo ; enddo
@@ -265,62 +271,96 @@ subroutine opacity_from_chl(optics, fluxes, G, CS, chl_in)
   multiband_nir_input = (associated(fluxes%sw_nir_dir) .and. &
                          associated(fluxes%sw_nir_dif))
 
-  do k=1,nz
-
-    if (present(chl_in)) then
-      do j=js,je ; do i=is,ie ; chl_data(i,j) = chl_in(i,j,k) ; enddo ; enddo
-      do j=js,je ; do i=is,ie
-        if ((G%mask2dT(i,j) > 0.5) .and. (chl_data(i,j) < 0.0)) then
+  chl_data(:,:) = 0.0
+  if(present(chl_in)) then
+    do j=js,je ; do i=is,ie ; chl_data(i,j) = chl_in(i,j,1) ; enddo ; enddo
+    do k=1,nz; do j=js,je ; do i=is,ie
+      if ((G%mask2dT(i,j) > 0.5) .and. (chl_in(i,j,k) < 0.0)) then
           write(mesg,'(" Negative chl_in of ",(1pe12.4)," found at i,j,k = ", &
                     & 3(1x,i3), " lon/lat = ",(1pe12.4)," E ", (1pe12.4), " N.")') &
                      chl_data(i,j), i, j, k, G%geoLonT(i,j), G%geoLatT(i,j)
           call MOM_error(FATAL,"MOM_opacity opacity_from_chl: "//trim(mesg))
-        endif
-      enddo ; enddo
-    elseif (k==1) then
-      ! Only the 2-d surface chlorophyll can be read in from a file.  The
-      ! same value is assumed for all layers.
-      call get_time(CS%Time,seconds,days)
-      do j=js,je ; do i=is,ie ; chl_data(i,j) = 0.0 ; enddo ; enddo
-      call time_interp_external(CS%sbc_chl, CS%Time, chl_data)
+      endif
+    enddo; enddo; enddo
+  else
+    ! Only the 2-d surface chlorophyll can be read in from a file.  The
+    ! same value is assumed for all layers.
+    call get_time(CS%Time,seconds,days)
+    call time_interp_external(CS%sbc_chl, CS%Time, chl_data)
+    do j=js,je ; do i=is,ie
+      if ((G%mask2dT(i,j) > 0.5) .and. (chl_data(i,j) < 0.0)) then
+        write(mesg,'(" Time_interp negative chl of ",(1pe12.4)," at i,j = ",&
+                  & 2(i3), "lon/lat = ",(1pe12.4)," E ", (1pe12.4), " N.")') &
+                   chl_data(i,j), i, j, G%geoLonT(i,j), G%geoLatT(i,j)
+        call MOM_error(FATAL,"MOM_opacity opacity_from_chl: "//trim(mesg))
+      endif
+    enddo ; enddo
+  endif
+
+  if (CS%id_chl > 0) then
+    if(present(chl_in)) then
+      call post_data(CS%id_chl, chl_in(:,:,1), CS%diag)
+    else
+      call post_data(CS%id_chl, chl_data, CS%diag)
+    endif
+  endif
+
+  select case (CS%opacity_scheme)
+    case (MANIZZA_05)
+!$OMP parallel do default(shared) private(SW_vis_tot,SW_nir_tot)
       do j=js,je ; do i=is,ie
-        if ((G%mask2dT(i,j) > 0.5) .and. (chl_data(i,j) < 0.0)) then
-          write(mesg,'(" Time_interp negative chl of ",(1pe12.4)," at i,j = ",&
-                    & 2(i3), "lon/lat = ",(1pe12.4)," E ", (1pe12.4), " N.")') &
-                     chl_data(i,j), i, j, G%geoLonT(i,j), G%geoLatT(i,j)
-          call MOM_error(FATAL,"MOM_opacity opacity_from_chl: "//trim(mesg))
+        SW_vis_tot = 0.0 ; SW_nir_tot = 0.0
+        if (G%mask2dT(i,j) > 0.5) then
+          if (multiband_vis_input) then
+            SW_vis_tot = fluxes%sw_vis_dir(i,j) + fluxes%sw_vis_dif(i,j)
+          else  ! Follow Manizza 05 in assuming that 42% of SW is visible.
+            SW_vis_tot = 0.42 * fluxes%sw(i,j)
+          endif
+          if (multiband_nir_input) then
+            SW_nir_tot = fluxes%sw_nir_dir(i,j) + fluxes%sw_nir_dif(i,j)
+          else
+            SW_nir_tot = fluxes%sw(i,j) - SW_vis_tot
+          endif
         endif
+
+        ! Band 1 is Manizza blue.
+        optics%sw_pen_band(1,i,j) = CS%blue_frac*SW_vis_tot
+        ! Band 2 (if used) is Manizza red.
+        if (nbands > 1) &
+          optics%sw_pen_band(2,i,j) = (1.0-CS%blue_frac)*SW_vis_tot
+        ! All remaining bands are NIR, for lack of something better to do.
+        do n=3,nbands
+          optics%sw_pen_band(n,i,j) = Inv_nbands_nir * SW_nir_tot
+        enddo
+      enddo ; enddo 
+    case (MOREL_88)
+!$OMP parallel do default(shared) private(SW_pen_tot)
+      do j=js,je ; do i=is,ie
+        SW_pen_tot = 0.0
+        if (G%mask2dT(i,j) > 0.5) then ; if (multiband_vis_input) then
+            SW_pen_tot = SW_pen_frac_morel(chl_data(i,j)) * &
+                (fluxes%sw_vis_dir(i,j) + fluxes%sw_vis_dif(i,j))
+          else
+            SW_pen_tot = SW_pen_frac_morel(chl_data(i,j)) * &
+                0.5*fluxes%sw(i,j)
+        endif ; endif
+
+        do n=1,nbands
+          optics%sw_pen_band(n,i,j) = Inv_nbands*SW_pen_tot
+        enddo
       enddo ; enddo
+    case default
+        call MOM_error(FATAL, "opacity_from_chl: CS%opacity_scheme is not valid.")
+    end select
+
+!$OMP parallel do default(shared) firstprivate(chl_data)
+  do k=1,nz
+    if (present(chl_in)) then
+      do j=js,je ; do i=is,ie ; chl_data(i,j) = chl_in(i,j,k) ; enddo ; enddo
     endif
 
     select case (CS%opacity_scheme)
       case (MANIZZA_05)
-        if (k==1) then ; do j=js,je ; do i=is,ie
-          SW_vis_tot = 0.0 ; SW_nir_tot = 0.0
-          if (G%mask2dT(i,j) > 0.5) then
-            if (multiband_vis_input) then
-              SW_vis_tot = fluxes%sw_vis_dir(i,j) + fluxes%sw_vis_dif(i,j)
-            else  ! Follow Manizza 05 in assuming that 42% of SW is visible.
-              SW_vis_tot = 0.42 * fluxes%sw(i,j)
-            endif
-            if (multiband_nir_input) then
-              SW_nir_tot = fluxes%sw_nir_dir(i,j) + fluxes%sw_nir_dif(i,j)
-            else
-              SW_nir_tot = fluxes%sw(i,j) - SW_vis_tot
-            endif
-          endif
-
-          ! Band 1 is Manizza blue.
-          optics%sw_pen_band(1,i,j) = CS%blue_frac*SW_vis_tot
-          ! Band 2 (if used) is Manizza red.
-          if (nbands > 1) &
-            optics%sw_pen_band(2,i,j) = (1.0-CS%blue_frac)*SW_vis_tot
-          ! All remaining bands are NIR, for lack of something better to do.
-          do n=3,nbands
-            optics%sw_pen_band(n,i,j) = Inv_nbands_nir * SW_nir_tot
-          enddo
-        enddo ; enddo ; endif
-
         do j=js,je ; do i=is,ie
           if (G%mask2dT(i,j) <= 0.5) then
             do n=1,optics%nbands
@@ -335,25 +375,7 @@ subroutine opacity_from_chl(optics, fluxes, G, CS, chl_in)
             do n=3,nbands ; optics%opacity_band(n,i,j,k) = 2.86 ; enddo
           endif
         enddo ; enddo
-
       case (MOREL_88)
-        if (k==1) then ! Set up the surface fluxes.
-          do j=js,je ; do i=is,ie
-            SW_pen_tot = 0.0
-            if (G%mask2dT(i,j) > 0.5) then ; if (multiband_vis_input) then
-                SW_pen_tot = SW_pen_frac_morel(chl_data(i,j)) * &
-                    (fluxes%sw_vis_dir(i,j) + fluxes%sw_vis_dif(i,j))
-              else
-                SW_pen_tot = SW_pen_frac_morel(chl_data(i,j)) * &
-                    0.5*fluxes%sw(i,j)
-            endif ; endif
-
-            do n=1,optics%nbands
-              optics%sw_pen_band(n,i,j) = Inv_nbands*SW_pen_tot
-            enddo
-          enddo ; enddo
-        endif
-
         do j=js,je ; do i=is,ie
           optics%opacity_band(1,i,j,k) = CS%opacity_land_value
           if (G%mask2dT(i,j) > 0.5) &
@@ -367,11 +389,9 @@ subroutine opacity_from_chl(optics, fluxes, G, CS, chl_in)
       case default
         call MOM_error(FATAL, "opacity_from_chl: CS%opacity_scheme is not valid.")
     end select
-
-    if ((k==1) .and. (CS%id_chl > 0)) then
-      call post_data(CS%id_chl, chl_data, CS%diag)
-    endif
   enddo
+
+
 end subroutine opacity_from_chl
 
 function opacity_morel(chl_data)

From b68270e4d06f84d7cd6e2410850f170d2621eb92 Mon Sep 17 00:00:00 2001
From: Zhi Liang <Zhi.Liang@noaa.gov>
Date: Mon, 14 Apr 2014 09:55:48 -0400
Subject: [PATCH 10/14] Add more OMP directives

---
 src/parameterizations/lateral/MOM_thickness_diffuse.F90 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/parameterizations/lateral/MOM_thickness_diffuse.F90 b/src/parameterizations/lateral/MOM_thickness_diffuse.F90
index 513fe441d4..65cd525aca 100644
--- a/src/parameterizations/lateral/MOM_thickness_diffuse.F90
+++ b/src/parameterizations/lateral/MOM_thickness_diffuse.F90
@@ -1399,10 +1399,12 @@ subroutine vert_fill_TS(h, T_in, S_in, kappa, dt, T_f, S_f, G, halo_here)
   h0 = 1.0e-16*sqrt(kappa*dt)*G%m_to_H
 
   if (kap_dt_x2 <= 0.0) then
+!$OMP parallel do default(shared)
     do k=1,nz ; do j=js,je ; do i=is,ie
       T_f(i,j,k) = T_in(i,j,k) ; S_f(i,j,k) = S_in(i,j,k)
     enddo ; enddo ; enddo
   else
+!$OMP parallel do default(shared) private(ent,b1,d1,c1)
     do j=js,je
       do i=is,ie
         ent(i,2) = kap_dt_x2 / ((h(i,j,1)+h(i,j,2)) + h0)

From b3a9a2fd0f8a72c0469f59d4325e3cb643ec8365 Mon Sep 17 00:00:00 2001
From: Zhi Liang <Zhi.Liang@noaa.gov>
Date: Mon, 14 Apr 2014 09:56:57 -0400
Subject: [PATCH 11/14] implement openmp

---
 .../lateral/MOM_mixed_layer_restrat.F90       | 99 ++++++++++---------
 1 file changed, 53 insertions(+), 46 deletions(-)

diff --git a/src/parameterizations/lateral/MOM_mixed_layer_restrat.F90 b/src/parameterizations/lateral/MOM_mixed_layer_restrat.F90
index d57b61a14b..e98b821245 100644
--- a/src/parameterizations/lateral/MOM_mixed_layer_restrat.F90
+++ b/src/parameterizations/lateral/MOM_mixed_layer_restrat.F90
@@ -175,7 +175,9 @@ subroutine mixedlayer_restrat(h, uhtr, vhtr, tv, fluxes, dt, G, CS)
   ! Fix this later for G%nkml >= 3.
 
   p0(:) = 0.0
-
+!$OMP parallel default(shared) private(Rho0,h_vel,u_star,absf,mom_mixrate,timescale,uDml, &
+!$OMP                                  I2htot,z_topx2,hx2,a,vDml)
+!$OMP do
   do j=js-1,je+1
     do i=is-1,ie+1
       htot(i,j) = 0.0 ; Rml_av(i,j) = 0.0
@@ -198,55 +200,58 @@ subroutine mixedlayer_restrat(h, uhtr, vhtr, tv, fluxes, dt, G, CS)
 !   1. Mixing extends below the mixing layer to the mixed layer.  Find it!
 !   2. Add exponential tail to streamfunction?
 
-  do j=js,je ; do i=is,ie ; utimescale_diag(i,j) = 0.0 ; enddo ; enddo
-  do j=js,je ; do i=is,ie ; vtimescale_diag(i,j) = 0.0 ; enddo ; enddo
-
 !   U - Component
-  do j=js,je ; do I=is-1,ie
-    h_vel = 0.5*(htot(i,j) + htot(i+1,j)) * G%H_to_m
-
-    u_star = 0.5*(fluxes%ustar(i,j) + fluxes%ustar(i+1,j))
-    absf = 0.5*(abs(G%CoriolisBu(I,J-1)) + abs(G%CoriolisBu(I,J)))
-    ! peak ML visc: u_star * 0.41 * (h_ml*u_star)/(absf*h_ml + 4.0*u_star)
-    ! momentum mixing rate: pi^2*visc/h_ml^2 
-    ! 0.41 is the von Karmen constant, 9.8696 = pi^2.
-    mom_mixrate = (0.41*9.8696)*u_star**2 / &
-                  (absf*h_vel**2 + 4.0*(h_vel+dz_neglect)*u_star)
-    timescale = 0.0625 * (absf + 2.0*mom_mixrate) / (absf**2 + mom_mixrate**2)
-
-    timescale = timescale * CS%ml_restrat_coef
+!$OMP do  
+  do j=js,je 
+    do i=is,ie ; utimescale_diag(i,j) = 0.0 ; enddo
+    do i=is,ie ; vtimescale_diag(i,j) = 0.0 ; enddo
+    do I=is-1,ie
+      h_vel = 0.5*(htot(i,j) + htot(i+1,j)) * G%H_to_m
+
+      u_star = 0.5*(fluxes%ustar(i,j) + fluxes%ustar(i+1,j))
+      absf = 0.5*(abs(G%CoriolisBu(I,J-1)) + abs(G%CoriolisBu(I,J)))
+      ! peak ML visc: u_star * 0.41 * (h_ml*u_star)/(absf*h_ml + 4.0*u_star)
+      ! momentum mixing rate: pi^2*visc/h_ml^2 
+      ! 0.41 is the von Karmen constant, 9.8696 = pi^2.
+      mom_mixrate = (0.41*9.8696)*u_star**2 / &
+                    (absf*h_vel**2 + 4.0*(h_vel+dz_neglect)*u_star)
+      timescale = 0.0625 * (absf + 2.0*mom_mixrate) / (absf**2 + mom_mixrate**2)
+
+      timescale = timescale * CS%ml_restrat_coef
 !        timescale = timescale*(2?)*(L_def/L_MLI)*min(EKE/MKE,1.0 + G%dyCv(i,j)**2/L_def**2))
 
-    utimescale_diag(I,j) = timescale
-
-    uDml(I) = timescale * G%mask2dCu(I,j)*G%dyCu(I,j)* &
-        G%IdxCu(I,j)*(Rml_av(i+1,j)-Rml_av(i,j)) * (h_vel**2 * G%m_to_H)
-
-    if (uDml(i) == 0) then
-      do k=1,G%nkml ; uhml(I,j,k) = 0.0 ; enddo
-    else
-      I2htot = 1.0 / (htot(i,j) + htot(i+1,j) + h_neglect)
-      z_topx2 = 0.0
-      ! a(k) relates the sublayer transport to uDml with a linear profile.
-      ! The sum of a through the mixed layers must be 0.
-      do k=1,G%nkml
-        hx2 = (h(i,j,k) + h(i+1,j,k) + h_neglect)
-        a(k) = (hx2 * I2htot) * (2.0 - 4.0*(z_topx2+0.5*hx2)*I2htot)
-        z_topx2 = z_topx2 + hx2
-        if (a(k)*uDml(I) > 0.0) then
-          if (a(k)*uDml(I) > h_avail(i,j,k)) uDml(I) = h_avail(i,j,k) / a(k)
-        else
-          if (-a(k)*uDml(I) > h_avail(i+1,j,k)) uDml(I) = -h_avail(i+1,j,k)/a(k)
-        endif
-      enddo
-      do k=1,G%nkml
-        uhml(I,j,k) = a(k)*uDml(I)
-        uhtr(I,j,k) = uhtr(I,j,k) + uhml(I,j,k)*dt
-      enddo
-    endif
-  enddo ; enddo
+      utimescale_diag(I,j) = timescale
+
+      uDml(I) = timescale * G%mask2dCu(I,j)*G%dyCu(I,j)* &
+          G%IdxCu(I,j)*(Rml_av(i+1,j)-Rml_av(i,j)) * (h_vel**2 * G%m_to_H)
+
+      if (uDml(i) == 0) then
+        do k=1,G%nkml ; uhml(I,j,k) = 0.0 ; enddo
+      else
+        I2htot = 1.0 / (htot(i,j) + htot(i+1,j) + h_neglect)
+        z_topx2 = 0.0
+        ! a(k) relates the sublayer transport to uDml with a linear profile.
+        ! The sum of a through the mixed layers must be 0.
+        do k=1,G%nkml
+          hx2 = (h(i,j,k) + h(i+1,j,k) + h_neglect)
+          a(k) = (hx2 * I2htot) * (2.0 - 4.0*(z_topx2+0.5*hx2)*I2htot)
+          z_topx2 = z_topx2 + hx2
+          if (a(k)*uDml(I) > 0.0) then
+            if (a(k)*uDml(I) > h_avail(i,j,k)) uDml(I) = h_avail(i,j,k) / a(k)
+          else
+            if (-a(k)*uDml(I) > h_avail(i+1,j,k)) uDml(I) = -h_avail(i+1,j,k)/a(k)
+          endif
+        enddo
+        do k=1,G%nkml
+          uhml(I,j,k) = a(k)*uDml(I)
+          uhtr(I,j,k) = uhtr(I,j,k) + uhml(I,j,k)*dt
+        enddo
+      endif
+    enddo 
+  enddo
 
 !  V- component
+!$OMP do
   do J=js-1,je ; do i=is,ie
     h_vel = 0.5*(htot(i,j) + htot(i,j+1)) * G%H_to_m
 
@@ -290,10 +295,12 @@ subroutine mixedlayer_restrat(h, uhtr, vhtr, tv, fluxes, dt, G, CS)
     endif
   enddo ; enddo
 
-  do k=1,G%nkml ; do j=js,je ; do i=is,ie
+!$OMP do
+  do j=js,je ; do k=1,G%nkml ; do i=is,ie
     h(i,j,k) = h(i,j,k) - dt*G%IareaT(i,j) * &
         ((uhml(I,j,k) - uhml(I-1,j,k)) + (vhml(i,J,k) - vhml(i,J-1,k)))
   enddo ; enddo ; enddo
+!$OMP end parallel
 
 ! Offer fields for averaging.
   if (query_averaging_enabled(CS%diag) .and. &

From dd7a7996bc5fd17407702ee4f7c9cbc24c8ef924 Mon Sep 17 00:00:00 2001
From: Zhi Liang <Zhi.Liang@noaa.gov>
Date: Mon, 14 Apr 2014 10:01:07 -0400
Subject: [PATCH 12/14] implement openmp

---
 .../lateral/MOM_lateral_mixing_coeffs.F90     | 63 ++++++++++++-------
 1 file changed, 41 insertions(+), 22 deletions(-)

diff --git a/src/parameterizations/lateral/MOM_lateral_mixing_coeffs.F90 b/src/parameterizations/lateral/MOM_lateral_mixing_coeffs.F90
index e823a23c05..5bf68e7db4 100644
--- a/src/parameterizations/lateral/MOM_lateral_mixing_coeffs.F90
+++ b/src/parameterizations/lateral/MOM_lateral_mixing_coeffs.F90
@@ -271,6 +271,8 @@ subroutine calc_slope_function_(h, tv, G, CS, e)
   real :: one_meter     ! One meter in thickness units of m or kg m-2.
   integer :: is, ie, js, je, nz
   integer :: i, j, k, kb_max
+  real    :: SN_u_local(SZIB_(G), SZJ_(G),SZK_(G))
+  real    :: SN_v_local(SZI_(G), SZJB_(G),SZK_(G))
 
   if (.not. ASSOCIATED(CS)) call MOM_error(FATAL, "calc_slope_function:"// &
          "Module must be initialized before it is used.")
@@ -286,6 +288,8 @@ subroutine calc_slope_function_(h, tv, G, CS, e)
   h_neglect = G%H_subroundoff
   H_cutoff = real(2*nz) * (G%Angstrom + h_neglect)
 
+!$OMP parallel default(shared) private(E_x,E_y,S2,Hdn,Hup,H_geom,N2)
+!$OMP do
   do j=js-1,je+1 ; do i=is-1,ie+1
     CS%SN_u(i,j) = 0.0
     CS%SN_v(i,j) = 0.0
@@ -296,14 +300,16 @@ subroutine calc_slope_function_(h, tv, G, CS, e)
   ! and midlatitude deformation radii, using calc_resoln_function as a template.
 
   ! Set the length scale at u-points.
+!$OMP do
   do j=js,je ; do I=is-1,ie
     CS%L2u(I,j) = CS%Visbeck_L_scale**2
   enddo ; enddo
   ! Set length scale at v-points
+!$OMP do
   do J=js-1,je ; do i=is,ie
     CS%L2v(i,J) = CS%Visbeck_L_scale**2
   enddo ; enddo
-
+!$OMP do
   do k=nz,CS%VarMix_Ktop,-1
 
     ! Calculate the interface slopes E_x and E_y and u- and v- points respectively
@@ -328,7 +334,7 @@ subroutine calc_slope_function_(h, tv, G, CS, e)
       N2 = G%g_prime(k) / (G%H_to_m * max(Hdn,Hup,one_meter))
       if (min(h(i,j,k-1), h(i+1,j,k-1), h(i,j,k), h(i+1,j,k)) < H_cutoff) &
         S2 = 0.0
-      CS%SN_u(I,j) = CS%SN_u(I,j) + (H_geom * G%H_to_m) * S2 * N2
+      SN_u_local(I,j,k) = (H_geom * G%H_to_m) * S2 * N2
     enddo ; enddo
     do J=js-1,je ; do i=is,ie
       S2 = ( E_y(i,J)**2  + 0.25*( &
@@ -339,30 +345,43 @@ subroutine calc_slope_function_(h, tv, G, CS, e)
       N2 = G%g_prime(k) / (G%H_to_m * max(Hdn,Hup,one_meter))
       if (min(h(i,j,k-1), h(i,j+1,k-1), h(i,j,k), h(i,j+1,k)) < H_cutoff) &
         S2 = 0.0
-      CS%SN_v(i,J) = CS%SN_v(i,J) + (H_geom * G%H_to_m) * S2 * N2
+      SN_v_local(i,J,k) = (H_geom * G%H_to_m) * S2 * N2
     enddo ; enddo
 
   enddo ! k
+!$OMP do
+  do j = js,je; 
+    do k=nz,CS%VarMix_Ktop,-1 ; do I=is-1,ie
+      CS%SN_u(I,j) = CS%SN_u(I,j) + SN_u_local(I,j,k)
+    enddo ; enddo
+    ! SN above contains S^2*N^2*H, convert to vertical average of S*N
+    do I=is-1,ie
+      !SN_u(I,j) = sqrt( SN_u(I,j) / ( max(G%bathyT(I,j), G%bathyT(I+1,j)) + G%Angstrom ) )
+      !The code below behaves better than the line above. Not sure why? AJA
+      if ( min(G%bathyT(I,j), G%bathyT(I+1,j)) > H_cutoff ) then
+        CS%SN_u(I,j) = sqrt( CS%SN_u(I,j) / max(G%bathyT(I,j), G%bathyT(I+1,j)) )
+      else
+        CS%SN_u(I,j) = 0.0
+      endif
+    enddo 
+  enddo
+!$OMP do
+  do J=js-1,je 
+    do k=nz,CS%VarMix_Ktop,-1 ; do I=is,ie
+      CS%SN_v(i,J) = CS%SN_v(i,J) + SN_v_local(i,J,k)
+    enddo ; enddo
+    do i=is,ie
+      !SN_v(i,J) = sqrt( SN_v(i,J) / ( max(G%bathyT(i,J), G%bathyT(i,J+1)) + G%Angstrom ) )
+      !The code below behaves better than the line above. Not sure why? AJA
+      if ( min(G%bathyT(I,j), G%bathyT(I+1,j)) > H_cutoff ) then
+        CS%SN_v(i,J) = sqrt( CS%SN_v(i,J) / max(G%bathyT(i,J), G%bathyT(i,J+1)) )
+      else
+        CS%SN_v(I,j) = 0.0
+      endif
+    enddo 
+  enddo
 
-  ! SN above contains S^2*N^2*H, convert to vertical average of S*N
-  do j=js,je ; do I=is-1,ie
-   !SN_u(I,j) = sqrt( SN_u(I,j) / ( max(G%bathyT(I,j), G%bathyT(I+1,j)) + G%Angstrom ) )
-   !The code below behaves better than the line above. Not sure why? AJA
-    if ( min(G%bathyT(I,j), G%bathyT(I+1,j)) > H_cutoff ) then
-      CS%SN_u(I,j) = sqrt( CS%SN_u(I,j) / max(G%bathyT(I,j), G%bathyT(I+1,j)) )
-    else
-      CS%SN_u(I,j) = 0.0
-    endif
-  enddo ; enddo
-  do J=js-1,je ; do i=is,ie
-   !SN_v(i,J) = sqrt( SN_v(i,J) / ( max(G%bathyT(i,J), G%bathyT(i,J+1)) + G%Angstrom ) )
-   !The code below behaves better than the line above. Not sure why? AJA
-    if ( min(G%bathyT(I,j), G%bathyT(I+1,j)) > H_cutoff ) then
-      CS%SN_v(i,J) = sqrt( CS%SN_v(i,J) / max(G%bathyT(i,J), G%bathyT(i,J+1)) )
-    else
-      CS%SN_v(I,j) = 0.0
-    endif
-  enddo ; enddo
+!$OMP end parallel
 
 ! Offer diagnostic fields for averaging.
   if (query_averaging_enabled(CS%diag)) then

From 5d742b8c9cda9695fe607e4569f2a8a1a5b7ec92 Mon Sep 17 00:00:00 2001
From: Zhi Liang <Zhi.Liang@noaa.gov>
Date: Mon, 14 Apr 2014 10:03:24 -0400
Subject: [PATCH 13/14] implement openmp

---
 src/diagnostics/MOM_diagnostics.F90 | 143 ++++++++++++++++------------
 1 file changed, 83 insertions(+), 60 deletions(-)

diff --git a/src/diagnostics/MOM_diagnostics.F90 b/src/diagnostics/MOM_diagnostics.F90
index fd951c58cf..33f05b4430 100644
--- a/src/diagnostics/MOM_diagnostics.F90
+++ b/src/diagnostics/MOM_diagnostics.F90
@@ -141,7 +141,7 @@ subroutine calculate_diagnostic_fields(u, v, h, uh, vh, tv, ADp, CDp, dt, G, &
   type(cont_diag_ptrs),                     intent(in)    :: CDp
   real,                                     intent(in)    :: dt
   type(ocean_grid_type),                    intent(inout) :: G
-  type(diagnostics_CS),                     pointer       :: CS
+  type(diagnostics_CS),                     intent(inout)       :: CS
   real, dimension(NIMEM_,NJMEM_), optional, intent(in)    :: eta_bt
 !   Any diagnostic quantities that are not more naturally calculated
 ! in the various other subroutines are calculated here.
@@ -188,7 +188,7 @@ subroutine calculate_diagnostic_fields(u, v, h, uh, vh, tv, ADp, CDp, dt, G, &
   ! one iteration which would break the follwoing one-line workaround!
   if (nkmb==0) nkmb = nz
 
-  if (.not.associated(CS)) call MOM_error(FATAL, &
+  if (loc(CS)==0) call MOM_error(FATAL, &
          "calculate_diagnostic_fields: Module must be initialized before it is used.")
 
   call calculate_derivs(dt, G, CS)
@@ -222,6 +222,7 @@ subroutine calculate_diagnostic_fields(u, v, h, uh, vh, tv, ADp, CDp, dt, G, &
 
     if (associated(tv%eqn_of_state)) then
       pres(:) = tv%P_Ref
+!$OMP parallel do default(shared)
       do k=1,nz ; do j=js,je+1
         call calculate_density(tv%T(:,j,k),tv%S(:,j,k),pres, &
                                Rcv(:,j,k),is,ie-is+2, tv%eqn_of_state)
@@ -231,82 +232,103 @@ subroutine calculate_diagnostic_fields(u, v, h, uh, vh, tv, ADp, CDp, dt, G, &
     endif
 
     if (ASSOCIATED(CS%h_Rlay)) then
-      do k=1,nkmb ; do j=js,je; do i=is,ie
-        CS%h_Rlay(i,j,k) = 0.0
-      enddo ; enddo ; enddo
-      do k=nkmb+1,nz ; do j=js,je ; do i=is,ie
-        CS%h_Rlay(i,j,k) = h(i,j,k)
-      enddo ; enddo ; enddo
       k_list = nz/2
-      do j=js,je ; do k=1,nkmb ; do i=is,ie
-        call find_weights(G%Rlay, Rcv(i,j,k), k_list, nz, wt, wt_p)
-        CS%h_Rlay(i,j,k_list)   = CS%h_Rlay(i,j,k_list)   + h(i,j,k)*wt
-        CS%h_Rlay(i,j,k_list+1) = CS%h_Rlay(i,j,k_list+1) + h(i,j,k)*wt_p
-      enddo ; enddo ; enddo
+!$OMP parallel do default(shared) private(wt,wt_p) firstprivate(k_list)
+      do j=js,je 
+        do k=1,nkmb ; do i=is,ie
+          CS%h_Rlay(i,j,k) = 0.0
+        enddo ; enddo
+        do k=nkmb+1,nz ; do i=is,ie
+        CS%h_Rlay(i,j,k) = h(i,j,k)
+        enddo ; enddo 
+        do k=1,nkmb ; do i=is,ie
+          call find_weights(G%Rlay, Rcv(i,j,k), k_list, nz, wt, wt_p)
+          CS%h_Rlay(i,j,k_list)   = CS%h_Rlay(i,j,k_list)   + h(i,j,k)*wt
+          CS%h_Rlay(i,j,k_list+1) = CS%h_Rlay(i,j,k_list+1) + h(i,j,k)*wt_p
+        enddo ; enddo 
+      enddo
+
       if (CS%id_h_Rlay > 0) call post_data(CS%id_h_Rlay, CS%h_Rlay, CS%diag)
     endif
 
     if (ASSOCIATED(CS%uh_Rlay)) then
-      do k=1,nkmb ; do j=js,je; do I=Isq,Ieq
-        CS%uh_Rlay(I,j,k) = 0.0
-      enddo ; enddo ; enddo
-      do k=nkmb+1,nz ; do j=js,je ; do I=Isq,Ieq
-        CS%uh_Rlay(I,j,k) = uh(I,j,k)
-      enddo ; enddo ; enddo
       k_list = nz/2
-      do j=js,je ; do k=1,nkmb ; do I=Isq,Ieq
-        call find_weights(G%Rlay, 0.5*(Rcv(i,j,k)+Rcv(i+1,j,k)), k_list, nz, wt, wt_p)
-        CS%uh_Rlay(I,j,k_list)   = CS%uh_Rlay(I,j,k_list)   + uh(I,j,k)*wt
-        CS%uh_Rlay(I,j,k_list+1) = CS%uh_Rlay(I,j,k_list+1) + uh(I,j,k)*wt_p
-      enddo ; enddo ; enddo
+!$OMP parallel do default(shared) private(wt,wt_p) firstprivate(k_list)
+      do j=js,je
+        do k=1,nkmb ; do I=Isq,Ieq
+          CS%uh_Rlay(I,j,k) = 0.0
+        enddo ; enddo
+        do k=nkmb+1,nz ; do I=Isq,Ieq
+          CS%uh_Rlay(I,j,k) = uh(I,j,k)
+        enddo ; enddo 
+        k_list = nz/2
+        do k=1,nkmb ; do I=Isq,Ieq
+          call find_weights(G%Rlay, 0.5*(Rcv(i,j,k)+Rcv(i+1,j,k)), k_list, nz, wt, wt_p)
+          CS%uh_Rlay(I,j,k_list)   = CS%uh_Rlay(I,j,k_list)   + uh(I,j,k)*wt
+          CS%uh_Rlay(I,j,k_list+1) = CS%uh_Rlay(I,j,k_list+1) + uh(I,j,k)*wt_p
+        enddo ; enddo 
+      enddo
+
       if (CS%id_uh_Rlay > 0) call post_data(CS%id_uh_Rlay, CS%uh_Rlay, CS%diag)
     endif
 
     if (ASSOCIATED(CS%vh_Rlay)) then
-      do k=1,nkmb ; do J=Jsq,Jeq; do i=is,ie
-        CS%vh_Rlay(i,J,k) = 0.0
-      enddo ; enddo ; enddo
-      do k=nkmb+1,nz ; do J=Jsq,Jeq ; do i=is,ie
-        CS%vh_Rlay(i,J,k) = vh(i,J,k)
-      enddo ; enddo ; enddo
       k_list = nz/2
-      do J=Jsq,Jeq ; do k=1,nkmb ; do i=is,ie
-        call find_weights(G%Rlay, 0.5*(Rcv(i,j,k)+Rcv(i,j+1,k)), k_list, nz, wt, wt_p)
-        CS%vh_Rlay(i,J,k_list)   = CS%vh_Rlay(i,J,k_list)   + vh(i,J,k)*wt
-        CS%vh_Rlay(i,J,k_list+1) = CS%vh_Rlay(i,J,k_list+1) + vh(i,J,k)*wt_p
-      enddo ; enddo ; enddo
+!$OMP parallel do default(shared) private(wt,wt_p) firstprivate(k_list)
+      do J=Jsq,Jeq
+        do k=1,nkmb ; do i=is,ie
+          CS%vh_Rlay(i,J,k) = 0.0
+        enddo ; enddo 
+        do k=nkmb+1,nz ; do i=is,ie
+          CS%vh_Rlay(i,J,k) = vh(i,J,k)
+        enddo ; enddo
+        do k=1,nkmb ; do i=is,ie
+          call find_weights(G%Rlay, 0.5*(Rcv(i,j,k)+Rcv(i,j+1,k)), k_list, nz, wt, wt_p)
+          CS%vh_Rlay(i,J,k_list)   = CS%vh_Rlay(i,J,k_list)   + vh(i,J,k)*wt
+          CS%vh_Rlay(i,J,k_list+1) = CS%vh_Rlay(i,J,k_list+1) + vh(i,J,k)*wt_p
+        enddo ; enddo
+      enddo
+
       if (CS%id_vh_Rlay > 0) call post_data(CS%id_vh_Rlay, CS%vh_Rlay, CS%diag)
     endif
 
     if (ASSOCIATED(CS%uhGM_Rlay) .and. ASSOCIATED(CDp%uhGM)) then
-      do k=1,nkmb ; do j=js,je; do I=Isq,Ieq
-        CS%uhGM_Rlay(I,j,k) = 0.0
-      enddo ; enddo ; enddo
-      do k=nkmb+1,nz ; do j=js,je ; do I=Isq,Ieq
-        CS%uhGM_Rlay(I,j,k) = CDp%uhGM(I,j,k)
-      enddo ; enddo ; enddo
       k_list = nz/2
-      do j=js,je ; do k=1,nkmb ; do I=Isq,Ieq
-        call find_weights(G%Rlay, 0.5*(Rcv(i,j,k)+Rcv(i+1,j,k)), k_list, nz, wt, wt_p)
-        CS%uhGM_Rlay(I,j,k_list)   = CS%uhGM_Rlay(I,j,k_list)   + CDp%uhGM(I,j,k)*wt
-        CS%uhGM_Rlay(I,j,k_list+1) = CS%uhGM_Rlay(I,j,k_list+1) + CDp%uhGM(I,j,k)*wt_p
-      enddo ; enddo ; enddo
+!$OMP parallel do default(shared) private(wt,wt_p) firstprivate(k_list)
+      do j=js,je
+        do k=1,nkmb ; do I=Isq,Ieq
+          CS%uhGM_Rlay(I,j,k) = 0.0
+        enddo ; enddo
+        do k=nkmb+1,nz ; do I=Isq,Ieq
+          CS%uhGM_Rlay(I,j,k) = CDp%uhGM(I,j,k)
+        enddo ; enddo
+        do k=1,nkmb ; do I=Isq,Ieq
+          call find_weights(G%Rlay, 0.5*(Rcv(i,j,k)+Rcv(i+1,j,k)), k_list, nz, wt, wt_p)
+          CS%uhGM_Rlay(I,j,k_list)   = CS%uhGM_Rlay(I,j,k_list)   + CDp%uhGM(I,j,k)*wt
+          CS%uhGM_Rlay(I,j,k_list+1) = CS%uhGM_Rlay(I,j,k_list+1) + CDp%uhGM(I,j,k)*wt_p
+        enddo ; enddo
+      enddo
+
       if (CS%id_uh_Rlay > 0) call post_data(CS%id_uhGM_Rlay, CS%uhGM_Rlay, CS%diag)
     endif
 
     if (ASSOCIATED(CS%vhGM_Rlay) .and. ASSOCIATED(CDp%vhGM)) then
-      do k=1,nkmb ; do J=Jsq,Jeq; do i=is,ie
-        CS%vhGM_Rlay(i,J,k) = 0.0
-      enddo ; enddo ; enddo
-      do k=nkmb+1,nz ; do J=Jsq,Jeq ; do i=is,ie
-        CS%vhGM_Rlay(i,J,k) = CDp%vhGM(i,J,k)
-      enddo ; enddo ; enddo
       k_list = nz/2
-      do J=Jsq,Jeq ; do k=1,nkmb ; do i=is,ie
-        call find_weights(G%Rlay, 0.5*(Rcv(i,j,k)+Rcv(i,j+1,k)), k_list, nz, wt, wt_p)
-        CS%vhGM_Rlay(i,J,k_list)   = CS%vhGM_Rlay(i,J,k_list)   + CDp%vhGM(i,J,k)*wt
-        CS%vhGM_Rlay(i,J,k_list+1) = CS%vhGM_Rlay(i,J,k_list+1) + CDp%vhGM(i,J,k)*wt_p
-      enddo ; enddo ; enddo
+!$OMP parallel do default(shared) private(wt,wt_p) firstprivate(k_list)
+      do J=Jsq,Jeq
+        do k=1,nkmb ; do i=is,ie
+          CS%vhGM_Rlay(i,J,k) = 0.0
+        enddo ; enddo
+        do k=nkmb+1,nz ; do i=is,ie
+          CS%vhGM_Rlay(i,J,k) = CDp%vhGM(i,J,k)
+        enddo ; enddo
+        do k=1,nkmb ; do i=is,ie
+          call find_weights(G%Rlay, 0.5*(Rcv(i,j,k)+Rcv(i,j+1,k)), k_list, nz, wt, wt_p)
+          CS%vhGM_Rlay(i,J,k_list)   = CS%vhGM_Rlay(i,J,k_list)   + CDp%vhGM(i,J,k)*wt
+          CS%vhGM_Rlay(i,J,k_list+1) = CS%vhGM_Rlay(i,J,k_list+1) + CDp%vhGM(i,J,k)*wt_p
+        enddo ; enddo
+      enddo
+
       if (CS%id_vhGM_Rlay > 0) call post_data(CS%id_vhGM_Rlay, CS%vhGM_Rlay, CS%diag)
     endif
   endif
@@ -316,6 +338,7 @@ subroutine calculate_diagnostic_fields(u, v, h, uh, vh, tv, ADp, CDp, dt, G, &
     call wave_speed(h, tv, G, CS%cg1, CS%wave_speed_CSp)
     if (CS%id_cg1>0) call post_data(CS%id_cg1, CS%cg1, CS%diag)
     if (CS%id_Rd1>0) then
+!$OMP parallel do default(shared) private(f2_h,mag_beta)
       do j=js,je ; do i=is,ie
         ! Blend the equatorial deformation radius with the standard one.
         f2_h = absurdly_small_freq2 + 0.25 * &
@@ -424,7 +447,7 @@ subroutine calculate_vertical_integrals(h, tv, G, CS)
   real, dimension(NIMEM_,NJMEM_,NKMEM_),    intent(in)    :: h
   type(thermo_var_ptrs),                    intent(in)    :: tv
   type(ocean_grid_type),                    intent(inout) :: G
-  type(diagnostics_CS),                     pointer       :: CS
+  type(diagnostics_CS),                     intent(inout) :: CS
 ! This subroutine calculates the vertical integrals of several tracers, along
 ! with the mass-weight of these tracers, the total column mass, and the
 ! carefully calculated column height.
@@ -523,7 +546,7 @@ subroutine calculate_energy_diagnostics(u, v, h, uh, vh, ADp, CDp, G, CS)
   type(accel_diag_ptrs),                  intent(in)    :: ADp
   type(cont_diag_ptrs),                   intent(in)    :: CDp
   type(ocean_grid_type),                  intent(inout) :: G
-  type(diagnostics_CS),                   pointer       :: CS
+  type(diagnostics_CS),                   intent(inout) :: CS
 !   This subroutine calculates a series of terms in the energy
 ! balance equations.
 
@@ -739,7 +762,7 @@ end subroutine register_time_deriv
 subroutine calculate_derivs(dt, G, CS)
   real,                  intent(in)    :: dt
   type(ocean_grid_type), intent(inout) :: G
-  type(diagnostics_CS),  pointer       :: CS
+  type(diagnostics_CS),  intent(inout) :: CS
 ! This subroutine calculates all registered time derivatives.
 ! Arguments: dt - the time interval in s over which differences occur
 !  (in)      G - The ocean's grid structure.

From 2c162fc1a4fc2c980581a55eef5b49db3d7d0836 Mon Sep 17 00:00:00 2001
From: Zhi Liang <Zhi.Liang@noaa.gov>
Date: Mon, 14 Apr 2014 10:04:13 -0400
Subject: [PATCH 14/14] openmp bug fix

---
 src/core/MOM_legacy_barotropic.F90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/MOM_legacy_barotropic.F90 b/src/core/MOM_legacy_barotropic.F90
index 54a1a5c66b..4f03a28498 100644
--- a/src/core/MOM_legacy_barotropic.F90
+++ b/src/core/MOM_legacy_barotropic.F90
@@ -3356,7 +3356,7 @@ subroutine legacy_bt_mass_source(h, eta, fluxes, set_cor, dt_therm, &
 
   is = G%isc ; ie = G%iec ; js = G%jsc ; je = G%jec ; nz = G%ke
 
-!$OMP parallel do default(shared) private(i,j,k,eta_h)
+!$OMP parallel do default(shared) private(eta_h,h_tot,limit_dt,d_eta) 
   do j=js,je
     do i=is,ie ; h_tot(i) = h(i,j,1) ; enddo
     if (G%Boussinesq) then