From 90c1adff687146122a166ab999c796e62960e6cc Mon Sep 17 00:00:00 2001
From: VanillaBrooks <brooks@karlik.org>
Date: Thu, 10 Nov 2022 17:53:21 -0800
Subject: [PATCH 1/5] dissipation rate calculated in streams, with python
 bindings

---
 python/main.py      |   3 ++
 src/Makefile        |   4 +-
 src/dissipation.F90 | 125 ++++++++++++++++++++++++++++++++++++++++++++
 src/min_api.F90     |   8 +++
 src/mod_streams.F90 |   6 +++
 5 files changed, 144 insertions(+), 2 deletions(-)
 create mode 100644 src/dissipation.F90

diff --git a/python/main.py b/python/main.py
index 9446cce..3ff1104 100755
--- a/python/main.py
+++ b/python/main.py
@@ -92,6 +92,9 @@
     time += streams.mod_streams.dtglobal
     time_array[:] = time
 
+    streams.wrap_dissipation_calculation()
+    utils.hprint(f"dissipation rate is {streams.mod_streams.dissipation_rate}")
+
     if (i % config.temporal.span_average_io_steps) == 0:
         utils.hprint("writing span average to output")
         streams.wrap_copy_gpu_to_cpu()
diff --git a/src/Makefile b/src/Makefile
index 8f6a3c4..11563bf 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -54,7 +54,7 @@ OBJ_FILES = alloc.o bcdf.o bcextr.o bcfree.o bc.o bcrecyc.o bcrelax.o bcshk.o bc
     setup.o solver.o startmpi.o stats.o step.o target_reystress.o tbforce.o updateghost.o utility.o utyibm.o \
     visflx.o writedf.o writefield.o writefieldvtk.o writegridplot3d.o writerst.o \
     writestatbl.o writestatchann.o writestat.o writestatzbl.o write_wallpressure.o visflx_stag.o \
-	write_probe_data.o bcblow.o
+	write_probe_data.o bcblow.o dissipation.o
 
 PY_API_OBJS = min_api.o
 
@@ -67,7 +67,7 @@ STATIC_FILES = alloc.F90 bcdf.F90 bcextr.F90 bcfree.F90 bc.F90 bcrecyc.F90 bcrel
     setup.F90 solver.F90 startmpi.F90 stats.F90 step.F90 target_reystress.F90 tbforce.F90 updateghost.F90 utility.F90 utyibm.F90 \
     visflx.F90 writedf.F90 writefield.F90 writefieldvtk.F90 writegridplot3d.F90 writerst.F90 \
     writestatbl.F90 writestatchann.F90 writestat.F90 writestatzbl.F90 write_wallpressure.F90 visflx_stag.F90 \
-	write_probe_data.F90 bcblow.F90 
+	write_probe_data.F90 bcblow.F90 dissipation.F90
 
 STATIC_MODS = mod_streams.F90 mod_sys.F90 euler.F90
 
diff --git a/src/dissipation.F90 b/src/dissipation.F90
new file mode 100644
index 0000000..95a41b3
--- /dev/null
+++ b/src/dissipation.F90
@@ -0,0 +1,125 @@
+! calculate the dissipation rate. See lab streams documentation for
+! `Calculated Quantities` for the equation being calculated
+!
+! after calculation, the value is stored in `dissipation_rate` in mod_streams
+subroutine dissipation_calculation()
+    use mod_streams
+
+    implicit none
+
+    integer :: i, j, k
+
+    real*8 :: uu, vv, ww, rho
+    real*8 :: dudx, dvdy, dwdz
+    real*8 :: dx, dy, dz
+    real*8 :: mpi_dissipation_sum
+
+    ! dissipation_rate is defined in mod_streams, we reset its value here
+    dissipation_rate = 0.0
+
+    mpi_dissipation_sum = 0.0
+
+    ! w_gpu(:, :, :, 1) -> rho
+    ! w_gpu(:, :, :, 2) -> rho u
+    ! w_gpu(:, :, :, 3) -> rho v
+    ! w_gpu(:, :, :, 4) -> rho w
+    ! w_gpu(:, :, :, 5) -> rho E
+
+    !$cuf kernel do(3) <<<*,*>>> 
+    do k = 1,nz
+        do j = 1,ny
+            do i = 1,nx
+
+                !
+                ! du/dx
+                !
+                dudx = ( &
+                    ( &
+                        -1 * w_gpu(i+2, j, k, 2) / w_gpu(i+2, j, k, 1) &
+                    ) &
+                    + &
+                    ( &
+                        8 * w_gpu(i+1, j, k, 2) / w_gpu(i+1, j, k, 1) &
+                    ) &
+                    - &
+                    ( &
+                        8 * w_gpu(i-1, j, k, 2) / w_gpu(i-1, j, k, 1) &
+                    ) &
+                    + &
+                    ( &
+                        w_gpu(i-2, j, k, 2) / w_gpu(i-2, j, k, 1) &
+                    ) &
+                ) &
+                ! replace 12 * \Delta x by 6 * (2 \Delta x) since x can vary
+                ! with grid size (Without really looking at grid mesh generation
+                / ( 6 * ( x_gpu(i+1) - x_gpu(i-1)))
+
+                !
+                ! dv/dy
+                !
+                dvdy = ( &
+                    ( &
+                        -1 * w_gpu(i, j+2, k, 3) / w_gpu(i, j+2, k, 1) &
+                    ) &
+                    + &
+                    ( &
+                        8 * w_gpu(i, j+1, k, 3) / w_gpu(i, j+1, k, 1) &
+                    ) &
+                    - &
+                    ( &
+                        8 * w_gpu(i, j-1, k, 3) / w_gpu(i, j-1, k, 1) &
+                    ) &
+                    + &
+                    ( &
+                        w_gpu(i, j-2, k, 3) / w_gpu(i, j-2, k, 1) &
+                    ) &
+                ) &
+                ! replace 12 * \Delta y by 6 * (2 \Delta y) since y can vary
+                ! with grid size (Without really looking at grid mesh generation
+                / ( 6 * ( y_gpu(j+1) - y_gpu(j-1)))
+
+                !
+                ! dw/dz
+                !
+                dwdz = ( &
+                    ( &
+                        -1 * w_gpu(i, j, k+2, 4) / w_gpu(i, j, k+2, 1) &
+                    ) &
+                    + &
+                    ( &
+                        8 * w_gpu(i, j, k+1, 4) / w_gpu(i, j, k+1, 1) &
+                    ) &
+                    - &
+                    ( &
+                        8 * w_gpu(i, j, k-1, 4) / w_gpu(i, j, k-1, 1) &
+                    ) &
+                    + &
+                    ( &
+                        w_gpu(i, j, k-2, 4) / w_gpu(i, j, k-2, 1) &
+                    ) &
+                ) &
+                ! replace 12 * \Delta z by 6 * (2 \Delta z) since z can vary
+                ! with grid size (Without really looking at grid mesh generation
+                / ( 6 * ( z_gpu(k+1) - z_gpu(k-1)))
+
+                dx = (x_gpu(i-1) + x_gpu(i+1)) / 2
+                dy = (y_gpu(j-1) + y_gpu(j+1)) / 2
+                dz = (z_gpu(k-1) + z_gpu(k+1)) / 2
+
+                dissipation_rate = dissipation_rate + &
+                    (dudx**2 + dvdy**2 + dwdz**2) * dx * dy * dz
+            enddo
+        enddo
+    enddo
+    !@cuf iercuda=cudaDeviceSynchronize()
+
+    dissipation_rate = dissipation_rate / (rlx * rly * rlz)
+
+    ! sum all the values across MPI, store the result in mpi_dissipation_sum
+    call MPI_REDUCE(dissipation_rate, mpi_dissipation_sum, 1, mpi_prec,  MPI_SUM, 0, MPI_COMM_WORLD, iermpi)
+
+    ! distribute mpi_dissipation_sum to all MPI procs
+    call MPI_BCAST(mpi_dissipation_sum, 1, mpi_prec, 0, MPI_COMM_WORLD, iermpi)
+
+    dissipation_rate = mpi_dissipation_sum
+endsubroutine dissipation_calculation
diff --git a/src/min_api.F90 b/src/min_api.F90
index 67b3922..78d6783 100644
--- a/src/min_api.F90
+++ b/src/min_api.F90
@@ -82,3 +82,11 @@ subroutine wrap_tauw_calculate()
     end if
 !
 end subroutine
+
+! calculate dissipation rate. See inner subroutine documentation.
+! after calculation, the value is stored in `dissipation_rate` in mod_streams
+subroutine wrap_dissipation_calculation()
+    implicit none
+
+    call dissipation_calculation()
+end subroutine
diff --git a/src/mod_streams.F90 b/src/mod_streams.F90
index 4f40ad1..a6ec0ea 100644
--- a/src/mod_streams.F90
+++ b/src/mod_streams.F90
@@ -81,6 +81,12 @@ module mod_streams
  ! boundary condition integer (see bc.f90) to indicate that the bottom boundary
  ! of the SBLI case should be blowing
  integer, parameter :: blowing_sbli_boundary_condition = 11
+
+
+ ! dissipation_rate is calculated in dissipation.F90, subroutine dissipation_calculation
+!f2py real*8 :: dissipation_rate
+ real(mykind) :: dissipation_rate
+
  integer :: iflow
  integer :: idiski, ndim
  integer :: istore, istore_restart 

From a249c92ceafc9d6444dc50c73ab295c895298d18 Mon Sep 17 00:00:00 2001
From: VanillaBrooks <brooks@karlik.org>
Date: Thu, 10 Nov 2022 18:01:28 -0800
Subject: [PATCH 2/5] move dissipation rate to be exported with shear stress
 information

---
 python/main.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/main.py b/python/main.py
index 3ff1104..8881d41 100755
--- a/python/main.py
+++ b/python/main.py
@@ -44,6 +44,7 @@
 temp_field = np.zeros((config.nx_mpi(), config.ny_mpi(), config.nz_mpi()), dtype=np.float64)
 dt_array = np.zeros(1)
 time_array = np.zeros(1)
+dissipation_rate_array = np.zeros(1)
 
 #
 # execute streams setup routines
@@ -77,6 +78,7 @@
 span_average_dset = io_utils.VectorFieldXY2D(span_averages, [5, *span_average_shape], numwrites, "span_average", rank)
 shear_stress_dset = io_utils.ScalarFieldX1D(span_averages, [config.grid.nx], numwrites, "shear_stress", rank)
 span_average_time_dset = io_utils.Scalar1D(span_averages, [1], numwrites, "time", rank)
+dissipation_rate_dset = io_utils.Scalar1D(span_averages, [1], numwrites, "dissipation_rate", rank)
 
 # trajectories files
 dt_dset = io_utils.Scalar1D(trajectories, [1], config.temporal.num_iter, "dt", rank)
@@ -92,9 +94,6 @@
     time += streams.mod_streams.dtglobal
     time_array[:] = time
 
-    streams.wrap_dissipation_calculation()
-    utils.hprint(f"dissipation rate is {streams.mod_streams.dissipation_rate}")
-
     if (i % config.temporal.span_average_io_steps) == 0:
         utils.hprint("writing span average to output")
         streams.wrap_copy_gpu_to_cpu()
@@ -110,6 +109,11 @@
         # write the time at which this data was collected
         span_average_time_dset.write_array(time_array)
 
+        # calculate dissipation rate on GPU and store the result
+        streams.wrap_dissipation_calculation()
+        dissipation_rate_array[:] = streams.mod_streams.dissipation_rate
+        dissipation_rate_dset.write_array(dissipation_rate_array)
+
     # save dt information for every step
     dt_array[:] = streams.mod_streams.dtglobal
     dt_dset.write_array(dt_array)

From 5c485b35fbcb7b2a1b29728779df50b38054b8a4 Mon Sep 17 00:00:00 2001
From: VanillaBrooks <brooks@karlik.org>
Date: Thu, 10 Nov 2022 18:20:20 -0800
Subject: [PATCH 3/5] energy gpu calculation and export to hdf5

---
 python/main.py      |  7 ++++++
 src/dissipation.F90 | 54 +++++++++++++++++++++++++++++++++++++++++++++
 src/min_api.F90     |  8 +++++++
 src/mod_streams.F90 |  4 ++++
 4 files changed, 73 insertions(+)

diff --git a/python/main.py b/python/main.py
index 8881d41..474a601 100755
--- a/python/main.py
+++ b/python/main.py
@@ -45,6 +45,7 @@
 dt_array = np.zeros(1)
 time_array = np.zeros(1)
 dissipation_rate_array = np.zeros(1)
+energy_array = np.zeros(1)
 
 #
 # execute streams setup routines
@@ -79,6 +80,7 @@
 shear_stress_dset = io_utils.ScalarFieldX1D(span_averages, [config.grid.nx], numwrites, "shear_stress", rank)
 span_average_time_dset = io_utils.Scalar1D(span_averages, [1], numwrites, "time", rank)
 dissipation_rate_dset = io_utils.Scalar1D(span_averages, [1], numwrites, "dissipation_rate", rank)
+energy_dset = io_utils.Scalar1D(span_averages, [1], numwrites, "energy", rank)
 
 # trajectories files
 dt_dset = io_utils.Scalar1D(trajectories, [1], config.temporal.num_iter, "dt", rank)
@@ -114,6 +116,11 @@
         dissipation_rate_array[:] = streams.mod_streams.dissipation_rate
         dissipation_rate_dset.write_array(dissipation_rate_array)
 
+        # calculate energy on GPU and store the result
+        streams.wrap_energy_calculation()
+        energy_array[:] = streams.mod_streams.energy
+        energy_dset.write_array(energy_array)
+
     # save dt information for every step
     dt_array[:] = streams.mod_streams.dtglobal
     dt_dset.write_array(dt_array)
diff --git a/src/dissipation.F90 b/src/dissipation.F90
index 95a41b3..6cf314c 100644
--- a/src/dissipation.F90
+++ b/src/dissipation.F90
@@ -123,3 +123,57 @@ subroutine dissipation_calculation()
 
     dissipation_rate = mpi_dissipation_sum
 endsubroutine dissipation_calculation
+
+subroutine energy_calculation()
+    use mod_streams
+
+    implicit none
+
+    integer :: i, j, k
+
+    real*8 :: uu, vv, ww, rho
+    real*8 :: dudx, dvdy, dwdz
+    real*8 :: dx, dy, dz
+    real*8 :: mpi_energy_sum
+
+    ! dissipation_rate is defined in mod_streams, we reset its value here
+    energy = 0.0
+
+    mpi_energy_sum = 0.0
+
+    ! w_gpu(:, :, :, 1) -> rho
+    ! w_gpu(:, :, :, 2) -> rho u
+    ! w_gpu(:, :, :, 3) -> rho v
+    ! w_gpu(:, :, :, 4) -> rho w
+    ! w_gpu(:, :, :, 5) -> rho E
+
+    !$cuf kernel do(3) <<<*,*>>> 
+    do k = 1,nz
+        do j = 1,ny
+            do i = 1,nx
+                rho = w_gpu(i, j, k, 1)
+                uu = w_gpu(i, j, k, 2) / rho
+                vv = w_gpu(i, j, k, 3) / rho
+                ww = w_gpu(i, j, k, 4) / rho
+
+                dx = (x_gpu(i-1) + x_gpu(i+1)) / 2
+                dy = (y_gpu(j-1) + y_gpu(j+1)) / 2
+                dz = (z_gpu(k-1) + z_gpu(k+1)) / 2
+
+                energy = energy + &
+                    (uu**2 + vv**2 + ww**2) * dx * dy * dz
+            enddo
+        enddo
+    enddo
+
+    energy = energy * 0.5
+
+    ! sum all the values across MPI, store the result in mpi_energy_sum 
+    call MPI_REDUCE(energy, mpi_energy_sum, 1, mpi_prec,  MPI_SUM, 0, MPI_COMM_WORLD, iermpi)
+
+    ! distribute mpi_energy_sum to all MPI procs
+    call MPI_BCAST(mpi_energy_sum, 1, mpi_prec, 0, MPI_COMM_WORLD, iermpi)
+
+    energy = mpi_energy_sum
+
+end subroutine energy_calculation
diff --git a/src/min_api.F90 b/src/min_api.F90
index 78d6783..969a933 100644
--- a/src/min_api.F90
+++ b/src/min_api.F90
@@ -90,3 +90,11 @@ subroutine wrap_dissipation_calculation()
 
     call dissipation_calculation()
 end subroutine
+
+! calculate energy. See inner subroutine documentation.
+! after calculation, the value is stored in `energy` in mod_streams
+subroutine wrap_energy_calculation()
+    implicit none
+
+    call energy_calculation()
+end subroutine
diff --git a/src/mod_streams.F90 b/src/mod_streams.F90
index a6ec0ea..922f753 100644
--- a/src/mod_streams.F90
+++ b/src/mod_streams.F90
@@ -87,6 +87,10 @@ module mod_streams
 !f2py real*8 :: dissipation_rate
  real(mykind) :: dissipation_rate
 
+ ! energy is calculated in dissipation.F90, subroutine energy_calculation
+!f2py real*8 :: energy
+ real(mykind) :: energy 
+
  integer :: iflow
  integer :: idiski, ndim
  integer :: istore, istore_restart 

From 627dedc424c2ee3aee9ff9d0241aee677370de94 Mon Sep 17 00:00:00 2001
From: VanillaBrooks <brooks@karlik.org>
Date: Mon, 14 Nov 2022 17:49:01 -0800
Subject: [PATCH 4/5] fix finite difference equations, still need irregular
 grid equations

---
 python/main.py      |  3 +++
 src/dissipation.F90 | 47 +++++++++++++++++++++++++++++----------------
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/python/main.py b/python/main.py
index 474a601..9cf634e 100755
--- a/python/main.py
+++ b/python/main.py
@@ -115,12 +115,15 @@
         streams.wrap_dissipation_calculation()
         dissipation_rate_array[:] = streams.mod_streams.dissipation_rate
         dissipation_rate_dset.write_array(dissipation_rate_array)
+        utils.hprint(f"dissipation is {dissipation_rate_array[0]}")
 
         # calculate energy on GPU and store the result
         streams.wrap_energy_calculation()
         energy_array[:] = streams.mod_streams.energy
         energy_dset.write_array(energy_array)
 
+        utils.hprint(f"energy is {energy_array[0]}")
+
     # save dt information for every step
     dt_array[:] = streams.mod_streams.dtglobal
     dt_dset.write_array(dt_array)
diff --git a/src/dissipation.F90 b/src/dissipation.F90
index 6cf314c..0c1183d 100644
--- a/src/dissipation.F90
+++ b/src/dissipation.F90
@@ -29,6 +29,27 @@ subroutine dissipation_calculation()
     do k = 1,nz
         do j = 1,ny
             do i = 1,nx
+                !
+                ! at i = 1 dx = 0 so the boundaries will be really wonky. 
+                ! the same happens for dy at j = 1, and for dz at k = 1
+                !
+                if (i == 1) then
+                    dx = (x_gpu(i+1) - x_gpu(i)) / 2
+                else
+                    dx = (x_gpu(i+1) - x_gpu(i-1)) / 2
+                endif
+
+                if (j == 1) then
+                    dy = (y_gpu(j+1) - y_gpu(j)) / 2
+                else
+                    dy = (y_gpu(j+1) - y_gpu(j-1)) / 2
+                endif
+
+                if (k == 1) then
+                    dz = (z_gpu(k+1) - z_gpu(k)) / 2
+                else
+                    dz = (z_gpu(k+1) - z_gpu(k-1)) / 2
+                endif
 
                 !
                 ! du/dx
@@ -50,9 +71,7 @@ subroutine dissipation_calculation()
                         w_gpu(i-2, j, k, 2) / w_gpu(i-2, j, k, 1) &
                     ) &
                 ) &
-                ! replace 12 * \Delta x by 6 * (2 \Delta x) since x can vary
-                ! with grid size (Without really looking at grid mesh generation
-                / ( 6 * ( x_gpu(i+1) - x_gpu(i-1)))
+                / ( 12 * dx)
 
                 !
                 ! dv/dy
@@ -74,9 +93,7 @@ subroutine dissipation_calculation()
                         w_gpu(i, j-2, k, 3) / w_gpu(i, j-2, k, 1) &
                     ) &
                 ) &
-                ! replace 12 * \Delta y by 6 * (2 \Delta y) since y can vary
-                ! with grid size (Without really looking at grid mesh generation
-                / ( 6 * ( y_gpu(j+1) - y_gpu(j-1)))
+                / ( 12 * dy)
 
                 !
                 ! dw/dz
@@ -98,16 +115,10 @@ subroutine dissipation_calculation()
                         w_gpu(i, j, k-2, 4) / w_gpu(i, j, k-2, 1) &
                     ) &
                 ) &
-                ! replace 12 * \Delta z by 6 * (2 \Delta z) since z can vary
-                ! with grid size (Without really looking at grid mesh generation
-                / ( 6 * ( z_gpu(k+1) - z_gpu(k-1)))
-
-                dx = (x_gpu(i-1) + x_gpu(i+1)) / 2
-                dy = (y_gpu(j-1) + y_gpu(j+1)) / 2
-                dz = (z_gpu(k-1) + z_gpu(k+1)) / 2
+                / ( 12 * dz)
 
                 dissipation_rate = dissipation_rate + &
-                    (dudx**2 + dvdy**2 + dwdz**2) * dx * dy * dz
+                    ((dudx**2 + dvdy**2 + dwdz**2) * dx * dy * dz)
             enddo
         enddo
     enddo
@@ -115,6 +126,8 @@ subroutine dissipation_calculation()
 
     dissipation_rate = dissipation_rate / (rlx * rly * rlz)
 
+    !write(*, *) "dissipation rate", dissipation_rate
+
     ! sum all the values across MPI, store the result in mpi_dissipation_sum
     call MPI_REDUCE(dissipation_rate, mpi_dissipation_sum, 1, mpi_prec,  MPI_SUM, 0, MPI_COMM_WORLD, iermpi)
 
@@ -156,9 +169,9 @@ subroutine energy_calculation()
                 vv = w_gpu(i, j, k, 3) / rho
                 ww = w_gpu(i, j, k, 4) / rho
 
-                dx = (x_gpu(i-1) + x_gpu(i+1)) / 2
-                dy = (y_gpu(j-1) + y_gpu(j+1)) / 2
-                dz = (z_gpu(k-1) + z_gpu(k+1)) / 2
+                dx = (x_gpu(i+1) - x_gpu(i-1)) / 2
+                dy = (y_gpu(j+1) - y_gpu(j-1)) / 2
+                dz = (z_gpu(k+1) - z_gpu(k-1)) / 2
 
                 energy = energy + &
                     (uu**2 + vv**2 + ww**2) * dx * dy * dz

From cbea0e77869f684129eec428f1f67b1b60af7006 Mon Sep 17 00:00:00 2001
From: VanillaBrooks <brooks@karlik.org>
Date: Fri, 18 Nov 2022 16:39:03 -0800
Subject: [PATCH 5/5] dissipation calculation in the y direction includes a FDM
 for irregular grids

---
 src/alloc.F90       |   5 ++
 src/dissipation.F90 | 159 +++++++++++++++++++++++++++++++++++++++-----
 src/mod_streams.F90 |  12 ++++
 src/setup.F90       |   1 +
 4 files changed, 159 insertions(+), 18 deletions(-)

diff --git a/src/alloc.F90 b/src/alloc.F90
index fce6196..d001422 100644
--- a/src/alloc.F90
+++ b/src/alloc.F90
@@ -215,6 +215,11 @@ subroutine allocate_vars()
  allocate(wrecycav_gpu(ng,ny,nv))
 
  allocate(tauw_x(1:nx))
+
+ allocate(fdm_y_stencil_gpu(5, ny))
+ allocate(fdm_y_stencil(5, ny))
+ allocate(fdm_individual_stencil(0:1, 0:4, 0:4))
+ allocate(fdm_grid_points(0:4))
 !
 endsubroutine allocate_vars
 
diff --git a/src/dissipation.F90 b/src/dissipation.F90
index 0c1183d..624ab06 100644
--- a/src/dissipation.F90
+++ b/src/dissipation.F90
@@ -76,24 +76,13 @@ subroutine dissipation_calculation()
                 !
                 ! dv/dy
                 !
-                dvdy = ( &
-                    ( &
-                        -1 * w_gpu(i, j+2, k, 3) / w_gpu(i, j+2, k, 1) &
-                    ) &
-                    + &
-                    ( &
-                        8 * w_gpu(i, j+1, k, 3) / w_gpu(i, j+1, k, 1) &
-                    ) &
-                    - &
-                    ( &
-                        8 * w_gpu(i, j-1, k, 3) / w_gpu(i, j-1, k, 1) &
-                    ) &
-                    + &
-                    ( &
-                        w_gpu(i, j-2, k, 3) / w_gpu(i, j-2, k, 1) &
-                    ) &
-                ) &
-                / ( 12 * dy)
+                ! fdm_y_stencil_gpu coeffs are calculated in generate_full_fdm_stencil() subroutine
+                dvdy = &
+                    (w_gpu(i, j, k, 3) / w_gpu(i, j, k, 1)) * fdm_y_stencil_gpu(1, j) + &
+                    (w_gpu(i, j+1, k, 3) / w_gpu(i, j+1, k, 1)) * fdm_y_stencil_gpu(2, j) + &
+                    (w_gpu(i, j-1, k, 3) / w_gpu(i, j-1, k, 1)) * fdm_y_stencil_gpu(3, j) + &
+                    (w_gpu(i, j+2, k, 3) / w_gpu(i, j+2, k, 1)) * fdm_y_stencil_gpu(4, j) + &
+                    (w_gpu(i, j-2, k, 3) / w_gpu(i, j-2, k, 1)) * fdm_y_stencil_gpu(5, j)
 
                 !
                 ! dw/dz
@@ -137,6 +126,140 @@ subroutine dissipation_calculation()
     dissipation_rate = mpi_dissipation_sum
 endsubroutine dissipation_calculation
 
+subroutine generate_full_fdm_stencil()
+    use mod_streams, only: delta => fdm_individual_stencil, alpha => fdm_grid_points, &
+        ny, fdm_y_stencil, fdm_y_stencil_gpu, masterproc, y
+
+    implicit none
+
+    integer :: j, i
+    real*8 :: y0, ym1, ym2, yp1, yp2, total
+    total = 0.0
+
+    if (masterproc) write(*,*) "generating FDM stencil for dissipation"
+
+    do j = 1,ny
+        ym2 = y(j-2)
+        ym1 = y(j-1)
+        y0  = y(j)
+        yp1 = y(j+1)
+        yp2 = y(j+2)
+
+        ! j
+        alpha(0) = y0
+        ! j+1
+        alpha(1) = yp1
+        ! j-1
+        alpha(2) = ym1
+        ! j+2
+        alpha(3) = yp2
+        ! j-2
+        alpha(4) = ym2
+
+        ! use alpha to compute the stencil for this set of points
+        call irregular_grid_stencil()
+        fdm_y_stencil(:, j) = delta(1, 4, :)
+
+        do i = 1,5
+            total = total + abs(fdm_y_stencil(i, j))
+        enddo
+    enddo
+
+    write(*,*) "total of stencil coefficients is", total
+
+! copy over to GPU variable
+#ifdef USE_CUDA
+    fdm_y_stencil_gpu = fdm_y_stencil
+#else
+    call move_alloc(fdm_y_stencil, fdm_y_stencil_gpu)
+#endif
+
+endsubroutine
+
+! fetches the grid points from `fdm_grid_points` and operates on `fdm_individual_stencil_gpu` 
+! to perform the algorithm from
+! Generation of Finite Difference Formulas on Arbitrarily Spaced Grids (Fornberg, 1988)
+! to generate a stencil that we can use for the finite difference calculation
+subroutine irregular_grid_stencil()
+    use mod_streams, only: delta => fdm_individual_stencil, alpha => fdm_grid_points
+
+    implicit none
+
+    integer :: M0, N0, v, m, n
+    real*8 :: c1, c2, c3, t1, t2, t3, x0
+
+    M0 = 1
+    N0 = 4
+
+    delta(:, :, :) = 0.0
+    delta(0, 0, 0) = 1.0
+    c1 = 1.0
+
+    x0 = alpha(0)
+
+    do n = 1,N0
+        c2 = 1.0
+
+        do v = 0,n-1
+            c3 = alpha(n) - alpha(v)
+            c2 = c2 * c3
+
+            do m = 0, min(n,M0)
+                t3 = delta(m, n-1, v)
+                t1 = (alpha(n) - x0) * t3
+
+                if (m == 0) then
+                    t2 = 0.0
+                else
+                    t2 = m * delta(m-1, n-1, v)
+                endif
+
+                delta(m, n, v)  = (t1 -t2) / c3
+            enddo
+        enddo
+
+        do m = 0, min(n,M0)
+            if (m == 0) then
+                t1 = 0.0
+            else 
+                t1 = m * delta(m-1, n-1, n-1)
+            endif
+
+            t3 = delta(m, n-1, n-1)
+            t2 = (alpha(n-1) - x0) * t3
+
+            delta(m, n, n) = (c1 / c2) * (t1 - t2)
+        enddo
+
+        c1 = c2
+    enddo
+
+end subroutine
+
+! calculate the first order derivative with 4th order error for a simple grid
+subroutine validate_irregular_fdm()
+    use mod_streams
+
+    fdm_grid_points(0) = 0.0
+    fdm_grid_points(1) = 1.
+    fdm_grid_points(2) = -1.
+    fdm_grid_points(3) = 2.
+    fdm_grid_points(4) = -2.
+
+    call irregular_grid_stencil()
+    ! should output 
+    ! -0.0          (0)
+    ! 0.666667      (2/3)
+    ! -0.666667     (-2/3)
+    ! -0.0833333    (-1/12)
+    ! 0.0833333     (1/12)
+#ifdef USE_CUDA
+#else
+    ! can only print in CPU mode
+    write(*,*) fdm_individual_stencil(1, 4, :)
+#endif
+endsubroutine
+
 subroutine energy_calculation()
     use mod_streams
 
diff --git a/src/mod_streams.F90 b/src/mod_streams.F90
index deffec2..83ff3ac 100644
--- a/src/mod_streams.F90
+++ b/src/mod_streams.F90
@@ -87,6 +87,16 @@ module mod_streams
 !f2py real*8 :: dissipation_rate
  real(mykind) :: dissipation_rate
 
+ ! stencil for computing finite differences on the irregular y mesh
+!f2py real*8, dimension(:, :), allocatable :: fdm_y_stencil
+ real(mykind), dimension(:, :), allocatable :: fdm_y_stencil
+ real(mykind), dimension(:, :), allocatable :: fdm_y_stencil_gpu
+ ! the array we can re-use for calculating a 4th order accurate approximation for the first derivative
+ ! according to 
+ ! Generation of Finite Difference Formulas on Arbitrarily Spaced Grids (Fornberg, 1988)
+ real(mykind), dimension(:, :, :), allocatable :: fdm_individual_stencil
+ real(mykind), dimension(:), allocatable :: fdm_grid_points
+
  ! energy is calculated in dissipation.F90, subroutine energy_calculation
 !f2py real*8 :: energy
  real(mykind) :: energy 
@@ -241,6 +251,8 @@ module mod_streams
  attributes(device) :: fhat_trans_gpu, fl_trans_gpu
  attributes(device) :: temperature_trans_gpu
  attributes(device) :: wv_gpu, wv_trans_gpu
+
+ attributes(device) :: fdm_y_stencil_gpu
 !
  ! TODO: this might be problematic
  !f2py intent(hide) :: local_comm, mydev
diff --git a/src/setup.F90 b/src/setup.F90
index 4a27395..4da8d88 100644
--- a/src/setup.F90
+++ b/src/setup.F90
@@ -30,6 +30,7 @@ subroutine setup
  if (masterproc) write(*,*) 'Initialize field'
  call init()
  call checkdt()
+ call generate_full_fdm_stencil()
 !===================================================
 !
 end subroutine setup