Skip to content

Commit

Permalink
Add cupy and pycuda kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
jfowkes committed Jan 13, 2025
1 parent 5186fd9 commit 0f20f57
Show file tree
Hide file tree
Showing 2 changed files with 203 additions and 0 deletions.
110 changes: 110 additions & 0 deletions ptypy/accelerate/cuda_cupy/kernels.py
Original file line number Diff line number Diff line change
Expand Up @@ -909,6 +909,18 @@ def __init__(self, queue_thread=None,
'MATH_TYPE': self.math_type
})
self.pr_update2_ML_cuda = None
self.ob_update_ML_wavefield_cuda = load_kernel("ob_update_ML_wavefield", {
'IN_TYPE': 'float',
'OUT_TYPE': 'float',
'MATH_TYPE': self.math_type
})
self.ob_update2_ML_wavefield_cuda = None
self.pr_update_ML_wavefield_cuda = load_kernel("pr_update_ML_wavefield", {
'IN_TYPE': 'float',
'OUT_TYPE': 'float',
'MATH_TYPE': self.math_type
})
self.pr_update2_ML_wavefield_cuda = None
self.ob_update_local_cuda = load_kernel("ob_update_local", {
'IN_TYPE': 'float',
'OUT_TYPE': 'float',
Expand Down Expand Up @@ -1096,6 +1108,58 @@ def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True):
np.float32(
fac) if ex.dtype == np.complex64 else np.float64(fac)))

def ob_update_ML_wavefield(self, addr, ob, of, pr, ex, fac=2.0, atomics=True):
obsh = [np.int32(ax) for ax in ob.shape]
ofsh = [np.int32(ax) for ax in of.shape]
prsh = [np.int32(ax) for ax in pr.shape]
exsh = [np.int32(ax) for ax in ex.shape]

if self.queue is not None:
self.queue.use()
if atomics:
if addr.shape[3] != 3 or addr.shape[2] != 5:
raise ValueError(
'Address not in required shape for tiled ob_update')

num_pods = np.int32(addr.shape[0] * addr.shape[1])
self.ob_update_ML_wavefield_cuda(grid=(int(num_pods), 1, 1),
block=(32, 32, 1),
args=(ex, num_pods, exsh[1], exsh[2],
pr, prsh[0], prsh[1], prsh[2],
ob, obsh[0], obsh[1], obsh[2],
of, ofsh[0], ofsh[1], ofsh[2],
addr,
np.float32(
fac) if ex.dtype == np.complex64 else np.float64(fac)))
else:
if addr.shape[0] != 5 or addr.shape[1] != 3:
raise ValueError(
'Address not in required shape for tiled ob_update')

num_pods = np.int32(addr.shape[2] * addr.shape[3])
if not self.ob_update2_ML_wavefield_cuda:
self.ob_update2_ML_wavefield_cuda = load_kernel("ob_update2_ML_wavefield", {
"NUM_MODES": obsh[0],
"BDIM_X": 16,
"BDIM_Y": 16,
'IN_TYPE': 'float',
'OUT_TYPE': 'float',
'MATH_TYPE': self.math_type,
'ACC_TYPE': self.accumulator_type
})
grid = [int((x+15)//16) for x in ob.shape[-2:]]
grid = (grid[1], grid[0], int(1))
self.ob_update2_ML_wavefield_cuda(grid=grid,
block=(16, 16, 1),
args=(prsh[-1], obsh[0], num_pods, obsh[-2], obsh[-1],
prsh[0],
np.int32(ex.shape[0]),
np.int32(ex.shape[1]),
np.int32(ex.shape[2]),
ob, of, pr, ex, addr,
np.float32(
fac) if ex.dtype == np.complex64 else np.float64(fac)))

def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False):
obsh = [np.int32(ax) for ax in ob.shape]
prsh = [np.int32(ax) for ax in pr.shape]
Expand Down Expand Up @@ -1140,6 +1204,52 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False):
np.float32(
fac) if ex.dtype == np.complex64 else np.float64(fac)))

def pr_update_ML_wavefield(self, addr, pr, pf, ob, ex, fac=2.0, atomics=False):
obsh = [np.int32(ax) for ax in ob.shape]
prsh = [np.int32(ax) for ax in pr.shape]
pfsh = [np.int32(ax) for ax in pf.shape]
if self.queue is not None:
self.queue.use()
if atomics:
if addr.shape[3] != 3 or addr.shape[2] != 5:
raise ValueError(
'Address not in required shape for tiled pr_update')
num_pods = np.int32(addr.shape[0] * addr.shape[1])
self.pr_update_ML_wavefield_cuda(grid=(int(num_pods), 1, 1),
block=(32, 32, 1),
args=(ex, num_pods, prsh[1], prsh[2],
pr, prsh[0], prsh[1], prsh[2],
pf, pfsh[0], pfsh[1], pfsh[2],
ob, obsh[0], obsh[1], obsh[2],
addr,
np.float32(
fac) if ex.dtype == np.complex64 else np.float64(fac)))
else:
if addr.shape[0] != 5 or addr.shape[1] != 3:
raise ValueError(
'Address not in required shape for tiled pr_update')
num_pods = np.int32(addr.shape[2] * addr.shape[3])
if not self.pr_update2_ML_wavefield_cuda:
self.pr_update2_ML_wavefield_cuda = load_kernel("pr_update2_ML_wavefield", {
"NUM_MODES": prsh[0],
"BDIM_X": 16,
"BDIM_Y": 16,
'IN_TYPE': 'float',
'OUT_TYPE': 'float',
'MATH_TYPE': self.math_type,
'ACC_TYPE': self.accumulator_type
})

grid = [int((x+15)//16) for x in pr.shape[-2:]]
grid = (grid[0], grid[1], int(1))
self.pr_update2_ML_wavefield_cuda(grid=grid,
block=(16, 16, 1),
args=(prsh[-1], obsh[-2], obsh[-1],
prsh[0], obsh[0], num_pods,
pr, pf, ob, ex, addr,
np.float32(
fac) if ex.dtype == np.complex64 else np.float64(fac)))

def ob_update_local(self, addr, ob, pr, ex, aux, prn, a=0., b=1.):
if self.queue is not None:
self.queue.use()
Expand Down
93 changes: 93 additions & 0 deletions ptypy/accelerate/cuda_pycuda/kernels.py
Original file line number Diff line number Diff line change
Expand Up @@ -880,6 +880,18 @@ def __init__(self, queue_thread=None,
'MATH_TYPE': self.math_type
})
self.pr_update2_ML_cuda = None
self.ob_update_ML_wavefield_cuda = load_kernel("ob_update_ML_wavefield", {
'IN_TYPE': 'float',
'OUT_TYPE': 'float',
'MATH_TYPE': self.math_type
})
self.ob_update2_ML_wavefield_cuda = None
self.pr_update_ML_wavefield_cuda = load_kernel("pr_update_ML_wavefield", {
'IN_TYPE': 'float',
'OUT_TYPE': 'float',
'MATH_TYPE': self.math_type
})
self.pr_update2_ML_wavefield_cuda = None
self.ob_update_local_cuda = load_kernel("ob_update_local", {
'IN_TYPE': 'float',
'OUT_TYPE': 'float',
Expand Down Expand Up @@ -1045,6 +1057,50 @@ def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True):
np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
block=(16, 16, 1), grid=grid, stream=self.queue)

def ob_update_ML_wavefield(self, addr, ob, of, pr, ex, fac=2.0, atomics=True):
obsh = [np.int32(ax) for ax in ob.shape]
ofsh = [np.int32(ax) for ax in of.shape]
prsh = [np.int32(ax) for ax in pr.shape]
exsh = [np.int32(ax) for ax in ex.shape]

if atomics:
if addr.shape[3] != 3 or addr.shape[2] != 5:
raise ValueError('Address not in required shape for tiled ob_update')

num_pods = np.int32(addr.shape[0] * addr.shape[1])
self.ob_update_ML_wavefield_cuda(ex, num_pods, exsh[1], exsh[2],
pr, prsh[0], prsh[1], prsh[2],
ob, obsh[0], obsh[1], obsh[2],
of, ofsh[0], ofsh[1], ofsh[2],
addr,
np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
block=(32, 32, 1), grid=(int(num_pods), 1, 1), stream=self.queue)
else:
if addr.shape[0] != 5 or addr.shape[1] != 3:
raise ValueError('Address not in required shape for tiled ob_update')

num_pods = np.int32(addr.shape[2] * addr.shape[3])
if not self.ob_update2_ML_wavefield_cuda:
self.ob_update2_ML_wavefield_cuda = load_kernel("ob_update2_ML_wavefield", {
"NUM_MODES": obsh[0],
"BDIM_X": 16,
"BDIM_Y": 16,
'IN_TYPE': 'float',
'OUT_TYPE': 'float',
'MATH_TYPE': self.math_type,
'ACC_TYPE': self.accumulator_type
})
grid = [int((x+15)//16) for x in ob.shape[-2:]]
grid = (grid[1], grid[0], int(1))
self.ob_update2_ML_wavefield_cuda(prsh[-1], obsh[0], num_pods, obsh[-2], obsh[-1],
prsh[0],
np.int32(ex.shape[0]),
np.int32(ex.shape[1]),
np.int32(ex.shape[2]),
ob, of, pr, ex, addr,
np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
block=(16, 16, 1), grid=grid, stream=self.queue)

def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False):
obsh = [np.int32(ax) for ax in ob.shape]
prsh = [np.int32(ax) for ax in pr.shape]
Expand Down Expand Up @@ -1081,6 +1137,43 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False):
np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
block=(16, 16, 1), grid=grid, stream=self.queue)

def pr_update_ML_wavefield(self, addr, pr, pf, ob, ex, fac=2.0, atomics=False):
obsh = [np.int32(ax) for ax in ob.shape]
prsh = [np.int32(ax) for ax in pr.shape]
pfsh = [np.int32(ax) for ax in pf.shape]
if atomics:
if addr.shape[3] != 3 or addr.shape[2] != 5:
raise ValueError('Address not in required shape for tiled pr_update')
num_pods = np.int32(addr.shape[0] * addr.shape[1])
self.pr_update_ML_wavefield_cuda(ex, num_pods, prsh[1], prsh[2],
pr, prsh[0], prsh[1], prsh[2],
pf, pfsh[0], pfsh[1], pfsh[2],
ob, obsh[0], obsh[1], obsh[2],
addr,
np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
block=(32, 32, 1), grid=(int(num_pods), 1, 1), stream=self.queue)
else:
if addr.shape[0] != 5 or addr.shape[1] != 3:
raise ValueError('Address not in required shape for tiled pr_update')
num_pods = np.int32(addr.shape[2] * addr.shape[3])
if not self.pr_update2_ML_wavefield_cuda:
self.pr_update2_ML_wavefield_cuda = load_kernel("pr_update2_ML_wavefield", {
"NUM_MODES": prsh[0],
"BDIM_X": 16,
"BDIM_Y": 16,
'IN_TYPE': 'float',
'OUT_TYPE': 'float',
'MATH_TYPE': self.math_type,
'ACC_TYPE': self.accumulator_type
})

grid = [int((x+15)//16) for x in pr.shape[-2:]]
grid = (grid[0], grid[1], int(1))
self.pr_update2_ML_wavefield_cuda(prsh[-1], obsh[-2], obsh[-1],
prsh[0], obsh[0], num_pods,
pr, pf, ob, ex, addr,
np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
block=(16, 16, 1), grid=grid, stream=self.queue)

def ob_update_local(self, addr, ob, pr, ex, aux, prn, a=0., b=1.):
prn_max = gpuarray.max(prn, stream=self.queue)
Expand Down

0 comments on commit 0f20f57

Please sign in to comment.