Commit cf63c32

Merge pull request #126 from adityapb/cpy-jit

Add local memory CUDA support to Kernel

prabhuramachandran authored Oct 30, 2018
2 parents 2a448c4 + 21e79e1 commit cf63c32
Showing 8 changed files with 187 additions and 23 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
@@ -20,9 +20,9 @@ install:
- conda update -q conda
- conda info -a
- conda config --add channels conda-forge
- conda config --add channels anaconda
- conda config --add channels defaults
- conda install -c conda-forge pocl pyopencl
- conda install -c anaconda virtualenv
- conda install -c defaults virtualenv
- python -c 'import pyopencl as cl'
- pip install beaker tox tox-travis

3 changes: 2 additions & 1 deletion examples/cpy/vm_kernel.py
@@ -113,5 +113,6 @@ def run(nv, backend):
default=10000, help='Number of particles.')
o = p.parse_args()
get_config().use_double = o.use_double
assert o.backend in ['opencl'], "Only OpenCL backend is supported."
assert o.backend in ['opencl', 'cuda'], ("Only OpenCL/CUDA backend is "
"supported.")
run(o.n, o.backend)
34 changes: 29 additions & 5 deletions pysph/cpy/low_level.py
@@ -39,6 +39,7 @@ class LocalMem(object):
Note that this is basically ``sizeof(double) * 128 * 2``
'''

def __init__(self, size, backend=None):
'''
Constructor
@@ -66,7 +67,7 @@ def get(self, c_type, workgroup_size):
elif self.backend == 'opencl':
import pyopencl as cl
dtype = ctype_to_dtype(c_type)
sz = dtype().itemsize
sz = dtype.itemsize
mem = cl.LocalMemory(sz * self.size * workgroup_size)
self._cache[key] = mem
return mem
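The one-word fix above suggests that `ctype_to_dtype` returns a `numpy.dtype` instance, which exposes `itemsize` as an attribute and is not callable, so the old `dtype().itemsize` spelling would now fail. A minimal sketch of the sizing arithmetic, assuming `ctype_to_dtype('double')` maps to `numpy.dtype('float64')`:

```python
import numpy as np

dt = np.dtype('float64')  # assumed result of ctype_to_dtype('double')
assert dt.itemsize == 8   # dt() would raise TypeError: dtypes are not callable

# LocalMem(2).get('double', 128) would thus reserve
# 8 * 2 * 128 bytes -- the ``sizeof(double) * 128 * 2`` of the docstring.
print(dt.itemsize * 2 * 128)  # 2048
```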
@@ -104,7 +105,7 @@ def splay_cl(queue, n, kernel_specific_max_wg_size=None):
group_count = (n + max_work_items - 1) // max_work_items
work_items_per_group = max_work_items

return (group_count*work_items_per_group,), (work_items_per_group,)
return (group_count * work_items_per_group,), (work_items_per_group,)
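As a worked instance of the splay above, suppose the device caps this kernel at 128 work items:

```python
n, max_work_items = 1000, 128
group_count = (n + max_work_items - 1) // max_work_items  # ceil(1000/128) == 8
global_size = (group_count * max_work_items,)             # (1024,)
local_size = (max_work_items,)                            # (128,)
# The 24 trailing work items have global ids >= n, so the kernel
# body must guard against them.
```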


class Kernel(object):
@@ -124,6 +125,7 @@ class Kernel(object):
type checking of the passed constants.
"""

def __init__(self, func, backend='opencl'):
backend = get_backend(backend)
if backend == 'cython':
@@ -133,7 +135,9 @@ def __init__(self, func, backend='opencl'):
elif backend == 'opencl':
from .opencl import get_queue
self.queue = get_queue()

elif backend == 'cuda':
from .cuda import set_context
set_context()
self.tp = Transpiler(backend=backend)
self.backend = backend
self.name = func.__name__
@@ -155,19 +159,33 @@ def _get_func_info(self):
)

arg_info = []
local_info = {}
for arg in argspec.args:
kt = annotations[arg]
if not self._use_double:
kt = KnownType(
self._to_float(kt.type), self._to_float(kt.base_type)
)
if 'LOCAL_MEM' in kt.type:
local_info[arg] = kt.base_type
arg_info.append((arg, kt))
func_info = {
'args': arg_info,
'local_info': local_info,
'return': annotations.get('return', KnownType('void'))
}
return func_info
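For the CUDA test kernel added later in this commit (`x` and `y` annotated `gdoublep`, `xc` `ldoublep`, `a` `float`), the resulting `func_info` looks roughly like the sketch below. The exact `KnownType` strings are an assumption here; the point is that `local_info` collects exactly the `LOCAL_MEM` arguments and their base types:

```python
func_info = {
    'args': [('x',  KnownType('GLOBAL_MEM double*')),
             ('y',  KnownType('GLOBAL_MEM double*')),
             ('xc', KnownType('LOCAL_MEM double*')),
             ('a',  KnownType('float'))],
    'local_info': {'xc': 'double'},
    'return': KnownType('void'),
}
```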

def _get_local_size(self, args, workgroup_size):
local_info = self._func_info['local_info']
arg_info = self._func_info['args']
total_size = 0
for arg, a_info in zip(args, arg_info):
if isinstance(arg, LocalMem):
dtype = ctype_to_dtype(local_info[a_info[0]])
total_size += dtype.itemsize
return workgroup_size * total_size
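For example, one `ldoublep` plus one `lintp` local argument with 128-thread workgroups would request:

```python
workgroup_size = 128
total_size = 8 + 4                             # sizeof(double) + sizeof(int)
shared_mem_size = workgroup_size * total_size  # 1536 bytes
```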

def _generate(self):
self.tp.add(self.func)
self._correct_opencl_address_space()
@@ -198,7 +216,10 @@ def _massage_arg(self, x, type_info, workgroup_size):
elif self.backend == 'cuda':
return x.dev
elif isinstance(x, LocalMem):
return x.get(type_info.base_type, workgroup_size)
if self.backend == 'opencl':
return x.get(type_info.base_type, workgroup_size)
elif self.backend == 'cuda':
return np.array(workgroup_size, dtype=np.int32)
else:
dtype = ctype_to_dtype(type_info.type)
return np.array([x], dtype=dtype)
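On CUDA a `LocalMem` argument is not passed as a pointer at all: as the `test_cuda_local_conversion` expectations below show, the converter turns each local argument into an `int size_<name>` parameter and carves the arrays out of a single `extern LOCAL_MEM` buffer, so `_massage_arg` only has to supply the per-workgroup element count, i.e. the workgroup size, as an `int32`.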
@@ -230,6 +251,8 @@ def __call__(self, *args, **kw):
gs = (global_size, )
else:
gs, ls = self._get_workgroup_size(n)
if self.backend == 'cuda':
shared_mem_size = self._get_local_size(args, ls[0])
c_args = self._get_args(args, ls[0])
if self.backend == 'opencl':
prepend = [self.queue, gs, ls]
@@ -239,7 +262,8 @@
elif self.backend == 'cuda':
num_blocks = int((n + ls[0] - 1) / ls[0])
num_tpb = ls[0]
self.knl(*c_args, block=(num_tpb, 1, 1), grid=(num_blocks, 1))
self.knl(*c_args, block=(num_tpb, 1, 1), grid=(num_blocks, 1),
shared=shared_mem_size)
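For readers unfamiliar with the `shared=` keyword used in this launch: PyCUDA sizes an `extern __shared__` array at launch time from that argument. A self-contained sketch of the same pattern outside cpy (the kernel name `scale` and all values are illustrative, not part of this commit):

```python
import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a default CUDA context
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void scale(double *x, double *y, double a)
{
    extern __shared__ double xc[];  // sized at launch via ``shared=``
    int lid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + lid;
    xc[lid] = x[i];
    __syncthreads();
    y[i] = xc[lid] * a;
}
""")

scale = mod.get_function("scale")
x = np.linspace(0, 1, 1024)
y = np.zeros_like(x)
block = 128
scale(drv.In(x), drv.Out(y), np.float64(2.0),
      block=(block, 1, 1), grid=(x.size // block, 1),
      shared=block * x.dtype.itemsize)  # bytes of dynamic shared memory
assert np.allclose(y, 2.0 * x)
```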


class _prange(Extern):
33 changes: 32 additions & 1 deletion pysph/cpy/tests/test_low_level.py
@@ -61,7 +61,7 @@ def knl(x, y, a, size):
y.pull()
self.assertTrue(np.allclose(y.data, x.data * a))

def test_kernel_with_local_memory(self):
def test_kernel_with_local_memory_opencl(self):
importorskip('pyopencl')

# Given
@@ -92,6 +92,37 @@ def knl(x, y, xc, a):
y.pull()
self.assertTrue(np.allclose(y.data, x.data * a))

def test_kernel_with_local_memory_cuda(self):
importorskip('pycuda')

# Given
@annotate(gdoublep='x, y', xc='ldoublep', a='float')
def knl(x, y, xc, a):
i, lid = declare('int', 2)
lid = LID_0
i = GID_0 * LDIM_0 + lid

xc[lid] = x[i]

local_barrier()

y[i] = xc[lid] * a

x = np.linspace(0, 1, 1024)
y = np.zeros_like(x)
xc = LocalMem(1, backend='cuda')

x, y = wrap(x, y, backend='cuda')

# When
k = Kernel(knl, backend='cuda')
a = 21.0
k(x, y, xc, a)

# Then
y.pull()
self.assertTrue(np.allclose(y.data, x.data * a))
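Note that apart from `backend='cuda'`, the kernel body and the call site are identical to the OpenCL test above; the backend differences are absorbed by `_massage_arg` and by the `shared=` argument at launch time.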


@annotate(double='x, y, a', return_='double')
def func(x, y, a):
50 changes: 47 additions & 3 deletions pysph/cpy/tests/test_translator.py
@@ -7,7 +7,7 @@
from ..types import annotate, declare
from ..translator import (
CConverter, CodeGenerationError, CStructHelper, KnownType,
OpenCLConverter, py2c
OpenCLConverter, CUDAConverter, py2c
)


@@ -1164,7 +1164,7 @@ def not_me(self):
assert result.strip() == expect.strip()


def test_opencl_conversion():
def check_opencl_cuda_conversion(converter_obj):
# Note that LID_0 etc. are predefined symbols when we include the CLUDA
# preamble and should therefore be known.
src = dedent('''
@@ -1174,7 +1174,7 @@ def f(s_idx, s_p, d_idx, d_p, J=0, t=0.0, l=[0,0], xx=(0, 0)):

# When
known_types = {'d_p': KnownType('GLOBAL_MEM int*')}
converter = OpenCLConverter(known_types=known_types)
converter = converter_obj(known_types=known_types)
code = converter.convert(src)

# Then
@@ -1188,6 +1188,14 @@ def f(s_idx, s_p, d_idx, d_p, J=0, t=0.0, l=[0,0], xx=(0, 0)):
assert code.strip() == expect.strip()


def test_cuda_conversion():
check_opencl_cuda_conversion(CUDAConverter)


def test_opencl_conversion():
check_opencl_cuda_conversion(OpenCLConverter)


def test_opencl_class():
src = dedent('''
class Foo(object):
@@ -1209,6 +1217,42 @@ def g(self, x=0.0):
assert code.strip() == expect.strip()


def test_cuda_local_conversion():
@annotate(xc='ldoublep', yc='lintp')
def knl(xc, yc):
xc[LID_0] = 1
yc[LID_0] = 1

# When
converter = CUDAConverter()
code = converter.parse(knl)

# Then
expect_1 = dedent('''
WITHIN_KERNEL void knl(int size_xc, int size_yc)
{
extern LOCAL_MEM float shared_buff[];
double* xc = (double*) shared_buff;
int* yc = (int*) &xc[size_xc];
xc[LID_0] = 1;
yc[LID_0] = 1;
}
''')

expect_2 = dedent('''
WITHIN_KERNEL void knl(int size_xc, int size_yc)
{
extern LOCAL_MEM float shared_buff[];
int* yc = (int*) shared_buff;
double* xc = (double*) &yc[size_yc];
xc[LID_0] = 1;
yc[LID_0] = 1;
}
''')

assert code.strip() == expect_1.strip() or code.strip() == expect_2.strip()
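Two acceptable outputs are asserted because `local_info` is a plain dict, whose iteration order is not guaranteed on the Pythons supported at the time; either packing order of `shared_buff` is valid as long as each pointer is offset past the arrays declared before it.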


def test_handles_parsing_functions():
# Given
def f(x=1.0):
(Diff truncated: the remaining three changed files are not shown.)
