Add local memory CUDA support to Kernel #126

Merged: 3 commits, Oct 30, 2018
4 changes: 2 additions & 2 deletions .travis.yml
@@ -20,9 +20,9 @@ install:
- conda update -q conda
- conda info -a
- conda config --add channels conda-forge
- conda config --add channels anaconda
- conda config --add channels defaults
- conda install -c conda-forge pocl pyopencl
- conda install -c anaconda virtualenv
- conda install -c defaults virtualenv
- python -c 'import pyopencl as cl'
- pip install beaker tox tox-travis

3 changes: 2 additions & 1 deletion examples/cpy/vm_kernel.py
@@ -113,5 +113,6 @@ def run(nv, backend):
default=10000, help='Number of particles.')
o = p.parse_args()
get_config().use_double = o.use_double
assert o.backend in ['opencl'], "Only OpenCL backend is supported."
assert o.backend in ['opencl', 'cuda'], ("Only OpenCL/CUDA backend is "
"supported.")
run(o.n, o.backend)
34 changes: 29 additions & 5 deletions pysph/cpy/low_level.py
@@ -39,6 +39,7 @@ class LocalMem(object):
Note that this is basically ``sizeof(double) * 128 * 2``
'''

def __init__(self, size, backend=None):
'''
Constructor
@@ -66,7 +67,7 @@ def get(self, c_type, workgroup_size):
elif self.backend == 'opencl':
import pyopencl as cl
dtype = ctype_to_dtype(c_type)
sz = dtype().itemsize
sz = dtype.itemsize
mem = cl.LocalMemory(sz * self.size * workgroup_size)
self._cache[key] = mem
return mem
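
For reference, the local memory request computed in get() above is just itemsize(c_type) * size * workgroup_size bytes per workgroup. A minimal standalone sketch of that arithmetic (NumPy only; the helper name is illustrative and not part of the patch):

import numpy as np

def local_mem_bytes(c_type, size, workgroup_size):
    # One slot of `size` elements of `c_type` per work item in the group.
    return np.dtype(c_type).itemsize * size * workgroup_size

# LocalMem(2) used with doubles and a workgroup of 128 work items, i.e. the
# ``sizeof(double) * 128 * 2`` case mentioned in the class docstring above:
assert local_mem_bytes('float64', 2, 128) == 2048
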
@@ -104,7 +105,7 @@ def splay_cl(queue, n, kernel_specific_max_wg_size=None):
group_count = (n + max_work_items - 1) // max_work_items
work_items_per_group = max_work_items

return (group_count*work_items_per_group,), (work_items_per_group,)
return (group_count * work_items_per_group,), (work_items_per_group,)
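
A quick worked example of the splay above (plain Python, illustrative values): with n = 1000 elements and max_work_items = 128, the global size is rounded up to the next whole multiple of the workgroup size.

# Rounding n up to a whole number of workgroups, as splay_cl does.
n, max_work_items = 1000, 128
group_count = (n + max_work_items - 1) // max_work_items    # 8
global_size = (group_count * max_work_items,)               # (1024,)
local_size = (max_work_items,)                              # (128,)
assert global_size == (1024,) and local_size == (128,)
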


class Kernel(object):
@@ -124,6 +125,7 @@ class Kernel(object):
type checking of the passed constants.
"""

def __init__(self, func, backend='opencl'):
backend = get_backend(backend)
if backend == 'cython':
@@ -133,7 +135,9 @@ def __init__(self, func, backend='opencl'):
elif backend == 'opencl':
from .opencl import get_queue
self.queue = get_queue()

elif backend == 'cuda':
from .cuda import set_context
set_context()
self.tp = Transpiler(backend=backend)
self.backend = backend
self.name = func.__name__
@@ -155,19 +159,33 @@ def _get_func_info(self):
)

arg_info = []
local_info = {}
for arg in argspec.args:
kt = annotations[arg]
if not self._use_double:
kt = KnownType(
self._to_float(kt.type), self._to_float(kt.base_type)
)
if 'LOCAL_MEM' in kt.type:
local_info[arg] = kt.base_type
arg_info.append((arg, kt))
func_info = {
'args': arg_info,
'local_info': local_info,
'return': annotations.get('return', KnownType('void'))
}
return func_info

def _get_local_size(self, args, workgroup_size):
local_info = self._func_info['local_info']
arg_info = self._func_info['args']
total_size = 0
for arg, a_info in zip(args, arg_info):
if isinstance(arg, LocalMem):
dtype = ctype_to_dtype(local_info[a_info[0]])
total_size += dtype.itemsize
return workgroup_size * total_size
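
The CUDA backend needs a single byte count for dynamically allocated shared memory, so _get_local_size sums the item sizes of every LocalMem argument's base type and multiplies by the workgroup size. A standalone sketch of the same calculation (the function name and dtype strings are illustrative):

import numpy as np

def shared_bytes(local_arg_dtypes, workgroup_size):
    # One element of each local argument for every thread in the block.
    return workgroup_size * sum(np.dtype(t).itemsize for t in local_arg_dtypes)

# One 'ldoublep' and one 'lintp' argument with 128 threads per block:
assert shared_bytes(['float64', 'int32'], 128) == 128 * (8 + 4)
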

def _generate(self):
self.tp.add(self.func)
self._correct_opencl_address_space()
@@ -198,7 +216,10 @@ def _massage_arg(self, x, type_info, workgroup_size):
elif self.backend == 'cuda':
return x.dev
elif isinstance(x, LocalMem):
return x.get(type_info.base_type, workgroup_size)
if self.backend == 'opencl':
return x.get(type_info.base_type, workgroup_size)
elif self.backend == 'cuda':
return np.array(workgroup_size, dtype=np.int32)
else:
dtype = ctype_to_dtype(type_info.type)
return np.array([x], dtype=dtype)
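
A note on the CUDA branch above (a reading of the patch, not part of it): CUDA's dynamically sized shared memory is a single extern array declared inside the kernel rather than a per-argument memory object, so there is nothing to pass for the LocalMem argument itself. Instead the workgroup size is passed as an int32, which the generated kernel presumably consumes as the size_<arg> parameter used to carve typed pointers out of the shared buffer (see the CUDAConverter test further down).
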
@@ -230,6 +251,8 @@ def __call__(self, *args, **kw):
gs = (global_size, )
else:
gs, ls = self._get_workgroup_size(n)
if self.backend == 'cuda':
shared_mem_size = self._get_local_size(args, ls[0])
c_args = self._get_args(args, ls[0])
if self.backend == 'opencl':
prepend = [self.queue, gs, ls]
@@ -239,7 +262,8 @@
elif self.backend == 'cuda':
num_blocks = int((n + ls[0] - 1) / ls[0])
num_tpb = ls[0]
self.knl(*c_args, block=(num_tpb, 1, 1), grid=(num_blocks, 1))
self.knl(*c_args, block=(num_tpb, 1, 1), grid=(num_blocks, 1),
shared=shared_mem_size)
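
For context, a minimal self-contained PyCUDA sketch of what the launch above amounts to (illustrative kernel, assumes a CUDA device and nvcc are available; LID_0/GID_0/LDIM_0 in the cpy kernels presumably map to threadIdx.x/blockIdx.x/blockDim.x in the generated source):

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a context on the default device
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void scale(double *x, double *y, double a)
{
    extern __shared__ double xc[];           /* dynamically sized shared mem */
    int lid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + lid;
    xc[lid] = x[i];
    __syncthreads();
    y[i] = xc[lid] * a;
}
""")
scale = mod.get_function("scale")

n, num_tpb = 1024, 128
num_blocks = (n + num_tpb - 1) // num_tpb
x = gpuarray.to_gpu(np.linspace(0.0, 1.0, n))
y = gpuarray.zeros_like(x)
shared_mem_size = num_tpb * np.dtype(np.float64).itemsize
scale(x.gpudata, y.gpudata, np.float64(21.0),
      block=(num_tpb, 1, 1), grid=(num_blocks, 1), shared=shared_mem_size)
assert np.allclose(y.get(), x.get() * 21.0)
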


class _prange(Extern):
33 changes: 32 additions & 1 deletion pysph/cpy/tests/test_low_level.py
@@ -61,7 +61,7 @@ def knl(x, y, a, size):
y.pull()
self.assertTrue(np.allclose(y.data, x.data * a))

def test_kernel_with_local_memory(self):
def test_kernel_with_local_memory_opencl(self):
importorskip('pyopencl')

# Given
@@ -92,6 +92,37 @@ def knl(x, y, xc, a):
y.pull()
self.assertTrue(np.allclose(y.data, x.data * a))

def test_kernel_with_local_memory_cuda(self):
importorskip('pycuda')

# Given
@annotate(gdoublep='x, y', xc='ldoublep', a='float')
def knl(x, y, xc, a):
i, lid = declare('int', 2)
lid = LID_0
i = GID_0 * LDIM_0 + lid

xc[lid] = x[i]

local_barrier()

y[i] = xc[lid] * a

x = np.linspace(0, 1, 1024)
y = np.zeros_like(x)
xc = LocalMem(1, backend='cuda')

x, y = wrap(x, y, backend='cuda')

# When
k = Kernel(knl, backend='cuda')
a = 21.0
k(x, y, xc, a)

# Then
y.pull()
self.assertTrue(np.allclose(y.data, x.data * a))
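
The kernel body here appears to be the same as in the OpenCL test above; only the backend strings passed to LocalMem, wrap and Kernel change, which is the point of the patch: the same annotated Python source is transpiled and launched on either backend.
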


@annotate(double='x, y, a', return_='double')
def func(x, y, a):
50 changes: 47 additions & 3 deletions pysph/cpy/tests/test_translator.py
@@ -7,7 +7,7 @@
from ..types import annotate, declare
from ..translator import (
CConverter, CodeGenerationError, CStructHelper, KnownType,
OpenCLConverter, py2c
OpenCLConverter, CUDAConverter, py2c
)


@@ -1164,7 +1164,7 @@ def not_me(self):
assert result.strip() == expect.strip()


def test_opencl_conversion():
def check_opencl_cuda_conversion(converter_obj):
# Note that LID_0 etc. are predefined symbols when we include the CLUDA
# preamble, therefore should be known.
src = dedent('''
@@ -1174,7 +1174,7 @@ def f(s_idx, s_p, d_idx, d_p, J=0, t=0.0, l=[0,0], xx=(0, 0)):

# When
known_types = {'d_p': KnownType('GLOBAL_MEM int*')}
converter = OpenCLConverter(known_types=known_types)
converter = converter_obj(known_types=known_types)
code = converter.convert(src)

# Then
@@ -1188,6 +1188,14 @@ def f(s_idx, s_p, d_idx, d_p, J=0, t=0.0, l=[0,0], xx=(0, 0)):
assert code.strip() == expect.strip()


def test_cuda_conversion():
check_opencl_cuda_conversion(CUDAConverter)


def test_opencl_conversion():
check_opencl_cuda_conversion(OpenCLConverter)


def test_opencl_class():
src = dedent('''
class Foo(object):
@@ -1209,6 +1217,42 @@ def g(self, x=0.0):
assert code.strip() == expect.strip()


def test_cuda_local_conversion():
@annotate(xc='ldoublep', yc='lintp')
def knl(xc, yc):
xc[LID_0] = 1
yc[LID_0] = 1

# When
converter = CUDAConverter()
code = converter.parse(knl)

# Then
expect_1 = dedent('''
WITHIN_KERNEL void knl(int size_xc, int size_yc)
{
extern LOCAL_MEM float shared_buff[];
double* xc = (double*) shared_buff;
int* yc = (int*) &xc[size_xc];
xc[LID_0] = 1;
yc[LID_0] = 1;
}
''')

expect_2 = dedent('''
WITHIN_KERNEL void knl(int size_xc, int size_yc)
{
extern LOCAL_MEM float shared_buff[];
int* yc = (int*) shared_buff;
double* xc = (double*) &yc[size_yc];
xc[LID_0] = 1;
yc[LID_0] = 1;
}
''')

assert code.strip() == expect_1.strip() or code.strip() == expect_2.strip()
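
The two accepted expansions differ only in which local argument is carved out of the shared buffer first (the converter presumably walks a dict of local arguments, so the order is not fixed on older Pythons, hence both are allowed). A rough standalone sketch of the carving scheme the generated code follows; the helper name is illustrative and this is not the actual CUDAConverter implementation:

def carve_shared_buffer(local_args):
    # local_args: list of (name, c_type) pairs for LOCAL_MEM kernel arguments.
    lines = ['extern LOCAL_MEM float shared_buff[];']
    prev = None
    for name, c_type in local_args:
        if prev is None:
            decl = '%s* %s = (%s*) shared_buff;' % (c_type, name, c_type)
        else:
            # Each argument starts right after the previous one, whose
            # length is passed in as the size_<prev> kernel parameter.
            decl = '%s* %s = (%s*) &%s[size_%s];' % (c_type, name, c_type,
                                                     prev, prev)
        lines.append(decl)
        prev = name
    return '\n'.join(lines)

print(carve_shared_buffer([('xc', 'double'), ('yc', 'int')]))
# extern LOCAL_MEM float shared_buff[];
# double* xc = (double*) shared_buff;
# int* yc = (int*) &xc[size_xc];
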


def test_handles_parsing_functions():
# Given
def f(x=1.0):