Add local memory CUDA support to Kernel #126

Merged: 3 commits, Oct 30, 2018
4 changes: 2 additions & 2 deletions .travis.yml
@@ -20,9 +20,9 @@ install:
- conda update -q conda
- conda info -a
- conda config --add channels conda-forge
- conda config --add channels anaconda
- conda config --add channels defaults
- conda install -c conda-forge pocl pyopencl
- conda install -c anaconda virtualenv
- conda install -c defaults virtualenv
- python -c 'import pyopencl as cl'
- pip install beaker tox tox-travis

3 changes: 2 additions & 1 deletion examples/cpy/vm_kernel.py
@@ -113,5 +113,6 @@ def run(nv, backend):
default=10000, help='Number of particles.')
o = p.parse_args()
get_config().use_double = o.use_double
assert o.backend in ['opencl'], "Only OpenCL backend is supported."
assert o.backend in ['opencl', 'cuda'], ("Only OpenCL/CUDA backend is "
"supported.")
run(o.n, o.backend)
34 changes: 29 additions & 5 deletions pysph/cpy/low_level.py
@@ -39,6 +39,7 @@ class LocalMem(object):
Note that this is basically ``sizeof(double) * 128 * 2``
'''

def __init__(self, size, backend=None):
'''
Constructor
@@ -66,7 +67,7 @@ def get(self, c_type, workgroup_size):
elif self.backend == 'opencl':
import pyopencl as cl
dtype = ctype_to_dtype(c_type)
sz = dtype().itemsize
sz = dtype.itemsize
mem = cl.LocalMemory(sz * self.size * workgroup_size)
self._cache[key] = mem
return mem
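
For reference, the local memory request computed in get() above is just itemsize(c_type) * size * workgroup_size bytes per workgroup. A minimal standalone sketch of that arithmetic (NumPy only; the helper name is illustrative and not part of the patch):

import numpy as np

def local_mem_bytes(c_type, size, workgroup_size):
    # One slot of `size` elements of `c_type` per work item in the group.
    return np.dtype(c_type).itemsize * size * workgroup_size

# LocalMem(2) used with doubles and a workgroup of 128 work items, i.e. the
# ``sizeof(double) * 128 * 2`` case mentioned in the class docstring above:
assert local_mem_bytes('float64', 2, 128) == 2048
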
@@ -104,7 +105,7 @@ def splay_cl(queue, n, kernel_specific_max_wg_size=None):
group_count = (n + max_work_items - 1) // max_work_items
work_items_per_group = max_work_items

return (group_count*work_items_per_group,), (work_items_per_group,)
return (group_count * work_items_per_group,), (work_items_per_group,)
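
A quick worked example of the splay above (plain Python, illustrative values): with n = 1000 elements and max_work_items = 128, the global size is rounded up to the next whole multiple of the workgroup size.

# Rounding n up to a whole number of workgroups, as splay_cl does.
n, max_work_items = 1000, 128
group_count = (n + max_work_items - 1) // max_work_items    # 8
global_size = (group_count * max_work_items,)               # (1024,)
local_size = (max_work_items,)                              # (128,)
assert global_size == (1024,) and local_size == (128,)
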


class Kernel(object):
@@ -124,6 +125,7 @@ class Kernel(object):
type checking of the passed constants.
"""

def __init__(self, func, backend='opencl'):
backend = get_backend(backend)
if backend == 'cython':
@@ -133,7 +135,9 @@ def __init__(self, func, backend='opencl'):
elif backend == 'opencl':
from .opencl import get_queue
self.queue = get_queue()

elif backend == 'cuda':
from .cuda import set_context
set_context()
self.tp = Transpiler(backend=backend)
self.backend = backend
self.name = func.__name__
@@ -155,19 +159,33 @@ def _get_func_info(self):
)

arg_info = []
local_info = {}
for arg in argspec.args:
kt = annotations[arg]
if not self._use_double:
kt = KnownType(
self._to_float(kt.type), self._to_float(kt.base_type)
)
if 'LOCAL_MEM' in kt.type:
local_info[arg] = kt.base_type
arg_info.append((arg, kt))
func_info = {
'args': arg_info,
'local_info': local_info,
'return': annotations.get('return', KnownType('void'))
}
return func_info

def _get_local_size(self, args, workgroup_size):
local_info = self._func_info['local_info']
arg_info = self._func_info['args']
total_size = 0
for arg, a_info in zip(args, arg_info):
if isinstance(arg, LocalMem):
dtype = ctype_to_dtype(local_info[a_info[0]])
total_size += dtype.itemsize
return workgroup_size * total_size
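
The CUDA backend needs a single byte count for dynamically allocated shared memory, so _get_local_size sums the item sizes of every LocalMem argument's base type and multiplies by the workgroup size. A standalone sketch of the same calculation (the function name and dtype strings are illustrative):

import numpy as np

def shared_bytes(local_arg_dtypes, workgroup_size):
    # One element of each local argument for every thread in the block.
    return workgroup_size * sum(np.dtype(t).itemsize for t in local_arg_dtypes)

# One 'ldoublep' and one 'lintp' argument with 128 threads per block:
assert shared_bytes(['float64', 'int32'], 128) == 128 * (8 + 4)
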

def _generate(self):
self.tp.add(self.func)
self._correct_opencl_address_space()
@@ -198,7 +216,10 @@ def _massage_arg(self, x, type_info, workgroup_size):
elif self.backend == 'cuda':
return x.dev
elif isinstance(x, LocalMem):
return x.get(type_info.base_type, workgroup_size)
if self.backend == 'opencl':
return x.get(type_info.base_type, workgroup_size)
elif self.backend == 'cuda':
return np.array(workgroup_size, dtype=np.int32)
else:
dtype = ctype_to_dtype(type_info.type)
return np.array([x], dtype=dtype)
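
A note on the CUDA branch above (a reading of the patch, not part of it): CUDA's dynamically sized shared memory is a single extern array declared inside the kernel rather than a per-argument memory object, so there is nothing to pass for the LocalMem argument itself. Instead the workgroup size is passed as an int32, which the generated kernel presumably consumes as the size_<arg> parameter used to carve typed pointers out of the shared buffer (see the CUDAConverter test further down).
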
@@ -230,6 +251,8 @@ def __call__(self, *args, **kw):
gs = (global_size, )
else:
gs, ls = self._get_workgroup_size(n)
if self.backend == 'cuda':
shared_mem_size = self._get_local_size(args, ls[0])
c_args = self._get_args(args, ls[0])
if self.backend == 'opencl':
prepend = [self.queue, gs, ls]
@@ -239,7 +262,8 @@
elif self.backend == 'cuda':
num_blocks = int((n + ls[0] - 1) / ls[0])
num_tpb = ls[0]
self.knl(*c_args, block=(num_tpb, 1, 1), grid=(num_blocks, 1))
self.knl(*c_args, block=(num_tpb, 1, 1), grid=(num_blocks, 1),
shared=shared_mem_size)
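
For context, a minimal self-contained PyCUDA sketch of what the launch above amounts to (illustrative kernel, assumes a CUDA device and nvcc are available; LID_0/GID_0/LDIM_0 in the cpy kernels presumably map to threadIdx.x/blockIdx.x/blockDim.x in the generated source):

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a context on the default device
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void scale(double *x, double *y, double a)
{
    extern __shared__ double xc[];           /* dynamically sized shared mem */
    int lid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + lid;
    xc[lid] = x[i];
    __syncthreads();
    y[i] = xc[lid] * a;
}
""")
scale = mod.get_function("scale")

n, num_tpb = 1024, 128
num_blocks = (n + num_tpb - 1) // num_tpb
x = gpuarray.to_gpu(np.linspace(0.0, 1.0, n))
y = gpuarray.zeros_like(x)
shared_mem_size = num_tpb * np.dtype(np.float64).itemsize
scale(x.gpudata, y.gpudata, np.float64(21.0),
      block=(num_tpb, 1, 1), grid=(num_blocks, 1), shared=shared_mem_size)
assert np.allclose(y.get(), x.get() * 21.0)
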


class _prange(Extern):
33 changes: 32 additions & 1 deletion pysph/cpy/tests/test_low_level.py
@@ -61,7 +61,7 @@ def knl(x, y, a, size):
y.pull()
self.assertTrue(np.allclose(y.data, x.data * a))

def test_kernel_with_local_memory(self):
def test_kernel_with_local_memory_opencl(self):
importorskip('pyopencl')

# Given
@@ -92,6 +92,37 @@ def knl(x, y, xc, a):
y.pull()
self.assertTrue(np.allclose(y.data, x.data * a))

def test_kernel_with_local_memory_cuda(self):
importorskip('pycuda')

# Given
@annotate(gdoublep='x, y', xc='ldoublep', a='float')
def knl(x, y, xc, a):
i, lid = declare('int', 2)
lid = LID_0
i = GID_0 * LDIM_0 + lid

xc[lid] = x[i]

local_barrier()

y[i] = xc[lid] * a

x = np.linspace(0, 1, 1024)
y = np.zeros_like(x)
xc = LocalMem(1, backend='cuda')

x, y = wrap(x, y, backend='cuda')

# When
k = Kernel(knl, backend='cuda')
a = 21.0
k(x, y, xc, a)

# Then
y.pull()
self.assertTrue(np.allclose(y.data, x.data * a))
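
The kernel body here appears to be the same as in the OpenCL test above; only the backend strings passed to LocalMem, wrap and Kernel change, which is the point of the patch: the same annotated Python source is transpiled and launched on either backend.
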


@annotate(double='x, y, a', return_='double')
def func(x, y, a):
50 changes: 47 additions & 3 deletions pysph/cpy/tests/test_translator.py
@@ -7,7 +7,7 @@
from ..types import annotate, declare
from ..translator import (
CConverter, CodeGenerationError, CStructHelper, KnownType,
OpenCLConverter, py2c
OpenCLConverter, CUDAConverter, py2c
)


@@ -1164,7 +1164,7 @@ def not_me(self):
assert result.strip() == expect.strip()


def test_opencl_conversion():
def check_opencl_cuda_conversion(converter_obj):
# Note that LID_0 etc. are predefined symbols when we include the CLUDA
# preamble, therefore should be known.
src = dedent('''
@@ -1174,7 +1174,7 @@ def f(s_idx, s_p, d_idx, d_p, J=0, t=0.0, l=[0,0], xx=(0, 0)):

# When
known_types = {'d_p': KnownType('GLOBAL_MEM int*')}
converter = OpenCLConverter(known_types=known_types)
converter = converter_obj(known_types=known_types)
code = converter.convert(src)

# Then
@@ -1188,6 +1188,14 @@ def f(s_idx, s_p, d_idx, d_p, J=0, t=0.0, l=[0,0], xx=(0, 0)):
assert code.strip() == expect.strip()


def test_cuda_conversion():
check_opencl_cuda_conversion(CUDAConverter)


def test_opencl_conversion():
check_opencl_cuda_conversion(OpenCLConverter)


def test_opencl_class():
src = dedent('''
class Foo(object):
@@ -1209,6 +1217,42 @@ def g(self, x=0.0):
assert code.strip() == expect.strip()


def test_cuda_local_conversion():
@annotate(xc='ldoublep', yc='lintp')
def knl(xc, yc):
xc[LID_0] = 1
yc[LID_0] = 1

# When
converter = CUDAConverter()
code = converter.parse(knl)

# Then
expect_1 = dedent('''
WITHIN_KERNEL void knl(int size_xc, int size_yc)
{
extern LOCAL_MEM float shared_buff[];
double* xc = (double*) shared_buff;
int* yc = (int*) &xc[size_xc];
xc[LID_0] = 1;
yc[LID_0] = 1;
}
''')

expect_2 = dedent('''
WITHIN_KERNEL void knl(int size_xc, int size_yc)
{
extern LOCAL_MEM float shared_buff[];
int* yc = (int*) shared_buff;
double* xc = (double*) &yc[size_yc];
xc[LID_0] = 1;
yc[LID_0] = 1;
}
''')

assert code.strip() == expect_1.strip() or code.strip() == expect_2.strip()
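
The two accepted expansions differ only in which local argument is carved out of the shared buffer first (the converter presumably walks a dict of local arguments, so the order is not fixed on older Pythons, hence both are allowed). A rough standalone sketch of the carving scheme the generated code follows; the helper name is illustrative and this is not the actual CUDAConverter implementation:

def carve_shared_buffer(local_args):
    # local_args: list of (name, c_type) pairs for LOCAL_MEM kernel arguments.
    lines = ['extern LOCAL_MEM float shared_buff[];']
    prev = None
    for name, c_type in local_args:
        if prev is None:
            decl = '%s* %s = (%s*) shared_buff;' % (c_type, name, c_type)
        else:
            # Each argument starts right after the previous one, whose
            # length is passed in as the size_<prev> kernel parameter.
            decl = '%s* %s = (%s*) &%s[size_%s];' % (c_type, name, c_type,
                                                     prev, prev)
        lines.append(decl)
        prev = name
    return '\n'.join(lines)

print(carve_shared_buffer([('xc', 'double'), ('yc', 'int')]))
# extern LOCAL_MEM float shared_buff[];
# double* xc = (double*) shared_buff;
# int* yc = (int*) &xc[size_xc];
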


def test_handles_parsing_functions():
# Given
def f(x=1.0):