本页介绍 ROC 的类 ufunc 对象。
为了支持 ROC 程序的编程模式,ROC 的 Vectorize 和 GUVectorize 无法生成常规的 ufunc,而是返回一个类 ufunc 对象。该对象与常规 NumPy ufunc 非常相似,但并不完全兼容。ROC ufunc 额外支持传入设备内数组(即已位于 GPU 设备上的数组),以减少 PCI-express 总线上的数据传输。它还接受 <cite>stream</cite> 关键字参数,以便在异步模式下启动。
import math
from numba import vectorize
import numpy as np
@vectorize(['float32(float32, float32, float32)',
            'float64(float64, float64, float64)'],
           target='roc')
def roc_discriminant(a, b, c):
    """Return the square root of the quadratic discriminant ``b**2 - 4*a*c``.

    Compiled by ``vectorize`` for the ``roc`` target with one float32 and
    one float64 signature. ``math.sqrt`` is applied directly, so callers
    are expected to supply inputs for which the discriminant is
    non-negative.
    """
    return math.sqrt(b ** 2 - 4 * a * c)
N = 10000
dtype = np.float32

# Build three random input vectors of length N in the working dtype.
# np.random.sample draws from [0, 1), so B lies in [10, 11) and
# b**2 - 4*a*c is always positive for these inputs.
A = np.random.sample(N).astype(dtype)
B = (np.random.sample(N) + 10).astype(dtype)
C = np.random.sample(N).astype(dtype)

# Apply the ufunc to the three inputs and display the output array.
D = roc_discriminant(A, B, C)
print(D)
所有 ROC ufunc 内核都能够调用其他 ROC 设备函数:
from numba import vectorize, roc
# define a device function (device=True) that other ROC kernels can call
@roc.jit('float32(float32, float32, float32)', device=True)
def roc_device_fn(x, y, z):
    """Return ``x ** y / z`` for three float32 operands."""
    return x ** y / z
# define a ufunc that calls our device function
@vectorize(['float32(float32, float32, float32)'], target='roc')
def roc_ufunc(x, y, z):
    """Float32 ROC ufunc kernel that forwards its operands to ``roc_device_fn``."""
    return roc_device_fn(x, y, z)
可以使用 ROC 在 GPU 上执行广义 ufunc,类似于 ROC ufunc 功能。这可以通过以下方式完成:
from numba import guvectorize
@guvectorize(['void(float32[:,:], float32[:,:], float32[:,:])'],
             '(m,n),(n,p)->(m,p)', target='roc')
def matmulcore(A, B, C):
    """Generalized-ufunc core with layout ``(m,n),(n,p)->(m,p)``.

    Per the declared signature, ``A`` and ``B`` are 2-D float32 inputs and
    ``C`` is the 2-D float32 output array. The kernel body is elided in
    this example.
    """
    ...
也可以看看
将数据分区为块允许计算和内存传输重叠。这可以提高 ufunc 的吞吐量,并使您的 ufunc 能够处理大于 GPU 内存容量的数据。例如:
import math
from numba import vectorize, roc
import numpy as np
# the ufunc kernel
def discriminant(a, b, c):
    """Return the square root of the quadratic discriminant ``b**2 - 4*a*c``.

    ``math.sqrt`` raises ``ValueError`` for a negative argument, so the
    inputs must satisfy ``b**2 >= 4*a*c``.
    """
    return math.sqrt(b ** 2 - 4 * a * c)
# compile the kernel as a float32 ufunc for the ROC target
roc_discriminant = vectorize(['float32(float32, float32, float32)'],
                             target='roc')(discriminant)

N = int(1e+8)
dtype = np.float32

# prepare the input
A = np.array(np.random.sample(N), dtype=dtype)
B = np.array(np.random.sample(N) + 10, dtype=dtype)
C = np.array(np.random.sample(N), dtype=dtype)
D = np.zeros(A.shape, dtype=A.dtype)

# create a ROC stream
stream = roc.stream()

# Keep the chunk size and chunk count as integers (like N above):
# np.split documents an integer section count, and a float count only
# works through implicit coercion.
chunksize = int(1e+6)
chunkcount = N // chunksize

# partition numpy arrays into chunks
# no copying is performed
sA = np.split(A, chunkcount)
sB = np.split(B, chunkcount)
sC = np.split(C, chunkcount)
sD = np.split(D, chunkcount)

# device arrays are appended here by the streaming loop so that they stay
# referenced while the asynchronous work is pending
device_ptrs = []
# helper function, async requires operation on coarsegrain memory regions
def async_array(arr):
    """Return a coarse-grain copy of ``arr``.

    Allocates a coarse-grain array with the same shape and dtype as
    ``arr`` via ``roc.coarsegrain_array`` and copies the contents of
    ``arr`` into it.
    """
    coarse_arr = roc.coarsegrain_array(shape=arr.shape, dtype=arr.dtype)
    coarse_arr[:] = arr
    return coarse_arr
with stream.auto_synchronize():
    # every operation in this context will be launched asynchronously
    # by using the ROC stream
    # NOTE(review): the result copy after this block relies on the context
    # exit waiting for the queued stream work -- confirm against the
    # auto_synchronize documentation.

    dchunks = []  # holds the result chunks

    # for each chunk
    for a, b, c, d in zip(sA, sB, sC, sD):
        # create coarse grain arrays
        asyncA = async_array(a)
        asyncB = async_array(b)
        asyncC = async_array(c)
        asyncD = async_array(d)

        # transfer to device
        dA = roc.to_device(asyncA, stream=stream)
        dB = roc.to_device(asyncB, stream=stream)
        dC = roc.to_device(asyncC, stream=stream)
        dD = roc.to_device(asyncD, stream=stream, copy=False)  # no copying

        # launch kernel
        roc_discriminant(dA, dB, dC, out=dD, stream=stream)

        # retrieve result
        dD.copy_to_host(asyncD, stream=stream)

        # store device pointers to prevent them from freeing before
        # the kernel is scheduled
        device_ptrs.extend([dA, dB, dC, dD])

        # store result reference
        dchunks.append(asyncD)

# put result chunks into the output array 'D'
# (each element of sD is a view into D, so writing to it fills D in place)
for dst, result in zip(sD, dchunks):
    dst[:] = result[:]

# data is ready at this point inside D
print(D)