From 3ff5ab9d0363cca15936f8d30dd9d0cd1d762c85 Mon Sep 17 00:00:00 2001 From: dkoes Date: Mon, 16 Aug 2021 15:56:22 -0400 Subject: [PATCH] add cuda file So now grid interp should work. Inludes texture memory implementation, but it isn't as fast or accurate as the default. --- .gitignore | 5 + src/grid_interpolater.cu | 223 ++++++++++++++++++++++++++++++++++++++ test/bench_gridinterp.cpp | 46 ++++++++ 3 files changed, 274 insertions(+) create mode 100644 src/grid_interpolater.cu create mode 100644 test/bench_gridinterp.cpp diff --git a/.gitignore b/.gitignore index e23347d..36a2411 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,8 @@ vendor/ # Ignore editors .vscode /.pytest_cache/ +.cproject +.idea +.project +.pydevproject +.settings diff --git a/src/grid_interpolater.cu b/src/grid_interpolater.cu new file mode 100644 index 0000000..e473504 --- /dev/null +++ b/src/grid_interpolater.cu @@ -0,0 +1,223 @@ +/* + * grid_interpolater.cu + * + * Created on: Aug 6, 2021 + * Author: dkoes + */ + +#include "libmolgrid/grid_interpolater.h" +#include +#include +#include + +namespace libmolgrid { + + //wrapper that returns pad if out of bounds + template + CUDA_CALLABLE_MEMBER Dtype GridInterpolater::get_pt(const Grid& in, int x, int y, int z) const { + if(x < 0 || x >= int(in_dim) || y < 0 || y >= int(in_dim) || z < 0 || z >= int(in_dim)) + return 0; + else + return in[x][y][z]; + } + + //given a non-rounded gridpoint in the input grid linearly interpolate values + template + CUDA_CALLABLE_MEMBER Dtype GridInterpolater::interpolate(const Grid& in, float3 gridpt) const { + //https://en.wikipedia.org/wiki/Trilinear_interpolation + int xl = floor(gridpt.x); + int xh = ceil(gridpt.x); + int yl = floor(gridpt.y); + int yh = ceil(gridpt.y); + int zl = floor(gridpt.z); + int zh = ceil(gridpt.z); + + Dtype p000 = get_pt(in, xl, yl, zl); + Dtype p001 = get_pt(in, xl, yl, zh); + Dtype p010 = get_pt(in, xl, yh, zl); + Dtype p011 = get_pt(in, xl, yh, zh); + Dtype p100 = get_pt(in, xh, yl, zl); + Dtype p101 = get_pt(in, xh, yl, zh); + Dtype p110 = get_pt(in, xh, yh, zl); + Dtype p111 = get_pt(in, xh, yh, zh); + + Dtype xd = xh > xl ? (gridpt.x-xl)/(xh-xl) : 0; + Dtype yd = yh > yl ? (gridpt.y-yl)/(yh-yl) : 0; + Dtype zd = zh > zl ? (gridpt.z-zl)/(zh-zl) : 0; + + Dtype c00 = p000*(1-xd) + p100*xd; + Dtype c01 = p001*(1-xd) + p101*xd; + Dtype c10 = p010*(1-xd) + p110*xd; + Dtype c11 = p011*(1-xd) + p111*xd; + + Dtype c0 = c00*(1-yd)+c10*yd; + Dtype c1 = c01*(1-yd)+c11*yd; + + Dtype c = c0*(1-zd)+c1*zd; + return c; + } + + template float GridInterpolater::interpolate(const Grid& in, float3 gridpt) const; + template float GridInterpolater::interpolate(const Grid& in, float3 gridpt) const; + template double GridInterpolater::interpolate(const Grid& in, float3 gridpt) const; + + //convert to texture coords + __device__ float3 cart2tex(float3 origin, float resolution, float x, float y, float z) { + //textures interpolate assuming value is in center of pixel instead of at grid point + float3 pt = { 0.5f+(x-origin.x)/resolution, 0.5f+(y-origin.y)/resolution, 0.5f+(z-origin.z)/resolution }; + return pt; + } + + //use texture memory to perform interpolation + __global__ void + gpu_set_outgrid_texture(cudaTextureObject_t tex, + float3 in_origin, float in_res, unsigned in_dim, + float3 out_origin, float out_res, unsigned out_dim, + Quaternion invQ, float3 untranslate, float3 center, + Grid out) { + //figure out coordinate we are setting for out + unsigned xi = threadIdx.x + blockIdx.x * blockDim.x; + unsigned yi = threadIdx.y + blockIdx.y * blockDim.y; + unsigned zi = threadIdx.z + blockIdx.z * blockDim.z; + + if(xi >= out_dim || yi >= out_dim || zi >= out_dim) + return;//bail if we're off-grid, this should not be common + + //compute x,y,z coordinate of grid point + float3 outpt; + outpt.x = xi * out_res + out_origin.x; + outpt.y = yi * out_res + out_origin.y; + outpt.z = zi * out_res + out_origin.z; + + //apply inverse transformation + float3 newpt = invQ.rotate(outpt.x+untranslate.x, outpt.y+untranslate.y, outpt.z+untranslate.z); + //get (not rounded) input grid coordinates (not Cartesian) + + float3 inpt = cart2tex(in_origin, in_res, newpt.x+center.x, newpt.y+center.y, newpt.z+center.z); + + //lookup in normalized texture + float val = tex3D(tex, inpt.z, inpt.y, inpt.x); //why reverse order? + + //set + out(xi,yi,zi) = val; + } + + //interpolate manually + __global__ void + gpu_set_outgrid(GridInterpolater interp, Grid in, + float3 in_origin, float in_res, unsigned in_dim, + float3 out_origin, float out_res, unsigned out_dim, + Quaternion invQ, float3 untranslate, float3 center, + Grid out) { + //figure out coordinate we are setting for out + unsigned xi = threadIdx.x + blockIdx.x * blockDim.x; + unsigned yi = threadIdx.y + blockIdx.y * blockDim.y; + unsigned zi = threadIdx.z + blockIdx.z * blockDim.z; + + if(xi >= out_dim || yi >= out_dim || zi >= out_dim) + return;//bail if we're off-grid, this should not be common + + //compute x,y,z coordinate of grid point + float3 outpt; + outpt.x = xi * out_res + out_origin.x; + outpt.y = yi * out_res + out_origin.y; + outpt.z = zi * out_res + out_origin.z; + + //apply inverse transformation + float3 newpt = invQ.rotate(outpt.x+untranslate.x, outpt.y+untranslate.y, outpt.z+untranslate.z); + //get (not rounded) input grid coordinates (not Cartesian) + float3 inpt = cart2grid(in_origin, in_res, newpt.x+center.x, newpt.y+center.y, newpt.z+center.z); + //interpolate + out(xi,yi,zi) = interp.interpolate(in, inpt); + } + + template + void GridInterpolater::forward(float3 in_center, const Grid& in, const Transform& transform, float3 out_center, Grid& out) const { + + checkGrids(in, out); + float3 center = transform.get_rotation_center(); + float in_radius = in_dimension/2.0; + float out_radius = out_dimension/2.0; + float3 in_origin = {in_center.x-in_radius,in_center.y-in_radius,in_center.z-in_radius}; + float3 out_origin = {out_center.x-out_radius,out_center.y-out_radius,out_center.z-out_radius}; + + Quaternion invQ = transform.get_quaternion().inverse(); + float3 t = transform.get_translation(); + float3 untranslate = {-t.x-center.x, -t.y-center.y, -t.z-center.z}; + unsigned K = in.dimension(0); + + dim3 threads(LMG_CUDA_BLOCKDIM, LMG_CUDA_BLOCKDIM, LMG_CUDA_BLOCKDIM); + unsigned blocksperside = ceil(out_dim / float(LMG_CUDA_BLOCKDIM)); + dim3 blocks(blocksperside, blocksperside, blocksperside); + + if(false) { + //texture memory (mostly) works, but is not faster then interpolating ourself + //and isn't as close to the cpu version + //TODO: profile and optimize + for(unsigned c = 0; c < K; c++) { + cudaTextureObject_t tex = initializeTexture(in[c]); + gpu_set_outgrid_texture<<>>(tex, in_origin, in_resolution, in_dim, out_origin, out_resolution, out_dim, invQ, untranslate, center, out[c]); + } + } else { + for(unsigned c = 0; c < K; c++) { + gpu_set_outgrid<<>>(*this, in[c], in_origin, in_resolution, in_dim, out_origin, out_resolution, out_dim, invQ, untranslate, center, out[c]); + } + } + } + + + template void GridInterpolater::forward(float3 in_center, const Grid& in, const Transform& transform, float3 out_center, Grid& out) const; + + + cudaTextureObject_t GridInterpolater::initializeTexture(const Grid& in) const { + //create an appropriately sized texture memory object for the input + cudaExtent extent = make_cudaExtent(in_dim, in_dim, in_dim); + if(!cuArray) { //must allocate array + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + LMG_CUDA_CHECK(cudaMalloc3DArray(&cuArray, &channelDesc, extent)); + } + //copy values from in grid + //first convert to pitched ptr + cudaPitchedPtr grid = make_cudaPitchedPtr(in.data(), sizeof(float)*in_dim, in_dim, in_dim); + cudaMemcpy3DParms parms = {0}; + parms.dstArray = cuArray; + parms.srcPtr = grid; + parms.extent = extent; + parms.kind = cudaMemcpyDeviceToDevice; + LMG_CUDA_CHECK(cudaMemcpy3D(&parms)); + + // Specify texture + struct cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + + resDesc.resType = cudaResourceTypeArray; + resDesc.res.array.array = cuArray; + + // Specify texture object parameters + struct cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + + texDesc.addressMode[0] = cudaAddressModeBorder; + texDesc.addressMode[1] = cudaAddressModeBorder; + texDesc.addressMode[2] = cudaAddressModeBorder; + texDesc.filterMode = cudaFilterModeLinear; + texDesc.readMode = cudaReadModeElementType; + + // Create texture object + cudaTextureObject_t texObj = 0; + LMG_CUDA_CHECK(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL)); + return texObj; + } + + void GridInterpolater::clearTexture() { + //deallocate only if allocated + if(cuArray) { + cudaFreeArray(cuArray); + cuArray = nullptr; + } + } + +} + + + diff --git a/test/bench_gridinterp.cpp b/test/bench_gridinterp.cpp new file mode 100644 index 0000000..1f04b39 --- /dev/null +++ b/test/bench_gridinterp.cpp @@ -0,0 +1,46 @@ +#include "libmolgrid/grid_maker.h" +#include "libmolgrid/grid_interpolater.h" +#include "libmolgrid/example_provider.h" +#include "libmolgrid/example.h" +#include "libmolgrid/grid.h" +#include +#include +#include + +#include + +using namespace libmolgrid; +using namespace std; + + +int main(int argc, char *argv[]) { + + if(argc < 2) { + printf("Need data directory\n"); + exit(-1); + } + + ExampleProviderSettings settings; + settings.data_root = string(argv[1]) + "/structs"; + ExampleProvider provider(settings); + provider.populate(string(argv[1])+"/small.types"); + Example ex; + provider.next(ex); + + + // set up gridmaker and run forward + GridMaker biggmaker(0.25,41.5); + float3 grid_dims = biggmaker.get_grid_dims(); + MGrid4f out(ex.num_types(), grid_dims.x, grid_dims.y, grid_dims.z); + biggmaker.forward(ex, out.gpu()); + + GridInterpolater gi(0.25,41.5,0.25,41.5); + MGrid4f outi(ex.num_types(), grid_dims.x, grid_dims.y, grid_dims.z); + + { + boost::timer::auto_cpu_timer timer; + for(unsigned i = 0; i < 100; i++) { + gi.forward(out.gpu(), outi.gpu(), 2.0, true); + } + } +}