Skip to content

Commit

Permalink
refactor code to remove dependency on THC
Browse files Browse the repository at this point in the history
  • Loading branch information
ajonnavittula committed Sep 13, 2022
1 parent 27d9577 commit 638be95
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 31 deletions.
23 changes: 14 additions & 9 deletions lib/model/csrc/cuda/ROIAlign_cuda.cu
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/ceil_div.h>

#include <THC/THC.h>
#include <THC/THCAtomics.cuh>
#include <THC/THCDeviceUtils.cuh>
// #include <THC/THC.h>
// #include <THC/THCAtomics.cuh>
// #include <THC/THCDeviceUtils.cuh>

// TODO: move this into a common shared header — this macro is duplicated
// verbatim in the other .cu files of this change (see ROIPool_cuda.cu below).
// Grid-stride loop over [0, n): each thread starts at its flat global index
// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number of
// launched threads (blockDim.x * gridDim.x), so every element is visited
// exactly once regardless of the launch configuration.
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)

// template <typename T>
// __host__ __device__ __forceinline__ T THCCeilDiv(T a, T b) {
// return (a + b - 1) / b;
// }

template <typename T>
__device__ T bilinear_interpolate(const T* bottom_data,
Expand Down Expand Up @@ -272,11 +277,11 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
auto output_size = num_rois * pooled_height * pooled_width * channels;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();

dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
dim3 grid(std::min(at::ceil_div(output_size, 512L), 4096L));
dim3 block(512);

if (output.numel() == 0) {
THCudaCheck(cudaGetLastError());
C10_CUDA_CHECK(cudaGetLastError());
return output;
}

Expand All @@ -294,7 +299,7 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
rois.contiguous().data<scalar_t>(),
output.data<scalar_t>());
});
THCudaCheck(cudaGetLastError());
C10_CUDA_CHECK(cudaGetLastError());
return output;
}

Expand All @@ -317,12 +322,12 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,

cudaStream_t stream = at::cuda::getCurrentCUDAStream();

dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
dim3 grid(std::min(at::ceil_div(grad.numel(), 512L), 4096L));
dim3 block(512);

// handle possibly empty gradients
if (grad.numel() == 0) {
THCudaCheck(cudaGetLastError());
C10_CUDA_CHECK(cudaGetLastError());
return grad_input;
}

Expand All @@ -341,6 +346,6 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
grad_input.data<scalar_t>(),
rois.contiguous().data<scalar_t>());
});
THCudaCheck(cudaGetLastError());
C10_CUDA_CHECK(cudaGetLastError());
return grad_input;
}
24 changes: 14 additions & 10 deletions lib/model/csrc/cuda/ROIPool_cuda.cu
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/ceil_div.h>

#include <THC/THC.h>
#include <THC/THCAtomics.cuh>
#include <THC/THCDeviceUtils.cuh>

// #include <THC/THC.h>
// #include <THC/THCAtomics.cuh>
// #include <THC/THCDeviceUtils.cuh>

// template <typename T>
// __host__ __device__ __forceinline__ T THCCeilDiv(T a, T b) {
// return (a + b - 1) / b;
// }
// TODO make it in a common file
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
Expand Down Expand Up @@ -126,11 +130,11 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,

cudaStream_t stream = at::cuda::getCurrentCUDAStream();

dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
dim3 grid(std::min(at::ceil_div(output_size, 512L), 4096L));
dim3 block(512);

if (output.numel() == 0) {
THCudaCheck(cudaGetLastError());
C10_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(output, argmax);
}

Expand All @@ -148,7 +152,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
output.data<scalar_t>(),
argmax.data<int>());
});
THCudaCheck(cudaGetLastError());
C10_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(output, argmax);
}

Expand All @@ -173,12 +177,12 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,

cudaStream_t stream = at::cuda::getCurrentCUDAStream();

dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
dim3 grid(std::min(at::ceil_div(grad.numel(), 512L), 4096L));
dim3 block(512);

// handle possibly empty gradients
if (grad.numel() == 0) {
THCudaCheck(cudaGetLastError());
C10_CUDA_CHECK(cudaGetLastError());
return grad_input;
}

Expand All @@ -197,6 +201,6 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
grad_input.data<scalar_t>(),
rois.contiguous().data<scalar_t>());
});
THCudaCheck(cudaGetLastError());
C10_CUDA_CHECK(cudaGetLastError());
return grad_input;
}
26 changes: 14 additions & 12 deletions lib/model/csrc/cuda/nms.cu
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

#include <THC/THC.h>
#include <THC/THCDeviceUtils.cuh>
#include <ATen/ceil_div.h>
#include <c10/cuda/CUDACachingAllocator.h>
// #include <THC/THC.h>
// #include <THC/THCDeviceUtils.cuh>

#include <vector>
#include <iostream>
Expand Down Expand Up @@ -61,7 +62,7 @@ __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
t |= 1ULL << i;
}
}
const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
const int col_blocks = at::ceil_div(n_boxes, threadsPerBlock);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
Expand All @@ -76,28 +77,28 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {

int boxes_num = boxes.size(0);

const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
const int col_blocks = at::ceil_div(boxes_num, threadsPerBlock);

scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();

THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
// THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState

unsigned long long* mask_dev = NULL;
//THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
// boxes_num * col_blocks * sizeof(unsigned long long)));

mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));

dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
THCCeilDiv(boxes_num, threadsPerBlock));
// mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
mask_dev = (unsigned long long*) c10::cuda::CUDACachingAllocator::raw_alloc(boxes_num * col_blocks * sizeof(unsigned long long));
dim3 blocks(at::ceil_div(boxes_num, threadsPerBlock),
at::ceil_div(boxes_num, threadsPerBlock));
dim3 threads(threadsPerBlock);
nms_kernel<<<blocks, threads>>>(boxes_num,
nms_overlap_thresh,
boxes_dev,
mask_dev);

std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
THCudaCheck(cudaMemcpy(&mask_host[0],
C10_CUDA_CHECK(cudaMemcpy(&mask_host[0],
mask_dev,
sizeof(unsigned long long) * boxes_num * col_blocks,
cudaMemcpyDeviceToHost));
Expand All @@ -122,7 +123,8 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
}
}

THCudaFree(state, mask_dev);
c10::cuda::CUDACachingAllocator::raw_delete(mask_dev);
// THCudaFree(state, mask_dev);
// TODO improve this part
return std::get<0>(order_t.index({
keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
Expand Down

0 comments on commit 638be95

Please sign in to comment.