aten/src/ATen/native/cuda/EmbeddingBag.cu

#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/AccumulateType.h>
#include <ATen/ceil_div.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/Atomic.cuh>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/DeviceUtils.cuh>
#include <ATen/TensorUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/arange.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_like.h>
#include <ATen/ops/zeros.h>
#include <ATen/ops/_embedding_bag_native.h>
#include <ATen/ops/_embedding_bag_forward_only_native.h>
#include <ATen/ops/_embedding_bag_dense_backward_native.h>
#include <ATen/ops/_embedding_bag_per_sample_weights_backward_native.h>
#endif

#include <ATen/cuda/cub.cuh>
#include <ATen/native/cuda/SortingCommon.cuh>
#include <ATen/native/cuda/EmbeddingBackwardKernel.cuh>
#include <ATen/native/cuda/KernelUtils.cuh>

#include <c10/macros/Macros.h>

#if CUB_SUPPORTS_SCAN_BY_KEY()
#include <thrust/iterator/reverse_iterator.h>
#endif

namespace at {
namespace native {

#if !CUB_SUPPORTS_SCAN_BY_KEY()
template<typename index_t>
void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
#endif

namespace {

constexpr int MODE_SUM = 0;
constexpr int MODE_MEAN = 1;
constexpr int MODE_MAX = 2;

std::pair<Tensor, Tensor> promoteIndicesAndOffsets(
    const Tensor& indices,
    const Tensor& offsets) {
  const auto commonType =
      promoteTypes(offsets.scalar_type(), indices.scalar_type());
  return {
      indices.scalar_type() == commonType ? indices
                                          : indices.toType(commonType),
      offsets.scalar_type() == commonType ? offsets
                                          : offsets.toType(commonType)};
}

// This kernel assumes that all input tensors except `weight` and
// per_sample_weights are contiguous.
template <typename scalar_t, typename index_t>
__global__ void EmbeddingBag_updateOutputKernel_max(
    index_t *input, index_t *offsets, scalar_t *weight, scalar_t *output,
    index_t *offset2bag, int64_t numIndices, int64_t numBags,
    int64_t featureSize, int64_t weight_stride0, int64_t weight_stride1,
    index_t *bag_size, index_t *max_indices,
    index_t padding_idx) {

  // the strategy here is that each bag x feature is handled by a single thread

  int64_t chunksPerBag = ceil_div(featureSize, (int64_t)blockDim.x);
  int64_t numChunks = numBags * chunksPerBag;
  int64_t chunkOffset = blockIdx.x * blockDim.y + threadIdx.y;
  int64_t chunkStride = gridDim.x * blockDim.y;

  for (int64_t chunk = chunkOffset; chunk < numChunks; chunk += chunkStride) {
    int64_t featureDim = (chunk % chunksPerBag) * blockDim.x + threadIdx.x;
    if (featureDim < featureSize) {
      int64_t bag = chunk / chunksPerBag;
      scalar_t *weightFeat = weight + featureDim * weight_stride1;
      int64_t begin = bag == 0 ? 0 : offsets[bag]; // forces first offset to be 0 instead of asserting on it
      int64_t end = (bag < numBags - 1) ? (offsets[bag + 1]) : numIndices;
      CUDA_KERNEL_ASSERT(end >= begin);
      scalar_t weightFeatMax = 0;
      int64_t bag_size_ = 0;
      int64_t maxWord = -1;
      for (int64_t emb = begin; emb < end; emb++) {
        bool pad = (input[emb] == padding_idx);
        const int64_t weightRow = input[emb] * weight_stride0;
        scalar_t weightValue = weightFeat[weightRow];
        if (bag_size_ == 0 || weightValue > weightFeatMax) {
          weightFeatMax = pad ? weightFeatMax : weightValue;
          maxWord = pad ? maxWord : input[emb];
        }
        bag_size_ += pad ? 0 : 1;

        if (featureDim == 0) {
          offset2bag[emb] = bag;
        }
      }
      bag_size[bag] = bag_size_;
      max_indices[bag * featureSize + featureDim] = maxWord;
      output[bag * featureSize + featureDim] = weightFeatMax;
    }
  }
}

// This kernel assumes that all input tensors except `weight` and
// per_sample_weights are contiguous.
template <typename scalar_t, typename index_t>
__global__ void EmbeddingBag_updateOutputKernel_sum_mean(
    index_t *input, index_t *offsets, scalar_t *weight, scalar_t *output,
    index_t *offset2bag, int64_t numIndices, int64_t numBags,
    int64_t featureSize, int64_t weight_stride0, int64_t weight_stride1,
    int mode, index_t *bag_size,
    scalar_t* per_sample_weights, int64_t per_sample_weights_stride,
    index_t padding_idx) {

  // the strategy here is that each bag x feature is handled by a single thread

  using accscalar_t = acc_type<scalar_t, true>;
  int64_t chunksPerBag = ceil_div(featureSize, (int64_t)blockDim.x);
  int64_t numChunks = numBags * chunksPerBag;
  int64_t chunkOffset = blockIdx.x * blockDim.y + threadIdx.y;
  int64_t chunkStride = gridDim.x * blockDim.y;

  for (int64_t chunk = chunkOffset; chunk < numChunks; chunk += chunkStride) {
    int64_t featureDim = (chunk % chunksPerBag) * blockDim.x + threadIdx.x;
    if (featureDim < featureSize) {
      int64_t bag = chunk / chunksPerBag;
      scalar_t *weightFeat = weight + featureDim * weight_stride1;
      int64_t begin = bag == 0 ? 0 : offsets[bag]; // forces first offset to be 0 instead of asserting on it
      int64_t end = (bag < numBags - 1) ? (offsets[bag + 1]) : numIndices;
      CUDA_KERNEL_ASSERT(end >= begin);
      accscalar_t weightFeatSum = 0;
      int64_t bag_size_ = 0;
      for (int64_t emb = begin; emb < end; emb++) {
        bool pad = (input[emb] == padding_idx);
        const int64_t weightRow = input[emb] * weight_stride0;
        scalar_t weightValue = weightFeat[weightRow];
        weightValue = pad ? static_cast<scalar_t>(0) : weightValue;
        if (per_sample_weights) {
          accscalar_t scaleWeightBy = static_cast<accscalar_t>(
              per_sample_weights[emb * per_sample_weights_stride]);
          weightFeatSum += scaleWeightBy * static_cast<accscalar_t>(weightValue);
        } else {
          weightFeatSum += static_cast<accscalar_t>(weightValue);
        }
        bag_size_ += pad ? 0 : 1;

        if (featureDim == 0) {
          offset2bag[emb] = bag;
        }
      }
      if (mode == MODE_MEAN) {
        if (bag_size_ != 0) {
          weightFeatSum = weightFeatSum / static_cast<accscalar_t>(bag_size_);
        }
      }
      bag_size[bag] = bag_size_;
      output[bag * featureSize + featureDim] = static_cast<scalar_t>(weightFeatSum);
    }
  }
}

Tensor embedding_bag_backward_cuda_sum_avg(
                                   const Tensor &grad,
                                   const Tensor &indices_,
                                   const Tensor &offset2bag,
                                   const Tensor &bag_size,
                                   int64_t num_weights,
                                   bool scale_grad_by_freq, int64_t mode,
                                   const Tensor& per_sample_weights,
                                   int64_t padding_idx) {
  auto indices = indices_.contiguous();

  ptrdiff_t num_indices = indices.numel();

  if (num_indices == 0) {
    // all empty bags
    return at::zeros({num_weights, grad.size(1)}, grad.options());
  }

  auto sorted_indices = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  auto orig_indices = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  Tensor count;

  AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
    auto range = at::arange(num_indices, indices.options());
    // int64_t nbits = cuda::cub::get_num_bits(num_weights);
    cuda::cub::radix_sort_pairs(
      indices.data_ptr<index_t>(), sorted_indices.data_ptr<index_t>(),
      range.data_ptr<index_t>(), orig_indices.data_ptr<index_t>(),
      num_indices, false/*, 0, nbits*/);
  });

  if (scale_grad_by_freq) {
    count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
#if CUB_SUPPORTS_SCAN_BY_KEY()
    AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
      cudaStream_t stream = at::cuda::getCurrentCUDAStream();

      // Compute an increasing sequence per unique item in sortedIndices:
      // sorted: 2 5 5 5 7 7 8 9 9
      //  count: 1 1 2 3 1 2 1 1 2
      auto sorted_data = sorted_indices.data_ptr<index_t>();
      auto count_data = count.data_ptr<index_t>();
      cuda::cub::inclusive_sum_by_key(
        sorted_data,
        at_cuda_detail::cub::ConstantInputIterator<index_t>(1),
        count_data,
        num_indices
      );

      // Take the maximum of each count per unique key in reverse:
      // sorted: 2 5 5 5 7 7 8 9 9
      //  count: 1 3 3 3 2 2 1 2 2
      cuda::cub::inclusive_scan_by_key(
        thrust::make_reverse_iterator(sorted_data + num_indices),
        thrust::make_reverse_iterator(count_data + num_indices),
        thrust::make_reverse_iterator(count_data + num_indices),
        at_cuda_detail::cub::Max(),
        num_indices
      );
    });
#else
    AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
      embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
    });
#endif
  }
  return embedding_backward_cuda_kernel(grad, orig_indices, sorted_indices,
      count, num_weights, padding_idx, mode == MODE_MEAN, offset2bag,
      bag_size, per_sample_weights);
}

template <typename scalar_t, typename index_t>
__global__ void EmbeddingBag_accGradParametersKernel_max(
    index_t *max_indices, scalar_t *gradOutput,
    scalar_t *gradWeight, int64_t stride, int64_t numBags,
    index_t padding_idx, const index_t numel) {

  using accscalar_t = acc_type<scalar_t, true>;

  int64_t chunksPerBag = ceil_div(stride, (int64_t)blockDim.x);
  int64_t numChunks = numBags * chunksPerBag;
  int64_t chunkOffset = blockIdx.x * blockDim.y + threadIdx.y;
  int64_t chunkStride = gridDim.x * blockDim.y;

  for (int64_t chunk = chunkOffset; chunk < numChunks; chunk += chunkStride) {
    int64_t featureDim = (chunk % chunksPerBag) * blockDim.x + threadIdx.x;
    if (featureDim < stride) {
      int64_t bag = chunk / chunksPerBag;

      index_t word_idx = max_indices[bag * stride + featureDim];
      if (word_idx >= 0 && word_idx != padding_idx) {
        // If bag is empty, we have max_indices[idx] set to -1 in forward.
        fastAtomicAdd(
            gradWeight, static_cast<index_t>(word_idx * stride + featureDim),
            numel, gradOutput[bag * stride + featureDim], true);
      }
    }
  }
}

Tensor embedding_bag_backward_cuda_max(const Tensor &grad,
                                   const Tensor &max_indices,
                                   int64_t num_weights,
                                   int64_t padding_idx) {
  // See Note [Writing Nondeterministic Operations]
  // Nondeterministic because of atomicAdd usage
  globalContext().alertNotDeterministic("embedding_bag_backward_cuda_max");

  auto grad_weight = at::zeros({num_weights, grad.size(1)}, grad.options());

  int64_t stride = grad_weight.stride(0);

  int64_t numBags = grad.size(0);

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

#if defined(USE_ROCM)
  dim3 block = dim3(64, 4);
#else
  dim3 block = dim3(32, 8);
#endif
  int grid = 1024;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad.scalar_type(), "embedding_bag_backward_cuda_max", [&] {
        AT_DISPATCH_INDEX_TYPES(max_indices.scalar_type(), "embedding_bag_backward_cuda_max", [&] () {
          EmbeddingBag_accGradParametersKernel_max<
              scalar_t, index_t><<<grid, block, 0, stream>>>(
              max_indices.data_ptr<index_t>(), grad.data_ptr<scalar_t>(),
              grad_weight.data_ptr<scalar_t>(), stride, numBags,
              padding_idx, grad_weight.numel());
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      });
  });

  return grad_weight;
}
}

// Assumes all input tensors are contiguous.
// See NOTE [ embedding_bag Native Functions ] in native_functions.yaml for details
std::tuple<Tensor, Tensor, Tensor, Tensor>
_embedding_bag_forward_only_cuda(const Tensor &weight, const Tensor &indices,
                   const Tensor &offsets, const bool scale_grad_by_freq,
                   const int64_t mode, bool sparse, const c10::optional<Tensor>& per_sample_weights_opt,
                   bool include_last_offset, int64_t padding_idx) {
  // See [Note: hacky wrapper removal for optional tensor]
  c10::MaybeOwned<Tensor> per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt);
  const Tensor& per_sample_weights = *per_sample_weights_maybe_owned;

  return _embedding_bag_cuda(
      weight,
      indices,
      offsets,
      scale_grad_by_freq,
      mode,
      sparse,
      per_sample_weights,
      include_last_offset,
      padding_idx);
}

// Assumes all input tensors are contiguous.
// See NOTE [ embedding_bag Native Functions ] in native_functions.yaml for details
std::tuple<Tensor, Tensor, Tensor, Tensor>
_embedding_bag_cuda(const Tensor &weight, const Tensor &indices_,
                   const Tensor &offsets_, const bool scale_grad_by_freq,
                   const int64_t mode, bool sparse, const c10::optional<Tensor>& per_sample_weights_opt,
                   bool include_last_offset, int64_t padding_idx) {
  // See [Note: hacky wrapper removal for optional tensor]
  c10::MaybeOwned<Tensor> per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt);
  const Tensor& per_sample_weights = *per_sample_weights_maybe_owned;

  Tensor indices, offsets;
  std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_);
  auto indices_arg = TensorArg(indices, "indices", 1);
  checkScalarTypes("embedding_bag_cuda", indices_arg, {kLong, kInt});
  auto offsets_arg = TensorArg(offsets, "offsets", 1);
  checkScalarTypes("embedding_bag_cuda", offsets_arg, {kLong, kInt});
  checkSameType("embedding_bag_cuda", indices_arg, offsets_arg);
  auto weight_arg = TensorArg(weight, "weight", 1);
  checkSameGPU("embedding_bag_cuda", weight_arg, indices_arg);
  checkSameGPU("embedding_bag_cuda", weight_arg, offsets_arg);

  int64_t numIndices = indices.size(0);
  int64_t numBags = offsets.size(0);
  if (include_last_offset) {
    // Check https://github.com/pytorch/pytorch/issues/29019
    // We plan to add one more element in offsets, which is equal to the size of
    // indices. Currently for cuda devices, we still use the legacy
    // implementation even this flag is enabled.
    TORCH_CHECK(
        numBags >= 1, "include_last_offset: numBags should be at least 1");
    numBags -= 1;
  }
  int64_t featureSize = weight.size(1);

  auto bag_size = at::empty(offsets.sizes(), indices.options());
  auto offset2bag =
      at::empty({indices.size(0)}, indices.options()); // offset2bag = [0 0 0 0 0]

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  auto output = at::empty({numBags, featureSize}, weight.options());

  Tensor max_indices;

  if (mode == MODE_MAX) {
    max_indices = at::empty({numBags, featureSize}, indices.options());
  } else {
    // No need to allocate if we aren't doing a backwards pass
    max_indices = at::empty({0}, indices.options());
  }

#if defined(USE_ROCM)
  dim3 block = dim3(64, 4);
#else
  dim3 block = dim3(32, 8);
#endif
  int grid = 1024;
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, weight.scalar_type(), "embedding_bag_cuda", [&] {
    AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_cuda", [&] () {
      if (mode == MODE_MAX) {
        EmbeddingBag_updateOutputKernel_max<scalar_t, index_t><<<grid, block, 0, stream>>>(
            indices.data_ptr<index_t>(), offsets.data_ptr<index_t>(),
            weight.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
            offset2bag.data_ptr<index_t>(), numIndices, numBags, featureSize,
            weight.stride(0), weight.stride(1), bag_size.data_ptr<index_t>(),
            max_indices.data_ptr<index_t>(),
            padding_idx);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      } else {
        EmbeddingBag_updateOutputKernel_sum_mean<scalar_t, index_t><<<grid, block, 0, stream>>>(
            indices.data_ptr<index_t>(), offsets.data_ptr<index_t>(),
            weight.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
            offset2bag.data_ptr<index_t>(), numIndices, numBags, featureSize,
            weight.stride(0), weight.stride(1), mode, bag_size.data_ptr<index_t>(),
            per_sample_weights.defined() ? per_sample_weights.data_ptr<scalar_t>() : NULL,
            per_sample_weights.defined() ? per_sample_weights.stride(0) : 0,
            padding_idx);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      }
    });
  });

  return std::tuple<Tensor, Tensor, Tensor, Tensor>(output, offset2bag, bag_size, max_indices);
}

Tensor _embedding_bag_dense_backward_cuda(const Tensor &grad_, const Tensor &indices,
                                   const Tensor &offset2bag,
                                   const Tensor &bag_size_,
                                   const Tensor &max_indices,
                                   int64_t num_weights,
                                   bool scale_grad_by_freq, int64_t mode, const c10::optional<Tensor>& per_sample_weights_opt,
                                   int64_t padding_idx) {
  // See [Note: hacky wrapper removal for optional tensor]
  c10::MaybeOwned<Tensor> per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt);
  const Tensor& per_sample_weights = *per_sample_weights_maybe_owned;

  // indices, offsets and offset2bag are assumed having correct dtypes and
  // contiguous here due to the checks in _embedding_bag_backward in
  // EmbeddingBag.cpp.
  // Also see NOTE [ embedding_bag Native Functions ] in native_functions.yaml
  // for more details.

  Tensor grad = grad_.contiguous();
  auto indices_arg = TensorArg(indices, "indices", 1);
  auto grad_arg = TensorArg(grad, "grad", 1);
  checkSameGPU("embedding_bag_cuda", grad_arg, indices_arg);


  switch (mode) {
    case MODE_SUM:
    case MODE_MEAN:
      if (mode == MODE_MEAN)
        AT_ASSERT(!per_sample_weights.defined());
      return embedding_bag_backward_cuda_sum_avg(grad, indices, offset2bag,
              bag_size_, num_weights, scale_grad_by_freq, mode,
              per_sample_weights, padding_idx);

    case MODE_MAX:
      AT_ASSERT(!per_sample_weights.defined());
      return embedding_bag_backward_cuda_max(grad, max_indices, num_weights,
              padding_idx);

    default:
      AT_ERROR(
          "Unknown mode for embedding_bag_backward_cuda ", mode);
  }
}

template <typename scalar_t>
__inline__ __device__
static scalar_t warpReduceSum(scalar_t val) {
  for (int offset = C10_WARP_SIZE/2; offset > 0; offset /= 2)
    val += WARP_SHFL_DOWN(val, offset);
  return val;
}

template <typename scalar_t, typename index_t>
__global__ static void _embedding_bag_per_sample_weights_backward_kernel(
    const scalar_t* grad, int64_t grad_stride0, int64_t grad_stride1,
    const scalar_t* weight, int64_t weight_stride0, int64_t weight_stride1,
    const index_t* indices,  // contiguous
    const index_t* offset2bag,  // contiguous
    int64_t num_samples,
    int64_t embedding_features,
    scalar_t* output,
    index_t padding_idx) {
  using accscalar_t = acc_type<scalar_t, true>;
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  const int warp = idx / C10_WARP_SIZE;
  const int thread_in_warp = idx % C10_WARP_SIZE;
  const int num_warps = blockDim.x * gridDim.x / C10_WARP_SIZE;

  // Each warp is responsible for the accumulation of one sample.
  // This involves doing one dot product between grad[bag_idx] and weight[embedding_idx].
  for (int sample_idx = warp; sample_idx < num_samples; sample_idx += num_warps) {
    accscalar_t result = 0.;
    const int bag_idx = (int)offset2bag[sample_idx];
    const int embedding_idx = (int)indices[sample_idx];
    if (embedding_idx != padding_idx) {
      for (int feature_idx = thread_in_warp; feature_idx < embedding_features;
          feature_idx += C10_WARP_SIZE) {
        result +=
            grad[grad_stride0 * bag_idx + grad_stride1 * feature_idx] *
            weight[weight_stride0 * embedding_idx + weight_stride1 * feature_idx];
      }
    }
    result = warpReduceSum<accscalar_t>(result);
    if (thread_in_warp == 0) {
      output[sample_idx] = result;
    }
  }
}

Tensor _embedding_bag_per_sample_weights_backward_cuda(
    const Tensor& grad,
    const Tensor& weight,  // NB: embedding table, not per_sample_weights
    const Tensor& indices_,
    const Tensor& offsets_,
    const Tensor& offset2bag,
    int64_t mode,
    int64_t padding_idx) {
  TORCH_CHECK(
      mode == MODE_SUM,
      "embedding_bag_backward: per_sample_weights only supported for mode='sum'");

  AT_ASSERT(grad.dim() == 2);
  auto embedding_features = grad.size(1);

  Tensor indices, offsets;
  std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_);
  AT_ASSERT(indices.dim() == 1);
  auto num_samples = indices.size(0);

  AT_ASSERT(weight.dim() == 2);
  AT_ASSERT(weight.size(1) == embedding_features);

  const int threads_per_block = 512;
  const int warps_per_block = threads_per_block / at::cuda::warp_size();

  dim3 block(threads_per_block);
  dim3 grid((num_samples + warps_per_block - 1) / warps_per_block);

  auto output = at::empty({num_samples}, grad.options());

  // Early return when there is no samples in the batch. This saves unnecesary kernel
  // launch, but also prevents cudaGetLastError() to complain about invalid launch args
  if (num_samples == 0) {
    return output;
  }

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
    grad.scalar_type(), "_embedding_bag_per_sample_weights_backward_cuda", [&]() {
      AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "_embedding_bag_per_sample_weights_backward_cuda", [&]() {
        _embedding_bag_per_sample_weights_backward_kernel<scalar_t, index_t>
          <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
            grad.data_ptr<scalar_t>(), grad.stride(0), grad.stride(1),
            weight.data_ptr<scalar_t>(), weight.stride(0), weight.stride(1),
            indices.data_ptr<index_t>(),
            offset2bag.data_ptr<index_t>(),
            num_samples,
            embedding_features,
            output.data_ptr<scalar_t>(),
            padding_idx);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      });
    }
  );
  return output;
}

}
}