Enabling vectorization load for Slice (#3529)
Previously, vectorization analysis could only support resize in `PadOp` with positive extents: 1. a general resize operation or a negative extent would exclude the resized iter domain from participating in vectorized data movement; 2. sliced inputs wouldn't get vectorized loads.

This is part of a series of stacked PRs that adds support for general resize in vectorization analysis and enables vectorized loads on sliced inputs as well.
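
As a quick illustration, here is a minimal sketch of the kind of fusion this enables, modeled on the updated `SliceVectorization` test in `tests/cpp/test_resize.cpp` below. The helpers (`makeContigConcreteTensor`, `scheduleAndRun`) and the expected factor of 4 are taken from those tests; treat this as a sketch, not an exact repro:

```cpp
// Sketch: slice a contiguous 1-D input so that both resize extents are -4,
// a vectorization-friendly size, then let the pointwise scheduler pick the
// vectorization factor automatically.
auto fusion_ptr = std::make_unique<Fusion>();
auto& fusion = *fusion_ptr;
FusionGuard fg(fusion_ptr.get());

constexpr int64_t N = 1024;
auto tv0 = makeContigConcreteTensor({N + 8});
fusion.addInput(tv0);

// Slice [4, N + 4): resize extents are -4 on the left and -4 on the right.
auto tv1 = slice(
    tv0,
    {{IrBuilder::create<Val>(4L),
      IrBuilder::create<Val>(N + 4L),
      IrBuilder::create<Val>(1L)}});
fusion.addOutput(tv1);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn(N + 8, options);
std::vector<c10::IValue> aten_inputs = {t0};

auto cg_results =
    scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs);
auto pparams = cg_results.heuristic_params->as<PointwiseParams>();
// With this change, the sliced input should get a vectorized load.
EXPECT_EQ(pparams->vectorization_factor, 4);
```

Before this PR, the `SliceOp` use would have excluded `tv0` from vectorized loads entirely (see the removed early-`continue` in `csrc/scheduler/utils.cpp` below).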

Order of PRs:
1. Adding general support for the `resize` op in `propagateResize` during projection, and adding support for negative resize extents in propagation.
#3457

2. ~~Adding an alignment check on stride for resize-introduced non-contiguity, where a contiguous dimension becomes non-contiguous due to a resize on its immediate inner dimension. #3528~~ [With the more restrictive analysis in PR 1, this second PR became optional and has been dropped.]

**_3. Enable vectorized loads on slice, refactoring the manual slice-vectorization tests to use the automatic scheduler instead._**
jjsjann123 authored Dec 23, 2024
1 parent f0ab0cd commit 6143a6b
Showing 2 changed files with 144 additions and 96 deletions.
16 changes: 4 additions & 12 deletions csrc/scheduler/utils.cpp
@@ -1198,9 +1198,9 @@ std::vector<TensorView*> cacheInputs(Fusion* fusion, bool unroll) {
for (auto tv : in_tvs) {
if (tv->uses().empty() || ir_utils::isTorchGatherLookupTv(tv) ||
ir_utils::isIndexSelectLookupTv(tv) ||
ir_utils::isTvUsedByOpsOfType<SliceOp, SelectOp>(tv)) {
// Right now, tensors that are input to the slice, select, and pad ops
// can't be cached as they must be in global memory.
ir_utils::isTvUsedByOpsOfType<SelectOp>(tv)) {
// Right now, tensors that are input to the select, gather and
// index_select ops can't be cached as they must be in global memory.
continue;
}

@@ -1214,7 +1214,7 @@ std::vector<TensorView*> cacheInputs(Fusion* fusion, bool unroll) {
// caching load instructions.
std::vector<Expr*> cached_uses;
for (auto use : tv->uses()) {
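// Pad and slice consumers read the input directly from global memory, so
// those uses are intentionally left uncached.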
if (!use->isA<PadOp>()) {
if (!use->isOneOf<PadOp, SliceOp>()) {
cached_uses.push_back(use);
}
}
@@ -1577,14 +1577,6 @@ std::vector<TensorView*> getInputsOutputsWithInnerDim(
continue;
}

// Slice op is explicitly not enabled for vectorized load.
if (std::all_of(
input_tv->uses().begin(),
input_tv->uses().end(),
[](Expr* e) -> bool { return e->isA<SliceOp>(); })) {
continue;
}

if (hasInnerDim(input_tv, vectorizable_dims, vectorize_pass)) {
vectorizable_tensors.push_back(input_tv);
}
224 changes: 140 additions & 84 deletions tests/cpp/test_resize.cpp
@@ -2342,23 +2342,28 @@ TEST_F(ResizeTest, SliceVectorization) {

constexpr int N = 1024 * 1024 * 64;

auto tv0 = makeContigConcreteTensor({N + 1});
auto tv0 = makeContigConcreteTensor({N + 8});
fusion.addInput(tv0);
auto tv1 = makeContigConcreteTensor({N});
fusion.addInput(tv1);

// Vectorization analysis is conservative: it considers the resize extent on
// both sides. The slice here could technically be vectorized even when tv0 is
// sized {N + 7}, which gives a resize extent of -3 on the right, but the
// analysis doesn't support that at this time and requires the resize extents
// to be a vectorization-friendly size.
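// Here, slicing tv0 of extent {N + 8} at [4, N + 4) gives a resize extent of
// -4 on both sides, each a multiple of the vectorization factor 4, so the
// vectorized load is accepted.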
auto tv2 = slice(
tv0,
{{IrBuilder::create<Val>(1L),
IrBuilder::create<Val>(N + 1L),
{{IrBuilder::create<Val>(4L),
IrBuilder::create<Val>(N + 4L),
IrBuilder::create<Val>(1L)}});

auto tv3 = add(tv2, tv1);

fusion.addOutput(tv3);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn(N + 1, options);
at::Tensor t0 = at::randn(N + 8, options);
at::Tensor t1 = at::randn(N, options);

std::vector<c10::IValue> inputs = {t0, t1};
@@ -2637,7 +2642,7 @@ TEST_F(ResizeTest, SliceAndReshape2) {
}

// Trivial case of slice vectorization. Just slicing a fusion input
TEST_F(ResizeTest, Slice1DVectorizeManual1) {
TEST_F(ResizeTest, Slice1DVectorize) {
auto fusion_ptr = std::make_unique<Fusion>();
auto& fusion = *fusion_ptr;
FusionGuard fg(fusion_ptr.get());
@@ -2655,28 +2660,70 @@ TEST_F(ResizeTest, Slice1DVectorizeManual1) {
sub(tv0->axis(0)->extent(), IrBuilder::create<Val>(slice_offset))}});
fusion.addOutput(tv1);

tv1->split(0, 4);
tv1->split(0, 128);
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn(shape, options);
std::vector<c10::IValue> aten_inputs({t0});

tv1->axis(0)->parallelize(ParallelType::BIDx);
tv1->axis(1)->parallelize(ParallelType::TIDx);
tv1->axis(2)->parallelize(ParallelType::Vectorize);
auto cg_results =
scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs);
auto pparams = cg_results.heuristic_params->as<PointwiseParams>();
// check vectorization
ASSERT_EQ(pparams->vectorization_factor, 4)
<< "Unexpected factor of vectorization";
EXPECT_THAT(
tv1->getLoopDomain(),
Contains(Property(&IterDomain::getParallelType, ParallelType::Vectorize)))
<< "Failed to vectorize: " << tv1;

testValidate(&fusion, cg_results.outputs, aten_inputs, __LINE__, __FILE__);
}

// An input is sliced twice. Both should be vectorizable.
TEST_F(ResizeTest, Slice1DVectorize2) {
auto fusion_ptr = std::make_unique<Fusion>();
auto& fusion = *fusion_ptr;
FusionGuard fg(fusion_ptr.get());

const int64_t slice_offset = 4;
const std::vector<int64_t> shape({1024L * 1024L});

auto tv0 = makeContigConcreteTensor(shape);
fusion.addInput(tv0);

// The following two slices are vectorized individually. No cache is introduced.
auto tv1 = slice(
tv0,
{{IrBuilder::create<Val>(slice_offset),
sub(tv0->axis(0)->extent(), IrBuilder::create<Val>(slice_offset))}});
fusion.addOutput(tv1);

auto tv2 = slice(
tv0,
{{IrBuilder::create<Val>(slice_offset * 2),
sub(tv0->axis(0)->extent(),
IrBuilder::create<Val>(slice_offset * 2))}});
fusion.addOutput(tv2);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn(shape, options);
std::vector<c10::IValue> aten_inputs({t0});

KernelExecutor ke;
ke.compile(&fusion, aten_inputs);
auto cg_outputs = ke.run(aten_inputs);
auto cg_results =
scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs);
auto pparams = cg_results.heuristic_params->as<PointwiseParams>();
// check vectorization
ASSERT_EQ(pparams->vectorization_factor, 4)
<< "Unexpected factor of vectorization";
EXPECT_THAT(
tv1->getLoopDomain(),
Contains(Property(&IterDomain::getParallelType, ParallelType::Vectorize)))
<< "Failed to vectorize: " << tv1;

auto ref =
t0.index({at::indexing::Slice(slice_offset, shape[0] - slice_offset)});
ASSERT_TRUE(ref.equal(cg_outputs[0]));
testValidate(&fusion, cg_results.outputs, aten_inputs, __LINE__, __FILE__);
}

// An input is sliced twice. Both should be vectorizable.
TEST_F(ResizeTest, Slice1DVectorizeManual2) {
TEST_F(ResizeTest, Slice1DVectorize2Manual) {
auto fusion_ptr = std::make_unique<Fusion>();
auto& fusion = *fusion_ptr;
FusionGuard fg(fusion_ptr.get());
@@ -2732,7 +2779,46 @@ TEST_F(ResizeTest, Slice1DVectorizeManual2) {
}

// An input is sliced and also entirely read. Both should be vectorizable.
TEST_F(ResizeTest, Slice1DVectorizeManual3) {
TEST_F(ResizeTest, Slice1DVectorize3) {
auto fusion_ptr = std::make_unique<Fusion>();
auto& fusion = *fusion_ptr;
FusionGuard fg(fusion_ptr.get());

const int64_t slice_offset = 4;
const std::vector<int64_t> shape({1024L * 1024L});

auto tv0 = makeContigConcreteTensor(shape);
fusion.addInput(tv0);

auto tv1 = slice(
tv0,
{{IrBuilder::create<Val>(slice_offset),
sub(tv0->axis(0)->extent(), IrBuilder::create<Val>(slice_offset))}});
fusion.addOutput(tv1);

auto tv2 = set(tv0);
fusion.addOutput(tv2);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn(shape, options);
std::vector<c10::IValue> aten_inputs({t0});

auto cg_results =
scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs);
auto pparams = cg_results.heuristic_params->as<PointwiseParams>();
// check vectorization
ASSERT_EQ(pparams->vectorization_factor, 4)
<< "Unexpected factor of vectorization";
EXPECT_THAT(
tv1->getLoopDomain(),
Contains(Property(&IterDomain::getParallelType, ParallelType::Vectorize)))
<< "Failed to vectorize: " << tv1;

testValidate(&fusion, cg_results.outputs, aten_inputs, __LINE__, __FILE__);
}

// An input is sliced and also entirely read. Both should be vectorizable.
TEST_F(ResizeTest, Slice1DVectorize3Manual) {
auto fusion_ptr = std::make_unique<Fusion>();
auto& fusion = *fusion_ptr;
FusionGuard fg(fusion_ptr.get());
@@ -2780,6 +2866,7 @@ TEST_F(ResizeTest, Slice1DVectorizeManual3) {
ASSERT_TRUE(t0.equal(cg_outputs.at(1)));
}

// TODO: this is a case not yet supported by vectorization analysis
// Vectorizing a slice of [1:-3]. It's vectorizable as long as the
// offset at 1 is aligned
TEST_F(ResizeTest, Slice1DVectorizeManual4) {
@@ -2819,7 +2906,7 @@ TEST_F(ResizeTest, Slice1DVectorizeManual4) {
}

// Contig merged vectorization with slice
TEST_F(ResizeTest, Slice2DVectorizeManual1) {
TEST_F(ResizeTest, Slice2DVectorize1) {
auto fusion_ptr = std::make_unique<Fusion>();
auto& fusion = *fusion_ptr;
FusionGuard fg(fusion_ptr.get());
@@ -2841,85 +2928,65 @@ TEST_F(ResizeTest, Slice2DVectorizeManual1) {
{IrBuilder::create<Val>(0), tv0->axis(1)->extent()}});
fusion.addOutput(tv1);

tv1->merge(0);
tv1->split(0, 4);
tv1->split(0, 128);

tv1->axis(0)->parallelize(ParallelType::BIDx);
tv1->axis(1)->parallelize(ParallelType::TIDx);
tv1->axis(2)->parallelize(ParallelType::Vectorize);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn(shape, options);
std::vector<c10::IValue> aten_inputs({t0});

KernelExecutor ke;
ke.compile(&fusion, aten_inputs);
auto cg_outputs = ke.run(aten_inputs);
auto cg_results =
scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs);
auto pparams = cg_results.heuristic_params->as<PointwiseParams>();
// check vectorization
ASSERT_EQ(pparams->vectorization_factor, 4)
<< "Unexpected factor of vectorization";
EXPECT_THAT(
tv1->getLoopDomain(),
Contains(Property(&IterDomain::getParallelType, ParallelType::Vectorize)))
<< "Failed to vectorize: " << tv1;

auto ref = t0.index(
{at::indexing::Slice(slice_offset, shape[0] - slice_offset),
at::indexing::Slice(0, at::indexing::None)});
ASSERT_TRUE(ref.equal(cg_outputs.at(0)));
testValidate(&fusion, cg_results.outputs, aten_inputs, __LINE__, __FILE__);
}

// Fully contiguous tensor, but a sliced domain makes the domain to
// the left non-contiguous
TEST_F(ResizeTest, Slice3DVectorizeManual1) {
// the left non-contiguous, hence we need to check its stride
TEST_F(ResizeTest, Slice3DVectorize1) {
auto fusion_ptr = std::make_unique<Fusion>();
auto& fusion = *fusion_ptr;
FusionGuard fg(fusion_ptr.get());

const std::vector<int64_t> shape({4, 1025, 3});
const std::vector<int64_t> shape({1024, 1025, 3});

auto tv0 = makeContigConcreteTensor(shape);
fusion.addInput(tv0);

auto tv1 = slice(
tv0,
{{IrBuilder::create<Val>(0), tv0->axis(0)->extent()},
{IrBuilder::create<Val>(4), IrBuilder::create<Val>(6)},
{IrBuilder::create<Val>(4), IrBuilder::create<Val>(1024)},
{IrBuilder::create<Val>(0), tv0->axis(2)->extent()}});
fusion.addOutput(tv1);
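// The sliced middle domain makes the outermost domain non-contiguous; its
// stride, 1025 * 3 = 3075, is odd, so no vectorization factor > 1 is valid
// here and the scheduler is expected to fall back to a factor of 1.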

// Vectorize tv1 by a factor of 2. The sliced domain and the
// innermost domain can be contiguous merged, thus producing a
// domain of extent 6, so vectorization by a factor of 2 appears to
// be valid, but due to the middle domain being sliced, the
// outermost domain is no longer contiguous, which means its stride
// must be divisible by 2, which is not the case here.

// [4, 2, 3]
tv1->merge(1);
// [4, 6]
tv1->split(1, 2);
// [4, 3, 2]

tv1->axis(0)->parallelize(ParallelType::BIDx);
tv1->axis(1)->parallelize(ParallelType::TIDx);
tv1->axis(2)->parallelize(ParallelType::Vectorize);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn(shape, options);
std::vector<c10::IValue> aten_inputs({t0});

KernelExecutor ke;
ke.compile(&fusion, aten_inputs);
auto cg_results =
scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs);
auto pparams = cg_results.heuristic_params->as<PointwiseParams>();

EXPECT_THAT(
[&]() { ke.run(aten_inputs); },
ThrowsMessage<nvfError>(
HasSubstr("with word size 2 not possible due to invalid stride")));
ASSERT_EQ(pparams->vectorization_factor, 1)
<< "Unexpected factor of vectorization";

testValidate(&fusion, cg_results.outputs, aten_inputs, __LINE__, __FILE__);
}

// Similar to Slice3DVectorizeManual2 but with a middle broadcast
// Similar to Slice3DVectorize1 but with a middle broadcast
// domain
TEST_F(ResizeTest, Slice3DVectorizeManual2) {
TEST_F(ResizeTest, Slice3DVectorize2) {
auto fusion_ptr = std::make_unique<Fusion>();
auto& fusion = *fusion_ptr;
FusionGuard fg(fusion_ptr.get());

const std::vector<int64_t> shape({4, 1, 1025, 3});
const std::vector<int64_t> shape({1024, 1, 1025, 3});

auto tv0 = makeContigConcreteTensor(shape);
fusion.addInput(tv0);
@@ -2932,27 +2999,18 @@ TEST_F(ResizeTest, Slice3DVectorizeManual2) {
{IrBuilder::create<Val>(0), tv0->axis(3)->extent()}});
fusion.addOutput(tv1);

// [4, 1, 1024, 3]
tv1->merge(2);
// [4, 1, 3072]
tv1->split(2, 4);
// [4, 1, 768, 4]

tv1->axis(0)->parallelize(ParallelType::BIDx);
tv1->axis(2)->parallelize(ParallelType::TIDx);
tv1->axis(3)->parallelize(ParallelType::Vectorize);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn(shape, options);
std::vector<c10::IValue> aten_inputs({t0});

KernelExecutor ke;
ke.compile(&fusion, aten_inputs);
auto cg_results =
scheduleAndRun(&fusion, SchedulerType::PointWise, aten_inputs);
auto pparams = cg_results.heuristic_params->as<PointwiseParams>();
// check vectorization
ASSERT_EQ(pparams->vectorization_factor, 1)
<< "Unexpected factor of vectorization";

EXPECT_THAT(
[&]() { ke.run(aten_inputs); },
ThrowsMessage<nvfError>(
HasSubstr("with word size 4 not possible due to invalid stride")));
testValidate(&fusion, cg_results.outputs, aten_inputs, __LINE__, __FILE__);
}

// Repro of issue 540 without transpose
@@ -3434,14 +3492,12 @@ TEST_F(ResizeTest, PadVectorization) {
ASSERT_EQ(pparams->vectorization_factor, 4)
<< "Unexpected factor of vectorization";

// Make sure tv1 is not vectorized, i.e., no loop IterDomains are vectorized.
// Make sure tv1/tv2 are vectorized, i.e., at least one loop IterDomain is
// vectorized.
EXPECT_THAT(
tv1->getLoopDomain(),
Contains(Property(&IterDomain::getParallelType, ParallelType::Vectorize)))
<< "Failed to vectorize: " << tv1;

// Make sure tv2 should be vectorized, i.e., at least one loop IterDomain is
// vectorized.
EXPECT_THAT(
tv2->getLoopDomain(),
Contains(Property(&IterDomain::getParallelType, ParallelType::Vectorize)))