Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
rdspring1 committed Jan 10, 2025
1 parent 8a85cae commit 3a24daa
Showing 1 changed file with 47 additions and 28 deletions.
75 changes: 47 additions & 28 deletions tests/cpp/test_circular_buffering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1000,25 +1000,10 @@ class TmaCircularBufferingTest
NVFuserTest::SetUp();
}

void hopperOnlyRegisterSharing(
KernelExecutor& ke,
Fusion* f,
const at::ArrayRef<c10::IValue>& inputs) {
bool enable_register_sharing =
std::holds_alternative<WarpSpecialized>(circular_buffer_type) &&
bool testEnablesRegisterSharing() {
return std::holds_alternative<WarpSpecialized>(circular_buffer_type) &&
std::get<WarpSpecialized>(circular_buffer_type)
.num_registers.has_value();
if (enable_register_sharing && deviceMajorMinorCheck(10)) {
try {
ke.compile(f, inputs);
} catch (const std::exception& e) {
const char* reference =
R"(Warp Specialized Circular Buffering uses the setmaxnreg ptx instruction, which requires Hopper (9.0))";
const char* str_match_pointer = strstr(e.what(), reference);
ASSERT_TRUE(str_match_pointer != nullptr);
}
FAIL() << "Expected failure!";
}
}

template <typename data_type>
Expand Down Expand Up @@ -1158,6 +1143,10 @@ TEST_F(NVFuserTest, ElectSyncCompatibility) {

TEST_P(TmaCircularBufferingTest, SingleDim) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
Expand Down Expand Up @@ -1199,7 +1188,6 @@ TEST_P(TmaCircularBufferingTest, SingleDim) {
at::Tensor t1 = at::exp(t0);

KernelExecutor ke;
hopperOnlyRegisterSharing(ke, fusion.get(), {t0});
ke.compile(fusion.get(), {t0});

std::vector<at::Tensor> cg_outputs = ke.run({t0});
Expand All @@ -1209,6 +1197,10 @@ TEST_P(TmaCircularBufferingTest, SingleDim) {

TEST_P(TmaCircularBufferingTest, SingleDimUnroll) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
Expand Down Expand Up @@ -1254,7 +1246,6 @@ TEST_P(TmaCircularBufferingTest, SingleDimUnroll) {
at::Tensor t1 = at::exp(t0);

KernelExecutor ke;
hopperOnlyRegisterSharing(ke, fusion.get(), {t0});
ke.compile(fusion.get(), {t0});

int64_t axis_extent =
Expand All @@ -1271,6 +1262,10 @@ TEST_P(TmaCircularBufferingTest, SingleDimUnroll) {

TEST_P(TmaCircularBufferingTest, SingleDimUnswitch) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
Expand Down Expand Up @@ -1316,7 +1311,6 @@ TEST_P(TmaCircularBufferingTest, SingleDimUnswitch) {
at::Tensor t1 = at::exp(t0);

KernelExecutor ke;
hopperOnlyRegisterSharing(ke, fusion.get(), {t0});
ke.compile(fusion.get(), {t0});

int64_t axis_extent =
Expand All @@ -1333,6 +1327,10 @@ TEST_P(TmaCircularBufferingTest, SingleDimUnswitch) {

TEST_P(TmaCircularBufferingTest, MultiDim) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
Expand Down Expand Up @@ -1388,7 +1386,6 @@ TEST_P(TmaCircularBufferingTest, MultiDim) {
at::Tensor t1 = at::exp(t0);

KernelExecutor ke;
hopperOnlyRegisterSharing(ke, fusion.get(), {t0});
ke.compile(fusion.get(), {t0});

std::vector<at::Tensor> cg_outputs = ke.run({t0});
Expand All @@ -1398,6 +1395,10 @@ TEST_P(TmaCircularBufferingTest, MultiDim) {

TEST_P(TmaCircularBufferingTest, Pointwise) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}
std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

Expand Down Expand Up @@ -1458,7 +1459,6 @@ TEST_P(TmaCircularBufferingTest, Pointwise) {
at::Tensor t2 = t0 + t1;

KernelExecutor ke;
hopperOnlyRegisterSharing(ke, fusion.get(), {t0, t1});
ke.compile(fusion.get(), {t0, t1});

std::vector<at::Tensor> cg_outputs = ke.run({t0, t1});
Expand All @@ -1471,6 +1471,10 @@ TEST_P(TmaCircularBufferingTest, PointwiseCpAsync) {
<< "Needs shared memory predicate, but current needsSharedMemoryPredicate() returns false";

NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}
std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

Expand Down Expand Up @@ -1527,7 +1531,6 @@ TEST_P(TmaCircularBufferingTest, PointwiseCpAsync) {
at::Tensor t2 = t0 + t1;

KernelExecutor ke;
hopperOnlyRegisterSharing(ke, fusion.get(), {t0, t1});
ke.compile(fusion.get(), {t0, t1});

std::vector<at::Tensor> cg_outputs = ke.run({t0, t1});
Expand All @@ -1537,6 +1540,10 @@ TEST_P(TmaCircularBufferingTest, PointwiseCpAsync) {

TEST_P(TmaCircularBufferingTest, InnerReduction) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
Expand Down Expand Up @@ -1589,7 +1596,6 @@ TEST_P(TmaCircularBufferingTest, InnerReduction) {
at::Tensor t1 = sum(t0, {-1});

KernelExecutor ke;
hopperOnlyRegisterSharing(ke, fusion.get(), {t0});
ke.compile(fusion.get(), {t0});

std::vector<at::Tensor> cg_outputs = ke.run({t0});
Expand All @@ -1599,6 +1605,10 @@ TEST_P(TmaCircularBufferingTest, InnerReduction) {

TEST_P(TmaCircularBufferingTest, OuterReduction) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
Expand Down Expand Up @@ -1641,7 +1651,6 @@ TEST_P(TmaCircularBufferingTest, OuterReduction) {
at::Tensor t1 = sum(t0, {0});

KernelExecutor ke;
hopperOnlyRegisterSharing(ke, fusion.get(), {t0});
ke.compile(fusion.get(), {t0});

std::vector<at::Tensor> cg_outputs = ke.run({t0});
Expand All @@ -1653,6 +1662,10 @@ TEST_P(TmaCircularBufferingTest, OuterReduction) {

TEST_P(TmaCircularBufferingTest, Persistent) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

constexpr at::ScalarType dtype = at::ScalarType::Float;
constexpr int64_t correction = 0;
Expand Down Expand Up @@ -1771,7 +1784,6 @@ TEST_P(TmaCircularBufferingTest, Persistent) {

// Compile with KernelExecutor directly to avoid scheduling
KernelExecutor ke;
hopperOnlyRegisterSharing(ke, fusion.get(), {at_tv0});
ke.compile(fusion.get(), {at_tv0});
std::vector<at::Tensor> cg_outputs = ke.run({at_tv0});

Expand All @@ -1787,6 +1799,10 @@ TEST_P(TmaCircularBufferingTest, Persistent) {

TEST_P(TmaCircularBufferingTest, Matmul) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);
if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
Expand Down Expand Up @@ -1896,7 +1912,6 @@ TEST_P(TmaCircularBufferingTest, Matmul) {
(t0.unsqueeze(/*dim=*/-1) * t1.unsqueeze(/*dim=*/0)).sum(/*dim=*/1);

KernelExecutor ke;
hopperOnlyRegisterSharing(ke, fusion.get(), {t0, t1});
ke.compile(fusion.get(), {t0, t1});

std::vector<at::Tensor> cg_outputs = ke.run({t0, t1});
Expand All @@ -1909,6 +1924,11 @@ TEST_P(TmaCircularBufferingTest, Matmul) {
TEST_P(TmaCircularBufferingTest, MatmulWithBroadcastedInput) {
NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0);

if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) {
GTEST_SKIP() << "Register Sharing is only for hopper";
return;
}

std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

Expand Down Expand Up @@ -2014,7 +2034,6 @@ TEST_P(TmaCircularBufferingTest, MatmulWithBroadcastedInput) {
at::Tensor aten_output = (t0 * t1).sum(/*dim=*/1);

KernelExecutor ke;
hopperOnlyRegisterSharing(ke, fusion.get(), {t0, t1});
ke.compile(fusion.get(), {t0, t1});

std::vector<at::Tensor> cg_outputs = ke.run({t0, t1});
Expand Down

0 comments on commit 3a24daa

Please sign in to comment.