Matmul with DID loop split #3651

Merged
8 commits merged on Jan 21, 2025

Changes from 3 commits

31 changes: 30 additions & 1 deletion csrc/ir/utils.cpp
@@ -12,6 +12,7 @@
#include <ir/iostream.h>
#include <ir/utils.h>
#include <iter_visitor.h>
#include <multidevice/utils.h>
#include <ops/arith.h>
#include <scheduler/mma_utils.h>

Expand Down Expand Up @@ -1312,9 +1313,37 @@ bool hasTrivialAllocationDomain(const TensorView* tv) {
}
const std::vector<IterDomain*>& alloc = tv->getMaybeAllocationDomain();
const std::vector<IterDomain*>& logical = tv->getLogicalDomain();
const auto alloc_no_red_bcast =
TensorDomain::noBroadcasts(TensorDomain::noReductions(alloc));
const auto logical_no_red_bcast =
TensorDomain::noBroadcasts(TensorDomain::noReductions(logical));

if (!isSharded(tv)) {
return alloc_no_red_bcast == logical_no_red_bcast;
}

// This handles the case where DID parallelization is applied on
// allocation/logical dimensions.
const auto alloc_no_red_bcast_device =
TensorDomain::noDevices(alloc_no_red_bcast);
if (alloc_no_red_bcast_device.size() != logical_no_red_bcast.size()) {
return false;
}

int64_t sharded_logical_idx = getShardedLogicalAxis(tv, ParallelType::DIDx);
for (auto idx : c10::irange((int64_t)logical_no_red_bcast.size())) {
// Compare all but the sharded axis since the logical and alloc ID will have
// different extents.
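// For example, a weight with logical domain {i{K}, i{N}} loop-split on N
// across d devices has allocation domain {i{K}, iDIDx{d}, i{N/d}}; dropping
// the device axis leaves {i{K}, i{N/d}}, which matches the logical domain
// everywhere except the sharded N axis.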
if (idx == sharded_logical_idx) {
continue;
}
if (alloc_no_red_bcast_device.at(idx) != logical_no_red_bcast.at(idx)) {
return false;
}
}
return true;
}

bool hasUniformSiblings(Expr* expr) {
return !expr->isOneOf<SdpaFwdOp, SdpaBwdOp>();
}
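
For intuition, the check above can be modeled outside nvFuser with plain Python lists. The helper below is an illustrative sketch, not part of this PR or the nvFuser API; the (extent, is_device) axis tuples and the name has_trivial_allocation are assumptions used only to mirror the comparison that hasTrivialAllocationDomain performs after a DID loop split.

# Illustrative model (not the nvFuser API) of hasTrivialAllocationDomain
# under a DID loop split. An axis is modeled as an (extent, is_device) pair.
def has_trivial_allocation(logical, alloc, sharded_logical_idx):
    # Mirror TensorDomain::noDevices: drop device-parallel axes from the
    # allocation domain.
    alloc_no_device = [a for a in alloc if not a[1]]
    if len(alloc_no_device) != len(logical):
        return False
    for idx, (log_ax, alloc_ax) in enumerate(zip(logical, alloc_no_device)):
        # Skip the sharded axis: logical N and per-device N/d differ in extent.
        if idx == sharded_logical_idx:
            continue
        if log_ax[0] != alloc_ax[0]:
            return False
    return True

# Weight (K, N) = (768, 3072) loop-split across d = 4 devices: the allocation
# domain becomes [K, DIDx{4}, N/4]. Only the sharded N axis differs in extent,
# so the allocation domain is still considered trivial.
logical = [(768, False), (3072, False)]
alloc = [(768, False), (4, True), (3072 // 4, False)]
assert has_trivial_allocation(logical, alloc, sharded_logical_idx=1)
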
55 changes: 55 additions & 0 deletions tests/python/test_multidevice.py
@@ -161,6 +161,61 @@ def multidevice_schedule(self) -> None:
torch.testing.assert_close(in_grad.cpu(), expected_in_grad, rtol=1e-3, atol=1e-2)


@pytest.mark.mpi
def test_matmul_loop_split(multidevice_test):
class Model(FusionDefinition):
def __init__(self, num_devices, batch, sequence, hidden):
super().__init__()
self._num_devices = num_devices
self._batch = batch
self._sequence = sequence
self._hidden = hidden

def definition(self):
d, b, s, e = self._num_devices, self._batch, self._sequence, self._hidden
self.inp = self.define_tensor([b, s, e])
self.weight = self.define_tensor([e, d * e])
self.out = self.ops.matmul(self.inp, self.weight)
self.add_output(self.out)

def multidevice_schedule(self):
for t in [self.inp, self.weight, self.out]:
self.sched._set_device_mesh(t, mesh)

# Shard N for weight (K, N)
self.sched.split(self.weight, -1, d, False)
self.sched.parallelize(self.weight, -2, nvfuser.ParallelType.mesh_x)
self.sched.set_allocation_as_loop(self.weight)

# Output of matmul: {.., i{M}, i{N}, r{K}}
# Shard N -> axis(-2)
self.sched.split(self.out, -2, d, False)
self.sched.parallelize(self.out, -3, nvfuser.ParallelType.mesh_x)
self.sched.set_allocation_as_loop(self.out)

d = multidevice_test.size
mesh = nvfuser.DeviceMesh(range(d))
rank = multidevice_test.rank

torch.cuda.set_device(multidevice_test.local_rank)

b, s, e = 2, 1024, 768
inp_tensor = torch.randn(b, s, e, device="cuda")
unsharded_weight_tensor = torch.randn(e, d * e)
sharded_weight_tensor = multidevice_test.shard_tensor(unsharded_weight_tensor, -1, mesh)

fd = Model(d, b, s, e)
out_tensors = fd.execute([inp_tensor, sharded_weight_tensor])

# [b, s, d*e]
unsharded_out_tensor = torch.matmul(inp_tensor.cpu(), unsharded_weight_tensor)
expected_out_tensor = multidevice_test.shard_tensor(unsharded_out_tensor, -1, mesh)
# rtol is the same as the default for fp32. atol is slightly increased.
torch.testing.assert_close(
out_tensors[0], expected_out_tensor.squeeze(0), rtol=1.3e-6, atol=1e-3
)
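
For reference, the data layout implied by the schedule above can be sketched in plain PyTorch, outside nvFuser. This is a minimal sketch assuming shard_tensor splits its input evenly along the given axis, so rank r holds the r-th column block of the weight and produces the matching column block of the output; the variable names below are illustrative only.

# Plain-PyTorch sketch of the loop-split layout (no nvFuser). Assumes even
# sharding of the last axis across d devices.
import torch

d, b, s, e = 4, 2, 1024, 768
inp = torch.randn(b, s, e)
weight = torch.randn(e, d * e)

# Rank r's weight shard: columns [r*e, (r+1)*e), i.e. shape (e, e).
weight_shards = torch.chunk(weight, d, dim=-1)
# Each rank's output shard is the matching (b, s, e) column block.
out_shards = [torch.matmul(inp, w) for w in weight_shards]

# Concatenating the per-rank outputs along N reproduces the unsharded matmul
# (tolerances mirror those used in the test above).
torch.testing.assert_close(
    torch.cat(out_shards, dim=-1), torch.matmul(inp, weight), rtol=1.3e-6, atol=1e-3
)
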


class QkvFormat(Enum):
BHSE = auto()
BSHE = auto()