#0: Support uneven shards for direct write/readback
tt-aho committed Sep 17, 2024 (1 parent 9a1fa41, commit bf04e2e)
Showing 22 changed files with 520 additions and 206 deletions.
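
Every ShardSpecBuffer call site in the visible hunks gains a trailing argument computed from the page size (Nt * page_size_bytes, num_tiles_per_core_width * page_size_bytes, page_shape[0] * tensor_shape[1] * this->element_size). A minimal sketch of the recurring before/after pattern, inferred only from these hunks; the parameter name row_size_bytes is hypothetical and the other arguments are placeholders:

    // Before: the shard spec ended at the tensor shape in pages.
    ShardSpecBuffer spec_old(
        shard_grid, shard_shape, ShardOrientation::ROW_MAJOR, /*halo=*/false,
        TensorMemoryLayout::HEIGHT_SHARDED, page_shape, tensor2d_shape_in_pages);

    // After: a byte count for one row of pages is passed explicitly,
    // presumably so the final, partially filled shard of an uneven tensor
    // can be sized correctly.
    ShardSpecBuffer spec_new(
        shard_grid, shard_shape, ShardOrientation::ROW_MAJOR, /*halo=*/false,
        TensorMemoryLayout::HEIGHT_SHARDED, page_shape, tensor2d_shape_in_pages,
        row_size_bytes);  // e.g. Nt * page_size_bytes in the first hunk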
@@ -1659,8 +1659,10 @@ std::shared_ptr<tt::tt_metal::Buffer> create_and_transfer_data_sharded_cb(
{Mt * tt::constants::TILE_HEIGHT, Nt * tt::constants::TILE_WIDTH},
ShardOrientation::ROW_MAJOR,
false,
TensorMemoryLayout::HEIGHT_SHARDED,
{tt::constants::TILE_HEIGHT, tt::constants::TILE_WIDTH},
{Mt, Nt});
{Mt, Nt},
Nt * page_size_bytes);

log_debug("size_bytes: {}", size_bytes);
log_debug("page_size_bytes: {}", page_size_bytes);
@@ -1691,8 +1693,10 @@ std::shared_ptr<tt::tt_metal::Buffer> create_and_transfer_data_sharded_cb_fp8(
{Mt * tt::constants::TILE_HEIGHT, Nt * tt::constants::TILE_WIDTH},
ShardOrientation::ROW_MAJOR,
false,
TensorMemoryLayout::HEIGHT_SHARDED,
{tt::constants::TILE_HEIGHT, tt::constants::TILE_WIDTH},
{Mt, Nt});
{Mt, Nt},
Nt * page_size_bytes);

log_debug("size_bytes: {}", size_bytes);
log_debug("page_size_bytes: {}", page_size_bytes);
@@ -44,9 +44,11 @@ struct L1Config {
(uint32_t)num_tiles_per_core_width * tt::constants::TILE_WIDTH},
ShardOrientation::ROW_MAJOR,
false,
buffer_layout,
{tt::constants::TILE_HEIGHT, tt::constants::TILE_WIDTH},
{1* num_cores_height * num_tiles_per_core_height * num_cores_height,
num_tiles_per_core_width * num_cores_width});
num_tiles_per_core_width * num_cores_width},
num_tiles_per_core_width * page_size_bytes);
}
};

@@ -43,18 +43,39 @@ class BufferStressTestConfigSharded {
TensorMemoryLayout mem_config = TensorMemoryLayout::HEIGHT_SHARDED;
ShardOrientation shard_orientation = ShardOrientation::ROW_MAJOR;
bool halo = false;
bool uneven = false;

BufferStressTestConfigSharded(std::array<uint32_t, 2> pages_per_core, std::array<uint32_t, 2> cores) :
max_num_pages_per_core(pages_per_core), max_num_cores(cores) {
this->num_pages_per_core = pages_per_core;
this->num_cores = cores;
BufferStressTestConfigSharded(std::array<uint32_t, 2> pages_per_core, std::array<uint32_t, 2> cores, bool uneven=false) :
max_num_pages_per_core(pages_per_core), num_pages_per_core(pages_per_core), max_num_cores(cores), num_cores(cores), uneven(uneven) {
}

std::array<uint32_t, 2> tensor2d_shape() {
return {num_pages_per_core[0] * num_cores[0], num_pages_per_core[1] * num_cores[1]};
auto shard_shape = this->shard_shape();
auto tensor_shape = shard_shape;
switch (mem_config) {
case TensorMemoryLayout::HEIGHT_SHARDED:
tensor_shape = {num_cores[0] * num_cores[1] * shard_shape[0], shard_shape[1]};
break;
case TensorMemoryLayout::WIDTH_SHARDED:
tensor_shape = {shard_shape[0], num_cores[0] * num_cores[1] * shard_shape[1]};
break;
case TensorMemoryLayout::BLOCK_SHARDED:
if (shard_orientation == ShardOrientation::ROW_MAJOR) {
tensor_shape = {num_cores[1] * shard_shape[0], num_cores[0] * shard_shape[1]};
} else {
tensor_shape = {num_cores[0] * shard_shape[0], num_cores[1] * shard_shape[1]};
}
break;
default:
TT_THROW("Invalid memory configuration");
}
if (uneven) {
tensor_shape[1] -= this->page_shape[1] / 2;
}
return tensor_shape;
}

uint32_t num_pages() { return tensor2d_shape()[0] * tensor2d_shape()[1]; }
uint32_t num_pages() { return num_cores[0] * num_pages_per_core[0] * num_cores[1] * num_pages_per_core[1]; }

std::array<uint32_t, 2> shard_shape() {
return {num_pages_per_core[0] * page_shape[0], num_pages_per_core[1] * page_shape[1]};
@@ -66,13 +87,16 @@ class BufferStressTestConfigSharded {
}

ShardSpecBuffer shard_parameters() {
auto tensor_shape = this->tensor2d_shape();
return ShardSpecBuffer(
this->shard_grid(),
this->shard_shape(),
this->shard_orientation,
this->halo,
this->mem_config,
this->page_shape,
this->tensor2d_shape());
tensor_shape,
page_shape[0] * tensor_shape[1] * this->element_size);
}

uint32_t page_size() { return page_shape[0] * page_shape[1] * element_size; }
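
A standalone sketch of the tensor2d_shape() arithmetic above, using one of the uneven WIDTH_SHARDED parameter sets exercised by the new test below (cores {5, 3}, pages per core {7, 1}, page_shape {1, 2048}); the formulas mirror the diff, only the worked numbers are added here:

    #include <array>
    #include <cstdint>
    #include <cstdio>

    int main() {
        std::array<uint32_t, 2> num_cores = {5, 3};
        std::array<uint32_t, 2> num_pages_per_core = {7, 1};
        std::array<uint32_t, 2> page_shape = {1, 2048};

        // shard_shape(): pages per core scaled by the page shape -> {7, 2048}
        std::array<uint32_t, 2> shard_shape = {
            num_pages_per_core[0] * page_shape[0],
            num_pages_per_core[1] * page_shape[1]};

        // WIDTH_SHARDED: all 5 * 3 = 15 cores tile the width -> {7, 30720}
        std::array<uint32_t, 2> tensor_shape = {
            shard_shape[0], num_cores[0] * num_cores[1] * shard_shape[1]};

        // uneven: trim half a page off the width so the last core's shard
        // is only partially filled -> {7, 29696}
        tensor_shape[1] -= page_shape[1] / 2;

        // num_pages() still counts the full allocation: 5 * 7 * 3 * 1 = 105
        uint32_t num_pages = num_cores[0] * num_pages_per_core[0] *
                             num_cores[1] * num_pages_per_core[1];

        std::printf("tensor2d_shape = {%u, %u}, num_pages = %u\n",
                    tensor_shape[0], tensor_shape[1], num_pages);
        return 0;
    }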
@@ -254,7 +278,14 @@ void stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_sharded(
}

vector<uint32_t> res;
res.resize(buf_size / sizeof(uint32_t));
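// For uneven configs the logical tensor is narrower than the full shard
// allocation, so the host vectors are sized from tensor2d_shape() rather
// than from buf_size; the trailing shard is only partially valid.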
if (config.uneven) {
auto shape = config.tensor2d_shape();
uint32_t width = shape[1] * config.element_size;
src.resize(shape[0] * width / sizeof(uint32_t));
res.resize(shape[0] * width / sizeof(uint32_t));
} else {
res.resize(buf_size / sizeof(uint32_t));
}
if (cq_read) {
EnqueueReadBuffer(cq, buf, res.data(), true);
} else {
@@ -721,6 +752,57 @@ TEST_F(CommandQueueSingleCardFixture, ShardedBufferL1ReadWrites) {
}
}

TEST_F(CommandQueueSingleCardFixture, ShardedBufferL1UnevenReadWrites) {
std::map<std::string, std::vector<std::array<uint32_t, 2>>> test_params;

for (Device *device : devices_) {
// This test hangs on Blackhole A0 when static VCs are used through static TLBs and a large number of reads/writes are issued;
// the workaround is to use dynamic VCs (implemented in UMD).
if (tt::Cluster::instance().is_galaxy_cluster()) {
test_params = {
{"cores",
{{1, 1},
{static_cast<uint32_t>(device->compute_with_storage_grid_size().x),
static_cast<uint32_t>(device->compute_with_storage_grid_size().y)}}},
{"num_pages", {{3, 1}}},
{"page_shape", {{1, 32}}}};
} else {
test_params = {
{"cores",
{
{5, 3},
{3, 5},
{5, 5},
{static_cast<uint32_t>(device->compute_with_storage_grid_size().x),
static_cast<uint32_t>(device->compute_with_storage_grid_size().y)}}},
{"num_pages", {{7, 1}, {3, 1}, {67, 1}, {137, 1}}},
{"page_shape", {{1, 8}, {1, 120}, {1, 2048}}}};
}
for (const std::array<uint32_t, 2> cores : test_params.at("cores")) {
for (const std::array<uint32_t, 2> num_pages : test_params.at("num_pages")) {
for (const std::array<uint32_t, 2> page_shape : test_params.at("page_shape")) {
for (const TensorMemoryLayout shard_strategy :
{TensorMemoryLayout::WIDTH_SHARDED,
TensorMemoryLayout::BLOCK_SHARDED}) {
for (const uint32_t num_iterations : {
1,
}) {
BufferStressTestConfigSharded config(num_pages, cores, true);
config.seed = 0;
config.num_iterations = num_iterations;
config.mem_config = shard_strategy;
config.page_shape = page_shape;
tt::log_info(tt::LogTest, "Device: {} cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", device->id(), cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations);
local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_sharded(
device, device->command_queue(), config, BufferType::L1, false);
}
}
}
}
}
}
}
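
Relative to ShardedBufferL1ReadWrites above, this uneven variant only sweeps WIDTH_SHARDED and BLOCK_SHARDED, presumably because tensor2d_shape() trims the tensor width by half a page: under HEIGHT_SHARDED that trim would shorten every page rather than leave a partial trailing shard.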

TEST_F(CommandQueueSingleCardFixture, ShardedBufferDRAMReadWrites) {
for (Device *device : devices_) {
for (const std::array<uint32_t, 2> cores :
64 changes: 64 additions & 0 deletions tests/ttnn/unit_tests/operations/test_core.py
@@ -403,6 +403,70 @@ def test_shard_with_corerangeset(
assert_with_pcc(torch_input_tensor, output, 1.0)


@pytest.mark.parametrize(
"data_transfer_strategy",
[
(DirectReadWriteType.READ_ONLY),
(DirectReadWriteType.WRITE_ONLY),
(DirectReadWriteType.NONE),
(DirectReadWriteType.READ_WRITE),
],
)
@pytest.mark.parametrize(
"input_shape, input_shard_shape, input_sharded_memory_config_args",
[
(
[1, 1, 32, 1000],
[32, 256],
dict(
core_grid=ttnn.CoreRangeSet(
{
ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(0, 3)),
}
),
strategy=ttnn.ShardStrategy.WIDTH,
),
),
(
[1, 1, 120, 1000],
[32, 256],
dict(
core_grid=ttnn.CoreRangeSet(
{
ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(3, 3)),
}
),
strategy=ttnn.ShardStrategy.BLOCK,
),
),
],
)
def test_uneven_shard(device, input_shape, input_shard_shape, input_sharded_memory_config_args, data_transfer_strategy):
torch_input_tensor = torch.rand(input_shape, dtype=torch.bfloat16)
input_shard_memory_config = ttnn.create_sharded_memory_config(
input_shard_shape, **input_sharded_memory_config_args, use_height_and_width_as_shard_shape=True
)

if data_transfer_strategy == DirectReadWriteType.READ_ONLY or data_transfer_strategy == DirectReadWriteType.NONE:
interleaved_input_tensor = ttnn.from_torch(
torch_input_tensor, layout=ttnn.ROW_MAJOR_LAYOUT, device=device, memory_config=ttnn.DRAM_MEMORY_CONFIG
)
# interleaved_to_sharded
sharded_input_tensor = ttnn.to_memory_config(interleaved_input_tensor, input_shard_memory_config)
else:
sharded_input_tensor = ttnn.from_torch(
torch_input_tensor, layout=ttnn.ROW_MAJOR_LAYOUT, device=device, memory_config=input_shard_memory_config
)
if data_transfer_strategy == DirectReadWriteType.WRITE_ONLY or data_transfer_strategy == DirectReadWriteType.NONE:
# sharded_to_interleaved
interleaved_output_tensor = ttnn.to_memory_config(sharded_input_tensor, ttnn.DRAM_MEMORY_CONFIG)
output = ttnn.to_torch(interleaved_output_tensor)
else:
output = ttnn.to_torch(sharded_input_tensor)

assert_with_pcc(torch_input_tensor, output, 1.0)
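
In both parameter sets the shard grid deliberately overshoots the tensor. For the width-sharded case, a [32, 256] shard over the four cores (0,0) through (0,3) covers 4 * 256 = 1024 columns while the tensor is only 1000 wide, so the last shard holds just 1000 - 3 * 256 = 232 valid columns. The block-sharded case is uneven in both dimensions: the 4 x 4 grid covers 128 rows and 1024 columns against a 120 x 1000 tensor.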


@pytest.mark.parametrize(
"shape, strategy, orientation, core_grid",
[