[Flow] Improve dispatch name categorization around broadcast/transpose (#17890)

The dispatch names largely exist to tell us
1) what kind of computation it is, and
2) what fusion came up with.

This patch changes the way that broadcasts and transposes are labeled to
reflect what we want to know about each dispatch. Essentially, it tries
to categorize dispatches as follows (example indexing maps for each
category are sketched after the descriptions):

Elementwise: Dispatches that are purely elementwise (identity) maps, with
potentially a few minor transposed/broadcasted operands. This indicates
that the core memory-bound operands are purely elementwise.

Transpose: Same as elementwise, except that either the input or the output
maps are permuted. This indicates that data movement is happening.

Broadcast: Cases where the input maps are all strict projections of the
output maps. This should only ever appear if something in fusion went
off the rails.
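
For illustration, here is a minimal sketch of the indexing-map shapes that
land in each bucket. The map names below are made up for this note; the
shapes mirror the tests added in this patch.

// "elementwise": identity output maps, at least one identity input map, and
// possibly a broadcasted operand whose map is a strict projection of the
// iteration space.
#identity   = affine_map<(d0, d1) -> (d0, d1)>
#bcast_in   = affine_map<(d0, d1) -> (d0)>

// "elementwise_transpose": as above, except an input or output map is a
// non-identity permutation of the iteration space.
#transpose  = affine_map<(d0, d1) -> (d1, d0)>

// "elementwise_broadcast": every input map is a strict projection, i.e. no
// input sees the full iteration space.
#projection = affine_map<(d0, d1) -> (d1)>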
qedawkins authored Jul 12, 2024
1 parent d65c6d4 commit 10dfd9d
Showing 2 changed files with 203 additions and 7 deletions.
@@ -227,13 +227,50 @@ static std::string summarizeLinalgOp(linalg::LinalgOp op) {
   // Categorize linalg.generic ops better. The following checks more specific
   // cases before more general ones.
   if (prefix.empty() && isa<linalg::GenericOp>(op)) {
-    if (llvm::all_of(op.getIndexingMapsArray(),
-                     [](AffineMap m) { return m.isIdentity(); })) {
+    SmallVector<AffineMap> indexingMaps = op.getIndexingMapsArray();
+    ArrayRef<AffineMap> inputMaps(indexingMaps.begin(),
+                                  indexingMaps.begin() + op.getNumDpsInputs());
+    ArrayRef<AffineMap> outputMaps(indexingMaps.begin() + op.getNumDpsInputs(),
+                                   indexingMaps.end());
+    bool isIdentityOuts =
+        llvm::all_of(outputMaps, [](AffineMap m) { return m.isIdentity(); });
+    bool isPermutationOuts =
+        llvm::all_of(outputMaps, [](AffineMap m) { return m.isPermutation(); });
+    bool isProjectedPermIns = llvm::all_of(
+        inputMaps, [](AffineMap m) { return m.isProjectedPermutation(true); });
+    int64_t numIdentityIn =
+        llvm::count_if(inputMaps, [](AffineMap m) { return m.isIdentity(); });
+    int64_t numPermutationIn = llvm::count_if(
+        inputMaps, [](AffineMap m) { return m.isPermutation(); });
+    // We categorize elementwise operations as follows:
+    // 1. All output maps are identity with the iteration space.
+    // 2. There is at least one input with an identity indexing map.
+    // 3. There are no permuted inputs that are not also broadcast.
+    //
+    // This categorization tells us that the dispatch includes limited or no
+    // non-trivial data movement.
+    bool hasIdentityInputRoot =
+        numIdentityIn > 0 && numIdentityIn == numPermutationIn;
+    if (isIdentityOuts && isProjectedPermIns && hasIdentityInputRoot) {
       prefix = "elementwise";
-    } else if (llvm::all_of(op.getIndexingMapsArray(),
-                            [](AffineMap m) { return m.isMinorIdentity(); })) {
-      // We have checked that this is not pure elementwise in the above.
-      prefix = "broadcast";
+    // We draw a distinction between pure elementwise operations and
+    // elementwise operations that include a transpose. To separate
+    // transposes, there are two cases:
+    // 1. 2) and 3) hold for elementwise, but the output maps are instead
+    //    permutations.
+    // 2. The output maps are permutations or identity, and the most major
+    //    input indexing map is a permutation.
+    } else if (isPermutationOuts && isProjectedPermIns &&
+               ((hasIdentityInputRoot && !isIdentityOuts) ||
+                numPermutationIn > numIdentityIn)) {
+      prefix = "elementwise_transpose";
+    // Broadcasts are an indication that fusion went off the rails. We treat
+    // anything where all output maps are permutations, but the inputs are all
+    // projected permutations (without full rank) as a broadcast, which could
+    // potentially be fused with other elementwise operations/transposes.
+    } else if (isPermutationOuts && isProjectedPermIns &&
+               numPermutationIn == 0) {
+      prefix = "elementwise_broadcast";
     } else if (isMatvecLike(op)) {
       prefix = "matvec_like";
     } else if (isMatmulLike(op)) {
@@ -211,7 +211,7 @@ flow.executable private @ex {
 #map = affine_map<(d0, d1) -> (d1)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
 flow.executable private @ex {
-  // CHECK: flow.executable.export public @ex_unpack_broadcast_384x512_f32_pack
+  // CHECK: flow.executable.export public @ex_unpack_elementwise_384x512_f32_pack
   flow.executable.export public @ex
   builtin.module {
     func.func @ex(%arg0: !flow.dispatch.tensor<readonly:tensor<24x32x16x16xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<24x512x16x1xf32>>) {
@@ -509,3 +509,162 @@ flow.executable private @ex {
    }
  }
}

// -----

// Test transposing elementwise operation.

#map = affine_map<(d0, d1) -> (d0)>
#map1 = affine_map<(d0, d1) -> (d1, d0)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
flow.executable private @ex {
  // CHECK: flow.executable.export public @ex_elementwise_transpose_7x5_f32
  flow.executable.export public @ex
  builtin.module {
    func.func @ex(%arg0: !flow.dispatch.tensor<readonly:tensor<5x7xf32>>,
                  %arg1: !flow.dispatch.tensor<readonly:tensor<7xf32>>,
                  %arg2: !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>) {
      %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [5, 7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<5x7xf32>> -> tensor<5x7xf32>
      %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [7], strides = [1] : !flow.dispatch.tensor<readonly:tensor<7xf32>> -> tensor<7xf32>
      %2 = tensor.empty() : tensor<7x5xf32>
      %3 = linalg.generic {
        indexing_maps = [#map, #map1, #map2],
        iterator_types = ["parallel", "parallel"]
      } ins(%1, %0 : tensor<7xf32>, tensor<5x7xf32>) outs(%2 : tensor<7x5xf32>) {
      ^bb0(%in: f32, %in_0: f32, %out: f32):
        %5 = arith.addf %in, %in_0 : f32
        linalg.yield %5 : f32
      } -> tensor<7x5xf32>
      flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [7, 5], strides = [1, 1] : tensor<7x5xf32> -> !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>
      return
    }
  }
}

// -----

// Same as the above, but with the transpose map represented on the output.

#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
flow.executable private @ex {
  // CHECK: flow.executable.export public @ex_elementwise_transpose_5x7_f32
  flow.executable.export public @ex
  builtin.module {
    func.func @ex(%arg0: !flow.dispatch.tensor<readonly:tensor<5x7xf32>>,
                  %arg1: !flow.dispatch.tensor<readonly:tensor<7xf32>>,
                  %arg2: !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>) {
      %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [5, 7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<5x7xf32>> -> tensor<5x7xf32>
      %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [7], strides = [1] : !flow.dispatch.tensor<readonly:tensor<7xf32>> -> tensor<7xf32>
      %2 = tensor.empty() : tensor<7x5xf32>
      %3 = linalg.generic {
        indexing_maps = [#map, #map1, #map2],
        iterator_types = ["parallel", "parallel"]
      } ins(%1, %0 : tensor<7xf32>, tensor<5x7xf32>) outs(%2 : tensor<7x5xf32>) {
      ^bb0(%in: f32, %in_0: f32, %out: f32):
        %5 = arith.addf %in, %in_0 : f32
        linalg.yield %5 : f32
      } -> tensor<7x5xf32>
      flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [7, 5], strides = [1, 1] : tensor<7x5xf32> -> !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>
      return
    }
  }
}

// -----

// Test marking a strictly broadcasting elementwise operation as a broadcast.

#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
flow.executable private @ex {
  // CHECK: flow.executable.export public @ex_elementwise_broadcast_7x5_f32
  flow.executable.export public @ex
  builtin.module {
    func.func @ex(%arg0: !flow.dispatch.tensor<readonly:tensor<5xf32>>,
                  %arg1: !flow.dispatch.tensor<readonly:tensor<5xf32>>,
                  %arg2: !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>) {
      %0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:tensor<5xf32>> -> tensor<5xf32>
      %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:tensor<5xf32>> -> tensor<5xf32>
      %2 = tensor.empty() : tensor<7x5xf32>
      %3 = linalg.generic {
        indexing_maps = [#map, #map, #map1],
        iterator_types = ["parallel", "parallel"]
      } ins(%1, %0 : tensor<5xf32>, tensor<5xf32>) outs(%2 : tensor<7x5xf32>) {
      ^bb0(%in: f32, %in_0: f32, %out: f32):
        %5 = arith.addf %in, %in_0 : f32
        linalg.yield %5 : f32
      } -> tensor<7x5xf32>
      flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [7, 5], strides = [1, 1] : tensor<7x5xf32> -> !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>
      return
    }
  }
}

// -----

// Test a pure elementwise operation with a broadcasted operand.

#map = affine_map<(d0, d1) -> (d0)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
flow.executable private @ex {
  // CHECK: flow.executable.export public @ex_elementwise_7x5_f32
  flow.executable.export public @ex
  builtin.module {
    func.func @ex(%arg0: !flow.dispatch.tensor<readonly:tensor<7x5xf32>>,
                  %arg1: !flow.dispatch.tensor<readonly:tensor<7xf32>>,
                  %arg2: !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>,
                  %arg3: !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>) {
      %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [7, 5], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<7x5xf32>> -> tensor<7x5xf32>
      %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [7], strides = [1] : !flow.dispatch.tensor<readonly:tensor<7xf32>> -> tensor<7xf32>
      %2 = tensor.empty() : tensor<7x5xf32>
      %3:2 = linalg.generic {
        indexing_maps = [#map, #map1, #map1, #map1],
        iterator_types = ["parallel", "parallel"]
      } ins(%1, %0 : tensor<7xf32>, tensor<7x5xf32>) outs(%2, %2 : tensor<7x5xf32>, tensor<7x5xf32>) {
      ^bb0(%in: f32, %in_0: f32, %out: f32, %out_0: f32):
        %4 = arith.mulf %in, %in_0 : f32
        %5 = arith.addf %in, %in_0 : f32
        linalg.yield %4, %5 : f32, f32
      } -> (tensor<7x5xf32>, tensor<7x5xf32>)
      flow.dispatch.tensor.store %3#0, %arg2, offsets = [0, 0], sizes = [7, 5], strides = [1, 1] : tensor<7x5xf32> -> !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>
      flow.dispatch.tensor.store %3#1, %arg3, offsets = [0, 0], sizes = [7, 5], strides = [1, 1] : tensor<7x5xf32> -> !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>
      return
    }
  }
}

// -----

// Test a multi-result elementwise operation where one result is transposed.

#map = affine_map<(d0, d1) -> (d0)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
flow.executable private @ex {
  // CHECK: flow.executable.export public @ex_elementwise_transpose_7x5_f32
  flow.executable.export public @ex
  builtin.module {
    func.func @ex(%arg0: !flow.dispatch.tensor<readonly:tensor<7x5xf32>>,
                  %arg1: !flow.dispatch.tensor<readonly:tensor<7xf32>>,
                  %arg2: !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>,
                  %arg3: !flow.dispatch.tensor<writeonly:tensor<5x7xf32>>) {
      %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [7, 5], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<7x5xf32>> -> tensor<7x5xf32>
      %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [7], strides = [1] : !flow.dispatch.tensor<readonly:tensor<7xf32>> -> tensor<7xf32>
      %2 = tensor.empty() : tensor<7x5xf32>
      %3 = tensor.empty() : tensor<5x7xf32>
      %4:2 = linalg.generic {
        indexing_maps = [#map, #map1, #map1, #map2],
        iterator_types = ["parallel", "parallel"]
      } ins(%1, %0 : tensor<7xf32>, tensor<7x5xf32>) outs(%2, %3 : tensor<7x5xf32>, tensor<5x7xf32>) {
      ^bb0(%in: f32, %in_0: f32, %out: f32, %out_0: f32):
        %5 = arith.addf %in, %in_0 : f32
        linalg.yield %5, %5 : f32, f32
      } -> (tensor<7x5xf32>, tensor<5x7xf32>)
      flow.dispatch.tensor.store %4#0, %arg2, offsets = [0, 0], sizes = [7, 5], strides = [1, 1] : tensor<7x5xf32> -> !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>
      flow.dispatch.tensor.store %4#1, %arg3, offsets = [0, 0], sizes = [5, 7], strides = [1, 1] : tensor<5x7xf32> -> !flow.dispatch.tensor<writeonly:tensor<5x7xf32>>
      return
    }
  }
}
