Support i1 datatype #18713

Draft · wants to merge 4 commits into base: main
197 changes: 197 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/EmulateNarrowType.cpp
@@ -95,6 +95,202 @@ static void populateIreeNarrowTypeEmulationPatterns(
patterns.getContext());
}

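// The patterns below pad vector transfer_read/transfer_write and vector
// load/store ops whose total size in bits (element count times element bit
// width) is not byte aligned, so that sub-byte (e.g. i1) vectors are always
// accessed in whole bytes: reads are widened and then narrowed back with
// extract_strided_slice, while writes are widened with insert_strided_slice
// and, for vector.store, guarded by a mask so only the original elements are
// written.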
struct PadSubtypeTransferWritePattern
: public OpRewritePattern<vector::TransferWriteOp> {
using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern;

LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp,
PatternRewriter &rewriter) const final {
auto source = writeOp.getSource();
auto target = writeOp.getVector();
auto targetType = cast<VectorType>(target.getType());
auto sourceType = cast<ShapedType>(source.getType());
auto elemType = targetType.getElementType();
// Check that the total vector size in bits is byte aligned.
unsigned elementBits = targetType.getElementType().getIntOrFloatBitWidth();
auto numElements = targetType.getNumElements();
if ((numElements * elementBits) % 8 != 0) {
SmallVector<int64_t> strides;
SmallVector<int64_t> offsets;
for (unsigned i = 0; i < sourceType.getRank(); ++i) {
strides.push_back(1);
offsets.push_back(0);
}

// TODO: Read-modify-write the destination instead of zero padding; otherwise
// we overwrite destination elements beyond the original vector (e.g. writing
// a vector<6xi1> as a padded vector<8xi1> clobbers two extra elements).

SmallVector<int64_t> newShape(targetType.getShape());
newShape.back() += (8 - (numElements * elementBits) % 8) / elementBits;
auto newTargetType = VectorType::get(newShape, elemType);

// Create a zero vector of the padded size.
int64_t newNumElements = newTargetType.getNumElements();
SmallVector<bool> zeroValues(newNumElements, false);
auto zeroVector = rewriter.create<arith::ConstantOp>(
writeOp.getLoc(),
DenseIntElementsAttr::get(newTargetType, zeroValues));

auto extendedOp = rewriter.create<vector::InsertStridedSliceOp>(
writeOp->getLoc(), target, zeroVector, offsets, strides);

rewriter.modifyOpInPlace(
writeOp, [&] { writeOp.getVectorMutable().assign(extendedOp); });
return success();
}
return failure();
}
};


struct PadSubtypeTransferReadPattern
: public OpRewritePattern<vector::TransferReadOp> {
using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern;

LogicalResult matchAndRewrite(vector::TransferReadOp readOp,
PatternRewriter &rewriter) const final {
auto resultType = cast<VectorType>(readOp.getResult().getType());
// Check that the total vector size in bits is byte aligned.
unsigned elementBits = resultType.getElementType().getIntOrFloatBitWidth();
auto numElements = resultType.getNumElements();
if ((numElements * elementBits) % 8 != 0) {
// Pad the shape so that the total size is byte aligned.
SmallVector<int64_t> newShape(resultType.getShape());
newShape.back() += (8 - (numElements * elementBits) % 8) / elementBits;
// Create a new vector type with the padded shape
auto newType = VectorType::get(newShape, resultType.getElementType());

// Create a zero padding value for the widened transfer read.
auto paddingValue = rewriter.create<arith::ConstantOp>(
readOp.getLoc(), resultType.getElementType(),
rewriter.getZeroAttr(resultType.getElementType()));

// Extract the original vector from the widened read result.
SmallVector<int64_t> offsets, strides;
for (unsigned i = 0; i < resultType.getRank(); ++i) {
offsets.push_back(0);
strides.push_back(1);
}

auto newTransferReadOp = rewriter.create<vector::TransferReadOp>(
readOp.getLoc(), newType, readOp.getSource(), readOp.getIndices(),
paddingValue);

rewriter.replaceOpWithNewOp<vector::ExtractStridedSliceOp>(
readOp, newTransferReadOp, offsets, resultType.getShape(), strides);
return success();
}
return failure();
}
};

struct PadSubtypeVectorLoadPattern
: public OpRewritePattern<vector::LoadOp> {
using OpRewritePattern<vector::LoadOp>::OpRewritePattern;
LogicalResult matchAndRewrite(vector::LoadOp loadOp,
PatternRewriter &rewriter) const final {
auto result = loadOp.getResult();
auto resultType = mlir::cast<VectorType>(result.getType());
// Check that the total vector size in bits is byte aligned.
unsigned elementBits = resultType.getElementType().getIntOrFloatBitWidth();
auto numElements = resultType.getNumElements();
if ((numElements * elementBits) % 8 != 0) {
SmallVector<int64_t> newShape(resultType.getShape());
newShape.back() += (8 - (numElements * elementBits) % 8) / elementBits;
auto newTargetType = VectorType::get(newShape, resultType.getElementType());

// Create a new vector load with the padded type.
auto newVectorLoad = rewriter.create<vector::LoadOp>(
loadOp.getLoc(), newTargetType, loadOp.getBase(), loadOp.getIndices());

// Extract the original elements with a strided slice.
SmallVector<int64_t> offsets, strides;
for (unsigned i = 0; i < resultType.getRank(); ++i) {
offsets.push_back(0);
strides.push_back(1);
}

rewriter.replaceOpWithNewOp<vector::ExtractStridedSliceOp>(
loadOp, newVectorLoad, offsets, resultType.getShape(), strides);
return success();
}
return failure();
}
};

struct PadSubtypeVectorStorePattern
: public OpRewritePattern<vector::StoreOp> {
using OpRewritePattern<vector::StoreOp>::OpRewritePattern;
LogicalResult matchAndRewrite(vector::StoreOp storeOp,
PatternRewriter &rewriter) const final {
auto storeValue = storeOp.getValueToStore();
auto target = storeOp.getBase();

auto valueType = mlir::cast<ShapedType>(storeValue.getType());
auto targetType = mlir::cast<ShapedType>(target.getType());
// Check that the total vector size in bits is byte aligned.
auto elemType = valueType.getElementType();
unsigned elementBits = valueType.getElementType().getIntOrFloatBitWidth();
auto numElements = valueType.getNumElements();

if ((numElements * elementBits) % 8 != 0) {
SmallVector<int64_t> newShape(valueType.getShape());
newShape.back() += (8 - (numElements * elementBits) % 8) / elementBits;
auto newValueType = VectorType::get(newShape, elemType);

SmallVector<int64_t> strides;
SmallVector<int64_t> offsets;
for (unsigned i = 0; i < targetType.getRank(); ++i) {
strides.push_back(1);
offsets.push_back(0);
}

// Create a zero vector of the padded size.
int64_t newNumElements = newValueType.getNumElements();
SmallVector<bool> zeroValues(newNumElements, false);
auto zeroVector = rewriter.create<arith::ConstantOp>(
storeOp.getLoc(), DenseIntElementsAttr::get(newValueType, zeroValues));

auto extendedOp = rewriter.create<vector::InsertStridedSliceOp>(
storeOp->getLoc(), storeValue, zeroVector, offsets, strides);

// Create a mask so that only the original elements are stored.
SmallVector<Value> maskShape;
for (auto dim : valueType.getShape()) {
maskShape.push_back(
rewriter.create<arith::ConstantIndexOp>(storeOp.getLoc(), dim));
}
auto mask = rewriter.create<vector::CreateMaskOp>(
storeOp.getLoc(), newValueType, maskShape);

rewriter.replaceOpWithNewOp<vector::MaskedStoreOp>(
storeOp, target, storeOp.getIndices(), mask, extendedOp);
return success();
}
return failure();
}
};

static void populateSubbyteTypeHandlingPatterns(RewritePatternSet &patterns) {
patterns.add<PadSubtypeTransferReadPattern, PadSubtypeTransferWritePattern,
PadSubtypeVectorLoadPattern, PadSubtypeVectorStorePattern>(
patterns.getContext());
}

//===----------------------------------------------------------------------===//
// Pass Definition
//===----------------------------------------------------------------------===//
@@ -133,6 +329,7 @@ struct EmulateNarrowTypePass final
affine::AffineDialect, IREE::HAL::HALDialect>(opLegalCallback);

RewritePatternSet patterns(ctx);
populateSubbyteTypeHandlingPatterns(patterns);
arith::populateArithNarrowTypeEmulationPatterns(typeConverter, patterns);
memref::populateMemRefNarrowTypeEmulationPatterns(typeConverter, patterns);
populateIREEResolveExtractStridedMetadataPatterns(ctx, patterns);
@@ -0,0 +1,31 @@
// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-generic-vectorization{vectorize-padding=true}))" --split-input-file %s | FileCheck %s

func.func @test_subbyte_6_i1() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<8xi1>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<8xi1>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<8xi1>>
%3 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [6], strides = [1] : !flow.dispatch.tensor<writeonly:tensor<8xi1>> -> tensor<6xi1>
%4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [6], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xi1>> -> tensor<6xi1>
%5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [6], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xi1>> -> tensor<6xi1>
%6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4, %5 : tensor<6xi1>, tensor<6xi1>) outs(%3 : tensor<6xi1>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[6], [8], [0], [0]]>} {
^bb0(%in: i1, %in_0: i1, %out: i1):
%7 = arith.addi %in, %in_0 : i1
linalg.yield %7 : i1
} -> tensor<6xi1>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [6], strides = [1] : tensor<6xi1> -> !flow.dispatch.tensor<writeonly:tensor<8xi1>>
return
}

// CHECK-LABEL: @test_subbyte_6_i1

// CHECK: %[[TR1:.+]] = vector.transfer_read %{{.+}}[%c0], %false : tensor<6xi1>, vector<8xi1>
// CHECK: %[[ESS1:.+]] = vector.extract_strided_slice %[[TR1]] {offsets = [0], sizes = [6], strides = [1]} : vector<8xi1> to vector<6xi1>

// CHECK: %[[TR2:.+]] = vector.transfer_read %{{.+}}[%c0], %false : tensor<6xi1>, vector<8xi1>
// CHECK: %[[ESS2:.+]] = vector.extract_strided_slice %[[TR2]] {offsets = [0], sizes = [6], strides = [1]} : vector<8xi1> to vector<6xi1>

// CHECK: %[[ISS:.+]] = vector.insert_strided_slice
// CHECK-SAME: {offsets = [0], strides = [1]} : vector<6xi1> into vector<8xi1>
// CHECK: vector.transfer_write %[[ISS]], %{{.+}}[%c0] {in_bounds = [true]} : vector<8xi1>, tensor<6xi1>

14 changes: 7 additions & 7 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp
@@ -537,13 +537,13 @@ HALDispatchABI::buildScopeAttr(mlir::ModuleOp moduleOp,
if (!llvmFuncOp.isExternal()) {
id = DistinctAttr::create(UnitAttr::get(context));
}
return LLVM::DISubprogramAttr::get(context, id, compileUnitAttr, fileAttr,
funcNameAttr, funcNameAttr, fileAttr,
/*line=*/1,
/*scopeline=*/1,
LLVM::DISubprogramFlags::Definition |
LLVM::DISubprogramFlags::Optimized,
subroutineTypeAttr, /*retainedNodes =*/{});
return LLVM::DISubprogramAttr::get(
context, id, compileUnitAttr, fileAttr, funcNameAttr, funcNameAttr,
fileAttr,
/*line=*/1,
/*scopeline=*/1,
LLVM::DISubprogramFlags::Definition | LLVM::DISubprogramFlags::Optimized,
subroutineTypeAttr, /*retainedNodes =*/{}, /*annotations =*/{});
}

// Returns the most local DISubprogramAttr starting from |forOp|.
@@ -2937,6 +2937,15 @@ setLoweringConfigForComputeOps(mlir::FunctionOpInterface entryPointFn,
}
}

// Make sure the innermost tile size times the element bit width is a multiple
// of a byte, so sub-byte (e.g. i1) tiles stay byte aligned; for example, a
// tile of 6 i1 elements is rounded up to 8.
auto elementTypeSize = cast<ShapedType>(rootOperation->getResultTypes().front())
.getElementType()
.getIntOrFloatBitWidth();
auto innermostTileSize = commonVecTileSizes.back();
commonVecTileSizes.back() =
llvm::alignTo(innermostTileSize * elementTypeSize, 8) / elementTypeSize;

// Set the lowering configs with new tile sizes.
for (auto op : computeOps) {
int numLoops = cast<TilingInterface>(op).getLoopIteratorTypes().size();
@@ -4,7 +4,7 @@
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>
]>
#executable_target_system_elf_riscv_64_ = #hal.executable.target<"llvm-cpu", "system-elf-riscv_64", {cpu = "generic-rv64", cpu_features = "+m,+a,+f,+d,+v", data_layout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128", native_vector_size = 64 : index, target_triple = "riscv64"}>
#executable_target_system_elf_riscv_64_ = #hal.executable.target<"llvm-cpu", "system-elf-riscv_64", {cpu = "generic-rv64", cpu_features = "+m,+a,+f,+d,+v", data_layout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128", native_vector_size = 4 : index, target_triple = "riscv64"}>
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map1 = affine_map<(d0, d1) -> (d0 + d1 * 257)>
func.func @main_dispatch_77_generic_1x257x257x21() attributes {hal.executable.target = #executable_target_system_elf_riscv_64_} {
@@ -200,7 +200,8 @@ static Value applyMask(OpBuilder &builder, Location loc, AffineMap qkMap,
Value maskVal = args[0];

// TODO: Replace bool mask condition once treated as i1 (instead of i8)
if (maskVal.getType().isInteger()) {
auto maskValType = maskVal.getType();
if (maskValType.isInteger() && !maskValType.isInteger(1)) {
maskVal =
b.create<arith::TruncIOp>(loc, builder.getI1Type(), maskVal);
maskVal = b.create<arith::SelectOp>(loc, maskVal, zero, negInf);
@@ -937,6 +937,21 @@ static bool insertBindingOp(BlockArgument arg,
}
}

// Align the tensor type to a multiple of 8 bits.
auto rankedTensorType = tensorType.asRankedTensorType();
auto elementSize = rankedTensorType.getElementType().getIntOrFloatBitWidth();
auto typeSize = tensorType.getNumElements() * elementSize;

if (typeSize % 8 != 0) {
SmallVector<int64_t> newShape(rankedTensorType.getShape());
newShape.back() = llvm::alignTo(newShape.back(), 8 / elementSize);

auto newTensorType = IREE::Flow::DispatchTensorType::get(
tensorType.getAccess(), newShape,
rankedTensorType.getElementType(), rankedTensorType.getEncoding());
tensorType = newTensorType;
}
Comment on lines +940 to +953 (Contributor):
We need some input from @benvanik about how we land this properly. My understanding is that we want to align the i1 shape with bytes. E.g., 6xi1 becomes 8xi1 on both the stream allocation and dispatch sides. The current approach replaces the flow.dispatch.tensor type with 8xi1 while leaving the 6xi1 type in the stream.tensor.sizeof op. See the snippet below for more details. This is off to me because:

  1. I think it does not work with dynamic shapes, because the arguments of DispatchTieShapeOp are not taken into account.
  2. It leaks the stream.tensor.sizeof lowering logic into the FlowToStream conversion. Is that okay?

Ben knows more details, please correct me if I'm wrong. I think we can still keep all the logic in the FlowToStream conversion. We either need a type converter or need to introduce a legalizePackedType method in ElementPackingUtils.[h|cpp] that shares the logic between the buildResultSizeOf method and the ConvertExecutableOp patterns. legalizePackedType would take the tensor type and dynamicDims and do something similar to calculateStorageElementCountInBytes (see the sketch after the IR dumps below).

The other approach is updating the logic in EncodeTensors.cpp. The pass has logic to encode host tensors and device tensors, and we probably just need to update the alignTensorType logic. (I don't know, please do some study.)

@benvanik do you have any suggestions about where the change should happen?

// -----// IR Dump Before ConvertToStreamPass (iree-stream-conversion) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0) -> (d0)>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  flow.executable private @add_tensors_dispatch_0 {
    flow.executable.export public @add_tensors_dispatch_0_elementwise_6_i1 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @add_tensors_dispatch_0_elementwise_6_i1(%arg0: !flow.dispatch.tensor<readonly:tensor<6xi1>>, %arg1: !flow.dispatch.tensor<readonly:tensor<6xi1>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<6xi1>>) {
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [6], strides = [1] : !flow.dispatch.tensor<readonly:tensor<6xi1>> -> tensor<6xi1>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [6], strides = [1] : !flow.dispatch.tensor<readonly:tensor<6xi1>> -> tensor<6xi1>
        %2 = tensor.empty() : tensor<6xi1>
        %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<6xi1>, tensor<6xi1>) outs(%2 : tensor<6xi1>) {
        ^bb0(%in: i1, %in_0: i1, %out: i1):
          %4 = arith.addi %in, %in_0 : i1
          linalg.yield %4 : i1
        } -> tensor<6xi1>
        flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [6], strides = [1] : tensor<6xi1> -> !flow.dispatch.tensor<writeonly:tensor<6xi1>>
        return
      }
    }
  }
  util.func public @add_tensors(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @add_tensors(%input0: tensor<2x3xi1>, %input1: tensor<2x3xi1>) -> (%output0: tensor<2x3xi1>)"}} {
    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<2x3xi1>
    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<2x3xi1>
    %2 = flow.tensor.reshape %0 : tensor<2x3xi1> -> tensor<6xi1>
    %3 = flow.tensor.reshape %1 : tensor<2x3xi1> -> tensor<6xi1>
    %4 = flow.dispatch @add_tensors_dispatch_0::@add_tensors_dispatch_0_elementwise_6_i1(%2, %3) : (tensor<6xi1>, tensor<6xi1>) -> tensor<6xi1>
    %5 = flow.tensor.reshape %4 : tensor<6xi1> -> tensor<2x3xi1>
    %6 = hal.tensor.export %5 "output0" : tensor<2x3xi1> -> !hal.buffer_view
    util.return %6 : !hal.buffer_view
  }
}


// -----// IR Dump Before VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0) -> (d0)>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  stream.executable private @add_tensors_dispatch_0 {
    stream.executable.export public @add_tensors_dispatch_0_elementwise_6_i1 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @add_tensors_dispatch_0_elementwise_6_i1(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<8xi1>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<8xi1>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<8xi1>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [6], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xi1>> -> tensor<6xi1>
        %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [6], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xi1>> -> tensor<6xi1>
        %5 = tensor.empty() : tensor<6xi1>
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<6xi1>, tensor<6xi1>) outs(%5 : tensor<6xi1>) {
        ^bb0(%in: i1, %in_0: i1, %out: i1):
          %7 = arith.addi %in, %in_0 : i1
          linalg.yield %7 : i1
        } -> tensor<6xi1>
        flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [6], strides = [1] : tensor<6xi1> -> !flow.dispatch.tensor<writeonly:tensor<8xi1>>
        return
      }
    }
  }
  util.func public @add_tensors(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @add_tensors(%input0: tensor<2x3xi1>, %input1: tensor<2x3xi1>) -> (%output0: tensor<2x3xi1>)"}} {
    %element_type_i1 = hal.element_type<i1> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c2, %c3]) type(%element_type_i1) encoding(%dense_row_major)
    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x3xi1> : index
    %1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<2x3xi1> in !stream.resource<external>{%0}
    %2 = stream.async.transfer %1 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0}
    %element_type_i1_0 = hal.element_type<i1> : i32
    %dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
    %c2_2 = arith.constant 2 : index
    %c3_3 = arith.constant 3 : index
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c2_2, %c3_3]) type(%element_type_i1_0) encoding(%dense_row_major_1)
    %3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x3xi1> : index
    %4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<2x3xi1> in !stream.resource<external>{%3}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%3} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%3}
    %6 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<6xi1> : index
    %7 = stream.tensor.clone on(#hal.device.affinity<@__device_0>) %2 : tensor<2x3xi1> in !stream.resource<*>{%0} -> tensor<6xi1> in !stream.resource<*>{%6}
    %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<6xi1> : index
    %9 = stream.tensor.clone on(#hal.device.affinity<@__device_0>) %5 : tensor<2x3xi1> in !stream.resource<*>{%3} -> tensor<6xi1> in !stream.resource<*>{%8}
    %c0 = arith.constant 0 : index
    %10 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<6xi1> : index
    %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @add_tensors_dispatch_0::@add_tensors_dispatch_0_elementwise_6_i1(%7[%c0 to %6 for %6], %9[%c0 to %8 for %8]) : (!stream.resource<*>{%6}, !stream.resource<*>{%8}) -> !stream.resource<*>{%10}
    %12 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2x3xi1> : index
    %13 = stream.tensor.clone on(#hal.device.affinity<@__device_0>) %11 : tensor<6xi1> in !stream.resource<*>{%10} -> tensor<2x3xi1> in !stream.resource<*>{%12}
    %14 = stream.async.transfer %13 : !stream.resource<*>{%12} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%12}
    %15 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %14 : tensor<2x3xi1> in !stream.resource<external>{%12} -> !hal.buffer_view
    util.return %15 : !hal.buffer_view
  }
}
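
A rough sketch of the shared helper suggested above (the name, signature, and placement are assumptions for illustration, not part of this PR); it only handles static innermost dimensions and would defer dynamic shapes to the dynamicDims operands, similar to calculateStorageElementCountInBytes:

// Hypothetical sketch: round the innermost static dimension of a sub-byte
// tensor type up to a whole byte so allocation and dispatch sides agree.
static RankedTensorType legalizePackedType(RankedTensorType type) {
  Type elementType = type.getElementType();
  if (!elementType.isIntOrFloat())
    return type;
  unsigned bitWidth = elementType.getIntOrFloatBitWidth();
  if (!needToPackSubByteElementBitWidth(bitWidth))
    return type;
  SmallVector<int64_t> newShape(type.getShape());
  if (newShape.empty() || ShapedType::isDynamic(newShape.back()))
    return type;
  newShape.back() = llvm::alignTo(newShape.back(), 8 / bitWidth);
  return RankedTensorType::get(newShape, elementType, type.getEncoding());
}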


auto subspanOp = builder.create<IREE::Stream::BindingSubspanOp>(
arg.getLoc(), tensorType, arg, zero, dynamicDims);
arg.replaceAllUsesExcept(subspanOp.getResult(), subspanOp);
7 changes: 6 additions & 1 deletion compiler/src/iree/compiler/Utils/ElementPackingUtils.cpp
@@ -20,7 +20,7 @@ bool needToPackSubByteElementBitWidth(unsigned bitWidth) {
// trickiness and weirdness of packing and cross-byte access.
// Also disallow boolean values for now--they may require separate interface
// choices.
return bitWidth < 8 && llvm::isPowerOf2_32(bitWidth) && bitWidth != 1;
return bitWidth < 8 && llvm::isPowerOf2_32(bitWidth);
}

bool needToPackSubByteElements(RankedTensorType shapedType) {
@@ -99,6 +99,11 @@ Value calculateStorageElementCountInBytes(Location loc,
}
}

// make sure the last dimension is byte aligned.
Comment (Collaborator):
style: proper punctuation (here and elsewhere) in comments: https://google.github.io/styleguide/cppguide.html#Punctuation,_Spelling_and_Grammar

if (needToPackSubByteElementBitWidth(elementBits)) {
paddedShape.back() = llvm::alignTo(paddedShape.back(), 8 / elementBits);
}

for (unsigned i = 0; i < shapedType.getRank(); ++i) {
if (!shapedType.isDynamicDim(i))
staticCount *= paddedShape[i];
Expand Down
2 changes: 1 addition & 1 deletion third_party/llvm-project
Submodule llvm-project updated 178 files