[Codegen] Migrate MapStoreOp to VectorizableOpInterface

hanhanW · hanhanW · commit 4c1bb138f2c6 · 2026-03-05T10:42:57.000-08:00
This revision also deletes the VectorizeIREELinalgExtOps pass, because its functionality is already covered by the GenericVectorization pass. No new tests are added because the change is NFC in terms of functionality; it just follows a different mechanism for vectorization. It is a step towards https://lists.lfaidata.foundation/g/iree-technical-discussion/message/15 Assisted-by: Claude Signed-off-by: hanhanW <hanhan0912@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir
@@ -1001,3 +1001,145 @@ func.func @arg_compare_with_index_base(%input: tensor<4x128xf32>,
 // CHECK:         %[[WRITE_VAL:.+]] = vector.transfer_write %[[RESULT_VAL]], %[[OUT_VAL]]
 // CHECK:         %[[WRITE_IDX:.+]] = vector.transfer_write %[[RESULT_IDX]], %[[OUT_IDX]]
 // CHECK:         return %[[WRITE_VAL]], %[[WRITE_IDX]]
+
+// -----
+
+func.func @map_store(
+    %input: tensor<4x16x64xf32>, %output: tensor<4x16x64xf32>
+) -> tensor<4x16x64xf32> {  // Static shapes, f32 elements.
+  %0 = iree_linalg_ext.map_store %input into %output {
+    ^bb0(%idx0: index, %idx1: index, %idx2: index):
+      %mask = arith.constant true
+      iree_linalg_ext.yield %idx0, %idx1, %idx2, %mask : index, index, index, i1
+  } : tensor<4x16x64xf32> into tensor<4x16x64xf32> -> tensor<4x16x64xf32>
+  return %0 : tensor<4x16x64xf32>
+}
+// CHECK-LABEL: @map_store
+//  CHECK-SAME:     %[[INPUT:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:     %[[OUTPUT:[a-zA-Z0-9_]+]]
+//       CHECK:   %[[READ:.+]] = vector.transfer_read %[[INPUT]]
+//       CHECK:   %[[MAP_SCATTER:.+]] = iree_linalg_ext.map_store
+//  CHECK-SAME:     %[[READ]] into %[[OUTPUT]]
+//       CHECK:     : vector<4x16x64xf32> into tensor<4x16x64xf32> -> tensor<4x16x64xf32>
+//       CHECK:   return %[[MAP_SCATTER]] : tensor<4x16x64xf32>
+
+// -----
+
+func.func @no_vectorize_map_store_dynamic(
+    %input: tensor<?xf32>, %output: tensor<64xf32>
+) -> tensor<64xf32> {  // Input has a dynamic extent, so it must be left as is.
+  %0 = iree_linalg_ext.map_store %input into %output {
+    ^bb0(%idx0: index):
+      %mask = arith.constant true
+      iree_linalg_ext.yield %idx0, %mask : index, i1
+  } : tensor<?xf32> into tensor<64xf32> -> tensor<64xf32>
+  return %0 : tensor<64xf32>
+}
+// CHECK-LABEL: @no_vectorize_map_store_dynamic
+//   CHECK-NOT:   vector
+
+// -----
+
+func.func @map_store_f4_multiple_of_byte(
+    %input: tensor<2x2xf4E2M1FN>, %output: tensor<2x2xf4E2M1FN>
+) -> tensor<2x2xf4E2M1FN> {  // Input inner dim: 2 x 4 bits = one whole byte.
+  %0 = iree_linalg_ext.map_store %input into %output {
+    ^bb0(%idx0: index, %idx1: index):
+      %mask = arith.constant true
+      iree_linalg_ext.yield %idx0, %idx1, %mask : index, index, i1
+  } : tensor<2x2xf4E2M1FN> into tensor<2x2xf4E2M1FN> -> tensor<2x2xf4E2M1FN>
+  return %0 : tensor<2x2xf4E2M1FN>
+}
+// CHECK-LABEL: @map_store_f4_multiple_of_byte
+//  CHECK-SAME:     %[[INPUT:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:     %[[OUTPUT:[a-zA-Z0-9_]+]]
+//       CHECK:   %[[READ:.+]] = vector.transfer_read %[[INPUT]]
+//       CHECK:   %[[MAP_SCATTER:.+]] = iree_linalg_ext.map_store
+//  CHECK-SAME:     %[[READ]] into %[[OUTPUT]]
+//       CHECK:     : vector<2x2xf4E2M1FN> into tensor<2x2xf4E2M1FN> -> tensor<2x2xf4E2M1FN>
+//       CHECK:   return %[[MAP_SCATTER]] : tensor<2x2xf4E2M1FN>
+
+// -----
+
+func.func @map_store_f4_not_multiple_of_byte(
+    %input: tensor<2x1xf4E2M1FN>, %output: tensor<2x2xf4E2M1FN>
+) -> tensor<2x2xf4E2M1FN> {  // Input inner dim: 1 x 4 bits, not a whole byte.
+  %0 = iree_linalg_ext.map_store %input into %output {
+    ^bb0(%idx0: index, %idx1: index):
+      %mask = arith.constant true
+      iree_linalg_ext.yield %idx0, %idx1, %mask : index, index, i1
+  } : tensor<2x1xf4E2M1FN> into tensor<2x2xf4E2M1FN> -> tensor<2x2xf4E2M1FN>
+  return %0 : tensor<2x2xf4E2M1FN>
+}
+// CHECK-LABEL: @map_store_f4_not_multiple_of_byte
+//   CHECK-NOT:   vector
+
+// -----
+
+func.func @map_store_f4_unit_stride(
+    %input: tensor<2x2xf4E2M1FN>, %output: tensor<2x4xf4E2M1FN>
+) -> tensor<2x4xf4E2M1FN> {
+  %0 = iree_linalg_ext.map_store %input into %output {
+    ^bb0(%idx0: index, %idx1: index):
+      %mask = arith.constant true
+      %1 = affine.apply affine_map<(d0) -> (d0 + 2)>(%idx1)  // Constant offset only; the inner index still advances by 1.
+      iree_linalg_ext.yield %idx0, %1, %mask : index, index, i1
+  } : tensor<2x2xf4E2M1FN> into tensor<2x4xf4E2M1FN> -> tensor<2x4xf4E2M1FN>
+  return %0 : tensor<2x4xf4E2M1FN>
+}
+// CHECK-LABEL: @map_store_f4_unit_stride
+//  CHECK-SAME:     %[[INPUT:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:     %[[OUTPUT:[a-zA-Z0-9_]+]]
+//       CHECK:   %[[READ:.+]] = vector.transfer_read %[[INPUT]]
+//       CHECK:   %[[MAP_SCATTER:.+]] = iree_linalg_ext.map_store
+//  CHECK-SAME:     %[[READ]] into %[[OUTPUT]]
+//       CHECK:     : vector<2x2xf4E2M1FN> into tensor<2x4xf4E2M1FN> -> tensor<2x4xf4E2M1FN>
+//       CHECK:   return %[[MAP_SCATTER]] : tensor<2x4xf4E2M1FN>
+
+// -----
+
+func.func @map_store_f4_not_unit_stride(
+    %input: tensor<2x2xf4E2M1FN>, %output: tensor<2x4xf4E2M1FN>
+) -> tensor<2x4xf4E2M1FN> {
+  %0 = iree_linalg_ext.map_store %input into %output {
+    ^bb0(%idx0: index, %idx1: index):
+      %mask = arith.constant true
+      %1 = affine.apply affine_map<(d0) -> (d0 * 2)>(%idx1)  // Inner output index advances by 2 per input step.
+      iree_linalg_ext.yield %idx0, %1, %mask : index, index, i1
+  } : tensor<2x2xf4E2M1FN> into tensor<2x4xf4E2M1FN> -> tensor<2x4xf4E2M1FN>
+  return %0 : tensor<2x4xf4E2M1FN>
+}
+// CHECK-LABEL: @map_store_f4_not_unit_stride
+//   CHECK-NOT:   vector
+
+// -----
+
+func.func @map_store_f4_not_index_applied_multiple_times(
+    %input: tensor<2x2xf4E2M1FN>, %output: tensor<2x4xf4E2M1FN>
+) -> tensor<2x4xf4E2M1FN> {
+  %0 = iree_linalg_ext.map_store %input into %output {
+    ^bb0(%idx0: index, %idx1: index):
+      %mask = arith.constant true
+      %1 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%idx1, %idx1)  // %idx1 feeds both operands, i.e. 2 * %idx1.
+      iree_linalg_ext.yield %idx0, %1, %mask : index, index, i1
+  } : tensor<2x2xf4E2M1FN> into tensor<2x4xf4E2M1FN> -> tensor<2x4xf4E2M1FN>
+  return %0 : tensor<2x4xf4E2M1FN>
+}
+// CHECK-LABEL: @map_store_f4_not_index_applied_multiple_times
+//   CHECK-NOT:   vector
+
+// -----
+
+func.func @map_store_f4_mask_depends_on_inner_index(
+    %input: tensor<2x2xf4E2M1FN>, %output: tensor<2x4xf4E2M1FN>
+) -> tensor<2x4xf4E2M1FN> {
+  %0 = iree_linalg_ext.map_store %input into %output {
+    ^bb0(%idx0: index, %idx1: index):
+      %c1 = arith.constant 1 : index
+      %mask = arith.cmpi uge, %idx1, %c1 : index  // Mask is computed from the innermost input index.
+      iree_linalg_ext.yield %idx0, %idx1, %mask : index, index, i1
+  } : tensor<2x2xf4E2M1FN> into tensor<2x4xf4E2M1FN> -> tensor<2x4xf4E2M1FN>
+  return %0 : tensor<2x4xf4E2M1FN>
+}
+// CHECK-LABEL: @map_store_f4_mask_depends_on_inner_index
+//   CHECK-NOT:   vector
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Interfaces/BUILD.bazel
@@ -233,6 +233,8 @@ iree_compiler_cc_library(
         ":VectorizableOpInterfaceGen",
         "//compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR:IREEVectorExtDialect",
         "//compiler/src/iree/compiler/Dialect/LinalgExt/IR",
+        "//compiler/src/iree/compiler/Utils",
+        "@llvm-project//mlir:Analysis",
         "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:TensorDialect",
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt
@@ -169,13 +169,15 @@ iree_cc_library(
     "VectorizableOpInterface.cpp"
   DEPS
     ::VectorizableOpInterfaceGen
+    MLIRAnalysis
     MLIRArithDialect
     MLIRIR
     MLIRTensorDialect
     MLIRUBDialect
     MLIRVectorDialect
     iree::compiler::Codegen::Dialect::VectorExt::IR::IREEVectorExtDialect
     iree::compiler::Dialect::LinalgExt::IR
+    iree::compiler::Utils
   PUBLIC
 )
 
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/VectorizableOpInterface.cpp b/compiler/src/iree/compiler/Codegen/Interfaces/VectorizableOpInterface.cpp
@@ -8,9 +8,14 @@
 
 #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
 #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.h"
+#include "iree/compiler/Codegen/Dialect/VectorExt/Transforms/Transforms.h"
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h"
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
+#include "iree/compiler/Utils/Indexing.h"
+#include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -385,16 +390,85 @@ struct ToLayoutOpVectorizationModel
   }
 };
 
+struct MapStoreOpVectorizationModel
+    : public VectorizableOpInterface::ExternalModel<
+          MapStoreOpVectorizationModel, IREE::LinalgExt::MapStoreOp> {
+
+  bool isVectorizable(Operation *op, ArrayRef<int64_t> vectorSizes,
+                      ArrayRef<bool> scalableDims,
+                      DictionaryAttr options) const { // Sizes/options unused.
+    auto mapStoreOp = cast<IREE::LinalgExt::MapStoreOp>(op);
+    if (mapStoreOp.isVectorized()) { // Already vectorized; skip.
+      return false;
+    }
+    ShapedType inputType = mapStoreOp.getInputType();
+    if (!inputType.hasStaticShape()) { // Dynamic input shapes are rejected.
+      return false;
+    } // NOTE(review): the indexing below assumes rank >= 1 -- confirm.
+    const int64_t innerSize = inputType.getShape()[inputType.getRank() - 1];
+    const int64_t bitWidth = inputType.getElementTypeBitWidth();
+    if ((innerSize * bitWidth % 8) != 0) { // Inner row must be whole bytes.
+      return false;
+    }
+    // For sub-byte element types, require the innermost dimension to be a
+    // contiguous copy covering whole bytes: the mask must not vary along the
+    // inner index, and the inner output index must follow it with unit stride.
+    if (bitWidth < 8) {
+      // First check the mask is not computed from the innermost input index.
+      Value innermostInputIdx =
+          mapStoreOp.getInputIndex(mapStoreOp.getInputRank() - 1);
+      SetVector<Operation *> slice;
+      getForwardSlice(innermostInputIdx, &slice);
+      Operation *maskOp = mapStoreOp.getMask().getDefiningOp();
+      if (maskOp && slice.contains(maskOp)) {
+        return false;
+      }
+      // Next check that the innermost yielded index has unit stride in the
+      // inner input index (`idx + 2` passes; `idx * 2` and `idx + idx` fail).
+      Value innermostOutputIdx =
+          mapStoreOp.getOutputIndex(mapStoreOp.getOutputRank() - 1);
+      if (!isUnitFunctionOf(innermostOutputIdx, innermostInputIdx)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  FailureOr<SmallVector<Value>> vectorize(Operation *op, RewriterBase &rewriter,
+                                          ArrayRef<int64_t> vectorSizes,
+                                          ArrayRef<bool> scalableDims,
+                                          DictionaryAttr options) const {
+    auto mapStoreOp = cast<IREE::LinalgExt::MapStoreOp>(op);
+    Location loc = mapStoreOp.getLoc();
+    rewriter.setInsertionPoint(mapStoreOp); // New ops go right before the op.
+    ShapedType inputType = mapStoreOp.getInputType();
+    Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
+    SmallVector<Value> zeros(inputType.getRank(), zero); // Read from origin.
+    auto inputVectorType =
+        VectorType::get(inputType.getShape(), inputType.getElementType());
+    Value inputVector = vector::TransferReadOp::create(
+        rewriter, loc, inputVectorType, mapStoreOp.getInput(),
+        /*indices=*/zeros,
+        /*padding=*/std::nullopt); // No explicit padding value is supplied.
+    auto vectorizedMapStoreOp =
+        clone(rewriter, mapStoreOp, mapStoreOp.getResultTypes(),
+              {inputVector, mapStoreOp.getOutput()}); // Swap in vector input.
+    return SmallVector<Value>(vectorizedMapStoreOp->getResults());
+  }
+};
+
 } // namespace
 
 void registerVectorizableOpInterfaceExternalModels(DialectRegistry &registry) {
-  registry.addExtension(
-      +[](MLIRContext *ctx, IREE::LinalgExt::IREELinalgExtDialect *dialect) {
-        IREE::LinalgExt::GatherOp::attachInterface<GatherOpVectorizationModel>(
-            *ctx);
-        IREE::LinalgExt::ArgCompareOp::attachInterface<
-            ArgCompareOpVectorizationModel>(*ctx);
-      });
+  registry.addExtension(+[](MLIRContext *ctx,
+                            IREE::LinalgExt::IREELinalgExtDialect *dialect) {
+    IREE::LinalgExt::GatherOp::attachInterface<GatherOpVectorizationModel>(
+        *ctx);
+    IREE::LinalgExt::ArgCompareOp::attachInterface<
+        ArgCompareOpVectorizationModel>(*ctx);
+    IREE::LinalgExt::MapStoreOp::attachInterface<MapStoreOpVectorizationModel>(
+        *ctx);
+  });
   registry.addExtension(+[](MLIRContext *ctx,
                             IREE::VectorExt::IREEVectorExtDialect *dialect) {
     IREE::VectorExt::ToLayoutOp::attachInterface<ToLayoutOpVectorizationModel>(
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -573,8 +573,6 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
   funcPassManager.addPass(createGPUCombineValueSemanticBarriersPass());
 
   // Step 6. Lower special ops and vectorize.
-  funcPassManager.addPass(
-      IREE::LinalgExt::createVectorizeIREELinalgExtOpsPass());
   funcPassManager.addPass(IREE::GPU::createVectorizeIREEGPUOpsPass());
   addGPUVectorizationPasses(funcPassManager, /*vectorizeCopies=*/false,
                             /*enableMasking=*/true,
@@ -836,8 +834,6 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
   funcPassManager.addPass(tensor::createFoldTensorSubsetOpsPass());
 
   // Linalg -> Vector
-  funcPassManager.addPass(
-      IREE::LinalgExt::createVectorizeIREELinalgExtOpsPass());
   addGPUVectorizationPasses(funcPassManager, /*vectorizeCopies=*/true,
                             /*enableMasking=*/true);
 
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/BUILD.bazel b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/BUILD.bazel
@@ -48,7 +48,6 @@ iree_compiler_cc_library(
         "TestReshapeFusion.cpp",
         "TileAttention.cpp",
         "TransposeFusion.cpp",
-        "VectorizeIREELinalgExtOps.cpp",
     ],
     hdrs = [
         "LoopMappingUtils.h",
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/CMakeLists.txt
@@ -46,7 +46,6 @@ iree_cc_library(
     "TestReshapeFusion.cpp"
     "TileAttention.cpp"
     "TransposeFusion.cpp"
-    "VectorizeIREELinalgExtOps.cpp"
   DEPS
     ::PassesIncGen
     LLVMSupport
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/Passes.td b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/Passes.td
@@ -146,13 +146,4 @@ def TestReshapeFusionPass :
   let summary = "Test reshape fusion patterns";
 }
 
-def VectorizeIREELinalgExtOpsPass :
-    InterfacePass<"iree-linalg-ext-vectorize-ops", "mlir::FunctionOpInterface"> {
-  let summary = "Convert linalg_ext ops into their vector form.";
-  let dependentDialects = [
-    "::mlir::vector::VectorDialect",
-    "::mlir::arith::ArithDialect"
-  ];
-}
-
 #endif  // IREE_DIALECT_LINALGEXT_PASSES
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/VectorizeIREELinalgExtOps.cpp b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/VectorizeIREELinalgExtOps.cpp
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/BUILD.bazel
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/CMakeLists.txt
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/vectorize_iree_linalg_ext_ops.mlir b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/vectorize_iree_linalg_ext_ops.mlir