diff --git a/include/aie/Dialect/AIE/IR/AIEDialect.h b/include/aie/Dialect/AIE/IR/AIEDialect.h
index 3634165838b..36d7df97541 100644
--- a/include/aie/Dialect/AIE/IR/AIEDialect.h
+++ b/include/aie/Dialect/AIE/IR/AIEDialect.h
@@ -228,6 +228,18 @@ void collectBuffers(
 // linearized by the compiler.
 bool isContiguousBDTransfer(llvm::ArrayRef<BDDimLayoutAttr> dims);
 
+// Verify that a BD's per-dimension sizes and strides (innermost-first, equal
+// length) are realizable at the given address generation granularity (bits);
+// on the first violation an op error is emitted on `forOp`.
+// Checks: sizes > 0; sizes[0] * elemWidth is a granularity multiple; strides
+// > 0 where size > 1 (last may be 0); each stride except an innermost 1 is a
+// granularity multiple; elemWidth > granularity requires strides[0] == 1.
+mlir::LogicalResult verifyBDSizesStrides(mlir::Operation *forOp,
+                                         unsigned elemWidthBits,
+                                         uint32_t addressGranularityBits,
+                                         llvm::ArrayRef<int64_t> inputSizes,
+                                         llvm::ArrayRef<int64_t> inputStrides);
+
 } // namespace xilinx::AIE
 
 namespace llvm {
diff --git a/lib/Dialect/AIE/IR/AIEDialect.cpp b/lib/Dialect/AIE/IR/AIEDialect.cpp
index 98e34be71b7..7f7a9d7838a 100644
--- a/lib/Dialect/AIE/IR/AIEDialect.cpp
+++ b/lib/Dialect/AIE/IR/AIEDialect.cpp
@@ -2150,6 +2150,74 @@ void DMABDOp::print(::mlir::OpAsmPrinter &printer) {
 // A BDDimLayoutAttr array (outermost-first) describes a contiguous row-major
 // scan when the innermost stride is 1 and each outer stride equals the product
 // of all inner sizes.  Used by both DMABDOp verification and canonicalization.
+mlir::LogicalResult xilinx::AIE::verifyBDSizesStrides(
+    mlir::Operation *forOp, unsigned elemWidthBits,
+    uint32_t addressGranularityBits, llvm::ArrayRef<int64_t> inputSizes,
+    llvm::ArrayRef<int64_t> inputStrides) {
+  assert(inputSizes.size() == inputStrides.size());
+  const int n = static_cast<int>(inputSizes.size());
+  if (n == 0)
+    return success();
+
+  for (int i = 0; i < n; ++i) {
+    if (inputSizes[i] <= 0)
+      return forOp->emitOpError("Size ") << i << " must be a positive integer.";
+  }
+
+  // Innermost contiguous run must be a multiple of address granularity --
+  // hardware moves whole words; a sub-word innermost run is unrealizable.
+  if (inputSizes[0] * elemWidthBits % addressGranularityBits != 0) {
+    std::stringstream msg;
+    msg << "Transfer sizes must be multiples of "
+        << (addressGranularityBits / 8) << " bytes. " << inputSizes[0]
+        << " elements at " << (elemWidthBits / 8) << " bytes each equal "
+        << (inputSizes[0] * elemWidthBits / 8)
+        << " bytes, which is not divisible by " << (addressGranularityBits / 8)
+        << ". ";
+    return forOp->emitOpError(msg.str());
+  }
+
+  // Non-repeat dim strides must be positive when the corresponding size > 1
+  // (the repeat dim, if present as the outermost, may have stride 0).
+  const int repeatDim = n - 1;
+  for (int i = 0; i < n; ++i) {
+    if (inputSizes[i] > 1 && inputStrides[i] < 1) {
+      if (i == repeatDim && inputStrides[i] == 0)
+        continue;
+      return forOp->emitOpError("Stride ")
+             << i << " must be a positive integer.";
+    }
+  }
+
+  // Stride byte-alignment: innermost stride==1 is always allowed (sub-word
+  // packed contiguous run, paired with the granularity check above); any other
+  // stride must be a granularity multiple in bytes.
+  for (int i = 0; i < n; ++i) {
+    if (i == 0 && inputStrides[i] == 1)
+      continue;
+    if (inputStrides[i] * elemWidthBits % addressGranularityBits != 0) {
+      std::stringstream msg;
+      msg << "Stride " << i << " is " << inputStrides[i] << " elements * "
+          << (elemWidthBits / 8)
+          << " bytes = " << (inputStrides[i] * elemWidthBits / 8)
+          << " bytes, which is not divisible by "
+          << (addressGranularityBits / 8) << ". ";
+      return forOp->emitOpError(msg.str());
+    }
+  }
+
+  // For element widths larger than the granularity (e.g. bfp blocks, i64),
+  // the hardware cannot encode a non-1 innermost stride; getHardwareStrides-
+  // Wraps would silently drop the stride. Force innermost stride == 1.
+  if (elemWidthBits > addressGranularityBits && inputStrides[0] != 1)
+    return forOp->emitOpError(
+               "For element widths larger than the address granularity (")
+           << (addressGranularityBits / 8)
+           << " bytes), innermost dim stride must be 1.";
+
+  return success();
+}
+
 bool xilinx::AIE::isContiguousBDTransfer(llvm::ArrayRef<BDDimLayoutAttr> dims) {
   if (dims.empty())
     return true; // no ND layout = trivially contiguous
@@ -2260,16 +2328,21 @@ LogicalResult DMABDOp::verify() {
         return emitOpError() << "Stride may not exceed " << (1 << 20);
     }
 
-    // Since streams read 32b words, there's no way to read eg 16b with stride
-    // of 2 (ie lower halfs of each 32b). So force it to be 1 (and then in
-    // CDODirect/XAIEV2 scale the size by 4/getBufferElementTypeWidthInBytes).
-    if (getBufferElementTypeWidthInBytes() < 4 && dims->back().getStride() != 1)
-      return emitOpError(
-          "For <32b width datatypes, inner-most dim stride must be 1");
-
-    if (getBufferElementTypeWidthInBytes() > 4 && dims->back().getStride() != 1)
-      return emitOpError(
-          "For >32b width datatypes, inner-most dim stride must be 1");
+    // Granularity / sub-word / stride alignment checks, shared with
+    // AIEX::verifyStridesWraps. dims are stored outermost-first; the helper
+    // expects innermost-first.
+    SmallVector<int64_t, 4> inputSizes, inputStrides;
+    for (auto it = dims->rbegin(); it != dims->rend(); ++it) {
+      inputSizes.push_back(static_cast<int64_t>(it->getSize()));
+      inputStrides.push_back(static_cast<int64_t>(it->getStride()));
+    }
+    DataLayout dataLayout = DataLayout::closest(getOperation());
+    unsigned elemWidthBits =
+        dataLayout.getTypeSizeInBits(buffer.getElementType());
+    if (failed(xilinx::AIE::verifyBDSizesStrides(
+            getOperation(), elemWidthBits,
+            targetModel.getAddressGenGranularity(), inputSizes, inputStrides)))
+      return failure();
   }
   if (auto paddims = getPadDimensions(); paddims.has_value()) {
     auto dims = getDimensions();
diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
index aeda8495d3b..59de8ce36fa 100644
--- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp
+++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
@@ -202,51 +202,9 @@ AIEX::verifyStridesWraps(mlir::Operation *forOp,
         std::to_string(tileRow) + ") Must be ShimNOC, Mem or Core.");
   }
 
-  for (int i = 0; i < 4; i++) {
-    if (inputSizes[i] <= 0) {
-      return forOp->emitOpError("Size ") << i << " must be a positive integer.";
-    }
-  }
-
-  if (inputSizes[0] * elemWidth % addressGranularity != 0) {
-    std::stringstream msg;
-    msg << "Transfer sizes must be multiples of " << (addressGranularity / 8)
-        << " bytes. " << inputSizes[0] << " elements at " << (elemWidth / 8)
-        << " bytes each equal " << (inputSizes[0] * elemWidth / 8)
-        << " bytes, which is not divisible by " << (addressGranularity / 8)
-        << ". ";
-    return forOp->emitOpError(msg.str());
-  }
-
-  for (int i = 0; i < 3; i++) {
-    if (inputSizes[i] > 1 && inputStrides[i] < 1) {
-      // If inputSize[i] == 1, anything is allowable in the stride, since that
-      // stride will never be applied. For any larger size, we must verify that
-      // the stride is positive.
-      return forOp->emitOpError("Stride ")
-             << i << " must be a positive integer.";
-    }
-  }
-  // A value of zero is allowable for the fourth-dimension stride
-  // (this indicates an interation stride for the repeat of 0)
-  if (inputSizes[3] > 1 && inputStrides[3] < 0) {
-    return forOp->emitOpError("Stride 3 must be a non-negative integer.");
-  }
-
-  for (int i = 0; i < 4; i++) {
-    // strides[0] == 1 is ok iff the transfer size is a multiple of
-    // addressGranularity, which is checked below
-    if (i == 0 && inputStrides[i] == 1)
-      continue;
-    if (inputStrides[i] * elemWidth % addressGranularity != 0) {
-      std::stringstream msg;
-      msg << "Stride " << i << " is " << inputStrides[i] << " elements * "
-          << (elemWidth / 8) << " bytes = " << (inputStrides[i] * elemWidth / 8)
-          << " bytes, which is not divisible by " << (addressGranularity / 8)
-          << ". ";
-      return forOp->emitOpError(msg.str());
-    }
-  }
+  if (failed(AIE::verifyBDSizesStrides(forOp, elemWidth, addressGranularity,
+                                       inputSizes, inputStrides)))
+    return failure();
 
   if (!skipTransformationChecks && hardwareSizes[0] > (1 << wrap_bits) - 1)
     return forOp->emitOpError(
diff --git a/test/dialect/AIE/bad_dma_op.mlir b/test/dialect/AIE/bad_dma_op.mlir
index 15c13ced2e9..02192a71e55 100644
--- a/test/dialect/AIE/bad_dma_op.mlir
+++ b/test/dialect/AIE/bad_dma_op.mlir
@@ -53,7 +53,7 @@ module {
 
 // -----
 
-// CHECK: For >32b width datatypes, inner-most dim stride must be 1 
+// CHECK: For element widths larger than the address granularity (4 bytes), innermost dim stride must be 1
 module {
   aie.device(npu1) {
     %tile14 = aie.tile(1, 4)
@@ -63,7 +63,27 @@ module {
       ^bd0:
         aie.dma_bd(%buf14 : memref<128x!aiex.bfp<"v8bfp16ebs8">>, 0, 128, [<size = 8, stride = 16>]) {}
         aie.next_bd ^end
-      ^end: 
+      ^end:
+        aie.end
+    }
+  }
+}
+
+// -----
+
+// Sub-word innermost contiguous run on i8: innermost size=2 elements * 1 byte
+// = 2 bytes, sub-word and unrealizable by 32-bit-granularity DMA.
+// CHECK: 2 elements at 1 bytes each equal 2 bytes, which is not divisible by 4
+module {
+  aie.device(npu1) {
+    %tile14 = aie.tile(1, 4)
+    %buf14 = aie.buffer(%tile14) { sym_name = "buf14" } : memref<128xi8>
+    %mem14 = aie.mem(%tile14) {
+      %srcDma = aie.dma_start("MM2S", 0, ^bd0, ^end)
+      ^bd0:
+        aie.dma_bd(%buf14 : memref<128xi8>, 0, 24, [<size = 3, stride = 4>, <size = 2, stride = 1>])
+        aie.next_bd ^end
+      ^end:
         aie.end
     }
   }
diff --git a/test/dialect/AIE/nd-dma-bad-stride.mlir b/test/dialect/AIE/nd-dma-bad-stride.mlir
index 7848a7c268a..d5e8d1424e3 100644
--- a/test/dialect/AIE/nd-dma-bad-stride.mlir
+++ b/test/dialect/AIE/nd-dma-bad-stride.mlir
@@ -8,24 +8,46 @@
 //
 //===----------------------------------------------------------------------===//
 
-// RUN: aie-opt --verify-diagnostics %s
+// RUN: aie-opt --split-input-file --verify-diagnostics %s
 
-module @tutorial_2b {
+// i16 (16b) inner stride of 2 elements = 32b = address-granularity multiple.
+// This is realizable in hardware and must pass verification.
+module @stride_word_aligned_ok {
   aie.device(xcve2802) {
     %tile14 = aie.tile(1, 4)
     %tile34 = aie.tile(3, 4)
-
     aie.flow(%tile14, DMA : 0, %tile34, DMA : 0)
     %buf14 = aie.buffer(%tile14) : memref<128xi16>
     %lock14_done = aie.lock(%tile14, 0) { init = 0 : i32 }
     %mem14 = aie.mem(%tile14) {
       %srcDma = aie.dma_start("MM2S", 0, ^bd0, ^end)
       ^bd0:
-        // expected-error@+1 {{'aie.dma_bd' op For <32b width datatypes, inner-most dim stride must be 1}}
         aie.dma_bd(%buf14 : memref<128xi16>, 0, 128, [<size = 32, stride = 2>])
         aie.next_bd ^end
       ^end:
         aie.end
     }
   }
-}
\ No newline at end of file
+}
+
+// -----
+
+// i16 (16b) inner stride of 3 elements = 48b, not a 32b-granularity multiple; must fail.
+module @stride_not_word_aligned {
+  aie.device(xcve2802) {
+    %tile14 = aie.tile(1, 4)
+    %tile34 = aie.tile(3, 4)
+    aie.flow(%tile14, DMA : 0, %tile34, DMA : 0)
+    %buf14 = aie.buffer(%tile14) : memref<128xi16>
+    %lock14_done = aie.lock(%tile14, 0) { init = 0 : i32 }
+    %mem14 = aie.mem(%tile14) {
+      %srcDma = aie.dma_start("MM2S", 0, ^bd0, ^end)
+      ^bd0:
+        // expected-error@+1 {{'aie.dma_bd' op Stride 0 is 3 elements * 2 bytes = 6 bytes, which is not divisible by 4}}
+        aie.dma_bd(%buf14 : memref<128xi16>, 0, 128, [<size = 32, stride = 3>])
+        aie.next_bd ^end
+      ^end:
+        aie.end
+    }
+  }
+}