openzfs · tiehexue · Jun 10, 2026 · Jun 12, 2026 · Jun 15, 2026 · Jun 23, 2026
diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c
@@ -449,28 +449,38 @@ abd_alloc_from_pages(vm_page_t *pages, unsigned long offset, uint64_t size)
 	ASSERT3U(offset, <, PAGE_SIZE);
 	ASSERT3P(pages, !=, NULL);
 
-	abd_t *abd = abd_alloc_struct(size);
-	abd->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_FROM_PAGES;
-	abd->abd_size = size;
+	abd_t *abd;
 
 	if ((offset + size) <= PAGE_SIZE) {
 		/*
 		 * There is only a single page worth of data, so we will just
-		 * use  a linear ABD. We have to make sure to take into account
-		 * the offset though. In all other cases our offset will be 0
-		 * as we are always PAGE_SIZE aligned.
+		 * use a linear ABD. We have to make sure to take into account
+		 * the offset though.
 		 */
+		abd = abd_alloc_struct(size);
+		abd->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_FROM_PAGES;
+		abd->abd_size = size;
 		abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE;
 		ABD_LINEAR_BUF(abd) = (char *)zfs_map_page(pages[0],
 		    &abd->abd_u.abd_linear.sf) + offset;
 	} else {
+		/*
+		 * Multi-page scatter ABD.  The first page may have a
+		 * non-zero byte offset (bio_ma_offset), so allocate
+		 * enough chunk slots to cover the full range from the
+		 * offset through the end of the data.
+		 */
+		uint_t chunkcnt = abd_chunkcnt_for_bytes(offset + size);
+
+		abd = abd_alloc_struct(chunkcnt << PAGE_SHIFT);
+		abd->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_FROM_PAGES;
+		abd->abd_size = size;
 		ABD_SCATTER(abd).abd_offset = offset;
-		ASSERT0(ABD_SCATTER(abd).abd_offset);
 
 		/*
 		 * Setting the ABD's abd_chunks to point to the user pages.
 		 */
-		for (int i = 0; i < abd_chunkcnt_for_bytes(size); i++)
+		for (int i = 0; i < chunkcnt; i++)
 			ABD_SCATTER(abd).abd_chunks[i] = pages[i];
 	}
 

diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c
@@ -99,6 +99,8 @@
 #include <geom/geom.h>
 #include <sys/zvol.h>
 #include <sys/zvol_impl.h>
+#include <sys/abd.h>
+#include <sys/dmu_impl.h>
 #include <cityhash.h>
 
 #include "zfs_namecheck.h"
@@ -154,6 +156,21 @@ SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
  */
 int zvol_maxphys = DMU_MAX_ACCESS / 2;
 
+/*
+ * Enable Direct I/O for zvols.  When set, page-aligned reads and
+ * volblocksize-aligned writes are issued with DMU_DIRECTIO.  Only
+ * unmapped BIOs avoid the data copy (the ABD is built over the bio_ma
+ * pages); mapped BIOs still copy via a temporary ABD.
+ *
+ * Intended for high-bandwidth NVMe-oF workloads where the CPU memcpy
+ * bottleneck limits throughput.
+ * Default: 0 (disabled) for safety. Set to 1 to enable.
+ */
+static int zvol_dio_enabled = 0;
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, dio_enabled, CTLFLAG_RWTUN,
+	&zvol_dio_enabled, 0,
+	"Enable Direct I/O for zvols (bypass ARC, DMA directly to/from bio)");
+
 static void zvol_ensure_zilog(zvol_state_t *zv);
 
 static d_open_t		zvol_cdev_open;
@@ -579,6 +596,136 @@ zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
 	return (0);
 }
 
+/*
+ * Determine if a zvol read can use the Direct I/O path.
+ *
+ * Unmapped BIOs are accepted without any buffer check (bio_ma_offset
+ * is not examined); mapped BIOs must have a page-aligned bio_data.
+ *
+ * Requirements:
+ * - Direct I/O must be enabled (zvol_dio_enabled)
+ * - The request must be non-empty and pass zfs_dio_aligned() for
+ *   off/size against PAGESIZE
+ * - Mapped BIOs only: zfs_dio_page_aligned(bp->bio_data)
+ */
+static boolean_t
+zvol_dio_can_read(struct bio *bp, uint64_t off, size_t size)
+{
+	if (!zvol_dio_enabled)
+		return (B_FALSE);
+
+	if (size == 0)
+		return (B_FALSE);
+
+	if (!zfs_dio_aligned(off, size, PAGESIZE))
+		return (B_FALSE);
+
+	/*
+	 * Unmapped BIOs carry a page array (bio_ma) plus bio_ma_offset,
+	 * the byte offset within the first page.  No buffer check is
+	 * made here.  NOTE(review): bio_ma_offset may be non-zero --
+	 * confirm Direct I/O tolerates a non-page-aligned start.
+	 */
+	if (bp->bio_flags & BIO_UNMAPPED)
+		return (B_TRUE);
+
+	/* Mapped BIOs: require a page-aligned linear buffer address. */
+	if (!zfs_dio_page_aligned(bp->bio_data))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * Determine if a zvol write can use the Direct I/O path.
+ *
+ * Returns B_TRUE only when all of the following hold:
+ * - Direct I/O is enabled (zvol_dio_enabled)
+ * - size is at least zv->zv_volblocksize
+ * - off/size pass zfs_dio_aligned() against zv->zv_volblocksize
+ * - For mapped BIOs: bp->bio_data passes zfs_dio_page_aligned()
+ * - For unmapped BIOs: no buffer check is made; bio_ma_offset
+ *   (byte offset within the first page) is not examined
+ */
+static boolean_t
+zvol_dio_can_write(zvol_state_t *zv, struct bio *bp,
+    uint64_t off, size_t size)
+{
+	if (!zvol_dio_enabled)
+		return (B_FALSE);
+
+	if (size < zv->zv_volblocksize)
+		return (B_FALSE);
+
+	if (!zfs_dio_aligned(off, size, zv->zv_volblocksize))
+		return (B_FALSE);
+
+	/* Only mapped BIOs have a linear buffer address to check. */
+	if (!(bp->bio_flags & BIO_UNMAPPED) &&
+	    !zfs_dio_page_aligned(bp->bio_data))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * Perform a Direct I/O read (dmu_read_abd with DMU_DIRECTIO) on a
+ * zvol and return the dmu_read_abd() error.
+ * Unmapped BIOs: the ABD is built over bio_ma (at bio_ma_offset), so
+ * no copy is made.  Mapped BIOs: read into a temporary ABD, then copy
+ * into bio_data on success.  NOTE(review): both branches start at the
+ * beginning of the bio although the caller loops in zvol_maxphys
+ * chunks -- confirm a multi-chunk bio cannot reach this path.
+ */
+static int
+zvol_dio_read(zvol_state_t *zv, struct bio *bp, uint64_t off, size_t size)
+{
+	abd_t *abd;
+	int error;
+
+	if (bp->bio_flags & BIO_UNMAPPED) {
+		abd = abd_alloc_from_pages(bp->bio_ma, bp->bio_ma_offset, size);
+		error = dmu_read_abd(zv->zv_dn, off, size, abd, DMU_DIRECTIO);
+		abd_free(abd);
+	} else {
+		abd = abd_alloc_for_io(size, B_FALSE);
+		error = dmu_read_abd(zv->zv_dn, off, size, abd, DMU_DIRECTIO);
+		if (error == 0)
+			abd_copy_to_buf(bp->bio_data, abd, size);
+		abd_free(abd);
+	}
+	return (error);
+}
+
+/*
+ * Perform a Direct I/O write (dmu_write_abd with DMU_DIRECTIO) on a
+ * zvol within the caller's tx and return the dmu_write_abd() error.
+ * Presumably synchronous, i.e. waits for the I/O -- TODO confirm.
+ *
+ * Unmapped BIOs: the ABD is built over the bio_ma page array (starting
+ * at bio_ma_offset), so the data is not copied.  Mapped BIOs: the data
+ * is first copied from bio_data into a newly allocated ABD.
+ * NOTE(review): the source is always the start of the bio, but the
+ * caller loops in zvol_maxphys chunks -- confirm multi-chunk bios.
+ */
+static int
+zvol_dio_write(zvol_state_t *zv, struct bio *bp, uint64_t off, size_t size,
+    dmu_tx_t *tx)
+{
+	abd_t *abd;
+	int error;
+
+	if (bp->bio_flags & BIO_UNMAPPED) {
+		abd = abd_alloc_from_pages(bp->bio_ma, bp->bio_ma_offset, size);
+	} else {
+		abd = abd_alloc_for_io(size, B_FALSE);
+		abd_copy_from_buf(abd, bp->bio_data, size);
+	}
+	error = dmu_write_abd(zv->zv_dn, off, size, abd, DMU_DIRECTIO, tx);
+	abd_free(abd);
+	return (error);
+}
+
 static void
 zvol_strategy_impl(zv_request_t *zvr)
 {
@@ -670,18 +817,98 @@ zvol_strategy_impl(zv_request_t *zvr)
 	while (resid != 0 && off < volsize) {
 		size_t size = MIN(resid, zvol_maxphys);
 		if (doread) {
-			error = dmu_read_by_dnode(zv->zv_dn, off, size, addr,
-			    DMU_READ_PREFETCH);
+			/*
+			 * Try Direct I/O first for page-aligned reads.
+			 * This bypasses the ARC and DMAs data directly
+			 * into the bio_data/bio_ma buffer. On checksum
+			 * error, fall back to the ARC path for safety.
+			 */
+			if (zvol_dio_can_read(bp, off, size)) {
+				error = zvol_dio_read(zv, bp, off, size);
+				if (error == ECKSUM) {
+					/*
+					 * For unmapped BIOs with ECKSUM,
+					 * create a temp ABD, borrow/copy
+					 * the linear buffer for ARC retry,
+					 * then copy results back to pages.
+					 */
+					abd_t *tmp = NULL;
+					if (bp->bio_flags & BIO_UNMAPPED) {
+						tmp = abd_alloc_from_pages(
+						    bp->bio_ma,
+						    bp->bio_ma_offset, size);
+						addr = abd_borrow_buf_copy(
+						    tmp, size);
+					}
+					error = dmu_read_by_dnode(zv->zv_dn,
+					    off, size, addr,
+					    DMU_READ_PREFETCH);
+					if (bp->bio_flags & BIO_UNMAPPED) {
+						abd_return_buf_copy(
+						    tmp, addr, size);
+						abd_free(tmp);
+					}
+				}
+			} else {
+				abd_t *tmp = NULL;
+				if (bp->bio_flags & BIO_UNMAPPED) {
+					tmp = abd_alloc_from_pages(
+					    bp->bio_ma,
+					    bp->bio_ma_offset, size);
+					addr = abd_borrow_buf_copy(
+					    tmp, size);
+				}
+				error = dmu_read_by_dnode(zv->zv_dn, off, size,
+				    addr, DMU_READ_PREFETCH);
+				if (bp->bio_flags & BIO_UNMAPPED) {
+					abd_return_buf_copy(tmp, addr, size);
+					abd_free(tmp);
+				}
+			}
 		} else {
 			dmu_tx_t *tx = dmu_tx_create(os);
 			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
 			error = dmu_tx_assign(tx, DMU_TX_WAIT);
 			if (error) {
 				dmu_tx_abort(tx);
 			} else {
-				dmu_write_by_dnode(zv->zv_dn, off, size, addr,
-				    tx, DMU_READ_PREFETCH);
-				zvol_log_write(zv, tx, off, size, commit);
+				/*
+				 * Try Direct I/O for block-aligned writes.
+				 * This bypasses the ARC and writes directly
+				 * to disk, avoiding the data copy overhead.
+				 */
+				if (zvol_dio_can_write(zv, bp, off, size)) {
+					error = zvol_dio_write(zv, bp, off,
+					    size, tx);
+					if (error == 0)
+						zvol_log_write(zv, tx, off,
+						    size, commit);
+				} else {
+					/*
+					 * For unmapped BIOs falling back
+					 * to ARC, create a temp ABD from
+					 * scatter pages, borrow a linear
+					 * buffer, write it, then release.
+					 */
+					abd_t *tmp = NULL;
+					if (bp->bio_flags & BIO_UNMAPPED) {
+						tmp = abd_alloc_from_pages(
+						    bp->bio_ma,
+						    bp->bio_ma_offset, size);
+						addr = abd_borrow_buf_copy(
+						    tmp, size);
+					}
+					dmu_write_by_dnode(zv->zv_dn, off,
+					    size, addr, tx,
+					    DMU_READ_PREFETCH);
+					if (bp->bio_flags & BIO_UNMAPPED) {
+						abd_return_buf(tmp, addr,
+						    size);
+						abd_free(tmp);
+					}
+					zvol_log_write(zv, tx, off, size,
+					    commit);
+				}
 				dmu_tx_commit(tx);
 			}
 		}
@@ -1278,7 +1505,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 		g_wither_provider(pp, ENXIO);
 
 		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
-		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND |
+		    G_PF_ACCEPT_UNMAPPED;
 		pp->sectorsize = DEV_BSIZE;
 		pp->mediasize = zv->zv_volsize;
 		pp->private = zv;
@@ -1361,7 +1589,8 @@ zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize,
 		gp->start = zvol_geom_bio_start;
 		gp->access = zvol_geom_access;
 		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
-		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND |
+		    G_PF_ACCEPT_UNMAPPED;
 		pp->sectorsize = DEV_BSIZE;
 		pp->mediasize = 0;
 		pp->private = zv;

diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c
@@ -359,6 +359,16 @@ zfs_uioskip(zfs_uio_t *uio, size_t n)
 			uio->uio_bvec++;
 			uio->uio_iovcnt--;
 		}
+	} else if (uio->uio_segflg == UIO_BVEC) {
+		/*
+		 * When using a uio backed by a struct request (blk-mq),
+		 * the bvec pointers are not maintained during uioskip.
+		 * Callers (e.g. zvol_dio_read) derive page mappings
+		 * directly from the request using zvol_dio_get_pages(),
+		 * which walks the request segments independently using
+		 * the uio_loffset.  We only need to advance the logical
+		 * offset and resid — no bvec accounting needed.
+		 */
 	} else if (uio->uio_segflg == UIO_ITER) {
 		iov_iter_advance(uio->uio_iter, n);
 	} else {
@@ -403,6 +413,35 @@ zfs_uio_page_aligned(zfs_uio_t *uio)
 		unsigned long alignment =
 		    iov_iter_alignment(uio->uio_iter);
 		aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
+	} else if (uio->uio_segflg == UIO_BVEC) {
+		/*
+		 * For bio_vec-backed I/O (zvols), check that each
+		 * segment is page-aligned.  The block layer typically
+		 * allocates page-aligned I/O, so this should almost
+		 * always pass.
+		 */
+		if (uio->rq != NULL) {
+			struct bio_vec bv;
+			struct req_iterator iter;
+			rq_for_each_segment(bv, uio->rq, iter) {
+				if (!IS_P2ALIGNED(bv.bv_offset, PAGE_SIZE) ||
+				    !IS_P2ALIGNED(bv.bv_len, PAGE_SIZE)) {
+					aligned = B_FALSE;
+					break;
+				}
+			}
+		} else if (uio->uio_bvec != NULL) {
+			const struct bio_vec *bv = uio->uio_bvec;
+			for (int i = 0; i < uio->uio_iovcnt; i++, bv++) {
+				if (!IS_P2ALIGNED(bv->bv_offset, PAGE_SIZE) ||
+				    !IS_P2ALIGNED(bv->bv_len, PAGE_SIZE)) {
+					aligned = B_FALSE;
+					break;
+				}
+			}
+		} else {
+			aligned = B_FALSE;
+		}
 	} else {
 		/* Currently not supported */
 		aligned = B_FALSE;