Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 18 additions & 8 deletions module/os/freebsd/zfs/abd_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -449,28 +449,38 @@ abd_alloc_from_pages(vm_page_t *pages, unsigned long offset, uint64_t size)
ASSERT3U(offset, <, PAGE_SIZE);
ASSERT3P(pages, !=, NULL);

abd_t *abd = abd_alloc_struct(size);
abd->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_FROM_PAGES;
abd->abd_size = size;
abd_t *abd;

if ((offset + size) <= PAGE_SIZE) {
/*
* There is only a single page worth of data, so we will just
* use a linear ABD. We have to make sure to take into account
* the offset though. In all other cases our offset will be 0
* as we are always PAGE_SIZE aligned.
* use a linear ABD. We have to make sure to take into account
* the offset though.
*/
abd = abd_alloc_struct(size);
abd->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_FROM_PAGES;
abd->abd_size = size;
abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE;
ABD_LINEAR_BUF(abd) = (char *)zfs_map_page(pages[0],
&abd->abd_u.abd_linear.sf) + offset;
} else {
/*
* Multi-page scatter ABD. The first page may have a
* non-zero byte offset (bio_ma_offset), so allocate
* enough chunk slots to cover the full range from the
* offset through the end of the data.
*/
uint_t chunkcnt = abd_chunkcnt_for_bytes(offset + size);

abd = abd_alloc_struct(chunkcnt << PAGE_SHIFT);
Comment thread
tiehexue marked this conversation as resolved.
Outdated
abd->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_FROM_PAGES;
abd->abd_size = size;
ABD_SCATTER(abd).abd_offset = offset;
ASSERT0(ABD_SCATTER(abd).abd_offset);
Comment thread
tiehexue marked this conversation as resolved.

/*
* Setting the ABD's abd_chunks to point to the user pages.
*/
for (int i = 0; i < abd_chunkcnt_for_bytes(size); i++)
for (int i = 0; i < chunkcnt; i++)
ABD_SCATTER(abd).abd_chunks[i] = pages[i];
}

Expand Down
243 changes: 236 additions & 7 deletions module/os/freebsd/zfs/zvol_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@
#include <geom/geom.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
#include <sys/abd.h>
#include <sys/dmu_impl.h>
#include <cityhash.h>

#include "zfs_namecheck.h"
Expand Down Expand Up @@ -154,6 +156,21 @@ SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
*/
int zvol_maxphys = DMU_MAX_ACCESS / 2;

/*
* Enable Direct I/O for zvols. When enabled, page-aligned reads and
* volblocksize-aligned writes bypass the ARC. For unmapped BIOs the
* transfer goes directly to/from the bio_ma pages; for mapped BIOs the
* data is still staged in an intermediate ABD and copied to/from
* bio_data.
*
* This is particularly beneficial for high-bandwidth NVMe-oF workloads
* where the CPU memcpy bottleneck limits throughput.
*
* Default: 0 (disabled) for safety. Set to 1 to enable.
*/
static int zvol_dio_enabled = 0;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, dio_enabled, CTLFLAG_RWTUN,
&zvol_dio_enabled, 0,
"Enable Direct I/O for zvols (bypass ARC, DMA directly to/from bio)");

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t zvol_cdev_open;
Expand Down Expand Up @@ -579,6 +596,136 @@ zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
return (0);
}

/*
 * Decide whether a zvol read may take the Direct I/O path.
 *
 * B_TRUE is returned only when every one of these holds:
 *   - the zvol_dio_enabled tunable is non-zero;
 *   - the request is not empty (size != 0);
 *   - zfs_dio_aligned() accepts off/size at PAGESIZE granularity;
 *   - for a mapped BIO, zfs_dio_page_aligned() accepts bio_data.
 *
 * An unmapped BIO (BIO_UNMAPPED set) is accepted as soon as the
 * offset/size check passes; bio_ma_offset is not examined here.
 *
 * NOTE(review): a non-zero bio_ma_offset is therefore allowed onto the
 * Direct I/O path -- confirm that the ABD built from bio_ma handles an
 * intra-page starting offset.
 */
static boolean_t
zvol_dio_can_read(struct bio *bp, uint64_t off, size_t size)
{
	/* Feature switched off, or nothing to transfer. */
	if (zvol_dio_enabled == 0 || size == 0)
		return (B_FALSE);

	/* Volume offset and length must both be page multiples. */
	if (!zfs_dio_aligned(off, size, PAGESIZE))
		return (B_FALSE);

	/* Unmapped BIOs carry a page array; no address to validate. */
	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
		return (B_TRUE);

	/* Mapped BIOs: the verdict is the alignment of the linear buffer. */
	return (zfs_dio_page_aligned(bp->bio_data) ? B_TRUE : B_FALSE);
}

/*
 * Decide whether a zvol write may take the Direct I/O path.
 *
 * B_TRUE is returned only when every one of these holds:
 *   - the zvol_dio_enabled tunable is non-zero;
 *   - size is at least zv->zv_volblocksize;
 *   - zfs_dio_aligned() accepts off/size at zv_volblocksize
 *     granularity;
 *   - for a mapped BIO, zfs_dio_page_aligned() accepts bio_data.
 *
 * For an unmapped BIO (BIO_UNMAPPED set) no buffer check is made;
 * bio_ma_offset is not examined here.
 */
static boolean_t
zvol_dio_can_write(zvol_state_t *zv, struct bio *bp,
    uint64_t off, size_t size)
{
	boolean_t mapped;

	if (zvol_dio_enabled == 0)
		return (B_FALSE);

	/* Only whole, block-aligned writes qualify. */
	if (size < zv->zv_volblocksize ||
	    !zfs_dio_aligned(off, size, zv->zv_volblocksize))
		return (B_FALSE);

	/* A linear buffer must additionally start on a page boundary. */
	mapped = (bp->bio_flags & BIO_UNMAPPED) == 0;
	if (mapped && !zfs_dio_page_aligned(bp->bio_data))
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * Read `size` bytes at volume offset `off` through dmu_read_abd() with
 * the DMU_DIRECTIO flag.
 *
 * Unmapped BIO (BIO_UNMAPPED set): the destination ABD is built by
 * abd_alloc_from_pages() over bp->bio_ma starting at bp->bio_ma_offset,
 * so the read lands in the BIO's pages with no extra copy made here.
 *
 * Mapped BIO: a separate ABD is obtained from abd_alloc_for_io(), the
 * read goes into it, and on success its contents are copied out to
 * bp->bio_data.
 *
 * Returns 0 on success or the error reported by dmu_read_abd(). The
 * ABD is released on every path.
 *
 * NOTE(review): both paths address the start of the BIO's buffer
 * (bio_ma[0] / bio_data), independent of `off`. If the caller splits a
 * single BIO into several chunks, later chunks would need the running
 * position within the BIO -- confirm against the caller's loop.
 */
static int
zvol_dio_read(zvol_state_t *zv, struct bio *bp, uint64_t off, size_t size)
{
	boolean_t unmapped = (bp->bio_flags & BIO_UNMAPPED) != 0;
	abd_t *abd;
	int error;

	if (unmapped)
		abd = abd_alloc_from_pages(bp->bio_ma, bp->bio_ma_offset, size);
	else
		abd = abd_alloc_for_io(size, B_FALSE);

	error = dmu_read_abd(zv->zv_dn, off, size, abd, DMU_DIRECTIO);

	/* Only the bounce-buffer case needs the data moved afterwards. */
	if (error == 0 && !unmapped)
		abd_copy_to_buf(bp->bio_data, abd, size);

	abd_free(abd);
	return (error);
}

/*
 * Write `size` bytes at volume offset `off` through dmu_write_abd()
 * with the DMU_DIRECTIO flag, under the caller-supplied transaction
 * `tx`.
 *
 * Mapped BIO: the payload is first copied from bp->bio_data into an
 * ABD obtained from abd_alloc_for_io(), and that ABD is written.
 *
 * Unmapped BIO (BIO_UNMAPPED set): the source ABD is built by
 * abd_alloc_from_pages() over bp->bio_ma starting at bp->bio_ma_offset,
 * so no copy of the payload is made here.
 *
 * Returns whatever dmu_write_abd() returns. The ABD is released before
 * returning regardless of the outcome.
 *
 * NOTE(review): both paths address the start of the BIO's buffer
 * (bio_data / bio_ma[0]), independent of `off`. If the caller splits a
 * single BIO into several chunks, later chunks would need the running
 * position within the BIO -- confirm against the caller's loop.
 */
static int
zvol_dio_write(zvol_state_t *zv, struct bio *bp, uint64_t off, size_t size,
    dmu_tx_t *tx)
{
	abd_t *src;
	int err;

	if ((bp->bio_flags & BIO_UNMAPPED) == 0) {
		/* Linear kernel buffer: stage it in a freshly allocated ABD. */
		src = abd_alloc_for_io(size, B_FALSE);
		abd_copy_from_buf(src, bp->bio_data, size);
	} else {
		/* Page array: wrap the pages themselves. */
		src = abd_alloc_from_pages(bp->bio_ma, bp->bio_ma_offset,
		    size);
	}

	err = dmu_write_abd(zv->zv_dn, off, size, src, DMU_DIRECTIO, tx);
	abd_free(src);

	return (err);
}

static void
zvol_strategy_impl(zv_request_t *zvr)
{
Expand Down Expand Up @@ -670,18 +817,98 @@ zvol_strategy_impl(zv_request_t *zvr)
while (resid != 0 && off < volsize) {
size_t size = MIN(resid, zvol_maxphys);
if (doread) {
error = dmu_read_by_dnode(zv->zv_dn, off, size, addr,
DMU_READ_PREFETCH);
/*
* Try Direct I/O first for page-aligned reads.
* This bypasses the ARC and DMAs data directly
* into the bio_data/bio_ma buffer. On checksum
* error, fall back to the ARC path for safety.
*/
if (zvol_dio_can_read(bp, off, size)) {
error = zvol_dio_read(zv, bp, off, size);
if (error == ECKSUM) {
/*
* For unmapped BIOs with ECKSUM,
* create a temp ABD, borrow/copy
* the linear buffer for ARC retry,
* then copy results back to pages.
*/
abd_t *tmp = NULL;
if (bp->bio_flags & BIO_UNMAPPED) {
tmp = abd_alloc_from_pages(
bp->bio_ma,
bp->bio_ma_offset, size);
addr = abd_borrow_buf_copy(
tmp, size);
}
error = dmu_read_by_dnode(zv->zv_dn,
off, size, addr,
DMU_READ_PREFETCH);
if (bp->bio_flags & BIO_UNMAPPED) {
abd_return_buf_copy(
tmp, addr, size);
abd_free(tmp);
}
}
} else {
abd_t *tmp = NULL;
if (bp->bio_flags & BIO_UNMAPPED) {
tmp = abd_alloc_from_pages(
bp->bio_ma,
bp->bio_ma_offset, size);
addr = abd_borrow_buf_copy(
tmp, size);
}
error = dmu_read_by_dnode(zv->zv_dn, off, size,
addr, DMU_READ_PREFETCH);
if (bp->bio_flags & BIO_UNMAPPED) {
abd_return_buf_copy(tmp, addr, size);
abd_free(tmp);
Comment thread
tiehexue marked this conversation as resolved.
}
}
} else {
dmu_tx_t *tx = dmu_tx_create(os);
dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
error = dmu_tx_assign(tx, DMU_TX_WAIT);
if (error) {
dmu_tx_abort(tx);
} else {
dmu_write_by_dnode(zv->zv_dn, off, size, addr,
tx, DMU_READ_PREFETCH);
zvol_log_write(zv, tx, off, size, commit);
/*
* Try Direct I/O for block-aligned writes.
* This bypasses the ARC and writes directly
* to disk, avoiding the data copy overhead.
*/
if (zvol_dio_can_write(zv, bp, off, size)) {
error = zvol_dio_write(zv, bp, off,
size, tx);
if (error == 0)
zvol_log_write(zv, tx, off,
size, commit);
} else {
/*
* For unmapped BIOs falling back
* to ARC, create a temp ABD from
* scatter pages, borrow a linear
* buffer, write it, then release.
*/
abd_t *tmp = NULL;
if (bp->bio_flags & BIO_UNMAPPED) {
tmp = abd_alloc_from_pages(
bp->bio_ma,
bp->bio_ma_offset, size);
addr = abd_borrow_buf_copy(
tmp, size);
}
dmu_write_by_dnode(zv->zv_dn, off,
size, addr, tx,
DMU_READ_PREFETCH);
if (bp->bio_flags & BIO_UNMAPPED) {
abd_return_buf(tmp, addr,
size);
abd_free(tmp);
}
zvol_log_write(zv, tx, off, size,
commit);
}
dmu_tx_commit(tx);
}
}
Expand Down Expand Up @@ -1278,7 +1505,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
g_wither_provider(pp, ENXIO);

pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND |
G_PF_ACCEPT_UNMAPPED;
pp->sectorsize = DEV_BSIZE;
pp->mediasize = zv->zv_volsize;
pp->private = zv;
Expand Down Expand Up @@ -1361,7 +1589,8 @@ zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize,
gp->start = zvol_geom_bio_start;
gp->access = zvol_geom_access;
pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND |
G_PF_ACCEPT_UNMAPPED;
pp->sectorsize = DEV_BSIZE;
pp->mediasize = 0;
pp->private = zv;
Expand Down
39 changes: 39 additions & 0 deletions module/os/linux/zfs/zfs_uio.c
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,16 @@ zfs_uioskip(zfs_uio_t *uio, size_t n)
uio->uio_bvec++;
uio->uio_iovcnt--;
}
} else if (uio->uio_segflg == UIO_BVEC) {
/*
* When using a uio backed by a struct request (blk-mq),
* the bvec pointers are not maintained during uioskip.
* Callers (e.g. zvol_dio_read) derive page mappings
* directly from the request using zvol_dio_get_pages(),
* which walks the request segments independently using
* the uio_loffset. We only need to advance the logical
* offset and resid — no bvec accounting needed.
*/
} else if (uio->uio_segflg == UIO_ITER) {
iov_iter_advance(uio->uio_iter, n);
} else {
Expand Down Expand Up @@ -403,6 +413,35 @@ zfs_uio_page_aligned(zfs_uio_t *uio)
unsigned long alignment =
iov_iter_alignment(uio->uio_iter);
aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
} else if (uio->uio_segflg == UIO_BVEC) {
/*
* For bio_vec-backed I/O (zvols), check that each
* segment is page-aligned. The block layer typically
* allocates page-aligned I/O, so this should almost
* always pass.
*/
if (uio->rq != NULL) {
struct bio_vec bv;
struct req_iterator iter;
rq_for_each_segment(bv, uio->rq, iter) {
if (!IS_P2ALIGNED(bv.bv_offset, PAGE_SIZE) ||
!IS_P2ALIGNED(bv.bv_len, PAGE_SIZE)) {
aligned = B_FALSE;
break;
}
}
} else if (uio->uio_bvec != NULL) {
const struct bio_vec *bv = uio->uio_bvec;
for (int i = 0; i < uio->uio_iovcnt; i++, bv++) {
if (!IS_P2ALIGNED(bv->bv_offset, PAGE_SIZE) ||
!IS_P2ALIGNED(bv->bv_len, PAGE_SIZE)) {
aligned = B_FALSE;
break;
}
}
} else {
aligned = B_FALSE;
}
} else {
/* Currently not supported */
aligned = B_FALSE;
Expand Down
Loading
Loading