Skip to content

Commit 680eb8b

Browse files
committed
zvol with directio
Signed-off-by: tiehexue <tiehexue@hotmail.com>
1 parent 7e054b2 commit 680eb8b

15 files changed

Lines changed: 2070 additions & 19 deletions

File tree

module/os/freebsd/zfs/abd_os.c

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -449,28 +449,38 @@ abd_alloc_from_pages(vm_page_t *pages, unsigned long offset, uint64_t size)
449449
ASSERT3U(offset, <, PAGE_SIZE);
450450
ASSERT3P(pages, !=, NULL);
451451

452-
abd_t *abd = abd_alloc_struct(size);
453-
abd->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_FROM_PAGES;
454-
abd->abd_size = size;
452+
abd_t *abd;
455453

456454
if ((offset + size) <= PAGE_SIZE) {
457455
/*
458456
* There is only a single page worth of data, so we will just
459-
* use a linear ABD. We have to make sure to take into account
460-
* the offset though. In all other cases our offset will be 0
461-
* as we are always PAGE_SIZE aligned.
457+
* use a linear ABD. We have to make sure to take into account
458+
* the offset though.
462459
*/
460+
abd = abd_alloc_struct(size);
461+
abd->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_FROM_PAGES;
462+
abd->abd_size = size;
463463
abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE;
464464
ABD_LINEAR_BUF(abd) = (char *)zfs_map_page(pages[0],
465465
&abd->abd_u.abd_linear.sf) + offset;
466466
} else {
467+
/*
468+
* Multi-page scatter ABD. The first page may have a
469+
* non-zero byte offset (bio_ma_offset), so allocate
470+
* enough chunk slots to cover the full range from the
471+
* offset through the end of the data.
472+
*/
473+
uint_t chunkcnt = abd_chunkcnt_for_bytes(offset + size);
474+
475+
abd = abd_alloc_struct(chunkcnt << PAGE_SHIFT);
476+
abd->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_FROM_PAGES;
477+
abd->abd_size = size;
467478
ABD_SCATTER(abd).abd_offset = offset;
468-
ASSERT0(ABD_SCATTER(abd).abd_offset);
469479

470480
/*
471481
* Setting the ABD's abd_chunks to point to the user pages.
472482
*/
473-
for (int i = 0; i < abd_chunkcnt_for_bytes(size); i++)
483+
for (int i = 0; i < chunkcnt; i++)
474484
ABD_SCATTER(abd).abd_chunks[i] = pages[i];
475485
}
476486

module/os/freebsd/zfs/zvol_os.c

Lines changed: 226 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@
9999
#include <geom/geom.h>
100100
#include <sys/zvol.h>
101101
#include <sys/zvol_impl.h>
102+
#include <sys/abd.h>
103+
#include <sys/dmu_impl.h>
102104
#include <cityhash.h>
103105

104106
#include "zfs_namecheck.h"
@@ -154,6 +156,21 @@ SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
154156
*/
155157
int zvol_maxphys = DMU_MAX_ACCESS / 2;
156158

159+
/*
160+
* Enable Direct I/O for zvols. When enabled, page-aligned reads and
161+
* block-aligned writes will bypass the ARC and DMA directly into/from
162+
* the bio_data buffer, avoiding the data copy overhead.
163+
*
164+
* This is particularly beneficial for high-bandwidth NVMe-oF workloads
165+
* where the CPU memcpy bottleneck limits throughput.
166+
*
167+
* Default: 0 (disabled) for safety. Set to 1 to enable.
168+
*/
169+
static int zvol_dio_enabled = 0;
170+
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, dio_enabled, CTLFLAG_RWTUN,
171+
&zvol_dio_enabled, 0,
172+
"Enable Direct I/O for zvols (bypass ARC, DMA directly to/from bio)");
173+
157174
static void zvol_ensure_zilog(zvol_state_t *zv);
158175

159176
static d_open_t zvol_cdev_open;
@@ -579,6 +596,126 @@ zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
579596
return (0);
580597
}
581598

599+
/*
600+
* Determine if a zvol read can use the Direct I/O path.
601+
*
602+
* For unmapped (scattered) BIOs, no buffer alignment check is performed.
603+
* For mapped (linear) BIOs, alignment is checked on the buffer address.
604+
*
605+
* Requirements:
606+
* - Direct I/O must be enabled (zvol_dio_enabled)
607+
* - The I/O offset and size must be page-aligned
608+
* - For mapped BIOs, the buffer address (bio_data) must be page-aligned;
609+
unmapped BIOs are accepted without looking at bio_ma_offset
* - The size must be non-zero
*
* Parameters:
*   bp   - the bio being serviced; only bio_flags and bio_data are read
*   off  - offset of the read, passed to zfs_dio_aligned()
*   size - length of the read in bytes
*
* Returns B_TRUE when every check above passes, B_FALSE otherwise.
610+
*/
611+
static boolean_t
612+
zvol_dio_can_read(struct bio *bp, uint64_t off, size_t size)
613+
{
614+
if (!zvol_dio_enabled)
615+
return (B_FALSE);
616+
617+
if (size == 0)
618+
return (B_FALSE);
619+
620+
if (!zfs_dio_aligned(off, size, PAGESIZE))
621+
return (B_FALSE);
622+
623+
/*
624+
* For unmapped BIOs, the bio_ma array contains physical page
625+
* pointers; the bio_ma_offset is the byte offset within the
626+
* first page (may be non-zero). No additional buffer address
627+
* alignment check needed since pages are always aligned.
*
* NOTE(review): bio_ma_offset itself is not tested here, so an
* unmapped bio whose data starts part-way into its first page is
* still reported as Direct I/O capable -- confirm the Direct I/O
* read path tolerates a non-zero first-page offset.
628+
*/
629+
if (bp->bio_flags & BIO_UNMAPPED)
630+
return (B_TRUE);
631+
632+
/* For mapped BIOs, check the linear buffer address. */
633+
if (!zfs_dio_page_aligned(bp->bio_data))
634+
return (B_FALSE);
635+
636+
return (B_TRUE);
637+
}
638+
639+
/*
640+
* Determine if a zvol write can use the Direct I/O path.
641+
*
642+
* Requirements:
643+
* - Direct I/O must be enabled
644+
* - The write must be block-aligned (volblocksize)
645+
* - The write must be at least one full volblocksize
646+
* - For mapped BIOs: buffer address must be page-aligned
647+
* - For unmapped BIOs: page pointers are always aligned;
648+
bio_ma_offset is the byte offset within the first page
* and is not examined by this function
*
* Parameters:
*   zv   - the zvol; only zv_volblocksize is read
*   bp   - the bio being serviced; only bio_flags and bio_data are read
*   off  - offset of the write, passed to zfs_dio_aligned()
*   size - length of the write in bytes
*
* Returns B_TRUE when every check above passes, B_FALSE otherwise.
649+
*/
650+
static boolean_t
651+
zvol_dio_can_write(zvol_state_t *zv, struct bio *bp,
652+
uint64_t off, size_t size)
653+
{
654+
if (!zvol_dio_enabled)
655+
return (B_FALSE);
656+
/* Writes shorter than one volume block never use Direct I/O. */
657+
if (size < zv->zv_volblocksize)
658+
return (B_FALSE);
659+
660+
if (!zfs_dio_aligned(off, size, zv->zv_volblocksize))
661+
return (B_FALSE);
662+
663+
/*
* For mapped BIOs, check the linear buffer address.
* Unmapped BIOs skip this test entirely.
*/
664+
if (!(bp->bio_flags & BIO_UNMAPPED) &&
665+
!zfs_dio_page_aligned(bp->bio_data))
666+
return (B_FALSE);
667+
668+
return (B_TRUE);
669+
}
670+
671+
/*
672+
* Perform a Direct I/O read on a zvol, bypassing the ARC.
673+
*
674+
* For unmapped BIOs, builds an ABD over the bio_ma page array with
675+
* abd_alloc_from_pages() so the data lands in the consumer's pages.
676+
* For mapped (linear) BIOs, wraps bio_data in a linear ABD.
*
* The ABD is released with abd_free() before returning, whether or
* not the read succeeded.
*
* Parameters:
*   zv   - the zvol; its zv_dn dnode is the read target
*   bp   - the bio supplying the destination pages or buffer
*   off  - offset to read from
*   size - number of bytes to read
*
* Returns the value returned by dmu_read_abd().
*
* NOTE(review): the ABD always starts at the beginning of the bio
* (bio_ma[0] plus bio_ma_offset, or bio_data), independent of 'off'.
* If a caller issues several calls for one bio, e.g. when splitting a
* request into zvol_maxphys-sized pieces, every call would target the
* same leading pages -- confirm callers never split, or pass in an
* advanced buffer position.
677+
*/
678+
static int
679+
zvol_dio_read(zvol_state_t *zv, struct bio *bp, uint64_t off, size_t size)
680+
{
681+
abd_t *abd;
682+
int error;
683+
684+
if (bp->bio_flags & BIO_UNMAPPED) {
685+
abd = abd_alloc_from_pages(bp->bio_ma, bp->bio_ma_offset, size);
686+
} else {
687+
abd = abd_get_from_buf(bp->bio_data, size);
688+
}
689+
error = dmu_read_abd(zv->zv_dn, off, size, abd, DMU_DIRECTIO);
690+
abd_free(abd);
691+
return (error);
692+
}
693+
694+
/*
694+
* Perform a Direct I/O write on a zvol, bypassing the ARC.
695+
*
696+
* For unmapped BIOs, builds an ABD over the bio_ma page array with
697+
* abd_alloc_from_pages() so the data is taken from the consumer's pages.
698+
* For mapped (linear) BIOs, wraps bio_data in a linear ABD.
699+
* This is a synchronous write — it waits for the I/O to complete.
*
* The ABD is released with abd_free() before returning, whether or
* not the write succeeded.
*
* Parameters:
*   zv   - the zvol; its zv_dn dnode is the write target
*   bp   - the bio supplying the source pages or buffer
*   off  - offset to write to
*   size - number of bytes to write
*   tx   - transaction handed through to dmu_write_abd(); this function
*          neither commits nor aborts it
*
* Returns the value returned by dmu_write_abd().
*
* NOTE(review): the ABD always starts at the beginning of the bio
* (bio_ma[0] plus bio_ma_offset, or bio_data), independent of 'off'.
* If a caller issues several calls for one bio, e.g. when splitting a
* request into zvol_maxphys-sized pieces, every call would write the
* same leading pages -- confirm callers never split, or pass in an
* advanced buffer position.
700+
*/
701+
static int
702+
zvol_dio_write(zvol_state_t *zv, struct bio *bp, uint64_t off, size_t size,
703+
dmu_tx_t *tx)
704+
{
705+
abd_t *abd;
706+
int error;
707+
708+
if (bp->bio_flags & BIO_UNMAPPED) {
709+
abd = abd_alloc_from_pages(bp->bio_ma, bp->bio_ma_offset, size);
710+
} else {
711+
abd = abd_get_from_buf(bp->bio_data, size);
712+
}
713+
error = dmu_write_abd(zv->zv_dn, off, size, abd, DMU_DIRECTIO, tx);
714+
abd_free(abd);
715+
return (error);
716+
}
718+
582719
static void
583720
zvol_strategy_impl(zv_request_t *zvr)
584721
{
@@ -670,18 +807,98 @@ zvol_strategy_impl(zv_request_t *zvr)
670807
while (resid != 0 && off < volsize) {
671808
size_t size = MIN(resid, zvol_maxphys);
672809
if (doread) {
673-
error = dmu_read_by_dnode(zv->zv_dn, off, size, addr,
674-
DMU_READ_PREFETCH);
810+
/*
811+
* Try Direct I/O first for page-aligned reads.
812+
* This bypasses the ARC and DMAs data directly
813+
* into the bio_data/bio_ma buffer. On checksum
814+
* error, fall back to the ARC path for safety.
815+
*/
816+
if (zvol_dio_can_read(bp, off, size)) {
817+
error = zvol_dio_read(zv, bp, off, size);
818+
if (error == ECKSUM) {
819+
/*
820+
* For unmapped BIOs with ECKSUM,
821+
* create a temp ABD, borrow/copy
822+
* the linear buffer for ARC retry,
823+
* then copy results back to pages.
824+
*/
825+
abd_t *tmp = NULL;
826+
if (bp->bio_flags & BIO_UNMAPPED) {
827+
tmp = abd_alloc_from_pages(
828+
bp->bio_ma,
829+
bp->bio_ma_offset, size);
830+
addr = abd_borrow_buf_copy(
831+
tmp, size);
832+
}
833+
error = dmu_read_by_dnode(zv->zv_dn,
834+
off, size, addr,
835+
DMU_READ_PREFETCH);
836+
if (bp->bio_flags & BIO_UNMAPPED) {
837+
abd_return_buf_copy(
838+
tmp, addr, size);
839+
abd_free(tmp);
840+
}
841+
}
842+
} else {
843+
abd_t *tmp = NULL;
844+
if (bp->bio_flags & BIO_UNMAPPED) {
845+
tmp = abd_alloc_from_pages(
846+
bp->bio_ma,
847+
bp->bio_ma_offset, size);
848+
addr = abd_borrow_buf_copy(
849+
tmp, size);
850+
}
851+
error = dmu_read_by_dnode(zv->zv_dn, off, size,
852+
addr, DMU_READ_PREFETCH);
853+
if (bp->bio_flags & BIO_UNMAPPED) {
854+
abd_return_buf_copy(tmp, addr, size);
855+
abd_free(tmp);
856+
}
857+
}
675858
} else {
676859
dmu_tx_t *tx = dmu_tx_create(os);
677860
dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
678861
error = dmu_tx_assign(tx, DMU_TX_WAIT);
679862
if (error) {
680863
dmu_tx_abort(tx);
681864
} else {
682-
dmu_write_by_dnode(zv->zv_dn, off, size, addr,
683-
tx, DMU_READ_PREFETCH);
684-
zvol_log_write(zv, tx, off, size, commit);
865+
/*
866+
* Try Direct I/O for block-aligned writes.
867+
* This bypasses the ARC and writes directly
868+
* to disk, avoiding the data copy overhead.
869+
*/
870+
if (zvol_dio_can_write(zv, bp, off, size)) {
871+
error = zvol_dio_write(zv, bp, off,
872+
size, tx);
873+
if (error == 0)
874+
zvol_log_write(zv, tx, off,
875+
size, commit);
876+
} else {
877+
/*
878+
* For unmapped BIOs falling back
879+
* to ARC, create a temp ABD from
880+
* scatter pages, borrow a linear
881+
* buffer, write it, then release.
882+
*/
883+
abd_t *tmp = NULL;
884+
if (bp->bio_flags & BIO_UNMAPPED) {
885+
tmp = abd_alloc_from_pages(
886+
bp->bio_ma,
887+
bp->bio_ma_offset, size);
888+
addr = abd_borrow_buf_copy(
889+
tmp, size);
890+
}
891+
dmu_write_by_dnode(zv->zv_dn, off,
892+
size, addr, tx,
893+
DMU_READ_PREFETCH);
894+
if (bp->bio_flags & BIO_UNMAPPED) {
895+
abd_return_buf(tmp, addr,
896+
size);
897+
abd_free(tmp);
898+
}
899+
zvol_log_write(zv, tx, off, size,
900+
commit);
901+
}
685902
dmu_tx_commit(tx);
686903
}
687904
}
@@ -1278,7 +1495,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
12781495
g_wither_provider(pp, ENXIO);
12791496

12801497
pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
1281-
pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
1498+
pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND |
1499+
G_PF_ACCEPT_UNMAPPED;
12821500
pp->sectorsize = DEV_BSIZE;
12831501
pp->mediasize = zv->zv_volsize;
12841502
pp->private = zv;
@@ -1361,7 +1579,8 @@ zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize,
13611579
gp->start = zvol_geom_bio_start;
13621580
gp->access = zvol_geom_access;
13631581
pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
1364-
pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
1582+
pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND |
1583+
G_PF_ACCEPT_UNMAPPED;
13651584
pp->sectorsize = DEV_BSIZE;
13661585
pp->mediasize = 0;
13671586
pp->private = zv;

module/os/linux/zfs/zfs_uio.c

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,16 @@ zfs_uioskip(zfs_uio_t *uio, size_t n)
359359
uio->uio_bvec++;
360360
uio->uio_iovcnt--;
361361
}
362+
} else if (uio->uio_segflg == UIO_BVEC) {
363+
/*
364+
* When using a uio backed by a struct request (blk-mq),
365+
* the bvec pointers are not maintained during uioskip.
366+
* Callers (e.g. zvol_dio_read) derive page mappings
367+
* directly from the request using zvol_dio_get_pages(),
368+
* which walks the request segments independently using
369+
* the uio_loffset. We only need to advance the logical
370+
* offset and resid — no bvec accounting needed.
371+
*/
362372
} else if (uio->uio_segflg == UIO_ITER) {
363373
iov_iter_advance(uio->uio_iter, n);
364374
} else {
@@ -403,6 +413,35 @@ zfs_uio_page_aligned(zfs_uio_t *uio)
403413
unsigned long alignment =
404414
iov_iter_alignment(uio->uio_iter);
405415
aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
416+
} else if (uio->uio_segflg == UIO_BVEC) {
417+
/*
418+
* For bio_vec-backed I/O (zvols), check that each
419+
* segment is page-aligned. The block layer typically
420+
* allocates page-aligned I/O, so this should almost
421+
* always pass.
422+
*/
423+
if (uio->rq != NULL) {
424+
struct bio_vec bv;
425+
struct req_iterator iter;
426+
rq_for_each_segment(bv, uio->rq, iter) {
427+
if (!IS_P2ALIGNED(bv.bv_offset, PAGE_SIZE) ||
428+
!IS_P2ALIGNED(bv.bv_len, PAGE_SIZE)) {
429+
aligned = B_FALSE;
430+
break;
431+
}
432+
}
433+
} else if (uio->uio_bvec != NULL) {
434+
const struct bio_vec *bv = uio->uio_bvec;
435+
for (int i = 0; i < uio->uio_iovcnt; i++, bv++) {
436+
if (!IS_P2ALIGNED(bv->bv_offset, PAGE_SIZE) ||
437+
!IS_P2ALIGNED(bv->bv_len, PAGE_SIZE)) {
438+
aligned = B_FALSE;
439+
break;
440+
}
441+
}
442+
} else {
443+
aligned = B_FALSE;
444+
}
406445
} else {
407446
/* Currently not supported */
408447
aligned = B_FALSE;

0 commit comments

Comments
 (0)