|
99 | 99 | #include <geom/geom.h> |
100 | 100 | #include <sys/zvol.h> |
101 | 101 | #include <sys/zvol_impl.h> |
| 102 | +#include <sys/abd.h> |
| 103 | +#include <sys/dmu_impl.h> |
102 | 104 | #include <cityhash.h> |
103 | 105 |
|
104 | 106 | #include "zfs_namecheck.h" |
@@ -154,6 +156,21 @@ SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN, |
154 | 156 | */ |
155 | 157 | int zvol_maxphys = DMU_MAX_ACCESS / 2; |
156 | 158 |
|
| 159 | +/* |
| 160 | + * Enable Direct I/O for zvols. When enabled, page-aligned reads and |
| 161 | + * block-aligned writes will bypass the ARC and DMA directly into/from |
| 162 | + * the bio_data buffer, avoiding the data copy overhead. |
| 163 | + * |
| 164 | + * This is particularly beneficial for high-bandwidth NVMe-oF workloads |
| 165 | + * where the CPU memcpy bottleneck limits throughput. |
| 166 | + * |
| 167 | + * Default: 0 (disabled) for safety. Set to 1 to enable. |
| 168 | + */ |
| 169 | +static int zvol_dio_enabled = 0; |
| 170 | +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, dio_enabled, CTLFLAG_RWTUN, |
| 171 | + &zvol_dio_enabled, 0, |
| 172 | + "Enable Direct I/O for zvols (bypass ARC, DMA directly to/from bio)"); |
| 173 | + |
157 | 174 | static void zvol_ensure_zilog(zvol_state_t *zv); |
158 | 175 |
|
159 | 176 | static d_open_t zvol_cdev_open; |
@@ -579,6 +596,126 @@ zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn) |
579 | 596 | return (0); |
580 | 597 | } |
581 | 598 |
|
| 599 | +/* |
| 600 | + * Determine if a zvol read can use the Direct I/O path. |
| 601 | + * |
| 602 | + * For unmapped (scattered) BIOs, alignment is checked per-page. |
| 603 | + * For mapped (linear) BIOs, alignment is checked on the buffer address. |
| 604 | + * |
| 605 | + * Requirements: |
| 606 | + * - Direct I/O must be enabled (zvol_dio_enabled) |
| 607 | + * - The I/O offset and size must be page-aligned |
| 608 | + * - The buffer must be page-aligned (either addr for mapped, or |
| 609 | + * bio_ma_offset for unmapped) |
| 610 | + */ |
| 611 | +static boolean_t |
| 612 | +zvol_dio_can_read(struct bio *bp, uint64_t off, size_t size) |
| 613 | +{ |
| 614 | + if (!zvol_dio_enabled) |
| 615 | + return (B_FALSE); |
| 616 | + |
| 617 | + if (size == 0) |
| 618 | + return (B_FALSE); |
| 619 | + |
| 620 | + if (!zfs_dio_aligned(off, size, PAGESIZE)) |
| 621 | + return (B_FALSE); |
| 622 | + |
| 623 | + /* |
| 624 | + * For unmapped BIOs, the bio_ma array contains physical page |
| 625 | + * pointers; the bio_ma_offset is the byte offset within the |
| 626 | + * first page (may be non-zero). No additional buffer address |
| 627 | + * alignment check needed since pages are always aligned. |
| 628 | + */ |
| 629 | + if (bp->bio_flags & BIO_UNMAPPED) |
| 630 | + return (B_TRUE); |
| 631 | + |
| 632 | + /* For mapped BIOs, check the linear buffer address. */ |
| 633 | + if (!zfs_dio_page_aligned(bp->bio_data)) |
| 634 | + return (B_FALSE); |
| 635 | + |
| 636 | + return (B_TRUE); |
| 637 | +} |
| 638 | + |
| 639 | +/* |
| 640 | + * Determine if a zvol write can use the Direct I/O path. |
| 641 | + * |
| 642 | + * Requirements: |
| 643 | + * - Direct I/O must be enabled |
| 644 | + * - The write must be block-aligned (volblocksize) |
| 645 | + * - The write must be at least one full volblocksize |
| 646 | + * - For mapped BIOs: buffer address must be page-aligned |
| 647 | + * - For unmapped BIOs: page pointers are always aligned; |
| 648 | + * bio_ma_offset is the byte offset within the first page |
| 649 | + */ |
| 650 | +static boolean_t |
| 651 | +zvol_dio_can_write(zvol_state_t *zv, struct bio *bp, |
| 652 | + uint64_t off, size_t size) |
| 653 | +{ |
| 654 | + if (!zvol_dio_enabled) |
| 655 | + return (B_FALSE); |
| 656 | + |
| 657 | + if (size < zv->zv_volblocksize) |
| 658 | + return (B_FALSE); |
| 659 | + |
| 660 | + if (!zfs_dio_aligned(off, size, zv->zv_volblocksize)) |
| 661 | + return (B_FALSE); |
| 662 | + |
| 663 | + /* For mapped BIOs, check the linear buffer address. */ |
| 664 | + if (!(bp->bio_flags & BIO_UNMAPPED) && |
| 665 | + !zfs_dio_page_aligned(bp->bio_data)) |
| 666 | + return (B_FALSE); |
| 667 | + |
| 668 | + return (B_TRUE); |
| 669 | +} |
| 670 | + |
| 671 | +/* |
| 672 | + * Perform a Direct I/O read on a zvol, bypassing the ARC. |
| 673 | + * |
| 674 | + * For unmapped (scattered) BIOs, creates a scattered ABD from the |
| 675 | + * bio_ma page array for true zero-copy DMA into the consumer's pages. |
| 676 | + * For mapped (linear) BIOs, wraps bio_data in a linear ABD. |
| 677 | + */ |
| 678 | +static int |
| 679 | +zvol_dio_read(zvol_state_t *zv, struct bio *bp, uint64_t off, size_t size) |
| 680 | +{ |
| 681 | + abd_t *abd; |
| 682 | + int error; |
| 683 | + |
| 684 | + if (bp->bio_flags & BIO_UNMAPPED) { |
| 685 | + abd = abd_alloc_from_pages(bp->bio_ma, bp->bio_ma_offset, size); |
| 686 | + } else { |
| 687 | + abd = abd_get_from_buf(bp->bio_data, size); |
| 688 | + } |
| 689 | + error = dmu_read_abd(zv->zv_dn, off, size, abd, DMU_DIRECTIO); |
| 690 | + abd_free(abd); |
| 691 | + return (error); |
| 692 | +} |
| 693 | + |
| 694 | +/* |
| 695 | + * Perform a Direct I/O write on a zvol, bypassing the ARC. |
| 696 | + * |
| 697 | + * For unmapped (scattered) BIOs, creates a scattered ABD from the |
| 698 | + * bio_ma page array for true zero-copy DMA from the consumer's pages. |
| 699 | + * For mapped (linear) BIOs, wraps bio_data in a linear ABD. |
| 700 | + * This is a synchronous write — it waits for the I/O to complete. |
| 701 | + */ |
| 702 | +static int |
| 703 | +zvol_dio_write(zvol_state_t *zv, struct bio *bp, uint64_t off, size_t size, |
| 704 | + dmu_tx_t *tx) |
| 705 | +{ |
| 706 | + abd_t *abd; |
| 707 | + int error; |
| 708 | + |
| 709 | + if (bp->bio_flags & BIO_UNMAPPED) { |
| 710 | + abd = abd_alloc_from_pages(bp->bio_ma, bp->bio_ma_offset, size); |
| 711 | + } else { |
| 712 | + abd = abd_get_from_buf(bp->bio_data, size); |
| 713 | + } |
| 714 | + error = dmu_write_abd(zv->zv_dn, off, size, abd, DMU_DIRECTIO, tx); |
| 715 | + abd_free(abd); |
| 716 | + return (error); |
| 717 | +} |
| 718 | + |
582 | 719 | static void |
583 | 720 | zvol_strategy_impl(zv_request_t *zvr) |
584 | 721 | { |
@@ -670,18 +807,98 @@ zvol_strategy_impl(zv_request_t *zvr) |
670 | 807 | while (resid != 0 && off < volsize) { |
671 | 808 | size_t size = MIN(resid, zvol_maxphys); |
672 | 809 | if (doread) { |
673 | | - error = dmu_read_by_dnode(zv->zv_dn, off, size, addr, |
674 | | - DMU_READ_PREFETCH); |
| 810 | + /* |
| 811 | + * Try Direct I/O first for page-aligned reads. |
| 812 | + * This bypasses the ARC and DMAs data directly |
| 813 | + * into the bio_data/bio_ma buffer. On checksum |
| 814 | + * error, fall back to the ARC path for safety. |
| 815 | + */ |
| 816 | + if (zvol_dio_can_read(bp, off, size)) { |
| 817 | + error = zvol_dio_read(zv, bp, off, size); |
| 818 | + if (error == ECKSUM) { |
| 819 | + /* |
| 820 | + * For unmapped BIOs with ECKSUM, |
| 821 | + * create a temp ABD, borrow/copy |
| 822 | + * the linear buffer for ARC retry, |
| 823 | + * then copy results back to pages. |
| 824 | + */ |
| 825 | + abd_t *tmp = NULL; |
| 826 | + if (bp->bio_flags & BIO_UNMAPPED) { |
| 827 | + tmp = abd_alloc_from_pages( |
| 828 | + bp->bio_ma, |
| 829 | + bp->bio_ma_offset, size); |
| 830 | + addr = abd_borrow_buf_copy( |
| 831 | + tmp, size); |
| 832 | + } |
| 833 | + error = dmu_read_by_dnode(zv->zv_dn, |
| 834 | + off, size, addr, |
| 835 | + DMU_READ_PREFETCH); |
| 836 | + if (bp->bio_flags & BIO_UNMAPPED) { |
| 837 | + abd_return_buf_copy( |
| 838 | + tmp, addr, size); |
| 839 | + abd_free(tmp); |
| 840 | + } |
| 841 | + } |
| 842 | + } else { |
| 843 | + abd_t *tmp = NULL; |
| 844 | + if (bp->bio_flags & BIO_UNMAPPED) { |
| 845 | + tmp = abd_alloc_from_pages( |
| 846 | + bp->bio_ma, |
| 847 | + bp->bio_ma_offset, size); |
| 848 | + addr = abd_borrow_buf_copy( |
| 849 | + tmp, size); |
| 850 | + } |
| 851 | + error = dmu_read_by_dnode(zv->zv_dn, off, size, |
| 852 | + addr, DMU_READ_PREFETCH); |
| 853 | + if (bp->bio_flags & BIO_UNMAPPED) { |
| 854 | + abd_return_buf_copy(tmp, addr, size); |
| 855 | + abd_free(tmp); |
| 856 | + } |
| 857 | + } |
675 | 858 | } else { |
676 | 859 | dmu_tx_t *tx = dmu_tx_create(os); |
677 | 860 | dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size); |
678 | 861 | error = dmu_tx_assign(tx, DMU_TX_WAIT); |
679 | 862 | if (error) { |
680 | 863 | dmu_tx_abort(tx); |
681 | 864 | } else { |
682 | | - dmu_write_by_dnode(zv->zv_dn, off, size, addr, |
683 | | - tx, DMU_READ_PREFETCH); |
684 | | - zvol_log_write(zv, tx, off, size, commit); |
| 865 | + /* |
| 866 | + * Try Direct I/O for block-aligned writes. |
| 867 | + * This bypasses the ARC and writes directly |
| 868 | + * to disk, avoiding the data copy overhead. |
| 869 | + */ |
| 870 | + if (zvol_dio_can_write(zv, bp, off, size)) { |
| 871 | + error = zvol_dio_write(zv, bp, off, |
| 872 | + size, tx); |
| 873 | + if (error == 0) |
| 874 | + zvol_log_write(zv, tx, off, |
| 875 | + size, commit); |
| 876 | + } else { |
| 877 | + /* |
| 878 | + * For unmapped BIOs falling back |
| 879 | + * to ARC, create a temp ABD from |
| 880 | + * scatter pages, borrow a linear |
| 881 | + * buffer, write it, then release. |
| 882 | + */ |
| 883 | + abd_t *tmp = NULL; |
| 884 | + if (bp->bio_flags & BIO_UNMAPPED) { |
| 885 | + tmp = abd_alloc_from_pages( |
| 886 | + bp->bio_ma, |
| 887 | + bp->bio_ma_offset, size); |
| 888 | + addr = abd_borrow_buf_copy( |
| 889 | + tmp, size); |
| 890 | + } |
| 891 | + dmu_write_by_dnode(zv->zv_dn, off, |
| 892 | + size, addr, tx, |
| 893 | + DMU_READ_PREFETCH); |
| 894 | + if (bp->bio_flags & BIO_UNMAPPED) { |
| 895 | + abd_return_buf(tmp, addr, |
| 896 | + size); |
| 897 | + abd_free(tmp); |
| 898 | + } |
| 899 | + zvol_log_write(zv, tx, off, size, |
| 900 | + commit); |
| 901 | + } |
685 | 902 | dmu_tx_commit(tx); |
686 | 903 | } |
687 | 904 | } |
@@ -1278,7 +1495,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) |
1278 | 1495 | g_wither_provider(pp, ENXIO); |
1279 | 1496 |
|
1280 | 1497 | pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); |
1281 | | - pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; |
| 1498 | + pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND | |
| 1499 | + G_PF_ACCEPT_UNMAPPED; |
1282 | 1500 | pp->sectorsize = DEV_BSIZE; |
1283 | 1501 | pp->mediasize = zv->zv_volsize; |
1284 | 1502 | pp->private = zv; |
@@ -1361,7 +1579,8 @@ zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize, |
1361 | 1579 | gp->start = zvol_geom_bio_start; |
1362 | 1580 | gp->access = zvol_geom_access; |
1363 | 1581 | pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); |
1364 | | - pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; |
| 1582 | + pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND | |
| 1583 | + G_PF_ACCEPT_UNMAPPED; |
1365 | 1584 | pp->sectorsize = DEV_BSIZE; |
1366 | 1585 | pp->mediasize = 0; |
1367 | 1586 | pp->private = zv; |
|
0 commit comments