Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions include/os/linux/spl/sys/rwlock.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ spl_rw_clear_owner(krwlock_t *rwp)
rwp->rw_owner = NULL;
}

static inline kthread_t *
rw_owner(krwlock_t *rwp)
static inline const kthread_t *
rw_owner(const krwlock_t *rwp)
{
return (rwp->rw_owner);
}
Expand Down Expand Up @@ -100,7 +100,7 @@ RW_LOCK_HELD(krwlock_t *rwp)
}

static inline int
RW_WRITE_HELD(krwlock_t *rwp)
RW_WRITE_HELD(const krwlock_t *rwp)
{
return (rw_owner(rwp) == current);
}
Expand Down
18 changes: 17 additions & 1 deletion include/sys/dbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ typedef struct dmu_buf_impl {
/* buffer holding our data */
arc_buf_t *db_buf;

/* db_mtx protects the members below */
/* db_mtx protects the members below, plus db_dirtycnt */
kmutex_t db_mtx;

/*
Expand Down Expand Up @@ -329,6 +329,22 @@ typedef struct dmu_buf_impl {
dmu_buf_user_t *db_user;
} dmu_buf_impl_t;

/*
* Assert that the value of db.db_data cannot currently be changed. Either
* it's locked, or it's in an immutable state.
*/
void assert_db_data_addr_locked(const dmu_buf_impl_t *db);
/*
* Assert that the provided dbuf's contents can only be accessed by the caller,
* and by no other thread. Either it must be locked, or in a state where
* locking is not required.
*/
#ifdef __linux__
void assert_db_data_contents_locked(dmu_buf_impl_t *db, boolean_t wr);
#else
void assert_db_data_contents_locked(const dmu_buf_impl_t *db, boolean_t wr);
#endif

#define DBUF_HASH_MUTEX(h, idx) \
(&(h)->hash_mutexes[(idx) & ((h)->hash_mutex_mask)])

Expand Down
88 changes: 75 additions & 13 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,37 @@ static unsigned long dbuf_metadata_cache_target_bytes(void);
static uint_t dbuf_cache_hiwater_pct = 10;
static uint_t dbuf_cache_lowater_pct = 10;

/*
 * Assert that the address stored in db.db_data cannot currently change.
 * Indirect blocks (db_level > 0) and meta-dnode buffers are exempt from
 * this check; for every other buffer the caller must hold db_mtx.
 */
void
assert_db_data_addr_locked(const dmu_buf_impl_t *db)
{
	/* Only level-0, non-meta-dnode buffers require db_mtx here. */
	if (db->db_level == 0 && db->db.db_object != DMU_META_DNODE_OBJECT)
		ASSERT(MUTEX_HELD(&db->db_mtx));
}

/*
 * Assert that the contents pointed to by db.db_data may be accessed by the
 * caller: either db_rwlock is held (as writer when 'writer' is TRUE, as
 * reader or writer otherwise), or the buffer is of a kind that db_rwlock
 * does not protect and therefore needs no locking.
 *
 * NOTE(review): the Linux variant takes a non-const dbuf — presumably
 * because one of the RW_*_HELD checks in the Linux SPL is not fully
 * const-qualified; confirm against include/os/linux/spl/sys/rwlock.h.
 */
void
#ifdef __linux__
assert_db_data_contents_locked(dmu_buf_impl_t *db, boolean_t writer)
#else
assert_db_data_contents_locked(const dmu_buf_impl_t *db, boolean_t writer)
#endif
{
	/*
	 * db_rwlock protects indirect blocks and the data block of the meta
	 * dnode.
	 */
	/* Level-0 blocks of ordinary objects are not covered by db_rwlock. */
	if (db->db_level == 0 && db->db.db_object != DMU_META_DNODE_OBJECT)
		return;
	/* Bonus and spill buffers are likewise outside db_rwlock's scope. */
	if (db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID)
		return;
	if (writer)
		ASSERT(RW_WRITE_HELD(&db->db_rwlock));
	else
		ASSERT(RW_LOCK_HELD(&db->db_rwlock));
}

static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
Expand Down Expand Up @@ -1706,6 +1737,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
assert_db_data_contents_locked(db, FALSE);
memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
dnode_t *dn = DB_DNODE(db);
Expand Down Expand Up @@ -1736,6 +1768,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
} else {
dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
}
assert_db_data_contents_locked(db, FALSE);
memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
} else {
db->db_buf = NULL;
Expand Down Expand Up @@ -3028,6 +3061,7 @@ dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
/* we were freed while filling */
/* XXX dbuf_undirty? */
assert_db_data_contents_locked(db, TRUE);
memset(db->db.db_data, 0, db->db.db_size);
db->db_freed_in_flight = FALSE;
db->db_state = DB_CACHED;
Expand Down Expand Up @@ -3160,6 +3194,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
ASSERT(!arc_is_encrypted(buf));
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
assert_db_data_contents_locked(db, TRUE);
memcpy(db->db.db_data, buf->b_data, db->db.db_size);
arc_buf_destroy(buf, db);
return;
Expand Down Expand Up @@ -3403,6 +3438,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
*parentp = NULL;
return (err);
}
assert_db_data_addr_locked(*parentp);
*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
(blkid & ((1ULL << epbs) - 1));
return (0);
Expand Down Expand Up @@ -4589,10 +4625,12 @@ dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
} else {
dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
assert_db_data_addr_locked(parent_db);
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
VERIFY3U(parent_db->db_level, ==, 1);
VERIFY3P(DB_DNODE(parent_db), ==, dn);
VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
assert_db_data_contents_locked(parent_db, FALSE);
blkptr_t *bp = parent_db->db.db_data;
return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
}
Expand All @@ -4603,12 +4641,22 @@ dbuf_lightweight_ready(zio_t *zio)
{
dbuf_dirty_record_t *dr = zio->io_private;
blkptr_t *bp = zio->io_bp;
dmu_buf_impl_t *parent_db = NULL;

if (zio->io_error != 0)
return;

dnode_t *dn = dr->dr_dnode;

EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
if (dr->dr_parent == NULL) {
parent_db = dn->dn_dbuf;
} else {
parent_db = dr->dr_parent->dr_dbuf;
}

assert_db_data_addr_locked(parent_db);
rw_enter(&parent_db->db_rwlock, RW_WRITER);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we could move the dn_maxblkid update below before the lock acquisition (or after the release?) so we would not have to think about the lock ordering. The two seem unrelated.

blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
spa_t *spa = dmu_objset_spa(dn->dn_objset);
int64_t delta = bp_get_dsize_sync(spa, bp) -
Expand All @@ -4628,14 +4676,6 @@ dbuf_lightweight_ready(zio_t *zio)
BP_SET_FILL(bp, fill);
}

dmu_buf_impl_t *parent_db;
EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
if (dr->dr_parent == NULL) {
parent_db = dn->dn_dbuf;
} else {
parent_db = dr->dr_parent->dr_dbuf;
}
rw_enter(&parent_db->db_rwlock, RW_WRITER);
*bp_orig = *bp;
rw_exit(&parent_db->db_rwlock);
}
Expand Down Expand Up @@ -4669,6 +4709,7 @@ noinline static void
dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
dnode_t *dn = dr->dr_dnode;
dmu_buf_impl_t *parent_db = NULL;
zio_t *pio;
if (dn->dn_phys->dn_nlevels == 1) {
pio = dn->dn_zio;
Expand All @@ -4687,6 +4728,11 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
* See comment in dbuf_write(). This is so that zio->io_bp_orig
* will have the old BP in dbuf_lightweight_done().
*/
if (dr->dr_dnode->dn_phys->dn_nlevels != 1) {
parent_db = dr->dr_parent->dr_dbuf;
assert_db_data_addr_locked(parent_db);
rw_enter(&parent_db->db_rwlock, RW_READER);
Comment on lines +4731 to +4734
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you think this lock is needed, shouldn't this block be similar to one in dbuf_lightweight_ready() above?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Similar" how?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Locking dr->dr_parent->dr_dbuf if it is present or dn->dn_dbuf otherwise, as you've done two chunks above? I see dbuf_lightweight_bp() does check for dn_nlevels == 1 to decide what to access, but I wonder if it is equivalent.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If anything, I think the logic in dbuf_lightweight_ready is wrong. Because dbuf_lightweight_bp doesn't actually access any db_data field if dn->dn_phys->dn_nlevels == 1. So I think that both places should look more like the code here. Do you agree? The relevant code was introduced in commit ba67d82 .

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If dn->dn_phys->dn_nlevels == 1, dbuf_lightweight_bp() returns pointer on dn_phys content, which is a part of dn->dn_dbuf. And I guess access to one may need to be locked, isn't it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My double-check says that there are many places where dn_phys is accessed without locking db_rwlock. In fact, it's harder to find places where db_rwlock is locked. So either you are mistaken, or we need another PR to clean up dn_phys accesses just like this PR does for db_rwlock.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dn_phys includes many things aside of the block pointers, while db_rwlock protects only block pointers. Some other fields may be protected by dn_struct_rwlock, etc.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Couple chunks below in dbuf_write_ready() you take db_rwlock on the dnode buffer. Though both cases are reads in sync context, and I would not expect them to race.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you prove that they won't race? If so, we can add that proof to assert_db_data_contents_locked and use that here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should follow from the way sync thread works. Block pointers in a parent block are modified/filled by children ZIOs, and its own ZIO/logic should wait for all the children ZIOs to complete before reading them. Would something be modified out of order, we would be in a deep trouble, with or without this locking. Open context readers though are out of this loop, and so they may require locking.

}
dr->dr_bp_copy = *dbuf_lightweight_bp(dr);

dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
Expand All @@ -4696,6 +4742,9 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);

if (parent_db)
rw_exit(&parent_db->db_rwlock);

zio_nowait(dr->dr_zio);
}

Expand Down Expand Up @@ -4852,6 +4901,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
} else {
*datap = arc_alloc_buf(os->os_spa, db, type, psize);
}
assert_db_data_contents_locked(db, FALSE);
memcpy((*datap)->b_data, db->db.db_data, psize);
}
db->db_data_pending = dr;
Expand Down Expand Up @@ -4958,6 +5008,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)

if (dn->dn_type == DMU_OT_DNODE) {
i = 0;
rw_enter(&db->db_rwlock, RW_READER);
while (i < db->db.db_size) {
dnode_phys_t *dnp =
(void *)(((char *)db->db.db_data) + i);
Expand All @@ -4983,6 +5034,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
DNODE_MIN_SIZE;
}
}
rw_exit(&db->db_rwlock);
} else {
if (BP_IS_HOLE(bp)) {
fill = 0;
Expand All @@ -4991,6 +5043,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
}
}
} else {
rw_enter(&db->db_rwlock, RW_READER);
blkptr_t *ibp = db->db.db_data;
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
Expand All @@ -5000,6 +5053,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
fill += BP_GET_FILL(ibp);
}
rw_exit(&db->db_rwlock);
}
DB_DNODE_EXIT(db);

Expand Down Expand Up @@ -5034,6 +5088,8 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
DB_DNODE_EXIT(db);
ASSERT3U(epbs, <, 31);

assert_db_data_addr_locked(db);
rw_enter(&db->db_rwlock, RW_READER);
/* Determine if all our children are holes */
for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
if (!BP_IS_HOLE(bp))
Expand All @@ -5050,10 +5106,13 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
* anybody from reading the blocks we're about to
* zero out.
*/
rw_enter(&db->db_rwlock, RW_WRITER);
if (!rw_tryupgrade(&db->db_rwlock)) {
rw_exit(&db->db_rwlock);
rw_enter(&db->db_rwlock, RW_WRITER);
}
memset(db->db.db_data, 0, db->db.db_size);
rw_exit(&db->db_rwlock);
}
rw_exit(&db->db_rwlock);
}

static void
Expand Down Expand Up @@ -5248,11 +5307,11 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
* avoid lock contention, only grab it when we are actually
* changing the BP.
*/
if (rw != NULL)
if (rw != NULL && !RW_WRITE_HELD(rw) && !rw_tryupgrade(rw)) {
rw_exit(rw);
rw_enter(rw, RW_WRITER);
}
*bp = bp_copy;
if (rw != NULL)
rw_exit(rw);
}
}

Expand All @@ -5268,6 +5327,8 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
return;

assert_db_data_addr_locked(db);
rw_enter(&db->db_rwlock, RW_READER);
if (db->db_level > 0) {
blkptr_t *bp = db->db.db_data;
for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
Expand All @@ -5286,6 +5347,7 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
}
}
}
rw_exit(&db->db_rwlock);
}


Expand Down
10 changes: 7 additions & 3 deletions module/zfs/dmu_objset.c
Original file line number Diff line number Diff line change
Expand Up @@ -2167,8 +2167,8 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
dbuf_dirty_record_t *dr;
void *data;

ASSERT(MUTEX_HELD(&db->db_mtx));
if (db->db_dirtycnt == 0) {
ASSERT(MUTEX_HELD(&db->db_mtx));
return (db->db.db_data); /* Nothing is changing */
}

Expand Down Expand Up @@ -2235,8 +2235,12 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
FTAG, (dmu_buf_t **)&db);
ASSERT0(error);
mutex_enter(&db->db_mtx);
data = (before) ? db->db.db_data :
dmu_objset_userquota_find_data(db, tx);
if (before) {
assert_db_data_contents_locked(db, FALSE);
data = db->db.db_data;
} else {
data = dmu_objset_userquota_find_data(db, tx);
}
have_spill = B_TRUE;
} else {
mutex_enter(&dn->dn_mtx);
Expand Down
9 changes: 9 additions & 0 deletions module/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,7 @@ dnode_verify(dnode_t *dn)
ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
if (dn->dn_dbuf != NULL) {
assert_db_data_addr_locked(dn->dn_dbuf);
ASSERT3P(dn->dn_phys, ==,
(dnode_phys_t *)dn->dn_dbuf->db.db_data +
(dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
Expand Down Expand Up @@ -1522,6 +1523,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
epb = db->db.db_size >> DNODE_SHIFT;

idx = object & (epb - 1);
assert_db_data_addr_locked(db);
dn_block = (dnode_phys_t *)db->db.db_data;

ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
Expand Down Expand Up @@ -1608,8 +1610,10 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
dn = dnh->dnh_dnode;
} else {
rw_enter(&db->db_rwlock, RW_READER);
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
rw_exit(&db->db_rwlock);
dmu_buf_add_user_size(&db->db,
sizeof (dnode_t));
}
Expand Down Expand Up @@ -1681,8 +1685,10 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
dn = dnh->dnh_dnode;
} else {
rw_enter(&db->db_rwlock, RW_READER);
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
rw_exit(&db->db_rwlock);
dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
}

Expand Down Expand Up @@ -2200,8 +2206,10 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
if (db->db_level != 1 || db->db_blkid >= end_blkid)
break;
mutex_enter(&db->db_mtx);
if (db->db_state != DB_EVICTING)
ASSERT(db->db_dirtycnt > 0);
mutex_exit(&db->db_mtx);
}
#endif
kmem_free(db_search, sizeof (dmu_buf_impl_t));
Expand Down Expand Up @@ -2557,6 +2565,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, int lvl, uint64_t blkid,
dbuf_rele(db, FTAG);
return (error);
}
assert_db_data_addr_locked(db);
data = db->db.db_data;
rw_enter(&db->db_rwlock, RW_READER);
}
Expand Down
Loading
Loading