Skip to content

Commit 8949b9a

Browse files
fdmananakdave
authored andcommitted
btrfs: fix lock inversion problem when doing qgroup extent tracing
At btrfs_qgroup_trace_extent_post() we call btrfs_find_all_roots() with a NULL value as the transaction handle argument, which makes that function take the commit_root_sem semaphore, which is necessary when we don't hold a transaction handle or any other mechanism to prevent a transaction commit from wiping out commit roots. However btrfs_qgroup_trace_extent_post() can be called in a context where we are holding a write lock on an extent buffer from a subvolume tree, namely from btrfs_truncate_inode_items(), called either during truncate or unlink operations. In this case we end up with a lock inversion problem because the commit_root_sem is a higher level lock, always supposed to be acquired before locking any extent buffer. Lockdep detects this lock inversion problem since we switched the extent buffer locks from custom locks to semaphores, and when running btrfs/158 from fstests, it reported the following trace: [ 9057.626435] ====================================================== [ 9057.627541] WARNING: possible circular locking dependency detected [ 9057.628334] 5.14.0-rc2-btrfs-next-93 #1 Not tainted [ 9057.628961] ------------------------------------------------------ [ 9057.629867] kworker/u16:4/30781 is trying to acquire lock: [ 9057.630824] ffff8e2590f58760 (btrfs-tree-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x24/0x110 [btrfs] [ 9057.632542] but task is already holding lock: [ 9057.633551] ffff8e25582d4b70 (&fs_info->commit_root_sem){++++}-{3:3}, at: iterate_extent_inodes+0x10b/0x280 [btrfs] [ 9057.635255] which lock already depends on the new lock. [ 9057.636292] the existing dependency chain (in reverse order) is: [ 9057.637240] -> #1 (&fs_info->commit_root_sem){++++}-{3:3}: [ 9057.638138] down_read+0x46/0x140 [ 9057.638648] btrfs_find_all_roots+0x41/0x80 [btrfs] [ 9057.639398] btrfs_qgroup_trace_extent_post+0x37/0x70 [btrfs] [ 9057.640283] btrfs_add_delayed_data_ref+0x418/0x490 [btrfs] [ 9057.641114] btrfs_free_extent+0x35/0xb0 [btrfs] [ 9057.641819] btrfs_truncate_inode_items+0x424/0xf70 [btrfs] [ 9057.642643] btrfs_evict_inode+0x454/0x4f0 [btrfs] [ 9057.643418] evict+0xcf/0x1d0 [ 9057.643895] do_unlinkat+0x1e9/0x300 [ 9057.644525] do_syscall_64+0x3b/0xc0 [ 9057.645110] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 9057.645835] -> #0 (btrfs-tree-00){++++}-{3:3}: [ 9057.646600] __lock_acquire+0x130e/0x2210 [ 9057.647248] lock_acquire+0xd7/0x310 [ 9057.647773] down_read_nested+0x4b/0x140 [ 9057.648350] __btrfs_tree_read_lock+0x24/0x110 [btrfs] [ 9057.649175] btrfs_read_lock_root_node+0x31/0x40 [btrfs] [ 9057.650010] btrfs_search_slot+0x537/0xc00 [btrfs] [ 9057.650849] scrub_print_warning_inode+0x89/0x370 [btrfs] [ 9057.651733] iterate_extent_inodes+0x1e3/0x280 [btrfs] [ 9057.652501] scrub_print_warning+0x15d/0x2f0 [btrfs] [ 9057.653264] scrub_handle_errored_block.isra.0+0x135f/0x1640 [btrfs] [ 9057.654295] scrub_bio_end_io_worker+0x101/0x2e0 [btrfs] [ 9057.655111] btrfs_work_helper+0xf8/0x400 [btrfs] [ 9057.655831] process_one_work+0x247/0x5a0 [ 9057.656425] worker_thread+0x55/0x3c0 [ 9057.656993] kthread+0x155/0x180 [ 9057.657494] ret_from_fork+0x22/0x30 [ 9057.658030] other info that might help us debug this: [ 9057.659064] Possible unsafe locking scenario: [ 9057.659824] CPU0 CPU1 [ 9057.660402] ---- ---- [ 9057.660988] lock(&fs_info->commit_root_sem); [ 9057.661581] lock(btrfs-tree-00); [ 9057.662348] lock(&fs_info->commit_root_sem); [ 9057.663254] lock(btrfs-tree-00); [ 9057.663690] *** DEADLOCK *** [ 9057.664437] 4 locks held by kworker/u16:4/30781: [ 9057.665023] #0: ffff8e25922a1148 ((wq_completion)btrfs-scrub){+.+.}-{0:0}, at: process_one_work+0x1c7/0x5a0 [ 9057.666260] #1: ffffabb3451ffe70 ((work_completion)(&work->normal_work)){+.+.}-{0:0}, at: process_one_work+0x1c7/0x5a0 [ 9057.667639] #2: ffff8e25922da198 (&ret->mutex){+.+.}-{3:3}, at: scrub_handle_errored_block.isra.0+0x5d2/0x1640 [btrfs] [ 9057.669017] #3: ffff8e25582d4b70 (&fs_info->commit_root_sem){++++}-{3:3}, at: iterate_extent_inodes+0x10b/0x280 [btrfs] [ 9057.670408] stack backtrace: [ 9057.670976] CPU: 7 PID: 30781 Comm: kworker/u16:4 Not tainted 5.14.0-rc2-btrfs-next-93 #1 [ 9057.672030] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 9057.673492] Workqueue: btrfs-scrub btrfs_work_helper [btrfs] [ 9057.674258] Call Trace: [ 9057.674588] dump_stack_lvl+0x57/0x72 [ 9057.675083] check_noncircular+0xf3/0x110 [ 9057.675611] __lock_acquire+0x130e/0x2210 [ 9057.676132] lock_acquire+0xd7/0x310 [ 9057.676605] ? __btrfs_tree_read_lock+0x24/0x110 [btrfs] [ 9057.677313] ? lock_is_held_type+0xe8/0x140 [ 9057.677849] down_read_nested+0x4b/0x140 [ 9057.678349] ? __btrfs_tree_read_lock+0x24/0x110 [btrfs] [ 9057.679068] __btrfs_tree_read_lock+0x24/0x110 [btrfs] [ 9057.679760] btrfs_read_lock_root_node+0x31/0x40 [btrfs] [ 9057.680458] btrfs_search_slot+0x537/0xc00 [btrfs] [ 9057.681083] ? _raw_spin_unlock+0x29/0x40 [ 9057.681594] ? btrfs_find_all_roots_safe+0x11f/0x140 [btrfs] [ 9057.682336] scrub_print_warning_inode+0x89/0x370 [btrfs] [ 9057.683058] ? btrfs_find_all_roots_safe+0x11f/0x140 [btrfs] [ 9057.683834] ? scrub_write_block_to_dev_replace+0xb0/0xb0 [btrfs] [ 9057.684632] iterate_extent_inodes+0x1e3/0x280 [btrfs] [ 9057.685316] scrub_print_warning+0x15d/0x2f0 [btrfs] [ 9057.685977] ? ___ratelimit+0xa4/0x110 [ 9057.686460] scrub_handle_errored_block.isra.0+0x135f/0x1640 [btrfs] [ 9057.687316] scrub_bio_end_io_worker+0x101/0x2e0 [btrfs] [ 9057.688021] btrfs_work_helper+0xf8/0x400 [btrfs] [ 9057.688649] ? lock_is_held_type+0xe8/0x140 [ 9057.689180] process_one_work+0x247/0x5a0 [ 9057.689696] worker_thread+0x55/0x3c0 [ 9057.690175] ? process_one_work+0x5a0/0x5a0 [ 9057.690731] kthread+0x155/0x180 [ 9057.691158] ? set_kthread_struct+0x40/0x40 [ 9057.691697] ret_from_fork+0x22/0x30 Fix this by making btrfs_find_all_roots() never attempt to lock the commit_root_sem when it is called from btrfs_qgroup_trace_extent_post(). We can't just pass a non-NULL transaction handle to btrfs_find_all_roots() from btrfs_qgroup_trace_extent_post(), because that would make backref lookup not use commit roots and acquire read locks on extent buffers, and therefore could deadlock when btrfs_qgroup_trace_extent_post() is called from the btrfs_truncate_inode_items() code path which has acquired a write lock on an extent buffer of the subvolume btree. CC: [email protected] # 4.19+ Reviewed-by: Qu Wenruo <[email protected]> Signed-off-by: Filipe Manana <[email protected]> Signed-off-by: David Sterba <[email protected]>
1 parent 16a200f commit 8949b9a

File tree

6 files changed

+48
-25
lines changed

6 files changed

+48
-25
lines changed

fs/btrfs/backref.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1488,15 +1488,15 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
14881488
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
14891489
struct btrfs_fs_info *fs_info, u64 bytenr,
14901490
u64 time_seq, struct ulist **roots,
1491-
bool ignore_offset)
1491+
bool ignore_offset, bool skip_commit_root_sem)
14921492
{
14931493
int ret;
14941494

1495-
if (!trans)
1495+
if (!trans && !skip_commit_root_sem)
14961496
down_read(&fs_info->commit_root_sem);
14971497
ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr,
14981498
time_seq, roots, ignore_offset);
1499-
if (!trans)
1499+
if (!trans && !skip_commit_root_sem)
15001500
up_read(&fs_info->commit_root_sem);
15011501
return ret;
15021502
}

fs/btrfs/backref.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
4747
const u64 *extent_item_pos, bool ignore_offset);
4848
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
4949
struct btrfs_fs_info *fs_info, u64 bytenr,
50-
u64 time_seq, struct ulist **roots, bool ignore_offset);
50+
u64 time_seq, struct ulist **roots, bool ignore_offset,
51+
bool skip_commit_root_sem);
5152
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
5253
u32 name_len, unsigned long name_off,
5354
struct extent_buffer *eb_in, u64 parent,

fs/btrfs/delayed-ref.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -974,7 +974,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
974974
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
975975

976976
if (qrecord_inserted)
977-
btrfs_qgroup_trace_extent_post(fs_info, record);
977+
btrfs_qgroup_trace_extent_post(trans, record);
978978

979979
return 0;
980980
}
@@ -1069,7 +1069,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
10691069

10701070

10711071
if (qrecord_inserted)
1072-
return btrfs_qgroup_trace_extent_post(fs_info, record);
1072+
return btrfs_qgroup_trace_extent_post(trans, record);
10731073
return 0;
10741074
}
10751075

fs/btrfs/qgroup.c

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1704,17 +1704,39 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
17041704
return 0;
17051705
}
17061706

1707-
int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
1707+
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
17081708
struct btrfs_qgroup_extent_record *qrecord)
17091709
{
17101710
struct ulist *old_root;
17111711
u64 bytenr = qrecord->bytenr;
17121712
int ret;
17131713

1714-
ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false);
1714+
/*
1715+
* We are always called in a context where we are already holding a
1716+
* transaction handle. Often we are called when adding a data delayed
1717+
* reference from btrfs_truncate_inode_items() (truncating or unlinking),
1718+
* in which case we will be holding a write lock on extent buffer from a
1719+
* subvolume tree. In this case we can't allow btrfs_find_all_roots() to
1720+
* acquire fs_info->commit_root_sem, because that is a higher level lock
1721+
* that must be acquired before locking any extent buffers.
1722+
*
1723+
* So we want btrfs_find_all_roots() to not acquire the commit_root_sem
1724+
* but we can't pass it a non-NULL transaction handle, because otherwise
1725+
* it would not use commit roots and would lock extent buffers, causing
1726+
* a deadlock if it ends up trying to read lock the same extent buffer
1727+
* that was previously write locked at btrfs_truncate_inode_items().
1728+
*
1729+
* So pass a NULL transaction handle to btrfs_find_all_roots() and
1730+
* explicitly tell it to not acquire the commit_root_sem - if we are
1731+
* holding a transaction handle we don't need its protection.
1732+
*/
1733+
ASSERT(trans != NULL);
1734+
1735+
ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
1736+
false, true);
17151737
if (ret < 0) {
1716-
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1717-
btrfs_warn(fs_info,
1738+
trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1739+
btrfs_warn(trans->fs_info,
17181740
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
17191741
ret);
17201742
return 0;
@@ -1758,7 +1780,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
17581780
kfree(record);
17591781
return 0;
17601782
}
1761-
return btrfs_qgroup_trace_extent_post(fs_info, record);
1783+
return btrfs_qgroup_trace_extent_post(trans, record);
17621784
}
17631785

17641786
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
@@ -2629,7 +2651,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
26292651
/* Search commit root to find old_roots */
26302652
ret = btrfs_find_all_roots(NULL, fs_info,
26312653
record->bytenr, 0,
2632-
&record->old_roots, false);
2654+
&record->old_roots, false, false);
26332655
if (ret < 0)
26342656
goto cleanup;
26352657
}
@@ -2645,7 +2667,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
26452667
* current root. It's safe inside commit_transaction().
26462668
*/
26472669
ret = btrfs_find_all_roots(trans, fs_info,
2648-
record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
2670+
record->bytenr, BTRFS_SEQ_LAST, &new_roots, false, false);
26492671
if (ret < 0)
26502672
goto cleanup;
26512673
if (qgroup_to_skip) {
@@ -3179,7 +3201,7 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
31793201
num_bytes = found.offset;
31803202

31813203
ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
3182-
&roots, false);
3204+
&roots, false, false);
31833205
if (ret < 0)
31843206
goto out;
31853207
/* For rescan, just pass old_roots as NULL */

fs/btrfs/qgroup.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ int btrfs_qgroup_trace_extent_nolock(
298298
* using current root, then we can move all expensive backref walk out of
299299
* transaction committing, but not now as qgroup accounting will be wrong again.
300300
*/
301-
int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
301+
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
302302
struct btrfs_qgroup_extent_record *qrecord);
303303

304304
/*

fs/btrfs/tests/qgroup-tests.c

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
224224
* quota.
225225
*/
226226
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
227-
false);
227+
false, false);
228228
if (ret) {
229229
ulist_free(old_roots);
230230
test_err("couldn't find old roots: %d", ret);
@@ -237,7 +237,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
237237
return ret;
238238

239239
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
240-
false);
240+
false, false);
241241
if (ret) {
242242
ulist_free(old_roots);
243243
ulist_free(new_roots);
@@ -261,7 +261,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
261261
new_roots = NULL;
262262

263263
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
264-
false);
264+
false, false);
265265
if (ret) {
266266
ulist_free(old_roots);
267267
test_err("couldn't find old roots: %d", ret);
@@ -273,7 +273,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
273273
return -EINVAL;
274274

275275
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
276-
false);
276+
false, false);
277277
if (ret) {
278278
ulist_free(old_roots);
279279
ulist_free(new_roots);
@@ -325,7 +325,7 @@ static int test_multiple_refs(struct btrfs_root *root,
325325
}
326326

327327
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
328-
false);
328+
false, false);
329329
if (ret) {
330330
ulist_free(old_roots);
331331
test_err("couldn't find old roots: %d", ret);
@@ -338,7 +338,7 @@ static int test_multiple_refs(struct btrfs_root *root,
338338
return ret;
339339

340340
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
341-
false);
341+
false, false);
342342
if (ret) {
343343
ulist_free(old_roots);
344344
ulist_free(new_roots);
@@ -360,7 +360,7 @@ static int test_multiple_refs(struct btrfs_root *root,
360360
}
361361

362362
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
363-
false);
363+
false, false);
364364
if (ret) {
365365
ulist_free(old_roots);
366366
test_err("couldn't find old roots: %d", ret);
@@ -373,7 +373,7 @@ static int test_multiple_refs(struct btrfs_root *root,
373373
return ret;
374374

375375
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
376-
false);
376+
false, false);
377377
if (ret) {
378378
ulist_free(old_roots);
379379
ulist_free(new_roots);
@@ -401,7 +401,7 @@ static int test_multiple_refs(struct btrfs_root *root,
401401
}
402402

403403
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
404-
false);
404+
false, false);
405405
if (ret) {
406406
ulist_free(old_roots);
407407
test_err("couldn't find old roots: %d", ret);
@@ -414,7 +414,7 @@ static int test_multiple_refs(struct btrfs_root *root,
414414
return ret;
415415

416416
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
417-
false);
417+
false, false);
418418
if (ret) {
419419
ulist_free(old_roots);
420420
ulist_free(new_roots);

0 commit comments

Comments
 (0)