Skip to content

Commit a1d59da

Browse files
bwatkinson authored and ixhamza committed
Update pin_user_pages() calls for Direct I/O
Originally openzfs#16856 updated Linux Direct I/O requests to use the new pin_user_pages API. However, it was an oversight that this PR only handled iov_iter's of type ITER_IOVEC and ITER_UBUF. Other iov_iter types may try to use the pin_user_pages API if it is available. This can lead to panics, as the iov_iter is not being iterated over correctly in zfs_uio_pin_user_pages().

Unfortunately, the generic iov_iter APIs that call pin_user_pages_fast() are protected as GPL only. Rather than update zfs_uio_pin_user_pages() to account for all iov_iter types, we can simply call zfs_uio_get_dio_pages_iov_iter() if the iov_iter type is not ITER_IOVEC or ITER_UBUF. zfs_uio_get_dio_pages_iov_iter() calls iov_iter_get_pages(), which can handle any iov_iter type.

In the future it might be worth using the exposed iov_iter iterator functions that are included in the header iov_iter.h since v6.7. These functions allow any iov_iter type to be iterated over and advanced while applying a step function during iteration. This could possibly be leveraged in zfs_uio_pin_user_pages().

A new ZFS test case was added to verify that an ITER_BVEC iov_iter is handled correctly using this new code path. This test case was provided through issue openzfs#16956.

Reviewed-by: Alexander Motin <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Ameer Hamza <[email protected]>
Signed-off-by: Brian Atkinson <[email protected]>
Closes openzfs#16956
Closes openzfs#17006
1 parent bc06d81 commit a1d59da

File tree

6 files changed

+157
-25
lines changed

6 files changed

+157
-25
lines changed

config/kernel-vfs-iov_iter.m4

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,20 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [
2121
__attribute__((unused)) enum iter_type i = iov_iter_type(&iter);
2222
])
2323
24+
ZFS_LINUX_TEST_SRC([iov_iter_get_pages2], [
25+
#include <linux/uio.h>
26+
],[
27+
struct iov_iter iter = { 0 };
28+
struct page **pages = NULL;
29+
size_t maxsize = 4096;
30+
unsigned maxpages = 1;
31+
size_t start;
32+
size_t ret __attribute__ ((unused));
33+
34+
ret = iov_iter_get_pages2(&iter, pages, maxsize, maxpages,
35+
&start);
36+
])
37+
2438
ZFS_LINUX_TEST_SRC([iter_is_ubuf], [
2539
#include <linux/uio.h>
2640
],[
@@ -64,6 +78,19 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [
6478
AC_MSG_RESULT(no)
6579
])
6680
81+
82+
dnl #
83+
dnl # Kernel 6.0 changed iov_iter_get_pages() to iov_iter_get_pages2().
84+
dnl #
85+
AC_MSG_CHECKING([whether iov_iter_get_pages2() is available])
86+
ZFS_LINUX_TEST_RESULT([iov_iter_get_pages2], [
87+
AC_MSG_RESULT(yes)
88+
AC_DEFINE(HAVE_IOV_ITER_GET_PAGES2, 1,
89+
[iov_iter_get_pages2() is available])
90+
],[
91+
AC_MSG_RESULT(no)
92+
])
93+
6794
dnl #
6895
dnl # Kernel 6.0 introduced the ITER_UBUF iov_iter type. iter_is_ubuf()
6996
dnl # was also added to determine if the iov_iter is an ITER_UBUF.

include/os/linux/spl/sys/uio.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ typedef enum zfs_uio_seg {
6363
typedef struct {
6464
struct page **pages; /* Mapped pages */
6565
long npages; /* Number of mapped pages */
66+
boolean_t pinned; /* Whether FOLL_PIN was used */
6667
} zfs_uio_dio_t;
6768

6869
typedef struct zfs_uio {
@@ -199,4 +200,13 @@ zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset,
199200
#define zfs_uio_iov_iter_type(iter) (iter)->type
200201
#endif
201202

203+
#if defined(HAVE_ITER_IS_UBUF)
204+
#define zfs_user_backed_iov_iter(iter) \
205+
(iter_is_ubuf((iter)) || \
206+
(zfs_uio_iov_iter_type((iter)) == ITER_IOVEC))
207+
#else
208+
#define zfs_user_backed_iov_iter(iter) \
209+
(zfs_uio_iov_iter_type((iter)) == ITER_IOVEC)
210+
#endif
211+
202212
#endif /* SPL_UIO_H */

module/os/linux/zfs/zfs_uio.c

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,6 @@ zfs_uio_page_aligned(zfs_uio_t *uio)
404404
return (aligned);
405405
}
406406

407-
408407
#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
409408
#define ZFS_MARKEED_PAGE 0x0
410409
#define IS_ZFS_MARKED_PAGE(_p) 0
@@ -441,7 +440,6 @@ zfs_unmark_page(struct page *page)
441440
}
442441
#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */
443442

444-
#if !defined(HAVE_PIN_USER_PAGES_UNLOCKED)
445443
static void
446444
zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
447445
{
@@ -473,7 +471,6 @@ zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
473471
}
474472
}
475473
}
476-
#endif
477474

478475
void
479476
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
@@ -482,21 +479,24 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
482479
ASSERT(uio->uio_extflg & UIO_DIRECT);
483480
ASSERT3P(uio->uio_dio.pages, !=, NULL);
484481

482+
if (uio->uio_dio.pinned) {
485483
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
486-
unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
487-
#else
488-
for (long i = 0; i < uio->uio_dio.npages; i++) {
489-
struct page *p = uio->uio_dio.pages[i];
484+
unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
485+
#endif
486+
} else {
487+
for (long i = 0; i < uio->uio_dio.npages; i++) {
488+
struct page *p = uio->uio_dio.pages[i];
490489

491-
if (IS_ZFS_MARKED_PAGE(p)) {
492-
zfs_unmark_page(p);
493-
__free_page(p);
494-
continue;
495-
}
490+
if (IS_ZFS_MARKED_PAGE(p)) {
491+
zfs_unmark_page(p);
492+
__free_page(p);
493+
continue;
494+
}
496495

497-
put_page(p);
496+
put_page(p);
497+
}
498498
}
499-
#endif
499+
500500
vmem_free(uio->uio_dio.pages,
501501
uio->uio_dio.npages * sizeof (struct page *));
502502
}
@@ -523,6 +523,7 @@ zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
523523
if (len == 0)
524524
return (0);
525525

526+
uio->uio_dio.pinned = B_TRUE;
526527
#if defined(HAVE_ITER_IS_UBUF)
527528
if (iter_is_ubuf(uio->uio_iter)) {
528529
nr_pages = DIV_ROUND_UP(len, PAGE_SIZE);
@@ -569,8 +570,8 @@ zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
569570

570571
return (0);
571572
}
573+
#endif
572574

573-
#else
574575
static int
575576
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
576577
{
@@ -581,9 +582,15 @@ zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
581582
unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);
582583

583584
while (wanted) {
585+
#if defined(HAVE_IOV_ITER_GET_PAGES2)
586+
cnt = iov_iter_get_pages2(uio->uio_iter,
587+
&uio->uio_dio.pages[uio->uio_dio.npages],
588+
wanted, maxpages, &start);
589+
#else
584590
cnt = iov_iter_get_pages(uio->uio_iter,
585591
&uio->uio_dio.pages[uio->uio_dio.npages],
586592
wanted, maxpages, &start);
593+
#endif
587594
if (cnt < 0) {
588595
iov_iter_revert(uio->uio_iter, rollback);
589596
return (SET_ERROR(-cnt));
@@ -595,15 +602,19 @@ zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
595602
uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
596603
rollback += cnt;
597604
wanted -= cnt;
605+
#if !defined(HAVE_IOV_ITER_GET_PAGES2)
606+
/*
607+
* iov_iter_get_pages2() advances the iov_iter on success.
608+
*/
598609
iov_iter_advance(uio->uio_iter, cnt);
610+
#endif
599611

600612
}
601613
ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip);
602614
iov_iter_revert(uio->uio_iter, rollback);
603615

604616
return (0);
605617
}
606-
#endif /* HAVE_PIN_USER_PAGES_UNLOCKED */
607618

608619
/*
609620
* This function pins user pages. In the event that the user pages were not
@@ -621,7 +632,10 @@ zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
621632
if (uio->uio_segflg == UIO_ITER) {
622633
uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
623634
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
624-
error = zfs_uio_pin_user_pages(uio, rw);
635+
if (zfs_user_backed_iov_iter(uio->uio_iter))
636+
error = zfs_uio_pin_user_pages(uio, rw);
637+
else
638+
error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
625639
#else
626640
error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
627641
#endif
@@ -632,22 +646,24 @@ zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
632646
ASSERT3S(uio->uio_dio.npages, >=, 0);
633647

634648
if (error) {
649+
if (uio->uio_dio.pinned) {
635650
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
636-
unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
637-
#else
638-
for (long i = 0; i < uio->uio_dio.npages; i++)
639-
put_page(uio->uio_dio.pages[i]);
651+
unpin_user_pages(uio->uio_dio.pages,
652+
uio->uio_dio.npages);
640653
#endif
654+
} else {
655+
for (long i = 0; i < uio->uio_dio.npages; i++)
656+
put_page(uio->uio_dio.pages[i]);
657+
}
658+
641659
vmem_free(uio->uio_dio.pages, size);
642660
return (error);
643661
} else {
644662
ASSERT3S(uio->uio_dio.npages, ==, npages);
645663
}
646664

647-
#if !defined(HAVE_PIN_USER_PAGES_UNLOCKED)
648-
if (rw == UIO_WRITE)
665+
if (rw == UIO_WRITE && !uio->uio_dio.pinned)
649666
zfs_uio_dio_check_for_zero_page(uio);
650-
#endif
651667

652668
uio->uio_extflg |= UIO_DIRECT;
653669

tests/runfiles/linux.run

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos']
103103
tags = ['functional', 'devices']
104104

105105
[tests/functional/direct:Linux]
106-
tests = ['dio_write_verify']
106+
tests = ['dio_loopback_dev', 'dio_write_verify']
107107
tags = ['functional', 'direct']
108108

109109
[tests/functional/events:Linux]

tests/zfs-tests/tests/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1474,6 +1474,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
14741474
functional/direct/dio_dedup.ksh \
14751475
functional/direct/dio_encryption.ksh \
14761476
functional/direct/dio_grow_block.ksh \
1477+
functional/direct/dio_loopback_dev.ksh \
14771478
functional/direct/dio_max_recordsize.ksh \
14781479
functional/direct/dio_mixed.ksh \
14791480
functional/direct/dio_mmap.ksh \
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#!/bin/ksh -p
2+
#
3+
# CDDL HEADER START
4+
#
5+
# The contents of this file are subject to the terms of the
6+
# Common Development and Distribution License (the "License").
7+
# You may not use this file except in compliance with the License.
8+
#
9+
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10+
# or https://opensource.org/licenses/CDDL-1.0.
11+
# See the License for the specific language governing permissions
12+
# and limitations under the License.
13+
#
14+
# When distributing Covered Code, include this CDDL HEADER in each
15+
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16+
# If applicable, add the following below this CDDL HEADER, with the
17+
# fields enclosed by brackets "[]" replaced with your own identifying
18+
# information: Portions Copyright [yyyy] [name of copyright owner]
19+
#
20+
# CDDL HEADER END
21+
#
22+
23+
#
24+
# Copyright (c) 2025 by Triad National Security, LLC.
25+
#
26+
27+
. $STF_SUITE/include/libtest.shlib
28+
. $STF_SUITE/tests/functional/direct/dio.cfg
29+
. $STF_SUITE/tests/functional/direct/dio.kshlib
30+
31+
#
32+
# DESCRIPTION:
33+
# Verify Direct I/O reads work with loopback devices using direct=always.
34+
#
35+
# STRATEGY:
36+
# 1. Create raidz zpool.
37+
# 2. Create dataset with the direct dataset property set to always.
38+
# 3. Create an empty file in dataset and setup loop device on it.
39+
# 4. Read from loopback device.
40+
#
41+
42+
verify_runnable "global"
43+
44+
function cleanup
45+
{
46+
if [[ -n $lofidev ]]; then
47+
losetup -d $lofidev
48+
fi
49+
dio_cleanup
50+
}
51+
52+
log_assert "Verify loopback devices with Direct I/O."
53+
54+
if ! is_linux; then
55+
log_unsupported "This is just a check for Linux Direct I/O"
56+
fi
57+
58+
log_onexit cleanup
59+
60+
# Create zpool
61+
log_must truncate -s $MINVDEVSIZE $DIO_VDEVS
62+
log_must create_pool $TESTPOOL1 "raidz" $DIO_VDEVS
63+
64+
# Creating dataset with direct=always
65+
log_must eval "zfs create -o direct=always $TESTPOOL1/$TESTFS1"
66+
mntpt=$(get_prop mountpoint $TESTPOOL1/$TESTFS1)
67+
68+
# Getting a loopback device
69+
lofidev=$(losetup -f)
70+
71+
# Create loopback device
72+
log_must truncate -s 1M "$mntpt/temp_file"
73+
log_must losetup $lofidev "$mntpt/temp_file"
74+
75+
# Read from loopback device to make sure Direct I/O works with loopback device
76+
log_must dd if=$lofidev of=/dev/null count=1 bs=4k
77+
78+
log_pass "Verified loopback devices for Direct I/O."

0 commit comments

Comments
 (0)