Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Always validate checksums for Direct I/O reads #16598

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion include/sys/fm/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ extern "C" {
#define FM_EREPORT_ZFS_DATA "data"
#define FM_EREPORT_ZFS_DELAY "delay"
#define FM_EREPORT_ZFS_DEADMAN "deadman"
#define FM_EREPORT_ZFS_DIO_VERIFY "dio_verify"
#define FM_EREPORT_ZFS_DIO_VERIFY_WR "dio_verify_wr"
#define FM_EREPORT_ZFS_DIO_VERIFY_RD "dio_verify_rd"
#define FM_EREPORT_ZFS_POOL "zpool"
#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown"
#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed"
Expand Down
29 changes: 15 additions & 14 deletions include/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -208,25 +208,25 @@ typedef uint64_t zio_flag_t;
#define ZIO_FLAG_PROBE (1ULL << 16)
#define ZIO_FLAG_TRYHARD (1ULL << 17)
#define ZIO_FLAG_OPTIONAL (1ULL << 18)

#define ZIO_FLAG_DIO_READ (1ULL << 19)
#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)

/*
* Flags not inherited by any children.
*/
#define ZIO_FLAG_DONT_QUEUE (1ULL << 19) /* must be first for INHERIT */
#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 20)
#define ZIO_FLAG_IO_BYPASS (1ULL << 21)
#define ZIO_FLAG_IO_REWRITE (1ULL << 22)
#define ZIO_FLAG_RAW_COMPRESS (1ULL << 23)
#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 24)
#define ZIO_FLAG_GANG_CHILD (1ULL << 25)
#define ZIO_FLAG_DDT_CHILD (1ULL << 26)
#define ZIO_FLAG_GODFATHER (1ULL << 27)
#define ZIO_FLAG_NOPWRITE (1ULL << 28)
#define ZIO_FLAG_REEXECUTED (1ULL << 29)
#define ZIO_FLAG_DELEGATED (1ULL << 30)
#define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 31)
#define ZIO_FLAG_DONT_QUEUE (1ULL << 20) /* must be first for INHERIT */
#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 21)
#define ZIO_FLAG_IO_BYPASS (1ULL << 22)
#define ZIO_FLAG_IO_REWRITE (1ULL << 23)
#define ZIO_FLAG_RAW_COMPRESS (1ULL << 24)
#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 25)
#define ZIO_FLAG_GANG_CHILD (1ULL << 26)
#define ZIO_FLAG_DDT_CHILD (1ULL << 27)
#define ZIO_FLAG_GODFATHER (1ULL << 28)
#define ZIO_FLAG_NOPWRITE (1ULL << 29)
#define ZIO_FLAG_REEXECUTED (1ULL << 30)
#define ZIO_FLAG_DELEGATED (1ULL << 31)
#define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 32)

#define ZIO_ALLOCATOR_NONE (-1)
#define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE)
Expand Down Expand Up @@ -647,6 +647,7 @@ extern void zio_vdev_io_redone(zio_t *zio);
extern void zio_change_priority(zio_t *pio, zio_priority_t priority);

extern void zio_checksum_verified(zio_t *zio);
extern void zio_dio_chksum_verify_error_report(zio_t *zio);
extern int zio_worst_error(int e1, int e2);

extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
Expand Down
2 changes: 1 addition & 1 deletion man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ write.
It can also help to identify if reported checksum errors are tied to Direct I/O
writes.
Each verify error causes a
.Sy dio_verify
.Sy dio_verify_wr
zevent.
Direct Write I/O checkum verify errors can be seen with
.Nm zpool Cm status Fl d .
Expand Down
5 changes: 4 additions & 1 deletion man/man8/zpool-events.8
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,10 @@ This can be an indicator of problems with the underlying storage device.
The number of delay events is ratelimited by the
.Sy zfs_slow_io_events_per_second
module parameter.
.It Sy dio_verify
.It Sy dio_verify_rd
Issued when there was a checksum verify error after a Direct I/O read has been
issued.
.It Sy dio_verify_wr
Issued when there was a checksum verify error after a Direct I/O write has been
issued.
This event can only take place if the module parameter
Expand Down
8 changes: 6 additions & 2 deletions man/man8/zpool-status.8
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,18 @@ Specify
.Sy --json-pool-key-guid
to set pool GUID as key for pool objects instead of pool names.
.It Fl d
Display the number of Direct I/O write checksum verify errors that have occured
on a top-level VDEV.
Display the number of Direct I/O read/write checksum verify errors that have
occured on a top-level VDEV.
See
.Sx zfs_vdev_direct_write_verify
in
.Xr zfs 4
for details about the conditions that can cause Direct I/O write checksum
verify failures to occur.
Direct I/O reads checksum verify errors can also occur if the contents of the
buffer are being manipulated after the I/O has been issued and is in flight.
In the case of Direct I/O read checksum verify errors, the I/O will be reissued
through the ARC.
.It Fl D
Display a histogram of deduplication statistics, showing the allocated
.Pq physically present on disk
Expand Down
41 changes: 37 additions & 4 deletions module/os/freebsd/zfs/abd_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -620,9 +620,16 @@ abd_borrow_buf_copy(abd_t *abd, size_t n)

/*
* Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
* no change the contents of the ABD and will ASSERT that you didn't modify
* the buffer since it was borrowed. If you want any changes you made to buf to
* be copied back to abd, use abd_return_buf_copy() instead.
* not change the contents of the ABD. If you want any changes you made to
* buf to be copied back to abd, use abd_return_buf_copy() instead. If the
* ABD is not constructed from user pages from Direct I/O then an ASSERT
* checks to make sure the contents of the buffer have not changed since it was
* borrowed. We can not ASSERT the contents of the buffer have not changed if
* it is composed of user pages. While Direct I/O write pages are placed under
* write protection and can not be changed, this is not the case for Direct I/O
* reads. The pages of a Direct I/O read could be manipulated at any time.
* Checksum verifications in the ZIO pipeline check for this issue and handle
* it by returning an error on checksum verification failure.
*/
void
abd_return_buf(abd_t *abd, void *buf, size_t n)
Expand All @@ -632,8 +639,34 @@ abd_return_buf(abd_t *abd, void *buf, size_t n)
#ifdef ZFS_DEBUG
(void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
#endif
if (abd_is_linear(abd)) {
if (abd_is_from_pages(abd)) {
if (!abd_is_linear_page(abd))
zio_buf_free(buf, n);
} else if (abd_is_linear(abd)) {
ASSERT3P(buf, ==, abd_to_buf(abd));
} else if (abd_is_gang(abd)) {
#ifdef ZFS_DEBUG
/*
* We have to be careful with gang ABD's that we do not ASSERT
* for nay ABD's that contain user pages from Direct I/O. See
* the comment above about Direct I/O read buffers possibly
* being manipulated. In order to handle this, we jsut interate
Comment on lines +651 to +653
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"for any ABDs"
"we just iterate"

* through the gang ABD and only verify ABD's that are not from
* user pages.
*/
void *cmp_buf = buf;

for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
cabd != NULL;
cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
if (!abd_is_from_pages(cabd)) {
ASSERT0(abd_cmp_buf(cabd, cmp_buf,
cabd->abd_size));
}
cmp_buf = (char *)cmp_buf + cabd->abd_size;
}
#endif
zio_buf_free(buf, n);
} else {
ASSERT0(abd_cmp_buf(abd, buf, n));
zio_buf_free(buf, n);
Expand Down
4 changes: 3 additions & 1 deletion module/os/linux/zfs/abd_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -1008,7 +1008,9 @@ abd_borrow_buf_copy(abd_t *abd, size_t n)
* borrowed. We can not ASSERT that the contents of the buffer have not changed
* if it is composed of user pages because the pages can not be placed under
* write protection and the user could have possibly changed the contents in
* the pages at any time.
* the pages at any time. This is also an issue for Direct I/O reads. Checksum
* verifications in the ZIO pipeline check fo rthis issue and handle it by
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"check for this issue"

* returning an error on checksum verification failure.
*/
void
abd_return_buf(abd_t *abd, void *buf, size_t n)
Expand Down
1 change: 1 addition & 0 deletions module/zcommon/zfs_valstr.c
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ _VALSTR_BITFIELD_IMPL(zio_flag,
{ '.', "PR", "PROBE" },
{ '.', "TH", "TRYHARD" },
{ '.', "OP", "OPTIONAL" },
{ '.', "RD", "DIO_READ" },
{ '.', "DQ", "DONT_QUEUE" },
{ '.', "DP", "DONT_PROPAGATE" },
{ '.', "BY", "IO_BYPASS" },
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/dmu_direct.c
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
*/
zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size,
dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL, &zb);
ZIO_FLAG_CANFAIL | ZIO_FLAG_DIO_READ, &zb);
mutex_exit(&db->db_mtx);

zfs_racct_read(spa, db->db.db_size, 1, flags);
Expand Down
14 changes: 14 additions & 0 deletions module/zfs/vdev_indirect.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <sys/zap.h>
#include <sys/abd.h>
#include <sys/zthr.h>
#include <sys/fm/fs/zfs.h>

/*
* An indirect vdev corresponds to a vdev that has been removed. Since
Expand Down Expand Up @@ -1832,6 +1833,19 @@ vdev_indirect_io_done(zio_t *zio)

zio_bad_cksum_t zbc;
int ret = zio_checksum_error(zio, &zbc);
/*
* Any Direct I/O read with that has a checksum error must be treated
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* Any Direct I/O read with that has a checksum error must be treated
* Any Direct I/O read that has a checksum error must be treated

* as suspicious as the contents of the buffer could be getting
* manipulated while the I/O is taking place. The checksum verify error
* will be reported to the top-level VDEV.
*/
if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
zio->io_error = ret;
zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
zio_dio_chksum_verify_error_report(zio);
ret = 0;
}

if (ret == 0) {
zio_checksum_verified(zio);
return;
Expand Down
13 changes: 13 additions & 0 deletions module/zfs/vdev_mirror.c
Original file line number Diff line number Diff line change
Expand Up @@ -764,6 +764,19 @@ vdev_mirror_io_done(zio_t *zio)

ASSERT(zio->io_type == ZIO_TYPE_READ);

/*
* Any Direct I/O read that has a checksum error must be treated as
* suspicious as the contents of the buffer could be getting
* manipulated while the I/O is taking place. The checksum verify error
* will be reported to the top-level Mirror VDEV. If additional copies
* of the data are available, they will still be attempted to be read
* below.
*/
if ((zio->io_flags & ZIO_FLAG_DIO_READ) &&
(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
zio_dio_chksum_verify_error_report(zio);
}

/*
* If we don't have a good copy yet, keep trying other children.
*/
Expand Down
21 changes: 20 additions & 1 deletion module/zfs/vdev_raidz.c
Original file line number Diff line number Diff line change
Expand Up @@ -2633,6 +2633,20 @@ raidz_checksum_verify(zio_t *zio)
raidz_map_t *rm = zio->io_vsd;

int ret = zio_checksum_error(zio, &zbc);
/*
* Any Direct I/O read that has a checksum error must be treated as
* suspicious as the contents of the buffer could be getting
* manipulated while the I/O is taking place. The checksum verify error
* will be reported to the top-level RAIDZ VDEV.
*/
if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
zio->io_error = ret;
zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
zio_dio_chksum_verify_error_report(zio);
zio_checksum_verified(zio);
return (0);
}

if (ret != 0 && zbc.zbc_injected != 0)
rm->rm_ecksuminjected = 1;

Expand Down Expand Up @@ -2979,6 +2993,8 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)

/* Check for success */
if (raidz_checksum_verify(zio) == 0) {
if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
return (0);

/* Reconstruction succeeded - report errors */
for (int i = 0; i < rm->rm_nrows; i++) {
Expand Down Expand Up @@ -3379,7 +3395,6 @@ vdev_raidz_io_done_unrecoverable(zio_t *zio)
zio_bad_cksum_t zbc;
zbc.zbc_has_cksum = 0;
zbc.zbc_injected = rm->rm_ecksuminjected;

mutex_enter(&cvd->vdev_stat_lock);
cvd->vdev_stat.vs_checksum_errors++;
mutex_exit(&cvd->vdev_stat_lock);
Expand Down Expand Up @@ -3444,6 +3459,9 @@ vdev_raidz_io_done(zio_t *zio)
}

if (raidz_checksum_verify(zio) == 0) {
if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
goto done;

for (int i = 0; i < rm->rm_nrows; i++) {
raidz_row_t *rr = rm->rm_row[i];
vdev_raidz_io_done_verified(zio, rr);
Expand Down Expand Up @@ -3538,6 +3556,7 @@ vdev_raidz_io_done(zio_t *zio)
}
}
}
done:
if (rm->rm_lr != NULL) {
zfs_rangelock_exit(rm->rm_lr);
rm->rm_lr = NULL;
Expand Down
31 changes: 28 additions & 3 deletions module/zfs/zfs_vnops.c
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
(void) cr;
int error = 0;
boolean_t frsync = B_FALSE;
boolean_t dio_checksum_failure = B_FALSE;
ssize_t nbytes;

zfsvfs_t *zfsvfs = ZTOZSB(zp);
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
Expand Down Expand Up @@ -407,7 +409,8 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
}

while (n > 0) {
ssize_t nbytes = MIN(n, chunk_size -
top:
nbytes = MIN(n, chunk_size -
P2PHASE(zfs_uio_offset(uio), chunk_size));
#ifdef UIO_NOCOPY
if (zfs_uio_segflg(uio) == UIO_NOCOPY)
Expand All @@ -424,8 +427,26 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)

if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)
error = SET_ERROR(EIO);
if (error == ECKSUM) {
/*
* If a Direct I/O read returned a checksum
* verify error, then it must be treated as
* suspicious. The contents of the buffer could
* have beeen manipulated while the I/O was in
* flight. In this case, the remainder of I/O
* request will just be just be reissued through
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* request will just be just be reissued through
* request will just be reissued through the ARC.

* the ARC.
*/
if (uio->uio_extflg & UIO_DIRECT) {
dio_checksum_failure = B_TRUE;
uio->uio_extflg &= ~UIO_DIRECT;
n += dio_remaining_resid;
dio_remaining_resid = 0;
goto top;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The goto looks like it can be replaced with continue here and the "top" label removed.

} else {
error = SET_ERROR(EIO);
}
}

#if defined(__linux__)
/*
Expand Down Expand Up @@ -472,6 +493,10 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
out:
zfs_rangelock_exit(lr);

if (dio_checksum_failure == B_TRUE) {
uio->uio_extflg |= UIO_DIRECT;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: parenthesis can be removed.


/*
* Cleanup for Direct I/O if requested.
*/
Expand Down
Loading
Loading