Skip to content

Commit

Permalink
Modify vdev_*_io_start_read()
Browse files Browse the repository at this point in the history
Move the outlier skip optimization check after the readable and missing
checks are performed on all of the children.  This way readable columns
with missing data are included in rr_missingdata and rr_missingparity.

I opted to add an additional field to raidz_row and raidz_col to store
the results of vdev_skip_latency_outlier().  This doesn't cost any
additional memory due to the way the structures were already packed,
and it seemed the simplest approach.

Signed-off-by: Brian Behlendorf <[email protected]>
  • Loading branch information
behlendorf committed Jan 4, 2025
1 parent 00e4044 commit b482c07
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 44 deletions.
2 changes: 2 additions & 0 deletions include/sys/vdev_raidz_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ typedef struct raidz_col {
uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */
uint8_t rc_force_repair:1; /* Write good data to this column */
uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */
uint8_t rc_latency_outlier:1; /* Latency outlier for this device */
int rc_shadow_devidx; /* for double write during expansion */
int rc_shadow_error; /* for double write during expansion */
uint64_t rc_shadow_offset; /* for double write during expansion */
Expand All @@ -132,6 +133,7 @@ typedef struct raidz_row {
int rr_firstdatacol; /* First data column/parity count */
abd_t *rr_abd_empty; /* dRAID empty sector buffer */
int rr_nempty; /* empty sectors included in parity */
int rr_noutliers; /* Count of latency outlier devices */
#ifdef ZFS_DEBUG
uint64_t rr_offset; /* Logical offset for *_io_verify() */
uint64_t rr_size; /* Physical size for *_io_verify() */
Expand Down
42 changes: 23 additions & 19 deletions module/zfs/vdev_draid.c
Original file line number Diff line number Diff line change
Expand Up @@ -1889,17 +1889,6 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
/* Sequential rebuild must do IO at redundancy group boundary. */
IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0);

/*
* Calculate how much parity is available for sitting out reads
*/
int parity_avail = rr->rr_firstdatacol;
for (int p = 0; p < rr->rr_firstdatacol; p++) {
raidz_col_t *rc = &rr->rr_col[p];
if (!vdev_draid_readable(vd->vdev_child[rc->rc_devidx],
rc->rc_offset)) {
parity_avail--;
}
}
/*
* Iterate over the columns in reverse order so that we hit the parity
* last. Any errors along the way will force us to read the parity.
Expand Down Expand Up @@ -2004,14 +1993,29 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
rc->rc_force_repair = 1;
rc->rc_allow_repair = 1;
}
} else if (parity_avail > 0 && c >= rr->rr_firstdatacol &&
rr->rr_missingdata == 0 &&
vdev_skip_latency_outlier(cvd, zio->io_flags)) {
rr->rr_missingdata++;
rc->rc_error = SET_ERROR(EAGAIN);
rc->rc_skipped = 1;
parity_avail--;
continue;
} else if (vdev_skip_latency_outlier(cvd, zio->io_flags)) {
rr->rr_noutliers++;
rc->rc_latency_outlier = 1;
}
}

/*
* When the row contains a latency outlier and sufficient parity
* exists to reconstruct the column data, then skip reading the
* known slow child vdev as a performance optimization.
*/
if (rr->rr_noutliers > 0 && rr->rr_missingdata == 0 &&
(rr->rr_firstdatacol - rr->rr_missingparity) > 0) {

for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
raidz_col_t *rc = &rr->rr_col[c];

if (rc->rc_latency_outlier) {
rr->rr_missingdata++;
rc->rc_error = SET_ERROR(EAGAIN);
rc->rc_skipped = 1;
break;
}
}
}

Expand Down
59 changes: 34 additions & 25 deletions module/zfs/vdev_raidz.c
Original file line number Diff line number Diff line change
Expand Up @@ -2482,18 +2482,6 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
{
vdev_t *vd = zio->io_vd;

/*
* Calculate how much parity is available for sitting out reads
*/
int parity_avail = rr->rr_firstdatacol;
for (int p = 0; p < rr->rr_firstdatacol; p++) {
raidz_col_t *rc = &rr->rr_col[p];
if (rc->rc_size > 0 &&
!vdev_readable(vd->vdev_child[rc->rc_devidx])) {
parity_avail--;
}
}

/*
* Iterate over the columns in reverse order so that we hit the parity
* last -- any errors along the way will force us to read the parity.
Expand All @@ -2513,19 +2501,6 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
rc->rc_skipped = 1;
continue;
}
/*
* Check if a data colummn read should be skipped
*/
if (parity_avail > 0 &&
c >= rr->rr_firstdatacol &&
rr->rr_missingdata == 0 &&
vdev_skip_latency_outlier(cvd, zio->io_flags)) {
rr->rr_missingdata++;
rc->rc_error = SET_ERROR(EAGAIN);
rc->rc_skipped = 1;
parity_avail--;
continue;
}
if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
if (c >= rr->rr_firstdatacol)
rr->rr_missingdata++;
Expand All @@ -2535,6 +2510,40 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
rc->rc_skipped = 1;
continue;
}

if (vdev_skip_latency_outlier(cvd, zio->io_flags)) {
rr->rr_noutliers++;
rc->rc_latency_outlier = 1;
}
}

/*
* When the row contains a latency outlier and sufficient parity
* exists to reconstruct the column data, then skip reading the
* known slow child vdev as a performance optimization.
*/
if (rr->rr_noutliers > 0 && rr->rr_missingdata == 0 &&
(rr->rr_firstdatacol - rr->rr_missingparity) > 0) {

Check failure on line 2526 in module/zfs/vdev_raidz.c

View workflow job for this annotation

GitHub Actions / checkstyle

indent by spaces instead of tabs

for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
raidz_col_t *rc = &rr->rr_col[c];

if (rc->rc_latency_outlier) {
rr->rr_missingdata++;
rc->rc_error = SET_ERROR(EAGAIN);
rc->rc_skipped = 1;
break;
}

Check failure on line 2536 in module/zfs/vdev_raidz.c

View workflow job for this annotation

GitHub Actions / checkstyle

spaces instead of tabs

Check failure on line 2536 in module/zfs/vdev_raidz.c

View workflow job for this annotation

GitHub Actions / checkstyle

non-continuation indented 4 spaces
}
}

Check failure on line 2538 in module/zfs/vdev_raidz.c

View workflow job for this annotation

GitHub Actions / checkstyle

indent by spaces instead of tabs

Check failure on line 2538 in module/zfs/vdev_raidz.c

View workflow job for this annotation

GitHub Actions / checkstyle

non-continuation indented 4 spaces

for (int c = rr->rr_cols - 1; c >= 0; c--) {
raidz_col_t *rc = &rr->rr_col[c];
vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

if (rc->rc_error || rc->rc_size == 0)
continue;

if (forceparity ||
c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
Expand Down

0 comments on commit b482c07

Please sign in to comment.