diff --git a/.mailmap b/.mailmap index 5aa1eef464d9..64e02681e37d 100644 --- a/.mailmap +++ b/.mailmap @@ -70,6 +70,7 @@ Rob Norris Rob Norris Sam Lunt Sanjeev Bagewadi +Sebastian Wuerl Stoiko Ivanov Tamas TEVESZ WHR @@ -78,6 +79,7 @@ Youzhong Yang # Signed-off-by: overriding Author: Ryan +Sietse Qiuhao Chen Yuxin Wang Zhenlei Huang diff --git a/AUTHORS b/AUTHORS index 6a5cc088e651..b4342f6912ae 100644 --- a/AUTHORS +++ b/AUTHORS @@ -423,6 +423,7 @@ CONTRIBUTORS: Mathieu Velten Matt Fiddaman Matthew Ahrens + Matthew Heller Matthew Thode Matthias Blankertz Matt Johnston @@ -562,6 +563,7 @@ CONTRIBUTORS: Scot W. Stevenson Sean Eric Fagan Sebastian Gottschall + Sebastian Wuerl Sebastien Roy Sen Haerens Serapheim Dimitropoulos @@ -574,6 +576,7 @@ CONTRIBUTORS: Shawn Bayern Shengqi Chen Shen Yan + Sietse Simon Guest Simon Klinkert Sowrabha Gopal @@ -629,6 +632,7 @@ CONTRIBUTORS: Trevor Bautista Trey Dockendorf Troels Nørgaard + tstabrawa Tulsi Jain Turbo Fredriksson Tyler J. Stachecki diff --git a/META b/META index 0fe0dedae79e..5446d3e7c348 100644 --- a/META +++ b/META @@ -2,9 +2,9 @@ Meta: 1 Name: zfs Branch: 1.0 Version: 2.3.0 -Release: rc3 +Release: rc4 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.11 +Linux-Maximum: 6.12 Linux-Minimum: 4.18 diff --git a/cmd/arc_summary b/cmd/arc_summary index c24d400fa39a..72381d266e64 100755 --- a/cmd/arc_summary +++ b/cmd/arc_summary @@ -662,10 +662,7 @@ def section_arc(kstats_dict): print() print('ARC hash breakdown:') - prt_i1('Elements max:', f_hits(arc_stats['hash_elements_max'])) - prt_i2('Elements current:', - f_perc(arc_stats['hash_elements'], arc_stats['hash_elements_max']), - f_hits(arc_stats['hash_elements'])) + prt_i1('Elements:', f_hits(arc_stats['hash_elements'])) prt_i1('Collisions:', f_hits(arc_stats['hash_collisions'])) prt_i1('Chain max:', f_hits(arc_stats['hash_chain_max'])) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 46587671202a..aba99fabbbb9 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1967,17 +1967,53 @@ dump_dedup_ratio(const ddt_stat_t *dds) static void dump_ddt_log(ddt_t *ddt) { + if (ddt->ddt_version != DDT_VERSION_FDT || + !(ddt->ddt_flags & DDT_FLAG_LOG)) + return; + for (int n = 0; n < 2; n++) { ddt_log_t *ddl = &ddt->ddt_log[n]; - uint64_t count = avl_numnodes(&ddl->ddl_tree); - if (count == 0) - continue; + char flagstr[64] = {0}; + if (ddl->ddl_flags > 0) { + flagstr[0] = ' '; + int c = 1; + if (ddl->ddl_flags & DDL_FLAG_FLUSHING) + c += strlcpy(&flagstr[c], " FLUSHING", + sizeof (flagstr) - c); + if (ddl->ddl_flags & DDL_FLAG_CHECKPOINT) + c += strlcpy(&flagstr[c], " CHECKPOINT", + sizeof (flagstr) - c); + if (ddl->ddl_flags & + ~(DDL_FLAG_FLUSHING|DDL_FLAG_CHECKPOINT)) + c += strlcpy(&flagstr[c], " UNKNOWN", + sizeof (flagstr) - c); + flagstr[1] = '['; + flagstr[c++] = ']'; + } - printf(DMU_POOL_DDT_LOG ": %lu log entries\n", - zio_checksum_table[ddt->ddt_checksum].ci_name, n, count); + uint64_t count = avl_numnodes(&ddl->ddl_tree); - if (dump_opt['D'] < 4) + printf(DMU_POOL_DDT_LOG ": flags=0x%02x%s; obj=%llu; " + "len=%llu; txg=%llu; entries=%llu\n", + zio_checksum_table[ddt->ddt_checksum].ci_name, n, + ddl->ddl_flags, flagstr, + (u_longlong_t)ddl->ddl_object, + (u_longlong_t)ddl->ddl_length, + (u_longlong_t)ddl->ddl_first_txg, (u_longlong_t)count); + + if (ddl->ddl_flags & DDL_FLAG_CHECKPOINT) { + const ddt_key_t *ddk = &ddl->ddl_checkpoint; + printf(" checkpoint: " + "%016llx:%016llx:%016llx:%016llx:%016llx\n", + (u_longlong_t)ddk->ddk_cksum.zc_word[0], + 
(u_longlong_t)ddk->ddk_cksum.zc_word[1], + (u_longlong_t)ddk->ddk_cksum.zc_word[2], + (u_longlong_t)ddk->ddk_cksum.zc_word[3], + (u_longlong_t)ddk->ddk_prop); + } + + if (count == 0 || dump_opt['D'] < 4) continue; ddt_lightweight_entry_t ddlwe; @@ -1991,7 +2027,7 @@ dump_ddt_log(ddt_t *ddt) } static void -dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class) +dump_ddt_object(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { char name[DDT_NAMELEN]; ddt_lightweight_entry_t ddlwe; @@ -2016,11 +2052,8 @@ dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class) ddt_object_name(ddt, type, class, name); - (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", - name, - (u_longlong_t)count, - (u_longlong_t)dspace, - (u_longlong_t)mspace); + (void) printf("%s: dspace=%llu; mspace=%llu; entries=%llu\n", name, + (u_longlong_t)dspace, (u_longlong_t)mspace, (u_longlong_t)count); if (dump_opt['D'] < 3) return; @@ -2043,24 +2076,52 @@ dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class) (void) printf("\n"); } +static void +dump_ddt(ddt_t *ddt) +{ + if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED) + return; + + char flagstr[64] = {0}; + if (ddt->ddt_flags > 0) { + flagstr[0] = ' '; + int c = 1; + if (ddt->ddt_flags & DDT_FLAG_FLAT) + c += strlcpy(&flagstr[c], " FLAT", + sizeof (flagstr) - c); + if (ddt->ddt_flags & DDT_FLAG_LOG) + c += strlcpy(&flagstr[c], " LOG", + sizeof (flagstr) - c); + if (ddt->ddt_flags & ~DDT_FLAG_MASK) + c += strlcpy(&flagstr[c], " UNKNOWN", + sizeof (flagstr) - c); + flagstr[1] = '['; + flagstr[c] = ']'; + } + + printf("DDT-%s: version=%llu [%s]; flags=0x%02llx%s; rootobj=%llu\n", + zio_checksum_table[ddt->ddt_checksum].ci_name, + (u_longlong_t)ddt->ddt_version, + (ddt->ddt_version == 0) ? "LEGACY" : + (ddt->ddt_version == 1) ? 
"FDT" : "UNKNOWN", + (u_longlong_t)ddt->ddt_flags, flagstr, + (u_longlong_t)ddt->ddt_dir_object); + + for (ddt_type_t type = 0; type < DDT_TYPES; type++) + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) + dump_ddt_object(ddt, type, class); + + dump_ddt_log(ddt); +} + static void dump_all_ddts(spa_t *spa) { ddt_histogram_t ddh_total = {{{0}}}; ddt_stat_t dds_total = {0}; - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - ddt_t *ddt = spa->spa_ddt[c]; - if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED) - continue; - for (ddt_type_t type = 0; type < DDT_TYPES; type++) { - for (ddt_class_t class = 0; class < DDT_CLASSES; - class++) { - dump_ddt(ddt, type, class); - } - } - dump_ddt_log(ddt); - } + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) + dump_ddt(spa->spa_ddt[c]); ddt_get_dedup_stats(spa, &dds_total); @@ -2119,9 +2180,6 @@ dump_brt(spa_t *spa) return; } - brt_t *brt = spa->spa_brt; - VERIFY(brt); - char count[32], used[32], saved[32]; zdb_nicebytes(brt_get_used(spa), used, sizeof (used)); zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved)); @@ -2132,11 +2190,8 @@ dump_brt(spa_t *spa) if (dump_opt['T'] < 2) return; - for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; - if (brtvd == NULL) - continue; - + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; if (!brtvd->bv_initiated) { printf("BRT: vdev %" PRIu64 ": empty\n", vdevid); continue; @@ -2160,20 +2215,21 @@ dump_brt(spa_t *spa) if (!do_histo) printf("\n%-16s %-10s\n", "DVA", "REFCNT"); - for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; - if (brtvd == NULL || !brtvd->bv_initiated) + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + if (!brtvd->bv_initiated) continue; uint64_t counts[64] = {}; zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); - for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries); + for (zap_cursor_init(&zc, spa->spa_meta_objset, + brtvd->bv_mos_entries); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { uint64_t refcnt; - VERIFY0(zap_lookup_uint64(brt->brt_mos, + VERIFY0(zap_lookup_uint64(spa->spa_meta_objset, brtvd->bv_mos_entries, (const uint64_t *)za->za_name, 1, za->za_integer_length, za->za_num_integers, @@ -6897,7 +6953,7 @@ iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg) for (zap_cursor_init(&zc, mos, zap_obj); zap_cursor_retrieve(&zc, attrp) == 0; (void) zap_cursor_advance(&zc)) { - dsl_deadlist_open(&ll, mos, attrp->za_first_integer); + VERIFY0(dsl_deadlist_open(&ll, mos, attrp->za_first_integer)); func(&ll, arg); dsl_deadlist_close(&ll); } @@ -8227,14 +8283,11 @@ dump_mos_leaks(spa_t *spa) } } - if (spa->spa_brt != NULL) { - brt_t *brt = spa->spa_brt; - for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; - if (brtvd != NULL && brtvd->bv_initiated) { - mos_obj_refd(brtvd->bv_mos_brtvdev); - mos_obj_refd(brtvd->bv_mos_entries); - } + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + if (brtvd->bv_initiated) { + mos_obj_refd(brtvd->bv_mos_brtvdev); + mos_obj_refd(brtvd->bv_mos_entries); } } diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index 80d81c1154ae..d0a9bf1aacb6 100644 --- a/cmd/zdb/zdb_il.c +++ 
b/cmd/zdb/zdb_il.c @@ -67,19 +67,19 @@ zil_prt_rec_create(zilog_t *zilog, int txtype, const void *arg) const lr_create_t *lrc = arg; const _lr_create_t *lr = &lrc->lr_create; time_t crtime = lr->lr_crtime[0]; - char *name, *link; + const char *name, *link; lr_attr_t *lrattr; - name = (char *)(lr + 1); + name = (const char *)&lrc->lr_data[0]; if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR || lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) { - lrattr = (lr_attr_t *)(lr + 1); + lrattr = (lr_attr_t *)&lrc->lr_data[0]; name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); } if (txtype == TX_SYMLINK) { - link = name + strlen(name) + 1; + link = (const char *)&lrc->lr_data[strlen(name) + 1]; (void) printf("%s%s -> %s\n", tab_prefix, name, link); } else if (txtype != TX_MKXATTR) { (void) printf("%s%s\n", tab_prefix, name); @@ -104,7 +104,7 @@ zil_prt_rec_remove(zilog_t *zilog, int txtype, const void *arg) const lr_remove_t *lr = arg; (void) printf("%sdoid %llu, name %s\n", tab_prefix, - (u_longlong_t)lr->lr_doid, (char *)(lr + 1)); + (u_longlong_t)lr->lr_doid, (const char *)&lr->lr_data[0]); } static void @@ -115,7 +115,7 @@ zil_prt_rec_link(zilog_t *zilog, int txtype, const void *arg) (void) printf("%sdoid %llu, link_obj %llu, name %s\n", tab_prefix, (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj, - (char *)(lr + 1)); + (const char *)&lr->lr_data[0]); } static void @@ -124,8 +124,8 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, const void *arg) (void) zilog, (void) txtype; const lr_rename_t *lrr = arg; const _lr_rename_t *lr = &lrr->lr_rename; - char *snm = (char *)(lr + 1); - char *tnm = snm + strlen(snm) + 1; + const char *snm = (const char *)&lrr->lr_data[0]; + const char *tnm = (const char *)&lrr->lr_data[strlen(snm) + 1]; (void) printf("%ssdoid %llu, tdoid %llu\n", tab_prefix, (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid); @@ -211,7 +211,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg) /* data is stored after the end of the lr_write record */ data = abd_alloc(lr->lr_length, B_FALSE); - abd_copy_from_buf(data, lr + 1, lr->lr_length); + abd_copy_from_buf(data, &lr->lr_data[0], lr->lr_length); } (void) printf("%s", tab_prefix); @@ -309,7 +309,7 @@ zil_prt_rec_setsaxattr(zilog_t *zilog, int txtype, const void *arg) (void) zilog, (void) txtype; const lr_setsaxattr_t *lr = arg; - char *name = (char *)(lr + 1); + const char *name = (const char *)&lr->lr_data[0]; (void) printf("%sfoid %llu\n", tab_prefix, (u_longlong_t)lr->lr_foid); @@ -318,7 +318,7 @@ zil_prt_rec_setsaxattr(zilog_t *zilog, int txtype, const void *arg) (void) printf("%sXAT_VALUE NULL\n", tab_prefix); } else { (void) printf("%sXAT_VALUE ", tab_prefix); - char *val = name + (strlen(name) + 1); + const char *val = (const char *)&lr->lr_data[strlen(name) + 1]; for (int i = 0; i < lr->lr_size; i++) { (void) printf("%c", *val); val++; diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index 1ef5c631a438..6f994b68a127 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -445,8 +445,8 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, * its a loopback event from spa_async_remove(). Just * ignore it. 
*/ - if (vs->vs_state == VDEV_STATE_REMOVED && - state == VDEV_STATE_REMOVED) + if ((vs->vs_state == VDEV_STATE_REMOVED && state == + VDEV_STATE_REMOVED) || vs->vs_state == VDEV_STATE_OFFLINE) return; /* Remove the vdev since device is unplugged */ diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 6a45a063d91a..506427a10672 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -512,7 +512,8 @@ get_usage(zpool_help_t idx) return (gettext("\tinitialize [-c | -s | -u] [-w] " "[ ...]\n")); case HELP_SCRUB: - return (gettext("\tscrub [-s | -p] [-w] [-e] ...\n")); + return (gettext("\tscrub [-e | -s | -p | -C] [-w] " + " ...\n")); case HELP_RESILVER: return (gettext("\tresilver ...\n")); case HELP_TRIM: @@ -6882,8 +6883,13 @@ collect_pool(zpool_handle_t *zhp, list_cbdata_t *cb) if (cb->cb_json) { if (pl->pl_prop == ZPOOL_PROP_NAME) continue; + const char *prop_name; + if (pl->pl_prop != ZPROP_USERPROP) + prop_name = zpool_prop_to_name(pl->pl_prop); + else + prop_name = pl->pl_user_prop; (void) zprop_nvlist_one_property( - zpool_prop_to_name(pl->pl_prop), propstr, + prop_name, propstr, sourcetype, NULL, NULL, props, cb->cb_json_as_int); } else { /* @@ -8424,12 +8430,13 @@ wait_callback(zpool_handle_t *zhp, void *data) } /* - * zpool scrub [-s | -p] [-w] [-e] ... + * zpool scrub [-e | -s | -p | -C] [-w] ... * * -e Only scrub blocks in the error log. * -s Stop. Stops any in-progress scrub. * -p Pause. Pause in-progress scrub. * -w Wait. Blocks until scrub has completed. + * -C Scrub from last saved txg. */ int zpool_do_scrub(int argc, char **argv) @@ -8445,9 +8452,10 @@ zpool_do_scrub(int argc, char **argv) boolean_t is_error_scrub = B_FALSE; boolean_t is_pause = B_FALSE; boolean_t is_stop = B_FALSE; + boolean_t is_txg_continue = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, "spwe")) != -1) { + while ((c = getopt(argc, argv, "spweC")) != -1) { switch (c) { case 'e': is_error_scrub = B_TRUE; @@ -8461,6 +8469,9 @@ zpool_do_scrub(int argc, char **argv) case 'w': wait = B_TRUE; break; + case 'C': + is_txg_continue = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -8472,6 +8483,18 @@ zpool_do_scrub(int argc, char **argv) (void) fprintf(stderr, gettext("invalid option " "combination :-s and -p are mutually exclusive\n")); usage(B_FALSE); + } else if (is_pause && is_txg_continue) { + (void) fprintf(stderr, gettext("invalid option " + "combination :-p and -C are mutually exclusive\n")); + usage(B_FALSE); + } else if (is_stop && is_txg_continue) { + (void) fprintf(stderr, gettext("invalid option " + "combination :-s and -C are mutually exclusive\n")); + usage(B_FALSE); + } else if (is_error_scrub && is_txg_continue) { + (void) fprintf(stderr, gettext("invalid option " + "combination :-e and -C are mutually exclusive\n")); + usage(B_FALSE); } else { if (is_error_scrub) cb.cb_type = POOL_SCAN_ERRORSCRUB; @@ -8480,6 +8503,8 @@ zpool_do_scrub(int argc, char **argv) cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; } else if (is_stop) { cb.cb_type = POOL_SCAN_NONE; + } else if (is_txg_continue) { + cb.cb_scrub_cmd = POOL_SCRUB_FROM_LAST_TXG; } else { cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; } @@ -10029,9 +10054,8 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) (void) printf(gettext("Removal of %s canceled on %s"), vdev_name, ctime(&end)); } else { - uint64_t copied, total, elapsed, mins_left, hours_left; + uint64_t copied, total, elapsed, rate, mins_left, hours_left; double fraction_done; - uint_t rate; 
assert(prs->prs_state == DSS_SCANNING); @@ -10127,9 +10151,8 @@ print_raidz_expand_status(zpool_handle_t *zhp, pool_raidz_expand_stat_t *pres) copied_buf, time_buf, ctime((time_t *)&end)); } else { char examined_buf[7], total_buf[7], rate_buf[7]; - uint64_t copied, total, elapsed, secs_left; + uint64_t copied, total, elapsed, rate, secs_left; double fraction_done; - uint_t rate; assert(pres->pres_state == DSS_SCANNING); diff --git a/config/kernel-register_sysctl_table.m4 b/config/kernel-register_sysctl_table.m4 index 12ffe9d95142..8dc17e2d42f9 100644 --- a/config/kernel-register_sysctl_table.m4 +++ b/config/kernel-register_sysctl_table.m4 @@ -36,7 +36,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ], [ ZFS_LINUX_TEST_SRC([has_register_sysctl_sz], [ #include ],[ - struct ctl_table test_table[] __attribute__((unused)) = {0}; + struct ctl_table test_table[] __attribute__((unused)) = {{}}; register_sysctl_sz("", test_table, 0); ]) ]) diff --git a/include/os/freebsd/spl/sys/debug.h b/include/os/freebsd/spl/sys/debug.h index 9eb424dd0373..fd22e6b001dc 100644 --- a/include/os/freebsd/spl/sys/debug.h +++ b/include/os/freebsd/spl/sys/debug.h @@ -105,7 +105,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) __FILE__, __FUNCTION__, __LINE__)) #define VERIFYF(cond, str, ...) do { \ - if (unlikely(!cond)) \ + if (unlikely(!(cond))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY(" #cond ") failed " str "\n", __VA_ARGS__);\ } while (0) @@ -201,7 +201,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) "failed (%lld " #OP " %lld) " STR "\n", \ (long long)(_verify3_left), \ (long long)(_verify3_right), \ - __VA_ARGS); \ + __VA_ARGS__); \ } while (0) #define VERIFY3UF(LEFT, OP, RIGHT, STR, ...) do { \ @@ -213,7 +213,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) "failed (%llu " #OP " %llu) " STR "\n", \ (unsigned long long)(_verify3_left), \ (unsigned long long)(_verify3_right), \ - __VA_ARGS); \ + __VA_ARGS__); \ } while (0) #define VERIFY3PF(LEFT, OP, RIGHT, STR, ...) do { \ diff --git a/include/os/freebsd/spl/sys/sdt.h b/include/os/freebsd/spl/sys/sdt.h index e2c4830cb964..aa3688718ae7 100644 --- a/include/os/freebsd/spl/sys/sdt.h +++ b/include/os/freebsd/spl/sys/sdt.h @@ -31,9 +31,9 @@ #include_next #ifdef KDTRACE_HOOKS -/* BEGIN CSTYLED */ SDT_PROBE_DECLARE(sdt, , , set__error); +/* BEGIN CSTYLED */ #define SET_ERROR(err) ({ \ SDT_PROBE1(sdt, , , set__error, (uintptr_t)err); \ err; \ diff --git a/include/os/freebsd/spl/sys/vnode.h b/include/os/freebsd/spl/sys/vnode.h index 76ea3eff3792..b9d3e81d0812 100644 --- a/include/os/freebsd/spl/sys/vnode.h +++ b/include/os/freebsd/spl/sys/vnode.h @@ -68,47 +68,30 @@ enum symfollow { NO_FOLLOW = NOFOLLOW }; #include typedef struct vop_vector vnodeops_t; -#define VOP_FID VOP_VPTOFH #define vop_fid vop_vptofh #define vop_fid_args vop_vptofh_args #define a_fid a_fhp -#define rootvfs (rootvnode == NULL ? 
NULL : rootvnode->v_mount) - -#ifndef IN_BASE -static __inline int -vn_is_readonly(vnode_t *vp) -{ - return (vp->v_mount->mnt_flag & MNT_RDONLY); -} -#endif #define vn_vfswlock(vp) (0) #define vn_vfsunlock(vp) do { } while (0) -#define vn_ismntpt(vp) \ - ((vp)->v_type == VDIR && (vp)->v_mountedhere != NULL) -#define vn_mountedvfs(vp) ((vp)->v_mountedhere) + +#ifndef IN_BASE #define vn_has_cached_data(vp) \ ((vp)->v_object != NULL && \ (vp)->v_object->resident_page_count > 0) -#ifndef IN_BASE static __inline void vn_flush_cached_data(vnode_t *vp, boolean_t sync) { if (vm_object_mightbedirty(vp->v_object)) { int flags = sync ? OBJPC_SYNC : 0; - vn_lock(vp, LK_SHARED | LK_RETRY); zfs_vmobject_wlock(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, flags); zfs_vmobject_wunlock(vp->v_object); - VOP_UNLOCK(vp); } } #endif -#define vn_exists(vp) do { } while (0) -#define vn_invalid(vp) do { } while (0) -#define vn_free(vp) do { } while (0) #define vn_matchops(vp, vops) ((vp)->v_op == &(vops)) #define VN_HOLD(v) vref(v) @@ -123,9 +106,6 @@ vn_flush_cached_data(vnode_t *vp, boolean_t sync) #define vnevent_rename_dest(vp, dvp, name, ct) do { } while (0) #define vnevent_rename_dest_dir(vp, ct) do { } while (0) -#define specvp(vp, rdev, type, cr) (VN_HOLD(vp), (vp)) -#define MANDLOCK(vp, mode) (0) - /* * We will use va_spare is place of Solaris' va_mask. * This field is initialized in zfs_setattr(). diff --git a/include/os/linux/spl/sys/debug.h b/include/os/linux/spl/sys/debug.h index f041dde34fc8..3459d6979fe8 100644 --- a/include/os/linux/spl/sys/debug.h +++ b/include/os/linux/spl/sys/debug.h @@ -109,7 +109,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) __FILE__, __FUNCTION__, __LINE__)) #define VERIFYF(cond, str, ...) do { \ - if (unlikely(!cond)) \ + if (unlikely(!(cond))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY(" #cond ") failed " str "\n", __VA_ARGS__);\ } while (0) @@ -205,7 +205,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) "failed (%lld " #OP " %lld) " STR "\n", \ (long long)(_verify3_left), \ (long long)(_verify3_right), \ - __VA_ARGS); \ + __VA_ARGS__); \ } while (0) #define VERIFY3UF(LEFT, OP, RIGHT, STR, ...) do { \ @@ -217,7 +217,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) "failed (%llu " #OP " %llu) " STR "\n", \ (unsigned long long)(_verify3_left), \ (unsigned long long)(_verify3_right), \ - __VA_ARGS); \ + __VA_ARGS__); \ } while (0) #define VERIFY3PF(LEFT, OP, RIGHT, STR, ...) 
do { \ diff --git a/include/sys/arc.h b/include/sys/arc.h index 883c07b4ff3d..5148905c93d8 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -347,6 +347,7 @@ void l2arc_fini(void); void l2arc_start(void); void l2arc_stop(void); void l2arc_spa_rebuild_start(spa_t *spa); +void l2arc_spa_rebuild_stop(spa_t *spa); #ifndef _KERNEL extern boolean_t arc_watch; diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 01693d72dda8..b2839bdf1485 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -942,6 +942,7 @@ typedef struct arc_sums { wmsum_t arcstat_evict_l2_eligible_mru; wmsum_t arcstat_evict_l2_ineligible; wmsum_t arcstat_evict_l2_skip; + wmsum_t arcstat_hash_elements; wmsum_t arcstat_hash_collisions; wmsum_t arcstat_hash_chains; aggsum_t arcstat_size; diff --git a/include/sys/brt_impl.h b/include/sys/brt_impl.h index 9cc06fbb2c3a..168d81f17b72 100644 --- a/include/sys/brt_impl.h +++ b/include/sys/brt_impl.h @@ -86,28 +86,38 @@ typedef struct brt_vdev_phys { uint64_t bvp_savedspace; } brt_vdev_phys_t; -typedef struct brt_vdev { +struct brt_vdev { /* - * VDEV id. + * Pending changes from open contexts. */ - uint64_t bv_vdevid; + kmutex_t bv_pending_lock; + avl_tree_t bv_pending_tree[TXG_SIZE]; /* - * Is the structure initiated? - * (bv_entcount and bv_bitmap are allocated?) + * Protects bv_mos_*. */ - boolean_t bv_initiated; + krwlock_t bv_mos_entries_lock ____cacheline_aligned; + /* + * Protects all the fields starting from bv_initiated. + */ + krwlock_t bv_lock ____cacheline_aligned; + /* + * VDEV id. + */ + uint64_t bv_vdevid ____cacheline_aligned; /* * Object number in the MOS for the entcount array and brt_vdev_phys. */ uint64_t bv_mos_brtvdev; /* - * Object number in the MOS for the entries table. + * Object number in the MOS and dnode for the entries table. */ uint64_t bv_mos_entries; + dnode_t *bv_mos_entries_dnode; /* - * Entries to sync. + * Is the structure initiated? + * (bv_entcount and bv_bitmap are allocated?) */ - avl_tree_t bv_tree; + boolean_t bv_initiated; /* * Does the bv_entcount[] array needs byte swapping? */ @@ -120,6 +130,26 @@ typedef struct brt_vdev { * This is the array with BRT entry count per BRT_RANGESIZE. */ uint16_t *bv_entcount; + /* + * bv_entcount[] potentially can be a bit too big to sychronize it all + * when we just changed few entcounts. The fields below allow us to + * track updates to bv_entcount[] array since the last sync. + * A single bit in the bv_bitmap represents as many entcounts as can + * fit into a single BRT_BLOCKSIZE. + * For example we have 65536 entcounts in the bv_entcount array + * (so the whole array is 128kB). We updated bv_entcount[2] and + * bv_entcount[5]. In that case only first bit in the bv_bitmap will + * be set and we will write only first BRT_BLOCKSIZE out of 128kB. + */ + ulong_t *bv_bitmap; + /* + * bv_entcount[] needs updating on disk. + */ + boolean_t bv_entcount_dirty; + /* + * brt_vdev_phys needs updating on disk. + */ + boolean_t bv_meta_dirty; /* * Sum of all bv_entcount[]s. */ @@ -133,65 +163,27 @@ typedef struct brt_vdev { */ uint64_t bv_savedspace; /* - * brt_vdev_phys needs updating on disk. - */ - boolean_t bv_meta_dirty; - /* - * bv_entcount[] needs updating on disk. - */ - boolean_t bv_entcount_dirty; - /* - * bv_entcount[] potentially can be a bit too big to sychronize it all - * when we just changed few entcounts. The fields below allow us to - * track updates to bv_entcount[] array since the last sync. 
- * A single bit in the bv_bitmap represents as many entcounts as can - * fit into a single BRT_BLOCKSIZE. - * For example we have 65536 entcounts in the bv_entcount array - * (so the whole array is 128kB). We updated bv_entcount[2] and - * bv_entcount[5]. In that case only first bit in the bv_bitmap will - * be set and we will write only first BRT_BLOCKSIZE out of 128kB. + * Entries to sync. */ - ulong_t *bv_bitmap; - uint64_t bv_nblocks; -} brt_vdev_t; - -/* - * In-core brt - */ -typedef struct brt { - krwlock_t brt_lock; - spa_t *brt_spa; -#define brt_mos brt_spa->spa_meta_objset - uint64_t brt_rangesize; - uint64_t brt_usedspace; - uint64_t brt_savedspace; - avl_tree_t brt_pending_tree[TXG_SIZE]; - kmutex_t brt_pending_lock[TXG_SIZE]; - /* Sum of all entries across all bv_trees. */ - uint64_t brt_nentries; - brt_vdev_t *brt_vdevs; - uint64_t brt_nvdevs; -} brt_t; + avl_tree_t bv_tree; +}; -/* Size of bre_offset / sizeof (uint64_t). */ +/* Size of offset / sizeof (uint64_t). */ #define BRT_KEY_WORDS (1) +#define BRE_OFFSET(bre) (DVA_GET_OFFSET(&(bre)->bre_bp.blk_dva[0])) + /* * In-core brt entry. - * On-disk we use bre_offset as the key and bre_refcount as the value. + * On-disk we use ZAP with offset as the key and count as the value. */ typedef struct brt_entry { - uint64_t bre_offset; - uint64_t bre_refcount; avl_node_t bre_node; + blkptr_t bre_bp; + uint64_t bre_count; + uint64_t bre_pcount; } brt_entry_t; -typedef struct brt_pending_entry { - blkptr_t bpe_bp; - int bpe_count; - avl_node_t bpe_node; -} brt_pending_entry_t; - #ifdef __cplusplus } #endif diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 22cbd7fc73b6..29f715039d29 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -381,6 +381,7 @@ typedef struct dmu_buf { #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_ERRORSCRUB "error_scrub" +#define DMU_POOL_LAST_SCRUBBED_TXG "last_scrubbed_txg" #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" diff --git a/include/sys/dsl_deadlist.h b/include/sys/dsl_deadlist.h index 3feb3bbf062f..798f9e3f6245 100644 --- a/include/sys/dsl_deadlist.h +++ b/include/sys/dsl_deadlist.h @@ -89,7 +89,7 @@ extern int zfs_livelist_min_percent_shared; typedef int deadlist_iter_t(void *args, dsl_deadlist_entry_t *dle); -void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object); +int dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object); void dsl_deadlist_close(dsl_deadlist_t *dl); void dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *arg); uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx); diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h index f7c0d9acd10d..04ea7a8f293b 100644 --- a/include/sys/dsl_dir.h +++ b/include/sys/dsl_dir.h @@ -198,7 +198,7 @@ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx); void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx); boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); -void dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj); +int dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj); void dsl_dir_livelist_close(dsl_dir_t *dd); void dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total); int dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity, diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 63734dbc176f..ef181c3ff2cd 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -179,6 
+179,12 @@ typedef struct dsl_scan { dsl_errorscrub_phys_t errorscrub_phys; } dsl_scan_t; +typedef struct { + pool_scan_func_t func; + uint64_t txgstart; + uint64_t txgend; +} setup_sync_arg_t; + typedef struct dsl_scan_io_queue dsl_scan_io_queue_t; void scan_init(void); @@ -189,7 +195,8 @@ void dsl_scan_setup_sync(void *, dmu_tx_t *); void dsl_scan_fini(struct dsl_pool *dp); void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); int dsl_scan_cancel(struct dsl_pool *); -int dsl_scan(struct dsl_pool *, pool_scan_func_t); +int dsl_scan(struct dsl_pool *, pool_scan_func_t, uint64_t starttxg, + uint64_t txgend); void dsl_scan_assess_vdev(struct dsl_pool *dp, vdev_t *vd); boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp); boolean_t dsl_errorscrubbing(const struct dsl_pool *dp); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 1676020d04d3..dc474e3739f3 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -265,6 +265,7 @@ typedef enum { ZPOOL_PROP_DEDUP_TABLE_SIZE, ZPOOL_PROP_DEDUP_TABLE_QUOTA, ZPOOL_PROP_DEDUPCACHED, + ZPOOL_PROP_LAST_SCRUBBED_TXG, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -1088,6 +1089,7 @@ typedef enum pool_scan_func { typedef enum pool_scrub_cmd { POOL_SCRUB_NORMAL = 0, POOL_SCRUB_PAUSE, + POOL_SCRUB_FROM_LAST_TXG, POOL_SCRUB_FLAGS_END } pool_scrub_cmd_t; diff --git a/include/sys/spa.h b/include/sys/spa.h index ca30b60c0af7..510d1119bffd 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -53,6 +53,7 @@ extern "C" { /* * Forward references that lots of things need. */ +typedef struct brt_vdev brt_vdev_t; typedef struct spa spa_t; typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; @@ -821,6 +822,8 @@ extern void spa_l2cache_drop(spa_t *spa); /* scanning */ extern int spa_scan(spa_t *spa, pool_scan_func_t func); +extern int spa_scan_range(spa_t *spa, pool_scan_func_t func, uint64_t txgstart, + uint64_t txgend); extern int spa_scan_stop(spa_t *spa); extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); @@ -1079,6 +1082,7 @@ extern uint64_t spa_get_deadman_failmode(spa_t *spa); extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode); extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); +extern uint64_t spa_get_last_scrubbed_txg(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); extern space_map_t *spa_syncing_log_sm(spa_t *spa); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 7811abbb9ce3..b0a2d46ff2c4 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -318,6 +318,7 @@ struct spa { uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */ uint64_t spa_scan_pass_exam; /* examined bytes per pass */ uint64_t spa_scan_pass_issued; /* issued bytes per pass */ + uint64_t spa_scrubbed_last_txg; /* last txg scrubbed */ /* error scrub pause time in milliseconds */ uint64_t spa_scan_pass_errorscrub_pause; @@ -412,8 +413,12 @@ struct spa { uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */ uint64_t spa_dedup_checksum; /* default dedup checksum */ uint64_t spa_dspace; /* dspace in normal class */ + uint64_t spa_rdspace; /* raw (non-dedup) --//-- */ boolean_t spa_active_ddt_prune; /* ddt prune process active */ - struct brt *spa_brt; /* in-core BRT */ + brt_vdev_t **spa_brt_vdevs; /* array of per-vdev BRTs */ + uint64_t spa_brt_nvdevs; /* number of vdevs in BRT */ + uint64_t spa_brt_rangesize; /* pool's BRT range size */ + krwlock_t spa_brt_lock; /* Protects brt_vdevs/nvdevs */ kmutex_t 
spa_vdev_top_lock; /* dueling offline/remove */ kmutex_t spa_proc_lock; /* protects spa_proc* */ kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ diff --git a/include/sys/zap.h b/include/sys/zap.h index 53166e094a72..c8d24b1100be 100644 --- a/include/sys/zap.h +++ b/include/sys/zap.h @@ -223,11 +223,15 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, boolean_t *normalization_conflictp); int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); +int zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name); int zap_prefetch_object(objset_t *os, uint64_t zapobj); int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints); +int zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints); int zap_lookup_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf); @@ -236,9 +240,6 @@ int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp); -int zap_count_write_by_dnode(dnode_t *dn, const char *name, - int add, zfs_refcount_t *towrite, zfs_refcount_t *tooverwrite); - /* * Create an attribute with the given name and value. * diff --git a/lib/libspl/atomic.c b/lib/libspl/atomic.c index 8cc350710ba0..f61f5fcc47f5 100644 --- a/lib/libspl/atomic.c +++ b/lib/libspl/atomic.c @@ -35,7 +35,6 @@ (void) __atomic_add_fetch(target, 1, __ATOMIC_SEQ_CST); \ } -/* BEGIN CSTYLED */ ATOMIC_INC(8, uint8_t) ATOMIC_INC(16, uint16_t) ATOMIC_INC(32, uint32_t) @@ -44,7 +43,6 @@ ATOMIC_INC(uchar, uchar_t) ATOMIC_INC(ushort, ushort_t) ATOMIC_INC(uint, uint_t) ATOMIC_INC(ulong, ulong_t) -/* END CSTYLED */ #define ATOMIC_DEC(name, type) \ @@ -53,7 +51,6 @@ ATOMIC_INC(ulong, ulong_t) (void) __atomic_sub_fetch(target, 1, __ATOMIC_SEQ_CST); \ } -/* BEGIN CSTYLED */ ATOMIC_DEC(8, uint8_t) ATOMIC_DEC(16, uint16_t) ATOMIC_DEC(32, uint32_t) @@ -62,7 +59,6 @@ ATOMIC_DEC(uchar, uchar_t) ATOMIC_DEC(ushort, ushort_t) ATOMIC_DEC(uint, uint_t) ATOMIC_DEC(ulong, ulong_t) -/* END CSTYLED */ #define ATOMIC_ADD(name, type1, type2) \ @@ -77,7 +73,6 @@ atomic_add_ptr(volatile void *target, ssize_t bits) (void) __atomic_add_fetch((void **)target, bits, __ATOMIC_SEQ_CST); } -/* BEGIN CSTYLED */ ATOMIC_ADD(8, uint8_t, int8_t) ATOMIC_ADD(16, uint16_t, int16_t) ATOMIC_ADD(32, uint32_t, int32_t) @@ -86,7 +81,6 @@ ATOMIC_ADD(char, uchar_t, signed char) ATOMIC_ADD(short, ushort_t, short) ATOMIC_ADD(int, uint_t, int) ATOMIC_ADD(long, ulong_t, long) -/* END CSTYLED */ #define ATOMIC_SUB(name, type1, type2) \ @@ -101,7 +95,6 @@ atomic_sub_ptr(volatile void *target, ssize_t bits) (void) __atomic_sub_fetch((void **)target, bits, __ATOMIC_SEQ_CST); } -/* BEGIN CSTYLED */ ATOMIC_SUB(8, uint8_t, int8_t) ATOMIC_SUB(16, uint16_t, int16_t) ATOMIC_SUB(32, uint32_t, int32_t) @@ -110,7 +103,6 @@ ATOMIC_SUB(char, uchar_t, signed char) ATOMIC_SUB(short, ushort_t, short) ATOMIC_SUB(int, uint_t, int) ATOMIC_SUB(long, ulong_t, long) -/* END CSTYLED */ #define ATOMIC_OR(name, type) \ @@ -119,7 +111,6 @@ ATOMIC_SUB(long, ulong_t, long) (void) __atomic_or_fetch(target, bits, __ATOMIC_SEQ_CST); \ } -/* BEGIN CSTYLED */ ATOMIC_OR(8, uint8_t) ATOMIC_OR(16, uint16_t) ATOMIC_OR(32, uint32_t) 
@@ -128,7 +119,6 @@ ATOMIC_OR(uchar, uchar_t) ATOMIC_OR(ushort, ushort_t) ATOMIC_OR(uint, uint_t) ATOMIC_OR(ulong, ulong_t) -/* END CSTYLED */ #define ATOMIC_AND(name, type) \ @@ -137,7 +127,6 @@ ATOMIC_OR(ulong, ulong_t) (void) __atomic_and_fetch(target, bits, __ATOMIC_SEQ_CST); \ } -/* BEGIN CSTYLED */ ATOMIC_AND(8, uint8_t) ATOMIC_AND(16, uint16_t) ATOMIC_AND(32, uint32_t) @@ -146,7 +135,6 @@ ATOMIC_AND(uchar, uchar_t) ATOMIC_AND(ushort, ushort_t) ATOMIC_AND(uint, uint_t) ATOMIC_AND(ulong, ulong_t) -/* END CSTYLED */ /* @@ -159,7 +147,6 @@ ATOMIC_AND(ulong, ulong_t) return (__atomic_add_fetch(target, 1, __ATOMIC_SEQ_CST)); \ } -/* BEGIN CSTYLED */ ATOMIC_INC_NV(8, uint8_t) ATOMIC_INC_NV(16, uint16_t) ATOMIC_INC_NV(32, uint32_t) @@ -168,7 +155,6 @@ ATOMIC_INC_NV(uchar, uchar_t) ATOMIC_INC_NV(ushort, ushort_t) ATOMIC_INC_NV(uint, uint_t) ATOMIC_INC_NV(ulong, ulong_t) -/* END CSTYLED */ #define ATOMIC_DEC_NV(name, type) \ @@ -177,7 +163,6 @@ ATOMIC_INC_NV(ulong, ulong_t) return (__atomic_sub_fetch(target, 1, __ATOMIC_SEQ_CST)); \ } -/* BEGIN CSTYLED */ ATOMIC_DEC_NV(8, uint8_t) ATOMIC_DEC_NV(16, uint16_t) ATOMIC_DEC_NV(32, uint32_t) @@ -186,7 +171,6 @@ ATOMIC_DEC_NV(uchar, uchar_t) ATOMIC_DEC_NV(ushort, ushort_t) ATOMIC_DEC_NV(uint, uint_t) ATOMIC_DEC_NV(ulong, ulong_t) -/* END CSTYLED */ #define ATOMIC_ADD_NV(name, type1, type2) \ @@ -201,7 +185,6 @@ atomic_add_ptr_nv(volatile void *target, ssize_t bits) return (__atomic_add_fetch((void **)target, bits, __ATOMIC_SEQ_CST)); } -/* BEGIN CSTYLED */ ATOMIC_ADD_NV(8, uint8_t, int8_t) ATOMIC_ADD_NV(16, uint16_t, int16_t) ATOMIC_ADD_NV(32, uint32_t, int32_t) @@ -210,7 +193,6 @@ ATOMIC_ADD_NV(char, uchar_t, signed char) ATOMIC_ADD_NV(short, ushort_t, short) ATOMIC_ADD_NV(int, uint_t, int) ATOMIC_ADD_NV(long, ulong_t, long) -/* END CSTYLED */ #define ATOMIC_SUB_NV(name, type1, type2) \ @@ -225,7 +207,6 @@ atomic_sub_ptr_nv(volatile void *target, ssize_t bits) return (__atomic_sub_fetch((void **)target, bits, __ATOMIC_SEQ_CST)); } -/* BEGIN CSTYLED */ ATOMIC_SUB_NV(8, uint8_t, int8_t) ATOMIC_SUB_NV(char, uchar_t, signed char) ATOMIC_SUB_NV(16, uint16_t, int16_t) @@ -234,7 +215,6 @@ ATOMIC_SUB_NV(32, uint32_t, int32_t) ATOMIC_SUB_NV(int, uint_t, int) ATOMIC_SUB_NV(long, ulong_t, long) ATOMIC_SUB_NV(64, uint64_t, int64_t) -/* END CSTYLED */ #define ATOMIC_OR_NV(name, type) \ @@ -243,7 +223,6 @@ ATOMIC_SUB_NV(64, uint64_t, int64_t) return (__atomic_or_fetch(target, bits, __ATOMIC_SEQ_CST)); \ } -/* BEGIN CSTYLED */ ATOMIC_OR_NV(8, uint8_t) ATOMIC_OR_NV(16, uint16_t) ATOMIC_OR_NV(32, uint32_t) @@ -252,7 +231,6 @@ ATOMIC_OR_NV(uchar, uchar_t) ATOMIC_OR_NV(ushort, ushort_t) ATOMIC_OR_NV(uint, uint_t) ATOMIC_OR_NV(ulong, ulong_t) -/* END CSTYLED */ #define ATOMIC_AND_NV(name, type) \ @@ -261,7 +239,6 @@ ATOMIC_OR_NV(ulong, ulong_t) return (__atomic_and_fetch(target, bits, __ATOMIC_SEQ_CST)); \ } -/* BEGIN CSTYLED */ ATOMIC_AND_NV(8, uint8_t) ATOMIC_AND_NV(16, uint16_t) ATOMIC_AND_NV(32, uint32_t) @@ -270,7 +247,6 @@ ATOMIC_AND_NV(uchar, uchar_t) ATOMIC_AND_NV(ushort, ushort_t) ATOMIC_AND_NV(uint, uint_t) ATOMIC_AND_NV(ulong, ulong_t) -/* END CSTYLED */ /* @@ -300,7 +276,6 @@ atomic_cas_ptr(volatile void *target, void *exp, void *des) return (exp); } -/* BEGIN CSTYLED */ ATOMIC_CAS(8, uint8_t) ATOMIC_CAS(16, uint16_t) ATOMIC_CAS(32, uint32_t) @@ -309,7 +284,6 @@ ATOMIC_CAS(uchar, uchar_t) ATOMIC_CAS(ushort, ushort_t) ATOMIC_CAS(uint, uint_t) ATOMIC_CAS(ulong, ulong_t) -/* END CSTYLED */ /* @@ -322,7 +296,6 @@ ATOMIC_CAS(ulong, ulong_t) return 
(__atomic_exchange_n(target, bits, __ATOMIC_SEQ_CST)); \ } -/* BEGIN CSTYLED */ ATOMIC_SWAP(8, uint8_t) ATOMIC_SWAP(16, uint16_t) ATOMIC_SWAP(32, uint32_t) @@ -331,7 +304,6 @@ ATOMIC_SWAP(uchar, uchar_t) ATOMIC_SWAP(ushort, ushort_t) ATOMIC_SWAP(uint, uint_t) ATOMIC_SWAP(ulong, ulong_t) -/* END CSTYLED */ void * atomic_swap_ptr(volatile void *target, void *bits) diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index ac9ae233c72d..1f9fde6677d8 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -3132,7 +3132,8 @@ - + + @@ -5984,7 +5985,8 @@ - + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 44f2c6f19dff..f256535e8ea0 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -471,13 +471,15 @@ int zpool_get_userprop(zpool_handle_t *zhp, const char *propname, char *buf, size_t len, zprop_source_t *srctype) { - nvlist_t *nv, *nvl; + nvlist_t *nv; uint64_t ival; const char *value; zprop_source_t source = ZPROP_SRC_LOCAL; - nvl = zhp->zpool_props; - if (nvlist_lookup_nvlist(nvl, propname, &nv) == 0) { + if (zhp->zpool_props == NULL) + zpool_get_all_props(zhp); + + if (nvlist_lookup_nvlist(zhp->zpool_props, propname, &nv) == 0) { if (nvlist_lookup_uint64(nv, ZPROP_SOURCE, &ival) == 0) source = ival; verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 1f7e7b0e647e..7cc91f984a40 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -932,6 +932,7 @@ libzfs_run_process_impl(const char *path, char *argv[], char *env[], int flags, pid = fork(); if (pid == 0) { /* Child process */ + setpgid(0, 0); devnull_fd = open("/dev/null", O_WRONLY | O_CLOEXEC); if (devnull_fd < 0) diff --git a/man/man7/zpoolprops.7 b/man/man7/zpoolprops.7 index f4fcc620e4d9..7c0dd4caad3e 100644 --- a/man/man7/zpoolprops.7 +++ b/man/man7/zpoolprops.7 @@ -28,7 +28,7 @@ .\" Copyright (c) 2021, Colm Buckley .\" Copyright (c) 2023, Klara Inc. .\" -.Dd July 29, 2024 +.Dd November 18, 2024 .Dt ZPOOLPROPS 7 .Os . @@ -135,6 +135,19 @@ A unique identifier for the pool. The current health of the pool. Health can be one of .Sy ONLINE , DEGRADED , FAULTED , OFFLINE, REMOVED , UNAVAIL . +.It Sy last_scrubbed_txg +Indicates the transaction group (TXG) up to which the most recent scrub +operation has checked and repaired the dataset. +This provides insight into the data integrity status of their pool at +a specific point in time. +.Xr zpool-scrub 8 +can utilize this property to scan only data that has changed since the last +scrub completed, when given the +.Fl C +flag. +This property is not updated when performing an error scrub with the +.Fl e +flag. .It Sy leaked Space not released while .Sy freeing diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index ad9e7a42bfac..abccc4d086e0 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -23,7 +23,7 @@ .\" .\" lint-ok: WARNING: sections out of conventional order: Sh SYNOPSIS .\" -.Dd April 4, 2024 +.Dd December 2, 2024 .Dt ZINJECT 8 .Os . @@ -268,7 +268,7 @@ Run for this many seconds before reporting failure. .It Fl T Ar failure Set the failure type to one of .Sy all , -.Sy ioctl , +.Sy flush , .Sy claim , .Sy free , .Sy read , diff --git a/man/man8/zpool-remove.8 b/man/man8/zpool-remove.8 index b5cc6e4fc57e..00216b65a8d7 100644 --- a/man/man8/zpool-remove.8 +++ b/man/man8/zpool-remove.8 @@ -109,7 +109,7 @@ Stops and cancels an in-progress removal of a top-level vdev. .El . 
.Sh EXAMPLES -.\" These are, respectively, examples 14 from zpool.8 +.\" These are, respectively, examples 15 from zpool.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Removing a Mirrored top-level (Log or Data) Device The following commands remove the mirrored log device @@ -142,9 +142,43 @@ The command to remove the mirrored log .Ar mirror-2 No is : .Dl # Nm zpool Cm remove Ar tank mirror-2 .Pp +At this point, the log device no longer exists +(both sides of the mirror have been removed): +.Bd -literal -compact -offset Ds + pool: tank + state: ONLINE + scan: none requested +config: + + NAME STATE READ WRITE CKSUM + tank ONLINE 0 0 0 + mirror-0 ONLINE 0 0 0 + sda ONLINE 0 0 0 + sdb ONLINE 0 0 0 + mirror-1 ONLINE 0 0 0 + sdc ONLINE 0 0 0 + sdd ONLINE 0 0 0 +.Ed +.Pp The command to remove the mirrored data .Ar mirror-1 No is : .Dl # Nm zpool Cm remove Ar tank mirror-1 +.Pp +After +.Ar mirror-1 No has been evacuated, the pool remains redundant, but +the total amount of space is reduced: +.Bd -literal -compact -offset Ds + pool: tank + state: ONLINE + scan: none requested +config: + + NAME STATE READ WRITE CKSUM + tank ONLINE 0 0 0 + mirror-0 ONLINE 0 0 0 + sda ONLINE 0 0 0 + sdb ONLINE 0 0 0 +.Ed . .Sh SEE ALSO .Xr zpool-add 8 , diff --git a/man/man8/zpool-scrub.8 b/man/man8/zpool-scrub.8 index 03f3ad4991f9..676286b038da 100644 --- a/man/man8/zpool-scrub.8 +++ b/man/man8/zpool-scrub.8 @@ -26,7 +26,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd June 22, 2023 +.Dd November 18, 2024 .Dt ZPOOL-SCRUB 8 .Os . @@ -36,9 +36,8 @@ .Sh SYNOPSIS .Nm zpool .Cm scrub -.Op Fl s Ns | Ns Fl p +.Op Ns Fl e | Ns Fl p | Fl s Ns | Fl C Ns .Op Fl w -.Op Fl e .Ar pool Ns … . .Sh DESCRIPTION @@ -114,6 +113,10 @@ The pool must have been scrubbed at least once with the feature enabled to use this option. Error scrubbing cannot be run simultaneously with regular scrubbing or resilvering, nor can it be run when a regular scrub is paused. +.It Fl C +Continue scrub from last saved txg (see zpool +.Sy last_scrubbed_txg +property). .El .Sh EXAMPLES .Ss Example 1 diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 02a258f66708..b54a92f96151 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -405,9 +405,43 @@ The command to remove the mirrored log .Ar mirror-2 No is : .Dl # Nm zpool Cm remove Ar tank mirror-2 .Pp +At this point, the log device no longer exists +(both sides of the mirror have been removed): +.Bd -literal -compact -offset Ds + pool: tank + state: ONLINE + scan: none requested +config: + + NAME STATE READ WRITE CKSUM + tank ONLINE 0 0 0 + mirror-0 ONLINE 0 0 0 + sda ONLINE 0 0 0 + sdb ONLINE 0 0 0 + mirror-1 ONLINE 0 0 0 + sdc ONLINE 0 0 0 + sdd ONLINE 0 0 0 +.Ed +.Pp The command to remove the mirrored data .Ar mirror-1 No is : .Dl # Nm zpool Cm remove Ar tank mirror-1 +.Pp +After +.Ar mirror-1 No has been evacuated, the pool remains redundant, but +the total amount of space is reduced: +.Bd -literal -compact -offset Ds + pool: tank + state: ONLINE + scan: none requested +config: + + NAME STATE READ WRITE CKSUM + tank ONLINE 0 0 0 + mirror-0 ONLINE 0 0 0 + sda ONLINE 0 0 0 + sdb ONLINE 0 0 0 +.Ed . 
.Ss Example 16 : No Displaying expanded space on a device The following command displays the detailed information for the pool diff --git a/module/nvpair/nvpair.c b/module/nvpair/nvpair.c index 887f7d32df4a..9034873474fe 100644 --- a/module/nvpair/nvpair.c +++ b/module/nvpair/nvpair.c @@ -3281,7 +3281,6 @@ nvs_xdr_nvp_##type(XDR *xdrs, void *ptr, ...) \ #endif -/* BEGIN CSTYLED */ NVS_BUILD_XDRPROC_T(char); NVS_BUILD_XDRPROC_T(short); NVS_BUILD_XDRPROC_T(u_short); @@ -3289,7 +3288,6 @@ NVS_BUILD_XDRPROC_T(int); NVS_BUILD_XDRPROC_T(u_int); NVS_BUILD_XDRPROC_T(longlong_t); NVS_BUILD_XDRPROC_T(u_longlong_t); -/* END CSTYLED */ /* * The format of xdr encoded nvpair is: diff --git a/module/os/freebsd/spl/spl_dtrace.c b/module/os/freebsd/spl/spl_dtrace.c index 4b9cc65d641e..0a2fcf110d7b 100644 --- a/module/os/freebsd/spl/spl_dtrace.c +++ b/module/os/freebsd/spl/spl_dtrace.c @@ -31,5 +31,4 @@ #include #include -/* CSTYLED */ SDT_PROBE_DEFINE1(sdt, , , set__error, "int"); diff --git a/module/os/freebsd/zfs/dmu_os.c b/module/os/freebsd/zfs/dmu_os.c index 0a0af102ea82..370ce2d806e8 100644 --- a/module/os/freebsd/zfs/dmu_os.c +++ b/module/os/freebsd/zfs/dmu_os.c @@ -103,6 +103,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, db->db_offset + bufoff); thiscpy = MIN(PAGESIZE, tocpy - copied); va = zfs_map_page(*ma, &sf); + ASSERT(db->db_data != NULL); memcpy((char *)db->db_data + bufoff, va, thiscpy); zfs_unmap_page(sf); ma += 1; @@ -172,6 +173,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, ASSERT3U(db->db_size, >, PAGE_SIZE); bufoff = IDX_TO_OFF(m->pindex) % db->db_size; va = zfs_map_page(m, &sf); + ASSERT(db->db_data != NULL); memcpy(va, (char *)db->db_data + bufoff, PAGESIZE); zfs_unmap_page(sf); vm_page_valid(m); @@ -211,8 +213,10 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, */ tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff); ASSERT3S(tocpy, >=, 0); - if (m != bogus_page) + if (m != bogus_page) { + ASSERT(db->db_data != NULL); memcpy(va + pgoff, (char *)db->db_data + bufoff, tocpy); + } pgoff += tocpy; ASSERT3S(pgoff, >=, 0); @@ -290,6 +294,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, bufoff = IDX_TO_OFF(m->pindex) % db->db_size; tocpy = MIN(db->db_size - bufoff, PAGESIZE); va = zfs_map_page(m, &sf); + ASSERT(db->db_data != NULL); memcpy(va, (char *)db->db_data + bufoff, tocpy); if (tocpy < PAGESIZE) { ASSERT3S(i, ==, *rahead - 1); diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index c84cb7407a9c..7350b8a6d49f 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -187,12 +187,10 @@ param_set_arc_max(SYSCTL_HANDLER_ARGS) return (0); } -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, param_set_arc_max, "LU", "Maximum ARC size in bytes (LEGACY)"); -/* END CSTYLED */ int param_set_arc_min(SYSCTL_HANDLER_ARGS) @@ -218,12 +216,10 @@ param_set_arc_min(SYSCTL_HANDLER_ARGS) return (0); } -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, param_set_arc_min, "LU", "Minimum ARC size in bytes (LEGACY)"); -/* END CSTYLED */ extern uint_t zfs_arc_free_target; @@ -252,13 +248,11 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS) * NOTE: This sysctl is CTLFLAG_RW not CTLFLAG_RWTUN due to its dependency on * pagedaemon initialization. 
*/ -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, param_set_arc_free_target, "IU", "Desired number of free pages below which ARC triggers reclaim" " (LEGACY)"); -/* END CSTYLED */ int param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) @@ -278,84 +272,64 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) return (0); } -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, param_set_arc_no_grow_shift, "I", "log2(fraction of ARC which must be free to allow growing) (LEGACY)"); -/* END CSTYLED */ extern uint64_t l2arc_write_max; -/* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RWTUN, &l2arc_write_max, 0, "Max write bytes per interval (LEGACY)"); -/* END CSTYLED */ extern uint64_t l2arc_write_boost; -/* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RWTUN, &l2arc_write_boost, 0, "Extra write bytes during device warmup (LEGACY)"); -/* END CSTYLED */ extern uint64_t l2arc_headroom; -/* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RWTUN, &l2arc_headroom, 0, "Number of max device writes to precache (LEGACY)"); -/* END CSTYLED */ extern uint64_t l2arc_headroom_boost; -/* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom_boost, CTLFLAG_RWTUN, &l2arc_headroom_boost, 0, "Compressed l2arc_headroom multiplier (LEGACY)"); -/* END CSTYLED */ extern uint64_t l2arc_feed_secs; -/* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RWTUN, &l2arc_feed_secs, 0, "Seconds between L2ARC writing (LEGACY)"); -/* END CSTYLED */ extern uint64_t l2arc_feed_min_ms; -/* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RWTUN, &l2arc_feed_min_ms, 0, "Min feed interval in milliseconds (LEGACY)"); -/* END CSTYLED */ extern int l2arc_noprefetch; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RWTUN, &l2arc_noprefetch, 0, "Skip caching prefetched buffers (LEGACY)"); -/* END CSTYLED */ extern int l2arc_feed_again; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RWTUN, &l2arc_feed_again, 0, "Turbo L2ARC warmup (LEGACY)"); -/* END CSTYLED */ extern int l2arc_norw; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RWTUN, &l2arc_norw, 0, "No reads during writes (LEGACY)"); -/* END CSTYLED */ static int param_get_arc_state_size(SYSCTL_HANDLER_ARGS) @@ -370,7 +344,6 @@ param_get_arc_state_size(SYSCTL_HANDLER_ARGS) extern arc_state_t ARC_anon; -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, anon_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_anon, 0, param_get_arc_state_size, "Q", @@ -381,11 +354,9 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in anonymous state"); -/* END CSTYLED */ extern arc_state_t ARC_mru; -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_mru, 0, param_get_arc_state_size, "Q", @@ -396,11 +367,9 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in mru state"); -/* END CSTYLED */ extern arc_state_t ARC_mru_ghost; -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, 
OID_AUTO, mru_ghost_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_mru_ghost, 0, param_get_arc_state_size, "Q", @@ -411,11 +380,9 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in mru ghost state"); -/* END CSTYLED */ extern arc_state_t ARC_mfu; -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_mfu, 0, param_get_arc_state_size, "Q", @@ -426,11 +393,9 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in mfu state"); -/* END CSTYLED */ extern arc_state_t ARC_mfu_ghost; -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_mfu_ghost, 0, param_get_arc_state_size, "Q", @@ -441,11 +406,9 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in mfu ghost state"); -/* END CSTYLED */ extern arc_state_t ARC_uncached; -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, uncached_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_uncached, 0, param_get_arc_state_size, "Q", @@ -456,16 +419,13 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD, SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD, &ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in uncached state"); -/* END CSTYLED */ extern arc_state_t ARC_l2c_only; -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, l2c_only_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_l2c_only, 0, param_get_arc_state_size, "Q", "size of l2c_only state"); -/* END CSTYLED */ /* dbuf.c */ @@ -477,19 +437,15 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)"); extern uint32_t zfetch_max_distance; -/* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN, &zfetch_max_distance, 0, "Max bytes to prefetch per stream (LEGACY)"); -/* END CSTYLED */ extern uint32_t zfetch_max_idistance; -/* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN, &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream (LEGACY)"); -/* END CSTYLED */ /* dsl_pool.c */ @@ -527,12 +483,10 @@ param_set_active_allocator(SYSCTL_HANDLER_ARGS) */ extern int zfs_metaslab_sm_blksz_no_log; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_no_log, 0, "Block size for space map in pools with log space map disabled. " "Power of 2 greater than 4096."); -/* END CSTYLED */ /* * When the log space map feature is enabled, we accumulate a lot of @@ -541,12 +495,10 @@ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, */ extern int zfs_metaslab_sm_blksz_with_log; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_with_log, 0, "Block size for space map in pools with log space map enabled. " "Power of 2 greater than 4096."); -/* END CSTYLED */ /* * The in-core space map representation is more compact than its on-disk form. 
@@ -556,29 +508,23 @@ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, */ extern uint_t zfs_condense_pct; -/* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, &zfs_condense_pct, 0, "Condense on-disk spacemap when it is more than this many percents" " of in-memory counterpart"); -/* END CSTYLED */ extern uint_t zfs_remove_max_segment; -/* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs, OID_AUTO, remove_max_segment, CTLFLAG_RWTUN, &zfs_remove_max_segment, 0, "Largest contiguous segment ZFS will attempt to allocate when removing" " a device"); -/* END CSTYLED */ extern int zfs_removal_suspend_progress; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, CTLFLAG_RWTUN, &zfs_removal_suspend_progress, 0, "Ensures certain actions can happen while in the middle of a removal"); -/* END CSTYLED */ /* * Minimum size which forces the dynamic allocator to change @@ -588,12 +534,10 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, */ extern uint64_t metaslab_df_alloc_threshold; -/* BEGIN CSTYLED */ SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, &metaslab_df_alloc_threshold, 0, "Minimum size which forces the dynamic allocator to change its" " allocation strategy"); -/* END CSTYLED */ /* * The minimum free space, in percent, which must be available @@ -603,12 +547,10 @@ SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, */ extern uint_t metaslab_df_free_pct; -/* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, &metaslab_df_free_pct, 0, "The minimum free space, in percent, which must be available in a" " space map to continue allocations in a first-fit fashion"); -/* END CSTYLED */ /* mmp.c */ @@ -631,28 +573,22 @@ param_set_multihost_interval(SYSCTL_HANDLER_ARGS) extern int zfs_ccw_retry_interval; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN, &zfs_ccw_retry_interval, 0, "Configuration cache file write, retry after failure, interval" " (seconds)"); -/* END CSTYLED */ extern uint64_t zfs_max_missing_tvds_cachefile; -/* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, &zfs_max_missing_tvds_cachefile, 0, "Allow importing pools with missing top-level vdevs in cache file"); -/* END CSTYLED */ extern uint64_t zfs_max_missing_tvds_scan; -/* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, &zfs_max_missing_tvds_scan, 0, "Allow importing pools with missing top-level vdevs during scan"); -/* END CSTYLED */ /* spa_misc.c */ @@ -681,11 +617,9 @@ sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) return (0); } -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, NULL, 0, sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); -/* END CSTYLED */ int param_set_deadman_synctime(SYSCTL_HANDLER_ARGS) @@ -768,10 +702,8 @@ param_set_slop_shift(SYSCTL_HANDLER_ARGS) extern int space_map_ibs; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, &space_map_ibs, 0, "Space map indirect block shift"); -/* END CSTYLED */ /* vdev.c */ @@ -795,13 +727,11 @@ param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) return (0); } -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift), param_set_min_auto_ashift, "IU", "Min ashift used when creating new top-level vdev. 
(LEGACY)"); -/* END CSTYLED */ int param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) @@ -822,14 +752,12 @@ param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) return (0); } -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift), param_set_max_auto_ashift, "IU", "Max ashift used when optimizing for logical -> physical sector size on" " new top-level vdevs. (LEGACY)"); -/* END CSTYLED */ /* * Since the DTL space map of a vdev is not expected to have a lot of @@ -837,11 +765,9 @@ SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, */ extern int zfs_vdev_dtl_sm_blksz; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN, &zfs_vdev_dtl_sm_blksz, 0, "Block size for DTL space map. Power of 2 greater than 4096."); -/* END CSTYLED */ /* * vdev-wide space maps that have lots of entries written to them at @@ -850,19 +776,15 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, */ extern int zfs_vdev_standard_sm_blksz; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, &zfs_vdev_standard_sm_blksz, 0, "Block size for standard space map. Power of 2 greater than 4096."); -/* END CSTYLED */ extern int vdev_validate_skip; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, CTLFLAG_RDTUN, &vdev_validate_skip, 0, "Enable to bypass vdev_validate()."); -/* END CSTYLED */ /* vdev_mirror.c */ @@ -870,17 +792,13 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, extern uint_t zfs_vdev_max_active; -/* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, &zfs_vdev_max_active, 0, "The maximum number of I/Os of all types active for each device." " (LEGACY)"); -/* END CSTYLED */ /* zio.c */ -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, "Exclude metadata buffers from dumps as well"); -/* END CSTYLED */ diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index a2222a899380..b8c2c341dace 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -291,8 +291,12 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, case F_SEEK_HOLE: { off = *(offset_t *)data; + error = vn_lock(vp, LK_SHARED); + if (error) + return (error); /* offset parameter is in/out */ error = zfs_holey(VTOZ(vp), com, &off); + VOP_UNLOCK(vp); if (error) return (error); *(offset_t *)data = off; @@ -452,8 +456,10 @@ mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio) if (!vm_page_wired(pp) && pp->valid == 0 && vm_page_busy_tryupgrade(pp)) vm_page_free(pp); - else + else { + vm_page_deactivate_noreuse(pp); vm_page_sunbusy(pp); + } zfs_vmobject_wunlock(obj); } } else { @@ -3928,6 +3934,7 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0) return (zfs_vm_pagerret_error); + object = ma[0]->object; start = IDX_TO_OFF(ma[0]->pindex); end = IDX_TO_OFF(ma[count - 1]->pindex + 1); @@ -3936,33 +3943,45 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, * Note that we need to handle the case of the block size growing. 
*/ for (;;) { + uint64_t len; + blksz = zp->z_blksz; + len = roundup(end, blksz) - rounddown(start, blksz); + lr = zfs_rangelock_tryenter(&zp->z_rangelock, - rounddown(start, blksz), - roundup(end, blksz) - rounddown(start, blksz), RL_READER); + rounddown(start, blksz), len, RL_READER); if (lr == NULL) { - if (rahead != NULL) { - *rahead = 0; - rahead = NULL; - } - if (rbehind != NULL) { - *rbehind = 0; - rbehind = NULL; - } - break; + /* + * Avoid a deadlock with update_pages(). We need to + * hold the range lock when copying from the DMU, so + * give up the busy lock to allow update_pages() to + * proceed. We might need to allocate new pages, which + * isn't quite right since this allocation isn't subject + * to the page fault handler's OOM logic, but this is + * the best we can do for now. + */ + for (int i = 0; i < count; i++) + vm_page_xunbusy(ma[i]); + + lr = zfs_rangelock_enter(&zp->z_rangelock, + rounddown(start, blksz), len, RL_READER); + + zfs_vmobject_wlock(object); + (void) vm_page_grab_pages(object, OFF_TO_IDX(start), + VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_ZERO, + ma, count); + zfs_vmobject_wunlock(object); } if (blksz == zp->z_blksz) break; zfs_rangelock_exit(lr); } - object = ma[0]->object; zfs_vmobject_wlock(object); obj_size = object->un_pager.vnp.vnp_size; zfs_vmobject_wunlock(object); if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { - if (lr != NULL) - zfs_rangelock_exit(lr); + zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (zfs_vm_pagerret_bad); } @@ -3987,11 +4006,33 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, * ZFS will panic if we request DMU to read beyond the end of the last * allocated block. */ - error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count, &pgsin_b, - &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE)); + for (int i = 0; i < count; i++) { + int dummypgsin, count1, j, last_size; - if (lr != NULL) - zfs_rangelock_exit(lr); + if (vm_page_any_valid(ma[i])) { + ASSERT(vm_page_all_valid(ma[i])); + continue; + } + for (j = i + 1; j < count; j++) { + if (vm_page_any_valid(ma[j])) { + ASSERT(vm_page_all_valid(ma[j])); + break; + } + } + count1 = j - i; + dummypgsin = 0; + last_size = j == count ? + MIN(end, obj_size) - (end - PAGE_SIZE) : PAGE_SIZE; + error = dmu_read_pages(zfsvfs->z_os, zp->z_id, &ma[i], count1, + i == 0 ? &pgsin_b : &dummypgsin, + j == count ? 
&pgsin_a : &dummypgsin, + last_size); + if (error != 0) + break; + i += count1 - 1; + } + + zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, count*PAGE_SIZE); @@ -6159,7 +6200,7 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) } else { #if (__FreeBSD_version >= 1302506 && __FreeBSD_version < 1400000) || \ __FreeBSD_version >= 1400086 - vn_lock_pair(invp, false, LK_EXCLUSIVE, outvp, false, + vn_lock_pair(invp, false, LK_SHARED, outvp, false, LK_EXCLUSIVE); #else vn_lock_pair(invp, false, outvp, false); diff --git a/module/os/freebsd/zfs/zfs_znode_os.c b/module/os/freebsd/zfs/zfs_znode_os.c index a31ecc367414..31ca07a86dda 100644 --- a/module/os/freebsd/zfs/zfs_znode_os.c +++ b/module/os/freebsd/zfs/zfs_znode_os.c @@ -370,8 +370,6 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, */ if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs) ZTOV(zp)->v_flag |= VROOT; - - vn_exists(ZTOV(zp)); } void diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c index feaca93fb933..195ac58f6f1a 100644 --- a/module/os/freebsd/zfs/zio_crypt.c +++ b/module/os/freebsd/zfs/zio_crypt.c @@ -1823,7 +1823,6 @@ zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, } #if defined(_KERNEL) && defined(HAVE_SPL) -/* CSTYLED */ module_param(zfs_key_max_salt_uses, ulong, 0644); MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " "can be used for generating encryption keys before it is rotated"); diff --git a/module/os/linux/spl/spl-err.c b/module/os/linux/spl/spl-err.c index 29781b9515b2..81e520547dd7 100644 --- a/module/os/linux/spl/spl-err.c +++ b/module/os/linux/spl/spl-err.c @@ -33,7 +33,6 @@ * But we would still default to the current default of not to do that. */ static unsigned int spl_panic_halt; -/* CSTYLED */ module_param(spl_panic_halt, uint, 0644); MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures"); diff --git a/module/os/linux/spl/spl-generic.c b/module/os/linux/spl/spl-generic.c index 6a95d77ac278..e13914221a6a 100644 --- a/module/os/linux/spl/spl-generic.c +++ b/module/os/linux/spl/spl-generic.c @@ -54,7 +54,6 @@ unsigned long spl_hostid = 0; EXPORT_SYMBOL(spl_hostid); -/* CSTYLED */ module_param(spl_hostid, ulong, 0644); MODULE_PARM_DESC(spl_hostid, "The system hostid."); diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index 7e806bd5699c..33c7d0879741 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -48,7 +48,6 @@ #define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x) #endif -/* BEGIN CSTYLED */ /* * Cache magazines are an optimization designed to minimize the cost of * allocating memory. They do this by keeping a per-cpu cache of recently @@ -97,7 +96,6 @@ static unsigned int spl_kmem_cache_kmem_threads = 4; module_param(spl_kmem_cache_kmem_threads, uint, 0444); MODULE_PARM_DESC(spl_kmem_cache_kmem_threads, "Number of spl_kmem_cache threads"); -/* END CSTYLED */ /* * Slab allocation interfaces diff --git a/module/os/linux/spl/spl-kmem.c b/module/os/linux/spl/spl-kmem.c index cae304d33bc3..3e8361184d57 100644 --- a/module/os/linux/spl/spl-kmem.c +++ b/module/os/linux/spl/spl-kmem.c @@ -26,7 +26,6 @@ #include #include -/* BEGIN CSTYLED */ /* * As a general rule kmem_alloc() allocations should be small, preferably * just a few pages since they must by physically contiguous. 
Therefore, a @@ -62,7 +61,6 @@ module_param(spl_kmem_alloc_max, uint, 0644); MODULE_PARM_DESC(spl_kmem_alloc_max, "Maximum size in bytes for a kmem_alloc()"); EXPORT_SYMBOL(spl_kmem_alloc_max); -/* END CSTYLED */ int kmem_debugging(void) diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index 7f4cab5da114..77dd472ea8b1 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -117,9 +117,7 @@ module_param(spl_taskq_thread_bind, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); static uint_t spl_taskq_thread_timeout_ms = 5000; -/* BEGIN CSTYLED */ module_param(spl_taskq_thread_timeout_ms, uint, 0644); -/* END CSTYLED */ MODULE_PARM_DESC(spl_taskq_thread_timeout_ms, "Minimum idle threads exit interval for dynamic taskqs"); @@ -133,9 +131,7 @@ MODULE_PARM_DESC(spl_taskq_thread_priority, "Allow non-default priority for taskq threads"); static uint_t spl_taskq_thread_sequential = 4; -/* BEGIN CSTYLED */ module_param(spl_taskq_thread_sequential, uint, 0644); -/* END CSTYLED */ MODULE_PARM_DESC(spl_taskq_thread_sequential, "Create new taskq threads after N sequential tasks"); diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 04ab8bbca352..39ea3e62dba0 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -1346,7 +1346,6 @@ MODULE_PARM_DESC(zfs_abd_scatter_enabled, module_param(zfs_abd_scatter_min_size, int, 0644); MODULE_PARM_DESC(zfs_abd_scatter_min_size, "Minimum size of scatter allocations."); -/* CSTYLED */ module_param(zfs_abd_scatter_max_order, uint, 0644); MODULE_PARM_DESC(zfs_abd_scatter_max_order, "Maximum order allocation used for a scatter ABD."); diff --git a/module/os/linux/zfs/zfs_debug.c b/module/os/linux/zfs/zfs_debug.c index a017900d5538..7d01f8f373b2 100644 --- a/module/os/linux/zfs/zfs_debug.c +++ b/module/os/linux/zfs/zfs_debug.c @@ -214,7 +214,5 @@ __dprintf(boolean_t dprint, const char *file, const char *func, module_param(zfs_dbgmsg_enable, int, 0644); MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log"); -/* BEGIN CSTYLED */ module_param(zfs_dbgmsg_maxsize, uint, 0644); -/* END CSTYLED */ MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size"); diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c index 0146d842339a..f08415fdb2e3 100644 --- a/module/os/linux/zfs/zfs_uio.c +++ b/module/os/linux/zfs/zfs_uio.c @@ -546,8 +546,9 @@ zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio) unlock_page(p); put_page(p); - p = __page_cache_alloc(gfp_zero_page); - zfs_mark_page(p); + uio->uio_dio.pages[i] = + __page_cache_alloc(gfp_zero_page); + zfs_mark_page(uio->uio_dio.pages[i]); } else { unlock_page(p); } diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index dd9fd760b9c2..a882c88a7a72 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -4345,7 +4345,6 @@ EXPORT_SYMBOL(zfs_putpage); EXPORT_SYMBOL(zfs_dirty_inode); EXPORT_SYMBOL(zfs_map); -/* CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); #endif diff --git a/module/os/linux/zfs/zfs_znode_os.c b/module/os/linux/zfs/zfs_znode_os.c index bbaca2f58394..aff7b1f4dac1 100644 --- a/module/os/linux/zfs/zfs_znode_os.c +++ b/module/os/linux/zfs/zfs_znode_os.c @@ -1967,7 +1967,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) EXPORT_SYMBOL(zfs_create_fs); 
EXPORT_SYMBOL(zfs_obj_to_path); -/* CSTYLED */ module_param(zfs_object_mutex_size, uint, 0644); MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array"); module_param(zfs_unlink_suspend_progress, int, 0644); diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c index 21f3740f6fe6..22eeef7f0743 100644 --- a/module/os/linux/zfs/zio_crypt.c +++ b/module/os/linux/zfs/zio_crypt.c @@ -2073,7 +2073,6 @@ zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, } #if defined(_KERNEL) -/* CSTYLED */ module_param(zfs_key_max_salt_uses, ulong, 0644); MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " "can be used for generating encryption keys before it is rotated"); diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index f6e014327717..ff1370c543dc 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -1143,7 +1143,6 @@ const struct file_operations zpl_dir_file_operations = { #endif }; -/* CSTYLED */ module_param(zfs_fallocate_reserve_percent, uint, 0644); MODULE_PARM_DESC(zfs_fallocate_reserve_percent, "Percentage of length to use for the available capacity check"); diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c index 287f5f36f9dd..b97b701b7460 100644 --- a/module/os/linux/zfs/zpl_super.c +++ b/module/os/linux/zfs/zpl_super.c @@ -375,7 +375,18 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg) struct super_block *sb = (struct super_block *)arg; int objects = 0; - (void) -zfs_prune(sb, nr_to_scan, &objects); + /* + * deactivate_locked_super calls shrinker_free and only then + * sops->kill_sb cb, resulting in UAF on umount when trying to reach + * for the shrinker functions in zpl_prune_sb of in-umount dataset. + * Increment if s_active is not zero, but don't prune if it is - + * umount could be underway. + */ + if (atomic_inc_not_zero(&sb->s_active)) { + (void) -zfs_prune(sb, nr_to_scan, &objects); + atomic_dec(&sb->s_active); + } + } const struct super_operations zpl_super_operations = { diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 2396690b40fd..7c9aae6a66af 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1176,7 +1176,7 @@ zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv, limits->zql_max_segment_size = UINT_MAX; } - limits->zql_io_opt = zv->zv_volblocksize; + limits->zql_io_opt = DMU_MAX_ACCESS / 2; limits->zql_physical_block_size = zv->zv_volblocksize; limits->zql_max_discard_sectors = @@ -1899,7 +1899,6 @@ zvol_fini(void) ida_destroy(&zvol_ida); } -/* BEGIN CSTYLED */ module_param(zvol_inhibit_dev, uint, 0644); MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); @@ -1908,7 +1907,7 @@ MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); module_param(zvol_threads, uint, 0444); MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. 
Set" - "to 0 to use all active CPUs"); + "to 0 to use all active CPUs"); module_param(zvol_request_sync, uint, 0644); MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); @@ -1933,11 +1932,9 @@ MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, - "Process volblocksize blocks per thread"); + "Process volblocksize blocks per thread"); #ifndef HAVE_BLKDEV_GET_ERESTARTSYS module_param(zvol_open_timeout_ms, uint, 0644); MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); #endif - -/* END CSTYLED */ diff --git a/module/zcommon/zfs_valstr.c b/module/zcommon/zfs_valstr.c index 43bccea14a85..fde8ae28ef36 100644 --- a/module/zcommon/zfs_valstr.c +++ b/module/zcommon/zfs_valstr.c @@ -185,7 +185,6 @@ zfs_valstr_ ## name(int v, char *out, size_t outlen) \ /* String tables */ /* ZIO flags: zio_flag_t, typically zio->io_flags */ -/* BEGIN CSTYLED */ _VALSTR_BITFIELD_IMPL(zio_flag, { '.', "DA", "DONT_AGGREGATE" }, { '.', "RP", "IO_REPAIR" }, @@ -221,13 +220,11 @@ _VALSTR_BITFIELD_IMPL(zio_flag, { '.', "DG", "DELEGATED" }, { '.', "DC", "DIO_CHKSUM_ERR" }, ) -/* END CSTYLED */ /* * ZIO pipeline stage(s): enum zio_stage, typically zio->io_stage or * zio->io_pipeline. */ -/* BEGIN CSTYLED */ _VALSTR_BITFIELD_IMPL(zio_stage, { 'O', "O ", "OPEN" }, { 'I', "RI", "READ_BP_INIT" }, @@ -257,10 +254,8 @@ _VALSTR_BITFIELD_IMPL(zio_stage, { 'C', "DC", "DIO_CHECKSUM_VERIFY" }, { 'X', "X ", "DONE" }, ) -/* END CSTYLED */ /* ZIO priority: zio_priority_t, typically zio->io_priority */ -/* BEGIN CSTYLED */ _VALSTR_ENUM_IMPL(zio_priority, "SYNC_READ", "SYNC_WRITE", @@ -274,7 +269,6 @@ _VALSTR_ENUM_IMPL(zio_priority, "[NUM_QUEUEABLE]", "NOW", ) -/* END CSTYLED */ #undef _VALSTR_BITFIELD_IMPL #undef _VALSTR_ENUM_IMPL diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index d3355730ba3d..a709679b9032 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -128,6 +128,9 @@ zpool_prop_init(void) zprop_register_number(ZPOOL_PROP_DEDUP_TABLE_SIZE, "dedup_table_size", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "DDTSIZE", B_FALSE, sfeatures); + zprop_register_number(ZPOOL_PROP_LAST_SCRUBBED_TXG, + "last_scrubbed_txg", 0, PROP_READONLY, ZFS_TYPE_POOL, "", + "LAST_SCRUBBED_TXG", B_FALSE, sfeatures); /* default number properties */ zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 76dc0b19139d..fa7baac04b7b 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1074,12 +1074,9 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); - ARCSTAT_MAX(arcstat_hash_chain_max, i); } - uint64_t he = atomic_inc_64_nv( - &arc_stats.arcstat_hash_elements.value.ui64); - ARCSTAT_MAX(arcstat_hash_elements_max, he); + ARCSTAT_BUMP(arcstat_hash_elements); return (NULL); } @@ -1103,8 +1100,7 @@ buf_hash_remove(arc_buf_hdr_t *hdr) arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ - atomic_dec_64(&arc_stats.arcstat_hash_elements.value.ui64); - + ARCSTAT_BUMPDOWN(arcstat_hash_elements); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); @@ -7008,6 +7004,9 @@ arc_kstat_update(kstat_t *ksp, int rw) wmsum_value(&arc_sums.arcstat_evict_l2_ineligible); as->arcstat_evict_l2_skip.value.ui64 
= wmsum_value(&arc_sums.arcstat_evict_l2_skip); + as->arcstat_hash_elements.value.ui64 = + as->arcstat_hash_elements_max.value.ui64 = + wmsum_value(&arc_sums.arcstat_hash_elements); as->arcstat_hash_collisions.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_collisions); as->arcstat_hash_chains.value.ui64 = @@ -7432,6 +7431,7 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0); wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0); wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0); + wmsum_init(&arc_sums.arcstat_hash_elements, 0); wmsum_init(&arc_sums.arcstat_hash_collisions, 0); wmsum_init(&arc_sums.arcstat_hash_chains, 0); aggsum_init(&arc_sums.arcstat_size, 0); @@ -7590,6 +7590,7 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru); wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible); wmsum_fini(&arc_sums.arcstat_evict_l2_skip); + wmsum_fini(&arc_sums.arcstat_hash_elements); wmsum_fini(&arc_sums.arcstat_hash_collisions); wmsum_fini(&arc_sums.arcstat_hash_chains); aggsum_fini(&arc_sums.arcstat_size); @@ -9287,6 +9288,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_arcs_state = hdr->b_l1hdr.b_state->arcs_state; + arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | + ARC_FLAG_L2_WRITING); + + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); + l2arc_hdr_arcstats_increment(hdr); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + mutex_enter(&dev->l2ad_mtx); if (pio == NULL) { /* @@ -9298,12 +9307,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) } list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); - arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | - ARC_FLAG_L2_WRITING); - - (void) zfs_refcount_add_many(&dev->l2ad_alloc, - arc_hdr_size(hdr), hdr); - l2arc_hdr_arcstats_increment(hdr); boolean_t commit = l2arc_log_blk_insert(dev, hdr); mutex_exit(hash_lock); @@ -9333,7 +9336,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; - vdev_space_update(dev->l2ad_vdev, asize, 0, 0); if (commit) { /* l2ad_hand will be adjusted inside. */ @@ -9844,6 +9846,37 @@ l2arc_spa_rebuild_start(spa_t *spa) } } +void +l2arc_spa_rebuild_stop(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); + + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + l2arc_dev_t *dev = + l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); + if (dev == NULL) + continue; + mutex_enter(&l2arc_rebuild_thr_lock); + dev->l2ad_rebuild_cancel = B_TRUE; + mutex_exit(&l2arc_rebuild_thr_lock); + } + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + l2arc_dev_t *dev = + l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); + if (dev == NULL) + continue; + mutex_enter(&l2arc_rebuild_thr_lock); + if (dev->l2ad_rebuild_began == B_TRUE) { + while (dev->l2ad_rebuild == B_TRUE) { + cv_wait(&l2arc_rebuild_thr_cv, + &l2arc_rebuild_thr_lock); + } + } + mutex_exit(&l2arc_rebuild_thr_lock); + } +} + /* * Main entry point for L2ARC rebuilding. 
*/ @@ -9852,12 +9885,12 @@ l2arc_dev_rebuild_thread(void *arg) { l2arc_dev_t *dev = arg; - VERIFY(!dev->l2ad_rebuild_cancel); VERIFY(dev->l2ad_rebuild); (void) l2arc_rebuild(dev); mutex_enter(&l2arc_rebuild_thr_lock); dev->l2ad_rebuild_began = B_FALSE; dev->l2ad_rebuild = B_FALSE; + cv_signal(&l2arc_rebuild_thr_cv); mutex_exit(&l2arc_rebuild_thr_lock); thread_exit(); @@ -10008,8 +10041,6 @@ l2arc_rebuild(l2arc_dev_t *dev) for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { - dev->l2ad_rebuild = B_FALSE; - cv_signal(&l2arc_rebuild_thr_cv); mutex_exit(&l2arc_rebuild_thr_lock); err = SET_ERROR(ECANCELED); goto out; @@ -10585,6 +10616,8 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) (void) zio_nowait(wzio); dev->l2ad_hand += asize; + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + /* * Include the committed log block's pointer in the list of pointers * to log blocks present in the L2ARC device. @@ -10598,7 +10631,6 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); - vdev_space_update(dev->l2ad_vdev, asize, 0, 0); /* bump the kstats */ ARCSTAT_INCR(arcstat_l2_write_bytes, asize); diff --git a/module/zfs/brt.c b/module/zfs/brt.c index ea8c0735c4b7..79748cd69bc0 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -243,7 +243,6 @@ */ static kmem_cache_t *brt_entry_cache; -static kmem_cache_t *brt_pending_entry_cache; /* * Enable/disable prefetching of BRT entries that we are going to modify. @@ -266,14 +265,11 @@ static int brt_zap_default_ibs = 12; static kstat_t *brt_ksp; typedef struct brt_stats { - kstat_named_t brt_addref_entry_in_memory; kstat_named_t brt_addref_entry_not_on_disk; kstat_named_t brt_addref_entry_on_disk; - kstat_named_t brt_addref_entry_read_lost_race; kstat_named_t brt_decref_entry_in_memory; kstat_named_t brt_decref_entry_loaded_from_disk; kstat_named_t brt_decref_entry_not_in_memory; - kstat_named_t brt_decref_entry_not_on_disk; kstat_named_t brt_decref_entry_read_lost_race; kstat_named_t brt_decref_entry_still_referenced; kstat_named_t brt_decref_free_data_later; @@ -282,14 +278,11 @@ typedef struct brt_stats { } brt_stats_t; static brt_stats_t brt_stats = { - { "addref_entry_in_memory", KSTAT_DATA_UINT64 }, { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 }, { "addref_entry_on_disk", KSTAT_DATA_UINT64 }, - { "addref_entry_read_lost_race", KSTAT_DATA_UINT64 }, { "decref_entry_in_memory", KSTAT_DATA_UINT64 }, { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 }, { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 }, - { "decref_entry_not_on_disk", KSTAT_DATA_UINT64 }, { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 }, { "decref_entry_still_referenced", KSTAT_DATA_UINT64 }, { "decref_free_data_later", KSTAT_DATA_UINT64 }, @@ -298,14 +291,11 @@ static brt_stats_t brt_stats = { }; struct { - wmsum_t brt_addref_entry_in_memory; wmsum_t brt_addref_entry_not_on_disk; wmsum_t brt_addref_entry_on_disk; - wmsum_t brt_addref_entry_read_lost_race; wmsum_t brt_decref_entry_in_memory; wmsum_t brt_decref_entry_loaded_from_disk; wmsum_t brt_decref_entry_not_in_memory; - wmsum_t brt_decref_entry_not_on_disk; wmsum_t brt_decref_entry_read_lost_race; wmsum_t brt_decref_entry_still_referenced; wmsum_t brt_decref_free_data_later; @@ -316,24 +306,24 @@ struct { #define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1) static int brt_entry_compare(const 
void *x1, const void *x2); -static int brt_pending_entry_compare(const void *x1, const void *x2); +static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs); static void -brt_rlock(brt_t *brt) +brt_rlock(spa_t *spa) { - rw_enter(&brt->brt_lock, RW_READER); + rw_enter(&spa->spa_brt_lock, RW_READER); } static void -brt_wlock(brt_t *brt) +brt_wlock(spa_t *spa) { - rw_enter(&brt->brt_lock, RW_WRITER); + rw_enter(&spa->spa_brt_lock, RW_WRITER); } static void -brt_unlock(brt_t *brt) +brt_unlock(spa_t *spa) { - rw_exit(&brt->brt_lock); + rw_exit(&spa->spa_brt_lock); } static uint16_t @@ -394,14 +384,15 @@ brt_vdev_dump(brt_vdev_t *brtvd) { uint64_t idx; + uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d " - "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n", + "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu", (u_longlong_t)brtvd->bv_vdevid, brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, (u_longlong_t)brtvd->bv_size, (u_longlong_t)brtvd->bv_totalcount, - (u_longlong_t)brtvd->bv_nblocks, - (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks)); + (u_longlong_t)nblocks, + (size_t)BT_SIZEOFMAP(nblocks)); if (brtvd->bv_totalcount > 0) { zfs_dbgmsg(" entcounts:"); for (idx = 0; idx < brtvd->bv_size; idx++) { @@ -415,51 +406,56 @@ brt_vdev_dump(brt_vdev_t *brtvd) if (brtvd->bv_entcount_dirty) { char *bitmap; - bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP); - for (idx = 0; idx < brtvd->bv_nblocks; idx++) { + bitmap = kmem_alloc(nblocks + 1, KM_SLEEP); + for (idx = 0; idx < nblocks; idx++) { bitmap[idx] = BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; } bitmap[idx] = '\0'; zfs_dbgmsg(" dirty: %s", bitmap); - kmem_free(bitmap, brtvd->bv_nblocks + 1); + kmem_free(bitmap, nblocks + 1); } } #endif static brt_vdev_t * -brt_vdev(brt_t *brt, uint64_t vdevid) +brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc) { - brt_vdev_t *brtvd; + brt_vdev_t *brtvd = NULL; - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - - if (vdevid < brt->brt_nvdevs) { - brtvd = &brt->brt_vdevs[vdevid]; - } else { - brtvd = NULL; + brt_rlock(spa); + if (vdevid < spa->spa_brt_nvdevs) { + brtvd = spa->spa_brt_vdevs[vdevid]; + } else if (alloc) { + /* New VDEV was added. 
*/ + brt_unlock(spa); + brt_wlock(spa); + if (vdevid >= spa->spa_brt_nvdevs) + brt_vdevs_expand(spa, vdevid + 1); + brtvd = spa->spa_brt_vdevs[vdevid]; } - + brt_unlock(spa); return (brtvd); } static void -brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) { char name[64]; - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(brtvd->bv_initiated); ASSERT0(brtvd->bv_mos_brtvdev); ASSERT0(brtvd->bv_mos_entries); - ASSERT(brtvd->bv_entcount != NULL); - ASSERT(brtvd->bv_size > 0); - ASSERT(brtvd->bv_bitmap != NULL); - ASSERT(brtvd->bv_nblocks > 0); - brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0, + uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0, ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx); - VERIFY(brtvd->bv_mos_entries != 0); + VERIFY(mos_entries != 0); + VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd, + &brtvd->bv_mos_entries_dnode)); + rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); + brtvd->bv_mos_entries = mos_entries; + rw_exit(&brtvd->bv_mos_entries_lock); BRT_DEBUG("MOS entries created, object=%llu", (u_longlong_t)brtvd->bv_mos_entries); @@ -468,7 +464,7 @@ brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) * We will keep array size (bv_size) and cummulative count for all * bv_entcount[]s (bv_totalcount) in the bonus buffer. */ - brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos, + brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset, DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE, DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx); VERIFY(brtvd->bv_mos_brtvdev != 0); @@ -477,27 +473,27 @@ brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); - VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, + VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); BRT_DEBUG("Pool directory object created, object=%s", name); - spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); + spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx); } static void -brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) +brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd) { vdev_t *vd; uint16_t *entcount; ulong_t *bitmap; - uint64_t nblocks, size; + uint64_t nblocks, onblocks, size; - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); - spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER); - vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid); - size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1; - spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG); + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vd = vdev_lookup_top(spa, brtvd->bv_vdevid); + size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1; + spa_config_exit(spa, SCL_VDEV, FTAG); entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); @@ -505,38 +501,33 @@ brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) if (!brtvd->bv_initiated) { ASSERT0(brtvd->bv_size); - ASSERT(brtvd->bv_entcount == NULL); - ASSERT(brtvd->bv_bitmap == NULL); - ASSERT0(brtvd->bv_nblocks); - - avl_create(&brtvd->bv_tree, brt_entry_compare, - sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); + ASSERT0P(brtvd->bv_entcount); + ASSERT0P(brtvd->bv_bitmap); } else { ASSERT(brtvd->bv_size > 0); 
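A minimal sketch (not part of the patch) of the grow-and-copy step brt_vdev_realloc() performs when a vdev expands: allocate larger zero-filled arrays, copy the old contents, and free the originals, so entries past the old size start at zero. The helper name (grow_entcount) and the userspace allocators are hypothetical stand-ins for vmem_zalloc()/vmem_free().

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static uint16_t *
grow_entcount(uint16_t *old, uint64_t oldsz, uint64_t newsz)
{
	/* Entries past oldsz stay zero, as in brt_vdev_realloc(). */
	uint16_t *newarr = calloc(newsz, sizeof (newarr[0]));

	if (newarr == NULL)
		return (NULL);
	if (old != NULL) {
		memcpy(newarr, old,
		    sizeof (newarr[0]) * (oldsz < newsz ? oldsz : newsz));
		free(old);
	}
	return (newarr);
}

int
main(void)
{
	uint16_t *ec = grow_entcount(NULL, 0, 4);

	ec[0] = 7;
	ec = grow_entcount(ec, 4, 8);	/* old counts preserved, tail zeroed */
	free(ec);
	return (0);
}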
ASSERT(brtvd->bv_entcount != NULL); ASSERT(brtvd->bv_bitmap != NULL); - ASSERT(brtvd->bv_nblocks > 0); /* * TODO: Allow vdev shrinking. We only need to implement * shrinking the on-disk BRT VDEV object. - * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset, - * size, tx); + * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, + * offset, size, tx); */ ASSERT3U(brtvd->bv_size, <=, size); memcpy(entcount, brtvd->bv_entcount, sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); - memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), - BT_SIZEOFMAP(brtvd->bv_nblocks))); vmem_free(brtvd->bv_entcount, sizeof (entcount[0]) * brtvd->bv_size); - kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); + onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); + memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), + BT_SIZEOFMAP(onblocks))); + kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks)); } brtvd->bv_size = size; brtvd->bv_entcount = entcount; brtvd->bv_bitmap = bitmap; - brtvd->bv_nblocks = nblocks; if (!brtvd->bv_initiated) { brtvd->bv_need_byteswap = FALSE; brtvd->bv_initiated = TRUE; @@ -545,36 +536,29 @@ brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) } } -static void -brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd) +static int +brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd) { - char name[64]; dmu_buf_t *db; brt_vdev_phys_t *bvphys; int error; - snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, - (u_longlong_t)brtvd->bv_vdevid); - error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev); - if (error != 0) - return; + ASSERT(!brtvd->bv_initiated); ASSERT(brtvd->bv_mos_brtvdev != 0); - error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db); - ASSERT0(error); + error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, + FTAG, &db); if (error != 0) - return; + return (error); bvphys = db->db_data; - if (brt->brt_rangesize == 0) { - brt->brt_rangesize = bvphys->bvp_rangesize; + if (spa->spa_brt_rangesize == 0) { + spa->spa_brt_rangesize = bvphys->bvp_rangesize; } else { - ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize); + ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize); } - ASSERT(!brtvd->bv_initiated); - brt_vdev_realloc(brt, brtvd); + brt_vdev_realloc(spa, brtvd); /* TODO: We don't support VDEV shrinking. */ ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size); @@ -582,163 +566,176 @@ brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd) /* * If VDEV grew, we will leave new bv_entcount[] entries zeroed out. 
*/ - error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, + error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), brtvd->bv_entcount, DMU_READ_NO_PREFETCH); - ASSERT0(error); + if (error != 0) + return (error); + ASSERT(bvphys->bvp_mos_entries != 0); + VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd, + &brtvd->bv_mos_entries_dnode)); + rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); brtvd->bv_mos_entries = bvphys->bvp_mos_entries; - ASSERT(brtvd->bv_mos_entries != 0); + rw_exit(&brtvd->bv_mos_entries_lock); brtvd->bv_need_byteswap = (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER); brtvd->bv_totalcount = bvphys->bvp_totalcount; brtvd->bv_usedspace = bvphys->bvp_usedspace; brtvd->bv_savedspace = bvphys->bvp_savedspace; - brt->brt_usedspace += brtvd->bv_usedspace; - brt->brt_savedspace += brtvd->bv_savedspace; dmu_buf_rele(db, FTAG); - BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu", - name, (u_longlong_t)brtvd->bv_mos_brtvdev, + BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu", + (u_longlong_t)brtvd->bv_vdevid, + (u_longlong_t)brtvd->bv_mos_brtvdev, (u_longlong_t)brtvd->bv_mos_entries); + return (0); } static void -brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd) +brt_vdev_dealloc(brt_vdev_t *brtvd) { - - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); ASSERT(brtvd->bv_initiated); + ASSERT0(avl_numnodes(&brtvd->bv_tree)); vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); brtvd->bv_entcount = NULL; - kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); + uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); + kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks)); brtvd->bv_bitmap = NULL; - ASSERT0(avl_numnodes(&brtvd->bv_tree)); - avl_destroy(&brtvd->bv_tree); brtvd->bv_size = 0; - brtvd->bv_nblocks = 0; brtvd->bv_initiated = FALSE; BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid); } static void -brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) { char name[64]; uint64_t count; - dmu_buf_t *db; - brt_vdev_phys_t *bvphys; - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(brtvd->bv_initiated); ASSERT(brtvd->bv_mos_brtvdev != 0); ASSERT(brtvd->bv_mos_entries != 0); + ASSERT0(brtvd->bv_totalcount); + ASSERT0(brtvd->bv_usedspace); + ASSERT0(brtvd->bv_savedspace); - VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count)); - VERIFY0(count); - VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx)); - BRT_DEBUG("MOS entries destroyed, object=%llu", - (u_longlong_t)brtvd->bv_mos_entries); + uint64_t mos_entries = brtvd->bv_mos_entries; + rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); brtvd->bv_mos_entries = 0; + rw_exit(&brtvd->bv_mos_entries_lock); + dnode_rele(brtvd->bv_mos_entries_dnode, brtvd); + brtvd->bv_mos_entries_dnode = NULL; + ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count)); + ASSERT0(count); + VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx)); + BRT_DEBUG("MOS entries destroyed, object=%llu", + (u_longlong_t)mos_entries); - VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); - bvphys = db->db_data; - ASSERT0(bvphys->bvp_totalcount); - ASSERT0(bvphys->bvp_usedspace); - ASSERT0(bvphys->bvp_savedspace); - dmu_buf_rele(db, FTAG); - - VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx)); + 
VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, + tx)); BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu", (u_longlong_t)brtvd->bv_mos_brtvdev); brtvd->bv_mos_brtvdev = 0; + brtvd->bv_entcount_dirty = FALSE; snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); - VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + name, tx)); BRT_DEBUG("Pool directory object removed, object=%s", name); - brt_vdev_dealloc(brt, brtvd); + brtvd->bv_meta_dirty = FALSE; - spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); + rw_enter(&brtvd->bv_lock, RW_WRITER); + brt_vdev_dealloc(brtvd); + rw_exit(&brtvd->bv_lock); + + spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx); } static void -brt_vdevs_expand(brt_t *brt, uint64_t nvdevs) +brt_vdevs_expand(spa_t *spa, uint64_t nvdevs) { - brt_vdev_t *brtvd, *vdevs; - uint64_t vdevid; + brt_vdev_t **vdevs; - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); - ASSERT3U(nvdevs, >, brt->brt_nvdevs); + ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock)); + ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs); - vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP); - if (brt->brt_nvdevs > 0) { - ASSERT(brt->brt_vdevs != NULL); + if (nvdevs == spa->spa_brt_nvdevs) + return; + + vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP); + if (spa->spa_brt_nvdevs > 0) { + ASSERT(spa->spa_brt_vdevs != NULL); - memcpy(vdevs, brt->brt_vdevs, - sizeof (brt_vdev_t) * brt->brt_nvdevs); - kmem_free(brt->brt_vdevs, - sizeof (brt_vdev_t) * brt->brt_nvdevs); + memcpy(vdevs, spa->spa_brt_vdevs, + sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); + kmem_free(spa->spa_brt_vdevs, + sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); } - for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) { - brtvd = &vdevs[vdevid]; + spa->spa_brt_vdevs = vdevs; + for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) { + brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP); + rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL); brtvd->bv_vdevid = vdevid; brtvd->bv_initiated = FALSE; + rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL); + avl_create(&brtvd->bv_tree, brt_entry_compare, + sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); + for (int i = 0; i < TXG_SIZE; i++) { + avl_create(&brtvd->bv_pending_tree[i], + brt_entry_compare, sizeof (brt_entry_t), + offsetof(brt_entry_t, bre_node)); + } + mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL); + spa->spa_brt_vdevs[vdevid] = brtvd; } BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.", - (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs); - - brt->brt_vdevs = vdevs; - brt->brt_nvdevs = nvdevs; + (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs); + spa->spa_brt_nvdevs = nvdevs; } static boolean_t -brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre) +brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset) { - uint64_t idx; - - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - - idx = bre->bre_offset / brt->brt_rangesize; - if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) { + uint64_t idx = offset / spa->spa_brt_rangesize; + if (idx < brtvd->bv_size) { /* VDEV wasn't expanded. 
*/ return (brt_vdev_entcount_get(brtvd, idx) > 0); } - return (FALSE); } static void -brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, - uint64_t dsize) +brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre, + uint64_t dsize, uint64_t count) { uint64_t idx; - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - ASSERT(brtvd != NULL); - ASSERT(brtvd->bv_entcount != NULL); + ASSERT(brtvd->bv_initiated); - brt->brt_savedspace += dsize; - brtvd->bv_savedspace += dsize; + brtvd->bv_savedspace += dsize * count; brtvd->bv_meta_dirty = TRUE; - if (bre->bre_refcount > 1) { + if (bre->bre_count > 0) return; - } - brt->brt_usedspace += dsize; brtvd->bv_usedspace += dsize; - idx = bre->bre_offset / brt->brt_rangesize; + idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize; if (idx >= brtvd->bv_size) { /* VDEV has been expanded. */ - brt_vdev_realloc(brt, brtvd); + rw_enter(&brtvd->bv_lock, RW_WRITER); + brt_vdev_realloc(spa, brtvd); + rw_exit(&brtvd->bv_lock); } ASSERT3U(idx, <, brtvd->bv_size); @@ -748,35 +745,26 @@ brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, brtvd->bv_entcount_dirty = TRUE; idx = idx / BRT_BLOCKSIZE / 8; BT_SET(brtvd->bv_bitmap, idx); - -#ifdef ZFS_DEBUG - if (zfs_flags & ZFS_DEBUG_BRT) - brt_vdev_dump(brtvd); -#endif } static void -brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, +brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre, uint64_t dsize) { uint64_t idx; - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); - ASSERT(brtvd != NULL); - ASSERT(brtvd->bv_entcount != NULL); + ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); + ASSERT(brtvd->bv_initiated); - brt->brt_savedspace -= dsize; brtvd->bv_savedspace -= dsize; brtvd->bv_meta_dirty = TRUE; - if (bre->bre_refcount > 0) { + if (bre->bre_count > 0) return; - } - brt->brt_usedspace -= dsize; brtvd->bv_usedspace -= dsize; - idx = bre->bre_offset / brt->brt_rangesize; + idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize; ASSERT3U(idx, <, brtvd->bv_size); ASSERT(brtvd->bv_totalcount > 0); @@ -785,15 +773,10 @@ brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, brtvd->bv_entcount_dirty = TRUE; idx = idx / BRT_BLOCKSIZE / 8; BT_SET(brtvd->bv_bitmap, idx); - -#ifdef ZFS_DEBUG - if (zfs_flags & ZFS_DEBUG_BRT) - brt_vdev_dump(brtvd); -#endif } static void -brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) { dmu_buf_t *db; brt_vdev_phys_t *bvphys; @@ -802,16 +785,18 @@ brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) ASSERT(brtvd->bv_mos_brtvdev != 0); ASSERT(dmu_tx_is_syncing(tx)); - VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); + VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, + FTAG, &db)); if (brtvd->bv_entcount_dirty) { /* * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks. 
*/ - dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, + dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), brtvd->bv_entcount, tx); - memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks)); + uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); + memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks)); brtvd->bv_entcount_dirty = FALSE; } @@ -825,7 +810,7 @@ brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; } bvphys->bvp_totalcount = brtvd->bv_totalcount; - bvphys->bvp_rangesize = brt->brt_rangesize; + bvphys->bvp_rangesize = spa->spa_brt_rangesize; bvphys->bvp_usedspace = brtvd->bv_usedspace; bvphys->bvp_savedspace = brtvd->bv_savedspace; dmu_buf_rele(db, FTAG); @@ -834,114 +819,51 @@ brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) } static void -brt_vdevs_alloc(brt_t *brt, boolean_t load) -{ - brt_vdev_t *brtvd; - uint64_t vdevid; - - brt_wlock(brt); - - brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children); - - if (load) { - for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - brtvd = &brt->brt_vdevs[vdevid]; - ASSERT(brtvd->bv_entcount == NULL); - - brt_vdev_load(brt, brtvd); - } - } - - if (brt->brt_rangesize == 0) { - brt->brt_rangesize = BRT_RANGESIZE; - } - - brt_unlock(brt); -} - -static void -brt_vdevs_free(brt_t *brt) +brt_vdevs_free(spa_t *spa) { - brt_vdev_t *brtvd; - uint64_t vdevid; - - brt_wlock(brt); - - for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - brtvd = &brt->brt_vdevs[vdevid]; + if (spa->spa_brt_vdevs == 0) + return; + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + rw_enter(&brtvd->bv_lock, RW_WRITER); if (brtvd->bv_initiated) - brt_vdev_dealloc(brt, brtvd); + brt_vdev_dealloc(brtvd); + rw_exit(&brtvd->bv_lock); + rw_destroy(&brtvd->bv_lock); + if (brtvd->bv_mos_entries != 0) + dnode_rele(brtvd->bv_mos_entries_dnode, brtvd); + rw_destroy(&brtvd->bv_mos_entries_lock); + avl_destroy(&brtvd->bv_tree); + for (int i = 0; i < TXG_SIZE; i++) + avl_destroy(&brtvd->bv_pending_tree[i]); + mutex_destroy(&brtvd->bv_pending_lock); + kmem_free(brtvd, sizeof (*brtvd)); } - kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs); - - brt_unlock(brt); + kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) * + spa->spa_brt_nvdevs); } static void brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) { - bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); - bre->bre_refcount = 0; + bre->bre_bp = *bp; + bre->bre_count = 0; + bre->bre_pcount = 0; *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); } static int -brt_entry_compare(const void *x1, const void *x2) -{ - const brt_entry_t *bre1 = x1; - const brt_entry_t *bre2 = x2; - - return (TREE_CMP(bre1->bre_offset, bre2->bre_offset)); -} - -static int -brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) +brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre) { - uint64_t mos_entries; - int error; - - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + uint64_t off = BRE_OFFSET(bre); - if (!brt_vdev_lookup(brt, brtvd, bre)) + if (brtvd->bv_mos_entries == 0) return (SET_ERROR(ENOENT)); - /* - * Remember mos_entries object number. After we reacquire the BRT lock, - * the brtvd pointer may be invalid. 
- */ - mos_entries = brtvd->bv_mos_entries; - if (mos_entries == 0) - return (SET_ERROR(ENOENT)); - - brt_unlock(brt); - - error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, - BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount); - - brt_wlock(brt); - - return (error); -} - -static void -brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) -{ - brt_vdev_t *brtvd; - uint64_t mos_entries = 0; - - brt_rlock(brt); - brtvd = brt_vdev(brt, vdevid); - if (brtvd != NULL) - mos_entries = brtvd->bv_mos_entries; - brt_unlock(brt); - - if (mos_entries == 0) - return; - - (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, - (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); + return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode, + &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count)); } /* @@ -952,72 +874,66 @@ brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) boolean_t brt_maybe_exists(spa_t *spa, const blkptr_t *bp) { - brt_t *brt = spa->spa_brt; - brt_vdev_t *brtvd; - brt_entry_t bre_search; - boolean_t mayexists = FALSE; - uint64_t vdevid; - brt_entry_fill(bp, &bre_search, &vdevid); + if (spa->spa_brt_nvdevs == 0) + return (B_FALSE); - brt_rlock(brt); + uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); + brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); + if (brtvd == NULL || !brtvd->bv_initiated) + return (FALSE); - brtvd = brt_vdev(brt, vdevid); - if (brtvd != NULL && brtvd->bv_initiated) { - if (!avl_is_empty(&brtvd->bv_tree) || - brt_vdev_lookup(brt, brtvd, &bre_search)) { - mayexists = TRUE; - } - } - - brt_unlock(brt); - - return (mayexists); + /* + * We don't need locks here, since bv_entcount pointer must be + * stable at this point, and we don't care about false positive + * races here, while false negative should be impossible, since + * all brt_vdev_addref() have already completed by this point. 
+ */ + uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]); + return (brt_vdev_lookup(spa, brtvd, off)); } uint64_t brt_get_dspace(spa_t *spa) { - brt_t *brt = spa->spa_brt; - - if (brt == NULL) + if (spa->spa_brt_nvdevs == 0) return (0); - return (brt->brt_savedspace); + brt_rlock(spa); + uint64_t s = 0; + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) + s += spa->spa_brt_vdevs[vdevid]->bv_savedspace; + brt_unlock(spa); + return (s); } uint64_t brt_get_used(spa_t *spa) { - brt_t *brt = spa->spa_brt; - - if (brt == NULL) + if (spa->spa_brt_nvdevs == 0) return (0); - return (brt->brt_usedspace); + brt_rlock(spa); + uint64_t s = 0; + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) + s += spa->spa_brt_vdevs[vdevid]->bv_usedspace; + brt_unlock(spa); + return (s); } uint64_t brt_get_saved(spa_t *spa) { - brt_t *brt = spa->spa_brt; - - if (brt == NULL) - return (0); - - return (brt->brt_savedspace); + return (brt_get_dspace(spa)); } uint64_t brt_get_ratio(spa_t *spa) { - brt_t *brt = spa->spa_brt; - - if (brt->brt_usedspace == 0) + uint64_t used = brt_get_used(spa); + if (used == 0) return (100); - - return ((brt->brt_usedspace + brt->brt_savedspace) * 100 / - brt->brt_usedspace); + return ((used + brt_get_saved(spa)) * 100 / used); } static int @@ -1028,22 +944,16 @@ brt_kstats_update(kstat_t *ksp, int rw) if (rw == KSTAT_WRITE) return (EACCES); - bs->brt_addref_entry_in_memory.value.ui64 = - wmsum_value(&brt_sums.brt_addref_entry_in_memory); bs->brt_addref_entry_not_on_disk.value.ui64 = wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); bs->brt_addref_entry_on_disk.value.ui64 = wmsum_value(&brt_sums.brt_addref_entry_on_disk); - bs->brt_addref_entry_read_lost_race.value.ui64 = - wmsum_value(&brt_sums.brt_addref_entry_read_lost_race); bs->brt_decref_entry_in_memory.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_in_memory); bs->brt_decref_entry_loaded_from_disk.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); bs->brt_decref_entry_not_in_memory.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); - bs->brt_decref_entry_not_on_disk.value.ui64 = - wmsum_value(&brt_sums.brt_decref_entry_not_on_disk); bs->brt_decref_entry_read_lost_race.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); bs->brt_decref_entry_still_referenced.value.ui64 = @@ -1062,14 +972,11 @@ static void brt_stat_init(void) { - wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0); wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0); wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0); - wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0); wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0); wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0); wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0); - wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0); wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0); wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0); wmsum_init(&brt_sums.brt_decref_free_data_later, 0); @@ -1093,14 +1000,11 @@ brt_stat_fini(void) brt_ksp = NULL; } - wmsum_fini(&brt_sums.brt_addref_entry_in_memory); wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk); wmsum_fini(&brt_sums.brt_addref_entry_on_disk); - wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race); wmsum_fini(&brt_sums.brt_decref_entry_in_memory); wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk); wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory); - wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk); 
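A minimal sketch (not part of the patch) of the accounting model brt_get_dspace() and brt_get_used() switch to above: pool-wide totals are no longer maintained, and per-vdev counters are instead summed under the BRT read lock on demand. The types and names below (brt_pool_t, brt_shard_t) are hypothetical stand-ins for spa_brt_lock and spa_brt_vdevs[].

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
	uint64_t saved;
	uint64_t used;
} brt_shard_t;

typedef struct {
	pthread_rwlock_t lock;		/* stands in for spa_brt_lock */
	brt_shard_t *shards;		/* stands in for spa_brt_vdevs[] */
	uint64_t nshards;
} brt_pool_t;

static uint64_t
brt_pool_saved(brt_pool_t *p)
{
	uint64_t sum = 0;

	pthread_rwlock_rdlock(&p->lock);
	for (uint64_t i = 0; i < p->nshards; i++)
		sum += p->shards[i].saved;
	pthread_rwlock_unlock(&p->lock);
	return (sum);
}

int
main(void)
{
	brt_shard_t shards[2] = { { .saved = 10 }, { .saved = 32 } };
	brt_pool_t pool = { .shards = shards, .nshards = 2 };

	pthread_rwlock_init(&pool.lock, NULL);
	printf("%llu\n", (unsigned long long)brt_pool_saved(&pool));
	pthread_rwlock_destroy(&pool.lock);
	return (0);
}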
wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race); wmsum_fini(&brt_sums.brt_decref_entry_still_referenced); wmsum_fini(&brt_sums.brt_decref_free_data_later); @@ -1113,8 +1017,6 @@ brt_init(void) { brt_entry_cache = kmem_cache_create("brt_entry_cache", sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache", - sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); brt_stat_init(); } @@ -1125,105 +1027,12 @@ brt_fini(void) brt_stat_fini(); kmem_cache_destroy(brt_entry_cache); - kmem_cache_destroy(brt_pending_entry_cache); -} - -static brt_entry_t * -brt_entry_alloc(const brt_entry_t *bre_init) -{ - brt_entry_t *bre; - - bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); - bre->bre_offset = bre_init->bre_offset; - bre->bre_refcount = bre_init->bre_refcount; - - return (bre); -} - -static void -brt_entry_free(brt_entry_t *bre) -{ - - kmem_cache_free(brt_entry_cache, bre); -} - -static void -brt_entry_addref(brt_t *brt, const blkptr_t *bp) -{ - brt_vdev_t *brtvd; - brt_entry_t *bre, *racebre; - brt_entry_t bre_search; - avl_index_t where; - uint64_t vdevid; - int error; - - ASSERT(!RW_WRITE_HELD(&brt->brt_lock)); - - brt_entry_fill(bp, &bre_search, &vdevid); - - brt_wlock(brt); - - brtvd = brt_vdev(brt, vdevid); - if (brtvd == NULL) { - ASSERT3U(vdevid, >=, brt->brt_nvdevs); - - /* New VDEV was added. */ - brt_vdevs_expand(brt, vdevid + 1); - brtvd = brt_vdev(brt, vdevid); - } - ASSERT(brtvd != NULL); - if (!brtvd->bv_initiated) - brt_vdev_realloc(brt, brtvd); - - bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); - if (bre != NULL) { - BRTSTAT_BUMP(brt_addref_entry_in_memory); - } else { - /* - * brt_entry_lookup() may drop the BRT (read) lock and - * reacquire it (write). - */ - error = brt_entry_lookup(brt, brtvd, &bre_search); - /* bre_search now contains correct bre_refcount */ - ASSERT(error == 0 || error == ENOENT); - if (error == 0) - BRTSTAT_BUMP(brt_addref_entry_on_disk); - else - BRTSTAT_BUMP(brt_addref_entry_not_on_disk); - /* - * When the BRT lock was dropped, brt_vdevs[] may have been - * expanded and reallocated, we need to update brtvd's pointer. - */ - brtvd = brt_vdev(brt, vdevid); - ASSERT(brtvd != NULL); - - racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); - if (racebre == NULL) { - bre = brt_entry_alloc(&bre_search); - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); - avl_insert(&brtvd->bv_tree, bre, where); - brt->brt_nentries++; - } else { - /* - * The entry was added when the BRT lock was dropped in - * brt_entry_lookup(). - */ - BRTSTAT_BUMP(brt_addref_entry_read_lost_race); - bre = racebre; - } - } - bre->bre_refcount++; - brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); - - brt_unlock(brt); } /* Return TRUE if block should be freed immediately. 
*/ boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp) { - brt_t *brt = spa->spa_brt; - brt_vdev_t *brtvd; brt_entry_t *bre, *racebre; brt_entry_t bre_search; avl_index_t where; @@ -1232,11 +1041,11 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp) brt_entry_fill(bp, &bre_search, &vdevid); - brt_wlock(brt); - - brtvd = brt_vdev(brt, vdevid); + brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); ASSERT(brtvd != NULL); + rw_enter(&brtvd->bv_lock, RW_WRITER); + ASSERT(brtvd->bv_initiated); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre != NULL) { BRTSTAT_BUMP(brt_decref_entry_in_memory); @@ -1244,67 +1053,49 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp) } else { BRTSTAT_BUMP(brt_decref_entry_not_in_memory); } + rw_exit(&brtvd->bv_lock); - /* - * brt_entry_lookup() may drop the BRT lock and reacquire it. - */ - error = brt_entry_lookup(brt, brtvd, &bre_search); - /* bre_search now contains correct bre_refcount */ - ASSERT(error == 0 || error == ENOENT); - /* - * When the BRT lock was dropped, brt_vdevs[] may have been expanded - * and reallocated, we need to update brtvd's pointer. - */ - brtvd = brt_vdev(brt, vdevid); - ASSERT(brtvd != NULL); - + error = brt_entry_lookup(brtvd, &bre_search); + /* bre_search now contains correct bre_count */ if (error == ENOENT) { - BRTSTAT_BUMP(brt_decref_entry_not_on_disk); - bre = NULL; - goto out; + BRTSTAT_BUMP(brt_decref_no_entry); + return (B_TRUE); } + ASSERT0(error); + rw_enter(&brtvd->bv_lock, RW_WRITER); racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); if (racebre != NULL) { - /* - * The entry was added when the BRT lock was dropped in - * brt_entry_lookup(). - */ + /* The entry was added when the lock was dropped. */ BRTSTAT_BUMP(brt_decref_entry_read_lost_race); bre = racebre; goto out; } BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk); - bre = brt_entry_alloc(&bre_search); - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); + bre->bre_bp = bre_search.bre_bp; + bre->bre_count = bre_search.bre_count; + bre->bre_pcount = 0; avl_insert(&brtvd->bv_tree, bre, where); - brt->brt_nentries++; out: - if (bre == NULL) { - /* - * This is a free of a regular (not cloned) block. 
- */ - brt_unlock(brt); - BRTSTAT_BUMP(brt_decref_no_entry); - return (B_TRUE); - } - if (bre->bre_refcount == 0) { - brt_unlock(brt); + if (bre->bre_count == 0) { + rw_exit(&brtvd->bv_lock); BRTSTAT_BUMP(brt_decref_free_data_now); return (B_TRUE); } - ASSERT(bre->bre_refcount > 0); - bre->bre_refcount--; - if (bre->bre_refcount == 0) + bre->bre_pcount--; + ASSERT(bre->bre_count > 0); + bre->bre_count--; + if (bre->bre_count == 0) BRTSTAT_BUMP(brt_decref_free_data_later); else BRTSTAT_BUMP(brt_decref_entry_still_referenced); - brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); + brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp)); - brt_unlock(brt); + rw_exit(&brtvd->bv_lock); return (B_FALSE); } @@ -1312,222 +1103,259 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp) uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) { - brt_t *brt = spa->spa_brt; - brt_vdev_t *brtvd; brt_entry_t bre_search, *bre; uint64_t vdevid, refcnt; int error; brt_entry_fill(bp, &bre_search, &vdevid); - brt_rlock(brt); - - brtvd = brt_vdev(brt, vdevid); + brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); ASSERT(brtvd != NULL); + rw_enter(&brtvd->bv_lock, RW_READER); + ASSERT(brtvd->bv_initiated); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre == NULL) { - error = brt_entry_lookup(brt, brtvd, &bre_search); - ASSERT(error == 0 || error == ENOENT); - if (error == ENOENT) + rw_exit(&brtvd->bv_lock); + error = brt_entry_lookup(brtvd, &bre_search); + if (error == ENOENT) { refcnt = 0; - else - refcnt = bre_search.bre_refcount; - } else - refcnt = bre->bre_refcount; + } else { + ASSERT0(error); + refcnt = bre_search.bre_count; + } + } else { + refcnt = bre->bre_count; + rw_exit(&brtvd->bv_lock); + } - brt_unlock(brt); return (refcnt); } static void -brt_prefetch(brt_t *brt, const blkptr_t *bp) +brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp) { - brt_entry_t bre; - uint64_t vdevid; - - ASSERT(bp != NULL); - - if (!brt_zap_prefetch) + if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0) return; - brt_entry_fill(bp, &bre, &vdevid); - - brt_entry_prefetch(brt, vdevid, &bre); + uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]); + rw_enter(&brtvd->bv_mos_entries_lock, RW_READER); + if (brtvd->bv_mos_entries != 0) { + (void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode, + &off, BRT_KEY_WORDS); + } + rw_exit(&brtvd->bv_mos_entries_lock); } static int -brt_pending_entry_compare(const void *x1, const void *x2) +brt_entry_compare(const void *x1, const void *x2) { - const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2; - const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp; - int cmp; - - cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), - DVA_GET_VDEV(&bp2->blk_dva[0])); - if (cmp == 0) { - cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), - DVA_GET_OFFSET(&bp2->blk_dva[0])); - if (unlikely(cmp == 0)) { - cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2)); - } - } + const brt_entry_t *bre1 = x1, *bre2 = x2; + const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp; - return (cmp); + return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), + DVA_GET_OFFSET(&bp2->blk_dva[0]))); } void brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) { - brt_t *brt; - avl_tree_t *pending_tree; - kmutex_t *pending_lock; - brt_pending_entry_t *bpe, *newbpe; + brt_entry_t *bre, *newbre; avl_index_t where; uint64_t txg; - brt = spa->spa_brt; txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); - pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; - pending_lock = 
&brt->brt_pending_lock[txg & TXG_MASK]; - newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP); - newbpe->bpe_bp = *bp; - newbpe->bpe_count = 1; + uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); + brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE); + avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; - mutex_enter(pending_lock); + newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); + newbre->bre_bp = *bp; + newbre->bre_count = 0; + newbre->bre_pcount = 1; - bpe = avl_find(pending_tree, newbpe, &where); - if (bpe == NULL) { - avl_insert(pending_tree, newbpe, where); - newbpe = NULL; + mutex_enter(&brtvd->bv_pending_lock); + bre = avl_find(pending_tree, newbre, &where); + if (bre == NULL) { + avl_insert(pending_tree, newbre, where); + newbre = NULL; } else { - bpe->bpe_count++; + bre->bre_pcount++; } + mutex_exit(&brtvd->bv_pending_lock); - mutex_exit(pending_lock); - - if (newbpe != NULL) { - ASSERT(bpe != NULL); - ASSERT(bpe != newbpe); - kmem_cache_free(brt_pending_entry_cache, newbpe); + if (newbre != NULL) { + ASSERT(bre != NULL); + ASSERT(bre != newbre); + kmem_cache_free(brt_entry_cache, newbre); } else { - ASSERT(bpe == NULL); + ASSERT0P(bre); /* Prefetch BRT entry for the syncing context. */ - brt_prefetch(brt, bp); + brt_prefetch(brtvd, bp); } } void brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) { - brt_t *brt; - avl_tree_t *pending_tree; - kmutex_t *pending_lock; - brt_pending_entry_t *bpe, bpe_search; + brt_entry_t *bre, bre_search; uint64_t txg; - brt = spa->spa_brt; txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); - pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; - pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; - bpe_search.bpe_bp = *bp; + uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); + brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); + ASSERT(brtvd != NULL); + avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; + + bre_search.bre_bp = *bp; + + mutex_enter(&brtvd->bv_pending_lock); + bre = avl_find(pending_tree, &bre_search, NULL); + ASSERT(bre != NULL); + ASSERT(bre->bre_pcount > 0); + bre->bre_pcount--; + if (bre->bre_pcount == 0) + avl_remove(pending_tree, bre); + else + bre = NULL; + mutex_exit(&brtvd->bv_pending_lock); + + if (bre) + kmem_cache_free(brt_entry_cache, bre); +} + +static void +brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg) +{ + brt_entry_t *bre, *nbre; + + /* + * We are in syncing context, so no other bv_pending_tree accesses + * are possible for the TXG. So we don't need bv_pending_lock. + */ + ASSERT(avl_is_empty(&brtvd->bv_tree)); + avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]); - mutex_enter(pending_lock); + for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) { + nbre = AVL_NEXT(&brtvd->bv_tree, bre); - bpe = avl_find(pending_tree, &bpe_search, NULL); - /* I believe we should always find bpe when this function is called. */ - if (bpe != NULL) { - ASSERT(bpe->bpe_count > 0); + /* + * If the block has DEDUP bit set, it means that it + * already exists in the DEDUP table, so we can just + * use that instead of creating new entry in the BRT. 
+ */ + if (BP_GET_DEDUP(&bre->bre_bp)) { + while (bre->bre_pcount > 0) { + if (!ddt_addref(spa, &bre->bre_bp)) + break; + bre->bre_pcount--; + } + if (bre->bre_pcount == 0) { + avl_remove(&brtvd->bv_tree, bre); + kmem_cache_free(brt_entry_cache, bre); + continue; + } + } - bpe->bpe_count--; - if (bpe->bpe_count == 0) { - avl_remove(pending_tree, bpe); - kmem_cache_free(brt_pending_entry_cache, bpe); + /* + * Unless we know that the block is definitely not in ZAP, + * try to get its reference count from there. + */ + uint64_t off = BRE_OFFSET(bre); + if (brtvd->bv_mos_entries != 0 && + brt_vdev_lookup(spa, brtvd, off)) { + int error = zap_lookup_uint64_by_dnode( + brtvd->bv_mos_entries_dnode, &off, + BRT_KEY_WORDS, 1, sizeof (bre->bre_count), + &bre->bre_count); + if (error == 0) { + BRTSTAT_BUMP(brt_addref_entry_on_disk); + } else { + ASSERT3U(error, ==, ENOENT); + BRTSTAT_BUMP(brt_addref_entry_not_on_disk); + } } } - mutex_exit(pending_lock); + /* + * If all the cloned blocks we had were handled by DDT, we don't need + * to initiate the vdev. + */ + if (avl_is_empty(&brtvd->bv_tree)) + return; + + if (!brtvd->bv_initiated) { + rw_enter(&brtvd->bv_lock, RW_WRITER); + brt_vdev_realloc(spa, brtvd); + rw_exit(&brtvd->bv_lock); + } + + /* + * Convert pending references into proper ones. This has to be a + * separate loop, since entcount modifications would cause false + * positives for brt_vdev_lookup() on following iterations. + */ + for (bre = avl_first(&brtvd->bv_tree); bre; + bre = AVL_NEXT(&brtvd->bv_tree, bre)) { + brt_vdev_addref(spa, brtvd, bre, + bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount); + bre->bre_count += bre->bre_pcount; + } } void brt_pending_apply(spa_t *spa, uint64_t txg) { - brt_t *brt = spa->spa_brt; - brt_pending_entry_t *bpe; - avl_tree_t *pending_tree; - void *c; - ASSERT3U(txg, !=, 0); + brt_rlock(spa); + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + brt_unlock(spa); - /* - * We are in syncing context, so no other brt_pending_tree accesses - * are possible for the TXG. Don't need to acquire brt_pending_lock. - */ - pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; - - c = NULL; - while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { - boolean_t added_to_ddt; - - for (int i = 0; i < bpe->bpe_count; i++) { - /* - * If the block has DEDUP bit set, it means that it - * already exists in the DEDUP table, so we can just - * use that instead of creating new entry in - * the BRT table. - */ - if (BP_GET_DEDUP(&bpe->bpe_bp)) { - added_to_ddt = ddt_addref(spa, &bpe->bpe_bp); - } else { - added_to_ddt = B_FALSE; - } - if (!added_to_ddt) - brt_entry_addref(brt, &bpe->bpe_bp); - } + brt_pending_apply_vdev(spa, brtvd, txg); - kmem_cache_free(brt_pending_entry_cache, bpe); + brt_rlock(spa); } + brt_unlock(spa); } static void brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) { - if (bre->bre_refcount == 0) { - int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset, + uint64_t off = BRE_OFFSET(bre); + + if (bre->bre_pcount == 0) { + /* The net change is zero, nothing to do in ZAP. 
*/ + } else if (bre->bre_count == 0) { + int error = zap_remove_uint64_by_dnode(dn, &off, BRT_KEY_WORDS, tx); VERIFY(error == 0 || error == ENOENT); } else { - VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset, - BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), - &bre->bre_refcount, tx)); + VERIFY0(zap_update_uint64_by_dnode(dn, &off, + BRT_KEY_WORDS, 1, sizeof (bre->bre_count), + &bre->bre_count, tx)); } } static void -brt_sync_table(brt_t *brt, dmu_tx_t *tx) +brt_sync_table(spa_t *spa, dmu_tx_t *tx) { - brt_vdev_t *brtvd; brt_entry_t *bre; - dnode_t *dn; - uint64_t vdevid; - void *c; - brt_wlock(brt); - - for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - brtvd = &brt->brt_vdevs[vdevid]; - - if (!brtvd->bv_initiated) - continue; + brt_rlock(spa); + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + brt_unlock(spa); if (!brtvd->bv_meta_dirty) { ASSERT(!brtvd->bv_entcount_dirty); ASSERT0(avl_numnodes(&brtvd->bv_tree)); + brt_rlock(spa); continue; } @@ -1535,139 +1363,122 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx) avl_numnodes(&brtvd->bv_tree) != 0); if (brtvd->bv_mos_brtvdev == 0) - brt_vdev_create(brt, brtvd, tx); - - VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries, - FTAG, &dn)); + brt_vdev_create(spa, brtvd, tx); - c = NULL; + void *c = NULL; while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { - brt_sync_entry(dn, bre, tx); - brt_entry_free(bre); - ASSERT(brt->brt_nentries > 0); - brt->brt_nentries--; + brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx); + kmem_cache_free(brt_entry_cache, bre); } - dnode_rele(dn, FTAG); - - brt_vdev_sync(brt, brtvd, tx); - +#ifdef ZFS_DEBUG + if (zfs_flags & ZFS_DEBUG_BRT) + brt_vdev_dump(brtvd); +#endif if (brtvd->bv_totalcount == 0) - brt_vdev_destroy(brt, brtvd, tx); + brt_vdev_destroy(spa, brtvd, tx); + else + brt_vdev_sync(spa, brtvd, tx); + brt_rlock(spa); } - - ASSERT0(brt->brt_nentries); - - brt_unlock(brt); + brt_unlock(spa); } void brt_sync(spa_t *spa, uint64_t txg) { dmu_tx_t *tx; - brt_t *brt; + uint64_t vdevid; - ASSERT(spa_syncing_txg(spa) == txg); + ASSERT3U(spa_syncing_txg(spa), ==, txg); - brt = spa->spa_brt; - brt_rlock(brt); - if (brt->brt_nentries == 0) { - /* No changes. 
*/ - brt_unlock(brt); + brt_rlock(spa); + for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty) + break; + } + if (vdevid >= spa->spa_brt_nvdevs) { + brt_unlock(spa); return; } - brt_unlock(brt); + brt_unlock(spa); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - - brt_sync_table(brt, tx); - + brt_sync_table(spa, tx); dmu_tx_commit(tx); } -static void -brt_table_alloc(brt_t *brt) -{ - - for (int i = 0; i < TXG_SIZE; i++) { - avl_create(&brt->brt_pending_tree[i], - brt_pending_entry_compare, - sizeof (brt_pending_entry_t), - offsetof(brt_pending_entry_t, bpe_node)); - mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT, - NULL); - } -} - -static void -brt_table_free(brt_t *brt) -{ - - for (int i = 0; i < TXG_SIZE; i++) { - ASSERT(avl_is_empty(&brt->brt_pending_tree[i])); - - avl_destroy(&brt->brt_pending_tree[i]); - mutex_destroy(&brt->brt_pending_lock[i]); - } -} - static void brt_alloc(spa_t *spa) { - brt_t *brt; - - ASSERT(spa->spa_brt == NULL); - - brt = kmem_zalloc(sizeof (*brt), KM_SLEEP); - rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL); - brt->brt_spa = spa; - brt->brt_rangesize = 0; - brt->brt_nentries = 0; - brt->brt_vdevs = NULL; - brt->brt_nvdevs = 0; - brt_table_alloc(brt); - - spa->spa_brt = brt; + rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL); + spa->spa_brt_vdevs = NULL; + spa->spa_brt_nvdevs = 0; + spa->spa_brt_rangesize = 0; } void brt_create(spa_t *spa) { - brt_alloc(spa); - brt_vdevs_alloc(spa->spa_brt, B_FALSE); + spa->spa_brt_rangesize = BRT_RANGESIZE; } int brt_load(spa_t *spa) { + int error = 0; brt_alloc(spa); - brt_vdevs_alloc(spa->spa_brt, B_TRUE); + brt_wlock(spa); + for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children; + vdevid++) { + char name[64]; + uint64_t mos_brtvdev; + + /* Look if this vdev had active block cloning. */ + snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, + (u_longlong_t)vdevid); + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, + &mos_brtvdev); + if (error == ENOENT) { + error = 0; + continue; + } + if (error != 0) + break; + + /* If it did, then allocate them all and load this one. 
*/ + brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children); + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + rw_enter(&brtvd->bv_lock, RW_WRITER); + brtvd->bv_mos_brtvdev = mos_brtvdev; + error = brt_vdev_load(spa, brtvd); + rw_exit(&brtvd->bv_lock); + if (error != 0) + break; + } - return (0); + if (spa->spa_brt_rangesize == 0) + spa->spa_brt_rangesize = BRT_RANGESIZE; + brt_unlock(spa); + return (error); } void brt_unload(spa_t *spa) { - brt_t *brt = spa->spa_brt; - - if (brt == NULL) + if (spa->spa_brt_rangesize == 0) return; - - brt_vdevs_free(brt); - brt_table_free(brt); - rw_destroy(&brt->brt_lock); - kmem_free(brt, sizeof (*brt)); - spa->spa_brt = NULL; + brt_vdevs_free(spa); + rw_destroy(&spa->spa_brt_lock); + spa->spa_brt_rangesize = 0; } -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW, "Enable prefetching of BRT ZAP entries"); ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW, "BRT ZAP leaf blockshift"); ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW, "BRT ZAP indirect blockshift"); -/* END CSTYLED */ diff --git a/module/zfs/btree.c b/module/zfs/btree.c index 9c52083603f1..bff2b6c21f44 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -2208,8 +2208,6 @@ zfs_btree_verify(zfs_btree_t *tree) zfs_btree_verify_poison(tree); } -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW, "Enable btree verification. Levels above 4 require ZFS be built " "with debugging"); -/* END CSTYLED */ diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index b1419d96f4ef..90395cad6e45 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -89,7 +89,6 @@ typedef struct dbuf_stats { kstat_named_t hash_misses; kstat_named_t hash_collisions; kstat_named_t hash_elements; - kstat_named_t hash_elements_max; /* * Number of sublists containing more than one dbuf in the dbuf * hash table. Keep track of the longest hash chain. 
@@ -134,7 +133,6 @@ dbuf_stats_t dbuf_stats = { { "hash_misses", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, - { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "hash_insert_race", KSTAT_DATA_UINT64 }, @@ -154,6 +152,7 @@ struct { wmsum_t hash_hits; wmsum_t hash_misses; wmsum_t hash_collisions; + wmsum_t hash_elements; wmsum_t hash_chains; wmsum_t hash_insert_race; wmsum_t metadata_cache_count; @@ -432,8 +431,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db) db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; mutex_exit(DBUF_HASH_MUTEX(h, idx)); - uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64); - DBUF_STAT_MAX(hash_elements_max, he); + DBUF_STAT_BUMP(hash_elements); return (NULL); } @@ -506,7 +504,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db) h->hash_table[idx]->db_hash_next == NULL) DBUF_STAT_BUMPDOWN(hash_chains); mutex_exit(DBUF_HASH_MUTEX(h, idx)); - atomic_dec_64(&dbuf_stats.hash_elements.value.ui64); + DBUF_STAT_BUMPDOWN(hash_elements); } typedef enum { @@ -903,6 +901,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw) wmsum_value(&dbuf_sums.hash_misses); ds->hash_collisions.value.ui64 = wmsum_value(&dbuf_sums.hash_collisions); + ds->hash_elements.value.ui64 = + wmsum_value(&dbuf_sums.hash_elements); ds->hash_chains.value.ui64 = wmsum_value(&dbuf_sums.hash_chains); ds->hash_insert_race.value.ui64 = @@ -1004,6 +1004,7 @@ dbuf_init(void) wmsum_init(&dbuf_sums.hash_hits, 0); wmsum_init(&dbuf_sums.hash_misses, 0); wmsum_init(&dbuf_sums.hash_collisions, 0); + wmsum_init(&dbuf_sums.hash_elements, 0); wmsum_init(&dbuf_sums.hash_chains, 0); wmsum_init(&dbuf_sums.hash_insert_race, 0); wmsum_init(&dbuf_sums.metadata_cache_count, 0); @@ -1077,6 +1078,7 @@ dbuf_fini(void) wmsum_fini(&dbuf_sums.hash_hits); wmsum_fini(&dbuf_sums.hash_misses); wmsum_fini(&dbuf_sums.hash_collisions); + wmsum_fini(&dbuf_sums.hash_elements); wmsum_fini(&dbuf_sums.hash_chains); wmsum_fini(&dbuf_sums.hash_insert_race); wmsum_fini(&dbuf_sums.metadata_cache_count); @@ -2578,8 +2580,11 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * We are freeing a block that we cloned in the same * transaction group. */ - brt_pending_remove(dmu_objset_spa(db->db_objset), - &dr->dt.dl.dr_overridden_by, tx); + blkptr_t *bp = &dr->dt.dl.dr_overridden_by; + if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { + brt_pending_remove(dmu_objset_spa(db->db_objset), + bp, tx); + } } dnode_t *dn = dr->dr_dnode; @@ -2916,7 +2921,7 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail) * pending clone and mark the block as uncached. This will be * as if the clone was never done. 
*/ - if (dr && dr->dt.dl.dr_brtwrite) { + if (db->db_state == DB_NOFILL) { VERIFY(!dbuf_undirty(db, tx)); db->db_state = DB_UNCACHED; } @@ -4774,8 +4779,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) if (*datap != NULL && *datap == db->db_buf && dn->dn_object != DMU_META_DNODE_OBJECT && - zfs_refcount_count(&db->db_holds) > 1 && - dr->dt.dl.dr_override_state != DR_OVERRIDDEN) { + zfs_refcount_count(&db->db_holds) > 1) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 137fe487a997..64924bc4fa61 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -258,9 +258,7 @@ const ddt_ops_t ddt_zap_ops = { ddt_zap_count, }; -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_bs, UINT, ZMOD_RW, "DDT ZAP leaf blockshift"); ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_ibs, UINT, ZMOD_RW, "DDT ZAP indirect blockshift"); -/* END CSTYLED */ diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 362415a25895..32609399b79e 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1221,6 +1221,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); + ASSERT(db->db_data != NULL); (void) memcpy(buf, (char *)db->db_data + bufoff, tocpy); offset += tocpy; @@ -1278,6 +1279,7 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, else dmu_buf_will_dirty(db, tx); + ASSERT(db->db_data != NULL); (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); if (tocpy == db->db_size) @@ -1426,6 +1428,7 @@ dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size) bufoff = zfs_uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); + ASSERT(db->db_data != NULL); err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_READ, uio); @@ -1550,6 +1553,7 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) else dmu_buf_will_dirty(db, tx); + ASSERT(db->db_data != NULL); err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); @@ -2938,10 +2942,8 @@ ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW, "Enable forcing txg sync to find holes"); -/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW, "Limit one prefetch call to this size"); -/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , dmu_ddt_copies, UINT, ZMOD_RW, "Override copies= for dedup objects"); diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c index 56986ea43446..344b0e3750e9 100644 --- a/module/zfs/dmu_object.c +++ b/module/zfs/dmu_object.c @@ -519,7 +519,5 @@ EXPORT_SYMBOL(dmu_object_next); EXPORT_SYMBOL(dmu_object_zapify); EXPORT_SYMBOL(dmu_object_free_zapified); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, UINT, ZMOD_RW, "CPU-specific allocator grabs 2^N objects at once"); -/* END CSTYLED */ diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index b1cd981cec1d..a33216be6ecf 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -3843,4 +3843,3 @@ ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, best_effort_corrective, INT, ZMOD_RW, "Ignore errors during corrective receive"); -/* END CSTYLED */ diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 15cc2885e805..aa0434f3c722 100644 --- 
a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -818,6 +818,5 @@ MODULE_PARM_DESC(ignore_hole_birth, "Alias for send_holes_without_birth_time"); #endif -/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , send_holes_without_birth_time, INT, ZMOD_RW, "Ignore hole_birth txg for zfs send"); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 6aee7afb6954..e4895a6bcd7f 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -800,6 +800,14 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) case THT_CLONE: if (blkid >= beginblk && blkid <= endblk) match_offset = TRUE; + /* + * They might have to increase nlevels, + * thus dirtying the new TLIBs. Or they + * might have to change the block size, + * thus dirtying the new lvl=0 blk=0. + */ + if (blkid == 0) + match_offset = TRUE; break; default: cmn_err(CE_PANIC, "bad txh_type %d", diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index ed50f1889b59..ea593c0d86e1 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -63,8 +63,8 @@ static unsigned int zfetch_min_distance = 4 * 1024 * 1024; /* max bytes to prefetch per stream (default 64MB) */ unsigned int zfetch_max_distance = 64 * 1024 * 1024; #endif -/* max bytes to prefetch indirects for per stream (default 64MB) */ -unsigned int zfetch_max_idistance = 64 * 1024 * 1024; +/* max bytes to prefetch indirects for per stream (default 128MB) */ +unsigned int zfetch_max_idistance = 128 * 1024 * 1024; /* max request reorder distance within a stream (default 16MB) */ unsigned int zfetch_max_reorder = 16 * 1024 * 1024; /* Max log2 fraction of holes in a stream */ @@ -472,6 +472,7 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, zstream_t *zs; spa_t *spa = zf->zf_dnode->dn_objset->os_spa; zfs_prefetch_type_t os_prefetch = zf->zf_dnode->dn_objset->os_prefetch; + int64_t ipf_start, ipf_end; if (zfs_prefetch_disable || os_prefetch == ZFS_PREFETCH_NONE) return (NULL); @@ -571,13 +572,13 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, * This access is not part of any existing stream. Create a new * stream for it unless we are at the end of file. */ + ASSERT0P(zs); if (end_blkid < maxblkid) dmu_zfetch_stream_create(zf, end_blkid); mutex_exit(&zf->zf_lock); - if (!have_lock) - rw_exit(&zf->zf_dnode->dn_struct_rwlock); ZFETCHSTAT_BUMP(zfetchstat_misses); - return (NULL); + ipf_start = 0; + goto prescient; hit: nblks = dmu_zfetch_hit(zs, nblks); @@ -650,6 +651,7 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, pf_nblks = zs->zs_ipf_dist >> dbs; if (zs->zs_ipf_start < zs->zs_pf_end) zs->zs_ipf_start = zs->zs_pf_end; + ipf_start = zs->zs_ipf_end; if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks) zs->zs_ipf_end = zs->zs_pf_end + pf_nblks; @@ -658,8 +660,30 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, zfs_refcount_add(&zs->zs_callers, NULL); mutex_exit(&zf->zf_lock); +prescient: + /* + * Prefetch the following indirect blocks for this access to reduce + * dbuf_hold() sync read delays in dmu_buf_hold_array_by_dnode(). + * This covers the gap during the first couple accesses when we can + * not predict the future yet, but know what is needed right now. + * This should be very rare for reads/writes to need more than one + * indirect, but more useful for cloning due to much bigger accesses.
+ */ + ipf_start = MAX(ipf_start, blkid + 1); + int epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; + ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; + ipf_end = P2ROUNDUP(end_blkid, 1 << epbs) >> epbs; + + int issued = 0; + for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) { + issued += dbuf_prefetch(zf->zf_dnode, 1, iblk, + ZIO_PRIORITY_SYNC_READ, ARC_FLAG_PRESCIENT_PREFETCH); + } + if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); + if (issued) + ZFETCHSTAT_ADD(zfetchstat_io_issued, issued); return (zs); } diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 2248f644bee7..629edd813fb9 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -68,6 +68,7 @@ #include #include #include +#include /* * The SPA supports block sizes up to 16MB. However, very large blocks @@ -289,8 +290,26 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, if (BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) { int64_t delta; - dprintf_bp(bp, "freeing ds=%llu", (u_longlong_t)ds->ds_object); - dsl_free(tx->tx_pool, tx->tx_txg, bp); + /* + * Put blocks that would create IO on the pool's deadlist for + * dsl_process_async_destroys() to find. This is to prevent + * zio_free() from creating a ZIO_TYPE_FREE IO for them, which + * are very heavy and can lead to out-of-memory conditions if + * something tries to free millions of blocks on the same txg. + */ + boolean_t defer = spa_version(spa) >= SPA_VERSION_DEADLISTS && + (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || + brt_maybe_exists(spa, bp)); + + if (defer) { + dprintf_bp(bp, "putting on free list: %s", ""); + bpobj_enqueue(&ds->ds_dir->dd_pool->dp_free_bpobj, + bp, B_FALSE, tx); + } else { + dprintf_bp(bp, "freeing ds=%llu", + (u_longlong_t)ds->ds_object); + dsl_free(tx->tx_pool, tx->tx_txg, bp); + } mutex_enter(&ds->ds_lock); ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used || @@ -298,9 +317,14 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, delta = parent_delta(ds, -used); dsl_dataset_phys(ds)->ds_unique_bytes -= used; mutex_exit(&ds->ds_lock); + dsl_dir_diduse_transfer_space(ds->ds_dir, delta, -compressed, -uncompressed, -used, DD_USED_REFRSRV, DD_USED_HEAD, tx); + + if (defer) + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, + DD_USED_HEAD, used, compressed, uncompressed, tx); } else { dprintf_bp(bp, "putting on dead list: %s", ""); if (async) { @@ -677,13 +701,17 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, const void *tag, ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; } - dsl_deadlist_open(&ds->ds_deadlist, - mos, dsl_dataset_phys(ds)->ds_deadlist_obj); - uint64_t remap_deadlist_obj = - dsl_dataset_get_remap_deadlist_object(ds); - if (remap_deadlist_obj != 0) { - dsl_deadlist_open(&ds->ds_remap_deadlist, mos, - remap_deadlist_obj); + if (err == 0) { + err = dsl_deadlist_open(&ds->ds_deadlist, + mos, dsl_dataset_phys(ds)->ds_deadlist_obj); + } + if (err == 0) { + uint64_t remap_deadlist_obj = + dsl_dataset_get_remap_deadlist_object(ds); + if (remap_deadlist_obj != 0) { + err = dsl_deadlist_open(&ds->ds_remap_deadlist, + mos, remap_deadlist_obj); + } } dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync, @@ -692,7 +720,8 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, const void *tag, winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu); if (err != 0 || winner != NULL) { - dsl_deadlist_close(&ds->ds_deadlist); + if (dsl_deadlist_is_open(&ds->ds_deadlist)) + dsl_deadlist_close(&ds->ds_deadlist); if 
(dsl_deadlist_is_open(&ds->ds_remap_deadlist)) dsl_deadlist_close(&ds->ds_remap_deadlist); dsl_bookmark_fini_ds(ds); @@ -1799,8 +1828,8 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX, dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_open(&ds->ds_deadlist, mos, - dsl_dataset_phys(ds)->ds_deadlist_obj); + VERIFY0(dsl_deadlist_open(&ds->ds_deadlist, mos, + dsl_dataset_phys(ds)->ds_deadlist_obj)); dsl_deadlist_add_key(&ds->ds_deadlist, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); dsl_bookmark_snapshotted(ds, tx); @@ -4020,14 +4049,14 @@ dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone, if (clone_remap_dl_obj != 0) { dsl_dataset_set_remap_deadlist_object(origin, clone_remap_dl_obj, tx); - dsl_deadlist_open(&origin->ds_remap_deadlist, - dp->dp_meta_objset, clone_remap_dl_obj); + VERIFY0(dsl_deadlist_open(&origin->ds_remap_deadlist, + dp->dp_meta_objset, clone_remap_dl_obj)); } if (origin_remap_dl_obj != 0) { dsl_dataset_set_remap_deadlist_object(clone, origin_remap_dl_obj, tx); - dsl_deadlist_open(&clone->ds_remap_deadlist, - dp->dp_meta_objset, origin_remap_dl_obj); + VERIFY0(dsl_deadlist_open(&clone->ds_remap_deadlist, + dp->dp_meta_objset, origin_remap_dl_obj)); } } @@ -4198,10 +4227,10 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_deadlist_close(&origin_head->ds_deadlist); SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj, dsl_dataset_phys(clone)->ds_deadlist_obj); - dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset, - dsl_dataset_phys(clone)->ds_deadlist_obj); - dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset, - dsl_dataset_phys(origin_head)->ds_deadlist_obj); + VERIFY0(dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset, + dsl_dataset_phys(clone)->ds_deadlist_obj)); + VERIFY0(dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset, + dsl_dataset_phys(origin_head)->ds_deadlist_obj)); dsl_dataset_swap_remap_deadlists(clone, origin_head, tx); /* @@ -4935,8 +4964,8 @@ dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); dsl_dataset_set_remap_deadlist_object(ds, remap_deadlist_obj, tx); - dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa), - remap_deadlist_obj); + VERIFY0(dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa), + remap_deadlist_obj)); spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); } diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index e457e2fd86ef..b1be8fae3b47 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -299,30 +299,33 @@ dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args) } } -void +int dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) { dmu_object_info_t doi; + int err; ASSERT(!dsl_deadlist_is_open(dl)); mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); dl->dl_os = os; dl->dl_object = object; - VERIFY0(dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); + err = dmu_bonus_hold(os, object, dl, &dl->dl_dbuf); + if (err != 0) + return (err); dmu_object_info_from_db(dl->dl_dbuf, &doi); if (doi.doi_type == DMU_OT_BPOBJ) { dmu_buf_rele(dl->dl_dbuf, dl); dl->dl_dbuf = NULL; dl->dl_oldfmt = B_TRUE; - VERIFY0(bpobj_open(&dl->dl_bpobj, os, object)); - return; + return (bpobj_open(&dl->dl_bpobj, os, object)); } dl->dl_oldfmt = B_FALSE; dl->dl_phys = dl->dl_dbuf->db_data; dl->dl_havetree = B_FALSE; dl->dl_havecache = B_FALSE; + 
return (0); } boolean_t @@ -686,7 +689,7 @@ dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, dsl_deadlist_t dl = { 0 }; dsl_pool_t *dp = dmu_objset_pool(os); - dsl_deadlist_open(&dl, os, dlobj); + VERIFY0(dsl_deadlist_open(&dl, os, dlobj)); if (dl.dl_oldfmt) { dsl_deadlist_close(&dl); return; diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index b2b925b135f7..e6c7e79ed6c0 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -182,10 +182,10 @@ process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, dsl_dataset_phys(ds)->ds_deadlist_obj = dsl_dataset_phys(ds_next)->ds_deadlist_obj; dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj; - dsl_deadlist_open(&ds->ds_deadlist, mos, - dsl_dataset_phys(ds)->ds_deadlist_obj); - dsl_deadlist_open(&ds_next->ds_deadlist, mos, - dsl_dataset_phys(ds_next)->ds_deadlist_obj); + VERIFY0(dsl_deadlist_open(&ds->ds_deadlist, mos, + dsl_dataset_phys(ds)->ds_deadlist_obj)); + VERIFY0(dsl_deadlist_open(&ds_next->ds_deadlist, mos, + dsl_dataset_phys(ds_next)->ds_deadlist_obj)); } typedef struct remaining_clones_key { diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 1b60fa620b8d..71f151b14d9b 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -272,9 +272,11 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, err = zap_lookup(dp->dp_meta_objset, dd->dd_object, DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj); - if (err == 0) - dsl_dir_livelist_open(dd, obj); - else if (err != ENOENT) + if (err == 0) { + err = dsl_dir_livelist_open(dd, obj); + if (err != 0) + goto errout; + } else if (err != ENOENT) goto errout; } } @@ -2301,15 +2303,18 @@ dsl_dir_is_zapified(dsl_dir_t *dd) return (doi.doi_type == DMU_OTN_ZAP_METADATA); } -void +int dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj) { objset_t *mos = dd->dd_pool->dp_meta_objset; ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa, SPA_FEATURE_LIVELIST)); - dsl_deadlist_open(&dd->dd_livelist, mos, obj); + int err = dsl_deadlist_open(&dd->dd_livelist, mos, obj); + if (err != 0) + return (err); bplist_create(&dd->dd_pending_allocs); bplist_create(&dd->dd_pending_frees); + return (0); } void @@ -2489,6 +2494,5 @@ EXPORT_SYMBOL(dsl_dir_set_quota); EXPORT_SYMBOL(dsl_dir_set_reservation); #endif -/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , zvol_enforce_quotas, INT, ZMOD_RW, "Enable strict ZVOL quota enforcment"); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 6cd0dbdea195..3eba4cb35cc6 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -231,6 +231,9 @@ static uint_t zfs_resilver_defer_percent = 10; ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) +#define DSL_SCAN_IS_SCRUB(scn) \ + ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB) + /* * Enable/disable the processing of the free_bpobj object. 
*/ @@ -855,15 +858,15 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx) void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { - (void) arg; + setup_sync_arg_t *setup_sync_arg = (setup_sync_arg_t *)arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - pool_scan_func_t *funcp = arg; dmu_object_type_t ot = 0; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; ASSERT(!dsl_scan_is_running(scn)); - ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); + ASSERT3U(setup_sync_arg->func, >, POOL_SCAN_NONE); + ASSERT3U(setup_sync_arg->func, <, POOL_SCAN_FUNCS); memset(&scn->scn_phys, 0, sizeof (scn->scn_phys)); /* @@ -873,10 +876,14 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys)); dsl_errorscrub_sync_state(scn, tx); - scn->scn_phys.scn_func = *funcp; + scn->scn_phys.scn_func = setup_sync_arg->func; scn->scn_phys.scn_state = DSS_SCANNING; - scn->scn_phys.scn_min_txg = 0; - scn->scn_phys.scn_max_txg = tx->tx_txg; + scn->scn_phys.scn_min_txg = setup_sync_arg->txgstart; + if (setup_sync_arg->txgend == 0) { + scn->scn_phys.scn_max_txg = tx->tx_txg; + } else { + scn->scn_phys.scn_max_txg = setup_sync_arg->txgend; + } scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ scn->scn_phys.scn_start_time = gethrestime_sec(); scn->scn_phys.scn_errors = 0; @@ -963,7 +970,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) spa_history_log_internal(spa, "scan setup", tx, "func=%u mintxg=%llu maxtxg=%llu", - *funcp, (u_longlong_t)scn->scn_phys.scn_min_txg, + setup_sync_arg->func, (u_longlong_t)scn->scn_phys.scn_min_txg, (u_longlong_t)scn->scn_phys.scn_max_txg); } @@ -973,10 +980,16 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) * error scrub. */ int -dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) +dsl_scan(dsl_pool_t *dp, pool_scan_func_t func, uint64_t txgstart, + uint64_t txgend) { spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; + setup_sync_arg_t setup_sync_arg; + + if (func != POOL_SCAN_SCRUB && (txgstart != 0 || txgend != 0)) { + return (EINVAL); + } /* * Purge all vdev caches and probe all devices. 
We do this here @@ -1027,8 +1040,13 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) return (SET_ERROR(err)); } + setup_sync_arg.func = func; + setup_sync_arg.txgstart = txgstart; + setup_sync_arg.txgend = txgend; + return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, - dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); + dsl_scan_setup_sync, &setup_sync_arg, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); } static void @@ -1116,15 +1134,24 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) spa_notify_waiters(spa); - if (dsl_scan_restarting(scn, tx)) + if (dsl_scan_restarting(scn, tx)) { spa_history_log_internal(spa, "scan aborted, restarting", tx, "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); - else if (!complete) + } else if (!complete) { spa_history_log_internal(spa, "scan cancelled", tx, "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); - else + } else { spa_history_log_internal(spa, "scan done", tx, "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); + if (DSL_SCAN_IS_SCRUB(scn)) { + VERIFY0(zap_update(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_LAST_SCRUBBED_TXG, + sizeof (uint64_t), 1, + &scn->scn_phys.scn_max_txg, tx)); + spa->spa_scrubbed_last_txg = scn->scn_phys.scn_max_txg; + } + } if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { spa->spa_scrub_active = B_FALSE; @@ -4330,14 +4357,18 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) * current scan progress is below zfs_resilver_defer_percent. */ if (dsl_scan_restarting(scn, tx) || restart_early) { - pool_scan_func_t func = POOL_SCAN_SCRUB; + setup_sync_arg_t setup_sync_arg = { + .func = POOL_SCAN_SCRUB, + .txgstart = 0, + .txgend = 0, + }; dsl_scan_done(scn, B_FALSE, tx); if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) - func = POOL_SCAN_RESILVER; + setup_sync_arg.func = POOL_SCAN_RESILVER; zfs_dbgmsg("restarting scan func=%u on %s txg=%llu early=%d", - func, dp->dp_spa->spa_name, (longlong_t)tx->tx_txg, - restart_early); - dsl_scan_setup_sync(&func, tx); + setup_sync_arg.func, dp->dp_spa->spa_name, + (longlong_t)tx->tx_txg, restart_early); + dsl_scan_setup_sync(&setup_sync_arg, tx); } /* @@ -5314,4 +5345,3 @@ ZFS_MODULE_PARAM(zfs, zfs_, resilver_defer_percent, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, UINT, ZMOD_RW, "Error blocks to be scrubbed in one txg"); -/* END CSTYLED */ diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 3bd6e93e93a4..7affbfac9dc7 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -6226,7 +6226,6 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW, "Delay in milliseconds after metaslab was last used before unloading"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW, "Percentage of metaslab group size that should be free to make it " "eligible for allocation"); @@ -6239,7 +6238,6 @@ ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, ZMOD_RW, "Use the fragmentation metric to prefer less fragmented metaslabs"); -/* END CSTYLED */ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT, ZMOD_RW, "Fragmentation for metaslab to allow allocation"); @@ -6280,8 +6278,6 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT, ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW, "Normally only 
consider this many of the best metaslabs in each vdev"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator, param_set_active_allocator, param_get_charp, ZMOD_RW, "SPA active allocator"); -/* END CSTYLED */ diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 71122542758d..493884cf04c4 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -736,11 +736,9 @@ mmp_signal_all_threads(void) mutex_exit(&spa_namespace_lock); } -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_multihost, zfs_multihost_, interval, param_set_multihost_interval, spl_param_get_u64, ZMOD_RW, "Milliseconds between mmp writes to each leaf"); -/* END CSTYLED */ ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, fail_intervals, UINT, ZMOD_RW, "Max allowed period without a successful mmp write"); diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c index 718bbb34a8d5..0dd7da1aa197 100644 --- a/module/zfs/refcount.c +++ b/module/zfs/refcount.c @@ -349,11 +349,9 @@ EXPORT_SYMBOL(zfs_refcount_add); EXPORT_SYMBOL(zfs_refcount_remove); EXPORT_SYMBOL(zfs_refcount_held); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, , reference_tracking_enable, INT, ZMOD_RW, "Track reference holders to refcount_t objects"); ZFS_MODULE_PARAM(zfs, , reference_history, UINT, ZMOD_RW, "Maximum reference holders being tracked"); -/* END CSTYLED */ #endif /* ZFS_DEBUG */ diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 6b8c7ee93daa..b83c982c13fd 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -451,9 +451,10 @@ spa_prop_get_config(spa_t *spa, nvlist_t *nv) spa_prop_add_list(nv, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL, ddt_get_ddt_dsize(spa), src); - spa_prop_add_list(nv, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); + spa_prop_add_list(nv, ZPOOL_PROP_LAST_SCRUBBED_TXG, NULL, + spa_get_last_scrubbed_txg(spa), src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { @@ -2081,6 +2082,7 @@ spa_unload(spa_t *spa) vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); + l2arc_spa_rebuild_stop(spa); } } @@ -3064,7 +3066,7 @@ spa_livelist_delete_cb(void *arg, zthr_t *z) dsl_deadlist_entry_t *dle; bplist_t to_free; ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); - dsl_deadlist_open(ll, mos, ll_obj); + VERIFY0(dsl_deadlist_open(ll, mos, ll_obj)); dle = dsl_deadlist_first(ll); ASSERT3P(dle, !=, NULL); bplist_create(&to_free); @@ -4726,6 +4728,12 @@ spa_ld_get_props(spa_t *spa) if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + /* Load the last scrubbed txg. */ + error = spa_dir_prop(spa, DMU_POOL_LAST_SCRUBBED_TXG, + &spa->spa_scrubbed_last_txg, B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + /* * Load the livelist deletion field. 
If a livelist is queued for * deletion, indicate that in the spa @@ -7115,6 +7123,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); + l2arc_spa_rebuild_stop(spa); /* * We want this to be reflected on every label, @@ -8867,6 +8876,13 @@ spa_scan_stop(spa_t *spa) int spa_scan(spa_t *spa, pool_scan_func_t func) +{ + return (spa_scan_range(spa, func, 0, 0)); +} + +int +spa_scan_range(spa_t *spa, pool_scan_func_t func, uint64_t txgstart, + uint64_t txgend) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); @@ -8877,6 +8893,9 @@ spa_scan(spa_t *spa, pool_scan_func_t func) !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) return (SET_ERROR(ENOTSUP)); + if (func != POOL_SCAN_SCRUB && (txgstart != 0 || txgend != 0)) + return (SET_ERROR(ENOTSUP)); + /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. @@ -8891,7 +8910,7 @@ spa_scan(spa_t *spa, pool_scan_func_t func) !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) return (SET_ERROR(ENOTSUP)); - return (dsl_scan(spa->spa_dsl_pool, func)); + return (dsl_scan(spa->spa_dsl_pool, func, txgstart, txgend)); } /* @@ -10974,6 +10993,7 @@ EXPORT_SYMBOL(spa_l2cache_drop); /* scanning */ EXPORT_SYMBOL(spa_scan); +EXPORT_SYMBOL(spa_scan_range); EXPORT_SYMBOL(spa_scan_stop); /* spa syncing */ @@ -10991,11 +11011,9 @@ EXPORT_SYMBOL(spa_event_notify); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW, "Percentage of CPUs to run a metaslab preload taskq"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW, "log2 fraction of arc that can be used by inflight I/Os when " "verifying pool during import"); -/* END CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, "Set to traverse metadata on pool import"); @@ -11012,11 +11030,9 @@ ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW, "Number of threads per IO worker taskqueue"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, "Allow importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); -/* END CSTYLED */ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, "Set the livelist condense zthr to pause"); @@ -11024,7 +11040,6 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW, "Set the livelist condense synctask to pause"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW, "Whether livelist condensing was canceled in the synctask"); @@ -11046,7 +11061,6 @@ ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW, "Configure IO queues for write IO"); #endif -/* END CSTYLED */ ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW, "Number of CPUs per write issue taskq"); diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c index 1efff47f87a0..4c3721c159be 100644 --- a/module/zfs/spa_checkpoint.c +++ b/module/zfs/spa_checkpoint.c @@ -633,8 +633,6 @@ EXPORT_SYMBOL(spa_checkpoint_get_stats); EXPORT_SYMBOL(spa_checkpoint_discard_thread); EXPORT_SYMBOL(spa_checkpoint_discard_thread_check); -/* BEGIN CSTYLED 
*/ ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, U64, ZMOD_RW, "Limit for memory used in prefetching the checkpoint space map done " "on each vdev while discarding the checkpoint"); -/* END CSTYLED */ diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index a49e28ee7a43..18b3970ac0dc 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -1491,8 +1491,6 @@ EXPORT_SYMBOL(zep_to_zb); EXPORT_SYMBOL(name_to_errphys); #endif -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, upgrade_errlog_limit, UINT, ZMOD_RW, "Limit the number of errors which will be upgraded to the new " "on-disk error log when enabling head_errlog"); -/* END CSTYLED */ diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index de036d6c3718..81bb99eb2ccd 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -390,6 +390,9 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) return (err); } + ASSERT3UF(tx->tx_txg, <=, spa_final_dirty_txg(spa), + "Logged %s after final txg was set!", "nvlist"); + VERIFY0(nvlist_dup(nvl, &nvarg, KM_SLEEP)); if (spa_history_zone() != NULL) { fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE, @@ -527,6 +530,9 @@ log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, return; } + ASSERT3UF(tx->tx_txg, <=, spa_final_dirty_txg(spa), + "Logged after final txg was set: %s %s", operation, fmt); + msg = kmem_vasprintf(fmt, adx); fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg); kmem_strfree(msg); diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index f55218e3579b..a95152608578 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -1364,7 +1364,6 @@ spa_ld_log_spacemaps(spa_t *spa) return (error); } -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, U64, ZMOD_RW, "Specific hard-limit in memory that ZFS allows to be used for " "unflushed changes"); @@ -1383,8 +1382,8 @@ ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, U64, ZMOD_RW, "log spacemap (see zfs_unflushed_log_block_max)"); ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, U64, ZMOD_RW, - "Hard limit (upper-bound) in the size of the space map log " - "in terms of dirty TXGs."); + "Hard limit (upper-bound) in the size of the space map log " + "in terms of dirty TXGs."); ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, UINT, ZMOD_RW, "Tunable used to determine the number of blocks that can be used for " @@ -1399,7 +1398,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, U64, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW, "Prevent the log spacemaps from being flushed and destroyed " "during pool export/destroy"); -/* END CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, U64, ZMOD_RW, "Maximum number of rows allowed in the summary of the spacemap log"); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index f486513fcaf9..0550dfd4766d 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1870,13 +1870,7 @@ spa_get_slop_space(spa_t *spa) if (spa->spa_dedup_dspace == ~0ULL) spa_update_dspace(spa); - /* - * spa_get_dspace() includes the space only logically "used" by - * deduplicated data, so since it's not useful to reserve more - * space with more deduplicated data, we subtract that out here. 
- */ - space = - spa_get_dspace(spa) - spa->spa_dedup_dspace - brt_get_dspace(spa); + space = spa->spa_rdspace; slop = MIN(space >> spa_slop_shift, spa_max_slop); /* @@ -1912,8 +1906,7 @@ spa_get_checkpoint_space(spa_t *spa) void spa_update_dspace(spa_t *spa) { - spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + - ddt_get_dedup_dspace(spa) + brt_get_dspace(spa); + spa->spa_rdspace = metaslab_class_get_dspace(spa_normal_class(spa)); if (spa->spa_nonallocating_dspace > 0) { /* * Subtract the space provided by all non-allocating vdevs that @@ -1933,9 +1926,11 @@ spa_update_dspace(spa_t *spa) * doesn't matter that the data we are moving may be * allocated twice (on the old device and the new device). */ - ASSERT3U(spa->spa_dspace, >=, spa->spa_nonallocating_dspace); - spa->spa_dspace -= spa->spa_nonallocating_dspace; + ASSERT3U(spa->spa_rdspace, >=, spa->spa_nonallocating_dspace); + spa->spa_rdspace -= spa->spa_nonallocating_dspace; } + spa->spa_dspace = spa->spa_rdspace + ddt_get_dedup_dspace(spa) + + brt_get_dspace(spa); } /* @@ -2681,6 +2676,12 @@ spa_mode(spa_t *spa) return (spa->spa_mode); } +uint64_t +spa_get_last_scrubbed_txg(spa_t *spa) +{ + return (spa->spa_scrubbed_last_txg); +} + uint64_t spa_bootfs(spa_t *spa) { @@ -3122,7 +3123,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW, "Place user data indirect blocks into the special class"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode, param_set_deadman_failmode, param_get_charp, ZMOD_RW, "Failmode for deadman timer"); @@ -3138,7 +3138,6 @@ ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms, ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW, "Small file blocks in special vdevs depends on this much " "free space available"); -/* END CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, param_get_uint, ZMOD_RW, "Reserved free space in pool"); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 983f444d79b0..9f0f1dee656c 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -6551,7 +6551,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW, "Rate Direct I/O write verify events to this many per second"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW, "Direct I/O writes will perform for checksum verification before " "commiting write"); @@ -6559,7 +6558,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " "(do not set below ZED threshold)."); -/* END CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, "Ignore errors during resilver/scrub"); @@ -6573,7 +6571,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW, "Minimum number of metaslabs required to dedicate one for log blocks"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, param_set_min_auto_ashift, param_get_uint, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); @@ -6582,4 +6579,3 @@ ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when 
optimizing for logical -> physical sector " "size on new top-level vdevs"); -/* END CSTYLED */ diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index e3dba0257b21..cd24f97ae7cd 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -1897,7 +1897,6 @@ EXPORT_SYMBOL(vdev_indirect_sync_obsolete); EXPORT_SYMBOL(vdev_obsolete_counts_are_precise); EXPORT_SYMBOL(vdev_obsolete_sm_object); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT, ZMOD_RW, "Whether to attempt condensing indirect vdev mappings"); @@ -1922,4 +1921,3 @@ ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms, ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max, UINT, ZMOD_RW, "Maximum number of combinations when reconstructing split segments"); -/* END CSTYLED */ diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 65a840bf9728..850569d1a35e 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -1047,12 +1047,10 @@ ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT, ZMOD_RW, "Rotating media load increment for seeking I/Os"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT, ZMOD_RW, "Offset in bytes from the last I/O which triggers " "a reduced rotating media seek increment"); -/* END CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT, ZMOD_RW, "Non-rotating media load increment for non-seeking I/Os"); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 5e330626be2b..e4487c485075 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -357,7 +357,11 @@ uint_t raidz_expand_pause_point = 0; /* * Maximum amount of copy io's outstanding at once. */ +#ifdef _ILP32 +static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE; +#else static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; +#endif /* * Apply raidz map abds aggregation if the number of rows in the map is equal @@ -3811,22 +3815,33 @@ raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) * setup a scrub. All the data has been successfully copied * but we have not validated any checksums. */ - pool_scan_func_t func = POOL_SCAN_SCRUB; - if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0) - dsl_scan_setup_sync(&func, tx); + setup_sync_arg_t setup_sync_arg = { + .func = POOL_SCAN_SCRUB, + .txgstart = 0, + .txgend = 0, + }; + if (zfs_scrub_after_expand && + dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) { + dsl_scan_setup_sync(&setup_sync_arg, tx); + } } /* - * Struct for one copy zio. + * State of one copy batch. */ typedef struct raidz_reflow_arg { - vdev_raidz_expand_t *rra_vre; - zfs_locked_range_t *rra_lr; - uint64_t rra_txg; + vdev_raidz_expand_t *rra_vre; /* Global expansion state. */ + zfs_locked_range_t *rra_lr; /* Range lock of this batch. */ + uint64_t rra_txg; /* TXG of this batch. */ + uint_t rra_ashift; /* Ashift of the vdev. */ + uint32_t rra_tbd; /* Number of in-flight ZIOs. */ + uint32_t rra_writes; /* Number of write ZIOs. */ + zio_t *rra_zio[]; /* Write ZIO pointers. */ } raidz_reflow_arg_t; /* - * The write of the new location is done. + * Write of the new location on one child is done. Once all of them are done + * we can unlock and free everything.
*/ static void raidz_reflow_write_done(zio_t *zio) @@ -3850,17 +3865,19 @@ raidz_reflow_write_done(zio_t *zio) zio->io_size; } cv_signal(&vre->vre_cv); + boolean_t done = (--rra->rra_tbd == 0); mutex_exit(&vre->vre_lock); - zfs_rangelock_exit(rra->rra_lr); - - kmem_free(rra, sizeof (*rra)); + if (!done) + return; spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); + zfs_rangelock_exit(rra->rra_lr); + kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes); } /* - * The read of the old location is done. The parent zio is the write to - * the new location. Allow it to start. + * Read of the old location on one child is done. Once all of them are done + * writes should have all the data and we can issue them. */ static void raidz_reflow_read_done(zio_t *zio) @@ -3868,6 +3885,10 @@ raidz_reflow_read_done(zio_t *zio) raidz_reflow_arg_t *rra = zio->io_private; vdev_raidz_expand_t *vre = rra->rra_vre; + /* Reads of only one block use write ABDs. For bigger free gangs. */ + if (zio->io_size > (1 << rra->rra_ashift)) + abd_free(zio->io_abd); + /* * If the read failed, or if it was done on a vdev that is not fully * healthy (e.g. a child that has a resilver in progress), we may not @@ -3891,7 +3912,11 @@ raidz_reflow_read_done(zio_t *zio) mutex_exit(&vre->vre_lock); } - zio_nowait(zio_unique_parent(zio)); + if (atomic_dec_32_nv(&rra->rra_tbd) > 0) + return; + rra->rra_tbd = rra->rra_writes; + for (uint64_t i = 0; i < rra->rra_writes; i++) + zio_nowait(rra->rra_zio[i]); } static void @@ -3932,21 +3957,19 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; - int ashift = vd->vdev_top->vdev_ashift; - uint64_t offset, size; + uint_t ashift = vd->vdev_top->vdev_ashift; - if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize, - &offset, &size)) { + range_seg_t *rs = range_tree_first(rt); + if (rt == NULL) return (B_FALSE); - } + uint64_t offset = rs_get_start(rs, rt); ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); + uint64_t size = rs_get_end(rs, rt) - offset; ASSERT3U(size, >=, 1 << ashift); - uint64_t length = 1 << ashift; - int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + ASSERT(IS_P2ALIGNED(size, 1 << ashift)); uint64_t blkid = offset >> ashift; - - int old_children = vd->vdev_children - 1; + uint_t old_children = vd->vdev_children - 1; /* * We can only progress to the point that writes will not overlap @@ -3965,26 +3988,34 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, uint64_t next_overwrite_blkid = ubsync_blkid + ubsync_blkid / old_children - old_children; VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); - if (blkid >= next_overwrite_blkid) { raidz_reflow_record_progress(vre, next_overwrite_blkid << ashift, tx); return (B_TRUE); } - range_tree_remove(rt, offset, length); + size = MIN(size, raidz_expand_max_copy_bytes); + size = MIN(size, (uint64_t)old_children * + MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE)); + size = MAX(size, 1 << ashift); + uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid); + size = (uint64_t)blocks << ashift; + + range_tree_remove(rt, offset, size); - raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP); + uint_t reads = MIN(blocks, old_children); + uint_t writes = MIN(blocks, vd->vdev_children); + raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) + + sizeof (zio_t *) * writes, KM_SLEEP); rra->rra_vre = vre; rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, - offset, length, RL_WRITER); + offset, size, RL_WRITER); rra->rra_txg = 
dmu_tx_get_txg(tx); + rra->rra_ashift = ashift; + rra->rra_tbd = reads; + rra->rra_writes = writes; - raidz_reflow_record_progress(vre, offset + length, tx); - - mutex_enter(&vre->vre_lock); - vre->vre_outstanding_bytes += length; - mutex_exit(&vre->vre_lock); + raidz_reflow_record_progress(vre, offset + size, tx); /* * SCL_STATE will be released when the read and write are done, @@ -4006,29 +4037,61 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, mutex_exit(&vre->vre_lock); /* drop everything we acquired */ - zfs_rangelock_exit(rra->rra_lr); - kmem_free(rra, sizeof (*rra)); spa_config_exit(spa, SCL_STATE, spa); + zfs_rangelock_exit(rra->rra_lr); + kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes); return (B_TRUE); } + mutex_enter(&vre->vre_lock); + vre->vre_outstanding_bytes += size; + mutex_exit(&vre->vre_lock); + + /* Allocate ABD and ZIO for each child we write. */ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; zio_t *pio = spa->spa_txg_zio[txgoff]; - abd_t *abd = abd_alloc_for_io(length, B_FALSE); - zio_t *write_zio = zio_vdev_child_io(pio, NULL, - vd->vdev_child[blkid % vd->vdev_children], - (blkid / vd->vdev_children) << ashift, - abd, length, - ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, - ZIO_FLAG_CANFAIL, - raidz_reflow_write_done, rra); - - zio_nowait(zio_vdev_child_io(write_zio, NULL, - vd->vdev_child[blkid % old_children], - (blkid / old_children) << ashift, - abd, length, - ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, - ZIO_FLAG_CANFAIL, - raidz_reflow_read_done, rra)); + uint_t b = blocks / vd->vdev_children; + uint_t bb = blocks % vd->vdev_children; + for (uint_t i = 0; i < writes; i++) { + uint_t n = b + (i < bb); + abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE); + rra->rra_zio[i] = zio_vdev_child_io(pio, NULL, + vd->vdev_child[(blkid + i) % vd->vdev_children], + ((blkid + i) / vd->vdev_children) << ashift, + abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra); + } + + /* + * Allocate and issue ZIO for each child we read. For reads of only + * one block we can use respective writer ABDs, since they will also + * have only one block. For bigger reads create gang ABDs and fill + * them with respective blocks from writer ABDs. 
+ */ + b = blocks / old_children; + bb = blocks % old_children; + for (uint_t i = 0; i < reads; i++) { + uint_t n = b + (i < bb); + abd_t *abd; + if (n > 1) { + abd = abd_alloc_gang(); + for (uint_t j = 0; j < n; j++) { + uint_t b = j * old_children + i; + abd_t *cabd = abd_get_offset_size( + rra->rra_zio[b % vd->vdev_children]->io_abd, + (b / vd->vdev_children) << ashift, + 1 << ashift); + abd_gang_add(abd, cabd, B_TRUE); + } + } else { + abd = rra->rra_zio[i]->io_abd; + } + zio_nowait(zio_vdev_child_io(pio, NULL, + vd->vdev_child[(blkid + i) % old_children], + ((blkid + i) / old_children) << ashift, abd, + n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra)); + } return (B_FALSE); } @@ -4122,7 +4185,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], - write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ, + write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); } error = zio_wait(pio); @@ -4142,7 +4205,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 0, abds[i], read_size, ZIO_TYPE_READ, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); } error = zio_wait(pio); @@ -4197,7 +4260,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) */ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], - write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); } error = zio_wait(pio); @@ -4246,7 +4309,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) for (int i = 0; i < raidvd->vdev_children; i++) { zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 0, abds[i], write_size, ZIO_TYPE_WRITE, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, + ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); } error = zio_wait(pio); @@ -4355,8 +4418,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa) */ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], - write_size, ZIO_TYPE_READ, - ZIO_PRIORITY_ASYNC_READ, 0, + write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0, raidz_scratch_child_done, pio)); } zio_wait(pio); @@ -4368,7 +4430,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa) for (int i = 0; i < raidvd->vdev_children; i++) { zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 0, abds[i], write_size, ZIO_TYPE_WRITE, - ZIO_PRIORITY_ASYNC_WRITE, 0, + ZIO_PRIORITY_REMOVAL, 0, raidz_scratch_child_done, pio)); } zio_wait(pio); @@ -4490,8 +4552,11 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr) * space. Note that there may be a little bit more free * space (e.g. in ms_defer), and it's fine to copy that too. 
*/ - range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, - NULL, 0, 0); + uint64_t shift, start; + range_seg_type_t type = metaslab_calculate_range_tree_type( + raidvd, msp, &start, &shift); + range_tree_t *rt = range_tree_create(NULL, type, NULL, + start, shift); range_tree_add(rt, msp->ms_start, msp->ms_size); range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); mutex_exit(&msp->ms_lock); @@ -4516,7 +4581,10 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr) * when importing a pool with a expansion in progress), * discard any state that we have already processed. */ - range_tree_clear(rt, 0, vre->vre_offset); + if (vre->vre_offset > msp->ms_start) { + range_tree_clear(rt, msp->ms_start, + vre->vre_offset - msp->ms_start); + } while (!zthr_iscancelled(zthr) && !range_tree_is_empty(rt) && @@ -5043,7 +5111,6 @@ vdev_ops_t vdev_raidz_ops = { .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, "For testing, pause RAIDZ expansion after reflowing this many bytes"); ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, @@ -5053,4 +5120,3 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, "For expanded RAIDZ, automatically start a pool scrub when expansion " "completes"); -/* END CSTYLED */ diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 8a8b02cab5c6..f80ed1b401f9 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -345,10 +345,14 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) * While we're in syncing context take the opportunity to * setup the scrub when there are no more active rebuilds. */ - pool_scan_func_t func = POOL_SCAN_SCRUB; - if (dsl_scan_setup_check(&func, tx) == 0 && + setup_sync_arg_t setup_sync_arg = { + .func = POOL_SCAN_SCRUB, + .txgstart = 0, + .txgend = 0, + }; + if (dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0 && zfs_rebuild_scrub_enabled) { - dsl_scan_setup_sync(&func, tx); + dsl_scan_setup_sync(&setup_sync_arg, tx); } cv_broadcast(&vd->vdev_rebuild_cv); diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 1249657f9d72..08c85a874803 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -2551,11 +2551,9 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, UINT, ZMOD_RW, "Largest span of free chunks a remap segment can span"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, UINT, ZMOD_RW, "Pause device removal after this many bytes are copied " "(debug use only - causes removal to hang)"); -/* END CSTYLED */ EXPORT_SYMBOL(free_from_removing_vdev); EXPORT_SYMBOL(spa_removal_get_stats); diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 40e7bcf3ed1f..99fc4ec1928f 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -1706,10 +1706,8 @@ zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) return (err); } -/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW, "When iterating ZAP object, prefetch it"); -/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW, "Enable ZAP shrinking"); diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 12938022e976..55b60006e58c 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -1227,6 +1227,21 @@ zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, return (err); } 
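Note on the zap_micro.c change that follows: the common bodies of zap_prefetch_uint64() and zap_lookup_uint64() are factored out into zap_prefetch_uint64_impl() and zap_lookup_uint64_impl() so the new zap_prefetch_uint64_by_dnode() and zap_lookup_uint64_by_dnode() entry points can share them; the _impl() helpers always call zap_unlockdir() on behalf of the caller. The sketch below shows how a caller holding a dnode might use the new by-dnode variants; the function name lookup_uint64_key() and the prefetch-then-lookup framing are illustrative assumptions, not code from this patch.

/*
 * Hypothetical caller, assuming a held dnode_t for a uint64-keyed fat ZAP
 * (for example a DDT object): prefetch the leaf, then look up the key,
 * without re-resolving the object number through the objset each time.
 */
static int
lookup_uint64_key(dnode_t *dn, const uint64_t *key, int key_numints,
    uint64_t *vals, uint64_t nvals)
{
	/* Best-effort readahead; an error here is not fatal for a prefetch. */
	(void) zap_prefetch_uint64_by_dnode(dn, key, key_numints);

	/* Same semantics as zap_lookup_uint64(), minus the objset lookup. */
	return (zap_lookup_uint64_by_dnode(dn, key, key_numints,
	    sizeof (uint64_t), nvals, vals));
}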
+static int +zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints) +{ + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + fzap_prefetch(zn); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (0); +} + int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints) @@ -1237,13 +1252,37 @@ zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); + err = zap_prefetch_uint64_impl(zap, key, key_numints); + /* zap_prefetch_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_prefetch_uint64_impl(zap, key, key_numints); + /* zap_prefetch_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +static int +zap_lookup_uint64_impl(zap_t *zap, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) +{ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } - fzap_prefetch(zn); + int err = fzap_lookup(zn, integer_size, num_integers, buf, + NULL, 0, NULL); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); @@ -1259,16 +1298,25 @@ zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } + err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size, + num_integers, buf); + /* zap_lookup_uint64_impl() calls zap_unlockdir() */ + return (err); +} - err = fzap_lookup(zn, integer_size, num_integers, buf, - NULL, 0, NULL); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); +int +zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size, + num_integers, buf); + /* zap_lookup_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1982,7 +2030,6 @@ EXPORT_SYMBOL(zap_cursor_serialize); EXPORT_SYMBOL(zap_cursor_init_serialized); EXPORT_SYMBOL(zap_get_stats); -/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW, "Maximum micro ZAP size, before converting to a fat ZAP, in bytes"); #endif diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 8188a9e46865..b1b0ae54460b 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1718,6 +1718,9 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); } else if (scan_type == POOL_SCAN_NONE) { error = spa_scan_stop(spa); + } else if (scan_cmd == POOL_SCRUB_FROM_LAST_TXG) { + error = spa_scan_range(spa, scan_type, + spa_get_last_scrubbed_txg(spa), 0); } else { error = spa_scan(spa, scan_type); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index a5daf73d59ba..f4d7e57542a1 100644 --- 
a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2192,31 +2192,20 @@ zio_delay_interrupt(zio_t *zio) } else { taskqid_t tid; hrtime_t diff = zio->io_target_timestamp - now; - clock_t expire_at_tick = ddi_get_lbolt() + - NSEC_TO_TICK(diff); + int ticks = MAX(1, NSEC_TO_TICK(diff)); + clock_t expire_at_tick = ddi_get_lbolt() + ticks; DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, hrtime_t, now, hrtime_t, diff); - if (NSEC_TO_TICK(diff) == 0) { - /* Our delay is less than a jiffy - just spin */ - zfs_sleep_until(zio->io_target_timestamp); - zio_interrupt(zio); - } else { + tid = taskq_dispatch_delay(system_taskq, zio_interrupt, + zio, TQ_NOSLEEP, expire_at_tick); + if (tid == TASKQID_INVALID) { /* - * Use taskq_dispatch_delay() in the place of - * OpenZFS's timeout_generic(). + * Couldn't allocate a task. Just finish the + * zio without a delay. */ - tid = taskq_dispatch_delay(system_taskq, - zio_interrupt, zio, TQ_NOSLEEP, - expire_at_tick); - if (tid == TASKQID_INVALID) { - /* - * Couldn't allocate a task. Just - * finish the zio without a delay. - */ - zio_interrupt(zio); - } + zio_interrupt(zio); } } return; diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index ce6772a40c8b..0d2fda8d5270 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -160,6 +160,12 @@ abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, abd_fletcher_4_impl(abd, size, &acd); } +/* + * Checksum vectors. + * + * Note: you cannot change the name string for these functions, as they are + * embedded in on-disk data in some places (eg dedup table names). + */ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{NULL, NULL}, NULL, NULL, 0, "inherit"}, {{NULL, NULL}, NULL, NULL, 0, "on"}, diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index 10c482573862..1a0178eb2830 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -44,10 +44,6 @@ static unsigned long zio_decompress_fail_fraction = 0; /* * Compression vectors. - * - * NOTE: DO NOT CHANGE THE NAMES OF THESE COMPRESSION FUNCTIONS. - * THEY ARE USED AS ZAP KEY NAMES BY FAST DEDUP AND THEREFORE - * PART OF THE ON-DISK FORMAT. */ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { {"inherit", 0, NULL, NULL, NULL}, diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in index cd85dd28cf56..6735c4a67ec5 100644 --- a/rpm/generic/zfs-dkms.spec.in +++ b/rpm/generic/zfs-dkms.spec.in @@ -32,6 +32,7 @@ Requires(post): gcc, make, perl, diffutils %if 0%{?rhel}%{?fedora}%{?mageia}%{?suse_version}%{?openEuler} Requires: kernel-devel >= @ZFS_META_KVER_MIN@, kernel-devel <= @ZFS_META_KVER_MAX@.999 Requires(post): kernel-devel >= @ZFS_META_KVER_MIN@, kernel-devel <= @ZFS_META_KVER_MAX@.999 +Conflicts: kernel-devel < @ZFS_META_KVER_MIN@, kernel-devel > @ZFS_META_KVER_MAX@.999 Obsoletes: spl-dkms <= %{version} %endif Provides: %{module}-kmod = %{version} diff --git a/scripts/cstyle.pl b/scripts/cstyle.pl index d47fd3362408..123020b08127 100755 --- a/scripts/cstyle.pl +++ b/scripts/cstyle.pl @@ -211,6 +211,7 @@ ($$) my $in_comment = 0; my $comment_done = 0; my $in_warlock_comment = 0; +my $in_macro_call = 0; my $in_function = 0; my $in_function_header = 0; my $function_header_full_indent = 0; @@ -395,12 +396,18 @@ ($$) } } + # If this looks like a top-level macro invocation, remember it so we + # don't mistake it for a function declaration below. 
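Returning to the zio.c hunk above: zio_delay_interrupt() no longer spins when the remaining delay is shorter than one clock tick; it rounds the delay up to at least one tick and always goes through taskq_dispatch_delay(), falling back to an immediate zio_interrupt() only if the task cannot be allocated. Below is a minimal sketch of that rounding with an assumed tick length (the kernel derives the real conversion from hz); it only illustrates the MAX(1, NSEC_TO_TICK(diff)) behaviour.

#include <stdint.h>

/* Illustrative only: assume a 100 Hz tick, i.e. 10 ms per tick. */
#define	EXAMPLE_NSEC_PER_TICK	(10 * 1000 * 1000LL)

static int64_t
example_delay_ticks(int64_t diff_ns)
{
	int64_t ticks = diff_ns / EXAMPLE_NSEC_PER_TICK;

	/* Sub-tick delays are rounded up to one tick instead of spinning. */
	return (ticks > 0 ? ticks : 1);
}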
+ if (/^[A-Za-z_][A-Za-z_0-9]*\(/) { + $in_macro_call = 1; + } + # # If this matches something of form "foo(", it's probably a function # definition, unless it ends with ") bar;", in which case it's a declaration # that uses a macro to generate the type. # - if (/^\w+\(/ && !/\) \w+;/) { + if (!$in_macro_call && /^\w+\(/ && !/\) \w+;/) { $in_function_header = 1; if (/\($/) { $function_header_full_indent = 1; @@ -565,7 +572,9 @@ ($$) err("comma or semicolon followed by non-blank"); } # allow "for" statements to have empty "while" clauses - if (/\s[,;]/ && !/^[\t]+;$/ && !/^\s*for \([^;]*; ;[^;]*\)/) { + # allow macro invocations to have empty parameters + if (/\s[,;]/ && !/^[\t]+;$/ && + !($in_macro_call || /^\s*for \([^;]*; ;[^;]*\)/)) { err("comma or semicolon preceded by blank"); } if (/^\s*(&&|\|\|)/) { @@ -686,10 +695,13 @@ ($$) err("unary * followed by space"); } } - if ($check_posix_types) { + if ($check_posix_types && !$in_macro_call) { # try to detect old non-POSIX types. # POSIX requires all non-standard typedefs to end in _t, # but historically these have been used. + # + # We don't check inside macro invocations because macros have + # legitimate uses for these names in function generators. if (/\b(unchar|ushort|uint|ulong|u_int|u_short|u_long|u_char|quad)\b/) { err("non-POSIX typedef $1 used: use $old2posix{$1} instead"); } @@ -700,6 +712,14 @@ ($$) "else and right brace should be on same line"); } } + + # Macro invocations end with a closing paren, and possibly a semicolon. + # We do this check down here to make sure all the regular checks are + # applied to calls that appear entirely on a single line. + if ($in_macro_call && /\);?$/) { + $in_macro_call = 0; + } + $prev = $line; } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index fc4adc42d00a..a69d36df2f98 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -82,7 +82,7 @@ tests = ['block_cloning_clone_mmap_cached', 'block_cloning_copyfilerange_fallback_same_txg', 'block_cloning_replay', 'block_cloning_replay_encrypted', 'block_cloning_lwb_buffer_overflow', 'block_cloning_clone_mmap_write', - 'block_cloning_rlimit_fsize'] + 'block_cloning_rlimit_fsize', 'block_cloning_large_offset'] tags = ['functional', 'block_cloning'] [tests/functional/bootfs] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 07ec2c4b601b..0bfc64959c38 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -339,6 +339,8 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_reason], 'block_cloning/block_cloning_rlimit_fsize': ['SKIP', cfr_reason], + 'block_cloning/block_cloning_large_offset': + ['SKIP', cfr_reason], 'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason], 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason], 'cp_files/cp_files_002_pos': ['SKIP', cfr_reason], diff --git a/tests/zfs-tests/cmd/getversion.c b/tests/zfs-tests/cmd/getversion.c index 1e026b92d17d..3626d1e968a3 100644 --- a/tests/zfs-tests/cmd/getversion.c +++ b/tests/zfs-tests/cmd/getversion.c @@ -19,9 +19,13 @@ */ #include +#ifdef _KERNEL +#include +#else +#include +#endif #include #include -#include #include #include #include diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 7d1551a63f0d..67630cb564ae 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -482,6 +482,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/block_cloning/block_cloning_replay_encrypted.ksh \ functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh \ functional/block_cloning/block_cloning_rlimit_fsize.ksh \ + functional/block_cloning/block_cloning_large_offset.ksh \ functional/bootfs/bootfs_001_pos.ksh \ functional/bootfs/bootfs_002_neg.ksh \ functional/bootfs/bootfs_003_pos.ksh \ @@ -1225,6 +1226,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \ + functional/cli_root/zpool_scrub/zpool_scrub_txg_continue_from_last.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_prop_sync.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_prop_sync.ksh index f8aa1c875c60..08ed5717b9da 100755 --- a/tests/zfs-tests/tests/functional/bclone/bclone_prop_sync.ksh +++ b/tests/zfs-tests/tests/functional/bclone/bclone_prop_sync.ksh @@ -41,9 +41,11 @@ log_must zfs set compress=zle $TESTDSTFS for prop in "${sync_prop_vals[@]}"; do log_must zfs set sync=$prop $TESTSRCFS + # 15*8=120, which is greater than 113, so we are sure the data won't + # be embedded into BP. # 32767*8=262136, which is larger than a single default recordsize of # 131072. - FILESIZE=$(random_int_between 1 32767) + FILESIZE=$(random_int_between 15 32767) FILESIZE=$((FILESIZE * 8)) bclone_test random $FILESIZE false $TESTSRCDIR $TESTSRCDIR done @@ -52,9 +54,11 @@ for srcprop in "${sync_prop_vals[@]}"; do log_must zfs set sync=$srcprop $TESTSRCFS for dstprop in "${sync_prop_vals[@]}"; do log_must zfs set sync=$dstprop $TESTDSTFS + # 15*8=120, which is greater than 113, so we are sure the data won't + # be embedded into BP. # 32767*8=262136, which is larger than a single default recordsize of # 131072. - FILESIZE=$(random_int_between 1 32767) + FILESIZE=$(random_int_between 15 32767) FILESIZE=$((FILESIZE * 8)) bclone_test random $FILESIZE false $TESTSRCDIR $TESTDSTDIR done diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_large_offset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_large_offset.ksh new file mode 100755 index 000000000000..1d5a2619ebf3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_large_offset.ksh @@ -0,0 +1,83 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +# +# DESCRIPTION: +# Verify that cloning a file at a large offset is possible. +# +# STRATEGY: +# 1. Create dataset. +# 2. Populate the source file with 1024 blocks at 1024 block offset. +# 3. Clone 1024 blocks at a 1024-block offset. +# 4. Compare the cloned file with the original file. +# + +verify_runnable "global" + +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="The first clone at a large offset is functional" + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +# +# 1. Create dataset. +# +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS +sync_pool $TESTPOOL + +# +# 2. Populate the source file with 1024 blocks at 1024 block offset. +# +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 \ + oflag=sync bs=128k count=1024 seek=1024 +sync_pool $TESTPOOL + +# +# 3. Clone 1024 blocks at a 1024-block offset. +# +log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 134217728 134217728 \ + 134217728 +sync_pool $TESTPOOL + +# +# 4. Compare the cloned file with the original file. +# +log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2 +typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2) + +# FreeBSD's seq(1) leaves a trailing space, remove it with sed(1). +log_must [ "$blocks" = "$(seq -s " " 0 1023 | sed 's/ $//')" ] + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index e1fe865b1d3b..e5a8b9026e03 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -63,6 +63,7 @@ typeset -a properties=( "bcloneused" "bclonesaved" "bcloneratio" + "last_scrubbed_txg" "feature@async_destroy" "feature@empty_bpobj" "feature@lz4_compress" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_txg_continue_from_last.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_txg_continue_from_last.ksh new file mode 100755 index 000000000000..b28a8d2cf72f --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_txg_continue_from_last.ksh @@ -0,0 +1,104 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# Copyright (c) 2023, Klara Inc. +# +# This software was developed by +# Mariusz Zaborski +# under sponsorship from Wasabi Technology, Inc. and Klara Inc. + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg +. 
$STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# Verify scrub -C +# +# STRATEGY: +# 1. Create a pool and create one file. +# 2. Verify that the last_txg_scrub is 0. +# 3. Run scrub. +# 4. Verify that the last_txg_scrub is set. +# 5. Create second file. +# 6. Invalidate both files. +# 7. Run scrub only from last point. +# 8. Verify that only one file, that was created with newer txg, +# was detected. +# + +verify_runnable "global" + +function cleanup +{ + log_must zinject -c all + log_must rm -f $mntpnt/f1 + log_must rm -f $mntpnt/f2 +} + +log_onexit cleanup + +log_assert "Verify scrub -C." + +# Create one file. +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) + +log_must file_write -b 1048576 -c 10 -o create -d 0 -f $mntpnt/f1 +log_must sync_pool $TESTPOOL true +f1txg=$(get_last_txg_synced $TESTPOOL) + +# Verify that last_scrubbed_txg isn't set. +zpoollasttxg=$(zpool get -H -o value last_scrubbed_txg $TESTPOOL) +log_must [ $zpoollasttxg -eq 0 ] + +# Run scrub. +log_must zpool scrub -w $TESTPOOL + +# Verify that last_scrubbed_txg is set. +zpoollasttxg=$(zpool get -H -o value last_scrubbed_txg $TESTPOOL) +log_must [ $zpoollasttxg -ne 0 ] + +# Create second file. +log_must file_write -b 1048576 -c 10 -o create -d 0 -f $mntpnt/f2 +log_must sync_pool $TESTPOOL true +f2txg=$(get_last_txg_synced $TESTPOOL) + +# Make sure that the sync txg are different. +log_must [ $f1txg -ne $f2txg ] + +# Insert faults. +log_must zinject -a -t data -e io -T read $mntpnt/f1 +log_must zinject -a -t data -e io -T read $mntpnt/f2 + +# Run scrub from last saved point. +log_must zpool scrub -w -C $TESTPOOL + +# Verify that only newer file was detected. +log_mustnot eval "zpool status -v $TESTPOOL | grep '$mntpnt/f1'" +log_must eval "zpool status -v $TESTPOOL | grep '$mntpnt/f2'" + +# Verify that both files are corrupted. +log_must zpool scrub -w $TESTPOOL +log_must eval "zpool status -v $TESTPOOL | grep '$mntpnt/f1'" +log_must eval "zpool status -v $TESTPOOL | grep '$mntpnt/f2'" + +log_pass "Verified scrub -C show expected status." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh index 4fb900c73cf6..7c44e800c16a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh @@ -69,15 +69,16 @@ for raid_type in "draid2:3d:6c:1s" "raidz2"; do log_mustnot eval "zpool status -e $TESTPOOL2 | grep ONLINE" # Check no ONLINE slow vdevs are show. Then mark IOs greater than - # 160ms slow, delay IOs 320ms to vdev6, check slow IOs. + # 750ms slow, delay IOs 1000ms to vdev6, check slow IOs. log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev6 "ONLINE" log_mustnot eval "zpool status -es $TESTPOOL2 | grep ONLINE" - log_must set_tunable64 ZIO_SLOW_IO_MS 160 - log_must zinject -d $TESTDIR/vdev6 -D320:100 $TESTPOOL2 + log_must set_tunable64 ZIO_SLOW_IO_MS 750 + log_must zinject -d $TESTDIR/vdev6 -D1000:100 $TESTPOOL2 log_must mkfile 1048576 /$TESTPOOL2/testfile sync_pool $TESTPOOL2 log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must zinject -c all # Check vdev6 slow IOs are only shown when requested with -s. 
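Back to the zpool_scrub_txg_continue_from_last.ksh test added above: it exercises "zpool scrub -C", which the zfs_ioctl.c hunk earlier in this patch maps to spa_scan_range(spa, scan_type, spa_get_last_scrubbed_txg(spa), 0). The behaviour the test relies on is that such a scrub only visits blocks born after the last scrubbed txg, which is why the pre-scrub file f1 is not reported while the newer f2 is. The following is a hypothetical predicate expressing that window check; it is not the actual dsl_scan code, and the exact inclusivity of the bounds is an assumption.

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch: should a txg-bounded scrub visit a block pointer born at
 * bp_birth_txg?  An end txg of 0 is treated as "up to the newest txg",
 * matching the 0 passed as the last argument of spa_scan_range() above.
 */
static bool
scrub_range_wants_block(uint64_t bp_birth_txg, uint64_t txg_start,
    uint64_t txg_end)
{
	if (txg_end == 0)
		txg_end = UINT64_MAX;
	return (bp_birth_txg >= txg_start && bp_birth_txg <= txg_end);
}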
log_mustnot eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE" @@ -95,10 +96,9 @@ for raid_type in "draid2:3d:6c:1s" "raidz2"; do log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev2 | grep ONLINE" log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev3 | grep ONLINE" - log_must zinject -c all log_must zpool status -es $TESTPOOL2 - zpool destroy $TESTPOOL2 + log_must zpool destroy $TESTPOOL2 done log_pass "Verify zpool status -e shows only unhealthy vdevs" diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh index 4f6e5805bb3a..3b17de5a4073 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh @@ -70,7 +70,7 @@ log_must zpool sync log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" # four entries in the unique table -log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=4'" # single containing object in the MOS log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1 @@ -84,7 +84,7 @@ log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2 log_must zpool sync # now four entries in the duplicate table -log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate: 4 entries'" +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate:.*entries=4'" # now two DDT ZAPs in the container object; DDT ZAPs aren't cleaned up until # the entire logical table is destroyed diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh index 259eaddc0843..faa9b7e044cd 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh @@ -70,7 +70,7 @@ log_must zpool sync log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" # four entries in the unique table -log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=4'" # single containing object in the MOS log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1 @@ -107,7 +107,7 @@ log_must zpool sync log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" # four entries in the unique table -log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=4'" # single containing object in the MOS log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1 diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh index e3efcf5c8b36..9e524ddbe28e 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh @@ -63,7 +63,7 @@ log_must zpool sync log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" # should be four entries in the unique table -log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=4'" # should be just one DDT ZAP in the MOS log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 @@ -73,7 +73,7 @@ log_must cp 
/$TESTPOOL/file1 /$TESTPOOL/file2 log_must zpool sync # now four entries in the duplicate table -log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate: 4 entries'" +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate:.*entries=4'" # now two DDT ZAPs in the MOS; DDT ZAPs aren't cleaned up until the entire # logical table is destroyed diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh index 114cf0266e12..fd3b01e8cd2c 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh @@ -71,7 +71,7 @@ log_must dd if=/dev/urandom of=/$TESTPOOL/ds1/file1 bs=128k count=4 log_must zpool sync # should be four entries in the skein unique table -log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-skein-zap-unique: 4 entries'" +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-skein-zap-unique:.*entries=4'" # should be just one DDT ZAP in the MOS log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-.*-zap- | wc -l) -eq 1 @@ -90,7 +90,7 @@ log_must zpool sync log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" # now also four entries in the blake3 unique table -log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-blake3-zap-unique: 4 entries'" +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-blake3-zap-unique:.*entries=4'" # two entries in the MOS: the legacy skein DDT ZAP, and the containing dir for # the blake3 FDT table diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh index c36463134fde..7a1e8006db16 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh @@ -71,7 +71,7 @@ log_must zpool sync log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" # should be four entries in the unique table -log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=4'" # should be just one DDT ZAP in the MOS log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 @@ -90,7 +90,7 @@ log_must zpool sync log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" # now four entries in the duplicate table -log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate: 4 entries'" +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate:.*entries=4'" # now two DDT ZAPs in the MOS; DDT ZAPs aren't cleaned up until the entire # logical table is destroyed @@ -117,7 +117,7 @@ log_must zpool sync log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" # four entries in the unique table -log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=4'" # single containing object in the MOS log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1 diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh index a7b667eaf882..4de46e89fc05 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh @@ -63,7 +63,7 @@ log_must zpool sync log_must test $(get_pool_prop feature@fast_dedup 
$TESTPOOL) = "disabled" # should be four entries in the unique table -log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=4'" # should be just one DDT ZAP in the MOS log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 @@ -96,7 +96,7 @@ log_must zpool sync log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" # should be four entries in the unique table -log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=4'" # should be just one DDT ZAP in the MOS log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1
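One further note on the vdev_raidz.c rework earlier in this patch: raidz_reflow_impl() now copies a whole batch of blocks per raidz_reflow_arg_t instead of a single block, and completion is tracked with a two-phase countdown. rra_tbd first counts the outstanding read children; the last read to complete re-arms it to rra_writes and issues the queued write ZIOs, and the last write to complete releases the range lock and frees the batch. The self-contained sketch below illustrates only that countdown pattern; the structure and function names are invented for the example, and the real code performs the decrements with atomics or under vre_lock as shown in the diff.

#include <stdint.h>
#include <stdio.h>

typedef struct batch {
	uint32_t tbd;		/* outstanding child ZIOs in current phase */
	uint32_t writes;	/* write ZIOs queued for the second phase */
} batch_t;

static void
issue_writes(batch_t *b)
{
	printf("last read done: issuing %u writes\n", b->writes);
}

static void
free_batch(batch_t *b)
{
	(void) b;
	printf("last write done: unlock range and free batch\n");
}

static void
read_done(batch_t *b)
{
	if (--b->tbd > 0)
		return;			/* other reads still in flight */
	b->tbd = b->writes;		/* re-arm the counter for the writes */
	issue_writes(b);
}

static void
write_done(batch_t *b)
{
	if (--b->tbd == 0)
		free_batch(b);
}

int
main(void)
{
	batch_t b = { .tbd = 2, .writes = 3 };

	read_done(&b);
	read_done(&b);			/* second read triggers the writes */
	write_done(&b);
	write_done(&b);
	write_done(&b);			/* third write frees the batch */
	return (0);
}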