Skip to content

Commit

Permalink
Use DNODE_FIND_DIRTY for SEEK_HOLE/SEEK_DATA.
Browse files Browse the repository at this point in the history
There is no need to worry about txg syncs anymore, so remove
dmu_offset_next_sync and dnode_is_dirty too.

This should be transparent to userland execpt that holes created by
compression do not become visible until after the next txg sync.

mmap_seek_001_pos is updated to force a sync to match this case.

Signed-off-by: Robert Evans <[email protected]>
  • Loading branch information
rrevans committed Mar 28, 2024
1 parent 11e4e6c commit 8756866
Show file tree
Hide file tree
Showing 7 changed files with 23 additions and 83 deletions.
1 change: 0 additions & 1 deletion include/sys/dnode.h
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,6 @@ boolean_t dnode_add_ref(dnode_t *dn, const void *ref);
void dnode_rele(dnode_t *dn, const void *ref);
void dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting);
int dnode_try_claim(objset_t *os, uint64_t object, int slots);
boolean_t dnode_is_dirty(dnode_t *dn);
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
void dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag);
void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
Expand Down
49 changes: 4 additions & 45 deletions module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,15 +75,6 @@ static int zfs_nopwrite_enabled = 1;
*/
static uint_t zfs_per_txg_dirty_frees_percent = 30;

/*
* Enable/disable forcing txg sync when dirty checking for holes with lseek().
* By default this is enabled to ensure accurate hole reporting, it can result
* in a significant performance penalty for lseek(SEEK_HOLE) heavy workloads.
* Disabling this option will result in holes never being reported in dirty
* files which is always safe.
*/
static int zfs_dmu_offset_next_sync = 1;

/*
* Limit the amount we can prefetch with one call to this amount. This
* helps to limit the amount of memory that can be used by prefetching.
Expand Down Expand Up @@ -2159,45 +2150,16 @@ int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
dnode_t *dn;
int restarted = 0, err;
int err;

restart:
err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);

rw_enter(&dn->dn_struct_rwlock, RW_READER);

if (dnode_is_dirty(dn)) {
/*
* If the zfs_dmu_offset_next_sync module option is enabled
* then hole reporting has been requested. Dirty dnodes
* must be synced to disk to accurately report holes.
*
* Provided a RL_READER rangelock spanning 0-UINT64_MAX is
* held by the caller only a single restart will be required.
* We tolerate callers which do not hold the rangelock by
* returning EBUSY and not reporting holes after one restart.
*/
if (zfs_dmu_offset_next_sync) {
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
err = dnode_next_offset(dn,
DNODE_FIND_DIRTY | (hole ? DNODE_FIND_HOLE : 0),
off, 1, 1, 0);

if (restarted)
return (SET_ERROR(EBUSY));

txg_wait_synced(dmu_objset_pool(os), 0);
restarted = 1;
goto restart;
}

err = SET_ERROR(EBUSY);
} else {
err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK |
(hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
}

rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);

return (err);
Expand Down Expand Up @@ -2597,9 +2559,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, UINT, ZMOD_RW,
"Percentage of dirtied blocks from frees in one TXG");

ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW,
"Enable forcing txg sync to find holes");

/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW,
"Limit one prefetch call to this size");
28 changes: 0 additions & 28 deletions module/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1785,34 +1785,6 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots)
slots, NULL, NULL));
}

/*
* Checks if the dnode itself is dirty, or is carrying any uncommitted records.
* It is important to check both conditions, as some operations (eg appending
* to a file) can dirty both as a single logical unit, but they are not synced
* out atomically, so checking one and not the other can result in an object
* appearing to be clean mid-way through a commit.
*
* Do not change this lightly! If you get it wrong, dmu_offset_next() can
* detect a hole where there is really data, leading to silent corruption.
*/
boolean_t
dnode_is_dirty(dnode_t *dn)
{
mutex_enter(&dn->dn_mtx);

for (int i = 0; i < TXG_SIZE; i++) {
if (multilist_link_active(&dn->dn_dirty_link[i]) ||
!list_is_empty(&dn->dn_dirty_records[i])) {
mutex_exit(&dn->dn_mtx);
return (B_TRUE);
}
}

mutex_exit(&dn->dn_mtx);

return (B_FALSE);
}

void
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
{
Expand Down
1 change: 1 addition & 0 deletions tests/zfs-tests/cmd/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/mkbusy %D%/mkfile %D%/mkfiles %D%/mktree


scripts_zfs_tests_bin_PROGRAMS += %D%/mmap_exec %D%/mmap_seek %D%/mmap_sync %D%/mmapwrite %D%/readmmap
%C%_mmap_seek_LDADD = libzfs_core.la
%C%_mmapwrite_LDADD = -lpthread

if WANT_MMAP_LIBAIO
Expand Down
19 changes: 17 additions & 2 deletions tests/zfs-tests/cmd/mmap_seek.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
#ifdef __linux__
#include <linux/fs.h>
#endif
#include <libnvpair.h>
#include <libzfs_core.h>

static void
seek_data(int fd, off_t offset, off_t expected)
Expand Down Expand Up @@ -65,12 +67,14 @@ main(int argc, char **argv)
char *buf = NULL;
int err;

if (argc != 4) {
if (argc != 5) {
(void) printf("usage: %s <file name> <file size> "
"<block size>\n", argv[0]);
"<block size> <poolname>\n", argv[0]);
exit(1);
}

(void) libzfs_core_init();

int fd = open(file_path, O_RDWR | O_CREAT, 0666);
if (fd == -1) {
(void) fprintf(stderr, "%s: %s: ", execname, file_path);
Expand All @@ -80,6 +84,7 @@ main(int argc, char **argv)

off_t file_size = atoi(argv[2]);
off_t block_size = atoi(argv[3]);
const char *poolname = argv[4];

if (block_size * 2 > file_size) {
(void) fprintf(stderr, "file size must be at least "
Expand Down Expand Up @@ -133,6 +138,16 @@ main(int argc, char **argv)

/* Punch a hole (required compression be enabled). */
memset(buf + block_size, 0, block_size);
err = msync(buf, file_size, MS_SYNC);
if (err == -1) {
perror("msync");
exit(2);
}
err = lzc_sync(poolname, NULL, NULL);
if (err != 0) {
perror("lzc_sync");
exit(2);
}
seek_data(fd, 0, 0);
seek_data(fd, block_size, 2 * block_size);
seek_hole(fd, 0, block_size);
Expand Down
1 change: 0 additions & 1 deletion tests/zfs-tests/include/tunables.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode
DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms
DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms
DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check
DMU_OFFSET_NEXT_SYNC dmu_offset_next_sync zfs_dmu_offset_next_sync
EMBEDDED_SLOG_MIN_MS embedded_slog_min_ms zfs_embedded_slog_min_ms
INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size
INITIALIZE_VALUE initialize_value zfs_initialize_value
Expand Down
7 changes: 1 addition & 6 deletions tests/zfs-tests/tests/functional/mmap/mmap_seek_001_pos.ksh
Original file line number Diff line number Diff line change
Expand Up @@ -43,24 +43,19 @@ function cleanup
log_must zfs set compression=off $TESTPOOL/$TESTFS
log_must zfs set recordsize=128k $TESTPOOL/$TESTFS
log_must rm -f $TESTDIR/test-mmap-file
log_must set_tunable64 DMU_OFFSET_NEXT_SYNC $dmu_offset_next_sync
}

log_assert "lseek() data/holes for an mmap()'d file."

log_onexit cleanup

# Enable hole reporting for dirty files.
typeset dmu_offset_next_sync=$(get_tunable DMU_OFFSET_NEXT_SYNC)
log_must set_tunable64 DMU_OFFSET_NEXT_SYNC 1

# Compression must be enabled to convert zero'd blocks to holes.
# This behavior is checked by the mmap_seek test.
log_must zfs set compression=on $TESTPOOL/$TESTFS

for bs in 4096 8192 16384 32768 65536 131072; do
log_must zfs set recordsize=$bs $TESTPOOL/$TESTFS
log_must mmap_seek $TESTDIR/test-mmap-file $((1024*1024)) $bs
log_must mmap_seek $TESTDIR/test-mmap-file $((1024*1024)) $bs $TESTPOOL
log_must rm $TESTDIR/test-mmap-file
done

Expand Down

0 comments on commit 8756866

Please sign in to comment.