Skip to content

Commit

Permalink
Fix I/O replay timing
Browse files Browse the repository at this point in the history
fio supports replaying I/Os from a previous invocation, captured either
via blktrace or `write_iolog`. That support includes simulating the
timing between I/Os by using the running timestamps from the capture
log, recreating that timing by inserting delays in the replay submission
path if necessary.  The calculation for how much delay to insert is
based on how long it takes to complete the most recent replayed I/O, as
compared to the how long that same I/O took in the original workload
recorded in the log.

The issue is in iolog_delay(). It calculates the replay I/O execution
time starting from just after issuing I/O to the kernel in td_io_queue()
until the start of iolog_delay(). This excludes the time it actually
took for the kernel to process the submission, which depending on the
I/O engine can be significant. Because this time was under-measured,
iolog_delay()'s added more delay than needed, since it thought the
replay was executing too fast vs the logged I/O.

This issue causes the total replay to take 2x or longer than the
original workload due to the progressively-larger drift from these
excessive delays.

The fix is to change iolog_delay() to use the full execution path time
for an I/O, including the kernel submission time. This is accomplished
by removing the existing points where the replay I/O clock is started
(td_io_queue and the async engines) and moving that into iolog_delay().
This means we're no longer technically matching the precise stage in the
I/O path for replay vs the original workload, but this doesn't really
matter since that's only an initial time skew reflected in the very
first I/O operation. After that we only need to match the inter-I/O
pacing.

This change includes fixes to additional issues found in iolog_delay():

1) The logic that measures how long usec_sleep() takes relative to how
much sleep was requested was comparing against the original log delay
value instead of the value adjusted by the last replay execution I/O,
causing an under-adjustment when usec_sleep() took longer than
requested, creating more total delay than necessary.

2) The logic wasn't accumulating the usec_sleep() adjustment from the
above step across more than one replayed I/O. It would reset the
adjustment to zero on the next I/O. I guess the logic was assuming the
adjustment would never be larger than a single log delay interval. It
also wasn't accumulating the replay I/O execution time across more than
one interval, so for example if a replay I/O took 3x longer than the log
I/O it would only adjust that down to a zero delay for one replay I/O,
causing the 2x difference to never be accounted for and thus add to the
total running replay time.

Link: #1537 (comment)

Signed-off-by: Adam Horshack ([email protected])
  • Loading branch information
horshack-dpreview committed Mar 10, 2023
1 parent 051b578 commit 6d608dd
Show file tree
Hide file tree
Showing 7 changed files with 17 additions and 66 deletions.
6 changes: 0 additions & 6 deletions engines/io_uring.c
Original file line number Diff line number Diff line change
Expand Up @@ -613,12 +613,6 @@ static void fio_ioring_queued(struct thread_data *td, int start, int nr)

start++;
}

/*
* only used for iolog
*/
if (td->o.read_iolog_file)
memcpy(&td->last_issue, &now, sizeof(now));
}

static int fio_ioring_commit(struct thread_data *td)
Expand Down
6 changes: 0 additions & 6 deletions engines/libaio.c
Original file line number Diff line number Diff line change
Expand Up @@ -368,12 +368,6 @@ static void fio_libaio_queued(struct thread_data *td, struct io_u **io_us,
memcpy(&io_u->issue_time, &now, sizeof(now));
io_u_queued(td, io_u);
}

/*
* only used for iolog
*/
if (td->o.read_iolog_file)
memcpy(&td->last_issue, &now, sizeof(now));
}

static int fio_libaio_commit(struct thread_data *td)
Expand Down
10 changes: 0 additions & 10 deletions engines/librpma_fio.c
Original file line number Diff line number Diff line change
Expand Up @@ -597,16 +597,6 @@ int librpma_fio_client_commit(struct thread_data *td)
}
}

if ((fill_time = fio_fill_issue_time(td))) {
fio_gettime(&now, NULL);

/*
* only used for iolog
*/
if (td->o.read_iolog_file)
memcpy(&td->last_issue, &now, sizeof(now));

}
/* move executed io_us from queued[] to flight[] */
for (i = 0; i < ccd->io_u_queued_nr; i++) {
struct io_u *io_u = ccd->io_us_queued[i];
Expand Down
6 changes: 0 additions & 6 deletions engines/rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -832,12 +832,6 @@ static void fio_rdmaio_queued(struct thread_data *td, struct io_u **io_us,
memcpy(&io_u->issue_time, &now, sizeof(now));
io_u_queued(td, io_u);
}

/*
* only used for iolog
*/
if (td->o.read_iolog_file)
memcpy(&td->last_issue, &now, sizeof(now));
}

static int fio_rdmaio_commit(struct thread_data *td)
Expand Down
7 changes: 0 additions & 7 deletions init.c
Original file line number Diff line number Diff line change
Expand Up @@ -628,13 +628,6 @@ static int fixup_options(struct thread_data *td)
}
#endif

if (o->write_iolog_file && o->read_iolog_file) {
log_err("fio: read iolog overrides write_iolog\n");
free(o->write_iolog_file);
o->write_iolog_file = NULL;
ret |= warnings_fatal;
}

if (o->zone_mode == ZONE_MODE_NONE && o->zone_size) {
log_err("fio: --zonemode=none and --zonesize are not compatible.\n");
ret |= 1;
Expand Down
20 changes: 2 additions & 18 deletions ioengines.c
Original file line number Diff line number Diff line change
Expand Up @@ -358,16 +358,8 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u)

if (td_ioengine_flagged(td, FIO_SYNCIO) ||
async_ioengine_sync_trim(td, io_u)) {
if (fio_fill_issue_time(td)) {
if (fio_fill_issue_time(td))
fio_gettime(&io_u->issue_time, NULL);

/*
* only used for iolog
*/
if (td->o.read_iolog_file)
memcpy(&td->last_issue, &io_u->issue_time,
sizeof(io_u->issue_time));
}
}


Expand Down Expand Up @@ -444,16 +436,8 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u)
if (!td_ioengine_flagged(td, FIO_SYNCIO) &&
!async_ioengine_sync_trim(td, io_u)) {
if (fio_fill_issue_time(td) &&
!td_ioengine_flagged(td, FIO_ASYNCIO_SETS_ISSUE_TIME)) {
!td_ioengine_flagged(td, FIO_ASYNCIO_SETS_ISSUE_TIME))
fio_gettime(&io_u->issue_time, NULL);

/*
* only used for iolog
*/
if (td->o.read_iolog_file)
memcpy(&td->last_issue, &io_u->issue_time,
sizeof(io_u->issue_time));
}
}

return ret;
Expand Down
28 changes: 15 additions & 13 deletions iolog.c
Original file line number Diff line number Diff line change
Expand Up @@ -80,23 +80,23 @@ void log_file(struct thread_data *td, struct fio_file *f,

static void iolog_delay(struct thread_data *td, unsigned long delay)
{
uint64_t usec = utime_since_now(&td->last_issue);
unsigned long orig_delay = delay;
uint64_t usec;
unsigned long delay_wanted;
uint64_t this_delay;
struct timespec ts;
struct timespec ts, ts_after_sleep;

if (delay < td->time_offset) {
td->time_offset = 0;
return;
}
fio_gettime(&ts, NULL);
usec = utime_since(&td->last_issue, &ts) + td->time_offset;

delay -= td->time_offset;
if (delay < usec)
if (delay <= usec) {
td->time_offset = usec-delay;
td->last_issue = ts;
return;
}

delay -= usec;

fio_gettime(&ts, NULL);
delay_wanted = delay;
while (delay && !td->terminate) {
this_delay = delay;
if (this_delay > 500000)
Expand All @@ -106,11 +106,13 @@ static void iolog_delay(struct thread_data *td, unsigned long delay)
delay -= this_delay;
}

usec = utime_since_now(&ts);
if (usec > orig_delay)
td->time_offset = usec - orig_delay;
fio_gettime(&ts_after_sleep, NULL);
usec = utime_since(&ts, &ts_after_sleep);
if (usec > delay_wanted)
td->time_offset = usec - delay_wanted;
else
td->time_offset = 0;
td->last_issue = ts_after_sleep;
}

static int ipo_special(struct thread_data *td, struct io_piece *ipo)
Expand Down

0 comments on commit 6d608dd

Please sign in to comment.