Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RFC] Implement discard for AIO on certain file systems #167

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion control/tap-ctl-info.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
#include "tap-ctl.h"

int tap_ctl_info(pid_t pid, unsigned long long *sectors,
unsigned int *sector_size, unsigned int *info, const int minor)
unsigned int *sector_size, unsigned int *info,
bool * discard_supported, const int minor)
{
tapdisk_message_t message;
int err;
Expand All @@ -49,6 +50,7 @@ int tap_ctl_info(pid_t pid, unsigned long long *sectors,
*sectors = message.u.image.sectors;
*sector_size = message.u.image.sector_size;
*info = message.u.image.info;
*discard_supported = message.u.image.discard_supported;
return 0;
} else if (TAPDISK_MESSAGE_ERROR == message.type) {
return -message.u.response.error;
Expand Down
34 changes: 34 additions & 0 deletions drivers/block-aio.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/falloc.h>

#include "tapdisk.h"
#include "tapdisk-driver.h"
Expand Down Expand Up @@ -82,6 +83,11 @@ static int tdaio_get_image_info(int fd, td_disk_info_t *info)
/*Local file? try fstat instead*/
info->size = (stat.st_size >> SECTOR_SHIFT);
info->sector_size = DEFAULT_SECTOR_SIZE;

if(is_hole_punching_supported_for_fd(fd)) {
info->discard_supported = true;
}

DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost "
"sector_shift [%llu]\n",
(long long unsigned)(info->size << SECTOR_SHIFT),
Expand Down Expand Up @@ -212,6 +218,33 @@ void tdaio_queue_write(td_driver_t *driver, td_request_t treq)
td_complete_request(treq, -EBUSY);
}

/**
 * Discard handler for the AIO driver: punches a hole in the backing file
 * for the sector range described by the request.
 *
 * The byte offset and length are derived from treq.vreq->sec and
 * treq.vreq->discard_nr_sectors, scaled by the image sector size. The
 * file size is left untouched (FALLOC_FL_KEEP_SIZE).
 *
 * The request is always completed synchronously via td_complete_request():
 *   - with -EOPNOTSUPP if discard was not detected as supported for this
 *     image,
 *   - with 0 on success,
 *   - with -errno if fallocate64() fails.
 *
 * @param driver  AIO driver instance; driver->data holds the tdaio_state.
 * @param treq    The discard request to execute and complete.
 */
void tdaio_discard(td_driver_t *driver, td_request_t treq)
{
	int rc;
	off64_t size;
	off64_t offset;
	struct tdaio_state *prv;

	if (driver->info.discard_supported != true) {
		td_complete_request(treq, -EOPNOTSUPP);
		return;
	}

	prv = (struct tdaio_state *)driver->data;
	size = treq.vreq->discard_nr_sectors * driver->info.sector_size;
	offset = treq.vreq->sec * driver->info.sector_size;

	rc = fallocate64(prv->fd, (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE), offset, size);
	/*
	 * fallocate64() reports failure as -1 with the reason in errno.
	 * Completion codes are negative errno values, so translate right
	 * away (before any other call can clobber errno). Passing the raw
	 * -1 through would make every failure look like -EPERM and hide
	 * EINTR from the caller.
	 */
	if (rc)
		rc = -errno;
	// Upper layers will retry on EINTR

	// ToDo: Remove the following debug statement after feeling confident
	// off64_t is signed; cast explicitly to match the PRIu64 conversions.
	DPRINTF("fallocate64(%d, %d, %" PRIu64 ", %" PRIu64 ") returned %d", prv->fd,
		(FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE),
		(uint64_t)offset, (uint64_t)size, rc);

	td_complete_request(treq, rc);
}

int tdaio_close(td_driver_t *driver)
{
struct tdaio_state *prv = (struct tdaio_state *)driver->data;
Expand Down Expand Up @@ -253,6 +286,7 @@ struct tap_disk tapdisk_aio = {
.td_close = tdaio_close,
.td_queue_read = tdaio_queue_read,
.td_queue_write = tdaio_queue_write,
.td_queue_discard = tdaio_discard,
.td_get_parent_id = tdaio_get_parent_id,
.td_validate_parent = tdaio_validate_parent,
.td_debug = NULL,
Expand Down
1 change: 1 addition & 0 deletions drivers/tapdisk-control.c
Original file line number Diff line number Diff line change
Expand Up @@ -1193,6 +1193,7 @@ tapdisk_control_disk_info(
image->sectors = vbd->disk_info.size;
image->sector_size = vbd->disk_info.sector_size;
image->info = vbd->disk_info.info;
image->discard_supported = vbd->disk_info.discard_supported;
}
return err;
}
Expand Down
11 changes: 8 additions & 3 deletions drivers/tapdisk-image.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,17 @@ tapdisk_image_check_td_request(td_image_t *image, td_request_t treq)
info = &image->info;
rdonly = td_flag_test(image->flags, TD_OPEN_RDONLY);

if (treq.op != TD_OP_READ && treq.op != TD_OP_WRITE)
if (treq.op != TD_OP_READ && treq.op != TD_OP_WRITE &&
treq.op != TD_OP_DISCARD)
goto fail;

if (treq.op == TD_OP_WRITE && rdonly) {
if ((treq.op == TD_OP_WRITE || treq.op == TD_OP_DISCARD) && rdonly) {
err = -EPERM;
goto fail;
}

if (treq.secs <= 0 || treq.sec + treq.secs > info->size)
if ((treq.secs <= 0 || treq.sec + treq.secs > info->size) &&
treq.op != TD_OP_DISCARD)
goto fail;

return 0;
Expand Down Expand Up @@ -140,6 +142,9 @@ tapdisk_image_check_request(td_image_t *image, td_vbd_request_t *vreq)
secs += vreq->iov[i].secs;

switch (vreq->op) {
case TD_OP_DISCARD:
secs = vreq->discard_nr_sectors;
/* fall through */
case TD_OP_WRITE:
if (rdonly) {
err = -EPERM;
Expand Down
36 changes: 36 additions & 0 deletions drivers/tapdisk-interface.c
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,42 @@ td_queue_read(td_image_t *image, td_request_t treq)
td_complete_request(treq, err);
}


/*
 * Hand a discard request to the driver behind @image.
 *
 * The request is completed immediately with an error, and never reaches
 * the driver, when:
 *   - the image has no driver                      (-ENODEV)
 *   - the driver is not in the TD_DRIVER_OPEN state (-EBADF)
 *   - the driver has no td_queue_discard callback   (-EOPNOTSUPP)
 *   - tapdisk_image_check_td_request() rejects it   (its return value)
 *
 * Otherwise the driver's td_queue_discard callback takes over the request.
 */
void
td_queue_discard(td_image_t *image, td_request_t treq)
{
	td_driver_t *drv = image->driver;
	int rc;

	/* Checks run in order; the first one that fails decides the error. */
	if (!drv)
		rc = -ENODEV;
	else if (!td_flag_test(drv->state, TD_DRIVER_OPEN))
		rc = -EBADF;
	else if (!drv->ops->td_queue_discard)
		rc = -EOPNOTSUPP;
	else
		rc = tapdisk_image_check_td_request(image, treq);

	if (rc) {
		td_complete_request(treq, rc);
		return;
	}

	drv->ops->td_queue_discard(drv, treq);
}


void
td_forward_request(td_request_t treq)
{
Expand Down
1 change: 1 addition & 0 deletions drivers/tapdisk-interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ int td_set_quantum(td_image_t *, int);

void td_queue_write(td_image_t *, td_request_t);
void td_queue_read(td_image_t *, td_request_t);
void td_queue_discard(td_image_t *, td_request_t);
void td_forward_request(td_request_t);
void td_complete_request(td_request_t, int);

Expand Down
38 changes: 38 additions & 0 deletions drivers/tapdisk-utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@
#include <string.h>
#include <unistd.h>
#include <linux/fs.h>
#include <linux/magic.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/resource.h>
#include <sys/utsname.h>
#include <sys/vfs.h>
#include <arpa/inet.h>

#ifdef __linux__
Expand Down Expand Up @@ -431,3 +433,39 @@ inline long long timeval_to_us(struct timeval *tv)
{
return ((long long)tv->tv_sec * USEC_PER_SEC) + tv->tv_usec;
}

/*
 * Reports whether the filesystem holding the open file @fd is on the
 * list of filesystems known to implement hole punching, given the
 * running kernel version.
 *
 * Returns false when fstatfs() fails, when tapdisk_linux_version()
 * returns -ENOSYS, when the filesystem type is not one of the listed
 * ones, or when the running kernel is older than the minimum listed for
 * that filesystem.
 *
 * NOTE(review): EXT4_SUPER_MAGIC appears to share its value with the
 * ext2/ext3 magic numbers, so those may be reported as supported too -
 * confirm against <linux/magic.h>.
 */
bool is_hole_punching_supported_for_fd(int fd)
{
	struct statfs fs_info;
	int running;
	int required;

	if (fstatfs(fd, &fs_info) != 0)
		return false;

	running = tapdisk_linux_version();
	if (running == -ENOSYS)
		return false;

	/* Minimum kernel per filesystem, according to man fallocate(2) */
	switch (fs_info.f_type) {
#ifdef BTRFS_SUPER_MAGIC
	case BTRFS_SUPER_MAGIC:
		required = KERNEL_VERSION(3, 7, 0);
		break;
#endif
#ifdef EXT4_SUPER_MAGIC
	case EXT4_SUPER_MAGIC:
		required = KERNEL_VERSION(3, 0, 0);
		break;
#endif
#ifdef TMPFS_SUPER_MAGIC
	case TMPFS_SUPER_MAGIC:
		required = KERNEL_VERSION(3, 5, 0);
		break;
#endif
#ifdef XFS_SUPER_MAGIC
	case XFS_SUPER_MAGIC:
		required = KERNEL_VERSION(2, 6, 38);
		break;
#endif
	default:
		/* Unknown filesystem: assume no hole punching. */
		return false;
	}

	return running >= required;
}
7 changes: 7 additions & 0 deletions drivers/tapdisk-utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#define _TAPDISK_UTILS_H_

#include <inttypes.h>
#include <stdbool.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
Expand Down Expand Up @@ -89,4 +90,10 @@ shm_destroy(struct shm *shm);

inline long long timeval_to_us(struct timeval *tv);

/**
 * Returns true if the filesystem that hosts the file referred to by the
 * open file descriptor @fd is known to allow hole punching and thereby
 * discard.
 *
 * The decision is based on the filesystem type reported by fstatfs() and
 * on the running kernel version; only btrfs, ext4, tmpfs and xfs are
 * recognised. Returns false if the filesystem type or the kernel version
 * cannot be determined, or if the combination is not known to work.
 */
bool is_hole_punching_supported_for_fd(int fd);

#endif
22 changes: 22 additions & 0 deletions drivers/tapdisk-vbd.c
Original file line number Diff line number Diff line change
Expand Up @@ -1363,6 +1363,10 @@ __tapdisk_vbd_reissue_td_request(td_vbd_t *vbd,
case TD_OP_READ:
td_queue_read(parent, treq);
break;

case TD_OP_DISCARD:
td_queue_discard(parent, treq);
break;
}

done:
Expand Down Expand Up @@ -1485,6 +1489,19 @@ tapdisk_vbd_issue_request(td_vbd_t *vbd, td_vbd_request_t *vreq)
goto fail;
}

if(vreq->op==TD_OP_DISCARD) {
treq.sidx = 1;
treq.sec = sec;
treq.image = image;
treq.cb = tapdisk_vbd_complete_td_request;
treq.cb_data = NULL;
treq.vreq = vreq;
treq.op = TD_OP_DISCARD;
td_queue_discard(treq.image, treq);
err = 0;
goto out;
}

for (i = 0; i < vreq->iovcnt; i++) {
struct td_iovec *iov = &vreq->iov[i];

Expand Down Expand Up @@ -1529,6 +1546,11 @@ tapdisk_vbd_issue_request(td_vbd_t *vbd, td_vbd_request_t *vreq)
vbd->vdi_stats.stats->read_reqs_submitted++;
td_queue_read(treq.image, treq);
break;

case TD_OP_DISCARD:
treq.op = TD_OP_DISCARD;
td_queue_discard(treq.image, treq);
break;
}

DBG(TLOG_DBG, "%s: req %s seg %d sec 0x%08"PRIx64" secs 0x%04x "
Expand Down
6 changes: 6 additions & 0 deletions drivers/tapdisk.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@

#include <time.h>
#include <stdint.h>
#include <stdbool.h>

#include "list.h"
#include "compiler.h"
Expand All @@ -74,6 +75,7 @@ extern unsigned int PAGE_SHIFT;

#define TD_OP_READ 0
#define TD_OP_WRITE 1
#define TD_OP_DISCARD 2

#define TD_OPEN_QUIET 0x00001
#define TD_OPEN_QUERY 0x00002
Expand Down Expand Up @@ -126,6 +128,7 @@ struct td_disk_info {
td_sector_t size;
long sector_size;
uint32_t info;
bool discard_supported;
};

struct td_iovec {
Expand Down Expand Up @@ -155,6 +158,8 @@ struct td_vbd_request {
td_vbd_t *vbd;
struct list_head next;
struct list_head *list_head;

uint64_t discard_nr_sectors;
};

struct td_request {
Expand Down Expand Up @@ -188,6 +193,7 @@ struct tap_disk {
int (*td_validate_parent) (td_driver_t *, td_driver_t *, td_flag_t);
void (*td_queue_read) (td_driver_t *, td_request_t);
void (*td_queue_write) (td_driver_t *, td_request_t);
void (*td_queue_discard) (td_driver_t *, td_request_t);
void (*td_debug) (td_driver_t *);
void (*td_stats) (td_driver_t *, td_stats_t *);

Expand Down
28 changes: 26 additions & 2 deletions drivers/td-ctx.c
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,17 @@ xenio_pending_blkif(struct td_xenio_ctx * const ctx)
dst->seg[i] = src->seg[i]; \
}

/*
 * Copies a discard request field by field from @discard_src into @dst.
 *
 * @dst is reinterpreted as a blkif_request_discard_t pointer; the caller
 * must make sure the object it points to is at least that large.
 * @discard_src is a pointer to any discard request layout that provides
 * the operation, flag, id, sector_number and nr_sectors members.
 *
 * Every field is read through @discard_src, so the macro does not depend
 * on a variable named "src" existing at the expansion site. Wrapped in
 * do { } while (0) so it acts as a single statement; terminate the
 * invocation with a semicolon.
 */
#define blkif_get_req_discard(dst, discard_src)				\
	do {								\
		/* assert(sizeof(blkif_request_discard_t)<sizeof(blkif_request_t)) */ \
		blkif_request_discard_t *discard_dst =			\
			(blkif_request_discard_t *)(dst);		\
		discard_dst->operation = (discard_src)->operation;	\
		discard_dst->flag = (discard_src)->flag;		\
		discard_dst->id = (discard_src)->id;			\
		discard_dst->sector_number = (discard_src)->sector_number; \
		discard_dst->nr_sectors = (discard_src)->nr_sectors;	\
	} while (0)

/**
* Utility function that retrieves a request using @idx as the ring index,
* copying it to the @dst in a H/W independent way.
Expand All @@ -149,6 +160,7 @@ xenio_blkif_get_request(struct td_xenblkif * const blkif,
{
blkif_request_t *src;
src = RING_GET_REQUEST(&rings->native, idx);
// sizeof(blkif_request_t)>sizeof(blkif_request_discard_t)
memcpy(dst, src, sizeof(blkif_request_t));
break;
}
Expand All @@ -157,15 +169,27 @@ xenio_blkif_get_request(struct td_xenblkif * const blkif,
{
blkif_x86_32_request_t *src;
src = RING_GET_REQUEST(&rings->x86_32, idx);
blkif_get_req(dst, src);
if (src->operation==BLKIF_OP_DISCARD) {
blkif_x86_32_request_discard_t * discard_src;
discard_src = (blkif_x86_32_request_discard_t *) src;
blkif_get_req_discard(dst, discard_src);
} else {
blkif_get_req(dst, src);
}
break;
}

case BLKIF_PROTOCOL_X86_64:
{
blkif_x86_64_request_t *src;
src = RING_GET_REQUEST(&rings->x86_64, idx);
blkif_get_req(dst, src);
if (src->operation==BLKIF_OP_DISCARD) {
blkif_x86_64_request_discard_t * discard_src;
discard_src = (blkif_x86_64_request_discard_t *) src;
blkif_get_req_discard(dst, discard_src);
} else {
blkif_get_req(dst, src);
}
break;
}

Expand Down
Loading