Skip to content

Commit

Permalink
Merge branch 'atomic-writes'
Browse files Browse the repository at this point in the history
Merge the atomic write support.

* atomic-writes:
  examples: Add example for atomic write verify
  fio: Support verify_write_sequence
  doc: Document atomic command
  tools/fiograph: Update for atomic support
  io_uring: Support RWF_ATOMIC
  libaio: Support RWF_ATOMIC
  pvsync2: Support RWF_ATOMIC
  os: Reintroduce atomic write support
  os-linux: Define RWF_ATOMIC
  • Loading branch information
axboe committed Sep 17, 2024
2 parents f0af081 + f23208c commit bcd46be
Show file tree
Hide file tree
Showing 13 changed files with 140 additions and 11 deletions.
25 changes: 25 additions & 0 deletions HOWTO.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2501,6 +2501,20 @@ with the caveat that when used on the command line, they must come after the
For direct I/O, requests will only succeed if cache invalidation isn't required,
file blocks are fully allocated and the disk request could be issued immediately.

.. option:: atomic=bool : [pvsync2] [libaio] [io_uring]

This option means that writes are issued with torn-write protection, meaning
that in the event of a power failure or kernel crash, all or none of the data
from the write will be stored, but never a mix of old and new data. Torn-write
protection is also known as atomic writes.

This option sets the RWF_ATOMIC flag (supported from the 6.11 Linux kernel) on
a per-IO basis.

Writes with RWF_ATOMIC set will be rejected by the kernel when the file does
not support torn-write protection. To learn a file's torn-write limits, issue
statx with STATX_WRITE_ATOMIC.

.. option:: fdp=bool : [io_uring_cmd] [xnvme]

Enable Flexible Data Placement mode for write commands.
Expand Down Expand Up @@ -3988,6 +4002,17 @@ Verification
instead resets the file after the write phase and then replays I/Os for
the verification phase.

.. option:: verify_write_sequence=bool

Verify the header write sequence number. In a scenario with multiple jobs,
verification of the write sequence number may fail. Disabling this option
will mean that write sequence number checking is skipped. Doing that can be
useful for testing atomic writes, as it means that checksum verification can
still be attempted. When :option:`atomic` is enabled, checksum
verification is expected to succeed (while write sequence checking can still
fail).
Defaults to true.

.. option:: trim_percentage=int

Number of verify blocks to discard/trim.
Expand Down
5 changes: 4 additions & 1 deletion engines/io_uring.c
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,8 @@ static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u)
sqe->rw_flags = 0;
if (o->nowait)
sqe->rw_flags |= RWF_NOWAIT;
if (td->o.oatomic && io_u->ddir == DDIR_WRITE)
sqe->rw_flags |= RWF_ATOMIC;

/*
* Since io_uring can have a submission context (sqthread_poll)
Expand Down Expand Up @@ -1582,7 +1584,8 @@ static struct ioengine_ops ioengine_uring = {
.name = "io_uring",
.version = FIO_IOOPS_VERSION,
.flags = FIO_ASYNCIO_SYNC_TRIM | FIO_NO_OFFLOAD |
FIO_ASYNCIO_SETS_ISSUE_TIME,
FIO_ASYNCIO_SETS_ISSUE_TIME |
FIO_ATOMICWRITES,
.init = fio_ioring_init,
.post_init = fio_ioring_post_init,
.io_u_init = fio_ioring_io_u_init,
Expand Down
7 changes: 6 additions & 1 deletion engines/libaio.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,10 @@ static int fio_libaio_prep(struct thread_data *td, struct io_u *io_u)
io_prep_pwrite(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset);
if (o->nowait)
iocb->aio_rw_flags |= RWF_NOWAIT;
#ifdef FIO_HAVE_RWF_ATOMIC
if (td->o.oatomic)
iocb->aio_rw_flags |= RWF_ATOMIC;
#endif
} else if (ddir_sync(io_u->ddir))
io_prep_fsync(iocb, f->fd);

Expand Down Expand Up @@ -440,7 +444,8 @@ FIO_STATIC struct ioengine_ops ioengine = {
.name = "libaio",
.version = FIO_IOOPS_VERSION,
.flags = FIO_ASYNCIO_SYNC_TRIM |
FIO_ASYNCIO_SETS_ISSUE_TIME,
FIO_ASYNCIO_SETS_ISSUE_TIME |
FIO_ATOMICWRITES,
.init = fio_libaio_init,
.post_init = fio_libaio_post_init,
.prep = fio_libaio_prep,
Expand Down
9 changes: 6 additions & 3 deletions engines/sync.c
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,11 @@ static enum fio_q_status fio_pvsyncio2_queue(struct thread_data *td,

if (io_u->ddir == DDIR_READ)
ret = preadv2(f->fd, iov, 1, io_u->offset, flags);
else if (io_u->ddir == DDIR_WRITE)
else if (io_u->ddir == DDIR_WRITE) {
if (td->o.oatomic)
flags |= RWF_ATOMIC;
ret = pwritev2(f->fd, iov, 1, io_u->offset, flags);
else if (io_u->ddir == DDIR_TRIM) {
} else if (io_u->ddir == DDIR_TRIM) {
do_io_u_trim(td, io_u);
return FIO_Q_COMPLETED;
} else
Expand Down Expand Up @@ -476,7 +478,8 @@ static struct ioengine_ops ioengine_pvrw2 = {
.open_file = generic_open_file,
.close_file = generic_close_file,
.get_file_size = generic_get_file_size,
.flags = FIO_SYNCIO,
.flags = FIO_SYNCIO |
FIO_ATOMICWRITES,
.options = options,
.option_struct_size = sizeof(struct psyncv2_options),
};
Expand Down
36 changes: 36 additions & 0 deletions examples/atomic-verify.fio
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Data verification with atomic writes
#
# Some background on atomic writes:
#
# The main selling point of atomic writes is the guarantee that writes
# to storage will not be torn by a power failure or kernel crash.

# Another aspect of atomic writes is that they handle racing writes and
# reads, such that a read racing with a write will see all the data from
# the write or none. Well, SCSI and NVMe guarantee this if using
# RWF_ATOMIC, but it is not formally stated as a feature of RWF_ATOMIC.
#
# Fio verify mode can be used to prove that atomic writes make racing
# reads and writes "safe". This is done by running many jobs in a checksum
# (xsum) verify mode. In this way, checksums should be correct, although a
# job may be reading a data block written by another job; however,
# verify_write_sequence must be disabled, as it cannot be helped that data
# blocks will be out of sequence when many jobs are writing.
#
# Atomic write limits:
# For a block device, the max block size for atomic=1 is in
# /sys/block/sdXXX/queue/atomic_write_unit_max_bytes
# or this value can also be read with a statx syscall on the bdev file.

[write-and-verify]
rw=randwrite
bs=4k
direct=1
ioengine=libaio
iodepth=16
verify=crc64
atomic=1
verify_write_sequence=0
numjobs=10
# Use /dev/XXX or filename
filename=/dev/XXX
21 changes: 21 additions & 0 deletions fio.1
Original file line number Diff line number Diff line change
Expand Up @@ -2266,6 +2266,19 @@ cached data. Currently the RWF_NOWAIT flag is not supported for cached writes.
For direct I/O, requests will only succeed if cache invalidation isn't required,
file blocks are fully allocated and the disk request could be issued immediately.
.TP
.BI (pvsync2,libaio,io_uring)atomic \fR=\fPbool
This option means that writes are issued with torn-write protection, meaning
that in the event of a power failure or kernel crash, all or none of the data
from the write will be stored, but never a mix of old and new data. Torn-write
protection is also known as atomic writes.

This option sets the RWF_ATOMIC flag (supported from the 6.11 Linux kernel) on
a per-IO basis.

Writes with RWF_ATOMIC set will be rejected by the kernel when the file does
not support torn-write protection. To learn a file's torn-write limits, issue
statx with STATX_WRITE_ATOMIC.
.TP
.BI (io_uring_cmd,xnvme)fdp \fR=\fPbool
Enable Flexible Data Placement mode for write commands.
.TP
Expand Down Expand Up @@ -3713,6 +3726,14 @@ Enable experimental verification. Standard verify records I/O metadata for
later use during the verification phase. Experimental verify instead resets the
file after the write phase and then replays I/Os for the verification phase.
.TP
.BI verify_write_sequence \fR=\fPbool
Verify the header write sequence number. In a scenario with multiple jobs,
verification of the write sequence number may fail. Disabling this option
will mean that write sequence number checking is skipped. Doing that can be
useful for testing atomic writes, as it means that checksum verification can
still be attempted. When \fBatomic\fR is enabled, checksum verification
is expected to succeed (while write sequence checking can still fail).
Defaults to true.
.TP
.BI trim_percentage \fR=\fPint
Number of verify blocks to discard/trim.
.TP
Expand Down
14 changes: 14 additions & 0 deletions init.c
Original file line number Diff line number Diff line change
Expand Up @@ -853,6 +853,20 @@ static int fixup_options(struct thread_data *td)
(o->max_bs[DDIR_WRITE] % o->verify_interval))
o->verify_interval = gcd(o->min_bs[DDIR_WRITE],
o->max_bs[DDIR_WRITE]);

if (td->o.verify_only)
o->verify_write_sequence = 0;
}

if (td->o.oatomic) {
if (!td_ioengine_flagged(td, FIO_ATOMICWRITES)) {
log_err("fio: engine does not support atomic writes\n");
td->o.oatomic = 0;
ret |= 1;
}

if (!td_write(td))
td->o.oatomic = 0;
}

if (o->pre_read) {
Expand Down
2 changes: 2 additions & 0 deletions ioengines.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ enum {
__FIO_RO_NEEDS_RW_OPEN, /* open files in rw mode even if we have a read job; only
affects ioengines using generic_open_file */
__FIO_MULTI_RANGE_TRIM, /* ioengine supports trim with more than one range */
__FIO_ATOMICWRITES, /* ioengine supports atomic writes */
__FIO_IOENGINE_F_LAST, /* not a real bit; used to count number of bits */
};

Expand All @@ -120,6 +121,7 @@ enum fio_ioengine_flags {
FIO_SKIPPABLE_IOMEM_ALLOC = 1 << __FIO_SKIPPABLE_IOMEM_ALLOC,
FIO_RO_NEEDS_RW_OPEN = 1 << __FIO_RO_NEEDS_RW_OPEN,
FIO_MULTI_RANGE_TRIM = 1 << __FIO_MULTI_RANGE_TRIM,
FIO_ATOMICWRITES = 1 << __FIO_ATOMICWRITES,
};

/*
Expand Down
13 changes: 13 additions & 0 deletions options.c
Original file line number Diff line number Diff line change
Expand Up @@ -2926,6 +2926,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_IO_TYPE,
},
#ifdef FIO_HAVE_RWF_ATOMIC
{
.name = "atomic",
.lname = "Atomic I/O",
Expand All @@ -2936,6 +2937,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_IO_TYPE,
},
#endif
{
.name = "buffered",
.lname = "Buffered I/O",
Expand Down Expand Up @@ -3395,6 +3397,17 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_VERIFY,
},
{
.name = "verify_write_sequence",
.lname = "Verify write sequence number",
.off1 = offsetof(struct thread_options, verify_write_sequence),
.type = FIO_OPT_BOOL,
.def = "1",
.help = "Verify header write sequence number",
.parent = "verify",
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_VERIFY,
},
#ifdef FIO_HAVE_TRIM
{
.name = "trim_percentage",
Expand Down
5 changes: 5 additions & 0 deletions os/os-linux.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
#define FIO_HAVE_BYTEORDER_FUNCS
#define FIO_HAVE_PWRITEV2
#define FIO_HAVE_SHM_ATTACH_REMOVED
#define FIO_HAVE_RWF_ATOMIC

#ifdef MAP_HUGETLB
#define FIO_HAVE_MMAP_HUGE
Expand Down Expand Up @@ -328,6 +329,10 @@ static inline int fio_set_sched_idle(void)
#define RWF_NOWAIT 0x00000008
#endif

#ifndef RWF_ATOMIC
#define RWF_ATOMIC 0x00000040
#endif

#ifndef RWF_WRITE_LIFE_SHIFT
#define RWF_WRITE_LIFE_SHIFT 4
#define RWF_WRITE_LIFE_SHORT (1 << RWF_WRITE_LIFE_SHIFT)
Expand Down
1 change: 1 addition & 0 deletions thread_options.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ struct thread_options {
unsigned int experimental_verify;
unsigned int verify_state;
unsigned int verify_state_save;
unsigned int verify_write_sequence;
unsigned int use_thread;
unsigned int unlink;
unsigned int unlink_each_loop;
Expand Down
6 changes: 3 additions & 3 deletions tools/fiograph/fiograph.conf
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,13 @@ specific_options=https http_host http_user http_pass http_s3_key http_s3_ke
specific_options=ime_psync ime_psyncv

[ioengine_io_uring]
specific_options=hipri cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit fixedbufs registerfiles sqthread_poll sqthread_poll_cpu nonvectored nowait force_async
specific_options=hipri cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit fixedbufs registerfiles sqthread_poll sqthread_poll_cpu nonvectored nowait force_async atomic

[ioengine_io_uring_cmd]
specific_options=hipri cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit fixedbufs registerfiles sqthread_poll sqthread_poll_cpu nonvectored nowait force_async cmd_type md_per_io_size pi_act pi_chk apptag apptag_mask

[ioengine_libaio]
specific_options=userspace_reap cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit nowait
specific_options=userspace_reap cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit nowait atomic

[ioengine_libblkio]
specific_options=libblkio_driver libblkio_path libblkio_pre_connect_props libblkio_num_entries libblkio_queue_size libblkio_pre_start_props hipri libblkio_vectored libblkio_write_zeroes_on_trim libblkio_wait_mode libblkio_force_enable_completion_eventfd
Expand Down Expand Up @@ -99,7 +99,7 @@ specific_options=hostname bindname port verb
specific_options=hipri readfua writefua sg_write_mode stream_id

[ioengine_pvsync2]
specific_options=hipri hipri_percentage nowait sync psync vsync pvsync
specific_options=hipri hipri_percentage nowait sync psync vsync pvsync atomic

[ioengine_xnvme]
specific_options=hipri sqthread_poll xnvme_be xnvme_async xnvme_sync xnvme_admin xnvme_dev_nsid xnvme_iovec
7 changes: 4 additions & 3 deletions verify.c
Original file line number Diff line number Diff line change
Expand Up @@ -848,12 +848,13 @@ static int verify_header(struct io_u *io_u, struct thread_data *td,
/*
* For read-only workloads, the program cannot be certain of the
* last numberio written to a block. Checking of numberio will be
* done only for workloads that write data. For verify_only,
* numberio check is skipped.
* done only for workloads that write data. For verify_only or
* any mode de-selecting verify_write_sequence, numberio check is
* skipped.
*/
if (td_write(td) && (td_min_bs(td) == td_max_bs(td)) &&
!td->o.time_based)
if (!td->o.verify_only)
if (td->o.verify_write_sequence)
if (hdr->numberio != io_u->numberio) {
log_err("verify: bad header numberio %"PRIu16
", wanted %"PRIu16,
Expand Down

0 comments on commit bcd46be

Please sign in to comment.