From 04d5add8452b55b89c39625d9139511584fd4c21 Mon Sep 17 00:00:00 2001
From: Hyunwoo Park <dshw.park@samsung.com>
Date: Tue, 30 Apr 2024 06:58:12 +0000
Subject: [PATCH] fdp: support scheme placement id (index) selection

Add a new placement id selection method called scheme. It allows
users to assign a placement ID (index) depending on the offset range.
The scheme is specified by the user in a file, and the path to that
file is passed to fio using the option dp_scheme.

Signed-off-by: Hyunwoo Park <dshw.park@samsung.com>
---
 HOWTO.rst        | 28 +++++++++++++++--
 cconv.c          |  2 ++
 dataplacement.c  | 78 ++++++++++++++++++++++++++++++++++++++++++++++++
 dataplacement.h  | 14 ++++++++-
 file.h           |  1 +
 fio.1            | 32 +++++++++++++++++++-
 options.c        | 52 ++++++++++++++++++++++++++++++++
 server.h         |  2 +-
 thread_options.h |  2 ++
 9 files changed, 206 insertions(+), 5 deletions(-)

diff --git a/HOWTO.rst b/HOWTO.rst
index 2f8ef6d42c..3b262faeae 100644
--- a/HOWTO.rst
+++ b/HOWTO.rst
@@ -2529,8 +2529,12 @@ with the caveat that when used on the command line, they must come after the
 			Round robin over available placement IDs. This is the
 			default.
 
-	The available placement ID (indices) are defined by the option
-	:option:`plids`.
+		**scheme**
+			Choose a placement ID (index) based on the scheme file defined by
+			the option :option:`dp_scheme`.
+
+	The available placement ID (indices) are defined by the option :option:`fdp_pli`
+	or :option:`plids` except for the case of **scheme**.
 
 .. option:: plids=str, fdp_pli=str : [io_uring_cmd] [xnvme]
 
@@ -2541,6 +2545,26 @@ with the caveat that when used on the command line, they must come after the
         identifiers only at indices 0, 2 and 5 specify ``plids=0,2,5``. For
         streams this should be a comma-separated list of Stream IDs.
 
+.. option:: dp_scheme=str : [io_uring_cmd] [xnvme]
+
+	Defines which placement ID (index) is selected based on the offset (LBA) range.
+	The file named by this option should contain one or more scheme entries in the following format:
+
+		0, 10737418240, 0
+		10737418240, 21474836480, 1
+		21474836480, 32212254720, 2
+		...
+
+	Each line, a scheme entry, contains start offset, end offset, and placement ID
+	(index) separated by comma(,). If the write offset is within the range of a certain
+	scheme entry(start offset ≤ offset < end offset), the corresponding placement ID
+	(index) will be selected. If the write offset belongs to multiple scheme entries,
+	the first matched scheme entry will be applied. If the offset is not within any range
+	of scheme entry, dspec field will be set to 0, default RUH. (Caution: In case of
+	multiple devices in a job, all devices of the job will be affected by the scheme. If
+	this option is specified, the option :option:`plids` or :option:`fdp_pli` will be
+	ignored.)
+
 .. option:: md_per_io_size=int : [io_uring_cmd] [xnvme]
 
 	Size in bytes for separate metadata buffer per IO. Default: 0.
diff --git a/cconv.c b/cconv.c
index 16112248a6..9b344940cb 100644
--- a/cconv.c
+++ b/cconv.c
@@ -94,6 +94,7 @@ int convert_thread_options_to_cpu(struct thread_options *o,
 	string_to_cpu(&o->ioscheduler, top->ioscheduler);
 	string_to_cpu(&o->profile, top->profile);
 	string_to_cpu(&o->cgroup, top->cgroup);
+	string_to_cpu(&o->dp_scheme_file, top->dp_scheme_file);
 
 	o->allow_create = le32_to_cpu(top->allow_create);
 	o->allow_mounted_write = le32_to_cpu(top->allow_mounted_write);
@@ -398,6 +399,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
 	string_to_net(top->ioscheduler, o->ioscheduler);
 	string_to_net(top->profile, o->profile);
 	string_to_net(top->cgroup, o->cgroup);
+	string_to_net(top->dp_scheme_file, o->dp_scheme_file);
 
 	top->allow_create = cpu_to_le32(o->allow_create);
 	top->allow_mounted_write = cpu_to_le32(o->allow_mounted_write);
diff --git a/dataplacement.c b/dataplacement.c
index 1d5b21edfd..8a4c8e6441 100644
--- a/dataplacement.c
+++ b/dataplacement.c
@@ -100,6 +100,56 @@ static int init_ruh_info(struct thread_data *td, struct fio_file *f)
 	return ret;
 }
 
+/*
+ * Read the scheme file into a table of offset ranges and placement IDs for @f.
+ * Returns 0 on success or when FIO_DP_SCHEME is not selected, else a negative errno.
+ */
+static int init_ruh_scheme(struct thread_data *td, struct fio_file *f)
+{
+	struct fio_ruhs_scheme *ruh_scheme;
+	FILE *scheme_fp;
+	unsigned long long start, end;
+	uint16_t pli;
+	int ret = 0;
+
+	if (td->o.dp_id_select != FIO_DP_SCHEME)
+		return 0;
+
+	scheme_fp = fopen(td->o.dp_scheme_file, "r");
+	if (!scheme_fp) {
+		/* Capture errno before logging; the log call may overwrite it */
+		ret = -errno;
+		log_err("fio: ruh scheme failed to open scheme file %s\n",
+			 td->o.dp_scheme_file);
+		return ret;
+	}
+
+	ruh_scheme = scalloc(1, sizeof(*ruh_scheme));
+	if (!ruh_scheme) {
+		ret = -ENOMEM;
+		goto out_with_close_fp;
+	}
+
+	for (int i = 0;
+		i < DP_MAX_SCHEME_ENTRIES && fscanf(scheme_fp, "%llu,%llu,%hu\n", &start, &end, &pli) == 3;
+		i++) {
+		ruh_scheme->scheme_entries[i].start_offset = start;
+		ruh_scheme->scheme_entries[i].end_offset = end;
+		ruh_scheme->scheme_entries[i].pli = pli;
+		ruh_scheme->nr_schemes++;
+	}
+
+	/* One more parsable entry means the file holds more than the table can */
+	if (fscanf(scheme_fp, "%llu,%llu,%hu\n", &start, &end, &pli) == 3)
+		log_info("fio: too many scheme entries in %s. Only the first %d scheme entries are applied\n",
+			 td->o.dp_scheme_file, DP_MAX_SCHEME_ENTRIES);
+
+	f->ruhs_scheme = ruh_scheme;
+out_with_close_fp:
+	fclose(scheme_fp);
+	return ret;
+}
+
 int dp_init(struct thread_data *td)
 {
 	struct fio_file *f;
@@ -109,6 +159,10 @@ int dp_init(struct thread_data *td)
 		ret = init_ruh_info(td, f);
 		if (ret)
 			break;
+
+		ret = init_ruh_scheme(td, f);
+		if (ret)
+			break;
 	}
 	return ret;
 }
@@ -119,6 +173,11 @@ void fdp_free_ruhs_info(struct fio_file *f)
 		return;
 	sfree(f->ruhs_info);
 	f->ruhs_info = NULL;
+
+	if (!f->ruhs_scheme)
+		return;
+	sfree(f->ruhs_scheme);
+	f->ruhs_scheme = NULL;
 }
 
 void dp_fill_dspec_data(struct thread_data *td, struct io_u *io_u)
@@ -138,6 +197,25 @@ void dp_fill_dspec_data(struct thread_data *td, struct io_u *io_u)
 			ruhs->pli_loc = 0;
 
 		dspec = ruhs->plis[ruhs->pli_loc++];
+	} else if (td->o.dp_id_select == FIO_DP_SCHEME) {
+		struct fio_ruhs_scheme *ruhs_scheme = f->ruhs_scheme;
+		unsigned long long offset = io_u->offset;
+		int i;
+
+		for (i = 0; i < ruhs_scheme->nr_schemes; i++) {
+			if (offset >= ruhs_scheme->scheme_entries[i].start_offset &&
+			    offset < ruhs_scheme->scheme_entries[i].end_offset) {
+				dspec = ruhs_scheme->scheme_entries[i].pli;
+				break;
+			}
+		}
+
+		/*
+		 * If the write offset is not affected by any scheme entry,
+		 * 0(default RUH) will be assigned to dspec
+		 */
+		if (i == ruhs_scheme->nr_schemes)
+			dspec = 0;
 	} else {
 		ruhs->pli_loc = rand_between(&td->fdp_state, 0, ruhs->nr_ruhs - 1);
 		dspec = ruhs->plis[ruhs->pli_loc];
diff --git a/dataplacement.h b/dataplacement.h
index b5718c869e..71d19d6965 100644
--- a/dataplacement.h
+++ b/dataplacement.h
@@ -7,6 +7,7 @@
 #define FDP_DIR_DTYPE		2
 #define FDP_MAX_RUHS		128
 #define FIO_MAX_DP_IDS 		16
+#define DP_MAX_SCHEME_ENTRIES	32
 
 /*
  * How fio chooses what placement identifier to use next. Choice of
@@ -15,9 +16,9 @@
 enum {
 	FIO_DP_RANDOM	= 0x1,
 	FIO_DP_RR	= 0x2,
+	FIO_DP_SCHEME	= 0x3,	/* pick the ID by offset range from the scheme file */
 };
 
-
 enum {
 	FIO_DP_NONE	= 0x0,
 	FIO_DP_FDP	= 0x1,
@@ -30,6 +31,17 @@ struct fio_ruhs_info {
 	uint16_t plis[];
 };
 
+struct fio_ruhs_scheme_entry {
+	unsigned long long start_offset;	/* first offset of the range (inclusive) */
+	unsigned long long end_offset;		/* end of the range (exclusive) */
+	uint16_t pli;				/* placement ID (index) used on a match */
+};
+
+struct fio_ruhs_scheme {
+	uint16_t nr_schemes;	/* number of populated scheme_entries[] slots */
+	struct fio_ruhs_scheme_entry scheme_entries[DP_MAX_SCHEME_ENTRIES];
+};
+
 int dp_init(struct thread_data *td);
 void fdp_free_ruhs_info(struct fio_file *f);
 void dp_fill_dspec_data(struct thread_data *td, struct io_u *io_u);
diff --git a/file.h b/file.h
index deb36e0291..e38ed2f123 100644
--- a/file.h
+++ b/file.h
@@ -103,6 +103,7 @@ struct fio_file {
 	uint64_t io_size;
 
 	struct fio_ruhs_info *ruhs_info;
+	struct fio_ruhs_scheme *ruhs_scheme;
 
 	/*
 	 * Zoned block device information. See also zonemode=zbd.
diff --git a/fio.1 b/fio.1
index ee8124946a..1c8e3a5670 100644
--- a/fio.1
+++ b/fio.1
@@ -2294,9 +2294,14 @@ Choose a placement ID at random (uniform).
 .TP
 .B roundrobin
 Round robin over available placement IDs. This is the default.
+.TP
+.B scheme
+Choose a placement ID (index) based on the scheme file defined by
+the option \fBdp_scheme\fP.
 .RE
 .P
-The available placement ID (indices) are defined by the \fBplids\fR option.
+The available placement ID (indices) are defined by \fBplids\fR or
+\fBfdp_pli\fR option except for the case of \fBscheme\fP.
 .RE
 .TP
 .BI (io_uring_cmd,xnvme)plids=str, fdp_pli \fR=\fPstr
@@ -2307,6 +2312,31 @@ jobs. If you want fio to use placement identifier only at indices 0, 2 and 5
 specify, you would set `plids=0,2,5`. For streams this should be a
 comma-separated list of Stream IDs.
 .TP
+.BI (io_uring_cmd,xnvme)\fR\fBdp_scheme\fP=str
+Defines which placement ID (index) is selected based on the offset (LBA) range.
+The file named by this option should contain one or more scheme entries in the following format:
+.sp
+.RS
+.RS
+0, 10737418240, 0
+.br
+10737418240, 21474836480, 1
+.br
+21474836480, 32212254720, 2
+.br
+\&...
+.RE
+.sp
+Each line, a scheme entry, contains start offset, end offset, and placement ID
+(index) separated by comma(,). If the write offset is within the range of a certain
+scheme entry(start offset ≤ offset < end offset), the corresponding placement ID
+(index) will be selected. If the write offset belongs to multiple scheme entries,
+the first matched scheme entry will be applied. If the offset is not within any range
+of scheme entry, dspec field will be set to 0, default RUH. (Caution: In case of
+multiple devices in a job, all devices of the job will be affected by the scheme. If
+this option is specified, the option \fBplids\fP or \fBfdp_pli\fP will be ignored.)
+.RE
+.TP
 .BI (io_uring_cmd,xnvme)md_per_io_size \fR=\fPint
 Size in bytes for separate metadata buffer per IO. Default: 0.
 .TP
diff --git a/options.c b/options.c
index 61ea41cc4e..f5d221c776 100644
--- a/options.c
+++ b/options.c
@@ -287,6 +287,43 @@ static int str_fdp_pli_cb(void *data, const char *input)
 	return 0;
 }
 
+/*
+ * Callback for the dp_scheme option: strips blanks around the stored filename
+ * and checks that it names a regular file. Returns 0 on success, else an errno.
+ */
+static int str_dp_scheme_cb(void *data, const char *input)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	struct stat sb;
+	char *buf, *filename;
+	int ret = 0;
+
+	if (parse_dryrun())
+		return 0;
+
+	buf = strdup(td->o.dp_scheme_file);
+	if (!buf)
+		return ENOMEM;
+	/* strip_blank_front() may advance the pointer; keep buf for free() */
+	filename = buf;
+	strip_blank_front(&filename);
+	strip_blank_end(filename);
+	strcpy(td->o.dp_scheme_file, filename);
+
+	if (lstat(filename, &sb) < 0) {
+		ret = errno;
+		log_err("fio: lstat() error related to %s\n", filename);
+		td_verror(td, ret, "lstat");
+	} else if (!S_ISREG(sb.st_mode)) {
+		/* No call failed here, so errno is stale; report EINVAL instead */
+		ret = EINVAL;
+		log_err("fio: %s is not a file\n", filename);
+		td_verror(td, ret, "S_ISREG");
+	}
+	free(buf);
+	return ret;
+}
+
 static int str_bssplit_cb(void *data, const char *input)
 {
 	struct thread_data *td = cb_data_to_td(data);
@@ -3760,6 +3797,10 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 			    .oval = FIO_DP_RR,
 			    .help = "Round robin select Placement IDs",
 			  },
+			  { .ival = "scheme",
+			    .oval = FIO_DP_SCHEME,
+			    .help = "Use a scheme(based on LBA) to select Placement IDs",
+			  },
 		},
 	},
 	{
@@ -3774,6 +3815,17 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_INVALID,
 	},
+	{
+		.name	= "dp_scheme",
+		.lname	= "Data Placement Scheme",
+		.type	= FIO_OPT_STR_STORE,
+		.cb	= str_dp_scheme_cb,
+		.off1	= offsetof(struct thread_options, dp_scheme_file),
+		.maxlen	= PATH_MAX,
+		.help	= "scheme file that specifies offset-RUH mapping",
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_INVALID,
+	},
 	{
 		.name	= "lockmem",
 		.lname	= "Lock memory",
diff --git a/server.h b/server.h
index 83ce449ba0..e8659f7920 100644
--- a/server.h
+++ b/server.h
@@ -51,7 +51,7 @@ struct fio_net_cmd_reply {
 };
 
 enum {
-	FIO_SERVER_VER			= 104,
+	FIO_SERVER_VER			= 105,
 
 	FIO_SERVER_MAX_FRAGMENT_PDU	= 1024,
 	FIO_SERVER_MAX_CMD_MB		= 2048,
diff --git a/thread_options.h b/thread_options.h
index a36b79094f..ccd0c064b9 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -396,6 +396,7 @@ struct thread_options {
 	unsigned int dp_id_select;
 	unsigned int dp_ids[FIO_MAX_DP_IDS];
 	unsigned int dp_nr_ids;
+	char *dp_scheme_file;
 
 	unsigned int log_entries;
 	unsigned int log_prio;
@@ -713,6 +714,7 @@ struct thread_options_pack {
 	uint32_t dp_id_select;
 	uint32_t dp_ids[FIO_MAX_DP_IDS];
 	uint32_t dp_nr_ids;
+	uint8_t dp_scheme_file[FIO_TOP_STR_MAX];
 
 	uint32_t num_range;
 	/*