From b6f1b373ca11d71b506d162fd38e79e76822bd31 Mon Sep 17 00:00:00 2001 From: Jan Riemer <2428387-janriemer@users.noreply.gitlab.com> Date: Sun, 8 Sep 2024 16:27:00 +0200 Subject: [PATCH] `diff`: add flag `--drop-equal-fields` This implements a new flag for the command `diff`. When activated, it drops the values of fields that are equal within a row of type `Modified` and replaces them with the empty string (an empty byte slice to be precise). For now, the value for replacing equal values is not configurable, but should be trivial to add in the future. Note that key field values are _not_ dropped and always appear in the output. Example: csv_left.csv col1,col2,col3 1,foo,bar csv_right.csv col1,col2,col3 1,foo,baz qsv diff --drop-equal-fields csv_left.csv csv_right.csv Output: diffresult,col1,col2,col3 -,1,,bar +,1,,baz See #2000 --- src/cmd/diff.rs | 98 ++++++++++++++++++++++++++----- tests/test_diff.rs | 142 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 225 insertions(+), 15 deletions(-) diff --git a/src/cmd/diff.rs b/src/cmd/diff.rs index 633777987..45a8cc9a9 100644 --- a/src/cmd/diff.rs +++ b/src/cmd/diff.rs @@ -30,6 +30,11 @@ Find the difference between two CSVs, comparing records that have the same value in the first two columns and sort the result by the first two columns: qsv diff -k 0,1 --sort-columns 0,1 left.csv right.csv +Find the difference between two CSVs, but do not output equal field values +in the result (equal field values are replaced with the empty string). Key +field values _will_ appear in the output: + qsv diff --drop-equal-fields left.csv right.csv + Find the difference between two CSVs, but do not output headers in the result: qsv diff --no-headers-output left.csv right.csv @@ -81,6 +86,9 @@ diff options: left CSV's headers are used to match the column names and it is assumed that the right CSV has the same selected column names in the same order as the left CSV. 
+ --drop-equal-fields Drop values of equal fields in modified rows of the CSV + diff result (and replace them with the empty string). + Key field values will not be dropped. -j, --jobs The number of jobs to run in parallel. When not set, the number of jobs is set to the number of CPUs detected. @@ -92,6 +100,7 @@ Common options: use std::io::{self, Write}; +use csv::ByteRecord; use csv_diff::{ csv_diff::CsvByteDiffBuilder, csv_headers::Headers, diff_result::DiffByteRecords, diff_row::DiffByteRecord, @@ -119,6 +128,7 @@ struct Args { flag_delimiter_output: Option, flag_key: Option, flag_sort_columns: Option, + flag_drop_equal_fields: bool, } pub fn run(argv: &[&str]) -> CliResult<()> { @@ -242,7 +252,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { util::njobs(args.flag_jobs); let Ok(csv_diff) = CsvByteDiffBuilder::new() - .primary_key_columns(primary_key_cols) + .primary_key_columns(primary_key_cols.clone()) .build() else { return fail_clierror!("Cannot instantiate diff"); @@ -263,24 +273,38 @@ pub fn run(argv: &[&str]) -> CliResult<()> { }, } - let mut csv_diff_writer = CsvDiffWriter::new(wtr, args.flag_no_headers_output); + let mut csv_diff_writer = CsvDiffWriter::new( + wtr, + args.flag_no_headers_output, + args.flag_drop_equal_fields, + primary_key_cols, + ); Ok(csv_diff_writer.write_diff_byte_records(diff_byte_records)?) 
} struct CsvDiffWriter { - csv_writer: csv::Writer, - no_headers: bool, + csv_writer: csv::Writer, + no_headers: bool, + drop_equal_fields: bool, + key_fields: Vec, } impl CsvDiffWriter { - const fn new(csv_writer: csv::Writer, no_headers: bool) -> Self { + fn new( + csv_writer: csv::Writer, + no_headers: bool, + drop_equal_fields: bool, + key_fields: impl IntoIterator, + ) -> Self { Self { csv_writer, no_headers, + drop_equal_fields, + key_fields: key_fields.into_iter().collect(), } } - fn write_headers(&mut self, headers: &Headers, num_columns: Option) -> csv::Result<()> { + fn write_headers(&mut self, headers: &Headers, num_columns: Option<&usize>) -> csv::Result<()> { match (headers.headers_left(), headers.headers_right()) { (Some(lbh), Some(_rbh)) => { // currently, `diff` can only handle two CSVs that have the same @@ -296,7 +320,8 @@ impl CsvDiffWriter { } }, (None, None) => { - if let (Some(num_cols), false) = (num_columns.filter(|&c| c > 0), self.no_headers) { + if let (Some(&num_cols), false) = (num_columns.filter(|&&c| c > 0), self.no_headers) + { let headers_generic = rename_headers_all_generic(num_cols); let mut new_rdr = csv::Reader::from_reader(headers_generic.as_bytes()); let new_headers = new_rdr.byte_headers()?; @@ -309,7 +334,10 @@ impl CsvDiffWriter { } fn write_diff_byte_records(&mut self, diff_byte_records: DiffByteRecords) -> io::Result<()> { - self.write_headers(diff_byte_records.headers(), diff_byte_records.num_columns())?; + self.write_headers( + diff_byte_records.headers(), + diff_byte_records.num_columns().as_ref(), + )?; for dbr in diff_byte_records { self.write_diff_byte_record(&dbr)?; } @@ -330,16 +358,34 @@ impl CsvDiffWriter { DiffByteRecord::Modify { delete, add, - // TODO: this should be used in the future to highlight the column where differences - // occur - field_indices: _field_indices, + field_indices, } => { - let mut vec_del = vec![remove_sign]; - vec_del.extend(delete.byte_record()); + let vec_del = if 
self.drop_equal_fields { + self.fill_modified_and_drop_equal_fields( + remove_sign, + delete.byte_record(), + field_indices.as_slice(), + ) + } else { + let mut tmp = vec![remove_sign]; + tmp.extend(delete.byte_record()); + tmp + }; + self.csv_writer.write_record(vec_del)?; - let mut vec_add = vec![add_sign]; - vec_add.extend(add.byte_record()); + let vec_add = if self.drop_equal_fields { + self.fill_modified_and_drop_equal_fields( + add_sign, + add.byte_record(), + field_indices.as_slice(), + ) + } else { + let mut tmp = vec![add_sign]; + tmp.extend(add.byte_record()); + tmp + }; + self.csv_writer.write_record(vec_add) }, DiffByteRecord::Delete(del) => { @@ -349,6 +395,28 @@ impl CsvDiffWriter { }, } } + + fn fill_modified_and_drop_equal_fields<'a>( + &self, + prefix: &'a [u8], + byte_record: &'a ByteRecord, + modified_field_indices: &[usize], + ) -> Vec<&'a [u8]> { + let mut vec_to_fill = { + // We start out with all fields set to an empty byte slice + // (which end up as our equal fields). 
+ let mut tmp = vec![&b""[..]; byte_record.len() + 1 /* + 1, because we need to store our additional prefix*/]; + tmp.as_mut_slice()[0] = prefix; + tmp + }; + // key field values and modified field values should appear in the output + for &key_field in self.key_fields.iter().chain(modified_field_indices) { + // + 1 here, because of the prefix value (see above) + vec_to_fill[key_field + 1] = &byte_record[key_field]; + } + + vec_to_fill + } } trait WriteDiffResultHeader { diff --git a/tests/test_diff.rs b/tests/test_diff.rs index 4b4cdf1e7..96d49bb62 100644 --- a/tests/test_diff.rs +++ b/tests/test_diff.rs @@ -547,6 +547,148 @@ diffresult;h1;h2;h3 assert_eq!(got.as_str(), expected); } +#[test] +fn diff_drop_equal_fields_flag_on_modified_rows_one_row_modified() { + let wrk = Workdir::new("diff_drop_equal_fields_flag_on_modified_rows_one_row_modified"); + + let left = vec![ + svec!["h1", "h2", "h3"], + svec!["1", "deleted", "row"], + svec!["2", "baz", "quux"], + svec!["3", "corge", "grault"], + ]; + wrk.create("left.csv", left); + + let right = vec![ + svec!["h1", "h2", "h3"], + svec!["2", "baz", "quux_modified"], + svec!["3", "corge", "grault"], + svec!["4", "added", "row"], + ]; + wrk.create("right.csv", right); + + let mut cmd = wrk.command("diff"); + cmd.args(["left.csv", "right.csv", "--drop-equal-fields"]); + + let got: String = wrk.stdout(&mut cmd); + let expected = "\ +diffresult,h1,h2,h3 +-,1,deleted,row +-,2,,quux ++,2,,quux_modified ++,4,added,row"; + assert_eq!(got.as_str(), expected); +} + +#[test] +fn diff_drop_equal_fields_flag_on_modified_rows_multiple_fields_on_one_row_equal() { + let wrk = Workdir::new( + "diff_drop_equal_fields_flag_on_modified_rows_multiple_fields_on_one_row_equal", + ); + + let left = vec![ + svec!["h1", "h2", "h3", "h4"], + svec!["1", "deleted", "row", "foo"], + svec!["2", "baz", "quux", "drix"], + svec!["3", "corge", "grault", "bar"], + ]; + wrk.create("left.csv", left); + + let right = vec![ + svec!["h1", "h2", "h3", "h4"], + 
svec!["2", "baz", "quux", "drix_modified"], + svec!["3", "corge", "grault", "bar"], + svec!["4", "added", "row", "new"], + ]; + wrk.create("right.csv", right); + + let mut cmd = wrk.command("diff"); + cmd.args(["left.csv", "right.csv", "--drop-equal-fields"]); + + let got: String = wrk.stdout(&mut cmd); + let expected = "\ +diffresult,h1,h2,h3,h4 +-,1,deleted,row,foo +-,2,,,drix ++,2,,,drix_modified ++,4,added,row,new"; + assert_eq!(got.as_str(), expected); +} + +#[test] +fn diff_drop_equal_fields_flag_on_modified_rows_multiple_rows_modified_in_different_columns() { + let wrk = Workdir::new( + "diff_drop_equal_fields_flag_on_modified_rows_multiple_rows_modified_in_different_columns", + ); + + let left = vec![ + svec!["h1", "h2", "h3"], + svec!["1", "deleted", "row"], + svec!["2", "baz", "quux"], + svec!["3", "corge", "grault"], + ]; + wrk.create("left.csv", left); + + let right = vec![ + svec!["h1", "h2", "h3"], + svec!["2", "baz", "quux_modified"], + svec!["3", "corge_modified", "grault"], + svec!["4", "added", "row"], + ]; + wrk.create("right.csv", right); + + let mut cmd = wrk.command("diff"); + cmd.args(["left.csv", "right.csv", "--drop-equal-fields"]); + + let got: String = wrk.stdout(&mut cmd); + let expected = "\ +diffresult,h1,h2,h3 +-,1,deleted,row +-,2,,quux ++,2,,quux_modified +-,3,corge, ++,3,corge_modified, ++,4,added,row"; + assert_eq!(got.as_str(), expected); +} + +#[test] +fn diff_drop_equal_fields_flag_on_modified_rows_multiple_key_fields_far_apart() { + let wrk = + Workdir::new("diff_drop_equal_fields_flag_on_modified_rows_multiple_key_fields_far_apart"); + + let left = vec![ + svec!["h1", "h2", "h3", "h4"], + svec!["1", "deleted", "row", "id1"], + svec!["2", "baz", "quux", "id2"], + svec!["3", "corge", "grault", "id3"], + ]; + wrk.create("left.csv", left); + + let right = vec![ + svec!["h1", "h2", "h3", "h4"], + svec!["2", "baz", "quux_modified", "id2"], + svec!["3", "corge_modified", "grault", "id3"], + svec!["3", "added", "row", "id_new"], + ]; 
+ wrk.create("right.csv", right); + + let mut cmd = wrk.command("diff"); + // here, first and last columns are our key fields + cmd.args(["left.csv", "right.csv", "--drop-equal-fields", "-k", "0,3"]); + + let got: String = wrk.stdout(&mut cmd); + let expected = "\ +diffresult,h1,h2,h3,h4 +-,1,deleted,row,id1 +-,2,,quux,id2 ++,2,,quux_modified,id2 +-,3,corge,,id3 ++,3,corge_modified,,id3 ++,3,added,row,id_new"; + assert_eq!(got.as_str(), expected); +} + fn create_file_with_delim(wrk: &Workdir, file_path_new: &str, file_path: &str, delimiter: u8) { let mut select_cmd = wrk.command("select"); select_cmd.args(["1-", file_path]);