Skip to content

Commit

Permalink
diff: add flag --drop-equal-fields
Browse files Browse the repository at this point in the history
This implements a new flag for the command `diff`. When activated, it
drops the values of fields that are equal within a row of type
`Modify` and replaces them with the empty string (an empty byte slice
to be precise). For now, the replacement value for equal fields is not
configurable, but making it configurable should be trivial in the future.

Note that key field values are _not_ dropped and always appear in the
output.

Example:
csv_left.csv    col1,col2,col3
                1,foo,bar

csv_right.csv   col1,col2,col3
                1,foo,baz

qsv diff --drop-equal-fields csv_left.csv csv_right.csv

Output:         diffresult,col1,col2,col3
                -,1,,bar
                +,1,,baz

See jqnatividad#2000
  • Loading branch information
Jan Riemer committed Sep 8, 2024
1 parent f9cc559 commit 2beb709
Show file tree
Hide file tree
Showing 2 changed files with 225 additions and 15 deletions.
98 changes: 83 additions & 15 deletions src/cmd/diff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ Find the difference between two CSVs, comparing records that have the same value
in the first two columns and sort the result by the first two columns:
qsv diff -k 0,1 --sort-columns 0,1 left.csv right.csv
Find the difference between two CSVs, but do not output equal field values
in the result (equal field values are replaced with the empty string). Key
field values _will_ appear in the output:
qsv diff --drop-equal-fields left.csv right.csv
Find the difference between two CSVs, but do not output headers in the result:
qsv diff --no-headers-output left.csv right.csv
Expand Down Expand Up @@ -81,6 +86,9 @@ diff options:
left CSV's headers are used to match the column names
and it is assumed that the right CSV has the same
selected column names in the same order as the left CSV.
--drop-equal-fields Drop values of equal fields in modified rows of the CSV
diff result (and replace them with the empty string).
Key field values will not be dropped.
-j, --jobs <arg> The number of jobs to run in parallel.
When not set, the number of jobs is set to the number
of CPUs detected.
Expand All @@ -92,6 +100,7 @@ Common options:

use std::io::{self, Write};

use csv::ByteRecord;
use csv_diff::{
csv_diff::CsvByteDiffBuilder, csv_headers::Headers, diff_result::DiffByteRecords,
diff_row::DiffByteRecord,
Expand Down Expand Up @@ -119,6 +128,7 @@ struct Args {
flag_delimiter_output: Option<Delimiter>,
flag_key: Option<String>,
flag_sort_columns: Option<String>,
flag_drop_equal_fields: bool,
}

pub fn run(argv: &[&str]) -> CliResult<()> {
Expand Down Expand Up @@ -242,7 +252,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
util::njobs(args.flag_jobs);

let Ok(csv_diff) = CsvByteDiffBuilder::new()
.primary_key_columns(primary_key_cols)
.primary_key_columns(primary_key_cols.clone())
.build()
else {
return fail_clierror!("Cannot instantiate diff");
Expand All @@ -263,24 +273,38 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
},
}

let mut csv_diff_writer = CsvDiffWriter::new(wtr, args.flag_no_headers_output);
let mut csv_diff_writer = CsvDiffWriter::new(
wtr,
args.flag_no_headers_output,
args.flag_drop_equal_fields,
primary_key_cols,
);
Ok(csv_diff_writer.write_diff_byte_records(diff_byte_records)?)
}

struct CsvDiffWriter<W: Write> {
csv_writer: csv::Writer<W>,
no_headers: bool,
csv_writer: csv::Writer<W>,
no_headers: bool,
drop_equal_fields: bool,
key_fields: Vec<usize>,
}

impl<W: Write> CsvDiffWriter<W> {
const fn new(csv_writer: csv::Writer<W>, no_headers: bool) -> Self {
/// Creates a diff writer around `csv_writer`.
///
/// * `no_headers` - stored as-is in the `no_headers` field.
/// * `drop_equal_fields` - stored as-is in the `drop_equal_fields` field.
/// * `key_fields` - column indices, collected into the owned `key_fields`
///   vector in iteration order.
fn new(
    csv_writer: csv::Writer<W>,
    no_headers: bool,
    drop_equal_fields: bool,
    key_fields: impl IntoIterator<Item = usize>,
) -> Self {
    // Materialize the indices up front so the struct literal can use
    // field-init shorthand for every member.
    let key_fields: Vec<usize> = key_fields.into_iter().collect();
    Self {
        csv_writer,
        no_headers,
        drop_equal_fields,
        key_fields,
    }
}

fn write_headers(&mut self, headers: &Headers, num_columns: Option<usize>) -> csv::Result<()> {
fn write_headers(&mut self, headers: &Headers, num_columns: Option<&usize>) -> csv::Result<()> {
match (headers.headers_left(), headers.headers_right()) {
(Some(lbh), Some(_rbh)) => {
// currently, `diff` can only handle two CSVs that have the same
Expand All @@ -296,7 +320,8 @@ impl<W: Write> CsvDiffWriter<W> {
}
},
(None, None) => {
if let (Some(num_cols), false) = (num_columns.filter(|&c| c > 0), self.no_headers) {
if let (Some(&num_cols), false) = (num_columns.filter(|&&c| c > 0), self.no_headers)
{
let headers_generic = rename_headers_all_generic(num_cols);
let mut new_rdr = csv::Reader::from_reader(headers_generic.as_bytes());
let new_headers = new_rdr.byte_headers()?;
Expand All @@ -309,7 +334,10 @@ impl<W: Write> CsvDiffWriter<W> {
}

fn write_diff_byte_records(&mut self, diff_byte_records: DiffByteRecords) -> io::Result<()> {
self.write_headers(diff_byte_records.headers(), diff_byte_records.num_columns())?;
self.write_headers(
diff_byte_records.headers(),
diff_byte_records.num_columns().as_ref(),
)?;
for dbr in diff_byte_records {
self.write_diff_byte_record(&dbr)?;
}
Expand All @@ -330,16 +358,34 @@ impl<W: Write> CsvDiffWriter<W> {
DiffByteRecord::Modify {
delete,
add,
// TODO: this should be used in the future to highlight the column where differences
// occur
field_indices: _field_indices,
field_indices,
} => {
let mut vec_del = vec![remove_sign];
vec_del.extend(delete.byte_record());
let vec_del = if self.drop_equal_fields {
self.fill_modified_and_drop_equal_fields(
remove_sign,
delete.byte_record(),
field_indices.as_slice(),
)
} else {
let mut tmp = vec![remove_sign];
tmp.extend(delete.byte_record());
tmp
};

self.csv_writer.write_record(vec_del)?;

let mut vec_add = vec![add_sign];
vec_add.extend(add.byte_record());
let vec_add = if self.drop_equal_fields {
self.fill_modified_and_drop_equal_fields(
add_sign,
add.byte_record(),
field_indices.as_slice(),
)
} else {
let mut tmp = vec![add_sign];
tmp.extend(add.byte_record());
tmp
};

self.csv_writer.write_record(vec_add)
},
DiffByteRecord::Delete(del) => {
Expand All @@ -349,6 +395,28 @@ impl<W: Write> CsvDiffWriter<W> {
},
}
}

/// Builds the output row for one side of a modified record.
///
/// The returned vector holds `prefix` at position 0, followed by one slot
/// per field of `byte_record`. Only the fields listed in `self.key_fields`
/// and in `modified_field_indices` keep their value; every other slot is an
/// empty byte slice.
///
/// # Panics
///
/// Panics if a key field index or a modified field index is not a valid
/// field index of `byte_record`.
fn fill_modified_and_drop_equal_fields<'a>(
    &self,
    prefix: &'a [u8],
    byte_record: &'a ByteRecord,
    modified_field_indices: &[usize],
) -> Vec<&'a [u8]> {
    const EMPTY: &[u8] = b"";

    // One extra slot in front of the record's fields carries the prefix.
    let row_len = byte_record.len() + 1;
    let mut row: Vec<&'a [u8]> = Vec::with_capacity(row_len);
    row.push(prefix);
    // Every field starts out empty, which is how equal fields end up in the output.
    row.resize(row_len, EMPTY);

    // Key fields and modified fields are the only values copied over.
    let kept_indices = self
        .key_fields
        .iter()
        .chain(modified_field_indices.iter())
        .copied();
    for field_idx in kept_indices {
        // Shift by one to account for the prefix slot.
        row[field_idx + 1] = &byte_record[field_idx];
    }

    row
}
}

trait WriteDiffResultHeader {
Expand Down
142 changes: 142 additions & 0 deletions tests/test_diff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,148 @@ diffresult;h1;h2;h3
assert_eq!(got.as_str(), expected);
}

/// With `--drop-equal-fields`, a single modified row only shows its key
/// field and the changed field; added and deleted rows are printed in full.
#[test]
fn diff_drop_equal_fields_flag_on_modified_rows_one_row_modified() {
    let wrk = Workdir::new("diff_drop_equal_fields_flag_on_modified_rows_one_row_modified");

    wrk.create(
        "left.csv",
        vec![
            svec!["h1", "h2", "h3"],
            svec!["1", "deleted", "row"],
            svec!["2", "baz", "quux"],
            svec!["3", "corge", "grault"],
        ],
    );
    wrk.create(
        "right.csv",
        vec![
            svec!["h1", "h2", "h3"],
            svec!["2", "baz", "quux_modified"],
            svec!["3", "corge", "grault"],
            svec!["4", "added", "row"],
        ],
    );

    let mut diff_cmd = wrk.command("diff");
    diff_cmd.args(["left.csv", "right.csv", "--drop-equal-fields"]);
    let actual: String = wrk.stdout(&mut diff_cmd);

    // Row 2 is the only modified row: h2 is equal on both sides and is blanked.
    let expected = [
        "diffresult,h1,h2,h3",
        "-,1,deleted,row",
        "-,2,,quux",
        "+,2,,quux_modified",
        "+,4,added,row",
    ]
    .join("\n");
    assert_eq!(actual, expected);
}

/// With `--drop-equal-fields`, every equal non-key field of a modified row
/// is blanked, even when several of them are equal.
#[test]
fn diff_drop_equal_fields_flag_on_modified_rows_multiple_fields_on_one_row_equal() {
    let wrk = Workdir::new(
        "diff_drop_equal_fields_flag_on_modified_rows_multiple_fields_on_one_row_equal",
    );

    wrk.create(
        "left.csv",
        vec![
            svec!["h1", "h2", "h3", "h4"],
            svec!["1", "deleted", "row", "foo"],
            svec!["2", "baz", "quux", "drix"],
            svec!["3", "corge", "grault", "bar"],
        ],
    );
    wrk.create(
        "right.csv",
        vec![
            svec!["h1", "h2", "h3", "h4"],
            svec!["2", "baz", "quux", "drix_modified"],
            svec!["3", "corge", "grault", "bar"],
            svec!["4", "added", "row", "new"],
        ],
    );

    let mut diff_cmd = wrk.command("diff");
    diff_cmd.args(["left.csv", "right.csv", "--drop-equal-fields"]);
    let actual: String = wrk.stdout(&mut diff_cmd);

    // Row 2 differs only in h4, so both h2 and h3 are blanked.
    let expected = [
        "diffresult,h1,h2,h3,h4",
        "-,1,deleted,row,foo",
        "-,2,,,drix",
        "+,2,,,drix_modified",
        "+,4,added,row,new",
    ]
    .join("\n");
    assert_eq!(actual, expected);
}

/// With `--drop-equal-fields`, each modified row is blanked independently,
/// so rows changed in different columns keep different fields.
#[test]
fn diff_drop_equal_fields_flag_on_modified_rows_multiple_rows_modified_in_different_columns() {
    let wrk = Workdir::new(
        "diff_drop_equal_fields_flag_on_modified_rows_multiple_rows_modified_in_different_columns",
    );

    wrk.create(
        "left.csv",
        vec![
            svec!["h1", "h2", "h3"],
            svec!["1", "deleted", "row"],
            svec!["2", "baz", "quux"],
            svec!["3", "corge", "grault"],
        ],
    );
    wrk.create(
        "right.csv",
        vec![
            svec!["h1", "h2", "h3"],
            svec!["2", "baz", "quux_modified"],
            svec!["3", "corge_modified", "grault"],
            svec!["4", "added", "row"],
        ],
    );

    let mut diff_cmd = wrk.command("diff");
    diff_cmd.args(["left.csv", "right.csv", "--drop-equal-fields"]);
    let actual: String = wrk.stdout(&mut diff_cmd);

    // Row 2 changed in h3 (h2 blanked); row 3 changed in h2 (h3 blanked).
    let expected = [
        "diffresult,h1,h2,h3",
        "-,1,deleted,row",
        "-,2,,quux",
        "+,2,,quux_modified",
        "-,3,corge,",
        "+,3,corge_modified,",
        "+,4,added,row",
    ]
    .join("\n");
    assert_eq!(actual, expected);
}

/// With `--drop-equal-fields` and a composite key on the first and last
/// columns, both key fields stay visible in modified rows.
#[test]
fn diff_drop_equal_fields_flag_on_modified_rows_multiple_key_fields_far_apart() {
    let wrk =
        Workdir::new("diff_drop_equal_fields_flag_on_modified_rows_multiple_key_fields_far_apart");

    wrk.create(
        "left.csv",
        vec![
            svec!["h1", "h2", "h3", "h4"],
            svec!["1", "deleted", "row", "id1"],
            svec!["2", "baz", "quux", "id2"],
            svec!["3", "corge", "grault", "id3"],
        ],
    );
    wrk.create(
        "right.csv",
        vec![
            svec!["h1", "h2", "h3", "h4"],
            svec!["2", "baz", "quux_modified", "id2"],
            svec!["3", "corge_modified", "grault", "id3"],
            svec!["3", "added", "row", "id_new"],
        ],
    );

    let mut diff_cmd = wrk.command("diff");
    // The key consists of the first and the last column.
    diff_cmd.args(["left.csv", "right.csv", "--drop-equal-fields", "-k", "0,3"]);
    let actual: String = wrk.stdout(&mut diff_cmd);

    // h1 and h4 are key fields and are always printed; equal non-key fields are blanked.
    let expected = [
        "diffresult,h1,h2,h3,h4",
        "-,1,deleted,row,id1",
        "-,2,,quux,id2",
        "+,2,,quux_modified,id2",
        "-,3,corge,,id3",
        "+,3,corge_modified,,id3",
        "+,3,added,row,id_new",
    ]
    .join("\n");
    assert_eq!(actual, expected);
}

fn create_file_with_delim(wrk: &Workdir, file_path_new: &str, file_path: &str, delimiter: u8) {
let mut select_cmd = wrk.command("select");
select_cmd.args(["1-", file_path]);
Expand Down

0 comments on commit 2beb709

Please sign in to comment.