From d56aa7feb30cad727bfa158964dfe6f96d2455c7 Mon Sep 17 00:00:00 2001 From: Nico Wagner Date: Wed, 21 Jun 2023 08:25:56 +0200 Subject: [PATCH] Concatenate records without duplicates (#628) --- src/bin/pica/commands/cat.rs | 60 ++++++++++++++++++- tests/snapshot/cat/016-cat-unique-idn.toml | 6 ++ tests/snapshot/cat/017-cat-unique-hash.toml | 6 ++ tests/snapshot/cat/018-cat-unique-no-idn.toml | 6 ++ 4 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 tests/snapshot/cat/016-cat-unique-idn.toml create mode 100644 tests/snapshot/cat/017-cat-unique-hash.toml create mode 100644 tests/snapshot/cat/018-cat-unique-no-idn.toml diff --git a/src/bin/pica/commands/cat.rs b/src/bin/pica/commands/cat.rs index 2bbf0a013..6fe85b53c 100644 --- a/src/bin/pica/commands/cat.rs +++ b/src/bin/pica/commands/cat.rs @@ -1,8 +1,11 @@ +use std::collections::BTreeSet; use std::ffi::OsString; use std::path::PathBuf; -use clap::Parser; +use clap::{Parser, ValueEnum}; +use pica_path::PathExt; use pica_record::io::{ReaderBuilder, RecordsIterator, WriterBuilder}; +use pica_record::ByteRecord; use serde::{Deserialize, Serialize}; use crate::config::Config; @@ -19,6 +22,13 @@ pub(crate) struct CatConfig { pub(crate) gzip: Option, } +#[derive(Clone, Debug, PartialEq, Eq, Default, ValueEnum)] +enum Strategy { + #[default] + Idn, + Hash, +} + /// Concatenate records from multiple files #[derive(Parser, Debug)] pub(crate) struct Cat { @@ -26,6 +36,29 @@ pub(crate) struct Cat { #[arg(short, long)] skip_invalid: bool, + /// Skip duplicate records + #[arg(long, short)] + unique: bool, + + /// Use the given strategy to determine duplicate records. + /// + /// The `idn` strategy (default) is used to distinguish records by + /// IDN (first value of field `003@.0`) and `hash` compares + /// the SHA-256 checksums over all fields of a record. + /// + /// Note: If a record doesn't contain an IDN value and the `idn` + /// strategy is selected, the record is ignored and won't be + /// written to the output. 
+ #[arg( + long, + requires = "unique", + default_value = "idn", + value_name = "strategy", + hide_possible_values = true, + hide_default_value = true + )] + unique_strategy: Strategy, + /// Append to the given file, do not overwrite #[arg(long)] append: bool, @@ -58,6 +91,21 @@ impl Cat { config.global ); + let mut seen = BTreeSet::new(); + let key = |record: &ByteRecord| -> String { + match self.unique_strategy { + Strategy::Idn => record + .idn() + .map(ToString::to_string) + .unwrap_or_default(), + Strategy::Hash => record + .sha256() + .iter() + .map(|b| format!("{:02x}", b)) + .collect::(), + } + }; + let mut writer = WriterBuilder::new() .gzip(gzip_compression) .append(self.append) @@ -87,6 +135,16 @@ impl Cat { } } Ok(record) => { + if self.unique { + let k = key(&record); + + if k.is_empty() || seen.contains(&k) { + continue; + } + + seen.insert(k); + } + writer.write_byte_record(&record)?; if let Some(ref mut writer) = tee_writer { writer.write_byte_record(&record)?; diff --git a/tests/snapshot/cat/016-cat-unique-idn.toml b/tests/snapshot/cat/016-cat-unique-idn.toml new file mode 100644 index 000000000..efb8debf5 --- /dev/null +++ b/tests/snapshot/cat/016-cat-unique-idn.toml @@ -0,0 +1,6 @@ +bin.name = "pica" +args = "cat --unique --unique-strategy idn" +status = "success" +stderr = "" +stdin = "003@ \u001f0123456789X\u001e012A \u001fa1\u001e\n003@ \u001f0123456789X\u001e012A \u001fa2\u001e\n" +stdout = "003@ \u001f0123456789X\u001e012A \u001fa1\u001e\n" diff --git a/tests/snapshot/cat/017-cat-unique-hash.toml b/tests/snapshot/cat/017-cat-unique-hash.toml new file mode 100644 index 000000000..ad7231b4a --- /dev/null +++ b/tests/snapshot/cat/017-cat-unique-hash.toml @@ -0,0 +1,6 @@ +bin.name = "pica" +args = "cat --unique --unique-strategy hash" +status = "success" +stderr = "" +stdin = "003@ \u001f0123456789X\u001e012A \u001fa1\u001e\n003@ \u001f0123456789X\u001e012A \u001fa2\u001e\n" +stdout = "003@ \u001f0123456789X\u001e012A \u001fa1\u001e\n003@ 
\u001f0123456789X\u001e012A \u001fa2\u001e\n" diff --git a/tests/snapshot/cat/018-cat-unique-no-idn.toml b/tests/snapshot/cat/018-cat-unique-no-idn.toml new file mode 100644 index 000000000..56c396277 --- /dev/null +++ b/tests/snapshot/cat/018-cat-unique-no-idn.toml @@ -0,0 +1,6 @@ +bin.name = "pica" +args = "cat --unique --unique-strategy idn" +status = "success" +stderr = "" +stdin = "012A \u001fa1\u001e\n003@ \u001f0123456789X\u001e012A \u001fa2\u001e\n" +stdout = "003@ \u001f0123456789X\u001e012A \u001fa2\u001e\n"