From 2abe836166a68c1d12572b144d20c1648c80400b Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 26 Aug 2024 14:36:16 +0200 Subject: [PATCH 01/14] Log progress when applying updates in `map` module --- src/manipulator/map.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/manipulator/map.rs b/src/manipulator/map.rs index 64ebdd36..55d30a7d 100644 --- a/src/manipulator/map.rs +++ b/src/manipulator/map.rs @@ -146,7 +146,12 @@ impl Manipulator for MapAnnos { }; map_impl.run()? }; - graph.apply_update(&mut updates, |_| {})?; + let progress = ProgressReporter::new_unknown_total_work(tx, step_id)?; + graph.apply_update(&mut updates, move |msg| { + if let Err(e) = progress.info(&format!("Applying `map` updates: {msg}")) { + log::error!("{e}"); + } + })?; Ok(()) } From f3d1e33794792aac6a08e6de17b6e333e9d1bc15 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 26 Aug 2024 14:41:18 +0200 Subject: [PATCH 02/14] Add "id_column" parameter to table exporter --- CHANGELOG.md | 2 ++ src/exporter/table.rs | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c039fa1e..eea5e8a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add `conllu` as export format - import of `conllu` now supports enhanced dependencies - Adds `saltxml` export format +- The `table` exporter now supports the `id_column` parameter to + enable/disable the ID column. ### Changed diff --git a/src/exporter/table.rs b/src/exporter/table.rs index 5344e240..e897bca9 100644 --- a/src/exporter/table.rs +++ b/src/exporter/table.rs @@ -99,6 +99,8 @@ pub struct ExportTable { /// ``` #[serde(default, deserialize_with = "deserialize_annotation_component_seq")] outgoing: Vec, + /// If `true` (the default), always output a column with the ID of the node. + id_column: bool, } impl Default for ExportTable { @@ -110,6 +112,7 @@ impl Default for ExportTable { no_value: String::default(), ingoing: vec![], outgoing: vec![], + id_column: true, } } } @@ -265,10 +268,13 @@ impl ExportTable { let id_name = format!("id_{qname}"); let index = if let Some(index) = index_map.get(&qname) { *index - } else { + } else if self.id_column { index_map.insert(qname.to_string(), index_map.len()); index_map.insert(id_name.to_string(), index_map.len()); index_map.len() - 2 + } else { + index_map.insert(qname.to_string(), index_map.len()); + index_map.len() - 1 }; let value = node_annos .get_value_for_item(&rn, &anno_key)? From faae0cb8df4d1804256b2874b727ed8a1b643d86 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 26 Aug 2024 15:08:57 +0200 Subject: [PATCH 03/14] Log progress when applying updates in `revise` module --- src/manipulator/map.rs | 2 +- src/manipulator/re.rs | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/manipulator/map.rs b/src/manipulator/map.rs index 55d30a7d..8521cd4c 100644 --- a/src/manipulator/map.rs +++ b/src/manipulator/map.rs @@ -148,7 +148,7 @@ impl Manipulator for MapAnnos { }; let progress = ProgressReporter::new_unknown_total_work(tx, step_id)?; graph.apply_update(&mut updates, move |msg| { - if let Err(e) = progress.info(&format!("Applying `map` updates: {msg}")) { + if let Err(e) = progress.info(&format!("`map` updates: {msg}")) { log::error!("{e}"); } })?; diff --git a/src/manipulator/re.rs b/src/manipulator/re.rs index 4fa381c4..5872e26b 100644 --- a/src/manipulator/re.rs +++ b/src/manipulator/re.rs @@ -774,7 +774,12 @@ impl Manipulator for Revise { remove_subgraph(graph, &mut update, node_name)?; } } - graph.apply_update(&mut update, |_| {})?; + graph.apply_update(&mut update, move |msg| { + if let Err(e) = progress_reporter.info(&format!("`revise` updates: {msg}")) { + log::error!("{e}"); + } + })?; + Ok(()) } } From 995004d08a3845d9291e4ec3a24b89c288276b20 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 26 Aug 2024 15:47:02 +0200 Subject: [PATCH 04/14] Log progress when applying updates in `filter` module --- src/manipulator/filter.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/manipulator/filter.rs b/src/manipulator/filter.rs index a395aacb..23cb66cf 100644 --- a/src/manipulator/filter.rs +++ b/src/manipulator/filter.rs @@ -12,6 +12,8 @@ use graphannis_core::graph::{ANNIS_NS, NODE_NAME_KEY, NODE_TYPE_KEY}; use serde::Deserialize; use struct_field_names_as_array::FieldNamesAsSlice; +use crate::progress::ProgressReporter; + use super::Manipulator; /// This module acts as a positive filter, i. e., all nodes that do not match the query and are not real tokens @@ -56,8 +58,8 @@ impl Manipulator for FilterNodes { &self, graph: &mut graphannis::AnnotationGraph, _workflow_directory: &std::path::Path, - _step_id: crate::StepID, - _tx: Option, + step_id: crate::StepID, + tx: Option, ) -> Result<(), Box> { let mut update = GraphUpdate::default(); let query = aql::parse(&self.query, false)?; @@ -126,7 +128,12 @@ impl Manipulator for FilterNodes { } } } - graph.apply_update(&mut update, |_| {})?; + let progress = ProgressReporter::new_unknown_total_work(tx, step_id)?; + graph.apply_update(&mut update, move |msg| { + if let Err(e) = progress.info(&format!("`filter` updates: {msg}")) { + log::error!("{e}"); + } + })?; Ok(()) } } From cc1e360fe26b21767e7558dfe66bf49a26333d04 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 28 Aug 2024 10:09:55 +0200 Subject: [PATCH 05/14] Advocate for the more robust ${N} backreference syntax. --- src/manipulator/map.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/manipulator/map.rs b/src/manipulator/map.rs index 8521cd4c..fa4da823 100644 --- a/src/manipulator/map.rs +++ b/src/manipulator/map.rs @@ -78,7 +78,7 @@ use super::Manipulator; /// This would add a new annotation value "complidoged" to any token with the value "complicated". /// /// The `replacement` value can contain back references to the regular -/// expression (e.g. "$0" for the whole match or "$1" for the first match +/// expression (e.g. "${0}" for the whole match or "${1}" for the first match /// group). /// ```toml /// [[rules]] @@ -86,7 +86,7 @@ use super::Manipulator; /// target = 1 /// ns = "" /// name = "abbr" -/// value = {target = 1, search = "([A-Z])[a-z]+ ([A-Z])[a-z]+", replacement = "$1$2"} +/// value = {target = 1, search = "([A-Z])[a-z]+ ([A-Z])[a-z]+", replacement = "${1}${2}"} /// ``` /// This example would add an annotation with the value "NY". /// @@ -539,7 +539,7 @@ name = "abbr" [rules.value] target = 1 -replacements = [["([A-Z])[a-z]+ ([A-Z])[a-z]+", "$1$2"]] +replacements = [["([A-Z])[a-z]+ ([A-Z])[a-z]+", "${1}${2}"]] "#; let m: Mapping = toml::from_str(config).unwrap(); From 964da24242af6a87c7e563f806056bc357d79327 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 27 Aug 2024 12:17:26 +0200 Subject: [PATCH 06/14] Normalize annotation value to Unicode NFC before applying the mapping regex --- Cargo.toml | 1 + src/manipulator/map.rs | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d621d2a6..535628e1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,6 +53,7 @@ thiserror = "1.0" toml = "0.8.0" tracing-subscriber = {version = "0.3", features = ["env-filter"]} umya-spreadsheet = "2.0.1" +unicode-normalization = "0.1.23" url = "2.5.2" xml-rs = "0.8" zip = "0.6.6" diff --git a/src/manipulator/map.rs b/src/manipulator/map.rs index fa4da823..7ad1d294 100644 --- a/src/manipulator/map.rs +++ b/src/manipulator/map.rs @@ -4,6 +4,7 @@ use std::{ path::{Path, PathBuf}, }; +use super::Manipulator; use crate::{ progress::ProgressReporter, util::token_helper::{TokenHelper, TOKEN_KEY}, @@ -23,8 +24,7 @@ use graphannis_core::graph::{ use regex::Regex; use serde_derive::Deserialize; use struct_field_names_as_array::FieldNamesAsSlice; - -use super::Manipulator; +use unicode_normalization::UnicodeNormalization; /// Creates new or updates annotations based on existing annotation values. /// @@ -120,10 +120,11 @@ impl Manipulator for MapAnnos { } }; let config = read_config(read_from_path.as_path())?; + let progress = ProgressReporter::new(tx.clone(), step_id.clone(), config.rules.len())?; + progress.info("Ensure all graph storages are loaded.")?; graph.ensure_loaded_all()?; - let progress = ProgressReporter::new(tx.clone(), step_id.clone(), config.rules.len())?; let mut updates = { let tok_helper = TokenHelper::new(graph)?; let all_part_of_gs: Vec<_> = graph @@ -249,7 +250,7 @@ impl Rule { target, replacements, } => { - let mut val = target.resolve_value(graph, mg, ' ')?; + let mut val = target.resolve_value(graph, mg, ' ')?.nfc().to_string(); for (search, replace) in replacements { // replace all occurences of the value let search = Regex::new(search)?; @@ -275,6 +276,8 @@ impl<'a> MapperImpl<'a> { let mut update = GraphUpdate::default(); for rule in self.config.rules.clone() { + self.progress + .info(&format!("Applying rule with query `{}`", &rule.query))?; let query = graphannis::aql::parse(&rule.query, false) .with_context(|| format!("could not parse query '{}'", &rule.query))?; let result_it = From 63507aeac2daf277a134a7f083dccc80d4dc67fa Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 27 Aug 2024 15:29:28 +0200 Subject: [PATCH 07/14] Add test case for replacing macrons similar in RIDGES --- Cargo.toml | 1 - src/manipulator/map.rs | 92 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 90 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 535628e1..d621d2a6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,7 +53,6 @@ thiserror = "1.0" toml = "0.8.0" tracing-subscriber = {version = "0.3", features = ["env-filter"]} umya-spreadsheet = "2.0.1" -unicode-normalization = "0.1.23" url = "2.5.2" xml-rs = "0.8" zip = "0.6.6" diff --git a/src/manipulator/map.rs b/src/manipulator/map.rs index 7ad1d294..6419cf4e 100644 --- a/src/manipulator/map.rs +++ b/src/manipulator/map.rs @@ -24,7 +24,6 @@ use graphannis_core::graph::{ use regex::Regex; use serde_derive::Deserialize; use struct_field_names_as_array::FieldNamesAsSlice; -use unicode_normalization::UnicodeNormalization; /// Creates new or updates annotations based on existing annotation values. /// @@ -250,7 +249,7 @@ impl Rule { target, replacements, } => { - let mut val = target.resolve_value(graph, mg, ' ')?.nfc().to_string(); + let mut val = target.resolve_value(graph, mg, ' ')?; for (search, replace) in replacements { // replace all occurences of the value let search = Regex::new(search)?; @@ -560,6 +559,51 @@ replacements = [["([A-Z])[a-z]+ ([A-Z])[a-z]+", "${1}${2}"]] assert_eq!("NY", result); } + #[test] + fn test_ridges_clean_resolver() { + let config = r#" +[[rules]] +query = "tok" +target = 1 +ns = "test" +name = "clean" + +[rules.value] +target = 1 +replacements = [ + ['ð', 'der'], + ['(.*)(.)\u0304(.*)', '$1$2/MACRON_M/$3|$1$2/MACRON_N/$3'], + ['([^|]*)([^|])\u0304([^|]*)', '$1$2/MACRON_M/$3|$1$2/MACRON_N/$3'], + ['/MACRON_M/', 'm'], + ['/MACRON_N/', 'n'], +] +"#; + + let m: Mapping = toml::from_str(config).unwrap(); + + let g = tokens_with_macrons().unwrap(); + + let singlemacron = g + .get_node_annos() + .exact_anno_search(Some("annis"), "tok", ValueSearch::Some("anðthalbē")) + .next() + .unwrap() + .unwrap(); + + let result = m.rules[0].resolve_value(&g, &[singlemacron]).unwrap(); + assert_eq!("anderthalbem|anderthalben", result); + + let multiple_macron = g + .get_node_annos() + .exact_anno_search(Some("annis"), "tok", ValueSearch::Some("ellēbogē")) + .next() + .unwrap() + .unwrap(); + + let result = m.rules[0].resolve_value(&g, &[multiple_macron]).unwrap(); + assert_eq!("ellembogem|ellenbogem|ellembogen|ellenbogen", result); + } + #[test] fn test_map_spans() { let mut updates = GraphUpdate::new(); @@ -732,6 +776,50 @@ value = "PROPN" Ok(g) } + fn tokens_with_macrons() -> Result> { + let mut g = AnnotationGraph::with_default_graphstorages(true)?; + let mut u = GraphUpdate::default(); + u.add_event(UpdateEvent::AddNode { + node_name: "doc".to_string(), + node_type: "corpus".to_string(), + })?; + for (i, text) in [ + "ein", + "kraut", + "wechſzt", + "etwan", + "anðthalbē", + "ellēbogē", + "hoch", + ] + .iter() + .enumerate() + { + let node_name = format!("doc#t{}", &i + &1); + u.add_event(UpdateEvent::AddNode { + node_name: node_name.to_string(), + node_type: "node".to_string(), + })?; + u.add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.to_string(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "tok".to_string(), + anno_value: text.to_string(), + })?; + if i > 0 { + u.add_event(UpdateEvent::AddEdge { + source_node: format!("doc#t{i}"), + target_node: node_name.to_string(), + layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::Ordering.to_string(), + component_name: "".to_string(), + })?; + } + } + g.apply_update(&mut u, |_| {})?; + Ok(g) + } + fn target_graph(on_disk: bool) -> Result> { let mut g = source_graph(on_disk)?; let mut u = GraphUpdate::default(); From 10714481ba4b05705a06034fe3f0d43c723dd4c4 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Thu, 29 Aug 2024 15:27:34 +0200 Subject: [PATCH 08/14] Provide default value for "id_column" in "table" exporter --- src/exporter/table.rs | 7 ++++++- src/manipulator/map.rs | 30 +++++++++++++++--------------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/exporter/table.rs b/src/exporter/table.rs index e897bca9..abaa228f 100644 --- a/src/exporter/table.rs +++ b/src/exporter/table.rs @@ -100,9 +100,14 @@ pub struct ExportTable { #[serde(default, deserialize_with = "deserialize_annotation_component_seq")] outgoing: Vec, /// If `true` (the default), always output a column with the ID of the node. + #[serde(default = "default_id_column")] id_column: bool, } +fn default_id_column() -> bool { + true +} + impl Default for ExportTable { fn default() -> Self { Self { @@ -112,7 +117,7 @@ impl Default for ExportTable { no_value: String::default(), ingoing: vec![], outgoing: vec![], - id_column: true, + id_column: default_id_column(), } } } diff --git a/src/manipulator/map.rs b/src/manipulator/map.rs index 6419cf4e..34be7cad 100644 --- a/src/manipulator/map.rs +++ b/src/manipulator/map.rs @@ -45,7 +45,7 @@ use struct_field_names_as_array::FieldNamesAsSlice; /// A `target` can also be a list. In this case, a new span is created that /// covers the same token as the referenced nodes of the match. /// ```toml -/// [[rules]] +/// [[rules]] /// query = "tok=/more/ . tok" /// target = [1,2] /// ns = "mapper" @@ -56,7 +56,7 @@ use struct_field_names_as_array::FieldNamesAsSlice; /// Instead of a fixed value, you can also use an existing annotation value /// from the matched nodes copy the value. /// ```toml -/// [[rules]] +/// [[rules]] /// query = "tok=\"complicated\"" /// target = 1 /// ns = "" @@ -67,7 +67,7 @@ use struct_field_names_as_array::FieldNamesAsSlice; /// It is also possible to replace all occurences in the original value that /// match a `search` regular expression with a `replacement` value. /// ```toml -/// [[rules]] +/// [[rules]] /// query = "tok=\"complicated\"" /// target = 1 /// ns = "" @@ -80,7 +80,7 @@ use struct_field_names_as_array::FieldNamesAsSlice; /// expression (e.g. "${0}" for the whole match or "${1}" for the first match /// group). /// ```toml -/// [[rules]] +/// [[rules]] /// query = "tok=\"New York\"" /// target = 1 /// ns = "" @@ -449,10 +449,10 @@ mod tests { g.apply_update(&mut updates, |_msg| {}).unwrap(); let config = r#" - [[rules]] + [[rules]] query = "tok" target = 1 - ns = "test_ns" + ns = "test_ns" name = "test" value = {copy = 1} "#; @@ -533,10 +533,10 @@ mod tests { #[test] fn test_parse_complicated_replace() { let config = r#" -[[rules]] +[[rules]] query = "tok=\"New York\"" target = 1 -ns = "" +ns = "" name = "abbr" [rules.value] @@ -562,10 +562,10 @@ replacements = [["([A-Z])[a-z]+ ([A-Z])[a-z]+", "${1}${2}"]] #[test] fn test_ridges_clean_resolver() { let config = r#" -[[rules]] +[[rules]] query = "tok" target = 1 -ns = "test" +ns = "test" name = "clean" [rules.value] @@ -613,7 +613,7 @@ replacements = [ g.apply_update(&mut updates, |_msg| {}).unwrap(); let config = r#" -[[rules]] +[[rules]] query = "tok=/more/ . tok" target = [1,2] ns = "mapper" @@ -659,26 +659,26 @@ value = "comparison" fn main_test(on_disk: bool) -> Result<(), Box> { let config = r#" -[[rules]] +[[rules]] query = "tok=/I/" target = 1 ns = "" name = "pos" -value = "PRON" +value = "PRON" [[rules]] query = "tok=/am/" target = 1 ns = "" name = "pos" -value = "VERB" +value = "VERB" [[rules]] query = "tok=/in/" target = 1 ns = "" name = "pos" -value = "ADP" +value = "ADP" [[rules]] query = "tok=/New York/" From 8fd85073e0bc74080c52192fbaabc521e0e7f379 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Thu, 29 Aug 2024 19:05:03 +0200 Subject: [PATCH 09/14] Allow to import single files --- src/util/graphupdate.rs | 129 +++++++++++++++++++++++++--------------- 1 file changed, 81 insertions(+), 48 deletions(-) diff --git a/src/util/graphupdate.rs b/src/util/graphupdate.rs index 26cf388c..4cb582c0 100644 --- a/src/util/graphupdate.rs +++ b/src/util/graphupdate.rs @@ -22,63 +22,96 @@ fn add_subcorpora( // Get the files and sort them according to their path, to get a predictable // order of adding the documents to the graph. - let mut files_in_directory = Vec::new(); - for entry in std::fs::read_dir(file_path)? { - let entry = entry?; - files_in_directory.push(entry); - } - files_in_directory.sort_by_key(|dir_entry| dir_entry.path()); - for entry in files_in_directory { - let entry_type = entry.file_type()?; - let entry_path = entry.path(); - let subcorpus_name = entry_path + if file_path.is_file() + && file_endings + .iter() + .any(|ext| file_path.extension().unwrap_or_default().to_string_lossy() == *ext) + { + // Add the file itself as document + let subcorpus_name = file_path .file_stem() - .map(|f| f.to_os_string()) - .unwrap_or_else(|| entry.file_name()) - .to_string_lossy() - .to_string(); + .map(|p| p.to_string_lossy().to_string()) + .unwrap_or_else(|| "document".to_string()); let node_name = format!("{}/{}", parent_corpus, subcorpus_name); - let add_node = if entry_type.is_file() { - if let Some(actual_ending) = entry.path().extension() { - file_endings - .iter() - .any(|ext| *ext == actual_ending.to_string_lossy().as_ref()) - } else { - false - } - } else { - entry_type.is_dir() - }; - if add_node { - u.add_event(UpdateEvent::AddNode { - node_name: node_name.clone(), - node_type: "corpus".to_string(), - })?; - u.add_event(UpdateEvent::AddEdge { - source_node: node_name.clone(), - target_node: parent_corpus.to_string(), - layer: ANNIS_NS.to_string(), - component_type: AnnotationComponentType::PartOf.to_string(), - component_name: "".to_string(), - })?; + u.add_event(UpdateEvent::AddNode { + node_name: node_name.clone(), + node_type: "corpus".to_string(), + })?; + u.add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.clone(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "doc".to_string(), + anno_value: subcorpus_name.to_string(), + })?; + u.add_event(UpdateEvent::AddEdge { + source_node: node_name.clone(), + target_node: parent_corpus.to_string(), + layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::PartOf.to_string(), + component_name: "".to_string(), + })?; + let result = (file_path.to_path_buf(), node_name); + Ok(vec![result]) + } else { + let mut files_in_directory = Vec::new(); + for entry in std::fs::read_dir(file_path)? { + let entry = entry?; + files_in_directory.push(entry); + } + files_in_directory.sort_by_key(|dir_entry| dir_entry.path()); - if entry_type.is_dir() { - result.extend(add_subcorpora(u, &entry.path(), &node_name, file_endings)?); - } else if entry_type.is_file() { - // Also add the special "annis:doc" label to mark this as document - u.add_event(UpdateEvent::AddNodeLabel { + for entry in files_in_directory { + let entry_type = entry.file_type()?; + let entry_path = entry.path(); + let subcorpus_name = entry_path + .file_stem() + .map(|f| f.to_os_string()) + .unwrap_or_else(|| entry.file_name()) + .to_string_lossy() + .to_string(); + let node_name = format!("{}/{}", parent_corpus, subcorpus_name); + let add_node = if entry_type.is_file() { + if let Some(actual_ending) = entry.path().extension() { + file_endings + .iter() + .any(|ext| *ext == actual_ending.to_string_lossy().as_ref()) + } else { + false + } + } else { + entry_type.is_dir() + }; + if add_node { + u.add_event(UpdateEvent::AddNode { node_name: node_name.clone(), - anno_ns: ANNIS_NS.to_string(), - anno_name: "doc".to_string(), - anno_value: subcorpus_name.to_string(), + node_type: "corpus".to_string(), })?; - // Only add the corpus graph leafs to the result vector - result.push((entry.path(), node_name)); + u.add_event(UpdateEvent::AddEdge { + source_node: node_name.clone(), + target_node: parent_corpus.to_string(), + layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::PartOf.to_string(), + component_name: "".to_string(), + })?; + + if entry_type.is_dir() { + result.extend(add_subcorpora(u, &entry.path(), &node_name, file_endings)?); + } else if entry_type.is_file() { + // Also add the special "annis:doc" label to mark this as document + u.add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.clone(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "doc".to_string(), + anno_value: subcorpus_name.to_string(), + })?; + // Only add the corpus graph leafs to the result vector + result.push((entry.path(), node_name)); + } } } + Ok(result) } - Ok(result) } pub fn root_corpus_from_path(root_path: &Path) -> Result { From 678578e1ed8cebf6e2393c9175079985f3f6c881 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Thu, 29 Aug 2024 19:05:14 +0200 Subject: [PATCH 10/14] Do not export ID column value if not set --- src/exporter/table.rs | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/exporter/table.rs b/src/exporter/table.rs index abaa228f..87127a51 100644 --- a/src/exporter/table.rs +++ b/src/exporter/table.rs @@ -26,7 +26,7 @@ use super::Exporter; use crate::{ deserialize::{deserialize_anno_key, deserialize_annotation_component_seq}, - workflow::StatusMessage, + progress::ProgressReporter, }; /// This module exports all ordered nodes and nodes connected by coverage edges of any name into a table. @@ -140,9 +140,11 @@ impl Exporter for ExportTable { &self, graph: &graphannis::AnnotationGraph, output_path: &std::path::Path, - _step_id: crate::StepID, + step_id: crate::StepID, tx: Option, ) -> Result<(), Box> { + let progress = ProgressReporter::new_unknown_total_work(tx.clone(), step_id.clone())?; + let base_ordering = AnnotationComponent::new( AnnotationComponentType::Ordering, ANNIS_NS.into(), @@ -164,11 +166,7 @@ impl Exporter for ExportTable { .filter_map(|c| graph.get_graphstorage(c)) .collect_vec(); if coverage_storages.is_empty() { - if let Some(sender) = &tx { - sender.send(StatusMessage::Warning( - "No coverage storages available".to_string(), - ))?; - } + progress.warn("No coverage storages available")?; } let mut doc_node_to_start = BTreeMap::new(); for node in storage.source_nodes().flatten().filter(|n| { @@ -206,9 +204,16 @@ impl Exporter for ExportTable { } } } + let progress = ProgressReporter::new(tx, step_id, doc_node_to_start.len())?; + progress.info(&format!("Exporting {} documents", doc_node_to_start.len()))?; doc_node_to_start .into_iter() - .try_for_each(|(doc, start)| self.export_document(graph, output_path, doc, start))?; + .try_for_each(move |(doc, start)| -> anyhow::Result<()> { + progress.info(&format!("Exporting {doc} as table"))?; + self.export_document(graph, output_path, doc, start)?; + progress.worked(1)?; + Ok(()) + })?; Ok(()) } @@ -285,7 +290,9 @@ impl ExportTable { .get_value_for_item(&rn, &anno_key)? .ok_or(anyhow!("Annotation has no value"))?; data.insert(index, value.to_string()); - data.insert(index + 1, node_name.to_string()); + if self.id_column { + data.insert(index + 1, node_name.to_string()); + } } } if follow_edges { From a1defc1ce2e6c434521db2599f27c97fc354246b Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 30 Aug 2024 10:00:11 +0200 Subject: [PATCH 11/14] Add improvements on single files in changelog --- CHANGELOG.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eea5e8a6..ef59d237 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,10 +13,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Adds `saltxml` export format - The `table` exporter now supports the `id_column` parameter to enable/disable the ID column. +- Importers that map directories to (sub)-corpora and files to documents can now also importt the + corpus if the `path` argument points to a single file. ### Changed -- `exmaralda` import now ranks order of tlis higher than sorting by time value (more compatible with modern EXMARaLDA files) +- `exmaralda` import now ranks order of tlis higher than sorting by time value (more compatible with + modern EXMARaLDA files) - `xlsx` importer will connect spans to their corresponding segmentation node with coverage edges instead of connecting them with the base tokens generated for the timeline items. Thus, the configured connection between spans and base @@ -76,7 +79,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `visualize` graph operation that allows to output the current graph (somehwere in the conversion process) to SVG or DOT for debugging. - + ### Fixed - removed debug output @@ -173,7 +176,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed -- Fix non-resolved relative path when importing EXMARaLDA files. +- Fix non-resolved relative path when importing EXMARaLDA files. - Limit the table width when listing the module properties, so they fit in the current terminal. @@ -203,9 +206,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added simple chunker module based on [text-splitter](https://crates.io/crates/text-splitter). -- `check` can write check report to file +- `check` can write check report to file - `check` can test a corpus graph comparing results to an external corpus graph loaded from a graphANNIS database -- import `ptb` can now split node annotations to derive a label for the incoming edge, when a delimiter is provided +- import `ptb` can now split node annotations to derive a label for the incoming edge, when a delimiter is provided using `edge_delimiter`. E. g., `NP-sbj` will create a node of category `NP`, whose incoming edge has function `sbj`, given the following config is used: `edge_delimiter = "-"` - config attribute `stable_order` for exporting graphml enforces stable ordering of edges and nodes in output @@ -247,7 +250,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `exmaralda` returns error when there is no time value for a timeline item - fixed and simplified import of corpus node annotations - `exmaralda` import's paths to linked media files are relative to the working directory -- `xlsx` importer now adds `PartOf` relations to the document nodes +- `xlsx` importer now adds `PartOf` relations to the document nodes ## [0.4.0] - 2023-11-13 From 54d1deed9bdf7b206ea7e7242d4db8b574b2b60f Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 30 Aug 2024 10:44:37 +0200 Subject: [PATCH 12/14] Add test case for importing a single document --- src/util/graphupdate.rs | 28 ++++++++++++- ...raphupdate__tests__single_file_import.snap | 39 +++++++++++++++++++ 2 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 src/util/snapshots/annatto__util__graphupdate__tests__single_file_import.snap diff --git a/src/util/graphupdate.rs b/src/util/graphupdate.rs index 4cb582c0..cb65531d 100644 --- a/src/util/graphupdate.rs +++ b/src/util/graphupdate.rs @@ -117,7 +117,7 @@ fn add_subcorpora( pub fn root_corpus_from_path(root_path: &Path) -> Result { let norm_path = root_path.normalize()?; let root_name = norm_path - .file_name() + .file_stem() .unwrap_or_else(|| OsStr::new("root-corpus")) .to_string_lossy(); @@ -340,3 +340,29 @@ pub fn map_annotations>( Ok(span_id) } + +#[cfg(test)] +mod tests { + use std::path::Path; + + use graphannis::update::GraphUpdate; + use insta::assert_debug_snapshot; + + use super::import_corpus_graph_from_files; + + #[test] + fn single_file_import() { + let mut u = GraphUpdate::new(); + import_corpus_graph_from_files( + &mut u, + Path::new("tests/data/import/exmaralda/clean/import/exmaralda/test_doc.exb"), + &["exb"], + ) + .unwrap(); + + let result: graphannis_core::errors::Result> = u.iter().unwrap().collect(); + let result = result.unwrap(); + + assert_debug_snapshot!(result); + } +} diff --git a/src/util/snapshots/annatto__util__graphupdate__tests__single_file_import.snap b/src/util/snapshots/annatto__util__graphupdate__tests__single_file_import.snap new file mode 100644 index 00000000..e388c8f8 --- /dev/null +++ b/src/util/snapshots/annatto__util__graphupdate__tests__single_file_import.snap @@ -0,0 +1,39 @@ +--- +source: src/util/graphupdate.rs +expression: result +--- +[ + ( + 1, + AddNode { + node_name: "test_doc", + node_type: "corpus", + }, + ), + ( + 2, + AddNode { + node_name: "test_doc/test_doc", + node_type: "corpus", + }, + ), + ( + 3, + AddNodeLabel { + node_name: "test_doc/test_doc", + anno_ns: "annis", + anno_name: "doc", + anno_value: "test_doc", + }, + ), + ( + 4, + AddEdge { + source_node: "test_doc/test_doc", + target_node: "test_doc", + layer: "annis", + component_type: "PartOf", + component_name: "", + }, + ), +] From f3f0fe33830c3aa06d03646a921f915c7cd1599d Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 30 Aug 2024 10:48:30 +0200 Subject: [PATCH 13/14] Also test the return value itself --- ...__textgrid__tests__core_functionality.snap | 112 -------- ...mporter__exmaralda__tests__run_test-2.snap | 232 ----------------- ...mporter__exmaralda__tests__run_test-3.snap | 246 ------------------ ..._importer__exmaralda__tests__run_test.snap | 225 ---------------- ..._treetagger__tests__default_pos_lemma.snap | 137 ---------- src/util/graphupdate.rs | 15 +- ...raphupdate__tests__single_file_import.snap | 2 +- 7 files changed, 12 insertions(+), 957 deletions(-) delete mode 100644 src/exporter/snapshots/annatto__exporter__textgrid__tests__core_functionality.snap delete mode 100644 src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__run_test-2.snap delete mode 100644 src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__run_test-3.snap delete mode 100644 src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__run_test.snap delete mode 100644 src/importer/treetagger/snapshots/annatto__importer__treetagger__tests__default_pos_lemma.snap diff --git a/src/exporter/snapshots/annatto__exporter__textgrid__tests__core_functionality.snap b/src/exporter/snapshots/annatto__exporter__textgrid__tests__core_functionality.snap deleted file mode 100644 index e4827154..00000000 --- a/src/exporter/snapshots/annatto__exporter__textgrid__tests__core_functionality.snap +++ /dev/null @@ -1,112 +0,0 @@ ---- -source: src/exporter/textgrid.rs -expression: export.unwrap() ---- -File type = "ooTextFile" -Object class = "TextGrid" - -xmin = 0 -xmax = 5.55555 -tiers? -size = 5 -item []: - item [1]: - class = "IntervalTier" - name = "dipl" - xmin = 0 - xmax = 5.55555 - intervals: size = 4 - intervals [1]: - xmin = 0 - xmax = 2.22222 - text = "I'm" - intervals [2]: - xmin = 2.22222 - xmax = 3.33333 - text = "in" - intervals [3]: - xmin = 3.33333 - xmax = 4.44444 - text = "New" - intervals [4]: - xmin = 4.44444 - xmax = 5.55555 - text = "York" - item [2]: - class = "IntervalTier" - name = "lemma" - xmin = 0 - xmax = 5.55555 - intervals: size = 4 - intervals [1]: - xmin = 0 - xmax = 1.11111 - text = "I" - intervals [2]: - xmin = 1.11111 - xmax = 2.22222 - text = "be" - intervals [3]: - xmin = 2.22222 - xmax = 3.33333 - text = "in" - intervals [4]: - xmin = 3.33333 - xmax = 5.55555 - text = "New York" - item [3]: - class = "IntervalTier" - name = "norm" - xmin = 0 - xmax = 5.55555 - intervals: size = 4 - intervals [1]: - xmin = 0 - xmax = 1.11111 - text = "I" - intervals [2]: - xmin = 1.11111 - xmax = 2.22222 - text = "am" - intervals [3]: - xmin = 2.22222 - xmax = 3.33333 - text = "in" - intervals [4]: - xmin = 3.33333 - xmax = 5.55555 - text = "New York" - item [4]: - class = "IntervalTier" - name = "pos" - xmin = 0 - xmax = 5.55555 - intervals: size = 4 - intervals [1]: - xmin = 0 - xmax = 1.11111 - text = "PRON" - intervals [2]: - xmin = 1.11111 - xmax = 2.22222 - text = "VERB" - intervals [3]: - xmin = 2.22222 - xmax = 3.33333 - text = "ADP" - intervals [4]: - xmin = 3.33333 - xmax = 5.55555 - text = "ADP" - item [5]: - class = "IntervalTier" - name = "sentence" - xmin = 0 - xmax = 5.55555 - intervals: size = 1 - intervals [1]: - xmin = 0 - xmax = 5.55555 - text = "1" - - diff --git a/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__run_test-2.snap b/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__run_test-2.snap deleted file mode 100644 index 7b14c574..00000000 --- a/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__run_test-2.snap +++ /dev/null @@ -1,232 +0,0 @@ ---- -source: src/importer/exmaralda/tests.rs -expression: actual ---- - - - - - - - - - - - - - - - - - - corpus - - - corpus - - - dipl - norm - test_doc - corpus - - - tests/data/import/exmaralda/pass-no_tier_type/import/exmaralda/test_file.wav - file - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - 0-2 - I'm - I'm - dipl - node - - - 2-3 - in - in - dipl - node - - - 3-4 - New - New - dipl - node - - - 4-5 - York - York - dipl - node - - - 0-1 - I - norm - node - I - - - 1-2 - am - norm - node - am - - - 2-3 - in - norm - node - in - - - 3-5 - New York - norm - node - New York - - - 1 - 0-5 - dipl - node - - - 0-1 - norm - I - node - PRON - - - 1-2 - norm - be - node - VERB - - - 2-3 - norm - in - node - ADP - - - 3-5 - norm - New York - node - ADP - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__run_test-3.snap b/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__run_test-3.snap deleted file mode 100644 index a143d1a3..00000000 --- a/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__run_test-3.snap +++ /dev/null @@ -1,246 +0,0 @@ ---- -source: src/importer/exmaralda/tests.rs -expression: actual ---- - - - - - - - - - - - - - - - - - - - - - - - - - corpus - - - corpus - - - dipl - norm - corpus - personal-anno-value-1 - personal-anno-value-2 - was late for elicitation - was on time - test_doc - eng - deu - eng,eng - - - file - tests/data/import/exmaralda/clean/import/exmaralda/test_file.wav - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - dipl - node - 0-2.22222 - I'm - I'm - - - dipl - node - 2.22222-3.33333 - in - in - - - dipl - node - 3.33333-4.44444 - New - New - - - dipl - node - 4.44444-5.55555 - York - York - - - norm - node - I - 0-1.11111 - I - - - norm - node - am - 1.11111-2.22222 - am - - - norm - node - in - 2.22222-3.33333 - in - - - norm - node - New York - 3.33333-5.55555 - New York - - - dipl - node - 1 - 0-5.55555 - - - norm - I - node - PRON - 0-1.11111 - - - norm - be - node - VERB - 1.11111-2.22222 - - - norm - in - node - ADP - 2.22222-3.33333 - - - norm - New York - node - ADP - 3.33333-5.55555 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__run_test.snap b/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__run_test.snap deleted file mode 100644 index 6607beb3..00000000 --- a/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__run_test.snap +++ /dev/null @@ -1,225 +0,0 @@ ---- -source: src/importer/exmaralda/tests.rs -expression: actual ---- - - - - - - - - - - - - - - - - - corpus - - - corpus - - - dipl - norm - test_doc - corpus - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - 0-2 - I'm - I'm - dipl - node - - - 2-3 - in - in - dipl - node - - - 3-4 - New - New - dipl - node - - - 4-5 - York - York - dipl - node - - - 0-1 - I - norm - node - I - - - 1-2 - am - norm - node - am - - - 2-3 - in - norm - node - in - - - 3-5 - New York - norm - node - New York - - - 0-5 - dipl - node - 1 - - - 0-1 - norm - I - node - PRON - - - 1-2 - norm - be - node - VERB - - - 2-3 - norm - in - node - ADP - - - 3-5 - norm - New York - node - ADP - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/src/importer/treetagger/snapshots/annatto__importer__treetagger__tests__default_pos_lemma.snap b/src/importer/treetagger/snapshots/annatto__importer__treetagger__tests__default_pos_lemma.snap deleted file mode 100644 index 5dbe4e21..00000000 --- a/src/importer/treetagger/snapshots/annatto__importer__treetagger__tests__default_pos_lemma.snap +++ /dev/null @@ -1,137 +0,0 @@ ---- -source: src/importer/treetagger/tests.rs -expression: actual ---- - - - - - - - - - - - - - - corpus - - - zossen - corpus - - - datasource - - - default_layer - der - node - ART - Die - - - default_layer - Jugendliche - node - NN - Jugendlichen - - - - default_layer - in - node - APPR - in - - - - default_layer - Zossen - node - NE - Zossen - - - - default_layer - wollen - node - VMFIN - wollen - - - - default_layer - ein - node - ART - ein - - - - default_layer - Musikcafé - node - NN - Musikcafé - - - - default_layer - . - node - $. - . - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/src/util/graphupdate.rs b/src/util/graphupdate.rs index cb65531d..9fa8c236 100644 --- a/src/util/graphupdate.rs +++ b/src/util/graphupdate.rs @@ -353,16 +353,23 @@ mod tests { #[test] fn single_file_import() { let mut u = GraphUpdate::new(); - import_corpus_graph_from_files( + let result = import_corpus_graph_from_files( &mut u, Path::new("tests/data/import/exmaralda/clean/import/exmaralda/test_doc.exb"), &["exb"], ) .unwrap(); - let result: graphannis_core::errors::Result> = u.iter().unwrap().collect(); - let result = result.unwrap(); + assert_eq!(1, result.len()); + assert_eq!( + "tests/data/import/exmaralda/clean/import/exmaralda/test_doc.exb", + result[0].0.to_string_lossy() + ); + assert_eq!("test_doc/test_doc", result[0].1); - assert_debug_snapshot!(result); + let created_updates: graphannis_core::errors::Result> = u.iter().unwrap().collect(); + let created_updates = created_updates.unwrap(); + + assert_debug_snapshot!(created_updates); } } diff --git a/src/util/snapshots/annatto__util__graphupdate__tests__single_file_import.snap b/src/util/snapshots/annatto__util__graphupdate__tests__single_file_import.snap index e388c8f8..9cd4cf83 100644 --- a/src/util/snapshots/annatto__util__graphupdate__tests__single_file_import.snap +++ b/src/util/snapshots/annatto__util__graphupdate__tests__single_file_import.snap @@ -1,6 +1,6 @@ --- source: src/util/graphupdate.rs -expression: result +expression: created_updates --- [ ( From d60aa198997f39d328336fd724851dfb899b57af Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 30 Aug 2024 10:59:03 +0200 Subject: [PATCH 14/14] Add test for the new "id_column" table configuration --- ..._exporter__table__tests__no_id_column.snap | 10 ++++++++ src/exporter/table.rs | 24 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 src/exporter/snapshots/annatto__exporter__table__tests__no_id_column.snap diff --git a/src/exporter/snapshots/annatto__exporter__table__tests__no_id_column.snap b/src/exporter/snapshots/annatto__exporter__table__tests__no_id_column.snap new file mode 100644 index 00000000..d7eb406c --- /dev/null +++ b/src/exporter/snapshots/annatto__exporter__table__tests__no_id_column.snap @@ -0,0 +1,10 @@ +--- +source: src/exporter/table.rs +expression: export.unwrap() +--- +norm::lemma norm::pos dipl::sentence norm::norm dipl::dipl +I PRON 1 I I'm +be VERB 1 am I'm +in ADP 1 in in +New York PRON 1 New York New +New York PRON 1 New York York diff --git a/src/exporter/table.rs b/src/exporter/table.rs index 87127a51..250ca42c 100644 --- a/src/exporter/table.rs +++ b/src/exporter/table.rs @@ -605,4 +605,28 @@ mod tests { assert!(export.is_ok(), "error: {:?}", export.err()); assert_snapshot!(export.unwrap()); } + + #[test] + fn no_id_column() { + let exmaralda = ImportEXMARaLDA {}; + let mprt = exmaralda.import_corpus( + Path::new("tests/data/import/exmaralda/clean/import/exmaralda/"), + StepID { + module_name: "test_import_exb".to_string(), + path: None, + }, + None, + ); + assert!(mprt.is_ok()); + let mut update_import = mprt.unwrap(); + let g = AnnotationGraph::with_default_graphstorages(true); + assert!(g.is_ok()); + let mut graph = g.unwrap(); + assert!(graph.apply_update(&mut update_import, |_| {}).is_ok()); + let mut exporter = ExportTable::default(); + exporter.id_column = false; + let export = export_to_string(&graph, exporter); + assert!(export.is_ok(), "error: {:?}", export.err()); + assert_snapshot!(export.unwrap()); + } }