From 181adf4e5da2d22f64316a9966f87e74e0cbeb2b Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Fri, 16 Aug 2024 13:56:42 +0200 Subject: [PATCH 01/18] implemented basic node annotation export --- src/exporter/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/exporter/mod.rs b/src/exporter/mod.rs index 9e623b5d..978a87ee 100644 --- a/src/exporter/mod.rs +++ b/src/exporter/mod.rs @@ -1,5 +1,6 @@ //! Exporter modules export the data into different formats. +pub mod conllu; pub mod exmaralda; pub mod graphml; pub mod sequence; From dbe8146edf075c9a5a94a68fc0eb72301294af39 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Fri, 16 Aug 2024 20:26:34 +0200 Subject: [PATCH 02/18] added conllu export --- CHANGELOG.md | 1 + src/exporter/conllu.rs | 475 ++++++++++++++++++ ...porter__conllu__tests__conll_to_conll.snap | 17 + tests/data/export/conll/deserialize.toml | 7 + 4 files changed, 500 insertions(+) create mode 100644 src/exporter/conllu.rs create mode 100644 src/exporter/snapshots/annatto__exporter__conllu__tests__conll_to_conll.snap create mode 100644 tests/data/export/conll/deserialize.toml diff --git a/CHANGELOG.md b/CHANGELOG.md index d6710437..84395a94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - `table` export has feature to customize n/a-value, which by default is the empty string +- Add `conllu` as export format ## [0.15.0] - 2024-08-14 diff --git a/src/exporter/conllu.rs b/src/exporter/conllu.rs new file mode 100644 index 00000000..5e6688b4 --- /dev/null +++ b/src/exporter/conllu.rs @@ -0,0 +1,475 @@ +use std::{ + collections::{BTreeMap, BTreeSet}, + fs, + io::{LineWriter, Write}, + ops::Bound, + path::Path, + usize, +}; + +use anyhow::{anyhow, bail}; +use graphannis::{ + graph::{AnnoKey, Edge, GraphStorage, NodeID}, + model::{AnnotationComponent, AnnotationComponentType}, + AnnotationGraph, +}; +use graphannis_core::{ + 
annostorage::ValueSearch, + graph::{ANNIS_NS, NODE_NAME_KEY}, +}; +use itertools::Itertools; +use serde::Deserialize; + +use super::Exporter; + +use crate::deserialize::{ + deserialize_anno_key, deserialize_anno_key_opt, deserialize_anno_key_seq, + deserialize_annotation_component, deserialize_annotation_component_opt, + deserialize_annotation_component_seq, +}; + +#[derive(Deserialize)] +#[serde(deny_unknown_fields)] +pub struct ExportCoNLLU { + #[serde( + deserialize_with = "deserialize_anno_key", + default = "default_doc_anno" + )] + doc: AnnoKey, + #[serde(deserialize_with = "deserialize_anno_key_opt", default)] + groupby: Option, + #[serde( + deserialize_with = "deserialize_annotation_component", + default = "default_ordering" + )] + ordering: AnnotationComponent, + #[serde( + deserialize_with = "deserialize_anno_key", + default = "default_form_key" + )] + form: AnnoKey, + #[serde( + deserialize_with = "deserialize_anno_key", + default = "default_lemma_key" + )] + lemma: AnnoKey, + #[serde( + deserialize_with = "deserialize_anno_key", + default = "default_upos_key" + )] + upos: AnnoKey, + #[serde( + deserialize_with = "deserialize_anno_key", + default = "default_xpos_key" + )] + xpos: AnnoKey, + #[serde(deserialize_with = "deserialize_anno_key_seq", default)] + features: Vec, + #[serde(deserialize_with = "deserialize_annotation_component_opt", default)] + dependency_component: Option, // this is an option, because by default no edges are exported, as dependency anotations are not usually given and exporting conll usually serves actually parsing the data + #[serde(deserialize_with = "deserialize_anno_key_opt", default)] + dependency_anno: Option, // same reason for option as in component field + #[serde(deserialize_with = "deserialize_annotation_component_seq", default)] + enhanced_components: Vec, + #[serde(deserialize_with = "deserialize_anno_key_seq", default)] + enhanced_annos: Vec, + #[serde(deserialize_with = "deserialize_anno_key_seq", default)] + misc: 
Vec, +} + +fn default_doc_anno() -> AnnoKey { + AnnoKey { + name: "doc".into(), + ns: ANNIS_NS.into(), + } +} + +fn default_ordering() -> AnnotationComponent { + AnnotationComponent::new( + AnnotationComponentType::Ordering, + ANNIS_NS.into(), + "".into(), + ) +} + +fn default_form_key() -> AnnoKey { + AnnoKey { + name: "tok".into(), + ns: ANNIS_NS.into(), + } +} + +fn default_lemma_key() -> AnnoKey { + AnnoKey { + name: "lemma".into(), + ns: "".into(), + } +} + +fn default_xpos_key() -> AnnoKey { + AnnoKey { + name: "xpos".into(), + ns: "".into(), + } +} + +fn default_upos_key() -> AnnoKey { + AnnoKey { + name: "upos".into(), + ns: "".into(), + } +} + +impl Default for ExportCoNLLU { + fn default() -> Self { + Self { + doc: default_doc_anno(), + groupby: None, + ordering: default_ordering(), + form: default_form_key(), + lemma: default_lemma_key(), + upos: default_upos_key(), + xpos: default_xpos_key(), + features: vec![], + dependency_component: None, + dependency_anno: None, + enhanced_components: vec![], + enhanced_annos: vec![], + misc: vec![], + } + } +} + +const FILE_EXTENSION: &str = "conllu"; + +impl Exporter for ExportCoNLLU { + fn export_corpus( + &self, + graph: &graphannis::AnnotationGraph, + output_path: &std::path::Path, + _step_id: crate::StepID, + _tx: Option, + ) -> Result<(), Box> { + let mut doc_nodes = graph + .get_node_annos() + .exact_anno_search( + Some(self.doc.ns.as_str()), + self.doc.name.as_str(), + ValueSearch::Any, + ) + .flatten(); + doc_nodes.try_for_each(|d| self.export_document(graph, d.node, output_path))?; + Ok(()) + } + + fn file_extension(&self) -> &str { + FILE_EXTENSION + } +} + +const NO_VALUE: &str = "_"; + +type NodeData<'a> = BTreeMap<&'a AnnoKey, String>; +type DependencyData = Vec<(NodeID, Option)>; + +impl ExportCoNLLU { + fn export_document( + &self, + graph: &AnnotationGraph, + doc_node: NodeID, + corpus_path: &Path, + ) -> Result<(), anyhow::Error> { + let node_annos = graph.get_node_annos(); + let doc_name = 
node_annos + .get_value_for_item(&doc_node, &self.doc)? + .ok_or(anyhow!("Document name is not available."))?; + let output_path = corpus_path.join(format!("{doc_name}.{}", self.file_extension())); + let mut writer = LineWriter::new(fs::File::create(output_path)?); + let part_of_storage = graph + .get_graphstorage(&AnnotationComponent::new( + AnnotationComponentType::PartOf, + ANNIS_NS.into(), + "".into(), + )) + .ok_or(anyhow!("Part-of component storage not available."))?; + let ordering_storage = graph + .get_graphstorage(&self.ordering) + .ok_or(anyhow!("Ordering storage is unavailable."))?; + let start_node = part_of_storage + .find_connected_inverse(doc_node, 0, Bound::Included(usize::MAX)) + .flatten() + .find(|n| { + !ordering_storage.has_ingoing_edges(*n).unwrap_or_default() + && node_annos + .has_value_for_item(n, &self.form) + .unwrap_or_default() + }) + .ok_or(anyhow!("Could not find ordering start node for {doc_name}"))?; + let mut anno_keys = vec![&self.form, &self.lemma, &self.upos, &self.xpos]; + anno_keys.extend(&self.features); + anno_keys.extend(&self.misc); + let mut node_id = 1; + let mut last_group = None; + let ordered_nodes = ordering_storage + .find_connected(start_node, 0, Bound::Included(usize::MAX)) + .flatten() + .collect_vec(); // can be memory intense, but we need indices + let node_to_index: BTreeMap = ordered_nodes + .iter() + .enumerate() + .map(|(i, n)| (*n, i)) + .collect(); + for node in ordered_nodes { + let (mut data, group_node, dependency_data) = + self.node_data(graph, anno_keys.clone(), node)?; + if let (Some(gn), Some(gn_)) = (last_group, group_node) { + if gn != gn_ { + writer.write_all("\n".as_bytes())?; + last_group = group_node; + node_id = 1; + } + } else { + last_group = group_node; + } + let mut line = Vec::new(); + line.push(node_id.to_string()); + if let Some(value) = data.remove(&self.form) { + line.push(value); + } else { + bail!( + "No form value for node {}", + node_annos + .get_value_for_item(&node, 
&NODE_NAME_KEY)? + .unwrap_or_default() + ); + } + for k in [&self.lemma, &self.upos, &self.xpos] { + line.push(data.remove(k).unwrap_or(NO_VALUE.to_string())); + } + let mut features = Vec::with_capacity(self.features.len()); + for k in &self.features { + if let Some(value) = data.remove(k) { + features.push([k.name.to_string(), value].join("=")); + } + } + features.sort(); + if features.is_empty() { + line.push(NO_VALUE.to_string()); + } else { + line.push(features.join("|")); + } + + // dependencies + let (head_id, label) = if self.dependency_component.is_some() { + map_dependency_data(dependency_data.get(0), node_id, node, &node_to_index)? + } else { + (NO_VALUE.to_string(), NO_VALUE.to_string()) + }; + line.push(head_id); + line.push(label); + + // enhanced dependencies + let mut entries = Vec::with_capacity(dependency_data.len()); + for entry in &dependency_data { + let (head, label) = + map_dependency_data(Some(entry), node_id, node, &node_to_index)?; + entries.push([head, label].join(":")); + } + if entries.is_empty() { + line.push(NO_VALUE.to_string()); + } else { + line.push(entries.join("|")); + } + + // misc + features.clear(); + for k in &self.misc { + if let Some(value) = data.remove(k) { + features.push([k.name.to_string(), value].join("=")); + } + } + features.sort(); + if features.is_empty() { + line.push(NO_VALUE.to_string()); + } else { + line.push(features.join("|")); + } + + // finish + writer.write_all(line.join("\t").as_bytes())?; + writer.write_all("\n".as_bytes())?; + node_id += 1; + } + writer.flush()?; + Ok(()) + } + + fn node_data<'a>( + &self, + graph: &AnnotationGraph, + keys: Vec<&'a AnnoKey>, + node: NodeID, + ) -> Result<(NodeData<'a>, Option, DependencyData), anyhow::Error> { + let coverage_storages = graph + .get_all_components(Some(AnnotationComponentType::Coverage), None) + .into_iter() + .map(|c| graph.get_graphstorage(&c)) + .flatten() + .collect_vec(); + let mut connected_nodes = BTreeSet::default(); + for storage in 
coverage_storages { + storage + .find_connected(node, 0, Bound::Included(usize::MAX)) + .flatten() + .for_each(|n| { + connected_nodes.insert(n); + }); + let extra_nodes = connected_nodes + .iter() + .map(|n| { + storage + .find_connected_inverse(*n, 1, Bound::Included(usize::MAX)) + .flatten() + }) + .collect_vec(); + extra_nodes + .into_iter() + .for_each(|v| connected_nodes.extend(v)); + } + let mut data = BTreeMap::default(); + let mut remaining_keys: BTreeSet<&AnnoKey> = keys.into_iter().collect(); + let node_annos = graph.get_node_annos(); + let mut group_node = None; + let mut dependency_data = DependencyData::default(); + let mut dependency_storages = if let Some(c) = &self.dependency_component { + if let Some(storage) = graph.get_graphstorage(c) { + vec![storage] + } else { + bail!("No such component: {c}. Please check configuration."); + } + } else { + vec![] + }; + self.enhanced_components.iter().for_each(|c| { + if let Some(storage) = graph.get_graphstorage(c) { + dependency_storages.push(storage); + } + }); + let mut dependency_keys = if let Some(k) = &self.dependency_anno { + vec![k] + } else { + vec![] + }; + self.enhanced_annos + .iter() + .for_each(|k| dependency_keys.push(k)); + if dependency_storages.len() != dependency_keys.len() { + bail!("Number of dependency components does not match number of label names."); + } + for node in connected_nodes { + if let (None, Some(k)) = (group_node, &self.groupby) { + if node_annos.has_value_for_item(&node, k)? { + group_node = Some(node); + } + } + if !remaining_keys.is_empty() { + let mut pop = BTreeSet::new(); + for k in &remaining_keys { + if let Some(value) = node_annos.get_value_for_item(&node, k)? 
{ + pop.insert(*k); + data.insert(*k, value.to_string()); + } + } + for k in pop { + remaining_keys.remove(k); + } + } + for (storage, label_key) in dependency_storages.iter().zip(&dependency_keys) { + if let Some(other_node) = storage.get_ingoing_edges(node).next() { + let id = other_node?; + let label = storage + .get_anno_storage() + .get_value_for_item( + &Edge { + source: id, + target: node, + }, + &label_key, + )? + .map(|v| v.to_string()); + dependency_data.push((id, label)); + } + } + } + Ok((data, group_node, dependency_data)) + } +} + +fn map_dependency_data( + dependency_data: Option<&(NodeID, Option)>, + conll_id: usize, + internal_id: NodeID, + node_index: &BTreeMap, +) -> Result<(String, String), anyhow::Error> { + if let Some((internal_head_id, label)) = dependency_data { + let order_index_head = *node_index + .get(&internal_head_id) + .ok_or(anyhow!("Unknown node id of dependency head."))? + as i32; + let order_index_dependent = *node_index + .get(&internal_id) + .ok_or(anyhow!("Unknown dependent id."))? 
as i32; + let normalized_id = order_index_head - order_index_dependent + (conll_id as i32); + if let Some(v) = label { + Ok((normalized_id.to_string(), v.to_string())) + } else { + Ok((normalized_id.to_string(), NO_VALUE.to_string())) + } + } else { + Ok((NO_VALUE.to_string(), NO_VALUE.to_string())) + } +} + +#[cfg(test)] +mod tests { + use std::{fs, path::Path}; + + use graphannis::AnnotationGraph; + use insta::assert_snapshot; + + use crate::{ + exporter::conllu::ExportCoNLLU, + importer::{conllu::ImportCoNLLU, Importer}, + test_util::export_to_string, + StepID, + }; + + #[test] + fn conll_to_conll() { + let conll_in = ImportCoNLLU::default(); + let u = conll_in.import_corpus( + Path::new("tests/data/import/conll/valid"), + StepID { + module_name: "test_import".to_string(), + path: None, + }, + None, + ); + assert!(u.is_ok()); + let g = AnnotationGraph::with_default_graphstorages(true); + assert!(g.is_ok()); + let mut update = u.unwrap(); + let mut graph = g.unwrap(); + assert!(graph.apply_update(&mut update, |_| {}).is_ok()); + let toml_str = fs::read_to_string("./tests/data/export/conll/deserialize.toml").unwrap(); + let conll_out: Result = toml::from_str(toml_str.as_str()); + assert!( + conll_out.is_ok(), + "could not deserialize exporter: {:?}", + conll_out.err() + ); + let actual = export_to_string(&graph, conll_out.unwrap()); + assert!(actual.is_ok(), "failed: {:?}", actual.err()); + assert_snapshot!(actual.unwrap()); + } +} diff --git a/src/exporter/snapshots/annatto__exporter__conllu__tests__conll_to_conll.snap b/src/exporter/snapshots/annatto__exporter__conllu__tests__conll_to_conll.snap new file mode 100644 index 00000000..ccd8e440 --- /dev/null +++ b/src/exporter/snapshots/annatto__exporter__conllu__tests__conll_to_conll.snap @@ -0,0 +1,17 @@ +--- +source: src/exporter/conllu.rs +expression: actual.unwrap() +--- +1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|2:nsubj _ +2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres _ _ _ _ +3 and and 
CONJ CC _ 4 cc 4:cc|4:cc _ +4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 2:conj|2:conj _ +5 books book NOUN NNS Number=Plur 2 obj 2:obj|2:obj SpaceAfter=No +6 . . PUNCT . _ 2 punct 2:punct|2:punct _ + +1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj 2:nsubj|2:nsubj _ +2 have have VERB VBP Number=Sing|Person=1|Tense=Pres _ _ _ _ +3 no no DET DT PronType=Neg 4 det 4:det|4:det _ +4 clue clue NOUN NN Number=Sing 2 obj 2:obj|2:obj SpaceAfter=No +5 . . PUNCT . _ 2 punct 2:punct|2:punct _ + diff --git a/tests/data/export/conll/deserialize.toml b/tests/data/export/conll/deserialize.toml new file mode 100644 index 00000000..77301904 --- /dev/null +++ b/tests/data/export/conll/deserialize.toml @@ -0,0 +1,7 @@ +features = ["Number", "Person", "Case", "Gender", "PronType", "Tense"] +misc = ["SpaceAfter"] +groupby = "sent_id" +dependency_component = { ctype = "Pointing", layer = "", name = "dep" } +dependency_anno = "deprel" +enhanced_components = [{ ctype = "Pointing", layer = "", name = "dep" }] # this is intended to double the representation of dependencies in the enhanced column +enhanced_annos = ["deprel"] From fd0868d18b6714ec4338c9476ffdded2f6fdc422 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Fri, 16 Aug 2024 20:28:36 +0200 Subject: [PATCH 03/18] fixed clippy warnings --- src/exporter/conllu.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/exporter/conllu.rs b/src/exporter/conllu.rs index 5e6688b4..245abb17 100644 --- a/src/exporter/conllu.rs +++ b/src/exporter/conllu.rs @@ -4,12 +4,11 @@ use std::{ io::{LineWriter, Write}, ops::Bound, path::Path, - usize, }; use anyhow::{anyhow, bail}; use graphannis::{ - graph::{AnnoKey, Edge, GraphStorage, NodeID}, + graph::{AnnoKey, Edge, NodeID}, model::{AnnotationComponent, AnnotationComponentType}, AnnotationGraph, }; @@ -261,7 +260,7 @@ impl ExportCoNLLU { // dependencies let (head_id, label) = if self.dependency_component.is_some() { - 
map_dependency_data(dependency_data.get(0), node_id, node, &node_to_index)? + map_dependency_data(dependency_data.first(), node_id, node, &node_to_index)? } else { (NO_VALUE.to_string(), NO_VALUE.to_string()) }; @@ -313,8 +312,7 @@ impl ExportCoNLLU { let coverage_storages = graph .get_all_components(Some(AnnotationComponentType::Coverage), None) .into_iter() - .map(|c| graph.get_graphstorage(&c)) - .flatten() + .filter_map(|c| graph.get_graphstorage(&c)) .collect_vec(); let mut connected_nodes = BTreeSet::default(); for storage in coverage_storages { @@ -394,7 +392,7 @@ impl ExportCoNLLU { source: id, target: node, }, - &label_key, + label_key, )? .map(|v| v.to_string()); dependency_data.push((id, label)); @@ -413,7 +411,7 @@ fn map_dependency_data( ) -> Result<(String, String), anyhow::Error> { if let Some((internal_head_id, label)) = dependency_data { let order_index_head = *node_index - .get(&internal_head_id) + .get(internal_head_id) .ok_or(anyhow!("Unknown node id of dependency head."))? 
as i32; let order_index_dependent = *node_index From 24dae0c14a795d7813321e0891134333fa81c81b Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Sat, 17 Aug 2024 00:00:48 +0200 Subject: [PATCH 04/18] added module to lib, added doc string --- src/exporter/conllu.rs | 111 ++++++++++++++++++++++++++++++++++++++++- src/lib.rs | 11 +++- 2 files changed, 119 insertions(+), 3 deletions(-) diff --git a/src/exporter/conllu.rs b/src/exporter/conllu.rs index 245abb17..a2615c55 100644 --- a/src/exporter/conllu.rs +++ b/src/exporter/conllu.rs @@ -7,6 +7,7 @@ use std::{ }; use anyhow::{anyhow, bail}; +use documented::{Documented, DocumentedFields}; use graphannis::{ graph::{AnnoKey, Edge, NodeID}, model::{AnnotationComponent, AnnotationComponentType}, @@ -18,6 +19,7 @@ use graphannis_core::{ }; use itertools::Itertools; use serde::Deserialize; +use struct_field_names_as_array::FieldNamesAsSlice; use super::Exporter; @@ -27,51 +29,158 @@ use crate::deserialize::{ deserialize_annotation_component_seq, }; -#[derive(Deserialize)] +/// This module exports a graph in CoNLL-U format. +#[derive(Deserialize, Documented, DocumentedFields, FieldNamesAsSlice)] #[serde(deny_unknown_fields)] pub struct ExportCoNLLU { + /// This key is used to determine nodes that whose part-of subgraph constitutes a document, i. e. the entire input for a file. + /// Default is `annis::doc`, or `{ ns = "annis", name = "doc" }`. + /// + /// Example: + /// ```toml + /// [export.config] + /// doc = "annis::doc" + /// ``` #[serde( deserialize_with = "deserialize_anno_key", default = "default_doc_anno" )] doc: AnnoKey, + /// This optional annotation key is used to identify annotation spans, that constitute a sentence. Default is no export of sentence blocks. + /// Default is `annis::doc`, or `{ ns = "annis", name = "doc" }`. 
+ /// + /// Example: + /// ```toml + /// [export.config] + /// groupby = "norm::sentence" + /// ``` #[serde(deserialize_with = "deserialize_anno_key_opt", default)] groupby: Option, + /// The nodes connected by this annotation component are used as nodes defining a line in a CoNLL-U file. Usually you want to use an ordering. + /// Default is `{ ctype = "Ordering", layer = "annis", name = "" }`. + /// + /// Example: + /// ```toml + /// [export.config] + /// ordering = { ctype = "Ordering", layer = "annis", name = "norm" } + /// ``` #[serde( deserialize_with = "deserialize_annotation_component", default = "default_ordering" )] ordering: AnnotationComponent, + /// This annotation key is used to write the form column. + /// Default is `{ ns = "annis", name = "tok" }`. + /// + /// Example: + /// ```toml + /// [export.config] + /// form = { ns = "norm", name = "norm" } + /// ``` #[serde( deserialize_with = "deserialize_anno_key", default = "default_form_key" )] form: AnnoKey, + /// This annotation key is used to write the lemma column. + /// Default is `{ ns = "", name = "tok" }`. + /// + /// Example: + /// ```toml + /// [export.config] + /// lemma = { ns = "norm", name = "lemma" } + /// ``` #[serde( deserialize_with = "deserialize_anno_key", default = "default_lemma_key" )] lemma: AnnoKey, + /// This annotation key is used to write the upos column. + /// Default is `{ ns = "", name = "upos" }`. + /// + /// Example: + /// ```toml + /// [export.config] + /// upos = { ns = "norm", name = "pos" } + /// ``` #[serde( deserialize_with = "deserialize_anno_key", default = "default_upos_key" )] upos: AnnoKey, + /// This annotation key is used to write the xpos column. + /// Default is `{ ns = "", name = "xpos" }`. 
+ /// + /// Example: + /// ```toml + /// [export.config] + /// upos = { ns = "norm", name = "pos_spec" } + /// ``` #[serde( deserialize_with = "deserialize_anno_key", default = "default_xpos_key" )] xpos: AnnoKey, + /// This list of annotation keys will be represented in the feature column. + /// Default is the empty list. + /// + /// Example: + /// ```toml + /// [export.config] + /// features = ["Animacy", "Tense", "VerbClass"] + /// ``` #[serde(deserialize_with = "deserialize_anno_key_seq", default)] features: Vec, + /// The nodes connected by this annotation component are used to export dependencies. + /// Default is none, so nothing will be exported. + /// + /// Example: + /// ```toml + /// [export.config] + /// dependency_component = { ctype = "Pointing", layer = "", name = "dependencies" } + /// ``` #[serde(deserialize_with = "deserialize_annotation_component_opt", default)] dependency_component: Option, // this is an option, because by default no edges are exported, as dependency anotations are not usually given and exporting conll usually serves actually parsing the data + /// This annotation key is used to write the dependency relation, which will be looked for on the dependency edges. + /// Default is none, so nothing will be exported. + /// + /// Example: + /// ```toml + /// [export.config] + /// dependency_anno = { ns = "", name = "deprel" } + /// ``` #[serde(deserialize_with = "deserialize_anno_key_opt", default)] dependency_anno: Option, // same reason for option as in component field + /// The listed components will be used to export enhanced dependencies. More than + /// one component can be listed. + /// Default is the empty list, so nothing will be exported. 
+ /// + /// Example: + /// ```toml + /// [export.config] + /// enhanced_components = [{ ctype = "Pointing", layer = "", name = "dependencies" }] + /// ``` #[serde(deserialize_with = "deserialize_annotation_component_seq", default)] enhanced_components: Vec, + /// This list of annotation keys defines the annotation keys, that correspond to the + /// edge labels in the component listed in `enhanced_components`. The i-th element of + /// one list belongs to the i-th element in the other list. Default is the empty list. + /// + /// Example: + /// ```toml + /// [export.config] + /// enhanced_annos = ["func"] + /// ``` #[serde(deserialize_with = "deserialize_anno_key_seq", default)] enhanced_annos: Vec, + /// This list of annotation keys will be represented in the misc column. + /// Default is the empty list. + /// + /// Example: + /// ```toml + /// [export.config] + /// misc = ["NoSpaceAfter", "Referent"] + /// ``` #[serde(deserialize_with = "deserialize_anno_key_seq", default)] misc: Vec, } diff --git a/src/lib.rs b/src/lib.rs index 7fd3db98..a9bf246b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,8 +20,9 @@ use std::{ use documented::{Documented, DocumentedFields}; use error::Result; use exporter::{ - exmaralda::ExportExmaralda, graphml::GraphMLExporter, sequence::ExportSequence, - table::ExportTable, textgrid::ExportTextGrid, xlsx::ExportXlsx, Exporter, + conllu::ExportCoNLLU, exmaralda::ExportExmaralda, graphml::GraphMLExporter, + sequence::ExportSequence, table::ExportTable, textgrid::ExportTextGrid, xlsx::ExportXlsx, + Exporter, }; use graphannis::AnnotationGraph; use importer::{ @@ -53,6 +54,7 @@ pub struct ModuleConfiguration { #[strum_discriminants(derive(EnumIter, AsRefStr), strum(serialize_all = "lowercase"))] #[serde(tag = "format", rename_all = "lowercase", content = "config")] pub enum WriteAs { + CoNLLU(#[serde(default)] ExportCoNLLU), GraphML(#[serde(default)] GraphMLExporter), // the purpose of serde(default) here is, that an empty `[export.config]` 
table can be omited EXMARaLDA(#[serde(default)] ExportExmaralda), Sequence(#[serde(default)] ExportSequence), @@ -77,6 +79,7 @@ impl WriteAs { WriteAs::Table(m) => m, WriteAs::TextGrid(m) => m, WriteAs::Xlsx(m) => m, + WriteAs::CoNLLU(m) => m, } } } @@ -90,6 +93,7 @@ impl WriteAsDiscriminants { WriteAsDiscriminants::Table => ExportTable::DOCS, WriteAsDiscriminants::TextGrid => ExportTextGrid::DOCS, WriteAsDiscriminants::Xlsx => ExportXlsx::DOCS, + WriteAsDiscriminants::CoNLLU => ExportCoNLLU::DOCS, } } @@ -118,6 +122,9 @@ impl WriteAsDiscriminants { WriteAsDiscriminants::Xlsx => { (ExportXlsx::FIELD_NAMES_AS_SLICE, ExportXlsx::FIELD_DOCS) } + WriteAsDiscriminants::CoNLLU => { + (ExportCoNLLU::FIELD_NAMES_AS_SLICE, ExportCoNLLU::FIELD_DOCS) + } }; for (idx, n) in field_names.iter().enumerate() { if idx < field_docs.len() { From 892110afc37282608ea0de4ee6ee2e141f95ad95 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Sat, 17 Aug 2024 00:01:26 +0200 Subject: [PATCH 05/18] test update --- tests/snapshots/cli__list_modules.snap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/snapshots/cli__list_modules.snap b/tests/snapshots/cli__list_modules.snap index 32808bb5..64ddfbc7 100644 --- a/tests/snapshots/cli__list_modules.snap +++ b/tests/snapshots/cli__list_modules.snap @@ -5,7 +5,7 @@ expression: output | Type | Modules | |------------------|----------------------------------------------------------------------------------------------------------------------| | Import formats | conllu, exmaralda, graphml, meta, none, opus, path, ptb, relannis, saltxml, textgrid, toolbox, treetagger, xlsx, xml | -| Export formats | graphml, exmaralda, sequence, table, textgrid, xlsx | +| Export formats | conllu, graphml, exmaralda, sequence, table, textgrid, xlsx | | Graph operations | check, collapse, filter, visualize, enumerate, link, map, revise, chunk, split, none | Use `annatto info ` to get more information about one of the formats or graph operations. 
From 9753b4b07762a29661fed913fb3fe54aed873f38 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Sat, 17 Aug 2024 00:12:14 +0200 Subject: [PATCH 06/18] boxed large variant --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index a9bf246b..1184eaac 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -54,7 +54,7 @@ pub struct ModuleConfiguration { #[strum_discriminants(derive(EnumIter, AsRefStr), strum(serialize_all = "lowercase"))] #[serde(tag = "format", rename_all = "lowercase", content = "config")] pub enum WriteAs { - CoNLLU(#[serde(default)] ExportCoNLLU), + CoNLLU(#[serde(default)] Box), GraphML(#[serde(default)] GraphMLExporter), // the purpose of serde(default) here is, that an empty `[export.config]` table can be omited EXMARaLDA(#[serde(default)] ExportExmaralda), Sequence(#[serde(default)] ExportSequence), @@ -79,7 +79,7 @@ impl WriteAs { WriteAs::Table(m) => m, WriteAs::TextGrid(m) => m, WriteAs::Xlsx(m) => m, - WriteAs::CoNLLU(m) => m, + WriteAs::CoNLLU(m) => &**m, } } } From 81b4b05415f1a0df27d1772103c02db3bbbe1ef6 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 10:46:07 +0200 Subject: [PATCH 07/18] implemented enhanced dependency feature --- src/importer/conllu/mod.rs | 63 +++++++++++++------ ...natto__importer__conllu__tests__basic.snap | 54 +++++++++------- ...u__tests__comments_and_sentence_annos.snap | 54 +++++++++------- ...orter__conllu__tests__custom_comments.snap | 54 +++++++++------- 4 files changed, 135 insertions(+), 90 deletions(-) diff --git a/src/importer/conllu/mod.rs b/src/importer/conllu/mod.rs index ca5190ee..128dfc1c 100644 --- a/src/importer/conllu/mod.rs +++ b/src/importer/conllu/mod.rs @@ -17,6 +17,7 @@ use graphannis_core::{ util::{join_qname, split_qname}, }; use itertools::Itertools; +use linked_hash_set::LinkedHashSet; use pest::{ iterators::{Pair, Pairs}, Parser, @@ -107,7 +108,7 @@ impl Display for Rule { } } -type DepSpec = (usize, Option); +type DepSpec = 
LinkedHashSet<(usize, Option)>; impl ImportCoNLLU { fn import_document( @@ -187,11 +188,21 @@ impl ImportCoNLLU { for member in sentence.into_inner() { match member.as_rule() { Rule::token => { - let (tok_name, tok_id, dep) = + let (tok_name, tok_id, mut deps) = self.map_token(step_id, update, document_node_name, member, tx)?; id_to_tok_name.insert(tok_id, tok_name.to_string()); - if let Some(dependency) = dep { - dependencies.push((tok_name, dependency.0, dependency.1)); + if let Some(dependency) = deps.pop_front() { + dependencies.push(( + tok_name.to_string(), + dependency.0, + dependency.1.clone(), + "", + "dep", + )); + } + + for (h, r) in deps { + dependencies.push((tok_name.to_string(), h, r, "enh", "dep")); } } Rule::multi_token | Rule::invalid_multi_token => { @@ -268,23 +279,23 @@ impl ImportCoNLLU { component_name: "".to_string(), })?; } - for (target_node_name, head_id, deprel) in dependencies { + for (target_node_name, head_id, deprel, clayer, cname) in dependencies { if head_id > 0 { if let Some(source_node_name) = id_to_tok_name.get(&head_id) { update.add_event(UpdateEvent::AddEdge { source_node: source_node_name.to_string(), target_node: target_node_name.to_string(), - layer: "".to_string(), + layer: clayer.to_string(), component_type: AnnotationComponentType::Pointing.to_string(), - component_name: "dep".to_string(), + component_name: cname.to_string(), })?; if let Some(deprel_value) = deprel { update.add_event(UpdateEvent::AddEdgeLabel { source_node: source_node_name.to_string(), target_node: target_node_name.to_string(), - layer: "".to_string(), + layer: clayer.to_string(), component_type: AnnotationComponentType::Pointing.to_string(), - component_name: "dep".to_string(), + component_name: cname.to_string(), anno_ns: "".to_string(), anno_name: "deprel".to_string(), anno_value: deprel_value.to_string(), @@ -313,7 +324,7 @@ impl ImportCoNLLU { document_node_name: &str, token: Pair, _tx: &Option, - ) -> anyhow::Result<(String, usize, Option)> { + ) 
-> anyhow::Result<(String, usize, DepSpec)> { let (l, c) = token.line_col(); let line = token.as_str().to_string(); let node_name = format!("{document_node_name}#t{l}_{c}"); @@ -335,8 +346,7 @@ impl ImportCoNLLU { anno_value: "default_layer".to_string(), })?; let mut token_id = None; - let mut head_id = None; - let mut deprel = None; + let mut dependencies = DepSpec::default(); for member in token.into_inner() { let rule = member.as_rule(); match rule { @@ -390,21 +400,38 @@ impl ImportCoNLLU { Rule::head => { for id_or_else in member.into_inner() { if id_or_else.as_rule() == Rule::id { - head_id = Some(id_or_else.as_str().trim().parse::()?); - break; + dependencies + .insert((id_or_else.as_str().trim().parse::()?, None)); } } } Rule::deprel => { - deprel = Some(member.as_str().trim().to_string()); + if let Some((base_head, None)) = dependencies.pop_back() { + dependencies.insert((base_head, Some(member.as_str().trim().to_string()))); + } + } + Rule::enhanced_deps => { + for enh_dep in member.into_inner() { + let mut inner = enh_dep.into_inner(); + if let Some(enh_id) = inner.next() { + let head = enh_id.as_str().trim().parse::()?; + if let Some(enh_rel) = inner.next() { + let rel = enh_rel.as_str().to_string(); + let value = (head, Some(rel)); + // this is to avoid the basic dependency to be anywhere else than in the first position, because this position needs to be treated differently + // to avoid cycles in the graph + if !dependencies.contains(&value) { + dependencies.insert(value); + } + } + } + } } - Rule::enhanced_deps => {} _ => {} } } - let dependency = head_id.map(|v| (v, deprel)); if let Some(id) = token_id { - Ok((node_name, id, dependency)) + Ok((node_name, id, dependencies)) } else { // by grammar spec this branch should never be possible let reason = format!("Token `{line}` ({l}, {c}) has no id which is invalid."); diff --git a/src/importer/conllu/snapshots/annatto__importer__conllu__tests__basic.snap 
b/src/importer/conllu/snapshots/annatto__importer__conllu__tests__basic.snap index 136157de..1b2a4fb6 100644 --- a/src/importer/conllu/snapshots/annatto__importer__conllu__tests__basic.snap +++ b/src/importer/conllu/snapshots/annatto__importer__conllu__tests__basic.snap @@ -194,53 +194,59 @@ expression: actual.unwrap() det - + + nsubj + + + obj + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + diff --git a/src/importer/conllu/snapshots/annatto__importer__conllu__tests__comments_and_sentence_annos.snap b/src/importer/conllu/snapshots/annatto__importer__conllu__tests__comments_and_sentence_annos.snap index 24ecb926..e5fe8079 100644 --- a/src/importer/conllu/snapshots/annatto__importer__conllu__tests__comments_and_sentence_annos.snap +++ b/src/importer/conllu/snapshots/annatto__importer__conllu__tests__comments_and_sentence_annos.snap @@ -198,53 +198,59 @@ it has two lines, what are you going to do about it?! det - + + nsubj + + + obj + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + diff --git a/src/importer/conllu/snapshots/annatto__importer__conllu__tests__custom_comments.snap b/src/importer/conllu/snapshots/annatto__importer__conllu__tests__custom_comments.snap index a9e429df..6a6c9b7e 100644 --- a/src/importer/conllu/snapshots/annatto__importer__conllu__tests__custom_comments.snap +++ b/src/importer/conllu/snapshots/annatto__importer__conllu__tests__custom_comments.snap @@ -198,53 +198,59 @@ it has two lines, what are you going to do about it?! 
det - + + nsubj + + + obj + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + From f57fe108bbcb99f58e46d8f61ec91ac0cf132cb0 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 10:46:47 +0200 Subject: [PATCH 08/18] update --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 84395a94..217addaa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `table` export has feature to customize n/a-value, which by default is the empty string - Add `conllu` as export format +- import of `conllu` now supports enhanced dependencies ## [0.15.0] - 2024-08-14 From 48e6259efdd0477ed39b9dfa3f1b720b2cb0e063 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 10:58:41 +0200 Subject: [PATCH 09/18] adapted test config --- tests/data/export/conll/deserialize.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/export/conll/deserialize.toml b/tests/data/export/conll/deserialize.toml index 77301904..a05b6e80 100644 --- a/tests/data/export/conll/deserialize.toml +++ b/tests/data/export/conll/deserialize.toml @@ -3,5 +3,5 @@ misc = ["SpaceAfter"] groupby = "sent_id" dependency_component = { ctype = "Pointing", layer = "", name = "dep" } dependency_anno = "deprel" -enhanced_components = [{ ctype = "Pointing", layer = "", name = "dep" }] # this is intended to double the representation of dependencies in the enhanced column +enhanced_components = [{ ctype = "Pointing", layer = "enh", name = "dep" }] enhanced_annos = ["deprel"] From 6950057d2f98b31137bfacb5b383081ef29d7447 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 14:39:41 +0200 Subject: [PATCH 10/18] new snapshot export test --- ...xporter__conllu__tests__conll_to_conll.snap | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git 
a/src/exporter/snapshots/annatto__exporter__conllu__tests__conll_to_conll.snap b/src/exporter/snapshots/annatto__exporter__conllu__tests__conll_to_conll.snap index ccd8e440..7af5e854 100644 --- a/src/exporter/snapshots/annatto__exporter__conllu__tests__conll_to_conll.snap +++ b/src/exporter/snapshots/annatto__exporter__conllu__tests__conll_to_conll.snap @@ -2,16 +2,16 @@ source: src/exporter/conllu.rs expression: actual.unwrap() --- -1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|2:nsubj _ +1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _ 2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres _ _ _ _ -3 and and CONJ CC _ 4 cc 4:cc|4:cc _ -4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 2:conj|2:conj _ -5 books book NOUN NNS Number=Plur 2 obj 2:obj|2:obj SpaceAfter=No -6 . . PUNCT . _ 2 punct 2:punct|2:punct _ +3 and and CONJ CC _ 4 cc 4:cc _ +4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 2:conj _ +5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj SpaceAfter=No +6 . . PUNCT . _ 2 punct 2:punct _ -1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj 2:nsubj|2:nsubj _ +1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj 2:nsubj _ 2 have have VERB VBP Number=Sing|Person=1|Tense=Pres _ _ _ _ -3 no no DET DT PronType=Neg 4 det 4:det|4:det _ -4 clue clue NOUN NN Number=Sing 2 obj 2:obj|2:obj SpaceAfter=No -5 . . PUNCT . _ 2 punct 2:punct|2:punct _ +3 no no DET DT PronType=Neg 4 det 4:det _ +4 clue clue NOUN NN Number=Sing 2 obj 2:obj SpaceAfter=No +5 . . PUNCT . 
_ 2 punct 2:punct _ From 3d01dfe800ee61f56656a0412b1f92f9aa4b4f28 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 21:50:54 +0200 Subject: [PATCH 11/18] parse tlis by order of appearance --- src/importer/exmaralda/mod.rs | 97 ++++++++++++------- ...orter__exmaralda__tests__invalid_fail.snap | 2 +- 2 files changed, 64 insertions(+), 35 deletions(-) diff --git a/src/importer/exmaralda/mod.rs b/src/importer/exmaralda/mod.rs index 6a17bd8b..4b044910 100644 --- a/src/importer/exmaralda/mod.rs +++ b/src/importer/exmaralda/mod.rs @@ -95,7 +95,7 @@ impl ImportEXMARaLDA { let mut already_defined: BTreeSet = BTreeSet::new(); let mut named_orderings: BTreeMap, String)>> = BTreeMap::new(); - let mut time_to_tli_attrs: BTreeMap, Vec> = BTreeMap::new(); + let mut tlis = Vec::new(); // reader let f = File::open(document_path)?; let mut parser_cfg = ParserConfig::new(); @@ -156,7 +156,7 @@ impl ImportEXMARaLDA { } } "tli" => { - if let Some(time_value) = attr_map.get("time") { + let time = if let Some(time_value) = attr_map.get("time") { let time = if let Ok(t_val) = time_value.parse::>() { t_val @@ -168,18 +168,12 @@ impl ImportEXMARaLDA { }; return Err(err); }; - time_to_tli_attrs - .entry(time) - .or_default() - .push(attr_map["id"].to_string()); + Some(time) } else { - let err = AnnattoError::Import { - reason: "A timeline item does not have a time value." 
- .to_string(), - importer: step_id.module_name.clone(), - path: document_path.to_path_buf(), - }; - return Err(err); + None + }; + if let Some(id) = attr_map.get("id") { + tlis.push((id.to_string(), time)); } } "language" => { @@ -219,12 +213,54 @@ impl ImportEXMARaLDA { } } "common-timeline" => { + // check for integrity of timeline + let mut used_time_values = BTreeSet::default(); + let mut last = OrderedFloat::from(-1.); + let mut corrupted = false; + for (_, to) in &tlis { + if let Some(t) = to { + if used_time_values.contains(t) { + return Err(AnnattoError::Import { + reason: format!( + "Time value {t} is used more than once." + ), + importer: step_id.module_name.to_string(), + path: document_path.to_path_buf(), + }); + } + if t <= &last { + if let Some(sender) = &tx { + sender.send(StatusMessage::Warning( + "Unordered timeline, will try to fix ..." + .to_string(), + ))?; + } + corrupted = true; + } + last = *t; + used_time_values.insert(*t); + } + } + if corrupted { + if tlis.iter().any(|(_, t_opt)| t_opt.is_none()) { + // impossible, order of mentioning of tlis in xml-file is relevant + return Err(AnnattoError::Import { + reason: "Timeline cannot be fixed automatically." 
+ .to_string(), + importer: step_id.to_string(), + path: document_path.to_path_buf(), + }); + } else { + tlis.sort_by(|(_, t_opt_a), (_, t_opt_b)| { + t_opt_a + .unwrap_or_default() + .cmp(&t_opt_b.unwrap_or_default()) + }); + } + } // build empty toks - for (time_value, tli_ids) in - time_to_tli_attrs.iter().sorted_by(|e0, e1| e0.0.cmp(e1.0)) - { - let tli_id_suffix = tli_ids.join("_"); - let node_name = format!("{}#{}", &doc_node_name, tli_id_suffix); + for (tli_id, time_opt) in &tlis { + let node_name = format!("{}#{}", &doc_node_name, tli_id); update.add_event(UpdateEvent::AddNode { node_name: node_name.to_string(), node_type: "node".to_string(), @@ -235,12 +271,10 @@ impl ImportEXMARaLDA { anno_name: "tok".to_string(), anno_value: " ".to_string(), })?; - for tli_id in tli_ids { - timeline.insert( - tli_id.to_string(), - (*time_value, node_name.to_string()), - ); - } + timeline.insert( + tli_id.to_string(), + ((*time_opt).clone(), node_name.to_string()), + ); update.add_event(UpdateEvent::AddEdge { source_node: node_name.to_string(), target_node: doc_node_name.to_string(), @@ -250,13 +284,8 @@ impl ImportEXMARaLDA { })?; } // order timeline elements / empty toks - ordered_tl_nodes.extend( - timeline - .iter() - .sorted_by(|a, b| a.1 .0.cmp(&b.1 .0)) - .map(|t| t.0.to_string()) - .collect_vec(), - ); + ordered_tl_nodes + .extend(tlis.iter().map(|e| e.0.to_string()).collect_vec()); for i in 1..ordered_tl_nodes.len() { if let (Some(source), Some(target)) = ( &timeline.get(&ordered_tl_nodes[i - 1]), @@ -412,7 +441,7 @@ impl ImportEXMARaLDA { "{}#{}_{}_{}-{}", doc_node_name, tier_type, speaker_id, start_id, end_id ); // this is not a unique id as not intended to be - let start_time = if let Some((t, _)) = timeline.get(key) { + let start_time = if let Some((Some(t), _)) = timeline.get(key) { t } else { if let Some(sender) = tx { @@ -459,11 +488,11 @@ impl ImportEXMARaLDA { "Could not determine end time of event {}::{}:{}-{}. 
Event will be skipped.", &speaker_id, &anno_name, &start_id, &end_id ); - sender.send(StatusMessage::Warning(msg))?; + sender.send(StatusMessage::Info(msg))?; } continue; }; - if let Some((end_time, _)) = node_tpl { + if let Some((Some(end_time), _)) = node_tpl { update.add_event(UpdateEvent::AddNodeLabel { node_name: node_name.to_string(), anno_ns: ANNIS_NS.to_string(), diff --git a/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__invalid_fail.snap b/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__invalid_fail.snap index ef98e8c9..4927c4b9 100644 --- a/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__invalid_fail.snap +++ b/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__invalid_fail.snap @@ -1,5 +1,5 @@ --- source: src/importer/exmaralda/tests.rs -expression: r.err().unwrap() +expression: r.err().unwrap().to_string() --- Error during importing corpus to ./tests/data/import/exmaralda/fail-invalid/import/exmaralda/test_doc_invalid.exb with "import_exmaralda": "Start time is bigger than end time for ids: T1--T2 " From 3baabb399660f5b9e9f1892d797dec2e3689fed1 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 21:52:56 +0200 Subject: [PATCH 12/18] clip- -py --- src/importer/exmaralda/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/importer/exmaralda/mod.rs b/src/importer/exmaralda/mod.rs index 4b044910..327edcec 100644 --- a/src/importer/exmaralda/mod.rs +++ b/src/importer/exmaralda/mod.rs @@ -273,7 +273,7 @@ impl ImportEXMARaLDA { })?; timeline.insert( tli_id.to_string(), - ((*time_opt).clone(), node_name.to_string()), + ((*time_opt), node_name.to_string()), ); update.add_event(UpdateEvent::AddEdge { source_node: node_name.to_string(), From b565078570478e47db842b890bcdc5b42050b821 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 22:03:41 +0200 Subject: [PATCH 13/18] update --- CHANGELOG.md | 4 ++++ 1 file changed, 4 
insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 217addaa..7ff9e81d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add `conllu` as export format - import of `conllu` now supports enhanced dependencies +### Changed + +- `exmaralda` import now ranks order of tlis higher than sorting by time value (more compatible with modern EXMARaLDA files) + ## [0.15.0] - 2024-08-14 ## [0.15.0] - 2024-08-14 From 6757882d4ac8a816cf4c58a96f3c67eb686cf4ed Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 22:21:03 +0200 Subject: [PATCH 14/18] new test data --- .../import/exmaralda/test_doc.exb | 52 +++++++++++++++++++ .../import/exmaralda/test_file.wav | 0 2 files changed, 52 insertions(+) create mode 100644 tests/data/import/exmaralda/valid-no-timevalues/import/exmaralda/test_doc.exb create mode 100644 tests/data/import/exmaralda/valid-no-timevalues/import/exmaralda/test_file.wav diff --git a/tests/data/import/exmaralda/valid-no-timevalues/import/exmaralda/test_doc.exb b/tests/data/import/exmaralda/valid-no-timevalues/import/exmaralda/test_doc.exb new file mode 100644 index 00000000..5d41eb53 --- /dev/null +++ b/tests/data/import/exmaralda/valid-no-timevalues/import/exmaralda/test_doc.exb @@ -0,0 +1,52 @@ + + + + +dipl + + + + + + +was late for elicitation + + +norm + + + + + +personal-anno-value-1personal-anno-value-2 +was on time + + + + + + + + + + + +I'm +in +New +York +I +am +in +New York +1 +I +be +in +New York +PRON +VERB +ADP +PRON + + diff --git a/tests/data/import/exmaralda/valid-no-timevalues/import/exmaralda/test_file.wav b/tests/data/import/exmaralda/valid-no-timevalues/import/exmaralda/test_file.wav new file mode 100644 index 00000000..e69de29b From 2e28930228dfc532f8fb0482dd26c812658c1da9 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 22:27:26 +0200 Subject: [PATCH 15/18] fixed bug and added test --- 
src/importer/exmaralda/mod.rs | 26 ++++++-------------------- src/importer/exmaralda/tests.rs | 16 +++++++++++++--- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/src/importer/exmaralda/mod.rs b/src/importer/exmaralda/mod.rs index 327edcec..da2accc7 100644 --- a/src/importer/exmaralda/mod.rs +++ b/src/importer/exmaralda/mod.rs @@ -93,8 +93,7 @@ impl ImportEXMARaLDA { let mut speaker_map = BTreeMap::new(); let mut parent_map: BTreeMap> = BTreeMap::new(); let mut already_defined: BTreeSet = BTreeSet::new(); - let mut named_orderings: BTreeMap, String)>> = - BTreeMap::new(); + let mut named_orderings: BTreeMap> = BTreeMap::new(); let mut tlis = Vec::new(); // reader let f = File::open(document_path)?; @@ -441,18 +440,6 @@ impl ImportEXMARaLDA { "{}#{}_{}_{}-{}", doc_node_name, tier_type, speaker_id, start_id, end_id ); // this is not a unique id as not intended to be - let start_time = if let Some((Some(t), _)) = timeline.get(key) { - t - } else { - if let Some(sender) = tx { - let msg = format!( - "Could not determine start time of event {}::{}:{}-{}. 
Event will be skipped.", - &speaker_id, &anno_name, &start_id, &end_id - ); - sender.send(StatusMessage::Warning(msg))?; - } - continue; - }; if !already_defined.contains(&node_name) { update.add_event(UpdateEvent::AddNode { node_name: node_name.to_string(), @@ -492,7 +479,9 @@ impl ImportEXMARaLDA { } continue; }; - if let Some((Some(end_time), _)) = node_tpl { + if let (Some((Some(start_time), _)), Some((Some(end_time), _))) = + (timeline.get(key), node_tpl) + { update.add_event(UpdateEvent::AddNodeLabel { node_name: node_name.to_string(), anno_ns: ANNIS_NS.to_string(), @@ -517,7 +506,7 @@ impl ImportEXMARaLDA { anno_value: text.to_string(), })?; // order nodes - let order_tpl = (*start_time, node_name.to_string()); + let order_tpl = (start_i, node_name.to_string()); match named_orderings.entry(anno_name.to_string()) { std::collections::btree_map::Entry::Vacant(e) => { e.insert(vec![order_tpl]); @@ -574,10 +563,7 @@ impl ImportEXMARaLDA { // build order relations for (name, node_name_vec) in named_orderings { let mut prev = None; - for (_, node_name) in node_name_vec - .into_iter() - .sorted_by(|a, b| a.0.total_cmp(&b.0)) - { + for (_, node_name) in node_name_vec.into_iter().sorted_by(|a, b| a.0.cmp(&b.0)) { if let Some(source) = prev { update.add_event(UpdateEvent::AddEdge { source_node: source, diff --git a/src/importer/exmaralda/tests.rs b/src/importer/exmaralda/tests.rs index 56bde523..7e88435f 100644 --- a/src/importer/exmaralda/tests.rs +++ b/src/importer/exmaralda/tests.rs @@ -236,21 +236,31 @@ fn invalid_fail() { #[test] fn import() { let r = run_test("./tests/data/import/exmaralda/clean/import/", 0); - assert_eq!(r.is_ok(), true, "Probing core test result {:?}", r); + assert!(r.is_ok(), "Probing core test result {:?}", r); assert_snapshot!(r.unwrap()); } #[test] fn broken_audio_pass() { let r = run_test("./tests/data/import/exmaralda/broken_audio/import/", 1); - assert_eq!(r.is_ok(), true, "Probing core test result {:?}", r); + assert!(r.is_ok(), 
"Probing core test result {:?}", r); assert_snapshot!(r.unwrap()); } #[test] fn missing_type_attr_pass() { let r = run_test("./tests/data/import/exmaralda/pass-no_tier_type/import/", 9); - assert_eq!(r.is_ok(), true, "Probing core test result {:?}", r); + assert!(r.is_ok(), "Probing core test result {:?}", r); + assert_snapshot!(r.unwrap()); +} + +#[test] +fn sparse_timeline_pass() { + let r = run_test( + "./tests/data/import/exmaralda/valid-no-timevalues/import/", + 0, + ); + assert!(r.is_ok(), "Probing core test result {:?}", r); assert_snapshot!(r.unwrap()); } From 996b227d913dafe844ec48bba7cec51f1a9f74a3 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 22:29:38 +0200 Subject: [PATCH 16/18] update --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ff9e81d..91db59c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `exmaralda` import now ranks order of tlis higher than sorting by time value (more compatible with modern EXMARaLDA files) +### Fixed + +- `exmaralda` import keeps events with missing time values + ## [0.15.0] - 2024-08-14 ## [0.15.0] - 2024-08-14 From 31099aa279a419570d76366742ca1c3172626ab6 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 22:29:49 +0200 Subject: [PATCH 17/18] update --- docs/README.md | 2 +- docs/exporters/conllu.md | 151 +++++++++++++++++++++++++++++++++++++++ docs/exporters/table.md | 10 +++ 3 files changed, 162 insertions(+), 1 deletion(-) create mode 100644 docs/exporters/conllu.md diff --git a/docs/README.md b/docs/README.md index de7e3e5d..d453750e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,5 +1,5 @@ | Type | Modules | 
|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | Import formats | [conllu](importers/conllu.md), [exmaralda](importers/exmaralda.md), [graphml](importers/graphml.md), [meta](importers/meta.md), [none](importers/none.md), [opus](importers/opus.md), [path](importers/path.md), [ptb](importers/ptb.md), [relannis](importers/relannis.md), [saltxml](importers/saltxml.md), [textgrid](importers/textgrid.md), [toolbox](importers/toolbox.md), [treetagger](importers/treetagger.md), [xlsx](importers/xlsx.md), [xml](importers/xml.md) | -| Export formats | [graphml](exporters/graphml.md), [exmaralda](exporters/exmaralda.md), [sequence](exporters/sequence.md), [table](exporters/table.md), [textgrid](exporters/textgrid.md), [xlsx](exporters/xlsx.md) | +| Export formats | [conllu](exporters/conllu.md), [graphml](exporters/graphml.md), [exmaralda](exporters/exmaralda.md), [sequence](exporters/sequence.md), [table](exporters/table.md), [textgrid](exporters/textgrid.md), [xlsx](exporters/xlsx.md) | | Graph operations | [check](graph_ops/check.md), [collapse](graph_ops/collapse.md), [filter](graph_ops/filter.md), [visualize](graph_ops/visualize.md), [enumerate](graph_ops/enumerate.md), [link](graph_ops/link.md), [map](graph_ops/map.md), [revise](graph_ops/revise.md), [chunk](graph_ops/chunk.md), [split](graph_ops/split.md), [none](graph_ops/none.md) | \ No newline at end of file diff --git a/docs/exporters/conllu.md b/docs/exporters/conllu.md new file mode 100644 index 00000000..8875db13 --- /dev/null +++ b/docs/exporters/conllu.md @@ -0,0 +1,151 @@ +# conllu 
(exporter) + +This module exports a graph in CoNLL-U format. + +## Configuration + +### doc + +This key is used to determine nodes whose part-of subgraph constitutes a document, i.e. the entire input for a file. +Default is `annis::doc`, or `{ ns = "annis", name = "doc" }`. + +Example: +```toml +[export.config] +doc = "annis::doc" +``` + +### groupby + +This optional annotation key is used to identify annotation spans that constitute a sentence. Default is no export of sentence blocks. + +Example: +```toml +[export.config] +groupby = "norm::sentence" +``` + +### ordering + +The nodes connected by this annotation component are used as nodes defining a line in a CoNLL-U file. Usually you want to use an ordering. +Default is `{ ctype = "Ordering", layer = "annis", name = "" }`. + +Example: +```toml +[export.config] +ordering = { ctype = "Ordering", layer = "annis", name = "norm" } +``` + +### form + +This annotation key is used to write the form column. +Default is `{ ns = "annis", name = "tok" }`. + +Example: +```toml +[export.config] +form = { ns = "norm", name = "norm" } +``` + +### lemma + +This annotation key is used to write the lemma column. +Default is `{ ns = "", name = "tok" }`. + +Example: +```toml +[export.config] +lemma = { ns = "norm", name = "lemma" } +``` + +### upos + +This annotation key is used to write the upos column. +Default is `{ ns = "", name = "upos" }`. + +Example: +```toml +[export.config] +upos = { ns = "norm", name = "pos" } +``` + +### xpos + +This annotation key is used to write the xpos column. +Default is `{ ns = "", name = "xpos" }`. + +Example: +```toml +[export.config] +xpos = { ns = "norm", name = "pos_spec" } +``` + +### features + +This list of annotation keys will be represented in the feature column. +Default is the empty list.
+ +Example: +```toml +[export.config] +features = ["Animacy", "Tense", "VerbClass"] +``` + +### dependency_component + +The nodes connected by this annotation component are used to export dependencies. +Default is none, so nothing will be exported. + +Example: +```toml +[export.config] +dependency_component = { ctype = "Pointing", layer = "", name = "dependencies" } +``` + +### dependency_anno + +This annotation key is used to write the dependency relation, which will be looked for on the dependency edges. +Default is none, so nothing will be exported. + +Example: +```toml +[export.config] +dependency_anno = { ns = "", name = "deprel" } +``` + +### enhanced_components + +The listed components will be used to export enhanced dependencies. More than +one component can be listed. +Default is the empty list, so nothing will be exported. + +Example: +```toml +[export.config] +enhanced_components = [{ ctype = "Pointing", layer = "", name = "dependencies" }] +``` + +### enhanced_annos + +This list of annotation keys defines the annotation keys, that correspond to the +edge labels in the component listed in `enhanced_components`. The i-th element of +one list belongs to the i-th element in the other list. Default is the empty list. + +Example: +```toml +[export.config] +enhanced_annos = ["func"] +``` + +### misc + +This list of annotation keys will be represented in the misc column. +Default is the empty list. + +Example: +```toml +[export.config] +misc = ["NoSpaceAfter", "Referent"] +``` + diff --git a/docs/exporters/table.md b/docs/exporters/table.md index 54b7966f..186f8b4b 100644 --- a/docs/exporters/table.md +++ b/docs/exporters/table.md @@ -39,6 +39,16 @@ Example: quote_char = "\"" ``` +### no_value + +Provides the string sequence used for n/a. Default is the empty string. 
+ +Example: +```toml +[export.config] +no_value = "n/a" +``` + ### ingoing By listing annotation components, the ingoing edges of that component and their annotations From a41c6e3c0fc8aad4358a34208900096a3909921c Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 22:30:53 +0200 Subject: [PATCH 18/18] new test snapshot --- ...xmaralda__tests__sparse_timeline_pass.snap | 261 ++++++++++++++++++ 1 file changed, 261 insertions(+) create mode 100644 src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__sparse_timeline_pass.snap diff --git a/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__sparse_timeline_pass.snap b/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__sparse_timeline_pass.snap new file mode 100644 index 00000000..21f56b73 --- /dev/null +++ b/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__sparse_timeline_pass.snap @@ -0,0 +1,261 @@ +--- +source: src/importer/exmaralda/tests.rs +expression: r.unwrap() +--- + + + + + + + + + + + + + + + + + + + + + + + + + corpus + + + corpus + + + dipl + norm + corpus + personal-anno-value-1 + personal-anno-value-2 + was late for elicitation + was on time + test_doc + eng + deu + eng,eng + + + file + tests/data/import/exmaralda/valid-no-timevalues/import/exmaralda/test_file.wav + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + dipl + node + I'm + I'm + + + dipl + node + in + in + + + dipl + node + New + New + + + dipl + node + 4.44444-5.55555 + York + York + + + norm + node + I + I + + + norm + node + am + am + + + norm + node + in + in + + + norm + node + New York + New York + + + dipl + node + 1 + 0-5.55555 + + + norm + I + node + PRON + + + norm + be + node + VERB + + + norm + in + node + ADP + + + norm + New York + node + PRON + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + +