From 6757882d4ac8a816cf4c58a96f3c67eb686cf4ed Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 22:21:03 +0200 Subject: [PATCH 1/5] new test data --- .../import/exmaralda/test_doc.exb | 52 +++++++++++++++++++ .../import/exmaralda/test_file.wav | 0 2 files changed, 52 insertions(+) create mode 100644 tests/data/import/exmaralda/valid-no-timevalues/import/exmaralda/test_doc.exb create mode 100644 tests/data/import/exmaralda/valid-no-timevalues/import/exmaralda/test_file.wav diff --git a/tests/data/import/exmaralda/valid-no-timevalues/import/exmaralda/test_doc.exb b/tests/data/import/exmaralda/valid-no-timevalues/import/exmaralda/test_doc.exb new file mode 100644 index 00000000..5d41eb53 --- /dev/null +++ b/tests/data/import/exmaralda/valid-no-timevalues/import/exmaralda/test_doc.exb @@ -0,0 +1,52 @@ + + + + +dipl + + + + + + +was late for elicitation + + +norm + + + + + +personal-anno-value-1personal-anno-value-2 +was on time + + + + + + + + + + + +I'm +in +New +York +I +am +in +New York +1 +I +be +in +New York +PRON +VERB +ADP +PRON + + diff --git a/tests/data/import/exmaralda/valid-no-timevalues/import/exmaralda/test_file.wav b/tests/data/import/exmaralda/valid-no-timevalues/import/exmaralda/test_file.wav new file mode 100644 index 00000000..e69de29b From 2e28930228dfc532f8fb0482dd26c812658c1da9 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 22:27:26 +0200 Subject: [PATCH 2/5] fixed bug and added test --- src/importer/exmaralda/mod.rs | 26 ++++++-------------------- src/importer/exmaralda/tests.rs | 16 +++++++++++++--- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/src/importer/exmaralda/mod.rs b/src/importer/exmaralda/mod.rs index 327edcec..da2accc7 100644 --- a/src/importer/exmaralda/mod.rs +++ b/src/importer/exmaralda/mod.rs @@ -93,8 +93,7 @@ impl ImportEXMARaLDA { let mut speaker_map = BTreeMap::new(); let mut parent_map: BTreeMap> = BTreeMap::new(); let mut already_defined: BTreeSet = BTreeSet::new(); - let 
mut named_orderings: BTreeMap, String)>> = - BTreeMap::new(); + let mut named_orderings: BTreeMap> = BTreeMap::new(); let mut tlis = Vec::new(); // reader let f = File::open(document_path)?; @@ -441,18 +440,6 @@ impl ImportEXMARaLDA { "{}#{}_{}_{}-{}", doc_node_name, tier_type, speaker_id, start_id, end_id ); // this is not a unique id as not intended to be - let start_time = if let Some((Some(t), _)) = timeline.get(key) { - t - } else { - if let Some(sender) = tx { - let msg = format!( - "Could not determine start time of event {}::{}:{}-{}. Event will be skipped.", - &speaker_id, &anno_name, &start_id, &end_id - ); - sender.send(StatusMessage::Warning(msg))?; - } - continue; - }; if !already_defined.contains(&node_name) { update.add_event(UpdateEvent::AddNode { node_name: node_name.to_string(), @@ -492,7 +479,9 @@ impl ImportEXMARaLDA { } continue; }; - if let Some((Some(end_time), _)) = node_tpl { + if let (Some((Some(start_time), _)), Some((Some(end_time), _))) = + (timeline.get(key), node_tpl) + { update.add_event(UpdateEvent::AddNodeLabel { node_name: node_name.to_string(), anno_ns: ANNIS_NS.to_string(), @@ -517,7 +506,7 @@ impl ImportEXMARaLDA { anno_value: text.to_string(), })?; // order nodes - let order_tpl = (*start_time, node_name.to_string()); + let order_tpl = (start_i, node_name.to_string()); match named_orderings.entry(anno_name.to_string()) { std::collections::btree_map::Entry::Vacant(e) => { e.insert(vec![order_tpl]); @@ -574,10 +563,7 @@ impl ImportEXMARaLDA { // build order relations for (name, node_name_vec) in named_orderings { let mut prev = None; - for (_, node_name) in node_name_vec - .into_iter() - .sorted_by(|a, b| a.0.total_cmp(&b.0)) - { + for (_, node_name) in node_name_vec.into_iter().sorted_by(|a, b| a.0.cmp(&b.0)) { if let Some(source) = prev { update.add_event(UpdateEvent::AddEdge { source_node: source, diff --git a/src/importer/exmaralda/tests.rs b/src/importer/exmaralda/tests.rs index 56bde523..7e88435f 100644 --- 
a/src/importer/exmaralda/tests.rs +++ b/src/importer/exmaralda/tests.rs @@ -236,21 +236,31 @@ fn invalid_fail() { #[test] fn import() { let r = run_test("./tests/data/import/exmaralda/clean/import/", 0); - assert_eq!(r.is_ok(), true, "Probing core test result {:?}", r); + assert!(r.is_ok(), "Probing core test result {:?}", r); assert_snapshot!(r.unwrap()); } #[test] fn broken_audio_pass() { let r = run_test("./tests/data/import/exmaralda/broken_audio/import/", 1); - assert_eq!(r.is_ok(), true, "Probing core test result {:?}", r); + assert!(r.is_ok(), "Probing core test result {:?}", r); assert_snapshot!(r.unwrap()); } #[test] fn missing_type_attr_pass() { let r = run_test("./tests/data/import/exmaralda/pass-no_tier_type/import/", 9); - assert_eq!(r.is_ok(), true, "Probing core test result {:?}", r); + assert!(r.is_ok(), "Probing core test result {:?}", r); + assert_snapshot!(r.unwrap()); +} + +#[test] +fn sparse_timeline_pass() { + let r = run_test( + "./tests/data/import/exmaralda/valid-no-timevalues/import/", + 0, + ); + assert!(r.is_ok(), "Probing core test result {:?}", r); assert_snapshot!(r.unwrap()); } From 996b227d913dafe844ec48bba7cec51f1a9f74a3 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 22:29:38 +0200 Subject: [PATCH 3/5] update --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ff9e81d..91db59c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `exmaralda` import now ranks order of tlis higher than sorting by time value (more compatible with modern EXMARaLDA files) +### Fixed + +- `exmaralda` import keeps events with missing time values + ## [0.15.0] - 2024-08-14 ## [0.15.0] - 2024-08-14 From 31099aa279a419570d76366742ca1c3172626ab6 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 22:29:49 +0200 Subject: [PATCH 4/5] update --- docs/README.md | 2 +- 
docs/exporters/conllu.md | 151 +++++++++++++++++++++++++++++++++++++++ docs/exporters/table.md | 10 +++ 3 files changed, 162 insertions(+), 1 deletion(-) create mode 100644 docs/exporters/conllu.md diff --git a/docs/README.md b/docs/README.md index de7e3e5d..d453750e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,5 +1,5 @@ | Type | Modules | |------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | Import formats | [conllu](importers/conllu.md), [exmaralda](importers/exmaralda.md), [graphml](importers/graphml.md), [meta](importers/meta.md), [none](importers/none.md), [opus](importers/opus.md), [path](importers/path.md), [ptb](importers/ptb.md), [relannis](importers/relannis.md), [saltxml](importers/saltxml.md), [textgrid](importers/textgrid.md), [toolbox](importers/toolbox.md), [treetagger](importers/treetagger.md), [xlsx](importers/xlsx.md), [xml](importers/xml.md) | -| Export formats | [graphml](exporters/graphml.md), [exmaralda](exporters/exmaralda.md), [sequence](exporters/sequence.md), [table](exporters/table.md), [textgrid](exporters/textgrid.md), [xlsx](exporters/xlsx.md) | +| Export formats | [conllu](exporters/conllu.md), [graphml](exporters/graphml.md), [exmaralda](exporters/exmaralda.md), [sequence](exporters/sequence.md), [table](exporters/table.md), [textgrid](exporters/textgrid.md), [xlsx](exporters/xlsx.md) | | Graph operations | [check](graph_ops/check.md), [collapse](graph_ops/collapse.md), [filter](graph_ops/filter.md), [visualize](graph_ops/visualize.md), [enumerate](graph_ops/enumerate.md), [link](graph_ops/link.md), 
[map](graph_ops/map.md), [revise](graph_ops/revise.md), [chunk](graph_ops/chunk.md), [split](graph_ops/split.md), [none](graph_ops/none.md) | \ No newline at end of file diff --git a/docs/exporters/conllu.md b/docs/exporters/conllu.md new file mode 100644 index 00000000..8875db13 --- /dev/null +++ b/docs/exporters/conllu.md @@ -0,0 +1,151 @@ +# conllu (exporter) + +This module exports a graph in CoNLL-U format. + +## Configuration + +### doc + +This key is used to determine nodes whose part-of subgraph constitutes a document, i.e. the entire input for a file. +Default is `annis::doc`, or `{ ns = "annis", name = "doc" }`. + +Example: +```toml +[export.config] +doc = "annis::doc" +``` + +### groupby + +This optional annotation key is used to identify annotation spans that constitute a sentence. +Default is none, so no sentence blocks are exported. + +Example: +```toml +[export.config] +groupby = "norm::sentence" +``` + +### ordering + +The nodes connected by this annotation component are used as nodes defining a line in a CoNLL-U file. Usually you want to use an ordering. +Default is `{ ctype = "Ordering", layer = "annis", name = "" }`. + +Example: +```toml +[export.config] +ordering = { ctype = "Ordering", layer = "annis", name = "norm" } +``` + +### form + +This annotation key is used to write the form column. +Default is `{ ns = "annis", name = "tok" }`. + +Example: +```toml +[export.config] +form = { ns = "norm", name = "norm" } +``` + +### lemma + +This annotation key is used to write the lemma column. +Default is `{ ns = "", name = "tok" }`. + +Example: +```toml +[export.config] +lemma = { ns = "norm", name = "lemma" } +``` + +### upos + +This annotation key is used to write the upos column. +Default is `{ ns = "", name = "upos" }`. + +Example: +```toml +[export.config] +upos = { ns = "norm", name = "pos" } +``` + +### xpos + +This annotation key is used to write the xpos column. 
+Default is `{ ns = "", name = "xpos" }`. + +Example: +```toml +[export.config] +xpos = { ns = "norm", name = "pos_spec" } +``` + +### features + +This list of annotation keys will be represented in the feature column. +Default is the empty list. + +Example: +```toml +[export.config] +features = ["Animacy", "Tense", "VerbClass"] +``` + +### dependency_component + +The nodes connected by this annotation component are used to export dependencies. +Default is none, so nothing will be exported. + +Example: +```toml +[export.config] +dependency_component = { ctype = "Pointing", layer = "", name = "dependencies" } +``` + +### dependency_anno + +This annotation key is used to write the dependency relation, which will be looked for on the dependency edges. +Default is none, so nothing will be exported. + +Example: +```toml +[export.config] +dependency_anno = { ns = "", name = "deprel" } +``` + +### enhanced_components + +The listed components will be used to export enhanced dependencies. More than +one component can be listed. +Default is the empty list, so nothing will be exported. + +Example: +```toml +[export.config] +enhanced_components = [{ ctype = "Pointing", layer = "", name = "dependencies" }] +``` + +### enhanced_annos + +This list defines the annotation keys that correspond to the +edge labels in the components listed in `enhanced_components`. The i-th element of +one list belongs to the i-th element in the other list. Default is the empty list. + +Example: +```toml +[export.config] +enhanced_annos = ["func"] +``` + +### misc + +This list of annotation keys will be represented in the misc column. +Default is the empty list. 
+ +Example: +```toml +[export.config] +misc = ["NoSpaceAfter", "Referent"] +``` + diff --git a/docs/exporters/table.md b/docs/exporters/table.md index 54b7966f..186f8b4b 100644 --- a/docs/exporters/table.md +++ b/docs/exporters/table.md @@ -39,6 +39,16 @@ Example: quote_char = "\"" ``` +### no_value + +Provides the string sequence used for n/a. Default is the empty string. + +Example: +```toml +[export.config] +no_value = "n/a" +``` + ### ingoing By listing annotation components, the ingoing edges of that component and their annotations From a41c6e3c0fc8aad4358a34208900096a3909921c Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Tue, 20 Aug 2024 22:30:53 +0200 Subject: [PATCH 5/5] new test snapshot --- ...xmaralda__tests__sparse_timeline_pass.snap | 261 ++++++++++++++++++ 1 file changed, 261 insertions(+) create mode 100644 src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__sparse_timeline_pass.snap diff --git a/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__sparse_timeline_pass.snap b/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__sparse_timeline_pass.snap new file mode 100644 index 00000000..21f56b73 --- /dev/null +++ b/src/importer/exmaralda/snapshots/annatto__importer__exmaralda__tests__sparse_timeline_pass.snap @@ -0,0 +1,261 @@ +--- +source: src/importer/exmaralda/tests.rs +expression: r.unwrap() +--- + + + + + + + + + + + + + + + + + + + + + + + + + corpus + + + corpus + + + dipl + norm + corpus + personal-anno-value-1 + personal-anno-value-2 + was late for elicitation + was on time + test_doc + eng + deu + eng,eng + + + file + tests/data/import/exmaralda/valid-no-timevalues/import/exmaralda/test_file.wav + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + dipl + node + I'm + I'm + + + dipl + node + in + in + + + dipl + node + New + New + + + dipl + node + 4.44444-5.55555 + York + York + + + norm + node + I + I + + + norm + node + am + am + + + norm + node + in 
+ in + + + norm + node + New York + New York + + + dipl + node + 1 + 0-5.55555 + + + norm + I + node + PRON + + + norm + be + node + VERB + + + norm + in + node + ADP + + + norm + New York + node + PRON + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +