From 55c5a3c164fc4c935646c256b567d64ff18b2eac Mon Sep 17 00:00:00 2001
From: Thomas Krause
Date: Wed, 19 Jun 2024 15:43:30 +0200
Subject: [PATCH 01/61] Add empty SaltXML importer

---
 src/importer/mod.rs     |  1 +
 src/importer/saltxml.rs | 26 ++++++++++++++++++++++++++
 2 files changed, 27 insertions(+)
 create mode 100644 src/importer/saltxml.rs

diff --git a/src/importer/mod.rs b/src/importer/mod.rs
index c1217c88..0669fd42 100644
--- a/src/importer/mod.rs
+++ b/src/importer/mod.rs
@@ -8,6 +8,7 @@ pub mod none;
 pub mod opus;
 pub mod ptb;
 pub mod relannis;
+pub mod saltxml;
 pub mod textgrid;
 pub mod toolbox;
 pub mod treetagger;
diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs
new file mode 100644
index 00000000..448646b7
--- /dev/null
+++ b/src/importer/saltxml.rs
@@ -0,0 +1,26 @@
+use documented::{Documented, DocumentedFields};
+use serde::Deserialize;
+use struct_field_names_as_array::FieldNamesAsSlice;
+
+use super::Importer;
+
+/// Imports the SaltXML format used by Pepper ().
+/// SaltXML is an XMI serialization of the [Salt model](https://raw.githubusercontent.com/korpling/salt/master/gh-site/doc/salt_modelGuide.pdf).
+#[derive(Default, Deserialize, Documented, DocumentedFields, FieldNamesAsSlice)]
+#[serde(default, deny_unknown_fields)]
+pub struct ImportSaltXml {}
+
+impl Importer for ImportSaltXml {
+    fn import_corpus(
+        &self,
+        input_path: &std::path::Path,
+        step_id: crate::StepID,
+        tx: Option,
+    ) -> Result> {
+        todo!()
+    }
+
+    fn file_extensions(&self) -> &[&str] {
+        todo!()
+    }
+}

From 4e17f80c2dbb241a4737a12736b234fe71719ee4 Mon Sep 17 00:00:00 2001
From: Thomas Krause
Date: Tue, 25 Jun 2024 12:26:19 +0200
Subject: [PATCH 02/61] Add empty SaltXML exporter

---
 src/exporter/mod.rs     |  1 +
 src/exporter/saltxml.rs | 27 +++++++++++++++++++++++++++
 src/importer/saltxml.rs |  6 +++---
 src/lib.rs              | 12 ++++++++++--
 4 files changed, 41 insertions(+), 5 deletions(-)
 create mode 100644 src/exporter/saltxml.rs

diff --git a/src/exporter/mod.rs b/src/exporter/mod.rs
index 4c48261e..09842b3e 100644
--- a/src/exporter/mod.rs
+++ b/src/exporter/mod.rs
@@ -2,6 +2,7 @@
 pub mod exmaralda;
 pub mod graphml;
+pub mod saltxml;
 pub mod sequence;
 pub mod xlsx;
diff --git a/src/exporter/saltxml.rs b/src/exporter/saltxml.rs
new file mode 100644
index 00000000..1b0e1bb1
--- /dev/null
+++ b/src/exporter/saltxml.rs
@@ -0,0 +1,27 @@
+use documented::{Documented, DocumentedFields};
+use serde::Deserialize;
+use struct_field_names_as_array::FieldNamesAsSlice;
+
+use super::Exporter;
+
+/// Exports to the SaltXML format used by Pepper ().
+/// SaltXML is an XMI serialization of the [Salt model](https://raw.githubusercontent.com/korpling/salt/master/gh-site/doc/salt_modelGuide.pdf).
+#[derive(Default, Deserialize, Documented, DocumentedFields, FieldNamesAsSlice)] +#[serde(default, deny_unknown_fields)] +pub struct SaltXmlExporter {} + +impl Exporter for SaltXmlExporter { + fn export_corpus( + &self, + _graph: &graphannis::AnnotationGraph, + _output_path: &std::path::Path, + _step_id: crate::StepID, + _tx: Option, + ) -> Result<(), Box> { + todo!() + } + + fn file_extension(&self) -> &str { + todo!() + } +} diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index 448646b7..6429330b 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -13,9 +13,9 @@ pub struct ImportSaltXml {} impl Importer for ImportSaltXml { fn import_corpus( &self, - input_path: &std::path::Path, - step_id: crate::StepID, - tx: Option, + _input_path: &std::path::Path, + _step_id: crate::StepID, + _tx: Option, ) -> Result> { todo!() } diff --git a/src/lib.rs b/src/lib.rs index ec45f2ca..2d34ad8c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,8 +21,9 @@ use exporter::{ use importer::{ conllu::ImportCoNLLU, exmaralda::ImportEXMARaLDA, file_nodes::CreateFileNodes, graphml::GraphMLImporter, meta::AnnotateCorpus, none::CreateEmptyCorpus, opus::ImportOpusLinks, - ptb::ImportPTB, relannis::ImportRelAnnis, textgrid::ImportTextgrid, toolbox::ImportToolBox, - treetagger::ImportTreeTagger, xlsx::ImportSpreadsheet, xml::ImportXML, Importer, + ptb::ImportPTB, relannis::ImportRelAnnis, saltxml::ImportSaltXml, textgrid::ImportTextgrid, + toolbox::ImportToolBox, treetagger::ImportTreeTagger, xlsx::ImportSpreadsheet, xml::ImportXML, + Importer, }; use manipulator::{ check::Check, chunker::Chunk, collapse::Collapse, enumerate::EnumerateMatches, link::LinkNodes, @@ -128,6 +129,7 @@ pub enum ReadFrom { Path(#[serde(default)] CreateFileNodes), PTB(#[serde(default)] ImportPTB), RelAnnis(#[serde(default)] ImportRelAnnis), + SaltXml(#[serde(default)] ImportSaltXml), TextGrid(#[serde(default)] ImportTextgrid), Toolbox(#[serde(default)] ImportToolBox), TreeTagger(#[serde(default)] ImportTreeTagger), @@ -154,6 +156,7 @@ impl ReadFrom { ReadFrom::Path(m) => m, ReadFrom::PTB(m) => m, ReadFrom::RelAnnis(m) => m, + ReadFrom::SaltXml(m) => m, ReadFrom::TextGrid(m) => m, ReadFrom::Toolbox(m) => m, ReadFrom::TreeTagger(m) => m, @@ -175,6 +178,7 @@ impl ReadFromDiscriminants { ReadFromDiscriminants::Path => CreateFileNodes::DOCS, ReadFromDiscriminants::PTB => ImportPTB::DOCS, ReadFromDiscriminants::RelAnnis => ImportRelAnnis::DOCS, + ReadFromDiscriminants::SaltXml => ImportSaltXml::DOCS, ReadFromDiscriminants::TextGrid => ImportTextgrid::DOCS, ReadFromDiscriminants::Toolbox => ImportToolBox::DOCS, ReadFromDiscriminants::TreeTagger => ImportTreeTagger::DOCS, @@ -235,6 +239,10 @@ impl ReadFromDiscriminants { ImportRelAnnis::FIELD_NAMES_AS_SLICE, ImportRelAnnis::FIELD_DOCS, ), + ReadFromDiscriminants::SaltXml => ( + ImportSaltXml::FIELD_NAMES_AS_SLICE, + ImportSaltXml::FIELD_DOCS, + ), }; for (idx, n) in field_names.iter().enumerate() { if idx < field_docs.len() { From 88b1622408e701870cfb109a19717feb86082728 Mon Sep 17 00:00:00 2001 From: thomaskrause Date: Wed, 26 Jun 2024 08:39:14 +0000 Subject: [PATCH 03/61] Apply automatic changes --- docs/README.md | 10 +++++----- docs/importers/saltxml.md | 6 ++++++ 2 files changed, 11 insertions(+), 5 deletions(-) create mode 100644 docs/importers/saltxml.md diff --git a/docs/README.md b/docs/README.md index 2385cafa..fc16cfe2 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,5 +1,5 @@ -| Type | Modules | 
-|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Import formats | [conllu](importers/conllu.md), [exmaralda](importers/exmaralda.md), [graphml](importers/graphml.md), [meta](importers/meta.md), [none](importers/none.md), [opus](importers/opus.md), [path](importers/path.md), [ptb](importers/ptb.md), [relannis](importers/relannis.md), [textgrid](importers/textgrid.md), [toolbox](importers/toolbox.md), [treetagger](importers/treetagger.md), [xlsx](importers/xlsx.md), [xml](importers/xml.md) | -| Export formats | [graphml](exporters/graphml.md), [exmaralda](exporters/exmaralda.md), [sequence](exporters/sequence.md), [xlsx](exporters/xlsx.md) | -| Graph operations | [check](graph_ops/check.md), [collapse](graph_ops/collapse.md), [enumerate](graph_ops/enumerate.md), [link](graph_ops/link.md), [map](graph_ops/map.md), [merge](graph_ops/merge.md), [revise](graph_ops/revise.md), [chunk](graph_ops/chunk.md), [split](graph_ops/split.md), [none](graph_ops/none.md) | \ No newline at end of file +| Type | Modules | +|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Import formats | [conllu](importers/conllu.md), [exmaralda](importers/exmaralda.md), [graphml](importers/graphml.md), [meta](importers/meta.md), [none](importers/none.md), [opus](importers/opus.md), [path](importers/path.md), [ptb](importers/ptb.md), [relannis](importers/relannis.md), [saltxml](importers/saltxml.md), [textgrid](importers/textgrid.md), [toolbox](importers/toolbox.md), [treetagger](importers/treetagger.md), [xlsx](importers/xlsx.md), [xml](importers/xml.md) | +| Export formats | [graphml](exporters/graphml.md), [exmaralda](exporters/exmaralda.md), [sequence](exporters/sequence.md), [xlsx](exporters/xlsx.md) | +| Graph operations | [check](graph_ops/check.md), [collapse](graph_ops/collapse.md), [enumerate](graph_ops/enumerate.md), [link](graph_ops/link.md), [map](graph_ops/map.md), [merge](graph_ops/merge.md), [revise](graph_ops/revise.md), [chunk](graph_ops/chunk.md), [split](graph_ops/split.md), [none](graph_ops/none.md) | \ No newline at end of file diff --git a/docs/importers/saltxml.md b/docs/importers/saltxml.md new file mode 100644 index 00000000..0fcc3242 --- /dev/null +++ b/docs/importers/saltxml.md @@ -0,0 +1,6 @@ +# saltxml (importer) + +Imports the SaltXML format used by Pepper (). +SaltXML is an XMI serialization of the [Salt model](https://raw.githubusercontent.com/korpling/salt/master/gh-site/doc/salt_modelGuide.pdf). 
+ +*No Configuration* From 2012b686c293c37da6a6fe0820489c1f6325d810 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 26 Jun 2024 11:06:29 +0200 Subject: [PATCH 04/61] Update the snapshot test to include the new SaltXML modules --- src/lib.rs | 11 +++++++++-- tests/snapshots/cli__list_modules.snap | 12 +++++++----- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2d34ad8c..b1247ce2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,8 +15,8 @@ use std::{fmt::Display, path::PathBuf}; use documented::{Documented, DocumentedFields}; use error::Result; use exporter::{ - exmaralda::ExportExmaralda, graphml::GraphMLExporter, sequence::ExportSequence, - xlsx::XlsxExporter, Exporter, + exmaralda::ExportExmaralda, graphml::GraphMLExporter, saltxml::SaltXmlExporter, + sequence::ExportSequence, xlsx::XlsxExporter, Exporter, }; use importer::{ conllu::ImportCoNLLU, exmaralda::ImportEXMARaLDA, file_nodes::CreateFileNodes, @@ -47,6 +47,7 @@ pub struct ModuleConfiguration { pub enum WriteAs { GraphML(#[serde(default)] GraphMLExporter), // the purpose of serde(default) here is, that an empty `[export.config]` table can be omited EXMARaLDA(#[serde(default)] ExportExmaralda), + SaltXml(#[serde(default)] SaltXmlExporter), Sequence(#[serde(default)] ExportSequence), Xlsx(#[serde(default)] XlsxExporter), } @@ -63,6 +64,7 @@ impl WriteAs { match self { WriteAs::GraphML(m) => m, WriteAs::EXMARaLDA(m) => m, + WriteAs::SaltXml(m) => m, WriteAs::Sequence(m) => m, WriteAs::Xlsx(m) => m, } @@ -74,6 +76,7 @@ impl WriteAsDiscriminants { match self { WriteAsDiscriminants::GraphML => GraphMLExporter::DOCS, WriteAsDiscriminants::EXMARaLDA => ExportExmaralda::DOCS, + WriteAsDiscriminants::SaltXml => SaltXmlExporter::DOCS, WriteAsDiscriminants::Sequence => ExportSequence::DOCS, WriteAsDiscriminants::Xlsx => XlsxExporter::DOCS, } @@ -90,6 +93,10 @@ impl WriteAsDiscriminants { ExportExmaralda::FIELD_NAMES_AS_SLICE, ExportExmaralda::FIELD_DOCS, ), + WriteAsDiscriminants::SaltXml => ( + SaltXmlExporter::FIELD_NAMES_AS_SLICE, + SaltXmlExporter::FIELD_DOCS, + ), WriteAsDiscriminants::Sequence => ( ExportSequence::FIELD_NAMES_AS_SLICE, ExportSequence::FIELD_DOCS, diff --git a/tests/snapshots/cli__list_modules.snap b/tests/snapshots/cli__list_modules.snap index bb7d7434..572978d2 100644 --- a/tests/snapshots/cli__list_modules.snap +++ b/tests/snapshots/cli__list_modules.snap @@ -2,10 +2,12 @@ source: tests/cli.rs expression: output --- -| Type | Modules | -|------------------|-------------------------------------------------------------------------------------------------------------| -| Import formats | conllu, exmaralda, graphml, meta, none, opus, path, ptb, relannis, textgrid, toolbox, treetagger, xlsx, xml | -| Export formats | graphml, exmaralda, sequence, xlsx | -| Graph operations | check, collapse, enumerate, link, map, merge, revise, chunk, split, none | +| Type | Modules | +|------------------|----------------------------------------------------------------------------------------------------------------------| +| Import formats | conllu, exmaralda, graphml, meta, none, opus, path, ptb, relannis, saltxml, textgrid, toolbox, treetagger, xlsx, xml | +| Export formats | graphml, exmaralda, saltxml, sequence, xlsx | +| Graph operations | check, collapse, enumerate, link, map, merge, revise, chunk, split, none | Use `annatto info ` to get more information about one of the formats or graph operations. 
+ + From 91e94e3548e512af6b22b09591d21d63ce5829fd Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 26 Jun 2024 11:06:44 +0200 Subject: [PATCH 05/61] Update quick-xml dependency --- Cargo.toml | 2 +- src/importer/graphml.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0c2bfdff..ee2c45f6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ pathdiff = "0.2" percent-encoding = "2.3.1" pest = "2.7" pest_derive = "2.0" -quick-xml = "0.31" +quick-xml = "0.34" rayon = "1.1" regex = "1.4" serde = "1.0" diff --git a/src/importer/graphml.rs b/src/importer/graphml.rs index db72a173..73a9ce8a 100644 --- a/src/importer/graphml.rs +++ b/src/importer/graphml.rs @@ -136,7 +136,7 @@ fn read_graphml( edge_updates: &mut GraphUpdate, ) -> Result, AnnattoError> { let mut reader = Reader::from_reader(input); - reader.expand_empty_elements(true); + reader.config_mut().expand_empty_elements = true; let mut buf = Vec::new(); From dc1ae071c119ac49bf6bdca155b300d70a602129 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 26 Jun 2024 14:20:55 +0200 Subject: [PATCH 06/61] Start mapping the Salt corpus structure. --- src/importer/saltxml.rs | 39 +- src/importer/saltxml/mapper.rs | 150 +++++++ ...ltxml__tests__read_salt_sample_corpus.snap | 52 +++ src/importer/saltxml/tests.rs | 18 + src/util.rs | 1 + src/util/xml.rs | 82 ++++ .../rootCorpus/subCorpus1/doc1.salt | 369 ++++++++++++++++++ .../rootCorpus/subCorpus1/doc2.salt | 369 ++++++++++++++++++ .../rootCorpus/subCorpus2/doc3.salt | 369 ++++++++++++++++++ .../rootCorpus/subCorpus2/doc4.salt | 369 ++++++++++++++++++ .../salt/SaltSampleCorpus/saltProject.salt | 63 +++ 11 files changed, 1876 insertions(+), 5 deletions(-) create mode 100644 src/importer/saltxml/mapper.rs create mode 100644 src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap create mode 100644 src/importer/saltxml/tests.rs create mode 100644 src/util/xml.rs create mode 100644 tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt create mode 100644 tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt create mode 100644 tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt create mode 100644 tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt create mode 100644 tests/data/import/salt/SaltSampleCorpus/saltProject.salt diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index 6429330b..f5f05be0 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -1,7 +1,12 @@ +use std::fs::File; + use documented::{Documented, DocumentedFields}; +use graphannis::update::GraphUpdate; use serde::Deserialize; use struct_field_names_as_array::FieldNamesAsSlice; +use crate::progress::ProgressReporter; + use super::Importer; /// Imports the SaltXML format used by Pepper (). 
@@ -13,14 +18,38 @@ pub struct ImportSaltXml {} impl Importer for ImportSaltXml { fn import_corpus( &self, - _input_path: &std::path::Path, - _step_id: crate::StepID, - _tx: Option, + input_path: &std::path::Path, + step_id: crate::StepID, + tx: Option, ) -> Result> { - todo!() + let mut updates = GraphUpdate::new(); + // Start with an undetermined progress reporter + let reporter = ProgressReporter::new_unknown_total_work(tx.clone(), step_id.clone())?; + let mut mapper = mapper::SaltXmlMapper::new(reporter); + + // Read the corpus structure from the Salt project and get the number of documents to create + mapper.reporter.info("Reading SaltXML project structure")?; + let mut project_file = File::open(input_path.join("saltProject.salt"))?; + let documents = mapper.map_corpus_structure(&mut project_file, &mut updates)?; + + // Create a new progress reporter that can now estimate the work based on the number of documents + mapper.reporter = ProgressReporter::new(tx, step_id, documents.len())?; + for (document_node_name, document_path) in documents.iter() { + mapper.reporter.info("Reading document {document_path}")?; + let mut document_file = File::open(document_path)?; + mapper.read_document(&mut document_file, document_node_name, &mut updates)?; + mapper.reporter.worked(1)?; + } + + Ok(updates) } fn file_extensions(&self) -> &[&str] { - todo!() + &[] } } + +mod mapper; + +#[cfg(test)] +mod tests; diff --git a/src/importer/saltxml/mapper.rs b/src/importer/saltxml/mapper.rs new file mode 100644 index 00000000..85df1f57 --- /dev/null +++ b/src/importer/saltxml/mapper.rs @@ -0,0 +1,150 @@ +use std::{collections::BTreeMap, convert::TryFrom, io::BufReader, path::PathBuf}; + +use anyhow::{anyhow, Ok}; +use graphannis::update::{GraphUpdate, UpdateEvent}; +use quick_xml::{ + events::{attributes::Attributes, BytesStart}, + Reader, +}; + +use crate::{ + progress::ProgressReporter, + util::xml::{consume_start_tag_with_name, get_attribute_by_local_name, get_attribute_by_qname}, +}; + +#[derive(Clone)] +enum SaltType { + Corpus, + Document, + ElementId, + Feature, + CorpusRelation, + DocumentRelation, +} + +impl<'a> TryFrom> for SaltType { + type Error = anyhow::Error; + + fn try_from(value: Attributes<'a>) -> Result { + // Use the xsi:type attribute to determine the type + if let Some(type_id) = get_attribute_by_qname(value, "xsi", "type")? { + match type_id.as_str() { + "sCorpusStructure:SCorpus" => Ok(SaltType::Corpus), + "sCorpusStructure:SDocument" => Ok(SaltType::Document), + "saltCore:SElementId" => Ok(SaltType::ElementId), + "saltCore:SFeature" => Ok(SaltType::Feature), + "sCorpusStructure:SCorpusRelation" => Ok(SaltType::CorpusRelation), + "sCorpusStructure:SCorpusDocumentRelation" => Ok(SaltType::DocumentRelation), + _ => Err(anyhow!("Unknown Salt type {type_id}")), + } + } else { + Err(anyhow!("Missing attribute xsi:type")) + } + } +} + +fn get_label(e: &BytesStart) -> anyhow::Result<(String, String, SaltObject)> { + let namespace = get_attribute_by_local_name(e.attributes(), "namespace")? + .ok_or_else(|| anyhow!("Missing \"namespace\" attribute for label"))?; + let name = get_attribute_by_local_name(e.attributes(), "name")? + .ok_or_else(|| anyhow!("Missing \"name\" attribute for label"))?; + let value = get_attribute_by_local_name(e.attributes(), "value")? 
+ .ok_or_else(|| anyhow!("Missing \"value\" attribute for label"))?; + let value = SaltObject::try_from(value.as_str())?; + Ok((namespace, name, value)) +} + +enum SaltObject { + Text(String), +} + +impl TryFrom<&str> for SaltObject { + type Error = anyhow::Error; + + fn try_from(value: &str) -> Result { + if value.starts_with("T::") { + Ok(SaltObject::Text(value[3..].to_string())) + } else { + Err(anyhow!("Could not create Salt object from \"{value}\"")) + } + } +} + +pub(crate) struct SaltXmlMapper { + pub(crate) reporter: ProgressReporter, +} + +impl SaltXmlMapper { + pub(crate) fn new(reporter: ProgressReporter) -> SaltXmlMapper { + SaltXmlMapper { reporter } + } + + pub(crate) fn map_corpus_structure( + &self, + input: &mut R, + updates: &mut GraphUpdate, + ) -> anyhow::Result> { + let input = BufReader::new(input); + let mut reader = Reader::from_reader(input); + reader.config_mut().expand_empty_elements = true; + + let mut buf = Vec::new(); + + // Consume the root SaltProject and sCorpusGraphs XML elements, which do not have the "xsi:type" attribute + consume_start_tag_with_name(&mut reader, "SaltProject")?; + consume_start_tag_with_name(&mut reader, "sCorpusGraphs")?; + + // TODO: map corpus graph labels + + // Iterate over all child elements of the corpus graph, which are the corpus and document nodes + let result = BTreeMap::new(); + let mut salt_type_stack = Vec::new(); + let mut current_element_id = None; + //let mut features = Vec::new(); + loop { + match reader.read_event_into(&mut buf)? { + quick_xml::events::Event::Start(e) => { + let salt_type = SaltType::try_from(e.attributes())?; + salt_type_stack.push(salt_type.clone()); + + match salt_type { + SaltType::ElementId => { + current_element_id = None; + + let (namespace, name, value) = get_label(&e)?; + if namespace == "salt" && name == "id" { + if let SaltObject::Text(id) = value { + current_element_id = Some(id); + } + } + } + _ => {} + } + } + quick_xml::events::Event::End(_e) => { + if let Some(_salt_type) = salt_type_stack.pop() { + // Create the element with the collected properties + updates.add_event(UpdateEvent::AddNode { + node_name: current_element_id.clone().ok_or_else(|| { + anyhow!("Missing element ID for corpus graph node") + })?, + node_type: "corpus".into(), + })?; + } + } + quick_xml::events::Event::Eof => break, + _ => {} + } + } + Ok(result) + } + + pub(crate) fn read_document( + &self, + _input: &mut R, + _document_node_name: &str, + _updates: &mut GraphUpdate, + ) -> anyhow::Result<()> { + Ok(()) + } +} diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap new file mode 100644 index 00000000..ecd68dc7 --- /dev/null +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -0,0 +1,52 @@ +--- +source: src/importer/saltxml/tests.rs +expression: actual +--- + + + + + + corpus + + + corpus + + + corpus + + + corpus + + + corpus + + + corpus + + + corpus + + + corpus + + + corpus + + + corpus + + + corpus + + + corpus + + + corpus + + + corpus + + + diff --git a/src/importer/saltxml/tests.rs b/src/importer/saltxml/tests.rs new file mode 100644 index 00000000..b78b896d --- /dev/null +++ b/src/importer/saltxml/tests.rs @@ -0,0 +1,18 @@ +use std::path::Path; + +use insta::assert_snapshot; + +use super::*; +use crate::test_util; + +#[test] +fn read_salt_sample_corpus() { + let importer = ImportSaltXml::default(); + let 
actual = test_util::import_as_graphml_string( + importer, + Path::new("tests/data/import/salt/SaltSampleCorpus"), + None, + ) + .unwrap(); + assert_snapshot!(actual); +} diff --git a/src/util.rs b/src/util.rs index 4a453e95..8d259b97 100644 --- a/src/util.rs +++ b/src/util.rs @@ -11,6 +11,7 @@ use std::path::{Path, PathBuf}; pub(crate) mod example_generator; pub(crate) mod graphupdate; pub(crate) mod token_helper; +pub(crate) mod xml; /// Get all files with a given extension in a directory. pub fn get_all_files( diff --git a/src/util/xml.rs b/src/util/xml.rs new file mode 100644 index 00000000..f3574406 --- /dev/null +++ b/src/util/xml.rs @@ -0,0 +1,82 @@ +use std::io::BufRead; + +use anyhow::{anyhow, Result}; +use quick_xml::{ + events::{attributes::Attributes, Event}, + Reader, +}; + +/// Extract an attribute for an XML element by the namespace and name. +pub(crate) fn get_attribute_by_qname<'a>( + attribute_list: Attributes<'a>, + namespace: &str, + name: &str, +) -> Result> { + for att in attribute_list { + let att = att?; + if let Some(prefix) = att.key.prefix() { + if prefix.as_ref() == namespace.as_bytes() + && att.key.local_name().as_ref() == name.as_bytes() + { + let value = String::from_utf8_lossy(&att.value).to_string(); + return Ok(Some(value)); + } + } + } + Ok(None) +} + +/// Extract an attribute for an XML element by the name. +pub(crate) fn get_attribute_by_local_name<'a>( + attribute_list: Attributes<'a>, + name: &str, +) -> Result> { + for att in attribute_list { + let att = att?; + if att.key.local_name().as_ref() == name.as_bytes() { + let value = String::from_utf8_lossy(&att.value).to_string(); + return Ok(Some(value)); + } + } + Ok(None) +} + +/// Read the next event. Will fail if the next event is not a start tag and does +/// not have the given name. All non tag elements (XML declaration, text nodes, +/// comments, ...) are ignored and skipped. +pub(crate) fn consume_start_tag_with_name(reader: &mut Reader, name: &str) -> Result<()> +where + R: BufRead, +{ + let mut buf = Vec::new(); + loop { + let event = reader.read_event_into(&mut buf)?; + let result = match event { + Event::Start(tag) => { + if tag.local_name().as_ref() == name.as_bytes() { + Ok(()) + } else { + Err(anyhow!( + "Expected <{name}> but got <{}>", + String::from_utf8_lossy(tag.local_name().as_ref()) + )) + } + } + Event::End(_) => Err(anyhow!( + "Expected \"<{name}>\" but got closing tag instead." + )), + Event::Empty(_) => Err(anyhow!("Expected \"<{name}>\" but got empty tag instead.")), + + Event::Comment(_) + | Event::Decl(_) + | Event::PI(_) + | Event::DocType(_) + | Event::CData(_) + | Event::Text(_) => continue, + Event::Eof => Err(anyhow!( + "Expected <{name} but the file is already at its end." 
+            )),
+        };
+        return result;
+    }
+}
diff --git a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt
new file mode 100644
index 00000000..886757da
--- /dev/null
+++ b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt
@@ -0,0 +1,369 @@
+[SaltXML sample document content not preserved]
\ No newline at end of file
diff --git a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt
new file mode 100644
index 00000000..886757da
--- /dev/null
+++ b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt
@@ -0,0 +1,369 @@
+[SaltXML sample document content not preserved]
\ No newline at end of file
diff --git a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt
new file mode 100644
index 00000000..427a6179
--- /dev/null
+++ b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt
@@ -0,0 +1,369 @@
+[SaltXML sample document content not preserved]
\ No newline at end of file
diff --git a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt
new file mode 100644
index 00000000..2558b0dc
--- /dev/null
+++
b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt @@ -0,0 +1,369 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/data/import/salt/SaltSampleCorpus/saltProject.salt b/tests/data/import/salt/SaltSampleCorpus/saltProject.salt new file mode 100644 index 00000000..d4ced77b --- /dev/null +++ b/tests/data/import/salt/SaltSampleCorpus/saltProject.salt @@ -0,0 +1,63 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file From a194e61b887ba60b47e4fbe9f7338c6b4cc2748c Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 26 Jun 2024 14:56:16 +0200 Subject: [PATCH 07/61] Map features on documents and corpora as annotations --- src/importer/saltxml/mapper.rs | 67 +++++++++++++--- ...ltxml__tests__read_salt_sample_corpus.snap | 76 +++++++++---------- src/util/xml.rs | 4 +- 3 files changed, 91 insertions(+), 56 deletions(-) diff --git a/src/importer/saltxml/mapper.rs b/src/importer/saltxml/mapper.rs index 85df1f57..9270c2e9 100644 --- a/src/importer/saltxml/mapper.rs +++ b/src/importer/saltxml/mapper.rs @@ -2,6 +2,7 @@ use std::{collections::BTreeMap, convert::TryFrom, io::BufReader, path::PathBuf} use anyhow::{anyhow, Ok}; use graphannis::update::{GraphUpdate, UpdateEvent}; +use graphannis_core::util::{join_qname, split_qname}; use quick_xml::{ events::{attributes::Attributes, BytesStart}, Reader, @@ -56,20 +57,33 @@ fn get_label(e: &BytesStart) -> anyhow::Result<(String, String, SaltObject)> { enum SaltObject { Text(String), + Boolean(bool), } impl TryFrom<&str> for SaltObject { type Error = anyhow::Error; fn try_from(value: &str) -> Result { - if value.starts_with("T::") { - Ok(SaltObject::Text(value[3..].to_string())) + if let Some(value) = value.strip_prefix("T::") { + Ok(SaltObject::Text(value.to_string())) + } else if let Some(_value) = value.strip_prefix("B::") { + let value = value.to_ascii_lowercase() == "true"; + Ok(SaltObject::Boolean(value)) } else { Err(anyhow!("Could not create Salt object from \"{value}\"")) } } } +impl ToString for SaltObject { + fn to_string(&self) -> String { + match self { + SaltObject::Text(val) => val.clone(), + SaltObject::Boolean(val) => val.to_string(), + } + } +} + pub(crate) struct SaltXmlMapper { pub(crate) reporter: ProgressReporter, } @@ -100,7 +114,7 @@ impl SaltXmlMapper { let result = BTreeMap::new(); let mut salt_type_stack = Vec::new(); let mut current_element_id = None; - //let mut features = Vec::new(); + let mut features = BTreeMap::new(); loop { match reader.read_event_into(&mut buf)? 
{ quick_xml::events::Event::Start(e) => { @@ -114,22 +128,51 @@ impl SaltXmlMapper { let (namespace, name, value) = get_label(&e)?; if namespace == "salt" && name == "id" { if let SaltObject::Text(id) = value { - current_element_id = Some(id); + current_element_id = + Some(id.trim_start_matches("salt:/").to_string()); } } } + SaltType::Feature => { + let (namespace, name, value) = get_label(&e)?; + let qname = join_qname(&namespace, &name); + features.insert(qname, value); + } _ => {} } } quick_xml::events::Event::End(_e) => { - if let Some(_salt_type) = salt_type_stack.pop() { - // Create the element with the collected properties - updates.add_event(UpdateEvent::AddNode { - node_name: current_element_id.clone().ok_or_else(|| { - anyhow!("Missing element ID for corpus graph node") - })?, - node_type: "corpus".into(), - })?; + if let Some(salt_type) = salt_type_stack.pop() { + match salt_type { + SaltType::Corpus | SaltType::Document => { + let node_name = current_element_id.clone().ok_or_else(|| { + anyhow!("Missing element ID for corpus graph node") + })?; + // Create the element with the collected properties + updates.add_event(UpdateEvent::AddNode { + node_name: node_name.clone(), + node_type: "corpus".into(), + })?; + + // Add features as annotations + for (feat_qname, value) in features.iter() { + let (annos_ns, anno_name) = split_qname(&feat_qname); + + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.clone(), + anno_ns: annos_ns.unwrap_or_default().to_string(), + anno_name: anno_name.to_string(), + anno_value: value.to_string(), + })?; + } + + // Reset state + features.clear(); + current_element_id = None; + } + + _ => {} + } } } quick_xml::events::Event::Eof => break, diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index ecd68dc7..ae9265d7 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -4,49 +4,41 @@ expression: actual --- - + + + - - corpus - - - corpus - - - corpus - - - corpus - - - corpus - - - corpus - - - corpus - - - corpus - - - corpus - - - corpus - - - corpus - - - corpus - - - corpus - - - corpus + + rootCorpus + corpus + + + subCorpus1 + corpus + + + subCorpus2 + corpus + + + file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt + doc1 + corpus + + + file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt + doc2 + corpus + + + file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt + doc3 + corpus + + + file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt + doc4 + corpus diff --git a/src/util/xml.rs b/src/util/xml.rs index f3574406..a48cee32 100644 --- a/src/util/xml.rs +++ b/src/util/xml.rs @@ -27,8 +27,8 @@ pub(crate) fn get_attribute_by_qname<'a>( } /// Extract an attribute for an XML element by the name. 
-pub(crate) fn get_attribute_by_local_name<'a>(
-    attribute_list: Attributes<'a>,
+pub(crate) fn get_attribute_by_local_name(
+    attribute_list: Attributes,
     name: &str,
 ) -> Result> {
     for att in attribute_list {

From c49132c0260f548626b78d47a31b7a0ac563c721 Mon Sep 17 00:00:00 2001
From: thomaskrause
Date: Wed, 26 Jun 2024 12:58:10 +0000
Subject: [PATCH 08/61] Apply automatic changes

---
 docs/README.md            | 2 +-
 docs/exporters/saltxml.md | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)
 create mode 100644 docs/exporters/saltxml.md

diff --git a/docs/README.md b/docs/README.md
index fc16cfe2..38a95b32 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,5 +1,5 @@
 | Type | Modules |
 |------------------|--------------------------------------------------------------------------------------------------------------------------|
 | Import formats | [conllu](importers/conllu.md), [exmaralda](importers/exmaralda.md), [graphml](importers/graphml.md), [meta](importers/meta.md), [none](importers/none.md), [opus](importers/opus.md), [path](importers/path.md), [ptb](importers/ptb.md), [relannis](importers/relannis.md), [saltxml](importers/saltxml.md), [textgrid](importers/textgrid.md), [toolbox](importers/toolbox.md), [treetagger](importers/treetagger.md), [xlsx](importers/xlsx.md), [xml](importers/xml.md) |
-| Export formats | [graphml](exporters/graphml.md), [exmaralda](exporters/exmaralda.md), [sequence](exporters/sequence.md), [xlsx](exporters/xlsx.md) |
+| Export formats | [graphml](exporters/graphml.md), [exmaralda](exporters/exmaralda.md), [saltxml](exporters/saltxml.md), [sequence](exporters/sequence.md), [xlsx](exporters/xlsx.md) |
 | Graph operations | [check](graph_ops/check.md), [collapse](graph_ops/collapse.md), [enumerate](graph_ops/enumerate.md), [link](graph_ops/link.md), [map](graph_ops/map.md), [merge](graph_ops/merge.md), [revise](graph_ops/revise.md), [chunk](graph_ops/chunk.md), [split](graph_ops/split.md), [none](graph_ops/none.md) |
\ No newline at end of file
diff --git a/docs/exporters/saltxml.md b/docs/exporters/saltxml.md
new file mode 100644
index 00000000..46382448
--- /dev/null
+++ b/docs/exporters/saltxml.md
@@ -0,0 +1,6 @@
+# saltxml (exporter)
+
+Exports to the SaltXML format used by Pepper ().
+SaltXML is an XMI serialization of the [Salt model](https://raw.githubusercontent.com/korpling/salt/master/gh-site/doc/salt_modelGuide.pdf).
+ +*No Configuration* From 3489ce26f6c162e84229a591360f2584b87f03c7 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 26 Jun 2024 14:59:36 +0200 Subject: [PATCH 09/61] Fix clippy warnings --- src/importer/saltxml/mapper.rs | 10 +++++----- src/util/xml.rs | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/importer/saltxml/mapper.rs b/src/importer/saltxml/mapper.rs index 9270c2e9..4205fe00 100644 --- a/src/importer/saltxml/mapper.rs +++ b/src/importer/saltxml/mapper.rs @@ -75,11 +75,11 @@ impl TryFrom<&str> for SaltObject { } } -impl ToString for SaltObject { - fn to_string(&self) -> String { +impl std::fmt::Display for SaltObject { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - SaltObject::Text(val) => val.clone(), - SaltObject::Boolean(val) => val.to_string(), + SaltObject::Text(val) => write!(f, "{val}"), + SaltObject::Boolean(val) => write!(f, "{val}"), } } } @@ -156,7 +156,7 @@ impl SaltXmlMapper { // Add features as annotations for (feat_qname, value) in features.iter() { - let (annos_ns, anno_name) = split_qname(&feat_qname); + let (annos_ns, anno_name) = split_qname(feat_qname); updates.add_event(UpdateEvent::AddNodeLabel { node_name: node_name.clone(), diff --git a/src/util/xml.rs b/src/util/xml.rs index a48cee32..22da89b0 100644 --- a/src/util/xml.rs +++ b/src/util/xml.rs @@ -7,8 +7,8 @@ use quick_xml::{ }; /// Extract an attribute for an XML element by the namespace and name. -pub(crate) fn get_attribute_by_qname<'a>( - attribute_list: Attributes<'a>, +pub(crate) fn get_attribute_by_qname( + attribute_list: Attributes, namespace: &str, name: &str, ) -> Result> { From eb41dd796a1c3729af62360b77ae33c74c13ef71 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Thu, 27 Jun 2024 13:34:50 +0200 Subject: [PATCH 10/61] Use DOM parser for SaltXML documents --- Cargo.toml | 1 + src/importer/saltxml.rs | 4 +- src/importer/saltxml/mapper.rs | 212 ++++++++++++++++----------------- src/util.rs | 1 - src/util/xml.rs | 82 ------------- 5 files changed, 103 insertions(+), 197 deletions(-) delete mode 100644 src/util/xml.rs diff --git a/Cargo.toml b/Cargo.toml index ee2c45f6..cd3bd4f3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ pest_derive = "2.0" quick-xml = "0.34" rayon = "1.1" regex = "1.4" +roxmltree = "0.20.0" serde = "1.0" serde_derive = "1.0" struct-field-names-as-array = "0.3.0" diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index f5f05be0..ed94bd45 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -29,8 +29,8 @@ impl Importer for ImportSaltXml { // Read the corpus structure from the Salt project and get the number of documents to create mapper.reporter.info("Reading SaltXML project structure")?; - let mut project_file = File::open(input_path.join("saltProject.salt"))?; - let documents = mapper.map_corpus_structure(&mut project_file, &mut updates)?; + let project_file = std::fs::read_to_string(input_path.join("saltProject.salt"))?; + let documents = mapper.map_corpus_structure(&project_file, &mut updates)?; // Create a new progress reporter that can now estimate the work based on the number of documents mapper.reporter = ProgressReporter::new(tx, step_id, documents.len())?; diff --git a/src/importer/saltxml/mapper.rs b/src/importer/saltxml/mapper.rs index 4205fe00..5845a02a 100644 --- a/src/importer/saltxml/mapper.rs +++ b/src/importer/saltxml/mapper.rs @@ -1,19 +1,15 @@ -use std::{collections::BTreeMap, convert::TryFrom, io::BufReader, path::PathBuf}; +use 
std::{collections::BTreeMap, convert::TryFrom, path::PathBuf}; use anyhow::{anyhow, Ok}; use graphannis::update::{GraphUpdate, UpdateEvent}; -use graphannis_core::util::{join_qname, split_qname}; -use quick_xml::{ - events::{attributes::Attributes, BytesStart}, - Reader, -}; - -use crate::{ - progress::ProgressReporter, - util::xml::{consume_start_tag_with_name, get_attribute_by_local_name, get_attribute_by_qname}, -}; - -#[derive(Clone)] +use itertools::Itertools; +use roxmltree::Node; + +use crate::progress::ProgressReporter; + +const XSI_NAMESPACE: &str = "http://www.w3.org/2001/XMLSchema-instance"; + +#[derive(Debug, Clone, PartialEq)] enum SaltType { Corpus, Document, @@ -21,40 +17,28 @@ enum SaltType { Feature, CorpusRelation, DocumentRelation, + Unknown, } -impl<'a> TryFrom> for SaltType { - type Error = anyhow::Error; - - fn try_from(value: Attributes<'a>) -> Result { +impl<'a, 'input> From> for SaltType { + fn from(n: Node) -> Self { // Use the xsi:type attribute to determine the type - if let Some(type_id) = get_attribute_by_qname(value, "xsi", "type")? { - match type_id.as_str() { - "sCorpusStructure:SCorpus" => Ok(SaltType::Corpus), - "sCorpusStructure:SDocument" => Ok(SaltType::Document), - "saltCore:SElementId" => Ok(SaltType::ElementId), - "saltCore:SFeature" => Ok(SaltType::Feature), - "sCorpusStructure:SCorpusRelation" => Ok(SaltType::CorpusRelation), - "sCorpusStructure:SCorpusDocumentRelation" => Ok(SaltType::DocumentRelation), - _ => Err(anyhow!("Unknown Salt type {type_id}")), + if let Some(type_id) = n.attribute((XSI_NAMESPACE, "type")) { + match type_id { + "sCorpusStructure:SCorpus" => SaltType::Corpus, + "sCorpusStructure:SDocument" => SaltType::Document, + "saltCore:SElementId" => SaltType::ElementId, + "saltCore:SFeature" => SaltType::Feature, + "sCorpusStructure:SCorpusRelation" => SaltType::CorpusRelation, + "sCorpusStructure:SCorpusDocumentRelation" => SaltType::DocumentRelation, + _ => SaltType::Unknown, } } else { - Err(anyhow!("Missing attribute xsi:type")) + SaltType::Unknown } } } -fn get_label(e: &BytesStart) -> anyhow::Result<(String, String, SaltObject)> { - let namespace = get_attribute_by_local_name(e.attributes(), "namespace")? - .ok_or_else(|| anyhow!("Missing \"namespace\" attribute for label"))?; - let name = get_attribute_by_local_name(e.attributes(), "name")? - .ok_or_else(|| anyhow!("Missing \"name\" attribute for label"))?; - let value = get_attribute_by_local_name(e.attributes(), "value")? 
- .ok_or_else(|| anyhow!("Missing \"value\" attribute for label"))?; - let value = SaltObject::try_from(value.as_str())?; - Ok((namespace, name, value)) -} - enum SaltObject { Text(String), Boolean(bool), @@ -84,6 +68,19 @@ impl std::fmt::Display for SaltObject { } } +fn get_element_id(n: &Node) -> Option { + for element_id_label in n + .children() + .filter(|c| c.tag_name().name() == "labels" && SaltType::from(*c) == SaltType::ElementId) + { + if let Some(id) = element_id_label.attribute("value") { + let id = SaltObject::try_from(id).ok()?; + return Some(id.to_string().trim_start_matches("salt:/").to_string()); + } + } + None +} + pub(crate) struct SaltXmlMapper { pub(crate) reporter: ProgressReporter, } @@ -93,92 +90,83 @@ impl SaltXmlMapper { SaltXmlMapper { reporter } } - pub(crate) fn map_corpus_structure( + pub(crate) fn map_corpus_structure( &self, - input: &mut R, + input: &str, updates: &mut GraphUpdate, ) -> anyhow::Result> { - let input = BufReader::new(input); - let mut reader = Reader::from_reader(input); - reader.config_mut().expand_empty_elements = true; + let doc = roxmltree::Document::parse(input)?; - let mut buf = Vec::new(); - - // Consume the root SaltProject and sCorpusGraphs XML elements, which do not have the "xsi:type" attribute - consume_start_tag_with_name(&mut reader, "SaltProject")?; - consume_start_tag_with_name(&mut reader, "sCorpusGraphs")?; - - // TODO: map corpus graph labels + let root = doc.root_element(); + if root.tag_name().name() != "SaltProject" { + return Err(anyhow!( + "SaltXML project file must start with tag" + )); + } - // Iterate over all child elements of the corpus graph, which are the corpus and document nodes let result = BTreeMap::new(); - let mut salt_type_stack = Vec::new(); - let mut current_element_id = None; - let mut features = BTreeMap::new(); - loop { - match reader.read_event_into(&mut buf)? 
{ - quick_xml::events::Event::Start(e) => { - let salt_type = SaltType::try_from(e.attributes())?; - salt_type_stack.push(salt_type.clone()); - - match salt_type { - SaltType::ElementId => { - current_element_id = None; - - let (namespace, name, value) = get_label(&e)?; - if namespace == "salt" && name == "id" { - if let SaltObject::Text(id) = value { - current_element_id = - Some(id.trim_start_matches("salt:/").to_string()); - } - } - } - SaltType::Feature => { - let (namespace, name, value) = get_label(&e)?; - let qname = join_qname(&namespace, &name); - features.insert(qname, value); + + // Iterate over all corpus graphs + for cg in root + .children() + .filter(|t| t.tag_name().name() == "sCorpusGraphs") + { + // TODO: map corpus graph labels + + // Get all nodes + let nodes = cg + .children() + .filter(|t| t.tag_name().name() == "nodes") + .collect_vec(); + + for node in nodes.iter() { + match SaltType::from(*node) { + SaltType::Corpus | SaltType::Document => { + // Get the element ID from the label + let node_name = get_element_id(node) + .ok_or_else(|| anyhow!("Missing element ID for corpus graph node"))?; + // Create the element with the collected properties + updates.add_event(UpdateEvent::AddNode { + node_name: node_name.to_string(), + node_type: "corpus".into(), + })?; + + // Add features as annotations + let features = node.children().filter(|n| { + n.tag_name().name() == "labels" + && SaltType::from(*n) == SaltType::Feature + }); + for feature_node in features { + let annos_ns = feature_node.attribute("namespace"); + let anno_name = feature_node.attribute("name").ok_or_else(|| { + anyhow!("Missing \"name\" attribute for node \"{node_name}\"") + })?; + let anno_value = SaltObject::try_from( + feature_node.attribute("value").unwrap_or_default(), + )?; + + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.to_string(), + anno_ns: annos_ns.unwrap_or_default().to_string(), + anno_name: anno_name.to_string(), + anno_value: anno_value.to_string(), + })?; } - _ => {} } + _ => {} } - quick_xml::events::Event::End(_e) => { - if let Some(salt_type) = salt_type_stack.pop() { - match salt_type { - SaltType::Corpus | SaltType::Document => { - let node_name = current_element_id.clone().ok_or_else(|| { - anyhow!("Missing element ID for corpus graph node") - })?; - // Create the element with the collected properties - updates.add_event(UpdateEvent::AddNode { - node_name: node_name.clone(), - node_type: "corpus".into(), - })?; - - // Add features as annotations - for (feat_qname, value) in features.iter() { - let (annos_ns, anno_name) = split_qname(feat_qname); - - updates.add_event(UpdateEvent::AddNodeLabel { - node_name: node_name.clone(), - anno_ns: annos_ns.unwrap_or_default().to_string(), - anno_name: anno_name.to_string(), - anno_value: value.to_string(), - })?; - } - - // Reset state - features.clear(); - current_element_id = None; - } - - _ => {} - } - } + } + + // Add a PartOf Edge between parent corpora and the sub-corpora/documents + for e in cg.children().filter(|n| n.tag_name().name() == "edges") { + match SaltType::from(e) { + SaltType::CorpusRelation => {} + SaltType::DocumentRelation => {} + _ => {} } - quick_xml::events::Event::Eof => break, - _ => {} } } + Ok(result) } diff --git a/src/util.rs b/src/util.rs index 8d259b97..4a453e95 100644 --- a/src/util.rs +++ b/src/util.rs @@ -11,7 +11,6 @@ use std::path::{Path, PathBuf}; pub(crate) mod example_generator; pub(crate) mod graphupdate; pub(crate) mod token_helper; -pub(crate) mod xml; /// Get all files with a 
given extension in a directory. pub fn get_all_files( diff --git a/src/util/xml.rs b/src/util/xml.rs deleted file mode 100644 index 22da89b0..00000000 --- a/src/util/xml.rs +++ /dev/null @@ -1,82 +0,0 @@ -use std::io::BufRead; - -use anyhow::{anyhow, Result}; -use quick_xml::{ - events::{attributes::Attributes, Event}, - Reader, -}; - -/// Extract an attribute for an XML element by the namespace and name. -pub(crate) fn get_attribute_by_qname( - attribute_list: Attributes, - namespace: &str, - name: &str, -) -> Result> { - for att in attribute_list { - let att = att?; - if let Some(prefix) = att.key.prefix() { - if prefix.as_ref() == namespace.as_bytes() - && att.key.local_name().as_ref() == name.as_bytes() - { - let value = String::from_utf8_lossy(&att.value).to_string(); - return Ok(Some(value)); - } - } - } - Ok(None) -} - -/// Extract an attribute for an XML element by the name. -pub(crate) fn get_attribute_by_local_name( - attribute_list: Attributes, - name: &str, -) -> Result> { - for att in attribute_list { - let att = att?; - if att.key.local_name().as_ref() == name.as_bytes() { - let value = String::from_utf8_lossy(&att.value).to_string(); - return Ok(Some(value)); - } - } - Ok(None) -} - -/// Read the next event. Will fail if the next event is not a start tag and does -/// not have the given name. All non tag elements (XML declaration, text nodes, -/// comments, ...) are ignored and skipped. -pub(crate) fn consume_start_tag_with_name(reader: &mut Reader, name: &str) -> Result<()> -where - R: BufRead, -{ - let mut buf = Vec::new(); - loop { - let event = reader.read_event_into(&mut buf)?; - let result = match event { - Event::Start(tag) => { - if tag.local_name().as_ref() == name.as_bytes() { - Ok(()) - } else { - Err(anyhow!( - "Expected <{name}> but got <{}>", - String::from_utf8_lossy(tag.local_name().as_ref()) - )) - } - } - Event::End(_) => Err(anyhow!( - "Expected \"<{name}>\" but got closing tag instead." - )), - Event::Empty(_) => Err(anyhow!("Expected \"<{name}>\" but got empty tag instead.")), - - Event::Comment(_) - | Event::Decl(_) - | Event::PI(_) - | Event::DocType(_) - | Event::CData(_) - | Event::Text(_) => continue, - Event::Eof => Err(anyhow!( - "Expected <{name} but the file is already at its end." 
- )), - }; - return result; - } -} From 59a0c4dfaf1d23b54552724f2a05363408399fef Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Thu, 27 Jun 2024 14:05:10 +0200 Subject: [PATCH 11/61] Map document and corpus relations --- src/importer/saltxml/mapper.rs | 48 +++++++++++++++++-- ...ltxml__tests__read_salt_sample_corpus.snap | 12 +++++ 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/src/importer/saltxml/mapper.rs b/src/importer/saltxml/mapper.rs index 5845a02a..55d8f120 100644 --- a/src/importer/saltxml/mapper.rs +++ b/src/importer/saltxml/mapper.rs @@ -1,7 +1,11 @@ use std::{collections::BTreeMap, convert::TryFrom, path::PathBuf}; use anyhow::{anyhow, Ok}; -use graphannis::update::{GraphUpdate, UpdateEvent}; +use graphannis::{ + model::AnnotationComponentType, + update::{GraphUpdate, UpdateEvent}, +}; +use graphannis_core::graph::ANNIS_NS; use itertools::Itertools; use roxmltree::Node; @@ -81,6 +85,26 @@ fn get_element_id(n: &Node) -> Option { None } +fn get_referenced_index(attribute_value: &str, tag_name: &str) -> Option { + let mut pattern = String::with_capacity(tag_name.len() + 4); + pattern.push_str("//@"); + pattern.push_str(tag_name); + pattern.push('.'); + + let index_as_str = attribute_value.strip_prefix(&pattern)?; + let idx = index_as_str.parse::().ok()?; + Some(idx) +} + +fn resolve_element<'a>( + attribute_value: &str, + tag_name: &str, + elements: &'a [Node], +) -> Option> { + let idx = get_referenced_index(attribute_value, tag_name)?; + elements.get(idx).copied() +} + pub(crate) struct SaltXmlMapper { pub(crate) reporter: ProgressReporter, } @@ -160,8 +184,26 @@ impl SaltXmlMapper { // Add a PartOf Edge between parent corpora and the sub-corpora/documents for e in cg.children().filter(|n| n.tag_name().name() == "edges") { match SaltType::from(e) { - SaltType::CorpusRelation => {} - SaltType::DocumentRelation => {} + SaltType::CorpusRelation | SaltType::DocumentRelation => { + let source_ref = e.attribute("source").unwrap_or_default(); + let target_ref = e.attribute("target").unwrap_or_default(); + + let source_node = resolve_element(source_ref, "nodes", &nodes) + .and_then(|n| get_element_id(&n)); + let target_node = resolve_element(target_ref, "nodes", &nodes) + .and_then(|n| get_element_id(&n)); + + if let (Some(source_node), Some(target_node)) = (source_node, target_node) { + // PartOf has the inverse meaning of the corpus and documentation relation in Salt + updates.add_event(UpdateEvent::AddEdge { + source_node: target_node, + target_node: source_node, + layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::PartOf.to_string(), + component_name: "".into(), + })?; + } + } _ => {} } } diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index ae9265d7..7c20fd7c 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -40,5 +40,17 @@ expression: actual doc4 corpus + + + + + + + + + + + + From b87b8830e3ff267d4a7b9fecda7c380485e91a2a Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Thu, 27 Jun 2024 14:37:31 +0200 Subject: [PATCH 12/61] Map annis:doc annotation and actual return the document IDs --- src/importer/saltxml.rs | 122 +++++++++++++- .../{mapper.rs => corpus_structure.rs} | 158 +++++------------- src/importer/saltxml/document.rs 
| 22 +++ ...ltxml__tests__read_salt_sample_corpus.snap | 25 +-- src/progress.rs | 1 + 5 files changed, 187 insertions(+), 141 deletions(-) rename src/importer/saltxml/{mapper.rs => corpus_structure.rs} (50%) create mode 100644 src/importer/saltxml/document.rs diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index ed94bd45..2a0a3e5f 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -1,7 +1,10 @@ -use std::fs::File; +use std::{convert::TryFrom, fs::File}; +use anyhow::anyhow; +use document::DocumentMapper; use documented::{Documented, DocumentedFields}; use graphannis::update::GraphUpdate; +use roxmltree::Node; use serde::Deserialize; use struct_field_names_as_array::FieldNamesAsSlice; @@ -25,20 +28,26 @@ impl Importer for ImportSaltXml { let mut updates = GraphUpdate::new(); // Start with an undetermined progress reporter let reporter = ProgressReporter::new_unknown_total_work(tx.clone(), step_id.clone())?; - let mut mapper = mapper::SaltXmlMapper::new(reporter); + let mapper = corpus_structure::SaltCorpusStructureMapper::new(reporter.clone()); // Read the corpus structure from the Salt project and get the number of documents to create - mapper.reporter.info("Reading SaltXML project structure")?; + reporter.info("Reading SaltXML project structure")?; let project_file = std::fs::read_to_string(input_path.join("saltProject.salt"))?; let documents = mapper.map_corpus_structure(&project_file, &mut updates)?; // Create a new progress reporter that can now estimate the work based on the number of documents - mapper.reporter = ProgressReporter::new(tx, step_id, documents.len())?; - for (document_node_name, document_path) in documents.iter() { - mapper.reporter.info("Reading document {document_path}")?; + let reporter = ProgressReporter::new(tx, step_id, documents.len())?; + for document_node_name in documents { + let mut relative_document_path = document_node_name.clone(); + relative_document_path.push_str(".salt"); + dbg!(&relative_document_path); + // Get the path from the node name + let document_path = input_path.join(relative_document_path); + reporter.info("Reading document {document_path}")?; let mut document_file = File::open(document_path)?; - mapper.read_document(&mut document_file, document_node_name, &mut updates)?; - mapper.reporter.worked(1)?; + let document_mapper = DocumentMapper::new(reporter.clone()); + document_mapper.read_document(&mut document_file, &document_node_name, &mut updates)?; + reporter.worked(1)?; } Ok(updates) @@ -49,7 +58,102 @@ impl Importer for ImportSaltXml { } } -mod mapper; +const XSI_NAMESPACE: &str = "http://www.w3.org/2001/XMLSchema-instance"; + +#[derive(Debug, Clone, Copy, PartialEq)] +enum SaltType { + Corpus, + Document, + ElementId, + Feature, + CorpusRelation, + DocumentRelation, + Unknown, +} + +impl<'a, 'input> From> for SaltType { + fn from(n: Node) -> Self { + // Use the xsi:type attribute to determine the type + if let Some(type_id) = n.attribute((XSI_NAMESPACE, "type")) { + match type_id { + "sCorpusStructure:SCorpus" => SaltType::Corpus, + "sCorpusStructure:SDocument" => SaltType::Document, + "saltCore:SElementId" => SaltType::ElementId, + "saltCore:SFeature" => SaltType::Feature, + "sCorpusStructure:SCorpusRelation" => SaltType::CorpusRelation, + "sCorpusStructure:SCorpusDocumentRelation" => SaltType::DocumentRelation, + _ => SaltType::Unknown, + } + } else { + SaltType::Unknown + } + } +} + +enum SaltObject { + Text(String), + Boolean(bool), +} + +impl TryFrom<&str> for SaltObject { + type Error = 
anyhow::Error; + + fn try_from(value: &str) -> Result { + if let Some(value) = value.strip_prefix("T::") { + Ok(SaltObject::Text(value.to_string())) + } else if let Some(_value) = value.strip_prefix("B::") { + let value = value.to_ascii_lowercase() == "true"; + Ok(SaltObject::Boolean(value)) + } else { + Err(anyhow!("Could not create Salt object from \"{value}\"")) + } + } +} + +impl std::fmt::Display for SaltObject { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + SaltObject::Text(val) => write!(f, "{val}"), + SaltObject::Boolean(val) => write!(f, "{val}"), + } + } +} + +fn get_element_id(n: &Node) -> Option { + for element_id_label in n + .children() + .filter(|c| c.tag_name().name() == "labels" && SaltType::from(*c) == SaltType::ElementId) + { + if let Some(id) = element_id_label.attribute("value") { + let id = SaltObject::try_from(id).ok()?; + return Some(id.to_string().trim_start_matches("salt:/").to_string()); + } + } + None +} + +fn get_referenced_index(attribute_value: &str, tag_name: &str) -> Option { + let mut pattern = String::with_capacity(tag_name.len() + 4); + pattern.push_str("//@"); + pattern.push_str(tag_name); + pattern.push('.'); + + let index_as_str = attribute_value.strip_prefix(&pattern)?; + let idx = index_as_str.parse::().ok()?; + Some(idx) +} + +fn resolve_element<'a>( + attribute_value: &str, + tag_name: &str, + elements: &'a [Node], +) -> Option> { + let idx = get_referenced_index(attribute_value, tag_name)?; + elements.get(idx).copied() +} + +mod corpus_structure; +mod document; #[cfg(test)] mod tests; diff --git a/src/importer/saltxml/mapper.rs b/src/importer/saltxml/corpus_structure.rs similarity index 50% rename from src/importer/saltxml/mapper.rs rename to src/importer/saltxml/corpus_structure.rs index 55d8f120..1fdcc6d0 100644 --- a/src/importer/saltxml/mapper.rs +++ b/src/importer/saltxml/corpus_structure.rs @@ -1,4 +1,4 @@ -use std::{collections::BTreeMap, convert::TryFrom, path::PathBuf}; +use std::{collections::BTreeSet, convert::TryFrom}; use anyhow::{anyhow, Ok}; use graphannis::{ @@ -7,118 +7,25 @@ use graphannis::{ }; use graphannis_core::graph::ANNIS_NS; use itertools::Itertools; -use roxmltree::Node; use crate::progress::ProgressReporter; -const XSI_NAMESPACE: &str = "http://www.w3.org/2001/XMLSchema-instance"; - -#[derive(Debug, Clone, PartialEq)] -enum SaltType { - Corpus, - Document, - ElementId, - Feature, - CorpusRelation, - DocumentRelation, - Unknown, -} - -impl<'a, 'input> From> for SaltType { - fn from(n: Node) -> Self { - // Use the xsi:type attribute to determine the type - if let Some(type_id) = n.attribute((XSI_NAMESPACE, "type")) { - match type_id { - "sCorpusStructure:SCorpus" => SaltType::Corpus, - "sCorpusStructure:SDocument" => SaltType::Document, - "saltCore:SElementId" => SaltType::ElementId, - "saltCore:SFeature" => SaltType::Feature, - "sCorpusStructure:SCorpusRelation" => SaltType::CorpusRelation, - "sCorpusStructure:SCorpusDocumentRelation" => SaltType::DocumentRelation, - _ => SaltType::Unknown, - } - } else { - SaltType::Unknown - } - } -} - -enum SaltObject { - Text(String), - Boolean(bool), -} - -impl TryFrom<&str> for SaltObject { - type Error = anyhow::Error; - - fn try_from(value: &str) -> Result { - if let Some(value) = value.strip_prefix("T::") { - Ok(SaltObject::Text(value.to_string())) - } else if let Some(_value) = value.strip_prefix("B::") { - let value = value.to_ascii_lowercase() == "true"; - Ok(SaltObject::Boolean(value)) - } else { - Err(anyhow!("Could not create Salt 
object from \"{value}\"")) - } - } -} - -impl std::fmt::Display for SaltObject { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - SaltObject::Text(val) => write!(f, "{val}"), - SaltObject::Boolean(val) => write!(f, "{val}"), - } - } -} +use super::{get_element_id, resolve_element, SaltObject, SaltType}; -fn get_element_id(n: &Node) -> Option { - for element_id_label in n - .children() - .filter(|c| c.tag_name().name() == "labels" && SaltType::from(*c) == SaltType::ElementId) - { - if let Some(id) = element_id_label.attribute("value") { - let id = SaltObject::try_from(id).ok()?; - return Some(id.to_string().trim_start_matches("salt:/").to_string()); - } - } - None -} - -fn get_referenced_index(attribute_value: &str, tag_name: &str) -> Option { - let mut pattern = String::with_capacity(tag_name.len() + 4); - pattern.push_str("//@"); - pattern.push_str(tag_name); - pattern.push('.'); - - let index_as_str = attribute_value.strip_prefix(&pattern)?; - let idx = index_as_str.parse::().ok()?; - Some(idx) -} - -fn resolve_element<'a>( - attribute_value: &str, - tag_name: &str, - elements: &'a [Node], -) -> Option> { - let idx = get_referenced_index(attribute_value, tag_name)?; - elements.get(idx).copied() -} - -pub(crate) struct SaltXmlMapper { - pub(crate) reporter: ProgressReporter, +pub(super) struct SaltCorpusStructureMapper { + reporter: ProgressReporter, } -impl SaltXmlMapper { - pub(crate) fn new(reporter: ProgressReporter) -> SaltXmlMapper { - SaltXmlMapper { reporter } +impl SaltCorpusStructureMapper { + pub(super) fn new(reporter: ProgressReporter) -> SaltCorpusStructureMapper { + SaltCorpusStructureMapper { reporter } } - pub(crate) fn map_corpus_structure( + pub(super) fn map_corpus_structure( &self, input: &str, updates: &mut GraphUpdate, - ) -> anyhow::Result> { + ) -> anyhow::Result> { let doc = roxmltree::Document::parse(input)?; let root = doc.root_element(); @@ -128,7 +35,7 @@ impl SaltXmlMapper { )); } - let result = BTreeMap::new(); + let mut documents = BTreeSet::new(); // Iterate over all corpus graphs for cg in root @@ -144,7 +51,8 @@ impl SaltXmlMapper { .collect_vec(); for node in nodes.iter() { - match SaltType::from(*node) { + let salt_type = SaltType::from(*node); + match salt_type { SaltType::Corpus | SaltType::Document => { // Get the element ID from the label let node_name = get_element_id(node) @@ -155,6 +63,11 @@ impl SaltXmlMapper { node_type: "corpus".into(), })?; + // Add the document ID to the result + if SaltType::Document == salt_type { + documents.insert(node_name.to_string()); + } + // Add features as annotations let features = node.children().filter(|n| { n.tag_name().name() == "labels" @@ -169,13 +82,27 @@ impl SaltXmlMapper { feature_node.attribute("value").unwrap_or_default(), )?; - updates.add_event(UpdateEvent::AddNodeLabel { - node_name: node_name.to_string(), - anno_ns: annos_ns.unwrap_or_default().to_string(), - anno_name: anno_name.to_string(), - anno_value: anno_value.to_string(), - })?; + if salt_type == SaltType::Document + && annos_ns == Some("salt") + && anno_name == "SNAME" + { + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.to_string(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "doc".to_string(), + anno_value: anno_value.to_string(), + })?; + } else { + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.to_string(), + anno_ns: annos_ns.unwrap_or_default().to_string(), + anno_name: anno_name.to_string(), + anno_value: anno_value.to_string(), + })?; + } } + + 
// TODO: map annotations (that are not features) } _ => {} } @@ -209,15 +136,6 @@ impl SaltXmlMapper { } } - Ok(result) - } - - pub(crate) fn read_document( - &self, - _input: &mut R, - _document_node_name: &str, - _updates: &mut GraphUpdate, - ) -> anyhow::Result<()> { - Ok(()) + Ok(documents) } } diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs new file mode 100644 index 00000000..844a1673 --- /dev/null +++ b/src/importer/saltxml/document.rs @@ -0,0 +1,22 @@ +use graphannis::update::GraphUpdate; + +use crate::progress::ProgressReporter; + +pub(super) struct DocumentMapper { + reporter: ProgressReporter, +} + +impl DocumentMapper { + pub(super) fn new(reporter: ProgressReporter) -> DocumentMapper { + DocumentMapper { reporter } + } + + pub(super) fn read_document( + &self, + _input: &mut R, + _document_node_name: &str, + _updates: &mut GraphUpdate, + ) -> anyhow::Result<()> { + Ok(()) + } +} diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index 7c20fd7c..4aaa9856 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -6,39 +6,40 @@ expression: actual - + + rootCorpus - corpus + corpus subCorpus1 - corpus + corpus subCorpus2 - corpus + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt - doc1 - corpus + doc1 + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt - doc2 - corpus + doc2 + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt - doc3 - corpus + doc3 + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt - doc4 - corpus + doc4 + corpus diff --git a/src/progress.rs b/src/progress.rs index 92f24d67..5e0061db 100644 --- a/src/progress.rs +++ b/src/progress.rs @@ -7,6 +7,7 @@ struct ProgressState { accumulated_finished_work: usize, } +#[derive(Clone)] pub struct ProgressReporter { state: Arc>, total_work: Option, From e27213f1f3404ceb17ccd4433e6afb45822be960 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Thu, 27 Jun 2024 14:52:40 +0200 Subject: [PATCH 13/61] Refactor the progress reporting in SaltXML --- src/importer/saltxml.rs | 14 +++++++------- src/importer/saltxml/corpus_structure.rs | 10 +++------- src/importer/saltxml/document.rs | 24 +++++++++++++++--------- src/progress.rs | 1 - 4 files changed, 25 insertions(+), 24 deletions(-) diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index 2a0a3e5f..902ebad7 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -1,4 +1,4 @@ -use std::{convert::TryFrom, fs::File}; +use std::convert::TryFrom; use anyhow::anyhow; use document::DocumentMapper; @@ -28,7 +28,7 @@ impl Importer for ImportSaltXml { let mut updates = GraphUpdate::new(); // Start with an undetermined progress reporter let reporter = ProgressReporter::new_unknown_total_work(tx.clone(), step_id.clone())?; - let mapper = corpus_structure::SaltCorpusStructureMapper::new(reporter.clone()); + let mapper = corpus_structure::SaltCorpusStructureMapper::new(); // Read the corpus structure from the Salt project and get the number of documents to create reporter.info("Reading SaltXML project structure")?; @@ -38,15 +38,15 @@ impl Importer for ImportSaltXml { // Create a new progress reporter that 
can now estimate the work based on the number of documents let reporter = ProgressReporter::new(tx, step_id, documents.len())?; for document_node_name in documents { + reporter.info(&format!("Reading document {document_node_name}"))?; + let mut relative_document_path = document_node_name.clone(); relative_document_path.push_str(".salt"); - dbg!(&relative_document_path); // Get the path from the node name let document_path = input_path.join(relative_document_path); - reporter.info("Reading document {document_path}")?; - let mut document_file = File::open(document_path)?; - let document_mapper = DocumentMapper::new(reporter.clone()); - document_mapper.read_document(&mut document_file, &document_node_name, &mut updates)?; + let document_file = std::fs::read_to_string(document_path)?; + let document_mapper = DocumentMapper::new(); + document_mapper.read_document(&document_file, &document_node_name, &mut updates)?; reporter.worked(1)?; } diff --git a/src/importer/saltxml/corpus_structure.rs b/src/importer/saltxml/corpus_structure.rs index 1fdcc6d0..7bf2e9dd 100644 --- a/src/importer/saltxml/corpus_structure.rs +++ b/src/importer/saltxml/corpus_structure.rs @@ -8,17 +8,13 @@ use graphannis::{ use graphannis_core::graph::ANNIS_NS; use itertools::Itertools; -use crate::progress::ProgressReporter; - use super::{get_element_id, resolve_element, SaltObject, SaltType}; -pub(super) struct SaltCorpusStructureMapper { - reporter: ProgressReporter, -} +pub(super) struct SaltCorpusStructureMapper {} impl SaltCorpusStructureMapper { - pub(super) fn new(reporter: ProgressReporter) -> SaltCorpusStructureMapper { - SaltCorpusStructureMapper { reporter } + pub(super) fn new() -> SaltCorpusStructureMapper { + SaltCorpusStructureMapper {} } pub(super) fn map_corpus_structure( diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 844a1673..ab925ef5 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -1,22 +1,28 @@ +use anyhow::anyhow; use graphannis::update::GraphUpdate; -use crate::progress::ProgressReporter; - -pub(super) struct DocumentMapper { - reporter: ProgressReporter, -} +pub(super) struct DocumentMapper {} impl DocumentMapper { - pub(super) fn new(reporter: ProgressReporter) -> DocumentMapper { - DocumentMapper { reporter } + pub(super) fn new() -> DocumentMapper { + DocumentMapper {} } - pub(super) fn read_document( + pub(super) fn read_document( &self, - _input: &mut R, + input: &str, _document_node_name: &str, _updates: &mut GraphUpdate, ) -> anyhow::Result<()> { + let doc = roxmltree::Document::parse(input)?; + + let root = doc.root_element(); + if root.tag_name().name() != "SDocumentGraph" { + return Err(anyhow!( + "SaltXML document file must start with tag" + )); + } + Ok(()) } } diff --git a/src/progress.rs b/src/progress.rs index 5e0061db..92f24d67 100644 --- a/src/progress.rs +++ b/src/progress.rs @@ -7,7 +7,6 @@ struct ProgressState { accumulated_finished_work: usize, } -#[derive(Clone)] pub struct ProgressReporter { state: Arc>, total_work: Option, From 5ad718b4cd966f3d2628c507637be2866b88111c Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Thu, 27 Jun 2024 16:24:38 +0200 Subject: [PATCH 14/61] Preparations for mapping the documents --- src/importer/saltxml.rs | 9 +++++-- src/importer/saltxml/document.rs | 46 +++++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index 902ebad7..a9308371 100644 --- a/src/importer/saltxml.rs +++ 
b/src/importer/saltxml.rs @@ -45,8 +45,7 @@ impl Importer for ImportSaltXml { // Get the path from the node name let document_path = input_path.join(relative_document_path); let document_file = std::fs::read_to_string(document_path)?; - let document_mapper = DocumentMapper::new(); - document_mapper.read_document(&document_file, &document_node_name, &mut updates)?; + DocumentMapper::read_document(&document_file, &document_node_name, &mut updates)?; reporter.worked(1)?; } @@ -68,6 +67,9 @@ enum SaltType { Feature, CorpusRelation, DocumentRelation, + Layer, + Token, + TextualDs, Unknown, } @@ -82,6 +84,9 @@ impl<'a, 'input> From> for SaltType { "saltCore:SFeature" => SaltType::Feature, "sCorpusStructure:SCorpusRelation" => SaltType::CorpusRelation, "sCorpusStructure:SCorpusDocumentRelation" => SaltType::DocumentRelation, + "saltCore:SLayer" => SaltType::Layer, + "sDocumentStructure:SToken" => SaltType::Token, + "sDocumentStructure:STextualDS" => SaltType::TextualDs, _ => SaltType::Unknown, } } else { diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index ab925ef5..d2b8b7dc 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -1,21 +1,21 @@ -use anyhow::anyhow; +use anyhow::{anyhow, Result}; use graphannis::update::GraphUpdate; +use itertools::Itertools; +use roxmltree::Node; -pub(super) struct DocumentMapper {} +use super::{get_element_id, SaltType}; -impl DocumentMapper { - pub(super) fn new() -> DocumentMapper { - DocumentMapper {} - } +pub(super) struct DocumentMapper { + base_texts: Vec, +} +impl DocumentMapper { pub(super) fn read_document( - &self, input: &str, _document_node_name: &str, _updates: &mut GraphUpdate, - ) -> anyhow::Result<()> { + ) -> Result<()> { let doc = roxmltree::Document::parse(input)?; - let root = doc.root_element(); if root.tag_name().name() != "SDocumentGraph" { return Err(anyhow!( @@ -23,6 +23,34 @@ impl DocumentMapper { )); } + let layers = root + .children() + .filter(|n| SaltType::from(*n) == SaltType::Layer) + .collect_vec(); + + let mut mapper = DocumentMapper { + base_texts: Vec::new(), + }; + mapper.map_textual_ds(&root)?; + mapper.map_token(&root, &layers)?; + + Ok(()) + } + + fn map_textual_ds(&mut self, root: &Node) -> Result<()> { + for t in root + .children() + .filter(|n| SaltType::from(*n) == SaltType::TextualDs) + { + let element_id = get_element_id(&t) + .ok_or_else(|| anyhow!("Missing element ID for textual data source"))?; + } + Ok(()) + } + + fn map_token(&self, root: &Node, layers: &[Node]) -> Result<()> { + root.children() + .filter(|n| SaltType::from(*n) == SaltType::Token); Ok(()) } } From 707ccfdfb6f9221cff58d4b730b32aeed32ef0c5 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 1 Jul 2024 13:17:42 +0200 Subject: [PATCH 15/61] Start to map the textual datasource of a document --- src/importer/saltxml.rs | 34 ++-- src/importer/saltxml/corpus_structure.rs | 14 +- src/importer/saltxml/document.rs | 36 +++-- ...ltxml__tests__read_salt_sample_corpus.snap | 12 ++ .../rootCorpus/subCorpus1/doc1.salt | 150 +++++++++--------- 5 files changed, 139 insertions(+), 107 deletions(-) diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index a9308371..0a0ac55d 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -1,6 +1,3 @@ -use std::convert::TryFrom; - -use anyhow::anyhow; use document::DocumentMapper; use documented::{Documented, DocumentedFields}; use graphannis::update::GraphUpdate; @@ -98,19 +95,18 @@ impl<'a, 'input> From> for SaltType { enum 
SaltObject { Text(String), Boolean(bool), + Null, } -impl TryFrom<&str> for SaltObject { - type Error = anyhow::Error; - - fn try_from(value: &str) -> Result { +impl From<&str> for SaltObject { + fn from(value: &str) -> Self { if let Some(value) = value.strip_prefix("T::") { - Ok(SaltObject::Text(value.to_string())) + SaltObject::Text(value.to_string()) } else if let Some(_value) = value.strip_prefix("B::") { let value = value.to_ascii_lowercase() == "true"; - Ok(SaltObject::Boolean(value)) + SaltObject::Boolean(value) } else { - Err(anyhow!("Could not create Salt object from \"{value}\"")) + SaltObject::Null } } } @@ -120,6 +116,7 @@ impl std::fmt::Display for SaltObject { match self { SaltObject::Text(val) => write!(f, "{val}"), SaltObject::Boolean(val) => write!(f, "{val}"), + SaltObject::Null => write!(f, ""), } } } @@ -130,13 +127,28 @@ fn get_element_id(n: &Node) -> Option { .filter(|c| c.tag_name().name() == "labels" && SaltType::from(*c) == SaltType::ElementId) { if let Some(id) = element_id_label.attribute("value") { - let id = SaltObject::try_from(id).ok()?; + let id = SaltObject::from(id); return Some(id.to_string().trim_start_matches("salt:/").to_string()); } } None } +fn get_features<'a, 'input>(n: &'a Node<'a, 'input>) -> impl Iterator> { + n.children() + .filter(|n| n.tag_name().name() == "labels" && SaltType::from(*n) == SaltType::Feature) +} + +fn get_feature_by_qname(n: &Node, namespace: &str, name: &str) -> Option { + get_features(n) + .filter(|f| { + f.attribute("namespace") == Some(namespace) && f.attribute("name") == Some(name) + }) + .filter_map(|f| f.attribute("value")) + .map(|v| SaltObject::from(v)) + .next() +} + fn get_referenced_index(attribute_value: &str, tag_name: &str) -> Option { let mut pattern = String::with_capacity(tag_name.len() + 4); pattern.push_str("//@"); diff --git a/src/importer/saltxml/corpus_structure.rs b/src/importer/saltxml/corpus_structure.rs index 7bf2e9dd..3e0e9738 100644 --- a/src/importer/saltxml/corpus_structure.rs +++ b/src/importer/saltxml/corpus_structure.rs @@ -1,4 +1,4 @@ -use std::{collections::BTreeSet, convert::TryFrom}; +use std::collections::BTreeSet; use anyhow::{anyhow, Ok}; use graphannis::{ @@ -8,7 +8,7 @@ use graphannis::{ use graphannis_core::graph::ANNIS_NS; use itertools::Itertools; -use super::{get_element_id, resolve_element, SaltObject, SaltType}; +use super::{get_element_id, get_features, resolve_element, SaltObject, SaltType}; pub(super) struct SaltCorpusStructureMapper {} @@ -65,18 +65,14 @@ impl SaltCorpusStructureMapper { } // Add features as annotations - let features = node.children().filter(|n| { - n.tag_name().name() == "labels" - && SaltType::from(*n) == SaltType::Feature - }); - for feature_node in features { + for feature_node in get_features(node) { let annos_ns = feature_node.attribute("namespace"); let anno_name = feature_node.attribute("name").ok_or_else(|| { anyhow!("Missing \"name\" attribute for node \"{node_name}\"") })?; - let anno_value = SaltObject::try_from( + let anno_value = SaltObject::from( feature_node.attribute("value").unwrap_or_default(), - )?; + ); if salt_type == SaltType::Document && annos_ns == Some("salt") diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index d2b8b7dc..f9836bde 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -1,19 +1,21 @@ +use std::collections::BTreeMap; + use anyhow::{anyhow, Result}; -use graphannis::update::GraphUpdate; +use graphannis::update::{GraphUpdate, UpdateEvent}; use 
itertools::Itertools; use roxmltree::Node; -use super::{get_element_id, SaltType}; +use super::{get_element_id, get_feature_by_qname, SaltObject, SaltType}; pub(super) struct DocumentMapper { - base_texts: Vec, + base_texts: BTreeMap, } impl DocumentMapper { pub(super) fn read_document( input: &str, _document_node_name: &str, - _updates: &mut GraphUpdate, + updates: &mut GraphUpdate, ) -> Result<()> { let doc = roxmltree::Document::parse(input)?; let root = doc.root_element(); @@ -29,28 +31,38 @@ impl DocumentMapper { .collect_vec(); let mut mapper = DocumentMapper { - base_texts: Vec::new(), + base_texts: BTreeMap::new(), }; - mapper.map_textual_ds(&root)?; + mapper.map_textual_ds(&root, updates)?; mapper.map_token(&root, &layers)?; Ok(()) } - fn map_textual_ds(&mut self, root: &Node) -> Result<()> { - for t in root + fn map_textual_ds(&mut self, root: &Node, updates: &mut GraphUpdate) -> Result<()> { + for text_node in root .children() .filter(|n| SaltType::from(*n) == SaltType::TextualDs) { - let element_id = get_element_id(&t) + let element_id = get_element_id(&text_node) .ok_or_else(|| anyhow!("Missing element ID for textual data source"))?; + + if let Some(SaltObject::Text(anno_value)) = + get_feature_by_qname(&text_node, "saltCommon", "SDATA") + { + self.base_texts.insert(element_id.clone(), anno_value); + updates.add_event(UpdateEvent::AddNode { + node_name: element_id.clone(), + node_type: "datasource".to_string(), + })?; + } } Ok(()) } - fn map_token(&self, root: &Node, layers: &[Node]) -> Result<()> { - root.children() - .filter(|n| SaltType::from(*n) == SaltType::Token); + fn map_token(&self, _root: &Node, _layers: &[Node]) -> Result<()> { + // root.children() + // .filter(|n| SaltType::from(*n) == SaltType::Token); Ok(()) } } diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index 4aaa9856..911e0a8c 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -41,6 +41,18 @@ expression: actual doc4 corpus + + datasource + + + datasource + + + datasource + + + datasource + diff --git a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt index 886757da..f65e6d0e 100644 --- a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt +++ b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt @@ -1,360 +1,360 @@ - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + From 56149589b94704d12ac8ffef32fbcd7720d64954 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 1 Jul 2024 13:40:31 +0200 Subject: [PATCH 16/61] Add token nodes and store the document in the mapper struct --- src/importer/saltxml/document.rs | 62 +++++--- ...ltxml__tests__read_salt_sample_corpus.snap | 132 ++++++++++++++++++ 2 files changed, 178 insertions(+), 16 deletions(-) diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index f9836bde..9486ad09 100644 --- a/src/importer/saltxml/document.rs +++ 
b/src/importer/saltxml/document.rs @@ -2,18 +2,18 @@ use std::collections::BTreeMap; use anyhow::{anyhow, Result}; use graphannis::update::{GraphUpdate, UpdateEvent}; -use itertools::Itertools; -use roxmltree::Node; +use roxmltree::Document; use super::{get_element_id, get_feature_by_qname, SaltObject, SaltType}; -pub(super) struct DocumentMapper { +pub(super) struct DocumentMapper<'input> { + document: Document<'input>, base_texts: BTreeMap, } -impl DocumentMapper { +impl<'input> DocumentMapper<'input> { pub(super) fn read_document( - input: &str, + input: &'input str, _document_node_name: &str, updates: &mut GraphUpdate, ) -> Result<()> { @@ -25,22 +25,25 @@ impl DocumentMapper { )); } - let layers = root - .children() - .filter(|n| SaltType::from(*n) == SaltType::Layer) - .collect_vec(); + // let layers = root + // .children() + // .filter(|n| SaltType::from(*n) == SaltType::Layer) + // .collect_vec(); let mut mapper = DocumentMapper { base_texts: BTreeMap::new(), + document: doc, }; - mapper.map_textual_ds(&root, updates)?; - mapper.map_token(&root, &layers)?; + mapper.map_textual_ds(updates)?; + mapper.map_token(updates)?; Ok(()) } - fn map_textual_ds(&mut self, root: &Node, updates: &mut GraphUpdate) -> Result<()> { - for text_node in root + fn map_textual_ds(&mut self, updates: &mut GraphUpdate) -> Result<()> { + for text_node in self + .document + .root_element() .children() .filter(|n| SaltType::from(*n) == SaltType::TextualDs) { @@ -60,9 +63,36 @@ impl DocumentMapper { Ok(()) } - fn map_token(&self, _root: &Node, _layers: &[Node]) -> Result<()> { - // root.children() - // .filter(|n| SaltType::from(*n) == SaltType::Token); + fn map_token(&self, updates: &mut GraphUpdate) -> Result<()> { + // Get the list of token in the same order as in the SaltXML file + let tokens: Result> = self + .document + .root_element() + .children() + .filter(|n| n.tag_name().name() == "nodes" && SaltType::from(*n) == SaltType::Token) + .map(|t| { + let id = get_element_id(&t) + .ok_or_else(|| anyhow!("Missing element ID for token source"))?; + Ok((t, id)) + }) + .collect(); + let tokens = tokens?; + + for (_, t_id) in tokens.iter() { + updates.add_event(UpdateEvent::AddNode { + node_name: t_id.clone(), + node_type: "node".to_string(), + })?; + } + // Connect the token to the texts by the textual relations + for _text_rel in self + .document + .root_element() + .children() + .filter(|n| n.tag_name().name() == "edges" && SaltType::from(*n) == SaltType::Token) + { + } + Ok(()) } } diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index 911e0a8c..7f9b92f9 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -44,15 +44,147 @@ expression: actual datasource + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + datasource + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + datasource + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + datasource + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + From 
ba01b932df2f14fd03c8961ea7781546174eca44 Mon Sep 17 00:00:00 2001 From: thomaskrause Date: Mon, 1 Jul 2024 11:42:27 +0000 Subject: [PATCH 17/61] Apply automatic changes --- docs/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/README.md b/docs/README.md index 4eec8575..fa51a5d1 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,5 +1,5 @@ -| Type | Modules | -|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Import formats | [conllu](importers/conllu.md), [exmaralda](importers/exmaralda.md), [graphml](importers/graphml.md), [meta](importers/meta.md), [none](importers/none.md), [opus](importers/opus.md), [path](importers/path.md), [ptb](importers/ptb.md), [relannis](importers/relannis.md), [textgrid](importers/textgrid.md), [toolbox](importers/toolbox.md), [treetagger](importers/treetagger.md), [xlsx](importers/xlsx.md), [xml](importers/xml.md) | -| Export formats | [graphml](exporters/graphml.md), [exmaralda](exporters/exmaralda.md), [sequence](exporters/sequence.md), [xlsx](exporters/xlsx.md) | -| Graph operations | [check](graph_ops/check.md), [collapse](graph_ops/collapse.md), [enumerate](graph_ops/enumerate.md), [link](graph_ops/link.md), [map](graph_ops/map.md), [revise](graph_ops/revise.md), [chunk](graph_ops/chunk.md), [split](graph_ops/split.md), [none](graph_ops/none.md) | \ No newline at end of file +| Type | Modules | +|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Import formats | [conllu](importers/conllu.md), [exmaralda](importers/exmaralda.md), [graphml](importers/graphml.md), [meta](importers/meta.md), [none](importers/none.md), [opus](importers/opus.md), [path](importers/path.md), [ptb](importers/ptb.md), [relannis](importers/relannis.md), [saltxml](importers/saltxml.md), [textgrid](importers/textgrid.md), [toolbox](importers/toolbox.md), [treetagger](importers/treetagger.md), [xlsx](importers/xlsx.md), [xml](importers/xml.md) | +| Export formats | [graphml](exporters/graphml.md), [exmaralda](exporters/exmaralda.md), [saltxml](exporters/saltxml.md), [sequence](exporters/sequence.md), [xlsx](exporters/xlsx.md) | +| Graph operations | [check](graph_ops/check.md), [collapse](graph_ops/collapse.md), [enumerate](graph_ops/enumerate.md), [link](graph_ops/link.md), [map](graph_ops/map.md), [revise](graph_ops/revise.md), [chunk](graph_ops/chunk.md), [split](graph_ops/split.md), [none](graph_ops/none.md) | \ No newline at end of file From 1c56dbaf5af1bc0ec61c802bee32431529a7f56a Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 1 Jul 2024 14:25:54 +0200 Subject: [PATCH 18/61] Store nodes, edges and layers in the mapper struct --- src/importer/saltxml.rs | 2 + src/importer/saltxml/document.rs | 71 
+++++++++++++++++++++----------- 2 files changed, 50 insertions(+), 23 deletions(-) diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index 0a0ac55d..287bb8a1 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -64,6 +64,7 @@ enum SaltType { Feature, CorpusRelation, DocumentRelation, + TextualRelation, Layer, Token, TextualDs, @@ -81,6 +82,7 @@ impl<'a, 'input> From> for SaltType { "saltCore:SFeature" => SaltType::Feature, "sCorpusStructure:SCorpusRelation" => SaltType::CorpusRelation, "sCorpusStructure:SCorpusDocumentRelation" => SaltType::DocumentRelation, + "sDocumentStructure:STextualRelation" => SaltType::TextualRelation, "saltCore:SLayer" => SaltType::Layer, "sDocumentStructure:SToken" => SaltType::Token, "sDocumentStructure:STextualDS" => SaltType::TextualDs, diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 9486ad09..4c7cafd0 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -2,16 +2,19 @@ use std::collections::BTreeMap; use anyhow::{anyhow, Result}; use graphannis::update::{GraphUpdate, UpdateEvent}; -use roxmltree::Document; +use itertools::Itertools; +use roxmltree::Node; -use super::{get_element_id, get_feature_by_qname, SaltObject, SaltType}; +use super::{get_element_id, get_feature_by_qname, resolve_element, SaltObject, SaltType}; -pub(super) struct DocumentMapper<'input> { - document: Document<'input>, +pub(super) struct DocumentMapper<'a, 'input> { + nodes: Vec>, + edges: Vec>, + layers: Vec>, base_texts: BTreeMap, } -impl<'input> DocumentMapper<'input> { +impl<'a, 'input> DocumentMapper<'a, 'input> { pub(super) fn read_document( input: &'input str, _document_node_name: &str, @@ -25,14 +28,28 @@ impl<'input> DocumentMapper<'input> { )); } - // let layers = root - // .children() - // .filter(|n| SaltType::from(*n) == SaltType::Layer) - // .collect_vec(); + let nodes = doc + .root_element() + .children() + .filter(|n| n.tag_name().name() == "nodes") + .collect_vec(); + let edges = doc + .root_element() + .children() + .filter(|n| n.tag_name().name() == "edges") + .collect_vec(); + + let layers = doc + .root_element() + .children() + .filter(|n| n.tag_name().name() == "layers") + .collect_vec(); let mut mapper = DocumentMapper { base_texts: BTreeMap::new(), - document: doc, + nodes, + edges, + layers, }; mapper.map_textual_ds(updates)?; mapper.map_token(updates)?; @@ -42,10 +59,9 @@ impl<'input> DocumentMapper<'input> { fn map_textual_ds(&mut self, updates: &mut GraphUpdate) -> Result<()> { for text_node in self - .document - .root_element() - .children() - .filter(|n| SaltType::from(*n) == SaltType::TextualDs) + .nodes + .iter() + .filter(|n| SaltType::from(**n) == SaltType::TextualDs) { let element_id = get_element_id(&text_node) .ok_or_else(|| anyhow!("Missing element ID for textual data source"))?; @@ -66,10 +82,9 @@ impl<'input> DocumentMapper<'input> { fn map_token(&self, updates: &mut GraphUpdate) -> Result<()> { // Get the list of token in the same order as in the SaltXML file let tokens: Result> = self - .document - .root_element() - .children() - .filter(|n| n.tag_name().name() == "nodes" && SaltType::from(*n) == SaltType::Token) + .nodes + .iter() + .filter(|n| SaltType::from(**n) == SaltType::Token) .map(|t| { let id = get_element_id(&t) .ok_or_else(|| anyhow!("Missing element ID for token source"))?; @@ -85,12 +100,22 @@ impl<'input> DocumentMapper<'input> { })?; } // Connect the token to the texts by the textual relations - for _text_rel in self - .document - 
.root_element() - .children() - .filter(|n| n.tag_name().name() == "edges" && SaltType::from(*n) == SaltType::Token) + for text_rel in self + .edges + .iter() + .filter(|n| SaltType::from(**n) == SaltType::TextualRelation) { + let token = resolve_element( + text_rel.attribute("source").unwrap_or_default(), + "nodes", + &self.nodes, + ); + let datasource = resolve_element( + text_rel.attribute("target").unwrap_or_default(), + "nodes", + &self.nodes, + ); + if let (Some(_token), Some(_datasource)) = (token, datasource) {} } Ok(()) From a87d6f41909797e39ce7089d8b17ab74008c935e Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 1 Jul 2024 17:16:14 +0200 Subject: [PATCH 19/61] Restructure code to fail if source/target can't be resolved --- src/importer/saltxml/document.rs | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 4c7cafd0..a58bfe2b 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -88,7 +88,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .map(|t| { let id = get_element_id(&t) .ok_or_else(|| anyhow!("Missing element ID for token source"))?; - Ok((t, id)) + Ok((*t, id)) }) .collect(); let tokens = tokens?; @@ -105,17 +105,24 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .iter() .filter(|n| SaltType::from(**n) == SaltType::TextualRelation) { - let token = resolve_element( - text_rel.attribute("source").unwrap_or_default(), - "nodes", - &self.nodes, - ); - let datasource = resolve_element( - text_rel.attribute("target").unwrap_or_default(), - "nodes", - &self.nodes, - ); - if let (Some(_token), Some(_datasource)) = (token, datasource) {} + let source_att_val = text_rel.attribute("source").unwrap_or_default(); + let token = resolve_element(source_att_val, "nodes", &self.nodes).ok_or_else(|| { + anyhow!("Textual relation source \"{source_att_val}\" could not be resolved") + })?; + + let target_att_val = text_rel.attribute("target").unwrap_or_default(); + let datasource = + resolve_element(target_att_val, "nodes", &self.nodes).ok_or_else(|| { + anyhow!("Textual relation target \"{target_att_val}\" could not be resolved") + })?; + let _token_id = + get_element_id(&token).ok_or_else(|| anyhow!("Missing ID for token"))?; + let _datasource_id = + get_element_id(&datasource).ok_or_else(|| anyhow!("Missing ID for token"))?; + + // TODO Get the string for this token + + // TODO also get whitespace after/before } Ok(()) From d060a6ef37fae71a88340d613a5b0111f66847c5 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 2 Jul 2024 11:49:52 +0200 Subject: [PATCH 20/61] Get token value from textual relations --- src/importer/saltxml.rs | 7 ++- src/importer/saltxml/document.rs | 54 +++++++++++++------ ...ltxml__tests__read_salt_sample_corpus.snap | 45 ++++++++++++++++ 3 files changed, 88 insertions(+), 18 deletions(-) diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index 287bb8a1..420519f9 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -97,6 +97,7 @@ impl<'a, 'input> From> for SaltType { enum SaltObject { Text(String), Boolean(bool), + Integer(i64), Null, } @@ -104,9 +105,12 @@ impl From<&str> for SaltObject { fn from(value: &str) -> Self { if let Some(value) = value.strip_prefix("T::") { SaltObject::Text(value.to_string()) - } else if let Some(_value) = value.strip_prefix("B::") { + } else if let Some(value) = value.strip_prefix("B::") { let value = value.to_ascii_lowercase() == "true"; 
SaltObject::Boolean(value) + } else if let Some(value) = value.strip_prefix("N::") { + let value = value.parse::().unwrap_or_default(); + SaltObject::Integer(value) } else { SaltObject::Null } @@ -118,6 +122,7 @@ impl std::fmt::Display for SaltObject { match self { SaltObject::Text(val) => write!(f, "{val}"), SaltObject::Boolean(val) => write!(f, "{val}"), + SaltObject::Integer(val) => write!(f, "{val}"), SaltObject::Null => write!(f, ""), } } diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index a58bfe2b..b3ec645d 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -1,7 +1,8 @@ -use std::collections::BTreeMap; +use std::{collections::BTreeMap, convert::TryFrom}; -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, Context, Result}; use graphannis::update::{GraphUpdate, UpdateEvent}; +use graphannis_core::graph::ANNIS_NS; use itertools::Itertools; use roxmltree::Node; @@ -63,8 +64,8 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .iter() .filter(|n| SaltType::from(**n) == SaltType::TextualDs) { - let element_id = get_element_id(&text_node) - .ok_or_else(|| anyhow!("Missing element ID for textual data source"))?; + let element_id = + get_element_id(&text_node).context("Missing element ID for textual data source")?; if let Some(SaltObject::Text(anno_value)) = get_feature_by_qname(&text_node, "saltCommon", "SDATA") @@ -86,8 +87,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .iter() .filter(|n| SaltType::from(**n) == SaltType::Token) .map(|t| { - let id = get_element_id(&t) - .ok_or_else(|| anyhow!("Missing element ID for token source"))?; + let id = get_element_id(&t).context("Missing element ID for token source")?; Ok((*t, id)) }) .collect(); @@ -106,21 +106,41 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .filter(|n| SaltType::from(**n) == SaltType::TextualRelation) { let source_att_val = text_rel.attribute("source").unwrap_or_default(); - let token = resolve_element(source_att_val, "nodes", &self.nodes).ok_or_else(|| { - anyhow!("Textual relation source \"{source_att_val}\" could not be resolved") - })?; + let token = + resolve_element(source_att_val, "nodes", &self.nodes).with_context(|| { + format!("Textual relation source \"{source_att_val}\" could not be resolved") + })?; let target_att_val = text_rel.attribute("target").unwrap_or_default(); let datasource = - resolve_element(target_att_val, "nodes", &self.nodes).ok_or_else(|| { - anyhow!("Textual relation target \"{target_att_val}\" could not be resolved") + resolve_element(target_att_val, "nodes", &self.nodes).with_context(|| { + format!("Textual relation target \"{target_att_val}\" could not be resolved") })?; - let _token_id = - get_element_id(&token).ok_or_else(|| anyhow!("Missing ID for token"))?; - let _datasource_id = - get_element_id(&datasource).ok_or_else(|| anyhow!("Missing ID for token"))?; - - // TODO Get the string for this token + let token_id = get_element_id(&token).context("Missing ID for token")?; + let datasource_id = get_element_id(&datasource).context("Missing ID for token")?; + + // Get the string for this token + let matching_base_text = self + .base_texts + .get(&datasource_id) + .with_context(|| format!("Missing base text for token {token_id}"))?; + let start_offset = + get_feature_by_qname(text_rel, "salt", "SSTART").context("Missing start value")?; + let end_offset = + get_feature_by_qname(text_rel, "salt", "SEND").context("Missing end value")?; + if let (SaltObject::Integer(start), SaltObject::Integer(end)) = + (start_offset, 
end_offset) + { + let start = usize::try_from(start)?; + let end = usize::try_from(end)?; + let covered_text = &matching_base_text[start..end]; + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: token_id, + anno_ns: ANNIS_NS.to_string(), + anno_name: "tok".to_string(), + anno_value: covered_text.to_string(), + })?; + } // TODO also get whitespace after/before } diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index 7f9b92f9..92504a2d 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -8,6 +8,7 @@ expression: actual + rootCorpus @@ -46,144 +47,188 @@ expression: actual node + Is node + this node + example node + more node + complicated node + than node + it node + appears node + to node + be node + ? datasource node + Is node + this node + example node + more node + complicated node + than node + it node + appears node + to node + be node + ? datasource node + Is node + this node + example node + more node + complicated node + than node + it node + appears node + to node + be node + ? datasource node + Is node + this node + example node + more node + complicated node + than node + it node + appears node + to node + be node + ? From 2988822419371e99b7e754b7cf9bbde50c6987c6 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 2 Jul 2024 12:04:19 +0200 Subject: [PATCH 21/61] Add ordering edges between the token --- src/importer/saltxml/document.rs | 51 +++++++--- ...ltxml__tests__read_salt_sample_corpus.snap | 92 +++++++++++++++++-- 2 files changed, 125 insertions(+), 18 deletions(-) diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index b3ec645d..6e5259b2 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -1,7 +1,10 @@ use std::{collections::BTreeMap, convert::TryFrom}; use anyhow::{anyhow, Context, Result}; -use graphannis::update::{GraphUpdate, UpdateEvent}; +use graphannis::{ + model::AnnotationComponentType, + update::{GraphUpdate, UpdateEvent}, +}; use graphannis_core::graph::ANNIS_NS; use itertools::Itertools; use roxmltree::Node; @@ -99,12 +102,27 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { node_type: "node".to_string(), })?; } - // Connect the token to the texts by the textual relations - for text_rel in self + + // Order textual relations by their start offset, so we iterate in the + // actual order of the tokens. 
+ let sorted_text_rels: BTreeMap = self .edges .iter() .filter(|n| SaltType::from(**n) == SaltType::TextualRelation) - { + .map(|text_rel| { + let start = + get_feature_by_qname(text_rel, "salt", "SSTART").unwrap_or(SaltObject::Null); + if let SaltObject::Integer(start) = start { + (start, *text_rel) + } else { + (-1, *text_rel) + } + }) + .collect(); + + // Connect the token to the texts by the textual relations + let mut previous_token = None; + for (_, text_rel) in sorted_text_rels { let source_att_val = text_rel.attribute("source").unwrap_or_default(); let token = resolve_element(source_att_val, "nodes", &self.nodes).with_context(|| { @@ -124,23 +142,32 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .base_texts .get(&datasource_id) .with_context(|| format!("Missing base text for token {token_id}"))?; - let start_offset = - get_feature_by_qname(text_rel, "salt", "SSTART").context("Missing start value")?; - let end_offset = - get_feature_by_qname(text_rel, "salt", "SEND").context("Missing end value")?; - if let (SaltObject::Integer(start), SaltObject::Integer(end)) = - (start_offset, end_offset) - { + let start = + get_feature_by_qname(&text_rel, "salt", "SSTART").context("Missing start value")?; + let end = + get_feature_by_qname(&text_rel, "salt", "SEND").context("Missing end value")?; + if let (SaltObject::Integer(start), SaltObject::Integer(end)) = (start, end) { let start = usize::try_from(start)?; let end = usize::try_from(end)?; let covered_text = &matching_base_text[start..end]; updates.add_event(UpdateEvent::AddNodeLabel { - node_name: token_id, + node_name: token_id.clone(), anno_ns: ANNIS_NS.to_string(), anno_name: "tok".to_string(), anno_value: covered_text.to_string(), })?; } + // Add ordering edges between the tokens for the base token layer + if let Some(previous_token) = previous_token { + updates.add_event(UpdateEvent::AddEdge { + source_node: previous_token, + target_node: token_id.clone(), + layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::Ordering.to_string(), + component_name: "".to_string(), + })?; + } + previous_token = Some(token_id); // TODO also get whitespace after/before } diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index 92504a2d..5ae83878 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -230,17 +230,97 @@ expression: actual node ? 
- + - + - + - + - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 66227c08d7f5eb4682472fab7b55cb41895f4b17 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 2 Jul 2024 12:23:21 +0200 Subject: [PATCH 22/61] Add whitespace before/after token --- src/importer/saltxml/document.rs | 39 +++++++++++++++++-- ...ltxml__tests__read_salt_sample_corpus.snap | 37 ++++++++++++++++++ 2 files changed, 72 insertions(+), 4 deletions(-) diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 6e5259b2..ddb3d49c 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -1,4 +1,7 @@ -use std::{collections::BTreeMap, convert::TryFrom}; +use std::{ + collections::BTreeMap, + convert::{TryFrom, TryInto}, +}; use anyhow::{anyhow, Context, Result}; use graphannis::{ @@ -122,7 +125,8 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { // Connect the token to the texts by the textual relations let mut previous_token = None; - for (_, text_rel) in sorted_text_rels { + let mut sorted_text_rels = sorted_text_rels.into_iter().peekable(); + while let Some((_, text_rel)) = sorted_text_rels.next() { let source_att_val = text_rel.attribute("source").unwrap_or_default(); let token = resolve_element(source_att_val, "nodes", &self.nodes).with_context(|| { @@ -156,6 +160,35 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { anno_name: "tok".to_string(), anno_value: covered_text.to_string(), })?; + + // Get the whitespace before the first token + if previous_token.is_none() && start > 0 { + let whitespace = &matching_base_text[0..start]; + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: token_id.clone(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "tok-whitespace-before".to_string(), + anno_value: whitespace.to_string(), + })?; + } + + // Add whitespace after this token + + let next_token_offset = sorted_text_rels + .peek() + .map(|(offset, _)| *offset) + .unwrap_or_else(|| matching_base_text.len().try_into().unwrap_or(i64::MAX)); + let next_token_offset = usize::try_from(next_token_offset).unwrap_or(0); + + if next_token_offset > end && (next_token_offset - end) >= 1 { + let whitespace = &matching_base_text[end..next_token_offset]; + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: token_id.clone(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "tok-whitespace-after".to_string(), + anno_value: whitespace.to_string(), + })?; + } } // Add ordering edges between the tokens for the base token layer if let Some(previous_token) = previous_token { @@ -168,8 +201,6 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { })?; } previous_token = Some(token_id); - - // TODO also get whitespace after/before } Ok(()) diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index 5ae83878..fbf20e7a 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -9,6 +9,7 @@ expression: actual + rootCorpus @@ -48,38 +49,47 @@ expression: actual node Is + node this + node example + node more + node complicated + node than + node it + node appears + node to + node @@ -95,38 +105,47 @@ expression: actual node Is + node this + node example + node more + node 
complicated + node than + node it + node appears + node to + node @@ -142,38 +161,47 @@ expression: actual node Is + node this + node example + node more + node complicated + node than + node it + node appears + node to + node @@ -189,38 +217,47 @@ expression: actual node Is + node this + node example + node more + node complicated + node than + node it + node appears + node to + node From 58c7f2b303944c2cafa4c15320c7d826ffba1e84 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 2 Jul 2024 12:43:00 +0200 Subject: [PATCH 23/61] Add layer information for tokens --- src/importer/saltxml/document.rs | 26 +- ...ltxml__tests__read_salt_sample_corpus.snap | 321 ++++++++++-------- 2 files changed, 204 insertions(+), 143 deletions(-) diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index ddb3d49c..51d2375c 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -3,7 +3,7 @@ use std::{ convert::{TryFrom, TryInto}, }; -use anyhow::{anyhow, Context, Result}; +use anyhow::{bail, Context, Result}; use graphannis::{ model::AnnotationComponentType, update::{GraphUpdate, UpdateEvent}, @@ -30,9 +30,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { let doc = roxmltree::Document::parse(input)?; let root = doc.root_element(); if root.tag_name().name() != "SDocumentGraph" { - return Err(anyhow!( - "SaltXML document file must start with tag" - )); + bail!("SaltXML document file must start with tag"); } let nodes = doc @@ -52,6 +50,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .children() .filter(|n| n.tag_name().name() == "layers") .collect_vec(); + let mut mapper = DocumentMapper { base_texts: BTreeMap::new(), nodes, @@ -99,11 +98,28 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .collect(); let tokens = tokens?; - for (_, t_id) in tokens.iter() { + for (token_node, t_id) in tokens.iter() { updates.add_event(UpdateEvent::AddNode { node_name: t_id.clone(), node_type: "node".to_string(), })?; + + if let Some(layers_attribute) = token_node.attribute("layers") { + for layer_ref in layers_attribute.split(' ') { + let layer_node = resolve_element(layer_ref, "layers", &self.layers) + .context("Could not resolve layer")?; + if let Some(SaltObject::Text(layer_name)) = + get_feature_by_qname(&layer_node, "salt", "SNAME") + { + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: t_id.clone(), + anno_ns: ANNIS_NS.to_owned(), + anno_name: "layer".to_owned(), + anno_value: layer_name, + })?; + } + } + } } // Order textual relations by their start offset, so we iterate in the diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index fbf20e7a..f0657d34 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -7,265 +7,310 @@ expression: actual - - - + + + + rootCorpus - corpus + corpus subCorpus1 - corpus + corpus subCorpus2 - corpus + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt doc1 - corpus + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt doc2 - corpus + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt doc3 - corpus + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt doc4 - corpus + corpus - datasource + 
datasource - node - Is - + morphology + node + Is + - node - this - + morphology + node + this + - node - example - + morphology + node + example + - node - more - + morphology + node + more + - node - complicated - + morphology + node + complicated + - node - than - + morphology + node + than + - node - it - + morphology + node + it + - node - appears - + morphology + node + appears + - node - to - + morphology + node + to + - node - be + morphology + node + be - node - ? + morphology + node + ? - datasource + datasource - node - Is - + morphology + node + Is + - node - this - + morphology + node + this + - node - example - + morphology + node + example + - node - more - + morphology + node + more + - node - complicated - + morphology + node + complicated + - node - than - + morphology + node + than + - node - it - + morphology + node + it + - node - appears - + morphology + node + appears + - node - to - + morphology + node + to + - node - be + morphology + node + be - node - ? + morphology + node + ? - datasource + datasource - node - Is - + morphology + node + Is + - node - this - + morphology + node + this + - node - example - + morphology + node + example + - node - more - + morphology + node + more + - node - complicated - + morphology + node + complicated + - node - than - + morphology + node + than + - node - it - + morphology + node + it + - node - appears - + morphology + node + appears + - node - to - + morphology + node + to + - node - be + morphology + node + be - node - ? + morphology + node + ? - datasource + datasource - node - Is - + morphology + node + Is + - node - this - + morphology + node + this + - node - example - + morphology + node + example + - node - more - + morphology + node + more + - node - complicated - + morphology + node + complicated + - node - than - + morphology + node + than + - node - it - + morphology + node + it + - node - appears - + morphology + node + appears + - node - to - + morphology + node + to + - node - be + morphology + node + be - node - ? + morphology + node + ? 
From eba61fbefdc0203e008e2bb550fabc03ad7db6bd Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 2 Jul 2024 12:44:10 +0200 Subject: [PATCH 24/61] Fix some clippy issues --- src/importer/saltxml.rs | 2 +- src/importer/saltxml/document.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index 420519f9..08520799 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -152,7 +152,7 @@ fn get_feature_by_qname(n: &Node, namespace: &str, name: &str) -> Option DocumentMapper<'a, 'input> { .filter(|n| SaltType::from(**n) == SaltType::TextualDs) { let element_id = - get_element_id(&text_node).context("Missing element ID for textual data source")?; + get_element_id(text_node).context("Missing element ID for textual data source")?; if let Some(SaltObject::Text(anno_value)) = - get_feature_by_qname(&text_node, "saltCommon", "SDATA") + get_feature_by_qname(text_node, "saltCommon", "SDATA") { self.base_texts.insert(element_id.clone(), anno_value); updates.add_event(UpdateEvent::AddNode { @@ -92,7 +92,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .iter() .filter(|n| SaltType::from(**n) == SaltType::Token) .map(|t| { - let id = get_element_id(&t).context("Missing element ID for token source")?; + let id = get_element_id(t).context("Missing element ID for token source")?; Ok((*t, id)) }) .collect(); From 83089907558d10da2445d355b54c0589c9a138af Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 2 Jul 2024 12:49:34 +0200 Subject: [PATCH 25/61] Decouple CLI compilation and running the documentation generation to avoid action failure when there are warnings --- .github/workflows/module_docs.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/module_docs.yml b/.github/workflows/module_docs.yml index 78a57c49..98401196 100644 --- a/.github/workflows/module_docs.yml +++ b/.github/workflows/module_docs.yml @@ -11,8 +11,10 @@ jobs: steps: - uses: actions/checkout@v3 - uses: actions-rust-lang/setup-rust-toolchain@v1.4.4 + - name: Compile CLI + run: cargo build - name: Update documentation - run: cargo run -- documentation docs/ + run: ./target/debug/annatto documentation docs/ - name: Commit changed documentation uses: stefanzweifel/git-auto-commit-action@v5.0.1 with: From b5896f338d097435640228b9f9ddb24cb2d24493 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 2 Jul 2024 13:05:45 +0200 Subject: [PATCH 26/61] Add token annotations --- src/importer/saltxml.rs | 7 + src/importer/saltxml/document.rs | 24 +- ...ltxml__tests__read_salt_sample_corpus.snap | 366 +++++++++++------- 3 files changed, 257 insertions(+), 140 deletions(-) diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index 08520799..c2461528 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -62,6 +62,7 @@ enum SaltType { Document, ElementId, Feature, + Annotation, CorpusRelation, DocumentRelation, TextualRelation, @@ -80,6 +81,7 @@ impl<'a, 'input> From> for SaltType { "sCorpusStructure:SDocument" => SaltType::Document, "saltCore:SElementId" => SaltType::ElementId, "saltCore:SFeature" => SaltType::Feature, + "saltCore:SAnnotation" => SaltType::Annotation, "sCorpusStructure:SCorpusRelation" => SaltType::CorpusRelation, "sCorpusStructure:SCorpusDocumentRelation" => SaltType::DocumentRelation, "sDocumentStructure:STextualRelation" => SaltType::TextualRelation, @@ -146,6 +148,11 @@ fn get_features<'a, 'input>(n: &'a Node<'a, 'input>) -> impl Iterator(n: &'a Node<'a, 'input>) -> impl 
Iterator> { + n.children() + .filter(|n| n.tag_name().name() == "labels" && SaltType::from(*n) == SaltType::Annotation) +} + fn get_feature_by_qname(n: &Node, namespace: &str, name: &str) -> Option { get_features(n) .filter(|f| { diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index ed0c16bc..a3c99e4b 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -12,7 +12,9 @@ use graphannis_core::graph::ANNIS_NS; use itertools::Itertools; use roxmltree::Node; -use super::{get_element_id, get_feature_by_qname, resolve_element, SaltObject, SaltType}; +use super::{ + get_annotations, get_element_id, get_feature_by_qname, resolve_element, SaltObject, SaltType, +}; pub(super) struct DocumentMapper<'a, 'input> { nodes: Vec>, @@ -120,6 +122,25 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { } } } + + for label_node in get_annotations(token_node) { + let anno_ns = label_node + .attribute("namespace") + .unwrap_or_default() + .to_string(); + let anno_name = label_node + .attribute("name") + .context("Missing annotation name for token")? + .to_string(); + let anno_value = + SaltObject::from(label_node.attribute("value").unwrap_or_default()).to_string(); + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: t_id.clone(), + anno_ns, + anno_name, + anno_value, + })?; + } } // Order textual relations by their start offset, so we iterate in the @@ -189,7 +210,6 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { } // Add whitespace after this token - let next_token_offset = sorted_text_rels .peek() .map(|(offset, _)| *offset) diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index f0657d34..aec3c53c 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -8,309 +8,399 @@ expression: actual - - - + + + + + rootCorpus - corpus + corpus subCorpus1 - corpus + corpus subCorpus2 - corpus + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt doc1 - corpus + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt doc2 - corpus + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt doc3 - corpus + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt doc4 - corpus + corpus - datasource + datasource morphology - node - Is - + be + node + VBZ + Is + morphology - node - this - + this + node + DT + this + morphology - node - example - + example + node + NN + example + morphology - node - more - + more + node + RBR + more + morphology - node - complicated - + complicated + node + JJ + complicated + morphology - node - than - + than + node + IN + than + morphology - node - it - + it + node + PRP + it + morphology - node - appears - + appear + node + VBZ + appears + morphology - node - to - + to + node + TO + to + morphology - node - be + be + node + VB + be morphology - node - ? + ? + node + . + ? 
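As an illustration of the token-annotation mapping above (a minimal sketch, not code from this patch): Salt serializes label values with a type prefix such as `T::` for text, which is presumably why the snapshot below contains plain values like `VBZ` and `be` rather than a prefixed form. Assuming that convention, a label of roughly the shape <labels xsi:type="saltCore:SAnnotation" name="pos" value="T::VBZ"/> ends up as the node annotation pos=VBZ on the token; the hypothetical helper below mirrors the text case only, while the patch itself routes the conversion through `SaltObject::from`.

fn strip_salt_text_prefix(raw: &str) -> &str {
    // "T::VBZ" -> "VBZ"; values without a recognised prefix are kept as-is.
    raw.strip_prefix("T::").unwrap_or(raw)
}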
- datasource + datasource morphology - node - Is - + be + node + VBZ + Is + morphology - node - this - + this + node + DT + this + morphology - node - example - + example + node + NN + example + morphology - node - more - + more + node + RBR + more + morphology - node - complicated - + complicated + node + JJ + complicated + morphology - node - than - + than + node + IN + than + morphology - node - it - + it + node + PRP + it + morphology - node - appears - + appear + node + VBZ + appears + morphology - node - to - + to + node + TO + to + morphology - node - be + be + node + VB + be morphology - node - ? + ? + node + . + ? - datasource + datasource morphology - node - Is - + be + node + VBZ + Is + morphology - node - this - + this + node + DT + this + morphology - node - example - + example + node + NN + example + morphology - node - more - + more + node + RBR + more + morphology - node - complicated - + complicated + node + JJ + complicated + morphology - node - than - + than + node + IN + than + morphology - node - it - + it + node + PRP + it + morphology - node - appears - + appear + node + VBZ + appears + morphology - node - to - + to + node + TO + to + morphology - node - be + be + node + VB + be morphology - node - ? + ? + node + . + ? - datasource + datasource morphology - node - Is - + be + node + VBZ + Is + morphology - node - this - + this + node + DT + this + morphology - node - example - + example + node + NN + example + morphology - node - more - + more + node + RBR + more + morphology - node - complicated - + complicated + node + JJ + complicated + morphology - node - than - + than + node + IN + than + morphology - node - it - + it + node + PRP + it + morphology - node - appears - + appear + node + VBZ + appears + morphology - node - to - + to + node + TO + to + morphology - node - be + be + node + VB + be morphology - node - ? + ? + node + . + ? 
From 51f1dfc381605f1d1a1414be339cabe397623225 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 3 Jul 2024 12:22:00 +0200 Subject: [PATCH 27/61] Add spans and spanning relations --- src/importer/saltxml.rs | 22 +- src/importer/saltxml/corpus_structure.rs | 4 +- src/importer/saltxml/document.rs | 196 +++-- ...ltxml__tests__read_salt_sample_corpus.snap | 815 +++++++++++------- 4 files changed, 639 insertions(+), 398 deletions(-) diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index c2461528..9fc822fd 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -66,14 +66,16 @@ enum SaltType { CorpusRelation, DocumentRelation, TextualRelation, + SpanningRelation, Layer, Token, + Span, TextualDs, Unknown, } -impl<'a, 'input> From> for SaltType { - fn from(n: Node) -> Self { +impl SaltType { + fn from_node(n: &Node) -> SaltType { // Use the xsi:type attribute to determine the type if let Some(type_id) = n.attribute((XSI_NAMESPACE, "type")) { match type_id { @@ -85,8 +87,10 @@ impl<'a, 'input> From> for SaltType { "sCorpusStructure:SCorpusRelation" => SaltType::CorpusRelation, "sCorpusStructure:SCorpusDocumentRelation" => SaltType::DocumentRelation, "sDocumentStructure:STextualRelation" => SaltType::TextualRelation, + "sDocumentStructure:SSpanningRelation" => SaltType::SpanningRelation, "saltCore:SLayer" => SaltType::Layer, "sDocumentStructure:SToken" => SaltType::Token, + "sDocumentStructure:SSpan" => SaltType::Span, "sDocumentStructure:STextualDS" => SaltType::TextualDs, _ => SaltType::Unknown, } @@ -131,10 +135,9 @@ impl std::fmt::Display for SaltObject { } fn get_element_id(n: &Node) -> Option { - for element_id_label in n - .children() - .filter(|c| c.tag_name().name() == "labels" && SaltType::from(*c) == SaltType::ElementId) - { + for element_id_label in n.children().filter(|c| { + c.tag_name().name() == "labels" && SaltType::from_node(c) == SaltType::ElementId + }) { if let Some(id) = element_id_label.attribute("value") { let id = SaltObject::from(id); return Some(id.to_string().trim_start_matches("salt:/").to_string()); @@ -145,12 +148,13 @@ fn get_element_id(n: &Node) -> Option { fn get_features<'a, 'input>(n: &'a Node<'a, 'input>) -> impl Iterator> { n.children() - .filter(|n| n.tag_name().name() == "labels" && SaltType::from(*n) == SaltType::Feature) + .filter(|n| n.tag_name().name() == "labels" && SaltType::from_node(n) == SaltType::Feature) } fn get_annotations<'a, 'input>(n: &'a Node<'a, 'input>) -> impl Iterator> { - n.children() - .filter(|n| n.tag_name().name() == "labels" && SaltType::from(*n) == SaltType::Annotation) + n.children().filter(|n| { + n.tag_name().name() == "labels" && SaltType::from_node(n) == SaltType::Annotation + }) } fn get_feature_by_qname(n: &Node, namespace: &str, name: &str) -> Option { diff --git a/src/importer/saltxml/corpus_structure.rs b/src/importer/saltxml/corpus_structure.rs index 3e0e9738..0de32de2 100644 --- a/src/importer/saltxml/corpus_structure.rs +++ b/src/importer/saltxml/corpus_structure.rs @@ -47,7 +47,7 @@ impl SaltCorpusStructureMapper { .collect_vec(); for node in nodes.iter() { - let salt_type = SaltType::from(*node); + let salt_type = SaltType::from_node(node); match salt_type { SaltType::Corpus | SaltType::Document => { // Get the element ID from the label @@ -102,7 +102,7 @@ impl SaltCorpusStructureMapper { // Add a PartOf Edge between parent corpora and the sub-corpora/documents for e in cg.children().filter(|n| n.tag_name().name() == "edges") { - match SaltType::from(e) { + match 
SaltType::from_node(&e) { SaltType::CorpusRelation | SaltType::DocumentRelation => { let source_ref = e.attribute("source").unwrap_or_default(); let target_ref = e.attribute("target").unwrap_or_default(); diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index a3c99e4b..e40c5899 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -59,17 +59,25 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { edges, layers, }; - mapper.map_textual_ds(updates)?; - mapper.map_token(updates)?; + mapper.map_textual_datasources(updates)?; + mapper.map_tokens(updates)?; + + mapper.map_spans(updates)?; + // TODO map SStructure and SDominanceRelation; + // TODO map SPointingRelation + + // TODO map STimeline and STimelineRelation + // TODO map SOrderRelation for segmentation nodes + // TODO map SAudioDS and SAudioRelation Ok(()) } - fn map_textual_ds(&mut self, updates: &mut GraphUpdate) -> Result<()> { + fn map_textual_datasources(&mut self, updates: &mut GraphUpdate) -> Result<()> { for text_node in self .nodes .iter() - .filter(|n| SaltType::from(**n) == SaltType::TextualDs) + .filter(|n| SaltType::from_node(n) == SaltType::TextualDs) { let element_id = get_element_id(text_node).context("Missing element ID for textual data source")?; @@ -87,68 +95,128 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { Ok(()) } - fn map_token(&self, updates: &mut GraphUpdate) -> Result<()> { - // Get the list of token in the same order as in the SaltXML file - let tokens: Result> = self - .nodes - .iter() - .filter(|n| SaltType::from(**n) == SaltType::Token) - .map(|t| { - let id = get_element_id(t).context("Missing element ID for token source")?; - Ok((*t, id)) - }) - .collect(); - let tokens = tokens?; - - for (token_node, t_id) in tokens.iter() { - updates.add_event(UpdateEvent::AddNode { - node_name: t_id.clone(), - node_type: "node".to_string(), - })?; + fn map_node(&self, n: &Node, updates: &mut GraphUpdate) -> Result<()> { + let id = get_element_id(n).context("Missing element ID for node")?; + updates.add_event(UpdateEvent::AddNode { + node_name: id.clone(), + node_type: "node".to_string(), + })?; - if let Some(layers_attribute) = token_node.attribute("layers") { - for layer_ref in layers_attribute.split(' ') { - let layer_node = resolve_element(layer_ref, "layers", &self.layers) - .context("Could not resolve layer")?; - if let Some(SaltObject::Text(layer_name)) = - get_feature_by_qname(&layer_node, "salt", "SNAME") - { - updates.add_event(UpdateEvent::AddNodeLabel { - node_name: t_id.clone(), - anno_ns: ANNIS_NS.to_owned(), - anno_name: "layer".to_owned(), - anno_value: layer_name, - })?; - } + if let Some(layers_attribute) = n.attribute("layers") { + for layer_ref in layers_attribute.split(' ') { + let layer_node = resolve_element(layer_ref, "layers", &self.layers) + .context("Could not resolve layer")?; + if let Some(SaltObject::Text(layer_name)) = + get_feature_by_qname(&layer_node, "salt", "SNAME") + { + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: id.clone(), + anno_ns: ANNIS_NS.to_owned(), + anno_name: "layer".to_owned(), + anno_value: layer_name, + })?; } } + } - for label_node in get_annotations(token_node) { - let anno_ns = label_node - .attribute("namespace") - .unwrap_or_default() - .to_string(); - let anno_name = label_node - .attribute("name") - .context("Missing annotation name for token")? 
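A note on the layer handling just above (a minimal, hypothetical sketch rather than code from the patch): the `layers` attribute of a node holds space-separated XMI-style positional references, assumed here to look like `//@layers.0`, and `resolve_element` is expected to map such a reference back to the corresponding layer element so that its `salt::SNAME` feature can be written as the `annis::layer` label. A parser for just the index part of such a reference could look like this:

fn layer_index(reference: &str) -> Option<usize> {
    // "//@layers.0" -> Some(0); anything else -> None
    reference.strip_prefix("//@layers.")?.parse().ok()
}

With that, layer_index("//@layers.1") would yield Some(1), i.e. the second entry of the document's layer list.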
- .to_string(); - let anno_value = - SaltObject::from(label_node.attribute("value").unwrap_or_default()).to_string(); - updates.add_event(UpdateEvent::AddNodeLabel { - node_name: t_id.clone(), - anno_ns, - anno_name, - anno_value, - })?; + for label_node in get_annotations(n) { + let anno_ns = label_node + .attribute("namespace") + .unwrap_or_default() + .to_string(); + let anno_name = label_node + .attribute("name") + .context("Missing annotation name for node")? + .to_string(); + let anno_value = + SaltObject::from(label_node.attribute("value").unwrap_or_default()).to_string(); + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: id.clone(), + anno_ns, + anno_name, + anno_value, + })?; + } + Ok(()) + } + + fn map_edge( + &self, + rel: &Node, + component_type: AnnotationComponentType, + fallback_component_name: &str, + updates: &mut GraphUpdate, + ) -> Result<()> { + let source_att_val = rel.attribute("source").unwrap_or_default(); + let source_element = + resolve_element(source_att_val, "nodes", &self.nodes).context("Missing source node")?; + let source_id = get_element_id(&source_element).context("Missing source node ID")?; + + let target_att_val = rel.attribute("target").unwrap_or_default(); + let target_element = + resolve_element(target_att_val, "nodes", &self.nodes).context("Missing target node")?; + let target_id = get_element_id(&target_element).context("Missing target node ID")?; + + let component_name = get_feature_by_qname(rel, "salt", "STYPE") + .map(|t| t.to_string()) + .unwrap_or_else(|| fallback_component_name.to_string()); + + let mut component_layer = "default_ns".to_string(); + if let Some(layers_attribute) = rel.attribute("layers") { + if let Some(first_layer) = layers_attribute.split(' ').next() { + component_layer = first_layer.to_string(); } } + updates.add_event(UpdateEvent::AddEdge { + source_node: source_id.clone(), + target_node: target_id.clone(), + layer: component_layer.clone(), + component_type: component_type.to_string(), + component_name: component_name.clone(), + })?; + + for label_element in get_annotations(rel) { + let anno_ns = label_element + .attribute("namespace") + .unwrap_or_default() + .to_string(); + let anno_name = label_element + .attribute("name") + .context("Missing annotation name for edge")? + .to_string(); + let anno_value = + SaltObject::from(label_element.attribute("value").unwrap_or_default()).to_string(); + updates.add_event(UpdateEvent::AddEdgeLabel { + source_node: source_id.clone(), + target_node: target_id.clone(), + layer: component_layer.clone(), + component_type: component_type.to_string(), + component_name: component_name.clone(), + anno_ns, + anno_name, + anno_value, + })?; + } + Ok(()) + } + + fn map_tokens(&self, updates: &mut GraphUpdate) -> Result<()> { + // Map the token nodes in the same order as in the SaltXML file + for token_node in self + .nodes + .iter() + .filter(|n| SaltType::from_node(n) == SaltType::Token) + { + self.map_node(token_node, updates)?; + } + // Order textual relations by their start offset, so we iterate in the // actual order of the tokens. 
let sorted_text_rels: BTreeMap = self .edges .iter() - .filter(|n| SaltType::from(**n) == SaltType::TextualRelation) + .filter(|n| SaltType::from_node(n) == SaltType::TextualRelation) .map(|text_rel| { let start = get_feature_by_qname(text_rel, "salt", "SSTART").unwrap_or(SaltObject::Null); @@ -241,4 +309,24 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { Ok(()) } + + fn map_spans(&self, updates: &mut GraphUpdate) -> Result<()> { + for span_node in self + .nodes + .iter() + .filter(|n| SaltType::from_node(n) == SaltType::Span) + { + self.map_node(span_node, updates)?; + } + + // Connect all spans with the token using the spanning relations + for spanning_rel in self + .edges + .iter() + .filter(|rel| SaltType::from_node(rel) == SaltType::SpanningRelation) + { + self.map_edge(spanning_rel, AnnotationComponentType::Coverage, "", updates)?; + } + Ok(()) + } } diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index aec3c53c..98bae5f3 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -4,495 +4,644 @@ expression: actual --- - - - - - - - - - + + + + + + + + + + - rootCorpus - corpus + rootCorpus + corpus - subCorpus1 - corpus + subCorpus1 + corpus - subCorpus2 - corpus + subCorpus2 + corpus - file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt - doc1 - corpus + file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt + doc1 + corpus - file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt - doc2 - corpus + file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt + doc2 + corpus - file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt - doc3 - corpus + file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt + doc3 + corpus - file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt - doc4 - corpus + file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt + doc4 + corpus - datasource + datasource - morphology - be - node - VBZ - Is - + morphology + be + node + VBZ + Is + - morphology - this - node - DT - this - + morphology + this + node + DT + this + - morphology - example - node - NN - example - + morphology + example + node + NN + example + - morphology - more - node - RBR - more - + morphology + more + node + RBR + more + - morphology - complicated - node - JJ - complicated - + morphology + complicated + node + JJ + complicated + - morphology - than - node - IN - than - + morphology + than + node + IN + than + - morphology - it - node - PRP - it - + morphology + it + node + PRP + it + - morphology - appear - node - VBZ - appears - + morphology + appear + node + VBZ + appears + - morphology - to - node - TO - to - + morphology + to + node + TO + to + - morphology - be - node - VB - be + morphology + be + node + VB + be - morphology - ? - node - . - ? + morphology + ? + node + . + ? 
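To make the ordering trick above concrete (an illustrative, self-contained sketch with made-up offsets, not code from the patch): keying the STextualRelation entries by their SSTART value in a BTreeMap means iteration already follows the order of the tokens in the text, and the slice between one token's SEND and the next token's SSTART is exactly the whitespace that gets attached to the earlier token.

use std::collections::BTreeMap;

fn main() {
    let text = "Is this";
    // SSTART -> (SSTART, SEND), deliberately inserted out of order
    let mut by_start: BTreeMap<i64, (usize, usize)> = BTreeMap::new();
    by_start.insert(3, (3, 7)); // "this"
    by_start.insert(0, (0, 2)); // "Is"
    let starts: Vec<i64> = by_start.keys().copied().collect();
    assert_eq!(starts, vec![0, 3]); // a BTreeMap iterates in ascending key order
    assert_eq!(&text[2..3], " "); // the gap between "Is" and "this"
}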
+ + + contrast-focus + node + + + topic + node + + + node - datasource + datasource - morphology - be - node - VBZ - Is - + morphology + be + node + VBZ + Is + - morphology - this - node - DT - this - + morphology + this + node + DT + this + - morphology - example - node - NN - example - + morphology + example + node + NN + example + - morphology - more - node - RBR - more - + morphology + more + node + RBR + more + - morphology - complicated - node - JJ - complicated - + morphology + complicated + node + JJ + complicated + - morphology - than - node - IN - than - + morphology + than + node + IN + than + - morphology - it - node - PRP - it - + morphology + it + node + PRP + it + - morphology - appear - node - VBZ - appears - + morphology + appear + node + VBZ + appears + - morphology - to - node - TO - to - + morphology + to + node + TO + to + - morphology - be - node - VB - be + morphology + be + node + VB + be - morphology - ? - node - . - ? + morphology + ? + node + . + ? + + + contrast-focus + node + + + topic + node + + + node - datasource + datasource - morphology - be - node - VBZ - Is - + morphology + be + node + VBZ + Is + - morphology - this - node - DT - this - + morphology + this + node + DT + this + - morphology - example - node - NN - example - + morphology + example + node + NN + example + - morphology - more - node - RBR - more - + morphology + more + node + RBR + more + - morphology - complicated - node - JJ - complicated - + morphology + complicated + node + JJ + complicated + - morphology - than - node - IN - than - + morphology + than + node + IN + than + - morphology - it - node - PRP - it - + morphology + it + node + PRP + it + - morphology - appear - node - VBZ - appears - + morphology + appear + node + VBZ + appears + - morphology - to - node - TO - to - + morphology + to + node + TO + to + - morphology - be - node - VB - be + morphology + be + node + VB + be - morphology - ? - node - . - ? + morphology + ? + node + . + ? + + + contrast-focus + node + + + topic + node + + + node - datasource + datasource - morphology - be - node - VBZ - Is - + morphology + be + node + VBZ + Is + - morphology - this - node - DT - this - + morphology + this + node + DT + this + - morphology - example - node - NN - example - + morphology + example + node + NN + example + - morphology - more - node - RBR - more - + morphology + more + node + RBR + more + - morphology - complicated - node - JJ - complicated - + morphology + complicated + node + JJ + complicated + - morphology - than - node - IN - than - + morphology + than + node + IN + than + - morphology - it - node - PRP - it - + morphology + it + node + PRP + it + - morphology - appear - node - VBZ - appears - + morphology + appear + node + VBZ + appears + - morphology - to - node - TO - to - + morphology + to + node + TO + to + - morphology - be - node - VB - be + morphology + be + node + VB + be - morphology - ? - node - . - ? + morphology + ? + node + . + ? 
+ + + contrast-focus + node + + + topic + node + + + node - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + From 7d17c27988ec7845981f23f11cc132ccdc897872 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 3 Jul 2024 13:30:36 +0200 Subject: [PATCH 28/61] Add dominance edge with empty name as well --- src/importer/saltxml.rs | 32 +- src/importer/saltxml/document.rs | 44 +- ...ltxml__tests__read_salt_sample_corpus.snap | 1441 +++++++++++++---- 3 files changed, 1159 insertions(+), 358 deletions(-) diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index 9fc822fd..2f5effb0 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -58,19 +58,21 @@ const XSI_NAMESPACE: &str = "http://www.w3.org/2001/XMLSchema-instance"; #[derive(Debug, Clone, Copy, PartialEq)] enum SaltType { + Annotation, Corpus, + CorpusRelation, Document, + DocumentRelation, + DominanceRelation, ElementId, Feature, - Annotation, - CorpusRelation, - DocumentRelation, - TextualRelation, - SpanningRelation, Layer, - Token, Span, + SpanningRelation, + Structure, TextualDs, + TextualRelation, + Token, Unknown, } @@ -79,19 +81,21 @@ impl SaltType { // Use the xsi:type attribute to determine the type if let Some(type_id) = n.attribute((XSI_NAMESPACE, "type")) { match type_id { - "sCorpusStructure:SCorpus" => SaltType::Corpus, - "sCorpusStructure:SDocument" => SaltType::Document, + "saltCore:SAnnotation" => SaltType::Annotation, "saltCore:SElementId" => SaltType::ElementId, "saltCore:SFeature" => SaltType::Feature, - "saltCore:SAnnotation" => SaltType::Annotation, - "sCorpusStructure:SCorpusRelation" => SaltType::CorpusRelation, - "sCorpusStructure:SCorpusDocumentRelation" => SaltType::DocumentRelation, - "sDocumentStructure:STextualRelation" => SaltType::TextualRelation, - "sDocumentStructure:SSpanningRelation" => SaltType::SpanningRelation, "saltCore:SLayer" => SaltType::Layer, - "sDocumentStructure:SToken" => SaltType::Token, + "sCorpusStructure:SCorpus" => SaltType::Corpus, + "sCorpusStructure:SCorpusDocumentRelation" => SaltType::DocumentRelation, + "sCorpusStructure:SCorpusRelation" => SaltType::CorpusRelation, + "sCorpusStructure:SDocument" => SaltType::Document, + "sDocumentStructure:SDominanceRelation" => SaltType::DominanceRelation, "sDocumentStructure:SSpan" => SaltType::Span, + "sDocumentStructure:SSpanningRelation" => SaltType::SpanningRelation, + "sDocumentStructure:SStructure" => SaltType::Structure, "sDocumentStructure:STextualDS" => SaltType::TextualDs, + "sDocumentStructure:STextualRelation" => SaltType::TextualRelation, + "sDocumentStructure:SToken" => SaltType::Token, _ => SaltType::Unknown, } } else { diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index e40c5899..6895515e 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -62,8 +62,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { mapper.map_textual_datasources(updates)?; mapper.map_tokens(updates)?; - mapper.map_spans(updates)?; - // TODO map SStructure and SDominanceRelation; + mapper.map_non_token_nodes(updates)?; // TODO map SPointingRelation // TODO map STimeline and STimelineRelation @@ -176,6 +175,17 @@ impl<'a, 'input> 
DocumentMapper<'a, 'input> { component_name: component_name.clone(), })?; + if component_type == AnnotationComponentType::Dominance { + // Also add to the special component with the empty name, which includes all dominance edges from all STypes. + updates.add_event(UpdateEvent::AddEdge { + source_node: source_id.clone(), + target_node: target_id.clone(), + layer: ANNIS_NS.to_string(), + component_type: component_type.to_string(), + component_name: "".to_string(), + })?; + } + for label_element in get_annotations(rel) { let anno_ns = label_element .attribute("namespace") @@ -310,12 +320,11 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { Ok(()) } - fn map_spans(&self, updates: &mut GraphUpdate) -> Result<()> { - for span_node in self - .nodes - .iter() - .filter(|n| SaltType::from_node(n) == SaltType::Span) - { + fn map_non_token_nodes(&self, updates: &mut GraphUpdate) -> Result<()> { + for span_node in self.nodes.iter().filter(|n| { + let t = SaltType::from_node(n); + t == SaltType::Span || t == SaltType::Structure + }) { self.map_node(span_node, updates)?; } @@ -327,6 +336,25 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { { self.map_edge(spanning_rel, AnnotationComponentType::Coverage, "", updates)?; } + // Add all dominance relations + for dominance_rel in self + .edges + .iter() + .filter(|rel| SaltType::from_node(rel) == SaltType::DominanceRelation) + { + self.map_edge( + dominance_rel, + AnnotationComponentType::Dominance, + "", + updates, + )?; + self.map_edge( + dominance_rel, + AnnotationComponentType::Dominance, + "edge", + updates, + )?; + } Ok(()) } } diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index 98bae5f3..474248c4 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -7,445 +7,686 @@ expression: actual - - - - - - - + + + + + + + + rootCorpus - corpus + corpus subCorpus1 - corpus + corpus subCorpus2 - corpus + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt - doc1 - corpus + doc1 + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt - doc2 - corpus + doc2 + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt - doc3 - corpus + doc3 + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt - doc4 - corpus + doc4 + corpus - datasource + datasource - morphology - be - node - VBZ - Is - + + morphology + be + node + VBZ + Is - morphology - this - node - DT - this - + + morphology + this + node + DT + this - morphology - example - node - NN - example - + + morphology + example + node + NN + example - morphology - more - node - RBR - more - + + morphology + more + node + RBR + more - morphology - complicated - node - JJ - complicated - + + morphology + complicated + node + JJ + complicated - morphology - than - node - IN - than - + + morphology + than + node + IN + than - morphology - it - node - PRP - it - + + morphology + it + node + PRP + it - morphology - appear - node - VBZ - appears - + + morphology + appear + node + VBZ + appears - morphology - to - node - TO - to - + + morphology + to + node + TO + to - morphology - be - node - VB - be + morphology + be + node + VB + be - morphology - ? - node - . - ? + morphology + ? + node + . + ? 
contrast-focus - node + node topic - node + node + + + ROOT + syntax + node + + + SQ + syntax + node + + + NP + syntax + node + + + ADJP + syntax + node + + + ADJP + syntax + node + + + SBar + syntax + node + + + S + syntax + node + + + NP + syntax + node + + + VP + syntax + node + + + S + syntax + node + + + VP + syntax + node + + + VP + syntax + node - node + node - datasource + datasource - morphology - be - node - VBZ - Is - + + morphology + be + node + VBZ + Is - morphology - this - node - DT - this - + + morphology + this + node + DT + this - morphology - example - node - NN - example - + + morphology + example + node + NN + example - morphology - more - node - RBR - more - + + morphology + more + node + RBR + more - morphology - complicated - node - JJ - complicated - + + morphology + complicated + node + JJ + complicated - morphology - than - node - IN - than - + + morphology + than + node + IN + than - morphology - it - node - PRP - it - + + morphology + it + node + PRP + it - morphology - appear - node - VBZ - appears - + + morphology + appear + node + VBZ + appears - morphology - to - node - TO - to - + + morphology + to + node + TO + to - morphology - be - node - VB - be + morphology + be + node + VB + be - morphology - ? - node - . - ? + morphology + ? + node + . + ? contrast-focus - node + node topic - node + node + + + ROOT + syntax + node + + + SQ + syntax + node + + + NP + syntax + node + + + ADJP + syntax + node + + + ADJP + syntax + node + + + SBar + syntax + node + + + S + syntax + node + + + NP + syntax + node + + + VP + syntax + node + + + S + syntax + node + + + VP + syntax + node + + + VP + syntax + node - node + node - datasource + datasource - morphology - be - node - VBZ - Is - + + morphology + be + node + VBZ + Is - morphology - this - node - DT - this - + + morphology + this + node + DT + this - morphology - example - node - NN - example - + + morphology + example + node + NN + example - morphology - more - node - RBR - more - + + morphology + more + node + RBR + more - morphology - complicated - node - JJ - complicated - + + morphology + complicated + node + JJ + complicated - morphology - than - node - IN - than - + + morphology + than + node + IN + than - morphology - it - node - PRP - it - + + morphology + it + node + PRP + it - morphology - appear - node - VBZ - appears - + + morphology + appear + node + VBZ + appears - morphology - to - node - TO - to - + + morphology + to + node + TO + to - morphology - be - node - VB - be + morphology + be + node + VB + be - morphology - ? - node - . - ? + morphology + ? + node + . + ? 
contrast-focus - node + node topic - node + node + + + ROOT + syntax + node + + + SQ + syntax + node + + + NP + syntax + node + + + ADJP + syntax + node + + + ADJP + syntax + node + + + SBar + syntax + node + + + S + syntax + node + + + NP + syntax + node + + + VP + syntax + node + + + S + syntax + node + + + VP + syntax + node + + + VP + syntax + node - node + node - datasource + datasource - morphology - be - node - VBZ - Is - + + morphology + be + node + VBZ + Is - morphology - this - node - DT - this - + + morphology + this + node + DT + this - morphology - example - node - NN - example - + + morphology + example + node + NN + example - morphology - more - node - RBR - more - + + morphology + more + node + RBR + more - morphology - complicated - node - JJ - complicated - + + morphology + complicated + node + JJ + complicated - morphology - than - node - IN - than - + + morphology + than + node + IN + than - morphology - it - node - PRP - it - + + morphology + it + node + PRP + it - morphology - appear - node - VBZ - appears - + + morphology + appear + node + VBZ + appears - morphology - to - node - TO - to - + + morphology + to + node + TO + to - morphology - be - node - VB - be + morphology + be + node + VB + be - morphology - ? - node - . - ? + morphology + ? + node + . + ? contrast-focus - node + node topic - node + node + + + ROOT + syntax + node + + + SQ + syntax + node + + + NP + syntax + node + + + ADJP + syntax + node + + + ADJP + syntax + node + + + SBar + syntax + node + + + S + syntax + node + + + NP + syntax + node + + + VP + syntax + node + + + S + syntax + node + + + VP + syntax + node + + + VP + syntax + node - node + node @@ -551,97 +792,625 @@ expression: actual - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From eb5b27bcf7f856821d473b0fd189bd38713253af Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Thu, 4 Jul 2024 14:04:28 +0200 Subject: [PATCH 29/61] Do not map "salt::SNAME" feature if this is not a document. 
--- src/importer/saltxml/corpus_structure.rs | 20 +- ...ltxml__tests__read_salt_sample_corpus.snap | 874 +++++++++--------- 2 files changed, 445 insertions(+), 449 deletions(-) diff --git a/src/importer/saltxml/corpus_structure.rs b/src/importer/saltxml/corpus_structure.rs index 0de32de2..ae1248f5 100644 --- a/src/importer/saltxml/corpus_structure.rs +++ b/src/importer/saltxml/corpus_structure.rs @@ -74,16 +74,16 @@ impl SaltCorpusStructureMapper { feature_node.attribute("value").unwrap_or_default(), ); - if salt_type == SaltType::Document - && annos_ns == Some("salt") - && anno_name == "SNAME" - { - updates.add_event(UpdateEvent::AddNodeLabel { - node_name: node_name.to_string(), - anno_ns: ANNIS_NS.to_string(), - anno_name: "doc".to_string(), - anno_value: anno_value.to_string(), - })?; + if annos_ns == Some("salt") && anno_name == "SNAME" { + // Only map this specific feature as document name + if salt_type == SaltType::Document { + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.to_string(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "doc".to_string(), + anno_value: anno_value.to_string(), + })?; + } } else { updates.add_event(UpdateEvent::AddNodeLabel { node_name: node_name.to_string(), diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index 474248c4..a200e5c5 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -6,687 +6,683 @@ expression: actual - - - - - - - - - + + + + + + + + - rootCorpus - corpus + corpus - subCorpus1 - corpus + corpus - subCorpus2 - corpus + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt - doc1 - corpus + doc1 + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt - doc2 - corpus + doc2 + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt - doc3 - corpus + doc3 + corpus file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt - doc4 - corpus + doc4 + corpus - datasource + datasource - - morphology - be - node - VBZ - Is + morphology + be + node + VBZ + Is + - - morphology - this - node - DT - this + morphology + this + node + DT + this + - - morphology - example - node - NN - example + morphology + example + node + NN + example + - - morphology - more - node - RBR - more + morphology + more + node + RBR + more + - - morphology - complicated - node - JJ - complicated + morphology + complicated + node + JJ + complicated + - - morphology - than - node - IN - than + morphology + than + node + IN + than + - - morphology - it - node - PRP - it + morphology + it + node + PRP + it + - - morphology - appear - node - VBZ - appears + morphology + appear + node + VBZ + appears + - - morphology - to - node - TO - to + morphology + to + node + TO + to + - morphology - be - node - VB - be + morphology + be + node + VB + be - morphology - ? - node - . - ? + morphology + ? + node + . + ? 
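The rule enforced by the corpus-structure hunk above can be summarised as follows (a hypothetical predicate, not code from the patch; `SaltType` is the enum defined earlier in this series): a `salt::SNAME` feature only becomes an `annis::doc` label when it sits on an SDocument, and is skipped everywhere else, since corpus and document names are already encoded in the node path.

fn maps_to_annis_doc(salt_type: SaltType, anno_ns: Option<&str>, anno_name: &str) -> bool {
    anno_ns == Some("salt") && anno_name == "SNAME" && salt_type == SaltType::Document
}

Features that do not match the salt::SNAME pair keep being copied with their own namespace and name, as before.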
contrast-focus - node + node topic - node + node - ROOT - syntax - node + ROOT + syntax + node - SQ - syntax - node + SQ + syntax + node - NP - syntax - node + NP + syntax + node - ADJP - syntax - node + ADJP + syntax + node - ADJP - syntax - node + ADJP + syntax + node - SBar - syntax - node + SBar + syntax + node - S - syntax - node + S + syntax + node - NP - syntax - node + NP + syntax + node - VP - syntax - node + VP + syntax + node - S - syntax - node + S + syntax + node - VP - syntax - node + VP + syntax + node - VP - syntax - node + VP + syntax + node - node + node - datasource + datasource - - morphology - be - node - VBZ - Is + morphology + be + node + VBZ + Is + - - morphology - this - node - DT - this + morphology + this + node + DT + this + - - morphology - example - node - NN - example + morphology + example + node + NN + example + - - morphology - more - node - RBR - more + morphology + more + node + RBR + more + - - morphology - complicated - node - JJ - complicated + morphology + complicated + node + JJ + complicated + - - morphology - than - node - IN - than + morphology + than + node + IN + than + - - morphology - it - node - PRP - it + morphology + it + node + PRP + it + - - morphology - appear - node - VBZ - appears + morphology + appear + node + VBZ + appears + - - morphology - to - node - TO - to + morphology + to + node + TO + to + - morphology - be - node - VB - be + morphology + be + node + VB + be - morphology - ? - node - . - ? + morphology + ? + node + . + ? contrast-focus - node + node topic - node + node - ROOT - syntax - node + ROOT + syntax + node - SQ - syntax - node + SQ + syntax + node - NP - syntax - node + NP + syntax + node - ADJP - syntax - node + ADJP + syntax + node - ADJP - syntax - node + ADJP + syntax + node - SBar - syntax - node + SBar + syntax + node - S - syntax - node + S + syntax + node - NP - syntax - node + NP + syntax + node - VP - syntax - node + VP + syntax + node - S - syntax - node + S + syntax + node - VP - syntax - node + VP + syntax + node - VP - syntax - node + VP + syntax + node - node + node - datasource + datasource - - morphology - be - node - VBZ - Is + morphology + be + node + VBZ + Is + - - morphology - this - node - DT - this + morphology + this + node + DT + this + - - morphology - example - node - NN - example + morphology + example + node + NN + example + - - morphology - more - node - RBR - more + morphology + more + node + RBR + more + - - morphology - complicated - node - JJ - complicated + morphology + complicated + node + JJ + complicated + - - morphology - than - node - IN - than + morphology + than + node + IN + than + - - morphology - it - node - PRP - it + morphology + it + node + PRP + it + - - morphology - appear - node - VBZ - appears + morphology + appear + node + VBZ + appears + - - morphology - to - node - TO - to + morphology + to + node + TO + to + - morphology - be - node - VB - be + morphology + be + node + VB + be - morphology - ? - node - . - ? + morphology + ? + node + . + ? 
contrast-focus - node + node topic - node + node - ROOT - syntax - node + ROOT + syntax + node - SQ - syntax - node + SQ + syntax + node - NP - syntax - node + NP + syntax + node - ADJP - syntax - node + ADJP + syntax + node - ADJP - syntax - node + ADJP + syntax + node - SBar - syntax - node + SBar + syntax + node - S - syntax - node + S + syntax + node - NP - syntax - node + NP + syntax + node - VP - syntax - node + VP + syntax + node - S - syntax - node + S + syntax + node - VP - syntax - node + VP + syntax + node - VP - syntax - node + VP + syntax + node - node + node - datasource + datasource - - morphology - be - node - VBZ - Is + morphology + be + node + VBZ + Is + - - morphology - this - node - DT - this + morphology + this + node + DT + this + - - morphology - example - node - NN - example + morphology + example + node + NN + example + - - morphology - more - node - RBR - more + morphology + more + node + RBR + more + - - morphology - complicated - node - JJ - complicated + morphology + complicated + node + JJ + complicated + - - morphology - than - node - IN - than + morphology + than + node + IN + than + - - morphology - it - node - PRP - it + morphology + it + node + PRP + it + - - morphology - appear - node - VBZ - appears + morphology + appear + node + VBZ + appears + - - morphology - to - node - TO - to + morphology + to + node + TO + to + - morphology - be - node - VB - be + morphology + be + node + VB + be - morphology - ? - node - . - ? + morphology + ? + node + . + ? contrast-focus - node + node topic - node + node - ROOT - syntax - node + ROOT + syntax + node - SQ - syntax - node + SQ + syntax + node - NP - syntax - node + NP + syntax + node - ADJP - syntax - node + ADJP + syntax + node - ADJP - syntax - node + ADJP + syntax + node - SBar - syntax - node + SBar + syntax + node - S - syntax - node + S + syntax + node - NP - syntax - node + NP + syntax + node - VP - syntax - node + VP + syntax + node - S - syntax - node + S + syntax + node - VP - syntax - node + VP + syntax + node - VP - syntax - node + VP + syntax + node - node + node From 1be7b97d4ff7a3869436dd7e05b221a4f10677cd Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Thu, 4 Jul 2024 14:45:31 +0200 Subject: [PATCH 30/61] Allow to configure how empty annotation namespaces are handled --- docs/README.md | 10 +- docs/importers/saltxml.md | 10 +- src/importer/saltxml.rs | 24 +- src/importer/saltxml/document.rs | 30 +- ...ltxml__tests__read_salt_sample_corpus.snap | 446 ++++++------------ 5 files changed, 191 insertions(+), 329 deletions(-) diff --git a/docs/README.md b/docs/README.md index b7437b60..4f38cfba 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,5 +1,5 @@ -| Type | Modules | -|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Import formats | [conllu](importers/conllu.md), [exmaralda](importers/exmaralda.md), [graphml](importers/graphml.md), [meta](importers/meta.md), [none](importers/none.md), [opus](importers/opus.md), [path](importers/path.md), [ptb](importers/ptb.md), [relannis](importers/relannis.md), [textgrid](importers/textgrid.md), [toolbox](importers/toolbox.md), 
[treetagger](importers/treetagger.md), [xlsx](importers/xlsx.md), [xml](importers/xml.md) | -| Export formats | [graphml](exporters/graphml.md), [exmaralda](exporters/exmaralda.md), [sequence](exporters/sequence.md), [textgrid](exporters/textgrid.md), [xlsx](exporters/xlsx.md) | -| Graph operations | [check](graph_ops/check.md), [collapse](graph_ops/collapse.md), [enumerate](graph_ops/enumerate.md), [link](graph_ops/link.md), [map](graph_ops/map.md), [revise](graph_ops/revise.md), [chunk](graph_ops/chunk.md), [split](graph_ops/split.md), [none](graph_ops/none.md) | \ No newline at end of file +| Type | Modules | +|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Import formats | [conllu](importers/conllu.md), [exmaralda](importers/exmaralda.md), [graphml](importers/graphml.md), [meta](importers/meta.md), [none](importers/none.md), [opus](importers/opus.md), [path](importers/path.md), [ptb](importers/ptb.md), [relannis](importers/relannis.md), [saltxml](importers/saltxml.md), [textgrid](importers/textgrid.md), [toolbox](importers/toolbox.md), [treetagger](importers/treetagger.md), [xlsx](importers/xlsx.md), [xml](importers/xml.md) | +| Export formats | [graphml](exporters/graphml.md), [exmaralda](exporters/exmaralda.md), [saltxml](exporters/saltxml.md), [sequence](exporters/sequence.md), [textgrid](exporters/textgrid.md), [xlsx](exporters/xlsx.md) | +| Graph operations | [check](graph_ops/check.md), [collapse](graph_ops/collapse.md), [enumerate](graph_ops/enumerate.md), [link](graph_ops/link.md), [map](graph_ops/map.md), [revise](graph_ops/revise.md), [chunk](graph_ops/chunk.md), [split](graph_ops/split.md), [none](graph_ops/none.md) | \ No newline at end of file diff --git a/docs/importers/saltxml.md b/docs/importers/saltxml.md index 0fcc3242..0c09163a 100644 --- a/docs/importers/saltxml.md +++ b/docs/importers/saltxml.md @@ -3,4 +3,12 @@ Imports the SaltXML format used by Pepper (). SaltXML is an XMI serialization of the [Salt model](https://raw.githubusercontent.com/korpling/salt/master/gh-site/doc/salt_modelGuide.pdf). -*No Configuration* +## Configuration + +### missing_anno_ns_from_layer + +If `true`, use the layer name as fallback for the namespace annotations +if none is given. This is consistent with how the ANNIS tree visualizer +handles annotations without any namespace. If `false`, use an empty +string as annotation namespace. + diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index 2f5effb0..77c708cd 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -11,9 +11,23 @@ use super::Importer; /// Imports the SaltXML format used by Pepper (). /// SaltXML is an XMI serialization of the [Salt model](https://raw.githubusercontent.com/korpling/salt/master/gh-site/doc/salt_modelGuide.pdf). 
-#[derive(Default, Deserialize, Documented, DocumentedFields, FieldNamesAsSlice)] +#[derive(Deserialize, Documented, DocumentedFields, FieldNamesAsSlice)] #[serde(default, deny_unknown_fields)] -pub struct ImportSaltXml {} +pub struct ImportSaltXml { + /// If `true`, use the layer name as fallback for the namespace annotations + /// if none is given. This is consistent with how the ANNIS tree visualizer + /// handles annotations without any namespace. If `false`, use an empty + /// string as annotation namespace. + missing_anno_ns_from_layer: bool, +} + +impl Default for ImportSaltXml { + fn default() -> Self { + Self { + missing_anno_ns_from_layer: true, + } + } +} impl Importer for ImportSaltXml { fn import_corpus( @@ -42,7 +56,11 @@ impl Importer for ImportSaltXml { // Get the path from the node name let document_path = input_path.join(relative_document_path); let document_file = std::fs::read_to_string(document_path)?; - DocumentMapper::read_document(&document_file, &document_node_name, &mut updates)?; + DocumentMapper::read_document( + &document_file, + self.missing_anno_ns_from_layer, + &mut updates, + )?; reporter.worked(1)?; } diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 6895515e..93c6556e 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -21,12 +21,13 @@ pub(super) struct DocumentMapper<'a, 'input> { edges: Vec>, layers: Vec>, base_texts: BTreeMap, + missing_anno_ns_from_layer: bool, } impl<'a, 'input> DocumentMapper<'a, 'input> { pub(super) fn read_document( input: &'input str, - _document_node_name: &str, + missing_anno_ns_from_layer: bool, updates: &mut GraphUpdate, ) -> Result<()> { let doc = roxmltree::Document::parse(input)?; @@ -55,6 +56,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { let mut mapper = DocumentMapper { base_texts: BTreeMap::new(), + missing_anno_ns_from_layer, nodes, edges, layers, @@ -101,6 +103,8 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { node_type: "node".to_string(), })?; + let mut fallback_annotation_namespace = "".to_string(); + if let Some(layers_attribute) = n.attribute("layers") { for layer_ref in layers_attribute.split(' ') { let layer_node = resolve_element(layer_ref, "layers", &self.layers) @@ -108,6 +112,13 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { if let Some(SaltObject::Text(layer_name)) = get_feature_by_qname(&layer_node, "salt", "SNAME") { + // Use the edge layer as fallback annotation namespace. This is + // consistent with e.g. the ANNIS Tree Visualizer handles + // annotations without any namespace. + if self.missing_anno_ns_from_layer { + fallback_annotation_namespace = layer_name.clone(); + } + updates.add_event(UpdateEvent::AddNodeLabel { node_name: id.clone(), anno_ns: ANNIS_NS.to_owned(), @@ -121,7 +132,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { for label_node in get_annotations(n) { let anno_ns = label_node .attribute("namespace") - .unwrap_or_default() + .unwrap_or(&fallback_annotation_namespace) .to_string(); let anno_name = label_node .attribute("name") @@ -186,11 +197,18 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { })?; } + let fallback_annotation_namespace = if self.missing_anno_ns_from_layer { + &component_layer + } else { + "" + }; + for label_element in get_annotations(rel) { let anno_ns = label_element .attribute("namespace") - .unwrap_or_default() + .unwrap_or(fallback_annotation_namespace) .to_string(); + let anno_name = label_element .attribute("name") .context("Missing annotation name for edge")? 
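To spell out the fallback rule introduced for `missing_anno_ns_from_layer` (a minimal sketch, not code from the patch; at this point in the series the disabled case still falls back to an empty namespace): an explicit namespace on a label always wins, otherwise the element's layer is used when the option is enabled, and the empty string when it is not.

fn effective_ns<'a>(label_ns: Option<&'a str>, layer: &'a str, missing_ns_from_layer: bool) -> &'a str {
    match label_ns {
        Some(ns) => ns,                         // explicit namespace always wins
        None if missing_ns_from_layer => layer, // e.g. "syntax" or "morphology"
        None => "",                             // option disabled
    }
}

So a `pos` annotation without a namespace on a token in the `morphology` layer would be imported with namespace `morphology` under the default configuration, and with an empty namespace otherwise.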
@@ -342,12 +360,6 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .iter() .filter(|rel| SaltType::from_node(rel) == SaltType::DominanceRelation) { - self.map_edge( - dominance_rel, - AnnotationComponentType::Dominance, - "", - updates, - )?; self.map_edge( dominance_rel, AnnotationComponentType::Dominance, diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index a200e5c5..6de64781 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -6,7 +6,7 @@ expression: actual - + @@ -964,449 +964,273 @@ expression: actual - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + From 6fb8321186e42f1c4876bd24bee77fde054158b5 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Thu, 4 Jul 2024 14:52:01 +0200 Subject: [PATCH 31/61] Use "default_ns" as fallback for annotation namespaces instead of empty string --- docs/importers/saltxml.md | 4 ++-- src/importer/saltxml.rs | 4 ++-- src/importer/saltxml/document.rs | 4 ++-- ...to__importer__saltxml__tests__read_salt_sample_corpus.snap | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/importers/saltxml.md b/docs/importers/saltxml.md index 0c09163a..1e3f139a 100644 --- a/docs/importers/saltxml.md +++ b/docs/importers/saltxml.md @@ -9,6 +9,6 @@ SaltXML is an XMI serialization of the [Salt model](https://raw.githubuserconten If `true`, use the layer name as fallback for the namespace annotations if none is given. This is consistent with how the ANNIS tree visualizer -handles annotations without any namespace. If `false`, use an empty -string as annotation namespace. +handles annotations without any namespace. If `false`, use the +`default_ns` namespace as fallback. diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index 77c708cd..adef63ba 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -16,8 +16,8 @@ use super::Importer; pub struct ImportSaltXml { /// If `true`, use the layer name as fallback for the namespace annotations /// if none is given. This is consistent with how the ANNIS tree visualizer - /// handles annotations without any namespace. If `false`, use an empty - /// string as annotation namespace. + /// handles annotations without any namespace. If `false`, use the + /// `default_ns` namespace as fallback. 
missing_anno_ns_from_layer: bool, } diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 93c6556e..2d8f3840 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -103,7 +103,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { node_type: "node".to_string(), })?; - let mut fallback_annotation_namespace = "".to_string(); + let mut fallback_annotation_namespace = "default_ns".to_string(); if let Some(layers_attribute) = n.attribute("layers") { for layer_ref in layers_attribute.split(' ') { @@ -200,7 +200,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { let fallback_annotation_namespace = if self.missing_anno_ns_from_layer { &component_layer } else { - "" + "default_ns" }; for label_element in get_annotations(rel) { diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index 6de64781..7ac1d503 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -4,7 +4,7 @@ expression: actual --- - + From d3ad396f391a0fec4ae9b1b5c4d3bbd809e6ed32 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Thu, 4 Jul 2024 15:00:48 +0200 Subject: [PATCH 32/61] Do not hide token in grid visualizer if there is only one tokenisation --- src/exporter/graphml.rs | 51 ++++++++++--------- .../import/graphml/single_sentence.graphml | 2 +- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/src/exporter/graphml.rs b/src/exporter/graphml.rs index 2f28749d..c47e712e 100644 --- a/src/exporter/graphml.rs +++ b/src/exporter/graphml.rs @@ -350,35 +350,36 @@ fn node_annos_vis(graph: &AnnotationGraph) -> Result 1; - let ordered_nodes_are_identical = { - more_than_one_ordering && { - let ordering_components = - graph.get_all_components(Some(AnnotationComponentType::Ordering), None); - let node_sets = ordering_components - .iter() - .map(|c| { - if let Some(strge) = graph.get_graphstorage(c) { - strge - .source_nodes() - .filter_map(|r| if let Ok(n) = r { Some(n) } else { None }) - .collect::>() - } else { - BTreeSet::default() - } - }) - .collect_vec(); - let mut all_same = true; - //for i in 1..node_sets.len() - for (a, b) in node_sets.into_iter().tuple_windows() { - all_same &= matches!(a.cmp(&b), Ordering::Equal); - } - all_same + let ordered_components_contain_identical_nodes = if order_names.len() > 1 { + let ordering_components = + graph.get_all_components(Some(AnnotationComponentType::Ordering), None); + let node_sets = ordering_components + .iter() + .map(|c| { + if let Some(strge) = graph.get_graphstorage(c) { + strge + .source_nodes() + .filter_map(|r| if let Ok(n) = r { Some(n) } else { None }) + .collect::>() + } else { + BTreeSet::default() + } + }) + .collect_vec(); + let mut all_same = true; + //for i in 1..node_sets.len() + for (a, b) in node_sets.into_iter().tuple_windows() { + all_same &= matches!(a.cmp(&b), Ordering::Equal); } + all_same + } else { + // There is only one ordering component + true }; + mappings.insert( "hide_tok".to_string(), - (!ordered_nodes_are_identical).to_string(), + (!ordered_components_contain_identical_nodes).to_string(), ); mappings.insert("show_ns".to_string(), "false".to_string()); Ok(Visualizer { diff --git a/tests/data/import/graphml/single_sentence.graphml b/tests/data/import/graphml/single_sentence.graphml 
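Note: the reworked `hide_tok` check relies on the fact that comparing adjacent node sets pairwise is enough to establish that all of them are identical. A stripped-down sketch of that idea (the function name and the `u64` node IDs are illustrative, not taken from the exporter):

    use std::collections::BTreeSet;
    use itertools::Itertools;

    // All ordering components cover the same nodes iff each adjacent pair of
    // node sets is equal; with zero or one set the windows iterator is empty
    // and the check is trivially true.
    fn all_node_sets_identical(node_sets: &[BTreeSet<u64>]) -> bool {
        node_sets.iter().tuple_windows().all(|(a, b)| a == b)
    }

With a single ordering component the result is trivially `true`, so `hide_tok` ends up as `false` and the token row stays visible in the grid visualizer.
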
index 2f6d512e..ec30cf52 100644 --- a/tests/data/import/graphml/single_sentence.graphml +++ b/tests/data/import/graphml/single_sentence.graphml @@ -39,7 +39,7 @@ visibility = "hidden" [visualizers.mappings] annos = "/default_ns::pos/,/syntax::cat/" escape_html = "false" -hide_tok = "true" +hide_tok = "false" show_ns = "false" ]]> From 05ba5a02f7fa6744a938bd2bd67ef2b767d4aaa8 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 5 Jul 2024 08:50:10 +0200 Subject: [PATCH 33/61] Fix clippy issue --- src/importer/saltxml/document.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 2d8f3840..6574b7e8 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -116,7 +116,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { // consistent with e.g. the ANNIS Tree Visualizer handles // annotations without any namespace. if self.missing_anno_ns_from_layer { - fallback_annotation_namespace = layer_name.clone(); + fallback_annotation_namespace.clone_from(&layer_name); } updates.add_event(UpdateEvent::AddNodeLabel { From 688f616dd227d85e9377043f47a80553d7e9839b Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 5 Jul 2024 08:55:21 +0200 Subject: [PATCH 34/61] Map pointing relations --- src/importer/saltxml.rs | 2 + src/importer/saltxml/document.rs | 15 ++- ...ltxml__tests__read_salt_sample_corpus.snap | 100 ++++++++++-------- 3 files changed, 70 insertions(+), 47 deletions(-) diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index adef63ba..19ac3f08 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -85,6 +85,7 @@ enum SaltType { ElementId, Feature, Layer, + PointingRelation, Span, SpanningRelation, Structure, @@ -108,6 +109,7 @@ impl SaltType { "sCorpusStructure:SCorpusRelation" => SaltType::CorpusRelation, "sCorpusStructure:SDocument" => SaltType::Document, "sDocumentStructure:SDominanceRelation" => SaltType::DominanceRelation, + "sDocumentStructure:SPointingRelation" => SaltType::PointingRelation, "sDocumentStructure:SSpan" => SaltType::Span, "sDocumentStructure:SSpanningRelation" => SaltType::SpanningRelation, "sDocumentStructure:SStructure" => SaltType::Structure, diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 6574b7e8..5442a051 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -65,7 +65,6 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { mapper.map_tokens(updates)?; mapper.map_non_token_nodes(updates)?; - // TODO map SPointingRelation // TODO map STimeline and STimelineRelation // TODO map SOrderRelation for segmentation nodes @@ -367,6 +366,20 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { updates, )?; } + + // Add all pointing relations + for pointing_rel in self + .edges + .iter() + .filter(|rel| SaltType::from_node(rel) == SaltType::PointingRelation) + { + self.map_edge( + pointing_rel, + AnnotationComponentType::Pointing, + "edge", + updates, + )?; + } Ok(()) } } diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index 7ac1d503..c43e9384 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -1140,97 +1140,105 @@ expression: actual - + - + - + - + - + - + 
- + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + From 4b5a5a7fb45df7ba7f6608d43967d6b8cb8f5a2d Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 5 Jul 2024 09:31:07 +0200 Subject: [PATCH 35/61] Remove SaltXML exporter for now --- src/exporter/mod.rs | 1 - src/exporter/saltxml.rs | 27 --------------------------- src/lib.rs | 11 ++--------- 3 files changed, 2 insertions(+), 37 deletions(-) delete mode 100644 src/exporter/saltxml.rs diff --git a/src/exporter/mod.rs b/src/exporter/mod.rs index 31ffb953..5e7adc95 100644 --- a/src/exporter/mod.rs +++ b/src/exporter/mod.rs @@ -2,7 +2,6 @@ pub mod exmaralda; pub mod graphml; -pub mod saltxml; pub mod sequence; pub mod textgrid; pub mod xlsx; diff --git a/src/exporter/saltxml.rs b/src/exporter/saltxml.rs deleted file mode 100644 index 1b0e1bb1..00000000 --- a/src/exporter/saltxml.rs +++ /dev/null @@ -1,27 +0,0 @@ -use documented::{Documented, DocumentedFields}; -use serde::Deserialize; -use struct_field_names_as_array::FieldNamesAsSlice; - -use super::Exporter; - -/// Exports Excel Spreadsheets where each line is a token, the other columns are -/// spans and merged cells can be used for spans that cover more than one token. -#[derive(Default, Deserialize, Documented, DocumentedFields, FieldNamesAsSlice)] -#[serde(default, deny_unknown_fields)] -pub struct SaltXmlExporter {} - -impl Exporter for SaltXmlExporter { - fn export_corpus( - &self, - _graph: &graphannis::AnnotationGraph, - _output_path: &std::path::Path, - _step_id: crate::StepID, - _tx: Option, - ) -> Result<(), Box> { - todo!() - } - - fn file_extension(&self) -> &str { - todo!() - } -} diff --git a/src/lib.rs b/src/lib.rs index 5baff5c4..5f47d928 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,8 +17,8 @@ use std::{fmt::Display, path::PathBuf}; use documented::{Documented, DocumentedFields}; use error::Result; use exporter::{ - exmaralda::ExportExmaralda, graphml::GraphMLExporter, saltxml::SaltXmlExporter, - sequence::ExportSequence, textgrid::ExportTextGrid, xlsx::XlsxExporter, Exporter, + exmaralda::ExportExmaralda, graphml::GraphMLExporter, sequence::ExportSequence, + textgrid::ExportTextGrid, xlsx::XlsxExporter, Exporter, }; use importer::{ conllu::ImportCoNLLU, exmaralda::ImportEXMARaLDA, file_nodes::CreateFileNodes, @@ -49,7 +49,6 @@ pub struct ModuleConfiguration { pub enum WriteAs { GraphML(#[serde(default)] GraphMLExporter), // the purpose of serde(default) here is, that an empty `[export.config]` table can be omited EXMARaLDA(#[serde(default)] ExportExmaralda), - SaltXml(#[serde(default)] SaltXmlExporter), Sequence(#[serde(default)] ExportSequence), TextGrid(ExportTextGrid), // do not use default, as all attributes have their individual defaults Xlsx(#[serde(default)] XlsxExporter), @@ -67,7 +66,6 @@ impl WriteAs { match self { WriteAs::GraphML(m) => m, WriteAs::EXMARaLDA(m) => m, - WriteAs::SaltXml(m) => m, WriteAs::Sequence(m) => m, WriteAs::TextGrid(m) => m, WriteAs::Xlsx(m) => m, @@ -80,7 +78,6 @@ impl WriteAsDiscriminants { match self { WriteAsDiscriminants::GraphML => GraphMLExporter::DOCS, WriteAsDiscriminants::EXMARaLDA => ExportExmaralda::DOCS, - WriteAsDiscriminants::SaltXml => SaltXmlExporter::DOCS, WriteAsDiscriminants::Sequence => ExportSequence::DOCS, WriteAsDiscriminants::TextGrid => ExportTextGrid::DOCS, WriteAsDiscriminants::Xlsx => XlsxExporter::DOCS, @@ -98,10 +95,6 @@ impl WriteAsDiscriminants { 
ExportExmaralda::FIELD_NAMES_AS_SLICE, ExportExmaralda::FIELD_DOCS, ), - WriteAsDiscriminants::SaltXml => ( - SaltXmlExporter::FIELD_NAMES_AS_SLICE, - SaltXmlExporter::FIELD_DOCS, - ), WriteAsDiscriminants::Sequence => ( ExportSequence::FIELD_NAMES_AS_SLICE, ExportSequence::FIELD_DOCS, From 9232fdc450ac9b53fedcd31fc70cd057cedb19f5 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 5 Jul 2024 09:41:50 +0200 Subject: [PATCH 36/61] Add example Salt corpus with timeline --- src/importer/saltxml.rs | 2 + .../dialog.demo/dialog.demo/dialog.demo.salt | 3278 +++++++++++++++++ .../dialog.demo/dialog.demo/dialog.demo.webm | 0 .../import/salt/dialog.demo/saltProject.salt | 34 + 4 files changed, 3314 insertions(+) create mode 100644 tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.salt create mode 100644 tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.webm create mode 100644 tests/data/import/salt/dialog.demo/saltProject.salt diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index 19ac3f08..64de4a34 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -91,6 +91,7 @@ enum SaltType { Structure, TextualDs, TextualRelation, + Timeline, Token, Unknown, } @@ -115,6 +116,7 @@ impl SaltType { "sDocumentStructure:SStructure" => SaltType::Structure, "sDocumentStructure:STextualDS" => SaltType::TextualDs, "sDocumentStructure:STextualRelation" => SaltType::TextualRelation, + "sDocumentStructure:STimeline" => SaltType::Timeline, "sDocumentStructure:SToken" => SaltType::Token, _ => SaltType::Unknown, } diff --git a/tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.salt b/tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.salt new file mode 100644 index 00000000..31d70fb3 --- /dev/null +++ b/tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.salt @@ -0,0 +1,3278 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.webm b/tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.webm new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/import/salt/dialog.demo/saltProject.salt b/tests/data/import/salt/dialog.demo/saltProject.salt new file mode 100644 index 00000000..ebc53c08 --- /dev/null +++ b/tests/data/import/salt/dialog.demo/saltProject.salt @@ -0,0 +1,34 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file From faccdd021420790deb278012a8773c56d0bfb4a9 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 5 Jul 2024 10:10:41 +0200 Subject: [PATCH 37/61] Use code points as reference and not byte positions --- src/importer/saltxml/document.rs | 8 +++++--- tests/snapshots/cli__list_modules.snap | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 5442a051..d2b0424f 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -278,6 +278,8 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .base_texts .get(&datasource_id) .with_context(|| format!("Missing base text for token {token_id}"))?; + // Our indices are refering to characters not bytes + let matching_base_text = matching_base_text.chars().collect_vec(); let start = get_feature_by_qname(&text_rel, "salt", "SSTART").context("Missing start value")?; let end = @@ -290,7 +292,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { node_name: token_id.clone(), anno_ns: ANNIS_NS.to_string(), anno_name: "tok".to_string(), - anno_value: covered_text.to_string(), + anno_value: covered_text.iter().collect(), })?; // Get the whitespace before the first token @@ -300,7 +302,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { node_name: token_id.clone(), anno_ns: ANNIS_NS.to_string(), anno_name: "tok-whitespace-before".to_string(), - anno_value: whitespace.to_string(), + anno_value: whitespace.iter().collect(), })?; } @@ -317,7 +319,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { node_name: token_id.clone(), anno_ns: ANNIS_NS.to_string(), anno_name: "tok-whitespace-after".to_string(), - anno_value: whitespace.to_string(), + anno_value: 
whitespace.iter().collect(), })?; } } diff --git a/tests/snapshots/cli__list_modules.snap b/tests/snapshots/cli__list_modules.snap index 783bec3c..ca0e8ff5 100644 --- a/tests/snapshots/cli__list_modules.snap +++ b/tests/snapshots/cli__list_modules.snap @@ -5,7 +5,7 @@ expression: output | Type | Modules | |------------------|----------------------------------------------------------------------------------------------------------------------| | Import formats | conllu, exmaralda, graphml, meta, none, opus, path, ptb, relannis, saltxml, textgrid, toolbox, treetagger, xlsx, xml | -| Export formats | graphml, exmaralda, saltxml, sequence, textgrid, xlsx | +| Export formats | graphml, exmaralda, sequence, textgrid, xlsx | | Graph operations | check, collapse, enumerate, link, map, revise, chunk, split, none | Use `annatto info ` to get more information about one of the formats or graph operations. From c9bbe89303a121f74552640138166ffe44417379 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 5 Jul 2024 12:42:57 +0200 Subject: [PATCH 38/61] Add PartOf edges for all created nodes and start mapping a timeline --- src/importer/saltxml/document.rs | 89 +- ...ltxml__tests__read_salt_sample_corpus.snap | 208 ++ ...ltxml__tests__read_salt_with_timeline.snap | 2418 +++++++++++++++++ src/importer/saltxml/tests.rs | 12 + 4 files changed, 2719 insertions(+), 8 deletions(-) create mode 100644 src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index d2b0424f..4e7bc6c5 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -61,18 +61,74 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { edges, layers, }; + + let document_node_name = + get_element_id(&doc.root_element()).context("Missing document ID")?; + + let timeline = mapper + .nodes + .iter() + .filter(|n| SaltType::from_node(n) == SaltType::Timeline) + .copied() + .next(); + mapper.map_textual_datasources(updates)?; - mapper.map_tokens(updates)?; + if let Some(timeline) = timeline { + mapper.map_timeline(&timeline, &document_node_name, updates)?; + } - mapper.map_non_token_nodes(updates)?; + mapper.map_tokens(&document_node_name, updates)?; + mapper.map_non_token_nodes(&document_node_name, updates)?; - // TODO map STimeline and STimelineRelation // TODO map SOrderRelation for segmentation nodes // TODO map SAudioDS and SAudioRelation Ok(()) } + fn map_timeline( + &self, + timeline: &Node, + document_node_name: &str, + updates: &mut GraphUpdate, + ) -> Result<()> { + let number_of_tlis = get_feature_by_qname(timeline, "saltCommon", "SDATA") + .context("Missing SDATA attribute for timeline.")?; + if let SaltObject::Integer(number_of_tlis) = number_of_tlis { + let mut previous_tli = None; + for i in 0..number_of_tlis { + let tli_node_name = format!("{document_node_name}/tli{i}"); + updates.add_event(UpdateEvent::AddNode { + node_name: tli_node_name.clone(), + node_type: "node".to_string(), + })?; + + updates.add_event(UpdateEvent::AddEdge { + source_node: tli_node_name.clone(), + target_node: document_node_name.to_string(), + layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::PartOf.to_string(), + component_name: "".to_string(), + })?; + + if let Some(previous_tli) = previous_tli { + updates.add_event(UpdateEvent::AddEdge { + source_node: previous_tli, + target_node: tli_node_name.clone(), + layer: ANNIS_NS.to_string(), + component_type: 
AnnotationComponentType::Ordering.to_string(), + component_name: "".to_string(), + })?; + } + + previous_tli = Some(tli_node_name); + } + } else { + bail!("SDATA attribute for timeline is not a number.") + } + Ok(()) + } + fn map_textual_datasources(&mut self, updates: &mut GraphUpdate) -> Result<()> { for text_node in self .nodes @@ -95,13 +151,26 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { Ok(()) } - fn map_node(&self, n: &Node, updates: &mut GraphUpdate) -> Result<()> { + fn map_node( + &self, + n: &Node, + document_node_name: &str, + updates: &mut GraphUpdate, + ) -> Result<()> { let id = get_element_id(n).context("Missing element ID for node")?; updates.add_event(UpdateEvent::AddNode { node_name: id.clone(), node_type: "node".to_string(), })?; + updates.add_event(UpdateEvent::AddEdge { + source_node: id.clone(), + target_node: document_node_name.to_string(), + layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::PartOf.to_string(), + component_name: "".to_string(), + })?; + let mut fallback_annotation_namespace = "default_ns".to_string(); if let Some(layers_attribute) = n.attribute("layers") { @@ -228,14 +297,14 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { Ok(()) } - fn map_tokens(&self, updates: &mut GraphUpdate) -> Result<()> { + fn map_tokens(&self, document_node_name: &str, updates: &mut GraphUpdate) -> Result<()> { // Map the token nodes in the same order as in the SaltXML file for token_node in self .nodes .iter() .filter(|n| SaltType::from_node(n) == SaltType::Token) { - self.map_node(token_node, updates)?; + self.map_node(token_node, document_node_name, updates)?; } // Order textual relations by their start offset, so we iterate in the @@ -339,12 +408,16 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { Ok(()) } - fn map_non_token_nodes(&self, updates: &mut GraphUpdate) -> Result<()> { + fn map_non_token_nodes( + &self, + document_node_name: &str, + updates: &mut GraphUpdate, + ) -> Result<()> { for span_node in self.nodes.iter().filter(|n| { let t = SaltType::from_node(n); t == SaltType::Span || t == SaltType::Structure }) { - self.map_node(span_node, updates)?; + self.map_node(span_node, document_node_name, updates)?; } // Connect all spans with the token using the spanning relations diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index c43e9384..1e5b2718 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -1240,5 +1240,213 @@ expression: actual + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap new file mode 100644 index 00000000..bf1b66fe --- /dev/null +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap @@ -0,0 +1,2418 @@ +--- 
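Note: the switch to code points (PATCH 37/61 above) matters because the importer interprets Salt's SSTART/SEND offsets as character positions, while indexing a Rust `&str` counts bytes; collecting the base text into a `Vec<char>` first keeps multi-byte characters intact. A self-contained sketch of the difference (the example string is made up):

    // 'Ä' is one code point but two bytes in UTF-8.
    let text = "Äh ja";
    let chars: Vec<char> = text.chars().collect();

    // Character-based slice, as done for SSTART=0, SEND=2:
    let covered: String = chars[0..2].iter().collect();
    assert_eq!(covered, "Äh");

    // The same numbers used as byte offsets only reach the 'Ä'.
    assert_eq!(text.get(0..2), Some("Ä"));
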
+source: src/importer/saltxml/tests.rs +expression: actual +--- + + + + + + + + + + + + + + corpus + + + + file:/tmp/pepper_thomas/workspace/rdr3o1mf/dialog.demo5661470868672726454.salt + dialog.demo + corpus + + + datasource + + + datasource + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + + + node + pass + + + node + auf + + + node + + + node + + + node + hast + + + node + jetz + + + node + wohl + + + node + hier + + + node + auch + + + node + so + + + node + ne + + + node + + + node + wie + + + node + ich + + + node + + + node + ich + + + node + hab + + + node + ne + + + node + + + node + + + node + + + node + nich + + + node + eine + + + node + genau + + + node + + + node + + + node + + + node + dir + + + node + jetzt + + + node + erklärn + + + node + wie + + + node + du + + + node + vom + + + node + Start + + + node + zum + + + node + Ziel + + + node + kommst + + + node + so + + + node + wie + + + node + meine + + + node + Linie + + + node + geht + + + node + so + + + node + also + + + node + du + + + node + hast + + + node + n + + + node + Stift + + + node + okay + + + node + aso + + + node + du + + + node + musst + + + node + jetzt + + + node + vom + + + node + Startpunkt + + + node + äh + + + node + waagerecht + + + node + Richtung + + + node + also + + + node + zu + + + node + dem + + + node + ersten + + + node + Bild + + + node + erstmal + + + node + und + + + node + zum + + + node + Rad + + + node + äh + + + node + ((lacht)) + + + node + fang + + + node + einfach + + + node + ma + + + node + an + + + node + ((lacht)) + + + node + nee + + + node + ich + + + node + hab + + + node + gar + + + node + keine + + + node + Linien + + + node + ich + + + node + hab + + + node + Start + + + node + und + + + node + Ziel + + + node + genau + + + node + jap + + + node + mhm + + + node + zum + + + node + (?) 
+ + + node + Schornsteinfeger + + + node + naja + + + node + pass + + + node + auf + + + node + also + + + node + du + + + node + hast + + + node + jetzt + + + node + wohl + + + node + hier + + + node + auch + + + node + so + + + node + eine + + + node + Karte + + + node + wie + + + node + ich + + + node + bloß + + + node + ich + + + node + habe + + + node + eine + + + node + Linie + + + node + und + + + node + du + + + node + nicht + + + node + ne + + + node + genau + + + node + und + + + node + ich + + + node + muss + + + node + dir + + + node + jetzt + + + node + erklären + + + node + wie + + + node + du + + + node + vom + + + node + Start + + + node + zum + + + node + Ziel + + + node + kommst + + + node + so + + + node + wie + + + node + meine + + + node + Linie + + + node + so + + + node + also + + + node + du + + + node + hast + + + node + einen + + + node + Stift + + + node + okay + + + node + also + + + node + du + + + node + musst + + + node + jetzt + + + node + vom + + + node + Startpunkt + + + node + äh + + + node + waagerecht + + + node + Richtung + + + node + also + + + node + zu + + + node + dem + + + node + ersten + + + node + Bild + + + node + erstmal + + + node + und + + + node + zum + + + node + Rad + + + node + naja pass auf also du hast jetzt wohl hier auch so ne Karte wie ich bloß ich hab ne Linie und du nich ne + + + node + genau + + + node + und ich muss dir jetzt erklären wie du vom Start zum Ziel kommst so wie meine Linie geht + + + node + so + + + node + also hast n Stift + + + node + okay also du musst jetzt vom Startpunkt äh waagerecht Richtung also zu dem ersten Bild erstmal + + + node + und + + + node + zum Rad + + + node + äh + + + node + fang + + + node + einfach + + + node + mal + + + node + an + + + node + nee + + + node + ich + + + node + habe + + + node + gar + + + node + keine + + + node + Linien + + + node + ich + + + node + habe + + + node + Start + + + node + und + + + node + Ziel + + + node + genau + + + node + jap + + + node + mhm + + + node + zum + + + node + (?) + + + node + Schornsteinfeger + + + node + äh fang einfach ma an + + + node + nee ich hab gar keine Linie ich hab Start und Ziel + + + node + genau + + + node + jap + + + node + zum (?) 
Schornsteinfeger + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/importer/saltxml/tests.rs b/src/importer/saltxml/tests.rs index b78b896d..34a5134b 100644 --- a/src/importer/saltxml/tests.rs +++ b/src/importer/saltxml/tests.rs @@ -16,3 +16,15 @@ fn read_salt_sample_corpus() { .unwrap(); assert_snapshot!(actual); } + +#[test] +fn read_salt_with_timeline() { + let importer = ImportSaltXml::default(); + let actual = test_util::import_as_graphml_string( + importer, + Path::new("tests/data/import/salt/dialog.demo"), + None, + ) + .unwrap(); + assert_snapshot!(actual); +} From bc5a8027ff36f25952c4bd504e9ffb9f70d7286d Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 5 Jul 2024 13:31:33 +0200 Subject: [PATCH 39/61] Consider the textual DS name when sorting the token relations --- src/importer/saltxml/document.rs | 53 ++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git 
a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 4e7bc6c5..e9fc6cd0 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -16,6 +16,12 @@ use super::{ get_annotations, get_element_id, get_feature_by_qname, resolve_element, SaltObject, SaltType, }; +#[derive(Eq, PartialEq, PartialOrd, Ord, Hash, Clone, Debug)] +pub struct TextProperty { + segmentation: String, + val: i64, +} + pub(super) struct DocumentMapper<'a, 'input> { nodes: Vec>, edges: Vec>, @@ -77,7 +83,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { mapper.map_timeline(&timeline, &document_node_name, updates)?; } - mapper.map_tokens(&document_node_name, updates)?; + mapper.map_tokens(&document_node_name, timeline.as_ref(), updates)?; mapper.map_non_token_nodes(&document_node_name, updates)?; // TODO map SOrderRelation for segmentation nodes @@ -297,7 +303,12 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { Ok(()) } - fn map_tokens(&self, document_node_name: &str, updates: &mut GraphUpdate) -> Result<()> { + fn map_tokens( + &self, + document_node_name: &str, + _timeline: Option<&Node>, + updates: &mut GraphUpdate, + ) -> Result<()> { // Map the token nodes in the same order as in the SaltXML file for token_node in self .nodes @@ -309,37 +320,53 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { // Order textual relations by their start offset, so we iterate in the // actual order of the tokens. - let sorted_text_rels: BTreeMap = self + let sorted_text_rels: BTreeMap = self .edges .iter() .filter(|n| SaltType::from_node(n) == SaltType::TextualRelation) .map(|text_rel| { let start = get_feature_by_qname(text_rel, "salt", "SSTART").unwrap_or(SaltObject::Null); - if let SaltObject::Integer(start) = start { - (start, *text_rel) + let referenced_text_node = resolve_element("target", "nodes", &self.nodes) + .and_then(|n| get_feature_by_qname(&n, "salt", "SNAME")) + .map(|o| o.to_string()) + .unwrap_or_default(); + let val = if let SaltObject::Integer(start) = start { + start } else { - (-1, *text_rel) - } + -1 + }; + let prop = TextProperty { + segmentation: referenced_text_node, + val, + }; + (prop, *text_rel) }) .collect(); // Connect the token to the texts by the textual relations - let mut previous_token = None; + let mut previous_token: Option<(TextProperty, String)> = None; let mut sorted_text_rels = sorted_text_rels.into_iter().peekable(); - while let Some((_, text_rel)) = sorted_text_rels.next() { + while let Some((prop, text_rel)) = sorted_text_rels.next() { + if let Some(p) = &previous_token { + // If the segmentation changes, there is no previous token + if p.0.segmentation != prop.segmentation { + previous_token = None; + } + } + let source_att_val = text_rel.attribute("source").unwrap_or_default(); let token = resolve_element(source_att_val, "nodes", &self.nodes).with_context(|| { format!("Textual relation source \"{source_att_val}\" could not be resolved") })?; + let token_id = get_element_id(&token).context("Missing ID for token")?; let target_att_val = text_rel.attribute("target").unwrap_or_default(); let datasource = resolve_element(target_att_val, "nodes", &self.nodes).with_context(|| { format!("Textual relation target \"{target_att_val}\" could not be resolved") })?; - let token_id = get_element_id(&token).context("Missing ID for token")?; let datasource_id = get_element_id(&datasource).context("Missing ID for token")?; // Get the string for this token @@ -378,7 +405,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { // Add whitespace after this token let 
next_token_offset = sorted_text_rels .peek() - .map(|(offset, _)| *offset) + .map(|(prop, _)| prop.val) .unwrap_or_else(|| matching_base_text.len().try_into().unwrap_or(i64::MAX)); let next_token_offset = usize::try_from(next_token_offset).unwrap_or(0); @@ -395,14 +422,14 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { // Add ordering edges between the tokens for the base token layer if let Some(previous_token) = previous_token { updates.add_event(UpdateEvent::AddEdge { - source_node: previous_token, + source_node: previous_token.1.clone(), target_node: token_id.clone(), layer: ANNIS_NS.to_string(), component_type: AnnotationComponentType::Ordering.to_string(), component_name: "".to_string(), })?; } - previous_token = Some(token_id); + previous_token = Some((prop, token_id)); } Ok(()) From e24b11d8a81ce06a50359fbd5898c2c7b9850352 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 5 Jul 2024 13:46:48 +0200 Subject: [PATCH 40/61] Use the segmentation name for the ordering component if there is a timeline --- src/importer/saltxml/document.rs | 29 +- ...ltxml__tests__read_salt_with_timeline.snap | 1191 +++++++++-------- 2 files changed, 630 insertions(+), 590 deletions(-) diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index e9fc6cd0..b9195c41 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -103,7 +103,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { if let SaltObject::Integer(number_of_tlis) = number_of_tlis { let mut previous_tli = None; for i in 0..number_of_tlis { - let tli_node_name = format!("{document_node_name}/tli{i}"); + let tli_node_name = format!("{document_node_name}#tli{i}"); updates.add_event(UpdateEvent::AddNode { node_name: tli_node_name.clone(), node_type: "node".to_string(), @@ -306,7 +306,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { fn map_tokens( &self, document_node_name: &str, - _timeline: Option<&Node>, + timeline: Option<&Node>, updates: &mut GraphUpdate, ) -> Result<()> { // Map the token nodes in the same order as in the SaltXML file @@ -327,10 +327,14 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .map(|text_rel| { let start = get_feature_by_qname(text_rel, "salt", "SSTART").unwrap_or(SaltObject::Null); - let referenced_text_node = resolve_element("target", "nodes", &self.nodes) - .and_then(|n| get_feature_by_qname(&n, "salt", "SNAME")) - .map(|o| o.to_string()) - .unwrap_or_default(); + let referenced_text_node = resolve_element( + text_rel.attribute("target").unwrap_or_default(), + "nodes", + &self.nodes, + ) + .and_then(|n| get_feature_by_qname(&n, "salt", "SNAME")) + .map(|o| o.to_string()) + .unwrap_or_default(); let val = if let SaltObject::Integer(start) = start { start } else { @@ -347,10 +351,10 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { // Connect the token to the texts by the textual relations let mut previous_token: Option<(TextProperty, String)> = None; let mut sorted_text_rels = sorted_text_rels.into_iter().peekable(); - while let Some((prop, text_rel)) = sorted_text_rels.next() { + while let Some((text_prop, text_rel)) = sorted_text_rels.next() { if let Some(p) = &previous_token { // If the segmentation changes, there is no previous token - if p.0.segmentation != prop.segmentation { + if p.0.segmentation != text_prop.segmentation { previous_token = None; } } @@ -421,15 +425,20 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { } // Add ordering edges between the tokens for the base token layer if let Some(previous_token) = previous_token { + let component_name 
= if timeline.is_some() { + text_prop.segmentation.clone() + } else { + "".to_string() + }; updates.add_event(UpdateEvent::AddEdge { source_node: previous_token.1.clone(), target_node: token_id.clone(), layer: ANNIS_NS.to_string(), component_type: AnnotationComponentType::Ordering.to_string(), - component_name: "".to_string(), + component_name, })?; } - previous_token = Some((prop, token_id)); + previous_token = Some((text_prop, token_id)); } Ok(()) diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap index bf1b66fe..f2c71b31 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap @@ -29,314 +29,315 @@ expression: actual datasource - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node node + naja node @@ -348,9 +349,11 @@ expression: actual node + also node + du node @@ -382,6 +385,7 @@ expression: actual node + Karte node @@ -393,6 +397,7 @@ expression: actual node + bloß node @@ -408,12 +413,15 @@ expression: actual node + Linie node + und node + du node @@ -429,12 +437,15 @@ expression: actual node + und node + ich node + muss node @@ -1458,961 +1469,981 @@ expression: actual - + + + + + + + + + + + + + + + + + + + + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - 
+ - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + From 8823e7ead5596beafc7bfae0cd5648ebe1025803 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 5 Jul 2024 14:09:33 +0200 Subject: [PATCH 41/61] Map timeline relations as coverage between segmentation nodes and the timeline tokens --- src/importer/saltxml.rs | 2 + src/importer/saltxml/document.rs | 37 +- ...ltxml__tests__read_salt_with_timeline.snap | 2102 +++++++++-------- 3 files changed, 1185 insertions(+), 956 deletions(-) diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index 64de4a34..c184151f 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -92,6 +92,7 @@ enum SaltType { TextualDs, TextualRelation, Timeline, + TimelineRelation, Token, Unknown, } @@ -117,6 +118,7 @@ impl SaltType { "sDocumentStructure:STextualDS" => SaltType::TextualDs, "sDocumentStructure:STextualRelation" => SaltType::TextualRelation, "sDocumentStructure:STimeline" => SaltType::Timeline, + "sDocumentStructure:STimelineRelation" => SaltType::TimelineRelation, "sDocumentStructure:SToken" => SaltType::Token, _ => SaltType::Unknown, } diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index b9195c41..7c372ca9 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -79,14 +79,12 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .next(); mapper.map_textual_datasources(updates)?; + mapper.map_tokens(&document_node_name, timeline.as_ref(), updates)?; if let Some(timeline) = timeline { mapper.map_timeline(&timeline, &document_node_name, updates)?; } - mapper.map_tokens(&document_node_name, timeline.as_ref(), updates)?; mapper.map_non_token_nodes(&document_node_name, updates)?; - - // TODO map SOrderRelation for segmentation nodes // TODO map SAudioDS and SAudioRelation Ok(()) @@ -132,6 +130,38 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { } else { bail!("SDATA attribute for timeline is not a number.") } + + // Connect the existing non-timeline tokens with the the timeline tokens + for timeline_rel in self + .edges + .iter() + .filter(|rel| SaltType::from_node(*rel) == SaltType::TimelineRelation) + { + let source_att = timeline_rel.attribute("source").unwrap_or_default(); + let token_node = resolve_element(&source_att, "nodes", &self.nodes) + .context("Token referenced in STimelineRelation cannot be resolved")?; + let token_id = get_element_id(&token_node).context("Token has no ID")?; + + let start = get_feature_by_qname(timeline_rel, "salt", "SSTART") + .context("Missing SSTART attribute for timeline relation")?; + let end = get_feature_by_qname(timeline_rel, "salt", "SEND") + .context("Missing SEND attribute for timeline relation")?; + + if let (SaltObject::Integer(start), SaltObject::Integer(end)) = (start, end) { + for tli in start..end { + updates.add_event(UpdateEvent::AddEdge { + source_node: token_id.clone(), + target_node: format!("{document_node_name}#tli{tli}"), 
+ layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::Coverage.to_string(), + component_name: "".to_string(), + })?; + } + } else { + bail!("SSTART/SEND not an integer") + } + } + Ok(()) } @@ -457,6 +487,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { } // Connect all spans with the token using the spanning relations + // TODO: use the covered timeline token as target if there is a timeline for spanning_rel in self .edges .iter() diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap index f2c71b31..570b226d 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap @@ -29,679 +29,679 @@ expression: actual datasource - + node + naja - + node + pass - + node + auf - + node + also - + node + du - + node + hast - + node + jetz - + node + wohl - + node + hier - + node + auch - + node + so - + node + ne - + node + Karte - + node + wie - + node + ich - + node + bloß - + node + ich - + node + hab - + node + ne - + node + Linie - + node + und - + node + du - + node + nich - + node + eine - + node + genau - + node + und - + node + ich - + node + muss - + node + dir - + node + jetzt - + node + erklärn - + node + wie - + node + du - + node + vom - + node + Start - + node + zum - + node + Ziel - + node + kommst - + node + so - + node + wie - + node + meine - + node + Linie - + node + geht - + node + so - + node + also - + node + du - + node + hast - + node + n - + node + Stift - + node + okay - + node + aso - + node + du - + node + musst - + node + jetzt - + node + vom - + node + Startpunkt - + node + äh - + node + waagerecht - + node + Richtung - + node + also - + node + zu - + node + dem - + node + ersten - + node + Bild - + node + erstmal - + node + und - + node + zum - + node + Rad - + node + äh - + node + ((lacht)) - + node + fang - + node + einfach - + node + ma - + node + an - + node + ((lacht)) - + node + nee - + node + ich - + node + hab - + node + gar - + node + keine - + node + Linien - + node + ich - + node + hab - + node + Start - + node + und - + node + Ziel - + node + genau - + node + jap - + node + mhm - + node + zum - + node + (?) 
- + node + Schornsteinfeger - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - naja - + node - pass - + node - auf - + node - also - + node - du - + node - hast - + node - jetz - + node - wohl - + node - hier - + node - auch - + node - so - + node - ne - + node - Karte - + node - wie - + node - ich - + node - bloß - + node - ich - + node - hab - + node - ne - + node - Linie - + node - und - + node - du - + node - nich - + node - eine - + node - genau - + node - und - + node - ich - + node - muss - + node - dir - + node - jetzt - + node - erklärn - + node - wie - + node - du - + node - vom - + node - Start - + node - zum - + node - Ziel - + node - kommst - + node - so - + node - wie - + node - meine - + node - Linie - + node - geht - + node - so - + node - also - + node - du - + node - hast - + node - n - + node - Stift - + node - okay - + node - aso - + node - du - + node - musst - + node - jetzt - + node - vom - + node - Startpunkt - + node - äh - + node - waagerecht - + node - Richtung - + node - also - + node - zu - + node - dem - + node - ersten - + node - Bild - + node - erstmal - + node - und - + node - zum - + node - Rad - + node - äh - + node - ((lacht)) - + node - fang - + node - einfach - + node - ma - + node - an - + node - ((lacht)) - + node - nee - + node - ich - + node - hab - + node - gar - + node - keine - + node - Linien - + node - ich - + node - hab - + node - Start - + node - und - + node - Ziel - + node - genau - + node - jap - + node - mhm - + node - zum - + node - (?) - + node - Schornsteinfeger node @@ -1111,1339 +1111,1535 @@ expression: actual node zum (?) Schornsteinfeger - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + 
- + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + From d42575de9e9a6aa4140facc481ec03a10c430427 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 5 Jul 2024 14:39:07 +0200 Subject: [PATCH 42/61] Move document node to mapper struct --- src/importer/saltxml/document.rs | 39 ++++++++++++-------------------- 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 7c372ca9..e8942911 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -27,6 +27,7 @@ pub(super) struct DocumentMapper<'a, 'input> { edges: Vec>, layers: Vec>, base_texts: BTreeMap, + document_node_name: String, missing_anno_ns_from_layer: bool, } @@ -59,6 +60,8 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .children() .filter(|n| n.tag_name().name() == "layers") .collect_vec(); + let document_node_name = + get_element_id(&doc.root_element()).context("Missing document ID")?; let mut mapper = DocumentMapper { base_texts: BTreeMap::new(), @@ -66,11 +69,9 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { nodes, edges, layers, + document_node_name, }; - let document_node_name = - get_element_id(&doc.root_element()).context("Missing document ID")?; - let timeline = mapper .nodes .iter() @@ -79,29 +80,24 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .next(); mapper.map_textual_datasources(updates)?; - mapper.map_tokens(&document_node_name, timeline.as_ref(), updates)?; + mapper.map_tokens(timeline.as_ref(), updates)?; if let Some(timeline) = timeline { - mapper.map_timeline(&timeline, &document_node_name, updates)?; + mapper.map_timeline(&timeline, updates)?; } - mapper.map_non_token_nodes(&document_node_name, updates)?; + mapper.map_non_token_nodes(timeline.as_ref(), updates)?; // TODO map SAudioDS and SAudioRelation Ok(()) } - fn map_timeline( - &self, - timeline: &Node, - document_node_name: &str, - updates: &mut GraphUpdate, - ) -> Result<()> { + fn map_timeline(&self, timeline: &Node, updates: &mut GraphUpdate) -> Result<()> { let number_of_tlis = get_feature_by_qname(timeline, "saltCommon", "SDATA") .context("Missing SDATA attribute for timeline.")?; if let SaltObject::Integer(number_of_tlis) = number_of_tlis { let mut previous_tli = None; for i in 0..number_of_tlis { - let tli_node_name = format!("{document_node_name}#tli{i}"); + let tli_node_name = format!("{}#tli{i}", self.document_node_name); 
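                // Editor's note (descriptive comment, not part of the original patch):
                // each timeline item (TLI) is materialized as a node of its own, named
                // "<document node name>#tli<i>". For a hypothetical document node
                // "corpus/doc1" this yields corpus/doc1#tli0, corpus/doc1#tli1, and so on.
                // The segmentation tokens are attached to the TLIs they span further
                // down in this function, via Coverage edges derived from the SSTART/SEND
                // values of the STimelineRelation edges.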
updates.add_event(UpdateEvent::AddNode { node_name: tli_node_name.clone(), node_type: "node".to_string(), @@ -109,7 +105,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { updates.add_event(UpdateEvent::AddEdge { source_node: tli_node_name.clone(), - target_node: document_node_name.to_string(), + target_node: self.document_node_name.clone(), layer: ANNIS_NS.to_string(), component_type: AnnotationComponentType::PartOf.to_string(), component_name: "".to_string(), @@ -151,7 +147,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { for tli in start..end { updates.add_event(UpdateEvent::AddEdge { source_node: token_id.clone(), - target_node: format!("{document_node_name}#tli{tli}"), + target_node: format!("{}#tli{tli}", self.document_node_name), layer: ANNIS_NS.to_string(), component_type: AnnotationComponentType::Coverage.to_string(), component_name: "".to_string(), @@ -333,19 +329,14 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { Ok(()) } - fn map_tokens( - &self, - document_node_name: &str, - timeline: Option<&Node>, - updates: &mut GraphUpdate, - ) -> Result<()> { + fn map_tokens(&self, timeline: Option<&Node>, updates: &mut GraphUpdate) -> Result<()> { // Map the token nodes in the same order as in the SaltXML file for token_node in self .nodes .iter() .filter(|n| SaltType::from_node(n) == SaltType::Token) { - self.map_node(token_node, document_node_name, updates)?; + self.map_node(token_node, &self.document_node_name, updates)?; } // Order textual relations by their start offset, so we iterate in the @@ -476,14 +467,14 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { fn map_non_token_nodes( &self, - document_node_name: &str, + timeline: Option<&Node>, updates: &mut GraphUpdate, ) -> Result<()> { for span_node in self.nodes.iter().filter(|n| { let t = SaltType::from_node(n); t == SaltType::Span || t == SaltType::Structure }) { - self.map_node(span_node, document_node_name, updates)?; + self.map_node(span_node, &self.document_node_name, updates)?; } // Connect all spans with the token using the spanning relations From 789912b9a1523454f34789bdbf3a7eb763a03491 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 5 Jul 2024 15:07:00 +0200 Subject: [PATCH 43/61] Add coverage to indirectly covered TLI token --- src/importer/saltxml/document.rs | 71 +- ...ltxml__tests__read_salt_with_timeline.snap | 1620 +++++++++-------- 2 files changed, 927 insertions(+), 764 deletions(-) diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index e8942911..517633cf 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -28,6 +28,7 @@ pub(super) struct DocumentMapper<'a, 'input> { layers: Vec>, base_texts: BTreeMap, document_node_name: String, + token_to_tli: BTreeMap>, missing_anno_ns_from_layer: bool, } @@ -70,6 +71,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { edges, layers, document_node_name, + token_to_tli: BTreeMap::new(), }; let timeline = mapper @@ -85,13 +87,13 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { mapper.map_timeline(&timeline, updates)?; } - mapper.map_non_token_nodes(timeline.as_ref(), updates)?; + mapper.map_non_token_nodes(updates)?; // TODO map SAudioDS and SAudioRelation Ok(()) } - fn map_timeline(&self, timeline: &Node, updates: &mut GraphUpdate) -> Result<()> { + fn map_timeline(&mut self, timeline: &Node, updates: &mut GraphUpdate) -> Result<()> { let number_of_tlis = get_feature_by_qname(timeline, "saltCommon", "SDATA") .context("Missing SDATA attribute for timeline.")?; if let SaltObject::Integer(number_of_tlis) 
= number_of_tlis { @@ -153,6 +155,8 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { component_name: "".to_string(), })?; } + self.token_to_tli + .insert(token_id, (start..end).collect_vec()); } else { bail!("SSTART/SEND not an integer") } @@ -253,6 +257,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { fn map_edge( &self, rel: &Node, + overwrite_target_node: Option, component_type: AnnotationComponentType, fallback_component_name: &str, updates: &mut GraphUpdate, @@ -262,10 +267,15 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { resolve_element(source_att_val, "nodes", &self.nodes).context("Missing source node")?; let source_id = get_element_id(&source_element).context("Missing source node ID")?; - let target_att_val = rel.attribute("target").unwrap_or_default(); - let target_element = - resolve_element(target_att_val, "nodes", &self.nodes).context("Missing target node")?; - let target_id = get_element_id(&target_element).context("Missing target node ID")?; + let target_id = if let Some(target_id) = overwrite_target_node { + target_id + } else { + let target_att_val = rel.attribute("target").unwrap_or_default(); + let target_element = resolve_element(target_att_val, "nodes", &self.nodes) + .context("Missing target node")?; + let target_id = get_element_id(&target_element).context("Missing target node ID")?; + target_id + }; let component_name = get_feature_by_qname(rel, "salt", "STYPE") .map(|t| t.to_string()) @@ -415,6 +425,15 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { anno_name: "tok".to_string(), anno_value: covered_text.iter().collect(), })?; + if timeline.is_some() { + // Add the token value as additional annotation + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: token_id.clone(), + anno_ns: text_prop.segmentation.clone(), + anno_name: text_prop.segmentation.clone(), + anno_value: covered_text.iter().collect(), + })?; + } // Get the whitespace before the first token if previous_token.is_none() && start > 0 { @@ -465,11 +484,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { Ok(()) } - fn map_non_token_nodes( - &self, - timeline: Option<&Node>, - updates: &mut GraphUpdate, - ) -> Result<()> { + fn map_non_token_nodes(&self, updates: &mut GraphUpdate) -> Result<()> { for span_node in self.nodes.iter().filter(|n| { let t = SaltType::from_node(n); t == SaltType::Span || t == SaltType::Structure @@ -478,13 +493,41 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { } // Connect all spans with the token using the spanning relations - // TODO: use the covered timeline token as target if there is a timeline + for spanning_rel in self .edges .iter() .filter(|rel| SaltType::from_node(rel) == SaltType::SpanningRelation) { - self.map_edge(spanning_rel, AnnotationComponentType::Coverage, "", updates)?; + let target_att = spanning_rel + .attribute("target") + .context("Missing target attribute for SSpanningRelation")?; + let target_node = resolve_element(&target_att, "nodes", &self.nodes) + .context("Could not resolve target for SSpanningRelation")?; + let target_node_id = get_element_id(&target_node).context("Target token has no ID")?; + + if let Some(tli_token) = self.token_to_tli.get(&target_node_id) { + // Add a coverage edge to the indirectly covered timeline item token + for tli in tli_token { + let tli_id = format!("{}#tli{tli}", &self.document_node_name); + self.map_edge( + spanning_rel, + Some(tli_id), + AnnotationComponentType::Coverage, + "", + updates, + )?; + } + } else { + // Directly map the coverage edge + self.map_edge( + spanning_rel, + None, + 
AnnotationComponentType::Coverage, + "", + updates, + )?; + } } // Add all dominance relations for dominance_rel in self @@ -494,6 +537,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { { self.map_edge( dominance_rel, + None, AnnotationComponentType::Dominance, "edge", updates, @@ -508,6 +552,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { { self.map_edge( pointing_rel, + None, AnnotationComponentType::Pointing, "edge", updates, diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap index 570b226d..72f44e75 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap @@ -10,9 +10,11 @@ expression: actual - - - + + + + + corpus @@ -31,371 +33,463 @@ expression: actual node - naja + naja + naja node - pass + pass + pass node - auf + auf + auf node - also + also + also node - du + du + du node - hast + hast + hast node - jetz + jetz + jetz node - wohl + wohl + wohl node - hier + hier + hier node - auch + auch + auch node - so + so + so node - ne + ne + ne node - Karte + Karte + Karte node - wie + wie + wie node - ich + ich + ich node - bloß + bloß + bloß node - ich + ich + ich node - hab + hab + hab node - ne + ne + ne node - Linie + Linie + Linie node - und + und + und node - du + du + du node - nich + nich + nich node - eine + eine + eine node - genau + genau + genau node - und + und + und node - ich + ich + ich node - muss + muss + muss node - dir + dir + dir node - jetzt + jetzt + jetzt node - erklärn + erklärn + erklärn node - wie + wie + wie node - du + du + du node - vom + vom + vom node - Start + Start + Start node - zum + zum + zum node - Ziel + Ziel + Ziel node - kommst + kommst + kommst node - so + so + so node - wie + wie + wie node - meine + meine + meine node - Linie + Linie + Linie node - geht + geht + geht node - so + so + so node - also + also + also node - du + du + du node - hast + hast + hast node - n + n + n node - Stift + Stift + Stift node - okay + okay + okay node - aso + aso + aso node - du + du + du node - musst + musst + musst node - jetzt + jetzt + jetzt node - vom + vom + vom node - Startpunkt + Startpunkt + Startpunkt node - äh + äh + äh node - waagerecht + waagerecht + waagerecht node - Richtung + Richtung + Richtung node - also + also + also node - zu + zu + zu node - dem + dem + dem node - ersten + ersten + ersten node - Bild + Bild + Bild node - erstmal + erstmal + erstmal node - und + und + und node - zum + zum + zum node - Rad + Rad + Rad node äh + äh node ((lacht)) + ((lacht)) node fang + fang node einfach + einfach node ma + ma node an + an node ((lacht)) + ((lacht)) node nee + nee node ich + ich node hab + hab node gar + gar node keine + keine node Linien + Linien node ich + ich node hab + hab node Start + Start node und + und node Ziel + Ziel node genau + genau node jap + jap node mhm + mhm node zum + zum node (?) + (?) 
node Schornsteinfeger + Schornsteinfeger node @@ -972,36 +1066,36 @@ expression: actual Rad + naja pass auf also du hast jetzt wohl hier auch so ne Karte wie ich bloß ich hab ne Linie und du nich ne node - naja pass auf also du hast jetzt wohl hier auch so ne Karte wie ich bloß ich hab ne Linie und du nich ne + genau node - genau + und ich muss dir jetzt erklären wie du vom Start zum Ziel kommst so wie meine Linie geht node - und ich muss dir jetzt erklären wie du vom Start zum Ziel kommst so wie meine Linie geht + so node - so + also hast n Stift node - also hast n Stift + okay also du musst jetzt vom Startpunkt äh waagerecht Richtung also zu dem ersten Bild erstmal node - okay also du musst jetzt vom Startpunkt äh waagerecht Richtung also zu dem ersten Bild erstmal + und node - und + zum Rad node - zum Rad node @@ -1093,23 +1187,23 @@ expression: actual node - äh fang einfach ma an + äh fang einfach ma an node - nee ich hab gar keine Linie ich hab Start und Ziel + nee ich hab gar keine Linie ich hab Start und Ziel node - genau + genau node - jap + jap node - zum (?) Schornsteinfeger + zum (?) Schornsteinfeger @@ -1307,1339 +1401,1363 @@ expression: actual - + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - 
+ - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + From 8002cf220a271928916ee39362d2a5085ac5164d Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 5 Jul 2024 15:36:29 +0200 Subject: [PATCH 44/61] Add tok annotation to TLIs --- src/importer/saltxml/document.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 517633cf..e9513d91 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -104,6 +104,12 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { node_name: tli_node_name.clone(), node_type: "node".to_string(), })?; + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: tli_node_name.clone(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "tok".to_string(), + anno_value: "".to_string(), + })?; updates.add_event(UpdateEvent::AddEdge { source_node: tli_node_name.clone(), From 848f1c0c02dcbef20c4da8d3350453eb1be31228 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 10 Jul 2024 15:36:06 +0200 Subject: [PATCH 45/61] Map and visualize datasources, but do do not connect tokens to them --- src/importer/saltxml/document.rs | 8 ++++++++ src/manipulator/visualize.rs | 34 ++++++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index e9513d91..5e84bd49 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -188,6 +188,14 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { node_name: element_id.clone(), node_type: "datasource".to_string(), })?; + + updates.add_event(UpdateEvent::AddEdge { + source_node: element_id.clone(), + target_node: self.document_node_name.clone(), + layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::PartOf.to_string(), + component_name: "".to_string(), + })?; } } Ok(()) diff --git a/src/manipulator/visualize.rs b/src/manipulator/visualize.rs index 126cc337..b959d17d 100644 --- a/src/manipulator/visualize.rs +++ b/src/manipulator/visualize.rs @@ -10,7 +10,11 @@ use graphannis::{ model::{AnnotationComponent, AnnotationComponentType}, AnnotationGraph, }; -use graphannis_core::types::NodeID as GraphAnnisNodeID; +use graphannis_core::{ + annostorage::ValueSearch, + graph::{ANNIS_NS, NODE_TYPE}, + types::NodeID as GraphAnnisNodeID, +}; use graphannis_core::{ dfs, graph::{storage::union::UnionEdgeContainer, NODE_NAME_KEY}, @@ -146,7 +150,7 @@ impl Visualize { } output.add_stmt(stmt!(subgraph)); - // Add all other nodes that are somehow connected to the included token + // Add all other nodes that are somehow connected to the included token and the document let all_components = graph.get_all_components(None, None); let all_gs = all_components .iter() @@ -171,6 +175,32 @@ impl Visualize { } } } + // Add all datasource nodes if they are connected to the included documents have not been already added + let part_of_gs = graph + .get_all_components(Some(AnnotationComponentType::PartOf), None) + .into_iter() + .filter_map(|c| graph.get_graphstorage(&c)) + .collect_vec(); + for ds in graph.get_node_annos().exact_anno_search( + Some(ANNIS_NS), + NODE_TYPE, + ValueSearch::Some("datasource"), + ) { + let ds = ds?.node; + if 
!included_nodes.contains(&ds) { + // The datsource must be part of a document node that is already included + let mut outgoing = HashSet::new(); + for gs in part_of_gs.iter() { + for o in gs.get_outgoing_edges(ds) { + outgoing.insert(o?); + } + } + if outgoing.intersection(&included_nodes).next().is_some() { + output.add_stmt(self.create_node_stmt(ds, graph)?); + included_nodes.insert(ds); + } + } + } // Output all edges grouped by their component for component in all_components.iter() { From 9208005a09079ad1dbf0e1103c842b5f4407925d Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 10 Jul 2024 15:38:44 +0200 Subject: [PATCH 46/61] Update test snapshots --- ...ltxml__tests__read_salt_sample_corpus.snap | 216 ++++--- ...ltxml__tests__read_salt_with_timeline.snap | 596 +++++++++--------- 2 files changed, 412 insertions(+), 400 deletions(-) diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index 1e5b2718..6a432ff9 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -1240,213 +1240,221 @@ expression: actual - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap index 23f08899..02e5aa88 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap @@ -2269,597 +2269,601 @@ expression: actual - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + From 129ffd735570141a440ce01dc00664f639a8cd80 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 12 
Jul 2024 16:31:10 +0200 Subject: [PATCH 47/61] Add whitespace after the TLI token, so that they don't cover all the same empty text --- src/importer/saltxml/document.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 5e84bd49..b5cae320 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -110,6 +110,12 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { anno_name: "tok".to_string(), anno_value: "".to_string(), })?; + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: tli_node_name.clone(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "tok-whitespace-after".to_string(), + anno_value: " ".to_string(), + })?; updates.add_event(UpdateEvent::AddEdge { source_node: tli_node_name.clone(), From c6867a9f20ed34989bd2506361dc9e0f627793fe Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 12 Jul 2024 16:32:51 +0200 Subject: [PATCH 48/61] Use single space for TLI token instead --- src/importer/saltxml/document.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index b5cae320..205b4534 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -108,15 +108,8 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { node_name: tli_node_name.clone(), anno_ns: ANNIS_NS.to_string(), anno_name: "tok".to_string(), - anno_value: "".to_string(), - })?; - updates.add_event(UpdateEvent::AddNodeLabel { - node_name: tli_node_name.clone(), - anno_ns: ANNIS_NS.to_string(), - anno_name: "tok-whitespace-after".to_string(), anno_value: " ".to_string(), })?; - updates.add_event(UpdateEvent::AddEdge { source_node: tli_node_name.clone(), target_node: self.document_node_name.clone(), From e8acd130b807f7b4eb152a31d1cafdf3076aab0e Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 12 Jul 2024 16:33:27 +0200 Subject: [PATCH 49/61] Update test snapshot --- ...ltxml__tests__read_salt_with_timeline.snap | 204 +++++++++--------- 1 file changed, 102 insertions(+), 102 deletions(-) diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap index 02e5aa88..951ee072 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap @@ -493,411 +493,411 @@ expression: actual node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node - + node From 2122e8e1c09c732066ab15129eb27085a0d919a9 Mon 
Sep 17 00:00:00 2001 From: Thomas Krause Date: Fri, 12 Jul 2024 16:58:27 +0200 Subject: [PATCH 50/61] Start to map the audio file --- Cargo.toml | 1 + src/importer/saltxml.rs | 14 + src/importer/saltxml/document.rs | 43 +- ...l__tests__read_salt_with_timeline.snap.new | 2877 +++++++++++++++++ 4 files changed, 2934 insertions(+), 1 deletion(-) create mode 100644 src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap.new diff --git a/Cargo.toml b/Cargo.toml index 4f3dd84e..c53eb414 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ thiserror = "1.0" toml = "0.8.0" tracing-subscriber = {version = "0.3", features = ["env-filter"]} umya-spreadsheet = "~1.1.1" +url = "2.5.2" xml-rs = "0.8" zip = "0.6.6" diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index c184151f..ed9e2943 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -4,6 +4,7 @@ use graphannis::update::GraphUpdate; use roxmltree::Node; use serde::Deserialize; use struct_field_names_as_array::FieldNamesAsSlice; +use url::Url; use crate::progress::ProgressReporter; @@ -85,6 +86,8 @@ enum SaltType { ElementId, Feature, Layer, + MediaDs, + MediaRelation, PointingRelation, Span, SpanningRelation, @@ -110,6 +113,8 @@ impl SaltType { "sCorpusStructure:SCorpusDocumentRelation" => SaltType::DocumentRelation, "sCorpusStructure:SCorpusRelation" => SaltType::CorpusRelation, "sCorpusStructure:SDocument" => SaltType::Document, + "sDocumentStructure:SAudioDS" => SaltType::MediaDs, + "sDocumentStructure:SAudioRelation" => SaltType::MediaRelation, "sDocumentStructure:SDominanceRelation" => SaltType::DominanceRelation, "sDocumentStructure:SPointingRelation" => SaltType::PointingRelation, "sDocumentStructure:SSpan" => SaltType::Span, @@ -120,6 +125,7 @@ impl SaltType { "sDocumentStructure:STimeline" => SaltType::Timeline, "sDocumentStructure:STimelineRelation" => SaltType::TimelineRelation, "sDocumentStructure:SToken" => SaltType::Token, + _ => SaltType::Unknown, } } else { @@ -132,6 +138,7 @@ enum SaltObject { Text(String), Boolean(bool), Integer(i64), + Url(Url), Null, } @@ -139,6 +146,12 @@ impl From<&str> for SaltObject { fn from(value: &str) -> Self { if let Some(value) = value.strip_prefix("T::") { SaltObject::Text(value.to_string()) + } else if let Some(value) = value.strip_prefix("U::") { + if let Ok(o) = Url::parse(value) { + SaltObject::Url(o) + } else { + SaltObject::Null + } } else if let Some(value) = value.strip_prefix("B::") { let value = value.to_ascii_lowercase() == "true"; SaltObject::Boolean(value) @@ -155,6 +168,7 @@ impl std::fmt::Display for SaltObject { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { SaltObject::Text(val) => write!(f, "{val}"), + SaltObject::Url(val) => write!(f, "{val}"), SaltObject::Boolean(val) => write!(f, "{val}"), SaltObject::Integer(val) => write!(f, "{val}"), SaltObject::Null => write!(f, ""), diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 205b4534..6fff0e0a 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -27,6 +27,7 @@ pub(super) struct DocumentMapper<'a, 'input> { edges: Vec>, layers: Vec>, base_texts: BTreeMap, + media_files: BTreeMap, document_node_name: String, token_to_tli: BTreeMap>, missing_anno_ns_from_layer: bool, @@ -66,6 +67,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { let mut mapper = DocumentMapper { base_texts: BTreeMap::new(), + media_files: BTreeMap::new(), missing_anno_ns_from_layer, nodes, 
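            // Editor's note (descriptive comment, not in the original patch): `nodes`,
            // `edges` and `layers` keep the raw XML elements of the corresponding
            // SaltXML lists so that references can later be resolved against them with
            // `resolve_element`; `media_files`, introduced in this patch, maps each
            // SAudioDS element ID to the file path taken from its SAUDIO_REFERENCE.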
edges, @@ -82,13 +84,13 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .next(); mapper.map_textual_datasources(updates)?; + mapper.map_media_datasources(updates)?; mapper.map_tokens(timeline.as_ref(), updates)?; if let Some(timeline) = timeline { mapper.map_timeline(&timeline, updates)?; } mapper.map_non_token_nodes(updates)?; - // TODO map SAudioDS and SAudioRelation Ok(()) } @@ -200,6 +202,45 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { Ok(()) } + fn map_media_datasources(&mut self, updates: &mut GraphUpdate) -> Result<()> { + for media_node in self + .nodes + .iter() + .filter(|n| SaltType::from_node(n) == SaltType::MediaDs) + { + let element_id = get_element_id(media_node) + .context("Missing element ID for media/audio data source")?; + + if let Some(SaltObject::Url(anno_value)) = + get_feature_by_qname(media_node, "salt", "SAUDIO_REFERENCE") + { + let file_path = anno_value.to_file_path().unwrap(); + self.media_files + .insert(element_id.clone(), file_path.to_string_lossy().to_string()); + + updates.add_event(UpdateEvent::AddNode { + node_name: element_id.clone(), + node_type: "file".to_string(), + })?; + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: element_id.clone(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "file".to_string(), + anno_value: file_path.to_string_lossy().to_string(), + })?; + + updates.add_event(UpdateEvent::AddEdge { + source_node: element_id.clone(), + target_node: self.document_node_name.clone(), + layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::PartOf.to_string(), + component_name: "".to_string(), + })?; + } + } + Ok(()) + } + fn map_node( &self, n: &Node, diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap.new b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap.new new file mode 100644 index 00000000..33d0a6ec --- /dev/null +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap.new @@ -0,0 +1,2877 @@ +--- +source: src/importer/saltxml/tests.rs +assertion_line: 29 +expression: actual +--- + + + + + + + + + + + + + + + + + corpus + + + + file:/tmp/pepper_thomas/workspace/rdr3o1mf/dialog.demo5661470868672726454.salt + dialog.demo + corpus + + + datasource + + + datasource + + + file:dialog.demo.webm + file + + + node + naja + naja + + + node + pass + pass + + + node + auf + auf + + + node + also + also + + + node + du + du + + + node + hast + hast + + + node + jetz + jetz + + + node + wohl + wohl + + + node + hier + hier + + + node + auch + auch + + + node + so + so + + + node + ne + ne + + + node + Karte + Karte + + + node + wie + wie + + + node + ich + ich + + + node + bloß + bloß + + + node + ich + ich + + + node + hab + hab + + + node + ne + ne + + + node + Linie + Linie + + + node + und + und + + + node + du + du + + + node + nich + nich + + + node + eine + eine + + + node + genau + genau + + + node + und + und + + + node + ich + ich + + + node + muss + muss + + + node + dir + dir + + + node + jetzt + jetzt + + + node + erklärn + erklärn + + + node + wie + wie + + + node + du + du + + + node + vom + vom + + + node + Start + Start + + + node + zum + zum + + + node + Ziel + Ziel + + + node + kommst + kommst + + + node + so + so + + + node + wie + wie + + + node + meine + meine + + + node + Linie + Linie + + + node + geht + geht + + + node + so + so + + + node + also + also + + + node + du + du + + + node + hast + hast + + + node + n + n + + + node + Stift + Stift + + + 
node + okay + okay + + + node + aso + aso + + + node + du + du + + + node + musst + musst + + + node + jetzt + jetzt + + + node + vom + vom + + + node + Startpunkt + Startpunkt + + + node + äh + äh + + + node + waagerecht + waagerecht + + + node + Richtung + Richtung + + + node + also + also + + + node + zu + zu + + + node + dem + dem + + + node + ersten + ersten + + + node + Bild + Bild + + + node + erstmal + erstmal + + + node + und + und + + + node + zum + zum + + + node + Rad + Rad + + + node + äh + äh + + + node + ((lacht)) + ((lacht)) + + + node + fang + fang + + + node + einfach + einfach + + + node + ma + ma + + + node + an + an + + + node + ((lacht)) + ((lacht)) + + + node + nee + nee + + + node + ich + ich + + + node + hab + hab + + + node + gar + gar + + + node + keine + keine + + + node + Linien + Linien + + + node + ich + ich + + + node + hab + hab + + + node + Start + Start + + + node + und + und + + + node + Ziel + Ziel + + + node + genau + genau + + + node + jap + jap + + + node + mhm + mhm + + + node + zum + zum + + + node + (?) + (?) + + + node + Schornsteinfeger + Schornsteinfeger + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + naja + + + node + pass + + + node + auf + + + node + also + + + node + du + + + node + hast + + + node + jetzt + + + node + wohl + + + node + hier + + + node + auch + + + node + so + + + node + eine + + + node + Karte + + + node + wie + + + node + ich + + + node + bloß + + + node + ich + + + node + habe + + + node + eine + + + node + Linie + + + node + und + + + node + du + + + node + nicht + + + node + ne + + + node + genau + + + node + und + + + node + ich + + + node + muss + + + node + dir + + + node + jetzt + + + node + erklären + + + node + wie + + + node + du + + + node + vom + + + node + Start + + + node + zum + + + node + Ziel + + + node + kommst + + + node + so + + + node + wie + + + node + meine + + + node + Linie + + + node + so + + + node + also + + + node + du + + + node + hast + + + node + einen + + + node + Stift + + + node + okay + + + node + also + + + node + du + + + node + musst + + + node + jetzt + + + node + vom + + + node + Startpunkt + + + node + äh + + + node + waagerecht + + + node + Richtung + + + node + also + + + node + zu + + + node + dem + + + node + 
ersten + + + node + Bild + + + node + erstmal + + + node + und + + + node + zum + + + node + Rad + + + naja pass auf also du hast jetzt wohl hier auch so ne Karte wie ich bloß ich hab ne Linie und du nich ne + node + + + genau + node + + + und ich muss dir jetzt erklären wie du vom Start zum Ziel kommst so wie meine Linie geht + node + + + so + node + + + also hast n Stift + node + + + okay also du musst jetzt vom Startpunkt äh waagerecht Richtung also zu dem ersten Bild erstmal + node + + + und + node + + + zum Rad + node + + + node + äh + + + node + fang + + + node + einfach + + + node + mal + + + node + an + + + node + nee + + + node + ich + + + node + habe + + + node + gar + + + node + keine + + + node + Linien + + + node + ich + + + node + habe + + + node + Start + + + node + und + + + node + Ziel + + + node + genau + + + node + jap + + + node + mhm + + + node + zum + + + node + (?) + + + node + Schornsteinfeger + + + äh fang einfach ma an + node + + + nee ich hab gar keine Linie ich hab Start und Ziel + node + + + genau + node + + + jap + node + + + zum (?) Schornsteinfeger + node + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 299e3d89b577afbc9c90bb6a94695170bfcd69ae Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 6 Aug 2024 14:57:33 +0200 Subject: [PATCH 51/61] Resolve the URL of the linked media/audio file --- src/importer/saltxml.rs | 12 +- src/importer/saltxml/document.rs | 45 +- ...ltxml__tests__read_salt_with_timeline.snap | 1855 +++++------ ...l__tests__read_salt_with_timeline.snap.new | 2877 ----------------- 4 files changed, 970 insertions(+), 3819 deletions(-) delete mode 100644 src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap.new diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index ed9e2943..a47ad696 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -4,7 +4,6 @@ use graphannis::update::GraphUpdate; use roxmltree::Node; use serde::Deserialize; use struct_field_names_as_array::FieldNamesAsSlice; -use url::Url; use crate::progress::ProgressReporter; @@ -56,9 +55,10 @@ impl Importer for ImportSaltXml { relative_document_path.push_str(".salt"); // Get the path from the node name let document_path = input_path.join(relative_document_path); - let document_file = std::fs::read_to_string(document_path)?; + let document_file = std::fs::read_to_string(&document_path)?; DocumentMapper::read_document( &document_file, + &document_path, self.missing_anno_ns_from_layer, &mut updates, )?; @@ -138,7 +138,7 @@ enum SaltObject { Text(String), Boolean(bool), Integer(i64), - Url(Url), + Url(String), Null, } @@ -147,11 +147,7 @@ impl From<&str> for SaltObject { if let Some(value) = value.strip_prefix("T::") { SaltObject::Text(value.to_string()) } else if let Some(value) = value.strip_prefix("U::") { - if let Ok(o) = Url::parse(value) { - SaltObject::Url(o) - } else { - SaltObject::Null - } + SaltObject::Url(value.to_string()) } else if let Some(value) = value.strip_prefix("B::") { let value = value.to_ascii_lowercase() == "true"; SaltObject::Boolean(value) diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 6fff0e0a..99dc243c 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -1,6 +1,7 @@ use std::{ collections::BTreeMap, convert::{TryFrom, TryInto}, + path::Path, }; use anyhow::{bail, Context, Result}; @@ -10,7 +11,9 @@ use graphannis::{ }; use graphannis_core::graph::ANNIS_NS; use itertools::Itertools; +use normpath::{BasePathBuf, PathExt}; use roxmltree::Node; +use url::Url; use super::{ get_annotations, get_element_id, get_feature_by_qname, resolve_element, SaltObject, SaltType, @@ -31,11 +34,13 @@ pub(super) struct DocumentMapper<'a, 'input> { document_node_name: String, token_to_tli: BTreeMap>, missing_anno_ns_from_layer: bool, + input_directory: BasePathBuf, } impl<'a, 'input> DocumentMapper<'a, 'input> { pub(super) fn read_document( input: &'input str, + input_path: &Path, 
missing_anno_ns_from_layer: bool, updates: &mut GraphUpdate, ) -> Result<()> { @@ -65,6 +70,8 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { let document_node_name = get_element_id(&doc.root_element()).context("Missing document ID")?; + let input_directory = input_path.parent().unwrap_or(input_path).normalize()?; + let mut mapper = DocumentMapper { base_texts: BTreeMap::new(), media_files: BTreeMap::new(), @@ -74,6 +81,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { layers, document_node_name, token_to_tli: BTreeMap::new(), + input_directory, }; let timeline = mapper @@ -140,10 +148,10 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { for timeline_rel in self .edges .iter() - .filter(|rel| SaltType::from_node(*rel) == SaltType::TimelineRelation) + .filter(|rel| SaltType::from_node(rel) == SaltType::TimelineRelation) { let source_att = timeline_rel.attribute("source").unwrap_or_default(); - let token_node = resolve_element(&source_att, "nodes", &self.nodes) + let token_node = resolve_element(source_att, "nodes", &self.nodes) .context("Token referenced in STimelineRelation cannot be resolved")?; let token_id = get_element_id(&token_node).context("Token has no ID")?; @@ -214,19 +222,37 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { if let Some(SaltObject::Url(anno_value)) = get_feature_by_qname(media_node, "salt", "SAUDIO_REFERENCE") { - let file_path = anno_value.to_file_path().unwrap(); - self.media_files - .insert(element_id.clone(), file_path.to_string_lossy().to_string()); - updates.add_event(UpdateEvent::AddNode { node_name: element_id.clone(), node_type: "file".to_string(), })?; + // Parse the file URL with the input file location as base path + let base_dir = Url::from_directory_path(self.input_directory.canonicalize()?).ok(); + let referenced_url = Url::options() + .base_url(base_dir.as_ref()) + .parse(&anno_value)?; + + let file_path = if referenced_url.scheme() == "file" { + // Resolve this file URL against the input direcotry and + // store it relative to the current working directory. + let referenced_path = Path::new(referenced_url.path()); + let referenced_path = pathdiff::diff_paths( + referenced_path.normalize()?, + &std::env::current_dir()?, + ) + .unwrap_or_else(|| referenced_path.to_path_buf()); + referenced_path.to_string_lossy().to_string() + } else { + referenced_url.to_string() + }; + self.media_files + .insert(element_id.clone(), file_path.clone()); + updates.add_event(UpdateEvent::AddNodeLabel { node_name: element_id.clone(), anno_ns: ANNIS_NS.to_string(), anno_name: "file".to_string(), - anno_value: file_path.to_string_lossy().to_string(), + anno_value: file_path, })?; updates.add_event(UpdateEvent::AddEdge { @@ -327,8 +353,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { let target_att_val = rel.attribute("target").unwrap_or_default(); let target_element = resolve_element(target_att_val, "nodes", &self.nodes) .context("Missing target node")?; - let target_id = get_element_id(&target_element).context("Missing target node ID")?; - target_id + get_element_id(&target_element).context("Missing target node ID")? 
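            // Editor's note (not part of the original patch): `overwrite_target_node`
            // is passed as `Some(tli_id)` by the SSpanningRelation handling in
            // `map_non_token_nodes` when the document uses a timeline, so that the
            // span ends up covering the generated "#tli<i>" nodes instead of the
            // segmentation token the relation originally points to.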
}; let component_name = get_feature_by_qname(rel, "salt", "STYPE") @@ -556,7 +581,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { let target_att = spanning_rel .attribute("target") .context("Missing target attribute for SSpanningRelation")?; - let target_node = resolve_element(&target_att, "nodes", &self.nodes) + let target_node = resolve_element(target_att, "nodes", &self.nodes) .context("Could not resolve target for SSpanningRelation")?; let target_node_id = get_element_id(&target_node).context("Target token has no ID")?; diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap index 951ee072..f2c40a9f 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap @@ -7,1305 +7,1310 @@ expression: actual - - - - - - - - + + + + + + + + + - corpus + corpus file:/tmp/pepper_thomas/workspace/rdr3o1mf/dialog.demo5661470868672726454.salt dialog.demo - corpus + corpus - datasource + datasource - datasource + datasource + + + tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.webm + file - node - naja + node naja + naja - node - pass + node pass + pass - node - auf + node auf + auf - node - also + node also + also - node - du + node du + du - node - hast + node hast + hast - node - jetz + node jetz + jetz - node - wohl + node wohl + wohl - node - hier + node hier + hier - node - auch + node auch + auch - node - so + node so + so - node - ne + node ne + ne - node - Karte + node Karte + Karte - node - wie + node wie + wie - node - ich + node ich + ich - node - bloß + node bloß + bloß - node - ich + node ich + ich - node - hab + node hab + hab - node - ne + node ne + ne - node - Linie + node Linie + Linie - node - und + node und + und - node - du + node du + du - node - nich + node nich + nich - node - eine + node eine + eine - node - genau + node genau + genau - node - und + node und + und - node - ich + node ich + ich - node - muss + node muss + muss - node - dir + node dir + dir - node - jetzt + node jetzt + jetzt - node - erklärn + node erklärn + erklärn - node - wie + node wie + wie - node - du + node du + du - node - vom + node vom + vom - node - Start + node Start + Start - node - zum + node zum + zum - node - Ziel + node Ziel + Ziel - node - kommst + node kommst + kommst - node - so + node so + so - node - wie + node wie + wie - node - meine + node meine + meine - node - Linie + node Linie + Linie - node - geht + node geht + geht - node - so + node so + so - node - also + node also + also - node - du + node du + du - node - hast + node hast + hast - node - n + node n + n - node - Stift + node Stift + Stift - node - okay + node okay + okay - node - aso + node aso + aso - node - du + node du + du - node - musst + node musst + musst - node - jetzt + node jetzt + jetzt - node - vom + node vom + vom - node - Startpunkt + node Startpunkt + Startpunkt - node - äh + node äh + äh - node - waagerecht + node waagerecht + waagerecht - node - Richtung + node Richtung + Richtung - node - also + node also + also - node - zu + node zu + zu - node - dem + node dem + dem - node - ersten + node ersten + ersten - node - Bild + node Bild + Bild - node - erstmal + node erstmal + erstmal - node - und + node und + und - node - zum + node zum + zum - node - Rad + node Rad + Rad - node - äh - äh + node + äh + äh - 
node - ((lacht)) - ((lacht)) + node + ((lacht)) + ((lacht)) - node - fang - fang + node + fang + fang - node - einfach - einfach + node + einfach + einfach - node - ma - ma + node + ma + ma - node - an - an + node + an + an - node - ((lacht)) - ((lacht)) + node + ((lacht)) + ((lacht)) - node - nee - nee + node + nee + nee - node - ich - ich + node + ich + ich - node - hab - hab + node + hab + hab - node - gar - gar + node + gar + gar - node - keine - keine + node + keine + keine - node - Linien - Linien + node + Linien + Linien - node - ich - ich + node + ich + ich - node - hab - hab + node + hab + hab - node - Start - Start + node + Start + Start - node - und - und + node + und + und - node - Ziel - Ziel + node + Ziel + Ziel - node - genau - genau + node + genau + genau - node - jap - jap + node + jap + jap - node - mhm - mhm + node + mhm + mhm - node - zum - zum + node + zum + zum - node - (?) - (?) + node + (?) + (?) - node - Schornsteinfeger - Schornsteinfeger + node + Schornsteinfeger + Schornsteinfeger - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - naja + node + naja - node - pass + node + pass - node - auf + node + auf - node - also + node + also - node - du + node + du - node - hast + node + hast - node - jetzt + node + jetzt - node - wohl + node + wohl - node - hier + node + hier - node - auch + node + auch - node - so + node + so - node - eine + node + eine - node - Karte + node + Karte - node - wie + node + wie - node - ich + node + ich - node - bloß + node + bloß - node - ich + node + ich - node - habe + node + habe - node - eine + node + eine - node - Linie + node + Linie - node - und + node + und - node - du + node + du - node - nicht + node + nicht - node - ne + node + ne - node - genau + node + genau - node - 
und + node + und - node - ich + node + ich - node - muss + node + muss - node - dir + node + dir - node - jetzt + node + jetzt - node - erklären + node + erklären - node - wie + node + wie - node - du + node + du - node - vom + node + vom - node - Start + node + Start - node - zum + node + zum - node - Ziel + node + Ziel - node - kommst + node + kommst - node - so + node + so - node - wie + node + wie - node - meine + node + meine - node - Linie + node + Linie - node - so + node + so - node - also + node + also - node - du + node + du - node - hast + node + hast - node - einen + node + einen - node - Stift + node + Stift - node - okay + node + okay - node - also + node + also - node - du + node + du - node - musst + node + musst - node - jetzt + node + jetzt - node - vom + node + vom - node - Startpunkt + node + Startpunkt - node - äh + node + äh - node - waagerecht + node + waagerecht - node - Richtung + node + Richtung - node - also + node + also - node - zu + node + zu - node - dem + node + dem - node - ersten + node + ersten - node - Bild + node + Bild - node - erstmal + node + erstmal - node - und + node + und - node - zum + node + zum - node - Rad + node + Rad - naja pass auf also du hast jetzt wohl hier auch so ne Karte wie ich bloß ich hab ne Linie und du nich ne - node + naja pass auf also du hast jetzt wohl hier auch so ne Karte wie ich bloß ich hab ne Linie und du nich ne + node - genau - node + genau + node - und ich muss dir jetzt erklären wie du vom Start zum Ziel kommst so wie meine Linie geht - node + und ich muss dir jetzt erklären wie du vom Start zum Ziel kommst so wie meine Linie geht + node - so - node + so + node - also hast n Stift - node + also hast n Stift + node - okay also du musst jetzt vom Startpunkt äh waagerecht Richtung also zu dem ersten Bild erstmal - node + okay also du musst jetzt vom Startpunkt äh waagerecht Richtung also zu dem ersten Bild erstmal + node - und - node + und + node - zum Rad - node + zum Rad + node - node - äh + node + äh - node - fang + node + fang - node - einfach + node + einfach - node - mal + node + mal - node - an + node + an - node - nee + node + nee - node - ich + node + ich - node - habe + node + habe - node - gar + node + gar - node - keine + node + keine - node - Linien + node + Linien - node - ich + node + ich - node - habe + node + habe - node - Start + node + Start - node - und + node + und - node - Ziel + node + Ziel - node - genau + node + genau - node - jap + node + jap - node - mhm + node + mhm - node - zum + node + zum - node - (?) + node + (?) - node - Schornsteinfeger + node + Schornsteinfeger - node - äh fang einfach ma an + äh fang einfach ma an + node - node - nee ich hab gar keine Linie ich hab Start und Ziel + nee ich hab gar keine Linie ich hab Start und Ziel + node - node - genau + genau + node - node - jap + jap + node - node - zum (?) Schornsteinfeger + zum (?) 
Schornsteinfeger + node @@ -2273,597 +2278,599 @@ expression: actual - + + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap.new b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap.new deleted file mode 100644 index 33d0a6ec..00000000 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap.new +++ /dev/null @@ -1,2877 +0,0 @@ ---- -source: src/importer/saltxml/tests.rs -assertion_line: 29 -expression: actual ---- - - - - - - - - - - - - - - - - - corpus - - - - file:/tmp/pepper_thomas/workspace/rdr3o1mf/dialog.demo5661470868672726454.salt - dialog.demo - corpus - - - datasource - - - datasource - - - file:dialog.demo.webm - file - - - node - naja - naja - - - node - pass - pass - - - node - auf - auf - - - node - also - also - - - node - du - du - - - node - hast - hast - - - node - jetz - jetz - - - node - wohl - wohl - - - node - hier - hier - - - node - auch - auch - - - node - so - so - - - node - ne - ne - - - node - Karte - Karte - - - node - wie - wie - - - node - ich - ich - - - node - bloß - bloß - - - node - ich - ich - - - node - hab - hab - - - node - ne - ne - - - node - Linie - Linie - - - node - und - und - - - node - du - du - - - node - nich - nich - - - node - eine - eine - - - node - genau - genau - - - node - und - und - - - node - ich - ich - - - node - muss - muss - - - node - dir - dir - - - node - jetzt - jetzt - - - node - erklärn - erklärn - - - node - wie - wie - - - node - du - du - - - node - vom - vom - - - node - Start - Start - - - node - zum - zum - - - node - Ziel - Ziel - - - node - kommst - kommst - - - node - so - so - - - node - wie - wie - - - node - meine - meine - - - node - Linie - Linie - - - node - geht - geht - - - node - so - so - - - node - also - also - - - node - du - du - - - node - hast - hast - - - node - n - n - - - node - Stift - Stift - - - node - okay - okay - - - node - aso - aso - - - node - du - du - - - node - musst - musst - - - node - jetzt - jetzt - - - node - vom - vom - - - node - Startpunkt - Startpunkt - - - node - äh - äh - - - node - waagerecht - waagerecht - - - node - Richtung - Richtung - - - node - also - also - - - node - zu - zu - - - node - dem - dem - - - node - ersten - ersten - - - node - Bild - Bild - - - 
node - erstmal - erstmal - - - node - und - und - - - node - zum - zum - - - node - Rad - Rad - - - node - äh - äh - - - node - ((lacht)) - ((lacht)) - - - node - fang - fang - - - node - einfach - einfach - - - node - ma - ma - - - node - an - an - - - node - ((lacht)) - ((lacht)) - - - node - nee - nee - - - node - ich - ich - - - node - hab - hab - - - node - gar - gar - - - node - keine - keine - - - node - Linien - Linien - - - node - ich - ich - - - node - hab - hab - - - node - Start - Start - - - node - und - und - - - node - Ziel - Ziel - - - node - genau - genau - - - node - jap - jap - - - node - mhm - mhm - - - node - zum - zum - - - node - (?) - (?) - - - node - Schornsteinfeger - Schornsteinfeger - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - - - - node - naja - - - node - pass - - - node - auf - - - node - also - - - node - du - - - node - hast - - - node - jetzt - - - node - wohl - - - node - hier - - - node - auch - - - node - so - - - node - eine - - - node - Karte - - - node - wie - - - node - ich - - - node - bloß - - - node - ich - - - node - habe - - - node - eine - - - node - Linie - - - node - und - - - node - du - - - node - nicht - - - node - ne - - - node - genau - - - node - und - - - node - ich - - - node - muss - - - node - dir - - - node - jetzt - - - node - erklären - - - node - wie - - - node - du - - - node - vom - - - node - Start - - - node - zum - - - node - Ziel - - - node - kommst - - - node - so - - - node - wie - - - node - meine - - - node - Linie - - - node - so - - - node - also - - - node - du - - - node - hast - - - node - einen - - - node - Stift - - - node - okay - - - node - also - - - node - du - - - node - musst - - - node - jetzt - - - node - vom - - - node - Startpunkt - - - node - äh - - - node - waagerecht - - - node - Richtung - - - node - also - - - node - zu - - - node - dem - - - node - ersten - - - node - Bild - - - node - erstmal - - - node - und - - - node - zum - - - node - Rad - - - naja pass auf also du hast jetzt wohl hier auch so ne Karte wie ich bloß ich hab ne Linie und du nich ne - node - - - genau - node - - - und ich muss dir jetzt erklären wie du vom Start zum Ziel kommst so wie meine Linie geht - node - - - so - node - - - also hast n Stift - node - - - okay also 
du musst jetzt vom Startpunkt äh waagerecht Richtung also zu dem ersten Bild erstmal - node - - - und - node - - - zum Rad - node - - - node - äh - - - node - fang - - - node - einfach - - - node - mal - - - node - an - - - node - nee - - - node - ich - - - node - habe - - - node - gar - - - node - keine - - - node - Linien - - - node - ich - - - node - habe - - - node - Start - - - node - und - - - node - Ziel - - - node - genau - - - node - jap - - - node - mhm - - - node - zum - - - node - (?) - - - node - Schornsteinfeger - - - äh fang einfach ma an - node - - - nee ich hab gar keine Linie ich hab Start und Ziel - node - - - genau - node - - - jap - node - - - zum (?) Schornsteinfeger - node - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - From f5ca1f70edc500c89881fc9bb1e0cbf1e1fc1375 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 6 Aug 2024 15:24:44 +0200 Subject: [PATCH 52/61] Map corpus/document annotations but do not map the ones of the "salt" namespace --- src/importer/saltxml/corpus_structure.rs | 43 +- src/importer/saltxml/document.rs | 1 + ...ltxml__tests__read_salt_sample_corpus.snap | 875 ++++++----- ...ltxml__tests__read_salt_with_timeline.snap | 1358 ++++++++--------- 4 files changed, 1143 insertions(+), 1134 deletions(-) diff --git a/src/importer/saltxml/corpus_structure.rs b/src/importer/saltxml/corpus_structure.rs index ae1248f5..1822139e 100644 --- a/src/importer/saltxml/corpus_structure.rs +++ b/src/importer/saltxml/corpus_structure.rs @@ -8,7 +8,7 @@ use graphannis::{ use graphannis_core::graph::ANNIS_NS; use itertools::Itertools; -use super::{get_element_id, get_features, resolve_element, SaltObject, SaltType}; +use super::{get_annotations, get_element_id, get_features, resolve_element, SaltObject, SaltType}; pub(super) struct SaltCorpusStructureMapper {} @@ -38,8 +38,6 @@ impl SaltCorpusStructureMapper { .children() .filter(|t| t.tag_name().name() == "sCorpusGraphs") { - // TODO: map corpus graph labels - // Get all nodes let nodes = cg .children() @@ -74,15 +72,17 @@ impl SaltCorpusStructureMapper { feature_node.attribute("value").unwrap_or_default(), ); - if annos_ns == Some("salt") && anno_name == "SNAME" { - // Only map this specific feature as document name - if salt_type == SaltType::Document { - updates.add_event(UpdateEvent::AddNodeLabel { - node_name: node_name.to_string(), - anno_ns: ANNIS_NS.to_string(), - anno_name: "doc".to_string(), - anno_value: anno_value.to_string(), - })?; + if annos_ns == Some("salt") { + if anno_name == "SNAME" { + // Only map this specific feature as document name + if salt_type == SaltType::Document { + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.to_string(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "doc".to_string(), + anno_value: anno_value.to_string(), + })?; + } } } else { updates.add_event(UpdateEvent::AddNodeLabel { @@ -93,8 +93,25 @@ impl SaltCorpusStructureMapper { })?; } } + // Add annotations + for anno_node in get_annotations(node) { + let annos_ns = anno_node.attribute("namespace"); + if annos_ns != Some("salt") { + let anno_name = anno_node.attribute("name").ok_or_else(|| { + anyhow!("Missing \"name\" attribute for node \"{node_name}\"") + })?; + let anno_value = SaltObject::from( + anno_node.attribute("value").unwrap_or_default(), + ); - // TODO: map annotations (that are not features) + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.to_string(), + anno_ns: annos_ns.unwrap_or_default().to_string(), + anno_name: anno_name.to_string(), + anno_value: anno_value.to_string(), + })?; + } + } } _ => {} } diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 99dc243c..c43d025c 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -211,6 +211,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { } fn map_media_datasources(&mut self, updates: &mut GraphUpdate) -> Result<()> { + // TODO: Map time codes from the SAudioRelation as annis::time for media_node in self .nodes 
.iter() diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index 6a432ff9..ec0cf4f0 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -5,684 +5,679 @@ expression: actual - - - - - - - - - + + + + + + + + - corpus + corpus - corpus + corpus - corpus + corpus - file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt - doc1 - corpus + doc1 + corpus - file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt - doc2 - corpus + doc2 + corpus - file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt - doc3 - corpus + doc3 + corpus - file:/home/thomas/korpora/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt - doc4 - corpus + doc4 + corpus - datasource + datasource - morphology - be - node - VBZ - Is - + morphology + be + node + VBZ + Is + - morphology - this - node - DT - this - + morphology + this + node + DT + this + - morphology - example - node - NN - example - + morphology + example + node + NN + example + - morphology - more - node - RBR - more - + morphology + more + node + RBR + more + - morphology - complicated - node - JJ - complicated - + morphology + complicated + node + JJ + complicated + - morphology - than - node - IN - than - + morphology + than + node + IN + than + - morphology - it - node - PRP - it - + morphology + it + node + PRP + it + - morphology - appear - node - VBZ - appears - + morphology + appear + node + VBZ + appears + - morphology - to - node - TO - to - + morphology + to + node + TO + to + - morphology - be - node - VB - be + morphology + be + node + VB + be - morphology - ? - node - . - ? + morphology + ? + node + . + ? contrast-focus - node + node topic - node + node - ROOT - syntax - node + ROOT + syntax + node - SQ - syntax - node + SQ + syntax + node - NP - syntax - node + NP + syntax + node - ADJP - syntax - node + ADJP + syntax + node - ADJP - syntax - node + ADJP + syntax + node - SBar - syntax - node + SBar + syntax + node - S - syntax - node + S + syntax + node - NP - syntax - node + NP + syntax + node - VP - syntax - node + VP + syntax + node - S - syntax - node + S + syntax + node - VP - syntax - node + VP + syntax + node - VP - syntax - node + VP + syntax + node - node + node - datasource + datasource - morphology - be - node - VBZ - Is - + morphology + be + node + VBZ + Is + - morphology - this - node - DT - this - + morphology + this + node + DT + this + - morphology - example - node - NN - example - + morphology + example + node + NN + example + - morphology - more - node - RBR - more - + morphology + more + node + RBR + more + - morphology - complicated - node - JJ - complicated - + morphology + complicated + node + JJ + complicated + - morphology - than - node - IN - than - + morphology + than + node + IN + than + - morphology - it - node - PRP - it - + morphology + it + node + PRP + it + - morphology - appear - node - VBZ - appears - + morphology + appear + node + VBZ + appears + - morphology - to - node - TO - to - + morphology + to + node + TO + to + - morphology - be - node - VB - be + morphology + be + node + VB + be - morphology - ? - node - . - ? + morphology + ? + node + . + ? 
contrast-focus - node + node topic - node + node - ROOT - syntax - node + ROOT + syntax + node - SQ - syntax - node + SQ + syntax + node - NP - syntax - node + NP + syntax + node - ADJP - syntax - node + ADJP + syntax + node - ADJP - syntax - node + ADJP + syntax + node - SBar - syntax - node + SBar + syntax + node - S - syntax - node + S + syntax + node - NP - syntax - node + NP + syntax + node - VP - syntax - node + VP + syntax + node - S - syntax - node + S + syntax + node - VP - syntax - node + VP + syntax + node - VP - syntax - node + VP + syntax + node - node + node - datasource + datasource - morphology - be - node - VBZ - Is - + morphology + be + node + VBZ + Is + - morphology - this - node - DT - this - + morphology + this + node + DT + this + - morphology - example - node - NN - example - + morphology + example + node + NN + example + - morphology - more - node - RBR - more - + morphology + more + node + RBR + more + - morphology - complicated - node - JJ - complicated - + morphology + complicated + node + JJ + complicated + - morphology - than - node - IN - than - + morphology + than + node + IN + than + - morphology - it - node - PRP - it - + morphology + it + node + PRP + it + - morphology - appear - node - VBZ - appears - + morphology + appear + node + VBZ + appears + - morphology - to - node - TO - to - + morphology + to + node + TO + to + - morphology - be - node - VB - be + morphology + be + node + VB + be - morphology - ? - node - . - ? + morphology + ? + node + . + ? contrast-focus - node + node topic - node + node - ROOT - syntax - node + ROOT + syntax + node - SQ - syntax - node + SQ + syntax + node - NP - syntax - node + NP + syntax + node - ADJP - syntax - node + ADJP + syntax + node - ADJP - syntax - node + ADJP + syntax + node - SBar - syntax - node + SBar + syntax + node - S - syntax - node + S + syntax + node - NP - syntax - node + NP + syntax + node - VP - syntax - node + VP + syntax + node - S - syntax - node + S + syntax + node - VP - syntax - node + VP + syntax + node - VP - syntax - node + VP + syntax + node - node + node - datasource + datasource - morphology - be - node - VBZ - Is - + morphology + be + node + VBZ + Is + - morphology - this - node - DT - this - + morphology + this + node + DT + this + - morphology - example - node - NN - example - + morphology + example + node + NN + example + - morphology - more - node - RBR - more - + morphology + more + node + RBR + more + - morphology - complicated - node - JJ - complicated - + morphology + complicated + node + JJ + complicated + - morphology - than - node - IN - than - + morphology + than + node + IN + than + - morphology - it - node - PRP - it - + morphology + it + node + PRP + it + - morphology - appear - node - VBZ - appears - + morphology + appear + node + VBZ + appears + - morphology - to - node - TO - to - + morphology + to + node + TO + to + - morphology - be - node - VB - be + morphology + be + node + VB + be - morphology - ? - node - . - ? + morphology + ? + node + . + ? 
contrast-focus - node + node topic - node + node - ROOT - syntax - node + ROOT + syntax + node - SQ - syntax - node + SQ + syntax + node - NP - syntax - node + NP + syntax + node - ADJP - syntax - node + ADJP + syntax + node - ADJP - syntax - node + ADJP + syntax + node - SBar - syntax - node + SBar + syntax + node - S - syntax - node + S + syntax + node - NP - syntax - node + NP + syntax + node - VP - syntax - node + VP + syntax + node - S - syntax - node + S + syntax + node - VP - syntax - node + VP + syntax + node - VP - syntax - node + VP + syntax + node - node + node diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap index f2c40a9f..80744d6a 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap @@ -4,1313 +4,1309 @@ expression: actual --- - - - - - - - - - - - - + + + + + + + + + + - corpus + corpus - - file:/tmp/pepper_thomas/workspace/rdr3o1mf/dialog.demo5661470868672726454.salt - dialog.demo - corpus + dialog.demo + corpus - datasource + datasource - datasource + datasource - tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.webm - file + tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.webm + file - node - naja - naja + node + naja + naja - node - pass - pass + node + pass + pass - node - auf - auf + node + auf + auf - node - also - also + node + also + also - node - du - du + node + du + du - node - hast - hast + node + hast + hast - node - jetz - jetz + node + jetz + jetz - node - wohl - wohl + node + wohl + wohl - node - hier - hier + node + hier + hier - node - auch - auch + node + auch + auch - node - so - so + node + so + so - node - ne - ne + node + ne + ne - node - Karte - Karte + node + Karte + Karte - node - wie - wie + node + wie + wie - node - ich - ich + node + ich + ich - node - bloß - bloß + node + bloß + bloß - node - ich - ich + node + ich + ich - node - hab - hab + node + hab + hab - node - ne - ne + node + ne + ne - node - Linie - Linie + node + Linie + Linie - node - und - und + node + und + und - node - du - du + node + du + du - node - nich - nich + node + nich + nich - node - eine - eine + node + eine + eine - node - genau - genau + node + genau + genau - node - und - und + node + und + und - node - ich - ich + node + ich + ich - node - muss - muss + node + muss + muss - node - dir - dir + node + dir + dir - node - jetzt - jetzt + node + jetzt + jetzt - node - erklärn - erklärn + node + erklärn + erklärn - node - wie - wie + node + wie + wie - node - du - du + node + du + du - node - vom - vom + node + vom + vom - node - Start - Start + node + Start + Start - node - zum - zum + node + zum + zum - node - Ziel - Ziel + node + Ziel + Ziel - node - kommst - kommst + node + kommst + kommst - node - so - so + node + so + so - node - wie - wie + node + wie + wie - node - meine - meine + node + meine + meine - node - Linie - Linie + node + Linie + Linie - node - geht - geht + node + geht + geht - node - so - so + node + so + so - node - also - also + node + also + also - node - du - du + node + du + du - node - hast - hast + node + hast + hast - node - n - n + node + n + n - node - Stift - Stift + node + Stift + Stift - node - okay - okay + node + okay + okay - node - aso - aso + node + aso + aso - node - du - du + node + du + du - node - musst 
- musst + node + musst + musst - node - jetzt - jetzt + node + jetzt + jetzt - node - vom - vom + node + vom + vom - node - Startpunkt - Startpunkt + node + Startpunkt + Startpunkt - node - äh - äh + node + äh + äh - node - waagerecht - waagerecht + node + waagerecht + waagerecht - node - Richtung - Richtung + node + Richtung + Richtung - node - also - also + node + also + also - node - zu - zu + node + zu + zu - node - dem - dem + node + dem + dem - node - ersten - ersten + node + ersten + ersten - node - Bild - Bild + node + Bild + Bild - node - erstmal - erstmal + node + erstmal + erstmal - node - und - und + node + und + und - node - zum - zum + node + zum + zum - node - Rad - Rad + node + Rad + Rad - node + node + äh äh - äh - node + node + ((lacht)) ((lacht)) - ((lacht)) - node + node + fang fang - fang - node + node + einfach einfach - einfach - node + node + ma ma - ma - node + node + an an - an - node + node + ((lacht)) ((lacht)) - ((lacht)) - node + node + nee nee - nee - node + node + ich ich - ich - node + node + hab hab - hab - node + node + gar gar - gar - node + node + keine keine - keine - node + node + Linien Linien - Linien - node + node + ich ich - ich - node + node + hab hab - hab - node + node + Start Start - Start - node + node + und und - und - node + node + Ziel Ziel - Ziel - node + node + genau genau - genau - node + node + jap jap - jap - node + node + mhm mhm - mhm - node + node + zum zum - zum - node + node + (?) (?) - (?) - node + node + Schornsteinfeger Schornsteinfeger - Schornsteinfeger - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - + node + - node - naja + node + naja - node - pass + node + pass - node - auf + node + auf - node - also + node + also - node - du + node + du - node - hast + node + hast - node - 
jetzt + node + jetzt - node - wohl + node + wohl - node - hier + node + hier - node - auch + node + auch - node - so + node + so - node - eine + node + eine - node - Karte + node + Karte - node - wie + node + wie - node - ich + node + ich - node - bloß + node + bloß - node - ich + node + ich - node - habe + node + habe - node - eine + node + eine - node - Linie + node + Linie - node - und + node + und - node - du + node + du - node - nicht + node + nicht - node - ne + node + ne - node - genau + node + genau - node - und + node + und - node - ich + node + ich - node - muss + node + muss - node - dir + node + dir - node - jetzt + node + jetzt - node - erklären + node + erklären - node - wie + node + wie - node - du + node + du - node - vom + node + vom - node - Start + node + Start - node - zum + node + zum - node - Ziel + node + Ziel - node - kommst + node + kommst - node - so + node + so - node - wie + node + wie - node - meine + node + meine - node - Linie + node + Linie - node - so + node + so - node - also + node + also - node - du + node + du - node - hast + node + hast - node - einen + node + einen - node - Stift + node + Stift - node - okay + node + okay - node - also + node + also - node - du + node + du - node - musst + node + musst - node - jetzt + node + jetzt - node - vom + node + vom - node - Startpunkt + node + Startpunkt - node - äh + node + äh - node - waagerecht + node + waagerecht - node - Richtung + node + Richtung - node - also + node + also - node - zu + node + zu - node - dem + node + dem - node - ersten + node + ersten - node - Bild + node + Bild - node - erstmal + node + erstmal - node - und + node + und - node - zum + node + zum - node - Rad + node + Rad - naja pass auf also du hast jetzt wohl hier auch so ne Karte wie ich bloß ich hab ne Linie und du nich ne - node + node + naja pass auf also du hast jetzt wohl hier auch so ne Karte wie ich bloß ich hab ne Linie und du nich ne - genau - node + node + genau - und ich muss dir jetzt erklären wie du vom Start zum Ziel kommst so wie meine Linie geht - node + node + und ich muss dir jetzt erklären wie du vom Start zum Ziel kommst so wie meine Linie geht - so - node + node + so - also hast n Stift - node + node + also hast n Stift - okay also du musst jetzt vom Startpunkt äh waagerecht Richtung also zu dem ersten Bild erstmal - node + node + okay also du musst jetzt vom Startpunkt äh waagerecht Richtung also zu dem ersten Bild erstmal - und - node + node + und - zum Rad - node + node + zum Rad - node - äh + node + äh - node - fang + node + fang - node - einfach + node + einfach - node - mal + node + mal - node - an + node + an - node - nee + node + nee - node - ich + node + ich - node - habe + node + habe - node - gar + node + gar - node - keine + node + keine - node - Linien + node + Linien - node - ich + node + ich - node - habe + node + habe - node - Start + node + Start - node - und + node + und - node - Ziel + node + Ziel - node - genau + node + genau - node - jap + node + jap - node - mhm + node + mhm - node - zum + node + zum - node - (?) + node + (?) - node - Schornsteinfeger + node + Schornsteinfeger - äh fang einfach ma an - node + node + äh fang einfach ma an - nee ich hab gar keine Linie ich hab Start und Ziel - node + node + nee ich hab gar keine Linie ich hab Start und Ziel - genau - node + node + genau - jap - node + node + jap - zum (?) Schornsteinfeger - node + node + zum (?) 
Schornsteinfeger From bedf582060f4a7ae7dda083bdf8d2a80e541d493 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 6 Aug 2024 15:57:13 +0200 Subject: [PATCH 53/61] Map time information from SaltXML --- src/exporter/graphml.rs | 2 +- src/importer/saltxml.rs | 5 + src/importer/saltxml/document.rs | 69 ++- ...ltxml__tests__read_salt_with_timeline.snap | 513 +++++++++++------- 4 files changed, 374 insertions(+), 215 deletions(-) diff --git a/src/exporter/graphml.rs b/src/exporter/graphml.rs index c47e712e..4236e674 100644 --- a/src/exporter/graphml.rs +++ b/src/exporter/graphml.rs @@ -242,7 +242,7 @@ fn media_vis(graph: &AnnotationGraph) -> Result, Box { + "mp4" | "avi" | "mov" | "webm" => { vis.push(Visualizer { element: "node".to_string(), layer: None, diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index a47ad696..9070c3cc 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -138,6 +138,7 @@ enum SaltObject { Text(String), Boolean(bool), Integer(i64), + Float(f64), Url(String), Null, } @@ -154,6 +155,9 @@ impl From<&str> for SaltObject { } else if let Some(value) = value.strip_prefix("N::") { let value = value.parse::().unwrap_or_default(); SaltObject::Integer(value) + } else if let Some(value) = value.strip_prefix("F::") { + let value = value.parse::().unwrap_or_default(); + SaltObject::Float(value) } else { SaltObject::Null } @@ -167,6 +171,7 @@ impl std::fmt::Display for SaltObject { SaltObject::Url(val) => write!(f, "{val}"), SaltObject::Boolean(val) => write!(f, "{val}"), SaltObject::Integer(val) => write!(f, "{val}"), + SaltObject::Float(val) => write!(f, "{val}"), SaltObject::Null => write!(f, ""), } } diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index c43d025c..9ed2e30d 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -97,19 +97,26 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { if let Some(timeline) = timeline { mapper.map_timeline(&timeline, updates)?; } + if !mapper.media_files.is_empty() { + mapper.map_media_relations(updates)?; + } mapper.map_non_token_nodes(updates)?; Ok(()) } + fn get_tli_node_name(&self, tli: i64) -> String { + format!("{}#tli{tli}", self.document_node_name) + } + fn map_timeline(&mut self, timeline: &Node, updates: &mut GraphUpdate) -> Result<()> { let number_of_tlis = get_feature_by_qname(timeline, "saltCommon", "SDATA") .context("Missing SDATA attribute for timeline.")?; if let SaltObject::Integer(number_of_tlis) = number_of_tlis { let mut previous_tli = None; for i in 0..number_of_tlis { - let tli_node_name = format!("{}#tli{i}", self.document_node_name); + let tli_node_name = self.get_tli_node_name(i); updates.add_event(UpdateEvent::AddNode { node_name: tli_node_name.clone(), node_type: "node".to_string(), @@ -164,7 +171,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { for tli in start..end { updates.add_event(UpdateEvent::AddEdge { source_node: token_id.clone(), - target_node: format!("{}#tli{tli}", self.document_node_name), + target_node: self.get_tli_node_name(tli), layer: ANNIS_NS.to_string(), component_type: AnnotationComponentType::Coverage.to_string(), component_name: "".to_string(), @@ -211,7 +218,6 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { } fn map_media_datasources(&mut self, updates: &mut GraphUpdate) -> Result<()> { - // TODO: Map time codes from the SAudioRelation as annis::time for media_node in self .nodes .iter() @@ -268,6 +274,61 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { Ok(()) } + fn map_media_relations(&mut 
self, updates: &mut GraphUpdate) -> Result<()> { + for media_rel in self + .edges + .iter() + .filter(|n| SaltType::from_node(n) == SaltType::MediaRelation) + { + let source_att = media_rel.attribute("source").unwrap_or_default(); + let token_node = resolve_element(source_att, "nodes", &self.nodes) + .context("Token referenced in SAudioRelation cannot be resolved")?; + let token_id = get_element_id(&token_node).context("Token has no ID")?; + + let start = get_feature_by_qname(media_rel, "salt", "SSTART") + .context("Missing SSTART attribute for SAudioRlation")?; + let end = get_feature_by_qname(media_rel, "salt", "SEND") + .context("Missing SEND attribute for SAudioRelation")?; + + if let (SaltObject::Float(start), SaltObject::Float(end)) = (start, end) { + if let Some(covered_tli) = self.token_to_tli.get(&token_id) { + if let (Some(first_tli), Some(last_tli)) = + (covered_tli.first(), covered_tli.last()) + { + if first_tli == last_tli { + // Attach start and end time to the same token + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: self.get_tli_node_name(*first_tli), + anno_ns: "annis".to_string(), + anno_name: "time".to_string(), + anno_value: format!("{start}-{end}"), + })?; + } else { + // Attach start time to first token and end time to + // last token + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: self.get_tli_node_name(*first_tli), + anno_ns: "annis".to_string(), + anno_name: "time".to_string(), + anno_value: format!("{start}-"), + })?; + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: self.get_tli_node_name(*last_tli), + anno_ns: "annis".to_string(), + anno_name: "time".to_string(), + anno_value: format!("-{end}"), + })?; + } + } + } + } else { + bail!("SSTART/SEND not a float") + } + } + + Ok(()) + } + fn map_node( &self, n: &Node, @@ -589,7 +650,7 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { if let Some(tli_token) = self.token_to_tli.get(&target_node_id) { // Add a coverage edge to the indirectly covered timeline item token for tli in tli_token { - let tli_id = format!("{}#tli{tli}", &self.document_node_name); + let tli_id = self.get_tli_node_name(*tli); self.map_edge( spanning_rel, Some(tli_id), diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap index 80744d6a..bcffac59 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap @@ -11,9 +11,10 @@ expression: actual - - - + + + + corpus @@ -35,870 +36,962 @@ expression: actual node naja - naja + naja node pass - pass + pass node auf - auf + auf node also - also + also node du - du + du node hast - hast + hast node jetz - jetz + jetz node wohl - wohl + wohl node hier - hier + hier node auch - auch + auch node so - so + so node ne - ne + ne node Karte - Karte + Karte node wie - wie + wie node ich - ich + ich node bloß - bloß + bloß node ich - ich + ich node hab - hab + hab node ne - ne + ne node Linie - Linie + Linie node und - und + und node du - du + du node nich - nich + nich node eine - eine + eine node genau - genau + genau node und - und + und node ich - ich + ich node muss - muss + muss node dir - dir + dir node jetzt - jetzt + jetzt node erklärn - erklärn + erklärn node wie - wie + wie node du - du + du node vom - vom + vom node Start - Start + Start node zum - zum + zum node Ziel - 
Ziel + Ziel node kommst - kommst + kommst node so - so + so node wie - wie + wie node meine - meine + meine node Linie - Linie + Linie node geht - geht + geht node so - so + so node also - also + also node du - du + du node hast - hast + hast node n - n + n node Stift - Stift + Stift node okay - okay + okay node aso - aso + aso node du - du + du node musst - musst + musst node jetzt - jetzt + jetzt node vom - vom + vom node Startpunkt - Startpunkt + Startpunkt node äh - äh + äh node waagerecht - waagerecht + waagerecht node Richtung - Richtung + Richtung node also - also + also node zu - zu + zu node dem - dem + dem node ersten - ersten + ersten node Bild - Bild + Bild node erstmal - erstmal + erstmal node und - und + und node zum - zum + zum node Rad - Rad + Rad node äh - äh + äh node ((lacht)) - ((lacht)) + ((lacht)) node fang - fang + fang node einfach - einfach + einfach node ma - ma + ma node an - an + an node ((lacht)) - ((lacht)) + ((lacht)) node nee - nee + nee node ich - ich + ich node hab - hab + hab node gar - gar + gar node keine - keine + keine node Linien - Linien + Linien node ich - ich + ich node hab - hab + hab node Start - Start + Start node und - und + und node Ziel - Ziel + Ziel node genau - genau + genau node jap - jap + jap node mhm - mhm + mhm node zum - zum + zum node (?) - (?) + (?) node Schornsteinfeger - Schornsteinfeger + Schornsteinfeger node - + 0-0.6884364908323192 + node - + 0.6884364908323192-1.6052463032285857 + node - + 1.6052463032285857-1.809765415224676 + node - + 1.809765415224676-2.12712265797723 + node - + 2.12712265797723-2.3069584288703435 + node - + 2.3069584288703435-2.462110858660481 + node - + 2.462110858660481-3.061563428304194 + node - + 3.061563428304194-3.3859730542290265 + node - + 3.3859730542290265-3.564405882202723 + node - + 3.564405882202723-3.7724511857849525 + node - + 3.7724511857849525-3.9840226809533217 + node - + 3.9840226809533217-4.06865127902067 + node - + 4.06865127902067-4.290801348947457 + node - + 4.290801348947457-4.61521097487229 + node - + 4.61521097487229-4.766837213076288 + node - + 4.766837213076288-5.045406348381307 + node - + 5.045406348381307-5.182927820240747 + node - + 5.182927820240747-5.348658824789303 + node - + 5.348658824789303-5.50381125457944 + node - + 5.50381125457944-5.785906581470599 + node - + 5.785906581470599-5.969268543949853 + node - + 5.969268543949853-6.187892422290501 + node - + 6.187892422290501-6.492035791177925 + node - + 6.492035791177925-6.689502520001737 + node - + 6.689502520001737-6.819971608688898 + node - + 6.819971608688898-6.957493080548338 + node - + 6.957493080548338-7.3101122391622875 + node - + 7.3101122391622875-7.4970003932276805 + node - + 7.4970003932276805-7.578102799708889 + node - + 7.578102799708889-7.990667215287209 + node - + 7.990667215287209-8.198712518869439 + node - + 8.198712518869439-8.343286373901158 + node - + 8.343286373901158-8.477281654174458 + node - + 8.477281654174458-8.61127693444776 + node - + 8.61127693444776-8.766429364237897 + node - + 8.766429364237897-8.94979132671715 + node - + 8.94979132671715-9.111996139679567 + node - + 9.111996139679567-9.193098546160776 + node - + 9.193098546160776-9.270674761055844 + node - + 9.270674761055844-9.54771075568088 + node - + 9.54771075568088-9.727546526573994 + node - + 9.727546526573994- + node - + -10.041377577740407 + node - + node - + 10.196530007530546-10.358734820492963 + node - + 10.358734820492963-10.5138872502831 + node - + 10.5138872502831-10.718840651322928 + node - + 
10.718840651322928-10.9127811885606 + node - + 10.9127811885606-11.099669342625994 + node - + 11.099669342625994-11.836643384129147 + node - + node - + 12.50309359390951-12.619457916252113 + node - + 12.619457916252113-12.71466508907788 + node - + 12.71466508907788-12.933288967418529 + node - + 12.933288967418529-13.25769859334336 + node - + 13.25769859334336-13.511584387545403 + node - + 13.511584387545403-13.850098779814795 + node - + 13.850098779814795-14.426381411663305 + node - + node - + 14.704950546968325-14.934153000067392 + node - + 14.934153000067392-15.121041154132785 + node - + 15.121041154132785-15.420767438954641 + node - + 15.420767438954641-15.794543747085427 + node - + 15.794543747085427-16.147162905699375 + node - + node - + 16.390470125143-17.00737639011335 + node - + node - + 17.113162137697536-17.783138539064044 + node - + 17.783138539064044-18.132231506091856 + node - + 18.132231506091856-18.195702954642368 + node - + 18.195702954642368-18.38259110870776 + node - + 18.38259110870776-18.449588748844413 + node - + 18.449588748844413-18.932676996145524 + node - + 18.932676996145524-19.953208071992766 + node - + 19.953208071992766-20.31816890902807 + node - + 20.31816890902807-20.471558246332762 + node - + 20.471558246332762-20.572054708704805 + node - + 20.572054708704805-20.75717977096909 + node - + 20.75717977096909-20.963461983206436 + node - + 20.963461983206436-21.132719182990925 + node - + 21.132719182990925-21.952558744447042 + node - + node - + 22.042476631832553-23.116201992965404 + node - + 23.116201992965404-23.7720736421303 + node - + 23.7720736421303-24.956836985878773 + node - + 24.956836985878773-25.965327779514674 + node - + node - + 26.71640658736239-26.86310541935397 + node - + 26.86310541935397-26.972417358524297 + node - + 26.972417358524297-27.095834064039177 + node - + 27.095834064039177-27.392034157274896 + node - + 27.392034157274896-27.920962895195824 + node - + 27.920962895195824-28.739039343180192 + node - + 28.739039343180192-29.052870394346606 + node - + 29.052870394346606-29.882474847095374 + node - + node - + 30.16457018006952-30.568359047306725 + node - + 30.568359047306725- + node - + 30.57541143063108-30.93508298017312 + node - + -31.760211829122508 + node - + node - + node @@ -1169,36 +1262,36 @@ expression: actual Rad + naja pass auf also du hast jetzt wohl hier auch so ne Karte wie ich bloß ich hab ne Linie und du nich ne node - naja pass auf also du hast jetzt wohl hier auch so ne Karte wie ich bloß ich hab ne Linie und du nich ne + genau node - genau + und ich muss dir jetzt erklären wie du vom Start zum Ziel kommst so wie meine Linie geht node - und ich muss dir jetzt erklären wie du vom Start zum Ziel kommst so wie meine Linie geht + so node - so + also hast n Stift node - also hast n Stift + okay also du musst jetzt vom Startpunkt äh waagerecht Richtung also zu dem ersten Bild erstmal node - okay also du musst jetzt vom Startpunkt äh waagerecht Richtung also zu dem ersten Bild erstmal + und node - und + zum Rad node - zum Rad node @@ -1290,23 +1383,23 @@ expression: actual node - äh fang einfach ma an + äh fang einfach ma an node - nee ich hab gar keine Linie ich hab Start und Ziel + nee ich hab gar keine Linie ich hab Start und Ziel node - genau + genau node - jap + jap node - zum (?) Schornsteinfeger + zum (?) 
Schornsteinfeger From f817efdd7e6100ed6addf44efb019deafc08283e Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 6 Aug 2024 16:06:03 +0200 Subject: [PATCH 54/61] Use the file name as node ID --- src/importer/saltxml/document.rs | 28 +++++++++++++------ ...ltxml__tests__read_salt_with_timeline.snap | 4 +-- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs index 9ed2e30d..c9037339 100644 --- a/src/importer/saltxml/document.rs +++ b/src/importer/saltxml/document.rs @@ -223,23 +223,21 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { .iter() .filter(|n| SaltType::from_node(n) == SaltType::MediaDs) { - let element_id = get_element_id(media_node) + let orig_element_id = get_element_id(media_node) .context("Missing element ID for media/audio data source")?; if let Some(SaltObject::Url(anno_value)) = get_feature_by_qname(media_node, "salt", "SAUDIO_REFERENCE") { - updates.add_event(UpdateEvent::AddNode { - node_name: element_id.clone(), - node_type: "file".to_string(), - })?; // Parse the file URL with the input file location as base path let base_dir = Url::from_directory_path(self.input_directory.canonicalize()?).ok(); let referenced_url = Url::options() .base_url(base_dir.as_ref()) .parse(&anno_value)?; - let file_path = if referenced_url.scheme() == "file" { + let mut element_id = orig_element_id; + let mut file_path = referenced_url.to_string(); + if referenced_url.scheme() == "file" { // Resolve this file URL against the input direcotry and // store it relative to the current working directory. let referenced_path = Path::new(referenced_url.path()); @@ -248,10 +246,22 @@ impl<'a, 'input> DocumentMapper<'a, 'input> { &std::env::current_dir()?, ) .unwrap_or_else(|| referenced_path.to_path_buf()); - referenced_path.to_string_lossy().to_string() - } else { - referenced_url.to_string() + + file_path = referenced_path.to_string_lossy().to_string(); + // Use the file name as element ID + if let Some(file_name) = referenced_path.file_name() { + element_id = format!( + "{}/{}", + self.document_node_name, + file_name.to_string_lossy() + ); + } }; + updates.add_event(UpdateEvent::AddNode { + node_name: element_id.clone(), + node_type: "file".to_string(), + })?; + self.media_files .insert(element_id.clone(), file_path.clone()); diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap index bcffac59..67c6eaaf 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap @@ -29,7 +29,7 @@ expression: actual datasource - + tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.webm file @@ -2367,7 +2367,7 @@ expression: actual - + From 33c8a952ee093994d64a5f01eedc44c5106dcbaf Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 6 Aug 2024 16:06:52 +0200 Subject: [PATCH 55/61] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 54fcda29..c360b038 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `map` manipulator can now add annotated spans and copy values from existing annotations. 
The copied values can be manipulated using regular expressions and replacement values. +- Addes `saltxml` import format ### Fixed From 072d0819a3b1d17376f4901b4126ae0cd13194f4 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 7 Aug 2024 14:40:54 +0200 Subject: [PATCH 56/61] Map meta annotation --- src/importer/saltxml.rs | 10 + src/importer/saltxml/corpus_structure.rs | 8 +- ...ltxml__tests__read_salt_with_timeline.snap | 1616 +++++++++-------- 3 files changed, 837 insertions(+), 797 deletions(-) diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs index 9070c3cc..2f19f683 100644 --- a/src/importer/saltxml.rs +++ b/src/importer/saltxml.rs @@ -88,6 +88,7 @@ enum SaltType { Layer, MediaDs, MediaRelation, + MetaAnnotation, PointingRelation, Span, SpanningRelation, @@ -109,6 +110,7 @@ impl SaltType { "saltCore:SElementId" => SaltType::ElementId, "saltCore:SFeature" => SaltType::Feature, "saltCore:SLayer" => SaltType::Layer, + "saltCore:SMetaAnnotation" => SaltType::MetaAnnotation, "sCorpusStructure:SCorpus" => SaltType::Corpus, "sCorpusStructure:SCorpusDocumentRelation" => SaltType::DocumentRelation, "sCorpusStructure:SCorpusRelation" => SaltType::CorpusRelation, @@ -200,6 +202,14 @@ fn get_annotations<'a, 'input>(n: &'a Node<'a, 'input>) -> impl Iterator( + n: &'a Node<'a, 'input>, +) -> impl Iterator> { + n.children().filter(|n| { + n.tag_name().name() == "labels" && SaltType::from_node(n) == SaltType::MetaAnnotation + }) +} + fn get_feature_by_qname(n: &Node, namespace: &str, name: &str) -> Option { get_features(n) .filter(|f| { diff --git a/src/importer/saltxml/corpus_structure.rs b/src/importer/saltxml/corpus_structure.rs index 1822139e..2a255da4 100644 --- a/src/importer/saltxml/corpus_structure.rs +++ b/src/importer/saltxml/corpus_structure.rs @@ -8,7 +8,9 @@ use graphannis::{ use graphannis_core::graph::ANNIS_NS; use itertools::Itertools; -use super::{get_annotations, get_element_id, get_features, resolve_element, SaltObject, SaltType}; +use super::{ + get_element_id, get_features, get_meta_annotations, resolve_element, SaltObject, SaltType, +}; pub(super) struct SaltCorpusStructureMapper {} @@ -93,8 +95,8 @@ impl SaltCorpusStructureMapper { })?; } } - // Add annotations - for anno_node in get_annotations(node) { + // Add meta annotations + for anno_node in get_meta_annotations(node) { let annos_ns = anno_node.attribute("namespace"); if annos_ns != Some("salt") { let anno_name = anno_node.attribute("name").ok_or_else(|| { diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap index 67c6eaaf..72345abe 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap @@ -4,1402 +4,1430 @@ expression: actual --- - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + - corpus + corpus - dialog.demo - corpus + SPK0 + SPK1 + German + deu + deu + corpus + m + m + dialog.demo + dialog.demo + BeMaTaC + Part of a dialog recorded in a map task setting + dialog.demo + HU Berlin + deu + deu - datasource + datasource - datasource + datasource - tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.webm - file + file + tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.webm - node - naja - naja + node + naja + naja - node - pass - pass + node + pass + pass - node - 
auf - auf + node + auf + auf - node - also - also + node + also + also - node - du - du + node + du + du - node - hast - hast + node + hast + hast - node - jetz - jetz + node + jetz + jetz - node - wohl - wohl + node + wohl + wohl - node - hier - hier + node + hier + hier - node - auch - auch + node + auch + auch - node - so - so + node + so + so - node - ne - ne + node + ne + ne - node - Karte - Karte + node + Karte + Karte - node - wie - wie + node + wie + wie - node - ich - ich + node + ich + ich - node - bloß - bloß + node + bloß + bloß - node - ich - ich + node + ich + ich - node - hab - hab + node + hab + hab - node - ne - ne + node + ne + ne - node - Linie - Linie + node + Linie + Linie - node - und - und + node + und + und - node - du - du + node + du + du - node - nich - nich + node + nich + nich - node - eine - eine + node + eine + eine - node - genau - genau + node + genau + genau - node - und - und + node + und + und - node - ich - ich + node + ich + ich - node - muss - muss + node + muss + muss - node - dir - dir + node + dir + dir - node - jetzt - jetzt + node + jetzt + jetzt - node - erklärn - erklärn + node + erklärn + erklärn - node - wie - wie + node + wie + wie - node - du - du + node + du + du - node - vom - vom + node + vom + vom - node - Start - Start + node + Start + Start - node - zum - zum + node + zum + zum - node - Ziel - Ziel + node + Ziel + Ziel - node - kommst - kommst + node + kommst + kommst - node - so - so + node + so + so - node - wie - wie + node + wie + wie - node - meine - meine + node + meine + meine - node - Linie - Linie + node + Linie + Linie - node - geht - geht + node + geht + geht - node - so - so + node + so + so - node - also - also + node + also + also - node - du - du + node + du + du - node - hast - hast + node + hast + hast - node - n - n + node + n + n - node - Stift - Stift + node + Stift + Stift - node - okay - okay + node + okay + okay - node - aso - aso + node + aso + aso - node - du - du + node + du + du - node - musst - musst + node + musst + musst - node - jetzt - jetzt + node + jetzt + jetzt - node - vom - vom + node + vom + vom - node - Startpunkt - Startpunkt + node + Startpunkt + Startpunkt - node - äh - äh + node + äh + äh - node - waagerecht - waagerecht + node + waagerecht + waagerecht - node - Richtung - Richtung + node + Richtung + Richtung - node - also - also + node + also + also - node - zu - zu + node + zu + zu - node - dem - dem + node + dem + dem - node - ersten - ersten + node + ersten + ersten - node - Bild - Bild + node + Bild + Bild - node - erstmal - erstmal + node + erstmal + erstmal - node - und - und + node + und + und - node - zum - zum + node + zum + zum - node - Rad - Rad + node + Rad + Rad - node - äh - äh + node + äh + äh - node - ((lacht)) - ((lacht)) + node + ((lacht)) + ((lacht)) - node - fang - fang + node + fang + fang - node - einfach - einfach + node + einfach + einfach - node - ma - ma + node + ma + ma - node - an - an + node + an + an - node - ((lacht)) - ((lacht)) + node + ((lacht)) + ((lacht)) - node - nee - nee + node + nee + nee - node - ich - ich + node + ich + ich - node - hab - hab + node + hab + hab - node - gar - gar + node + gar + gar - node - keine - keine + node + keine + keine - node - Linien - Linien + node + Linien + Linien - node - ich - ich + node + ich + ich - node - hab - hab + node + hab + hab - node - Start - Start + node + Start + Start - node - und - und + node + und + und - node - Ziel - Ziel + node + Ziel + Ziel - node - genau - genau + node + genau + genau - node - jap 
- jap + node + jap + jap - node - mhm - mhm + node + mhm + mhm - node - zum - zum + node + zum + zum - node - (?) - (?) + node + (?) + (?) - node - Schornsteinfeger - Schornsteinfeger + node + Schornsteinfeger + Schornsteinfeger - node - 0-0.6884364908323192 - + node + 0-0.6884364908323192 + - node - 0.6884364908323192-1.6052463032285857 - + node + 0.6884364908323192-1.6052463032285857 + - node - 1.6052463032285857-1.809765415224676 - + node + 1.6052463032285857-1.809765415224676 + - node - 1.809765415224676-2.12712265797723 - + node + 1.809765415224676-2.12712265797723 + - node - 2.12712265797723-2.3069584288703435 - + node + 2.12712265797723-2.3069584288703435 + - node - 2.3069584288703435-2.462110858660481 - + node + 2.3069584288703435-2.462110858660481 + - node - 2.462110858660481-3.061563428304194 - + node + 2.462110858660481-3.061563428304194 + - node - 3.061563428304194-3.3859730542290265 - + node + 3.061563428304194-3.3859730542290265 + - node - 3.3859730542290265-3.564405882202723 - + node + 3.3859730542290265-3.564405882202723 + - node - 3.564405882202723-3.7724511857849525 - + node + 3.564405882202723-3.7724511857849525 + - node - 3.7724511857849525-3.9840226809533217 - + node + 3.7724511857849525-3.9840226809533217 + - node - 3.9840226809533217-4.06865127902067 - + node + 3.9840226809533217-4.06865127902067 + - node - 4.06865127902067-4.290801348947457 - + node + 4.06865127902067-4.290801348947457 + - node - 4.290801348947457-4.61521097487229 - + node + 4.290801348947457-4.61521097487229 + - node - 4.61521097487229-4.766837213076288 - + node + 4.61521097487229-4.766837213076288 + - node - 4.766837213076288-5.045406348381307 - + node + 4.766837213076288-5.045406348381307 + - node - 5.045406348381307-5.182927820240747 - + node + 5.045406348381307-5.182927820240747 + - node - 5.182927820240747-5.348658824789303 - + node + 5.182927820240747-5.348658824789303 + - node - 5.348658824789303-5.50381125457944 - + node + 5.348658824789303-5.50381125457944 + - node - 5.50381125457944-5.785906581470599 - + node + 5.50381125457944-5.785906581470599 + - node - 5.785906581470599-5.969268543949853 - + node + 5.785906581470599-5.969268543949853 + - node - 5.969268543949853-6.187892422290501 - + node + 5.969268543949853-6.187892422290501 + - node - 6.187892422290501-6.492035791177925 - + node + 6.187892422290501-6.492035791177925 + - node - 6.492035791177925-6.689502520001737 - + node + 6.492035791177925-6.689502520001737 + - node - 6.689502520001737-6.819971608688898 - + node + 6.689502520001737-6.819971608688898 + - node - 6.819971608688898-6.957493080548338 - + node + 6.819971608688898-6.957493080548338 + - node - 6.957493080548338-7.3101122391622875 - + node + 6.957493080548338-7.3101122391622875 + - node - 7.3101122391622875-7.4970003932276805 - + node + 7.3101122391622875-7.4970003932276805 + - node - 7.4970003932276805-7.578102799708889 - + node + 7.4970003932276805-7.578102799708889 + - node - 7.578102799708889-7.990667215287209 - + node + 7.578102799708889-7.990667215287209 + - node - 7.990667215287209-8.198712518869439 - + node + 7.990667215287209-8.198712518869439 + - node - 8.198712518869439-8.343286373901158 - + node + 8.198712518869439-8.343286373901158 + - node - 8.343286373901158-8.477281654174458 - + node + 8.343286373901158-8.477281654174458 + - node - 8.477281654174458-8.61127693444776 - + node + 8.477281654174458-8.61127693444776 + - node - 8.61127693444776-8.766429364237897 - + node + 8.61127693444776-8.766429364237897 + - node - 8.766429364237897-8.94979132671715 - + node + 
8.766429364237897-8.94979132671715 + - node - 8.94979132671715-9.111996139679567 - + node + 8.94979132671715-9.111996139679567 + - node - 9.111996139679567-9.193098546160776 - + node + 9.111996139679567-9.193098546160776 + - node - 9.193098546160776-9.270674761055844 - + node + 9.193098546160776-9.270674761055844 + - node - 9.270674761055844-9.54771075568088 - + node + 9.270674761055844-9.54771075568088 + - node - 9.54771075568088-9.727546526573994 - + node + 9.54771075568088-9.727546526573994 + - node - 9.727546526573994- - + node + 9.727546526573994- + - node - -10.041377577740407 - + node + -10.041377577740407 + - node - + node + - node - 10.196530007530546-10.358734820492963 - + node + 10.196530007530546-10.358734820492963 + - node - 10.358734820492963-10.5138872502831 - + node + 10.358734820492963-10.5138872502831 + - node - 10.5138872502831-10.718840651322928 - + node + 10.5138872502831-10.718840651322928 + - node - 10.718840651322928-10.9127811885606 - + node + 10.718840651322928-10.9127811885606 + - node - 10.9127811885606-11.099669342625994 - + node + 10.9127811885606-11.099669342625994 + - node - 11.099669342625994-11.836643384129147 - + node + 11.099669342625994-11.836643384129147 + - node - + node + - node - 12.50309359390951-12.619457916252113 - + node + 12.50309359390951-12.619457916252113 + - node - 12.619457916252113-12.71466508907788 - + node + 12.619457916252113-12.71466508907788 + - node - 12.71466508907788-12.933288967418529 - + node + 12.71466508907788-12.933288967418529 + - node - 12.933288967418529-13.25769859334336 - + node + 12.933288967418529-13.25769859334336 + - node - 13.25769859334336-13.511584387545403 - + node + 13.25769859334336-13.511584387545403 + - node - 13.511584387545403-13.850098779814795 - + node + 13.511584387545403-13.850098779814795 + - node - 13.850098779814795-14.426381411663305 - + node + 13.850098779814795-14.426381411663305 + - node - + node + - node - 14.704950546968325-14.934153000067392 - + node + 14.704950546968325-14.934153000067392 + - node - 14.934153000067392-15.121041154132785 - + node + 14.934153000067392-15.121041154132785 + - node - 15.121041154132785-15.420767438954641 - + node + 15.121041154132785-15.420767438954641 + - node - 15.420767438954641-15.794543747085427 - + node + 15.420767438954641-15.794543747085427 + - node - 15.794543747085427-16.147162905699375 - + node + 15.794543747085427-16.147162905699375 + - node - + node + - node - 16.390470125143-17.00737639011335 - + node + 16.390470125143-17.00737639011335 + - node - + node + - node - 17.113162137697536-17.783138539064044 - + node + 17.113162137697536-17.783138539064044 + - node - 17.783138539064044-18.132231506091856 - + node + 17.783138539064044-18.132231506091856 + - node - 18.132231506091856-18.195702954642368 - + node + 18.132231506091856-18.195702954642368 + - node - 18.195702954642368-18.38259110870776 - + node + 18.195702954642368-18.38259110870776 + - node - 18.38259110870776-18.449588748844413 - + node + 18.38259110870776-18.449588748844413 + - node - 18.449588748844413-18.932676996145524 - + node + 18.449588748844413-18.932676996145524 + - node - 18.932676996145524-19.953208071992766 - + node + 18.932676996145524-19.953208071992766 + - node - 19.953208071992766-20.31816890902807 - + node + 19.953208071992766-20.31816890902807 + - node - 20.31816890902807-20.471558246332762 - + node + 20.31816890902807-20.471558246332762 + - node - 20.471558246332762-20.572054708704805 - + node + 20.471558246332762-20.572054708704805 + - node - 
20.572054708704805-20.75717977096909 - + node + 20.572054708704805-20.75717977096909 + - node - 20.75717977096909-20.963461983206436 - + node + 20.75717977096909-20.963461983206436 + - node - 20.963461983206436-21.132719182990925 - + node + 20.963461983206436-21.132719182990925 + - node - 21.132719182990925-21.952558744447042 - + node + 21.132719182990925-21.952558744447042 + - node - + node + - node - 22.042476631832553-23.116201992965404 - + node + 22.042476631832553-23.116201992965404 + - node - 23.116201992965404-23.7720736421303 - + node + 23.116201992965404-23.7720736421303 + - node - 23.7720736421303-24.956836985878773 - + node + 23.7720736421303-24.956836985878773 + - node - 24.956836985878773-25.965327779514674 - + node + 24.956836985878773-25.965327779514674 + - node - + node + - node - 26.71640658736239-26.86310541935397 - + node + 26.71640658736239-26.86310541935397 + - node - 26.86310541935397-26.972417358524297 - + node + 26.86310541935397-26.972417358524297 + - node - 26.972417358524297-27.095834064039177 - + node + 26.972417358524297-27.095834064039177 + - node - 27.095834064039177-27.392034157274896 - + node + 27.095834064039177-27.392034157274896 + - node - 27.392034157274896-27.920962895195824 - + node + 27.392034157274896-27.920962895195824 + - node - 27.920962895195824-28.739039343180192 - + node + 27.920962895195824-28.739039343180192 + - node - 28.739039343180192-29.052870394346606 - + node + 28.739039343180192-29.052870394346606 + - node - 29.052870394346606-29.882474847095374 - + node + 29.052870394346606-29.882474847095374 + - node - + node + - node - 30.16457018006952-30.568359047306725 - + node + 30.16457018006952-30.568359047306725 + - node - 30.568359047306725- - + node + 30.568359047306725- + - node - 30.57541143063108-30.93508298017312 - + node + 30.57541143063108-30.93508298017312 + - node - -31.760211829122508 - + node + -31.760211829122508 + - node - + node + - node - + node + - node - naja + node + naja - node - pass + node + pass - node - auf + node + auf - node - also + node + also - node - du + node + du - node - hast + node + hast - node - jetzt + node + jetzt - node - wohl + node + wohl - node - hier + node + hier - node - auch + node + auch - node - so + node + so - node - eine + node + eine - node - Karte + node + Karte - node - wie + node + wie - node - ich + node + ich - node - bloß + node + bloß - node - ich + node + ich - node - habe + node + habe - node - eine + node + eine - node - Linie + node + Linie - node - und + node + und - node - du + node + du - node - nicht + node + nicht - node - ne + node + ne - node - genau + node + genau - node - und + node + und - node - ich + node + ich - node - muss + node + muss - node - dir + node + dir - node - jetzt + node + jetzt - node - erklären + node + erklären - node - wie + node + wie - node - du + node + du - node - vom + node + vom - node - Start + node + Start - node - zum + node + zum - node - Ziel + node + Ziel - node - kommst + node + kommst - node - so + node + so - node - wie + node + wie - node - meine + node + meine - node - Linie + node + Linie - node - so + node + so - node - also + node + also - node - du + node + du - node - hast + node + hast - node - einen + node + einen - node - Stift + node + Stift - node - okay + node + okay - node - also + node + also - node - du + node + du - node - musst + node + musst - node - jetzt + node + jetzt - node - vom + node + vom - node - Startpunkt + node + Startpunkt - node - äh + node + äh - node - waagerecht + node + waagerecht - node - Richtung 
+ node + Richtung - node - also + node + also - node - zu + node + zu - node - dem + node + dem - node - ersten + node + ersten - node - Bild + node + Bild - node - erstmal + node + erstmal - node - und + node + und - node - zum + node + zum - node - Rad + node + Rad - naja pass auf also du hast jetzt wohl hier auch so ne Karte wie ich bloß ich hab ne Linie und du nich ne - node + node + naja pass auf also du hast jetzt wohl hier auch so ne Karte wie ich bloß ich hab ne Linie und du nich ne - genau - node + node + genau - und ich muss dir jetzt erklären wie du vom Start zum Ziel kommst so wie meine Linie geht - node + node + und ich muss dir jetzt erklären wie du vom Start zum Ziel kommst so wie meine Linie geht - so - node + node + so - also hast n Stift - node + node + also hast n Stift - okay also du musst jetzt vom Startpunkt äh waagerecht Richtung also zu dem ersten Bild erstmal - node + node + okay also du musst jetzt vom Startpunkt äh waagerecht Richtung also zu dem ersten Bild erstmal - und - node + node + und - zum Rad - node + node + zum Rad - node - äh + node + äh - node - fang + node + fang - node - einfach + node + einfach - node - mal + node + mal - node - an + node + an - node - nee + node + nee - node - ich + node + ich - node - habe + node + habe - node - gar + node + gar - node - keine + node + keine - node - Linien + node + Linien - node - ich + node + ich - node - habe + node + habe - node - Start + node + Start - node - und + node + und - node - Ziel + node + Ziel - node - genau + node + genau - node - jap + node + jap - node - mhm + node + mhm - node - zum + node + zum - node - (?) + node + (?) - node - Schornsteinfeger + node + Schornsteinfeger - node - äh fang einfach ma an + node + äh fang einfach ma an - node - nee ich hab gar keine Linie ich hab Start und Ziel + node + nee ich hab gar keine Linie ich hab Start und Ziel - node - genau + node + genau - node - jap + node + jap - node - zum (?) Schornsteinfeger + node + zum (?) 
Schornsteinfeger From 6ecbbada641e03fcc0e801d3767b6cba50dd1d3a Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 7 Aug 2024 15:27:29 +0200 Subject: [PATCH 57/61] Video and audio visualizers need the special "preloaded" visibility to work correctly --- src/exporter/graphml.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/exporter/graphml.rs b/src/exporter/graphml.rs index 4236e674..6a724d7b 100644 --- a/src/exporter/graphml.rs +++ b/src/exporter/graphml.rs @@ -238,7 +238,7 @@ fn media_vis(graph: &AnnotationGraph) -> Result, Box Result, Box Date: Wed, 7 Aug 2024 16:11:55 +0200 Subject: [PATCH 58/61] Add more complex pointing edges with annotations to one of the example Salt documents --- ...ltxml__tests__read_salt_sample_corpus.snap | 336 ++++++++++-------- .../rootCorpus/subCorpus1/doc1.salt | 100 ++++-- .../rootCorpus/subCorpus1/doc2.salt | 5 - .../rootCorpus/subCorpus2/doc3.salt | 5 - .../rootCorpus/subCorpus2/doc4.salt | 5 - 5 files changed, 248 insertions(+), 203 deletions(-) diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap index ec0cf4f0..f8c22fac 100644 --- a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -13,6 +13,7 @@ expression: actual + corpus @@ -1135,321 +1136,340 @@ expression: actual - + + det - + + cop - + + nsubj - + + advmod - + + advcl - + + mark - + + nsubj - + + xcomp - + + aux - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + + diff --git a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt index f65e6d0e..653bee7e 100644 --- a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt +++ b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt @@ -7,67 +7,67 @@ - + - + - + - + - + - + - + - + - + - + - + @@ -83,62 +83,62 @@ - + - + - + - + - + - + - + - + - + - + - + - + @@ -353,17 +353,57 @@ - + - + - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt index 886757da..c44261e6 100644 --- a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt +++ b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt @@ -353,11 +353,6 @@ - - - - - diff --git a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt index 427a6179..fd3f508a 100644 --- 
a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt +++ b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt @@ -353,11 +353,6 @@ - - - - - diff --git a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt index 2558b0dc..0d3170f3 100644 --- a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt +++ b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt @@ -353,11 +353,6 @@ - - - - - From 35b6a8b2ba8576d2eaa94390fc1364291d3c2e4e Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 7 Aug 2024 16:20:30 +0200 Subject: [PATCH 59/61] Add test for some of the features of the GraphmL guess_vis function --- ...aphml__tests__export_graphml_with_vis.snap | 300 ++++++++++++++++++ src/exporter/graphml/tests.rs | 34 ++ 2 files changed, 334 insertions(+) create mode 100644 src/exporter/graphml/snapshots/annatto__exporter__graphml__tests__export_graphml_with_vis.snap diff --git a/src/exporter/graphml/snapshots/annatto__exporter__graphml__tests__export_graphml_with_vis.snap b/src/exporter/graphml/snapshots/annatto__exporter__graphml__tests__export_graphml_with_vis.snap new file mode 100644 index 00000000..f9699887 --- /dev/null +++ b/src/exporter/graphml/snapshots/annatto__exporter__graphml__tests__export_graphml_with_vis.snap @@ -0,0 +1,300 @@ +--- +source: src/exporter/graphml/tests.rs +expression: graphml +--- + + + + + + + + + + + + + + + + + + + + + + + + + + + file + tests/data/import/exmaralda/clean/import/exmaralda/test_file.wav + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + dipl + node + 0-2.22222 + I'm + I'm + + + dipl + node + 2.22222-3.33333 + in + in + + + dipl + node + 3.33333-4.44444 + New + New + + + dipl + node + 4.44444-5.55555 + York + York + + + norm + node + I + 0-1.11111 + I + + + norm + node + am + 1.11111-2.22222 + am + + + norm + node + in + 2.22222-3.33333 + in + + + norm + node + New York + 3.33333-5.55555 + New York + + + dipl + node + 1 + 0-5.55555 + + + norm + I + node + PRON + 0-1.11111 + + + norm + be + node + VERB + 1.11111-2.22222 + + + norm + in + node + ADP + 2.22222-3.33333 + + + norm + New York + node + ADP + 3.33333-5.55555 + + + corpus + + + dipl + eng,eng + corpus + personal-anno-value-1 + personal-anno-value-2 + norm + was late for elicitation + was on time + test_doc + eng + deu + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/exporter/graphml/tests.rs b/src/exporter/graphml/tests.rs index 633a9ecb..6321776a 100644 --- a/src/exporter/graphml/tests.rs +++ b/src/exporter/graphml/tests.rs @@ -2,6 +2,7 @@ use super::*; use std::path::Path; use graphannis::AnnotationGraph; +use insta::assert_snapshot; use tempfile::TempDir; use crate::importer::{exmaralda::ImportEXMARaLDA, Importer}; @@ -47,3 +48,36 @@ fn export_as_zip_with_files() { files ); } + +#[test] +fn export_graphml_with_vis() { + let step_id = StepID { + module_name: "export_graphml".to_string(), + path: None, + }; + let importer = ImportEXMARaLDA::default(); + let mut updates = importer + .import_corpus( + Path::new("tests/data/import/exmaralda/clean/import/exmaralda"), + step_id.clone(), + None, + ) + .unwrap(); + let mut g = AnnotationGraph::with_default_graphstorages(false).unwrap(); + g.apply_update(&mut updates, 
|_| {}).unwrap(); + + // Export the annotation graph, but zip the content + let mut exporter = GraphMLExporter::default(); + exporter.guess_vis = true; + + let output_path = TempDir::new().unwrap(); + + exporter + .export_corpus(&g, output_path.path(), step_id, None) + .unwrap(); + + // Read the generated GraphML file + let result_file_path = output_path.path().join("exmaralda.graphml"); + let graphml = std::fs::read_to_string(result_file_path).unwrap(); + assert_snapshot!(graphml); +} From cab5573c2cf45e080a5a863bc88424ba688539c1 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 7 Aug 2024 16:25:26 +0200 Subject: [PATCH 60/61] Esnure stable order in GraphML for test --- ...aphml__tests__export_graphml_with_vis.snap | 126 +++++++++--------- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/src/exporter/graphml/snapshots/annatto__exporter__graphml__tests__export_graphml_with_vis.snap b/src/exporter/graphml/snapshots/annatto__exporter__graphml__tests__export_graphml_with_vis.snap index f9699887..88c507bb 100644 --- a/src/exporter/graphml/snapshots/annatto__exporter__graphml__tests__export_graphml_with_vis.snap +++ b/src/exporter/graphml/snapshots/annatto__exporter__graphml__tests__export_graphml_with_vis.snap @@ -58,6 +58,22 @@ display_name = "audio" visibility = "preloaded" ]]> + + corpus + + + dipl + eng,eng + corpus + personal-anno-value-1 + personal-anno-value-2 + norm + was late for elicitation + was on time + test_doc + eng + deu + file tests/data/import/exmaralda/clean/import/exmaralda/test_file.wav @@ -176,71 +192,55 @@ visibility = "preloaded" ADP 3.33333-5.55555 - - corpus - - - dipl - eng,eng - corpus - personal-anno-value-1 - personal-anno-value-2 - norm - was late for elicitation - was on time - test_doc - eng - deu - - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + @@ -248,53 +248,53 @@ visibility = "preloaded" - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + From d0b87baff643b36d1b79baa0ee164814305def03 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 7 Aug 2024 16:29:58 +0200 Subject: [PATCH 61/61] Forget to commit argument --- src/exporter/graphml/tests.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/exporter/graphml/tests.rs b/src/exporter/graphml/tests.rs index 6321776a..b6b748c8 100644 --- a/src/exporter/graphml/tests.rs +++ b/src/exporter/graphml/tests.rs @@ -69,6 +69,7 @@ fn export_graphml_with_vis() { // Export the annotation graph, but zip the content let mut exporter = GraphMLExporter::default(); exporter.guess_vis = true; + exporter.stable_order = true; let output_path = TempDir::new().unwrap();
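
A minimal usage sketch tying the series together, assuming the ImportSaltXml importer introduced at the start of this series is fully implemented; it reuses the SaltSampleCorpus test data and the exporter flags shown in the tests above, and the helper function name is made up for illustration.

// Sketch: import the bundled SaltXML sample corpus and export it as GraphML
// with a guessed visualizer configuration and a stable key order.
// Assumptions: `ImportSaltXml` is the SaltXML importer added earlier in this
// series, and `saltxml_to_graphml_sketch` is a hypothetical helper, not
// existing code in the repository.
use std::path::Path;

use graphannis::AnnotationGraph;
use tempfile::TempDir;

use crate::exporter::{graphml::GraphMLExporter, Exporter};
use crate::importer::{saltxml::ImportSaltXml, Importer};
use crate::StepID;

fn saltxml_to_graphml_sketch() {
    let step_id = StepID {
        module_name: "saltxml_to_graphml".to_string(),
        path: None,
    };

    // Import the SaltXML sample corpus into a set of graph updates.
    let importer = ImportSaltXml::default();
    let mut updates = importer
        .import_corpus(
            Path::new("tests/data/import/salt/SaltSampleCorpus"),
            step_id.clone(),
            None,
        )
        .unwrap();

    // Apply the updates to an empty annotation graph.
    let mut g = AnnotationGraph::with_default_graphstorages(false).unwrap();
    g.apply_update(&mut updates, |_| {}).unwrap();

    // guess_vis derives the visualizer configuration (linked audio/video gets
    // the "preloaded" visibility from PATCH 57); stable_order keeps the
    // generated GraphML deterministic (PATCH 60).
    let mut exporter = GraphMLExporter::default();
    exporter.guess_vis = true;
    exporter.stable_order = true;

    let output_path = TempDir::new().unwrap();
    exporter
        .export_corpus(&g, output_path.path(), step_id, None)
        .unwrap();
}

Setting guess_vis is what makes the "preloaded" visibility change from PATCH 57 show up in the exported file, and stable_order (added in PATCH 60 and wired into the test in PATCH 61) is what keeps the snapshot comparison in the new guess_vis test reproducible.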