diff --git a/CHANGELOG.md b/CHANGELOG.md index 54fcda29..c360b038 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `map` manipulator can now add annotated spans and copy values from existing annotations. The copied values can be manipulated using regular expressions and replacement values. +- Added `saltxml` import format ### Fixed diff --git a/Cargo.toml b/Cargo.toml index 5131beb8..f3e73295 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,8 @@ [package] -authors = ["Thomas Krause ", "Martin Klotz "] +authors = [ + "Thomas Krause ", + "Martin Klotz ", +] description = "Converts linguistic data formats based on the graphANNIS data model as intermediate representation and can apply consistency tests." edition = "2018" homepage = "https://github.com/korpling/annatto/" @@ -11,7 +14,7 @@ version = "0.14.0" [dependencies] ansi_term = "0.12" anyhow = "1.0" -clap = {version = "4.0", features = ["derive", "env"]} +clap = { version = "4.0", features = ["derive", "env"] } console = "0.15" csv = "1.1" documented = "0.3.0" @@ -27,32 +30,34 @@ lazy_static = "1.4.0" linked-hash-map = "0.5.6" log = "0.4" normpath = "1.1" -ordered-float = {version = "4.1", default-features = false} +ordered-float = { version = "4.1", default-features = false } pathdiff = "0.2" percent-encoding = "2.3.1" pest = "2.7" pest_derive = "2.0" -quick-xml = "0.31" +quick-xml = "0.34" rayon = "1.1" regex = "1.10" +roxmltree = "0.20.0" serde = "1.0" serde_derive = "1.0" struct-field-names-as-array = "0.3.0" -strum = {version = "0.26.2", features = ["derive"]} -tabled = {version = "0.15", features = ["ansi"]} +strum = { version = "0.26.2", features = ["derive"] } +tabled = { version = "0.15", features = ["ansi"] } tempfile = "3" termimad = "0.29.1" text-splitter = "0.6.3" thiserror = "1.0" toml = "0.8.0" -tracing-subscriber = {version = "0.3", features = ["env-filter"]} +tracing-subscriber = { version = "0.3", 
features = ["env-filter"] } umya-spreadsheet = "~1.1.1" +url = "2.5.2" xml-rs = "0.8" zip = "0.6.6" [dev-dependencies] assert_cmd = "2.0.11" -insta = {version = "1.26.0", features = ["toml", "filters"]} +insta = { version = "1.26.0", features = ["toml", "filters"] } pretty_assertions = "1.3" # Compile some of the dependencies in release mode if when we are ourself in @@ -82,7 +87,12 @@ ci = "github" # The installers to generate for each app installers = [] # Target platforms to build apps for (Rust target-triple syntax) -targets = ["aarch64-apple-darwin", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu", "x86_64-pc-windows-msvc"] +targets = [ + "aarch64-apple-darwin", + "x86_64-apple-darwin", + "x86_64-unknown-linux-gnu", + "x86_64-pc-windows-msvc", +] # The preferred cargo-dist version to use in CI (Cargo.toml SemVer syntax) cargo-dist-version = "0.16.0" # Publish jobs to run in CI diff --git a/docs/README.md b/docs/README.md index 620ee41d..e6894c0e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,5 +1,5 @@ -| Type | Modules | -|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Import formats | [conllu](importers/conllu.md), [exmaralda](importers/exmaralda.md), [graphml](importers/graphml.md), [meta](importers/meta.md), [none](importers/none.md), [opus](importers/opus.md), [path](importers/path.md), [ptb](importers/ptb.md), [relannis](importers/relannis.md), [textgrid](importers/textgrid.md), [toolbox](importers/toolbox.md), [treetagger](importers/treetagger.md), [xlsx](importers/xlsx.md), [xml](importers/xml.md) | -| Export formats | 
[graphml](exporters/graphml.md), [exmaralda](exporters/exmaralda.md), [sequence](exporters/sequence.md), [textgrid](exporters/textgrid.md), [xlsx](exporters/xlsx.md) | -| Graph operations | [check](graph_ops/check.md), [collapse](graph_ops/collapse.md), [visualize](graph_ops/visualize.md), [enumerate](graph_ops/enumerate.md), [link](graph_ops/link.md), [map](graph_ops/map.md), [revise](graph_ops/revise.md), [chunk](graph_ops/chunk.md), [split](graph_ops/split.md), [none](graph_ops/none.md) | \ No newline at end of file +| Type | Modules | +|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Import formats | [conllu](importers/conllu.md), [exmaralda](importers/exmaralda.md), [graphml](importers/graphml.md), [meta](importers/meta.md), [none](importers/none.md), [opus](importers/opus.md), [path](importers/path.md), [ptb](importers/ptb.md), [relannis](importers/relannis.md), [saltxml](importers/saltxml.md), [textgrid](importers/textgrid.md), [toolbox](importers/toolbox.md), [treetagger](importers/treetagger.md), [xlsx](importers/xlsx.md), [xml](importers/xml.md) | +| Export formats | [graphml](exporters/graphml.md), [exmaralda](exporters/exmaralda.md), [sequence](exporters/sequence.md), [textgrid](exporters/textgrid.md), [xlsx](exporters/xlsx.md) | +| Graph operations | [check](graph_ops/check.md), [collapse](graph_ops/collapse.md), [visualize](graph_ops/visualize.md), [enumerate](graph_ops/enumerate.md), [link](graph_ops/link.md), [map](graph_ops/map.md), [revise](graph_ops/revise.md), [chunk](graph_ops/chunk.md), [split](graph_ops/split.md), 
[none](graph_ops/none.md) | \ No newline at end of file diff --git a/docs/exporters/saltxml.md b/docs/exporters/saltxml.md new file mode 100644 index 00000000..46382448 --- /dev/null +++ b/docs/exporters/saltxml.md @@ -0,0 +1,6 @@ +# saltxml (exporter) + +Exports the SaltXML format used by Pepper (<https://corpus-tools.org/pepper/>). +SaltXML is an XMI serialization of the Salt model. + +*No Configuration* diff --git a/docs/importers/saltxml.md b/docs/importers/saltxml.md new file mode 100644 index 00000000..1e3f139a --- /dev/null +++ b/docs/importers/saltxml.md @@ -0,0 +1,14 @@ +# saltxml (importer) + +Imports the SaltXML format used by Pepper (<https://corpus-tools.org/pepper/>). +SaltXML is an XMI serialization of the [Salt model](https://raw.githubusercontent.com/korpling/salt/master/gh-site/doc/salt_modelGuide.pdf). + +## Configuration + +### missing_anno_ns_from_layer + +If `true`, use the layer name as fallback for the namespace annotations +if none is given. This is consistent with how the ANNIS tree visualizer +handles annotations without any namespace. If `false`, use the +`default_ns` namespace as fallback. 
+ diff --git a/src/exporter/graphml.rs b/src/exporter/graphml.rs index 2f28749d..6a724d7b 100644 --- a/src/exporter/graphml.rs +++ b/src/exporter/graphml.rs @@ -238,17 +238,17 @@ fn media_vis(graph: &AnnotationGraph) -> Result, Box { + "mp4" | "avi" | "mov" | "webm" => { vis.push(Visualizer { element: "node".to_string(), layer: None, vis_type: "video".to_string(), display_name: "video".to_string(), - visibility: "hidden".to_string(), + visibility: "preloaded".to_string(), mappings: None, }); } @@ -350,35 +350,36 @@ fn node_annos_vis(graph: &AnnotationGraph) -> Result 1; - let ordered_nodes_are_identical = { - more_than_one_ordering && { - let ordering_components = - graph.get_all_components(Some(AnnotationComponentType::Ordering), None); - let node_sets = ordering_components - .iter() - .map(|c| { - if let Some(strge) = graph.get_graphstorage(c) { - strge - .source_nodes() - .filter_map(|r| if let Ok(n) = r { Some(n) } else { None }) - .collect::>() - } else { - BTreeSet::default() - } - }) - .collect_vec(); - let mut all_same = true; - //for i in 1..node_sets.len() - for (a, b) in node_sets.into_iter().tuple_windows() { - all_same &= matches!(a.cmp(&b), Ordering::Equal); - } - all_same + let ordered_components_contain_identical_nodes = if order_names.len() > 1 { + let ordering_components = + graph.get_all_components(Some(AnnotationComponentType::Ordering), None); + let node_sets = ordering_components + .iter() + .map(|c| { + if let Some(strge) = graph.get_graphstorage(c) { + strge + .source_nodes() + .filter_map(|r| if let Ok(n) = r { Some(n) } else { None }) + .collect::>() + } else { + BTreeSet::default() + } + }) + .collect_vec(); + let mut all_same = true; + //for i in 1..node_sets.len() + for (a, b) in node_sets.into_iter().tuple_windows() { + all_same &= matches!(a.cmp(&b), Ordering::Equal); } + all_same + } else { + // There is only one ordering component + true }; + mappings.insert( "hide_tok".to_string(), - (!ordered_nodes_are_identical).to_string(), + 
(!ordered_components_contain_identical_nodes).to_string(), ); mappings.insert("show_ns".to_string(), "false".to_string()); Ok(Visualizer { diff --git a/src/exporter/graphml/snapshots/annatto__exporter__graphml__tests__export_graphml_with_vis.snap b/src/exporter/graphml/snapshots/annatto__exporter__graphml__tests__export_graphml_with_vis.snap new file mode 100644 index 00000000..88c507bb --- /dev/null +++ b/src/exporter/graphml/snapshots/annatto__exporter__graphml__tests__export_graphml_with_vis.snap @@ -0,0 +1,300 @@ +--- +source: src/exporter/graphml/tests.rs +expression: graphml +--- + + + + + + + + + + + + + + + + + + + + + + + + + + + corpus + + + dipl + eng,eng + corpus + personal-anno-value-1 + personal-anno-value-2 + norm + was late for elicitation + was on time + test_doc + eng + deu + + + file + tests/data/import/exmaralda/clean/import/exmaralda/test_file.wav + + + node + + + + node + + + + node + + + + node + + + + node + + + + node + + + + dipl + node + 0-2.22222 + I'm + I'm + + + dipl + node + 2.22222-3.33333 + in + in + + + dipl + node + 3.33333-4.44444 + New + New + + + dipl + node + 4.44444-5.55555 + York + York + + + norm + node + I + 0-1.11111 + I + + + norm + node + am + 1.11111-2.22222 + am + + + norm + node + in + 2.22222-3.33333 + in + + + norm + node + New York + 3.33333-5.55555 + New York + + + dipl + node + 1 + 0-5.55555 + + + norm + I + node + PRON + 0-1.11111 + + + norm + be + node + VERB + 1.11111-2.22222 + + + norm + in + node + ADP + 2.22222-3.33333 + + + norm + New York + node + ADP + 3.33333-5.55555 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/exporter/graphml/tests.rs b/src/exporter/graphml/tests.rs index 633a9ecb..b6b748c8 100644 --- a/src/exporter/graphml/tests.rs +++ b/src/exporter/graphml/tests.rs @@ -2,6 +2,7 @@ use super::*; use std::path::Path; 
use graphannis::AnnotationGraph; +use insta::assert_snapshot; use tempfile::TempDir; use crate::importer::{exmaralda::ImportEXMARaLDA, Importer}; @@ -47,3 +48,37 @@ fn export_as_zip_with_files() { files ); } + +#[test] +fn export_graphml_with_vis() { + let step_id = StepID { + module_name: "export_graphml".to_string(), + path: None, + }; + let importer = ImportEXMARaLDA::default(); + let mut updates = importer + .import_corpus( + Path::new("tests/data/import/exmaralda/clean/import/exmaralda"), + step_id.clone(), + None, + ) + .unwrap(); + let mut g = AnnotationGraph::with_default_graphstorages(false).unwrap(); + g.apply_update(&mut updates, |_| {}).unwrap(); + + // Export the annotation graph, but zip the content + let mut exporter = GraphMLExporter::default(); + exporter.guess_vis = true; + exporter.stable_order = true; + + let output_path = TempDir::new().unwrap(); + + exporter + .export_corpus(&g, output_path.path(), step_id, None) + .unwrap(); + + // Read the generated GraphML file + let result_file_path = output_path.path().join("exmaralda.graphml"); + let graphml = std::fs::read_to_string(result_file_path).unwrap(); + assert_snapshot!(graphml); +} diff --git a/src/importer/graphml.rs b/src/importer/graphml.rs index db72a173..73a9ce8a 100644 --- a/src/importer/graphml.rs +++ b/src/importer/graphml.rs @@ -136,7 +136,7 @@ fn read_graphml( edge_updates: &mut GraphUpdate, ) -> Result, AnnattoError> { let mut reader = Reader::from_reader(input); - reader.expand_empty_elements(true); + reader.config_mut().expand_empty_elements = true; let mut buf = Vec::new(); diff --git a/src/importer/mod.rs b/src/importer/mod.rs index c1217c88..0669fd42 100644 --- a/src/importer/mod.rs +++ b/src/importer/mod.rs @@ -8,6 +8,7 @@ pub mod none; pub mod opus; pub mod ptb; pub mod relannis; +pub mod saltxml; pub mod textgrid; pub mod toolbox; pub mod treetagger; diff --git a/src/importer/saltxml.rs b/src/importer/saltxml.rs new file mode 100644 index 00000000..2f19f683 --- /dev/null +++ 
b/src/importer/saltxml.rs @@ -0,0 +1,247 @@ +use document::DocumentMapper; +use documented::{Documented, DocumentedFields}; +use graphannis::update::GraphUpdate; +use roxmltree::Node; +use serde::Deserialize; +use struct_field_names_as_array::FieldNamesAsSlice; + +use crate::progress::ProgressReporter; + +use super::Importer; + +/// Imports the SaltXML format used by Pepper (). +/// SaltXML is an XMI serialization of the [Salt model](https://raw.githubusercontent.com/korpling/salt/master/gh-site/doc/salt_modelGuide.pdf). +#[derive(Deserialize, Documented, DocumentedFields, FieldNamesAsSlice)] +#[serde(default, deny_unknown_fields)] +pub struct ImportSaltXml { + /// If `true`, use the layer name as fallback for the namespace annotations + /// if none is given. This is consistent with how the ANNIS tree visualizer + /// handles annotations without any namespace. If `false`, use the + /// `default_ns` namespace as fallback. + missing_anno_ns_from_layer: bool, +} + +impl Default for ImportSaltXml { + fn default() -> Self { + Self { + missing_anno_ns_from_layer: true, + } + } +} + +impl Importer for ImportSaltXml { + fn import_corpus( + &self, + input_path: &std::path::Path, + step_id: crate::StepID, + tx: Option, + ) -> Result> { + let mut updates = GraphUpdate::new(); + // Start with an undetermined progress reporter + let reporter = ProgressReporter::new_unknown_total_work(tx.clone(), step_id.clone())?; + let mapper = corpus_structure::SaltCorpusStructureMapper::new(); + + // Read the corpus structure from the Salt project and get the number of documents to create + reporter.info("Reading SaltXML project structure")?; + let project_file = std::fs::read_to_string(input_path.join("saltProject.salt"))?; + let documents = mapper.map_corpus_structure(&project_file, &mut updates)?; + + // Create a new progress reporter that can now estimate the work based on the number of documents + let reporter = ProgressReporter::new(tx, step_id, documents.len())?; + for 
document_node_name in documents { + reporter.info(&format!("Reading document {document_node_name}"))?; + + let mut relative_document_path = document_node_name.clone(); + relative_document_path.push_str(".salt"); + // Get the path from the node name + let document_path = input_path.join(relative_document_path); + let document_file = std::fs::read_to_string(&document_path)?; + DocumentMapper::read_document( + &document_file, + &document_path, + self.missing_anno_ns_from_layer, + &mut updates, + )?; + reporter.worked(1)?; + } + + Ok(updates) + } + + fn file_extensions(&self) -> &[&str] { + &[] + } +} + +const XSI_NAMESPACE: &str = "http://www.w3.org/2001/XMLSchema-instance"; + +#[derive(Debug, Clone, Copy, PartialEq)] +enum SaltType { + Annotation, + Corpus, + CorpusRelation, + Document, + DocumentRelation, + DominanceRelation, + ElementId, + Feature, + Layer, + MediaDs, + MediaRelation, + MetaAnnotation, + PointingRelation, + Span, + SpanningRelation, + Structure, + TextualDs, + TextualRelation, + Timeline, + TimelineRelation, + Token, + Unknown, +} + +impl SaltType { + fn from_node(n: &Node) -> SaltType { + // Use the xsi:type attribute to determine the type + if let Some(type_id) = n.attribute((XSI_NAMESPACE, "type")) { + match type_id { + "saltCore:SAnnotation" => SaltType::Annotation, + "saltCore:SElementId" => SaltType::ElementId, + "saltCore:SFeature" => SaltType::Feature, + "saltCore:SLayer" => SaltType::Layer, + "saltCore:SMetaAnnotation" => SaltType::MetaAnnotation, + "sCorpusStructure:SCorpus" => SaltType::Corpus, + "sCorpusStructure:SCorpusDocumentRelation" => SaltType::DocumentRelation, + "sCorpusStructure:SCorpusRelation" => SaltType::CorpusRelation, + "sCorpusStructure:SDocument" => SaltType::Document, + "sDocumentStructure:SAudioDS" => SaltType::MediaDs, + "sDocumentStructure:SAudioRelation" => SaltType::MediaRelation, + "sDocumentStructure:SDominanceRelation" => SaltType::DominanceRelation, + "sDocumentStructure:SPointingRelation" => 
SaltType::PointingRelation, + "sDocumentStructure:SSpan" => SaltType::Span, + "sDocumentStructure:SSpanningRelation" => SaltType::SpanningRelation, + "sDocumentStructure:SStructure" => SaltType::Structure, + "sDocumentStructure:STextualDS" => SaltType::TextualDs, + "sDocumentStructure:STextualRelation" => SaltType::TextualRelation, + "sDocumentStructure:STimeline" => SaltType::Timeline, + "sDocumentStructure:STimelineRelation" => SaltType::TimelineRelation, + "sDocumentStructure:SToken" => SaltType::Token, + + _ => SaltType::Unknown, + } + } else { + SaltType::Unknown + } + } +} + +enum SaltObject { + Text(String), + Boolean(bool), + Integer(i64), + Float(f64), + Url(String), + Null, +} + +impl From<&str> for SaltObject { + fn from(value: &str) -> Self { + if let Some(value) = value.strip_prefix("T::") { + SaltObject::Text(value.to_string()) + } else if let Some(value) = value.strip_prefix("U::") { + SaltObject::Url(value.to_string()) + } else if let Some(value) = value.strip_prefix("B::") { + let value = value.to_ascii_lowercase() == "true"; + SaltObject::Boolean(value) + } else if let Some(value) = value.strip_prefix("N::") { + let value = value.parse::().unwrap_or_default(); + SaltObject::Integer(value) + } else if let Some(value) = value.strip_prefix("F::") { + let value = value.parse::().unwrap_or_default(); + SaltObject::Float(value) + } else { + SaltObject::Null + } + } +} + +impl std::fmt::Display for SaltObject { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + SaltObject::Text(val) => write!(f, "{val}"), + SaltObject::Url(val) => write!(f, "{val}"), + SaltObject::Boolean(val) => write!(f, "{val}"), + SaltObject::Integer(val) => write!(f, "{val}"), + SaltObject::Float(val) => write!(f, "{val}"), + SaltObject::Null => write!(f, ""), + } + } +} + +fn get_element_id(n: &Node) -> Option { + for element_id_label in n.children().filter(|c| { + c.tag_name().name() == "labels" && SaltType::from_node(c) == SaltType::ElementId 
+ }) { + if let Some(id) = element_id_label.attribute("value") { + let id = SaltObject::from(id); + return Some(id.to_string().trim_start_matches("salt:/").to_string()); + } + } + None +} + +fn get_features<'a, 'input>(n: &'a Node<'a, 'input>) -> impl Iterator> { + n.children() + .filter(|n| n.tag_name().name() == "labels" && SaltType::from_node(n) == SaltType::Feature) +} + +fn get_annotations<'a, 'input>(n: &'a Node<'a, 'input>) -> impl Iterator> { + n.children().filter(|n| { + n.tag_name().name() == "labels" && SaltType::from_node(n) == SaltType::Annotation + }) +} + +fn get_meta_annotations<'a, 'input>( + n: &'a Node<'a, 'input>, +) -> impl Iterator> { + n.children().filter(|n| { + n.tag_name().name() == "labels" && SaltType::from_node(n) == SaltType::MetaAnnotation + }) +} + +fn get_feature_by_qname(n: &Node, namespace: &str, name: &str) -> Option { + get_features(n) + .filter(|f| { + f.attribute("namespace") == Some(namespace) && f.attribute("name") == Some(name) + }) + .filter_map(|f| f.attribute("value")) + .map(SaltObject::from) + .next() +} + +fn get_referenced_index(attribute_value: &str, tag_name: &str) -> Option { + let mut pattern = String::with_capacity(tag_name.len() + 4); + pattern.push_str("//@"); + pattern.push_str(tag_name); + pattern.push('.'); + + let index_as_str = attribute_value.strip_prefix(&pattern)?; + let idx = index_as_str.parse::().ok()?; + Some(idx) +} + +fn resolve_element<'a>( + attribute_value: &str, + tag_name: &str, + elements: &'a [Node], +) -> Option> { + let idx = get_referenced_index(attribute_value, tag_name)?; + elements.get(idx).copied() +} + +mod corpus_structure; +mod document; + +#[cfg(test)] +mod tests; diff --git a/src/importer/saltxml/corpus_structure.rs b/src/importer/saltxml/corpus_structure.rs new file mode 100644 index 00000000..2a255da4 --- /dev/null +++ b/src/importer/saltxml/corpus_structure.rs @@ -0,0 +1,152 @@ +use std::collections::BTreeSet; + +use anyhow::{anyhow, Ok}; +use graphannis::{ + 
model::AnnotationComponentType, + update::{GraphUpdate, UpdateEvent}, +}; +use graphannis_core::graph::ANNIS_NS; +use itertools::Itertools; + +use super::{ + get_element_id, get_features, get_meta_annotations, resolve_element, SaltObject, SaltType, +}; + +pub(super) struct SaltCorpusStructureMapper {} + +impl SaltCorpusStructureMapper { + pub(super) fn new() -> SaltCorpusStructureMapper { + SaltCorpusStructureMapper {} + } + + pub(super) fn map_corpus_structure( + &self, + input: &str, + updates: &mut GraphUpdate, + ) -> anyhow::Result> { + let doc = roxmltree::Document::parse(input)?; + + let root = doc.root_element(); + if root.tag_name().name() != "SaltProject" { + return Err(anyhow!( + "SaltXML project file must start with tag" + )); + } + + let mut documents = BTreeSet::new(); + + // Iterate over all corpus graphs + for cg in root + .children() + .filter(|t| t.tag_name().name() == "sCorpusGraphs") + { + // Get all nodes + let nodes = cg + .children() + .filter(|t| t.tag_name().name() == "nodes") + .collect_vec(); + + for node in nodes.iter() { + let salt_type = SaltType::from_node(node); + match salt_type { + SaltType::Corpus | SaltType::Document => { + // Get the element ID from the label + let node_name = get_element_id(node) + .ok_or_else(|| anyhow!("Missing element ID for corpus graph node"))?; + // Create the element with the collected properties + updates.add_event(UpdateEvent::AddNode { + node_name: node_name.to_string(), + node_type: "corpus".into(), + })?; + + // Add the document ID to the result + if SaltType::Document == salt_type { + documents.insert(node_name.to_string()); + } + + // Add features as annotations + for feature_node in get_features(node) { + let annos_ns = feature_node.attribute("namespace"); + let anno_name = feature_node.attribute("name").ok_or_else(|| { + anyhow!("Missing \"name\" attribute for node \"{node_name}\"") + })?; + let anno_value = SaltObject::from( + feature_node.attribute("value").unwrap_or_default(), + ); + + if 
annos_ns == Some("salt") { + if anno_name == "SNAME" { + // Only map this specific feature as document name + if salt_type == SaltType::Document { + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.to_string(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "doc".to_string(), + anno_value: anno_value.to_string(), + })?; + } + } + } else { + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.to_string(), + anno_ns: annos_ns.unwrap_or_default().to_string(), + anno_name: anno_name.to_string(), + anno_value: anno_value.to_string(), + })?; + } + } + // Add meta annotations + for anno_node in get_meta_annotations(node) { + let annos_ns = anno_node.attribute("namespace"); + if annos_ns != Some("salt") { + let anno_name = anno_node.attribute("name").ok_or_else(|| { + anyhow!("Missing \"name\" attribute for node \"{node_name}\"") + })?; + let anno_value = SaltObject::from( + anno_node.attribute("value").unwrap_or_default(), + ); + + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.to_string(), + anno_ns: annos_ns.unwrap_or_default().to_string(), + anno_name: anno_name.to_string(), + anno_value: anno_value.to_string(), + })?; + } + } + } + _ => {} + } + } + + // Add a PartOf Edge between parent corpora and the sub-corpora/documents + for e in cg.children().filter(|n| n.tag_name().name() == "edges") { + match SaltType::from_node(&e) { + SaltType::CorpusRelation | SaltType::DocumentRelation => { + let source_ref = e.attribute("source").unwrap_or_default(); + let target_ref = e.attribute("target").unwrap_or_default(); + + let source_node = resolve_element(source_ref, "nodes", &nodes) + .and_then(|n| get_element_id(&n)); + let target_node = resolve_element(target_ref, "nodes", &nodes) + .and_then(|n| get_element_id(&n)); + + if let (Some(source_node), Some(target_node)) = (source_node, target_node) { + // PartOf has the inverse meaning of the corpus and documentation relation in Salt + 
updates.add_event(UpdateEvent::AddEdge { + source_node: target_node, + target_node: source_node, + layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::PartOf.to_string(), + component_name: "".into(), + })?; + } + } + _ => {} + } + } + } + + Ok(documents) + } +} diff --git a/src/importer/saltxml/document.rs b/src/importer/saltxml/document.rs new file mode 100644 index 00000000..c9037339 --- /dev/null +++ b/src/importer/saltxml/document.rs @@ -0,0 +1,714 @@ +use std::{ + collections::BTreeMap, + convert::{TryFrom, TryInto}, + path::Path, +}; + +use anyhow::{bail, Context, Result}; +use graphannis::{ + model::AnnotationComponentType, + update::{GraphUpdate, UpdateEvent}, +}; +use graphannis_core::graph::ANNIS_NS; +use itertools::Itertools; +use normpath::{BasePathBuf, PathExt}; +use roxmltree::Node; +use url::Url; + +use super::{ + get_annotations, get_element_id, get_feature_by_qname, resolve_element, SaltObject, SaltType, +}; + +#[derive(Eq, PartialEq, PartialOrd, Ord, Hash, Clone, Debug)] +pub struct TextProperty { + segmentation: String, + val: i64, +} + +pub(super) struct DocumentMapper<'a, 'input> { + nodes: Vec>, + edges: Vec>, + layers: Vec>, + base_texts: BTreeMap, + media_files: BTreeMap, + document_node_name: String, + token_to_tli: BTreeMap>, + missing_anno_ns_from_layer: bool, + input_directory: BasePathBuf, +} + +impl<'a, 'input> DocumentMapper<'a, 'input> { + pub(super) fn read_document( + input: &'input str, + input_path: &Path, + missing_anno_ns_from_layer: bool, + updates: &mut GraphUpdate, + ) -> Result<()> { + let doc = roxmltree::Document::parse(input)?; + let root = doc.root_element(); + if root.tag_name().name() != "SDocumentGraph" { + bail!("SaltXML document file must start with tag"); + } + + let nodes = doc + .root_element() + .children() + .filter(|n| n.tag_name().name() == "nodes") + .collect_vec(); + + let edges = doc + .root_element() + .children() + .filter(|n| n.tag_name().name() == "edges") + .collect_vec(); + + let 
layers = doc + .root_element() + .children() + .filter(|n| n.tag_name().name() == "layers") + .collect_vec(); + let document_node_name = + get_element_id(&doc.root_element()).context("Missing document ID")?; + + let input_directory = input_path.parent().unwrap_or(input_path).normalize()?; + + let mut mapper = DocumentMapper { + base_texts: BTreeMap::new(), + media_files: BTreeMap::new(), + missing_anno_ns_from_layer, + nodes, + edges, + layers, + document_node_name, + token_to_tli: BTreeMap::new(), + input_directory, + }; + + let timeline = mapper + .nodes + .iter() + .filter(|n| SaltType::from_node(n) == SaltType::Timeline) + .copied() + .next(); + + mapper.map_textual_datasources(updates)?; + mapper.map_media_datasources(updates)?; + mapper.map_tokens(timeline.as_ref(), updates)?; + if let Some(timeline) = timeline { + mapper.map_timeline(&timeline, updates)?; + } + if !mapper.media_files.is_empty() { + mapper.map_media_relations(updates)?; + } + + mapper.map_non_token_nodes(updates)?; + + Ok(()) + } + + fn get_tli_node_name(&self, tli: i64) -> String { + format!("{}#tli{tli}", self.document_node_name) + } + + fn map_timeline(&mut self, timeline: &Node, updates: &mut GraphUpdate) -> Result<()> { + let number_of_tlis = get_feature_by_qname(timeline, "saltCommon", "SDATA") + .context("Missing SDATA attribute for timeline.")?; + if let SaltObject::Integer(number_of_tlis) = number_of_tlis { + let mut previous_tli = None; + for i in 0..number_of_tlis { + let tli_node_name = self.get_tli_node_name(i); + updates.add_event(UpdateEvent::AddNode { + node_name: tli_node_name.clone(), + node_type: "node".to_string(), + })?; + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: tli_node_name.clone(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "tok".to_string(), + anno_value: " ".to_string(), + })?; + updates.add_event(UpdateEvent::AddEdge { + source_node: tli_node_name.clone(), + target_node: self.document_node_name.clone(), + layer: ANNIS_NS.to_string(), + 
component_type: AnnotationComponentType::PartOf.to_string(), + component_name: "".to_string(), + })?; + + if let Some(previous_tli) = previous_tli { + updates.add_event(UpdateEvent::AddEdge { + source_node: previous_tli, + target_node: tli_node_name.clone(), + layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::Ordering.to_string(), + component_name: "".to_string(), + })?; + } + + previous_tli = Some(tli_node_name); + } + } else { + bail!("SDATA attribute for timeline is not a number.") + } + + // Connect the existing non-timeline tokens with the the timeline tokens + for timeline_rel in self + .edges + .iter() + .filter(|rel| SaltType::from_node(rel) == SaltType::TimelineRelation) + { + let source_att = timeline_rel.attribute("source").unwrap_or_default(); + let token_node = resolve_element(source_att, "nodes", &self.nodes) + .context("Token referenced in STimelineRelation cannot be resolved")?; + let token_id = get_element_id(&token_node).context("Token has no ID")?; + + let start = get_feature_by_qname(timeline_rel, "salt", "SSTART") + .context("Missing SSTART attribute for timeline relation")?; + let end = get_feature_by_qname(timeline_rel, "salt", "SEND") + .context("Missing SEND attribute for timeline relation")?; + + if let (SaltObject::Integer(start), SaltObject::Integer(end)) = (start, end) { + for tli in start..end { + updates.add_event(UpdateEvent::AddEdge { + source_node: token_id.clone(), + target_node: self.get_tli_node_name(tli), + layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::Coverage.to_string(), + component_name: "".to_string(), + })?; + } + self.token_to_tli + .insert(token_id, (start..end).collect_vec()); + } else { + bail!("SSTART/SEND not an integer") + } + } + + Ok(()) + } + + fn map_textual_datasources(&mut self, updates: &mut GraphUpdate) -> Result<()> { + for text_node in self + .nodes + .iter() + .filter(|n| SaltType::from_node(n) == SaltType::TextualDs) + { + let element_id = + 
get_element_id(text_node).context("Missing element ID for textual data source")?; + + if let Some(SaltObject::Text(anno_value)) = + get_feature_by_qname(text_node, "saltCommon", "SDATA") + { + self.base_texts.insert(element_id.clone(), anno_value); + updates.add_event(UpdateEvent::AddNode { + node_name: element_id.clone(), + node_type: "datasource".to_string(), + })?; + + updates.add_event(UpdateEvent::AddEdge { + source_node: element_id.clone(), + target_node: self.document_node_name.clone(), + layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::PartOf.to_string(), + component_name: "".to_string(), + })?; + } + } + Ok(()) + } + + fn map_media_datasources(&mut self, updates: &mut GraphUpdate) -> Result<()> { + for media_node in self + .nodes + .iter() + .filter(|n| SaltType::from_node(n) == SaltType::MediaDs) + { + let orig_element_id = get_element_id(media_node) + .context("Missing element ID for media/audio data source")?; + + if let Some(SaltObject::Url(anno_value)) = + get_feature_by_qname(media_node, "salt", "SAUDIO_REFERENCE") + { + // Parse the file URL with the input file location as base path + let base_dir = Url::from_directory_path(self.input_directory.canonicalize()?).ok(); + let referenced_url = Url::options() + .base_url(base_dir.as_ref()) + .parse(&anno_value)?; + + let mut element_id = orig_element_id; + let mut file_path = referenced_url.to_string(); + if referenced_url.scheme() == "file" { + // Resolve this file URL against the input direcotry and + // store it relative to the current working directory. 
+ let referenced_path = Path::new(referenced_url.path()); + let referenced_path = pathdiff::diff_paths( + referenced_path.normalize()?, + &std::env::current_dir()?, + ) + .unwrap_or_else(|| referenced_path.to_path_buf()); + + file_path = referenced_path.to_string_lossy().to_string(); + // Use the file name as element ID + if let Some(file_name) = referenced_path.file_name() { + element_id = format!( + "{}/{}", + self.document_node_name, + file_name.to_string_lossy() + ); + } + }; + updates.add_event(UpdateEvent::AddNode { + node_name: element_id.clone(), + node_type: "file".to_string(), + })?; + + self.media_files + .insert(element_id.clone(), file_path.clone()); + + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: element_id.clone(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "file".to_string(), + anno_value: file_path, + })?; + + updates.add_event(UpdateEvent::AddEdge { + source_node: element_id.clone(), + target_node: self.document_node_name.clone(), + layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::PartOf.to_string(), + component_name: "".to_string(), + })?; + } + } + Ok(()) + } + + fn map_media_relations(&mut self, updates: &mut GraphUpdate) -> Result<()> { + for media_rel in self + .edges + .iter() + .filter(|n| SaltType::from_node(n) == SaltType::MediaRelation) + { + let source_att = media_rel.attribute("source").unwrap_or_default(); + let token_node = resolve_element(source_att, "nodes", &self.nodes) + .context("Token referenced in SAudioRelation cannot be resolved")?; + let token_id = get_element_id(&token_node).context("Token has no ID")?; + + let start = get_feature_by_qname(media_rel, "salt", "SSTART") + .context("Missing SSTART attribute for SAudioRlation")?; + let end = get_feature_by_qname(media_rel, "salt", "SEND") + .context("Missing SEND attribute for SAudioRelation")?; + + if let (SaltObject::Float(start), SaltObject::Float(end)) = (start, end) { + if let Some(covered_tli) = self.token_to_tli.get(&token_id) { 
+ if let (Some(first_tli), Some(last_tli)) = + (covered_tli.first(), covered_tli.last()) + { + if first_tli == last_tli { + // Attach start and end time to the same token + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: self.get_tli_node_name(*first_tli), + anno_ns: "annis".to_string(), + anno_name: "time".to_string(), + anno_value: format!("{start}-{end}"), + })?; + } else { + // Attach start time to first token and end time to + // last token + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: self.get_tli_node_name(*first_tli), + anno_ns: "annis".to_string(), + anno_name: "time".to_string(), + anno_value: format!("{start}-"), + })?; + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: self.get_tli_node_name(*last_tli), + anno_ns: "annis".to_string(), + anno_name: "time".to_string(), + anno_value: format!("-{end}"), + })?; + } + } + } + } else { + bail!("SSTART/SEND not a float") + } + } + + Ok(()) + } + + fn map_node( + &self, + n: &Node, + document_node_name: &str, + updates: &mut GraphUpdate, + ) -> Result<()> { + let id = get_element_id(n).context("Missing element ID for node")?; + updates.add_event(UpdateEvent::AddNode { + node_name: id.clone(), + node_type: "node".to_string(), + })?; + + updates.add_event(UpdateEvent::AddEdge { + source_node: id.clone(), + target_node: document_node_name.to_string(), + layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::PartOf.to_string(), + component_name: "".to_string(), + })?; + + let mut fallback_annotation_namespace = "default_ns".to_string(); + + if let Some(layers_attribute) = n.attribute("layers") { + for layer_ref in layers_attribute.split(' ') { + let layer_node = resolve_element(layer_ref, "layers", &self.layers) + .context("Could not resolve layer")?; + if let Some(SaltObject::Text(layer_name)) = + get_feature_by_qname(&layer_node, "salt", "SNAME") + { + // Use the edge layer as fallback annotation namespace. This is + // consistent with e.g. 
the ANNIS Tree Visualizer handles + // annotations without any namespace. + if self.missing_anno_ns_from_layer { + fallback_annotation_namespace.clone_from(&layer_name); + } + + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: id.clone(), + anno_ns: ANNIS_NS.to_owned(), + anno_name: "layer".to_owned(), + anno_value: layer_name, + })?; + } + } + } + + for label_node in get_annotations(n) { + let anno_ns = label_node + .attribute("namespace") + .unwrap_or(&fallback_annotation_namespace) + .to_string(); + let anno_name = label_node + .attribute("name") + .context("Missing annotation name for node")? + .to_string(); + let anno_value = + SaltObject::from(label_node.attribute("value").unwrap_or_default()).to_string(); + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: id.clone(), + anno_ns, + anno_name, + anno_value, + })?; + } + Ok(()) + } + + fn map_edge( + &self, + rel: &Node, + overwrite_target_node: Option, + component_type: AnnotationComponentType, + fallback_component_name: &str, + updates: &mut GraphUpdate, + ) -> Result<()> { + let source_att_val = rel.attribute("source").unwrap_or_default(); + let source_element = + resolve_element(source_att_val, "nodes", &self.nodes).context("Missing source node")?; + let source_id = get_element_id(&source_element).context("Missing source node ID")?; + + let target_id = if let Some(target_id) = overwrite_target_node { + target_id + } else { + let target_att_val = rel.attribute("target").unwrap_or_default(); + let target_element = resolve_element(target_att_val, "nodes", &self.nodes) + .context("Missing target node")?; + get_element_id(&target_element).context("Missing target node ID")? 
+ }; + + let component_name = get_feature_by_qname(rel, "salt", "STYPE") + .map(|t| t.to_string()) + .unwrap_or_else(|| fallback_component_name.to_string()); + + let mut component_layer = "default_ns".to_string(); + if let Some(layers_attribute) = rel.attribute("layers") { + if let Some(first_layer) = layers_attribute.split(' ').next() { + component_layer = first_layer.to_string(); + } + } + + updates.add_event(UpdateEvent::AddEdge { + source_node: source_id.clone(), + target_node: target_id.clone(), + layer: component_layer.clone(), + component_type: component_type.to_string(), + component_name: component_name.clone(), + })?; + + if component_type == AnnotationComponentType::Dominance { + // Also add to the special component with the empty name, which includes all dominance edges from all STypes. + updates.add_event(UpdateEvent::AddEdge { + source_node: source_id.clone(), + target_node: target_id.clone(), + layer: ANNIS_NS.to_string(), + component_type: component_type.to_string(), + component_name: "".to_string(), + })?; + } + + let fallback_annotation_namespace = if self.missing_anno_ns_from_layer { + &component_layer + } else { + "default_ns" + }; + + for label_element in get_annotations(rel) { + let anno_ns = label_element + .attribute("namespace") + .unwrap_or(fallback_annotation_namespace) + .to_string(); + + let anno_name = label_element + .attribute("name") + .context("Missing annotation name for edge")? 
+ .to_string(); + let anno_value = + SaltObject::from(label_element.attribute("value").unwrap_or_default()).to_string(); + updates.add_event(UpdateEvent::AddEdgeLabel { + source_node: source_id.clone(), + target_node: target_id.clone(), + layer: component_layer.clone(), + component_type: component_type.to_string(), + component_name: component_name.clone(), + anno_ns, + anno_name, + anno_value, + })?; + } + Ok(()) + } + + fn map_tokens(&self, timeline: Option<&Node>, updates: &mut GraphUpdate) -> Result<()> { + // Map the token nodes in the same order as in the SaltXML file + for token_node in self + .nodes + .iter() + .filter(|n| SaltType::from_node(n) == SaltType::Token) + { + self.map_node(token_node, &self.document_node_name, updates)?; + } + + // Order textual relations by their start offset, so we iterate in the + // actual order of the tokens. + let sorted_text_rels: BTreeMap = self + .edges + .iter() + .filter(|n| SaltType::from_node(n) == SaltType::TextualRelation) + .map(|text_rel| { + let start = + get_feature_by_qname(text_rel, "salt", "SSTART").unwrap_or(SaltObject::Null); + let referenced_text_node = resolve_element( + text_rel.attribute("target").unwrap_or_default(), + "nodes", + &self.nodes, + ) + .and_then(|n| get_feature_by_qname(&n, "salt", "SNAME")) + .map(|o| o.to_string()) + .unwrap_or_default(); + let val = if let SaltObject::Integer(start) = start { + start + } else { + -1 + }; + let prop = TextProperty { + segmentation: referenced_text_node, + val, + }; + (prop, *text_rel) + }) + .collect(); + + // Connect the token to the texts by the textual relations + let mut previous_token: Option<(TextProperty, String)> = None; + let mut sorted_text_rels = sorted_text_rels.into_iter().peekable(); + while let Some((text_prop, text_rel)) = sorted_text_rels.next() { + if let Some(p) = &previous_token { + // If the segmentation changes, there is no previous token + if p.0.segmentation != text_prop.segmentation { + previous_token = None; + } + } + + let 
source_att_val = text_rel.attribute("source").unwrap_or_default(); + let token = + resolve_element(source_att_val, "nodes", &self.nodes).with_context(|| { + format!("Textual relation source \"{source_att_val}\" could not be resolved") + })?; + let token_id = get_element_id(&token).context("Missing ID for token")?; + + let target_att_val = text_rel.attribute("target").unwrap_or_default(); + let datasource = + resolve_element(target_att_val, "nodes", &self.nodes).with_context(|| { + format!("Textual relation target \"{target_att_val}\" could not be resolved") + })?; + let datasource_id = get_element_id(&datasource).context("Missing ID for token")?; + + // Get the string for this token + let matching_base_text = self + .base_texts + .get(&datasource_id) + .with_context(|| format!("Missing base text for token {token_id}"))?; + // Our indices are refering to characters not bytes + let matching_base_text = matching_base_text.chars().collect_vec(); + let start = + get_feature_by_qname(&text_rel, "salt", "SSTART").context("Missing start value")?; + let end = + get_feature_by_qname(&text_rel, "salt", "SEND").context("Missing end value")?; + if let (SaltObject::Integer(start), SaltObject::Integer(end)) = (start, end) { + let start = usize::try_from(start)?; + let end = usize::try_from(end)?; + let covered_text = &matching_base_text[start..end]; + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: token_id.clone(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "tok".to_string(), + anno_value: covered_text.iter().collect(), + })?; + if timeline.is_some() { + // Add the token value as additional annotation + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: token_id.clone(), + anno_ns: text_prop.segmentation.clone(), + anno_name: text_prop.segmentation.clone(), + anno_value: covered_text.iter().collect(), + })?; + } + + // Get the whitespace before the first token + if previous_token.is_none() && start > 0 { + let whitespace = &matching_base_text[0..start]; + 
updates.add_event(UpdateEvent::AddNodeLabel { + node_name: token_id.clone(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "tok-whitespace-before".to_string(), + anno_value: whitespace.iter().collect(), + })?; + } + + // Add whitespace after this token + let next_token_offset = sorted_text_rels + .peek() + .map(|(prop, _)| prop.val) + .unwrap_or_else(|| matching_base_text.len().try_into().unwrap_or(i64::MAX)); + let next_token_offset = usize::try_from(next_token_offset).unwrap_or(0); + + if next_token_offset > end && (next_token_offset - end) >= 1 { + let whitespace = &matching_base_text[end..next_token_offset]; + updates.add_event(UpdateEvent::AddNodeLabel { + node_name: token_id.clone(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "tok-whitespace-after".to_string(), + anno_value: whitespace.iter().collect(), + })?; + } + } + // Add ordering edges between the tokens for the base token layer + if let Some(previous_token) = previous_token { + let component_name = if timeline.is_some() { + text_prop.segmentation.clone() + } else { + "".to_string() + }; + updates.add_event(UpdateEvent::AddEdge { + source_node: previous_token.1.clone(), + target_node: token_id.clone(), + layer: ANNIS_NS.to_string(), + component_type: AnnotationComponentType::Ordering.to_string(), + component_name, + })?; + } + previous_token = Some((text_prop, token_id)); + } + + Ok(()) + } + + fn map_non_token_nodes(&self, updates: &mut GraphUpdate) -> Result<()> { + for span_node in self.nodes.iter().filter(|n| { + let t = SaltType::from_node(n); + t == SaltType::Span || t == SaltType::Structure + }) { + self.map_node(span_node, &self.document_node_name, updates)?; + } + + // Connect all spans with the token using the spanning relations + + for spanning_rel in self + .edges + .iter() + .filter(|rel| SaltType::from_node(rel) == SaltType::SpanningRelation) + { + let target_att = spanning_rel + .attribute("target") + .context("Missing target attribute for SSpanningRelation")?; + let target_node = 
resolve_element(target_att, "nodes", &self.nodes) + .context("Could not resolve target for SSpanningRelation")?; + let target_node_id = get_element_id(&target_node).context("Target token has no ID")?; + + if let Some(tli_token) = self.token_to_tli.get(&target_node_id) { + // Add a coverage edge to the indirectly covered timeline item token + for tli in tli_token { + let tli_id = self.get_tli_node_name(*tli); + self.map_edge( + spanning_rel, + Some(tli_id), + AnnotationComponentType::Coverage, + "", + updates, + )?; + } + } else { + // Directly map the coverage edge + self.map_edge( + spanning_rel, + None, + AnnotationComponentType::Coverage, + "", + updates, + )?; + } + } + // Add all dominance relations + for dominance_rel in self + .edges + .iter() + .filter(|rel| SaltType::from_node(rel) == SaltType::DominanceRelation) + { + self.map_edge( + dominance_rel, + None, + AnnotationComponentType::Dominance, + "edge", + updates, + )?; + } + + // Add all pointing relations + for pointing_rel in self + .edges + .iter() + .filter(|rel| SaltType::from_node(rel) == SaltType::PointingRelation) + { + self.map_edge( + pointing_rel, + None, + AnnotationComponentType::Pointing, + "edge", + updates, + )?; + } + Ok(()) + } +} diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap new file mode 100644 index 00000000..f8c22fac --- /dev/null +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_sample_corpus.snap @@ -0,0 +1,1475 @@ +--- +source: src/importer/saltxml/tests.rs +expression: actual +--- + + + + + + + + + + + + + + + corpus + + + corpus + + + corpus + + + doc1 + corpus + + + doc2 + corpus + + + doc3 + corpus + + + doc4 + corpus + + + datasource + + + morphology + be + node + VBZ + Is + + + + morphology + this + node + DT + this + + + + morphology + example + node + NN + example + + + + morphology 
+ more + node + RBR + more + + + + morphology + complicated + node + JJ + complicated + + + + morphology + than + node + IN + than + + + + morphology + it + node + PRP + it + + + + morphology + appear + node + VBZ + appears + + + + morphology + to + node + TO + to + + + + morphology + be + node + VB + be + + + morphology + ? + node + . + ? + + + contrast-focus + node + + + topic + node + + + ROOT + syntax + node + + + SQ + syntax + node + + + NP + syntax + node + + + ADJP + syntax + node + + + ADJP + syntax + node + + + SBar + syntax + node + + + S + syntax + node + + + NP + syntax + node + + + VP + syntax + node + + + S + syntax + node + + + VP + syntax + node + + + VP + syntax + node + + + node + + + datasource + + + morphology + be + node + VBZ + Is + + + + morphology + this + node + DT + this + + + + morphology + example + node + NN + example + + + + morphology + more + node + RBR + more + + + + morphology + complicated + node + JJ + complicated + + + + morphology + than + node + IN + than + + + + morphology + it + node + PRP + it + + + + morphology + appear + node + VBZ + appears + + + + morphology + to + node + TO + to + + + + morphology + be + node + VB + be + + + morphology + ? + node + . + ? 
+ + + contrast-focus + node + + + topic + node + + + ROOT + syntax + node + + + SQ + syntax + node + + + NP + syntax + node + + + ADJP + syntax + node + + + ADJP + syntax + node + + + SBar + syntax + node + + + S + syntax + node + + + NP + syntax + node + + + VP + syntax + node + + + S + syntax + node + + + VP + syntax + node + + + VP + syntax + node + + + node + + + datasource + + + morphology + be + node + VBZ + Is + + + + morphology + this + node + DT + this + + + + morphology + example + node + NN + example + + + + morphology + more + node + RBR + more + + + + morphology + complicated + node + JJ + complicated + + + + morphology + than + node + IN + than + + + + morphology + it + node + PRP + it + + + + morphology + appear + node + VBZ + appears + + + + morphology + to + node + TO + to + + + + morphology + be + node + VB + be + + + morphology + ? + node + . + ? + + + contrast-focus + node + + + topic + node + + + ROOT + syntax + node + + + SQ + syntax + node + + + NP + syntax + node + + + ADJP + syntax + node + + + ADJP + syntax + node + + + SBar + syntax + node + + + S + syntax + node + + + NP + syntax + node + + + VP + syntax + node + + + S + syntax + node + + + VP + syntax + node + + + VP + syntax + node + + + node + + + datasource + + + morphology + be + node + VBZ + Is + + + + morphology + this + node + DT + this + + + + morphology + example + node + NN + example + + + + morphology + more + node + RBR + more + + + + morphology + complicated + node + JJ + complicated + + + + morphology + than + node + IN + than + + + + morphology + it + node + PRP + it + + + + morphology + appear + node + VBZ + appears + + + + morphology + to + node + TO + to + + + + morphology + be + node + VB + be + + + morphology + ? + node + . + ? 
+ + + contrast-focus + node + + + topic + node + + + ROOT + syntax + node + + + SQ + syntax + node + + + NP + syntax + node + + + ADJP + syntax + node + + + ADJP + syntax + node + + + SBar + syntax + node + + + S + syntax + node + + + NP + syntax + node + + + VP + syntax + node + + + S + syntax + node + + + VP + syntax + node + + + VP + syntax + node + + + node + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + det + + + cop + + + nsubj + + + advmod + + + advcl + + + mark + + + nsubj + + + xcomp + + + aux + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
diff --git a/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap new file mode 100644 index 00000000..72345abe --- /dev/null +++ b/src/importer/saltxml/snapshots/annatto__importer__saltxml__tests__read_salt_with_timeline.snap @@ -0,0 +1,2993 @@ +--- +source: src/importer/saltxml/tests.rs +expression: actual +--- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + corpus + + + SPK0 + SPK1 + German + deu + deu + corpus + m + m + dialog.demo + dialog.demo + BeMaTaC + Part of a dialog recorded in a map task setting + dialog.demo + HU Berlin + deu + deu + + + datasource + + + datasource + + + file + tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.webm + + + node + naja + naja + + + node + pass + pass + + + node + auf + auf + + + node + also + also + + + node + du + du + + + node + hast + hast + + + node + jetz + jetz + + + node + wohl + wohl + + + node + hier + hier + + + node + auch + auch + + + node + so + so + + + node + ne + ne + + + node + Karte + Karte + + + node + wie + wie + + + node + ich + ich + + + node + bloß + bloß + + + node + ich + ich + + + node + hab + hab + + + node + ne + ne + + + node + Linie + Linie + + + node + und + und + + + node + du + du + + + node + nich + nich + + + node + eine + eine + + + node + genau + genau + + + node + und + und + + + node + ich + ich + + + node + muss + muss + + + node + dir + dir + + + node + jetzt + jetzt + + + node + erklärn + erklärn + + + node + wie + wie + + + node + du + du + + + node + vom + vom + + + node + Start + Start + + + node + zum + zum + + + node + Ziel + Ziel + + + node + kommst + kommst + + + node + so + so + + + node + wie + wie + + + node + meine + meine + + + node + Linie + Linie + + + node + geht + geht + + + node + so + so + + + node + also + also + + + node + du + du + + + node + hast + hast + + + node + n + n + + + node + Stift + Stift + 
+ + node + okay + okay + + + node + aso + aso + + + node + du + du + + + node + musst + musst + + + node + jetzt + jetzt + + + node + vom + vom + + + node + Startpunkt + Startpunkt + + + node + äh + äh + + + node + waagerecht + waagerecht + + + node + Richtung + Richtung + + + node + also + also + + + node + zu + zu + + + node + dem + dem + + + node + ersten + ersten + + + node + Bild + Bild + + + node + erstmal + erstmal + + + node + und + und + + + node + zum + zum + + + node + Rad + Rad + + + node + äh + äh + + + node + ((lacht)) + ((lacht)) + + + node + fang + fang + + + node + einfach + einfach + + + node + ma + ma + + + node + an + an + + + node + ((lacht)) + ((lacht)) + + + node + nee + nee + + + node + ich + ich + + + node + hab + hab + + + node + gar + gar + + + node + keine + keine + + + node + Linien + Linien + + + node + ich + ich + + + node + hab + hab + + + node + Start + Start + + + node + und + und + + + node + Ziel + Ziel + + + node + genau + genau + + + node + jap + jap + + + node + mhm + mhm + + + node + zum + zum + + + node + (?) + (?) 
+ + + node + Schornsteinfeger + Schornsteinfeger + + + node + 0-0.6884364908323192 + + + + node + 0.6884364908323192-1.6052463032285857 + + + + node + 1.6052463032285857-1.809765415224676 + + + + node + 1.809765415224676-2.12712265797723 + + + + node + 2.12712265797723-2.3069584288703435 + + + + node + 2.3069584288703435-2.462110858660481 + + + + node + 2.462110858660481-3.061563428304194 + + + + node + 3.061563428304194-3.3859730542290265 + + + + node + 3.3859730542290265-3.564405882202723 + + + + node + 3.564405882202723-3.7724511857849525 + + + + node + 3.7724511857849525-3.9840226809533217 + + + + node + 3.9840226809533217-4.06865127902067 + + + + node + 4.06865127902067-4.290801348947457 + + + + node + 4.290801348947457-4.61521097487229 + + + + node + 4.61521097487229-4.766837213076288 + + + + node + 4.766837213076288-5.045406348381307 + + + + node + 5.045406348381307-5.182927820240747 + + + + node + 5.182927820240747-5.348658824789303 + + + + node + 5.348658824789303-5.50381125457944 + + + + node + 5.50381125457944-5.785906581470599 + + + + node + 5.785906581470599-5.969268543949853 + + + + node + 5.969268543949853-6.187892422290501 + + + + node + 6.187892422290501-6.492035791177925 + + + + node + 6.492035791177925-6.689502520001737 + + + + node + 6.689502520001737-6.819971608688898 + + + + node + 6.819971608688898-6.957493080548338 + + + + node + 6.957493080548338-7.3101122391622875 + + + + node + 7.3101122391622875-7.4970003932276805 + + + + node + 7.4970003932276805-7.578102799708889 + + + + node + 7.578102799708889-7.990667215287209 + + + + node + 7.990667215287209-8.198712518869439 + + + + node + 8.198712518869439-8.343286373901158 + + + + node + 8.343286373901158-8.477281654174458 + + + + node + 8.477281654174458-8.61127693444776 + + + + node + 8.61127693444776-8.766429364237897 + + + + node + 8.766429364237897-8.94979132671715 + + + + node + 8.94979132671715-9.111996139679567 + + + + node + 9.111996139679567-9.193098546160776 + + + + node + 
9.193098546160776-9.270674761055844 + + + + node + 9.270674761055844-9.54771075568088 + + + + node + 9.54771075568088-9.727546526573994 + + + + node + 9.727546526573994- + + + + node + -10.041377577740407 + + + + node + + + + node + 10.196530007530546-10.358734820492963 + + + + node + 10.358734820492963-10.5138872502831 + + + + node + 10.5138872502831-10.718840651322928 + + + + node + 10.718840651322928-10.9127811885606 + + + + node + 10.9127811885606-11.099669342625994 + + + + node + 11.099669342625994-11.836643384129147 + + + + node + + + + node + 12.50309359390951-12.619457916252113 + + + + node + 12.619457916252113-12.71466508907788 + + + + node + 12.71466508907788-12.933288967418529 + + + + node + 12.933288967418529-13.25769859334336 + + + + node + 13.25769859334336-13.511584387545403 + + + + node + 13.511584387545403-13.850098779814795 + + + + node + 13.850098779814795-14.426381411663305 + + + + node + + + + node + 14.704950546968325-14.934153000067392 + + + + node + 14.934153000067392-15.121041154132785 + + + + node + 15.121041154132785-15.420767438954641 + + + + node + 15.420767438954641-15.794543747085427 + + + + node + 15.794543747085427-16.147162905699375 + + + + node + + + + node + 16.390470125143-17.00737639011335 + + + + node + + + + node + 17.113162137697536-17.783138539064044 + + + + node + 17.783138539064044-18.132231506091856 + + + + node + 18.132231506091856-18.195702954642368 + + + + node + 18.195702954642368-18.38259110870776 + + + + node + 18.38259110870776-18.449588748844413 + + + + node + 18.449588748844413-18.932676996145524 + + + + node + 18.932676996145524-19.953208071992766 + + + + node + 19.953208071992766-20.31816890902807 + + + + node + 20.31816890902807-20.471558246332762 + + + + node + 20.471558246332762-20.572054708704805 + + + + node + 20.572054708704805-20.75717977096909 + + + + node + 20.75717977096909-20.963461983206436 + + + + node + 20.963461983206436-21.132719182990925 + + + + node + 21.132719182990925-21.952558744447042 + + 
+ + node + + + + node + 22.042476631832553-23.116201992965404 + + + + node + 23.116201992965404-23.7720736421303 + + + + node + 23.7720736421303-24.956836985878773 + + + + node + 24.956836985878773-25.965327779514674 + + + + node + + + + node + 26.71640658736239-26.86310541935397 + + + + node + 26.86310541935397-26.972417358524297 + + + + node + 26.972417358524297-27.095834064039177 + + + + node + 27.095834064039177-27.392034157274896 + + + + node + 27.392034157274896-27.920962895195824 + + + + node + 27.920962895195824-28.739039343180192 + + + + node + 28.739039343180192-29.052870394346606 + + + + node + 29.052870394346606-29.882474847095374 + + + + node + + + + node + 30.16457018006952-30.568359047306725 + + + + node + 30.568359047306725- + + + + node + 30.57541143063108-30.93508298017312 + + + + node + -31.760211829122508 + + + + node + + + + node + + + + node + naja + + + node + pass + + + node + auf + + + node + also + + + node + du + + + node + hast + + + node + jetzt + + + node + wohl + + + node + hier + + + node + auch + + + node + so + + + node + eine + + + node + Karte + + + node + wie + + + node + ich + + + node + bloß + + + node + ich + + + node + habe + + + node + eine + + + node + Linie + + + node + und + + + node + du + + + node + nicht + + + node + ne + + + node + genau + + + node + und + + + node + ich + + + node + muss + + + node + dir + + + node + jetzt + + + node + erklären + + + node + wie + + + node + du + + + node + vom + + + node + Start + + + node + zum + + + node + Ziel + + + node + kommst + + + node + so + + + node + wie + + + node + meine + + + node + Linie + + + node + so + + + node + also + + + node + du + + + node + hast + + + node + einen + + + node + Stift + + + node + okay + + + node + also + + + node + du + + + node + musst + + + node + jetzt + + + node + vom + + + node + Startpunkt + + + node + äh + + + node + waagerecht + + + node + Richtung + + + node + also + + + node + zu + + + node + dem + + + node + ersten + + + node + Bild 
+ + + node + erstmal + + + node + und + + + node + zum + + + node + Rad + + + node + naja pass auf also du hast jetzt wohl hier auch so ne Karte wie ich bloß ich hab ne Linie und du nich ne + + + node + genau + + + node + und ich muss dir jetzt erklären wie du vom Start zum Ziel kommst so wie meine Linie geht + + + node + so + + + node + also hast n Stift + + + node + okay also du musst jetzt vom Startpunkt äh waagerecht Richtung also zu dem ersten Bild erstmal + + + node + und + + + node + zum Rad + + + node + äh + + + node + fang + + + node + einfach + + + node + mal + + + node + an + + + node + nee + + + node + ich + + + node + habe + + + node + gar + + + node + keine + + + node + Linien + + + node + ich + + + node + habe + + + node + Start + + + node + und + + + node + Ziel + + + node + genau + + + node + jap + + + node + mhm + + + node + zum + + + node + (?) + + + node + Schornsteinfeger + + + node + äh fang einfach ma an + + + node + nee ich hab gar keine Linie ich hab Start und Ziel + + + node + genau + + + node + jap + + + node + zum (?) 
Schornsteinfeger + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/importer/saltxml/tests.rs b/src/importer/saltxml/tests.rs new file mode 100644 index 00000000..34a5134b --- /dev/null +++ b/src/importer/saltxml/tests.rs @@ -0,0 +1,30 @@ +use std::path::Path; + +use insta::assert_snapshot; + +use super::*; +use crate::test_util; + +#[test] +fn read_salt_sample_corpus() { + let importer = ImportSaltXml::default(); + let actual = test_util::import_as_graphml_string( + importer, + Path::new("tests/data/import/salt/SaltSampleCorpus"), + None, + ) + .unwrap(); + assert_snapshot!(actual); +} + +#[test] +fn read_salt_with_timeline() { + let importer = ImportSaltXml::default(); + let actual = test_util::import_as_graphml_string( + importer, + Path::new("tests/data/import/salt/dialog.demo"), + None, + ) + .unwrap(); + assert_snapshot!(actual); +} diff --git a/src/lib.rs b/src/lib.rs index 
b3127c8a..554b78b5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,8 +27,9 @@ use graphannis::AnnotationGraph; use importer::{ conllu::ImportCoNLLU, exmaralda::ImportEXMARaLDA, file_nodes::CreateFileNodes, graphml::GraphMLImporter, meta::AnnotateCorpus, none::CreateEmptyCorpus, opus::ImportOpusLinks, - ptb::ImportPTB, relannis::ImportRelAnnis, textgrid::ImportTextgrid, toolbox::ImportToolBox, - treetagger::ImportTreeTagger, xlsx::ImportSpreadsheet, xml::ImportXML, Importer, + ptb::ImportPTB, relannis::ImportRelAnnis, saltxml::ImportSaltXml, textgrid::ImportTextgrid, + toolbox::ImportToolBox, treetagger::ImportTreeTagger, xlsx::ImportSpreadsheet, xml::ImportXML, + Importer, }; use manipulator::{ check::Check, chunker::Chunk, collapse::Collapse, enumerate::EnumerateMatches, link::LinkNodes, @@ -142,6 +143,7 @@ pub enum ReadFrom { Path(#[serde(default)] CreateFileNodes), PTB(#[serde(default)] ImportPTB), RelAnnis(#[serde(default)] ImportRelAnnis), + SaltXml(#[serde(default)] ImportSaltXml), TextGrid(#[serde(default)] ImportTextgrid), Toolbox(#[serde(default)] ImportToolBox), TreeTagger(#[serde(default)] ImportTreeTagger), @@ -168,6 +170,7 @@ impl ReadFrom { ReadFrom::Path(m) => m, ReadFrom::PTB(m) => m, ReadFrom::RelAnnis(m) => m, + ReadFrom::SaltXml(m) => m, ReadFrom::TextGrid(m) => m, ReadFrom::Toolbox(m) => m, ReadFrom::TreeTagger(m) => m, @@ -189,6 +192,7 @@ impl ReadFromDiscriminants { ReadFromDiscriminants::Path => CreateFileNodes::DOCS, ReadFromDiscriminants::PTB => ImportPTB::DOCS, ReadFromDiscriminants::RelAnnis => ImportRelAnnis::DOCS, + ReadFromDiscriminants::SaltXml => ImportSaltXml::DOCS, ReadFromDiscriminants::TextGrid => ImportTextgrid::DOCS, ReadFromDiscriminants::Toolbox => ImportToolBox::DOCS, ReadFromDiscriminants::TreeTagger => ImportTreeTagger::DOCS, @@ -249,6 +253,10 @@ impl ReadFromDiscriminants { ImportRelAnnis::FIELD_NAMES_AS_SLICE, ImportRelAnnis::FIELD_DOCS, ), + ReadFromDiscriminants::SaltXml => ( + ImportSaltXml::FIELD_NAMES_AS_SLICE, + 
ImportSaltXml::FIELD_DOCS, + ), }; for (idx, n) in field_names.iter().enumerate() { if idx < field_docs.len() { diff --git a/src/manipulator/visualize.rs b/src/manipulator/visualize.rs index 370aff45..bb2efa51 100644 --- a/src/manipulator/visualize.rs +++ b/src/manipulator/visualize.rs @@ -10,7 +10,11 @@ use graphannis::{ model::{AnnotationComponent, AnnotationComponentType}, AnnotationGraph, }; -use graphannis_core::types::NodeID as GraphAnnisNodeID; +use graphannis_core::{ + annostorage::ValueSearch, + graph::{ANNIS_NS, NODE_TYPE}, + types::NodeID as GraphAnnisNodeID, +}; use graphannis_core::{ dfs, graph::{storage::union::UnionEdgeContainer, NODE_NAME_KEY}, @@ -146,7 +150,7 @@ impl Visualize { } output.add_stmt(stmt!(subgraph)); - // Add all other nodes that are somehow connected to the included token + // Add all other nodes that are somehow connected to the included token and the document let all_components = graph.get_all_components(None, None); let all_gs = all_components .iter() @@ -171,6 +175,32 @@ impl Visualize { } } } + // Add all datasource nodes if they are connected to the included documents have not been already added + let part_of_gs = graph + .get_all_components(Some(AnnotationComponentType::PartOf), None) + .into_iter() + .filter_map(|c| graph.get_graphstorage(&c)) + .collect_vec(); + for ds in graph.get_node_annos().exact_anno_search( + Some(ANNIS_NS), + NODE_TYPE, + ValueSearch::Some("datasource"), + ) { + let ds = ds?.node; + if !included_nodes.contains(&ds) { + // The datsource must be part of a document node that is already included + let mut outgoing = HashSet::new(); + for gs in part_of_gs.iter() { + for o in gs.get_outgoing_edges(ds) { + outgoing.insert(o?); + } + } + if outgoing.intersection(&included_nodes).next().is_some() { + output.add_stmt(self.create_node_stmt(ds, graph)?); + included_nodes.insert(ds); + } + } + } // Output all edges grouped by their component for component in all_components.iter() { diff --git 
a/tests/data/import/graphml/single_sentence.graphml b/tests/data/import/graphml/single_sentence.graphml index 2f6d512e..ec30cf52 100644 --- a/tests/data/import/graphml/single_sentence.graphml +++ b/tests/data/import/graphml/single_sentence.graphml @@ -39,7 +39,7 @@ visibility = "hidden" [visualizers.mappings] annos = "/default_ns::pos/,/syntax::cat/" escape_html = "false" -hide_tok = "true" +hide_tok = "false" show_ns = "false" ]]> diff --git a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt new file mode 100644 index 00000000..653bee7e --- /dev/null +++ b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc1.salt @@ -0,0 +1,409 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt new file mode 100644 index 00000000..c44261e6 --- /dev/null +++ b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus1/doc2.salt @@ -0,0 +1,364 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt new file mode 100644 index 00000000..fd3f508a --- /dev/null +++ b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc3.salt @@ -0,0 +1,364 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt new file mode 100644 index 00000000..0d3170f3 --- /dev/null +++ 
b/tests/data/import/salt/SaltSampleCorpus/rootCorpus/subCorpus2/doc4.salt @@ -0,0 +1,364 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/data/import/salt/SaltSampleCorpus/saltProject.salt b/tests/data/import/salt/SaltSampleCorpus/saltProject.salt new file mode 100644 index 00000000..d4ced77b --- /dev/null +++ b/tests/data/import/salt/SaltSampleCorpus/saltProject.salt @@ -0,0 +1,63 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.salt b/tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.salt new file mode 100644 index 00000000..31d70fb3 --- /dev/null +++ b/tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.salt @@ -0,0 +1,3278 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.webm b/tests/data/import/salt/dialog.demo/dialog.demo/dialog.demo.webm new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/import/salt/dialog.demo/saltProject.salt b/tests/data/import/salt/dialog.demo/saltProject.salt new file mode 100644 index 00000000..ebc53c08 --- /dev/null +++ b/tests/data/import/salt/dialog.demo/saltProject.salt @@ -0,0 +1,34 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/snapshots/cli__list_modules.snap b/tests/snapshots/cli__list_modules.snap index e63d9fd8..c9ba7244 100644 --- a/tests/snapshots/cli__list_modules.snap +++ b/tests/snapshots/cli__list_modules.snap @@ -2,10 +2,12 @@ source: tests/cli.rs expression: output --- -| Type | Modules | -|------------------|-------------------------------------------------------------------------------------------------------------| -| Import formats | conllu, exmaralda, graphml, meta, none, opus, path, ptb, relannis, textgrid, toolbox, treetagger, xlsx, xml | -| Export formats | graphml, exmaralda, sequence, textgrid, xlsx | -| Graph operations | check, collapse, visualize, enumerate, link, map, revise, chunk, split, none | +| Type | Modules | +|------------------|----------------------------------------------------------------------------------------------------------------------| +| Import formats | conllu, exmaralda, graphml, meta, none, opus, path, ptb, relannis, saltxml, textgrid, toolbox, treetagger, xlsx, xml | +| Export formats | graphml, exmaralda, sequence, textgrid, xlsx | +| Graph operations | check, collapse, visualize, enumerate, link, map, revise, chunk, split, none | Use `annatto info ` to get more information about one of the formats or graph operations. + +