Skip to content

Commit

Permalink
Merge pull request #261 from korpling/feature/saltxml
Browse files Browse the repository at this point in the history
Add SaltXML importer
  • Loading branch information
thomaskrause committed Aug 7, 2024
2 parents ac5c4df + d0b87ba commit 2b7a78f
Show file tree
Hide file tree
Showing 28 changed files with 10,948 additions and 53 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `map` manipulator can now add annotated spans and copy values from existing
annotations. The copied values can be manipulated using regular expressions and
replacement values.
- Added `saltxml` import format

### Fixed

Expand Down
28 changes: 19 additions & 9 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
[package]
authors = ["Thomas Krause <[email protected]>", "Martin Klotz <[email protected]>"]
authors = [
"Thomas Krause <[email protected]>",
"Martin Klotz <[email protected]>",
]
description = "Converts linguistic data formats based on the graphANNIS data model as intermediate representation and can apply consistency tests."
edition = "2018"
homepage = "https://github.com/korpling/annatto/"
Expand All @@ -11,7 +14,7 @@ version = "0.14.0"
[dependencies]
ansi_term = "0.12"
anyhow = "1.0"
clap = {version = "4.0", features = ["derive", "env"]}
clap = { version = "4.0", features = ["derive", "env"] }
console = "0.15"
csv = "1.1"
documented = "0.3.0"
Expand All @@ -27,32 +30,34 @@ lazy_static = "1.4.0"
linked-hash-map = "0.5.6"
log = "0.4"
normpath = "1.1"
ordered-float = {version = "4.1", default-features = false}
ordered-float = { version = "4.1", default-features = false }
pathdiff = "0.2"
percent-encoding = "2.3.1"
pest = "2.7"
pest_derive = "2.0"
quick-xml = "0.31"
quick-xml = "0.34"
rayon = "1.1"
regex = "1.10"
roxmltree = "0.20.0"
serde = "1.0"
serde_derive = "1.0"
struct-field-names-as-array = "0.3.0"
strum = {version = "0.26.2", features = ["derive"]}
tabled = {version = "0.15", features = ["ansi"]}
strum = { version = "0.26.2", features = ["derive"] }
tabled = { version = "0.15", features = ["ansi"] }
tempfile = "3"
termimad = "0.29.1"
text-splitter = "0.6.3"
thiserror = "1.0"
toml = "0.8.0"
tracing-subscriber = {version = "0.3", features = ["env-filter"]}
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
umya-spreadsheet = "~1.1.1"
url = "2.5.2"
xml-rs = "0.8"
zip = "0.6.6"

[dev-dependencies]
assert_cmd = "2.0.11"
insta = {version = "1.26.0", features = ["toml", "filters"]}
insta = { version = "1.26.0", features = ["toml", "filters"] }
pretty_assertions = "1.3"

# Compile some of the dependencies in release mode when we are ourselves in
Expand Down Expand Up @@ -82,7 +87,12 @@ ci = "github"
# The installers to generate for each app
installers = []
# Target platforms to build apps for (Rust target-triple syntax)
targets = ["aarch64-apple-darwin", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu", "x86_64-pc-windows-msvc"]
targets = [
"aarch64-apple-darwin",
"x86_64-apple-darwin",
"x86_64-unknown-linux-gnu",
"x86_64-pc-windows-msvc",
]
# The preferred cargo-dist version to use in CI (Cargo.toml SemVer syntax)
cargo-dist-version = "0.16.0"
# Publish jobs to run in CI
Expand Down
10 changes: 5 additions & 5 deletions docs/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
| Type | Modules |
|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Import formats | [conllu](importers/conllu.md), [exmaralda](importers/exmaralda.md), [graphml](importers/graphml.md), [meta](importers/meta.md), [none](importers/none.md), [opus](importers/opus.md), [path](importers/path.md), [ptb](importers/ptb.md), [relannis](importers/relannis.md), [textgrid](importers/textgrid.md), [toolbox](importers/toolbox.md), [treetagger](importers/treetagger.md), [xlsx](importers/xlsx.md), [xml](importers/xml.md) |
| Export formats | [graphml](exporters/graphml.md), [exmaralda](exporters/exmaralda.md), [sequence](exporters/sequence.md), [textgrid](exporters/textgrid.md), [xlsx](exporters/xlsx.md) |
| Graph operations | [check](graph_ops/check.md), [collapse](graph_ops/collapse.md), [visualize](graph_ops/visualize.md), [enumerate](graph_ops/enumerate.md), [link](graph_ops/link.md), [map](graph_ops/map.md), [revise](graph_ops/revise.md), [chunk](graph_ops/chunk.md), [split](graph_ops/split.md), [none](graph_ops/none.md) |
| Type | Modules |
|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Import formats | [conllu](importers/conllu.md), [exmaralda](importers/exmaralda.md), [graphml](importers/graphml.md), [meta](importers/meta.md), [none](importers/none.md), [opus](importers/opus.md), [path](importers/path.md), [ptb](importers/ptb.md), [relannis](importers/relannis.md), [saltxml](importers/saltxml.md), [textgrid](importers/textgrid.md), [toolbox](importers/toolbox.md), [treetagger](importers/treetagger.md), [xlsx](importers/xlsx.md), [xml](importers/xml.md) |
| Export formats | [graphml](exporters/graphml.md), [exmaralda](exporters/exmaralda.md), [sequence](exporters/sequence.md), [textgrid](exporters/textgrid.md), [xlsx](exporters/xlsx.md) |
| Graph operations | [check](graph_ops/check.md), [collapse](graph_ops/collapse.md), [visualize](graph_ops/visualize.md), [enumerate](graph_ops/enumerate.md), [link](graph_ops/link.md), [map](graph_ops/map.md), [revise](graph_ops/revise.md), [chunk](graph_ops/chunk.md), [split](graph_ops/split.md), [none](graph_ops/none.md) |
6 changes: 6 additions & 0 deletions docs/exporters/saltxml.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# saltxml (exporter)

Exports to the SaltXML format used by Pepper (<https://corpus-tools.org/pepper/>).
SaltXML is an XMI serialization of the [Salt model](https://raw.githubusercontent.com/korpling/salt/master/gh-site/doc/salt_modelGuide.pdf).

*No Configuration*
14 changes: 14 additions & 0 deletions docs/importers/saltxml.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# saltxml (importer)

Imports the SaltXML format used by Pepper (<https://corpus-tools.org/pepper/>).
SaltXML is an XMI serialization of the [Salt model](https://raw.githubusercontent.com/korpling/salt/master/gh-site/doc/salt_modelGuide.pdf).

## Configuration

### missing_anno_ns_from_layer

If `true`, use the layer name as the fallback namespace for annotations
that do not specify one. This is consistent with how the ANNIS tree visualizer
handles annotations without any namespace. If `false`, use the
`default_ns` namespace as fallback.

57 changes: 29 additions & 28 deletions src/exporter/graphml.rs
Original file line number Diff line number Diff line change
Expand Up @@ -238,17 +238,17 @@ fn media_vis(graph: &AnnotationGraph) -> Result<Vec<Visualizer>, Box<dyn std::er
layer: None,
vis_type: "audio".to_string(),
display_name: "audio".to_string(),
visibility: "hidden".to_string(),
visibility: "preloaded".to_string(),
mappings: None,
});
}
"mp4" | "avi" | "mov" => {
"mp4" | "avi" | "mov" | "webm" => {
vis.push(Visualizer {
element: "node".to_string(),
layer: None,
vis_type: "video".to_string(),
display_name: "video".to_string(),
visibility: "hidden".to_string(),
visibility: "preloaded".to_string(),
mappings: None,
});
}
Expand Down Expand Up @@ -350,35 +350,36 @@ fn node_annos_vis(graph: &AnnotationGraph) -> Result<Visualizer, Box<dyn std::er
mappings.insert("annos".to_string(), node_names);
mappings.insert("escape_html".to_string(), "false".to_string());

let more_than_one_ordering = order_names.len() > 1;
let ordered_nodes_are_identical = {
more_than_one_ordering && {
let ordering_components =
graph.get_all_components(Some(AnnotationComponentType::Ordering), None);
let node_sets = ordering_components
.iter()
.map(|c| {
if let Some(strge) = graph.get_graphstorage(c) {
strge
.source_nodes()
.filter_map(|r| if let Ok(n) = r { Some(n) } else { None })
.collect::<BTreeSet<u64>>()
} else {
BTreeSet::default()
}
})
.collect_vec();
let mut all_same = true;
//for i in 1..node_sets.len()
for (a, b) in node_sets.into_iter().tuple_windows() {
all_same &= matches!(a.cmp(&b), Ordering::Equal);
}
all_same
let ordered_components_contain_identical_nodes = if order_names.len() > 1 {
let ordering_components =
graph.get_all_components(Some(AnnotationComponentType::Ordering), None);
let node_sets = ordering_components
.iter()
.map(|c| {
if let Some(strge) = graph.get_graphstorage(c) {
strge
.source_nodes()
.filter_map(|r| if let Ok(n) = r { Some(n) } else { None })
.collect::<BTreeSet<u64>>()
} else {
BTreeSet::default()
}
})
.collect_vec();
let mut all_same = true;
//for i in 1..node_sets.len()
for (a, b) in node_sets.into_iter().tuple_windows() {
all_same &= matches!(a.cmp(&b), Ordering::Equal);
}
all_same
} else {
// There is only one ordering component
true
};

mappings.insert(
"hide_tok".to_string(),
(!ordered_nodes_are_identical).to_string(),
(!ordered_components_contain_identical_nodes).to_string(),
);
mappings.insert("show_ns".to_string(), "false".to_string());
Ok(Visualizer {
Expand Down
Loading

0 comments on commit 2b7a78f

Please sign in to comment.