Skip to content

Commit

Permalink
Merge pull request #308 from korpling/feature/improve-ridges-clean
Browse files Browse the repository at this point in the history
Improvements related to updating the clean script in RIDGES
  • Loading branch information
thomaskrause authored Aug 30, 2024
2 parents b082cc1 + d60aa19 commit cbe6102
Show file tree
Hide file tree
Showing 13 changed files with 359 additions and 1,041 deletions.
17 changes: 11 additions & 6 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- import of `conllu` now supports enhanced dependencies
- Adds `saltxml` export format
- Adds `time` graph op to add or enrich time annotations
- The `table` exporter now supports the `id_column` parameter to
enable/disable the ID column.
- Importers that map directories to (sub)-corpora and files to documents can now also import the
corpus if the `path` argument points to a single file.

### Changed

- `exmaralda` import now ranks order of tlis higher than sorting by time value (more compatible with modern EXMARaLDA files)
- `exmaralda` import now ranks order of tlis higher than sorting by time value (more compatible with
modern EXMARaLDA files)
- `xlsx` importer will connect spans to their corresponding segmentation node
with coverage edges instead of connecting them with the base tokens generated
for the timeline items. Thus, the configured connection between spans and base
Expand Down Expand Up @@ -75,7 +80,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- `visualize` graph operation that allows outputting the current graph (somewhere
in the conversion process) to SVG or DOT for debugging.

### Fixed

- removed debug output
Expand Down Expand Up @@ -172,7 +177,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed

- Fix non-resolved relative path when importing EXMARaLDA files.
- Fix non-resolved relative path when importing EXMARaLDA files.
- Limit the table width when listing the module properties, so they fit in the
current terminal.

Expand Down Expand Up @@ -202,9 +207,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Added simple chunker module based on
[text-splitter](https://crates.io/crates/text-splitter).
- `check` can write check report to file
- `check` can write check report to file
- `check` can test a corpus graph comparing results to an external corpus graph loaded from a graphANNIS database
- import `ptb` can now split node annotations to derive a label for the incoming edge, when a delimiter is provided
- import `ptb` can now split node annotations to derive a label for the incoming edge, when a delimiter is provided
using `edge_delimiter`. E. g., `NP-sbj` will create a node of category `NP`, whose incoming edge has function `sbj`,
given the following config is used: `edge_delimiter = "-"`
- config attribute `stable_order` for exporting graphml enforces stable ordering of edges and nodes in output
Expand Down Expand Up @@ -246,7 +251,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `exmaralda` returns error when there is no time value for a timeline item
- fixed and simplified import of corpus node annotations
- `exmaralda` import's paths to linked media files are relative to the working directory
- `xlsx` importer now adds `PartOf` relations to the document nodes
- `xlsx` importer now adds `PartOf` relations to the document nodes

## [0.4.0] - 2023-11-13

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
source: src/exporter/table.rs
expression: export.unwrap()
---
norm::lemma norm::pos dipl::sentence norm::norm dipl::dipl
I PRON 1 I I'm
be VERB 1 am I'm
in ADP 1 in in
New York PRON 1 New York New
New York PRON 1 New York York

This file was deleted.

62 changes: 52 additions & 10 deletions src/exporter/table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use super::Exporter;

use crate::{
deserialize::{deserialize_anno_key, deserialize_annotation_component_seq},
workflow::StatusMessage,
progress::ProgressReporter,
};

/// This module exports all ordered nodes and nodes connected by coverage edges of any name into a table.
Expand Down Expand Up @@ -99,6 +99,13 @@ pub struct ExportTable {
/// ```
#[serde(default, deserialize_with = "deserialize_annotation_component_seq")]
outgoing: Vec<AnnotationComponent>,
/// If `true` (the default), always output a column with the ID of the node.
#[serde(default = "default_id_column")]
id_column: bool,
}

/// Serde default provider for the `id_column` setting.
///
/// Returns `true`, so the node ID column is part of the output unless the
/// configuration explicitly disables it.
fn default_id_column() -> bool {
    const ID_COLUMN_ENABLED_BY_DEFAULT: bool = true;
    ID_COLUMN_ENABLED_BY_DEFAULT
}

impl Default for ExportTable {
Expand All @@ -110,6 +117,7 @@ impl Default for ExportTable {
no_value: String::default(),
ingoing: vec![],
outgoing: vec![],
id_column: default_id_column(),
}
}
}
Expand All @@ -132,9 +140,11 @@ impl Exporter for ExportTable {
&self,
graph: &graphannis::AnnotationGraph,
output_path: &std::path::Path,
_step_id: crate::StepID,
step_id: crate::StepID,
tx: Option<crate::workflow::StatusSender>,
) -> Result<(), Box<dyn std::error::Error>> {
let progress = ProgressReporter::new_unknown_total_work(tx.clone(), step_id.clone())?;

let base_ordering = AnnotationComponent::new(
AnnotationComponentType::Ordering,
ANNIS_NS.into(),
Expand All @@ -156,11 +166,7 @@ impl Exporter for ExportTable {
.filter_map(|c| graph.get_graphstorage(c))
.collect_vec();
if coverage_storages.is_empty() {
if let Some(sender) = &tx {
sender.send(StatusMessage::Warning(
"No coverage storages available".to_string(),
))?;
}
progress.warn("No coverage storages available")?;
}
let mut doc_node_to_start = BTreeMap::new();
for node in storage.source_nodes().flatten().filter(|n| {
Expand Down Expand Up @@ -198,9 +204,16 @@ impl Exporter for ExportTable {
}
}
}
let progress = ProgressReporter::new(tx, step_id, doc_node_to_start.len())?;
progress.info(&format!("Exporting {} documents", doc_node_to_start.len()))?;
doc_node_to_start
.into_iter()
.try_for_each(|(doc, start)| self.export_document(graph, output_path, doc, start))?;
.try_for_each(move |(doc, start)| -> anyhow::Result<()> {
progress.info(&format!("Exporting {doc} as table"))?;
self.export_document(graph, output_path, doc, start)?;
progress.worked(1)?;
Ok(())
})?;
Ok(())
}

Expand Down Expand Up @@ -265,16 +278,21 @@ impl ExportTable {
let id_name = format!("id_{qname}");
let index = if let Some(index) = index_map.get(&qname) {
*index
} else {
} else if self.id_column {
index_map.insert(qname.to_string(), index_map.len());
index_map.insert(id_name.to_string(), index_map.len());
index_map.len() - 2
} else {
index_map.insert(qname.to_string(), index_map.len());
index_map.len() - 1
};
let value = node_annos
.get_value_for_item(&rn, &anno_key)?
.ok_or(anyhow!("Annotation has no value"))?;
data.insert(index, value.to_string());
data.insert(index + 1, node_name.to_string());
if self.id_column {
data.insert(index + 1, node_name.to_string());
}
}
}
if follow_edges {
Expand Down Expand Up @@ -587,4 +605,28 @@ mod tests {
assert!(export.is_ok(), "error: {:?}", export.err());
assert_snapshot!(export.unwrap());
}

/// Imports the EXMARaLDA test corpus, exports it with `id_column` set to
/// `false`, and compares the exported text against the stored snapshot.
#[test]
fn no_id_column() {
    let exmaralda = ImportEXMARaLDA {};
    let mprt = exmaralda.import_corpus(
        Path::new("tests/data/import/exmaralda/clean/import/exmaralda/"),
        StepID {
            module_name: "test_import_exb".to_string(),
            path: None,
        },
        None,
    );
    assert!(mprt.is_ok());
    let mut update_import = mprt.unwrap();
    let g = AnnotationGraph::with_default_graphstorages(true);
    assert!(g.is_ok());
    let mut graph = g.unwrap();
    assert!(graph.apply_update(&mut update_import, |_| {}).is_ok());
    // Build the exporter in one expression instead of mutating a default
    // instance afterwards (avoids clippy's `field_reassign_with_default`).
    let exporter = ExportTable {
        id_column: false,
        ..Default::default()
    };
    let export = export_to_string(&graph, exporter);
    assert!(export.is_ok(), "error: {:?}", export.err());
    assert_snapshot!(export.unwrap());
}
}
Loading

0 comments on commit cbe6102

Please sign in to comment.