Merge pull request #309 from korpling/feature/repeated-map
Allow repeated application of rules in `map` module
thomaskrause committed Sep 2, 2024
2 parents e81e546 + 0868093 commit dbafad5
Showing 8 changed files with 409 additions and 126 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
enable/disable the ID column.
- Importers that map directories to (sub)-corpora and files to documents can now also import the
corpus if the `path` argument points to a single file.
- `xlsx` importer now maps columns as spans if the column is not configured as
a token annotation via `token_annos`.

### Changed

103 changes: 100 additions & 3 deletions src/exporter/xlsx.rs
@@ -7,7 +7,7 @@ use anyhow::anyhow;
use graphannis::{graph::GraphStorage, model::AnnotationComponentType, AnnotationGraph};
use graphannis_core::{
annostorage::{NodeAnnotationStorage, ValueSearch},
graph::ANNIS_NS,
graph::{ANNIS_NS, NODE_TYPE_KEY},
types::{AnnoKey, Component, NodeID},
util::join_qname,
};
@@ -76,14 +76,23 @@ fn is_span_column(
node_annos: &dyn NodeAnnotationStorage,
token_helper: &TokenHelper,
) -> anyhow::Result<bool> {
// Check that none of the nodes having this key are tokens
// Check that none of the nodes having this key are tokens and that there is at least one non-corpus node.
// Document metadata and annotations inside documents could share the same
// annotation names, but we only want to include the ones that are used as
// annotations in a document.
let mut has_non_corpus_match = false;
for m in node_annos.exact_anno_search(Some(&anno_key.ns), &anno_key.name, ValueSearch::Any) {
let m = m?;
if token_helper.is_token(m.node)? {
return Ok(false);
}
if let Some(node_type) = node_annos.get_value_for_item(&m.node, &NODE_TYPE_KEY)? {
if node_type == "node" {
has_non_corpus_match = true;
}
}
}
Ok(true)
Ok(has_non_corpus_match)
}
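
To make the new check easier to follow, here is a minimal, dependency-free sketch of the decision logic; a toy data model stands in for the graphannis annotation storage, so this is not the real API:

```rust
/// Simplified model of the check above: `matches` stands in for the
/// results of `exact_anno_search` for one annotation key.
fn is_span_column(matches: &[(bool /* is_token */, &str /* node_type */)]) -> bool {
    let mut has_non_corpus_match = false;
    for &(is_token, node_type) in matches {
        // Any token carrying the key disqualifies the column.
        if is_token {
            return false;
        }
        // Only regular nodes (not corpus/document nodes) count as evidence.
        if node_type == "node" {
            has_non_corpus_match = true;
        }
    }
    has_non_corpus_match
}

fn main() {
    // Metadata that only occurs on corpus nodes: not a span column.
    assert!(!is_span_column(&[(false, "corpus")]));
    // Same key also used on a node inside a document: span column.
    assert!(is_span_column(&[(false, "corpus"), (false, "node")]));
    // A token match rules the column out entirely.
    assert!(!is_span_column(&[(true, "node")]));
    println!("span-column checks passed");
}
```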

fn overwritten_position_for_key(
@@ -136,6 +145,29 @@ impl ExportXlsx {
worksheet.remove_column_by_index(&1, &1);
}

// Add meta data sheet
let meta_annos = g.get_node_annos().get_annotations_for_item(&doc_node_id)?;
if !meta_annos.is_empty() {
let meta_sheet = book.new_sheet("meta").map_err(|e| anyhow!(e))?;
meta_sheet.insert_new_row(&1, &2);
meta_sheet.get_cell_mut((&1, &1)).set_value_string("Name");
meta_sheet.get_cell_mut((&2, &1)).set_value_string("Value");

let mut current_row = 2;
for a in meta_annos {
if a.key.ns != ANNIS_NS {
meta_sheet.insert_new_row(&current_row, &2);
meta_sheet
.get_cell_mut((&1, &current_row))
.set_value_string(join_qname(&a.key.ns, &a.key.name));
meta_sheet
.get_cell_mut((&2, &current_row))
.set_value_string(a.val);
current_row += 1;
}
}
}

let output_path = output_path.join(format!("{}.xlsx", doc_name));
umya_spreadsheet::writer::xlsx::write(&book, output_path)?;
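
For reference, the metadata sheet can be produced in isolation with the same `umya_spreadsheet` calls used above. A standalone sketch; the output file name and the metadata values are invented for illustration:

```rust
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Fresh workbook; `new_file()` already contains a default data sheet.
    let mut book = umya_spreadsheet::new_file();
    let meta_sheet = book.new_sheet("meta")?;

    // Header row, mirroring the exporter above.
    meta_sheet.insert_new_row(&1, &2);
    meta_sheet.get_cell_mut((&1, &1)).set_value_string("Name");
    meta_sheet.get_cell_mut((&2, &1)).set_value_string("Value");

    // One row per metadata entry (hypothetical values).
    let metadata = [("Author", "Unknown"), ("Year", "2024")];
    let mut current_row: u32 = 2;
    for (name, value) in metadata {
        meta_sheet.insert_new_row(&current_row, &2);
        meta_sheet
            .get_cell_mut((&1, &current_row))
            .set_value_string(name);
        meta_sheet
            .get_cell_mut((&2, &current_row))
            .set_value_string(value);
        current_row += 1;
    }

    umya_spreadsheet::writer::xlsx::write(&book, "meta_only.xlsx")?;
    Ok(())
}
```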

@@ -558,4 +590,69 @@ mod tests {
let it = graphannis::aql::execute_query_on_graph(&written_graph, &q, false, None).unwrap();
assert_eq!(1, it.count());
}

#[test]
fn with_meta() {
let importer: ImportSpreadsheet = toml::from_str(
r#"
column_map = {"tok" = ["lb"]}
metasheet = "meta"
metasheet_skip_rows = 1
"#,
)
.unwrap();
let exporter = ExportXlsx::default();

// Import an example document
let path = Path::new("./tests/data/import/xlsx/sample_sentence/");
let importer = crate::ReadFrom::Xlsx(importer);
let orig_import_step = ImporterStep {
module: importer,
path: path.to_path_buf(),
};
let mut updates = orig_import_step.execute(None).unwrap();
let mut original_graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
original_graph.apply_update(&mut updates, |_| {}).unwrap();

// Export to Excel file and read it again
let tmp_outputdir = TempDir::new().unwrap();
let output_dir = tmp_outputdir.path().join("sample_sentence");
std::fs::create_dir(&output_dir).unwrap();
let exporter = crate::WriteAs::Xlsx(exporter);
let export_step = ExporterStep {
module: exporter,
path: output_dir.clone(),
};
export_step.execute(&original_graph, None).unwrap();

let importer: ImportSpreadsheet = toml::from_str(
r#"
column_map = {"tok" = ["lb"]}
metasheet = "meta"
metasheet_skip_rows = 1
"#,
)
.unwrap();
let second_import_step = ImporterStep {
module: crate::ReadFrom::Xlsx(importer),
path: output_dir.clone(),
};
let mut updates = second_import_step.execute(None).unwrap();

let mut written_graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
written_graph.apply_update(&mut updates, |_| {}).unwrap();

let q = graphannis::aql::parse("Author=\"Unknown\" _ident_ annis:doc", false).unwrap();
let it = graphannis::aql::execute_query_on_graph(&written_graph, &q, false, None).unwrap();
assert_eq!(1, it.count());

let q = graphannis::aql::parse("Year=\"2024\" _ident_ annis:doc", false).unwrap();
let it = graphannis::aql::execute_query_on_graph(&written_graph, &q, false, None).unwrap();
assert_eq!(1, it.count());

// The header should not be imported
let q = graphannis::aql::parse("Name _ident_ annis:doc", false).unwrap();
let it = graphannis::aql::execute_query_on_graph(&written_graph, &q, false, None).unwrap();
assert_eq!(0, it.count());
}
}
22 changes: 22 additions & 0 deletions src/importer/mod.rs
@@ -17,6 +17,7 @@ pub mod xml;

use crate::{workflow::StatusSender, StepID};
use graphannis::update::GraphUpdate;
use percent_encoding::{AsciiSet, CONTROLS};
use std::path::Path;

/// An importer is a module that takes a path and produces a list of graph update events.
@@ -40,3 +41,24 @@ pub trait Importer: Sync {

fn file_extensions(&self) -> &[&str];
}

/// An encoding set for node names.
///
/// This disallows `:` to avoid any possible ambiguities with the `::` annotation
/// match separator. `/` is disallowed so this separator can be used to build
/// hierarchical node IDs and to simplify using node names as file names.
/// Spaces ` ` are encoded to avoid problems with annotation names in the AQL syntax.
/// Since node names might be used as file names, all reserved characters for
/// Windows file names are encoded as well.
pub const NODE_NAME_ENCODE_SET: &AsciiSet = &CONTROLS
.add(b':')
.add(b'/')
.add(b' ')
.add(b'%')
.add(b'\\')
.add(b'<')
.add(b'>')
.add(b'"')
.add(b'|')
.add(b'?')
.add(b'*');
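
As a usage illustration, percent-encoding a node name with such a set looks like this. The constant below is a shortened stand-in for `NODE_NAME_ENCODE_SET`, covering only a few of the bytes listed above; the `percent-encoding` crate API is the one already imported by this diff:

```rust
use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};

// Shortened stand-in for NODE_NAME_ENCODE_SET; the real set also covers
// the reserved Windows file-name characters listed above.
const ENCODE_SET: &AsciiSet = &CONTROLS.add(b':').add(b'/').add(b' ').add(b'%');

fn main() {
    let raw = "sub corpus/doc:1";
    // `utf8_percent_encode` returns a lazy encoder that implements Display.
    let encoded = utf8_percent_encode(raw, ENCODE_SET).to_string();
    assert_eq!(encoded, "sub%20corpus%2Fdoc%3A1");
    println!("{encoded}");
}
```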
25 changes: 2 additions & 23 deletions src/importer/relannis.rs
@@ -1,11 +1,11 @@
use crate::progress::ProgressReporter;

use super::Importer;
use super::{Importer, NODE_NAME_ENCODE_SET};
use anyhow::{anyhow, Result};
use documented::{Documented, DocumentedFields};
use graphannis::model::AnnotationComponentType;
use graphannis::update::{GraphUpdate, UpdateEvent};
use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};
use percent_encoding::utf8_percent_encode;
use serde::{Deserialize, Serialize};
use struct_field_names_as_array::FieldNamesAsSlice;

@@ -88,27 +88,6 @@ impl Importer for ImportRelAnnis {
}
}

/// An encoding set for node names.
///
/// This disallows `:` to avoid any possible ambiguities with the `::` annotation
/// match separator. `/` is disallowed so this separator can be used to build
/// hierarchical node IDs and to simplify using node names as file names.
/// Spaces ` ` are encoded to avoid problems with annotation names in the AQL syntax.
/// Since node names might be used as file names, all reserved characters for
/// Windows file names are encoded as well.
pub const NODE_NAME_ENCODE_SET: &AsciiSet = &CONTROLS
.add(b':')
.add(b'/')
.add(b' ')
.add(b'%')
.add(b'\\')
.add(b'<')
.add(b'>')
.add(b'"')
.add(b'|')
.add(b'?')
.add(b'*');

const TOK_WHITESPACE_BEFORE: &str = "tok-whitespace-before";
const TOK_WHITESPACE_AFTER: &str = "tok-whitespace-after";

Expand Down
42 changes: 37 additions & 5 deletions src/importer/xlsx.rs
@@ -1,5 +1,5 @@
use std::{
collections::{BTreeMap, BTreeSet},
collections::{BTreeMap, BTreeSet, HashSet},
fmt::Display,
path::Path,
};
@@ -14,12 +14,15 @@ use graphannis_core::{
util::split_qname,
};
use itertools::Itertools;
use percent_encoding::utf8_percent_encode;
use serde_derive::Deserialize;
use struct_field_names_as_array::FieldNamesAsSlice;
use umya_spreadsheet::Cell;

use super::Importer;
use crate::{error::AnnattoError, progress::ProgressReporter, util, StepID};
use crate::{
error::AnnattoError, importer::NODE_NAME_ENCODE_SET, progress::ProgressReporter, util, StepID,
};
use documented::{Documented, DocumentedFields};

/// Imports Excel Spreadsheets where each line is a token, the other columns are
@@ -59,6 +62,12 @@ pub struct ImportSpreadsheet {
/// Optional value of the Excel sheet that contains the metadata table. If
/// unset, no metadata is imported.
metasheet: Option<SheetAddress>,
/// Skip the given number of rows at the beginning of the metadata sheet.
#[serde(default)]
metasheet_skip_rows: u32,
/// Map the given annotation columns as token annotations and not as spans, if possible.
#[serde(default)]
token_annos: Vec<String>,
}
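
Putting the new fields together, a hypothetical importer configuration could look as follows; the key names are the struct fields above, while the column names and values are invented:

```toml
column_map = { "tok" = ["lb", "pos"] }
metasheet = "meta"
# Skip the "Name"/"Value" header row of the metadata sheet.
metasheet_skip_rows = 1
# Keep "pos" attached to the tokens; "lb" still becomes a span.
token_annos = ["pos"]
```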

#[derive(Debug, Deserialize, PartialEq)]
@@ -178,6 +187,8 @@ impl ImportSpreadsheet {
})?;
Ok::<(), AnnattoError>(())
})?;

let token_annos: HashSet<String> = self.token_annos.clone().into_iter().collect();
for (tok_name, anno_names) in &fullmap {
let mut names = if tok_name.is_empty() {
vec![]
@@ -219,8 +230,19 @@ impl ImportSpreadsheet {
let base_token_end = *end_row_excl as usize - 2;
let overlapped_base_tokens: &[String] =
&base_tokens[base_token_start..base_token_end]; // TODO check indices
let node_name =
format!("{}#{}_{}-{}", &doc_path, tok_name, start_row, end_row_excl);

let node_name = if token_annos.contains(name) {
format!("{}#{}_{}-{}", &doc_path, tok_name, start_row, end_row_excl)
} else {
format!(
"{}#span_{}_{}-{}",
&doc_path,
utf8_percent_encode(name, NODE_NAME_ENCODE_SET),
start_row,
end_row_excl
)
};

update.add_event(UpdateEvent::AddNode {
node_name: node_name.to_string(),
node_type: "node".to_string(),
@@ -342,7 +364,7 @@ impl ImportSpreadsheet {
update: &mut GraphUpdate,
) -> Result<(), AnnattoError> {
let max_row_num = sheet.get_highest_row(); // 1-based
for row_num in 1..max_row_num + 1 {
for row_num in (self.metasheet_skip_rows + 1)..max_row_num + 1 {
let entries = sheet.get_collection_by_row(&row_num); // sorting not necessarily by col number
let entry_map = entries
.into_iter()
@@ -571,6 +593,8 @@ mod tests {
fallback: fallback.clone(),
datasheet: None,
metasheet: None,
token_annos: vec![],
metasheet_skip_rows: 0,
};
let importer = ReadFrom::Xlsx(importer);
let path = Path::new("./tests/data/import/xlsx/clean/xlsx/");
@@ -679,6 +703,8 @@ mod tests {
fallback: None,
datasheet: None,
metasheet: None,
token_annos: vec![],
metasheet_skip_rows: 0,
};
let importer = ReadFrom::Xlsx(importer);
let path = Path::new("./tests/data/import/xlsx/dirty/xlsx/");
@@ -712,6 +738,8 @@ mod tests {
fallback: None,
datasheet: None,
metasheet: None,
token_annos: vec![],
metasheet_skip_rows: 0,
};
let importer = ReadFrom::Xlsx(importer);
let path = Path::new("./tests/data/import/xlsx/warnings/xlsx/");
@@ -789,6 +817,8 @@ mod tests {
fallback: Some("tok".to_string()),
datasheet: None,
metasheet: None,
token_annos: vec![],
metasheet_skip_rows: 0,
};
let importer = ReadFrom::Xlsx(importer);
let path = Path::new("./tests/data/import/xlsx/clean/xlsx/");
@@ -879,6 +909,8 @@ mod tests {
fallback: None,
datasheet: None,
metasheet: Some(SheetAddress::Name("meta".to_string())),
token_annos: vec![],
metasheet_skip_rows: 0,
};
let importer = ReadFrom::Xlsx(importer);
let path = Path::new("./tests/data/import/xlsx/clean/xlsx/");