Merge pull request #309 from korpling/feature/repeated-map
Allow repeated application of rules in `map` module
thomaskrause committed Sep 2, 2024
2 parents e81e546 + 0868093 commit dbafad5
Showing 8 changed files with 409 additions and 126 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
enable/disable the ID column.
- Importers that map directories to (sub)-corpora and files to documents can now also import the
corpus if the `path` argument points to a single file.
- `xlsx` importer now maps columns as spans if the column is not configured as
a token annotation via `token_annos`.

### Changed

103 changes: 100 additions & 3 deletions src/exporter/xlsx.rs
@@ -7,7 +7,7 @@ use anyhow::anyhow;
use graphannis::{graph::GraphStorage, model::AnnotationComponentType, AnnotationGraph};
use graphannis_core::{
annostorage::{NodeAnnotationStorage, ValueSearch},
graph::ANNIS_NS,
graph::{ANNIS_NS, NODE_TYPE_KEY},
types::{AnnoKey, Component, NodeID},
util::join_qname,
};
@@ -76,14 +76,23 @@ fn is_span_column(
node_annos: &dyn NodeAnnotationStorage,
token_helper: &TokenHelper,
) -> anyhow::Result<bool> {
// Check that none of the nodes having this key are tokens
// Check that none of the nodes having this key are tokens and that there is at least one non-corpus node.
// Document metadata and annotations inside documents could share the same
// annotation names, but we only want to include the ones that are used as
// annotations in a document.
let mut has_non_corpus_match = false;
for m in node_annos.exact_anno_search(Some(&anno_key.ns), &anno_key.name, ValueSearch::Any) {
let m = m?;
if token_helper.is_token(m.node)? {
return Ok(false);
}
if let Some(node_type) = node_annos.get_value_for_item(&m.node, &NODE_TYPE_KEY)? {
if node_type == "node" {
has_non_corpus_match = true;
}
}
}
Ok(true)
Ok(has_non_corpus_match)
}
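
To make the new check easier to follow, here is a minimal, dependency-free sketch of the decision logic; a toy data model stands in for the graphannis annotation storage, so this is not the real API:

```rust
/// Simplified model of the check above: `matches` stands in for the
/// results of `exact_anno_search` for one annotation key.
fn is_span_column(matches: &[(bool /* is_token */, &str /* node_type */)]) -> bool {
    let mut has_non_corpus_match = false;
    for &(is_token, node_type) in matches {
        // Any token carrying the key disqualifies the column.
        if is_token {
            return false;
        }
        // Only regular nodes (not corpus/document nodes) count as evidence.
        if node_type == "node" {
            has_non_corpus_match = true;
        }
    }
    has_non_corpus_match
}

fn main() {
    // Metadata that only occurs on corpus nodes: not a span column.
    assert!(!is_span_column(&[(false, "corpus")]));
    // Same key also used on a node inside a document: span column.
    assert!(is_span_column(&[(false, "corpus"), (false, "node")]));
    // A token match rules the column out entirely.
    assert!(!is_span_column(&[(true, "node")]));
    println!("span-column checks passed");
}
```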

fn overwritten_position_for_key(
@@ -136,6 +145,29 @@ impl ExportXlsx {
worksheet.remove_column_by_index(&1, &1);
}

// Add meta data sheet
let meta_annos = g.get_node_annos().get_annotations_for_item(&doc_node_id)?;
if !meta_annos.is_empty() {
let meta_sheet = book.new_sheet("meta").map_err(|e| anyhow!(e))?;
meta_sheet.insert_new_row(&1, &2);
meta_sheet.get_cell_mut((&1, &1)).set_value_string("Name");
meta_sheet.get_cell_mut((&2, &1)).set_value_string("Value");

let mut current_row = 2;
for a in meta_annos {
if a.key.ns != ANNIS_NS {
meta_sheet.insert_new_row(&current_row, &2);
meta_sheet
.get_cell_mut((&1, &current_row))
.set_value_string(join_qname(&a.key.ns, &a.key.name));
meta_sheet
.get_cell_mut((&2, &current_row))
.set_value_string(a.val);
current_row += 1;
}
}
}

let output_path = output_path.join(format!("{}.xlsx", doc_name));
umya_spreadsheet::writer::xlsx::write(&book, output_path)?;
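
For reference, the metadata sheet can be produced in isolation with the same `umya_spreadsheet` calls used above. A standalone sketch; the output file name and the metadata values are invented for illustration:

```rust
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Fresh workbook; `new_file()` already contains a default data sheet.
    let mut book = umya_spreadsheet::new_file();
    let meta_sheet = book.new_sheet("meta")?;

    // Header row, mirroring the exporter above.
    meta_sheet.insert_new_row(&1, &2);
    meta_sheet.get_cell_mut((&1, &1)).set_value_string("Name");
    meta_sheet.get_cell_mut((&2, &1)).set_value_string("Value");

    // One row per metadata entry (hypothetical values).
    let metadata = [("Author", "Unknown"), ("Year", "2024")];
    let mut current_row: u32 = 2;
    for (name, value) in metadata {
        meta_sheet.insert_new_row(&current_row, &2);
        meta_sheet
            .get_cell_mut((&1, &current_row))
            .set_value_string(name);
        meta_sheet
            .get_cell_mut((&2, &current_row))
            .set_value_string(value);
        current_row += 1;
    }

    umya_spreadsheet::writer::xlsx::write(&book, "meta_only.xlsx")?;
    Ok(())
}
```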

@@ -558,4 +590,69 @@ mod tests {
let it = graphannis::aql::execute_query_on_graph(&written_graph, &q, false, None).unwrap();
assert_eq!(1, it.count());
}

#[test]
fn with_meta() {
let importer: ImportSpreadsheet = toml::from_str(
r#"
column_map = {"tok" = ["lb"]}
metasheet = "meta"
metasheet_skip_rows = 1
"#,
)
.unwrap();
let exporter = ExportXlsx::default();

// Import an example document
let path = Path::new("./tests/data/import/xlsx/sample_sentence/");
let importer = crate::ReadFrom::Xlsx(importer);
let orig_import_step = ImporterStep {
module: importer,
path: path.to_path_buf(),
};
let mut updates = orig_import_step.execute(None).unwrap();
let mut original_graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
original_graph.apply_update(&mut updates, |_| {}).unwrap();

// Export to Excel file and read it again
let tmp_outputdir = TempDir::new().unwrap();
let output_dir = tmp_outputdir.path().join("sample_sentence");
std::fs::create_dir(&output_dir).unwrap();
let exporter = crate::WriteAs::Xlsx(exporter);
let export_step = ExporterStep {
module: exporter,
path: output_dir.clone(),
};
export_step.execute(&original_graph, None).unwrap();

let importer: ImportSpreadsheet = toml::from_str(
r#"
column_map = {"tok" = ["lb"]}
metasheet = "meta"
metasheet_skip_rows = 1
"#,
)
.unwrap();
let second_import_step = ImporterStep {
module: crate::ReadFrom::Xlsx(importer),
path: output_dir.clone(),
};
let mut updates = second_import_step.execute(None).unwrap();

let mut written_graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
written_graph.apply_update(&mut updates, |_| {}).unwrap();

let q = graphannis::aql::parse("Author=\"Unknown\" _ident_ annis:doc", false).unwrap();
let it = graphannis::aql::execute_query_on_graph(&written_graph, &q, false, None).unwrap();
assert_eq!(1, it.count());

let q = graphannis::aql::parse("Year=\"2024\" _ident_ annis:doc", false).unwrap();
let it = graphannis::aql::execute_query_on_graph(&written_graph, &q, false, None).unwrap();
assert_eq!(1, it.count());

// The header should not be imported
let q = graphannis::aql::parse("Name _ident_ annis:doc", false).unwrap();
let it = graphannis::aql::execute_query_on_graph(&written_graph, &q, false, None).unwrap();
assert_eq!(0, it.count());
}
}
22 changes: 22 additions & 0 deletions src/importer/mod.rs
@@ -17,6 +17,7 @@ pub mod xml;

use crate::{workflow::StatusSender, StepID};
use graphannis::update::GraphUpdate;
use percent_encoding::{AsciiSet, CONTROLS};
use std::path::Path;

/// An importer is a module that takes a path and produces a list of graph update events.
@@ -40,3 +41,24 @@ pub trait Importer: Sync {

fn file_extensions(&self) -> &[&str];
}

/// An encoding set for node names.
///
/// This disallows `:` to avoid any possible ambiguities with the `::` annotation
/// match separator. `/` is disallowed so this separator can be used to build
/// hierarchical node IDs and to simplify using node names as file names.
/// Spaces ` ` are encoded to avoid problems with annotation names in the AQL syntax.
/// Since node names might be used as file names, all reserved characters for
/// Windows file names are encoded as well.
pub const NODE_NAME_ENCODE_SET: &AsciiSet = &CONTROLS
.add(b':')
.add(b'/')
.add(b' ')
.add(b'%')
.add(b'\\')
.add(b'<')
.add(b'>')
.add(b'"')
.add(b'|')
.add(b'?')
.add(b'*');
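
As a usage illustration, percent-encoding a node name with such a set looks like this. The constant below is a shortened stand-in for `NODE_NAME_ENCODE_SET`, covering only a few of the bytes listed above; the `percent-encoding` crate API is the one already imported by this diff:

```rust
use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};

// Shortened stand-in for NODE_NAME_ENCODE_SET; the real set also covers
// the reserved Windows file-name characters listed above.
const ENCODE_SET: &AsciiSet = &CONTROLS.add(b':').add(b'/').add(b' ').add(b'%');

fn main() {
    let raw = "sub corpus/doc:1";
    // `utf8_percent_encode` returns a lazy encoder that implements Display.
    let encoded = utf8_percent_encode(raw, ENCODE_SET).to_string();
    assert_eq!(encoded, "sub%20corpus%2Fdoc%3A1");
    println!("{encoded}");
}
```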
25 changes: 2 additions & 23 deletions src/importer/relannis.rs
@@ -1,11 +1,11 @@
use crate::progress::ProgressReporter;

use super::Importer;
use super::{Importer, NODE_NAME_ENCODE_SET};
use anyhow::{anyhow, Result};
use documented::{Documented, DocumentedFields};
use graphannis::model::AnnotationComponentType;
use graphannis::update::{GraphUpdate, UpdateEvent};
use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};
use percent_encoding::utf8_percent_encode;
use serde::{Deserialize, Serialize};
use struct_field_names_as_array::FieldNamesAsSlice;

@@ -88,27 +88,6 @@ impl Importer for ImportRelAnnis {
}
}

/// An encoding set for node names.
///
/// This disallows `:` to avoid any possible ambiguities with the `::` annotation
/// match separator. `/` is disallowed so this separator can be used to build
/// hierarchical node IDs and to simplify using node names as file names.
/// Spaces ` ` are encoded to avoid problems with annotation names in the AQL syntax.
/// Since node names might be used as file names, all reserved characters for
/// Windows file names are encoded as well.
pub const NODE_NAME_ENCODE_SET: &AsciiSet = &CONTROLS
.add(b':')
.add(b'/')
.add(b' ')
.add(b'%')
.add(b'\\')
.add(b'<')
.add(b'>')
.add(b'"')
.add(b'|')
.add(b'?')
.add(b'*');

const TOK_WHITESPACE_BEFORE: &str = "tok-whitespace-before";
const TOK_WHITESPACE_AFTER: &str = "tok-whitespace-after";

Expand Down
42 changes: 37 additions & 5 deletions src/importer/xlsx.rs
@@ -1,5 +1,5 @@
use std::{
collections::{BTreeMap, BTreeSet},
collections::{BTreeMap, BTreeSet, HashSet},
fmt::Display,
path::Path,
};
@@ -14,12 +14,15 @@ use graphannis_core::{
util::split_qname,
};
use itertools::Itertools;
use percent_encoding::utf8_percent_encode;
use serde_derive::Deserialize;
use struct_field_names_as_array::FieldNamesAsSlice;
use umya_spreadsheet::Cell;

use super::Importer;
use crate::{error::AnnattoError, progress::ProgressReporter, util, StepID};
use crate::{
error::AnnattoError, importer::NODE_NAME_ENCODE_SET, progress::ProgressReporter, util, StepID,
};
use documented::{Documented, DocumentedFields};

/// Imports Excel Spreadsheets where each line is a token, the other columns are
@@ -59,6 +62,12 @@ pub struct ImportSpreadsheet {
/// Optional value of the Excel sheet that contains the metadata table. If
/// unset, no metadata is imported.
metasheet: Option<SheetAddress>,
/// Skip the given number of rows at the beginning of the metadata sheet.
#[serde(default)]
metasheet_skip_rows: u32,
/// Map the given annotation columns as token annotations and not as spans, if possible.
#[serde(default)]
token_annos: Vec<String>,
}
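
Putting the new fields together, a hypothetical importer configuration could look as follows; the key names are the struct fields above, while the column names and values are invented:

```toml
column_map = { "tok" = ["lb", "pos"] }
metasheet = "meta"
# Skip the "Name"/"Value" header row of the metadata sheet.
metasheet_skip_rows = 1
# Keep "pos" attached to the tokens; "lb" still becomes a span.
token_annos = ["pos"]
```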

#[derive(Debug, Deserialize, PartialEq)]
@@ -178,6 +187,8 @@ impl ImportSpreadsheet {
})?;
Ok::<(), AnnattoError>(())
})?;

let token_annos: HashSet<String> = self.token_annos.clone().into_iter().collect();
for (tok_name, anno_names) in &fullmap {
let mut names = if tok_name.is_empty() {
vec![]
@@ -219,8 +230,19 @@ impl ImportSpreadsheet {
let base_token_end = *end_row_excl as usize - 2;
let overlapped_base_tokens: &[String] =
&base_tokens[base_token_start..base_token_end]; // TODO check indices
let node_name =
format!("{}#{}_{}-{}", &doc_path, tok_name, start_row, end_row_excl);

let node_name = if token_annos.contains(name) {
format!("{}#{}_{}-{}", &doc_path, tok_name, start_row, end_row_excl)
} else {
format!(
"{}#span_{}_{}-{}",
&doc_path,
utf8_percent_encode(name, NODE_NAME_ENCODE_SET),
start_row,
end_row_excl
)
};

update.add_event(UpdateEvent::AddNode {
node_name: node_name.to_string(),
node_type: "node".to_string(),
@@ -342,7 +364,7 @@ impl ImportSpreadsheet {
update: &mut GraphUpdate,
) -> Result<(), AnnattoError> {
let max_row_num = sheet.get_highest_row(); // 1-based
for row_num in 1..max_row_num + 1 {
for row_num in (self.metasheet_skip_rows + 1)..max_row_num + 1 {
let entries = sheet.get_collection_by_row(&row_num); // sorting not necessarily by col number
let entry_map = entries
.into_iter()
@@ -571,6 +593,8 @@ mod tests {
fallback: fallback.clone(),
datasheet: None,
metasheet: None,
token_annos: vec![],
metasheet_skip_rows: 0,
};
let importer = ReadFrom::Xlsx(importer);
let path = Path::new("./tests/data/import/xlsx/clean/xlsx/");
@@ -679,6 +703,8 @@ mod tests {
fallback: None,
datasheet: None,
metasheet: None,
token_annos: vec![],
metasheet_skip_rows: 0,
};
let importer = ReadFrom::Xlsx(importer);
let path = Path::new("./tests/data/import/xlsx/dirty/xlsx/");
@@ -712,6 +738,8 @@ mod tests {
fallback: None,
datasheet: None,
metasheet: None,
token_annos: vec![],
metasheet_skip_rows: 0,
};
let importer = ReadFrom::Xlsx(importer);
let path = Path::new("./tests/data/import/xlsx/warnings/xlsx/");
@@ -789,6 +817,8 @@ mod tests {
fallback: Some("tok".to_string()),
datasheet: None,
metasheet: None,
token_annos: vec![],
metasheet_skip_rows: 0,
};
let importer = ReadFrom::Xlsx(importer);
let path = Path::new("./tests/data/import/xlsx/clean/xlsx/");
@@ -879,6 +909,8 @@ mod tests {
fallback: None,
datasheet: None,
metasheet: Some(SheetAddress::Name("meta".to_string())),
token_annos: vec![],
metasheet_skip_rows: 0,
};
let importer = ReadFrom::Xlsx(importer);
let path = Path::new("./tests/data/import/xlsx/clean/xlsx/");