diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3f8cc511..5bcbc123 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   enable/disable the ID column.
 - Importers that map directories to (sub)-corpora and files to documents can
   now also import the corpus if the `path` argument points to a single file.
+- `xlsx` importer now maps annotation columns as spans unless the column is
+  listed in the `token_annos` configuration.
 
 ### Changed
 
diff --git a/src/exporter/xlsx.rs b/src/exporter/xlsx.rs
index b9a9e143..fc014f23 100644
--- a/src/exporter/xlsx.rs
+++ b/src/exporter/xlsx.rs
@@ -7,7 +7,7 @@ use anyhow::anyhow;
 use graphannis::{graph::GraphStorage, model::AnnotationComponentType, AnnotationGraph};
 use graphannis_core::{
     annostorage::{NodeAnnotationStorage, ValueSearch},
-    graph::ANNIS_NS,
+    graph::{ANNIS_NS, NODE_TYPE_KEY},
     types::{AnnoKey, Component, NodeID},
     util::join_qname,
 };
@@ -76,14 +76,23 @@ fn is_span_column(
     node_annos: &dyn NodeAnnotationStorage,
     token_helper: &TokenHelper,
 ) -> anyhow::Result<bool> {
-    // Check that none of the nodes having this key are token
+    // Check that none of the nodes having this key are tokens and that there is at least one non-corpus node.
+    // Document metadata and annotations inside documents could share the same
+    // annotation names, but we only want to include the ones that are used as
+    // annotations in a document.
+    let mut has_non_corpus_match = false;
     for m in node_annos.exact_anno_search(Some(&anno_key.ns), &anno_key.name, ValueSearch::Any) {
         let m = m?;
         if token_helper.is_token(m.node)? {
             return Ok(false);
         }
+        if let Some(node_type) = node_annos.get_value_for_item(&m.node, &NODE_TYPE_KEY)? {
+            if node_type == "node" {
+                has_non_corpus_match = true;
+            }
+        }
     }
-    Ok(true)
+    Ok(has_non_corpus_match)
 }
 
 fn overwritten_position_for_key(
@@ -136,6 +145,29 @@ impl ExportXlsx {
             worksheet.remove_column_by_index(&1, &1);
         }
 
+        // Add metadata sheet
+        let meta_annos = g.get_node_annos().get_annotations_for_item(&doc_node_id)?;
+        if !meta_annos.is_empty() {
+            let meta_sheet = book.new_sheet("meta").map_err(|e| anyhow!(e))?;
+            meta_sheet.insert_new_row(&1, &2);
+            meta_sheet.get_cell_mut((&1, &1)).set_value_string("Name");
+            meta_sheet.get_cell_mut((&2, &1)).set_value_string("Value");
+
+            let mut current_row = 2;
+            for a in meta_annos {
+                if a.key.ns != ANNIS_NS {
+                    meta_sheet.insert_new_row(&current_row, &2);
+                    meta_sheet
+                        .get_cell_mut((&1, &current_row))
+                        .set_value_string(join_qname(&a.key.ns, &a.key.name));
+                    meta_sheet
+                        .get_cell_mut((&2, &current_row))
+                        .set_value_string(a.val);
+                    current_row += 1;
+                }
+            }
+        }
+
         let output_path = output_path.join(format!("{}.xlsx", doc_name));
         umya_spreadsheet::writer::xlsx::write(&book, output_path)?;
 
@@ -558,4 +590,69 @@ mod tests {
         let it = graphannis::aql::execute_query_on_graph(&written_graph, &q, false, None).unwrap();
         assert_eq!(1, it.count());
     }
+
+    #[test]
+    fn with_meta() {
+        let importer: ImportSpreadsheet = toml::from_str(
+            r#"
+            column_map = {"tok" = ["lb"]}
+            metasheet = "meta"
+            metasheet_skip_rows = 1
+            "#,
+        )
+        .unwrap();
+        let exporter = ExportXlsx::default();
+
+        // Import an example document
+        let path = Path::new("./tests/data/import/xlsx/sample_sentence/");
+        let importer = crate::ReadFrom::Xlsx(importer);
+        let orig_import_step = ImporterStep {
+            module: importer,
+            path: path.to_path_buf(),
+        };
+        let mut updates = orig_import_step.execute(None).unwrap();
+        let mut original_graph =
+            AnnotationGraph::with_default_graphstorages(false).unwrap();
+        original_graph.apply_update(&mut updates, |_| {}).unwrap();
+
+        // Export to Excel file and read it again
+        let tmp_outputdir = TempDir::new().unwrap();
+        let output_dir = tmp_outputdir.path().join("sample_sentence");
+        std::fs::create_dir(&output_dir).unwrap();
+        let exporter = crate::WriteAs::Xlsx(exporter);
+        let export_step = ExporterStep {
+            module: exporter,
+            path: output_dir.clone(),
+        };
+        export_step.execute(&original_graph, None).unwrap();
+
+        let importer: ImportSpreadsheet = toml::from_str(
+            r#"
+            column_map = {"tok" = ["lb"]}
+            metasheet = "meta"
+            metasheet_skip_rows = 1
+            "#,
+        )
+        .unwrap();
+        let second_import_step = ImporterStep {
+            module: crate::ReadFrom::Xlsx(importer),
+            path: output_dir.clone(),
+        };
+        let mut updates = second_import_step.execute(None).unwrap();
+
+        let mut written_graph = AnnotationGraph::with_default_graphstorages(false).unwrap();
+        written_graph.apply_update(&mut updates, |_| {}).unwrap();
+
+        let q = graphannis::aql::parse("Author=\"Unknown\" _ident_ annis:doc", false).unwrap();
+        let it = graphannis::aql::execute_query_on_graph(&written_graph, &q, false, None).unwrap();
+        assert_eq!(1, it.count());
+
+        let q = graphannis::aql::parse("Year=\"2024\" _ident_ annis:doc", false).unwrap();
+        let it = graphannis::aql::execute_query_on_graph(&written_graph, &q, false, None).unwrap();
+        assert_eq!(1, it.count());
+
+        // The header should not be imported
+        let q = graphannis::aql::parse("Name _ident_ annis:doc", false).unwrap();
+        let it = graphannis::aql::execute_query_on_graph(&written_graph, &q, false, None).unwrap();
+        assert_eq!(0, it.count());
+    }
 }
diff --git a/src/importer/mod.rs b/src/importer/mod.rs
index 0669fd42..2b70060b 100644
--- a/src/importer/mod.rs
+++ b/src/importer/mod.rs
@@ -17,6 +17,7 @@ pub mod xml;
 
 use crate::{workflow::StatusSender, StepID};
 use graphannis::update::GraphUpdate;
+use percent_encoding::{AsciiSet, CONTROLS};
 use std::path::Path;
 
 /// An importer is a module that takes a path and produces a list of graph update events.
@@ -40,3 +41,24 @@ pub trait Importer: Sync {
 
     fn file_extensions(&self) -> &[&str];
 }
+
+/// An encoding set for node names.
+///
+/// This disallows `:` to avoid any possible ambiguities with the `::` annotation
+/// match separator. `/` is disallowed so this separator can be used to build
+/// hierarchical node IDs, which also simplifies using node names as file names.
+/// Spaces ` ` are encoded to avoid problems with annotation names in the AQL syntax.
+/// Since node names might be used as file names, all reserved characters for
+/// Windows file names are encoded as well.
+pub const NODE_NAME_ENCODE_SET: &AsciiSet = &CONTROLS
+    .add(b':')
+    .add(b'/')
+    .add(b' ')
+    .add(b'%')
+    .add(b'\\')
+    .add(b'<')
+    .add(b'>')
+    .add(b'"')
+    .add(b'|')
+    .add(b'?')
+    .add(b'*');
diff --git a/src/importer/relannis.rs b/src/importer/relannis.rs
index 36d087d9..075dabc2 100644
--- a/src/importer/relannis.rs
+++ b/src/importer/relannis.rs
@@ -1,11 +1,11 @@
 use crate::progress::ProgressReporter;
 
-use super::Importer;
+use super::{Importer, NODE_NAME_ENCODE_SET};
 use anyhow::{anyhow, Result};
 use documented::{Documented, DocumentedFields};
 use graphannis::model::AnnotationComponentType;
 use graphannis::update::{GraphUpdate, UpdateEvent};
-use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};
+use percent_encoding::utf8_percent_encode;
 use serde::{Deserialize, Serialize};
 use struct_field_names_as_array::FieldNamesAsSlice;
 
@@ -88,27 +88,6 @@ impl Importer for ImportRelAnnis {
     }
 }
 
-/// An encoding set for node names.
-///
-/// This disallows `:` to avoid any possible ambiguities with the `::` annotation
-/// match seperator. `/` disallowed so this separator can be used to build
-/// hierarchical node IDs and simplifies using node names as file names.
-/// Spaces ` ` are encoded to avoid problems with annotation names in the AQL syntax.
-/// Since node names might be used as file names, all reserved charactes for
-/// Windows file names are encoded as well.
-pub const NODE_NAME_ENCODE_SET: &AsciiSet = &CONTROLS
-    .add(b':')
-    .add(b'/')
-    .add(b' ')
-    .add(b'%')
-    .add(b'\\')
-    .add(b'<')
-    .add(b'>')
-    .add(b'"')
-    .add(b'|')
-    .add(b'?')
-    .add(b'*');
-
 const TOK_WHITESPACE_BEFORE: &str = "tok-whitespace-before";
 const TOK_WHITESPACE_AFTER: &str = "tok-whitespace-after";
 
diff --git a/src/importer/xlsx.rs b/src/importer/xlsx.rs
index bb69ff88..e7517764 100644
--- a/src/importer/xlsx.rs
+++ b/src/importer/xlsx.rs
@@ -1,5 +1,5 @@
 use std::{
-    collections::{BTreeMap, BTreeSet},
+    collections::{BTreeMap, BTreeSet, HashSet},
     fmt::Display,
     path::Path,
 };
@@ -14,12 +14,15 @@ use graphannis_core::{
     util::split_qname,
 };
 use itertools::Itertools;
+use percent_encoding::utf8_percent_encode;
 use serde_derive::Deserialize;
 use struct_field_names_as_array::FieldNamesAsSlice;
 use umya_spreadsheet::Cell;
 
 use super::Importer;
-use crate::{error::AnnattoError, progress::ProgressReporter, util, StepID};
+use crate::{
+    error::AnnattoError, importer::NODE_NAME_ENCODE_SET, progress::ProgressReporter, util, StepID,
+};
 use documented::{Documented, DocumentedFields};
 
 /// Imports Excel Spreadsheets where each line is a token, the other columns are
@@ -59,6 +62,12 @@ pub struct ImportSpreadsheet {
     /// Optional value of the Excel sheet that contains the metadata table. If
     /// not given, no metadata is imported.
     metasheet: Option<SheetAddress>,
+    /// Skip the given number of rows at the beginning of the metadata sheet.
+    #[serde(default)]
+    metasheet_skip_rows: u32,
+    /// Map the given annotation columns as token annotations and not as spans if possible.
+    #[serde(default)]
+    token_annos: Vec<String>,
 }
 
 #[derive(Debug, Deserialize, PartialEq)]
@@ -178,6 +187,8 @@ impl ImportSpreadsheet {
                 })?;
                 Ok::<(), AnnattoError>(())
             })?;
+
+        let token_annos: HashSet<String> = self.token_annos.clone().into_iter().collect();
         for (tok_name, anno_names) in &fullmap {
             let mut names = if tok_name.is_empty() {
                 vec![]
@@ -219,8 +230,19 @@ impl ImportSpreadsheet {
                     let base_token_end = *end_row_excl as usize - 2;
                     let overlapped_base_tokens: &[String] =
                         &base_tokens[base_token_start..base_token_end]; // TODO check indices
-                    let node_name =
-                        format!("{}#{}_{}-{}", &doc_path, tok_name, start_row, end_row_excl);
+
+                    let node_name = if token_annos.contains(name) {
+                        format!("{}#{}_{}-{}", &doc_path, tok_name, start_row, end_row_excl)
+                    } else {
+                        format!(
+                            "{}#span_{}_{}-{}",
+                            &doc_path,
+                            utf8_percent_encode(name, NODE_NAME_ENCODE_SET),
+                            start_row,
+                            end_row_excl
+                        )
+                    };
+
                     update.add_event(UpdateEvent::AddNode {
                         node_name: node_name.to_string(),
                         node_type: "node".to_string(),
@@ -342,7 +364,7 @@ impl ImportSpreadsheet {
         update: &mut GraphUpdate,
     ) -> Result<(), AnnattoError> {
         let max_row_num = sheet.get_highest_row(); // 1-based
-        for row_num in 1..max_row_num + 1 {
+        for row_num in (self.metasheet_skip_rows + 1)..max_row_num + 1 {
             let entries = sheet.get_collection_by_row(&row_num); // sorting not necessarily by col number
             let entry_map = entries
                 .into_iter()
@@ -571,6 +593,8 @@ mod tests {
             fallback: fallback.clone(),
             datasheet: None,
             metasheet: None,
+            token_annos: vec![],
+            metasheet_skip_rows: 0,
         };
         let importer = ReadFrom::Xlsx(importer);
         let path = Path::new("./tests/data/import/xlsx/clean/xlsx/");
@@ -679,6 +703,8 @@ mod tests {
             fallback: None,
             datasheet: None,
             metasheet: None,
+            token_annos: vec![],
+            metasheet_skip_rows: 0,
         };
         let importer = ReadFrom::Xlsx(importer);
         let path = Path::new("./tests/data/import/xlsx/dirty/xlsx/");
@@ -712,6 +738,8 @@ mod tests {
             fallback: None,
             datasheet: None,
             metasheet: None,
+            token_annos: vec![],
+            metasheet_skip_rows: 0,
         };
         let importer = ReadFrom::Xlsx(importer);
         let path = Path::new("./tests/data/import/xlsx/warnings/xlsx/");
@@ -789,6 +817,8 @@ mod tests {
             fallback: Some("tok".to_string()),
             datasheet: None,
             metasheet: None,
+            token_annos: vec![],
+            metasheet_skip_rows: 0,
         };
         let importer = ReadFrom::Xlsx(importer);
         let path = Path::new("./tests/data/import/xlsx/clean/xlsx/");
@@ -879,6 +909,8 @@ mod tests {
             fallback: None,
             datasheet: None,
             metasheet: Some(SheetAddress::Name("meta".to_string())),
+            token_annos: vec![],
+            metasheet_skip_rows: 0,
         };
         let importer = ReadFrom::Xlsx(importer);
         let path = Path::new("./tests/data/import/xlsx/clean/xlsx/");
diff --git a/src/manipulator/map.rs b/src/manipulator/map.rs
index 34be7cad..d5678b55 100644
--- a/src/manipulator/map.rs
+++ b/src/manipulator/map.rs
@@ -7,7 +7,10 @@ use std::{
 
 use super::Manipulator;
 use crate::{
     progress::ProgressReporter,
-    util::token_helper::{TokenHelper, TOKEN_KEY},
+    util::{
+        token_helper::{TokenHelper, TOKEN_KEY},
+        CorpusGraphHelper,
+    },
     StepID,
 };
 use anyhow::Context;
@@ -18,9 +21,7 @@ use graphannis::{
     update::{GraphUpdate, UpdateEvent},
     AnnotationGraph,
 };
-use graphannis_core::graph::{
-    storage::union::UnionEdgeContainer, ANNIS_NS, DEFAULT_NS, NODE_NAME_KEY, NODE_TYPE_KEY,
-};
+use graphannis_core::graph::{ANNIS_NS, DEFAULT_NS, NODE_NAME_KEY, NODE_TYPE_KEY};
 use regex::Regex;
 use serde_derive::Deserialize;
 use struct_field_names_as_array::FieldNamesAsSlice;
@@ -94,7 +95,27 @@ use struct_field_names_as_array::FieldNamesAsSlice;
 /// number. In this case, the node values are concatenated using a space as
 /// separator.
 ///
-
+/// You can also apply a set of rules repeatedly. By default, the rule set is
+/// only executed once. But you can configure
+/// ```toml
+/// repetition = {Fixed = {n = 3}}
+///
+/// [[rules]]
+/// # ...
+/// ```
+/// at the beginning to set a fixed number of repetitions (in this case `3`).
+/// An even more advanced usage is to apply the changes until none of the
+/// queries in the rules matches anymore.
+/// ```toml
+/// repetition = "UntilUnchanged"
+///
+/// [[rules]]
+/// # ...
+/// ```
+/// Make sure that the updates in the rules actually change the condition of the
+/// rule, otherwise you might get an endless loop and the workflow will never
+/// finish!
+///
 #[derive(Deserialize, Documented, DocumentedFields, FieldNamesAsSlice)]
 #[serde(deny_unknown_fields)]
 pub struct MapAnnos {
@@ -110,6 +131,8 @@ impl Manipulator for MapAnnos {
         step_id: StepID,
         tx: Option<StatusSender>,
     ) -> Result<(), Box<dyn std::error::Error>> {
+        let progress = ProgressReporter::new_unknown_total_work(tx.clone(), step_id.clone())?;
+
         let read_from_path = {
             let p = Path::new(&self.rule_file).to_path_buf();
             if p.is_relative() {
@@ -119,39 +142,16 @@ impl Manipulator for MapAnnos {
             }
         };
         let config = read_config(read_from_path.as_path())?;
-        let progress = ProgressReporter::new(tx.clone(), step_id.clone(), config.rules.len())?;
         progress.info("Ensure all graph storages are loaded.")?;
         graph.ensure_loaded_all()?;
 
-        let mut updates = {
-            let tok_helper = TokenHelper::new(graph)?;
-            let all_part_of_gs: Vec<_> = graph
-                .get_all_components(Some(AnnotationComponentType::PartOf), None)
-                .into_iter()
-                .filter_map(|c| graph.get_graphstorage(&c))
-                .collect();
-            let all_part_of_edge_container: Vec<_> = all_part_of_gs
-                .iter()
-                .map(|gs| gs.as_edgecontainer())
-                .collect();
-            let part_of_gs = UnionEdgeContainer::new(all_part_of_edge_container);
-            let mut map_impl = MapperImpl {
-                config,
-                added_spans: 0,
-                graph,
-                part_of_gs,
-                tok_helper,
-                progress,
-            };
-            map_impl.run()?
+        let mut map_impl = MapperImpl {
+            config,
+            added_spans: 0,
+            progress,
         };
-        let progress = ProgressReporter::new_unknown_total_work(tx, step_id)?;
-        graph.apply_update(&mut updates, move |msg| {
-            if let Err(e) = progress.info(&format!("`map` updates: {msg}")) {
-                log::error!("{e}");
-            }
-        })?;
+        map_impl.run(graph)?;
 
         Ok(())
     }
@@ -163,9 +163,25 @@ fn read_config(path: &Path) -> Result<Mapping, Box<dyn std::error::Error>> {
     Ok(m)
 }
 
+#[derive(Debug, Deserialize)]
+enum RepetitionMode {
+    /// Repeat applying the rules n times.
+    Fixed { n: usize },
+    /// Repeat applying the rules until no changes are made.
+    UntilUnchanged,
+}
+
+impl Default for RepetitionMode {
+    fn default() -> Self {
+        Self::Fixed { n: 1 }
+    }
+}
+
 #[derive(Deserialize, Debug)]
 struct Mapping {
     rules: Vec<Rule>,
+    #[serde(default)]
+    repetition: RepetitionMode,
 }
 
 #[derive(Clone, Deserialize, Debug)]
@@ -261,53 +277,89 @@ impl Rule {
     }
 }
 
-struct MapperImpl<'a> {
+struct MapperImpl {
     config: Mapping,
     added_spans: usize,
-    graph: &'a AnnotationGraph,
-    part_of_gs: UnionEdgeContainer<'a>,
-    tok_helper: TokenHelper<'a>,
+
     progress: ProgressReporter,
 }
 
-impl<'a> MapperImpl<'a> {
-    fn run(&mut self) -> anyhow::Result<GraphUpdate> {
-        let mut update = GraphUpdate::default();
+impl MapperImpl {
+    fn run(&mut self, graph: &mut AnnotationGraph) -> anyhow::Result<()> {
+        match self.config.repetition {
+            RepetitionMode::Fixed { n } => {
+                for i in 0..n {
+                    self.progress.info(&format!(
+                        "Applying rule set of `map` module run {}/{n}",
+                        i + 1
+                    ))?;
+                    self.apply_ruleset(graph)?;
+                }
+            }
+            RepetitionMode::UntilUnchanged => {
+                let mut run_nr = 1;
+                loop {
+                    self.progress
+                        .info(&format!("Applying rule set of `map` module run {run_nr}"))?;
+                    let new_update_size = self.apply_ruleset(graph)?;
+                    if new_update_size > 0 {
+                        self.progress.info(&format!("Added {new_update_size} updates because of rules, repeating to apply all rules until no updates are generated."))?;
+
+                        run_nr += 1;
+                    } else {
+                        break;
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+
+    fn apply_ruleset(&mut self, graph: &mut AnnotationGraph) -> anyhow::Result<usize> {
+        let mut updates = GraphUpdate::default();
         for rule in self.config.rules.clone() {
-            self.progress
-                .info(&format!("Applying rule with query `{}`", &rule.query))?;
             let query = graphannis::aql::parse(&rule.query, false)
                 .with_context(|| format!("could not parse query '{}'", &rule.query))?;
-            let result_it =
-                graphannis::aql::execute_query_on_graph(self.graph, &query, true, None)?;
+            let result_it = graphannis::aql::execute_query_on_graph(graph, &query, true, None)?;
+            let mut n = 0;
             for match_group in result_it {
                 let match_group = match_group?;
                 match rule.target {
                     TargetRef::Node(target) => {
-                        self.map_single_node(&rule, target, &match_group, &mut update)?;
+                        self.map_single_node(&rule, target, &match_group, graph, &mut updates)?;
                     }
                     TargetRef::Span(ref all_targets) => {
-                        self.map_span(&rule, all_targets, &match_group, &mut update)?;
+                        self.map_span(&rule, all_targets, &match_group, graph, &mut updates)?;
                     }
                 }
+                n += 1;
             }
-
-            self.progress.worked(1)?;
+            self.progress.info(&format!(
+                "Rule with query `{}` matched {n} time(s).",
+                &rule.query
+            ))?;
         }
-        Ok(update)
+        let number_of_updates = updates.len()?;
+        if number_of_updates > 0 {
+            graph.apply_update(&mut updates, |msg| {
+                if let Err(e) = self.progress.info(&format!("`map` updates: {msg}")) {
+                    log::error!("{e}");
+                }
+            })?;
+        }
+        Ok(number_of_updates)
     }
-
     fn map_single_node(
         &self,
         rule: &Rule,
         target: usize,
         match_group: &[Match],
+        graph: &AnnotationGraph,
         update: &mut GraphUpdate,
     ) -> anyhow::Result<()> {
         if let Some(m) = match_group.get(target - 1) {
-            let match_node_name = self
-                .graph
+            let match_node_name = graph
                 .get_node_annos()
                 .get_value_for_item(&m.node, &NODE_NAME_KEY)?
.context("Missing node name for matched node")?; @@ -315,7 +367,7 @@ impl<'a> MapperImpl<'a> { node_name: match_node_name.to_string(), anno_ns: rule.ns.to_string(), anno_name: rule.name.to_string(), - anno_value: rule.resolve_value(self.graph, match_group)?, + anno_value: rule.resolve_value(graph, match_group)?, })?; } Ok(()) @@ -326,8 +378,11 @@ impl<'a> MapperImpl<'a> { rule: &Rule, targets: &[usize], match_group: &[Match], + graph: &AnnotationGraph, update: &mut GraphUpdate, ) -> anyhow::Result<()> { + let tok_helper = TokenHelper::new(graph)?; + let corpusgraph_helper = CorpusGraphHelper::new(graph); if let Some(first_match) = targets .first() .copied() @@ -337,17 +392,16 @@ impl<'a> MapperImpl<'a> { let mut covered_token = BTreeSet::new(); for t in targets { if let Some(n) = match_group.get(t - 1) { - if self.tok_helper.is_token(n.node)? { + if tok_helper.is_token(n.node)? { covered_token.insert(n.node); } else { - covered_token.extend(self.tok_helper.covered_token(n.node)?); + covered_token.extend(tok_helper.covered_token(n.node)?); } } } // Determine the new node name by extending the node name of the first target - let first_node_name = self - .graph + let first_node_name = graph .get_node_annos() .get_value_for_item(&first_match.node, &NODE_NAME_KEY)? .context("Missing node name")?; @@ -363,14 +417,16 @@ impl<'a> MapperImpl<'a> { node_name: new_node_name.clone(), anno_ns: rule.ns.to_string(), anno_name: rule.name.to_string(), - anno_value: rule.resolve_value(self.graph, match_group)?, + anno_value: rule.resolve_value(graph, match_group)?, })?; // Add the new node to the common parent - if let Some(parent_node) = self.part_of_gs.get_outgoing_edges(first_match.node).next() { + if let Some(parent_node) = corpusgraph_helper + .get_outgoing_edges(first_match.node) + .next() + { let parent_node = parent_node?; - let parent_node_name = self - .graph + let parent_node_name = graph .get_node_annos() .get_value_for_item(&parent_node, &NODE_NAME_KEY)? .context("Missing node name for parent node")?; @@ -384,8 +440,7 @@ impl<'a> MapperImpl<'a> { } // Add the coverage edges to the covered tokens for t in covered_token { - let token_node_name = self - .graph + let token_node_name = graph .get_node_annos() .get_value_for_item(&t, &NODE_NAME_KEY)? .context("Missing node name for covered token")?; @@ -414,6 +469,8 @@ mod tests { AnnotationGraph, }; use graphannis_core::{annostorage::ValueSearch, graph::ANNIS_NS}; + + use pretty_assertions::assert_eq; use tempfile::NamedTempFile; use crate::{manipulator::Manipulator, test_util, util::example_generator, StepID}; @@ -604,6 +661,92 @@ replacements = [ assert_eq!("ellembogem|ellenbogem|ellembogen|ellenbogen", result); } + #[test] + fn repeat_mapping_fixed() { + let config = r#" +repetition = {Fixed = {n = 3}} + +[[rules]] +query = "tok" +target = 1 +ns = "annis" +name = "tok" + +[rules.value] +target = 1 +# Only replace the last character of each token. 
+replacements = [
+    ['(\w\u0304?)X*$', 'X'],
+]
+    "#;
+        let mut g = tokens_with_macrons().unwrap();
+
+        let tmp = NamedTempFile::new().unwrap();
+
+        std::fs::write(tmp.path(), config).unwrap();
+        let mapper = MapAnnos {
+            rule_file: tmp.path().to_path_buf(),
+        };
+        let step_id = StepID {
+            module_name: "test_map".to_string(),
+            path: None,
+        };
+        mapper
+            .manipulate_corpus(&mut g, tmp.path().parent().unwrap(), step_id, None)
+            .unwrap();
+
+        let th = TokenHelper::new(&g).unwrap();
+
+        let tokens = th.get_ordered_token("doc", None).unwrap();
+        let text = th.spanned_text(&tokens).unwrap();
+
+        // The rule is applied three times, so the last 3 characters of each
+        // token should have been replaced.
+        assert_eq!("X krX wechX etX anðthaX ellēbX hX", text);
+    }
+
+    #[test]
+    fn repeat_mapping_until_unchanged() {
+        let config = r#"
+repetition = "UntilUnchanged"
+
+[[rules]]
+query = 'tok!="X"'
+target = 1
+ns = "annis"
+name = "tok"
+
+[rules.value]
+target = 1
+replacements = [
+    ['[^X]X*$', 'X'],
+]
+    "#;
+        let mut g = tokens_with_macrons().unwrap();
+
+        let tmp = NamedTempFile::new().unwrap();
+
+        std::fs::write(tmp.path(), config).unwrap();
+        let mapper = MapAnnos {
+            rule_file: tmp.path().to_path_buf(),
+        };
+        let step_id = StepID {
+            module_name: "test_map".to_string(),
+            path: None,
+        };
+        mapper
+            .manipulate_corpus(&mut g, tmp.path().parent().unwrap(), step_id, None)
+            .unwrap();
+
+        let th = TokenHelper::new(&g).unwrap();
+
+        let tokens = th.get_ordered_token("doc", None).unwrap();
+        let text = th.spanned_text(&tokens).unwrap();
+
+        // The rule is applied until all characters have been replaced.
+        assert_eq!("X X X X X X X", text);
+    }
+
     #[test]
     fn test_map_spans() {
         let mut updates = GraphUpdate::new();
@@ -776,6 +919,7 @@ value = "PROPN"
         Ok(g)
     }
 
+    /// Create tokens "ein kraut wechſzt etwan anðthalbē ellēbogē hoch".
     fn tokens_with_macrons() -> Result<AnnotationGraph, Box<dyn std::error::Error>> {
         let mut g = AnnotationGraph::with_default_graphstorages(true)?;
         let mut u = GraphUpdate::default();
@@ -806,6 +950,13 @@ value = "PROPN"
             anno_name: "tok".to_string(),
             anno_value: text.to_string(),
         })?;
+        u.add_event(UpdateEvent::AddEdge {
+            source_node: format!("doc#t{i}"),
+            target_node: "doc".to_string(),
+            layer: ANNIS_NS.to_string(),
+            component_type: AnnotationComponentType::PartOf.to_string(),
+            component_name: "".to_string(),
+        })?;
         if i > 0 {
             u.add_event(UpdateEvent::AddEdge {
                 source_node: format!("doc#t{i}"),
diff --git a/tests/data/import/xlsx/sample_sentence/doc1.xlsx b/tests/data/import/xlsx/sample_sentence/doc1.xlsx
index 5137b8ad..b3d780b5 100644
Binary files a/tests/data/import/xlsx/sample_sentence/doc1.xlsx and b/tests/data/import/xlsx/sample_sentence/doc1.xlsx differ
diff --git a/tests/snapshots/cli__module_info.snap b/tests/snapshots/cli__module_info.snap
index f264403a..e4061486 100644
--- a/tests/snapshots/cli__module_info.snap
+++ b/tests/snapshots/cli__module_info.snap
@@ -11,36 +11,38 @@ spans and merged cells can be used for spans that cover more than one token.
 
 *Configuration*
 
-| name       | description |
-|------------|-----------------------------------------------------------------------------------------------------------|
-| column_map | Maps token columns to annotation columns. If there is more than one |
-|            | token column, it is assumed that the corpus has multiple segmentations. |
-|            | In this case, it is necessary to tell the importer which annotation column belongs to which token column. |
| -| | | -| | Example with the two token columns "dipl" and "norm": | -| | | -| | ```toml | -| | [import.config] | -| | column_map = {"dipl" = ["sentence"], "norm" = ["pos", "lemma", "seg"]} | -| | ``` | -| | The column "sentence" must be always be aligned with the "dipl" token | -| | and "pos", "lemma" and "seg" are aligned with the "norm" token. | -| fallback | If given, the name of the token column to be used when there is no | -| | explicit mapping given in the `column_map` parameter for this annotation | -| | column. | -| | | -| | Example with two token columns "dipl" and "norm", where all annotation | -| | columns except "lemma" and "pos" are mapped to the "dipl" token column: | -| | | -| | ```toml | -| | [import.config] | -| | column_map = {"dipl" = [], "norm" = ["pos", "lemma"]} | -| | fallback = "dipl" | -| | ``` | -| datasheet | Optional value of the Excel sheet that contains the data. If not given, | -| | the first sheet is used. | -| metasheet | Optional value of the Excel sheet that contains the metadata table. If | -| | no metadata is imported. | +| name | description | +|---------------------|-----------------------------------------------------------------------------------------------------------| +| column_map | Maps token columns to annotation columns. If there is more than one | +| | token column, it is assumed that the corpus has multiple segmentations. | +| | In this case, it is necessary to tell the importer which annotation column belongs to which token column. | +| | | +| | Example with the two token columns "dipl" and "norm": | +| | | +| | ```toml | +| | [import.config] | +| | column_map = {"dipl" = ["sentence"], "norm" = ["pos", "lemma", "seg"]} | +| | ``` | +| | The column "sentence" must be always be aligned with the "dipl" token | +| | and "pos", "lemma" and "seg" are aligned with the "norm" token. | +| fallback | If given, the name of the token column to be used when there is no | +| | explicit mapping given in the `column_map` parameter for this annotation | +| | column. | +| | | +| | Example with two token columns "dipl" and "norm", where all annotation | +| | columns except "lemma" and "pos" are mapped to the "dipl" token column: | +| | | +| | ```toml | +| | [import.config] | +| | column_map = {"dipl" = [], "norm" = ["pos", "lemma"]} | +| | fallback = "dipl" | +| | ``` | +| datasheet | Optional value of the Excel sheet that contains the data. If not given, | +| | the first sheet is used. | +| metasheet | Optional value of the Excel sheet that contains the metadata table. If | +| | no metadata is imported. | +| metasheet_skip_rows | Skip the first given rows in the meta data sheet. | +| token_annos | Map the given annotation columns as token annotations and not as span if possible. | # Exporters @@ -64,5 +66,3 @@ spans and merged cells can be used for spans that cover more than one token. | | ``` | | | | | | Has no effect if the vector is empty. | - -