Merge pull request #289 from korpling/feature/main-memory-switch

New command line argument `--in-memory`
korpling · Aug 2, 2024 · a42cfa1 · a42cfa1
2 parents 7cd96b9 + 5567e7e
commit a42cfa1
Show file tree

Hide file tree

Showing 7 changed files with 46 additions and 93 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- New command line argument `--in-memory` that has the same meaning as setting
+  `ANNATTO_IN_MEMORY` to true but is easier to discover.
 - `map` manipulator can now add annotated spans and copy values from existing
   annotations. The copied values can be manipulated used regular expressions and
   replacement values.

diff --git a/src/bin/annatto.rs b/src/bin/annatto.rs
@@ -32,8 +32,14 @@ enum Cli {
         #[clap(value_parser)]
         workflow_file: std::path::PathBuf,
         /// Adding this argument resolves environmental variables in the provided workflow file.
-        #[structopt(long)]
+        #[clap(long)]
         env: bool,
+        /// If this argument is given, store temporary annotation graphs in main
+        /// memory instead of on disk. This is faster, but if the corpus is too
+        /// large to fit into main memory, the pipeline will fail. Can also set
+        /// by setting the environment variable `ANNATTO_IN_MEMORY` to `true`.
+        #[clap(long, env = "ANNATTO_IN_MEMORY", default_value = "false")]
+        in_memory: bool,
     },
     /// Only check if a workflow file can be loaded. Invalid workflow files will lead to a non-zero exit code.
     Validate {
@@ -93,7 +99,11 @@ pub fn main() -> anyhow::Result<()> {
 
     let args = Parser::parse();
     match args {
-        Cli::Run { workflow_file, env } => convert(workflow_file, env)?,
+        Cli::Run {
+            workflow_file,
+            env,
+            in_memory,
+        } => convert(workflow_file, env, in_memory)?,
         Cli::Validate { workflow_file } => {
             Workflow::try_from((workflow_file, false))?;
         }
@@ -107,10 +117,11 @@ pub fn main() -> anyhow::Result<()> {
 }
 
 /// Execute the conversion in the background and show the status to the user
-fn convert(workflow_file: PathBuf, read_env: bool) -> Result<(), AnnattoError> {
+fn convert(workflow_file: PathBuf, read_env: bool, in_memory: bool) -> Result<(), AnnattoError> {
     let (tx, rx) = mpsc::channel();
-    let result =
-        thread::spawn(move || execute_from_file(&workflow_file, read_env, Some(tx.clone())));
+    let result = thread::spawn(move || {
+        execute_from_file(&workflow_file, read_env, in_memory, Some(tx.clone()))
+    });
 
     let mut all_bars: HashMap<StepID, ProgressBar> = HashMap::new();
 

diff --git a/src/importer/relannis/tests.rs b/src/importer/relannis/tests.rs
@@ -329,7 +329,12 @@ fn node_by_text_entry_serializer() {
 
 #[test]
 fn parse_relannis_workflow() {
-    let r = workflow::execute_from_file(Path::new("./tests/workflows/relannis.toml"), true, None);
+    let r = workflow::execute_from_file(
+        Path::new("./tests/workflows/relannis.toml"),
+        true,
+        false,
+        None,
+    );
     // This should fail, because the input directory does not exist
     assert_eq!(true, r.is_err());
     assert_eq!(

diff --git a/src/lib.rs b/src/lib.rs
@@ -7,7 +7,6 @@ pub mod importer;
 pub mod manipulator;
 pub mod models;
 pub mod progress;
-pub mod runtime;
 #[cfg(test)]
 pub(crate) mod test_util;
 pub(crate) mod util;

diff --git a/src/manipulator/visualize.rs b/src/manipulator/visualize.rs
@@ -361,7 +361,7 @@ mod tests {
             &workflow_file,
         )
         .unwrap();
-        execute_from_file(&workflow_file, true, None).unwrap();
+        execute_from_file(&workflow_file, true, false, None).unwrap();
         let result_dot = std::fs::read_to_string(workflow_dir.path().join("test.dot")).unwrap();
         assert_snapshot!(result_dot);
     }
@@ -375,7 +375,7 @@ mod tests {
             &workflow_file,
         )
         .unwrap();
-        execute_from_file(&workflow_file, true, None).unwrap();
+        execute_from_file(&workflow_file, true, false, None).unwrap();
         let result_dot = std::fs::read_to_string(workflow_dir.path().join("test.dot")).unwrap();
         assert_snapshot!(result_dot);
     }

diff --git a/src/runtime.rs b/src/runtime.rs
diff --git a/src/workflow.rs b/src/workflow.rs
@@ -10,8 +10,8 @@ use regex::Regex;
 use serde_derive::Deserialize;
 
 use crate::{
-    error::AnnattoError, error::Result, progress::ProgressReporter, runtime, ExporterStep,
-    ImporterStep, ManipulatorStep, StepID,
+    error::AnnattoError, error::Result, progress::ProgressReporter, ExporterStep, ImporterStep,
+    ManipulatorStep, StepID,
 };
 use log::error;
 use normpath::PathExt;
@@ -113,10 +113,12 @@ impl TryFrom<(PathBuf, bool)> for Workflow {
 ///
 /// * `workflow_file` - The TOML workflow file.
 /// * `read_env` - Set whether to resolve environment variables in the workflow file.
+/// * `in_memory` - If true, use a main memory implementation to store the temporary graphs.
 /// * `tx` - If supported by the caller, this is a sender object that allows to send [status updates](enum.StatusMessage.html) (like information messages, warnings and module progress) to the calling entity.
 pub fn execute_from_file(
     workflow_file: &Path,
     read_env: bool,
+    in_memory: bool,
     tx: Option<Sender<StatusMessage>>,
 ) -> Result<()> {
     let wf = Workflow::try_from((workflow_file.to_path_buf(), read_env))?;
@@ -125,7 +127,7 @@ pub fn execute_from_file(
     } else {
         Path::new("")
     };
-    wf.execute(tx, parent_dir)?;
+    wf.execute(tx, parent_dir, in_memory)?;
     Ok(())
 }
 
@@ -136,6 +138,7 @@ impl Workflow {
         &self,
         tx: Option<StatusSender>,
         default_workflow_directory: &Path,
+        in_memory: bool,
     ) -> Result<()> {
         // Create a vector of all conversion steps and report these as current status
         let apply_update_step_id = StepID {
@@ -172,9 +175,17 @@ impl Workflow {
         // Create a new empty annotation graph and apply updates
         let apply_update_reporter =
             ProgressReporter::new_unknown_total_work(tx.clone(), apply_update_step_id.clone())?;
-        apply_update_reporter
-            .info("Creating annotation graph by applying the updates from the import steps")?;
-        let mut g = runtime::initialize_graph(&tx)?;
+        if in_memory {
+            apply_update_reporter.info(
+                "Creating in-memory annotation graph by applying the updates from the import steps",
+            )?;
+        } else {
+            apply_update_reporter.info(
+                "Creating on-disk annotation graph by applying the updates from the import steps",
+            )?;
+        }
+        let mut g = AnnotationGraph::with_default_graphstorages(!in_memory)
+            .map_err(|e| AnnattoError::CreateGraph(e.to_string()))?;
 
         // collect all updates in a single update to only have a single atomic
         // call to `apply_update`
@@ -344,6 +355,7 @@ mod tests {
         execute_from_file(
             Path::new("./tests/data/import/empty/empty.toml"),
             false,
+            false,
             None,
         )
         .unwrap();
@@ -380,6 +392,7 @@ mod tests {
         execute_from_file(
             Path::new("./tests/workflows/multiple_importer.toml"),
             false,
+            false,
             None,
         )
         .unwrap();
@@ -394,6 +407,7 @@ mod tests {
         execute_from_file(
             Path::new("./tests/workflows/nonexisting_dir.toml"),
             true,
+            false,
             None,
         )
         .unwrap();