From 7ca27b4e021c27cb158d839e497879224333a7d1 Mon Sep 17 00:00:00 2001
From: Alan Christie <alan.christie@matildapeak.com>
Date: Tue, 8 Oct 2024 13:07:57 +0200
Subject: [PATCH] fix: Fix standardise with one file (and improved standardise
 DOC)

---
 README.md                                     | 27 ++++++++++++++++---
 ansible/group_vars/all.yaml                   |  2 +-
 .../standardise/tasks/standardise-file.yaml   |  2 +-
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index f758f61..c2b1314 100644
--- a/README.md
+++ b/README.md
@@ -325,9 +325,30 @@ intensive fragmentation step.
 The process consists of three steps, described below:
 
 ### Standardisation
-It's easier to run playbooks using a YAML-based parameter file,
-where all the control variables can be set. A typical parameter file
-(`parameters`) might look like this: -
+Standardisation "normalises" the customer SMILES into a format suitable for
+on-going processing. It relies on files located in an S3 bucket as
+described above in **Configuring the S3 Directory Structure**.
+
+Using Ansible variables: -
+
+-   `unpacker` identifies the task file (in `roles/standardise/tasks`) that
+    will be used to unpack the raw data files. A value of `decompress-gz-all`
+    will use the logic defined in the task file `unpack-raw-decompress-gz-all.yaml`.
+-   `standinputfile` identifies the names of the decompressed files to be standardised,
+    and is typically a glob-style wildcard pattern like `s*.cxsmiles`.
+-   `standardiser` identifies the Python module in the project `frag/standardise/scripts`
+    directory that will interpret the raw data and produce the standardised file
+    (based on the files collected by the `standinputfile` filename filter).
+    It is responsible for parsing the decompressed input files, so the module and
+    the file format must be compatible.
+
+These values are provided to the standardise play using a parameter file in the
+`roles/standardise/vars` directory. Its name is based on the vendor/library name.
+For example, the `xchem_dsip` library uses `xchem_dsip-variables.yaml`.
+
+It's easier to run all our playbooks using a YAML-based parameter file,
+where all the main control variables can be set. A typical parameter file
+(`parameters.yaml`) might look like this: -
 
 ```yaml
 ---
diff --git a/ansible/group_vars/all.yaml b/ansible/group_vars/all.yaml
index dad564f..42b46ce 100644
--- a/ansible/group_vars/all.yaml
+++ b/ansible/group_vars/all.yaml
@@ -28,7 +28,7 @@ clean_finish: no
 # Blank assumes no registry (i.e. 'docker.io')
 nextflow_container_registry: ''
 nextflow_container_name: informaticsmatters/fragmentor
-nextflow_container_tag: '2.0.0'
+nextflow_container_tag: '2.0.1'
 
 # Memory requested by the 'sort' processes in the nextflow workflow
 # (namely fragmentation and combination). Where it's actually used
diff --git a/ansible/roles/standardise/tasks/standardise-file.yaml b/ansible/roles/standardise/tasks/standardise-file.yaml
index fb86577..78e6f6c 100644
--- a/ansible/roles/standardise/tasks/standardise-file.yaml
+++ b/ansible/roles/standardise/tasks/standardise-file.yaml
@@ -114,7 +114,7 @@
         -w {{ nextpath }}
         -with-report {{ standpath }}/standardise/standardise_nextflow_report.html
         --script {{ standardiser }}
-        --inputs {{ datapath }}/data/{{ vendor }}/"{{ standinputfile }}"
+        --inputs {{ datapath }}/data/{{ vendor }}/{{ standinputfile }}
         --out_dir {{ standpath }}/standardise
         --chunk_size {{ standchunksize|int }}
         --compound_id_prefix {{ standard_compound_id_prefix }}