From 7ca27b4e021c27cb158d839e497879224333a7d1 Mon Sep 17 00:00:00 2001 From: Alan Christie Date: Tue, 8 Oct 2024 13:07:57 +0200 Subject: [PATCH] fix: Fix standardise with one file (and improved standardise DOC) --- README.md | 27 ++++++++++++++++--- ansible/group_vars/all.yaml | 2 +- .../standardise/tasks/standardise-file.yaml | 2 +- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index f758f61..c2b1314 100644 --- a/README.md +++ b/README.md @@ -325,9 +325,30 @@ intensive fragmentation step. The process consists of three steps, described below: ### Standardisation -It's easier to run playbooks using a YAML-based parameter file, -where all the control variables can be set. A typical parameter file -(`parameters`) might look like this: - +Standardisation "normalises" the customer SMILES into a format suitable for +on-going processing. It relies on files located in an S3 bucket as +described above in **Configuring the S3 Directory Structure**. + +Using Ansible variables: - + +- `unpacker` identifies the task file (in `roles/standardise/tasks`) that + will be used to unpack the raw data files. A value of `decompress-gz-all` + will use the logic defined in the task file `unpack-raw-decompress-gz-all.yaml`. +- `standinputfile` identifies the names of the decompressed files to be standardised, + and is typically a glob-style filename pattern like `s*.cxsmiles`. +- `standardiser` identifies the Python module in the project `frag/standardise/scripts` + directory that will interpret the raw data and produce the standardised file + (based on the files collected by the `standinputfile` filename filter). + It is responsible for parsing the decompressed input files, so the module and + the input file format must be compatible. + +These values are provided to the standardise play using a parameter file in the +`roles/standardise/vars` directory. Its name is based on the vendor/library name. +For example, the `xchem_dsip` library uses `xchem_dsip-variables.yaml`. 
+ +It's easier to run all our playbooks using a YAML-based parameter file, +where all the main control variables can be set. A typical parameter file +(`parameters.yaml`) might look like this: - ```yaml --- diff --git a/ansible/group_vars/all.yaml b/ansible/group_vars/all.yaml index dad564f..42b46ce 100644 --- a/ansible/group_vars/all.yaml +++ b/ansible/group_vars/all.yaml @@ -28,7 +28,7 @@ clean_finish: no # Blank assumes no registry (i.e. 'docker.io') nextflow_container_registry: '' nextflow_container_name: informaticsmatters/fragmentor -nextflow_container_tag: '2.0.0' +nextflow_container_tag: '2.0.1' # Memory requested by the 'sort' processes in the nextflow workflow # (namely fragmentation and combination). Where it's actually used diff --git a/ansible/roles/standardise/tasks/standardise-file.yaml b/ansible/roles/standardise/tasks/standardise-file.yaml index fb86577..78e6f6c 100644 --- a/ansible/roles/standardise/tasks/standardise-file.yaml +++ b/ansible/roles/standardise/tasks/standardise-file.yaml @@ -114,7 +114,7 @@ -w {{ nextpath }} -with-report {{ standpath }}/standardise/standardise_nextflow_report.html --script {{ standardiser }} - --inputs {{ datapath }}/data/{{ vendor }}/"{{ standinputfile }}" + --inputs {{ datapath }}/data/{{ vendor }}/{{ standinputfile }} --out_dir {{ standpath }}/standardise --chunk_size {{ standchunksize|int }} --compound_id_prefix {{ standard_compound_id_prefix }}