Merge pull request #96 from wilhelm-lab/feature/config_reconstruction

Read input/output paths from config and restructure it
wilhelm-lab · Jul 14, 2023 · a241348 · a241348
2 parents 901b092 + f596737
commit a241348
Show file tree

Hide file tree

Showing 16 changed files with 259 additions and 230 deletions.
diff --git a/Makefile b/Makefile
@@ -22,7 +22,7 @@ build: dependencies
 
 run_oktoberfest: rm_err_file
 	$(DOCKER_CMD) \
-		$(IMAGE) python3 -u -m oktoberfest --search_dir $(LOCAL_DIR) --config_path $(LOCAL_DIR)/config.json || (echo "2" > $(DATA)err.out; exit 2)
+		$(IMAGE) python3 -u -m oktoberfest --config_path $(LOCAL_DIR)/config.json || (echo "2" > $(DATA)err.out; exit 2)
 
 compress: run_oktoberfest
 	zip -j -r -9 "$(DATA)/results.zip" "$(DATA)/results/" || (echo "3" > $(DATA)err.out; exit 3)
@@ -31,7 +31,7 @@ all: compress
 
 
 run_local:
-	python3 -u -m oktoberfest --search_dir "$(DATA)" --config_path $(DATA)/config.json
+	python3 -u -m oktoberfest --config_path $(DATA)/config.json
 
 clean_data_folder:
 	bash -c "rm -rf $(DATA)/{proc,msms,results,mzML,msms.prosit,err.out,results.zip}"
diff --git a/MakefileShared b/MakefileShared
@@ -14,7 +14,7 @@ else
 endif
 
 DOCKER_CMD ?= docker run -i ${USE_TTY} \
-					-v "$(DATA)":/root/data/
+					-v "$(realpath ${DATA})":/root/data/
 					--memory=$(MEMORY_LIMIT) \
 					--cpus=$(CPU_LIMIT) \
 

diff --git a/README.rst b/README.rst
@@ -110,48 +110,47 @@ Configuration
 
 Create a `config.json` file which should contain the following flags:
 
-- `jobType` = "CollisionEnergyAlignment", "SpectralLibraryGeneration" or "Rescoring"
+- `type` = "CollisionEnergyAlignment", "SpectralLibraryGeneration" or "Rescoring"
 - `tag` = "tmt", "tmtpro", "itraq4" or "itraq8"; default is ""
 - `fdr_estimation_method` = method used for FDR estimation on PSM and peptide level: "percolator" or "mokapot"; default = "mokapot"
-- allFeatures = True if all features should be used for FDR estimation; default = False
+- `allFeatures` = True if all features should be used for FDR estimation; default = False
 - `regressionMethod` = regression method for curve fitting (mapping from predicted iRT values to experimental retention times): "lowess", "spline" or "logistic"; default = "lowess"
-- `fileUploads``
-   - `search_type``
-      This refers to the search engine that was used to produce the search result file(s).
-      Valid keys: "Maxquant", "Msfragger", "Mascot" or "Internal"; default = "Maxquant"
-   - `raw_type`
-      "thermo" or "mzml"; default = "thermo"
+- `inputs`
+   - `search_results` = path to the msms.txt (if the search type is msfragger, then the path to the xlsx file should be provided)
+   - `search_results_type` = "Maxquant", "Msfragger", "Mascot" or "Internal"; default = "Maxquant"
+   - `spectra` = path to the spectra files (raw or mzml)
+   - `spectra_type` = "raw" or "mzml"; default = "raw"
 - `models`
-   - `intensity`
-      intensity model
-   - `irt`
-      irt model
+   - `intensity` = intensity model
+   - `irt` = irt model
 - `prediction_server` = server for obtaining peptide property predictions
+- `ssl` = Use ssl when making requests to the prediction server, can be true or false; default = true
 - `numThreads` = number of raw files processed in parallel processes; default = 1
-- `searchPath` = path to the search file (if the search type is msfragger, then the path to the xlsx file should be provided); default = ""
 - `thermoExe` = path to ThermoRawFileParser executable; default "ThermoRawFileParser.exe"
+- `output` = path to the output folder; if not provided the current working directory will be used.
 
 For `prediction_server`, you should use the `koina <https://koina.proteomicsdb.org/>`_ instance we provide at `koina.proteomicsdb.org:443`.
 For models, you should choose the models that fit your use case. You can see available models for the prediction server we offer at `https://koina.proteomicsdb.org/docs`.
 For a list of currently tested models, check the "Supported Models" section below.
 
 The following flags are relevant only for SpectralLibraryGeneration:
 
+- `inputs`
+   - `library_input` = path to the FASTA or peptides file
+   - `library_input_type` = library input type: "fasta" or "peptides"
 - `outputFormat` = "spectronaut" or "msp"
-- `fasta` = path to the FASTA file, if FASTA file is provided
-- `peptides.csv` = true if you like to provide the list of peptides
 
 The following flags are relevant only if a FASTA file is provided:
 
 - `fastaDigestOptions`
-  - `fragmentation` = fragmentation method: "HCD" or "CID"
-  - `digestion` = digestion mode: "full", "semi" or None; default = "full"
-  - `cleavages` = number of allowed missed cleavages used in the search engine; default = 2
-  - `minLength` = minimum peptide length allowed used in the search engine; default = 7
-  - `maxLength` = maximum peptide length allowed used in the search engine; default = 60
-  - `enzyme` = type of enzyme used in the search engine; default = "trypsin"
-  - `specialAas` = special amino acids used by MaxQuant for decoy generation; default = "KR"
-  - `db` = "target", "decoy" or "concat"; default = "concat"
+   - `fragmentation` = fragmentation method: "HCD" or "CID"
+   - `digestion` = digestion mode: "full", "semi" or None; default = "full"
+   - `cleavages` = number of allowed missed cleavages used in the search engine; default = 2
+   - `minLength` = minimum peptide length allowed used in the search engine; default = 7
+   - `maxLength` = maximum peptide length allowed used in the search engine; default = 60
+   - `enzyme` = type of enzyme used in the search engine; default = "trypsin"
+   - `specialAas` = special amino acids used by MaxQuant for decoy generation; default = "KR"
+   - `db` = "target", "decoy" or "concat"; default = "concat"
 
 An example of the config file can be found in `/oktoberfest/example_config.json`.
 
@@ -162,32 +161,30 @@ The general command for executing any job is:
 
 .. code-block:: bash
 
-   python oktoberfest/run_oktoberfest.py --search_dir path_to_search_dir --config_path path_to_config_file
-
-Note: The ``search_dir`` should contain both the raw files and the search results that fit the specified ``search_type`` in the config, e.g., ``msms.txt`` for MaxQuant.
+   python oktoberfest/run_oktoberfest.py --config_path path_to_config_file
 
 If you instead want to run oktoberfest using the docker image, run:
 
 .. code-block:: bash
 
    DATA=path/to/data/dir make run_oktoberfest
 
-Note: ``DATA`` must be the absolute path to your data folder. It should contain the raw files, the search results that fit the specified ``search_type`` in the config, e.g., ``msms.txt`` for MaxQuant, and the ``config.json``. The results will be written to ``<DATA>/results/percolator``.
+Note: When running with docker, `DATA` must contain the spectra, the search results that fit the specified `search_type` in the config (e.g. `msms.txt` for MaxQuant), and a `config.json` file with the configuration. The results will be written to `<DATA>/<output>/results/percolator`.
 
 Supported Models
 ----------------
 
 This is the list of currently supported and tested models for peptide property prediction provided by `koina.proteomicsdb.org`:
 
 - Intensity models:
-  - Prosit_2019_intensity
-  - Prosit_2020_intensity_HCD
-  - Prosit_2020_intensity_CID
-  - Prosit_2020_intensity_TMT
+   - Prosit_2019_intensity
+   - Prosit_2020_intensity_HCD
+   - Prosit_2020_intensity_CID
+   - Prosit_2020_intensity_TMT
 
 - iRT models:
-  - Prosit_2019_irt
-  - Prosit_2020_irt_TMT
+   - Prosit_2019_irt
+   - Prosit_2020_irt_TMT
 
 Once support for additional models is added, they will be added here.
 

diff --git a/ReadMe.md b/ReadMe.md
@@ -82,7 +82,7 @@ Oktoberfest will:
 
 Create a `config.json` file which should contain the following flags:
 
--   `jobType` = "CollisionEnergyAlignment", "SpectralLibraryGeneration" or "Rescoring"
+-   `type` = "CollisionEnergyAlignment", "SpectralLibraryGeneration" or "Rescoring"
 
 -   `tag` = "tmt", "tmtpro", "itraq4" or "itraq8"; default is ""
 
@@ -92,11 +92,15 @@ Create a `config.json` file which should contain the following flags:
 
 -   `regressionMethod` = regression method for curve fitting (mapping from predicted iRT values to experimental retention times): "lowess", "spline" or "logistic"; default = "lowess"
 
--   `fileUploads`
+-   `inputs`
 
-    -   `search_type` = "Maxquant", "Msfragger", "Mascot" or "Internal"; default = "Maxquant"
+    -   `search_results` = path to the msms.txt (if the search type is msfragger, then the path to the xlsx file should be provided)
 
-    -   `raw_type` = "thermo" or "mzml"; default = "thermo"
+    -   `search_results_type` = "Maxquant", "Msfragger", "Mascot" or "Internal"; default = "Maxquant"
+
+    -   `spectra` = path to the spectra files (raw or mzml)
+
+    -   `spectra_type` = "raw" or "mzml"; default = "raw"
 
 -   `models`
 
@@ -110,21 +114,23 @@ Create a `config.json` file which should contain the following flags:
 
 -   `numThreads` = number of raw files processed in parallel processes; default = 1
 
--   `searchPath` = path to the search file (if the search type is msfragger, then the path to the xlsx file should be provided); default = ""
-
 -   `thermoExe` = path to ThermoRawFileParser executable; default "ThermoRawFileParser.exe"
 
+-   `output` = path to the output folder; if not provided the current working directory will be used.
+
 For `prediction_server`, you should use the koina (https://koina.proteomicsdb.org/) instance we provide at koina.proteomicsdb.org:443.
 For models, you should choose the models that fit your use case. You can see available models for the prediction server we offer at https://koina.proteomicsdb.org/docs.
 For a list of currently tested models, check the "Supported Models" section below.
 
 The following flags are relevant only for SpectralLibraryGeneration:
 
--   `outputFormat` = "spectronaut" or "msp"
+-   `inputs`
+
+    -   `library_input` = path to the FASTA or peptides file
 
--   `fasta` = path to the FASTA file, if FASTA file is provided
+    -   `library_input_type` = library input type: "fasta" or "peptides"
 
--   `peptides.csv` = true if you like to provide the list of peptides
+-   `outputFormat` = "spectronaut" or "msp"
 
 The following flags are relevant only if a FASTA file is provided:
 
@@ -153,18 +159,16 @@ An example of the config file can be found in `/oktoberfest/example_config.json`
 The general command for executing any job is:
 
 ```bash
-python oktoberfest/run_oktoberfest.py —-search_dir path_to_search_dir —-config_path path_to_config_file
+python oktoberfest/run_oktoberfest.py --config_path path_to_config_file
 ```
 
-Note: The `search_dir` should contain both the raw files and the search results that fit the specified `search_type` in the config, e.g. `msms.txt` for MaxQuant.
-
 If you instead want to run oktoberfest using the docker image, run:
 
 ```bash
 DATA=path/to/data/dir make run_oktoberfest
 ```
 
-Note: `DATA` must be the absolute path to your data folder. It should contain the raw files, the search results that fit the specified `search_type` in the config, e.g. `msms.txt` for MaxQuant and the `config.json`. The results will be written to `<DATA>/results/percolator`.
+Note: When running with docker, `DATA` must contain the spectra, the search results that fit the specified `search_type` in the config (e.g. `msms.txt` for MaxQuant), and a `config.json` file with the configuration. The results will be written to `<DATA>/<output>/results/percolator`.
 
 ## Supported Models
 

diff --git a/data/plasma/ReadMe.md b/data/plasma/ReadMe.md
@@ -19,7 +19,7 @@ Note: If ThermoRawFileParser.exe is not residing in the base directory, you need
 In case you have installed oktoberfest in a docker container, please execute
 
 ```
-DATA=$(realpath data/plasma)/ make run_oktoberfest
+DATA=data/plasma make run_oktoberfest
 ```
 
 from the base directory (ThermoRawFileParser.exe is installed in the base directory automatically).
diff --git a/data/plasma/config.json b/data/plasma/config.json
@@ -2,17 +2,18 @@
     "jobType": "Rescoring",
     "tag": "",
     "allFeatures": false,
-    "fileUploads": {
+    "inputs": {
+        "search_results": "./msms.txt",
         "search_type": "Maxquant",
-        "raw_type": "thermo",
-        "fasta": false,
-        "peptides.csv": true
+        "spectra": "./",
+        "spectra_type": "raw"
     },
     "models": {
         "intensity": "Prosit_2020_intensity_HCD",
         "irt": "Prosit_2019_irt"
     },
     "prediction_server": "koina.proteomicsdb.org:443",
     "ssl": true,
-    "thermoExe": "ThermoRawFileParser.exe"
+    "thermoExe": "ThermoRawFileParser.exe",
+    "output": "./out"
 }
diff --git a/docs/usage.rst b/docs/usage.rst
@@ -19,18 +19,17 @@ Example config file:
     task_config_ce_calibration = {
         "type": "CollisionEnergyCalibration",
         "tag": "",
-        "output": "path_to_output_folder",
+        "output": "./out",
         "inputs": {
-            "search_results": "path_to_msms",
+            "search_results": "./msms.txt",
             "search_type": "Maxquant",
-            "spectra": "path_to_spectra_files",
+            "spectra": "./",
             "spectra_type": "raw"
         },
         "models": {
             "intensity": "Prosit_2020_intensity_HCD",
             "irt": "Prosit_2019_irt"
         },
-        "outputFormat": "",
         "prediction_server": "koina.proteomicsdb.org:443",
         "regressionMethod": "lowess",
         "ssl": True,
@@ -54,11 +53,11 @@ Example config file:
     task_config_spectral_lib = {
         "type": "SpectralLibraryGeneration",
         "tag": "",
-        "output": "path_to_output_folder",
+        "output": "./out",
         "inputs": {
-            "search_results": "path_to_msms",
+            "search_results": "./msms.txt",
             "search_type": "Maxquant",
-            "library_input": "path_to_peptides_csv,
+            "library_input": "./peptides.csv",
             "library_input_type": "peptides"
         },
         "models": {
@@ -101,20 +100,19 @@ Example config file:
     task_config_rescoring = {
         "type": "Rescoring",
         "tag": "",
-        "output": "path_to_output_folder",
+        "output": "./out",
         "inputs": {
-            "search_results": "path_to_msms",
+            "search_results": "./msms.txt",
             "search_type": "Maxquant",
-            "spectra": "path_to_spectra_files",
+            "spectra": "./",
             "spectra_type": "raw"
         },
         "models": {
             "intensity": "Prosit_2020_intensity_HCD",
             "irt": "Prosit_2019_irt"
         },
-        "outputFormat": "",
         "prediction_server": "koina.proteomicsdb.org:443",
-        "numThreads": 4,
+        "numThreads": 1,
         "fdr_estimation_method": "mokapot",
         "allFeatures": False,
         "regressionMethod": "lowess",

diff --git a/example_config.json b/example_config.json
@@ -1,25 +1,19 @@
 {
-    "jobType": "Rescoring",
+    "type": "Rescoring",
     "tag": "",
     "allFeatures": false,
-    "fileUploads": {
+    "inputs": {
+        "search_results": "./msms.txt",
         "search_type": "Maxquant",
-        "raw_type": "thermo",
-        "fasta": false,
-        "peptides.csv": true
-    },
-    "fastaDigestOptions": {
-        "collisionEnergy": false,
-        "protease": false,
-        "missedCleavages": false,
-        "oxidizedMethionine": false,
-        "charges": false
+        "spectra": "./",
+        "spectra_type": "raw"
     },
     "models": {
         "intensity": "Prosit_2020_intensity_HCD",
         "irt": "Prosit_2019_irt"
     },
-    "spectralLibraryOutputFormat": "msp",
+    "output": "./out",
+    "outputFormat": "",
     "prediction_server": "koina.proteomicsdb.org:443",
     "ssl": true,
     "numThreads": 1,