Merge pull request #271 from wilhelm-lab/release/0.8.0

Release/0.8.0
wilhelm-lab · Oct 7, 2024 · 1ff82c0 · 1ff82c0
2 parents 9d56659 + 0005103
commit 1ff82c0
Show file tree

Hide file tree

Showing 56 changed files with 124,055 additions and 2,909 deletions.
diff --git a/.cookietemple.yml b/.cookietemple.yml
@@ -15,5 +15,5 @@ full_name: Victor Giurcoiu
 email: [email protected]
 project_name: oktoberfest
 project_short_description: Public repo oktoberfest
-version: 0.7.0
+version: 0.8.0
 license: MIT
diff --git a/.flake8 b/.flake8
@@ -8,5 +8,6 @@ per-file-ignores =
 	tests/*:S101
 	**/__init__.py:F401,F403
     	docs/conf.py:S404,S607,S603
-	oktoberfest/runner.py:S301,S403
+	oktoberfest/runner.py:C901,S301,S403
+    oktoberfest/predict/dlomix.py:E402
 docstring_style = sphinx
diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml
@@ -1,5 +1,5 @@
-name-template: "0.7.0 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
-tag-template: 0.7.0 # <<COOKIETEMPLE_FORCE_BUMP>>
+name-template: "0.8.0 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
+tag-template: 0.8.0 # <<COOKIETEMPLE_FORCE_BUMP>>
 exclude-labels:
     - "skip-changelog"
 

diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
@@ -121,6 +121,7 @@ jobs:
               with:
                   name: coverage-data
                   path: ".coverage.*"
+                  include-hidden-files: "true"
 
             - name: Upload documentation
               if: matrix.session == 'docs-build'
@@ -136,10 +137,10 @@ jobs:
             - name: Check out the repository
               uses: actions/checkout@v4
 
-            - name: Set up Python 3.8
+            - name: Set up Python 3.9
               uses: actions/setup-python@v5
               with:
-                  python-version: 3.8
+                  python-version: 3.9
 
             - name: Install Poetry
               run: |

diff --git a/.gitignore b/.gitignore
@@ -51,6 +51,7 @@ coverage.xml
 *.py,cover
 .hypothesis/
 .pytest_cache/
+/unit_tests/data/quantification
 
 # Translations
 *.mo
@@ -149,3 +150,9 @@ tutorials/
 
 # example data
 data/
+
+# Machine learning artifacts
+wandb/
+
+# doctest IO files
+tests/doctests/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -40,7 +40,7 @@ repos:
             entry: pyupgrade
             language: system
             types: [python]
-            args: [--py38-plus]
+            args: [--py39-plus, --keep-runtime-typing]
           - id: trailing-whitespace
             name: Trim Trailing Whitespace
             entry: trailing-whitespace-fixer

diff --git a/cookietemple.cfg b/cookietemple.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.7.0
+current_version = 0.8.0
 
 [bumpversion_files_whitelisted]
 init_file = oktoberfest/__init__.py

diff --git a/docs/API.rst b/docs/API.rst
@@ -20,6 +20,7 @@ Preprocessing: :code:`pp`
 .. currentmodule:: oktoberfest
 
 Generating libraries
+~~~~~~~~~~~~~~~~~~~~
 
 .. autosummary::
    :toctree: api/pp
@@ -31,6 +32,7 @@ Generating libraries
    pp.annotate_spectral_library
 
 Spectra preprocessing
+~~~~~~~~~~~~~~~~~~~~~
 
 .. autosummary::
    :toctree: api/pp
@@ -42,6 +44,7 @@ Spectra preprocessing
 
 
 Peptide preprocessing
+~~~~~~~~~~~~~~~~~~~~~
 
 .. autosummary::
    :toctree: api/pp
@@ -57,33 +60,43 @@ Peptide preprocessing
 
 Predicting: :code:`pr`
 ----------------------
+.. TODO
+    add full class documentation through autosummary
 
 .. module:: oktoberfest.pr
 
 .. currentmodule:: oktoberfest
 
-Access to functions that communicate with a Koina server to retrieve predictions from various prediction models.
+Access to functions that interface with either a Koina server to retrieve predictions from various prediction models, or DLomix to serve pre-trained models and perform refinement learning on them locally.
 
-High level features
-~~~~~~~~~~~~~~~~~~~
+High-level prediction runner
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autosummary::
-   :toctree: api/pr
+    :recursive:
+    :toctree: api/pr
 
-   pr.predict_intensities
-   pr.predict_rt
-   pr.ce_calibration
+    pr.Predictor
 
 Koina interface
 ~~~~~~~~~~~~~~~
 
 .. autosummary::
-   :toctree: api/pr
+    :recursive:
+    :toctree: api/pr
 
-   pr.predict
-   pr.predict_at_once
-   pr.predict_in_chunks
+    pr.Koina
 
+DLomix interface
+~~~~~~~~~~~~~~~~
+
+.. autosummary::
+    :recursive:
+    :toctree: api/pr
+
+    pr.DLomix
+    pr.create_dlomix_dataset
+    pr.refine_intensity_predictor
 
 Rescoring: :code:`re`
 ---------------------

diff --git a/docs/_static/custom_cookietemple.css b/docs/_static/custom_cookietemple.css
@@ -75,6 +75,29 @@ table.align-default {
     padding-left: 50px;
 }
 
+.lib-refinement-learning-config-table
+    tbody
+    tr:nth-child(n + 2):nth-child(-n + 5)
+    td:nth-child(1),
+.lib-refinement-learning-config-table tbody tr:nth-child(8) td:nth-child(1) {
+    padding-left: 50px;
+}
+
+.lib-refinement-learning-config-table
+    tbody
+    tr:nth-child(n + 6):nth-child(-n + 7)
+    td:nth-child(1),
+.lib-refinement-learning-config-table
+    tbody
+    tr:nth-child(n + 9):nth-child(-n + 10)
+    td:nth-child(1) {
+    padding-left: 100px;
+}
+
+.rescore-config-table tbody tr:last-child td:first-child {
+    padding-left: 50px;
+}
+
 .date {
     font-size: 50%;
 }
diff --git a/docs/conf.py b/docs/conf.py
@@ -33,6 +33,7 @@
     "sphinx_autodoc_typehints",
     "sphinx.ext.intersphinx",
     "sphinx_click",
+    "sphinx.ext.autosectionlabel",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
@@ -54,9 +55,9 @@
 # the built documents.
 #
 # The short X.Y version.
-version = "0.7.0"
+version = "0.8.0"
 # The full version, including alpha/beta/rc tags.
-release = "0.7.0"
+release = "0.8.0"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -70,9 +71,6 @@
 # This patterns also effect to html_static_path and html_extra_path
 exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = "sphinx"
-
 # If true, `todo` and `todoList` produce output, else they produce nothing.
 todo_include_todos = False
 
@@ -93,6 +91,9 @@
 #
 html_theme = "sphinx_rtd_theme"
 
+# The names of the Pygments (syntax highlighting) styles to use.
+html_theme_options = {"pygment_light_style": "default", "pygment_dark_style": "lightbulb"}
+
 # Theme options are theme-specific and customize the look and feel of a
 # theme further.  For a list of options available for each theme, see the
 # documentation.
@@ -250,3 +251,6 @@ def modurl(qualname):
 # and there’s no way to insert filters into those templates
 # so we have to modify the default filters
 DEFAULT_FILTERS["modurl"] = modurl
+
+# -- Options for autosectionlabel mappings -----------------------------
+autosectionlabel_prefix_document = True
diff --git a/docs/config.rst b/docs/config.rst
@@ -1,7 +1,7 @@
 Configuration
 =============
 
-The following provides an overview of all available flags in the configuration file to use the high level API and run jobs. Parameters may be applicable to more than one job type and are collected within indivdual tables.
+The following provides an overview of all available flags in the configuration file to use the high-level API and run jobs. Parameters may be applicable to more than one job type and are collected within individual tables.
 
 Always applicable
 -----------------
@@ -18,7 +18,7 @@ Always applicable
    +----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    | models                     | Contains information about the used models for peptide property prediction (see following 2 nested parameters)                                                                                                                                                                             |
    +----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-   |     intensity              | Name of the model used for fragment intensity prediction                                                                                                                                                                                                                                   |
+   |     intensity              | Name or path of the model used for fragment intensity prediction                                                                                                                                                                                                                           |
    +----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    |     irt                    | Name of the model used for indexed retention time prediction                                                                                                                                                                                                                               |
    +----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
@@ -71,7 +71,7 @@ Applicable to rescoring
 -----------------------
 
 .. table::
-   :class: fixed-table
+   :class: fixed-table rescore-config-table
 
    +----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    | Parameter                  |                             Description                                                                                                                                                       |
@@ -82,6 +82,12 @@ Applicable to rescoring
    +----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    | add_feature_cols           | Additional columns to be used as percolator/mokapot input features; Can be "all" for all additional columns in provided internal search results or a list of column names; default = "none"   |
    +----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+   | quantification             | (Optional) If True, run picked-group-FDR for quantification. This also requires in-silico digestion options (see "Applicable to in-silico digestion") and a fasta input.                      |
+   +----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+   | inputs                     | Contains information about the fasta file (only needed if quantification is True).                                                                                                            |
+   +----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+   |     library_input          | Path to fasta file for in-silico digestion (also see the required parameters for in-silico digestion above)                                                                                   |
+   +----------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 
 Applicable to spectral library generation
 -----------------------------------------
@@ -140,3 +146,45 @@ Applicable to in-silico digestion
    +----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    |     db                     | Defines whether the digestion should contain only targets, only decoys or both (concatenated); can be "target", "decoy" or "concat"; default = "concat"            |
    +----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+
+Applicable to local intensity prediction
+----------------------------------------
+
+.. table::
+   :class: fixed-table
+
+   +--------------------------+---------------------------------------------------+
+   | Parameter                | Description                                       |
+   +==========================+===================================================+
+   | dlomixInferenceBatchSize | Batch size to use for local inference with DLomix |
+   +--------------------------+---------------------------------------------------+
+
+Applicable to transfer/refinement learning
+------------------------------------------
+
+.. table::
+   :class: fixed-table lib-refinement-learning-config-table
+
+   +------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+   | Parameter                          |                             Description                                                                                                                            |
+   +====================================+====================================================================================================================================================================+
+   | refinementLearningOptions          | Contains specific settings for local refinement learning of intensity predictor on provided spectra. If not present, no refinement learning will be performed.     |
+   +------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+   |     batchSize                      | Defines batch size to use for training; default = 1024                                                                                                             |
+   +------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+   |     includeOriginalSequences       | Defines whether unmodified peptide sequences should be kept in processed DLomix dataset for downstream analysis; default = False                                   |
+   +------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+   |     improveFurther                 | Defines whether to perform an additional third training phase during refinement learning to further improve the predictor; default = False.                        |
+   +------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+   |     wandbOptions                   | Contains specific settings for using WandB when doing refinement learning. If not present, WandB will not be used.                                                 |
+   +------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+   |         project                    | Project to save WandB run to; default = "DLomix_auto_RL_TL"                                                                                                        |
+   +------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+   |         targets                    | Tags to use for WandB run; default = None                                                                                                                          |
+   +------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+   |     datasetFilteringOptions        | Contains specific settings for filtering the refinement/transfer learning dataset. If not provided, will only remove decoys.                                       |
+   +------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+   |         searchEngineScoreThreshold | Threshold for included peptides, everything below will be discarded.                                                                                               |
+   +------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+   |         numDuplicates              | Number of (peptide, charge, collision energy) duplicates to include.                                                                                               |
+   +------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+