code cleanup and proofread of docs

metatensor · Jan 18, 2024 · deb9adf · deb9adf
1 parent 8469927
commit deb9adf
Show file tree

Hide file tree

Showing 14 changed files with 338 additions and 294 deletions.
diff --git a/docs/src/dev-docs/utils/index.rst b/docs/src/dev-docs/utils/index.rst
@@ -10,3 +10,4 @@ This is the API for the ``utils`` module of ``metatensor-models``.
    readers/index
    writers
    model-io
+   omegaconf
diff --git a/docs/src/dev-docs/utils/omegaconf.rst b/docs/src/dev-docs/utils/omegaconf.rst
@@ -0,0 +1,10 @@
+Custom omegaconf functions
+==========================
+
+Resolvers to handle special fields in our configs as well as the expansion/completion of
+the dataset section.
+
+.. automodule:: metatensor.models.utils.omegaconf
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/src/dev-docs/utils/readers/index.rst b/docs/src/dev-docs/utils/readers/index.rst
@@ -1,5 +1,5 @@
-General Structure and Target data Readers
-=========================================
+Structure and Target data Readers
+=================================
 
 The main entry point for reading structure and target information are the two reader
 functions

diff --git a/docs/src/getting-started/custom_dataset_conf.rst b/docs/src/getting-started/custom_dataset_conf.rst
@@ -1,3 +1,5 @@
+.. _dataset_conf:
+
 Customize a Dataset Configuration
 =================================
 
@@ -63,7 +65,7 @@ Understanding the YAML Block
 The ``training_set`` is divided into sections ``structures`` and ``targets``:
 
 Structures Section
-------------------
+^^^^^^^^^^^^^^^^^^
 Describes the structure data like positions and cell information.
 
 :param read_from: The file containing structure data.
@@ -76,16 +78,17 @@ A single string in this section automatically expands, using the string as the
 
 .. note::
 
-   Metatensor-models does not convert units during training or evaluation. Units are
-   necessary for MD simulations.
+   ``metatensor-models`` does not convert units during training or evaluation. Units are
+   only required if model should be used to run MD simulations.
 
 Targets Section
----------------
+^^^^^^^^^^^^^^^
 Allows defining multiple target sections, each with a unique name.
 
-- Commonly, a section named ``energy`` is defined, which is essential for MD
-  simulations.
-- For other target sections, gradients are disabled by default.
+- Commonly, a section named ``energy`` should be defined, which is essential for MD
+  simulations. For this section gradients like `forces` and `stress` are enabled by
+  default. See :ref:`energy-section` for further details on this section.
+- For other target sections, all gradients are disabled by default.
 
 Target section parameters include:
 
@@ -105,8 +108,8 @@ A single string in a target section automatically expands, using the string as t
 
 .. _gradient-section:
 
-Gradient Sections
------------------
+Gradient Section
+^^^^^^^^^^^^^^^^
 Each gradient section (like ``forces`` or ``stress``) has similar parameters:
 
 :param read_from: The file for gradient data.
@@ -115,8 +118,10 @@ Each gradient section (like ``forces`` or ``stress``) has similar parameters:
 
 Sections set to ``true`` or ``on`` automatically expand with default parameters.
 
+.. _energy-section:
+
 Energy Section
---------------
+^^^^^^^^^^^^^^
 The ``energy`` section is mandatory for MD simulations, with forces and stresses enabled
 by default.
 
@@ -126,4 +131,4 @@ by default.
 
 .. note::
 
-   Metatensor-models ignores unknown keys in these sections during dataset parsing.
+   Unknown keys are ignored and not deleted in all sections during dataset parsing.
diff --git a/docs/src/getting-started/usage.rst b/docs/src/getting-started/usage.rst
@@ -66,3 +66,7 @@ The sub-command to evaluate a already trained model is
 .. literalinclude:: ../../../examples/usage.sh
     :language: bash
     :lines: 9-
+
+
+In the next tutorials we show how adjust the dataset section of ``options.yaml`` file
+to use it for your own datasets.
diff --git a/src/metatensor/models/cli/conf/dataset/gradient.yaml b/src/metatensor/models/cli/conf/dataset/gradient.yaml
diff --git a/src/metatensor/models/cli/conf/dataset/gradients_avail.yaml b/src/metatensor/models/cli/conf/dataset/gradients_avail.yaml
diff --git a/src/metatensor/models/cli/conf/dataset/structures.yaml b/src/metatensor/models/cli/conf/dataset/structures.yaml
diff --git a/src/metatensor/models/cli/conf/dataset/targets.yaml b/src/metatensor/models/cli/conf/dataset/targets.yaml
diff --git a/src/metatensor/models/cli/train_model.py b/src/metatensor/models/cli/train_model.py
@@ -3,7 +3,6 @@
 import logging
 import warnings
 from pathlib import Path
-from typing import Union
 
 import hydra
 import torch
@@ -14,6 +13,7 @@
 
 from .. import CONFIG_PATH
 from ..utils.model_io import save_model
+from ..utils.omegaconf import expand_dataset_config
 from .formatter import CustomHelpFormatter
 
 
@@ -71,133 +71,6 @@ def _add_train_model_parser(subparser: argparse._SubParsersAction) -> None:
     )
 
 
-def _resolve_single_str(config):
-    if isinstance(config, str):
-        return OmegaConf.create({"read_from": config})
-    else:
-        return config
-
-
-def expand_dataset_config(conf: Union[str, DictConfig]) -> DictConfig:
-    """Expands shorthand notations in a dataset configuration to their full formats.
-
-    This function takes a dataset configuration, either as a string or a DictConfig, and
-    expands it into a detailed configuration format. It processes structures, targets,
-    and gradient sections, setting default values and inferring missing information.
-    Unknown keys are ignored, allowing for flexibility.
-
-    The function performs the following steps:
-
-    - Loads base configurations for structures, targets, and gradients from predefined
-      YAML files.
-    - Merges and interpolates the input configuration with the base configurations.
-    - Expands shorthand notations like file paths or simple true/false settings to full
-      dictionary structures.
-    - Handles special cases, such as the mandatory nature of the 'energy' section for MD
-      simulations and the mutual exclusivity of 'stress' and 'virial' sections.
-    - Validates the final expanded configuration, particularly for gradient-related
-      settings, to ensure consistency and prevent conflicts during training.
-
-    :param conf: The dataset configuration, either as a file path string or a DictConfig
-        object.
-    :returns: The fully expanded dataset configuration.
-    :raises ValueError: If both ``virial`` and ``stress`` sections are enabled in the
-        'energy' target, as this is not permissible for training.
-    """
-
-    conf_path = CONFIG_PATH / "dataset"
-    base_conf_structures = OmegaConf.load(conf_path / "structures.yaml")
-    base_conf_target = OmegaConf.load(conf_path / "targets.yaml")
-    base_conf_gradients_avail = OmegaConf.load(conf_path / "gradients_avail.yaml")
-    base_conf_gradient = OmegaConf.load(conf_path / "gradient.yaml")
-
-    known_gradient_keys = list(base_conf_gradients_avail.keys())
-
-    # merge confif to get default configs for energies and other target config.
-    base_conf_target = OmegaConf.merge(base_conf_target, base_conf_gradients_avail)
-    base_conf_energy = base_conf_target.copy()
-    base_conf_energy["forces"] = base_conf_gradient.copy()
-    base_conf_energy["stress"] = base_conf_gradient.copy()
-
-    if isinstance(conf, str):
-        read_from = conf
-        conf = OmegaConf.create(
-            {"structures": read_from, "targets": {"energy": read_from}}
-        )
-
-    if type(conf["structures"]) is str:
-        conf["structures"] = _resolve_single_str(conf["structures"])
-
-    conf["structures"] = OmegaConf.merge(base_conf_structures, conf["structures"])
-
-    if conf["structures"]["file_format"] is None:
-        conf["structures"]["file_format"] = Path(conf["structures"]["read_from"]).suffix
-
-    for target_key, target in conf["targets"].items():
-        if type(target) is str:
-            target = _resolve_single_str(target)
-
-        # Add default gradients "energy" target section
-        if target_key == "energy":
-            # For special case of the "energy" we add the section for force and stress
-            # gradient by default
-            target = OmegaConf.merge(base_conf_energy, target)
-        else:
-            target = OmegaConf.merge(base_conf_target, target)
-
-        if target["key"] is None:
-            target["key"] = target_key
-
-        # Update DictConfig to allow for config node interpolation of variables like
-        # "read_from"
-        conf["targets"][target_key] = target
-
-        # Check with respect to full config `conf` to avoid errors of node interpolation
-        if conf["targets"][target_key]["file_format"] is None:
-            conf["targets"][target_key]["file_format"] = Path(
-                conf["targets"][target_key]["read_from"]
-            ).suffix
-
-        # merge and interpolate possible present gradients with default gradient config
-        for gradient_key, gradient_conf in conf["targets"][target_key].items():
-            if gradient_key in known_gradient_keys:
-                if gradient_key is True:
-                    gradient_conf = base_conf_gradient.copy()
-                elif type(gradient_key) is str:
-                    gradient_conf = _resolve_single_str(gradient_conf)
-
-                if isinstance(gradient_conf, DictConfig):
-                    gradient_conf = OmegaConf.merge(base_conf_gradient, gradient_conf)
-
-                    if gradient_conf["key"] is None:
-                        gradient_conf["key"] = gradient_key
-
-                    conf["targets"][target_key][gradient_key] = gradient_conf
-
-        # If user sets the virial gradient and leaves the stress section untouched,
-        # we disable the by default enabled stress gradient section.
-        base_stress_gradient_conf = base_conf_gradient.copy()
-        base_stress_gradient_conf["key"] = "stress"
-
-        if (
-            target_key == "energy"
-            and conf["targets"][target_key]["virial"]
-            and conf["targets"][target_key]["stress"] == base_stress_gradient_conf
-        ):
-            conf["targets"][target_key]["stress"] = False
-
-        if (
-            conf["targets"][target_key]["stress"]
-            and conf["targets"][target_key]["virial"]
-        ):
-            raise ValueError(
-                f"Cannot perform training with respect to virials and stress as in "
-                f"section {target_key}. Set either `virials: off` or `stress: off`."
-            )
-
-    return conf
-
-
 @hydra.main(config_path=str(CONFIG_PATH), version_base=None)
 def train_model(options: DictConfig) -> None:
     """Train an atomistic machine learning model using configurations provided by Hydra.
@@ -301,7 +174,8 @@ def train_model(options: DictConfig) -> None:
     logger.info("Run training")
     output_dir = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir
 
-    # HACK: Avoid passing a Subset which we can not handle yet. We now
+    # HACK: Avoid passing a Subset which we can not handle yet. For now we pass
+    # the complete training set even though it was split before...
     if isinstance(train_dataset, torch.utils.data.Subset):
         model = architecture.train(
             train_dataset=train_dataset.dataset,

diff --git a/src/metatensor/models/utils/data/readers/readers.py b/src/metatensor/models/utils/data/readers/readers.py
@@ -132,7 +132,7 @@ def read_targets(conf: DictConfig) -> Dict[str, TensorMap]:
     """Reading all target information from a fully expanded config.
 
     To get such a config you can use
-    :func:`metatensor.models.cli.train_model.expand_dataset_config`.
+    :func:`metatensor.models.utils.omegaconf.expand_dataset_config`.
 
     This function uses subfunctions like :func:`read_energy` to parse the requested
     target quantity. Currently only `energy` is a supported target property. But, within