Commit 081ac7c
Merge branch 'pr/1.6' into pr/1.6_to_main
mpvanderschelling authored Aug 15, 2024
2 parents a01ca06 + f27235a commit 081ac7c
Showing 22 changed files with 4,191 additions and 399 deletions.
1 change: 1 addition & 0 deletions .coveragerc
@@ -5,6 +5,7 @@ source = src
omit =
tests/*
**/__init__.py
src/f3dasm/_src/experimentdata/_experimental/*

[report]
# Regexes for lines to exclude from consideration
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
1.5.3
1.6.0
4 changes: 2 additions & 2 deletions docs/source/conf.py
@@ -27,8 +27,8 @@
project = 'f3dasm'
author = 'Martin van der Schelling'
copyright = '2024, Martin van der Schelling'
version = '1.5.3'
release = '1.5.3'
version = '1.6.0'
release = '1.6.0'


# -- General configuration ----------------------------------------------------
2 changes: 1 addition & 1 deletion src/f3dasm/__version__.py
@@ -1 +1 @@
__version__: str = "1.5.3"
__version__: str = "1.6.0"
63 changes: 49 additions & 14 deletions src/f3dasm/_src/design/domain.py
@@ -14,7 +14,7 @@
from dataclasses import dataclass, field
from pathlib import Path
from typing import (Any, Dict, Iterable, Iterator, List, Literal, Optional,
Sequence, Type)
Protocol, Sequence, Type)

# Third-party core
import numpy as np
@@ -36,6 +36,13 @@
# =============================================================================


class _Data(Protocol):
def to_dataframe(self) -> pd.DataFrame:
...

# =============================================================================


@dataclass
class Domain:
"""Main class for defining the domain of the design of experiments.
@@ -238,19 +245,49 @@ def from_dataframe(cls, df_input: pd.DataFrame,

return cls(space=input_space, output_space=output_space)

@classmethod
def from_data(cls: Type[Domain],
input_data: _Data, output_data: _Data) -> Domain:
"""Initializes a Domain from input and output data.
Parameters
----------
input_data : _Data
Input data.
output_data : _Data
Output data.
Returns
-------
Domain
Domain object
"""
return cls.from_dataframe(
input_data.to_dataframe(), output_data.to_dataframe())

# Export
# =============================================================================

def store(self, filename: Path) -> None:
def store(self, filename: Path, create_tmp: bool = False) -> None:
"""Stores the Domain in a pickle file.
Parameters
----------
filename : Path
Name of the file.
create_tmp : bool, optional
If True, the Domain is first written to a .tmp file, which is then
renamed to the final .pkl (defaults to False).
"""
with open(filename.with_suffix('.pkl'), 'wb') as f:
pickle.dump(self, f)
if create_tmp:
with open(filename.with_suffix('.tmp'), 'wb') as f:
pickle.dump(self, f)

# remove old file if it exists
filename.with_suffix('.pkl').unlink(missing_ok=True)

# rename the file to the correct extension
filename.with_suffix('.tmp').rename(filename.with_suffix('.pkl'))
else:
with open(filename.with_suffix('.pkl'), 'wb') as f:
pickle.dump(self, f)

def _cast_types_dataframe(self) -> dict:
"""Make a dictionary that provides the datatype of each parameter"""
@@ -645,9 +682,7 @@ def make_nd_continuous_domain(bounds: np.ndarray | List[List[float]],
return Domain(space)


def _domain_factory(domain: Domain | DictConfig | None,
input_data: pd.DataFrame,
output_data: pd.DataFrame) -> Domain:
def _domain_factory(domain: Domain | DictConfig | str | Path) -> Domain:
if isinstance(domain, Domain):
return domain

@@ -657,14 +692,14 @@ def _domain_factory(domain: Domain | DictConfig | None,
elif isinstance(domain, DictConfig):
return Domain.from_yaml(domain)

elif (input_data.empty and output_data.empty and domain is None):
return Domain()
# elif (input_data.empty and output_data.empty and domain is None):
# return Domain()

elif domain is None:
return Domain.from_dataframe(
input_data, output_data)
# elif domain is None:
# return Domain.from_dataframe(
# input_data, output_data)

else:
raise TypeError(
f"Domain must be of type Domain, DictConfig "
f"or None, not {type(domain)}")
f"Domain must be of type Domain, DictConfig, str or Path, "
f"not {type(domain)}")
9 changes: 8 additions & 1 deletion src/f3dasm/_src/experimentdata/_columns.py
@@ -18,7 +18,7 @@
from __future__ import annotations

# Standard
from typing import Dict, List, Optional
from typing import Dict, Iterable, List, Optional

# Authorship & Credits
# =============================================================================
@@ -123,3 +123,10 @@ def rename(self, old_name: str, new_name: str):
name of the column to replace with
"""
self.columns[new_name] = self.columns.pop(old_name)

def set_columnnames(self, names: Iterable[str]) -> None:
for old_name, new_name in zip(self.names, names):
self.rename(old_name, new_name)

def has_columnnames(self, names: Iterable[str]) -> bool:
return set(names).issubset(self.names)
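
The two methods added to _Columns above rename columns positionally (by zipping the current names with the new ones) and test whether a given set of names is already present. A small self-contained illustration of that behaviour, using a simplified Columns stand-in rather than the real _Columns class:

from typing import Dict, Iterable, List


class Columns:
    """Simplified stand-in for _Columns: an ordered name -> info mapping."""

    def __init__(self, columns: Dict[str, None]):
        self.columns = columns

    @property
    def names(self) -> List[str]:
        return list(self.columns.keys())

    def rename(self, old_name: str, new_name: str) -> None:
        self.columns[new_name] = self.columns.pop(old_name)

    def set_columnnames(self, names: Iterable[str]) -> None:
        # Positional rename: the i-th existing column gets the i-th new name.
        for old_name, new_name in zip(self.names, names):
            self.rename(old_name, new_name)

    def has_columnnames(self, names: Iterable[str]) -> bool:
        return set(names).issubset(self.names)


cols = Columns({'x0': None, 'x1': None})
cols.set_columnnames(['a', 'b'])
print(cols.names)                   # ['a', 'b']
print(cols.has_columnnames(['a']))  # True
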
130 changes: 23 additions & 107 deletions src/f3dasm/_src/experimentdata/_data.py
@@ -186,7 +186,8 @@ def from_file(cls, filename: Path | str) -> _Data:
return cls(df, columns=_Columns(_columns))

@classmethod
def from_numpy(cls: Type[_Data], array: np.ndarray) -> _Data:
def from_numpy(cls: Type[_Data],
array: np.ndarray, keys: Iterable[str]) -> _Data:
"""Loads the data from a numpy array.
Parameters
@@ -209,26 +210,6 @@ def from_dataframe(cls, dataframe: pd.DataFrame) -> _Data:
_columns = {name: None for name in dataframe.columns.to_list()}
return cls(dataframe, columns=_Columns(_columns))

def reset(self, domain: Optional[Domain] = None):
"""Resets the data to the initial state.
Parameters
----------
domain : Domain, optional
The domain of the experiment.
Note
----
If the domain is None, the data will be reset to an empty dataframe.
"""

if domain is None:
self.data = pd.DataFrame()
self.columns = _Columns()
else:
self.data = self.from_domain(domain).data
self.columns = self.from_domain(domain).columns

# Export
# =============================================================================

@@ -270,32 +251,7 @@ def to_dataframe(self) -> pd.DataFrame:
df.columns = self.names
return df.astype(object)

def combine_data_to_multiindex(self, other: _Data,
jobs_df: pd.DataFrame) -> pd.DataFrame:
"""Combine the data to a multiindex dataframe.
Parameters
----------
other : _Data
The other data to combine.
jobs : pd.DataFrame
The jobs dataframe.
Returns
-------
pd.DataFrame
The combined dataframe.
Note
----
This function is mainly used to show the combined ExperimentData
object in a Jupyter Notebook
"""
return pd.concat([jobs_df, self.to_dataframe(),
other.to_dataframe()],
axis=1, keys=['jobs', 'input', 'output'])

def store(self, filename: Path) -> None:
def store(self, filename: Path, create_tmp: bool = False) -> None:
"""Stores the data to a file.
Parameters
@@ -307,8 +263,19 @@ def store(self, filename: Path) -> None:
----
The data is stored as a csv file.
"""
# TODO: The column information is not saved in the .csv!
self.to_dataframe().to_csv(filename.with_suffix('.csv'))

if create_tmp:
self.to_dataframe().to_csv(filename.with_suffix('.tmp'))

# remove the old file if it exists
filename.with_suffix('.csv').unlink(missing_ok=True)

# rename the file to the correct extension
filename.with_suffix('.tmp').rename(filename.with_suffix('.csv'))

else:
# TODO: The column information is not saved in the .csv!
self.to_dataframe().to_csv(filename.with_suffix('.csv'))

def n_best_samples(self, nosamples: int,
column_name: List[str] | str) -> pd.DataFrame:
@@ -351,6 +318,7 @@ def select_columns(self, columns: Iterable[str] | str) -> _Data:
return _Data(
self.data[self.columns.iloc(columns)], columns=_selected_columns)

# TODO: Can we get rid of this method ?
def drop(self, columns: Iterable[str] | str) -> _Data:
"""Drop the selected columns from the data.
@@ -377,33 +345,6 @@ def drop(self, columns: Iterable[str] | str) -> _Data:
# Append and remove data
# =============================================================================

def add(self, data: pd.DataFrame):
try:
last_index = self.data.index[-1]
except IndexError: # Empty dataframe
self.data = data
return

new_indices = pd.RangeIndex(
start=last_index + 1, stop=last_index + len(data) + 1, step=1)

# set the indices of the data to new_indices
data.index = new_indices

self.data = pd.concat([self.data, data], ignore_index=False)

def add_empty_rows(self, number_of_rows: int):
if self.data.index.empty:
last_index = -1
else:
last_index = self.data.index[-1]

new_indices = pd.RangeIndex(
start=last_index + 1, stop=last_index + number_of_rows + 1, step=1)
empty_data = pd.DataFrame(
np.nan, index=new_indices, columns=self.data.columns)
self.data = pd.concat([self.data, empty_data], ignore_index=False)

def add_column(self, name: str, exist_ok: bool = False):
if name in self.columns.names:
if not exist_ok:
@@ -423,9 +364,6 @@ def add_column(self, name: str, exist_ok: bool = False):
def remove(self, indices: List[int]):
self.data = self.data.drop(indices)

def round(self, decimals: int):
self.data = self.data.round(decimals=decimals)

def overwrite(self, indices: Iterable[int], other: _Data | Dict[str, Any]):
if isinstance(other, Dict):
other = _convert_dict_to_data(other)
@@ -436,6 +374,7 @@ def overwrite(self, indices: Iterable[int], other: _Data | Dict[str, Any]):

self.data.update(other.data.set_index(pd.Index(indices)))

# TODO: Rename this method, it is not clear what it does
def join(self, __o: _Data) -> _Data:
"""Join two Data objects together.
@@ -455,6 +394,7 @@ def join(self, __o: _Data) -> _Data:
# Getters and setters
# =============================================================================

# TODO: Rename this method ? It is not clear what it does
def get_data_dict(self, index: int) -> Dict[str, Any]:
return self.to_dataframe().loc[index].to_dict()

@@ -510,31 +450,6 @@ def get_index_with_nan(self) -> pd.Index:
"""
return self.indices[self.data.isna().any(axis=1)]

def has_columnnames(self, names: Iterable[str]) -> bool:
return set(names).issubset(self.names)

def set_columnnames(self, names: Iterable[str]) -> None:
for old_name, new_name in zip(self.names, names):
self.columns.rename(old_name, new_name)

def cast_types(self, domain: Domain):
"""Cast the types of the data to the types of the domain.
Parameters
----------
domain : Domain
The domain with specific parameters to cast the types to.
Raises
------
ValueError
If the types of the domain and the data do not match.
"""
_dtypes = {index: parameter._type
for index, (_, parameter) in enumerate(
domain.space.items())}
self.data = self.data.astype(_dtypes)


def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data:
"""Converts a dictionary with scalar values to a data object.
@@ -555,7 +470,8 @@ def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data:
return _Data(data=df, columns=_Columns(_columns))


def _data_factory(data: DataTypes) -> _Data:
def _data_factory(data: DataTypes,
keys: Optional[Iterable[str]] = None) -> _Data:
if data is None:
return _Data()

@@ -566,10 +482,10 @@ def _data_factory(data: DataTypes) -> _Data:
return _Data.from_dataframe(data)

elif isinstance(data, (Path, str)):
return _Data.from_file(data)
return _Data.from_file(Path(data))

elif isinstance(data, np.ndarray):
return _Data.from_numpy(data)
return _Data.from_numpy(data, keys=keys)

else:
raise TypeError(
(The remainder of this diff and the other changed files are not shown.)
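
Both Domain.store and _Data.store in this commit gain the same optional create_tmp branch: serialise to a .tmp file first, remove the old target, then rename the temporary file into place, so the target is never left half-written if the process dies mid-write; with create_tmp=False the old single-step write is kept. Below is a standalone sketch of that write-then-rename pattern, shown with pickle as in Domain.store (_Data.store does the same with a .csv); the store_via_tmp helper is hypothetical, not f3dasm code.

import pickle
from pathlib import Path


def store_via_tmp(obj, filename: Path, suffix: str = '.pkl') -> None:
    """Hypothetical helper mirroring the create_tmp=True branch."""
    tmp = filename.with_suffix('.tmp')
    final = filename.with_suffix(suffix)

    # 1. Serialise to a temporary file; the real target is untouched so far.
    with open(tmp, 'wb') as f:
        pickle.dump(obj, f)

    # 2. Remove the previous version of the target, if any.
    final.unlink(missing_ok=True)

    # 3. Move the fully written temporary file into place.
    tmp.rename(final)


store_via_tmp({'x0': [0.0, 1.0]}, Path('domain_example'))
with open('domain_example.pkl', 'rb') as f:
    print(pickle.load(f))
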
