From 197133d438aa27d590585a991f87145c11c6c33a Mon Sep 17 00:00:00 2001
From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com>
Date: Thu, 20 Jun 2024 09:49:13 +0200
Subject: [PATCH 01/17] Update version to 1.6.0

---
 VERSION                   | 2 +-
 docs/source/conf.py       | 4 ++--
 src/f3dasm/__version__.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/VERSION b/VERSION
index 8e03717d..ce6a70b9 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.5.1
\ No newline at end of file
+1.6.0
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 888f8ada..c480a49b 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -27,8 +27,8 @@
 project = 'f3dasm'
 author = 'Martin van der Schelling'
 copyright = '2024, Martin van der Schelling'
-version = '1.5.1'
-release = '1.5.1'
+version = '1.6.0'
+release = '1.6.0'

 # -- General configuration ----------------------------------------------------

diff --git a/src/f3dasm/__version__.py b/src/f3dasm/__version__.py
index 90fb960e..465e3feb 100644
--- a/src/f3dasm/__version__.py
+++ b/src/f3dasm/__version__.py
@@ -1 +1 @@
-__version__: str = "1.5.1"
+__version__: str = "1.6.0"

From d3fcef61b399e3374f4c464abbb519781635825d Mon Sep 17 00:00:00 2001
From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com>
Date: Thu, 20 Jun 2024 10:02:30 +0200
Subject: [PATCH 02/17] Comment out unused function in _Data for future dev

---
 src/f3dasm/_src/experimentdata/_data.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/f3dasm/_src/experimentdata/_data.py b/src/f3dasm/_src/experimentdata/_data.py
index b75bf379..23cf153d 100644
--- a/src/f3dasm/_src/experimentdata/_data.py
+++ b/src/f3dasm/_src/experimentdata/_data.py
@@ -209,6 +209,7 @@ def from_dataframe(cls, dataframe: pd.DataFrame) -> _Data:
         _columns = {name: None for name in dataframe.columns.to_list()}
         return cls(dataframe, columns=_Columns(_columns))

+    # NOT USED
     def reset(self, domain: Optional[Domain] = None):
         """Resets the data to the initial state.
From b6fbdcb0acf7792af719b20a50651fdf14a7f175 Mon Sep 17 00:00:00 2001 From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com> Date: Thu, 20 Jun 2024 10:13:25 +0200 Subject: [PATCH 03/17] Fix formatting issues in parameter and experimentdata modules --- src/f3dasm/_src/design/parameter.py | 2 +- src/f3dasm/_src/experimentdata/experimentdata.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/f3dasm/_src/design/parameter.py b/src/f3dasm/_src/design/parameter.py index 7d21bc70..07b67e3f 100644 --- a/src/f3dasm/_src/design/parameter.py +++ b/src/f3dasm/_src/design/parameter.py @@ -279,7 +279,7 @@ def _check_range(self): raise ValueError("step size must be larger than 0!") -@ dataclass +@dataclass class _CategoricalParameter(_Parameter): """Create a search space parameter that is categorical diff --git a/src/f3dasm/_src/experimentdata/experimentdata.py b/src/f3dasm/_src/experimentdata/experimentdata.py index f8dd6705..55af5d16 100644 --- a/src/f3dasm/_src/experimentdata/experimentdata.py +++ b/src/f3dasm/_src/experimentdata/experimentdata.py @@ -240,7 +240,7 @@ def wrapper_func(self: ExperimentData, *args, **kwargs) -> None: # Properties # ========================================================================= - @ property + @property def index(self) -> pd.Index: """Returns an iterable of the job number of the experiments @@ -257,7 +257,7 @@ def index(self) -> pd.Index: # Alternative Constructors # ========================================================================= - @ classmethod + @classmethod def from_file(cls: Type[ExperimentData], project_dir: Path | str) -> ExperimentData: """Create an ExperimentData object from .csv and .json files. From 8918dca00243416caaf8ec932d2bc763aadf2199 Mon Sep 17 00:00:00 2001 From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com> Date: Thu, 20 Jun 2024 11:38:54 +0200 Subject: [PATCH 04/17] Round input data to 6 decimal places and refactor combine_data_to_multiindex function --- src/f3dasm/_src/experimentdata/_data.py | 97 +------------------ .../_src/experimentdata/experimentdata.py | 32 +++++- tests/design/test_data.py | 15 --- tests/experimentdata/conftest.py | 5 +- tests/sampling/test_sampling.py | 2 +- 5 files changed, 35 insertions(+), 116 deletions(-) diff --git a/src/f3dasm/_src/experimentdata/_data.py b/src/f3dasm/_src/experimentdata/_data.py index 23cf153d..42a04bc6 100644 --- a/src/f3dasm/_src/experimentdata/_data.py +++ b/src/f3dasm/_src/experimentdata/_data.py @@ -209,27 +209,6 @@ def from_dataframe(cls, dataframe: pd.DataFrame) -> _Data: _columns = {name: None for name in dataframe.columns.to_list()} return cls(dataframe, columns=_Columns(_columns)) - # NOT USED - def reset(self, domain: Optional[Domain] = None): - """Resets the data to the initial state. - - Parameters - ---------- - domain : Domain, optional - The domain of the experiment. - - Note - ---- - If the domain is None, the data will be reset to an empty dataframe. - """ - - if domain is None: - self.data = pd.DataFrame() - self.columns = _Columns() - else: - self.data = self.from_domain(domain).data - self.columns = self.from_domain(domain).columns - # Export # ============================================================================= @@ -271,31 +250,6 @@ def to_dataframe(self) -> pd.DataFrame: df.columns = self.names return df.astype(object) - def combine_data_to_multiindex(self, other: _Data, - jobs_df: pd.DataFrame) -> pd.DataFrame: - """Combine the data to a multiindex dataframe. 
- - Parameters - ---------- - other : _Data - The other data to combine. - jobs : pd.DataFrame - The jobs dataframe. - - Returns - ------- - pd.DataFrame - The combined dataframe. - - Note - ---- - This function is mainly used to show the combined ExperimentData - object in a Jupyter Notebook - """ - return pd.concat([jobs_df, self.to_dataframe(), - other.to_dataframe()], - axis=1, keys=['jobs', 'input', 'output']) - def store(self, filename: Path) -> None: """Stores the data to a file. @@ -352,6 +306,7 @@ def select_columns(self, columns: Iterable[str] | str) -> _Data: return _Data( self.data[self.columns.iloc(columns)], columns=_selected_columns) + # TODO: Can we get rid of this method ? def drop(self, columns: Iterable[str] | str) -> _Data: """Drop the selected columns from the data. @@ -378,33 +333,6 @@ def drop(self, columns: Iterable[str] | str) -> _Data: # Append and remove data # ============================================================================= - def add(self, data: pd.DataFrame): - try: - last_index = self.data.index[-1] - except IndexError: # Empty dataframe - self.data = data - return - - new_indices = pd.RangeIndex( - start=last_index + 1, stop=last_index + len(data) + 1, step=1) - - # set the indices of the data to new_indices - data.index = new_indices - - self.data = pd.concat([self.data, data], ignore_index=False) - - def add_empty_rows(self, number_of_rows: int): - if self.data.index.empty: - last_index = -1 - else: - last_index = self.data.index[-1] - - new_indices = pd.RangeIndex( - start=last_index + 1, stop=last_index + number_of_rows + 1, step=1) - empty_data = pd.DataFrame( - np.nan, index=new_indices, columns=self.data.columns) - self.data = pd.concat([self.data, empty_data], ignore_index=False) - def add_column(self, name: str, exist_ok: bool = False): if name in self.columns.names: if not exist_ok: @@ -424,9 +352,6 @@ def add_column(self, name: str, exist_ok: bool = False): def remove(self, indices: List[int]): self.data = self.data.drop(indices) - def round(self, decimals: int): - self.data = self.data.round(decimals=decimals) - def overwrite(self, indices: Iterable[int], other: _Data | Dict[str, Any]): if isinstance(other, Dict): other = _convert_dict_to_data(other) @@ -437,6 +362,7 @@ def overwrite(self, indices: Iterable[int], other: _Data | Dict[str, Any]): self.data.update(other.data.set_index(pd.Index(indices))) + # TODO: Rename this method, it is not clear what it does def join(self, __o: _Data) -> _Data: """Join two Data objects together. @@ -456,6 +382,7 @@ def join(self, __o: _Data) -> _Data: # Getters and setters # ============================================================================= + # TODO: Rename this method ? It is not clear what it does def get_data_dict(self, index: int) -> Dict[str, Any]: return self.to_dataframe().loc[index].to_dict() @@ -518,24 +445,6 @@ def set_columnnames(self, names: Iterable[str]) -> None: for old_name, new_name in zip(self.names, names): self.columns.rename(old_name, new_name) - def cast_types(self, domain: Domain): - """Cast the types of the data to the types of the domain. - - Parameters - ---------- - domain : Domain - The domain with specific parameters to cast the types to. - - Raises - ------ - ValueError - If the types of the domain and the data do not match. 
- """ - _dtypes = {index: parameter._type - for index, (_, parameter) in enumerate( - domain.space.items())} - self.data = self.data.astype(_dtypes) - def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data: """Converts a dictionary with scalar values to a data object. diff --git a/src/f3dasm/_src/experimentdata/experimentdata.py b/src/f3dasm/_src/experimentdata/experimentdata.py index 55af5d16..b9327c39 100644 --- a/src/f3dasm/_src/experimentdata/experimentdata.py +++ b/src/f3dasm/_src/experimentdata/experimentdata.py @@ -186,12 +186,10 @@ def __eq__(self, __o: ExperimentData) -> bool: self.domain == __o.domain]) def _repr_html_(self) -> str: - return self._input_data.combine_data_to_multiindex( - self._output_data, self._jobs.to_dataframe())._repr_html_() + return combine_data_to_multiindex(self)._repr_html_() def __repr__(self) -> str: - return self._input_data.combine_data_to_multiindex( - self._output_data, self._jobs.to_dataframe()).__repr__() + return combine_data_to_multiindex(self).__repr__() def _access_file(operation: Callable) -> Callable: """Wrapper for accessing a single resource with a file lock @@ -1846,3 +1844,29 @@ def x0_factory(experiment_data: ExperimentData, x0._reset_index() return x0 + + +def combine_data_to_multiindex( + experiment_data: ExperimentData) -> pd.DataFrame: + """Combine the data to a multiindex dataframe. + + Parameters + ---------- + experiment_data: ExperimentData + The ExperimentData object to combine + + Returns + ------- + pd.DataFrame + The combined dataframe. + + Note + ---- + This function is mainly used to show the combined ExperimentData + object in a Jupyter Notebook + """ + return pd.concat( + [experiment_data._jobs.to_dataframe(), + experiment_data._input_data.to_dataframe(), + experiment_data._output_data.to_dataframe()], + axis=1, keys=['jobs', 'input', 'output']) diff --git a/tests/design/test_data.py b/tests/design/test_data.py index 0d546ccd..441750ab 100644 --- a/tests/design/test_data.py +++ b/tests/design/test_data.py @@ -34,27 +34,12 @@ def test_data_from_design(domain: Domain): assert isinstance(data.data, pd.DataFrame) -def test_data_reset(sample_data: _Data): - # Assuming you have a Domain object named "domain" - design = Domain() - sample_data.reset(design) - assert isinstance(sample_data.data, pd.DataFrame) - assert len(sample_data) == 0 - - def test_data_remove(sample_data: _Data): indices = [0, 2] sample_data.remove(indices) assert len(sample_data) == 1 -def test_data_add_numpy_arrays(sample_data: _Data): - input_array = np.array([[1, 4], [2, 5]]) - df = pd.DataFrame(input_array, columns=sample_data.names) - sample_data.add(df) - assert len(sample_data) == 5 - - def test_data_get_data(sample_data: _Data): input_data = sample_data.data assert isinstance(input_data, pd.DataFrame) diff --git a/tests/experimentdata/conftest.py b/tests/experimentdata/conftest.py index f2b70947..7c612ac0 100644 --- a/tests/experimentdata/conftest.py +++ b/tests/experimentdata/conftest.py @@ -65,7 +65,8 @@ def experimentdata_expected() -> ExperimentData: data.add(input_data=np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]]), output_data=np.array([[0.0], [0.0]]), domain=domain_continuous) - data._input_data.round(6) + # data._input_data.round(6) + data._input_data.data = data._input_data.data.round(6) # data._input_data.data = [[round(num, 6) if isinstance( # num, float) else num for num in sublist] # for sublist in data._input_data.data] @@ -81,7 +82,7 @@ def experimentdata_expected_no_output() -> ExperimentData: data.add(input_data=np.array( 
[[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]]), domain=domain_continuous)
-    data._input_data.round(6)
+    data._input_data.data = data._input_data.data.round(6)
     # data._input_data.data = [[round(num, 6) if isinstance(
     #     num, float) else num for num in sublist]
     #     for sublist in data._input_data.data]
diff --git a/tests/sampling/test_sampling.py b/tests/sampling/test_sampling.py
index b542840b..915b63b6 100644
--- a/tests/sampling/test_sampling.py
+++ b/tests/sampling/test_sampling.py
@@ -49,7 +49,7 @@ def test_correct_sampling_ran(design3: Domain):
     samples = ExperimentData(domain=design3)
     samples.sample(sampler='random', n_samples=numsamples, seed=seed)
-    samples._input_data.round(6)
+    samples._input_data.data = samples._input_data.data.round(6)
     df_input, _ = samples.to_pandas()
     df_input.columns = df_ground_truth.columns

From 82cf830934975f3020e79a0f844b3afd4cca7cd3 Mon Sep 17 00:00:00 2001
From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com>
Date: Thu, 20 Jun 2024 11:39:09 +0200
Subject: [PATCH 05/17] Update numpy version <2.0.0 in requirements.txt

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 4be98c2e..c560f525 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-numpy
+numpy<2.0.0
 scipy
 pandas
 xarray

From 36af2ceeb95c0dcb30ab1515ab7cee6ce2567c79 Mon Sep 17 00:00:00 2001
From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com>
Date: Thu, 20 Jun 2024 11:56:46 +0200
Subject: [PATCH 06/17] Refactor column renaming methods in experimentdata
 module

---
 src/f3dasm/_src/experimentdata/_columns.py       |  9 ++++++++-
 src/f3dasm/_src/experimentdata/_data.py          |  7 -------
 src/f3dasm/_src/experimentdata/experimentdata.py | 11 ++++++-----
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/f3dasm/_src/experimentdata/_columns.py b/src/f3dasm/_src/experimentdata/_columns.py
index 76a3f474..5bb09cd0 100644
--- a/src/f3dasm/_src/experimentdata/_columns.py
+++ b/src/f3dasm/_src/experimentdata/_columns.py
@@ -18,7 +18,7 @@
 from __future__ import annotations

 # Standard
-from typing import Dict, List, Optional
+from typing import Dict, Iterable, List, Optional

 # Authorship & Credits
 # =============================================================================
@@ -123,3 +123,10 @@ def rename(self, old_name: str, new_name: str):
             name of the column to replace with
         """
         self.columns[new_name] = self.columns.pop(old_name)
+
+    def set_columnnames(self, names: Iterable[str]) -> None:
+        for old_name, new_name in zip(self.names, names):
+            self.rename(old_name, new_name)
+
+    def has_columnnames(self, names: Iterable[str]) -> bool:
+        return set(names).issubset(self.names)
diff --git a/src/f3dasm/_src/experimentdata/_data.py b/src/f3dasm/_src/experimentdata/_data.py
index 42a04bc6..3817cda3 100644
--- a/src/f3dasm/_src/experimentdata/_data.py
+++ b/src/f3dasm/_src/experimentdata/_data.py
@@ -438,13 +438,6 @@ def get_index_with_nan(self) -> pd.Index:
         """
         return self.indices[self.data.isna().any(axis=1)]

-    def has_columnnames(self, names: Iterable[str]) -> bool:
-        return set(names).issubset(self.names)
-
-    def set_columnnames(self, names: Iterable[str]) -> None:
-        for old_name, new_name in zip(self.names, names):
-            self.columns.rename(old_name, new_name)
-

 def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data:
     """Converts a dictionary with scalar values to a data object.
diff --git a/src/f3dasm/_src/experimentdata/experimentdata.py b/src/f3dasm/_src/experimentdata/experimentdata.py index b9327c39..f053fdc5 100644 --- a/src/f3dasm/_src/experimentdata/experimentdata.py +++ b/src/f3dasm/_src/experimentdata/experimentdata.py @@ -133,16 +133,17 @@ def __init__(self, jobs, self._input_data, self._output_data, job_value) # Check if the columns of input_data are in the domain - if not self._input_data.has_columnnames(self.domain.names): - self._input_data.set_columnnames(self.domain.names) + if not self._input_data.columns.has_columnnames(self.domain.names): + self._input_data.columns.set_columnnames(self.domain.names) - if not self._output_data.has_columnnames(self.domain.output_names): - self._output_data.set_columnnames(self.domain.output_names) + if not self._output_data.columns.has_columnnames( + self.domain.output_names): + self._output_data.columns.set_columnnames(self.domain.output_names) # For backwards compatibility; if the output_data has # only one column, rename it to 'y' if self._output_data.names == [0]: - self._output_data.set_columnnames(['y']) + self._output_data.columns.set_columnnames(['y']) def __len__(self): """The len() method returns the number of datapoints""" From 25070b49bf92205787647a50e64801379c723cb4 Mon Sep 17 00:00:00 2001 From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com> Date: Fri, 21 Jun 2024 17:04:24 +0200 Subject: [PATCH 07/17] added _experimental newdata object --- .coveragerc | 1 + .../_experimental/_jobqueue2.py | 305 +++ .../experimentdata/_experimental/_newdata2.py | 221 ++ .../_experimental/_newexperimentdata2.py | 1881 +++++++++++++++++ src/f3dasm/_src/experimentdata/_jobqueue.py | 4 +- 5 files changed, 2411 insertions(+), 1 deletion(-) create mode 100644 src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py create mode 100644 src/f3dasm/_src/experimentdata/_experimental/_newdata2.py create mode 100644 src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py diff --git a/.coveragerc b/.coveragerc index 4cb78b98..54014b4b 100644 --- a/.coveragerc +++ b/.coveragerc @@ -5,6 +5,7 @@ source = src omit = tests/* **/__init__.py + src/f3dasm/_src/experimentdata/_experimental/* [report] # Regexes for lines to exclude from consideration diff --git a/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py b/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py new file mode 100644 index 00000000..3c88308b --- /dev/null +++ b/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py @@ -0,0 +1,305 @@ +# Modules +# ============================================================================= + +from __future__ import annotations + +# Standard +from copy import deepcopy +from enum import Enum +from pathlib import Path +from typing import Iterable, List, Type + +# Third-party +import pandas as pd + +# Local +from ._newdata2 import _Data + +# Authorship & Credits +# ============================================================================= +__author__ = 'Martin van der Schelling (M.P.vanderSchelling@tudelft.nl)' +__credits__ = ['Martin van der Schelling'] +__status__ = 'Stable' +# ============================================================================= +# +# ============================================================================= + + +class Status(str, Enum): + """Enum class for the status of a job.""" + OPEN = 'open' + IN_PROGRESS = 'in progress' + FINISHED = 'finished' + ERROR = 'error' + + def __str__(self) -> str: + return self.value + + +class 
NoOpenJobsError(Exception):
    """
    Exception raised when there are no open jobs.

    Attributes:
        message (str): The error message.
    """

    def __init__(self, message):
        super().__init__(message)

# =============================================================================


class Index:
    def __init__(self, jobs: pd.Series | None | str = None):
        if isinstance(jobs, str):
            self.jobs = pd.Series(jobs, index=[0], dtype='string')

        elif jobs is None:
            self.jobs = pd.Series(dtype='string')

        else:
            self.jobs = jobs

    def __len__(self) -> int:
        return len(self.jobs)

    def __add__(self, __o: Index | str) -> Index:
        if isinstance(__o, str):
            __o = Index(__o)

        if self.jobs.empty:
            return __o

        # Make a copy of other.jobs and give it a fresh, contiguous index
        # that continues after the last index of this object
        other_jobs_copy = deepcopy(__o)
        last_index = self.jobs.index[-1]
        other_jobs_copy.jobs.index = range(
            last_index + 1, last_index + 1 + len(other_jobs_copy))

        return Index(pd.concat([self.jobs, other_jobs_copy.jobs]))

    def __getitem__(self, indices: int | slice | Iterable[int]) -> Index:
        if isinstance(indices, int):
            indices = [indices]
        return Index(self.jobs[indices].copy())

    def __eq__(self, __o: Index) -> bool:
        return self.jobs.equals(__o.jobs)

    def _repr_html_(self) -> str:
        return self.jobs.__repr__()

    @property
    def indices(self) -> pd.Index:
        """The indices of the jobs."""
        return self.jobs.index

    def iloc(self, indices: Iterable[int]) -> Iterable[int]:
        return self.indices.get_indexer(indices)

    # Alternative Constructors
    # =========================================================================

    @classmethod
    def from_data(cls: Type[Index], data: _Data,
                  value: str = Status.OPEN) -> Index:
        """Create an Index object from a Data object.

        Parameters
        ----------
        data : Data
            Data object containing the data.
        value : str
            The value to assign to the jobs. Can be 'open',
            'in progress', 'finished', or 'error'.

        Returns
        -------
        Index
            Index object containing the loaded data.
        """
        return cls(pd.Series([value] * len(data), dtype='string'))

    @classmethod
    def from_file(cls: Type[Index], filename: Path | str) -> Index:
        """Create an Index object from a stored .csv or .pkl jobs file.

        Parameters
        ----------
        filename : Path | str
            Name of the file.

        Returns
        -------
        Index
            Index object containing the loaded data.
        """
        # Convert filename to Path
        if Path(filename).with_suffix('.csv').exists():
            return cls(
                pd.read_csv(Path(filename).with_suffix('.csv'),
                            index_col=0)['0'])

        elif Path(filename).with_suffix('.pkl').exists():
            return cls(
                pd.read_pickle(Path(filename).with_suffix('.pkl')))

        else:
            raise FileNotFoundError(f"Jobfile {filename} does not exist.")

    # Select
    # =========================================================================

    def select_all(self, status: str) -> Index:
        """Selects all jobs with a certain status.

        Parameters
        ----------
        status : str
            Status of the jobs to select

        Returns
        -------
        Index
            Index object containing the selected jobs.
        """
        return Index(self.jobs[self.jobs == status])

    # Export
    # =========================================================================

    def store(self, filename: Path) -> None:
        """Stores the jobs in a csv file.

        Parameters
        ----------
        filename : Path
            Path of the file.
        """
        self.jobs.to_csv(filename.with_suffix('.csv'))

    def to_dataframe(self, name: str = "") -> pd.DataFrame:
        """Converts the job queue to a DataFrame.
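
        A minimal sketch of the intended use (the job values and the
        column name are assumptions):

        >>> jobs = Index(pd.Series(['open', 'finished'], dtype='string'))
        >>> df = jobs.to_dataframe('status')  # one column named 'status'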

        Parameters
        ----------
        name : str, optional
            Name of the column, by default "".

        Note
        ----
        If the name is not specified, the column name will be an empty string.

        Returns
        -------
        DataFrame
            DataFrame containing the jobs.
        """
        return self.jobs.to_frame(name)

    # Append and remove jobs
    # =========================================================================

    def remove(self, indices: List[int]):
        """Removes a subset of the jobs.

        Parameters
        ----------
        indices : List[int]
            List of indices to remove.
        """
        self.jobs = self.jobs.drop(indices)

    def overwrite(
            self, indices: Iterable[int],
            other: Index | str) -> None:

        if isinstance(other, str):
            other = Index(
                pd.Series([other], index=[0], dtype='string'))

        self.jobs.update(other.jobs.set_axis(indices))

    # Mark
    # =========================================================================

    def mark(self, index: int | slice | Iterable[int],
             status: Status) -> None:
        """Marks a job with a certain status.

        Parameters
        ----------
        index : int | slice | Iterable[int]
            Index (or indices) of the job(s) to mark.
        status : Status
            Status to mark the job(s) with.
        """
        self.jobs.loc[index] = status

    def mark_all_in_progress_open(self) -> None:
        """Marks all jobs with status 'in progress' as 'open'."""
        self.jobs = self.jobs.replace(Status.IN_PROGRESS, Status.OPEN)

    def mark_all_error_open(self) -> None:
        """Marks all jobs with status 'error' as 'open'."""
        self.jobs = self.jobs.replace(Status.ERROR, Status.OPEN)

    # Miscellaneous
    # =========================================================================

    def is_all_finished(self) -> bool:
        """Checks if all jobs are finished.

        Returns
        -------
        bool
            True if all jobs are either 'finished' or 'error',
            False otherwise.
        """
        return all(self.jobs.isin([Status.FINISHED, Status.ERROR]))

    def get_open_job(self) -> int:
        """Returns the index of the first open job.

        Returns
        -------
        int
            Index of an open job.
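
        Raises
        ------
        NoOpenJobsError
            If no job with status 'open' is present.

        Example
        -------
        An illustrative sketch (the job values are assumptions):

        >>> jobs = Index(pd.Series(['finished', 'open'], dtype='string'))
        >>> jobs.get_open_job()
        1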
+ """ + try: # try to find an open job + return int(self.jobs[self.jobs == Status.OPEN].index[0]) + except IndexError: + raise NoOpenJobsError("No open jobs found.") + + def reset_index(self) -> None: + """Resets the index of the jobs.""" + self.jobs.reset_index(drop=True, inplace=True) + + +def _jobs_factory(jobs: Path | str | Index | None, input_data: _Data, + output_data: _Data, job_value: Status) -> Index: + """Creates a Index object from particular inpute + + Parameters + ---------- + jobs : Path | str | None + input data for the jobs + input_data : _Data + _Data object of input data to extract indices from, if necessary + output_data : _Data + _Data object of output data to extract indices from, if necessary + job_value : Status + initial value of all the jobs + + Returns + ------- + Index + JobQueue object + """ + if isinstance(jobs, Index): + return jobs + + if isinstance(jobs, (Path, str)): + return Index.from_file(Path(jobs)) + + if input_data.is_empty(): + return Index.from_data(output_data, value=job_value) + + return Index.from_data(input_data, value=job_value) diff --git a/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py b/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py new file mode 100644 index 00000000..759473e6 --- /dev/null +++ b/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py @@ -0,0 +1,221 @@ +# Modules +# ============================================================================= + +from __future__ import annotations + +# Standard +from copy import deepcopy +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Type, Union + +# Third-party +import numpy as np +import pandas as pd +import xarray as xr + +# Authorship & Credits +# ============================================================================= +__author__ = 'Martin van der Schelling (M.P.vanderSchelling@tudelft.nl)' +__credits__ = ['Martin van der Schelling'] +__status__ = 'Stable' +# ============================================================================= +# +# ============================================================================= + +MISSING_VALUE = np.nan + + +class _Data: + def __init__(self, data: Dict[int, Dict[str, Any]] = None): + self.data = data if data is not None else {} + + def __len__(self) -> int: + return len(self.data) + + def __iter__(self): + return iter(self.data.values()) + + def __getitem__(self, rows: int | slice | Iterable[int]) -> _Data: + + if isinstance(rows, int): + rows = [rows] + + return _Data({row: self.data.get(row, {}) for row in rows}) + + def __add__(self, __o: _Data) -> _Data: + if self.is_empty(): + return __o + + _data_copy = deepcopy(self) + other_data_copy = deepcopy(__o) + + new_indices = (np.array(range(len(__o))) + max(self.data) + 1).tolist() + + _data_copy.data.update({row: values for row, values in zip( + new_indices, other_data_copy.data.values())}) + return _data_copy + + def __eq__(self, __o: _Data) -> bool: + return self.data == __o.data + + def _repr_html_(self) -> str: + return self.to_dataframe()._repr_html_() + + def __repr__(self) -> str: + return self.to_dataframe().__repr__() + + @property + def indices(self) -> List[int]: + return list(self.data.keys()) + + @property + def names(self) -> List[str]: + return self.to_dataframe().columns.tolist() + + @classmethod + def from_indices(cls, rows: Iterable[int]): + return cls({row: {} for row in rows}) + + # @classmethod + # def from_domain(cls, space: Iterable[str]): + # return cls(None) + + @classmethod + def from_file(cls, filename: 
        ...

    @classmethod
    def from_numpy(cls: Type[_Data], array: np.ndarray,
                   keys: Optional[Iterable[str]] = None) -> _Data:
        if keys is not None:
            return _Data(
                {index: {key: col for key, col in zip(keys, row)}
                 for index, row in enumerate(array)})
        else:
            # Look out! i is now an integer key!
            return _Data(
                {index: {i: col for i, col in enumerate(row)}
                 for index, row in enumerate(array)})

    @classmethod
    def from_dataframe(cls, df: pd.DataFrame) -> _Data:
        return _Data(
            {index: row.to_dict() for index, (_, row) in
             enumerate(df.iterrows())})

    def to_numpy(self) -> np.ndarray:
        return self.to_dataframe().to_numpy()

    def to_xarray(self, label: str) -> xr.DataArray:
        df = self.to_dataframe()
        # Can create the xarray with the information from the domain!
        return xr.DataArray(
            df, dims=['iterations', label], coords={
                'iterations': df.index, label: df.columns})

    def to_dataframe(self) -> pd.DataFrame:
        # Can create the dataframe from the numpy array + column names!
        return pd.DataFrame(self.data).T

    def store(self, filename: Path):
        ...

    def n_best_samples(self, nosamples: int, key: str) -> pd.DataFrame:
        df = self.to_dataframe()
        return df.nsmallest(n=nosamples, columns=key)

    def select_columns(self, keys: Iterable[str] | str) -> _Data:
        # `keys` is either a single column name or an iterable of names
        if isinstance(keys, str):
            keys = [keys]

        return _Data(
            {index: {key: row.get(key, MISSING_VALUE) for key in keys}
             for index, row in self.data.items()})

    def drop(self, keys: Iterable[str] | str) -> _Data:
        # Might be deprecated?
        if isinstance(keys, str):
            keys = [keys]

        # Work on a copy so that a new _Data object can be returned,
        # as the callers in ExperimentData expect
        _data = deepcopy(self)
        for row in _data.data:
            for key in keys:
                if key in _data.data[row]:
                    del _data.data[row][key]

        return _data

    def add_column(self, key: str, exist_ok: bool = False):
        # `exist_ok` is accepted for compatibility with the caller in
        # ExperimentData; existing values are left untouched
        for row in self.data:
            self.data[row].setdefault(key, MISSING_VALUE)

    def remove(self, rows: Iterable[int]):
        for row in rows:
            del self.data[row]  # = deleting the row

    def overwrite(self, rows: Iterable[int], __o: _Data):
        for index, other_row in zip(rows, __o):
            self.data[index] = other_row

    def join(self, __o: _Data) -> _Data:
        _data = deepcopy(self)
        for row, other_row in zip(_data, __o):
            row.update(other_row)

        return _data

    def get_data_dict(self, row: int) -> Dict[str, Any]:
        return self.data[row]

    def set_data(self, row: int, value: Any, key: str):
        self.data[row][key] = value

    def get_index_with_nan(self) -> List[int]:
        # Needed by ExperimentData.mark_all_nan_open(); assumes the stored
        # values are scalars
        return [row for row, values in self.data.items()
                if any(pd.isna(value) for value in values.values())]

    def reset_index(self, rows: Iterable[int] = None):
        # Note: `rows` is accepted for API compatibility, but the index is
        # always reset to a contiguous 0-based range
        self.data = {index: values
                     for index, values in enumerate(self.data.values())}

    def is_empty(self) -> bool:
        return not bool(self.data)


def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data:
    """Converts a dictionary with scalar values to a data object.

    Parameters
    ----------
    dictionary : Dict[str, Any]
        The dictionary to convert. Note that the dictionary
        should only have scalar values!

    Returns
    -------
    _Data
        The data object.
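
    Example
    -------
    A minimal sketch (the column names are assumptions):

    >>> d = _convert_dict_to_data({'x0': 0.5, 'x1': 'a'})
    >>> d.get_data_dict(0)
    {'x0': 0.5, 'x1': 'a'}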
+ """ + return _Data({0: {dictionary}}) + + +def _data_factory(data: DataTypes) -> _Data: + if data is None: + return _Data() + + elif isinstance(data, _Data): + return data + + elif isinstance(data, pd.DataFrame): + return _Data.from_dataframe(data) + + elif isinstance(data, (Path, str)): + return _Data.from_file(Path(data)) + + elif isinstance(data, np.ndarray): + return _Data.from_numpy(data) + + else: + raise TypeError( + f"Data must be of type _Data, pd.DataFrame, np.ndarray, " + f"Path or str, not {type(data)}") + + +DataTypes = Union[pd.DataFrame, np.ndarray, Path, str, _Data] diff --git a/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py b/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py new file mode 100644 index 00000000..cb776297 --- /dev/null +++ b/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py @@ -0,0 +1,1881 @@ +""" +The ExperimentData object is the main object used to store implementations + of a design-of-experiments, keep track of results, perform optimization and + extract data for machine learning purposes. +""" + +# Modules +# ============================================================================= + +from __future__ import annotations + +# Standard +import inspect +import traceback +from copy import copy +from functools import wraps +from pathlib import Path +from time import sleep +from typing import (Any, Callable, Dict, Iterable, Iterator, List, Literal, + Optional, Tuple, Type) + +# Third-party +import numpy as np +import pandas as pd +import xarray as xr +from filelock import FileLock +from hydra.utils import get_original_cwd +from omegaconf import DictConfig +from pathos.helpers import mp + +# Local +from ...datageneration.datagenerator import DataGenerator, convert_function +from ...datageneration.functions.function_factory import _datagenerator_factory +from ...design.domain import Domain, _domain_factory +from ...logger import logger +from ...optimization import Optimizer +from ...optimization.optimizer_factory import _optimizer_factory +from .._io import (DOMAIN_FILENAME, EXPERIMENTDATA_SUBFOLDER, + INPUT_DATA_FILENAME, JOBS_FILENAME, LOCK_FILENAME, + MAX_TRIES, OUTPUT_DATA_FILENAME, _project_dir_factory) +from ..experimentsample import ExperimentSample +from ..samplers import Sampler, SamplerNames, _sampler_factory +from ..utils import number_of_overiterations, number_of_updates +from ._jobqueue2 import NoOpenJobsError, Status, _jobs_factory +from ._newdata2 import DataTypes, _Data, _data_factory + +# Authorship & Credits +# ============================================================================= +__author__ = 'Martin van der Schelling (M.P.vanderSchelling@tudelft.nl)' +__credits__ = ['Martin van der Schelling'] +__status__ = 'Stable' +# ============================================================================= +# +# ============================================================================= + + +class ExperimentData: + """ + A class that contains data for experiments. + """ + + def __init__(self, + domain: Optional[Domain] = None, + input_data: Optional[DataTypes] = None, + output_data: Optional[DataTypes] = None, + jobs: Optional[Path | str] = None, + project_dir: Optional[Path] = None): + """ + Initializes an instance of ExperimentData. 
+ + Parameters + ---------- + domain : Domain, optional + The domain of the experiment, by default None + input_data : DataTypes, optional + The input data of the experiment, by default None + output_data : DataTypes, optional + The output data of the experiment, by default None + jobs : Path | str, optional + The path to the jobs file, by default None + project_dir : Path | str, optional + A user-defined directory where the f3dasm project folder will be \ + created, by default the current working directory. + + Note + ---- + + The following data formats are supported for input and output data: + + * numpy array + * pandas Dataframe + * path to a csv file + + If no domain object is provided, the domain is inferred from the \ + input_data. + + If the provided project_dir does not exist, it will be created. + + Raises + ------ + + ValueError + If the input_data is a numpy array, the domain has to be provided. + """ + + if isinstance(input_data, np.ndarray) and domain is None: + raise ValueError( + 'If you provide a numpy array as input_data, \ + you have to provide the domain!') + + self.project_dir = _project_dir_factory(project_dir) + + self._input_data = _data_factory(input_data) + self._output_data = _data_factory(output_data) + + # Create empty output_data from indices if output_data is empty + if self._output_data.is_empty(): + self._output_data = _Data.from_indices(self._input_data.indices) + job_value = Status.OPEN + + else: + job_value = Status.FINISHED + + self.domain = _domain_factory( + domain=domain, input_data=self._input_data.to_dataframe(), + output_data=self._output_data.to_dataframe()) + + # Create empty input_data from domain if input_data is empty + if self._input_data.is_empty(): + self._input_data = _Data() + + self._jobs = _jobs_factory( + jobs, self._input_data, self._output_data, job_value) + + # # Check if the columns of input_data are in the domain + # if not self._input_data.columns.has_columnnames(self.domain.names): + # self._input_data.columns.set_columnnames(self.domain.names) + + # if not self._output_data.columns.has_columnnames( + # self.domain.output_names): + # self._output_data.columns.set_columnnames(self.domain.output_names) + + # For backwards compatibility; if the output_data has + # only one column, rename it to 'y' + # TODO: Fix this for newdata2 + if self._output_data.names == [0]: + self._output_data.columns.set_columnnames(['y']) + + def __len__(self): + """The len() method returns the number of datapoints""" + return len(self._jobs) + + # if self._input_data.is_empty(): + # return len(self._output_data) + + # return len(self._input_data) + + def __iter__(self) -> Iterator[Tuple[Dict[str, Any]]]: + self.current_index = 0 + return self + + def __next__(self) -> ExperimentSample: + if self.current_index >= len(self): + raise StopIteration + else: + index = self.index[self.current_index] + self.current_index += 1 + return self.get_experiment_sample(index) + + def __add__(self, + __o: ExperimentData | ExperimentSample) -> ExperimentData: + """The + operator combines two ExperimentData objects""" + # Check if the domains are the same + + if not isinstance(__o, (ExperimentData, ExperimentSample)): + raise TypeError( + f"Can only add ExperimentData or " + f"ExperimentSample objects, not {type(__o)}") + + return ExperimentData( + input_data=self._input_data + __o._input_data, + output_data=self._output_data + __o._output_data, + jobs=self._jobs + __o._jobs, domain=self.domain + __o.domain, + project_dir=self.project_dir) + + def __eq__(self, __o: 
ExperimentData) -> bool: + return all([self._input_data == __o._input_data, + self._output_data == __o._output_data, + self._jobs == __o._jobs, + self.domain == __o.domain]) + + def _repr_html_(self) -> str: + return combine_data_to_multiindex(self)._repr_html_() + + def __repr__(self) -> str: + return combine_data_to_multiindex(self).__repr__() + + def _access_file(operation: Callable) -> Callable: + """Wrapper for accessing a single resource with a file lock + + Parameters + ---------- + operation : Callable + The operation to be performed on the resource + + Returns + ------- + Callable + The wrapped operation + """ + @wraps(operation) + def wrapper_func(self: ExperimentData, *args, **kwargs) -> None: + lock = FileLock( + (self. + project_dir / EXPERIMENTDATA_SUBFOLDER / LOCK_FILENAME) + .with_suffix('.lock')) + + # If the lock has been acquired: + with lock: + tries = 0 + while tries < MAX_TRIES: + try: + self = ExperimentData.from_file(self.project_dir) + value = operation(self, *args, **kwargs) + self.store() + break + + # Racing conditions can occur when the file is empty + # and the file is being read at the same time + except pd.errors.EmptyDataError: + tries += 1 + logger.debug(( + f"EmptyDataError occurred, retrying" + f" {tries+1}/{MAX_TRIES}")) + sleep(1) + + raise pd.errors.EmptyDataError() + + return value + + return wrapper_func + # Properties + # ========================================================================= + + @property + def index(self) -> pd.Index: + """Returns an iterable of the job number of the experiments + + Returns + ------- + pd.Index + The job number of all the experiments in pandas Index format + """ + return self._jobs.indices + + # if self._input_data.is_empty(): + # return self._output_data.indices + + # return self._input_data.indices + + # Alternative Constructors + # ========================================================================= + + @classmethod + def from_file(cls: Type[ExperimentData], + project_dir: Path | str) -> ExperimentData: + """Create an ExperimentData object from .csv and .json files. + + Parameters + ---------- + project_dir : Path | str + User defined path of the experimentdata directory. + + Returns + ------- + ExperimentData + ExperimentData object containing the loaded data. + """ + if isinstance(project_dir, str): + project_dir = Path(project_dir) + + try: + return cls._from_file_attempt(project_dir) + except FileNotFoundError: + try: + filename_with_path = Path(get_original_cwd()) / project_dir + except ValueError: # get_original_cwd() hydra initialization error + raise FileNotFoundError( + f"Cannot find the folder {project_dir} !") + + return cls._from_file_attempt(filename_with_path) + + @classmethod + def from_sampling(cls, sampler: Sampler | str, domain: Domain | DictConfig, + n_samples: int = 1, + seed: Optional[int] = None, + **kwargs) -> ExperimentData: + """Create an ExperimentData object from a sampler. + + Parameters + ---------- + sampler : Sampler | str + Sampler object containing the sampling strategy or one of the + built-in sampler names. + domain : Domain | DictConfig + Domain object containing the domain of the experiment or hydra + DictConfig object containing the configuration. + n_samples : int, optional + Number of samples, by default 1. + seed : int, optional + Seed for the random number generator, by default None. + + Returns + ------- + ExperimentData + ExperimentData object containing the sampled data. 

        Note
        ----

        If a string is passed for the sampler argument, it should be one
        of the built-in samplers:

        * 'random' : Random sampling
        * 'latin' : Latin Hypercube Sampling
        * 'sobol' : Sobol Sequence Sampling
        * 'grid' : Grid Search Sampling

        Any additional keyword arguments are passed to the sampler.
        """
        experimentdata = cls(domain=domain)
        experimentdata.sample(
            sampler=sampler, n_samples=n_samples, seed=seed, **kwargs)
        return experimentdata

    @classmethod
    def from_yaml(cls, config: DictConfig) -> ExperimentData:
        """Create an ExperimentData object from a hydra yaml configuration.

        Parameters
        ----------
        config : DictConfig
            A DictConfig object containing the configuration of the \
            experiment data.

        Returns
        -------
        ExperimentData
            ExperimentData object containing the loaded data.
        """
        # Option 0: Both existing and sampling
        if 'from_file' in config and 'from_sampling' in config:
            return cls.from_file(config.from_file) + cls.from_sampling(
                **config.from_sampling)

        # Option 1: From existing ExperimentData files
        if 'from_file' in config:
            return cls.from_file(config.from_file)

        # Option 2: Sample from the domain
        if 'from_sampling' in config:
            return cls.from_sampling(**config.from_sampling)

        else:
            return cls(**config)

    @classmethod
    def _from_file_attempt(cls: Type[ExperimentData],
                           project_dir: Path) -> ExperimentData:
        """Attempt to create an ExperimentData object
        from .csv and .pkl files.

        Parameters
        ----------
        project_dir : Path
            Name of the user-defined directory where the files are stored.

        Returns
        -------
        ExperimentData
            ExperimentData object containing the loaded data.

        Raises
        ------
        FileNotFoundError
            If the files cannot be found.
        """
        subdirectory = project_dir / EXPERIMENTDATA_SUBFOLDER

        try:
            return cls(domain=subdirectory / DOMAIN_FILENAME,
                       input_data=subdirectory / INPUT_DATA_FILENAME,
                       output_data=subdirectory / OUTPUT_DATA_FILENAME,
                       jobs=subdirectory / JOBS_FILENAME,
                       project_dir=project_dir)
        except FileNotFoundError:
            raise FileNotFoundError(
                f"Cannot find the files from {subdirectory}.")

    # Selecting subsets
    # =========================================================================

    def select(self, job_ids: int | Iterable[int]) -> ExperimentData:
        """Select a subset of the ExperimentData object

        Parameters
        ----------
        job_ids : int | Iterable[int]
            The job number(s) to select.

        Returns
        -------
        ExperimentData
            The selected ExperimentData object with only the selected indices.
        """
        indices = self._jobs.iloc(job_ids)
        # TODO: It could be that the indices are not in the input_data
        # and output_data, because they are not defined

        return ExperimentData(input_data=self._input_data[indices],
                              output_data=self._output_data[indices],
                              jobs=self._jobs[job_ids],
                              domain=self.domain,
                              project_dir=self.project_dir)

    def drop_output(self, names: Iterable[str] | str) -> ExperimentData:
        """Drop one or more columns from the output data

        Parameters
        ----------
        names : Iterable | str
            The names of the columns to drop.

        Returns
        -------
        ExperimentData
            The ExperimentData object with the column dropped.
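
        Example
        -------
        Dropping a single output column (an illustrative sketch; the
        column name 'y' is an assumption):

        >>> reduced = data.drop_output('y')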
+ """ + return ExperimentData(input_data=self._input_data, + output_data=self._output_data.drop(names), + jobs=self._jobs, domain=self.domain.drop_output( + names), + project_dir=self.project_dir) + + def select_with_status(self, status: Literal['open', 'in progress', + 'finished', 'error'] + ) -> ExperimentData: + """Select a subset of the ExperimentData object with a given status + + Parameters + ---------- + status : Literal['open', 'in progress', 'finished', 'error'] + The status to select. + + Returns + ------- + ExperimentData + The selected ExperimentData object with only the selected status. + + Raises + ------ + ValueError + Raised when invalid status is specified + """ + if status not in [s.value for s in Status]: + raise ValueError(f"Invalid status {status} given. " + f"\nChoose from values: " + f"{', '.join([s.value for s in Status])}") + + _indices = self._jobs.select_all(status).indices + return self.select(_indices) + + def get_input_data(self, + parameter_names: Optional[str | Iterable[str]] = None + ) -> ExperimentData: + """Retrieve a subset of the input data from the ExperimentData object + + Parameters + ---------- + parameter_names : str | Iterable[str], optional + The name(s) of the input parameters that you want to retrieve, \ + if None all input parameters are retrieved, by default None + + Returns + ------- + ExperimentData + The selected ExperimentData object with only the\ + selected input data. + + Note + ---- + If parameter_names is None, all input data is retrieved. \ + The returned ExperimentData object has the domain of \ + the original ExperimentData object, \ + but only with the selected input parameters.\ + """ + if parameter_names is None: + return ExperimentData(input_data=self._input_data, + jobs=self._jobs, + domain=self.domain, + project_dir=self.project_dir) + else: + return ExperimentData(input_data=self._input_data.select_columns( + parameter_names), + jobs=self._jobs, + domain=self.domain.select(parameter_names), + project_dir=self.project_dir) + + def get_output_data(self, + parameter_names: Optional[str | Iterable[str]] = None + ) -> ExperimentData: + """Retrieve a subset of the output data from the ExperimentData object + + Parameters + ---------- + parameter_names : str | Iterable[str], optional + The name(s) of the output parameters that you want to retrieve, \ + if None all output parameters are retrieved, by default None + + Returns + ------- + ExperimentData + The selected ExperimentData object with only \ + the selected output data. + + Note + ---- + If parameter_names is None, all output data is retrieved. \ + The returned ExperimentData object has no domain object and \ + no input data! + """ + if parameter_names is None: + # TODO: Make a domain where space is empty + # but it tracks output_space! + return ExperimentData( + output_data=self._output_data, jobs=self._jobs, + project_dir=self.project_dir) + else: + return ExperimentData( + output_data=self._output_data.select_columns(parameter_names), + jobs=self._jobs, + project_dir=self.project_dir) + + # Export + # ========================================================================= + + def store(self, project_dir: Optional[Path | str] = None): + """Write the ExperimentData to disk in the project directory. + + Parameters + ---------- + project_dir : Optional[Path | str], optional + The f3dasm project directory to store the \ + ExperimentData object to, by default None. 
+ + Note + ---- + If no project directory is provided, the ExperimentData object is \ + stored in the directory provided by the `.project_dir` attribute that \ + is set upon creation of the object. + + The ExperimentData object is stored in a subfolder 'experiment_data'. + + The ExperimentData object is stored in four files: + + * the input data (`input.csv`) + * the output data (`output.csv`) + * the jobs (`jobs.pkl`) + * the domain (`domain.pkl`) + + To avoid the ExperimentData to be written simultaneously by multiple \ + processes, a '.lock' file is automatically created \ + in the project directory. Concurrent process can only sequentially \ + access the lock file. This lock file is removed after the \ + ExperimentData object is written to disk. + """ + if project_dir is not None: + self.set_project_dir(project_dir) + + subdirectory = self.project_dir / EXPERIMENTDATA_SUBFOLDER + + # Create the subdirectory if it does not exist + subdirectory.mkdir(parents=True, exist_ok=True) + + self._input_data.store(subdirectory / Path(INPUT_DATA_FILENAME)) + self._output_data.store(subdirectory / Path(OUTPUT_DATA_FILENAME)) + self._jobs.store(subdirectory / Path(JOBS_FILENAME)) + self.domain.store(subdirectory / Path(DOMAIN_FILENAME)) + + def to_numpy(self) -> Tuple[np.ndarray, np.ndarray]: + """ + Convert the ExperimentData object to a tuple of numpy arrays. + + Returns + ------- + tuple + A tuple containing two numpy arrays, \ + the first one for input columns, \ + and the second for output columns. + """ + return self._input_data.to_numpy(), self._output_data.to_numpy() + + def to_pandas(self) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Convert the ExperimentData object to a pandas DataFrame. + + Returns + ------- + tuple + A tuple containing two pandas DataFrames, \ + the first one for input columns, and the second for output + """ + return (self._input_data.to_dataframe(), + self._output_data.to_dataframe()) + + def to_xarray(self) -> xr.Dataset: + """ + Convert the ExperimentData object to an xarray Dataset. + + Returns + ------- + xarray.Dataset + An xarray Dataset containing the data. + """ + return xr.Dataset( + {'input': self._input_data.to_xarray('input_dim'), + 'output': self._output_data.to_xarray('output_dim')}) + + def get_n_best_output(self, n_samples: int) -> ExperimentData: + """Get the n best samples from the output data. \ + We consider lower values to be better. + + Parameters + ---------- + n_samples : int + Number of samples to select. + + Returns + ------- + ExperimentData + New experimentData object with a selection of the n best samples. + + Note + ---- + + The n best samples are selected based on the output data. \ + The output data is sorted based on the first output parameter. \ + The n best samples are selected based on this sorting. \ + """ + df = self._output_data.n_best_samples( + n_samples, self._output_data.names) + return self.select(df.index) + + # Append or remove data + # ========================================================================= + + def add(self, domain: Optional[Domain] = None, + input_data: Optional[DataTypes] = None, + output_data: Optional[DataTypes] = None, + jobs: Optional[Path | str] = None) -> None: + """Add data to the ExperimentData object. 
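
        For example, two new input points could be appended like this
        (an illustrative sketch; the array values are assumptions):

        >>> data.add(input_data=np.array([[0.1, 0.2], [0.3, 0.4]]),
        ...          domain=data.domain)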

        Parameters
        ----------
        domain : Optional[Domain], optional
            Domain of the added object, by default None
        input_data : Optional[DataTypes], optional
            input parameters of the added object, by default None
        output_data : Optional[DataTypes], optional
            output parameters of the added object, by default None
        jobs : Optional[Path | str], optional
            jobs of the added object, by default None
        """
        self.add_experiments(ExperimentData(
            domain=domain, input_data=input_data,
            output_data=output_data,
            jobs=jobs))

    def add_experiments(self,
                        experiment_sample: ExperimentSample | ExperimentData
                        ) -> None:
        """
        Add an ExperimentSample or ExperimentData to the ExperimentData
        attribute.

        Parameters
        ----------
        experiment_sample : ExperimentSample or ExperimentData
            Experiment(s) to add.

        Raises
        ------
        ValueError
            If the indices of the input data, output data and jobs are
            not equal after adding.
        """

        if isinstance(experiment_sample, ExperimentData):
            experiment_sample._reset_index()
            self.domain += experiment_sample.domain

        self._input_data += experiment_sample._input_data
        self._output_data += experiment_sample._output_data
        self._jobs += experiment_sample._jobs

        # Check if indices of the internal objects are equal; the
        # experimental _Data object stores its indices as plain lists
        if not (list(self._input_data.indices)
                == list(self._output_data.indices)
                == list(self._jobs.indices)):
            raise ValueError(f"Indices of the internal objects are not equal."
                             f"input_data {self._input_data.indices}, "
                             f"output_data {self._output_data.indices},"
                             f"jobs: {self._jobs.indices}")

        # Apparently you need to cast the types again
        # TODO: Breaks if values are NaN or infinite
        # self._input_data.cast_types(self.domain)

    def overwrite(
            self, indices: Iterable[int],
            domain: Optional[Domain] = None,
            input_data: Optional[DataTypes] = None,
            output_data: Optional[DataTypes] = None,
            jobs: Optional[Path | str] = None,
            add_if_not_exist: bool = False
    ) -> None:
        """Overwrite the ExperimentData object.

        Parameters
        ----------
        indices : Iterable[int]
            The indices to overwrite.
        domain : Optional[Domain], optional
            Domain of the new object, by default None
        input_data : Optional[DataTypes], optional
            input parameters of the new object, by default None
        output_data : Optional[DataTypes], optional
            output parameters of the new object, by default None
        jobs : Optional[Path | str], optional
            jobs of the new object, by default None
        add_if_not_exist : bool, optional
            If True, the new objects are added if the requested indices
            do not exist in the current ExperimentData object,
            by default False
        """

        # Be careful: if a job has output data and gets overwritten with a
        # job that has no output data, the status is set to open. But the job
        # will still have the output data!

        # This is usually not a problem, because the output data will be
        # immediately overwritten in optimization.

        self._overwrite_experiments(
            indices=indices,
            experiment_sample=ExperimentData(
                domain=domain, input_data=input_data,
                output_data=output_data,
                jobs=jobs),
            add_if_not_exist=add_if_not_exist)

    def _overwrite_experiments(
            self, indices: Iterable[int],
            experiment_sample: ExperimentSample | ExperimentData,
            add_if_not_exist: bool) -> None:
        """
        Overwrite the ExperimentData object at the given indices.

        Parameters
        ----------
        indices : Iterable[int]
            The indices to overwrite.
+ experimentdata : ExperimentData | ExperimentSample + The new ExperimentData object to overwrite with. + add_if_not_exist : bool + If True, the new objects are added if the requested indices + do not exist in the current ExperimentData object. + """ + if not all(pd.Index(indices).isin(self.index)): + if add_if_not_exist: + self.add_experiments(experiment_sample) + return + else: + raise ValueError( + f"The given indices {indices} do not exist in the current " + f"ExperimentData object. " + f"If you want to add the new experiments, " + f"set add_if_not_exist to True.") + + self._input_data.overwrite( + rows=indices, other=experiment_sample._input_data) + self._output_data.overwrite( + rows=indices, other=experiment_sample._output_data) + + self._jobs.overwrite( + indices=indices, other=experiment_sample._jobs) + + if isinstance(experiment_sample, ExperimentData): + self.domain += experiment_sample.domain + + @_access_file + def overwrite_disk( + self, indices: Iterable[int], + domain: Optional[Domain] = None, + input_data: Optional[DataTypes] = None, + output_data: Optional[DataTypes] = None, + jobs: Optional[Path | str] = None, + add_if_not_exist: bool = False + ) -> None: + self.overwrite(indices=indices, domain=domain, input_data=input_data, + output_data=output_data, jobs=jobs, + add_if_not_exist=add_if_not_exist) + + def add_input_parameter( + self, name: str, + type: Literal['float', 'int', 'category', 'constant'], + **kwargs): + """Add a new input column to the ExperimentData object. + + Parameters + ---------- + name + name of the new input column + type + type of the new input column: float, int, category or constant + kwargs + additional arguments for the new parameter + """ + self._input_data.add_column(name) + self.domain.add(name=name, type=type, **kwargs) + + def add_output_parameter( + self, name: str, is_disk: bool, exist_ok: bool = False) -> None: + """Add a new output column to the ExperimentData object. + + Parameters + ---------- + name + name of the new output column + is_disk + Whether the output column will be stored on disk or not + exist_ok + If True, it will not raise an error if the output column already + exists, by default False + """ + self._output_data.add_column(name, exist_ok=exist_ok) + self.domain.add_output(name=name, to_disk=is_disk, exist_ok=exist_ok) + + def remove_rows_bottom(self, number_of_rows: int): + """ + Remove a number of rows from the end of the ExperimentData object. + + Parameters + ---------- + number_of_rows : int + Number of rows to remove from the bottom. + """ + if number_of_rows == 0: + return # Don't do anything if 0 rows need to be removed + + # get the last indices from data.data + indices = self.index[-number_of_rows:] + + # remove the indices rows_to_remove from data.data + self._input_data.remove(indices) + self._output_data.remove(indices) + self._jobs.remove(indices) + + def _reset_index(self) -> None: + """ + Reset the index of the ExperimentData object. + """ + self._input_data.reset_index() + + if self._input_data.is_empty(): + self._output_data.reset_index() + else: + self._output_data.reset_index(self._input_data.indices) + self._jobs.reset_index() + + def join(self, other: ExperimentData) -> ExperimentData: + """Join two ExperimentData objects. + + Parameters + ---------- + other : ExperimentData + The other ExperimentData object to join with. + + Returns + ------- + ExperimentData + The joined ExperimentData object. 
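
        Example
        -------
        An illustrative sketch, assuming `data_a` holds only input
        columns and `data_b` holds only output columns of the same rows:

        >>> joined = data_a.join(data_b)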
+        """
+        return ExperimentData(
+            input_data=self._input_data.join(other._input_data),
+            output_data=self._output_data.join(other._output_data),
+            jobs=self._jobs,
+            domain=self.domain + other.domain,
+            project_dir=self.project_dir)
+
+    # ExperimentSample
+    # =========================================================================
+
+    def get_experiment_sample(self, index: int) -> ExperimentSample:
+        """
+        Gets the experiment_sample at the given index.
+
+        Parameters
+        ----------
+        index : int
+            The index of the experiment_sample to retrieve.
+
+        Returns
+        -------
+        ExperimentSample
+            The ExperimentSample at the given index.
+        """
+        output_experiment_sample_dict = self._output_data.get_data_dict(index)
+
+        dict_output = {k: (v, self.domain.output_space[k].to_disk)
+                       for k, v in output_experiment_sample_dict.items()}
+
+        return ExperimentSample(
+            dict_input=self._input_data.get_data_dict(index),
+            dict_output=dict_output,
+            jobnumber=index,
+            experimentdata_directory=self.project_dir)
+
+    def get_experiment_samples(
+            self,
+            indices: Optional[Iterable[int]] = None) -> List[ExperimentSample]:
+        """
+        Gets the experiment_samples at the given indices.
+
+        Parameters
+        ----------
+        indices : Optional[Iterable[int]], optional
+            The indices of the experiment_samples to retrieve, by default None
+            If None, all experiment_samples are retrieved.
+
+        Returns
+        -------
+        List[ExperimentSample]
+            The ExperimentSamples at the given indices.
+        """
+        if indices is None:
+            # Return a list of the iterator over ExperimentData
+            return list(self)
+
+        return [self.get_experiment_sample(index) for index in indices]
+
+    def _set_experiment_sample(self,
+                               experiment_sample: ExperimentSample) -> None:
+        """
+        Sets the ExperimentSample at the given index.
+
+        Parameters
+        ----------
+        experiment_sample : ExperimentSample
+            The ExperimentSample to set.
+        """
+        for column, (value, is_disk) in experiment_sample._dict_output.items():
+
+            if not self.domain.is_in_output(column):
+                self.domain.add_output(column, to_disk=is_disk)
+
+            self._output_data.set_data(
+                row=experiment_sample.job_number, value=value,
+                column=column)
+
+        self._jobs.mark(experiment_sample._jobnumber, status=Status.FINISHED)
+
+    @_access_file
+    def _write_experiment_sample(self,
+                                 experiment_sample: ExperimentSample) -> None:
+        """
+        Sets the ExperimentSample at the given index and writes the
+        result to the ExperimentData file on disk.
+
+        Parameters
+        ----------
+        experiment_sample : ExperimentSample
+            The ExperimentSample to set.
+        """
+        self._set_experiment_sample(experiment_sample)
+
+    def _access_open_job_data(self) -> ExperimentSample:
+        """Get the data of the first available open job.
+
+        Returns
+        -------
+        ExperimentSample
+            The ExperimentSample object of the first available open job.
+        """
+        job_index = self._jobs.get_open_job()
+        self._jobs.mark(job_index, status=Status.IN_PROGRESS)
+        experiment_sample = self.get_experiment_sample(job_index)
+        return experiment_sample
+
+    @_access_file
+    def _get_open_job_data(self) -> ExperimentSample:
+        """Get the data of the first available open job by
+        accessing the ExperimentData on disk.
+
+        Returns
+        -------
+        ExperimentSample
+            The ExperimentSample object of the first available open job.
+        """
+        return self._access_open_job_data()
+
+    # Jobs
+    # =========================================================================
+
+    def _set_error(self, index: int) -> None:
+        """Mark the experiment_sample at the given index as error.
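+
+        All output values of this experiment are overwritten with the
+        string 'ERROR'.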
+ + Parameters + ---------- + index + index of the experiment_sample to mark as error + """ + # self.jobs.mark_as_error(index) + self._jobs.mark(index, status=Status.ERROR) + self._output_data.set_data( + index, + value=['ERROR' for _ in self._output_data.names]) + + @_access_file + def _write_error(self, index: int): + """Mark the experiment_sample at the given index as + error and write to ExperimentData file. + + Parameters + ---------- + index + index of the experiment_sample to mark as error + """ + self._set_error(index) + + @_access_file + def is_all_finished(self) -> bool: + """Check if all jobs are finished + + Returns + ------- + bool + True if all jobs are finished, False otherwise + """ + return self._jobs.is_all_finished() + + def mark(self, indices: Iterable[int], + status: Literal['open', 'in progress', 'finished', 'error']): + """Mark the jobs at the given indices with the given status. + + Parameters + ---------- + indices : Iterable[int] + indices of the jobs to mark + status : Literal['open', 'in progress', 'finished', 'error'] + status to mark the jobs with: choose between: 'open', \ + 'in progress', 'finished' or 'error' + + Raises + ------ + ValueError + If the given status is not any of 'open', 'in progress', \ + 'finished' or 'error' + """ + # Check if the status is in Status + if not any(status.lower() == s.value for s in Status): + raise ValueError(f"Invalid status {status} given. " + f"\nChoose from values: " + f"{', '.join([s.value for s in Status])}") + + self._jobs.mark(indices, status) + + def mark_all(self, + status: Literal['open', 'in progress', 'finished', 'error']): + """Mark all the experiments with the given status + + Parameters + ---------- + status : Literal['open', 'in progress', 'finished', 'error'] + status to mark the jobs with: \ + choose between: + + * 'open', + * 'in progress', + * 'finished' + * 'error' + + Raises + ------ + ValueError + If the given status is not any of \ + 'open', 'in progress', 'finished' or 'error' + """ + self.mark(self._jobs.indices, status) + + def mark_all_error_open(self) -> None: + """ + Mark all the experiments that have the status 'error' open + """ + self._jobs.mark_all_error_open() + + def mark_all_in_progress_open(self) -> None: + """ + Mark all the experiments that have the status 'in progress' open + """ + self._jobs.mark_all_in_progress_open() + + def mark_all_nan_open(self) -> None: + """ + Mark all the experiments that have 'nan' in output open + """ + indices = self._output_data.get_index_with_nan() + self.mark(indices=indices, status='open') + # Datageneration + # ========================================================================= + + def evaluate(self, data_generator: DataGenerator, + mode: Literal['sequential', 'parallel', + 'cluster', 'cluster_parallel'] = 'sequential', + kwargs: Optional[dict] = None, + output_names: Optional[List[str]] = None) -> None: + """Run any function over the entirety of the experiments + + Parameters + ---------- + data_generator : DataGenerator + data generator to use + mode : str, optional + operational mode, by default 'sequential'. 
Choose between:
+
+            * 'sequential' : Run the operation sequentially
+            * 'parallel' : Run the operation on multiple cores
+            * 'cluster' : Run the operation on the cluster
+            * 'cluster_parallel' : Run the operation on the cluster
+              in parallel
+
+        kwargs : dict, optional
+            Any keyword arguments that need to
+            be supplied to the function, by default None
+        output_names : List[str], optional
+            If you provide a function as data generator, you have to provide
+            the names of all the output parameters that are in the return
+            statement, in order of appearance.
+
+        Raises
+        ------
+        ValueError
+            Raised when an invalid parallelization mode is specified
+        """
+        if kwargs is None:
+            kwargs = {}
+
+        if inspect.isfunction(data_generator):
+            if output_names is None:
+                raise TypeError(
+                    ("If you provide a function as data generator, you have "
+                     "to provide the names of the return arguments with the "
+                     "output_names attribute."))
+            data_generator = convert_function(
+                f=data_generator, output=output_names)
+
+        elif isinstance(data_generator, str):
+            data_generator = _datagenerator_factory(
+                data_generator, self.domain, kwargs)
+
+        if mode.lower() == "sequential":
+            return self._run_sequential(data_generator, kwargs)
+        elif mode.lower() == "parallel":
+            return self._run_multiprocessing(data_generator, kwargs)
+        elif mode.lower() == "cluster":
+            return self._run_cluster(data_generator, kwargs)
+        elif mode.lower() == "cluster_parallel":
+            return self._run_cluster_parallel(data_generator, kwargs)
+        else:
+            raise ValueError("Invalid parallelization mode specified.")
+
+    def _run_sequential(self, data_generator: DataGenerator, kwargs: dict):
+        """Run the operation sequentially
+
+        Parameters
+        ----------
+        data_generator : DataGenerator
+            data generator that is run for every open entry in the
+            ExperimentData object
+        kwargs : dict
+            Any keyword arguments that need to be supplied to the function
+
+        Raises
+        ------
+        NoOpenJobsError
+            Raised when there are no open jobs left
+        """
+        while True:
+            try:
+                experiment_sample = self._access_open_job_data()
+                logger.debug(
+                    f"Accessed experiment_sample "
+                    f"{experiment_sample._jobnumber}")
+            except NoOpenJobsError:
+                logger.debug("No Open Jobs left")
+                break
+
+            try:
+
+                # If kwargs is empty dict
+                if not kwargs:
+                    logger.debug(
+                        f"Running experiment_sample "
+                        f"{experiment_sample._jobnumber}")
+                else:
+                    logger.debug(
+                        f"Running experiment_sample "
+                        f"{experiment_sample._jobnumber} with kwargs {kwargs}")
+
+                _experiment_sample = data_generator._run(
+                    experiment_sample, **kwargs)  # no *args!
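+                # Store the returned sample and mark this job as finished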
+                self._set_experiment_sample(_experiment_sample)
+            except Exception as e:
+                error_msg = (f"Error in experiment_sample "
+                             f"{experiment_sample._jobnumber}: {e}")
+                error_traceback = traceback.format_exc()
+                logger.error(f"{error_msg}\n{error_traceback}")
+                self._set_error(experiment_sample._jobnumber)
+
+    def _run_multiprocessing(self, data_generator: DataGenerator,
+                             kwargs: dict):
+        """Run the operation on multiple cores
+
+        Parameters
+        ----------
+        data_generator : DataGenerator
+            data generator that is run for every open entry in the
+            ExperimentData object
+        kwargs : dict
+            Any keyword arguments that need to be supplied to the function
+
+        Raises
+        ------
+        NoOpenJobsError
+            Raised when there are no open jobs left
+        """
+        # Get all the jobs
+        options = []
+        while True:
+            try:
+                experiment_sample = self._access_open_job_data()
+                options.append(
+                    ({'experiment_sample': experiment_sample, **kwargs},))
+            except NoOpenJobsError:
+                break
+
+        def f(options: Dict[str, Any]) -> Tuple[ExperimentSample, int]:
+            try:
+
+                logger.debug(
+                    f"Running experiment_sample "
+                    f"{options['experiment_sample'].job_number}")
+
+                return (data_generator._run(**options), 0)  # no *args!
+
+            except Exception as e:
+                error_msg = (f"Error in experiment_sample "
+                             f"{options['experiment_sample'].job_number}: "
+                             f"{e}")
+                error_traceback = traceback.format_exc()
+                logger.error(f"{error_msg}\n{error_traceback}")
+                return (options['experiment_sample'], 1)
+
+        with mp.Pool() as pool:
+            # maybe implement pool.starmap_async ?
+            _experiment_samples: List[
+                Tuple[ExperimentSample, int]] = pool.starmap(f, options)
+
+        for _experiment_sample, exit_code in _experiment_samples:
+            if exit_code == 0:
+                self._set_experiment_sample(_experiment_sample)
+            else:
+                self._set_error(_experiment_sample.job_number)
+
+    def _run_cluster(self, data_generator: DataGenerator, kwargs: dict):
+        """Run the operation on the cluster
+
+        Parameters
+        ----------
+        data_generator : DataGenerator
+            data generator that is run for every open entry in the
+            ExperimentData object
+        kwargs : dict
+            Any keyword arguments that need to be supplied to the function
+
+        Raises
+        ------
+        NoOpenJobsError
+            Raised when there are no open jobs left
+        """
+        # Retrieve the updated experimentdata object from disc
+        try:
+            self = self.from_file(self.project_dir)
+        except FileNotFoundError:  # If not found, store current
+            self.store()
+
+        while True:
+            try:
+                experiment_sample = self._get_open_job_data()
+            except NoOpenJobsError:
+                logger.debug("No Open jobs left!")
+                break
+
+            try:
+                _experiment_sample = data_generator._run(
+                    experiment_sample, **kwargs)
+                self._write_experiment_sample(_experiment_sample)
+            except Exception:
+                n = experiment_sample.job_number
+                error_msg = f"Error in experiment_sample {n}: "
+                error_traceback = traceback.format_exc()
+                logger.error(f"{error_msg}\n{error_traceback}")
+                self._write_error(experiment_sample._jobnumber)
+                continue
+
+        self = self.from_file(self.project_dir)
+        # Remove the lockfile from disk
+        (self.project_dir / EXPERIMENTDATA_SUBFOLDER / LOCK_FILENAME
+         ).with_suffix('.lock').unlink(missing_ok=True)
+
+    def _run_cluster_parallel(
+            self, data_generator: DataGenerator, kwargs: dict):
+        """Run the operation on the cluster and parallelize it over cores
+
+        Parameters
+        ----------
+        data_generator : DataGenerator
+            data generator that is run for every open entry in the
+            ExperimentData object
+        kwargs : dict
+            Any keyword arguments that need to be supplied to the function
+
+        Raises
+        ------
+        NoOpenJobsError
+            Raised when there are no open jobs left
+        """
+        # 
Retrieve the updated experimentdata object from disc + try: + self = self.from_file(self.project_dir) + except FileNotFoundError: # If not found, store current + self.store() + + no_jobs = False + + while True: + es_list = [] + for core in range(mp.cpu_count()): + try: + es_list.append(self._get_open_job_data()) + except NoOpenJobsError: + logger.debug("No Open jobs left!") + no_jobs = True + break + + d = self.select([e.job_number for e in es_list]) + + d._run_multiprocessing( + data_generator=data_generator, kwargs=kwargs) + + # TODO access resource first! + self.overwrite_disk( + indices=d.index, input_data=d._input_data, + output_data=d._output_data, jobs=d._jobs, + domain=d.domain, add_if_not_exist=False) + + if no_jobs: + break + + self = self.from_file(self.project_dir) + # Remove the lockfile from disk + (self.project_dir / EXPERIMENTDATA_SUBFOLDER / LOCK_FILENAME + ).with_suffix('.lock').unlink(missing_ok=True) + + # Optimization + # ========================================================================= + + def optimize(self, optimizer: Optimizer | str, + data_generator: DataGenerator | str, + iterations: int, + kwargs: Optional[Dict[str, Any]] = None, + hyperparameters: Optional[Dict[str, Any]] = None, + x0_selection: Literal['best', 'random', + 'last', + 'new'] | ExperimentData = 'best', + sampler: Optional[Sampler | str] = 'random', + overwrite: bool = False, + callback: Optional[Callable] = None) -> None: + """Optimize the experimentdata object + + Parameters + ---------- + optimizer : Optimizer | str + Optimizer object + data_generator : DataGenerator | str + DataGenerator object + iterations : int + number of iterations + kwargs : Dict[str, Any], optional + any additional keyword arguments that will be passed to + the DataGenerator + hyperparameters : Dict[str, Any], optional + any additional keyword arguments that will be passed to + the optimizer + x0_selection : str | ExperimentData + How to select the initial design. By default 'best' + The following x0_selections are available: + + * 'best': Select the best designs from the current experimentdata + * 'random': Select random designs from the current experimentdata + * 'last': Select the last designs from the current experimentdata + * 'new': Create new random designs from the current experimentdata + + If the x0_selection is 'new', new designs are sampled with the + sampler provided. The number of designs selected is equal to the + population size of the optimizer. + + If an ExperimentData object is passed as x0_selection, + the optimizer will use the input_data and output_data from this + object as initial samples. + sampler: Sampler, optional + If x0_selection = 'new', the sampler to use. By default 'random' + overwrite: bool, optional + If True, the optimizer will overwrite the current data. By default + False + callback : Callable, optional + A callback function that is called after every iteration. It has + the following signature: + + ``callback(intermediate_result: ExperimentData)`` + + where the first argument is a parameter containing an + `ExperimentData` object with the current iterate(s). 
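+
+            A minimal sketch of such a callback, assuming you only want
+            to log the number of evaluated samples:
+
+            ``callback=lambda data: print(len(data))``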
+
+        Raises
+        ------
+        ValueError
+            Raised when an invalid x0_selection is specified
+        """
+        # Create the data generator object if a string reference is passed
+        if isinstance(data_generator, str):
+            data_generator: DataGenerator = _datagenerator_factory(
+                data_generator=data_generator,
+                domain=self.domain, kwargs=kwargs)
+
+        # Create a copy of the optimizer object
+        _optimizer = copy(optimizer)
+
+        # Create the optimizer object if a string reference is passed
+        if isinstance(_optimizer, str):
+            _optimizer: Optimizer = _optimizer_factory(
+                _optimizer, self.domain, hyperparameters)
+
+        # Create the sampler object if a string reference is passed
+        if isinstance(sampler, str):
+            sampler: Sampler = _sampler_factory(sampler, self.domain)
+
+        if _optimizer.type == 'scipy':
+            self._iterate_scipy(
+                optimizer=_optimizer, data_generator=data_generator,
+                iterations=iterations, kwargs=kwargs,
+                x0_selection=x0_selection,
+                sampler=sampler,
+                overwrite=overwrite,
+                callback=callback)
+        else:
+            self._iterate(
+                optimizer=_optimizer, data_generator=data_generator,
+                iterations=iterations, kwargs=kwargs,
+                x0_selection=x0_selection,
+                sampler=sampler,
+                overwrite=overwrite,
+                callback=callback)
+
+    def _iterate(self, optimizer: Optimizer, data_generator: DataGenerator,
+                 iterations: int, kwargs: Dict[str, Any], x0_selection: str,
+                 sampler: Sampler, overwrite: bool,
+                 callback: Callable):
+        """Internal implementation of the iteration process
+
+        Parameters
+        ----------
+        optimizer : Optimizer
+            Optimizer object
+        data_generator : DataGenerator
+            DataGenerator object
+        iterations : int
+            number of iterations
+        kwargs : Dict[str, Any]
+            any additional keyword arguments that will be passed to
+            the DataGenerator
+        x0_selection : str | ExperimentData
+            How to select the initial design.
+            The following x0_selections are available:
+
+            * 'best': Select the best designs from the current experimentdata
+            * 'random': Select random designs from the current experimentdata
+            * 'last': Select the last designs from the current experimentdata
+            * 'new': Create new random designs from the current experimentdata
+
+            If the x0_selection is 'new', new designs are sampled with the
+            sampler provided. The number of designs selected is equal to the
+            population size of the optimizer.
+
+            If an ExperimentData object is passed as x0_selection,
+            the optimizer will use the input_data and output_data from this
+            object as initial samples.
+
+        sampler: Sampler
+            If x0_selection = 'new', the sampler to use
+        overwrite: bool
+            If True, the optimizer will overwrite the current data.
+        callback : Callable
+            A callback function that is called after every iteration. It has
+            the following signature:
+
+            ``callback(intermediate_result: ExperimentData)``
+
+            where the first argument is a parameter containing an
+            `ExperimentData` object with the current iterate(s).
+
+        Raises
+        ------
+        ValueError
+            Raised when an invalid x0_selection is specified
+        """
+        last_index = self.index[-1] if not self.index.empty else -1
+
+        if isinstance(x0_selection, str):
+            if x0_selection == 'new':
+
+                if iterations < optimizer._population:
+                    raise ValueError(
+                        f'For creating new samples, the total number of '
+                        f'requested iterations ({iterations}) cannot be '
+                        f'smaller than the population size '
+                        f'({optimizer._population})')
+
+                init_samples = ExperimentData.from_sampling(
+                    domain=self.domain,
+                    sampler=sampler,
+                    n_samples=optimizer._population,
+                    seed=optimizer._seed)
+
+                init_samples.evaluate(
+                    data_generator=data_generator, kwargs=kwargs,
+                    mode='sequential')
+
+                if callback is not None:
+                    callback(init_samples)
+
+                if overwrite:
+                    _indices = init_samples.index + last_index + 1
+                    self._overwrite_experiments(
+                        experiment_sample=init_samples,
+                        indices=_indices,
+                        add_if_not_exist=True)
+
+                else:
+                    self.add_experiments(init_samples)
+
+                x0_selection = 'last'
+                iterations -= optimizer._population
+
+        x0 = x0_factory(experiment_data=self, mode=x0_selection,
+                        n_samples=optimizer._population)
+        optimizer._set_data(x0)
+
+        optimizer._check_number_of_datapoints()
+
+        optimizer._construct_model(data_generator)
+
+        for _ in range(number_of_updates(
+                iterations,
+                population=optimizer._population)):
+            new_samples = optimizer.update_step(data_generator)
+
+            # If new_samples is a tuple of input_data and output_data
+            if isinstance(new_samples, tuple):
+                new_samples = ExperimentData(
+                    domain=self.domain,
+                    input_data=new_samples[0],
+                    output_data=new_samples[1],
+                )
+            # If applicable, evaluate the new designs:
+            new_samples.evaluate(
+                data_generator, mode='sequential', kwargs=kwargs)
+
+            if callback is not None:
+                callback(new_samples)
+
+            if overwrite:
+                _indices = new_samples.index + last_index + 1
+                self._overwrite_experiments(experiment_sample=new_samples,
+                                            indices=_indices,
+                                            add_if_not_exist=True)
+
+            else:
+                self.add_experiments(new_samples)
+
+            optimizer._set_data(self)
+
+        if not overwrite:
+            # Remove overiterations
+            self.remove_rows_bottom(number_of_overiterations(
+                iterations,
+                population=optimizer._population))
+
+        # Reset the optimizer
+        # optimizer.reset(ExperimentData(domain=self.domain))
+
+    def _iterate_scipy(self, optimizer: Optimizer,
+                       data_generator: DataGenerator,
+                       iterations: int, kwargs: dict,
+                       x0_selection: str | ExperimentData,
+                       sampler: Sampler, overwrite: bool,
+                       callback: Callable):
+        """Internal implementation of the iteration process for
+        scipy-minimize optimizers.
+
+        Parameters
+        ----------
+        optimizer : Optimizer
+            Optimizer object
+        data_generator : DataGenerator
+            DataGenerator object
+        iterations : int
+            number of iterations
+        kwargs : Dict[str, Any]
+            any additional keyword arguments that will be passed to
+            the DataGenerator
+        x0_selection : str | ExperimentData
+            How to select the initial design.
+            The following x0_selections are available:
+
+            * 'best': Select the best designs from the current experimentdata
+            * 'random': Select random designs from the current experimentdata
+            * 'last': Select the last designs from the current experimentdata
+            * 'new': Create new random designs from the current experimentdata
+
+            If the x0_selection is 'new', new designs are sampled with the
+            sampler provided. The number of designs selected is equal to the
+            population size of the optimizer.
+ + If an ExperimentData object is passed as x0_selection, + the optimizer will use the input_data and output_data from this + object as initial samples. + + sampler: Sampler + If x0_selection = 'new', the sampler to use + overwrite: bool + If True, the optimizer will overwrite the current data. + callback : Callable + A callback function that is called after every iteration. It has + the following signature: + + ``callback(intermediate_result: ExperimentData)`` + + where the first argument is a parameter containing an + `ExperimentData` object with the current iterate(s). + + Raises + ------ + ValueError + Raised when invalid x0_selection is specified + """ + last_index = self.index[-1] if not self.index.empty else -1 + n_data_before_iterate = len(self) + + if isinstance(x0_selection, str): + if x0_selection == 'new': + + if iterations < optimizer._population: + raise ValueError( + f'For creating new samples, the total number of ' + f'requested iterations ({iterations}) cannot be ' + f'smaller than the population size ' + f'({optimizer._population})') + + init_samples = ExperimentData.from_sampling( + domain=self.domain, + sampler=sampler, + n_samples=optimizer._population, + seed=optimizer._seed) + + init_samples.evaluate( + data_generator=data_generator, kwargs=kwargs, + mode='sequential') + + if callback is not None: + callback(init_samples) + + if overwrite: + _indices = init_samples.index + last_index + 1 + self._overwrite_experiments( + experiment_sample=init_samples, + indices=_indices, + add_if_not_exist=True) + + else: + self.add_experiments(init_samples) + + x0_selection = 'last' + + x0 = x0_factory(experiment_data=self, mode=x0_selection, + n_samples=optimizer._population) + optimizer._set_data(x0) + + optimizer._check_number_of_datapoints() + + optimizer.run_algorithm(iterations, data_generator) + + new_samples: ExperimentData = optimizer.data.select( + optimizer.data.index[1:]) + new_samples.evaluate(data_generator, mode='sequential', kwargs=kwargs) + + if callback is not None: + callback(new_samples) + + if overwrite: + self.add_experiments( + optimizer.data.select([optimizer.data.index[-1]])) + + elif not overwrite: + # Do not add the first element, as this is already + # in the sampled data + self.add_experiments(new_samples) + + # TODO: At the end, the data should have + # n_data_before_iterate + iterations amount of elements! 
+ # If x_new is empty, repeat best x0 to fill up total iteration + if len(self) == n_data_before_iterate: + repeated_sample = self.get_n_best_output( + n_samples=1) + + for repetition in range(iterations): + self.add_experiments(repeated_sample) + + # Repeat last iteration to fill up total iteration + if len(self) < n_data_before_iterate + iterations: + last_design = self.get_experiment_sample(len(self)-1) + + while len(self) < n_data_before_iterate + iterations: + self.add_experiments(last_design) + + # Evaluate the function on the extra iterations + self.evaluate(data_generator, mode='sequential', kwargs=kwargs) + + # Reset the optimizer + # optimizer.reset(ExperimentData(domain=self.domain)) + + # Sampling + # ========================================================================= + + def sample(self, sampler: Sampler | SamplerNames, n_samples: int = 1, + seed: Optional[int] = None, **kwargs) -> None: + """Sample data from the domain providing the sampler strategy + + Parameters + ---------- + sampler: Sampler | str + Sampler callable or string of built-in sampler + If a string is passed, it should be one of the built-in samplers: + + * 'random' : Random sampling + * 'latin' : Latin Hypercube Sampling + * 'sobol' : Sobol Sequence Sampling + * 'grid' : Grid Search Sampling + n_samples : int, optional + Number of samples to generate, by default 1 + seed : Optional[int], optional + Seed to use for the sampler, by default None + + Note + ---- + When using the 'grid' sampler, an optional argument + 'stepsize_continuous_parameters' can be passed to specify the stepsize + to cast continuous parameters to discrete parameters. + + - The stepsize should be a dictionary with the parameter names as keys\ + and the stepsize as values. + - Alternatively, a single stepsize can be passed for all continuous\ + parameters. + + Raises + ------ + ValueError + Raised when invalid sampler type is specified + """ + + if isinstance(sampler, str): + sampler = _sampler_factory(sampler, self.domain) + + sample_data: DataTypes = sampler( + domain=self.domain, n_samples=n_samples, seed=seed, **kwargs) + self.add(input_data=sample_data, domain=self.domain) + + # Project directory + # ========================================================================= + + def set_project_dir(self, project_dir: Path | str): + """Set the directory of the f3dasm project folder. 
+
+        Parameters
+        ----------
+        project_dir : Path or str
+            Path to the project directory
+        """
+        self.project_dir = _project_dir_factory(project_dir)
+
+
+def x0_factory(experiment_data: ExperimentData,
+               mode: str | ExperimentData, n_samples: int):
+    """Set the initial population to the best n samples of the given data
+
+    Parameters
+    ----------
+    experiment_data : ExperimentData
+        Data to be used for the initial population
+    mode : str | ExperimentData
+        Mode of selecting the initial population.
+        The following modes are available:
+
+        - best: select the best n samples
+        - random: select n random samples
+        - last: select the last n samples
+
+        If an ExperimentData object is passed, it is used directly as
+        the initial population.
+    n_samples : int
+        Number of samples to select
+
+    Raises
+    ------
+    ValueError
+        Raised when the mode is not recognized
+    """
+    if isinstance(mode, ExperimentData):
+        x0 = mode
+
+    elif mode == 'best':
+        x0 = experiment_data.get_n_best_output(n_samples)
+
+    elif mode == 'random':
+        x0 = experiment_data.select(
+            np.random.choice(
+                experiment_data.index,
+                size=n_samples, replace=False))
+
+    elif mode == 'last':
+        x0 = experiment_data.select(
+            experiment_data.index[-n_samples:])
+
+    else:
+        raise ValueError(
+            f'Unknown selection mode {mode}, use best, random or last')
+
+    x0._reset_index()
+    return x0
+
+
+def combine_data_to_multiindex(
+        experiment_data: ExperimentData) -> pd.DataFrame:
+    """Combine the data to a multiindex dataframe.
+
+    Parameters
+    ----------
+    experiment_data: ExperimentData
+        The ExperimentData object to combine
+
+    Returns
+    -------
+    pd.DataFrame
+        The combined dataframe.
+
+    Note
+    ----
+    This function is mainly used to show the combined ExperimentData
+    object in a Jupyter Notebook
+    """
+    return pd.concat(
+        [experiment_data._jobs.to_dataframe(),
+         experiment_data._input_data.to_dataframe(),
+         experiment_data._output_data.to_dataframe()],
+        axis=1, keys=['jobs', 'input', 'output'])
diff --git a/src/f3dasm/_src/experimentdata/_jobqueue.py b/src/f3dasm/_src/experimentdata/_jobqueue.py
index 438b6c4d..79264ce1 100644
--- a/src/f3dasm/_src/experimentdata/_jobqueue.py
+++ b/src/f3dasm/_src/experimentdata/_jobqueue.py
@@ -91,7 +91,7 @@ def __add__(self, other: _JobQueue | str) -> _JobQueue:
         other_jobs_copy.index = other_jobs_copy.index + last_index + 1
         return _JobQueue(pd.concat([self.jobs, other_jobs_copy]))
 
-    def __getitem__(self, index: int | slice | Iterable[int]) -> _Data:
+    def __getitem__(self, index: int | slice | Iterable[int]) -> _JobQueue:
         """Get a subset of the data.
 
         Parameters
@@ -163,6 +163,7 @@ def from_file(cls: Type[_JobQueue], filename: Path | str) -> _JobQueue:
 
         return cls(pd.read_pickle(filename))
 
+    # TODO: This function is not used!
     def reset(self) -> None:
         """Resets the job queue."""
         self.jobs = pd.Series(dtype='string')
@@ -230,6 +231,7 @@ def remove(self, indices: List[int]):
         """
         self.jobs = self.jobs.drop(indices)
 
+    # TODO: Remove this method as it is not used!
     def add(self, number_of_jobs: int = 1, status: str = Status.OPEN):
         """Adds a number of jobs to the job queue.
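
Usage note: the evaluate() API above accepts a plain Python function as
data generator, provided output_names lists the names of the returned
values. A minimal sketch of the intended workflow follows; the import path
of ExperimentData and the keyword arguments of Domain.add for a float
parameter are assumptions, not verified against the f3dasm documentation:

    from f3dasm import ExperimentData  # assumed import path
    from f3dasm.design import Domain

    domain = Domain()
    # 'low' and 'high' are assumed keywords for a float parameter
    domain.add(name='x', type='float', low=0.0, high=1.0)

    data = ExperimentData(domain=domain)
    data.sample(sampler='random', n_samples=10, seed=42)

    def f(x: float) -> float:
        # the argument name is assumed to match the input column 'x'
        return x ** 2

    # output_names maps the returned value(s) to output column(s),
    # in order of appearance in the return statement
    data.evaluate(data_generator=f, mode='sequential', output_names=['y'])

Jobs that raised an exception are marked 'error'; they can be re-opened
with mark_all_error_open() and evaluated again.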
From c4c8a76fc9dccb300d5ef50fbb1fffd40921b014 Mon Sep 17 00:00:00 2001 From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com> Date: Mon, 24 Jun 2024 09:30:33 +0200 Subject: [PATCH 08/17] remove commented code --- .../_experimental/_newexperimentdata2.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py b/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py index cb776297..9e762296 100644 --- a/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py +++ b/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py @@ -132,14 +132,6 @@ def __init__(self, self._jobs = _jobs_factory( jobs, self._input_data, self._output_data, job_value) - # # Check if the columns of input_data are in the domain - # if not self._input_data.columns.has_columnnames(self.domain.names): - # self._input_data.columns.set_columnnames(self.domain.names) - - # if not self._output_data.columns.has_columnnames( - # self.domain.output_names): - # self._output_data.columns.set_columnnames(self.domain.output_names) - # For backwards compatibility; if the output_data has # only one column, rename it to 'y' # TODO: Fix this for newdata2 @@ -253,11 +245,6 @@ def index(self) -> pd.Index: """ return self._jobs.indices - # if self._input_data.is_empty(): - # return self._output_data.indices - - # return self._input_data.indices - # Alternative Constructors # ========================================================================= From 9be592857135de2ed3eb607b4084405a7e383e29 Mon Sep 17 00:00:00 2001 From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com> Date: Tue, 25 Jun 2024 10:26:44 +0200 Subject: [PATCH 09/17] Added docstings and tests for _newdata2 object --- .../experimentdata/_experimental/_newdata2.py | 389 +++++++++++++++-- .../_experimental/_newexperimentdata2.py | 5 - tests/newdata/conftest.py | 33 +- tests/newdata/test_data.py | 391 +++++++++--------- 4 files changed, 559 insertions(+), 259 deletions(-) diff --git a/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py b/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py index 759473e6..4bff29cd 100644 --- a/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py +++ b/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py @@ -24,25 +24,76 @@ MISSING_VALUE = np.nan +# ============================================================================= + class _Data: def __init__(self, data: Dict[int, Dict[str, Any]] = None): + """ + Initialize the _Data object. + + Parameters + ---------- + data : Dict[int, Dict[str, Any]], optional + The data dictionary with integer keys and dictionaries as values. + """ self.data = data if data is not None else {} def __len__(self) -> int: + """ + Get the number of items in the data. + + Returns + ------- + int + Number of items in the data. + """ return len(self.data) def __iter__(self): + """ + Get an iterator over the data values. + + Returns + ------- + iterator + Iterator over the data values. + """ return iter(self.data.values()) def __getitem__(self, rows: int | slice | Iterable[int]) -> _Data: - + """ + Get a subset of the data. + + Parameters + ---------- + rows : int or slice or Iterable[int] + The rows to retrieve. + + Returns + ------- + _Data + The subset of the data. 
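+
+        Rows that are not present in the data are returned as empty
+        dictionaries.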
+ """ if isinstance(rows, int): rows = [rows] return _Data({row: self.data.get(row, {}) for row in rows}) def __add__(self, __o: _Data) -> _Data: + """ + Add another _Data object to this one. + + Parameters + ---------- + __o : _Data + The other _Data object. + + Returns + ------- + _Data + The combined _Data object. + """ if self.is_empty(): return __o @@ -56,78 +107,254 @@ def __add__(self, __o: _Data) -> _Data: return _data_copy def __eq__(self, __o: _Data) -> bool: + """ + Check if another _Data object is equal to this one. + + Parameters + ---------- + __o : _Data + The other _Data object. + + Returns + ------- + bool + True if the objects are equal, False otherwise. + """ return self.data == __o.data def _repr_html_(self) -> str: + """ + Get the HTML representation of the data. + + Returns + ------- + str + The HTML representation of the data. + """ return self.to_dataframe()._repr_html_() def __repr__(self) -> str: + """ + Get the string representation of the data. + + Returns + ------- + str + The string representation of the data. + """ return self.to_dataframe().__repr__() + +# Properties +# ============================================================================= + @property def indices(self) -> List[int]: + """ + Get the indices of the data. + + Returns + ------- + List[int] + The list of indices. + """ return list(self.data.keys()) @property def names(self) -> List[str]: + """ + Get the column names of the data. + + Returns + ------- + List[str] + The list of column names. + """ return self.to_dataframe().columns.tolist() + def is_empty(self) -> bool: + """ + Check if the data is empty. + + Returns + ------- + bool + True if the data is empty, False otherwise. + """ + return not bool(self.data) + + +# Initialization +# ============================================================================= + @classmethod - def from_indices(cls, rows: Iterable[int]): + def from_indices(cls, rows: Iterable[int]) -> _Data: + """ + Create a _Data object from a list of indices. + + Parameters + ---------- + rows : Iterable[int] + The indices to create the _Data object from. + + Returns + ------- + _Data + The created _Data object. + """ return cls({row: {} for row in rows}) - # @classmethod - # def from_domain(cls, space: Iterable[str]): - # return cls(None) - @classmethod def from_file(cls, filename: Path) -> _Data: + """ + Create a _Data object from a file. + + Parameters + ---------- + filename : Path + The file to read the data from. + + Returns + ------- + _Data + The created _Data object. + """ ... @classmethod def from_numpy(cls: Type[_Data], array: np.ndarray, keys: Optional[Iterable[str]] = None) -> _Data: + """ + Create a _Data object from a numpy array. + + Parameters + ---------- + array : np.ndarray + The numpy array to create the _Data object from. + keys : Optional[Iterable[str]], optional + The keys for the columns of the data. + + Returns + ------- + _Data + The created _Data object. + """ if keys is not None: return _Data( {index: {key: col for key, col in zip(keys, row) } for index, row in enumerate(array)}) else: - # Look out! i is now an integer key! return _Data( {index: {i: col for i, col in enumerate(row) } for index, row in enumerate(array)}) @classmethod def from_dataframe(cls, df: pd.DataFrame) -> _Data: + """ + Create a _Data object from a pandas DataFrame. + + Parameters + ---------- + df : pd.DataFrame + The DataFrame to create the _Data object from. + + Returns + ------- + _Data + The created _Data object. 
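+
+        Note
+        ----
+        The index of the DataFrame is discarded: rows are re-keyed with
+        consecutive integers starting at 0.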
+ """ return _Data( {index: row.to_dict() for index, (_, row) in enumerate(df.iterrows())}) +# Exporting +# ============================================================================= + def to_numpy(self) -> np.ndarray: + """ + Convert the data to a numpy array. + + Returns + ------- + np.ndarray + The numpy array representation of the data. + """ return self.to_dataframe().to_numpy() def to_xarray(self, label: str): + """ + Convert the data to an xarray DataArray. + + Parameters + ---------- + label : str + The label for the xarray DataArray. + + Returns + ------- + xr.DataArray + The xarray DataArray representation of the data. + """ df = self.to_dataframe() - # Can create the xarray with the information from the domain! return xr.DataArray( self.to_dataframe(), dims=['iterations', label], coords={ 'iterations': df.index, label: df.columns}) def to_dataframe(self) -> pd.DataFrame: - # Can create the dataframe from the numpy array + column names!! + """ + Convert the data to a pandas DataFrame. + + Returns + ------- + pd.DataFrame + The DataFrame representation of the data. + """ return pd.DataFrame(self.data).T def store(self, filename: Path): + """ + Store the data to a file. + + Parameters + ---------- + filename : Path + The file to store the data in. + """ ... - def n_best_samples(self, nosamples: int, key: str) -> _Data: - df = self.to_dataframe() - return df.nsmallest( - n=nosamples, columns=key) + def get_data_dict(self, row: int) -> Dict[str, Any]: + """ + Get the data dictionary for a specific row. + + Parameters + ---------- + row : int + The row to retrieve the data from. + + Returns + ------- + Dict[str, Any] + The data dictionary for the specified row. + """ + return self.data[row] - def select_columns(self, keys: Iterable[str] | str) -> _Data: - # This only works for single ints or slices!! +# Selecting and combining +# ============================================================================= + def select_columns(self, keys: Iterable[str] | str) -> _Data: + """ + Select specific columns from the data. + + Parameters + ---------- + keys : Iterable[str] or str + The keys of the columns to select. + + Returns + ------- + _Data + The _Data object with only the selected columns. + """ if isinstance(keys, str): keys = [keys] @@ -136,47 +363,135 @@ def select_columns(self, keys: Iterable[str] | str) -> _Data: for index, row in self.data.items()}) def drop(self, keys: Iterable[str] | str) -> _Data: - # Might be depreciated? - + """ + Drop specific columns from the data. + + Parameters + ---------- + keys : Iterable[str] or str + The keys of the columns to drop. + + Returns + ------- + _Data + The _Data object with the specified columns removed. + """ if isinstance(keys, str): keys = [keys] - for row in self.data: + for row in self: for key in keys: if key in row: - del self.data[row][key] + del row[key] + + def join(self, __o: _Data) -> _Data: + """ + Join another _Data object with this one. + + Parameters + ---------- + __o : _Data + The other _Data object to join with this one. + + Returns + ------- + _Data + The combined _Data object. + """ + _data = deepcopy(self) + for row, other_row in zip(_data, __o): + row.update(other_row) + + return _data + +# Modifying +# ============================================================================= + + def n_best_samples(self, nosamples: int, key: str) -> pd.DataFrame: + """ + Get the top N samples based on a specific key. + + Parameters + ---------- + nosamples : int + The number of samples to retrieve. 
+ key : str + The key to sort the samples by. + + Returns + ------- + pd.DataFrame + The DataFrame with the top N samples. + """ + df = self.to_dataframe() + return df.nsmallest(n=nosamples, columns=key) def add_column(self, key: str): + """ + Add a new column to the data with missing values. + + Parameters + ---------- + key : str + The key for the new column. + """ for row in self.data: self.data[row][key] = MISSING_VALUE def remove(self, rows: Iterable[int]): + """ + Remove specific rows from the data. + + Parameters + ---------- + rows : Iterable[int] + The rows to remove. + """ for row in rows: - del self.data[row] # = deleting the row + del self.data[row] def overwrite(self, rows: Iterable[int], __o: _Data): + """ + Overwrite specific rows with data from another _Data object. + + Parameters + ---------- + rows : Iterable[int] + The rows to overwrite. + __o : _Data + The _Data object to overwrite the rows with. + """ for index, other_row in zip(rows, __o): self.data[index] = other_row - def join(self, __o: _Data) -> _Data: - _data = deepcopy(self) - for row, other_row in zip(_data, __o): - row.update(other_row) - - return _Data(_data) - - def get_data_dict(self, row: int) -> Dict[str, Any]: - return self.data[row] - def set_data(self, row: int, value: Any, key: str): + """ + Set a specific value in the data. + + Parameters + ---------- + row : int + The row to set the value in. + value : Any + The value to set. + key : str + The key for the value. + """ self.data[row][key] = value def reset_index(self, rows: Iterable[int] = None): - self.data = {index: values for index, values in enumerate(self.data) - } + """ + Reset the index of the data. - def is_empty(self) -> bool: - return not bool(self.data) + Parameters + ---------- + rows : Iterable[int], optional + The rows to reset the index for. + + """ + self.data = {index: values for index, values in enumerate(self)} + +# ============================================================================= def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data: @@ -193,7 +508,9 @@ def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data: _Data The data object. 
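+
+    For example, ``{'a': 1}`` becomes ``_Data({0: {'a': 1}})``.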
""" - return _Data({0: {dictionary}}) + return _Data({0: dictionary}) + +# ============================================================================= def _data_factory(data: DataTypes) -> _Data: @@ -217,5 +534,7 @@ def _data_factory(data: DataTypes) -> _Data: f"Data must be of type _Data, pd.DataFrame, np.ndarray, " f"Path or str, not {type(data)}") +# ============================================================================= + DataTypes = Union[pd.DataFrame, np.ndarray, Path, str, _Data] diff --git a/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py b/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py index 9e762296..0a2cc770 100644 --- a/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py +++ b/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py @@ -142,11 +142,6 @@ def __len__(self): """The len() method returns the number of datapoints""" return len(self._jobs) - # if self._input_data.is_empty(): - # return len(self._output_data) - - # return len(self._input_data) - def __iter__(self) -> Iterator[Tuple[Dict[str, Any]]]: self.current_index = 0 return self diff --git a/tests/newdata/conftest.py b/tests/newdata/conftest.py index be072701..acde62e9 100644 --- a/tests/newdata/conftest.py +++ b/tests/newdata/conftest.py @@ -1,40 +1,31 @@ import numpy as np import pytest -from f3dasm._src.experimentdata._columns import _Columns -from f3dasm._src.experimentdata._newdata import _Index +from f3dasm._src.experimentdata._experimental._newdata2 import _Data from f3dasm.design import Domain @pytest.fixture(scope="package") def list_1(): - return [[np.array([0.3, 5.0, 0.34]), 'd', 3], [np.array( - [0.23, 5.0, 0.0]), 'f', 4], [np.array([0.3, 5.0, 0.2]), 'c', 0]] - - -@pytest.fixture(scope="package") -def columns_1(): - return _Columns({'a': None, 'b': None, 'c': None}) - - -@pytest.fixture(scope="package") -def indices_1(): - return _Index([3, 5, 6]) + return {0: {'a': np.array([0.3, 5.0, 0.34]), 'b': 'd', 'c': 3}, + 1: {'a': np.array([0.23, 5.0, 0.0]), 'b': 'f', 'c': 4}, + 2: {'a': np.array([0.3, 5.0, 0.2]), 'b': 'c', 'c': 0} + } @pytest.fixture(scope="package") def list_2(): - return [[np.array([0.3, 0.2])], [np.array([0.4, 0.3])], [np.array([0.0, 1.0])]] - - -@pytest.fixture(scope="package") -def columns_2(): - return _Columns({'a': None}) + return {0: {'a': np.array([0.3, 0.2])}, + 1: {'a': np.array([0.4, 0.3]), 'b': np.array([0.0, 1.0])} + } @pytest.fixture(scope="package") def list_3(): - return [[np.array([1.1, 0.2])], [np.array([8.9, 0.3])], [np.array([0.0, 0.87])]] + return {0: {'a': np.array([1.1, 0.2])}, + 1: {'a': np.array([8.9, 0.3])}, + 2: {'a': np.array([0.0, 0.87])} + } @pytest.fixture(scope="package") diff --git a/tests/newdata/test_data.py b/tests/newdata/test_data.py index 38b1b0ce..fb5f0cba 100644 --- a/tests/newdata/test_data.py +++ b/tests/newdata/test_data.py @@ -1,292 +1,287 @@ from copy import deepcopy -from typing import Any, List +from typing import Any, Dict, List import numpy as np import pandas as pd import pytest +import xarray as xr -from f3dasm._src.experimentdata._columns import _Columns -from f3dasm._src.experimentdata._newdata import _Data, _Index -from f3dasm.design import Domain +from f3dasm._src.experimentdata._experimental._newdata2 import ( + _convert_dict_to_data, _Data, _data_factory) pytestmark = pytest.mark.smoke -DataType = List[List[Any]] +DataType = Dict[int, Dict[str, Any]] +# Initialization +# ============================================================================= -def 
test_init(list_1: DataType): - data = _Data(list_1) - assert data.data == list_1 - assert data.columns.names == [0, 1, 2] - assert data.indices.equals(pd.Index([0, 1, 2])) +def test_init(): + data = _Data({0: {"a": 1, "b": 2}}) + assert len(data) == 1 + assert not data.is_empty() + assert data.data == {0: {"a": 1, "b": 2}} -def test_init_with_columns(list_1: DataType, columns_1: _Columns): - data = _Data(list_1, columns_1) - assert data.data == list_1 - assert data.names == ['a', 'b', 'c'] +def test_init_empty(): + data = _Data() + assert len(data) == 0 + assert data.is_empty() -def test_init_with_columns_and_indices( - list_1: DataType, columns_1: _Columns, indices_1: _Index): - data = _Data(list_1, columns_1, indices_1) - assert data.data == list_1 - assert data.names == ['a', 'b', 'c'] - assert data.indices.equals(pd.Index([3, 5, 6])) +def test_init_with_data(): + input_data = {0: {"a": 1, "b": 2}} + data = _Data(input_data) + assert len(data) == 1 + assert not data.is_empty() + assert data.data == input_data -def test__len__(list_1: DataType): - data = _Data(list_1) - assert len(data) == 3 +def test_from_numpy(): + array = np.array([[1, 2, 3], [4, 5, 6]]) + data = _Data.from_numpy(array) + expected_data = {0: {0: 1, 1: 2, 2: 3}, 1: {0: 4, 1: 5, 2: 6}} + assert data.data == expected_data -def test__iter__(list_1: DataType): - data = _Data(list_1) - for i, row in enumerate(data): - assert row == list_1[i] +def test_from_numpy_with_keys(): + array = np.array([[1, 2, 3], [4, 5, 6]]) + data = _Data.from_numpy(array, keys=["a", "b", "c"]) + expected_data = {0: {"a": 1, "b": 2, "c": 3}, 1: {"a": 4, "b": 5, "c": 6}} + assert data.data == expected_data -def test__getitem__(list_1: DataType): - data = _Data(list_1) - assert data[0].data[0] == list_1[0] - assert data[1].data[0] == list_1[1] - assert data[2].data[0] == list_1[2] +def test_from_dataframe(): + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + data = _Data.from_dataframe(df) + expected_data = {0: {"a": 1, "b": 3}, 1: {"a": 2, "b": 4}} + assert data.data == expected_data -def test__getitem__list(list_1: DataType): - data = _Data(data=[[1, 2, 3], [4, 5, 6]], columns=_Columns( - {'a': None, 'b': None, 'c': None}), index=_Index([3, 45])) - assert data[[3, 45]].data == data.data +def test_from_indices(): + data = _Data.from_indices([0, 1]) + assert data.data == {0: {}, 1: {}} -def test__add__(list_1: DataType, list_3: DataType): - data_1 = _Data(list_1) - data_2 = _Data(list_3) - data_3 = data_1 + data_2 - assert data_3.data == list_1 + list_3 - assert data_3.columns.names == [0, 1, 2] +# Exporting +# ============================================================================= -def test__add__empty(list_3: DataType): - data_1 = _Data(columns=_Columns({0: None, 1: None, 2: None})) - data_2 = _Data(list_3) - data_3 = data_1 + data_2 - assert data_3.data == list_3 - assert data_3.columns.names == [0, 1, 2] +def test_to_numpy(): + input_data = {0: {"a": 1, "b": 2}, 1: {"a": 3, "b": 4}} + data = _Data(input_data) + np_array = data.to_numpy() + expected_array = np.array([[1, 2], [3, 4]]) + np.testing.assert_array_equal(np_array, expected_array) -def test__eq__(list_1: DataType): - data_1 = _Data(list_1) - data_2 = _Data(list_1) - assert data_1 == data_2 +def test_to_dataframe(): + input_data = {0: {"a": 1, "b": 2}, 1: {"a": 3, "b": 4}} + data = _Data(input_data) + df = data.to_dataframe() + expected_df = pd.DataFrame({"a": [1, 3], "b": [2, 4]}) + pd.testing.assert_frame_equal(df, expected_df) -def test_repr_html(list_1: DataType): - data = 
_Data(list_1) - assert data._repr_html_() == data.to_dataframe()._repr_html_() +def test_to_xarray(): + input_data = {0: {"a": 1, "b": 2}, 1: {"a": 3, "b": 4}} + data = _Data(input_data) + xarray = data.to_xarray('test') + expected_xarray = xr.DataArray( + [[1, 2], [3, 4]], dims=["iterations", "test"], + coords={"iterations": [0, 1], "test": ["a", "b"]}) + xr.testing.assert_equal(xarray, expected_xarray) -# Properties -# ============================================================================= +def test_get_data_dict(): + input_data = {0: {"a": 1, "b": 2}, 1: {"a": 3, "b": 4}} + data = _Data(input_data) + assert data.get_data_dict(0) == {"a": 1, "b": 2} -def test_names(list_1: DataType, columns_1: _Columns): - data = _Data(list_1, columns=columns_1) - assert data.names == ['a', 'b', 'c'] +def test_convert_dict_to_data(): + dictionary = {"a": 1, "b": 2} + data = _convert_dict_to_data(dictionary) + expected_data = _Data({0: {"a": 1, "b": 2}}) + assert data == expected_data -def test_names_default(list_1: DataType): - data = _Data(list_1) - assert data.names == [0, 1, 2] +# Properties +# ============================================================================= -def test_indices(list_1: DataType, indices_1: _Index): - data = _Data(list_1, index=indices_1) - assert data.indices.equals(pd.Index([3, 5, 6])) +def test_len(): + data = _Data({0: {"a": 1}, 1: {"a": 2}}) + assert len(data) == 2 -def test_indices_default(list_1: DataType): - data = _Data(list_1) - assert data.indices.equals(pd.Index([0, 1, 2])) +def test_indices(): + data = _Data({0: {"a": 1}, 1: {"a": 2}}) + assert data.indices == [0, 1] -# Alternative constructors -# ============================================================================= - -def test_from_indices(): - data = _Data.from_indices(pd.Index([0, 1])) - assert data.indices.equals(pd.Index(([0, 1]))) - assert not data.names - assert data.is_empty() +def test_names(): + data = _Data({0: {"a": 1}, 1: {"a": 2}}) + assert data.names == ["a"] -def test_from_domain(domain: Domain): - data = _Data.from_domain(domain) - assert data.indices.equals(pd.Index([])) - assert data.names == ['a', 'b', 'c', 'd', 'e'] +def test_is_empty(): + data = _Data() assert data.is_empty() + data = _Data({0: {"a": 1}}) + assert not data.is_empty() -def test_from_numpy(): - data = _Data.from_numpy(np.array([[1, 2, 3], [4, 5, 6]])) - assert data.data == [[1, 2, 3], [4, 5, 6]] - assert data.names == [0, 1, 2] - assert data.indices.equals(pd.Index([0, 1])) +def test_getitem(): + data = _Data({0: {"a": 1}, 1: {"a": 2}}) + assert data[0] == _Data({0: {"a": 1}}) + assert data[1] == _Data({1: {"a": 2}}) + assert data[[0, 1]] == data -def test_from_dataframe(): - data = _Data.from_dataframe(pd.DataFrame([[1, 2, 3], [4, 5, 6]])) - assert data.data == [[1, 2, 3], [4, 5, 6]] - assert data.names == [0, 1, 2] - assert data.indices.equals(pd.Index([0, 1])) +def test_repr(): + data = _Data({0: {"a": 1}, 1: {"a": 2}}) + assert isinstance(data.__repr__(), str) -def test_reset(): - data = _Data.from_numpy(np.array([[1, 2, 3], [4, 5, 6]])) - data.reset() - assert data.data == [] - assert not data.names - assert data.indices.equals(pd.Index([])) +def test_repr_html(): + data = _Data({0: {"a": 1}, 1: {"a": 2}}) + assert isinstance(data._repr_html_(), str) +# Selecting and combining +# ============================================================================= -def test_reset_with_domain(domain: Domain): - data = _Data.from_numpy(np.array([[1, 2, 3], [4, 5, 6]])) - data.reset(domain) - assert data.data == [] - 
assert data.names == domain.names - assert data.indices.equals(pd.Index([])) +def test_join(): + data1 = _Data({0: {"a": 1, "b": 2}, 1: {"a": 3, "b": 4}}) + data2 = _Data({0: {"c": 5, "d": 6}, 1: {"c": 7, "d": 8}}) + data3 = data1.join(data2) + expected_data = {0: {"a": 1, "b": 2, "c": 5, "d": 6}, + 1: {"a": 3, "b": 4, "c": 7, "d": 8}} + assert data3 == _Data(expected_data) -# Export -# ============================================================================= +def test_select_columns(): + input_data = {0: {"a": 1, "b": 2, "c": 3}, 1: {"a": 4, "b": 5, "c": 6}} + data = _Data(input_data) + selected_data = data.select_columns(["a", "c"]) + expected_data = {0: {"a": 1, "c": 3}, 1: {"a": 4, "c": 6}} + assert selected_data.data == expected_data -def test_to_numpy(list_1: DataType): - data = _Data(list_1) - data.to_numpy() +def test_select_columns_single(): + input_data = {0: {"a": 1, "b": 2, "c": 3}, 1: {"a": 4, "b": 5, "c": 6}} + data = _Data(input_data) + selected_data = data.select_columns("a") + expected_data = {0: {"a": 1}, 1: {"a": 4}} + assert selected_data.data == expected_data -def to_dataframe(list_1: DataType): - data = _Data(list_1) - data.to_dataframe() - assert data.to_dataframe().equals(pd.DataFrame(list_1)) +def test_drop(): + input_data = {0: {"a": 1, "b": 2, "c": 3}, 1: {"a": 4, "b": 5, "c": 6}} + data = _Data(input_data) + data.drop(["b"]) + expected_data = {0: {"a": 1, "c": 3}, 1: {"a": 4, "c": 6}} + assert data.data == expected_data -def test_select_columns(list_1: DataType, columns_1: _Columns): - data = _Data(data=[[1, 2, 3], [4, 5, 6]], columns=columns_1) - new_data = data.select_columns(['a', 'c']) - assert new_data.names == ['a', 'c'] - assert new_data.data == [[1, 3], [4, 6]] +def test_drop_single_key(): + input_data = {0: {"a": 1, "b": 2, "c": 3}, 1: {"a": 4, "b": 5, "c": 6}} + data = _Data(input_data) + data.drop("b") + expected_data = {0: {"a": 1, "c": 3}, 1: {"a": 4, "c": 6}} + assert data.data == expected_data -def test_select_column(list_1: DataType, columns_1: _Columns): - data = _Data(data=[[1, 2, 3], [4, 5, 6]], columns=columns_1) - new_data = data.select_columns('a') - assert new_data.names == ['a'] - assert new_data.data == [[1], [4]] +# Modifying +# ============================================================================= -def test_add(list_2: DataType, list_3: DataType): - data_0 = _Data(deepcopy(list_2)) - data_1 = _Data(deepcopy(list_2)) - data_2 = _Data(list_3) - data_1.add(data_2.to_dataframe()) - assert data_1 == (data_0 + data_2) +def test_add(): + data1 = _Data({0: {"a": 1, "b": 2}}) + data2 = _Data({0: {"a": 3, "b": 4}}) + data3 = data1 + data2 + expected_data = {0: {"a": 1, "b": 2}, 1: {"a": 3, "b": 4}} + assert data3.data == expected_data -def test_add_empty_rows(): - data = _Data(data=[[1, 2, 3], [4, 5, 6]]) - data.add_empty_rows(2) - assert data.data == [[1, 2, 3], [4, 5, 6], [ - np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan]] +def test_add_empty(): + data1 = _Data() + data2 = _Data({0: {"a": 3, "b": 4}}) + data3 = data1 + data2 + assert data3.data == {0: {"a": 3, "b": 4}} def test_add_column(): - data = _Data(data=[[1, 2, 3], [4, 5, 6]]) - data.add_column('a') - assert data.data == [[1, 2, 3, np.nan], [4, 5, 6, np.nan]] - assert data.names == [0, 1, 2, 'a'] + missing_value = np.nan + data = _Data({0: {"a": 1}, 1: {"a": 2}}) + data.add_column("b") + expected_data = {0: {"a": 1, "b": missing_value}, + 1: {"a": 2, "b": missing_value}} + assert data.data == expected_data -def test_remove(): - data = _Data(data=[[1, 2, 3], [4, 5, 6]]) - 
data.remove(0) - assert data.data == [[4, 5, 6]] - assert data.names == [0, 1, 2] - - -def test_remove_list(): - data = _Data(data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - data.remove([0, 2]) - assert data.data == [[4, 5, 6]] - assert data.names == [0, 1, 2] +def test_overwrite(): + data = _Data({0: {"a": 1, "b": 2}, 1: {"a": 3, "b": 4}}) + data2 = _Data({0: {"a": 5, "b": 6}, 1: {"a": 7, "b": 8}}) + data.overwrite([0], data2) + assert data.data == {0: {"a": 5, "b": 6}, 1: {"a": 3, "b": 4}} -def test_get_data_dict(): - data = _Data(data=[[1, 2, 3], [4, 5, 6]]) - assert data.get_data_dict(0) == {0: 1, 1: 2, 2: 3} +def test_remove(): + data = _Data({0: {"a": 1, "b": 2}, 1: {"a": 3, "b": 4}}) + data.remove([1]) + assert data.data == {0: {"a": 1, "b": 2}} -def test_set_data_all_columns(): - data = _Data(data=[[1, 2, 3], [4, 5, 6]]) - data.set_data(index=0, value=[4, 5, 6]) - assert data.data == [[4, 5, 6], [4, 5, 6]] +def test_n_best_samples(): + df = pd.DataFrame({"a": [3, 1, 2], "b": [6, 4, 5]}) + data = _Data.from_dataframe(df) + best_samples = data.n_best_samples(2, "a") + expected_df = pd.DataFrame({"a": [1, 2], "b": [4, 5]}, index=[1, 2]) + pd.testing.assert_frame_equal(best_samples, expected_df) def test_set_data(): - data = _Data(data=[[1, 2, 3], [4, 5, 6]], columns=_Columns( - {'a': None, 'b': None, 'c': None})) - data.set_data(index=0, value=99, column='b') - assert data.data == [[1, 99, 3], [4, 5, 6]] + data = _Data({0: {"a": 1}}) + data.set_data(0, 2, "a") + assert data.data[0]["a"] == 2 -def test_set_data_no_valid_index(): - data = _Data(data=[[1, 2, 3], [4, 5, 6]], columns=_Columns( - {'a': None, 'b': None, 'c': None})) - with pytest.raises(IndexError): - data.set_data(index=2, value=99, column='b') +def test_reset_index(): + data = _Data({1: {"a": 1}, 3: {"a": 2}}) + data.reset_index() + expected_data = {0: {"a": 1}, 1: {"a": 2}} + assert data.data == expected_data -def test_set_data_unknown_column(): - data = _Data(data=[[1, 2, 3], [4, 5, 6]], columns=_Columns( - {'a': None, 'b': None, 'c': None})) +def test_data_factory_pandas(): + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + data = _data_factory(df) + expected_data = _Data.from_dataframe(df) + assert data == expected_data - data.set_data(index=0, value=99, column='d') - assert data.names == ['a', 'b', 'c', 'd'] - assert data.data == [[1, 2, 3, 99], [4, 5, 6, np.nan]] +def test_data_factory_numpy(): + np_array = np.array([[1, 2], [3, 4]]) + data = _data_factory(np_array) + expected_data = _Data.from_numpy(np_array) + assert data == expected_data -def test_reset_index(): - data = _Data(data=[[1, 2, 3], [4, 5, 6]], columns=_Columns( - {'a': None, 'b': None, 'c': None}), index=_Index([3, 45])) - data.reset_index() - assert data.indices.equals(pd.Index([0, 1])) +def test_data_factory_none(): + data = _data_factory(None) + expected_data = _Data() + assert data == expected_data -def test_is_empty(): - data = _Data(data=[[1, 2, 3], [4, 5, 6]], columns=_Columns( - {'a': None, 'b': None, 'c': None}), index=_Index([3, 45])) - assert not data.is_empty() - data.reset() - assert data.is_empty() +def test_data_factory_unrecognized_datatype(): + with pytest.raises(TypeError): + _ = _data_factory(0) -def test_has_columnnames(): - data = _Data(data=[[1, 2, 3], [4, 5, 6]], columns=_Columns( - {'a': None, 'b': None, 'c': None}), index=_Index([3, 45])) - assert not data.has_columnnames('d') - assert data.has_columnnames('c') - data.add_column('d') - assert data.has_columnnames('d') - -def test_set_columnnames(): - data = _Data(data=[[1, 2, 3], [4, 
5, 6]], columns=_Columns(
-        {'a': None, 'b': None, 'c': None}), index=_Index([3, 45]))
-    data.set_columnnames(['d', 'f', 'g'])
-    assert data.names == ['d', 'f', 'g']
+def test_data_factory_data_object():
+    data = _data_factory(_Data({0: {"a": 1}}))
+    expected_data = _Data({0: {"a": 1}})
+    assert data == expected_data
 
 
 if __name__ == "__main__":  # pragma: no cover
     pytest.main()
-
-    # return [[np.array([0.3, 5.0, 0.34]), 'd', 3], [np.array(
-    #     [0.23, 5.0, 0.0]), 'f', 4], [np.array([0.3, 5.0, 0.2]), 'c', 0]]

From 6d9938157b7522b7eb7b0dd801dc273acb127be2 Mon Sep 17 00:00:00 2001
From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com>
Date: Tue, 25 Jun 2024 11:26:30 +0200
Subject: [PATCH 10/17] Fix data indexing issue and add column renaming
 functionality

---
 .../_experimental/_jobqueue2.py               |   4 +-
 .../experimentdata/_experimental/_newdata2.py |  30 +-
 .../_experimental/_newexperimentdata2.py      |  20 +-
 tests/newdata/experimentdata/__init__.py      |   0
 tests/newdata/experimentdata/conftest.py      | 130 +++
 .../newdata/experimentdata/test__jobqueue.py  |  43 +
 .../experimentdata/test_experimentdata.py     | 737 ++++++++++++++++++
 tests/newdata/test_data.py                    |  11 +-
 8 files changed, 956 insertions(+), 19 deletions(-)
 create mode 100644 tests/newdata/experimentdata/__init__.py
 create mode 100644 tests/newdata/experimentdata/conftest.py
 create mode 100644 tests/newdata/experimentdata/test__jobqueue.py
 create mode 100644 tests/newdata/experimentdata/test_experimentdata.py

diff --git a/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py b/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py
index 3c88308b..82721ace 100644
--- a/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py
+++ b/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py
@@ -73,8 +73,8 @@ def __add__(self, __o: Index | str) -> Index:
 
         # Make a copy of other.jobs and modify its index
         other_jobs_copy = deepcopy(__o)
-        other_jobs_copy.jobs.index = range(
-            len(other_jobs_copy)) + self.jobs.index[-1] + 1
+        other_jobs_copy.jobs.index = pd.Index(range(
+            len(other_jobs_copy))) + self.jobs.index[-1] + 1
 
         return Index(pd.concat([self.jobs, other_jobs_copy.jobs]))
 
diff --git a/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py b/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py
index 4bff29cd..26df0982 100644
--- a/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py
+++ b/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py
@@ -148,8 +148,9 @@ def __repr__(self) -> str:
     # Properties
     # =============================================================================
 
+
     @property
-    def indices(self) -> List[int]:
+    def indices(self) -> pd.Index:
         """
         Get the indices of the data.
 
@@ -158,7 +159,7 @@
-        List[int]
-            The list of indices.
+        pd.Index
+            The indices of the data.
         """
-        return list(self.data.keys())
+        return pd.Index(list(self.data.keys()))
 
     @property
     def names(self) -> List[str]:
@@ -187,8 +188,9 @@ def is_empty(self) -> bool:
 
     # Initialization
     # =============================================================================
 
+
    @classmethod
-    def from_indices(cls, rows: Iterable[int]) -> _Data:
+    def from_indices(cls, rows: Iterable[int] | pd.Index) -> _Data:
        """
        Create a _Data object from a list of indices.
@@ -426,7 +428,7 @@ def n_best_samples(self, nosamples: int, key: str) -> pd.DataFrame:
         df = self.to_dataframe()
         return df.nsmallest(n=nosamples, columns=key)
 
-    def add_column(self, key: str):
+    def add_column(self, key: str, exist_ok: bool = True):
         """
         Add a new column to the data with missing values.
 
@@ -436,8 +438,25 @@
         The key for the new column.
+        exist_ok : bool, optional
+            If False, raise a KeyError when the column already exists.
         """
         for row in self.data:
+            if not exist_ok and key in self.data[row]:
+                raise KeyError(f"Key '{key}' already exists in the data.")
             self.data[row][key] = MISSING_VALUE
 
+    def rename_columns(self, mapping: Dict[str, str]):
+        """
+        Rename columns in the data.
+
+        Parameters
+        ----------
+        mapping : Dict[str, str]
+            The mapping of old to new column names.
+        """
+        for row in self.data:
+            for old_key, new_key in mapping.items():
+                self.data[row][new_key] = self.data[row].pop(old_key)
+
     def remove(self, rows: Iterable[int]):
         """
         Remove specific rows from the data.
@@ -513,7 +532,8 @@ def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data:
 # =============================================================================
 
 
-def _data_factory(data: DataTypes) -> _Data:
+def _data_factory(data: DataTypes,
+                  keys: Optional[Iterable[str]] = None) -> _Data:
 
     if data is None:
         return _Data()
@@ -527,7 +547,7 @@
         return _Data.from_file(Path(data))
 
     elif isinstance(data, np.ndarray):
-        return _Data.from_numpy(data)
+        return _Data.from_numpy(data, keys=keys)
 
     else:
         raise TypeError(
diff --git a/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py b/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py
index 0a2cc770..7851f30f 100644
--- a/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py
+++ b/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py
@@ -110,8 +110,12 @@ def __init__(self,
 
         self.project_dir = _project_dir_factory(project_dir)
 
-        self._input_data = _data_factory(input_data)
-        self._output_data = _data_factory(output_data)
+        if isinstance(input_data, np.ndarray) and isinstance(domain, Domain):
+            self._input_data = _data_factory(input_data, domain.names)
+            self._output_data = _data_factory(output_data, domain.output_names)
+        else:
+            self._input_data = _data_factory(input_data)
+            self._output_data = _data_factory(output_data)
 
         # Create empty output_data from indices if output_data is empty
         if self._output_data.is_empty():
@@ -134,9 +138,8 @@ def __init__(self,
 
         # For backwards compatibility; if the output_data has
         # only one column, rename it to 'y'
-        # TODO: Fix this for newdata2
         if self._output_data.names == [0]:
-            self._output_data.columns.set_columnnames(['y'])
+            self._output_data.rename_columns({0: 'y'})
 
     def __len__(self):
         """The len() method returns the number of datapoints"""
@@ -944,7 +947,7 @@ def _set_experiment_sample(self,
             self._output_data.set_data(
                 row=experiment_sample.job_number,
                 value=value,
-                column=column)
+                key=column)
 
         self._jobs.mark(experiment_sample._jobnumber,
                        status=Status.FINISHED)
@@ -997,11 +1000,10 @@ def _set_error(self, index: int) -> None:
         index
             index of the experiment_sample to mark as error
         """
-        # self.jobs.mark_as_error(index)
         self._jobs.mark(index, status=Status.ERROR)
-        self._output_data.set_data(
-            index,
-            value=['ERROR' for _ in self._output_data.names])
+        for column in self._output_data.names:
+            self._output_data.set_data(
+                index, value='ERROR', key=column)
 
     @_access_file
     def _write_error(self, index: int):
diff --git a/tests/newdata/experimentdata/__init__.py
b/tests/newdata/experimentdata/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/newdata/experimentdata/conftest.py b/tests/newdata/experimentdata/conftest.py new file mode 100644 index 00000000..68189f88 --- /dev/null +++ b/tests/newdata/experimentdata/conftest.py @@ -0,0 +1,130 @@ +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest +import xarray as xr + +from f3dasm._src.design.parameter import (_CategoricalParameter, + _ContinuousParameter, + _DiscreteParameter) +from f3dasm._src.experimentdata._experimental._newexperimentdata2 import \ + ExperimentData +from f3dasm.design import Domain, make_nd_continuous_domain + +SEED = 42 + + +@pytest.fixture(scope="package") +def seed() -> int: + return SEED + + +@pytest.fixture(scope="package") +def domain() -> Domain: + + space = { + 'x1': _ContinuousParameter(-5.12, 5.12), + 'x2': _DiscreteParameter(-3, 3), + 'x3': _CategoricalParameter(["red", "green", "blue"]) + } + + return Domain(space=space) + + +@pytest.fixture(scope="package") +def domain_continuous() -> Domain: + return make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), dimensionality=3) + + +@pytest.fixture(scope="package") +def experimentdata(domain: Domain) -> ExperimentData: + e_data = ExperimentData(domain) + e_data.sample(sampler='random', n_samples=10, seed=SEED) + return e_data + + +@pytest.fixture(scope="package") +def experimentdata2(domain: Domain) -> ExperimentData: + return ExperimentData.from_sampling(sampler='random', domain=domain, n_samples=10, seed=SEED) + + +@pytest.fixture(scope="package") +def experimentdata_continuous(domain_continuous: Domain) -> ExperimentData: + return ExperimentData.from_sampling(sampler='random', domain=domain_continuous, n_samples=10, seed=SEED) + + +@pytest.fixture(scope="package") +def experimentdata_expected() -> ExperimentData: + domain_continuous = make_nd_continuous_domain( + bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), dimensionality=3) + data = ExperimentData.from_sampling( + sampler='random', domain=domain_continuous, n_samples=10, seed=SEED) + for es, output in zip(data, np.zeros((10, 1))): + es.store(name='y', object=float(output)) + data._set_experiment_sample(es) + data.add(input_data=np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]]), + output_data=np.array([[0.0], [0.0]]), domain=data.domain) + + # data._input_data.data = data._input_data.data.round(6) + return data + + +@pytest.fixture(scope="package") +def experimentdata_expected_no_output() -> ExperimentData: + domain_continuous = make_nd_continuous_domain( + bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), dimensionality=3) + data = ExperimentData.from_sampling( + sampler='random', domain=domain_continuous, n_samples=10, seed=SEED) + data.add(input_data=np.array( + [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]]), domain=domain_continuous) + + # data._input_data.data = data._input_data.data.round(6) + + return data + + +@pytest.fixture(scope="package") +def experimentdata_expected_only_domain() -> ExperimentData: + domain_continuous = make_nd_continuous_domain( + bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), dimensionality=3) + return ExperimentData(domain=domain_continuous) + + +@pytest.fixture(scope="package") +def numpy_array(domain_continuous: Domain) -> np.ndarray: + rng = np.random.default_rng(SEED) + return rng.random((10, len(domain_continuous))) + + +@pytest.fixture(scope="package") +def numpy_output_array(domain_continuous: Domain) -> np.ndarray: + return np.zeros((10, 1)) + + 
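# A note on these fixtures (illustrative sketch, not part of the patch):
# every random fixture in this conftest draws from np.random.default_rng(SEED)
# with the same package-scoped seed, so the sampled ExperimentData, the numpy
# array above and the xarray/pandas fixtures below are expected to describe
# the same 10-row sample, which is what lets the exporter tests compare the
# representations directly. The reproducibility property this relies on:
#
#     import numpy as np
#     a = np.random.default_rng(42).random((10, 3))
#     b = np.random.default_rng(42).random((10, 3))
#     assert np.array_equal(a, b)  # a fixed seed reproduces the same draws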
+@pytest.fixture(scope="package") +def xarray_dataset(domain_continuous: Domain) -> xr.Dataset: + rng = np.random.default_rng(SEED) + # np.random.seed(SEED) + input_data = rng.random((10, len(domain_continuous))) + input_names = domain_continuous.names + + output_data = pd.DataFrame() + output_names = output_data.columns.to_list() + + return xr.Dataset({'input': xr.DataArray(input_data, dims=['iterations', 'input_dim'], coords={ + 'iterations': range(len(input_data)), 'input_dim': input_names}), + 'output': xr.DataArray(output_data, dims=['iterations', 'output_dim'], coords={ + 'iterations': range(len(output_data)), 'output_dim': output_names})}) + + +@pytest.fixture(scope="package") +def pandas_dataframe(domain_continuous: Domain) -> pd.DataFrame: + # np.random.seed(SEED) + rng = np.random.default_rng(SEED) + return pd.DataFrame(rng.random((10, len(domain_continuous))), columns=domain_continuous.names) + + +@pytest.fixture(scope="package") +def continuous_parameter() -> _ContinuousParameter: + return _ContinuousParameter(lower_bound=0., upper_bound=1.) diff --git a/tests/newdata/experimentdata/test__jobqueue.py b/tests/newdata/experimentdata/test__jobqueue.py new file mode 100644 index 00000000..52010733 --- /dev/null +++ b/tests/newdata/experimentdata/test__jobqueue.py @@ -0,0 +1,43 @@ +import pandas as pd + +from f3dasm._src.experimentdata._experimental._jobqueue2 import \ + Index as _JobQueue + +# from f3dasm._src.experimentdata._jobqueue import _JobQueue + + +def test_select_all_with_matching_status(): + # Create a job queue with some jobs + job_queue = _JobQueue() + job_queue.jobs = pd.Series( + ['in progress', 'running', 'completed', 'in progress', 'failed']) + + # Select all jobs with status 'in progress' + selected_jobs = job_queue.select_all('in progress') + + # Check if the selected jobs match the expected result + assert (selected_jobs.jobs == ['in progress', 'in progress']).all() + + +def test_select_all_with_no_matching_status(): + # Create a job queue with some jobs + job_queue = _JobQueue() + job_queue.jobs = pd.Series( + ['in progress', 'running', 'completed', 'in progress', 'failed']) + + # Select all jobs with status 'cancelled' + selected_jobs = job_queue.select_all('cancelled') + + # Check if the selected jobs match the expected result + assert selected_jobs.jobs.empty + + +def test_select_all_with_empty_job_queue(): + # Create an empty job queue + job_queue = _JobQueue() + + # Select all jobs with status 'in progress' + selected_jobs = job_queue.select_all('in progress') + + # Check if the selected jobs match the expected result + assert selected_jobs.jobs.empty diff --git a/tests/newdata/experimentdata/test_experimentdata.py b/tests/newdata/experimentdata/test_experimentdata.py new file mode 100644 index 00000000..026945e6 --- /dev/null +++ b/tests/newdata/experimentdata/test_experimentdata.py @@ -0,0 +1,737 @@ +from __future__ import annotations + +import csv +import pickle +from pathlib import Path +from typing import Iterable + +import numpy as np +import pandas as pd +import pytest +import xarray as xr + +from f3dasm import ExperimentSample +from f3dasm._src.design.parameter import _ContinuousParameter +from f3dasm._src.experimentdata._experimental._jobqueue2 import \ + Index as _JobQueue +from f3dasm._src.experimentdata._experimental._newdata2 import DataTypes, _Data +from f3dasm._src.experimentdata._experimental._newexperimentdata2 import \ + ExperimentData +from f3dasm.design import Domain, Status, make_nd_continuous_domain + +pytestmark = pytest.mark.smoke + 
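# A note on the module preamble (illustrative, not part of the patch):
# assigning `pytestmark` at module level applies the marker to every test in
# this file, so the whole suite can be selected or deselected in one go.
# Assuming the `smoke` marker is registered in the project's pytest
# configuration, typical invocations would be:
#
#     pytest tests/newdata/experimentdata -m smoke          # only these tests
#     pytest tests/newdata/experimentdata -m "not smoke"    # everything else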
+SEED = 42 + + +def test_check_experimentdata(experimentdata: ExperimentData): + assert isinstance(experimentdata, ExperimentData) + +# Write test functions + + +def test_experiment_data_init(experimentdata: ExperimentData, domain: Domain): + assert experimentdata.domain == domain + assert experimentdata.project_dir == Path.cwd() + # Add more assertions as needed + + +def test_experiment_data_add(experimentdata: ExperimentData, + experimentdata2: ExperimentData, domain: Domain): + experimentdata_total = ExperimentData(domain) + experimentdata_total.add_experiments(experimentdata) + experimentdata_total.add_experiments(experimentdata2) + assert experimentdata_total == experimentdata + experimentdata2 + + +def test_experiment_data_len_empty(domain: Domain): + experiment_data = ExperimentData(domain) + assert len(experiment_data) == 0 # Update with the expected length + + +def test_experiment_data_len_equals_input_data(experimentdata: ExperimentData): + assert len(experimentdata) == len(experimentdata._input_data) + + +@pytest.mark.parametrize("slice_type", [3, [0, 1, 3]]) +def test_experiment_data_select(slice_type: int | Iterable[int], experimentdata: ExperimentData): + input_data = experimentdata._input_data[slice_type] + output_data = experimentdata._output_data[slice_type] + jobs = experimentdata._jobs[slice_type] + constructed_experimentdata = ExperimentData( + input_data=input_data, output_data=output_data, jobs=jobs, domain=experimentdata.domain) + assert constructed_experimentdata == experimentdata.select(slice_type) + +# Constructors +# ====================================================================================== + + +def test_from_file(experimentdata_continuous: ExperimentData, seed: int, tmp_path: Path): + # experimentdata_continuous.filename = tmp_path / 'test001' + experimentdata_continuous.store(tmp_path / 'experimentdata') + + experimentdata_from_file = ExperimentData.from_file( + tmp_path / 'experimentdata') + + # Check if the input_data attribute of ExperimentData matches the expected_data + pd.testing.assert_frame_equal( + experimentdata_continuous._input_data.to_dataframe(), experimentdata_from_file._input_data.to_dataframe(), check_dtype=False, atol=1e-6) + pd.testing.assert_frame_equal(experimentdata_continuous._output_data.to_dataframe(), + experimentdata_from_file._output_data.to_dataframe()) + pd.testing.assert_series_equal( + experimentdata_continuous._jobs.jobs, experimentdata_from_file._jobs.jobs) + # assert experimentdata_continuous.input_data == experimentdata_from_file.input_data + assert experimentdata_continuous._output_data == experimentdata_from_file._output_data + assert experimentdata_continuous.domain == experimentdata_from_file.domain + assert experimentdata_continuous._jobs == experimentdata_from_file._jobs + + +def test_from_file_wrong_name(experimentdata_continuous: ExperimentData, seed: int, tmp_path: Path): + experimentdata_continuous.filename = tmp_path / 'test001' + experimentdata_continuous.store() + + with pytest.raises(FileNotFoundError): + _ = ExperimentData.from_file(tmp_path / 'experimentdata') + + +def test_from_sampling(experimentdata_continuous: ExperimentData, seed: int): + # sampler = RandomUniform(domain=experimentdata_continuous.domain, number_of_samples=10, seed=seed) + experimentdata_from_sampling = ExperimentData.from_sampling(sampler='random', + domain=experimentdata_continuous.domain, + n_samples=10, seed=seed) + assert experimentdata_from_sampling == experimentdata_continuous + + +@pytest.fixture +def 
sample_csv_inputdata(tmp_path): + # Create sample CSV files for testing + input_csv_file = tmp_path / 'experimentdata_data.csv' + + # Create sample input and output dataframes + input_data = pd.DataFrame( + {'input_col1': [1, 2, 3], 'input_col2': [4, 5, 6]}) + + return input_csv_file, input_data + + +@pytest.fixture +def sample_csv_outputdata(tmp_path): + # Create sample CSV files for testing + output_csv_file = tmp_path / 'experimentdata_output.csv' + + # Create sample input and output dataframes + output_data = pd.DataFrame( + {'output_col1': [7, 8, 9], 'output_col2': [10, 11, 12]}) + + return output_csv_file, output_data + + +def test_from_object(experimentdata_continuous: ExperimentData): + input_data = experimentdata_continuous._input_data + output_data = experimentdata_continuous._output_data + jobs = experimentdata_continuous._jobs + domain = experimentdata_continuous.domain + experiment_data = ExperimentData( + input_data=input_data, output_data=output_data, jobs=jobs, domain=domain) + assert experiment_data == ExperimentData( + input_data=input_data, output_data=output_data, jobs=jobs, domain=domain) + assert experiment_data == experimentdata_continuous + +# Exporters +# ====================================================================================== + + +def test_to_numpy(experimentdata_continuous: ExperimentData, numpy_array: np.ndarray): + x, y = experimentdata_continuous.to_numpy() + + # cast x to floats + x = x.astype(float) + # assert if x and numpy_array have all the same values + assert np.allclose(x, numpy_array) + + +def test_to_xarray(experimentdata_continuous: ExperimentData, xarray_dataset: xr.Dataset): + exported_dataset = experimentdata_continuous.to_xarray() + # assert if xr_dataset is equal to xarray + assert exported_dataset.equals(xarray_dataset) + + +def test_to_pandas(experimentdata_continuous: ExperimentData, pandas_dataframe: pd.DataFrame): + exported_dataframe, _ = experimentdata_continuous.to_pandas() + # assert if pandas_dataframe is equal to exported_dataframe + pd.testing.assert_frame_equal( + exported_dataframe, pandas_dataframe, atol=1e-6, check_dtype=False) +# Exporters +# ====================================================================================== + + +def test_add_new_input_column(experimentdata: ExperimentData, + continuous_parameter: _ContinuousParameter): + kwargs = {'low': continuous_parameter.lower_bound, + 'high': continuous_parameter.upper_bound} + experimentdata.add_input_parameter( + name='test', type='float', **kwargs) + assert 'test' in experimentdata._input_data.names + + +def test_add_new_output_column(experimentdata: ExperimentData): + experimentdata.add_output_parameter(name='test', is_disk=False) + assert 'test' in experimentdata._output_data.names + + +def test_set_error(experimentdata_continuous: ExperimentData): + experimentdata_continuous._set_error(3) + assert experimentdata_continuous._jobs.jobs[3] == Status.ERROR + + +# Helper function to create a temporary CSV file with sample data +def create_sample_csv_input(file_path): + data = [ + ["x0", "x1", "x2"], + [0.77395605, 0.43887844, 0.85859792], + [0.69736803, 0.09417735, 0.97562235], + [0.7611397, 0.78606431, 0.12811363], + [0.45038594, 0.37079802, 0.92676499], + [0.64386512, 0.82276161, 0.4434142], + [0.22723872, 0.55458479, 0.06381726], + [0.82763117, 0.6316644, 0.75808774], + [0.35452597, 0.97069802, 0.89312112], + [0.7783835, 0.19463871, 0.466721], + [0.04380377, 0.15428949, 0.68304895], + [0.000000, 0.000000, 0.000000], + [1.000000, 1.000000, 1.000000], + 
] + with open(file_path, mode='w', newline='') as file: + writer = csv.writer(file) + writer.writerows(data) + + +def create_sample_csv_output(file_path): + data = [ + ["y"], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + + ] + with open(file_path, mode='w', newline='') as file: + writer = csv.writer(file) + writer.writerows(data) + +# Pytest fixture to create a temporary CSV file + + +def create_domain_pickle(filepath): + domain = make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), + dimensionality=3) + domain.store(filepath) + + +def create_jobs_pickle_finished(filepath): + domain = make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), + dimensionality=3) + + _data_input = _Data.from_dataframe(pd_input()) + _data_output = _Data.from_dataframe(pd_output()) + experimentdata = ExperimentData( + domain=domain, input_data=_data_input, output_data=_data_output) + experimentdata._jobs.store(filepath) + + +def create_jobs_pickle_open(filepath): + domain = make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), + dimensionality=3) + + _data_input = _Data.from_dataframe(pd_input()) + experimentdata = ExperimentData(domain=domain, input_data=_data_input) + experimentdata._jobs.store(filepath) + + +def path_domain(tmp_path): + domain_file_path = tmp_path / "test_domain.pkl" + create_domain_pickle(domain_file_path) + return domain_file_path + + +def str_domain(tmp_path): + domain_file_path = tmp_path / "test_domain.pkl" + create_domain_pickle(domain_file_path) + return str(domain_file_path) + + +def path_jobs_finished(tmp_path): + jobs_file_path = tmp_path / "test_jobs.pkl" + create_jobs_pickle_finished(jobs_file_path) + return jobs_file_path + + +def str_jobs_finished(tmp_path): + jobs_file_path = tmp_path / "test_jobs.pkl" + create_jobs_pickle_finished(jobs_file_path) + return str(jobs_file_path) + + +def path_jobs_open(tmp_path): + jobs_file_path = tmp_path / "test_jobs.pkl" + create_jobs_pickle_open(jobs_file_path) + return jobs_file_path + + +def str_jobs_open(tmp_path): + jobs_file_path = tmp_path / "test_jobs.pkl" + create_jobs_pickle_open(jobs_file_path) + return str(jobs_file_path) + + +def path_input(tmp_path): + csv_file_path = tmp_path / "test_input.csv" + create_sample_csv_input(csv_file_path) + return csv_file_path + + +def str_input(tmp_path): + csv_file_path = tmp_path / "test_input.csv" + create_sample_csv_input(csv_file_path) + return str(csv_file_path) + + +def path_output(tmp_path: Path): + csv_file_path = tmp_path / "test_output.csv" + create_sample_csv_output(csv_file_path) + return csv_file_path + + +def str_output(tmp_path: Path): + csv_file_path = tmp_path / "test_output.csv" + create_sample_csv_output(csv_file_path) + return str(csv_file_path) + +# Pytest test function for reading and monkeypatching a CSV file + + +def numpy_input(): + return np.array([ + [0.77395605, 0.43887844, 0.85859792], + [0.69736803, 0.09417735, 0.97562235], + [0.7611397, 0.78606431, 0.12811363], + [0.45038594, 0.37079802, 0.92676499], + [0.64386512, 0.82276161, 0.4434142], + [0.22723872, 0.55458479, 0.06381726], + [0.82763117, 0.6316644, 0.75808774], + [0.35452597, 0.97069802, 0.89312112], + [0.7783835, 0.19463871, 0.466721], + [0.04380377, 0.15428949, 0.68304895], + [0.000000, 0.000000, 0.000000], + [1.000000, 1.000000, 1.000000], + ]) + + +def numpy_output(): + return np.array([ + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + 
[0.0], + [0.0], + + ]) + + +def pd_input(): + return pd.DataFrame([ + [0.77395605, 0.43887844, 0.85859792], + [0.69736803, 0.09417735, 0.97562235], + [0.7611397, 0.78606431, 0.12811363], + [0.45038594, 0.37079802, 0.92676499], + [0.64386512, 0.82276161, 0.4434142], + [0.22723872, 0.55458479, 0.06381726], + [0.82763117, 0.6316644, 0.75808774], + [0.35452597, 0.97069802, 0.89312112], + [0.7783835, 0.19463871, 0.466721], + [0.04380377, 0.15428949, 0.68304895], + [0.000000, 0.000000, 0.000000], + [1.000000, 1.000000, 1.000000], + ], columns=["x0", "x1", "x2"]) + + +def pd_output(): + return pd.DataFrame([ + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + + ], columns=["y"]) + + +def data_input(): + return _Data.from_dataframe(pd_input()) + + +def data_output(): + return _Data.from_dataframe(pd_output()) + + +@pytest.mark.parametrize("input_data", [path_input, str_input, pd_input(), data_input(), numpy_input()]) +@pytest.mark.parametrize("output_data", [path_output, str_output, pd_output(), data_output()]) +@pytest.mark.parametrize("domain", [make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), + dimensionality=3), None, path_domain, str_domain]) +@pytest.mark.parametrize("jobs", [None, path_jobs_finished, str_jobs_finished]) +def test_init_with_output(input_data: DataTypes, output_data: DataTypes, domain: Domain | str | Path | None, + jobs: _JobQueue | str | Path | None, + experimentdata_expected: ExperimentData, monkeypatch, tmp_path: Path): + + # if input_data is Callable + if callable(input_data): + input_data = input_data(tmp_path) + expected_data_input = pd.read_csv(input_data) + + # if output_data is Callable + if callable(output_data): + output_data = output_data(tmp_path) + expected_data_output = pd.read_csv(output_data) + + if callable(domain): + domain = domain(tmp_path) + expected_domain = Domain.from_file(domain) + + if callable(jobs): + jobs = jobs(tmp_path) + expected_jobs = _JobQueue.from_file(jobs).jobs + + # monkeypatch pd.read_csv to return the expected_data DataFrame + def mock_read_csv(*args, **kwargs): + + path = args[0] + if isinstance(args[0], str): + path = Path(path) + + if path == tmp_path / "test_input.csv": + return expected_data_input + + elif path == tmp_path / "test_output.csv": + return expected_data_output + + else: + raise ValueError("Unexpected file path") + + def mock_load_pickle(*args, **kwargs): + return expected_domain + + def mock_pd_read_pickle(*args, **kwargs): + path = args[0] + + if isinstance(path, str): + path = Path(path) + + if path == tmp_path / "test_jobs.pkl": + return expected_jobs + + else: + raise ValueError("Unexpected jobs file path") + + monkeypatch.setattr(pd, "read_csv", mock_read_csv) + monkeypatch.setattr(pickle, "load", mock_load_pickle) + monkeypatch.setattr(pd, "read_pickle", mock_pd_read_pickle) + + if isinstance(input_data, np.ndarray) and domain is None: + with pytest.raises(ValueError): + ExperimentData(domain=domain, input_data=input_data, + output_data=output_data, jobs=jobs) + return + # Initialize ExperimentData with the CSV file + experiment_data = ExperimentData(domain=domain, input_data=input_data, + output_data=output_data, jobs=jobs) + + # Check if the input_data attribute of ExperimentData matches the expected_data + pd.testing.assert_frame_equal( + experiment_data._input_data.to_dataframe(), experimentdata_expected._input_data.to_dataframe(), check_dtype=False, atol=1e-6) + 
pd.testing.assert_frame_equal(experiment_data._output_data.to_dataframe(), + experimentdata_expected._output_data.to_dataframe(), check_dtype=False) + + +@pytest.mark.parametrize("input_data", [pd_input(), path_input, str_input, data_input(), numpy_input()]) +@pytest.mark.parametrize("output_data", [None]) +@pytest.mark.parametrize("domain", [make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), + dimensionality=3), None, path_domain, str_domain]) +@pytest.mark.parametrize("jobs", [None, path_jobs_open, str_jobs_open]) +def test_init_without_output(input_data: DataTypes, output_data: DataTypes, domain: Domain, jobs: _JobQueue, + experimentdata_expected_no_output: ExperimentData, monkeypatch, tmp_path): + + # if input_data is Callable + if callable(input_data): + input_data = input_data(tmp_path) + expected_data_input = pd.read_csv(input_data) + + # if output_data is Callable + if callable(output_data): + output_data = output_data(tmp_path) + expected_data_output = pd.read_csv(output_data) + + if callable(domain): + domain = domain(tmp_path) + expected_domain = Domain.from_file(domain) + + if callable(jobs): + jobs = jobs(tmp_path) + expected_jobs = _JobQueue.from_file(jobs).jobs + + # monkeypatch pd.read_csv to return the expected_data DataFrame + def mock_read_csv(*args, **kwargs): + + path = args[0] + if isinstance(args[0], str): + path = Path(path) + + if path == tmp_path / "test_input.csv": + return expected_data_input + + elif path == tmp_path / "test_output.csv": + return expected_data_output + + else: + raise ValueError("Unexpected file path") + + def mock_load_pickle(*args, **kwargs): + return expected_domain + + def mock_pd_read_pickle(*args, **kwargs): + path = args[0] + + if isinstance(path, str): + path = Path(path) + + if path == tmp_path / "test_jobs.pkl": + return expected_jobs + + monkeypatch.setattr(pd, "read_csv", mock_read_csv) + monkeypatch.setattr(pickle, "load", mock_load_pickle) + monkeypatch.setattr(pd, "read_pickle", mock_pd_read_pickle) + + if isinstance(input_data, np.ndarray) and domain is None: + with pytest.raises(ValueError): + ExperimentData(domain=domain, input_data=input_data, + output_data=output_data, jobs=jobs) + return + + # Initialize ExperimentData with the CSV file + experiment_data = ExperimentData(domain=domain, input_data=input_data, + output_data=output_data, jobs=jobs) + + # Check if the input_data attribute of ExperimentData matches the expected_data + pd.testing.assert_frame_equal( + experiment_data._input_data.to_dataframe(), experimentdata_expected_no_output._input_data.to_dataframe(), atol=1e-6, check_dtype=False) + pd.testing.assert_frame_equal(experiment_data._output_data.to_dataframe(), + experimentdata_expected_no_output._output_data.to_dataframe()) + pd.testing.assert_series_equal( + experiment_data._jobs.jobs, experimentdata_expected_no_output._jobs.jobs) + # assert experiment_data.domain == experimentdata_expected_no_output.domain + assert experiment_data._jobs == experimentdata_expected_no_output._jobs + + +@pytest.mark.parametrize("input_data", [None]) +@pytest.mark.parametrize("output_data", [None]) +@pytest.mark.parametrize("domain", [make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), + dimensionality=3), path_domain, str_domain]) +def test_init_only_domain(input_data: DataTypes, output_data: DataTypes, domain: Domain | str | Path, + experimentdata_expected_only_domain: ExperimentData, + monkeypatch, tmp_path): + + # if input_data is Callable + if callable(input_data): + input_data = 
input_data(tmp_path) + expected_data_input = pd.read_csv(input_data) + + # if output_data is Callable + if callable(output_data): + output_data = output_data(tmp_path) + expected_data_output = pd.read_csv(output_data) + + if callable(domain): + domain = domain(tmp_path) + expected_domain = Domain.from_file(domain) + + # monkeypatch pd.read_csv to return the expected_data DataFrame + def mock_read_csv(*args, **kwargs): + + path = args[0] + if isinstance(args[0], str): + path = Path(path) + + if path == tmp_path / "test_input.csv": + return expected_data_input + + elif path == tmp_path / "test_output.csv": + return expected_data_output + + else: + raise ValueError("Unexpected file path") + + def mock_load_pickle(*args, **kwargs): + return expected_domain + + monkeypatch.setattr(pd, "read_csv", mock_read_csv) + monkeypatch.setattr(pickle, "load", mock_load_pickle) + + # Initialize ExperimentData with the CSV file + experiment_data = ExperimentData(domain=domain, input_data=input_data, + output_data=output_data) + + # Check if the input_data attribute of ExperimentData matches the expected_data + pd.testing.assert_frame_equal( + experiment_data._input_data.to_dataframe(), experimentdata_expected_only_domain._input_data.to_dataframe(), check_dtype=False) + pd.testing.assert_frame_equal(experiment_data._output_data.to_dataframe(), + experimentdata_expected_only_domain._output_data.to_dataframe(), check_dtype=False) + assert experiment_data._input_data == experimentdata_expected_only_domain._input_data + assert experiment_data._output_data == experimentdata_expected_only_domain._output_data + assert experiment_data.domain == experimentdata_expected_only_domain.domain + assert experiment_data._jobs == experimentdata_expected_only_domain._jobs + + assert experiment_data == experimentdata_expected_only_domain + + +@pytest.mark.parametrize("input_data", [[0.1, 0.2], {"a": 0.1, "b": 0.2}, 0.2, 2]) +def test_invalid_type(input_data): + with pytest.raises(TypeError): + ExperimentData(input_data=input_data) + + +def test_add_invalid_type(experimentdata: ExperimentData): + with pytest.raises(TypeError): + experimentdata + 1 + + +def test_add_two_different_domains(experimentdata: ExperimentData, experimentdata_continuous: ExperimentData): + with pytest.raises(ValueError): + experimentdata + experimentdata_continuous + + +def test_repr_html(experimentdata: ExperimentData, monkeypatch): + assert isinstance(experimentdata._repr_html_(), str) + + +def test_store(experimentdata: ExperimentData, tmp_path: Path): + experimentdata.store(tmp_path / "test") + assert (tmp_path / "test" / "experiment_data" / "input.csv").exists() + assert (tmp_path / "test" / "experiment_data" / "output.csv").exists() + assert (tmp_path / "test" / "experiment_data" / "domain.pkl").exists() + assert (tmp_path / "test" / "experiment_data" / "jobs.pkl").exists() + + +def test_store_give_no_filename(experimentdata: ExperimentData, tmp_path: Path): + experimentdata.set_project_dir(tmp_path / 'test2') + experimentdata.store() + assert (tmp_path / "test2" / "experiment_data" / "input.csv").exists() + assert (tmp_path / "test2" / "experiment_data" / "output.csv").exists() + assert (tmp_path / "test2" / "experiment_data" / "domain.pkl").exists() + assert (tmp_path / "test2" / "experiment_data" / "jobs.pkl").exists() + + +@pytest.mark.parametrize("mode", ["sequential", "parallel", "typo"]) +def test_evaluate_mode(mode: str, experimentdata_continuous: ExperimentData, tmp_path: Path): + experimentdata_continuous.filename = tmp_path / 'test009' + 
+ if mode == "typo": + with pytest.raises(ValueError): + experimentdata_continuous.evaluate("ackley", mode=mode, kwargs={ + "scale_bounds": np.array([[0., 1.], [0., 1.], [0., 1.]]), 'seed': SEED}) + else: + experimentdata_continuous.evaluate("ackley", mode=mode, kwargs={ + "scale_bounds": np.array([[0., 1.], [0., 1.], [0., 1.]]), 'seed': SEED}) + + +def test_get_input_data(experimentdata_expected_no_output: ExperimentData): + input_data = experimentdata_expected_no_output.get_input_data() + df, _ = input_data.to_pandas() + pd.testing.assert_frame_equal(df, pd_input(), check_dtype=False, atol=1e-6) + assert experimentdata_expected_no_output._input_data == input_data._input_data + + +@pytest.mark.parametrize("selection", ["x0", ["x0"], ["x0", "x2"]]) +def test_get_input_data_selection(experimentdata_expected_no_output: ExperimentData, selection: Iterable[str] | str): + input_data = experimentdata_expected_no_output.get_input_data(selection) + df, _ = input_data.to_pandas() + if isinstance(selection, str): + selection = [selection] + selected_pd = pd_input()[selection] + pd.testing.assert_frame_equal( + df, selected_pd, check_dtype=False, atol=1e-6) + + +def test_get_output_data(experimentdata_expected: ExperimentData): + output_data = experimentdata_expected.get_output_data() + _, df = output_data.to_pandas() + pd.testing.assert_frame_equal(df, pd_output(), check_dtype=False) + assert experimentdata_expected._output_data == output_data._output_data + + +@pytest.mark.parametrize("selection", ["y", ["y"]]) +def test_get_output_data_selection(experimentdata_expected: ExperimentData, selection: Iterable[str] | str): + output_data = experimentdata_expected.get_output_data(selection) + _, df = output_data.to_pandas() + if isinstance(selection, str): + selection = [selection] + selected_pd = pd_output()[selection] + pd.testing.assert_frame_equal(df, selected_pd, check_dtype=False) + + +def test_iter_behaviour(experimentdata_continuous: ExperimentData): + for i in experimentdata_continuous: + assert isinstance(i, ExperimentSample) + + selected_experimentdata = experimentdata_continuous.select([0, 2, 4]) + for i in selected_experimentdata: + assert isinstance(i, ExperimentSample) + + +def test_select_with_status_open(experimentdata: ExperimentData): + selected_data = experimentdata.select_with_status('open') + assert all(job == Status.OPEN for job in selected_data._jobs.jobs) + + +def test_select_with_status_in_progress(experimentdata: ExperimentData): + selected_data = experimentdata.select_with_status('in progress') + assert all(job == Status.IN_PROGRESS for job in selected_data._jobs.jobs) + + +def test_select_with_status_finished(experimentdata: ExperimentData): + selected_data = experimentdata.select_with_status('finished') + assert all(job == Status.FINISHED for job in selected_data._jobs.jobs) + + +def test_select_with_status_error(experimentdata: ExperimentData): + selected_data = experimentdata.select_with_status('error') + assert all(job == Status.ERROR for job in selected_data._jobs.jobs) + + +def test_select_with_status_invalid_status(experimentdata: ExperimentData): + with pytest.raises(ValueError): + _ = experimentdata.select_with_status('invalid_status') + + +if __name__ == "__main__": # pragma: no cover + pytest.main() diff --git a/tests/newdata/test_data.py b/tests/newdata/test_data.py index fb5f0cba..6c5abe52 100644 --- a/tests/newdata/test_data.py +++ b/tests/newdata/test_data.py @@ -1,5 +1,4 @@ -from copy import deepcopy -from typing import Any, Dict, List +from typing import Any, 
Dict import numpy as np import pandas as pd @@ -175,6 +174,14 @@ def test_select_columns_single(): assert selected_data.data == expected_data +def test_rename_columns(): + input_data = {0: {"a": 1, "b": 2, "c": 3}, 1: {"a": 4, "b": 5, "c": 6}} + data = _Data(input_data) + data.rename_columns({"a": "x", "b": "y"}) + expected_data = {0: {"x": 1, "y": 2, "c": 3}, 1: {"x": 4, "y": 5, "c": 6}} + assert data.data == expected_data + + def test_drop(): input_data = {0: {"a": 1, "b": 2, "c": 3}, 1: {"a": 4, "b": 5, "c": 6}} data = _Data(input_data) From 74fd3154fe3a470eec433fa2c011a87f5bd6b074 Mon Sep 17 00:00:00 2001 From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com> Date: Tue, 25 Jun 2024 15:57:42 +0200 Subject: [PATCH 11/17] Refactor domain initialization and data loading --- src/f3dasm/_src/design/domain.py | 47 ++- src/f3dasm/_src/experimentdata/_data.py | 10 +- .../_experimental/_jobqueue2.py | 267 ++++++++++++------ .../experimentdata/_experimental/_newdata2.py | 5 +- .../_experimental/_newexperimentdata2.py | 22 +- .../_src/experimentdata/experimentdata.py | 22 +- tests/newdata/test_data.py | 2 +- 7 files changed, 257 insertions(+), 118 deletions(-) diff --git a/src/f3dasm/_src/design/domain.py b/src/f3dasm/_src/design/domain.py index 172d9ce1..9c31b9df 100644 --- a/src/f3dasm/_src/design/domain.py +++ b/src/f3dasm/_src/design/domain.py @@ -14,7 +14,7 @@ from dataclasses import dataclass, field from pathlib import Path from typing import (Any, Dict, Iterable, Iterator, List, Literal, Optional, - Sequence, Type) + Protocol, Sequence, Type) # Third-party core import numpy as np @@ -36,6 +36,13 @@ # ============================================================================= +class _Data(Protocol): + def to_dataframe(self) -> pd.DataFrame: + ... + +# ============================================================================= + + @dataclass class Domain: """Main class for defining the domain of the design of experiments. @@ -238,6 +245,26 @@ def from_dataframe(cls, df_input: pd.DataFrame, return cls(space=input_space, output_space=output_space) + @classmethod + def from_data(cls: Type[Domain], + input_data: _Data, output_data: _Data) -> Domain: + """Initializes a Domain from input and output data. + + Parameters + ---------- + input_data : _Data + Input data. + output_data : _Data + Output data. 
+ + Returns + ------- + Domain + Domain object + """ + return cls.from_dataframe( + input_data.to_dataframe(), output_data.to_dataframe()) + # Export # ============================================================================= @@ -645,9 +672,7 @@ def make_nd_continuous_domain(bounds: np.ndarray | List[List[float]], return Domain(space) -def _domain_factory(domain: Domain | DictConfig | None, - input_data: pd.DataFrame, - output_data: pd.DataFrame) -> Domain: +def _domain_factory(domain: Domain | DictConfig | str | Path) -> Domain: if isinstance(domain, Domain): return domain @@ -657,14 +682,14 @@ def _domain_factory(domain: Domain | DictConfig | None, elif isinstance(domain, DictConfig): return Domain.from_yaml(domain) - elif (input_data.empty and output_data.empty and domain is None): - return Domain() + # elif (input_data.empty and output_data.empty and domain is None): + # return Domain() - elif domain is None: - return Domain.from_dataframe( - input_data, output_data) + # elif domain is None: + # return Domain.from_dataframe( + # input_data, output_data) else: raise TypeError( - f"Domain must be of type Domain, DictConfig " - f"or None, not {type(domain)}") + f"Domain must be of type Domain, DictConfig, str or Path, " + f"not {type(domain)}") diff --git a/src/f3dasm/_src/experimentdata/_data.py b/src/f3dasm/_src/experimentdata/_data.py index 3817cda3..0721396b 100644 --- a/src/f3dasm/_src/experimentdata/_data.py +++ b/src/f3dasm/_src/experimentdata/_data.py @@ -186,7 +186,8 @@ def from_file(cls, filename: Path | str) -> _Data: return cls(df, columns=_Columns(_columns)) @classmethod - def from_numpy(cls: Type[_Data], array: np.ndarray) -> _Data: + def from_numpy(cls: Type[_Data], + array: np.ndarray, keys: Iterable[str]) -> _Data: """Loads the data from a numpy array. Parameters @@ -458,7 +459,8 @@ def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data: return _Data(data=df, columns=_Columns(_columns)) -def _data_factory(data: DataTypes) -> _Data: +def _data_factory(data: DataTypes, + keys: Optional[Iterable[str]] = None) -> _Data: if data is None: return _Data() @@ -469,10 +471,10 @@ def _data_factory(data: DataTypes) -> _Data: return _Data.from_dataframe(data) elif isinstance(data, (Path, str)): - return _Data.from_file(data) + return _Data.from_file(Path(data)) elif isinstance(data, np.ndarray): - return _Data.from_numpy(data) + return _Data.from_numpy(data, keys=keys) else: raise TypeError( diff --git a/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py b/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py index 82721ace..8e10f4ac 100644 --- a/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py +++ b/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py @@ -35,6 +35,8 @@ class Status(str, Enum): def __str__(self) -> str: return self.value +# ============================================================================= + class NoOpenJobsError(Exception): """ @@ -52,6 +54,14 @@ def __init__(self, message): class Index: def __init__(self, jobs: pd.Series | None | str = None): + """ + Initializes the Index object. + + Parameters + ---------- + jobs : pd.Series, None, or str, optional + Series of jobs, None, or a single job as a string. + """ if isinstance(jobs, str): self.jobs = pd.Series(jobs, index=[0], dtype='string') @@ -62,9 +72,30 @@ def __init__(self, jobs: pd.Series | None | str = None): self.jobs = jobs def __len__(self) -> int: + """ + Returns the number of jobs. + + Returns + ------- + int + Number of jobs. 
+ """ return len(self.jobs) def __add__(self, __o: Index | str) -> Index: + """ + Adds another Index or a string to this Index. + + Parameters + ---------- + __o : Index or str + Another Index object or a string representing a job. + + Returns + ------- + Index + A new Index object containing the combined jobs. + """ if isinstance(__o, str): __o = Index(__o) @@ -73,135 +104,212 @@ def __add__(self, __o: Index | str) -> Index: # Make a copy of other.jobs and modify its index other_jobs_copy = deepcopy(__o) - other_jobs_copy.jobs.index = pd.Index(range( - len(other_jobs_copy))) + self.jobs.index[-1] + 1 + other_jobs_copy.jobs.index = pd.Index( + range(len(other_jobs_copy))) + self.jobs.index[-1] + 1 return Index(pd.concat([self.jobs, other_jobs_copy.jobs])) def __getitem__(self, indices: int | slice | Iterable[int]) -> Index: + """ + Gets a subset of jobs by indices. + + Parameters + ---------- + indices : int, slice, or Iterable[int] + Indices to get. + + Returns + ------- + Index + A new Index object containing the selected jobs. + """ if isinstance(indices, int): indices = [indices] return Index(self.jobs[indices].copy()) def __eq__(self, __o: Index) -> bool: + """ + Checks if this Index is equal to another Index. + + Parameters + ---------- + __o : Index + Another Index object to compare. + + Returns + ------- + bool + True if the two Index objects are equal, False otherwise. + """ return self.jobs.equals(__o.jobs) def _repr_html_(self) -> str: + """ + Returns an HTML representation of the jobs. + + Returns + ------- + str + HTML representation of the jobs. + """ return self.jobs.__repr__() @property def indices(self) -> pd.Index: - """The indices of the jobs.""" + """ + The indices of the jobs. + + Returns + ------- + pd.Index + The indices of the jobs. + """ return self.jobs.index - def iloc(self, indices: Iterable[int]) -> Iterable[int]: + def iloc(self, indices: Iterable[int] | int) -> Iterable[int]: + """ + Gets the position of the given indices in the jobs. + + Parameters + ---------- + indices : Iterable[int] or int + Indices to locate. + + Returns + ------- + Iterable[int] + Positions of the given indices. + """ + if isinstance(indices, int): + indices = [indices] return self.indices.get_indexer(indices) - # Alternative Constructors - # ========================================================================= + def is_all_finished(self) -> bool: + """ + Checks if all jobs are finished. + + Returns + ------- + bool + True if all jobs are finished, False otherwise. + """ + return all(self.jobs.isin([Status.FINISHED, Status.ERROR])) @classmethod def from_data(cls: Type[Index], data: _Data, value: str = Status.OPEN) -> Index: - """Create a JobQueue object from a Data object. + """ + Create an Index object from a Data object. Parameters ---------- - data : Data + data : _Data Data object containing the data. - value : str + value : str, optional The value to assign to the jobs. Can be 'open', - 'in progress', 'finished', or 'error'. + 'in_progress', 'finished', or 'error'. Default is 'open'. Returns ------- - JobQueue - JobQueue object containing the loaded data. + Index + Index object containing the loaded data. """ return cls(pd.Series([value] * len(data), dtype='string')) @classmethod def from_file(cls: Type[Index], filename: Path | str) -> Index: - """Create a JobQueue object from a pickle file. + """ + Create an Index object from a pickle file. Parameters ---------- - filename : Path | str + filename : Path or str Name of the file. 
Returns ------- - JobQueue - JobQueue object containing the loaded data. + Index + Index object containing the loaded data. + + Raises + ------ + FileNotFoundError + If the specified file does not exist. """ - # Convert filename to Path if Path(filename).with_suffix('.csv').exists(): - return cls( - pd.read_csv(Path(filename).with_suffix('.csv'), - index_col=0)['0']) - + return cls(pd.read_csv(Path(filename).with_suffix('.csv'), + index_col=0)['0']) elif Path(filename).with_suffix('.pkl').exists(): - return cls( - pd.read_pickle(Path(filename).with_suffix('.pkl'))) - + return cls(pd.read_pickle(Path(filename).with_suffix('.pkl'))) else: raise FileNotFoundError(f"Jobfile {filename} does not exist.") - # Select - # ========================================================================= - def select_all(self, status: str) -> Index: - """Selects all jobs with a certain status. + """ + Selects all jobs with a certain status. Parameters ---------- status : str - Status of the jobs to select + Status of the jobs to select. Returns ------- - JobQueue - JobQueue object containing the selected jobs. + Index + Index object containing the selected jobs. """ return Index(self.jobs[self.jobs == status]) - # Export - # ========================================================================= - def store(self, filename: Path) -> None: - """Stores the jobs in a pickle file. + """ + Stores the jobs in a pickle file. Parameters ---------- filename : Path Path of the file. """ - self.jobs.to_csv(filename.with_suffix('.csv')) + self.jobs.to_pickle(filename.with_suffix('.pkl')) + # self.jobs.to_csv(filename.with_suffix('.csv')) def to_dataframe(self, name: str = "") -> pd.DataFrame: - """Converts the job queue to a DataFrame. + """ + Converts the job queue to a DataFrame. Parameters ---------- name : str, optional - Name of the column, by default "". - - Note - ---- - If the name is not specified, the column name will be an empty string + Name of the column. Default is an empty string. Returns ------- - DataFrame + pd.DataFrame DataFrame containing the jobs. """ - return self.jobs.to_frame("") + return self.jobs.to_frame(name) - # Append and remove jobs - # ========================================================================= + def get_open_job(self) -> int: + """ + Returns the index of an open job. - def remove(self, indices: List[int]): - """Removes a subset of the jobs. + Returns + ------- + int + Index of an open job. + + Raises + ------ + NoOpenJobsError + If no open jobs are found. + """ + try: + return int(self.jobs[self.jobs == Status.OPEN].index[0]) + except IndexError: + raise NoOpenJobsError("No open jobs found.") + + def remove(self, indices: List[int]) -> None: + """ + Removes a subset of the jobs. Parameters ---------- @@ -210,69 +318,56 @@ def remove(self, indices: List[int]): """ self.jobs = self.jobs.drop(indices) - def overwrite( - self, indices: Iterable[int], - other: Index | str) -> None: + def overwrite(self, indices: Iterable[int], other: Index | str) -> None: + """ + Overwrites the jobs at the specified indices with new jobs. + Parameters + ---------- + indices : Iterable[int] + Indices to overwrite. + other : Index or str + New jobs to overwrite with. 
+        """
         if isinstance(other, str):
-            other = Index(
-                pd.Series([other], index=[0], dtype='string'))
+            other = Index(pd.Series([other], index=[0], dtype='string'))
 
         self.jobs.update(other.jobs.set_axis(indices))
 
-    # Mark
-    # =========================================================================
-
     def mark(self, index: int | slice | Iterable[int],
              status: Status) -> None:
-        """Marks a job with a certain status.
+        """
+        Marks a job with a certain status.
 
         Parameters
         ----------
-        index : int
+        index : int, slice, or Iterable[int]
             Index of the job to mark.
-        status : str
+        status : Status
             Status to mark the job with.
         """
         self.jobs.loc[index] = status
 
     def mark_all_in_progress_open(self) -> None:
-        """Marks all jobs as 'open'."""
+        """
+        Marks all 'in progress' jobs as 'open'.
+        """
         self.jobs = self.jobs.replace(Status.IN_PROGRESS, Status.OPEN)
 
     def mark_all_error_open(self) -> None:
-        """Marks all jobs as 'open'."""
-        self.jobs = self.jobs.replace(Status.ERROR, Status.OPEN)
-    # Miscellanous
-    # =========================================================================
-
-    def is_all_finished(self) -> bool:
-        """Checks if all jobs are finished.
-
-        Returns
-        -------
-        bool
-            True if all jobs are finished, False otherwise.
         """
-        return all(self.jobs.isin([Status.FINISHED, Status.ERROR]))
-
-    def get_open_job(self) -> int:
-        """Returns the index of an open job.
-
-        Returns
-        -------
-        int
-            Index of an open job.
+        Marks all 'error' jobs as 'open'.
         """
-        try:  # try to find an open job
-            return int(self.jobs[self.jobs == Status.OPEN].index[0])
-        except IndexError:
-            raise NoOpenJobsError("No open jobs found.")
+        self.jobs = self.jobs.replace(Status.ERROR, Status.OPEN)
 
     def reset_index(self) -> None:
-        """Resets the index of the jobs."""
+        """
+        Resets the index of the jobs.
+        """
         self.jobs.reset_index(drop=True, inplace=True)
 
+# =============================================================================
+
 
 def _jobs_factory(jobs: Path | str | Index | None, input_data: _Data,
                   output_data: _Data, job_value: Status) -> Index:
     """Creates a Index object from particular inpute
diff --git a/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py b/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py
index 26df0982..c0cc9745 100644
--- a/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py
+++ b/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py
@@ -221,7 +221,8 @@ def from_file(cls, filename: Path) -> _Data:
         _Data
             The created _Data object.
         """
-        ...
+        df = pd.read_csv(filename.with_suffix('.csv'), header=0, index_col=0)
+        return cls.from_dataframe(df)
 
     @classmethod
     def from_numpy(cls: Type[_Data], array: np.ndarray,
@@ -322,7 +323,7 @@ def store(self, filename: Path):
         filename : Path
             The file to store the data in.
         """
-        ...
+        self.to_dataframe().to_csv(filename.with_suffix('.csv'))
 
     def get_data_dict(self, row: int) -> Dict[str, Any]:
         """
diff --git a/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py b/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py
index 7851f30f..57e151ae 100644
--- a/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py
+++ b/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py
@@ -110,12 +110,20 @@ def __init__(self,
 
         self.project_dir = _project_dir_factory(project_dir)
 
-        if isinstance(input_data, np.ndarray) and isinstance(domain, Domain):
-            self._input_data = _data_factory(input_data, domain.names)
-            self._output_data = _data_factory(output_data, domain.output_names)
+        # DOMAIN
+        if domain is None:
+            self.domain = Domain.from_data(
+                input_data=_data_factory(input_data),
+                output_data=_data_factory(output_data))
+
-        else:
-            self._input_data = _data_factory(input_data)
-            self._output_data = _data_factory(output_data)
+        else:
+            self.domain = _domain_factory(domain=domain)
+
+        # INPUT AND OUTPUT DATA
+        self._input_data = _data_factory(
+            data=input_data, keys=self.domain.names)
+        self._output_data = _data_factory(
+            data=output_data, keys=self.domain.output_names)
 
         # Create empty output_data from indices if output_data is empty
         if self._output_data.is_empty():
@@ -125,10 +133,6 @@ def __init__(self,
         else:
             job_value = Status.FINISHED
 
-        self.domain = _domain_factory(
-            domain=domain, input_data=self._input_data.to_dataframe(),
-            output_data=self._output_data.to_dataframe())
-
         # Create empty input_data from domain if input_data is empty
         if self._input_data.is_empty():
             self._input_data = _Data()
diff --git a/src/f3dasm/_src/experimentdata/experimentdata.py b/src/f3dasm/_src/experimentdata/experimentdata.py
index f053fdc5..72f66d0e 100644
--- a/src/f3dasm/_src/experimentdata/experimentdata.py
+++ b/src/f3dasm/_src/experimentdata/experimentdata.py
@@ -110,8 +110,20 @@ def __init__(self,
 
         self.project_dir = _project_dir_factory(project_dir)
 
-        self._input_data = _data_factory(input_data)
-        self._output_data = _data_factory(output_data)
+        # DOMAIN
+        if domain is None:
+            self.domain = Domain.from_data(
+                input_data=_data_factory(input_data),
+                output_data=_data_factory(output_data))
+
+        else:
+            self.domain = _domain_factory(domain=domain)
+
+        # INPUT AND OUTPUT DATA
+        self._input_data = _data_factory(
+            data=input_data, keys=self.domain.names)
+        self._output_data = _data_factory(
+            data=output_data, keys=self.domain.output_names)
 
         # Create empty output_data from indices if output_data is empty
         if self._output_data.is_empty():
@@ -121,9 +133,9 @@ def __init__(self,
         else:
             job_value = Status.FINISHED
 
-        self.domain = _domain_factory(
-            domain=domain, input_data=self._input_data.to_dataframe(),
-            output_data=self._output_data.to_dataframe())
+        # self.domain = _domain_factory(
+        #     domain=domain, input_data=self._input_data.to_dataframe(),
+        #     output_data=self._output_data.to_dataframe())
 
         # Create empty input_data from domain if input_data is empty
         if self._input_data.is_empty():
diff --git a/tests/newdata/test_data.py b/tests/newdata/test_data.py
index 6c5abe52..644496ac 100644
--- a/tests/newdata/test_data.py
+++ b/tests/newdata/test_data.py
@@ -115,7 +115,7 @@ def test_len():
 
 def test_indices():
     data = _Data({0: {"a": 1}, 1: {"a": 2}})
-    assert data.indices == [0, 1]
+    assert data.indices.equals(pd.Index([0, 1]))
 
 
 def test_names():
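Note on the refactor in PATCH 11 above: initialization is now domain-first.
The Domain is resolved before any data is parsed (inferred from the raw data
via Domain.from_data when `domain is None`), and the input/output containers
are then built with the domain's column names as keys. A practical
consequence is that a bare numpy array, which carries no column names of its
own, can only be interpreted together with a domain; the tests in this series
expect a ValueError when an array is passed without one. A minimal sketch of
the intended call pattern (assuming the public f3dasm API; illustrative only,
not part of the patch):

    import numpy as np

    from f3dasm import ExperimentData
    from f3dasm.design import make_nd_continuous_domain

    domain = make_nd_continuous_domain(
        bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), dimensionality=3)
    x = np.random.default_rng(42).random((10, 3))

    # the domain supplies the column keys ('x0', 'x1', 'x2') for the array
    data = ExperimentData(domain=domain, input_data=x)

    # ExperimentData(input_data=x) without a domain is expected to raise a
    # ValueError, since there are no names to attach to the columns
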
<61459087+mpvanderschelling@users.noreply.github.com> Date: Wed, 26 Jun 2024 12:06:19 +0200 Subject: [PATCH 12/17] Added tmp file creation after storing. Fixes #273 --- src/f3dasm/_src/design/domain.py | 5 ++- src/f3dasm/_src/experimentdata/_data.py | 5 ++- src/f3dasm/_src/experimentdata/_io.py | 42 +++++++++++++++++++ src/f3dasm/_src/experimentdata/_jobqueue.py | 5 ++- .../_src/experimentdata/experimentdata.py | 6 ++- 5 files changed, 59 insertions(+), 4 deletions(-) diff --git a/src/f3dasm/_src/design/domain.py b/src/f3dasm/_src/design/domain.py index 9c31b9df..930a36be 100644 --- a/src/f3dasm/_src/design/domain.py +++ b/src/f3dasm/_src/design/domain.py @@ -276,9 +276,12 @@ def store(self, filename: Path) -> None: filename : str Name of the file. """ - with open(filename.with_suffix('.pkl'), 'wb') as f: + with open(filename.with_suffix('.tmp'), 'wb') as f: pickle.dump(self, f) + # rename the file to the correct extension + filename.with_suffix('.tmp').rename(filename.with_suffix('.pkl')) + def _cast_types_dataframe(self) -> dict: """Make a dictionary that provides the datatype of each parameter""" return {name: parameter._type for diff --git a/src/f3dasm/_src/experimentdata/_data.py b/src/f3dasm/_src/experimentdata/_data.py index 0721396b..849326f9 100644 --- a/src/f3dasm/_src/experimentdata/_data.py +++ b/src/f3dasm/_src/experimentdata/_data.py @@ -264,7 +264,10 @@ def store(self, filename: Path) -> None: The data is stored as a csv file. """ # TODO: The column information is not saved in the .csv! - self.to_dataframe().to_csv(filename.with_suffix('.csv')) + self.to_dataframe().to_csv(filename.with_suffix('.tmp')) + + # rename the file to the correct extension + filename.with_suffix('.tmp').rename(filename.with_suffix('.csv')) def n_best_samples(self, nosamples: int, column_name: List[str] | str) -> pd.DataFrame: diff --git a/src/f3dasm/_src/experimentdata/_io.py b/src/f3dasm/_src/experimentdata/_io.py index f602dbac..385f495e 100644 --- a/src/f3dasm/_src/experimentdata/_io.py +++ b/src/f3dasm/_src/experimentdata/_io.py @@ -11,6 +11,7 @@ # Standard import pickle from pathlib import Path +from time import sleep from typing import Any, Mapping, Optional, Type # Third-party @@ -43,6 +44,13 @@ RESOLUTION_MATPLOTLIB_FIGURE = 300 MAX_TRIES = 10 +# Exceptions +# ============================================================================= + + +class TemporaryFilesNotCleared(Exception): + pass + # Storing methods # ============================================================================= @@ -365,3 +373,37 @@ def _project_dir_factory(project_dir: Path | str | None) -> Path: raise TypeError( f"project_dir must be of type Path, str or None, \ not {type(project_dir).__name__}") + + +def check_for_temporary_files(directory: Path, delay: float = 0.3): + """ + Check if there are any .tmp files in the subdirectory. + + Parameters + ---------- + subdirectory : Path + subdirectory to check for temporary files + delay : float, optional + delay between checks, by default 0.3 + + Raises + ------ + TemporaryFilesNotCleared + Raises if temporary files are found after the maximum number of tries + """ + for attempt in range(MAX_TRIES): + if not any(directory.glob('*.tmp')): + logger.debug(( + f"No temporary files found in {directory} after " + f"{attempt + 1} tries.") + ) + break + logger.debug(( + f"Temporary files found in {directory} after {attempt + 1} " + f"tries. 
Waiting {delay} seconds before checking again.") + ) + sleep(delay) + else: + raise TemporaryFilesNotCleared(( + f"Temporary files found in {directory} after {MAX_TRIES} tries." + )) diff --git a/src/f3dasm/_src/experimentdata/_jobqueue.py b/src/f3dasm/_src/experimentdata/_jobqueue.py index 79264ce1..2f83cefe 100644 --- a/src/f3dasm/_src/experimentdata/_jobqueue.py +++ b/src/f3dasm/_src/experimentdata/_jobqueue.py @@ -197,7 +197,10 @@ def store(self, filename: Path) -> None: filename : Path Path of the file. """ - self.jobs.to_pickle(filename.with_suffix('.pkl')) + self.jobs.to_pickle(filename.with_suffix('.tmp')) + + # rename the file to the correct extension + filename.with_suffix('.tmp').rename(filename.with_suffix('.pkl')) def to_dataframe(self, name: str = "") -> pd.DataFrame: """Converts the job queue to a DataFrame. diff --git a/src/f3dasm/_src/experimentdata/experimentdata.py b/src/f3dasm/_src/experimentdata/experimentdata.py index 72f66d0e..ff5086d7 100644 --- a/src/f3dasm/_src/experimentdata/experimentdata.py +++ b/src/f3dasm/_src/experimentdata/experimentdata.py @@ -38,7 +38,8 @@ from ._data import DataTypes, _Data, _data_factory from ._io import (DOMAIN_FILENAME, EXPERIMENTDATA_SUBFOLDER, INPUT_DATA_FILENAME, JOBS_FILENAME, LOCK_FILENAME, MAX_TRIES, - OUTPUT_DATA_FILENAME, _project_dir_factory) + OUTPUT_DATA_FILENAME, _project_dir_factory, + check_for_temporary_files) from ._jobqueue import NoOpenJobsError, Status, _jobs_factory from .experimentsample import ExperimentSample from .samplers import Sampler, SamplerNames, _sampler_factory @@ -394,6 +395,9 @@ def _from_file_attempt(cls: Type[ExperimentData], """ subdirectory = project_dir / EXPERIMENTDATA_SUBFOLDER + # check if there is any .tmp file in the subdirectory + check_for_temporary_files(subdirectory) + try: return cls(domain=subdirectory / DOMAIN_FILENAME, input_data=subdirectory / INPUT_DATA_FILENAME, From 06789592a225002422409b10e5872c3f9c0ba01e Mon Sep 17 00:00:00 2001 From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com> Date: Wed, 26 Jun 2024 13:11:55 +0200 Subject: [PATCH 13/17] Remove old files and rename files with correct extensions --- src/f3dasm/_src/design/domain.py | 3 +++ src/f3dasm/_src/experimentdata/_data.py | 3 +++ src/f3dasm/_src/experimentdata/_jobqueue.py | 3 +++ 3 files changed, 9 insertions(+) diff --git a/src/f3dasm/_src/design/domain.py b/src/f3dasm/_src/design/domain.py index 930a36be..29c36b96 100644 --- a/src/f3dasm/_src/design/domain.py +++ b/src/f3dasm/_src/design/domain.py @@ -279,6 +279,9 @@ def store(self, filename: Path) -> None: with open(filename.with_suffix('.tmp'), 'wb') as f: pickle.dump(self, f) + # remove old file if it exists + filename.with_suffix('.pkl').unlink(missing_ok=True) + # rename the file to the correct extension filename.with_suffix('.tmp').rename(filename.with_suffix('.pkl')) diff --git a/src/f3dasm/_src/experimentdata/_data.py b/src/f3dasm/_src/experimentdata/_data.py index 849326f9..1a17448b 100644 --- a/src/f3dasm/_src/experimentdata/_data.py +++ b/src/f3dasm/_src/experimentdata/_data.py @@ -266,6 +266,9 @@ def store(self, filename: Path) -> None: # TODO: The column information is not saved in the .csv! 
         self.to_dataframe().to_csv(filename.with_suffix('.tmp'))
 
+        # remove the old file if it exists
+        filename.with_suffix('.csv').unlink(missing_ok=True)
+
         # rename the file to the correct extension
         filename.with_suffix('.tmp').rename(filename.with_suffix('.csv'))
 
diff --git a/src/f3dasm/_src/experimentdata/_jobqueue.py b/src/f3dasm/_src/experimentdata/_jobqueue.py
index 2f83cefe..34818226 100644
--- a/src/f3dasm/_src/experimentdata/_jobqueue.py
+++ b/src/f3dasm/_src/experimentdata/_jobqueue.py
@@ -199,6 +199,9 @@ def store(self, filename: Path) -> None:
         """
         self.jobs.to_pickle(filename.with_suffix('.tmp'))
 
+        # remove old file if it exists
+        filename.with_suffix('.pkl').unlink(missing_ok=True)
+
         # rename the file to the correct extension
         filename.with_suffix('.tmp').rename(filename.with_suffix('.pkl'))
 
From 5070d70b801db5b8d656019d5e6ba625b844011e Mon Sep 17 00:00:00 2001
From: Martin van der Schelling
 <61459087+mpvanderschelling@users.noreply.github.com>
Date: Wed, 26 Jun 2024 13:39:14 +0200
Subject: [PATCH 14/17] Remove retrieval of updated experimentdata in cluster
 mode

---
 src/f3dasm/_src/experimentdata/experimentdata.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/f3dasm/_src/experimentdata/experimentdata.py b/src/f3dasm/_src/experimentdata/experimentdata.py
index ff5086d7..ca700764 100644
--- a/src/f3dasm/_src/experimentdata/experimentdata.py
+++ b/src/f3dasm/_src/experimentdata/experimentdata.py
@@ -1296,10 +1296,13 @@ def _run_cluster(self, data_generator: DataGenerator, kwargs: dict):
         NoOpenJobsError
             Raised when there are no open jobs left
         """
-        # Retrieve the updated experimentdata object from disc
-        try:
-            self = self.from_file(self.project_dir)
-        except FileNotFoundError:  # If not found, store current
+        # # Retrieve the updated experimentdata object from disc
+        # try:
+        #     self = self.from_file(self.project_dir)
+        # except FileNotFoundError:  # If not found, store current
+        #     self.store()
+
+        if not (self.project_dir / EXPERIMENTDATA_SUBFOLDER).exists():
             self.store()
 
         while True:
From ca4606cdbfd01dd33201183c1bd8964d87c5e6b0 Mon Sep 17 00:00:00 2001
From: Martin van der Schelling
 <61459087+mpvanderschelling@users.noreply.github.com>
Date: Wed, 26 Jun 2024 14:01:41 +0200
Subject: [PATCH 15/17] Add optional parameter to store methods

---
 src/f3dasm/_src/design/domain.py              | 18 +++++++++++-------
 src/f3dasm/_src/experimentdata/_data.py       | 19 ++++++++++++-------
 src/f3dasm/_src/experimentdata/_jobqueue.py   | 15 +++++++++------
 .../_src/experimentdata/experimentdata.py     | 19 ++++++++++++++----
 4 files changed, 46 insertions(+), 25 deletions(-)

diff --git a/src/f3dasm/_src/design/domain.py b/src/f3dasm/_src/design/domain.py
index 29c36b96..c88acc14 100644
--- a/src/f3dasm/_src/design/domain.py
+++ b/src/f3dasm/_src/design/domain.py
@@ -268,7 +268,7 @@ def from_data(cls: Type[Domain],
 
 # Export
 # =============================================================================
 
-    def store(self, filename: Path) -> None:
+    def store(self, filename: Path, create_tmp: bool = False) -> None:
         """Stores the Domain in a pickle file.
 
         Parameters
         ----------
         filename : str
             Name of the file.
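+        create_tmp : bool, optional
+            If True, store to a '.tmp' file first and rename it to the
+            final extension afterwards, by default False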
""" - with open(filename.with_suffix('.tmp'), 'wb') as f: - pickle.dump(self, f) + if create_tmp: + with open(filename.with_suffix('.tmp'), 'wb') as f: + pickle.dump(self, f) - # remove old file if it exists - filename.with_suffix('.pkl').unlink(missing_ok=True) + # remove old file if it exists + filename.with_suffix('.pkl').unlink(missing_ok=True) - # rename the file to the correct extension - filename.with_suffix('.tmp').rename(filename.with_suffix('.pkl')) + # rename the file to the correct extension + filename.with_suffix('.tmp').rename(filename.with_suffix('.pkl')) + else: + with open(filename.with_suffix('.pkl'), 'wb') as f: + pickle.dump(self, f) def _cast_types_dataframe(self) -> dict: """Make a dictionary that provides the datatype of each parameter""" diff --git a/src/f3dasm/_src/experimentdata/_data.py b/src/f3dasm/_src/experimentdata/_data.py index 1a17448b..137ff3a8 100644 --- a/src/f3dasm/_src/experimentdata/_data.py +++ b/src/f3dasm/_src/experimentdata/_data.py @@ -251,7 +251,7 @@ def to_dataframe(self) -> pd.DataFrame: df.columns = self.names return df.astype(object) - def store(self, filename: Path) -> None: + def store(self, filename: Path, create_tmp: bool = False) -> None: """Stores the data to a file. Parameters @@ -263,14 +263,19 @@ def store(self, filename: Path) -> None: ---- The data is stored as a csv file. """ - # TODO: The column information is not saved in the .csv! - self.to_dataframe().to_csv(filename.with_suffix('.tmp')) - # remove the old file if it exists - filename.with_suffix('.csv').unlink(missing_ok=True) + if create_tmp: + self.to_dataframe().to_csv(filename.with_suffix('.tmp')) - # rename the file to the correct extension - filename.with_suffix('.tmp').rename(filename.with_suffix('.csv')) + # remove the old file if it exists + filename.with_suffix('.csv').unlink(missing_ok=True) + + # rename the file to the correct extension + filename.with_suffix('.tmp').rename(filename.with_suffix('.csv')) + + else: + # TODO: The column information is not saved in the .csv! + self.to_dataframe().to_csv(filename.with_suffix('.csv')) def n_best_samples(self, nosamples: int, column_name: List[str] | str) -> pd.DataFrame: diff --git a/src/f3dasm/_src/experimentdata/_jobqueue.py b/src/f3dasm/_src/experimentdata/_jobqueue.py index 34818226..63f105af 100644 --- a/src/f3dasm/_src/experimentdata/_jobqueue.py +++ b/src/f3dasm/_src/experimentdata/_jobqueue.py @@ -189,7 +189,7 @@ def select_all(self, status: str) -> _JobQueue: # Export # ========================================================================= - def store(self, filename: Path) -> None: + def store(self, filename: Path, create_tmp: bool = False) -> None: """Stores the jobs in a pickle file. Parameters @@ -197,13 +197,16 @@ def store(self, filename: Path) -> None: filename : Path Path of the file. """ - self.jobs.to_pickle(filename.with_suffix('.tmp')) + if create_tmp: + self.jobs.to_pickle(filename.with_suffix('.tmp')) - # remove old file if it exists - filename.with_suffix('.pkl').unlink(missing_ok=True) + # remove old file if it exists + filename.with_suffix('.pkl').unlink(missing_ok=True) - # rename the file to the correct extension - filename.with_suffix('.tmp').rename(filename.with_suffix('.pkl')) + # rename the file to the correct extension + filename.with_suffix('.tmp').rename(filename.with_suffix('.pkl')) + else: + self.jobs.to_pickle(filename.with_suffix('.pkl')) def to_dataframe(self, name: str = "") -> pd.DataFrame: """Converts the job queue to a DataFrame. 
diff --git a/src/f3dasm/_src/experimentdata/experimentdata.py b/src/f3dasm/_src/experimentdata/experimentdata.py index ca700764..8deb2fff 100644 --- a/src/f3dasm/_src/experimentdata/experimentdata.py +++ b/src/f3dasm/_src/experimentdata/experimentdata.py @@ -551,7 +551,8 @@ def get_output_data(self, # Export # ========================================================================= - def store(self, project_dir: Optional[Path | str] = None): + def store(self, project_dir: Optional[Path | str] = None, + create_tmp: bool = False): """Write the ExperimentData to disk in the project directory. Parameters @@ -589,10 +590,18 @@ def store(self, project_dir: Optional[Path | str] = None): # Create the subdirectory if it does not exist subdirectory.mkdir(parents=True, exist_ok=True) - self._input_data.store(subdirectory / Path(INPUT_DATA_FILENAME)) - self._output_data.store(subdirectory / Path(OUTPUT_DATA_FILENAME)) - self._jobs.store(subdirectory / Path(JOBS_FILENAME)) - self.domain.store(subdirectory / Path(DOMAIN_FILENAME)) + self._input_data.store( + filename=subdirectory / Path(INPUT_DATA_FILENAME), + create_tmp=create_tmp) + self._output_data.store( + filename=subdirectory / Path(OUTPUT_DATA_FILENAME), + create_tmp=create_tmp) + self._jobs.store( + filename=subdirectory / Path(JOBS_FILENAME), + create_tmp=create_tmp) + self.domain.store( + filename=subdirectory / Path(DOMAIN_FILENAME), + create_tmp=create_tmp) def to_numpy(self) -> Tuple[np.ndarray, np.ndarray]: """ From f63e1d30443e7eb54458c5723a488a051213e500 Mon Sep 17 00:00:00 2001 From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com> Date: Wed, 26 Jun 2024 14:16:12 +0200 Subject: [PATCH 16/17] Fix reading empty Pandas DataFrame error --- src/f3dasm/_src/experimentdata/_io.py | 4 +++ .../_src/experimentdata/experimentdata.py | 28 +++++++++++-------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/f3dasm/_src/experimentdata/_io.py b/src/f3dasm/_src/experimentdata/_io.py index 385f495e..b4a0d92f 100644 --- a/src/f3dasm/_src/experimentdata/_io.py +++ b/src/f3dasm/_src/experimentdata/_io.py @@ -51,6 +51,10 @@ class TemporaryFilesNotCleared(Exception): pass + +class ReadingEmptyPandasDataFrameError(Exception): + pass + # Storing methods # ============================================================================= diff --git a/src/f3dasm/_src/experimentdata/experimentdata.py b/src/f3dasm/_src/experimentdata/experimentdata.py index 8deb2fff..f6d1ac49 100644 --- a/src/f3dasm/_src/experimentdata/experimentdata.py +++ b/src/f3dasm/_src/experimentdata/experimentdata.py @@ -38,8 +38,8 @@ from ._data import DataTypes, _Data, _data_factory from ._io import (DOMAIN_FILENAME, EXPERIMENTDATA_SUBFOLDER, INPUT_DATA_FILENAME, JOBS_FILENAME, LOCK_FILENAME, MAX_TRIES, - OUTPUT_DATA_FILENAME, _project_dir_factory, - check_for_temporary_files) + OUTPUT_DATA_FILENAME, ReadingEmptyPandasDataFrameError, + _project_dir_factory, check_for_temporary_files) from ._jobqueue import NoOpenJobsError, Status, _jobs_factory from .experimentsample import ExperimentSample from .samplers import Sampler, SamplerNames, _sampler_factory @@ -398,15 +398,21 @@ def _from_file_attempt(cls: Type[ExperimentData], # check if there is any .tmp file in the subdirectory check_for_temporary_files(subdirectory) - try: - return cls(domain=subdirectory / DOMAIN_FILENAME, - input_data=subdirectory / INPUT_DATA_FILENAME, - output_data=subdirectory / OUTPUT_DATA_FILENAME, - jobs=subdirectory / JOBS_FILENAME, - project_dir=project_dir) - except 
FileNotFoundError: - raise FileNotFoundError( - f"Cannot find the files from {subdirectory}.") + for attempt in range(MAX_TRIES): + try: + return cls(domain=subdirectory / DOMAIN_FILENAME, + input_data=subdirectory / INPUT_DATA_FILENAME, + output_data=subdirectory / OUTPUT_DATA_FILENAME, + jobs=subdirectory / JOBS_FILENAME, + project_dir=project_dir) + except FileNotFoundError: + raise FileNotFoundError( + f"Cannot find the files from {subdirectory}.") + except pd.errors.EmptyDataError: + sleep(1) + continue + + raise ReadingEmptyPandasDataFrameError(f"Reading empty dataframes") # Selecting subsets # ========================================================================= From f27235a18e038dc7990e4908a96ced4b55c5e8dc Mon Sep 17 00:00:00 2001 From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com> Date: Wed, 26 Jun 2024 14:17:24 +0200 Subject: [PATCH 17/17] flake8 fix --- src/f3dasm/_src/experimentdata/experimentdata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/f3dasm/_src/experimentdata/experimentdata.py b/src/f3dasm/_src/experimentdata/experimentdata.py index f6d1ac49..be63b7d2 100644 --- a/src/f3dasm/_src/experimentdata/experimentdata.py +++ b/src/f3dasm/_src/experimentdata/experimentdata.py @@ -412,7 +412,7 @@ def _from_file_attempt(cls: Type[ExperimentData], sleep(1) continue - raise ReadingEmptyPandasDataFrameError(f"Reading empty dataframes") + raise ReadingEmptyPandasDataFrameError("Reading empty dataframes") # Selecting subsets # =========================================================================
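Taken together, patches 12 through 17 implement one storage protocol: every `store()` first writes a `.tmp` file, unlinks the stale target, and renames the temporary file into place, while every reader waits for `.tmp` files to clear and retries reads that catch a file mid-replacement. The sketch below restates that write/read handshake as standalone code. It is a minimal illustration, not f3dasm API: the names `atomic_store`, `patient_read` and `DELAY`, and the closing `TimeoutError`, are invented for this example, and retrying on `FileNotFoundError` is an extra safeguard for the unlink-to-rename window that the patches themselves do not add.

    from pathlib import Path
    from time import sleep

    import pandas as pd

    MAX_TRIES = 10
    DELAY = 0.3


    def atomic_store(df: pd.DataFrame, filename: Path) -> None:
        # Write to a .tmp file so readers never see a half-written .csv
        df.to_csv(filename.with_suffix('.tmp'))

        # Drop the stale target, then publish the new file via rename
        filename.with_suffix('.csv').unlink(missing_ok=True)
        filename.with_suffix('.tmp').rename(filename.with_suffix('.csv'))


    def patient_read(filename: Path) -> pd.DataFrame:
        # Phase 1: wait until no writer has a .tmp file in flight
        for _ in range(MAX_TRIES):
            if not any(filename.parent.glob('*.tmp')):
                break
            sleep(DELAY)

        # Phase 2: retry reads that land between unlink() and rename()
        for _ in range(MAX_TRIES):
            try:
                return pd.read_csv(filename.with_suffix('.csv'),
                                   header=0, index_col=0)
            except (FileNotFoundError, pd.errors.EmptyDataError):
                sleep(DELAY)
        raise TimeoutError(
            f"{filename} was not readable after {MAX_TRIES} tries")

Between `unlink()` and `rename()` the target briefly does not exist, so a reader polling at the wrong moment can see either a missing file or, with a slower writer, an empty one; those are the two exceptions handled above, mirroring the `pd.errors.EmptyDataError` retry loop that patch 16 adds to `_from_file_attempt`.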