Skip to content

Commit

Permalink
Merge pull request #302 from lsst/u/lynnej/add_median_normalization_o…
Browse files Browse the repository at this point in the history
…ption

Let 'baseline_run' be multiple runs
  • Loading branch information
rhiannonlynne authored Mar 30, 2023
2 parents 7223d02 + db297eb commit c822f2f
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 24 deletions.
33 changes: 25 additions & 8 deletions rubin_sim/maf/run_comparison/summary_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@ def normalize_metric_summaries(
Parameters
----------
baseline_run : `str`
baseline_run : `str` or `list` of `str`
The name of the run that defines a normalized value of 1.
If a list is provided, the median value of each metric across that list is used as the reference.
summary : `pandas.DataFrame`
The summary metrics to normalize (as returned by `get_metric_summaries`)
metric_sets : `pandas.DataFrame`
Expand Down Expand Up @@ -83,11 +84,23 @@ def normalize_metric_summaries(
# Get rid of duplicate metrics and runs
summary = summary.T.groupby("metric").first().T.groupby("run").first()

# And now create a line just for "baseline" --
# if baseline_run lists more than one run, this is created from the median values per metric of those runs
# Make up a nonsense name for the reference, that is not currently in the summary dataframe
baseline_comparison = "bbb"
while baseline_comparison in summary.index:
baseline_comparison += "b"

if isinstance(summary.loc[baseline_run], pd.DataFrame):
summary.loc[baseline_comparison] = summary.loc[baseline_run].median(axis="rows")
else:
summary.loc[baseline_comparison] = summary.loc[baseline_run]

if metric_sets is None:
# If no invert/mag - just do simple normalization (1 + (x-x0)/x0)
norm_summary = 1 + (
summary.loc[:, :].sub(summary.loc[baseline_run, :], axis="columns")
).div(summary.loc[baseline_run, :], axis="columns")
summary.loc[:, :].sub(summary.loc[baseline_comparison, :], axis="columns")
).div(summary.loc[baseline_comparison, :], axis="columns")
else:
# Reindex metric set and remove duplicates or non-available metrics
metric_names = [n for n in metric_sets.index.names if not n == "metric"]
Expand All @@ -106,30 +119,34 @@ def normalize_metric_summaries(
)

# Direct metrics are those that are neither inverted, nor compared as magnitudes
# direct = 1 + (value - norm) / norm == value / norm
direct = ~np.logical_or(metric_sets["invert"], metric_sets["mag"])
norm_summary.loc[:, direct] = summary.loc[:, direct]

# invert = 1 + (1/value - 1/norm) / (1/norm) == norm / value
norm_summary.loc[:, metric_sets["invert"]] = (
1.0 / summary.loc[:, metric_sets["invert"]]
)

# mag = 1 + (1+value-norm - (1+norm-norm)) / (1+norm-norm) == 1 + (value - norm)
norm_summary.loc[:, metric_sets["mag"]] = 1.0 + summary.loc[
:,
metric_sets["mag"],
].subtract(summary.loc[baseline_run, metric_sets["mag"]], axis="columns")
].subtract(summary.loc[baseline_comparison, metric_sets["mag"]], axis="columns")

# Some metrics can be both inverted and magnitudes (eg rms mag values)
both = np.logical_and(metric_sets["invert"], metric_sets["mag"])
# both = 1 + (1-(value-norm) - (1-(norm-norm))) / (1-(norm-norm)) == 1 + (norm - value)
norm_summary.loc[:, both] = 1.0 - summary.loc[:, both].subtract(
summary.loc[baseline_run, both], axis="columns"
summary.loc[baseline_comparison, both], axis="columns"
)

# Look a the fractional difference compared with the baseline
# Turn the values above into the fractional difference compared with the baseline
norm_summary.loc[:, :] = 1 + (
norm_summary.loc[:, :].sub(
norm_summary.loc[baseline_run, :], axis="columns"
norm_summary.loc[baseline_comparison, :], axis="columns"
)
).div(norm_summary.loc[baseline_run, :], axis="columns")
).div(norm_summary.loc[baseline_comparison, :], axis="columns")

# Set the index name
norm_summary.columns.name = "metric"
Expand Down
46 changes: 30 additions & 16 deletions tests/maf/test_summary_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ def setUp(self):
self.baseline = self.runs[0]

self.metrics = [f"metric{i}" for i in range(self.num_metrics)]
self.inverted_metrics = ["metric3", "metric5"]
self.mag_metrics = ["metric1", "metric2"]
self.inverted_metrics = ["metric3", "metric5", "metric6"]
self.mag_metrics = ["metric1", "metric2", "metric6"]

self.metric_values = pd.DataFrame(
self.rng.normal(loc=3, scale=5, size=(self.num_runs, self.num_metrics)),
Expand All @@ -48,6 +48,7 @@ def setUp(self):
self.metric_set.loc["metric3", "style"] = "b--"

def test_normalize_metric_summaries(self):
# Test standard normalization with one run
norm_values = maf.normalize_metric_summaries(
self.baseline, self.metric_values, self.metric_set
)
Expand All @@ -58,9 +59,25 @@ def test_normalize_metric_summaries(self):
invert_cols=self.inverted_metrics,
mag_cols=self.mag_metrics,
)
np.testing.assert_allclose(norm_values.values, ref_norm_values.values)

# test normalizing against one run, as a list
norm_values = maf.normalize_metric_summaries(
[self.baseline], self.metric_values, self.metric_set
)
np.testing.assert_allclose(norm_values.values, ref_norm_values.values)

# test similar but pretend that self.baseline is two runs
norm_values = maf.normalize_metric_summaries(
[self.baseline, self.baseline], self.metric_values, self.metric_set
)
np.testing.assert_allclose(norm_values.values, ref_norm_values.values)

# test similar but different runs
norm_values = maf.normalize_metric_summaries(
[self.runs[0], self.runs[1]], self.metric_values, self.metric_set
)

def test_plot_run_metric(self):
fig, ax = maf.plot_run_metric(self.metric_values)

Expand Down Expand Up @@ -138,9 +155,7 @@ def test_plot_run_metric_mesh(self):
# internal functions & classes


def _run_infos_norm_df(
df, norm_run, invert_cols=None, reverse_cols=None, mag_cols=None
):
def _run_infos_norm_df(df, norm_run, invert_cols=None, mag_cols=None):
"""
Normalize values in a DataFrame, based on the values in a given run.
Can normalize some columns (metric values) differently (invert_cols, mag_cols)
Expand All @@ -155,28 +170,27 @@ def _run_infos_norm_df(
The name of the simulation to normalize to (typically family_baseline)
invert_cols: list
Columns (metric values) to convert to 1 / value
reverse_cols: list
Columns (metric values) to invert (-1 * value)
mag_cols: list
Columns (metric values) to treat as magnitudes (1 + (difference from norm_run))
Returns
-------
pd.DataFrame
Normalized data frame
"""

# This proc copied from https://github.com/lsst-pst/survey_strategy/blob/c559dcd895b3fe39f0e083832a07d89ecdfbe251/fbs_2.0/run_infos.py

# Copy the dataframe but drop the columns containing only strings
out_df = df.copy()
if reverse_cols is not None:
out_df[reverse_cols] = -out_df[reverse_cols]
if invert_cols is not None:
out_df[invert_cols] = 1 / out_df[invert_cols]
if mag_cols is not None:
out_df[mag_cols] = 1 + out_df[mag_cols] - out_df[mag_cols].loc[norm_run]
else:
if invert_cols is None:
invert_cols = []
if mag_cols is None:
mag_cols = []
both_cols = [c for c in invert_cols if c in mag_cols]
invert_cols = [c for c in invert_cols if c not in both_cols]
mag_cols = [c for c in mag_cols if c not in both_cols]
out_df[mag_cols] = 1 + out_df[mag_cols] - out_df[mag_cols].loc[norm_run]
out_df[invert_cols] = 1 / out_df[invert_cols]
out_df[both_cols] = 1 - out_df[both_cols] + out_df[both_cols].loc[norm_run]
# which columns are strings?
string_cols = [c for c, t in zip(df.columns, df.dtypes) if t == "object"]
cols = [c for c in out_df.columns.values if not (c in mag_cols or c in string_cols)]
Expand Down

0 comments on commit c822f2f

Please sign in to comment.