implement conditional probs (#39)

* implement conditional probs * remove accidental * test other index names * remove the floating url
wd60622 · Jan 14, 2024 · 8dbe4a6 · 8dbe4a6
1 parent 4bdc926
commit 8dbe4a6
Show file tree

Hide file tree

Showing 5 changed files with 121 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -19,7 +19,6 @@ Or install directly from GitHub for the latest functionality.
 
 ## Features 
 
-https://wd60622.github.io/latent-calendar
 - Integrated automatically into `pandas` with [`cal` attribute on DataFrames and Series](https://wd60622.github.io/latent-calendar/modules/extensions)
 - Compatible with [`scikit-learn` pipelines and transformers](https://wd60622.github.io/latent-calendar/examples/model/sklearn-compat)
 - [Transform and visualize data on a weekly calendar](https://wd60622.github.io/latent-calendar/examples/cal-attribute)

diff --git a/latent_calendar/const.py b/latent_calendar/const.py
@@ -1,3 +1,4 @@
+"""Constants used to create the full vocabulary of the dataset."""
 import calendar
 from itertools import product
 from typing import Dict, List, Union
@@ -41,6 +42,19 @@ def dicretized_hours(minutes: int) -> List[float]:
 def create_full_vocab(
     days_in_week: int, minutes: int, as_multiindex: bool = True
 ) -> Union[pd.MultiIndex, List[str]]:
+    """Create the full vocabulary of the dataset.
+
+    Args:
+        days_in_week: Number of days in the week.
+        minutes: Number of minutes to discretize the hours by.
+        as_multiindex: Whether to return a multiindex or a list of strings.
+
+    Returns:
+        The full vocabulary of the dataset.
+            Either a MultiIndex or a list of strings.
+
+    """
+
     if not as_multiindex:
         return [
             format_dow_hour(day_of_week, hour)

diff --git a/latent_calendar/extensions.py b/latent_calendar/extensions.py
@@ -165,6 +165,29 @@ def timestamp_features(
 
         return transformer.fit_transform(self._obj.rename(name).to_frame())
 
+    def conditional_probabilities(
+        self,
+        *,
+        level: Union[int, str] = 0,
+    ) -> pd.Series:
+        """Divide each value by the total of its group along an index level.
+
+        Values are grouped by the chosen level of the index MultiIndex and
+        each value is divided by its group's sum, so every group with a
+        nonzero total sums to 1 afterwards.
+
+        Args:
+            level: position or name of the index MultiIndex level to
+                condition on. Default 0 or day_of_week
+
+        Returns:
+            Series with conditional probabilities
+
+        Raises:
+            ValueError: if the index of the Series is not a MultiIndex.
+
+        """
+        series = self._obj
+        if not isinstance(series.index, pd.MultiIndex):
+            raise ValueError(
+                "Series is expected to have a MultiIndex with the last column as the vocab."
+            )
+
+        # Totals are indexed by the level's labels only; `div(..., level=level)`
+        # broadcasts them back across the remaining level(s).
+        group_totals = series.groupby(level=level).sum()
+        return series.div(group_totals, level=level)
+
     def plot(
         self,
         *,
@@ -270,6 +293,30 @@ def normalize(self, kind: str) -> pd.DataFrame:
 
         raise ValueError(f"kind must be one of ['max', 'probs'], got {kind}")
 
+    def conditional_probabilities(
+        self,
+        *,
+        level: Union[int, str] = 0,
+    ) -> pd.DataFrame:
+        """Calculate conditional probabilities for each row over the level.
+
+        Within every row, columns are grouped by the chosen level of the
+        columns MultiIndex and each value is divided by its group's sum, so
+        every group with a nonzero total sums to 1 afterwards.
+
+        Args:
+            level: position or name of the columns MultiIndex level to
+                condition on. Default 0 or day_of_week
+
+        Returns:
+            DataFrame with conditional probabilities
+
+        Raises:
+            ValueError: if the columns of the DataFrame are not a MultiIndex.
+
+        """
+        if not isinstance(self._obj.columns, pd.MultiIndex):
+            raise ValueError(
+                "DataFrame is expected to have a MultiIndex with the last column as the vocab."
+            )
+
+        # `DataFrame.groupby(axis=1)` is deprecated since pandas 2.1. Group the
+        # transposed frame by its index level instead and transpose the totals
+        # back so they line up with the original rows.
+        group_totals = self._obj.T.groupby(level=level).sum().T
+
+        return self._obj.div(group_totals, level=level, axis=1)
+
     def timestamp_features(
         self,
         column: str,

diff --git a/latent_calendar/plot/core/model.py b/latent_calendar/plot/core/model.py
@@ -148,8 +148,8 @@ def plot_model_predictions(
     """Plot the model predictions compared to the test data.
 
     Args:
-        X_to_predict: Training data for the model
-        X_test: Testing data for the model
+        X_to_predict: Data for the model
+        X_holdout: Holdout data for the model
         model: LatentCalendar model instance
         divergent: Option to change the data displayed
         axes: list of 3 axes to plot this data

diff --git a/tests/test_extensions.py b/tests/test_extensions.py
@@ -21,6 +21,27 @@ def test_series_extensions(ser) -> None:
     assert isinstance(ax, plt.Axes)
 
 
+@pytest.fixture
+def ser_row() -> pd.Series:
+    """Series of ones indexed by FULL_VOCAB."""
+    return pd.Series(1, index=FULL_VOCAB)
+
+
+@pytest.mark.parametrize(
+    "level, axis",
+    [
+        ("day_of_week", 1),
+        (0, 1),
+        ("hour", 0),
+        (1, 0),
+    ],
+)
+def test_series_conditional_probabilities(ser_row, level, axis) -> None:
+    """Conditional probabilities sum to 1 within each group of `level`.
+
+    `unstack()` moves the last index level to the columns, so `axis` selects
+    the direction that sums over the level that was NOT conditioned on.
+    """
+    result = ser_row.cal.conditional_probabilities(level=level).unstack().sum(axis=axis)
+
+    # All the probabilities should sum to 1. Compare with a float tolerance
+    # rather than rounding, which would accept any sum between 0.5 and 1.5.
+    assert np.allclose(result, 1)
+
+
 @pytest.fixture
 def df() -> pd.DataFrame:
     """Generate some fake data."""
@@ -212,3 +233,40 @@ def test_wide_dataframe_extensions(df_wide: pd.DataFrame) -> None:
         pd.testing.assert_frame_equal(
             df_wide.cal.sum_next_hours(hours=next_hours), df_answer
         )
+
+
+@pytest.fixture
+def df_wide_subset() -> pd.DataFrame:
+    """Small all-ones frame: 3 rows by (2 days x 3 hours) MultiIndex columns."""
+    days = [0, 1]
+    hours = [0, 1, 2]
+    columns = pd.MultiIndex.from_product(
+        [days, hours],
+        names=["day_of_week", "hour"],
+    )
+
+    n_rows = 3
+    frame = pd.DataFrame(np.ones((n_rows, len(columns))), columns=columns)
+    return frame.sort_index(axis=1)
+
+
+@pytest.mark.parametrize(
+    "level, answer",
+    [
+        ("day_of_week", 1 / 3),
+        (0, 1 / 3),
+        ("hour", 1 / 2),
+        (1, 1 / 2),
+    ],
+)
+def test_dataframe_conditional_probabilities(
+    df_wide_subset: pd.DataFrame, level, answer
+) -> None:
+    """Every cell of the all-ones frame becomes 1 / (size of its group)."""
+    expected = pd.DataFrame(
+        np.full(df_wide_subset.shape, answer),
+        index=df_wide_subset.index,
+        columns=df_wide_subset.columns,
+    )
+
+    actual = df_wide_subset.cal.conditional_probabilities(level=level)
+
+    pd.testing.assert_frame_equal(actual, expected)