Merge pull request #232 from jpreszler/issue_129_docstring_additions

Issue 129: increase docstring coverage
pymc-labs · Sep 15, 2023 · 234a0cd · 234a0cd
2 parents d7a12cb + c80d78e
commit 234a0cd
Show file tree

Hide file tree

Showing 24 changed files with 1,108 additions and 75 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -34,6 +34,10 @@ jobs:
         uses: actions/setup-python@v3
         with:
           python-version: ${{ matrix.python-version }}
+      - name: Run doctests
+        run: |
+          pip install -e .[test]
+          pytest --doctest-modules causalpy/
       - name: Run tests
         run: |
           pip install -e .[test]

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -121,6 +121,20 @@ We recommend that your contribution complies with the following guidelines befor
 
 - All public methods must have informative docstrings with sample usage when appropriate.
 
+- Example usage in docstrings is tested via doctest, which can be run via
+
+    ```bash
+    make doctest
+    ```
+
+- Doctest can also be run directly via pytest, which can be helpful to run only specific tests during development. The following commands run all doctests, only doctests in the pymc_models module, and only the doctests for the `ModelBuilder` class in pymc_models:
+
+    ```bash
+    pytest --doctest-modules causalpy/
+    pytest --doctest-modules causalpy/pymc_models.py
+    pytest --doctest-modules causalpy/pymc_models.py::causalpy.pymc_models.ModelBuilder
+    ```
+
 - To indicate a work in progress please mark the PR as `draft`. Drafts may be useful to (1) indicate you are working on something to avoid duplicated work, (2) request broad review of functionality or API, or (3) seek collaborators.
 
 - All other tests pass when everything is rebuilt from scratch. Tests can be run with:

diff --git a/Makefile b/Makefile
@@ -17,6 +17,10 @@ check_lint:
 	nbqa isort --check-only .
 	interrogate .
 
+doctest:
+	pip install causalpy[test]
+	pytest --doctest-modules causalpy/
+
 test:
 	pip install causalpy[test]
 	pytest

diff --git a/causalpy/custom_exceptions.py b/causalpy/custom_exceptions.py
@@ -1,21 +1,26 @@
+"""
+Custom Exceptions for CausalPy.
+"""
+
+
 class BadIndexException(Exception):
     """Custom exception used when we have a mismatch in types between the dataframe
     index and an event, typically a treatment or intervention."""
 
-    def __init__(self, message):
+    def __init__(self, message: str):
         self.message = message
 
 
 class FormulaException(Exception):
     """Exception raised when there is some error in a user-provided model
     formula"""
 
-    def __init__(self, message):
+    def __init__(self, message: str):
         self.message = message
 
 
 class DataException(Exception):
     """Exception raised when there is some error in user-provided dataframe"""
 
-    def __init__(self, message):
+    def __init__(self, message: str):
         self.message = message
diff --git a/causalpy/data/datasets.py b/causalpy/data/datasets.py
@@ -1,3 +1,6 @@
+"""
+Functions to load example datasets
+"""
 import pathlib
 
 import pandas as pd

diff --git a/causalpy/data/simulate_data.py b/causalpy/data/simulate_data.py
@@ -1,3 +1,6 @@
+"""
+Functions that generate data sets used in examples
+"""
 import numpy as np
 import pandas as pd
 from scipy.stats import dirichlet, gamma, norm, uniform
@@ -11,6 +14,18 @@
 def _smoothed_gaussian_random_walk(
     gaussian_random_walk_mu, gaussian_random_walk_sigma, N, lowess_kwargs
 ):
+    """
+    Generates Gaussian random walk data and applies LOWESS
+
+    :param gaussian_random_walk_mu:
+        Mean of the random walk
+    :param gaussian_random_walk_sigma:
+        Standard deviation of the random walk
+    :param N:
+        Length of the random walk
+    :param lowess_kwargs:
+        Keyword argument dictionary passed to statsmodels lowess
+    """
     x = np.arange(N)
     y = norm(gaussian_random_walk_mu, gaussian_random_walk_sigma).rvs(N).cumsum()
     filtered = lowess(y, x, **lowess_kwargs)
@@ -26,12 +41,25 @@ def generate_synthetic_control_data(
     lowess_kwargs=default_lowess_kwargs,
 ):
     """
-    Example:
-    >> import pathlib
-    >> df, weightings_true = generate_synthetic_control_data(
-                                treatment_time=treatment_time
-                            )
-    >> df.to_csv(pathlib.Path.cwd() / 'synthetic_control.csv', index=False)
+    Generates data for synthetic control example.
+
+    :param N:
+        Number of data points
+    :param treatment_time:
+        Index where treatment begins in the generated dataframe
+    :param grw_mu:
+        Mean of Gaussian Random Walk
+    :param grw_sigma:
+        Standard deviation of Gaussian Random Walk
+    :param lowess_kwargs:
+        Keyword argument dictionary passed to statsmodels lowess
+
+    Example
+    --------
+    >>> from causalpy.data.simulate_data import generate_synthetic_control_data
+    >>> df, weightings_true = generate_synthetic_control_data(
+    ...                             treatment_time=70
+    ... )
     """
 
     # 1. Generate non-treated variables
@@ -70,6 +98,21 @@ def generate_synthetic_control_data(
 def generate_time_series_data(
     N=100, treatment_time=70, beta_temp=-1, beta_linear=0.5, beta_intercept=3
 ):
+    """
+    Generates interrupted time series example data
+
+    :param N:
+        Length of the time series
+    :param treatment_time:
+        Index of when treatment begins
+    :param beta_temp:
+        The temperature coefficient
+    :param beta_linear:
+        The linear coefficient
+    :param beta_intercept:
+        The intercept
+
+    """
     x = np.arange(0, 100, 1)
     df = pd.DataFrame(
         {
@@ -99,6 +142,9 @@ def generate_time_series_data(
 
 
 def generate_time_series_data_seasonal(treatment_time):
+    """
+    Generates 10 years of monthly data with seasonality
+    """
     dates = pd.date_range(
         start=pd.to_datetime("2010-01-01"), end=pd.to_datetime("2020-01-01"), freq="M"
     )
@@ -146,6 +192,14 @@ def generate_time_series_data_simple(treatment_time, slope=0.0):
 
 
 def generate_did():
+    """
+    Generate Difference in Differences data
+
+    Example
+    --------
+    >>> from causalpy.data.simulate_data import generate_did
+    >>> df = generate_did()
+    """
     # true parameters
     control_intercept = 1
     treat_intercept_delta = 0.25
@@ -157,6 +211,7 @@ def generate_did():
     def outcome(
         t, control_intercept, treat_intercept_delta, trend, Δ, group, post_treatment
     ):
+        """Compute the outcome of each unit"""
         return (
             control_intercept
             + (treat_intercept_delta * group)
@@ -191,16 +246,23 @@ def generate_regression_discontinuity_data(
     N=100, true_causal_impact=0.5, true_treatment_threshold=0.0
 ):
     """
-    Example use:
-    >> import pathlib
-    >> df = generate_regression_discontinuity_data(true_treatment_threshold=0.5)
-    >> df.to_csv(pathlib.Path.cwd() / 'regression_discontinuity.csv', index=False)
+    Generate regression discontinuity example data
+
+    Example
+    --------
+    >>> import pathlib
+    >>> from causalpy.data.simulate_data import generate_regression_discontinuity_data
+    >>> df = generate_regression_discontinuity_data(true_treatment_threshold=0.5)
+    >>> df.to_csv(pathlib.Path.cwd() / 'regression_discontinuity.csv',
+    ...     index=False) # doctest: +SKIP
     """
 
     def is_treated(x):
+        """Check if x was treated"""
         return np.greater_equal(x, true_treatment_threshold)
 
     def impact(x):
+        """Assign true_causal_impact to all treated entries"""
         y = np.zeros(len(x))
         y[is_treated(x)] = true_causal_impact
         return y
@@ -214,6 +276,22 @@ def impact(x):
 def generate_ancova_data(
     N=200, pre_treatment_means=np.array([10, 12]), treatment_effect=2, sigma=1
 ):
+    """
+    Generate ANCOVA example data
+
+    Example
+    --------
+    >>> import pathlib
+    >>> from causalpy.data.simulate_data import generate_ancova_data
+    >>> df = generate_ancova_data(
+    ...     N=200,
+    ...     pre_treatment_means=np.array([10, 12]),
+    ...     treatment_effect=2,
+    ...     sigma=1
+    ... )
+    >>> df.to_csv(pathlib.Path.cwd() / 'ancova_data.csv',
+    ...     index=False) # doctest: +SKIP
+    """
     group = np.random.choice(2, size=N)
     pre = np.random.normal(loc=pre_treatment_means[group])
     post = pre + treatment_effect * group + np.random.normal(size=N) * sigma
@@ -233,6 +311,10 @@ def generate_geolift_data():
     causal_impact = 0.2
 
     def create_series(n=52, amplitude=1, length_scale=2):
+        """
+        Returns numpy tile with generated seasonality data repeated over
+        multiple years
+        """
         return np.tile(
             generate_seasonality(n=n, amplitude=amplitude, length_scale=2) + 3, n_years
         )

diff --git a/causalpy/plot_utils.py b/causalpy/plot_utils.py
@@ -1,3 +1,7 @@
+"""
+Plotting utility functions.
+"""
+
 from typing import Any, Dict, Optional, Tuple, Union
 
 import arviz as az
@@ -17,7 +21,22 @@ def plot_xY(
     hdi_prob: float = 0.94,
     label: Union[str, None] = None,
 ) -> Tuple[Line2D, PolyCollection]:
-    """Utility function to plot HDI intervals."""
+    """
+    Utility function to plot HDI intervals.
+
+    :param x:
+        Pandas datetime index or numpy array of x-axis values
+    :param y:
+        Xarray data array of y-axis data
+    :param ax:
+        Matplotlib ax object
+    :param plot_hdi_kwargs:
+        Dictionary of keyword arguments passed to ax.plot()
+    :param hdi_prob:
+        The size of the HDI, default is 0.94
+    :param label:
+        The plot label
+    """
 
     if plot_hdi_kwargs is None:
         plot_hdi_kwargs = {}