Merge pull request #87 from pymc-labs/quickstart-fixes-and-general-improvement

Quickstart fixes and general improvement
pymc-labs · Nov 23, 2022 · 4a23838 · 4a23838
2 parents cce4909 + 960c8f2
commit 4a23838
Show file tree

Hide file tree

Showing 17 changed files with 302 additions and 249 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -72,10 +72,31 @@ If there are autodoc issues/errors in remote builds of the docs, we need to add
 
 ## New releases [work in progress]
 
+### Test release to `test.pypi.org` (manual)
+
 1. Bump the release version in `causalpy/version.py`. This is automatically read by `setup.py` and `docs/config.py`.
+2. Update on test.pypi.org. _Note that this requires a username and password for test.pypi.org_. In the root directory, type the following:
+```bash
+rm -rf dist
+python setup.py sdist
+twine upload --repository testpypi dist/*
+```
+3. At this point the updated build is available on test.pypi.org. We can test that this is working as expected by installing (into a test environment) from test.pypi.org with
+
+```bash
+conda create -n causalpy-test python
+conda activate causalpy-test
+python3 -m pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ causalpy
+```
+
+4. Now load a Python or IPython session and follow the quickstart instructions to confirm things work.
+
+### Actual release to `pypi.org` (manual)
+
+1. If not done in the previous step, bump the release version in `causalpy/version.py`. This is automatically read by `setup.py` and `docs/config.py`.
 2. Update on pypi.org. In the root directory:
   - `python setup.py sdist`
-  - update to pypi.org with `twine upload dist/*`
+  - update to pypi.org with `twine upload dist/*`. Note that this requires a username and password for pypi.org.
 3. Readthedocs:
   - Docs should be built remotely every time there is a pull request
   - See here https://docs.readthedocs.io/en/stable/tutorial/#versioning-documentation for versioning the docs
diff --git a/README.md b/README.md
@@ -37,27 +37,23 @@ pip install git+https://github.com/pymc-labs/CausalPy.git
 ## Quickstart
 
 ```python
-from causalpy.pymc_experiments import RegressionDiscontinuity
-from causalpy.pymc_models import LinearRegression
-import pandas as pd
-import pathlib
+import causalpy as cp
 
 
 # Import and process data
-rd_data_path = pathlib.Path.cwd().parents[1] / "causalpy" / "data" / "drinking.csv"
 df = (
-    pd.read_csv(rd_data_path)[["agecell", "all", "mva", "suicide"]]
+    cp.load_data("drinking")
     .rename(columns={"agecell": "age"})
     .assign(treated=lambda df_: df_.age > 21)
     .dropna(axis=0)
     )
 
 # Run the analysis
-result = RegressionDiscontinuity(
+result = cp.pymc_experiments.RegressionDiscontinuity(
     df,
     formula="all ~ 1 + age + treated",
     running_variable_name="age",
-    prediction_model=LinearRegression(),
+    prediction_model=cp.pymc_models.LinearRegression(),
     treatment_threshold=21,
     )
 

diff --git a/causalpy/__init__.py b/causalpy/__init__.py
@@ -0,0 +1,6 @@
+import causalpy.pymc_experiments
+import causalpy.pymc_models
+import causalpy.skl_experiments
+import causalpy.skl_models
+
+from .data import load_data
diff --git a/causalpy/data/__init__.py b/causalpy/data/__init__.py
@@ -0,0 +1,4 @@
+"""Code for loading datasets."""
+from .datasets import load_data
+
+__all__ = ["load_data"]
diff --git a/causalpy/data/datasets.py b/causalpy/data/datasets.py
@@ -0,0 +1,33 @@
+import os
+import pathlib
+
+import pandas as pd
+
+import causalpy as cp
+
+DATASETS = {  # registry: short dataset name accepted by load_data -> metadata dict
+    "banks": {"filename": "banks.csv"},  # each entry currently holds only the CSV file name
+    "did": {"filename": "did.csv"},
+    "drinking": {"filename": "drinking.csv"},
+    "its": {"filename": "its.csv"},
+    "its simple": {"filename": "its_simple.csv"},  # note: this key contains a space
+    "rd": {"filename": "regression_discontinuity.csv"},
+    "sc": {"filename": "synthetic_control.csv"},
+}
+
+
+def get_data_home():  # builds a pathlib.Path from the location of the imported causalpy package
+    """Return the path of the data directory"""
+    return pathlib.Path(cp.__file__).parents[1] / "causalpy" / "data"  # parents[1] is the directory that contains the causalpy package; then descend back into causalpy/data
+
+
+def load_data(dataset: str = None):  # read the CSV registered under `dataset` in DATASETS and return the pd.read_csv result
+
+    if dataset in DATASETS:  # the default None is not a key, so calling without an argument raises below
+
+        data_dir = get_data_home()
+        datafile = DATASETS[dataset]  # metadata dict for this dataset; only "filename" is read
+        file_path = data_dir / datafile["filename"]
+        return pd.read_csv(file_path)
+    else:
+        raise ValueError(f"Dataset {dataset} not found!")  # raised for any name that is not a DATASETS key
diff --git a/causalpy/version.py b/causalpy/version.py
@@ -1 +1 @@
-__version__ = "0.0.2"
+__version__ = "0.0.3"
diff --git a/docs/index.rst b/docs/index.rst
@@ -29,27 +29,23 @@ Quickstart
 
 .. code-block:: python
 
-   from causalpy.pymc_experiments import RegressionDiscontinuity
-   from causalpy.pymc_models import LinearRegression
-   import pandas as pd
-   import pathlib
+   import causalpy as cp
 
 
    # Import and process data
-   rd_data_path = pathlib.Path.cwd().parents[1] / "causalpy" / "data" / "drinking.csv"
    df = (
-      pd.read_csv(rd_data_path)[["agecell", "all", "mva", "suicide"]]
+      cp.load_data("drinking")
       .rename(columns={"agecell": "age"})
       .assign(treated=lambda df_: df_.age > 21)
       .dropna(axis=0)
       )
 
    # Run the analysis
-   result = RegressionDiscontinuity(
+   result = cp.pymc_experiments.RegressionDiscontinuity(
       df,
       formula="all ~ 1 + age + treated",
       running_variable_name="age",
-      prediction_model=LinearRegression(),
+      prediction_model=cp.pymc_models.LinearRegression(),
       treatment_threshold=21,
       )
 

diff --git a/docs/notebooks/did_pymc.ipynb b/docs/notebooks/did_pymc.ipynb
diff --git a/docs/notebooks/did_pymc_banks.ipynb b/docs/notebooks/did_pymc_banks.ipynb
diff --git a/docs/notebooks/did_skl.ipynb b/docs/notebooks/did_skl.ipynb
@@ -13,8 +13,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd\n",
-    "import pathlib\n",
+    "import causalpy as cp\n",
     "import arviz as az"
    ]
   },
@@ -27,28 +26,13 @@
     "az.style.use(\"arviz-darkgrid\")"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Load data"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
-    "did_data_path = pathlib.Path.cwd().parents[1] / \"causalpy\" / \"data\" / \"did.csv\"\n",
-    "data = pd.read_csv(did_data_path)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Run the analysis"
+    "data = cp.load_data(\"did\")"
    ]
   },
   {
@@ -57,26 +41,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from causalpy.skl_experiments import DifferenceInDifferences\n",
     "from sklearn.linear_model import LinearRegression\n",
     "\n",
-    "# NOTE: `treated` is a deterministic function of `t` and `group`. So add this function into the formula.\n",
     "\n",
-    "result = DifferenceInDifferences(\n",
+    "result = cp.skl_experiments.DifferenceInDifferences(\n",
     "    data,\n",
     "    formula=\"y ~ 1 + group + t + treated:group\",\n",
     "    time_variable_name=\"t\",\n",
     "    prediction_model=LinearRegression(),\n",
     ")"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Examine the results"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 5,

diff --git a/docs/notebooks/rd_pymc.ipynb b/docs/notebooks/rd_pymc.ipynb
diff --git a/docs/notebooks/rd_pymc_drinking.ipynb b/docs/notebooks/rd_pymc_drinking.ipynb
diff --git a/docs/notebooks/rd_skl.ipynb b/docs/notebooks/rd_skl.ipynb
diff --git a/docs/notebooks/rd_skl_drinking.ipynb b/docs/notebooks/rd_skl_drinking.ipynb
@@ -17,9 +17,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd\n",
-    "import pathlib\n",
-    "import arviz as az"
+    "import arviz as az\n",
+    "import causalpy as cp"
    ]
   },
   {
@@ -31,45 +30,29 @@
     "az.style.use(\"arviz-darkgrid\")"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Load data"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
-    "rd_data_path = pathlib.Path.cwd().parents[1] / \"causalpy\" / \"data\" / \"drinking.csv\"\n",
     "df = (\n",
-    "    pd.read_csv(rd_data_path)[[\"agecell\", \"all\", \"mva\", \"suicide\"]]\n",
+    "    cp.load_data(\"drinking\")\n",
     "    .rename(columns={\"agecell\": \"age\"})\n",
     "    .assign(treated=lambda df_: df_.age > 21)\n",
     "    .dropna(axis=0)\n",
     ")"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Linear model"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from causalpy.skl_experiments import RegressionDiscontinuity\n",
     "from sklearn.linear_model import LinearRegression\n",
     "\n",
-    "result = RegressionDiscontinuity(\n",
+    "result = cp.skl_experiments.RegressionDiscontinuity(\n",
     "    df,\n",
     "    formula=\"all ~ 1 + age + treated\",\n",
     "    running_variable_name=\"age\",\n",

diff --git a/docs/notebooks/sc_pymc.ipynb b/docs/notebooks/sc_pymc.ipynb
diff --git a/docs/notebooks/sc_skl.ipynb b/docs/notebooks/sc_skl.ipynb
@@ -13,8 +13,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd\n",
-    "import pathlib\n",
+    "import causalpy as cp\n",
     "import arviz as az"
    ]
   },
@@ -40,10 +39,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sc_data_path = (\n",
-    "    pathlib.Path.cwd().parents[1] / \"causalpy\" / \"data\" / \"synthetic_control.csv\"\n",
-    ")\n",
-    "df = pd.read_csv(sc_data_path)\n",
+    "df = cp.load_data(\"sc\")\n",
     "treatment_time = 70"
    ]
   },
@@ -60,15 +56,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from causalpy.skl_models import WeightedProportion\n",
-    "from causalpy.skl_experiments import SyntheticControl\n",
-    "\n",
     "# Note, we do not want an intercept in this model\n",
-    "result = SyntheticControl(\n",
+    "result = cp.skl_experiments.SyntheticControl(\n",
     "    df,\n",
     "    treatment_time,\n",
     "    formula=\"actual ~ 0 + a + b + c + d + e + f + g\",\n",
-    "    prediction_model=WeightedProportion(),\n",
+    "    prediction_model=cp.skl_models.WeightedProportion(),\n",
     ")"
    ]
   },
@@ -132,11 +125,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from causalpy.skl_experiments import SyntheticControl\n",
     "from sklearn.linear_model import LinearRegression\n",
     "\n",
     "# Note, we do not want an intercept in this model\n",
-    "result = SyntheticControl(\n",
+    "result = cp.skl_experiments.SyntheticControl(\n",
     "    df,\n",
     "    treatment_time,\n",
     "    formula=\"actual ~ 0 + a + b + c + d + e + f + g\",\n",

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-arviz>=0.13.0rc1
+arviz>=0.14.0
 graphviz
 matplotlib>=3.5.3
 numpy