pymc-labs · drbenvincent · Nov 25, 2022 · Nov 21, 2022 · Nov 21, 2022 · Nov 21, 2022
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,37 @@
+name: ci
+
+on: [push]  # push events only. NOTE(review): pull requests opened from forks do not start this workflow - consider adding pull_request
+
+jobs:
+  lint:  # style checks, run once per Python version listed in the matrix
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10"]  # quoted so that 3.10 is not read as the number 3.1
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Run lint  # installs the package, then runs the Makefile's check_lint target
+        run: |
+          make init
+          make check_lint
+  test:  # test suite, run once per Python version listed in the matrix
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10"]  # same versions as the lint job
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Run tests  # installs the package, then runs the Makefile's test target
+        run: |
+          make init
+          make test
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,26 +1,31 @@
 # See https://pre-commit.com for more information
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
--   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.3.0
-    hooks:
-    -   id: trailing-whitespace
-        exclude_types: [svg]
-    -   id: end-of-file-fixer
-        exclude_types: [svg]
-    -   id: check-yaml
-    -   id: check-added-large-files
--   repo: https://github.com/asottile/seed-isort-config
-    rev: v2.2.0
-    hooks:
-    - id: seed-isort-config
--   repo: https://github.com/pre-commit/mirrors-isort
-    rev: v5.10.1
-    hooks:
-    - id: isort
-      types: [python]
--   repo: https://github.com/ambv/black
-    rev: 22.10.0
-    hooks:
-    - id: black
-    - id: black-jupyter
+    - repo: https://github.com/pre-commit/pre-commit-hooks  # generic file hygiene hooks
+      rev: v4.3.0
+      hooks:
+          - id: trailing-whitespace
+            exclude_types: [svg]
+          - id: end-of-file-fixer
+            exclude_types: [svg]
+          - id: check-yaml
+          - id: check-added-large-files
+    - repo: https://github.com/asottile/seed-isort-config  # fills in isort's known_third_party setting
+      rev: v2.2.0
+      hooks:
+          - id: seed-isort-config
+    - repo: https://github.com/pre-commit/mirrors-isort
+      rev: v5.10.1
+      hooks:
+          - id: isort
+            args: [--profile, black]  # keep isort's import layout compatible with black
+            types: [python]
+    - repo: https://github.com/ambv/black  # NOTE(review): black now lives at github.com/psf/black; this URL relies on a redirect
+      rev: 22.10.0
+      hooks:
+          - id: black
+          - id: black-jupyter
+    - repo: https://github.com/pycqa/flake8  # same linter that `make check_lint` runs
+      rev: 3.9.2
+      hooks:
+          - id: flake8
diff --git a/Makefile b/Makefile
@@ -0,0 +1,19 @@
+.PHONY: init lint check_lint test
+
+init:  # install the package itself in editable mode
+	python -m pip install -e .
+
+lint:  # rewrite files in place with isort and black
+	python -m pip install -r requirements-lint.txt
+	isort .
+	black .
+
+check_lint:  # report lint problems without modifying any file
+	python -m pip install -r requirements-lint.txt
+	flake8 .
+	isort --check-only .
+	black --diff --check --fast .
+
+test:  # install the test requirements, then run pytest
+	python -m pip install -r requirements-test.txt
+	pytest
diff --git a/causalpy/data/datasets.py b/causalpy/data/datasets.py
@@ -1,4 +1,3 @@
-import os
 import pathlib
 
 import pandas as pd

diff --git a/causalpy/data/simulate_data.py b/causalpy/data/simulate_data.py
@@ -28,7 +28,9 @@ def generate_synthetic_control_data(
     """
     Example:
     >> import pathlib
-    >> df, weightings_true = generate_synthetic_control_data(treatment_time=treatment_time)
+    >> df, weightings_true = generate_synthetic_control_data(
+                                treatment_time=treatment_time
+                            )
     >> df.to_csv(pathlib.Path.cwd() / 'synthetic_control.csv', index=False)
     """
 
@@ -45,15 +47,17 @@ def generate_synthetic_control_data(
         }
     )
 
-    # 2. Generate counterfactual, based on weighted sum of non-treated variables. This is the counterfactual with NO treatment.
+    # 2. Generate counterfactual, based on weighted sum of non-treated variables. This
+    # is the counterfactual with NO treatment.
     weightings_true = dirichlet(np.ones(7)).rvs(1)
     df["counterfactual"] = np.dot(df.to_numpy(), weightings_true.T)
 
     # 3. Generate the causal effect
     causal_effect = gamma(10).pdf(np.arange(0, N, 1) - treatment_time)
     df["causal effect"] = causal_effect * -50
 
-    # 4. Generate the actually observed data, ie the treated with the causal effect applied
+    # 4. Generate the actually observed data, i.e. the treated with the causal effect
+    # applied
     df["actual"] = df["counterfactual"] + df["causal effect"]
 
     # 5. apply observation noise to all relevant variables
@@ -94,13 +98,7 @@ def generate_time_series_data(
     return df
 
 
-def generate_time_series_data(treatment_time):
-    """
-    Example use:
-    >> import pathlib
-    >> df = generate_time_series_data("2017-01-01").loc[:, ['month', 'year', 't', 'y']]
-    df.to_csv(pathlib.Path.cwd() / 'its.csv')
-    """
+def generate_time_series_data_seasonal(treatment_time):
     dates = pd.date_range(
         start=pd.to_datetime("2010-01-01"), end=pd.to_datetime("2020-01-01"), freq="M"
     )
@@ -126,7 +124,9 @@ def generate_time_series_data(treatment_time):
 
 
 def generate_time_series_data_simple(treatment_time, slope=0.0):
-    """Generate simple interrupted time series data, with no seasonality or temporal structure"""
+    """Generate simple interrupted time series data, with no seasonality or temporal
+    structure.
+    """
     dates = pd.date_range(
         start=pd.to_datetime("2010-01-01"), end=pd.to_datetime("2020-01-01"), freq="M"
     )

diff --git a/causalpy/pymc_experiments.py b/causalpy/pymc_experiments.py
@@ -28,19 +28,28 @@ def print_coefficients(self):
         """Prints the model coefficients"""
         print("Model coefficients:")
         coeffs = az.extract(self.prediction_model.idata.posterior, var_names="beta")
-        # Note: f"{name: <30}" pads the name with spaces so that we have alignment of the stats despite variable names of different lengths
+        # Note: f"{name: <30}" pads the name with spaces so that we have alignment of
+        # the stats despite variable names of different lengths
         for name in self.labels:
             coeff_samples = coeffs.sel(coeffs=name)
             print(
-                f"  {name: <30}{coeff_samples.mean().data:.2f}, 94% HDI [{coeff_samples.quantile(0.03).data:.2f}, {coeff_samples.quantile(1-0.03).data:.2f}]"
+                # Adjacent literals are concatenated, so this prints ONE line
+                # (a triple-quoted f-string would embed newlines and indentation)
+                f"  {name: <30}{coeff_samples.mean().data:.2f}, 94% HDI "
+                f"[{coeff_samples.quantile(0.03).data:.2f}, "
+                f"{coeff_samples.quantile(1-0.03).data:.2f}]"
             )
         # add coeff for measurement std
         coeff_samples = az.extract(
             self.prediction_model.idata.posterior, var_names="sigma"
         )
         name = "sigma"
         print(
-            f"  {name: <30}{coeff_samples.mean().data:.2f}, 94% HDI [{coeff_samples.quantile(0.03).data:.2f}, {coeff_samples.quantile(1-0.03).data:.2f}]"
+            # Same layout as the rows above: adjacent literals are joined into a
+            # single line, keeping the sigma row aligned with the coefficients
+            f"  {name: <30}{coeff_samples.mean().data:.2f}, 94% HDI "
+            f"[{coeff_samples.quantile(0.03).data:.2f}, "
+            f"{coeff_samples.quantile(1-0.03).data:.2f}]"
         )
 
 
@@ -121,8 +130,12 @@ def plot(self):
             include_label=False,
         )
         ax[0].plot(self.datapost.index, self.post_y, "k.")
+
         ax[0].set(
-            title=f"Pre-intervention Bayesian $R^2$: {self.score.r2:.3f} (std = {self.score.r2_std:.3f})"
+            title=(
+                f"Pre-intervention Bayesian $R^2$: {self.score.r2:.3f} "
+                f"(std = {self.score.r2_std:.3f})"
+            )
         )
 
         plot_xY(self.datapre.index, self.pre_impact, ax=ax[1])
@@ -198,7 +211,8 @@ class DifferenceInDifferences(ExperimentalDesign):
 
     .. note::
 
-       There is no pre/post intervention data distinction for DiD, we fit all the data available.
+        There is no pre/post intervention data distinction for DiD, we fit all the
+        data available.
 
     """
 
@@ -239,16 +253,26 @@ def __init__(
         assert (
             "treated" in self.data.columns
         ), "Require a boolean column labelling observations which are `treated`"
-        # Check for `unit` in the incoming dataframe. *This is only used for plotting purposes*
+        # Check for `unit` in the incoming dataframe.
+        # *This is only used for plotting purposes*
         assert (
             "unit" in self.data.columns
-        ), "Require a `unit` column to label unique units. This is used for plotting purposes"
-        # Check that `group_variable_name` has TWO levels, representing the treated/untreated. But it does not matter what the actual names of the levels are.
+        ), """
+        Require a `unit` column to label unique units.
+        This is used for plotting purposes
+        """
+        # Check that `group_variable_name` has TWO levels, representing the
+        # treated/untreated. But it does not matter what the actual names of
+        # the levels are.
         assert (
-            len(pd.Categorical(self.data[self.group_variable_name]).categories) is 2
-        ), f"There must be 2 levels of the grouping variable {self.group_variable_name}. I.e. the treated and untreated."
+            len(pd.Categorical(self.data[self.group_variable_name]).categories) == 2
+        ), f"""
+            There must be 2 levels of the grouping variable
+            {self.group_variable_name}. I.e. the treated and untreated.
+        """
 
-        # TODO: `treated` is a deterministic function of group and time, so this could be a function rather than supplied data
+        # TODO: `treated` is a deterministic function of group and time, so this could
+        # be a function rather than supplied data
 
         # DEVIATION FROM SKL EXPERIMENT CODE =============================
         # fit the model to the observed (pre-intervention) data
@@ -348,11 +372,13 @@ def plot(self):
             showmedians=False,
             widths=0.2,
         )
+
         for pc in parts["bodies"]:
             pc.set_facecolor("C1")
             pc.set_edgecolor("None")
             pc.set_alpha(0.5)
-        # Plot counterfactual - post-test for treatment group IF no treatment had occurred.
+        # Plot counterfactual - post-test for treatment group IF no treatment
+        # had occurred.
         parts = ax.violinplot(
             az.extract(
                 self.y_pred_counterfactual,
@@ -380,7 +406,8 @@ def plot(self):
 
     def _plot_causal_impact_arrow(self, ax):
         """
-        draw a vertical arrow between `y_pred_counterfactual` and `y_pred_counterfactual`
+        draw a vertical arrow between `y_pred_treatment` and
+        `y_pred_counterfactual`
         """
         # Calculate y values to plot the arrow between
         y_pred_treatment = (
@@ -438,13 +465,16 @@ class RegressionDiscontinuity(ExperimentalDesign):
 
     :param data: A pandas dataframe
     :param formula: A statistical model formula
-    :param treatment_threshold: A scalar threshold value at which the treatment is applied
+    :param treatment_threshold: A scalar threshold value at which the treatment
+                                is applied
     :param prediction_model: A PyMC model
-    :param running_variable_name: The name of the predictor variable that the treatment threshold is based upon
+    :param running_variable_name: The name of the predictor variable that the treatment
+                                  threshold is based upon
 
     .. note::
 
-       There is no pre/post intervention data distinction for the regression discontinuity design, we fit all the data available.
+        There is no pre/post intervention data distinction for the regression
+        discontinuity design, we fit all the data available.
     """
 
     def __init__(
@@ -469,7 +499,8 @@ def __init__(
         self.y, self.X = np.asarray(y), np.asarray(X)
         self.outcome_variable_name = y.design_info.column_names[0]
 
-        # TODO: `treated` is a deterministic function of x and treatment_threshold, so this could be a function rather than supplied data
+        # TODO: `treated` is a deterministic function of x and treatment_threshold, so
+        # this could be a function rather than supplied data
 
         # DEVIATION FROM SKL EXPERIMENT CODE =============================
         # fit the model to the observed (pre-intervention) data
@@ -492,8 +523,10 @@ def __init__(
         (new_x,) = build_design_matrices([self._x_design_info], self.x_pred)
         self.pred = self.prediction_model.predict(X=np.asarray(new_x))
 
-        # calculate discontinuity by evaluating the difference in model expectation on either side of the discontinuity
-        # NOTE: `"treated": np.array([0, 1])`` assumes treatment is applied above (not below) the threshold
+        # calculate discontinuity by evaluating the difference in model expectation on
+        # either side of the discontinuity
+        # NOTE: `"treated": np.array([0, 1])` assumes treatment is applied above
+        # (not below) the threshold
         self.x_discon = pd.DataFrame(
             {
                 self.running_variable_name: np.array(
@@ -514,7 +547,7 @@ def _is_treated(self, x):
 
         .. warning::
 
-           Assumes treatment is given to those ABOVE the treatment threshold.
+            Assumes treatment is given to those ABOVE the treatment threshold.
         """
         return np.greater_equal(x, self.treatment_threshold)
 
@@ -536,10 +569,13 @@ def plot(self):
             ax=ax,
         )
         # create strings to compose title
-        r2 = f"Bayesian $R^2$ on all data = {self.score.r2:.3f} (std = {self.score.r2_std:.3f})"
+        title_info = f"{self.score.r2:.3f} (std = {self.score.r2_std:.3f})"
+        r2 = f"Bayesian $R^2$ on all data = {title_info}"
         percentiles = self.discontinuity_at_threshold.quantile([0.03, 1 - 0.03]).values
         ci = r"$CI_{94\%}$" + f"[{percentiles[0]:.2f}, {percentiles[1]:.2f}]"
-        discon = f"Discontinuity at threshold = {self.discontinuity_at_threshold.mean():.2f}, "
+        # single-line literal: `discon` and `ci` must share one line of the title
+        discon_mean = self.discontinuity_at_threshold.mean()
+        discon = f"Discontinuity at threshold = {discon_mean:.2f}, "
         ax.set(title=r2 + "\n" + discon + ci)
         # Intervention line
         ax.axvline(
@@ -559,7 +595,7 @@ def summary(self):
         print(f"Formula: {self.formula}")
         print(f"Running variable: {self.running_variable_name}")
         print(f"Threshold on running variable: {self.treatment_threshold}")
-        print(f"\nResults:")
+        print("\nResults:")
         print(
             f"Discontinuity at threshold = {self.discontinuity_at_threshold.mean():.2f}"
         )

diff --git a/causalpy/pymc_models.py b/causalpy/pymc_models.py
@@ -21,7 +21,9 @@ def _data_setter(self, X):
             pm.set_data({"X": X})
 
     def fit(self, X, y, coords):
-        """Draw samples from posterior, prior predictive, and posterior predictive distributions."""
+        """Draw samples from posterior, prior predictive, and posterior predictive
+        distributions.
+        """
         self.build_model(X, y, coords)
         with self.model:
             self.idata = pm.sample()
@@ -43,7 +45,8 @@ def score(self, X, y):
 
         .. caution::
 
-           The Bayesian :math:`R^2` is not the same as the traditional coefficient of determination, https://en.wikipedia.org/wiki/Coefficient_of_determination.
+            The Bayesian :math:`R^2` is not the same as the traditional coefficient of
+            determination, https://en.wikipedia.org/wiki/Coefficient_of_determination.
 
         """
         yhat = self.predict(X)