diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 00000000..d8068cb7
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,37 @@
+name: ci
+
+on: [push]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10"]
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Run lint
+        run: |
+          make init
+          make check_lint
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10"]
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Run tests
+        run: |
+          make init
+          make test
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index cc3eccf4..f208051b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,26 +1,31 @@
 # See https://pre-commit.com for more information
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
-- repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.3.0
-  hooks:
-  - id: trailing-whitespace
-    exclude_types: [svg]
-  - id: end-of-file-fixer
-    exclude_types: [svg]
-  - id: check-yaml
-  - id: check-added-large-files
-- repo: https://github.com/asottile/seed-isort-config
-  rev: v2.2.0
-  hooks:
-  - id: seed-isort-config
-- repo: https://github.com/pre-commit/mirrors-isort
-  rev: v5.10.1
-  hooks:
-  - id: isort
-    types: [python]
-- repo: https://github.com/ambv/black
-  rev: 22.10.0
-  hooks:
-  - id: black
-  - id: black-jupyter
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+      - id: trailing-whitespace
+        exclude_types: [svg]
+      - id: end-of-file-fixer
+        exclude_types: [svg]
+      - id: check-yaml
+      - id: check-added-large-files
+  - repo: https://github.com/asottile/seed-isort-config
+    rev: v2.2.0
+    hooks:
+      - id: seed-isort-config
+  - repo: https://github.com/pre-commit/mirrors-isort
+    rev: v5.10.1
+    hooks:
+      - id: isort
+        args: [--profile, black]
+        types: [python]
+  - repo: https://github.com/ambv/black
+    rev: 22.10.0
+    hooks:
+      - id: black
+      - id: black-jupyter
+  - repo: https://github.com/pycqa/flake8
+    rev: 3.9.2
+    hooks:
+      - id: flake8
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..46a36064
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,19 @@
+.PHONY: init lint check_lint test
+
+init:
+	python -m pip install -e .
+
+lint:
+	pip install -r requirements-lint.txt
+	isort .
+	black .
+
+check_lint:
+	pip install -r requirements-lint.txt
+	flake8 .
+	isort --check-only .
+	black --diff --check --fast .
+
+test:
+	pip install -r requirements-test.txt
+	pytest
diff --git a/causalpy/data/datasets.py b/causalpy/data/datasets.py
index 3e69391c..00d13689 100644
--- a/causalpy/data/datasets.py
+++ b/causalpy/data/datasets.py
@@ -1,4 +1,3 @@
-import os
 import pathlib

 import pandas as pd
diff --git a/causalpy/data/simulate_data.py b/causalpy/data/simulate_data.py
index e69a1245..3a40f498 100644
--- a/causalpy/data/simulate_data.py
+++ b/causalpy/data/simulate_data.py
@@ -28,7 +28,9 @@ def generate_synthetic_control_data(
     """
     Example:
     >> import pathlib
-    >> df, weightings_true = generate_synthetic_control_data(treatment_time=treatment_time)
+    >> df, weightings_true = generate_synthetic_control_data(
+           treatment_time=treatment_time
+       )
     >> df.to_csv(pathlib.Path.cwd() / 'synthetic_control.csv', index=False)

     """
@@ -45,7 +47,8 @@
         }
     )

-    # 2. Generate counterfactual, based on weighted sum of non-treated variables. This is the counterfactual with NO treatment.
+    # 2. Generate counterfactual, based on weighted sum of non-treated variables. This
+    # is the counterfactual with NO treatment.
     weightings_true = dirichlet(np.ones(7)).rvs(1)
     df["counterfactual"] = np.dot(df.to_numpy(), weightings_true.T)

@@ -53,7 +56,8 @@
     causal_effect = gamma(10).pdf(np.arange(0, N, 1) - treatment_time)
     df["causal effect"] = causal_effect * -50

-    # 4. Generate the actually observed data, ie the treated with the causal effect applied
+    # 4. Generate the actually observed data, ie the treated with the causal effect
+    # applied
     df["actual"] = df["counterfactual"] + df["causal effect"]

     # 5. apply observation noise to all relevant variables
@@ -94,13 +98,7 @@ def generate_time_series_data(
     return df


-def generate_time_series_data(treatment_time):
-    """
-    Example use:
-    >> import pathlib
-    >> df = generate_time_series_data("2017-01-01").loc[:, ['month', 'year', 't', 'y']]
-    df.to_csv(pathlib.Path.cwd() / 'its.csv')
-    """
+def generate_time_series_data_seasonal(treatment_time):
     dates = pd.date_range(
         start=pd.to_datetime("2010-01-01"), end=pd.to_datetime("2020-01-01"), freq="M"
     )
@@ -126,7 +124,9 @@


 def generate_time_series_data_simple(treatment_time, slope=0.0):
-    """Generate simple interrupted time series data, with no seasonality or temporal structure"""
+    """Generate simple interrupted time series data, with no seasonality or temporal
+    structure.
+ """ dates = pd.date_range( start=pd.to_datetime("2010-01-01"), end=pd.to_datetime("2020-01-01"), freq="M" ) diff --git a/causalpy/pymc_experiments.py b/causalpy/pymc_experiments.py index f8665ee3..675caed3 100644 --- a/causalpy/pymc_experiments.py +++ b/causalpy/pymc_experiments.py @@ -28,11 +28,16 @@ def print_coefficients(self): """Prints the model coefficients""" print("Model coefficients:") coeffs = az.extract(self.prediction_model.idata.posterior, var_names="beta") - # Note: f"{name: <30}" pads the name with spaces so that we have alignment of the stats despite variable names of different lengths + # Note: f"{name: <30}" pads the name with spaces so that we have alignment of + # the stats despite variable names of different lengths for name in self.labels: coeff_samples = coeffs.sel(coeffs=name) print( - f" {name: <30}{coeff_samples.mean().data:.2f}, 94% HDI [{coeff_samples.quantile(0.03).data:.2f}, {coeff_samples.quantile(1-0.03).data:.2f}]" + f""" + {name: <30}{coeff_samples.mean().data:.2f}, + 94% HDI [{coeff_samples.quantile(0.03).data:.2f}, + {coeff_samples.quantile(1-0.03).data:.2f}] + """ ) # add coeff for measurement std coeff_samples = az.extract( @@ -40,7 +45,11 @@ def print_coefficients(self): ) name = "sigma" print( - f" {name: <30}{coeff_samples.mean().data:.2f}, 94% HDI [{coeff_samples.quantile(0.03).data:.2f}, {coeff_samples.quantile(1-0.03).data:.2f}]" + f""" + {name: <30}{coeff_samples.mean().data:.2f}, + 94% HDI [{coeff_samples.quantile(0.03).data:.2f}, + {coeff_samples.quantile(1-0.03).data:.2f}] + """ ) @@ -121,8 +130,12 @@ def plot(self): include_label=False, ) ax[0].plot(self.datapost.index, self.post_y, "k.") + ax[0].set( - title=f"Pre-intervention Bayesian $R^2$: {self.score.r2:.3f} (std = {self.score.r2_std:.3f})" + title=f""" + Pre-intervention Bayesian $R^2$: {self.score.r2:.3f} + (std = {self.score.r2_std:.3f}) + """ ) plot_xY(self.datapre.index, self.pre_impact, ax=ax[1]) @@ -198,7 +211,8 @@ class DifferenceInDifferences(ExperimentalDesign): .. note:: - There is no pre/post intervention data distinction for DiD, we fit all the data available. + There is no pre/post intervention data distinction for DiD, we fit all the + data available. """ @@ -239,16 +253,26 @@ def __init__( assert ( "treated" in self.data.columns ), "Require a boolean column labelling observations which are `treated`" - # Check for `unit` in the incoming dataframe. *This is only used for plotting purposes* + # Check for `unit` in the incoming dataframe. + # *This is only used for plotting purposes* assert ( "unit" in self.data.columns - ), "Require a `unit` column to label unique units. This is used for plotting purposes" - # Check that `group_variable_name` has TWO levels, representing the treated/untreated. But it does not matter what the actual names of the levels are. + ), """ + Require a `unit` column to label unique units. + This is used for plotting purposes + """ + # Check that `group_variable_name` has TWO levels, representing the + # treated/untreated. But it does not matter what the actual names of + # the levels are. assert ( - len(pd.Categorical(self.data[self.group_variable_name]).categories) is 2 - ), f"There must be 2 levels of the grouping variable {self.group_variable_name}. I.e. the treated and untreated." + len(pd.Categorical(self.data[self.group_variable_name]).categories) == 2 + ), f""" + There must be 2 levels of the grouping variable {self.group_variable_name} + .I.e. the treated and untreated. 
+ """ - # TODO: `treated` is a deterministic function of group and time, so this could be a function rather than supplied data + # TODO: `treated` is a deterministic function of group and time, so this could + # be a function rather than supplied data # DEVIATION FROM SKL EXPERIMENT CODE ============================= # fit the model to the observed (pre-intervention) data @@ -348,11 +372,13 @@ def plot(self): showmedians=False, widths=0.2, ) + for pc in parts["bodies"]: pc.set_facecolor("C1") pc.set_edgecolor("None") pc.set_alpha(0.5) - # Plot counterfactual - post-test for treatment group IF no treatment had occurred. + # Plot counterfactual - post-test for treatment group IF no treatment + # had occurred. parts = ax.violinplot( az.extract( self.y_pred_counterfactual, @@ -380,7 +406,8 @@ def plot(self): def _plot_causal_impact_arrow(self, ax): """ - draw a vertical arrow between `y_pred_counterfactual` and `y_pred_counterfactual` + draw a vertical arrow between `y_pred_counterfactual` and + `y_pred_counterfactual` """ # Calculate y values to plot the arrow between y_pred_treatment = ( @@ -438,13 +465,16 @@ class RegressionDiscontinuity(ExperimentalDesign): :param data: A pandas dataframe :param formula: A statistical model formula - :param treatment_threshold: A scalar threshold value at which the treatment is applied + :param treatment_threshold: A scalar threshold value at which the treatment + is applied :param prediction_model: A PyMC model - :param running_variable_name: The name of the predictor variable that the treatment threshold is based upon + :param running_variable_name: The name of the predictor variable that the treatment + threshold is based upon .. note:: - There is no pre/post intervention data distinction for the regression discontinuity design, we fit all the data available. + There is no pre/post intervention data distinction for the regression + discontinuity design, we fit all the data available. """ def __init__( @@ -469,7 +499,8 @@ def __init__( self.y, self.X = np.asarray(y), np.asarray(X) self.outcome_variable_name = y.design_info.column_names[0] - # TODO: `treated` is a deterministic function of x and treatment_threshold, so this could be a function rather than supplied data + # TODO: `treated` is a deterministic function of x and treatment_threshold, so + # this could be a function rather than supplied data # DEVIATION FROM SKL EXPERIMENT CODE ============================= # fit the model to the observed (pre-intervention) data @@ -492,8 +523,10 @@ def __init__( (new_x,) = build_design_matrices([self._x_design_info], self.x_pred) self.pred = self.prediction_model.predict(X=np.asarray(new_x)) - # calculate discontinuity by evaluating the difference in model expectation on either side of the discontinuity - # NOTE: `"treated": np.array([0, 1])`` assumes treatment is applied above (not below) the threshold + # calculate discontinuity by evaluating the difference in model expectation on + # either side of the discontinuity + # NOTE: `"treated": np.array([0, 1])`` assumes treatment is applied above + # (not below) the threshold self.x_discon = pd.DataFrame( { self.running_variable_name: np.array( @@ -514,7 +547,7 @@ def _is_treated(self, x): .. warning:: - Assumes treatment is given to those ABOVE the treatment threshold. + Assumes treatment is given to those ABOVE the treatment threshold. 
""" return np.greater_equal(x, self.treatment_threshold) @@ -536,10 +569,13 @@ def plot(self): ax=ax, ) # create strings to compose title - r2 = f"Bayesian $R^2$ on all data = {self.score.r2:.3f} (std = {self.score.r2_std:.3f})" + title_info = f"{self.score.r2:.3f} (std = {self.score.r2_std:.3f})" + r2 = f"Bayesian $R^2$ on all data = {title_info}" percentiles = self.discontinuity_at_threshold.quantile([0.03, 1 - 0.03]).values ci = r"$CI_{94\%}$" + f"[{percentiles[0]:.2f}, {percentiles[1]:.2f}]" - discon = f"Discontinuity at threshold = {self.discontinuity_at_threshold.mean():.2f}, " + discon = f""" + Discontinuity at threshold = {self.discontinuity_at_threshold.mean():.2f}, + """ ax.set(title=r2 + "\n" + discon + ci) # Intervention line ax.axvline( @@ -559,7 +595,7 @@ def summary(self): print(f"Formula: {self.formula}") print(f"Running variable: {self.running_variable_name}") print(f"Threshold on running variable: {self.treatment_threshold}") - print(f"\nResults:") + print("\nResults:") print( f"Discontinuity at threshold = {self.discontinuity_at_threshold.mean():.2f}" ) diff --git a/causalpy/pymc_models.py b/causalpy/pymc_models.py index 7569bdf8..9b814b53 100644 --- a/causalpy/pymc_models.py +++ b/causalpy/pymc_models.py @@ -21,7 +21,9 @@ def _data_setter(self, X): pm.set_data({"X": X}) def fit(self, X, y, coords): - """Draw samples from posterior, prior predictive, and posterior predictive distributions.""" + """Draw samples from posterior, prior predictive, and posterior predictive + distributions. + """ self.build_model(X, y, coords) with self.model: self.idata = pm.sample() @@ -43,7 +45,8 @@ def score(self, X, y): .. caution:: - The Bayesian :math:`R^2` is not the same as the traditional coefficient of determination, https://en.wikipedia.org/wiki/Coefficient_of_determination. + The Bayesian :math:`R^2` is not the same as the traditional coefficient of + determination, https://en.wikipedia.org/wiki/Coefficient_of_determination. """ yhat = self.predict(X) diff --git a/causalpy/skl_experiments.py b/causalpy/skl_experiments.py index 27457a62..8153a84e 100644 --- a/causalpy/skl_experiments.py +++ b/causalpy/skl_experiments.py @@ -167,7 +167,8 @@ class DifferenceInDifferences(ExperimentalDesign): """ .. note:: - There is no pre/post intervention data distinction for DiD, we fit all the data available. + There is no pre/post intervention data distinction for DiD, we fit all the data + available. """ def __init__( @@ -189,7 +190,8 @@ def __init__( self.y, self.X = np.asarray(y), np.asarray(X) self.outcome_variable_name = y.design_info.column_names[0] - # TODO: `treated` is a deterministic function of group and time, so this should be a function rather than supplied data + # TODO: `treated` is a deterministic function of group and time, so this should + # be a function rather than supplied data # fit the model to all the data self.prediction_model.fit(X=self.X, y=self.y) @@ -253,7 +255,8 @@ def plot(self): markersize=10, label="model fit (treament group)", ) - # Plot counterfactual - post-test for treatment group IF no treatment had occurred. + # Plot counterfactual - post-test for treatment group IF no treatment + # had occurred. ax.plot( self.x_pred_counterfactual[self.time_variable_name], self.y_pred_counterfactual, @@ -296,7 +299,8 @@ class RegressionDiscontinuity(ExperimentalDesign): .. note:: - There is no pre/post intervention data distinction for the regression discontinuity design, we fit all the data available. 
+        There is no pre/post intervention data distinction for the regression
+        discontinuity design, we fit all the data available.

     """
@@ -321,7 +325,8 @@ def __init__(
         self.y, self.X = np.asarray(y), np.asarray(X)
         self.outcome_variable_name = y.design_info.column_names[0]

-        # TODO: `treated` is a deterministic function of x and treatment_threshold, so this could be a function rather than supplied data
+        # TODO: `treated` is a deterministic function of x and treatment_threshold, so
+        # this could be a function rather than supplied data

         # fit the model to all the data
         self.prediction_model.fit(X=self.X, y=self.y)
@@ -341,8 +346,10 @@ def __init__(
         (new_x,) = build_design_matrices([self._x_design_info], self.x_pred)
         self.pred = self.prediction_model.predict(X=np.asarray(new_x))

-        # calculate discontinuity by evaluating the difference in model expectation on either side of the discontinuity
-        # NOTE: `"treated": np.array([0, 1])`` assumes treatment is applied above (not below) the threshold
+        # calculate discontinuity by evaluating the difference in model expectation on
+        # either side of the discontinuity
+        # NOTE: `"treated": np.array([0, 1])`` assumes treatment is applied above
+        # (not below) the threshold
         self.x_discon = pd.DataFrame(
             {
                 self.running_variable_name: np.array(
@@ -358,11 +365,12 @@ def __init__(
         )

     def _is_treated(self, x):
-        """Returns ``True`` if ``x`` is greater than or equal to the treatment threshold.
+        """Returns ``True`` if ``x`` is greater than or equal to the treatment
+        threshold.

         .. warning::

-            Assumes treatment is given to those ABOVE the treatment threshold.
+            Assumes treatment is given to those ABOVE the treatment threshold.
         """
         return np.greater_equal(x, self.treatment_threshold)

@@ -405,7 +413,7 @@ def summary(self):
         print(f"Formula: {self.formula}")
         print(f"Running variable: {self.running_variable_name}")
         print(f"Threshold on running variable: {self.treatment_threshold}")
-        print(f"\nResults:")
+        print("\nResults:")
         print(f"Discontinuity at threshold = {self.discontinuity_at_threshold:.2f}")
         print("Model coefficients:")
         for name, val in zip(self.labels, self.prediction_model.coef_[0]):
diff --git a/causalpy/tests/__init__.py b/causalpy/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/causalpy/tests/test_dummy.py b/causalpy/tests/test_dummy.py
new file mode 100644
index 00000000..3fb8a978
--- /dev/null
+++ b/causalpy/tests/test_dummy.py
@@ -0,0 +1,2 @@
+def test_dummy() -> None:
+    assert True
diff --git a/docs/conf.py b/docs/conf.py
index d6afd2c1..7badb86c 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -12,11 +12,13 @@
 import os
 import sys

+from causalpy.version import __version__
+
 sys.path.insert(0, os.path.abspath("../"))

 # autodoc_mock_imports
 # This avoids autodoc breaking when it can't find packages imported in the code.
-# https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html#confval-autodoc_mock_imports
+# https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html#confval-autodoc_mock_imports  # noqa: E501
 autodoc_mock_imports = [
     "arviz",
     "matplotlib",
@@ -37,7 +39,6 @@
 copyright = "2022, Benjamin T. Vincent"
 author = "Benjamin T. Vincent"

-from causalpy.version import __version__
 release = __version__

@@ -57,10 +58,13 @@
 exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

 # -- nbsphinx config ----------------------------------------------------------
-# Opt out of executing the notebooks remotely. This will save time in the remote build process on readthedocs. The notebooks in /docs/notebooks will be parsed/converted, but not re-executed.
+# Opt out of executing the notebooks remotely. This will save time in the remote build
+# process on readthedocs. The notebooks in /docs/notebooks will be parsed/converted,
+# but not re-executed.
 nbsphinx_execute = "never"

-# MyST options for working with markdown files. Info about extensions here https://myst-parser.readthedocs.io/en/latest/syntax/optional.html?highlight=math#admonition-directives
+# MyST options for working with markdown files.
+# Info about extensions here https://myst-parser.readthedocs.io/en/latest/syntax/optional.html?highlight=math#admonition-directives  # noqa: E501
 myst_enable_extensions = ["dollarmath", "amsmath", "colon_fence", "linkify"]

 # -- Options for HTML output -------------------------------------------------
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..32c18341
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,11 @@
+[tool.isort]
+profile = "black"
+
+[tool.pytest.ini_options]
+addopts = [
+    "-v",
+    "--strict-markers",
+    "--strict-config",
+    "--cov=causalpy",
+]
+testpaths = "causalpy/tests"
diff --git a/requirements-lint.txt b/requirements-lint.txt
new file mode 100644
index 00000000..e3a42d7d
--- /dev/null
+++ b/requirements-lint.txt
@@ -0,0 +1,4 @@
+black>=22.3.0
+flake8>=4.0.1
+isort>=5.10.1
+pre-commit>=2.19.0
diff --git a/requirements-test.txt b/requirements-test.txt
new file mode 100644
index 00000000..9955decc
--- /dev/null
+++ b/requirements-test.txt
@@ -0,0 +1,2 @@
+pytest
+pytest-cov
diff --git a/setup.cfg b/setup.cfg
index 76b73e13..fe7ea8b3 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,3 +1,8 @@
 [metadata]
 description-file=README.md
 license_files=LICENSE
+
+[flake8]
+max-line-length = 88
+extend-ignore = E203
+per-file-ignores = */__init__.py:F401,F403
diff --git a/setup.py b/setup.py
index 59649f35..2b606a9d 100644
--- a/setup.py
+++ b/setup.py
@@ -1,12 +1,12 @@
 import os
-import sys

 from setuptools import find_packages, setup

 PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__))
 README_FILE = os.path.join(PROJECT_ROOT, "README.md")
-VERSION_FILE = os.path.join(PROJECT_ROOT, "bambi", "version.py")
+VERSION_FILE = os.path.join(PROJECT_ROOT, "causalpy", "version.py")
 REQUIREMENTS_FILE = os.path.join(PROJECT_ROOT, "requirements.txt")
+TEST_REQUIREMENTS_FILE = os.path.join(PROJECT_ROOT, "requirements-test.txt")


 def get_long_description():
@@ -14,17 +14,18 @@ def get_long_description():
         return f.read()


-# get version
-# sys.path.insert(0, os.path.abspath("../"))
-# from causalpy.version import __version__
-exec(open("causalpy/version.py").read())
+with open(VERSION_FILE) as f:
+    version = f.read().split("=")[-1].strip().strip('"')

 with open(REQUIREMENTS_FILE) as f:
     install_reqs = f.read().splitlines()

+with open(TEST_REQUIREMENTS_FILE) as f:
+    tests_reqs = f.read().splitlines()
+
 setup(
     name="CausalPy",
-    version=__version__,
+    version=version,
     description="Causal inference for quasi-experiments in Python",
     long_description=get_long_description(),
     long_description_content_type="text/markdown",
@@ -35,5 +36,5 @@ def get_long_description():
     python_requires=">=3.8",
     maintainer="Benjamin T. Vincent",
     install_requires=install_reqs,
-    # tests_require=test_reqs,
+    tests_require=tests_reqs,
 )