Implement the codespell pre-commit hook #403

Closed
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
@@ -25,6 +25,8 @@ jobs:
run: pip install -e .[test]
- name: Run doctests
run: pytest --doctest-modules --ignore=causalpy/tests/ causalpy/
- name: Run extra tests
run: pytest docs/source/.codespell/test_notebook_to_markdown.py
- name: Run tests
run: pytest --cov-report=xml --no-cov-on-fail
- name: Upload coverage to Codecov
38 changes: 37 additions & 1 deletion .pre-commit-config.yaml
@@ -25,7 +25,7 @@ repos:
exclude: &exclude_pattern 'iv_weak_instruments.ipynb'
args: ["--maxkb=1500"]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.2
rev: v0.6.3
hooks:
# Run the linter
- id: ruff
@@ -41,3 +41,39 @@ repos:
# needed to make excludes in pyproject.toml work
# see here https://github.com/econchick/interrogate/issues/60#issuecomment-735436566
pass_filenames: false
- repo: local
**Reviewer:**

Hmmm, design choices! (i.e. no right or wrong here, but would love to double-check ideas)

May I ask: with the setup (convert-notebooks) and teardown (remove-temp-directory-notebooks), codespell appears to behave more like a software test than a linter, so would it be better for this to be implemented in CI/CD instead of in a pre-commit hook?

Not suggesting that we do so, but I just wanted to see whether there's a strong(er) rationale for leaving it in a pre-commit hook rather than running it in a GitHub Action independently. Is the intent for it to be run locally? Also, might there be a more compact way of configuring this?

To be clear, definitely not suggesting that we move away from what's implemented. Just asking these questions to make sure the rationale is strong.

The only ask I'd have here is to document this design choice in the documentation directory. (My criteria for documentation is that if a topic has been asked and the answers are not in the docs already, then it should be documented.)

**Author (westford14):**

this is probably more of a philosophical question for you (@ericmjl ) and @drbenvincent

Initially this was just checking spelling outside of the Jupyter notebooks, which to me definitely feels like a pre-commit check, but now the check also includes the notebook-to-markdown conversion to find spelling mistakes in those notebooks. Maybe that highlights a bit of scope creep for this one PR/issue: purely as a pre-commit check, I think it makes sense to just look at the .py and .md files, and the notebook spelling check is more of a CI check.

So, it's up to you two, but I'm happy to keep plugging away at this PR with the updated doc and rationale changes. Maybe it makes sense to prune this PR back to just the base codespell checks, then cherry-pick the commits into a new PR for an actual CI check for the Jupyter notebooks, since those should be more static and don't need to be checked on every commit -- thoughts?

**Collaborator:**

Hi @westford14. Yeah, I was thinking the same thing - happy for you to do this.
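If the notebook spell-check did move into CI as discussed, one possible shape for the equivalent GitHub Actions step is sketched below. The step name and job context are illustrative, not part of this PR; only the script path, temp directory, and whitelist path are taken from the diff:

```yaml
# Hypothetical CI step: convert notebooks to markdown, spell-check the
# output, then clean up the temporary directory.
- name: Spell-check notebooks
  run: |
    pip install nbconvert nbformat codespell
    python docs/source/.codespell/notebook_to_markdown.py --tempdir tmp_markdown
    codespell tmp_markdown --ignore-words=./docs/source/.codespell/codespell-whitelist.txt
    rm -rf tmp_markdown
```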

hooks:
- id: convert-notebooks
name: Convert Notebooks to Markdown
entry: python ./docs/source/.codespell/notebook_to_markdown.py
language: python
pass_filenames: false
always_run: false
additional_dependencies: ["nbconvert", "nbformat"]
args: ["--tempdir", "tmp_markdown"]
- repo: https://github.com/codespell-project/codespell
rev: v2.3.0
hooks:
- id: codespell
args: [
**ericmjl (Sep 10, 2024):**

Noticing that codespell can be configured via pyproject.toml. I think it's worth standardizing on pyproject.toml as the place for configuration. Establishing the pattern will be good for the long-term health of the package. @drbenvincent what are your thoughts here?

I'm mostly thinking of the -S flags btw, just to see if we can compact down the args list.

**Collaborator:**

Agree - pyproject.toml should do as much of the project config work as possible.

**Author (westford14):**

Yeah, that's an easy fix.

"-S",
"*.csv",
"-S",
"pyproject.toml",
"-S",
"*.svg",
"-S",
"*.ipynb",
"--ignore-words=./docs/source/.codespell/codespell-whitelist.txt",
]
additional_dependencies:
# Support pyproject.toml configuration
- tomli
- repo: local
hooks:
- id: remove-temp-directory-notebooks
name: Remove temporary directory for codespell
entry: bash -c 'rm -rf tmp_markdown && exit 0'
language: system
always_run: true
pass_filenames: false
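Following up on the pyproject.toml discussion above, a possible `[tool.codespell]` equivalent of the `-S` args is sketched here. This is an untested sketch: codespell reads pyproject.toml when tomli is available (which the hook's `additional_dependencies` already provides), and the paths are taken from the diff:

```toml
# Sketch: moving the codespell excludes out of .pre-commit-config.yaml.
[tool.codespell]
skip = "*.csv,pyproject.toml,*.svg,*.ipynb"
ignore-words = "./docs/source/.codespell/codespell-whitelist.txt"
```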
2 changes: 1 addition & 1 deletion README.md
@@ -89,7 +89,7 @@ This is appropriate when you have multiple units, one of which is treated. You b
> The data (treated and untreated units), pre-treatment model fit, and counterfactual (i.e. the synthetic control) are plotted (top). The causal impact is shown as a blue shaded region. The Bayesian analysis shows shaded Bayesian credible regions of the model fit and counterfactual. Also shown is the causal impact (middle) and cumulative causal impact (bottom).

### Geographical lift (Geolift)
We can also use synthetic control methods to analyse data from geographical lift studies. For example, we can try to evaluate the causal impact of an intervention (e.g. a marketing campaign) run in one geographical area by using control geographical areas which are similar to the intervention area but which did not recieve the specific marketing intervention.
We can also use synthetic control methods to analyse data from geographical lift studies. For example, we can try to evaluate the causal impact of an intervention (e.g. a marketing campaign) run in one geographical area by using control geographical areas which are similar to the intervention area but which did not receive the specific marketing intervention.

### ANCOVA

4 changes: 2 additions & 2 deletions causalpy/data/simulate_data.py
@@ -291,7 +291,7 @@ def generate_ancova_data(
N=200, pre_treatment_means=np.array([10, 12]), treatment_effect=2, sigma=1
):
"""
Generate ANCOVA eample data
Generate ANCOVA example data

Example
--------
@@ -440,7 +440,7 @@ def generate_seasonality(n=12, amplitude=1, length_scale=0.5):


def periodic_kernel(x1, x2, period=1, length_scale=1, amplitude=1):
"""Generate a periodic kernal for gaussian process"""
"""Generate a periodic kernel for gaussian process"""
return amplitude**2 * np.exp(
-2 * np.sin(np.pi * np.abs(x1 - x2) / period) ** 2 / length_scale**2
)
2 changes: 1 addition & 1 deletion causalpy/experiments/instrumental_variable.py
@@ -44,7 +44,7 @@ class InstrumentalVariable(BaseExperiment):
:param model: A PyMC model
:param priors: An optional dictionary of priors for the
mus and sigmas of both regressions. If priors are not
specified we will substitue MLE estimates for the beta
specified we will substitute MLE estimates for the beta
coefficients. Greater control can be achieved
by specifying the priors directly e.g. priors = {
"mus": [0, 0],
2 changes: 1 addition & 1 deletion causalpy/experiments/inverse_propensity_weighting.py
@@ -195,7 +195,7 @@ def make_doubly_robust_adjustment(self, ps):
m1 = sk_lin_reg().fit(X[t == 1].astype(float), self.y[t == 1])
m0_pred = m0.predict(X)
m1_pred = m1.predict(X)
## Compromise between outcome and treatement assignment model
## Compromise between outcome and treatment assignment model
weighted_outcome0 = (1 - t) * (self.y - m0_pred) / (1 - X["ps"]) + m0_pred
weighted_outcome1 = t * (self.y - m1_pred) / X["ps"] + m1_pred
return weighted_outcome0, weighted_outcome1, None, None
4 changes: 2 additions & 2 deletions causalpy/experiments/prepostfit.py
@@ -311,7 +311,7 @@ class InterruptedTimeSeries(PrePostFit):
:param data:
A pandas dataframe
:param treatment_time:
The time when treatment occured, should be in reference to the data index
The time when treatment occurred, should be in reference to the data index
:param formula:
A statistical model formula
:param model:
@@ -352,7 +352,7 @@ class SyntheticControl(PrePostFit):
:param data:
A pandas dataframe
:param treatment_time:
The time when treatment occured, should be in reference to the data index
The time when treatment occurred, should be in reference to the data index
:param formula:
A statistical model formula
:param model:
2 changes: 1 addition & 1 deletion causalpy/plot_utils.py
@@ -73,7 +73,7 @@ def plot_xY(
ax=ax,
**plot_hdi_kwargs,
)
# Return handle to patch. We get a list of the childen of the axis. Filter for just
# Return handle to patch. We get a list of the children of the axis. Filter for just
# the PolyCollection objects. Take the last one.
h_patch = list(
filter(lambda x: isinstance(x, PolyCollection), ax_hdi.get_children())
2 changes: 1 addition & 1 deletion causalpy/pymc_models.py
@@ -27,7 +27,7 @@


class PyMCModel(pm.Model):
"""A wraper class for PyMC models. This provides a scikit-learn like interface with
"""A wrapper class for PyMC models. This provides a scikit-learn like interface with
methods like `fit`, `predict`, and `score`. It also provides other methods which are
useful for causal inference.

2 changes: 1 addition & 1 deletion causalpy/tests/test_pymc_models.py
@@ -142,7 +142,7 @@ def test_idata_property():
@pytest.mark.parametrize("seed", seeds)
def test_result_reproducibility(seed):
"""Test that we can reproduce the results from the model. We could in theory test
this with all the model and experiment types, but what is being targetted is
this with all the model and experiment types, but what is being targeted is
the PyMCModel.fit method, so we should be safe testing with just one model. Here
we use the DifferenceInDifferences experiment class."""
# Load the data
4 changes: 4 additions & 0 deletions docs/source/.codespell/codespell-whitelist.txt
@@ -0,0 +1,4 @@
nD
CACE
compliers
complier
71 changes: 71 additions & 0 deletions docs/source/.codespell/notebook_to_markdown.py
@@ -0,0 +1,71 @@
# Copyright 2024 The PyMC Labs Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This is a simple script that converts the jupyter notebooks into markdown
for easier (and cleaner) parsing for the codespell check. Whitelisted words
are maintained within this directory in the `codespell-whitelist.txt`. For
more information on this pre-commit hook please visit the github homepage
for the project: https://github.com/codespell-project/codespell.
"""

import argparse
import os
from glob import glob

import nbformat
from nbconvert import MarkdownExporter


def notebook_to_markdown(pattern: str, output_dir: str) -> None:
"""
Utility to convert jupyter notebook to markdown files.

:param pattern:
str that is a glob appropriate pattern to search
:param output_dir:
str directory to save the markdown files to
"""
for f_name in glob(pattern, recursive=True):
with open(f_name, "r", encoding="utf-8") as f:
nb = nbformat.read(f, as_version=4)

markdown_exporter = MarkdownExporter()
(body, _) = markdown_exporter.from_notebook_node(nb)

os.makedirs(output_dir, exist_ok=True)

output_file = os.path.join(
output_dir, os.path.splitext(os.path.basename(f_name))[0] + ".md"
)

with open(output_file, "w", encoding="utf-8") as f:
f.write(body)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"-p",
"--pattern",
help="the glob appropriate pattern to search for jupyter notebooks",
default="docs/**/*.ipynb",
)
parser.add_argument(
"-t",
"--tempdir",
help="temporary directory to save the converted notebooks",
default="tmp_markdown",
)
args = parser.parse_args()
notebook_to_markdown(args.pattern, args.tempdir)
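A side note on the default `--pattern` above: it relies on Python's recursive glob, where `**` with `recursive=True` matches zero or more directories, so both top-level and nested notebooks are picked up. A minimal stdlib-only illustration (the file names are hypothetical):

```python
import os
from glob import glob
from tempfile import TemporaryDirectory

with TemporaryDirectory() as root:
    # Hypothetical tree: one notebook directly under docs/, one nested deeper.
    os.makedirs(os.path.join(root, "docs", "source", "notebooks"))
    for rel in ("docs/top.ipynb", "docs/source/notebooks/nested.ipynb"):
        open(os.path.join(root, *rel.split("/")), "w").close()

    # "**" matches zero or more directories when recursive=True,
    # so the pattern finds both notebooks.
    matches = glob(os.path.join(root, "docs", "**", "*.ipynb"), recursive=True)
    print(len(matches))  # 2
```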
31 changes: 31 additions & 0 deletions docs/source/.codespell/test_data/test_notebook.ipynb
@@ -0,0 +1,31 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"{os.__file__}\")\n",
"\n",
"# Speling mistake."
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
43 changes: 43 additions & 0 deletions docs/source/.codespell/test_notebook_to_markdown.py
@@ -0,0 +1,43 @@
# Copyright 2024 The PyMC Labs Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Notebook to markdown tests."""

import os
from tempfile import TemporaryDirectory

import pytest
from notebook_to_markdown import notebook_to_markdown


@pytest.fixture
def data_dir() -> str:
"""Get current directory."""
return os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_data")


def test_notebook_to_markdown_empty_pattern(data_dir: str) -> None:
"""Test basic functionality of notebook_to_markdown with empty pattern."""
with TemporaryDirectory() as tmp_dir:
pattern = "*.missing"
notebook_to_markdown(f"{data_dir}/{pattern}", tmp_dir)
assert len(os.listdir(tmp_dir)) == 0


def test_notebook_to_markdown(data_dir: str) -> None:
"""Test basic functionality of notebook_to_markdown with a correct pattern."""
with TemporaryDirectory() as tmp_dir:
pattern = "*.ipynb"
notebook_to_markdown(f"{data_dir}/{pattern}", tmp_dir)
assert len(os.listdir(tmp_dir)) == 1
assert "test_notebook.md" in os.listdir(tmp_dir)
2 changes: 1 addition & 1 deletion docs/source/index.md
@@ -98,7 +98,7 @@ This is appropriate when you have multiple units, one of which is treated. You b
![Synthetic Control](./_static/synthetic_control_pymc.svg)

### Geographical Lift / Geolift
We can also use synthetic control methods to analyse data from geographical lift studies. For example, we can try to evaluate the causal impact of an intervention (e.g. a marketing campaign) run in one geographical area by using control geographical areas which are similar to the intervention area but which did not recieve the specific marketing intervention.
We can also use synthetic control methods to analyse data from geographical lift studies. For example, we can try to evaluate the causal impact of an intervention (e.g. a marketing campaign) run in one geographical area by using control geographical areas which are similar to the intervention area but which did not receive the specific marketing intervention.

### ANCOVA

10 changes: 5 additions & 5 deletions docs/source/knowledgebase/glossary.rst
@@ -9,11 +9,11 @@ Glossary

Average treatment effect
ATE
The average treatement effect across all units.
The average treatment effect across all units.

Average treatment effect on the treated
ATT
The average effect of the treatment on the units that recieved it. Also called Treatment on the treated.
The average effect of the treatment on the units that received it. Also called Treatment on the treated.

Change score analysis
A statistical procedure where the outcome variable is the difference between the posttest and pretest scores.
@@ -48,7 +48,7 @@ Glossary

Local Average Treatment effect
LATE
Also known asthe complier average causal effect (CACE), is the effect of a treatment for subjects who comply with the experimental treatment assigned to their sample group. It is the quantity we're estimating in IV designs.
Also known as the complier average causal effect (CACE), is the effect of a treatment for subjects who comply with the experimental treatment assigned to their sample group. It is the quantity we're estimating in IV designs.

Non-equivalent group designs
NEGD
@@ -76,7 +76,7 @@ Glossary
Where units are assigned to conditions at random.

Randomized experiment
An emprical comparison used to estimate the effects of treatments where units are assigned to treatment conditions randomly.
An empirical comparison used to estimate the effects of treatments where units are assigned to treatment conditions randomly.

Regression discontinuity design
RDD
@@ -96,7 +96,7 @@ Glossary

Treatment on the treated effect
TOT
The average effect of the treatment on the units that recieved it. Also called the average treatment effect on the treated (ATT).
The average effect of the treatment on the units that received it. Also called the average treatment effect on the treated (ATT).

Treatment effect
The difference in outcomes between what happened after a treatment is implemented and what would have happened (see Counterfactual) if the treatment had not been implemented, assuming everything else had been the same.
2 changes: 1 addition & 1 deletion docs/source/knowledgebase/quasi_dags.ipynb
@@ -104,7 +104,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This leads us to Randomized Controlled Trials (RCTs) which are considered the gold standard for estimating causal effects. One reason for this is that we (as experimenters) intervene in the system by assigning units to treatment by {term}`random assignment`. Because of this intervention, any causal influence of the confounders upon the treatment $\\mathbf{X} \\rightarrow Z$ is broken - treamtent is now soley determined by the randomisation process, $R \\rightarrow T$. The following causal DAG illustrates the structure of an RCT."
"This leads us to Randomized Controlled Trials (RCTs) which are considered the gold standard for estimating causal effects. One reason for this is that we (as experimenters) intervene in the system by assigning units to treatment by {term}`random assignment`. Because of this intervention, any causal influence of the confounders upon the treatment $\\mathbf{X} \\rightarrow Z$ is broken - treatment is now solely determined by the randomisation process, $R \\rightarrow T$. The following causal DAG illustrates the structure of an RCT."
]
},
{
2 changes: 1 addition & 1 deletion docs/source/notebooks/ancova_pymc.ipynb
@@ -222,7 +222,7 @@
"## Run the analysis\n",
"\n",
":::{note}\n",
"The `random_seed` keyword argument for the PyMC sampler is not neccessary. We use it here so that the results are reproducible.\n",
"The `random_seed` keyword argument for the PyMC sampler is not necessary. We use it here so that the results are reproducible.\n",
":::"
]
},
2 changes: 1 addition & 1 deletion docs/source/notebooks/did_pymc.ipynb
@@ -148,7 +148,7 @@
"## Run the analysis\n",
"\n",
":::{note}\n",
"The `random_seed` keyword argument for the PyMC sampler is not neccessary. We use it here so that the results are reproducible.\n",
"The `random_seed` keyword argument for the PyMC sampler is not necessary. We use it here so that the results are reproducible.\n",
":::"
]
},
4 changes: 2 additions & 2 deletions docs/source/notebooks/did_pymc_banks.ipynb
@@ -329,7 +329,7 @@
"* $\\mu_i$ is the expected value of the outcome (number of banks in business) for the $i^{th}$ observation.\n",
"* $\\beta_0$ is an intercept term to capture the baseline number of banks in business of the control group, in the pre-intervention period.\n",
"* `district` is a dummy variable, so $\\beta_{d}$ will represent a main effect of district, that is any offset of the treatment group relative to the control group.\n",
"* `post_treatment` is also a dummy variable which captures any shift in the outcome after the treatment time, regardless of the recieving treatment or not.\n",
"* `post_treatment` is also a dummy variable which captures any shift in the outcome after the treatment time, regardless of the receiving treatment or not.\n",
"* the interaction of the two dummy variables `district:post_treatment` will only take on values of 1 for the treatment group after the intervention. Therefore $\\beta_{\\Delta}$ will represent our estimated causal effect."
]
},
@@ -515,7 +515,7 @@
"source": [
"## Analysis 2 - DiD with multiple pre/post observations\n",
"\n",
"Now we'll do a difference in differences analysis of the full dataset. This approach has similarities to {term}`CITS` (Comparative Interrupted Time-Series) with a single control over time. Although slightly abitrary, we distinguish between the two techniques on whether there is enough time series data for CITS to capture the time series patterns."
"Now we'll do a difference in differences analysis of the full dataset. This approach has similarities to {term}`CITS` (Comparative Interrupted Time-Series) with a single control over time. Although slightly arbitrary, we distinguish between the two techniques on whether there is enough time series data for CITS to capture the time series patterns."
]
},
{