Merge pull request #12 from edahelsinki/select_dists

Select variables when using plot_dist
edahelsinki · May 11, 2024 · c3e251b · c3e251b
2 parents 2b4014e + 41e2629
commit c3e251b
Show file tree

Hide file tree

Showing 16 changed files with 248 additions and 170 deletions.
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -15,16 +15,15 @@ jobs:
       id-token: write
 
     steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python
-        uses: actions/setup-python@v3
-        with:
-          python-version: "3.x"
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          python -m pip install build
-      - name: Build package
-        run: python -m build
-      - name: Publish package
-        uses: pypa/[email protected]
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip build
+    - name: Build package
+      run: python -m build
+    - name: Publish package to PyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/python-pytest.yml b/.github/workflows/python-pytest.yml
@@ -12,24 +12,49 @@ on:
   workflow_dispatch:
 
 jobs:
-  build:
+  test:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
-
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
+          cache: "pip"
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip
-          python -m pip install pytest build
-          python -m pip install .
-      - name: Build package
-        run: python -m build
+          python -m pip install --upgrade pip pytest pytest-cov
+          python -m pip install -e .
       - name: Test with pytest
-        run: pytest
+        run: |
+          pytest -k test_optim
+          NUMBA_DISABLE_JIT=1 pytest --cov-report term --cov=slise/ --cov-fail-under=9
+
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+      - run: python -m pip install --upgrade pip build
+      - name: Build package
+        run: |
+          python -m build
+          python -c "import os, glob; assert os.path.getsize(sorted(glob.glob('dist/slise-*.whl'))[-1]) > 10_000"
+
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+      - run: python -m pip install --upgrade pip ruff
+      - name: Lint with Ruff
+        run: |
+          ruff check --output-format=github
+          ruff format --check
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "slise"
-version = "2.2.3"
+version = "2.2.4"
 authors = [{ name = "Anton Björklund", email = "[email protected]" }]
 description = "The SLISE algorithm for robust regression and explanations of black box models"
 readme = "README.md"
@@ -28,7 +28,7 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-dev = ["pytest", "black[jupyter]", "pylint", "IPython"]
+dev = ["pytest", "pytest-cov", "black[jupyter]", "pylint", "IPython", "ruff"]
 tbb = ["tbb"]
 
 [project.urls]

diff --git a/slise/__init__.py b/slise/__init__.py
@@ -1,49 +1,53 @@
 """
-    SLISE - Sparse Linear Subset Explanations
-    -----------------------------------------
-
-    The SLISE algorithm can be used for both robust regression and to explain outcomes from black box models.
-    See [slise.slise.regression][] and [slise.slise.explain][] for referense.
-
-
-    In robust regression we fit regression models that can handle data that
-    contains outliers. SLISE accomplishes this by fitting a model such that
-    the largest possible subset of the data items have an error less than a
-    given value. All items with an error larger than that are considered
-    potential outliers and do not affect the resulting model.
-
-    SLISE can also be used to provide local model-agnostic explanations for
-    outcomes from black box models. To do this we replace the ground truth
-    response vector with the predictions from the complex model. Furthermore, we
-    force the model to fit a selected item (making the explanation local). This
-    gives us a local approximation of the complex model with a simpler linear
-    model. In contrast to other methods SLISE creates explanations using real
-    data (not some discretised and randomly sampled data) so we can be sure that
-    all inputs are valid (i.e. in the correct data manifold, and follows the
-    constraints used to generate the data, e.g., the laws of physics).
-
-
-    More in-depth details about the algorithm can be found in the papers:
-
-    Björklund A., Henelius A., Oikarinen E., Kallonen K., Puolamäki K.
-    Sparse Robust Regression for Explaining Classifiers.
-    Discovery Science (DS 2019).
-    Lecture Notes in Computer Science, vol 11828, Springer.
-    https://doi.org/10.1007/978-3-030-33778-0_27
-
-    Björklund A., Henelius A., Oikarinen E., Kallonen K., Puolamäki K.
-    Robust regression via error tolerance.
-    Data Mining and Knowledge Discovery (2022).
-    https://doi.org/10.1007/s10618-022-00819-2
-
+SLISE - Sparse Linear Subset Explanations
+-----------------------------------------
+
+The SLISE algorithm can be used for both robust regression and to explain outcomes from black box models.
+See [slise.slise.regression][] and [slise.slise.explain][] for reference.
+
+
+In robust regression we fit regression models that can handle data that
+contains outliers. SLISE accomplishes this by fitting a model such that
+the largest possible subset of the data items have an error less than a
+given value. All items with an error larger than that are considered
+potential outliers and do not affect the resulting model.
+
+SLISE can also be used to provide local model-agnostic explanations for
+outcomes from black box models. To do this we replace the ground truth
+response vector with the predictions from the complex model. Furthermore, we
+force the model to fit a selected item (making the explanation local). This
+gives us a local approximation of the complex model with a simpler linear
+model. In contrast to other methods SLISE creates explanations using real
+data (not some discretised and randomly sampled data) so we can be sure that
+all inputs are valid (i.e. in the correct data manifold, and follow the
+constraints used to generate the data, e.g., the laws of physics).
+
+
+More in-depth details about the algorithm can be found in the papers:
+
+Björklund A., Henelius A., Oikarinen E., Kallonen K., Puolamäki K.
+Sparse Robust Regression for Explaining Classifiers.
+Discovery Science (DS 2019).
+Lecture Notes in Computer Science, vol 11828, Springer.
+https://doi.org/10.1007/978-3-030-33778-0_27
+
+Björklund A., Henelius A., Oikarinen E., Kallonen K., Puolamäki K.
+Robust regression via error tolerance.
+Data Mining and Knowledge Discovery (2022).
+https://doi.org/10.1007/s10618-022-00819-2
+
+Björklund A., Henelius A., Oikarinen E., Kallonen K., Puolamäki K.
+Explaining any black box model using real data.
+Frontiers in Computer Science 5:1143904 (2023).
+https://doi.org/10.3389/fcomp.2023.1143904
 """
 
-from slise.slise import (
+from slise.slise import (  # noqa: F401
     SliseRegression,
     regression,
     SliseExplainer,
     explain,
     SliseWarning,
 )
-from slise.utils import limited_logit as logit
-from slise.data import normalise_robust
+from slise.utils import limited_logit as logit  # noqa: F401
+from slise.data import normalise_robust  # noqa: F401
diff --git a/slise/data.py b/slise/data.py
@@ -1,5 +1,5 @@
 """
-    This script contains functions for modifying data, mainly normalisation and PCA.
+This script contains functions for modifying data, mainly normalisation and PCA.
 """
 
 from typing import NamedTuple, Tuple, Union, Optional

diff --git a/slise/initialisation.py b/slise/initialisation.py
@@ -1,5 +1,5 @@
 """
-    This script contains functions for initialising alpha and beta in SLISE.
+This script contains functions for initialising alpha and beta in SLISE.
 """
 
 from math import log
@@ -122,9 +122,7 @@ def initialise_zeros(
     """
     epsilon = epsilon**2
     beta_max = min(beta_max, beta_max_init) / epsilon
-    beta = next_beta(
-        Y**2, epsilon, 0, weight, beta_max, log(max_approx), min_beta_step
-    )
+    beta = next_beta(Y**2, epsilon, 0, weight, beta_max, log(max_approx), min_beta_step)
     return np.zeros(X.shape[1]), beta
 
 

diff --git a/slise/optimisation.py b/slise/optimisation.py
@@ -1,5 +1,5 @@
 """
-    This script contains the loss functions and optimisation functions for SLISE.
+This script contains the loss functions and optimisation functions for SLISE.
 """
 
 from math import log
@@ -502,11 +502,11 @@ def regularised_regression(
     lambda2 = float(lambda2)
     assert X.shape[0] == len(Y), f"Different lengths {X.shape[0]} != {len(Y)}"
     if weight is None:
-        lf = lambda alpha: _ridge_numba(alpha, X, Y, lambda2)
+        lf = lambda alpha: _ridge_numba(alpha, X, Y, lambda2)  # noqa: E731
     else:
         weight = np.ascontiguousarray(weight, dtype=np.float64)
         assert Y.shape == weight.shape, f"Different shapes {Y.shape} != {weight.shape}"
-        lf = lambda alpha: _ridge_numbaw(alpha, X, Y, lambda2, weight)
+        lf = lambda alpha: _ridge_numbaw(alpha, X, Y, lambda2, weight)  # noqa: E731
     return owlqn(lf, np.zeros(X.shape[1], dtype=np.float64), lambda1, max_iterations)
 
 
@@ -547,11 +547,11 @@ def optimise_loss(
     epsilon = float(epsilon)
     beta = float(beta)
     if weight is None:
-        lf = lambda alpha: _loss_grad(alpha, X, Y, epsilon, beta, lambda2)
+        lf = lambda alpha: _loss_grad(alpha, X, Y, epsilon, beta, lambda2)  # noqa: E731
     else:
         weight = np.ascontiguousarray(weight, dtype=np.float64)
         assert Y.shape == weight.shape, f"Different shapes {Y.shape} != {weight.shape}"
-        lf = lambda alpha: _loss_gradw(alpha, X, Y, epsilon, beta, lambda2, weight)
+        lf = lambda alpha: _loss_gradw(alpha, X, Y, epsilon, beta, lambda2, weight)  # noqa: E731
     return owlqn(lf, alpha, lambda1, max_iterations)
 
 
@@ -576,8 +576,8 @@ def log_approximation_ratio(
     """
     if beta1 >= beta2:
         return 0
-    log_f = lambda r, beta: log_sigmoid(beta * (epsilon2 - r))
-    dlog_g = lambda r: -beta1 * dlog_sigmoid(
+    log_f = lambda r, beta: log_sigmoid(beta * (epsilon2 - r))  # noqa: E731
+    dlog_g = lambda r: -beta1 * dlog_sigmoid(  # noqa: E731
         beta1 * (epsilon2 - r)
     ) + beta2 * dlog_sigmoid(beta2 * (epsilon2 - r))
     if dlog_g(0) < 0:
@@ -628,7 +628,7 @@ def next_beta(
     if log_approx <= log_max_approx:
         return beta_max
     else:
-        f = (
+        f = (  # noqa: E731
             lambda b: log_approximation_ratio(residuals2, epsilon2, beta, b, weight)
             - log_max_approx
         )
@@ -681,9 +681,7 @@ def _debug_log(
     """
     residuals = (X @ alpha - Y) ** 2
     loss = loss_sharp(alpha, X, Y, epsilon, lambda1, lambda2, weight)
-    bloss = loss_residuals(
-        alpha, residuals, epsilon**2, beta, lambda1, lambda2, weight
-    )
+    bloss = loss_residuals(alpha, residuals, epsilon**2, beta, lambda1, lambda2, weight)
     epss = matching_epsilon(residuals, epsilon**2, beta, weight)
     beta = beta * epsilon**2
     print(