diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 20d0b75..0971a4a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,6 +1,14 @@
 name: CI
-on: [push, pull_request]
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  release:
+    types:
+      - released
+      - prereleased
 
 jobs:
   test:
@@ -33,3 +41,50 @@ jobs:
       run: |
         pip install pytest
         pytest dask_glm
+
+  build:
+    name: Build
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+    steps:
+      - name: Checkout source
+        uses: actions/checkout@v2
+
+      - name: Setup Conda Environment
+        uses: conda-incubator/setup-miniconda@v2.2.0
+        with:
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          use-mamba: true
+          python-version: "3.9"  # this job defines no matrix, so pin a version explicitly
+          activate-environment: dask-glm
+
+      - name: Build Source
+        run: python setup.py sdist
+
+      - name: Build Wheel
+        run: python setup.py bdist_wheel
+
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v3
+        with:
+          name: dist
+          path: dist/dask*
+
+  upload_pypi:
+    needs:
+      - test
+      - build
+    if: "startsWith(github.ref, 'refs/tags/')"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/download-artifact@v3
+        with:
+          name: dist
+          path: dist
+      - uses: pypa/gh-action-pypi-publish@v1.5.0
+        with:
+          user: __token__
+          password: ${{ secrets.pypi_token }}
+
diff --git a/dask_glm/estimators.py b/dask_glm/estimators.py
deleted file mode 100644
index b83f8dd..0000000
--- a/dask_glm/estimators.py
+++ /dev/null
@@ -1,255 +0,0 @@
-"""
-Models following scikit-learn's estimator API.
-"""
-from sklearn.base import BaseEstimator
-
-from . import algorithms, families
-from .utils import (
-    accuracy_score,
-    add_intercept,
-    dot,
-    exp,
-    is_dask_array_sparse,
-    mean_squared_error,
-    poisson_deviance,
-    sigmoid,
-)
-
-
-class _GLM(BaseEstimator):
-    """Base estimator for Generalized Linear Models
-
-    You should not use this class directly; you should use one of its subclasses
-    instead.
-
-    This class should be subclassed and paired with a GLM Family object like
-    Logistic, Linear, Poisson, etc. to form an estimator.
-
-    See Also
-    --------
-    LinearRegression
-    LogisticRegression
-    PoissonRegression
-    """
-
-    @property
-    def family(self):
-        """The family for which this is the estimator"""
-
-    def __init__(
-        self,
-        fit_intercept=True,
-        solver="admm",
-        regularizer="l2",
-        max_iter=100,
-        tol=1e-4,
-        lamduh=1.0,
-        rho=1,
-        over_relax=1,
-        abstol=1e-4,
-        reltol=1e-2,
-    ):
-        self.fit_intercept = fit_intercept
-        self.solver = solver
-        self.regularizer = regularizer
-        self.max_iter = max_iter
-        self.tol = tol
-        self.lamduh = lamduh
-        self.rho = rho
-        self.over_relax = over_relax
-        self.abstol = abstol
-        self.reltol = reltol
-
-        self.coef_ = None
-        self.intercept_ = None
-        self._coef = None  # coef, maybe with intercept
-
-        fit_kwargs = {"max_iter", "tol", "family"}
-
-        if solver == "admm":
-            fit_kwargs.discard("tol")
-            fit_kwargs.update(
-                {"regularizer", "lamduh", "rho", "over_relax", "abstol", "reltol"}
-            )
-        elif solver == "proximal_grad" or solver == "lbfgs":
-            fit_kwargs.update({"regularizer", "lamduh"})
-
-        self._fit_kwargs = {k: getattr(self, k) for k in fit_kwargs}
-
-    def fit(self, X, y=None):
-        X_ = self._maybe_add_intercept(X)
-        fit_kwargs = dict(self._fit_kwargs)
-        if is_dask_array_sparse(X):
-            fit_kwargs["normalize"] = False
-
-        self._coef = algorithms._solvers[self.solver](X_, y, **fit_kwargs)
-
-        if self.fit_intercept:
-            self.coef_ = self._coef[:-1]
-            self.intercept_ = self._coef[-1]
-        else:
-            self.coef_ = self._coef
-        return self
-
-    def _maybe_add_intercept(self, X):
-        if self.fit_intercept:
-            return add_intercept(X)
-        else:
-            return X
-
-
-class LogisticRegression(_GLM):
-    """
-    Estimator for logistic regression.
-
-    Parameters
-    ----------
-    fit_intercept : bool, default True
-        Specifies if a constant (a.k.a. bias or intercept) should be
-        added to the decision function.
-    solver : {'admm', 'gradient_descent', 'newton', 'lbfgs', 'proximal_grad'}
-        Solver to use. See :ref:`api.algorithms` for details
-    regularizer : {'l1', 'l2'}
-        Regularizer to use. See :ref:`api.regularizers` for details.
-        Only used with ``admm``, ``lbfgs``, and ``proximal_grad`` solvers.
-    max_iter : int, default 100
-        Maximum number of iterations taken for the solvers to converge
-    tol : float, default 1e-4
-        Tolerance for stopping criteria. Ignored for ``admm`` solver
-    lamduh : float, default 1.0
-        Only used with ``admm``, ``lbfgs`` and ``proximal_grad`` solvers.
-    rho, over_relax, abstol, reltol : float
-        Only used with the ``admm`` solver.
-
-    Attributes
-    ----------
-    coef_ : array, shape (n_classes, n_features)
-        The learned value for the model's coefficients
-    intercept_ : float or None
-        The learned value for the intercept, if one was added
-        to the model
-
-    Examples
-    --------
-    >>> from dask_glm.datasets import make_classification
-    >>> X, y = make_classification()
-    >>> est = LogisticRegression()
-    >>> est.fit(X, y)
-    >>> est.predict(X)
-    >>> est.predict_proba(X)
-    >>> est.score(X, y)
-    """
-
-    family = families.Logistic
-
-    def predict(self, X):
-        return self.predict_proba(X) > 0.5  # TODO: verify, multiclass broken
-
-    def predict_proba(self, X):
-        X_ = self._maybe_add_intercept(X)
-        return sigmoid(dot(X_, self._coef))
-
-    def score(self, X, y):
-        return accuracy_score(y, self.predict(X))
-
-
-class LinearRegression(_GLM):
-    """
-    Estimator for a linear model using Ordinary Least Squares.
-
-    Parameters
-    ----------
-    fit_intercept : bool, default True
-        Specifies if a constant (a.k.a. bias or intercept) should be
-        added to the decision function.
-    solver : {'admm', 'gradient_descent', 'newton', 'lbfgs', 'proximal_grad'}
-        Solver to use. See :ref:`api.algorithms` for details
-    regularizer : {'l1', 'l2'}
-        Regularizer to use. See :ref:`api.regularizers` for details.
-        Only used with ``admm`` and ``proximal_grad`` solvers.
-    max_iter : int, default 100
-        Maximum number of iterations taken for the solvers to converge
-    tol : float, default 1e-4
-        Tolerance for stopping criteria. Ignored for ``admm`` solver
-    lamduh : float, default 1.0
-        Only used with ``admm`` and ``proximal_grad`` solvers
-    rho, over_relax, abstol, reltol : float
-        Only used with the ``admm`` solver.
-
-    Attributes
-    ----------
-    coef_ : array, shape (n_classes, n_features)
-        The learned value for the model's coefficients
-    intercept_ : float or None
-        The learned value for the intercept, if one was added
-        to the model
-
-    Examples
-    --------
-    >>> from dask_glm.datasets import make_regression
-    >>> X, y = make_regression()
-    >>> est = LinearRegression()
-    >>> est.fit(X, y)
-    >>> est.predict(X)
-    >>> est.score(X, y)
-    """
-
-    family = families.Normal
-
-    def predict(self, X):
-        X_ = self._maybe_add_intercept(X)
-        return dot(X_, self._coef)
-
-    def score(self, X, y):
-        return mean_squared_error(y, self.predict(X))
-
-
-class PoissonRegression(_GLM):
-    """
-    Estimator for Poisson Regression.
-
-    Parameters
-    ----------
-    fit_intercept : bool, default True
-        Specifies if a constant (a.k.a. bias or intercept) should be
-        added to the decision function.
-    solver : {'admm', 'gradient_descent', 'newton', 'lbfgs', 'proximal_grad'}
-        Solver to use. See :ref:`api.algorithms` for details
-    regularizer : {'l1', 'l2'}
-        Regularizer to use. See :ref:`api.regularizers` for details.
-        Only used with ``admm``, ``lbfgs``, and ``proximal_grad`` solvers.
-    max_iter : int, default 100
-        Maximum number of iterations taken for the solvers to converge
-    tol : float, default 1e-4
-        Tolerance for stopping criteria. Ignored for ``admm`` solver
-    lamduh : float, default 1.0
-        Only used with ``admm``, ``lbfgs`` and ``proximal_grad`` solvers.
-    rho, over_relax, abstol, reltol : float
-        Only used with the ``admm`` solver.
-
-    Attributes
-    ----------
-    coef_ : array, shape (n_classes, n_features)
-        The learned value for the model's coefficients
-    intercept_ : float or None
-        The learned value for the intercept, if one was added
-        to the model
-
-    Examples
-    --------
-    >>> from dask_glm.datasets import make_poisson
-    >>> X, y = make_poisson()
-    >>> est = PoissonRegression()
-    >>> est.fit(X, y)
-    >>> est.predict(X)
-    >>> est.get_deviance(X, y)
-    """
-
-    family = families.Poisson
-
-    def predict(self, X):
-        X_ = self._maybe_add_intercept(X)
-        return exp(dot(X_, self._coef))
-
-    def get_deviance(self, X, y):
-        return poisson_deviance(y, self.predict(X))
diff --git a/dask_glm/tests/test_estimators.py b/dask_glm/tests/test_estimators.py
deleted file mode 100644
index 68eb3a2..0000000
--- a/dask_glm/tests/test_estimators.py
+++ /dev/null
@@ -1,155 +0,0 @@
-import dask
-import pytest
-
-from dask_glm.datasets import make_classification, make_poisson, make_regression
-from dask_glm.estimators import LinearRegression, LogisticRegression, PoissonRegression
-from dask_glm.regularizers import Regularizer
-from dask_glm.utils import to_dask_cupy_array_xy
-
-
-@pytest.fixture(params=["admm", "gradient_descent", "newton", "lbfgs", "proximal_grad"])
-def solver(request):
-    """Parametrized fixture for all the solver names"""
-    return request.param
-
-
-@pytest.fixture(params=[r() for r in Regularizer.__subclasses__()])
-def regularizer(request):
-    """Parametrized fixture for all the regularizer names"""
-    return request.param
-
-
-class DoNothingTransformer(object):
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        return X
-
-    def fit_transform(self, X, y=None):
-        return X
-
-    def get_params(self, deep=True):
-        return {}
-
-
-X, y = make_classification()
-
-
-def test_lr_init(solver):
-    LogisticRegression(solver=solver)
-
-
-def test_pr_init(solver):
-    PoissonRegression(solver=solver)
-
-
-def _maybe_skip_sparse_error(fit_intercept, is_sparse, is_cupy):
-    if fit_intercept and is_sparse and not is_cupy:
-        msg = (
-            "ValueError: This operation requires consistent fill-values, "
-            "but argument 1 had a fill value of 1.0, which is different "
-            "from a fill_value of 0.0 in the first argument"
-        )
-        pytest.xfail(f"TODO: {msg}")
-
-
-@pytest.mark.parametrize("fit_intercept", [True, False])
-@pytest.mark.parametrize(
-    "is_sparse,is_cupy", [(True, False), (False, False), (False, True)]
-)
-def test_fit(fit_intercept, is_sparse, is_cupy):
-    _maybe_skip_sparse_error(fit_intercept, is_sparse, is_cupy)
-
-    X, y = make_classification(
-        n_samples=100, n_features=5, chunksize=10, is_sparse=is_sparse
-    )
-
-    if is_cupy and not is_sparse:
-        cupy = pytest.importorskip("cupy")
-        X, y = to_dask_cupy_array_xy(X, y, cupy)
-
-    lr = LogisticRegression(fit_intercept=fit_intercept)
-    lr.fit(X, y)
-    lr.predict(X)
-    lr.predict_proba(X)
-
-
-@pytest.mark.parametrize("fit_intercept", [True, False])
-@pytest.mark.parametrize(
-    "is_sparse,is_cupy", [(True, False), (False, False), (False, True)]
-)
-def test_lm(fit_intercept, is_sparse, is_cupy):
-    _maybe_skip_sparse_error(fit_intercept, is_sparse, is_cupy)
-
-    X, y = make_regression(
-        n_samples=100, n_features=5, chunksize=10, is_sparse=is_sparse
-    )
-    if is_cupy and not is_sparse:
-        cupy = pytest.importorskip("cupy")
-        X, y = to_dask_cupy_array_xy(X, y, cupy)
-    lr = LinearRegression(fit_intercept=fit_intercept)
-    lr.fit(X, y)
-    lr.predict(X)
-    if fit_intercept:
-        assert lr.intercept_ is not None
-
-
-@pytest.mark.parametrize("fit_intercept", [True, False])
-@pytest.mark.parametrize(
"is_sparse,is_cupy", [(True, False), (False, False), (False, True)] -) -def test_big(fit_intercept, is_sparse, is_cupy): - _maybe_skip_sparse_error(fit_intercept, is_sparse, is_cupy) - - with dask.config.set(scheduler="synchronous"): - X, y = make_classification(is_sparse=is_sparse) - if is_cupy and not is_sparse: - cupy = pytest.importorskip("cupy") - X, y = to_dask_cupy_array_xy(X, y, cupy) - lr = LogisticRegression(fit_intercept=fit_intercept) - lr.fit(X, y) - lr.predict(X) - lr.predict_proba(X) - if fit_intercept: - assert lr.intercept_ is not None - - -@pytest.mark.parametrize("fit_intercept", [True, False]) -@pytest.mark.parametrize( - "is_sparse,is_cupy", [(True, False), (False, False), (False, True)] -) -def test_poisson_fit(fit_intercept, is_sparse, is_cupy): - _maybe_skip_sparse_error(fit_intercept, is_sparse, is_cupy) - - with dask.config.set(scheduler="synchronous"): - X, y = make_poisson(is_sparse=is_sparse) - if is_cupy and not is_sparse: - cupy = pytest.importorskip("cupy") - X, y = to_dask_cupy_array_xy(X, y, cupy) - pr = PoissonRegression(fit_intercept=fit_intercept) - pr.fit(X, y) - pr.predict(X) - pr.get_deviance(X, y) - if fit_intercept: - assert pr.intercept_ is not None - - -def test_in_pipeline(): - from sklearn.pipeline import make_pipeline - - X, y = make_classification(n_samples=100, n_features=5, chunksize=10) - pipe = make_pipeline(DoNothingTransformer(), LogisticRegression()) - pipe.fit(X, y) - - -def test_gridsearch(): - from sklearn.pipeline import make_pipeline - - dcv = pytest.importorskip("dask_searchcv") - - X, y = make_classification(n_samples=100, n_features=5, chunksize=10) - grid = {"logisticregression__lamduh": [0.001, 0.01, 0.1, 0.5]} - pipe = make_pipeline(DoNothingTransformer(), LogisticRegression()) - search = dcv.GridSearchCV(pipe, grid, cv=3) - search.fit(X, y) diff --git a/dask_glm/tests/test_utils.py b/dask_glm/tests/test_utils.py index 5c07383..6ce6a0c 100644 --- a/dask_glm/tests/test_utils.py +++ b/dask_glm/tests/test_utils.py @@ -101,15 +101,8 @@ def test_add_intercept_sparse(): assert (result == expected).all() -@pytest.mark.xfail( - reason=( - "TODO: ValueError: This operation requires consistent fill-values, " - "but argument 1 had a fill value of 1.0, which is different from a " - "fill_value of 0.0 in the first argument." - ) -) def test_add_intercept_sparse_dask(): - X = da.from_array(sparse.COO(np.zeros((4, 4))), chunks=(2, 4)) + X = da.from_array(sparse.COO(np.zeros((4, 4)), fill_value=1.0), chunks=(2, 4)) result = utils.add_intercept(X) expected = da.from_array( sparse.COO( diff --git a/docs/api.rst b/docs/api.rst index 4f83654..e012bfa 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -5,14 +5,6 @@ API Reference ------------- -.. _api.estimators: - -Estimators -========== - -.. automodule:: dask_glm.estimators - :members: - .. _api.families: Families diff --git a/docs/estimators.rst b/docs/estimators.rst deleted file mode 100644 index 5d4c011..0000000 --- a/docs/estimators.rst +++ /dev/null @@ -1,37 +0,0 @@ -Estimators -========== - -The :mod:`estimators` module offers a scikit-learn compatible API for -specifying your model and hyper-parameters, and fitting your model to data. - -.. 
-.. code-block:: python
-
-   >>> from dask_glm.estimators import LogisticRegression
-   >>> from dask_glm.datasets import make_classification
-   >>> X, y = make_classification()
-   >>> lr = LogisticRegression()
-   >>> lr.fit(X, y)
-   >>> lr
-   LogisticRegression(abstol=0.0001, fit_intercept=True, lamduh=1.0,
-       max_iter=100, over_relax=1, regularizer='l2', reltol=0.01, rho=1,
-       solver='admm', tol=0.0001)
-
-
-All of the estimators follow a similar API. They can be instantiated with
-a set of parameters that control the fit, including whether to add an intercept,
-which solver to use, how to regularize the inputs, and various optimization
-parameters.
-
-Given an instantiated estimator, you pass the data to the ``.fit`` method.
-It takes an ``X``, the feature matrix or exogenous data, and a ``y``, the
-target or endogenous data. Each of these can be a NumPy or dask array.
-
-With a fit model, you can make new predictions using the ``.predict`` method,
-and can score known observations with the ``.score`` method.
-
-.. code-block:: python
-
-   >>> lr.predict(X).compute()
-   array([False, False, False, True, ... True, False, True, True], dtype=bool)
-
-See the :ref:`api-reference` for more.
diff --git a/docs/examples/basic_api.ipynb b/docs/examples/basic_api.ipynb
index 5064c03..371e042 100644
--- a/docs/examples/basic_api.ipynb
+++ b/docs/examples/basic_api.ipynb
@@ -26,7 +26,8 @@
     "from distributed import Client\n",
     "\n",
     "from dask import persist\n",
-    "from dask_glm.estimators import LogisticRegression"
+    "# dask_glm.LogisticRegression (re)moved to dask_ml\n",
+    "from dask_ml.linear_model.glm import LogisticRegression"
    ]
   },
  {
diff --git a/docs/index.rst b/docs/index.rst
index 912f0b6..4c10299 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -15,7 +15,6 @@ It offers a `scikit-learn`_ compatible API for specifying your model.
    :maxdepth: 2
    :caption: Contents:
 
-   estimators
    examples
    api
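
Migration note (not part of the patch itself): as the notebook edit above indicates, the estimators deleted from dask_glm live on in dask-ml under the same scikit-learn style API. A minimal usage sketch, assuming dask-ml is installed; make_classification still ships with dask_glm, and the import path mirrors the one added in the notebook:

    from dask_glm.datasets import make_classification
    # same class the notebook now imports via dask_ml.linear_model.glm
    from dask_ml.linear_model import LogisticRegression

    # small dask-backed dataset, chunked along the sample axis
    X, y = make_classification(n_samples=100, n_features=5, chunksize=10)

    lr = LogisticRegression()
    lr.fit(X, y)
    preds = lr.predict(X)   # lazy dask array of predictions
    print(preds.compute())  # materialize as a concrete ndarray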