diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 20d0b75..0971a4a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,6 +1,14 @@
 name: CI
-on: [push, pull_request]
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  release:
+    types:
+      - released
+      - prereleased
 
 jobs:
   test:
@@ -33,3 +41,50 @@ jobs:
       run: |
         pip install pytest
         pytest dask_glm
+
+  build:
+    name: Build
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+    steps:
+      - name: Checkout source
+        uses: actions/checkout@v2
+
+      - name: Setup Conda Environment
+        uses: conda-incubator/setup-miniconda@v2.2.0
+        with:
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          use-mamba: true
+          python-version: "3.9"  # this job defines no matrix, so pin a version explicitly
+          activate-environment: dask-glm
+
+      - name: Build Source
+        run: python setup.py sdist
+
+      - name: Build Wheel
+        run: python setup.py bdist_wheel
+
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v3
+        with:
+          name: dist
+          path: dist/dask*
+
+  upload_pypi:
+    needs:
+      - test
+      - build
+    if: "startsWith(github.ref, 'refs/tags/')"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/download-artifact@v3
+        with:
+          name: dist
+          path: dist
+      - uses: pypa/gh-action-pypi-publish@v1.5.0
+        with:
+          user: __token__
+          password: ${{ secrets.pypi_token }}
+
diff --git a/dask_glm/estimators.py b/dask_glm/estimators.py
deleted file mode 100644
index b83f8dd..0000000
--- a/dask_glm/estimators.py
+++ /dev/null
@@ -1,255 +0,0 @@
-"""
-Models following scikit-learn's estimator API.
-"""
-from sklearn.base import BaseEstimator
-
-from . import algorithms, families
-from .utils import (
-    accuracy_score,
-    add_intercept,
-    dot,
-    exp,
-    is_dask_array_sparse,
-    mean_squared_error,
-    poisson_deviance,
-    sigmoid,
-)
-
-
-class _GLM(BaseEstimator):
-    """Base estimator for Generalized Linear Models
-
-    You should not use this class directly; you should use one of its subclasses
-    instead.
-
-    This class should be subclassed and paired with a GLM Family object like
-    Logistic, Linear, Poisson, etc. to form an estimator.
-
-    See Also
-    --------
-    LinearRegression
-    LogisticRegression
-    PoissonRegression
-    """
-
-    @property
-    def family(self):
-        """The family for which this is the estimator"""
-
-    def __init__(
-        self,
-        fit_intercept=True,
-        solver="admm",
-        regularizer="l2",
-        max_iter=100,
-        tol=1e-4,
-        lamduh=1.0,
-        rho=1,
-        over_relax=1,
-        abstol=1e-4,
-        reltol=1e-2,
-    ):
-        self.fit_intercept = fit_intercept
-        self.solver = solver
-        self.regularizer = regularizer
-        self.max_iter = max_iter
-        self.tol = tol
-        self.lamduh = lamduh
-        self.rho = rho
-        self.over_relax = over_relax
-        self.abstol = abstol
-        self.reltol = reltol
-
-        self.coef_ = None
-        self.intercept_ = None
-        self._coef = None  # coef, maybe with intercept
-
-        fit_kwargs = {"max_iter", "tol", "family"}
-
-        if solver == "admm":
-            fit_kwargs.discard("tol")
-            fit_kwargs.update(
-                {"regularizer", "lamduh", "rho", "over_relax", "abstol", "reltol"}
-            )
-        elif solver == "proximal_grad" or solver == "lbfgs":
-            fit_kwargs.update({"regularizer", "lamduh"})
-
-        self._fit_kwargs = {k: getattr(self, k) for k in fit_kwargs}
-
-    def fit(self, X, y=None):
-        X_ = self._maybe_add_intercept(X)
-        fit_kwargs = dict(self._fit_kwargs)
-        if is_dask_array_sparse(X):
-            fit_kwargs["normalize"] = False
-
-        self._coef = algorithms._solvers[self.solver](X_, y, **fit_kwargs)
-
-        if self.fit_intercept:
-            self.coef_ = self._coef[:-1]
-            self.intercept_ = self._coef[-1]
-        else:
-            self.coef_ = self._coef
-        return self
-
-    def _maybe_add_intercept(self, X):
-        if self.fit_intercept:
-            return add_intercept(X)
-        else:
-            return X
-
-
-class LogisticRegression(_GLM):
-    """
-    Estimator for logistic regression.
-
-    Parameters
-    ----------
-    fit_intercept : bool, default True
-        Specifies if a constant (a.k.a. bias or intercept) should be
-        added to the decision function.
-    solver : {'admm', 'gradient_descent', 'newton', 'lbfgs', 'proximal_grad'}
-        Solver to use. See :ref:`api.algorithms` for details
-    regularizer : {'l1', 'l2'}
-        Regularizer to use. See :ref:`api.regularizers` for details.
-        Only used with ``admm``, ``lbfgs``, and ``proximal_grad`` solvers.
-    max_iter : int, default 100
-        Maximum number of iterations taken for the solvers to converge
-    tol : float, default 1e-4
-        Tolerance for stopping criteria. Ignored for ``admm`` solver
-    lamduh : float, default 1.0
-        Only used with ``admm``, ``lbfgs`` and ``proximal_grad`` solvers.
-    rho, over_relax, abstol, reltol : float
-        Only used with the ``admm`` solver.
-
-    Attributes
-    ----------
-    coef_ : array, shape (n_classes, n_features)
-        The learned value for the model's coefficients
-    intercept_ : float or None
-        The learned value for the intercept, if one was added
-        to the model
-
-    Examples
-    --------
-    >>> from dask_glm.datasets import make_classification
-    >>> X, y = make_classification()
-    >>> est = LogisticRegression()
-    >>> est.fit(X, y)
-    >>> est.predict(X)
-    >>> est.predict_proba(X)
-    >>> est.score(X, y)
-    """
-
-    family = families.Logistic
-
-    def predict(self, X):
-        return self.predict_proba(X) > 0.5  # TODO: verify, multiclass broken
-
-    def predict_proba(self, X):
-        X_ = self._maybe_add_intercept(X)
-        return sigmoid(dot(X_, self._coef))
-
-    def score(self, X, y):
-        return accuracy_score(y, self.predict(X))
-
-
-class LinearRegression(_GLM):
-    """
-    Estimator for a linear model using Ordinary Least Squares.
-
-    Parameters
-    ----------
-    fit_intercept : bool, default True
-        Specifies if a constant (a.k.a. bias or intercept) should be
-        added to the decision function.
-    solver : {'admm', 'gradient_descent', 'newton', 'lbfgs', 'proximal_grad'}
-        Solver to use. See :ref:`api.algorithms` for details
-    regularizer : {'l1', 'l2'}
-        Regularizer to use. See :ref:`api.regularizers` for details.
-        Only used with ``admm`` and ``proximal_grad`` solvers.
-    max_iter : int, default 100
-        Maximum number of iterations taken for the solvers to converge
-    tol : float, default 1e-4
-        Tolerance for stopping criteria. Ignored for ``admm`` solver
-    lamduh : float, default 1.0
-        Only used with ``admm`` and ``proximal_grad`` solvers
-    rho, over_relax, abstol, reltol : float
-        Only used with the ``admm`` solver.
-
-    Attributes
-    ----------
-    coef_ : array, shape (n_classes, n_features)
-        The learned value for the model's coefficients
-    intercept_ : float or None
-        The learned value for the intercept, if one was added
-        to the model
-
-    Examples
-    --------
-    >>> from dask_glm.datasets import make_regression
-    >>> X, y = make_regression()
-    >>> est = LinearRegression()
-    >>> est.fit(X, y)
-    >>> est.predict(X)
-    >>> est.score(X, y)
-    """
-
-    family = families.Normal
-
-    def predict(self, X):
-        X_ = self._maybe_add_intercept(X)
-        return dot(X_, self._coef)
-
-    def score(self, X, y):
-        return mean_squared_error(y, self.predict(X))
-
-
-class PoissonRegression(_GLM):
-    """
-    Estimator for Poisson Regression.
-
-    Parameters
-    ----------
-    fit_intercept : bool, default True
-        Specifies if a constant (a.k.a. bias or intercept) should be
-        added to the decision function.
-    solver : {'admm', 'gradient_descent', 'newton', 'lbfgs', 'proximal_grad'}
-        Solver to use. See :ref:`api.algorithms` for details
-    regularizer : {'l1', 'l2'}
-        Regularizer to use. See :ref:`api.regularizers` for details.
-        Only used with ``admm``, ``lbfgs``, and ``proximal_grad`` solvers.
-    max_iter : int, default 100
-        Maximum number of iterations taken for the solvers to converge
-    tol : float, default 1e-4
-        Tolerance for stopping criteria. Ignored for ``admm`` solver
-    lamduh : float, default 1.0
-        Only used with ``admm``, ``lbfgs`` and ``proximal_grad`` solvers.
-    rho, over_relax, abstol, reltol : float
-        Only used with the ``admm`` solver.
-
-    Attributes
-    ----------
-    coef_ : array, shape (n_classes, n_features)
-        The learned value for the model's coefficients
-    intercept_ : float or None
-        The learned value for the intercept, if one was added
-        to the model
-
-    Examples
-    --------
-    >>> from dask_glm.datasets import make_poisson
-    >>> X, y = make_poisson()
-    >>> est = PoissonRegression()
-    >>> est.fit(X, y)
-    >>> est.predict(X)
-    >>> est.get_deviance(X, y)
-    """
-
-    family = families.Poisson
-
-    def predict(self, X):
-        X_ = self._maybe_add_intercept(X)
-        return exp(dot(X_, self._coef))
-
-    def get_deviance(self, X, y):
-        return poisson_deviance(y, self.predict(X))
diff --git a/dask_glm/tests/test_estimators.py b/dask_glm/tests/test_estimators.py
deleted file mode 100644
index 68eb3a2..0000000
--- a/dask_glm/tests/test_estimators.py
+++ /dev/null
@@ -1,155 +0,0 @@
-import dask
-import pytest
-
-from dask_glm.datasets import make_classification, make_poisson, make_regression
-from dask_glm.estimators import LinearRegression, LogisticRegression, PoissonRegression
-from dask_glm.regularizers import Regularizer
-from dask_glm.utils import to_dask_cupy_array_xy
-
-
-@pytest.fixture(params=["admm", "gradient_descent", "newton", "lbfgs", "proximal_grad"])
-def solver(request):
-    """Parametrized fixture for all the solver names"""
-    return request.param
-
-
-@pytest.fixture(params=[r() for r in Regularizer.__subclasses__()])
-def regularizer(request):
-    """Parametrized fixture for all the regularizer names"""
-    return request.param
-
-
-class DoNothingTransformer(object):
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        return X
-
-    def fit_transform(self, X, y=None):
-        return X
-
-    def get_params(self, deep=True):
-        return {}
-
-
-X, y = make_classification()
-
-
-def test_lr_init(solver):
-    LogisticRegression(solver=solver)
-
-
-def test_pr_init(solver):
-    PoissonRegression(solver=solver)
-
-
-def _maybe_skip_sparse_error(fit_intercept, is_sparse, is_cupy):
-    if fit_intercept and is_sparse and not is_cupy:
-        msg = (
-            "ValueError: This operation requires consistent fill-values, "
-            "but argument 1 had a fill value of 1.0, which is different "
-            "from a fill_value of 0.0 in the first argument"
-        )
-        pytest.xfail(f"TODO: {msg}")
-
-
-@pytest.mark.parametrize("fit_intercept", [True, False])
-@pytest.mark.parametrize(
-    "is_sparse,is_cupy", [(True, False), (False, False), (False, True)]
-)
-def test_fit(fit_intercept, is_sparse, is_cupy):
-    _maybe_skip_sparse_error(fit_intercept, is_sparse, is_cupy)
-
-    X, y = make_classification(
-        n_samples=100, n_features=5, chunksize=10, is_sparse=is_sparse
-    )
-
-    if is_cupy and not is_sparse:
-        cupy = pytest.importorskip("cupy")
-        X, y = to_dask_cupy_array_xy(X, y, cupy)
-
-    lr = LogisticRegression(fit_intercept=fit_intercept)
-    lr.fit(X, y)
-    lr.predict(X)
-    lr.predict_proba(X)
-
-
-@pytest.mark.parametrize("fit_intercept", [True, False])
-@pytest.mark.parametrize(
-    "is_sparse,is_cupy", [(True, False), (False, False), (False, True)]
-)
-def test_lm(fit_intercept, is_sparse, is_cupy):
-    _maybe_skip_sparse_error(fit_intercept, is_sparse, is_cupy)
-
-    X, y = make_regression(
-        n_samples=100, n_features=5, chunksize=10, is_sparse=is_sparse
-    )
-    if is_cupy and not is_sparse:
-        cupy = pytest.importorskip("cupy")
-        X, y = to_dask_cupy_array_xy(X, y, cupy)
-    lr = LinearRegression(fit_intercept=fit_intercept)
-    lr.fit(X, y)
-    lr.predict(X)
-    if fit_intercept:
-        assert lr.intercept_ is not None
-
-
-@pytest.mark.parametrize("fit_intercept", [True, False])
-@pytest.mark.parametrize(
"is_sparse,is_cupy", [(True, False), (False, False), (False, True)] -) -def test_big(fit_intercept, is_sparse, is_cupy): - _maybe_skip_sparse_error(fit_intercept, is_sparse, is_cupy) - - with dask.config.set(scheduler="synchronous"): - X, y = make_classification(is_sparse=is_sparse) - if is_cupy and not is_sparse: - cupy = pytest.importorskip("cupy") - X, y = to_dask_cupy_array_xy(X, y, cupy) - lr = LogisticRegression(fit_intercept=fit_intercept) - lr.fit(X, y) - lr.predict(X) - lr.predict_proba(X) - if fit_intercept: - assert lr.intercept_ is not None - - -@pytest.mark.parametrize("fit_intercept", [True, False]) -@pytest.mark.parametrize( - "is_sparse,is_cupy", [(True, False), (False, False), (False, True)] -) -def test_poisson_fit(fit_intercept, is_sparse, is_cupy): - _maybe_skip_sparse_error(fit_intercept, is_sparse, is_cupy) - - with dask.config.set(scheduler="synchronous"): - X, y = make_poisson(is_sparse=is_sparse) - if is_cupy and not is_sparse: - cupy = pytest.importorskip("cupy") - X, y = to_dask_cupy_array_xy(X, y, cupy) - pr = PoissonRegression(fit_intercept=fit_intercept) - pr.fit(X, y) - pr.predict(X) - pr.get_deviance(X, y) - if fit_intercept: - assert pr.intercept_ is not None - - -def test_in_pipeline(): - from sklearn.pipeline import make_pipeline - - X, y = make_classification(n_samples=100, n_features=5, chunksize=10) - pipe = make_pipeline(DoNothingTransformer(), LogisticRegression()) - pipe.fit(X, y) - - -def test_gridsearch(): - from sklearn.pipeline import make_pipeline - - dcv = pytest.importorskip("dask_searchcv") - - X, y = make_classification(n_samples=100, n_features=5, chunksize=10) - grid = {"logisticregression__lamduh": [0.001, 0.01, 0.1, 0.5]} - pipe = make_pipeline(DoNothingTransformer(), LogisticRegression()) - search = dcv.GridSearchCV(pipe, grid, cv=3) - search.fit(X, y) diff --git a/dask_glm/tests/test_utils.py b/dask_glm/tests/test_utils.py index 5c07383..6ce6a0c 100644 --- a/dask_glm/tests/test_utils.py +++ b/dask_glm/tests/test_utils.py @@ -101,15 +101,8 @@ def test_add_intercept_sparse(): assert (result == expected).all() -@pytest.mark.xfail( - reason=( - "TODO: ValueError: This operation requires consistent fill-values, " - "but argument 1 had a fill value of 1.0, which is different from a " - "fill_value of 0.0 in the first argument." - ) -) def test_add_intercept_sparse_dask(): - X = da.from_array(sparse.COO(np.zeros((4, 4))), chunks=(2, 4)) + X = da.from_array(sparse.COO(np.zeros((4, 4)), fill_value=1.0), chunks=(2, 4)) result = utils.add_intercept(X) expected = da.from_array( sparse.COO( diff --git a/docs/api.rst b/docs/api.rst index 4f83654..e012bfa 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -5,14 +5,6 @@ API Reference ------------- -.. _api.estimators: - -Estimators -========== - -.. automodule:: dask_glm.estimators - :members: - .. _api.families: Families diff --git a/docs/estimators.rst b/docs/estimators.rst deleted file mode 100644 index 5d4c011..0000000 --- a/docs/estimators.rst +++ /dev/null @@ -1,37 +0,0 @@ -Estimators -========== - -The :mod:`estimators` module offers a scikit-learn compatible API for -specifying your model and hyper-parameters, and fitting your model to data. - -.. 
-.. code-block:: python
-
-   >>> from dask_glm.estimators import LogisticRegression
-   >>> from dask_glm.datasets import make_classification
-   >>> X, y = make_classification()
-   >>> lr = LogisticRegression()
-   >>> lr.fit(X, y)
-   >>> lr
-   LogisticRegression(abstol=0.0001, fit_intercept=True, lamduh=1.0,
-       max_iter=100, over_relax=1, regularizer='l2', reltol=0.01, rho=1,
-       solver='admm', tol=0.0001)
-
-
-All of the estimators follow a similar API. They can be instantiated with
-a set of parameters that control the fit, including whether to add an intercept,
-which solver to use, how to regularize the inputs, and various optimization
-parameters.
-
-Given an instantiated estimator, you pass the data to the ``.fit`` method.
-It takes an ``X``, the feature matrix or exogenous data, and a ``y``, the
-target or endogenous data. Each of these can be a NumPy or dask array.
-
-With a fit model, you can make new predictions using the ``.predict`` method,
-and can score known observations with the ``.score`` method.
-
-.. code-block:: python
-
-   >>> lr.predict(X).compute()
-   array([False, False, False, True, ... True, False, True, True], dtype=bool)
-
-See the :ref:`api-reference` for more.
diff --git a/docs/examples/basic_api.ipynb b/docs/examples/basic_api.ipynb
index 5064c03..371e042 100644
--- a/docs/examples/basic_api.ipynb
+++ b/docs/examples/basic_api.ipynb
@@ -26,7 +26,8 @@
     "from distributed import Client\n",
     "\n",
     "from dask import persist\n",
-    "from dask_glm.estimators import LogisticRegression"
+    "# dask_glm.LogisticRegression (re)moved to dask_ml\n",
+    "from dask_ml.linear_model.glm import LogisticRegression"
    ]
   },
  {
diff --git a/docs/index.rst b/docs/index.rst
index 912f0b6..4c10299 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -15,7 +15,6 @@ It offers a `scikit-learn`_ compatible API for specifying your model.
    :maxdepth: 2
    :caption: Contents:
 
-   estimators
    examples
    api
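
Migration note (not part of the patch itself): as the notebook edit above indicates, the estimators deleted from dask_glm live on in dask-ml under the same scikit-learn style API. A minimal usage sketch, assuming dask-ml is installed; make_classification still ships with dask_glm, and the import path mirrors the one added in the notebook:

    from dask_glm.datasets import make_classification
    # same class the notebook now imports via dask_ml.linear_model.glm
    from dask_ml.linear_model import LogisticRegression

    # small dask-backed dataset, chunked along the sample axis
    X, y = make_classification(n_samples=100, n_features=5, chunksize=10)

    lr = LogisticRegression()
    lr.fit(X, y)
    preds = lr.predict(X)   # lazy dask array of predictions
    print(preds.compute())  # materialize as a concrete ndarray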