diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml
index aa9803ad..c7b7d3f3 100644
--- a/.github/workflows/build-wheels.yml
+++ b/.github/workflows/build-wheels.yml
@@ -1,14 +1,7 @@
 name: build_wheels
-on: #[push, pull_request]
-  release:
-    types:
-      - created
-  workflow_dispatch:
-    inputs:
-      version:
-        description: 'Manually trigger wheel build in Github UI'
-        required: true
+on: [push, pull_request]
+
 jobs:
@@ -25,59 +18,45 @@ jobs:
       - uses: actions/setup-python@v2
         name: Install Python
         with:
-          python-version: '3.8'
+          python-version: '3.10'
 
       - name: Install cibuildwheel
         run: |
-          python -m pip install cibuildwheel==2.12.1
+          python -m pip install cibuildwheel==2.21.3 setuptools cython numpy
 
       - name: Build wheels
         env:
           # We only build for Python 3.6+. On Linux manylinux2010 is used.
           # Skipping pypy wheels for now since scipy & scikit-learn haven't build them yet.
           # Skip python3.11 for 32bit.
-          CIBW_SKIP: "pp* *-win32 *-manylinux_i686 *musllinux*"
+          CIBW_SKIP: "pp* *-win32 *-manylinux_i686 *musllinux* *cp36* *cp37*"
           CIBW_TEST_REQUIRES: "pytest pandas scikit-learn"
           CIBW_TEST_COMMAND: "pytest --pyargs sklearn_extra"
         run: |
           python -m cibuildwheel --output-dir wheelhouse
 
-      - uses: actions/upload-artifact@v2
-        with:
-          path: ./wheelhouse/*.whl
-
-  build_sdist:
-    name: sdist
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-
-      - uses: actions/setup-python@v2
-        name: Install Python
-        with:
-          python-version: '3.8'
-
-      - name: Install dependencies
-        run: pip install setuptools cython numpy
       - name: Build sdist
         run: python setup.py sdist
 
-      - uses: actions/upload-artifact@v2
-        with:
-          path: dist/*.tar.gz
-
-  upload_pypi:
-    needs: [build_wheels, build_sdist]
-    runs-on: ubuntu-latest
-    # upload to PyPI on every tag starting with 'v'
-    if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
-    steps:
-      - uses: actions/download-artifact@v2
-        with:
-          name: artifact
-          path: dist
-
-      - uses: pypa/gh-action-pypi-publish@master
+      - uses: actions/upload-artifact@v4
         with:
-          user: __token__
-          password: ${{ secrets.pypi_password }}
-          # To test:
-          repository_url: https://test.pypi.org/legacy/
+          path: |
+            ./wheelhouse/*.whl
+            ./dist/*.tar.gz
+          name: ${{ matrix.os }}
+
+  # upload_pypi:
+  #   needs: [build_wheels, build_sdist]
+  #   runs-on: ubuntu-latest
+  #   # upload to PyPI on every tag starting with 'v'
+  #   if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
+  #   steps:
+  #     - uses: actions/download-artifact@v2
+  #       with:
+  #         name: artifact
+  #         path: dist
+
+  #     - uses: pypa/gh-action-pypi-publish@master
+  #       with:
+  #         user: __token__
+  #         password: ${{ secrets.pypi_password }}
+  #         # To test:
+  #         repository_url: https://test.pypi.org/legacy/
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 60a837d1..666ee7b8 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -7,18 +7,18 @@ jobs:
     matrix:
       Python39:
         python.version: '3.9'
-        NUMPY_VERSION: "1.19.4"
-        SCIPY_VERSION: "1.5.4"
+        NUMPY_VERSION: "*"
+        SCIPY_VERSION: "*"
         SKLEARN_VERSION: "*"
       Python310:
         python.version: '3.10'
-        NUMPY_VERSION: "1.26.1"
-        SCIPY_VERSION: "1.11.3"
+        NUMPY_VERSION: "*"
+        SCIPY_VERSION: "*"
         SKLEARN_VERSION: "*"
       Python311:
-        python.version: '3.10'
-        NUMPY_VERSION: "1.26.1"
-        SCIPY_VERSION: "1.11.3"
+        python.version: '3.11'
+        NUMPY_VERSION: "*"
+        SCIPY_VERSION: "*"
         SKLEARN_VERSION: "*"
 
 variables:
diff --git a/benchmarks/_bench/eigenpro_plot_mnist.py b/benchmarks/_bench/eigenpro_plot_mnist.py
deleted file mode 100644
index 77009842..00000000
---
a/benchmarks/_bench/eigenpro_plot_mnist.py +++ /dev/null @@ -1,107 +0,0 @@ -import matplotlib -import matplotlib.pyplot as plt -import numpy as np -from time import time - -from sklearn_extra.kernel_methods import EigenProClassifier -from sklearn.svm import SVC -from sklearn.datasets import fetch_openml - -rng = np.random.RandomState(1) - -# Generate sample data from mnist -mnist = fetch_openml("mnist_784") -mnist.data = mnist.data / 255.0 -print("Data has loaded") - -p = rng.permutation(60000) -x_train = mnist.data[p] -y_train = np.int32(mnist.target[p]) -x_test = mnist.data[60000:] -y_test = np.int32(mnist.target[60000:]) - -# Run tests comparing eig to svc -eig_fit_times = [] -eig_pred_times = [] -eig_err = [] -svc_fit_times = [] -svc_pred_times = [] -svc_err = [] - -train_sizes = [500, 1000, 2000, 5000, 10000, 20000, 40000, 60000] - -gamma = 0.02 -# Fit models to data -for train_size in train_sizes: - for name, estimator in [ - ( - "EigenPro", - EigenProClassifier(n_epoch=2, gamma=gamma, random_state=rng), - ), - ("SupportVector", SVC(C=5, gamma=gamma, random_state=rng)), - ]: - stime = time() - estimator.fit(x_train[:train_size], y_train[:train_size]) - fit_t = time() - stime - - stime = time() - y_pred_test = estimator.predict(x_test) - pred_t = time() - stime - - err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test) - if name == "EigenPro": - eig_fit_times.append(fit_t) - eig_pred_times.append(pred_t) - eig_err.append(err) - else: - svc_fit_times.append(fit_t) - svc_pred_times.append(pred_t) - svc_err.append(err) - print( - "%s Classification with %i training samples in %0.2f seconds." - "Test error %.4f" % (name, train_size, fit_t + pred_t, err) - ) - -# set up grid for figures -fig = plt.figure(num=None, figsize=(6, 4), dpi=160) -ax = plt.subplot2grid((2, 2), (0, 0), rowspan=2) -train_size_labels = ["500", "1k", "2k", "5k", "10k", "20k", "40k", "60k"] - -# Graph fit(train) time -ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) -ax.plot(train_sizes, svc_fit_times, "o--", color="g", label="SVC") -ax.plot(train_sizes, eig_fit_times, "o-", color="r", label="EigenPro") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_xlabel("train size") -ax.set_ylabel("time (seconds)") -ax.legend() -ax.set_title("Train set") -ax.set_xticks(train_sizes) -ax.set_xticks([], minor=True) -ax.set_xticklabels(train_size_labels) - -# Graph prediction(test) time -ax = plt.subplot2grid((2, 2), (0, 1), rowspan=1) -ax.plot(train_sizes, eig_pred_times, "o-", color="r") -ax.plot(train_sizes, svc_pred_times, "o--", color="g") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_ylabel("time (seconds)") -ax.set_title("Test set") -ax.set_xticks(train_sizes) -ax.set_xticks([], minor=True) -ax.set_xticklabels(train_size_labels) - -# Graph training error -ax = plt.subplot2grid((2, 2), (1, 1), rowspan=1) -ax.plot(train_sizes, eig_err, "o-", color="r") -ax.plot(train_sizes, svc_err, "o-", color="g") -ax.set_xscale("log") -ax.set_xticks(train_sizes) -ax.set_xticklabels(train_size_labels) -ax.set_xticks([], minor=True) -ax.set_xlabel("train size") -ax.set_ylabel("classification error %") -plt.tight_layout() -plt.show() diff --git a/benchmarks/_bench/eigenpro_plot_noisy_mnist.py b/benchmarks/_bench/eigenpro_plot_noisy_mnist.py deleted file mode 100644 index 939e9aff..00000000 --- a/benchmarks/_bench/eigenpro_plot_noisy_mnist.py +++ /dev/null @@ -1,112 +0,0 @@ -import matplotlib -import matplotlib.pyplot as plt -import numpy as np -from time import time - -from 
sklearn.datasets import fetch_openml -from sklearn_extra.kernel_methods import EigenProClassifier -from sklearn.svm import SVC - -rng = np.random.RandomState(1) - -# Generate sample data from mnist -mnist = fetch_openml("mnist_784") -mnist.data = mnist.data / 255.0 - -p = rng.permutation(60000) -x_train = mnist.data[p][:60000] -y_train = np.int32(mnist.target[p][:60000]) -x_test = mnist.data[60000:] -y_test = np.int32(mnist.target[60000:]) - -# randomize 20% of labels -p = rng.choice(len(y_train), np.int32(len(y_train) * 0.2), False) -y_train[p] = rng.choice(10, np.int32(len(y_train) * 0.2)) -p = rng.choice(len(y_test), np.int32(len(y_test) * 0.2), False) -y_test[p] = rng.choice(10, np.int32(len(y_test) * 0.2)) - -# Run tests comparing fkc to svc -eig_fit_times = [] -eig_pred_times = [] -eig_err = [] -svc_fit_times = [] -svc_pred_times = [] -svc_err = [] - -train_sizes = [500, 1000, 2000, 5000, 10000, 20000, 40000, 60000] - -gamma = 0.02 - -# Fit models to data -for train_size in train_sizes: - for name, estimator in [ - ( - "EigenPro", - EigenProClassifier(n_epoch=2, gamma=gamma, random_state=rng), - ), - ("SupportVector", SVC(C=5, gamma=gamma)), - ]: - stime = time() - estimator.fit(x_train[:train_size], y_train[:train_size]) - fit_t = time() - stime - - stime = time() - y_pred_test = estimator.predict(x_test) - pred_t = time() - stime - err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test) - if name == "EigenPro": - eig_fit_times.append(fit_t) - eig_pred_times.append(pred_t) - eig_err.append(err) - else: - svc_fit_times.append(fit_t) - svc_pred_times.append(pred_t) - svc_err.append(err) - print( - "%s Classification with %i training samples in %0.2f seconds. " - "Test error %.4f" % (name, train_size, fit_t + pred_t, err) - ) - -# set up grid for figures -fig = plt.figure(num=None, figsize=(6, 4), dpi=160) -ax = plt.subplot2grid((2, 2), (0, 0), rowspan=2) -train_size_labels = ["500", "1k", "2k", "5k", "10k", "20k", "40k", "60k"] - -# Graph fit(train) time -ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) -ax.plot(train_sizes, svc_fit_times, "o--", color="g", label="SVC") -ax.plot(train_sizes, eig_fit_times, "o-", color="r", label="EigenPro") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_xlabel("train size") -ax.set_ylabel("time (seconds)") -ax.legend() -ax.set_title("Train set") -ax.set_xticks(train_sizes) -ax.set_xticks([], minor=True) -ax.set_xticklabels(train_size_labels) - -# Graph prediction(test) time -ax = plt.subplot2grid((2, 2), (0, 1), rowspan=1) -ax.plot(train_sizes, eig_pred_times, "o-", color="r") -ax.plot(train_sizes, svc_pred_times, "o--", color="g") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_ylabel("time (seconds)") -ax.set_title("Test set") -ax.set_xticks(train_sizes) -ax.set_xticks([], minor=True) -ax.set_xticklabels(train_size_labels) - -# Graph training error -ax = plt.subplot2grid((2, 2), (1, 1), rowspan=1) -ax.plot(train_sizes, eig_err, "o-", color="r") -ax.plot(train_sizes, svc_err, "o-", color="g") -ax.set_xscale("log") -ax.set_xticks(train_sizes) -ax.set_xticklabels(train_size_labels) -ax.set_xticks([], minor=True) -ax.set_xlabel("train size") -ax.set_ylabel("classification error %") -plt.tight_layout() -plt.show() diff --git a/benchmarks/_bench/eigenpro_plot_synthetic.py b/benchmarks/_bench/eigenpro_plot_synthetic.py deleted file mode 100644 index 155ba985..00000000 --- a/benchmarks/_bench/eigenpro_plot_synthetic.py +++ /dev/null @@ -1,117 +0,0 @@ -import matplotlib -import 
matplotlib.pyplot as plt -import numpy as np -from time import time - -from sklearn.datasets import make_classification -from sklearn_extra.kernel_methods import EigenProClassifier -from sklearn.svm import SVC - -rng = np.random.RandomState(1) - -max_size = 50000 -test_size = 10000 - -# Get data for testing - -x, y = make_classification( - n_samples=max_size + test_size, - n_features=400, - n_informative=6, - random_state=rng, -) - -x_train = x[:max_size] -y_train = y[:max_size] -x_test = x[max_size:] -y_test = y[max_size:] - -eig_fit_times = [] -eig_pred_times = [] -eig_err = [] -svc_fit_times = [] -svc_pred_times = [] -svc_err = [] - -train_sizes = [2000, 5000, 10000, 20000, 50000] - -gamma = 0.005 -for train_size in train_sizes: - for name, estimator in [ - ( - "EigenPro", - EigenProClassifier( - n_epoch=3, - gamma=gamma, - n_components=30, - subsample_size=1000, - random_state=rng, - ), - ), - ("SupportVector", SVC(C=5, gamma=gamma)), - ]: - stime = time() - estimator.fit(x_train[:train_size], y_train[:train_size]) - fit_t = time() - stime - - stime = time() - y_pred_test = estimator.predict(x_test) - pred_t = time() - stime - - err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test) - if name == "EigenPro": - eig_fit_times.append(fit_t) - eig_pred_times.append(pred_t) - eig_err.append(err) - else: - svc_fit_times.append(fit_t) - svc_pred_times.append(pred_t) - svc_err.append(err) - print( - "%s Classification with %i training samples in %0.2f seconds." - % (name, train_size, fit_t + pred_t) - ) - -# set up grid for figures -fig = plt.figure(num=None, figsize=(6, 4), dpi=160) -ax = plt.subplot2grid((2, 2), (0, 0), rowspan=2) -train_size_labels = [str(s) for s in train_sizes] - -# Graph fit(train) time -ax.plot(train_sizes, svc_fit_times, "o--", color="g", label="SVC") -ax.plot(train_sizes, eig_fit_times, "o-", color="r", label="FKC (EigenPro)") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_xlabel("train size") -ax.set_ylabel("time (seconds)") - -ax.legend() -ax.set_title("Train set") -ax.set_xticks(train_sizes) -ax.set_xticklabels(train_size_labels) -ax.set_xticks([], minor=True) -ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) - -# Graph prediction(test) time -ax = plt.subplot2grid((2, 2), (0, 1), rowspan=1) -ax.plot(train_sizes, eig_pred_times, "o-", color="r") -ax.plot(train_sizes, svc_pred_times, "o--", color="g") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_ylabel("time (seconds)") -ax.set_title("Test set") -ax.set_xticks([]) -ax.set_xticks([], minor=True) - -# Graph training error -ax = plt.subplot2grid((2, 2), (1, 1), rowspan=1) -ax.plot(train_sizes, eig_err, "o-", color="r") -ax.plot(train_sizes, svc_err, "o-", color="g") -ax.set_xscale("log") -ax.set_xticks(train_sizes) -ax.set_xticklabels(train_size_labels) -ax.set_xticks([], minor=True) -ax.set_xlabel("train size") -ax.set_ylabel("classification error %") -plt.tight_layout() -plt.show() diff --git a/doc/api.rst b/doc/api.rst index 25fc8ed8..1d0af0a4 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -13,15 +13,6 @@ Kernel approximation kernel_approximation.Fastfood -EigenPro -======== - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - kernel_methods.EigenProRegressor - kernel_methods.EigenProClassifier Clustering ==================== diff --git a/doc/modules/eigenpro.rst b/doc/modules/eigenpro.rst deleted file mode 100644 index bd7535c9..00000000 --- a/doc/modules/eigenpro.rst +++ /dev/null @@ -1,62 +0,0 @@ -.. 
_eigenpro: - -========================================== -EigenPro for Regression and Classification -========================================== - -.. currentmodule:: sklearn_extra.kernel_methods - -*EigenPro iteration* [MB17]_ is a very efficient implementation of kernel -regression/classification that uses an optimization method based on -preconditioned stochastic gradient descent. It essentially implements a -"ridgeless" kernel regression. Regularization, when necessary, can be -achieved by early stopping. - -Optimization parameters, such as step size, batch size, and the size of the preconditioning -block are chosen automatically and optimally. (They can also be set up manually.) -This results in a simple and user-friendly interface. - -Next, we present several experimental results using a server equipped with one -Intel Xeon E5-1620 CPU. -The figure below compares the EigenPro Classifier and the Support Vector -Classifier (:class:`SVC`) on MNIST digits classification task. -We see that EigenPro and SVC give competitive and similar accuracy on test set. -Notably, on the full MNIST training and testing using EigenPro are -approximately 2 times and 5 times faster than that using SVC, respectively. - -.. |mnist| image:: ../images/eigenpro_mnist.png - :target: ../auto_examples/eigenpro/eigenpro_mnist.html - :scale: 70 - -.. centered:: |mnist| - -We then repeat the same experiments on MNIST with added label noise. -Specifically, we randomly reset the label (0-9) of 20% samples. -We see that EigenPro has a significant advantage over SVC -on this noisy MNIST. Training and testing using EigenPro are -both 10 to 20 times faster than they are when using SVC. - -.. |mnist_noisy| image:: ../images/eigenpro_mnist_noisy.png - :target: ../auto_examples/eigenpro/eigenpro_mnist_noisy.html - :scale: 70 - -.. centered:: |mnist_noisy| - - -The next figure compares the two methods on a binary classification problem -with 400 synthetic features. Again, EigenPro demonstrates 10~20 times -acceleration on training and testing without loss of accuracy. - -.. |synthetic| image:: ../images/eigenpro_synthetic.png - :target: ../auto_examples/eigenpro/eigenpro_synthetic.html - :scale: 70 - -.. centered:: |synthetic| - - -.. topic:: References: - - .. [MB17] Siyuan Ma and Mikhail Belkin, - `"Diving into the shallows: a computational perspective on large-scale shallow learning" - `_, - Advances in Neural Information Processing Systems, 2017. diff --git a/doc/modules/kernel_approximation.rst b/doc/modules/kernel_approximation.rst index b234d691..e0b2231e 100644 --- a/doc/modules/kernel_approximation.rst +++ b/doc/modules/kernel_approximation.rst @@ -29,6 +29,3 @@ mapping a single example is O(n_components log d). The space complexity is O(n_components). See `scikit-learn User-guide `_ for more general informations on kernel approximations. - -See also :class:`EigenProRegressor ` and :class:`EigenProClassifier ` for another -way to compute fast kernel methods algorithms. diff --git a/doc/user_guide.rst b/doc/user_guide.rst index 0c90c2e8..9c715375 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -10,7 +10,6 @@ User guide .. 
toctree:: :numbered: - modules/eigenpro.rst modules/cluster.rst modules/robust.rst modules/kernel_approximation.rst diff --git a/examples/cluster/README.txt b/examples/cluster/README.txt index ad0ebf6a..0dfd5871 100644 --- a/examples/cluster/README.txt +++ b/examples/cluster/README.txt @@ -3,4 +3,4 @@ Cluster ======= -Examples concerning the :mod:`sklearn_extra.kernel_methods.cluster` module. +Examples concerning the :mod:`sklearn_extra.cluster` module. diff --git a/examples/eigenpro/README.txt b/examples/eigenpro/README.txt deleted file mode 100644 index 4ed1fb41..00000000 --- a/examples/eigenpro/README.txt +++ /dev/null @@ -1,6 +0,0 @@ -.. _eigenpro_examples: - -Eigenpro -======== - -Examples concerning the :mod:`sklearn_extra.kernel_methods.eigenpro` module. diff --git a/examples/eigenpro/plot_eigenpro_synthetic.py b/examples/eigenpro/plot_eigenpro_synthetic.py deleted file mode 100644 index 802f8a57..00000000 --- a/examples/eigenpro/plot_eigenpro_synthetic.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -====================================================== -Comparison of EigenPro and SVC on Digit Classification -====================================================== - -Here we train a EigenPro Classifier and a Support -Vector Classifier (SVC) on a synthetically generated -binary classification problem. We halt the training -of EigenPro after two epochs. -While EigenPro is slower on low dimensional datasets, as -the number of features exceeds 500, it begins to outperform -SVM and shows more stability. -""" -print(__doc__) - -import matplotlib -import matplotlib.pyplot as plt -import numpy as np -from time import time - -from sklearn.datasets import make_classification -from sklearn_extra.kernel_methods import EigenProClassifier -from sklearn.svm import SVC - -rng = np.random.RandomState(1) - -train_size = 2000 -test_size = 1000 - -# Run tests comparing eig to svc -eig_fit_times = [] -eig_pred_times = [] -eig_err = [] -svc_fit_times = [] -svc_pred_times = [] -svc_err = [] - -feature_counts = [20, 50, 150, 500, 1500] -gamma = 0.008 - -# Fit models to data -for n_features in feature_counts: - x, y = make_classification( - n_samples=train_size + test_size, - n_features=n_features, - random_state=rng, - ) - - x_train = x[:train_size] - y_train = y[:train_size] - x_test = x[train_size:] - y_test = y[train_size:] - for name, estimator in [ - ( - "EigenPro", - EigenProClassifier( - n_epoch=2, gamma=gamma, n_components=400, random_state=rng - ), - ), - ("SupportVector", SVC(gamma=gamma, random_state=rng)), - ]: - stime = time() - estimator.fit(x_train, y_train) - fit_t = time() - stime - - stime = time() - y_pred_test = estimator.predict(x_test) - pred_t = time() - stime - - err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test) - if name == "EigenPro": - eig_fit_times.append(fit_t) - eig_pred_times.append(pred_t) - eig_err.append(err) - else: - svc_fit_times.append(fit_t) - svc_pred_times.append(pred_t) - svc_err.append(err) - print( - "%s Classification with %i features in %0.2f seconds. 
Error: %0.1f" - % (name, n_features, fit_t + pred_t, err) - ) - -# set up grid for figures -fig = plt.figure(num=None, figsize=(6, 4), dpi=160) -ax = plt.subplot2grid((2, 2), (0, 0), rowspan=2) - -# Graph fit(train) time -feature_number_labels = [str(s) for s in feature_counts] -ax.plot(feature_counts, svc_fit_times, "o--", color="g", label="SVC") -ax.plot( - feature_counts, eig_fit_times, "o-", color="r", label="EigenPro Classifier" -) -ax.set_xscale("log") -ax.set_yscale("log", nonpositive="clip") -ax.set_xlabel("Number of features") -ax.set_ylabel("time (seconds)") -ax.legend() -ax.set_title("Training Time") -ax.set_xticks(feature_counts) -ax.set_xticklabels(feature_number_labels) -ax.set_xticks([], minor=True) -ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) - -# Graph prediction(test) time -ax = plt.subplot2grid((2, 2), (0, 1), rowspan=1) -ax.plot(feature_counts, eig_pred_times, "o-", color="r") -ax.plot(feature_counts, svc_pred_times, "o--", color="g") -ax.set_xscale("log") -ax.set_yscale("log", nonpositive="clip") -ax.set_ylabel("time (seconds)") -ax.set_title("Prediction Time") -ax.set_xticks([]) -ax.set_xticks([], minor=True) - -# Graph training error -ax = plt.subplot2grid((2, 2), (1, 1), rowspan=1) -ax.plot(feature_counts, eig_err, "o-", color="r") -ax.plot(feature_counts, svc_err, "o-", color="g") -ax.set_xscale("log") -ax.set_xticks(feature_counts) -ax.set_xticklabels(feature_number_labels) -ax.set_xticks([], minor=True) -ax.set_xlabel("Number of features") -ax.set_ylabel("Classification error %") -plt.tight_layout() -plt.show() diff --git a/examples/kernel_approximation/README.txt b/examples/kernel_approximation/README.txt index 5ea04362..27fcac09 100644 --- a/examples/kernel_approximation/README.txt +++ b/examples/kernel_approximation/README.txt @@ -3,5 +3,5 @@ Kernel approximation ==================== -Examples concerning the :mod:`sklearn_extra.kernel_methods.kernel_approximation` +Examples concerning the :mod:`sklearn_extra.kernel_approximation` module. diff --git a/examples/robust/README.txt b/examples/robust/README.txt index 526c9400..5ee474b3 100644 --- a/examples/robust/README.txt +++ b/examples/robust/README.txt @@ -3,4 +3,4 @@ Robust ====== -Examples concerning the :mod:`sklearn_extra.kernel_methods.robust` module. +Examples concerning the :mod:`sklearn_extra.robust` module. diff --git a/pyproject.toml b/pyproject.toml index 24b7dfba..64ef0bda 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,12 +4,7 @@ requires = [ "setuptools", "wheel", "Cython>=0.28.5", - - # use oldest-supported-numpy which provides the oldest numpy version with - # wheels on PyPI - # - # see: https://github.com/scipy/oldest-supported-numpy/blob/master/setup.cfg - "oldest-supported-numpy" + "numpy" ] [tool.black] diff --git a/setup.py b/setup.py index f3e94be9..c79b4cc2 100755 --- a/setup.py +++ b/setup.py @@ -48,6 +48,7 @@ "tests": ["pytest", "pytest-cov"], "docs": [ "pillow", + "pandas", "sphinx", "sphinx-gallery", "sphinx_rtd_theme", diff --git a/sklearn_extra/__init__.py b/sklearn_extra/__init__.py index b855d4eb..910ceef6 100644 --- a/sklearn_extra/__init__.py +++ b/sklearn_extra/__init__.py @@ -1,4 +1,4 @@ -from . import kernel_approximation, kernel_methods # noqa +from . 
import kernel_approximation # noqa from ._version import __version__ diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index bb5165ba..a4087510 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -121,7 +121,7 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): array([[1., 2.], [4., 2.]]) >>> kmedoids.inertia_ - 8.0 + np.float64(8.0) See scikit-learn-extra/examples/plot_kmedoids_digits.py for examples of KMedoids with various distance metrics. @@ -595,7 +595,7 @@ class CLARA(BaseEstimator, ClusterMixin, TransformerMixin): >>> clara.predict([[0,0], [4,4]]) array([0, 1]) >>> clara.inertia_ - 122.44919397611667 + np.float64(122.44919397611667) References ---------- diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py index 30f419a0..9af8943d 100644 --- a/sklearn_extra/cluster/tests/test_k_medoids.py +++ b/sklearn_extra/cluster/tests/test_k_medoids.py @@ -405,12 +405,10 @@ def test_clara_consistency_iris(): def test_seuclidean(): - with pytest.warns(None) as record: - km = KMedoids(2, metric="seuclidean", method="pam") - km.fit(np.array([0, 0, 0, 1]).reshape((4, 1))) - km.predict(np.array([0, 0, 0, 1]).reshape((4, 1))) - km.transform(np.array([0, 0, 0, 1]).reshape((4, 1))) - assert len(record) == 0 + km = KMedoids(2, metric="seuclidean", method="pam") + km.fit(np.array([0, 0, 0, 1]).reshape((4, 1))) + km.predict(np.array([0, 0, 0, 1]).reshape((4, 1))) + km.transform(np.array([0, 0, 0, 1]).reshape((4, 1))) def test_medoids_indices(): diff --git a/sklearn_extra/kernel_methods/__init__.py b/sklearn_extra/kernel_methods/__init__.py deleted file mode 100644 index 53be76dc..00000000 --- a/sklearn_extra/kernel_methods/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from ._eigenpro import BaseEigenPro, EigenProClassifier, EigenProRegressor - -__all__ = ["BaseEigenPro", "EigenProClassifier", "EigenProRegressor"] diff --git a/sklearn_extra/kernel_methods/_eigenpro.py b/sklearn_extra/kernel_methods/_eigenpro.py deleted file mode 100644 index 3016c491..00000000 --- a/sklearn_extra/kernel_methods/_eigenpro.py +++ /dev/null @@ -1,670 +0,0 @@ -# Authors: Alex Li <7Alex7Li@gmail.com> -# Siyuan Ma - -import numpy as np -from scipy.linalg import eigh, LinAlgError -from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin -from sklearn.metrics.pairwise import pairwise_kernels, euclidean_distances -from sklearn.utils import check_random_state -from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.validation import check_is_fitted, check_X_y - - -class BaseEigenPro(BaseEstimator): - """ - Base class for EigenPro iteration. - """ - - def __init__( - self, - batch_size="auto", - n_epoch=2, - n_components=1000, - subsample_size="auto", - kernel="rbf", - gamma="scale", - degree=3, - coef0=1, - kernel_params=None, - random_state=None, - ): - self.batch_size = batch_size - self.n_epoch = n_epoch - self.n_components = n_components - self.subsample_size = subsample_size - self.kernel = kernel - self.gamma = gamma - self.degree = degree - self.coef0 = coef0 - self.kernel_params = kernel_params - self.random_state = random_state - - def _kernel(self, X, Y): - """Calculate the kernel matrix - - Parameters - --------- - X : {float, array}, shape = [n_samples, n_features] - Input data. - - Y : {float, array}, shape = [n_centers, n_targets] - Kernel centers. 
- - Returns - ------- - K : {float, array}, shape = [n_samples, n_centers] - Kernel matrix. - """ - if ( - self.kernel != "rbf" - and self.kernel != "laplace" - and self.kernel != "cauchy" - ): - if callable(self.kernel): - params = self.kernel_params or {} - else: - params = { - "gamma": self.gamma_, - "degree": self.degree, - "coef0": self.coef0, - } - return pairwise_kernels( - X, Y, metric=self.kernel, filter_params=True, **params - ) - distance = euclidean_distances(X, Y, squared=True) - bandwidth = np.float32(1.0 / np.sqrt(2.0 * self.gamma_)) - if self.kernel == "rbf": - distance = -self.gamma_ * distance - K = np.exp(distance) - elif self.kernel == "laplace": - d = np.maximum(distance, 0) - K = np.exp(-np.sqrt(d) / bandwidth) - else: # self.kernel == "cauchy": - K = 1 / (1 + 2.0 * self.gamma_ * distance) - return K - - def _nystrom_svd(self, X, n_components): - """Compute the top eigensystem of a kernel - operator using Nystrom method - - Parameters - ---------- - X : {float, array}, shape = [n_subsamples, n_features] - Subsample feature matrix. - - n_components : int - Number of top eigencomponents to be restored. - - Returns - ------- - E : {float, array}, shape = [k] - Top eigenvalues. - - Lambda : {float, array}, shape = [n_subsamples, k] - Top eigenvectors of a subsample kernel matrix (which can be - directly used to approximate the eigenfunctions of the kernel - operator). - """ - m, _ = X.shape - K = self._kernel(X, X) - - W = K / m - try: - E, Lambda = eigh(W, eigvals=(m - n_components, m - 1)) - except LinAlgError: - # Use float64 when eigh fails due to precision - W = np.float64(W) - E, Lambda = eigh(W, eigvals=(m - n_components, m - 1)) - E, Lambda = np.float32(E), np.float32(Lambda) - # Flip so eigenvalues are in descending order. - E = np.maximum(np.float32(1e-7), np.flipud(E)) - Lambda = np.fliplr(Lambda)[:, :n_components] / np.sqrt( - m, dtype="float32" - ) - - return E, Lambda - - def _setup(self, feat, max_components, mG, alpha): - """Compute preconditioner and scale factors for EigenPro iteration - - Parameters - ---------- - feat : {float, array}, shape = [n_samples, n_features] - Feature matrix (normally from training data). - - max_components : int - Maximum number of components to be used in EigenPro iteration. - - mG : int - Maximum batch size to fit in memory. - - alpha : float - Exponential factor (< 1) for eigenvalue ratio. - - Returns - ------- - max_S : float - Normalized largest eigenvalue. - - max_kxx : float - Maximum of k(x,x) where k is the EigenPro kernel. - - E : {float, array}, shape = [k] - Preconditioner for EigenPro - - Lambda : {float, array}, shape = [n_subsamples, k] - Top eigenvectors of a subsample kernel matrix - """ - alpha = np.float32(alpha) - - # Estimate eigenvalues (S) and eigenvectors (V) of the kernel matrix - # corresponding to the feature matrix. - E, Lambda = self._nystrom_svd(feat, max_components) - n_subsamples = feat.shape[0] - - # Calculate the number of components to be used such that the - # corresponding batch size is bounded by the subsample size and the - # memory size. 
- max_bs = min(max(n_subsamples / 5, mG), n_subsamples) - n_components = np.sum(np.power(1 / E, alpha) < max_bs) - 1 - if n_components < 2: - n_components = min(E.shape[0] - 1, 2) - - Lambda = Lambda[:, :n_components] - scale = np.power(E[0] / E[n_components], alpha) - - # Compute part of the preconditioner for step 2 of gradient descent in - # the eigenpro model - D = (1 - np.power(E[n_components] / E[:n_components], alpha)) / E[ - :n_components - ] - - max_S = E[0].astype(np.float32) - kxx = 1 - np.sum(Lambda**2, axis=1) * n_subsamples - return max_S / scale, np.max(kxx), D, Lambda - - def _initialize_params(self, X, Y, random_state): - """ - Validate parameters passed to the model, choose parameters - that have not been passed in, and run setup for EigenPro iteration. - Parameters - ---------- - X : {float, array}, shape = [n_samples, n_features] - Training data. - - Y : {float, array}, shape = [n_samples, n_targets] - Training targets. - - random_state : RandomState instance - The random state to use for random number generation - - Returns - ------- - Y : {float, array}, shape = [n_samples, n_targets] - Training targets. If Y was originally of shape - [n_samples], it is now [n_samples, 1]. - - E : {float, array}, shape = [k] - Preconditioner for EigenPro - - Lambda : {float, array}, shape = [n_subsamples, k] - Top eigenvectors of a subsample kernel matrix - - eta : float - The learning rate - - pinx : {int, array}, shape = [sample_size] - The rows of X used to calculate E and Lambda - """ - n, d = X.shape - n_label = 1 if len(Y.shape) == 1 else Y.shape[1] - self.centers_ = X - - # Calculate the subsample size to be used. - if self.subsample_size == "auto": - if n < 100000: - sample_size = 4000 - else: - sample_size = 12000 - else: - sample_size = self.subsample_size - sample_size = min(n, sample_size) - - n_components = min(sample_size - 1, self.n_components) - n_components = max(1, n_components) - - # Approximate amount of memory that we want to use - mem_bytes = 0.1 * 1024**3 - # Memory used with a certain sample size - mem_usages = (d + n_label + 2 * np.arange(sample_size)) * n * 4 - mG = np.int32(np.sum(mem_usages < mem_bytes)) - - # Calculate largest eigenvalue and max{k(x,x)} using subsamples. - pinx = random_state.choice(n, sample_size, replace=False).astype( - "int32" - ) - if self.gamma == "scale": - self.gamma_ = np.float32(1.0 / (X.var() * d)) - else: - self.gamma_ = self.gamma - max_S, beta, E, Lambda = self._setup( - X[pinx], n_components, mG, alpha=0.95 - ) - # Calculate best batch size. - if self.batch_size == "auto": - bs = min(np.int32(beta / max_S), mG) + 1 - else: - bs = self.batch_size - self.bs_ = min(bs, n) - - # Calculate best step size. - if self.bs_ < beta / max_S + 1: - eta = self.bs_ / beta - elif self.bs_ < n: - eta = 2.0 * self.bs_ / (beta + (self.bs_ - 1) * max_S) - else: - eta = 0.95 * 2 / max_S - # Remember the shape of Y for predict() and ensure it's shape is 2-D. - self.was_1D_ = False - if len(Y.shape) == 1: - Y = np.reshape(Y, (Y.shape[0], 1)) - self.was_1D_ = True - return Y, E, Lambda, np.float32(eta), pinx - - def validate_parameters(self): - """ - Validate the parameters of the model to ensure that no unreasonable - values were passed in. 
- """ - if self.n_epoch <= 0: - raise ValueError( - "n_epoch should be positive, was " + str(self.n_epoch) - ) - if self.n_components < 0: - raise ValueError( - "n_components should be non-negative, was " - + str(self.n_components) - ) - if self.subsample_size != "auto" and self.subsample_size < 0: - raise ValueError( - "subsample_size should be non-negative, was " - + str(self.subsample_size) - ) - if self.batch_size != "auto" and self.batch_size <= 0: - raise ValueError( - "batch_size should be positive, was " + str(self.batch_size) - ) - if self.gamma != "scale" and self.gamma <= 0: - raise ValueError( - "gamma should be positive, was " + str(self.gamma) - ) - - def _raw_fit(self, X, Y): - """Train eigenpro regression model - - Parameters - ---------- - X : {float, array}, shape = [n_samples, n_features] - Training data. - - Y : {float, array}, shape = [n_samples, n_targets] - Training targets. - - Returns - ------- - self : returns an instance of self. - """ - X, Y = check_X_y( - X, - Y, - dtype=np.float32, - multi_output=True, - ensure_min_samples=3, - y_numeric=True, - ) - self.n_features_in_ = X.shape[1] - Y = Y.astype(np.float32) - random_state = check_random_state(self.random_state) - - self.validate_parameters() - """Parameter Initialization""" - Y, D, V, eta, pinx = self._initialize_params(X, Y, random_state) - - """Training loop""" - n = self.centers_.shape[0] - - self.coef_ = np.zeros((n, Y.shape[1]), dtype=np.float32) - step = np.float32(eta / self.bs_) - for _ in range(0, self.n_epoch): - epoch_inds = random_state.choice( - n, n // self.bs_ * self.bs_, replace=False - ).astype("int32") - - for batch_inds in np.array_split(epoch_inds, n // self.bs_): - batch_x = self.centers_[batch_inds] - kfeat = self._kernel(batch_x, self.centers_) - batch_y = Y[batch_inds] - - # Update 1: Sampled Coordinate Block. - gradient = np.dot(kfeat, self.coef_) - batch_y - - self.coef_[batch_inds] -= step * gradient - - # Update 2: Fixed Coordinate Block - delta = np.dot( - V * D, np.dot(V.T, np.dot(kfeat[:, pinx].T, gradient)) - ) - self.coef_[pinx] += step * delta - return self - - def _raw_predict(self, X): - """Predict using the kernel regression model - - Parameters - ---------- - X : {float, array}, shape = [n_samples, n_features] - Samples. - - Returns - ------- - Y : {float, array}, shape = [n_samples, n_targets] - Predicted targets. - """ - check_is_fitted( - self, ["bs_", "centers_", "coef_", "was_1D_", "gamma_"] - ) - X = np.asarray(X, dtype=np.float64) - - if len(X.shape) == 1: - raise ValueError( - "Reshape your data. X should be a matrix of shape" - " (n_samples, n_features)." - ) - n = X.shape[0] - - Ys = [] - for batch_inds in np.array_split(range(n), max(1, n // self.bs_)): - batch_x = X[batch_inds] - kfeat = self._kernel(batch_x, self.centers_) - - pred = np.dot(kfeat, self.coef_) - Ys.append(pred) - Y = np.vstack(Ys) - if self.was_1D_: - Y = np.reshape(Y, Y.shape[0]) - return Y - - def _get_tags(self): - tags = super()._get_tags() - tags["multioutput"] = True - return tags - - -class EigenProRegressor(RegressorMixin, BaseEigenPro): - """Regression using EigenPro iteration. - - Train least squared kernel regression model with mini-batch EigenPro - iteration. - - Parameters - ---------- - batch_size : int, default = 'auto' - Mini-batch size for gradient descent. - - n_epoch : int, default = 2 - The number of passes over the training data. - - n_components : int, default = 1000 - the maximum number of eigendirections used in modifying the kernel - operator. 
Convergence rate speedup over normal gradient descent is - approximately the largest eigenvalue over the n_componentth - eigenvalue, however, it may take time to compute eigenvalues for - large n_components - - subsample_size : int, default = 'auto' - The number of subsamples used for estimating the largest - n_component eigenvalues and eigenvectors. When it is set to 'auto', - it will be 4000 if there are less than 100,000 samples - (for training), and otherwise 12000. - - kernel : string or callable, default = "rbf" - Kernel mapping used internally. Strings can be anything supported - by scikit-learn, however, there is special support for the - rbf, laplace, and cauchy kernels. If a callable is given, it should - accept two arguments and return a floating point number. - - gamma : float, default='scale' - Kernel coefficient. If 'scale', gamma = 1/(n_features*X.var()). - Interpretation of the default value is left to the kernel; - see the documentation for sklearn.metrics.pairwise. - For kernels that use bandwidth, bandwidth = 1/sqrt(2*gamma). - - degree : float, default=3 - Degree of the polynomial kernel. Ignored by other kernels. - - coef0 : float, default=1 - Zero coefficient for polynomial and sigmoid kernels. - Ignored by other kernels. - - kernel_params : mapping of string to any - Additional parameters (keyword arguments) for kernel function - passed as callable object. - - random_state : int, RandomState instance or None, (default=None) - The seed of the pseudo random number generator to use when - shuffling the data. If int, random_state is the seed used by the - random number generator; If RandomState instance, random_state is - the random number generator; If None, the random number generator - is the RandomState instance used by `np.random`. - - References - ---------- - * Siyuan Ma, Mikhail Belkin - "Diving into the shallows: a computational perspective on - large-scale machine learning", NIPS 2017. - - Examples - -------- - >>> from sklearn_extra.kernel_methods import EigenProRegressor - >>> import numpy as np - >>> n_samples, n_features, n_targets = 4000, 20, 3 - >>> rng = np.random.RandomState(1) - >>> x_train = rng.randn(n_samples, n_features) - >>> y_train = rng.randn(n_samples, n_targets) - >>> rgs = EigenProRegressor(n_epoch=3, gamma=.5, subsample_size=50) - >>> rgs.fit(x_train, y_train) - EigenProRegressor(gamma=0.5, n_epoch=3, subsample_size=50) - >>> y_pred = rgs.predict(x_train) - >>> loss = np.mean(np.square(y_train - y_pred)) - """ - - def __init__( - self, - batch_size="auto", - n_epoch=2, - n_components=1000, - subsample_size="auto", - kernel="rbf", - gamma="scale", - degree=3, - coef0=1, - kernel_params=None, - random_state=None, - ): - super().__init__( - batch_size=batch_size, - n_epoch=n_epoch, - n_components=n_components, - subsample_size=subsample_size, - kernel=kernel, - gamma=gamma, - degree=degree, - coef0=coef0, - kernel_params=kernel_params, - random_state=random_state, - ) - - def fit(self, X, Y): - return self._raw_fit(X, Y) - - def predict(self, X): - return self._raw_predict(X) - - -class EigenProClassifier(ClassifierMixin, BaseEigenPro): - """Classification using EigenPro iteration. - - Train least squared kernel classification model with mini-batch EigenPro - iteration. - - Parameters - ---------- - batch_size : int, default = 'auto' - Mini-batch size for gradient descent. - - n_epoch : int, default = 2 - The number of passes over the training data. 
- - n_components : int, default = 1000 - the maximum number of eigendirections used in modifying the - kernel operator. Convergence rate speedup over normal gradient - descent is approximately the largest eigenvalue over the - n_componenth eigenvalue, however, it may take time to compute - eigenvalues for large n_components - - subsample_size : int, default = 'auto' - The size of subsamples used for estimating the largest - n_component eigenvalues and eigenvectors. When it is set to - 'auto', it will be 4000 if there are less than 100,000 samples - (for training), and otherwise 12000. - - kernel : string or callable, default = "rbf" - Kernel mapping used internally. Strings can be anything supported - by scikit-learn, however, there is special support for the - rbf, laplace, and cauchy kernels. If a callable is given, it should - accept two arguments and return a floating point number. - - gamma : float, default='scale' - Kernel coefficient. If 'scale', gamma = 1/(n_features*X.var()). - Interpretation of the default value is left to the kernel; - see the documentation for sklearn.metrics.pairwise. - For kernels that use bandwidth, bandwidth = 1/sqrt(2*gamma). - - degree : float, default=3 - Degree of the polynomial kernel. Ignored by other kernels. - - coef0 : float, default=1 - Zero coefficient for polynomial and sigmoid kernels. Ignored by - other kernels. - - kernel_params : mapping of string to any - Additional parameters (keyword arguments) for kernel function - passed as callable object. - - random_state : int, RandomState instance or None (default=None) - The seed of the pseudo random number generator to use when - shuffling the data. If int, random_state is the seed used by - the random number generator; If RandomState instance, - random_state is the random number generator; - If None, the random number generator is the RandomState - instance used by `np.random`. - - References - ---------- - * Siyuan Ma, Mikhail Belkin - "Diving into the shallows: a computational perspective on - large-scale machine learning", NIPS 2017. - - Examples - -------- - >>> from sklearn_extra.kernel_methods import EigenProClassifier - >>> import numpy as np - >>> n_samples, n_features, n_targets = 4000, 20, 3 - >>> rng = np.random.RandomState(1) - >>> x_train = rng.randn(n_samples, n_features) - >>> y_train = rng.randint(n_targets, size=n_samples) - >>> rgs = EigenProClassifier(n_epoch=3, gamma=.01, subsample_size=50) - >>> rgs.fit(x_train, y_train) - EigenProClassifier(gamma=0.01, n_epoch=3, subsample_size=50) - >>> y_pred = rgs.predict(x_train) - >>> loss = np.mean(y_train != y_pred) - """ - - def __init__( - self, - batch_size="auto", - n_epoch=2, - n_components=1000, - subsample_size="auto", - kernel="rbf", - gamma=0.02, - degree=3, - coef0=1, - kernel_params=None, - random_state=None, - ): - super().__init__( - batch_size=batch_size, - n_epoch=n_epoch, - n_components=n_components, - subsample_size=subsample_size, - kernel=kernel, - gamma=gamma, - degree=degree, - coef0=coef0, - kernel_params=kernel_params, - random_state=random_state, - ) - - def fit(self, X, Y): - """Train eigenpro classification model - - Parameters - ---------- - X : {float, array}, shape = [n_samples, n_raw_feature] - The raw input feature matrix. - - Y : {float, array}, shape =[n_samples] - The labels corresponding to the features of X. - - Returns - ------- - self : returns an instance of self. 
- """ - X, Y = check_X_y( - X, - Y, - dtype=np.float32, - force_all_finite=True, - multi_output=False, - ensure_min_samples=3, - ) - check_classification_targets(Y) - self.classes_ = np.unique(Y) - - loc = {} - for ind, label in enumerate(self.classes_): - loc[label] = ind - - class_matrix = np.zeros((Y.shape[0], self.classes_.shape[0])) - - for ind, label in enumerate(Y): - class_matrix[ind, loc[label]] = 1 - self._raw_fit(X, class_matrix) - return self - - def predict(self, X): - """Predict using the kernel classification model - - Parameters - ---------- - X : {float, array}, shape = [n_samples, n_features] - Samples. - - Returns - ------- - y : {float, array}, shape = [n_samples] - Predicted labels. - """ - Y = self._raw_predict(X) - return self.classes_[np.argmax(Y, axis=1)] diff --git a/sklearn_extra/kernel_methods/tests/test_eigenpro.py b/sklearn_extra/kernel_methods/tests/test_eigenpro.py deleted file mode 100644 index c28322c1..00000000 --- a/sklearn_extra/kernel_methods/tests/test_eigenpro.py +++ /dev/null @@ -1,255 +0,0 @@ -import numpy as np - -from sklearn.datasets import make_regression, make_classification -from numpy.testing import assert_allclose -from sklearn_extra.kernel_methods import EigenProRegressor, EigenProClassifier - -import pytest - -# Tests for EigenPro Regression and Classification. - - -def gen_regression(params): - """Generate a regression problem with make_regression - where random_state=1""" - return make_regression(**params, random_state=1) - - -def gen_classification(params): - """Generate a classification problem with make_classification - where random_state=1""" - return make_classification(**params, random_state=1) - - -@pytest.mark.parametrize( - "estimator, data", - [ - (EigenProRegressor, gen_regression({})), - (EigenProClassifier, gen_classification({})), - ], -) -@pytest.mark.parametrize( - "params, err_msg", - [ - ({"kernel": "not_a_kernel"}, "Unknown kernel 'not_a_kernel'"), - ({"n_epoch": 0}, "n_epoch should be positive, was 0"), - ({"n_epoch": -1}, "n_epoch should be positive, was -1"), - ({"n_components": -1}, "n_components should be non-negative, was -1"), - ( - {"subsample_size": -1}, - "subsample_size should be non-negative, was -1", - ), - ({"batch_size": 0}, "batch_size should be positive, was 0"), - ({"batch_size": -1}, "batch_size should be positive, was -1"), - ({"gamma": 0}, "gamma should be positive, was 0"), - ({"gamma": -1}, "gamma should be positive, was -1"), - ], -) -def test_parameter_validation(estimator, data, params, err_msg): - X, y = data - with pytest.raises(ValueError, match=err_msg): - estimator(**params).fit(X, y) - - -@pytest.mark.parametrize( - "data, estimator", - [ - # Test rbf kernel - ( - gen_regression({}), - EigenProRegressor(kernel="rbf", n_epoch=100, random_state=1), - ), - # Test laplacian kernel - ( - gen_regression({}), - EigenProRegressor( - kernel="laplace", n_epoch=100, gamma=0.008, random_state=1 - ), - ), - # Test cauchy kernel - ( - gen_regression({}), - EigenProRegressor( - kernel="cauchy", - n_epoch=100, - gamma=0.005, - subsample_size=1000, - random_state=1, - ), - ), - # Test with multiple outputs - ( - gen_regression({"n_features": 200, "n_targets": 30}), - EigenProRegressor( - kernel="rbf", n_epoch=100, gamma=0.003, random_state=1 - ), - ), - # Test with a very large number of input features - ( - gen_regression({"n_features": 10000}), - EigenProRegressor( - kernel="rbf", n_epoch=100, gamma=0.5, random_state=1 - ), - ), - # Test a very simple underlying distribution - ( - 
gen_regression({"n_informative": 1}), - EigenProRegressor( - batch_size=500, - kernel="rbf", - n_epoch=100, - gamma=0.005, - random_state=1, - ), - ), - # Test a very complex underlying distribution - ( - gen_regression({"n_samples": 500, "n_informative": 100}), - EigenProRegressor( - kernel="rbf", n_epoch=60, gamma=0.005, random_state=1 - ), - ), - ], -) -def test_regressor_accuracy(data, estimator): - """ - Test the accuracy of the EigenPro Regressor on multiple - data sets with different parameter inputs. We expect that the - regressor should achieve near-zero training error after sufficient - training time. - :param data: A tuple containing the input and output training data - :param Estimator: The regressor to do predictions with. - """ - X, y = data - prediction = estimator.fit(X, y).predict(X) - assert_allclose(prediction, y, rtol=5e-3) - - -def test_eigenpro_regression_duplicate_data(): - """Test the performance when some data is repeated""" - X, y = make_regression(random_state=1) - X, y = np.concatenate([X, X]), np.concatenate([y, y]) - prediction = ( - EigenProRegressor( - kernel="rbf", n_epoch=100, gamma=0.02, random_state=1 - ) - .fit(X, y) - .predict(X) - ) - assert_allclose(prediction, y, rtol=5e-3) - - -def test_eigenpro_regression_conflict_data(): - """Make sure the regressor doesn't crash when conflicting - data is given""" - X, y = make_regression(random_state=1) - y = np.reshape(y, (-1, 1)) - X, y = X, np.hstack([y, y + 2]) - # Make sure we don't throw an error when fitting or predicting - EigenProRegressor( - kernel="linear", n_epoch=5, gamma=0.5, random_state=1 - ).fit(X, y).predict(X) - - -# Tests for FastKernelClassification - - -@pytest.mark.parametrize( - "data, estimator", - [ - # Test rbf kernel - ( - gen_classification({"n_samples": 10, "hypercube": False}), - EigenProClassifier( - batch_size=9, - kernel="rbf", - gamma=0.08, - n_epoch=100, - random_state=1, - ), - ), - # Test laplacian kernel - ( - gen_classification({}), - EigenProClassifier( - kernel="laplace", n_epoch=100, gamma=0.003, random_state=1 - ), - ), - # Test cauchy kernel - ( - gen_classification({}), - EigenProClassifier( - kernel="cauchy", n_epoch=100, gamma=0.005, random_state=1 - ), - ), - # Test with a very large number of input features - # and samples, shifted around and scaled - ( - gen_classification( - { - "n_samples": 500, - "n_features": 500, - "n_informative": 160, - "scale": 30, - "shift": 6, - } - ), - EigenProClassifier( - kernel="rbf", n_epoch=50, gamma="scale", random_state=1 - ), - ), - # Test a distribution that has been shifted - ( - gen_classification({"shift": 1, "hypercube": False}), - EigenProClassifier( - kernel="rbf", n_epoch=200, gamma=0.008, random_state=1 - ), - ), - # Test with many redundant features. - ( - gen_classification({"n_redundant": 18}), - EigenProClassifier( - kernel="laplace", n_epoch=100, gamma=0.0012, random_state=1 - ), - ), - ], -) -def test_classifier_accuracy(data, estimator): - """ - Test the accuracy of the EigenPro Classification on multiple - data sets with different parameter inputs. We expect that the - classification should achieve zero training error after sufficient - training time. - :param data: A tuple containing the input and output training data - :param Estimator: The classifier to do predictions with. 
- """ - X, y = data - prediction = estimator.fit(X, y).predict(X) - assert_allclose(prediction, y, rtol=5e-3) - - -def test_eigenpro_classification_duplicate_data(): - """ - Make sure that the classifier correctly handles cases - where some data is repeated. - """ - X, y = make_classification(n_features=200, n_repeated=50, random_state=1) - prediction = ( - EigenProClassifier( - kernel="rbf", n_epoch=60, gamma=0.002, random_state=1 - ) - .fit(X, y) - .predict(X) - ) - assert_allclose(prediction, y, rtol=5e-3) - - -def test_eigenpro_classification_conflict_data(): - """Make sure that the classifier doesn't crash - when given conflicting input data""" - X, y = make_classification(random_state=1) - X, y = np.concatenate([X, X]), np.concatenate([y, 1 - y]) - # Make sure we don't throw an error when fitting or predicting - EigenProClassifier(kernel="linear", n_epoch=5, random_state=1).fit( - X, y - ).predict(X) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index d05945cc..02493e1a 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -14,8 +14,9 @@ import sys from time import time from libc.math cimport exp, log, sqrt, pow, fabs +from libc.stdint cimport int32_t cimport numpy as np -from numpy.math cimport INFINITY +from libc.math cimport INFINITY # Modified from sklearn.cluster._k_means_fast.pyx @@ -66,7 +67,7 @@ cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X np.ndarray[floating, ndim=2] centers = np.zeros([n_classes, n_features], dtype = dtype) - np.ndarray[long] num_in_cluster = np.zeros(n_classes, dtype = int) + np.ndarray[floating] num_in_cluster = np.zeros(n_classes, dtype = dtype) np.ndarray[floating] inertias = np.zeros(n_samples, dtype = dtype) for i in range(n_samples): for j in range(n_features): diff --git a/sklearn_extra/robust/tests/test_mean_estimators.py b/sklearn_extra/robust/tests/test_mean_estimators.py index 2f005662..8cdca52f 100644 --- a/sklearn_extra/robust/tests/test_mean_estimators.py +++ b/sklearn_extra/robust/tests/test_mean_estimators.py @@ -1,6 +1,4 @@ import numpy as np -import pytest - from sklearn_extra.robust.mean_estimators import median_of_means, huber @@ -27,7 +25,5 @@ def test_mom(): def test_huber(): X = np.hstack([np.zeros(90), np.ones(10)]) - with pytest.warns(None) as record: - mu = huber(X, c=0.5) - assert len(record) == 0 + mu = huber(X, c=0.5) assert np.abs(mu) < 0.1 diff --git a/sklearn_extra/tests/test_common.py b/sklearn_extra/tests/test_common.py index 5b71ecf8..92c7a6c5 100644 --- a/sklearn_extra/tests/test_common.py +++ b/sklearn_extra/tests/test_common.py @@ -2,7 +2,6 @@ from sklearn.utils import estimator_checks from sklearn_extra.kernel_approximation import Fastfood -from sklearn_extra.kernel_methods import EigenProClassifier, EigenProRegressor from sklearn_extra.cluster import KMedoids, CommonNNClustering, CLARA from sklearn_extra.robust import ( RobustWeightedClassifier, @@ -15,8 +14,6 @@ Fastfood, KMedoids, CLARA, - EigenProClassifier, - EigenProRegressor, CommonNNClustering, RobustWeightedKMeans, RobustWeightedRegressor, @@ -27,12 +24,6 @@ @estimator_checks.parametrize_with_checks([cls() for cls in ALL_ESTIMATORS]) def test_all_estimators(estimator, check, request): # TODO: fix this common test failure cf #41 - if isinstance( - estimator, EigenProClassifier - ) and "function check_classifier_multioutput" in str(check): - 
request.applymarker(
-            pytest.mark.xfail(run=False, reason="See issue #41")
-        )
 
     # TODO: fix this later, ask people at sklearn to advise on it.
     if isinstance(estimator, RobustWeightedRegressor) and (
diff --git a/sklearn_extra/kernel_methods/tests/__init__.py b/sklearn_extra/utils/__init__.py
similarity index 100%
rename from sklearn_extra/kernel_methods/tests/__init__.py
rename to sklearn_extra/utils/__init__.py
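
Note on the doctest changes in sklearn_extra/cluster/_k_medoids.py above: NumPy 2.0 adopted NEP 51, which changes how scalars are printed, so expected doctest outputs now spell out the scalar type. A minimal sketch of the behaviour these expectations assume (illustrative only; the variable name is arbitrary and numpy >= 2.0 is assumed to be installed):

    # NumPy 2.x prints scalar values together with their type (NEP 51),
    # which is why the doctest expectations move from `8.0` to `np.float64(8.0)`.
    import numpy as np

    inertia = np.float64(8.0)
    print(repr(inertia))  # NumPy 2.x: np.float64(8.0); NumPy 1.x: 8.0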