diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml
index aa9803ad..c7b7d3f3 100644
--- a/.github/workflows/build-wheels.yml
+++ b/.github/workflows/build-wheels.yml
@@ -1,14 +1,7 @@
 name: build_wheels
-on: #[push, pull_request]
-  release:
-    types:
-      - created
-  workflow_dispatch:
-    inputs:
-      version:
-        description: 'Manually trigger wheel build in Github UI'
-        required: true
+on: [push, pull_request]
+
 jobs:
@@ -25,59 +18,45 @@ jobs:
       - uses: actions/setup-python@v2
         name: Install Python
         with:
-          python-version: '3.8'
+          python-version: '3.10'
 
       - name: Install cibuildwheel
         run: |
-          python -m pip install cibuildwheel==2.12.1
+          python -m pip install cibuildwheel==2.21.3 setuptools cython numpy
 
       - name: Build wheels
         env:
           # We only build for Python 3.6+. On Linux manylinux2010 is used.
           # Skipping pypy wheels for now since scipy & scikit-learn haven't build them yet.
           # Skip python3.11 for 32bit.
-          CIBW_SKIP: "pp* *-win32 *-manylinux_i686 *musllinux*"
+          CIBW_SKIP: "pp* *-win32 *-manylinux_i686 *musllinux* *cp36* *cp37*"
           CIBW_TEST_REQUIRES: "pytest pandas scikit-learn"
           CIBW_TEST_COMMAND: "pytest --pyargs sklearn_extra"
         run: |
           python -m cibuildwheel --output-dir wheelhouse
 
-      - uses: actions/upload-artifact@v2
-        with:
-          path: ./wheelhouse/*.whl
-
-  build_sdist:
-    name: sdist
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-
-      - uses: actions/setup-python@v2
-        name: Install Python
-        with:
-          python-version: '3.8'
-
-      - name: Install dependencies
-        run: pip install setuptools cython numpy
       - name: Build sdist
         run: python setup.py sdist
 
-      - uses: actions/upload-artifact@v2
-        with:
-          path: dist/*.tar.gz
-
-  upload_pypi:
-    needs: [build_wheels, build_sdist]
-    runs-on: ubuntu-latest
-    # upload to PyPI on every tag starting with 'v'
-    if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
-    steps:
-      - uses: actions/download-artifact@v2
-        with:
-          name: artifact
-          path: dist
-
-      - uses: pypa/gh-action-pypi-publish@master
+      - uses: actions/upload-artifact@v4
         with:
-          user: __token__
-          password: ${{ secrets.pypi_password }}
-          # To test:
-          repository_url: https://test.pypi.org/legacy/
+          path: |
+            ./wheelhouse/*.whl
+            ./dist/*.tar.gz
+          name: ${{ matrix.os }}
+
+  # upload_pypi:
+  #   needs: [build_wheels, build_sdist]
+  #   runs-on: ubuntu-latest
+  #   # upload to PyPI on every tag starting with 'v'
+  #   if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
+  #   steps:
+  #     - uses: actions/download-artifact@v2
+  #       with:
+  #         name: artifact
+  #         path: dist
+
+  #     - uses: pypa/gh-action-pypi-publish@master
+  #       with:
+  #         user: __token__
+  #         password: ${{ secrets.pypi_password }}
+  #         # To test:
+  #         repository_url: https://test.pypi.org/legacy/
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 60a837d1..666ee7b8 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -7,18 +7,18 @@ jobs:
     matrix:
       Python39:
         python.version: '3.9'
-        NUMPY_VERSION: "1.19.4"
-        SCIPY_VERSION: "1.5.4"
+        NUMPY_VERSION: "*"
+        SCIPY_VERSION: "*"
         SKLEARN_VERSION: "*"
       Python310:
         python.version: '3.10'
-        NUMPY_VERSION: "1.26.1"
-        SCIPY_VERSION: "1.11.3"
+        NUMPY_VERSION: "*"
+        SCIPY_VERSION: "*"
         SKLEARN_VERSION: "*"
       Python311:
-        python.version: '3.10'
-        NUMPY_VERSION: "1.26.1"
-        SCIPY_VERSION: "1.11.3"
+        python.version: '3.11'
+        NUMPY_VERSION: "*"
+        SCIPY_VERSION: "*"
         SKLEARN_VERSION: "*"
 
 variables:
diff --git a/benchmarks/_bench/eigenpro_plot_mnist.py b/benchmarks/_bench/eigenpro_plot_mnist.py
deleted file mode 100644
index 77009842..00000000
---
a/benchmarks/_bench/eigenpro_plot_mnist.py +++ /dev/null @@ -1,107 +0,0 @@ -import matplotlib -import matplotlib.pyplot as plt -import numpy as np -from time import time - -from sklearn_extra.kernel_methods import EigenProClassifier -from sklearn.svm import SVC -from sklearn.datasets import fetch_openml - -rng = np.random.RandomState(1) - -# Generate sample data from mnist -mnist = fetch_openml("mnist_784") -mnist.data = mnist.data / 255.0 -print("Data has loaded") - -p = rng.permutation(60000) -x_train = mnist.data[p] -y_train = np.int32(mnist.target[p]) -x_test = mnist.data[60000:] -y_test = np.int32(mnist.target[60000:]) - -# Run tests comparing eig to svc -eig_fit_times = [] -eig_pred_times = [] -eig_err = [] -svc_fit_times = [] -svc_pred_times = [] -svc_err = [] - -train_sizes = [500, 1000, 2000, 5000, 10000, 20000, 40000, 60000] - -gamma = 0.02 -# Fit models to data -for train_size in train_sizes: - for name, estimator in [ - ( - "EigenPro", - EigenProClassifier(n_epoch=2, gamma=gamma, random_state=rng), - ), - ("SupportVector", SVC(C=5, gamma=gamma, random_state=rng)), - ]: - stime = time() - estimator.fit(x_train[:train_size], y_train[:train_size]) - fit_t = time() - stime - - stime = time() - y_pred_test = estimator.predict(x_test) - pred_t = time() - stime - - err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test) - if name == "EigenPro": - eig_fit_times.append(fit_t) - eig_pred_times.append(pred_t) - eig_err.append(err) - else: - svc_fit_times.append(fit_t) - svc_pred_times.append(pred_t) - svc_err.append(err) - print( - "%s Classification with %i training samples in %0.2f seconds." - "Test error %.4f" % (name, train_size, fit_t + pred_t, err) - ) - -# set up grid for figures -fig = plt.figure(num=None, figsize=(6, 4), dpi=160) -ax = plt.subplot2grid((2, 2), (0, 0), rowspan=2) -train_size_labels = ["500", "1k", "2k", "5k", "10k", "20k", "40k", "60k"] - -# Graph fit(train) time -ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) -ax.plot(train_sizes, svc_fit_times, "o--", color="g", label="SVC") -ax.plot(train_sizes, eig_fit_times, "o-", color="r", label="EigenPro") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_xlabel("train size") -ax.set_ylabel("time (seconds)") -ax.legend() -ax.set_title("Train set") -ax.set_xticks(train_sizes) -ax.set_xticks([], minor=True) -ax.set_xticklabels(train_size_labels) - -# Graph prediction(test) time -ax = plt.subplot2grid((2, 2), (0, 1), rowspan=1) -ax.plot(train_sizes, eig_pred_times, "o-", color="r") -ax.plot(train_sizes, svc_pred_times, "o--", color="g") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_ylabel("time (seconds)") -ax.set_title("Test set") -ax.set_xticks(train_sizes) -ax.set_xticks([], minor=True) -ax.set_xticklabels(train_size_labels) - -# Graph training error -ax = plt.subplot2grid((2, 2), (1, 1), rowspan=1) -ax.plot(train_sizes, eig_err, "o-", color="r") -ax.plot(train_sizes, svc_err, "o-", color="g") -ax.set_xscale("log") -ax.set_xticks(train_sizes) -ax.set_xticklabels(train_size_labels) -ax.set_xticks([], minor=True) -ax.set_xlabel("train size") -ax.set_ylabel("classification error %") -plt.tight_layout() -plt.show() diff --git a/benchmarks/_bench/eigenpro_plot_noisy_mnist.py b/benchmarks/_bench/eigenpro_plot_noisy_mnist.py deleted file mode 100644 index 939e9aff..00000000 --- a/benchmarks/_bench/eigenpro_plot_noisy_mnist.py +++ /dev/null @@ -1,112 +0,0 @@ -import matplotlib -import matplotlib.pyplot as plt -import numpy as np -from time import time - -from 
sklearn.datasets import fetch_openml -from sklearn_extra.kernel_methods import EigenProClassifier -from sklearn.svm import SVC - -rng = np.random.RandomState(1) - -# Generate sample data from mnist -mnist = fetch_openml("mnist_784") -mnist.data = mnist.data / 255.0 - -p = rng.permutation(60000) -x_train = mnist.data[p][:60000] -y_train = np.int32(mnist.target[p][:60000]) -x_test = mnist.data[60000:] -y_test = np.int32(mnist.target[60000:]) - -# randomize 20% of labels -p = rng.choice(len(y_train), np.int32(len(y_train) * 0.2), False) -y_train[p] = rng.choice(10, np.int32(len(y_train) * 0.2)) -p = rng.choice(len(y_test), np.int32(len(y_test) * 0.2), False) -y_test[p] = rng.choice(10, np.int32(len(y_test) * 0.2)) - -# Run tests comparing fkc to svc -eig_fit_times = [] -eig_pred_times = [] -eig_err = [] -svc_fit_times = [] -svc_pred_times = [] -svc_err = [] - -train_sizes = [500, 1000, 2000, 5000, 10000, 20000, 40000, 60000] - -gamma = 0.02 - -# Fit models to data -for train_size in train_sizes: - for name, estimator in [ - ( - "EigenPro", - EigenProClassifier(n_epoch=2, gamma=gamma, random_state=rng), - ), - ("SupportVector", SVC(C=5, gamma=gamma)), - ]: - stime = time() - estimator.fit(x_train[:train_size], y_train[:train_size]) - fit_t = time() - stime - - stime = time() - y_pred_test = estimator.predict(x_test) - pred_t = time() - stime - err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test) - if name == "EigenPro": - eig_fit_times.append(fit_t) - eig_pred_times.append(pred_t) - eig_err.append(err) - else: - svc_fit_times.append(fit_t) - svc_pred_times.append(pred_t) - svc_err.append(err) - print( - "%s Classification with %i training samples in %0.2f seconds. " - "Test error %.4f" % (name, train_size, fit_t + pred_t, err) - ) - -# set up grid for figures -fig = plt.figure(num=None, figsize=(6, 4), dpi=160) -ax = plt.subplot2grid((2, 2), (0, 0), rowspan=2) -train_size_labels = ["500", "1k", "2k", "5k", "10k", "20k", "40k", "60k"] - -# Graph fit(train) time -ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) -ax.plot(train_sizes, svc_fit_times, "o--", color="g", label="SVC") -ax.plot(train_sizes, eig_fit_times, "o-", color="r", label="EigenPro") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_xlabel("train size") -ax.set_ylabel("time (seconds)") -ax.legend() -ax.set_title("Train set") -ax.set_xticks(train_sizes) -ax.set_xticks([], minor=True) -ax.set_xticklabels(train_size_labels) - -# Graph prediction(test) time -ax = plt.subplot2grid((2, 2), (0, 1), rowspan=1) -ax.plot(train_sizes, eig_pred_times, "o-", color="r") -ax.plot(train_sizes, svc_pred_times, "o--", color="g") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_ylabel("time (seconds)") -ax.set_title("Test set") -ax.set_xticks(train_sizes) -ax.set_xticks([], minor=True) -ax.set_xticklabels(train_size_labels) - -# Graph training error -ax = plt.subplot2grid((2, 2), (1, 1), rowspan=1) -ax.plot(train_sizes, eig_err, "o-", color="r") -ax.plot(train_sizes, svc_err, "o-", color="g") -ax.set_xscale("log") -ax.set_xticks(train_sizes) -ax.set_xticklabels(train_size_labels) -ax.set_xticks([], minor=True) -ax.set_xlabel("train size") -ax.set_ylabel("classification error %") -plt.tight_layout() -plt.show() diff --git a/benchmarks/_bench/eigenpro_plot_synthetic.py b/benchmarks/_bench/eigenpro_plot_synthetic.py deleted file mode 100644 index 155ba985..00000000 --- a/benchmarks/_bench/eigenpro_plot_synthetic.py +++ /dev/null @@ -1,117 +0,0 @@ -import matplotlib -import 
matplotlib.pyplot as plt -import numpy as np -from time import time - -from sklearn.datasets import make_classification -from sklearn_extra.kernel_methods import EigenProClassifier -from sklearn.svm import SVC - -rng = np.random.RandomState(1) - -max_size = 50000 -test_size = 10000 - -# Get data for testing - -x, y = make_classification( - n_samples=max_size + test_size, - n_features=400, - n_informative=6, - random_state=rng, -) - -x_train = x[:max_size] -y_train = y[:max_size] -x_test = x[max_size:] -y_test = y[max_size:] - -eig_fit_times = [] -eig_pred_times = [] -eig_err = [] -svc_fit_times = [] -svc_pred_times = [] -svc_err = [] - -train_sizes = [2000, 5000, 10000, 20000, 50000] - -gamma = 0.005 -for train_size in train_sizes: - for name, estimator in [ - ( - "EigenPro", - EigenProClassifier( - n_epoch=3, - gamma=gamma, - n_components=30, - subsample_size=1000, - random_state=rng, - ), - ), - ("SupportVector", SVC(C=5, gamma=gamma)), - ]: - stime = time() - estimator.fit(x_train[:train_size], y_train[:train_size]) - fit_t = time() - stime - - stime = time() - y_pred_test = estimator.predict(x_test) - pred_t = time() - stime - - err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test) - if name == "EigenPro": - eig_fit_times.append(fit_t) - eig_pred_times.append(pred_t) - eig_err.append(err) - else: - svc_fit_times.append(fit_t) - svc_pred_times.append(pred_t) - svc_err.append(err) - print( - "%s Classification with %i training samples in %0.2f seconds." - % (name, train_size, fit_t + pred_t) - ) - -# set up grid for figures -fig = plt.figure(num=None, figsize=(6, 4), dpi=160) -ax = plt.subplot2grid((2, 2), (0, 0), rowspan=2) -train_size_labels = [str(s) for s in train_sizes] - -# Graph fit(train) time -ax.plot(train_sizes, svc_fit_times, "o--", color="g", label="SVC") -ax.plot(train_sizes, eig_fit_times, "o-", color="r", label="FKC (EigenPro)") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_xlabel("train size") -ax.set_ylabel("time (seconds)") - -ax.legend() -ax.set_title("Train set") -ax.set_xticks(train_sizes) -ax.set_xticklabels(train_size_labels) -ax.set_xticks([], minor=True) -ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) - -# Graph prediction(test) time -ax = plt.subplot2grid((2, 2), (0, 1), rowspan=1) -ax.plot(train_sizes, eig_pred_times, "o-", color="r") -ax.plot(train_sizes, svc_pred_times, "o--", color="g") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_ylabel("time (seconds)") -ax.set_title("Test set") -ax.set_xticks([]) -ax.set_xticks([], minor=True) - -# Graph training error -ax = plt.subplot2grid((2, 2), (1, 1), rowspan=1) -ax.plot(train_sizes, eig_err, "o-", color="r") -ax.plot(train_sizes, svc_err, "o-", color="g") -ax.set_xscale("log") -ax.set_xticks(train_sizes) -ax.set_xticklabels(train_size_labels) -ax.set_xticks([], minor=True) -ax.set_xlabel("train size") -ax.set_ylabel("classification error %") -plt.tight_layout() -plt.show() diff --git a/doc/api.rst b/doc/api.rst index 25fc8ed8..1d0af0a4 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -13,15 +13,6 @@ Kernel approximation kernel_approximation.Fastfood -EigenPro -======== - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - kernel_methods.EigenProRegressor - kernel_methods.EigenProClassifier Clustering ==================== diff --git a/doc/modules/eigenpro.rst b/doc/modules/eigenpro.rst deleted file mode 100644 index bd7535c9..00000000 --- a/doc/modules/eigenpro.rst +++ /dev/null @@ -1,62 +0,0 @@ -.. 
_eigenpro: - -========================================== -EigenPro for Regression and Classification -========================================== - -.. currentmodule:: sklearn_extra.kernel_methods - -*EigenPro iteration* [MB17]_ is a very efficient implementation of kernel -regression/classification that uses an optimization method based on -preconditioned stochastic gradient descent. It essentially implements a -"ridgeless" kernel regression. Regularization, when necessary, can be -achieved by early stopping. - -Optimization parameters, such as step size, batch size, and the size of the preconditioning -block are chosen automatically and optimally. (They can also be set up manually.) -This results in a simple and user-friendly interface. - -Next, we present several experimental results using a server equipped with one -Intel Xeon E5-1620 CPU. -The figure below compares the EigenPro Classifier and the Support Vector -Classifier (:class:`SVC`) on MNIST digits classification task. -We see that EigenPro and SVC give competitive and similar accuracy on test set. -Notably, on the full MNIST training and testing using EigenPro are -approximately 2 times and 5 times faster than that using SVC, respectively. - -.. |mnist| image:: ../images/eigenpro_mnist.png - :target: ../auto_examples/eigenpro/eigenpro_mnist.html - :scale: 70 - -.. centered:: |mnist| - -We then repeat the same experiments on MNIST with added label noise. -Specifically, we randomly reset the label (0-9) of 20% samples. -We see that EigenPro has a significant advantage over SVC -on this noisy MNIST. Training and testing using EigenPro are -both 10 to 20 times faster than they are when using SVC. - -.. |mnist_noisy| image:: ../images/eigenpro_mnist_noisy.png - :target: ../auto_examples/eigenpro/eigenpro_mnist_noisy.html - :scale: 70 - -.. centered:: |mnist_noisy| - - -The next figure compares the two methods on a binary classification problem -with 400 synthetic features. Again, EigenPro demonstrates 10~20 times -acceleration on training and testing without loss of accuracy. - -.. |synthetic| image:: ../images/eigenpro_synthetic.png - :target: ../auto_examples/eigenpro/eigenpro_synthetic.html - :scale: 70 - -.. centered:: |synthetic| - - -.. topic:: References: - - .. [MB17] Siyuan Ma and Mikhail Belkin, - `"Diving into the shallows: a computational perspective on large-scale shallow learning" - `_, - Advances in Neural Information Processing Systems, 2017. diff --git a/doc/modules/kernel_approximation.rst b/doc/modules/kernel_approximation.rst index b234d691..e0b2231e 100644 --- a/doc/modules/kernel_approximation.rst +++ b/doc/modules/kernel_approximation.rst @@ -29,6 +29,3 @@ mapping a single example is O(n_components log d). The space complexity is O(n_components). See `scikit-learn User-guide `_ for more general informations on kernel approximations. - -See also :class:`EigenProRegressor ` and :class:`EigenProClassifier ` for another -way to compute fast kernel methods algorithms. diff --git a/doc/user_guide.rst b/doc/user_guide.rst index 0c90c2e8..9c715375 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -10,7 +10,6 @@ User guide .. 
toctree:: :numbered: - modules/eigenpro.rst modules/cluster.rst modules/robust.rst modules/kernel_approximation.rst diff --git a/examples/cluster/README.txt b/examples/cluster/README.txt index ad0ebf6a..0dfd5871 100644 --- a/examples/cluster/README.txt +++ b/examples/cluster/README.txt @@ -3,4 +3,4 @@ Cluster ======= -Examples concerning the :mod:`sklearn_extra.kernel_methods.cluster` module. +Examples concerning the :mod:`sklearn_extra.cluster` module. diff --git a/examples/eigenpro/README.txt b/examples/eigenpro/README.txt deleted file mode 100644 index 4ed1fb41..00000000 --- a/examples/eigenpro/README.txt +++ /dev/null @@ -1,6 +0,0 @@ -.. _eigenpro_examples: - -Eigenpro -======== - -Examples concerning the :mod:`sklearn_extra.kernel_methods.eigenpro` module. diff --git a/examples/eigenpro/plot_eigenpro_synthetic.py b/examples/eigenpro/plot_eigenpro_synthetic.py deleted file mode 100644 index 802f8a57..00000000 --- a/examples/eigenpro/plot_eigenpro_synthetic.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -====================================================== -Comparison of EigenPro and SVC on Digit Classification -====================================================== - -Here we train a EigenPro Classifier and a Support -Vector Classifier (SVC) on a synthetically generated -binary classification problem. We halt the training -of EigenPro after two epochs. -While EigenPro is slower on low dimensional datasets, as -the number of features exceeds 500, it begins to outperform -SVM and shows more stability. -""" -print(__doc__) - -import matplotlib -import matplotlib.pyplot as plt -import numpy as np -from time import time - -from sklearn.datasets import make_classification -from sklearn_extra.kernel_methods import EigenProClassifier -from sklearn.svm import SVC - -rng = np.random.RandomState(1) - -train_size = 2000 -test_size = 1000 - -# Run tests comparing eig to svc -eig_fit_times = [] -eig_pred_times = [] -eig_err = [] -svc_fit_times = [] -svc_pred_times = [] -svc_err = [] - -feature_counts = [20, 50, 150, 500, 1500] -gamma = 0.008 - -# Fit models to data -for n_features in feature_counts: - x, y = make_classification( - n_samples=train_size + test_size, - n_features=n_features, - random_state=rng, - ) - - x_train = x[:train_size] - y_train = y[:train_size] - x_test = x[train_size:] - y_test = y[train_size:] - for name, estimator in [ - ( - "EigenPro", - EigenProClassifier( - n_epoch=2, gamma=gamma, n_components=400, random_state=rng - ), - ), - ("SupportVector", SVC(gamma=gamma, random_state=rng)), - ]: - stime = time() - estimator.fit(x_train, y_train) - fit_t = time() - stime - - stime = time() - y_pred_test = estimator.predict(x_test) - pred_t = time() - stime - - err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test) - if name == "EigenPro": - eig_fit_times.append(fit_t) - eig_pred_times.append(pred_t) - eig_err.append(err) - else: - svc_fit_times.append(fit_t) - svc_pred_times.append(pred_t) - svc_err.append(err) - print( - "%s Classification with %i features in %0.2f seconds. 
Error: %0.1f" - % (name, n_features, fit_t + pred_t, err) - ) - -# set up grid for figures -fig = plt.figure(num=None, figsize=(6, 4), dpi=160) -ax = plt.subplot2grid((2, 2), (0, 0), rowspan=2) - -# Graph fit(train) time -feature_number_labels = [str(s) for s in feature_counts] -ax.plot(feature_counts, svc_fit_times, "o--", color="g", label="SVC") -ax.plot( - feature_counts, eig_fit_times, "o-", color="r", label="EigenPro Classifier" -) -ax.set_xscale("log") -ax.set_yscale("log", nonpositive="clip") -ax.set_xlabel("Number of features") -ax.set_ylabel("time (seconds)") -ax.legend() -ax.set_title("Training Time") -ax.set_xticks(feature_counts) -ax.set_xticklabels(feature_number_labels) -ax.set_xticks([], minor=True) -ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) - -# Graph prediction(test) time -ax = plt.subplot2grid((2, 2), (0, 1), rowspan=1) -ax.plot(feature_counts, eig_pred_times, "o-", color="r") -ax.plot(feature_counts, svc_pred_times, "o--", color="g") -ax.set_xscale("log") -ax.set_yscale("log", nonpositive="clip") -ax.set_ylabel("time (seconds)") -ax.set_title("Prediction Time") -ax.set_xticks([]) -ax.set_xticks([], minor=True) - -# Graph training error -ax = plt.subplot2grid((2, 2), (1, 1), rowspan=1) -ax.plot(feature_counts, eig_err, "o-", color="r") -ax.plot(feature_counts, svc_err, "o-", color="g") -ax.set_xscale("log") -ax.set_xticks(feature_counts) -ax.set_xticklabels(feature_number_labels) -ax.set_xticks([], minor=True) -ax.set_xlabel("Number of features") -ax.set_ylabel("Classification error %") -plt.tight_layout() -plt.show() diff --git a/examples/kernel_approximation/README.txt b/examples/kernel_approximation/README.txt index 5ea04362..27fcac09 100644 --- a/examples/kernel_approximation/README.txt +++ b/examples/kernel_approximation/README.txt @@ -3,5 +3,5 @@ Kernel approximation ==================== -Examples concerning the :mod:`sklearn_extra.kernel_methods.kernel_approximation` +Examples concerning the :mod:`sklearn_extra.kernel_approximation` module. diff --git a/examples/robust/README.txt b/examples/robust/README.txt index 526c9400..5ee474b3 100644 --- a/examples/robust/README.txt +++ b/examples/robust/README.txt @@ -3,4 +3,4 @@ Robust ====== -Examples concerning the :mod:`sklearn_extra.kernel_methods.robust` module. +Examples concerning the :mod:`sklearn_extra.robust` module. diff --git a/pyproject.toml b/pyproject.toml index 24b7dfba..64ef0bda 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,12 +4,7 @@ requires = [ "setuptools", "wheel", "Cython>=0.28.5", - - # use oldest-supported-numpy which provides the oldest numpy version with - # wheels on PyPI - # - # see: https://github.com/scipy/oldest-supported-numpy/blob/master/setup.cfg - "oldest-supported-numpy" + "numpy" ] [tool.black] diff --git a/setup.py b/setup.py index f3e94be9..c79b4cc2 100755 --- a/setup.py +++ b/setup.py @@ -48,6 +48,7 @@ "tests": ["pytest", "pytest-cov"], "docs": [ "pillow", + "pandas", "sphinx", "sphinx-gallery", "sphinx_rtd_theme", diff --git a/sklearn_extra/__init__.py b/sklearn_extra/__init__.py index b855d4eb..910ceef6 100644 --- a/sklearn_extra/__init__.py +++ b/sklearn_extra/__init__.py @@ -1,4 +1,4 @@ -from . import kernel_approximation, kernel_methods # noqa +from . 
import kernel_approximation # noqa from ._version import __version__ diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index bb5165ba..a4087510 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -121,7 +121,7 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): array([[1., 2.], [4., 2.]]) >>> kmedoids.inertia_ - 8.0 + np.float64(8.0) See scikit-learn-extra/examples/plot_kmedoids_digits.py for examples of KMedoids with various distance metrics. @@ -595,7 +595,7 @@ class CLARA(BaseEstimator, ClusterMixin, TransformerMixin): >>> clara.predict([[0,0], [4,4]]) array([0, 1]) >>> clara.inertia_ - 122.44919397611667 + np.float64(122.44919397611667) References ---------- diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py index 30f419a0..9af8943d 100644 --- a/sklearn_extra/cluster/tests/test_k_medoids.py +++ b/sklearn_extra/cluster/tests/test_k_medoids.py @@ -405,12 +405,10 @@ def test_clara_consistency_iris(): def test_seuclidean(): - with pytest.warns(None) as record: - km = KMedoids(2, metric="seuclidean", method="pam") - km.fit(np.array([0, 0, 0, 1]).reshape((4, 1))) - km.predict(np.array([0, 0, 0, 1]).reshape((4, 1))) - km.transform(np.array([0, 0, 0, 1]).reshape((4, 1))) - assert len(record) == 0 + km = KMedoids(2, metric="seuclidean", method="pam") + km.fit(np.array([0, 0, 0, 1]).reshape((4, 1))) + km.predict(np.array([0, 0, 0, 1]).reshape((4, 1))) + km.transform(np.array([0, 0, 0, 1]).reshape((4, 1))) def test_medoids_indices(): diff --git a/sklearn_extra/kernel_methods/__init__.py b/sklearn_extra/kernel_methods/__init__.py deleted file mode 100644 index 53be76dc..00000000 --- a/sklearn_extra/kernel_methods/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from ._eigenpro import BaseEigenPro, EigenProClassifier, EigenProRegressor - -__all__ = ["BaseEigenPro", "EigenProClassifier", "EigenProRegressor"] diff --git a/sklearn_extra/kernel_methods/_eigenpro.py b/sklearn_extra/kernel_methods/_eigenpro.py deleted file mode 100644 index 3016c491..00000000 --- a/sklearn_extra/kernel_methods/_eigenpro.py +++ /dev/null @@ -1,670 +0,0 @@ -# Authors: Alex Li <7Alex7Li@gmail.com> -# Siyuan Ma - -import numpy as np -from scipy.linalg import eigh, LinAlgError -from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin -from sklearn.metrics.pairwise import pairwise_kernels, euclidean_distances -from sklearn.utils import check_random_state -from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.validation import check_is_fitted, check_X_y - - -class BaseEigenPro(BaseEstimator): - """ - Base class for EigenPro iteration. - """ - - def __init__( - self, - batch_size="auto", - n_epoch=2, - n_components=1000, - subsample_size="auto", - kernel="rbf", - gamma="scale", - degree=3, - coef0=1, - kernel_params=None, - random_state=None, - ): - self.batch_size = batch_size - self.n_epoch = n_epoch - self.n_components = n_components - self.subsample_size = subsample_size - self.kernel = kernel - self.gamma = gamma - self.degree = degree - self.coef0 = coef0 - self.kernel_params = kernel_params - self.random_state = random_state - - def _kernel(self, X, Y): - """Calculate the kernel matrix - - Parameters - --------- - X : {float, array}, shape = [n_samples, n_features] - Input data. - - Y : {float, array}, shape = [n_centers, n_targets] - Kernel centers. 
- - Returns - ------- - K : {float, array}, shape = [n_samples, n_centers] - Kernel matrix. - """ - if ( - self.kernel != "rbf" - and self.kernel != "laplace" - and self.kernel != "cauchy" - ): - if callable(self.kernel): - params = self.kernel_params or {} - else: - params = { - "gamma": self.gamma_, - "degree": self.degree, - "coef0": self.coef0, - } - return pairwise_kernels( - X, Y, metric=self.kernel, filter_params=True, **params - ) - distance = euclidean_distances(X, Y, squared=True) - bandwidth = np.float32(1.0 / np.sqrt(2.0 * self.gamma_)) - if self.kernel == "rbf": - distance = -self.gamma_ * distance - K = np.exp(distance) - elif self.kernel == "laplace": - d = np.maximum(distance, 0) - K = np.exp(-np.sqrt(d) / bandwidth) - else: # self.kernel == "cauchy": - K = 1 / (1 + 2.0 * self.gamma_ * distance) - return K - - def _nystrom_svd(self, X, n_components): - """Compute the top eigensystem of a kernel - operator using Nystrom method - - Parameters - ---------- - X : {float, array}, shape = [n_subsamples, n_features] - Subsample feature matrix. - - n_components : int - Number of top eigencomponents to be restored. - - Returns - ------- - E : {float, array}, shape = [k] - Top eigenvalues. - - Lambda : {float, array}, shape = [n_subsamples, k] - Top eigenvectors of a subsample kernel matrix (which can be - directly used to approximate the eigenfunctions of the kernel - operator). - """ - m, _ = X.shape - K = self._kernel(X, X) - - W = K / m - try: - E, Lambda = eigh(W, eigvals=(m - n_components, m - 1)) - except LinAlgError: - # Use float64 when eigh fails due to precision - W = np.float64(W) - E, Lambda = eigh(W, eigvals=(m - n_components, m - 1)) - E, Lambda = np.float32(E), np.float32(Lambda) - # Flip so eigenvalues are in descending order. - E = np.maximum(np.float32(1e-7), np.flipud(E)) - Lambda = np.fliplr(Lambda)[:, :n_components] / np.sqrt( - m, dtype="float32" - ) - - return E, Lambda - - def _setup(self, feat, max_components, mG, alpha): - """Compute preconditioner and scale factors for EigenPro iteration - - Parameters - ---------- - feat : {float, array}, shape = [n_samples, n_features] - Feature matrix (normally from training data). - - max_components : int - Maximum number of components to be used in EigenPro iteration. - - mG : int - Maximum batch size to fit in memory. - - alpha : float - Exponential factor (< 1) for eigenvalue ratio. - - Returns - ------- - max_S : float - Normalized largest eigenvalue. - - max_kxx : float - Maximum of k(x,x) where k is the EigenPro kernel. - - E : {float, array}, shape = [k] - Preconditioner for EigenPro - - Lambda : {float, array}, shape = [n_subsamples, k] - Top eigenvectors of a subsample kernel matrix - """ - alpha = np.float32(alpha) - - # Estimate eigenvalues (S) and eigenvectors (V) of the kernel matrix - # corresponding to the feature matrix. - E, Lambda = self._nystrom_svd(feat, max_components) - n_subsamples = feat.shape[0] - - # Calculate the number of components to be used such that the - # corresponding batch size is bounded by the subsample size and the - # memory size. 
- max_bs = min(max(n_subsamples / 5, mG), n_subsamples) - n_components = np.sum(np.power(1 / E, alpha) < max_bs) - 1 - if n_components < 2: - n_components = min(E.shape[0] - 1, 2) - - Lambda = Lambda[:, :n_components] - scale = np.power(E[0] / E[n_components], alpha) - - # Compute part of the preconditioner for step 2 of gradient descent in - # the eigenpro model - D = (1 - np.power(E[n_components] / E[:n_components], alpha)) / E[ - :n_components - ] - - max_S = E[0].astype(np.float32) - kxx = 1 - np.sum(Lambda**2, axis=1) * n_subsamples - return max_S / scale, np.max(kxx), D, Lambda - - def _initialize_params(self, X, Y, random_state): - """ - Validate parameters passed to the model, choose parameters - that have not been passed in, and run setup for EigenPro iteration. - Parameters - ---------- - X : {float, array}, shape = [n_samples, n_features] - Training data. - - Y : {float, array}, shape = [n_samples, n_targets] - Training targets. - - random_state : RandomState instance - The random state to use for random number generation - - Returns - ------- - Y : {float, array}, shape = [n_samples, n_targets] - Training targets. If Y was originally of shape - [n_samples], it is now [n_samples, 1]. - - E : {float, array}, shape = [k] - Preconditioner for EigenPro - - Lambda : {float, array}, shape = [n_subsamples, k] - Top eigenvectors of a subsample kernel matrix - - eta : float - The learning rate - - pinx : {int, array}, shape = [sample_size] - The rows of X used to calculate E and Lambda - """ - n, d = X.shape - n_label = 1 if len(Y.shape) == 1 else Y.shape[1] - self.centers_ = X - - # Calculate the subsample size to be used. - if self.subsample_size == "auto": - if n < 100000: - sample_size = 4000 - else: - sample_size = 12000 - else: - sample_size = self.subsample_size - sample_size = min(n, sample_size) - - n_components = min(sample_size - 1, self.n_components) - n_components = max(1, n_components) - - # Approximate amount of memory that we want to use - mem_bytes = 0.1 * 1024**3 - # Memory used with a certain sample size - mem_usages = (d + n_label + 2 * np.arange(sample_size)) * n * 4 - mG = np.int32(np.sum(mem_usages < mem_bytes)) - - # Calculate largest eigenvalue and max{k(x,x)} using subsamples. - pinx = random_state.choice(n, sample_size, replace=False).astype( - "int32" - ) - if self.gamma == "scale": - self.gamma_ = np.float32(1.0 / (X.var() * d)) - else: - self.gamma_ = self.gamma - max_S, beta, E, Lambda = self._setup( - X[pinx], n_components, mG, alpha=0.95 - ) - # Calculate best batch size. - if self.batch_size == "auto": - bs = min(np.int32(beta / max_S), mG) + 1 - else: - bs = self.batch_size - self.bs_ = min(bs, n) - - # Calculate best step size. - if self.bs_ < beta / max_S + 1: - eta = self.bs_ / beta - elif self.bs_ < n: - eta = 2.0 * self.bs_ / (beta + (self.bs_ - 1) * max_S) - else: - eta = 0.95 * 2 / max_S - # Remember the shape of Y for predict() and ensure it's shape is 2-D. - self.was_1D_ = False - if len(Y.shape) == 1: - Y = np.reshape(Y, (Y.shape[0], 1)) - self.was_1D_ = True - return Y, E, Lambda, np.float32(eta), pinx - - def validate_parameters(self): - """ - Validate the parameters of the model to ensure that no unreasonable - values were passed in. 
- """ - if self.n_epoch <= 0: - raise ValueError( - "n_epoch should be positive, was " + str(self.n_epoch) - ) - if self.n_components < 0: - raise ValueError( - "n_components should be non-negative, was " - + str(self.n_components) - ) - if self.subsample_size != "auto" and self.subsample_size < 0: - raise ValueError( - "subsample_size should be non-negative, was " - + str(self.subsample_size) - ) - if self.batch_size != "auto" and self.batch_size <= 0: - raise ValueError( - "batch_size should be positive, was " + str(self.batch_size) - ) - if self.gamma != "scale" and self.gamma <= 0: - raise ValueError( - "gamma should be positive, was " + str(self.gamma) - ) - - def _raw_fit(self, X, Y): - """Train eigenpro regression model - - Parameters - ---------- - X : {float, array}, shape = [n_samples, n_features] - Training data. - - Y : {float, array}, shape = [n_samples, n_targets] - Training targets. - - Returns - ------- - self : returns an instance of self. - """ - X, Y = check_X_y( - X, - Y, - dtype=np.float32, - multi_output=True, - ensure_min_samples=3, - y_numeric=True, - ) - self.n_features_in_ = X.shape[1] - Y = Y.astype(np.float32) - random_state = check_random_state(self.random_state) - - self.validate_parameters() - """Parameter Initialization""" - Y, D, V, eta, pinx = self._initialize_params(X, Y, random_state) - - """Training loop""" - n = self.centers_.shape[0] - - self.coef_ = np.zeros((n, Y.shape[1]), dtype=np.float32) - step = np.float32(eta / self.bs_) - for _ in range(0, self.n_epoch): - epoch_inds = random_state.choice( - n, n // self.bs_ * self.bs_, replace=False - ).astype("int32") - - for batch_inds in np.array_split(epoch_inds, n // self.bs_): - batch_x = self.centers_[batch_inds] - kfeat = self._kernel(batch_x, self.centers_) - batch_y = Y[batch_inds] - - # Update 1: Sampled Coordinate Block. - gradient = np.dot(kfeat, self.coef_) - batch_y - - self.coef_[batch_inds] -= step * gradient - - # Update 2: Fixed Coordinate Block - delta = np.dot( - V * D, np.dot(V.T, np.dot(kfeat[:, pinx].T, gradient)) - ) - self.coef_[pinx] += step * delta - return self - - def _raw_predict(self, X): - """Predict using the kernel regression model - - Parameters - ---------- - X : {float, array}, shape = [n_samples, n_features] - Samples. - - Returns - ------- - Y : {float, array}, shape = [n_samples, n_targets] - Predicted targets. - """ - check_is_fitted( - self, ["bs_", "centers_", "coef_", "was_1D_", "gamma_"] - ) - X = np.asarray(X, dtype=np.float64) - - if len(X.shape) == 1: - raise ValueError( - "Reshape your data. X should be a matrix of shape" - " (n_samples, n_features)." - ) - n = X.shape[0] - - Ys = [] - for batch_inds in np.array_split(range(n), max(1, n // self.bs_)): - batch_x = X[batch_inds] - kfeat = self._kernel(batch_x, self.centers_) - - pred = np.dot(kfeat, self.coef_) - Ys.append(pred) - Y = np.vstack(Ys) - if self.was_1D_: - Y = np.reshape(Y, Y.shape[0]) - return Y - - def _get_tags(self): - tags = super()._get_tags() - tags["multioutput"] = True - return tags - - -class EigenProRegressor(RegressorMixin, BaseEigenPro): - """Regression using EigenPro iteration. - - Train least squared kernel regression model with mini-batch EigenPro - iteration. - - Parameters - ---------- - batch_size : int, default = 'auto' - Mini-batch size for gradient descent. - - n_epoch : int, default = 2 - The number of passes over the training data. - - n_components : int, default = 1000 - the maximum number of eigendirections used in modifying the kernel - operator. 
Convergence rate speedup over normal gradient descent is - approximately the largest eigenvalue over the n_componentth - eigenvalue, however, it may take time to compute eigenvalues for - large n_components - - subsample_size : int, default = 'auto' - The number of subsamples used for estimating the largest - n_component eigenvalues and eigenvectors. When it is set to 'auto', - it will be 4000 if there are less than 100,000 samples - (for training), and otherwise 12000. - - kernel : string or callable, default = "rbf" - Kernel mapping used internally. Strings can be anything supported - by scikit-learn, however, there is special support for the - rbf, laplace, and cauchy kernels. If a callable is given, it should - accept two arguments and return a floating point number. - - gamma : float, default='scale' - Kernel coefficient. If 'scale', gamma = 1/(n_features*X.var()). - Interpretation of the default value is left to the kernel; - see the documentation for sklearn.metrics.pairwise. - For kernels that use bandwidth, bandwidth = 1/sqrt(2*gamma). - - degree : float, default=3 - Degree of the polynomial kernel. Ignored by other kernels. - - coef0 : float, default=1 - Zero coefficient for polynomial and sigmoid kernels. - Ignored by other kernels. - - kernel_params : mapping of string to any - Additional parameters (keyword arguments) for kernel function - passed as callable object. - - random_state : int, RandomState instance or None, (default=None) - The seed of the pseudo random number generator to use when - shuffling the data. If int, random_state is the seed used by the - random number generator; If RandomState instance, random_state is - the random number generator; If None, the random number generator - is the RandomState instance used by `np.random`. - - References - ---------- - * Siyuan Ma, Mikhail Belkin - "Diving into the shallows: a computational perspective on - large-scale machine learning", NIPS 2017. - - Examples - -------- - >>> from sklearn_extra.kernel_methods import EigenProRegressor - >>> import numpy as np - >>> n_samples, n_features, n_targets = 4000, 20, 3 - >>> rng = np.random.RandomState(1) - >>> x_train = rng.randn(n_samples, n_features) - >>> y_train = rng.randn(n_samples, n_targets) - >>> rgs = EigenProRegressor(n_epoch=3, gamma=.5, subsample_size=50) - >>> rgs.fit(x_train, y_train) - EigenProRegressor(gamma=0.5, n_epoch=3, subsample_size=50) - >>> y_pred = rgs.predict(x_train) - >>> loss = np.mean(np.square(y_train - y_pred)) - """ - - def __init__( - self, - batch_size="auto", - n_epoch=2, - n_components=1000, - subsample_size="auto", - kernel="rbf", - gamma="scale", - degree=3, - coef0=1, - kernel_params=None, - random_state=None, - ): - super().__init__( - batch_size=batch_size, - n_epoch=n_epoch, - n_components=n_components, - subsample_size=subsample_size, - kernel=kernel, - gamma=gamma, - degree=degree, - coef0=coef0, - kernel_params=kernel_params, - random_state=random_state, - ) - - def fit(self, X, Y): - return self._raw_fit(X, Y) - - def predict(self, X): - return self._raw_predict(X) - - -class EigenProClassifier(ClassifierMixin, BaseEigenPro): - """Classification using EigenPro iteration. - - Train least squared kernel classification model with mini-batch EigenPro - iteration. - - Parameters - ---------- - batch_size : int, default = 'auto' - Mini-batch size for gradient descent. - - n_epoch : int, default = 2 - The number of passes over the training data. 
- - n_components : int, default = 1000 - the maximum number of eigendirections used in modifying the - kernel operator. Convergence rate speedup over normal gradient - descent is approximately the largest eigenvalue over the - n_componenth eigenvalue, however, it may take time to compute - eigenvalues for large n_components - - subsample_size : int, default = 'auto' - The size of subsamples used for estimating the largest - n_component eigenvalues and eigenvectors. When it is set to - 'auto', it will be 4000 if there are less than 100,000 samples - (for training), and otherwise 12000. - - kernel : string or callable, default = "rbf" - Kernel mapping used internally. Strings can be anything supported - by scikit-learn, however, there is special support for the - rbf, laplace, and cauchy kernels. If a callable is given, it should - accept two arguments and return a floating point number. - - gamma : float, default='scale' - Kernel coefficient. If 'scale', gamma = 1/(n_features*X.var()). - Interpretation of the default value is left to the kernel; - see the documentation for sklearn.metrics.pairwise. - For kernels that use bandwidth, bandwidth = 1/sqrt(2*gamma). - - degree : float, default=3 - Degree of the polynomial kernel. Ignored by other kernels. - - coef0 : float, default=1 - Zero coefficient for polynomial and sigmoid kernels. Ignored by - other kernels. - - kernel_params : mapping of string to any - Additional parameters (keyword arguments) for kernel function - passed as callable object. - - random_state : int, RandomState instance or None (default=None) - The seed of the pseudo random number generator to use when - shuffling the data. If int, random_state is the seed used by - the random number generator; If RandomState instance, - random_state is the random number generator; - If None, the random number generator is the RandomState - instance used by `np.random`. - - References - ---------- - * Siyuan Ma, Mikhail Belkin - "Diving into the shallows: a computational perspective on - large-scale machine learning", NIPS 2017. - - Examples - -------- - >>> from sklearn_extra.kernel_methods import EigenProClassifier - >>> import numpy as np - >>> n_samples, n_features, n_targets = 4000, 20, 3 - >>> rng = np.random.RandomState(1) - >>> x_train = rng.randn(n_samples, n_features) - >>> y_train = rng.randint(n_targets, size=n_samples) - >>> rgs = EigenProClassifier(n_epoch=3, gamma=.01, subsample_size=50) - >>> rgs.fit(x_train, y_train) - EigenProClassifier(gamma=0.01, n_epoch=3, subsample_size=50) - >>> y_pred = rgs.predict(x_train) - >>> loss = np.mean(y_train != y_pred) - """ - - def __init__( - self, - batch_size="auto", - n_epoch=2, - n_components=1000, - subsample_size="auto", - kernel="rbf", - gamma=0.02, - degree=3, - coef0=1, - kernel_params=None, - random_state=None, - ): - super().__init__( - batch_size=batch_size, - n_epoch=n_epoch, - n_components=n_components, - subsample_size=subsample_size, - kernel=kernel, - gamma=gamma, - degree=degree, - coef0=coef0, - kernel_params=kernel_params, - random_state=random_state, - ) - - def fit(self, X, Y): - """Train eigenpro classification model - - Parameters - ---------- - X : {float, array}, shape = [n_samples, n_raw_feature] - The raw input feature matrix. - - Y : {float, array}, shape =[n_samples] - The labels corresponding to the features of X. - - Returns - ------- - self : returns an instance of self. 
- """ - X, Y = check_X_y( - X, - Y, - dtype=np.float32, - force_all_finite=True, - multi_output=False, - ensure_min_samples=3, - ) - check_classification_targets(Y) - self.classes_ = np.unique(Y) - - loc = {} - for ind, label in enumerate(self.classes_): - loc[label] = ind - - class_matrix = np.zeros((Y.shape[0], self.classes_.shape[0])) - - for ind, label in enumerate(Y): - class_matrix[ind, loc[label]] = 1 - self._raw_fit(X, class_matrix) - return self - - def predict(self, X): - """Predict using the kernel classification model - - Parameters - ---------- - X : {float, array}, shape = [n_samples, n_features] - Samples. - - Returns - ------- - y : {float, array}, shape = [n_samples] - Predicted labels. - """ - Y = self._raw_predict(X) - return self.classes_[np.argmax(Y, axis=1)] diff --git a/sklearn_extra/kernel_methods/tests/test_eigenpro.py b/sklearn_extra/kernel_methods/tests/test_eigenpro.py deleted file mode 100644 index c28322c1..00000000 --- a/sklearn_extra/kernel_methods/tests/test_eigenpro.py +++ /dev/null @@ -1,255 +0,0 @@ -import numpy as np - -from sklearn.datasets import make_regression, make_classification -from numpy.testing import assert_allclose -from sklearn_extra.kernel_methods import EigenProRegressor, EigenProClassifier - -import pytest - -# Tests for EigenPro Regression and Classification. - - -def gen_regression(params): - """Generate a regression problem with make_regression - where random_state=1""" - return make_regression(**params, random_state=1) - - -def gen_classification(params): - """Generate a classification problem with make_classification - where random_state=1""" - return make_classification(**params, random_state=1) - - -@pytest.mark.parametrize( - "estimator, data", - [ - (EigenProRegressor, gen_regression({})), - (EigenProClassifier, gen_classification({})), - ], -) -@pytest.mark.parametrize( - "params, err_msg", - [ - ({"kernel": "not_a_kernel"}, "Unknown kernel 'not_a_kernel'"), - ({"n_epoch": 0}, "n_epoch should be positive, was 0"), - ({"n_epoch": -1}, "n_epoch should be positive, was -1"), - ({"n_components": -1}, "n_components should be non-negative, was -1"), - ( - {"subsample_size": -1}, - "subsample_size should be non-negative, was -1", - ), - ({"batch_size": 0}, "batch_size should be positive, was 0"), - ({"batch_size": -1}, "batch_size should be positive, was -1"), - ({"gamma": 0}, "gamma should be positive, was 0"), - ({"gamma": -1}, "gamma should be positive, was -1"), - ], -) -def test_parameter_validation(estimator, data, params, err_msg): - X, y = data - with pytest.raises(ValueError, match=err_msg): - estimator(**params).fit(X, y) - - -@pytest.mark.parametrize( - "data, estimator", - [ - # Test rbf kernel - ( - gen_regression({}), - EigenProRegressor(kernel="rbf", n_epoch=100, random_state=1), - ), - # Test laplacian kernel - ( - gen_regression({}), - EigenProRegressor( - kernel="laplace", n_epoch=100, gamma=0.008, random_state=1 - ), - ), - # Test cauchy kernel - ( - gen_regression({}), - EigenProRegressor( - kernel="cauchy", - n_epoch=100, - gamma=0.005, - subsample_size=1000, - random_state=1, - ), - ), - # Test with multiple outputs - ( - gen_regression({"n_features": 200, "n_targets": 30}), - EigenProRegressor( - kernel="rbf", n_epoch=100, gamma=0.003, random_state=1 - ), - ), - # Test with a very large number of input features - ( - gen_regression({"n_features": 10000}), - EigenProRegressor( - kernel="rbf", n_epoch=100, gamma=0.5, random_state=1 - ), - ), - # Test a very simple underlying distribution - ( - 
gen_regression({"n_informative": 1}), - EigenProRegressor( - batch_size=500, - kernel="rbf", - n_epoch=100, - gamma=0.005, - random_state=1, - ), - ), - # Test a very complex underlying distribution - ( - gen_regression({"n_samples": 500, "n_informative": 100}), - EigenProRegressor( - kernel="rbf", n_epoch=60, gamma=0.005, random_state=1 - ), - ), - ], -) -def test_regressor_accuracy(data, estimator): - """ - Test the accuracy of the EigenPro Regressor on multiple - data sets with different parameter inputs. We expect that the - regressor should achieve near-zero training error after sufficient - training time. - :param data: A tuple containing the input and output training data - :param Estimator: The regressor to do predictions with. - """ - X, y = data - prediction = estimator.fit(X, y).predict(X) - assert_allclose(prediction, y, rtol=5e-3) - - -def test_eigenpro_regression_duplicate_data(): - """Test the performance when some data is repeated""" - X, y = make_regression(random_state=1) - X, y = np.concatenate([X, X]), np.concatenate([y, y]) - prediction = ( - EigenProRegressor( - kernel="rbf", n_epoch=100, gamma=0.02, random_state=1 - ) - .fit(X, y) - .predict(X) - ) - assert_allclose(prediction, y, rtol=5e-3) - - -def test_eigenpro_regression_conflict_data(): - """Make sure the regressor doesn't crash when conflicting - data is given""" - X, y = make_regression(random_state=1) - y = np.reshape(y, (-1, 1)) - X, y = X, np.hstack([y, y + 2]) - # Make sure we don't throw an error when fitting or predicting - EigenProRegressor( - kernel="linear", n_epoch=5, gamma=0.5, random_state=1 - ).fit(X, y).predict(X) - - -# Tests for FastKernelClassification - - -@pytest.mark.parametrize( - "data, estimator", - [ - # Test rbf kernel - ( - gen_classification({"n_samples": 10, "hypercube": False}), - EigenProClassifier( - batch_size=9, - kernel="rbf", - gamma=0.08, - n_epoch=100, - random_state=1, - ), - ), - # Test laplacian kernel - ( - gen_classification({}), - EigenProClassifier( - kernel="laplace", n_epoch=100, gamma=0.003, random_state=1 - ), - ), - # Test cauchy kernel - ( - gen_classification({}), - EigenProClassifier( - kernel="cauchy", n_epoch=100, gamma=0.005, random_state=1 - ), - ), - # Test with a very large number of input features - # and samples, shifted around and scaled - ( - gen_classification( - { - "n_samples": 500, - "n_features": 500, - "n_informative": 160, - "scale": 30, - "shift": 6, - } - ), - EigenProClassifier( - kernel="rbf", n_epoch=50, gamma="scale", random_state=1 - ), - ), - # Test a distribution that has been shifted - ( - gen_classification({"shift": 1, "hypercube": False}), - EigenProClassifier( - kernel="rbf", n_epoch=200, gamma=0.008, random_state=1 - ), - ), - # Test with many redundant features. - ( - gen_classification({"n_redundant": 18}), - EigenProClassifier( - kernel="laplace", n_epoch=100, gamma=0.0012, random_state=1 - ), - ), - ], -) -def test_classifier_accuracy(data, estimator): - """ - Test the accuracy of the EigenPro Classification on multiple - data sets with different parameter inputs. We expect that the - classification should achieve zero training error after sufficient - training time. - :param data: A tuple containing the input and output training data - :param Estimator: The classifier to do predictions with. 
- """ - X, y = data - prediction = estimator.fit(X, y).predict(X) - assert_allclose(prediction, y, rtol=5e-3) - - -def test_eigenpro_classification_duplicate_data(): - """ - Make sure that the classifier correctly handles cases - where some data is repeated. - """ - X, y = make_classification(n_features=200, n_repeated=50, random_state=1) - prediction = ( - EigenProClassifier( - kernel="rbf", n_epoch=60, gamma=0.002, random_state=1 - ) - .fit(X, y) - .predict(X) - ) - assert_allclose(prediction, y, rtol=5e-3) - - -def test_eigenpro_classification_conflict_data(): - """Make sure that the classifier doesn't crash - when given conflicting input data""" - X, y = make_classification(random_state=1) - X, y = np.concatenate([X, X]), np.concatenate([y, 1 - y]) - # Make sure we don't throw an error when fitting or predicting - EigenProClassifier(kernel="linear", n_epoch=5, random_state=1).fit( - X, y - ).predict(X) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index d05945cc..02493e1a 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -14,8 +14,9 @@ import sys from time import time from libc.math cimport exp, log, sqrt, pow, fabs +from libc.stdint cimport int32_t cimport numpy as np -from numpy.math cimport INFINITY +from libc.math cimport INFINITY # Modified from sklearn.cluster._k_means_fast.pyx @@ -66,7 +67,7 @@ cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X np.ndarray[floating, ndim=2] centers = np.zeros([n_classes, n_features], dtype = dtype) - np.ndarray[long] num_in_cluster = np.zeros(n_classes, dtype = int) + np.ndarray[floating] num_in_cluster = np.zeros(n_classes, dtype = dtype) np.ndarray[floating] inertias = np.zeros(n_samples, dtype = dtype) for i in range(n_samples): for j in range(n_features): diff --git a/sklearn_extra/robust/tests/test_mean_estimators.py b/sklearn_extra/robust/tests/test_mean_estimators.py index 2f005662..8cdca52f 100644 --- a/sklearn_extra/robust/tests/test_mean_estimators.py +++ b/sklearn_extra/robust/tests/test_mean_estimators.py @@ -1,6 +1,4 @@ import numpy as np -import pytest - from sklearn_extra.robust.mean_estimators import median_of_means, huber @@ -27,7 +25,5 @@ def test_mom(): def test_huber(): X = np.hstack([np.zeros(90), np.ones(10)]) - with pytest.warns(None) as record: - mu = huber(X, c=0.5) - assert len(record) == 0 + mu = huber(X, c=0.5) assert np.abs(mu) < 0.1 diff --git a/sklearn_extra/tests/test_common.py b/sklearn_extra/tests/test_common.py index 5b71ecf8..92c7a6c5 100644 --- a/sklearn_extra/tests/test_common.py +++ b/sklearn_extra/tests/test_common.py @@ -2,7 +2,6 @@ from sklearn.utils import estimator_checks from sklearn_extra.kernel_approximation import Fastfood -from sklearn_extra.kernel_methods import EigenProClassifier, EigenProRegressor from sklearn_extra.cluster import KMedoids, CommonNNClustering, CLARA from sklearn_extra.robust import ( RobustWeightedClassifier, @@ -15,8 +14,6 @@ Fastfood, KMedoids, CLARA, - EigenProClassifier, - EigenProRegressor, CommonNNClustering, RobustWeightedKMeans, RobustWeightedRegressor, @@ -27,12 +24,6 @@ @estimator_checks.parametrize_with_checks([cls() for cls in ALL_ESTIMATORS]) def test_all_estimators(estimator, check, request): # TODO: fix this common test failure cf #41 - if isinstance( - estimator, EigenProClassifier - ) and "function check_classifier_multioutput" in str(check): - 
request.applymarker(
-            pytest.mark.xfail(run=False, reason="See issue #41")
-        )
 
     # TODO: fix this later, ask people at sklearn to advise on it.
     if isinstance(estimator, RobustWeightedRegressor) and (
diff --git a/sklearn_extra/kernel_methods/tests/__init__.py b/sklearn_extra/utils/__init__.py
similarity index 100%
rename from sklearn_extra/kernel_methods/tests/__init__.py
rename to sklearn_extra/utils/__init__.py
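
Note on the doctest changes in sklearn_extra/cluster/_k_medoids.py above: NumPy 2.0 adopted NEP 51, which changes how scalars are printed, so expected doctest outputs now spell out the scalar type. A minimal sketch of the behaviour these expectations assume (illustrative only; the variable name is arbitrary and numpy >= 2.0 is assumed to be installed):

    # NumPy 2.x prints scalar values together with their type (NEP 51),
    # which is why the doctest expectations move from `8.0` to `np.float64(8.0)`.
    import numpy as np

    inertia = np.float64(8.0)
    print(repr(inertia))  # NumPy 2.x: np.float64(8.0); NumPy 1.x: 8.0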