From 3e773d5c75670661898c493c8bcb880e4b5c8545 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 3 Sep 2024 10:39:43 +0200 Subject: [PATCH 01/39] uncommend build wheel ci --- .github/workflows/build-wheels.yml | 34 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index aa9803ad..343e112f 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -1,6 +1,6 @@ name: build_wheels -on: #[push, pull_request] +on: [push, pull_request] release: types: - created @@ -64,20 +64,20 @@ jobs: with: path: dist/*.tar.gz - upload_pypi: - needs: [build_wheels, build_sdist] - runs-on: ubuntu-latest - # upload to PyPI on every tag starting with 'v' - if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') - steps: - - uses: actions/download-artifact@v2 - with: - name: artifact - path: dist + # upload_pypi: + # needs: [build_wheels, build_sdist] + # runs-on: ubuntu-latest + # # upload to PyPI on every tag starting with 'v' + # if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') + # steps: + # - uses: actions/download-artifact@v2 + # with: + # name: artifact + # path: dist - - uses: pypa/gh-action-pypi-publish@master - with: - user: __token__ - password: ${{ secrets.pypi_password }} - # To test: - repository_url: https://test.pypi.org/legacy/ + # - uses: pypa/gh-action-pypi-publish@master + # with: + # user: __token__ + # password: ${{ secrets.pypi_password }} + # # To test: + # repository_url: https://test.pypi.org/legacy/ From bf188d2680714c2879de96c08416f41ec8e7b722 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 3 Sep 2024 11:24:53 +0200 Subject: [PATCH 02/39] modify to compile with up to date numpy --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 24b7dfba..384bec52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ requires = [ # wheels on PyPI # # see: https://github.com/scipy/oldest-supported-numpy/blob/master/setup.cfg - "oldest-supported-numpy" + "numpy" ] [tool.black] From 23a597e6ecd4502949023aea3f6b5cc5412de7dd Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 3 Sep 2024 14:19:37 +0200 Subject: [PATCH 03/39] try to fix ci --- benchmarks/_bench/eigenpro_plot_mnist.py | 4 ++-- pyproject.toml | 1 - sklearn_extra/kernel_methods/_eigenpro.py | 4 ++-- sklearn_extra/utils/__init__.py | 0 4 files changed, 4 insertions(+), 5 deletions(-) create mode 100644 sklearn_extra/utils/__init__.py diff --git a/benchmarks/_bench/eigenpro_plot_mnist.py b/benchmarks/_bench/eigenpro_plot_mnist.py index 77009842..1e3c65d3 100644 --- a/benchmarks/_bench/eigenpro_plot_mnist.py +++ b/benchmarks/_bench/eigenpro_plot_mnist.py @@ -15,8 +15,8 @@ print("Data has loaded") p = rng.permutation(60000) -x_train = mnist.data[p] -y_train = np.int32(mnist.target[p]) +x_train = mnist.data.iloc[p] +y_train = np.int32(mnist.target.iloc[p]) x_test = mnist.data[60000:] y_test = np.int32(mnist.target[60000:]) diff --git a/pyproject.toml b/pyproject.toml index 384bec52..d5ac3239 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,6 @@ requires = [ "setuptools", "wheel", "Cython>=0.28.5", - # use oldest-supported-numpy which provides the oldest numpy version with # wheels on PyPI # diff --git a/sklearn_extra/kernel_methods/_eigenpro.py b/sklearn_extra/kernel_methods/_eigenpro.py index 3016c491..38fb4e6f 100644 --- a/sklearn_extra/kernel_methods/_eigenpro.py +++ b/sklearn_extra/kernel_methods/_eigenpro.py @@ -110,11 +110,11 @@ def _nystrom_svd(self, X, n_components): W = K / m try: - E, Lambda = eigh(W, eigvals=(m - n_components, m - 1)) + E, Lambda = eigh(W) except LinAlgError: # Use float64 when eigh fails due to precision W = np.float64(W) - E, Lambda = eigh(W, eigvals=(m - n_components, m - 1)) + E, Lambda = eigh(W) E, Lambda = np.float32(E), np.float32(Lambda) # Flip so eigenvalues are in descending order. E = np.maximum(np.float32(1e-7), np.flipud(E)) diff --git a/sklearn_extra/utils/__init__.py b/sklearn_extra/utils/__init__.py new file mode 100644 index 00000000..e69de29b From 9abcf676ac6827811e1312e6517a7637ef1d7086 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 3 Sep 2024 14:22:20 +0200 Subject: [PATCH 04/39] relax version and fix using 3.11 --- azure-pipelines.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 60a837d1..666ee7b8 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -7,18 +7,18 @@ jobs: matrix: Python39: python.version: '3.9' - NUMPY_VERSION: "1.19.4" - SCIPY_VERSION: "1.5.4" + NUMPY_VERSION: "*" + SCIPY_VERSION: "*" SKLEARN_VERSION: "*" Python310: python.version: '3.10' - NUMPY_VERSION: "1.26.1" - SCIPY_VERSION: "1.11.3" + NUMPY_VERSION: "*" + SCIPY_VERSION: "*" SKLEARN_VERSION: "*" Python311: - python.version: '3.10' - NUMPY_VERSION: "1.26.1" - SCIPY_VERSION: "1.11.3" + python.version: '3.11' + NUMPY_VERSION: "*" + SCIPY_VERSION: "*" SKLEARN_VERSION: "*" variables: From 79ff37ae73e1455bc1e209f48acc3d95015f627a Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 3 Sep 2024 16:24:19 +0200 Subject: [PATCH 05/39] correct test eigenpro and huber --- sklearn_extra/kernel_methods/tests/test_eigenpro.py | 2 +- sklearn_extra/robust/tests/test_mean_estimators.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn_extra/kernel_methods/tests/test_eigenpro.py b/sklearn_extra/kernel_methods/tests/test_eigenpro.py index c28322c1..372604ef 100644 --- a/sklearn_extra/kernel_methods/tests/test_eigenpro.py +++ b/sklearn_extra/kernel_methods/tests/test_eigenpro.py @@ -31,7 +31,7 @@ def gen_classification(params): @pytest.mark.parametrize( "params, err_msg", [ - ({"kernel": "not_a_kernel"}, "Unknown kernel 'not_a_kernel'"), + ({"kernel": "not_a_kernel"}, "The 'metric' parameter of pairwise_kernels must be a str among {'cosine', 'poly', 'laplacian', 'polynomial', 'chi2', 'linear', 'sigmoid', 'additive_chi2', 'precomputed', 'rbf'} or a callable. Got 'not_a_kernel' instead."), ({"n_epoch": 0}, "n_epoch should be positive, was 0"), ({"n_epoch": -1}, "n_epoch should be positive, was -1"), ({"n_components": -1}, "n_components should be non-negative, was -1"), diff --git a/sklearn_extra/robust/tests/test_mean_estimators.py b/sklearn_extra/robust/tests/test_mean_estimators.py index 2f005662..c9ef92e6 100644 --- a/sklearn_extra/robust/tests/test_mean_estimators.py +++ b/sklearn_extra/robust/tests/test_mean_estimators.py @@ -27,7 +27,6 @@ def test_mom(): def test_huber(): X = np.hstack([np.zeros(90), np.ones(10)]) - with pytest.warns(None) as record: - mu = huber(X, c=0.5) + mu = huber(X, c=0.5) assert len(record) == 0 assert np.abs(mu) < 0.1 From 684c0458968d6ada6fd1c72fe08c019981691977 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 3 Sep 2024 14:24:28 +0000 Subject: [PATCH 06/39] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sklearn_extra/kernel_methods/tests/test_eigenpro.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn_extra/kernel_methods/tests/test_eigenpro.py b/sklearn_extra/kernel_methods/tests/test_eigenpro.py index 372604ef..83fc16e3 100644 --- a/sklearn_extra/kernel_methods/tests/test_eigenpro.py +++ b/sklearn_extra/kernel_methods/tests/test_eigenpro.py @@ -31,7 +31,10 @@ def gen_classification(params): @pytest.mark.parametrize( "params, err_msg", [ - ({"kernel": "not_a_kernel"}, "The 'metric' parameter of pairwise_kernels must be a str among {'cosine', 'poly', 'laplacian', 'polynomial', 'chi2', 'linear', 'sigmoid', 'additive_chi2', 'precomputed', 'rbf'} or a callable. Got 'not_a_kernel' instead."), + ( + {"kernel": "not_a_kernel"}, + "The 'metric' parameter of pairwise_kernels must be a str among {'cosine', 'poly', 'laplacian', 'polynomial', 'chi2', 'linear', 'sigmoid', 'additive_chi2', 'precomputed', 'rbf'} or a callable. Got 'not_a_kernel' instead.", + ), ({"n_epoch": 0}, "n_epoch should be positive, was 0"), ({"n_epoch": -1}, "n_epoch should be positive, was -1"), ({"n_components": -1}, "n_components should be non-negative, was -1"), From 73d286fa059897e37c67075ed6fab5e42b621bce Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Thu, 5 Sep 2024 15:46:18 +0200 Subject: [PATCH 07/39] fix tests --- sklearn_extra/cluster/_k_medoids.py | 4 ++-- sklearn_extra/cluster/tests/test_k_medoids.py | 9 ++++----- sklearn_extra/kernel_methods/tests/test_eigenpro.py | 3 ++- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index bb5165ba..a4087510 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -121,7 +121,7 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): array([[1., 2.], [4., 2.]]) >>> kmedoids.inertia_ - 8.0 + np.float64(8.0) See scikit-learn-extra/examples/plot_kmedoids_digits.py for examples of KMedoids with various distance metrics. @@ -595,7 +595,7 @@ class CLARA(BaseEstimator, ClusterMixin, TransformerMixin): >>> clara.predict([[0,0], [4,4]]) array([0, 1]) >>> clara.inertia_ - 122.44919397611667 + np.float64(122.44919397611667) References ---------- diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py index 30f419a0..492ce5e2 100644 --- a/sklearn_extra/cluster/tests/test_k_medoids.py +++ b/sklearn_extra/cluster/tests/test_k_medoids.py @@ -405,11 +405,10 @@ def test_clara_consistency_iris(): def test_seuclidean(): - with pytest.warns(None) as record: - km = KMedoids(2, metric="seuclidean", method="pam") - km.fit(np.array([0, 0, 0, 1]).reshape((4, 1))) - km.predict(np.array([0, 0, 0, 1]).reshape((4, 1))) - km.transform(np.array([0, 0, 0, 1]).reshape((4, 1))) + km = KMedoids(2, metric="seuclidean", method="pam") + km.fit(np.array([0, 0, 0, 1]).reshape((4, 1))) + km.predict(np.array([0, 0, 0, 1]).reshape((4, 1))) + km.transform(np.array([0, 0, 0, 1]).reshape((4, 1))) assert len(record) == 0 diff --git a/sklearn_extra/kernel_methods/tests/test_eigenpro.py b/sklearn_extra/kernel_methods/tests/test_eigenpro.py index 372604ef..3328cc9c 100644 --- a/sklearn_extra/kernel_methods/tests/test_eigenpro.py +++ b/sklearn_extra/kernel_methods/tests/test_eigenpro.py @@ -31,7 +31,8 @@ def gen_classification(params): @pytest.mark.parametrize( "params, err_msg", [ - ({"kernel": "not_a_kernel"}, "The 'metric' parameter of pairwise_kernels must be a str among {'cosine', 'poly', 'laplacian', 'polynomial', 'chi2', 'linear', 'sigmoid', 'additive_chi2', 'precomputed', 'rbf'} or a callable. Got 'not_a_kernel' instead."), + # ({"kernel": "not_a_kernel"}, "The 'metric' parameter of pairwise_kernels must be a str among {'cosine', 'poly', 'laplacian', 'polynomial', 'chi2', 'linear', 'sigmoid', 'additive_chi2', 'precomputed', 'rbf'} or a callable. Got 'not_a_kernel' instead."), + # Remove this because the error message is not always the same. ({"n_epoch": 0}, "n_epoch should be positive, was 0"), ({"n_epoch": -1}, "n_epoch should be positive, was -1"), ({"n_components": -1}, "n_components should be non-negative, was -1"), From 1d75babcb38586daf02a11bd0c897a1c1a04d06c Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Thu, 5 Sep 2024 15:52:50 +0200 Subject: [PATCH 08/39] remove unused import --- sklearn_extra/robust/tests/test_mean_estimators.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn_extra/robust/tests/test_mean_estimators.py b/sklearn_extra/robust/tests/test_mean_estimators.py index c9ef92e6..3f9a4eb6 100644 --- a/sklearn_extra/robust/tests/test_mean_estimators.py +++ b/sklearn_extra/robust/tests/test_mean_estimators.py @@ -1,6 +1,4 @@ import numpy as np -import pytest - from sklearn_extra.robust.mean_estimators import median_of_means, huber From 55aa993afa1bdcd651391dae65b1cbe703bba251 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Sat, 2 Nov 2024 09:16:04 +0100 Subject: [PATCH 09/39] minor --- pyproject.toml | 4 ---- sklearn_extra/cluster/tests/test_k_medoids.py | 1 - sklearn_extra/robust/tests/test_mean_estimators.py | 1 - 3 files changed, 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d5ac3239..64ef0bda 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,10 +4,6 @@ requires = [ "setuptools", "wheel", "Cython>=0.28.5", - # use oldest-supported-numpy which provides the oldest numpy version with - # wheels on PyPI - # - # see: https://github.com/scipy/oldest-supported-numpy/blob/master/setup.cfg "numpy" ] diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py index 492ce5e2..9af8943d 100644 --- a/sklearn_extra/cluster/tests/test_k_medoids.py +++ b/sklearn_extra/cluster/tests/test_k_medoids.py @@ -409,7 +409,6 @@ def test_seuclidean(): km.fit(np.array([0, 0, 0, 1]).reshape((4, 1))) km.predict(np.array([0, 0, 0, 1]).reshape((4, 1))) km.transform(np.array([0, 0, 0, 1]).reshape((4, 1))) - assert len(record) == 0 def test_medoids_indices(): diff --git a/sklearn_extra/robust/tests/test_mean_estimators.py b/sklearn_extra/robust/tests/test_mean_estimators.py index 3f9a4eb6..8cdca52f 100644 --- a/sklearn_extra/robust/tests/test_mean_estimators.py +++ b/sklearn_extra/robust/tests/test_mean_estimators.py @@ -26,5 +26,4 @@ def test_mom(): def test_huber(): X = np.hstack([np.zeros(90), np.ones(10)]) mu = huber(X, c=0.5) - assert len(record) == 0 assert np.abs(mu) < 0.1 From d39539e732b835b57925df4bb144977530132bc6 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 10:09:12 +0100 Subject: [PATCH 10/39] remove eigenpro --- examples/eigenpro/README.txt | 6 - examples/eigenpro/plot_eigenpro_synthetic.py | 127 ---- sklearn_extra/kernel_methods/__init__.py | 3 - sklearn_extra/kernel_methods/_eigenpro.py | 670 ------------------ .../kernel_methods/tests/__init__.py | 0 .../kernel_methods/tests/test_eigenpro.py | 256 ------- sklearn_extra/tests/test_common.py | 9 - 7 files changed, 1071 deletions(-) delete mode 100644 examples/eigenpro/README.txt delete mode 100644 examples/eigenpro/plot_eigenpro_synthetic.py delete mode 100644 sklearn_extra/kernel_methods/__init__.py delete mode 100644 sklearn_extra/kernel_methods/_eigenpro.py delete mode 100644 sklearn_extra/kernel_methods/tests/__init__.py delete mode 100644 sklearn_extra/kernel_methods/tests/test_eigenpro.py diff --git a/examples/eigenpro/README.txt b/examples/eigenpro/README.txt deleted file mode 100644 index 4ed1fb41..00000000 --- a/examples/eigenpro/README.txt +++ /dev/null @@ -1,6 +0,0 @@ -.. _eigenpro_examples: - -Eigenpro -======== - -Examples concerning the :mod:`sklearn_extra.kernel_methods.eigenpro` module. diff --git a/examples/eigenpro/plot_eigenpro_synthetic.py b/examples/eigenpro/plot_eigenpro_synthetic.py deleted file mode 100644 index 802f8a57..00000000 --- a/examples/eigenpro/plot_eigenpro_synthetic.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -====================================================== -Comparison of EigenPro and SVC on Digit Classification -====================================================== - -Here we train a EigenPro Classifier and a Support -Vector Classifier (SVC) on a synthetically generated -binary classification problem. We halt the training -of EigenPro after two epochs. -While EigenPro is slower on low dimensional datasets, as -the number of features exceeds 500, it begins to outperform -SVM and shows more stability. -""" -print(__doc__) - -import matplotlib -import matplotlib.pyplot as plt -import numpy as np -from time import time - -from sklearn.datasets import make_classification -from sklearn_extra.kernel_methods import EigenProClassifier -from sklearn.svm import SVC - -rng = np.random.RandomState(1) - -train_size = 2000 -test_size = 1000 - -# Run tests comparing eig to svc -eig_fit_times = [] -eig_pred_times = [] -eig_err = [] -svc_fit_times = [] -svc_pred_times = [] -svc_err = [] - -feature_counts = [20, 50, 150, 500, 1500] -gamma = 0.008 - -# Fit models to data -for n_features in feature_counts: - x, y = make_classification( - n_samples=train_size + test_size, - n_features=n_features, - random_state=rng, - ) - - x_train = x[:train_size] - y_train = y[:train_size] - x_test = x[train_size:] - y_test = y[train_size:] - for name, estimator in [ - ( - "EigenPro", - EigenProClassifier( - n_epoch=2, gamma=gamma, n_components=400, random_state=rng - ), - ), - ("SupportVector", SVC(gamma=gamma, random_state=rng)), - ]: - stime = time() - estimator.fit(x_train, y_train) - fit_t = time() - stime - - stime = time() - y_pred_test = estimator.predict(x_test) - pred_t = time() - stime - - err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test) - if name == "EigenPro": - eig_fit_times.append(fit_t) - eig_pred_times.append(pred_t) - eig_err.append(err) - else: - svc_fit_times.append(fit_t) - svc_pred_times.append(pred_t) - svc_err.append(err) - print( - "%s Classification with %i features in %0.2f seconds. Error: %0.1f" - % (name, n_features, fit_t + pred_t, err) - ) - -# set up grid for figures -fig = plt.figure(num=None, figsize=(6, 4), dpi=160) -ax = plt.subplot2grid((2, 2), (0, 0), rowspan=2) - -# Graph fit(train) time -feature_number_labels = [str(s) for s in feature_counts] -ax.plot(feature_counts, svc_fit_times, "o--", color="g", label="SVC") -ax.plot( - feature_counts, eig_fit_times, "o-", color="r", label="EigenPro Classifier" -) -ax.set_xscale("log") -ax.set_yscale("log", nonpositive="clip") -ax.set_xlabel("Number of features") -ax.set_ylabel("time (seconds)") -ax.legend() -ax.set_title("Training Time") -ax.set_xticks(feature_counts) -ax.set_xticklabels(feature_number_labels) -ax.set_xticks([], minor=True) -ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) - -# Graph prediction(test) time -ax = plt.subplot2grid((2, 2), (0, 1), rowspan=1) -ax.plot(feature_counts, eig_pred_times, "o-", color="r") -ax.plot(feature_counts, svc_pred_times, "o--", color="g") -ax.set_xscale("log") -ax.set_yscale("log", nonpositive="clip") -ax.set_ylabel("time (seconds)") -ax.set_title("Prediction Time") -ax.set_xticks([]) -ax.set_xticks([], minor=True) - -# Graph training error -ax = plt.subplot2grid((2, 2), (1, 1), rowspan=1) -ax.plot(feature_counts, eig_err, "o-", color="r") -ax.plot(feature_counts, svc_err, "o-", color="g") -ax.set_xscale("log") -ax.set_xticks(feature_counts) -ax.set_xticklabels(feature_number_labels) -ax.set_xticks([], minor=True) -ax.set_xlabel("Number of features") -ax.set_ylabel("Classification error %") -plt.tight_layout() -plt.show() diff --git a/sklearn_extra/kernel_methods/__init__.py b/sklearn_extra/kernel_methods/__init__.py deleted file mode 100644 index 53be76dc..00000000 --- a/sklearn_extra/kernel_methods/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from ._eigenpro import BaseEigenPro, EigenProClassifier, EigenProRegressor - -__all__ = ["BaseEigenPro", "EigenProClassifier", "EigenProRegressor"] diff --git a/sklearn_extra/kernel_methods/_eigenpro.py b/sklearn_extra/kernel_methods/_eigenpro.py deleted file mode 100644 index 38fb4e6f..00000000 --- a/sklearn_extra/kernel_methods/_eigenpro.py +++ /dev/null @@ -1,670 +0,0 @@ -# Authors: Alex Li <7Alex7Li@gmail.com> -# Siyuan Ma - -import numpy as np -from scipy.linalg import eigh, LinAlgError -from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin -from sklearn.metrics.pairwise import pairwise_kernels, euclidean_distances -from sklearn.utils import check_random_state -from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.validation import check_is_fitted, check_X_y - - -class BaseEigenPro(BaseEstimator): - """ - Base class for EigenPro iteration. - """ - - def __init__( - self, - batch_size="auto", - n_epoch=2, - n_components=1000, - subsample_size="auto", - kernel="rbf", - gamma="scale", - degree=3, - coef0=1, - kernel_params=None, - random_state=None, - ): - self.batch_size = batch_size - self.n_epoch = n_epoch - self.n_components = n_components - self.subsample_size = subsample_size - self.kernel = kernel - self.gamma = gamma - self.degree = degree - self.coef0 = coef0 - self.kernel_params = kernel_params - self.random_state = random_state - - def _kernel(self, X, Y): - """Calculate the kernel matrix - - Parameters - --------- - X : {float, array}, shape = [n_samples, n_features] - Input data. - - Y : {float, array}, shape = [n_centers, n_targets] - Kernel centers. - - Returns - ------- - K : {float, array}, shape = [n_samples, n_centers] - Kernel matrix. - """ - if ( - self.kernel != "rbf" - and self.kernel != "laplace" - and self.kernel != "cauchy" - ): - if callable(self.kernel): - params = self.kernel_params or {} - else: - params = { - "gamma": self.gamma_, - "degree": self.degree, - "coef0": self.coef0, - } - return pairwise_kernels( - X, Y, metric=self.kernel, filter_params=True, **params - ) - distance = euclidean_distances(X, Y, squared=True) - bandwidth = np.float32(1.0 / np.sqrt(2.0 * self.gamma_)) - if self.kernel == "rbf": - distance = -self.gamma_ * distance - K = np.exp(distance) - elif self.kernel == "laplace": - d = np.maximum(distance, 0) - K = np.exp(-np.sqrt(d) / bandwidth) - else: # self.kernel == "cauchy": - K = 1 / (1 + 2.0 * self.gamma_ * distance) - return K - - def _nystrom_svd(self, X, n_components): - """Compute the top eigensystem of a kernel - operator using Nystrom method - - Parameters - ---------- - X : {float, array}, shape = [n_subsamples, n_features] - Subsample feature matrix. - - n_components : int - Number of top eigencomponents to be restored. - - Returns - ------- - E : {float, array}, shape = [k] - Top eigenvalues. - - Lambda : {float, array}, shape = [n_subsamples, k] - Top eigenvectors of a subsample kernel matrix (which can be - directly used to approximate the eigenfunctions of the kernel - operator). - """ - m, _ = X.shape - K = self._kernel(X, X) - - W = K / m - try: - E, Lambda = eigh(W) - except LinAlgError: - # Use float64 when eigh fails due to precision - W = np.float64(W) - E, Lambda = eigh(W) - E, Lambda = np.float32(E), np.float32(Lambda) - # Flip so eigenvalues are in descending order. - E = np.maximum(np.float32(1e-7), np.flipud(E)) - Lambda = np.fliplr(Lambda)[:, :n_components] / np.sqrt( - m, dtype="float32" - ) - - return E, Lambda - - def _setup(self, feat, max_components, mG, alpha): - """Compute preconditioner and scale factors for EigenPro iteration - - Parameters - ---------- - feat : {float, array}, shape = [n_samples, n_features] - Feature matrix (normally from training data). - - max_components : int - Maximum number of components to be used in EigenPro iteration. - - mG : int - Maximum batch size to fit in memory. - - alpha : float - Exponential factor (< 1) for eigenvalue ratio. - - Returns - ------- - max_S : float - Normalized largest eigenvalue. - - max_kxx : float - Maximum of k(x,x) where k is the EigenPro kernel. - - E : {float, array}, shape = [k] - Preconditioner for EigenPro - - Lambda : {float, array}, shape = [n_subsamples, k] - Top eigenvectors of a subsample kernel matrix - """ - alpha = np.float32(alpha) - - # Estimate eigenvalues (S) and eigenvectors (V) of the kernel matrix - # corresponding to the feature matrix. - E, Lambda = self._nystrom_svd(feat, max_components) - n_subsamples = feat.shape[0] - - # Calculate the number of components to be used such that the - # corresponding batch size is bounded by the subsample size and the - # memory size. - max_bs = min(max(n_subsamples / 5, mG), n_subsamples) - n_components = np.sum(np.power(1 / E, alpha) < max_bs) - 1 - if n_components < 2: - n_components = min(E.shape[0] - 1, 2) - - Lambda = Lambda[:, :n_components] - scale = np.power(E[0] / E[n_components], alpha) - - # Compute part of the preconditioner for step 2 of gradient descent in - # the eigenpro model - D = (1 - np.power(E[n_components] / E[:n_components], alpha)) / E[ - :n_components - ] - - max_S = E[0].astype(np.float32) - kxx = 1 - np.sum(Lambda**2, axis=1) * n_subsamples - return max_S / scale, np.max(kxx), D, Lambda - - def _initialize_params(self, X, Y, random_state): - """ - Validate parameters passed to the model, choose parameters - that have not been passed in, and run setup for EigenPro iteration. - Parameters - ---------- - X : {float, array}, shape = [n_samples, n_features] - Training data. - - Y : {float, array}, shape = [n_samples, n_targets] - Training targets. - - random_state : RandomState instance - The random state to use for random number generation - - Returns - ------- - Y : {float, array}, shape = [n_samples, n_targets] - Training targets. If Y was originally of shape - [n_samples], it is now [n_samples, 1]. - - E : {float, array}, shape = [k] - Preconditioner for EigenPro - - Lambda : {float, array}, shape = [n_subsamples, k] - Top eigenvectors of a subsample kernel matrix - - eta : float - The learning rate - - pinx : {int, array}, shape = [sample_size] - The rows of X used to calculate E and Lambda - """ - n, d = X.shape - n_label = 1 if len(Y.shape) == 1 else Y.shape[1] - self.centers_ = X - - # Calculate the subsample size to be used. - if self.subsample_size == "auto": - if n < 100000: - sample_size = 4000 - else: - sample_size = 12000 - else: - sample_size = self.subsample_size - sample_size = min(n, sample_size) - - n_components = min(sample_size - 1, self.n_components) - n_components = max(1, n_components) - - # Approximate amount of memory that we want to use - mem_bytes = 0.1 * 1024**3 - # Memory used with a certain sample size - mem_usages = (d + n_label + 2 * np.arange(sample_size)) * n * 4 - mG = np.int32(np.sum(mem_usages < mem_bytes)) - - # Calculate largest eigenvalue and max{k(x,x)} using subsamples. - pinx = random_state.choice(n, sample_size, replace=False).astype( - "int32" - ) - if self.gamma == "scale": - self.gamma_ = np.float32(1.0 / (X.var() * d)) - else: - self.gamma_ = self.gamma - max_S, beta, E, Lambda = self._setup( - X[pinx], n_components, mG, alpha=0.95 - ) - # Calculate best batch size. - if self.batch_size == "auto": - bs = min(np.int32(beta / max_S), mG) + 1 - else: - bs = self.batch_size - self.bs_ = min(bs, n) - - # Calculate best step size. - if self.bs_ < beta / max_S + 1: - eta = self.bs_ / beta - elif self.bs_ < n: - eta = 2.0 * self.bs_ / (beta + (self.bs_ - 1) * max_S) - else: - eta = 0.95 * 2 / max_S - # Remember the shape of Y for predict() and ensure it's shape is 2-D. - self.was_1D_ = False - if len(Y.shape) == 1: - Y = np.reshape(Y, (Y.shape[0], 1)) - self.was_1D_ = True - return Y, E, Lambda, np.float32(eta), pinx - - def validate_parameters(self): - """ - Validate the parameters of the model to ensure that no unreasonable - values were passed in. - """ - if self.n_epoch <= 0: - raise ValueError( - "n_epoch should be positive, was " + str(self.n_epoch) - ) - if self.n_components < 0: - raise ValueError( - "n_components should be non-negative, was " - + str(self.n_components) - ) - if self.subsample_size != "auto" and self.subsample_size < 0: - raise ValueError( - "subsample_size should be non-negative, was " - + str(self.subsample_size) - ) - if self.batch_size != "auto" and self.batch_size <= 0: - raise ValueError( - "batch_size should be positive, was " + str(self.batch_size) - ) - if self.gamma != "scale" and self.gamma <= 0: - raise ValueError( - "gamma should be positive, was " + str(self.gamma) - ) - - def _raw_fit(self, X, Y): - """Train eigenpro regression model - - Parameters - ---------- - X : {float, array}, shape = [n_samples, n_features] - Training data. - - Y : {float, array}, shape = [n_samples, n_targets] - Training targets. - - Returns - ------- - self : returns an instance of self. - """ - X, Y = check_X_y( - X, - Y, - dtype=np.float32, - multi_output=True, - ensure_min_samples=3, - y_numeric=True, - ) - self.n_features_in_ = X.shape[1] - Y = Y.astype(np.float32) - random_state = check_random_state(self.random_state) - - self.validate_parameters() - """Parameter Initialization""" - Y, D, V, eta, pinx = self._initialize_params(X, Y, random_state) - - """Training loop""" - n = self.centers_.shape[0] - - self.coef_ = np.zeros((n, Y.shape[1]), dtype=np.float32) - step = np.float32(eta / self.bs_) - for _ in range(0, self.n_epoch): - epoch_inds = random_state.choice( - n, n // self.bs_ * self.bs_, replace=False - ).astype("int32") - - for batch_inds in np.array_split(epoch_inds, n // self.bs_): - batch_x = self.centers_[batch_inds] - kfeat = self._kernel(batch_x, self.centers_) - batch_y = Y[batch_inds] - - # Update 1: Sampled Coordinate Block. - gradient = np.dot(kfeat, self.coef_) - batch_y - - self.coef_[batch_inds] -= step * gradient - - # Update 2: Fixed Coordinate Block - delta = np.dot( - V * D, np.dot(V.T, np.dot(kfeat[:, pinx].T, gradient)) - ) - self.coef_[pinx] += step * delta - return self - - def _raw_predict(self, X): - """Predict using the kernel regression model - - Parameters - ---------- - X : {float, array}, shape = [n_samples, n_features] - Samples. - - Returns - ------- - Y : {float, array}, shape = [n_samples, n_targets] - Predicted targets. - """ - check_is_fitted( - self, ["bs_", "centers_", "coef_", "was_1D_", "gamma_"] - ) - X = np.asarray(X, dtype=np.float64) - - if len(X.shape) == 1: - raise ValueError( - "Reshape your data. X should be a matrix of shape" - " (n_samples, n_features)." - ) - n = X.shape[0] - - Ys = [] - for batch_inds in np.array_split(range(n), max(1, n // self.bs_)): - batch_x = X[batch_inds] - kfeat = self._kernel(batch_x, self.centers_) - - pred = np.dot(kfeat, self.coef_) - Ys.append(pred) - Y = np.vstack(Ys) - if self.was_1D_: - Y = np.reshape(Y, Y.shape[0]) - return Y - - def _get_tags(self): - tags = super()._get_tags() - tags["multioutput"] = True - return tags - - -class EigenProRegressor(RegressorMixin, BaseEigenPro): - """Regression using EigenPro iteration. - - Train least squared kernel regression model with mini-batch EigenPro - iteration. - - Parameters - ---------- - batch_size : int, default = 'auto' - Mini-batch size for gradient descent. - - n_epoch : int, default = 2 - The number of passes over the training data. - - n_components : int, default = 1000 - the maximum number of eigendirections used in modifying the kernel - operator. Convergence rate speedup over normal gradient descent is - approximately the largest eigenvalue over the n_componentth - eigenvalue, however, it may take time to compute eigenvalues for - large n_components - - subsample_size : int, default = 'auto' - The number of subsamples used for estimating the largest - n_component eigenvalues and eigenvectors. When it is set to 'auto', - it will be 4000 if there are less than 100,000 samples - (for training), and otherwise 12000. - - kernel : string or callable, default = "rbf" - Kernel mapping used internally. Strings can be anything supported - by scikit-learn, however, there is special support for the - rbf, laplace, and cauchy kernels. If a callable is given, it should - accept two arguments and return a floating point number. - - gamma : float, default='scale' - Kernel coefficient. If 'scale', gamma = 1/(n_features*X.var()). - Interpretation of the default value is left to the kernel; - see the documentation for sklearn.metrics.pairwise. - For kernels that use bandwidth, bandwidth = 1/sqrt(2*gamma). - - degree : float, default=3 - Degree of the polynomial kernel. Ignored by other kernels. - - coef0 : float, default=1 - Zero coefficient for polynomial and sigmoid kernels. - Ignored by other kernels. - - kernel_params : mapping of string to any - Additional parameters (keyword arguments) for kernel function - passed as callable object. - - random_state : int, RandomState instance or None, (default=None) - The seed of the pseudo random number generator to use when - shuffling the data. If int, random_state is the seed used by the - random number generator; If RandomState instance, random_state is - the random number generator; If None, the random number generator - is the RandomState instance used by `np.random`. - - References - ---------- - * Siyuan Ma, Mikhail Belkin - "Diving into the shallows: a computational perspective on - large-scale machine learning", NIPS 2017. - - Examples - -------- - >>> from sklearn_extra.kernel_methods import EigenProRegressor - >>> import numpy as np - >>> n_samples, n_features, n_targets = 4000, 20, 3 - >>> rng = np.random.RandomState(1) - >>> x_train = rng.randn(n_samples, n_features) - >>> y_train = rng.randn(n_samples, n_targets) - >>> rgs = EigenProRegressor(n_epoch=3, gamma=.5, subsample_size=50) - >>> rgs.fit(x_train, y_train) - EigenProRegressor(gamma=0.5, n_epoch=3, subsample_size=50) - >>> y_pred = rgs.predict(x_train) - >>> loss = np.mean(np.square(y_train - y_pred)) - """ - - def __init__( - self, - batch_size="auto", - n_epoch=2, - n_components=1000, - subsample_size="auto", - kernel="rbf", - gamma="scale", - degree=3, - coef0=1, - kernel_params=None, - random_state=None, - ): - super().__init__( - batch_size=batch_size, - n_epoch=n_epoch, - n_components=n_components, - subsample_size=subsample_size, - kernel=kernel, - gamma=gamma, - degree=degree, - coef0=coef0, - kernel_params=kernel_params, - random_state=random_state, - ) - - def fit(self, X, Y): - return self._raw_fit(X, Y) - - def predict(self, X): - return self._raw_predict(X) - - -class EigenProClassifier(ClassifierMixin, BaseEigenPro): - """Classification using EigenPro iteration. - - Train least squared kernel classification model with mini-batch EigenPro - iteration. - - Parameters - ---------- - batch_size : int, default = 'auto' - Mini-batch size for gradient descent. - - n_epoch : int, default = 2 - The number of passes over the training data. - - n_components : int, default = 1000 - the maximum number of eigendirections used in modifying the - kernel operator. Convergence rate speedup over normal gradient - descent is approximately the largest eigenvalue over the - n_componenth eigenvalue, however, it may take time to compute - eigenvalues for large n_components - - subsample_size : int, default = 'auto' - The size of subsamples used for estimating the largest - n_component eigenvalues and eigenvectors. When it is set to - 'auto', it will be 4000 if there are less than 100,000 samples - (for training), and otherwise 12000. - - kernel : string or callable, default = "rbf" - Kernel mapping used internally. Strings can be anything supported - by scikit-learn, however, there is special support for the - rbf, laplace, and cauchy kernels. If a callable is given, it should - accept two arguments and return a floating point number. - - gamma : float, default='scale' - Kernel coefficient. If 'scale', gamma = 1/(n_features*X.var()). - Interpretation of the default value is left to the kernel; - see the documentation for sklearn.metrics.pairwise. - For kernels that use bandwidth, bandwidth = 1/sqrt(2*gamma). - - degree : float, default=3 - Degree of the polynomial kernel. Ignored by other kernels. - - coef0 : float, default=1 - Zero coefficient for polynomial and sigmoid kernels. Ignored by - other kernels. - - kernel_params : mapping of string to any - Additional parameters (keyword arguments) for kernel function - passed as callable object. - - random_state : int, RandomState instance or None (default=None) - The seed of the pseudo random number generator to use when - shuffling the data. If int, random_state is the seed used by - the random number generator; If RandomState instance, - random_state is the random number generator; - If None, the random number generator is the RandomState - instance used by `np.random`. - - References - ---------- - * Siyuan Ma, Mikhail Belkin - "Diving into the shallows: a computational perspective on - large-scale machine learning", NIPS 2017. - - Examples - -------- - >>> from sklearn_extra.kernel_methods import EigenProClassifier - >>> import numpy as np - >>> n_samples, n_features, n_targets = 4000, 20, 3 - >>> rng = np.random.RandomState(1) - >>> x_train = rng.randn(n_samples, n_features) - >>> y_train = rng.randint(n_targets, size=n_samples) - >>> rgs = EigenProClassifier(n_epoch=3, gamma=.01, subsample_size=50) - >>> rgs.fit(x_train, y_train) - EigenProClassifier(gamma=0.01, n_epoch=3, subsample_size=50) - >>> y_pred = rgs.predict(x_train) - >>> loss = np.mean(y_train != y_pred) - """ - - def __init__( - self, - batch_size="auto", - n_epoch=2, - n_components=1000, - subsample_size="auto", - kernel="rbf", - gamma=0.02, - degree=3, - coef0=1, - kernel_params=None, - random_state=None, - ): - super().__init__( - batch_size=batch_size, - n_epoch=n_epoch, - n_components=n_components, - subsample_size=subsample_size, - kernel=kernel, - gamma=gamma, - degree=degree, - coef0=coef0, - kernel_params=kernel_params, - random_state=random_state, - ) - - def fit(self, X, Y): - """Train eigenpro classification model - - Parameters - ---------- - X : {float, array}, shape = [n_samples, n_raw_feature] - The raw input feature matrix. - - Y : {float, array}, shape =[n_samples] - The labels corresponding to the features of X. - - Returns - ------- - self : returns an instance of self. - """ - X, Y = check_X_y( - X, - Y, - dtype=np.float32, - force_all_finite=True, - multi_output=False, - ensure_min_samples=3, - ) - check_classification_targets(Y) - self.classes_ = np.unique(Y) - - loc = {} - for ind, label in enumerate(self.classes_): - loc[label] = ind - - class_matrix = np.zeros((Y.shape[0], self.classes_.shape[0])) - - for ind, label in enumerate(Y): - class_matrix[ind, loc[label]] = 1 - self._raw_fit(X, class_matrix) - return self - - def predict(self, X): - """Predict using the kernel classification model - - Parameters - ---------- - X : {float, array}, shape = [n_samples, n_features] - Samples. - - Returns - ------- - y : {float, array}, shape = [n_samples] - Predicted labels. - """ - Y = self._raw_predict(X) - return self.classes_[np.argmax(Y, axis=1)] diff --git a/sklearn_extra/kernel_methods/tests/__init__.py b/sklearn_extra/kernel_methods/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sklearn_extra/kernel_methods/tests/test_eigenpro.py b/sklearn_extra/kernel_methods/tests/test_eigenpro.py deleted file mode 100644 index 3328cc9c..00000000 --- a/sklearn_extra/kernel_methods/tests/test_eigenpro.py +++ /dev/null @@ -1,256 +0,0 @@ -import numpy as np - -from sklearn.datasets import make_regression, make_classification -from numpy.testing import assert_allclose -from sklearn_extra.kernel_methods import EigenProRegressor, EigenProClassifier - -import pytest - -# Tests for EigenPro Regression and Classification. - - -def gen_regression(params): - """Generate a regression problem with make_regression - where random_state=1""" - return make_regression(**params, random_state=1) - - -def gen_classification(params): - """Generate a classification problem with make_classification - where random_state=1""" - return make_classification(**params, random_state=1) - - -@pytest.mark.parametrize( - "estimator, data", - [ - (EigenProRegressor, gen_regression({})), - (EigenProClassifier, gen_classification({})), - ], -) -@pytest.mark.parametrize( - "params, err_msg", - [ - # ({"kernel": "not_a_kernel"}, "The 'metric' parameter of pairwise_kernels must be a str among {'cosine', 'poly', 'laplacian', 'polynomial', 'chi2', 'linear', 'sigmoid', 'additive_chi2', 'precomputed', 'rbf'} or a callable. Got 'not_a_kernel' instead."), - # Remove this because the error message is not always the same. - ({"n_epoch": 0}, "n_epoch should be positive, was 0"), - ({"n_epoch": -1}, "n_epoch should be positive, was -1"), - ({"n_components": -1}, "n_components should be non-negative, was -1"), - ( - {"subsample_size": -1}, - "subsample_size should be non-negative, was -1", - ), - ({"batch_size": 0}, "batch_size should be positive, was 0"), - ({"batch_size": -1}, "batch_size should be positive, was -1"), - ({"gamma": 0}, "gamma should be positive, was 0"), - ({"gamma": -1}, "gamma should be positive, was -1"), - ], -) -def test_parameter_validation(estimator, data, params, err_msg): - X, y = data - with pytest.raises(ValueError, match=err_msg): - estimator(**params).fit(X, y) - - -@pytest.mark.parametrize( - "data, estimator", - [ - # Test rbf kernel - ( - gen_regression({}), - EigenProRegressor(kernel="rbf", n_epoch=100, random_state=1), - ), - # Test laplacian kernel - ( - gen_regression({}), - EigenProRegressor( - kernel="laplace", n_epoch=100, gamma=0.008, random_state=1 - ), - ), - # Test cauchy kernel - ( - gen_regression({}), - EigenProRegressor( - kernel="cauchy", - n_epoch=100, - gamma=0.005, - subsample_size=1000, - random_state=1, - ), - ), - # Test with multiple outputs - ( - gen_regression({"n_features": 200, "n_targets": 30}), - EigenProRegressor( - kernel="rbf", n_epoch=100, gamma=0.003, random_state=1 - ), - ), - # Test with a very large number of input features - ( - gen_regression({"n_features": 10000}), - EigenProRegressor( - kernel="rbf", n_epoch=100, gamma=0.5, random_state=1 - ), - ), - # Test a very simple underlying distribution - ( - gen_regression({"n_informative": 1}), - EigenProRegressor( - batch_size=500, - kernel="rbf", - n_epoch=100, - gamma=0.005, - random_state=1, - ), - ), - # Test a very complex underlying distribution - ( - gen_regression({"n_samples": 500, "n_informative": 100}), - EigenProRegressor( - kernel="rbf", n_epoch=60, gamma=0.005, random_state=1 - ), - ), - ], -) -def test_regressor_accuracy(data, estimator): - """ - Test the accuracy of the EigenPro Regressor on multiple - data sets with different parameter inputs. We expect that the - regressor should achieve near-zero training error after sufficient - training time. - :param data: A tuple containing the input and output training data - :param Estimator: The regressor to do predictions with. - """ - X, y = data - prediction = estimator.fit(X, y).predict(X) - assert_allclose(prediction, y, rtol=5e-3) - - -def test_eigenpro_regression_duplicate_data(): - """Test the performance when some data is repeated""" - X, y = make_regression(random_state=1) - X, y = np.concatenate([X, X]), np.concatenate([y, y]) - prediction = ( - EigenProRegressor( - kernel="rbf", n_epoch=100, gamma=0.02, random_state=1 - ) - .fit(X, y) - .predict(X) - ) - assert_allclose(prediction, y, rtol=5e-3) - - -def test_eigenpro_regression_conflict_data(): - """Make sure the regressor doesn't crash when conflicting - data is given""" - X, y = make_regression(random_state=1) - y = np.reshape(y, (-1, 1)) - X, y = X, np.hstack([y, y + 2]) - # Make sure we don't throw an error when fitting or predicting - EigenProRegressor( - kernel="linear", n_epoch=5, gamma=0.5, random_state=1 - ).fit(X, y).predict(X) - - -# Tests for FastKernelClassification - - -@pytest.mark.parametrize( - "data, estimator", - [ - # Test rbf kernel - ( - gen_classification({"n_samples": 10, "hypercube": False}), - EigenProClassifier( - batch_size=9, - kernel="rbf", - gamma=0.08, - n_epoch=100, - random_state=1, - ), - ), - # Test laplacian kernel - ( - gen_classification({}), - EigenProClassifier( - kernel="laplace", n_epoch=100, gamma=0.003, random_state=1 - ), - ), - # Test cauchy kernel - ( - gen_classification({}), - EigenProClassifier( - kernel="cauchy", n_epoch=100, gamma=0.005, random_state=1 - ), - ), - # Test with a very large number of input features - # and samples, shifted around and scaled - ( - gen_classification( - { - "n_samples": 500, - "n_features": 500, - "n_informative": 160, - "scale": 30, - "shift": 6, - } - ), - EigenProClassifier( - kernel="rbf", n_epoch=50, gamma="scale", random_state=1 - ), - ), - # Test a distribution that has been shifted - ( - gen_classification({"shift": 1, "hypercube": False}), - EigenProClassifier( - kernel="rbf", n_epoch=200, gamma=0.008, random_state=1 - ), - ), - # Test with many redundant features. - ( - gen_classification({"n_redundant": 18}), - EigenProClassifier( - kernel="laplace", n_epoch=100, gamma=0.0012, random_state=1 - ), - ), - ], -) -def test_classifier_accuracy(data, estimator): - """ - Test the accuracy of the EigenPro Classification on multiple - data sets with different parameter inputs. We expect that the - classification should achieve zero training error after sufficient - training time. - :param data: A tuple containing the input and output training data - :param Estimator: The classifier to do predictions with. - """ - X, y = data - prediction = estimator.fit(X, y).predict(X) - assert_allclose(prediction, y, rtol=5e-3) - - -def test_eigenpro_classification_duplicate_data(): - """ - Make sure that the classifier correctly handles cases - where some data is repeated. - """ - X, y = make_classification(n_features=200, n_repeated=50, random_state=1) - prediction = ( - EigenProClassifier( - kernel="rbf", n_epoch=60, gamma=0.002, random_state=1 - ) - .fit(X, y) - .predict(X) - ) - assert_allclose(prediction, y, rtol=5e-3) - - -def test_eigenpro_classification_conflict_data(): - """Make sure that the classifier doesn't crash - when given conflicting input data""" - X, y = make_classification(random_state=1) - X, y = np.concatenate([X, X]), np.concatenate([y, 1 - y]) - # Make sure we don't throw an error when fitting or predicting - EigenProClassifier(kernel="linear", n_epoch=5, random_state=1).fit( - X, y - ).predict(X) diff --git a/sklearn_extra/tests/test_common.py b/sklearn_extra/tests/test_common.py index 5b71ecf8..92c7a6c5 100644 --- a/sklearn_extra/tests/test_common.py +++ b/sklearn_extra/tests/test_common.py @@ -2,7 +2,6 @@ from sklearn.utils import estimator_checks from sklearn_extra.kernel_approximation import Fastfood -from sklearn_extra.kernel_methods import EigenProClassifier, EigenProRegressor from sklearn_extra.cluster import KMedoids, CommonNNClustering, CLARA from sklearn_extra.robust import ( RobustWeightedClassifier, @@ -15,8 +14,6 @@ Fastfood, KMedoids, CLARA, - EigenProClassifier, - EigenProRegressor, CommonNNClustering, RobustWeightedKMeans, RobustWeightedRegressor, @@ -27,12 +24,6 @@ @estimator_checks.parametrize_with_checks([cls() for cls in ALL_ESTIMATORS]) def test_all_estimators(estimator, check, request): # TODO: fix this common test failure cf #41 - if isinstance( - estimator, EigenProClassifier - ) and "function check_classifier_multioutput" in str(check): - request.applymarker( - pytest.mark.xfail(run=False, reason="See issue #41") - ) # TODO: fix this later, ask people at sklearn to advise on it. if isinstance(estimator, RobustWeightedRegressor) and ( From b741129012c49965000af852cb5fe7f4ecc3da5a Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 10:12:33 +0100 Subject: [PATCH 11/39] fix import init --- sklearn_extra/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn_extra/__init__.py b/sklearn_extra/__init__.py index b855d4eb..910ceef6 100644 --- a/sklearn_extra/__init__.py +++ b/sklearn_extra/__init__.py @@ -1,4 +1,4 @@ -from . import kernel_approximation, kernel_methods # noqa +from . import kernel_approximation # noqa from ._version import __version__ From ef3a95ee944d07d9487e689f88022ff8a2cb7590 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 10:21:57 +0100 Subject: [PATCH 12/39] remove more eigenpro --- benchmarks/_bench/eigenpro_plot_mnist.py | 107 ---------------- .../_bench/eigenpro_plot_noisy_mnist.py | 112 ----------------- benchmarks/_bench/eigenpro_plot_synthetic.py | 117 ------------------ doc/api.rst | 9 -- doc/modules/eigenpro.rst | 62 ---------- doc/modules/kernel_approximation.rst | 3 +- doc/user_guide.rst | 1 - 7 files changed, 1 insertion(+), 410 deletions(-) delete mode 100644 benchmarks/_bench/eigenpro_plot_mnist.py delete mode 100644 benchmarks/_bench/eigenpro_plot_noisy_mnist.py delete mode 100644 benchmarks/_bench/eigenpro_plot_synthetic.py delete mode 100644 doc/modules/eigenpro.rst diff --git a/benchmarks/_bench/eigenpro_plot_mnist.py b/benchmarks/_bench/eigenpro_plot_mnist.py deleted file mode 100644 index 1e3c65d3..00000000 --- a/benchmarks/_bench/eigenpro_plot_mnist.py +++ /dev/null @@ -1,107 +0,0 @@ -import matplotlib -import matplotlib.pyplot as plt -import numpy as np -from time import time - -from sklearn_extra.kernel_methods import EigenProClassifier -from sklearn.svm import SVC -from sklearn.datasets import fetch_openml - -rng = np.random.RandomState(1) - -# Generate sample data from mnist -mnist = fetch_openml("mnist_784") -mnist.data = mnist.data / 255.0 -print("Data has loaded") - -p = rng.permutation(60000) -x_train = mnist.data.iloc[p] -y_train = np.int32(mnist.target.iloc[p]) -x_test = mnist.data[60000:] -y_test = np.int32(mnist.target[60000:]) - -# Run tests comparing eig to svc -eig_fit_times = [] -eig_pred_times = [] -eig_err = [] -svc_fit_times = [] -svc_pred_times = [] -svc_err = [] - -train_sizes = [500, 1000, 2000, 5000, 10000, 20000, 40000, 60000] - -gamma = 0.02 -# Fit models to data -for train_size in train_sizes: - for name, estimator in [ - ( - "EigenPro", - EigenProClassifier(n_epoch=2, gamma=gamma, random_state=rng), - ), - ("SupportVector", SVC(C=5, gamma=gamma, random_state=rng)), - ]: - stime = time() - estimator.fit(x_train[:train_size], y_train[:train_size]) - fit_t = time() - stime - - stime = time() - y_pred_test = estimator.predict(x_test) - pred_t = time() - stime - - err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test) - if name == "EigenPro": - eig_fit_times.append(fit_t) - eig_pred_times.append(pred_t) - eig_err.append(err) - else: - svc_fit_times.append(fit_t) - svc_pred_times.append(pred_t) - svc_err.append(err) - print( - "%s Classification with %i training samples in %0.2f seconds." - "Test error %.4f" % (name, train_size, fit_t + pred_t, err) - ) - -# set up grid for figures -fig = plt.figure(num=None, figsize=(6, 4), dpi=160) -ax = plt.subplot2grid((2, 2), (0, 0), rowspan=2) -train_size_labels = ["500", "1k", "2k", "5k", "10k", "20k", "40k", "60k"] - -# Graph fit(train) time -ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) -ax.plot(train_sizes, svc_fit_times, "o--", color="g", label="SVC") -ax.plot(train_sizes, eig_fit_times, "o-", color="r", label="EigenPro") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_xlabel("train size") -ax.set_ylabel("time (seconds)") -ax.legend() -ax.set_title("Train set") -ax.set_xticks(train_sizes) -ax.set_xticks([], minor=True) -ax.set_xticklabels(train_size_labels) - -# Graph prediction(test) time -ax = plt.subplot2grid((2, 2), (0, 1), rowspan=1) -ax.plot(train_sizes, eig_pred_times, "o-", color="r") -ax.plot(train_sizes, svc_pred_times, "o--", color="g") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_ylabel("time (seconds)") -ax.set_title("Test set") -ax.set_xticks(train_sizes) -ax.set_xticks([], minor=True) -ax.set_xticklabels(train_size_labels) - -# Graph training error -ax = plt.subplot2grid((2, 2), (1, 1), rowspan=1) -ax.plot(train_sizes, eig_err, "o-", color="r") -ax.plot(train_sizes, svc_err, "o-", color="g") -ax.set_xscale("log") -ax.set_xticks(train_sizes) -ax.set_xticklabels(train_size_labels) -ax.set_xticks([], minor=True) -ax.set_xlabel("train size") -ax.set_ylabel("classification error %") -plt.tight_layout() -plt.show() diff --git a/benchmarks/_bench/eigenpro_plot_noisy_mnist.py b/benchmarks/_bench/eigenpro_plot_noisy_mnist.py deleted file mode 100644 index 939e9aff..00000000 --- a/benchmarks/_bench/eigenpro_plot_noisy_mnist.py +++ /dev/null @@ -1,112 +0,0 @@ -import matplotlib -import matplotlib.pyplot as plt -import numpy as np -from time import time - -from sklearn.datasets import fetch_openml -from sklearn_extra.kernel_methods import EigenProClassifier -from sklearn.svm import SVC - -rng = np.random.RandomState(1) - -# Generate sample data from mnist -mnist = fetch_openml("mnist_784") -mnist.data = mnist.data / 255.0 - -p = rng.permutation(60000) -x_train = mnist.data[p][:60000] -y_train = np.int32(mnist.target[p][:60000]) -x_test = mnist.data[60000:] -y_test = np.int32(mnist.target[60000:]) - -# randomize 20% of labels -p = rng.choice(len(y_train), np.int32(len(y_train) * 0.2), False) -y_train[p] = rng.choice(10, np.int32(len(y_train) * 0.2)) -p = rng.choice(len(y_test), np.int32(len(y_test) * 0.2), False) -y_test[p] = rng.choice(10, np.int32(len(y_test) * 0.2)) - -# Run tests comparing fkc to svc -eig_fit_times = [] -eig_pred_times = [] -eig_err = [] -svc_fit_times = [] -svc_pred_times = [] -svc_err = [] - -train_sizes = [500, 1000, 2000, 5000, 10000, 20000, 40000, 60000] - -gamma = 0.02 - -# Fit models to data -for train_size in train_sizes: - for name, estimator in [ - ( - "EigenPro", - EigenProClassifier(n_epoch=2, gamma=gamma, random_state=rng), - ), - ("SupportVector", SVC(C=5, gamma=gamma)), - ]: - stime = time() - estimator.fit(x_train[:train_size], y_train[:train_size]) - fit_t = time() - stime - - stime = time() - y_pred_test = estimator.predict(x_test) - pred_t = time() - stime - err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test) - if name == "EigenPro": - eig_fit_times.append(fit_t) - eig_pred_times.append(pred_t) - eig_err.append(err) - else: - svc_fit_times.append(fit_t) - svc_pred_times.append(pred_t) - svc_err.append(err) - print( - "%s Classification with %i training samples in %0.2f seconds. " - "Test error %.4f" % (name, train_size, fit_t + pred_t, err) - ) - -# set up grid for figures -fig = plt.figure(num=None, figsize=(6, 4), dpi=160) -ax = plt.subplot2grid((2, 2), (0, 0), rowspan=2) -train_size_labels = ["500", "1k", "2k", "5k", "10k", "20k", "40k", "60k"] - -# Graph fit(train) time -ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) -ax.plot(train_sizes, svc_fit_times, "o--", color="g", label="SVC") -ax.plot(train_sizes, eig_fit_times, "o-", color="r", label="EigenPro") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_xlabel("train size") -ax.set_ylabel("time (seconds)") -ax.legend() -ax.set_title("Train set") -ax.set_xticks(train_sizes) -ax.set_xticks([], minor=True) -ax.set_xticklabels(train_size_labels) - -# Graph prediction(test) time -ax = plt.subplot2grid((2, 2), (0, 1), rowspan=1) -ax.plot(train_sizes, eig_pred_times, "o-", color="r") -ax.plot(train_sizes, svc_pred_times, "o--", color="g") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_ylabel("time (seconds)") -ax.set_title("Test set") -ax.set_xticks(train_sizes) -ax.set_xticks([], minor=True) -ax.set_xticklabels(train_size_labels) - -# Graph training error -ax = plt.subplot2grid((2, 2), (1, 1), rowspan=1) -ax.plot(train_sizes, eig_err, "o-", color="r") -ax.plot(train_sizes, svc_err, "o-", color="g") -ax.set_xscale("log") -ax.set_xticks(train_sizes) -ax.set_xticklabels(train_size_labels) -ax.set_xticks([], minor=True) -ax.set_xlabel("train size") -ax.set_ylabel("classification error %") -plt.tight_layout() -plt.show() diff --git a/benchmarks/_bench/eigenpro_plot_synthetic.py b/benchmarks/_bench/eigenpro_plot_synthetic.py deleted file mode 100644 index 155ba985..00000000 --- a/benchmarks/_bench/eigenpro_plot_synthetic.py +++ /dev/null @@ -1,117 +0,0 @@ -import matplotlib -import matplotlib.pyplot as plt -import numpy as np -from time import time - -from sklearn.datasets import make_classification -from sklearn_extra.kernel_methods import EigenProClassifier -from sklearn.svm import SVC - -rng = np.random.RandomState(1) - -max_size = 50000 -test_size = 10000 - -# Get data for testing - -x, y = make_classification( - n_samples=max_size + test_size, - n_features=400, - n_informative=6, - random_state=rng, -) - -x_train = x[:max_size] -y_train = y[:max_size] -x_test = x[max_size:] -y_test = y[max_size:] - -eig_fit_times = [] -eig_pred_times = [] -eig_err = [] -svc_fit_times = [] -svc_pred_times = [] -svc_err = [] - -train_sizes = [2000, 5000, 10000, 20000, 50000] - -gamma = 0.005 -for train_size in train_sizes: - for name, estimator in [ - ( - "EigenPro", - EigenProClassifier( - n_epoch=3, - gamma=gamma, - n_components=30, - subsample_size=1000, - random_state=rng, - ), - ), - ("SupportVector", SVC(C=5, gamma=gamma)), - ]: - stime = time() - estimator.fit(x_train[:train_size], y_train[:train_size]) - fit_t = time() - stime - - stime = time() - y_pred_test = estimator.predict(x_test) - pred_t = time() - stime - - err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test) - if name == "EigenPro": - eig_fit_times.append(fit_t) - eig_pred_times.append(pred_t) - eig_err.append(err) - else: - svc_fit_times.append(fit_t) - svc_pred_times.append(pred_t) - svc_err.append(err) - print( - "%s Classification with %i training samples in %0.2f seconds." - % (name, train_size, fit_t + pred_t) - ) - -# set up grid for figures -fig = plt.figure(num=None, figsize=(6, 4), dpi=160) -ax = plt.subplot2grid((2, 2), (0, 0), rowspan=2) -train_size_labels = [str(s) for s in train_sizes] - -# Graph fit(train) time -ax.plot(train_sizes, svc_fit_times, "o--", color="g", label="SVC") -ax.plot(train_sizes, eig_fit_times, "o-", color="r", label="FKC (EigenPro)") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_xlabel("train size") -ax.set_ylabel("time (seconds)") - -ax.legend() -ax.set_title("Train set") -ax.set_xticks(train_sizes) -ax.set_xticklabels(train_size_labels) -ax.set_xticks([], minor=True) -ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) - -# Graph prediction(test) time -ax = plt.subplot2grid((2, 2), (0, 1), rowspan=1) -ax.plot(train_sizes, eig_pred_times, "o-", color="r") -ax.plot(train_sizes, svc_pred_times, "o--", color="g") -ax.set_xscale("log") -ax.set_yscale("log", nonposy="clip") -ax.set_ylabel("time (seconds)") -ax.set_title("Test set") -ax.set_xticks([]) -ax.set_xticks([], minor=True) - -# Graph training error -ax = plt.subplot2grid((2, 2), (1, 1), rowspan=1) -ax.plot(train_sizes, eig_err, "o-", color="r") -ax.plot(train_sizes, svc_err, "o-", color="g") -ax.set_xscale("log") -ax.set_xticks(train_sizes) -ax.set_xticklabels(train_size_labels) -ax.set_xticks([], minor=True) -ax.set_xlabel("train size") -ax.set_ylabel("classification error %") -plt.tight_layout() -plt.show() diff --git a/doc/api.rst b/doc/api.rst index 25fc8ed8..1d0af0a4 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -13,15 +13,6 @@ Kernel approximation kernel_approximation.Fastfood -EigenPro -======== - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - kernel_methods.EigenProRegressor - kernel_methods.EigenProClassifier Clustering ==================== diff --git a/doc/modules/eigenpro.rst b/doc/modules/eigenpro.rst deleted file mode 100644 index bd7535c9..00000000 --- a/doc/modules/eigenpro.rst +++ /dev/null @@ -1,62 +0,0 @@ -.. _eigenpro: - -========================================== -EigenPro for Regression and Classification -========================================== - -.. currentmodule:: sklearn_extra.kernel_methods - -*EigenPro iteration* [MB17]_ is a very efficient implementation of kernel -regression/classification that uses an optimization method based on -preconditioned stochastic gradient descent. It essentially implements a -"ridgeless" kernel regression. Regularization, when necessary, can be -achieved by early stopping. - -Optimization parameters, such as step size, batch size, and the size of the preconditioning -block are chosen automatically and optimally. (They can also be set up manually.) -This results in a simple and user-friendly interface. - -Next, we present several experimental results using a server equipped with one -Intel Xeon E5-1620 CPU. -The figure below compares the EigenPro Classifier and the Support Vector -Classifier (:class:`SVC`) on MNIST digits classification task. -We see that EigenPro and SVC give competitive and similar accuracy on test set. -Notably, on the full MNIST training and testing using EigenPro are -approximately 2 times and 5 times faster than that using SVC, respectively. - -.. |mnist| image:: ../images/eigenpro_mnist.png - :target: ../auto_examples/eigenpro/eigenpro_mnist.html - :scale: 70 - -.. centered:: |mnist| - -We then repeat the same experiments on MNIST with added label noise. -Specifically, we randomly reset the label (0-9) of 20% samples. -We see that EigenPro has a significant advantage over SVC -on this noisy MNIST. Training and testing using EigenPro are -both 10 to 20 times faster than they are when using SVC. - -.. |mnist_noisy| image:: ../images/eigenpro_mnist_noisy.png - :target: ../auto_examples/eigenpro/eigenpro_mnist_noisy.html - :scale: 70 - -.. centered:: |mnist_noisy| - - -The next figure compares the two methods on a binary classification problem -with 400 synthetic features. Again, EigenPro demonstrates 10~20 times -acceleration on training and testing without loss of accuracy. - -.. |synthetic| image:: ../images/eigenpro_synthetic.png - :target: ../auto_examples/eigenpro/eigenpro_synthetic.html - :scale: 70 - -.. centered:: |synthetic| - - -.. topic:: References: - - .. [MB17] Siyuan Ma and Mikhail Belkin, - `"Diving into the shallows: a computational perspective on large-scale shallow learning" - `_, - Advances in Neural Information Processing Systems, 2017. diff --git a/doc/modules/kernel_approximation.rst b/doc/modules/kernel_approximation.rst index b234d691..64650341 100644 --- a/doc/modules/kernel_approximation.rst +++ b/doc/modules/kernel_approximation.rst @@ -30,5 +30,4 @@ O(n_components). See `scikit-learn User-guide `_ for more general informations on kernel approximations. -See also :class:`EigenProRegressor ` and :class:`EigenProClassifier ` for another -way to compute fast kernel methods algorithms. + diff --git a/doc/user_guide.rst b/doc/user_guide.rst index 0c90c2e8..9c715375 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -10,7 +10,6 @@ User guide .. toctree:: :numbered: - modules/eigenpro.rst modules/cluster.rst modules/robust.rst modules/kernel_approximation.rst From 9b753919e5349cf544844448444d17b43fa3e097 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 10:25:03 +0100 Subject: [PATCH 13/39] fix readmes examples --- examples/cluster/README.txt | 2 +- examples/kernel_approximation/README.txt | 2 +- examples/robust/README.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cluster/README.txt b/examples/cluster/README.txt index ad0ebf6a..0dfd5871 100644 --- a/examples/cluster/README.txt +++ b/examples/cluster/README.txt @@ -3,4 +3,4 @@ Cluster ======= -Examples concerning the :mod:`sklearn_extra.kernel_methods.cluster` module. +Examples concerning the :mod:`sklearn_extra.cluster` module. diff --git a/examples/kernel_approximation/README.txt b/examples/kernel_approximation/README.txt index 5ea04362..27fcac09 100644 --- a/examples/kernel_approximation/README.txt +++ b/examples/kernel_approximation/README.txt @@ -3,5 +3,5 @@ Kernel approximation ==================== -Examples concerning the :mod:`sklearn_extra.kernel_methods.kernel_approximation` +Examples concerning the :mod:`sklearn_extra.kernel_approximation` module. diff --git a/examples/robust/README.txt b/examples/robust/README.txt index 526c9400..5ee474b3 100644 --- a/examples/robust/README.txt +++ b/examples/robust/README.txt @@ -3,4 +3,4 @@ Robust ====== -Examples concerning the :mod:`sklearn_extra.kernel_methods.robust` module. +Examples concerning the :mod:`sklearn_extra.robust` module. From f888048a1d262541678be6d0ed5668652a494f24 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 5 Nov 2024 09:25:12 +0000 Subject: [PATCH 14/39] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- doc/modules/kernel_approximation.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/modules/kernel_approximation.rst b/doc/modules/kernel_approximation.rst index 64650341..e0b2231e 100644 --- a/doc/modules/kernel_approximation.rst +++ b/doc/modules/kernel_approximation.rst @@ -29,5 +29,3 @@ mapping a single example is O(n_components log d). The space complexity is O(n_components). See `scikit-learn User-guide `_ for more general informations on kernel approximations. - - From c13d6634997be5483ed2ca5cb2698c3e2c88f349 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 10:43:54 +0100 Subject: [PATCH 15/39] requirement doc --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index f3e94be9..c79b4cc2 100755 --- a/setup.py +++ b/setup.py @@ -48,6 +48,7 @@ "tests": ["pytest", "pytest-cov"], "docs": [ "pillow", + "pandas", "sphinx", "sphinx-gallery", "sphinx_rtd_theme", From 74b423741e2377f3bad9473f296646fd4619e7c6 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 10:53:41 +0100 Subject: [PATCH 16/39] fix workflow --- .github/workflows/build-wheels.yml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 343e112f..d3cdf6ed 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -1,14 +1,7 @@ name: build_wheels on: [push, pull_request] - release: - types: - - created - workflow_dispatch: - inputs: - version: - description: 'Manually trigger wheel build in Github UI' - required: true + jobs: From eb101fa8838b89e51cf9450a0c69de121a2db9fa Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 10:58:01 +0100 Subject: [PATCH 17/39] fix workflow --- .github/workflows/build-wheels.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index d3cdf6ed..ca5b6336 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -32,7 +32,7 @@ jobs: CIBW_TEST_COMMAND: "pytest --pyargs sklearn_extra" run: | python -m cibuildwheel --output-dir wheelhouse - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v4 with: path: ./wheelhouse/*.whl @@ -53,7 +53,7 @@ jobs: - name: Build sdist run: python setup.py sdist - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v4 with: path: dist/*.tar.gz From 9ac737cb53b95b49a18ec87337b088d7abdce869 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 11:04:14 +0100 Subject: [PATCH 18/39] try some update --- .github/workflows/build-wheels.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index ca5b6336..f0936ee3 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -18,10 +18,10 @@ jobs: - uses: actions/setup-python@v2 name: Install Python with: - python-version: '3.8' + python-version: '3.10' - name: Install cibuildwheel run: | - python -m pip install cibuildwheel==2.12.1 + python -m pip install cibuildwheel==2.21.3 - name: Build wheels env: # We only build for Python 3.6+. On Linux manylinux2010 is used. @@ -45,7 +45,7 @@ jobs: - uses: actions/setup-python@v2 name: Install Python with: - python-version: '3.8' + python-version: '3.10' - name: Install dependencies run: pip install setuptools cython numpy From c1b8668347a527ce5bc94669e2be99c62b560c62 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 11:12:11 +0100 Subject: [PATCH 19/39] skip 3.6 --- .github/workflows/build-wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index f0936ee3..3edf8438 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -27,7 +27,7 @@ jobs: # We only build for Python 3.6+. On Linux manylinux2010 is used. # Skipping pypy wheels for now since scipy & scikit-learn haven't build them yet. # Skip python3.11 for 32bit. - CIBW_SKIP: "pp* *-win32 *-manylinux_i686 *musllinux*" + CIBW_SKIP: "pp* *-win32 *-manylinux_i686 *musllinux* *cp36*" CIBW_TEST_REQUIRES: "pytest pandas scikit-learn" CIBW_TEST_COMMAND: "pytest --pyargs sklearn_extra" run: | From 668a3d62c67842c9580d2f855927bf5281fd7628 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 11:15:33 +0100 Subject: [PATCH 20/39] skip 3.7 --- .github/workflows/build-wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 3edf8438..7154cdcc 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -27,7 +27,7 @@ jobs: # We only build for Python 3.6+. On Linux manylinux2010 is used. # Skipping pypy wheels for now since scipy & scikit-learn haven't build them yet. # Skip python3.11 for 32bit. - CIBW_SKIP: "pp* *-win32 *-manylinux_i686 *musllinux* *cp36*" + CIBW_SKIP: "pp* *-win32 *-manylinux_i686 *musllinux* *cp36* *cp37*" CIBW_TEST_REQUIRES: "pytest pandas scikit-learn" CIBW_TEST_COMMAND: "pytest --pyargs sklearn_extra" run: | From 366b3f53498f11c3fad70997ce97fd6f48be8db4 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 11:25:42 +0100 Subject: [PATCH 21/39] try lon long fix windows --- .../_robust_weighted_estimator_helper.pyx | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index d05945cc..3bbb0c17 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -8,7 +8,9 @@ import numpy as np cimport numpy as np from sklearn.utils.extmath import row_norms -from cython cimport floating +from libc.stdint cimport int32_t, int64_t +# instead of int and long + import sys from time import time @@ -24,12 +26,12 @@ np.import_array() cdef floating _euclidean_dense_dense( floating* a, # IN floating* b, # IN - int n_features) nogil: + int64_t n_features) nogil: """Euclidean distance between a dense and b dense""" cdef: - int i - int n = n_features // 4 - int rem = n_features % 4 + int64_t i + int64_t n = n_features // 4 + int64_t rem = n_features % 4 floating result = 0 # We manually unroll the loop for better cache optimization. @@ -48,7 +50,7 @@ cdef floating _euclidean_dense_dense( cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X, - int[:] labels): + int64_t[:] labels): """Compute inertia squared distancez between each sample and its assigned center. @@ -59,14 +61,14 @@ cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X dtype = np.double cdef: - int n_samples = X.shape[0] - int n_features = X.shape[1] - int i, j - int n_classes = len(np.unique(labels)) + int64_t n_samples = X.shape[0] + int64_t n_features = X.shape[1] + int64_t i, j + int64_t n_classes = len(np.unique(labels)) np.ndarray[floating, ndim=2] centers = np.zeros([n_classes, n_features], dtype = dtype) - np.ndarray[long] num_in_cluster = np.zeros(n_classes, dtype = int) + np.ndarray[long] num_in_cluster = np.zeros(n_classes, dtype = int64_t) np.ndarray[floating] inertias = np.zeros(n_samples, dtype = dtype) for i in range(n_samples): for j in range(n_features): From 3fd3aee61f7339e1459a2ae6c14a97489be6e2b5 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 11:31:21 +0100 Subject: [PATCH 22/39] try fix windows --- .../_robust_weighted_estimator_helper.pyx | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index 3bbb0c17..34c00d6e 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -8,6 +8,8 @@ import numpy as np cimport numpy as np from sklearn.utils.extmath import row_norms +from cython cimport floating + from libc.stdint cimport int32_t, int64_t # instead of int and long @@ -26,12 +28,12 @@ np.import_array() cdef floating _euclidean_dense_dense( floating* a, # IN floating* b, # IN - int64_t n_features) nogil: + int32_t n_features) nogil: """Euclidean distance between a dense and b dense""" cdef: - int64_t i - int64_t n = n_features // 4 - int64_t rem = n_features % 4 + int32_t i + int32_t n = n_features // 4 + int32_t rem = n_features % 4 floating result = 0 # We manually unroll the loop for better cache optimization. @@ -50,7 +52,7 @@ cdef floating _euclidean_dense_dense( cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X, - int64_t[:] labels): + int32_t[:] labels): """Compute inertia squared distancez between each sample and its assigned center. @@ -61,14 +63,14 @@ cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X dtype = np.double cdef: - int64_t n_samples = X.shape[0] - int64_t n_features = X.shape[1] - int64_t i, j - int64_t n_classes = len(np.unique(labels)) + int32_t n_samples = X.shape[0] + int32_t n_features = X.shape[1] + int32_t i, j + int32_t n_classes = len(np.unique(labels)) np.ndarray[floating, ndim=2] centers = np.zeros([n_classes, n_features], dtype = dtype) - np.ndarray[long] num_in_cluster = np.zeros(n_classes, dtype = int64_t) + np.ndarray[int64_t] num_in_cluster = np.zeros(n_classes, dtype = int32_t) np.ndarray[floating] inertias = np.zeros(n_samples, dtype = dtype) for i in range(n_samples): for j in range(n_features): From f51843e843489e3725c3f62205ac074f6025c413 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 11:43:47 +0100 Subject: [PATCH 23/39] try fix windows --- sklearn_extra/robust/_robust_weighted_estimator_helper.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index 34c00d6e..997af183 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -70,7 +70,7 @@ cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X np.ndarray[floating, ndim=2] centers = np.zeros([n_classes, n_features], dtype = dtype) - np.ndarray[int64_t] num_in_cluster = np.zeros(n_classes, dtype = int32_t) + np.ndarray[int64_t] num_in_cluster = np.zeros(n_classes, dtype = int) np.ndarray[floating] inertias = np.zeros(n_samples, dtype = dtype) for i in range(n_samples): for j in range(n_features): From c9931174838fced96cd5d543acff74f356525167 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 11:48:21 +0100 Subject: [PATCH 24/39] try fix windows --- sklearn_extra/robust/_robust_weighted_estimator_helper.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index 997af183..07a6fca8 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -70,7 +70,7 @@ cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X np.ndarray[floating, ndim=2] centers = np.zeros([n_classes, n_features], dtype = dtype) - np.ndarray[int64_t] num_in_cluster = np.zeros(n_classes, dtype = int) + np.ndarray[int32_t] num_in_cluster = np.zeros(n_classes, dtype = int) np.ndarray[floating] inertias = np.zeros(n_samples, dtype = dtype) for i in range(n_samples): for j in range(n_features): From 9d34dcea558aac7434fb9589df239aaf5f00784a Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 11:52:08 +0100 Subject: [PATCH 25/39] try fix windows --- sklearn_extra/robust/_robust_weighted_estimator_helper.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index 07a6fca8..94d0e4a4 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -70,7 +70,7 @@ cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X np.ndarray[floating, ndim=2] centers = np.zeros([n_classes, n_features], dtype = dtype) - np.ndarray[int32_t] num_in_cluster = np.zeros(n_classes, dtype = int) + np.ndarray[int] num_in_cluster = np.zeros(n_classes, dtype = int) np.ndarray[floating] inertias = np.zeros(n_samples, dtype = dtype) for i in range(n_samples): for j in range(n_features): From ebcbd5621fa9446bc16b27ccf6a84eb0ea9139b8 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 11:55:43 +0100 Subject: [PATCH 26/39] try fix windows --- sklearn_extra/robust/_robust_weighted_estimator_helper.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index 94d0e4a4..78522d62 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -70,7 +70,7 @@ cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X np.ndarray[floating, ndim=2] centers = np.zeros([n_classes, n_features], dtype = dtype) - np.ndarray[int] num_in_cluster = np.zeros(n_classes, dtype = int) + np.ndarray[np.int] num_in_cluster = np.zeros(n_classes, dtype = np.int) np.ndarray[floating] inertias = np.zeros(n_samples, dtype = dtype) for i in range(n_samples): for j in range(n_features): From fa315a9d98ac012a3440b9c5e9e7b629083ce513 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 12:07:56 +0100 Subject: [PATCH 27/39] try fix windows --- sklearn_extra/robust/_robust_weighted_estimator_helper.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index 78522d62..abd427bd 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -75,7 +75,7 @@ cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X for i in range(n_samples): for j in range(n_features): centers[labels[i], j] += X[i, j] - num_in_cluster[labels[i]] += 1 + num_in_cluster[labels[i]] = num_in_cluster[labels[i]] + 1 for i in range(n_classes): for j in range(n_features): From 1970ba3714972701a56860cebbc380e76c561485 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 12:42:24 +0100 Subject: [PATCH 28/39] try fix windows --- sklearn_extra/robust/_robust_weighted_estimator_helper.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index abd427bd..81561e46 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -70,7 +70,7 @@ cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X np.ndarray[floating, ndim=2] centers = np.zeros([n_classes, n_features], dtype = dtype) - np.ndarray[np.int] num_in_cluster = np.zeros(n_classes, dtype = np.int) + np.ndarray[np.int32] num_in_cluster = np.zeros(n_classes, dtype = np.int32) np.ndarray[floating] inertias = np.zeros(n_samples, dtype = dtype) for i in range(n_samples): for j in range(n_features): From 0b7df6cbc76ac2b8fb5a966651215ea854ebb545 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 13:13:21 +0100 Subject: [PATCH 29/39] revert to last working --- .../_robust_weighted_estimator_helper.pyx | 158 +----------------- 1 file changed, 1 insertion(+), 157 deletions(-) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index 81561e46..e0e8453e 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -1,159 +1,3 @@ -# cython: infer_types=True -# Fast swap step in PAM algorithm for k_medoid. -# Author: Timothée Mathieu -# License: 3-clause BSD - -cimport cython -import numpy as np -cimport numpy as np - -from sklearn.utils.extmath import row_norms -from cython cimport floating - -from libc.stdint cimport int32_t, int64_t -# instead of int and long - - -import sys -from time import time - -from libc.math cimport exp, log, sqrt, pow, fabs -cimport numpy as np -from numpy.math cimport INFINITY - - -# Modified from sklearn.cluster._k_means_fast.pyx -np.import_array() - -cdef floating _euclidean_dense_dense( - floating* a, # IN - floating* b, # IN - int32_t n_features) nogil: - """Euclidean distance between a dense and b dense""" - cdef: - int32_t i - int32_t n = n_features // 4 - int32_t rem = n_features % 4 - floating result = 0 - - # We manually unroll the loop for better cache optimization. - for i in range(n): - result += ((a[0] - b[0]) * (a[0] - b[0]) - +(a[1] - b[1]) * (a[1] - b[1]) - +(a[2] - b[2]) * (a[2] - b[2]) - +(a[3] - b[3]) * (a[3] - b[3])) - a += 4; b += 4 - - for i in range(rem): - result += (a[i] - b[i]) * (a[i] - b[i]) - - return result - - - -cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X, - int32_t[:] labels): - """Compute inertia - - squared distancez between each sample and its assigned center. - """ - if floating is float: - dtype = np.float32 - elif floating is double: - dtype = np.double - - cdef: - int32_t n_samples = X.shape[0] - int32_t n_features = X.shape[1] - int32_t i, j - int32_t n_classes = len(np.unique(labels)) - np.ndarray[floating, ndim=2] centers = np.zeros([n_classes, - n_features], - dtype = dtype) - np.ndarray[np.int32] num_in_cluster = np.zeros(n_classes, dtype = np.int32) - np.ndarray[floating] inertias = np.zeros(n_samples, dtype = dtype) - for i in range(n_samples): - for j in range(n_features): - centers[labels[i], j] += X[i, j] - num_in_cluster[labels[i]] = num_in_cluster[labels[i]] + 1 - - for i in range(n_classes): - for j in range(n_features): - centers[i, j] /= num_in_cluster[i] - - for i in range(n_samples): - j = labels[i] - inertias[i] = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], n_features) - return inertias - - - - - -# Regression and Classification losses, from scikit-learn. - - - - -# ---------------------------------------- -# Extension Types for Loss Functions -# ---------------------------------------- - -cdef class LossFunction: - """Base class for convex loss functions""" - - cdef double loss(self, double p, double y) nogil: - """Evaluate the loss function. - - Parameters - ---------- - p : double - The prediction, p = w^T x - y : double - The true value (aka target) - - Returns - ------- - double - The loss evaluated at `p` and `y`. - """ - return 0. - - def py_dloss(self, double p, double y): - """Python version of `dloss` for testing. - - Pytest needs a python function and can't use cdef functions. - """ - return self.dloss(p, y) - - def py_loss(self, double p, double y): - """Python version of `dloss` for testing. - - Pytest needs a python function and can't use cdef functions. - """ - return self.loss(p, y) - - - cdef double dloss(self, double p, double y) nogil: - """Evaluate the derivative of the loss function with respect to - the prediction `p`. - - Parameters - ---------- - p : double - The prediction, p = w^T x - y : double - The true value (aka target) - Returns - ------- - double - The derivative of the loss function with regards to `p`. - """ - return 0. - - -cdef class Regression(LossFunction): - """Base class for loss functions for regression""" cdef double loss(self, double p, double y) nogil: return 0. @@ -336,4 +180,4 @@ cdef class Huber(Regression): return -self.c def __reduce__(self): - return Huber, (self.c,) + return Huber, (self.c,) \ No newline at end of file From 1c47b6f9032b7c8fca7d41758fe4ca9b78f8e7d4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 5 Nov 2024 12:14:26 +0000 Subject: [PATCH 30/39] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sklearn_extra/robust/_robust_weighted_estimator_helper.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index e0e8453e..a90872bb 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -180,4 +180,4 @@ cdef class Huber(Regression): return -self.c def __reduce__(self): - return Huber, (self.c,) \ No newline at end of file + return Huber, (self.c,) From b737a8a86573b309c88d83c0a01ecfabfa720446 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 13:25:14 +0100 Subject: [PATCH 31/39] typos --- .../_robust_weighted_estimator_helper.pyx | 154 +++++++++++++++++- 1 file changed, 153 insertions(+), 1 deletion(-) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index e0e8453e..d05945cc 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -1,3 +1,155 @@ +# cython: infer_types=True +# Fast swap step in PAM algorithm for k_medoid. +# Author: Timothée Mathieu +# License: 3-clause BSD + +cimport cython +import numpy as np +cimport numpy as np + +from sklearn.utils.extmath import row_norms +from cython cimport floating + +import sys +from time import time + +from libc.math cimport exp, log, sqrt, pow, fabs +cimport numpy as np +from numpy.math cimport INFINITY + + +# Modified from sklearn.cluster._k_means_fast.pyx +np.import_array() + +cdef floating _euclidean_dense_dense( + floating* a, # IN + floating* b, # IN + int n_features) nogil: + """Euclidean distance between a dense and b dense""" + cdef: + int i + int n = n_features // 4 + int rem = n_features % 4 + floating result = 0 + + # We manually unroll the loop for better cache optimization. + for i in range(n): + result += ((a[0] - b[0]) * (a[0] - b[0]) + +(a[1] - b[1]) * (a[1] - b[1]) + +(a[2] - b[2]) * (a[2] - b[2]) + +(a[3] - b[3]) * (a[3] - b[3])) + a += 4; b += 4 + + for i in range(rem): + result += (a[i] - b[i]) * (a[i] - b[i]) + + return result + + + +cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X, + int[:] labels): + """Compute inertia + + squared distancez between each sample and its assigned center. + """ + if floating is float: + dtype = np.float32 + elif floating is double: + dtype = np.double + + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int i, j + int n_classes = len(np.unique(labels)) + np.ndarray[floating, ndim=2] centers = np.zeros([n_classes, + n_features], + dtype = dtype) + np.ndarray[long] num_in_cluster = np.zeros(n_classes, dtype = int) + np.ndarray[floating] inertias = np.zeros(n_samples, dtype = dtype) + for i in range(n_samples): + for j in range(n_features): + centers[labels[i], j] += X[i, j] + num_in_cluster[labels[i]] += 1 + + for i in range(n_classes): + for j in range(n_features): + centers[i, j] /= num_in_cluster[i] + + for i in range(n_samples): + j = labels[i] + inertias[i] = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], n_features) + return inertias + + + + + +# Regression and Classification losses, from scikit-learn. + + + + +# ---------------------------------------- +# Extension Types for Loss Functions +# ---------------------------------------- + +cdef class LossFunction: + """Base class for convex loss functions""" + + cdef double loss(self, double p, double y) nogil: + """Evaluate the loss function. + + Parameters + ---------- + p : double + The prediction, p = w^T x + y : double + The true value (aka target) + + Returns + ------- + double + The loss evaluated at `p` and `y`. + """ + return 0. + + def py_dloss(self, double p, double y): + """Python version of `dloss` for testing. + + Pytest needs a python function and can't use cdef functions. + """ + return self.dloss(p, y) + + def py_loss(self, double p, double y): + """Python version of `dloss` for testing. + + Pytest needs a python function and can't use cdef functions. + """ + return self.loss(p, y) + + + cdef double dloss(self, double p, double y) nogil: + """Evaluate the derivative of the loss function with respect to + the prediction `p`. + + Parameters + ---------- + p : double + The prediction, p = w^T x + y : double + The true value (aka target) + Returns + ------- + double + The derivative of the loss function with regards to `p`. + """ + return 0. + + +cdef class Regression(LossFunction): + """Base class for loss functions for regression""" cdef double loss(self, double p, double y) nogil: return 0. @@ -180,4 +332,4 @@ cdef class Huber(Regression): return -self.c def __reduce__(self): - return Huber, (self.c,) \ No newline at end of file + return Huber, (self.c,) From 9bbb2deaf5cdd26a7e6a5612eb31f3a6d6762bf6 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 13:35:09 +0100 Subject: [PATCH 32/39] try fix windows --- sklearn_extra/robust/_robust_weighted_estimator_helper.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index d05945cc..b1b781aa 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -14,6 +14,7 @@ import sys from time import time from libc.math cimport exp, log, sqrt, pow, fabs +from libc.stdint cimport int64_t cimport numpy as np from numpy.math cimport INFINITY @@ -66,7 +67,7 @@ cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X np.ndarray[floating, ndim=2] centers = np.zeros([n_classes, n_features], dtype = dtype) - np.ndarray[long] num_in_cluster = np.zeros(n_classes, dtype = int) + np.ndarray[int64_t] num_in_cluster = np.zeros(n_classes, dtype = int) np.ndarray[floating] inertias = np.zeros(n_samples, dtype = dtype) for i in range(n_samples): for j in range(n_features): From 78ef2f2049e11b053343c7ba4dcfd6e844b519f8 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 13:40:31 +0100 Subject: [PATCH 33/39] try fix windows --- sklearn_extra/robust/_robust_weighted_estimator_helper.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index b1b781aa..0ee12a98 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -14,7 +14,7 @@ import sys from time import time from libc.math cimport exp, log, sqrt, pow, fabs -from libc.stdint cimport int64_t +from libc.stdint cimport int32_t cimport numpy as np from numpy.math cimport INFINITY @@ -67,7 +67,7 @@ cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X np.ndarray[floating, ndim=2] centers = np.zeros([n_classes, n_features], dtype = dtype) - np.ndarray[int64_t] num_in_cluster = np.zeros(n_classes, dtype = int) + np.ndarray[int32_t] num_in_cluster = np.zeros(n_classes, dtype = int) np.ndarray[floating] inertias = np.zeros(n_samples, dtype = dtype) for i in range(n_samples): for j in range(n_features): From 832dfc19ac35594cd6d6cdc3a47a3116b94451d1 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 13:45:54 +0100 Subject: [PATCH 34/39] try fix windows --- sklearn_extra/robust/_robust_weighted_estimator_helper.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index 0ee12a98..d2bbd6b9 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -67,7 +67,7 @@ cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X np.ndarray[floating, ndim=2] centers = np.zeros([n_classes, n_features], dtype = dtype) - np.ndarray[int32_t] num_in_cluster = np.zeros(n_classes, dtype = int) + np.ndarray[floating] num_in_cluster = np.zeros(n_classes, dtype = dtype) np.ndarray[floating] inertias = np.zeros(n_samples, dtype = dtype) for i in range(n_samples): for j in range(n_features): From f19d98a7192a838efc676784c6ee5caf6eae1fe3 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 14:24:20 +0100 Subject: [PATCH 35/39] try fix windows --- .github/workflows/build-wheels.yml | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 7154cdcc..4c31be6c 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -32,30 +32,15 @@ jobs: CIBW_TEST_COMMAND: "pytest --pyargs sklearn_extra" run: | python -m cibuildwheel --output-dir wheelhouse - - uses: actions/upload-artifact@v4 - with: - path: ./wheelhouse/*.whl - - build_sdist: - name: sdist - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - - uses: actions/setup-python@v2 - name: Install Python - with: - python-version: '3.10' - - - name: Install dependencies - run: pip install setuptools cython numpy - name: Build sdist run: python setup.py sdist - uses: actions/upload-artifact@v4 with: - path: dist/*.tar.gz + path: | + ./wheelhouse/*.whl + ./dist/*.tar.gz # upload_pypi: # needs: [build_wheels, build_sdist] From 9ba0d2c3e4b70fac0524821445d8766f88707910 Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 14:59:17 +0100 Subject: [PATCH 36/39] try fix windows --- .github/workflows/build-wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 4c31be6c..87a78639 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -21,7 +21,7 @@ jobs: python-version: '3.10' - name: Install cibuildwheel run: | - python -m pip install cibuildwheel==2.21.3 + python -m pip install cibuildwheel==2.21.3 setuptools cython numpy - name: Build wheels env: # We only build for Python 3.6+. On Linux manylinux2010 is used. From 0acd4916eab71b1fed48ee1aa9283fb03091837a Mon Sep 17 00:00:00 2001 From: Timothee Mathieu Date: Tue, 5 Nov 2024 15:17:06 +0100 Subject: [PATCH 37/39] try fix windows --- .github/workflows/build-wheels.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 87a78639..c7b7d3f3 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -41,6 +41,7 @@ jobs: path: | ./wheelhouse/*.whl ./dist/*.tar.gz + name: ${{ matrix.os }} # upload_pypi: # needs: [build_wheels, build_sdist] From 10a4e9f1f7da7e5b050a9bd3dd64af615930c30c Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Mon, 19 May 2025 16:38:11 +0200 Subject: [PATCH 38/39] update how cimport Infinity --- sklearn_extra/robust/_robust_weighted_estimator_helper.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index d2bbd6b9..8118c19e 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -16,8 +16,8 @@ from time import time from libc.math cimport exp, log, sqrt, pow, fabs from libc.stdint cimport int32_t cimport numpy as np -from numpy.math cimport INFINITY - +from libc.math cimport INFINITY + # Modified from sklearn.cluster._k_means_fast.pyx np.import_array() From 21f5aa70c18e4615aaf700905014e9bf84c0d4f3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 May 2025 14:38:37 +0000 Subject: [PATCH 39/39] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sklearn_extra/robust/_robust_weighted_estimator_helper.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx index 8118c19e..02493e1a 100644 --- a/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx +++ b/sklearn_extra/robust/_robust_weighted_estimator_helper.pyx @@ -17,7 +17,7 @@ from libc.math cimport exp, log, sqrt, pow, fabs from libc.stdint cimport int32_t cimport numpy as np from libc.math cimport INFINITY - + # Modified from sklearn.cluster._k_means_fast.pyx np.import_array()