From fecc606b47f9dd71784db0963009ae01b5eb76ce Mon Sep 17 00:00:00 2001 From: yangarbiter Date: Sun, 22 Aug 2021 10:16:53 -0700 Subject: [PATCH 1/2] Improve linting --- libact/base/interfaces.py | 10 - libact/labelers/__init__.py | 4 +- libact/labelers/ideal_labeler.py | 11 +- libact/labelers/interactive_labeler.py | 1 + libact/models/logistic_regression.py | 2 +- libact/models/perceptron.py | 1 + libact/models/sklearn_adapter.py | 7 +- libact/models/svm.py | 14 +- libact/query_strategies/__init__.py | 2 + .../density_weighted_uncertainty_sampling.py | 2 +- libact/query_strategies/multiclass/mdsp.py | 912 +++++++++--------- libact/query_strategies/query_by_committee.py | 2 +- libact/query_strategies/variance_reduction.py | 2 +- 13 files changed, 485 insertions(+), 485 deletions(-) diff --git a/libact/base/interfaces.py b/libact/base/interfaces.py index ac1b4f09..cff88843 100644 --- a/libact/base/interfaces.py +++ b/libact/base/interfaces.py @@ -36,7 +36,6 @@ def update(self, entry_id, label): label : float The label of the queried sample. """ - pass def _get_scores(self): """Return the score used for making query, the larger the better. Read-only. @@ -48,7 +47,6 @@ def _get_scores(self): (ask_id, scores): list of tuple (int, float) The index of the next unlabeled sample to be queried and the score assigned. """ - pass @abstractmethod def make_query(self): @@ -61,7 +59,6 @@ def make_query(self): ask_id : int The index of the next unlabeled sample to be queried and labeled. """ - pass class Labeler(with_metaclass(ABCMeta, object)): @@ -84,7 +81,6 @@ def label(self, feature): label : int The class label of the queried feature. """ - pass class Model(with_metaclass(ABCMeta, object)): @@ -108,7 +104,6 @@ def train(self, dataset, *args, **kwargs): self : object Returns self. """ - pass @abstractmethod def predict(self, feature, *args, **kwargs): @@ -124,7 +119,6 @@ def predict(self, feature, *args, **kwargs): y_pred : array-like, shape (n_samples,) The class labels for samples in the feature array. """ - pass @abstractmethod def score(self, testing_dataset, *args, **kwargs): @@ -141,7 +135,6 @@ def score(self, testing_dataset, *args, **kwargs): score : float Mean accuracy of self.predict(X) wrt. y. """ - pass class MultilabelModel(Model): @@ -150,7 +143,6 @@ class MultilabelModel(Model): A Model returns a multilabel-predicting function for future samples after trained on a training dataset. """ - pass class ContinuousModel(Model): @@ -183,7 +175,6 @@ def predict_real(self, feature, *args, **kwargs): Each entry is the confidence scores per (sample, class) combination. """ - pass class ProbabilisticModel(ContinuousModel): @@ -210,4 +201,3 @@ def predict_proba(self, feature, *args, **kwargs): X : array-like, shape (n_samples, n_classes) Each entry is the prabablity estimate for each class. """ - pass diff --git a/libact/labelers/__init__.py b/libact/labelers/__init__.py index 457a5b35..dfa83060 100644 --- a/libact/labelers/__init__.py +++ b/libact/labelers/__init__.py @@ -5,6 +5,6 @@ from .ideal_labeler import IdealLabeler try: from .interactive_labeler import InteractiveLabeler -except ImportError: +except ImportError as import_error: raise ImportError("Error importing matplotlib." 
- "InteractiveLabeler not supported.") + "InteractiveLabeler not supported.") from import_error diff --git a/libact/labelers/ideal_labeler.py b/libact/labelers/ideal_labeler.py index b16c9550..8a121d82 100644 --- a/libact/labelers/ideal_labeler.py +++ b/libact/labelers/ideal_labeler.py @@ -20,7 +20,8 @@ class IdealLabeler(Labeler): """ - def __init__(self, dataset, **kwargs): + def __init__(self, dataset): + super().__init__() X, y = dataset.get_entries() # make sure the input dataset is fully labeled assert (np.array(y) != np.array(None)).all() @@ -29,7 +30,7 @@ def __init__(self, dataset, **kwargs): @inherit_docstring_from(Labeler) def label(self, feature): - yy = self.y[np.where([np.array_equal(x, feature) - for x in self.X])[0]] - ind = np.arange(len(yy)) - return yy[np.random.choice(ind, 1)[0]] + labels = self.y[np.where([np.array_equal(x, feature) + for x in self.X])[0]] + ind = np.arange(len(labels)) + return labels[np.random.choice(ind, 1)[0]] diff --git a/libact/labelers/interactive_labeler.py b/libact/labelers/interactive_labeler.py index de06fe52..f25e90c3 100644 --- a/libact/labelers/interactive_labeler.py +++ b/libact/labelers/interactive_labeler.py @@ -26,6 +26,7 @@ class InteractiveLabeler(Labeler): """ def __init__(self, **kwargs): + super().__init__() self.label_name = kwargs.pop('label_name', None) @inherit_docstring_from(Labeler) diff --git a/libact/models/logistic_regression.py b/libact/models/logistic_regression.py index 538c11b0..57ce0af8 100644 --- a/libact/models/logistic_regression.py +++ b/libact/models/logistic_regression.py @@ -18,6 +18,7 @@ class LogisticRegression(ProbabilisticModel): """ def __init__(self, *args, **kwargs): + super().__init__() self.model = sklearn.linear_model.LogisticRegression(*args, **kwargs) def train(self, dataset, *args, **kwargs): @@ -38,4 +39,3 @@ def predict_real(self, feature, *args, **kwargs): def predict_proba(self, feature, *args, **kwargs): return self.model.predict_proba(feature, *args, **kwargs) - diff --git a/libact/models/perceptron.py b/libact/models/perceptron.py index 4b0e67ac..b716a71f 100644 --- a/libact/models/perceptron.py +++ b/libact/models/perceptron.py @@ -16,6 +16,7 @@ class Perceptron(Model): """ def __init__(self, *args, **kwargs): + super().__init__() self.model = sklearn.linear_model.Perceptron(*args, **kwargs) def train(self, dataset, *args, **kwargs): diff --git a/libact/models/sklearn_adapter.py b/libact/models/sklearn_adapter.py index 61dc93b6..9837228d 100644 --- a/libact/models/sklearn_adapter.py +++ b/libact/models/sklearn_adapter.py @@ -1,7 +1,7 @@ """scikit-learn classifier adapter """ from sklearn.base import clone -from libact.base.interfaces import Model, ContinuousModel, ProbabilisticModel +from libact.base.interfaces import Model, ProbabilisticModel class SklearnAdapter(Model): @@ -37,6 +37,7 @@ class SklearnAdapter(Model): """ def __init__(self, clf): + super().__init__() self._model = clf def train(self, dataset, *args, **kwargs): @@ -50,6 +51,8 @@ def score(self, testing_dataset, *args, **kwargs): **kwargs) def clone(self): + """Constructs a new untrained model with the same parameters. + """ return SklearnProbaAdapter(clone(self._model)) @@ -108,4 +111,6 @@ def predict_proba(self, feature, *args, **kwargs): return self._model.predict_proba(feature, *args, **kwargs) def clone(self): + """Constructs a new untrained model with the same parameters. 
+ """ return SklearnProbaAdapter(clone(self._model)) diff --git a/libact/models/svm.py b/libact/models/svm.py index e3698c93..0edd6c94 100644 --- a/libact/models/svm.py +++ b/libact/models/svm.py @@ -3,7 +3,6 @@ An interface for scikit-learn's C-Support Vector Classifier model. """ import logging -LOGGER = logging.getLogger(__name__) import numpy as np import sklearn.svm @@ -12,6 +11,9 @@ from libact.base.interfaces import ContinuousModel +LOGGER = logging.getLogger(__name__) + + class SVM(ContinuousModel): """C-Support Vector Machine Classifier @@ -46,8 +48,8 @@ def predict_real(self, feature, *args, **kwargs): dvalue = self.model.decision_function(feature, *args, **kwargs) if len(np.shape(dvalue)) == 1: # n_classes == 2 return np.vstack((-dvalue, dvalue)).T - else: - if self.decision_function_shape != 'ovr': - LOGGER.warn("SVM model support only 'ovr' for multiclass" - "predict_real.") - return dvalue + + if self.decision_function_shape != 'ovr': + LOGGER.warning("SVM model support only 'ovr' for multiclass" + "predict_real.") + return dvalue diff --git a/libact/query_strategies/__init__.py b/libact/query_strategies/__init__.py index 04ec42de..82736514 100644 --- a/libact/query_strategies/__init__.py +++ b/libact/query_strategies/__init__.py @@ -3,6 +3,8 @@ """ from __future__ import absolute_import +# pylint: disable=wrong-import-position + import os ON_RTD = os.environ.get('READTHEDOCS', None) == 'True' import logging diff --git a/libact/query_strategies/density_weighted_uncertainty_sampling.py b/libact/query_strategies/density_weighted_uncertainty_sampling.py index f2986c55..d3450eda 100644 --- a/libact/query_strategies/density_weighted_uncertainty_sampling.py +++ b/libact/query_strategies/density_weighted_uncertainty_sampling.py @@ -154,7 +154,7 @@ def make_query(self): return unlabeled_entry_ids[ask_id] class DensityWeightedLogisticRegression(object): - """Density Weighted Logistic Regression + r"""Density Weighted Logistic Regression Density Weighted Logistice Regression is used in DWUS to estimate the probability of representing which label for each cluster. diff --git a/libact/query_strategies/multiclass/mdsp.py b/libact/query_strategies/multiclass/mdsp.py index 84631a89..fa35eeb6 100644 --- a/libact/query_strategies/multiclass/mdsp.py +++ b/libact/query_strategies/multiclass/mdsp.py @@ -1,457 +1,455 @@ -""" -Multi-dimensional Scaling Partial (MDSP) -This modeuls is modified from -https://github.com/scikit-learn/scikit-learn/blob/14031f6/sklearn/manifold/mds.py -by Kuan-Hao Huang. -""" - -# author: Nelle Varoquaux -# Licence: BSD - -import numpy as np -import sklearn - -import warnings - -from sklearn.base import BaseEstimator -from sklearn.metrics import euclidean_distances -from sklearn.utils import check_random_state, check_array, check_symmetric -from sklearn.externals.joblib import Parallel -from sklearn.externals.joblib import delayed -from sklearn.isotonic import IsotonicRegression - - -def _smacof_single_p(similarities, n_uq, metric=True, n_components=2, init=None, - max_iter=300, verbose=0, eps=1e-3, random_state=None): - """ - Computes multidimensional scaling using SMACOF algorithm - - Parameters - ---------- - n_uq - - similarities: symmetric ndarray, shape [n * n] - similarities between the points - - metric: boolean, optional, default: True - compute metric or nonmetric SMACOF algorithm - - n_components: int, optional, default: 2 - number of dimension in which to immerse the similarities - overwritten if initial array is provided. 
- - init: {None or ndarray}, optional - if None, randomly chooses the initial configuration - if ndarray, initialize the SMACOF algorithm with this array - - max_iter: int, optional, default: 300 - Maximum number of iterations of the SMACOF algorithm for a single run - - verbose: int, optional, default: 0 - level of verbosity - - eps: float, optional, default: 1e-6 - relative tolerance w.r.t stress to declare converge - - random_state: integer or numpy.RandomState, optional - The generator used to initialize the centers. If an integer is - given, it fixes the seed. Defaults to the global numpy random - number generator. - - Returns - ------- - X: ndarray (n_samples, n_components), float - coordinates of the n_samples points in a n_components-space - - stress_: float - The final value of the stress (sum of squared distance of the - disparities and the distances for all constrained points) - - n_iter : int - Number of iterations run. - - """ - similarities = check_symmetric(similarities, raise_exception=True) - - n_samples = similarities.shape[0] - random_state = check_random_state(random_state) - - W = np.ones((n_samples, n_samples)) - W[:n_uq, :n_uq] = 0.0 - W[n_uq:, n_uq:] = 0.0 - # W[np.arange(len(W)), np.arange(len(W))] = 0.0 - - V = -W - V[np.arange(len(V)), np.arange(len(V))] = W.sum(axis=1) - e = np.ones((n_samples, 1)) - - Vp = np.linalg.inv(V + np.dot(e, e.T)/n_samples) - np.dot(e, e.T)/n_samples - # Vp = np.linalg.pinv(V) - - # sim_flat = ((1 - np.tri(n_samples)) * similarities).ravel() - sim_flat = similarities.ravel() - sim_flat_w = sim_flat[sim_flat != 0] - if init is None: - # Randomly choose initial configuration - X = random_state.rand(n_samples * n_components) - X = X.reshape((n_samples, n_components)) - else: - # overrides the parameter p - n_components = init.shape[1] - if n_samples != init.shape[0]: - raise ValueError("init matrix should be of shape (%d, %d)" % - (n_samples, n_components)) - X = init - - old_stress = None - ir = IsotonicRegression() - for it in range(max_iter): - # Compute distance and monotonic regression - dis = euclidean_distances(X) - - if metric: - disparities = similarities - else: - # dis_flat = dis.ravel() - # # similarities with 0 are considered as missing values - # dis_flat_w = dis_flat[sim_flat != 0] - - # # Compute the disparities using a monotonic regression - # disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w) - # disparities = dis_flat.copy() - # disparities[sim_flat != 0] = disparities_flat - # disparities = disparities.reshape((n_samples, n_samples)) - # disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) / - # (disparities ** 2).sum()) - - dis_flat = dis.ravel() - # similarities with 0 are considered as missing values - dis_flat_w = dis_flat[sim_flat != 0] - - # Compute the disparities using a monotonic regression - disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w) - disparities = dis_flat.copy() - disparities[sim_flat != 0] = disparities_flat - disparities = disparities.reshape((n_samples, n_samples)) - disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) / (disparities ** 2).sum()) - disparities[similarities==0] = 0 - - # Compute stress - # stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2 - _stress = (W.ravel()*((dis.ravel() - disparities.ravel()) ** 2)).sum() / 2 - - # Update X using the Guttman transform - # dis[dis == 0] = 1e-5 - # ratio = disparities / dis - # B = - ratio - # B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1) - # X = 1. / n_samples * np.dot(B, X) - # print (1. 
/ n_samples * np.dot(B, X))[:5].T - - dis[dis == 0] = 1e-5 - ratio = disparities / dis - _B = - W*ratio - _B[np.arange(len(_B)), np.arange(len(_B))] += (W*ratio).sum(axis=1) - - X = np.dot(Vp, np.dot(_B, X)) - # print X[:5].T - - dis = np.sqrt((X ** 2).sum(axis=1)).sum() - - if verbose >= 2: - print('it: %d, stress %s' % (it, _stress)) - if old_stress is not None: - if(old_stress - _stress / dis) < eps: - if verbose: - print('breaking at iteration %d with stress %s' % (it, - _stress)) - break - old_stress = _stress / dis - - return X, _stress, it + 1 - - -def smacof_p(similarities, n_uq, metric=True, n_components=2, init=None, n_init=8, - n_jobs=1, max_iter=300, verbose=0, eps=1e-3, random_state=None, - return_n_iter=False): - """ - Computes multidimensional scaling using SMACOF (Scaling by Majorizing a - Complicated Function) algorithm - - The SMACOF algorithm is a multidimensional scaling algorithm: it minimizes - a objective function, the *stress*, using a majorization technique. The - Stress Majorization, also known as the Guttman Transform, guarantees a - monotone convergence of Stress, and is more powerful than traditional - techniques such as gradient descent. - - The SMACOF algorithm for metric MDS can summarized by the following steps: - - 1. Set an initial start configuration, randomly or not. - 2. Compute the stress - 3. Compute the Guttman Transform - 4. Iterate 2 and 3 until convergence. - - The nonmetric algorithm adds a monotonic regression steps before computing - the stress. - - Parameters - ---------- - similarities : symmetric ndarray, shape (n_samples, n_samples) - similarities between the points - - metric : boolean, optional, default: True - compute metric or nonmetric SMACOF algorithm - - n_components : int, optional, default: 2 - number of dimension in which to immerse the similarities - overridden if initial array is provided. - - init : {None or ndarray of shape (n_samples, n_components)}, optional - if None, randomly chooses the initial configuration - if ndarray, initialize the SMACOF algorithm with this array - - n_init : int, optional, default: 8 - Number of time the smacof_p algorithm will be run with different - initialisation. The final results will be the best output of the - n_init consecutive runs in terms of stress. - - n_jobs : int, optional, default: 1 - - The number of jobs to use for the computation. This works by breaking - down the pairwise matrix into n_jobs even slices and computing them in - parallel. - - If -1 all CPUs are used. If 1 is given, no parallel computing code is - used at all, which is useful for debugging. For n_jobs below -1, - (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one - are used. - - max_iter : int, optional, default: 300 - Maximum number of iterations of the SMACOF algorithm for a single run - - verbose : int, optional, default: 0 - level of verbosity - - eps : float, optional, default: 1e-6 - relative tolerance w.r.t stress to declare converge - - random_state : integer or numpy.RandomState, optional - The generator used to initialize the centers. If an integer is - given, it fixes the seed. Defaults to the global numpy random - number generator. - - return_n_iter : bool - Whether or not to return the number of iterations. 
- - Returns - ------- - X : ndarray (n_samples,n_components) - Coordinates of the n_samples points in a n_components-space - - stress : float - The final value of the stress (sum of squared distance of the - disparities and the distances for all constrained points) - - n_iter : int - The number of iterations corresponding to the best stress. - Returned only if `return_n_iter` is set to True. - - Notes - ----- - "Modern Multidimensional Scaling - Theory and Applications" Borg, I.; - Groenen P. Springer Series in Statistics (1997) - - "Nonmetric multidimensional scaling: a numerical method" Kruskal, J. - Psychometrika, 29 (1964) - - "Multidimensional scaling by optimizing goodness of fit to a nonmetric - hypothesis" Kruskal, J. Psychometrika, 29, (1964) - """ - - similarities = check_array(similarities) - random_state = check_random_state(random_state) - - if hasattr(init, '__array__'): - init = np.asarray(init).copy() - if not n_init == 1: - warnings.warn( - 'Explicit initial positions passed: ' - 'performing only one init of the MDS instead of %d' - % n_init) - n_init = 1 - - best_pos, best_stress = None, None - - if n_jobs == 1: - for it in range(n_init): - pos, stress, n_iter_ = _smacof_single_p( - similarities, n_uq, metric=metric, - n_components=n_components, init=init, - max_iter=max_iter, verbose=verbose, - eps=eps, random_state=random_state) - if best_stress is None or stress < best_stress: - best_stress = stress - best_pos = pos.copy() - best_iter = n_iter_ - else: - seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) - results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))( - delayed(_smacof_single_p)( - similarities, n_uq, metric=metric, n_components=n_components, - init=init, max_iter=max_iter, verbose=verbose, eps=eps, - random_state=seed) - for seed in seeds) - positions, stress, n_iters = zip(*results) - best = np.argmin(stress) - best_stress = stress[best] - best_pos = positions[best] - best_iter = n_iters[best] - - if return_n_iter: - return best_pos, best_stress, best_iter - else: - return best_pos, best_stress - - -class MDSP(BaseEstimator): - """Multidimensional scaling - - Parameters - ---------- - metric : boolean, optional, default: True - compute metric or nonmetric SMACOF (Scaling by Majorizing a - Complicated Function) algorithm - - n_components : int, optional, default: 2 - number of dimension in which to immerse the similarities - overridden if initial array is provided. - - n_init : int, optional, default: 4 - Number of time the smacof_p algorithm will be run with different - initialisation. The final results will be the best output of the - n_init consecutive runs in terms of stress. - - max_iter : int, optional, default: 300 - Maximum number of iterations of the SMACOF algorithm for a single run - - verbose : int, optional, default: 0 - level of verbosity - - eps : float, optional, default: 1e-6 - relative tolerance w.r.t stress to declare converge - - n_jobs : int, optional, default: 1 - The number of jobs to use for the computation. This works by breaking - down the pairwise matrix into n_jobs even slices and computing them in - parallel. - - If -1 all CPUs are used. If 1 is given, no parallel computing code is - used at all, which is useful for debugging. For n_jobs below -1, - (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one - are used. - - random_state : integer or numpy.RandomState, optional - The generator used to initialize the centers. If an integer is - given, it fixes the seed. 
Defaults to the global numpy random - number generator. - - dissimilarity : string - Which dissimilarity measure to use. - Supported are 'euclidean' and 'precomputed'. - - - Attributes - ---------- - embedding_ : array-like, shape [n_components, n_samples] - Stores the position of the dataset in the embedding space - - stress_ : float - The final value of the stress (sum of squared distance of the - disparities and the distances for all constrained points) - - - References - ---------- - "Modern Multidimensional Scaling - Theory and Applications" Borg, I.; - Groenen P. Springer Series in Statistics (1997) - - "Nonmetric multidimensional scaling: a numerical method" Kruskal, J. - Psychometrika, 29 (1964) - - "Multidimensional scaling by optimizing goodness of fit to a nonmetric - hypothesis" Kruskal, J. Psychometrika, 29, (1964) - - """ - def __init__(self, n_components=2, n_uq=1, metric=True, n_init=4, - max_iter=300, verbose=0, eps=1e-3, n_jobs=1, - random_state=None, dissimilarity="euclidean"): - self.n_components = n_components - self.n_uq = n_uq - self.dissimilarity = dissimilarity - self.metric = metric - self.n_init = n_init - self.max_iter = max_iter - self.eps = eps - self.verbose = verbose - self.n_jobs = n_jobs - self.random_state = random_state - - @property - def _pairwise(self): - return self.kernel == "precomputed" - - def fit(self, X, y=None, init=None): - """ - Computes the position of the points in the embedding space - - Parameters - ---------- - X : array, shape=[n_samples, n_features], or [n_samples, n_samples] \ - if dissimilarity='precomputed' - Input data. - - init : {None or ndarray, shape (n_samples,)}, optional - If None, randomly chooses the initial configuration - if ndarray, initialize the SMACOF algorithm with this array. - """ - self.fit_transform(X, init=init) - return self - - def fit_transform(self, X, y=None, init=None): - """ - Fit the data from X, and returns the embedded coordinates - - Parameters - ---------- - X : array, shape=[n_samples, n_features], or [n_samples, n_samples] \ - if dissimilarity='precomputed' - Input data. - - init : {None or ndarray, shape (n_samples,)}, optional - If None, randomly chooses the initial configuration - if ndarray, initialize the SMACOF algorithm with this array. - - """ - X = check_array(X) - if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": - warnings.warn("The MDS API has changed. ``fit`` now constructs an" - " dissimilarity matrix from data. To use a custom " - "dissimilarity matrix, set " - "``dissimilarity=precomputed``.") - - if self.dissimilarity == "precomputed": - self.dissimilarity_matrix_ = X - elif self.dissimilarity == "euclidean": - self.dissimilarity_matrix_ = euclidean_distances(X) - else: - raise ValueError("Proximity must be 'precomputed' or 'euclidean'." - " Got %s instead" % str(self.dissimilarity)) - - self.embedding_, self.stress_, self.n_iter_ = smacof_p( - self.dissimilarity_matrix_, self.n_uq, metric=self.metric, - n_components=self.n_components, init=init, n_init=self.n_init, - n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose, - eps=self.eps, random_state=self.random_state, - return_n_iter=True) - - return self.embedding_ +""" +Multi-dimensional Scaling Partial (MDSP) +This modeuls is modified from +https://github.com/scikit-learn/scikit-learn/blob/14031f6/sklearn/manifold/mds.py +by Kuan-Hao Huang. 
+""" + +# author: Nelle Varoquaux +# Licence: BSD + +import numpy as np + +import warnings + +from sklearn.base import BaseEstimator +from sklearn.metrics import euclidean_distances +from sklearn.utils import check_random_state, check_array, check_symmetric +from joblib import Parallel, delayed +from sklearn.isotonic import IsotonicRegression + + +def _smacof_single_p(similarities, n_uq, metric=True, n_components=2, init=None, + max_iter=300, verbose=0, eps=1e-3, random_state=None): + """ + Computes multidimensional scaling using SMACOF algorithm + + Parameters + ---------- + n_uq + + similarities: symmetric ndarray, shape [n * n] + similarities between the points + + metric: boolean, optional, default: True + compute metric or nonmetric SMACOF algorithm + + n_components: int, optional, default: 2 + number of dimension in which to immerse the similarities + overwritten if initial array is provided. + + init: {None or ndarray}, optional + if None, randomly chooses the initial configuration + if ndarray, initialize the SMACOF algorithm with this array + + max_iter: int, optional, default: 300 + Maximum number of iterations of the SMACOF algorithm for a single run + + verbose: int, optional, default: 0 + level of verbosity + + eps: float, optional, default: 1e-6 + relative tolerance w.r.t stress to declare converge + + random_state: integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + Returns + ------- + X: ndarray (n_samples, n_components), float + coordinates of the n_samples points in a n_components-space + + stress_: float + The final value of the stress (sum of squared distance of the + disparities and the distances for all constrained points) + + n_iter : int + Number of iterations run. 
+ + """ + similarities = check_symmetric(similarities, raise_exception=True) + + n_samples = similarities.shape[0] + random_state = check_random_state(random_state) + + W = np.ones((n_samples, n_samples)) + W[:n_uq, :n_uq] = 0.0 + W[n_uq:, n_uq:] = 0.0 + # W[np.arange(len(W)), np.arange(len(W))] = 0.0 + + V = -W + V[np.arange(len(V)), np.arange(len(V))] = W.sum(axis=1) + e = np.ones((n_samples, 1)) + + Vp = np.linalg.inv(V + np.dot(e, e.T)/n_samples) - np.dot(e, e.T)/n_samples + # Vp = np.linalg.pinv(V) + + # sim_flat = ((1 - np.tri(n_samples)) * similarities).ravel() + sim_flat = similarities.ravel() + sim_flat_w = sim_flat[sim_flat != 0] + if init is None: + # Randomly choose initial configuration + X = random_state.rand(n_samples * n_components) + X = X.reshape((n_samples, n_components)) + else: + # overrides the parameter p + n_components = init.shape[1] + if n_samples != init.shape[0]: + raise ValueError("init matrix should be of shape (%d, %d)" % + (n_samples, n_components)) + X = init + + old_stress = None + ir = IsotonicRegression() + for it in range(max_iter): + # Compute distance and monotonic regression + dis = euclidean_distances(X) + + if metric: + disparities = similarities + else: + # dis_flat = dis.ravel() + # # similarities with 0 are considered as missing values + # dis_flat_w = dis_flat[sim_flat != 0] + + # # Compute the disparities using a monotonic regression + # disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w) + # disparities = dis_flat.copy() + # disparities[sim_flat != 0] = disparities_flat + # disparities = disparities.reshape((n_samples, n_samples)) + # disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) / + # (disparities ** 2).sum()) + + dis_flat = dis.ravel() + # similarities with 0 are considered as missing values + dis_flat_w = dis_flat[sim_flat != 0] + + # Compute the disparities using a monotonic regression + disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w) + disparities = dis_flat.copy() + disparities[sim_flat != 0] = disparities_flat + disparities = disparities.reshape((n_samples, n_samples)) + disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) / (disparities ** 2).sum()) + disparities[similarities==0] = 0 + + # Compute stress + # stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2 + _stress = (W.ravel()*((dis.ravel() - disparities.ravel()) ** 2)).sum() / 2 + + # Update X using the Guttman transform + # dis[dis == 0] = 1e-5 + # ratio = disparities / dis + # B = - ratio + # B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1) + # X = 1. / n_samples * np.dot(B, X) + # print (1. 
/ n_samples * np.dot(B, X))[:5].T + + dis[dis == 0] = 1e-5 + ratio = disparities / dis + _B = - W*ratio + _B[np.arange(len(_B)), np.arange(len(_B))] += (W*ratio).sum(axis=1) + + X = np.dot(Vp, np.dot(_B, X)) + # print X[:5].T + + dis = np.sqrt((X ** 2).sum(axis=1)).sum() + + if verbose >= 2: + print('it: %d, stress %s' % (it, _stress)) + if old_stress is not None: + if(old_stress - _stress / dis) < eps: + if verbose: + print('breaking at iteration %d with stress %s' % (it, + _stress)) + break + old_stress = _stress / dis + + return X, _stress, it + 1 + + +def smacof_p(similarities, n_uq, metric=True, n_components=2, init=None, n_init=8, + n_jobs=1, max_iter=300, verbose=0, eps=1e-3, random_state=None, + return_n_iter=False): + """ + Computes multidimensional scaling using SMACOF (Scaling by Majorizing a + Complicated Function) algorithm + + The SMACOF algorithm is a multidimensional scaling algorithm: it minimizes + a objective function, the *stress*, using a majorization technique. The + Stress Majorization, also known as the Guttman Transform, guarantees a + monotone convergence of Stress, and is more powerful than traditional + techniques such as gradient descent. + + The SMACOF algorithm for metric MDS can summarized by the following steps: + + 1. Set an initial start configuration, randomly or not. + 2. Compute the stress + 3. Compute the Guttman Transform + 4. Iterate 2 and 3 until convergence. + + The nonmetric algorithm adds a monotonic regression steps before computing + the stress. + + Parameters + ---------- + similarities : symmetric ndarray, shape (n_samples, n_samples) + similarities between the points + + metric : boolean, optional, default: True + compute metric or nonmetric SMACOF algorithm + + n_components : int, optional, default: 2 + number of dimension in which to immerse the similarities + overridden if initial array is provided. + + init : {None or ndarray of shape (n_samples, n_components)}, optional + if None, randomly chooses the initial configuration + if ndarray, initialize the SMACOF algorithm with this array + + n_init : int, optional, default: 8 + Number of time the smacof_p algorithm will be run with different + initialisation. The final results will be the best output of the + n_init consecutive runs in terms of stress. + + n_jobs : int, optional, default: 1 + + The number of jobs to use for the computation. This works by breaking + down the pairwise matrix into n_jobs even slices and computing them in + parallel. + + If -1 all CPUs are used. If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + + max_iter : int, optional, default: 300 + Maximum number of iterations of the SMACOF algorithm for a single run + + verbose : int, optional, default: 0 + level of verbosity + + eps : float, optional, default: 1e-6 + relative tolerance w.r.t stress to declare converge + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + return_n_iter : bool + Whether or not to return the number of iterations. 
+ + Returns + ------- + X : ndarray (n_samples,n_components) + Coordinates of the n_samples points in a n_components-space + + stress : float + The final value of the stress (sum of squared distance of the + disparities and the distances for all constrained points) + + n_iter : int + The number of iterations corresponding to the best stress. + Returned only if `return_n_iter` is set to True. + + Notes + ----- + "Modern Multidimensional Scaling - Theory and Applications" Borg, I.; + Groenen P. Springer Series in Statistics (1997) + + "Nonmetric multidimensional scaling: a numerical method" Kruskal, J. + Psychometrika, 29 (1964) + + "Multidimensional scaling by optimizing goodness of fit to a nonmetric + hypothesis" Kruskal, J. Psychometrika, 29, (1964) + """ + + similarities = check_array(similarities) + random_state = check_random_state(random_state) + + if hasattr(init, '__array__'): + init = np.asarray(init).copy() + if not n_init == 1: + warnings.warn( + 'Explicit initial positions passed: ' + 'performing only one init of the MDS instead of %d' + % n_init) + n_init = 1 + + best_pos, best_stress = None, None + + if n_jobs == 1: + for it in range(n_init): + pos, stress, n_iter_ = _smacof_single_p( + similarities, n_uq, metric=metric, + n_components=n_components, init=init, + max_iter=max_iter, verbose=verbose, + eps=eps, random_state=random_state) + if best_stress is None or stress < best_stress: + best_stress = stress + best_pos = pos.copy() + best_iter = n_iter_ + else: + seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) + results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))( + delayed(_smacof_single_p)( + similarities, n_uq, metric=metric, n_components=n_components, + init=init, max_iter=max_iter, verbose=verbose, eps=eps, + random_state=seed) + for seed in seeds) + positions, stress, n_iters = zip(*results) + best = np.argmin(stress) + best_stress = stress[best] + best_pos = positions[best] + best_iter = n_iters[best] + + if return_n_iter: + return best_pos, best_stress, best_iter + else: + return best_pos, best_stress + + +class MDSP(BaseEstimator): + """Multidimensional scaling + + Parameters + ---------- + metric : boolean, optional, default: True + compute metric or nonmetric SMACOF (Scaling by Majorizing a + Complicated Function) algorithm + + n_components : int, optional, default: 2 + number of dimension in which to immerse the similarities + overridden if initial array is provided. + + n_init : int, optional, default: 4 + Number of time the smacof_p algorithm will be run with different + initialisation. The final results will be the best output of the + n_init consecutive runs in terms of stress. + + max_iter : int, optional, default: 300 + Maximum number of iterations of the SMACOF algorithm for a single run + + verbose : int, optional, default: 0 + level of verbosity + + eps : float, optional, default: 1e-6 + relative tolerance w.r.t stress to declare converge + + n_jobs : int, optional, default: 1 + The number of jobs to use for the computation. This works by breaking + down the pairwise matrix into n_jobs even slices and computing them in + parallel. + + If -1 all CPUs are used. If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. 
Defaults to the global numpy random + number generator. + + dissimilarity : string + Which dissimilarity measure to use. + Supported are 'euclidean' and 'precomputed'. + + + Attributes + ---------- + embedding_ : array-like, shape [n_components, n_samples] + Stores the position of the dataset in the embedding space + + stress_ : float + The final value of the stress (sum of squared distance of the + disparities and the distances for all constrained points) + + + References + ---------- + "Modern Multidimensional Scaling - Theory and Applications" Borg, I.; + Groenen P. Springer Series in Statistics (1997) + + "Nonmetric multidimensional scaling: a numerical method" Kruskal, J. + Psychometrika, 29 (1964) + + "Multidimensional scaling by optimizing goodness of fit to a nonmetric + hypothesis" Kruskal, J. Psychometrika, 29, (1964) + + """ + def __init__(self, n_components=2, n_uq=1, metric=True, n_init=4, + max_iter=300, verbose=0, eps=1e-3, n_jobs=1, + random_state=None, dissimilarity="euclidean"): + self.n_components = n_components + self.n_uq = n_uq + self.dissimilarity = dissimilarity + self.metric = metric + self.n_init = n_init + self.max_iter = max_iter + self.eps = eps + self.verbose = verbose + self.n_jobs = n_jobs + self.random_state = random_state + + @property + def _pairwise(self): + return self.kernel == "precomputed" + + def fit(self, X, y=None, init=None): + """ + Computes the position of the points in the embedding space + + Parameters + ---------- + X : array, shape=[n_samples, n_features], or [n_samples, n_samples] \ + if dissimilarity='precomputed' + Input data. + + init : {None or ndarray, shape (n_samples,)}, optional + If None, randomly chooses the initial configuration + if ndarray, initialize the SMACOF algorithm with this array. + """ + self.fit_transform(X, init=init) + return self + + def fit_transform(self, X, y=None, init=None): + """ + Fit the data from X, and returns the embedded coordinates + + Parameters + ---------- + X : array, shape=[n_samples, n_features], or [n_samples, n_samples] \ + if dissimilarity='precomputed' + Input data. + + init : {None or ndarray, shape (n_samples,)}, optional + If None, randomly chooses the initial configuration + if ndarray, initialize the SMACOF algorithm with this array. + + """ + X = check_array(X) + if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": + warnings.warn("The MDS API has changed. ``fit`` now constructs an" + " dissimilarity matrix from data. To use a custom " + "dissimilarity matrix, set " + "``dissimilarity=precomputed``.") + + if self.dissimilarity == "precomputed": + self.dissimilarity_matrix_ = X + elif self.dissimilarity == "euclidean": + self.dissimilarity_matrix_ = euclidean_distances(X) + else: + raise ValueError("Proximity must be 'precomputed' or 'euclidean'." 
+ " Got %s instead" % str(self.dissimilarity)) + + self.embedding_, self.stress_, self.n_iter_ = smacof_p( + self.dissimilarity_matrix_, self.n_uq, metric=self.metric, + n_components=self.n_components, init=init, n_init=self.n_init, + n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose, + eps=self.eps, random_state=self.random_state, + return_n_iter=True) + + return self.embedding_ diff --git a/libact/query_strategies/query_by_committee.py b/libact/query_strategies/query_by_committee.py index 73cc6cdf..6d23b174 100644 --- a/libact/query_strategies/query_by_committee.py +++ b/libact/query_strategies/query_by_committee.py @@ -73,7 +73,7 @@ class QueryByCommittee(QueryStrategy): """ def __init__(self, *args, **kwargs): - super(QueryByCommittee, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self.disagreement = kwargs.pop('disagreement', 'vote') diff --git a/libact/query_strategies/variance_reduction.py b/libact/query_strategies/variance_reduction.py index 2e83dbf1..b25f5885 100644 --- a/libact/query_strategies/variance_reduction.py +++ b/libact/query_strategies/variance_reduction.py @@ -50,7 +50,7 @@ class VarianceReduction(QueryStrategy): """ def __init__(self, *args, **kwargs): - super(VarianceReduction, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) model = kwargs.pop('model', None) if isinstance(model, str): self.model = getattr(libact.models, model)() From af55b160fb95a66b519bb4d7f1ae67be43ca67c8 Mon Sep 17 00:00:00 2001 From: Yao-Yuan Yang Date: Sat, 17 Jun 2023 18:38:15 -0700 Subject: [PATCH 2/2] test --- docs/.readthedocs.yaml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 docs/.readthedocs.yaml diff --git a/docs/.readthedocs.yaml b/docs/.readthedocs.yaml new file mode 100644 index 00000000..797edb94 --- /dev/null +++ b/docs/.readthedocs.yaml @@ -0,0 +1,22 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.11" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + +# We recommend specifying your dependencies to enable reproducible builds: +# https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: docs/rtd-requirements.txt