diff --git a/batchglm/api/models/tf2/glm_beta.py b/batchglm/api/models/tf2/glm_beta.py index 8b5f563e..67cffbf8 100644 --- a/batchglm/api/models/tf2/glm_beta.py +++ b/batchglm/api/models/tf2/glm_beta.py @@ -1,2 +1,2 @@ -#from batchglm.models.glm_beta import InputDataGLM, Model, Simulator -#from batchglm.train.tf2.glm_beta import Estimator +from batchglm.models.glm_beta import InputDataGLM, Model, Simulator +from batchglm.train.tf2.glm_beta import Estimator diff --git a/batchglm/api/models/tf2/glm_norm.py b/batchglm/api/models/tf2/glm_norm.py index 45fc0453..13e1727d 100644 --- a/batchglm/api/models/tf2/glm_norm.py +++ b/batchglm/api/models/tf2/glm_norm.py @@ -1,2 +1,2 @@ -#from batchglm.models.glm_norm import InputDataGLM, Model, Simulator -#from batchglm.train.tf2.glm_norm import Estimator +from batchglm.models.glm_norm import InputDataGLM, Model, Simulator +from batchglm.train.tf2.glm_norm import Estimator diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index 9afb32bf..450a8dba 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -11,22 +11,34 @@ # Trust region hyper parameters: TRUST_REGION_RADIUS_INIT = 100. +TRUST_REGION_RADIUS_INIT_SCALE = 1. TRUST_REGION_ETA0 = 0. TRUST_REGION_ETA1 = 0.25 TRUST_REGION_ETA2 = 0.25 TRUST_REGION_T1 = 0.5 # Fast collapse to avoid trailing. TRUST_REGION_T2 = 1.5 # Allow expansion if not shrinking. -TRUST_REGION_UPPER_BOUND = 1e5 +TRUST_REGION_UPPER_BOUND = 1e40 -TRUST_REGIONT_T1_IRLS_GD_TR_SCALE = 1 +TRUST_REGIONT_T1_IRLS_GD_TR_SCALE = 0.5 +TRUST_REGIONT_T2_IRLS_GD_TR_SCALE = 1.5 # Convergence hyper-parameters: -LLTOL_BY_FEATURE = 1e-10 +LLTOL_BY_FEATURE = 1e-12 XTOL_BY_FEATURE_LOC = 1e-8 XTOL_BY_FEATURE_SCALE = 1e-6 GTOL_BY_FEATURE_LOC = 1e-8 GTOL_BY_FEATURE_SCALE = 1e-8 +TRTOL_BY_FEATURE_LOC = 1e-8 +TRTOL_BY_FEATURE_SCALE = 1e-6 + +FEATUREWISE_THRESHOLD = 10 # the minimal number of features to converge before next featurewise batch +FEATUREWISE_RECALCULATE = False # if set to True, recalculate the results from the previous train step + +WOLFE_C1 = 1e-3 +WOLFE_C2 = 0.99 +ALPHA0 = 100 + try: import tensorflow as tf diff --git a/batchglm/train/tf2/__init__.py b/batchglm/train/tf2/__init__.py new file mode 100644 index 00000000..9170f2ff --- /dev/null +++ b/batchglm/train/tf2/__init__.py @@ -0,0 +1,3 @@ +from . import glm_nb as nb +from . import glm_norm as norm +from . 
import glm_beta as beta \ No newline at end of file diff --git a/batchglm/train/tf2/base/__init__.py b/batchglm/train/tf2/base/__init__.py new file mode 100644 index 00000000..bc66a13a --- /dev/null +++ b/batchglm/train/tf2/base/__init__.py @@ -0,0 +1,3 @@ +from .estimator import TFEstimator +from .model import ProcessModelBase, ModelBase +from .optim import OptimizerBase diff --git a/batchglm/train/tf2/base/estimator.py b/batchglm/train/tf2/base/estimator.py new file mode 100644 index 00000000..7c103b02 --- /dev/null +++ b/batchglm/train/tf2/base/estimator.py @@ -0,0 +1,28 @@ +from .model import ModelBase +import tensorflow as tf + + +class TFEstimator: + model: ModelBase + + def __init__(self, input_data, dtype): + + self._input_data = input_data + self.dtype = dtype + + def _train( + self, + is_batched: bool, + batch_size: int, + optimizer_object: tf.keras.optimizers.Optimizer, + convergence_criteria: str, + stopping_criteria: int, + autograd: bool, + featurewise: bool, + benchmark: bool, + optimizer: str + ): + pass + + def fetch_fn(self, idx): + pass diff --git a/batchglm/train/tf2/base/external.py b/batchglm/train/tf2/base/external.py new file mode 100644 index 00000000..9133cd4d --- /dev/null +++ b/batchglm/train/tf2/base/external.py @@ -0,0 +1,4 @@ +#from batchglm.models.base import _Estimator_Base +#from batchglm.xarray_sparse import SparseXArrayDataArray, SparseXArrayDataSet +#import batchglm.utils.stats as stat_utils +from batchglm import pkg_constants diff --git a/batchglm/train/tf2/base/model.py b/batchglm/train/tf2/base/model.py new file mode 100644 index 00000000..76eaa6fa --- /dev/null +++ b/batchglm/train/tf2/base/model.py @@ -0,0 +1,47 @@ +import abc +import logging +import tensorflow as tf +import numpy as np + +logger = logging.getLogger(__name__) + + +class ModelBase(tf.keras.Model, metaclass=abc.ABCMeta): + + def __init__(self, dtype): + super(ModelBase, self).__init__(dtype=dtype) + + @abc.abstractmethod + def call(self, inputs, training=False, mask=None): + pass + + +class ProcessModelBase: + + @abc.abstractmethod + def param_bounds(self, dtype): + pass + + def tf_clip_param( + self, + param, + name + ): + bounds_min, bounds_max = self.param_bounds(param.dtype) + return tf.clip_by_value( + param, + bounds_min[name], + bounds_max[name] + ) + + def np_clip_param( + self, + param, + name + ): + bounds_min, bounds_max = self.param_bounds(param.dtype) + return np.clip( + param, + bounds_min[name], + bounds_max[name] + ) diff --git a/batchglm/train/tf2/base/optim.py b/batchglm/train/tf2/base/optim.py new file mode 100644 index 00000000..5fc8d13b --- /dev/null +++ b/batchglm/train/tf2/base/optim.py @@ -0,0 +1,52 @@ +import abc +import logging +import tensorflow as tf + +logger = logging.getLogger("batchglm") + + +class OptimizerBase(tf.keras.optimizers.Optimizer, metaclass=abc.ABCMeta): + + def __init__(self, name): + super(OptimizerBase, self).__init__(name=name) + + @abc.abstractmethod + def _resource_apply_dense(self, grad, handle): + pass + + @abc.abstractmethod + def _resource_apply_sparse(self, grad, handle, apply_state): + pass + + @abc.abstractmethod + def _create_slots(self): + pass + + """ + @property + @abc.abstractmethod + def vars(self): + pass + + @property + @abc.abstractmethod + def gradients(self): + return None + + @property + @abc.abstractmethod + def hessians(self): + pass + + @property + @abc.abstractmethod + def fims(self): + pass + + @abc.abstractmethod + def step(self, learning_rate): + pass + """ + @abc.abstractmethod + def get_config(self): + pass 
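For orientation, the clipping helpers added in batchglm/train/tf2/base/model.py above can be exercised with a minimal sketch like the following; the class name ToyProcessModel, the "loc" key and the bound value of 30 are illustrative assumptions and not part of this change:

    import numpy as np
    import tensorflow as tf
    from batchglm.train.tf2.base import ProcessModelBase

    class ToyProcessModel(ProcessModelBase):
        def param_bounds(self, dtype):
            # param_bounds has to return (bounds_min, bounds_max) dicts keyed by the
            # names later passed to tf_clip_param / np_clip_param; values here are made up.
            np_dtype = np.dtype(tf.as_dtype(dtype).as_numpy_dtype)
            bound = np_dtype.type(30.0)
            return {"loc": -bound}, {"loc": bound}

    toy = ToyProcessModel()
    print(toy.tf_clip_param(tf.constant([-100.0, 0.5, 100.0]), "loc"))            # clipped to [-30, 30]
    print(toy.np_clip_param(np.array([-100.0, 0.5, 100.0], np.float32), "loc"))   # same bounds on the numpy path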
diff --git a/batchglm/train/tf2/base_glm/README.md b/batchglm/train/tf2/base_glm/README.md new file mode 100644 index 00000000..eea79ccc --- /dev/null +++ b/batchglm/train/tf2/base_glm/README.md @@ -0,0 +1,2 @@ +# Classes with GLM specific code. +All noise models that are in the GLM category inherit all of these classes. \ No newline at end of file diff --git a/batchglm/train/tf2/base_glm/__init__.py b/batchglm/train/tf2/base_glm/__init__.py new file mode 100644 index 00000000..699b1bce --- /dev/null +++ b/batchglm/train/tf2/base_glm/__init__.py @@ -0,0 +1,12 @@ +from .processModel import ProcessModelGLM +from .model import GLM + +from .estimator import Estimator +from .vars import ModelVarsGLM +from .layers import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM +from .layers import LikelihoodGLM, UnpackParamsGLM +from .layers_gradients import JacobianGLM, HessianGLM, FIMGLM +from .optim import NR, IRLS + +from .generator import DataGenerator +from .convergence import ConvergenceCalculator diff --git a/batchglm/train/tf2/base_glm/convergence.py b/batchglm/train/tf2/base_glm/convergence.py new file mode 100644 index 00000000..aedd8d64 --- /dev/null +++ b/batchglm/train/tf2/base_glm/convergence.py @@ -0,0 +1,198 @@ +import numpy as np +import tensorflow as tf +from .external import pkg_constants + + +class ConvergenceCalculator: + """Wrapper object containing all necessary methods to calculate convergences based on change + in likelihood, gradient and parameters.""" + + def __init__(self, estimator, last_ll: np.ndarray): + self.estimator = estimator + self.last_params = estimator.model.params_copy.numpy() + self.last_ll = last_ll + self.previous_number_converged = 0 + self.calc_separated = self.estimator.irls_algo and self.estimator._train_scale + + def calculate_convergence(self, results, jac_normalization, optimizer_object, batch_features): + """Calculates convergence based on change in likelihood, gradient and parameters.""" + + features_updated = self.estimator.model.model_vars.updated + converged_a = self.estimator.model.model_vars.converged + not_converged_a = ~ converged_a + if self.calc_separated: + features_updated_b = self.estimator.model.model_vars.updated_b + converged_b = self.estimator.model.model_vars.converged_b + not_converged_b = ~ converged_b + + n_features = self.estimator.input_data.num_features + + ########################################################### + # FIRST PART: Retrieve and manipulate ll, grads and params. + #### + if self.estimator.irls_algo: + grad_numpy = tf.abs(tf.concat((results[1], results[2]), axis=1)) + elif self.estimator.nr_algo: + grad_numpy = tf.abs(results[1]) + else: + grad_numpy = tf.abs(tf.transpose(results[1])) + new_ll = tf.negative(tf.divide(results[0], self.estimator.input_data.num_observations)) + new_params = self.estimator.model.params_copy + + if batch_features: + # map columns of ll to full feature space + remaining_features = self.estimator.model.model_vars.remaining_features + indices = tf.where(remaining_features) + updated_lls = tf.scatter_nd(indices, new_ll, shape=[n_features]) + # fill the added columns with previous ll + new_ll = tf.where(remaining_features, updated_lls, self.last_ll) + + # fill added columns with the gradients from previous runs. + grad_numpy = tf.scatter_nd( + indices, + grad_numpy, + shape=(n_features, self.estimator.model.params.get_shape()[0]) + ) + # TODO: added columns are zero here, does that matter? 
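+                # Note on the TODO above: the rows tf.scatter_nd leaves at zero belong to
+                # features that are already flagged as converged; the epoch_*_converged
+                # masks below are AND-ed with the corresponding not_converged masks, so
+                # these zero gradients cannot mark additional features as converged.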
+ + # map columns of params to full feature space + new_params = tf.transpose( + tf.scatter_nd( + indices, + tf.transpose(new_params), + shape=(self.estimator.model.params.shape[1], self.estimator.model.params.shape[0]) + ) + ) + # TODO: added columns are zero here, does that matter? + + grad_numpy = grad_numpy.numpy() + new_params = new_params.numpy() + new_ll = new_ll.numpy() + + ########################################################### + # SECOND PART: Calculate ll convergence. + #### + + # Get all converged features due to change in ll < LLTOL_BY_FEATURE + # IMPORTANT: we need to ensure they have also been updated, otherwise ll_prev = ll_current! + ll_difference = np.abs(self.last_ll - new_ll) / self.last_ll + # print('ll_diff: ', ll_difference[0]) + # print(self.estimator.model.model_vars.converged[0], self.estimator.model.model_vars.updated[0]) + # print(self.estimator.model.model_vars.converged_b[0], self.estimator.model.model_vars.updated_b[0]) + ll_converged = ll_difference < pkg_constants.LLTOL_BY_FEATURE + + ll_converged_a = ll_converged & features_updated + epoch_ll_converged_a = not_converged_a & ll_converged_a # formerly known as converged_f + + if self.calc_separated: + ll_converged_b = ll_converged & features_updated_b + epoch_ll_converged_b = not_converged_b & ll_converged_b # formerly known as converged_f + + ########################################################### + # THIRD PART: calculate grad convergence. + #### + grad_loc = np.sum(grad_numpy[:, self.estimator.model.model_vars.idx_train_loc], axis=1) + grad_norm_loc = grad_loc / jac_normalization + grad_scale = np.sum(grad_numpy[:, self.estimator.model.model_vars.idx_train_scale], axis=1) + grad_norm_scale = grad_scale / jac_normalization + + grad_norm_loc_converged = grad_norm_loc < pkg_constants.GTOL_BY_FEATURE_LOC + grad_norm_scale_converged = grad_norm_scale < pkg_constants.GTOL_BY_FEATURE_SCALE + if self.calc_separated: + grad_converged_a = grad_norm_loc_converged & features_updated + grad_converged_b = grad_norm_scale_converged & features_updated_b + epoch_grad_converged_b = not_converged_b & grad_converged_b # formerly known as converged_g + + else: + grad_converged_a = grad_norm_loc_converged & grad_norm_scale_converged & features_updated + epoch_grad_converged_a = not_converged_a & grad_converged_a # formerly known as converged_g + # print('grad: ', grad_norm_loc[0], grad_norm_scale[0]) + + ########################################################### + # Fourth PART: calculate parameter step convergence. + #### + x_step_a, x_step_b = self.calc_x_step(self.last_params, new_params) + if self.calc_separated: + x_step_converged_a = x_step_a & features_updated + x_step_converged_b = x_step_b & features_updated_b + epoch_step_converged_b = not_converged_b & x_step_converged_b + + else: + x_step_converged_a = x_step_a & x_step_b & features_updated + epoch_step_converged_a = not_converged_a & x_step_converged_a + # print('x_step: ', x_step_converged_a[0], x_step_converged_b[0]) + + # In case we use irls_tr/irls_gd_tr or nr_tr, we can also utilize the trusted region radius. + # For now it must not be below the threshold for the X step of the loc model. 
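+        # Concretely: once the per-feature trust region radius has collapsed below
+        # TRTOL_BY_FEATURE_LOC (or TRTOL_BY_FEATURE_SCALE for the scale model), no step
+        # of meaningful size can be proposed any more, so such features are folded into
+        # the step-convergence criterion below.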
+ if hasattr(optimizer_object, 'tr_mode') and optimizer_object.tr_mode: + converged_tr = optimizer_object.tr_radius.numpy() < pkg_constants.TRTOL_BY_FEATURE_LOC + epoch_tr_converged = not_converged_a & converged_tr + epoch_step_converged_a |= epoch_tr_converged + if hasattr(optimizer_object, 'tr_mode_b') and optimizer_object.tr_mode_b and self.estimator._train_scale: + converged_tr_b = optimizer_object.tr_radius_b.numpy() < pkg_constants.TRTOL_BY_FEATURE_SCALE + epoch_tr_converged_b = not_converged_b & converged_tr_b + epoch_step_converged_b |= epoch_tr_converged_b + + # print('tr: ', epoch_tr_converged[0], epoch_tr_converged_b[0]) + # print(self.estimator.model.model_vars.converged[0], self.estimator.model.model_vars.updated[0]) + # print(self.estimator.model.model_vars.converged_b[0], self.estimator.model.model_vars.updated_b[0]) + ########################################################### + # FINAL PART: exchange the current with the new containers. + #### + self.previous_number_converged = np.sum(self.estimator.model.model_vars.total_converged) + self.last_params = new_params + self.last_ll = new_ll + converged_a = np.logical_or.reduce((converged_a, epoch_ll_converged_a, epoch_grad_converged_a, epoch_step_converged_a)) + if self.calc_separated: + converged_b = np.logical_or.reduce((converged_b, epoch_ll_converged_b, epoch_grad_converged_b, epoch_step_converged_b)) + self.estimator.model.model_vars.total_converged = converged_a & converged_b + self.estimator.model.model_vars.converged_b = converged_b + epoch_ll_converged_a |= epoch_ll_converged_b + epoch_grad_converged_a |= epoch_grad_converged_b + epoch_step_converged_a |= epoch_step_converged_b + else: + self.estimator.model.model_vars.total_converged = converged_a + self.estimator.model.model_vars.converged = converged_a + # print(self.estimator.model.model_vars.total_converged[0]) + return epoch_ll_converged_a, epoch_grad_converged_a, epoch_step_converged_a + + def calc_x_step(self, prev_params, curr_params): + """Calculates convergence based on the difference in parameters before and + after the update.""" + def get_norm_converged(model: str, prev_params): + if model == 'loc': + idx_train = self.estimator.model.model_vars.idx_train_loc + xtol = pkg_constants.XTOL_BY_FEATURE_LOC + elif model == 'scale': + idx_train = self.estimator.model.model_vars.idx_train_scale + xtol = pkg_constants.XTOL_BY_FEATURE_SCALE + else: + assert False, "Supply either 'loc' or 'scale'!" + x_step = curr_params - prev_params + x_norm = np.sqrt(np.sum(np.square(x_step[idx_train, :]), axis=0)) + # print('x_norm: ', x_norm[0]) + return x_norm < xtol + + # We use a trick here: First we set both the loc and scale convergence to True. + # It is not necessary to use an array with length = number of features, since bitwise + # AND also works with a single boolean. + loc_conv = np.bool_(True) + scale_conv = np.bool_(True) + + # Now we check which models need to be trained. E.g. if you are using quick_scale = True, + # self._train_scale will be False and so the above single True value will be used. + if self.estimator._train_loc: + loc_conv = get_norm_converged('loc', prev_params) + if self.estimator._train_scale: + scale_conv = get_norm_converged('scale', prev_params) + + # Finally, we check that only features updated in this epoch can evaluate to True. + # This is only a problem for 2nd order optims with trusted region mode, since it might + # occur, that a feature isn't updated, so the x_step is zero although not yet converged. 
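+        # (That restriction to updated features is applied by the caller,
+        # calculate_convergence, which ANDs the values returned here with the
+        # respective features_updated masks.)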
+ return loc_conv, scale_conv + + def getLoss(self): + return np.sum(self.last_ll) + + def getPreviousNumberConverged(self): + return self.previous_number_converged diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py new file mode 100644 index 00000000..9db1f1b1 --- /dev/null +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -0,0 +1,402 @@ +import abc +import sys +import logging +import time +import numpy as np +from .generator import DataGenerator +from .convergence import ConvergenceCalculator +import tensorflow as tf +from .model import GLM +from .external import TFEstimator, _EstimatorGLM +from .optim import NR, IRLS +from .external import pkg_constants + +logger = logging.getLogger("batchglm") + + +class Estimator(TFEstimator, _EstimatorGLM, metaclass=abc.ABCMeta): + """ + Estimator for Generalized Linear Models (GLMs). + """ + model: GLM + _train_loc: bool + _train_scale: bool + _initialized: bool = False + noise_model: str + irls_algo: bool = False + nr_algo: bool = False + optimizer = None + + def initialize(self, **kwargs): + self.values = [] + self.times = [] + self.converged = [] + self.lls = [] + self._initialized = True + self.model = None + + def update(self, results, *args): + self.optimizer.apply_gradients([(results[1], self.model.params_copy)]) + self.model.model_vars.updated = ~self.model.model_vars.converged + + def finalize(self, **kwargs): + """ + Evaluate all tensors that need to be exported from session, + save these as class attributes and close session. + Changes .model entry from tf-based EstimatorGraph to numpy based Model instance and + transfers relevant attributes. + """ + a_var, b_var = self.model.unpack_params( + [self.model.params, self.model.model_vars.a_var.get_shape()[0]]) + self.model = self.get_model_container(self.input_data) + self.model._a_var = a_var.numpy() + self.model._b_var = b_var.numpy() + self._loss = tf.reduce_sum( + tf.negative(self._log_likelihood) / self.input_data.num_observations).numpy() + + def __init__( + self, + input_data, + dtype, + ): + TFEstimator.__init__(self=self, input_data=input_data, dtype=dtype) + _EstimatorGLM.__init__(self=self, model=None, input_data=input_data) + + def _train( + self, + noise_model: str, + is_batched: bool = False, + batch_size: int = 5000, + optimizer_object: tf.keras.optimizers.Optimizer = tf.keras.optimizers.Adam(), + convergence_criteria: str = "step", + stopping_criteria: int = 1000, + autograd: bool = False, + featurewise: bool = True, + benchmark: bool = False, + optim_algo: str = "adam", + b_update_freq = 1 + ): + # define some useful shortcuts here + n_obs = self.input_data.num_observations + n_features = self.input_data.num_features + # set necessary attributes + self.noise_model = noise_model + optim = optim_algo.lower() + self.irls_algo = optim.startswith('irls') + self.nr_algo = optim in ['nr', 'nr_tr'] + + ################################################ + # INIT Step 1: Consistency Checks + #### + assert not is_batched, "The TF2 backend does not yet support updates on individual" \ + "batches. Use full data updates instead." + assert convergence_criteria in ["step", "all_converged"], \ + ("Unrecognized convergence criteria %s", convergence_criteria) + + if not self._initialized: + raise RuntimeError("Cannot train the model: Estimator not initialized. 
\ + Did you forget to call estimator.initialize() ?") + + if b_update_freq == 0: + b_update_freq = 1 + + if autograd and optim_algo.lower() in ['nr', 'nr_tr']: + logger.warning( + "Automatic differentiation is currently not supported for hessians. Falling back \ + to closed form. Only Jacobians are calculated using autograd.") + + if featurewise and not (self.irls_algo or self.nr_algo): + featurewise = False + logger.warning("WARNING: 'Featurewise batching' is only available for 2nd order " + "optimizers IRLS and NR. Fallback to full featurespace fitting.") + if batch_size > n_obs: + batch_size = n_obs + + ################################################ + # INIT Step 2: Initialise training loop. + # + + # create a tensorflow dataset using the DataGenerator + datagenerator = DataGenerator(self, noise_model, is_batched, batch_size) + epoch_set = datagenerator.new_epoch_set() + + # first model call to initialise prior to first update. + epochs_until_b_update = b_update_freq - 1 + compute_b = epochs_until_b_update == 0 + for i, x_batch in enumerate(epoch_set): + if i == 0: + results = self.model(x_batch, compute_b=compute_b) + else: + results = [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch, compute_b=compute_b))] + + # create ConvergenceCalculator to check for new convergences. + conv_calc = ConvergenceCalculator(self, tf.negative(tf.divide(results[0], n_obs)).numpy()) + + # termination decision for training loop + def convergence_decision(num_converged, train_step): + not_done_fitting = num_converged < n_features + if convergence_criteria == "step": + not_done_fitting &= train_step < stopping_criteria + return not_done_fitting + + # condition variables needed during while loop + batch_features = False + train_step = 0 + num_converged = 0 + num_converged_prev = 0 + need_new_epoch_set = False + n_conv_last_featurewise_batch = 0 + + ################################################ + # Training Loop: Model Fitting happens here. + #### + + while convergence_decision(num_converged, train_step): + + if benchmark: + t0_epoch = time.time() + + ############################################ + # 1. recalculate, only done if featurewise + if need_new_epoch_set: + need_new_epoch_set = False + # this is executed only if a new feature converged in the last train step and + # using featurewise. + epoch_set = datagenerator.new_epoch_set(batch_features=batch_features) + if pkg_constants.FEATUREWISE_RECALCULATE: + for i, x_batch in enumerate(epoch_set): + results = self.model(x_batch, compute_b=compute_b) if i == 0 else \ + [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch, compute_b=compute_b))] + + ############################################ + # 2. Update the parameters + self.update(results, epoch_set, batch_features, epochs_until_b_update == 0) + ############################################ + # 3. calculate new ll, jacs, hessian/fim + compute_b = epochs_until_b_update < 2 + for i, x_batch in enumerate(epoch_set): + # need new params_copy in model in case we use featurewise without recalculation + results = self.model(x_batch, compute_b=compute_b) if i == 0 \ + else [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch, compute_b=compute_b))] + + ############################################ + # 4. 
check for any new convergences + convergences = conv_calc.calculate_convergence( + results=results, + jac_normalization=batch_size if is_batched else n_obs, + optimizer_object=optimizer_object, + batch_features=batch_features + ) + + num_converged = np.sum(self.model.model_vars.total_converged) + loss = conv_calc.getLoss() + if self.irls_algo and self._train_scale: + num_updated = np.sum( + np.logical_or(self.model.model_vars.updated, self.model.model_vars.updated_b)) + else: + num_updated = np.sum(self.model.model_vars.updated) + log_output = f"Step: {train_step} loss: {loss}, "\ + f"converged {num_converged}, updated {num_updated}" + num_converged_prev = conv_calc.getPreviousNumberConverged() + + ############################################ + # 5. report any new convergences + if num_converged == num_converged_prev: + logger.warning(log_output) + else: + if featurewise: + if not batch_features: + batch_features = True + self.model.batch_features = batch_features + conv_diff = num_converged - n_conv_last_featurewise_batch + if pkg_constants.FEATUREWISE_THRESHOLD < 1: + conv_diff /= n_features-n_conv_last_featurewise_batch + # Update params if number of new convergences since last + # featurewise batch is reached again. + if conv_diff >= pkg_constants.FEATUREWISE_THRESHOLD: + need_new_epoch_set = True + n_conv_last_featurewise_batch = num_converged + self.model.apply_featurewise_updates(conv_calc.last_params) + if not pkg_constants.FEATUREWISE_RECALCULATE: + results = self.mask_unconverged(results) + self.model.model_vars.remaining_features = \ + ~self.model.model_vars.total_converged + self.model.featurewise_batch() + + sums = [np.sum(convergence_vals) for convergence_vals in convergences] + log_output = f"{log_output} logs: {sums[0]} grad: {sums[1]}, "\ + f"x_step: {sums[2]}" + logger.warning(log_output) + + train_step += 1 + epochs_until_b_update = (epochs_until_b_update + b_update_freq - 1) % b_update_freq + + # make sure loc is not updated any longer if completely converged + if b_update_freq > 1 and epochs_until_b_update > 1: + if np.all(self.model.model_vars.converged): + epochs_until_b_update = 1 # must not be 0: scale grads not yet calculated + b_update_freq = 1 # from now on, calc scale grads in each step + + # store some useful stuff for benchmarking purposes. + if benchmark: + t1_epoch = time.time() + self.times.append(t1_epoch-t0_epoch) + self.converged.append(num_converged) + self.values.append(self.model.trainable_variables[0].numpy().copy()) + self.lls.append(conv_calc.last_ll) + + ################################################ + # Final model call on the full feature space. + #### + logger.warning("Final Evaluation run.") + if batch_features: + # need to update `model.params` if conv_diff wasn't reached in last train step + # as updates since the last featurewise batch are not yet applied in that case. + if np.any(self.model.model_vars.remaining_features): + self.model.apply_featurewise_updates(conv_calc.last_params) + # now make sure we use the full feature space for the last update + self.model.model_vars.remaining_features = np.ones(n_features, dtype=np.bool) + self.model.featurewise_batch() + + batch_features = False # reset in case train is run repeatedly + # change to hessian mode since we still use hessian instead of FIM for self._fisher_inv + self.model.setMethod('nr_tr') # TODO: maybe stay with irls to compute fim in the future + self.model.hessian.compute_b = True # since self._train_scale could be False. 
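+        # The final pass below accumulates log-likelihood, jacobian and hessian over all
+        # observations on the full feature space; the negative hessian is then inverted
+        # feature-wise (where numerically possible) to populate self._fisher_inv.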
+ + # need new set here with full feature space + # TODO: only needed if batch_features, maybe put this in the above if switch later + final_set = datagenerator.new_epoch_set() + for i, x_batch in enumerate(final_set): + results = self.model(x_batch) if i == 0 else \ + [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch))] + + # store all the final results in this estimator instance. + self._log_likelihood = results[0].numpy() + self._jacobian = tf.reduce_sum(tf.abs(results[1] / n_obs), axis=1) + self._hessian = - results[2].numpy() + + fisher_inv = np.zeros_like(self._hessian) + invertible = np.where(np.linalg.cond(self._hessian, p=None) < 1 / sys.float_info.epsilon)[0] + num_non_invertible = n_features - len(invertible) + if num_non_invertible > 0: + logger.warning(f"fisher_inv could not be calculated for {num_non_invertible} features!") + fisher_inv[invertible] = np.linalg.inv(-self._hessian[invertible]) + self._fisher_inv = fisher_inv.copy() + self.model.hessian.compute_b = self.model.compute_b # reset if not self._train_scale + + def update_params(self, batches, results, batch_features, update_func): + """Wrapper method to conduct updates based on different optimizers/conditions.""" + if self.irls_algo or self.nr_algo: + if self.irls_algo: + # separate loc and scale update if using IRLS. + update_func( + inputs=[batches, *results], + compute_a=True, + compute_b=False, + batch_features=batch_features, + is_batched=False + ) + if self._train_scale: + update_func( + inputs=[batches, *results], + compute_a=False, + compute_b=True, + batch_features=batch_features, + is_batched=False + ) + else: + update_func( + inputs=[batches, *results], + batch_features=batch_features, + is_batched=False + ) + else: + update_var = results[1] + update_func([(update_var, self.model.params_copy)]) + self.model.model_vars.updated = ~self.model.model_vars.converged + + def mask_unconverged(self, results): + + # the idx from unconverged features, thus features reamining in the curent results + idx = np.where(self.model.model_vars.remaining_features)[0] + # the new remaining_features in reduced feature space + mask = ~(self.model.model_vars.total_converged[idx]) + + ll = tf.boolean_mask(results[0], mask) + if self.irls_algo: + jac_a = tf.boolean_mask(results[1], mask) + jac_b = tf.boolean_mask(results[2], mask) + fim_a = tf.boolean_mask(results[3], mask) + fim_b = tf.boolean_mask(results[4], mask) + return ll, jac_a, jac_b, fim_a, fim_b + elif self.nr_algo: + jac = tf.boolean_mask(results[1], mask) + hessian = tf.boolean_mask(results[2], mask) + return ll, jac, hessian + else: + jac = tf.boolean_mask(results[1], mask, axis=1) + return ll, jac + + def get_optimizer_object(self, optimizer: str, learning_rate): + """ Creates an optimizer object based on the given optimizer string.""" + optimizer = optimizer.lower() + if optimizer == "gd": + optim_obj = tf.keras.optimizers.SGD(learning_rate=learning_rate) + elif optimizer == "adam": + optim_obj = tf.keras.optimizers.Adam(learning_rate=learning_rate) + elif optimizer == "adagrad": + optim_obj = tf.keras.optimizers.Adagrad(learning_rate=learning_rate) + elif optimizer == "rmsprop": + optim_obj = tf.keras.optimizers.RMSprop(learning_rate=learning_rate) + else: + tr_mode = optimizer.endswith('tr') + init_dict = { + "dtype": self.dtype, + "model": self.model, + "name": optimizer, + "tr_mode": tr_mode, + "n_obs": self.input_data.num_observations + } + if optimizer.startswith('irls'): + optim_obj = IRLS(**init_dict) + elif optimizer.startswith('nr'): + 
optim_obj = NR(**init_dict) + else: + optim_obj = tf.keras.optimizers.Adam(learning_rate=learning_rate) + logger.warning("No valid optimizer given. Default optimizer Adam chosen.") + + return optim_obj + + @staticmethod + def get_init_from_model(init_a, init_b, input_data, init_model): + # Locations model: + if isinstance(init_a, str) and (init_a.lower() == "auto" or init_a.lower() == "init_model"): + my_loc_names = set(input_data.loc_names) + my_loc_names = my_loc_names.intersection(set(init_model.input_data.loc_names)) + + init_loc = np.zeros([input_data.num_loc_params, input_data.num_features]) + for parm in my_loc_names: + init_idx = np.where(init_model.input_data.loc_names == parm)[0] + my_idx = np.where(input_data.loc_names == parm)[0] + init_loc[my_idx] = init_model.a_var[init_idx] + + init_a = init_loc + + # Scale model: + if isinstance(init_b, str) and (init_b.lower() == "auto" or init_b.lower() == "init_model"): + my_scale_names = set(input_data.scale_names) + my_scale_names = my_scale_names.intersection(init_model.input_data.scale_names) + + init_scale = np.zeros([input_data.num_scale_params, input_data.num_features]) + for parm in my_scale_names: + init_idx = np.where(init_model.input_data.scale_names == parm)[0] + my_idx = np.where(input_data.scale_names == parm)[0] + init_scale[my_idx] = init_model.b_var[init_idx] + + init_b = init_scale + + return init_a, init_b + + @abc.abstractmethod + def get_model_container(self, input_data): + pass diff --git a/batchglm/train/tf2/base_glm/external.py b/batchglm/train/tf2/base_glm/external.py new file mode 100644 index 00000000..db1c8e0e --- /dev/null +++ b/batchglm/train/tf2/base_glm/external.py @@ -0,0 +1,9 @@ +from batchglm.train.tf2.base import ProcessModelBase, ModelBase, TFEstimator +from batchglm.train.tf2.base import OptimizerBase +#from batchglm.train.tf2.glm_nb import NR, IRLS + +from batchglm.models.base_glm import InputDataGLM, _ModelGLM, _EstimatorGLM + +#import batchglm.train.tf.ops as op_utils +from batchglm.utils.linalg import groupwise_solve_lm +from batchglm import pkg_constants diff --git a/batchglm/train/tf2/base_glm/generator.py b/batchglm/train/tf2/base_glm/generator.py new file mode 100644 index 00000000..1d061fcd --- /dev/null +++ b/batchglm/train/tf2/base_glm/generator.py @@ -0,0 +1,103 @@ +import numpy as np +from scipy.sparse import csr_matrix +import tensorflow as tf +import time + +class DataGenerator: + """Wrapper Object to generate an iterable TensorFlow Dataset from given input data.""" + + def __init__( + self, + estimator, + noise_model: str, + is_batched_model: bool, + batch_size: int + ): + self.estimator = estimator + self.noise_model = noise_model + self.is_batched_model = is_batched_model + self.batch_size = batch_size + self.sparse = isinstance(estimator.input_data.x, csr_matrix) + self.n_obs = estimator.input_data.num_observations + # integer ceil division with arithmetic trick: ceil(a/b)=(a+b-1)//b + # We need this for cases where n_obs mod batch_size != 0 + self.num_batches = (self.n_obs + batch_size - 1) // batch_size + dtp = estimator.dtype + output_types = ((tf.int64, dtp, tf.int64), *(dtp,) * 3) if self.sparse else (dtp,) * 4 + self.dataset = tf.data.Dataset.from_generator( + generator=self._generate, output_types=output_types) + + def _generate(self): + """ + Generates `(counts, design_loc, design_scale, size_factors)` tuples of `self.input_data`. + The number of observations in each such data batch is given by `self.batch size`. 
+ If `self.is_batched_model`, the method uses a random permutation of `input_data` each time + it is called. + """ + input_data = self.estimator.input_data + fetch_size_factors = input_data.size_factors is not None \ + and self.noise_model in ["nb", "norm"] + obs_pool = np.random.permutation(self.n_obs) \ + if self.is_batched_model else np.arange(self.n_obs) + for start_id in range(0, self.n_obs, self.batch_size): + # numpy ignores ids > len(obs_pool) so no out of bounds check needed here: + idx = obs_pool[start_id: start_id + self.batch_size] + counts = input_data.fetch_x_sparse(idx) if self.sparse \ + else input_data.fetch_x_dense(idx) + dloc = input_data.fetch_design_loc(idx) + dscale = input_data.fetch_design_scale(idx) + size_factors = input_data.fetch_size_factors(idx) if fetch_size_factors else 1 + yield counts, dloc, dscale, size_factors + + def _featurewise_batch_sparse(self, ivs_tuple, dloc, dscale, size_factors): + + not_converged = self.estimator.model.model_vars.remaining_features + + not_converged_numeric = tf.cast(not_converged, dtype=tf.int64) + col_idx_map = tf.cumsum(not_converged_numeric, exclusive=True) + ivs_tuple[0].set_shape([None, 2]) + mask = tf.gather(not_converged, ivs_tuple[0][:, 1]) + new_indices = tf.gather_nd(ivs_tuple[0], tf.where(mask)) + row_idx, col_idx = tf.split(new_indices, num_or_size_splits=2, axis=1, num=2) + + new_indices = tf.concat([row_idx, tf.gather(col_idx_map, col_idx)], 1) + new_values = tf.boolean_mask(ivs_tuple[1], mask) + n_features = col_idx_map[-1] + if not_converged[-1]: + n_features += 1 + + x_tensor = (new_indices, new_values, (ivs_tuple[2][0], n_features)) + return x_tensor, dloc, dscale, size_factors + + def _featurewise_batch(self, x_tensor, dloc, dscale, size_factors): + """Takes an element of a dataset, performs featurewise batching + and returns the reduced element.""" + + not_converged = self.estimator.model.model_vars.remaining_features + if self.sparse: + feature_columns = tf.sparse.split( + x_tensor, + num_split=self.estimator.model_vars.n_features, + axis=1) + not_converged_idx = np.where(not_converged)[0] + feature_columns = [feature_columns[i] for i in not_converged_idx] + x_tensor = tf.sparse.concat(axis=1, sp_inputs=feature_columns) + else: + x_tensor = tf.boolean_mask(tensor=x_tensor, mask=not_converged, axis=1) + return x_tensor, dloc, dscale, size_factors + + def new_epoch_set(self, batch_features: bool = False): + """Returns an iterable TensorFlow Dataset of the input data.""" + dataset_to_return = self.dataset.take(self.num_batches) + + if self.sparse: + dataset_to_return = dataset_to_return.map( + lambda ivs_tuple, loc, scale, sf: (tf.SparseTensor(*ivs_tuple), loc, scale, sf) + ) + if batch_features: + dataset_to_return = dataset_to_return.map(self._featurewise_batch) + else: + if batch_features: + dataset_to_return = dataset_to_return.map(self._featurewise_batch) + + return dataset_to_return.cache().prefetch(1) diff --git a/batchglm/train/tf2/base_glm/layers.py b/batchglm/train/tf2/base_glm/layers.py new file mode 100644 index 00000000..5e66ab10 --- /dev/null +++ b/batchglm/train/tf2/base_glm/layers.py @@ -0,0 +1,264 @@ +from typing import Union + +import abc +import tensorflow as tf +from .processModel import ProcessModelGLM + + +class UnpackParamsGLM(tf.keras.layers.Layer, ProcessModelGLM): + + """ + Layer that slices the parameter tensor into mean and variance block. 
+ """ + + def __init__(self, dtype): + super(UnpackParamsGLM, self).__init__(dtype=dtype) + + def call(self, inputs, **kwargs): + """ + :param inputs: tuple (params, border) + Must contain the parameter matrix (params) and the first index + of the variance block within the parameters matrix (border) + + :return tf.Tensor, tf.Tensor + The two returned tensor correspond to the mean and variance block + of the parameter matrix. + """ + params, border = inputs + a_var = params[0:border] # loc obs + b_var = params[border:] # scale obs + a_var = self.tf_clip_param(a_var, "a_var") + b_var = self.tf_clip_param(b_var, "b_var") + return a_var, b_var + + +class LinearLocGLM(tf.keras.layers.Layer, ProcessModelGLM): + + """ + Computes the dot product between the design matrix of the mean model and the mean block of the parameter matrix. + """ + + def __init__(self, dtype): + super(LinearLocGLM, self).__init__(dtype=dtype) + + def _eta_loc( + self, + a_var: tf.Tensor, + design_loc: tf.Tensor, + constraints_loc: Union[tf.Tensor, None] = None, + size_factors: Union[tf.Tensor, None] = None + ): + """ + Does the actual computation of eta_loc. + + :param a_var: tf.Tensor + the mean block of the parameter matrix + :param design_loc: tf.Tensor + the design matrix of the mean model + :param contraints_loc: tf.Tensor, optional + ??? # TODO + :param size_factors: tf.Tensor, optional + ??? # TODO + + :return tf.Tensor + the mean values for each individual distribution, encoded in linker space. + """ + if constraints_loc is not None: + eta_loc = tf.matmul( + design_loc, + tf.matmul(constraints_loc, a_var) + ) + else: + eta_loc = tf.matmul(design_loc, a_var) + + if size_factors is not None and size_factors.shape != (1, 1): + eta_loc = self.with_size_factors(eta_loc, size_factors) + + eta_loc = self.tf_clip_param(eta_loc, "eta_loc") + + return eta_loc + + @abc.abstractmethod + def with_size_factors(self, eta_loc, size_factors): + """ + Calculates eta_loc with size_factors. Is noise model specific and needs to be implemented in the inheriting + layer. + :param eta_loc: tf.Tensor + the mean values for each individual distribution, encoded in linker space + """ + + def call(self, inputs, **kwargs): + """ + Calculates the eta_loc tensor, containing the mean values for each individual distribution, + encoded in linker space. + + :param input: tuple + Must contain a_var, design_loc, constraints_loc and size_factors in this order, where + contraints_loc and size_factor can be None. + + :return tf.Tensor + the mean values for each individual distribution, encoded in linker space. + """ + return self._eta_loc(*inputs) + + +class LinearScaleGLM(tf.keras.layers.Layer, ProcessModelGLM): + + """ + Computes the dot product between the design matrix of the variance model + and the variance block of the parameter matrix. + """ + + def __init__(self, dtype): + super(LinearScaleGLM, self).__init__(dtype=dtype) + + def _eta_scale( + self, + b_var: tf.Tensor, + design_scale: tf.Tensor, + constraints_scale: Union[tf.Tensor, None] = None + ): + """ + Does the actual computation of eta_scale. + + :param b_var: tf.Tensor + the variance block of the parameter matrix + :param design_scale: tf.Tensor + the design matrix of the mean model + :param contraints_scale: tf.Tensor, optional + ??? # TODO + + :return tf.Tensor + the variance values for each individual distribution, encoded in linker space. 
+ """ + if constraints_scale is not None: + eta_scale = tf.matmul( + design_scale, + tf.matmul(constraints_scale, b_var) + ) + else: + eta_scale = tf.matmul(design_scale, b_var) + + eta_scale = self.tf_clip_param(eta_scale, "eta_scale") + + return eta_scale + + def call(self, inputs, **kwargs): + """ + Calculates the eta_scale tensor, containing the variance values for each individual distribution, + encoded in linker space. + + :param input: tuple + Must contain b_var, design_scale and constraints_loc in this order, where + contraints_loc can be None. + + :return tf.Tensor + the variance values for each individual distribution, encoded in linker space. + """ + return self._eta_scale(*inputs) + + +class LinkerLocGLM(tf.keras.layers.Layer): + + """ + Translation from linker to data space for the mean model. + """ + + def __init__(self, dtype): + super(LinkerLocGLM, self).__init__(dtype=dtype) + + @abc.abstractmethod + def _inv_linker(self, loc: tf.Tensor): + """ + Translates the given mean values from linker to data space. Depends on the given noise model and needs to + be implemented in the inheriting layer. + + :param loc: tf. Tensor + the mean values for each individual distribution, encoded in linker space. + + :return tf.Tensor + the mean values for each individual distribution, encoded in data space. + """ + + def call(self, eta_loc: tf.Tensor, **kwargs): + """ + Calls the distribution specific linker function to translate from linker to data space. + + :param eta_loc: tf.Tensor + the mean values for each individual distribution, encoded in linker space. + + :return tf.Tensor + the mean values for each individual distribution, encoded in data space. + """ + loc = self._inv_linker(eta_loc) + return loc + + +class LinkerScaleGLM(tf.keras.layers.Layer): + + """ + Translation from linker to data space for the variance model. + """ + + def __init__(self, dtype): + super(LinkerScaleGLM, self).__init__(dtype=dtype) + + @abc.abstractmethod + def _inv_linker(self, scale: tf.Tensor): + pass + + def call(self, eta_scale: tf.Tensor, **kwargs): + """ + Calls the distribution specific linker function to translate from linker to data space. + + :param eta_scale: tf.Tensor + the variance values for each individual distribution, encoded in linker space. + + :return tf.Tensor + the variance values for each individual distribution, encoded in data space. + """ + scale = self._inv_linker(eta_scale) + return scale + + +class LikelihoodGLM(tf.keras.layers.Layer, ProcessModelGLM): + + """ + Contains the computation of the distribution specific log-likelihood function + """ + + def __init__(self, dtype): + super(LikelihoodGLM, self).__init__(dtype=dtype) + + @abc.abstractmethod + def _ll(self, eta_loc, eta_scale, loc, scale, x): + """ + Does the actual likelihood calculation. Depends on the given noise model and needs to be implemented in the + inheriting layer. + + :param eta_loc: tf.Tensor + the mean values for each individual distribution, encoded in linker space. + :param eta_scale: tf.Tensor + the variance values for each individual distribution, encoded in linker space. + :param loc: tf.Tensor + the mean values for each individual distribution, encoded in data space. + :param scale: tf.Tensor + the variance values for each individual distribution, encoded in data space. + :param x: tf.Tensor + the input data + + :return tf.Tensor + the log-likelihoods of each individual data point. + """ + + def call(self, inputs, **kwargs): + """ + Calls the distribution specific log-likelihood function. 
+ + :param inputs: tuple + Must contain eta_loc, eta_scale, loc, scale, x, n_features in this order. + + :return tf.Tensor + the log-likelihoods of each individual data point. + """ + return self._ll(*inputs) diff --git a/batchglm/train/tf2/base_glm/layers_gradients.py b/batchglm/train/tf2/base_glm/layers_gradients.py new file mode 100644 index 00000000..a807e51f --- /dev/null +++ b/batchglm/train/tf2/base_glm/layers_gradients.py @@ -0,0 +1,447 @@ +import abc +import tensorflow as tf + + +class Gradient(tf.keras.layers.Layer): + + """Superclass for Jacobians, Hessian, FIM""" + + def __init__(self, model_vars, dtype): + super(Gradient, self).__init__(dtype=dtype) + self.model_vars = model_vars + + @abc.abstractmethod + def call(self, inputs, **kwargs): + pass + + @staticmethod + def calc_design_mat(design_mat, constraints): + if constraints is not None: + xh = tf.matmul(design_mat, constraints) + else: + xh = design_mat + return xh + + # Here, we use the einsum to efficiently perform the two outer products and the marginalisation. + @staticmethod + def create_specific_block(w, xh_loc, xh_scale): + return tf.einsum('ofc,od->fcd', tf.einsum('of,oc->ofc', w, xh_loc), xh_scale) + + +class FIMGLM(Gradient): + """ + Compute expected fisher information matrix (FIM) + for iteratively re-weighted least squares (IWLS or IRLS) parameter updates for GLMs. + """ + + def call(self, inputs, **kwargs): + return self._fim_analytic(*inputs) + + def _fim_analytic(self, x, design_loc, design_scale, loc, scale, concat=False, compute_a=True, compute_b=True) -> tf.Tensor: + """ + Compute the closed-form of the base_glm_all model fim + by evalutating its terms grouped by observations. + """ + + def _a_byobs(): + """ + Compute the mean model diagonal block of the + closed form fim of base_glm_all model by observation across features + for a batch of observations. + """ + w = self._weight_fim_aa(x=x, loc=loc, scale=scale) # [observations x features] + # The computation of the fim block requires two outer products between + # feature-wise constants and the coefficient wise design matrix entries, for each observation. + # The resulting tensor is observations x features x coefficients x coefficients which + # is too large too store in memory in most cases. However, the full 4D tensor is never + # actually needed but only its marginal across features, the final hessian block shape. + # Here, we use the einsum to efficiently perform the two outer products and the marginalisation. + xh = self.calc_design_mat(design_loc, self.model_vars.constraints_loc) + + fim_block = self.create_specific_block(w, xh, xh) + return fim_block + + def _b_byobs(): + """ + Compute the dispersion model diagonal block of the + closed form fim of base_glm_all model by observation across features. + """ + w = self._weight_fim_bb(x=x, loc=loc, scale=scale) # [observations=1 x features] + # The computation of the fim block requires two outer products between + # feature-wise constants and the coefficient wise design matrix entries, for each observation. + # The resulting tensor is observations x features x coefficients x coefficients which + # is too large too store in memory in most cases. However, the full 4D tensor is never + # actually needed but only its marginal across features, the final hessian block shape. + # Here, we use the Einstein summation to efficiently perform the two outer products and the marginalisation. 
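+            # Shape sketch for create_specific_block (w: [observations x features],
+            # xh: [observations x coefficients]): tf.einsum('of,oc->ofc', w, xh) gives
+            # [observations x features x coefficients] and tf.einsum('ofc,od->fcd', ., xh)
+            # sums over observations, yielding the [features x coefficients x coefficients]
+            # block without ever materialising the full 4D tensor.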
+            xh = self.calc_design_mat(design_scale, self.model_vars.constraints_scale) + + fim_block = self.create_specific_block(w, xh, xh) + return fim_block + + # The full fisher information matrix is block-diagonal with the cross-model + # blocks all zero. Accordingly, mean and dispersion model updates can be + # treated independently and the full fisher information matrix is never required. + # Here, the non-zero model-wise diagonal blocks are computed and returned + # separately; the corresponding score function vectors are computed in JacobianGLM. + + if compute_a and compute_b: + fim_a = _a_byobs() + fim_b = _b_byobs() + + elif compute_a and not compute_b: + fim_a = _a_byobs() + fim_b = tf.zeros(fim_a.get_shape(), self.dtype) + elif not compute_a and compute_b: + fim_b = _b_byobs() + fim_a = tf.zeros(fim_b.get_shape(), self.dtype) + else: + fim_a = tf.zeros_like(self.model_vars.a_var, dtype=self.dtype) + fim_b = tf.zeros_like(self.model_vars.b_var, dtype=self.dtype) + + if concat: + fim = tf.concat([fim_a, fim_b], axis=1) + return fim + else: + return fim_a, fim_b + + @abc.abstractmethod + def _weight_fim_aa( + self, + x, + loc, + scale + ): + """ + Compute the weight constants for the mean model IWLS update for a GLM. + + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return tuple of tf.tensors + Constants with respect to coefficient index for + Fisher information matrix and score function computation. + """ + pass + + @abc.abstractmethod + def _weight_fim_bb( + self, + x, + loc, + scale + ): + """ + Compute the weight constants for the dispersion model IWLS update for a GLM. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return tuple of tf.tensors + Constants with respect to coefficient index for + Fisher information matrix and score function computation. + """ + pass + + +class JacobianGLM(Gradient): + + def call(self, inputs, **kwargs): + return self._jac_analytic(*inputs) + + def _jac_analytic(self, x, design_loc, design_scale, loc, scale, concat, compute_a=True, compute_b=True) -> tf.Tensor: + """ + Compute the closed-form of the base_glm_all model jacobian + by evaluating its terms grouped by observations. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + """ + + def _a_byobs(): + """ + Compute the mean model block of the jacobian. + + :return Jblock: tf.tensor features x coefficients + Block of jacobian. + """ + w = self._weights_jac_a(x=x, loc=loc, scale=scale) # [observations, features] + xh = self.calc_design_mat(design_loc, self.model_vars.constraints_loc) # [observations, coefficient] + + jblock = tf.matmul(tf.transpose(w), xh) # [features, coefficients] + return jblock + + def _b_byobs(): + """ + Compute the dispersion model block of the jacobian. + + :return Jblock: tf.tensor features x coefficients + Block of jacobian. 
+ """ + w = self._weights_jac_b(x=x, loc=loc, scale=scale) # [observations, features] + xh = self.calc_design_mat(design_scale, self.model_vars.constraints_scale) # [observations, coefficient] + + jblock = tf.matmul(tf.transpose(w), xh) # [features, coefficients] + return jblock + + if compute_a and compute_b: + j_a = _a_byobs() + j_b = _b_byobs() + elif compute_a and not compute_b: + j_a = _a_byobs() + j_b = tf.zeros((j_a.get_shape()[0], self.model_vars.b_var.get_shape()[0]), dtype=self.dtype) + elif not compute_a and compute_b: + j_b = _b_byobs() + j_a = tf.zeros((j_b.get_shape()[0], self.model_vars.b_var.get_shape()[0]), dtype=self.dtype) + else: + j_a = tf.transpose(tf.zeros_like(self.model_vars.a_var, dtype=self.dtype)) + j_b = tf.transpose(tf.zeros_like(self.model_vars.b_var, dtype=self.dtype)) + + if concat: + j = tf.concat([j_a, j_b], axis=1) + return j + else: + return j_a, j_b + + @abc.abstractmethod + def _weights_jac_a( + self, + x, + loc, + scale + ): + """ + Compute the coefficient index invariant part of the + mean model gradient. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return const: tf.tensor observations x features + Coefficient invariant terms of hessian of + given observations and features. + """ + pass + + @abc.abstractmethod + def _weights_jac_b( + self, + x, + loc, + scale + ): + """ + Compute the coefficient index invariant part of the + dispersion model gradient. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return const: tf.tensor observations x features + Coefficient invariant terms of hessian of + given observations and features. + """ + pass + + +class HessianGLM(Gradient): + """ + Compute the closed-form of the base_glm_all model hessian + by evaluating its terms grouped by observations. + + Has three sub-functions which built the specific blocks of the hessian + and one sub-function which concatenates the blocks into a full hessian. + """ + + def call(self, inputs, **kwargs): + return self._hessian_analytic(*inputs) + + def _hessian_analytic(self, x, design_loc, design_scale, loc, scale, concat, compute_a=True, compute_b=True) -> tf.Tensor: + """ + Compute the closed-form of the base_glm_all model hessian + by evaluating its terms grouped by observations. + + Has three sub-functions which built the specific blocks of the hessian + and one sub-function which concatenates the blocks into a full hessian. + """ + + def _aa_byobs_batched(): + """ + Compute the mean model diagonal block of the + closed form hessian of base_glm_all model by observation across features + for a batch of observations. + """ + w = self._weight_hessian_aa(x=x, loc=loc, scale=scale) # [observations x features] + # The computation of the hessian block requires two outer products between + # feature-wise constants and the coefficient wise design matrix entries, for each observation. + # The resulting tensor is observations x features x coefficients x coefficients which + # is too large too store in memory in most cases. 
However, the full 4D tensor is never + # actually needed but only its marginal across features, the final hessian block shape. + # Here, we use the einsum to efficiently perform the two outer products and the marginalisation. + xh = self.calc_design_mat(design_loc, self.model_vars.constraints_loc) + + hblock = self.create_specific_block(w, xh, xh) + return hblock + + def _bb_byobs_batched(): + """ + Compute the dispersion model diagonal block of the + closed form hessian of base_glm_all model by observation across features. + """ + w = self._weight_hessian_bb(x=x, loc=loc, scale=scale) # [observations x features] + # The computation of the hessian block requires two outer products between + # feature-wise constants and the coefficient wise design matrix entries, for each observation. + # The resulting tensor is observations x features x coefficients x coefficients which + # is too large to store in memory in most cases. However, the full 4D tensor is never + # actually needed but only its marginal across features, the final hessian block shape. + # Here, we use the Einstein summation to efficiently perform the two outer products and the marginalisation. + xh = self.calc_design_mat(design_scale, self.model_vars.constraints_scale) + + hblock = self.create_specific_block(w, xh, xh) + return hblock + + def _ab_byobs_batched(): + """ + Compute the mean-dispersion model off-diagonal block of the + closed form hessian of base_glm_all model by observation across features. + + Note that there are two blocks of the same size which can + be computed from each other with a transpose operation as + the hessian is symmetric. + """ + w = self._weight_hessian_ab(x=x, loc=loc, scale=scale) # [observations x features] + # The computation of the hessian block requires two outer products between + # feature-wise constants and the coefficient wise design matrix entries, for each observation. + # The resulting tensor is observations x features x coefficients x coefficients which + # is too large to store in memory in most cases. However, the full 4D tensor is never + # actually needed but only its marginal across features, the final hessian block shape. + # Here, we use the Einstein summation to efficiently perform the two outer products and the marginalisation. 
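+            # Here the two design factors differ: with xhloc: [observations x loc coefficients]
+            # and xhscale: [observations x scale coefficients], create_specific_block returns a
+            # [features x loc coefficients x scale coefficients] block.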
+ xhloc = self.calc_design_mat(design_loc, self.model_vars.constraints_loc) + xhscale = self.calc_design_mat(design_scale, self.model_vars.constraints_scale) + + hblock = self.create_specific_block(w, xhloc, xhscale) + return hblock + + if compute_a and compute_b: + h_aa = _aa_byobs_batched() + h_bb = _bb_byobs_batched() + h_ab = _ab_byobs_batched() + h_ba = tf.transpose(h_ab, perm=[0, 2, 1]) + elif compute_a and not compute_b: + h_aa = _aa_byobs_batched() + h_bb = tf.zeros_like(h_aa, dtype=self.dtype) + h_ab = tf.zeros_like(h_aa, dtype=self.dtype) + h_ba = tf.zeros_like(h_aa, dtype=self.dtype) + elif not compute_a and compute_b: + h_bb = _bb_byobs_batched() + h_aa = tf.zeros_like(h_bb, dtype=self.dtype) + h_ab = tf.zeros_like(h_bb, dtype=self.dtype) + h_ba = tf.zeros_like(h_bb, dtype=self.dtype) + else: + h_aa = tf.zeros((), dtype=self.dtype) + h_bb = tf.zeros((), dtype=self.dtype) + h_ab = tf.zeros((), dtype=self.dtype) + h_ba = tf.zeros((), dtype=self.dtype) + + if concat: + h = tf.concat( + [tf.concat([h_aa, h_ab], axis=2), + tf.concat([h_ba, h_bb], axis=2)], + axis=1 + ) + return h + else: + return h_aa, h_ab, h_ba, h_bb + + @abc.abstractmethod + def _weight_hessian_aa( + self, + x, + loc, + scale + ): + """ + Compute the coefficient index invariant part of the + mean model block of the hessian. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return const: tf.tensor observations x features + Coefficient invariant terms of hessian of + given observations and features. + """ + pass + + @abc.abstractmethod + def _weight_hessian_bb( + self, + x, + loc, + scale + ): + """ + Compute the coefficient index invariant part of the + dispersion model block of the hessian. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return const: tf.tensor observations x features + Coefficient invariant terms of hessian of + given observations and features. + """ + pass + + @abc.abstractmethod + def _weight_hessian_ab( + self, + x, + loc, + scale + ): + """ + Compute the coefficient index invariant part of the + mean-dispersion model block of the hessian. + + Note that there are two blocks of the same size which can + be compute from each other with a transpose operation as + the hessian is symmetric. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return const: tf.tensor observations x features + Coefficient invariant terms of hessian of + given observations and features. 
+ """ + pass diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py new file mode 100644 index 00000000..378196cb --- /dev/null +++ b/batchglm/train/tf2/base_glm/model.py @@ -0,0 +1,232 @@ +from importlib import import_module +import logging +import tensorflow as tf +from .external import ModelBase +from .processModel import ProcessModelGLM +logger = logging.getLogger("batchglm") + + +class GLM(ModelBase, ProcessModelGLM): + """ + base GLM class containg the model call. + """ + + def __init__( + self, + model_vars, + optimizer: str, + noise_module: str, + use_gradient_tape: bool = False, + compute_a: bool = True, + compute_b: bool = True, + dtype: str = "float32", + ): + super(GLM, self).__init__(dtype=dtype) + + self.model_vars = model_vars + self.use_gradient_tape = use_gradient_tape + self.compute_a = compute_a + self.compute_b = compute_b + self.params = tf.Variable(tf.concat( + [ + model_vars.init_a_clipped, + model_vars.init_b_clipped, + ], + axis=0 + ), name="params", trainable=True) + self.params_copy = self.params + + # import and add noise model specific layers. + layers = import_module('...' + noise_module + '.layers', __name__) + grad_layers = import_module('...' + noise_module + '.layers_gradients', __name__) + self.unpack_params = layers.UnpackParams(dtype=dtype) + self.linear_loc = layers.LinearLoc(dtype=dtype) + self.linear_scale = layers.LinearScale(dtype=dtype) + self.linker_loc = layers.LinkerLoc(dtype=dtype) + self.linker_scale = layers.LinkerScale(dtype=dtype) + self.likelihood = layers.Likelihood(dtype=dtype) + self.jacobian = grad_layers.Jacobian(model_vars=model_vars, dtype=dtype) + self.hessian = grad_layers.Hessian(model_vars=model_vars, dtype=dtype) + self.fim = grad_layers.FIM(model_vars=model_vars, dtype=dtype) + + self.setMethod(optimizer) + + def setMethod(self, optimizer: str): + """ + Determines which function is executed to calculate and return the desired outputs when + calling the model. The internal function is chosen based on the given optimizer. It will + through an AssertionError if the optimizer is not understood. + """ + optimizer = optimizer.lower() + if optimizer in ['gd', 'adam', 'adagrad', 'rmsprop']: + self._calc = self.calc_jacobians + + elif optimizer in ['nr', 'nr_tr']: + self._calc = self._calc_hessians + + elif optimizer in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr', 'irls_ar', 'irls_tr_ar', 'irls_tr_gd_tr']: + self._calc = self._calc_fim + else: + assert False, ("Unrecognized optimizer: %s", optimizer) + + def featurewise_batch(self): + """ + Applies a boolean mask over the feature dimension of the parameter matrix by removing + some feature columns (e.g. to exclude converged parameters) determined by the + `remaining_features` vector in `model_vars`. This method must be called after each + featurewise batch event to ensure the feature dimension of the input tensors matches the + feature dimension of `params_copy` in the following model call. + """ + self.params_copy = tf.Variable( + tf.boolean_mask(tensor=self.params, mask=self.model_vars.remaining_features, axis=1), + trainable=True) + + def apply_featurewise_updates(self, full_params_copy: tf.Tensor): + """ + Applies featurewise updates stored in `params_copy` on `params`. Feature columns in + `params` corresponding to remaining feature columns in `params_copy` are overwritten with + the new values while the others (corresponding to features with converged parameters) are + retained. 
This method must be called after each featurewise batch event to ensure that the + updates stored in `params_copy` aren't lost when deriving a new `params_copy` from `params` + in the following model calls using `featurewise_batch()`. + """ + self.params.assign( + tf.where(self.model_vars.remaining_features, full_params_copy, self.params)) + + def _call_parameters(self, inputs): + design_loc, design_scale, size_factors = inputs + a_var, b_var = self.unpack_params([self.params_copy, self.model_vars.a_var.get_shape()[0]]) + eta_loc = self.linear_loc([a_var, design_loc, self.model_vars.constraints_loc, size_factors]) + eta_scale = self.linear_scale([b_var, design_scale, self.model_vars.constraints_scale]) + loc = self.linker_loc(eta_loc) + scale = self.linker_scale(eta_scale) + return eta_loc, eta_scale, loc, scale, a_var, b_var + + def calc_ll(self, inputs): + """ + Calculates the log probabilities, summed up per feature column and returns it together with + loc, scale, a_var, and b_var (forwarding results from `_call_parameters`). + """ + parameters = self._call_parameters(inputs[1:]) + log_probs = self.likelihood([*parameters[:-2], inputs[0]]) + log_probs = tf.reduce_sum(log_probs, axis=0) + return (log_probs, *parameters[2:]) + + def calc_jacobians(self, inputs, compute_a=True, compute_b=None, concat=True): + if compute_b is None: + compute_b = self.compute_b + return self._calc_jacobians(inputs, compute_a=compute_a, compute_b=compute_b, concat=concat)[2:] + + def _calc_jacobians(self, inputs, compute_a, compute_b, concat=True, transpose=True): + """ + calculates jacobian. + + :param inputs: TODO + :param concat: boolean + if true, concatenates the loc and scale block. + :param transpose: bool + transpose the gradient if true. + autograd returns gradients with respect to the shape of self.params. + But analytic differentiation returns it the other way, which is + often needed for downstream operations (e.g. 
hessian) + Therefore, if self.use_gradient_tape, it will transpose if transpose == False + """ + + with tf.GradientTape(persistent=True) as g: + log_probs, loc, scale, a_var, b_var = self.calc_ll(inputs) + + if self.use_gradient_tape: + + if compute_a: + if compute_b: + if concat: + jacobians = g.gradient(log_probs, self.params_copy) + if not transpose: + jacobians = tf.transpose(jacobians) + else: + jac_a = g.gradient(log_probs, a_var) + jac_b = g.gradient(log_probs, b_var) + if not transpose: + jac_a = tf.transpose(jac_a) + jac_b = tf.transpose(jac_b) + else: + jac_a = g.gradient(log_probs, a_var) + jac_b = tf.zeros((jac_a.get_shape()[0], b_var.get_shape()[1]), b_var.dtype) + if concat: + jacobians = tf.concat([jac_a, jac_b], axis=0) + if not transpose: + jacobians = tf.transpose(jacobians) + else: + jac_b = g.gradient(log_probs, b_var) + jac_a = tf.zeros((jac_b.get_shape()[0], a_var.get_shape()[0]), a_var.dtype) + if concat: + jacobians = tf.concat([jac_a, jac_b], axis=0) + if not transpose: + jacobians = tf.transpose(jacobians) + + else: + + if concat: + jacobians = self.jacobian([*inputs[0:3], loc, scale, True, compute_a, compute_b]) + if transpose: + jacobians = tf.transpose(jacobians) + else: + jac_a, jac_b = self.jacobian([*inputs[0:3], loc, scale, False, compute_a, compute_b]) + + del g + if concat: + return loc, scale, log_probs, tf.negative(jacobians) + return loc, scale, log_probs, tf.negative(jac_a), tf.negative(jac_b) + + def _calc_hessians(self, inputs, compute_a, compute_b): + # with tf.GradientTape(persistent=True) as g2: + loc, scale, log_probs, jacobians = self._calc_jacobians(inputs, compute_a=compute_a, compute_b=compute_b, transpose=False) + ''' + autograd not yet working. TODO: Search error in the following code: + + if self.use_gradient_tape: + + i = tf.constant(0, tf.int32) + h_tensor_array = tf.TensorArray( # hessian slices [:,:,j] + dtype=self.params_copy.dtype, + size=self.params_copy.get_shape()[0], + clear_after_read=False + ) + while i < self.params_copy.get_shape()[0]: + grad = g2.gradient(results_arr[i], self.params_copy) + h_tensor_array.write(index=i, value=grad) + i += 1 + + # h_tensor_array is a TensorArray, reshape this into a tensor so that it can be used + # in down-stream computation graphs. + + hessians = tf.transpose(tf.reshape( + h_tensor_array.stack(), + tf.stack((self.params_copy.get_shape()[0], + self.params_copy.get_shape()[0], + self.params_copy.get_shape()[1])) + ), perm=[2, 1, 0]) + hessians = tf.negative(hessians) + ''' + hessians = tf.negative(self.hessian([*inputs[0:3], loc, scale, True, compute_a, compute_b])) + return log_probs, jacobians, hessians + + def _calc_fim(self, inputs, compute_a, compute_b): + loc, scale, log_probs, jac_a, jac_b = self._calc_jacobians( + inputs, + compute_a=compute_a, + compute_b=compute_b, + concat=False, + transpose=False) + fim_a, fim_b = self.fim([*inputs[0:3], loc, scale, False, compute_a, compute_b]) + return log_probs, jac_a, jac_b, fim_a, fim_b + + def call(self, inputs, compute_a=True, compute_b=None): + """ + Wrapper method to call this model. Depending on the desired calculations specified by the + `optimizer` arg to `__init__`, it will forward the call to the necessary function to perform + the right calculations and return all the results. 
+ """ + if compute_b is None: + compute_b = self.compute_b + return self._calc(inputs, compute_a, compute_b) diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py new file mode 100644 index 00000000..7903b372 --- /dev/null +++ b/batchglm/train/tf2/base_glm/optim.py @@ -0,0 +1,501 @@ +from .external import pkg_constants +import tensorflow as tf +from .external import OptimizerBase +import abc +import numpy as np + + +class SecondOrderOptim(OptimizerBase, metaclass=abc.ABCMeta): + + """ + Superclass for NR and IRLS + """ + + def _norm_neg_log_likelihood(self, log_probs): + return - log_probs / self.n_obs + + def _resource_apply_dense(self, grad, handle, apply_state=None): + + update_op = handle.assign_add(grad, read_value=False) + + return update_op + + def _resource_apply_sparse(self, grad, handle, apply_state=None): + + raise NotImplementedError('Applying SparseTensor currently not possible.') + + def get_config(self): + + config = {"name": "SOO"} + return config + + def _create_slots(self, var_list): + + self.add_slot(var_list[0], 'mu_r') + + def _trust_region_ops( + self, + proposed_vector, + compute_a, + compute_b, + batch_features, + update_theta, + ): + if compute_b and not compute_a: + t1 = pkg_constants.TRUST_REGIONT_T1_IRLS_GD_TR_SCALE + t2 = pkg_constants.TRUST_REGIONT_T2_IRLS_GD_TR_SCALE + else: + t1 = pkg_constants.TRUST_REGION_T1 + t2 = pkg_constants.TRUST_REGION_T2 + t1 = tf.constant(t1, dtype=self._dtype) + t2 = tf.constant(t2, dtype=self._dtype) + upper_bound = tf.constant(pkg_constants.TRUST_REGION_UPPER_BOUND, dtype=self._dtype) + + # Phase I: Perform a trial update. + # Propose parameter update: + + coeffs = self.model.model_vars.idx_train_loc if compute_b and not compute_a else self.model.model_vars.idx_train_scale + if compute_b: + idx_stop = self.model.model_vars.idx_train_scale[-1] + 1 + if compute_a: + idx_start = 0 + else: + idx_start = self.model.model_vars.idx_train_loc[-1] + 1 + else: + idx_start = 0 + idx_stop = self.model.model_vars.idx_train_loc[-1] + 1 + update_vector_length = tf.sqrt(tf.reduce_sum(tf.square(proposed_vector[idx_start:idx_stop]), axis=0)) + update_theta_full = update_theta + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(self.model.model_vars.remaining_features) + update_theta_full = tf.scatter_nd(indices, update_theta, shape=(n_features,)) + update_vector_length = tf.scatter_nd(indices, update_vector_length, shape=(n_features,)) + + tr_radius = self.tr_radius_b if compute_b and not compute_a else self.tr_radius + + increase_radius = update_theta_full + decrease_radius = tf.logical_not(update_theta_full) + + if compute_a: + increase_radius = tf.logical_and(increase_radius, update_vector_length > 0.9 * tr_radius) + decrease_radius = tf.logical_or(decrease_radius, update_vector_length < 0.5 * tr_radius) + + if compute_b and not compute_a: + self.model.model_vars.updated_b = update_theta_full.numpy() + else: + self.model.model_vars.updated = update_theta_full.numpy() + + # Update trusted region accordingly: + + keep_radius = tf.logical_and(tf.logical_not(decrease_radius), + tf.logical_not(increase_radius)) + radius_update = tf.add_n([ + tf.multiply(t1, tf.cast(decrease_radius, self._dtype)), + tf.multiply(t2, tf.cast(increase_radius, self._dtype)), + tf.multiply(tf.ones_like(t1), tf.cast(keep_radius, self._dtype)) + ]) + + radius_new = tf.minimum(tf.multiply(tr_radius, radius_update), upper_bound) + tr_radius.assign(radius_new) + + return update_theta + + def 
_trial_update(self, x_batches, log_probs, proposed_vector, is_batched, compute_a, compute_b): + + eta0 = tf.constant(pkg_constants.TRUST_REGION_ETA0, dtype=self._dtype) + """ + Current likelihood refers to the likelihood that has been calculated in the last model call. + We are always evaluating on the full model, so if we train on the batched model (is_batched), + current likelihood needs to be calculated on the full model using the same model state as + used in the last model call. Moreover, if this update is conducted separately for loc + (compute_a) and scale (compute_b), current likelihood always needs to be recalculated when + updating the scale params since the location params changed in the location update before. + This is only true if the location params are updated before the scale params however! + """ + current_likelihood = log_probs + if is_batched or compute_b and not compute_a: + for i, x_batch in enumerate(x_batches): + log_likelihood = self.model.calc_ll([*x_batch])[0] + current_likelihood = log_likelihood if i == 0 else tf.math.add(current_likelihood, log_likelihood) + + current_likelihood = self._norm_neg_log_likelihood(current_likelihood) + + """ + The new likelihood is calculated on the full model now, after updating the parameters using + the proposed vector: + """ + original_params_copy = tf.identity(self.model.params_copy) + self.model.params_copy.assign_sub(proposed_vector) + + for i, x_batch in enumerate(x_batches): + log_likelihood = self.model.calc_ll([*x_batch])[0] + if i == 0: + new_likelihood = log_likelihood + else: + new_likelihood += log_likelihood + new_likelihood = self._norm_neg_log_likelihood(new_likelihood) + + """ + delta_f_actual shows the difference between the log likelihoods before and after the proposed + update of parameters. It is > 0 if the new likelihood is greater than the old. + """ + delta_f_actual = tf.math.subtract(current_likelihood, new_likelihood) + + """ + If we use feature batching, the individual vector indices need to be spread out to the full + feature space by adding columns corresponding to positions of converged (non calculated) + features. 
+ """ + + # Compute parameter updates.g + update_theta = delta_f_actual > eta0 + self.model.params_copy.assign(tf.where(update_theta, self.model.params_copy, original_params_copy)) + + return update_theta + + def __init__(self, dtype: tf.dtypes.DType, tr_mode: bool, model: tf.keras.Model, name: str, n_obs: int): + + super(SecondOrderOptim, self).__init__(name) + + self.model = model + self._dtype = dtype + self.n_obs = tf.cast(n_obs, dtype=self._dtype) + self.tr_mode = tr_mode + + n_features = self.model.model_vars.n_features + if tr_mode: + self.tr_radius = tf.Variable( + np.zeros(shape=[n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT, + dtype=self._dtype, trainable=False) + + else: + self.tr_radius = tf.Variable(np.array([np.inf]), dtype=self._dtype, trainable=False) + self.model.model_vars.updated = np.repeat(a=True, repeats=n_features) + + @abc.abstractmethod + def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): + pass + + def _newton_type_update(self, lhs, rhs, psd=False): + + new_rhs = tf.expand_dims(rhs, axis=-1) + res = tf.linalg.lstsq(lhs, new_rhs, fast=psd) + delta_t = tf.squeeze(res, axis=-1) + update_tensor = tf.transpose(delta_t) + return update_tensor + + def _pad_updates( + self, + update_raw, + compute_a, + compute_b + ): + # Pad update vectors to receive update tensors that match + # the shape of model_vars.params. + if compute_a: + if compute_b: + netwon_type_update = update_raw + else: + netwon_type_update = tf.concat([ + update_raw, + tf.zeros(shape=(self.model.model_vars.b_var.get_shape()[0], update_raw.get_shape()[1]), + dtype=self._dtype) + ], axis=0) + + elif compute_b: + netwon_type_update = tf.concat([ + tf.zeros(shape=(self.model.model_vars.a_var.get_shape()[0], update_raw.get_shape()[1]), + dtype=self._dtype), + update_raw + ], axis=0) + + else: + raise ValueError("No training necessary") + + return netwon_type_update + + @staticmethod + def _calc_update_magnitudes(update_raw): + update_magnitude_sq = tf.reduce_sum(tf.square(update_raw), axis=0) + update_magnitude = tf.where( + condition=update_magnitude_sq > 0, + x=tf.sqrt(update_magnitude_sq), + y=tf.zeros_like(update_magnitude_sq) + ) + update_magnitude_inv = tf.where( + condition=update_magnitude > 0, + x=tf.divide( + tf.ones_like(update_magnitude), + update_magnitude + ), + y=tf.zeros_like(update_magnitude) + ) + return update_magnitude, update_magnitude_inv + + def _trust_region_update( + self, + update_raw, + radius_container, + ): + update_magnitude, update_magnitude_inv = SecondOrderOptim._calc_update_magnitudes(update_raw) + update_norm = tf.multiply(update_raw, update_magnitude_inv) + + update_scale = tf.minimum( + radius_container, + update_magnitude + ) + proposed_vector = tf.multiply( + update_norm, + update_scale + ) + + return proposed_vector + + def normalize_update_magnitude(self, update_magnitude): + return update_magnitude + + def _trust_region_newton_cost_gain( + self, + proposed_vector, + neg_jac, + hessian_fim + ): + pred_cost_gain = tf.add( + tf.einsum( + 'ni,in->n', + neg_jac, + proposed_vector + ) / self.n_obs, + 0.5 * tf.einsum( + 'nix,xin->n', + tf.einsum('inx,nij->njx', + tf.expand_dims(proposed_vector, axis=-1), + hessian_fim), + tf.expand_dims(proposed_vector, axis=0) + ) / tf.square(self.n_obs) + ) + return pred_cost_gain + + +class NR(SecondOrderOptim): + + def _get_updates(self, lhs, rhs, compute_a, compute_b): + + update_raw = self._newton_type_update(lhs=lhs, rhs=rhs) + update = self._pad_updates(update_raw, 
compute_a, compute_b) + + return update_raw, update + + def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): + + x_batches, log_probs, jacobians, hessians = inputs + tr_mode = self.tr_mode + if compute_b: + if not compute_a: + self.model.model_vars.updated_b = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to not updated. + tr_mode = self.tr_mode_b + + assert (compute_a or compute_b), "Nothing can be trained. Please make sure " \ "at least one of train_mu and train_r is set to True." + + update_raw, update = self._get_updates(hessians, jacobians, compute_a, compute_b) + + if tr_mode: + if batch_features: + radius_container = tf.boolean_mask( + tensor=self.tr_radius, + mask=self.model.model_vars.remaining_features) + else: + radius_container = self.tr_radius + tr_proposed_vector = self._trust_region_update( + update_raw=update_raw, + radius_container=radius_container + ) + #tr_pred_cost_gain = self._trust_region_newton_cost_gain( + # proposed_vector=tr_proposed_vector, + # neg_jac=jacobians, + # hessian_fim=hessians + #) + + #tr_proposed_vector_pad = self._pad_updates( + # update_raw=tr_proposed_vector, + # compute_a=compute_a, + # compute_b=compute_b + #) + update_theta = self._trial_update( + x_batches=x_batches, + log_probs=log_probs, + proposed_vector=tr_proposed_vector, + is_batched=is_batched, + compute_a=compute_a, + compute_b=compute_b) + self._trust_region_ops( + proposed_vector=tr_proposed_vector, + compute_a=compute_a, + compute_b=compute_b, + batch_features=batch_features, + update_theta=update_theta + ) + + else: + update_theta = self._trial_update( + x_batches=x_batches, + log_probs=log_probs, + proposed_vector=update, + is_batched=is_batched, + compute_a=compute_a, + compute_b=compute_b + ) + + #self.model.params_copy.assign_sub(update) + + +class IRLS(SecondOrderOptim): + + def _calc_proposed_vector_and_pred_cost_gain( + self, + update_x, + radius_container, + neg_jac_x, + fim_x=None + ): + """ + Calculates the proposed vector and predicted cost gain for either the mean or the scale part. + :param update_x: tf.tensor coefficients x features ? TODO + + :param radius_container: tf.tensor ? x ? TODO + + :param neg_jac_x: tf.Tensor coefficients x features ? TODO + Upper (mu part) or lower (r part) of negative jacobian matrix + :param fim_x: + Upper (mu part) or lower (r part) of the Fisher information matrix. + Defaults to None; only needed if gd is False + :return proposed_vector_x, pred_cost_gain_x + Returns proposed vector and predicted cost gain after + the trust region update for either the mu or r part, depending on x + """ + + proposed_vector_x = self._trust_region_update( + update_raw=update_x, + radius_container=radius_container + ) + + #pred_cost_gain_x = self._trust_region_newton_cost_gain( + # proposed_vector=proposed_vector_x, + # neg_jac=neg_jac_x, + # hessian_fim=fim_x + #) + + return proposed_vector_x, None#pred_cost_gain_x + + + def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): + + x_batches, log_probs, jac_a, jac_b, fim_a, fim_b = inputs + tr_mode = self.tr_mode + if compute_b: + if not compute_a: + self.model.model_vars.updated_b = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to not updated. + tr_mode = self.tr_mode_b + + assert (compute_a or compute_b), "Nothing can be trained. Please make sure " \ "at least one of train_mu and train_r is set to True." + + # Compute a and b model updates separately.
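As a reading aid, the following hedged sketch condenses the mechanism implemented by `_newton_type_update` and `_trust_region_update` above into one function: a per-feature linear solve followed by capping the step length at the trust-region radius. The helper name and shapes are assumptions for illustration, not code from this diff:

```python
import tensorflow as tf

def trust_region_step_sketch(fim, neg_jac, radius, psd=False):
    # fim:     [features x coeffs x coeffs]  FIM (or Hessian) blocks
    # neg_jac: [features x coeffs]           negative jacobian per feature
    # radius:  [features]                    current trust-region radii
    # One linear solve per feature; fast=True takes the Cholesky path, which is
    # only safe for positive (semi-)definite blocks.
    delta = tf.linalg.lstsq(fim, tf.expand_dims(neg_jac, -1), fast=psd)
    update = tf.transpose(tf.squeeze(delta, -1))              # [coeffs x features]
    # Rescale each feature's step so its norm does not exceed the radius.
    norm = tf.norm(update, axis=0)
    factor = tf.math.divide_no_nan(tf.minimum(radius, norm), norm)
    return update * factor
```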
+ if compute_a: + # The FIM of the mean model is guaranteed to be + # positive semi-definite and can therefore be inverted + # with the Cholesky decomposition. This information is + # passed here with psd=True. + update_a = self._newton_type_update( + lhs=fim_a, + rhs=jac_a, + psd=False + ) + if compute_b: + + update_b = self._newton_type_update( + lhs=fim_b, + rhs=jac_b + ) + + if not tr_mode: + if compute_a: + if compute_b: + update_raw = tf.concat([update_a, update_b], axis=0) + else: + update_raw = update_a + else: + update_raw = update_b + + update = self._pad_updates( + update_raw=update_raw, + compute_a=compute_a, + compute_b=compute_b + ) + + update_theta = self._trial_update( + x_batches=x_batches, + log_probs=log_probs, + proposed_vector=update, + is_batched=is_batched, + compute_a=compute_a, + compute_b=compute_b + ) + #print(update_theta) + #self.model.params_copy.assign_sub(update_var) + + else: + # put together update_raw based on proposed vector and cost gain depending on train_r and train_mu + if batch_features: + radius_container = tf.boolean_mask( + tensor=self.tr_radius, + mask=self.model.model_vars.remaining_features) + else: + radius_container = self.tr_radius + + if compute_b: + tr_proposed_vector_b, tr_pred_cost_gain_b = self._calc_proposed_vector_and_pred_cost_gain( + update_b, radius_container, jac_b, fim_b) + if compute_a: + tr_proposed_vector_a, tr_pred_cost_gain_a = self._calc_proposed_vector_and_pred_cost_gain( + update_a, radius_container, jac_a, fim_a) + + tr_update_raw = tf.concat([tr_proposed_vector_a, tr_proposed_vector_b], axis=0) + #tr_pred_cost_gain = tf.add(tr_pred_cost_gain_a, tr_pred_cost_gain_b) + else: + # directly apply output of calc_proposed_vector_and_pred_cost_gain to tr_update_raw + # and tr_pred_cost_gain + tr_update_raw = tr_proposed_vector_b + #tr_pred_cost_gain = tr_pred_cost_gain_b + else: + # here train_r is False AND train_mu is true, so the output of the function can directly be applied to + # tr_update_raw and tr_pred_cost_gain, similar to train_r = True and train_mu = False + tr_update_raw, tr_pred_cost_gain = self._calc_proposed_vector_and_pred_cost_gain( + update_a, radius_container, jac_a, fim_a) + + tr_update = self._pad_updates( + update_raw=tr_update_raw, + compute_a=compute_a, + compute_b=compute_b + ) + # perform update + update_theta = self._trial_update( + x_batches=x_batches, + log_probs=log_probs, + proposed_vector=tr_update, + is_batched=is_batched, + compute_a=compute_a, + compute_b=compute_b) + self._trust_region_ops( + proposed_vector=tr_update, + compute_a=compute_a, + compute_b=compute_b, + batch_features=batch_features, + update_theta=update_theta + ) + + def calc_delta_f_actual(self, current_likelihood, new_likelihood, jacobian): + eta1 = tf.constant(pkg_constants.TRUST_REGION_ETA1, dtype=self._dtype) + return diff --git a/batchglm/train/tf2/base_glm/processModel.py b/batchglm/train/tf2/base_glm/processModel.py new file mode 100644 index 00000000..4b6aedf7 --- /dev/null +++ b/batchglm/train/tf2/base_glm/processModel.py @@ -0,0 +1,9 @@ +from .external import ProcessModelBase +import abc + + +class ProcessModelGLM(ProcessModelBase): + + @abc.abstractmethod + def param_bounds(self, dtype): + pass diff --git a/batchglm/train/tf2/base_glm/training_strategies.py b/batchglm/train/tf2/base_glm/training_strategies.py new file mode 100644 index 00000000..63f295d3 --- /dev/null +++ b/batchglm/train/tf2/base_glm/training_strategies.py @@ -0,0 +1,111 @@ +from enum import Enum + + +class TrainingStrategies(Enum): + + AUTO = 
None + + DEFAULT = \ + { + "optim_algo": "default_adam", + "jacobian": True, + "hessian": False, + "fim": False, + "concat_grads": True + } + + GD = \ + { + "optim_algo": "gd", + "jacobian": True, + "hessian": False, + "fim": False, + "concat_grads": True + } + + ADAM = \ + { + "optim_algo": "adam", + "jacobian": True, + "hessian": False, + "fim": False, + "concat_grads": True + } + + ADAGRAD = \ + { + "optim_algo": "adagrad", + "jacobian": True, + "hessian": False, + "fim": False, + "concat_grads": True + } + + RMSPROP = \ + { + "optim_algo": "rmsprop", + "jacobian": True, + "hessian": False, + "fim": False, + "concat_grads": True + } + + IRLS = \ + { + "optim_algo": "irls", + "jacobian": False, + "hessian": False, + "fim": True, + "concat_grads": False, + "calc_separated": True + } + + IRLS_TR = \ + { + "optim_algo": "irls_tr", + "jacobian": False, + "hessian": False, + "fim": True, + "concat_grads": False, + "calc_separated": True + } + + IRLS_GD = \ + { + "optim_algo": "irls_gd", + "jacobian": False, + "hessian": False, + "fim": True, + "concat_grads": False, + "calc_separated": True + } + + IRLS_GD_TR = \ + { + "optim_algo": "irls_gd_tr", + "jacobian": False, + "hessian": False, + "fim": True, + "concat_grads": False, + "calc_separated": True + } + + NR = \ + { + "optim_algo": "nr", + "jacobian": False, + "hessian": True, + "fim": False, + "concat_grads": True, + "calc_separated": False + } + + NR_TR = \ + { + "optim_algo": "nr_tr", + "jacobian": False, + "hessian": True, + "fim": False, + "concat_grads": True, + "calc_separated": False + } diff --git a/batchglm/train/tf2/base_glm/vars.py b/batchglm/train/tf2/base_glm/vars.py new file mode 100644 index 00000000..f57ff08c --- /dev/null +++ b/batchglm/train/tf2/base_glm/vars.py @@ -0,0 +1,90 @@ +import numpy as np +import tensorflow as tf +import abc + +from .model import ProcessModelGLM + + +class ModelVarsGLM(ProcessModelGLM): + """ Build tf.Variables to be optimzed and their constraints. + + a_var and b_var slices of the tf.Variable params which contains + all parameters to be optimized during model estimation. + Params is defined across both location and scale model so that + the hessian can be computed for the entire model. + a and b are the clipped parameter values which also contain + constraints and constrained dependent coefficients which are not + directly optimized. + """ + + constraints_loc: tf.Tensor + constraints_scale: tf.Tensor + params: tf.Variable + a_var: tf.Tensor + b_var: tf.Tensor + updated: np.ndarray + converged: np.ndarray + dtype: str + n_features: int + + def __init__( + self, + init_a: np.ndarray, + init_b: np.ndarray, + constraints_loc: np.ndarray, + constraints_scale: np.ndarray, + dtype: str + ): + """ + + :param init_a: nd.array (mean model size x features) + Initialisation for all parameters of mean model. + :param init_b: nd.array (dispersion model size x features) + Initialisation for all parameters of dispersion model. + :param dtype: Precision used in tensorflow. + """ + self.constraints_loc = tf.convert_to_tensor(constraints_loc, dtype) + self.constraints_scale = tf.convert_to_tensor(constraints_scale, dtype) + + self.init_a = tf.convert_to_tensor(init_a, dtype=dtype) + self.init_b = tf.convert_to_tensor(init_b, dtype=dtype) + + self.init_a_clipped = self.tf_clip_param(self.init_a, "a_var") + self.init_b_clipped = self.tf_clip_param(self.init_b, "b_var") + + # Param is the only tf.Variable in the graph. + # a_var and b_var have to be slices of params. 
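The convergence bookkeeping set up just below (notably `remaining_features`) is what `featurewise_batch` and `apply_featurewise_updates` in model.py consume: converged feature columns of `params` are masked out during training and carried along unchanged when updates are written back. A minimal round-trip sketch, with toy shapes and names chosen purely for illustration:

```python
import numpy as np
import tensorflow as tf

n_params, n_features = 2, 4
params = tf.Variable(tf.zeros([n_params, n_features]))
remaining = np.array([True, False, True, False])        # features still being trained

# featurewise_batch(): train only the still-active feature columns.
params_copy = tf.Variable(tf.boolean_mask(params, remaining, axis=1))
params_copy.assign_add(tf.ones_like(params_copy))       # stand-in for an optimizer step

# apply_featurewise_updates(): spread the trained columns back out and keep
# converged columns as they were.
idx = tf.where(remaining)                                # [n_active x 1] feature indices
full = tf.transpose(tf.scatter_nd(idx, tf.transpose(params_copy), [n_features, n_params]))
params.assign(tf.where(remaining, full, params))
```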
+ self.params = tf.Variable(tf.concat( + [ + self.init_a_clipped, + self.init_b_clipped, + ], + axis=0 + ), name="params") + + a_var = self.params[0:init_a.shape[0]] + b_var = self.params[init_a.shape[0]:] + + self.a_var = self.tf_clip_param(a_var, "a_var") + self.b_var = self.tf_clip_param(b_var, "b_var") + + # Properties to follow gene-wise convergence. + self.updated = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to is updated. + self.updated_b = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to is updated. + self.converged = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to non-converged. + self.converged_b = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to non-converged. + + self.total_converged = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to non-converged. + self.remaining_features = np.repeat(a=True, repeats=self.params.shape[1]) + self.dtype = dtype + self.n_features = self.params.shape[1] + self.idx_train_loc = np.arange(0, init_a.shape[0]) + self.idx_train_scale = np.arange(init_a.shape[0], init_a.shape[0]+init_b.shape[0]) + + @abc.abstractmethod + def param_bounds(self, dtype): + pass + + def convergence_update(self, status: np.ndarray, features_updated: np.ndarray): + self.converged = status.copy() + self.updated = features_updated diff --git a/batchglm/train/tf2/glm_beta/__init__.py b/batchglm/train/tf2/glm_beta/__init__.py new file mode 100644 index 00000000..a616f181 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/__init__.py @@ -0,0 +1,5 @@ +from .processModel import ProcessModel +from .vars import ModelVars +from .estimator import Estimator + +from .model import BetaGLM diff --git a/batchglm/train/tf2/glm_beta/estimator.py b/batchglm/train/tf2/glm_beta/estimator.py new file mode 100644 index 00000000..a2f3f056 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/estimator.py @@ -0,0 +1,247 @@ +import logging +from typing import Union + +import numpy as np + +from .external import closedform_beta_glm_logitmean, closedform_beta_glm_logsamplesize +from .external import InputDataGLM, Model +from .external import Estimator as GLMEstimator +from .model import BetaGLM +from .processModel import ProcessModel +from .vars import ModelVars +from .training_strategies import TrainingStrategies + + +class Estimator(GLMEstimator, ProcessModel): + """ + Estimator for Generalized Linear Models (GLMs) with beta distributed noise. + Uses a logit linker function for loc and log linker function for scale. + """ + + model: BetaGLM + + def __init__( + self, + input_data: InputDataGLM, + init_a: Union[np.ndarray, str] = "AUTO", + init_b: Union[np.ndarray, str] = "AUTO", + quick_scale: bool = False, + dtype="float32", + ): + """ + Performs initialisation and creates a new estimator. + + :param input_data: InputDataGLM + The input data + :param init_a: (Optional) + Low-level initial values for a. Can be: + + - str: + * "auto": automatically choose best initialization + * "random": initialize with random values + * "standard": initialize intercept with observed mean + * "init_model": initialize with another model (see `ìnit_model` parameter) + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'a' + :param init_b: (Optional) + Low-level initial values for b. 
Can be: + + - str: + * "auto": automatically choose best initialization + * "random": initialize with random values + * "standard": initialize with zeros + * "init_model": initialize with another model (see `ìnit_model` parameter) + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'b' + :param quick_scale: bool + Whether `scale` will be fitted faster and maybe less accurate. + Useful in scenarios where fitting the exact `scale` is not absolutely necessary. + :param dtype: Precision used in tensorflow. + """ + self.TrainingStrategies = TrainingStrategies + + self._train_loc = True + self._train_scale = True + + (init_a, init_b) = self.init_par( + input_data=input_data, + init_a=init_a, + init_b=init_b, + init_model=None + ) + init_a = init_a.astype(dtype) + init_b = init_b.astype(dtype) + if quick_scale: + self._train_scale = False + + self.model_vars = ModelVars( + init_a=init_a, + init_b=init_b, + constraints_loc=input_data.constraints_loc, + constraints_scale=input_data.constraints_scale, + dtype=dtype + ) + + super(Estimator, self).__init__( + input_data=input_data, + dtype=dtype + ) + + def train( + self, + use_batching: bool = True, + batch_size: int = 500, + optim_algo: str = "adam", + learning_rate: float = 1e-2, + convergence_criteria: str = "step", + stopping_criteria: int = 1000, + autograd: bool = False, + featurewise: bool = True, + benchmark: bool = False + ): + + if self.model is None: + self.model = BetaGLM( + model_vars=self.model_vars, + dtype=self.model_vars.dtype, + compute_a=self._train_loc, + compute_b=self._train_scale, + use_gradient_tape=autograd, + optimizer=optim_algo + ) + else: + self.model.setMethod(optim_algo) + + optimizer_object = self.get_optimizer_object(optim_algo, learning_rate) + + super(Estimator, self)._train( + noise_model="beta", + is_batched=use_batching, + batch_size=batch_size, + optimizer_object=optimizer_object, + convergence_criteria=convergence_criteria, + stopping_criteria=stopping_criteria, + autograd=autograd, + benchmark=benchmark, + optim_algo=optim_algo + ) + + def get_model_container( + self, + input_data + ): + return Model(input_data=input_data) + + def init_par( + self, + input_data, + init_a, + init_b, + init_model + ): + r""" + standard: + Only initialise intercept and keep other coefficients as zero. 
+ + closed-form: + Initialize with Maximum Likelihood / Maximum of Momentum estimators + """ + + if init_model is None: + groupwise_means = None + init_a_str = None + if isinstance(init_a, str): + init_a_str = init_a.lower() + # Chose option if auto was chosen + if init_a.lower() == "auto": + init_a = "closed_form" + + if init_a.lower() == "closed_form": + groupwise_means, init_a, rmsd_a = closedform_beta_glm_logitmean( + x=input_data.x, + design_loc=input_data.design_loc, + constraints_loc=input_data.constraints_loc, + size_factors=input_data.size_factors_init, + link_fn=lambda mean: np.log( + 1/(1/self.np_clip_param(mean, "mean")-1) + ) + ) + + # train mu, if the closed-form solution is inaccurate + self._train_loc = not (np.all(rmsd_a == 0) or rmsd_a.size == 0) + + logging.getLogger("batchglm").debug("Using closed-form MME initialization for mean") + elif init_a.lower() == "standard": + overall_means = np.mean(input_data.x, axis=0) + overall_means = self.np_clip_param(overall_means, "mean") + + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + init_a[0, :] = np.log(overall_means/(1-overall_means)) + self._train_loc = True + + logging.getLogger("batchglm").debug("Using standard initialization for mean") + elif init_a.lower() == "all_zero": + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + self._train_loc = True + + logging.getLogger("batchglm").debug("Using all_zero initialization for mean") + else: + raise ValueError("init_a string %s not recognized" % init_a) + logging.getLogger("batchglm").debug("Should train mean: %s", self._train_loc) + if isinstance(init_b, str): + if init_b.lower() == "auto": + init_b = "standard" + + if init_b.lower() == "standard": + groupwise_scales, init_b_intercept, rmsd_b = closedform_beta_glm_logsamplesize( + x=input_data.x, + design_scale=input_data.design_scale[:, [0]], + constraints=input_data.constraints_scale[[0], :][:, [0]], + size_factors=input_data.size_factors, + groupwise_means=None, + link_fn=lambda samplesize: np.log(self.np_clip_param(samplesize, "samplesize")) + ) + init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) + init_b[0, :] = init_b_intercept + + logging.getLogger("batchglm").debug("Using standard-form MME initialization for dispersion") + elif init_b.lower() == "closed_form": + dmats_unequal = False + if input_data.num_design_loc_params == input_data.num_design_scale_params: + if np.any(input_data.design_loc != input_data.design_scale): + dmats_unequal = True + + inits_unequal = False + if init_a_str is not None: + if init_a_str != init_b: + inits_unequal = True + + if inits_unequal or dmats_unequal: + raise ValueError( + "cannot use closed_form init for scale model if scale model differs from loc model" + ) + + groupwise_scales, init_b, rmsd_b = closedform_beta_glm_logsamplesize( + x=input_data.x, + design_scale=input_data.design_scale, + constraints=input_data.constraints_scale, + size_factors=input_data.size_factors, + groupwise_means=groupwise_means, + link_fn=lambda samplesize: np.log(self.np_clip_param(samplesize, "samplesize")) + ) + + logging.getLogger("batchglm").debug("Using closed-form MME initialization for dispersion") + elif init_b.lower() == "all_zero": + init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) + + logging.getLogger("batchglm").debug("Using standard initialization for dispersion") + else: + raise ValueError("init_b string %s not recognized" % init_b) + logging.getLogger("batchglm").debug("Should train r: %s", 
self._train_scale) + else: + init_a, init_b = self.get_init_from_model(init_a=init_a, + init_b=init_b, + input_data=input_data, + init_model=init_model) + + return init_a, init_b diff --git a/batchglm/train/tf2/glm_beta/external.py b/batchglm/train/tf2/glm_beta/external.py new file mode 100644 index 00000000..6ee962a0 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/external.py @@ -0,0 +1,11 @@ +from batchglm import pkg_constants +import batchglm.data as data_utils + +from batchglm.models.base_glm.utils import closedform_glm_mean, closedform_glm_scale +from batchglm.models.glm_beta import _EstimatorGLM, InputDataGLM, Model +from batchglm.models.glm_beta.utils import closedform_beta_glm_logitmean, closedform_beta_glm_logsamplesize +from batchglm.utils.linalg import groupwise_solve_lm + +from batchglm.train.tf2.base_glm import ProcessModelGLM, GLM, Estimator, ModelVarsGLM +from batchglm.train.tf2.base_glm import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM +from batchglm.train.tf2.base_glm import FIMGLM, JacobianGLM, HessianGLM diff --git a/batchglm/train/tf2/glm_beta/layers.py b/batchglm/train/tf2/glm_beta/layers.py new file mode 100644 index 00000000..0cfc0ef9 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/layers.py @@ -0,0 +1,53 @@ +import tensorflow as tf +from .external import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM +from .processModel import ProcessModel + + +class UnpackParams(UnpackParamsGLM, ProcessModel): + """ + Full class. + """ + + +class LinearLoc(LinearLocGLM, ProcessModel): + + def with_size_factors(self, eta_loc, size_factors): + raise NotImplementedError("There are no size_factors for GLMs with Beta noise.") + + +class LinearScale(LinearScaleGLM, ProcessModel): + """ + Full Class + """ + + +class LinkerLoc(LinkerLocGLM): + + def _inv_linker(self, loc: tf.Tensor): + return 1 / (1 + tf.exp(-loc)) + + +class LinkerScale(LinkerScaleGLM): + + def _inv_linker(self, scale: tf.Tensor): + return tf.exp(scale) + + +class Likelihood(LikelihoodGLM, ProcessModel): + + def _ll(self, eta_loc, eta_scale, loc, scale, x): + + if isinstance(x, tf.SparseTensor): + one_minus_x = tf.negative(tf.sparse.add(x, tf.negative(tf.ones_like(loc)))) + else: + one_minus_x = 1 - x + + one_minus_loc = 1 - loc + log_probs = tf.math.lgamma(scale) - tf.math.lgamma(loc * scale) \ + - tf.math.lgamma(one_minus_loc * scale) \ + + (scale * loc - 1) * tf.math.log(x) \ + + (one_minus_loc * scale - 1) * tf.math.log(one_minus_x) + + log_probs = self.tf_clip_param(log_probs, "log_probs") + + return log_probs diff --git a/batchglm/train/tf2/glm_beta/layers_gradients.py b/batchglm/train/tf2/glm_beta/layers_gradients.py new file mode 100644 index 00000000..566e9b44 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/layers_gradients.py @@ -0,0 +1,144 @@ +import tensorflow as tf +from .external import FIMGLM, JacobianGLM, HessianGLM + + +class FIM(FIMGLM): + # No Fisher Information Matrices due to unsolvable E[log(X)] + + def _weight_fim_aa( + self, + x, + loc, + scale + ): + assert False, "not implemented" + + def _weight_fim_bb( + self, + x, + loc, + scale + ): + assert False, "not implemented" + + +class Jacobian(JacobianGLM): + + def _weights_jac_a( + self, + x, + loc, + scale, + ): + one_minus_loc = 1 - loc + if isinstance(x, tf.SparseTensor): + const1 = tf.math.log(tf.sparse.add(tf.zeros_like(loc), x).__div__(-tf.sparse.add(x, -tf.ones_like(loc)))) + else: + const1 = tf.math.log(x / (1 - x)) + const2 = - tf.math.digamma(loc * 
scale) + tf.math.digamma(one_minus_loc * scale) + const1 + const = const2 * scale * loc * one_minus_loc + return const + + def _weights_jac_b( + self, + x, + loc, + scale, + ): + if isinstance(x, tf.SparseTensor): + one_minus_x = - tf.sparse.add(x, -tf.ones_like(loc)) + else: + one_minus_x = 1 - x + one_minus_loc = 1 - loc + const = scale * (tf.math.digamma(scale) - tf.math.digamma(loc * scale) * loc - tf.math.digamma( + one_minus_loc * scale) * one_minus_loc + loc * tf.math.log(x) + one_minus_loc * tf.math.log( + one_minus_x)) + return const + + +class Hessian(HessianGLM): + + def _weight_hessian_aa( + self, + x, + loc, + scale, + ): + one_minus_loc = 1 - loc + loc_times_scale = loc * scale + one_minus_loc_times_scale = one_minus_loc * scale + + if isinstance(x, tf.SparseTensor): + # Using the dense matrix of the location model to serve the correct shapes for the sparse X. + const1 = tf.sparse.add(tf.zeros_like(loc), x).__div__(-tf.sparse.add(x, -tf.ones_like(loc))) + # Adding tf.zeros_like(loc) is a hack to avoid bug thrown by log on sparse matrix below, + # to_dense does not work. + else: + const1 = tf.math.log(x / (tf.ones_like(x) - x)) + + const2 = (1 - 2 * loc) * ( + - tf.math.digamma(loc_times_scale) + tf.math.digamma(one_minus_loc_times_scale) + const1) + const3 = loc * one_minus_loc_times_scale * ( + - tf.math.polygamma(tf.ones_like(loc), loc_times_scale) - tf.math.polygamma(tf.ones_like(loc), + one_minus_loc_times_scale)) + const = loc * one_minus_loc_times_scale * (const2 + const3) + return const + + def _weight_hessian_ab( + self, + x, + loc, + scale, + ): + one_minus_loc = 1 - loc + loc_times_scale = loc * scale + one_minus_loc_times_scale = one_minus_loc * scale + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + + if isinstance(x, tf.SparseTensor): + # Using the dense matrix of the location model to serve the correct shapes for the sparse X. + const1 = tf.sparse.add(tf.zeros_like(loc), x).__div__(-tf.sparse.add(x, -tf.ones_like(loc))) + # Adding tf.zeros_like(loc) is a hack to avoid bug thrown by log on sparse matrix below, + # to_dense does not work. + else: + const1 = tf.math.log(x / (1 - x)) + + const2 = - tf.math.digamma(loc_times_scale) + tf.math.digamma(one_minus_loc_times_scale) + const1 + const3 = scale * (- tf.math.polygamma(scalar_one, loc_times_scale) * loc + one_minus_loc * tf.math.polygamma( + scalar_one, + one_minus_loc_times_scale)) + + const = loc * one_minus_loc_times_scale * (const2 + const3) + + return const + + def _weight_hessian_bb( + self, + x, + loc, + scale, + ): + one_minus_loc = 1 - loc + loc_times_scale = loc * scale + one_minus_loc_times_scale = one_minus_loc * scale + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + + if isinstance(x, tf.SparseTensor): + # Using the dense matrix of the location model to serve the correct shapes for the sparse X. + const1 = tf.sparse.add(tf.zeros_like(loc), x).__div__(-tf.sparse.add(x, -tf.ones_like(loc))) + # Adding tf.zeros_like(loc) is a hack to avoid bug thrown by log on sparse matrix below, + # to_dense does not work. 
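For reference, the densification trick described in the comment above can be isolated as in the following hedged sketch (not part of the diff); it assumes `x` is either dense or a `tf.SparseTensor` with the same dense shape as `loc`:

```python
import tensorflow as tf

def log_odds(x, loc):
    # tf.math.log has no SparseTensor kernel, so a sparse X is first broadcast-added
    # onto a dense zero tensor shaped like the location model output.
    if isinstance(x, tf.SparseTensor):
        x = tf.sparse.add(tf.zeros_like(loc), x)
    return tf.math.log(x / (1 - x))
```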
+ const2 = loc * (tf.math.log(tf.sparse.add(tf.zeros_like(loc), x)) - tf.math.digamma(loc_times_scale)) \ + - one_minus_loc * (tf.math.digamma(one_minus_loc_times_scale) + tf.math.log(const1)) \ + + tf.math.digamma(scale) + else: + const1 = tf.math.log(x / (1 - x)) + const2 = loc * (tf.math.log(x) - tf.math.digamma(loc_times_scale)) \ + - one_minus_loc * (tf.math.digamma(one_minus_loc_times_scale) + tf.math.log(const1)) \ + + tf.math.digamma(scale) + const3 = scale * (- tf.square(loc) * tf.math.polygamma(scalar_one, loc_times_scale) + + tf.math.polygamma(scalar_one, scale) + - tf.math.polygamma(scalar_one, one_minus_loc_times_scale) * tf.square(one_minus_loc)) + const = scale * (const2 + const3) + + return const diff --git a/batchglm/train/tf2/glm_beta/model.py b/batchglm/train/tf2/glm_beta/model.py new file mode 100644 index 00000000..556f5335 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/model.py @@ -0,0 +1,24 @@ +from .external import GLM +from .processModel import ProcessModel + + +class BetaGLM(GLM, ProcessModel): + + def __init__( + self, + model_vars, + optimizer: str, + compute_a: bool, + compute_b: bool, + use_gradient_tape: bool, + dtype: str, + ): + super(BetaGLM, self).__init__( + model_vars=model_vars, + noise_module='glm_beta', + optimizer=optimizer, + compute_a=compute_a, + compute_b=compute_b, + use_gradient_tape=use_gradient_tape, + dtype=dtype + ) diff --git a/batchglm/train/tf2/glm_beta/processModel.py b/batchglm/train/tf2/glm_beta/processModel.py new file mode 100644 index 00000000..c21811a4 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/processModel.py @@ -0,0 +1,45 @@ +from .external import ProcessModelGLM +import tensorflow as tf +import numpy as np +from .external import pkg_constants + + +class ProcessModel(ProcessModelGLM): + + def param_bounds( + self, + dtype + ): + if isinstance(dtype, tf.DType): + dmax = dtype.max + dtype = dtype.as_numpy_dtype + else: + dtype = np.dtype(dtype) + dmax = np.finfo(dtype).max + dtype = dtype.type + + zero = np.nextafter(0, np.inf, dtype=dtype) + one = np.nextafter(1, -np.inf, dtype=dtype) + + sf = dtype(pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT) + bounds_min = { + "a_var": np.log(zero / (1 - zero)) / sf, + "b_var": np.log(zero) / sf, + "eta_loc": np.log(zero / (1 - zero)) / sf, + "eta_scale": np.log(zero) / sf, + "mean": np.nextafter(0, np.inf, dtype=dtype), + "samplesize": np.nextafter(0, np.inf, dtype=dtype), + "probs": dtype(0), + "log_probs": np.log(zero), + } + bounds_max = { + "a_var": np.log(one / (1 - one)) / sf, + "b_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "eta_loc": np.log(one / (1 - one)) / sf, + "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "mean": one, + "samplesize": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "probs": dtype(1), + "log_probs": dtype(0), + } + return bounds_min, bounds_max diff --git a/batchglm/train/tf2/glm_beta/training_strategies.py b/batchglm/train/tf2/glm_beta/training_strategies.py new file mode 100644 index 00000000..b6db5b22 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/training_strategies.py @@ -0,0 +1,37 @@ +from enum import Enum + +class TrainingStrategies(Enum): + + AUTO = None + DEFAULT = [ + { + "convergence_criteria": "all_converged_ll", + "stopping_criteria": 1e-8, + "use_batching": False, + "optim_algo": "nr_tr", + }, + ] + INEXACT = [ + { + "convergence_criteria": "all_converged_ll", + "stopping_criteria": 1e-6, + "use_batching": False, + "optim_algo": "nr_tr", + }, + ] + EXACT = [ + { + "convergence_criteria": 
"all_converged_ll", + "stopping_criteria": 1e-8, + "use_batching": False, + "optim_algo": "nr_tr", + }, + ] + IRLS = [ + { + "convergence_criteria": "all_converged_ll", + "stopping_criteria": 1e-8, + "use_batching": False, + "optim_algo": "irls_tr", + }, + ] diff --git a/batchglm/train/tf2/glm_beta/vars.py b/batchglm/train/tf2/glm_beta/vars.py new file mode 100644 index 00000000..b1200abc --- /dev/null +++ b/batchglm/train/tf2/glm_beta/vars.py @@ -0,0 +1,8 @@ +from .model import ProcessModel +from .external import ModelVarsGLM + + +class ModelVars(ProcessModel, ModelVarsGLM): + """ + Full class. + """ diff --git a/batchglm/train/tf2/glm_nb/__init__.py b/batchglm/train/tf2/glm_nb/__init__.py new file mode 100644 index 00000000..f8cd6ee7 --- /dev/null +++ b/batchglm/train/tf2/glm_nb/__init__.py @@ -0,0 +1,5 @@ +from .processModel import ProcessModel +from .vars import ModelVars +from .estimator import Estimator + +from .model import NBGLM diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py new file mode 100644 index 00000000..4cabc8ac --- /dev/null +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -0,0 +1,305 @@ +import logging +from typing import Union +import numpy as np + +from .external import InputDataGLM, Model +from .external import closedform_nb_glm_logmu, closedform_nb_glm_logphi +from .model import NBGLM +from .vars import ModelVars +from .processModel import ProcessModel +from .external import Estimator as GLMEstimator +from .training_strategies import TrainingStrategies + +# needed for train_irls_ls_tr +from .optim import IRLS_LS + + +class Estimator(GLMEstimator, ProcessModel): + """ + Estimator for Generalized Linear Models (GLMs) with negative binomial noise. + Uses the natural logarithm as linker function. + """ + model: NBGLM + + def __init__( + self, + input_data: InputDataGLM, + init_a: Union[np.ndarray, str] = "AUTO", + init_b: Union[np.ndarray, str] = "AUTO", + quick_scale: bool = False, + dtype="float32", + ): + """ + Performs initialisation and creates a new estimator. + + :param input_data: InputDataGLM + The input data + :param init_a: (Optional) + Low-level initial values for a. Can be: + + - str: + * "auto": automatically choose best initialization + * "random": initialize with random values + * "standard": initialize intercept with observed mean + * "init_model": initialize with another model (see `ìnit_model` parameter) + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'a' + :param init_b: (Optional) + Low-level initial values for b. Can be: + + - str: + * "auto": automatically choose best initialization + * "random": initialize with random values + * "standard": initialize with zeros + * "init_model": initialize with another model (see `ìnit_model` parameter) + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'b' + :param quick_scale: bool + Whether `scale` will be fitted faster and maybe less accurate. + Useful in scenarios where fitting the exact `scale` is not absolutely necessary. + :param dtype: Precision used in tensorflow. 
+ """ + self.TrainingStrategies = TrainingStrategies + self._train_loc = True + self._train_scale = True + + (init_a, init_b) = self.init_par( + input_data=input_data, + init_a=init_a, + init_b=init_b, + init_model=None + ) + init_a = init_a.astype(dtype) + init_b = init_b.astype(dtype) + if quick_scale: + self._train_scale = False + + self.model_vars = ModelVars( + init_a=init_a, + init_b=init_b, + constraints_loc=input_data.constraints_loc, + constraints_scale=input_data.constraints_scale, + dtype=dtype + ) + + super(Estimator, self).__init__( + input_data=input_data, + dtype=dtype + ) + + def train( + self, + use_batching: bool = True, + batch_size: int = 5000, + optim_algo: str = "adam", + learning_rate: float = 1e-2, + convergence_criteria: str = "step", + stopping_criteria: int = 1000, + autograd: bool = False, + featurewise: bool = True, + benchmark: bool = False, + maxiter: int = 1, + b_update_freq = 1 + ): + if self.model is None: + self.model = NBGLM( + model_vars=self.model_vars, + dtype=self.model_vars.dtype, + compute_a=self._train_loc, + compute_b=self._train_scale, + use_gradient_tape=autograd, + optimizer=optim_algo + ) + else: + self.model.setMethod(optim_algo) + + intercept_scale = len(self.model.model_vars.idx_train_scale) == 1 + optimizer_object = self.get_optimizer_object(optim_algo, learning_rate, intercept_scale) + self.optimizer = optimizer_object + if optim_algo.lower() in ['irls_gd_tr', 'irls_ar_tr', 'irls_tr_gd_tr']: + self.update = self.update_separated + self.maxiter = maxiter + + super(Estimator, self)._train( + noise_model="nb", + is_batched=use_batching, + batch_size=batch_size, + optimizer_object=optimizer_object, + convergence_criteria=convergence_criteria, + stopping_criteria=stopping_criteria, + autograd=autograd, + featurewise=featurewise, + benchmark=benchmark, + optim_algo=optim_algo, + b_update_freq=b_update_freq + ) + + def get_optimizer_object(self, optimizer, learning_rate, intercept_scale): + optim = optimizer.lower() + if optim in ['irls_gd_tr', 'irls_gd', 'irls_ar_tr', 'irls_tr_gd_tr']: + return IRLS_LS( + dtype=self.dtype, + tr_mode=optim.endswith('tr'), + model=self.model, + name=optim, + n_obs=self.input_data.num_observations, + intercept_scale=intercept_scale) + return super().get_optimizer_object(optimizer, learning_rate) + + def update_separated(self, results, batches, batch_features, compute_b): + + self.optimizer.perform_parameter_update( + inputs=[batches, *results], + compute_a=True, + compute_b=False, + batch_features=batch_features, + is_batched=False + ) + if compute_b: + self.optimizer.perform_parameter_update( + inputs=[batches, *results], + compute_a=False, + compute_b=True, + batch_features=batch_features, + is_batched=False, + maxiter=self.maxiter + ) + else: + self.model.model_vars.updated_b = np.zeros_like(self.model.model_vars.updated_b) + + + def get_model_container( + self, + input_data + ): + return Model(input_data=input_data) + + def init_par( + self, + input_data, + init_a, + init_b, + init_model + ): + r""" + standard: + Only initialise intercept and keep other coefficients as zero. 
+ + closed-form: + Initialize with Maximum Likelihood / Maximum of Momentum estimators + + Idea: + $$ + \theta &= f(x) \\ + \Rightarrow f^{-1}(\theta) &= x \\ + &= (D \cdot D^{+}) \cdot x \\ + &= D \cdot (D^{+} \cdot x) \\ + &= D \cdot x' = f^{-1}(\theta) + $$ + """ + + if init_model is None: + groupwise_means = None + init_a_str = None + if isinstance(init_a, str): + init_a_str = init_a.lower() + # Chose option if auto was chosen + if init_a.lower() == "auto": + init_a = "standard" + + if init_a.lower() == "closed_form": + groupwise_means, init_a, rmsd_a = closedform_nb_glm_logmu( + x=input_data.x, + design_loc=input_data.design_loc, + constraints_loc=input_data.constraints_loc, + size_factors=input_data.size_factors, + link_fn=lambda loc: np.log(self.np_clip_param(loc, "loc")) + ) + + # train mu, if the closed-form solution is inaccurate + self._train_loc = not (np.all(rmsd_a == 0) or rmsd_a.size == 0) + + if input_data.size_factors is not None: + if np.any(input_data.size_factors != 1): + self._train_loc = True + + logging.getLogger("batchglm").debug("Using closed-form MLE initialization for mean") + logging.getLogger("batchglm").debug("Should train loc: %s", self._train_loc) + elif init_a.lower() == "standard": + overall_means = np.mean(input_data.x, axis=0) # directly calculate the mean + overall_means = self.np_clip_param(overall_means, "loc") + + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + init_a[0, :] = np.log(overall_means) + self._train_loc = True + + logging.getLogger("batchglm").debug("Using standard initialization for mean") + logging.getLogger("batchglm").debug("Should train loc: %s", self._train_loc) + elif init_a.lower() == "all_zero": + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + self._train_loc = True + + logging.getLogger("batchglm").debug("Using all_zero initialization for mean") + logging.getLogger("batchglm").debug("Should train loc: %s", self._train_loc) + else: + raise ValueError("init_a string %s not recognized" % init_a) + + if isinstance(init_b, str): + if init_b.lower() == "auto": + init_b = "standard" + + if init_b.lower() == "standard": + groupwise_scales, init_b_intercept, rmsd_b = closedform_nb_glm_logphi( + x=input_data.x, + design_scale=input_data.design_scale[:, [0]], + constraints=input_data.constraints_scale[[0], :][:, [0]], + size_factors=input_data.size_factors, + groupwise_means=None, + link_fn=lambda scale: np.log(self.np_clip_param(scale, "scale")) + ) + init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) + init_b[0, :] = init_b_intercept + + logging.getLogger("batchglm").debug("Using standard-form MME initialization for dispersion") + logging.getLogger("batchglm").debug("Should train scale: %s", self._train_scale) + elif init_b.lower() == "closed_form": + dmats_unequal = False + if input_data.design_loc.shape[1] == input_data.design_scale.shape[1]: + if np.any(input_data.design_loc != input_data.design_scale): + dmats_unequal = True + + inits_unequal = False + if init_a_str is not None: + if init_a_str != init_b: + inits_unequal = True + + if inits_unequal or dmats_unequal: + raise ValueError( + "cannot use closed_form init for scale model if scale model differs from loc model" + ) + + groupwise_scales, init_b, rmsd_b = closedform_nb_glm_logphi( + x=input_data.x, + design_scale=input_data.design_scale, + constraints=input_data.constraints_scale, + size_factors=input_data.size_factors, + groupwise_means=groupwise_means, + link_fn=lambda scale: 
np.log(self.np_clip_param(scale, "scale")) + ) + + logging.getLogger("batchglm").debug("Using closed-form MME initialization for dispersion") + logging.getLogger("batchglm").debug("Should train scale: %s", self._train_scale) + elif init_b.lower() == "all_zero": + init_b = np.zeros([input_data.num_scale_params, input_data.x.shape[1]]) + + logging.getLogger("batchglm").debug("Using standard initialization for dispersion") + logging.getLogger("batchglm").debug("Should train scale: %s", self._train_scale) + else: + raise ValueError("init_b string %s not recognized" % init_b) + else: + init_a, init_b = self.get_init_from_model(init_a=init_a, + init_b=init_b, + input_data=input_data, + init_model=init_model) + + return init_a, init_b diff --git a/batchglm/train/tf2/glm_nb/external.py b/batchglm/train/tf2/glm_nb/external.py new file mode 100644 index 00000000..2d6638fa --- /dev/null +++ b/batchglm/train/tf2/glm_nb/external.py @@ -0,0 +1,21 @@ +import batchglm.data as data_utils + +from batchglm.models.glm_nb import _EstimatorGLM, InputDataGLM, Model +from batchglm.models.base_glm.utils import closedform_glm_mean, closedform_glm_scale +from batchglm.models.glm_nb.utils import closedform_nb_glm_logmu, closedform_nb_glm_logphi + +from batchglm.utils.linalg import groupwise_solve_lm +from batchglm import pkg_constants + +from batchglm.train.tf2.base_glm import GLM +from batchglm.train.tf2.base_glm import ProcessModelGLM, ModelVarsGLM + +# import necessary base_glm layers +from batchglm.train.tf2.base_glm import LinearLocGLM, LinearScaleGLM, LinkerLocGLM +from batchglm.train.tf2.base_glm import LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM +from batchglm.train.tf2.base_glm import FIMGLM, JacobianGLM, HessianGLM +from batchglm.train.tf2.base_glm import Estimator + +# these are needed for nb specific irls_ls_tr training +from batchglm.train.tf2.base_glm import IRLS +from batchglm.train.tf2.base_glm import DataGenerator, ConvergenceCalculator diff --git a/batchglm/train/tf2/glm_nb/layers.py b/batchglm/train/tf2/glm_nb/layers.py new file mode 100644 index 00000000..afa67f7e --- /dev/null +++ b/batchglm/train/tf2/glm_nb/layers.py @@ -0,0 +1,59 @@ +import tensorflow as tf +from .processModel import ProcessModel +from .external import LinearLocGLM, LinearScaleGLM, LinkerLocGLM +from .external import LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM + + +class UnpackParams(UnpackParamsGLM, ProcessModel): + """ + Full class. + """ + + +class LinearLoc(LinearLocGLM, ProcessModel): + + def with_size_factors(self, eta_loc, size_factors): + return tf.add(eta_loc, tf.math.log(size_factors)) + + +class LinearScale(LinearScaleGLM, ProcessModel): + """ + Full class. + """ + + +class LinkerLoc(LinkerLocGLM): + + def _inv_linker(self, loc: tf.Tensor): + return tf.exp(loc) + + +class LinkerScale(LinkerScaleGLM): + + def _inv_linker(self, scale: tf.Tensor): + return tf.exp(scale) + + +class Likelihood(LikelihoodGLM, ProcessModel): + + def _ll(self, eta_loc, eta_scale, loc, scale, x): + + # Log-likelihood: + log_r_plus_mu = tf.math.log(scale + loc) + if isinstance(x, tf.SparseTensor): + log_probs_sparse = x.__mul__(eta_loc - log_r_plus_mu) + log_probs_dense = tf.math.lgamma(tf.sparse.add(x, scale)) - \ + tf.math.lgamma(tf.sparse.add(x, tf.ones(shape=x.dense_shape, dtype=self.dtype))) - \ + tf.math.lgamma(scale) + \ + tf.multiply(scale, eta_scale - log_r_plus_mu) + log_probs = tf.sparse.add(log_probs_sparse, log_probs_dense) + # log_probs.set_shape([None, n_features]) # need as shape completely lost. 
+ else: + log_probs = tf.math.lgamma(scale + x) - \ + tf.math.lgamma(x + tf.ones_like(x)) - \ + tf.math.lgamma(scale) + \ + tf.multiply(x, eta_loc - log_r_plus_mu) + \ + tf.multiply(scale, eta_scale - log_r_plus_mu) + + log_probs = self.tf_clip_param(log_probs, "log_probs") + return log_probs diff --git a/batchglm/train/tf2/glm_nb/layers_gradients.py b/batchglm/train/tf2/glm_nb/layers_gradients.py new file mode 100644 index 00000000..8ff079c6 --- /dev/null +++ b/batchglm/train/tf2/glm_nb/layers_gradients.py @@ -0,0 +1,144 @@ +import tensorflow as tf +from .external import FIMGLM, JacobianGLM, HessianGLM + + +class FIM(FIMGLM): + + def _weight_fim_aa( + self, + x, + loc, + scale + ): + const = tf.divide(scale, scale + loc) + w = tf.multiply(loc, const) + + return w + + def _weight_fim_bb( + self, + x, + loc, + scale + ): + return tf.zeros_like(scale) + + +class Jacobian(JacobianGLM): + + def _weights_jac_a( + self, + x, + loc, + scale, + ): + if isinstance(x, tf.SparseTensor): # or isinstance(x, tf.SparseTensorValue): + const = tf.sparse.add(x, tf.negative(loc)) + else: + const = tf.subtract(x, loc) + return tf.divide(tf.multiply(scale, const), tf.add(loc, scale)) + + def _weights_jac_b(self, x, loc, scale): + # Pre-define sub-graphs that are used multiple times: + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + if isinstance(x, tf.SparseTensor): # or isinstance(x, tf.SparseTensorValue): + scale_plus_x = tf.sparse.add(x, scale) + else: + scale_plus_x = scale + x + + r_plus_mu = scale + loc + + # Define graphs for individual terms of constant term of hessian: + const1 = tf.subtract( + tf.math.digamma(x=scale_plus_x), + tf.math.digamma(x=scale) + ) + const2 = tf.negative(scale_plus_x / r_plus_mu) + const3 = tf.add( + tf.math.log(scale), + scalar_one - tf.math.log(r_plus_mu) + ) + const = tf.add_n([const1, const2, const3]) # [observations, features] + const = scale * const + + return const + + +class Hessian(HessianGLM): + + def _weight_hessian_ab(self, x, loc, scale): + + if isinstance(x, tf.SparseTensor): + x_minus_mu = tf.sparse.add(x, -loc) + else: + x_minus_mu = x - loc + + const = tf.multiply( + loc * scale, + tf.divide( + x_minus_mu, + tf.square(loc + scale) + ) + ) + + return const + + def _weight_hessian_aa( + self, + x, + loc, + scale, + ): + if isinstance(x, tf.SparseTensor):# or isinstance(x, tf.SparseTensorValue): + x_by_scale_plus_one = tf.sparse.add(x.__div__(scale), tf.ones_like(scale)) + else: + x_by_scale_plus_one = x / scale + tf.ones_like(scale) + + const = tf.negative(tf.multiply( + loc, + tf.divide( + x_by_scale_plus_one, + tf.square((loc / scale) + tf.ones_like(loc)) + ) + )) + + return const + + def _weight_hessian_bb( + self, + x, + loc, + scale, + ): + if isinstance(x, tf.SparseTensor):# or isinstance(x, tf.SparseTensorValue): + scale_plus_x = tf.sparse.add(x, scale) + else: + scale_plus_x = x + scale + + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + scalar_two = tf.constant(2, shape=(), dtype=self.dtype) + # Pre-define sub-graphs that are used multiple times: + scale_plus_loc = scale + loc + # Define graphs for individual terms of constant term of hessian: + const1 = tf.add( + tf.math.digamma(x=scale_plus_x), + scale * tf.math.polygamma(a=scalar_one, x=scale_plus_x) + ) + const2 = tf.negative(tf.add( + tf.math.digamma(x=scale), + scale * tf.math.polygamma(a=scalar_one, x=scale) + )) + const3 = tf.negative(tf.divide( + tf.add( + loc * scale_plus_x, + scalar_two * scale * scale_plus_loc + ), + tf.square(scale_plus_loc) + )) + const4 = tf.add( + 
tf.math.log(scale), + scalar_two - tf.math.log(scale_plus_loc) + ) + const = tf.add_n([const1, const2, const3, const4]) + const = tf.multiply(scale, const) + return const diff --git a/batchglm/train/tf2/glm_nb/model.py b/batchglm/train/tf2/glm_nb/model.py new file mode 100644 index 00000000..bd2dbe66 --- /dev/null +++ b/batchglm/train/tf2/glm_nb/model.py @@ -0,0 +1,24 @@ +from .external import GLM +from .processModel import ProcessModel + + +class NBGLM(GLM, ProcessModel): + + def __init__( + self, + model_vars, + optimizer: str, + compute_a: bool, + compute_b: bool, + use_gradient_tape: bool, + dtype: str, + ): + super(NBGLM, self).__init__( + model_vars=model_vars, + noise_module='glm_nb', + optimizer=optimizer, + compute_a=compute_a, + compute_b=compute_b, + use_gradient_tape=use_gradient_tape, + dtype=dtype + ) diff --git a/batchglm/train/tf2/glm_nb/optim.py b/batchglm/train/tf2/glm_nb/optim.py new file mode 100644 index 00000000..ebab6441 --- /dev/null +++ b/batchglm/train/tf2/glm_nb/optim.py @@ -0,0 +1,275 @@ +import tensorflow as tf +import numpy as np +from .external import IRLS, pkg_constants + +class IRLS_LS(IRLS): + + def __init__(self, dtype, tr_mode, model, name, n_obs, intercept_scale): + + parent_tr_mode = False + self.tr_mode_b = False + if name.startswith('irls_tr'): + parent_tr_mode = True # for loc model + if name in ['irls_tr_gd_tr', 'irls_gd_tr', 'irls_gd', 'irls_tr_gd']: + self.update_b_func = self.update_b_gd + elif name in ['irls_ar', 'irls_tr_ar']: + assert intercept_scale, "Line search (armijo) is only available" \ + "for scale models with a single coefficient (intercept scale)." + self.update_b_func = self.update_b_ar + else: + assert False, "Unrecognized method for optimization given." + super(IRLS_LS, self).__init__( + dtype=dtype, + tr_mode=parent_tr_mode, + model=model, + name=name, + n_obs=n_obs) + + if tr_mode: + n_features = self.model.model_vars.n_features + self.tr_radius_b = tf.Variable( + np.zeros(shape=[n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT_SCALE, + dtype=self._dtype, trainable=False) + self.tr_mode_b = True + + def _trust_region_linear_cost_gain( + self, + proposed_vector, + neg_jac + ): + pred_cost_gain = tf.reduce_sum(tf.multiply( + proposed_vector, + tf.transpose(neg_jac) + ), axis=0) + return pred_cost_gain + + def perform_parameter_update( + self, + inputs, + compute_a=True, + compute_b=True, + batch_features=False, + is_batched=False, + maxiter=1 + ): + assert compute_a ^ compute_b, \ + "IRLS_LS computes either loc or scale model updates, not both nor none at the same time." 
+ + if compute_a: + super(IRLS_LS, self).perform_parameter_update( + inputs, compute_a, compute_b, batch_features, is_batched) + else: + # global_step = tf.zeros_like(self.model.model_vars.remaining_features) + results = inputs[1:4] + x_batches = inputs[0] + iteration = 0 + not_converged = np.zeros_like(self.model.model_vars.remaining_features) + updated_b = np.zeros_like(self.model.model_vars.updated_b) + while True: + iteration += 1 + step = self.update_b_func([x_batches, *results], batch_features, is_batched) + not_converged = tf.abs(step).numpy() > pkg_constants.XTOL_BY_FEATURE_SCALE + updated_b |= self.model.model_vars.updated_b + if not tf.reduce_any(not_converged) or iteration == maxiter: + break + for i, x_batch in enumerate(inputs[0]): + results = self.model.calc_jacobians(x_batch, concat=False, compute_a=False) if i == 0 else \ + [tf.math.add(results[i], x) for + i, x in enumerate(self.model.calc_jacobians(x_batch, concat=False, compute_a=False))] + self.model.model_vars.updated_b = updated_b + + def gett1t2(self): + t1 = tf.constant(pkg_constants.TRUST_REGIONT_T1_IRLS_GD_TR_SCALE, dtype=self._dtype) + t2 = tf.constant(pkg_constants.TRUST_REGIONT_T2_IRLS_GD_TR_SCALE, dtype=self._dtype) + return t1, t2 + + def _trust_region_update_b( + self, + update_raw, + radius_container, + ): + update_magnitude, update_magnitude_inv = IRLS_LS._calc_update_magnitudes(update_raw) + update_norm = tf.multiply(update_raw, update_magnitude_inv) + + update_magnitude = update_magnitude / self.n_obs * radius_container + + update_scale = tf.minimum( + radius_container, + update_magnitude + ) + proposed_vector = tf.multiply( + update_norm, + update_scale + ) + + return proposed_vector + def update_b_gd(self, inputs, batch_features, is_batched): + + x_batches, log_probs, _, jac_b = inputs + + update_b = tf.transpose(jac_b) + if not self.tr_mode_b: + update = self._pad_updates( + update_raw=update_b, + compute_a=False, + compute_b=True + ) + + update_theta = self._trial_update( + x_batches=x_batches, + log_probs=log_probs, + proposed_vector=update, + is_batched=is_batched, + compute_a=False, + compute_b=True + ) + self.model.params_copy.assign_sub(update) + + return tf.where(update_theta, update, tf.zeros_like(update)) + + else: + if batch_features: + radius_container = tf.boolean_mask( + tensor=self.tr_radius_b, + mask=self.model.model_vars.remaining_features) + else: + radius_container = self.tr_radius_b + + tr_proposed_vector_b = self._trust_region_update_b( + update_raw=update_b, + radius_container=radius_container + ) + + tr_update_b = self._pad_updates( + update_raw=tr_proposed_vector_b, + compute_a=False, + compute_b=True + ) + + # perform update + update_theta = self._trial_update( + x_batches=x_batches, + log_probs=log_probs, + proposed_vector=tr_update_b, + is_batched=is_batched, + compute_a=False, + compute_b=True) + self._trust_region_ops( + proposed_vector=tr_update_b, + compute_a=False, + compute_b=True, + batch_features=batch_features, + update_theta=update_theta) + + return tf.where(update_theta, tr_proposed_vector_b, tf.zeros_like(tr_proposed_vector_b)) + + def update_b_ar(self, inputs, batch_features, is_batched, alpha0=None): + + x_batches, log_probs, _, jac_b = inputs + jac_b = tf.reshape(jac_b, [jac_b.shape[0]]) + direction = -tf.sign(jac_b) + derphi0 = jac_b / self.n_obs + if alpha0 is None: + alpha0 = tf.ones_like(jac_b) * pkg_constants.ALPHA0 + original_params_b_copy = self.model.params_copy[-1] + + def phi(alpha): + multiplier = tf.multiply(alpha, direction) + new_scale_params = 
tf.add(original_params_b_copy, multiplier) + self.model.params_copy[-1].assign(new_scale_params) + new_likelihood = None + for i, x_batch in enumerate(x_batches): + log_likelihood = self.model.calc_ll([*x_batch])[0] + new_likelihood = log_likelihood if i == 0 else \ + tf.math.add(new_likelihood, log_likelihood) + new_likelihood = self._norm_neg_log_likelihood(new_likelihood) + return new_likelihood + + current_likelihood = self._norm_neg_log_likelihood(log_probs) + new_likelihood = phi(alpha0) + beneficial = self.wolfe1(current_likelihood, new_likelihood, alpha0, derphi0) + + if tf.reduce_all(beneficial): # are all beneficial? + updated = beneficial + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(self.model.model_vars.remaining_features) + updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) + self.model.model_vars.updated_b = updated + return tf.multiply(alpha0, direction) + + divisor = new_likelihood - current_likelihood - derphi0 * alpha0 + alpha1 = tf.negative(derphi0) * alpha0**2 / 2 / divisor + alpha1 = tf.where(beneficial, alpha0, alpha1) + new_likelihood2 = phi(alpha1) + beneficial = self.wolfe1(current_likelihood, new_likelihood2, alpha1, derphi0) + if tf.reduce_all(beneficial): + updated = beneficial + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(self.model.model_vars.remaining_features) + updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) + self.model.model_vars.updated_b = updated + return tf.multiply(alpha1, direction) + + if not tf.reduce_any(alpha1 > pkg_constants.XTOL_BY_FEATURE_SCALE): + # catch in case it doesn't enter the loop. + new_scale_params = tf.where(beneficial, self.model.params_copy[-1], original_params_b_copy) + self.model.params_copy[-1].assign(new_scale_params) + self.model.model_vars.updated_b = np.ones_like(self.model.model_vars.updated_b) + return tf.multiply(alpha1, direction) + + while tf.reduce_any(alpha1 > pkg_constants.XTOL_BY_FEATURE_SCALE): + + factor = alpha0**2 * alpha1**2 * (alpha1-alpha0) + a = alpha0**2 * (new_likelihood2 - current_likelihood - derphi0 * alpha1) - \ + alpha1**2 * (new_likelihood - current_likelihood - derphi0 * alpha0) + a = a / factor + + b = -alpha0**3 * (new_likelihood2 - current_likelihood - derphi0 * alpha1) + \ + alpha1**3 * (new_likelihood - current_likelihood - derphi0 * alpha0) + b = b / factor + + alpha2 = (-b + tf.sqrt(tf.abs(tf.square(b) - 3 * a * derphi0))) / (3 * a) + alpha2 = tf.where(beneficial, alpha1, alpha2) + idx_to_clip = tf.logical_or(tf.math.is_nan(alpha2), alpha2 < 0) + alpha2 = tf.where(idx_to_clip, tf.zeros_like(alpha2), alpha2) + new_likelihood3 = phi(alpha2) + beneficial = self.wolfe1(current_likelihood, new_likelihood3, alpha2, derphi0) + + if tf.reduce_all(beneficial): + updated = beneficial + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(self.model.model_vars.remaining_features) + updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) + self.model.model_vars.updated_b = updated + return tf.multiply(alpha2, direction) + + step_diff_greater_half_alpha1 = (alpha1 - alpha2) > alpha1 / 2 + ratio = (1 - alpha2/alpha1) < 0.96 + set_back = tf.logical_or(step_diff_greater_half_alpha1, ratio) + alpha2 = tf.where(set_back, alpha1 / 2, alpha2) + alpha2 = tf.where(tf.logical_or(tf.math.is_nan(alpha2), alpha2 < 0), tf.zeros_like(alpha2), alpha2) + + alpha0 = alpha1 + alpha1 = alpha2 + new_likelihood = new_likelihood2 + new_likelihood2 = 
new_likelihood3 + + new_scale_params = tf.where(beneficial, self.model.params_copy[-1], original_params_b_copy) + self.model.params_copy[-1].assign(new_scale_params) + updated = beneficial + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(self.model.model_vars.remaining_features) + updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) + self.model.model_vars.updated_b = np.ones_like(self.model.model_vars.updated_b) + return tf.multiply(alpha2, direction) + + def wolfe1(self, current_likelihood, new_likelihood, alpha, jacobian): + """Checks if an update satisfies the first wolfe condition by returning the difference + to the previous likelihood.""" + c1 = tf.constant(pkg_constants.WOLFE_C1, dtype=self._dtype) + limit = tf.add(current_likelihood, tf.multiply(tf.multiply(c1, alpha), jacobian)) + return new_likelihood < limit diff --git a/batchglm/train/tf2/glm_nb/processModel.py b/batchglm/train/tf2/glm_nb/processModel.py new file mode 100644 index 00000000..6a177f7f --- /dev/null +++ b/batchglm/train/tf2/glm_nb/processModel.py @@ -0,0 +1,42 @@ +from .external import ProcessModelGLM +import tensorflow as tf +import numpy as np +from .external import pkg_constants + + +class ProcessModel(ProcessModelGLM): + + def param_bounds( + self, + dtype + ): + if isinstance(dtype, tf.DType): + dmax = dtype.max + dtype = dtype.as_numpy_dtype + else: + dtype = np.dtype(dtype) + dmax = np.finfo(dtype).max + dtype = dtype.type + + sf = dtype(pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT) + bounds_min = { + "a_var": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "b_var": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "eta_loc": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "eta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "loc": np.nextafter(0, np.inf, dtype=dtype), + "scale": np.nextafter(0, np.inf, dtype=dtype), + "probs": dtype(0), + "log_probs": np.log(np.nextafter(0, np.inf, dtype=dtype)), + } + bounds_max = { + "a_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "b_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "eta_loc": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "loc": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "scale": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "probs": dtype(1), + "log_probs": dtype(0), + } + return bounds_min, bounds_max diff --git a/batchglm/train/tf2/glm_nb/training_strategies.py b/batchglm/train/tf2/glm_nb/training_strategies.py new file mode 100644 index 00000000..18c35302 --- /dev/null +++ b/batchglm/train/tf2/glm_nb/training_strategies.py @@ -0,0 +1,40 @@ +from enum import Enum + +class TrainingStrategies(Enum): + + AUTO = None + DEFAULT = [ + { + "convergence_criteria": "all_converged", + "use_batching": False, + "optim_algo": "irls_gd_tr", + }, + ] + IRLS = [ + { + "convergence_criteria": "all_converged", + "use_batching": False, + "optim_algo": "irls_tr_gd_tr", + }, + ] + IRLS_BATCHED = [ + { + "convergence_criteria": "all_converged", + "use_batching": True, + "optim_algo": "irls_gd_tr", + }, + ] + ADAM_BATCHED = [ + { + "convergence_criteria": "all_converged", + "use_batching": True, + "optim_algo": "adam", + }, + ] + ADAM = [ + { + "convergence_criteria": "all_converged", + "use_batching": False, + "optim_algo": "adam", + }, + ] diff --git a/batchglm/train/tf2/glm_nb/vars.py b/batchglm/train/tf2/glm_nb/vars.py new file mode 100644 index 00000000..b1200abc --- 
/dev/null
+++ b/batchglm/train/tf2/glm_nb/vars.py
@@ -0,0 +1,8 @@
+from .model import ProcessModel
+from .external import ModelVarsGLM
+
+
+class ModelVars(ProcessModel, ModelVarsGLM):
+    """
+    Full class.
+    """
diff --git a/batchglm/train/tf2/glm_norm/__init__.py b/batchglm/train/tf2/glm_norm/__init__.py
new file mode 100644
index 00000000..b6bf02af
--- /dev/null
+++ b/batchglm/train/tf2/glm_norm/__init__.py
@@ -0,0 +1,5 @@
+from .processModel import ProcessModel
+from .vars import ModelVars
+from .estimator import Estimator
+
+from .model import NormGLM
diff --git a/batchglm/train/tf2/glm_norm/estimator.py b/batchglm/train/tf2/glm_norm/estimator.py
new file mode 100644
index 00000000..b66176da
--- /dev/null
+++ b/batchglm/train/tf2/glm_norm/estimator.py
@@ -0,0 +1,276 @@
+import logging
+import numpy as np
+import scipy.sparse
+from typing import Union
+
+from .external import closedform_norm_glm_logsd
+from .external import InputDataGLM, Model
+from .external import Estimator as GLMEstimator
+from .model import NormGLM
+from .processModel import ProcessModel
+from .vars import ModelVars
+from .training_strategies import TrainingStrategies
+
+
+logger = logging.getLogger("batchglm")
+
+
+class Estimator(GLMEstimator, ProcessModel):
+    """
+    Estimator for Generalized Linear Models (GLMs) with normally distributed noise.
+    Uses the identity function as linker function for loc and a log-linker function for scale.
+    """
+
+    model: NormGLM
+
+    def __init__(
+            self,
+            input_data: InputDataGLM,
+            init_a: Union[np.ndarray, str] = "AUTO",
+            init_b: Union[np.ndarray, str] = "AUTO",
+            quick_scale: bool = False,
+            dtype="float32",
+    ):
+        """
+        Performs initialisation and creates a new estimator.
+
+        :param input_data: InputDataGLM
+            The input data
+        :param init_a: (Optional)
+            Low-level initial values for a. Can be:
+
+            - str:
+                * "auto": automatically choose best initialization
+                * "all_zero": initialize with zeros
+                * "random": initialize with random values
+                * "standard": initialize intercept with observed mean
+                * "init_model": initialize with another model (see `init_model` parameter)
+                * "closed_form": try to initialize with closed form
+            - np.ndarray: direct initialization of 'a'
+        :param init_b: (Optional)
+            Low-level initial values for b. Can be:
+
+            - str:
+                * "auto": automatically choose best initialization
+                * "random": initialize with random values
+                * "standard": initialize intercept with a closed-form estimate, other coefficients with zeros
+                * "init_model": initialize with another model (see `init_model` parameter)
+                * "closed_form": try to initialize with closed form
+            - np.ndarray: direct initialization of 'b'
+        :param quick_scale: bool
+            Whether `scale` will be fitted faster and maybe less accurately.
+            Useful in scenarios where fitting the exact `scale` is not absolutely necessary.
+        :param dtype: Precision used in tensorflow.
+ """ + self.TrainingStrategies = TrainingStrategies + self._train_loc = True + self._train_scale = True + + (init_a, init_b) = self.init_par( + input_data=input_data, + init_a=init_a, + init_b=init_b, + init_model=None + ) + init_a = init_a.astype(dtype) + init_b = init_b.astype(dtype) + if quick_scale: + self._train_scale = False + + self.model_vars = ModelVars( + init_a=init_a, + init_b=init_b, + constraints_loc=input_data.constraints_loc, + constraints_scale=input_data.constraints_scale, + dtype=dtype + ) + + super(Estimator, self).__init__( + input_data=input_data, + dtype=dtype + ) + + def train( + self, + use_batching: bool = True, + batch_size: int = 500, + optim_algo: str = "adam", + learning_rate: float = 1e-2, + convergence_criteria: str = "step", + stopping_criteria: int = 1000, + autograd: bool = False, + featurewise: bool = True, + benchmark: bool = False + ): + if self.model is None: + self.model = NormGLM( + model_vars=self.model_vars, + dtype=self.model_vars.dtype, + compute_a=self._train_loc, + compute_b=self._train_scale, + use_gradient_tape=autograd, + optimizer=optim_algo + ) + else: + self.model.setMethod(optim_algo) + + optimizer_object = self.get_optimizer_object(optim_algo, learning_rate) + + super(Estimator, self)._train( + noise_model="norm", + is_batched=use_batching, + batch_size=batch_size, + optimizer_object=optimizer_object, + convergence_criteria=convergence_criteria, + stopping_criteria=stopping_criteria, + autograd=autograd, + featurewise=featurewise, + benchmark=benchmark, + optim_algo=optim_algo + + ) + + def get_model_container( + self, + input_data + ): + return Model(input_data=input_data) + + def init_par(self, input_data, init_a, init_b, init_model): + r""" + standard: + Only initialise intercept and keep other coefficients as zero. + + closed-form: + Initialize with Maximum Likelihood / Maximum of Momentum estimators + """ + + sf_given = False + if input_data.size_factors is not None: + if np.any(np.abs(input_data.size_factors - 1.) > 1e-8): + sf_given = True + + is_ols_model = input_data.design_scale.shape[1] == 1 and \ + np.all(np.abs(input_data.design_scale - 1.) < 1e-8) and not sf_given + + if init_model is None: + groupwise_means = None + init_a_str = None + if isinstance(init_a, str): + init_a_str = init_a.lower() + # Chose option if auto was chosen + if init_a.lower() == "auto": + init_a = "closed_form" + + if init_a.lower() == "closed_form" or init_a.lower() == "standard": + design_constr = np.matmul(input_data.design_loc, input_data.constraints_loc) + # Iterate over genes if X is sparse to avoid large sparse tensor. + # If X is dense, the least square problem can be vectorised easily. + if isinstance(input_data.x, scipy.sparse.csr_matrix): + init_a, rmsd_a, _, _ = np.linalg.lstsq( + np.matmul(design_constr.T, design_constr), + input_data.x.T.dot(design_constr).T, # need double .T because of dot product on sparse. 
+ rcond=None + ) + else: + init_a, rmsd_a, _, _ = np.linalg.lstsq( + np.matmul(design_constr.T, design_constr), + np.matmul(design_constr.T, input_data.x), + rcond=None + ) + groupwise_means = None + if is_ols_model: + self._train_loc = False + + logger.debug("Using OLS initialization for location model") + elif init_a.lower() == "all_zero": + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + self._train_loc = True + + logger.debug("Using all_zero initialization for mean") + else: + raise ValueError("init_a string %s not recognized" % init_a) + logger.debug("Should train location model: %s", self._train_loc) + + if isinstance(init_b, str): + if init_b.lower() == "auto": + init_b = "standard" + + if is_ols_model: + # Calculated variance via E(x)^2 or directly depending on whether `mu` was specified. + if isinstance(input_data.x, scipy.sparse.csr_matrix): + expect_xsq = np.asarray(np.mean(input_data.x.power(2), axis=0)) + else: + expect_xsq = np.expand_dims(np.mean(np.square(input_data.x), axis=0), axis=0) + mean_model = np.matmul( + np.matmul(input_data.design_loc, input_data.constraints_loc), + init_a + ) + expect_x_sq = np.mean(np.square(mean_model), axis=0) + variance = (expect_xsq - expect_x_sq) + init_b = np.log(np.sqrt(variance)) + self._train_scale = False + + logger.debug("Using residuals from OLS estimate for variance estimate") + elif init_b.lower() == "closed_form": + dmats_unequal = False + if input_data.design_loc.shape[1] == input_data.design_scale.shape[1]: + if np.any(input_data.design_loc != input_data.design_scale): + dmats_unequal = True + + inits_unequal = False + if init_a_str is not None: + if init_a_str != init_b: + inits_unequal = True + + # Watch out: init_mean is full obs x features matrix and is very large in many cases. 
+ if inits_unequal or dmats_unequal: + raise ValueError( + "cannot use closed_form init for scale model \ + if scale model differs from loc model" + ) + + groupwise_scales, init_b, rmsd_b = closedform_norm_glm_logsd( + x=input_data.x, + design_scale=input_data.design_scale, + constraints=input_data.constraints_scale, + size_factors=input_data.size_factors, + groupwise_means=groupwise_means, + link_fn=lambda sd: np.log(self.np_clip_param(sd, "sd")) + ) + + # train scale, if the closed-form solution is inaccurate + self._train_scale = not (np.all(rmsd_b == 0) or rmsd_b.size == 0) + + logger.debug("Using closed-form MME initialization for standard deviation") + elif init_b.lower() == "standard": + groupwise_scales, init_b_intercept, rmsd_b = closedform_norm_glm_logsd( + x=input_data.x, + design_scale=input_data.design_scale[:, [0]], + constraints=input_data.constraints_scale[[0], :][:, [0]], + size_factors=input_data.size_factors, + groupwise_means=None, + link_fn=lambda sd: np.log(self.np_clip_param(sd, "sd")) + ) + init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) + init_b[0, :] = init_b_intercept + + # train scale, if the closed-form solution is inaccurate + self._train_scale = not (np.all(rmsd_b == 0) or rmsd_b.size == 0) + + logger.debug("Using closed-form MME initialization for standard deviation") + logger.debug("Should train sd: %s", self._train_scale) + elif init_b.lower() == "all_zero": + init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) + + logger.debug("Using standard initialization for standard deviation") + else: + raise ValueError("init_b string %s not recognized" % init_b) + logger.debug("Should train sd: %s", self._train_scale) + else: + init_a, init_b = self.get_init_from_model(init_a=init_a, + init_b=init_b, + input_data=input_data, + init_model=init_model) + + return init_a, init_b diff --git a/batchglm/train/tf2/glm_norm/external.py b/batchglm/train/tf2/glm_norm/external.py new file mode 100644 index 00000000..5fc7ab0b --- /dev/null +++ b/batchglm/train/tf2/glm_norm/external.py @@ -0,0 +1,12 @@ +import batchglm.data as data_utils + +from batchglm.models.glm_norm import _EstimatorGLM, InputDataGLM, Model +from batchglm.models.base_glm.utils import closedform_glm_mean, closedform_glm_scale +from batchglm.models.glm_norm.utils import closedform_norm_glm_mean, closedform_norm_glm_logsd + +from batchglm.utils.linalg import groupwise_solve_lm +from batchglm import pkg_constants + +from batchglm.train.tf2.base_glm import ProcessModelGLM, GLM, Estimator, ModelVarsGLM +from batchglm.train.tf2.base_glm import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM +from batchglm.train.tf2.base_glm import FIMGLM, JacobianGLM, HessianGLM diff --git a/batchglm/train/tf2/glm_norm/layers.py b/batchglm/train/tf2/glm_norm/layers.py new file mode 100644 index 00000000..ba067352 --- /dev/null +++ b/batchglm/train/tf2/glm_norm/layers.py @@ -0,0 +1,49 @@ +import tensorflow as tf +import numpy as np +from .external import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM +from .processModel import ProcessModel + + +class UnpackParams(UnpackParamsGLM, ProcessModel): + """ + Full class. 
+ """ + + +class LinearLoc(LinearLocGLM, ProcessModel): + + def with_size_factors(self, eta_loc, size_factors): + return tf.multiply(eta_loc, size_factors) + + +class LinearScale(LinearScaleGLM, ProcessModel): + """ + Full Class + """ + + +class LinkerLoc(LinkerLocGLM): + + def _inv_linker(self, loc: tf.Tensor): + return loc + + +class LinkerScale(LinkerScaleGLM): + + def _inv_linker(self, scale: tf.Tensor): + return tf.math.exp(scale) + + +class Likelihood(LikelihoodGLM, ProcessModel): + + def _ll(self, eta_loc, eta_scale, loc, scale, x, n_features): + + const = tf.constant(-0.5 * np.log(2 * np.pi), shape=(), dtype=self.ll_dtype) + if isinstance(x, tf.SparseTensor): + log_probs = const - eta_scale - 0.5 * tf.math.square(tf.divide(tf.sparse.add(x, - loc), scale)) + # log_probs.set_shape([None, a_var.shape[1]]) # Need this so as shape is completely lost. + else: + log_probs = const - eta_scale - 0.5 * tf.math.square(tf.divide(x - loc, scale)) + log_probs = self.tf_clip_param(log_probs, "log_probs") + + return log_probs diff --git a/batchglm/train/tf2/glm_norm/layers_gradients.py b/batchglm/train/tf2/glm_norm/layers_gradients.py new file mode 100644 index 00000000..e2b35119 --- /dev/null +++ b/batchglm/train/tf2/glm_norm/layers_gradients.py @@ -0,0 +1,116 @@ +import tensorflow as tf +from .external import FIMGLM, JacobianGLM, HessianGLM + + +class FIM(FIMGLM): + + def _weight_fim_aa( + self, + x, + loc, + scale + ): + w = tf.square(tf.divide(tf.ones_like(scale), scale)) + + return w + + def _weight_fim_bb( + self, + x, + loc, + scale + ): + w = tf.constant(2, shape=loc.shape, dtype=self.dtype) + + return w + + +class Jacobian(JacobianGLM): + + def _weights_jac_a( + self, + x, + loc, + scale, + ): + if isinstance(x, tf.SparseTensor): + const1 = tf.sparse.add(x, -loc) + const = tf.divide(const1, tf.square(scale)) + else: + const1 = tf.subtract(x, loc) + const = tf.divide(const1, tf.square(scale)) + return const + + def _weights_jac_b( + self, + x, + loc, + scale, + ): + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + if isinstance(x, tf.SparseTensor): + const = tf.negative(scalar_one) + tf.math.square( + tf.divide(tf.sparse.add(x, -loc), scale) + ) + else: + const = tf.negative(scalar_one) + tf.math.square( + tf.divide(tf.subtract(x, loc), scale) + ) + return const + + +class Hessian(HessianGLM): + + def _weight_hessian_ab( + self, + x, + loc, + scale, + ): + scalar_two = tf.constant(2, shape=(), dtype=self.dtype) + if isinstance(x, tf.SparseTensor): + x_minus_loc = tf.sparse.add(x, -loc) + else: + x_minus_loc = x - loc + + const = - tf.multiply(scalar_two, + tf.divide( + x_minus_loc, + tf.square(scale) + ) + ) + return const + + def _weight_hessian_aa( + self, + x, + loc, + scale, + ): + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + const = - tf.divide(scalar_one, tf.square(scale)) + + return const + + def _weight_hessian_bb( + self, + x, + loc, + scale, + ): + scalar_two = tf.constant(2, shape=(), dtype=self.dtype) + if isinstance(x, tf.SparseTensor): + x_minus_loc = tf.sparse.add(x, -loc) + else: + x_minus_loc = x - loc + + const = - tf.multiply( + scalar_two, + tf.math.square( + tf.divide( + x_minus_loc, + scale + ) + ) + ) + return const diff --git a/batchglm/train/tf2/glm_norm/model.py b/batchglm/train/tf2/glm_norm/model.py new file mode 100644 index 00000000..7bcb41fe --- /dev/null +++ b/batchglm/train/tf2/glm_norm/model.py @@ -0,0 +1,24 @@ +from .external import GLM +from .processModel import ProcessModel + + +class NormGLM(GLM, ProcessModel): + + def __init__( + 
self, + model_vars, + optimizer: str, + compute_a: bool, + compute_b: bool, + use_gradient_tape: bool, + dtype: str, + ): + super(NormGLM, self).__init__( + model_vars=model_vars, + noise_module='glm_norm', + optimizer=optimizer, + compute_a=compute_a, + compute_b=compute_b, + use_gradient_tape=use_gradient_tape, + dtype=dtype + ) diff --git a/batchglm/train/tf2/glm_norm/processModel.py b/batchglm/train/tf2/glm_norm/processModel.py new file mode 100644 index 00000000..629099ff --- /dev/null +++ b/batchglm/train/tf2/glm_norm/processModel.py @@ -0,0 +1,42 @@ +from .external import ProcessModelGLM +import tensorflow as tf +import numpy as np +from .external import pkg_constants + + +class ProcessModel(ProcessModelGLM): + + def param_bounds( + self, + dtype + ): + if isinstance(dtype, tf.DType): + dmax = dtype.max + dtype = dtype.as_numpy_dtype + else: + dtype = np.dtype(dtype) + dmax = np.finfo(dtype).max + dtype = dtype.type + + sf = dtype(pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT) + bounds_min = { + "a_var": np.nextafter(-dmax, np.inf, dtype=dtype) / sf, + "b_var": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "eta_loc": np.nextafter(-dmax, np.inf, dtype=dtype) / sf, + "eta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "mean": np.nextafter(-dmax, np.inf, dtype=dtype) / sf, + "sd": np.nextafter(0, np.inf, dtype=dtype), + "probs": dtype(0), + "log_probs": np.log(np.nextafter(0, np.inf, dtype=dtype)), + } + bounds_max = { + "a_var": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "b_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "eta_loc": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "mean": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "sd": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "probs": dtype(1), + "log_probs": dtype(0), + } + return bounds_min, bounds_max diff --git a/batchglm/train/tf2/glm_norm/training_strategies.py b/batchglm/train/tf2/glm_norm/training_strategies.py new file mode 100644 index 00000000..2ba524a7 --- /dev/null +++ b/batchglm/train/tf2/glm_norm/training_strategies.py @@ -0,0 +1,27 @@ +from enum import Enum + + +class TrainingStrategies(Enum): + + AUTO = None + DEFAULT = [ + { + "convergence_criteria": "all_converged", + "use_batching": False, + "optim_algo": "irls_tr", + }, + ] + IRLS = [ + { + "convergence_criteria": "all_converged", + "use_batching": False, + "optim_algo": "irls_tr", + }, + ] + IRLS_BATCHED = [ + { + "convergence_criteria": "all_converged", + "use_batching": True, + "optim_algo": "irls_tr", + }, + ] diff --git a/batchglm/train/tf2/glm_norm/vars.py b/batchglm/train/tf2/glm_norm/vars.py new file mode 100644 index 00000000..b1200abc --- /dev/null +++ b/batchglm/train/tf2/glm_norm/vars.py @@ -0,0 +1,8 @@ +from .model import ProcessModel +from .external import ModelVarsGLM + + +class ModelVars(ProcessModel, ModelVarsGLM): + """ + Full class. + """ diff --git a/batchglm/train/tf2/ops.py b/batchglm/train/tf2/ops.py new file mode 100644 index 00000000..56fbf48b --- /dev/null +++ b/batchglm/train/tf2/ops.py @@ -0,0 +1,59 @@ +import tensorflow as tf +from typing import Union + + +def swap_dims(tensor, axis0, axis1, exec_transpose=True, return_perm=False, name="swap_dims"): + """ + Swaps two dimensions in a given tensor. 
+
+    :param tensor: The tensor whose axes should be swapped
+    :param axis0: The first axis which should be swapped with `axis1`
+    :param axis1: The second axis which should be swapped with `axis0`
+    :param exec_transpose: Should the transpose operation be applied?
+    :param return_perm: Should the permutation argument for `tf.transpose` be returned?
+        Automatically True if `exec_transpose` is False
+    :param name: The name scope of this op
+    :return: either retval, (retval, permutation) or permutation
+    """
+    with tf.name_scope(name):
+        rank = tf.range(tf.rank(tensor))
+        idx0 = rank[axis0]
+        idx1 = rank[axis1]
+        perm0 = tf.where(tf.equal(rank, idx0), tf.tile(tf.expand_dims(idx1, 0), [tf.size(rank)]), rank)
+        perm1 = tf.where(tf.equal(rank, idx1), tf.tile(tf.expand_dims(idx0, 0), [tf.size(rank)]), perm0)
+
+        if exec_transpose:
+            retval = tf.transpose(tensor, perm1)
+
+            if return_perm:
+                return retval, perm1
+            else:
+                return retval
+        else:
+            return perm1
+
+
+def stacked_lstsq(L, b, rcond=1e-10, name="stacked_lstsq"):
+    r"""
+    Solve `Lx = b` via SVD-based least squares, cutting off small singular values.
+
+    :param L: tensor of shape (..., M, K)
+    :param b: tensor of shape (..., M, N)
+    :param rcond: relative threshold below which singular values are treated as zero
+    :param name: name scope of this op
+    :return: x of shape (..., K, N)
+    """
+    with tf.name_scope(name):
+        # tf.linalg.svd returns (s, u, v) such that L = u @ diag(s) @ adjoint(v).
+        s, u, v = tf.linalg.svd(L, full_matrices=False)
+        s_max = tf.reduce_max(s, axis=-1, keepdims=True)
+        s_min = rcond * s_max
+
+        # Invert the singular values, zeroing out those below the cut-off.
+        inv_s = tf.where(s >= s_min, tf.math.reciprocal(s), tf.zeros_like(s))
+
+        # x = v @ diag(inv_s) @ adjoint(u) @ b
+        x = tf.einsum(
+            '...KM,...MN->...KN',
+            v,
+            tf.einsum('...K,...MK,...MN->...KN', inv_s, tf.math.conj(u), b)
+        )
+
+        return x
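
As a quick sanity check (not part of the patch), the SVD-based helper above can be compared against TensorFlow's built-in batched least-squares solver. The sketch below assumes the corrected `stacked_lstsq` as written here; the shapes, tolerance and random inputs are illustrative only.

import numpy as np
import tensorflow as tf

# Illustrative only: a batch of 4 least-squares problems with M=6 observations,
# K=3 unknowns and N=2 right-hand sides.
L = tf.constant(np.random.rand(4, 6, 3), dtype=tf.float64)
b = tf.constant(np.random.rand(4, 6, 2), dtype=tf.float64)

x_svd = stacked_lstsq(L, b)                # helper defined above, shape (4, 3, 2)
x_ref = tf.linalg.lstsq(L, b, fast=False)  # TensorFlow reference solution

print(np.allclose(x_svd.numpy(), x_ref.numpy(), atol=1e-8))  # expected: True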