diff --git a/batchglm/api/models/tf2/glm_beta.py b/batchglm/api/models/tf2/glm_beta.py index 8b5f563e..67cffbf8 100644 --- a/batchglm/api/models/tf2/glm_beta.py +++ b/batchglm/api/models/tf2/glm_beta.py @@ -1,2 +1,2 @@ -#from batchglm.models.glm_beta import InputDataGLM, Model, Simulator -#from batchglm.train.tf2.glm_beta import Estimator +from batchglm.models.glm_beta import InputDataGLM, Model, Simulator +from batchglm.train.tf2.glm_beta import Estimator diff --git a/batchglm/api/models/tf2/glm_norm.py b/batchglm/api/models/tf2/glm_norm.py index 45fc0453..13e1727d 100644 --- a/batchglm/api/models/tf2/glm_norm.py +++ b/batchglm/api/models/tf2/glm_norm.py @@ -1,2 +1,2 @@ -#from batchglm.models.glm_norm import InputDataGLM, Model, Simulator -#from batchglm.train.tf2.glm_norm import Estimator +from batchglm.models.glm_norm import InputDataGLM, Model, Simulator +from batchglm.train.tf2.glm_norm import Estimator diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index 9afb32bf..450a8dba 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -11,22 +11,34 @@ # Trust region hyper parameters: TRUST_REGION_RADIUS_INIT = 100. +TRUST_REGION_RADIUS_INIT_SCALE = 1. TRUST_REGION_ETA0 = 0. TRUST_REGION_ETA1 = 0.25 TRUST_REGION_ETA2 = 0.25 TRUST_REGION_T1 = 0.5 # Fast collapse to avoid trailing. TRUST_REGION_T2 = 1.5 # Allow expansion if not shrinking. -TRUST_REGION_UPPER_BOUND = 1e5 +TRUST_REGION_UPPER_BOUND = 1e40 -TRUST_REGIONT_T1_IRLS_GD_TR_SCALE = 1 +TRUST_REGIONT_T1_IRLS_GD_TR_SCALE = 0.5 +TRUST_REGIONT_T2_IRLS_GD_TR_SCALE = 1.5 # Convergence hyper-parameters: -LLTOL_BY_FEATURE = 1e-10 +LLTOL_BY_FEATURE = 1e-12 XTOL_BY_FEATURE_LOC = 1e-8 XTOL_BY_FEATURE_SCALE = 1e-6 GTOL_BY_FEATURE_LOC = 1e-8 GTOL_BY_FEATURE_SCALE = 1e-8 +TRTOL_BY_FEATURE_LOC = 1e-8 +TRTOL_BY_FEATURE_SCALE = 1e-6 + +FEATUREWISE_THRESHOLD = 10 # the minimal number of features to converge before next featurewise batch +FEATUREWISE_RECALCULATE = False # if set to True, recalculate the results from the previous train step + +WOLFE_C1 = 1e-3 +WOLFE_C2 = 0.99 +ALPHA0 = 100 + try: import tensorflow as tf diff --git a/batchglm/train/tf2/__init__.py b/batchglm/train/tf2/__init__.py new file mode 100644 index 00000000..9170f2ff --- /dev/null +++ b/batchglm/train/tf2/__init__.py @@ -0,0 +1,3 @@ +from . import glm_nb as nb +from . import glm_norm as norm +from . 
import glm_beta as beta \ No newline at end of file diff --git a/batchglm/train/tf2/base/__init__.py b/batchglm/train/tf2/base/__init__.py new file mode 100644 index 00000000..bc66a13a --- /dev/null +++ b/batchglm/train/tf2/base/__init__.py @@ -0,0 +1,3 @@ +from .estimator import TFEstimator +from .model import ProcessModelBase, ModelBase +from .optim import OptimizerBase diff --git a/batchglm/train/tf2/base/estimator.py b/batchglm/train/tf2/base/estimator.py new file mode 100644 index 00000000..7c103b02 --- /dev/null +++ b/batchglm/train/tf2/base/estimator.py @@ -0,0 +1,28 @@ +from .model import ModelBase +import tensorflow as tf + + +class TFEstimator: + model: ModelBase + + def __init__(self, input_data, dtype): + + self._input_data = input_data + self.dtype = dtype + + def _train( + self, + is_batched: bool, + batch_size: int, + optimizer_object: tf.keras.optimizers.Optimizer, + convergence_criteria: str, + stopping_criteria: int, + autograd: bool, + featurewise: bool, + benchmark: bool, + optimizer: str + ): + pass + + def fetch_fn(self, idx): + pass diff --git a/batchglm/train/tf2/base/external.py b/batchglm/train/tf2/base/external.py new file mode 100644 index 00000000..9133cd4d --- /dev/null +++ b/batchglm/train/tf2/base/external.py @@ -0,0 +1,4 @@ +#from batchglm.models.base import _Estimator_Base +#from batchglm.xarray_sparse import SparseXArrayDataArray, SparseXArrayDataSet +#import batchglm.utils.stats as stat_utils +from batchglm import pkg_constants diff --git a/batchglm/train/tf2/base/model.py b/batchglm/train/tf2/base/model.py new file mode 100644 index 00000000..76eaa6fa --- /dev/null +++ b/batchglm/train/tf2/base/model.py @@ -0,0 +1,47 @@ +import abc +import logging +import tensorflow as tf +import numpy as np + +logger = logging.getLogger(__name__) + + +class ModelBase(tf.keras.Model, metaclass=abc.ABCMeta): + + def __init__(self, dtype): + super(ModelBase, self).__init__(dtype=dtype) + + @abc.abstractmethod + def call(self, inputs, training=False, mask=None): + pass + + +class ProcessModelBase: + + @abc.abstractmethod + def param_bounds(self, dtype): + pass + + def tf_clip_param( + self, + param, + name + ): + bounds_min, bounds_max = self.param_bounds(param.dtype) + return tf.clip_by_value( + param, + bounds_min[name], + bounds_max[name] + ) + + def np_clip_param( + self, + param, + name + ): + bounds_min, bounds_max = self.param_bounds(param.dtype) + return np.clip( + param, + bounds_min[name], + bounds_max[name] + ) diff --git a/batchglm/train/tf2/base/optim.py b/batchglm/train/tf2/base/optim.py new file mode 100644 index 00000000..5fc8d13b --- /dev/null +++ b/batchglm/train/tf2/base/optim.py @@ -0,0 +1,52 @@ +import abc +import logging +import tensorflow as tf + +logger = logging.getLogger("batchglm") + + +class OptimizerBase(tf.keras.optimizers.Optimizer, metaclass=abc.ABCMeta): + + def __init__(self, name): + super(OptimizerBase, self).__init__(name=name) + + @abc.abstractmethod + def _resource_apply_dense(self, grad, handle): + pass + + @abc.abstractmethod + def _resource_apply_sparse(self, grad, handle, apply_state): + pass + + @abc.abstractmethod + def _create_slots(self): + pass + + """ + @property + @abc.abstractmethod + def vars(self): + pass + + @property + @abc.abstractmethod + def gradients(self): + return None + + @property + @abc.abstractmethod + def hessians(self): + pass + + @property + @abc.abstractmethod + def fims(self): + pass + + @abc.abstractmethod + def step(self, learning_rate): + pass + """ + @abc.abstractmethod + def get_config(self): + pass 
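For orientation, the clipping helpers added in batchglm/train/tf2/base/model.py above can be exercised with a minimal sketch like the following; the class name ToyProcessModel, the "loc" key and the bound value of 30 are illustrative assumptions and not part of this change:

    import numpy as np
    import tensorflow as tf
    from batchglm.train.tf2.base import ProcessModelBase

    class ToyProcessModel(ProcessModelBase):
        def param_bounds(self, dtype):
            # param_bounds has to return (bounds_min, bounds_max) dicts keyed by the
            # names later passed to tf_clip_param / np_clip_param; values here are made up.
            np_dtype = np.dtype(tf.as_dtype(dtype).as_numpy_dtype)
            bound = np_dtype.type(30.0)
            return {"loc": -bound}, {"loc": bound}

    toy = ToyProcessModel()
    print(toy.tf_clip_param(tf.constant([-100.0, 0.5, 100.0]), "loc"))            # clipped to [-30, 30]
    print(toy.np_clip_param(np.array([-100.0, 0.5, 100.0], np.float32), "loc"))   # same bounds on the numpy path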
diff --git a/batchglm/train/tf2/base_glm/README.md b/batchglm/train/tf2/base_glm/README.md new file mode 100644 index 00000000..eea79ccc --- /dev/null +++ b/batchglm/train/tf2/base_glm/README.md @@ -0,0 +1,2 @@ +# Classes with GLM specific code. +All noise models that are in the GLM category inherit all of these classes. \ No newline at end of file diff --git a/batchglm/train/tf2/base_glm/__init__.py b/batchglm/train/tf2/base_glm/__init__.py new file mode 100644 index 00000000..699b1bce --- /dev/null +++ b/batchglm/train/tf2/base_glm/__init__.py @@ -0,0 +1,12 @@ +from .processModel import ProcessModelGLM +from .model import GLM + +from .estimator import Estimator +from .vars import ModelVarsGLM +from .layers import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM +from .layers import LikelihoodGLM, UnpackParamsGLM +from .layers_gradients import JacobianGLM, HessianGLM, FIMGLM +from .optim import NR, IRLS + +from .generator import DataGenerator +from .convergence import ConvergenceCalculator diff --git a/batchglm/train/tf2/base_glm/convergence.py b/batchglm/train/tf2/base_glm/convergence.py new file mode 100644 index 00000000..aedd8d64 --- /dev/null +++ b/batchglm/train/tf2/base_glm/convergence.py @@ -0,0 +1,198 @@ +import numpy as np +import tensorflow as tf +from .external import pkg_constants + + +class ConvergenceCalculator: + """Wrapper object containing all necessary methods to calculate convergences based on change + in likelihood, gradient and parameters.""" + + def __init__(self, estimator, last_ll: np.ndarray): + self.estimator = estimator + self.last_params = estimator.model.params_copy.numpy() + self.last_ll = last_ll + self.previous_number_converged = 0 + self.calc_separated = self.estimator.irls_algo and self.estimator._train_scale + + def calculate_convergence(self, results, jac_normalization, optimizer_object, batch_features): + """Calculates convergence based on change in likelihood, gradient and parameters.""" + + features_updated = self.estimator.model.model_vars.updated + converged_a = self.estimator.model.model_vars.converged + not_converged_a = ~ converged_a + if self.calc_separated: + features_updated_b = self.estimator.model.model_vars.updated_b + converged_b = self.estimator.model.model_vars.converged_b + not_converged_b = ~ converged_b + + n_features = self.estimator.input_data.num_features + + ########################################################### + # FIRST PART: Retrieve and manipulate ll, grads and params. + #### + if self.estimator.irls_algo: + grad_numpy = tf.abs(tf.concat((results[1], results[2]), axis=1)) + elif self.estimator.nr_algo: + grad_numpy = tf.abs(results[1]) + else: + grad_numpy = tf.abs(tf.transpose(results[1])) + new_ll = tf.negative(tf.divide(results[0], self.estimator.input_data.num_observations)) + new_params = self.estimator.model.params_copy + + if batch_features: + # map columns of ll to full feature space + remaining_features = self.estimator.model.model_vars.remaining_features + indices = tf.where(remaining_features) + updated_lls = tf.scatter_nd(indices, new_ll, shape=[n_features]) + # fill the added columns with previous ll + new_ll = tf.where(remaining_features, updated_lls, self.last_ll) + + # fill added columns with the gradients from previous runs. + grad_numpy = tf.scatter_nd( + indices, + grad_numpy, + shape=(n_features, self.estimator.model.params.get_shape()[0]) + ) + # TODO: added columns are zero here, does that matter? 
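+                # Note on the TODO above: the rows tf.scatter_nd leaves at zero belong to
+                # features that are already flagged as converged; the epoch_*_converged
+                # masks below are AND-ed with the corresponding not_converged masks, so
+                # these zero gradients cannot mark additional features as converged.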
+ + # map columns of params to full feature space + new_params = tf.transpose( + tf.scatter_nd( + indices, + tf.transpose(new_params), + shape=(self.estimator.model.params.shape[1], self.estimator.model.params.shape[0]) + ) + ) + # TODO: added columns are zero here, does that matter? + + grad_numpy = grad_numpy.numpy() + new_params = new_params.numpy() + new_ll = new_ll.numpy() + + ########################################################### + # SECOND PART: Calculate ll convergence. + #### + + # Get all converged features due to change in ll < LLTOL_BY_FEATURE + # IMPORTANT: we need to ensure they have also been updated, otherwise ll_prev = ll_current! + ll_difference = np.abs(self.last_ll - new_ll) / self.last_ll + # print('ll_diff: ', ll_difference[0]) + # print(self.estimator.model.model_vars.converged[0], self.estimator.model.model_vars.updated[0]) + # print(self.estimator.model.model_vars.converged_b[0], self.estimator.model.model_vars.updated_b[0]) + ll_converged = ll_difference < pkg_constants.LLTOL_BY_FEATURE + + ll_converged_a = ll_converged & features_updated + epoch_ll_converged_a = not_converged_a & ll_converged_a # formerly known as converged_f + + if self.calc_separated: + ll_converged_b = ll_converged & features_updated_b + epoch_ll_converged_b = not_converged_b & ll_converged_b # formerly known as converged_f + + ########################################################### + # THIRD PART: calculate grad convergence. + #### + grad_loc = np.sum(grad_numpy[:, self.estimator.model.model_vars.idx_train_loc], axis=1) + grad_norm_loc = grad_loc / jac_normalization + grad_scale = np.sum(grad_numpy[:, self.estimator.model.model_vars.idx_train_scale], axis=1) + grad_norm_scale = grad_scale / jac_normalization + + grad_norm_loc_converged = grad_norm_loc < pkg_constants.GTOL_BY_FEATURE_LOC + grad_norm_scale_converged = grad_norm_scale < pkg_constants.GTOL_BY_FEATURE_SCALE + if self.calc_separated: + grad_converged_a = grad_norm_loc_converged & features_updated + grad_converged_b = grad_norm_scale_converged & features_updated_b + epoch_grad_converged_b = not_converged_b & grad_converged_b # formerly known as converged_g + + else: + grad_converged_a = grad_norm_loc_converged & grad_norm_scale_converged & features_updated + epoch_grad_converged_a = not_converged_a & grad_converged_a # formerly known as converged_g + # print('grad: ', grad_norm_loc[0], grad_norm_scale[0]) + + ########################################################### + # Fourth PART: calculate parameter step convergence. + #### + x_step_a, x_step_b = self.calc_x_step(self.last_params, new_params) + if self.calc_separated: + x_step_converged_a = x_step_a & features_updated + x_step_converged_b = x_step_b & features_updated_b + epoch_step_converged_b = not_converged_b & x_step_converged_b + + else: + x_step_converged_a = x_step_a & x_step_b & features_updated + epoch_step_converged_a = not_converged_a & x_step_converged_a + # print('x_step: ', x_step_converged_a[0], x_step_converged_b[0]) + + # In case we use irls_tr/irls_gd_tr or nr_tr, we can also utilize the trusted region radius. + # For now it must not be below the threshold for the X step of the loc model. 
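+        # Concretely: once the per-feature trust region radius has collapsed below
+        # TRTOL_BY_FEATURE_LOC (or TRTOL_BY_FEATURE_SCALE for the scale model), no step
+        # of meaningful size can be proposed any more, so such features are folded into
+        # the step-convergence criterion below.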
+ if hasattr(optimizer_object, 'tr_mode') and optimizer_object.tr_mode: + converged_tr = optimizer_object.tr_radius.numpy() < pkg_constants.TRTOL_BY_FEATURE_LOC + epoch_tr_converged = not_converged_a & converged_tr + epoch_step_converged_a |= epoch_tr_converged + if hasattr(optimizer_object, 'tr_mode_b') and optimizer_object.tr_mode_b and self.estimator._train_scale: + converged_tr_b = optimizer_object.tr_radius_b.numpy() < pkg_constants.TRTOL_BY_FEATURE_SCALE + epoch_tr_converged_b = not_converged_b & converged_tr_b + epoch_step_converged_b |= epoch_tr_converged_b + + # print('tr: ', epoch_tr_converged[0], epoch_tr_converged_b[0]) + # print(self.estimator.model.model_vars.converged[0], self.estimator.model.model_vars.updated[0]) + # print(self.estimator.model.model_vars.converged_b[0], self.estimator.model.model_vars.updated_b[0]) + ########################################################### + # FINAL PART: exchange the current with the new containers. + #### + self.previous_number_converged = np.sum(self.estimator.model.model_vars.total_converged) + self.last_params = new_params + self.last_ll = new_ll + converged_a = np.logical_or.reduce((converged_a, epoch_ll_converged_a, epoch_grad_converged_a, epoch_step_converged_a)) + if self.calc_separated: + converged_b = np.logical_or.reduce((converged_b, epoch_ll_converged_b, epoch_grad_converged_b, epoch_step_converged_b)) + self.estimator.model.model_vars.total_converged = converged_a & converged_b + self.estimator.model.model_vars.converged_b = converged_b + epoch_ll_converged_a |= epoch_ll_converged_b + epoch_grad_converged_a |= epoch_grad_converged_b + epoch_step_converged_a |= epoch_step_converged_b + else: + self.estimator.model.model_vars.total_converged = converged_a + self.estimator.model.model_vars.converged = converged_a + # print(self.estimator.model.model_vars.total_converged[0]) + return epoch_ll_converged_a, epoch_grad_converged_a, epoch_step_converged_a + + def calc_x_step(self, prev_params, curr_params): + """Calculates convergence based on the difference in parameters before and + after the update.""" + def get_norm_converged(model: str, prev_params): + if model == 'loc': + idx_train = self.estimator.model.model_vars.idx_train_loc + xtol = pkg_constants.XTOL_BY_FEATURE_LOC + elif model == 'scale': + idx_train = self.estimator.model.model_vars.idx_train_scale + xtol = pkg_constants.XTOL_BY_FEATURE_SCALE + else: + assert False, "Supply either 'loc' or 'scale'!" + x_step = curr_params - prev_params + x_norm = np.sqrt(np.sum(np.square(x_step[idx_train, :]), axis=0)) + # print('x_norm: ', x_norm[0]) + return x_norm < xtol + + # We use a trick here: First we set both the loc and scale convergence to True. + # It is not necessary to use an array with length = number of features, since bitwise + # AND also works with a single boolean. + loc_conv = np.bool_(True) + scale_conv = np.bool_(True) + + # Now we check which models need to be trained. E.g. if you are using quick_scale = True, + # self._train_scale will be False and so the above single True value will be used. + if self.estimator._train_loc: + loc_conv = get_norm_converged('loc', prev_params) + if self.estimator._train_scale: + scale_conv = get_norm_converged('scale', prev_params) + + # Finally, we check that only features updated in this epoch can evaluate to True. + # This is only a problem for 2nd order optims with trusted region mode, since it might + # occur, that a feature isn't updated, so the x_step is zero although not yet converged. 
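+        # (That restriction to updated features is applied by the caller,
+        # calculate_convergence, which ANDs the values returned here with the
+        # respective features_updated masks.)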
+ return loc_conv, scale_conv + + def getLoss(self): + return np.sum(self.last_ll) + + def getPreviousNumberConverged(self): + return self.previous_number_converged diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py new file mode 100644 index 00000000..9db1f1b1 --- /dev/null +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -0,0 +1,402 @@ +import abc +import sys +import logging +import time +import numpy as np +from .generator import DataGenerator +from .convergence import ConvergenceCalculator +import tensorflow as tf +from .model import GLM +from .external import TFEstimator, _EstimatorGLM +from .optim import NR, IRLS +from .external import pkg_constants + +logger = logging.getLogger("batchglm") + + +class Estimator(TFEstimator, _EstimatorGLM, metaclass=abc.ABCMeta): + """ + Estimator for Generalized Linear Models (GLMs). + """ + model: GLM + _train_loc: bool + _train_scale: bool + _initialized: bool = False + noise_model: str + irls_algo: bool = False + nr_algo: bool = False + optimizer = None + + def initialize(self, **kwargs): + self.values = [] + self.times = [] + self.converged = [] + self.lls = [] + self._initialized = True + self.model = None + + def update(self, results, *args): + self.optimizer.apply_gradients([(results[1], self.model.params_copy)]) + self.model.model_vars.updated = ~self.model.model_vars.converged + + def finalize(self, **kwargs): + """ + Evaluate all tensors that need to be exported from session, + save these as class attributes and close session. + Changes .model entry from tf-based EstimatorGraph to numpy based Model instance and + transfers relevant attributes. + """ + a_var, b_var = self.model.unpack_params( + [self.model.params, self.model.model_vars.a_var.get_shape()[0]]) + self.model = self.get_model_container(self.input_data) + self.model._a_var = a_var.numpy() + self.model._b_var = b_var.numpy() + self._loss = tf.reduce_sum( + tf.negative(self._log_likelihood) / self.input_data.num_observations).numpy() + + def __init__( + self, + input_data, + dtype, + ): + TFEstimator.__init__(self=self, input_data=input_data, dtype=dtype) + _EstimatorGLM.__init__(self=self, model=None, input_data=input_data) + + def _train( + self, + noise_model: str, + is_batched: bool = False, + batch_size: int = 5000, + optimizer_object: tf.keras.optimizers.Optimizer = tf.keras.optimizers.Adam(), + convergence_criteria: str = "step", + stopping_criteria: int = 1000, + autograd: bool = False, + featurewise: bool = True, + benchmark: bool = False, + optim_algo: str = "adam", + b_update_freq = 1 + ): + # define some useful shortcuts here + n_obs = self.input_data.num_observations + n_features = self.input_data.num_features + # set necessary attributes + self.noise_model = noise_model + optim = optim_algo.lower() + self.irls_algo = optim.startswith('irls') + self.nr_algo = optim in ['nr', 'nr_tr'] + + ################################################ + # INIT Step 1: Consistency Checks + #### + assert not is_batched, "The TF2 backend does not yet support updates on individual" \ + "batches. Use full data updates instead." + assert convergence_criteria in ["step", "all_converged"], \ + ("Unrecognized convergence criteria %s", convergence_criteria) + + if not self._initialized: + raise RuntimeError("Cannot train the model: Estimator not initialized. 
\ + Did you forget to call estimator.initialize() ?") + + if b_update_freq == 0: + b_update_freq = 1 + + if autograd and optim_algo.lower() in ['nr', 'nr_tr']: + logger.warning( + "Automatic differentiation is currently not supported for hessians. Falling back \ + to closed form. Only Jacobians are calculated using autograd.") + + if featurewise and not (self.irls_algo or self.nr_algo): + featurewise = False + logger.warning("WARNING: 'Featurewise batching' is only available for 2nd order " + "optimizers IRLS and NR. Fallback to full featurespace fitting.") + if batch_size > n_obs: + batch_size = n_obs + + ################################################ + # INIT Step 2: Initialise training loop. + # + + # create a tensorflow dataset using the DataGenerator + datagenerator = DataGenerator(self, noise_model, is_batched, batch_size) + epoch_set = datagenerator.new_epoch_set() + + # first model call to initialise prior to first update. + epochs_until_b_update = b_update_freq - 1 + compute_b = epochs_until_b_update == 0 + for i, x_batch in enumerate(epoch_set): + if i == 0: + results = self.model(x_batch, compute_b=compute_b) + else: + results = [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch, compute_b=compute_b))] + + # create ConvergenceCalculator to check for new convergences. + conv_calc = ConvergenceCalculator(self, tf.negative(tf.divide(results[0], n_obs)).numpy()) + + # termination decision for training loop + def convergence_decision(num_converged, train_step): + not_done_fitting = num_converged < n_features + if convergence_criteria == "step": + not_done_fitting &= train_step < stopping_criteria + return not_done_fitting + + # condition variables needed during while loop + batch_features = False + train_step = 0 + num_converged = 0 + num_converged_prev = 0 + need_new_epoch_set = False + n_conv_last_featurewise_batch = 0 + + ################################################ + # Training Loop: Model Fitting happens here. + #### + + while convergence_decision(num_converged, train_step): + + if benchmark: + t0_epoch = time.time() + + ############################################ + # 1. recalculate, only done if featurewise + if need_new_epoch_set: + need_new_epoch_set = False + # this is executed only if a new feature converged in the last train step and + # using featurewise. + epoch_set = datagenerator.new_epoch_set(batch_features=batch_features) + if pkg_constants.FEATUREWISE_RECALCULATE: + for i, x_batch in enumerate(epoch_set): + results = self.model(x_batch, compute_b=compute_b) if i == 0 else \ + [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch, compute_b=compute_b))] + + ############################################ + # 2. Update the parameters + self.update(results, epoch_set, batch_features, epochs_until_b_update == 0) + ############################################ + # 3. calculate new ll, jacs, hessian/fim + compute_b = epochs_until_b_update < 2 + for i, x_batch in enumerate(epoch_set): + # need new params_copy in model in case we use featurewise without recalculation + results = self.model(x_batch, compute_b=compute_b) if i == 0 \ + else [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch, compute_b=compute_b))] + + ############################################ + # 4. 
check for any new convergences + convergences = conv_calc.calculate_convergence( + results=results, + jac_normalization=batch_size if is_batched else n_obs, + optimizer_object=optimizer_object, + batch_features=batch_features + ) + + num_converged = np.sum(self.model.model_vars.total_converged) + loss = conv_calc.getLoss() + if self.irls_algo and self._train_scale: + num_updated = np.sum( + np.logical_or(self.model.model_vars.updated, self.model.model_vars.updated_b)) + else: + num_updated = np.sum(self.model.model_vars.updated) + log_output = f"Step: {train_step} loss: {loss}, "\ + f"converged {num_converged}, updated {num_updated}" + num_converged_prev = conv_calc.getPreviousNumberConverged() + + ############################################ + # 5. report any new convergences + if num_converged == num_converged_prev: + logger.warning(log_output) + else: + if featurewise: + if not batch_features: + batch_features = True + self.model.batch_features = batch_features + conv_diff = num_converged - n_conv_last_featurewise_batch + if pkg_constants.FEATUREWISE_THRESHOLD < 1: + conv_diff /= n_features-n_conv_last_featurewise_batch + # Update params if number of new convergences since last + # featurewise batch is reached again. + if conv_diff >= pkg_constants.FEATUREWISE_THRESHOLD: + need_new_epoch_set = True + n_conv_last_featurewise_batch = num_converged + self.model.apply_featurewise_updates(conv_calc.last_params) + if not pkg_constants.FEATUREWISE_RECALCULATE: + results = self.mask_unconverged(results) + self.model.model_vars.remaining_features = \ + ~self.model.model_vars.total_converged + self.model.featurewise_batch() + + sums = [np.sum(convergence_vals) for convergence_vals in convergences] + log_output = f"{log_output} logs: {sums[0]} grad: {sums[1]}, "\ + f"x_step: {sums[2]}" + logger.warning(log_output) + + train_step += 1 + epochs_until_b_update = (epochs_until_b_update + b_update_freq - 1) % b_update_freq + + # make sure loc is not updated any longer if completely converged + if b_update_freq > 1 and epochs_until_b_update > 1: + if np.all(self.model.model_vars.converged): + epochs_until_b_update = 1 # must not be 0: scale grads not yet calculated + b_update_freq = 1 # from now on, calc scale grads in each step + + # store some useful stuff for benchmarking purposes. + if benchmark: + t1_epoch = time.time() + self.times.append(t1_epoch-t0_epoch) + self.converged.append(num_converged) + self.values.append(self.model.trainable_variables[0].numpy().copy()) + self.lls.append(conv_calc.last_ll) + + ################################################ + # Final model call on the full feature space. + #### + logger.warning("Final Evaluation run.") + if batch_features: + # need to update `model.params` if conv_diff wasn't reached in last train step + # as updates since the last featurewise batch are not yet applied in that case. + if np.any(self.model.model_vars.remaining_features): + self.model.apply_featurewise_updates(conv_calc.last_params) + # now make sure we use the full feature space for the last update + self.model.model_vars.remaining_features = np.ones(n_features, dtype=np.bool) + self.model.featurewise_batch() + + batch_features = False # reset in case train is run repeatedly + # change to hessian mode since we still use hessian instead of FIM for self._fisher_inv + self.model.setMethod('nr_tr') # TODO: maybe stay with irls to compute fim in the future + self.model.hessian.compute_b = True # since self._train_scale could be False. 
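+        # The final pass below accumulates log-likelihood, jacobian and hessian over all
+        # observations on the full feature space; the negative hessian is then inverted
+        # feature-wise (where numerically possible) to populate self._fisher_inv.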
+ + # need new set here with full feature space + # TODO: only needed if batch_features, maybe put this in the above if switch later + final_set = datagenerator.new_epoch_set() + for i, x_batch in enumerate(final_set): + results = self.model(x_batch) if i == 0 else \ + [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch))] + + # store all the final results in this estimator instance. + self._log_likelihood = results[0].numpy() + self._jacobian = tf.reduce_sum(tf.abs(results[1] / n_obs), axis=1) + self._hessian = - results[2].numpy() + + fisher_inv = np.zeros_like(self._hessian) + invertible = np.where(np.linalg.cond(self._hessian, p=None) < 1 / sys.float_info.epsilon)[0] + num_non_invertible = n_features - len(invertible) + if num_non_invertible > 0: + logger.warning(f"fisher_inv could not be calculated for {num_non_invertible} features!") + fisher_inv[invertible] = np.linalg.inv(-self._hessian[invertible]) + self._fisher_inv = fisher_inv.copy() + self.model.hessian.compute_b = self.model.compute_b # reset if not self._train_scale + + def update_params(self, batches, results, batch_features, update_func): + """Wrapper method to conduct updates based on different optimizers/conditions.""" + if self.irls_algo or self.nr_algo: + if self.irls_algo: + # separate loc and scale update if using IRLS. + update_func( + inputs=[batches, *results], + compute_a=True, + compute_b=False, + batch_features=batch_features, + is_batched=False + ) + if self._train_scale: + update_func( + inputs=[batches, *results], + compute_a=False, + compute_b=True, + batch_features=batch_features, + is_batched=False + ) + else: + update_func( + inputs=[batches, *results], + batch_features=batch_features, + is_batched=False + ) + else: + update_var = results[1] + update_func([(update_var, self.model.params_copy)]) + self.model.model_vars.updated = ~self.model.model_vars.converged + + def mask_unconverged(self, results): + + # the idx from unconverged features, thus features reamining in the curent results + idx = np.where(self.model.model_vars.remaining_features)[0] + # the new remaining_features in reduced feature space + mask = ~(self.model.model_vars.total_converged[idx]) + + ll = tf.boolean_mask(results[0], mask) + if self.irls_algo: + jac_a = tf.boolean_mask(results[1], mask) + jac_b = tf.boolean_mask(results[2], mask) + fim_a = tf.boolean_mask(results[3], mask) + fim_b = tf.boolean_mask(results[4], mask) + return ll, jac_a, jac_b, fim_a, fim_b + elif self.nr_algo: + jac = tf.boolean_mask(results[1], mask) + hessian = tf.boolean_mask(results[2], mask) + return ll, jac, hessian + else: + jac = tf.boolean_mask(results[1], mask, axis=1) + return ll, jac + + def get_optimizer_object(self, optimizer: str, learning_rate): + """ Creates an optimizer object based on the given optimizer string.""" + optimizer = optimizer.lower() + if optimizer == "gd": + optim_obj = tf.keras.optimizers.SGD(learning_rate=learning_rate) + elif optimizer == "adam": + optim_obj = tf.keras.optimizers.Adam(learning_rate=learning_rate) + elif optimizer == "adagrad": + optim_obj = tf.keras.optimizers.Adagrad(learning_rate=learning_rate) + elif optimizer == "rmsprop": + optim_obj = tf.keras.optimizers.RMSprop(learning_rate=learning_rate) + else: + tr_mode = optimizer.endswith('tr') + init_dict = { + "dtype": self.dtype, + "model": self.model, + "name": optimizer, + "tr_mode": tr_mode, + "n_obs": self.input_data.num_observations + } + if optimizer.startswith('irls'): + optim_obj = IRLS(**init_dict) + elif optimizer.startswith('nr'): + 
optim_obj = NR(**init_dict) + else: + optim_obj = tf.keras.optimizers.Adam(learning_rate=learning_rate) + logger.warning("No valid optimizer given. Default optimizer Adam chosen.") + + return optim_obj + + @staticmethod + def get_init_from_model(init_a, init_b, input_data, init_model): + # Locations model: + if isinstance(init_a, str) and (init_a.lower() == "auto" or init_a.lower() == "init_model"): + my_loc_names = set(input_data.loc_names) + my_loc_names = my_loc_names.intersection(set(init_model.input_data.loc_names)) + + init_loc = np.zeros([input_data.num_loc_params, input_data.num_features]) + for parm in my_loc_names: + init_idx = np.where(init_model.input_data.loc_names == parm)[0] + my_idx = np.where(input_data.loc_names == parm)[0] + init_loc[my_idx] = init_model.a_var[init_idx] + + init_a = init_loc + + # Scale model: + if isinstance(init_b, str) and (init_b.lower() == "auto" or init_b.lower() == "init_model"): + my_scale_names = set(input_data.scale_names) + my_scale_names = my_scale_names.intersection(init_model.input_data.scale_names) + + init_scale = np.zeros([input_data.num_scale_params, input_data.num_features]) + for parm in my_scale_names: + init_idx = np.where(init_model.input_data.scale_names == parm)[0] + my_idx = np.where(input_data.scale_names == parm)[0] + init_scale[my_idx] = init_model.b_var[init_idx] + + init_b = init_scale + + return init_a, init_b + + @abc.abstractmethod + def get_model_container(self, input_data): + pass diff --git a/batchglm/train/tf2/base_glm/external.py b/batchglm/train/tf2/base_glm/external.py new file mode 100644 index 00000000..db1c8e0e --- /dev/null +++ b/batchglm/train/tf2/base_glm/external.py @@ -0,0 +1,9 @@ +from batchglm.train.tf2.base import ProcessModelBase, ModelBase, TFEstimator +from batchglm.train.tf2.base import OptimizerBase +#from batchglm.train.tf2.glm_nb import NR, IRLS + +from batchglm.models.base_glm import InputDataGLM, _ModelGLM, _EstimatorGLM + +#import batchglm.train.tf.ops as op_utils +from batchglm.utils.linalg import groupwise_solve_lm +from batchglm import pkg_constants diff --git a/batchglm/train/tf2/base_glm/generator.py b/batchglm/train/tf2/base_glm/generator.py new file mode 100644 index 00000000..1d061fcd --- /dev/null +++ b/batchglm/train/tf2/base_glm/generator.py @@ -0,0 +1,103 @@ +import numpy as np +from scipy.sparse import csr_matrix +import tensorflow as tf +import time + +class DataGenerator: + """Wrapper Object to generate an iterable TensorFlow Dataset from given input data.""" + + def __init__( + self, + estimator, + noise_model: str, + is_batched_model: bool, + batch_size: int + ): + self.estimator = estimator + self.noise_model = noise_model + self.is_batched_model = is_batched_model + self.batch_size = batch_size + self.sparse = isinstance(estimator.input_data.x, csr_matrix) + self.n_obs = estimator.input_data.num_observations + # integer ceil division with arithmetic trick: ceil(a/b)=(a+b-1)//b + # We need this for cases where n_obs mod batch_size != 0 + self.num_batches = (self.n_obs + batch_size - 1) // batch_size + dtp = estimator.dtype + output_types = ((tf.int64, dtp, tf.int64), *(dtp,) * 3) if self.sparse else (dtp,) * 4 + self.dataset = tf.data.Dataset.from_generator( + generator=self._generate, output_types=output_types) + + def _generate(self): + """ + Generates `(counts, design_loc, design_scale, size_factors)` tuples of `self.input_data`. + The number of observations in each such data batch is given by `self.batch size`. 
+ If `self.is_batched_model`, the method uses a random permutation of `input_data` each time + it is called. + """ + input_data = self.estimator.input_data + fetch_size_factors = input_data.size_factors is not None \ + and self.noise_model in ["nb", "norm"] + obs_pool = np.random.permutation(self.n_obs) \ + if self.is_batched_model else np.arange(self.n_obs) + for start_id in range(0, self.n_obs, self.batch_size): + # numpy ignores ids > len(obs_pool) so no out of bounds check needed here: + idx = obs_pool[start_id: start_id + self.batch_size] + counts = input_data.fetch_x_sparse(idx) if self.sparse \ + else input_data.fetch_x_dense(idx) + dloc = input_data.fetch_design_loc(idx) + dscale = input_data.fetch_design_scale(idx) + size_factors = input_data.fetch_size_factors(idx) if fetch_size_factors else 1 + yield counts, dloc, dscale, size_factors + + def _featurewise_batch_sparse(self, ivs_tuple, dloc, dscale, size_factors): + + not_converged = self.estimator.model.model_vars.remaining_features + + not_converged_numeric = tf.cast(not_converged, dtype=tf.int64) + col_idx_map = tf.cumsum(not_converged_numeric, exclusive=True) + ivs_tuple[0].set_shape([None, 2]) + mask = tf.gather(not_converged, ivs_tuple[0][:, 1]) + new_indices = tf.gather_nd(ivs_tuple[0], tf.where(mask)) + row_idx, col_idx = tf.split(new_indices, num_or_size_splits=2, axis=1, num=2) + + new_indices = tf.concat([row_idx, tf.gather(col_idx_map, col_idx)], 1) + new_values = tf.boolean_mask(ivs_tuple[1], mask) + n_features = col_idx_map[-1] + if not_converged[-1]: + n_features += 1 + + x_tensor = (new_indices, new_values, (ivs_tuple[2][0], n_features)) + return x_tensor, dloc, dscale, size_factors + + def _featurewise_batch(self, x_tensor, dloc, dscale, size_factors): + """Takes an element of a dataset, performs featurewise batching + and returns the reduced element.""" + + not_converged = self.estimator.model.model_vars.remaining_features + if self.sparse: + feature_columns = tf.sparse.split( + x_tensor, + num_split=self.estimator.model_vars.n_features, + axis=1) + not_converged_idx = np.where(not_converged)[0] + feature_columns = [feature_columns[i] for i in not_converged_idx] + x_tensor = tf.sparse.concat(axis=1, sp_inputs=feature_columns) + else: + x_tensor = tf.boolean_mask(tensor=x_tensor, mask=not_converged, axis=1) + return x_tensor, dloc, dscale, size_factors + + def new_epoch_set(self, batch_features: bool = False): + """Returns an iterable TensorFlow Dataset of the input data.""" + dataset_to_return = self.dataset.take(self.num_batches) + + if self.sparse: + dataset_to_return = dataset_to_return.map( + lambda ivs_tuple, loc, scale, sf: (tf.SparseTensor(*ivs_tuple), loc, scale, sf) + ) + if batch_features: + dataset_to_return = dataset_to_return.map(self._featurewise_batch) + else: + if batch_features: + dataset_to_return = dataset_to_return.map(self._featurewise_batch) + + return dataset_to_return.cache().prefetch(1) diff --git a/batchglm/train/tf2/base_glm/layers.py b/batchglm/train/tf2/base_glm/layers.py new file mode 100644 index 00000000..5e66ab10 --- /dev/null +++ b/batchglm/train/tf2/base_glm/layers.py @@ -0,0 +1,264 @@ +from typing import Union + +import abc +import tensorflow as tf +from .processModel import ProcessModelGLM + + +class UnpackParamsGLM(tf.keras.layers.Layer, ProcessModelGLM): + + """ + Layer that slices the parameter tensor into mean and variance block. 
+ """ + + def __init__(self, dtype): + super(UnpackParamsGLM, self).__init__(dtype=dtype) + + def call(self, inputs, **kwargs): + """ + :param inputs: tuple (params, border) + Must contain the parameter matrix (params) and the first index + of the variance block within the parameters matrix (border) + + :return tf.Tensor, tf.Tensor + The two returned tensor correspond to the mean and variance block + of the parameter matrix. + """ + params, border = inputs + a_var = params[0:border] # loc obs + b_var = params[border:] # scale obs + a_var = self.tf_clip_param(a_var, "a_var") + b_var = self.tf_clip_param(b_var, "b_var") + return a_var, b_var + + +class LinearLocGLM(tf.keras.layers.Layer, ProcessModelGLM): + + """ + Computes the dot product between the design matrix of the mean model and the mean block of the parameter matrix. + """ + + def __init__(self, dtype): + super(LinearLocGLM, self).__init__(dtype=dtype) + + def _eta_loc( + self, + a_var: tf.Tensor, + design_loc: tf.Tensor, + constraints_loc: Union[tf.Tensor, None] = None, + size_factors: Union[tf.Tensor, None] = None + ): + """ + Does the actual computation of eta_loc. + + :param a_var: tf.Tensor + the mean block of the parameter matrix + :param design_loc: tf.Tensor + the design matrix of the mean model + :param contraints_loc: tf.Tensor, optional + ??? # TODO + :param size_factors: tf.Tensor, optional + ??? # TODO + + :return tf.Tensor + the mean values for each individual distribution, encoded in linker space. + """ + if constraints_loc is not None: + eta_loc = tf.matmul( + design_loc, + tf.matmul(constraints_loc, a_var) + ) + else: + eta_loc = tf.matmul(design_loc, a_var) + + if size_factors is not None and size_factors.shape != (1, 1): + eta_loc = self.with_size_factors(eta_loc, size_factors) + + eta_loc = self.tf_clip_param(eta_loc, "eta_loc") + + return eta_loc + + @abc.abstractmethod + def with_size_factors(self, eta_loc, size_factors): + """ + Calculates eta_loc with size_factors. Is noise model specific and needs to be implemented in the inheriting + layer. + :param eta_loc: tf.Tensor + the mean values for each individual distribution, encoded in linker space + """ + + def call(self, inputs, **kwargs): + """ + Calculates the eta_loc tensor, containing the mean values for each individual distribution, + encoded in linker space. + + :param input: tuple + Must contain a_var, design_loc, constraints_loc and size_factors in this order, where + contraints_loc and size_factor can be None. + + :return tf.Tensor + the mean values for each individual distribution, encoded in linker space. + """ + return self._eta_loc(*inputs) + + +class LinearScaleGLM(tf.keras.layers.Layer, ProcessModelGLM): + + """ + Computes the dot product between the design matrix of the variance model + and the variance block of the parameter matrix. + """ + + def __init__(self, dtype): + super(LinearScaleGLM, self).__init__(dtype=dtype) + + def _eta_scale( + self, + b_var: tf.Tensor, + design_scale: tf.Tensor, + constraints_scale: Union[tf.Tensor, None] = None + ): + """ + Does the actual computation of eta_scale. + + :param b_var: tf.Tensor + the variance block of the parameter matrix + :param design_scale: tf.Tensor + the design matrix of the mean model + :param contraints_scale: tf.Tensor, optional + ??? # TODO + + :return tf.Tensor + the variance values for each individual distribution, encoded in linker space. 
+ """ + if constraints_scale is not None: + eta_scale = tf.matmul( + design_scale, + tf.matmul(constraints_scale, b_var) + ) + else: + eta_scale = tf.matmul(design_scale, b_var) + + eta_scale = self.tf_clip_param(eta_scale, "eta_scale") + + return eta_scale + + def call(self, inputs, **kwargs): + """ + Calculates the eta_scale tensor, containing the variance values for each individual distribution, + encoded in linker space. + + :param input: tuple + Must contain b_var, design_scale and constraints_loc in this order, where + contraints_loc can be None. + + :return tf.Tensor + the variance values for each individual distribution, encoded in linker space. + """ + return self._eta_scale(*inputs) + + +class LinkerLocGLM(tf.keras.layers.Layer): + + """ + Translation from linker to data space for the mean model. + """ + + def __init__(self, dtype): + super(LinkerLocGLM, self).__init__(dtype=dtype) + + @abc.abstractmethod + def _inv_linker(self, loc: tf.Tensor): + """ + Translates the given mean values from linker to data space. Depends on the given noise model and needs to + be implemented in the inheriting layer. + + :param loc: tf. Tensor + the mean values for each individual distribution, encoded in linker space. + + :return tf.Tensor + the mean values for each individual distribution, encoded in data space. + """ + + def call(self, eta_loc: tf.Tensor, **kwargs): + """ + Calls the distribution specific linker function to translate from linker to data space. + + :param eta_loc: tf.Tensor + the mean values for each individual distribution, encoded in linker space. + + :return tf.Tensor + the mean values for each individual distribution, encoded in data space. + """ + loc = self._inv_linker(eta_loc) + return loc + + +class LinkerScaleGLM(tf.keras.layers.Layer): + + """ + Translation from linker to data space for the variance model. + """ + + def __init__(self, dtype): + super(LinkerScaleGLM, self).__init__(dtype=dtype) + + @abc.abstractmethod + def _inv_linker(self, scale: tf.Tensor): + pass + + def call(self, eta_scale: tf.Tensor, **kwargs): + """ + Calls the distribution specific linker function to translate from linker to data space. + + :param eta_scale: tf.Tensor + the variance values for each individual distribution, encoded in linker space. + + :return tf.Tensor + the variance values for each individual distribution, encoded in data space. + """ + scale = self._inv_linker(eta_scale) + return scale + + +class LikelihoodGLM(tf.keras.layers.Layer, ProcessModelGLM): + + """ + Contains the computation of the distribution specific log-likelihood function + """ + + def __init__(self, dtype): + super(LikelihoodGLM, self).__init__(dtype=dtype) + + @abc.abstractmethod + def _ll(self, eta_loc, eta_scale, loc, scale, x): + """ + Does the actual likelihood calculation. Depends on the given noise model and needs to be implemented in the + inheriting layer. + + :param eta_loc: tf.Tensor + the mean values for each individual distribution, encoded in linker space. + :param eta_scale: tf.Tensor + the variance values for each individual distribution, encoded in linker space. + :param loc: tf.Tensor + the mean values for each individual distribution, encoded in data space. + :param scale: tf.Tensor + the variance values for each individual distribution, encoded in data space. + :param x: tf.Tensor + the input data + + :return tf.Tensor + the log-likelihoods of each individual data point. + """ + + def call(self, inputs, **kwargs): + """ + Calls the distribution specific log-likelihood function. 
+ + :param inputs: tuple + Must contain eta_loc, eta_scale, loc, scale, x, n_features in this order. + + :return tf.Tensor + the log-likelihoods of each individual data point. + """ + return self._ll(*inputs) diff --git a/batchglm/train/tf2/base_glm/layers_gradients.py b/batchglm/train/tf2/base_glm/layers_gradients.py new file mode 100644 index 00000000..a807e51f --- /dev/null +++ b/batchglm/train/tf2/base_glm/layers_gradients.py @@ -0,0 +1,447 @@ +import abc +import tensorflow as tf + + +class Gradient(tf.keras.layers.Layer): + + """Superclass for Jacobians, Hessian, FIM""" + + def __init__(self, model_vars, dtype): + super(Gradient, self).__init__(dtype=dtype) + self.model_vars = model_vars + + @abc.abstractmethod + def call(self, inputs, **kwargs): + pass + + @staticmethod + def calc_design_mat(design_mat, constraints): + if constraints is not None: + xh = tf.matmul(design_mat, constraints) + else: + xh = design_mat + return xh + + # Here, we use the einsum to efficiently perform the two outer products and the marginalisation. + @staticmethod + def create_specific_block(w, xh_loc, xh_scale): + return tf.einsum('ofc,od->fcd', tf.einsum('of,oc->ofc', w, xh_loc), xh_scale) + + +class FIMGLM(Gradient): + """ + Compute expected fisher information matrix (FIM) + for iteratively re-weighted least squares (IWLS or IRLS) parameter updates for GLMs. + """ + + def call(self, inputs, **kwargs): + return self._fim_analytic(*inputs) + + def _fim_analytic(self, x, design_loc, design_scale, loc, scale, concat=False, compute_a=True, compute_b=True) -> tf.Tensor: + """ + Compute the closed-form of the base_glm_all model fim + by evalutating its terms grouped by observations. + """ + + def _a_byobs(): + """ + Compute the mean model diagonal block of the + closed form fim of base_glm_all model by observation across features + for a batch of observations. + """ + w = self._weight_fim_aa(x=x, loc=loc, scale=scale) # [observations x features] + # The computation of the fim block requires two outer products between + # feature-wise constants and the coefficient wise design matrix entries, for each observation. + # The resulting tensor is observations x features x coefficients x coefficients which + # is too large too store in memory in most cases. However, the full 4D tensor is never + # actually needed but only its marginal across features, the final hessian block shape. + # Here, we use the einsum to efficiently perform the two outer products and the marginalisation. + xh = self.calc_design_mat(design_loc, self.model_vars.constraints_loc) + + fim_block = self.create_specific_block(w, xh, xh) + return fim_block + + def _b_byobs(): + """ + Compute the dispersion model diagonal block of the + closed form fim of base_glm_all model by observation across features. + """ + w = self._weight_fim_bb(x=x, loc=loc, scale=scale) # [observations=1 x features] + # The computation of the fim block requires two outer products between + # feature-wise constants and the coefficient wise design matrix entries, for each observation. + # The resulting tensor is observations x features x coefficients x coefficients which + # is too large too store in memory in most cases. However, the full 4D tensor is never + # actually needed but only its marginal across features, the final hessian block shape. + # Here, we use the Einstein summation to efficiently perform the two outer products and the marginalisation. 
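+            # Shape sketch for create_specific_block (w: [observations x features],
+            # xh: [observations x coefficients]): tf.einsum('of,oc->ofc', w, xh) gives
+            # [observations x features x coefficients] and tf.einsum('ofc,od->fcd', ., xh)
+            # sums over observations, yielding the [features x coefficients x coefficients]
+            # block without ever materialising the full 4D tensor.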
+            xh = self.calc_design_mat(design_scale, self.model_vars.constraints_scale) + + fim_block = self.create_specific_block(w, xh, xh) + return fim_block + + # The full fisher information matrix is block-diagonal with the cross-model + # blocks all zero. Accordingly, mean and dispersion model updates can be + # treated independently and the full fisher information matrix is never required. + # Here, the non-zero model-wise diagonal blocks are computed and returned + # separately; the corresponding score function vectors are computed in JacobianGLM. + + if compute_a and compute_b: + fim_a = _a_byobs() + fim_b = _b_byobs() + + elif compute_a and not compute_b: + fim_a = _a_byobs() + fim_b = tf.zeros(fim_a.get_shape(), self.dtype) + elif not compute_a and compute_b: + fim_b = _b_byobs() + fim_a = tf.zeros(fim_b.get_shape(), self.dtype) + else: + fim_a = tf.zeros_like(self.model_vars.a_var, dtype=self.dtype) + fim_b = tf.zeros_like(self.model_vars.b_var, dtype=self.dtype) + + if concat: + fim = tf.concat([fim_a, fim_b], axis=1) + return fim + else: + return fim_a, fim_b + + @abc.abstractmethod + def _weight_fim_aa( + self, + x, + loc, + scale + ): + """ + Compute the weight constants for the mean model IWLS update for a GLM. + + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return tuple of tf.tensors + Constants with respect to coefficient index for + Fisher information matrix and score function computation. + """ + pass + + @abc.abstractmethod + def _weight_fim_bb( + self, + x, + loc, + scale + ): + """ + Compute the weight constants for the dispersion model IWLS update for a GLM. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return tuple of tf.tensors + Constants with respect to coefficient index for + Fisher information matrix and score function computation. + """ + pass + + +class JacobianGLM(Gradient): + + def call(self, inputs, **kwargs): + return self._jac_analytic(*inputs) + + def _jac_analytic(self, x, design_loc, design_scale, loc, scale, concat, compute_a=True, compute_b=True) -> tf.Tensor: + """ + Compute the closed-form of the base_glm_all model jacobian + by evaluating its terms grouped by observations. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + """ + + def _a_byobs(): + """ + Compute the mean model block of the jacobian. + + :return Jblock: tf.tensor features x coefficients + Block of jacobian. + """ + w = self._weights_jac_a(x=x, loc=loc, scale=scale) # [observations, features] + xh = self.calc_design_mat(design_loc, self.model_vars.constraints_loc) # [observations, coefficient] + + jblock = tf.matmul(tf.transpose(w), xh) # [features, coefficients] + return jblock + + def _b_byobs(): + """ + Compute the dispersion model block of the jacobian. + + :return Jblock: tf.tensor features x coefficients + Block of jacobian. 
+ """ + w = self._weights_jac_b(x=x, loc=loc, scale=scale) # [observations, features] + xh = self.calc_design_mat(design_scale, self.model_vars.constraints_scale) # [observations, coefficient] + + jblock = tf.matmul(tf.transpose(w), xh) # [features, coefficients] + return jblock + + if compute_a and compute_b: + j_a = _a_byobs() + j_b = _b_byobs() + elif compute_a and not compute_b: + j_a = _a_byobs() + j_b = tf.zeros((j_a.get_shape()[0], self.model_vars.b_var.get_shape()[0]), dtype=self.dtype) + elif not compute_a and compute_b: + j_b = _b_byobs() + j_a = tf.zeros((j_b.get_shape()[0], self.model_vars.b_var.get_shape()[0]), dtype=self.dtype) + else: + j_a = tf.transpose(tf.zeros_like(self.model_vars.a_var, dtype=self.dtype)) + j_b = tf.transpose(tf.zeros_like(self.model_vars.b_var, dtype=self.dtype)) + + if concat: + j = tf.concat([j_a, j_b], axis=1) + return j + else: + return j_a, j_b + + @abc.abstractmethod + def _weights_jac_a( + self, + x, + loc, + scale + ): + """ + Compute the coefficient index invariant part of the + mean model gradient. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return const: tf.tensor observations x features + Coefficient invariant terms of hessian of + given observations and features. + """ + pass + + @abc.abstractmethod + def _weights_jac_b( + self, + x, + loc, + scale + ): + """ + Compute the coefficient index invariant part of the + dispersion model gradient. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return const: tf.tensor observations x features + Coefficient invariant terms of hessian of + given observations and features. + """ + pass + + +class HessianGLM(Gradient): + """ + Compute the closed-form of the base_glm_all model hessian + by evaluating its terms grouped by observations. + + Has three sub-functions which built the specific blocks of the hessian + and one sub-function which concatenates the blocks into a full hessian. + """ + + def call(self, inputs, **kwargs): + return self._hessian_analytic(*inputs) + + def _hessian_analytic(self, x, design_loc, design_scale, loc, scale, concat, compute_a=True, compute_b=True) -> tf.Tensor: + """ + Compute the closed-form of the base_glm_all model hessian + by evaluating its terms grouped by observations. + + Has three sub-functions which built the specific blocks of the hessian + and one sub-function which concatenates the blocks into a full hessian. + """ + + def _aa_byobs_batched(): + """ + Compute the mean model diagonal block of the + closed form hessian of base_glm_all model by observation across features + for a batch of observations. + """ + w = self._weight_hessian_aa(x=x, loc=loc, scale=scale) # [observations x features] + # The computation of the hessian block requires two outer products between + # feature-wise constants and the coefficient wise design matrix entries, for each observation. + # The resulting tensor is observations x features x coefficients x coefficients which + # is too large too store in memory in most cases. 
However, the full 4D tensor is never + # actually needed but only its marginal across features, the final hessian block shape. + # Here, we use the einsum to efficiently perform the two outer products and the marginalisation. + xh = self.calc_design_mat(design_loc, self.model_vars.constraints_loc) + + hblock = self.create_specific_block(w, xh, xh) + return hblock + + def _bb_byobs_batched(): + """ + Compute the dispersion model diagonal block of the + closed form hessian of base_glm_all model by observation across features. + """ + w = self._weight_hessian_bb(x=x, loc=loc, scale=scale) # [observations x features] + # The computation of the hessian block requires two outer products between + # feature-wise constants and the coefficient wise design matrix entries, for each observation. + # The resulting tensor is observations x features x coefficients x coefficients which + # is too large to store in memory in most cases. However, the full 4D tensor is never + # actually needed but only its marginal across features, the final hessian block shape. + # Here, we use the Einstein summation to efficiently perform the two outer products and the marginalisation. + xh = self.calc_design_mat(design_scale, self.model_vars.constraints_scale) + + hblock = self.create_specific_block(w, xh, xh) + return hblock + + def _ab_byobs_batched(): + """ + Compute the mean-dispersion model off-diagonal block of the + closed form hessian of base_glm_all model by observation across features. + + Note that there are two blocks of the same size which can + be computed from each other with a transpose operation as + the hessian is symmetric. + """ + w = self._weight_hessian_ab(x=x, loc=loc, scale=scale) # [observations x features] + # The computation of the hessian block requires two outer products between + # feature-wise constants and the coefficient wise design matrix entries, for each observation. + # The resulting tensor is observations x features x coefficients x coefficients which + # is too large to store in memory in most cases. However, the full 4D tensor is never + # actually needed but only its marginal across features, the final hessian block shape. + # Here, we use the Einstein summation to efficiently perform the two outer products and the marginalisation. 
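+            # Here the two design factors differ: with xhloc: [observations x loc coefficients]
+            # and xhscale: [observations x scale coefficients], create_specific_block returns a
+            # [features x loc coefficients x scale coefficients] block.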
+ xhloc = self.calc_design_mat(design_loc, self.model_vars.constraints_loc) + xhscale = self.calc_design_mat(design_scale, self.model_vars.constraints_scale) + + hblock = self.create_specific_block(w, xhloc, xhscale) + return hblock + + if compute_a and compute_b: + h_aa = _aa_byobs_batched() + h_bb = _bb_byobs_batched() + h_ab = _ab_byobs_batched() + h_ba = tf.transpose(h_ab, perm=[0, 2, 1]) + elif compute_a and not compute_b: + h_aa = _aa_byobs_batched() + h_bb = tf.zeros_like(h_aa, dtype=self.dtype) + h_ab = tf.zeros_like(h_aa, dtype=self.dtype) + h_ba = tf.zeros_like(h_aa, dtype=self.dtype) + elif not compute_a and compute_b: + h_bb = _bb_byobs_batched() + h_aa = tf.zeros_like(h_bb, dtype=self.dtype) + h_ab = tf.zeros_like(h_bb, dtype=self.dtype) + h_ba = tf.zeros_like(h_bb, dtype=self.dtype) + else: + h_aa = tf.zeros((), dtype=self.dtype) + h_bb = tf.zeros((), dtype=self.dtype) + h_ab = tf.zeros((), dtype=self.dtype) + h_ba = tf.zeros((), dtype=self.dtype) + + if concat: + h = tf.concat( + [tf.concat([h_aa, h_ab], axis=2), + tf.concat([h_ba, h_bb], axis=2)], + axis=1 + ) + return h + else: + return h_aa, h_ab, h_ba, h_bb + + @abc.abstractmethod + def _weight_hessian_aa( + self, + x, + loc, + scale + ): + """ + Compute the coefficient index invariant part of the + mean model block of the hessian. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return const: tf.tensor observations x features + Coefficient invariant terms of hessian of + given observations and features. + """ + pass + + @abc.abstractmethod + def _weight_hessian_bb( + self, + x, + loc, + scale + ): + """ + Compute the coefficient index invariant part of the + dispersion model block of the hessian. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return const: tf.tensor observations x features + Coefficient invariant terms of hessian of + given observations and features. + """ + pass + + @abc.abstractmethod + def _weight_hessian_ab( + self, + x, + loc, + scale + ): + """ + Compute the coefficient index invariant part of the + mean-dispersion model block of the hessian. + + Note that there are two blocks of the same size which can + be compute from each other with a transpose operation as + the hessian is symmetric. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return const: tf.tensor observations x features + Coefficient invariant terms of hessian of + given observations and features. 
+ """ + pass diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py new file mode 100644 index 00000000..378196cb --- /dev/null +++ b/batchglm/train/tf2/base_glm/model.py @@ -0,0 +1,232 @@ +from importlib import import_module +import logging +import tensorflow as tf +from .external import ModelBase +from .processModel import ProcessModelGLM +logger = logging.getLogger("batchglm") + + +class GLM(ModelBase, ProcessModelGLM): + """ + base GLM class containg the model call. + """ + + def __init__( + self, + model_vars, + optimizer: str, + noise_module: str, + use_gradient_tape: bool = False, + compute_a: bool = True, + compute_b: bool = True, + dtype: str = "float32", + ): + super(GLM, self).__init__(dtype=dtype) + + self.model_vars = model_vars + self.use_gradient_tape = use_gradient_tape + self.compute_a = compute_a + self.compute_b = compute_b + self.params = tf.Variable(tf.concat( + [ + model_vars.init_a_clipped, + model_vars.init_b_clipped, + ], + axis=0 + ), name="params", trainable=True) + self.params_copy = self.params + + # import and add noise model specific layers. + layers = import_module('...' + noise_module + '.layers', __name__) + grad_layers = import_module('...' + noise_module + '.layers_gradients', __name__) + self.unpack_params = layers.UnpackParams(dtype=dtype) + self.linear_loc = layers.LinearLoc(dtype=dtype) + self.linear_scale = layers.LinearScale(dtype=dtype) + self.linker_loc = layers.LinkerLoc(dtype=dtype) + self.linker_scale = layers.LinkerScale(dtype=dtype) + self.likelihood = layers.Likelihood(dtype=dtype) + self.jacobian = grad_layers.Jacobian(model_vars=model_vars, dtype=dtype) + self.hessian = grad_layers.Hessian(model_vars=model_vars, dtype=dtype) + self.fim = grad_layers.FIM(model_vars=model_vars, dtype=dtype) + + self.setMethod(optimizer) + + def setMethod(self, optimizer: str): + """ + Determines which function is executed to calculate and return the desired outputs when + calling the model. The internal function is chosen based on the given optimizer. It will + through an AssertionError if the optimizer is not understood. + """ + optimizer = optimizer.lower() + if optimizer in ['gd', 'adam', 'adagrad', 'rmsprop']: + self._calc = self.calc_jacobians + + elif optimizer in ['nr', 'nr_tr']: + self._calc = self._calc_hessians + + elif optimizer in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr', 'irls_ar', 'irls_tr_ar', 'irls_tr_gd_tr']: + self._calc = self._calc_fim + else: + assert False, ("Unrecognized optimizer: %s", optimizer) + + def featurewise_batch(self): + """ + Applies a boolean mask over the feature dimension of the parameter matrix by removing + some feature columns (e.g. to exclude converged parameters) determined by the + `remaining_features` vector in `model_vars`. This method must be called after each + featurewise batch event to ensure the feature dimension of the input tensors matches the + feature dimension of `params_copy` in the following model call. + """ + self.params_copy = tf.Variable( + tf.boolean_mask(tensor=self.params, mask=self.model_vars.remaining_features, axis=1), + trainable=True) + + def apply_featurewise_updates(self, full_params_copy: tf.Tensor): + """ + Applies featurewise updates stored in `params_copy` on `params`. Feature columns in + `params` corresponding to remaining feature columns in `params_copy` are overwritten with + the new values while the others (corresponding to features with converged parameters) are + retained. 
This method must be called after each featurewise batch event to ensure that the + updates stored in `params_copy` aren't lost when deriving a new `params_copy` from `params` + in the following model calls using `featurewise_batch()`. + """ + self.params.assign( + tf.where(self.model_vars.remaining_features, full_params_copy, self.params)) + + def _call_parameters(self, inputs): + design_loc, design_scale, size_factors = inputs + a_var, b_var = self.unpack_params([self.params_copy, self.model_vars.a_var.get_shape()[0]]) + eta_loc = self.linear_loc([a_var, design_loc, self.model_vars.constraints_loc, size_factors]) + eta_scale = self.linear_scale([b_var, design_scale, self.model_vars.constraints_scale]) + loc = self.linker_loc(eta_loc) + scale = self.linker_scale(eta_scale) + return eta_loc, eta_scale, loc, scale, a_var, b_var + + def calc_ll(self, inputs): + """ + Calculates the log probabilities, summed up per feature column and returns it together with + loc, scale, a_var, and b_var (forwarding results from `_call_parameters`). + """ + parameters = self._call_parameters(inputs[1:]) + log_probs = self.likelihood([*parameters[:-2], inputs[0]]) + log_probs = tf.reduce_sum(log_probs, axis=0) + return (log_probs, *parameters[2:]) + + def calc_jacobians(self, inputs, compute_a=True, compute_b=None, concat=True): + if compute_b is None: + compute_b = self.compute_b + return self._calc_jacobians(inputs, compute_a=compute_a, compute_b=compute_b, concat=concat)[2:] + + def _calc_jacobians(self, inputs, compute_a, compute_b, concat=True, transpose=True): + """ + calculates jacobian. + + :param inputs: TODO + :param concat: boolean + if true, concatenates the loc and scale block. + :param transpose: bool + transpose the gradient if true. + autograd returns gradients with respect to the shape of self.params. + But analytic differentiation returns it the other way, which is + often needed for downstream operations (e.g. 
hessian) + Therefore, if self.use_gradient_tape, it will transpose if transpose == False + """ + + with tf.GradientTape(persistent=True) as g: + log_probs, loc, scale, a_var, b_var = self.calc_ll(inputs) + + if self.use_gradient_tape: + + if compute_a: + if compute_b: + if concat: + jacobians = g.gradient(log_probs, self.params_copy) + if not transpose: + jacobians = tf.transpose(jacobians) + else: + jac_a = g.gradient(log_probs, a_var) + jac_b = g.gradient(log_probs, b_var) + if not transpose: + jac_a = tf.transpose(jac_a) + jac_b = tf.transpose(jac_b) + else: + jac_a = g.gradient(log_probs, a_var) + jac_b = tf.zeros((jac_a.get_shape()[0], b_var.get_shape()[1]), b_var.dtype) + if concat: + jacobians = tf.concat([jac_a, jac_b], axis=0) + if not transpose: + jacobians = tf.transpose(jacobians) + else: + jac_b = g.gradient(log_probs, b_var) + jac_a = tf.zeros((jac_b.get_shape()[0], a_var.get_shape()[0]), a_var.dtype) + if concat: + jacobians = tf.concat([jac_a, jac_b], axis=0) + if not transpose: + jacobians = tf.transpose(jacobians) + + else: + + if concat: + jacobians = self.jacobian([*inputs[0:3], loc, scale, True, compute_a, compute_b]) + if transpose: + jacobians = tf.transpose(jacobians) + else: + jac_a, jac_b = self.jacobian([*inputs[0:3], loc, scale, False, compute_a, compute_b]) + + del g + if concat: + return loc, scale, log_probs, tf.negative(jacobians) + return loc, scale, log_probs, tf.negative(jac_a), tf.negative(jac_b) + + def _calc_hessians(self, inputs, compute_a, compute_b): + # with tf.GradientTape(persistent=True) as g2: + loc, scale, log_probs, jacobians = self._calc_jacobians(inputs, compute_a=compute_a, compute_b=compute_b, transpose=False) + ''' + autograd not yet working. TODO: Search error in the following code: + + if self.use_gradient_tape: + + i = tf.constant(0, tf.int32) + h_tensor_array = tf.TensorArray( # hessian slices [:,:,j] + dtype=self.params_copy.dtype, + size=self.params_copy.get_shape()[0], + clear_after_read=False + ) + while i < self.params_copy.get_shape()[0]: + grad = g2.gradient(results_arr[i], self.params_copy) + h_tensor_array.write(index=i, value=grad) + i += 1 + + # h_tensor_array is a TensorArray, reshape this into a tensor so that it can be used + # in down-stream computation graphs. + + hessians = tf.transpose(tf.reshape( + h_tensor_array.stack(), + tf.stack((self.params_copy.get_shape()[0], + self.params_copy.get_shape()[0], + self.params_copy.get_shape()[1])) + ), perm=[2, 1, 0]) + hessians = tf.negative(hessians) + ''' + hessians = tf.negative(self.hessian([*inputs[0:3], loc, scale, True, compute_a, compute_b])) + return log_probs, jacobians, hessians + + def _calc_fim(self, inputs, compute_a, compute_b): + loc, scale, log_probs, jac_a, jac_b = self._calc_jacobians( + inputs, + compute_a=compute_a, + compute_b=compute_b, + concat=False, + transpose=False) + fim_a, fim_b = self.fim([*inputs[0:3], loc, scale, False, compute_a, compute_b]) + return log_probs, jac_a, jac_b, fim_a, fim_b + + def call(self, inputs, compute_a=True, compute_b=None): + """ + Wrapper method to call this model. Depending on the desired calculations specified by the + `optimizer` arg to `__init__`, it will forward the call to the necessary function to perform + the right calculations and return all the results. 
+ """ + if compute_b is None: + compute_b = self.compute_b + return self._calc(inputs, compute_a, compute_b) diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py new file mode 100644 index 00000000..7903b372 --- /dev/null +++ b/batchglm/train/tf2/base_glm/optim.py @@ -0,0 +1,501 @@ +from .external import pkg_constants +import tensorflow as tf +from .external import OptimizerBase +import abc +import numpy as np + + +class SecondOrderOptim(OptimizerBase, metaclass=abc.ABCMeta): + + """ + Superclass for NR and IRLS + """ + + def _norm_neg_log_likelihood(self, log_probs): + return - log_probs / self.n_obs + + def _resource_apply_dense(self, grad, handle, apply_state=None): + + update_op = handle.assign_add(grad, read_value=False) + + return update_op + + def _resource_apply_sparse(self, grad, handle, apply_state=None): + + raise NotImplementedError('Applying SparseTensor currently not possible.') + + def get_config(self): + + config = {"name": "SOO"} + return config + + def _create_slots(self, var_list): + + self.add_slot(var_list[0], 'mu_r') + + def _trust_region_ops( + self, + proposed_vector, + compute_a, + compute_b, + batch_features, + update_theta, + ): + if compute_b and not compute_a: + t1 = pkg_constants.TRUST_REGIONT_T1_IRLS_GD_TR_SCALE + t2 = pkg_constants.TRUST_REGIONT_T2_IRLS_GD_TR_SCALE + else: + t1 = pkg_constants.TRUST_REGION_T1 + t2 = pkg_constants.TRUST_REGION_T2 + t1 = tf.constant(t1, dtype=self._dtype) + t2 = tf.constant(t2, dtype=self._dtype) + upper_bound = tf.constant(pkg_constants.TRUST_REGION_UPPER_BOUND, dtype=self._dtype) + + # Phase I: Perform a trial update. + # Propose parameter update: + + coeffs = self.model.model_vars.idx_train_loc if compute_b and not compute_a else self.model.model_vars.idx_train_scale + if compute_b: + idx_stop = self.model.model_vars.idx_train_scale[-1] + 1 + if compute_a: + idx_start = 0 + else: + idx_start = self.model.model_vars.idx_train_loc[-1] + 1 + else: + idx_start = 0 + idx_stop = self.model.model_vars.idx_train_loc[-1] + 1 + update_vector_length = tf.sqrt(tf.reduce_sum(tf.square(proposed_vector[idx_start:idx_stop]), axis=0)) + update_theta_full = update_theta + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(self.model.model_vars.remaining_features) + update_theta_full = tf.scatter_nd(indices, update_theta, shape=(n_features,)) + update_vector_length = tf.scatter_nd(indices, update_vector_length, shape=(n_features,)) + + tr_radius = self.tr_radius_b if compute_b and not compute_a else self.tr_radius + + increase_radius = update_theta_full + decrease_radius = tf.logical_not(update_theta_full) + + if compute_a: + increase_radius = tf.logical_and(increase_radius, update_vector_length > 0.9 * tr_radius) + decrease_radius = tf.logical_or(decrease_radius, update_vector_length < 0.5 * tr_radius) + + if compute_b and not compute_a: + self.model.model_vars.updated_b = update_theta_full.numpy() + else: + self.model.model_vars.updated = update_theta_full.numpy() + + # Update trusted region accordingly: + + keep_radius = tf.logical_and(tf.logical_not(decrease_radius), + tf.logical_not(increase_radius)) + radius_update = tf.add_n([ + tf.multiply(t1, tf.cast(decrease_radius, self._dtype)), + tf.multiply(t2, tf.cast(increase_radius, self._dtype)), + tf.multiply(tf.ones_like(t1), tf.cast(keep_radius, self._dtype)) + ]) + + radius_new = tf.minimum(tf.multiply(tr_radius, radius_update), upper_bound) + tr_radius.assign(radius_new) + + return update_theta + + def 
_trial_update(self, x_batches, log_probs, proposed_vector, is_batched, compute_a, compute_b): + + eta0 = tf.constant(pkg_constants.TRUST_REGION_ETA0, dtype=self._dtype) + """ + Current likelihood refers to the likelihood that has been calculated in the last model call. + We are always evaluating on the full model, so if we train on the batched model (is_batched), + current likelihood needs to be calculated on the full model using the same model state as + used in the last model call. Moreover, if this update is conducted separately for loc + (compute_a) and scale (compute_b), current likelihood always needs to be recalculated when + updating the scale params since the location params changed in the location update before. + This is only true if the location params are updated before the scale params however! + """ + current_likelihood = log_probs + if is_batched or compute_b and not compute_a: + for i, x_batch in enumerate(x_batches): + log_likelihood = self.model.calc_ll([*x_batch])[0] + current_likelihood = log_likelihood if i == 0 else tf.math.add(current_likelihood, log_likelihood) + + current_likelihood = self._norm_neg_log_likelihood(current_likelihood) + + """ + The new likelihood is calculated on the full model now, after updating the parameters using + the proposed vector: + """ + original_params_copy = tf.identity(self.model.params_copy) + self.model.params_copy.assign_sub(proposed_vector) + + for i, x_batch in enumerate(x_batches): + log_likelihood = self.model.calc_ll([*x_batch])[0] + if i == 0: + new_likelihood = log_likelihood + else: + new_likelihood += log_likelihood + new_likelihood = self._norm_neg_log_likelihood(new_likelihood) + + """ + delta_f_actual shows the difference between the log likelihoods before and after the proposed + update of parameters. It is > 0 if the new likelihood is greater than the old. + """ + delta_f_actual = tf.math.subtract(current_likelihood, new_likelihood) + + """ + If we use feature batching, the individual vector indices need to be spread out to the full + feature space by adding columns corresponding to positions of converged (non calculated) + features. 
+ """ + + # Compute parameter updates.g + update_theta = delta_f_actual > eta0 + self.model.params_copy.assign(tf.where(update_theta, self.model.params_copy, original_params_copy)) + + return update_theta + + def __init__(self, dtype: tf.dtypes.DType, tr_mode: bool, model: tf.keras.Model, name: str, n_obs: int): + + super(SecondOrderOptim, self).__init__(name) + + self.model = model + self._dtype = dtype + self.n_obs = tf.cast(n_obs, dtype=self._dtype) + self.tr_mode = tr_mode + + n_features = self.model.model_vars.n_features + if tr_mode: + self.tr_radius = tf.Variable( + np.zeros(shape=[n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT, + dtype=self._dtype, trainable=False) + + else: + self.tr_radius = tf.Variable(np.array([np.inf]), dtype=self._dtype, trainable=False) + self.model.model_vars.updated = np.repeat(a=True, repeats=n_features) + + @abc.abstractmethod + def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): + pass + + def _newton_type_update(self, lhs, rhs, psd=False): + + new_rhs = tf.expand_dims(rhs, axis=-1) + res = tf.linalg.lstsq(lhs, new_rhs, fast=psd) + delta_t = tf.squeeze(res, axis=-1) + update_tensor = tf.transpose(delta_t) + return update_tensor + + def _pad_updates( + self, + update_raw, + compute_a, + compute_b + ): + # Pad update vectors to receive update tensors that match + # the shape of model_vars.params. + if compute_a: + if compute_b: + netwon_type_update = update_raw + else: + netwon_type_update = tf.concat([ + update_raw, + tf.zeros(shape=(self.model.model_vars.b_var.get_shape()[0], update_raw.get_shape()[1]), + dtype=self._dtype) + ], axis=0) + + elif compute_b: + netwon_type_update = tf.concat([ + tf.zeros(shape=(self.model.model_vars.a_var.get_shape()[0], update_raw.get_shape()[1]), + dtype=self._dtype), + update_raw + ], axis=0) + + else: + raise ValueError("No training necessary") + + return netwon_type_update + + @staticmethod + def _calc_update_magnitudes(update_raw): + update_magnitude_sq = tf.reduce_sum(tf.square(update_raw), axis=0) + update_magnitude = tf.where( + condition=update_magnitude_sq > 0, + x=tf.sqrt(update_magnitude_sq), + y=tf.zeros_like(update_magnitude_sq) + ) + update_magnitude_inv = tf.where( + condition=update_magnitude > 0, + x=tf.divide( + tf.ones_like(update_magnitude), + update_magnitude + ), + y=tf.zeros_like(update_magnitude) + ) + return update_magnitude, update_magnitude_inv + + def _trust_region_update( + self, + update_raw, + radius_container, + ): + update_magnitude, update_magnitude_inv = SecondOrderOptim._calc_update_magnitudes(update_raw) + update_norm = tf.multiply(update_raw, update_magnitude_inv) + + update_scale = tf.minimum( + radius_container, + update_magnitude + ) + proposed_vector = tf.multiply( + update_norm, + update_scale + ) + + return proposed_vector + + def normalize_update_magnitude(self, update_magnitude): + return update_magnitude + + def _trust_region_newton_cost_gain( + self, + proposed_vector, + neg_jac, + hessian_fim + ): + pred_cost_gain = tf.add( + tf.einsum( + 'ni,in->n', + neg_jac, + proposed_vector + ) / self.n_obs, + 0.5 * tf.einsum( + 'nix,xin->n', + tf.einsum('inx,nij->njx', + tf.expand_dims(proposed_vector, axis=-1), + hessian_fim), + tf.expand_dims(proposed_vector, axis=0) + ) / tf.square(self.n_obs) + ) + return pred_cost_gain + + +class NR(SecondOrderOptim): + + def _get_updates(self, lhs, rhs, compute_a, compute_b): + + update_raw = self._newton_type_update(lhs=lhs, rhs=rhs) + update = self._pad_updates(update_raw, 
compute_a, compute_b) + + return update_raw, update + + def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): + + x_batches, log_probs, jacobians, hessians = inputs + tr_mode = self.tr_mode + if compute_b: + if not compute_a: + self.model.model_vars.updated_b = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to not updated. + tr_mode = self.tr_mode_b + + assert (compute_a or compute_b), "Nothing can be trained. Please make sure " \ "at least one of train_mu and train_r is set to True." + + update_raw, update = self._get_updates(hessians, jacobians, compute_a, compute_b) + + if tr_mode: + if batch_features: + radius_container = tf.boolean_mask( + tensor=self.tr_radius, + mask=self.model.model_vars.remaining_features) + else: + radius_container = self.tr_radius + tr_proposed_vector = self._trust_region_update( + update_raw=update_raw, + radius_container=radius_container + ) + #tr_pred_cost_gain = self._trust_region_newton_cost_gain( + # proposed_vector=tr_proposed_vector, + # neg_jac=jacobians, + # hessian_fim=hessians + #) + + #tr_proposed_vector_pad = self._pad_updates( + # update_raw=tr_proposed_vector, + # compute_a=compute_a, + # compute_b=compute_b + #) + update_theta = self._trial_update( + x_batches=x_batches, + log_probs=log_probs, + proposed_vector=tr_proposed_vector, + is_batched=is_batched, + compute_a=compute_a, + compute_b=compute_b) + self._trust_region_ops( + proposed_vector=tr_proposed_vector, + compute_a=compute_a, + compute_b=compute_b, + batch_features=batch_features, + update_theta=update_theta + ) + + else: + update_theta = self._trial_update( + x_batches=x_batches, + log_probs=log_probs, + proposed_vector=update, + is_batched=is_batched, + compute_a=compute_a, + compute_b=compute_b + ) + + #self.model.params_copy.assign_sub(update) + + +class IRLS(SecondOrderOptim): + + def _calc_proposed_vector_and_pred_cost_gain( + self, + update_x, + radius_container, + neg_jac_x, + fim_x=None + ): + """ + Calculates the proposed vector and predicted cost gain for either the mean or the scale part. + :param update_x: tf.tensor coefficients x features ? TODO + + :param radius_container: tf.tensor ? x ? TODO + + :param neg_jac_x: tf.Tensor coefficients x features ? TODO + Upper (mu part) or lower (r part) of negative jacobian matrix + :param fim_x: + Upper (mu part) or lower (r part) of the Fisher information matrix. + Defaults to None; only needed if gd is False + :return proposed_vector_x, pred_cost_gain_x + Returns proposed vector and predicted cost gain after + the trust region update for either the mu or r part, depending on x + """ + + proposed_vector_x = self._trust_region_update( + update_raw=update_x, + radius_container=radius_container + ) + + #pred_cost_gain_x = self._trust_region_newton_cost_gain( + # proposed_vector=proposed_vector_x, + # neg_jac=neg_jac_x, + # hessian_fim=fim_x + #) + + return proposed_vector_x, None#pred_cost_gain_x + + + def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): + + x_batches, log_probs, jac_a, jac_b, fim_a, fim_b = inputs + tr_mode = self.tr_mode + if compute_b: + if not compute_a: + self.model.model_vars.updated_b = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to not updated. + tr_mode = self.tr_mode_b + + assert (compute_a or compute_b), "Nothing can be trained. Please make sure " \ "at least one of train_mu and train_r is set to True." + + # Compute a and b model updates separately.
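As a reading aid, the following hedged sketch condenses the mechanism implemented by `_newton_type_update` and `_trust_region_update` above into one function: a per-feature linear solve followed by capping the step length at the trust-region radius. The helper name and shapes are assumptions for illustration, not code from this diff:

```python
import tensorflow as tf

def trust_region_step_sketch(fim, neg_jac, radius, psd=False):
    # fim:     [features x coeffs x coeffs]  FIM (or Hessian) blocks
    # neg_jac: [features x coeffs]           negative jacobian per feature
    # radius:  [features]                    current trust-region radii
    # One linear solve per feature; fast=True takes the Cholesky path, which is
    # only safe for positive (semi-)definite blocks.
    delta = tf.linalg.lstsq(fim, tf.expand_dims(neg_jac, -1), fast=psd)
    update = tf.transpose(tf.squeeze(delta, -1))              # [coeffs x features]
    # Rescale each feature's step so its norm does not exceed the radius.
    norm = tf.norm(update, axis=0)
    factor = tf.math.divide_no_nan(tf.minimum(radius, norm), norm)
    return update * factor
```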
+ if compute_a: + # The FIM of the mean model is guaranteed to be + # positive semi-definite and can therefore be inverted + # with the Cholesky decomposition. This information is + # passed here with psd=True. + update_a = self._newton_type_update( + lhs=fim_a, + rhs=jac_a, + psd=False + ) + if compute_b: + + update_b = self._newton_type_update( + lhs=fim_b, + rhs=jac_b + ) + + if not tr_mode: + if compute_a: + if compute_b: + update_raw = tf.concat([update_a, update_b], axis=0) + else: + update_raw = update_a + else: + update_raw = update_b + + update = self._pad_updates( + update_raw=update_raw, + compute_a=compute_a, + compute_b=compute_b + ) + + update_theta = self._trial_update( + x_batches=x_batches, + log_probs=log_probs, + proposed_vector=update, + is_batched=is_batched, + compute_a=compute_a, + compute_b=compute_b + ) + #print(update_theta) + #self.model.params_copy.assign_sub(update_var) + + else: + # put together update_raw based on proposed vector and cost gain depending on train_r and train_mu + if batch_features: + radius_container = tf.boolean_mask( + tensor=self.tr_radius, + mask=self.model.model_vars.remaining_features) + else: + radius_container = self.tr_radius + + if compute_b: + tr_proposed_vector_b, tr_pred_cost_gain_b = self._calc_proposed_vector_and_pred_cost_gain( + update_b, radius_container, jac_b, fim_b) + if compute_a: + tr_proposed_vector_a, tr_pred_cost_gain_a = self._calc_proposed_vector_and_pred_cost_gain( + update_a, radius_container, jac_a, fim_a) + + tr_update_raw = tf.concat([tr_proposed_vector_a, tr_proposed_vector_b], axis=0) + #tr_pred_cost_gain = tf.add(tr_pred_cost_gain_a, tr_pred_cost_gain_b) + else: + # directly apply output of calc_proposed_vector_and_pred_cost_gain to tr_update_raw + # and tr_pred_cost_gain + tr_update_raw = tr_proposed_vector_b + #tr_pred_cost_gain = tr_pred_cost_gain_b + else: + # here train_r is False AND train_mu is true, so the output of the function can directly be applied to + # tr_update_raw and tr_pred_cost_gain, similar to train_r = True and train_mu = False + tr_update_raw, tr_pred_cost_gain = self._calc_proposed_vector_and_pred_cost_gain( + update_a, radius_container, jac_a, fim_a) + + tr_update = self._pad_updates( + update_raw=tr_update_raw, + compute_a=compute_a, + compute_b=compute_b + ) + # perform update + update_theta = self._trial_update( + x_batches=x_batches, + log_probs=log_probs, + proposed_vector=tr_update, + is_batched=is_batched, + compute_a=compute_a, + compute_b=compute_b) + self._trust_region_ops( + proposed_vector=tr_update, + compute_a=compute_a, + compute_b=compute_b, + batch_features=batch_features, + update_theta=update_theta + ) + + def calc_delta_f_actual(self, current_likelihood, new_likelihood, jacobian): + eta1 = tf.constant(pkg_constants.TRUST_REGION_ETA1, dtype=self._dtype) + return diff --git a/batchglm/train/tf2/base_glm/processModel.py b/batchglm/train/tf2/base_glm/processModel.py new file mode 100644 index 00000000..4b6aedf7 --- /dev/null +++ b/batchglm/train/tf2/base_glm/processModel.py @@ -0,0 +1,9 @@ +from .external import ProcessModelBase +import abc + + +class ProcessModelGLM(ProcessModelBase): + + @abc.abstractmethod + def param_bounds(self, dtype): + pass diff --git a/batchglm/train/tf2/base_glm/training_strategies.py b/batchglm/train/tf2/base_glm/training_strategies.py new file mode 100644 index 00000000..63f295d3 --- /dev/null +++ b/batchglm/train/tf2/base_glm/training_strategies.py @@ -0,0 +1,111 @@ +from enum import Enum + + +class TrainingStrategies(Enum): + + AUTO = 
None + + DEFAULT = \ + { + "optim_algo": "default_adam", + "jacobian": True, + "hessian": False, + "fim": False, + "concat_grads": True + } + + GD = \ + { + "optim_algo": "gd", + "jacobian": True, + "hessian": False, + "fim": False, + "concat_grads": True + } + + ADAM = \ + { + "optim_algo": "adam", + "jacobian": True, + "hessian": False, + "fim": False, + "concat_grads": True + } + + ADAGRAD = \ + { + "optim_algo": "adagrad", + "jacobian": True, + "hessian": False, + "fim": False, + "concat_grads": True + } + + RMSPROP = \ + { + "optim_algo": "rmsprop", + "jacobian": True, + "hessian": False, + "fim": False, + "concat_grads": True + } + + IRLS = \ + { + "optim_algo": "irls", + "jacobian": False, + "hessian": False, + "fim": True, + "concat_grads": False, + "calc_separated": True + } + + IRLS_TR = \ + { + "optim_algo": "irls_tr", + "jacobian": False, + "hessian": False, + "fim": True, + "concat_grads": False, + "calc_separated": True + } + + IRLS_GD = \ + { + "optim_algo": "irls_gd", + "jacobian": False, + "hessian": False, + "fim": True, + "concat_grads": False, + "calc_separated": True + } + + IRLS_GD_TR = \ + { + "optim_algo": "irls_gd_tr", + "jacobian": False, + "hessian": False, + "fim": True, + "concat_grads": False, + "calc_separated": True + } + + NR = \ + { + "optim_algo": "nr", + "jacobian": False, + "hessian": True, + "fim": False, + "concat_grads": True, + "calc_separated": False + } + + NR_TR = \ + { + "optim_algo": "nr_tr", + "jacobian": False, + "hessian": True, + "fim": False, + "concat_grads": True, + "calc_separated": False + } diff --git a/batchglm/train/tf2/base_glm/vars.py b/batchglm/train/tf2/base_glm/vars.py new file mode 100644 index 00000000..f57ff08c --- /dev/null +++ b/batchglm/train/tf2/base_glm/vars.py @@ -0,0 +1,90 @@ +import numpy as np +import tensorflow as tf +import abc + +from .model import ProcessModelGLM + + +class ModelVarsGLM(ProcessModelGLM): + """ Build tf.Variables to be optimzed and their constraints. + + a_var and b_var slices of the tf.Variable params which contains + all parameters to be optimized during model estimation. + Params is defined across both location and scale model so that + the hessian can be computed for the entire model. + a and b are the clipped parameter values which also contain + constraints and constrained dependent coefficients which are not + directly optimized. + """ + + constraints_loc: tf.Tensor + constraints_scale: tf.Tensor + params: tf.Variable + a_var: tf.Tensor + b_var: tf.Tensor + updated: np.ndarray + converged: np.ndarray + dtype: str + n_features: int + + def __init__( + self, + init_a: np.ndarray, + init_b: np.ndarray, + constraints_loc: np.ndarray, + constraints_scale: np.ndarray, + dtype: str + ): + """ + + :param init_a: nd.array (mean model size x features) + Initialisation for all parameters of mean model. + :param init_b: nd.array (dispersion model size x features) + Initialisation for all parameters of dispersion model. + :param dtype: Precision used in tensorflow. + """ + self.constraints_loc = tf.convert_to_tensor(constraints_loc, dtype) + self.constraints_scale = tf.convert_to_tensor(constraints_scale, dtype) + + self.init_a = tf.convert_to_tensor(init_a, dtype=dtype) + self.init_b = tf.convert_to_tensor(init_b, dtype=dtype) + + self.init_a_clipped = self.tf_clip_param(self.init_a, "a_var") + self.init_b_clipped = self.tf_clip_param(self.init_b, "b_var") + + # Param is the only tf.Variable in the graph. + # a_var and b_var have to be slices of params. 
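The convergence bookkeeping set up just below (notably `remaining_features`) is what `featurewise_batch` and `apply_featurewise_updates` in model.py consume: converged feature columns of `params` are masked out during training and carried along unchanged when updates are written back. A minimal round-trip sketch, with toy shapes and names chosen purely for illustration:

```python
import numpy as np
import tensorflow as tf

n_params, n_features = 2, 4
params = tf.Variable(tf.zeros([n_params, n_features]))
remaining = np.array([True, False, True, False])        # features still being trained

# featurewise_batch(): train only the still-active feature columns.
params_copy = tf.Variable(tf.boolean_mask(params, remaining, axis=1))
params_copy.assign_add(tf.ones_like(params_copy))       # stand-in for an optimizer step

# apply_featurewise_updates(): spread the trained columns back out and keep
# converged columns as they were.
idx = tf.where(remaining)                                # [n_active x 1] feature indices
full = tf.transpose(tf.scatter_nd(idx, tf.transpose(params_copy), [n_features, n_params]))
params.assign(tf.where(remaining, full, params))
```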
+ self.params = tf.Variable(tf.concat( + [ + self.init_a_clipped, + self.init_b_clipped, + ], + axis=0 + ), name="params") + + a_var = self.params[0:init_a.shape[0]] + b_var = self.params[init_a.shape[0]:] + + self.a_var = self.tf_clip_param(a_var, "a_var") + self.b_var = self.tf_clip_param(b_var, "b_var") + + # Properties to follow gene-wise convergence. + self.updated = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to is updated. + self.updated_b = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to is updated. + self.converged = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to non-converged. + self.converged_b = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to non-converged. + + self.total_converged = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to non-converged. + self.remaining_features = np.repeat(a=True, repeats=self.params.shape[1]) + self.dtype = dtype + self.n_features = self.params.shape[1] + self.idx_train_loc = np.arange(0, init_a.shape[0]) + self.idx_train_scale = np.arange(init_a.shape[0], init_a.shape[0]+init_b.shape[0]) + + @abc.abstractmethod + def param_bounds(self, dtype): + pass + + def convergence_update(self, status: np.ndarray, features_updated: np.ndarray): + self.converged = status.copy() + self.updated = features_updated diff --git a/batchglm/train/tf2/glm_beta/__init__.py b/batchglm/train/tf2/glm_beta/__init__.py new file mode 100644 index 00000000..a616f181 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/__init__.py @@ -0,0 +1,5 @@ +from .processModel import ProcessModel +from .vars import ModelVars +from .estimator import Estimator + +from .model import BetaGLM diff --git a/batchglm/train/tf2/glm_beta/estimator.py b/batchglm/train/tf2/glm_beta/estimator.py new file mode 100644 index 00000000..a2f3f056 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/estimator.py @@ -0,0 +1,247 @@ +import logging +from typing import Union + +import numpy as np + +from .external import closedform_beta_glm_logitmean, closedform_beta_glm_logsamplesize +from .external import InputDataGLM, Model +from .external import Estimator as GLMEstimator +from .model import BetaGLM +from .processModel import ProcessModel +from .vars import ModelVars +from .training_strategies import TrainingStrategies + + +class Estimator(GLMEstimator, ProcessModel): + """ + Estimator for Generalized Linear Models (GLMs) with beta distributed noise. + Uses a logit linker function for loc and log linker function for scale. + """ + + model: BetaGLM + + def __init__( + self, + input_data: InputDataGLM, + init_a: Union[np.ndarray, str] = "AUTO", + init_b: Union[np.ndarray, str] = "AUTO", + quick_scale: bool = False, + dtype="float32", + ): + """ + Performs initialisation and creates a new estimator. + + :param input_data: InputDataGLM + The input data + :param init_a: (Optional) + Low-level initial values for a. Can be: + + - str: + * "auto": automatically choose best initialization + * "random": initialize with random values + * "standard": initialize intercept with observed mean + * "init_model": initialize with another model (see `ìnit_model` parameter) + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'a' + :param init_b: (Optional) + Low-level initial values for b. 
Can be: + + - str: + * "auto": automatically choose best initialization + * "random": initialize with random values + * "standard": initialize with zeros + * "init_model": initialize with another model (see `ìnit_model` parameter) + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'b' + :param quick_scale: bool + Whether `scale` will be fitted faster and maybe less accurate. + Useful in scenarios where fitting the exact `scale` is not absolutely necessary. + :param dtype: Precision used in tensorflow. + """ + self.TrainingStrategies = TrainingStrategies + + self._train_loc = True + self._train_scale = True + + (init_a, init_b) = self.init_par( + input_data=input_data, + init_a=init_a, + init_b=init_b, + init_model=None + ) + init_a = init_a.astype(dtype) + init_b = init_b.astype(dtype) + if quick_scale: + self._train_scale = False + + self.model_vars = ModelVars( + init_a=init_a, + init_b=init_b, + constraints_loc=input_data.constraints_loc, + constraints_scale=input_data.constraints_scale, + dtype=dtype + ) + + super(Estimator, self).__init__( + input_data=input_data, + dtype=dtype + ) + + def train( + self, + use_batching: bool = True, + batch_size: int = 500, + optim_algo: str = "adam", + learning_rate: float = 1e-2, + convergence_criteria: str = "step", + stopping_criteria: int = 1000, + autograd: bool = False, + featurewise: bool = True, + benchmark: bool = False + ): + + if self.model is None: + self.model = BetaGLM( + model_vars=self.model_vars, + dtype=self.model_vars.dtype, + compute_a=self._train_loc, + compute_b=self._train_scale, + use_gradient_tape=autograd, + optimizer=optim_algo + ) + else: + self.model.setMethod(optim_algo) + + optimizer_object = self.get_optimizer_object(optim_algo, learning_rate) + + super(Estimator, self)._train( + noise_model="beta", + is_batched=use_batching, + batch_size=batch_size, + optimizer_object=optimizer_object, + convergence_criteria=convergence_criteria, + stopping_criteria=stopping_criteria, + autograd=autograd, + benchmark=benchmark, + optim_algo=optim_algo + ) + + def get_model_container( + self, + input_data + ): + return Model(input_data=input_data) + + def init_par( + self, + input_data, + init_a, + init_b, + init_model + ): + r""" + standard: + Only initialise intercept and keep other coefficients as zero. 
+ + closed-form: + Initialize with Maximum Likelihood / Maximum of Momentum estimators + """ + + if init_model is None: + groupwise_means = None + init_a_str = None + if isinstance(init_a, str): + init_a_str = init_a.lower() + # Chose option if auto was chosen + if init_a.lower() == "auto": + init_a = "closed_form" + + if init_a.lower() == "closed_form": + groupwise_means, init_a, rmsd_a = closedform_beta_glm_logitmean( + x=input_data.x, + design_loc=input_data.design_loc, + constraints_loc=input_data.constraints_loc, + size_factors=input_data.size_factors_init, + link_fn=lambda mean: np.log( + 1/(1/self.np_clip_param(mean, "mean")-1) + ) + ) + + # train mu, if the closed-form solution is inaccurate + self._train_loc = not (np.all(rmsd_a == 0) or rmsd_a.size == 0) + + logging.getLogger("batchglm").debug("Using closed-form MME initialization for mean") + elif init_a.lower() == "standard": + overall_means = np.mean(input_data.x, axis=0) + overall_means = self.np_clip_param(overall_means, "mean") + + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + init_a[0, :] = np.log(overall_means/(1-overall_means)) + self._train_loc = True + + logging.getLogger("batchglm").debug("Using standard initialization for mean") + elif init_a.lower() == "all_zero": + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + self._train_loc = True + + logging.getLogger("batchglm").debug("Using all_zero initialization for mean") + else: + raise ValueError("init_a string %s not recognized" % init_a) + logging.getLogger("batchglm").debug("Should train mean: %s", self._train_loc) + if isinstance(init_b, str): + if init_b.lower() == "auto": + init_b = "standard" + + if init_b.lower() == "standard": + groupwise_scales, init_b_intercept, rmsd_b = closedform_beta_glm_logsamplesize( + x=input_data.x, + design_scale=input_data.design_scale[:, [0]], + constraints=input_data.constraints_scale[[0], :][:, [0]], + size_factors=input_data.size_factors, + groupwise_means=None, + link_fn=lambda samplesize: np.log(self.np_clip_param(samplesize, "samplesize")) + ) + init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) + init_b[0, :] = init_b_intercept + + logging.getLogger("batchglm").debug("Using standard-form MME initialization for dispersion") + elif init_b.lower() == "closed_form": + dmats_unequal = False + if input_data.num_design_loc_params == input_data.num_design_scale_params: + if np.any(input_data.design_loc != input_data.design_scale): + dmats_unequal = True + + inits_unequal = False + if init_a_str is not None: + if init_a_str != init_b: + inits_unequal = True + + if inits_unequal or dmats_unequal: + raise ValueError( + "cannot use closed_form init for scale model if scale model differs from loc model" + ) + + groupwise_scales, init_b, rmsd_b = closedform_beta_glm_logsamplesize( + x=input_data.x, + design_scale=input_data.design_scale, + constraints=input_data.constraints_scale, + size_factors=input_data.size_factors, + groupwise_means=groupwise_means, + link_fn=lambda samplesize: np.log(self.np_clip_param(samplesize, "samplesize")) + ) + + logging.getLogger("batchglm").debug("Using closed-form MME initialization for dispersion") + elif init_b.lower() == "all_zero": + init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) + + logging.getLogger("batchglm").debug("Using standard initialization for dispersion") + else: + raise ValueError("init_b string %s not recognized" % init_b) + logging.getLogger("batchglm").debug("Should train r: %s", 
self._train_scale) + else: + init_a, init_b = self.get_init_from_model(init_a=init_a, + init_b=init_b, + input_data=input_data, + init_model=init_model) + + return init_a, init_b diff --git a/batchglm/train/tf2/glm_beta/external.py b/batchglm/train/tf2/glm_beta/external.py new file mode 100644 index 00000000..6ee962a0 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/external.py @@ -0,0 +1,11 @@ +from batchglm import pkg_constants +import batchglm.data as data_utils + +from batchglm.models.base_glm.utils import closedform_glm_mean, closedform_glm_scale +from batchglm.models.glm_beta import _EstimatorGLM, InputDataGLM, Model +from batchglm.models.glm_beta.utils import closedform_beta_glm_logitmean, closedform_beta_glm_logsamplesize +from batchglm.utils.linalg import groupwise_solve_lm + +from batchglm.train.tf2.base_glm import ProcessModelGLM, GLM, Estimator, ModelVarsGLM +from batchglm.train.tf2.base_glm import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM +from batchglm.train.tf2.base_glm import FIMGLM, JacobianGLM, HessianGLM diff --git a/batchglm/train/tf2/glm_beta/layers.py b/batchglm/train/tf2/glm_beta/layers.py new file mode 100644 index 00000000..0cfc0ef9 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/layers.py @@ -0,0 +1,53 @@ +import tensorflow as tf +from .external import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM +from .processModel import ProcessModel + + +class UnpackParams(UnpackParamsGLM, ProcessModel): + """ + Full class. + """ + + +class LinearLoc(LinearLocGLM, ProcessModel): + + def with_size_factors(self, eta_loc, size_factors): + raise NotImplementedError("There are no size_factors for GLMs with Beta noise.") + + +class LinearScale(LinearScaleGLM, ProcessModel): + """ + Full Class + """ + + +class LinkerLoc(LinkerLocGLM): + + def _inv_linker(self, loc: tf.Tensor): + return 1 / (1 + tf.exp(-loc)) + + +class LinkerScale(LinkerScaleGLM): + + def _inv_linker(self, scale: tf.Tensor): + return tf.exp(scale) + + +class Likelihood(LikelihoodGLM, ProcessModel): + + def _ll(self, eta_loc, eta_scale, loc, scale, x): + + if isinstance(x, tf.SparseTensor): + one_minus_x = tf.negative(tf.sparse.add(x, tf.negative(tf.ones_like(loc)))) + else: + one_minus_x = 1 - x + + one_minus_loc = 1 - loc + log_probs = tf.math.lgamma(scale) - tf.math.lgamma(loc * scale) \ + - tf.math.lgamma(one_minus_loc * scale) \ + + (scale * loc - 1) * tf.math.log(x) \ + + (one_minus_loc * scale - 1) * tf.math.log(one_minus_x) + + log_probs = self.tf_clip_param(log_probs, "log_probs") + + return log_probs diff --git a/batchglm/train/tf2/glm_beta/layers_gradients.py b/batchglm/train/tf2/glm_beta/layers_gradients.py new file mode 100644 index 00000000..566e9b44 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/layers_gradients.py @@ -0,0 +1,144 @@ +import tensorflow as tf +from .external import FIMGLM, JacobianGLM, HessianGLM + + +class FIM(FIMGLM): + # No Fisher Information Matrices due to unsolvable E[log(X)] + + def _weight_fim_aa( + self, + x, + loc, + scale + ): + assert False, "not implemented" + + def _weight_fim_bb( + self, + x, + loc, + scale + ): + assert False, "not implemented" + + +class Jacobian(JacobianGLM): + + def _weights_jac_a( + self, + x, + loc, + scale, + ): + one_minus_loc = 1 - loc + if isinstance(x, tf.SparseTensor): + const1 = tf.math.log(tf.sparse.add(tf.zeros_like(loc), x).__div__(-tf.sparse.add(x, -tf.ones_like(loc)))) + else: + const1 = tf.math.log(x / (1 - x)) + const2 = - tf.math.digamma(loc * 
scale) + tf.math.digamma(one_minus_loc * scale) + const1 + const = const2 * scale * loc * one_minus_loc + return const + + def _weights_jac_b( + self, + x, + loc, + scale, + ): + if isinstance(x, tf.SparseTensor): + one_minus_x = - tf.sparse.add(x, -tf.ones_like(loc)) + else: + one_minus_x = 1 - x + one_minus_loc = 1 - loc + const = scale * (tf.math.digamma(scale) - tf.math.digamma(loc * scale) * loc - tf.math.digamma( + one_minus_loc * scale) * one_minus_loc + loc * tf.math.log(x) + one_minus_loc * tf.math.log( + one_minus_x)) + return const + + +class Hessian(HessianGLM): + + def _weight_hessian_aa( + self, + x, + loc, + scale, + ): + one_minus_loc = 1 - loc + loc_times_scale = loc * scale + one_minus_loc_times_scale = one_minus_loc * scale + + if isinstance(x, tf.SparseTensor): + # Using the dense matrix of the location model to serve the correct shapes for the sparse X. + const1 = tf.sparse.add(tf.zeros_like(loc), x).__div__(-tf.sparse.add(x, -tf.ones_like(loc))) + # Adding tf.zeros_like(loc) is a hack to avoid bug thrown by log on sparse matrix below, + # to_dense does not work. + else: + const1 = tf.math.log(x / (tf.ones_like(x) - x)) + + const2 = (1 - 2 * loc) * ( + - tf.math.digamma(loc_times_scale) + tf.math.digamma(one_minus_loc_times_scale) + const1) + const3 = loc * one_minus_loc_times_scale * ( + - tf.math.polygamma(tf.ones_like(loc), loc_times_scale) - tf.math.polygamma(tf.ones_like(loc), + one_minus_loc_times_scale)) + const = loc * one_minus_loc_times_scale * (const2 + const3) + return const + + def _weight_hessian_ab( + self, + x, + loc, + scale, + ): + one_minus_loc = 1 - loc + loc_times_scale = loc * scale + one_minus_loc_times_scale = one_minus_loc * scale + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + + if isinstance(x, tf.SparseTensor): + # Using the dense matrix of the location model to serve the correct shapes for the sparse X. + const1 = tf.sparse.add(tf.zeros_like(loc), x).__div__(-tf.sparse.add(x, -tf.ones_like(loc))) + # Adding tf.zeros_like(loc) is a hack to avoid bug thrown by log on sparse matrix below, + # to_dense does not work. + else: + const1 = tf.math.log(x / (1 - x)) + + const2 = - tf.math.digamma(loc_times_scale) + tf.math.digamma(one_minus_loc_times_scale) + const1 + const3 = scale * (- tf.math.polygamma(scalar_one, loc_times_scale) * loc + one_minus_loc * tf.math.polygamma( + scalar_one, + one_minus_loc_times_scale)) + + const = loc * one_minus_loc_times_scale * (const2 + const3) + + return const + + def _weight_hessian_bb( + self, + x, + loc, + scale, + ): + one_minus_loc = 1 - loc + loc_times_scale = loc * scale + one_minus_loc_times_scale = one_minus_loc * scale + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + + if isinstance(x, tf.SparseTensor): + # Using the dense matrix of the location model to serve the correct shapes for the sparse X. + const1 = tf.sparse.add(tf.zeros_like(loc), x).__div__(-tf.sparse.add(x, -tf.ones_like(loc))) + # Adding tf.zeros_like(loc) is a hack to avoid bug thrown by log on sparse matrix below, + # to_dense does not work. 
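For reference, the densification trick described in the comment above can be isolated as in the following hedged sketch (not part of the diff); it assumes `x` is either dense or a `tf.SparseTensor` with the same dense shape as `loc`:

```python
import tensorflow as tf

def log_odds(x, loc):
    # tf.math.log has no SparseTensor kernel, so a sparse X is first broadcast-added
    # onto a dense zero tensor shaped like the location model output.
    if isinstance(x, tf.SparseTensor):
        x = tf.sparse.add(tf.zeros_like(loc), x)
    return tf.math.log(x / (1 - x))
```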
+ const2 = loc * (tf.math.log(tf.sparse.add(tf.zeros_like(loc), x)) - tf.math.digamma(loc_times_scale)) \ + - one_minus_loc * (tf.math.digamma(one_minus_loc_times_scale) + tf.math.log(const1)) \ + + tf.math.digamma(scale) + else: + const1 = tf.math.log(x / (1 - x)) + const2 = loc * (tf.math.log(x) - tf.math.digamma(loc_times_scale)) \ + - one_minus_loc * (tf.math.digamma(one_minus_loc_times_scale) + tf.math.log(const1)) \ + + tf.math.digamma(scale) + const3 = scale * (- tf.square(loc) * tf.math.polygamma(scalar_one, loc_times_scale) + + tf.math.polygamma(scalar_one, scale) + - tf.math.polygamma(scalar_one, one_minus_loc_times_scale) * tf.square(one_minus_loc)) + const = scale * (const2 + const3) + + return const diff --git a/batchglm/train/tf2/glm_beta/model.py b/batchglm/train/tf2/glm_beta/model.py new file mode 100644 index 00000000..556f5335 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/model.py @@ -0,0 +1,24 @@ +from .external import GLM +from .processModel import ProcessModel + + +class BetaGLM(GLM, ProcessModel): + + def __init__( + self, + model_vars, + optimizer: str, + compute_a: bool, + compute_b: bool, + use_gradient_tape: bool, + dtype: str, + ): + super(BetaGLM, self).__init__( + model_vars=model_vars, + noise_module='glm_beta', + optimizer=optimizer, + compute_a=compute_a, + compute_b=compute_b, + use_gradient_tape=use_gradient_tape, + dtype=dtype + ) diff --git a/batchglm/train/tf2/glm_beta/processModel.py b/batchglm/train/tf2/glm_beta/processModel.py new file mode 100644 index 00000000..c21811a4 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/processModel.py @@ -0,0 +1,45 @@ +from .external import ProcessModelGLM +import tensorflow as tf +import numpy as np +from .external import pkg_constants + + +class ProcessModel(ProcessModelGLM): + + def param_bounds( + self, + dtype + ): + if isinstance(dtype, tf.DType): + dmax = dtype.max + dtype = dtype.as_numpy_dtype + else: + dtype = np.dtype(dtype) + dmax = np.finfo(dtype).max + dtype = dtype.type + + zero = np.nextafter(0, np.inf, dtype=dtype) + one = np.nextafter(1, -np.inf, dtype=dtype) + + sf = dtype(pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT) + bounds_min = { + "a_var": np.log(zero / (1 - zero)) / sf, + "b_var": np.log(zero) / sf, + "eta_loc": np.log(zero / (1 - zero)) / sf, + "eta_scale": np.log(zero) / sf, + "mean": np.nextafter(0, np.inf, dtype=dtype), + "samplesize": np.nextafter(0, np.inf, dtype=dtype), + "probs": dtype(0), + "log_probs": np.log(zero), + } + bounds_max = { + "a_var": np.log(one / (1 - one)) / sf, + "b_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "eta_loc": np.log(one / (1 - one)) / sf, + "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "mean": one, + "samplesize": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "probs": dtype(1), + "log_probs": dtype(0), + } + return bounds_min, bounds_max diff --git a/batchglm/train/tf2/glm_beta/training_strategies.py b/batchglm/train/tf2/glm_beta/training_strategies.py new file mode 100644 index 00000000..b6db5b22 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/training_strategies.py @@ -0,0 +1,37 @@ +from enum import Enum + +class TrainingStrategies(Enum): + + AUTO = None + DEFAULT = [ + { + "convergence_criteria": "all_converged_ll", + "stopping_criteria": 1e-8, + "use_batching": False, + "optim_algo": "nr_tr", + }, + ] + INEXACT = [ + { + "convergence_criteria": "all_converged_ll", + "stopping_criteria": 1e-6, + "use_batching": False, + "optim_algo": "nr_tr", + }, + ] + EXACT = [ + { + "convergence_criteria": 
"all_converged_ll", + "stopping_criteria": 1e-8, + "use_batching": False, + "optim_algo": "nr_tr", + }, + ] + IRLS = [ + { + "convergence_criteria": "all_converged_ll", + "stopping_criteria": 1e-8, + "use_batching": False, + "optim_algo": "irls_tr", + }, + ] diff --git a/batchglm/train/tf2/glm_beta/vars.py b/batchglm/train/tf2/glm_beta/vars.py new file mode 100644 index 00000000..b1200abc --- /dev/null +++ b/batchglm/train/tf2/glm_beta/vars.py @@ -0,0 +1,8 @@ +from .model import ProcessModel +from .external import ModelVarsGLM + + +class ModelVars(ProcessModel, ModelVarsGLM): + """ + Full class. + """ diff --git a/batchglm/train/tf2/glm_nb/__init__.py b/batchglm/train/tf2/glm_nb/__init__.py new file mode 100644 index 00000000..f8cd6ee7 --- /dev/null +++ b/batchglm/train/tf2/glm_nb/__init__.py @@ -0,0 +1,5 @@ +from .processModel import ProcessModel +from .vars import ModelVars +from .estimator import Estimator + +from .model import NBGLM diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py new file mode 100644 index 00000000..4cabc8ac --- /dev/null +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -0,0 +1,305 @@ +import logging +from typing import Union +import numpy as np + +from .external import InputDataGLM, Model +from .external import closedform_nb_glm_logmu, closedform_nb_glm_logphi +from .model import NBGLM +from .vars import ModelVars +from .processModel import ProcessModel +from .external import Estimator as GLMEstimator +from .training_strategies import TrainingStrategies + +# needed for train_irls_ls_tr +from .optim import IRLS_LS + + +class Estimator(GLMEstimator, ProcessModel): + """ + Estimator for Generalized Linear Models (GLMs) with negative binomial noise. + Uses the natural logarithm as linker function. + """ + model: NBGLM + + def __init__( + self, + input_data: InputDataGLM, + init_a: Union[np.ndarray, str] = "AUTO", + init_b: Union[np.ndarray, str] = "AUTO", + quick_scale: bool = False, + dtype="float32", + ): + """ + Performs initialisation and creates a new estimator. + + :param input_data: InputDataGLM + The input data + :param init_a: (Optional) + Low-level initial values for a. Can be: + + - str: + * "auto": automatically choose best initialization + * "random": initialize with random values + * "standard": initialize intercept with observed mean + * "init_model": initialize with another model (see `ìnit_model` parameter) + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'a' + :param init_b: (Optional) + Low-level initial values for b. Can be: + + - str: + * "auto": automatically choose best initialization + * "random": initialize with random values + * "standard": initialize with zeros + * "init_model": initialize with another model (see `ìnit_model` parameter) + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'b' + :param quick_scale: bool + Whether `scale` will be fitted faster and maybe less accurate. + Useful in scenarios where fitting the exact `scale` is not absolutely necessary. + :param dtype: Precision used in tensorflow. 
+ """ + self.TrainingStrategies = TrainingStrategies + self._train_loc = True + self._train_scale = True + + (init_a, init_b) = self.init_par( + input_data=input_data, + init_a=init_a, + init_b=init_b, + init_model=None + ) + init_a = init_a.astype(dtype) + init_b = init_b.astype(dtype) + if quick_scale: + self._train_scale = False + + self.model_vars = ModelVars( + init_a=init_a, + init_b=init_b, + constraints_loc=input_data.constraints_loc, + constraints_scale=input_data.constraints_scale, + dtype=dtype + ) + + super(Estimator, self).__init__( + input_data=input_data, + dtype=dtype + ) + + def train( + self, + use_batching: bool = True, + batch_size: int = 5000, + optim_algo: str = "adam", + learning_rate: float = 1e-2, + convergence_criteria: str = "step", + stopping_criteria: int = 1000, + autograd: bool = False, + featurewise: bool = True, + benchmark: bool = False, + maxiter: int = 1, + b_update_freq = 1 + ): + if self.model is None: + self.model = NBGLM( + model_vars=self.model_vars, + dtype=self.model_vars.dtype, + compute_a=self._train_loc, + compute_b=self._train_scale, + use_gradient_tape=autograd, + optimizer=optim_algo + ) + else: + self.model.setMethod(optim_algo) + + intercept_scale = len(self.model.model_vars.idx_train_scale) == 1 + optimizer_object = self.get_optimizer_object(optim_algo, learning_rate, intercept_scale) + self.optimizer = optimizer_object + if optim_algo.lower() in ['irls_gd_tr', 'irls_ar_tr', 'irls_tr_gd_tr']: + self.update = self.update_separated + self.maxiter = maxiter + + super(Estimator, self)._train( + noise_model="nb", + is_batched=use_batching, + batch_size=batch_size, + optimizer_object=optimizer_object, + convergence_criteria=convergence_criteria, + stopping_criteria=stopping_criteria, + autograd=autograd, + featurewise=featurewise, + benchmark=benchmark, + optim_algo=optim_algo, + b_update_freq=b_update_freq + ) + + def get_optimizer_object(self, optimizer, learning_rate, intercept_scale): + optim = optimizer.lower() + if optim in ['irls_gd_tr', 'irls_gd', 'irls_ar_tr', 'irls_tr_gd_tr']: + return IRLS_LS( + dtype=self.dtype, + tr_mode=optim.endswith('tr'), + model=self.model, + name=optim, + n_obs=self.input_data.num_observations, + intercept_scale=intercept_scale) + return super().get_optimizer_object(optimizer, learning_rate) + + def update_separated(self, results, batches, batch_features, compute_b): + + self.optimizer.perform_parameter_update( + inputs=[batches, *results], + compute_a=True, + compute_b=False, + batch_features=batch_features, + is_batched=False + ) + if compute_b: + self.optimizer.perform_parameter_update( + inputs=[batches, *results], + compute_a=False, + compute_b=True, + batch_features=batch_features, + is_batched=False, + maxiter=self.maxiter + ) + else: + self.model.model_vars.updated_b = np.zeros_like(self.model.model_vars.updated_b) + + + def get_model_container( + self, + input_data + ): + return Model(input_data=input_data) + + def init_par( + self, + input_data, + init_a, + init_b, + init_model + ): + r""" + standard: + Only initialise intercept and keep other coefficients as zero. 
+ + closed-form: + Initialize with Maximum Likelihood / Maximum of Momentum estimators + + Idea: + $$ + \theta &= f(x) \\ + \Rightarrow f^{-1}(\theta) &= x \\ + &= (D \cdot D^{+}) \cdot x \\ + &= D \cdot (D^{+} \cdot x) \\ + &= D \cdot x' = f^{-1}(\theta) + $$ + """ + + if init_model is None: + groupwise_means = None + init_a_str = None + if isinstance(init_a, str): + init_a_str = init_a.lower() + # Chose option if auto was chosen + if init_a.lower() == "auto": + init_a = "standard" + + if init_a.lower() == "closed_form": + groupwise_means, init_a, rmsd_a = closedform_nb_glm_logmu( + x=input_data.x, + design_loc=input_data.design_loc, + constraints_loc=input_data.constraints_loc, + size_factors=input_data.size_factors, + link_fn=lambda loc: np.log(self.np_clip_param(loc, "loc")) + ) + + # train mu, if the closed-form solution is inaccurate + self._train_loc = not (np.all(rmsd_a == 0) or rmsd_a.size == 0) + + if input_data.size_factors is not None: + if np.any(input_data.size_factors != 1): + self._train_loc = True + + logging.getLogger("batchglm").debug("Using closed-form MLE initialization for mean") + logging.getLogger("batchglm").debug("Should train loc: %s", self._train_loc) + elif init_a.lower() == "standard": + overall_means = np.mean(input_data.x, axis=0) # directly calculate the mean + overall_means = self.np_clip_param(overall_means, "loc") + + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + init_a[0, :] = np.log(overall_means) + self._train_loc = True + + logging.getLogger("batchglm").debug("Using standard initialization for mean") + logging.getLogger("batchglm").debug("Should train loc: %s", self._train_loc) + elif init_a.lower() == "all_zero": + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + self._train_loc = True + + logging.getLogger("batchglm").debug("Using all_zero initialization for mean") + logging.getLogger("batchglm").debug("Should train loc: %s", self._train_loc) + else: + raise ValueError("init_a string %s not recognized" % init_a) + + if isinstance(init_b, str): + if init_b.lower() == "auto": + init_b = "standard" + + if init_b.lower() == "standard": + groupwise_scales, init_b_intercept, rmsd_b = closedform_nb_glm_logphi( + x=input_data.x, + design_scale=input_data.design_scale[:, [0]], + constraints=input_data.constraints_scale[[0], :][:, [0]], + size_factors=input_data.size_factors, + groupwise_means=None, + link_fn=lambda scale: np.log(self.np_clip_param(scale, "scale")) + ) + init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) + init_b[0, :] = init_b_intercept + + logging.getLogger("batchglm").debug("Using standard-form MME initialization for dispersion") + logging.getLogger("batchglm").debug("Should train scale: %s", self._train_scale) + elif init_b.lower() == "closed_form": + dmats_unequal = False + if input_data.design_loc.shape[1] == input_data.design_scale.shape[1]: + if np.any(input_data.design_loc != input_data.design_scale): + dmats_unequal = True + + inits_unequal = False + if init_a_str is not None: + if init_a_str != init_b: + inits_unequal = True + + if inits_unequal or dmats_unequal: + raise ValueError( + "cannot use closed_form init for scale model if scale model differs from loc model" + ) + + groupwise_scales, init_b, rmsd_b = closedform_nb_glm_logphi( + x=input_data.x, + design_scale=input_data.design_scale, + constraints=input_data.constraints_scale, + size_factors=input_data.size_factors, + groupwise_means=groupwise_means, + link_fn=lambda scale: 
np.log(self.np_clip_param(scale, "scale")) + ) + + logging.getLogger("batchglm").debug("Using closed-form MME initialization for dispersion") + logging.getLogger("batchglm").debug("Should train scale: %s", self._train_scale) + elif init_b.lower() == "all_zero": + init_b = np.zeros([input_data.num_scale_params, input_data.x.shape[1]]) + + logging.getLogger("batchglm").debug("Using standard initialization for dispersion") + logging.getLogger("batchglm").debug("Should train scale: %s", self._train_scale) + else: + raise ValueError("init_b string %s not recognized" % init_b) + else: + init_a, init_b = self.get_init_from_model(init_a=init_a, + init_b=init_b, + input_data=input_data, + init_model=init_model) + + return init_a, init_b diff --git a/batchglm/train/tf2/glm_nb/external.py b/batchglm/train/tf2/glm_nb/external.py new file mode 100644 index 00000000..2d6638fa --- /dev/null +++ b/batchglm/train/tf2/glm_nb/external.py @@ -0,0 +1,21 @@ +import batchglm.data as data_utils + +from batchglm.models.glm_nb import _EstimatorGLM, InputDataGLM, Model +from batchglm.models.base_glm.utils import closedform_glm_mean, closedform_glm_scale +from batchglm.models.glm_nb.utils import closedform_nb_glm_logmu, closedform_nb_glm_logphi + +from batchglm.utils.linalg import groupwise_solve_lm +from batchglm import pkg_constants + +from batchglm.train.tf2.base_glm import GLM +from batchglm.train.tf2.base_glm import ProcessModelGLM, ModelVarsGLM + +# import necessary base_glm layers +from batchglm.train.tf2.base_glm import LinearLocGLM, LinearScaleGLM, LinkerLocGLM +from batchglm.train.tf2.base_glm import LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM +from batchglm.train.tf2.base_glm import FIMGLM, JacobianGLM, HessianGLM +from batchglm.train.tf2.base_glm import Estimator + +# these are needed for nb specific irls_ls_tr training +from batchglm.train.tf2.base_glm import IRLS +from batchglm.train.tf2.base_glm import DataGenerator, ConvergenceCalculator diff --git a/batchglm/train/tf2/glm_nb/layers.py b/batchglm/train/tf2/glm_nb/layers.py new file mode 100644 index 00000000..afa67f7e --- /dev/null +++ b/batchglm/train/tf2/glm_nb/layers.py @@ -0,0 +1,59 @@ +import tensorflow as tf +from .processModel import ProcessModel +from .external import LinearLocGLM, LinearScaleGLM, LinkerLocGLM +from .external import LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM + + +class UnpackParams(UnpackParamsGLM, ProcessModel): + """ + Full class. + """ + + +class LinearLoc(LinearLocGLM, ProcessModel): + + def with_size_factors(self, eta_loc, size_factors): + return tf.add(eta_loc, tf.math.log(size_factors)) + + +class LinearScale(LinearScaleGLM, ProcessModel): + """ + Full class. + """ + + +class LinkerLoc(LinkerLocGLM): + + def _inv_linker(self, loc: tf.Tensor): + return tf.exp(loc) + + +class LinkerScale(LinkerScaleGLM): + + def _inv_linker(self, scale: tf.Tensor): + return tf.exp(scale) + + +class Likelihood(LikelihoodGLM, ProcessModel): + + def _ll(self, eta_loc, eta_scale, loc, scale, x): + + # Log-likelihood: + log_r_plus_mu = tf.math.log(scale + loc) + if isinstance(x, tf.SparseTensor): + log_probs_sparse = x.__mul__(eta_loc - log_r_plus_mu) + log_probs_dense = tf.math.lgamma(tf.sparse.add(x, scale)) - \ + tf.math.lgamma(tf.sparse.add(x, tf.ones(shape=x.dense_shape, dtype=self.dtype))) - \ + tf.math.lgamma(scale) + \ + tf.multiply(scale, eta_scale - log_r_plus_mu) + log_probs = tf.sparse.add(log_probs_sparse, log_probs_dense) + # log_probs.set_shape([None, n_features]) # need as shape completely lost. 
+ else: + log_probs = tf.math.lgamma(scale + x) - \ + tf.math.lgamma(x + tf.ones_like(x)) - \ + tf.math.lgamma(scale) + \ + tf.multiply(x, eta_loc - log_r_plus_mu) + \ + tf.multiply(scale, eta_scale - log_r_plus_mu) + + log_probs = self.tf_clip_param(log_probs, "log_probs") + return log_probs diff --git a/batchglm/train/tf2/glm_nb/layers_gradients.py b/batchglm/train/tf2/glm_nb/layers_gradients.py new file mode 100644 index 00000000..8ff079c6 --- /dev/null +++ b/batchglm/train/tf2/glm_nb/layers_gradients.py @@ -0,0 +1,144 @@ +import tensorflow as tf +from .external import FIMGLM, JacobianGLM, HessianGLM + + +class FIM(FIMGLM): + + def _weight_fim_aa( + self, + x, + loc, + scale + ): + const = tf.divide(scale, scale + loc) + w = tf.multiply(loc, const) + + return w + + def _weight_fim_bb( + self, + x, + loc, + scale + ): + return tf.zeros_like(scale) + + +class Jacobian(JacobianGLM): + + def _weights_jac_a( + self, + x, + loc, + scale, + ): + if isinstance(x, tf.SparseTensor): # or isinstance(x, tf.SparseTensorValue): + const = tf.sparse.add(x, tf.negative(loc)) + else: + const = tf.subtract(x, loc) + return tf.divide(tf.multiply(scale, const), tf.add(loc, scale)) + + def _weights_jac_b(self, x, loc, scale): + # Pre-define sub-graphs that are used multiple times: + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + if isinstance(x, tf.SparseTensor): # or isinstance(x, tf.SparseTensorValue): + scale_plus_x = tf.sparse.add(x, scale) + else: + scale_plus_x = scale + x + + r_plus_mu = scale + loc + + # Define graphs for individual terms of constant term of hessian: + const1 = tf.subtract( + tf.math.digamma(x=scale_plus_x), + tf.math.digamma(x=scale) + ) + const2 = tf.negative(scale_plus_x / r_plus_mu) + const3 = tf.add( + tf.math.log(scale), + scalar_one - tf.math.log(r_plus_mu) + ) + const = tf.add_n([const1, const2, const3]) # [observations, features] + const = scale * const + + return const + + +class Hessian(HessianGLM): + + def _weight_hessian_ab(self, x, loc, scale): + + if isinstance(x, tf.SparseTensor): + x_minus_mu = tf.sparse.add(x, -loc) + else: + x_minus_mu = x - loc + + const = tf.multiply( + loc * scale, + tf.divide( + x_minus_mu, + tf.square(loc + scale) + ) + ) + + return const + + def _weight_hessian_aa( + self, + x, + loc, + scale, + ): + if isinstance(x, tf.SparseTensor):# or isinstance(x, tf.SparseTensorValue): + x_by_scale_plus_one = tf.sparse.add(x.__div__(scale), tf.ones_like(scale)) + else: + x_by_scale_plus_one = x / scale + tf.ones_like(scale) + + const = tf.negative(tf.multiply( + loc, + tf.divide( + x_by_scale_plus_one, + tf.square((loc / scale) + tf.ones_like(loc)) + ) + )) + + return const + + def _weight_hessian_bb( + self, + x, + loc, + scale, + ): + if isinstance(x, tf.SparseTensor):# or isinstance(x, tf.SparseTensorValue): + scale_plus_x = tf.sparse.add(x, scale) + else: + scale_plus_x = x + scale + + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + scalar_two = tf.constant(2, shape=(), dtype=self.dtype) + # Pre-define sub-graphs that are used multiple times: + scale_plus_loc = scale + loc + # Define graphs for individual terms of constant term of hessian: + const1 = tf.add( + tf.math.digamma(x=scale_plus_x), + scale * tf.math.polygamma(a=scalar_one, x=scale_plus_x) + ) + const2 = tf.negative(tf.add( + tf.math.digamma(x=scale), + scale * tf.math.polygamma(a=scalar_one, x=scale) + )) + const3 = tf.negative(tf.divide( + tf.add( + loc * scale_plus_x, + scalar_two * scale * scale_plus_loc + ), + tf.square(scale_plus_loc) + )) + const4 = tf.add( + 
tf.math.log(scale), + scalar_two - tf.math.log(scale_plus_loc) + ) + const = tf.add_n([const1, const2, const3, const4]) + const = tf.multiply(scale, const) + return const diff --git a/batchglm/train/tf2/glm_nb/model.py b/batchglm/train/tf2/glm_nb/model.py new file mode 100644 index 00000000..bd2dbe66 --- /dev/null +++ b/batchglm/train/tf2/glm_nb/model.py @@ -0,0 +1,24 @@ +from .external import GLM +from .processModel import ProcessModel + + +class NBGLM(GLM, ProcessModel): + + def __init__( + self, + model_vars, + optimizer: str, + compute_a: bool, + compute_b: bool, + use_gradient_tape: bool, + dtype: str, + ): + super(NBGLM, self).__init__( + model_vars=model_vars, + noise_module='glm_nb', + optimizer=optimizer, + compute_a=compute_a, + compute_b=compute_b, + use_gradient_tape=use_gradient_tape, + dtype=dtype + ) diff --git a/batchglm/train/tf2/glm_nb/optim.py b/batchglm/train/tf2/glm_nb/optim.py new file mode 100644 index 00000000..ebab6441 --- /dev/null +++ b/batchglm/train/tf2/glm_nb/optim.py @@ -0,0 +1,275 @@ +import tensorflow as tf +import numpy as np +from .external import IRLS, pkg_constants + +class IRLS_LS(IRLS): + + def __init__(self, dtype, tr_mode, model, name, n_obs, intercept_scale): + + parent_tr_mode = False + self.tr_mode_b = False + if name.startswith('irls_tr'): + parent_tr_mode = True # for loc model + if name in ['irls_tr_gd_tr', 'irls_gd_tr', 'irls_gd', 'irls_tr_gd']: + self.update_b_func = self.update_b_gd + elif name in ['irls_ar', 'irls_tr_ar']: + assert intercept_scale, "Line search (armijo) is only available" \ + "for scale models with a single coefficient (intercept scale)." + self.update_b_func = self.update_b_ar + else: + assert False, "Unrecognized method for optimization given." + super(IRLS_LS, self).__init__( + dtype=dtype, + tr_mode=parent_tr_mode, + model=model, + name=name, + n_obs=n_obs) + + if tr_mode: + n_features = self.model.model_vars.n_features + self.tr_radius_b = tf.Variable( + np.zeros(shape=[n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT_SCALE, + dtype=self._dtype, trainable=False) + self.tr_mode_b = True + + def _trust_region_linear_cost_gain( + self, + proposed_vector, + neg_jac + ): + pred_cost_gain = tf.reduce_sum(tf.multiply( + proposed_vector, + tf.transpose(neg_jac) + ), axis=0) + return pred_cost_gain + + def perform_parameter_update( + self, + inputs, + compute_a=True, + compute_b=True, + batch_features=False, + is_batched=False, + maxiter=1 + ): + assert compute_a ^ compute_b, \ + "IRLS_LS computes either loc or scale model updates, not both nor none at the same time." 
+ + if compute_a: + super(IRLS_LS, self).perform_parameter_update( + inputs, compute_a, compute_b, batch_features, is_batched) + else: + # global_step = tf.zeros_like(self.model.model_vars.remaining_features) + results = inputs[1:4] + x_batches = inputs[0] + iteration = 0 + not_converged = np.zeros_like(self.model.model_vars.remaining_features) + updated_b = np.zeros_like(self.model.model_vars.updated_b) + while True: + iteration += 1 + step = self.update_b_func([x_batches, *results], batch_features, is_batched) + not_converged = tf.abs(step).numpy() > pkg_constants.XTOL_BY_FEATURE_SCALE + updated_b |= self.model.model_vars.updated_b + if not tf.reduce_any(not_converged) or iteration == maxiter: + break + for i, x_batch in enumerate(inputs[0]): + results = self.model.calc_jacobians(x_batch, concat=False, compute_a=False) if i == 0 else \ + [tf.math.add(results[i], x) for + i, x in enumerate(self.model.calc_jacobians(x_batch, concat=False, compute_a=False))] + self.model.model_vars.updated_b = updated_b + + def gett1t2(self): + t1 = tf.constant(pkg_constants.TRUST_REGIONT_T1_IRLS_GD_TR_SCALE, dtype=self._dtype) + t2 = tf.constant(pkg_constants.TRUST_REGIONT_T2_IRLS_GD_TR_SCALE, dtype=self._dtype) + return t1, t2 + + def _trust_region_update_b( + self, + update_raw, + radius_container, + ): + update_magnitude, update_magnitude_inv = IRLS_LS._calc_update_magnitudes(update_raw) + update_norm = tf.multiply(update_raw, update_magnitude_inv) + + update_magnitude = update_magnitude / self.n_obs * radius_container + + update_scale = tf.minimum( + radius_container, + update_magnitude + ) + proposed_vector = tf.multiply( + update_norm, + update_scale + ) + + return proposed_vector + def update_b_gd(self, inputs, batch_features, is_batched): + + x_batches, log_probs, _, jac_b = inputs + + update_b = tf.transpose(jac_b) + if not self.tr_mode_b: + update = self._pad_updates( + update_raw=update_b, + compute_a=False, + compute_b=True + ) + + update_theta = self._trial_update( + x_batches=x_batches, + log_probs=log_probs, + proposed_vector=update, + is_batched=is_batched, + compute_a=False, + compute_b=True + ) + self.model.params_copy.assign_sub(update) + + return tf.where(update_theta, update, tf.zeros_like(update)) + + else: + if batch_features: + radius_container = tf.boolean_mask( + tensor=self.tr_radius_b, + mask=self.model.model_vars.remaining_features) + else: + radius_container = self.tr_radius_b + + tr_proposed_vector_b = self._trust_region_update_b( + update_raw=update_b, + radius_container=radius_container + ) + + tr_update_b = self._pad_updates( + update_raw=tr_proposed_vector_b, + compute_a=False, + compute_b=True + ) + + # perform update + update_theta = self._trial_update( + x_batches=x_batches, + log_probs=log_probs, + proposed_vector=tr_update_b, + is_batched=is_batched, + compute_a=False, + compute_b=True) + self._trust_region_ops( + proposed_vector=tr_update_b, + compute_a=False, + compute_b=True, + batch_features=batch_features, + update_theta=update_theta) + + return tf.where(update_theta, tr_proposed_vector_b, tf.zeros_like(tr_proposed_vector_b)) + + def update_b_ar(self, inputs, batch_features, is_batched, alpha0=None): + + x_batches, log_probs, _, jac_b = inputs + jac_b = tf.reshape(jac_b, [jac_b.shape[0]]) + direction = -tf.sign(jac_b) + derphi0 = jac_b / self.n_obs + if alpha0 is None: + alpha0 = tf.ones_like(jac_b) * pkg_constants.ALPHA0 + original_params_b_copy = self.model.params_copy[-1] + + def phi(alpha): + multiplier = tf.multiply(alpha, direction) + new_scale_params = 
tf.add(original_params_b_copy, multiplier) + self.model.params_copy[-1].assign(new_scale_params) + new_likelihood = None + for i, x_batch in enumerate(x_batches): + log_likelihood = self.model.calc_ll([*x_batch])[0] + new_likelihood = log_likelihood if i == 0 else \ + tf.math.add(new_likelihood, log_likelihood) + new_likelihood = self._norm_neg_log_likelihood(new_likelihood) + return new_likelihood + + current_likelihood = self._norm_neg_log_likelihood(log_probs) + new_likelihood = phi(alpha0) + beneficial = self.wolfe1(current_likelihood, new_likelihood, alpha0, derphi0) + + if tf.reduce_all(beneficial): # are all beneficial? + updated = beneficial + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(self.model.model_vars.remaining_features) + updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) + self.model.model_vars.updated_b = updated + return tf.multiply(alpha0, direction) + + divisor = new_likelihood - current_likelihood - derphi0 * alpha0 + alpha1 = tf.negative(derphi0) * alpha0**2 / 2 / divisor + alpha1 = tf.where(beneficial, alpha0, alpha1) + new_likelihood2 = phi(alpha1) + beneficial = self.wolfe1(current_likelihood, new_likelihood2, alpha1, derphi0) + if tf.reduce_all(beneficial): + updated = beneficial + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(self.model.model_vars.remaining_features) + updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) + self.model.model_vars.updated_b = updated + return tf.multiply(alpha1, direction) + + if not tf.reduce_any(alpha1 > pkg_constants.XTOL_BY_FEATURE_SCALE): + # catch in case it doesn't enter the loop. + new_scale_params = tf.where(beneficial, self.model.params_copy[-1], original_params_b_copy) + self.model.params_copy[-1].assign(new_scale_params) + self.model.model_vars.updated_b = np.ones_like(self.model.model_vars.updated_b) + return tf.multiply(alpha1, direction) + + while tf.reduce_any(alpha1 > pkg_constants.XTOL_BY_FEATURE_SCALE): + + factor = alpha0**2 * alpha1**2 * (alpha1-alpha0) + a = alpha0**2 * (new_likelihood2 - current_likelihood - derphi0 * alpha1) - \ + alpha1**2 * (new_likelihood - current_likelihood - derphi0 * alpha0) + a = a / factor + + b = -alpha0**3 * (new_likelihood2 - current_likelihood - derphi0 * alpha1) + \ + alpha1**3 * (new_likelihood - current_likelihood - derphi0 * alpha0) + b = b / factor + + alpha2 = (-b + tf.sqrt(tf.abs(tf.square(b) - 3 * a * derphi0))) / (3 * a) + alpha2 = tf.where(beneficial, alpha1, alpha2) + idx_to_clip = tf.logical_or(tf.math.is_nan(alpha2), alpha2 < 0) + alpha2 = tf.where(idx_to_clip, tf.zeros_like(alpha2), alpha2) + new_likelihood3 = phi(alpha2) + beneficial = self.wolfe1(current_likelihood, new_likelihood3, alpha2, derphi0) + + if tf.reduce_all(beneficial): + updated = beneficial + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(self.model.model_vars.remaining_features) + updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) + self.model.model_vars.updated_b = updated + return tf.multiply(alpha2, direction) + + step_diff_greater_half_alpha1 = (alpha1 - alpha2) > alpha1 / 2 + ratio = (1 - alpha2/alpha1) < 0.96 + set_back = tf.logical_or(step_diff_greater_half_alpha1, ratio) + alpha2 = tf.where(set_back, alpha1 / 2, alpha2) + alpha2 = tf.where(tf.logical_or(tf.math.is_nan(alpha2), alpha2 < 0), tf.zeros_like(alpha2), alpha2) + + alpha0 = alpha1 + alpha1 = alpha2 + new_likelihood = new_likelihood2 + new_likelihood2 = 
new_likelihood3 + + new_scale_params = tf.where(beneficial, self.model.params_copy[-1], original_params_b_copy) + self.model.params_copy[-1].assign(new_scale_params) + updated = beneficial + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(self.model.model_vars.remaining_features) + updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) + self.model.model_vars.updated_b = np.ones_like(self.model.model_vars.updated_b) + return tf.multiply(alpha2, direction) + + def wolfe1(self, current_likelihood, new_likelihood, alpha, jacobian): + """Checks if an update satisfies the first wolfe condition by returning the difference + to the previous likelihood.""" + c1 = tf.constant(pkg_constants.WOLFE_C1, dtype=self._dtype) + limit = tf.add(current_likelihood, tf.multiply(tf.multiply(c1, alpha), jacobian)) + return new_likelihood < limit diff --git a/batchglm/train/tf2/glm_nb/processModel.py b/batchglm/train/tf2/glm_nb/processModel.py new file mode 100644 index 00000000..6a177f7f --- /dev/null +++ b/batchglm/train/tf2/glm_nb/processModel.py @@ -0,0 +1,42 @@ +from .external import ProcessModelGLM +import tensorflow as tf +import numpy as np +from .external import pkg_constants + + +class ProcessModel(ProcessModelGLM): + + def param_bounds( + self, + dtype + ): + if isinstance(dtype, tf.DType): + dmax = dtype.max + dtype = dtype.as_numpy_dtype + else: + dtype = np.dtype(dtype) + dmax = np.finfo(dtype).max + dtype = dtype.type + + sf = dtype(pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT) + bounds_min = { + "a_var": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "b_var": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "eta_loc": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "eta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "loc": np.nextafter(0, np.inf, dtype=dtype), + "scale": np.nextafter(0, np.inf, dtype=dtype), + "probs": dtype(0), + "log_probs": np.log(np.nextafter(0, np.inf, dtype=dtype)), + } + bounds_max = { + "a_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "b_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "eta_loc": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "loc": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "scale": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "probs": dtype(1), + "log_probs": dtype(0), + } + return bounds_min, bounds_max diff --git a/batchglm/train/tf2/glm_nb/training_strategies.py b/batchglm/train/tf2/glm_nb/training_strategies.py new file mode 100644 index 00000000..18c35302 --- /dev/null +++ b/batchglm/train/tf2/glm_nb/training_strategies.py @@ -0,0 +1,40 @@ +from enum import Enum + +class TrainingStrategies(Enum): + + AUTO = None + DEFAULT = [ + { + "convergence_criteria": "all_converged", + "use_batching": False, + "optim_algo": "irls_gd_tr", + }, + ] + IRLS = [ + { + "convergence_criteria": "all_converged", + "use_batching": False, + "optim_algo": "irls_tr_gd_tr", + }, + ] + IRLS_BATCHED = [ + { + "convergence_criteria": "all_converged", + "use_batching": True, + "optim_algo": "irls_gd_tr", + }, + ] + ADAM_BATCHED = [ + { + "convergence_criteria": "all_converged", + "use_batching": True, + "optim_algo": "adam", + }, + ] + ADAM = [ + { + "convergence_criteria": "all_converged", + "use_batching": False, + "optim_algo": "adam", + }, + ] diff --git a/batchglm/train/tf2/glm_nb/vars.py b/batchglm/train/tf2/glm_nb/vars.py new file mode 100644 index 00000000..b1200abc --- 
/dev/null
+++ b/batchglm/train/tf2/glm_nb/vars.py
@@ -0,0 +1,8 @@
+from .model import ProcessModel
+from .external import ModelVarsGLM
+
+
+class ModelVars(ProcessModel, ModelVarsGLM):
+    """
+    Full class.
+    """
diff --git a/batchglm/train/tf2/glm_norm/__init__.py b/batchglm/train/tf2/glm_norm/__init__.py
new file mode 100644
index 00000000..b6bf02af
--- /dev/null
+++ b/batchglm/train/tf2/glm_norm/__init__.py
@@ -0,0 +1,5 @@
+from .processModel import ProcessModel
+from .vars import ModelVars
+from .estimator import Estimator
+
+from .model import NormGLM
diff --git a/batchglm/train/tf2/glm_norm/estimator.py b/batchglm/train/tf2/glm_norm/estimator.py
new file mode 100644
index 00000000..b66176da
--- /dev/null
+++ b/batchglm/train/tf2/glm_norm/estimator.py
@@ -0,0 +1,276 @@
+import logging
+import numpy as np
+import scipy.sparse
+from typing import Union
+
+from .external import closedform_norm_glm_logsd
+from .external import InputDataGLM, Model
+from .external import Estimator as GLMEstimator
+from .model import NormGLM
+from .processModel import ProcessModel
+from .vars import ModelVars
+from .training_strategies import TrainingStrategies
+
+
+logger = logging.getLogger("batchglm")
+
+
+class Estimator(GLMEstimator, ProcessModel):
+    """
+    Estimator for Generalized Linear Models (GLMs) with normally distributed noise.
+    Uses the identity function as linker function for loc and a log-linker function for scale.
+    """
+
+    model: NormGLM
+
+    def __init__(
+            self,
+            input_data: InputDataGLM,
+            init_a: Union[np.ndarray, str] = "AUTO",
+            init_b: Union[np.ndarray, str] = "AUTO",
+            quick_scale: bool = False,
+            dtype="float32",
+    ):
+        """
+        Performs initialisation and creates a new estimator.
+
+        :param input_data: InputDataGLM
+            The input data
+        :param init_a: (Optional)
+            Low-level initial values for a. Can be:
+
+            - str:
+                * "auto": automatically choose best initialization
+                * "all_zero": initialize with zeros
+                * "random": initialize with random values
+                * "standard": initialize intercept with observed mean
+                * "init_model": initialize with another model (see `init_model` parameter)
+                * "closed_form": try to initialize with closed form
+            - np.ndarray: direct initialization of 'a'
+        :param init_b: (Optional)
+            Low-level initial values for b. Can be:
+
+            - str:
+                * "auto": automatically choose best initialization
+                * "random": initialize with random values
+                * "standard": initialize intercept with a closed-form estimate, other coefficients with zeros
+                * "init_model": initialize with another model (see `init_model` parameter)
+                * "closed_form": try to initialize with closed form
+            - np.ndarray: direct initialization of 'b'
+        :param quick_scale: bool
+            Whether `scale` will be fitted faster and maybe less accurately.
+            Useful in scenarios where fitting the exact `scale` is not absolutely necessary.
+        :param dtype: Precision used in tensorflow.
+ """ + self.TrainingStrategies = TrainingStrategies + self._train_loc = True + self._train_scale = True + + (init_a, init_b) = self.init_par( + input_data=input_data, + init_a=init_a, + init_b=init_b, + init_model=None + ) + init_a = init_a.astype(dtype) + init_b = init_b.astype(dtype) + if quick_scale: + self._train_scale = False + + self.model_vars = ModelVars( + init_a=init_a, + init_b=init_b, + constraints_loc=input_data.constraints_loc, + constraints_scale=input_data.constraints_scale, + dtype=dtype + ) + + super(Estimator, self).__init__( + input_data=input_data, + dtype=dtype + ) + + def train( + self, + use_batching: bool = True, + batch_size: int = 500, + optim_algo: str = "adam", + learning_rate: float = 1e-2, + convergence_criteria: str = "step", + stopping_criteria: int = 1000, + autograd: bool = False, + featurewise: bool = True, + benchmark: bool = False + ): + if self.model is None: + self.model = NormGLM( + model_vars=self.model_vars, + dtype=self.model_vars.dtype, + compute_a=self._train_loc, + compute_b=self._train_scale, + use_gradient_tape=autograd, + optimizer=optim_algo + ) + else: + self.model.setMethod(optim_algo) + + optimizer_object = self.get_optimizer_object(optim_algo, learning_rate) + + super(Estimator, self)._train( + noise_model="norm", + is_batched=use_batching, + batch_size=batch_size, + optimizer_object=optimizer_object, + convergence_criteria=convergence_criteria, + stopping_criteria=stopping_criteria, + autograd=autograd, + featurewise=featurewise, + benchmark=benchmark, + optim_algo=optim_algo + + ) + + def get_model_container( + self, + input_data + ): + return Model(input_data=input_data) + + def init_par(self, input_data, init_a, init_b, init_model): + r""" + standard: + Only initialise intercept and keep other coefficients as zero. + + closed-form: + Initialize with Maximum Likelihood / Maximum of Momentum estimators + """ + + sf_given = False + if input_data.size_factors is not None: + if np.any(np.abs(input_data.size_factors - 1.) > 1e-8): + sf_given = True + + is_ols_model = input_data.design_scale.shape[1] == 1 and \ + np.all(np.abs(input_data.design_scale - 1.) < 1e-8) and not sf_given + + if init_model is None: + groupwise_means = None + init_a_str = None + if isinstance(init_a, str): + init_a_str = init_a.lower() + # Chose option if auto was chosen + if init_a.lower() == "auto": + init_a = "closed_form" + + if init_a.lower() == "closed_form" or init_a.lower() == "standard": + design_constr = np.matmul(input_data.design_loc, input_data.constraints_loc) + # Iterate over genes if X is sparse to avoid large sparse tensor. + # If X is dense, the least square problem can be vectorised easily. + if isinstance(input_data.x, scipy.sparse.csr_matrix): + init_a, rmsd_a, _, _ = np.linalg.lstsq( + np.matmul(design_constr.T, design_constr), + input_data.x.T.dot(design_constr).T, # need double .T because of dot product on sparse. 
+ rcond=None + ) + else: + init_a, rmsd_a, _, _ = np.linalg.lstsq( + np.matmul(design_constr.T, design_constr), + np.matmul(design_constr.T, input_data.x), + rcond=None + ) + groupwise_means = None + if is_ols_model: + self._train_loc = False + + logger.debug("Using OLS initialization for location model") + elif init_a.lower() == "all_zero": + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + self._train_loc = True + + logger.debug("Using all_zero initialization for mean") + else: + raise ValueError("init_a string %s not recognized" % init_a) + logger.debug("Should train location model: %s", self._train_loc) + + if isinstance(init_b, str): + if init_b.lower() == "auto": + init_b = "standard" + + if is_ols_model: + # Calculated variance via E(x)^2 or directly depending on whether `mu` was specified. + if isinstance(input_data.x, scipy.sparse.csr_matrix): + expect_xsq = np.asarray(np.mean(input_data.x.power(2), axis=0)) + else: + expect_xsq = np.expand_dims(np.mean(np.square(input_data.x), axis=0), axis=0) + mean_model = np.matmul( + np.matmul(input_data.design_loc, input_data.constraints_loc), + init_a + ) + expect_x_sq = np.mean(np.square(mean_model), axis=0) + variance = (expect_xsq - expect_x_sq) + init_b = np.log(np.sqrt(variance)) + self._train_scale = False + + logger.debug("Using residuals from OLS estimate for variance estimate") + elif init_b.lower() == "closed_form": + dmats_unequal = False + if input_data.design_loc.shape[1] == input_data.design_scale.shape[1]: + if np.any(input_data.design_loc != input_data.design_scale): + dmats_unequal = True + + inits_unequal = False + if init_a_str is not None: + if init_a_str != init_b: + inits_unequal = True + + # Watch out: init_mean is full obs x features matrix and is very large in many cases. 
+ if inits_unequal or dmats_unequal: + raise ValueError( + "cannot use closed_form init for scale model \ + if scale model differs from loc model" + ) + + groupwise_scales, init_b, rmsd_b = closedform_norm_glm_logsd( + x=input_data.x, + design_scale=input_data.design_scale, + constraints=input_data.constraints_scale, + size_factors=input_data.size_factors, + groupwise_means=groupwise_means, + link_fn=lambda sd: np.log(self.np_clip_param(sd, "sd")) + ) + + # train scale, if the closed-form solution is inaccurate + self._train_scale = not (np.all(rmsd_b == 0) or rmsd_b.size == 0) + + logger.debug("Using closed-form MME initialization for standard deviation") + elif init_b.lower() == "standard": + groupwise_scales, init_b_intercept, rmsd_b = closedform_norm_glm_logsd( + x=input_data.x, + design_scale=input_data.design_scale[:, [0]], + constraints=input_data.constraints_scale[[0], :][:, [0]], + size_factors=input_data.size_factors, + groupwise_means=None, + link_fn=lambda sd: np.log(self.np_clip_param(sd, "sd")) + ) + init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) + init_b[0, :] = init_b_intercept + + # train scale, if the closed-form solution is inaccurate + self._train_scale = not (np.all(rmsd_b == 0) or rmsd_b.size == 0) + + logger.debug("Using closed-form MME initialization for standard deviation") + logger.debug("Should train sd: %s", self._train_scale) + elif init_b.lower() == "all_zero": + init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) + + logger.debug("Using standard initialization for standard deviation") + else: + raise ValueError("init_b string %s not recognized" % init_b) + logger.debug("Should train sd: %s", self._train_scale) + else: + init_a, init_b = self.get_init_from_model(init_a=init_a, + init_b=init_b, + input_data=input_data, + init_model=init_model) + + return init_a, init_b diff --git a/batchglm/train/tf2/glm_norm/external.py b/batchglm/train/tf2/glm_norm/external.py new file mode 100644 index 00000000..5fc7ab0b --- /dev/null +++ b/batchglm/train/tf2/glm_norm/external.py @@ -0,0 +1,12 @@ +import batchglm.data as data_utils + +from batchglm.models.glm_norm import _EstimatorGLM, InputDataGLM, Model +from batchglm.models.base_glm.utils import closedform_glm_mean, closedform_glm_scale +from batchglm.models.glm_norm.utils import closedform_norm_glm_mean, closedform_norm_glm_logsd + +from batchglm.utils.linalg import groupwise_solve_lm +from batchglm import pkg_constants + +from batchglm.train.tf2.base_glm import ProcessModelGLM, GLM, Estimator, ModelVarsGLM +from batchglm.train.tf2.base_glm import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM +from batchglm.train.tf2.base_glm import FIMGLM, JacobianGLM, HessianGLM diff --git a/batchglm/train/tf2/glm_norm/layers.py b/batchglm/train/tf2/glm_norm/layers.py new file mode 100644 index 00000000..ba067352 --- /dev/null +++ b/batchglm/train/tf2/glm_norm/layers.py @@ -0,0 +1,49 @@ +import tensorflow as tf +import numpy as np +from .external import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM +from .processModel import ProcessModel + + +class UnpackParams(UnpackParamsGLM, ProcessModel): + """ + Full class. 
+ """ + + +class LinearLoc(LinearLocGLM, ProcessModel): + + def with_size_factors(self, eta_loc, size_factors): + return tf.multiply(eta_loc, size_factors) + + +class LinearScale(LinearScaleGLM, ProcessModel): + """ + Full Class + """ + + +class LinkerLoc(LinkerLocGLM): + + def _inv_linker(self, loc: tf.Tensor): + return loc + + +class LinkerScale(LinkerScaleGLM): + + def _inv_linker(self, scale: tf.Tensor): + return tf.math.exp(scale) + + +class Likelihood(LikelihoodGLM, ProcessModel): + + def _ll(self, eta_loc, eta_scale, loc, scale, x, n_features): + + const = tf.constant(-0.5 * np.log(2 * np.pi), shape=(), dtype=self.ll_dtype) + if isinstance(x, tf.SparseTensor): + log_probs = const - eta_scale - 0.5 * tf.math.square(tf.divide(tf.sparse.add(x, - loc), scale)) + # log_probs.set_shape([None, a_var.shape[1]]) # Need this so as shape is completely lost. + else: + log_probs = const - eta_scale - 0.5 * tf.math.square(tf.divide(x - loc, scale)) + log_probs = self.tf_clip_param(log_probs, "log_probs") + + return log_probs diff --git a/batchglm/train/tf2/glm_norm/layers_gradients.py b/batchglm/train/tf2/glm_norm/layers_gradients.py new file mode 100644 index 00000000..e2b35119 --- /dev/null +++ b/batchglm/train/tf2/glm_norm/layers_gradients.py @@ -0,0 +1,116 @@ +import tensorflow as tf +from .external import FIMGLM, JacobianGLM, HessianGLM + + +class FIM(FIMGLM): + + def _weight_fim_aa( + self, + x, + loc, + scale + ): + w = tf.square(tf.divide(tf.ones_like(scale), scale)) + + return w + + def _weight_fim_bb( + self, + x, + loc, + scale + ): + w = tf.constant(2, shape=loc.shape, dtype=self.dtype) + + return w + + +class Jacobian(JacobianGLM): + + def _weights_jac_a( + self, + x, + loc, + scale, + ): + if isinstance(x, tf.SparseTensor): + const1 = tf.sparse.add(x, -loc) + const = tf.divide(const1, tf.square(scale)) + else: + const1 = tf.subtract(x, loc) + const = tf.divide(const1, tf.square(scale)) + return const + + def _weights_jac_b( + self, + x, + loc, + scale, + ): + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + if isinstance(x, tf.SparseTensor): + const = tf.negative(scalar_one) + tf.math.square( + tf.divide(tf.sparse.add(x, -loc), scale) + ) + else: + const = tf.negative(scalar_one) + tf.math.square( + tf.divide(tf.subtract(x, loc), scale) + ) + return const + + +class Hessian(HessianGLM): + + def _weight_hessian_ab( + self, + x, + loc, + scale, + ): + scalar_two = tf.constant(2, shape=(), dtype=self.dtype) + if isinstance(x, tf.SparseTensor): + x_minus_loc = tf.sparse.add(x, -loc) + else: + x_minus_loc = x - loc + + const = - tf.multiply(scalar_two, + tf.divide( + x_minus_loc, + tf.square(scale) + ) + ) + return const + + def _weight_hessian_aa( + self, + x, + loc, + scale, + ): + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + const = - tf.divide(scalar_one, tf.square(scale)) + + return const + + def _weight_hessian_bb( + self, + x, + loc, + scale, + ): + scalar_two = tf.constant(2, shape=(), dtype=self.dtype) + if isinstance(x, tf.SparseTensor): + x_minus_loc = tf.sparse.add(x, -loc) + else: + x_minus_loc = x - loc + + const = - tf.multiply( + scalar_two, + tf.math.square( + tf.divide( + x_minus_loc, + scale + ) + ) + ) + return const diff --git a/batchglm/train/tf2/glm_norm/model.py b/batchglm/train/tf2/glm_norm/model.py new file mode 100644 index 00000000..7bcb41fe --- /dev/null +++ b/batchglm/train/tf2/glm_norm/model.py @@ -0,0 +1,24 @@ +from .external import GLM +from .processModel import ProcessModel + + +class NormGLM(GLM, ProcessModel): + + def __init__( + 
self, + model_vars, + optimizer: str, + compute_a: bool, + compute_b: bool, + use_gradient_tape: bool, + dtype: str, + ): + super(NormGLM, self).__init__( + model_vars=model_vars, + noise_module='glm_norm', + optimizer=optimizer, + compute_a=compute_a, + compute_b=compute_b, + use_gradient_tape=use_gradient_tape, + dtype=dtype + ) diff --git a/batchglm/train/tf2/glm_norm/processModel.py b/batchglm/train/tf2/glm_norm/processModel.py new file mode 100644 index 00000000..629099ff --- /dev/null +++ b/batchglm/train/tf2/glm_norm/processModel.py @@ -0,0 +1,42 @@ +from .external import ProcessModelGLM +import tensorflow as tf +import numpy as np +from .external import pkg_constants + + +class ProcessModel(ProcessModelGLM): + + def param_bounds( + self, + dtype + ): + if isinstance(dtype, tf.DType): + dmax = dtype.max + dtype = dtype.as_numpy_dtype + else: + dtype = np.dtype(dtype) + dmax = np.finfo(dtype).max + dtype = dtype.type + + sf = dtype(pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT) + bounds_min = { + "a_var": np.nextafter(-dmax, np.inf, dtype=dtype) / sf, + "b_var": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "eta_loc": np.nextafter(-dmax, np.inf, dtype=dtype) / sf, + "eta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "mean": np.nextafter(-dmax, np.inf, dtype=dtype) / sf, + "sd": np.nextafter(0, np.inf, dtype=dtype), + "probs": dtype(0), + "log_probs": np.log(np.nextafter(0, np.inf, dtype=dtype)), + } + bounds_max = { + "a_var": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "b_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "eta_loc": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "mean": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "sd": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "probs": dtype(1), + "log_probs": dtype(0), + } + return bounds_min, bounds_max diff --git a/batchglm/train/tf2/glm_norm/training_strategies.py b/batchglm/train/tf2/glm_norm/training_strategies.py new file mode 100644 index 00000000..2ba524a7 --- /dev/null +++ b/batchglm/train/tf2/glm_norm/training_strategies.py @@ -0,0 +1,27 @@ +from enum import Enum + + +class TrainingStrategies(Enum): + + AUTO = None + DEFAULT = [ + { + "convergence_criteria": "all_converged", + "use_batching": False, + "optim_algo": "irls_tr", + }, + ] + IRLS = [ + { + "convergence_criteria": "all_converged", + "use_batching": False, + "optim_algo": "irls_tr", + }, + ] + IRLS_BATCHED = [ + { + "convergence_criteria": "all_converged", + "use_batching": True, + "optim_algo": "irls_tr", + }, + ] diff --git a/batchglm/train/tf2/glm_norm/vars.py b/batchglm/train/tf2/glm_norm/vars.py new file mode 100644 index 00000000..b1200abc --- /dev/null +++ b/batchglm/train/tf2/glm_norm/vars.py @@ -0,0 +1,8 @@ +from .model import ProcessModel +from .external import ModelVarsGLM + + +class ModelVars(ProcessModel, ModelVarsGLM): + """ + Full class. + """ diff --git a/batchglm/train/tf2/ops.py b/batchglm/train/tf2/ops.py new file mode 100644 index 00000000..56fbf48b --- /dev/null +++ b/batchglm/train/tf2/ops.py @@ -0,0 +1,59 @@ +import tensorflow as tf +from typing import Union + + +def swap_dims(tensor, axis0, axis1, exec_transpose=True, return_perm=False, name="swap_dims"): + """ + Swaps two dimensions in a given tensor. 
+
+    :param tensor: The tensor whose axes should be swapped
+    :param axis0: The first axis which should be swapped with `axis1`
+    :param axis1: The second axis which should be swapped with `axis0`
+    :param exec_transpose: Should the transpose operation be applied?
+    :param return_perm: Should the permutation argument for `tf.transpose` be returned?
+        Automatically True if `exec_transpose` is False
+    :param name: The name scope of this op
+    :return: either retval, (retval, permutation) or permutation
+    """
+    with tf.name_scope(name):
+        rank = tf.range(tf.rank(tensor))
+        idx0 = rank[axis0]
+        idx1 = rank[axis1]
+        perm0 = tf.where(tf.equal(rank, idx0), tf.tile(tf.expand_dims(idx1, 0), [tf.size(rank)]), rank)
+        perm1 = tf.where(tf.equal(rank, idx1), tf.tile(tf.expand_dims(idx0, 0), [tf.size(rank)]), perm0)
+
+        if exec_transpose:
+            retval = tf.transpose(tensor, perm1)
+
+            if return_perm:
+                return retval, perm1
+            else:
+                return retval
+        else:
+            return perm1
+
+
+def stacked_lstsq(L, b, rcond=1e-10, name="stacked_lstsq"):
+    r"""
+    Solve `Lx = b` via SVD-based least squares, cutting off small singular values.
+
+    :param L: tensor of shape (..., M, K)
+    :param b: tensor of shape (..., M, N)
+    :param rcond: relative threshold below which singular values are treated as zero
+    :param name: name scope of this op
+    :return: x of shape (..., K, N)
+    """
+    with tf.name_scope(name):
+        # tf.linalg.svd returns (s, u, v) such that L = u @ diag(s) @ adjoint(v).
+        s, u, v = tf.linalg.svd(L, full_matrices=False)
+        s_max = tf.reduce_max(s, axis=-1, keepdims=True)
+        s_min = rcond * s_max
+
+        # Invert the singular values, zeroing out those below the cut-off.
+        inv_s = tf.where(s >= s_min, tf.math.reciprocal(s), tf.zeros_like(s))
+
+        # x = v @ diag(inv_s) @ adjoint(u) @ b
+        x = tf.einsum(
+            '...KM,...MN->...KN',
+            v,
+            tf.einsum('...K,...MK,...MN->...KN', inv_s, tf.math.conj(u), b)
+        )
+
+        return x
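
As a quick sanity check (not part of the patch), the SVD-based helper above can be compared against TensorFlow's built-in batched least-squares solver. The sketch below assumes the corrected `stacked_lstsq` as written here; the shapes, tolerance and random inputs are illustrative only.

import numpy as np
import tensorflow as tf

# Illustrative only: a batch of 4 least-squares problems with M=6 observations,
# K=3 unknowns and N=2 right-hand sides.
L = tf.constant(np.random.rand(4, 6, 3), dtype=tf.float64)
b = tf.constant(np.random.rand(4, 6, 2), dtype=tf.float64)

x_svd = stacked_lstsq(L, b)                # helper defined above, shape (4, 3, 2)
x_ref = tf.linalg.lstsq(L, b, fast=False)  # TensorFlow reference solution

print(np.allclose(x_svd.numpy(), x_ref.numpy(), atol=1e-8))  # expected: True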