Fix Tensorboard logging with Tensorflow [#224]

jamesdolezal · jamesdolezal · commit 0a6d0ad159b9 · 2023-02-02T12:05:19.000-06:00
- Fixes Tensorboard logging to include both training and validation logs, at the batch and epoch level, when using the Tensorboard backend. Validation logs were previously not being written due to a change in how the Tensorflow/Keras `Model.fit()` API interacted with Tensorboard callbacks.
- Add an informative error message to an assertion check in `Dataset.train_val_split()`
- All tests passed
diff --git a/slideflow/dataset.py b/slideflow/dataset.py
@@ -2812,9 +2812,18 @@ def train_val_split(
                 tfr for tfr in tfrecord_dir_list
                 if path_to_name(tfr) in train_slides
             ]
-
-        assert(len(val_tfrecords) == len(val_slides))
-        assert(len(training_tfrecords) == len(train_slides))
+        if not len(val_tfrecords) == len(val_slides):
+            raise errors.DatasetError(
+                f"Number of validation tfrecords ({len(val_tfrecords)}) does not "
+                f"match the number of validation slides ({len(val_slides)}). "
+                "This may happen if multiple tfrecords were found for some slides."
+            )
+        if not len(training_tfrecords) == len(train_slides):
+            raise errors.DatasetError(
+                f"Number of training tfrecords ({len(val_tfrecords)}) does not "
+                f"match the number of training slides ({len(val_slides)}). "
+                "This may happen if multiple tfrecords were found for some slides."
+            )
         training_dts = copy.deepcopy(self)
         training_dts = training_dts.filter(filters={'slide': train_slides})
         val_dts = copy.deepcopy(self)
diff --git a/slideflow/model/tensorflow.py b/slideflow/model/tensorflow.py
@@ -652,6 +652,10 @@ def __init__(self, parent: "Trainer", cb_args: SimpleNamespace) -> None:
         self.results = {'epochs': {}}  # type: Dict[str, Dict]
         self.neptune_run = self.parent.neptune_run
         self.global_step = 0
+        self.train_summary_writer = tf.summary.create_file_writer(
+            join(self.parent.outdir, 'train'))
+        self.val_summary_writer = tf.summary.create_file_writer(
+            join(self.parent.outdir, 'validation'))
 
         # Circumvents buffer overflow error with Python 3.10.
         # Without this, a buffer overflow error will be encountered when
@@ -663,6 +667,128 @@ def __init__(self, parent: "Trainer", cb_args: SimpleNamespace) -> None:
             plt.figure()
             plt.close()
 
+    def _log_training_metrics(self, logs):
+        """Log training metrics to Tensorboard/Neptune."""
+        # Log to Tensorboard.
+        for _log in logs:
+            tf.summary.scalar(
+                f'batch_{_log}',
+                data=logs[_log],
+                step=self.global_step)
+        # Log to neptune.
+        if self.neptune_run:
+            self.neptune_run['metrics/train/batch/loss'].log(
+                logs['loss'],
+                step=self.global_step)
+            sf.util.neptune_utils.list_log(
+                self.neptune_run,
+                'metrics/train/batch/accuracy',
+                logs['accuracy'],
+                step=self.global_step)
+
+    def _log_validation_metrics(self, metrics):
+        """Log validation metrics to Tensorboard/Neptune."""
+        # Tensorboard logging for validation metrics
+        with self.val_summary_writer.as_default():
+            for _log in metrics:
+                tf.summary.scalar(
+                    f'batch_{_log}',
+                    data=metrics[_log],
+                    step=self.global_step)
+        # Log to neptune
+        if self.neptune_run:
+            for v in metrics:
+                self.neptune_run[f"metrics/val/batch/{v}"].log(
+                    round(metrics[v], 3),
+                    step=self.global_step
+                )
+            if self.last_ema != -1:
+                self.neptune_run["metrics/val/batch/exp_moving_avg"].log(
+                    round(self.last_ema, 3),
+                    step=self.global_step
+                )
+            self.neptune_run["early_stop/stopped_early"] = False
+
+    def _log_epoch_evaluation(self, epoch_results, metrics, accuracy, loss, logs={}):
+        """Log the end-of-epoch evaluation to CSV, Tensorboard, and Neptune."""
+        epoch = self.epoch_count
+        run = self.neptune_run
+        sf.util.update_results_log(
+            self.cb_args.results_log,
+            'trained_model',
+            {f'epoch{epoch}': epoch_results}
+        )
+        with self.val_summary_writer.as_default():
+            # Note: Tensorboard epoch logging starts with index=0,
+            # whereas all other logging starts with index=1
+            if isinstance(accuracy, (list, tuple, np.ndarray)):
+                for i in range(len(accuracy)):
+                    tf.summary.scalar(f'epoch_accuracy-{i}', data=accuracy[i], step=epoch-1)
+            elif accuracy is not None:
+                tf.summary.scalar(f'epoch_accuracy', data=accuracy, step=epoch-1)
+            if isinstance(loss, (list, tuple, np.ndarray)):
+                for i in range(len(loss)):
+                    tf.summary.scalar(f'epoch_loss-{i}', data=loss[i], step=epoch-1)
+            else:
+                tf.summary.scalar(f'epoch_loss', data=loss, step=epoch-1)
+
+        # Log epoch results to Neptune
+        if run:
+            # Training epoch metrics
+            run['metrics/train/epoch/loss'].log(logs['loss'], step=epoch)
+            sf.util.neptune_utils.list_log(
+                run,
+                'metrics/train/epoch/accuracy',
+                logs['accuracy'],
+                step=epoch
+            )
+            # Validation epoch metrics
+            run['metrics/val/epoch/loss'].log(loss, step=epoch)
+            sf.util.neptune_utils.list_log(
+                run,
+                'metrics/val/epoch/accuracy',
+                accuracy,
+                step=epoch
+            )
+            for metric in metrics:
+                if metrics[metric]['tile'] is None:
+                    continue
+                for outcome in metrics[metric]['tile']:
+                    # If only one outcome, log to metrics/val/epoch/[metric].
+                    # If more than one outcome, log to
+                    # metrics/val/epoch/[metric]/[outcome_name]
+                    def metric_label(s):
+                        if len(metrics[metric]['tile']) == 1:
+                            return f'metrics/val/epoch/{s}_{metric}'
+                        else:
+                            return f'metrics/val/epoch/{s}_{metric}/{outcome}'
+
+                    tile_metric = metrics[metric]['tile'][outcome]
+                    slide_metric = metrics[metric]['slide'][outcome]
+                    patient_metric = metrics[metric]['patient'][outcome]
+
+                    # If only one value for a metric, log to .../[metric]
+                    # If more than one value for a metric (e.g. AUC for each
+                    # category), log to .../[metric]/[i]
+                    sf.util.neptune_utils.list_log(
+                        run,
+                        metric_label('tile'),
+                        tile_metric,
+                        step=epoch
+                    )
+                    sf.util.neptune_utils.list_log(
+                        run,
+                        metric_label('slide'),
+                        slide_metric,
+                        step=epoch
+                    )
+                    sf.util.neptune_utils.list_log(
+                        run,
+                        metric_label('patient'),
+                        patient_metric,
+                        step=epoch
+                    )
+
     def _metrics_from_dataset(
         self,
         epoch_label: str,
@@ -728,18 +854,10 @@ def on_epoch_end(self, epoch: int, logs={}) -> None:
         self.model.stop_training = self.early_stop
 
     def on_train_batch_end(self, batch: int, logs={}) -> None:
-        # Neptune logging for training metrics
-        if self.neptune_run:
-            self.neptune_run['metrics/train/batch/loss'].log(
-                logs['loss'],
-                step=self.global_step
-            )
-            sf.util.neptune_utils.list_log(
-                self.neptune_run,
-                'metrics/train/batch/accuracy',
-                logs['accuracy'],
-                step=self.global_step
-            )
+        # Tensorboard logging for training metrics
+        if batch > 0 and batch % self.cb_args.log_frequency == 0:
+            #with self.train_summary_writer.as_default():
+            self._log_training_metrics(logs)
 
         # Check if manual early stopping has been triggered
         if (self.hp.early_stop
@@ -802,19 +920,10 @@ def on_train_batch_end(self, batch: int, logs={}) -> None:
                 print('\r\033[K', end='')
             self.moving_average += [early_stop_value]
 
-            # Log to neptune
-            if self.neptune_run:
-                for v in val_metrics:
-                    self.neptune_run[f"metrics/val/batch/{v}"].log(
-                        round(val_metrics[v], 3),
-                        step=self.global_step
-                    )
-                if self.last_ema != -1:
-                    self.neptune_run["metrics/val/batch/exp_moving_avg"].log(
-                        round(self.last_ema, 3),
-                        step=self.global_step
-                    )
-                self.neptune_run["early_stop/stopped_early"] = False
+            self._log_validation_metrics(logs)
+            # Log training metrics if not already logged this batch
+            if batch % self.cb_args.log_frequency > 0:
+                self._log_training_metrics(logs)
 
             # Base logging message
             batch_msg = f'[blue]Batch {batch:<5}[/]'
@@ -915,73 +1024,9 @@ def evaluate_model(self, logs={}) -> None:
             self.results['epochs'][f'epoch{epoch}'][f'patient_{m}'] = metrics[m]['patient']
 
         epoch_results = self.results['epochs'][f'epoch{epoch}']
-        sf.util.update_results_log(
-            self.cb_args.results_log,
-            'trained_model',
-            {f'epoch{epoch}': epoch_results}
+        self._log_epoch_evaluation(
+            epoch_results, metrics=metrics, accuracy=acc, loss=loss, logs=logs
         )
-        # Log epoch results to Neptune
-        if self.neptune_run:
-            # Training epoch metrics
-            self.neptune_run['metrics/train/epoch/loss'].log(
-                logs['loss'],
-                step=epoch
-            )
-            sf.util.neptune_utils.list_log(
-                self.neptune_run,
-                'metrics/train/epoch/accuracy',
-                logs['accuracy'],
-                step=epoch
-            )
-            # Validation epoch metrics
-            self.neptune_run['metrics/val/epoch/loss'].log(
-                val_metrics['loss'],
-                step=epoch
-            )
-            sf.util.neptune_utils.list_log(
-                self.neptune_run,
-                'metrics/val/epoch/accuracy',
-                val_metrics['accuracy'],
-                step=epoch
-            )
-            for metric in metrics:
-                if metrics[metric]['tile'] is None:
-                    continue
-                for outcome in metrics[metric]['tile']:
-                    # If only one outcome, log to metrics/val/epoch/[metric].
-                    # If more than one outcome, log to
-                    # metrics/val/epoch/[metric]/[outcome_name]
-                    def metric_label(s):
-                        if len(metrics[metric]['tile']) == 1:
-                            return f'metrics/val/epoch/{s}_{metric}'
-                        else:
-                            return f'metrics/val/epoch/{s}_{metric}/{outcome}'
-
-                    tile_metric = metrics[metric]['tile'][outcome]
-                    slide_metric = metrics[metric]['slide'][outcome]
-                    patient_metric = metrics[metric]['patient'][outcome]
-
-                    # If only one value for a metric, log to .../[metric]
-                    # If more than one value for a metric (e.g. AUC for each
-                    # category), log to .../[metric]/[i]
-                    sf.util.neptune_utils.list_log(
-                        self.neptune_run,
-                        metric_label('tile'),
-                        tile_metric,
-                        step=epoch
-                    )
-                    sf.util.neptune_utils.list_log(
-                        self.neptune_run,
-                        metric_label('slide'),
-                        slide_metric,
-                        step=epoch
-                    )
-                    sf.util.neptune_utils.list_log(
-                        self.neptune_run,
-                        metric_label('patient'),
-                        patient_metric,
-                        step=epoch
-                    )
 
 
 class Trainer:
@@ -1791,7 +1836,8 @@ def train(
                 save_predictions=save_predictions,
                 save_model=save_model,
                 results_log=results_log,
-                reduce_method=reduce_method
+                reduce_method=reduce_method,
+                log_frequency=log_frequency
             )
 
             # Create callbacks for early stopping, checkpoint saving,
@@ -1806,11 +1852,15 @@ def train(
                 )
                 callbacks += [cp_callback]
             if use_tensorboard:
+                log.debug(
+                    "Logging with Tensorboard to {} every {} batches.".format(
+                        self.outdir, log_frequency
+                    ))
                 tensorboard_callback = tf.keras.callbacks.TensorBoard(
                     log_dir=self.outdir,
                     histogram_freq=0,
                     write_graph=False,
-                    update_freq=log_frequency
+                    update_freq='batch'
                 )
                 callbacks += [tensorboard_callback]