Commit 2a8a753

Update for the future release

1 parent 1813207 commit 2a8a753

File tree: 9 files changed, +1383 −252 lines

README.md

Lines changed: 2 additions & 29 deletions
@@ -49,6 +49,7 @@ curl -o util/lars.py https://raw.githubusercontent.com/facebookresearch/mae/efb2
 curl -o util/lr_decay.py https://raw.githubusercontent.com/facebookresearch/mae/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/lr_decay.py
 curl -o util/lr_sched.py https://raw.githubusercontent.com/facebookresearch/mae/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/lr_sched.py
 curl -o util/misc.py https://raw.githubusercontent.com/facebookresearch/mae/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/misc.py
+curl -o util/analyze_repr.py https://raw.githubusercontent.com/daisukelab/general-learning/master/SSL/analyze_repr.py
 curl -o m2d/pos_embed.py https://raw.githubusercontent.com/facebookresearch/mae/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py
 curl -o train_audio.py https://raw.githubusercontent.com/facebookresearch/mae/efb2a8062c206524e35e47d04501ed4f544c0ae8/main_pretrain.py
 curl -o train_speech.py https://raw.githubusercontent.com/facebookresearch/mae/efb2a8062c206524e35e47d04501ed4f544c0ae8/main_pretrain.py
@@ -75,19 +76,16 @@ We have a runtime model utility, RuntimeM2D, that helps you to load a pr
 ```python
 from m2d.runtime_audio import RuntimeM2D
 
-device = torch.device('cpu') # set 'cuda' if you run on a GPU
-
 # Prepare your batch of audios. This is a dummy example of three 10s waves.
 batch_audio = 2 * torch.rand((3, 10 * 16000)) - 1.0 # input range = [-1., 1.]
 batch_audio = batch_audio.to(device)
 
 # Create a model with pretrained weights.
 runtime = RuntimeM2D(weight_file='m2d_vit_base-80x608p16x16-220930-mr7/checkpoint-300.pth')
-runtime = runtime.to(device)
 
 # Encode raw audio into features. `encode()` will do the following automatically:
 # 1. Convert the input `batch_audio` to log-mel spectrograms (LMS).
-# 2. Normalize the batch LMS with the mean and std calculated from the batch.
+# 2. Normalize the batch LMS with the mean and std used in the pre-training.
 # 3. Encode the batch LMS to features.
 frame_level = runtime.encode(batch_audio)
 
@@ -101,31 +99,6 @@ clip_level = torch.mean(frame_level, dim=1)
 print(clip_level.shape)
 ```
 
-To get the best features, you can normalize your audio with normalization statistics computed over your entire input data and use them in your pipeline.
-
-```python
-# Calculate statistics in advance. This is an example with 10 random waves.
-means, stds = [], []
-for _ in range(10):
-    lms = runtime.to_feature(torch.rand((10 * 16000)).to(device))
-    means.append(lms.mean())
-    stds.append(lms.std())
-
-dataset_mean, dataset_std = torch.mean(torch.stack(means)), torch.mean(torch.stack(stds))
-# These can be numbers [-5.4919195, 5.0389895], for example.
-
-# The following is an example pipeline.
-
-# Convert your batch audios into LMS.
-batch_lms = runtime.to_feature(batch_audio)
-# Normalize them.
-batch_lms = (batch_lms - dataset_mean) / (dataset_std + torch.finfo().eps)
-# Encode them to frame-level features.
-frame_level = runtime.encode_lms(batch_lms)
-# Calculate clip-level features if needed.
-clip_level = torch.mean(frame_level, dim=1)
-```
-
 To get features per layer, you can add `return_layers=True`.
 
 ```python
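For reference, a minimal sketch (not part of the diff) of the updated `encode()` flow this hunk describes, including the `return_layers=True` option the README mentions. The checkpoint path is the one used throughout this commit; that `return_layers` is passed directly to `encode()` and what shape the per-layer result takes are assumptions based on the README text, not confirmed by this diff.

```python
# Sketch of the updated README pipeline, under the assumptions stated above.
import torch
from m2d.runtime_audio import RuntimeM2D

runtime = RuntimeM2D(weight_file='m2d_vit_base-80x608p16x16-220930-mr7/checkpoint-300.pth')
batch_audio = 2 * torch.rand((3, 10 * 16000)) - 1.0   # three 10-s dummy waves in [-1., 1.]

frame_level = runtime.encode(batch_audio)             # [batch, frames, dim]
clip_level = torch.mean(frame_level, dim=1)           # pool frames -> [batch, dim]

# Assumption: encode() accepts return_layers=True as the README states.
per_layer = runtime.encode(batch_audio, return_layers=True)
```

Note the change in step 2 of the comment: `encode()` now normalizes with the pre-training statistics instead of per-batch statistics, which is why the removed "compute your own dataset statistics" recipe is no longer needed.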

all_eval.sh

Lines changed: 18 additions & 9 deletions
@@ -1,11 +1,20 @@
 cd evar
-CUDA_VISIBLE_DEVICES=0 python 2pass_lineareval.py config/m2d.yaml cremad batch_size=16,weight_file=$1
-CUDA_VISIBLE_DEVICES=0 python 2pass_lineareval.py config/m2d.yaml gtzan batch_size=16,weight_file=$1
-CUDA_VISIBLE_DEVICES=0 python 2pass_lineareval.py config/m2d.yaml spcv2 batch_size=64,weight_file=$1
-CUDA_VISIBLE_DEVICES=0 python 2pass_lineareval.py config/m2d.yaml esc50 batch_size=64,weight_file=$1
-CUDA_VISIBLE_DEVICES=0 python 2pass_lineareval.py config/m2d.yaml us8k batch_size=64,weight_file=$1
-CUDA_VISIBLE_DEVICES=0 python 2pass_lineareval.py config/m2d.yaml vc1 batch_size=64,weight_file=$1
-CUDA_VISIBLE_DEVICES=0 python 2pass_lineareval.py config/m2d.yaml voxforge batch_size=64,weight_file=$1
-CUDA_VISIBLE_DEVICES=0 python 2pass_lineareval.py config/m2d.yaml nsynth batch_size=64,weight_file=$1
-CUDA_VISIBLE_DEVICES=0 python 2pass_lineareval.py config/m2d.yaml surge batch_size=64,weight_file=$1
+GPU=0
+
+if [[ "$1" == *'p32k-'* ]]; then
+    cfg='config/m2d_32k.yaml'
+else
+    cfg='config/m2d.yaml'
+fi
+
+CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg cremad batch_size=16,weight_file=$1
+CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg gtzan batch_size=16,weight_file=$1
+CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg spcv2 batch_size=64,weight_file=$1
+CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg esc50 batch_size=64,weight_file=$1
+CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg us8k batch_size=64,weight_file=$1
+CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg vc1 batch_size=64,weight_file=$1
+CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg voxforge batch_size=64,weight_file=$1
+CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg nsynth batch_size=64,weight_file=$1
+CUDA_VISIBLE_DEVICES=$GPU python 2pass_lineareval.py $cfg surge batch_size=64,weight_file=$1
+
 python summarize.py $1
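The script now selects the EVAR config from the checkpoint filename: weights whose name contains `p32k-` (32 kHz models) use `config/m2d_32k.yaml`, everything else uses `config/m2d.yaml`. A sketch of the same dispatch in Python, for readers driving the evaluation from Python rather than bash (the checkpoint names in the asserts are hypothetical, for illustration only):

```python
# Mirrors the bash pattern test above: [[ "$1" == *'p32k-'* ]]
def choose_config(weight_file: str) -> str:
    # 32 kHz checkpoints carry a 'p32k-' tag in their name.
    return 'config/m2d_32k.yaml' if 'p32k-' in weight_file else 'config/m2d.yaml'

# Hypothetical checkpoint names, for illustration only.
assert choose_config('m2d_vit_base-80x608p32k-230501/checkpoint-300.pth') == 'config/m2d_32k.yaml'
assert choose_config('m2d_vit_base-80x608p16x16-220930-mr7/checkpoint-300.pth') == 'config/m2d.yaml'
```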

audio_dataset.py

Lines changed: 125 additions & 0 deletions
@@ -152,3 +152,128 @@ def build_viz_dataset(cfg):
     norm_stats = cfg.norm_stats if 'norm_stats' in cfg else None
     ds = SpectrogramDataset(folder=cfg.data_path, files=files, crop_frames=cfg.input_size[1], tfms=None, norm_stats=norm_stats)
     return ds, files
+
+
+# Mixed dataset
+
+def log_mixup_exp(xa, xb, alpha):
+    xa = xa.exp()
+    xb = xb.exp()
+    x = alpha * xa + (1. - alpha) * xb
+    return torch.log(torch.max(x, torch.finfo(x.dtype).eps * torch.ones_like(x)))
+
+
+class MixedSpecDataset(torch.utils.data.Dataset):
+    def __init__(self, base_folder, files_main, files_bg_noise, crop_size, noise_ratio=0.0,
+                 random_crop=True, n_norm_calc=10000) -> None:
+        super().__init__()
+
+        self.ds1 = SpectrogramDataset(folder=base_folder, files=files_main, crop_frames=crop_size[1],
+                                      random_crop=random_crop, norm_stats=None,
+                                      n_norm_calc=n_norm_calc//2)
+        self.norm_stats = self.ds1.norm_stats  # for compatibility with SpectrogramDataset
+        # disable normalization scaling in ds1
+        self.norm_std = self.ds1.norm_stats[1]
+        self.ds1.norm_stats = (self.ds1.norm_stats[0], 1.0)
+
+        if noise_ratio > 0.0:
+            self.ds2 = SpectrogramDataset(folder=base_folder, files=files_bg_noise, crop_frames=crop_size[1],
+                                          random_crop=random_crop, norm_stats=None, n_norm_calc=n_norm_calc//2, repeat_short=True)
+            self.ds2.norm_stats = (self.ds2.norm_stats[0], 1.0)  # disable normalization scaling in ds2
+
+        self.noise_ratio = noise_ratio
+        self.bg_index = []
+
+    def __len__(self):
+        return len(self.ds1)
+
+    def __getitem__(self, index, fixed_noise=False):
+        # load the indexed sample
+        clean = self.ds1[index]
+        if self.noise_ratio > 0.0:
+            # load a random background-noise sample
+            noise = self.ds2[index if fixed_noise else self.get_next_bgidx()]
+            # mix
+            mixed = log_mixup_exp(noise, clean, self.noise_ratio) if self.noise_ratio < 1.0 else noise
+        else:
+            mixed = clean.clone()
+        # finish normalization. clean and noise were averaged to zero; the following scales them using the ds1 std.
+        clean = clean / self.norm_std
+        mixed = mixed / self.norm_std
+        return clean, mixed
+
+
+    def get_next_bgidx(self):
+        if len(self.bg_index) == 0:
+            self.bg_index = torch.randperm(len(self.ds2)).tolist()
+            # print(f'Refreshed the bg index list with {len(self.bg_index)} items: {self.bg_index[:5]}...')
+        return self.bg_index.pop(0)
+
+    def __repr__(self):
+        format_string = self.__class__.__name__ + f'(crop_frames={self.ds1.crop_frames}, '
+        format_string += f'folder_sp={self.ds1.df.file_name.values[0].split("/")[0]}, '
+        if self.noise_ratio > 0.: format_string += f'folder_bg={self.ds2.df.file_name.values[0].split("/")[0]}, '
+        return format_string
+
+
+def inflate_files(files, desired_size):
+    if len(files) == 0:
+        return files
+    files = list(files)  # make sure `files` is a list
+    while len(files) < desired_size:
+        files = (files + files)[:desired_size]
+    return files
+
+
+def build_mixed_dataset(cfg):
+    """The following configure the training dataset details.
+    - data_path: Root folder of the training dataset.
+    - dataset: The _name_ of the training dataset, a stem name of a `.csv` training data list.
+    - norm_stats: Normalization statistics, a list of [mean, std].
+    - input_size: Input size, a list of [# of freq. bins, # of time frames].
+    """
+
+    # get files and inflate the number of files (by repeating the list) if needed
+    files_main = get_files(cfg.csv_main)
+    files_bg = get_files(cfg.csv_bg_noise) if cfg.noise_ratio > 0. else []
+    desired_min_size = 0
+    if 'min_ds_size' in cfg and cfg.min_ds_size > 0:
+        desired_min_size = cfg.min_ds_size
+    if desired_min_size > 0:
+        old_sizes = len(files_main), len(files_bg)
+        files_main, files_bg = inflate_files(files_main, desired_min_size), inflate_files(files_bg, desired_min_size)
+        print('The numbers of data files are increased from', old_sizes, 'to', (len(files_main), len(files_bg)))
+
+    ds = MixedSpecDataset(
+        base_folder=cfg.data_path, files_main=files_main,
+        files_bg_noise=files_bg,
+        crop_size=cfg.input_size,
+        noise_ratio=cfg.noise_ratio,
+        random_crop=True)
+    if 'weighted' in cfg and cfg.weighted:
+        assert desired_min_size == 0
+        ds.weight = pd.read_csv(cfg.csv_main).weight.values
+
+    val_ds = SpectrogramDataset(folder=cfg.data_path, files=get_files(cfg.csv_val), crop_frames=cfg.input_size[1], random_crop=True) \
+        if cfg.csv_val else None
+
+    return ds, val_ds
+
+
+def build_mixed_viz_dataset(cfg):
+    files = [str(f).replace(str(cfg.data_path) + '/', '') for f in sorted(Path(cfg.data_path).glob('vis_samples/*.npy'))]
+    if len(files) == 0:
+        return None, []
+    norm_stats = cfg.norm_stats if 'norm_stats' in cfg else None
+    ds = SpectrogramDataset(folder=cfg.data_path, files=files, crop_frames=cfg.input_size[1], tfms=None, norm_stats=norm_stats)
+    return ds, files
+
+
+if __name__ == '__main__':
+    # Test
+    ds = MixedSpecDataset(base_folder='data', files_main=get_files('data/files_gtzan.csv'),
+                          files_bg_noise=get_files('data/files_audioset.csv'),
+                          crop_size=[80, 608], noise_ratio=0.2, random_crop=True, n_norm_calc=10)
+    for i in range(0, 10):
+        clean, mixed = ds[i]
+        print(clean.shape, mixed.shape)
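The key piece in this new file is `log_mixup_exp`: because the dataset stores log-mel spectrograms, mixup must happen in the linear domain, so the function exponentiates both inputs, mixes them with weight `alpha`, and maps the result back to log space (clamped at machine epsilon to avoid `log(0)`). A tiny self-contained numeric check of the function copied from the diff above:

```python
import torch

def log_mixup_exp(xa, xb, alpha):
    # mix log-domain inputs in the linear (exp) domain, then return to log
    xa = xa.exp()
    xb = xb.exp()
    x = alpha * xa + (1. - alpha) * xb
    return torch.log(torch.max(x, torch.finfo(x.dtype).eps * torch.ones_like(x)))

xa = torch.tensor([0.0])   # exp -> 1.0
xb = torch.tensor([-2.0])  # exp -> ~0.135
# 0.2 * 1.0 + 0.8 * 0.135 = 0.308; log(0.308) ~= -1.177
print(log_mixup_exp(xa, xb, 0.2))  # tensor([-1.1768])
```

In `__getitem__`, `alpha` is the `noise_ratio`, so the returned `mixed` tensor is the clean sample contaminated by a background-noise sample at that ratio, while `clean` is returned unmixed for use as a target.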

examples/Example_1.ipynb

Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Short example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import warnings; warnings.simplefilter('ignore')\n",
+    "import logging\n",
+    "logging.basicConfig(level=logging.INFO)\n",
+    "import sys\n",
+    "sys.path.append('..')\n",
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:root:<All keys matched successfully>\n",
+      "INFO:root:Model input size: [80, 608]\n",
+      "INFO:root:Using weights: m2d_vit_base-80x608p16x16-220930-mr7/checkpoint-300.pth\n",
+      "INFO:root:Feature dimension: 3840\n",
+      "INFO:root:Norm stats: -7.1, 4.2\n",
+      "INFO:root:Runtime MelSpectrogram(16000, 400, 400, 160, 80, 50, 8000):\n",
+      "INFO:root:MelSpectrogram(\n",
+      " Mel filter banks size = (80, 201), trainable_mel=False\n",
+      " (stft): STFT(n_fft=400, Fourier Kernel size=(201, 1, 400), iSTFT=False, trainable=False)\n",
+      ")\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " using 150 parameters, while dropped 250 out of 400 parameters from m2d_vit_base-80x608p16x16-220930-mr7/checkpoint-300.pth\n",
+      " (dropped: ['mask_token', 'decoder_pos_embed', 'decoder_embed.weight', 'decoder_embed.bias', 'decoder_blocks.0.norm1.weight'] ...)\n",
+      "<All keys matched successfully>\n"
+     ]
+    }
+   ],
+   "source": [
+    "from portable_m2d import PortableM2D\n",
+    "weight = 'm2d_vit_base-80x608p16x16-220930-mr7/checkpoint-300.pth'\n",
+    "model = PortableM2D(weight_file=weight)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([1, 63, 3840])\n"
+     ]
+    }
+   ],
+   "source": [
+    "# A single sample of random waveform\n",
+    "wav = torch.rand(1, 16000 * 10)\n",
+    "\n",
+    "# Encode with M2D\n",
+    "with torch.no_grad():\n",
+    "    embeddings = model(wav)\n",
+    "\n",
+    "# The output embeddings have a shape of [Batch, Frame, Dimension]\n",
+    "print(embeddings.shape) # --> torch.Size([1, 63, 3840])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "ar",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
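The notebook stops at frame-level embeddings of shape [Batch, Frame, Dimension]. If a single clip-level vector is needed, the README's recipe of averaging over frames applies to the `PortableM2D` output as well; a sketch (the mean pooling follows the README, not something this notebook itself shows):

```python
import torch
from portable_m2d import PortableM2D

model = PortableM2D(weight_file='m2d_vit_base-80x608p16x16-220930-mr7/checkpoint-300.pth')
wav = torch.rand(1, 16000 * 10)      # one 10-s random waveform

with torch.no_grad():
    frame_level = model(wav)         # torch.Size([1, 63, 3840])

clip_level = frame_level.mean(dim=1) # pool over frames
print(clip_level.shape)              # torch.Size([1, 3840])
```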
