Add multi gpu batched inference (#12)

* add more aug
* add multi gpu inference
EleutherAI · Mar 7, 2024 · e49951c · e49951c
1 parent f6f5fbb
commit e49951c
Show file tree

Hide file tree

Showing 11 changed files with 595 additions and 285 deletions.
diff --git a/amt/audio.py b/amt/audio.py
@@ -186,14 +186,18 @@ class AudioTransform(torch.nn.Module):
     def __init__(
         self,
         reverb_factor: int = 1,
-        min_snr: int = 10,
-        max_snr: int = 40,
+        min_snr: int = 20,
+        max_snr: int = 50,
+        max_dist_gain: int = 25,
+        min_dist_gain: int = 0,
     ):
         super().__init__()
         self.tokenizer = AmtTokenizer()
         self.reverb_factor = reverb_factor
         self.min_snr = min_snr
         self.max_snr = max_snr
+        self.max_dist_gain = max_dist_gain
+        self.min_dist_gain = min_dist_gain
 
         self.config = load_config()["audio"]
         self.sample_rate = self.config["sample_rate"]
@@ -230,10 +234,10 @@ def __init__(
         )
         self.spec_aug = torch.nn.Sequential(
             torchaudio.transforms.FrequencyMasking(
-                freq_mask_param=15, iid_masks=True
+                freq_mask_param=10, iid_masks=True
             ),
             torchaudio.transforms.TimeMasking(
-                time_mask_param=500, iid_masks=True
+                time_mask_param=1000, iid_masks=True
             ),
         )
 
@@ -309,6 +313,12 @@ def apply_noise(self, wav: torch.tensor):
 
         return AF.add_noise(waveform=wav, noise=noise, snr=snr_dbs)
 
+    def apply_distortion(self, wav: torch.tensor):
+        gain = random.randint(self.min_dist_gain, self.max_dist_gain)
+        colour = random.randint(5, 95)
+
+        return AF.overdrive(wav, gain=gain, colour=colour)
+
     def shift_spec(self, specs: torch.Tensor, shift: int):
         if shift == 0:
             return specs
@@ -335,7 +345,13 @@ def shift_spec(self, specs: torch.Tensor, shift: int):
         return shifted_specs
 
     def aug_wav(self, wav: torch.Tensor):
-        return self.apply_reverb(self.apply_noise(wav))
+        # Only apply distortion in 20% of cases
+        if random.random() > 0.20:
+            return self.apply_reverb(self.apply_noise(wav))
+        else:
+            return self.apply_reverb(
+                self.apply_distortion(self.apply_noise(wav))
+            )
 
     def norm_mel(self, mel_spec: torch.Tensor):
         log_spec = torch.clamp(mel_spec, min=1e-10).log10()
@@ -364,8 +380,8 @@ def forward(self, wav: torch.Tensor, shift: int = 0):
         # Spec & pitch shift
         log_mel = self.log_mel(wav, shift)
 
-        # Spec aug
-        if random.random() > 0.2:
+        # Spec aug in 80% of cases (skipped in the remaining 20%)
+        if random.random() > 0.20:
             log_mel = self.spec_aug(log_mel)
 
         return log_mel
diff --git a/amt/data.py b/amt/data.py
@@ -14,20 +14,25 @@
 
 
 def get_wav_mid_segments(
-    audio_path: str, mid_path: str = "", return_json: bool = False
+    audio_path: str,
+    mid_path: str = "",
+    return_json: bool = False,
+    stride_factor: int | None = None,
 ):
     """This function yields tuples of matched log mel spectrograms and
     tokenized sequences (np.array, list). If it is given only an audio path
     then it will return an empty list for the mid_feature
     """
     tokenizer = AmtTokenizer()
     config = load_config()
-    stride_factor = config["data"]["stride_factor"]
     sample_rate = config["audio"]["sample_rate"]
     chunk_len = config["audio"]["chunk_len"]
     num_samples = sample_rate * chunk_len
     samples_per_ms = sample_rate // 1000
 
+    if not stride_factor:
+        stride_factor = config["data"]["stride_factor"]
+
     if not os.path.isfile(audio_path):
         return None