rishikksh20 · carankt · Sep 7, 2020 · Sep 7, 2020 · Sep 7, 2020 · Sep 7, 2020
diff --git a/configs/default.yaml b/configs/default.yaml
@@ -1,6 +1,6 @@
 data:
-  data_dir: 'H:\Deepsync\backup\fastspeech\data\'
-  wav_dir: 'H:\Deepsync\backup\deepsync\LJSpeech-1.1\wavs\'
+  data_dir: '/workspace/data/'
+  wav_dir: '/workspace/LJSpeech-1.1/wavs/'
   # Compute statistics
   e_mean: 21.578571319580078
   e_std: 18.916799545288086
@@ -106,10 +106,12 @@ model:
 
 
 train:
+  discriminator_start: 20000
+  rep_discriminator: 1
   # optimization related
   eos: False #True
   opt: 'noam'
-  accum_grad: 4
+  accum_grad: 1
   grad_clip: 1.0
   weight_decay: 0.001
   patience: 0
@@ -125,7 +127,7 @@ train:
   seed: 1       # random seed number
   resume: ""    # the snapshot path to resume (if set empty, no effect)
   use_phonemes: True
-  batch_size : 16
+  batch_size : 24
   # other
   melgan_vocoder : True
   save_interval : 1000
@@ -134,4 +136,4 @@ train:
   summary_interval : 200
   validation_step : 500
   tts_max_mel_len : 870              # if you have a couple of extremely long spectrograms you might want to use this
-  tts_bin_lengths : True              # bins the spectrogram lengths before sampling in data loader - speeds up training
+  tts_bin_lengths : True              # bins the spectrogram lengths before sampling in data loader - speeds up training
diff --git a/core/discriminator.py b/core/discriminator.py
@@ -0,0 +1,62 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Discriminator(nn.Module):  # four stacked 3x3 convolutions ending in a 1-channel score map
+    def __init__(self):
+        super(Discriminator, self).__init__()
+        # Each conv uses kernel 3 / stride 1 / padding 1, so height and width pass through unchanged.
+        self.discriminator = nn.Sequential(
+                nn.Conv2d(1, 16, kernel_size=3, stride=1, padding = 1),  # input must have 1 channel
+                nn.LeakyReLU(0.2, inplace=True),  # negative slope 0.2
+                nn.Conv2d(16, 32, kernel_size=3, stride=1, padding = 1),
+                nn.LeakyReLU(0.2, inplace=True),
+                nn.Conv2d(32, 64, kernel_size=3, stride=1, padding = 1),
+                nn.LeakyReLU(0.2, inplace=True),
+                nn.Conv2d(64, 1, kernel_size=3, stride=1, padding = 1)  # no activation after the last conv
+                # Commented-out alternative head (not active), kept for reference:
+                #   nn.Flatten(), nn.Linear(46240, 256)
+                )
+
+    def forward(self, x):
+        '''
+        Return the raw score map for x; no final sigmoid is applied
+        since we're using Least Squares GAN (https://arxiv.org/abs/1611.04076)
+        '''
+        # NOTE(review): x is presumably (batch, 1, height, width) — confirm against callers
+        return self.discriminator(x)
+
+def weights_init(m):  # per-module init hook; SFDiscriminator hands it to self.apply
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:  # matches any class whose name contains "Conv"
+        m.weight.data.normal_(0.0, 0.02)  # weights ~ N(mean 0.0, std 0.02); bias is left as constructed
+    elif classname.find("BatchNorm2d") != -1:
+        m.weight.data.normal_(1.0, 0.02)  # scale ~ N(mean 1.0, std 0.02)
+        m.bias.data.fill_(0)  # shift reset to zero
+
+class SFDiscriminator(nn.Module):  # three Discriminators, each scoring its own crop of the input
+    def __init__(self):
+        super().__init__()
+        self.disc1 = Discriminator()  # scores dim-3 indices 0:40
+        self.disc2 = Discriminator()  # scores dim-3 indices 20:60
+        self.disc3 = Discriminator()  # scores dim-3 indices 40:80
+        self.apply(weights_init)  # runs weights_init on every submodule and on self
+    def forward(self, x, start):  # NOTE(review): dims 2/3 look like (frames, 80 mel bins) — confirm
+        results = []  # one output per sub-discriminator, ordered disc1, disc2, disc3
+        results.append(self.disc1(x[:, :, start: start + 40, 0:40]))  # 40 entries of dim 2 from `start`
+        results.append(self.disc2(x[:, :, start: start + 40, 20:60]))  # overlaps both neighbours by 20
+        results.append(self.disc3(x[:, :, start: start + 40, 40:80, ]))
+        return results
+
+if __name__ == '__main__':
+    model = SFDiscriminator()
+    # Smoke test: one random batch of 16 single-channel 40x80 inputs.
+    x = torch.randn(16, 1, 40, 80)
+    print(x.shape)
+    # forward() requires `start`; dim 2 holds exactly 40 entries, so the 40-long window begins at 0.
+    out = model(x, 0)
+    print(len(out), "Shape of output")
+    # Count only the parameters that require gradients.
+    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print(pytorch_total_params)