From 909240db786f6b20658cc10df34fa72df20f7002 Mon Sep 17 00:00:00 2001
From: Karan Thakkar <kkt31415@gmail.com>
Date: Mon, 7 Sep 2020 14:22:29 +0530
Subject: [PATCH 01/10] Add SFdisc

---
 configs/default.yaml  |  3 ++-
 core/discriminator.py | 63 +++++++++++++++++++++++++++++++++++++++++++
 train_fastspeech.py   | 18 +++++++++++++
 3 files changed, 83 insertions(+), 1 deletion(-)
 create mode 100644 core/discriminator.py

diff --git a/configs/default.yaml b/configs/default.yaml
index 92ed7a9..b06b1e5 100644
--- a/configs/default.yaml
+++ b/configs/default.yaml
@@ -106,6 +106,7 @@ model:
 
 
 train:
+  discriminator_start: 10000
   # optimization related
   eos: False #True
   opt: 'noam'
@@ -134,4 +135,4 @@ train:
   summary_interval : 200
   validation_step : 500
   tts_max_mel_len : 870              # if you have a couple of extremely long spectrograms you might want to use this
-  tts_bin_lengths : True              # bins the spectrogram lengths before sampling in data loader - speeds up training
\ No newline at end of file
+  tts_bin_lengths : True              # bins the spectrogram lengths before sampling in data loader - speeds up training
diff --git a/core/discriminator.py b/core/discriminator.py
new file mode 100644
index 0000000..a907ce6
--- /dev/null
+++ b/core/discriminator.py
@@ -0,0 +1,63 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Discriminator(nn.Module):
+    def __init__(self):
+        super(Discriminator, self).__init__()
+
+        self.discriminator = nn.ModuleList([
+            nn.Sequential(
+                nn.utils.weight_norm(nn.Conv2d(1, 40, kernel_size=3, stride=1)),
+                nn.LeakyReLU(0.2, inplace=True),
+                nn.utils.weight_norm(nn.Conv2d(40, 40, kernel_size=3, stride=1)),
+                nn.LeakyReLU(0.2, inplace=True),
+                nn.utils.weight_norm(nn.Conv2d(40, 40, kernel_size=3, stride=1)),
+                nn.LeakyReLU(0.2, inplace=True),
+                nn.Flatten(),
+                nn.Linear(46240,256)
+                )
+                ])
+
+    def forward(self, x):
+        '''
+            we directly predict score without last sigmoid function
+            since we're using Least Squares GAN (https://arxiv.org/abs/1611.04076)
+        '''
+        for module in self.discriminator:
+            x = module(x)
+        return x
+
+class SFDiscriminator(nn.Module):
+    def __init__(self):
+        super(SFDiscriminator, self).__init__()
+
+        self.discriminators = nn.ModuleList(
+            [Discriminator() for _ in range(3)]
+        )
+
+    def forward(self, x):
+        # x - input mel of size [B, 1, 40, 80]
+        x_in = [ x[0:16, 0:1, 0:40, 0:40], x[0:16, 0:1, 0:40, 20:60], x[0:16, 0:1, 0:40, 40:80] ]
+        disc_out = list()
+
+
+        for disc, x_ in zip(self.discriminators, x_in):
+            x = disc(x_)
+            disc_out.append(x)
+
+        return disc_out # [SF_out0, SF_out1, SF_out2]
+
+
+if __name__ == '__main__':
+    model = SFDiscriminator()
+
+    x = torch.randn(16, 1, 40, 80)
+    print(x.shape)
+
+    out = model(x)
+    print(len(out), "Shape of output")
+
+    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print(pytorch_total_params)
diff --git a/train_fastspeech.py b/train_fastspeech.py
index d7f4b5c..ed53dd7 100644
--- a/train_fastspeech.py
+++ b/train_fastspeech.py
@@ -18,6 +18,8 @@
 from utils.util import get_commit_hash
 from utils.hparams import HParam
 
+from core.discriminator import SFDiscriminator
+
 BATCH_COUNT_CHOICES = ["auto", "seq", "bin", "frame"]
 BATCH_SORT_KEY_CHOICES = ["input", "output", "shuffle"]
 
@@ -34,6 +36,8 @@ def train(args, hp, hp_str, logger, vocoder):
     idim = len(valid_symbols)
     odim = hp.audio.num_mels
     model = fastspeech.FeedForwardTransformer(idim, odim, hp)
+    SFdisc = SFDiscriminator.cuda()
+
     # set torch device
     model = model.to(device)
     print("Model is loaded ...")
@@ -74,10 +78,18 @@ def train(args, hp, hp_str, logger, vocoder):
             hp.model.transformer_warmup_steps,
             hp.model.transformer_lr,
         )
+        optimizer_d = get_std_opt(
+            SFdisc,
+            hp.model.adim,
+            hp.model.transformer_warmup_steps,
+            hp.model.transformer_lr,
+        )
+
 
     print("Batch Size :", hp.train.batch_size)
 
     num_params(model)
+    num_params(SFdisc)
 
     os.makedirs(os.path.join(hp.train.log_dir, args.name), exist_ok=True)
     writer = SummaryWriter(os.path.join(hp.train.log_dir, args.name))
@@ -108,6 +120,11 @@ def train(args, hp, hp_str, logger, vocoder):
             loss = loss.mean() / hp.train.accum_grad
             running_loss += loss.item()
 
+            if step >= hp.train.discriminator_start:
+                loss = SFdisc()
+
+
+
             loss.backward()
 
             # update parameters
@@ -129,6 +146,7 @@ def train(args, hp, hp_str, logger, vocoder):
                 optimizer.step()
             optimizer.zero_grad()
 
+
             if step % hp.train.summary_interval == 0:
                 pbar.set_description(
                     "Average Loss %.04f Loss %.04f | step %d"

From c686daa2df1f374d92fbe13d5ee451d924b20b0c Mon Sep 17 00:00:00 2001
From: Karan Thakkar <kkt31415@gmail.com>
Date: Mon, 7 Sep 2020 16:38:15 +0530
Subject: [PATCH 02/10] Make SFdisc fully convolutional; add LSGAN training step

---
 configs/default.yaml  |  1 +
 core/discriminator.py | 52 ++++++++++++++++++++-------------------
 fastspeech.py         |  2 +-
 train_fastspeech.py   | 57 +++++++++++++++++++++++++++++++++++--------
 4 files changed, 76 insertions(+), 36 deletions(-)

diff --git a/configs/default.yaml b/configs/default.yaml
index b06b1e5..23b33cb 100644
--- a/configs/default.yaml
+++ b/configs/default.yaml
@@ -107,6 +107,7 @@ model:
 
 train:
   discriminator_start: 10000
+  rep_discriminator: 1
   # optimization related
   eos: False #True
   opt: 'noam'
diff --git a/core/discriminator.py b/core/discriminator.py
index a907ce6..889b2b0 100644
--- a/core/discriminator.py
+++ b/core/discriminator.py
@@ -9,46 +9,48 @@ def __init__(self):
 
         self.discriminator = nn.ModuleList([
             nn.Sequential(
-                nn.utils.weight_norm(nn.Conv2d(1, 40, kernel_size=3, stride=1)),
+                nn.Conv2d(1, 16, kernel_size=3, stride=1, padding = 1),
                 nn.LeakyReLU(0.2, inplace=True),
-                nn.utils.weight_norm(nn.Conv2d(40, 40, kernel_size=3, stride=1)),
+                nn.Conv2d(16, 32, kernel_size=3, stride=1, padding = 1),
                 nn.LeakyReLU(0.2, inplace=True),
-                nn.utils.weight_norm(nn.Conv2d(40, 40, kernel_size=3, stride=1)),
+                nn.Conv2d(32, 64, kernel_size=3, stride=1, padding = 1),
                 nn.LeakyReLU(0.2, inplace=True),
-                nn.Flatten(),
-                nn.Linear(46240,256)
+                nn.Conv2d(64, 1, kernel_size=3, stride=1, padding = 1)
+                #nn.Flatten(),   # add conv2d a 1 channel
+                #nn.Linear(46240,256)
                 )
                 ])
 
     def forward(self, x):
         '''
-            we directly predict score without last sigmoid function
-            since we're using Least Squares GAN (https://arxiv.org/abs/1611.04076)
+        we directly predict score without last sigmoid function
+        since we're using Least Squares GAN (https://arxiv.org/abs/1611.04076)
         '''
         for module in self.discriminator:
             x = module(x)
         return x
 
+def weights_init(m):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(0.0, 0.02)
+    elif classname.find("BatchNorm2d") != -1:
+        m.weight.data.normal_(1.0, 0.02)
+        m.bias.data.fill_(0)
+
 class SFDiscriminator(nn.Module):
     def __init__(self):
-        super(SFDiscriminator, self).__init__()
-
-        self.discriminators = nn.ModuleList(
-            [Discriminator() for _ in range(3)]
-        )
-
-    def forward(self, x):
-        # x - input mel of size [B, 1, 40, 80]
-        x_in = [ x[0:16, 0:1, 0:40, 0:40], x[0:16, 0:1, 0:40, 20:60], x[0:16, 0:1, 0:40, 40:80] ]
-        disc_out = list()
-
-
-        for disc, x_ in zip(self.discriminators, x_in):
-            x = disc(x_)
-            disc_out.append(x)
-
-        return disc_out # [SF_out0, SF_out1, SF_out2]
-
+        super().__init__()
+        self.disc1 = Discriminator()
+        self.disc2 = Discriminator()
+        self.disc3 = Discriminator()
+        self.apply(weights_init)
+    def forward(self, x, start):
+        results = []
+        results.append(self.disc1(x[:, : , 0:40, start: start + 40]))
+        results.append(self.disc2(x[:, :, 20:60, start: start + 40]))
+        results.append(self.disc3(x[:, :, 40:80, start: start + 40]))
+        return results
 
 if __name__ == '__main__':
     model = SFDiscriminator()
diff --git a/fastspeech.py b/fastspeech.py
index 0202677..0ae7866 100644
--- a/fastspeech.py
+++ b/fastspeech.py
@@ -333,7 +333,7 @@ def forward(
 
         # self.reporter.report(report_keys)
 
-        return loss, report_keys
+        return loss, report_keys, after_outs
 
     def inference(self, x: torch.Tensor) -> torch.Tensor:
         """Generate the sequence of features given the sequences of characters.
diff --git a/train_fastspeech.py b/train_fastspeech.py
index ed53dd7..78aee34 100644
--- a/train_fastspeech.py
+++ b/train_fastspeech.py
@@ -36,7 +36,8 @@ def train(args, hp, hp_str, logger, vocoder):
     idim = len(valid_symbols)
     odim = hp.audio.num_mels
     model = fastspeech.FeedForwardTransformer(idim, odim, hp)
-    SFdisc = SFDiscriminator.cuda()
+    model_d = SFDiscriminator.cuda()
+    criterion_d = torch.nn.MSELoss().cuda()
 
     # set torch device
     model = model.to(device)
@@ -78,8 +79,8 @@ def train(args, hp, hp_str, logger, vocoder):
             hp.model.transformer_warmup_steps,
             hp.model.transformer_lr,
         )
-        optimizer_d = get_std_opt(
-            SFdisc,
+        optim_d = get_std_opt(
+            model_d,
             hp.model.adim,
             hp.model.transformer_warmup_steps,
             hp.model.transformer_lr,
@@ -89,7 +90,7 @@ def train(args, hp, hp_str, logger, vocoder):
     print("Batch Size :", hp.train.batch_size)
 
     num_params(model)
-    num_params(SFdisc)
+    num_params(model_d)
 
     os.makedirs(os.path.join(hp.train.log_dir, args.name), exist_ok=True)
     writer = SummaryWriter(os.path.join(hp.train.log_dir, args.name))
@@ -100,6 +101,7 @@ def train(args, hp, hp_str, logger, vocoder):
         start = time.time()
         running_loss = 0
         j = 0
+        d_loss = []
 
         pbar = tqdm.tqdm(dataloader, desc="Loading train data")
         for data in pbar:
@@ -108,7 +110,7 @@ def train(args, hp, hp_str, logger, vocoder):
             # x : [batch , num_char], input_length : [batch], y : [batch, T_in, num_mel]
             #             # stop_token : [batch, T_in], out_length : [batch]
 
-            loss, report_dict = model(
+            loss, report_dict, mel = model(
                 x.cuda(),
                 input_length.cuda(),
                 y.cuda(),
@@ -121,10 +123,14 @@ def train(args, hp, hp_str, logger, vocoder):
             running_loss += loss.item()
 
             if step >= hp.train.discriminator_start:
-                loss = SFdisc()
-
-
-
+                start = np.random.randint(0, out_length.min()-40)
+                disc_fake = model_d(mel.cuda(), start)
+                for score_fake in disc_fake:
+                    # adv_loss += torch.mean(torch.sum(torch.pow(score_fake - 1.0, 2), dim=[1, 2]))
+                    adv_loss += criterion_d(score_fake, torch.ones_like(score_fake))
+                    adv_loss = adv_loss / len(disc_fake) # len(disc_fake) = 3
+
+            loss = loss + adv_loss
             loss.backward()
 
             # update parameters
@@ -147,6 +153,37 @@ def train(args, hp, hp_str, logger, vocoder):
             optimizer.zero_grad()
 
 
+            # Discriminator
+            loss_d_avg = 0.0
+            if step > hp.train.discriminator_start:
+                loss, report_dict, mel = model(
+                    x.cuda(),
+                    input_length.cuda(),
+                    y.cuda(),
+                    out_length.cuda(),
+                    dur.cuda(),
+                    e.cuda(),
+                    p.cuda(),
+                )
+                for _ in range(hp.train.rep_discriminator):
+                    optim_d.zero_grad()
+                    disc_fake = model_d(mel.cuda())
+                    disc_real = model_d(y.cuda())
+                    loss_d = 0.0
+                    loss_d_real = 0.0
+                    loss_d_fake = 0.0
+                    for score_fake, score_real in zip(disc_fake, disc_real):
+                        loss_d_real += criterion_d(score_real, torch.ones_like(score_real))
+                        loss_d_fake += criterion_d(score_fake, torch.zeros_like(score_fake))
+                    loss_d_real = loss_d_real / len(disc_real) # len(disc_real) = 3
+                    loss_d_fake = loss_d_fake / len(disc_fake) # len(disc_fake) = 3
+                    loss_d = loss_d_real + loss_d_fake
+                    loss_d.backward()
+                    optim_d.step()
+                    loss_d_sum += loss_d
+                loss_d_avg = loss_d_sum / hp.train.rep_discriminator
+                loss_d_avg = loss_d_avg.item()
+
             if step % hp.train.summary_interval == 0:
                 pbar.set_description(
                     "Average Loss %.04f Loss %.04f | step %d"
@@ -168,7 +205,7 @@ def train(args, hp, hp_str, logger, vocoder):
                     x_, input_length_, y_, _, out_length_, ids_, dur_, e_, p_ = valid
                     model.eval()
                     with torch.no_grad():
-                        loss_, report_dict_ = model(
+                        loss_, report_dict_, _ = model(
                             x_.cuda(),
                             input_length_.cuda(),
                             y_.cuda(),

From 698d0c4864deb6e55ca61d73320493904fbd8ddd Mon Sep 17 00:00:00 2001
From: Karan Thakkar <kkt31415@gmail.com>
Date: Mon, 7 Sep 2020 16:46:49 +0530
Subject: [PATCH 03/10] Fix CMU remapping in phonemes_to_sequence; add 'spn' symbol

---
 dataset/texts/__init__.py | 214 ++++++++++++--------------------------
 1 file changed, 69 insertions(+), 145 deletions(-)

diff --git a/dataset/texts/__init__.py b/dataset/texts/__init__.py
index 24f35e3..76c98fb 100644
--- a/dataset/texts/__init__.py
+++ b/dataset/texts/__init__.py
@@ -1,14 +1,7 @@
 """ from https://github.com/keithito/tacotron """
 import re
 from dataset.texts import cleaners
-from dataset.texts.symbols import (
-    symbols,
-    _eos,
-    phonemes_symbols,
-    PAD,
-    EOS,
-    _PHONEME_SEP,
-)
+from dataset.texts.symbols import symbols, _eos, phonemes_symbols, PAD, EOS, _PHONEME_SEP
 from dataset.texts.dict_ import symbols_
 import nltk
 from g2p_en import G2p
@@ -18,125 +11,64 @@
 _id_to_symbol = {i: s for i, s in enumerate(symbols)}
 
 # Regular expression matching text enclosed in curly braces:
-_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
+_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
 
 symbols_inv = {v: k for k, v in symbols_.items()}
 
-valid_symbols = [
-    "AA",
-    "AA1",
-    "AE",
-    "AE0",
-    "AE1",
-    "AH",
-    "AH0",
-    "AH1",
-    "AO",
-    "AO1",
-    "AW",
-    "AW0",
-    "AW1",
-    "AY",
-    "AY0",
-    "AY1",
-    "B",
-    "CH",
-    "D",
-    "DH",
-    "EH",
-    "EH0",
-    "EH1",
-    "ER",
-    "EY",
-    "EY0",
-    "EY1",
-    "F",
-    "G",
-    "HH",
-    "IH",
-    "IH0",
-    "IH1",
-    "IY",
-    "IY0",
-    "IY1",
-    "JH",
-    "K",
-    "L",
-    "M",
-    "N",
-    "NG",
-    "OW",
-    "OW0",
-    "OW1",
-    "OY",
-    "OY0",
-    "OY1",
-    "P",
-    "R",
-    "S",
-    "SH",
-    "T",
-    "TH",
-    "UH",
-    "UH0",
-    "UH1",
-    "UW",
-    "UW0",
-    "UW1",
-    "V",
-    "W",
-    "Y",
-    "Z",
-    "ZH",
-    "pau",
-    "sil",
-]
-
+valid_symbols = ['AA', 'AA1', 'AE', 'AE0', 'AE1', 'AH', 'AH0', 'AH1',
+                 'AO', 'AO1', 'AW', 'AW0', 'AW1', 'AY', 'AY0', 'AY1',
+                 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'ER', 'EY',
+                 'EY0', 'EY1', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IY',
+                 'IY0', 'IY1', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0',
+                 'OW1', 'OY', 'OY0','OY1', 'P', 'R', 'S', 'SH', 'T', 'TH',
+                 'UH', 'UH0', 'UH1',  'UW','UW0', 'UW1', 'V', 'W', 'Y', 'Z',
+                 'ZH', 'pau', 'sil', 'spn']
 
 def pad_with_eos_bos(_sequence):
     return _sequence + [_symbol_to_id[_eos]]
 
 
+
 def text_to_sequence(text, cleaner_names, eos):
-    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
-    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
-    Args:
-      text: string to convert to a sequence
-      cleaner_names: names of the cleaner functions to run the text through
-    Returns:
-      List of integers corresponding to the symbols in the text
-    """
+    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+      The text can optionally have ARPAbet sequences enclosed in curly braces embedded
+      in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
+      Args:
+        text: string to convert to a sequence
+        cleaner_names: names of the cleaner functions to run the text through
+      Returns:
+        List of integers corresponding to the symbols in the text
+    '''
     sequence = []
     if eos:
-        text = text + "~"
+        text = text + '~'
     try:
         sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
     except KeyError:
-        print("text : ", text)
+        print("text : ",text)
         exit(0)
 
     return sequence
 
 
 def sequence_to_text(sequence):
-    """Converts a sequence of IDs back to a string"""
-    result = ""
+    '''Converts a sequence of IDs back to a string'''
+    result = ''
     for symbol_id in sequence:
         if symbol_id in symbols_inv:
             s = symbols_inv[symbol_id]
             # Enclose ARPAbet back in curly braces:
-            if len(s) > 1 and s[0] == "@":
-                s = "{%s}" % s[1:]
+            if len(s) > 1 and s[0] == '@':
+                s = '{%s}' % s[1:]
             result += s
-    return result.replace("}{", " ")
+    return result.replace('}{', ' ')
 
 
 def _clean_text(text, cleaner_names):
     for name in cleaner_names:
         cleaner = getattr(cleaners, name)
         if not cleaner:
-            raise Exception("Unknown cleaner: %s" % name)
+            raise Exception('Unknown cleaner: %s' % name)
         text = cleaner(text)
     return text
 
@@ -146,11 +78,11 @@ def _symbols_to_sequence(symbols):
 
 
 def _arpabet_to_sequence(text):
-    return _symbols_to_sequence(["@" + s for s in text.split()])
+    return _symbols_to_sequence(['@' + s for s in text.split()])
 
 
 def _should_keep_symbol(s):
-    return s in _symbol_to_id and s != "_" and s != "~"
+    return s in _symbol_to_id and s != '_' and s != '~'
 
 
 # For phonemes
@@ -159,58 +91,55 @@ def _should_keep_symbol(s):
 
 
 def _should_keep_token(token, token_dict):
-    return (
-        token in token_dict
-        and token != PAD
-        and token != EOS
-        and token != _phoneme_to_id[PAD]
-        and token != _phoneme_to_id[EOS]
-    )
-
+    return token in token_dict \
+           and token != PAD and token != EOS \
+           and token != _phoneme_to_id[PAD] \
+           and token != _phoneme_to_id[EOS]
 
 def phonemes_to_sequence(phonemes):
     string = phonemes.split() if isinstance(phonemes, str) else phonemes
-    # string.append(EOS)
+    #string.append(EOS)
     sequence = list(map(convert_phoneme_CMU, string))
-    sequence = [_phoneme_to_id[s] for s in string]
-    # if _should_keep_token(s, _phoneme_to_id)]
+    sequence = [_phoneme_to_id[s] for s in sequence]
+                #if _should_keep_token(s, _phoneme_to_id)]
     return sequence
 
 
 def sequence_to_phonemes(sequence, use_eos=False):
     string = [_id_to_phoneme[idx] for idx in sequence]
-    # if _should_keep_token(idx, _id_to_phoneme)]
+              #if _should_keep_token(idx, _id_to_phoneme)]
     string = _PHONEME_SEP.join(string)
     if use_eos:
-        string = string.replace(EOS, "")
+        string = string.replace(EOS, '')
     return string
 
 
 def convert_phoneme_CMU(phoneme):
     REMAPPING = {
-        "AA0": "AA1",
-        "AA2": "AA1",
-        "AE2": "AE1",
-        "AH2": "AH1",
-        "AO0": "AO1",
-        "AO2": "AO1",
-        "AW2": "AW1",
-        "AY2": "AY1",
-        "EH2": "EH1",
-        "ER0": "EH1",
-        "ER1": "EH1",
-        "ER2": "EH1",
-        "EY2": "EY1",
-        "IH2": "IH1",
-        "IY2": "IY1",
-        "OW2": "OW1",
-        "OY2": "OY1",
-        "UH2": "UH1",
-        "UW2": "UW1",
+    'AA0': 'AA1',
+    'AA2': 'AA1',
+    'AE2': 'AE1',
+    'AH2': 'AH1',
+    'AO0': 'AO1',
+    'AO2': 'AO1',
+    'AW2': 'AW1',
+    'AY2': 'AY1',
+    'EH2': 'EH1',
+    'ER0': 'EH1',
+    'ER1': 'EH1',
+    'ER2': 'EH1',
+    'EY2': 'EY1',
+    'IH2': 'IH1',
+    'IY2': 'IY1',
+    'OW2': 'OW1',
+    'OY2': 'OY1',
+    'UH2': 'UH1',
+    'UW2': 'UW1',
     }
     return REMAPPING.get(phoneme, phoneme)
 
 
+
 def text_to_phonemes(text, custom_words={}):
     """
     Convert text into ARPAbet.
@@ -224,7 +153,7 @@ def text_to_phonemes(text, custom_words={}):
     """
     g2p = G2p()
 
-    """def convert_phoneme_CMU(phoneme):
+    '''def convert_phoneme_CMU(phoneme):
         REMAPPING = {
             'AA0': 'AA1',
             'AA2': 'AA1',
@@ -247,18 +176,17 @@ def text_to_phonemes(text, custom_words={}):
             'UW2': 'UW1',
         }
         return REMAPPING.get(phoneme, phoneme)
-        """
-
+        '''
     def convert_phoneme_listener(phoneme):
-        VOWELS = ["A", "E", "I", "O", "U"]
+        VOWELS = ['A', 'E', 'I', 'O', 'U']
         if phoneme[0] in VOWELS:
-            phoneme += "1"
-        return phoneme  # convert_phoneme_CMU(phoneme)
+            phoneme += '1'
+        return phoneme #convert_phoneme_CMU(phoneme)
 
     try:
         known_words = nltk.corpus.cmudict.dict()
     except LookupError:
-        nltk.download("cmudict")
+        nltk.download('cmudict')
         known_words = nltk.corpus.cmudict.dict()
 
     for word, phonemes in custom_words.items():
@@ -267,20 +195,16 @@ def convert_phoneme_listener(phoneme):
     words = nltk.tokenize.WordPunctTokenizer().tokenize(text.lower())
 
     phonemes = []
-    PUNCTUATION = "!?.,-:;\"'()"
+    PUNCTUATION = '!?.,-:;"\'()'
     for word in words:
         if all(c in PUNCTUATION for c in word):
-            pronounciation = ["pau"]
+            pronounciation = ['pau']
         elif word in known_words:
             pronounciation = known_words[word][0]
-            pronounciation = list(
-                pronounciation
-            )  # map(convert_phoneme_CMU, pronounciation))
+            pronounciation = list(pronounciation)#map(convert_phoneme_CMU, pronounciation))
         else:
             pronounciation = g2p(word)
-            pronounciation = list(
-                pronounciation
-            )  # (map(convert_phoneme_CMU, pronounciation))
+            pronounciation = list(pronounciation)#(map(convert_phoneme_CMU, pronounciation))
 
         phonemes += pronounciation
 

From 0d6983472d28154e96bcf59181fb6fbf6dfa7b40 Mon Sep 17 00:00:00 2001
From: Karan Thakkar <kkt31415@gmail.com>
Date: Mon, 7 Sep 2020 16:47:17 +0530
Subject: [PATCH 04/10] Instantiate SFDiscriminator before calling .cuda()

---
 train_fastspeech.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train_fastspeech.py b/train_fastspeech.py
index 78aee34..3c37192 100644
--- a/train_fastspeech.py
+++ b/train_fastspeech.py
@@ -36,7 +36,7 @@ def train(args, hp, hp_str, logger, vocoder):
     idim = len(valid_symbols)
     odim = hp.audio.num_mels
     model = fastspeech.FeedForwardTransformer(idim, odim, hp)
-    model_d = SFDiscriminator.cuda()
+    model_d = SFDiscriminator().cuda()
     criterion_d = torch.nn.MSELoss().cuda()
 
     # set torch device

From be35ebea418cf2d45b41e3c6086f4d509ec11bb7 Mon Sep 17 00:00:00 2001
From: Karan Thakkar <kkt31415@gmail.com>
Date: Mon, 7 Sep 2020 16:50:29 +0530
Subject: [PATCH 05/10] Gate adversarial loss on global_step; add it only when active

---
 train_fastspeech.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/train_fastspeech.py b/train_fastspeech.py
index 3c37192..4f0b288 100644
--- a/train_fastspeech.py
+++ b/train_fastspeech.py
@@ -122,15 +122,15 @@ def train(args, hp, hp_str, logger, vocoder):
             loss = loss.mean() / hp.train.accum_grad
             running_loss += loss.item()
 
-            if step >= hp.train.discriminator_start:
+            if global_step >= hp.train.discriminator_start:
                 start = np.random.randint(0, out_length.min()-40)
                 disc_fake = model_d(mel.cuda(), start)
                 for score_fake in disc_fake:
                     # adv_loss += torch.mean(torch.sum(torch.pow(score_fake - 1.0, 2), dim=[1, 2]))
                     adv_loss += criterion_d(score_fake, torch.ones_like(score_fake))
                     adv_loss = adv_loss / len(disc_fake) # len(disc_fake) = 3
+                loss = loss + adv_loss
 
-            loss = loss + adv_loss
             loss.backward()
 
             # update parameters

From f20adcd282269639b29aeaf97698d03aec56f3f5 Mon Sep 17 00:00:00 2001
From: Karan Thakkar <kkt31415@gmail.com>
Date: Mon, 7 Sep 2020 16:58:46 +0530
Subject: [PATCH 06/10] Pass a random start_disc crop offset to every SFdisc call

---
 train_fastspeech.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/train_fastspeech.py b/train_fastspeech.py
index 4f0b288..5f37d91 100644
--- a/train_fastspeech.py
+++ b/train_fastspeech.py
@@ -123,8 +123,8 @@ def train(args, hp, hp_str, logger, vocoder):
             running_loss += loss.item()
 
             if global_step >= hp.train.discriminator_start:
-                start = np.random.randint(0, out_length.min()-40)
-                disc_fake = model_d(mel.cuda(), start)
+                start_disc = np.random.randint(0, out_length.min()-40)
+                disc_fake = model_d(mel.cuda(), start_disc)
                 for score_fake in disc_fake:
                     # adv_loss += torch.mean(torch.sum(torch.pow(score_fake - 1.0, 2), dim=[1, 2]))
                     adv_loss += criterion_d(score_fake, torch.ones_like(score_fake))
@@ -167,8 +167,9 @@ def train(args, hp, hp_str, logger, vocoder):
                 )
                 for _ in range(hp.train.rep_discriminator):
                     optim_d.zero_grad()
-                    disc_fake = model_d(mel.cuda())
-                    disc_real = model_d(y.cuda())
+                    start_disc = np.random.randint(0, out_length.min()-40)
+                    disc_fake = model_d(mel.cuda(), start_disc)
+                    disc_real = model_d(y.cuda(), start_disc)
                     loss_d = 0.0
                     loss_d_real = 0.0
                     loss_d_fake = 0.0

From 3b61b78438489b08f0eff5d261e34802e0528837 Mon Sep 17 00:00:00 2001
From: Karan Thakkar <kkt31415@gmail.com>
Date: Mon, 7 Sep 2020 17:38:26 +0530
Subject: [PATCH 07/10] remove bugs

---
 configs/default.yaml  |  1 -
 core/discriminator.py |  9 +++------
 fastspeech.py         |  5 +++--
 train_fastspeech.py   | 45 ++++++++++++++++++++++++-------------------
 4 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/configs/default.yaml b/configs/default.yaml
index 23b33cb..b06b1e5 100644
--- a/configs/default.yaml
+++ b/configs/default.yaml
@@ -107,7 +107,6 @@ model:
 
 train:
   discriminator_start: 10000
-  rep_discriminator: 1
   # optimization related
   eos: False #True
   opt: 'noam'
diff --git a/core/discriminator.py b/core/discriminator.py
index 889b2b0..319e389 100644
--- a/core/discriminator.py
+++ b/core/discriminator.py
@@ -7,8 +7,7 @@ class Discriminator(nn.Module):
     def __init__(self):
         super(Discriminator, self).__init__()
 
-        self.discriminator = nn.ModuleList([
-            nn.Sequential(
+        self.discriminator = nn.Sequential(
                 nn.Conv2d(1, 16, kernel_size=3, stride=1, padding = 1),
                 nn.LeakyReLU(0.2, inplace=True),
                 nn.Conv2d(16, 32, kernel_size=3, stride=1, padding = 1),
@@ -19,16 +18,14 @@ def __init__(self):
                 #nn.Flatten(),   # add conv2d a 1 channel
                 #nn.Linear(46240,256)
                 )
-                ])
 
     def forward(self, x):
         '''
         we directly predict score without last sigmoid function
         since we're using Least Squares GAN (https://arxiv.org/abs/1611.04076)
         '''
-        for module in self.discriminator:
-            x = module(x)
-        return x
+        print(x.shape, "Input to Discriminator")
+        return self.discriminator(x)
 
 def weights_init(m):
     classname = m.__class__.__name__
diff --git a/fastspeech.py b/fastspeech.py
index 0ae7866..f4658a6 100644
--- a/fastspeech.py
+++ b/fastspeech.py
@@ -269,7 +269,7 @@ def forward(
         before_outs, after_outs, d_outs, e_outs, p_outs = self._forward(
             xs, ilens, olens, ds, es, ps, is_inference=False
         )
-
+        out_mels = after_outs.detach()
         # modifiy mod part of groundtruth
         # if hp.model.reduction_factor > 1:
         #     olens = olens.new([olen - olen % self.reduction_factor for olen in olens])
@@ -332,8 +332,9 @@ def forward(
         ]
 
         # self.reporter.report(report_keys)
+        #print(out_mels.shape, "Shape of out_mels in Fs")
 
-        return loss, report_keys, after_outs
+        return loss, report_keys, out_mels
 
     def inference(self, x: torch.Tensor) -> torch.Tensor:
         """Generate the sequence of features given the sequences of characters.
diff --git a/train_fastspeech.py b/train_fastspeech.py
index 5f37d91..21c18bc 100644
--- a/train_fastspeech.py
+++ b/train_fastspeech.py
@@ -103,6 +103,7 @@ def train(args, hp, hp_str, logger, vocoder):
         j = 0
         d_loss = []
 
+
         pbar = tqdm.tqdm(dataloader, desc="Loading train data")
         for data in pbar:
             global_step += 1
@@ -122,9 +123,12 @@ def train(args, hp, hp_str, logger, vocoder):
             loss = loss.mean() / hp.train.accum_grad
             running_loss += loss.item()
 
+            adv_loss = 0
+
             if global_step >= hp.train.discriminator_start:
                 start_disc = np.random.randint(0, out_length.min()-40)
-                disc_fake = model_d(mel.cuda(), start_disc)
+                print(mel.shape)
+                disc_fake = model_d(mel.unsqueeze(1).cuda(), start_disc)
                 for score_fake in disc_fake:
                     # adv_loss += torch.mean(torch.sum(torch.pow(score_fake - 1.0, 2), dim=[1, 2]))
                     adv_loss += criterion_d(score_fake, torch.ones_like(score_fake))
@@ -155,6 +159,8 @@ def train(args, hp, hp_str, logger, vocoder):
 
             # Discriminator
             loss_d_avg = 0.0
+            loss_d_sum = 0.0
+
             if step > hp.train.discriminator_start:
                 loss, report_dict, mel = model(
                     x.cuda(),
@@ -165,25 +171,24 @@ def train(args, hp, hp_str, logger, vocoder):
                     e.cuda(),
                     p.cuda(),
                 )
-                for _ in range(hp.train.rep_discriminator):
-                    optim_d.zero_grad()
-                    start_disc = np.random.randint(0, out_length.min()-40)
-                    disc_fake = model_d(mel.cuda(), start_disc)
-                    disc_real = model_d(y.cuda(), start_disc)
-                    loss_d = 0.0
-                    loss_d_real = 0.0
-                    loss_d_fake = 0.0
-                    for score_fake, score_real in zip(disc_fake, disc_real):
-                        loss_d_real += criterion_d(score_real, torch.ones_like(score_real))
-                        loss_d_fake += criterion_d(score_fake, torch.zeros_like(score_fake))
-                    loss_d_real = loss_d_real / len(disc_real) # len(disc_real) = 3
-                    loss_d_fake = loss_d_fake / len(disc_fake) # len(disc_fake) = 3
-                    loss_d = loss_d_real + loss_d_fake
-                    loss_d.backward()
-                    optim_d.step()
-                    loss_d_sum += loss_d
-                loss_d_avg = loss_d_sum / hp.train.rep_discriminator
-                loss_d_avg = loss_d_avg.item()
+
+                optim_d.zero_grad()
+                start_disc = np.random.randint(0, out_length.min()-40)
+                disc_fake = model_d(mel.unsqueeze(1).cuda(), start_disc)
+                disc_real = model_d(y.unsqueeze(1).cuda(), start_disc)
+                loss_d = 0.0
+                loss_d_real = 0.0
+                loss_d_fake = 0.0
+                for score_fake, score_real in zip(disc_fake, disc_real):
+                    loss_d_real += criterion_d(score_real, torch.ones_like(score_real))
+                    loss_d_fake += criterion_d(score_fake, torch.zeros_like(score_fake))
+                loss_d_real = loss_d_real / len(disc_real) # len(disc_real) = 3
+                loss_d_fake = loss_d_fake / len(disc_fake) # len(disc_fake) = 3
+                loss_d = loss_d_real + loss_d_fake
+                loss_d.backward()
+                optim_d.step()
+                loss_d_sum += loss_d
+                loss_d_avg = loss_d_sum.item()
 
             if step % hp.train.summary_interval == 0:
                 pbar.set_description(

From db8abd69f945b392e4fed3070faca750929ae3d1 Mon Sep 17 00:00:00 2001
From: Karan Thakkar <kkt31415@gmail.com>
Date: Tue, 8 Sep 2020 10:58:09 +0530
Subject: [PATCH 08/10] working baseline

---
 configs/default.yaml  | 11 ++++++-----
 core/discriminator.py |  8 ++++----
 train_fastspeech.py   |  8 +++++++-
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/configs/default.yaml b/configs/default.yaml
index b06b1e5..d6a33e2 100644
--- a/configs/default.yaml
+++ b/configs/default.yaml
@@ -1,6 +1,6 @@
 data:
-  data_dir: 'H:\Deepsync\backup\fastspeech\data\'
-  wav_dir: 'H:\Deepsync\backup\deepsync\LJSpeech-1.1\wavs\'
+  data_dir: '/workspace/data/'
+  wav_dir: '/workspace/LJSpeech-1.1/wavs/'
   # Compute statistics
   e_mean: 21.578571319580078
   e_std: 18.916799545288086
@@ -106,11 +106,12 @@ model:
 
 
 train:
-  discriminator_start: 10000
+  discriminator_start: 20000
+  rep_discriminator: 1
   # optimization related
   eos: False #True
   opt: 'noam'
-  accum_grad: 4
+  accum_grad: 1
   grad_clip: 1.0
   weight_decay: 0.001
   patience: 0
@@ -126,7 +127,7 @@ train:
   seed: 1       # random seed number
   resume: ""    # the snapshot path to resume (if set empty, no effect)
   use_phonemes: True
-  batch_size : 16
+  batch_size : 24
   # other
   melgan_vocoder : True
   save_interval : 1000
diff --git a/core/discriminator.py b/core/discriminator.py
index 319e389..08b2a30 100644
--- a/core/discriminator.py
+++ b/core/discriminator.py
@@ -24,7 +24,7 @@ def forward(self, x):
         we directly predict score without last sigmoid function
         since we're using Least Squares GAN (https://arxiv.org/abs/1611.04076)
         '''
-        print(x.shape, "Input to Discriminator")
+        # print(x.shape, "Input to Discriminator")
         return self.discriminator(x)
 
 def weights_init(m):
@@ -44,9 +44,9 @@ def __init__(self):
         self.apply(weights_init)
     def forward(self, x, start):
         results = []
-        results.append(self.disc1(x[:, : , 0:40, start: start + 40]))
-        results.append(self.disc2(x[:, :, 20:60, start: start + 40]))
-        results.append(self.disc3(x[:, :, 40:80, start: start + 40]))
+        results.append(self.disc1(x[:, :, start: start + 40, 0:40]))
+        results.append(self.disc2(x[:, :, start: start + 40, 20:60]))
+        results.append(self.disc3(x[:, :, start: start + 40, 40:80, ]))
         return results
 
 if __name__ == '__main__':
diff --git a/train_fastspeech.py b/train_fastspeech.py
index 21c18bc..050047c 100644
--- a/train_fastspeech.py
+++ b/train_fastspeech.py
@@ -54,6 +54,12 @@ def train(args, hp, hp_str, logger, vocoder):
                 hp.model.transformer_warmup_steps,
                 hp.model.transformer_lr,
             )
+            optim_d = get_std_opt(
+            model_d,
+            hp.model.adim,
+            hp.model.transformer_warmup_steps,
+            hp.model.transformer_lr,
+        )
             optimizer.load_state_dict(checkpoint["optim"])
             global_step = checkpoint["step"]
 
@@ -127,7 +133,7 @@ def train(args, hp, hp_str, logger, vocoder):
 
             if global_step >= hp.train.discriminator_start:
                 start_disc = np.random.randint(0, out_length.min()-40)
-                print(mel.shape)
+
                 disc_fake = model_d(mel.unsqueeze(1).cuda(), start_disc)
                 for score_fake in disc_fake:
                     # adv_loss += torch.mean(torch.sum(torch.pow(score_fake - 1.0, 2), dim=[1, 2]))

From 32e7a07da857d518f110bf5e61e45049fe356cb4 Mon Sep 17 00:00:00 2001
From: Karan Thakkar <kkt31415@gmail.com>
Date: Tue, 8 Sep 2020 16:52:28 +0530
Subject: [PATCH 09/10] add adversarial loss on tensorboard

---
 train_fastspeech.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/train_fastspeech.py b/train_fastspeech.py
index 050047c..4bf3518 100644
--- a/train_fastspeech.py
+++ b/train_fastspeech.py
@@ -195,6 +195,7 @@ def train(args, hp, hp_str, logger, vocoder):
                 optim_d.step()
                 loss_d_sum += loss_d
                 loss_d_avg = loss_d_sum.item()
+                writer.add_scalar("Advverserial Loss", loss_d_avg, step)
 
             if step % hp.train.summary_interval == 0:
                 pbar.set_description(

From 9f0d7e10b98a88bf63a17e3dc74f57bed31bffd4 Mon Sep 17 00:00:00 2001
From: Karan Thakkar <kkt31415@gmail.com>
Date: Tue, 8 Sep 2020 17:02:01 +0530
Subject: [PATCH 10/10] Remove _should_keep_symbol from dataset/texts/__init__.py

---
 dataset/texts/__init__.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/dataset/texts/__init__.py b/dataset/texts/__init__.py
index 76c98fb..a9f5930 100644
--- a/dataset/texts/__init__.py
+++ b/dataset/texts/__init__.py
@@ -81,10 +81,6 @@ def _arpabet_to_sequence(text):
     return _symbols_to_sequence(['@' + s for s in text.split()])
 
 
-def _should_keep_symbol(s):
-    return s in _symbol_to_id and s is not '_' and s is not '~'
-
-
 # For phonemes
 _phoneme_to_id = {s: i for i, s in enumerate(valid_symbols)}
 _id_to_phoneme = {i: s for i, s in enumerate(valid_symbols)}