diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index c61ec0197b..de7e439d8a 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -6,33 +6,53 @@ labels: bug assignees: '' --- + +## 🐛 Description -Welcome to the 🐸TTS project! We are excited to see your interest, and appreciate your support! + -This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file. +### To Reproduce -If you've found a bug, please provide the following information: + + +### Expected behavior + + + +### Environment + + -**Environment (please complete the following information):** -- **OS Platform and Distribution (e.g., Linux Ubuntu 16.04)**: -- **PyTorch or TensorFlow version (use command below)**: -- **Python version**: -- **CUDA/cuDNN version**: -- **GPU model and memory**: -- **Exact command to reproduce**: +- 🐸TTS Version (e.g., 1.3.0): +- PyTorch Version (e.g., 1.8) +- Python version: +- OS (e.g., Linux): +- CUDA/cuDNN version: +- GPU models and configuration: +- How you installed PyTorch (`conda`, `pip`, source): +- Any other relevant information: -**Additional context** -Add any other context about the problem here. +### Additional context + diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index d49aa70749..941ab9b143 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -2,25 +2,24 @@ name: 🚀 Feature request about: Suggest a feature or an idea for this project title: '[Feature request] ' -labels: feature request +labels: feature request assignees: '' --- + +**🚀 Feature Description** -Welcome to the 🐸TTS project! We are excited to see your interest, and appreciate your support! + -This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file. +**Solution** -If you have a feature request, then please provide the following information: + -**Is your feature request related to a problem? Please describe.** -A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] +**Alternative Solutions** -**Describe the solution you'd like** -A clear and concise description of what you want to happen. - -**Describe alternatives you've considered** -A clear and concise description of any alternative solutions or features you've considered. + **Additional context** -Add any other context or screenshots about the feature request here. 
+ + diff --git a/TTS/.models.json b/TTS/.models.json index 6bff584b92..44c5fc6c63 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -98,7 +98,7 @@ "fast_pitch":{ "description": "FastPitch model trained on VCTK dataseset.", "github_rls_url": "https://coqui.gateway.scarf.sh/v0.4.0/tts_models--en--vctk--fast_pitch.zip", - "default_vocoder": "vocoder_models/en/vctk/hifigan_v2", + "default_vocoder": null, "commit": "bdab788d", "author": "Eren @erogol", "license": "CC BY-NC-ND 4.0", diff --git a/TTS/bin/collect_env_info.py b/TTS/bin/collect_env_info.py new file mode 100644 index 0000000000..662fcd02ec --- /dev/null +++ b/TTS/bin/collect_env_info.py @@ -0,0 +1,48 @@ +"""Get detailed info about the working environment.""" +import os +import platform +import sys + +import numpy +import torch + +sys.path += [os.path.abspath(".."), os.path.abspath(".")] +import json + +import TTS + + +def system_info(): + return { + "OS": platform.system(), + "architecture": platform.architecture(), + "version": platform.version(), + "processor": platform.processor(), + "python": platform.python_version(), + } + + +def cuda_info(): + return { + "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())], + "available": torch.cuda.is_available(), + "version": torch.version.cuda, + } + + +def package_info(): + return { + "numpy": numpy.__version__, + "PyTorch_version": torch.__version__, + "PyTorch_debug": torch.version.debug, + "TTS": TTS.__version__, + } + + +def main(): + details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()} + print(json.dumps(details, indent=4, sort_keys=True)) + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 3e69e1adca..fb2e41b4ab 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -254,7 +254,7 @@ def main(): print(" > Text: {}".format(args.text)) # kick it - wav = synthesizer.tts(args.text, args.speaker_idx, args.speaker_wav) + wav = synthesizer.tts(args.text, args.speaker_idx, args.speaker_wav, args.gst_style) # save the results print(" > Saving output to {}".format(args.out_path)) diff --git a/TTS/server/server.py b/TTS/server/server.py index 5d66d9eb63..c6d67141c0 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -103,7 +103,7 @@ def convert_boolean(x): model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda ) -use_multi_speaker = hasattr(synthesizer.tts_model, "speaker_manager") and synthesizer.tts_model.num_speakers > 1 +use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and synthesizer.tts_model.num_speakers > 1 speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None) # TODO: set this from SpeakerManager use_gst = synthesizer.tts_config.get("use_gst", False) diff --git a/TTS/trainer.py b/TTS/trainer.py index 9fcd77a765..2a2cfc46bb 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -284,8 +284,8 @@ def __init__( # pylint: disable=dangerous-default-value self.optimizer = self.get_optimizer(self.model, self.config) # CALLBACK - self.callbacks = TrainerCallback(self) - self.callbacks.on_init_start() + self.callbacks = TrainerCallback() + self.callbacks.on_init_start(self) # init AMP if self.use_amp_scaler: @@ -324,7 +324,7 @@ def __init__( # pylint: disable=dangerous-default-value num_params = count_parameters(self.model) print("\n > Model has {} parameters".format(num_params)) - self.callbacks.on_init_end() + self.callbacks.on_init_end(self) @staticmethod def 
parse_argv(args: Union[Coqpit, List]): @@ -677,7 +677,7 @@ def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_ti Returns: Tuple[Dict, Dict]: Model outputs and losses. """ - self.callbacks.on_train_step_start() + self.callbacks.on_train_step_start(self) # format data batch = self.format_batch(batch) loader_time = time.time() - loader_start_time @@ -792,7 +792,7 @@ def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_ti self.dashboard_logger.flush() self.total_steps_done += 1 - self.callbacks.on_train_step_end() + self.callbacks.on_train_step_end(self) return outputs, loss_dict def train_epoch(self) -> None: @@ -983,7 +983,7 @@ def _fit(self) -> None: if self.num_gpus > 1: # let all processes sync up before starting with a new epoch of training dist.barrier() - self.callbacks.on_epoch_start() + self.callbacks.on_epoch_start(self) self.keep_avg_train = KeepAverage() self.keep_avg_eval = KeepAverage() if self.config.run_eval else None self.epochs_done = epoch @@ -999,7 +999,7 @@ def _fit(self) -> None: ) if self.args.rank in [None, 0]: self.save_best_model() - self.callbacks.on_epoch_end() + self.callbacks.on_epoch_end(self) def fit(self) -> None: """Where the ✨️magic✨️ happens...""" @@ -1008,7 +1008,7 @@ def fit(self) -> None: if self.args.rank == 0: self.dashboard_logger.finish() except KeyboardInterrupt: - self.callbacks.on_keyboard_interrupt() + self.callbacks.on_keyboard_interrupt(self) # if the output folder is empty remove the run. remove_experiment_folder(self.output_path) # clear the DDP processes diff --git a/TTS/tts/configs/fast_speech_config.py b/TTS/tts/configs/fast_speech_config.py index 040a891091..31d994421d 100644 --- a/TTS/tts/configs/fast_speech_config.py +++ b/TTS/tts/configs/fast_speech_config.py @@ -110,6 +110,7 @@ class FastSpeechConfig(BaseTTSConfig): model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False) # multi-speaker settings + num_speakers: int = 0 speakers_file: str = None use_speaker_embedding: bool = False use_d_vector_file: bool = False @@ -142,7 +143,7 @@ class FastSpeechConfig(BaseTTSConfig): r: int = 1 # DO NOT CHANGE # dataset configs - compute_f0: bool = True + compute_f0: bool = False f0_cache_path: str = None # testing diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 780249363d..4fae974f88 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -91,6 +91,8 @@ def load_tts_samples( for idx, ins in enumerate(meta_data_eval_all): attn_file = meta_data[ins[1]].strip() meta_data_eval_all[idx].append(attn_file) + # set none for the next iter + formatter = None return meta_data_train_all, meta_data_eval_all @@ -110,3 +112,18 @@ def _get_formatter_by_name(name): """Returns the respective preprocessing function.""" thismodule = sys.modules[__name__] return getattr(thismodule, name.lower()) + + +def find_unique_chars(data_samples, verbose=True): + texts = "".join(item[0] for item in data_samples) + chars = set(texts) + lower_chars = filter(lambda c: c.islower(), chars) + chars_force_lower = [c.lower() for c in chars] + chars_force_lower = set(chars_force_lower) + + if verbose: + print(f" > Number of unique characters: {len(chars)}") + print(f" > Unique characters: {''.join(sorted(chars))}") + print(f" > Unique lower characters: {''.join(sorted(lower_chars))}") + print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}") + return chars_force_lower diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py 
index dcd1874006..425eb0cdd0 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -60,7 +60,13 @@ def mozilla_de(root_path, meta_file): def mailabs(root_path, meta_files=None): - """Normalizes M-AI-Labs meta data files to TTS format""" + """Normalizes M-AI-Labs meta data files to TTS format + + Args: + root_path (str): root folder of the MAILAB language folder. + meta_files (str): list of meta files to be used in the training. If None, finds all the csv files + recursively. Defaults to None + """ speaker_regex = re.compile("by_book/(male|female)/(?P[^/]+)/") if meta_files is None: csv_files = glob(root_path + "/**/metadata.csv", recursive=True) diff --git a/TTS/tts/layers/vits/stochastic_duration_predictor.py b/TTS/tts/layers/vits/stochastic_duration_predictor.py index 53f7ca7c04..91e53da325 100644 --- a/TTS/tts/layers/vits/stochastic_duration_predictor.py +++ b/TTS/tts/layers/vits/stochastic_duration_predictor.py @@ -266,7 +266,7 @@ def forward(self, x, x_mask, dr=None, g=None, reverse=False, noise_scale=1.0): flows = list(reversed(self.flows)) flows = flows[:-2] + [flows[-1]] # remove a useless vflow - z = torch.rand(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale + z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale for flow in flows: z = torch.flip(z, [1]) z = flow(z, x_mask, g=x, reverse=reverse) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index b77c1e2315..854526de8c 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -219,7 +219,7 @@ def get_data_loader( use_phonemes=config.use_phonemes, phoneme_language=config.phoneme_language, enable_eos_bos=config.enable_eos_bos_chars, - use_noise_augment=not is_eval, + use_noise_augment=False if is_eval else config.use_noise_augment, verbose=verbose, speaker_id_mapping=speaker_id_mapping, d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None, diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 5185139e93..578c26c0d3 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -250,11 +250,11 @@ def synthesis( # GST processing style_mel = None custom_symbols = None - if CONFIG.has("gst") and CONFIG.gst and style_wav is not None: - if isinstance(style_wav, dict): - style_mel = style_wav - else: - style_mel = compute_style_mel(style_wav, ap, cuda=use_cuda) + if style_wav: + style_mel = compute_style_mel(style_wav, ap, cuda=use_cuda) + elif CONFIG.has("gst") and CONFIG.gst and not style_wav: + if CONFIG.gst.gst_style_input_weights: + style_mel = CONFIG.gst.gst_style_input_weights if hasattr(model, "make_symbols"): custom_symbols = model.make_symbols(CONFIG) # preprocess the given text diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 66f518b407..537d23017f 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -5,6 +5,7 @@ from typing import Dict, List import gruut +from gruut_ipa import IPA from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes @@ -32,7 +33,7 @@ GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ") -def text2phone(text, language, use_espeak_phonemes=False): +def text2phone(text, language, use_espeak_phonemes=False, keep_stress=False): """Convert graphemes to phonemes. 
Parameters: text (str): text to phonemize @@ -51,36 +52,44 @@ def text2phone(text, language, use_espeak_phonemes=False): ph = japanese_text_to_phonemes(text) return ph - if gruut.is_language_supported(language): - # Use gruut for phonemization - phonemizer_args = { - "remove_stress": True, - "ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA | - "ipa_major_breaks": False, # don't replace periods with IPA ‖ - } - - if use_espeak_phonemes: - # Use a lexicon/g2p model train on eSpeak IPA instead of gruut IPA. - # This is intended for backwards compatibility with TTS<=v0.0.13 - # pre-trained models. - phonemizer_args["model_prefix"] = "espeak" - - ph_list = gruut.text_to_phonemes( - text, - lang=language, - return_format="word_phonemes", - phonemizer_args=phonemizer_args, - ) - - # Join and re-split to break apart dipthongs, suprasegmentals, etc. - ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list] - ph = "| ".join(ph_words) - - # Fix a few phonemes - ph = ph.translate(GRUUT_TRANS_TABLE) - return ph - - raise ValueError(f" [!] Language {language} is not supported for phonemization.") + if not gruut.is_language_supported(language): + raise ValueError(f" [!] Language {language} is not supported for phonemization.") + + # Use gruut for phonemization + ph_list = [] + for sentence in gruut.sentences(text, lang=language, espeak=use_espeak_phonemes): + for word in sentence: + if word.is_break: + # Use actual character for break phoneme (e.g., comma) + if ph_list: + # Join with previous word + ph_list[-1].append(word.text) + else: + # First word is punctuation + ph_list.append([word.text]) + elif word.phonemes: + # Add phonemes for word + word_phonemes = [] + + for word_phoneme in word.phonemes: + if not keep_stress: + # Remove primary/secondary stress + word_phoneme = IPA.without_stress(word_phoneme) + + word_phoneme = word_phoneme.translate(GRUUT_TRANS_TABLE) + + if word_phoneme: + # Flatten phonemes + word_phonemes.extend(word_phoneme) + + if word_phonemes: + ph_list.append(word_phonemes) + + # Join and re-split to break apart dipthongs, suprasegmentals, etc. + ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list] + ph = "| ".join(ph_words) + + return ph def intersperse(sequence, token): diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index dd9c570197..e64b95e01f 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -674,7 +674,7 @@ def compute_f0(self, x: np.ndarray) -> np.ndarray: return f0 ### Audio Processing ### - def find_endpoint(self, wav: np.ndarray, threshold_db=-40, min_silence_sec=0.8) -> int: + def find_endpoint(self, wav: np.ndarray, min_silence_sec=0.8) -> int: """Find the last point without silence at the end of a audio signal. 
Args: @@ -687,7 +687,7 @@ def find_endpoint(self, wav: np.ndarray, threshold_db=-40, min_silence_sec=0.8) """ window_length = int(self.sample_rate * min_silence_sec) hop_length = int(window_length / 4) - threshold = self._db_to_amp(threshold_db) + threshold = self._db_to_amp(-self.trim_db) for x in range(hop_length, len(wav) - window_length, hop_length): if np.max(wav[x : x + window_length]) < threshold: return x + hop_length diff --git a/TTS/utils/callbacks.py b/TTS/utils/callbacks.py index 18b6c34c27..511d215c65 100644 --- a/TTS/utils/callbacks.py +++ b/TTS/utils/callbacks.py @@ -1,75 +1,105 @@ class TrainerCallback: - def __init__(self, trainer): - super().__init__() - self.trainer = trainer - - def on_init_start(self) -> None: - if hasattr(self.trainer.model, "on_init_start"): - self.trainer.model.on_init_start(self.trainer) - - if hasattr(self.trainer.criterion, "on_init_start"): - self.trainer.criterion.on_init_start(self.trainer) - - if hasattr(self.trainer.optimizer, "on_init_start"): - self.trainer.optimizer.on_init_start(self.trainer) - - def on_init_end(self) -> None: - if hasattr(self.trainer.model, "on_init_end"): - self.trainer.model.on_init_end(self.trainer) - - if hasattr(self.trainer.criterion, "on_init_end"): - self.trainer.criterion.on_init_end(self.trainer) - - if hasattr(self.trainer.optimizer, "on_init_end"): - self.trainer.optimizer.on_init_end(self.trainer) - - def on_epoch_start(self) -> None: - if hasattr(self.trainer.model, "on_epoch_start"): - self.trainer.model.on_epoch_start(self.trainer) - - if hasattr(self.trainer.criterion, "on_epoch_start"): - self.trainer.criterion.on_epoch_start(self.trainer) - - if hasattr(self.trainer.optimizer, "on_epoch_start"): - self.trainer.optimizer.on_epoch_start(self.trainer) - - def on_epoch_end(self) -> None: - if hasattr(self.trainer.model, "on_epoch_end"): - self.trainer.model.on_epoch_end(self.trainer) - - if hasattr(self.trainer.criterion, "on_epoch_end"): - self.trainer.criterion.on_epoch_end(self.trainer) - - if hasattr(self.trainer.optimizer, "on_epoch_end"): - self.trainer.optimizer.on_epoch_end(self.trainer) - - def on_train_step_start(self) -> None: - if hasattr(self.trainer.model, "on_train_step_start"): - self.trainer.model.on_train_step_start(self.trainer) - - if hasattr(self.trainer.criterion, "on_train_step_start"): - self.trainer.criterion.on_train_step_start(self.trainer) - - if hasattr(self.trainer.optimizer, "on_train_step_start"): - self.trainer.optimizer.on_train_step_start(self.trainer) - - def on_train_step_end(self) -> None: - - if hasattr(self.trainer.model, "on_train_step_end"): - self.trainer.model.on_train_step_end(self.trainer) - - if hasattr(self.trainer.criterion, "on_train_step_end"): - self.trainer.criterion.on_train_step_end(self.trainer) - - if hasattr(self.trainer.optimizer, "on_train_step_end"): - self.trainer.optimizer.on_train_step_end(self.trainer) - - def on_keyboard_interrupt(self) -> None: - if hasattr(self.trainer.model, "on_keyboard_interrupt"): - self.trainer.model.on_keyboard_interrupt(self.trainer) - - if hasattr(self.trainer.criterion, "on_keyboard_interrupt"): - self.trainer.criterion.on_keyboard_interrupt(self.trainer) - - if hasattr(self.trainer.optimizer, "on_keyboard_interrupt"): - self.trainer.optimizer.on_keyboard_interrupt(self.trainer) + @staticmethod + def on_init_start(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_init_start"): + trainer.model.module.on_init_start(trainer) + else: + if hasattr(trainer.model, 
"on_init_start"): + trainer.model.on_init_start(trainer) + + if hasattr(trainer.criterion, "on_init_start"): + trainer.criterion.on_init_start(trainer) + + if hasattr(trainer.optimizer, "on_init_start"): + trainer.optimizer.on_init_start(trainer) + + @staticmethod + def on_init_end(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_init_end"): + trainer.model.module.on_init_end(trainer) + else: + if hasattr(trainer.model, "on_init_end"): + trainer.model.on_init_end(trainer) + + if hasattr(trainer.criterion, "on_init_end"): + trainer.criterion.on_init_end(trainer) + + if hasattr(trainer.optimizer, "on_init_end"): + trainer.optimizer.on_init_end(trainer) + + @staticmethod + def on_epoch_start(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_epoch_start"): + trainer.model.module.on_epoch_start(trainer) + else: + if hasattr(trainer.model, "on_epoch_start"): + trainer.model.on_epoch_start(trainer) + + if hasattr(trainer.criterion, "on_epoch_start"): + trainer.criterion.on_epoch_start(trainer) + + if hasattr(trainer.optimizer, "on_epoch_start"): + trainer.optimizer.on_epoch_start(trainer) + + @staticmethod + def on_epoch_end(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_epoch_end"): + trainer.model.module.on_epoch_end(trainer) + else: + if hasattr(trainer.model, "on_epoch_end"): + trainer.model.on_epoch_end(trainer) + + if hasattr(trainer.criterion, "on_epoch_end"): + trainer.criterion.on_epoch_end(trainer) + + if hasattr(trainer.optimizer, "on_epoch_end"): + trainer.optimizer.on_epoch_end(trainer) + + @staticmethod + def on_train_step_start(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_train_step_start"): + trainer.model.module.on_train_step_start(trainer) + else: + if hasattr(trainer.model, "on_train_step_start"): + trainer.model.on_train_step_start(trainer) + + if hasattr(trainer.criterion, "on_train_step_start"): + trainer.criterion.on_train_step_start(trainer) + + if hasattr(trainer.optimizer, "on_train_step_start"): + trainer.optimizer.on_train_step_start(trainer) + + @staticmethod + def on_train_step_end(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_train_step_end"): + trainer.model.module.on_train_step_end(trainer) + else: + if hasattr(trainer.model, "on_train_step_end"): + trainer.model.on_train_step_end(trainer) + + if hasattr(trainer.criterion, "on_train_step_end"): + trainer.criterion.on_train_step_end(trainer) + + if hasattr(trainer.optimizer, "on_train_step_end"): + trainer.optimizer.on_train_step_end(trainer) + + @staticmethod + def on_keyboard_interrupt(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_keyboard_interrupt"): + trainer.model.module.on_keyboard_interrupt(trainer) + else: + if hasattr(trainer.model, "on_keyboard_interrupt"): + trainer.model.on_keyboard_interrupt(trainer) + + if hasattr(trainer.criterion, "on_keyboard_interrupt"): + trainer.criterion.on_keyboard_interrupt(trainer) + + if hasattr(trainer.optimizer, "on_keyboard_interrupt"): + trainer.optimizer.on_keyboard_interrupt(trainer) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index af07419f44..043c498236 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -74,6 +74,8 @@ def __init__( if vocoder_checkpoint: self._load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda) 
self.output_sample_rate = self.vocoder_config.audio["sample_rate"] + else: + print(" > Using Griffin-Lim as no vocoder model defined") @staticmethod def _get_segmenter(lang: str): @@ -265,7 +267,8 @@ def tts(self, text: str, speaker_idx: str = "", speaker_wav=None, style_wav=None waveform = waveform.squeeze() # trim silence - waveform = trim_silence(waveform, self.ap) + if self.tts_config.audio["do_trim_silence"] is True: + waveform = trim_silence(waveform, self.ap) wavs += list(waveform) wavs += [0] * 10000 diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index 81ba87c42f..e36c2cd16a 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -310,6 +310,7 @@ def get_data_loader( # pylint: disable=no-self-use data_items: List, verbose: bool, num_gpus: int, + rank: int = 0, # pylint: disable=unused-argument ): """Initiate and return the GAN dataloader. diff --git a/docs/source/finetuning.md b/docs/source/finetuning.md index fd9a295c03..42b9e51887 100644 --- a/docs/source/finetuning.md +++ b/docs/source/finetuning.md @@ -93,13 +93,13 @@ them and fine-tune it for your own dataset. This will help you in two main ways: ```bash CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \ - --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts + --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar ``` ```bash CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py \ --config_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/config.json \ - --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts + --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar ``` As stated above, you can also use command-line arguments to change the model configuration. @@ -107,7 +107,7 @@ them and fine-tune it for your own dataset. This will help you in two main ways: ```bash CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \ - --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts + --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar --coqpit.run_name "glow-tts-finetune" \ --coqpit.lr 0.00001 ``` diff --git a/docs/source/formatting_your_dataset.md b/docs/source/formatting_your_dataset.md index cbefc61dc0..3db38af0b0 100644 --- a/docs/source/formatting_your_dataset.md +++ b/docs/source/formatting_your_dataset.md @@ -19,15 +19,15 @@ Let's assume you created the audio clips and their transcription. You can collec You can either create separate transcription files for each clip or create a text file that maps each audio clip to its transcription. In this file, each line must be delimitered by a special character separating the audio file name from the transcription. And make sure that the delimiter is not used in the transcription text. -We recommend the following format delimited by `||`. +We recommend the following format delimited by `||`. In the following example, `audio1`, `audio2` refer to files `audio1.wav`, `audio2.wav` etc. ``` # metadata.txt -audio1.wav || This is my sentence. -audio2.wav || This is maybe my sentence. -audio3.wav || This is certainly my sentence. -audio4.wav || Let this be your sentence. +audio1||This is my sentence. +audio2||This is maybe my sentence. +audio3||This is certainly my sentence. +audio4||Let this be your sentence. ... 
``` @@ -80,4 +80,4 @@ See `TTS.tts.datasets.TTSDataset`, a generic `Dataset` implementation for the `t See `TTS.vocoder.datasets.*`, for different `Dataset` implementations for the `vocoder` models. See `TTS.utils.audio.AudioProcessor` that includes all the audio processing and feature extraction functions used in a -`Dataset` implementation. Feel free to add things as you need.passed \ No newline at end of file +`Dataset` implementation. Feel free to add things as you need.passed diff --git a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py index 346d650b8f..63efb78470 100644 --- a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py +++ b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py @@ -35,7 +35,7 @@ test_delay_epochs=-1, r=2, # gradual_training=[[0, 6, 48], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]], - double_decoder_consistency=False, + double_decoder_consistency=True, epochs=1000, text_cleaner="phoneme_cleaners", use_phonemes=True, diff --git a/recipes/vctk/tacotron2/train_tacotron2.py b/recipes/vctk/tacotron2/train_tacotron2.py new file mode 100644 index 0000000000..346d650b8f --- /dev/null +++ b/recipes/vctk/tacotron2/train_tacotron2.py @@ -0,0 +1,87 @@ +import os + +from TTS.config.shared_configs import BaseAudioConfig +from TTS.trainer import Trainer, TrainingArgs +from TTS.tts.configs.shared_configs import BaseDatasetConfig +from TTS.tts.configs.tacotron2_config import Tacotron2Config +from TTS.tts.datasets import load_tts_samples +from TTS.tts.models.tacotron2 import Tacotron2 +from TTS.tts.utils.speakers import SpeakerManager +from TTS.utils.audio import AudioProcessor + +output_path = os.path.dirname(os.path.abspath(__file__)) +dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) + +audio_config = BaseAudioConfig( + sample_rate=22050, + resample=False, # Resample to 22050 Hz. It slows down training. Use `TTS/bin/resample.py` to pre-resample and set this False for faster training. + do_trim_silence=True, + trim_db=23.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + preemphasis=0.0, +) + +config = Tacotron2Config( # This is the config that is saved for the future use + audio=audio_config, + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + r=2, + # gradual_training=[[0, 6, 48], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]], + double_decoder_consistency=False, + epochs=1000, + text_cleaner="phoneme_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + print_step=150, + print_eval=False, + mixed_precision=True, + sort_by_audio_len=True, + min_seq_len=14800, + max_seq_len=22050 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio + output_path=output_path, + datasets=[dataset_config], + use_speaker_embedding=True, # set this to enable multi-sepeaker training + decoder_ssim_alpha=0.0, # disable ssim losses that causes NaN for some runs. 
+ postnet_ssim_alpha=0.0, + postnet_diff_spec_alpha=0.0, + decoder_diff_spec_alpha=0.0, + attention_norm="softmax", + optimizer="Adam", + lr_scheduler=None, + lr=3e-5, +) + +# init audio processor +ap = AudioProcessor(**config.audio.to_dict()) + +# load training samples +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) + +# init speaker manager for multi-speaker training +# it mainly handles speaker-id to speaker-name for the model and the data-loader +speaker_manager = SpeakerManager() +speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) + +# init model +model = Tacotron2(config, speaker_manager) + +# init the trainer and 🚀 +trainer = Trainer( + TrainingArgs(), + config, + output_path, + model=model, + train_samples=train_samples, + eval_samples=eval_samples, + training_assets={"audio_processor": ap}, +) +trainer.fit() diff --git a/requirements.txt b/requirements.txt index a87a3c6f5c..3ec33ceb02 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,6 +23,6 @@ coqpit mecab-python3==1.0.3 unidic-lite==1.0.8 # gruut+supported langs -gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0 +gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0 fsspec>=2021.04.0 -pyworld \ No newline at end of file +pyworld diff --git a/tests/aux_tests/test_text_processing.py b/tests/aux_tests/test_text_processing.py index 3c424a15a1..62d60a42bf 100644 --- a/tests/aux_tests/test_text_processing.py +++ b/tests/aux_tests/test_text_processing.py @@ -9,12 +9,12 @@ EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" -EXPECTED_PHONEMES = "ɹ|iː|s|ə|n|t| ɹ|ᵻ|s|ɜː|tʃ| æ|t| h|ɑːɹ|v|ɚ|d| h|æ|z| ʃ|oʊ|n| m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| f|ɔːɹ| æ|z| l|ɪ|ɾ|əl| æ|z| eɪ|t| w|iː|k|s| k|æ|n| æ|k|tʃ|uː|əl|i| ɪ|ŋ|k|ɹ|iː|s| ,| ð|ə| ɡ|ɹ|eɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑːɹ|t|s| ʌ|v| ð|ə| b|ɹ|eɪ|n| ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| f|ɔːɹ| ɪ|m|oʊ|ʃ|ə|n|əl| ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| æ|n|d| l|ɜː|n|ɪ|ŋ| !" +EXPECTED_PHONEMES = "ɹ|i|ː|s|ə|n|t| ɹ|ᵻ|s|ɜ|ː|t|ʃ| æ|ɾ| h|ɑ|ː|ɹ|v|ɚ|d| h|ɐ|z| ʃ|o|ʊ|n| m|ɛ|d|ᵻ|t|e|ɪ|ɾ|ɪ|ŋ| f|ɔ|ː|ɹ| æ|z| l|ɪ|ɾ|ə|l| æ|z| e|ɪ|t| w|i|ː|k|s| k|æ|ŋ| æ|k|t|ʃ|u|ː|ə|l|i| ɪ|ŋ|k|ɹ|i|ː|s|,| ð|ə| ɡ|ɹ|e|ɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑ|ː|ɹ|t|s| ʌ|v| ð|ə| b|ɹ|e|ɪ|n| ɹ|ᵻ|s|p|ɑ|ː|n|s|ᵻ|b|ə|l| f|ɔ|ː|ɹ| ɪ|m|o|ʊ|ʃ|ə|n|ə|l| ɹ|ɛ|ɡ|j|ʊ|l|e|ɪ|ʃ|ə|n| æ|n|d| l|ɜ|ː|n|ɪ|ŋ|!" # ----------------------------------------------------------------------------- -class TextProcessingTextCase(unittest.TestCase): +class TextProcessingTestCase(unittest.TestCase): """Tests for text to phoneme conversion""" def test_phoneme_to_sequence(self): @@ -40,7 +40,7 @@ def _test_phoneme_to_sequence(self, add_blank): sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) - gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ?" + gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?" print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) @@ -51,7 +51,7 @@ def _test_phoneme_to_sequence(self, add_blank): sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) - gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ" + gt = "biː ɐ vɔɪs, nɑːt ɐn! 
ɛkoʊ" print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) @@ -62,7 +62,7 @@ def _test_phoneme_to_sequence(self, add_blank): sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) - gt = "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !" + gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!" print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) @@ -73,7 +73,7 @@ def _test_phoneme_to_sequence(self, add_blank): sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) - gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ." + gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ." print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) @@ -86,7 +86,7 @@ def _test_phoneme_to_sequence(self, add_blank): ) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) - gt = "^biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ .~" + gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~" print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params)
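For reviewers checking the gruut 2.x migration above: a minimal, hedged sketch of how the updated `text2phone` signature (including the new `keep_stress` flag added in this patch) can be exercised to inspect the pipe-delimited output that the revised test expectations reflect. The sample sentence and the output shown in the comment are illustrative only; actual phonemes depend on the installed gruut language models.

```python
# Sanity-check sketch for the gruut 2.x phonemization path.
# Assumes 🐸TTS with the changes above and gruut[...]~=2.0.0 from requirements.txt are installed.
from TTS.tts.utils.text import text2phone

text = "Be a voice, not an echo."
# use_espeak_phonemes keeps backwards compatibility with models trained on eSpeak IPA;
# keep_stress=False strips primary/secondary stress marks via gruut_ipa.IPA.without_stress.
ph = text2phone(text, "en-us", use_espeak_phonemes=True, keep_stress=False)
print(ph)  # phonemes are "|"-joined within a word and words are joined by "| ", e.g. "b|iː| ɐ| v|ɔɪ|s|,| ..."
```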
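Similarly, a small hedged sketch of the refactored `TrainerCallback` dispatch above, which now receives the trainer explicitly and unwraps DDP-wrapped models through their `module` attribute. `DummyModel` and `DummyTrainer` are hypothetical stand-ins used only to show the `hasattr`-based hook lookup; they are not part of 🐸TTS.

```python
# Assumes the refactored TTS/utils/callbacks.py from this patch is installed.
from TTS.utils.callbacks import TrainerCallback


class DummyModel:
    # Optional hook: discovered via the hasattr() checks inside TrainerCallback.
    def on_epoch_start(self, trainer):
        print("epoch start, steps done so far:", trainer.total_steps_done)


class DummyTrainer:
    def __init__(self):
        self.model = DummyModel()  # a DDP-wrapped model would expose the real model as self.model.module
        self.criterion = None      # objects without the hook are silently skipped
        self.optimizer = None
        self.total_steps_done = 0


TrainerCallback.on_epoch_start(DummyTrainer())  # -> calls DummyModel.on_epoch_start(trainer)
```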