draft page

audeering · May 21, 2024 · eef7f06 · eef7f06
1 parent 27eadf9
commit eef7f06
Show file tree

Hide file tree

Showing 3 changed files with 610 additions and 592 deletions.
diff --git a/demo.py b/demo.py
@@ -1,50 +1,67 @@
 import subprocess
 import soundfile
-import sox
-
-## VOICES mimic3
-# 
-# /scratch/dkounadis/.envs/.tts/lib/python3.8/site-packages/mimic3_tts/
-
-spk1 = 'en_US/vctk_low#p236'
-rate1 = 1.24
-
-spk2 = 'en_UK/apope_low'
-rate2 = 1.64
-
-pitch_semitones = -4
-
-text = ('<speak>'
-'<prosody volume=\'64\'>'
-f'<prosody rate=\'{rate1}\'>'
-f'<voice name=\'{spk1}\'>'
-'<s>'
-'A an exemplary voice.'
-'</s>'
-'</voice>'
-'</prosody>'
-'</prosody>'
-f'<prosody rate=\'{rate2}\'>'
-f'<voice name=\'{spk2}\'>'
-'<s>'
-'.Another pleasant voice.'
-'</s>'
-'</voice>'
-'</prosody>'
-'</speak>')
-
-with open('_tmp_ssml.txt', 'w') as f:
-    f.write(text)
-
-raw_tts = 'test.wav'
-ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {raw_tts}', shell=True)
-ps.wait()
-
-x, fs = soundfile.read(raw_tts)
-tfm = sox.Transformer()
-tfm.pitch(pitch_semitones)
-x_shift = tfm.build_array(
-    input_array=x,
-    sample_rate_in=fs)
-
-soundfile.write(f'test_pitch.wav', x_shift, fs)
+import msinference
+
+
+
+my_text = "Metamorphosis of cultural heritage to augmented hypermedia for accessibility and inclusion."
+_voice = 'en_US/vctk_low#p276' # https://audeering.github.io/shift/
+affect = True                  # False = Non-Affective voices
+out_wav = f'example_{affect=}.wav'  # the '=' specifier expands to 'example_affect=True.wav'
+
+
+def _build_ssml(sentence, volume, rate):
+    """Wrap `sentence` in the speak > prosody(volume) > prosody(rate) > voice > s envelope."""
+    return (
+        '<speak>'
+        f'<prosody volume=\'{volume}\'>'
+        f'<prosody rate=\'{rate}\'>'
+        f'<voice name=\'{_voice}\'>'
+        f'<s>{sentence}</s>'
+        '</voice>'
+        '</prosody>'
+        '</prosody>'
+        '</speak>')
+
+
+def _run_mimic3(ssml, wav_path):
+    """Render `ssml` to `wav_path` with the mimic3 CLI; raises if mimic3 exits non-zero.
+
+    mimic3 runs as a waited-on child process writing directly into the wav file,
+    so the audio is complete on disk before the caller reads it back.
+    """
+    with open('_tmp_ssml.txt', 'w') as f:
+        f.write(ssml)
+    with open('_tmp_ssml.txt') as src, open(wav_path, 'wb') as dst:
+        # check=True: a failed synthesis used to go unnoticed and leave a broken wav behind
+        subprocess.run(['mimic3', '--ssml'], stdin=src, stdout=dst, check=True)
+
+
+if affect:
+    # Mimic-3 (1st stage): only renders the speaker-reference clip
+
+    reference_wav = '_spk.wav'
+    rate = 4  # high speed sounds nice when used as speaker-reference audio for 2nd stage (StyleTTS2)
+    _ssml = _build_ssml(
+        'Sweet dreams are made of this, ... !!! I travel the world and the seven seas.',
+        volume=24, rate=rate)
+    _run_mimic3(_ssml, reference_wav)
+
+    # StyleTTS2 (2nd stage): synthesises my_text with the style computed from the reference clip
+
+    x = msinference.inference(my_text,
+                            msinference.compute_style(reference_wav),
+                            alpha=0.3,
+                            beta=0.7,
+                            diffusion_steps=7,
+                            embedding_scale=1)
+    soundfile.write(out_wav, x, 24000)  # 24000 is passed as the sample rate of x
+
+else:
+
+    # Non Affective TTS: Mimic-3 renders my_text straight into out_wav
+
+    rate = .84
+    # NOTE(review): the literal quotes around my_text are kept from the original - confirm they are intended
+    _ssml = _build_ssml(f'\'{my_text}\'', volume=94, rate=rate)
+    _run_mimic3(_ssml, out_wav)
+
diff --git a/generate_config.py b/generate_config.py
@@ -679,7 +679,7 @@ def emotion_predictor(
 
 # == markdown table
 
-y = sorted(y, key=lambda d: d['emotion'][0])  # sort wav_files by valence
+y = sorted(y, key=lambda d: d['emotion'][1])  # sort wav_files ascending by emotion[1] - assumes index 1 is valence; TODO confirm against emotion_predictor output order
 
 # SORTING OUTPUT IS LIST - 0-th ELEMENT = LOWEST VALENCE
 #_________________________________________________
@@ -709,16 +709,17 @@ def emotion_predictor(
 
 table = (
    f'<html lang="en">\n<body>\n<h1>Available TTS Voices.</h1>'  # page skeleton and title
-   f'\nYou can use the basic/affective version of every voice in \n'
-   f'<a href="https://github.com/audeering/shift/blob/main/demo.py">demo.py</a><hr>'
+   f'\nIn \n'  # intro sentence, continued by the link and text on the next two lines
+   f'<a href="https://github.com/audeering/shift/blob/main/demo.py">demo.py</a> '
+   f'you can use the Affective or Non-Affective version of each voice.<hr>'
    f'<table><tr><td>'  # header row starts; first cell (count column) is left empty
    f'</td><td>\n\n voice \n\n</td>'
-   f'<td>\n\n Basic \n\n</td>'
+   f'<td>\n\n Non-Affective \n\n</td>'  # column label matches the wording of the intro sentence
    f'<td>\n\n emotion volatility \n\n</td>'
    f'<td>\n\n Affective \n\n</td>'
 )
 
-for i, tup in enumerate(reversed(y)):  # i is new index
+for i, tup in enumerate(y):
 
     _voice, emotion, tgt_wav, affect_wav, fig_file, str_voice = tup.values()
     print('\n\n', _voice, '\n\n')