diff --git a/demo.py b/demo.py
index 7c98ff7..7efc448 100644
--- a/demo.py
+++ b/demo.py
@@ -1,50 +1,73 @@
import subprocess
import soundfile
-import sox
-
-## VOICES mimic3
-#
-# /scratch/dkounadis/.envs/.tts/lib/python3.8/site-packages/mimic3_tts/
-
-spk1 = 'en_US/vctk_low#p236'
-rate1 = 1.24
-
-spk2 = 'en_UK/apope_low'
-rate2 = 1.64
-
-pitch_semitones = -4
-
-text = ('<speak>'
-'<s>'
-f'<voice name="{spk1}">'
-f'<prosody rate="{rate1}">'
-''
-'A an exemplary voice.'
-'</prosody>'
-'</voice>'
-'</s>'
-'<s>'
-f'<voice name="{spk2}">'
-f'<prosody rate="{rate2}">'
-''
-'.Another pleasant voice.'
-'</prosody>'
-'</voice>'
-'</s>'
-'</speak>')
-
-with open('_tmp_ssml.txt', 'w') as f:
- f.write(text)
-
-raw_tts = 'test.wav'
-ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {raw_tts}', shell=True)
-ps.wait()
-
-x, fs = soundfile.read(raw_tts)
-tfm = sox.Transformer()
-tfm.pitch(pitch_semitones)
-x_shift = tfm.build_array(
- input_array=x,
- sample_rate_in=fs)
-
-soundfile.write(f'test_pitch.wav', x_shift, fs)
+import msinference
+
+
+
+my_text = "Metamorphosis of cultural heritage to augmented hypermedia for accessibility and inclusion."
+_voice = 'en_US/vctk_low#p276' # https://audeering.github.io/shift/
+affect = True # False = Non-Affective voices
+out_wav = f'example_{affect=}.wav'
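+# the '=' f-string specifier (Python 3.8+) expands to name and value, e.g. 'example_affect=True.wav'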
+
+
+if affect:
+
+ # Mimic-3
+
+ reference_wav = '_spk.wav'
+ rate = 4 # a high speaking rate sounds nice when used as the speaker-reference audio for the 2nd stage (StyleTTS2)
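+ # assemble a minimal SSML document: <speak> wrapping <voice name>, <prosody rate> and the sentence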
+ _ssml = (
+ '<speak>'
+ f'<voice name="{_voice}">'
+ f'<prosody rate="{rate}">'
+ f'<s>'
+ f'Sweet dreams are made of this, ... !!! I travel the world and the seven seas.'
+ '</s>'
+ '</prosody>'
+ '</voice>')
+ _ssml += '</speak>'
+ with open('_tmp_ssml.txt', 'w') as f:
+ f.write(_ssml)
+ ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {reference_wav}', shell=True)
+ ps.wait() # block until mimic3 exits, otherwise the wav may be read before all samples are flushed to it
+
+ # StyleTTS2
+
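+ # alpha/beta weight the text-predicted style against the reference style
+ # (alpha: timbre, beta: prosody); embedding_scale is the classifier-free
+ # guidance strength of the style diffusion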
+ x = msinference.inference(my_text,
+ msinference.compute_style(reference_wav),
+ alpha=0.3,
+ beta=0.7,
+ diffusion_steps=7,
+ embedding_scale=1)
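+ # StyleTTS2 synthesizes at a fixed 24 kHz sample rate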
+ soundfile.write(out_wav, x, 24000)
+
+
+
+else:
+
+
+
+ # Non-Affective TTS
+
+ rate = .84
+ _ssml = (
+ '<speak>'
+ f'<voice name="{_voice}">'
+ f'<prosody rate="{rate}">'
+ f'<s>'
+ f'\'{my_text}\''
+ '</s>'
+ '</prosody>'
+ '</voice>')
+ _ssml += '</speak>'
+ with open('_tmp_ssml.txt', 'w') as f:
+ f.write(_ssml)
+ ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {out_wav}', shell=True)
+ ps.wait()
+
diff --git a/generate_config.py b/generate_config.py
index 69afd47..712080c 100644
--- a/generate_config.py
+++ b/generate_config.py
@@ -679,7 +679,7 @@ def emotion_predictor(
# == markdown table
-y = sorted(y, key=lambda d: d['emotion'][0]) # sort wav_files by valence
+y = sorted(y, key=lambda d: d['emotion'][1]) # sort wav_files by valence
# SORTING OUTPUT IS LIST - 0-th ELEMENT = LOWEST VALENCE
#_________________________________________________
@@ -709,16 +709,17 @@ def emotion_predictor(
table = (
f'\nAvailable TTS Voices.\n'
- f'\nYou can use the basic/affective version of every voice in \n'
- f'demo.py'
+ f'\nIn \n'
+ f'demo.py '
+ f'you can use the Affective or Non-Affective version of each voice.'
f'<table>\n<tr>\n<td> </td>' # count
f'<td>\n\n voice \n\n</td>'
- f'<td>\n\n Basic \n\n</td>'
+ f'<td>\n\n Non-Affective \n\n</td>'
f'<td>\n\n emotion volatility \n\n</td>'
f'<td>\n\n Affective \n\n</td>'
)
-for i, tup in enumerate(reversed(y)): # i is new index
+for i, tup in enumerate(y):
_voice, emotion, tgt_wav, affect_wav, fig_file, str_voice = tup.values()
print('\n\n', _voice, '\n\n')
diff --git a/index.html b/index.html
index 669fe3f..3d0cccf 100644
--- a/index.html
+++ b/index.html
@@ -1,14 +1,14 @@
Available TTS Voices.
-You can use the basic/affective version of every voice in
-demo.py
+In
+demo.py you can use the Affective or Non-Affective version of each voice.