Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

generate_frame_scale_features の解体 #790

Merged
merged 18 commits into from
Dec 4, 2023
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion test/test_acoustic_feature_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def test_phoneme_list(self):
self.assertEqual(OjtPhoneme.phoneme_list[41], "v")

def test_const(self):
self.assertEqual(OjtPhoneme.num_phoneme, 45)
TRUE_NUM_PHONEME = 45
self.assertEqual(OjtPhoneme.num_phoneme, TRUE_NUM_PHONEME)
self.assertEqual(OjtPhoneme.space_phoneme, "pau")

def test_convert(self):
Expand Down
202 changes: 156 additions & 46 deletions test/test_synthesis_engine.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import math
from copy import deepcopy
from random import random
from typing import Optional, Union
from typing import Union
from unittest import TestCase
from unittest.mock import Mock

Expand All @@ -13,7 +13,9 @@

# TODO: import from voicevox_engine.synthesis_engine.mora
from voicevox_engine.synthesis_engine.synthesis_engine import (
generate_frame_scale_features,
calc_frame_per_phoneme,
calc_frame_phoneme,
calc_frame_pitch,
mora_phoneme_list,
pre_process,
split_mora,
Expand All @@ -22,6 +24,8 @@
unvoiced_mora_phoneme_list,
)

TRUE_NUM_PHONEME = 45


def yukarin_s_mock(length: int, phoneme_list: numpy.ndarray, style_id: numpy.ndarray):
result = []
Expand Down Expand Up @@ -97,14 +101,41 @@ def is_model_loaded(self, style_id):
return True


def _gen_query(
    accent_phrases: list[AccentPhrase] | None = None,
    speedScale: float = 1.0,
    pitchScale: float = 1.0,
    intonationScale: float = 1.0,
    prePhonemeLength: float = 0.0,
    postPhonemeLength: float = 0.0,
    volumeScale: float = 1.0,
    outputSamplingRate: int = 24000,
    outputStereo: bool = False,
):
    """Build an `AudioQuery` for tests, defaulting every field not given.

    Each parameter is forwarded unchanged to the `AudioQuery` field of the
    same name, so a test only has to spell out the fields it cares about.
    When `accent_phrases` is omitted (or `None`), a new empty list is used.
    """
    # Create the list per call rather than sharing one mutable default.
    if accent_phrases is None:
        accent_phrases = []

    query_fields = {
        "accent_phrases": accent_phrases,
        "speedScale": speedScale,
        "pitchScale": pitchScale,
        "intonationScale": intonationScale,
        "prePhonemeLength": prePhonemeLength,
        "postPhonemeLength": postPhonemeLength,
        "volumeScale": volumeScale,
        "outputSamplingRate": outputSamplingRate,
        "outputStereo": outputStereo,
    }
    return AudioQuery(**query_fields)


def _gen_mora(
text: str,
consonant: Optional[str],
consonant_length: Optional[float],
consonant: str | None,
consonant_length: float | None,
vowel: str,
vowel_length: float,
pitch: float,
) -> Mora:
"""Generate Mora with positional arguments for test simplicity."""
return Mora(
text=text,
consonant=consonant,
Expand All @@ -115,19 +146,97 @@ def _gen_mora(
)


def test_generate_frame_scale_features():
"""Test `generate_frame_scale_features`."""
def test_calc_frame_per_phoneme():
"""Test `calc_frame_per_phoneme`."""
# Inputs
query = AudioQuery(
accent_phrases=[],
query = _gen_query(
speedScale=2.0,
prePhonemeLength=2 * 0.01067, # 0.01067 [sec/frame]
postPhonemeLength=6 * 0.01067,
)
moras = [
_gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0),
_gen_mora("ン", None, None, "N", 4 * 0.01067, 0.0),
_gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
_gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 0.0),
_gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
]

# Expects
# Pre k o N pau h i h O Pst
true_frame_per_phoneme = [1, 1, 2, 2, 1, 1, 2, 2, 1, 3]
true_frame_per_phoneme = numpy.array(true_frame_per_phoneme, dtype=numpy.int32)

# Outputs
frame_per_phoneme = calc_frame_per_phoneme(query, moras)

assert numpy.array_equal(frame_per_phoneme, true_frame_per_phoneme)


def test_calc_frame_pitch():
    """Test `calc_frame_pitch`.

    Checks the frame-wise f0 returned for a query with `pitchScale=2.0` and
    `intonationScale=0.5`, given per-mora pitches and per-phoneme frame counts.
    """
    # Inputs
    query = _gen_query(pitchScale=2.0, intonationScale=0.5)
    # (text, consonant, consonant_length, vowel, vowel_length, pitch)
    mora_specs = [
        ("コ", "k", 0.0, "o", 0.0, 50.0),
        ("ン", None, None, "N", 0.0, 50.0),
        ("、", None, None, "pau", 0.0, 0.0),
        ("ヒ", "h", 0.0, "i", 0.0, 125.0),
        ("ホ", "h", 0.0, "O", 0.0, 0.0),
    ]
    moras = [_gen_mora(*spec) for spec in mora_specs]
    phonemes = [
        OjtPhoneme(symbol, 0, 0) for symbol in "pau k o N pau h i h O pau".split()
    ]
    frame_per_phoneme = numpy.array(
        # Pre k  o  N  pau h  i  h  O  Pst
        [1, 1, 2, 2, 1, 1, 2, 2, 1, 3],
        dtype=numpy.int32,
    )

    # Expects
    # Non-zero pitches are scaled x4 (50 -> 200, 125 -> 500; mean 300), then
    # intonationScale=0.5 halves each deviation from that mean:
    # 200 -> 250, 500 -> 400. Zero pitches ("pau" and "O" moras) stay 0.
    # A mora's value is repeated over its consonant + vowel frames.
    expected_f0 = (
        [0.0] * 1  # pre "pau"
        + [250.0] * 3  # "ko" (k: 1 frame, o: 2 frames)
        + [250.0] * 2  # "N"
        + [0.0] * 1  # "pau"
        + [400.0] * 3  # "hi" (h: 1 frame, i: 2 frames)
        + [0.0] * 3  # "hO" (h: 2 frames, O: 1 frame)
        + [0.0] * 3  # post "pau"
    )
    true_f0 = numpy.array(expected_f0, dtype=numpy.float32)

    # Outputs
    f0 = calc_frame_pitch(query, moras, phonemes, frame_per_phoneme)

    assert numpy.array_equal(f0, true_f0)


def test_calc_frame_phoneme():
    """Test `calc_frame_phoneme`.

    Checks that the returned array equals a one-hot matrix with one row per
    frame and `TRUE_NUM_PHONEME` columns, each phoneme's ID repeated for its
    frame count.
    """
    # Inputs
    phonemes = [
        OjtPhoneme(symbol, 0, 0) for symbol in "pau k o N pau h i h O pau".split()
    ]
    #              Pre k  o  N  pau h  i  h  O  Pst
    frame_counts = [1, 1, 2, 2, 1, 1, 2, 2, 1, 3]
    frame_per_phoneme = numpy.array(frame_counts, dtype=numpy.int32)

    # Expects
    # One phoneme ID per frame:
    # Pr k   o   o   N  N  pau h   i   i   h   h   O  Pt Pt Pt
    phoneme_ids = [0, 23, 30, 30, 4, 4, 0, 19, 21, 21, 19, 19, 5, 0, 0, 0]
    true_frame_phoneme = numpy.zeros(
        [sum(frame_counts), TRUE_NUM_PHONEME], dtype=numpy.float32
    )
    # Set entry (frame, phoneme_id) to 1.0 for every frame in one indexing step.
    true_frame_phoneme[numpy.arange(len(phoneme_ids)), phoneme_ids] = 1.0

    # Outputs
    frame_phoneme = calc_frame_phoneme(phonemes, frame_per_phoneme)

    assert numpy.array_equal(frame_phoneme, true_frame_phoneme)


def test_feat_to_framescale():
"""Test Mora/Phonemefeature-to-framescaleFeature pipeline."""
# Inputs
query = _gen_query(
speedScale=2.0,
pitchScale=2.0,
intonationScale=0.5,
prePhonemeLength=2 * 0.01067, # 0.01067 [sec/frame]
prePhonemeLength=2 * 0.01067,
postPhonemeLength=6 * 0.01067,
volumeScale=0.0,
outputSamplingRate=0,
outputStereo=False,
)
flatten_moras = [
_gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
Expand All @@ -139,40 +248,41 @@ def test_generate_frame_scale_features():
phoneme_str = "pau k o N pau h i h O pau"
phoneme_data_list = [OjtPhoneme(p, 0, 0) for p in phoneme_str.split()]

# Ground Truths
# Pre k o N pau h i h O Pst
frm_per_phoneme = [1, 1, 2, 2, 1, 1, 2, 2, 1, 3]
n_frm = sum(frm_per_phoneme)
frm_per_phoneme = numpy.array(frm_per_phoneme, dtype=numpy.int32)

# Pr k o o N N pau h i i h h O Pt Pt Pt
phoneme_frms = [0, 23, 30, 30, 4, 4, 0, 19, 21, 21, 19, 19, 5, 0, 0, 0]
phoneme_gt = numpy.zeros([n_frm, 45], dtype=numpy.float32)
for frm_idx, phoneme_idx in enumerate(phoneme_frms):
phoneme_gt[frm_idx, phoneme_idx] = 1.0

# Pitch - x4 value & x0.5 variance
# Pre ko N pau hi hO Pst
f0_gt = [0.0, 200.0, 200.0, 0.0, 500.0, 0.0, 0.0] # mean 300
f0_gt = [0.0, 250.0, 250.0, 0.0, 400.0, 0.0, 0.0] # intonationScale 0.5
# paw ko N pau hi hO paw
# frm_per_vowel = [1, 3, 2, 1, 3, 3, 3]
# pau ko ko ko N N
f0_gt_1 = [0.0, 250.0, 250.0, 250.0, 250.0, 250.0]
# pau hi hi hi
f0_gt_2 = [0.0, 400.0, 400.0, 400.0]
# hO hO hO paw paw paw
f0_gt_3 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
f0_gt = numpy.array(f0_gt_1 + f0_gt_2 + f0_gt_3, dtype=numpy.float32)

phoneme_pred, f0_pred = generate_frame_scale_features(
query, flatten_moras, phoneme_data_list
)

assert frm_per_phoneme.shape[0] == len(phoneme_data_list), "Prerequisites"

assert numpy.array_equal(phoneme_pred, phoneme_gt), "Wrong phoneme onehot frames"
assert numpy.array_equal(f0_pred, f0_gt), "Wrong frame-wise phoneme onehot"
# Expects
# frame_per_phoneme
# Pre k o N pau h i h O Pst
true_frame_per_phoneme = [1, 1, 2, 2, 1, 1, 2, 2, 1, 3]
n_frame = sum(true_frame_per_phoneme)
true_frame_per_phoneme = numpy.array(true_frame_per_phoneme, dtype=numpy.int32)
# phoneme
# Pr k o o N N pau h i i h h O Pt Pt Pt
frame_phoneme_idxs = [0, 23, 30, 30, 4, 4, 0, 19, 21, 21, 19, 19, 5, 0, 0, 0]
true_frame_phoneme = numpy.zeros([n_frame, TRUE_NUM_PHONEME], dtype=numpy.float32)
for frame_idx, phoneme_idx in enumerate(frame_phoneme_idxs):
true_frame_phoneme[frame_idx, phoneme_idx] = 1.0
# Pitch
# Pre ko N pau hi hO Pst
true_f0 = [0.0, 200.0, 200.0, 0.0, 500.0, 0.0, 0.0] # mean 300
true_f0 = [0.0, 250.0, 250.0, 0.0, 400.0, 0.0, 0.0] # intonationScale 0.5
# paw ko N pau hi hO paw
# frame_per_vowel = [1, 3, 2, 1, 3, 3, 3]
# pau ko ko ko N N
true1_f0 = [0.0, 250.0, 250.0, 250.0, 250.0, 250.0]
# pau hi hi hi
true2_f0 = [0.0, 400.0, 400.0, 400.0]
# hO hO hO paw paw paw
true3_f0 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
true_f0 = numpy.array(true1_f0 + true2_f0 + true3_f0, dtype=numpy.float32)

assert true_frame_per_phoneme.shape[0] == len(phoneme_data_list), "Prerequisites"

# Outputs
frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras)
f0 = calc_frame_pitch(query, flatten_moras, phoneme_data_list, frame_per_phoneme)
frame_phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme)

assert numpy.array_equal(frame_phoneme, true_frame_phoneme)
assert numpy.array_equal(f0, true_f0)


class TestSynthesisEngine(TestCase):
Expand Down
Loading