StyleTTS Text aligner — source: https://github.com/yl4579/StyleTTS/blob/main/Utils/ASR/models.py
input: 40-dimensional MFCC features (audio is converted via the `MFCC()` front-end; first conv layer takes 40 input channels)
output: logits over 178 text symbols (CTC head ends in Linear(256 → 178); the seq2seq decoder embeds the same 178-symbol vocabulary)
model architecture
ASRCNN(
(to_mfcc): MFCC()
(init_cnn): ConvNorm(
(conv): Conv1d(40, 256, kernel_size=(7,), stride=(2,), padding=(3,))
)
(cnns): Sequential(
(0-5): 6 x Sequential(
(0): ConvBlock(
(blocks): ModuleList(
(0-2): 3 x Sequential(
(0): ConvNorm(
(conv): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
)
(1): ReLU()
(2): GroupNorm(8, 256, eps=1e-05, affine=True)
(3): Dropout(p=0.2, inplace=False)
(4): ConvNorm(
(conv): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
)
(5): ReLU()
(6): Dropout(p=0.2, inplace=False)
)
)
)
(1): GroupNorm(1, 256, eps=1e-05, affine=True)
)
)
(projection): ConvNorm(
(conv): Conv1d(256, 128, kernel_size=(1,), stride=(1,))
)
(ctc_linear): Sequential(
(0): LinearNorm(
(linear_layer): Linear(in_features=128, out_features=256, bias=True)
)
(1): ReLU()
(2): LinearNorm(
(linear_layer): Linear(in_features=256, out_features=178, bias=True)
)
)
(asr_s2s): ASRS2S(
(embedding): Embedding(178, 512)
(project_to_n_symbols): Linear(in_features=128, out_features=178, bias=True)
(attention_layer): Attention(
(query_layer): LinearNorm(
(linear_layer): Linear(in_features=128, out_features=128, bias=False)
)
(memory_layer): LinearNorm(
(linear_layer): Linear(in_features=128, out_features=128, bias=False)
)
(v): LinearNorm(
(linear_layer): Linear(in_features=128, out_features=1, bias=False)
)
(location_layer): LocationLayer(
(location_conv): ConvNorm(
(conv): Conv1d(2, 32, kernel_size=(63,), stride=(1,), padding=(31,), bias=False)
)
(location_dense): LinearNorm(
(linear_layer): Linear(in_features=32, out_features=128, bias=False)
)
)
)
(decoder_rnn): LSTMCell(640, 128)
(project_to_hidden): Sequential(
(0): LinearNorm(
(linear_layer): Linear(in_features=256, out_features=128, bias=True)
)
(1): Tanh()
)
)
)