#!/usr/bin/env python
print("""
Update:
tf.nn.seq2seq doesn't work as hoped:
It needs a 1D Tensor (chars) as input, not 2D spectrogram/mfcc/...
unless we feed it with very long 1D wave data,
but that is probably not what seq2seq was intended to for.
Fear not: 1D dilated convolution and LSTMs together with CTC are just fine.
New dynamic seq2seq was recently added to the master and will probably be out with 1.0.0 release.I
""")
exit(0)
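# The mismatch in shapes (illustrative note, not from the original code): the bucketed
# tf.nn.seq2seq API consumes a list of 1D [batch] int tensors, one per character/step,
# while an mfcc example here is 2D: [input_classes, max_input_length].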
"""Sequence-to-sequence model with an attention mechanism."""
from __future__ import print_function
import numpy as np
# import tensorflow as tf
import sugartensor as tf # sugartensor wraps tensorflow and provides the sg_* ops and tf.sg_train used below
import layer
import speech_data
from speech_data import Source,Target
from layer import net
learning_rate = 0.00001
training_iters = 300000 #steps
batch_size = 64
input_classes=20 # mfcc features
max_input_length=80 # (max) length of utterance
max_output_length=20
output_classes=32 # dimensions: characters
# Target.hotword here just encodes the filename "1_STEFFI_160.wav" = digit_speaker_words-per-minute.wav ;)
batch=word_batch=speech_data.mfcc_batch_generator(batch_size, source=Source.DIGIT_WAVES, target=Target.hotword)
X,Y=next(batch)
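# shape sanity check (presumed layout from this repo's speech_data): X holds an mfcc
# batch of roughly [batch_size, input_classes, max_input_length] floats, Y the
# matching target vectors over output_classes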
# EOS='\n' # end of sequence symbol todo use how?
# GO=1 # start symbol 0x01 todo use how?
# def decode(bytes):
# return "".join(map(chr, bytes)).replace('\x00', '').replace('\n', '')
vocab_size=input_classes
target_vocab_size=output_classes
buckets=[(max_input_length, max_output_length)] # one bucket: inputs up to max_input_length frames, outputs up to max_output_length characters
# a (1000,1000) bucket takes ~6 minutes on the Mac, half that on Nvidia
PAD=[0] # pad sequences shorter than the bucket size with zeroes
input_data = x = X
target_data = y = Y
# note: the sugartensor sg_* ops below expect tf tensors; in a live run X and Y
# would first need tf.convert_to_tensor
target_weights = [[1.0]*50 + [0.0]*(max_input_length-50)] * batch_size # mask padding: 1.0 for real frames, 0.0 for padded ones. todo: derive real per-utterance lengths instead of a fixed 50
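# layout of one weight row for an utterance with (hypothetically) 50 real frames padded
# to max_input_length=80: [1.0]*50 + [0.0]*30; losses at padded positions are
# multiplied by 0 and thus ignored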
encoder_size = max_input_length
decoder_size = max_output_length #self.buckets[bucket_id]
num_dim=input_classes # channel dimension of the conv stack = number of mfcc features
# residual block
def res_block(tensor, size, rate, dim=num_dim):
    # filter convolution (dilated/atrous 1D conv)
    conv_filter = tensor.sg_aconv1d(size=size, rate=rate, act='tanh', bn=True)
    # gate convolution
    conv_gate = tensor.sg_aconv1d(size=size, rate=rate, act='sigmoid', bn=True)
    # gate the filter output element-wise
    out = conv_filter * conv_gate
    # 1x1 convolution back to the block's channel dimension
    out = out.sg_conv1d(size=1, dim=dim, act='tanh', bn=True)
    # residual and skip output
    return out + tensor, out
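# receptive-field sanity check (back-of-envelope note, not from the original code):
# one dilation cycle [1, 2, 4, 8, 16] with filter size 7 widens the receptive field by
# (7-1)*(1+2+4+8+16) = 186 frames, so a single cycle already spans the
# max_input_length of 80 mfcc frames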
# expand dimension
z = x.sg_conv1d(size=1, dim=num_dim, act='tanh', bn=True)
# dilated conv block loop
num_blocks = 3 # was undefined in the original; number of stacked dilation cycles
skip = 0 # accumulated skip connections
for i in range(num_blocks):
    for r in [1, 2, 4, 8, 16]:
        z, s = res_block(z, size=7, rate=r)
        skip += s
# final logit layers
logit = (skip
         .sg_conv1d(size=1, act='tanh', bn=True)
         .sg_conv1d(size=1, dim=target_vocab_size)) # 'voca_size' was undefined; emit one logit per output character class
# CTC loss (every example is padded to the full input length here)
seq_len = [max_input_length] * batch_size # was undefined; assume the full padded length for each example
loss = logit.sg_ctc(target=y, seq_len=seq_len)
# tf.train.AdamOptimizer(learning_rate).minimize(loss) # redundant: tf.sg_train below builds its own optimizer
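# how CTC bridges the length gap: it aligns the 80 per-frame character distributions
# in 'logit' to the (at most max_output_length=20 char) transcript by summing over all
# monotonic alignments with blanks, so no frame-level labels are required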
saver = tf.train.Saver(tf.global_variables())
# train
tf.sg_train(log_interval=30, lr=0.0001, loss=loss, ep_size=1000, max_ep=200, early_stop=False)
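# Roughly the loop that tf.sg_train wraps (a hand-rolled sketch, assuming x and y were
# tf.placeholders rather than the numpy arrays bound above; sg_train additionally
# manages checkpoints, summaries and learning-rate handling):
# train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     for step in range(training_iters):
#         X, Y = next(batch)
#         _, l = sess.run([train_op, loss], feed_dict={x: X, y: Y})
#         if step % 30 == 0:
#             print("step %d loss %f" % (step, l))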
# tf.nn.seq2seq DOESN'T WORK here: it needs a 1D tensor (chars) as input, not mfcc
# class SpeechSeq2Seq(object):
#
#     def __init__(self, size, num_layers):
#         cell = single_cell = tf.nn.rnn_cell.GRUCell(size)
#         if num_layers > 1:
#             cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)
#         # Feeds for inputs.
#         self.encoder_inputs = []
#         self.decoder_inputs = []
#         self.target_weights = []
#         i = 0
#         self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
#         self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
#         self.target_weights.append(tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))
#         # tf.nn.rnn()
#         # targets = [self.decoder_inputs[i + 1] for i in xrange(len(self.decoder_inputs) - 1)]
#         # note: basic_rnn_seq2seq returns (outputs, states), not losses; a real loss
#         # (e.g. tf.nn.seq2seq.sequence_loss over the outputs) would still be needed
#         self.outputs, self.losses = tf.nn.seq2seq.basic_rnn_seq2seq(self.encoder_inputs, self.decoder_inputs, cell)
#         tf.train.AdamOptimizer(learning_rate).minimize(self.losses)
#         self.saver = tf.train.Saver(tf.all_variables())
#
#     # def step(self, session, encoder_inputs, decoder_inputs, target_weights, test):
#     #     pass
#
# def test():
#     perplexity, outputs = model.step(session, input_data, target_data, target_weights, test=True)
#     words = np.argmax(outputs, axis=2) # shape (10, 10, 256)
#     # word = decode(words[0])
#     word = str(words[0])
#     print("step %d, perplexity %f, output: hello %s?" % (step, perplexity, word))
#
# def train():
#     step = 0
#     test_step = 1
#     with tf.Session() as session:
#         model = SpeechSeq2Seq(size=10, num_layers=1)
#         session.run(tf.initialize_all_variables())
#         while True:
#             model.step(session, input_data, target_data, target_weights, test=False) # no outputs in training
#             if step % test_step == 0:
#                 test()
#             step = step + 1