-
Notifications
You must be signed in to change notification settings - Fork 4
/
mlp_vad.py
110 lines (87 loc) · 3.31 KB
/
mlp_vad.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from mlp import MLP
from scikits.audiolab import Sndfile, Format
import sys
import numpy as np
import theano
import theano.tensor as T
import string
import pysox
import random
import os
import argparse
# classifier has been trained on 8 kHz samples of 25 ms length
SAMPLE_RATE = 8000   # Hz; sample rate the trained model expects
WINDOW_SIZE = 0.025  # seconds per analysis frame (25 ms -> 200 samples at 8 kHz)
def random_string():
    """Return a random 6-character name built from lowercase letters and digits."""
    alphabet = string.ascii_lowercase + string.digits
    picks = [random.choice(alphabet) for _ in range(6)]
    return ''.join(picks)
def downsample(fs, sig):
    """Resample the 1-D signal *sig* (sampled at *fs* Hz) to SAMPLE_RATE Hz.

    The signal is zero-padded up to a whole number of WINDOW_SIZE frames,
    written to a temporary wav file, pushed through a sox "rate" effects
    chain, and read back.  The two temporary files are removed even if
    the conversion fails.
    """
    in_file = random_string() + ".wav"
    out_file = random_string() + ".wav"
    # frame length must be an integer sample count: fs * WINDOW_SIZE is a
    # float, which would make `pad` a float and break np.zeros() below on
    # modern numpy (array sizes must be integers)
    frame_len = int(fs * WINDOW_SIZE)
    pad = len(sig) % frame_len
    if pad > 0:
        # zero-pad the tail so the signal splits into whole frames
        sig = np.append(sig, np.zeros(frame_len - pad))
    try:
        f = Sndfile(in_file, 'w', Format(type="wav", encoding='pcm16', endianness="file"), 1, fs)
        f.write_frames(sig)
        f.close()
        sox_in = pysox.CSoxStream(in_file)
        sox_out = pysox.CSoxStream(out_file, 'w', pysox.CSignalInfo(SAMPLE_RATE, 1, 8), fileType='wav')
        sox_chain = pysox.CEffectsChain(sox_in, sox_out)
        sox_chain.add_effect(pysox.CEffect("rate", [str(SAMPLE_RATE)]))
        sox_chain.flow_effects()
        sox_out.close()
        f = Sndfile(out_file, 'r')
        sig = f.read_frames(f.nframes)
        f.close()
    finally:
        # best-effort cleanup: don't leak temp wav files on failure
        for tmp in (in_file, out_file):
            if os.path.exists(tmp):
                os.unlink(tmp)
    return sig
class MLP_VAD(object):
    """Frame-level voice activity detector backed by a pre-trained MLP.

    The network consumes the 200-point magnitude spectrum of one 25 ms
    frame (200 samples at 8 kHz) and emits one of two classes per frame.
    """

    def __init__(self, model_file):
        """Build the MLP graph and load trained weights from *model_file*."""
        # fixed seed only sets the initial (untrained) weights, which
        # load_model immediately overwrites
        rng = np.random.RandomState(1234)
        self.x = T.matrix('x')
        self.classifier = MLP(
            rng=rng,
            input=self.x,
            n_in=200,      # one FFT bin per sample of a 25 ms / 8 kHz frame
            n_hidden=180,
            n_out=2        # speech vs. non-speech
        )
        self.classifier.load_model(model_file)

    def classify(self, fs, sig):
        """Label each 25 ms frame of *sig* (sampled at *fs* Hz).

        Returns a numpy array with one prediction per frame; trailing
        samples that do not fill a whole frame are dropped.
        """
        if fs != SAMPLE_RATE:
            sig = downsample(fs, sig)
        num_samples = int(WINDOW_SIZE * SAMPLE_RATE)
        # explicit floor division: plain '/' floors on Python 2 ints but
        # yields a float (and breaks reshape) under Python 3 semantics
        num_frames = len(sig) // num_samples
        sig = sig[0:num_frames * num_samples].reshape((num_frames, num_samples))
        # window each frame to reduce spectral leakage before the FFT
        sig = sig * np.hamming(num_samples)
        spec = np.abs(np.fft.fft(sig))  # magnitude spectrum of each frame
        shared_x = theano.shared(np.asarray(spec, dtype=theano.config.floatX), borrow=True)
        index = T.lscalar()  # index to a [mini]batch
        predict_model = theano.function(
            inputs=[index],
            outputs=self.classifier.y_pred,
            givens={
                self.x: shared_x[index:index + 1],
            }
        )
        # classify each frame independently
        predicted_values = [predict_model(i)[0] for i in xrange(num_frames)]
        return np.asarray(predicted_values)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Voice Activity Detection using Theano')
parser.add_argument('input_file', action='store', type=str, help='segment of input file of 200ms duration')
parser.add_argument('-m, --model-file', action='store', type=str, dest='model_file', help='the model file to use', default='models/params.pkl')
parser.add_argument('-t, --noise-threshold', action='store', type=float, dest='noise_threshold', help='noise thresold (default: 0.25)', default=0.25)
args = parser.parse_args()
f = Sndfile(args.input_file, 'r')
fs = f.samplerate
sig = f.read_frames(f.nframes)
f.close()
mlp = MLP_VAD(args.model_file)
speech_prob = mlp.classify(fs, sig)
result = np.mean(speech_prob)
if result < args.noise_threshold:
print "noise (%.2f)" % (result)
else:
print "speech (%.2f)" % (result)