diff --git a/README.md b/README.md
index b47ca9b..d13d8ca 100644
--- a/README.md
+++ b/README.md
@@ -6,9 +6,7 @@
 [![Coverage Status](https://coveralls.io/repos/github/ubclaunchpad/minutes/badge.svg)](https://coveralls.io/github/ubclaunchpad/minutes)
 
-Audio speaker diarization library.
-
-## Under Construction!
+Audio speaker diarization library.
 
 ## :running: Development
 
@@ -43,9 +41,25 @@ minutes.add_speakers([s1, s2])
 
 # Fit the model.
 minutes.fit()
-# Collect a new conversation for prediction.
-conversation = Conversation('/path/to/conversation.wav')
+# Predict against a new conversation between speakers s1 and s2.
+conversation = Conversation('/path/to/conversation.wav', minutes)
+```
+
+## Reproducibility
+
+If you want to make sure your `Minutes` models are reproducible, we recommend
+seeding the `numpy` and `tensorflow` random number generators.
+
+```py
+import numpy as np
+import tensorflow as tf
+from minutes import Minutes
+
+state = 42
+np.random.seed(state)
+tf.set_random_seed(state)
 
-# Create phrases from the conversation.
-phrases = minutes.phrases(conversation)
+# Ensure the test data are generated deterministically by setting the Minutes
+# state as well.
+minutes = Minutes(parent='cnn', random_state=state)
 ```
diff --git a/minutes/__init__.py b/minutes/__init__.py
index 0cdeec5..818f7df 100644
--- a/minutes/__init__.py
+++ b/minutes/__init__.py
@@ -1,3 +1,3 @@
-from .minutes import Minutes  # noqa
-from .speaker import Speaker  # noqa
-from .conversation import Conversation  # noqa
+from minutes.minutes import Minutes  # noqa
+from minutes.speaker import Speaker  # noqa
+from minutes.conversation import Phrase, Conversation  # noqa
diff --git a/minutes/audio.py b/minutes/audio.py
index 808d974..13e0192 100644
--- a/minutes/audio.py
+++ b/minutes/audio.py
@@ -6,8 +6,14 @@
 import scipy.stats as stats
 import soundfile as sf
 
+# Parameters for preprocessing audio files.
+PREPROCESSING_PARAMS = {'ms_per_observation'}
+
 
 class Audio:
+    """Internal audio manipulation class. I reserve the right to change this
+    API :)
+    """
 
     def __init__(self, audio_loc):
         if os.path.isdir(audio_loc):
@@ -40,19 +46,19 @@ def samples_per_observation(self, ms_per_observation):
         """
         return int(self.rate * ms_per_observation / 1000.)
 
-    def get_spectrograms(self, ms_per_observation, verbose=False):
-        """Converts a internal table of raw audio audio phrases into with
-        one spectrogram per row.
+    def get_observations(self, ms_per_observation):
+        """Converts an internal raw audio vector into a table with
+        one spectrogram per row. Also returns the raw observations.
 
         Arguments:
             ms_per_observation {int} -- The number of desired ms per obs.
 
         Returns:
-            np.array -- An array of spectrograms, one per row. The width
-                of each spectrogram depends on the ms_per_observation,
-                The number of rows depends on the length of the audio file
-                and the ms per observations.
+            raw -- The raw audio observation table.
+            processed -- An array of spectrograms, one per row. The width
+                of each spectrogram depends on the ms_per_observation.
         """
 
+        # Reshape for processing into spectrograms.
         d = self.samples_per_observation(ms_per_observation)
         N = len(self.data) // d
@@ -65,17 +71,14 @@
         # Truncate last (partial) observation.
         data = data[:N * d]
 
-        if verbose:
-            t = len(self.data) - (N * d)
-            print('Truncating {} bytes from end of sample'.format(t))
-
-        # Reshape for processing into spectrograms.
-        data = data.reshape((N, d))
+        raw = data.reshape((N, d))
 
         def spec_from_row(row):
             _, _, Sxx = signal.spectrogram(row)
             return Sxx
 
         # This is very slow! Perhaps some logging?
-        rows = (spec_from_row(row) for row in data)
-        return np.array([x for x in rows])
+        rows = (spec_from_row(row) for row in raw)
+        processed = np.array([x for x in rows])
+
+        return raw, processed
diff --git a/minutes/base.py b/minutes/base.py
index 7893f18..9fe3c58 100644
--- a/minutes/base.py
+++ b/minutes/base.py
@@ -1,6 +1,5 @@
 import json
 import os
-import pickle
 
 from keras import backend as K
 from keras.models import Sequential, load_model
@@ -12,6 +11,7 @@
 from sklearn.model_selection import train_test_split
 
 from minutes.models import MINUTES_BASE_MODEL_DIRECTORY
+from minutes.audio import PREPROCESSING_PARAMS
 
 
 class BaseModel:
@@ -24,6 +24,17 @@ class BaseModel:
         'random_state',
     }
 
+    @property
+    def preprocessing_params(self):
+        """Returns a mapping of the parameters required to preprocess
+        audio data for this model. Useful as kwargs to audio
+        manipulation classes.
+        """
+        return {
+            i: getattr(self, i) for i in self.intialization_params
+            if i in PREPROCESSING_PARAMS
+        }
+
     @property
     def fitted(self):
         return self.model is not None
@@ -63,7 +74,7 @@ def load_model(cls, name):
     def __init__(self, name, ms_per_observation=3000, test_size=0.33,
                  random_state=42):
         self.name = name
-        self.speakers = set()
+        self.speakers = []
        self.test_size = test_size
         self.random_state = random_state
         self.ms_per_observation = ms_per_observation
@@ -78,7 +89,7 @@ def add_speaker(self, speaker):
         """
         if speaker in self.speakers:
             raise LookupError(f'Speaker {speaker.name} already added.')
-        self.speakers.add(speaker)
+        self.speakers.append(speaker)
 
     def add_speakers(self, speakers):
         """Add a collection of speakers to the model.
@@ -98,8 +109,12 @@ def _generate_training_data(self):
             y -- a categorical one-hot encoding of different speakers
                 numbered 1..k.
         """
-        obs = [s.get_observations(self.ms_per_observation)
-               for s in self.speakers]
+        obs = []
+        for s in self.speakers:
+            _, processed = s.get_observations(**self.preprocessing_params)
+            obs += processed,
+
+        # Generate and flatten labels.
         labels = [[i] * len(o) for i, o in enumerate(obs)]
         flattened_labels = [j for i in labels for j in i]
 
@@ -153,3 +168,21 @@
         # Save internal model.
         if self.model is not None:
             self.model.save(os.path.join(self.home, 'keras.h5'))
+
+    def predict(self, observations):
+        """Predict against a table of audio observations.
+
+        Arguments:
+            observations {np.array} -- A table of processed audio observations.
+
+        Returns:
+            np.array -- An array of predicted speakers.
+        """
+        result = self.model.predict(observations)
+        y_hat_indices = np.argmax(result, axis=1)
+
+        # Index into the speaker array using the predicted speaker indices.
+        return np.array(self.speakers)[y_hat_indices]
+
+    def __str__(self):
+        return self.name
diff --git a/minutes/conversation.py b/minutes/conversation.py
index 9f18b1d..20ef414 100644
--- a/minutes/conversation.py
+++ b/minutes/conversation.py
@@ -1,30 +1,36 @@
 from minutes.audio import Audio
 
 
-class Conversation:
+class Phrase:
+    def __init__(self, observation, speaker):
+        """A phrase in a conversation, characterized by an audio segment
+        and a speaker.
 
-    def __init__(self, audio_loc, speakers):
+        Arguments:
+            observation {np.array} -- A 1-dimensional audio sample.
+            speaker {Speaker} -- The inferred speaker for the audio segment.
+ """ + self.observation = observation + self.speaker = speaker + + +class Conversation(Audio): + + def __init__(self, audio_loc, model): """Create a new conversation from audio sample. Arguments: audio_loc {str} -- The absolute location of an audio conversation sample. - speakers {List[Speaker]} -- A list of speakers included in this + model {Minutes} -- A model trained on speakers within this conversation. """ - self.speakers = speakers - self.audio = Audio(audio_loc) - - def get_observations(self, ms_per_observation, verbose=False): - """Converts the conversation audio sample into an n x d matrix of - observations. + self.model = model + super().__init__(audio_loc) - Keyword Arguments: - verbose {bool} -- (default: {False}) - ms_per_observation {int} -- (default: {False}) - - Returns: - np.array -- An n x d matrix of observations. - """ - return self.audio.get_spectrograms(ms_per_observation, verbose) + # Predict against the conversation spectrograms. + raw, X_hat = self.get_observations(**model.preprocessing_params) + y_hat = model.predict(X_hat) + # Convert to a list of phrases. + self.phrases = [Phrase(o, speaker) for o, speaker in zip(raw, y_hat)] diff --git a/minutes/minutes.py b/minutes/minutes.py index 2ee20b3..0a99aaf 100644 --- a/minutes/minutes.py +++ b/minutes/minutes.py @@ -31,12 +31,13 @@ def __init__(self, parent='cnn', test_size=0.33, random_state=42): # Load in parent, copy in fixed parameters. self.parent = BaseModel.load_model(parent) - self.ms_per_observation = self.parent.ms_per_observation - self.model = None - self.speakers = set() - self.test_size = test_size - self.random_state = random_state + super().__init__( + self.parent.name + '-child', + self.parent.ms_per_observation, + test_size, + random_state, + ) def fit(self, verbose=0): """Trains the model, given the speakers currently added.""" @@ -72,14 +73,3 @@ def fit(self, verbose=0): batch_size=16, verbose=verbose ) - - def phrases(self, conversation): - """Predict against a new conversation. - - Arguments: - conversation {Conversation} -- A conversation built from an audio - sample. - - Returns: - """ - pass # TODO diff --git a/minutes/speaker.py b/minutes/speaker.py index ef327c3..b0dcea3 100644 --- a/minutes/speaker.py +++ b/minutes/speaker.py @@ -23,10 +23,13 @@ def add_audio(self, audio_loc): """ self.audio += Audio(audio_loc), - def get_observations(self, ms_per_observation, verbose=False): - obs = [a.get_spectrograms(ms_per_observation, verbose) - for a in self.audio] - return np.concatenate(obs) + def get_observations(self, **preprocessing_params): + raw, processed = [], [] + for a in self.audio: + r, p = a.get_observations(**preprocessing_params) + raw += r, + processed += p, + return np.concatenate(raw), np.concatenate(processed) def __eq__(self, other): return self.name == other.name diff --git a/minutes/utils/keras.py b/minutes/utils/keras.py index 155b97e..c61e16e 100644 --- a/minutes/utils/keras.py +++ b/minutes/utils/keras.py @@ -3,7 +3,7 @@ def copy_model(model): """Returns a copy of the model. - + Arguments: model {keras.Sequential} -- A model for copying. 
""" diff --git a/test/config.py b/test/config.py index d62cbd4..e47b17b 100644 --- a/test/config.py +++ b/test/config.py @@ -3,15 +3,19 @@ import shutil import tempfile +import numpy as np +import tensorflow as tf from minutes import Speaker + TEST_DIR = os.path.dirname(os.path.realpath(__file__)) ROOT_DIR = os.path.join(TEST_DIR, '..') FIXTURE_DIR = os.path.join(TEST_DIR, 'fixtures') SPEAKER1_AUDIO = os.path.join(FIXTURE_DIR, 'sample1.wav') SPEAKER2_AUDIO = os.path.join(FIXTURE_DIR, 'sample2.wav') +CONVERSATION_AUDIO = os.path.join(FIXTURE_DIR, 'conversation.wav') # Load speaker audio just once for all tests. SPEAKER1 = Speaker('speaker1') @@ -20,6 +24,10 @@ SPEAKER2 = Speaker('speaker2') SPEAKER2.add_audio(SPEAKER2_AUDIO) +RANDOM_STATE = 42 +np.random.seed(RANDOM_STATE) +tf.set_random_seed(RANDOM_STATE) + @contextlib.contextmanager def cd(newdir, cleanup=None): diff --git a/test/fixtures/conversation.wav b/test/fixtures/conversation.wav new file mode 100644 index 0000000..32bb26e Binary files /dev/null and b/test/fixtures/conversation.wav differ diff --git a/test/test_audio.py b/test/test_audio.py index 77b8887..b3c625b 100644 --- a/test/test_audio.py +++ b/test/test_audio.py @@ -16,6 +16,6 @@ def test_samples_per_observation(): def test_get_spectrograms(): audio = Audio(c.SPEAKER1_AUDIO) - spec = audio.get_spectrograms(3000) + _, spec = audio.get_observations(3000) assert spec.shape == (5, 129, 214) assert spec.dtype == np.float64 diff --git a/test/test_minutes.py b/test/test_minutes.py index 1c860a6..2ab8b75 100644 --- a/test/test_minutes.py +++ b/test/test_minutes.py @@ -1,4 +1,6 @@ -from minutes import Minutes +import numpy as np + +from minutes import Minutes, Conversation import test.config as c @@ -15,3 +17,21 @@ def test_train(): def test_parents(): assert Minutes.parents == ['cnn'] + + +def test_phrases(): + for model_name in Minutes.parents: + minutes = Minutes(parent=model_name) + minutes.add_speaker(c.SPEAKER1) + minutes.add_speaker(c.SPEAKER2) + minutes.fit() + + # Predict new phrases (make sure we ony predict once per obs) + conversation = Conversation(c.CONVERSATION_AUDIO, minutes) + raw, _ = conversation.get_observations(**minutes.preprocessing_params) + assert len(conversation.phrases) == len(raw) + + # Make sure we predicted some subset of the acceptable values. + names = [p.speaker.name for p in conversation.phrases] + expected = {'speaker1', 'speaker2'} + assert set(np.unique(names)) <= expected