Changed the code structure in a more efficient way to separate graph building from the prediction process #5

Open: wants to merge 11 commits into base: master
1 change: 1 addition & 0 deletions .gitignore
@@ -109,3 +109,4 @@ data
ckpt
evaluate
playground
cce_assets
2 changes: 1 addition & 1 deletion clinical_concept_extraction/__init__.py
@@ -1 +1 @@
from .pipeline import get_annotation as clinical_concept_extraction
from .pipeline import ClinicalConceptExtraction as ClinicalConceptExtraction
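The package's public entry point changes here from the get_annotation function to the ClinicalConceptExtraction class. A minimal import sketch of the renamed export; only the import line is taken from this diff, and the constructor arguments and methods of the class are not shown in this hunk:

from clinical_concept_extraction import ClinicalConceptExtraction

# extractor = ClinicalConceptExtraction(...)  # signature not visible in this hunk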
75 changes: 44 additions & 31 deletions clinical_concept_extraction/bilm.py
@@ -59,6 +59,7 @@ def __init__(self, filename, validate_file=False):
def bos(self):
return self._bos


@property
def eos(self):
return self._eos
@@ -166,6 +167,10 @@ def word_char_ids(self):
def max_word_length(self):
return self._max_word_length

@property
def vocab(self):
return self._id_to_word

def _convert_word_to_char_ids(self, word):
code = np.zeros([self.max_word_length], dtype=np.int32)
code[:] = self.pad_char
@@ -216,28 +221,36 @@ def __init__(self, lm_vocab_file: str, max_token_length: int):
)
self._max_token_length = max_token_length


@property
def words_vocab(self):
return self._lm_vocab.vocab

def batch_sentences(self, sentences: List[List[str]]):
'''
Batch the sentences as character ids
Each sentence is a list of tokens without <s> or </s>, e.g.
[['The', 'first', 'sentence', '.'], ['Second', '.']]
'''
n_sentences = len(sentences)
# add 2 for the begin- and end-of-sentence markers appended to each sentence
max_length = max(len(sentence) for sentence in sentences) + 2

X_char_ids = np.zeros(
(n_sentences, max_length, self._max_token_length),
dtype=np.int64
)

lengths_list = []
for k, sent in enumerate(sentences):
# record the padded sequence length (max_length minus the two <s>/</s> positions) for every sentence
lengths_list.append(max_length - 2)
length = len(sent) + 2
char_ids_without_mask = self._lm_vocab.encode_chars(
sent, split=False)
# add one so that 0 is the mask value
X_char_ids[k, :length, :] = char_ids_without_mask + 1

return X_char_ids
return X_char_ids, lengths_list
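A minimal usage sketch of the updated Batcher.batch_sentences, based only on this diff (the vocabulary file path is a placeholder and the import path follows this file's location): it now returns the padded character-id batch together with a per-sentence lengths list.

from clinical_concept_extraction.bilm import Batcher

batcher = Batcher('vocab.txt', max_token_length=50)  # 50 matches the value used in elmo_vector.py
sentences = [['The', 'first', 'sentence', '.'], ['Second', '.']]
char_ids, lengths = batcher.batch_sentences(sentences)
# char_ids: int64 array of shape (n_sentences, max_sentence_length + 2, 50);
#           character ids are shifted by +1 so that 0 is the mask value
# lengths:  one entry per sentence, each set to the padded length without <s>/</s>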


class TokenBatcher(object):
@@ -741,7 +754,7 @@ def custom_getter(getter, name, *args, **kwargs):
else:
self._n_tokens_vocab = None

with tf.variable_scope('bilm', custom_getter=custom_getter):
with tf.compat.v1.variable_scope('bilm', custom_getter=custom_getter):
self._build()

def _build(self):
@@ -797,19 +810,19 @@ def _build_word_char_embeddings(self):
activation = tf.nn.relu

# the character embeddings
with tf.device("/cpu:0"):
self.embedding_weights = tf.get_variable(
"char_embed", [n_chars, char_embed_dim],
dtype=DTYPE,
initializer=tf.random_uniform_initializer(-1.0, 1.0)
)
# shape (batch_size, unroll_steps, max_chars, embed_dim)
self.char_embedding = tf.nn.embedding_lookup(self.embedding_weights,
self.ids_placeholder)
# with tf.device("/cpu:0"):
self.embedding_weights = tf.compat.v1.get_variable(
"char_embed", [n_chars, char_embed_dim],
dtype=DTYPE,
initializer=tf.random_uniform_initializer(-1.0, 1.0)
)
# shape (batch_size, unroll_steps, max_chars, embed_dim)
self.char_embedding = tf.nn.embedding_lookup(self.embedding_weights,
self.ids_placeholder)

# the convolutions
def make_convolutions(inp):
with tf.variable_scope('CNN') as scope:
with tf.compat.v1.variable_scope('CNN') as scope:
convolutions = []
for i, (width, num) in enumerate(filters):
if cnn_options['activation'] == 'relu':
@@ -829,12 +842,12 @@ def make_convolutions(inp):
mean=0.0,
stddev=np.sqrt(1.0 / (width * char_embed_dim))
)
w = tf.get_variable(
w = tf.compat.v1.get_variable(
"W_cnn_%s" % i,
[1, width, char_embed_dim, num],
initializer=w_init,
dtype=DTYPE)
b = tf.get_variable(
b = tf.compat.v1.get_variable(
"b_cnn_%s" % i, [num], dtype=DTYPE,
initializer=tf.constant_initializer(0.0))

@@ -843,7 +856,7 @@ def make_convolutions(inp):
strides=[1, 1, 1, 1],
padding="VALID") + b
# now max pool
conv = tf.nn.max_pool(
conv = tf.nn.max_pool2d(
conv, [1, 1, max_chars - width + 1, 1],
[1, 1, 1, 1], 'VALID')

@@ -870,13 +883,13 @@ def make_convolutions(inp):
# set up weights for projection
if use_proj:
assert n_filters > projection_dim
with tf.variable_scope('CNN_proj') as scope:
W_proj_cnn = tf.get_variable(
with tf.compat.v1.variable_scope('CNN_proj') as scope:
W_proj_cnn = tf.compat.v1.get_variable(
"W_proj", [n_filters, projection_dim],
initializer=tf.random_normal_initializer(
mean=0.0, stddev=np.sqrt(1.0 / n_filters)),
dtype=DTYPE)
b_proj_cnn = tf.get_variable(
b_proj_cnn = tf.compat.v1.get_variable(
"b_proj", [projection_dim],
initializer=tf.constant_initializer(0.0),
dtype=DTYPE)
@@ -891,23 +904,23 @@ def high(x, ww_carry, bb_carry, ww_tr, bb_tr):
highway_dim = n_filters

for i in range(n_highway):
with tf.variable_scope('CNN_high_%s' % i) as scope:
W_carry = tf.get_variable(
with tf.compat.v1.variable_scope('CNN_high_%s' % i) as scope:
W_carry = tf.compat.v1.get_variable(
'W_carry', [highway_dim, highway_dim],
# glorot init
initializer=tf.random_normal_initializer(
mean=0.0, stddev=np.sqrt(1.0 / highway_dim)),
dtype=DTYPE)
b_carry = tf.get_variable(
b_carry = tf.compat.v1.get_variable(
'b_carry', [highway_dim],
initializer=tf.constant_initializer(-2.0),
dtype=DTYPE)
W_transform = tf.get_variable(
W_transform = tf.compat.v1.get_variable(
'W_transform', [highway_dim, highway_dim],
initializer=tf.random_normal_initializer(
mean=0.0, stddev=np.sqrt(1.0 / highway_dim)),
dtype=DTYPE)
b_transform = tf.get_variable(
b_transform = tf.compat.v1.get_variable(
'b_transform', [highway_dim],
initializer=tf.constant_initializer(0.0),
dtype=DTYPE)
@@ -932,7 +945,7 @@ def _build_word_embeddings(self):

# the word embeddings
with tf.device("/cpu:0"):
self.embedding_weights = tf.get_variable(
self.embedding_weights = tf.compat.v1.get_variable(
"embedding", [self._n_tokens_vocab, projection_dim],
dtype=DTYPE,
)
@@ -1001,7 +1014,7 @@ def _build_lstms(self):
pass
else:
# add a skip connection
lstm_cell = tf.nn.rnn_cell.ResidualWrapper(lstm_cell)
lstm_cell = tf.compat.v1.nn.rnn_cell.ResidualWrapper(lstm_cell)

# collect the input state, run the dynamic rnn, collect
# the output
@@ -1026,12 +1039,12 @@ def _build_lstms(self):
i_direction = 1
variable_scope_name = 'RNN_{0}/RNN/MultiRNNCell/Cell{1}'.format(
i_direction, i)
with tf.variable_scope(variable_scope_name):
with tf.compat.v1.variable_scope(variable_scope_name):
layer_output, final_state = tf.nn.dynamic_rnn(
lstm_cell,
layer_input,
sequence_length=sequence_lengths,
initial_state=tf.nn.rnn_cell.LSTMStateTuple(
initial_state=tf.compat.v1.nn.rnn_cell.LSTMStateTuple(
*batch_init_states),
)

@@ -1056,7 +1069,7 @@ def _build_lstms(self):
new_state = tf.concat(
[final_state[i][:batch_size, :],
init_states[i][batch_size:, :]], axis=0)
state_update_op = tf.assign(init_states[i], new_state)
state_update_op = tf.compat.v1.assign(init_states[i], new_state)
update_ops.append(state_update_op)

layer_input = layer_output
@@ -1123,10 +1136,10 @@ def dump_bilm_embeddings(vocab_file, dataset_file, options_file,
model = BidirectionalLanguageModel(options_file, weight_file)
ops = model(ids_placeholder)

config = tf.ConfigProto(allow_soft_placement=True)
config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
sess.run(tf.global_variables_initializer())
with tf.compat.v1.Session(config=config) as sess:
sess.run(tf.compat.v1.global_variables_initializer())
sentence_id = 0
with open(dataset_file, 'r') as fin, h5py.File(outfile, 'w') as fout:
for line in fin:
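The tf.compat.v1 calls above keep bilm.py on the TensorFlow 1.x graph-building API. A small reminder, not part of this diff: when this code runs under TensorFlow 2.x, graph mode has to be enabled explicitly before any placeholders or sessions are created, because graph-mode constructs such as tf.compat.v1.placeholder are rejected while eager execution is on. A minimal sketch:

import tensorflow as tf

# Under TF 2.x eager execution is enabled by default; this switches to graph mode
# so tf.compat.v1.placeholder / tf.compat.v1.Session work. Not needed under TF 1.x.
tf.compat.v1.disable_eager_execution()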
24 changes: 15 additions & 9 deletions clinical_concept_extraction/elmo_vector.py
@@ -14,22 +14,28 @@ def __init__(self):
weight_file = os.path.join(base_path,'mimic_wiki.hdf5')

self.batcher = Batcher(vocab_file, 50)
self.input = tf.placeholder('int32', shape=(None, None, 50))
self.input = tf.compat.v1.placeholder('int32', shape=(None, None, 50))
self.model = BidirectionalLanguageModel(options_file, weight_file)
self.output = self.model(self.input)

config = tf.ConfigProto()
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True

self.session = tf.Session(config=config)
self.session.run(tf.global_variables_initializer())
self.session = tf.compat.v1.Session(config=config)
self.session.run(tf.compat.v1.global_variables_initializer())


def get_embeddings(self, sentence):
sentence_ids = self.batcher.batch_sentences([sentence])
embedding = self.session.run(self.output['lm_embeddings'], feed_dict={self.input: sentence_ids})
embedding = np.transpose(embedding[0], [1, 2, 0])

return embedding
# get embeddings for all sentences as a single batch instead of embedding each sentence separately
def get_embeddings(self, all_sentences):
# convert each sentence into a list of character ids,
# padding the whole batch to the maximum sentence length
sentences_ids, lengths_list = self.batcher.batch_sentences(all_sentences)
embedding = tf.transpose(self.output['lm_embeddings'], perm=[0, 2, 3, 1])
# embeddings shape = [batch_size, max sentence length, 1024, 3]
embedding = self.session.run(embedding, feed_dict={self.input: sentences_ids})
return embedding, lengths_list


def close_session(self):
self.session.close()
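A minimal usage sketch of the batched get_embeddings API above. The name of the enclosing class is not visible in this hunk, so ElmoEmbedder below is a hypothetical stand-in for whatever class elmo_vector.py actually defines; everything else follows the code shown here.

embedder = ElmoEmbedder()  # hypothetical name; the real class is defined in elmo_vector.py
sentences = [['The', 'first', 'sentence', '.'], ['Second', '.']]
embeddings, lengths = embedder.get_embeddings(sentences)
# embeddings: array of shape (batch_size, max_sentence_length, 1024, 3)
# lengths:    one padded length (without <s>/</s>) per sentence
embedder.close_session()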