Feature/simple python3 migration #53

Open
wants to merge 33 commits into base: master
Changes from 27 commits
Commits (33)
6546fa6
updating for python3 and windows compatibility
nklapste Jan 8, 2020
899587c
locking down requirements
nklapste Jan 10, 2020
96beec0
adding `extras_requires` option for installing a viable tensorflow ba…
nklapste Jan 10, 2020
e4cb4da
adding notes on how to install a recommended tensorflow backend
nklapste Jan 10, 2020
63d1cfc
fixing csv write error
nklapste Jan 20, 2020
c8ceac2
updating `convert_all_datasets.py` to operate with windows 10 and py…
nklapste Jan 10, 2020
3e2feba
disabling loading of `PsychExp` dataset
nklapste Jan 10, 2020
a628d8a
converting to `LF` from `CRLF`
nklapste Jan 20, 2020
c108abc
updating `download_weights.py` to work in windows10, python3
nklapste Jan 10, 2020
079d1f8
fixing imports of `example_helper` to be used for local invocation
nklapste Jan 20, 2020
28306cf
fixup on `check_ascii`
nklapste Jan 20, 2020
abca333
removing comment
nklapste Jan 20, 2020
fc0911b
minimizing diffs
nklapste Jan 20, 2020
16bdfbd
adding smoke tests for scripts in `examples/`
nklapste Jan 20, 2020
47646ca
adding smoke tests for `scripts/`
nklapste Jan 20, 2020
177379c
fixing loading of pickle files
nklapste Jan 20, 2020
9047d5b
disabling loading of `data/PsychExp/raw.pickle`
nklapste Jan 20, 2020
dbf55f8
fixing csv write error
nklapste Jan 20, 2020
4b21a73
removing unneeded parentheses
nklapste Jan 20, 2020
6da04b9
fixing ascii check for incoming words in `shorten_word`
nklapste Jan 20, 2020
0dc13fc
fixing path for subprocess call
nklapste Jan 21, 2020
5ff04f2
adding TODO note on failure in `test_smoke_create_twitter_vocab`
nklapste Jan 21, 2020
81c075c
adding nose slow attribute to slow smoke tests
nklapste Jan 21, 2020
9caaff3
reordering tests
nklapste Jan 21, 2020
c2ab3c1
improving test for `test_smoke_download_weights`
nklapste Jan 21, 2020
8b8fab1
fixing import of `test_helper`
nklapste Jan 21, 2020
a3480e7
adding comments noting the requirement of test execution order for `t…
nklapste Jan 31, 2020
22cfee7
Mark tests as slow as they require another slow test
bfelbo Feb 10, 2020
357dd35
Clarify that twitter dataset is not provided
bfelbo Feb 10, 2020
dc9cbde
Convert to tf.keras w/o eager mode
bfelbo Feb 10, 2020
bcdb035
Update README on how to not run slow tests
bfelbo Feb 10, 2020
269cb43
Bump version number as this breaks Theano support
bfelbo Feb 10, 2020
a327694
Add test for saving/loading model
bfelbo Apr 11, 2020
5 changes: 5 additions & 0 deletions README.md
Expand Up @@ -44,6 +44,11 @@ This will install the following dependencies:

Ensure that Keras uses your chosen backend. You can find the instructions [here](https://keras.io/backend/), under the *Switching from one backend to another* section.

You can install a working tensorflow backend by running:
```bash
pip install -e .[tensorflow_backend]
```

Run the included script, which downloads the pretrained DeepMoji weights (~85MB) from [here](https://www.dropbox.com/s/xqarafsl6a8f9ny/deepmoji_weights.hdf5?dl=0) and places them in the model/ directory:

```bash
Expand Down
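
For context, the `pip install -e .[tensorflow_backend]` line added to the README above is driven by an `extras_require` entry in `setup.py`. The sketch below shows what such an entry could look like; the surrounding `setup()` arguments and the TensorFlow version pin are assumptions for illustration, not taken from this diff.

```python
# Hypothetical sketch of the setup.py extras_require entry implied by the README
# change above. Package metadata and the TensorFlow pin are illustrative only.
from setuptools import setup, find_packages

setup(
    name='deepmoji',
    packages=find_packages(),
    extras_require={
        # enables: pip install -e .[tensorflow_backend]
        'tensorflow_backend': ['tensorflow>=1.15,<2.0'],
    },
)
```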
134 changes: 67 additions & 67 deletions data/Olympic/raw.pickle

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion deepmoji/attlayer.py
@@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division

import sys
from os.path import dirname
Expand Down
5 changes: 2 additions & 3 deletions deepmoji/class_avg_finetuning.py
@@ -1,7 +1,6 @@
""" Class average finetuning functions. Before using any of these finetuning
functions, ensure that the model is set up with nb_classes=2.
"""
from __future__ import print_function

import sys
import uuid
Expand All @@ -10,10 +9,10 @@
from time import sleep
from keras.optimizers import Adam

from global_variables import (
from .global_variables import (
FINETUNING_METHODS,
WEIGHTS_DIR)
from finetuning import (
from .finetuning import (
freeze_layers,
sampling_generator,
finetuning_callbacks,
Expand Down
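
The recurring `from x import ...` to `from .x import ...` rewrite in this and the following modules is the fix for implicit relative imports, which Python 3 removed. A minimal illustration, assuming the package is importable as `deepmoji`:

```python
# Python 2 resolved "from global_variables import WEIGHTS_DIR" against the current
# package implicitly; Python 3 resolves it against sys.path and raises
# ModuleNotFoundError. Either explicit form below works; the PR uses the relative
# one inside the package.
from deepmoji.global_variables import WEIGHTS_DIR   # absolute import
# from .global_variables import WEIGHTS_DIR         # explicit relative import (package-internal)

print(WEIGHTS_DIR)
```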
23 changes: 11 additions & 12 deletions deepmoji/create_vocab.py
@@ -1,13 +1,12 @@
from __future__ import print_function, division

import glob
import json
import numpy as np
import uuid
from filter_utils import is_special_token
from word_generator import WordGenerator
from .filter_utils import is_special_token
from .word_generator import WordGenerator
from collections import defaultdict, OrderedDict
from global_variables import SPECIAL_TOKENS, VOCAB_PATH
from .global_variables import SPECIAL_TOKENS, VOCAB_PATH
from copy import deepcopy


Expand Down Expand Up @@ -47,7 +46,7 @@ def save_vocab(self, path=None):
randomly generated filename is used instead.
"""
dtype = ([('word', '|S{}'.format(self.word_length_limit)), ('count', 'int')])
np_dict = np.array(self.word_counts.items(), dtype=dtype)
np_dict = np.array(list(self.word_counts.items()), dtype=dtype)

# sort from highest to lowest frequency
np_dict[::-1].sort(order='count')
Expand All @@ -65,7 +64,7 @@ def get_next_word(self):
# Returns:
List of strings, representing the next tokenized sentence.
"""
return self.word_gen.__iter__().next()
return next(self.word_gen.__iter__())

def count_all_words(self):
""" Generates word counts for all words in all sentences of the word
Expand Down Expand Up @@ -171,13 +170,13 @@ def save_vocab(self, path_count, path_vocab, word_limit=100000):
words[token] = -1

# sort words by frequency
desc_order = OrderedDict(sorted(self.master_vocab.items(),
desc_order = OrderedDict(sorted(list(self.master_vocab.items()),
key=lambda kv: kv[1], reverse=True))
words.update(desc_order)

# use encoding of up to 30 characters (no token conversions)
# use float to store large numbers (we don't care about precision loss)
np_vocab = np.array(words.items(),
np_vocab = np.array(list(words.items()),
dtype=([('word', '|S30'), ('count', 'float')]))

# output count for debugging
Expand All @@ -186,7 +185,7 @@ def save_vocab(self, path_count, path_vocab, word_limit=100000):

# output the index of each word for easy lookup
final_words = OrderedDict()
for i, w in enumerate(words.keys()[:word_limit]):
for i, w in enumerate(list(words.keys())[:word_limit]):
final_words.update({w: i})
with open(path_vocab, 'w') as f:
f.write(json.dumps(final_words, indent=4, separators=(',', ': ')))
Expand Down Expand Up @@ -257,16 +256,16 @@ def extend_vocab(current_vocab, new_vocab, max_tokens=10000):
words = OrderedDict()

# sort words by frequency
desc_order = OrderedDict(sorted(new_vocab.word_counts.items(),
desc_order = OrderedDict(sorted(list(new_vocab.word_counts.items()),
key=lambda kv: kv[1], reverse=True))
words.update(desc_order)

base_index = len(current_vocab.keys())
base_index = len(list(current_vocab.keys()))
added = 0
for word in words:
if added >= max_tokens:
break
if word not in current_vocab.keys():
if word not in list(current_vocab.keys()):
current_vocab[word] = base_index + added
added += 1

Expand Down
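
The `list(...)` wrappers added throughout `create_vocab.py` exist because Python 3 dict methods return view objects rather than lists: NumPy cannot build a structured array from a view, and views cannot be sliced. A small standalone sketch with an invented vocabulary:

```python
# Toy version of the save_vocab pattern above; the words and counts are made up.
import numpy as np

word_counts = {'happy': 12, 'sad': 7, 'meh': 3}
dtype = [('word', '|S30'), ('count', 'int')]

# dict.items() is a view in Python 3, so it must be materialized with list()
# before NumPy can turn it into a structured array.
np_dict = np.array(list(word_counts.items()), dtype=dtype)

# sort from highest to lowest frequency, in place (same idiom as the original code)
np_dict[::-1].sort(order='count')
print(np_dict)

# Likewise, iterator.next() is gone in Python 3; the next() builtin replaces it.
first_word = next(iter(word_counts))
print(first_word)
```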
1 change: 0 additions & 1 deletion deepmoji/filter_input.py
@@ -1,4 +1,3 @@
from __future__ import print_function, division
import codecs
import csv
import numpy as np
Expand Down
59 changes: 30 additions & 29 deletions deepmoji/filter_utils.py
@@ -1,38 +1,37 @@

from __future__ import print_function, division
import sys
import numpy as np
import re
import string
import emoji
from tokenizer import RE_MENTION, RE_URL
from global_variables import SPECIAL_TOKENS
from .tokenizer import RE_MENTION, RE_URL
from .global_variables import SPECIAL_TOKENS
from itertools import groupby

AtMentionRegex = re.compile(RE_MENTION)
urlRegex = re.compile(RE_URL)

# from http://bit.ly/2rdjgjE (UTF-8 encodings and Unicode chars)
VARIATION_SELECTORS = [u'\ufe00',
u'\ufe01',
u'\ufe02',
u'\ufe03',
u'\ufe04',
u'\ufe05',
u'\ufe06',
u'\ufe07',
u'\ufe08',
u'\ufe09',
u'\ufe0a',
u'\ufe0b',
u'\ufe0c',
u'\ufe0d',
u'\ufe0e',
u'\ufe0f']
VARIATION_SELECTORS = ['\ufe00',
'\ufe01',
'\ufe02',
'\ufe03',
'\ufe04',
'\ufe05',
'\ufe06',
'\ufe07',
'\ufe08',
'\ufe09',
'\ufe0a',
'\ufe0b',
'\ufe0c',
'\ufe0d',
'\ufe0e',
'\ufe0f']

# from https://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python
ALL_CHARS = (unichr(i) for i in xrange(sys.maxunicode))
CONTROL_CHARS = ''.join(map(unichr, range(0, 32) + range(127, 160)))
ALL_CHARS = (chr(i) for i in range(sys.maxunicode))
CONTROL_CHARS = ''.join(map(chr, list(range(0, 32)) + list(range(127, 160))))
CONTROL_CHAR_REGEX = re.compile('[%s]' % re.escape(CONTROL_CHARS))
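
The hunk above also replaces `unichr` and `xrange`, both removed in Python 3, and wraps the two `range()` calls in `list()` because ranges can no longer be concatenated with `+`. A standalone sketch of the resulting table, with a one-line usage demo added for illustration:

```python
# Same construction as above, plus a quick demo of the regex in use.
import re
import sys

ALL_CHARS = (chr(i) for i in range(sys.maxunicode))   # unichr/xrange are gone in Python 3
CONTROL_CHARS = ''.join(map(chr, list(range(0, 32)) + list(range(127, 160))))
CONTROL_CHAR_REGEX = re.compile('[%s]' % re.escape(CONTROL_CHARS))

print(CONTROL_CHAR_REGEX.sub('', 'tab\there'))         # -> "tabhere"
```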


Expand Down Expand Up @@ -130,18 +129,20 @@ def remove_variation_selectors(text):
For instance, remove skin color from emojis.
"""
for var in VARIATION_SELECTORS:
text = text.replace(var, u'')
text = text.replace(var, '')
return text


def shorten_word(word):
""" Shorten groupings of 3+ identical consecutive chars to 2, e.g. '!!!!' --> '!!'
"""

# only shorten ASCII words
try:
word.decode('ascii')
except (UnicodeDecodeError, UnicodeEncodeError) as e:
if isinstance(word, str):
word.encode("ascii")
else: # assume we have a bytes type (legacy Python2 code)
word.decode('ascii')
except (AttributeError, UnicodeDecodeError, UnicodeEncodeError) as e:
return word

# must have at least 3 char to be shortened
Expand Down Expand Up @@ -188,14 +189,14 @@ def remove_control_chars(text):

def convert_nonbreaking_space(text):
# ugly hack handling non-breaking space no matter how badly it's been encoded in the input
for r in [u'\\\\xc2', u'\\xc2', u'\xc2', u'\\\\xa0', u'\\xa0', u'\xa0']:
text = text.replace(r, u' ')
for r in ['\\\\xc2', '\\xc2', '\xc2', '\\\\xa0', '\\xa0', '\xa0']:
text = text.replace(r, ' ')
return text


def convert_linebreaks(text):
# ugly hack handling non-breaking space no matter how badly it's been encoded in the input
# space around to ensure proper tokenization
for r in [u'\\\\n', u'\\n', u'\n', u'\\\\r', u'\\r', u'\r', '<br>']:
text = text.replace(r, u' ' + SPECIAL_TOKENS[5] + u' ')
for r in ['\\\\n', '\\n', '\n', '\\\\r', '\\r', '\r', '<br>']:
text = text.replace(r, ' ' + SPECIAL_TOKENS[5] + ' ')
return text
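
The `shorten_word` change above replaces the Python 2 `decode('ascii')` probe: Python 3 strings have no `decode()`, so the guard encodes instead, and `AttributeError` is added to the caught exceptions to stay tolerant of stray bytes objects. A self-contained sketch; the run-collapsing regex at the end is a stand-in for the original grouping logic, which this hunk does not show:

```python
import re

def shorten_word(word):
    """Shorten runs of 3+ identical consecutive chars to 2, e.g. '!!!!' -> '!!'."""
    # Only shorten ASCII words.
    try:
        if isinstance(word, str):
            word.encode('ascii')            # raises UnicodeEncodeError on non-ASCII text
        else:                               # assume bytes left over from Python 2 data
            word = word.decode('ascii')     # raises UnicodeDecodeError on non-ASCII bytes
    except (AttributeError, UnicodeDecodeError, UnicodeEncodeError):
        return word
    # Stand-in for the original run-collapsing implementation.
    return re.sub(r'(.)\1{2,}', r'\1\1', word)

print(shorten_word('woooow!!!!'))   # -> "woow!!"
```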
15 changes: 7 additions & 8 deletions deepmoji/finetuning.py
@@ -1,6 +1,5 @@
""" Finetuning functions for doing transfer learning to new datasets.
"""
from __future__ import print_function

import sys
import uuid
Expand All @@ -18,13 +17,13 @@
from keras.utils.np_utils import to_categorical
from keras.models import model_from_json

from global_variables import (
from .global_variables import (
FINETUNING_METHODS,
FINETUNING_METRICS,
WEIGHTS_DIR)
from tokenizer import tokenize
from sentence_tokenizer import SentenceTokenizer
from attlayer import AttentionWeightedAverage
from .tokenizer import tokenize
from .sentence_tokenizer import SentenceTokenizer
from .attlayer import AttentionWeightedAverage


def load_benchmark(path, vocab, extend_with=0):
Expand Down Expand Up @@ -54,12 +53,12 @@ def load_benchmark(path, vocab, extend_with=0):
maxlen: Maximum length of an input.
"""
# Pre-processing dataset
with open(path) as dataset:
with open(path, "rb") as dataset:
data = pickle.load(dataset)

# Decode data
try:
texts = [unicode(x) for x in data['texts']]
texts = [str(x) for x in data['texts']]
except UnicodeDecodeError:
texts = [x.decode('utf-8') for x in data['texts']]

Expand Down Expand Up @@ -254,7 +253,7 @@ def sampling_generator(X_in, y_in, batch_size, epoch_size=25000,
assert epoch_size % 2 == 0
samples_pr_class = int(epoch_size / 2)
else:
ind = range(len(X_in))
ind = list(range(len(X_in)))

# Keep looping until training halts
while True:
Expand Down
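
Two Python 3 fixes meet in `load_benchmark` above: pickles must be opened in binary mode, and `unicode()` no longer exists, so `str()` is tried first with a utf-8 decode kept as the fallback. A minimal sketch that writes its own stand-in pickle so it can run on its own; the real benchmark files under `data/` use the same `'texts'` key:

```python
import pickle

# Create a tiny stand-in dataset so the example is self-contained.
with open('dataset.pickle', 'wb') as f:
    pickle.dump({'texts': ['I love this', 'so good 😍']}, f)

# Python 3 requires binary mode when unpickling ("rb" rather than the old default "r").
with open('dataset.pickle', 'rb') as dataset:
    data = pickle.load(dataset)

# unicode() is gone in Python 3; mirror the PR's str()-first approach with a
# utf-8 decode fallback for byte strings produced by legacy Python 2 pickles.
try:
    texts = [str(x) for x in data['texts']]
except UnicodeDecodeError:
    texts = [x.decode('utf-8') for x in data['texts']]

print(texts)
```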
5 changes: 2 additions & 3 deletions deepmoji/model_def.py
@@ -1,14 +1,13 @@
""" Model definition functions and weight loading.
"""

from __future__ import print_function, division

from keras.models import Model, Sequential
from keras.layers.merge import concatenate
from keras.layers import Input, Bidirectional, Embedding, Dense, Dropout, SpatialDropout1D, LSTM, Activation
from keras.regularizers import L1L2
from attlayer import AttentionWeightedAverage
from global_variables import NB_TOKENS, NB_EMOJI_CLASSES
from .attlayer import AttentionWeightedAverage
from .global_variables import NB_TOKENS, NB_EMOJI_CLASSES
import numpy as np
from copy import deepcopy
from os.path import exists
Expand Down
13 changes: 6 additions & 7 deletions deepmoji/sentence_tokenizer.py
Expand Up @@ -2,13 +2,12 @@
Provides functionality for converting a given list of tokens (words) into
numbers, according to the given vocabulary.
'''
from __future__ import print_function, division

import numbers
import numpy as np
from create_vocab import extend_vocab, VocabBuilder
from word_generator import WordGenerator
from global_variables import SPECIAL_TOKENS
from .create_vocab import extend_vocab, VocabBuilder
from .word_generator import WordGenerator
from .global_variables import SPECIAL_TOKENS
from sklearn.model_selection import train_test_split
from copy import deepcopy

Expand Down Expand Up @@ -163,8 +162,8 @@ def split_train_val_test(self, sentences, info_dicts,

# Helper function to verify provided indices are numbers in range
def verify_indices(inds):
return list(filter(lambda i: isinstance(i, numbers.Number) and
i < len(sentences), inds))
return list([i for i in inds if isinstance(i, numbers.Number) and
i < len(sentences)])

ind_train = verify_indices(split_parameter[0])
ind_val = verify_indices(split_parameter[1])
Expand Down Expand Up @@ -210,7 +209,7 @@ def to_sentence(self, sentence_idx):
together with spaces.
"""
# Have to recalculate the mappings in case the vocab was extended.
ind_to_word = {ind: word for word, ind in self.vocabulary.iteritems()}
ind_to_word = {ind: word for word, ind in self.vocabulary.items()}

sentence_as_list = [ind_to_word[x] for x in sentence_idx]
cleaned_list = [x for x in sentence_as_list if x != 'CUSTOM_MASK']
Expand Down
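
The last hunk swaps `dict.iteritems()` for `dict.items()`, since `iteritems()` was removed in Python 3. A toy version of the `to_sentence` mapping with an invented vocabulary:

```python
# Invented three-word vocabulary for illustration only.
vocabulary = {'CUSTOM_MASK': 0, 'hello': 1, 'world': 2}

# dict.items() replaces the Python 2-only dict.iteritems().
ind_to_word = {ind: word for word, ind in vocabulary.items()}

sentence_idx = [1, 2, 0]
sentence_as_list = [ind_to_word[x] for x in sentence_idx]
cleaned_list = [x for x in sentence_as_list if x != 'CUSTOM_MASK']
print(' '.join(cleaned_list))   # -> "hello world"
```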