Feature/simple python3 migration #53

Open
wants to merge 33 commits into base: master
Changes from 27 commits
Commits (33)
6546fa6
updating for python3 and windows compatibility
nklapste Jan 8, 2020
899587c
locking down requirements
nklapste Jan 10, 2020
96beec0
adding `extras_requires` option for installing a viable tensorflow ba…
nklapste Jan 10, 2020
e4cb4da
adding notes on how to install a recommended tensorflow backend
nklapste Jan 10, 2020
63d1cfc
fixing csv write error
nklapste Jan 20, 2020
c8ceac2
updating `convert_all_datasets.py` to operate with windows 10 and py…
nklapste Jan 10, 2020
3e2feba
disabling loading of `PsychExp` dataset
nklapste Jan 10, 2020
a628d8a
converting to `LF` from `CRLF`
nklapste Jan 20, 2020
c108abc
updating `download_weights.py` to work in windows10, python3
nklapste Jan 10, 2020
079d1f8
fixing imports of `example_helper` to be used for local invocation
nklapste Jan 20, 2020
28306cf
fixup on `check_ascii`
nklapste Jan 20, 2020
abca333
removing comment
nklapste Jan 20, 2020
fc0911b
minimizing diffs
nklapste Jan 20, 2020
16bdfbd
adding smoke tests for scripts in `examples/`
nklapste Jan 20, 2020
47646ca
adding smoke tests for `scripts/`
nklapste Jan 20, 2020
177379c
fixing loading of pickle files
nklapste Jan 20, 2020
9047d5b
disabling loading of `data/PsychExp/raw.pickle`
nklapste Jan 20, 2020
dbf55f8
fixing csv write error
nklapste Jan 20, 2020
4b21a73
removing unneeded parentheses
nklapste Jan 20, 2020
6da04b9
fixing ascii check for incoming words in `shorten_word`
nklapste Jan 20, 2020
0dc13fc
fixing path for subprocess call
nklapste Jan 21, 2020
5ff04f2
adding TODO note on failure in `test_smoke_create_twitter_vocab`
nklapste Jan 21, 2020
81c075c
adding nose slow attribute to slow smoke tests
nklapste Jan 21, 2020
9caaff3
reordering tests
nklapste Jan 21, 2020
c2ab3c1
improving test for `test_smoke_download_weights`
nklapste Jan 21, 2020
8b8fab1
fixing import of `test_helper`
nklapste Jan 21, 2020
a3480e7
adding comments noting the requirement of test execution order for `t…
nklapste Jan 31, 2020
22cfee7
Mark tests as slow as they require another slow test
bfelbo Feb 10, 2020
357dd35
Clarify that twitter dataset is not provided
bfelbo Feb 10, 2020
dc9cbde
Convert to tf.keras w/o eager mode
bfelbo Feb 10, 2020
bcdb035
Update README on how to not run slow tests
bfelbo Feb 10, 2020
269cb43
Bump version number as this breaks Theano support
bfelbo Feb 10, 2020
a327694
Add test for saving/loading model
bfelbo Apr 11, 2020
5 changes: 5 additions & 0 deletions README.md
Expand Up @@ -44,6 +44,11 @@ This will install the following dependencies:

Ensure that Keras uses your chosen backend. You can find the instructions [here](https://keras.io/backend/), under the *Switching from one backend to another* section.

You can install a working tensorflow backend by running:
```bash
pip install -e .[tensorflow_backend]
```

Run the included script, which downloads the pretrained DeepMoji weights (~85MB) from [here](https://www.dropbox.com/s/xqarafsl6a8f9ny/deepmoji_weights.hdf5?dl=0) and places them in the model/ directory:

```bash
Expand Down
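
For context, the `pip install -e .[tensorflow_backend]` line added to the README above is driven by an `extras_require` entry in `setup.py`. The sketch below shows what such an entry could look like; the surrounding `setup()` arguments and the TensorFlow version pin are assumptions for illustration, not taken from this diff.

```python
# Hypothetical sketch of the setup.py extras_require entry implied by the README
# change above. Package metadata and the TensorFlow pin are illustrative only.
from setuptools import setup, find_packages

setup(
    name='deepmoji',
    packages=find_packages(),
    extras_require={
        # enables: pip install -e .[tensorflow_backend]
        'tensorflow_backend': ['tensorflow>=1.15,<2.0'],
    },
)
```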
134 changes: 67 additions & 67 deletions data/Olympic/raw.pickle

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion deepmoji/attlayer.py
@@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division

import sys
from os.path import dirname
Expand Down
5 changes: 2 additions & 3 deletions deepmoji/class_avg_finetuning.py
@@ -1,7 +1,6 @@
""" Class average finetuning functions. Before using any of these finetuning
functions, ensure that the model is set up with nb_classes=2.
"""
from __future__ import print_function

import sys
import uuid
Expand All @@ -10,10 +9,10 @@
from time import sleep
from keras.optimizers import Adam

from global_variables import (
from .global_variables import (
FINETUNING_METHODS,
WEIGHTS_DIR)
from finetuning import (
from .finetuning import (
freeze_layers,
sampling_generator,
finetuning_callbacks,
Expand Down
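
The recurring `from x import ...` to `from .x import ...` rewrite in this and the following modules is the fix for implicit relative imports, which Python 3 removed. A minimal illustration, assuming the package is importable as `deepmoji`:

```python
# Python 2 resolved "from global_variables import WEIGHTS_DIR" against the current
# package implicitly; Python 3 resolves it against sys.path and raises
# ModuleNotFoundError. Either explicit form below works; the PR uses the relative
# one inside the package.
from deepmoji.global_variables import WEIGHTS_DIR   # absolute import
# from .global_variables import WEIGHTS_DIR         # explicit relative import (package-internal)

print(WEIGHTS_DIR)
```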
23 changes: 11 additions & 12 deletions deepmoji/create_vocab.py
@@ -1,13 +1,12 @@
from __future__ import print_function, division

import glob
import json
import numpy as np
import uuid
from filter_utils import is_special_token
from word_generator import WordGenerator
from .filter_utils import is_special_token
from .word_generator import WordGenerator
from collections import defaultdict, OrderedDict
from global_variables import SPECIAL_TOKENS, VOCAB_PATH
from .global_variables import SPECIAL_TOKENS, VOCAB_PATH
from copy import deepcopy


Expand Down Expand Up @@ -47,7 +46,7 @@ def save_vocab(self, path=None):
randomly generated filename is used instead.
"""
dtype = ([('word', '|S{}'.format(self.word_length_limit)), ('count', 'int')])
np_dict = np.array(self.word_counts.items(), dtype=dtype)
np_dict = np.array(list(self.word_counts.items()), dtype=dtype)

# sort from highest to lowest frequency
np_dict[::-1].sort(order='count')
Expand All @@ -65,7 +64,7 @@ def get_next_word(self):
# Returns:
List of strings, representing the next tokenized sentence.
"""
return self.word_gen.__iter__().next()
return next(self.word_gen.__iter__())

def count_all_words(self):
""" Generates word counts for all words in all sentences of the word
Expand Down Expand Up @@ -171,13 +170,13 @@ def save_vocab(self, path_count, path_vocab, word_limit=100000):
words[token] = -1

# sort words by frequency
desc_order = OrderedDict(sorted(self.master_vocab.items(),
desc_order = OrderedDict(sorted(list(self.master_vocab.items()),
key=lambda kv: kv[1], reverse=True))
words.update(desc_order)

# use encoding of up to 30 characters (no token conversions)
# use float to store large numbers (we don't care about precision loss)
np_vocab = np.array(words.items(),
np_vocab = np.array(list(words.items()),
dtype=([('word', '|S30'), ('count', 'float')]))

# output count for debugging
Expand All @@ -186,7 +185,7 @@ def save_vocab(self, path_count, path_vocab, word_limit=100000):

# output the index of each word for easy lookup
final_words = OrderedDict()
for i, w in enumerate(words.keys()[:word_limit]):
for i, w in enumerate(list(words.keys())[:word_limit]):
final_words.update({w: i})
with open(path_vocab, 'w') as f:
f.write(json.dumps(final_words, indent=4, separators=(',', ': ')))
Expand Down Expand Up @@ -257,16 +256,16 @@ def extend_vocab(current_vocab, new_vocab, max_tokens=10000):
words = OrderedDict()

# sort words by frequency
desc_order = OrderedDict(sorted(new_vocab.word_counts.items(),
desc_order = OrderedDict(sorted(list(new_vocab.word_counts.items()),
key=lambda kv: kv[1], reverse=True))
words.update(desc_order)

base_index = len(current_vocab.keys())
base_index = len(list(current_vocab.keys()))
added = 0
for word in words:
if added >= max_tokens:
break
if word not in current_vocab.keys():
if word not in list(current_vocab.keys()):
current_vocab[word] = base_index + added
added += 1

Expand Down
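
The `list(...)` wrappers added throughout `create_vocab.py` exist because Python 3 dict methods return view objects rather than lists: NumPy cannot build a structured array from a view, and views cannot be sliced. A small standalone sketch with an invented vocabulary:

```python
# Toy version of the save_vocab pattern above; the words and counts are made up.
import numpy as np

word_counts = {'happy': 12, 'sad': 7, 'meh': 3}
dtype = [('word', '|S30'), ('count', 'int')]

# dict.items() is a view in Python 3, so it must be materialized with list()
# before NumPy can turn it into a structured array.
np_dict = np.array(list(word_counts.items()), dtype=dtype)

# sort from highest to lowest frequency, in place (same idiom as the original code)
np_dict[::-1].sort(order='count')
print(np_dict)

# Likewise, iterator.next() is gone in Python 3; the next() builtin replaces it.
first_word = next(iter(word_counts))
print(first_word)
```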
1 change: 0 additions & 1 deletion deepmoji/filter_input.py
@@ -1,4 +1,3 @@
from __future__ import print_function, division
import codecs
import csv
import numpy as np
Expand Down
59 changes: 30 additions & 29 deletions deepmoji/filter_utils.py
@@ -1,38 +1,37 @@

from __future__ import print_function, division
import sys
import numpy as np
import re
import string
import emoji
from tokenizer import RE_MENTION, RE_URL
from global_variables import SPECIAL_TOKENS
from .tokenizer import RE_MENTION, RE_URL
from .global_variables import SPECIAL_TOKENS
from itertools import groupby

AtMentionRegex = re.compile(RE_MENTION)
urlRegex = re.compile(RE_URL)

# from http://bit.ly/2rdjgjE (UTF-8 encodings and Unicode chars)
VARIATION_SELECTORS = [u'\ufe00',
u'\ufe01',
u'\ufe02',
u'\ufe03',
u'\ufe04',
u'\ufe05',
u'\ufe06',
u'\ufe07',
u'\ufe08',
u'\ufe09',
u'\ufe0a',
u'\ufe0b',
u'\ufe0c',
u'\ufe0d',
u'\ufe0e',
u'\ufe0f']
VARIATION_SELECTORS = ['\ufe00',
'\ufe01',
'\ufe02',
'\ufe03',
'\ufe04',
'\ufe05',
'\ufe06',
'\ufe07',
'\ufe08',
'\ufe09',
'\ufe0a',
'\ufe0b',
'\ufe0c',
'\ufe0d',
'\ufe0e',
'\ufe0f']

# from https://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python
ALL_CHARS = (unichr(i) for i in xrange(sys.maxunicode))
CONTROL_CHARS = ''.join(map(unichr, range(0, 32) + range(127, 160)))
ALL_CHARS = (chr(i) for i in range(sys.maxunicode))
CONTROL_CHARS = ''.join(map(chr, list(range(0, 32)) + list(range(127, 160))))
CONTROL_CHAR_REGEX = re.compile('[%s]' % re.escape(CONTROL_CHARS))
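
The hunk above also replaces `unichr` and `xrange`, both removed in Python 3, and wraps the two `range()` calls in `list()` because ranges can no longer be concatenated with `+`. A standalone sketch of the resulting table, with a one-line usage demo added for illustration:

```python
# Same construction as above, plus a quick demo of the regex in use.
import re
import sys

ALL_CHARS = (chr(i) for i in range(sys.maxunicode))   # unichr/xrange are gone in Python 3
CONTROL_CHARS = ''.join(map(chr, list(range(0, 32)) + list(range(127, 160))))
CONTROL_CHAR_REGEX = re.compile('[%s]' % re.escape(CONTROL_CHARS))

print(CONTROL_CHAR_REGEX.sub('', 'tab\there'))         # -> "tabhere"
```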


Expand Down Expand Up @@ -130,18 +129,20 @@ def remove_variation_selectors(text):
For instance, remove skin color from emojis.
"""
for var in VARIATION_SELECTORS:
text = text.replace(var, u'')
text = text.replace(var, '')
return text


def shorten_word(word):
""" Shorten groupings of 3+ identical consecutive chars to 2, e.g. '!!!!' --> '!!'
"""

# only shorten ASCII words
try:
word.decode('ascii')
except (UnicodeDecodeError, UnicodeEncodeError) as e:
if isinstance(word, str):
word.encode("ascii")
else: # assume we have a bytes type (legacy Python2 code)
word.decode('ascii')
except (AttributeError, UnicodeDecodeError, UnicodeEncodeError) as e:
return word

# must have at least 3 char to be shortened
Expand Down Expand Up @@ -188,14 +189,14 @@ def remove_control_chars(text):

def convert_nonbreaking_space(text):
# ugly hack handling non-breaking space no matter how badly it's been encoded in the input
for r in [u'\\\\xc2', u'\\xc2', u'\xc2', u'\\\\xa0', u'\\xa0', u'\xa0']:
text = text.replace(r, u' ')
for r in ['\\\\xc2', '\\xc2', '\xc2', '\\\\xa0', '\\xa0', '\xa0']:
text = text.replace(r, ' ')
return text


def convert_linebreaks(text):
# ugly hack handling non-breaking space no matter how badly it's been encoded in the input
# space around to ensure proper tokenization
for r in [u'\\\\n', u'\\n', u'\n', u'\\\\r', u'\\r', u'\r', '<br>']:
text = text.replace(r, u' ' + SPECIAL_TOKENS[5] + u' ')
for r in ['\\\\n', '\\n', '\n', '\\\\r', '\\r', '\r', '<br>']:
text = text.replace(r, ' ' + SPECIAL_TOKENS[5] + ' ')
return text
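
The `shorten_word` change above replaces the Python 2 `decode('ascii')` probe: Python 3 strings have no `decode()`, so the guard encodes instead, and `AttributeError` is added to the caught exceptions to stay tolerant of stray bytes objects. A self-contained sketch; the run-collapsing regex at the end is a stand-in for the original grouping logic, which this hunk does not show:

```python
import re

def shorten_word(word):
    """Shorten runs of 3+ identical consecutive chars to 2, e.g. '!!!!' -> '!!'."""
    # Only shorten ASCII words.
    try:
        if isinstance(word, str):
            word.encode('ascii')            # raises UnicodeEncodeError on non-ASCII text
        else:                               # assume bytes left over from Python 2 data
            word = word.decode('ascii')     # raises UnicodeDecodeError on non-ASCII bytes
    except (AttributeError, UnicodeDecodeError, UnicodeEncodeError):
        return word
    # Stand-in for the original run-collapsing implementation.
    return re.sub(r'(.)\1{2,}', r'\1\1', word)

print(shorten_word('woooow!!!!'))   # -> "woow!!"
```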
15 changes: 7 additions & 8 deletions deepmoji/finetuning.py
@@ -1,6 +1,5 @@
""" Finetuning functions for doing transfer learning to new datasets.
"""
from __future__ import print_function

import sys
import uuid
Expand All @@ -18,13 +17,13 @@
from keras.utils.np_utils import to_categorical
from keras.models import model_from_json

from global_variables import (
from .global_variables import (
FINETUNING_METHODS,
FINETUNING_METRICS,
WEIGHTS_DIR)
from tokenizer import tokenize
from sentence_tokenizer import SentenceTokenizer
from attlayer import AttentionWeightedAverage
from .tokenizer import tokenize
from .sentence_tokenizer import SentenceTokenizer
from .attlayer import AttentionWeightedAverage


def load_benchmark(path, vocab, extend_with=0):
Expand Down Expand Up @@ -54,12 +53,12 @@ def load_benchmark(path, vocab, extend_with=0):
maxlen: Maximum length of an input.
"""
# Pre-processing dataset
with open(path) as dataset:
with open(path, "rb") as dataset:
data = pickle.load(dataset)

# Decode data
try:
texts = [unicode(x) for x in data['texts']]
texts = [str(x) for x in data['texts']]
except UnicodeDecodeError:
texts = [x.decode('utf-8') for x in data['texts']]

Expand Down Expand Up @@ -254,7 +253,7 @@ def sampling_generator(X_in, y_in, batch_size, epoch_size=25000,
assert epoch_size % 2 == 0
samples_pr_class = int(epoch_size / 2)
else:
ind = range(len(X_in))
ind = list(range(len(X_in)))

# Keep looping until training halts
while True:
Expand Down
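
Two Python 3 fixes meet in `load_benchmark` above: pickles must be opened in binary mode, and `unicode()` no longer exists, so `str()` is tried first with a utf-8 decode kept as the fallback. A minimal sketch that writes its own stand-in pickle so it can run on its own; the real benchmark files under `data/` use the same `'texts'` key:

```python
import pickle

# Create a tiny stand-in dataset so the example is self-contained.
with open('dataset.pickle', 'wb') as f:
    pickle.dump({'texts': ['I love this', 'so good 😍']}, f)

# Python 3 requires binary mode when unpickling ("rb" rather than the old default "r").
with open('dataset.pickle', 'rb') as dataset:
    data = pickle.load(dataset)

# unicode() is gone in Python 3; mirror the PR's str()-first approach with a
# utf-8 decode fallback for byte strings produced by legacy Python 2 pickles.
try:
    texts = [str(x) for x in data['texts']]
except UnicodeDecodeError:
    texts = [x.decode('utf-8') for x in data['texts']]

print(texts)
```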
5 changes: 2 additions & 3 deletions deepmoji/model_def.py
@@ -1,14 +1,13 @@
""" Model definition functions and weight loading.
"""

from __future__ import print_function, division

from keras.models import Model, Sequential
from keras.layers.merge import concatenate
from keras.layers import Input, Bidirectional, Embedding, Dense, Dropout, SpatialDropout1D, LSTM, Activation
from keras.regularizers import L1L2
from attlayer import AttentionWeightedAverage
from global_variables import NB_TOKENS, NB_EMOJI_CLASSES
from .attlayer import AttentionWeightedAverage
from .global_variables import NB_TOKENS, NB_EMOJI_CLASSES
import numpy as np
from copy import deepcopy
from os.path import exists
Expand Down
13 changes: 6 additions & 7 deletions deepmoji/sentence_tokenizer.py
Expand Up @@ -2,13 +2,12 @@
Provides functionality for converting a given list of tokens (words) into
numbers, according to the given vocabulary.
'''
from __future__ import print_function, division

import numbers
import numpy as np
from create_vocab import extend_vocab, VocabBuilder
from word_generator import WordGenerator
from global_variables import SPECIAL_TOKENS
from .create_vocab import extend_vocab, VocabBuilder
from .word_generator import WordGenerator
from .global_variables import SPECIAL_TOKENS
from sklearn.model_selection import train_test_split
from copy import deepcopy

Expand Down Expand Up @@ -163,8 +162,8 @@ def split_train_val_test(self, sentences, info_dicts,

# Helper function to verify provided indices are numbers in range
def verify_indices(inds):
return list(filter(lambda i: isinstance(i, numbers.Number) and
i < len(sentences), inds))
return list([i for i in inds if isinstance(i, numbers.Number) and
i < len(sentences)])

ind_train = verify_indices(split_parameter[0])
ind_val = verify_indices(split_parameter[1])
Expand Down Expand Up @@ -210,7 +209,7 @@ def to_sentence(self, sentence_idx):
together with spaces.
"""
# Have to recalculate the mappings in case the vocab was extended.
ind_to_word = {ind: word for word, ind in self.vocabulary.iteritems()}
ind_to_word = {ind: word for word, ind in self.vocabulary.items()}

sentence_as_list = [ind_to_word[x] for x in sentence_idx]
cleaned_list = [x for x in sentence_as_list if x != 'CUSTOM_MASK']
Expand Down
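
The last hunk swaps `dict.iteritems()` for `dict.items()`, since `iteritems()` was removed in Python 3. A toy version of the `to_sentence` mapping with an invented vocabulary:

```python
# Invented three-word vocabulary for illustration only.
vocabulary = {'CUSTOM_MASK': 0, 'hello': 1, 'world': 2}

# dict.items() replaces the Python 2-only dict.iteritems().
ind_to_word = {ind: word for word, ind in vocabulary.items()}

sentence_idx = [1, 2, 0]
sentence_as_list = [ind_to_word[x] for x in sentence_idx]
cleaned_list = [x for x in sentence_as_list if x != 'CUSTOM_MASK']
print(' '.join(cleaned_list))   # -> "hello world"
```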