Skip to content
This repository has been archived by the owner on Jul 4, 2023. It is now read-only.

Commit

Permalink
Merge pull request #56 from PetrochukM/update
Browse files Browse the repository at this point in the history
Release 0.3.7 - 5 fixed issues and a new label_encoder
  • Loading branch information
PetrochukM authored Nov 29, 2018
2 parents 2d10f0e + c501355 commit 133a54c
Show file tree
Hide file tree
Showing 35 changed files with 190 additions and 77 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[flake8]
ignore = E402, E722, E731
ignore = E402, E722, E731, W504
max-line-length = 100
exclude = examples/
11 changes: 5 additions & 6 deletions examples/snli/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@
best_dev_acc = -1
header = ' Time Epoch Iteration Progress (%Epoch) Loss Dev/Loss Accuracy Dev/Accuracy'
dev_log_template = ' '.join(
'{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{:8.6f},{:12.4f},{:12.4f}'.
split(','))
'{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{:8.6f},{:12.4f},{:12.4f}'
.split(','))
log_template = ' '.join(
'{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{},{:12.4f},{}'.split(','))
makedirs(args.save_path)
Expand Down Expand Up @@ -108,8 +108,7 @@
answer = model(premise_batch, hypothesis_batch)

# calculate accuracy of predictions in the current batch
n_correct += (torch.max(answer,
1)[1].view(label_batch.size()) == label_batch).sum()
n_correct += (torch.max(answer, 1)[1].view(label_batch.size()) == label_batch).sum()
n_total += premise_batch.size()[1]
train_acc = 100. * n_correct / n_total

Expand Down Expand Up @@ -150,8 +149,8 @@
for dev_batch_idx, (premise_batch, hypothesis_batch,
label_batch) in enumerate(dev_iterator):
answer = model(premise_batch, hypothesis_batch)
n_dev_correct += (torch.max(answer, 1)[1].view(
label_batch.size()) == label_batch).sum()
n_dev_correct += (torch.max(answer,
1)[1].view(label_batch.size()) == label_batch).sum()
dev_loss = criterion(answer, label_batch)
dev_acc = 100. * n_dev_correct / len(dev)

Expand Down
5 changes: 2 additions & 3 deletions examples/snli/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,9 @@ def collate_fn(batch, train=True):
""" list of tensors to a batch tensors """
premise_batch, _ = pad_batch([row['premise'] for row in batch])
hypothesis_batch, _ = pad_batch([row['hypothesis'] for row in batch])
label_batch = [row['label'] for row in batch]
label_batch = torch.stack([row['label'] for row in batch])

# PyTorch RNN requires batches to be transposed for speed and integration with CUDA
transpose = (
lambda b: torch.stack(b).t_().squeeze(0).contiguous())
transpose = (lambda b: b.t_().squeeze(0).contiguous())

return (transpose(premise_batch), transpose(hypothesis_batch), transpose(label_batch))
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@ flake8
# Mocking
mock

# # Optional NLP Utilities
# Optional NLP Utilities
# nltk
# spacy
# sacremoses

# # Optional CUDA Utilities
# Optional CUDA Utilities
# pynvrtc
# cupy

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def find_version(*file_paths):
long_description=long_description,
long_description_content_type='text/markdown',
license='BSD',
install_requires=['numpy', 'pandas', 'tqdm', 'ujson', 'requests'],
install_requires=['numpy', 'pandas', 'tqdm', 'requests'],
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Developers',
Expand Down
4 changes: 2 additions & 2 deletions tests/datasets/test_simple_qa.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import os
import shutil
import pytest

import mock
import pytest

from torchnlp.datasets import simple_qa_dataset
from tests.datasets.utils import urlretrieve_side_effect

directory = 'tests/_test_data/'


@pytest.mark.skip(reason="Simple Questions dataset url returns 404.")
@pytest.mark.skip(reason="Simple Questions dataset url sometimes returns 404.")
@mock.patch("urllib.request.urlretrieve")
def test_simple_qa_dataset_row(mock_urlretrieve):
mock_urlretrieve.side_effect = urlretrieve_side_effect
Expand Down
2 changes: 1 addition & 1 deletion tests/datasets/test_smt.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_smt_dataset_row(mock_urlretrieve):
" splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven" +
" Segal .",
'label':
'positive'
'very positive'
}

# Clean up
Expand Down
10 changes: 5 additions & 5 deletions tests/nn/test_weight_drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def test_weight_drop_lstm():
run2 = [x.sum() for x in wd_lstm(input_)[0].data]

# First time step, not influenced by hidden to hidden weights, should be equal
assert pytest.approx(run1[0]) == pytest.approx(run2[0])
assert pytest.approx(run1[0].item()) == pytest.approx(run2[0].item())
# Second step should not
assert run1[1] != run2[1]

Expand All @@ -40,7 +40,7 @@ def test_weight_drop_gru():
run2 = [x.sum() for x in wd_lstm(input_)[0].data]

# First time step, not influenced by hidden to hidden weights, should be equal
assert pytest.approx(run1[0]) == pytest.approx(run2[0])
assert pytest.approx(run1[0].item()) == pytest.approx(run2[0].item())
# Second step should not
assert run1[1] != run2[1]

Expand All @@ -53,7 +53,7 @@ def test_weight_drop():
run2 = [x.sum() for x in wd_lstm(input_)[0].data]

# First time step, not influenced by hidden to hidden weights, should be equal
assert pytest.approx(run1[0]) == pytest.approx(run2[0])
assert pytest.approx(run1[0].item()) == pytest.approx(run2[0].item())
# Second step should not
assert run1[1] != run2[1]

Expand All @@ -66,6 +66,6 @@ def test_weight_drop_zero():
run2 = [x.sum() for x in wd_lstm(input_)[0].data]

# First time step, not influenced by hidden to hidden weights, should be equal
assert pytest.approx(run1[0]) == pytest.approx(run2[0])
assert pytest.approx(run1[0].item()) == pytest.approx(run2[0].item())
# Second step should not
assert pytest.approx(run1[1]) == pytest.approx(run2[1])
assert pytest.approx(run1[1].item()) == pytest.approx(run2[1].item())
44 changes: 44 additions & 0 deletions tests/test_label_encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pickle

import pytest

from torchnlp.label_encoder import LabelEncoder
from torchnlp.label_encoder import UNKNOWN_TOKEN


@pytest.fixture
def encoder():
    """Provide a ``LabelEncoder`` built from two sample relation labels."""
    return LabelEncoder([
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes',
    ])


def test_label_encoder_vocab(encoder):
    """Check the vocabulary has three entries and agrees with ``vocab_size``."""
    # The fixture supplies two labels yet three entries are expected;
    # presumably the extra entry is the unknown token -- confirm against LabelEncoder.
    vocab_length = len(encoder.vocab)
    assert vocab_length == 3
    assert vocab_length == encoder.vocab_size


def test_label_encoder_scalar(encoder):
    """Check that the first element of an unseen label's encoding decodes to ``UNKNOWN_TOKEN``."""
    unseen_label = 'symbols/namesake/named_after'
    first_encoded = encoder.encode(unseen_label)[0]
    decoded = encoder.decode(first_encoded)
    assert decoded == UNKNOWN_TOKEN


def test_label_encoder_unknown(encoder):
    """Check that an unseen label encodes to one element that decodes to ``UNKNOWN_TOKEN``."""
    unseen_label = 'symbols/namesake/named_after'
    encoded = encoder.encode(unseen_label)
    assert len(encoded) == 1
    assert encoder.decode(encoded) == UNKNOWN_TOKEN


def test_label_encoder_known():
    """Check that a label present in the sample survives an encode/decode round trip."""
    known_label = 'symbols/namesake/named_after'
    # The label under test is placed last, matching the order the sample is built in.
    labels = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes',
        known_label,
    ]
    label_encoder = LabelEncoder(labels)
    encoded = label_encoder.encode(known_label)
    assert len(encoded) == 1
    assert label_encoder.decode(encoded) == known_label


def test_is_pickleable(encoder):
    """Check that the encoder can be serialized with ``pickle`` without raising."""
    # Only serialization is exercised; the result is not loaded back or inspected.
    pickle.dumps(encoder)
6 changes: 4 additions & 2 deletions tests/text_encoders/test_subword_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,10 @@ def test_encode_decode(self):

original = 'This is a coded sentence encoded by the SubwordTextTokenizer.'

encoder = SubwordTextTokenizer.build_to_target_size_from_corpus(
[corpus, original], target_size=100, min_val=2, max_val=10)
encoder = SubwordTextTokenizer.build_to_target_size_from_corpus([corpus, original],
target_size=100,
min_val=2,
max_val=10)

# Encoding should be reversible.
encoded = encoder.encode(original)
Expand Down
2 changes: 1 addition & 1 deletion torchnlp/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.3.5'
__version__ = '0.3.7.post1'
5 changes: 3 additions & 2 deletions torchnlp/datasets/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ def count_dataset(train=False,
seq_max_length (int, optional): Maximum sequence length.
Returns:
:class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset
, dev dataset and test dataset in order if their respective boolean argument is true.
:class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
Returns between one and all dataset splits (train, dev and test) depending on if their
respective boolean argument is ``True``.
Example:
>>> from torchnlp.datasets import count_dataset
Expand Down
3 changes: 3 additions & 0 deletions torchnlp/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,6 @@ def __str__(self):

def __eq__(self, other):
    """Return whether ``other`` has equal ``columns`` and equal ``rows``.

    ``rows`` are only compared when the ``columns`` comparison is truthy.
    """
    same_columns = self.columns == other.columns
    return same_columns and self.rows == other.rows

def __add__(self, other):
    """Return a new ``Dataset`` built from this dataset's rows followed by ``other``'s rows.

    Neither operand is modified by this method itself; the combined rows come
    from ``self.rows + other.rows`` (assumes ``rows`` supports ``+``, e.g. a
    list -- confirm against the class).
    """
    combined_rows = self.rows + other.rows
    return Dataset(combined_rows)
5 changes: 3 additions & 2 deletions torchnlp/datasets/imdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ def imdb_dataset(directory='data/',
sentiments (list of str, optional): Sentiments to load from the dataset.
Returns:
:class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset and
test dataset in order if their respective boolean argument is true.
:class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
Returns between one and all dataset splits (train and test) depending on whether their
respective boolean argument is ``True``.
Example:
>>> from torchnlp.datasets import imdb_dataset
Expand Down
5 changes: 3 additions & 2 deletions torchnlp/datasets/iwslt.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,9 @@ def iwslt_dataset(
url (str, optional): URL of the dataset file.
Returns:
:class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training tokens, dev
tokens and test tokens in order if their respective boolean argument is true.
:class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
Returns between one and all dataset splits (train, dev and test) depending on if their
respective boolean argument is ``True``.
Example:
>>> from torchnlp.datasets import iwslt_dataset
Expand Down
5 changes: 3 additions & 2 deletions torchnlp/datasets/multi30k.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,9 @@ def multi30k_dataset(directory='data/multi30k/',
urls (str, optional): URLs to download.
Returns:
:class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training tokens, dev
tokens and test tokens in order if their respective boolean argument is true.
:class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
Returns between one and all dataset splits (train, dev and test) depending on if their
respective boolean argument is ``True``.
Example:
>>> from torchnlp.datasets import multi30k_dataset
Expand Down
5 changes: 3 additions & 2 deletions torchnlp/datasets/penn_treebank.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,9 @@ def penn_treebank_dataset(
urls (str, optional): URLs to download.
Returns:
:class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training tokens, dev
tokens and test tokens in order if their respective boolean argument is true.
:class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
Returns between one and all dataset splits (train, dev and test) depending on if their
respective boolean argument is ``True``.
Example:
>>> from torchnlp.datasets import penn_treebank_dataset
Expand Down
5 changes: 3 additions & 2 deletions torchnlp/datasets/reverse.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ def reverse_dataset(train=False,
seq_max_length (int, optional): Maximum sequence length.
Returns:
:class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset
, dev dataset and test dataset in order if their respective boolean argument is true.
:class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
Returns between one and all dataset splits (train, dev and test) depending on if their
respective boolean argument is ``True``.
Example:
>>> from torchnlp.datasets import reverse_dataset
Expand Down
5 changes: 3 additions & 2 deletions torchnlp/datasets/simple_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ def simple_qa_dataset(directory='data/',
url (str, optional): URL of the dataset `tar.gz` file.
Returns:
:class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset
, dev dataset and test dataset in order if their respective boolean argument is true.
:class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
Returns between one and all dataset splits (train, dev and test) depending on if their
respective boolean argument is ``True``.
Example:
>>> from torchnlp.datasets import simple_qa_dataset
Expand Down
4 changes: 2 additions & 2 deletions torchnlp/datasets/smt.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,9 @@ def smt_dataset(directory='data/',
for line in f:
line = line.strip()
if subtrees:
examples.extend(parse_tree(line, subtrees=subtrees))
examples.extend(parse_tree(line, subtrees=subtrees, fine_grained=fine_grained))
else:
examples.append(parse_tree(line, subtrees=subtrees))
examples.append(parse_tree(line, subtrees=subtrees, fine_grained=fine_grained))
ret.append(Dataset(examples))

if len(ret) == 1:
Expand Down
7 changes: 4 additions & 3 deletions torchnlp/datasets/snli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import io

import ujson as json
import json

from torchnlp.download import download_file_maybe_extract
from torchnlp.datasets.dataset import Dataset
Expand Down Expand Up @@ -47,8 +47,9 @@ def snli_dataset(directory='data/',
url (str, optional): URL of the dataset `tar.gz` file.
Returns:
:class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training tokens, dev
tokens and test tokens in order if their respective boolean argument is true.
:class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
Returns between one and all dataset splits (train, dev and test) depending on if their
respective boolean argument is ``True``.
Example:
>>> from torchnlp.datasets import snli_dataset
Expand Down
5 changes: 3 additions & 2 deletions torchnlp/datasets/trec.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@ def trec_dataset(directory='data/trec/',
urls (str, optional): URLs to download.
Returns:
:class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training tokens, dev
tokens and test tokens in order if their respective boolean argument is true.
:class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
Returns between one and all dataset splits (train, dev and test) depending on if their
respective boolean argument is ``True``.
Example:
>>> from torchnlp.datasets import trec_dataset
Expand Down
5 changes: 3 additions & 2 deletions torchnlp/datasets/ud_pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,9 @@ def ud_pos_dataset(directory='data/',
url (str, optional): URL of the dataset `tar.gz` file.
Returns:
:class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training tokens, dev
tokens and test tokens in order if their respective boolean argument is true.
:class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
Returns between one and all dataset splits (train, dev and test) depending on if their
respective boolean argument is ``True``.
Example:
>>> from torchnlp.datasets import ud_pos_dataset
Expand Down
5 changes: 3 additions & 2 deletions torchnlp/datasets/wikitext_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@ def wikitext_2_dataset(
url (str, optional): URL of the dataset `tar.gz` file.
Returns:
:class:`tuple` of :class:`list` of :class:`str`: Tuple with the training tokens, dev tokens
and test tokens in order if their respective boolean argument is true.
:class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
Returns between one and all dataset splits (train, dev and test) depending on if their
respective boolean argument is ``True``.
Example:
>>> from torchnlp.datasets import wikitext_2_dataset
Expand Down
5 changes: 3 additions & 2 deletions torchnlp/datasets/wmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,9 @@ def wmt_dataset(directory='data/wmt16_en_de',
url (str, optional): URL of the dataset `tar.gz` file.
Returns:
:class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training tokens, dev
tokens and test tokens in order if their respective boolean argument is true.
:class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
Returns between one and all dataset splits (train, dev and test) depending on if their
respective boolean argument is ``True``.
Example:
>>> from torchnlp.datasets import wmt_dataset
Expand Down
5 changes: 3 additions & 2 deletions torchnlp/datasets/zero.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ def zero_dataset(train=False, dev=False, test=False, train_rows=256, dev_rows=64
test_rows (int, optional): Number of test rows to generate.
Returns:
:class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset
, dev dataset and test dataset in order if their respective boolean argument is true.
:class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
Returns between one and all dataset splits (train, dev and test) depending on if their
respective boolean argument is ``True``.
Example:
>>> from torchnlp.datasets import zero_dataset
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
class TextEncoder(object):
class Encoder(object):
""" Base class for a text encoder.
"""

Expand Down
Loading

0 comments on commit 133a54c

Please sign in to comment.