Commit 071b8fb

Replace models.TokenOffsets with strutils.ByteOffsets

marco-nicola committed Dec 11, 2020
1 parent 9d5a4b8
Showing 5 changed files with 30 additions and 29 deletions.
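The strutils package itself is not touched by this diff, so the exact definition of ByteOffsets is not shown here. Judging from how it is constructed below (a struct literal with integer Start and End fields, where End is exclusive), it is presumably a drop-in replacement for the removed models.TokenOffsets, along these lines:

package strutils

// ByteOffsets is the presumed shape of the new type, inferred from its
// usage in this commit (an assumption; the actual source is not shown).
// Start is the index of the token's first byte in the original string;
// End is the index one past its last byte.
type ByteOffsets struct {
    Start int
    End   int
}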

models/bpemodel/bpemodel.go (2 additions, 1 deletion)

@@ -7,6 +7,7 @@ package bpemodel
 import (
     "fmt"
     "github.com/nlpodyssey/gotokenizers/models"
+    "github.com/nlpodyssey/gotokenizers/strutils"
     "github.com/nlpodyssey/gotokenizers/vocabulary"
 )

@@ -193,7 +194,7 @@ func (m *BPEModel) wordToTokens(word *Word) ([]models.Token, error) {
         tokens[i] = models.Token{
             ID:    wordSymbol.ID,
             Value: value,
-            Offsets: models.TokenOffsets{
+            Offsets: strutils.ByteOffsets{
                 Start: offsetStart,
                 End:   offsetEnd,
             },

models/bpemodel/bpemodel_test.go (11 additions, 10 deletions)

@@ -6,6 +6,7 @@ package bpemodel

 import (
     "github.com/nlpodyssey/gotokenizers/models"
+    "github.com/nlpodyssey/gotokenizers/strutils"
     "github.com/nlpodyssey/gotokenizers/vocabulary"
     "reflect"
     "testing"

@@ -81,7 +82,7 @@ func TestTokenizeWithAndWithoutDropout(t *testing.T) {
         {
             ID:      15,
             Value:   "unrelated",
-            Offsets: models.TokenOffsets{Start: 0, End: 9},
+            Offsets: strutils.ByteOffsets{Start: 0, End: 9},
         },
     }
     if !reflect.DeepEqual(tokens, expectedTokens) {

@@ -105,15 +106,15 @@ func TestTokenizeWithAndWithoutDropout(t *testing.T) {
     }

     expectedTokens = []models.Token{
-        {ID: 0, Value: "u", Offsets: models.TokenOffsets{Start: 0, End: 1}},
-        {ID: 1, Value: "n", Offsets: models.TokenOffsets{Start: 1, End: 2}},
-        {ID: 2, Value: "r", Offsets: models.TokenOffsets{Start: 2, End: 3}},
-        {ID: 3, Value: "e", Offsets: models.TokenOffsets{Start: 3, End: 4}},
-        {ID: 4, Value: "l", Offsets: models.TokenOffsets{Start: 4, End: 5}},
-        {ID: 5, Value: "a", Offsets: models.TokenOffsets{Start: 5, End: 6}},
-        {ID: 6, Value: "t", Offsets: models.TokenOffsets{Start: 6, End: 7}},
-        {ID: 3, Value: "e", Offsets: models.TokenOffsets{Start: 7, End: 8}},
-        {ID: 7, Value: "d", Offsets: models.TokenOffsets{Start: 8, End: 9}},
+        {ID: 0, Value: "u", Offsets: strutils.ByteOffsets{Start: 0, End: 1}},
+        {ID: 1, Value: "n", Offsets: strutils.ByteOffsets{Start: 1, End: 2}},
+        {ID: 2, Value: "r", Offsets: strutils.ByteOffsets{Start: 2, End: 3}},
+        {ID: 3, Value: "e", Offsets: strutils.ByteOffsets{Start: 3, End: 4}},
+        {ID: 4, Value: "l", Offsets: strutils.ByteOffsets{Start: 4, End: 5}},
+        {ID: 5, Value: "a", Offsets: strutils.ByteOffsets{Start: 5, End: 6}},
+        {ID: 6, Value: "t", Offsets: strutils.ByteOffsets{Start: 6, End: 7}},
+        {ID: 3, Value: "e", Offsets: strutils.ByteOffsets{Start: 7, End: 8}},
+        {ID: 7, Value: "d", Offsets: strutils.ByteOffsets{Start: 8, End: 9}},
     }
     if !reflect.DeepEqual(tokens, expectedTokens) {
         t.Errorf("expected %+v, actual %+v", expectedTokens, tokens)

models/models.go (3 additions, 6 deletions)

@@ -4,6 +4,8 @@

 package models

+import "github.com/nlpodyssey/gotokenizers/strutils"
+
 // Model represents a model used during Tokenization (like BPE or Word or Unigram).
 type Model interface {
     // Tokenize tokenizes the given sequence into multiple underlying Tokens.

@@ -14,10 +16,5 @@ type Model interface {
 type Token struct {
     ID     int
     Value  string
-    Offsets TokenOffsets
-}
-
-type TokenOffsets struct {
-    Start int
-    End   int
+    Offsets strutils.ByteOffsets
 }
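For code that builds or consumes tokens, only the qualified name of the offsets type changes; the fields are the same. A minimal sketch of constructing a Token after this commit (the field values are illustrative, taken from the test data above):

package main

import (
    "fmt"

    "github.com/nlpodyssey/gotokenizers/models"
    "github.com/nlpodyssey/gotokenizers/strutils"
)

func main() {
    // Same struct as before the commit, except that Offsets is now
    // strutils.ByteOffsets instead of the removed models.TokenOffsets.
    token := models.Token{
        ID:      15,
        Value:   "unrelated",
        Offsets: strutils.ByteOffsets{Start: 0, End: 9},
    }
    fmt.Printf("%+v\n", token) // {ID:15 Value:unrelated Offsets:{Start:0 End:9}}
}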

models/wordpiecemodel/wordpiecemodel.go (4 additions, 3 deletions)

@@ -7,6 +7,7 @@ package wordpiecemodel
 import (
     "fmt"
     "github.com/nlpodyssey/gotokenizers/models"
+    "github.com/nlpodyssey/gotokenizers/strutils"
     "github.com/nlpodyssey/gotokenizers/vocabulary"
 )

@@ -60,7 +61,7 @@ func (m *WordPieceModel) Tokenize(sequence string) ([]models.Token, error) {
         return []models.Token{{
             ID:      unkTokenID,
             Value:   m.unknownToken,
-            Offsets: models.TokenOffsets{Start: 0, End: len(sequence)},
+            Offsets: strutils.ByteOffsets{Start: 0, End: len(sequence)},
         }}, nil
     }

@@ -85,7 +86,7 @@ func (m *WordPieceModel) Tokenize(sequence string) ([]models.Token, error) {
                 curToken = models.Token{
                     ID:      id,
                     Value:   subStr,
-                    Offsets: models.TokenOffsets{Start: start, End: end},
+                    Offsets: strutils.ByteOffsets{Start: start, End: end},
                 }
                 break
             }

@@ -114,7 +115,7 @@ func (m *WordPieceModel) Tokenize(sequence string) ([]models.Token, error) {
         return []models.Token{{
             ID:      unkTokenID,
             Value:   m.unknownToken,
-            Offsets: models.TokenOffsets{Start: 0, End: len(sequence)},
+            Offsets: strutils.ByteOffsets{Start: 0, End: len(sequence)},
         }}, nil
     }
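The new name is also more precise about semantics: in Go, len(sequence) and string slicing count bytes, not runes, so the Start/End values produced here are byte positions. For ASCII test inputs like those below the two coincide, but for multi-byte UTF-8 input they differ. A standalone illustration (not part of the commit):

package main

import (
    "fmt"
    "unicode/utf8"
)

func main() {
    s := "über" // 'ü' occupies two bytes in UTF-8
    fmt.Println(len(s))                    // 5 (bytes)
    fmt.Println(utf8.RuneCountInString(s)) // 4 (runes)
    // An unknown-token offset of {Start: 0, End: len(s)}
    // therefore spans 5 bytes, not 4 characters.
}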

models/wordpiecemodel/wordpiecemodel_test.go (10 additions, 9 deletions)

@@ -7,6 +7,7 @@ package wordpiecemodel
 import (
     "fmt"
     "github.com/nlpodyssey/gotokenizers/models"
+    "github.com/nlpodyssey/gotokenizers/strutils"
     "github.com/nlpodyssey/gotokenizers/vocabulary"
     "reflect"
     "testing"

@@ -48,40 +49,40 @@ func TestWordPieceModelTokenize(t *testing.T) {
         {
             "foo",
             []models.Token{
-                {ID: 1, Value: "foo", Offsets: models.TokenOffsets{Start: 0, End: 3}},
+                {ID: 1, Value: "foo", Offsets: strutils.ByteOffsets{Start: 0, End: 3}},
             },
         },
         {
             "barbaz",
             []models.Token{
-                {ID: 3, Value: "bar", Offsets: models.TokenOffsets{Start: 0, End: 3}},
-                {ID: 6, Value: "##baz", Offsets: models.TokenOffsets{Start: 3, End: 6}},
+                {ID: 3, Value: "bar", Offsets: strutils.ByteOffsets{Start: 0, End: 3}},
+                {ID: 6, Value: "##baz", Offsets: strutils.ByteOffsets{Start: 3, End: 6}},
             },
         },
         {
             "alphabetagamma",
             []models.Token{
-                {ID: 0, Value: "[UNK]", Offsets: models.TokenOffsets{Start: 0, End: 14}},
+                {ID: 0, Value: "[UNK]", Offsets: strutils.ByteOffsets{Start: 0, End: 14}},
             },
         },
         {
             "foobarbaz",
             []models.Token{
-                {ID: 1, Value: "foo", Offsets: models.TokenOffsets{Start: 0, End: 3}},
-                {ID: 4, Value: "##bar", Offsets: models.TokenOffsets{Start: 3, End: 6}},
-                {ID: 6, Value: "##baz", Offsets: models.TokenOffsets{Start: 6, End: 9}},
+                {ID: 1, Value: "foo", Offsets: strutils.ByteOffsets{Start: 0, End: 3}},
+                {ID: 4, Value: "##bar", Offsets: strutils.ByteOffsets{Start: 3, End: 6}},
+                {ID: 6, Value: "##baz", Offsets: strutils.ByteOffsets{Start: 6, End: 9}},
             },
         },
         {
             "qux",
             []models.Token{
-                {ID: 0, Value: "[UNK]", Offsets: models.TokenOffsets{Start: 0, End: 3}},
+                {ID: 0, Value: "[UNK]", Offsets: strutils.ByteOffsets{Start: 0, End: 3}},
             },
         },
         {
             "veryverylongterm",
             []models.Token{
-                {ID: 0, Value: "[UNK]", Offsets: models.TokenOffsets{Start: 0, End: 16}},
+                {ID: 0, Value: "[UNK]", Offsets: strutils.ByteOffsets{Start: 0, End: 16}},
             },
         },
     }
