Replace normalizedstring.Offsets with strutils.ByteOffsets
marco-nicola committed Dec 11, 2020
1 parent 071b8fb commit a278316
Showing 10 changed files with 107 additions and 111 deletions.
12 changes: 2 additions & 10 deletions normalizedstring/normalizedstring.go
@@ -46,14 +46,6 @@ type AlignmentRange struct {
end int
}

-// Offsets represents a (start, end) range of offsets.
-type Offsets struct {
-// Start byte position, inclusive.
-Start int
-// End byte position, exclusive.
-End int
-}
-
// New returns a new NormalizedString.
func New(
original string,
@@ -122,8 +114,8 @@ func (ns *NormalizedString) IsEmpty() bool {
}

// OriginalOffsets returns the original offsets.
-func (ns *NormalizedString) OriginalOffsets() Offsets {
-return Offsets{
+func (ns *NormalizedString) OriginalOffsets() strutils.ByteOffsets {
+return strutils.ByteOffsets{
Start: ns.originalShift,
End: ns.originalShift + ns.OriginalLen(),
}
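
Note: the replacement type itself is not part of this diff. Assuming it simply mirrors the removed struct under its new name, its definition in the strutils package presumably looks like the following sketch (field names and the inclusive/exclusive comments are carried over from the deleted code above; everything else is the assumption):

// Sketch only — the actual definition lives in
// github.com/nlpodyssey/gotokenizers/strutils and is not shown in this commit.
package strutils

// ByteOffsets represents a (start, end) range of byte offsets.
type ByteOffsets struct {
	// Start byte position, inclusive.
	Start int
	// End byte position, exclusive.
	End int
}
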
13 changes: 7 additions & 6 deletions normalizedstring/normalizedstring_test.go
@@ -6,6 +6,7 @@ package normalizedstring

import (
"github.com/nlpodyssey/gotokenizers/splitpattern"
"github.com/nlpodyssey/gotokenizers/strutils"
"reflect"
"regexp"
"testing"
@@ -169,22 +170,22 @@ func TestNormalizedStringOriginalOffsets(t *testing.T) {
t.Parallel()

ns := New("", "x", []AlignmentRange{}, 0)
-assertEqual(t, ns.OriginalOffsets(), Offsets{0, 0})
+assertEqual(t, ns.OriginalOffsets(), strutils.ByteOffsets{Start: 0, End: 0})

ns = New("", "x", []AlignmentRange{}, 42)
-assertEqual(t, ns.OriginalOffsets(), Offsets{42, 42})
+assertEqual(t, ns.OriginalOffsets(), strutils.ByteOffsets{Start: 42, End: 42})

ns = New("Foo", "", []AlignmentRange{}, 0)
-assertEqual(t, ns.OriginalOffsets(), Offsets{0, 3})
+assertEqual(t, ns.OriginalOffsets(), strutils.ByteOffsets{Start: 0, End: 3})

ns = New("Foo", "", []AlignmentRange{}, 42)
-assertEqual(t, ns.OriginalOffsets(), Offsets{42, 45})
+assertEqual(t, ns.OriginalOffsets(), strutils.ByteOffsets{Start: 42, End: 45})

ns = New("ℝ", "", []AlignmentRange{}, 0)
-assertEqual(t, ns.OriginalOffsets(), Offsets{0, 3})
+assertEqual(t, ns.OriginalOffsets(), strutils.ByteOffsets{Start: 0, End: 3})

ns = New("ℝ", "", []AlignmentRange{}, 42)
-assertEqual(t, ns.OriginalOffsets(), Offsets{42, 45})
+assertEqual(t, ns.OriginalOffsets(), strutils.ByteOffsets{Start: 42, End: 45})
}

func TestNormalizedStringPrepend(t *testing.T) {
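
These expectations are byte counts, not rune counts: "ℝ" (U+211D) takes three bytes in UTF-8, which is why its offsets end at 3 (or at 45 with an original shift of 42), just like the ASCII string "Foo". A quick standalone check of that arithmetic:

package main

import "fmt"

func main() {
	// len on a Go string counts bytes; ByteOffsets follow the same convention.
	fmt.Println(len("Foo")) // 3
	fmt.Println(len("ℝ"))   // 3 — a single rune spanning three bytes
}
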
3 changes: 2 additions & 1 deletion pretokenizedstring/pretokenizedstring.go
@@ -7,6 +7,7 @@ package pretokenizedstring
import (
"github.com/nlpodyssey/gotokenizers/models"
"github.com/nlpodyssey/gotokenizers/normalizedstring"
"github.com/nlpodyssey/gotokenizers/strutils"
)

// PreTokenizedString is in charge of splitting an underlying string,
@@ -142,7 +143,7 @@ func (p *PreTokenizedString) GetNormalizedByteSplits() []NormalizedByteSplit {

result[i] = NormalizedByteSplit{
String: split.NormalizedString.Get(),
-Offsets: normalizedstring.Offsets{
+Offsets: strutils.ByteOffsets{
Start: start,
End: offset,
},
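
Only the struct literal changes in this hunk; the surrounding GetNormalizedByteSplits loop (largely collapsed in this view) keeps a running byte offset over the normalized splits, as the visible start/offset variables suggest. The offset arithmetic can be illustrated in isolation — this is a self-contained sketch of the pattern, not the actual method body, and the local ByteOffsets type merely stands in for strutils.ByteOffsets:

package main

import "fmt"

// ByteOffsets stands in for strutils.ByteOffsets in this illustration.
type ByteOffsets struct{ Start, End int }

func main() {
	// Normalized split strings taken from the byte-level test expectations below.
	splits := []string{"ĠHello", "Ġmy", "Ġfriend"}
	offset := 0
	for _, s := range splits {
		start := offset
		offset += len(s) // advance by the byte length of the normalized split
		fmt.Println(s, ByteOffsets{Start: start, End: offset})
	}
	// Prints {0 7}, {7 11}, {11 19} — matching the NormalizedByteSplit
	// offsets in the byte-level pre-tokenizer test further down.
}
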
5 changes: 3 additions & 2 deletions pretokenizedstring/split.go
@@ -7,6 +7,7 @@ package pretokenizedstring
import (
"github.com/nlpodyssey/gotokenizers/models"
"github.com/nlpodyssey/gotokenizers/normalizedstring"
"github.com/nlpodyssey/gotokenizers/strutils"
)

// Split is a wrapper for a subpart of a NormalizedString.
@@ -28,7 +29,7 @@ type OriginalByteSplit struct {
// A slice of the normalized string
String string
// The associated bytes offsets, in the original referential
-Offsets normalizedstring.Offsets
+Offsets strutils.ByteOffsets
// The potential tokens
Tokens *[]models.Token
}
@@ -37,7 +38,7 @@ type NormalizedByteSplit struct {
// A slice of the normalized string
String string
// The associated bytes offsets, in the normalized referential
-Offsets normalizedstring.Offsets
+Offsets strutils.ByteOffsets
// The potential tokens
Tokens *[]models.Token
}
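
Constructing the updated split types changes only in the package of the offsets field. A minimal illustration of building one by hand (the nil Tokens value is just for brevity; the offsets mirror the "Hey" case in the tests below):

package main

import (
	"fmt"

	"github.com/nlpodyssey/gotokenizers/pretokenizedstring"
	"github.com/nlpodyssey/gotokenizers/strutils"
)

func main() {
	split := pretokenizedstring.OriginalByteSplit{
		String:  "Hey",
		Offsets: strutils.ByteOffsets{Start: 0, End: 3},
		Tokens:  nil, // *[]models.Token; no tokens attached in this example
	}
	fmt.Println(split.String, split.Offsets)
}
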
31 changes: 16 additions & 15 deletions pretokenizers/bertpretokenizer/bertpretokenizer_test.go
@@ -7,6 +7,7 @@ package bertpretokenizer
import (
"github.com/nlpodyssey/gotokenizers/normalizedstring"
"github.com/nlpodyssey/gotokenizers/pretokenizedstring"
"github.com/nlpodyssey/gotokenizers/strutils"
"reflect"
"testing"
)
@@ -23,15 +24,15 @@ func TestBertPreTokenizer_PreTokenize(t *testing.T) {
}

assertEqual(t, pts.GetOriginalByteSplits(), []pretokenizedstring.OriginalByteSplit{
{String: "Hey", Offsets: normalizedstring.Offsets{Start: 0, End: 3}},
{String: "friend", Offsets: normalizedstring.Offsets{Start: 4, End: 10}},
{String: "!", Offsets: normalizedstring.Offsets{Start: 10, End: 11}},
{String: "How", Offsets: normalizedstring.Offsets{Start: 16, End: 19}},
{String: "are", Offsets: normalizedstring.Offsets{Start: 20, End: 23}},
{String: "you", Offsets: normalizedstring.Offsets{Start: 24, End: 27}},
{String: "?", Offsets: normalizedstring.Offsets{Start: 27, End: 28}},
{String: "!", Offsets: normalizedstring.Offsets{Start: 28, End: 29}},
{String: "?", Offsets: normalizedstring.Offsets{Start: 29, End: 30}},
{String: "Hey", Offsets: strutils.ByteOffsets{Start: 0, End: 3}},
{String: "friend", Offsets: strutils.ByteOffsets{Start: 4, End: 10}},
{String: "!", Offsets: strutils.ByteOffsets{Start: 10, End: 11}},
{String: "How", Offsets: strutils.ByteOffsets{Start: 16, End: 19}},
{String: "are", Offsets: strutils.ByteOffsets{Start: 20, End: 23}},
{String: "you", Offsets: strutils.ByteOffsets{Start: 24, End: 27}},
{String: "?", Offsets: strutils.ByteOffsets{Start: 27, End: 28}},
{String: "!", Offsets: strutils.ByteOffsets{Start: 28, End: 29}},
{String: "?", Offsets: strutils.ByteOffsets{Start: 29, End: 30}},
})
})

@@ -62,12 +63,12 @@ }
}

assertEqual(t, pts.GetOriginalByteSplits(), []pretokenizedstring.OriginalByteSplit{
{String: "野", Offsets: normalizedstring.Offsets{Start: 0, End: 3}},
{String: "口", Offsets: normalizedstring.Offsets{Start: 3, End: 6}},
{String: "里", Offsets: normalizedstring.Offsets{Start: 6, End: 9}},
{String: "佳", Offsets: normalizedstring.Offsets{Start: 9, End: 12}},
{String: "Noguchi", Offsets: normalizedstring.Offsets{Start: 13, End: 20}},
{String: "Rika", Offsets: normalizedstring.Offsets{Start: 21, End: 25}},
{String: "野", Offsets: strutils.ByteOffsets{Start: 0, End: 3}},
{String: "口", Offsets: strutils.ByteOffsets{Start: 3, End: 6}},
{String: "里", Offsets: strutils.ByteOffsets{Start: 6, End: 9}},
{String: "佳", Offsets: strutils.ByteOffsets{Start: 9, End: 12}},
{String: "Noguchi", Offsets: strutils.ByteOffsets{Start: 13, End: 20}},
{String: "Rika", Offsets: strutils.ByteOffsets{Start: 21, End: 25}},
})
})
}
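
The Japanese test case above works out because each CJK character occupies three bytes in UTF-8, which is where the 0–3, 3–6, 6–9, 9–12 byte offsets come from:

package main

import "fmt"

func main() {
	for _, s := range []string{"野", "口", "里", "佳"} {
		fmt.Println(s, len(s)) // each character is 3 bytes, so offsets advance by 3
	}
}
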
72 changes: 36 additions & 36 deletions pretokenizers/bytelevelpretokenizer/bytelevelpretokenizer_test.go
@@ -6,8 +6,8 @@ package bytelevelpretokenizer

import (
"fmt"
"github.com/nlpodyssey/gotokenizers/normalizedstring"
"github.com/nlpodyssey/gotokenizers/pretokenizedstring"
"github.com/nlpodyssey/gotokenizers/strutils"
"reflect"
"testing"
)
@@ -27,16 +27,16 @@ func TestByteLevelPreTokenizer_PreTokenize(t *testing.T) {
}

assertEqual(t, pts.GetOriginalByteSplits(), []pretokenizedstring.OriginalByteSplit{
{String: "Hello", Offsets: normalizedstring.Offsets{Start: 0, End: 5}},
{String: "Ġmy", Offsets: normalizedstring.Offsets{Start: 5, End: 8}},
{String: "Ġfriend", Offsets: normalizedstring.Offsets{Start: 8, End: 15}},
{String: ",", Offsets: normalizedstring.Offsets{Start: 15, End: 16}},
{String: "Ġhow", Offsets: normalizedstring.Offsets{Start: 16, End: 20}},
{String: "Ġis", Offsets: normalizedstring.Offsets{Start: 20, End: 23}},
{String: "Ġyour", Offsets: normalizedstring.Offsets{Start: 23, End: 28}},
{String: "Ġday", Offsets: normalizedstring.Offsets{Start: 28, End: 32}},
{String: "Ġgoing", Offsets: normalizedstring.Offsets{Start: 32, End: 38}},
{String: "?", Offsets: normalizedstring.Offsets{Start: 38, End: 39}},
{String: "Hello", Offsets: strutils.ByteOffsets{Start: 0, End: 5}},
{String: "Ġmy", Offsets: strutils.ByteOffsets{Start: 5, End: 8}},
{String: "Ġfriend", Offsets: strutils.ByteOffsets{Start: 8, End: 15}},
{String: ",", Offsets: strutils.ByteOffsets{Start: 15, End: 16}},
{String: "Ġhow", Offsets: strutils.ByteOffsets{Start: 16, End: 20}},
{String: "Ġis", Offsets: strutils.ByteOffsets{Start: 20, End: 23}},
{String: "Ġyour", Offsets: strutils.ByteOffsets{Start: 23, End: 28}},
{String: "Ġday", Offsets: strutils.ByteOffsets{Start: 28, End: 32}},
{String: "Ġgoing", Offsets: strutils.ByteOffsets{Start: 32, End: 38}},
{String: "?", Offsets: strutils.ByteOffsets{Start: 38, End: 39}},
})
})

@@ -60,16 +60,16 @@ func TestByteLevelPreTokenizer_PreTokenize(t *testing.T) {
}

assertEqual(t, pts.GetNormalizedByteSplits(), []pretokenizedstring.NormalizedByteSplit{
{String: "ĠHello", Offsets: normalizedstring.Offsets{Start: 0, End: 7}},
{String: "Ġmy", Offsets: normalizedstring.Offsets{Start: 7, End: 11}},
{String: "Ġfriend", Offsets: normalizedstring.Offsets{Start: 11, End: 19}},
{String: ",", Offsets: normalizedstring.Offsets{Start: 19, End: 20}},
{String: "Ġhow", Offsets: normalizedstring.Offsets{Start: 20, End: 25}},
{String: "Ġis", Offsets: normalizedstring.Offsets{Start: 25, End: 29}},
{String: "Ġyour", Offsets: normalizedstring.Offsets{Start: 29, End: 35}},
{String: "Ġday", Offsets: normalizedstring.Offsets{Start: 35, End: 40}},
{String: "Ġgoing", Offsets: normalizedstring.Offsets{Start: 40, End: 47}},
{String: "?", Offsets: normalizedstring.Offsets{Start: 47, End: 48}},
{String: "ĠHello", Offsets: strutils.ByteOffsets{Start: 0, End: 7}},
{String: "Ġmy", Offsets: strutils.ByteOffsets{Start: 7, End: 11}},
{String: "Ġfriend", Offsets: strutils.ByteOffsets{Start: 11, End: 19}},
{String: ",", Offsets: strutils.ByteOffsets{Start: 19, End: 20}},
{String: "Ġhow", Offsets: strutils.ByteOffsets{Start: 20, End: 25}},
{String: "Ġis", Offsets: strutils.ByteOffsets{Start: 25, End: 29}},
{String: "Ġyour", Offsets: strutils.ByteOffsets{Start: 29, End: 35}},
{String: "Ġday", Offsets: strutils.ByteOffsets{Start: 35, End: 40}},
{String: "Ġgoing", Offsets: strutils.ByteOffsets{Start: 40, End: 47}},
{String: "?", Offsets: strutils.ByteOffsets{Start: 47, End: 48}},
})
})
}
@@ -87,11 +87,11 @@ func TestByteLevelPreTokenizer_PreTokenize(t *testing.T) {
}

assertEqual(t, pts.GetOriginalByteSplits(), []pretokenizedstring.OriginalByteSplit{
{String: "Hello", Offsets: normalizedstring.Offsets{Start: 0, End: 5}},
{String: "Ġthere", Offsets: normalizedstring.Offsets{Start: 5, End: 11}},
{String: "Ċ", Offsets: normalizedstring.Offsets{Start: 11, End: 12}},
{String: "Hello", Offsets: normalizedstring.Offsets{Start: 12, End: 17}},
{String: "Ġthere", Offsets: normalizedstring.Offsets{Start: 17, End: 23}},
{String: "Hello", Offsets: strutils.ByteOffsets{Start: 0, End: 5}},
{String: "Ġthere", Offsets: strutils.ByteOffsets{Start: 5, End: 11}},
{String: "Ċ", Offsets: strutils.ByteOffsets{Start: 11, End: 12}},
{String: "Hello", Offsets: strutils.ByteOffsets{Start: 12, End: 17}},
{String: "Ġthere", Offsets: strutils.ByteOffsets{Start: 17, End: 23}},
})
})

@@ -107,10 +107,10 @@ func TestByteLevelPreTokenizer_PreTokenize(t *testing.T) {
}

assertEqual(t, pts.GetOriginalByteSplits(), []pretokenizedstring.OriginalByteSplit{
{String: "Hello", Offsets: normalizedstring.Offsets{Start: 0, End: 5}},
{String: "Ġthere", Offsets: normalizedstring.Offsets{Start: 5, End: 11}},
{String: "ĠĠĠĠĠĠ", Offsets: normalizedstring.Offsets{Start: 11, End: 17}},
{String: "Ġdear", Offsets: normalizedstring.Offsets{Start: 17, End: 22}},
{String: "Hello", Offsets: strutils.ByteOffsets{Start: 0, End: 5}},
{String: "Ġthere", Offsets: strutils.ByteOffsets{Start: 5, End: 11}},
{String: "ĠĠĠĠĠĠ", Offsets: strutils.ByteOffsets{Start: 11, End: 17}},
{String: "Ġdear", Offsets: strutils.ByteOffsets{Start: 17, End: 22}},
})
})

@@ -128,15 +128,15 @@ func TestByteLevelPreTokenizer_PreTokenize(t *testing.T) {
}

assertEqual(t, pts.GetOriginalByteSplits(), []pretokenizedstring.OriginalByteSplit{
{String: "i", Offsets: normalizedstring.Offsets{Start: 0, End: 1}},
{String: "âŃ¢", Offsets: normalizedstring.Offsets{Start: 1, End: 4}},
{String: "j", Offsets: normalizedstring.Offsets{Start: 4, End: 5}},
{String: "i", Offsets: strutils.ByteOffsets{Start: 0, End: 1}},
{String: "âŃ¢", Offsets: strutils.ByteOffsets{Start: 1, End: 4}},
{String: "j", Offsets: strutils.ByteOffsets{Start: 4, End: 5}},
})

assertEqual(t, pts.GetNormalizedByteSplits(), []pretokenizedstring.NormalizedByteSplit{
{String: "i", Offsets: normalizedstring.Offsets{Start: 0, End: 1}},
{String: "âŃ¢", Offsets: normalizedstring.Offsets{Start: 1, End: 7}},
{String: "j", Offsets: normalizedstring.Offsets{Start: 7, End: 8}},
{String: "i", Offsets: strutils.ByteOffsets{Start: 0, End: 1}},
{String: "âŃ¢", Offsets: strutils.ByteOffsets{Start: 1, End: 7}},
{String: "j", Offsets: strutils.ByteOffsets{Start: 7, End: 8}},
})

strings := make([]string, 0)
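
In the last test case the middle split ends at byte 4 in the original referential but at byte 7 in the normalized one. That matches the byte-level pre-tokenizer's behaviour of re-encoding every input byte as a printable rune: a three-byte input character becomes the six-byte string "âŃ¢" after normalization (the exact input rune sits in a collapsed part of the test and is not visible here). The lengths can be checked directly:

package main

import "fmt"

func main() {
	fmt.Println(len("âŃ¢")) // 6 — normalized offsets 1..7 in the test above
	// The original character it maps from is 3 bytes wide, hence original offsets 1..4.
}
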
26 changes: 13 additions & 13 deletions pretokenizers/metaspacepretokenizer/metaspacepretokenizer_test.go
@@ -6,8 +6,8 @@ package metaspacepretokenizer

import (
"fmt"
"github.com/nlpodyssey/gotokenizers/normalizedstring"
"github.com/nlpodyssey/gotokenizers/pretokenizedstring"
"github.com/nlpodyssey/gotokenizers/strutils"
"reflect"
"testing"
)
@@ -23,27 +23,27 @@ func TestMetaSpacePreTokenizer_PreTokenize(t *testing.T) {
{
"Hey friend!",
[]pretokenizedstring.OriginalByteSplit{
{String: "▁Hey", Offsets: normalizedstring.Offsets{Start: 0, End: 3}},
{String: "▁friend!", Offsets: normalizedstring.Offsets{Start: 3, End: 11}},
{String: "▁Hey", Offsets: strutils.ByteOffsets{Start: 0, End: 3}},
{String: "▁friend!", Offsets: strutils.ByteOffsets{Start: 3, End: 11}},
},
[]pretokenizedstring.NormalizedByteSplit{
{String: "▁Hey", Offsets: normalizedstring.Offsets{Start: 0, End: 6}},
{String: "▁friend!", Offsets: normalizedstring.Offsets{Start: 6, End: 16}},
{String: "▁Hey", Offsets: strutils.ByteOffsets{Start: 0, End: 6}},
{String: "▁friend!", Offsets: strutils.ByteOffsets{Start: 6, End: 16}},
},
},
{
"Hey friend!",
[]pretokenizedstring.OriginalByteSplit{
{String: "▁Hey", Offsets: normalizedstring.Offsets{Start: 0, End: 3}},
{String: "▁", Offsets: normalizedstring.Offsets{Start: 3, End: 4}},
{String: "▁", Offsets: normalizedstring.Offsets{Start: 4, End: 5}},
{String: "▁friend!", Offsets: normalizedstring.Offsets{Start: 5, End: 13}},
{String: "▁Hey", Offsets: strutils.ByteOffsets{Start: 0, End: 3}},
{String: "▁", Offsets: strutils.ByteOffsets{Start: 3, End: 4}},
{String: "▁", Offsets: strutils.ByteOffsets{Start: 4, End: 5}},
{String: "▁friend!", Offsets: strutils.ByteOffsets{Start: 5, End: 13}},
},
[]pretokenizedstring.NormalizedByteSplit{
{String: "▁Hey", Offsets: normalizedstring.Offsets{Start: 0, End: 6}},
{String: "▁", Offsets: normalizedstring.Offsets{Start: 6, End: 9}},
{String: "▁", Offsets: normalizedstring.Offsets{Start: 9, End: 12}},
{String: "▁friend!", Offsets: normalizedstring.Offsets{Start: 12, End: 22}},
{String: "▁Hey", Offsets: strutils.ByteOffsets{Start: 0, End: 6}},
{String: "▁", Offsets: strutils.ByteOffsets{Start: 6, End: 9}},
{String: "▁", Offsets: strutils.ByteOffsets{Start: 9, End: 12}},
{String: "▁friend!", Offsets: strutils.ByteOffsets{Start: 12, End: 22}},
},
},
}
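
Here, too, the gap between the original and normalized offsets is a byte-length effect: the metaspace marker "▁" (U+2581) is three bytes in UTF-8 and either replaces a one-byte space or is prepended where no character existed, so "▁Hey" ends at byte 3 in the original referential but at byte 6 in the normalized one:

package main

import "fmt"

func main() {
	fmt.Println(len("▁"))    // 3 — the U+2581 replacement character
	fmt.Println(len("Hey"))  // 3 — original End for "▁Hey" above
	fmt.Println(len("▁Hey")) // 6 — normalized End for "▁Hey" above
}
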