Skip to content

Commit

Permalink
Replace splitpattern.Offsets with strutils.ByteOffsets
Browse files Browse the repository at this point in the history
  • Loading branch information
marco-nicola committed Dec 11, 2020
1 parent 8f76652 commit 9d5a4b8
Show file tree
Hide file tree
Showing 12 changed files with 110 additions and 93 deletions.
2 changes: 1 addition & 1 deletion normalizedstring/normalizedstring.go
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,7 @@ func (ns *NormalizedString) Split(
}

type SplitMatch struct {
Offsets splitpattern.Offsets
Offsets strutils.ByteOffsets
ShouldRemove bool
}

Expand Down
10 changes: 6 additions & 4 deletions splitpattern/func.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

package splitpattern

import "github.com/nlpodyssey/gotokenizers/strutils"

// FuncSplitPattern is a split pattern that wraps a rune predicate function.
type FuncSplitPattern struct {
// f is the wrapped rune predicate.
// NOTE(review): presumably runes for which f returns true are reported as
// matches by FindMatches; the call site is outside the visible hunk — confirm.
f func(rune) bool
}
Expand All @@ -17,7 +19,7 @@ func FromFunc(f func(rune) bool) *FuncSplitPattern {
func (sp *FuncSplitPattern) FindMatches(s string) ([]Capture, error) {
if len(s) == 0 {
return []Capture{{
Offsets: Offsets{Start: 0, End: 0},
Offsets: strutils.ByteOffsets{Start: 0, End: 0},
IsMatch: false,
}}, nil
}
Expand All @@ -38,12 +40,12 @@ func (sp *FuncSplitPattern) FindMatches(s string) ([]Capture, error) {
if lastOffset < i {
// We need to emit what was before this match
splits = append(splits, Capture{
Offsets: Offsets{Start: lastOffset, End: i},
Offsets: strutils.ByteOffsets{Start: lastOffset, End: i},
IsMatch: false,
})
}
splits = append(splits, Capture{
Offsets: Offsets{Start: i, End: lastSeen},
Offsets: strutils.ByteOffsets{Start: i, End: lastSeen},
IsMatch: true,
})
lastOffset = lastSeen
Expand All @@ -52,7 +54,7 @@ func (sp *FuncSplitPattern) FindMatches(s string) ([]Capture, error) {
// Do not forget the last potential split
if lastSeen > lastOffset {
splits = append(splits, Capture{
Offsets: Offsets{Start: lastOffset, End: lastSeen},
Offsets: strutils.ByteOffsets{Start: lastOffset, End: lastSeen},
IsMatch: false,
})
}
Expand Down
21 changes: 11 additions & 10 deletions splitpattern/func_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
package splitpattern

import (
"github.com/nlpodyssey/gotokenizers/strutils"
"testing"
)

Expand All @@ -16,23 +17,23 @@ func TestFuncSplitPatternFindMatches(t *testing.T) {
})

runTest(t, sp, "aba", []Capture{
{Offsets{0, 1}, false},
{Offsets{1, 2}, true},
{Offsets{2, 3}, false},
{strutils.ByteOffsets{Start: 0, End: 1}, false},
{strutils.ByteOffsets{Start: 1, End: 2}, true},
{strutils.ByteOffsets{Start: 2, End: 3}, false},
})
runTest(t, sp, "aaaab", []Capture{
{Offsets{0, 4}, false},
{Offsets{4, 5}, true},
{strutils.ByteOffsets{Start: 0, End: 4}, false},
{strutils.ByteOffsets{Start: 4, End: 5}, true},
})
runTest(t, sp, "bbaaa", []Capture{
{Offsets{0, 1}, true},
{Offsets{1, 2}, true},
{Offsets{2, 5}, false},
{strutils.ByteOffsets{Start: 0, End: 1}, true},
{strutils.ByteOffsets{Start: 1, End: 2}, true},
{strutils.ByteOffsets{Start: 2, End: 5}, false},
})
runTest(t, sp, "", []Capture{
{Offsets{0, 0}, false},
{strutils.ByteOffsets{Start: 0, End: 0}, false},
})
runTest(t, sp, "aaa", []Capture{
{Offsets{0, 3}, false},
{strutils.ByteOffsets{Start: 0, End: 3}, false},
})
}
7 changes: 4 additions & 3 deletions splitpattern/invertedpattern_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
package splitpattern

import (
"github.com/nlpodyssey/gotokenizers/strutils"
"testing"
)

Expand All @@ -14,8 +15,8 @@ func TestInvertedPatternFindMatches(t *testing.T) {
sp := Invert(FromRune('a'))

runTest(t, sp, "aba", []Capture{
{Offsets{0, 1}, false},
{Offsets{1, 2}, true},
{Offsets{2, 3}, false},
{strutils.ByteOffsets{Start: 0, End: 1}, false},
{strutils.ByteOffsets{Start: 1, End: 2}, true},
{strutils.ByteOffsets{Start: 2, End: 3}, false},
})
}
16 changes: 11 additions & 5 deletions splitpattern/regexp.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@

package splitpattern

import "regexp"
import (
"github.com/nlpodyssey/gotokenizers/strutils"
"regexp"
)

type RegexpSplitPattern struct {
r *regexp.Regexp
Expand All @@ -18,7 +21,10 @@ func FromRegexp(r *regexp.Regexp) *RegexpSplitPattern {

func (sp *RegexpSplitPattern) FindMatches(s string) ([]Capture, error) {
if len(s) == 0 {
return []Capture{{Offsets: Offsets{Start: 0, End: 0}, IsMatch: false}}, nil
return []Capture{{
Offsets: strutils.ByteOffsets{Start: 0, End: 0},
IsMatch: false,
}}, nil
}

prev := 0
Expand All @@ -31,20 +37,20 @@ func (sp *RegexpSplitPattern) FindMatches(s string) ([]Capture, error) {

if prev != startByte {
splits = append(splits, Capture{
Offsets: Offsets{Start: prev, End: startByte},
Offsets: strutils.ByteOffsets{Start: prev, End: startByte},
IsMatch: false,
})
}
splits = append(splits, Capture{
Offsets: Offsets{Start: startByte, End: endByte},
Offsets: strutils.ByteOffsets{Start: startByte, End: endByte},
IsMatch: true,
})
prev = endByte
}

if prev != len(s) {
splits = append(splits, Capture{
Offsets: Offsets{Start: prev, End: len(s)},
Offsets: strutils.ByteOffsets{Start: prev, End: len(s)},
IsMatch: false,
})
}
Expand Down
16 changes: 11 additions & 5 deletions splitpattern/regexp2.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@

package splitpattern

import "github.com/dlclark/regexp2"
import (
"github.com/dlclark/regexp2"
"github.com/nlpodyssey/gotokenizers/strutils"
)

type Regexp2SplitPattern struct {
r *regexp2.Regexp
Expand All @@ -18,7 +21,10 @@ func FromRegexp2(r *regexp2.Regexp) *Regexp2SplitPattern {

func (sp *Regexp2SplitPattern) FindMatches(s string) ([]Capture, error) {
if len(s) == 0 {
return []Capture{{Offsets: Offsets{Start: 0, End: 0}, IsMatch: false}}, nil
return []Capture{{
Offsets: strutils.ByteOffsets{Start: 0, End: 0},
IsMatch: false,
}}, nil
}

runes := []rune(s)
Expand All @@ -36,12 +42,12 @@ func (sp *Regexp2SplitPattern) FindMatches(s string) ([]Capture, error) {

if prev != startByte {
splits = append(splits, Capture{
Offsets: Offsets{Start: prev, End: startByte},
Offsets: strutils.ByteOffsets{Start: prev, End: startByte},
IsMatch: false,
})
}
splits = append(splits, Capture{
Offsets: Offsets{Start: startByte, End: endByte},
Offsets: strutils.ByteOffsets{Start: startByte, End: endByte},
IsMatch: true,
})
prev = endByte
Expand All @@ -54,7 +60,7 @@ func (sp *Regexp2SplitPattern) FindMatches(s string) ([]Capture, error) {

if prev != len(s) {
splits = append(splits, Capture{
Offsets: Offsets{Start: prev, End: len(s)},
Offsets: strutils.ByteOffsets{Start: prev, End: len(s)},
IsMatch: false,
})
}
Expand Down
27 changes: 14 additions & 13 deletions splitpattern/regexp2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package splitpattern

import (
"github.com/dlclark/regexp2"
"github.com/nlpodyssey/gotokenizers/strutils"
"testing"
)

Expand All @@ -16,30 +17,30 @@ func TestRegexp2SplitPatternFindMatches(t *testing.T) {
sp := FromRegexp2(r)

runTest(t, sp, "a b", []Capture{
{Offsets{0, 1}, false},
{Offsets{1, 4}, true},
{Offsets{4, 5}, false},
{strutils.ByteOffsets{Start: 0, End: 1}, false},
{strutils.ByteOffsets{Start: 1, End: 4}, true},
{strutils.ByteOffsets{Start: 4, End: 5}, false},
})

runTest(t, sp, " a b ", []Capture{
{Offsets{0, 3}, true},
{Offsets{3, 4}, false},
{Offsets{4, 7}, true},
{Offsets{7, 8}, false},
{Offsets{8, 11}, true},
{strutils.ByteOffsets{Start: 0, End: 3}, true},
{strutils.ByteOffsets{Start: 3, End: 4}, false},
{strutils.ByteOffsets{Start: 4, End: 7}, true},
{strutils.ByteOffsets{Start: 7, End: 8}, false},
{strutils.ByteOffsets{Start: 8, End: 11}, true},
})

runTest(t, sp, "", []Capture{
{Offsets{0, 0}, false},
{strutils.ByteOffsets{Start: 0, End: 0}, false},
})

runTest(t, sp, "𝔾𝕠𝕠𝕕 𝕞𝕠𝕣𝕟𝕚𝕟𝕘", []Capture{
{Offsets{0, 16}, false},
{Offsets{16, 17}, true},
{Offsets{17, 45}, false},
{strutils.ByteOffsets{Start: 0, End: 16}, false},
{strutils.ByteOffsets{Start: 16, End: 17}, true},
{strutils.ByteOffsets{Start: 17, End: 45}, false},
})

runTest(t, sp, "aaa", []Capture{
{Offsets{0, 3}, false},
{strutils.ByteOffsets{Start: 0, End: 3}, false},
})
}
27 changes: 14 additions & 13 deletions splitpattern/regexp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
package splitpattern

import (
"github.com/nlpodyssey/gotokenizers/strutils"
"regexp"
"testing"
)
Expand All @@ -16,30 +17,30 @@ func TestRegexpSplitPatternFindMatches(t *testing.T) {
sp := FromRegexp(r)

runTest(t, sp, "a b", []Capture{
{Offsets{0, 1}, false},
{Offsets{1, 4}, true},
{Offsets{4, 5}, false},
{strutils.ByteOffsets{Start: 0, End: 1}, false},
{strutils.ByteOffsets{Start: 1, End: 4}, true},
{strutils.ByteOffsets{Start: 4, End: 5}, false},
})

runTest(t, sp, " a b ", []Capture{
{Offsets{0, 3}, true},
{Offsets{3, 4}, false},
{Offsets{4, 7}, true},
{Offsets{7, 8}, false},
{Offsets{8, 11}, true},
{strutils.ByteOffsets{Start: 0, End: 3}, true},
{strutils.ByteOffsets{Start: 3, End: 4}, false},
{strutils.ByteOffsets{Start: 4, End: 7}, true},
{strutils.ByteOffsets{Start: 7, End: 8}, false},
{strutils.ByteOffsets{Start: 8, End: 11}, true},
})

runTest(t, sp, "", []Capture{
{Offsets{0, 0}, false},
{strutils.ByteOffsets{Start: 0, End: 0}, false},
})

runTest(t, sp, "𝔾𝕠𝕠𝕕 𝕞𝕠𝕣𝕟𝕚𝕟𝕘", []Capture{
{Offsets{0, 16}, false},
{Offsets{16, 17}, true},
{Offsets{17, 45}, false},
{strutils.ByteOffsets{Start: 0, End: 16}, false},
{strutils.ByteOffsets{Start: 16, End: 17}, true},
{strutils.ByteOffsets{Start: 17, End: 45}, false},
})

runTest(t, sp, "aaa", []Capture{
{Offsets{0, 3}, false},
{strutils.ByteOffsets{Start: 0, End: 3}, false},
})
}
21 changes: 11 additions & 10 deletions splitpattern/rune_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
package splitpattern

import (
"github.com/nlpodyssey/gotokenizers/strutils"
"testing"
)

Expand All @@ -14,23 +15,23 @@ func TestRuneSplitPatternFindMatches(t *testing.T) {
sp := FromRune('a')

runTest(t, sp, "aba", []Capture{
{Offsets{0, 1}, true},
{Offsets{1, 2}, false},
{Offsets{2, 3}, true},
{strutils.ByteOffsets{Start: 0, End: 1}, true},
{strutils.ByteOffsets{Start: 1, End: 2}, false},
{strutils.ByteOffsets{Start: 2, End: 3}, true},
})
runTest(t, sp, "bbbba", []Capture{
{Offsets{0, 4}, false},
{Offsets{4, 5}, true},
{strutils.ByteOffsets{Start: 0, End: 4}, false},
{strutils.ByteOffsets{Start: 4, End: 5}, true},
})
runTest(t, sp, "aabbb", []Capture{
{Offsets{0, 1}, true},
{Offsets{1, 2}, true},
{Offsets{2, 5}, false},
{strutils.ByteOffsets{Start: 0, End: 1}, true},
{strutils.ByteOffsets{Start: 1, End: 2}, true},
{strutils.ByteOffsets{Start: 2, End: 5}, false},
})
runTest(t, sp, "", []Capture{
{Offsets{0, 0}, false},
{strutils.ByteOffsets{Start: 0, End: 0}, false},
})
runTest(t, sp, "bbb", []Capture{
{Offsets{0, 3}, false},
{strutils.ByteOffsets{Start: 0, End: 3}, false},
})
}
12 changes: 3 additions & 9 deletions splitpattern/splitpattern.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

package splitpattern

import "github.com/nlpodyssey/gotokenizers/strutils"

// SplitPattern is implemented by any value which represents a pattern
// for splitting a string.
type SplitPattern interface {
Expand All @@ -18,14 +20,6 @@ type SplitPattern interface {
// which provides offset positions and a flag reporting whether this is a
// match or not.
type Capture struct {
Offsets Offsets
Offsets strutils.ByteOffsets
IsMatch bool
}

// Offsets represents a (start, end) range of offset positions.
type Offsets struct {
// Start byte position, inclusive.
Start int
// End byte position, exclusive.
End int
}
Loading

0 comments on commit 9d5a4b8

Please sign in to comment.