Skip to content

Commit

Permalink
Multi-lined envelope reading corruption caused by direct reference in…
Browse files Browse the repository at this point in the history
…to bufio.Reader's internal buffer rollover. (#214)

BUG: #213

If we're dealing with multi-lined envelope (either by rows or by header/footer), readLine()
will be called several times, thus whatever ios.ByteReadLine, which uses bufio.Reader underneath,
returns in a previous call may be potentially be invalidated due to bufio.Reader's internal buf
rollover. If we read the previous line directly, it would cause corruption.

To fix the problem the easiest solution would be simply copying the return []byte from
ios.ByteReadLine every single time. But for files with single-line envelope, which are the vast
majority cases, this copy becomes unnecessary and burdensome on gc. So the trick is to has a flag
on reader.linesBuf's last element to tell if it contains a reference into the bufio.Reader's
internal buffer, or it's a copy. Every time before we call bufio.Reader read, we check
reader.liensBuf's last element flag, if it is not a copy, then we will turn it into a copy.

This way, we optimize for the vast majority cases without needing allocations, and avoid any potential
corruptions in the multi-lined envelope cases.
  • Loading branch information
jf-tech authored Jul 25, 2023
1 parent 37370fa commit 9e0c8da
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 8 deletions.
37 changes: 33 additions & 4 deletions extensions/omniv21/fileformat/flatfile/fixedlength/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ import (
)

type line struct {
lineNum int // 1-based
b []byte
lineNum int // 1-based
b []byte // either a copy of a line content or a direct ref into bufio.Reader.
copied bool // see notes in reader.readLine()
}

type reader struct {
Expand Down Expand Up @@ -148,9 +149,37 @@ func (r *reader) readAndMatchHeaderFooterBasedEnvelope(
}

func (r *reader) readLine() error {
// https://github.com/jf-tech/omniparser/issues/213
//
// If we're dealing with multi-lined envelope (either by rows or by header/footer), this
// readLine() will be called several times, thus whatever ios.ByteReadLine, which uses
// bufio.Reader underneath, returns in a previous call may be potentially be invalidated due to
// bufio.Reader's internal buf rollover. If we read the previous line directly, it would cause
// corruption.
//
// To fix the problem the easiest solution would be simply copying the return []byte from
// ios.ByteReadLine every single time. But for files with single-line envelope, which are the
// vast majority cases, this copy becomes unnecessary and burdensome on gc. So the trick is to
// has a flag on reader.linesBuf's last element to tell if it contains a reference into the
// bufio.Reader's internal buffer, or it's a copy. Every time before we call bufio.Reader read,
// we check reader.liensBuf's last element flag, if it is not a copy, then we will turn it into
// a copy.
//
// This way, we optimize for the vast majority cases without
// needing allocations, and avoid any potential corruptions in the multi-lined envelope cases.
linesBufLen := len(r.linesBuf)
if linesBufLen > 0 && !r.linesBuf[linesBufLen-1].copied {
cp := make([]byte, len(r.linesBuf[linesBufLen-1].b))
copy(cp, r.linesBuf[linesBufLen-1].b)
r.linesBuf[linesBufLen-1].b = cp
r.linesBuf[linesBufLen-1].copied = true
}
for {
// note1: ios.ByteReadLine returns a ln with trailing '\n' (and/or '\r') dropped.
// note2: ios.ByteReadLine won't return io.EOF if ln returned isn't empty.
// note1: ios.ByteReadLine returns a line with trailing '\n' (and/or '\r') dropped.
// note2: ios.ByteReadLine won't return io.EOF if line returned isn't empty.
// note3: ios.ByteReadLine's returned []byte is merely pointing into the bufio.Reader's
// internal buffer, thus the content will be invalided if ios.ByteReadLine is called
// again. Caution!
b, err := ios.ByteReadLine(r.r)
switch {
case err == io.EOF:
Expand Down
31 changes: 31 additions & 0 deletions extensions/omniv21/fileformat/flatfile/fixedlength/reader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,37 @@ func TestReadLine(t *testing.T) {
assert.Equal(t, line{lineNum: 45, b: []byte("a new line")}, r.linesBuf[2])
}

func TestReadLineMultiCalls(t *testing.T) {
// See more details about the bug: https://github.com/jf-tech/omniparser/issues/213
r := &reader{inputName: "test-input"}
// construct a two-line input, and total length of the two line input is longer than
// the bufio.Reader's internal buffer. Note bufio.Reader has a minReadBufferSize at 16.
r.r = bufio.NewReaderSize(strings.NewReader("0123456789\nabcdefghijklmnopq\n!@#"), 16)
// First read should be completely normal and fine.
err := r.readLine()
assert.NoError(t, err)
assert.Equal(t, 1, r.linesRead)
assert.Equal(t, 1, len(r.linesBuf))
assert.Equal(t, line{lineNum: 1, b: []byte("0123456789"), copied: false}, r.linesBuf[0])
// Now the second read, if we didn't do any copying of the first line result, the first line
// would have become corrupted since the bufio.Reader's tiny internal buffer gets overwritten
// by the new line data. With the fix, both lines in the linesBuf should be without corruption.
err = r.readLine()
assert.NoError(t, err)
assert.Equal(t, 2, r.linesRead)
assert.Equal(t, 2, len(r.linesBuf))
assert.Equal(t, line{lineNum: 1, b: []byte("0123456789"), copied: true}, r.linesBuf[0])
assert.Equal(t, line{lineNum: 2, b: []byte("abcdefghijklmnopq"), copied: false}, r.linesBuf[1])
// 3rd line reading
err = r.readLine()
assert.NoError(t, err)
assert.Equal(t, 3, r.linesRead)
assert.Equal(t, 3, len(r.linesBuf))
assert.Equal(t, line{lineNum: 1, b: []byte("0123456789"), copied: true}, r.linesBuf[0])
assert.Equal(t, line{lineNum: 2, b: []byte("abcdefghijklmnopq"), copied: true}, r.linesBuf[1])
assert.Equal(t, line{lineNum: 3, b: []byte("!@#"), copied: false}, r.linesBuf[2])
}

func TestLinesToNode(t *testing.T) {
for _, test := range []struct {
name string
Expand Down
8 changes: 4 additions & 4 deletions extensions/omniv21/samples/fixedlength2/fixedlength_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,22 +108,22 @@ func Test4_Nested(t *testing.T) {
tests[test4_Nested].doTest(t)
}

// Benchmark1_Single_Row-8 25898 45728 ns/op 28169 B/op 647 allocs/op
// Benchmark1_Single_Row-8 25951 45776 ns/op 28213 B/op 645 allocs/op
func Benchmark1_Single_Row(b *testing.B) {
tests[test1_Single_Row].doBenchmark(b)
}

// Benchmark2_Multi_Rows-8 15338 78273 ns/op 30179 B/op 618 allocs/op
// Benchmark2_Multi_Rows-8 18583 64240 ns/op 34056 B/op 636 allocs/op
func Benchmark2_Multi_Rows(b *testing.B) {
tests[test2_Multi_Rows].doBenchmark(b)
}

// Benchmark3_Header_Footer-8 6390 178301 ns/op 76571 B/op 1501 allocs/op
// Benchmark3_Header_Footer-8 7306 158797 ns/op 78666 B/op 1587 allocs/op
func Benchmark3_Header_Footer(b *testing.B) {
tests[test3_Header_Footer].doBenchmark(b)
}

// Benchmark4_Nested-8 10000 107948 ns/op 78986 B/op 1535 allocs/op
// Benchmark4_Nested-8 10000 107955 ns/op 80929 B/op 1515 allocs/op
func Benchmark4_Nested(b *testing.B) {
tests[test4_Nested].doBenchmark(b)
}

0 comments on commit 9e0c8da

Please sign in to comment.