Skip to content

Commit

Permalink
Limit jstream parse depth
Browse files Browse the repository at this point in the history
Add bcicen/jstream#15 by vendoring the package.

Sets the JSON nesting depth limit to 100 levels in S3 Select.
  • Loading branch information
klauspost committed Sep 23, 2024
1 parent 05a6c17 commit 76b2462
Show file tree
Hide file tree
Showing 19 changed files with 1,479 additions and 16 deletions.
4 changes: 2 additions & 2 deletions cmd/postpolicyform.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ import (
"strings"
"time"

"github.com/bcicen/jstream"
"github.com/minio/minio-go/v7/pkg/encrypt"
"github.com/minio/minio-go/v7/pkg/set"
xhttp "github.com/minio/minio/internal/http"
"github.com/minio/minio/internal/s3select/jstream"
)

// startWithConds - map which indicates if a given condition supports starts-with policy operator
Expand Down Expand Up @@ -140,7 +140,7 @@ type PostPolicyForm struct {
func sanitizePolicy(r io.Reader) (io.Reader, error) {
var buf bytes.Buffer
e := json.NewEncoder(&buf)
d := jstream.NewDecoder(r, 0).ObjectAsKVS()
d := jstream.NewDecoder(r, 0).ObjectAsKVS().MaxDepth(10)
sset := set.NewStringSet()
for mv := range d.Stream() {
var kvs jstream.KVS
Expand Down
2 changes: 1 addition & 1 deletion internal/s3select/csv/record.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ import (
"strconv"
"strings"

"github.com/bcicen/jstream"
csv "github.com/minio/csvparser"
"github.com/minio/minio/internal/s3select/jstream"
"github.com/minio/minio/internal/s3select/sql"
)

Expand Down
4 changes: 2 additions & 2 deletions internal/s3select/json/preader.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
"runtime"
"sync"

"github.com/bcicen/jstream"
"github.com/minio/minio/internal/s3select/jstream"
"github.com/minio/minio/internal/s3select/sql"
)

Expand Down Expand Up @@ -185,7 +185,7 @@ func (r *PReader) startReaders() {
dst = make([]jstream.KVS, 0, 1000)
}

d := jstream.NewDecoder(bytes.NewBuffer(in.input), 0).ObjectAsKVS()
d := jstream.NewDecoder(bytes.NewBuffer(in.input), 0).ObjectAsKVS().MaxDepth(100)
stream := d.Stream()
all := dst[:0]
for mv := range stream {
Expand Down
5 changes: 2 additions & 3 deletions internal/s3select/json/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,8 @@ import (
"io"
"sync"

"github.com/minio/minio/internal/s3select/jstream"
"github.com/minio/minio/internal/s3select/sql"

"github.com/bcicen/jstream"
)

// Limit single document size to 10MiB, 10x the AWS limit:
Expand Down Expand Up @@ -84,7 +83,7 @@ func (r *Reader) Close() error {
// NewReader - creates new JSON reader using readCloser.
func NewReader(readCloser io.ReadCloser, args *ReaderArgs) *Reader {
readCloser = &syncReadCloser{rc: readCloser}
d := jstream.NewDecoder(io.LimitReader(readCloser, maxDocumentSize), 0).ObjectAsKVS()
d := jstream.NewDecoder(io.LimitReader(readCloser, maxDocumentSize), 0).ObjectAsKVS().MaxDepth(100)
return &Reader{
args: args,
decoder: d,
Expand Down
2 changes: 1 addition & 1 deletion internal/s3select/json/record.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ import (
"strconv"
"strings"

"github.com/bcicen/jstream"
csv "github.com/minio/csvparser"
"github.com/minio/minio/internal/s3select/jstream"
"github.com/minio/minio/internal/s3select/sql"
)

Expand Down
22 changes: 22 additions & 0 deletions internal/s3select/jstream/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
The MIT License (MIT)

Copyright (c) 2018 Bradley Cicenas

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

116 changes: 116 additions & 0 deletions internal/s3select/jstream/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
<p align="center"><img width="350px" src="jstream.png" alt="jstream"/></p>

#

[![GoDoc](https://godoc.org/github.com/bcicen/jstream?status.svg)](https://godoc.org/github.com/bcicen/jstream)


`jstream` is a streaming JSON parser and value extraction library for Go.

Unlike most JSON parsers, `jstream` is document position- and depth-aware -- this enables the extraction of values at a specified depth, eliminating the overhead of allocating encompassing arrays or objects; e.g.:

Using the below example document:
<img width="85%" src="https://bradley.codes/static/img/jstream-levels.gif" alt="jstream"/>

we can choose to extract and act only on the objects within the top-level array:
```go
f, _ := os.Open("input.json")
decoder := jstream.NewDecoder(f, 1) // extract JSON values at a depth level of 1
for mv := range decoder.Stream() {
fmt.Printf("%v\n ", mv.Value)
}
```

output:
```
map[desc:RGB colors:[red green blue]]
map[desc:CMYK colors:[cyan magenta yellow black]]
```

likewise, increasing depth level to `3` yields:
```
red
green
blue
cyan
magenta
yellow
black
```

optionally, key:value pairs can be emitted as an individual struct:
```go
decoder := jstream.NewDecoder(f, 2).EmitKV() // enable KV streaming at a depth level of 2
```

```
jstream.KV{desc RGB}
jstream.KV{colors [red green blue]}
jstream.KV{desc CMYK}
jstream.KV{colors [cyan magenta yellow black]}
```

## Installing

```bash
go get github.com/bcicen/jstream
```

## Commandline

`jstream` comes with a cli tool for quick viewing of parsed values from JSON input:

```bash
jstream -d 1 < input.json
```

```json
{"colors":["red","green","blue"],"desc":"RGB"}
{"colors":["cyan","magenta","yellow","black"],"desc":"CMYK"}
```

detailed output with `-v` option:
```bash
cat input.json | jstream -v -d -1

depth start end type | value
2 018 023 string | "RGB"
3 041 046 string | "red"
3 048 055 string | "green"
3 057 063 string | "blue"
2 039 065 array | ["red","green","blue"]
1 004 069 object | {"colors":["red","green","blue"],"desc":"RGB"}
2 087 093 string | "CMYK"
3 111 117 string | "cyan"
3 119 128 string | "magenta"
3 130 138 string | "yellow"
3 140 147 string | "black"
2 109 149 array | ["cyan","magenta","yellow","black"]
1 073 153 object | {"colors":["cyan","magenta","yellow","black"],"desc":"CMYK"}
0 000 155 array | [{"colors":["red","green","blue"],"desc":"RGB"},{"colors":["cyan","magenta","yellow","black"],"desc":"CMYK"}]
```

### Options

Opt | Description
--- | ---
-d \<n\> | emit values at depth n. if n < 0, all values will be emitted
-kv | output inner key value pairs as newly formed objects
-v | output depth and offset details for each value
-h | display help dialog

## Benchmarks

Obligatory benchmarks performed on files with arrays of objects, where the decoded objects are to be extracted.

Two file sizes are used -- regular (1.6MB, 1000 objects) and large (128MB, 100000 objects)

input size | lib | MB/s | Allocated
--- | --- | --- | ---
regular | standard | 97 | 3.6MB
regular | jstream | 175 | 2.1MB
large | standard | 92 | 305MB
large | jstream | 404 | 69MB

In a real world scenario, including initialization and reader overhead from varying blob sizes, performance can be expected as below:
<img src="https://bradley.codes/static/img/bench.svg" alt="jstream"/>
Loading

0 comments on commit 76b2462

Please sign in to comment.