From 4036f1a7df37b7bf23f9b9e839cb05e6836bf6e2 Mon Sep 17 00:00:00 2001 From: Dakota Paasman Date: Mon, 5 Feb 2024 15:14:34 -0500 Subject: [PATCH 1/4] fix parsing bug and add more unit tests --- .../operator/parser/keyvalue/keyvalue.go | 81 +++++++++---- .../operator/parser/keyvalue/keyvalue_test.go | 107 +++++++++++++----- 2 files changed, 137 insertions(+), 51 deletions(-) diff --git a/pkg/stanza/operator/parser/keyvalue/keyvalue.go b/pkg/stanza/operator/parser/keyvalue/keyvalue.go index 7655939bdb26..84caae391ce7 100644 --- a/pkg/stanza/operator/parser/keyvalue/keyvalue.go +++ b/pkg/stanza/operator/parser/keyvalue/keyvalue.go @@ -51,27 +51,23 @@ func (c Config) Build(logger *zap.SugaredLogger) (operator.Operator, error) { return nil, err } - if c.Delimiter == c.PairDelimiter { - return nil, errors.New("delimiter and pair_delimiter cannot be the same value") - } - if c.Delimiter == "" { return nil, errors.New("delimiter is a required parameter") } - // split on whitespace by default, if pair delimiter is set, use - // strings.Split() - pairSplitFunc := splitStringByWhitespace + pairDelimiter := " " if c.PairDelimiter != "" { - pairSplitFunc = func(input string) []string { - return strings.Split(input, c.PairDelimiter) - } + pairDelimiter = c.PairDelimiter + } + + if c.Delimiter == pairDelimiter { + return nil, errors.New("delimiter and pair_delimiter cannot be the same value") } return &Parser{ ParserOperator: parserOperator, delimiter: c.Delimiter, - pairSplitFunc: pairSplitFunc, + pairDelimiter: pairDelimiter, }, nil } @@ -79,7 +75,7 @@ func (c Config) Build(logger *zap.SugaredLogger) (operator.Operator, error) { type Parser struct { helper.ParserOperator delimiter string - pairSplitFunc func(input string) []string + pairDelimiter string } // Process will parse an entry for key value pairs. @@ -91,21 +87,25 @@ func (kv *Parser) Process(ctx context.Context, entry *entry.Entry) error { func (kv *Parser) parse(value any) (any, error) { switch m := value.(type) { case string: - return kv.parser(m, kv.delimiter) + return kv.parser(m, kv.delimiter, kv.pairDelimiter) default: return nil, fmt.Errorf("type %T cannot be parsed as key value pairs", value) } } -func (kv *Parser) parser(input string, delimiter string) (map[string]any, error) { +func (kv *Parser) parser(input string, delimiter string, pairDelimiter string) (map[string]any, error) { if input == "" { return nil, fmt.Errorf("parse from field %s is empty", kv.ParseFrom.String()) } + pairs, err := splitPairs(input, pairDelimiter) + if err != nil { + return nil, fmt.Errorf("failed to parse pairs from input: %w", err) + } + parsed := make(map[string]any) - var err error - for _, raw := range kv.pairSplitFunc(input) { + for _, raw := range pairs { m := strings.SplitN(raw, delimiter, 2) if len(m) != 2 { e := fmt.Errorf("expected '%s' to split by '%s' into two items, got %d", raw, delimiter, len(m)) @@ -122,14 +122,45 @@ func (kv *Parser) parser(input string, delimiter string) (map[string]any, error) return parsed, err } -// split on whitespace and preserve quoted text -func splitStringByWhitespace(input string) []string { - quoted := false - raw := strings.FieldsFunc(input, func(r rune) bool { - if r == '"' || r == '\'' { - quoted = !quoted +// splitPairs will split the input on the pairDelimiter and return the resulting slice. +// `strings.Split` is not used because it does not respect quotes and will split if the delimiter appears in a quoted value +func splitPairs(input, pairDelimiter string) ([]string, error) { + var result []string + currentPair := "" + delimiterLength := len(pairDelimiter) + quoteChar := "" // "" means we are not in quotes + + i := 0 + for i < len(input) { + if quoteChar == "" && i+delimiterLength <= len(input) && input[i:i+delimiterLength] == pairDelimiter { + if currentPair == "" { + i++ + continue + } + result = append(result, currentPair) + currentPair = "" + i += delimiterLength + continue + } else if input[i] == '"' || input[i] == '\'' { + if quoteChar != "" { + if quoteChar == string(input[i]) { + quoteChar = "" + } + } else { + quoteChar = string(input[i]) + } } - return !quoted && r == ' ' - }) - return raw + currentPair += string(input[i]) + i++ + } + + if quoteChar != "" { + return nil, fmt.Errorf("never reached end of a quoted value") + } + + if currentPair != "" { + result = append(result, currentPair) + } + + return result, nil } diff --git a/pkg/stanza/operator/parser/keyvalue/keyvalue_test.go b/pkg/stanza/operator/parser/keyvalue/keyvalue_test.go index 5c02127d1de8..c4bd2133b0ff 100644 --- a/pkg/stanza/operator/parser/keyvalue/keyvalue_test.go +++ b/pkg/stanza/operator/parser/keyvalue/keyvalue_test.go @@ -109,6 +109,15 @@ func TestBuild(t *testing.T) { }(), true, }, + { + "pair-delimiter-equals-default-delimiter", + func() *Config { + cfg := basicConfig() + cfg.Delimiter = " " + return cfg + }(), + true, + }, { "unset-delimiter", func() *Config { @@ -503,7 +512,7 @@ key=value`, false, }, { - "quoted-value-contains-delimiter", + "quoted-value-contains-whitespace-delimiter", func(kv *Config) {}, &entry.Entry{ Body: `msg="Message successfully sent at 2023-12-04 06:47:31.204222276 +0000 UTC m=+5115.932279346"`, @@ -572,6 +581,77 @@ key=value`, false, true, }, + { + "custom pair delimiter in quoted value", + func(kv *Config) { + kv.PairDelimiter = "_" + }, + &entry.Entry{ + Body: `a=b_c="d_e"`, + }, + &entry.Entry{ + Attributes: map[string]any{ + "a": "b", + "c": "d_e", + }, + Body: `a=b_c="d_e"`, + }, + false, + false, + }, + { + "embedded double quotes in single quoted value", + func(kv *Config) {}, + &entry.Entry{ + Body: `a=b c='this is a "co ol" value'`, + }, + &entry.Entry{ + Attributes: map[string]any{ + "a": "b", + "c": "this is a \"co ol\" value", + }, + Body: `a=b c='this is a "co ol" value'`, + }, + false, + false, + }, + { + "leading and trailing pair delimiter w/o quotes", + func(kv *Config) {}, + &entry.Entry{ + Body: " k1=v1 k2==v2 k3=v3= ", + }, + &entry.Entry{ + Attributes: map[string]any{ + "k1": "v1", + "k2": "=v2", + "k3": "v3=", + }, + Body: " k1=v1 k2==v2 k3=v3= ", + }, + false, + false, + }, + { + "complicated delimiters", + func(kv *Config) { + kv.Delimiter = "@*" + kv.PairDelimiter = "_!_" + }, + &entry.Entry{ + Body: `k1@*v1_!_k2@**v2_!__k3@@*v3__`, + }, + &entry.Entry{ + Attributes: map[string]any{ + "k1": "v1", + "k2": "*v2", + "_k3@": "v3__", + }, + Body: `k1@*v1_!_k2@**v2_!__k3@@*v3__`, + }, + false, + false, + }, } for _, tc := range cases { @@ -604,28 +684,3 @@ key=value`, }) } } - -func TestSplitStringByWhitespace(t *testing.T) { - cases := []struct { - name string - intput string - output []string - }{ - { - "simple", - "k=v a=b x=\" y \" job=\"software engineering\"", - []string{ - "k=v", - "a=b", - "x=\" y \"", - "job=\"software engineering\"", - }, - }, - } - - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - require.Equal(t, tc.output, splitStringByWhitespace(tc.intput)) - }) - } -} From 2223a041ba55fa7013c38d77cda820240f596ce5 Mon Sep 17 00:00:00 2001 From: Dakota Paasman Date: Mon, 5 Feb 2024 15:45:04 -0500 Subject: [PATCH 2/4] add changelog --- ...anza-fix-key-value-pair-delimiter-bug.yaml | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .chloggen/pkg-stanza-fix-key-value-pair-delimiter-bug.yaml diff --git a/.chloggen/pkg-stanza-fix-key-value-pair-delimiter-bug.yaml b/.chloggen/pkg-stanza-fix-key-value-pair-delimiter-bug.yaml new file mode 100644 index 000000000000..a81246058023 --- /dev/null +++ b/.chloggen/pkg-stanza-fix-key-value-pair-delimiter-bug.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: bug_fix + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: pkg/stanza + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Fixed a bug in the Key Value Parser that could arise when using a custom pair delimiter. + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [31034] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [] From 4b8e0777e59c36c2a8e9f822bd4af638eaa96e24 Mon Sep 17 00:00:00 2001 From: Dakota Paasman Date: Tue, 6 Feb 2024 14:17:19 -0500 Subject: [PATCH 3/4] feedback & more tests --- .../operator/parser/keyvalue/keyvalue.go | 40 +++++++++---------- .../operator/parser/keyvalue/keyvalue_test.go | 35 ++++++++++++++++ 2 files changed, 54 insertions(+), 21 deletions(-) diff --git a/pkg/stanza/operator/parser/keyvalue/keyvalue.go b/pkg/stanza/operator/parser/keyvalue/keyvalue.go index 84caae391ce7..7598aafdeb73 100644 --- a/pkg/stanza/operator/parser/keyvalue/keyvalue.go +++ b/pkg/stanza/operator/parser/keyvalue/keyvalue.go @@ -113,8 +113,8 @@ func (kv *Parser) parser(input string, delimiter string, pairDelimiter string) ( continue } - key := strings.TrimSpace(strings.Trim(m[0], "\"'")) - value := strings.TrimSpace(strings.Trim(m[1], "\"'")) + key := strings.TrimSpace(m[0]) + value := strings.TrimSpace(m[1]) parsed[key] = value } @@ -130,36 +130,34 @@ func splitPairs(input, pairDelimiter string) ([]string, error) { delimiterLength := len(pairDelimiter) quoteChar := "" // "" means we are not in quotes - i := 0 - for i < len(input) { - if quoteChar == "" && i+delimiterLength <= len(input) && input[i:i+delimiterLength] == pairDelimiter { - if currentPair == "" { - i++ + for i := 0; i < len(input); i++ { + if quoteChar == "" && i+delimiterLength <= len(input) && input[i:i+delimiterLength] == pairDelimiter { // delimiter + if currentPair == "" { // leading || trailing delimiter; ignore continue } result = append(result, currentPair) currentPair = "" - i += delimiterLength + i += delimiterLength - 1 + continue + } + + if quoteChar == "" && (input[i] == '"' || input[i] == '\'') { // start of quote + quoteChar = string(input[i]) continue - } else if input[i] == '"' || input[i] == '\'' { - if quoteChar != "" { - if quoteChar == string(input[i]) { - quoteChar = "" - } - } else { - quoteChar = string(input[i]) - } } + if string(input[i]) == quoteChar { // end of quote + quoteChar = "" + continue + } + currentPair += string(input[i]) - i++ } - if quoteChar != "" { + if quoteChar != "" { // check for closed quotes return nil, fmt.Errorf("never reached end of a quoted value") } - - if currentPair != "" { - result = append(result, currentPair) + if currentPair != "" { // avoid adding empty value bc of a trailing delimiter + return append(result, currentPair), nil } return result, nil diff --git a/pkg/stanza/operator/parser/keyvalue/keyvalue_test.go b/pkg/stanza/operator/parser/keyvalue/keyvalue_test.go index c4bd2133b0ff..41cf7593d577 100644 --- a/pkg/stanza/operator/parser/keyvalue/keyvalue_test.go +++ b/pkg/stanza/operator/parser/keyvalue/keyvalue_test.go @@ -157,6 +157,13 @@ func TestParserInvalidType(t *testing.T) { require.Contains(t, err.Error(), "type []int cannot be parsed as key value pairs") } +func TestParserEmptyInput(t *testing.T) { + parser := newTestParser(t) + _, err := parser.parse("") + require.Error(t, err) + require.Contains(t, err.Error(), "parse from field body is empty") +} + func TestKVImplementations(t *testing.T) { require.Implements(t, (*operator.Operator)(nil), new(Parser)) } @@ -615,6 +622,22 @@ key=value`, false, false, }, + { + "embedded double quotes end single quoted value", + func(kv *Config) {}, + &entry.Entry{ + Body: `a=b c='this is a "co ol"'`, + }, + &entry.Entry{ + Attributes: map[string]any{ + "a": "b", + "c": "this is a \"co ol\"", + }, + Body: `a=b c='this is a "co ol"'`, + }, + false, + false, + }, { "leading and trailing pair delimiter w/o quotes", func(kv *Config) {}, @@ -652,6 +675,18 @@ key=value`, false, false, }, + { + "unclosed quotes", + func(kv *Config) {}, + &entry.Entry{ + Body: `k1='v1' k2='v2`, + }, + &entry.Entry{ + Body: `k1='v1' k2='v2`, + }, + true, + false, + }, } for _, tc := range cases { From 3385d875fcb8bbc70fa6070769bbfc8f8dbc8a1a Mon Sep 17 00:00:00 2001 From: Daniel Jaglowski Date: Tue, 6 Feb 2024 14:19:25 -0600 Subject: [PATCH 4/4] Update .chloggen/pkg-stanza-fix-key-value-pair-delimiter-bug.yaml --- .chloggen/pkg-stanza-fix-key-value-pair-delimiter-bug.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.chloggen/pkg-stanza-fix-key-value-pair-delimiter-bug.yaml b/.chloggen/pkg-stanza-fix-key-value-pair-delimiter-bug.yaml index a81246058023..d2e50b12d9ce 100644 --- a/.chloggen/pkg-stanza-fix-key-value-pair-delimiter-bug.yaml +++ b/.chloggen/pkg-stanza-fix-key-value-pair-delimiter-bug.yaml @@ -7,7 +7,7 @@ change_type: bug_fix component: pkg/stanza # A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). -note: Fixed a bug in the Key Value Parser that could arise when using a custom pair delimiter. +note: Fixed a bug in the keyvalue_parser where quoted values could be split if they contained a delimited. # Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. issues: [31034]