diff --git a/pkg/stanza/operator/parser/keyvalue/keyvalue.go b/pkg/stanza/operator/parser/keyvalue/keyvalue.go index 7655939bdb26..84caae391ce7 100644 --- a/pkg/stanza/operator/parser/keyvalue/keyvalue.go +++ b/pkg/stanza/operator/parser/keyvalue/keyvalue.go @@ -51,27 +51,23 @@ func (c Config) Build(logger *zap.SugaredLogger) (operator.Operator, error) { return nil, err } - if c.Delimiter == c.PairDelimiter { - return nil, errors.New("delimiter and pair_delimiter cannot be the same value") - } - if c.Delimiter == "" { return nil, errors.New("delimiter is a required parameter") } - // split on whitespace by default, if pair delimiter is set, use - // strings.Split() - pairSplitFunc := splitStringByWhitespace + pairDelimiter := " " if c.PairDelimiter != "" { - pairSplitFunc = func(input string) []string { - return strings.Split(input, c.PairDelimiter) - } + pairDelimiter = c.PairDelimiter + } + + if c.Delimiter == pairDelimiter { + return nil, errors.New("delimiter and pair_delimiter cannot be the same value") } return &Parser{ ParserOperator: parserOperator, delimiter: c.Delimiter, - pairSplitFunc: pairSplitFunc, + pairDelimiter: pairDelimiter, }, nil } @@ -79,7 +75,7 @@ func (c Config) Build(logger *zap.SugaredLogger) (operator.Operator, error) { type Parser struct { helper.ParserOperator delimiter string - pairSplitFunc func(input string) []string + pairDelimiter string } // Process will parse an entry for key value pairs. 
@@ -91,21 +87,25 @@ func (kv *Parser) Process(ctx context.Context, entry *entry.Entry) error { func (kv *Parser) parse(value any) (any, error) { switch m := value.(type) { case string: - return kv.parser(m, kv.delimiter) + return kv.parser(m, kv.delimiter, kv.pairDelimiter) default: return nil, fmt.Errorf("type %T cannot be parsed as key value pairs", value) } } -func (kv *Parser) parser(input string, delimiter string) (map[string]any, error) { +func (kv *Parser) parser(input string, delimiter string, pairDelimiter string) (map[string]any, error) { if input == "" { return nil, fmt.Errorf("parse from field %s is empty", kv.ParseFrom.String()) } + pairs, err := splitPairs(input, pairDelimiter) + if err != nil { + return nil, fmt.Errorf("failed to parse pairs from input: %w", err) + } + parsed := make(map[string]any) - var err error - for _, raw := range kv.pairSplitFunc(input) { + for _, raw := range pairs { m := strings.SplitN(raw, delimiter, 2) if len(m) != 2 { e := fmt.Errorf("expected '%s' to split by '%s' into two items, got %d", raw, delimiter, len(m)) @@ -122,14 +122,45 @@ func (kv *Parser) parser(input string, delimiter string) (map[string]any, error) return parsed, err } -// split on whitespace and preserve quoted text -func splitStringByWhitespace(input string) []string { - quoted := false - raw := strings.FieldsFunc(input, func(r rune) bool { - if r == '"' || r == '\'' { - quoted = !quoted +// splitPairs will split the input on the pairDelimiter and return the resulting slice. 
// splitPairs splits input into raw key/value pairs on pairDelimiter and
// returns them in order of appearance.
//
// strings.Split is not used because it does not respect quotes: a delimiter
// that appears inside a single- or double-quoted section must not split the
// pair. A quoted section ends only at the same quote character that opened
// it, so the other quote character may appear inside it freely. Quote
// characters are kept in the returned pairs.
//
// Empty pairs produced by leading, trailing, or consecutive delimiters are
// dropped. An error is returned when pairDelimiter is empty or when input
// ends inside a quoted section.
func splitPairs(input, pairDelimiter string) ([]string, error) {
	if pairDelimiter == "" {
		// An empty delimiter matches at every offset and never advances the scan.
		return nil, errors.New("pair delimiter cannot be empty")
	}

	var (
		result []string
		quote  byte // opening quote of the current quoted section; 0 when outside quotes
		start  int  // byte offset at which the current pair begins
	)

	for i := 0; i < len(input); {
		if quote == 0 && strings.HasPrefix(input[i:], pairDelimiter) {
			if start < i {
				// Slice the input instead of rebuilding the pair byte by byte:
				// this keeps multi-byte UTF-8 characters intact and is linear.
				result = append(result, input[start:i])
			}
			// Step over the whole delimiter, even when the pair is empty, so
			// that no part of a multi-byte delimiter leaks into the next pair.
			i += len(pairDelimiter)
			start = i
			continue
		}

		if c := input[i]; c == '"' || c == '\'' {
			switch quote {
			case 0:
				quote = c // entering a quoted section
			case c:
				quote = 0 // matching quote closes the section
			}
		}
		i++
	}

	if quote != 0 {
		return nil, errors.New("never reached end of a quoted value")
	}

	if start < len(input) {
		result = append(result, input[start:])
	}

	return result, nil
}
}, + &entry.Entry{ + Body: `a=b_c="d_e"`, + }, + &entry.Entry{ + Attributes: map[string]any{ + "a": "b", + "c": "d_e", + }, + Body: `a=b_c="d_e"`, + }, + false, + false, + }, + { + "embedded double quotes in single quoted value", + func(kv *Config) {}, + &entry.Entry{ + Body: `a=b c='this is a "co ol" value'`, + }, + &entry.Entry{ + Attributes: map[string]any{ + "a": "b", + "c": "this is a \"co ol\" value", + }, + Body: `a=b c='this is a "co ol" value'`, + }, + false, + false, + }, + { + "leading and trailing pair delimiter w/o quotes", + func(kv *Config) {}, + &entry.Entry{ + Body: " k1=v1 k2==v2 k3=v3= ", + }, + &entry.Entry{ + Attributes: map[string]any{ + "k1": "v1", + "k2": "=v2", + "k3": "v3=", + }, + Body: " k1=v1 k2==v2 k3=v3= ", + }, + false, + false, + }, + { + "complicated delimiters", + func(kv *Config) { + kv.Delimiter = "@*" + kv.PairDelimiter = "_!_" + }, + &entry.Entry{ + Body: `k1@*v1_!_k2@**v2_!__k3@@*v3__`, + }, + &entry.Entry{ + Attributes: map[string]any{ + "k1": "v1", + "k2": "*v2", + "_k3@": "v3__", + }, + Body: `k1@*v1_!_k2@**v2_!__k3@@*v3__`, + }, + false, + false, + }, } for _, tc := range cases { @@ -604,28 +684,3 @@ key=value`, }) } } - -func TestSplitStringByWhitespace(t *testing.T) { - cases := []struct { - name string - intput string - output []string - }{ - { - "simple", - "k=v a=b x=\" y \" job=\"software engineering\"", - []string{ - "k=v", - "a=b", - "x=\" y \"", - "job=\"software engineering\"", - }, - }, - } - - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - require.Equal(t, tc.output, splitStringByWhitespace(tc.intput)) - }) - } -}