From 17059e8f83fa0ad4a8a9d67c1da05ebb85fb7419 Mon Sep 17 00:00:00 2001 From: Calum Murray Date: Wed, 1 May 2024 14:53:50 -0400 Subject: [PATCH 1/3] feat: improved performance of LIKE matching Signed-off-by: Calum Murray --- sql/v2/expression/like_expression.go | 94 ++++++++++++---------------- 1 file changed, 41 insertions(+), 53 deletions(-) diff --git a/sql/v2/expression/like_expression.go b/sql/v2/expression/like_expression.go index 5f557fa5a..5ddaed52d 100644 --- a/sql/v2/expression/like_expression.go +++ b/sql/v2/expression/like_expression.go @@ -6,9 +6,6 @@ package expression import ( - "regexp" - "strings" - cesql "github.com/cloudevents/sdk-go/sql/v2" "github.com/cloudevents/sdk-go/sql/v2/utils" cloudevents "github.com/cloudevents/sdk-go/v2" @@ -16,7 +13,7 @@ import ( type likeExpression struct { baseUnaryExpression - pattern *regexp.Regexp + pattern string } func (l likeExpression) Evaluate(event cloudevents.Event) (interface{}, error) { @@ -30,70 +27,61 @@ func (l likeExpression) Evaluate(event cloudevents.Event) (interface{}, error) { return nil, err } - return l.pattern.MatchString(val.(string)), nil + return matchString(val.(string), l.pattern), nil + } func NewLikeExpression(child cesql.Expression, pattern string) (cesql.Expression, error) { - // Converting to regex is not the most performant impl, but it works - p, err := convertLikePatternToRegex(pattern) - if err != nil { - return nil, err - } - return likeExpression{ baseUnaryExpression: baseUnaryExpression{ child: child, }, - pattern: p, + pattern: pattern, }, nil } -func convertLikePatternToRegex(pattern string) (*regexp.Regexp, error) { - var chunks []string - chunks = append(chunks, "^") - - var chunk strings.Builder +func matchString(text, pattern string) bool { + textLen := len(text) + patternLen := len(pattern) + textIdx := 0 + patternIdx := 0 + lastWildcardIdx := -1 + lastMatchIdx := 0 - for i := 0; i < len(pattern); i++ { - if pattern[i] == '\\' && i < len(pattern)-1 { - if pattern[i+1] == '%' { - // \% case - chunk.WriteRune('%') - chunks = append(chunks, "\\Q"+chunk.String()+"\\E") - chunk.Reset() - i++ - continue - } else if pattern[i+1] == '_' { - // \_ case - chunk.WriteRune('_') - chunks = append(chunks, "\\Q"+chunk.String()+"\\E") - chunk.Reset() - i++ - continue - } else { - // if there is an actual literal \ character, we need to include that in the string - chunk.WriteRune('\\') - } - } else if pattern[i] == '_' { - // replace with . - chunks = append(chunks, "\\Q"+chunk.String()+"\\E") - chunk.Reset() - chunks = append(chunks, ".") - } else if pattern[i] == '%' { - // replace with .* - chunks = append(chunks, "\\Q"+chunk.String()+"\\E") - chunk.Reset() - chunks = append(chunks, ".*") + for textIdx < textLen { + // handle escaped characters -> pattern needs to increment two places here + if patternIdx < patternLen-1 && pattern[patternIdx] == '\\' && + ((pattern[patternIdx+1] == '_' || pattern[patternIdx+1] == '%') && + pattern[patternIdx+1] == text[textIdx]) { + patternIdx += 2 + textIdx += 1 + // handle non escaped characters + } else if patternIdx < patternLen && (pattern[patternIdx] == '_' || pattern[patternIdx] == text[textIdx]) { + textIdx += 1 + patternIdx += 1 + // handle wildcard characters + } else if patternIdx < patternLen && pattern[patternIdx] == '%' { + lastWildcardIdx = patternIdx + lastMatchIdx = textIdx + patternIdx += 1 + // greedy match didn't work, try again from the last known match + } else if lastWildcardIdx != -1 { + patternIdx = lastWildcardIdx + 1 + lastMatchIdx += 1 + textIdx = lastMatchIdx } else { - chunk.WriteByte(pattern[i]) + return false } } - if chunk.Len() != 0 { - chunks = append(chunks, "\\Q"+chunk.String()+"\\E") - } + // consume remaining pattern characters as long as they are wildcards + for patternIdx < patternLen { + if pattern[patternIdx] != '%' { + return false + } - chunks = append(chunks, "$") + patternIdx += 1 + } - return regexp.Compile(strings.Join(chunks, "")) + return true } From 782e77634e4e3c3b81f78c9bdb0bbcd7751f95e7 Mon Sep 17 00:00:00 2001 From: Calum Murray Date: Thu, 9 May 2024 14:26:22 -0400 Subject: [PATCH 2/3] cleanup: moved comments Signed-off-by: Calum Murray --- sql/v2/expression/like_expression.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/v2/expression/like_expression.go b/sql/v2/expression/like_expression.go index 5ddaed52d..b34829d7b 100644 --- a/sql/v2/expression/like_expression.go +++ b/sql/v2/expression/like_expression.go @@ -49,23 +49,23 @@ func matchString(text, pattern string) bool { lastMatchIdx := 0 for textIdx < textLen { - // handle escaped characters -> pattern needs to increment two places here if patternIdx < patternLen-1 && pattern[patternIdx] == '\\' && ((pattern[patternIdx+1] == '_' || pattern[patternIdx+1] == '%') && pattern[patternIdx+1] == text[textIdx]) { + // handle escaped characters -> pattern needs to increment two places here patternIdx += 2 textIdx += 1 - // handle non escaped characters } else if patternIdx < patternLen && (pattern[patternIdx] == '_' || pattern[patternIdx] == text[textIdx]) { + // handle non escaped characters textIdx += 1 patternIdx += 1 - // handle wildcard characters } else if patternIdx < patternLen && pattern[patternIdx] == '%' { + // handle wildcard characters lastWildcardIdx = patternIdx lastMatchIdx = textIdx patternIdx += 1 - // greedy match didn't work, try again from the last known match } else if lastWildcardIdx != -1 { + // greedy match didn't work, try again from the last known match patternIdx = lastWildcardIdx + 1 lastMatchIdx += 1 textIdx = lastMatchIdx From 19fd3f25c46cd7556890320ec7a6f63f0eefc77e Mon Sep 17 00:00:00 2001 From: Calum Murray Date: Thu, 9 May 2024 14:31:07 -0400 Subject: [PATCH 3/3] perf: added check for empty pattern Signed-off-by: Calum Murray --- sql/v2/expression/like_expression.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sql/v2/expression/like_expression.go b/sql/v2/expression/like_expression.go index b34829d7b..01734852a 100644 --- a/sql/v2/expression/like_expression.go +++ b/sql/v2/expression/like_expression.go @@ -48,6 +48,10 @@ func matchString(text, pattern string) bool { lastWildcardIdx := -1 lastMatchIdx := 0 + if patternLen == 0 { + return patternLen == textLen + } + for textIdx < textLen { if patternIdx < patternLen-1 && pattern[patternIdx] == '\\' && ((pattern[patternIdx+1] == '_' || pattern[patternIdx+1] == '%') &&