-
Notifications
You must be signed in to change notification settings - Fork 219
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: improved performance of LIKE matching #1049
Changes from all commits
17059e8
782e776
19fd3f2
d8de904
7bc8a63
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,17 +6,14 @@ | |
package expression | ||
|
||
import ( | ||
"regexp" | ||
"strings" | ||
|
||
cesql "github.com/cloudevents/sdk-go/sql/v2" | ||
"github.com/cloudevents/sdk-go/sql/v2/utils" | ||
cloudevents "github.com/cloudevents/sdk-go/v2" | ||
) | ||
|
||
type likeExpression struct { | ||
baseUnaryExpression | ||
pattern *regexp.Regexp | ||
pattern string | ||
} | ||
|
||
func (l likeExpression) Evaluate(event cloudevents.Event) (interface{}, error) { | ||
|
@@ -30,70 +27,65 @@ func (l likeExpression) Evaluate(event cloudevents.Event) (interface{}, error) { | |
return nil, err | ||
} | ||
|
||
return l.pattern.MatchString(val.(string)), nil | ||
return matchString(val.(string), l.pattern), nil | ||
|
||
} | ||
|
||
func NewLikeExpression(child cesql.Expression, pattern string) (cesql.Expression, error) { | ||
// Converting to regex is not the most performant impl, but it works | ||
p, err := convertLikePatternToRegex(pattern) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
return likeExpression{ | ||
baseUnaryExpression: baseUnaryExpression{ | ||
child: child, | ||
}, | ||
pattern: p, | ||
pattern: pattern, | ||
}, nil | ||
} | ||
|
||
func convertLikePatternToRegex(pattern string) (*regexp.Regexp, error) { | ||
var chunks []string | ||
chunks = append(chunks, "^") | ||
func matchString(text, pattern string) bool { | ||
textLen := len(text) | ||
patternLen := len(pattern) | ||
textIdx := 0 | ||
patternIdx := 0 | ||
lastWildcardIdx := -1 | ||
lastMatchIdx := 0 | ||
|
||
var chunk strings.Builder | ||
if patternLen == 0 { | ||
return patternLen == textLen | ||
} | ||
|
||
for i := 0; i < len(pattern); i++ { | ||
if pattern[i] == '\\' && i < len(pattern)-1 { | ||
if pattern[i+1] == '%' { | ||
// \% case | ||
chunk.WriteRune('%') | ||
chunks = append(chunks, "\\Q"+chunk.String()+"\\E") | ||
chunk.Reset() | ||
i++ | ||
continue | ||
} else if pattern[i+1] == '_' { | ||
// \_ case | ||
chunk.WriteRune('_') | ||
chunks = append(chunks, "\\Q"+chunk.String()+"\\E") | ||
chunk.Reset() | ||
i++ | ||
continue | ||
} else { | ||
// if there is an actual literal \ character, we need to include that in the string | ||
chunk.WriteRune('\\') | ||
} | ||
} else if pattern[i] == '_' { | ||
// replace with . | ||
chunks = append(chunks, "\\Q"+chunk.String()+"\\E") | ||
chunk.Reset() | ||
chunks = append(chunks, ".") | ||
} else if pattern[i] == '%' { | ||
// replace with .* | ||
chunks = append(chunks, "\\Q"+chunk.String()+"\\E") | ||
chunk.Reset() | ||
chunks = append(chunks, ".*") | ||
for textIdx < textLen { | ||
if patternIdx < patternLen-1 && pattern[patternIdx] == '\\' && | ||
((pattern[patternIdx+1] == '_' || pattern[patternIdx+1] == '%') && | ||
pattern[patternIdx+1] == text[textIdx]) { | ||
// handle escaped characters -> pattern needs to increment two places here | ||
patternIdx += 2 | ||
textIdx += 1 | ||
} else if patternIdx < patternLen && (pattern[patternIdx] == '_' || pattern[patternIdx] == text[textIdx]) { | ||
// handle non escaped characters | ||
textIdx += 1 | ||
patternIdx += 1 | ||
} else if patternIdx < patternLen && pattern[patternIdx] == '%' { | ||
// handle wildcard characters | ||
lastWildcardIdx = patternIdx | ||
lastMatchIdx = textIdx | ||
patternIdx += 1 | ||
} else if lastWildcardIdx != -1 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm probably wrong, and it's probably due to the nature of % being so greedy, but are we sure that we don't need a stack of "last known wildcard indexes" instead of just 1? I'm wondering about a case where we've seen multiple %'s and then we don't match something and we need to back-up to the first % not just the 2nd %. I can't seem to write a testcase to show the issue, hence why I'm probably wrong, but I thought I'd mention it/ask.... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For me the trick to understanding this is that wildcard matching has optimal substructure: if we have already matched part of the input string against part of the pattern string, we only need to use the remaining parts of the pattern string and input string to determine if the entire string matches the pattern or not. The only slight "exception" to the optimal substructure is that the The key is that we know that we have a valid match up until the last wildcard character we saw, and that apart from going back to that wildcard and consuming more characters from the input text there we can not otherwise move backwards in the pattern or the input text. So, storing an older index would not end up being used, since it would be in part of the input text we had already matched. |
||
// greedy match didn't work, try again from the last known match | ||
patternIdx = lastWildcardIdx + 1 | ||
lastMatchIdx += 1 | ||
textIdx = lastMatchIdx | ||
} else { | ||
chunk.WriteByte(pattern[i]) | ||
return false | ||
} | ||
} | ||
|
||
if chunk.Len() != 0 { | ||
chunks = append(chunks, "\\Q"+chunk.String()+"\\E") | ||
} | ||
// consume remaining pattern characters as long as they are wildcards | ||
for patternIdx < patternLen { | ||
if pattern[patternIdx] != '%' { | ||
return false | ||
} | ||
|
||
chunks = append(chunks, "$") | ||
patternIdx += 1 | ||
} | ||
|
||
return regexp.Compile(strings.Join(chunks, "")) | ||
return true | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if moving the
pattern[patternIdx] == '\\'
to be first would speed things up slightly sincepatternIdx < patternLen-1
will be true most of the time, and I think doing the check for\
is the biggest determining factor, so let's stop on that check as quickly as possible.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh crap - never mind, I mixed up text* and pattern* variables. Although, I do wonder if you should add a check for the rest of the pattern being empty and then by-pass all of these if's.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh yeah, an empty pattern is an interesting case. I guess currently you would just end up in the final
else
clause and returnfalse
. We could definitely add a quick check at the top to check thetextLen
andpatternLen
before entering the for loop and all the if statemenets