From 7f15063ad1d0f39f3df43fc54cfa1bc31fe0cbb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 25 Jul 2024 17:53:17 +0200 Subject: [PATCH 01/16] improve performances of `fnmatch.translate` --- Lib/fnmatch.py | 94 ++++++++++++++++++++++++-------------------------- 1 file changed, 46 insertions(+), 48 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 73acb1fe8d4106..f54d2324ae0b7b 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -9,6 +9,7 @@ The function translate(PATTERN) returns a regular expression corresponding to PATTERN. (It does not compile it.) """ + import os import posixpath import re @@ -77,23 +78,38 @@ def translate(pat): There is no way to quote meta-characters. """ - STAR = object() - parts = _translate(pat, STAR, '.') - return _join_translated_parts(parts, STAR) + parts, indices = _translate(pat, '.') + return _join_translated_parts(parts, indices) +_set_ops_re = re.compile(r'([&~|])') -def _translate(pat, STAR, QUESTION_MARK): +def _translate(pat, QUESTION_MARK): res = [] add = res.append + indices = [] + pending = [] # pending characters to escape + i, n = 0, len(pat) while i < n: c = pat[i] i = i+1 if c == '*': + if pending: + add(re.escape(''.join(pending))) + pending = [] + # store the position of the wildcard + indices.append(len(parts)) + add('*') # compress consecutive `*` into one - if (not res) or res[-1] is not STAR: - add(STAR) + while i < n and pat[i] == '*': + i += 1 elif c == '?': + # Handling '?' one at a time seems to more efficient + # even if there are consecutive '?' that could have + # been written directly. + if pending: + add(re.escape(''.join(pending))) + pending = [] add(QUESTION_MARK) elif c == '[': j = i @@ -104,8 +120,11 @@ def _translate(pat, STAR, QUESTION_MARK): while j < n and pat[j] != ']': j = j+1 if j >= n: - add('\\[') + pending.append('[') else: + if pending: + add(re.escape(''.join(pending))) + pending = [] stuff = pat[i:j] if '-' not in stuff: stuff = stuff.replace('\\', r'\\') @@ -133,8 +152,6 @@ def _translate(pat, STAR, QUESTION_MARK): # Hyphens that create ranges shouldn't be escaped. stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') for s in chunks) - # Escape set operations (&&, ~~ and ||). - stuff = re.sub(r'([&~|])', r'\\\1', stuff) i = j+1 if not stuff: # Empty range: never match. @@ -143,50 +160,31 @@ def _translate(pat, STAR, QUESTION_MARK): # Negated empty range: match any character. add('.') else: + # Escape set operations (&&, ~~ and ||). + stuff = _set_ops_re.sub(r'\\\1', stuff) if stuff[0] == '!': stuff = '^' + stuff[1:] elif stuff[0] in ('^', '['): stuff = '\\' + stuff add(f'[{stuff}]') else: - add(re.escape(c)) - assert i == n - return res - - -def _join_translated_parts(inp, STAR): - # Deal with STARs. - res = [] - add = res.append - i, n = 0, len(inp) - # Fixed pieces at the start? - while i < n and inp[i] is not STAR: - add(inp[i]) - i += 1 - # Now deal with STAR fixed STAR fixed ... - # For an interior `STAR fixed` pairing, we want to do a minimal - # .*? match followed by `fixed`, with no possibility of backtracking. - # Atomic groups ("(?>...)") allow us to spell that directly. - # Note: people rely on the undocumented ability to join multiple - # translate() results together via "|" to build large regexps matching - # "one of many" shell patterns. - while i < n: - assert inp[i] is STAR - i += 1 - if i == n: - add(".*") - break - assert inp[i] is not STAR - fixed = [] - while i < n and inp[i] is not STAR: - fixed.append(inp[i]) - i += 1 - fixed = "".join(fixed) - if i == n: - add(".*") - add(fixed) - else: - add(f"(?>.*?{fixed})") + pending.append(c) + if pending: + add(re.escape(''.join(pending))) assert i == n - res = "".join(res) + return parts, indices + + +def _join_translated_parts(parts, indices): + if not indices: + return fr'(?s:{"".join(parts)})\Z' + iter_indices = iter(indices) + i, j = 0, next(iter_indices) + buffer = parts[i:j] + i = j + 1 + for j in iter_indices: + buffer.append(f'(?>.*?{"".join(parts[i:j])})') + i = j + 1 + buffer.append(f'.*{"".join(parts[i:])}') + res = ''.join(buffer) return fr'(?s:{res})\Z' From 83d0904e3ef34cbc3a3980813e4c9e4d9989791f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 25 Jul 2024 18:05:12 +0200 Subject: [PATCH 02/16] add tests --- Lib/test/test_fnmatch.py | 65 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index 10ed496d4e2f37..a4bf4c56783e71 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -1,5 +1,4 @@ """Test cases for the fnmatch module.""" - import unittest import os import string @@ -250,6 +249,70 @@ def test_translate(self): self.assertTrue(re.match(fatre, 'cbabcaxc')) self.assertFalse(re.match(fatre, 'dabccbad')) + def test_translate_wildcards(self): + for pattern, expect in [ + ('ab*', r'(?s:ab.*)\Z'), + ('ab*cd', r'(?s:ab.*cd)\Z'), + ('ab*cd*', r'(?s:ab(?>.*?cd).*)\Z'), + ('ab*cd*12', r'(?s:ab(?>.*?cd).*12)\Z'), + ('ab*cd*12*', r'(?s:ab(?>.*?cd)(?>.*?12).*)\Z'), + ('ab*cd*12*34', r'(?s:ab(?>.*?cd)(?>.*?12).*34)\Z'), + ('ab*cd*12*34*', r'(?s:ab(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'), + ]: + translated = translate(pattern) + self.assertEqual(translated, expect, pattern) + + for pattern, expect in [ + ('*ab', r'(?s:.*ab)\Z'), + ('*ab*', r'(?s:(?>.*?ab).*)\Z'), + ('*ab*cd', r'(?s:(?>.*?ab).*cd)\Z'), + ('*ab*cd*', r'(?s:(?>.*?ab)(?>.*?cd).*)\Z'), + ('*ab*cd*12', r'(?s:(?>.*?ab)(?>.*?cd).*12)\Z'), + ('*ab*cd*12*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*)\Z'), + ('*ab*cd*12*34', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*34)\Z'), + ('*ab*cd*12*34*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'), + ]: + translated = translate(pattern) + self.assertEqual(translated, expect, pattern) + + def test_translate_expressions(self): + for pattern, expect in [ + ('[', r'(?s:\[)\Z'), + ('[!', r'(?s:\[!)\Z'), + ('[]', r'(?s:\[\])\Z'), + ('[abc', r'(?s:\[abc)\Z'), + ('[!abc', r'(?s:\[!abc)\Z'), + ('[abc]', r'(?s:[abc])\Z'), + ('[!abc]', r'(?s:[^abc])\Z'), + ('[!abc][!def]', r'(?s:[^abc][^def])\Z'), + # with [[ + ('[[', r'(?s:\[\[)\Z'), + ('[[a', r'(?s:\[\[a)\Z'), + ('[[]', r'(?s:[\[])\Z'), + ('[[]a', r'(?s:[\[]a)\Z'), + ('[[]]', r'(?s:[\[]\])\Z'), + ('[[]a]', r'(?s:[\[]a\])\Z'), + ('[[a]', r'(?s:[\[a])\Z'), + ('[[a]]', r'(?s:[\[a]\])\Z'), + ('[[a]b', r'(?s:[\[a]b)\Z'), + # backslashes + ('[\\', r'(?s:\[\\)\Z'), + (r'[\]', r'(?s:[\\])\Z'), + (r'[\\]', r'(?s:[\\\\])\Z'), + ]: + translated = translate(pattern) + self.assertEqual(translated, expect, pattern) + + def test_indices_locations(self): + from fnmatch import _translate + + blocks = ['a^b', '***', '?', '?', '[a-z]', '[1-9]', '*', '++', '[[a'] + parts, indices = _translate(''.join(blocks), '.') + expect_parts = [r'a\^b', '*', '.', '.', '[a-z]', '[1-9]', '*', r'\+\+\[\[a'] + self.assertListEqual(parts, expect_parts) + self.assertListEqual(indices, [1, 6]) + + class FilterTestCase(unittest.TestCase): def test_filter(self): From 275a1c78624444cc0c10625e219f0791c30168c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 25 Jul 2024 18:06:57 +0200 Subject: [PATCH 03/16] blurb --- Lib/fnmatch.py | 5 ++--- Lib/test/test_fnmatch.py | 1 + .../Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index f54d2324ae0b7b..301c0f7963ef20 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -9,7 +9,6 @@ The function translate(PATTERN) returns a regular expression corresponding to PATTERN. (It does not compile it.) """ - import os import posixpath import re @@ -98,7 +97,7 @@ def _translate(pat, QUESTION_MARK): add(re.escape(''.join(pending))) pending = [] # store the position of the wildcard - indices.append(len(parts)) + indices.append(len(res)) add('*') # compress consecutive `*` into one while i < n and pat[i] == '*': @@ -172,7 +171,7 @@ def _translate(pat, QUESTION_MARK): if pending: add(re.escape(''.join(pending))) assert i == n - return parts, indices + return res, indices def _join_translated_parts(parts, indices): diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index a4bf4c56783e71..4a53883811918b 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -1,4 +1,5 @@ """Test cases for the fnmatch module.""" + import unittest import os import string diff --git a/Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst b/Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst new file mode 100644 index 00000000000000..830a4c21c73e1c --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst @@ -0,0 +1,2 @@ +Improve the performances of :func:`fnmatch.translate` by a factor 1.3. Patch +by Bénédikt Tran. From e60d057bccf37c24b4723eddee7cc66dc9c9c48e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 25 Jul 2024 18:20:06 +0200 Subject: [PATCH 04/16] fix usages --- Lib/fnmatch.py | 6 +++--- Lib/glob.py | 3 ++- Lib/test/test_fnmatch.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 301c0f7963ef20..2d71478cf641f8 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -77,12 +77,12 @@ def translate(pat): There is no way to quote meta-characters. """ - parts, indices = _translate(pat, '.') + parts, indices = _translate(pat, '*', '.') return _join_translated_parts(parts, indices) _set_ops_re = re.compile(r'([&~|])') -def _translate(pat, QUESTION_MARK): +def _translate(pat, STAR, QUESTION_MARK): res = [] add = res.append indices = [] @@ -98,7 +98,7 @@ def _translate(pat, QUESTION_MARK): pending = [] # store the position of the wildcard indices.append(len(res)) - add('*') + add(STAR) # compress consecutive `*` into one while i < n and pat[i] == '*': i += 1 diff --git a/Lib/glob.py b/Lib/glob.py index 574e5ad51b601d..7aa70cd67576bd 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -312,7 +312,8 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): if part: if not include_hidden and part[0] in '*?': results.append(r'(?!\.)') - results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep)) + parts, _ = fnmatch._translate(part, f'{not_sep}*', not_sep) + results.extend(parts) if idx < last_part_idx: results.append(any_sep) res = ''.join(results) diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index 4a53883811918b..df5d57322f4886 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -308,7 +308,7 @@ def test_indices_locations(self): from fnmatch import _translate blocks = ['a^b', '***', '?', '?', '[a-z]', '[1-9]', '*', '++', '[[a'] - parts, indices = _translate(''.join(blocks), '.') + parts, indices = _translate(''.join(blocks), '*', '.') expect_parts = [r'a\^b', '*', '.', '.', '[a-z]', '[1-9]', '*', r'\+\+\[\[a'] self.assertListEqual(parts, expect_parts) self.assertListEqual(indices, [1, 6]) From 03217d7b6dec482a4caa3c2d79b5fbb2041da75b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 25 Jul 2024 18:43:14 +0200 Subject: [PATCH 05/16] keep legacy version for glob --- Lib/glob.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 2 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 7aa70cd67576bd..6f060e359415de 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -312,14 +312,85 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): if part: if not include_hidden and part[0] in '*?': results.append(r'(?!\.)') - parts, _ = fnmatch._translate(part, f'{not_sep}*', not_sep) - results.extend(parts) + results.extend(_translate(part, f'{not_sep}*', not_sep)) if idx < last_part_idx: results.append(any_sep) res = ''.join(results) return fr'(?s:{res})\Z' +def _translate(pat, STAR, QUESTION_MARK): + res = [] + add = res.append + i, n = 0, len(pat) + while i < n: + c = pat[i] + i = i+1 + if c == '*': + # compress consecutive `*` into one + if (not res) or res[-1] is not STAR: + add(STAR) + elif c == '?': + add(QUESTION_MARK) + elif c == '[': + j = i + if j < n and pat[j] == '!': + j = j+1 + if j < n and pat[j] == ']': + j = j+1 + while j < n and pat[j] != ']': + j = j+1 + if j >= n: + add('\\[') + else: + stuff = pat[i:j] + if '-' not in stuff: + stuff = stuff.replace('\\', r'\\') + else: + chunks = [] + k = i+2 if pat[i] == '!' else i+1 + while True: + k = pat.find('-', k, j) + if k < 0: + break + chunks.append(pat[i:k]) + i = k+1 + k = k+3 + chunk = pat[i:j] + if chunk: + chunks.append(chunk) + else: + chunks[-1] += '-' + # Remove empty ranges -- invalid in RE. + for k in range(len(chunks)-1, 0, -1): + if chunks[k-1][-1] > chunks[k][0]: + chunks[k-1] = chunks[k-1][:-1] + chunks[k][1:] + del chunks[k] + # Escape backslashes and hyphens for set difference (--). + # Hyphens that create ranges shouldn't be escaped. + stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') + for s in chunks) + # Escape set operations (&&, ~~ and ||). + stuff = re.sub(r'([&~|])', r'\\\1', stuff) + i = j+1 + if not stuff: + # Empty range: never match. + add('(?!)') + elif stuff == '!': + # Negated empty range: match any character. + add('.') + else: + if stuff[0] == '!': + stuff = '^' + stuff[1:] + elif stuff[0] in ('^', '['): + stuff = '\\' + stuff + add(f'[{stuff}]') + else: + add(re.escape(c)) + assert i == n + return res + + @functools.lru_cache(maxsize=512) def _compile_pattern(pat, sep, case_sensitive, recursive=True): """Compile given glob pattern to a re.Pattern object (observing case From 804da13fae3ee48f93a5064273af3401a3c8334d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 25 Jul 2024 18:48:18 +0200 Subject: [PATCH 06/16] actually not needed... --- Lib/glob.py | 74 +---------------------------------------------------- 1 file changed, 1 insertion(+), 73 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 6f060e359415de..fe082444c0ec2a 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -312,85 +312,13 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): if part: if not include_hidden and part[0] in '*?': results.append(r'(?!\.)') - results.extend(_translate(part, f'{not_sep}*', not_sep)) + results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep)[0]) if idx < last_part_idx: results.append(any_sep) res = ''.join(results) return fr'(?s:{res})\Z' -def _translate(pat, STAR, QUESTION_MARK): - res = [] - add = res.append - i, n = 0, len(pat) - while i < n: - c = pat[i] - i = i+1 - if c == '*': - # compress consecutive `*` into one - if (not res) or res[-1] is not STAR: - add(STAR) - elif c == '?': - add(QUESTION_MARK) - elif c == '[': - j = i - if j < n and pat[j] == '!': - j = j+1 - if j < n and pat[j] == ']': - j = j+1 - while j < n and pat[j] != ']': - j = j+1 - if j >= n: - add('\\[') - else: - stuff = pat[i:j] - if '-' not in stuff: - stuff = stuff.replace('\\', r'\\') - else: - chunks = [] - k = i+2 if pat[i] == '!' else i+1 - while True: - k = pat.find('-', k, j) - if k < 0: - break - chunks.append(pat[i:k]) - i = k+1 - k = k+3 - chunk = pat[i:j] - if chunk: - chunks.append(chunk) - else: - chunks[-1] += '-' - # Remove empty ranges -- invalid in RE. - for k in range(len(chunks)-1, 0, -1): - if chunks[k-1][-1] > chunks[k][0]: - chunks[k-1] = chunks[k-1][:-1] + chunks[k][1:] - del chunks[k] - # Escape backslashes and hyphens for set difference (--). - # Hyphens that create ranges shouldn't be escaped. - stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') - for s in chunks) - # Escape set operations (&&, ~~ and ||). - stuff = re.sub(r'([&~|])', r'\\\1', stuff) - i = j+1 - if not stuff: - # Empty range: never match. - add('(?!)') - elif stuff == '!': - # Negated empty range: match any character. - add('.') - else: - if stuff[0] == '!': - stuff = '^' + stuff[1:] - elif stuff[0] in ('^', '['): - stuff = '\\' + stuff - add(f'[{stuff}]') - else: - add(re.escape(c)) - assert i == n - return res - - @functools.lru_cache(maxsize=512) def _compile_pattern(pat, sep, case_sensitive, recursive=True): """Compile given glob pattern to a re.Pattern object (observing case From baa6ce37b3f5908d311e7342f7314896c07edb34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 17 Aug 2024 12:44:44 +0200 Subject: [PATCH 07/16] reduce the number of calls to `str.join` --- Lib/fnmatch.py | 25 +++++++++---------------- Lib/test/test_fnmatch.py | 6 ++++-- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 2d71478cf641f8..34899facba5f63 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -81,21 +81,18 @@ def translate(pat): return _join_translated_parts(parts, indices) _set_ops_re = re.compile(r'([&~|])') +_re_escape = functools.lru_cache(maxsize=32768)(re.escape) def _translate(pat, STAR, QUESTION_MARK): res = [] add = res.append indices = [] - pending = [] # pending characters to escape i, n = 0, len(pat) while i < n: c = pat[i] i = i+1 if c == '*': - if pending: - add(re.escape(''.join(pending))) - pending = [] # store the position of the wildcard indices.append(len(res)) add(STAR) @@ -106,9 +103,6 @@ def _translate(pat, STAR, QUESTION_MARK): # Handling '?' one at a time seems to more efficient # even if there are consecutive '?' that could have # been written directly. - if pending: - add(re.escape(''.join(pending))) - pending = [] add(QUESTION_MARK) elif c == '[': j = i @@ -119,11 +113,8 @@ def _translate(pat, STAR, QUESTION_MARK): while j < n and pat[j] != ']': j = j+1 if j >= n: - pending.append('[') + add('\\[') else: - if pending: - add(re.escape(''.join(pending))) - pending = [] stuff = pat[i:j] if '-' not in stuff: stuff = stuff.replace('\\', r'\\') @@ -167,9 +158,7 @@ def _translate(pat, STAR, QUESTION_MARK): stuff = '\\' + stuff add(f'[{stuff}]') else: - pending.append(c) - if pending: - add(re.escape(''.join(pending))) + add(_re_escape(c)) assert i == n return res, indices @@ -180,10 +169,14 @@ def _join_translated_parts(parts, indices): iter_indices = iter(indices) i, j = 0, next(iter_indices) buffer = parts[i:j] + append, extend = buffer.append, buffer.extend i = j + 1 for j in iter_indices: - buffer.append(f'(?>.*?{"".join(parts[i:j])})') + append('(?>.*?') + extend(parts[i:j]) + append(')') i = j + 1 - buffer.append(f'.*{"".join(parts[i:])}') + append('.*') + extend(parts[i:]) res = ''.join(buffer) return fr'(?s:{res})\Z' diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index df5d57322f4886..6dc3dc583070f6 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -309,9 +309,11 @@ def test_indices_locations(self): blocks = ['a^b', '***', '?', '?', '[a-z]', '[1-9]', '*', '++', '[[a'] parts, indices = _translate(''.join(blocks), '*', '.') - expect_parts = [r'a\^b', '*', '.', '.', '[a-z]', '[1-9]', '*', r'\+\+\[\[a'] + expect_parts = ['a', r'\^', 'b', '*', + '.', '.', '[a-z]', '[1-9]', '*', + r'\+', r'\+', r'\[', r'\[', 'a'] self.assertListEqual(parts, expect_parts) - self.assertListEqual(indices, [1, 6]) + self.assertListEqual(indices, [3, 8]) class FilterTestCase(unittest.TestCase): From 80b22e005087f23cac1551d0daabae4cc00f3f34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 18 Aug 2024 12:09:32 +0200 Subject: [PATCH 08/16] micro-optimization on `re.sub` --- Lib/fnmatch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 34899facba5f63..30b3de2c64b5a8 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -80,7 +80,7 @@ def translate(pat): parts, indices = _translate(pat, '*', '.') return _join_translated_parts(parts, indices) -_set_ops_re = re.compile(r'([&~|])') +_re_setops_sub = re.compile(r'([&~|])').sub _re_escape = functools.lru_cache(maxsize=32768)(re.escape) def _translate(pat, STAR, QUESTION_MARK): @@ -151,7 +151,7 @@ def _translate(pat, STAR, QUESTION_MARK): add('.') else: # Escape set operations (&&, ~~ and ||). - stuff = _set_ops_re.sub(r'\\\1', stuff) + stuff = _re_setops_sub(r'\\\1', stuff) if stuff[0] == '!': stuff = '^' + stuff[1:] elif stuff[0] in ('^', '['): From 7a9a87ce23a5599274a5426ac7103d001eb6c073 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 19 Aug 2024 09:25:45 +0200 Subject: [PATCH 09/16] address Barney's review --- Lib/fnmatch.py | 11 +++++++++-- Lib/test/test_fnmatch.py | 15 +++++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 30b3de2c64b5a8..483e1445f280df 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -167,11 +167,18 @@ def _join_translated_parts(parts, indices): if not indices: return fr'(?s:{"".join(parts)})\Z' iter_indices = iter(indices) - i, j = 0, next(iter_indices) - buffer = parts[i:j] + j = next(iter_indices) + buffer = parts[:j] # fixed pieces at the start append, extend = buffer.append, buffer.extend i = j + 1 for j in iter_indices: + # Now deal with STAR fixed STAR fixed ... + # For an interior `STAR fixed` pairing, we want to do a minimal + # .*? match followed by `fixed`, with no possibility of backtracking. + # Atomic groups ("(?>...)") allow us to spell that directly. + # Note: people rely on the undocumented ability to join multiple + # translate() results together via "|" to build large regexps matching + # "one of many" shell patterns. append('(?>.*?') extend(parts[i:j]) append(')') diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index 6dc3dc583070f6..68526cf11ed318 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -260,8 +260,9 @@ def test_translate_wildcards(self): ('ab*cd*12*34', r'(?s:ab(?>.*?cd)(?>.*?12).*34)\Z'), ('ab*cd*12*34*', r'(?s:ab(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'), ]: - translated = translate(pattern) - self.assertEqual(translated, expect, pattern) + with self.subTest(pattern): + translated = translate(pattern) + self.assertEqual(translated, expect, pattern) for pattern, expect in [ ('*ab', r'(?s:.*ab)\Z'), @@ -273,8 +274,9 @@ def test_translate_wildcards(self): ('*ab*cd*12*34', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*34)\Z'), ('*ab*cd*12*34*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'), ]: - translated = translate(pattern) - self.assertEqual(translated, expect, pattern) + with self.subTest(pattern): + translated = translate(pattern) + self.assertEqual(translated, expect, pattern) def test_translate_expressions(self): for pattern, expect in [ @@ -301,8 +303,9 @@ def test_translate_expressions(self): (r'[\]', r'(?s:[\\])\Z'), (r'[\\]', r'(?s:[\\\\])\Z'), ]: - translated = translate(pattern) - self.assertEqual(translated, expect, pattern) + with self.subTest(pattern): + translated = translate(pattern) + self.assertEqual(translated, expect, pattern) def test_indices_locations(self): from fnmatch import _translate From 90539bcd17d3bb3eaf65cbfa18be236a88accc94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 23 Aug 2024 10:50:29 +0200 Subject: [PATCH 10/16] Update Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst --- .../next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst b/Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst index 830a4c21c73e1c..26a18afca945d9 100644 --- a/Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst +++ b/Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst @@ -1,2 +1,2 @@ -Improve the performances of :func:`fnmatch.translate` by a factor 1.3. Patch +Improve the performances of :func:`fnmatch.translate` by a factor 1.7. Patch by Bénédikt Tran. From 1d52949cc8300437001c8342cb27441f1d8e3cbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 27 Aug 2024 19:06:53 +0200 Subject: [PATCH 11/16] use lower-case parameter names --- Lib/fnmatch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 483e1445f280df..f1a841305b8a0a 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -83,7 +83,7 @@ def translate(pat): _re_setops_sub = re.compile(r'([&~|])').sub _re_escape = functools.lru_cache(maxsize=32768)(re.escape) -def _translate(pat, STAR, QUESTION_MARK): +def _translate(pat, star, question_mark): res = [] add = res.append indices = [] @@ -95,7 +95,7 @@ def _translate(pat, STAR, QUESTION_MARK): if c == '*': # store the position of the wildcard indices.append(len(res)) - add(STAR) + add(star) # compress consecutive `*` into one while i < n and pat[i] == '*': i += 1 @@ -103,7 +103,7 @@ def _translate(pat, STAR, QUESTION_MARK): # Handling '?' one at a time seems to more efficient # even if there are consecutive '?' that could have # been written directly. - add(QUESTION_MARK) + add(question_mark) elif c == '[': j = i if j < n and pat[j] == '!': From 02264371c101929b1e7774ce0c103127887c0bb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 28 Aug 2024 11:35:33 +0200 Subject: [PATCH 12/16] rename variable `indices` to `star_indices` --- Lib/fnmatch.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index f1a841305b8a0a..18fcda79f2608e 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -86,7 +86,7 @@ def translate(pat): def _translate(pat, star, question_mark): res = [] add = res.append - indices = [] + star_indices = [] i, n = 0, len(pat) while i < n: @@ -94,7 +94,7 @@ def _translate(pat, star, question_mark): i = i+1 if c == '*': # store the position of the wildcard - indices.append(len(res)) + star_indices.append(len(res)) add(star) # compress consecutive `*` into one while i < n and pat[i] == '*': @@ -160,18 +160,18 @@ def _translate(pat, star, question_mark): else: add(_re_escape(c)) assert i == n - return res, indices + return res, star_indices -def _join_translated_parts(parts, indices): - if not indices: +def _join_translated_parts(parts, star_indices): + if not star_indices: return fr'(?s:{"".join(parts)})\Z' - iter_indices = iter(indices) - j = next(iter_indices) + iter_star_indices = iter(star_indices) + j = next(iter_star_indices) buffer = parts[:j] # fixed pieces at the start append, extend = buffer.append, buffer.extend i = j + 1 - for j in iter_indices: + for j in iter_star_indices: # Now deal with STAR fixed STAR fixed ... # For an interior `STAR fixed` pairing, we want to do a minimal # .*? match followed by `fixed`, with no possibility of backtracking. From 01a51734dcd8031935432fb36997353e764fd616 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 28 Aug 2024 11:35:48 +0200 Subject: [PATCH 13/16] remove ambiguous comment about '?' case --- Lib/fnmatch.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 18fcda79f2608e..5b95e6c8e6f01a 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -100,9 +100,6 @@ def _translate(pat, star, question_mark): while i < n and pat[i] == '*': i += 1 elif c == '?': - # Handling '?' one at a time seems to more efficient - # even if there are consecutive '?' that could have - # been written directly. add(question_mark) elif c == '[': j = i From bb6c3eef37c79f7544e2d5c38c6f7ee19417ad52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 28 Aug 2024 11:36:48 +0200 Subject: [PATCH 14/16] change the cache size for `re.escape` The rationale for this change is as follows: re.escape() is only used to cache single Unicode characters in shell patterns; we may heuristically assume that they are ISO-8859-1 encodable, thereby requiring a cache of size 256. To allow non-traditional glyphs (or alphabets with a small number of common glyphs), we double the cache size. --- Lib/fnmatch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 5b95e6c8e6f01a..31bc2aa7585ccf 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -81,7 +81,7 @@ def translate(pat): return _join_translated_parts(parts, indices) _re_setops_sub = re.compile(r'([&~|])').sub -_re_escape = functools.lru_cache(maxsize=32768)(re.escape) +_re_escape = functools.lru_cache(maxsize=512)(re.escape) def _translate(pat, star, question_mark): res = [] From c14ce4f7abc18ca5c427c1c3da4d4131f3f3984e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 18 Oct 2024 05:14:30 +0200 Subject: [PATCH 15/16] Update Lib/fnmatch.py Co-authored-by: Barney Gale --- Lib/fnmatch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 31bc2aa7585ccf..865baea23467ea 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -77,8 +77,8 @@ def translate(pat): There is no way to quote meta-characters. """ - parts, indices = _translate(pat, '*', '.') - return _join_translated_parts(parts, indices) + parts, star_indices = _translate(pat, '*', '.') + return _join_translated_parts(parts, star_indices) _re_setops_sub = re.compile(r'([&~|])').sub _re_escape = functools.lru_cache(maxsize=512)(re.escape) From 38d342701d84370b2b2f713934f848d153f091ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 22 Oct 2024 14:34:55 +0200 Subject: [PATCH 16/16] Update Lib/test/test_fnmatch.py --- Lib/test/test_fnmatch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index 68526cf11ed318..9f360e1dc10f47 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -307,16 +307,16 @@ def test_translate_expressions(self): translated = translate(pattern) self.assertEqual(translated, expect, pattern) - def test_indices_locations(self): + def test_star_indices_locations(self): from fnmatch import _translate blocks = ['a^b', '***', '?', '?', '[a-z]', '[1-9]', '*', '++', '[[a'] - parts, indices = _translate(''.join(blocks), '*', '.') + parts, star_indices = _translate(''.join(blocks), '*', '.') expect_parts = ['a', r'\^', 'b', '*', '.', '.', '[a-z]', '[1-9]', '*', r'\+', r'\+', r'\[', r'\[', 'a'] self.assertListEqual(parts, expect_parts) - self.assertListEqual(indices, [3, 8]) + self.assertListEqual(star_indices, [3, 8]) class FilterTestCase(unittest.TestCase):