From c355f2e06a0ccd6069328ca082d36eb9ff88697b Mon Sep 17 00:00:00 2001 From: rocky Date: Sun, 23 Apr 2023 06:05:54 -0400 Subject: [PATCH] unicode tweaks - Python 2.x can't accept unicode strings other than \u, so check whether a unicode string is ASCII or not. --- xdis/cross_types.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/xdis/cross_types.py b/xdis/cross_types.py index 0f1c0195..c6203798 100644 --- a/xdis/cross_types.py +++ b/xdis/cross_types.py @@ -17,6 +17,11 @@ Defines types from one set of Python versions that don't exist in another set of Pythons """ +# From +# https://stackoverflow.com/questions/196345/how-to-check-if-a-string-in-python-is-in-ascii +def is_ascii(s: str) -> bool: + """Check if the characters in string s are in ASCII, U+0-U+7F.""" + return len(s) == len(s.encode()) class LongTypeForPython3(int): @@ -28,9 +33,9 @@ class LongTypeForPython3(int): def __init__(self, value): self.value = value - def __repr__(self): + def __repr__(self) -> str: """ - Replacement __str__ and str() for Python3. + Replacement repr() and str() for Python3. This ensures we get the "L" suffix on long types. """ return f"""{self.value}L""" @@ -45,15 +50,22 @@ class UnicodeForPython3(str): def __init__(self, value): self.value = value - def __repr__(self): - """ - Replacement __str__ and str() for Python3. - This ensures we get the "u" suffix on unicode types. + def __repr__(self) -> str: + r""" + Replacement repr() and str() for Python3. + This ensures we get the "u" prefix on unicode types, + and also a \u escape when the string is not representable in ASCII """ try: - value = self.value.decode("utf-8") + utf8_value = self.value.decode("utf-8") # Do we need to handle utf-16 and utf-32? 
except UnicodeDecodeError: return f"""u'{str(self.value)[1:]}'""" - else: - return f"""u'{str(value)}'""" + + if is_ascii(utf8_value): + return f"""u'{utf8_value}'""" + + # Turn the unicode character into its Unicode code point, + # but strip off the leading "0x". + unicode_codepoint = hex(ord(utf8_value))[len("0x") :] + return rf"""u'\u{unicode_codepoint}'"""