Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-128150: improve performances of uuid.uuid* constructor functions. #128151

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
19 changes: 19 additions & 0 deletions Doc/whatsnew/3.14.rst
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,25 @@ io
file's bytes in full. (Contributed by Cody Maloney and Victor Stinner in
:gh:`120754` and :gh:`90102`.)


uuid
----

* Improve generation of :class:`~uuid.UUID` objects via their dedicated
functions:

* :func:`~uuid.uuid3` is 47% faster for 16-byte names and 13% faster
for 1024-byte names. Performance for longer names remains unchanged.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for 1024-byte names. Performances for longer names remain unchanged.
for 1024-byte names. Performance for longer names remains unchanged.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for 1024-byte names. Performances for longer names remain unchanged.
for 1024-byte names. Performance for longer names remains unchanged.

* :func:`~uuid.uuid5` is 35% faster for 16-byte names and 24% faster
for 1024-byte names. Performance for longer names remains unchanged.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for 1024-byte names. Performances for longer names remain unchanged.
for 1024-byte names. Performance for longer names remains unchanged.

* :func:`~uuid.uuid4` is 33% faster and :func:`~uuid.uuid8` is 38% faster.

Overall, dedicated generation of UUID objects version 3, 4, 5, and 8 is
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since you already have the specific improvements, I would remove this paragraph (or the other way around).

roughly 20% faster.

(Contributed by Bénédikt Tran in :gh:`128150`.)


Deprecated
==========

Expand Down
82 changes: 58 additions & 24 deletions Lib/uuid.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,14 @@ class SafeUUID:
unknown = None


_RFC_4122_CLEARFLAGS_MASK = 0xffff_ffff_ffff_0fff_3fff_ffff_ffff_ffff
_RFC_4122_VERSION_1_FLAGS = 0x0000_0000_0000_1000_8000_0000_0000_0000
_RFC_4122_VERSION_3_FLAGS = 0x0000_0000_0000_3000_8000_0000_0000_0000
_RFC_4122_VERSION_4_FLAGS = 0x0000_0000_0000_4000_8000_0000_0000_0000
_RFC_4122_VERSION_5_FLAGS = 0x0000_0000_0000_5000_8000_0000_0000_0000
_RFC_4122_VERSION_8_FLAGS = 0x0000_0000_0000_8000_8000_0000_0000_0000


class UUID:
"""Instances of the UUID class represent UUIDs as specified in RFC 4122.
UUID objects are immutable, hashable, and usable as dictionary keys.
Expand Down Expand Up @@ -174,45 +182,49 @@ def __init__(self, hex=None, bytes=None, bytes_le=None, fields=None,
if [hex, bytes, bytes_le, fields, int].count(None) != 4:
raise TypeError('one of the hex, bytes, bytes_le, fields, '
'or int arguments must be given')
if hex is not None:
if int is not None:
pass
elif hex is not None:
hex = hex.replace('urn:', '').replace('uuid:', '')
hex = hex.strip('{}').replace('-', '')
if len(hex) != 32:
raise ValueError('badly formed hexadecimal UUID string')
int = int_(hex, 16)
if bytes_le is not None:
elif bytes_le is not None:
if len(bytes_le) != 16:
raise ValueError('bytes_le is not a 16-char string')
assert isinstance(bytes_le, bytes_), repr(bytes_le)
bytes = (bytes_le[4-1::-1] + bytes_le[6-1:4-1:-1] +
bytes_le[8-1:6-1:-1] + bytes_le[8:])
if bytes is not None:
int = int_.from_bytes(bytes)
elif bytes is not None:
if len(bytes) != 16:
raise ValueError('bytes is not a 16-char string')
assert isinstance(bytes, bytes_), repr(bytes)
int = int_.from_bytes(bytes) # big endian
if fields is not None:
elif fields is not None:
if len(fields) != 6:
raise ValueError('fields is not a 6-tuple')
(time_low, time_mid, time_hi_version,
clock_seq_hi_variant, clock_seq_low, node) = fields
if not 0 <= time_low < 1<<32:
if time_low < 0 or time_low > 0xffff_ffff:
raise ValueError('field 1 out of range (need a 32-bit value)')
if not 0 <= time_mid < 1<<16:
if time_mid < 0 or time_mid > 0xffff:
raise ValueError('field 2 out of range (need a 16-bit value)')
if not 0 <= time_hi_version < 1<<16:
if time_hi_version < 0 or time_hi_version > 0xffff:
raise ValueError('field 3 out of range (need a 16-bit value)')
if not 0 <= clock_seq_hi_variant < 1<<8:
if clock_seq_hi_variant < 0 or clock_seq_hi_variant > 0xff:
raise ValueError('field 4 out of range (need an 8-bit value)')
if not 0 <= clock_seq_low < 1<<8:
if clock_seq_low < 0 or clock_seq_low > 0xff:
raise ValueError('field 5 out of range (need an 8-bit value)')
if not 0 <= node < 1<<48:
if node < 0 or node > 0xffff_ffff_ffff:
raise ValueError('field 6 out of range (need a 48-bit value)')
clock_seq = (clock_seq_hi_variant << 8) | clock_seq_low
int = ((time_low << 96) | (time_mid << 80) |
(time_hi_version << 64) | (clock_seq << 48) | node)
if int is not None:
if not 0 <= int < 1<<128:
raise ValueError('int is out of range (need a 128-bit value)')
# "x < a or x > b" is slightly faster than "not (a <= x <= b)"
if int < 0 or int > 0xffff_ffff_ffff_ffff_ffff_ffff_ffff_ffff:
raise ValueError('int is out of range (need a 128-bit value)')
if version is not None:
if not 1 <= version <= 8:
raise ValueError('illegal version number')
Expand All @@ -225,6 +237,15 @@ def __init__(self, hex=None, bytes=None, bytes_le=None, fields=None,
object.__setattr__(self, 'int', int)
object.__setattr__(self, 'is_safe', is_safe)

@classmethod
def _from_int(cls, int, *, is_safe=SafeUUID.unknown):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def _from_int(cls, int, *, is_safe=SafeUUID.unknown):
def _from_int(cls, int):

"""Internal use only."""
assert int >= 0 and int <= 0xffff_ffff_ffff_ffff_ffff_ffff_ffff_ffff
self = cls.__new__(cls)
object.__setattr__(self, 'int', int)
object.__setattr__(self, 'is_safe', is_safe)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
object.__setattr__(self, 'is_safe', is_safe)

At this moment the argument is unused. Removing it makes the call faster.

return self

def __getstate__(self):
d = {'int': self.int}
if self.is_safe != SafeUUID.unknown:
Expand Down Expand Up @@ -700,24 +721,35 @@ def uuid3(namespace, name):
"""Generate a UUID from the MD5 hash of a namespace UUID and a name."""
if isinstance(name, str):
name = bytes(name, "utf-8")
from hashlib import md5
digest = md5(
namespace.bytes + name,
usedforsecurity=False
).digest()
return UUID(bytes=digest[:16], version=3)
# HACL*-based MD5 is slightly faster than its OpenSSL version,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would put these comments in the PR comments and leave them out of the code (but I do see the value of them).

# and 'import X; X.Y' is slightly faster than 'from X import Y'.
import _md5
h = _md5.md5(namespace.bytes + name, usedforsecurity=False)
assert len(h.digest()) == 16
int_uuid_3 = int_.from_bytes(h.digest())
int_uuid_3 &= _RFC_4122_CLEARFLAGS_MASK
int_uuid_3 |= _RFC_4122_VERSION_3_FLAGS
return UUID._from_int(int_uuid_3)

def uuid4():
"""Generate a random UUID."""
return UUID(bytes=os.urandom(16), version=4)
int_uuid_4 = int_.from_bytes(os.urandom(16))
int_uuid_4 &= _RFC_4122_CLEARFLAGS_MASK
int_uuid_4 |= _RFC_4122_VERSION_4_FLAGS
return UUID._from_int(int_uuid_4)

def uuid5(namespace, name):
"""Generate a UUID from the SHA-1 hash of a namespace UUID and a name."""
if isinstance(name, str):
name = bytes(name, "utf-8")
from hashlib import sha1
hash = sha1(namespace.bytes + name).digest()
return UUID(bytes=hash[:16], version=5)
# OpenSSL-based SHA-1 is slightly faster than its HACL* version,
# and 'import X; X.Y' is slightly faster than 'from X import Y'.
import hashlib
h = hashlib.sha1(namespace.bytes + name, usedforsecurity=False)
int_uuid_5 = int_.from_bytes(h.digest()[:16])
int_uuid_5 &= _RFC_4122_CLEARFLAGS_MASK
int_uuid_5 |= _RFC_4122_VERSION_5_FLAGS
return UUID._from_int(int_uuid_5)

def uuid8(a=None, b=None, c=None):
"""Generate a UUID from three custom blocks.
Expand All @@ -740,7 +772,9 @@ def uuid8(a=None, b=None, c=None):
int_uuid_8 = (a & 0xffff_ffff_ffff) << 80
int_uuid_8 |= (b & 0xfff) << 64
int_uuid_8 |= c & 0x3fff_ffff_ffff_ffff
return UUID(int=int_uuid_8, version=8)
# by construction, the variant and version bits are already cleared
int_uuid_8 |= _RFC_4122_VERSION_8_FLAGS
return UUID._from_int(int_uuid_8)

def main():
"""Run the uuid command line interface."""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Speed up generation of :class:`~uuid.UUID` objects version 3, 4, 5, and 8
via their dedicated functions by roughly 20%. Patch by Bénédikt Tran.
Loading