Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-128131: Completely support random read access of uncompressed unencrypted files in ZipFile #128143

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
83 changes: 83 additions & 0 deletions Lib/test/test_zipfile/test_core.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import _pyio
import array
import contextlib
import importlib.util
Expand Down Expand Up @@ -3448,5 +3449,87 @@ def test_too_short(self):
b"zzz", zipfile._Extra.strip(b"zzz", (self.ZIP64_EXTRA,)))


class StatIO(_pyio.BytesIO):
    """Buffer which remembers the number of bytes that were read.

    Every call to read() adds the length of the returned data to the
    ``bytes_read`` attribute, so a test can measure how much data a
    consumer pulled from the underlying stream.
    """

    def __init__(self, initial_bytes=None):
        super().__init__(initial_bytes)
        # Running total of bytes returned by read() since construction.
        self.bytes_read = 0

    def read(self, size=-1):
        """Read up to *size* bytes and record how many were returned."""
        data = super().read(size)
        self.bytes_read += len(data)
        return data


class StoredZipExtFileRandomReadTest(unittest.TestCase):
    """Tests whether an uncompressed, unencrypted zip entry can be randomly
    sought and read without reading redundant bytes."""

    def _seek_read_and_check(self, sio, fp, txt, offset, current_pos,
                             read_length, max_bytes_read):
        """Seek *fp* by *offset* from *current_pos*, read, and verify.

        Checks the reported position, the data returned, that the two
        "bytes left" counters of *fp* agree, and that no more than
        *max_bytes_read* bytes were pulled from *sio*.

        Returns the position of *fp* after the read.
        """
        old_count = sio.bytes_read

        fp.seek(offset, os.SEEK_CUR)
        current_pos += offset
        self.assertEqual(fp.tell(), current_pos)
        # The fast seek path adjusts both counters by the same amount,
        # so they must stay equal for a stored entry.
        self.assertEqual(fp._left, fp._compress_left)

        arr = fp.read(read_length)
        current_pos += read_length
        self.assertEqual(fp.tell(), current_pos)
        expected = txt[current_pos - read_length:current_pos]
        self.assertEqual(arr, expected)
        self.assertEqual(fp._left, fp._compress_left)

        # A rewind-and-reread from the start of the entry would pull far
        # more than a single block from the underlying file.
        read_count = sio.bytes_read - old_count
        self.assertLessEqual(read_count, max_bytes_read)
        return current_pos

    def test_stored_seek_and_read(self):
        sio = StatIO()
        # 20000 bytes
        txt = b'0123456789' * 2000

        # `ZipExtFile._read2()` reads in blocks of MIN_READ_SIZE, so each
        # seek length is chosen to be at least that large to make sure
        # the seek leaves the currently buffered data.
        read_buffer_size = zipfile.ZipExtFile.MIN_READ_SIZE
        forward_seek_len = 10002
        backward_seek_len = 5003
        self.assertGreaterEqual(forward_seek_len, read_buffer_size)
        self.assertGreaterEqual(backward_seek_len, read_buffer_size)
        # The read length must not exceed MIN_READ_SIZE, since we assume
        # that only 1 block is read in the test.
        read_length = 100
        self.assertGreaterEqual(read_buffer_size, read_length)

        with zipfile.ZipFile(sio, "w", compression=zipfile.ZIP_STORED) as zipf:
            zipf.writestr("foo.txt", txt)

        # check random seek and read on a file
        with zipfile.ZipFile(sio, "r") as zipf:
            with zipf.open("foo.txt", "r") as fp:
                # Test this optimized read hasn't rewound and read from the
                # start of the file (as in the case of the unoptimized path)

                # forward seek
                current_pos = self._seek_read_and_check(
                    sio, fp, txt, forward_seek_len, 0,
                    read_length, read_buffer_size)

                # backward seek
                current_pos = self._seek_read_and_check(
                    sio, fp, txt, -backward_seek_len, current_pos,
                    read_length, read_buffer_size)

                # eof flags test: after seeking to the end of the entry,
                # seeking back into the data must still allow reading.
                fp.seek(0, os.SEEK_END)
                fp.seek(12345, os.SEEK_SET)
                current_pos = 12345
                arr = fp.read(read_length)
                current_pos += read_length
                expected = txt[current_pos - read_length:current_pos]
                self.assertEqual(arr, expected)


5ec1cff marked this conversation as resolved.
Show resolved Hide resolved
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
4 changes: 3 additions & 1 deletion Lib/zipfile/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1162,13 +1162,15 @@ def seek(self, offset, whence=os.SEEK_SET):
self._offset = buff_offset
read_offset = 0
# Fast seek uncompressed unencrypted file
elif self._compress_type == ZIP_STORED and self._decrypter is None and read_offset > 0:
elif self._compress_type == ZIP_STORED and self._decrypter is None and read_offset != 0:
# disable CRC checking after first seeking - it would be invalid
self._expected_crc = None
# seek actual file taking already buffered data into account
read_offset -= len(self._readbuffer) - self._offset
self._fileobj.seek(read_offset, os.SEEK_CUR)
self._left -= read_offset
self._compress_left -= read_offset
5ec1cff marked this conversation as resolved.
Show resolved Hide resolved
self._eof = self._left <= 0
read_offset = 0
# flush read buffer
self._readbuffer = b''
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fully support random access, including backward seeks, of uncompressed
unencrypted archive members opened for reading with :meth:`ZipFile.open <zipfile.ZipFile.open>`.
Loading