diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index c36228c033a414..9746877c36a1a2 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1,3 +1,4 @@ +import _pyio import array import contextlib import importlib.util @@ -3448,5 +3449,87 @@ def test_too_short(self): b"zzz", zipfile._Extra.strip(b"zzz", (self.ZIP64_EXTRA,))) +class StatIO(_pyio.BytesIO): + """Buffer which remembers the number of bytes that were read.""" + + def __init__(self): + super().__init__() + self.bytes_read = 0 + + def read(self, size=-1): + bs = super().read(size) + self.bytes_read += len(bs) + return bs + + +class StoredZipExtFileRandomReadTest(unittest.TestCase): + """Tests whether an uncompressed, unencrypted zip entry can be randomly + seek and read without reading redundant bytes.""" + def test_stored_seek_and_read(self): + + sio = StatIO() + # 20000 bytes + txt = b'0123456789' * 2000 + + # The seek length must be greater than ZipExtFile.MIN_READ_SIZE + # as `ZipExtFile._read2()` reads in blocks of this size and we + # need to seek out of the buffered data + read_buffer_size = zipfile.ZipExtFile.MIN_READ_SIZE + self.assertGreaterEqual(10002, read_buffer_size) # for forward seek test + self.assertGreaterEqual(5003, read_buffer_size) # for backward seek test + # The read length must be less than MIN_READ_SIZE, since we assume that + # only 1 block is read in the test. + read_length = 100 + self.assertGreaterEqual(read_buffer_size, read_length) # for read() calls + + with zipfile.ZipFile(sio, "w", compression=zipfile.ZIP_STORED) as zipf: + zipf.writestr("foo.txt", txt) + + # check random seek and read on a file + with zipfile.ZipFile(sio, "r") as zipf: + with zipf.open("foo.txt", "r") as fp: + # Test this optimized read hasn't rewound and read from the + # start of the file (as in the case of the unoptimized path) + + # forward seek + old_count = sio.bytes_read + forward_seek_len = 10002 + current_pos = 0 + fp.seek(forward_seek_len, os.SEEK_CUR) + current_pos += forward_seek_len + self.assertEqual(fp.tell(), current_pos) + self.assertEqual(fp._left, fp._compress_left) + arr = fp.read(read_length) + current_pos += read_length + self.assertEqual(fp.tell(), current_pos) + self.assertEqual(arr, txt[current_pos - read_length:current_pos]) + self.assertEqual(fp._left, fp._compress_left) + read_count = sio.bytes_read - old_count + self.assertLessEqual(read_count, read_buffer_size) + + # backward seek + old_count = sio.bytes_read + backward_seek_len = 5003 + fp.seek(-backward_seek_len, os.SEEK_CUR) + current_pos -= backward_seek_len + self.assertEqual(fp.tell(), current_pos) + self.assertEqual(fp._left, fp._compress_left) + arr = fp.read(read_length) + current_pos += read_length + self.assertEqual(fp.tell(), current_pos) + self.assertEqual(arr, txt[current_pos - read_length:current_pos]) + self.assertEqual(fp._left, fp._compress_left) + read_count = sio.bytes_read - old_count + self.assertLessEqual(read_count, read_buffer_size) + + # eof flags test + fp.seek(0, os.SEEK_END) + fp.seek(12345, os.SEEK_SET) + current_pos = 12345 + arr = fp.read(read_length) + current_pos += read_length + self.assertEqual(arr, txt[current_pos - read_length:current_pos]) + + if __name__ == "__main__": unittest.main() diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 6907ae6d5b7464..ed70885b74d331 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1162,13 +1162,15 @@ def seek(self, offset, whence=os.SEEK_SET): self._offset = buff_offset read_offset = 0 # Fast seek uncompressed unencrypted file - elif self._compress_type == ZIP_STORED and self._decrypter is None and read_offset > 0: + elif self._compress_type == ZIP_STORED and self._decrypter is None and read_offset != 0: # disable CRC checking after first seeking - it would be invalid self._expected_crc = None # seek actual file taking already buffered data into account read_offset -= len(self._readbuffer) - self._offset self._fileobj.seek(read_offset, os.SEEK_CUR) self._left -= read_offset + self._compress_left -= read_offset + self._eof = self._left <= 0 read_offset = 0 # flush read buffer self._readbuffer = b'' diff --git a/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst b/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst new file mode 100644 index 00000000000000..f4c4ebce10729c --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst @@ -0,0 +1,2 @@ +Completely support random access of uncompressed unencrypted read-only +zip files obtained by :meth:`ZipFile.open `.