Skip to content

Commit

Permalink
fix(rust): decompress the right number of rows when reading compressed CSVs (#13721)
Browse files Browse the repository at this point in the history

Co-authored-by: Wainberg <[email protected]>
  • Loading branch information
Wainberg and Wainberg authored Jan 19, 2024
1 parent 764822d commit f4e65df
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 3 deletions.
11 changes: 8 additions & 3 deletions crates/polars-io/src/csv/read_impl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,10 +183,15 @@ impl<'a> CoreReader<'a> {
// In case the file is compressed this schema inference is wrong and has to be done
// again after decompression.
#[cfg(any(feature = "decompress", feature = "decompress-fast"))]
if let Some(b) =
decompress(&reader_bytes, n_rows, separator, quote_char, eol_char)
{
reader_bytes = ReaderBytes::Owned(b);
let total_n_rows = n_rows.map(|n| {
skip_rows + (has_header as usize) + skip_rows_after_header + n
});
if let Some(b) =
decompress(&reader_bytes, total_n_rows, separator, quote_char, eol_char)
{
reader_bytes = ReaderBytes::Owned(b);
}
}

let (inferred_schema, _, _) = infer_file_schema(
Expand Down
18 changes: 18 additions & 0 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -1744,3 +1744,21 @@ def test_invalid_csv_raise() -> None:
"SK0127960V000","SK BT 0018977","
""".strip()
)


@pytest.mark.write_disk()
def test_partial_read_compressed_file(tmp_path: Path) -> None:
    """Read a bounded window of rows from a gzip-compressed CSV.

    A 1,000-row, 3-column frame is written as CSV, gzipped to disk, and read
    back with ``skip_rows``, ``skip_rows_after_header`` and ``n_rows`` all set.
    The result must contain exactly ``n_rows`` rows and all three columns.
    """
    tmp_path.mkdir(exist_ok=True)
    gz_path = tmp_path / "large.csv.gz"

    source = pl.DataFrame(
        {"idx": range(1_000), "dt": date(2025, 12, 31), "txt": "hello world"}
    )

    # Serialize the CSV into memory first, then gzip those bytes onto disk.
    buffer = io.BytesIO()
    source.write_csv(buffer)
    buffer.seek(0)
    with gzip.open(gz_path, mode="wb") as gz_file:
        gz_file.write(buffer.getvalue())

    # 40 + 20 leading lines are skipped before the 30 requested rows, so the
    # reader needs more than `n_rows` lines of decompressed input.
    result = pl.read_csv(
        gz_path,
        skip_rows=40,
        has_header=False,
        skip_rows_after_header=20,
        n_rows=30,
    )
    assert result.shape == (30, 3)

0 comments on commit f4e65df

Please sign in to comment.