Skip to content

Commit

Permalink
Mark Encoding::BIT_PACKED as deprecated and document its compatibilit…
Browse files Browse the repository at this point in the history
…y issues (#5348)

* Mark Encoding::BIT_PACKED as deprecated and document its compatibility issues

* Allow deprecated BIT_PACKED in parquet-layout binary
  • Loading branch information
jhorstmann authored Jan 31, 2024
1 parent 31cf5ce commit 93c7a12
Show file tree
Hide file tree
Showing 9 changed files with 27 additions and 2 deletions.
1 change: 1 addition & 0 deletions parquet/src/arrow/record_reader/definition_levels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ impl PackedDecoder {
self.packed_offset = 0;
self.packed_count = match encoding {
Encoding::RLE => 0,
#[allow(deprecated)]
Encoding::BIT_PACKED => data.len() * 8,
_ => unreachable!("invalid level encoding: {}", encoding),
};
Expand Down
17 changes: 16 additions & 1 deletion parquet/src/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -256,10 +256,21 @@ pub enum Encoding {
/// Usable for definition/repetition levels encoding and boolean values.
RLE,

/// Bit packed encoding.
/// **Deprecated** Bit-packed encoding.
///
/// This can only be used if the data has a known max width.
/// Usable for definition/repetition levels encoding.
///
/// There are compatibility issues with files using this encoding.
/// The parquet standard specifies the bits to be packed starting from the
/// most-significant bit, several implementations do not follow this bit order.
/// Several other implementations also have issues reading this encoding
/// because of incorrect assumptions about the length of the encoded data.
///
/// The RLE/bit-packing hybrid is more cpu and memory efficient and should be used instead.
#[deprecated(
note = "Please see documentation for compatibility issues and use the RLE/bit-packing hybrid encoding instead"
)]
BIT_PACKED,

/// Delta encoding for integers, either INT32 or INT64.
Expand Down Expand Up @@ -301,6 +312,7 @@ impl FromStr for Encoding {
"PLAIN" | "plain" => Ok(Encoding::PLAIN),
"PLAIN_DICTIONARY" | "plain_dictionary" => Ok(Encoding::PLAIN_DICTIONARY),
"RLE" | "rle" => Ok(Encoding::RLE),
#[allow(deprecated)]
"BIT_PACKED" | "bit_packed" => Ok(Encoding::BIT_PACKED),
"DELTA_BINARY_PACKED" | "delta_binary_packed" => Ok(Encoding::DELTA_BINARY_PACKED),
"DELTA_LENGTH_BYTE_ARRAY" | "delta_length_byte_array" => {
Expand Down Expand Up @@ -910,6 +922,7 @@ impl TryFrom<parquet::Encoding> for Encoding {
parquet::Encoding::PLAIN => Encoding::PLAIN,
parquet::Encoding::PLAIN_DICTIONARY => Encoding::PLAIN_DICTIONARY,
parquet::Encoding::RLE => Encoding::RLE,
#[allow(deprecated)]
parquet::Encoding::BIT_PACKED => Encoding::BIT_PACKED,
parquet::Encoding::DELTA_BINARY_PACKED => Encoding::DELTA_BINARY_PACKED,
parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY => Encoding::DELTA_LENGTH_BYTE_ARRAY,
Expand All @@ -927,6 +940,7 @@ impl From<Encoding> for parquet::Encoding {
Encoding::PLAIN => parquet::Encoding::PLAIN,
Encoding::PLAIN_DICTIONARY => parquet::Encoding::PLAIN_DICTIONARY,
Encoding::RLE => parquet::Encoding::RLE,
#[allow(deprecated)]
Encoding::BIT_PACKED => parquet::Encoding::BIT_PACKED,
Encoding::DELTA_BINARY_PACKED => parquet::Encoding::DELTA_BINARY_PACKED,
Encoding::DELTA_LENGTH_BYTE_ARRAY => parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY,
Expand Down Expand Up @@ -1114,6 +1128,7 @@ impl str::FromStr for LogicalType {
}

#[cfg(test)]
#[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module
mod tests {
use super::*;

Expand Down
1 change: 1 addition & 0 deletions parquet/src/bin/parquet-layout.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ fn encoding(encoding: parquet::format::Encoding) -> &'static str {
Ok(Encoding::PLAIN) => "plain",
Ok(Encoding::PLAIN_DICTIONARY) => "plain_dictionary",
Ok(Encoding::RLE) => "rle",
#[allow(deprecated)]
Ok(Encoding::BIT_PACKED) => "bit_packed",
Ok(Encoding::DELTA_BINARY_PACKED) => "delta_binary_packed",
Ok(Encoding::DELTA_LENGTH_BYTE_ARRAY) => "delta_length_byte_array",
Expand Down
1 change: 1 addition & 0 deletions parquet/src/column/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,7 @@ fn parse_v1_level(
buf.slice(i32_size..i32_size + data_size),
))
}
#[allow(deprecated)]
Encoding::BIT_PACKED => {
let bit_width = num_required_bits(max_level as u64);
let num_bytes = ceil(num_buffered_values as usize * bit_width as usize, 8);
Expand Down
1 change: 1 addition & 0 deletions parquet/src/column/reader/decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ impl LevelDecoder {
decoder.set_data(data);
Self::Rle(decoder)
}
#[allow(deprecated)]
Encoding::BIT_PACKED => Self::Packed(BitReader::new(data), bit_width),
_ => unreachable!("invalid level encoding: {}", encoding),
}
Expand Down
1 change: 1 addition & 0 deletions parquet/src/encodings/decoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1136,6 +1136,7 @@ mod tests {
);

// unsupported
#[allow(deprecated)]
create_and_check_decoder::<Int32Type>(
Encoding::BIT_PACKED,
Some(nyi_err!("Encoding BIT_PACKED is not supported")),
Expand Down
1 change: 1 addition & 0 deletions parquet/src/encodings/encoding/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -754,6 +754,7 @@ mod tests {
);

// unsupported
#[allow(deprecated)]
create_and_check_encoder::<Int32Type>(
Encoding::BIT_PACKED,
Some(nyi_err!("Encoding BIT_PACKED is not supported")),
Expand Down
2 changes: 2 additions & 0 deletions parquet/src/encodings/levels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ pub fn max_buffer_size(
let bit_width = num_required_bits(max_level as u64);
match encoding {
Encoding::RLE => RleEncoder::max_buffer_size(bit_width, num_buffered_values),
#[allow(deprecated)]
Encoding::BIT_PACKED => ceil(num_buffered_values * bit_width as usize, 8),
_ => panic!("Unsupported encoding type {encoding}"),
}
Expand Down Expand Up @@ -66,6 +67,7 @@ impl LevelEncoder {
buffer.extend_from_slice(&[0; 4]);
LevelEncoder::Rle(RleEncoder::new_from_buf(bit_width, buffer))
}
#[allow(deprecated)]
Encoding::BIT_PACKED => {
// Here we set full byte buffer without adjusting for num_buffered_values,
// because byte buffer will already be allocated with size from
Expand Down
4 changes: 3 additions & 1 deletion parquet/src/file/serialized_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -963,7 +963,9 @@ mod tests {
assert_eq!(num_values, 8);
assert_eq!(encoding, Encoding::PLAIN_DICTIONARY);
assert_eq!(def_level_encoding, Encoding::RLE);
assert_eq!(rep_level_encoding, Encoding::BIT_PACKED);
#[allow(deprecated)]
let expected_rep_level_encoding = Encoding::BIT_PACKED;
assert_eq!(rep_level_encoding, expected_rep_level_encoding);
assert!(statistics.is_none());
true
}
Expand Down

0 comments on commit 93c7a12

Please sign in to comment.