Adds APIs for accessing encoding of raw stream items (amazon-ion#760)

* Uses the bump allocator to handle text escape processing, allowing `RawSymbolTokenRef` to hold a reference to a `&'bump str` instead of potentially owning a `String`. This change allows the `RawSymbolTokenRef` type to implement `Copy`, which in turn allows all of the `LazyExpandedValue`- and `LazyValue`-related types to also implement `Copy`.
* Removes the `RawSymbolToken` type, which is now redundant with the `RawSymbolTokenRef` type.
* Adds a `Span` type that provides access to the input bytes that comprised various raw stream items.
* Adds a `LazyRawVersionMarker` trait and per-encoding impls that can provide a `Span` upon request.
* Adds a `LazyRawField` trait and per-encoding impls that can provide a `Span` upon request.
* Adds an `UnexpandedField` type that can represent both raw struct fields and struct fields from a template body. This simplifies the code for expanding structs.
* Adds methods to convert container types back to the general value type.
* Adds `EncodedBinaryValueData_1_0` and `EncodedBinaryAnnotations_1_0` types that can be used to access spans and ranges for the various components of a binary 1.0 value.

This patch exposes many functions and types which we likely wish to feature gate, but that change is being left for a future PR.
popematt · May 7, 2024 · 892ef2d · 892ef2d
1 parent b087e7f
commit 892ef2d
Show file tree

Hide file tree

Showing 57 changed files with 2,540 additions and 1,739 deletions.
diff --git a/examples/write_log_events.rs b/examples/write_log_events.rs
@@ -194,9 +194,9 @@ mod example {
                 .write(11, event.thread_id)?
                 .write(12, &event.thread_name)?
                 //                 v--- The fixed strings from the log statement are also SIDs
-                .write(13, RawSymbolToken::SymbolId(17))? // logger name
-                .write(14, RawSymbolToken::SymbolId(18))? // log level
-                .write(15, RawSymbolToken::SymbolId(19))? // format
+                .write(13, RawSymbolTokenRef::SymbolId(17))? // logger name
+                .write(14, RawSymbolTokenRef::SymbolId(18))? // log level
+                .write(15, RawSymbolTokenRef::SymbolId(19))? // format
                 .write(16, &event.parameters)?;
             struct_.close()
         }

diff --git a/src/lazy/any_encoding.rs b/src/lazy/any_encoding.rs
diff --git a/src/lazy/binary/encoded_value.rs b/src/lazy/binary/encoded_value.rs
@@ -1,5 +1,4 @@
 use crate::lazy::binary::raw::type_descriptor::Header;
-use crate::types::SymbolId;
 use crate::IonType;
 use std::ops::Range;
 
@@ -53,33 +52,29 @@ pub(crate) struct EncodedValue<HeaderType: EncodedHeader> {
     // and IonType.
     pub(crate) header: HeaderType,
 
-    // Each encoded value has up to five components, appearing in the following order:
+    // Each encoded value has up to four components, appearing in the following order:
     //
-    // [ field_id? | annotations? | header (type descriptor) | header_length? | value ]
+    // [ annotations? | header (type descriptor) | header_length? | value_body ]
     //
     // Components shown with a `?` are optional.
     //
     // EncodedValue stores the offset of the type descriptor byte from the beginning of the
     // data source (`header_offset`). The lengths of the other fields can be used to calculate
     // their positions relative to the type descriptor byte. For example, to find the offset of the
-    // field ID (if present), we can do:
-    //     header_offset - annotations_header_length - field_id_length
+    // annotations header (if present), we can do:
+    //     header_offset - annotations_header_length
     //
     // This allows us to store a single `usize` for the header offset, while other lengths can be
-    // packed into a `u8`. Values are not permitted to have a field ID or annotations that take
-    // more than 255 bytes to represent.
+    // packed into a `u8`. In this implementation, values are not permitted to have annotations that
+    // take more than 255 bytes to represent.
     //
     // We store the offset for the header byte because it is guaranteed to be present for all values.
-    // Field IDs and annotations appear earlier in the stream but are optional.
-
-    // The number of bytes used to encode the field ID (if present) preceding the Ion value. If
-    // `field_id` is undefined, `field_id_length` will be zero.
-    pub field_id_length: u8,
-    // If this value is inside a struct, `field_id` will contain the SymbolId that represents
-    // its field name.
-    pub field_id: Option<SymbolId>,
+    // Annotations appear earlier in the stream but are optional.
+
     // The number of bytes used to encode the annotations wrapper (if present) preceding the Ion
-    // value. If `annotations` is empty, `annotations_header_length` will be zero.
+    // value. If `annotations` is empty, `annotations_header_length` will be zero. The annotations
+    // wrapper contains several fields: an opcode, a wrapper length, a sequence length, and the
+    // sequence itself.
     pub annotations_header_length: u8,
     // The number of bytes used to encode the series of symbol IDs inside the annotations wrapper.
     pub annotations_sequence_length: u8,
@@ -89,9 +84,9 @@ pub(crate) struct EncodedValue<HeaderType: EncodedHeader> {
     pub length_length: u8,
     // The number of bytes used to encode the value itself, not including the header byte
     // or length fields.
-    pub value_length: usize,
+    pub value_body_length: usize,
     // The sum total of:
-    //     field_id_length + annotations_header_length + header_length + value_length
+    //     annotations_header_length + header_length + value_body_length
     // While this can be derived from the above fields, storing it for reuse offers a modest
     // optimization. `total_length` is needed when stepping into a value, skipping a value,
     // and reading a value's data.
@@ -127,53 +122,27 @@ impl<HeaderType: EncodedHeader> EncodedValue<HeaderType> {
     /// If the value can fit in the type descriptor byte (e.g. `true`, `false`, `null`, `0`),
     /// this function will return 0.
     #[inline(always)]
-    pub fn value_length(&self) -> usize {
-        self.value_length
+    pub fn value_body_length(&self) -> usize {
+        self.value_body_length
     }
 
     /// The offset of the first byte following the header (including length bytes, if present).
     /// If `value_length()` returns zero, this offset is actually the first byte of
     /// the next encoded value and should not be read.
-    pub fn value_offset(&self) -> usize {
+    pub fn value_body_offset(&self) -> usize {
         self.header_offset + self.header_length()
     }
 
     /// Returns an offset Range containing any bytes following the header.
-    pub fn value_range(&self) -> Range<usize> {
-        let start = self.value_offset();
-        let end = start + self.value_length;
+    pub fn value_body_range(&self) -> Range<usize> {
+        let start = self.value_body_offset();
+        let end = start + self.value_body_length;
         start..end
     }
 
     /// Returns the index of the first byte that is beyond the end of the current value's encoding.
     pub fn value_end_exclusive(&self) -> usize {
-        self.value_offset() + self.value_length
-    }
-
-    /// Returns the number of bytes used to encode this value's field ID, if present.
-    pub fn field_id_length(&self) -> Option<usize> {
-        self.field_id.as_ref()?;
-        Some(self.field_id_length as usize)
-    }
-
-    /// Returns the offset of the first byte used to encode this value's field ID, if present.
-    pub fn field_id_offset(&self) -> Option<usize> {
-        self.field_id.as_ref()?;
-        Some(
-            self.header_offset
-                - self.annotations_header_length as usize
-                - self.field_id_length as usize,
-        )
-    }
-
-    /// Returns an offset Range that contains the bytes used to encode this value's field ID,
-    /// if present.
-    pub fn field_id_range(&self) -> Option<Range<usize>> {
-        if let Some(start) = self.field_id_offset() {
-            let end = start + self.field_id_length as usize;
-            return Some(start..end);
-        }
-        None
+        self.value_body_offset() + self.value_body_length
     }
 
     /// Returns true if this encoded value has an annotations wrapper.
@@ -233,20 +202,28 @@ impl<HeaderType: EncodedHeader> EncodedValue<HeaderType> {
         None
     }
 
-    /// Returns the total number of bytes used to represent the current value, including the
-    /// field ID (if any), its annotations (if any), its header (type descriptor + length bytes),
-    /// and its value.
+    /// Returns the total number of bytes used to represent the current value, including
+    /// its annotations (if any), its header (type descriptor + length bytes), and the body of
+    /// the value.
     pub fn total_length(&self) -> usize {
         self.total_length
     }
 
     /// The offset Range (starting from the beginning of the stream) that contains this value's
-    /// complete encoding, including annotations. (It does not include the leading field ID, if
-    /// any.)
+    /// complete encoding, including annotations.
     pub fn annotated_value_range(&self) -> Range<usize> {
-        // [ field_id? | annotations? | header (type descriptor) | header_length? | value ]
+        // [ annotations? | header (type descriptor) | header_length? | value_body ]
+        let start = self.header_offset - self.annotations_header_length as usize;
+        let end = start + self.total_length;
+        start..end
+    }
+
+    /// The offset Range (starting from the beginning of the stream) that contains this value's
+    /// complete encoding, not including any annotations.
+    pub fn unannotated_value_range(&self) -> Range<usize> {
+        // [ annotations? | header (type descriptor) | header_length? | value_body ]
         let start = self.header_offset - self.annotations_header_length as usize;
-        let end = start - self.field_id_length as usize + self.total_length;
+        let end = start + self.total_length;
         start..end
     }
 
@@ -264,20 +241,18 @@ mod tests {
 
     #[test]
     fn accessors() -> IonResult<()> {
-        // 3-byte String with 1-byte annotation and field ID $10
+        // 3-byte String with 1-byte annotation
         let value = EncodedValue {
             header: Header {
                 ion_type: IonType::String,
                 ion_type_code: IonTypeCode::String,
                 length_code: 3,
             },
-            field_id_length: 1,
-            field_id: Some(10),
             annotations_header_length: 3,
             annotations_sequence_length: 1,
             header_offset: 200,
             length_length: 0,
-            value_length: 3,
+            value_body_length: 3,
             total_length: 7,
         };
         assert_eq!(value.ion_type(), IonType::String);
@@ -292,18 +267,15 @@ mod tests {
         assert_eq!(value.header_offset(), 200);
         assert_eq!(value.header_length(), 1);
         assert_eq!(value.header_range(), 200..201);
-        assert_eq!(value.field_id_length(), Some(1));
-        assert_eq!(value.field_id_offset(), Some(196));
-        assert_eq!(value.field_id_range(), Some(196..197));
         assert!(value.has_annotations());
         assert_eq!(value.annotations_range(), Some(197..200));
         assert_eq!(value.annotations_header_length(), Some(3));
         assert_eq!(value.annotations_sequence_offset(), Some(199));
         assert_eq!(value.annotations_sequence_length(), Some(1));
         assert_eq!(value.annotations_sequence_range(), Some(199..200));
-        assert_eq!(value.value_length(), 3);
-        assert_eq!(value.value_offset(), 201);
-        assert_eq!(value.value_range(), 201..204);
+        assert_eq!(value.value_body_length(), 3);
+        assert_eq!(value.value_body_offset(), 201);
+        assert_eq!(value.value_body_range(), 201..204);
         assert_eq!(value.value_end_exclusive(), 204);
         assert_eq!(value.total_length(), 7);
         Ok(())

diff --git a/src/lazy/binary/immutable_buffer.rs b/src/lazy/binary/immutable_buffer.rs
@@ -1,19 +1,25 @@
+use std::fmt::{Debug, Formatter};
+use std::mem;
+use std::ops::Range;
+
+use num_bigint::{BigInt, BigUint, Sign};
+
 use crate::binary::constants::v1_0::{length_codes, IVM};
 use crate::binary::int::DecodedInt;
 use crate::binary::uint::DecodedUInt;
 use crate::binary::var_int::VarInt;
 use crate::binary::var_uint::VarUInt;
 use crate::lazy::binary::encoded_value::EncodedValue;
+use crate::lazy::binary::raw::r#struct::LazyRawBinaryFieldName_1_0;
 use crate::lazy::binary::raw::type_descriptor::{Header, TypeDescriptor, ION_1_0_TYPE_DESCRIPTORS};
-use crate::lazy::binary::raw::value::LazyRawBinaryValue_1_0;
+use crate::lazy::binary::raw::value::{LazyRawBinaryValue_1_0, LazyRawBinaryVersionMarker_1_0};
+use crate::lazy::decoder::LazyRawFieldExpr;
 use crate::lazy::encoder::binary::v1_1::flex_int::FlexInt;
 use crate::lazy::encoder::binary::v1_1::flex_uint::FlexUInt;
+use crate::lazy::encoding::BinaryEncoding_1_0;
 use crate::result::IonFailure;
 use crate::types::UInt;
 use crate::{Int, IonError, IonResult, IonType};
-use num_bigint::{BigInt, BigUint, Sign};
-use std::fmt::{Debug, Formatter};
-use std::mem;
 
 // This limit is used for stack-allocating buffer space to encode/decode UInts.
 const UINT_STACK_BUFFER_SIZE: usize = 16;
@@ -69,7 +75,7 @@ impl<'a> ImmutableBuffer<'a> {
     }
 
     /// Returns a slice containing all of the buffer's bytes.
-    pub fn bytes(&self) -> &[u8] {
+    pub fn bytes(&self) -> &'a [u8] {
         self.data
     }
 
@@ -100,6 +106,10 @@ impl<'a> ImmutableBuffer<'a> {
         self.data.len()
     }
 
+    pub fn range(&self) -> Range<usize> {
+        self.offset..self.offset + self.len()
+    }
+
     /// Returns `true` if there are no bytes in the buffer. Otherwise, returns `false`.
     pub fn is_empty(&self) -> bool {
         self.data.is_empty()
@@ -143,15 +153,16 @@ impl<'a> ImmutableBuffer<'a> {
     /// returns an `Ok(_)` containing a `(major, minor)` version tuple.
     ///
     /// See: <https://amazon-ion.github.io/ion-docs/docs/binary.html#value-streams>
-    pub fn read_ivm(self) -> ParseResult<'a, (u8, u8)> {
+    pub fn read_ivm(self) -> ParseResult<'a, LazyRawBinaryVersionMarker_1_0<'a>> {
         let bytes = self
             .peek_n_bytes(IVM.len())
             .ok_or_else(|| IonError::incomplete("an IVM", self.offset()))?;
 
         match bytes {
             [0xE0, major, minor, 0xEA] => {
-                let version = (*major, *minor);
-                Ok((version, self.consume(IVM.len())))
+                let matched = ImmutableBuffer::new_with_offset(bytes, self.offset);
+                let marker = LazyRawBinaryVersionMarker_1_0::new(matched, *major, *minor);
+                Ok((marker, self.consume(IVM.len())))
             }
             invalid_ivm => IonResult::decoding_error(format!("invalid IVM: {invalid_ivm:?}")),
         }
@@ -607,7 +618,7 @@ impl<'a> ImmutableBuffer<'a> {
     }
 
     /// Reads a field ID and a value from the buffer.
-    pub(crate) fn peek_field(self) -> IonResult<Option<LazyRawBinaryValue_1_0<'a>>> {
+    pub(crate) fn peek_field(self) -> IonResult<Option<LazyRawFieldExpr<'a, BinaryEncoding_1_0>>> {
         let mut input = self;
         if self.is_empty() {
             // We're at the end of the struct
@@ -625,7 +636,7 @@ impl<'a> ImmutableBuffer<'a> {
         let mut type_descriptor = input_after_field_id.peek_type_descriptor()?;
         if type_descriptor.is_nop() {
             // Read past NOP fields until we find the first one that's an actual value
-            // or we run out of struct bytes. Note that we read the NOP field(s) from `self` (the
+            // or we run out of struct bytes. Note that we read the NOP field(s) from `input` (the
             // initial input) rather than `input_after_field_id` because it simplifies
             // the logic of `read_struct_field_nop_pad()`, which is very rarely called.
             (field_id_var_uint, input_after_field_id) = match input.read_struct_field_nop_pad()? {
@@ -643,15 +654,12 @@ impl<'a> ImmutableBuffer<'a> {
             };
         }
 
-        let field_id_length = field_id_var_uint.size_in_bytes() as u8;
         let field_id = field_id_var_uint.value();
+        let matched_field_id = input.slice(0, field_id_var_uint.size_in_bytes());
+        let field_name = LazyRawBinaryFieldName_1_0::new(field_id, matched_field_id);
 
-        let mut value = input_after_field_id.read_value(type_descriptor)?;
-        value.encoded_value.field_id = Some(field_id);
-        value.encoded_value.field_id_length = field_id_length;
-        value.encoded_value.total_length += field_id_length as usize;
-        value.input = input;
-        Ok(Some(value))
+        let field_value = input_after_field_id.read_value(type_descriptor)?;
+        Ok(Some(LazyRawFieldExpr::NameValue(field_name, field_value)))
     }
 
     #[cold]
@@ -745,15 +753,12 @@ impl<'a> ImmutableBuffer<'a> {
 
         let encoded_value = EncodedValue {
             header,
-            // If applicable, these are populated by the caller: `peek_field()`
-            field_id_length: 0,
-            field_id: None,
             // If applicable, these are populated by the caller: `read_annotated_value()`
             annotations_header_length: 0,
             annotations_sequence_length: 0,
             header_offset,
             length_length,
-            value_length,
+            value_body_length: value_length,
             total_length,
         };
         let lazy_value = LazyRawBinaryValue_1_0 {
@@ -810,10 +815,12 @@ pub struct AnnotationsWrapper {
 
 #[cfg(test)]
 mod tests {
-    use super::*;
-    use crate::IonError;
     use num_traits::Num;
 
+    use crate::IonError;
+
+    use super::*;
+
     fn input_test<A: AsRef<[u8]>>(input: A) {
         let input = ImmutableBuffer::new(input.as_ref());
         // We can peek at the first byte...