Skip to content

Commit

Permalink
Adds APIs for accessing encoding of raw stream items (amazon-ion#760)
Browse files Browse the repository at this point in the history
* Uses the bump allocator to handle text escape processing, allowing
  `RawSymbolTokenRef` to hold a reference to a `&'bump str` instead
  of potentially owning a `String`. This change allows the
  `RawSymbolTokenRef` type to implement `Copy`, which in turn allows
  all of the `LazyExpandedValue`- and `LazyValue`-related types
  to also implement `Copy`.
* Removes the `RawSymbolToken` type, which is now redundant to the
  `RawSymbolTokenRef` type.
* Adds a `Span` type that provides access to the input bytes that
  comprised various raw stream items.
* Adds a `LazyRawVersionMarker` trait and per-encoding impls that can
  provide a `Span` upon request.
* Adds a `LazyRawField` trait and per-encoding impls that can provide
  a `Span` upon request.
* Adds an `UnexpandedField` type that can represent both raw struct
  fields and struct fields from a template body. This simplified
  the code for expanding structs.
* Adds methods to convert container types back to the general value
  type.
* Adds `EncodedBinaryValueData_1_0` and `EncodedBinaryAnnotations_1_0`
  types that can be used to access spans and ranges for the various
  components of a binary 1.0 value.

This patch exposes many functions and types which we likely wish
to feature gate, but that change is being left for a future PR.
  • Loading branch information
zslayton authored May 7, 2024
1 parent b087e7f commit 892ef2d
Show file tree
Hide file tree
Showing 57 changed files with 2,540 additions and 1,739 deletions.
6 changes: 3 additions & 3 deletions examples/write_log_events.rs
Original file line number Diff line number Diff line change
Expand Up @@ -194,9 +194,9 @@ mod example {
.write(11, event.thread_id)?
.write(12, &event.thread_name)?
// v--- The fixed strings from the log statement are also SIDs
.write(13, RawSymbolToken::SymbolId(17))? // logger name
.write(14, RawSymbolToken::SymbolId(18))? // log level
.write(15, RawSymbolToken::SymbolId(19))? // format
.write(13, RawSymbolTokenRef::SymbolId(17))? // logger name
.write(14, RawSymbolTokenRef::SymbolId(18))? // log level
.write(15, RawSymbolTokenRef::SymbolId(19))? // format
.write(16, &event.parameters)?;
struct_.close()
}
Expand Down
469 changes: 335 additions & 134 deletions src/lazy/any_encoding.rs

Large diffs are not rendered by default.

108 changes: 40 additions & 68 deletions src/lazy/binary/encoded_value.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use crate::lazy::binary::raw::type_descriptor::Header;
use crate::types::SymbolId;
use crate::IonType;
use std::ops::Range;

Expand Down Expand Up @@ -53,33 +52,29 @@ pub(crate) struct EncodedValue<HeaderType: EncodedHeader> {
// and IonType.
pub(crate) header: HeaderType,

// Each encoded value has up to five components, appearing in the following order:
// Each encoded value has up to four components, appearing in the following order:
//
// [ field_id? | annotations? | header (type descriptor) | header_length? | value ]
// [ annotations? | header (type descriptor) | header_length? | value_body ]
//
// Components shown with a `?` are optional.
//
// EncodedValue stores the offset of the type descriptor byte from the beginning of the
// data source (`header_offset`). The lengths of the other fields can be used to calculate
// their positions relative to the type descriptor byte. For example, to find the offset of the
// field ID (if present), we can do:
// header_offset - annotations_header_length - field_id_length
// annotations header (if present), we can do:
// header_offset - annotations_header_length
//
// This allows us to store a single `usize` for the header offset, while other lengths can be
// packed into a `u8`. Values are not permitted to have a field ID or annotations that take
// more than 255 bytes to represent.
// packed into a `u8`. In this implementation, values are not permitted to have annotations that
// take more than 255 bytes to represent.
//
// We store the offset for the header byte because it is guaranteed to be present for all values.
// Field IDs and annotations appear earlier in the stream but are optional.

// The number of bytes used to encode the field ID (if present) preceding the Ion value. If
// `field_id` is undefined, `field_id_length` will be zero.
pub field_id_length: u8,
// If this value is inside a struct, `field_id` will contain the SymbolId that represents
// its field name.
pub field_id: Option<SymbolId>,
// Annotations appear earlier in the stream but are optional.

// The number of bytes used to encode the annotations wrapper (if present) preceding the Ion
// value. If `annotations` is empty, `annotations_header_length` will be zero.
// value. If `annotations` is empty, `annotations_header_length` will be zero. The annotations
// wrapper contains several fields: an opcode, a wrapper length, a sequence length, and the
// sequence itself.
pub annotations_header_length: u8,
// The number of bytes used to encode the series of symbol IDs inside the annotations wrapper.
pub annotations_sequence_length: u8,
Expand All @@ -89,9 +84,9 @@ pub(crate) struct EncodedValue<HeaderType: EncodedHeader> {
pub length_length: u8,
// The number of bytes used to encode the value itself, not including the header byte
// or length fields.
pub value_length: usize,
pub value_body_length: usize,
// The sum total of:
// field_id_length + annotations_header_length + header_length + value_length
// annotations_header_length + header_length + value_body_length
// While this can be derived from the above fields, storing it for reuse offers a modest
// optimization. `total_length` is needed when stepping into a value, skipping a value,
// and reading a value's data.
Expand Down Expand Up @@ -127,53 +122,27 @@ impl<HeaderType: EncodedHeader> EncodedValue<HeaderType> {
/// If the value can fit in the type descriptor byte (e.g. `true`, `false`, `null`, `0`),
/// this function will return 0.
#[inline(always)]
pub fn value_length(&self) -> usize {
self.value_length
pub fn value_body_length(&self) -> usize {
self.value_body_length
}

/// The offset of the first byte following the header (including length bytes, if present).
/// If `value_body_length()` returns zero, this offset is actually the first byte of
/// the next encoded value and should not be read.
pub fn value_offset(&self) -> usize {
pub fn value_body_offset(&self) -> usize {
self.header_offset + self.header_length()
}

/// Returns an offset Range containing any bytes following the header.
pub fn value_range(&self) -> Range<usize> {
let start = self.value_offset();
let end = start + self.value_length;
pub fn value_body_range(&self) -> Range<usize> {
let start = self.value_body_offset();
let end = start + self.value_body_length;
start..end
}

/// Returns the index of the first byte that is beyond the end of the current value's encoding.
pub fn value_end_exclusive(&self) -> usize {
self.value_offset() + self.value_length
}

/// Returns the number of bytes used to encode this value's field ID, if present.
pub fn field_id_length(&self) -> Option<usize> {
self.field_id.as_ref()?;
Some(self.field_id_length as usize)
}

/// Returns the offset of the first byte used to encode this value's field ID, if present.
pub fn field_id_offset(&self) -> Option<usize> {
self.field_id.as_ref()?;
Some(
self.header_offset
- self.annotations_header_length as usize
- self.field_id_length as usize,
)
}

/// Returns an offset Range that contains the bytes used to encode this value's field ID,
/// if present.
pub fn field_id_range(&self) -> Option<Range<usize>> {
if let Some(start) = self.field_id_offset() {
let end = start + self.field_id_length as usize;
return Some(start..end);
}
None
self.value_body_offset() + self.value_body_length
}

/// Returns true if this encoded value has an annotations wrapper.
Expand Down Expand Up @@ -233,20 +202,28 @@ impl<HeaderType: EncodedHeader> EncodedValue<HeaderType> {
None
}

/// Returns the total number of bytes used to represent the current value, including the
/// field ID (if any), its annotations (if any), its header (type descriptor + length bytes),
/// and its value.
/// Returns the total number of bytes used to represent the current value, including
/// its annotations (if any), its header (type descriptor + length bytes), and the body of
/// the value.
pub fn total_length(&self) -> usize {
self.total_length
}

/// The offset Range (starting from the beginning of the stream) that contains this value's
/// complete encoding, including annotations. (It does not include the leading field ID, if
/// any.)
/// complete encoding, including annotations.
pub fn annotated_value_range(&self) -> Range<usize> {
// [ field_id? | annotations? | header (type descriptor) | header_length? | value ]
// [ annotations? | header (type descriptor) | header_length? | value ]
let start = self.header_offset - self.annotations_header_length as usize;
let end = start + self.total_length;
start..end
}

/// The offset Range (starting from the beginning of the stream) that contains this value's
/// complete encoding, not including any annotations.
///
/// The returned range begins at the type descriptor byte (`header_offset`) and covers the
/// header (type descriptor + length bytes, if any) and the body of the value. Unlike
/// `annotated_value_range()`, it never includes the bytes of a leading annotations wrapper.
pub fn unannotated_value_range(&self) -> Range<usize> {
    // [ annotations? | header (type descriptor) | header_length? | value ]
    //                 ^-- start                                        ^-- end
    //
    // `total_length` includes the annotations wrapper header (when present), so that
    // length is subtracted to leave only the header and value body.
    let start = self.header_offset;
    let end = start + self.total_length - self.annotations_header_length as usize;
    start..end
}

Expand All @@ -264,20 +241,18 @@ mod tests {

#[test]
fn accessors() -> IonResult<()> {
// 3-byte String with 1-byte annotation and field ID $10
// 3-byte String with 1-byte annotation
let value = EncodedValue {
header: Header {
ion_type: IonType::String,
ion_type_code: IonTypeCode::String,
length_code: 3,
},
field_id_length: 1,
field_id: Some(10),
annotations_header_length: 3,
annotations_sequence_length: 1,
header_offset: 200,
length_length: 0,
value_length: 3,
value_body_length: 3,
total_length: 7,
};
assert_eq!(value.ion_type(), IonType::String);
Expand All @@ -292,18 +267,15 @@ mod tests {
assert_eq!(value.header_offset(), 200);
assert_eq!(value.header_length(), 1);
assert_eq!(value.header_range(), 200..201);
assert_eq!(value.field_id_length(), Some(1));
assert_eq!(value.field_id_offset(), Some(196));
assert_eq!(value.field_id_range(), Some(196..197));
assert!(value.has_annotations());
assert_eq!(value.annotations_range(), Some(197..200));
assert_eq!(value.annotations_header_length(), Some(3));
assert_eq!(value.annotations_sequence_offset(), Some(199));
assert_eq!(value.annotations_sequence_length(), Some(1));
assert_eq!(value.annotations_sequence_range(), Some(199..200));
assert_eq!(value.value_length(), 3);
assert_eq!(value.value_offset(), 201);
assert_eq!(value.value_range(), 201..204);
assert_eq!(value.value_body_length(), 3);
assert_eq!(value.value_body_offset(), 201);
assert_eq!(value.value_body_range(), 201..204);
assert_eq!(value.value_end_exclusive(), 204);
assert_eq!(value.total_length(), 7);
Ok(())
Expand Down
53 changes: 30 additions & 23 deletions src/lazy/binary/immutable_buffer.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,25 @@
use std::fmt::{Debug, Formatter};
use std::mem;
use std::ops::Range;

use num_bigint::{BigInt, BigUint, Sign};

use crate::binary::constants::v1_0::{length_codes, IVM};
use crate::binary::int::DecodedInt;
use crate::binary::uint::DecodedUInt;
use crate::binary::var_int::VarInt;
use crate::binary::var_uint::VarUInt;
use crate::lazy::binary::encoded_value::EncodedValue;
use crate::lazy::binary::raw::r#struct::LazyRawBinaryFieldName_1_0;
use crate::lazy::binary::raw::type_descriptor::{Header, TypeDescriptor, ION_1_0_TYPE_DESCRIPTORS};
use crate::lazy::binary::raw::value::LazyRawBinaryValue_1_0;
use crate::lazy::binary::raw::value::{LazyRawBinaryValue_1_0, LazyRawBinaryVersionMarker_1_0};
use crate::lazy::decoder::LazyRawFieldExpr;
use crate::lazy::encoder::binary::v1_1::flex_int::FlexInt;
use crate::lazy::encoder::binary::v1_1::flex_uint::FlexUInt;
use crate::lazy::encoding::BinaryEncoding_1_0;
use crate::result::IonFailure;
use crate::types::UInt;
use crate::{Int, IonError, IonResult, IonType};
use num_bigint::{BigInt, BigUint, Sign};
use std::fmt::{Debug, Formatter};
use std::mem;

// This limit is used for stack-allocating buffer space to encode/decode UInts.
const UINT_STACK_BUFFER_SIZE: usize = 16;
Expand Down Expand Up @@ -69,7 +75,7 @@ impl<'a> ImmutableBuffer<'a> {
}

/// Returns a slice containing all of the buffer's bytes.
pub fn bytes(&self) -> &[u8] {
pub fn bytes(&self) -> &'a [u8] {
self.data
}

Expand Down Expand Up @@ -100,6 +106,10 @@ impl<'a> ImmutableBuffer<'a> {
self.data.len()
}

/// Returns the range `offset..offset + len()` spanned by this buffer's bytes.
///
/// NOTE(review): `offset` is presumably the position of the buffer's first byte within the
/// overall input stream — confirm against `new_with_offset`.
pub fn range(&self) -> Range<usize> {
    let start = self.offset;
    let end = start + self.len();
    start..end
}

/// Returns `true` if there are no bytes in the buffer. Otherwise, returns `false`.
pub fn is_empty(&self) -> bool {
self.data.is_empty()
Expand Down Expand Up @@ -143,15 +153,16 @@ impl<'a> ImmutableBuffer<'a> {
/// returns an `Ok(_)` containing a `(major, minor)` version tuple.
///
/// See: <https://amazon-ion.github.io/ion-docs/docs/binary.html#value-streams>
pub fn read_ivm(self) -> ParseResult<'a, (u8, u8)> {
pub fn read_ivm(self) -> ParseResult<'a, LazyRawBinaryVersionMarker_1_0<'a>> {
let bytes = self
.peek_n_bytes(IVM.len())
.ok_or_else(|| IonError::incomplete("an IVM", self.offset()))?;

match bytes {
[0xE0, major, minor, 0xEA] => {
let version = (*major, *minor);
Ok((version, self.consume(IVM.len())))
let matched = ImmutableBuffer::new_with_offset(bytes, self.offset);
let marker = LazyRawBinaryVersionMarker_1_0::new(matched, *major, *minor);
Ok((marker, self.consume(IVM.len())))
}
invalid_ivm => IonResult::decoding_error(format!("invalid IVM: {invalid_ivm:?}")),
}
Expand Down Expand Up @@ -607,7 +618,7 @@ impl<'a> ImmutableBuffer<'a> {
}

/// Reads a field ID and a value from the buffer.
pub(crate) fn peek_field(self) -> IonResult<Option<LazyRawBinaryValue_1_0<'a>>> {
pub(crate) fn peek_field(self) -> IonResult<Option<LazyRawFieldExpr<'a, BinaryEncoding_1_0>>> {
let mut input = self;
if self.is_empty() {
// We're at the end of the struct
Expand All @@ -625,7 +636,7 @@ impl<'a> ImmutableBuffer<'a> {
let mut type_descriptor = input_after_field_id.peek_type_descriptor()?;
if type_descriptor.is_nop() {
// Read past NOP fields until we find the first one that's an actual value
// or we run out of struct bytes. Note that we read the NOP field(s) from `self` (the
// or we run out of struct bytes. Note that we read the NOP field(s) from `input` (the
// initial input) rather than `input_after_field_id` because it simplifies
// the logic of `read_struct_field_nop_pad()`, which is very rarely called.
(field_id_var_uint, input_after_field_id) = match input.read_struct_field_nop_pad()? {
Expand All @@ -643,15 +654,12 @@ impl<'a> ImmutableBuffer<'a> {
};
}

let field_id_length = field_id_var_uint.size_in_bytes() as u8;
let field_id = field_id_var_uint.value();
let matched_field_id = input.slice(0, field_id_var_uint.size_in_bytes());
let field_name = LazyRawBinaryFieldName_1_0::new(field_id, matched_field_id);

let mut value = input_after_field_id.read_value(type_descriptor)?;
value.encoded_value.field_id = Some(field_id);
value.encoded_value.field_id_length = field_id_length;
value.encoded_value.total_length += field_id_length as usize;
value.input = input;
Ok(Some(value))
let field_value = input_after_field_id.read_value(type_descriptor)?;
Ok(Some(LazyRawFieldExpr::NameValue(field_name, field_value)))
}

#[cold]
Expand Down Expand Up @@ -745,15 +753,12 @@ impl<'a> ImmutableBuffer<'a> {

let encoded_value = EncodedValue {
header,
// If applicable, these are populated by the caller: `peek_field()`
field_id_length: 0,
field_id: None,
// If applicable, these are populated by the caller: `read_annotated_value()`
annotations_header_length: 0,
annotations_sequence_length: 0,
header_offset,
length_length,
value_length,
value_body_length: value_length,
total_length,
};
let lazy_value = LazyRawBinaryValue_1_0 {
Expand Down Expand Up @@ -810,10 +815,12 @@ pub struct AnnotationsWrapper {

#[cfg(test)]
mod tests {
use super::*;
use crate::IonError;
use num_traits::Num;

use crate::IonError;

use super::*;

fn input_test<A: AsRef<[u8]>>(input: A) {
let input = ImmutableBuffer::new(input.as_ref());
// We can peek at the first byte...
Expand Down
Loading

0 comments on commit 892ef2d

Please sign in to comment.