Skip to content

Commit

Permalink
Avoid copying when decoding UTF-8 or ASCII
Browse files Browse the repository at this point in the history
  • Loading branch information
kornelski committed Jan 7, 2025
1 parent c594256 commit dce3c57
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 3 deletions.
57 changes: 54 additions & 3 deletions src/rewritable_units/text_decoder.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::base::SharedEncoding;
use crate::rewriter::RewritingError;
use encoding_rs::{CoderResult, Decoder, Encoding};
use encoding_rs::{CoderResult, Decoder, Encoding, UTF_8};

pub(crate) struct TextDecoder {
encoding: SharedEncoding,
Expand All @@ -27,7 +27,6 @@ impl TextDecoder {
) -> Result<(), RewritingError> {
if self.pending_text_streaming_decoder.is_some() {
self.feed_text(&[], true, output_handler)?;
self.pending_text_streaming_decoder = None;
}
Ok(())
}
Expand All @@ -40,13 +39,25 @@ impl TextDecoder {
output_handler: &mut dyn FnMut(&str, bool, &'static Encoding) -> Result<(), RewritingError>,
) -> Result<(), RewritingError> {
let encoding = self.encoding.get();
let buffer = self.text_buffer.as_mut_str();

if let Some((utf8_text, rest)) = self.split_utf8_start(raw_input, encoding) {
raw_input = rest;
let really_last = last_in_text_node && rest.is_empty();

(output_handler)(utf8_text, really_last, encoding)?;

if really_last {
debug_assert!(self.pending_text_streaming_decoder.is_none());
return Ok(());
}
};

let decoder = self
.pending_text_streaming_decoder
.get_or_insert_with(|| encoding.new_decoder_without_bom_handling());

loop {
let buffer = self.text_buffer.as_mut_str();
let (status, read, written, ..) =
decoder.decode_to_str(raw_input, buffer, last_in_text_node);

Expand All @@ -60,9 +71,49 @@ impl TextDecoder {
}

if finished_decoding {
if last_in_text_node {
self.pending_text_streaming_decoder = None;
}
return Ok(());
}
raw_input = &raw_input[read..];
}
}

/// Zero-copy fast path: borrows the leading part of `raw_input` that is already
/// valid UTF-8 (or plain ASCII, for other encodings) instead of decoding it.
///
/// Returns the text that can be emitted as-is together with the bytes that
/// still have to go through the streaming decoder. Returns `None` when the
/// fast path must not be used, in which case the caller has to decode all of
/// `raw_input` the slow way.
#[inline]
fn split_utf8_start<'i>(
    &self,
    raw_input: &'i [u8],
    encoding: &'static Encoding,
) -> Option<(&'i str, &'i [u8])> {
    // An existing streaming decoder may be holding bytes from an earlier chunk,
    // so borrowing directly from this chunk could emit text out of order.
    if self.pending_text_streaming_decoder.is_some() {
        return None;
    }

    // Length of the leading run of bytes that needs no transcoding.
    let prefix_len = if encoding == UTF_8 {
        match std::str::from_utf8(raw_input) {
            // The whole chunk is valid UTF-8: emit all of it, nothing is left over.
            Ok(whole_text) => return Some((whole_text, &[][..])),
            Err(err) => err.valid_up_to(),
        }
    } else {
        debug_assert!(encoding.is_ascii_compatible());
        Encoding::ascii_valid_up_to(raw_input)
    };

    // The slow path buffers 1KB, and even though this shouldn't matter,
    // it is an observable behavior, and it makes bugs worse for text handlers
    // that assume they'll get only a single chunk. So a partial prefix shorter
    // than the text buffer is left entirely to the slow path.
    let covers_whole_input = prefix_len == raw_input.len();
    if !covers_whole_input && prefix_len < self.text_buffer.len() {
        return None;
    }

    // `prefix_len` should never exceed the input length; the checked split and
    // the re-validation simply bail out of the fast path if that ever fails.
    let (head, tail) = raw_input.split_at_checked(prefix_len)?;
    let text = std::str::from_utf8(head).ok()?;
    Some((text, tail))
}
}
1 change: 1 addition & 0 deletions src/rewritable_units/tokens/text_chunk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,7 @@ mod tests {
macro_rules! skip_eof_chunk {
($c:ident) => {
if $c.last_in_text_node() {
// The assertion below does not always hold — a replacement char for an incomplete UTF-8 sequence could be flushed in the last chunk
assert!($c.as_str().is_empty());
return;
}
Expand Down

0 comments on commit dce3c57

Please sign in to comment.