From 847561484bfb38f7edc11da1e3c198bb901dfaa5 Mon Sep 17 00:00:00 2001 From: CrazyboyQCD Date: Fri, 19 Jul 2024 17:06:31 +0800 Subject: [PATCH 01/18] feat: add a way to create `JsString` from ASCII literal --- core/engine/src/string.rs | 49 ++++++++++++++++- core/string/src/lib.rs | 113 ++++++++++++++++++++++++++++++-------- 2 files changed, 136 insertions(+), 26 deletions(-) diff --git a/core/engine/src/string.rs b/core/engine/src/string.rs index cfddb9328ce..920ee786254 100644 --- a/core/engine/src/string.rs +++ b/core/engine/src/string.rs @@ -7,6 +7,15 @@ #[doc(inline)] pub use boa_string::*; +static_assertions::const_assert_eq!( + // SAFETY: + // Compiler will throw error if `transmute` does not meet the requirement. + unsafe { std::mem::transmute::, usize>(std::cell::Cell::new(0usize)) }, + // SAFETY: + // Compiler will throw error if `transmute` does not meet the requirement. + unsafe { std::mem::transmute::, usize>(None) } +); + /// Utility macro to create a [`JsString`]. /// /// # Examples @@ -54,9 +63,40 @@ macro_rules! js_string { () => { $crate::string::JsString::default() }; - ($s:literal) => { - $crate::string::JsString::from($crate::js_str!($s)) - }; + ($s:literal) => {{ + if $s.is_ascii() { + use $crate::string::JsStr; + + #[allow(clippy::items_after_statements)] + // Create a static `JsStr` that references an ASCII literal + static ORIGINAL_JS_STR: JsStr<'static> = JsStr::latin1($s.as_bytes()); + + #[allow(clippy::items_after_statements)] + // Use `[Option<&usize>; 2]` which has the same size with primitive `RawJsString` + // to represent `RawJsString` since `Cell` is unable to construct in static + // and `RawJsString` is private. + // With `Null Pointer Optimization` we could use `None` + // to represent `Cell(0usize)` to mark it as being created from ASCII literal. + static DUMMY_RAW_JS_STRING: &[Option<&usize>; 2] = &[ + // SAFETY: + // Reference of static variable is always valid to cast into an non-null pointer, + // And the primitive size of `RawJsString` is twice as large as `usize`. + Some(unsafe { &*std::ptr::addr_of!(ORIGINAL_JS_STR).cast::() }), + None, + ]; + #[allow(trivial_casts)] + // SAFETY: + // Reference of static variable is always valid to cast into non-null pointer, + // size of `[Option<&usize>; 2]` is equal to the primitive size of `RawJsString`. + unsafe { + $crate::string::JsString::from_opaque_ptr( + std::ptr::from_ref(DUMMY_RAW_JS_STRING) as *mut _ + ) + } + } else { + $crate::string::JsString::from($crate::js_str!($s)) + } + }}; ($s:expr) => { $crate::string::JsString::from($s) }; @@ -92,6 +132,9 @@ mod tests { #[test] fn refcount() { let x = js_string!("Hello world"); + assert_eq!(x.refcount(), None); + + let x = js_string!("你好"); assert_eq!(x.refcount(), Some(1)); { diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index a93cd29872b..6ecc93c72fd 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -152,14 +152,30 @@ impl CodePoint { /// The raw representation of a [`JsString`] in the heap. #[repr(C)] struct RawJsString { + /// A field represented for **`flag_and_len`** or **`pointer`**. + /// + /// ## `flag_and_len`: + /// ```text + /// ┌───────────────────────────────────────────────────┐ + /// │ length((usize::BITS - 1) bits) │ flag(1 bit) │ + /// └───────────────────────────────────────────────────┘ + /// `````` /// Contains the flags and Latin1/UTF-16 length. /// /// The latin1 flag is stored in the bottom bit. - flags_and_len: usize, + /// + /// ## `pointer`: + /// A pointer to a static `JsStr` that references an ASCII literal. + flags_and_len_or_ptr: usize, /// The number of references to the string. /// /// When this reaches `0` the string is deallocated. + /// + /// Since reference count of `RawJsString` created from `try_allocate_inner` + /// will only reach `0` in `drop`, + /// we can set reference count of `RawJsString` created from an ASCII literal to `0` as a mark, + /// see detail in `js_string` macro. refcount: Cell, /// An empty array which is used to get the offset of string data. @@ -170,12 +186,32 @@ impl RawJsString { const LATIN1_BITFLAG: usize = 1 << 0; const BITFLAG_COUNT: usize = 1; + /// This should be called to check if it is from an ASCII literal + /// before modifying reference count to avoid dropping static value + /// that might causes UB. + fn is_ascii_literal(&self) -> bool { + self.refcount.get() == 0 + } + + /// Returns `JsStr` from ASCII literal by using `flags_and_len` as pointer to dereference. + /// # Safety + /// + /// Caller must ensure that `RawJsString` is created from ASCII literal + /// so that pointer casting and dereferencing are valid. + unsafe fn ascii_literal_js_str(&self) -> JsStr<'static> { + // SAFETY: + // + // Caller must ensure that the `RawJsString` is created from ASCII literal + // so that pointer casting and dereferencing are valid. + unsafe { *(self.flags_and_len_or_ptr as *const _) } + } + const fn is_latin1(&self) -> bool { - (self.flags_and_len & Self::LATIN1_BITFLAG) != 0 + (self.flags_and_len_or_ptr & Self::LATIN1_BITFLAG) != 0 } const fn len(&self) -> usize { - self.flags_and_len >> Self::BITFLAG_COUNT + self.flags_and_len_or_ptr >> Self::BITFLAG_COUNT } const fn encode_flags_and_len(len: usize, latin1: bool) -> usize { @@ -221,6 +257,21 @@ impl<'a> IntoIterator for &'a JsString { } impl JsString { + /// Create a [`JsString`] from a raw opaque pointer + /// # Safety + /// + /// Caller must ensure the pointer is valid and the data pointed \ + /// has the same size and alignment of `RawJsString`. + #[must_use] + pub const unsafe fn from_opaque_ptr(src: *mut ()) -> Self { + JsString { + // SAFETY: + // Caller must ensure the pointer is valid and point to data + // with the same size and alignment of `RawJsString`. + ptr: unsafe { Tagged::from_ptr(src.cast()) }, + } + } + /// Create an iterator over the [`JsString`]. #[inline] #[must_use] @@ -250,19 +301,17 @@ impl JsString { // // - The lifetime of `&Self::Target` is shorter than the lifetime of `self`, as seen // by its signature, so this doesn't outlive `self`. + // + // - The `RawJsString` created from ASCII literal has a static lifetime `JsStr`. unsafe { - let h = h.as_ptr(); - - if (*h).is_latin1() { - JsStr::latin1(std::slice::from_raw_parts( - addr_of!((*h).data).cast(), - (*h).len(), - )) + let h = h.as_ref(); + if h.is_ascii_literal() { + return h.ascii_literal_js_str(); + } + if h.is_latin1() { + JsStr::latin1(std::slice::from_raw_parts(addr_of!(h.data).cast(), h.len())) } else { - JsStr::utf16(std::slice::from_raw_parts( - addr_of!((*h).data).cast(), - (*h).len(), - )) + JsStr::utf16(std::slice::from_raw_parts(addr_of!(h.data).cast(), h.len())) } } } @@ -665,7 +714,7 @@ impl JsString { unsafe { // Write the first part, the `RawJsString`. inner.as_ptr().write(RawJsString { - flags_and_len: RawJsString::encode_flags_and_len(str_len, latin1), + flags_and_len_or_ptr: RawJsString::encode_flags_and_len(str_len, latin1), refcount: Cell::new(1), data: [0; 0], }); @@ -749,8 +798,12 @@ impl JsString { // - The lifetime of `&Self::Target` is shorter than the lifetime of `self`, as seen // by its signature, so this doesn't outlive `self`. unsafe { - let h = h.as_ptr(); - (*h).len() + let h = h.as_ref(); + if h.is_ascii_literal() { + h.ascii_literal_js_str().len() + } else { + h.len() + } } } UnwrappedTagged::Tag(index) => { @@ -860,9 +913,13 @@ impl JsString { UnwrappedTagged::Ptr(inner) => { // SAFETY: The reference count of `JsString` guarantees that `inner` is always valid. let inner = unsafe { inner.as_ref() }; - Some(inner.refcount.get()) + if inner.is_ascii_literal() { + None + } else { + Some(inner.refcount.get()) + } } - UnwrappedTagged::Tag(_inner) => None, + UnwrappedTagged::Tag(_) => None, } } } @@ -873,11 +930,15 @@ impl Clone for JsString { if let UnwrappedTagged::Ptr(inner) = self.ptr.unwrap() { // SAFETY: The reference count of `JsString` guarantees that `raw` is always valid. let inner = unsafe { inner.as_ref() }; - let strong = inner.refcount.get().wrapping_add(1); - if strong == 0 { - abort() + + // Do not increase reference count when it is created from ASCII literal. + if !inner.is_ascii_literal() { + let strong = inner.refcount.get().wrapping_add(1); + if strong == 0 { + abort() + } + inner.refcount.set(strong); } - inner.refcount.set(strong); } Self { ptr: self.ptr } } @@ -898,6 +959,12 @@ impl Drop for JsString { // SAFETY: The reference count of `JsString` guarantees that `raw` is always valid. let inner = unsafe { raw.as_ref() }; + + // Do not drop `JsString` created from ASCII literal. + if inner.is_ascii_literal() { + return; + } + inner.refcount.set(inner.refcount.get() - 1); if inner.refcount.get() != 0 { return; From 8e0badc8473559f17b1dcef7f9335521ef4e6e38 Mon Sep 17 00:00:00 2001 From: CrazyboyQCD Date: Fri, 19 Jul 2024 17:41:50 +0800 Subject: [PATCH 02/18] chore: add todo comment to change `DUMMY_RAW_JS_STRING` to constant --- core/engine/src/string.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/core/engine/src/string.rs b/core/engine/src/string.rs index 920ee786254..cf949ce918f 100644 --- a/core/engine/src/string.rs +++ b/core/engine/src/string.rs @@ -77,6 +77,7 @@ macro_rules! js_string { // and `RawJsString` is private. // With `Null Pointer Optimization` we could use `None` // to represent `Cell(0usize)` to mark it as being created from ASCII literal. + // TODO: change to `const` once https://github.com/rust-lang/rust/issues/119618 is done. static DUMMY_RAW_JS_STRING: &[Option<&usize>; 2] = &[ // SAFETY: // Reference of static variable is always valid to cast into an non-null pointer, From 06f2b703521d341b0d99745c5960fcab5855672c Mon Sep 17 00:00:00 2001 From: CrazyboyQCD Date: Sun, 21 Jul 2024 13:33:07 +0800 Subject: [PATCH 03/18] chore: make `ORIGINAL_JS_STR` a static reference to reduce binary size --- core/engine/src/string.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/engine/src/string.rs b/core/engine/src/string.rs index cf949ce918f..f71c6b22033 100644 --- a/core/engine/src/string.rs +++ b/core/engine/src/string.rs @@ -69,7 +69,7 @@ macro_rules! js_string { #[allow(clippy::items_after_statements)] // Create a static `JsStr` that references an ASCII literal - static ORIGINAL_JS_STR: JsStr<'static> = JsStr::latin1($s.as_bytes()); + static ORIGINAL_JS_STR: &JsStr<'static> = &JsStr::latin1($s.as_bytes()); #[allow(clippy::items_after_statements)] // Use `[Option<&usize>; 2]` which has the same size with primitive `RawJsString` @@ -82,7 +82,7 @@ macro_rules! js_string { // SAFETY: // Reference of static variable is always valid to cast into an non-null pointer, // And the primitive size of `RawJsString` is twice as large as `usize`. - Some(unsafe { &*std::ptr::addr_of!(ORIGINAL_JS_STR).cast::() }), + Some(unsafe { &*std::ptr::addr_of!(*ORIGINAL_JS_STR).cast::() }), None, ]; #[allow(trivial_casts)] From 4b00447b96ef0ae6a8d9de6c71b761a56e338216 Mon Sep 17 00:00:00 2001 From: CrazyboyQCD Date: Sun, 21 Jul 2024 14:26:53 +0800 Subject: [PATCH 04/18] chore: replace `static` to `const` and simplify the code --- core/engine/src/string.rs | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/core/engine/src/string.rs b/core/engine/src/string.rs index f71c6b22033..13f2be3cc16 100644 --- a/core/engine/src/string.rs +++ b/core/engine/src/string.rs @@ -67,22 +67,16 @@ macro_rules! js_string { if $s.is_ascii() { use $crate::string::JsStr; - #[allow(clippy::items_after_statements)] - // Create a static `JsStr` that references an ASCII literal - static ORIGINAL_JS_STR: &JsStr<'static> = &JsStr::latin1($s.as_bytes()); - #[allow(clippy::items_after_statements)] // Use `[Option<&usize>; 2]` which has the same size with primitive `RawJsString` - // to represent `RawJsString` since `Cell` is unable to construct in static - // and `RawJsString` is private. + // to represent `RawJsString` because `RawJsString` is private. // With `Null Pointer Optimization` we could use `None` // to represent `Cell(0usize)` to mark it as being created from ASCII literal. - // TODO: change to `const` once https://github.com/rust-lang/rust/issues/119618 is done. - static DUMMY_RAW_JS_STRING: &[Option<&usize>; 2] = &[ + const DUMMY_RAW_JS_STRING: &[Option<&usize>; 2] = &[ // SAFETY: - // Reference of static variable is always valid to cast into an non-null pointer, - // And the primitive size of `RawJsString` is twice as large as `usize`. - Some(unsafe { &*std::ptr::addr_of!(*ORIGINAL_JS_STR).cast::() }), + // This transmutation is valid becuase of the rvalue static promotion + // and the primitive size of `RawJsString` is twice as large as `usize`. + Some(unsafe { std::mem::transmute(&JsStr::latin1($s.as_bytes())) }), None, ]; #[allow(trivial_casts)] From a12fc2320c89674905244892cc4119c9193a2a06 Mon Sep 17 00:00:00 2001 From: CrazyboyQCD Date: Sun, 21 Jul 2024 14:38:27 +0800 Subject: [PATCH 05/18] chore: add generic parameters to `transmute` --- core/engine/src/string.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/engine/src/string.rs b/core/engine/src/string.rs index 13f2be3cc16..6652cfa8dd4 100644 --- a/core/engine/src/string.rs +++ b/core/engine/src/string.rs @@ -76,7 +76,7 @@ macro_rules! js_string { // SAFETY: // This transmutation is valid becuase of the rvalue static promotion // and the primitive size of `RawJsString` is twice as large as `usize`. - Some(unsafe { std::mem::transmute(&JsStr::latin1($s.as_bytes())) }), + Some(unsafe { std::mem::transmute::<&'static JsStr<'static>, &usize>(&JsStr::latin1($s.as_bytes())) }), None, ]; #[allow(trivial_casts)] From 6143f768a4fdbc9f5cfb202bba29819d35946964 Mon Sep 17 00:00:00 2001 From: CrazyboyQCD Date: Tue, 23 Jul 2024 10:39:33 +0800 Subject: [PATCH 06/18] chore: try to use union to represent --- core/engine/src/string.rs | 17 ++-- core/string/src/lib.rs | 205 ++++++++++++++++++++++---------------- 2 files changed, 126 insertions(+), 96 deletions(-) diff --git a/core/engine/src/string.rs b/core/engine/src/string.rs index 6652cfa8dd4..72835f66eae 100644 --- a/core/engine/src/string.rs +++ b/core/engine/src/string.rs @@ -65,20 +65,21 @@ macro_rules! js_string { }; ($s:literal) => {{ if $s.is_ascii() { - use $crate::string::JsStr; + use $crate::string::RawJsString; + use $crate::string::StaticLatin1String; #[allow(clippy::items_after_statements)] // Use `[Option<&usize>; 2]` which has the same size with primitive `RawJsString` // to represent `RawJsString` because `RawJsString` is private. // With `Null Pointer Optimization` we could use `None` // to represent `Cell(0usize)` to mark it as being created from ASCII literal. - const DUMMY_RAW_JS_STRING: &[Option<&usize>; 2] = &[ - // SAFETY: - // This transmutation is valid becuase of the rvalue static promotion - // and the primitive size of `RawJsString` is twice as large as `usize`. - Some(unsafe { std::mem::transmute::<&'static JsStr<'static>, &usize>(&JsStr::latin1($s.as_bytes())) }), - None, - ]; + const DUMMY_RAW_JS_STRING: &RawJsString = &RawJsString { + repr: StaticLatin1String { + len: $s.len(), + zero_marker: 0, + pointer:$s.as_ptr(), + } + }; #[allow(trivial_casts)] // SAFETY: // Reference of static variable is always valid to cast into non-null pointer, diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index 6ecc93c72fd..34288179b55 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -34,10 +34,11 @@ pub use crate::{ }; use std::{ alloc::{alloc, dealloc, Layout}, - cell::Cell, + cell::UnsafeCell, convert::Infallible, hash::{Hash, Hasher}, iter::Peekable, + mem::ManuallyDrop, process::abort, ptr::{self, addr_of, addr_of_mut, NonNull}, str::FromStr, @@ -151,67 +152,78 @@ impl CodePoint { /// The raw representation of a [`JsString`] in the heap. #[repr(C)] -struct RawJsString { - /// A field represented for **`flag_and_len`** or **`pointer`**. - /// - /// ## `flag_and_len`: - /// ```text - /// ┌───────────────────────────────────────────────────┐ - /// │ length((usize::BITS - 1) bits) │ flag(1 bit) │ - /// └───────────────────────────────────────────────────┘ - /// `````` - /// Contains the flags and Latin1/UTF-16 length. - /// - /// The latin1 flag is stored in the bottom bit. - /// - /// ## `pointer`: - /// A pointer to a static `JsStr` that references an ASCII literal. - flags_and_len_or_ptr: usize, +struct RawJsStringInner { + flags_and_len: usize, + pub refcount: UnsafeCell, + data: [u16; 0], +} - /// The number of references to the string. - /// - /// When this reaches `0` the string is deallocated. - /// - /// Since reference count of `RawJsString` created from `try_allocate_inner` - /// will only reach `0` in `drop`, - /// we can set reference count of `RawJsString` created from an ASCII literal to `0` as a mark, - /// see detail in `js_string` macro. - refcount: Cell, +unsafe impl Sync for RawJsStringInner {} - /// An empty array which is used to get the offset of string data. - data: [u16; 0], +#[repr(C)] +#[derive(Clone, Copy)] +pub struct StaticLatin1String { + pub len: usize, // string length + pub zero_marker: usize, // guaranteed zero + pub pointer: *const u8, // pointer to static str +} + +impl StaticLatin1String { + fn is_ascii_literal(&self) -> bool { + self.zero_marker == 0 + } + + fn ascii_literal_js_str(&self) -> JsStr<'static> { + unsafe { + let Self { len, pointer, .. } = *self; + let s = std::slice::from_raw_parts(pointer, len); + JsStr::latin1(s) + } + } } +#[repr(C)] +pub union RawJsString { + pub repr: StaticLatin1String, + pub inner: ManuallyDrop, +} + +unsafe impl Sync for RawJsString {} + impl RawJsString { const LATIN1_BITFLAG: usize = 1 << 0; const BITFLAG_COUNT: usize = 1; - /// This should be called to check if it is from an ASCII literal - /// before modifying reference count to avoid dropping static value - /// that might causes UB. - fn is_ascii_literal(&self) -> bool { - self.refcount.get() == 0 + fn repr(&self) -> &StaticLatin1String { + unsafe { &self.repr } } - /// Returns `JsStr` from ASCII literal by using `flags_and_len` as pointer to dereference. - /// # Safety - /// - /// Caller must ensure that `RawJsString` is created from ASCII literal - /// so that pointer casting and dereferencing are valid. - unsafe fn ascii_literal_js_str(&self) -> JsStr<'static> { - // SAFETY: - // - // Caller must ensure that the `RawJsString` is created from ASCII literal - // so that pointer casting and dereferencing are valid. - unsafe { *(self.flags_and_len_or_ptr as *const _) } + fn inner_mut(&mut self) -> &mut RawJsStringInner { + unsafe { &mut self.inner } } - const fn is_latin1(&self) -> bool { - (self.flags_and_len_or_ptr & Self::LATIN1_BITFLAG) != 0 + fn inner(&self) -> &RawJsStringInner { + unsafe { &self.inner } } - const fn len(&self) -> usize { - self.flags_and_len_or_ptr >> Self::BITFLAG_COUNT + fn refcount(&self) -> &UnsafeCell { + &self.inner().refcount + } + + fn data(&self) -> *const [u16; 0] { + addr_of!(self.inner().data) + } + + fn data_mut(&mut self) -> *mut [u16; 0] { + addr_of_mut!(self.inner_mut().data) + } + + fn is_latin1(&self) -> bool { + (self.inner().flags_and_len & Self::LATIN1_BITFLAG) != 0 + } + + fn len(&self) -> usize { + self.inner().flags_and_len >> Self::BITFLAG_COUNT } const fn encode_flags_and_len(len: usize, latin1: bool) -> usize { @@ -296,7 +308,7 @@ impl JsString { // - The `RawJsString` type has all the necessary information to reconstruct a valid // slice (length and starting pointer). // - // - We aligned `h.data` on allocation, and the block is of size `h.len`, so this + // - We aligned `h.data(` on allocation, and the block is of size `h.len`, so this // should only generate valid reads. // // - The lifetime of `&Self::Target` is shorter than the lifetime of `self`, as seen @@ -304,14 +316,18 @@ impl JsString { // // - The `RawJsString` created from ASCII literal has a static lifetime `JsStr`. unsafe { - let h = h.as_ref(); - if h.is_ascii_literal() { - return h.ascii_literal_js_str(); + { + let h = h.as_ptr().cast_const(); + let static_h = *h.cast::(); + if static_h.is_ascii_literal() { + return static_h.ascii_literal_js_str(); + } } + let h = h.as_ref(); if h.is_latin1() { - JsStr::latin1(std::slice::from_raw_parts(addr_of!(h.data).cast(), h.len())) + JsStr::latin1(std::slice::from_raw_parts(h.data().cast(), h.len())) } else { - JsStr::utf16(std::slice::from_raw_parts(addr_of!(h.data).cast(), h.len())) + JsStr::utf16(std::slice::from_raw_parts(h.data().cast(), h.len())) } } } @@ -351,7 +367,7 @@ impl JsString { let string = { // SAFETY: `allocate_inner` guarantees that `ptr` is a valid pointer. - let mut data = unsafe { addr_of_mut!((*ptr.as_ptr()).data).cast::() }; + let mut data = unsafe { (*ptr.as_ptr()).data_mut().cast::() }; for &string in strings { // SAFETY: // The sum of all `count` for each `string` equals `full_count`, and since we're @@ -714,9 +730,11 @@ impl JsString { unsafe { // Write the first part, the `RawJsString`. inner.as_ptr().write(RawJsString { - flags_and_len_or_ptr: RawJsString::encode_flags_and_len(str_len, latin1), - refcount: Cell::new(1), - data: [0; 0], + inner: ManuallyDrop::new(RawJsStringInner { + flags_and_len: RawJsString::encode_flags_and_len(str_len, latin1), + refcount: UnsafeCell::new(1), + data: [0; 0], + }), }); } @@ -729,12 +747,7 @@ impl JsString { // and since we requested an `RawJsString` layout with a trailing // `[u16; str_len]`, the memory of the array must be in the `usize` // range for the allocation to succeed. - unsafe { - ptr::eq( - inner.cast::().add(offset).cast(), - (*inner).data.as_mut_ptr(), - ) - } + unsafe { ptr::eq(inner.cast::().add(offset).cast(), (*inner).data_mut()) } }); Ok(inner) @@ -746,7 +759,7 @@ impl JsString { let ptr = Self::allocate_inner(count, string.is_latin1()); // SAFETY: `allocate_inner` guarantees that `ptr` is a valid pointer. - let data = unsafe { addr_of_mut!((*ptr.as_ptr()).data).cast::() }; + let data = unsafe { (*ptr.as_ptr()).data_mut().cast::() }; // SAFETY: // - We read `count = data.len()` elements from `data`, which is within the bounds of the slice. @@ -792,18 +805,21 @@ impl JsString { // - The `RawJsString` type has all the necessary information to reconstruct a valid // slice (length and starting pointer). // - // - We aligned `h.data` on allocation, and the block is of size `h.len`, so this + // - We aligned `h.data(` on allocation, and the block is of size `h.len`, so this // should only generate valid reads. // // - The lifetime of `&Self::Target` is shorter than the lifetime of `self`, as seen // by its signature, so this doesn't outlive `self`. unsafe { - let h = h.as_ref(); - if h.is_ascii_literal() { - h.ascii_literal_js_str().len() - } else { - h.len() + { + let h = h.as_ptr().cast_const(); + let static_h = *h.cast::(); + if static_h.is_ascii_literal() { + return static_h.ascii_literal_js_str().len(); + } } + let h = h.as_ref(); + h.len() } } UnwrappedTagged::Tag(index) => { @@ -912,12 +928,15 @@ impl JsString { match self.ptr.unwrap() { UnwrappedTagged::Ptr(inner) => { // SAFETY: The reference count of `JsString` guarantees that `inner` is always valid. - let inner = unsafe { inner.as_ref() }; - if inner.is_ascii_literal() { - None - } else { - Some(inner.refcount.get()) + unsafe { + let h = inner.as_ptr().cast_const(); + let static_h = *h.cast::(); + if static_h.is_ascii_literal() { + return None; + } } + let inner = unsafe { inner.as_ref() }; + unsafe { Some(*inner.refcount().get()) } } UnwrappedTagged::Tag(_) => None, } @@ -929,15 +948,19 @@ impl Clone for JsString { fn clone(&self) -> Self { if let UnwrappedTagged::Ptr(inner) = self.ptr.unwrap() { // SAFETY: The reference count of `JsString` guarantees that `raw` is always valid. - let inner = unsafe { inner.as_ref() }; // Do not increase reference count when it is created from ASCII literal. - if !inner.is_ascii_literal() { - let strong = inner.refcount.get().wrapping_add(1); - if strong == 0 { - abort() + unsafe { + let h = inner.as_ptr().cast_const(); + let static_h = *h.cast::(); + if !static_h.is_ascii_literal() { + let inner = unsafe { inner.as_ref() }; + let strong = (*inner.refcount().get()).wrapping_add(1); + if strong == 0 { + abort() + } + *inner.refcount().get() = strong; } - inner.refcount.set(strong); } } Self { ptr: self.ptr } @@ -958,16 +981,22 @@ impl Drop for JsString { // See https://doc.rust-lang.org/src/alloc/sync.rs.html#1672 for details. // SAFETY: The reference count of `JsString` guarantees that `raw` is always valid. - let inner = unsafe { raw.as_ref() }; // Do not drop `JsString` created from ASCII literal. - if inner.is_ascii_literal() { - return; + unsafe { + let h = raw.as_ptr().cast_const(); + let static_h = *h.cast::(); + if static_h.is_ascii_literal() { + return; + } } - inner.refcount.set(inner.refcount.get() - 1); - if inner.refcount.get() != 0 { - return; + let inner = unsafe { raw.as_ref() }; + unsafe { + *inner.refcount().get() = *inner.refcount().get() - 1; + if *inner.refcount().get() != 0 { + return; + } } // SAFETY: From 7dff145d917ce775e85ec1263e8c34ea3a163beb Mon Sep 17 00:00:00 2001 From: jedel1043 Date: Mon, 22 Jul 2024 23:11:30 -0600 Subject: [PATCH 07/18] chore: make miri happy --- core/engine/src/string.rs | 31 +---- core/string/src/lib.rs | 258 ++++++++++++++++---------------------- core/string/src/str.rs | 4 +- core/string/src/tests.rs | 26 +++- 4 files changed, 136 insertions(+), 183 deletions(-) diff --git a/core/engine/src/string.rs b/core/engine/src/string.rs index 72835f66eae..d21ec50569e 100644 --- a/core/engine/src/string.rs +++ b/core/engine/src/string.rs @@ -64,34 +64,9 @@ macro_rules! js_string { $crate::string::JsString::default() }; ($s:literal) => {{ - if $s.is_ascii() { - use $crate::string::RawJsString; - use $crate::string::StaticLatin1String; - - #[allow(clippy::items_after_statements)] - // Use `[Option<&usize>; 2]` which has the same size with primitive `RawJsString` - // to represent `RawJsString` because `RawJsString` is private. - // With `Null Pointer Optimization` we could use `None` - // to represent `Cell(0usize)` to mark it as being created from ASCII literal. - const DUMMY_RAW_JS_STRING: &RawJsString = &RawJsString { - repr: StaticLatin1String { - len: $s.len(), - zero_marker: 0, - pointer:$s.as_ptr(), - } - }; - #[allow(trivial_casts)] - // SAFETY: - // Reference of static variable is always valid to cast into non-null pointer, - // size of `[Option<&usize>; 2]` is equal to the primitive size of `RawJsString`. - unsafe { - $crate::string::JsString::from_opaque_ptr( - std::ptr::from_ref(DUMMY_RAW_JS_STRING) as *mut _ - ) - } - } else { - $crate::string::JsString::from($crate::js_str!($s)) - } + static LITERAL: $crate::string::StaticJsString = $crate::string::StaticJsString::new($crate::js_str!($s)); + + $crate::string::JsString::from_static_js_string(&LITERAL) }}; ($s:expr) => { $crate::string::JsString::from($s) diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index 34288179b55..45fc92ed56b 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -34,11 +34,10 @@ pub use crate::{ }; use std::{ alloc::{alloc, dealloc, Layout}, - cell::UnsafeCell, + cell::Cell, convert::Infallible, hash::{Hash, Hasher}, iter::Peekable, - mem::ManuallyDrop, process::abort, ptr::{self, addr_of, addr_of_mut, NonNull}, str::FromStr, @@ -150,84 +149,68 @@ impl CodePoint { } } -/// The raw representation of a [`JsString`] in the heap. -#[repr(C)] -struct RawJsStringInner { - flags_and_len: usize, - pub refcount: UnsafeCell, - data: [u16; 0], -} - -unsafe impl Sync for RawJsStringInner {} - -#[repr(C)] -#[derive(Clone, Copy)] -pub struct StaticLatin1String { - pub len: usize, // string length - pub zero_marker: usize, // guaranteed zero - pub pointer: *const u8, // pointer to static str -} +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] +struct TaggedLen(usize); -impl StaticLatin1String { - fn is_ascii_literal(&self) -> bool { - self.zero_marker == 0 - } - - fn ascii_literal_js_str(&self) -> JsStr<'static> { - unsafe { - let Self { len, pointer, .. } = *self; - let s = std::slice::from_raw_parts(pointer, len); - JsStr::latin1(s) - } - } -} - -#[repr(C)] -pub union RawJsString { - pub repr: StaticLatin1String, - pub inner: ManuallyDrop, -} - -unsafe impl Sync for RawJsString {} - -impl RawJsString { +impl TaggedLen { const LATIN1_BITFLAG: usize = 1 << 0; const BITFLAG_COUNT: usize = 1; - fn repr(&self) -> &StaticLatin1String { - unsafe { &self.repr } + const fn new(len: usize, latin1: bool) -> Self { + Self((len << Self::BITFLAG_COUNT) | (latin1 as usize)) } - fn inner_mut(&mut self) -> &mut RawJsStringInner { - unsafe { &mut self.inner } + const fn is_latin1(self) -> bool { + (self.0 & Self::LATIN1_BITFLAG) != 0 } - fn inner(&self) -> &RawJsStringInner { - unsafe { &self.inner } + const fn len(self) -> usize { + self.0 >> Self::BITFLAG_COUNT } +} - fn refcount(&self) -> &UnsafeCell { - &self.inner().refcount - } +pub struct StaticJsString { + tagged_len: TaggedLen, + _zero: Cell, + ptr: *const u8, +} - fn data(&self) -> *const [u16; 0] { - addr_of!(self.inner().data) +// SAFETY: This is Sync because reads to `_zero` will always read 0 and +// `ptr` cannot be mutated thanks to the 'static requirement. +unsafe impl Sync for StaticJsString {} + +impl StaticJsString { + pub const fn new(string: JsStr<'static>) -> StaticJsString { + match string.variant() { + JsStrVariant::Latin1(l) => StaticJsString { + tagged_len: TaggedLen::new(l.len(), true), + _zero: Cell::new(0), + ptr: l.as_ptr(), + }, + JsStrVariant::Utf16(u) => StaticJsString { + tagged_len: TaggedLen::new(u.len(), false), + _zero: Cell::new(0), + ptr: u.as_ptr().cast(), + }, + } } +} - fn data_mut(&mut self) -> *mut [u16; 0] { - addr_of_mut!(self.inner_mut().data) - } +/// The raw representation of a [`JsString`] in the heap. +#[repr(C)] +struct RawJsString { + tagged_len: TaggedLen, + refcount: Cell, + data: [u8; 0], +} +impl RawJsString { fn is_latin1(&self) -> bool { - (self.inner().flags_and_len & Self::LATIN1_BITFLAG) != 0 + self.tagged_len.is_latin1() } fn len(&self) -> usize { - self.inner().flags_and_len >> Self::BITFLAG_COUNT - } - - const fn encode_flags_and_len(len: usize, latin1: bool) -> usize { - (len << Self::BITFLAG_COUNT) | (latin1 as usize) + self.tagged_len.len() } } @@ -269,18 +252,15 @@ impl<'a> IntoIterator for &'a JsString { } impl JsString { - /// Create a [`JsString`] from a raw opaque pointer - /// # Safety - /// - /// Caller must ensure the pointer is valid and the data pointed \ - /// has the same size and alignment of `RawJsString`. + /// Create a [`JsString`] from a static js string. #[must_use] - pub const unsafe fn from_opaque_ptr(src: *mut ()) -> Self { + pub const fn from_static_js_string(src: &'static StaticJsString) -> Self { + let src = ptr::from_ref(src).cast::(); JsString { // SAFETY: // Caller must ensure the pointer is valid and point to data // with the same size and alignment of `RawJsString`. - ptr: unsafe { Tagged::from_ptr(src.cast()) }, + ptr: unsafe { Tagged::from_ptr(src.cast_mut()) }, } } @@ -308,7 +288,7 @@ impl JsString { // - The `RawJsString` type has all the necessary information to reconstruct a valid // slice (length and starting pointer). // - // - We aligned `h.data(` on allocation, and the block is of size `h.len`, so this + // - We aligned `h.data()` on allocation, and the block is of size `h.len`, so this // should only generate valid reads. // // - The lifetime of `&Self::Target` is shorter than the lifetime of `self`, as seen @@ -316,18 +296,27 @@ impl JsString { // // - The `RawJsString` created from ASCII literal has a static lifetime `JsStr`. unsafe { - { - let h = h.as_ptr().cast_const(); - let static_h = *h.cast::(); - if static_h.is_ascii_literal() { - return static_h.ascii_literal_js_str(); - } + let h = h.as_ptr(); + if (*h).refcount.get() == 0 { + let h = h.cast::(); + return if (*h).tagged_len.is_latin1() { + JsStr::latin1(std::slice::from_raw_parts( + (*h).ptr, + (*h).tagged_len.len(), + )) + } else { + JsStr::utf16(std::slice::from_raw_parts( + (*h).ptr.cast(), + (*h).tagged_len.len(), + )) + }; } - let h = h.as_ref(); - if h.is_latin1() { - JsStr::latin1(std::slice::from_raw_parts(h.data().cast(), h.len())) + + let len = (*h).len(); + if (*h).is_latin1() { + JsStr::latin1(std::slice::from_raw_parts(addr_of!((*h).data).cast(), len)) } else { - JsStr::utf16(std::slice::from_raw_parts(h.data().cast(), h.len())) + JsStr::utf16(std::slice::from_raw_parts(addr_of!((*h).data).cast(), len)) } } } @@ -367,7 +356,7 @@ impl JsString { let string = { // SAFETY: `allocate_inner` guarantees that `ptr` is a valid pointer. - let mut data = unsafe { (*ptr.as_ptr()).data_mut().cast::() }; + let mut data = unsafe { addr_of_mut!((*ptr.as_ptr()).data).cast::() }; for &string in strings { // SAFETY: // The sum of all `count` for each `string` equals `full_count`, and since we're @@ -730,11 +719,9 @@ impl JsString { unsafe { // Write the first part, the `RawJsString`. inner.as_ptr().write(RawJsString { - inner: ManuallyDrop::new(RawJsStringInner { - flags_and_len: RawJsString::encode_flags_and_len(str_len, latin1), - refcount: UnsafeCell::new(1), - data: [0; 0], - }), + tagged_len: TaggedLen::new(str_len, latin1), + refcount: Cell::new(1), + data: [0; 0], }); } @@ -747,7 +734,12 @@ impl JsString { // and since we requested an `RawJsString` layout with a trailing // `[u16; str_len]`, the memory of the array must be in the `usize` // range for the allocation to succeed. - unsafe { ptr::eq(inner.cast::().add(offset).cast(), (*inner).data_mut()) } + unsafe { + ptr::eq( + inner.cast::().add(offset).cast(), + (*inner).data.as_mut_ptr(), + ) + } }); Ok(inner) @@ -759,7 +751,7 @@ impl JsString { let ptr = Self::allocate_inner(count, string.is_latin1()); // SAFETY: `allocate_inner` guarantees that `ptr` is a valid pointer. - let data = unsafe { (*ptr.as_ptr()).data_mut().cast::() }; + let data = unsafe { addr_of_mut!((*ptr.as_ptr()).data).cast::() }; // SAFETY: // - We read `count = data.len()` elements from `data`, which is within the bounds of the slice. @@ -799,35 +791,7 @@ impl JsString { #[inline] #[must_use] pub fn len(&self) -> usize { - match self.ptr.unwrap() { - UnwrappedTagged::Ptr(h) => { - // SAFETY: - // - The `RawJsString` type has all the necessary information to reconstruct a valid - // slice (length and starting pointer). - // - // - We aligned `h.data(` on allocation, and the block is of size `h.len`, so this - // should only generate valid reads. - // - // - The lifetime of `&Self::Target` is shorter than the lifetime of `self`, as seen - // by its signature, so this doesn't outlive `self`. - unsafe { - { - let h = h.as_ptr().cast_const(); - let static_h = *h.cast::(); - if static_h.is_ascii_literal() { - return static_h.ascii_literal_js_str().len(); - } - } - let h = h.as_ref(); - h.len() - } - } - UnwrappedTagged::Tag(index) => { - // SAFETY: all static strings are valid indices on `STATIC_JS_STRINGS`, so `get` should always - // return `Some`. - unsafe { StaticJsStrings::get(index).unwrap_unchecked().len() } - } - } + self.as_str().len() } /// Return true if the [`JsString`] is emtpy. @@ -928,17 +892,15 @@ impl JsString { match self.ptr.unwrap() { UnwrappedTagged::Ptr(inner) => { // SAFETY: The reference count of `JsString` guarantees that `inner` is always valid. - unsafe { - let h = inner.as_ptr().cast_const(); - let static_h = *h.cast::(); - if static_h.is_ascii_literal() { - return None; - } - } let inner = unsafe { inner.as_ref() }; - unsafe { Some(*inner.refcount().get()) } + let rc = inner.refcount.get(); + if rc == 0 { + None + } else { + Some(rc) + } } - UnwrappedTagged::Tag(_) => None, + UnwrappedTagged::Tag(_inner) => None, } } } @@ -948,20 +910,18 @@ impl Clone for JsString { fn clone(&self) -> Self { if let UnwrappedTagged::Ptr(inner) = self.ptr.unwrap() { // SAFETY: The reference count of `JsString` guarantees that `raw` is always valid. + let inner = unsafe { inner.as_ref() }; + let rc = inner.refcount.get(); + if rc == 0 { + // pointee is a static string + return Self { ptr: self.ptr }; + } - // Do not increase reference count when it is created from ASCII literal. - unsafe { - let h = inner.as_ptr().cast_const(); - let static_h = *h.cast::(); - if !static_h.is_ascii_literal() { - let inner = unsafe { inner.as_ref() }; - let strong = (*inner.refcount().get()).wrapping_add(1); - if strong == 0 { - abort() - } - *inner.refcount().get() = strong; - } + let strong = rc.wrapping_add(1); + if strong == 0 { + abort() } + inner.refcount.set(strong); } Self { ptr: self.ptr } } @@ -981,22 +941,16 @@ impl Drop for JsString { // See https://doc.rust-lang.org/src/alloc/sync.rs.html#1672 for details. // SAFETY: The reference count of `JsString` guarantees that `raw` is always valid. - - // Do not drop `JsString` created from ASCII literal. - unsafe { - let h = raw.as_ptr().cast_const(); - let static_h = *h.cast::(); - if static_h.is_ascii_literal() { - return; - } + let inner = unsafe { raw.as_ref() }; + let refcount = inner.refcount.get(); + if refcount == 0 { + // Just a static string. No need to drop. + return; } - let inner = unsafe { raw.as_ref() }; - unsafe { - *inner.refcount().get() = *inner.refcount().get() - 1; - if *inner.refcount().get() != 0 { - return; - } + inner.refcount.set(refcount - 1); + if inner.refcount.get() != 0 { + return; } // SAFETY: diff --git a/core/string/src/str.rs b/core/string/src/str.rs index d851307dbae..c42821558fd 100644 --- a/core/string/src/str.rs +++ b/core/string/src/str.rs @@ -87,14 +87,14 @@ impl<'a> JsStr<'a> { /// Return the inner [`JsStrVariant`] varient of the [`JsStr`]. #[inline] #[must_use] - pub fn variant(self) -> JsStrVariant<'a> { + pub const fn variant(self) -> JsStrVariant<'a> { self.inner } /// Check if the [`JsStr`] is latin1 encoded. #[inline] #[must_use] - pub fn is_latin1(&self) -> bool { + pub const fn is_latin1(&self) -> bool { matches!(self.inner, JsStrVariant::Latin1(_)) } diff --git a/core/string/src/tests.rs b/core/string/src/tests.rs index 61a23cfa35b..508021ddcc3 100644 --- a/core/string/src/tests.rs +++ b/core/string/src/tests.rs @@ -2,7 +2,7 @@ use std::hash::{BuildHasher, BuildHasherDefault, Hash}; -use crate::{JsStr, JsString, StaticJsStrings}; +use crate::{JsStr, JsString, StaticJsString, StaticJsStrings}; use rustc_hash::FxHasher; @@ -173,3 +173,27 @@ fn conversion_to_known_static_js_string() { assert!(string.is_some()); assert!(string.unwrap().as_str().is_latin1()); } + +#[test] +fn from_static_js_string() { + static STATIC_HELLO_WORLD: StaticJsString = + StaticJsString::new(JsStr::latin1("hello world".as_bytes())); + static STATIC_EMOJIS: StaticJsString = + StaticJsString::new(JsStr::utf16(&[0xD83C, 0xDFB9, 0xD83C, 0xDFB6, 0xD83C, 0xDFB5])); // 🎹🎶🎵 + let latin1 = JsString::from_static_js_string(&STATIC_HELLO_WORLD); + let utf16 = JsString::from_static_js_string(&STATIC_EMOJIS); + + assert_eq!(latin1.as_str(), "hello world"); + assert_eq!(utf16.as_str(), "🎹🎶🎵"); + + let clone = latin1.clone(); + + assert_eq!(clone, latin1); + + let clone = utf16.clone(); + + assert_eq!(clone, utf16); + + assert!(latin1.refcount().is_none()); + assert!(utf16.refcount().is_none()); +} From 5984536976347d0b0525f5322253c229389bfcf2 Mon Sep 17 00:00:00 2001 From: jedel1043 Date: Tue, 23 Jul 2024 02:18:46 -0600 Subject: [PATCH 08/18] Optimize const accesses for static strings --- core/string/src/lib.rs | 45 ++++++++++++++++++++++++++-------------- core/string/src/tests.rs | 5 +++-- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index 45fc92ed56b..82251a49e24 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -38,6 +38,7 @@ use std::{ convert::Infallible, hash::{Hash, Hasher}, iter::Peekable, + mem::ManuallyDrop, process::abort, ptr::{self, addr_of, addr_of_mut, NonNull}, str::FromStr, @@ -171,7 +172,7 @@ impl TaggedLen { pub struct StaticJsString { tagged_len: TaggedLen, - _zero: Cell, + _zero: usize, ptr: *const u8, } @@ -184,23 +185,28 @@ impl StaticJsString { match string.variant() { JsStrVariant::Latin1(l) => StaticJsString { tagged_len: TaggedLen::new(l.len(), true), - _zero: Cell::new(0), + _zero: 0, ptr: l.as_ptr(), }, JsStrVariant::Utf16(u) => StaticJsString { tagged_len: TaggedLen::new(u.len(), false), - _zero: Cell::new(0), + _zero: 0, ptr: u.as_ptr().cast(), }, } } } +union RefCount { + read_only: usize, + read_write: ManuallyDrop>, +} + /// The raw representation of a [`JsString`] in the heap. #[repr(C)] struct RawJsString { tagged_len: TaggedLen, - refcount: Cell, + refcount: RefCount, data: [u8; 0], } @@ -297,7 +303,7 @@ impl JsString { // - The `RawJsString` created from ASCII literal has a static lifetime `JsStr`. unsafe { let h = h.as_ptr(); - if (*h).refcount.get() == 0 { + if (*h).refcount.read_only == 0 { let h = h.cast::(); return if (*h).tagged_len.is_latin1() { JsStr::latin1(std::slice::from_raw_parts( @@ -720,7 +726,9 @@ impl JsString { // Write the first part, the `RawJsString`. inner.as_ptr().write(RawJsString { tagged_len: TaggedLen::new(str_len, latin1), - refcount: Cell::new(1), + refcount: RefCount { + read_write: ManuallyDrop::new(Cell::new(1)), + }, data: [0; 0], }); } @@ -892,8 +900,7 @@ impl JsString { match self.ptr.unwrap() { UnwrappedTagged::Ptr(inner) => { // SAFETY: The reference count of `JsString` guarantees that `inner` is always valid. - let inner = unsafe { inner.as_ref() }; - let rc = inner.refcount.get(); + let rc = unsafe { (*inner.as_ptr()).refcount.read_only }; if rc == 0 { None } else { @@ -910,18 +917,21 @@ impl Clone for JsString { fn clone(&self) -> Self { if let UnwrappedTagged::Ptr(inner) = self.ptr.unwrap() { // SAFETY: The reference count of `JsString` guarantees that `raw` is always valid. - let inner = unsafe { inner.as_ref() }; - let rc = inner.refcount.get(); + let rc = unsafe { (*inner.as_ptr()).refcount.read_only }; if rc == 0 { // pointee is a static string return Self { ptr: self.ptr }; } + let inner = unsafe { inner.as_ref() }; + let strong = rc.wrapping_add(1); if strong == 0 { abort() } - inner.refcount.set(strong); + unsafe { + inner.refcount.read_write.set(strong); + } } Self { ptr: self.ptr } } @@ -941,16 +951,19 @@ impl Drop for JsString { // See https://doc.rust-lang.org/src/alloc/sync.rs.html#1672 for details. // SAFETY: The reference count of `JsString` guarantees that `raw` is always valid. - let inner = unsafe { raw.as_ref() }; - let refcount = inner.refcount.get(); + let refcount = unsafe { (*raw.as_ptr()).refcount.read_only }; if refcount == 0 { // Just a static string. No need to drop. return; } - inner.refcount.set(refcount - 1); - if inner.refcount.get() != 0 { - return; + let inner = unsafe { raw.as_ref() }; + + unsafe { + inner.refcount.read_write.set(refcount - 1); + if inner.refcount.read_write.get() != 0 { + return; + } } // SAFETY: diff --git a/core/string/src/tests.rs b/core/string/src/tests.rs index 508021ddcc3..c9221095c5f 100644 --- a/core/string/src/tests.rs +++ b/core/string/src/tests.rs @@ -178,8 +178,9 @@ fn conversion_to_known_static_js_string() { fn from_static_js_string() { static STATIC_HELLO_WORLD: StaticJsString = StaticJsString::new(JsStr::latin1("hello world".as_bytes())); - static STATIC_EMOJIS: StaticJsString = - StaticJsString::new(JsStr::utf16(&[0xD83C, 0xDFB9, 0xD83C, 0xDFB6, 0xD83C, 0xDFB5])); // 🎹🎶🎵 + static STATIC_EMOJIS: StaticJsString = StaticJsString::new(JsStr::utf16(&[ + 0xD83C, 0xDFB9, 0xD83C, 0xDFB6, 0xD83C, 0xDFB5, + ])); // 🎹🎶🎵 let latin1 = JsString::from_static_js_string(&STATIC_HELLO_WORLD); let utf16 = JsString::from_static_js_string(&STATIC_EMOJIS); From 6e6931ef36d71f35f246111fa326ff78998b7bd5 Mon Sep 17 00:00:00 2001 From: CrazyboyQCD Date: Tue, 23 Jul 2024 16:33:53 +0800 Subject: [PATCH 09/18] perf: set LITERAL a constant reference to avoid duplication on data segment --- core/engine/src/string.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/engine/src/string.rs b/core/engine/src/string.rs index d21ec50569e..4887370bdb4 100644 --- a/core/engine/src/string.rs +++ b/core/engine/src/string.rs @@ -64,9 +64,9 @@ macro_rules! js_string { $crate::string::JsString::default() }; ($s:literal) => {{ - static LITERAL: $crate::string::StaticJsString = $crate::string::StaticJsString::new($crate::js_str!($s)); + const LITERAL: &$crate::string::StaticJsString = &$crate::string::StaticJsString::new($crate::js_str!($s)); - $crate::string::JsString::from_static_js_string(&LITERAL) + $crate::string::JsString::from_static_js_string(LITERAL) }}; ($s:expr) => { $crate::string::JsString::from($s) From c21c8ec9029015cc8911df2e4170762670b524aa Mon Sep 17 00:00:00 2001 From: CrazyboyQCD Date: Tue, 23 Jul 2024 17:09:21 +0800 Subject: [PATCH 10/18] chore: add comments --- core/string/src/lib.rs | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index 82251a49e24..76eebd27e35 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -150,6 +150,9 @@ impl CodePoint { } } +/// Contains the flags and Latin1/UTF-16 length. +/// +/// The latin1 flag is stored in the bottom bit. #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] struct TaggedLen(usize); @@ -170,6 +173,8 @@ impl TaggedLen { } } +/// The raw representation of a [`JsString`] from a string literal. +#[derive(Debug)] pub struct StaticJsString { tagged_len: TaggedLen, _zero: usize, @@ -181,6 +186,8 @@ pub struct StaticJsString { unsafe impl Sync for StaticJsString {} impl StaticJsString { + /// Create a `StaticJsString` from a static `JsStr`. + #[must_use] pub const fn new(string: JsStr<'static>) -> StaticJsString { match string.variant() { JsStrVariant::Latin1(l) => StaticJsString { @@ -197,6 +204,9 @@ impl StaticJsString { } } +/// Memory variant to pass `Miri` test.\ +/// If it equals to `0usize`, +/// we mark it read-only, otherwise it is readable and writable union RefCount { read_only: usize, read_write: ManuallyDrop>, @@ -300,7 +310,7 @@ impl JsString { // - The lifetime of `&Self::Target` is shorter than the lifetime of `self`, as seen // by its signature, so this doesn't outlive `self`. // - // - The `RawJsString` created from ASCII literal has a static lifetime `JsStr`. + // - The `RawJsString` created from string literal has a static lifetime `JsStr`. unsafe { let h = h.as_ptr(); if (*h).refcount.read_only == 0 { @@ -402,7 +412,7 @@ impl JsString { } } Self { - // Safety: We already know it's a valid heap pointer. + // SAFETY: We already know it's a valid heap pointer. ptr: unsafe { Tagged::from_ptr(ptr.as_ptr()) }, } }; @@ -782,7 +792,7 @@ impl JsString { } } Self { - // Safety: `allocate_inner` guarantees `ptr` is a valid heap pointer. + // SAFETY: `allocate_inner` guarantees `ptr` is a valid heap pointer. ptr: Tagged::from_non_null(ptr), } } @@ -875,7 +885,7 @@ impl JsString { where I: JsSliceIndex<'a>, { - // Safety: Caller must ensure the index is not out of bounds + // SAFETY: Caller must ensure the index is not out of bounds unsafe { I::get_unchecked(self.as_str(), index) } } @@ -922,13 +932,14 @@ impl Clone for JsString { // pointee is a static string return Self { ptr: self.ptr }; } - + // SAFETY: The reference count of `JsString` guarantees that `raw` is always valid. let inner = unsafe { inner.as_ref() }; let strong = rc.wrapping_add(1); if strong == 0 { abort() } + // SAFETY: This has been checked aboved to ensure it is a `read_write` variant. unsafe { inner.refcount.read_write.set(strong); } @@ -959,6 +970,7 @@ impl Drop for JsString { let inner = unsafe { raw.as_ref() }; + // SAFETY: This has been checked aboved to ensure it is a `read_write` variant. unsafe { inner.refcount.read_write.set(refcount - 1); if inner.refcount.read_write.get() != 0 { @@ -985,7 +997,7 @@ impl Drop for JsString { } }; - // Safety: + // SAFETY: // If refcount is 0 and we call drop, that means this is the last `JsString` which // points to this memory allocation, so deallocating it is safe. unsafe { From cd990d7977d6f09d361a640564a6057cc6258f74 Mon Sep 17 00:00:00 2001 From: CrazyboyQCD Date: Tue, 23 Jul 2024 17:17:37 +0800 Subject: [PATCH 11/18] chore: add comments --- core/string/src/lib.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index 76eebd27e35..e86bdd39d27 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -926,13 +926,13 @@ impl Clone for JsString { #[inline] fn clone(&self) -> Self { if let UnwrappedTagged::Ptr(inner) = self.ptr.unwrap() { - // SAFETY: The reference count of `JsString` guarantees that `raw` is always valid. + // SAFETY: `NonNull` and the constructions of `JsString` guarantees that `raw` is always valid. let rc = unsafe { (*inner.as_ptr()).refcount.read_only }; if rc == 0 { // pointee is a static string return Self { ptr: self.ptr }; } - // SAFETY: The reference count of `JsString` guarantees that `raw` is always valid. + // SAFETY: `NonNull` and the constructions of `JsString` guarantees that `raw` is always valid. let inner = unsafe { inner.as_ref() }; let strong = rc.wrapping_add(1); @@ -961,13 +961,14 @@ impl Drop for JsString { if let UnwrappedTagged::Ptr(raw) = self.ptr.unwrap() { // See https://doc.rust-lang.org/src/alloc/sync.rs.html#1672 for details. - // SAFETY: The reference count of `JsString` guarantees that `raw` is always valid. + // SAFETY: `NonNull` and the constructions of `JsString` guarantees that `raw` is always valid. let refcount = unsafe { (*raw.as_ptr()).refcount.read_only }; if refcount == 0 { // Just a static string. No need to drop. return; } + // SAFETY: `NonNull` and the constructions of `JsString` guarantees that `raw` is always valid. let inner = unsafe { raw.as_ref() }; // SAFETY: This has been checked aboved to ensure it is a `read_write` variant. From 6db71749a2e9218dca723253b24170254b41c323 Mon Sep 17 00:00:00 2001 From: CrazyboyQCD Date: Tue, 23 Jul 2024 17:23:03 +0800 Subject: [PATCH 12/18] chore : fix `refcount` test --- core/engine/src/string.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/engine/src/string.rs b/core/engine/src/string.rs index 4887370bdb4..c016d955725 100644 --- a/core/engine/src/string.rs +++ b/core/engine/src/string.rs @@ -106,6 +106,9 @@ mod tests { assert_eq!(x.refcount(), None); let x = js_string!("你好"); + assert_eq!(x.refcount(), None); + + let x = js_string!("Hello world".to_string()); assert_eq!(x.refcount(), Some(1)); { From 2a6014380d9811afe9973d5fa0a336c32accc2a3 Mon Sep 17 00:00:00 2001 From: CrazyboyQCD Date: Tue, 23 Jul 2024 17:43:33 +0800 Subject: [PATCH 13/18] chore: remove unnessnary assertion --- core/engine/src/string.rs | 9 --------- 1 file changed, 9 deletions(-) diff --git a/core/engine/src/string.rs b/core/engine/src/string.rs index c016d955725..a8aee6201c9 100644 --- a/core/engine/src/string.rs +++ b/core/engine/src/string.rs @@ -7,15 +7,6 @@ #[doc(inline)] pub use boa_string::*; -static_assertions::const_assert_eq!( - // SAFETY: - // Compiler will throw error if `transmute` does not meet the requirement. - unsafe { std::mem::transmute::, usize>(std::cell::Cell::new(0usize)) }, - // SAFETY: - // Compiler will throw error if `transmute` does not meet the requirement. - unsafe { std::mem::transmute::, usize>(None) } -); - /// Utility macro to create a [`JsString`]. /// /// # Examples From 2865d5fb3548b4c91b4576d49f3eb50f39e0b40f Mon Sep 17 00:00:00 2001 From: CrazyboyQCD Date: Tue, 30 Jul 2024 11:15:34 +0800 Subject: [PATCH 14/18] chore: modify definition of `JsString::is_static` --- core/string/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index 5ca4b119873..e688d8b5615 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -861,7 +861,7 @@ impl JsString { #[inline] #[must_use] pub fn is_static(&self) -> bool { - self.ptr.is_tagged() + self.refcount().is_none() } /// Get the element a the given index, [`None`] otherwise. From ec2e0819a7d2148dbd185d537995689b790f1f2a Mon Sep 17 00:00:00 2001 From: CrazyboyQCD Date: Tue, 30 Jul 2024 11:21:06 +0800 Subject: [PATCH 15/18] chore: add more tests --- core/string/src/tests.rs | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/core/string/src/tests.rs b/core/string/src/tests.rs index c9221095c5f..c9cf68eca7e 100644 --- a/core/string/src/tests.rs +++ b/core/string/src/tests.rs @@ -184,9 +184,11 @@ fn from_static_js_string() { let latin1 = JsString::from_static_js_string(&STATIC_HELLO_WORLD); let utf16 = JsString::from_static_js_string(&STATIC_EMOJIS); - assert_eq!(latin1.as_str(), "hello world"); - assert_eq!(utf16.as_str(), "🎹🎶🎵"); + // content compare + assert_eq!(latin1, "hello world"); + assert_eq!(utf16, "🎹🎶🎵"); + // refcount check let clone = latin1.clone(); assert_eq!(clone, latin1); @@ -197,4 +199,36 @@ fn from_static_js_string() { assert!(latin1.refcount().is_none()); assert!(utf16.refcount().is_none()); + + // `is_latin1` check + assert!(latin1.as_str().is_latin1()); + assert!(!utf16.as_str().is_latin1()); +} + +#[test] +fn compare_static_and_dynamic_js_string() { + static STATIC_HELLO_WORLD: StaticJsString = + StaticJsString::new(JsStr::latin1("hello world".as_bytes())); + static STATIC_EMOJIS: StaticJsString = StaticJsString::new(JsStr::utf16(&[ + 0xD83C, 0xDFB9, 0xD83C, 0xDFB6, 0xD83C, 0xDFB5, + ])); // 🎹🎶🎵 + let static_latin1 = JsString::from_static_js_string(&STATIC_HELLO_WORLD); + let static_utf16 = JsString::from_static_js_string(&STATIC_EMOJIS); + + let dynamic_latin1 = JsString::from(JsStr::latin1("hello world".as_bytes())); + let dynamic_utf16 = JsString::from(&[0xD83C, 0xDFB9, 0xD83C, 0xDFB6, 0xD83C, 0xDFB5]); + + // content compare + assert_eq!(static_latin1, dynamic_latin1); + assert_eq!(static_utf16, dynamic_utf16); + + // length check + assert_eq!(static_latin1.len(), dynamic_latin1.len()); + assert_eq!(static_utf16.len(), dynamic_utf16.len()); + + // `is_static` check + assert!(static_latin1.is_static()); + assert!(static_utf16.is_static()); + assert!(!dynamic_latin1.is_static()); + assert!(!dynamic_utf16.is_static()); } From b195c3761d986f43a3409618a7229e83a0ed8b2d Mon Sep 17 00:00:00 2001 From: CrazyboyQCD Date: Tue, 30 Jul 2024 14:45:36 +0800 Subject: [PATCH 16/18] chore: mark `StaticJsString` as `repr[C]` --- core/string/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index e688d8b5615..a35785acf6f 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -175,6 +175,7 @@ impl TaggedLen { /// The raw representation of a [`JsString`] from a string literal. #[derive(Debug)] +#[repr(C)] pub struct StaticJsString { tagged_len: TaggedLen, _zero: usize, From f86bc5669f7651397bbfa70b90e3ae0321c5984c Mon Sep 17 00:00:00 2001 From: CrazyboyQCD Date: Tue, 30 Jul 2024 14:52:23 +0800 Subject: [PATCH 17/18] chore: improve comment --- core/string/src/lib.rs | 53 ++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index a35785acf6f..0dfeb625068 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -150,9 +150,13 @@ impl CodePoint { } } -/// Contains the flags and Latin1/UTF-16 length. -/// -/// The latin1 flag is stored in the bottom bit. +/// A `usize` contains a flag and the length of Latin1/UTF-16 . +/// ```text +/// ┌────────────────────────────────────┐ +/// │ length (usize::BITS - 1) │ flag(1) │ +/// └────────────────────────────────────┘ +/// ``` +/// The latin1/UTF-16 flag is stored in the bottom bit. #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] struct TaggedLen(usize); @@ -205,7 +209,8 @@ impl StaticJsString { } } -/// Memory variant to pass `Miri` test.\ +/// Memory variant to pass `Miri` test. +/// /// If it equals to `0usize`, /// we mark it read-only, otherwise it is readable and writable union RefCount { @@ -222,11 +227,11 @@ struct RawJsString { } impl RawJsString { - fn is_latin1(&self) -> bool { + const fn is_latin1(&self) -> bool { self.tagged_len.is_latin1() } - fn len(&self) -> usize { + const fn len(&self) -> usize { self.tagged_len.len() } } @@ -275,8 +280,9 @@ impl JsString { let src = ptr::from_ref(src).cast::(); JsString { // SAFETY: - // Caller must ensure the pointer is valid and point to data - // with the same size and alignment of `RawJsString`. + // `StaticJsString` has the same memory layout as `RawJsString` for the first 2 fields + // which means it is safe to use it to represent `RawJsString` as long as we only acccess the first 2 fields, + // and the static reference indicates that the pointer cast is valid. ptr: unsafe { Tagged::from_ptr(src.cast_mut()) }, } } @@ -311,7 +317,11 @@ impl JsString { // - The lifetime of `&Self::Target` is shorter than the lifetime of `self`, as seen // by its signature, so this doesn't outlive `self`. // - // - The `RawJsString` created from string literal has a static lifetime `JsStr`. + // - The `RawJsString` created from string literal has a static reference to the string literal, + // making it safe to be dereferenced and used as a static `JsStr`. + // + // - `Cell` is readable as an usize as long as we don't try to mutate the pointed variable, + // which means it is safe to read the `refcount` as `read_only` here. unsafe { let h = h.as_ptr(); if (*h).refcount.read_only == 0 { @@ -910,7 +920,10 @@ impl JsString { pub fn refcount(&self) -> Option { match self.ptr.unwrap() { UnwrappedTagged::Ptr(inner) => { - // SAFETY: The reference count of `JsString` guarantees that `inner` is always valid. + // SAFETY: + // `NonNull` and the constructions of `JsString` guarantee that `inner` is always valid. + // And `Cell` is readable as an usize as long as we don't try to mutate the pointed variable, + // which means it is safe to read the `refcount` as `read_only` here. let rc = unsafe { (*inner.as_ptr()).refcount.read_only }; if rc == 0 { None @@ -927,20 +940,25 @@ impl Clone for JsString { #[inline] fn clone(&self) -> Self { if let UnwrappedTagged::Ptr(inner) = self.ptr.unwrap() { - // SAFETY: `NonNull` and the constructions of `JsString` guarantees that `raw` is always valid. + // SAFETY: + // `NonNull` and the constructions of `JsString` guarantee that `inner` is always valid. + // And `Cell` is readable as an usize as long as we don't try to mutate the pointed variable, + // which means it is safe to read the `refcount` as `read_only` here. let rc = unsafe { (*inner.as_ptr()).refcount.read_only }; if rc == 0 { // pointee is a static string return Self { ptr: self.ptr }; } - // SAFETY: `NonNull` and the constructions of `JsString` guarantees that `raw` is always valid. + // SAFETY: `NonNull` and the constructions of `JsString` guarantee that `inner` is always valid. let inner = unsafe { inner.as_ref() }; let strong = rc.wrapping_add(1); if strong == 0 { abort() } - // SAFETY: This has been checked aboved to ensure it is a `read_write` variant. + // SAFETY: + // This has been checked aboved to ensure it is a `read_write` variant, + // which means it is safe to write the `refcount` as `read_write` here. unsafe { inner.refcount.read_write.set(strong); } @@ -962,7 +980,10 @@ impl Drop for JsString { if let UnwrappedTagged::Ptr(raw) = self.ptr.unwrap() { // See https://doc.rust-lang.org/src/alloc/sync.rs.html#1672 for details. - // SAFETY: `NonNull` and the constructions of `JsString` guarantees that `raw` is always valid. + // SAFETY: + // `NonNull` and the constructions of `JsString` guarantees that `raw` is always valid. + // And `Cell` is readable as an usize as long as we don't try to mutate the pointed variable, + // which means it is safe to read the `refcount` as `read_only` here. let refcount = unsafe { (*raw.as_ptr()).refcount.read_only }; if refcount == 0 { // Just a static string. No need to drop. @@ -972,7 +993,9 @@ impl Drop for JsString { // SAFETY: `NonNull` and the constructions of `JsString` guarantees that `raw` is always valid. let inner = unsafe { raw.as_ref() }; - // SAFETY: This has been checked aboved to ensure it is a `read_write` variant. + // SAFETY: + // This has been checked aboved to ensure it is a `read_write` variant, + // which means it is safe to write the `refcount` as `read_write` here. unsafe { inner.refcount.read_write.set(refcount - 1); if inner.refcount.read_write.get() != 0 { From 1cbec4193b778bff1cb4140410a60fb46558b201 Mon Sep 17 00:00:00 2001 From: CrazyboyQCD Date: Tue, 30 Jul 2024 14:58:57 +0800 Subject: [PATCH 18/18] chore: Mark `TaggedLen` as `repr(transparent)` --- core/string/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index 0dfeb625068..c45963add1c 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -158,6 +158,7 @@ impl CodePoint { /// ``` /// The latin1/UTF-16 flag is stored in the bottom bit. #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] +#[repr(transparent)] struct TaggedLen(usize); impl TaggedLen {