From ddc8b4212815e7a18d234b2abb072170a4ed8436 Mon Sep 17 00:00:00 2001 From: CrazyboyQCD <53971641+CrazyboyQCD@users.noreply.github.com> Date: Mon, 12 Aug 2024 07:48:21 +0800 Subject: [PATCH] Refactor `RawJsString`'s representation to make `JsString`s construction from string literal heap-allocation free (#3935) * feat: add a way to create `JsString` from ASCII literal * chore: add todo comment to change `DUMMY_RAW_JS_STRING` to constant * chore: make `ORIGINAL_JS_STR` a static reference to reduce binary size * chore: replace `static` to `const` and simplify the code * chore: add generic parameters to `transmute` * chore: try to use union to represent * chore: make miri happy * Optimize const accesses for static strings * perf: set LITERAL a constant reference to avoid duplication on data segment * chore: add comments * chore: add comments * chore : fix `refcount` test * chore: remove unnessnary assertion * chore: modify definition of `JsString::is_static` * chore: add more tests * chore: mark `StaticJsString` as `repr[C]` * chore: improve comment * chore: Mark `TaggedLen` as `repr(transparent)` --------- Co-authored-by: jedel1043 --- core/engine/src/string.rs | 14 ++- core/string/src/lib.rs | 239 +++++++++++++++++++++++++++----------- core/string/src/str.rs | 4 +- core/string/src/tests.rs | 61 +++++++++- 4 files changed, 243 insertions(+), 75 deletions(-) diff --git a/core/engine/src/string.rs b/core/engine/src/string.rs index cfddb9328c..a8aee6201c 100644 --- a/core/engine/src/string.rs +++ b/core/engine/src/string.rs @@ -54,9 +54,11 @@ macro_rules! js_string { () => { $crate::string::JsString::default() }; - ($s:literal) => { - $crate::string::JsString::from($crate::js_str!($s)) - }; + ($s:literal) => {{ + const LITERAL: &$crate::string::StaticJsString = &$crate::string::StaticJsString::new($crate::js_str!($s)); + + $crate::string::JsString::from_static_js_string(LITERAL) + }}; ($s:expr) => { $crate::string::JsString::from($s) }; @@ -92,6 +94,12 @@ mod tests { #[test] fn refcount() { let x = js_string!("Hello world"); + assert_eq!(x.refcount(), None); + + let x = js_string!("你好"); + assert_eq!(x.refcount(), None); + + let x = js_string!("Hello world".to_string()); assert_eq!(x.refcount(), Some(1)); { diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index 3c2833c6e5..c45963add1 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -38,6 +38,7 @@ use std::{ convert::Infallible, hash::{Hash, Hasher}, iter::Peekable, + mem::ManuallyDrop, process::abort, ptr::{self, addr_of, addr_of_mut, NonNull}, str::FromStr, @@ -149,37 +150,90 @@ impl CodePoint { } } -/// The raw representation of a [`JsString`] in the heap. +/// A `usize` contains a flag and the length of Latin1/UTF-16 . +/// ```text +/// ┌────────────────────────────────────┐ +/// │ length (usize::BITS - 1) │ flag(1) │ +/// └────────────────────────────────────┘ +/// ``` +/// The latin1/UTF-16 flag is stored in the bottom bit. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] +#[repr(transparent)] +struct TaggedLen(usize); + +impl TaggedLen { + const LATIN1_BITFLAG: usize = 1 << 0; + const BITFLAG_COUNT: usize = 1; + + const fn new(len: usize, latin1: bool) -> Self { + Self((len << Self::BITFLAG_COUNT) | (latin1 as usize)) + } + + const fn is_latin1(self) -> bool { + (self.0 & Self::LATIN1_BITFLAG) != 0 + } + + const fn len(self) -> usize { + self.0 >> Self::BITFLAG_COUNT + } +} + +/// The raw representation of a [`JsString`] from a string literal. +#[derive(Debug)] #[repr(C)] -struct RawJsString { - /// Contains the flags and Latin1/UTF-16 length. - /// - /// The latin1 flag is stored in the bottom bit. - flags_and_len: usize, +pub struct StaticJsString { + tagged_len: TaggedLen, + _zero: usize, + ptr: *const u8, +} - /// The number of references to the string. - /// - /// When this reaches `0` the string is deallocated. - refcount: Cell, +// SAFETY: This is Sync because reads to `_zero` will always read 0 and +// `ptr` cannot be mutated thanks to the 'static requirement. +unsafe impl Sync for StaticJsString {} - /// An empty array which is used to get the offset of string data. - data: [u16; 0], +impl StaticJsString { + /// Create a `StaticJsString` from a static `JsStr`. + #[must_use] + pub const fn new(string: JsStr<'static>) -> StaticJsString { + match string.variant() { + JsStrVariant::Latin1(l) => StaticJsString { + tagged_len: TaggedLen::new(l.len(), true), + _zero: 0, + ptr: l.as_ptr(), + }, + JsStrVariant::Utf16(u) => StaticJsString { + tagged_len: TaggedLen::new(u.len(), false), + _zero: 0, + ptr: u.as_ptr().cast(), + }, + } + } } -impl RawJsString { - const LATIN1_BITFLAG: usize = 1 << 0; - const BITFLAG_COUNT: usize = 1; +/// Memory variant to pass `Miri` test. +/// +/// If it equals to `0usize`, +/// we mark it read-only, otherwise it is readable and writable +union RefCount { + read_only: usize, + read_write: ManuallyDrop>, +} + +/// The raw representation of a [`JsString`] in the heap. +#[repr(C)] +struct RawJsString { + tagged_len: TaggedLen, + refcount: RefCount, + data: [u8; 0], +} +impl RawJsString { const fn is_latin1(&self) -> bool { - (self.flags_and_len & Self::LATIN1_BITFLAG) != 0 + self.tagged_len.is_latin1() } const fn len(&self) -> usize { - self.flags_and_len >> Self::BITFLAG_COUNT - } - - const fn encode_flags_and_len(len: usize, latin1: bool) -> usize { - (len << Self::BITFLAG_COUNT) | (latin1 as usize) + self.tagged_len.len() } } @@ -221,6 +275,19 @@ impl<'a> IntoIterator for &'a JsString { } impl JsString { + /// Create a [`JsString`] from a static js string. + #[must_use] + pub const fn from_static_js_string(src: &'static StaticJsString) -> Self { + let src = ptr::from_ref(src).cast::(); + JsString { + // SAFETY: + // `StaticJsString` has the same memory layout as `RawJsString` for the first 2 fields + // which means it is safe to use it to represent `RawJsString` as long as we only acccess the first 2 fields, + // and the static reference indicates that the pointer cast is valid. + ptr: unsafe { Tagged::from_ptr(src.cast_mut()) }, + } + } + /// Create an iterator over the [`JsString`]. #[inline] #[must_use] @@ -245,24 +312,39 @@ impl JsString { // - The `RawJsString` type has all the necessary information to reconstruct a valid // slice (length and starting pointer). // - // - We aligned `h.data` on allocation, and the block is of size `h.len`, so this + // - We aligned `h.data()` on allocation, and the block is of size `h.len`, so this // should only generate valid reads. // // - The lifetime of `&Self::Target` is shorter than the lifetime of `self`, as seen // by its signature, so this doesn't outlive `self`. + // + // - The `RawJsString` created from string literal has a static reference to the string literal, + // making it safe to be dereferenced and used as a static `JsStr`. + // + // - `Cell` is readable as an usize as long as we don't try to mutate the pointed variable, + // which means it is safe to read the `refcount` as `read_only` here. unsafe { let h = h.as_ptr(); + if (*h).refcount.read_only == 0 { + let h = h.cast::(); + return if (*h).tagged_len.is_latin1() { + JsStr::latin1(std::slice::from_raw_parts( + (*h).ptr, + (*h).tagged_len.len(), + )) + } else { + JsStr::utf16(std::slice::from_raw_parts( + (*h).ptr.cast(), + (*h).tagged_len.len(), + )) + }; + } + let len = (*h).len(); if (*h).is_latin1() { - JsStr::latin1(std::slice::from_raw_parts( - addr_of!((*h).data).cast(), - (*h).len(), - )) + JsStr::latin1(std::slice::from_raw_parts(addr_of!((*h).data).cast(), len)) } else { - JsStr::utf16(std::slice::from_raw_parts( - addr_of!((*h).data).cast(), - (*h).len(), - )) + JsStr::utf16(std::slice::from_raw_parts(addr_of!((*h).data).cast(), len)) } } } @@ -342,7 +424,7 @@ impl JsString { } } Self { - // Safety: We already know it's a valid heap pointer. + // SAFETY: We already know it's a valid heap pointer. ptr: unsafe { Tagged::from_ptr(ptr.as_ptr()) }, } }; @@ -665,8 +747,10 @@ impl JsString { unsafe { // Write the first part, the `RawJsString`. inner.as_ptr().write(RawJsString { - flags_and_len: RawJsString::encode_flags_and_len(str_len, latin1), - refcount: Cell::new(1), + tagged_len: TaggedLen::new(str_len, latin1), + refcount: RefCount { + read_write: ManuallyDrop::new(Cell::new(1)), + }, data: [0; 0], }); } @@ -720,7 +804,7 @@ impl JsString { } } Self { - // Safety: `allocate_inner` guarantees `ptr` is a valid heap pointer. + // SAFETY: `allocate_inner` guarantees `ptr` is a valid heap pointer. ptr: Tagged::from_non_null(ptr), } } @@ -737,28 +821,7 @@ impl JsString { #[inline] #[must_use] pub fn len(&self) -> usize { - match self.ptr.unwrap() { - UnwrappedTagged::Ptr(h) => { - // SAFETY: - // - The `RawJsString` type has all the necessary information to reconstruct a valid - // slice (length and starting pointer). - // - // - We aligned `h.data` on allocation, and the block is of size `h.len`, so this - // should only generate valid reads. - // - // - The lifetime of `&Self::Target` is shorter than the lifetime of `self`, as seen - // by its signature, so this doesn't outlive `self`. - unsafe { - let h = h.as_ptr(); - (*h).len() - } - } - UnwrappedTagged::Tag(index) => { - // SAFETY: all static strings are valid indices on `STATIC_JS_STRINGS`, so `get` should always - // return `Some`. - unsafe { StaticJsStrings::get(index).unwrap_unchecked().len() } - } - } + self.as_str().len() } /// Return true if the [`JsString`] is emtpy. @@ -810,7 +873,7 @@ impl JsString { #[inline] #[must_use] pub fn is_static(&self) -> bool { - self.ptr.is_tagged() + self.refcount().is_none() } /// Get the element a the given index, [`None`] otherwise. @@ -834,7 +897,7 @@ impl JsString { where I: JsSliceIndex<'a>, { - // Safety: Caller must ensure the index is not out of bounds + // SAFETY: Caller must ensure the index is not out of bounds unsafe { I::get_unchecked(self.as_str(), index) } } @@ -858,9 +921,16 @@ impl JsString { pub fn refcount(&self) -> Option { match self.ptr.unwrap() { UnwrappedTagged::Ptr(inner) => { - // SAFETY: The reference count of `JsString` guarantees that `inner` is always valid. - let inner = unsafe { inner.as_ref() }; - Some(inner.refcount.get()) + // SAFETY: + // `NonNull` and the constructions of `JsString` guarantee that `inner` is always valid. + // And `Cell` is readable as an usize as long as we don't try to mutate the pointed variable, + // which means it is safe to read the `refcount` as `read_only` here. + let rc = unsafe { (*inner.as_ptr()).refcount.read_only }; + if rc == 0 { + None + } else { + Some(rc) + } } UnwrappedTagged::Tag(_inner) => None, } @@ -871,13 +941,28 @@ impl Clone for JsString { #[inline] fn clone(&self) -> Self { if let UnwrappedTagged::Ptr(inner) = self.ptr.unwrap() { - // SAFETY: The reference count of `JsString` guarantees that `raw` is always valid. + // SAFETY: + // `NonNull` and the constructions of `JsString` guarantee that `inner` is always valid. + // And `Cell` is readable as an usize as long as we don't try to mutate the pointed variable, + // which means it is safe to read the `refcount` as `read_only` here. + let rc = unsafe { (*inner.as_ptr()).refcount.read_only }; + if rc == 0 { + // pointee is a static string + return Self { ptr: self.ptr }; + } + // SAFETY: `NonNull` and the constructions of `JsString` guarantee that `inner` is always valid. let inner = unsafe { inner.as_ref() }; - let strong = inner.refcount.get().wrapping_add(1); + + let strong = rc.wrapping_add(1); if strong == 0 { abort() } - inner.refcount.set(strong); + // SAFETY: + // This has been checked aboved to ensure it is a `read_write` variant, + // which means it is safe to write the `refcount` as `read_write` here. + unsafe { + inner.refcount.read_write.set(strong); + } } Self { ptr: self.ptr } } @@ -896,13 +981,29 @@ impl Drop for JsString { if let UnwrappedTagged::Ptr(raw) = self.ptr.unwrap() { // See https://doc.rust-lang.org/src/alloc/sync.rs.html#1672 for details. - // SAFETY: The reference count of `JsString` guarantees that `raw` is always valid. - let inner = unsafe { raw.as_ref() }; - inner.refcount.set(inner.refcount.get() - 1); - if inner.refcount.get() != 0 { + // SAFETY: + // `NonNull` and the constructions of `JsString` guarantees that `raw` is always valid. + // And `Cell` is readable as an usize as long as we don't try to mutate the pointed variable, + // which means it is safe to read the `refcount` as `read_only` here. + let refcount = unsafe { (*raw.as_ptr()).refcount.read_only }; + if refcount == 0 { + // Just a static string. No need to drop. return; } + // SAFETY: `NonNull` and the constructions of `JsString` guarantees that `raw` is always valid. + let inner = unsafe { raw.as_ref() }; + + // SAFETY: + // This has been checked aboved to ensure it is a `read_write` variant, + // which means it is safe to write the `refcount` as `read_write` here. + unsafe { + inner.refcount.read_write.set(refcount - 1); + if inner.refcount.read_write.get() != 0 { + return; + } + } + // SAFETY: // All the checks for the validity of the layout have already been made on `alloc_inner`, // so we can skip the unwrap. @@ -922,7 +1023,7 @@ impl Drop for JsString { } }; - // Safety: + // SAFETY: // If refcount is 0 and we call drop, that means this is the last `JsString` which // points to this memory allocation, so deallocating it is safe. unsafe { diff --git a/core/string/src/str.rs b/core/string/src/str.rs index d851307dba..c42821558f 100644 --- a/core/string/src/str.rs +++ b/core/string/src/str.rs @@ -87,14 +87,14 @@ impl<'a> JsStr<'a> { /// Return the inner [`JsStrVariant`] varient of the [`JsStr`]. #[inline] #[must_use] - pub fn variant(self) -> JsStrVariant<'a> { + pub const fn variant(self) -> JsStrVariant<'a> { self.inner } /// Check if the [`JsStr`] is latin1 encoded. #[inline] #[must_use] - pub fn is_latin1(&self) -> bool { + pub const fn is_latin1(&self) -> bool { matches!(self.inner, JsStrVariant::Latin1(_)) } diff --git a/core/string/src/tests.rs b/core/string/src/tests.rs index 61a23cfa35..c9cf68eca7 100644 --- a/core/string/src/tests.rs +++ b/core/string/src/tests.rs @@ -2,7 +2,7 @@ use std::hash::{BuildHasher, BuildHasherDefault, Hash}; -use crate::{JsStr, JsString, StaticJsStrings}; +use crate::{JsStr, JsString, StaticJsString, StaticJsStrings}; use rustc_hash::FxHasher; @@ -173,3 +173,62 @@ fn conversion_to_known_static_js_string() { assert!(string.is_some()); assert!(string.unwrap().as_str().is_latin1()); } + +#[test] +fn from_static_js_string() { + static STATIC_HELLO_WORLD: StaticJsString = + StaticJsString::new(JsStr::latin1("hello world".as_bytes())); + static STATIC_EMOJIS: StaticJsString = StaticJsString::new(JsStr::utf16(&[ + 0xD83C, 0xDFB9, 0xD83C, 0xDFB6, 0xD83C, 0xDFB5, + ])); // 🎹🎶🎵 + let latin1 = JsString::from_static_js_string(&STATIC_HELLO_WORLD); + let utf16 = JsString::from_static_js_string(&STATIC_EMOJIS); + + // content compare + assert_eq!(latin1, "hello world"); + assert_eq!(utf16, "🎹🎶🎵"); + + // refcount check + let clone = latin1.clone(); + + assert_eq!(clone, latin1); + + let clone = utf16.clone(); + + assert_eq!(clone, utf16); + + assert!(latin1.refcount().is_none()); + assert!(utf16.refcount().is_none()); + + // `is_latin1` check + assert!(latin1.as_str().is_latin1()); + assert!(!utf16.as_str().is_latin1()); +} + +#[test] +fn compare_static_and_dynamic_js_string() { + static STATIC_HELLO_WORLD: StaticJsString = + StaticJsString::new(JsStr::latin1("hello world".as_bytes())); + static STATIC_EMOJIS: StaticJsString = StaticJsString::new(JsStr::utf16(&[ + 0xD83C, 0xDFB9, 0xD83C, 0xDFB6, 0xD83C, 0xDFB5, + ])); // 🎹🎶🎵 + let static_latin1 = JsString::from_static_js_string(&STATIC_HELLO_WORLD); + let static_utf16 = JsString::from_static_js_string(&STATIC_EMOJIS); + + let dynamic_latin1 = JsString::from(JsStr::latin1("hello world".as_bytes())); + let dynamic_utf16 = JsString::from(&[0xD83C, 0xDFB9, 0xD83C, 0xDFB6, 0xD83C, 0xDFB5]); + + // content compare + assert_eq!(static_latin1, dynamic_latin1); + assert_eq!(static_utf16, dynamic_utf16); + + // length check + assert_eq!(static_latin1.len(), dynamic_latin1.len()); + assert_eq!(static_utf16.len(), dynamic_utf16.len()); + + // `is_static` check + assert!(static_latin1.is_static()); + assert!(static_utf16.is_static()); + assert!(!dynamic_latin1.is_static()); + assert!(!dynamic_utf16.is_static()); +}