Browse Source

Refactor `RawJsString`'s representation to make `JsString`s construction from string literal heap-allocation free (#3935)

* feat: add a way to create `JsString` from ASCII literal

* chore: add todo comment to change `DUMMY_RAW_JS_STRING` to constant

* chore: make `ORIGINAL_JS_STR` a static reference to reduce binary size

* chore: replace `static` to `const` and simplify the code

* chore: add generic parameters to `transmute`

* chore: try to use union to represent

* chore: make miri happy

* Optimize const accesses for static strings

* perf: set LITERAL a constant reference to avoid duplication on data segment

* chore: add comments

* chore: add comments

* chore : fix `refcount` test

* chore: remove unnessnary assertion

* chore: modify definition of `JsString::is_static`

* chore: add more tests

* chore: mark `StaticJsString` as `repr[C]`

* chore: improve comment

* chore: Mark `TaggedLen` as `repr(transparent)`

---------

Co-authored-by: jedel1043 <jedel0124@gmail.com>
pull/3918/merge
CrazyboyQCD 4 months ago committed by GitHub
parent
commit
ddc8b42128
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 14
      core/engine/src/string.rs
  2. 239
      core/string/src/lib.rs
  3. 4
      core/string/src/str.rs
  4. 61
      core/string/src/tests.rs

14
core/engine/src/string.rs

@ -54,9 +54,11 @@ macro_rules! js_string {
() => { () => {
$crate::string::JsString::default() $crate::string::JsString::default()
}; };
($s:literal) => { ($s:literal) => {{
$crate::string::JsString::from($crate::js_str!($s)) const LITERAL: &$crate::string::StaticJsString = &$crate::string::StaticJsString::new($crate::js_str!($s));
};
$crate::string::JsString::from_static_js_string(LITERAL)
}};
($s:expr) => { ($s:expr) => {
$crate::string::JsString::from($s) $crate::string::JsString::from($s)
}; };
@ -92,6 +94,12 @@ mod tests {
#[test] #[test]
fn refcount() { fn refcount() {
let x = js_string!("Hello world"); let x = js_string!("Hello world");
assert_eq!(x.refcount(), None);
let x = js_string!("你好");
assert_eq!(x.refcount(), None);
let x = js_string!("Hello world".to_string());
assert_eq!(x.refcount(), Some(1)); assert_eq!(x.refcount(), Some(1));
{ {

239
core/string/src/lib.rs

@ -38,6 +38,7 @@ use std::{
convert::Infallible, convert::Infallible,
hash::{Hash, Hasher}, hash::{Hash, Hasher},
iter::Peekable, iter::Peekable,
mem::ManuallyDrop,
process::abort, process::abort,
ptr::{self, addr_of, addr_of_mut, NonNull}, ptr::{self, addr_of, addr_of_mut, NonNull},
str::FromStr, str::FromStr,
@ -149,37 +150,90 @@ impl CodePoint {
} }
} }
/// The raw representation of a [`JsString`] in the heap. /// A `usize` contains a flag and the length of Latin1/UTF-16 .
/// ```text
/// ┌────────────────────────────────────┐
/// │ length (usize::BITS - 1) │ flag(1) │
/// └────────────────────────────────────┘
/// ```
/// The latin1/UTF-16 flag is stored in the bottom bit.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
#[repr(transparent)]
struct TaggedLen(usize);
impl TaggedLen {
const LATIN1_BITFLAG: usize = 1 << 0;
const BITFLAG_COUNT: usize = 1;
const fn new(len: usize, latin1: bool) -> Self {
Self((len << Self::BITFLAG_COUNT) | (latin1 as usize))
}
const fn is_latin1(self) -> bool {
(self.0 & Self::LATIN1_BITFLAG) != 0
}
const fn len(self) -> usize {
self.0 >> Self::BITFLAG_COUNT
}
}
/// The raw representation of a [`JsString`] from a string literal.
#[derive(Debug)]
#[repr(C)] #[repr(C)]
struct RawJsString { pub struct StaticJsString {
/// Contains the flags and Latin1/UTF-16 length. tagged_len: TaggedLen,
/// _zero: usize,
/// The latin1 flag is stored in the bottom bit. ptr: *const u8,
flags_and_len: usize, }
/// The number of references to the string. // SAFETY: This is Sync because reads to `_zero` will always read 0 and
/// // `ptr` cannot be mutated thanks to the 'static requirement.
/// When this reaches `0` the string is deallocated. unsafe impl Sync for StaticJsString {}
refcount: Cell<usize>,
/// An empty array which is used to get the offset of string data. impl StaticJsString {
data: [u16; 0], /// Create a `StaticJsString` from a static `JsStr`.
#[must_use]
pub const fn new(string: JsStr<'static>) -> StaticJsString {
match string.variant() {
JsStrVariant::Latin1(l) => StaticJsString {
tagged_len: TaggedLen::new(l.len(), true),
_zero: 0,
ptr: l.as_ptr(),
},
JsStrVariant::Utf16(u) => StaticJsString {
tagged_len: TaggedLen::new(u.len(), false),
_zero: 0,
ptr: u.as_ptr().cast(),
},
}
}
} }
impl RawJsString { /// Memory variant to pass `Miri` test.
const LATIN1_BITFLAG: usize = 1 << 0; ///
const BITFLAG_COUNT: usize = 1; /// If it equals to `0usize`,
/// we mark it read-only, otherwise it is readable and writable
union RefCount {
read_only: usize,
read_write: ManuallyDrop<Cell<usize>>,
}
/// The raw representation of a [`JsString`] in the heap.
#[repr(C)]
struct RawJsString {
tagged_len: TaggedLen,
refcount: RefCount,
data: [u8; 0],
}
impl RawJsString {
const fn is_latin1(&self) -> bool { const fn is_latin1(&self) -> bool {
(self.flags_and_len & Self::LATIN1_BITFLAG) != 0 self.tagged_len.is_latin1()
} }
const fn len(&self) -> usize { const fn len(&self) -> usize {
self.flags_and_len >> Self::BITFLAG_COUNT self.tagged_len.len()
}
const fn encode_flags_and_len(len: usize, latin1: bool) -> usize {
(len << Self::BITFLAG_COUNT) | (latin1 as usize)
} }
} }
@ -221,6 +275,19 @@ impl<'a> IntoIterator for &'a JsString {
} }
impl JsString { impl JsString {
/// Create a [`JsString`] from a static js string.
#[must_use]
pub const fn from_static_js_string(src: &'static StaticJsString) -> Self {
let src = ptr::from_ref(src).cast::<RawJsString>();
JsString {
// SAFETY:
// `StaticJsString` has the same memory layout as `RawJsString` for the first 2 fields
// which means it is safe to use it to represent `RawJsString` as long as we only acccess the first 2 fields,
// and the static reference indicates that the pointer cast is valid.
ptr: unsafe { Tagged::from_ptr(src.cast_mut()) },
}
}
/// Create an iterator over the [`JsString`]. /// Create an iterator over the [`JsString`].
#[inline] #[inline]
#[must_use] #[must_use]
@ -245,24 +312,39 @@ impl JsString {
// - The `RawJsString` type has all the necessary information to reconstruct a valid // - The `RawJsString` type has all the necessary information to reconstruct a valid
// slice (length and starting pointer). // slice (length and starting pointer).
// //
// - We aligned `h.data` on allocation, and the block is of size `h.len`, so this // - We aligned `h.data()` on allocation, and the block is of size `h.len`, so this
// should only generate valid reads. // should only generate valid reads.
// //
// - The lifetime of `&Self::Target` is shorter than the lifetime of `self`, as seen // - The lifetime of `&Self::Target` is shorter than the lifetime of `self`, as seen
// by its signature, so this doesn't outlive `self`. // by its signature, so this doesn't outlive `self`.
//
// - The `RawJsString` created from string literal has a static reference to the string literal,
// making it safe to be dereferenced and used as a static `JsStr`.
//
// - `Cell<usize>` is readable as an usize as long as we don't try to mutate the pointed variable,
// which means it is safe to read the `refcount` as `read_only` here.
unsafe { unsafe {
let h = h.as_ptr(); let h = h.as_ptr();
if (*h).refcount.read_only == 0 {
let h = h.cast::<StaticJsString>();
return if (*h).tagged_len.is_latin1() {
JsStr::latin1(std::slice::from_raw_parts(
(*h).ptr,
(*h).tagged_len.len(),
))
} else {
JsStr::utf16(std::slice::from_raw_parts(
(*h).ptr.cast(),
(*h).tagged_len.len(),
))
};
}
let len = (*h).len();
if (*h).is_latin1() { if (*h).is_latin1() {
JsStr::latin1(std::slice::from_raw_parts( JsStr::latin1(std::slice::from_raw_parts(addr_of!((*h).data).cast(), len))
addr_of!((*h).data).cast(),
(*h).len(),
))
} else { } else {
JsStr::utf16(std::slice::from_raw_parts( JsStr::utf16(std::slice::from_raw_parts(addr_of!((*h).data).cast(), len))
addr_of!((*h).data).cast(),
(*h).len(),
))
} }
} }
} }
@ -342,7 +424,7 @@ impl JsString {
} }
} }
Self { Self {
// Safety: We already know it's a valid heap pointer. // SAFETY: We already know it's a valid heap pointer.
ptr: unsafe { Tagged::from_ptr(ptr.as_ptr()) }, ptr: unsafe { Tagged::from_ptr(ptr.as_ptr()) },
} }
}; };
@ -665,8 +747,10 @@ impl JsString {
unsafe { unsafe {
// Write the first part, the `RawJsString`. // Write the first part, the `RawJsString`.
inner.as_ptr().write(RawJsString { inner.as_ptr().write(RawJsString {
flags_and_len: RawJsString::encode_flags_and_len(str_len, latin1), tagged_len: TaggedLen::new(str_len, latin1),
refcount: Cell::new(1), refcount: RefCount {
read_write: ManuallyDrop::new(Cell::new(1)),
},
data: [0; 0], data: [0; 0],
}); });
} }
@ -720,7 +804,7 @@ impl JsString {
} }
} }
Self { Self {
// Safety: `allocate_inner` guarantees `ptr` is a valid heap pointer. // SAFETY: `allocate_inner` guarantees `ptr` is a valid heap pointer.
ptr: Tagged::from_non_null(ptr), ptr: Tagged::from_non_null(ptr),
} }
} }
@ -737,28 +821,7 @@ impl JsString {
#[inline] #[inline]
#[must_use] #[must_use]
pub fn len(&self) -> usize { pub fn len(&self) -> usize {
match self.ptr.unwrap() { self.as_str().len()
UnwrappedTagged::Ptr(h) => {
// SAFETY:
// - The `RawJsString` type has all the necessary information to reconstruct a valid
// slice (length and starting pointer).
//
// - We aligned `h.data` on allocation, and the block is of size `h.len`, so this
// should only generate valid reads.
//
// - The lifetime of `&Self::Target` is shorter than the lifetime of `self`, as seen
// by its signature, so this doesn't outlive `self`.
unsafe {
let h = h.as_ptr();
(*h).len()
}
}
UnwrappedTagged::Tag(index) => {
// SAFETY: all static strings are valid indices on `STATIC_JS_STRINGS`, so `get` should always
// return `Some`.
unsafe { StaticJsStrings::get(index).unwrap_unchecked().len() }
}
}
} }
/// Return true if the [`JsString`] is emtpy. /// Return true if the [`JsString`] is emtpy.
@ -810,7 +873,7 @@ impl JsString {
#[inline] #[inline]
#[must_use] #[must_use]
pub fn is_static(&self) -> bool { pub fn is_static(&self) -> bool {
self.ptr.is_tagged() self.refcount().is_none()
} }
/// Get the element a the given index, [`None`] otherwise. /// Get the element a the given index, [`None`] otherwise.
@ -834,7 +897,7 @@ impl JsString {
where where
I: JsSliceIndex<'a>, I: JsSliceIndex<'a>,
{ {
// Safety: Caller must ensure the index is not out of bounds // SAFETY: Caller must ensure the index is not out of bounds
unsafe { I::get_unchecked(self.as_str(), index) } unsafe { I::get_unchecked(self.as_str(), index) }
} }
@ -858,9 +921,16 @@ impl JsString {
pub fn refcount(&self) -> Option<usize> { pub fn refcount(&self) -> Option<usize> {
match self.ptr.unwrap() { match self.ptr.unwrap() {
UnwrappedTagged::Ptr(inner) => { UnwrappedTagged::Ptr(inner) => {
// SAFETY: The reference count of `JsString` guarantees that `inner` is always valid. // SAFETY:
let inner = unsafe { inner.as_ref() }; // `NonNull` and the constructions of `JsString` guarantee that `inner` is always valid.
Some(inner.refcount.get()) // And `Cell<usize>` is readable as an usize as long as we don't try to mutate the pointed variable,
// which means it is safe to read the `refcount` as `read_only` here.
let rc = unsafe { (*inner.as_ptr()).refcount.read_only };
if rc == 0 {
None
} else {
Some(rc)
}
} }
UnwrappedTagged::Tag(_inner) => None, UnwrappedTagged::Tag(_inner) => None,
} }
@ -871,13 +941,28 @@ impl Clone for JsString {
#[inline] #[inline]
fn clone(&self) -> Self { fn clone(&self) -> Self {
if let UnwrappedTagged::Ptr(inner) = self.ptr.unwrap() { if let UnwrappedTagged::Ptr(inner) = self.ptr.unwrap() {
// SAFETY: The reference count of `JsString` guarantees that `raw` is always valid. // SAFETY:
// `NonNull` and the constructions of `JsString` guarantee that `inner` is always valid.
// And `Cell<usize>` is readable as an usize as long as we don't try to mutate the pointed variable,
// which means it is safe to read the `refcount` as `read_only` here.
let rc = unsafe { (*inner.as_ptr()).refcount.read_only };
if rc == 0 {
// pointee is a static string
return Self { ptr: self.ptr };
}
// SAFETY: `NonNull` and the constructions of `JsString` guarantee that `inner` is always valid.
let inner = unsafe { inner.as_ref() }; let inner = unsafe { inner.as_ref() };
let strong = inner.refcount.get().wrapping_add(1);
let strong = rc.wrapping_add(1);
if strong == 0 { if strong == 0 {
abort() abort()
} }
inner.refcount.set(strong); // SAFETY:
// This has been checked aboved to ensure it is a `read_write` variant,
// which means it is safe to write the `refcount` as `read_write` here.
unsafe {
inner.refcount.read_write.set(strong);
}
} }
Self { ptr: self.ptr } Self { ptr: self.ptr }
} }
@ -896,13 +981,29 @@ impl Drop for JsString {
if let UnwrappedTagged::Ptr(raw) = self.ptr.unwrap() { if let UnwrappedTagged::Ptr(raw) = self.ptr.unwrap() {
// See https://doc.rust-lang.org/src/alloc/sync.rs.html#1672 for details. // See https://doc.rust-lang.org/src/alloc/sync.rs.html#1672 for details.
// SAFETY: The reference count of `JsString` guarantees that `raw` is always valid. // SAFETY:
let inner = unsafe { raw.as_ref() }; // `NonNull` and the constructions of `JsString` guarantees that `raw` is always valid.
inner.refcount.set(inner.refcount.get() - 1); // And `Cell<usize>` is readable as an usize as long as we don't try to mutate the pointed variable,
if inner.refcount.get() != 0 { // which means it is safe to read the `refcount` as `read_only` here.
let refcount = unsafe { (*raw.as_ptr()).refcount.read_only };
if refcount == 0 {
// Just a static string. No need to drop.
return; return;
} }
// SAFETY: `NonNull` and the constructions of `JsString` guarantees that `raw` is always valid.
let inner = unsafe { raw.as_ref() };
// SAFETY:
// This has been checked aboved to ensure it is a `read_write` variant,
// which means it is safe to write the `refcount` as `read_write` here.
unsafe {
inner.refcount.read_write.set(refcount - 1);
if inner.refcount.read_write.get() != 0 {
return;
}
}
// SAFETY: // SAFETY:
// All the checks for the validity of the layout have already been made on `alloc_inner`, // All the checks for the validity of the layout have already been made on `alloc_inner`,
// so we can skip the unwrap. // so we can skip the unwrap.
@ -922,7 +1023,7 @@ impl Drop for JsString {
} }
}; };
// Safety: // SAFETY:
// If refcount is 0 and we call drop, that means this is the last `JsString` which // If refcount is 0 and we call drop, that means this is the last `JsString` which
// points to this memory allocation, so deallocating it is safe. // points to this memory allocation, so deallocating it is safe.
unsafe { unsafe {

4
core/string/src/str.rs

@ -87,14 +87,14 @@ impl<'a> JsStr<'a> {
/// Return the inner [`JsStrVariant`] varient of the [`JsStr`]. /// Return the inner [`JsStrVariant`] varient of the [`JsStr`].
#[inline] #[inline]
#[must_use] #[must_use]
pub fn variant(self) -> JsStrVariant<'a> { pub const fn variant(self) -> JsStrVariant<'a> {
self.inner self.inner
} }
/// Check if the [`JsStr`] is latin1 encoded. /// Check if the [`JsStr`] is latin1 encoded.
#[inline] #[inline]
#[must_use] #[must_use]
pub fn is_latin1(&self) -> bool { pub const fn is_latin1(&self) -> bool {
matches!(self.inner, JsStrVariant::Latin1(_)) matches!(self.inner, JsStrVariant::Latin1(_))
} }

61
core/string/src/tests.rs

@ -2,7 +2,7 @@
use std::hash::{BuildHasher, BuildHasherDefault, Hash}; use std::hash::{BuildHasher, BuildHasherDefault, Hash};
use crate::{JsStr, JsString, StaticJsStrings}; use crate::{JsStr, JsString, StaticJsString, StaticJsStrings};
use rustc_hash::FxHasher; use rustc_hash::FxHasher;
@ -173,3 +173,62 @@ fn conversion_to_known_static_js_string() {
assert!(string.is_some()); assert!(string.is_some());
assert!(string.unwrap().as_str().is_latin1()); assert!(string.unwrap().as_str().is_latin1());
} }
#[test]
fn from_static_js_string() {
static STATIC_HELLO_WORLD: StaticJsString =
StaticJsString::new(JsStr::latin1("hello world".as_bytes()));
static STATIC_EMOJIS: StaticJsString = StaticJsString::new(JsStr::utf16(&[
0xD83C, 0xDFB9, 0xD83C, 0xDFB6, 0xD83C, 0xDFB5,
])); // 🎹🎶🎵
let latin1 = JsString::from_static_js_string(&STATIC_HELLO_WORLD);
let utf16 = JsString::from_static_js_string(&STATIC_EMOJIS);
// content compare
assert_eq!(latin1, "hello world");
assert_eq!(utf16, "🎹🎶🎵");
// refcount check
let clone = latin1.clone();
assert_eq!(clone, latin1);
let clone = utf16.clone();
assert_eq!(clone, utf16);
assert!(latin1.refcount().is_none());
assert!(utf16.refcount().is_none());
// `is_latin1` check
assert!(latin1.as_str().is_latin1());
assert!(!utf16.as_str().is_latin1());
}
#[test]
fn compare_static_and_dynamic_js_string() {
static STATIC_HELLO_WORLD: StaticJsString =
StaticJsString::new(JsStr::latin1("hello world".as_bytes()));
static STATIC_EMOJIS: StaticJsString = StaticJsString::new(JsStr::utf16(&[
0xD83C, 0xDFB9, 0xD83C, 0xDFB6, 0xD83C, 0xDFB5,
])); // 🎹🎶🎵
let static_latin1 = JsString::from_static_js_string(&STATIC_HELLO_WORLD);
let static_utf16 = JsString::from_static_js_string(&STATIC_EMOJIS);
let dynamic_latin1 = JsString::from(JsStr::latin1("hello world".as_bytes()));
let dynamic_utf16 = JsString::from(&[0xD83C, 0xDFB9, 0xD83C, 0xDFB6, 0xD83C, 0xDFB5]);
// content compare
assert_eq!(static_latin1, dynamic_latin1);
assert_eq!(static_utf16, dynamic_utf16);
// length check
assert_eq!(static_latin1.len(), dynamic_latin1.len());
assert_eq!(static_utf16.len(), dynamic_utf16.len());
// `is_static` check
assert!(static_latin1.is_static());
assert!(static_utf16.is_static());
assert!(!dynamic_latin1.is_static());
assert!(!dynamic_utf16.is_static());
}

Loading…
Cancel
Save