From f7aef10a61f8f784feaf5d995b944afc0221396b Mon Sep 17 00:00:00 2001 From: jedel1043 Date: Thu, 30 Jun 2022 15:34:22 +0000 Subject: [PATCH] Remove `string-interner` dependency and implement custom string `Interner` (#2147) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So, @raskad and myself had a short discussion about the state of #736, and we came to the conclusion that it would be a good time to implement our own string interner; partly because the `string-interner` crate is a bit unmaintained (as shown by https://github.com/Robbepop/string-interner/pull/42 and https://github.com/Robbepop/string-interner/pull/47), and partly because it would be hard to experiment with custom optimizations for UTF-16 strings. I still want to thank @Robbepop for the original implementation though, because some parts of this design have been shamelessly stolen from it 😅. Having said that, this PR is a complete reimplementation of the interner, but with some modifications to (hopefully!) make it a bit easier to experiment with UTF-16 strings, apply optimizations, and whatnot :) --- Cargo.lock | 38 +- boa_engine/src/builtins/function/mod.rs | 47 ++- boa_engine/src/bytecompiler.rs | 7 +- boa_engine/src/object/mod.rs | 13 +- .../ast/node/iteration/continue_node/mod.rs | 3 +- boa_engine/src/vm/code_block.rs | 2 + boa_gc/src/lib.rs | 2 +- boa_interner/Cargo.toml | 5 +- boa_interner/src/fixed_string.rs | 62 +++ boa_interner/src/interned_str.rs | 70 ++++ boa_interner/src/lib.rs | 365 +++++++----------- boa_interner/src/sym.rs | 148 +++++++ boa_interner/src/tests.rs | 40 +- 13 files changed, 488 insertions(+), 314 deletions(-) create mode 100644 boa_interner/src/fixed_string.rs create mode 100644 boa_interner/src/interned_str.rs create mode 100644 boa_interner/src/sym.rs diff --git a/Cargo.lock b/Cargo.lock index 9d28e12c9d..6ce624b507 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,17 +2,6 @@ # It is not intended for manual editing. version = 3 -[[package]] -name = "ahash" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" -dependencies = [ - "getrandom", - "once_cell", - "version_check", -] - [[package]] name = "aho-corasick" version = "0.7.18" @@ -144,9 +133,10 @@ dependencies = [ name = "boa_interner" version = "0.15.0" dependencies = [ - "gc", + "phf", + "rustc-hash", "serde", - "string-interner", + "static_assertions", ] [[package]] @@ -622,15 +612,6 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" -[[package]] -name = "hashbrown" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" -dependencies = [ - "ahash", -] - [[package]] name = "hashbrown" version = "0.12.1" @@ -797,7 +778,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" dependencies = [ "autocfg", - "hashbrown 0.12.1", + "hashbrown", ] [[package]] @@ -1528,17 +1509,6 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e08d8363704e6c71fc928674353e6b7c23dcea9d82d7012c8faf2a3a025f8d0" -[[package]] -name = "string-interner" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e2531d8525b29b514d25e275a43581320d587b86db302b9a7e464bac579648" -dependencies = [ - "cfg-if", - "hashbrown 0.11.2", - "serde", -] - [[package]] name = "strsim" version = "0.8.0" diff --git a/boa_engine/src/builtins/function/mod.rs b/boa_engine/src/builtins/function/mod.rs index 70ad096839..350a8743a6 100644 --- a/boa_engine/src/builtins/function/mod.rs +++ b/boa_engine/src/builtins/function/mod.rs @@ -26,7 +26,7 @@ use crate::{ value::IntegerOrInfinity, Context, JsResult, JsString, JsValue, }; -use boa_gc::{self, Finalize, Gc, Trace}; +use boa_gc::{self, custom_trace, Finalize, Gc, Trace}; use boa_interner::Sym; use boa_profiler::Profiler; use dyn_clone::DynClone; @@ -138,12 +138,26 @@ impl ConstructorKind { /// - [ECMAScript specification][spec] /// /// [spec]: https://tc39.es/ecma262/#sec-classfielddefinition-record-specification-type -#[derive(Clone, Debug, Trace, Finalize)] +#[derive(Clone, Debug, Finalize)] pub enum ClassFieldDefinition { Public(PropertyKey, JsFunction), Private(Sym, JsFunction), } +unsafe impl Trace for ClassFieldDefinition { + custom_trace! {this, { + match this { + Self::Public(key, func) => { + mark(key); + mark(func); + } + Self::Private(_, func) => { + mark(func); + } + } + }} +} + /// Wrapper for `Gc>` that allows passing additional /// captures through a `Copy` closure. /// @@ -191,18 +205,14 @@ impl Captures { /// (AST Node). /// /// -#[derive(Trace, Finalize)] +#[derive(Finalize)] pub enum Function { Native { - #[unsafe_ignore_trace] function: NativeFunctionSignature, - #[unsafe_ignore_trace] constructor: Option, }, Closure { - #[unsafe_ignore_trace] function: Box, - #[unsafe_ignore_trace] constructor: Option, captures: Captures, }, @@ -211,7 +221,6 @@ pub enum Function { environments: DeclarativeEnvironmentStack, /// The `[[ConstructorKind]]` internal slot. - #[unsafe_ignore_trace] constructor_kind: ConstructorKind, /// The `[[HomeObject]]` internal slot. @@ -229,6 +238,28 @@ pub enum Function { }, } +unsafe impl Trace for Function { + custom_trace! {this, { + match this { + Self::Native { .. } => {} + Self::Closure { captures, .. } => mark(captures), + Self::Ordinary { code, environments, home_object, fields, private_methods, .. } => { + mark(code); + mark(environments); + mark(home_object); + mark(fields); + for (_, elem) in private_methods { + mark(elem); + } + } + Self::Generator { code, environments } => { + mark(code); + mark(environments); + } + } + }} +} + impl fmt::Debug for Function { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Function {{ ... }}") diff --git a/boa_engine/src/bytecompiler.rs b/boa_engine/src/bytecompiler.rs index 0e30741049..1f29c57303 100644 --- a/boa_engine/src/bytecompiler.rs +++ b/boa_engine/src/bytecompiler.rs @@ -20,7 +20,7 @@ use crate::{ vm::{BindingOpcode, CodeBlock, Opcode}, Context, JsBigInt, JsResult, JsString, JsValue, }; -use boa_gc::{Cell, Gc}; +use boa_gc::Gc; use boa_interner::{Interner, Sym}; use rustc_hash::FxHashMap; use std::mem::size_of; @@ -98,7 +98,10 @@ impl<'b> ByteCompiler<'b> { /// Push a compile time environment to the current `CodeBlock` and return it's index. #[inline] - fn push_compile_environment(&mut self, environment: Gc>) -> usize { + fn push_compile_environment( + &mut self, + environment: Gc>, + ) -> usize { let index = self.code_block.compile_environments.len(); self.code_block.compile_environments.push(environment); index diff --git a/boa_engine/src/object/mod.rs b/boa_engine/src/object/mod.rs index bb85953a4f..fc22b694d8 100644 --- a/boa_engine/src/object/mod.rs +++ b/boa_engine/src/object/mod.rs @@ -108,7 +108,7 @@ impl NativeObject for T { } /// The internal representation of a JavaScript object. -#[derive(Debug, Trace, Finalize)] +#[derive(Debug, Finalize)] pub struct Object { /// The type of the object. pub data: ObjectData, @@ -122,6 +122,17 @@ pub struct Object { private_elements: FxHashMap, } +unsafe impl Trace for Object { + boa_gc::custom_trace!(this, { + mark(&this.data); + mark(&this.properties); + mark(&this.prototype); + for elem in this.private_elements.values() { + mark(elem); + } + }); +} + /// The representation of private object elements. #[derive(Clone, Debug, Trace, Finalize)] pub enum PrivateElement { diff --git a/boa_engine/src/syntax/ast/node/iteration/continue_node/mod.rs b/boa_engine/src/syntax/ast/node/iteration/continue_node/mod.rs index 7933a0bebc..bf1055f5e6 100644 --- a/boa_engine/src/syntax/ast/node/iteration/continue_node/mod.rs +++ b/boa_engine/src/syntax/ast/node/iteration/continue_node/mod.rs @@ -1,5 +1,4 @@ use crate::syntax::ast::node::Node; -use boa_gc::{Finalize, Trace}; use boa_interner::{Interner, Sym, ToInternedString}; #[cfg(feature = "deser")] @@ -19,7 +18,7 @@ use serde::{Deserialize, Serialize}; /// [spec]: https://tc39.es/ecma262/#prod-ContinueStatement /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/continue #[cfg_attr(feature = "deser", derive(Serialize, Deserialize))] -#[derive(Clone, Debug, Trace, Finalize, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq)] pub struct Continue { label: Option, } diff --git a/boa_engine/src/vm/code_block.rs b/boa_engine/src/vm/code_block.rs index 43f57ddb44..9cc52a32a6 100644 --- a/boa_engine/src/vm/code_block.rs +++ b/boa_engine/src/vm/code_block.rs @@ -55,6 +55,7 @@ unsafe impl Readable for f64 {} #[derive(Clone, Debug, Trace, Finalize)] pub struct CodeBlock { /// Name of this function + #[unsafe_ignore_trace] pub(crate) name: Sym, /// The number of arguments expected. @@ -77,6 +78,7 @@ pub struct CodeBlock { pub(crate) literals: Vec, /// Property field names. + #[unsafe_ignore_trace] pub(crate) names: Vec, /// Locators for all bindings in the codeblock. diff --git a/boa_gc/src/lib.rs b/boa_gc/src/lib.rs index 0ae80e1a5e..ac24531793 100644 --- a/boa_gc/src/lib.rs +++ b/boa_gc/src/lib.rs @@ -1,6 +1,6 @@ //! Garbage collector for the Boa JavaScript engine. pub use gc::{ - custom_trace, force_collect, unsafe_empty_trace, Finalize, Gc, GcCell as Cell, + custom_trace, finalizer_safe, force_collect, unsafe_empty_trace, Finalize, Gc, GcCell as Cell, GcCellRef as Ref, GcCellRefMut as RefMut, Trace, }; diff --git a/boa_interner/Cargo.toml b/boa_interner/Cargo.toml index 803a4c5b68..95f27c5810 100644 --- a/boa_interner/Cargo.toml +++ b/boa_interner/Cargo.toml @@ -11,6 +11,7 @@ categories = ["data-structures"] license = "Unlicense/MIT" [dependencies] -string-interner = "0.14.0" serde = { version = "1.0.137", features = ["derive"], optional = true } -gc = { version = "0.4.1", features = ["derive"] } +phf = { version = "0.10.1", features = ["macros"] } +rustc-hash = "1.1.0" +static_assertions = "1.1.0" diff --git a/boa_interner/src/fixed_string.rs b/boa_interner/src/fixed_string.rs new file mode 100644 index 0000000000..659ab27f2e --- /dev/null +++ b/boa_interner/src/fixed_string.rs @@ -0,0 +1,62 @@ +use crate::interned_str::InternedStr; + +#[derive(Debug, Default)] +pub(super) struct FixedString { + inner: String, +} + +impl FixedString { + /// Creates a new, pinned [`FixedString`]. + pub(super) fn new(capacity: usize) -> Self { + Self { + inner: String::with_capacity(capacity), + } + } + + /// Gets the maximum capacity of the [`FixedString`]. + pub(super) fn capacity(&self) -> usize { + self.inner.capacity() + } + + /// Returns `true` if the [`FixedString`] has length zero, + /// and `false` otherwise. + pub(super) fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + /// Tries to push `string` to the [`FixedString`], and returns + /// an [`InternedStr`] pointer to the stored `string`, or + /// `None` if the capacity is not enough to store `string`. + /// + /// # Safety + /// + /// The caller is responsible for ensuring `self` outlives the returned + /// `InternedStr`. + pub(super) unsafe fn push(&mut self, string: &str) -> Option { + let capacity = self.inner.capacity(); + (capacity >= self.inner.len() + string.len()).then(|| { + let old_len = self.inner.len(); + self.inner.push_str(string); + // SAFETY: The caller is responsible for extending the lifetime + // of `self` to outlive the return value. + unsafe { InternedStr::new(self.inner[old_len..self.inner.len()].into()) } + }) + } + + /// Pushes `string` to the [`FixedString`], and returns + /// an [`InternedStr`] pointer to the stored `string`, without + /// checking if the total `capacity` is enough to store `string`. + /// + /// # Safety + /// + /// The caller is responsible for ensuring that `self` outlives the returned + /// `InternedStr` and that it has enough capacity to store `string` without + /// reallocating. + pub(super) unsafe fn push_unchecked(&mut self, string: &str) -> InternedStr { + let old_len = self.inner.len(); + self.inner.push_str(string); + // SAFETY: The caller is responsible for extending the lifetime + // of `self` to outlive the return value. + unsafe { InternedStr::new(self.inner[old_len..self.inner.len()].into()) } + } +} diff --git a/boa_interner/src/interned_str.rs b/boa_interner/src/interned_str.rs new file mode 100644 index 0000000000..f8db815430 --- /dev/null +++ b/boa_interner/src/interned_str.rs @@ -0,0 +1,70 @@ +use std::{borrow::Borrow, ptr::NonNull}; + +/// Wrapper for an interned str pointer, required to +/// quickly check using a hash if a string is inside an [`Interner`][`super::Interner`]. +/// +/// # Safety +/// +/// This struct could cause Undefined Behaviour on: +/// - Use without ensuring the referenced memory is still allocated. +/// - Construction of an [`InternedStr`] from an invalid [`NonNull`]. +/// +/// In general, this should not be used outside of an [`Interner`][`super::Interner`]. +#[derive(Debug, Clone)] +pub(super) struct InternedStr { + ptr: NonNull, +} + +impl InternedStr { + /// Create a new interned string from the given `str`. + /// + /// # Safety + /// + /// Not maintaining the invariants specified on the struct definition + /// could cause Undefined Behaviour. + #[inline] + pub(super) unsafe fn new(ptr: NonNull) -> Self { + Self { ptr } + } + + /// Returns a shared reference to the underlying string. + /// + /// # Safety + /// + /// Not maintaining the invariants specified on the struct definition + /// could cause Undefined Behaviour. + #[inline] + pub(super) unsafe fn as_str(&self) -> &str { + // SAFETY: The caller must verify the invariants + // specified on the struct definition. + unsafe { self.ptr.as_ref() } + } +} + +impl std::hash::Hash for InternedStr { + fn hash(&self, state: &mut H) { + // SAFETY: The caller must verify the invariants + // specified in the struct definition. + unsafe { + self.as_str().hash(state); + } + } +} + +impl Eq for InternedStr {} + +impl PartialEq for InternedStr { + fn eq(&self, other: &Self) -> bool { + // SAFETY: The caller must verify the invariants + // specified in the struct definition. + unsafe { self.as_str() == other.as_str() } + } +} + +impl Borrow for InternedStr { + fn borrow(&self) -> &str { + // SAFETY: The caller must verify the invariants + // specified in the struct definition. + unsafe { self.as_str() } + } +} diff --git a/boa_interner/src/lib.rs b/boa_interner/src/lib.rs index 11cf49b3eb..94e1bddb3c 100644 --- a/boa_interner/src/lib.rs +++ b/boa_interner/src/lib.rs @@ -49,6 +49,7 @@ rust_2018_idioms, future_incompatible, nonstandard_style, + unsafe_op_in_unsafe_fn )] #![allow( clippy::module_name_repetitions, @@ -69,46 +70,71 @@ rustdoc::missing_doc_code_examples )] +extern crate static_assertions as sa; + +mod fixed_string; +mod interned_str; +mod sym; #[cfg(test)] mod tests; -use std::{fmt::Display, num::NonZeroUsize}; +use fixed_string::FixedString; +pub use sym::*; -use gc::{unsafe_empty_trace, Finalize, Trace}; -#[cfg(feature = "serde")] -use serde::{Deserialize, Serialize}; -use string_interner::{backend::BucketBackend, StringInterner, Symbol}; +use std::fmt::{Debug, Display}; -/// Backend of the string interner. -type Backend = BucketBackend; +use interned_str::InternedStr; +use rustc_hash::FxHashMap; /// The string interner for Boa. -/// -/// This is a type alias that makes it easier to reference it in the code. -#[derive(Debug)] +#[derive(Debug, Default)] pub struct Interner { - inner: StringInterner, + // COMMENT FOR DEVS: + // This interner works on the assumption that + // `head` won't ever be reallocated, since this could invalidate + // some of our stored pointers inside `spans`. + // This means that any operation on `head` and `full` should be carefully + // reviewed to not cause Undefined Behaviour. + // `get_or_intern` has a more thorough explanation on this. + // + // Also, if you want to implement `shrink_to_fit` (and friends), + // please check out https://github.com/Robbepop/string-interner/pull/47 first. + // This doesn't implement that method, since implementing it increases + // our memory footprint. + symbols: FxHashMap, + spans: Vec, + head: FixedString, + full: Vec, } impl Interner { - /// Creates a new `StringInterner` with the given initial capacity. + /// Creates a new [`Interner`]. + #[inline] + pub fn new() -> Self { + Self::default() + } + + /// Creates a new [`Interner`] with the specified capacity. #[inline] - pub fn with_capacity(cap: usize) -> Self { + pub fn with_capacity(capacity: usize) -> Self { Self { - inner: StringInterner::with_capacity(cap), + symbols: FxHashMap::default(), + spans: Vec::with_capacity(capacity), + head: FixedString::new(capacity), + full: Vec::new(), } } /// Returns the number of strings interned by the interner. #[inline] pub fn len(&self) -> usize { - self.inner.len() + COMMON_STRINGS.len() + self.spans.len() } - /// Returns `true` if the string interner has no interned strings. + /// Returns `true` if the [`Interner`] contains no interned strings. #[inline] pub fn is_empty(&self) -> bool { - self.inner.is_empty() + COMMON_STRINGS.is_empty() && self.spans.is_empty() } /// Returns the symbol for the given string if any. @@ -119,7 +145,7 @@ impl Interner { T: AsRef, { let string = string.as_ref(); - Self::get_static(string).or_else(|| self.inner.get(string)) + Self::get_common(string).or_else(|| self.symbols.get(string).copied()) } /// Interns the given string. @@ -127,13 +153,67 @@ impl Interner { /// Returns a symbol for resolution into the original string. /// /// # Panics + /// /// If the interner already interns the maximum number of strings possible by the chosen symbol type. pub fn get_or_intern(&mut self, string: T) -> Sym where T: AsRef, { let string = string.as_ref(); - Self::get_static(string).unwrap_or_else(|| self.inner.get_or_intern(string)) + if let Some(sym) = self.get(string) { + return sym; + } + + // SAFETY: + // + // Firstly, this interner works on the assumption that the allocated + // memory by `head` won't ever be moved from its position on the heap, + // which is an important point to understand why manipulating it like + // this is safe. + // + // `String` (which is simply a `Vec` with additional invariants) + // is essentially a pointer to heap memory that can be moved without + // any problems, since copying a pointer cannot invalidate the memory + // that it points to. + // + // However, `String` CAN be invalidated when pushing, extending or + // shrinking it, since all those operations reallocate on the heap. + // + // To prevent that, we HAVE to ensure the capacity will succeed without + // having to reallocate, and the only way to do that without invalidating + // any other alive `InternedStr` is to create a brand new `head` with + // enough capacity and push the old `head` to `full` to keep it alive + // throughout the lifetime of the whole `Interner`. + // + // `FixedString` encapsulates this by only allowing checked `push`es + // to the internal string, but we still have to ensure the memory + // of `head` is not deallocated until the whole `Interner` deallocates, + // which we can do by moving it inside the `Interner` itself, specifically + // on the `full` vector, where every other old `head` also lives. + let interned_str = unsafe { + self.head.push(string).unwrap_or_else(|| { + let new_cap = + (usize::max(self.head.capacity(), string.len()) + 1).next_power_of_two(); + let new_head = FixedString::new(new_cap); + let old_head = std::mem::replace(&mut self.head, new_head); + + // If the user creates an `Interner` + // with `Interner::with_capacity(BIG_NUMBER)` and + // the first interned string's length is bigger than `BIG_NUMBER`, + // `self.full.push(old_head)` would push a big, empty string of + // allocated size `BIG_NUMBER` into `full`. + // This prevents that case. + if !old_head.is_empty() { + self.full.push(old_head); + } + self.head.push_unchecked(string) + }) + }; + + // SAFETY: We are obtaining a pointer to the internal memory of + // `head`, which is alive through the whole life of `Interner`, so + // this is safe. + unsafe { self.generate_symbol(interned_str) } } /// Interns the given `'static` string. @@ -142,32 +222,32 @@ impl Interner { /// /// # Note /// - /// This is more efficient than [`StringInterner::get_or_intern`] since it might - /// avoid some memory allocations if the backends supports this. + /// This is more efficient than [`Interner::get_or_intern`], since it + /// avoids storing `string` inside the [`Interner`]. /// /// # Panics /// /// If the interner already interns the maximum number of strings possible /// by the chosen symbol type. pub fn get_or_intern_static(&mut self, string: &'static str) -> Sym { - Self::get_static(string).unwrap_or_else(|| self.inner.get_or_intern_static(string)) - } - - /// Shrink backend capacity to fit the interned strings exactly. - #[inline] - pub fn shrink_to_fit(&mut self) { - self.inner.shrink_to_fit(); + self.get(string).unwrap_or_else(|| { + // SAFETY: a static `str` is always alive, so its pointer + // should therefore always be valid. + unsafe { self.generate_symbol(InternedStr::new(string.into())) } + }) } /// Returns the string for the given symbol if any. #[inline] pub fn resolve(&self, symbol: Sym) -> Option<&str> { - let index = symbol.as_raw().get(); - if index <= Self::STATIC_STRINGS.len() { - Some(Self::STATIC_STRINGS[index - 1]) - } else { - self.inner.resolve(symbol) - } + let index = symbol.get() - 1; + + COMMON_STRINGS.index(index).copied().or_else(|| { + self.spans.get(index - COMMON_STRINGS.len()).map(|ptr| + // SAFETY: We always ensure the stored `InternedStr`s always + // reference memory inside `head` and `full` + unsafe {ptr.as_str()}) + }) } /// Returns the string for the given symbol. @@ -180,178 +260,33 @@ impl Interner { self.resolve(symbol).expect("string disappeared") } - /// Gets the symbol of the static string if one of them - fn get_static(string: &str) -> Option { - Self::STATIC_STRINGS - .into_iter() - .enumerate() - .find(|&(_i, s)| string == s) - .map(|(i, _str)| { - let raw = NonZeroUsize::new(i.wrapping_add(1)).expect("static array too big"); - Sym::from_raw(raw) + /// Gets the symbol of the common string if one of them + fn get_common(string: &str) -> Option { + COMMON_STRINGS.get_index(string).map(|idx| + // SAFETY: `idx >= 0`, since it's an `usize`, and `idx + 1 > 0`. + // In this case, we don't need to worry about overflows + // because we have a static assertion in place checking that + // `COMMON_STRINGS.len() < usize::MAX`. + unsafe { + Sym::new_unchecked(idx + 1) }) } -} - -impl FromIterator for Interner -where - T: AsRef, -{ - #[inline] - fn from_iter(iter: I) -> Self - where - I: IntoIterator, - { - Self { - inner: StringInterner::from_iter(iter), - } - } -} - -impl Extend for Interner -where - T: AsRef, -{ - #[inline] - fn extend(&mut self, iter: I) - where - I: IntoIterator, - { - self.inner.extend(iter); - } -} - -impl<'a> IntoIterator for &'a Interner { - type Item = (Sym, &'a str); - type IntoIter = <&'a Backend as IntoIterator>::IntoIter; - - #[inline] - fn into_iter(self) -> Self::IntoIter { - self.inner.into_iter() - } -} - -impl Default for Interner { - fn default() -> Self { - Self { - inner: StringInterner::new(), - } - } -} - -/// The string symbol type for Boa. -/// -/// This symbol type is internally a `NonZeroUsize`, which makes it pointer-width in size and it's -/// optimized so that it can occupy 1 pointer width even in an `Option` type. -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Finalize)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[cfg_attr(feature = "serde", serde(transparent))] -#[allow(clippy::unsafe_derive_deserialize)] -pub struct Sym { - value: NonZeroUsize, -} - -impl Sym { - /// Padding for the symbol internal value. - const PADDING: usize = Interner::STATIC_STRINGS.len() + 1; - - /// Symbol for the empty string (`""`). - pub const EMPTY_STRING: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(1)) }; - - /// Symbol for the `"arguments"` string. - pub const ARGUMENTS: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(2)) }; - - /// Symbol for the `"await"` string. - pub const AWAIT: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(3)) }; - - /// Symbol for the `"yield"` string. - pub const YIELD: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(4)) }; - - /// Symbol for the `"eval"` string. - pub const EVAL: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(5)) }; - - /// Symbol for the `"default"` string. - pub const DEFAULT: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(6)) }; - - /// Symbol for the `"null"` string. - pub const NULL: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(7)) }; - - /// Symbol for the `"RegExp"` string. - pub const REGEXP: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(8)) }; - - /// Symbol for the `"get"` string. - pub const GET: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(9)) }; - - /// Symbol for the `"set"` string. - pub const SET: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(10)) }; - - /// Symbol for the `"
"` string. - pub const MAIN: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(11)) }; - - /// Symbol for the `"raw"` string. - pub const RAW: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(12)) }; - - /// Symbol for the `"static"` string. - pub const STATIC: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(13)) }; - - /// Symbol for the `"prototype"` string. - pub const PROTOTYPE: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(14)) }; - - /// Symbol for the `"constructor"` string. - pub const CONSTRUCTOR: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(15)) }; - - /// Symbol for the `"implements"` string. - pub const IMPLEMENTS: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(16)) }; - - /// Symbol for the `"interface"` string. - pub const INTERFACE: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(17)) }; - - /// Symbol for the `"let"` string. - pub const LET: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(18)) }; - - /// Symbol for the `"package"` string. - pub const PACKAGE: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(19)) }; - - /// Symbol for the `"private"` string. - pub const PRIVATE: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(20)) }; - - /// Symbol for the `"protected"` string. - pub const PROTECTED: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(21)) }; - - /// Symbol for the `"public"` string. - pub const PUBLIC: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(22)) }; - - /// Creates a `Sym` from a raw `NonZeroUsize`. - const fn from_raw(value: NonZeroUsize) -> Self { - Self { value } - } - /// Retrieves the raw `NonZeroUsize` for this symbol. - const fn as_raw(self) -> NonZeroUsize { - self.value - } -} - -impl Symbol for Sym { - #[inline] - fn try_from_usize(index: usize) -> Option { - index - .checked_add(Self::PADDING) - .and_then(NonZeroUsize::new) - .map(|value| Self { value }) - } - - #[inline] - fn to_usize(self) -> usize { - self.value.get() - Self::PADDING + /// Generates a new symbol for the provided [`str`] pointer. + /// + /// # Safety + /// + /// The caller must ensure `string` points to a valid + /// memory inside `head` and that it won't be invalidated + /// by allocations and deallocations. + unsafe fn generate_symbol(&mut self, string: InternedStr) -> Sym { + let next = Sym::new(self.len() + 1).expect("cannot get interner symbol: integer overflow"); + self.spans.push(string.clone()); + self.symbols.insert(string, next); + next } } -// Safe because `Sym` implements `Copy`. -unsafe impl Trace for Sym { - unsafe_empty_trace!(); -} - /// Converts a given element to a string using an interner. pub trait ToInternedString { /// Converts a given element to a string using an interner. @@ -366,33 +301,3 @@ where self.to_string() } } - -impl Interner { - /// List of commonly used static strings. - /// - /// Make sure that any string added as a `Sym` constant is also added here. - const STATIC_STRINGS: [&'static str; 22] = [ - "", - "arguments", - "await", - "yield", - "eval", - "default", - "null", - "RegExp", - "get", - "set", - "
", - "raw", - "static", - "prototype", - "constructor", - "implements", - "interface", - "let", - "package", - "private", - "protected", - "public", - ]; -} diff --git a/boa_interner/src/sym.rs b/boa_interner/src/sym.rs new file mode 100644 index 0000000000..60f147c707 --- /dev/null +++ b/boa_interner/src/sym.rs @@ -0,0 +1,148 @@ +use std::num::NonZeroUsize; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +/// The string symbol type for Boa. +/// +/// This symbol type is internally a `NonZeroUsize`, which makes it pointer-width in size and it's +/// optimized so that it can occupy 1 pointer width even in an `Option` type. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "serde", serde(transparent))] +#[allow(clippy::unsafe_derive_deserialize)] +pub struct Sym { + value: NonZeroUsize, +} + +impl Sym { + /// Symbol for the empty string (`""`). + pub const EMPTY_STRING: Self = unsafe { Self::new_unchecked(1) }; + + /// Symbol for the `"arguments"` string. + pub const ARGUMENTS: Self = unsafe { Self::new_unchecked(2) }; + + /// Symbol for the `"await"` string. + pub const AWAIT: Self = unsafe { Self::new_unchecked(3) }; + + /// Symbol for the `"yield"` string. + pub const YIELD: Self = unsafe { Self::new_unchecked(4) }; + + /// Symbol for the `"eval"` string. + pub const EVAL: Self = unsafe { Self::new_unchecked(5) }; + + /// Symbol for the `"default"` string. + pub const DEFAULT: Self = unsafe { Self::new_unchecked(6) }; + + /// Symbol for the `"null"` string. + pub const NULL: Self = unsafe { Self::new_unchecked(7) }; + + /// Symbol for the `"RegExp"` string. + pub const REGEXP: Self = unsafe { Self::new_unchecked(8) }; + + /// Symbol for the `"get"` string. + pub const GET: Self = unsafe { Self::new_unchecked(9) }; + + /// Symbol for the `"set"` string. + pub const SET: Self = unsafe { Self::new_unchecked(10) }; + + /// Symbol for the `"
"` string. + pub const MAIN: Self = unsafe { Self::new_unchecked(11) }; + + /// Symbol for the `"raw"` string. + pub const RAW: Self = unsafe { Self::new_unchecked(12) }; + + /// Symbol for the `"static"` string. + pub const STATIC: Self = unsafe { Self::new_unchecked(13) }; + + /// Symbol for the `"prototype"` string. + pub const PROTOTYPE: Self = unsafe { Self::new_unchecked(14) }; + + /// Symbol for the `"constructor"` string. + pub const CONSTRUCTOR: Self = unsafe { Self::new_unchecked(15) }; + + /// Symbol for the `"implements"` string. + pub const IMPLEMENTS: Self = unsafe { Self::new_unchecked(16) }; + + /// Symbol for the `"interface"` string. + pub const INTERFACE: Self = unsafe { Self::new_unchecked(17) }; + + /// Symbol for the `"let"` string. + pub const LET: Self = unsafe { Self::new_unchecked(18) }; + + /// Symbol for the `"package"` string. + pub const PACKAGE: Self = unsafe { Self::new_unchecked(19) }; + + /// Symbol for the `"private"` string. + pub const PRIVATE: Self = unsafe { Self::new_unchecked(20) }; + + /// Symbol for the `"protected"` string. + pub const PROTECTED: Self = unsafe { Self::new_unchecked(21) }; + + /// Symbol for the `"public"` string. + pub const PUBLIC: Self = unsafe { Self::new_unchecked(22) }; + + /// Creates a new [`Sym`] from the provided `value`, or returns `None` if `index` is zero. + #[inline] + pub(super) fn new(value: usize) -> Option { + NonZeroUsize::new(value).map(|value| Self { value }) + } + + /// Creates a new [`Sym`] from the provided `value`, without checking if `value` is not zero + /// + /// # Safety + /// + /// `value` must not be zero. + #[inline] + pub(super) const unsafe fn new_unchecked(value: usize) -> Self { + Self { + value: + // SAFETY: The caller must ensure the invariants of the function. + unsafe { + NonZeroUsize::new_unchecked(value) + }, + } + } + + /// Returns the internal value of the [`Sym`] + #[inline] + pub(super) const fn get(self) -> usize { + self.value.get() + } +} + +/// Ordered set of commonly used static strings. +/// +/// # Note +/// +/// `COMMON_STRINGS` and the constants defined in [`Sym`] must always +/// be in sync. +pub(super) static COMMON_STRINGS: phf::OrderedSet<&'static str> = { + const COMMON_STRINGS: phf::OrderedSet<&'static str> = phf::phf_ordered_set! { + "", + "arguments", + "await", + "yield", + "eval", + "default", + "null", + "RegExp", + "get", + "set", + "
", + "raw", + "static", + "prototype", + "constructor", + "implements", + "interface", + "let", + "package", + "private", + "protected", + "public", + }; + // A `COMMON_STRINGS` of size `usize::MAX` would cause an overflow on our `Interner` + sa::const_assert!(COMMON_STRINGS.len() < usize::MAX); + COMMON_STRINGS +}; diff --git a/boa_interner/src/tests.rs b/boa_interner/src/tests.rs index 3df0fd194f..d42d5b9bf2 100644 --- a/boa_interner/src/tests.rs +++ b/boa_interner/src/tests.rs @@ -1,53 +1,24 @@ -use crate::{Interner, Sym}; -use std::num::NonZeroUsize; +use crate::{Interner, Sym, COMMON_STRINGS}; #[track_caller] fn sym_from_usize(index: usize) -> Sym { - Sym::from_raw(NonZeroUsize::new(index).expect("Invalid NonZeroUsize")) + Sym::new(index).expect("Invalid NonZeroUsize") } #[test] fn check_static_strings() { let mut interner = Interner::default(); - for (i, str) in Interner::STATIC_STRINGS.into_iter().enumerate() { + for (i, str) in COMMON_STRINGS.into_iter().enumerate() { assert_eq!(interner.get_or_intern(str), sym_from_usize(i + 1)); } } -#[test] -fn check_constants() { - assert_eq!(Sym::EMPTY_STRING, sym_from_usize(1)); - assert_eq!(Sym::ARGUMENTS, sym_from_usize(2)); - assert_eq!(Sym::AWAIT, sym_from_usize(3)); - assert_eq!(Sym::YIELD, sym_from_usize(4)); - assert_eq!(Sym::EVAL, sym_from_usize(5)); - assert_eq!(Sym::DEFAULT, sym_from_usize(6)); - assert_eq!(Sym::NULL, sym_from_usize(7)); - assert_eq!(Sym::REGEXP, sym_from_usize(8)); - assert_eq!(Sym::GET, sym_from_usize(9)); - assert_eq!(Sym::SET, sym_from_usize(10)); - assert_eq!(Sym::MAIN, sym_from_usize(11)); - assert_eq!(Sym::RAW, sym_from_usize(12)); - assert_eq!(Sym::STATIC, sym_from_usize(13)); - assert_eq!(Sym::PROTOTYPE, sym_from_usize(14)); - assert_eq!(Sym::CONSTRUCTOR, sym_from_usize(15)); - assert_eq!(Sym::IMPLEMENTS, sym_from_usize(16)); - assert_eq!(Sym::INTERFACE, sym_from_usize(17)); - assert_eq!(Sym::LET, sym_from_usize(18)); - assert_eq!(Sym::PACKAGE, sym_from_usize(19)); - assert_eq!(Sym::PRIVATE, sym_from_usize(20)); - assert_eq!(Sym::PROTECTED, sym_from_usize(21)); - assert_eq!(Sym::PUBLIC, sym_from_usize(22)); -} - #[test] fn check_new_string() { let mut interner = Interner::default(); - assert!( - interner.get_or_intern("my test string").as_raw().get() > Interner::STATIC_STRINGS.len() - ); + assert!(interner.get_or_intern("my test string").get() > COMMON_STRINGS.len()); } #[test] @@ -71,8 +42,9 @@ fn check_resolve() { fn check_static_resolve() { let mut interner = Interner::default(); - for string in Interner::STATIC_STRINGS + for string in COMMON_STRINGS .into_iter() + .copied() .chain(["my test str", "hello world", ";"].into_iter()) { let sym = interner.get_or_intern_static(string);