Browse Source

Remove `string-interner` dependency and implement custom string `Interner` (#2147)

So, @raskad and myself had a short discussion about the state of #736, and we came to the conclusion that it would be a good time to implement our own string interner; partly because the `string-interner` crate is a bit unmaintained (as shown by https://github.com/Robbepop/string-interner/pull/42 and https://github.com/Robbepop/string-interner/pull/47), and partly because it would be hard to experiment with custom optimizations for UTF-16 strings. I still want to thank @Robbepop for the original implementation though, because some parts of this design have been shamelessly stolen from it 😅.

Having said that, this PR is a complete reimplementation of the interner, but with some modifications to (hopefully!) make it a bit easier to experiment with UTF-16 strings, apply optimizations, and whatnot :)
pull/2153/head
jedel1043 2 years ago
parent
commit
f7aef10a61
  1. 38
      Cargo.lock
  2. 47
      boa_engine/src/builtins/function/mod.rs
  3. 7
      boa_engine/src/bytecompiler.rs
  4. 13
      boa_engine/src/object/mod.rs
  5. 3
      boa_engine/src/syntax/ast/node/iteration/continue_node/mod.rs
  6. 2
      boa_engine/src/vm/code_block.rs
  7. 2
      boa_gc/src/lib.rs
  8. 5
      boa_interner/Cargo.toml
  9. 62
      boa_interner/src/fixed_string.rs
  10. 70
      boa_interner/src/interned_str.rs
  11. 365
      boa_interner/src/lib.rs
  12. 148
      boa_interner/src/sym.rs
  13. 40
      boa_interner/src/tests.rs

38
Cargo.lock generated

@ -2,17 +2,6 @@
# It is not intended for manual editing. # It is not intended for manual editing.
version = 3 version = 3
[[package]]
name = "ahash"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47"
dependencies = [
"getrandom",
"once_cell",
"version_check",
]
[[package]] [[package]]
name = "aho-corasick" name = "aho-corasick"
version = "0.7.18" version = "0.7.18"
@ -144,9 +133,10 @@ dependencies = [
name = "boa_interner" name = "boa_interner"
version = "0.15.0" version = "0.15.0"
dependencies = [ dependencies = [
"gc", "phf",
"rustc-hash",
"serde", "serde",
"string-interner", "static_assertions",
] ]
[[package]] [[package]]
@ -622,15 +612,6 @@ version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
[[package]]
name = "hashbrown"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
dependencies = [
"ahash",
]
[[package]] [[package]]
name = "hashbrown" name = "hashbrown"
version = "0.12.1" version = "0.12.1"
@ -797,7 +778,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e"
dependencies = [ dependencies = [
"autocfg", "autocfg",
"hashbrown 0.12.1", "hashbrown",
] ]
[[package]] [[package]]
@ -1528,17 +1509,6 @@ version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e08d8363704e6c71fc928674353e6b7c23dcea9d82d7012c8faf2a3a025f8d0" checksum = "9e08d8363704e6c71fc928674353e6b7c23dcea9d82d7012c8faf2a3a025f8d0"
[[package]]
name = "string-interner"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91e2531d8525b29b514d25e275a43581320d587b86db302b9a7e464bac579648"
dependencies = [
"cfg-if",
"hashbrown 0.11.2",
"serde",
]
[[package]] [[package]]
name = "strsim" name = "strsim"
version = "0.8.0" version = "0.8.0"

47
boa_engine/src/builtins/function/mod.rs

@ -26,7 +26,7 @@ use crate::{
value::IntegerOrInfinity, value::IntegerOrInfinity,
Context, JsResult, JsString, JsValue, Context, JsResult, JsString, JsValue,
}; };
use boa_gc::{self, Finalize, Gc, Trace}; use boa_gc::{self, custom_trace, Finalize, Gc, Trace};
use boa_interner::Sym; use boa_interner::Sym;
use boa_profiler::Profiler; use boa_profiler::Profiler;
use dyn_clone::DynClone; use dyn_clone::DynClone;
@ -138,12 +138,26 @@ impl ConstructorKind {
/// - [ECMAScript specification][spec] /// - [ECMAScript specification][spec]
/// ///
/// [spec]: https://tc39.es/ecma262/#sec-classfielddefinition-record-specification-type /// [spec]: https://tc39.es/ecma262/#sec-classfielddefinition-record-specification-type
#[derive(Clone, Debug, Trace, Finalize)] #[derive(Clone, Debug, Finalize)]
pub enum ClassFieldDefinition { pub enum ClassFieldDefinition {
Public(PropertyKey, JsFunction), Public(PropertyKey, JsFunction),
Private(Sym, JsFunction), Private(Sym, JsFunction),
} }
unsafe impl Trace for ClassFieldDefinition {
custom_trace! {this, {
match this {
Self::Public(key, func) => {
mark(key);
mark(func);
}
Self::Private(_, func) => {
mark(func);
}
}
}}
}
/// Wrapper for `Gc<GcCell<dyn NativeObject>>` that allows passing additional /// Wrapper for `Gc<GcCell<dyn NativeObject>>` that allows passing additional
/// captures through a `Copy` closure. /// captures through a `Copy` closure.
/// ///
@ -191,18 +205,14 @@ impl Captures {
/// (AST Node). /// (AST Node).
/// ///
/// <https://tc39.es/ecma262/#sec-ecmascript-function-objects> /// <https://tc39.es/ecma262/#sec-ecmascript-function-objects>
#[derive(Trace, Finalize)] #[derive(Finalize)]
pub enum Function { pub enum Function {
Native { Native {
#[unsafe_ignore_trace]
function: NativeFunctionSignature, function: NativeFunctionSignature,
#[unsafe_ignore_trace]
constructor: Option<ConstructorKind>, constructor: Option<ConstructorKind>,
}, },
Closure { Closure {
#[unsafe_ignore_trace]
function: Box<dyn ClosureFunctionSignature>, function: Box<dyn ClosureFunctionSignature>,
#[unsafe_ignore_trace]
constructor: Option<ConstructorKind>, constructor: Option<ConstructorKind>,
captures: Captures, captures: Captures,
}, },
@ -211,7 +221,6 @@ pub enum Function {
environments: DeclarativeEnvironmentStack, environments: DeclarativeEnvironmentStack,
/// The `[[ConstructorKind]]` internal slot. /// The `[[ConstructorKind]]` internal slot.
#[unsafe_ignore_trace]
constructor_kind: ConstructorKind, constructor_kind: ConstructorKind,
/// The `[[HomeObject]]` internal slot. /// The `[[HomeObject]]` internal slot.
@ -229,6 +238,28 @@ pub enum Function {
}, },
} }
unsafe impl Trace for Function {
custom_trace! {this, {
match this {
Self::Native { .. } => {}
Self::Closure { captures, .. } => mark(captures),
Self::Ordinary { code, environments, home_object, fields, private_methods, .. } => {
mark(code);
mark(environments);
mark(home_object);
mark(fields);
for (_, elem) in private_methods {
mark(elem);
}
}
Self::Generator { code, environments } => {
mark(code);
mark(environments);
}
}
}}
}
impl fmt::Debug for Function { impl fmt::Debug for Function {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "Function {{ ... }}") write!(f, "Function {{ ... }}")

7
boa_engine/src/bytecompiler.rs

@ -20,7 +20,7 @@ use crate::{
vm::{BindingOpcode, CodeBlock, Opcode}, vm::{BindingOpcode, CodeBlock, Opcode},
Context, JsBigInt, JsResult, JsString, JsValue, Context, JsBigInt, JsResult, JsString, JsValue,
}; };
use boa_gc::{Cell, Gc}; use boa_gc::Gc;
use boa_interner::{Interner, Sym}; use boa_interner::{Interner, Sym};
use rustc_hash::FxHashMap; use rustc_hash::FxHashMap;
use std::mem::size_of; use std::mem::size_of;
@ -98,7 +98,10 @@ impl<'b> ByteCompiler<'b> {
/// Push a compile time environment to the current `CodeBlock` and return it's index. /// Push a compile time environment to the current `CodeBlock` and return it's index.
#[inline] #[inline]
fn push_compile_environment(&mut self, environment: Gc<Cell<CompileTimeEnvironment>>) -> usize { fn push_compile_environment(
&mut self,
environment: Gc<boa_gc::Cell<CompileTimeEnvironment>>,
) -> usize {
let index = self.code_block.compile_environments.len(); let index = self.code_block.compile_environments.len();
self.code_block.compile_environments.push(environment); self.code_block.compile_environments.push(environment);
index index

13
boa_engine/src/object/mod.rs

@ -108,7 +108,7 @@ impl<T: Any + Debug + Trace> NativeObject for T {
} }
/// The internal representation of a JavaScript object. /// The internal representation of a JavaScript object.
#[derive(Debug, Trace, Finalize)] #[derive(Debug, Finalize)]
pub struct Object { pub struct Object {
/// The type of the object. /// The type of the object.
pub data: ObjectData, pub data: ObjectData,
@ -122,6 +122,17 @@ pub struct Object {
private_elements: FxHashMap<Sym, PrivateElement>, private_elements: FxHashMap<Sym, PrivateElement>,
} }
unsafe impl Trace for Object {
boa_gc::custom_trace!(this, {
mark(&this.data);
mark(&this.properties);
mark(&this.prototype);
for elem in this.private_elements.values() {
mark(elem);
}
});
}
/// The representation of private object elements. /// The representation of private object elements.
#[derive(Clone, Debug, Trace, Finalize)] #[derive(Clone, Debug, Trace, Finalize)]
pub enum PrivateElement { pub enum PrivateElement {

3
boa_engine/src/syntax/ast/node/iteration/continue_node/mod.rs

@ -1,5 +1,4 @@
use crate::syntax::ast::node::Node; use crate::syntax::ast::node::Node;
use boa_gc::{Finalize, Trace};
use boa_interner::{Interner, Sym, ToInternedString}; use boa_interner::{Interner, Sym, ToInternedString};
#[cfg(feature = "deser")] #[cfg(feature = "deser")]
@ -19,7 +18,7 @@ use serde::{Deserialize, Serialize};
/// [spec]: https://tc39.es/ecma262/#prod-ContinueStatement /// [spec]: https://tc39.es/ecma262/#prod-ContinueStatement
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/continue /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/continue
#[cfg_attr(feature = "deser", derive(Serialize, Deserialize))] #[cfg_attr(feature = "deser", derive(Serialize, Deserialize))]
#[derive(Clone, Debug, Trace, Finalize, PartialEq)] #[derive(Clone, Copy, Debug, PartialEq)]
pub struct Continue { pub struct Continue {
label: Option<Sym>, label: Option<Sym>,
} }

2
boa_engine/src/vm/code_block.rs

@ -55,6 +55,7 @@ unsafe impl Readable for f64 {}
#[derive(Clone, Debug, Trace, Finalize)] #[derive(Clone, Debug, Trace, Finalize)]
pub struct CodeBlock { pub struct CodeBlock {
/// Name of this function /// Name of this function
#[unsafe_ignore_trace]
pub(crate) name: Sym, pub(crate) name: Sym,
/// The number of arguments expected. /// The number of arguments expected.
@ -77,6 +78,7 @@ pub struct CodeBlock {
pub(crate) literals: Vec<JsValue>, pub(crate) literals: Vec<JsValue>,
/// Property field names. /// Property field names.
#[unsafe_ignore_trace]
pub(crate) names: Vec<Sym>, pub(crate) names: Vec<Sym>,
/// Locators for all bindings in the codeblock. /// Locators for all bindings in the codeblock.

2
boa_gc/src/lib.rs

@ -1,6 +1,6 @@
//! Garbage collector for the Boa JavaScript engine. //! Garbage collector for the Boa JavaScript engine.
pub use gc::{ pub use gc::{
custom_trace, force_collect, unsafe_empty_trace, Finalize, Gc, GcCell as Cell, custom_trace, finalizer_safe, force_collect, unsafe_empty_trace, Finalize, Gc, GcCell as Cell,
GcCellRef as Ref, GcCellRefMut as RefMut, Trace, GcCellRef as Ref, GcCellRefMut as RefMut, Trace,
}; };

5
boa_interner/Cargo.toml

@ -11,6 +11,7 @@ categories = ["data-structures"]
license = "Unlicense/MIT" license = "Unlicense/MIT"
[dependencies] [dependencies]
string-interner = "0.14.0"
serde = { version = "1.0.137", features = ["derive"], optional = true } serde = { version = "1.0.137", features = ["derive"], optional = true }
gc = { version = "0.4.1", features = ["derive"] } phf = { version = "0.10.1", features = ["macros"] }
rustc-hash = "1.1.0"
static_assertions = "1.1.0"

62
boa_interner/src/fixed_string.rs

@ -0,0 +1,62 @@
use crate::interned_str::InternedStr;
#[derive(Debug, Default)]
pub(super) struct FixedString {
inner: String,
}
impl FixedString {
/// Creates a new, pinned [`FixedString`].
pub(super) fn new(capacity: usize) -> Self {
Self {
inner: String::with_capacity(capacity),
}
}
/// Gets the maximum capacity of the [`FixedString`].
pub(super) fn capacity(&self) -> usize {
self.inner.capacity()
}
/// Returns `true` if the [`FixedString`] has length zero,
/// and `false` otherwise.
pub(super) fn is_empty(&self) -> bool {
self.inner.is_empty()
}
/// Tries to push `string` to the [`FixedString`], and returns
/// an [`InternedStr`] pointer to the stored `string`, or
/// `None` if the capacity is not enough to store `string`.
///
/// # Safety
///
/// The caller is responsible for ensuring `self` outlives the returned
/// `InternedStr`.
pub(super) unsafe fn push(&mut self, string: &str) -> Option<InternedStr> {
let capacity = self.inner.capacity();
(capacity >= self.inner.len() + string.len()).then(|| {
let old_len = self.inner.len();
self.inner.push_str(string);
// SAFETY: The caller is responsible for extending the lifetime
// of `self` to outlive the return value.
unsafe { InternedStr::new(self.inner[old_len..self.inner.len()].into()) }
})
}
/// Pushes `string` to the [`FixedString`], and returns
/// an [`InternedStr`] pointer to the stored `string`, without
/// checking if the total `capacity` is enough to store `string`.
///
/// # Safety
///
/// The caller is responsible for ensuring that `self` outlives the returned
/// `InternedStr` and that it has enough capacity to store `string` without
/// reallocating.
pub(super) unsafe fn push_unchecked(&mut self, string: &str) -> InternedStr {
let old_len = self.inner.len();
self.inner.push_str(string);
// SAFETY: The caller is responsible for extending the lifetime
// of `self` to outlive the return value.
unsafe { InternedStr::new(self.inner[old_len..self.inner.len()].into()) }
}
}

70
boa_interner/src/interned_str.rs

@ -0,0 +1,70 @@
use std::{borrow::Borrow, ptr::NonNull};
/// Wrapper for an interned str pointer, required to
/// quickly check using a hash if a string is inside an [`Interner`][`super::Interner`].
///
/// # Safety
///
/// This struct could cause Undefined Behaviour on:
/// - Use without ensuring the referenced memory is still allocated.
/// - Construction of an [`InternedStr`] from an invalid [`NonNull<str>`].
///
/// In general, this should not be used outside of an [`Interner`][`super::Interner`].
#[derive(Debug, Clone)]
pub(super) struct InternedStr {
ptr: NonNull<str>,
}
impl InternedStr {
/// Create a new interned string from the given `str`.
///
/// # Safety
///
/// Not maintaining the invariants specified on the struct definition
/// could cause Undefined Behaviour.
#[inline]
pub(super) unsafe fn new(ptr: NonNull<str>) -> Self {
Self { ptr }
}
/// Returns a shared reference to the underlying string.
///
/// # Safety
///
/// Not maintaining the invariants specified on the struct definition
/// could cause Undefined Behaviour.
#[inline]
pub(super) unsafe fn as_str(&self) -> &str {
// SAFETY: The caller must verify the invariants
// specified on the struct definition.
unsafe { self.ptr.as_ref() }
}
}
impl std::hash::Hash for InternedStr {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
// SAFETY: The caller must verify the invariants
// specified in the struct definition.
unsafe {
self.as_str().hash(state);
}
}
}
impl Eq for InternedStr {}
impl PartialEq for InternedStr {
fn eq(&self, other: &Self) -> bool {
// SAFETY: The caller must verify the invariants
// specified in the struct definition.
unsafe { self.as_str() == other.as_str() }
}
}
impl Borrow<str> for InternedStr {
fn borrow(&self) -> &str {
// SAFETY: The caller must verify the invariants
// specified in the struct definition.
unsafe { self.as_str() }
}
}

365
boa_interner/src/lib.rs

@ -49,6 +49,7 @@
rust_2018_idioms, rust_2018_idioms,
future_incompatible, future_incompatible,
nonstandard_style, nonstandard_style,
unsafe_op_in_unsafe_fn
)] )]
#![allow( #![allow(
clippy::module_name_repetitions, clippy::module_name_repetitions,
@ -69,46 +70,71 @@
rustdoc::missing_doc_code_examples rustdoc::missing_doc_code_examples
)] )]
extern crate static_assertions as sa;
mod fixed_string;
mod interned_str;
mod sym;
#[cfg(test)] #[cfg(test)]
mod tests; mod tests;
use std::{fmt::Display, num::NonZeroUsize}; use fixed_string::FixedString;
pub use sym::*;
use gc::{unsafe_empty_trace, Finalize, Trace}; use std::fmt::{Debug, Display};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use string_interner::{backend::BucketBackend, StringInterner, Symbol};
/// Backend of the string interner. use interned_str::InternedStr;
type Backend = BucketBackend<Sym>; use rustc_hash::FxHashMap;
/// The string interner for Boa. /// The string interner for Boa.
/// #[derive(Debug, Default)]
/// This is a type alias that makes it easier to reference it in the code.
#[derive(Debug)]
pub struct Interner { pub struct Interner {
inner: StringInterner<Backend>, // COMMENT FOR DEVS:
// This interner works on the assumption that
// `head` won't ever be reallocated, since this could invalidate
// some of our stored pointers inside `spans`.
// This means that any operation on `head` and `full` should be carefully
// reviewed to not cause Undefined Behaviour.
// `get_or_intern` has a more thorough explanation on this.
//
// Also, if you want to implement `shrink_to_fit` (and friends),
// please check out https://github.com/Robbepop/string-interner/pull/47 first.
// This doesn't implement that method, since implementing it increases
// our memory footprint.
symbols: FxHashMap<InternedStr, Sym>,
spans: Vec<InternedStr>,
head: FixedString,
full: Vec<FixedString>,
} }
impl Interner { impl Interner {
/// Creates a new `StringInterner` with the given initial capacity. /// Creates a new [`Interner`].
#[inline]
pub fn new() -> Self {
Self::default()
}
/// Creates a new [`Interner`] with the specified capacity.
#[inline] #[inline]
pub fn with_capacity(cap: usize) -> Self { pub fn with_capacity(capacity: usize) -> Self {
Self { Self {
inner: StringInterner::with_capacity(cap), symbols: FxHashMap::default(),
spans: Vec::with_capacity(capacity),
head: FixedString::new(capacity),
full: Vec::new(),
} }
} }
/// Returns the number of strings interned by the interner. /// Returns the number of strings interned by the interner.
#[inline] #[inline]
pub fn len(&self) -> usize { pub fn len(&self) -> usize {
self.inner.len() COMMON_STRINGS.len() + self.spans.len()
} }
/// Returns `true` if the string interner has no interned strings. /// Returns `true` if the [`Interner`] contains no interned strings.
#[inline] #[inline]
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
self.inner.is_empty() COMMON_STRINGS.is_empty() && self.spans.is_empty()
} }
/// Returns the symbol for the given string if any. /// Returns the symbol for the given string if any.
@ -119,7 +145,7 @@ impl Interner {
T: AsRef<str>, T: AsRef<str>,
{ {
let string = string.as_ref(); let string = string.as_ref();
Self::get_static(string).or_else(|| self.inner.get(string)) Self::get_common(string).or_else(|| self.symbols.get(string).copied())
} }
/// Interns the given string. /// Interns the given string.
@ -127,13 +153,67 @@ impl Interner {
/// Returns a symbol for resolution into the original string. /// Returns a symbol for resolution into the original string.
/// ///
/// # Panics /// # Panics
///
/// If the interner already interns the maximum number of strings possible by the chosen symbol type. /// If the interner already interns the maximum number of strings possible by the chosen symbol type.
pub fn get_or_intern<T>(&mut self, string: T) -> Sym pub fn get_or_intern<T>(&mut self, string: T) -> Sym
where where
T: AsRef<str>, T: AsRef<str>,
{ {
let string = string.as_ref(); let string = string.as_ref();
Self::get_static(string).unwrap_or_else(|| self.inner.get_or_intern(string)) if let Some(sym) = self.get(string) {
return sym;
}
// SAFETY:
//
// Firstly, this interner works on the assumption that the allocated
// memory by `head` won't ever be moved from its position on the heap,
// which is an important point to understand why manipulating it like
// this is safe.
//
// `String` (which is simply a `Vec<u8>` with additional invariants)
// is essentially a pointer to heap memory that can be moved without
// any problems, since copying a pointer cannot invalidate the memory
// that it points to.
//
// However, `String` CAN be invalidated when pushing, extending or
// shrinking it, since all those operations reallocate on the heap.
//
// To prevent that, we HAVE to ensure the capacity will succeed without
// having to reallocate, and the only way to do that without invalidating
// any other alive `InternedStr` is to create a brand new `head` with
// enough capacity and push the old `head` to `full` to keep it alive
// throughout the lifetime of the whole `Interner`.
//
// `FixedString` encapsulates this by only allowing checked `push`es
// to the internal string, but we still have to ensure the memory
// of `head` is not deallocated until the whole `Interner` deallocates,
// which we can do by moving it inside the `Interner` itself, specifically
// on the `full` vector, where every other old `head` also lives.
let interned_str = unsafe {
self.head.push(string).unwrap_or_else(|| {
let new_cap =
(usize::max(self.head.capacity(), string.len()) + 1).next_power_of_two();
let new_head = FixedString::new(new_cap);
let old_head = std::mem::replace(&mut self.head, new_head);
// If the user creates an `Interner`
// with `Interner::with_capacity(BIG_NUMBER)` and
// the first interned string's length is bigger than `BIG_NUMBER`,
// `self.full.push(old_head)` would push a big, empty string of
// allocated size `BIG_NUMBER` into `full`.
// This prevents that case.
if !old_head.is_empty() {
self.full.push(old_head);
}
self.head.push_unchecked(string)
})
};
// SAFETY: We are obtaining a pointer to the internal memory of
// `head`, which is alive through the whole life of `Interner`, so
// this is safe.
unsafe { self.generate_symbol(interned_str) }
} }
/// Interns the given `'static` string. /// Interns the given `'static` string.
@ -142,32 +222,32 @@ impl Interner {
/// ///
/// # Note /// # Note
/// ///
/// This is more efficient than [`StringInterner::get_or_intern`] since it might /// This is more efficient than [`Interner::get_or_intern`], since it
/// avoid some memory allocations if the backends supports this. /// avoids storing `string` inside the [`Interner`].
/// ///
/// # Panics /// # Panics
/// ///
/// If the interner already interns the maximum number of strings possible /// If the interner already interns the maximum number of strings possible
/// by the chosen symbol type. /// by the chosen symbol type.
pub fn get_or_intern_static(&mut self, string: &'static str) -> Sym { pub fn get_or_intern_static(&mut self, string: &'static str) -> Sym {
Self::get_static(string).unwrap_or_else(|| self.inner.get_or_intern_static(string)) self.get(string).unwrap_or_else(|| {
} // SAFETY: a static `str` is always alive, so its pointer
// should therefore always be valid.
/// Shrink backend capacity to fit the interned strings exactly. unsafe { self.generate_symbol(InternedStr::new(string.into())) }
#[inline] })
pub fn shrink_to_fit(&mut self) {
self.inner.shrink_to_fit();
} }
/// Returns the string for the given symbol if any. /// Returns the string for the given symbol if any.
#[inline] #[inline]
pub fn resolve(&self, symbol: Sym) -> Option<&str> { pub fn resolve(&self, symbol: Sym) -> Option<&str> {
let index = symbol.as_raw().get(); let index = symbol.get() - 1;
if index <= Self::STATIC_STRINGS.len() {
Some(Self::STATIC_STRINGS[index - 1]) COMMON_STRINGS.index(index).copied().or_else(|| {
} else { self.spans.get(index - COMMON_STRINGS.len()).map(|ptr|
self.inner.resolve(symbol) // SAFETY: We always ensure the stored `InternedStr`s always
} // reference memory inside `head` and `full`
unsafe {ptr.as_str()})
})
} }
/// Returns the string for the given symbol. /// Returns the string for the given symbol.
@ -180,178 +260,33 @@ impl Interner {
self.resolve(symbol).expect("string disappeared") self.resolve(symbol).expect("string disappeared")
} }
/// Gets the symbol of the static string if one of them /// Gets the symbol of the common string if one of them
fn get_static(string: &str) -> Option<Sym> { fn get_common(string: &str) -> Option<Sym> {
Self::STATIC_STRINGS COMMON_STRINGS.get_index(string).map(|idx|
.into_iter() // SAFETY: `idx >= 0`, since it's an `usize`, and `idx + 1 > 0`.
.enumerate() // In this case, we don't need to worry about overflows
.find(|&(_i, s)| string == s) // because we have a static assertion in place checking that
.map(|(i, _str)| { // `COMMON_STRINGS.len() < usize::MAX`.
let raw = NonZeroUsize::new(i.wrapping_add(1)).expect("static array too big"); unsafe {
Sym::from_raw(raw) Sym::new_unchecked(idx + 1)
}) })
} }
}
impl<T> FromIterator<T> for Interner
where
T: AsRef<str>,
{
#[inline]
fn from_iter<I>(iter: I) -> Self
where
I: IntoIterator<Item = T>,
{
Self {
inner: StringInterner::from_iter(iter),
}
}
}
impl<T> Extend<T> for Interner
where
T: AsRef<str>,
{
#[inline]
fn extend<I>(&mut self, iter: I)
where
I: IntoIterator<Item = T>,
{
self.inner.extend(iter);
}
}
impl<'a> IntoIterator for &'a Interner {
type Item = (Sym, &'a str);
type IntoIter = <&'a Backend as IntoIterator>::IntoIter;
#[inline]
fn into_iter(self) -> Self::IntoIter {
self.inner.into_iter()
}
}
impl Default for Interner {
fn default() -> Self {
Self {
inner: StringInterner::new(),
}
}
}
/// The string symbol type for Boa.
///
/// This symbol type is internally a `NonZeroUsize`, which makes it pointer-width in size and it's
/// optimized so that it can occupy 1 pointer width even in an `Option` type.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Finalize)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "serde", serde(transparent))]
#[allow(clippy::unsafe_derive_deserialize)]
pub struct Sym {
value: NonZeroUsize,
}
impl Sym {
/// Padding for the symbol internal value.
const PADDING: usize = Interner::STATIC_STRINGS.len() + 1;
/// Symbol for the empty string (`""`).
pub const EMPTY_STRING: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(1)) };
/// Symbol for the `"arguments"` string.
pub const ARGUMENTS: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(2)) };
/// Symbol for the `"await"` string.
pub const AWAIT: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(3)) };
/// Symbol for the `"yield"` string.
pub const YIELD: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(4)) };
/// Symbol for the `"eval"` string.
pub const EVAL: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(5)) };
/// Symbol for the `"default"` string.
pub const DEFAULT: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(6)) };
/// Symbol for the `"null"` string.
pub const NULL: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(7)) };
/// Symbol for the `"RegExp"` string.
pub const REGEXP: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(8)) };
/// Symbol for the `"get"` string.
pub const GET: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(9)) };
/// Symbol for the `"set"` string.
pub const SET: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(10)) };
/// Symbol for the `"<main>"` string.
pub const MAIN: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(11)) };
/// Symbol for the `"raw"` string.
pub const RAW: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(12)) };
/// Symbol for the `"static"` string.
pub const STATIC: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(13)) };
/// Symbol for the `"prototype"` string.
pub const PROTOTYPE: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(14)) };
/// Symbol for the `"constructor"` string.
pub const CONSTRUCTOR: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(15)) };
/// Symbol for the `"implements"` string.
pub const IMPLEMENTS: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(16)) };
/// Symbol for the `"interface"` string.
pub const INTERFACE: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(17)) };
/// Symbol for the `"let"` string. /// Generates a new symbol for the provided [`str`] pointer.
pub const LET: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(18)) }; ///
/// # Safety
/// Symbol for the `"package"` string. ///
pub const PACKAGE: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(19)) }; /// The caller must ensure `string` points to a valid
/// memory inside `head` and that it won't be invalidated
/// Symbol for the `"private"` string. /// by allocations and deallocations.
pub const PRIVATE: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(20)) }; unsafe fn generate_symbol(&mut self, string: InternedStr) -> Sym {
let next = Sym::new(self.len() + 1).expect("cannot get interner symbol: integer overflow");
/// Symbol for the `"protected"` string. self.spans.push(string.clone());
pub const PROTECTED: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(21)) }; self.symbols.insert(string, next);
next
/// Symbol for the `"public"` string.
pub const PUBLIC: Self = unsafe { Self::from_raw(NonZeroUsize::new_unchecked(22)) };
/// Creates a `Sym` from a raw `NonZeroUsize`.
const fn from_raw(value: NonZeroUsize) -> Self {
Self { value }
}
/// Retrieves the raw `NonZeroUsize` for this symbol.
const fn as_raw(self) -> NonZeroUsize {
self.value
}
}
impl Symbol for Sym {
#[inline]
fn try_from_usize(index: usize) -> Option<Self> {
index
.checked_add(Self::PADDING)
.and_then(NonZeroUsize::new)
.map(|value| Self { value })
}
#[inline]
fn to_usize(self) -> usize {
self.value.get() - Self::PADDING
} }
} }
// Safe because `Sym` implements `Copy`.
unsafe impl Trace for Sym {
unsafe_empty_trace!();
}
/// Converts a given element to a string using an interner. /// Converts a given element to a string using an interner.
pub trait ToInternedString { pub trait ToInternedString {
/// Converts a given element to a string using an interner. /// Converts a given element to a string using an interner.
@ -366,33 +301,3 @@ where
self.to_string() self.to_string()
} }
} }
impl Interner {
/// List of commonly used static strings.
///
/// Make sure that any string added as a `Sym` constant is also added here.
const STATIC_STRINGS: [&'static str; 22] = [
"",
"arguments",
"await",
"yield",
"eval",
"default",
"null",
"RegExp",
"get",
"set",
"<main>",
"raw",
"static",
"prototype",
"constructor",
"implements",
"interface",
"let",
"package",
"private",
"protected",
"public",
];
}

148
boa_interner/src/sym.rs

@ -0,0 +1,148 @@
use std::num::NonZeroUsize;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
/// The string symbol type for Boa.
///
/// This symbol type is internally a `NonZeroUsize`, which makes it pointer-width in size and it's
/// optimized so that it can occupy 1 pointer width even in an `Option` type.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "serde", serde(transparent))]
#[allow(clippy::unsafe_derive_deserialize)]
pub struct Sym {
value: NonZeroUsize,
}
impl Sym {
/// Symbol for the empty string (`""`).
pub const EMPTY_STRING: Self = unsafe { Self::new_unchecked(1) };
/// Symbol for the `"arguments"` string.
pub const ARGUMENTS: Self = unsafe { Self::new_unchecked(2) };
/// Symbol for the `"await"` string.
pub const AWAIT: Self = unsafe { Self::new_unchecked(3) };
/// Symbol for the `"yield"` string.
pub const YIELD: Self = unsafe { Self::new_unchecked(4) };
/// Symbol for the `"eval"` string.
pub const EVAL: Self = unsafe { Self::new_unchecked(5) };
/// Symbol for the `"default"` string.
pub const DEFAULT: Self = unsafe { Self::new_unchecked(6) };
/// Symbol for the `"null"` string.
pub const NULL: Self = unsafe { Self::new_unchecked(7) };
/// Symbol for the `"RegExp"` string.
pub const REGEXP: Self = unsafe { Self::new_unchecked(8) };
/// Symbol for the `"get"` string.
pub const GET: Self = unsafe { Self::new_unchecked(9) };
/// Symbol for the `"set"` string.
pub const SET: Self = unsafe { Self::new_unchecked(10) };
/// Symbol for the `"<main>"` string.
pub const MAIN: Self = unsafe { Self::new_unchecked(11) };
/// Symbol for the `"raw"` string.
pub const RAW: Self = unsafe { Self::new_unchecked(12) };
/// Symbol for the `"static"` string.
pub const STATIC: Self = unsafe { Self::new_unchecked(13) };
/// Symbol for the `"prototype"` string.
pub const PROTOTYPE: Self = unsafe { Self::new_unchecked(14) };
/// Symbol for the `"constructor"` string.
pub const CONSTRUCTOR: Self = unsafe { Self::new_unchecked(15) };
/// Symbol for the `"implements"` string.
pub const IMPLEMENTS: Self = unsafe { Self::new_unchecked(16) };
/// Symbol for the `"interface"` string.
pub const INTERFACE: Self = unsafe { Self::new_unchecked(17) };
/// Symbol for the `"let"` string.
pub const LET: Self = unsafe { Self::new_unchecked(18) };
/// Symbol for the `"package"` string.
pub const PACKAGE: Self = unsafe { Self::new_unchecked(19) };
/// Symbol for the `"private"` string.
pub const PRIVATE: Self = unsafe { Self::new_unchecked(20) };
/// Symbol for the `"protected"` string.
pub const PROTECTED: Self = unsafe { Self::new_unchecked(21) };
/// Symbol for the `"public"` string.
pub const PUBLIC: Self = unsafe { Self::new_unchecked(22) };
/// Creates a new [`Sym`] from the provided `value`, or returns `None` if `index` is zero.
#[inline]
pub(super) fn new(value: usize) -> Option<Self> {
NonZeroUsize::new(value).map(|value| Self { value })
}
/// Creates a new [`Sym`] from the provided `value`, without checking if `value` is not zero
///
/// # Safety
///
/// `value` must not be zero.
#[inline]
pub(super) const unsafe fn new_unchecked(value: usize) -> Self {
Self {
value:
// SAFETY: The caller must ensure the invariants of the function.
unsafe {
NonZeroUsize::new_unchecked(value)
},
}
}
/// Returns the internal value of the [`Sym`]
#[inline]
pub(super) const fn get(self) -> usize {
self.value.get()
}
}
/// Ordered set of commonly used static strings.
///
/// # Note
///
/// `COMMON_STRINGS` and the constants defined in [`Sym`] must always
/// be in sync.
pub(super) static COMMON_STRINGS: phf::OrderedSet<&'static str> = {
const COMMON_STRINGS: phf::OrderedSet<&'static str> = phf::phf_ordered_set! {
"",
"arguments",
"await",
"yield",
"eval",
"default",
"null",
"RegExp",
"get",
"set",
"<main>",
"raw",
"static",
"prototype",
"constructor",
"implements",
"interface",
"let",
"package",
"private",
"protected",
"public",
};
// A `COMMON_STRINGS` of size `usize::MAX` would cause an overflow on our `Interner`
sa::const_assert!(COMMON_STRINGS.len() < usize::MAX);
COMMON_STRINGS
};

40
boa_interner/src/tests.rs

@ -1,53 +1,24 @@
use crate::{Interner, Sym}; use crate::{Interner, Sym, COMMON_STRINGS};
use std::num::NonZeroUsize;
#[track_caller] #[track_caller]
fn sym_from_usize(index: usize) -> Sym { fn sym_from_usize(index: usize) -> Sym {
Sym::from_raw(NonZeroUsize::new(index).expect("Invalid NonZeroUsize")) Sym::new(index).expect("Invalid NonZeroUsize")
} }
#[test] #[test]
fn check_static_strings() { fn check_static_strings() {
let mut interner = Interner::default(); let mut interner = Interner::default();
for (i, str) in Interner::STATIC_STRINGS.into_iter().enumerate() { for (i, str) in COMMON_STRINGS.into_iter().enumerate() {
assert_eq!(interner.get_or_intern(str), sym_from_usize(i + 1)); assert_eq!(interner.get_or_intern(str), sym_from_usize(i + 1));
} }
} }
#[test]
fn check_constants() {
assert_eq!(Sym::EMPTY_STRING, sym_from_usize(1));
assert_eq!(Sym::ARGUMENTS, sym_from_usize(2));
assert_eq!(Sym::AWAIT, sym_from_usize(3));
assert_eq!(Sym::YIELD, sym_from_usize(4));
assert_eq!(Sym::EVAL, sym_from_usize(5));
assert_eq!(Sym::DEFAULT, sym_from_usize(6));
assert_eq!(Sym::NULL, sym_from_usize(7));
assert_eq!(Sym::REGEXP, sym_from_usize(8));
assert_eq!(Sym::GET, sym_from_usize(9));
assert_eq!(Sym::SET, sym_from_usize(10));
assert_eq!(Sym::MAIN, sym_from_usize(11));
assert_eq!(Sym::RAW, sym_from_usize(12));
assert_eq!(Sym::STATIC, sym_from_usize(13));
assert_eq!(Sym::PROTOTYPE, sym_from_usize(14));
assert_eq!(Sym::CONSTRUCTOR, sym_from_usize(15));
assert_eq!(Sym::IMPLEMENTS, sym_from_usize(16));
assert_eq!(Sym::INTERFACE, sym_from_usize(17));
assert_eq!(Sym::LET, sym_from_usize(18));
assert_eq!(Sym::PACKAGE, sym_from_usize(19));
assert_eq!(Sym::PRIVATE, sym_from_usize(20));
assert_eq!(Sym::PROTECTED, sym_from_usize(21));
assert_eq!(Sym::PUBLIC, sym_from_usize(22));
}
#[test] #[test]
fn check_new_string() { fn check_new_string() {
let mut interner = Interner::default(); let mut interner = Interner::default();
assert!( assert!(interner.get_or_intern("my test string").get() > COMMON_STRINGS.len());
interner.get_or_intern("my test string").as_raw().get() > Interner::STATIC_STRINGS.len()
);
} }
#[test] #[test]
@ -71,8 +42,9 @@ fn check_resolve() {
fn check_static_resolve() { fn check_static_resolve() {
let mut interner = Interner::default(); let mut interner = Interner::default();
for string in Interner::STATIC_STRINGS for string in COMMON_STRINGS
.into_iter() .into_iter()
.copied()
.chain(["my test str", "hello world", ";"].into_iter()) .chain(["my test str", "hello world", ";"].into_iter())
{ {
let sym = interner.get_or_intern_static(string); let sym = interner.get_or_intern_static(string);

Loading…
Cancel
Save