Browse Source

Add string builder to build `JsString` (#3915)

* feat: add `JsStringBuilder` and test

* chore: fix calculation on capacity and add `clone` impl

* chore: some misc fix

* fix: wrong capacity calculation in `extend`

* chore: prevent `reserve` except for the shrink in `build`

* chore: fix misc

* perf: use `realloc` for allocation

* chore: fix lint

* fix: wrong ascii validation

* fix: wrong allocated data bytes calculation

* fix: wrong capacity calcultion on `with_capacity`

* fix: clippy fix

* chore: add public `reserve`

* chore: comments and renaming

* chore: update misc

* chore: moved to module `builder` and implement `AddAssign` for `Builder`

* chore: add zero case for `with_capacity`

* chore: mark public methods `inline`

* chore: extract allocation check into `allocate_if_needed`

* chore: fix lint

* chore: expose `JsStringData`

* feat: add common string builder and export 1 byte and 2 bytes string builder

* chore: add missed trait

* chore: fix lint

* chore: fix doc

* chore: add `reserve_exact`

* chore: typos

* chore: fix argument

* chore: fix doc

* chore: mark `current_layout` unsafe

* chore: fix lint

* chore: remove `JsStringData` and rename builders

* chore: add build methods to typed builders

* chore: add more build methods to `CommonJsStringBuilder`

* chore: rename `latin1` check to `ascii` check

* chore: refine docs

* chore: update tests

* chore: limit the generic type `D` of `JsStringBuilder` to Copy

* chore: move `entend` method under `Extend` trait

* chore: should validate `Latin1` segement in `build_from_latin1`

* chore: add `Add` trait implementation to builders

* chore: refines docs and add `inline` to trait methods

* chore: adds `clone_from` and related tests

* chore: adds `as_mut_slice` to typed builders
main
CrazyboyQCD 14 hours ago committed by GitHub
parent
commit
8f1d8d473a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 915
      core/string/src/builder.rs
  2. 2
      core/string/src/lib.rs
  3. 221
      core/string/src/tests.rs

915
core/string/src/builder.rs

@ -0,0 +1,915 @@
use crate::{
alloc_overflow, tagged::Tagged, JsStr, JsStrVariant, JsString, RawJsString, RefCount,
TaggedLen, DATA_OFFSET,
};
use std::{
alloc::{alloc, dealloc, realloc, Layout},
cell::Cell,
marker::PhantomData,
mem::ManuallyDrop,
ops::{Add, AddAssign},
ptr::{self, addr_of_mut, NonNull},
str::{self},
};
/// A mutable builder to create instance of `JsString`.
///
#[derive(Debug)]
pub struct JsStringBuilder<D: Copy> {
cap: usize,
len: usize,
inner: NonNull<RawJsString>,
phantom_data: PhantomData<D>,
}
impl<D: Copy> Default for JsStringBuilder<D> {
fn default() -> Self {
Self::new()
}
}
impl<D: Copy> JsStringBuilder<D> {
const DATA_SIZE: usize = size_of::<D>();
const MIN_NON_ZERO_CAP: usize = 8 / Self::DATA_SIZE;
/// Create a new `JsStringBuilder` with capacity of zero.
#[inline]
#[must_use]
pub const fn new() -> Self {
Self {
cap: 0,
len: 0,
inner: NonNull::dangling(),
phantom_data: PhantomData,
}
}
/// Returns the number of elements that inner `RawJsString` holds.
#[inline]
#[must_use]
pub const fn len(&self) -> usize {
self.len
}
/// Forces the length of the [`JsStringBuilder`] to `new_len`.
///
/// # Safety
///
/// - `new_len` must be less than or equal to `capacity()`.
/// - The elements at `old_len..new_len` must be initialized.
///
#[inline]
pub unsafe fn set_len(&mut self, new_len: usize) {
debug_assert!(new_len <= self.capacity());
self.len = new_len;
}
/// Returns the total number of elements can hold without reallocating
#[inline]
#[must_use]
pub const fn capacity(&self) -> usize {
self.cap
}
/// Returns the allocated byte of inner `RawJsString`'s data.
#[must_use]
const fn allocated_data_byte_len(&self) -> usize {
self.len() * Self::DATA_SIZE
}
/// Returns the capacity calculated from given layout.
#[must_use]
const fn capacity_from_layout(layout: Layout) -> usize {
(layout.size() - DATA_OFFSET) / Self::DATA_SIZE
}
/// Create a new `JsStringBuilder` with specific capacity
#[inline]
#[must_use]
pub fn with_capacity(cap: usize) -> Self {
if cap == 0 {
return Self::new();
}
let layout = Self::new_layout(cap);
#[allow(clippy::cast_ptr_alignment)]
// SAFETY:
// The layout size of `RawJsString` is never zero, since it has to store
// the length of the string and the reference count.
let ptr = unsafe { alloc(layout) };
let Some(ptr) = NonNull::new(ptr.cast()) else {
std::alloc::handle_alloc_error(layout)
};
Self {
cap: Self::capacity_from_layout(layout),
len: 0,
inner: ptr,
phantom_data: PhantomData,
}
}
/// Checks if the inner `RawJsString` is allocated.
#[must_use]
fn is_allocated(&self) -> bool {
self.inner != NonNull::dangling()
}
/// Returns the inner `RawJsString`'s layout.
///
/// # Safety
///
/// Caller should ensure that the inner is allocated.
#[must_use]
unsafe fn current_layout(&self) -> Layout {
// SAFETY:
// Caller should ensure that the inner is allocated.
unsafe {
Layout::for_value(self.inner.as_ref())
.extend(Layout::array::<D>(self.capacity()).unwrap_unchecked())
.unwrap_unchecked()
.0
.pad_to_align()
}
}
/// Returns the pointer of `data` of inner.
///
/// # Safety
///
/// Caller should ensure that the inner is allocated.
#[must_use]
unsafe fn data(&self) -> *mut D {
// SAFETY:
// Caller should ensure that the inner is allocated.
unsafe { addr_of_mut!((*self.inner.as_ptr()).data).cast() }
}
/// Allocates when there is not sufficient capacity.
#[allow(clippy::inline_always)]
#[inline(always)]
fn allocate_if_needed(&mut self, reuired_cap: usize) {
if reuired_cap > self.capacity() {
self.allocate(reuired_cap);
}
}
/// Inner logic of `allocate`.
///
/// Use `realloc` here because it has a better performance than using combination of `alloc`, `copy` and `dealloc`.
#[allow(clippy::cast_ptr_alignment)]
fn allocate_inner(&mut self, new_layout: Layout) {
let new_ptr = if self.is_allocated() {
let old_ptr = self.inner.as_ptr();
// SAFETY:
// Allocation check has been made above.
let old_layout = unsafe { self.current_layout() };
// SAFETY:
// Valid pointer is required by `realloc` and pointer is checked above to be valid.
// The layout size of `RawJsString` is never zero, since it has to store
// the length of the string and the reference count.
unsafe { realloc(old_ptr.cast(), old_layout, new_layout.size()) }
} else {
// SAFETY:
// The layout size of `RawJsString` is never zero, since it has to store
// the length of the string and the reference count.
unsafe { alloc(new_layout) }
};
let Some(new_ptr) = NonNull::new(new_ptr.cast::<RawJsString>()) else {
std::alloc::handle_alloc_error(new_layout)
};
self.inner = new_ptr;
self.cap = Self::capacity_from_layout(new_layout);
}
/// Appends an element to the inner `RawJsString` of `JsStringBuilder`.
#[inline]
pub fn push(&mut self, v: D) {
let required_cap = self.len() + 1;
self.allocate_if_needed(required_cap);
// SAFETY:
// Capacity has been expanded to be large enough to hold elements.
unsafe {
self.push_unchecked(v);
}
}
/// Pushes elements from slice to `JsStringBuilder` without doing capacity check.
///
/// Unlike the standard vector, our holded element types are only `u8` and `u16`, which is [`Copy`] derived,
///
/// so we only need to copy them instead of cloning.
///
/// # Safety
///
/// Caller should ensure the capacity is large enough to hold elements.
#[inline]
pub unsafe fn extend_from_slice_unchecked(&mut self, v: &[D]) {
// SAFETY: Caller should ensure the capacity is large enough to hold elements.
unsafe {
ptr::copy_nonoverlapping(v.as_ptr(), self.data().add(self.len()), v.len());
}
self.len += v.len();
}
/// Pushes elements from slice to `JsStringBuilder`.
#[inline]
pub fn extend_from_slice(&mut self, v: &[D]) {
let required_cap = self.len() + v.len();
self.allocate_if_needed(required_cap);
// SAFETY:
// Capacity has been expanded to be large enough to hold elements.
unsafe {
self.extend_from_slice_unchecked(v);
}
}
fn new_layout(cap: usize) -> Layout {
let new_layout = Layout::array::<D>(cap)
.and_then(|arr| Layout::new::<RawJsString>().extend(arr))
.map(|(layout, offset)| (layout.pad_to_align(), offset))
.map_err(|_| None);
match new_layout {
Ok((new_layout, offset)) => {
debug_assert_eq!(offset, DATA_OFFSET);
new_layout
}
Err(None) => alloc_overflow(),
Err(Some(layout)) => std::alloc::handle_alloc_error(layout),
}
}
/// Similar to [`Vec::reserve`]
///
/// Reserves capacity for at least `additional` more elements to be inserted
/// in the given `JsStringBuilder<D>`. The collection may reserve more space to
/// speculatively avoid frequent reallocations. After calling `reserve`,
/// capacity will be greater than or equal to `self.len() + additional`.
/// Does nothing if capacity is already sufficient.
#[inline]
pub fn reserve(&mut self, additional: usize) {
if additional > self.capacity().wrapping_sub(self.len) {
let Some(cap) = self.len().checked_add(additional) else {
alloc_overflow()
};
self.allocate(cap);
}
}
/// Similar to [`Vec::reserve_exact`]
///
/// Reserves the minimum capacity for at least `additional` more elements to
/// be inserted in the given `JsStringBuilder<D>`. Unlike [`reserve`], this will not
/// deliberately over-allocate to speculatively avoid frequent allocations.
/// After calling `reserve_exact`, capacity will be greater than or equal to
/// `self.len() + additional`. Does nothing if the capacity is already
/// sufficient.
///
/// Note that the allocator may give the collection more space than it
/// requests. Therefore, capacity can not be relied upon to be precisely
/// minimal. Prefer [`reserve`] if future insertions are expected.
///
/// [`reserve`]: JsStringBuilder::reserve
#[inline]
pub fn reserve_exact(&mut self, additional: usize) {
if additional > self.capacity().wrapping_sub(self.len) {
let Some(cap) = self.len().checked_add(additional) else {
alloc_overflow()
};
self.allocate_inner(Self::new_layout(cap));
}
}
/// Allocates memory to the inner `RawJsString` by the given capacity.
/// Capacity calculation is from [`std::vec::Vec::reserve`].
fn allocate(&mut self, cap: usize) {
let cap = std::cmp::max(self.capacity() * 2, cap);
let cap = std::cmp::max(Self::MIN_NON_ZERO_CAP, cap);
self.allocate_inner(Self::new_layout(cap));
}
/// Appends an element to the inner `RawJsString` of `JsStringBuilder` without doing bounds check.
/// # Safety
///
/// Caller should ensure the capacity is large enough to hold elements.
#[inline]
pub unsafe fn push_unchecked(&mut self, v: D) {
// SAFETY: Caller should ensure the capacity is large enough to hold elements.
unsafe {
self.data().add(self.len()).write(v);
self.len += 1;
}
}
/// Returns true if this `JsStringBuilder` has a length of zero, and false otherwise.
#[inline]
#[must_use]
pub fn is_empty(&self) -> bool {
self.len() == 0
}
/// Checks if all bytes in inner `RawJsString`'s data are ascii.
#[inline]
#[must_use]
pub fn is_ascii(&self) -> bool {
// SAFETY:
// `NonNull` verified for us that the pointer returned by `alloc` is valid,
// meaning we can read to its pointed memory.
let data = unsafe {
std::slice::from_raw_parts(self.data().cast::<u8>(), self.allocated_data_byte_len())
};
data.is_ascii()
}
/// Extracts a slice containing the elements in the inner `RawJsString`.
#[inline]
#[must_use]
pub fn as_slice(&self) -> &[D] {
if self.is_allocated() {
// SAFETY:
// The inner `RawJsString` is allocated which means it is not null.
unsafe { std::slice::from_raw_parts(self.data(), self.len()) }
} else {
&[]
}
}
/// Extracts a mutable slice containing the elements in the inner `RawJsString`.
///
/// # Safety
/// The caller must ensure that the content of the slice is valid encoding before the borrow ends.
/// Use of a builder whose contents are not valid encoding is undefined behavior.
#[inline]
#[must_use]
pub unsafe fn as_mut_slice(&mut self) -> &mut [D] {
if self.is_allocated() {
// SAFETY:
// The inner `RawJsString` is allocated which means it is not null.
unsafe { std::slice::from_raw_parts_mut(self.data(), self.len()) }
} else {
&mut []
}
}
/// Builds `JsString` from `JsStringBuilder`
#[inline]
#[must_use]
fn build_inner(mut self, latin1: bool) -> JsString {
if self.is_empty() {
return JsString::default();
}
let len = self.len();
// Shrink to fit the length.
if len != self.capacity() {
let layout = Self::new_layout(self.len());
self.allocate_inner(layout);
}
let inner = self.inner;
// SAFETY:
// `NonNull` verified for us that the pointer returned by `alloc` is valid,
// meaning we can write to its pointed memory.
unsafe {
inner.as_ptr().write(RawJsString {
tagged_len: TaggedLen::new(len, latin1),
refcount: RefCount {
read_write: ManuallyDrop::new(Cell::new(1)),
},
data: [0; 0],
});
}
// Tell the compiler not to call the destructor of `JsStringBuilder`,
// becuase we move inner `RawJsString` to `JsString`.
std::mem::forget(self);
JsString {
ptr: Tagged::from_non_null(inner),
}
}
}
impl<D: Copy> Drop for JsStringBuilder<D> {
/// Set cold since [`JsStringBuilder`] should be created to build `JsString`
#[cold]
#[inline]
fn drop(&mut self) {
if self.is_allocated() {
// SAFETY:
// Allocation check has been made above.
let layout = unsafe { self.current_layout() };
// SAFETY:
// layout: All the checks for the validity of the layout have already been made on `allocate_inner`.
// `NonNull` verified for us that the pointer returned by `alloc` is valid,
// meaning we can free its pointed memory.
unsafe {
dealloc(self.inner.as_ptr().cast(), layout);
}
}
}
}
impl<D: Copy> AddAssign<&JsStringBuilder<D>> for JsStringBuilder<D> {
#[inline]
fn add_assign(&mut self, rhs: &JsStringBuilder<D>) {
self.extend_from_slice(rhs.as_slice());
}
}
impl<D: Copy> AddAssign<&[D]> for JsStringBuilder<D> {
#[inline]
fn add_assign(&mut self, rhs: &[D]) {
self.extend_from_slice(rhs);
}
}
impl<D: Copy> Add<&JsStringBuilder<D>> for JsStringBuilder<D> {
type Output = Self;
#[inline]
#[must_use]
fn add(mut self, rhs: &JsStringBuilder<D>) -> Self::Output {
self.extend_from_slice(rhs.as_slice());
self
}
}
impl<D: Copy> Add<&[D]> for JsStringBuilder<D> {
type Output = Self;
#[inline]
#[must_use]
fn add(mut self, rhs: &[D]) -> Self::Output {
self.extend_from_slice(rhs);
self
}
}
impl<D: Copy> Extend<D> for JsStringBuilder<D> {
#[inline]
fn extend<I: IntoIterator<Item = D>>(&mut self, iter: I) {
let iterator = iter.into_iter();
let (lower_bound, _) = iterator.size_hint();
let require_cap = self.len() + lower_bound;
self.allocate_if_needed(require_cap);
iterator.for_each(|c| self.push(c));
}
}
impl<D: Copy> FromIterator<D> for JsStringBuilder<D> {
#[inline]
fn from_iter<T: IntoIterator<Item = D>>(iter: T) -> Self {
let mut builder = Self::new();
builder.extend(iter);
builder
}
}
impl<D: Copy> From<&[D]> for JsStringBuilder<D> {
#[inline]
#[must_use]
fn from(value: &[D]) -> Self {
let mut builder = Self::with_capacity(value.len());
// SAFETY: The capacity is large enough to hold elements.
unsafe { builder.extend_from_slice_unchecked(value) };
builder
}
}
impl<D: Copy + Eq + PartialEq> PartialEq for JsStringBuilder<D> {
#[inline]
#[must_use]
fn eq(&self, other: &Self) -> bool {
self.as_slice().eq(other.as_slice())
}
}
impl<D: Copy> Clone for JsStringBuilder<D> {
#[inline]
#[must_use]
fn clone(&self) -> Self {
if self.is_allocated() {
let mut builder = Self::with_capacity(self.capacity());
// SAFETY: The capacity is large enough to hold elements.
unsafe { builder.extend_from_slice_unchecked(self.as_slice()) };
builder
} else {
Self::new()
}
}
/// Performs copy-assignment from `source`.
///
/// Rewritten to avoid unnecessary allocation.
#[inline]
fn clone_from(&mut self, source: &Self) {
let source_len = source.len();
if source_len > self.capacity() {
self.allocate(source_len);
} else {
// At this point, inner `RawJsString` of self or source can be not allocated,
// returns earlier to avoid copying from/to `null`.
if source_len == 0 {
// SAFETY: 0 is always less or equal to self's capacity.
unsafe { self.set_len(0) };
return;
}
}
// SAFETY: self shoud be allocated after allocation.
let self_data = unsafe { self.data() };
// SAFETY: source_len is greter than 0 so source shoud be allocated.
let source_data = unsafe { source.data() };
// SAFETY: Borrow checker should not allow this to be overlapped and pointers are valid.
unsafe { ptr::copy_nonoverlapping(source_data, self_data, source_len) };
// SAFETY: source_len has checked to be less or equal to self's capacity.
unsafe { self.set_len(source_len) };
}
}
/// **`Latin1`** encoded `JsStringBuilder`
/// # Warning
/// If you are not sure the characters that will be added and don't want to preprocess them,
/// use [`CommonJsStringBuilder`] instead.
/// ## Examples
///
/// ```rust
/// use boa_string::Latin1JsStringBuilder;
/// let mut s = Latin1JsStringBuilder::new();
/// s.push(b'x');
/// s.extend_from_slice(&[b'1', b'2', b'3']);
/// s.extend([b'1', b'2', b'3']);
/// let js_string = s.build();
/// ```
pub type Latin1JsStringBuilder = JsStringBuilder<u8>;
impl Latin1JsStringBuilder {
/// Builds a `JsString` if the current instance is strictly `ASCII`.
///
/// When the string contains characters outside the `ASCII` range, it cannot be determined
/// whether the encoding is `Latin1` or others. Therefore, this method only returns a
/// valid `JsString` when the instance is entirely `ASCII`. If any non-`ASCII` characters
/// are present, it returns `None` to avoid ambiguity in encoding.
///
/// If the caller is certain that the string is encoded in `Latin1`,
/// [`build_as_latin1`](Self::build_as_latin1) can be used to avoid the `ASCII` check.
#[inline]
#[must_use]
pub fn build(self) -> Option<JsString> {
if self.is_ascii() {
Some(self.build_inner(true))
} else {
None
}
}
/// Builds `JsString` from `Latin1JsStringBuilder`, assume that the inner data is `Latin1` encoded
///
/// # Safety
/// Caller must ensure that the string is encoded in `Latin1`.
///
/// If the string contains characters outside the `Latin1` range, it may lead to encoding errors,
/// resulting in an incorrect or malformed `JsString`. This could cause undefined behavior
/// when the resulting string is used in further operations or when interfacing with other
/// parts of the system that expect valid `Latin1` encoded string.
#[inline]
#[must_use]
pub unsafe fn build_as_latin1(self) -> JsString {
self.build_inner(true)
}
}
/// **`UTF-16`** encoded `JsStringBuilder`
/// ## Examples
///
/// ```rust
/// use boa_string::Utf16JsStringBuilder;
/// let mut s = Utf16JsStringBuilder::new();
/// s.push(b'x' as u16);
/// s.extend_from_slice(&[b'1', b'2', b'3'].map(u16::from));
/// s.extend([0xD83C, 0xDFB9, 0xD83C, 0xDFB6, 0xD83C, 0xDFB5,]); // 🎹🎶🎵
/// let js_string = s.build();
/// ```
pub type Utf16JsStringBuilder = JsStringBuilder<u16>;
impl Utf16JsStringBuilder {
/// Builds `JsString` from `Utf16JsStringBuilder`
#[inline]
#[must_use]
pub fn build(self) -> JsString {
self.build_inner(false)
}
}
/// Represents a segment of a string used to construct a [`JsString`].
#[derive(Clone, Debug)]
pub enum Segment<'a> {
/// A string segment represented as a `JsString`.
String(JsString),
/// A string segment represented as a `JsStr`.
Str(JsStr<'a>),
/// A string segment represented as a byte.
Latin1(u8),
/// A Unicode code point segment represented as a character.
CodePoint(char),
}
impl Segment<'_> {
/// Checks if the segment consists solely of `ASCII` characters.
#[inline]
#[must_use]
fn is_ascii(&self) -> bool {
match self {
Segment::String(s) => s.as_str().is_latin1(),
Segment::Str(s) => s.is_latin1(),
Segment::Latin1(b) => *b <= 0x7f,
Segment::CodePoint(ch) => *ch as u32 <= 0x7F,
}
}
}
impl From<JsString> for Segment<'_> {
#[inline]
fn from(value: JsString) -> Self {
Self::String(value)
}
}
impl From<String> for Segment<'_> {
#[inline]
fn from(value: String) -> Self {
Self::String(value.into())
}
}
impl From<&[u16]> for Segment<'_> {
#[inline]
fn from(value: &[u16]) -> Self {
Self::String(value.into())
}
}
impl From<&str> for Segment<'_> {
#[inline]
fn from(value: &str) -> Self {
Self::String(value.into())
}
}
impl<'seg, 'ref_str: 'seg> From<JsStr<'ref_str>> for Segment<'seg> {
#[inline]
fn from(value: JsStr<'ref_str>) -> Self {
Self::Str(value)
}
}
impl From<u8> for Segment<'_> {
#[inline]
fn from(value: u8) -> Self {
Self::Latin1(value)
}
}
impl From<char> for Segment<'_> {
#[inline]
fn from(value: char) -> Self {
Self::CodePoint(value)
}
}
/// Common `JsString` builder that accepts multiple variant of string or character.
///
/// Originally based on [kiesel-js](https://codeberg.org/kiesel-js/kiesel/src/branch/main/src/types/language/String/Builder.zig)
#[derive(Clone, Debug, Default)]
pub struct CommonJsStringBuilder<'a> {
segments: Vec<Segment<'a>>,
}
impl<'seg, 'ref_str: 'seg> CommonJsStringBuilder<'seg> {
/// Creates a new `CommonJsStringBuilder` with capacity of zero.
#[inline]
#[must_use]
pub const fn new() -> Self {
Self {
segments: Vec::new(),
}
}
/// Similar to `Vec::with_capacity`.
///
/// Creates a new `CommonJsStringBuilder` with given capacity.
#[inline]
#[must_use]
pub fn with_capacity(capacity: usize) -> Self {
Self {
segments: Vec::with_capacity(capacity),
}
}
/// Similar to `Vec::reserve`.
///
/// Reserves additional capacity for the inner vector.
#[inline]
pub fn reserve(&mut self, additional: usize) {
self.segments.reserve(additional);
}
/// Similar to `Vec::reserve_exact`.
///
/// Reserves the minimum capacity for the inner vector.
#[inline]
pub fn reserve_exact(&mut self, additional: usize) {
self.segments.reserve_exact(additional);
}
/// Appends string segments to the back of the inner vector.
#[inline]
pub fn push<T: Into<Segment<'ref_str>>>(&mut self, seg: T) {
self.segments.push(seg.into());
}
/// Checks if all string segments contains only `ASCII` bytes.
#[inline]
#[must_use]
pub fn is_ascii(&self) -> bool {
self.segments.iter().all(Segment::is_ascii)
}
/// Returns the number of string segment in inner vector.
#[inline]
#[must_use]
pub fn len(&self) -> usize {
self.segments.len()
}
/// Returns true if this `CommonJsStringBuilder` has a length of zero, and false otherwise.
#[inline]
#[must_use]
pub fn is_empty(&self) -> bool {
self.len() == 0
}
/// Builds `Latin1` encoded `JsString` from string segments.
///
/// This doesn't consume the builder itself because it may fails to build
/// and the caller may wants to keep the builder for further operations.
///
/// This processes the following types of segments:
///
/// - `Segment::String(s)`: Encodes the string if it can be represented in `Latin1`.
/// - `Segment::Str(s)`: Encodes the string slice if it can be represented in `Latin1`.
/// - `Segment::Latin1(b)`: Encodes the byte if it's within the `ASCII` range.
/// - `Segment::CodePoint(ch)`: Encodes the code point by converting it to a byte if it's within the `ASCII` range.
///
/// Return `None` if any segment fails to encode.
#[inline]
#[must_use]
#[allow(clippy::cast_lossless)]
pub fn build_from_latin1(&self) -> Option<JsString> {
let mut builder = Latin1JsStringBuilder::new();
for seg in &self.segments {
match seg {
Segment::String(s) => {
if let Some(data) = s.as_str().as_latin1() {
builder.extend_from_slice(data);
} else {
return None;
}
}
Segment::Str(s) => {
if let Some(data) = s.as_latin1() {
builder.extend_from_slice(data);
} else {
return None;
}
}
Segment::Latin1(b) => {
if *b <= 0x7f {
builder.push(*b);
} else {
return None;
}
}
Segment::CodePoint(ch) => {
if let Ok(b) = u8::try_from(*ch as u32) {
builder.push(b);
} else {
return None;
}
}
}
}
builder.build()
}
/// Builds `Utf-16` encoded `JsString` from string segments.
#[inline]
#[must_use]
#[allow(clippy::cast_possible_truncation)]
pub fn build_from_utf16(self) -> JsString {
let mut builder = Utf16JsStringBuilder::new();
for seg in self.segments {
match seg {
Segment::String(s) => {
let js_str = s.as_str();
match js_str.variant() {
JsStrVariant::Latin1(s) => builder.extend(s.iter().copied().map(u16::from)),
JsStrVariant::Utf16(s) => builder.extend_from_slice(s),
}
}
Segment::Str(s) => match s.variant() {
JsStrVariant::Latin1(s) => builder.extend(s.iter().copied().map(u16::from)),
JsStrVariant::Utf16(s) => builder.extend_from_slice(s),
},
Segment::Latin1(latin1) => builder.push(u16::from(latin1)),
Segment::CodePoint(code_point) => {
builder.extend_from_slice(code_point.encode_utf16(&mut [0_u16; 2]));
}
}
}
builder.build()
}
/// Builds `JsString` from `CommonJsStringBuilder`,
///
/// This function first checks if the instance is empty:
/// - If it is empty, it returns the default `JsString`.
/// - If it contains only ASCII characters, it safely encodes it as `Latin1`.
/// - If it contains non-ASCII characters, it falls back to encoding using `UTF-16`.
#[inline]
#[must_use]
pub fn build(self) -> JsString {
if self.is_empty() {
JsString::default()
} else if self.is_ascii() {
// SAFETY:
// All string segment contains only ascii byte, so this can be encoded as `Latin1`.
unsafe { self.build_as_latin1() }
} else {
self.build_from_utf16()
}
}
/// Builds `Latin1` encoded `JsString` from `CommonJsStringBuilder`, return `None` if segments can't be encoded as `Latin1`
///
/// # Safety
/// Caller must ensure that the string segments can be `Latin1` encoded.
///
/// If string segments can't be `Latin1` encoded, it may lead to encoding errors,
/// resulting in an incorrect or malformed `JsString`. This could cause undefined behavior
/// when the resulting string is used in further operations or when interfacing with other
/// parts of the system that expect valid `Latin1` encoded string.
#[inline]
#[must_use]
pub unsafe fn build_as_latin1(self) -> JsString {
let mut builder = Latin1JsStringBuilder::new();
for seg in self.segments {
match seg {
Segment::String(s) => {
let js_str = s.as_str();
let Some(s) = js_str.as_latin1() else {
unreachable!("string segment shoud be latin1")
};
builder.extend_from_slice(s);
}
Segment::Str(s) => {
let Some(s) = s.as_latin1() else {
unreachable!("string segment shoud be latin1")
};
builder.extend_from_slice(s);
}
Segment::Latin1(latin1) => builder.push(latin1),
Segment::CodePoint(code_point) => builder.push(code_point as u8),
}
}
// SAFETY: All string segments can be encoded as `Latin1` string.
unsafe { builder.build_as_latin1() }
}
}
impl<'ref_str, T: Into<Segment<'ref_str>>> AddAssign<T> for CommonJsStringBuilder<'ref_str> {
#[inline]
fn add_assign(&mut self, rhs: T) {
self.push(rhs);
}
}
impl<'ref_str, T: Into<Segment<'ref_str>>> Add<T> for CommonJsStringBuilder<'ref_str> {
type Output = Self;
#[inline]
#[must_use]
fn add(mut self, rhs: T) -> Self::Output {
self.push(rhs);
self
}
}

2
core/string/src/lib.rs

@ -16,6 +16,7 @@
#![allow(unstable_name_collisions)]
#![allow(clippy::module_name_repetitions)]
mod builder;
mod common;
mod display;
mod iter;
@ -30,6 +31,7 @@ use crate::display::{JsStrDisplayEscaped, JsStrDisplayLossy};
use crate::tagged::{Tagged, UnwrappedTagged};
#[doc(inline)]
pub use crate::{
builder::{CommonJsStringBuilder, Latin1JsStringBuilder, Utf16JsStringBuilder},
common::StaticJsStrings,
iter::Iter,
str::{JsStr, JsStrVariant},

221
core/string/src/tests.rs

@ -2,7 +2,10 @@
use std::hash::{BuildHasher, BuildHasherDefault, Hash};
use crate::{JsStr, JsString, StaticJsString, StaticJsStrings, ToStringEscaped};
use crate::{
CommonJsStringBuilder, JsStr, JsString, Latin1JsStringBuilder, StaticJsString, StaticJsStrings,
ToStringEscaped, Utf16JsStringBuilder,
};
use rustc_hash::FxHasher;
@ -252,3 +255,219 @@ fn compare_static_and_dynamic_js_string() {
assert!(!dynamic_latin1.is_static());
assert!(!dynamic_utf16.is_static());
}
#[test]
#[allow(clippy::cast_possible_truncation)]
#[allow(clippy::undocumented_unsafe_blocks)]
fn js_string_builder() {
let s = "2024年5月21日";
let utf16 = s.encode_utf16().collect::<Vec<_>>();
let s_utf16 = utf16.as_slice();
let ascii = "Lorem ipsum dolor sit amet";
let s_ascii = ascii.as_bytes();
let latin1_as_utf8_literal = "Déjà vu";
let s_latin1_literal: &[u8] = &[
b'D', 0xE9, /* é */
b'j', 0xE0, /* à */
b' ', b'v', b'u',
];
// latin1 builder -- test
// push ascii
let mut builder = Latin1JsStringBuilder::new();
for &code in s_ascii {
builder.push(code);
}
let s_builder = builder.build().unwrap_or_default();
assert_eq!(s_builder, ascii);
// push latin1
let mut builder = Latin1JsStringBuilder::new();
for &code in s_latin1_literal {
builder.push(code);
}
let s_builder = unsafe { builder.build_as_latin1() };
assert_eq!(
s_builder.to_std_string().unwrap_or_default(),
latin1_as_utf8_literal
);
// from_iter ascii
let s_builder = s_ascii
.iter()
.copied()
.collect::<Latin1JsStringBuilder>()
.build()
.unwrap_or_default();
assert_eq!(s_builder.to_std_string().unwrap_or_default(), ascii);
// from_iter latin1
let s_builder = unsafe {
s_latin1_literal
.iter()
.copied()
.collect::<Latin1JsStringBuilder>()
.build_as_latin1()
};
assert_eq!(
s_builder.to_std_string().unwrap_or_default(),
latin1_as_utf8_literal
);
// extend_from_slice ascii
let mut builder = Latin1JsStringBuilder::new();
builder.extend_from_slice(s_ascii);
let s_builder = builder.build().unwrap_or_default();
assert_eq!(s_builder.to_std_string().unwrap_or_default(), ascii);
// extend_from_slice latin1
let mut builder = Latin1JsStringBuilder::new();
builder.extend_from_slice(s_latin1_literal);
let s_builder = unsafe { builder.build_as_latin1() };
assert_eq!(
s_builder.to_std_string().unwrap_or_default(),
latin1_as_utf8_literal
);
// build from utf16 encoded string
let s_builder = s
.as_bytes()
.iter()
.copied()
.collect::<Latin1JsStringBuilder>()
.build();
assert_eq!(None, s_builder);
let s_builder = s_utf16
.iter()
.copied()
.map(|v| v as u8)
.collect::<Latin1JsStringBuilder>()
.build();
assert_eq!(None, s_builder);
// utf16 builder -- test
// push
let mut builder = Utf16JsStringBuilder::new();
for &code in s_utf16 {
builder.push(code);
}
let s_builder = builder.build();
assert_eq!(s_builder.to_std_string().unwrap_or_default(), s);
// from_iter
let s_builder = s_utf16
.iter()
.copied()
.collect::<Utf16JsStringBuilder>()
.build();
assert_eq!(s_builder.to_std_string().unwrap_or_default(), s);
// extend_from_slice
let mut builder = Utf16JsStringBuilder::new();
builder.extend_from_slice(s_utf16);
let s_builder = builder.build();
assert_eq!(s_builder.to_std_string().unwrap_or_default(), s);
}
#[test]
fn clone_builder() {
// latin1 builder -- test
let origin = Latin1JsStringBuilder::from(&b"0123456789"[..]);
let empty_origin = Latin1JsStringBuilder::new();
// clone == origin
let cloned = origin.clone();
assert_eq!(origin, cloned);
// clone_from == origin
let mut cloned_from = Latin1JsStringBuilder::new();
cloned_from.clone_from(&origin);
assert_eq!(origin, cloned_from);
// clone == origin(empty)
let cloned = empty_origin.clone();
assert_eq!(empty_origin, cloned);
// clone_from == origin(empty)
cloned_from.clone_from(&empty_origin);
assert!(cloned_from.capacity() > 0); // Should not be reallocated so the capacity is preserved.
assert_eq!(empty_origin, cloned_from);
// clone_from(empty) == origin(empty)
let mut cloned_from = Latin1JsStringBuilder::new();
cloned_from.clone_from(&empty_origin);
assert!(cloned_from.capacity() == 0);
assert_eq!(empty_origin, cloned_from);
// utf16 builder -- test
let s = "2024年5月21日";
let origin = Utf16JsStringBuilder::from(s.encode_utf16().collect::<Vec<_>>().as_slice());
let empty_origin = Utf16JsStringBuilder::new();
// clone == origin
let cloned = origin.clone();
assert_eq!(origin, cloned);
// clone_from == origin(empty)
let mut cloned_from = Utf16JsStringBuilder::new();
cloned_from.clone_from(&origin);
assert_eq!(origin, cloned_from);
// clone == origin(empty)
let cloned = empty_origin.clone();
assert_eq!(empty_origin, cloned);
// clone_from == origin(empty)
cloned_from.clone_from(&empty_origin);
assert!(cloned_from.capacity() > 0); // should not be reallocated so the capacity is preserved.
assert_eq!(empty_origin, cloned_from);
// clone_from(empty) == origin(empty)
let mut cloned_from = Utf16JsStringBuilder::new();
cloned_from.clone_from(&empty_origin);
assert!(cloned_from.capacity() == 0);
assert_eq!(empty_origin, cloned_from);
}
#[test]
fn common_js_string_builder() {
let utf16 = "2024年5月21日".encode_utf16().collect::<Vec<_>>();
let s_utf16 = utf16.as_slice();
let s = "Lorem ipsum dolor sit amet";
let js_str_utf16 = JsStr::utf16(s_utf16);
let js_str_ascii = JsStr::latin1(s.as_bytes());
let latin1_bytes = [
b'D', 0xE9, /* é */
b'j', 0xE0, /* à */
b' ', b'v', b'u',
];
let ch = '🎹';
let mut builder = CommonJsStringBuilder::with_capacity(10);
builder += ch;
builder += s;
builder += js_str_utf16;
builder += js_str_ascii;
builder += ch;
assert_eq!(builder.len(), 5);
let js_string = builder.build_from_utf16();
assert_eq!(
js_string,
"🎹Lorem ipsum dolor sit amet2024年5月21日Lorem ipsum dolor sit amet🎹"
);
let mut builder = CommonJsStringBuilder::new();
for b in latin1_bytes {
builder += b;
}
builder += s_utf16;
builder += ch;
let js_string = builder.build();
assert_eq!(
js_string.to_std_string().unwrap_or_default(),
"Déjà vu2024年5月21日🎹"
);
}

Loading…
Cancel
Save